about summary refs log tree commit diff stats
path: root/module/zfs
diff options
context:
space:
mode:
Diffstat (limited to 'module/zfs')
-rw-r--r-- module/zfs/arc.c 55
-rw-r--r-- module/zfs/bpobj.c 51
-rw-r--r-- module/zfs/dbuf.c 319
-rw-r--r-- module/zfs/ddt.c 28
-rw-r--r-- module/zfs/dmu.c 211
-rw-r--r-- module/zfs/dmu_diff.c 221
-rw-r--r-- module/zfs/dmu_object.c 6
-rw-r--r-- module/zfs/dmu_objset.c 276
-rw-r--r-- module/zfs/dmu_send.c 93
-rw-r--r-- module/zfs/dmu_traverse.c 69
-rw-r--r-- module/zfs/dmu_tx.c 35
-rw-r--r-- module/zfs/dnode.c 565
-rw-r--r-- module/zfs/dnode_sync.c 34
-rw-r--r-- module/zfs/dsl_dataset.c 252
-rw-r--r-- module/zfs/dsl_deleg.c 34
-rw-r--r-- module/zfs/dsl_pool.c 6
-rw-r--r-- module/zfs/dsl_scan.c 53
-rw-r--r-- module/zfs/dsl_synctask.c 5
-rw-r--r-- module/zfs/fm.c 16
-rw-r--r-- module/zfs/include/sys/dbuf.h 60
-rw-r--r-- module/zfs/include/sys/dmu.h 11
-rw-r--r-- module/zfs/include/sys/dmu_objset.h 25
-rw-r--r-- module/zfs/include/sys/dmu_traverse.h 3
-rw-r--r-- module/zfs/include/sys/dnode.h 31
-rw-r--r-- module/zfs/include/sys/dsl_dataset.h 25
-rw-r--r-- module/zfs/include/sys/dsl_deleg.h 5
-rw-r--r-- module/zfs/include/sys/fm/protocol.h 31
-rw-r--r-- module/zfs/include/sys/fm/util.h 6
-rw-r--r-- module/zfs/include/sys/refcount.h 7
-rw-r--r-- module/zfs/include/sys/sa.h 5
-rw-r--r-- module/zfs/include/sys/sa_impl.h 5
-rw-r--r-- module/zfs/include/sys/spa.h 6
-rw-r--r-- module/zfs/include/sys/spa_impl.h 6
-rw-r--r-- module/zfs/include/sys/vdev_impl.h 2
-rw-r--r-- module/zfs/include/sys/zfs_acl.h 6
-rw-r--r-- module/zfs/include/sys/zfs_ioctl.h 53
-rw-r--r-- module/zfs/include/sys/zfs_onexit.h 66
-rw-r--r-- module/zfs/include/sys/zfs_stat.h 56
-rw-r--r-- module/zfs/include/sys/zfs_vfsops.h 1
-rw-r--r-- module/zfs/include/sys/zfs_znode.h 15
-rw-r--r-- module/zfs/include/sys/zil.h 15
-rw-r--r-- module/zfs/include/sys/zil_impl.h 31
-rw-r--r-- module/zfs/include/sys/zio.h 2
-rw-r--r-- module/zfs/include/sys/zrlock.h 66
-rw-r--r-- module/zfs/lzjb.c 9
-rw-r--r-- module/zfs/refcount.c 35
-rw-r--r-- module/zfs/sa.c 218
-rw-r--r-- module/zfs/spa.c 519
-rw-r--r-- module/zfs/spa_config.c 23
-rw-r--r-- module/zfs/spa_misc.c 29
-rw-r--r-- module/zfs/txg.c 2
-rw-r--r-- module/zfs/vdev.c 86
-rw-r--r-- module/zfs/vdev_label.c 12
-rw-r--r-- module/zfs/zfs_acl.c 215
-rw-r--r-- module/zfs/zfs_ctldir.c 7
-rw-r--r-- module/zfs/zfs_dir.c 10
-rw-r--r-- module/zfs/zfs_fuid.c 22
-rw-r--r-- module/zfs/zfs_ioctl.c 803
-rw-r--r-- module/zfs/zfs_log.c 59
-rw-r--r-- module/zfs/zfs_onexit.c 246
-rw-r--r-- module/zfs/zfs_replay.c 61
-rw-r--r-- module/zfs/zfs_sa.c 34
-rw-r--r-- module/zfs/zfs_vfsops.c 52
-rw-r--r-- module/zfs/zfs_vnops.c 198
-rw-r--r-- module/zfs/zfs_znode.c 278
-rw-r--r-- module/zfs/zil.c 615
-rw-r--r-- module/zfs/zio.c 23
-rw-r--r-- module/zfs/zio_inject.c 17
-rw-r--r-- module/zfs/zrlock.c 194
69 files changed, 5091 insertions, 1544 deletions
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 8adb54dc6..a82718e8b 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -952,11 +952,6 @@ arc_cksum_compute(arc_buf_t *buf, boolean_t force)
void
arc_buf_thaw(arc_buf_t *buf)
{
- kmutex_t *hash_lock;
-
- hash_lock = HDR_LOCK(buf->b_hdr);
- mutex_enter(hash_lock);
-
if (zfs_flags & ZFS_DEBUG_MODIFY) {
if (buf->b_hdr->b_state != arc_anon)
panic("modifying non-anon buffer!");
@@ -978,7 +973,6 @@ arc_buf_thaw(arc_buf_t *buf)
}
mutex_exit(&buf->b_hdr->b_freeze_lock);
- mutex_exit(hash_lock);
}
void
@@ -1750,6 +1744,7 @@ static void
arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
{
arc_buf_hdr_t *ab, *ab_prev;
+ arc_buf_hdr_t marker = { 0 };
list_t *list = &state->arcs_list[ARC_BUFC_DATA];
kmutex_t *hash_lock;
uint64_t bytes_deleted = 0;
@@ -1762,6 +1757,11 @@ top:
ab_prev = list_prev(list, ab);
if (spa && ab->b_spa != spa)
continue;
+
+ /* ignore markers */
+ if (ab->b_spa == 0)
+ continue;
+
hash_lock = HDR_LOCK(ab);
/* caller may be trying to modify this buffer, skip it */
if (MUTEX_HELD(hash_lock))
@@ -1788,15 +1788,21 @@ top:
DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
if (bytes >= 0 && bytes_deleted >= bytes)
break;
- } else {
- if (bytes < 0) {
- mutex_exit(&state->arcs_mtx);
- mutex_enter(hash_lock);
- mutex_exit(hash_lock);
- goto top;
- }
+ } else if (bytes < 0) {
+ /*
+ * Insert a list marker and then wait for the
+ * hash lock to become available. Once it's
+ * available, restart from where we left off.
+ */
+ list_insert_after(list, ab, &marker);
+ mutex_exit(&state->arcs_mtx);
+ mutex_enter(hash_lock);
+ mutex_exit(hash_lock);
+ mutex_enter(&state->arcs_mtx);
+ ab_prev = list_prev(list, &marker);
+ list_remove(list, &marker);
+ } else
bufs_skipped += 1;
- }
}
mutex_exit(&state->arcs_mtx);
@@ -1825,8 +1831,9 @@ arc_adjust(void)
* Adjust MRU size
*/
- adjustment = MIN(arc_size - arc_c,
- arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - arc_p);
+ adjustment = MIN((int64_t)(arc_size - arc_c),
+ (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
+ arc_p));
if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
@@ -2113,9 +2120,7 @@ arc_reclaim_thread(void)
arc_no_grow = FALSE;
}
- if (2 * arc_c < arc_size +
- arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size)
- arc_adjust();
+ arc_adjust();
if (arc_eviction_list != NULL)
arc_do_user_evicts();
@@ -2159,6 +2164,7 @@ arc_adapt(int bytes, arc_state_t *state)
if (state == arc_mru_ghost) {
mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
+ mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
} else if (state == arc_mfu_ghost) {
@@ -2166,6 +2172,7 @@ arc_adapt(int bytes, arc_state_t *state)
mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
+ mult = MIN(mult, 10);
delta = MIN(bytes * mult, arc_p);
arc_p = MAX(arc_p_min, arc_p - delta);
@@ -4438,6 +4445,16 @@ l2arc_feed_thread(void)
ASSERT(spa != NULL);
/*
+ * If the pool is read-only then force the feed thread to
+ * sleep a little longer.
+ */
+ if (!spa_writeable(spa)) {
+ next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
+ spa_config_exit(spa, SCL_L2ARC, dev);
+ continue;
+ }
+
+ /*
* Avoid contributing to memory pressure.
*/
if (arc_reclaim_needed()) {
diff --git a/module/zfs/bpobj.c b/module/zfs/bpobj.c
index f81c48aca..72be31235 100644
--- a/module/zfs/bpobj.c
+++ b/module/zfs/bpobj.c
@@ -113,16 +113,15 @@ bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ);
ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR);
+ err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf);
+ if (err)
+ return (err);
+
bpo->bpo_os = os;
bpo->bpo_object = object;
bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
-
- err = dmu_bonus_hold(bpo->bpo_os,
- bpo->bpo_object, bpo, &bpo->bpo_dbuf);
- if (err)
- return (err);
bpo->bpo_phys = bpo->bpo_dbuf->db_data;
return (0);
}
@@ -140,6 +139,7 @@ bpobj_close(bpobj_t *bpo)
bpo->bpo_dbuf = NULL;
bpo->bpo_phys = NULL;
bpo->bpo_cached_dbuf = NULL;
+ bpo->bpo_object = 0;
mutex_destroy(&bpo->bpo_lock);
}
@@ -210,8 +210,10 @@ bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
ASSERT(bpo->bpo_havecomp);
err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi);
- if (err)
+ if (err) {
+ mutex_exit(&bpo->bpo_lock);
return (err);
+ }
epb = doi.doi_data_block_size / sizeof (uint64_t);
for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
@@ -252,7 +254,7 @@ bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
&used_after, &comp_after, &uncomp_after));
bpo->bpo_phys->bpo_bytes -= used_before - used_after;
ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
- bpo->bpo_phys->bpo_comp -= comp_before - used_after;
+ bpo->bpo_phys->bpo_comp -= comp_before - comp_after;
bpo->bpo_phys->bpo_uncomp -=
uncomp_before - uncomp_after;
}
@@ -312,17 +314,17 @@ void
bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
{
bpobj_t subbpo;
- uint64_t used, comp, uncomp;
+ uint64_t used, comp, uncomp, subsubobjs;
ASSERT(bpo->bpo_havesubobj);
ASSERT(bpo->bpo_havecomp);
VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
- bpobj_close(&subbpo);
if (used == 0) {
/* No point in having an empty subobj. */
+ bpobj_close(&subbpo);
bpobj_free(bpo->bpo_os, subobj, tx);
return;
}
@@ -338,10 +340,41 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
sizeof (subobj), &subobj, tx);
bpo->bpo_phys->bpo_num_subobjs++;
+
+ /*
+ * If subobj has only one block of subobjs, then move subobj's
+ * subobjs to bpo's subobj list directly. This reduces
+ * recursion in bpobj_iterate due to nested subobjs.
+ */
+ subsubobjs = subbpo.bpo_phys->bpo_subobjs;
+ if (subsubobjs != 0) {
+ dmu_object_info_t doi;
+
+ VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subsubobjs, &doi));
+ if (doi.doi_max_offset == doi.doi_data_block_size) {
+ dmu_buf_t *subdb;
+ uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs;
+
+ VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, subsubobjs,
+ 0, FTAG, &subdb, 0));
+ dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
+ bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
+ numsubsub * sizeof (subobj), subdb->db_data, tx);
+ dmu_buf_rele(subdb, FTAG);
+ bpo->bpo_phys->bpo_num_subobjs += numsubsub;
+
+ dmu_buf_will_dirty(subbpo.bpo_dbuf, tx);
+ subbpo.bpo_phys->bpo_subobjs = 0;
+ VERIFY3U(0, ==, dmu_object_free(bpo->bpo_os,
+ subsubobjs, tx));
+ }
+ }
bpo->bpo_phys->bpo_bytes += used;
bpo->bpo_phys->bpo_comp += comp;
bpo->bpo_phys->bpo_uncomp += uncomp;
mutex_exit(&bpo->bpo_lock);
+
+ bpobj_close(&subbpo);
}
void
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 42ae43997..9c4e0296d 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -217,6 +217,22 @@ dbuf_evict_user(dmu_buf_impl_t *db)
db->db_evict_func = NULL;
}
+boolean_t
+dbuf_is_metadata(dmu_buf_impl_t *db)
+{
+ if (db->db_level > 0) {
+ return (B_TRUE);
+ } else {
+ boolean_t is_metadata;
+
+ DB_DNODE_ENTER(db);
+ is_metadata = dmu_ot[DB_DNODE(db)->dn_type].ot_metadata;
+ DB_DNODE_EXIT(db);
+
+ return (is_metadata);
+ }
+}
+
void
dbuf_evict(dmu_buf_impl_t *db)
{
@@ -281,7 +297,7 @@ dbuf_fini(void)
static void
dbuf_verify(dmu_buf_impl_t *db)
{
- dnode_t *dn = db->db_dnode;
+ dnode_t *dn;
dbuf_dirty_record_t *dr;
ASSERT(MUTEX_HELD(&db->db_mtx));
@@ -290,6 +306,8 @@ dbuf_verify(dmu_buf_impl_t *db)
return;
ASSERT(db->db_objset != NULL);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
if (dn == NULL) {
ASSERT(db->db_parent == NULL);
ASSERT(db->db_blkptr == NULL);
@@ -297,8 +315,9 @@ dbuf_verify(dmu_buf_impl_t *db)
ASSERT3U(db->db.db_object, ==, dn->dn_object);
ASSERT3P(db->db_objset, ==, dn->dn_objset);
ASSERT3U(db->db_level, <, dn->dn_nlevels);
- ASSERT(db->db_blkid == DMU_BONUS_BLKID || db->db_blkid ==
- DMU_SPILL_BLKID || list_head(&dn->dn_dbufs));
+ ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
+ db->db_blkid == DMU_SPILL_BLKID ||
+ !list_is_empty(&dn->dn_dbufs));
}
if (db->db_blkid == DMU_BONUS_BLKID) {
ASSERT(dn != NULL);
@@ -355,7 +374,7 @@ dbuf_verify(dmu_buf_impl_t *db)
* have the struct_rwlock. XXX indblksz no longer
* grows. safe to do this now?
*/
- if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) {
+ if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
ASSERT3P(db->db_blkptr, ==,
((blkptr_t *)db->db_parent->db.db_data +
db->db_blkid % epb));
@@ -380,6 +399,7 @@ dbuf_verify(dmu_buf_impl_t *db)
}
}
}
+ DB_DNODE_EXIT(db);
}
#endif
@@ -424,8 +444,11 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db)
mutex_enter(&db->db_mtx);
if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
int blksz = db->db.db_size;
+ spa_t *spa;
+
mutex_exit(&db->db_mtx);
- abuf = arc_loan_buf(db->db_dnode->dn_objset->os_spa, blksz);
+ DB_GET_SPA(&spa, db);
+ abuf = arc_loan_buf(spa, blksz);
bcopy(db->db.db_data, abuf->b_data, blksz);
} else {
abuf = db->db_buf;
@@ -484,11 +507,14 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
static void
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
{
- dnode_t *dn = db->db_dnode;
+ dnode_t *dn;
+ spa_t *spa;
zbookmark_t zb;
uint32_t aflags = ARC_NOWAIT;
arc_buf_t *pbuf;
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
ASSERT(!refcount_is_zero(&db->db_holds));
/* We need the struct_rwlock to prevent db_blkptr from changing. */
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
@@ -506,6 +532,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
bzero(db->db.db_data, DN_MAX_BONUSLEN);
if (bonuslen)
bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
+ DB_DNODE_EXIT(db);
dbuf_update_data(db);
db->db_state = DB_CACHED;
mutex_exit(&db->db_mtx);
@@ -524,6 +551,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa,
db->db.db_size, db, type));
+ DB_DNODE_EXIT(db);
bzero(db->db.db_data, db->db.db_size);
db->db_state = DB_CACHED;
*flags |= DB_RF_CACHED;
@@ -531,6 +559,9 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
return;
}
+ spa = dn->dn_objset->os_spa;
+ DB_DNODE_EXIT(db);
+
db->db_state = DB_READ;
mutex_exit(&db->db_mtx);
@@ -549,7 +580,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
else
pbuf = db->db_objset->os_phys_buf;
- (void) dsl_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf,
+ (void) dsl_read(zio, spa, db->db_blkptr, pbuf,
dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
(*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
&aflags, &zb);
@@ -563,6 +594,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
int err = 0;
int havepzio = (zio != NULL);
int prefetch;
+ dnode_t *dn;
/*
* We don't have to hold the mutex to check db_state because it
@@ -573,46 +605,51 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
if (db->db_state == DB_NOFILL)
return (EIO);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
if ((flags & DB_RF_HAVESTRUCT) == 0)
- rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
- (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL &&
+ (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
DBUF_IS_CACHEABLE(db);
mutex_enter(&db->db_mtx);
if (db->db_state == DB_CACHED) {
mutex_exit(&db->db_mtx);
if (prefetch)
- dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
+ dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
db->db.db_size, TRUE);
if ((flags & DB_RF_HAVESTRUCT) == 0)
- rw_exit(&db->db_dnode->dn_struct_rwlock);
+ rw_exit(&dn->dn_struct_rwlock);
+ DB_DNODE_EXIT(db);
} else if (db->db_state == DB_UNCACHED) {
- if (zio == NULL) {
- zio = zio_root(db->db_dnode->dn_objset->os_spa,
- NULL, NULL, ZIO_FLAG_CANFAIL);
- }
+ spa_t *spa = dn->dn_objset->os_spa;
+
+ if (zio == NULL)
+ zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
dbuf_read_impl(db, zio, &flags);
/* dbuf_read_impl has dropped db_mtx for us */
if (prefetch)
- dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
+ dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
db->db.db_size, flags & DB_RF_CACHED);
if ((flags & DB_RF_HAVESTRUCT) == 0)
- rw_exit(&db->db_dnode->dn_struct_rwlock);
+ rw_exit(&dn->dn_struct_rwlock);
+ DB_DNODE_EXIT(db);
if (!havepzio)
err = zio_wait(zio);
} else {
mutex_exit(&db->db_mtx);
if (prefetch)
- dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
+ dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
db->db.db_size, TRUE);
if ((flags & DB_RF_HAVESTRUCT) == 0)
- rw_exit(&db->db_dnode->dn_struct_rwlock);
+ rw_exit(&dn->dn_struct_rwlock);
+ DB_DNODE_EXIT(db);
mutex_enter(&db->db_mtx);
if ((flags & DB_RF_NEVERWAIT) == 0) {
@@ -642,11 +679,12 @@ dbuf_noread(dmu_buf_impl_t *db)
cv_wait(&db->db_changed, &db->db_mtx);
if (db->db_state == DB_UNCACHED) {
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+ spa_t *spa;
ASSERT(db->db_buf == NULL);
ASSERT(db->db.db_data == NULL);
- dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
- db->db.db_size, db, type));
+ DB_GET_SPA(&spa, db);
+ dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
db->db_state = DB_FILL;
} else if (db->db_state == DB_NOFILL) {
dbuf_set_data(db, NULL);
@@ -687,7 +725,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
/*
* If the last dirty record for this dbuf has not yet synced
* and its referencing the dbuf data, either:
- * reset the reference to point to a new copy,
+ * reset the reference to point to a new copy,
* or (if there a no active holders)
* just null out the current db_data pointer.
*/
@@ -700,8 +738,10 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
int size = db->db.db_size;
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
- dr->dt.dl.dr_data = arc_buf_alloc(
- db->db_dnode->dn_objset->os_spa, size, db, type);
+ spa_t *spa;
+
+ DB_GET_SPA(&spa, db);
+ dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
} else {
dbuf_set_data(db, NULL);
@@ -726,9 +766,12 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
ASSERT(db->db_data_pending != dr);
/* free this block */
- if (!BP_IS_HOLE(bp))
- zio_free(db->db_dnode->dn_objset->os_spa, txg, bp);
+ if (!BP_IS_HOLE(bp)) {
+ spa_t *spa;
+ DB_GET_SPA(&spa, db);
+ zio_free(spa, txg, bp);
+ }
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
/*
* Release the already-written buffer, so we leave it in
@@ -865,10 +908,15 @@ dbuf_block_freeable(dmu_buf_impl_t *db)
else if (db->db_blkptr)
birth_txg = db->db_blkptr->blk_birth;
- /* If we don't exist or are in a snapshot, we can't be freed */
+ /*
+ * If we don't exist or are in a snapshot, we can't be freed.
+ * Don't pass the bp to dsl_dataset_block_freeable() since we
+ * are holding the db_mtx lock and might deadlock if we are
+ * prefetching a dedup-ed block.
+ */
if (birth_txg)
return (ds == NULL ||
- dsl_dataset_block_freeable(ds, db->db_blkptr, birth_txg));
+ dsl_dataset_block_freeable(ds, NULL, birth_txg));
else
return (FALSE);
}
@@ -879,11 +927,15 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
arc_buf_t *buf, *obuf;
int osize = db->db.db_size;
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+ dnode_t *dn;
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
/* XXX does *this* func really need the lock? */
- ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock));
+ ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
/*
* This call to dbuf_will_dirty() with the dn_struct_rwlock held
@@ -898,7 +950,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
dbuf_will_dirty(db, tx);
/* create the data buffer for the new block */
- buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db, type);
+ buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);
/* copy old block data to the new block */
obuf = db->db_buf;
@@ -918,15 +970,17 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
}
mutex_exit(&db->db_mtx);
- dnode_willuse_space(db->db_dnode, size-osize, tx);
+ dnode_willuse_space(dn, size-osize, tx);
+ DB_DNODE_EXIT(db);
}
void
dbuf_release_bp(dmu_buf_impl_t *db)
{
- objset_t *os = db->db_dnode->dn_objset;
+ objset_t *os;
zbookmark_t zb;
+ DB_GET_OBJSET(&os, db);
ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
ASSERT(arc_released(os->os_phys_buf) ||
list_link_active(&os->os_dsl_dataset->ds_synced_link));
@@ -944,8 +998,8 @@ dbuf_release_bp(dmu_buf_impl_t *db)
dbuf_dirty_record_t *
dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
- dnode_t *dn = db->db_dnode;
- objset_t *os = dn->dn_objset;
+ dnode_t *dn;
+ objset_t *os;
dbuf_dirty_record_t **drp, *dr;
int drop_struct_lock = FALSE;
boolean_t do_free_accounting = B_FALSE;
@@ -955,6 +1009,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
ASSERT(!refcount_is_zero(&db->db_holds));
DMU_TX_DIRTY_BUF(tx, db);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
/*
* Shouldn't dirty a regular buffer in syncing context. Private
* objects may be dirtied in syncing context, but only if they
@@ -1009,6 +1065,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
drp = &dr->dr_next;
if (dr && dr->dr_txg == tx->tx_txg) {
+ DB_DNODE_EXIT(db);
+
if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
/*
* If this buffer has already been written out,
@@ -1044,6 +1102,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
* we already dirtied it in open context. Hence we must make
* this assertion only if we're not already dirty.
*/
+ os = dn->dn_objset;
ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
ASSERT(db->db.db_size != 0);
@@ -1132,6 +1191,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
mutex_exit(&dn->dn_mtx);
dnode_setdirty(dn, tx);
+ DB_DNODE_EXIT(db);
return (dr);
} else if (do_free_accounting) {
blkptr_t *bp = db->db_blkptr;
@@ -1145,6 +1205,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
* db_blkptr, but since this is just a guess,
* it's OK if we get an odd answer.
*/
+ ddt_prefetch(os->os_spa, bp);
dnode_willuse_space(dn, -willfree, tx);
}
@@ -1193,8 +1254,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
} else {
ASSERT(db->db_level+1 == dn->dn_nlevels);
ASSERT(db->db_blkid < dn->dn_nblkptr);
- ASSERT(db->db_parent == NULL ||
- db->db_parent == db->db_dnode->dn_dbuf);
+ ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
mutex_enter(&dn->dn_mtx);
ASSERT(!list_link_active(&dr->dr_dirty_node));
list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
@@ -1204,13 +1264,14 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
}
dnode_setdirty(dn, tx);
+ DB_DNODE_EXIT(db);
return (dr);
}
static int
dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
- dnode_t *dn = db->db_dnode;
+ dnode_t *dn;
uint64_t txg = tx->tx_txg;
dbuf_dirty_record_t *dr, **drp;
@@ -1231,6 +1292,9 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
ASSERT(dr->dr_txg == txg);
ASSERT(dr->dr_dbuf == db);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
/*
* If this buffer is currently held, we cannot undirty
* it, since one of the current holders may be in the
@@ -1243,6 +1307,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
mutex_enter(&dn->dn_mtx);
dnode_clear_range(dn, db->db_blkid, 1, tx);
mutex_exit(&dn->dn_mtx);
+ DB_DNODE_EXIT(db);
return (0);
}
@@ -1264,6 +1329,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
mutex_exit(&dn->dn_mtx);
}
+ DB_DNODE_EXIT(db);
if (db->db_level == 0) {
if (db->db_state != DB_NOFILL) {
@@ -1309,8 +1375,10 @@ dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
ASSERT(tx->tx_txg != 0);
ASSERT(!refcount_is_zero(&db->db_holds));
- if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock))
+ DB_DNODE_ENTER(db);
+ if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
rf |= DB_RF_HAVESTRUCT;
+ DB_DNODE_EXIT(db);
(void) dbuf_read(db, NULL, rf);
(void) dbuf_dirty(db, tx);
}
@@ -1372,7 +1440,6 @@ void
dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
{
ASSERT(!refcount_is_zero(&db->db_holds));
- ASSERT(db->db_dnode->dn_object != DMU_META_DNODE_OBJECT);
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT(db->db_level == 0);
ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
@@ -1436,7 +1503,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
* in this case. For callers from the DMU we will usually see:
* dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
* For the arc callback, we will usually see:
- * dbuf_do_evict()->dbuf_clear();dbuf_destroy()
+ * dbuf_do_evict()->dbuf_clear();dbuf_destroy()
* Sometimes, though, we will get a mix of these two:
* DMU: dbuf_clear()->arc_buf_evict()
* ARC: dbuf_do_evict()->dbuf_destroy()
@@ -1444,9 +1511,9 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
void
dbuf_clear(dmu_buf_impl_t *db)
{
- dnode_t *dn = db->db_dnode;
+ dnode_t *dn;
dmu_buf_impl_t *parent = db->db_parent;
- dmu_buf_impl_t *dndb = dn->dn_dbuf;
+ dmu_buf_impl_t *dndb;
int dbuf_gone = FALSE;
ASSERT(MUTEX_HELD(&db->db_mtx));
@@ -1470,10 +1537,26 @@ dbuf_clear(dmu_buf_impl_t *db)
db->db_state = DB_EVICTING;
db->db_blkptr = NULL;
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ dndb = dn->dn_dbuf;
if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
list_remove(&dn->dn_dbufs, db);
+ (void) atomic_dec_32_nv(&dn->dn_dbufs_count);
+ membar_producer();
+ DB_DNODE_EXIT(db);
+ /*
+ * Decrementing the dbuf count means that the hold corresponding
+ * to the removed dbuf is no longer discounted in dnode_move(),
+ * so the dnode cannot be moved until after we release the hold.
+ * The membar_producer() ensures visibility of the decremented
+ * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
+ * release any lock.
+ */
dnode_rele(dn, db);
- db->db_dnode = NULL;
+ db->db_dnode_handle = NULL;
+ } else {
+ DB_DNODE_EXIT(db);
}
if (db->db_buf)
@@ -1483,7 +1566,7 @@ dbuf_clear(dmu_buf_impl_t *db)
mutex_exit(&db->db_mtx);
/*
- * If this dbuf is referened from an indirect dbuf,
+ * If this dbuf is referenced from an indirect dbuf,
* decrement the ref count on the indirect dbuf.
*/
if (parent && parent != dndb)
@@ -1575,7 +1658,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
db->db_blkid = blkid;
db->db_last_dirty = NULL;
db->db_dirtycnt = 0;
- db->db_dnode = dn;
+ db->db_dnode_handle = dn->dn_handle;
db->db_parent = parent;
db->db_blkptr = blkptr;
@@ -1632,6 +1715,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
refcount_count(&dn->dn_holds) > 0);
(void) refcount_add(&dn->dn_holds, db);
+ (void) atomic_inc_32_nv(&dn->dn_dbufs_count);
dprintf_dbuf(db, "db=%p\n", db);
@@ -1671,15 +1755,24 @@ dbuf_destroy(dmu_buf_impl_t *db)
* If this dbuf is still on the dn_dbufs list,
* remove it from that list.
*/
- if (db->db_dnode) {
- dnode_t *dn = db->db_dnode;
+ if (db->db_dnode_handle != NULL) {
+ dnode_t *dn;
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
mutex_enter(&dn->dn_dbufs_mtx);
list_remove(&dn->dn_dbufs, db);
+ (void) atomic_dec_32_nv(&dn->dn_dbufs_count);
mutex_exit(&dn->dn_dbufs_mtx);
-
+ DB_DNODE_EXIT(db);
+ /*
+ * Decrementing the dbuf count means that the hold
+ * corresponding to the removed dbuf is no longer
+ * discounted in dnode_move(), so the dnode cannot be
+ * moved until after we release the hold.
+ */
dnode_rele(dn, db);
- db->db_dnode = NULL;
+ db->db_dnode_handle = NULL;
}
dbuf_hash_remove(db);
}
@@ -1710,17 +1803,13 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid)
/* dbuf_find() returns with db_mtx held */
if (db = dbuf_find(dn, 0, blkid)) {
- if (refcount_count(&db->db_holds) > 0) {
- /*
- * This dbuf is active. We assume that it is
- * already CACHED, or else about to be either
- * read or filled.
- */
- mutex_exit(&db->db_mtx);
- return;
- }
+ /*
+ * This dbuf is already in the cache. We assume that
+ * it is already CACHED, or else about to be either
+ * read or filled.
+ */
mutex_exit(&db->db_mtx);
- db = NULL;
+ return;
}
if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
@@ -1818,7 +1907,7 @@ top:
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
dbuf_set_data(db,
- arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
+ arc_buf_alloc(dn->dn_objset->os_spa,
db->db.db_size, db, type));
bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
db->db.db_size);
@@ -1834,7 +1923,7 @@ top:
if (parent)
dbuf_rele(parent, NULL);
- ASSERT3P(db->db_dnode, ==, dn);
+ ASSERT3P(DB_DNODE(db), ==, dn);
ASSERT3U(db->db_blkid, ==, blkid);
ASSERT3U(db->db_level, ==, level);
*dbp = db;
@@ -1871,6 +1960,8 @@ int
dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+
if (db->db_blkid != DMU_SPILL_BLKID)
return (ENOTSUP);
if (blksz == 0)
@@ -1880,9 +1971,12 @@ dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
else
blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
- rw_enter(&db->db_dnode->dn_struct_rwlock, RW_WRITER);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
dbuf_new_size(db, blksz, tx);
- rw_exit(&db->db_dnode->dn_struct_rwlock);
+ rw_exit(&dn->dn_struct_rwlock);
+ DB_DNODE_EXIT(db);
return (0);
}
@@ -1901,6 +1995,13 @@ dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
ASSERT(holds > 1);
}
+/*
+ * If you call dbuf_rele() you had better not be referencing the dnode handle
+ * unless you have some other direct or indirect hold on the dnode. (An indirect
+ * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
+ * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
+ * dnode's parent dbuf evicting its dnode handles.
+ */
#pragma weak dmu_buf_rele = dbuf_rele
void
dbuf_rele(dmu_buf_impl_t *db, void *tag)
@@ -1921,6 +2022,11 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
ASSERT(MUTEX_HELD(&db->db_mtx));
DBUF_VERIFY(db);
+ /*
+ * Remove the reference to the dbuf before removing its hold on the
+ * dnode so we can guarantee in dnode_move() that a referenced bonus
+ * buffer has a corresponding dnode hold.
+ */
holds = refcount_remove(&db->db_holds, tag);
ASSERT(holds >= 0);
@@ -1938,7 +2044,20 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
if (holds == 0) {
if (db->db_blkid == DMU_BONUS_BLKID) {
mutex_exit(&db->db_mtx);
- dnode_rele(db->db_dnode, db);
+
+ /*
+ * If the dnode moves here, we cannot cross this barrier
+ * until the move completes.
+ */
+ DB_DNODE_ENTER(db);
+ (void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count);
+ DB_DNODE_EXIT(db);
+ /*
+ * The bonus buffer's dnode hold is no longer discounted
+ * in dnode_move(). The dnode cannot move until after
+ * the dnode_rele().
+ */
+ dnode_rele(DB_DNODE(db), db);
} else if (db->db_buf == NULL) {
/*
* This is a special case: we never associated this
@@ -2089,7 +2208,7 @@ static void
dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = dr->dr_dbuf;
- dnode_t *dn = db->db_dnode;
+ dnode_t *dn;
zio_t *zio;
ASSERT(dmu_tx_is_syncing(tx));
@@ -2107,10 +2226,13 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
mutex_enter(&db->db_mtx);
}
ASSERT3U(db->db_state, ==, DB_CACHED);
- ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
ASSERT(db->db_buf != NULL);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
dbuf_check_blkptr(dn, db);
+ DB_DNODE_EXIT(db);
db->db_data_pending = dr;
@@ -2130,8 +2252,8 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
arc_buf_t **datap = &dr->dt.dl.dr_data;
dmu_buf_impl_t *db = dr->dr_dbuf;
- dnode_t *dn = db->db_dnode;
- objset_t *os = dn->dn_objset;
+ dnode_t *dn;
+ objset_t *os;
uint64_t txg = tx->tx_txg;
ASSERT(dmu_tx_is_syncing(tx));
@@ -2154,6 +2276,9 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
}
DBUF_VERIFY(db);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
if (db->db_blkid == DMU_SPILL_BLKID) {
mutex_enter(&dn->dn_mtx);
dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
@@ -2173,6 +2298,8 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
ASSERT3U(db->db_level, ==, 0);
ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
+ DB_DNODE_EXIT(db);
+
if (*datap != db->db.db_data) {
zio_buf_free(*datap, DN_MAX_BONUSLEN);
arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
@@ -2191,6 +2318,8 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
return;
}
+ os = dn->dn_objset;
+
/*
* This function may have dropped the db_mtx lock allowing a dmu_sync
* operation to sneak in. As a result, we need to ensure that we
@@ -2200,7 +2329,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
dbuf_check_blkptr(dn, db);
/*
- * If this buffer is in the middle of an immdiate write,
+ * If this buffer is in the middle of an immediate write,
* wait for the synchronous IO to complete.
*/
while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
@@ -2237,10 +2366,20 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
dbuf_write(dr, *datap, tx);
ASSERT(!list_link_active(&dr->dr_dirty_node));
- if (dn->dn_object == DMU_META_DNODE_OBJECT)
+ if (dn->dn_object == DMU_META_DNODE_OBJECT) {
list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
- else
+ DB_DNODE_EXIT(db);
+ } else {
+ /*
+ * Although zio_nowait() does not "wait for an IO", it does
+ * initiate the IO. If this is an empty write it seems plausible
+ * that the IO could actually be completed before the nowait
+ * returns. We need to DB_DNODE_EXIT() first in case
+ * zio_nowait() invalidates the dbuf.
+ */
+ DB_DNODE_EXIT(db);
zio_nowait(dr->dr_zio);
+ }
}
void
@@ -2274,9 +2413,9 @@ static void
dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
{
dmu_buf_impl_t *db = vdb;
+ dnode_t *dn;
blkptr_t *bp = zio->io_bp;
blkptr_t *bp_orig = &zio->io_bp_orig;
- dnode_t *dn = db->db_dnode;
spa_t *spa = zio->io_spa;
int64_t delta;
uint64_t fill = 0;
@@ -2284,12 +2423,15 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
ASSERT(db->db_blkptr == bp);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
zio->io_prev_space_delta = delta;
if (BP_IS_HOLE(bp)) {
ASSERT(bp->blk_fill == 0);
+ DB_DNODE_EXIT(db);
return;
}
@@ -2303,7 +2445,6 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
#ifdef ZFS_DEBUG
if (db->db_blkid == DMU_SPILL_BLKID) {
- dnode_t *dn = db->db_dnode;
ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
db->db_blkptr == &dn->dn_phys->dn_spill);
@@ -2336,6 +2477,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
fill += ibp->blk_fill;
}
}
+ DB_DNODE_EXIT(db);
bp->blk_fill = fill;
@@ -2349,8 +2491,6 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
dmu_buf_impl_t *db = vdb;
blkptr_t *bp = zio->io_bp;
blkptr_t *bp_orig = &zio->io_bp_orig;
- dnode_t *dn = db->db_dnode;
- objset_t *os = dn->dn_objset;
uint64_t txg = zio->io_txg;
dbuf_dirty_record_t **drp, *dr;
@@ -2360,8 +2500,13 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
ASSERT(BP_EQUAL(bp, bp_orig));
} else {
- dsl_dataset_t *ds = os->os_dsl_dataset;
- dmu_tx_t *tx = os->os_synctx;
+ objset_t *os;
+ dsl_dataset_t *ds;
+ dmu_tx_t *tx;
+
+ DB_GET_OBJSET(&os, db);
+ ds = os->os_dsl_dataset;
+ tx = os->os_synctx;
(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
dsl_dataset_block_born(ds, bp, tx);
@@ -2382,10 +2527,14 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
#ifdef ZFS_DEBUG
if (db->db_blkid == DMU_SPILL_BLKID) {
- dnode_t *dn = db->db_dnode;
+ dnode_t *dn;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
db->db_blkptr == &dn->dn_phys->dn_spill);
+ DB_DNODE_EXIT(db);
}
#endif
@@ -2400,6 +2549,10 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
arc_set_callback(db->db_buf, dbuf_do_evict, db);
}
} else {
+ dnode_t *dn;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
if (!BP_IS_HOLE(db->db_blkptr)) {
@@ -2411,6 +2564,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
>> (db->db_level * epbs), >=, db->db_blkid);
arc_set_callback(db->db_buf, dbuf_do_evict, db);
}
+ DB_DNODE_EXIT(db);
mutex_destroy(&dr->dt.di.dr_mtx);
list_destroy(&dr->dt.di.dr_children);
}
@@ -2466,8 +2620,8 @@ static void
dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = dr->dr_dbuf;
- dnode_t *dn = db->db_dnode;
- objset_t *os = dn->dn_objset;
+ dnode_t *dn;
+ objset_t *os;
dmu_buf_impl_t *parent = db->db_parent;
uint64_t txg = tx->tx_txg;
zbookmark_t zb;
@@ -2475,6 +2629,10 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
zio_t *zio;
int wp_flag = 0;
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ os = dn->dn_objset;
+
if (db->db_state != DB_NOFILL) {
if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
/*
@@ -2519,6 +2677,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
+ DB_DNODE_EXIT(db);
if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
ASSERT(db->db_state != DB_NOFILL);
diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
index 926b4df9a..718331496 100644
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@ -36,6 +36,11 @@
#include <sys/zio_compress.h>
#include <sys/dsl_scan.h>
+/*
+ * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
+ */
+int zfs_dedup_prefetch = 1;
+
static const ddt_ops_t *ddt_ops[DDT_TYPES] = {
&ddt_zap_ops,
};
@@ -456,9 +461,6 @@ ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
if (ddo_total->ddo_count != 0) {
ddo_total->ddo_dspace /= ddo_total->ddo_count;
ddo_total->ddo_mspace /= ddo_total->ddo_count;
- } else {
- ASSERT(ddo_total->ddo_dspace == 0);
- ASSERT(ddo_total->ddo_mspace == 0);
}
}
@@ -730,13 +732,13 @@ ddt_prefetch(spa_t *spa, const blkptr_t *bp)
ddt_t *ddt;
ddt_entry_t dde;
- if (!BP_GET_DEDUP(bp))
+ if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp))
return;
/*
- * We remove the DDT once it's empty and only prefetch dedup blocks
- * when there are entries in the DDT. Thus no locking is required
- * as the DDT can't disappear on us.
+ * We only remove the DDT once all tables are empty and only
+ * prefetch dedup blocks when there are entries in the DDT.
+ * Thus no locking is required as the DDT can't disappear on us.
*/
ddt = ddt_select(spa, bp);
ddt_key_fill(&dde.dde_key, bp);
@@ -1072,11 +1074,15 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
}
for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ uint64_t count = 0;
+ for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+ if (ddt_object_exists(ddt, type, class)) {
+ ddt_object_sync(ddt, type, class, tx);
+ count += ddt_object_count(ddt, type, class);
+ }
+ }
for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
- if (!ddt_object_exists(ddt, type, class))
- continue;
- ddt_object_sync(ddt, type, class, tx);
- if (ddt_object_count(ddt, type, class) == 0)
+ if (count == 0 && ddt_object_exists(ddt, type, class))
ddt_object_destroy(ddt, type, class, tx);
}
}
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index 5b87c81c6..39234eba5 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -133,7 +133,7 @@ dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
}
dnode_rele(dn, FTAG);
- *dbp = &db->db;
+ *dbp = &db->db; /* NULL db plus first field offset is NULL */
return (err);
}
@@ -144,31 +144,64 @@ dmu_bonus_max(void)
}
int
-dmu_set_bonus(dmu_buf_t *db, int newsize, dmu_tx_t *tx)
+dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx)
{
- dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+ int error;
- if (dn->dn_bonus != (dmu_buf_impl_t *)db)
- return (EINVAL);
- if (newsize < 0 || newsize > db->db_size)
- return (EINVAL);
- dnode_setbonuslen(dn, newsize, tx);
- return (0);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
+ if (dn->dn_bonus != db) {
+ error = EINVAL;
+ } else if (newsize < 0 || newsize > db_fake->db_size) {
+ error = EINVAL;
+ } else {
+ dnode_setbonuslen(dn, newsize, tx);
+ error = 0;
+ }
+
+ DB_DNODE_EXIT(db);
+ return (error);
}
int
-dmu_set_bonustype(dmu_buf_t *db, dmu_object_type_t type, dmu_tx_t *tx)
+dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx)
{
- dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+ int error;
- if (type > DMU_OT_NUMTYPES)
- return (EINVAL);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
- if (dn->dn_bonus != (dmu_buf_impl_t *)db)
- return (EINVAL);
+ if (type > DMU_OT_NUMTYPES) {
+ error = EINVAL;
+ } else if (dn->dn_bonus != db) {
+ error = EINVAL;
+ } else {
+ dnode_setbonus_type(dn, type, tx);
+ error = 0;
+ }
- dnode_setbonus_type(dn, type, tx);
- return (0);
+ DB_DNODE_EXIT(db);
+ return (error);
+}
+
+dmu_object_type_t
+dmu_get_bonustype(dmu_buf_t *db_fake)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+ dmu_object_type_t type;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ type = dn->dn_bonustype;
+ DB_DNODE_EXIT(db);
+
+ return (type);
}
int
@@ -208,11 +241,19 @@ dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
dbuf_create_bonus(dn);
}
db = dn->dn_bonus;
- rw_exit(&dn->dn_struct_rwlock);
/* as long as the bonus buf is held, the dnode will be held */
- if (refcount_add(&db->db_holds, tag) == 1)
+ if (refcount_add(&db->db_holds, tag) == 1) {
VERIFY(dnode_add_ref(dn, db));
+ (void) atomic_inc_32_nv(&dn->dn_dbufs_count);
+ }
+
+ /*
+ * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's
+ * hold and incrementing the dbuf count to ensure that dnode_move() sees
+ * a dnode hold for every dbuf.
+ */
+ rw_exit(&dn->dn_struct_rwlock);
dnode_rele(dn, FTAG);
@@ -246,35 +287,56 @@ dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp)
rw_exit(&dn->dn_struct_rwlock);
ASSERT(db != NULL);
- err = dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | flags);
- *dbp = &db->db;
+ err = dbuf_read(db, NULL, flags);
+ if (err == 0)
+ *dbp = &db->db;
+ else
+ dbuf_rele(db, tag);
return (err);
}
int
dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
{
- dnode_t *dn = ((dmu_buf_impl_t *)bonus)->db_dnode;
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
+ dnode_t *dn;
int err;
- if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA)
- return (EINVAL);
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
+ if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) {
+ err = EINVAL;
+ } else {
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+
+ if (!dn->dn_have_spill) {
+ err = ENOENT;
+ } else {
+ err = dmu_spill_hold_by_dnode(dn,
+ DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp);
+ }
- if (!dn->dn_have_spill) {
rw_exit(&dn->dn_struct_rwlock);
- return (ENOENT);
}
- err = dmu_spill_hold_by_dnode(dn, DB_RF_HAVESTRUCT, tag, dbp);
- rw_exit(&dn->dn_struct_rwlock);
+
+ DB_DNODE_EXIT(db);
return (err);
}
int
dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
{
- return (dmu_spill_hold_by_dnode(((dmu_buf_impl_t *)bonus)->db_dnode,
- 0, tag, dbp));
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
+ dnode_t *dn;
+ int err;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp);
+ DB_DNODE_EXIT(db);
+
+ return (err);
}
/*
@@ -396,14 +458,18 @@ dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
}
int
-dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
+dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
{
- dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
int err;
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
numbufsp, dbpp, DMU_READ_PREFETCH);
+ DB_DNODE_EXIT(db);
return (err);
}
@@ -436,7 +502,7 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
return;
if (len == 0) { /* they're interested in the bonus buffer */
- dn = os->os_meta_dnode;
+ dn = DMU_META_DNODE(os);
if (object == 0 || object >= DN_MAX_OBJECT)
return;
@@ -997,11 +1063,19 @@ int
dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size,
dmu_tx_t *tx)
{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
+ dnode_t *dn;
+ int err;
+
if (size == 0)
return (0);
- return (dmu_write_uio_dnode(((dmu_buf_impl_t *)zdb)->db_dnode,
- uio, size, tx));
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ err = dmu_write_uio_dnode(dn, uio, size, tx);
+ DB_DNODE_EXIT(db);
+
+ return (err);
}
int
@@ -1087,9 +1161,11 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
arc_buf_t *
dmu_request_arcbuf(dmu_buf_t *handle, int size)
{
- dnode_t *dn = ((dmu_buf_impl_t *)handle)->db_dnode;
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
+ spa_t *spa;
- return (arc_loan_buf(dn->dn_objset->os_spa, size));
+ DB_GET_SPA(&spa, db);
+ return (arc_loan_buf(spa, size));
}
/*
@@ -1111,23 +1187,35 @@ void
dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
dmu_tx_t *tx)
{
- dnode_t *dn = ((dmu_buf_impl_t *)handle)->db_dnode;
+ dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle;
+ dnode_t *dn;
dmu_buf_impl_t *db;
uint32_t blksz = (uint32_t)arc_buf_size(buf);
uint64_t blkid;
+ DB_DNODE_ENTER(dbuf);
+ dn = DB_DNODE(dbuf);
rw_enter(&dn->dn_struct_rwlock, RW_READER);
blkid = dbuf_whichblock(dn, offset);
VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
rw_exit(&dn->dn_struct_rwlock);
+ DB_DNODE_EXIT(dbuf);
if (offset == db->db.db_offset && blksz == db->db.db_size) {
dbuf_assign_arcbuf(db, buf, tx);
dbuf_rele(db, FTAG);
} else {
+ objset_t *os;
+ uint64_t object;
+
+ DB_DNODE_ENTER(dbuf);
+ dn = DB_DNODE(dbuf);
+ os = dn->dn_objset;
+ object = dn->dn_object;
+ DB_DNODE_EXIT(dbuf);
+
dbuf_rele(db, FTAG);
- dmu_write(dn->dn_objset, dn->dn_object, offset, blksz,
- buf->b_data, tx);
+ dmu_write(os, object, offset, blksz, buf->b_data, tx);
dmu_return_arcbuf(buf);
XUIOSTAT_BUMP(xuiostat_wbuf_copied);
}
@@ -1146,7 +1234,6 @@ dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
{
dmu_sync_arg_t *dsa = varg;
dmu_buf_t *db = dsa->dsa_zgd->zgd_db;
- dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
blkptr_t *bp = zio->io_bp;
if (zio->io_error == 0) {
@@ -1157,7 +1244,6 @@ dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
*/
BP_SET_LSIZE(bp, db->db_size);
} else {
- ASSERT(BP_GET_TYPE(bp) == dn->dn_type);
ASSERT(BP_GET_LEVEL(bp) == 0);
bp->blk_fill = 1;
}
@@ -1280,6 +1366,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
dmu_sync_arg_t *dsa;
zbookmark_t zb;
zio_prop_t zp;
+ dnode_t *dn;
ASSERT(pio != NULL);
ASSERT(BP_IS_HOLE(bp));
@@ -1288,7 +1375,10 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
SET_BOOKMARK(&zb, ds->ds_object,
db->db.db_object, db->db_level, db->db_blkid);
- dmu_write_policy(os, db->db_dnode, db->db_level, WP_DMU_SYNC, &zp);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp);
+ DB_DNODE_EXIT(db);
/*
* If we're frozen (running ziltest), we always need to generate a bp.
@@ -1413,7 +1503,8 @@ void
dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
{
dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
- boolean_t ismd = (level > 0 || dmu_ot[type].ot_metadata);
+ boolean_t ismd = (level > 0 || dmu_ot[type].ot_metadata ||
+ (wp & WP_SPILL));
enum zio_checksum checksum = os->os_checksum;
enum zio_compress compress = os->os_compress;
enum zio_checksum dedup_checksum = os->os_dedup_checksum;
@@ -1569,9 +1660,13 @@ dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
* As above, but faster; can be used when you have a held dbuf in hand.
*/
void
-dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi)
+dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi)
{
- dmu_object_info_from_dnode(((dmu_buf_impl_t *)db)->db_dnode, doi);
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ DB_DNODE_ENTER(db);
+ dmu_object_info_from_dnode(DB_DNODE(db), doi);
+ DB_DNODE_EXIT(db);
}
/*
@@ -1579,14 +1674,20 @@ dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi)
* This is specifically optimized for zfs_getattr().
*/
void
-dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512)
+dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize,
+ u_longlong_t *nblk512)
{
- dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
*blksize = dn->dn_datablksz;
/* add 1 for dnode space */
*nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
SPA_MINBLOCKSHIFT) + 1;
+ DB_DNODE_EXIT(db);
}
void
@@ -1638,23 +1739,25 @@ void
dmu_init(void)
{
zfs_dbgmsg_init();
- dbuf_init();
+ sa_cache_init();
+ xuio_stat_init();
+ dmu_objset_init();
dnode_init();
+ dbuf_init();
zfetch_init();
arc_init();
l2arc_init();
- xuio_stat_init();
- sa_cache_init();
}
void
dmu_fini(void)
{
+ l2arc_fini();
arc_fini();
zfetch_fini();
- dnode_fini();
dbuf_fini();
- l2arc_fini();
+ dnode_fini();
+ dmu_objset_fini();
xuio_stat_fini();
sa_cache_fini();
zfs_dbgmsg_fini();
diff --git a/module/zfs/dmu_diff.c b/module/zfs/dmu_diff.c
new file mode 100644
index 000000000..22340ebc5
--- /dev/null
+++ b/module/zfs/dmu_diff.c
@@ -0,0 +1,221 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+#include <sys/zio_checksum.h>
+#include <sys/zfs_znode.h>
+
+struct diffarg {
+ struct vnode *da_vp; /* file to which we are reporting */
+ offset_t *da_offp;
+ int da_err; /* error that stopped diff search */
+ dmu_diff_record_t da_ddr;
+};
+
+static int
+write_record(struct diffarg *da)
+{
+ ssize_t resid; /* have to get resid to get detailed errno */
+
+ if (da->da_ddr.ddr_type == DDR_NONE) {
+ da->da_err = 0;
+ return (0);
+ }
+
+ da->da_err = vn_rdwr(UIO_WRITE, da->da_vp, (caddr_t)&da->da_ddr,
+ sizeof (da->da_ddr), 0, UIO_SYSSPACE, FAPPEND,
+ RLIM64_INFINITY, CRED(), &resid);
+ *da->da_offp += sizeof (da->da_ddr);
+ return (da->da_err);
+}
+
+static int
+report_free_dnode_range(struct diffarg *da, uint64_t first, uint64_t last)
+{
+ ASSERT(first <= last);
+ if (da->da_ddr.ddr_type != DDR_FREE ||
+ first != da->da_ddr.ddr_last + 1) {
+ if (write_record(da) != 0)
+ return (da->da_err);
+ da->da_ddr.ddr_type = DDR_FREE;
+ da->da_ddr.ddr_first = first;
+ da->da_ddr.ddr_last = last;
+ return (0);
+ }
+ da->da_ddr.ddr_last = last;
+ return (0);
+}
+
+static int
+report_dnode(struct diffarg *da, uint64_t object, dnode_phys_t *dnp)
+{
+ ASSERT(dnp != NULL);
+ if (dnp->dn_type == DMU_OT_NONE)
+ return (report_free_dnode_range(da, object, object));
+
+ if (da->da_ddr.ddr_type != DDR_INUSE ||
+ object != da->da_ddr.ddr_last + 1) {
+ if (write_record(da) != 0)
+ return (da->da_err);
+ da->da_ddr.ddr_type = DDR_INUSE;
+ da->da_ddr.ddr_first = da->da_ddr.ddr_last = object;
+ return (0);
+ }
+ da->da_ddr.ddr_last = object;
+ return (0);
+}
+
+#define DBP_SPAN(dnp, level) \
+ (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
+ (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
+
+/* ARGSUSED */
+static int
+diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
+ const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+ struct diffarg *da = arg;
+ int err = 0;
+
+ if (issig(JUSTLOOKING) && issig(FORREAL))
+ return (EINTR);
+
+ if (zb->zb_object != DMU_META_DNODE_OBJECT)
+ return (0);
+
+ if (bp == NULL) {
+ uint64_t span = DBP_SPAN(dnp, zb->zb_level);
+ uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
+
+ err = report_free_dnode_range(da, dnobj,
+ dnobj + (span >> DNODE_SHIFT) - 1);
+ if (err)
+ return (err);
+ } else if (zb->zb_level == 0) {
+ dnode_phys_t *blk;
+ arc_buf_t *abuf;
+ uint32_t aflags = ARC_WAIT;
+ int blksz = BP_GET_LSIZE(bp);
+ int i;
+
+ if (dsl_read(NULL, spa, bp, pbuf,
+ arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
+ ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
+ return (EIO);
+
+ blk = abuf->b_data;
+ for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
+ uint64_t dnobj = (zb->zb_blkid <<
+ (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
+ err = report_dnode(da, dnobj, blk+i);
+ if (err)
+ break;
+ }
+ (void) arc_buf_remove_ref(abuf, &abuf);
+ if (err)
+ return (err);
+ /* Don't care about the data blocks */
+ return (TRAVERSE_VISIT_NO_CHILDREN);
+ }
+ return (0);
+}
+
+int
+dmu_diff(objset_t *tosnap, objset_t *fromsnap, struct vnode *vp, offset_t *offp)
+{
+ struct diffarg da;
+ dsl_dataset_t *ds = tosnap->os_dsl_dataset;
+ dsl_dataset_t *fromds = fromsnap->os_dsl_dataset;
+ dsl_dataset_t *findds;
+ dsl_dataset_t *relds;
+ int err = 0;
+
+ /* make certain we are looking at snapshots */
+ if (!dsl_dataset_is_snapshot(ds) || !dsl_dataset_is_snapshot(fromds))
+ return (EINVAL);
+
+ /* fromsnap must be earlier and from the same lineage as tosnap */
+ if (fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg)
+ return (EXDEV);
+
+ relds = NULL;
+ findds = ds;
+
+ while (fromds->ds_dir != findds->ds_dir) {
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+ if (!dsl_dir_is_clone(findds->ds_dir)) {
+ if (relds)
+ dsl_dataset_rele(relds, FTAG);
+ return (EXDEV);
+ }
+
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ err = dsl_dataset_hold_obj(dp,
+ findds->ds_dir->dd_phys->dd_origin_obj, FTAG, &findds);
+ rw_exit(&dp->dp_config_rwlock);
+
+ if (relds)
+ dsl_dataset_rele(relds, FTAG);
+
+ if (err)
+ return (EXDEV);
+
+ relds = findds;
+ }
+
+ if (relds)
+ dsl_dataset_rele(relds, FTAG);
+
+ da.da_vp = vp;
+ da.da_offp = offp;
+ da.da_ddr.ddr_type = DDR_NONE;
+ da.da_ddr.ddr_first = da.da_ddr.ddr_last = 0;
+ da.da_err = 0;
+
+ err = traverse_dataset(ds, fromds->ds_phys->ds_creation_txg,
+ TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, diff_cb, &da);
+
+ if (err) {
+ da.da_err = err;
+ } else {
+ /* write_record() sets da.da_err, which is what we return */
+ (void) write_record(&da);
+ }
+
+ return (da.da_err);
+}
diff --git a/module/zfs/dmu_object.c b/module/zfs/dmu_object.c
index 98228d403..8dff46048 100644
--- a/module/zfs/dmu_object.c
+++ b/module/zfs/dmu_object.c
@@ -33,7 +33,7 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
{
uint64_t object;
uint64_t L2_dnode_count = DNODES_PER_BLOCK <<
- (os->os_meta_dnode->dn_indblkshift - SPA_BLKPTRSHIFT);
+ (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
dnode_t *dn = NULL;
int restarted = B_FALSE;
@@ -49,7 +49,7 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
*/
if (P2PHASE(object, L2_dnode_count) == 0) {
uint64_t offset = restarted ? object << DNODE_SHIFT : 0;
- int error = dnode_next_offset(os->os_meta_dnode,
+ int error = dnode_next_offset(DMU_META_DNODE(os),
DNODE_FIND_HOLE,
&offset, 2, DNODES_PER_BLOCK >> 2, 0);
restarted = B_TRUE;
@@ -187,7 +187,7 @@ dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
uint64_t offset = (*objectp + 1) << DNODE_SHIFT;
int error;
- error = dnode_next_offset(os->os_meta_dnode,
+ error = dnode_next_offset(DMU_META_DNODE(os),
(hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);
*objectp = offset >> DNODE_SHIFT;
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
index 690e6ecde..7caebd979 100644
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -41,8 +41,26 @@
#include <sys/zil.h>
#include <sys/dmu_impl.h>
#include <sys/zfs_ioctl.h>
-#include <sys/sunddi.h>
#include <sys/sa.h>
+#include <sys/zfs_onexit.h>
+
+/*
+ * Needed to close a window in dnode_move() that allows the objset to be freed
+ * before it can be safely accessed.
+ */
+krwlock_t os_lock;
+
+void
+dmu_objset_init(void)
+{
+ rw_init(&os_lock, NULL, RW_DEFAULT, NULL);
+}
+
+void
+dmu_objset_fini(void)
+{
+ rw_destroy(&os_lock);
+}
spa_t *
dmu_objset_spa(objset_t *os)
@@ -350,7 +368,8 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
os->os_secondary_cache = ZFS_CACHE_ALL;
}
- os->os_zil_header = os->os_phys->os_zil_header;
+ if (ds == NULL || !dsl_dataset_is_snapshot(ds))
+ os->os_zil_header = os->os_phys->os_zil_header;
os->os_zil = zil_alloc(os, &os->os_zil_header);
for (i = 0; i < TXG_SIZE; i++) {
@@ -368,13 +387,16 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
- os->os_meta_dnode = dnode_special_open(os,
- &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT);
+ DMU_META_DNODE(os) = dnode_special_open(os,
+ &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT,
+ &os->os_meta_dnode);
if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) {
- os->os_userused_dnode = dnode_special_open(os,
- &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT);
- os->os_groupused_dnode = dnode_special_open(os,
- &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT);
+ DMU_USERUSED_DNODE(os) = dnode_special_open(os,
+ &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT,
+ &os->os_userused_dnode);
+ DMU_GROUPUSED_DNODE(os) = dnode_special_open(os,
+ &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT,
+ &os->os_groupused_dnode);
}
/*
@@ -401,7 +423,7 @@ dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
*osp = ds->ds_objset;
if (*osp == NULL) {
err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
- ds, &ds->ds_phys->ds_bp, osp);
+ ds, dsl_dataset_get_blkptr(ds), osp);
}
mutex_exit(&ds->ds_opening_lock);
return (err);
@@ -470,8 +492,8 @@ dmu_objset_evict_dbufs(objset_t *os)
mutex_enter(&os->os_lock);
/* process the mdn last, since the other dnodes have holds on it */
- list_remove(&os->os_dnodes, os->os_meta_dnode);
- list_insert_tail(&os->os_dnodes, os->os_meta_dnode);
+ list_remove(&os->os_dnodes, DMU_META_DNODE(os));
+ list_insert_tail(&os->os_dnodes, DMU_META_DNODE(os));
/*
* Find the first dnode with holds. We have to do this dance
@@ -497,8 +519,9 @@ dmu_objset_evict_dbufs(objset_t *os)
mutex_enter(&os->os_lock);
dn = next_dn;
}
+ dn = list_head(&os->os_dnodes);
mutex_exit(&os->os_lock);
- return (list_head(&os->os_dnodes) != os->os_meta_dnode);
+ return (dn != DMU_META_DNODE(os));
}
void
@@ -539,16 +562,26 @@ dmu_objset_evict(objset_t *os)
*/
(void) dmu_objset_evict_dbufs(os);
- dnode_special_close(os->os_meta_dnode);
- if (os->os_userused_dnode) {
- dnode_special_close(os->os_userused_dnode);
- dnode_special_close(os->os_groupused_dnode);
+ dnode_special_close(&os->os_meta_dnode);
+ if (DMU_USERUSED_DNODE(os)) {
+ dnode_special_close(&os->os_userused_dnode);
+ dnode_special_close(&os->os_groupused_dnode);
}
zil_free(os->os_zil);
ASSERT3P(list_head(&os->os_dnodes), ==, NULL);
VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf) == 1);
+
+ /*
+ * This is a barrier to prevent the objset from going away in
+ * dnode_move() until we can safely ensure that the objset is still in
+ * use. We consider the objset valid before the barrier and invalid
+ * after the barrier.
+ */
+ rw_enter(&os_lock, RW_READER);
+ rw_exit(&os_lock);
+
mutex_destroy(&os->os_lock);
mutex_destroy(&os->os_obj_lock);
mutex_destroy(&os->os_user_ptr_lock);
@@ -570,12 +603,12 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
dnode_t *mdn;
ASSERT(dmu_tx_is_syncing(tx));
- if (ds)
- mutex_enter(&ds->ds_opening_lock);
- VERIFY(0 == dmu_objset_open_impl(spa, ds, bp, &os));
- if (ds)
- mutex_exit(&ds->ds_opening_lock);
- mdn = os->os_meta_dnode;
+ if (ds != NULL)
+ VERIFY(0 == dmu_objset_from_ds(ds, &os));
+ else
+ VERIFY(0 == dmu_objset_open_impl(spa, NULL, bp, &os));
+
+ mdn = DMU_META_DNODE(os);
dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx);
@@ -663,34 +696,33 @@ static void
dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dir_t *dd = arg1;
+ spa_t *spa = dd->dd_pool->dp_spa;
struct oscarg *oa = arg2;
- uint64_t dsobj;
+ uint64_t obj;
ASSERT(dmu_tx_is_syncing(tx));
- dsobj = dsl_dataset_create_sync(dd, oa->lastname,
+ obj = dsl_dataset_create_sync(dd, oa->lastname,
oa->clone_origin, oa->flags, oa->cr, tx);
if (oa->clone_origin == NULL) {
+ dsl_pool_t *dp = dd->dd_pool;
dsl_dataset_t *ds;
blkptr_t *bp;
objset_t *os;
- VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, dsobj,
- FTAG, &ds));
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
bp = dsl_dataset_get_blkptr(ds);
ASSERT(BP_IS_HOLE(bp));
- os = dmu_objset_create_impl(dsl_dataset_get_spa(ds),
- ds, bp, oa->type, tx);
+ os = dmu_objset_create_impl(spa, ds, bp, oa->type, tx);
if (oa->userfunc)
oa->userfunc(os, oa->userarg, oa->cr, tx);
dsl_dataset_rele(ds, FTAG);
}
- spa_history_log_internal(LOG_DS_CREATE, dd->dd_pool->dp_spa,
- tx, "dataset = %llu", dsobj);
+ spa_history_log_internal(LOG_DS_CREATE, spa, tx, "dataset = %llu", obj);
}
int
@@ -758,18 +790,8 @@ dmu_objset_destroy(const char *name, boolean_t defer)
dsl_dataset_t *ds;
int error;
- /*
- * dsl_dataset_destroy() can free any claimed-but-unplayed
- * intent log, but if there is an active log, it has blocks that
- * are allocated, but may not yet be reflected in the on-disk
- * structure. Only the ZIL knows how to free them, so we have
- * to call into it here.
- */
error = dsl_dataset_own(name, B_TRUE, FTAG, &ds);
if (error == 0) {
- objset_t *os;
- if (dmu_objset_from_ds(ds, &os) == 0)
- zil_destroy(dmu_objset_zil(os), B_FALSE);
error = dsl_dataset_destroy(ds, FTAG, defer);
/* dsl_dataset_destroy() closes the ds. */
}
@@ -780,9 +802,14 @@ dmu_objset_destroy(const char *name, boolean_t defer)
struct snaparg {
dsl_sync_task_group_t *dstg;
char *snapname;
+ char *htag;
char failed[MAXPATHLEN];
boolean_t recursive;
+ boolean_t needsuspend;
+ boolean_t temporary;
nvlist_t *props;
+ struct dsl_ds_holdarg *ha; /* only needed in the temporary case */
+ dsl_dataset_t *newds;
};
static int
@@ -790,11 +817,41 @@ snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
objset_t *os = arg1;
struct snaparg *sn = arg2;
+ int error;
/* The props have already been checked by zfs_check_userprops(). */
- return (dsl_dataset_snapshot_check(os->os_dsl_dataset,
- sn->snapname, tx));
+ error = dsl_dataset_snapshot_check(os->os_dsl_dataset,
+ sn->snapname, tx);
+ if (error)
+ return (error);
+
+ if (sn->temporary) {
+ /*
+ * Ideally we would just call
+ * dsl_dataset_user_hold_check() and
+ * dsl_dataset_destroy_check() here. However the
+ * dataset we want to hold and destroy is the snapshot
+ * that we just confirmed we can create, but it won't
+ * exist until after these checks are run. Do any
+ * checks we can here and if more checks are added to
+ * those routines in the future, similar checks may be
+ * necessary here.
+ */
+ if (spa_version(os->os_spa) < SPA_VERSION_USERREFS)
+ return (ENOTSUP);
+ /*
+ * Not checking number of tags because the tag will be
+ * unique, as it will be the only tag.
+ */
+ if (strlen(sn->htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN)
+ return (E2BIG);
+
+ sn->ha = kmem_alloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
+ sn->ha->temphold = B_TRUE;
+ sn->ha->htag = sn->htag;
+ }
+ return (error);
}
static void
@@ -812,6 +869,19 @@ snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx)
pa.pa_source = ZPROP_SRC_LOCAL;
dsl_props_set_sync(ds->ds_prev, &pa, tx);
}
+
+ if (sn->temporary) {
+ struct dsl_ds_destroyarg da;
+
+ dsl_dataset_user_hold_sync(ds->ds_prev, sn->ha, tx);
+ kmem_free(sn->ha, sizeof (struct dsl_ds_holdarg));
+ sn->ha = NULL;
+ sn->newds = ds->ds_prev;
+
+ da.ds = ds->ds_prev;
+ da.defer = B_TRUE;
+ dsl_dataset_destroy_sync(&da, FTAG, tx);
+ }
}
static int
@@ -857,29 +927,27 @@ dmu_objset_snapshot_one(const char *name, void *arg)
return (sn->recursive ? 0 : EBUSY);
}
- /*
- * NB: we need to wait for all in-flight changes to get to disk,
- * so that we snapshot those changes. zil_suspend does this as
- * a side effect.
- */
- err = zil_suspend(dmu_objset_zil(os));
- if (err == 0) {
- dsl_sync_task_create(sn->dstg, snapshot_check,
- snapshot_sync, os, sn, 3);
- } else {
- dmu_objset_rele(os, sn);
+ if (sn->needsuspend) {
+ err = zil_suspend(dmu_objset_zil(os));
+ if (err) {
+ dmu_objset_rele(os, sn);
+ return (err);
+ }
}
+ dsl_sync_task_create(sn->dstg, snapshot_check, snapshot_sync,
+ os, sn, 3);
- return (err);
+ return (0);
}
int
-dmu_objset_snapshot(char *fsname, char *snapname,
- nvlist_t *props, boolean_t recursive)
+dmu_objset_snapshot(char *fsname, char *snapname, char *tag,
+ nvlist_t *props, boolean_t recursive, boolean_t temporary, int cleanup_fd)
{
dsl_sync_task_t *dst;
struct snaparg sn;
spa_t *spa;
+ minor_t minor;
int err;
(void) strcpy(sn.failed, fsname);
@@ -888,10 +956,26 @@ dmu_objset_snapshot(char *fsname, char *snapname,
if (err)
return (err);
+ if (temporary) {
+ if (cleanup_fd < 0) {
+ spa_close(spa, FTAG);
+ return (EINVAL);
+ }
+ if ((err = zfs_onexit_fd_hold(cleanup_fd, &minor)) != 0) {
+ spa_close(spa, FTAG);
+ return (err);
+ }
+ }
+
sn.dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
sn.snapname = snapname;
+ sn.htag = tag;
sn.props = props;
sn.recursive = recursive;
+ sn.needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
+ sn.temporary = temporary;
+ sn.ha = NULL;
+ sn.newds = NULL;
if (recursive) {
err = dmu_objset_find(fsname,
@@ -907,14 +991,20 @@ dmu_objset_snapshot(char *fsname, char *snapname,
dst = list_next(&sn.dstg->dstg_tasks, dst)) {
objset_t *os = dst->dst_arg1;
dsl_dataset_t *ds = os->os_dsl_dataset;
- if (dst->dst_err)
+ if (dst->dst_err) {
dsl_dataset_name(ds, sn.failed);
- zil_resume(dmu_objset_zil(os));
+ } else if (temporary) {
+ dsl_register_onexit_hold_cleanup(sn.newds, tag, minor);
+ }
+ if (sn.needsuspend)
+ zil_resume(dmu_objset_zil(os));
dmu_objset_rele(os, &sn);
}
if (err)
(void) strcpy(fsname, sn.failed);
+ if (temporary)
+ zfs_onexit_fd_rele(cleanup_fd);
dsl_sync_task_group_destroy(sn.dstg);
spa_close(spa, FTAG);
return (err);
@@ -1035,17 +1125,17 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
/*
* Sync special dnodes - the parent IO for the sync is the root block
*/
- os->os_meta_dnode->dn_zio = zio;
- dnode_sync(os->os_meta_dnode, tx);
+ DMU_META_DNODE(os)->dn_zio = zio;
+ dnode_sync(DMU_META_DNODE(os), tx);
os->os_phys->os_flags = os->os_flags;
- if (os->os_userused_dnode &&
- os->os_userused_dnode->dn_type != DMU_OT_NONE) {
- os->os_userused_dnode->dn_zio = zio;
- dnode_sync(os->os_userused_dnode, tx);
- os->os_groupused_dnode->dn_zio = zio;
- dnode_sync(os->os_groupused_dnode, tx);
+ if (DMU_USERUSED_DNODE(os) &&
+ DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
+ DMU_USERUSED_DNODE(os)->dn_zio = zio;
+ dnode_sync(DMU_USERUSED_DNODE(os), tx);
+ DMU_GROUPUSED_DNODE(os)->dn_zio = zio;
+ dnode_sync(DMU_GROUPUSED_DNODE(os), tx);
}
txgoff = tx->tx_txg & TXG_MASK;
@@ -1063,7 +1153,7 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx);
dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx);
- list = &os->os_meta_dnode->dn_dirty_records[txgoff];
+ list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
while (dr = list_head(list)) {
ASSERT(dr->dr_dbuf->db_level == 0);
list_remove(list, dr);
@@ -1085,7 +1175,16 @@ dmu_objset_is_dirty(objset_t *os, uint64_t txg)
!list_is_empty(&os->os_free_dnodes[txg & TXG_MASK]));
}
-objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES];
+/*
+ * Return B_TRUE if dmu_objset_is_dirty() reports the objset dirty for any
+ * of the TXG_SIZE txg slots, B_FALSE otherwise.
+ */
+boolean_t
+dmu_objset_is_dirty_anywhere(objset_t *os)
+{
+ int t;
+
+ /* Declared above rather than in the for-init so C89/gnu89 builds work. */
+ for (t = 0; t < TXG_SIZE; t++) {
+ if (dmu_objset_is_dirty(os, t))
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES];
void
dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb)
@@ -1097,8 +1196,8 @@ boolean_t
dmu_objset_userused_enabled(objset_t *os)
{
return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE &&
- used_cbs[os->os_phys->os_type] &&
- os->os_userused_dnode);
+ used_cbs[os->os_phys->os_type] != NULL &&
+ DMU_USERUSED_DNODE(os) != NULL);
}
static void
@@ -1125,13 +1224,14 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os));
while (dn = list_head(list)) {
+ int flags;
ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE ||
dn->dn_phys->dn_flags &
DNODE_FLAG_USERUSED_ACCOUNTED);
/* Allocate the user/groupused objects if necessary. */
- if (os->os_userused_dnode->dn_type == DMU_OT_NONE) {
+ if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
VERIFY(0 == zap_create_claim(os,
DMU_USERUSED_OBJECT,
DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
@@ -1148,18 +1248,19 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
* a bprewrite.
*/
- mutex_enter(&dn->dn_mtx);
- ASSERT(dn->dn_id_flags);
- if (dn->dn_id_flags & DN_ID_OLD_EXIST) {
+ flags = dn->dn_id_flags;
+ ASSERT(flags);
+ if (flags & DN_ID_OLD_EXIST) {
do_userquota_update(os, dn->dn_oldused, dn->dn_oldflags,
dn->dn_olduid, dn->dn_oldgid, B_TRUE, tx);
}
- if (dn->dn_id_flags & DN_ID_NEW_EXIST) {
+ if (flags & DN_ID_NEW_EXIST) {
do_userquota_update(os, DN_USED_BYTES(dn->dn_phys),
dn->dn_phys->dn_flags, dn->dn_newuid,
dn->dn_newgid, B_FALSE, tx);
}
+ mutex_enter(&dn->dn_mtx);
dn->dn_oldused = 0;
dn->dn_oldflags = 0;
if (dn->dn_id_flags & DN_ID_NEW_EXIST) {
@@ -1199,13 +1300,23 @@ dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx)
if (dr->dr_txg == tx->tx_txg)
break;
- if (dr == NULL)
+ if (dr == NULL) {
data = NULL;
- else if (dr->dr_dbuf->db_dnode->dn_bonuslen == 0 &&
- dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID)
- data = dr->dt.dl.dr_data->b_data;
- else
- data = dr->dt.dl.dr_data;
+ } else {
+ dnode_t *dn;
+
+ DB_DNODE_ENTER(dr->dr_dbuf);
+ dn = DB_DNODE(dr->dr_dbuf);
+
+ if (dn->dn_bonuslen == 0 &&
+ dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID)
+ data = dr->dt.dl.dr_data->b_data;
+ else
+ data = dr->dt.dl.dr_data;
+
+ DB_DNODE_EXIT(dr->dr_dbuf);
+ }
+
return (data);
}
@@ -1242,7 +1353,8 @@ dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
if (RW_WRITE_HELD(&dn->dn_struct_rwlock))
rf |= DB_RF_HAVESTRUCT;
- error = dmu_spill_hold_by_dnode(dn, rf,
+ error = dmu_spill_hold_by_dnode(dn,
+ rf | DB_RF_MUST_SUCCEED,
FTAG, (dmu_buf_t **)&db);
ASSERT(error == 0);
mutex_enter(&db->db_mtx);
diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c
index 6b00b73b4..e47d533a4 100644
--- a/module/zfs/dmu_send.c
+++ b/module/zfs/dmu_send.c
@@ -42,6 +42,7 @@
#include <zfs_fletcher.h>
#include <sys/avl.h>
#include <sys/ddt.h>
+#include <sys/zfs_onexit.h>
static char *dmu_recv_tag = "dmu_recv_tag";
@@ -573,6 +574,14 @@ recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx)
if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds))
return (ETXTBSY);
+ /* new snapshot name must not exist */
+ err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
+ ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val);
+ if (err == 0)
+ return (EEXIST);
+ if (err != ENOENT)
+ return (err);
+
if (rbsa->fromguid) {
/* if incremental, most recent snapshot must match fromguid */
if (ds->ds_prev == NULL)
@@ -620,13 +629,6 @@ recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx)
if (err != ENOENT)
return (err);
- /* new snapshot name must not exist */
- err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
- ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val);
- if (err == 0)
- return (EEXIST);
- if (err != ENOENT)
- return (err);
return (0);
}
@@ -661,7 +663,6 @@ recv_existing_sync(void *arg1, void *arg2, dmu_tx_t *tx)
dp->dp_spa, tx, "dataset = %lld", dsobj);
}
-
static boolean_t
dmu_recv_verify_features(dsl_dataset_t *ds, struct drr_begin *drrb)
{
@@ -786,7 +787,7 @@ dmu_recv_begin(char *tofs, char *tosnap, char *top_ds, struct drr_begin *drrb,
return (err);
if (dmu_recv_verify_features(ds, drrb)) {
- dsl_dataset_rele(ds, dmu_recv_tag);
+ dsl_dataset_rele(ds, FTAG);
return (ENOTSUP);
}
@@ -810,7 +811,7 @@ struct restorearg {
uint64_t voff;
int bufsize; /* amount of memory allocated for buf */
zio_cksum_t cksum;
- avl_tree_t guid_to_ds_map;
+ avl_tree_t *guid_to_ds_map;
};
typedef struct guid_map_entry {
@@ -887,6 +888,21 @@ find_ds_by_guid(const char *name, void *arg)
return (0);
}
+/*
+ * Tear down a guid-to-dataset map; registered with zfs_onexit_add_cb() by
+ * dmu_recv_stream(). Each entry's dataset is released (using the tree
+ * pointer as the tag) and the entry freed, then the AVL tree is destroyed
+ * and its own allocation returned.
+ */
+static void
+free_guid_map_onexit(void *arg)
+{
+ avl_tree_t *map = arg;
+ guid_map_entry_t *entry;
+ void *walk = NULL;
+
+ for (;;) {
+ entry = avl_destroy_nodes(map, &walk);
+ if (entry == NULL)
+ break;
+ dsl_dataset_rele(entry->gme_ds, map);
+ kmem_free(entry, sizeof (guid_map_entry_t));
+ }
+ avl_destroy(map);
+ kmem_free(map, sizeof (avl_tree_t));
+}
+
static void *
restore_read(struct restorearg *ra, int len)
{
@@ -1173,7 +1189,7 @@ restore_write_byref(struct restorearg *ra, objset_t *os,
*/
if (drrwbr->drr_toguid != drrwbr->drr_refguid) {
gmesrch.guid = drrwbr->drr_refguid;
- if ((gmep = avl_find(&ra->guid_to_ds_map, &gmesrch,
+ if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch,
&where)) == NULL) {
return (EINVAL);
}
@@ -1276,13 +1292,13 @@ restore_free(struct restorearg *ra, objset_t *os,
* NB: callers *must* call dmu_recv_end() if this succeeds.
*/
int
-dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp)
+dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
+ int cleanup_fd, uint64_t *action_handlep)
{
struct restorearg ra = { 0 };
dmu_replay_record_t *drr;
objset_t *os;
zio_cksum_t pcksum;
- guid_map_entry_t *gmep;
int featureflags;
if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
@@ -1336,12 +1352,38 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp)
/* if this stream is dedup'ed, set up the avl tree for guid mapping */
if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
- avl_create(&ra.guid_to_ds_map, guid_compare,
- sizeof (guid_map_entry_t),
- offsetof(guid_map_entry_t, avlnode));
- (void) dmu_objset_find(drc->drc_top_ds, find_ds_by_guid,
- (void *)&ra.guid_to_ds_map,
- DS_FIND_CHILDREN);
+ minor_t minor;
+
+ if (cleanup_fd == -1) {
+ ra.err = EBADF;
+ goto out;
+ }
+ ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor);
+ if (ra.err) {
+ cleanup_fd = -1;
+ goto out;
+ }
+
+ if (*action_handlep == 0) {
+ ra.guid_to_ds_map =
+ kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
+ avl_create(ra.guid_to_ds_map, guid_compare,
+ sizeof (guid_map_entry_t),
+ offsetof(guid_map_entry_t, avlnode));
+ (void) dmu_objset_find(drc->drc_top_ds, find_ds_by_guid,
+ (void *)ra.guid_to_ds_map,
+ DS_FIND_CHILDREN);
+ ra.err = zfs_onexit_add_cb(minor,
+ free_guid_map_onexit, ra.guid_to_ds_map,
+ action_handlep);
+ if (ra.err) {
+ /*
+ * The cleanup callback was not registered, so
+ * nothing else will ever release the map and
+ * the dataset holds it carries; free it here.
+ */
+ free_guid_map_onexit(ra.guid_to_ds_map);
+ ra.guid_to_ds_map = NULL;
+ goto out;
+ }
+ } else {
+ ra.err = zfs_onexit_cb_data(minor, *action_handlep,
+ (void **)&ra.guid_to_ds_map);
+ if (ra.err)
+ goto out;
+ }
}
/*
@@ -1423,6 +1465,9 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp)
ASSERT(ra.err != 0);
out:
+ if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1))
+ zfs_onexit_fd_rele(cleanup_fd);
+
if (ra.err != 0) {
/*
* destroy what we created, so we don't leave it in the
@@ -1438,16 +1483,6 @@ out:
}
}
- if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
- void *cookie = NULL;
-
- while (gmep = avl_destroy_nodes(&ra.guid_to_ds_map, &cookie)) {
- dsl_dataset_rele(gmep->gme_ds, &ra.guid_to_ds_map);
- kmem_free(gmep, sizeof (guid_map_entry_t));
- }
- avl_destroy(&ra.guid_to_ds_map);
- }
-
kmem_free(ra.buf, ra.bufsize);
*voffp = ra.voff;
return (ra.err);
diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c
index 429c76ae1..023f90e12 100644
--- a/module/zfs/dmu_traverse.c
+++ b/module/zfs/dmu_traverse.c
@@ -36,7 +36,9 @@
#include <sys/sa_impl.h>
#include <sys/callb.h>
-struct prefetch_data {
+int zfs_pd_blks_max = 100;
+
+typedef struct prefetch_data {
kmutex_t pd_mtx;
kcondvar_t pd_cv;
int pd_blks_max;
@@ -44,27 +46,26 @@ struct prefetch_data {
int pd_flags;
boolean_t pd_cancel;
boolean_t pd_exited;
-};
+} prefetch_data_t;
-struct traverse_data {
+typedef struct traverse_data {
spa_t *td_spa;
uint64_t td_objset;
blkptr_t *td_rootbp;
uint64_t td_min_txg;
int td_flags;
- struct prefetch_data *td_pfd;
+ prefetch_data_t *td_pfd;
blkptr_cb_t *td_func;
void *td_arg;
-};
+} traverse_data_t;
-static int traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp,
+static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
arc_buf_t *buf, uint64_t objset, uint64_t object);
-/* ARGSUSED */
static int
traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
{
- struct traverse_data *td = arg;
+ traverse_data_t *td = arg;
zbookmark_t zb;
if (bp->blk_birth == 0)
@@ -81,11 +82,10 @@ traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
return (0);
}
-/* ARGSUSED */
static int
traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
{
- struct traverse_data *td = arg;
+ traverse_data_t *td = arg;
if (lrc->lrc_txtype == TX_WRITE) {
lr_write_t *lr = (lr_write_t *)lrc;
@@ -98,8 +98,8 @@ traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
if (claim_txg == 0 || bp->blk_birth < claim_txg)
return (0);
- SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid, ZB_ZIL_LEVEL,
- lr->lr_offset / BP_GET_LSIZE(bp));
+ SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid,
+ ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
(void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL,
td->td_arg);
@@ -108,7 +108,7 @@ traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
}
static void
-traverse_zil(struct traverse_data *td, zil_header_t *zh)
+traverse_zil(traverse_data_t *td, zil_header_t *zh)
{
uint64_t claim_txg = zh->zh_claim_txg;
zilog_t *zilog;
@@ -129,13 +129,13 @@ traverse_zil(struct traverse_data *td, zil_header_t *zh)
}
static int
-traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
+traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
{
zbookmark_t czb;
int err = 0, lasterr = 0;
arc_buf_t *buf = NULL;
- struct prefetch_data *pd = td->td_pfd;
+ prefetch_data_t *pd = td->td_pfd;
boolean_t hard = td->td_flags & TRAVERSE_HARD;
if (bp->blk_birth == 0) {
@@ -162,6 +162,8 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
if (td->td_flags & TRAVERSE_PRE) {
err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp,
td->td_arg);
+ if (err == TRAVERSE_VISIT_NO_CHILDREN)
+ return (0);
if (err)
return (err);
}
@@ -225,8 +227,6 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
return (err);
osp = buf->b_data;
- traverse_zil(td, &osp->os_zil_header);
-
dnp = &osp->os_meta_dnode;
err = traverse_dnode(td, dnp, buf, zb->zb_objset,
DMU_META_DNODE_OBJECT);
@@ -262,7 +262,7 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
}
static int
-traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp,
+traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
arc_buf_t *buf, uint64_t objset, uint64_t object)
{
int j, err = 0, lasterr = 0;
@@ -300,7 +300,7 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp,
void *arg)
{
- struct prefetch_data *pfd = arg;
+ prefetch_data_t *pfd = arg;
uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
ASSERT(pfd->pd_blks_fetched >= 0);
@@ -330,8 +330,8 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
static void
traverse_prefetch_thread(void *arg)
{
- struct traverse_data *td_main = arg;
- struct traverse_data td = *td_main;
+ traverse_data_t *td_main = arg;
+ traverse_data_t td = *td_main;
zbookmark_t czb;
td.td_func = traverse_prefetcher;
@@ -353,16 +353,16 @@ traverse_prefetch_thread(void *arg)
* in syncing context).
*/
static int
-traverse_impl(spa_t *spa, uint64_t objset, blkptr_t *rootbp,
+traverse_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *rootbp,
uint64_t txg_start, int flags, blkptr_cb_t func, void *arg)
{
- struct traverse_data td;
- struct prefetch_data pd = { 0 };
+ traverse_data_t td;
+ prefetch_data_t pd = { 0 };
zbookmark_t czb;
int err;
td.td_spa = spa;
- td.td_objset = objset;
+ td.td_objset = ds ? ds->ds_object : 0;
td.td_rootbp = rootbp;
td.td_min_txg = txg_start;
td.td_func = func;
@@ -370,17 +370,28 @@ traverse_impl(spa_t *spa, uint64_t objset, blkptr_t *rootbp,
td.td_pfd = &pd;
td.td_flags = flags;
- pd.pd_blks_max = 100;
+ pd.pd_blks_max = zfs_pd_blks_max;
pd.pd_flags = flags;
mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL);
cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL);
+ /* See comment on ZIL traversal in dsl_scan_visitds. */
+ if (ds != NULL && !dsl_dataset_is_snapshot(ds)) {
+ objset_t *os;
+
+ err = dmu_objset_from_ds(ds, &os);
+ if (err) {
+ /*
+ * pd's mutex and cv were initialized just above;
+ * tear them down before the early return.
+ */
+ mutex_destroy(&pd.pd_mtx);
+ cv_destroy(&pd.pd_cv);
+ return (err);
+ }
+
+ traverse_zil(&td, &os->os_zil_header);
+ }
+
+
if (!(flags & TRAVERSE_PREFETCH) ||
0 == taskq_dispatch(system_taskq, traverse_prefetch_thread,
&td, TQ_NOQUEUE))
pd.pd_exited = B_TRUE;
- SET_BOOKMARK(&czb, objset,
+ SET_BOOKMARK(&czb, td.td_objset,
ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
err = traverse_visitbp(&td, NULL, NULL, rootbp, &czb);
@@ -405,7 +416,7 @@ int
traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags,
blkptr_cb_t func, void *arg)
{
- return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds->ds_object,
+ return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds,
&ds->ds_phys->ds_bp, txg_start, flags, func, arg));
}
@@ -423,7 +434,7 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
boolean_t hard = (flags & TRAVERSE_HARD);
/* visit the MOS */
- err = traverse_impl(spa, 0, spa_get_rootblkptr(spa),
+ err = traverse_impl(spa, NULL, spa_get_rootblkptr(spa),
txg_start, flags, func, arg);
if (err)
return (err);
diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c
index 5fc062c16..bd5c71a22 100644
--- a/module/zfs/dmu_tx.c
+++ b/module/zfs/dmu_tx.c
@@ -186,7 +186,7 @@ dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db,
ASSERT(level != 0);
db = NULL;
} else {
- ASSERT(db->db_dnode == dn);
+ ASSERT(DB_DNODE(db) == dn);
ASSERT(db->db_level == level);
ASSERT(db->db.db_size == space);
ASSERT(db->db_blkid == blkid);
@@ -384,7 +384,7 @@ static void
dmu_tx_count_dnode(dmu_tx_hold_t *txh)
{
dnode_t *dn = txh->txh_dnode;
- dnode_t *mdn = txh->txh_tx->tx_objset->os_meta_dnode;
+ dnode_t *mdn = DMU_META_DNODE(txh->txh_tx->tx_objset);
uint64_t space = mdn->dn_datablksz +
((mdn->dn_nlevels-1) << mdn->dn_indblkshift);
@@ -787,18 +787,24 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
{
dmu_tx_hold_t *txh;
int match_object = FALSE, match_offset = FALSE;
- dnode_t *dn = db->db_dnode;
+ dnode_t *dn;
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
ASSERT(tx->tx_txg != 0);
ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
ASSERT3U(dn->dn_object, ==, db->db.db_object);
- if (tx->tx_anyobj)
+ if (tx->tx_anyobj) {
+ DB_DNODE_EXIT(db);
return;
+ }
/* XXX No checking on the meta dnode for now */
- if (db->db.db_object == DMU_META_DNODE_OBJECT)
+ if (db->db.db_object == DMU_META_DNODE_OBJECT) {
+ DB_DNODE_EXIT(db);
return;
+ }
for (txh = list_head(&tx->tx_holds); txh;
txh = list_next(&tx->tx_holds, txh)) {
@@ -870,9 +876,12 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
ASSERT(!"bad txh_type");
}
}
- if (match_object && match_offset)
+ if (match_object && match_offset) {
+ DB_DNODE_EXIT(db);
return;
+ }
}
+ DB_DNODE_EXIT(db);
panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
(u_longlong_t)db->db.db_object, db->db_level,
(u_longlong_t)db->db_blkid);
@@ -1355,9 +1364,19 @@ dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
- if (sa->sa_force_spill || may_grow || hdl->sa_spill ||
- ((dmu_buf_impl_t *)hdl->sa_bonus)->db_dnode->dn_have_spill) {
+ if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
ASSERT(tx->tx_txg == 0);
dmu_tx_hold_spill(tx, object);
+ } else {
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
+ dnode_t *dn;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ if (dn->dn_have_spill) {
+ ASSERT(tx->tx_txg == 0);
+ dmu_tx_hold_spill(tx, object);
+ }
+ DB_DNODE_EXIT(db);
}
}
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index c16902d21..850dd5816 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -38,19 +38,33 @@
static int free_range_compar(const void *node1, const void *node2);
static kmem_cache_t *dnode_cache;
+/*
+ * Define DNODE_STATS to turn on statistic gathering. By default, it is only
+ * turned on when DEBUG is also defined.
+ */
+#ifdef DEBUG
+#define DNODE_STATS
+#endif /* DEBUG */
+
+#ifdef DNODE_STATS
+#define DNODE_STAT_ADD(stat) ((stat)++)
+#else
+#define DNODE_STAT_ADD(stat) /* nothing */
+#endif /* DNODE_STATS */
static dnode_phys_t dnode_phys_zero;
int zfs_default_bs = SPA_MINBLOCKSHIFT;
int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
+static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
+
/* ARGSUSED */
static int
dnode_cons(void *arg, void *unused, int kmflag)
{
- int i;
dnode_t *dn = arg;
- bzero(dn, sizeof (dnode_t));
+ int i;
rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL);
mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
@@ -59,8 +73,18 @@ dnode_cons(void *arg, void *unused, int kmflag)
refcount_create(&dn->dn_holds);
refcount_create(&dn->dn_tx_holds);
+ list_link_init(&dn->dn_link);
+
+ bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr));
+ bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels));
+ bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift));
+ bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype));
+ bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk));
+ bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen));
+ bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz));
for (i = 0; i < TXG_SIZE; i++) {
+ list_link_init(&dn->dn_dirty_link[i]);
avl_create(&dn->dn_ranges[i], free_range_compar,
sizeof (free_range_t),
offsetof(struct free_range, fr_node));
@@ -69,9 +93,27 @@ dnode_cons(void *arg, void *unused, int kmflag)
offsetof(dbuf_dirty_record_t, dr_dirty_node));
}
+ dn->dn_allocated_txg = 0;
+ dn->dn_free_txg = 0;
+ dn->dn_assigned_txg = 0;
+ dn->dn_dirtyctx = 0;
+ dn->dn_dirtyctx_firstset = NULL;
+ dn->dn_bonus = NULL;
+ dn->dn_have_spill = B_FALSE;
+ dn->dn_zio = NULL;
+ dn->dn_oldused = 0;
+ dn->dn_oldflags = 0;
+ dn->dn_olduid = 0;
+ dn->dn_oldgid = 0;
+ dn->dn_newuid = 0;
+ dn->dn_newgid = 0;
+ dn->dn_id_flags = 0;
+
+ dn->dn_dbufs_count = 0;
list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t),
offsetof(dmu_buf_impl_t, db_link));
+ dn->dn_moved = 0;
return (0);
}
@@ -88,27 +130,56 @@ dnode_dest(void *arg, void *unused)
cv_destroy(&dn->dn_notxholds);
refcount_destroy(&dn->dn_holds);
refcount_destroy(&dn->dn_tx_holds);
+ ASSERT(!list_link_active(&dn->dn_link));
for (i = 0; i < TXG_SIZE; i++) {
+ ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
avl_destroy(&dn->dn_ranges[i]);
list_destroy(&dn->dn_dirty_records[i]);
+ ASSERT3U(dn->dn_next_nblkptr[i], ==, 0);
+ ASSERT3U(dn->dn_next_nlevels[i], ==, 0);
+ ASSERT3U(dn->dn_next_indblkshift[i], ==, 0);
+ ASSERT3U(dn->dn_next_bonustype[i], ==, 0);
+ ASSERT3U(dn->dn_rm_spillblk[i], ==, 0);
+ ASSERT3U(dn->dn_next_bonuslen[i], ==, 0);
+ ASSERT3U(dn->dn_next_blksz[i], ==, 0);
}
+ ASSERT3U(dn->dn_allocated_txg, ==, 0);
+ ASSERT3U(dn->dn_free_txg, ==, 0);
+ ASSERT3U(dn->dn_assigned_txg, ==, 0);
+ ASSERT3U(dn->dn_dirtyctx, ==, 0);
+ ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL);
+ ASSERT3P(dn->dn_bonus, ==, NULL);
+ ASSERT(!dn->dn_have_spill);
+ ASSERT3P(dn->dn_zio, ==, NULL);
+ ASSERT3U(dn->dn_oldused, ==, 0);
+ ASSERT3U(dn->dn_oldflags, ==, 0);
+ ASSERT3U(dn->dn_olduid, ==, 0);
+ ASSERT3U(dn->dn_oldgid, ==, 0);
+ ASSERT3U(dn->dn_newuid, ==, 0);
+ ASSERT3U(dn->dn_newgid, ==, 0);
+ ASSERT3U(dn->dn_id_flags, ==, 0);
+
+ ASSERT3U(dn->dn_dbufs_count, ==, 0);
list_destroy(&dn->dn_dbufs);
}
void
dnode_init(void)
{
+ ASSERT(dnode_cache == NULL); /* cache must not already exist (dnode_fini() resets it) */
dnode_cache = kmem_cache_create("dnode_t",
sizeof (dnode_t),
0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
+ kmem_cache_set_move(dnode_cache, dnode_move); /* attach dnode_move() to the cache */
}
void
dnode_fini(void)
{
kmem_cache_destroy(dnode_cache);
+ dnode_cache = NULL; /* keeps the ASSERT in dnode_init() valid if init runs again */
}
@@ -120,6 +191,7 @@ dnode_verify(dnode_t *dn)
ASSERT(dn->dn_phys);
ASSERT(dn->dn_objset);
+ ASSERT(dn->dn_handle->dnh_dnode == dn);
ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES);
@@ -298,18 +370,29 @@ dnode_setdblksz(dnode_t *dn, int size)
static dnode_t *
dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
- uint64_t object)
+ uint64_t object, dnode_handle_t *dnh)
{
dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
- (void) dnode_cons(dn, NULL, 0); /* XXX */
- dn->dn_objset = os;
+ ASSERT(!POINTER_IS_VALID(dn->dn_objset));
+ dn->dn_moved = 0;
+
+ /*
+ * Defer setting dn_objset until the dnode is ready to be a candidate
+ * for the dnode_move() callback.
+ */
dn->dn_object = object;
dn->dn_dbuf = db;
+ dn->dn_handle = dnh;
dn->dn_phys = dnp;
- if (dnp->dn_datablkszsec)
+ if (dnp->dn_datablkszsec) {
dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+ } else {
+ dn->dn_datablksz = 0;
+ dn->dn_datablkszsec = 0;
+ dn->dn_datablkshift = 0;
+ }
dn->dn_indblkshift = dnp->dn_indblkshift;
dn->dn_nlevels = dnp->dn_nlevels;
dn->dn_type = dnp->dn_type;
@@ -325,45 +408,65 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
dmu_zfetch_init(&dn->dn_zfetch, dn);
ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES);
+
mutex_enter(&os->os_lock);
list_insert_head(&os->os_dnodes, dn);
+ membar_producer();
+ /*
+ * Everything else must be valid before assigning dn_objset makes the
+ * dnode eligible for dnode_move().
+ */
+ dn->dn_objset = os;
mutex_exit(&os->os_lock);
arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER);
return (dn);
}
+/*
+ * Caller must be holding the dnode handle, which is released upon return.
+ */
static void
dnode_destroy(dnode_t *dn)
{
objset_t *os = dn->dn_objset;
-#ifdef ZFS_DEBUG
- int i;
-
- for (i = 0; i < TXG_SIZE; i++) {
- ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
- ASSERT(NULL == list_head(&dn->dn_dirty_records[i]));
- ASSERT(0 == avl_numnodes(&dn->dn_ranges[i]));
- }
- ASSERT(NULL == list_head(&dn->dn_dbufs));
-#endif
ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);
mutex_enter(&os->os_lock);
+ POINTER_INVALIDATE(&dn->dn_objset);
list_remove(&os->os_dnodes, dn);
mutex_exit(&os->os_lock);
- if (dn->dn_dirtyctx_firstset) {
+ /* the dnode can no longer move, so we can release the handle */
+ zrl_remove(&dn->dn_handle->dnh_zrlock);
+
+ dn->dn_allocated_txg = 0;
+ dn->dn_free_txg = 0;
+ dn->dn_assigned_txg = 0;
+
+ dn->dn_dirtyctx = 0;
+ if (dn->dn_dirtyctx_firstset != NULL) {
kmem_free(dn->dn_dirtyctx_firstset, 1);
dn->dn_dirtyctx_firstset = NULL;
}
- dmu_zfetch_rele(&dn->dn_zfetch);
- if (dn->dn_bonus) {
+ if (dn->dn_bonus != NULL) {
mutex_enter(&dn->dn_bonus->db_mtx);
dbuf_evict(dn->dn_bonus);
dn->dn_bonus = NULL;
}
+ dn->dn_zio = NULL;
+
+ dn->dn_have_spill = B_FALSE;
+ dn->dn_oldused = 0;
+ dn->dn_oldflags = 0;
+ dn->dn_olduid = 0;
+ dn->dn_oldgid = 0;
+ dn->dn_newuid = 0;
+ dn->dn_newgid = 0;
+ dn->dn_id_flags = 0;
+
+ dmu_zfetch_rele(&dn->dn_zfetch);
kmem_cache_free(dnode_cache, dn);
arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER);
}
@@ -408,6 +511,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
for (i = 0; i < TXG_SIZE; i++) {
+ ASSERT3U(dn->dn_next_nblkptr[i], ==, 0);
ASSERT3U(dn->dn_next_nlevels[i], ==, 0);
ASSERT3U(dn->dn_next_indblkshift[i], ==, 0);
ASSERT3U(dn->dn_next_bonuslen[i], ==, 0);
@@ -522,9 +626,304 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
mutex_exit(&dn->dn_mtx);
}
+#ifdef DNODE_STATS
+/*
+ * Counters recording why dnode_move() returned without moving a dnode. Each
+ * field is passed to DNODE_STAT_ADD() on one of dnode_move()'s early-return
+ * paths.
+ */
+static struct {
+ uint64_t dms_dnode_invalid; /* objset pointer was not valid */
+ uint64_t dms_dnode_recheck1; /* objset changed after taking os_lock */
+ uint64_t dms_dnode_recheck2; /* objset changed after os->os_lock */
+ uint64_t dms_dnode_special; /* DMU_OBJECT_IS_SPECIAL() dnode */
+ uint64_t dms_dnode_handle; /* zrl_tryenter() on the handle failed */
+ uint64_t dms_dnode_rwlock; /* rw_tryenter(dn_struct_rwlock) failed */
+ uint64_t dms_dnode_active; /* more holds than dbufs */
+} dnode_move_stats;
+#endif /* DNODE_STATS */
+
+/*
+ * Transfer the in-core state of dnode odn into ndn, repoint the dnode handle
+ * and the zfetch back pointer at ndn, and leave odn invalidated with its
+ * fields reset. The asserts at the top require that none of odn's own locks
+ * (dn_struct_rwlock, dn_mtx, dn_dbufs_mtx, zf_rwlock) are held on entry.
+ *
+ * On return ndn->dn_moved is 1 and odn->dn_moved is (uint8_t)-1.
+ */
+static void
+dnode_move_impl(dnode_t *odn, dnode_t *ndn)
+{
+ int i;
+
+ ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock));
+ ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx));
+ ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx));
+ ASSERT(!RW_LOCK_HELD(&odn->dn_zfetch.zf_rwlock));
+
+ /* Copy fields. */
+ ndn->dn_objset = odn->dn_objset;
+ ndn->dn_object = odn->dn_object;
+ ndn->dn_dbuf = odn->dn_dbuf;
+ ndn->dn_handle = odn->dn_handle;
+ ndn->dn_phys = odn->dn_phys;
+ ndn->dn_type = odn->dn_type;
+ ndn->dn_bonuslen = odn->dn_bonuslen;
+ ndn->dn_bonustype = odn->dn_bonustype;
+ ndn->dn_nblkptr = odn->dn_nblkptr;
+ ndn->dn_checksum = odn->dn_checksum;
+ ndn->dn_compress = odn->dn_compress;
+ ndn->dn_nlevels = odn->dn_nlevels;
+ ndn->dn_indblkshift = odn->dn_indblkshift;
+ ndn->dn_datablkshift = odn->dn_datablkshift;
+ ndn->dn_datablkszsec = odn->dn_datablkszsec;
+ ndn->dn_datablksz = odn->dn_datablksz;
+ ndn->dn_maxblkid = odn->dn_maxblkid;
+ bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0],
+ sizeof (odn->dn_next_nblkptr));
+ bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0],
+ sizeof (odn->dn_next_nlevels));
+ bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0],
+ sizeof (odn->dn_next_indblkshift));
+ bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0],
+ sizeof (odn->dn_next_bonustype));
+ bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0],
+ sizeof (odn->dn_rm_spillblk));
+ bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0],
+ sizeof (odn->dn_next_bonuslen));
+ bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0],
+ sizeof (odn->dn_next_blksz));
+ for (i = 0; i < TXG_SIZE; i++) {
+ list_move_tail(&ndn->dn_dirty_records[i],
+ &odn->dn_dirty_records[i]);
+ }
+ bcopy(&odn->dn_ranges[0], &ndn->dn_ranges[0], sizeof (odn->dn_ranges));
+ ndn->dn_allocated_txg = odn->dn_allocated_txg;
+ ndn->dn_free_txg = odn->dn_free_txg;
+ ndn->dn_assigned_txg = odn->dn_assigned_txg;
+ ndn->dn_dirtyctx = odn->dn_dirtyctx;
+ ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
+ ASSERT(refcount_count(&odn->dn_tx_holds) == 0);
+ refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
+ ASSERT(list_is_empty(&ndn->dn_dbufs));
+ list_move_tail(&ndn->dn_dbufs, &odn->dn_dbufs);
+ ndn->dn_dbufs_count = odn->dn_dbufs_count;
+ ndn->dn_bonus = odn->dn_bonus;
+ ndn->dn_have_spill = odn->dn_have_spill;
+ ndn->dn_zio = odn->dn_zio;
+ ndn->dn_oldused = odn->dn_oldused;
+ ndn->dn_oldflags = odn->dn_oldflags;
+ ndn->dn_olduid = odn->dn_olduid;
+ ndn->dn_oldgid = odn->dn_oldgid;
+ ndn->dn_newuid = odn->dn_newuid;
+ ndn->dn_newgid = odn->dn_newgid;
+ ndn->dn_id_flags = odn->dn_id_flags;
+ dmu_zfetch_init(&ndn->dn_zfetch, NULL);
+ list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream);
+ ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode;
+ ndn->dn_zfetch.zf_stream_cnt = odn->dn_zfetch.zf_stream_cnt;
+ ndn->dn_zfetch.zf_alloc_fail = odn->dn_zfetch.zf_alloc_fail;
+
+ /*
+ * Update back pointers. Updating the handle fixes the back pointer of
+ * every descendant dbuf as well as the bonus dbuf.
+ */
+ ASSERT(ndn->dn_handle->dnh_dnode == odn);
+ ndn->dn_handle->dnh_dnode = ndn;
+ if (ndn->dn_zfetch.zf_dnode == odn) {
+ ndn->dn_zfetch.zf_dnode = ndn;
+ }
+
+ /*
+ * Invalidate the original dnode by clearing all of its back pointers.
+ */
+ odn->dn_dbuf = NULL;
+ odn->dn_handle = NULL;
+ list_create(&odn->dn_dbufs, sizeof (dmu_buf_impl_t),
+ offsetof(dmu_buf_impl_t, db_link));
+ odn->dn_dbufs_count = 0;
+ odn->dn_bonus = NULL;
+ odn->dn_zfetch.zf_dnode = NULL;
+
+ /*
+ * Set the low bit of the objset pointer to ensure that dnode_move()
+ * recognizes the dnode as invalid in any subsequent callback.
+ */
+ POINTER_INVALIDATE(&odn->dn_objset);
+
+ /*
+ * Satisfy the destructor. Every per-txg array that was copied to ndn
+ * above is reset here, including dn_next_nblkptr, so that odn carries
+ * no stale per-txg state.
+ */
+ for (i = 0; i < TXG_SIZE; i++) {
+ list_create(&odn->dn_dirty_records[i],
+ sizeof (dbuf_dirty_record_t),
+ offsetof(dbuf_dirty_record_t, dr_dirty_node));
+ odn->dn_ranges[i].avl_root = NULL;
+ odn->dn_ranges[i].avl_numnodes = 0;
+ odn->dn_next_nblkptr[i] = 0;
+ odn->dn_next_nlevels[i] = 0;
+ odn->dn_next_indblkshift[i] = 0;
+ odn->dn_next_bonustype[i] = 0;
+ odn->dn_rm_spillblk[i] = 0;
+ odn->dn_next_bonuslen[i] = 0;
+ odn->dn_next_blksz[i] = 0;
+ }
+ odn->dn_allocated_txg = 0;
+ odn->dn_free_txg = 0;
+ odn->dn_assigned_txg = 0;
+ odn->dn_dirtyctx = 0;
+ odn->dn_dirtyctx_firstset = NULL;
+ odn->dn_have_spill = B_FALSE;
+ odn->dn_zio = NULL;
+ odn->dn_oldused = 0;
+ odn->dn_oldflags = 0;
+ odn->dn_olduid = 0;
+ odn->dn_oldgid = 0;
+ odn->dn_newuid = 0;
+ odn->dn_newgid = 0;
+ odn->dn_id_flags = 0;
+
+ /*
+ * Mark the dnode.
+ */
+ ndn->dn_moved = 1;
+ odn->dn_moved = (uint8_t)-1;
+}
+
+#ifdef _KERNEL
+/*
+ * Try to relocate the dnode at buf into newbuf. The signature and return
+ * codes are those of a kmem move callback; NOTE(review): the registration
+ * with the dnode cache is not visible here - confirm.
+ *
+ * Returns:
+ * KMEM_CBRC_DONT_KNOW - the objset pointer is not valid, or it changed
+ * while os_lock / os->os_lock were being acquired
+ * KMEM_CBRC_NO - the dnode is one of the "special" objects
+ * KMEM_CBRC_LATER - the handle lock or dn_struct_rwlock could not be
+ * taken without blocking, or the dnode has more holds than dbufs
+ * KMEM_CBRC_YES - the dnode was moved to newbuf by dnode_move_impl()
+ *
+ * size and arg are not used.
+ */
+/*ARGSUSED*/
+static kmem_cbrc_t
+dnode_move(void *buf, void *newbuf, size_t size, void *arg)
+{
+ dnode_t *odn = buf, *ndn = newbuf;
+ objset_t *os;
+ int64_t refcount;
+ uint32_t dbufs;
+
+ /*
+ * The dnode is on the objset's list of known dnodes if the objset
+ * pointer is valid. We set the low bit of the objset pointer when
+ * freeing the dnode to invalidate it, and the memory patterns written
+ * by kmem (baddcafe and deadbeef) set at least one of the two low bits.
+ * A newly created dnode sets the objset pointer last of all to indicate
+ * that the dnode is known and in a valid state to be moved by this
+ * function.
+ */
+ os = odn->dn_objset;
+ if (!POINTER_IS_VALID(os)) {
+ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_invalid);
+ return (KMEM_CBRC_DONT_KNOW);
+ }
+
+ /*
+ * Ensure that the objset does not go away during the move.
+ */
+ rw_enter(&os_lock, RW_WRITER);
+ if (os != odn->dn_objset) {
+ rw_exit(&os_lock);
+ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck1);
+ return (KMEM_CBRC_DONT_KNOW);
+ }
+
+ /*
+ * If the dnode is still valid, then so is the objset. We know that no
+ * valid objset can be freed while we hold os_lock, so we can safely
+ * ensure that the objset remains in use.
+ */
+ mutex_enter(&os->os_lock);
+
+ /*
+ * Recheck the objset pointer in case the dnode was removed just before
+ * acquiring the lock.
+ */
+ if (os != odn->dn_objset) {
+ mutex_exit(&os->os_lock);
+ rw_exit(&os_lock);
+ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck2);
+ return (KMEM_CBRC_DONT_KNOW);
+ }
+
+ /*
+ * At this point we know that as long as we hold os->os_lock, the dnode
+ * cannot be freed and fields within the dnode can be safely accessed.
+ * The objset listing this dnode cannot go away as long as this dnode is
+ * on its list.
+ */
+ rw_exit(&os_lock);
+ if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) {
+ mutex_exit(&os->os_lock);
+ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_special);
+ return (KMEM_CBRC_NO);
+ }
+ ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */
+
+ /*
+ * Lock the dnode handle to prevent the dnode from obtaining any new
+ * holds. This also prevents the descendant dbufs and the bonus dbuf
+ * from accessing the dnode, so that we can discount their holds. The
+ * handle is safe to access because we know that while the dnode cannot
+ * go away, neither can its handle. Once we hold dnh_zrlock, we can
+ * safely move any dnode referenced only by dbufs.
+ */
+ if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) {
+ mutex_exit(&os->os_lock);
+ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_handle);
+ return (KMEM_CBRC_LATER);
+ }
+
+ /*
+ * Ensure a consistent view of the dnode's holds and the dnode's dbufs.
+ * We need to guarantee that there is a hold for every dbuf in order to
+ * determine whether the dnode is actively referenced. Falsely matching
+ * a dbuf to an active hold would lead to an unsafe move. It's possible
+ * that a thread already having an active dnode hold is about to add a
+ * dbuf, and we can't compare hold and dbuf counts while the add is in
+ * progress.
+ */
+ if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) {
+ zrl_exit(&odn->dn_handle->dnh_zrlock);
+ mutex_exit(&os->os_lock);
+ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_rwlock);
+ return (KMEM_CBRC_LATER);
+ }
+
+ /*
+ * A dbuf may be removed (evicted) without an active dnode hold. In that
+ * case, the dbuf count is decremented under the handle lock before the
+ * dbuf's hold is released. This order ensures that if we count the hold
+ * after the dbuf is removed but before its hold is released, we will
+ * treat the unmatched hold as active and exit safely. If we count the
+ * hold before the dbuf is removed, the hold is discounted, and the
+ * removal is blocked until the move completes.
+ */
+ refcount = refcount_count(&odn->dn_holds);
+ ASSERT(refcount >= 0);
+ dbufs = odn->dn_dbufs_count;
+
+ /* We can't have more dbufs than dnode holds. */
+ ASSERT3U(dbufs, <=, refcount);
+ DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount,
+ uint32_t, dbufs);
+
+ if (refcount > dbufs) {
+ rw_exit(&odn->dn_struct_rwlock);
+ zrl_exit(&odn->dn_handle->dnh_zrlock);
+ mutex_exit(&os->os_lock);
+ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_active);
+ return (KMEM_CBRC_LATER);
+ }
+
+ rw_exit(&odn->dn_struct_rwlock);
+
+ /*
+ * At this point we know that anyone with a hold on the dnode is not
+ * actively referencing it. The dnode is known and in a valid state to
+ * move. We're holding the locks needed to execute the critical section.
+ */
+ dnode_move_impl(odn, ndn);
+
+ /* Swap ndn into odn's position on the objset's dnode list. */
+ list_link_replace(&odn->dn_link, &ndn->dn_link);
+ /* If the dnode was safe to move, the refcount cannot have changed. */
+ ASSERT(refcount == refcount_count(&ndn->dn_holds));
+ ASSERT(dbufs == ndn->dn_dbufs_count);
+ zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */
+ mutex_exit(&os->os_lock);
+
+ return (KMEM_CBRC_YES);
+}
+#endif /* _KERNEL */
+
void
-dnode_special_close(dnode_t *dn)
+dnode_special_close(dnode_handle_t *dnh)
{
+ dnode_t *dn = dnh->dnh_dnode;
+
/*
* Wait for final references to the dnode to clear. This can
* only happen if the arc is asynchronously evicting state that
@@ -533,13 +932,19 @@ dnode_special_close(dnode_t *dn)
*/
while (refcount_count(&dn->dn_holds) > 0)
delay(1);
- dnode_destroy(dn);
+ zrl_add(&dnh->dnh_zrlock);
+ dnode_destroy(dn); /* implicit zrl_remove() */
+ zrl_destroy(&dnh->dnh_zrlock);
+ dnh->dnh_dnode = NULL;
}
+/*
+ * Create the in-core dnode for an object that has no parent dbuf
+ * (dnode_create() is passed a NULL dbuf), record it in the caller-supplied
+ * handle dnh, and initialize that handle's zrlock. Returns the new dnode.
+ */
dnode_t *
-dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object)
+dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
+ dnode_handle_t *dnh)
{
- dnode_t *dn = dnode_create(os, dnp, NULL, object);
+ dnode_t *dn = dnode_create(os, dnp, NULL, object, dnh);
+ dnh->dnh_dnode = dn;
+ zrl_init(&dnh->dnh_zrlock);
DNODE_VERIFY(dn);
return (dn);
}
@@ -547,34 +952,43 @@ dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object)
+/*
+ * Callback handed to dmu_buf_set_user() by dnode_hold_impl() for a dbuf that
+ * holds a block of dnodes. arg is the dnode_children_t attached to that dbuf.
+ * Every handle's zrlock is destroyed, every instantiated child dnode is
+ * destroyed, and the dnode_children_t itself is freed.
+ */
static void
dnode_buf_pageout(dmu_buf_t *db, void *arg)
{
- dnode_t **children_dnodes = arg;
+ dnode_children_t *children_dnodes = arg;
int i;
int epb = db->db_size >> DNODE_SHIFT;
+ ASSERT(epb == children_dnodes->dnc_count);
+
for (i = 0; i < epb; i++) {
- dnode_t *dn = children_dnodes[i];
- int n;
+ dnode_handle_t *dnh = &children_dnodes->dnc_children[i];
+ dnode_t *dn;
- if (dn == NULL)
+ /*
+ * The dnode handle lock guards against the dnode moving to
+ * another valid address, so there is no need here to guard
+ * against changes to or from NULL.
+ */
+ if (dnh->dnh_dnode == NULL) {
+ zrl_destroy(&dnh->dnh_zrlock);
continue;
-#ifdef ZFS_DEBUG
+ }
+
+ zrl_add(&dnh->dnh_zrlock);
+ dn = dnh->dnh_dnode;
/*
* If there are holds on this dnode, then there should
* be holds on the dnode's containing dbuf as well; thus
- * it wouldn't be eligable for eviction and this function
+ * it wouldn't be eligible for eviction and this function
* would not have been called.
*/
ASSERT(refcount_is_zero(&dn->dn_holds));
- ASSERT(list_head(&dn->dn_dbufs) == NULL);
ASSERT(refcount_is_zero(&dn->dn_tx_holds));
- for (n = 0; n < TXG_SIZE; n++)
- ASSERT(!list_link_active(&dn->dn_dirty_link[n]));
-#endif
- children_dnodes[i] = NULL;
- dnode_destroy(dn);
+ dnode_destroy(dn); /* implicit zrl_remove() */
+ zrl_destroy(&dnh->dnh_zrlock);
+ dnh->dnh_dnode = NULL;
}
- kmem_free(children_dnodes, epb * sizeof (dnode_t *));
+ /* Size must match the kmem_alloc() in dnode_hold_impl(). */
+ kmem_free(children_dnodes, sizeof (dnode_children_t) +
+ (epb - 1) * sizeof (dnode_handle_t));
}
/*
@@ -593,7 +1007,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
uint64_t blk;
dnode_t *mdn, *dn;
dmu_buf_impl_t *db;
- dnode_t **children_dnodes;
+ dnode_children_t *children_dnodes;
+ dnode_handle_t *dnh;
/*
* If you are holding the spa config lock as writer, you shouldn't
@@ -603,12 +1018,11 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
*/
ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 ||
(spa_is_root(os->os_spa) &&
- spa_config_held(os->os_spa, SCL_STATE, RW_WRITER) &&
- !spa_config_held(os->os_spa, SCL_ZIO, RW_WRITER)));
+ spa_config_held(os->os_spa, SCL_STATE, RW_WRITER)));
if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) {
dn = (object == DMU_USERUSED_OBJECT) ?
- os->os_userused_dnode : os->os_groupused_dnode;
+ DMU_USERUSED_DNODE(os) : DMU_GROUPUSED_DNODE(os);
if (dn == NULL)
return (ENOENT);
type = dn->dn_type;
@@ -625,7 +1039,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
if (object == 0 || object >= DN_MAX_OBJECT)
return (EINVAL);
- mdn = os->os_meta_dnode;
+ mdn = DMU_META_DNODE(os);
+ ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT);
DNODE_VERIFY(mdn);
@@ -652,26 +1067,39 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
idx = object & (epb-1);
+ ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
children_dnodes = dmu_buf_get_user(&db->db);
if (children_dnodes == NULL) {
- dnode_t **winner;
- children_dnodes = kmem_zalloc(epb * sizeof (dnode_t *),
- KM_SLEEP);
+ int i;
+ dnode_children_t *winner;
+ children_dnodes = kmem_alloc(sizeof (dnode_children_t) +
+ (epb - 1) * sizeof (dnode_handle_t), KM_SLEEP);
+ children_dnodes->dnc_count = epb;
+ dnh = &children_dnodes->dnc_children[0];
+ for (i = 0; i < epb; i++) {
+ zrl_init(&dnh[i].dnh_zrlock);
+ dnh[i].dnh_dnode = NULL;
+ }
if (winner = dmu_buf_set_user(&db->db, children_dnodes, NULL,
dnode_buf_pageout)) {
- kmem_free(children_dnodes, epb * sizeof (dnode_t *));
+ kmem_free(children_dnodes, sizeof (dnode_children_t) +
+ (epb - 1) * sizeof (dnode_handle_t));
children_dnodes = winner;
}
}
+ ASSERT(children_dnodes->dnc_count == epb);
- if ((dn = children_dnodes[idx]) == NULL) {
- dnode_phys_t *dnp = (dnode_phys_t *)db->db.db_data+idx;
+ dnh = &children_dnodes->dnc_children[idx];
+ zrl_add(&dnh->dnh_zrlock);
+ if ((dn = dnh->dnh_dnode) == NULL) {
+ dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx;
dnode_t *winner;
- dn = dnode_create(os, dnp, db, object);
- winner = atomic_cas_ptr(&children_dnodes[idx], NULL, dn);
+ dn = dnode_create(os, phys, db, object, dnh);
+ winner = atomic_cas_ptr(&dnh->dnh_dnode, NULL, dn);
if (winner != NULL) {
- dnode_destroy(dn);
+ zrl_add(&dnh->dnh_zrlock);
+ dnode_destroy(dn); /* implicit zrl_remove() */
dn = winner;
}
}
@@ -683,13 +1111,16 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
((flag & DNODE_MUST_BE_FREE) &&
(type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) {
mutex_exit(&dn->dn_mtx);
+ zrl_remove(&dnh->dnh_zrlock);
dbuf_rele(db, FTAG);
return (type == DMU_OT_NONE ? ENOENT : EEXIST);
}
mutex_exit(&dn->dn_mtx);
if (refcount_add(&dn->dn_holds, tag) == 1)
- dbuf_add_ref(db, dn);
+ dbuf_add_ref(db, dnh);
+ /* Now we can rely on the hold to prevent the dnode from moving. */
+ zrl_remove(&dnh->dnh_zrlock);
DNODE_VERIFY(dn);
ASSERT3P(dn->dn_dbuf, ==, db);
@@ -731,13 +1162,37 @@ void
dnode_rele(dnode_t *dn, void *tag)
{
uint64_t refs;
+ /* Get while the hold prevents the dnode from moving. */
+ dmu_buf_impl_t *db = dn->dn_dbuf;
+ dnode_handle_t *dnh = dn->dn_handle;
mutex_enter(&dn->dn_mtx);
refs = refcount_remove(&dn->dn_holds, tag);
mutex_exit(&dn->dn_mtx);
+
+ /*
+ * It's unsafe to release the last hold on a dnode by dnode_rele() or
+ * indirectly by dbuf_rele() while relying on the dnode handle to
+ * prevent the dnode from moving, since releasing the last hold could
+ * result in the dnode's parent dbuf evicting its dnode handles. For
+ * that reason anyone calling dnode_rele() or dbuf_rele() without some
+ * other direct or indirect hold on the dnode must first drop the dnode
+ * handle.
+ */
+ ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread);
+
/* NOTE: the DNODE_DNODE does not have a dn_dbuf */
- if (refs == 0 && dn->dn_dbuf)
- dbuf_rele(dn->dn_dbuf, dn);
+ if (refs == 0 && db != NULL) {
+ /*
+ * Another thread could add a hold to the dnode handle in
+ * dnode_hold_impl() while holding the parent dbuf. Since the
+ * hold on the parent dbuf prevents the handle from being
+ * destroyed, the hold on the handle is OK. We can't yet assert
+ * that the handle has zero references, but that will be
+ * asserted anyway when the handle gets destroyed.
+ */
+ dbuf_rele(db, dnh);
+ }
}
void
@@ -756,7 +1211,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
#ifdef ZFS_DEBUG
mutex_enter(&dn->dn_mtx);
ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg);
- /* ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg); */
+ ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg);
mutex_exit(&dn->dn_mtx);
#endif
@@ -795,7 +1250,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
/*
* The dnode maintains a hold on its containing dbuf as
* long as there are holds on it. Each instantiated child
- * dbuf maintaines a hold on the dnode. When the last child
+ * dbuf maintains a hold on the dnode. When the last child
* drops its hold, the dnode will drop its hold on the
* containing dbuf. We add a "dirty hold" here so that the
* dnode will hang around after we finish processing its
diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c
index f9ec9f602..2ee990a3b 100644
--- a/module/zfs/dnode_sync.c
+++ b/module/zfs/dnode_sync.c
@@ -76,7 +76,11 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
if (child == NULL)
continue;
- ASSERT3P(child->db_dnode, ==, dn);
+#ifdef DEBUG
+ DB_DNODE_ENTER(child);
+ ASSERT3P(DB_DNODE(child), ==, dn);
+ DB_DNODE_EXIT(child);
+#endif /* DEBUG */
if (child->db_parent && child->db_parent != dn->dn_dbuf) {
ASSERT(child->db_parent->db_level == db->db_level);
ASSERT(child->db_blkptr !=
@@ -135,15 +139,18 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
int off, num;
int i, err, epbs;
uint64_t txg = tx->tx_txg;
+ dnode_t *dn;
- epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
off = start - (db->db_blkid * 1<<epbs);
num = end - start + 1;
ASSERT3U(off, >=, 0);
ASSERT3U(num, >=, 0);
ASSERT3U(db->db_level, >, 0);
- ASSERT3U(db->db.db_size, ==, 1<<db->db_dnode->dn_phys->dn_indblkshift);
+ ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT);
ASSERT(db->db_blkptr != NULL);
@@ -155,10 +162,10 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
ASSERT(db->db_level == 1);
- rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
- err = dbuf_hold_impl(db->db_dnode, db->db_level-1,
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ err = dbuf_hold_impl(dn, db->db_level-1,
(db->db_blkid << epbs) + i, TRUE, FTAG, &child);
- rw_exit(&db->db_dnode->dn_struct_rwlock);
+ rw_exit(&dn->dn_struct_rwlock);
if (err == ENOENT)
continue;
ASSERT(err == 0);
@@ -200,6 +207,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
dbuf_rele(child, FTAG);
}
+ DB_DNODE_EXIT(db);
}
#endif
@@ -209,7 +217,7 @@ static int
free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
dmu_tx_t *tx)
{
- dnode_t *dn = db->db_dnode;
+ dnode_t *dn;
blkptr_t *bp;
dmu_buf_impl_t *subdb;
uint64_t start, end, dbstart, dbend, i;
@@ -230,7 +238,9 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
dbuf_release_bp(db);
bp = (blkptr_t *)db->db.db_data;
- epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
shift = (db->db_level - 1) * epbs;
dbstart = db->db_blkid << epbs;
start = blkid >> shift;
@@ -253,6 +263,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
blocks_freed = free_blocks(dn, bp, end-start+1, tx);
arc_buf_freeze(db->db_buf);
ASSERT(all || blocks_freed == 0 || db->db_last_dirty);
+ DB_DNODE_EXIT(db);
return (all ? ALL : blocks_freed);
}
@@ -272,6 +283,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
}
dbuf_rele(subdb, FTAG);
}
+ DB_DNODE_EXIT(db);
arc_buf_freeze(db->db_buf);
#ifdef ZFS_DEBUG
bp -= (end-start)+1;
@@ -375,7 +387,11 @@ dnode_evict_dbufs(dnode_t *dn)
for (; db != &marker; db = list_head(&dn->dn_dbufs)) {
list_remove(&dn->dn_dbufs, db);
list_insert_tail(&dn->dn_dbufs, db);
- ASSERT3P(db->db_dnode, ==, dn);
+#ifdef DEBUG
+ DB_DNODE_ENTER(db);
+ ASSERT3P(DB_DNODE(db), ==, dn);
+ DB_DNODE_EXIT(db);
+#endif /* DEBUG */
mutex_enter(&db->db_mtx);
if (db->db_state == DB_EVICTING) {
diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c
index ddd83576c..59ac4a609 100644
--- a/module/zfs/dsl_dataset.c
+++ b/module/zfs/dsl_dataset.c
@@ -37,15 +37,11 @@
#include <sys/zfs_ioctl.h>
#include <sys/spa.h>
#include <sys/zfs_znode.h>
+#include <sys/zfs_onexit.h>
#include <sys/zvol.h>
#include <sys/dsl_scan.h>
#include <sys/dsl_deadlist.h>
-/*
- * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
- */
-int zfs_dedup_prefetch = 1;
-
static char *dsl_reaper = "the grim reaper";
static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
@@ -253,8 +249,7 @@ dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
if (blk_birth <= dsl_dataset_prev_snap_txg(ds))
return (B_FALSE);
- if (zfs_dedup_prefetch && bp && BP_GET_DEDUP(bp))
- ddt_prefetch(dsl_dataset_get_spa(ds), bp);
+ ddt_prefetch(dsl_dataset_get_spa(ds), bp);
return (B_TRUE);
}
@@ -372,6 +367,7 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
dmu_buf_t *dbuf;
dsl_dataset_t *ds;
int err;
+ dmu_object_info_t doi;
ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
dsl_pool_sync_context(dp));
@@ -379,6 +375,12 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
if (err)
return (err);
+
+ /*
+ * Make sure dsobj has the correct object type. The bonus buffer was
+ * held by dmu_bonus_hold() above, so drop that hold before bailing
+ * out; otherwise the dbuf reference is leaked.
+ */
+ dmu_object_info_from_db(dbuf, &doi);
+ if (doi.doi_type != DMU_OT_DSL_DATASET) {
+ dmu_buf_rele(dbuf, tag);
+ return (EINVAL);
+ }
+
ds = dmu_buf_get_user(dbuf);
if (ds == NULL) {
dsl_dataset_t *winner;
@@ -881,6 +883,21 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
dsl_dir_close(dd, FTAG);
+ /*
+ * If we are creating a clone, make sure we zero out any stale
+ * data from the origin snapshot's ZIL header.
+ */
+ if (origin != NULL) {
+ dsl_dataset_t *ds;
+ objset_t *os;
+
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+ VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os));
+ bzero(&os->os_zil_header, sizeof (os->os_zil_header));
+ dsl_dataset_dirty(ds, tx);
+ dsl_dataset_rele(ds, FTAG);
+ }
+
return (dsobj);
}
@@ -1081,11 +1098,16 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer)
*/
(void) dmu_free_object(os, obj);
}
+ if (err != ESRCH)
+ goto out;
/*
- * We need to sync out all in-flight IO before we try to evict
- * (the dataset evict func is trying to clear the cached entries
- * for this dataset in the ARC).
+ * Only the ZIL knows how to free log blocks.
+ */
+ zil_destroy(dmu_objset_zil(os), B_FALSE);
+
+ /*
+ * Sync out all in-flight IO.
*/
txg_wait_synced(dd->dd_pool, 0);
@@ -1103,9 +1125,6 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer)
count == 0);
}
- if (err != ESRCH)
- goto out;
-
rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd);
rw_exit(&dd->dd_pool->dp_config_rwlock);
@@ -1356,6 +1375,11 @@ dsl_dataset_origin_check(struct dsl_ds_destroyarg *dsda, void *tag,
return (0);
}
+/*
+ * If you add new checks here, you may need to add
+ * additional checks to the "temporary" case in
+ * snapshot_check() in dmu_objset.c.
+ */
/* ARGSUSED */
int
dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
@@ -1597,21 +1621,23 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
dsl_pool_t *dp = ds->ds_dir->dd_pool;
objset_t *mos = dp->dp_meta_objset;
dsl_dataset_t *ds_prev = NULL;
+ boolean_t wont_destroy;
uint64_t obj;
- ASSERT(ds->ds_owner);
+ wont_destroy = (dsda->defer &&
+ (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1));
+
+ ASSERT(ds->ds_owner || wont_destroy);
ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1);
ASSERT(ds->ds_prev == NULL ||
ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
- if (dsda->defer) {
+ if (wont_destroy) {
ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
- if (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1) {
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY;
- return;
- }
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY;
+ return;
}
/* signal any waiters that this dataset is going away */
@@ -1620,11 +1646,6 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
cv_broadcast(&ds->ds_exclusive_cv);
mutex_exit(&ds->ds_lock);
- if (ds->ds_objset) {
- dmu_objset_evict(ds->ds_objset);
- ds->ds_objset = NULL;
- }
-
/* Remove our reservation */
if (ds->ds_reserved != 0) {
dsl_prop_setarg_t psa;
@@ -1850,6 +1871,15 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
}
}
+ /*
+ * This must be done after the dsl_traverse(), because it will
+ * re-open the objset.
+ */
+ if (ds->ds_objset) {
+ dmu_objset_evict(ds->ds_objset);
+ ds->ds_objset = NULL;
+ }
+
if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) {
/* Erase the link in the dir */
dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
@@ -1928,7 +1958,7 @@ dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
*/
ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
asize = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
- if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, FALSE))
+ if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
return (ENOSPC);
/*
@@ -2224,8 +2254,21 @@ dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds)
if (ds->ds_prev == NULL)
return (B_FALSE);
if (ds->ds_phys->ds_bp.blk_birth >
- ds->ds_prev->ds_phys->ds_creation_txg)
- return (B_TRUE);
+ ds->ds_prev->ds_phys->ds_creation_txg) {
+ objset_t *os, *os_prev;
+ /*
+ * It may be that only the ZIL differs, because it was
+ * reset in the head. Don't count that as being
+ * modified.
+ */
+ if (dmu_objset_from_ds(ds, &os) != 0)
+ return (B_TRUE);
+ if (dmu_objset_from_ds(ds->ds_prev, &os_prev) != 0)
+ return (B_TRUE);
+ return (bcmp(&os->os_phys->os_meta_dnode,
+ &os_prev->os_phys->os_meta_dnode,
+ sizeof (os->os_phys->os_meta_dnode)) != 0);
+ }
return (B_FALSE);
}
@@ -3144,9 +3187,14 @@ dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
ASSERT(clone->ds_owner);
ASSERT(origin_head->ds_owner);
retry:
- /* Need exclusive access for the swap */
- rw_enter(&clone->ds_rwlock, RW_WRITER);
- if (!rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) {
+ /*
+ * Need exclusive access for the swap. If we're swapping these
+ * datasets back after an error, we already hold the locks.
+ */
+ if (!RW_WRITE_HELD(&clone->ds_rwlock))
+ rw_enter(&clone->ds_rwlock, RW_WRITER);
+ if (!RW_WRITE_HELD(&origin_head->ds_rwlock) &&
+ !rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) {
rw_exit(&clone->ds_rwlock);
rw_enter(&origin_head->ds_rwlock, RW_WRITER);
if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) {
@@ -3411,22 +3459,41 @@ dsl_dataset_set_reservation(const char *dsname, zprop_source_t source,
return (err);
}
-struct dsl_ds_holdarg {
- dsl_sync_task_group_t *dstg;
- char *htag;
- char *snapname;
- boolean_t recursive;
- boolean_t gotone;
- boolean_t temphold;
- char failed[MAXPATHLEN];
-};
+/*
+ * Argument for dsl_dataset_user_release_onexit(): identifies the user hold
+ * that dsl_register_onexit_hold_cleanup() registered for later release.
+ */
+typedef struct zfs_hold_cleanup_arg {
+ dsl_pool_t *dp; /* pool of the held dataset */
+ uint64_t dsobj; /* object number of the held dataset */
+ char htag[MAXNAMELEN]; /* copy of the hold tag */
+} zfs_hold_cleanup_arg_t;
+
+/*
+ * Callback registered through zfs_onexit_add_cb(): releases the user hold
+ * described by arg (a zfs_hold_cleanup_arg_t) with retry enabled, then frees
+ * arg. The result of the release is deliberately ignored.
+ */
+static void
+dsl_dataset_user_release_onexit(void *arg)
+{
+ zfs_hold_cleanup_arg_t *cleanup = arg;
+ dsl_pool_t *pool = cleanup->dp;
+ uint64_t snapobj = cleanup->dsobj;
+
+ (void) dsl_dataset_user_release_tmp(pool, snapobj, cleanup->htag,
+ B_TRUE);
+ kmem_free(cleanup, sizeof (*cleanup));
+}
+
+/*
+ * Arrange for the user hold htag on dataset ds to be released by
+ * dsl_dataset_user_release_onexit() through the onexit callback list
+ * identified by minor. The pool, dataset object number and a copy of the tag
+ * are captured in a heap-allocated argument that the callback frees.
+ */
+void
+dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag,
+ minor_t minor)
+{
+ zfs_hold_cleanup_arg_t *cleanup;
+
+ cleanup = kmem_alloc(sizeof (*cleanup), KM_SLEEP);
+ (void) strlcpy(cleanup->htag, htag, sizeof (cleanup->htag));
+ cleanup->dsobj = ds->ds_object;
+ cleanup->dp = ds->ds_dir->dd_pool;
+
+ VERIFY3U(0, ==, zfs_onexit_add_cb(minor,
+ dsl_dataset_user_release_onexit, cleanup, NULL));
+}
/*
- * The max length of a temporary tag prefix is the number of hex digits
- * required to express UINT64_MAX plus one for the hyphen.
+ * If you add new checks here, you may need to add
+ * additional checks to the "temporary" case in
+ * snapshot_check() in dmu_objset.c.
*/
-#define MAX_TAG_PREFIX_LEN 17
-
static int
dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
@@ -3461,7 +3528,7 @@ dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx)
return (error);
}
-static void
+void
dsl_dataset_user_hold_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
@@ -3524,13 +3591,41 @@ dsl_dataset_user_hold_one(const char *dsname, void *arg)
}
+/*
+ * Place the user hold htag on the already-held dataset ds by running the
+ * user-hold check and sync functions as a single sync task. temphold is
+ * passed through in the hold argument. Returns the sync task's error code.
+ */
int
+dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag,
+ boolean_t temphold)
+{
+ struct dsl_ds_holdarg *holdarg;
+ dsl_pool_t *pool = ds->ds_dir->dd_pool;
+ int rc;
+
+ /* Zeroed, so every field other than the two set below is 0/NULL. */
+ holdarg = kmem_zalloc(sizeof (*holdarg), KM_SLEEP);
+ holdarg->temphold = temphold;
+ holdarg->htag = htag;
+
+ rc = dsl_sync_task_do(pool, dsl_dataset_user_hold_check,
+ dsl_dataset_user_hold_sync, ds, holdarg, 0);
+
+ kmem_free(holdarg, sizeof (*holdarg));
+ return (rc);
+}
+
+int
dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
- boolean_t recursive, boolean_t temphold)
+ boolean_t recursive, boolean_t temphold, int cleanup_fd)
{
struct dsl_ds_holdarg *ha;
dsl_sync_task_t *dst;
spa_t *spa;
int error;
+ minor_t minor = 0;
+
+ if (cleanup_fd != -1) {
+ /* Currently we only support cleanup-on-exit of tempholds. */
+ if (!temphold)
+ return (EINVAL);
+ error = zfs_onexit_fd_hold(cleanup_fd, &minor);
+ if (error)
+ return (error);
+ }
ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
@@ -3539,6 +3634,8 @@ dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
error = spa_open(dsname, &spa, FTAG);
if (error) {
kmem_free(ha, sizeof (struct dsl_ds_holdarg));
+ if (cleanup_fd != -1)
+ zfs_onexit_fd_rele(cleanup_fd);
return (error);
}
@@ -3547,6 +3644,7 @@ dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
ha->snapname = snapname;
ha->recursive = recursive;
ha->temphold = temphold;
+
if (recursive) {
error = dmu_objset_find(dsname, dsl_dataset_user_hold_one,
ha, DS_FIND_CHILDREN);
@@ -3563,6 +3661,12 @@ dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
if (dst->dst_err) {
dsl_dataset_name(ds, ha->failed);
*strchr(ha->failed, '@') = '\0';
+ } else if (error == 0 && minor != 0 && temphold) {
+ /*
+ * If this hold is to be released upon process exit,
+ * register that action now.
+ */
+ dsl_register_onexit_hold_cleanup(ds, htag, minor);
}
dsl_dataset_rele(ds, ha->dstg);
}
@@ -3574,8 +3678,11 @@ dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
(void) strlcpy(dsname, ha->failed, sizeof (ha->failed));
dsl_sync_task_group_destroy(ha->dstg);
+
kmem_free(ha, sizeof (struct dsl_ds_holdarg));
spa_close(spa, FTAG);
+ if (cleanup_fd != -1)
+ zfs_onexit_fd_rele(cleanup_fd);
return (error);
}
@@ -3667,11 +3774,6 @@ dsl_dataset_user_release_sync(void *arg1, void *tag, dmu_tx_t *tx)
uint64_t refs;
int error;
- if (ds->ds_objset) {
- dmu_objset_evict(ds->ds_objset);
- ds->ds_objset = NULL;
- }
-
mutex_enter(&ds->ds_lock);
ds->ds_userrefs--;
refs = ds->ds_userrefs;
@@ -3831,10 +3933,12 @@ top:
}
/*
- * Called at spa_load time to release a stale temporary user hold.
+ * Called at spa_load time (with retry == B_FALSE) to release a stale
+ * temporary user hold. Also called by the onexit code (with retry == B_TRUE).
*/
int
-dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag)
+dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag,
+ boolean_t retry)
{
dsl_dataset_t *ds;
char *snap;
@@ -3842,20 +3946,36 @@ dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag)
int namelen;
int error;
- rw_enter(&dp->dp_config_rwlock, RW_READER);
- error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
- rw_exit(&dp->dp_config_rwlock);
- if (error)
- return (error);
- namelen = dsl_dataset_namelen(ds)+1;
- name = kmem_alloc(namelen, KM_SLEEP);
- dsl_dataset_name(ds, name);
- dsl_dataset_rele(ds, FTAG);
+ do {
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+ rw_exit(&dp->dp_config_rwlock);
+ if (error)
+ return (error);
+ namelen = dsl_dataset_namelen(ds)+1;
+ name = kmem_alloc(namelen, KM_SLEEP);
+ dsl_dataset_name(ds, name);
+ dsl_dataset_rele(ds, FTAG);
- snap = strchr(name, '@');
- *snap = '\0';
- ++snap;
- return (dsl_dataset_user_release(name, snap, htag, B_FALSE));
+ snap = strchr(name, '@');
+ *snap = '\0';
+ ++snap;
+ error = dsl_dataset_user_release(name, snap, htag, B_FALSE);
+ kmem_free(name, namelen);
+
+ /*
+ * The object can't have been destroyed because we have a hold,
+ * but it might have been renamed, resulting in ENOENT. Retry
+ * if we've been requested to do so.
+ *
+ * It would be nice if we could use the dsobj all the way
+ * through and avoid ENOENT entirely. But we might need to
+ * unmount the snapshot, and there's currently no way to look up
+ * a vfsp using a ZFS object id.
+ */
+ } while ((error == ENOENT) && retry);
+
+ return (error);
}
int
diff --git a/module/zfs/dsl_deleg.c b/module/zfs/dsl_deleg.c
index 85490c8d5..529fb052f 100644
--- a/module/zfs/dsl_deleg.c
+++ b/module/zfs/dsl_deleg.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
@@ -528,9 +528,8 @@ dsl_load_user_sets(objset_t *mos, uint64_t zapobj, avl_tree_t *avl,
* Check if user has requested permission.
*/
int
-dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr)
+dsl_deleg_access_impl(dsl_dataset_t *ds, const char *perm, cred_t *cr)
{
- dsl_dataset_t *ds;
dsl_dir_t *dd;
dsl_pool_t *dp;
void *cookie;
@@ -540,23 +539,15 @@ dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr)
avl_tree_t permsets;
perm_set_t *setnode;
- error = dsl_dataset_hold(dsname, FTAG, &ds);
- if (error)
- return (error);
-
dp = ds->ds_dir->dd_pool;
mos = dp->dp_meta_objset;
- if (dsl_delegation_on(mos) == B_FALSE) {
- dsl_dataset_rele(ds, FTAG);
+ if (dsl_delegation_on(mos) == B_FALSE)
return (ECANCELED);
- }
if (spa_version(dmu_objset_spa(dp->dp_meta_objset)) <
- SPA_VERSION_DELEGATED_PERMS) {
- dsl_dataset_rele(ds, FTAG);
+ SPA_VERSION_DELEGATED_PERMS)
return (EPERM);
- }
if (dsl_dataset_is_snapshot(ds)) {
/*
@@ -633,7 +624,6 @@ again:
error = EPERM;
success:
rw_exit(&dp->dp_config_rwlock);
- dsl_dataset_rele(ds, FTAG);
cookie = NULL;
while ((setnode = avl_destroy_nodes(&permsets, &cookie)) != NULL)
@@ -642,6 +632,22 @@ success:
return (error);
}
+int
+dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr)
+{
+	dsl_dataset_t *held;
+	int err = dsl_dataset_hold(dsname, FTAG, &held);
+
+	/* By-name wrapper: hold, check via the _impl routine, release. */
+	if (err == 0) {
+		err = dsl_deleg_access_impl(held, perm, cr);
+		dsl_dataset_rele(held, FTAG);
+	}
+
+	/* Either the hold failure or the permission-check result. */
+	return (err);
+}
+
/*
* Other routines.
*/
diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c
index 2cd21a102..700cc9628 100644
--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c
@@ -42,7 +42,7 @@
int zfs_no_write_throttle = 0;
int zfs_write_limit_shift = 3; /* 1/8th of physical memory */
-int zfs_txg_synctime_ms = 5000; /* target millisecs to sync a txg */
+int zfs_txg_synctime_ms = 1000; /* target millisecs to sync a txg */
uint64_t zfs_write_limit_min = 32 << 20; /* min write limit is 32MB */
uint64_t zfs_write_limit_max = 0; /* max data payload per txg */
@@ -451,7 +451,7 @@ dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
while (ds = list_head(&dp->dp_synced_datasets)) {
list_remove(&dp->dp_synced_datasets, ds);
os = ds->ds_objset;
- zil_clean(os->os_zil);
+ zil_clean(os->os_zil, txg);
ASSERT(!dmu_objset_is_dirty(os, txg));
dmu_buf_rele(ds->ds_dbuf, ds);
}
@@ -768,7 +768,7 @@ dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp)
*htag = '\0';
++htag;
dsobj = strtonum(za.za_name, NULL);
- (void) dsl_dataset_user_release_tmp(dp, dsobj, htag);
+ (void) dsl_dataset_user_release_tmp(dp, dsobj, htag, B_FALSE);
}
zap_cursor_fini(&zc);
}
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index 23c37c7cc..56d410836 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -56,6 +56,11 @@ static scan_cb_t dsl_scan_remove_cb;
static dsl_syncfunc_t dsl_scan_cancel_sync;
static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx);
+int zfs_top_maxinflight = 32; /* maximum I/Os per top-level */
+int zfs_resilver_delay = 2; /* number of ticks to delay resilver */
+int zfs_scrub_delay = 4; /* number of ticks to delay scrub */
+int zfs_scan_idle = 50; /* idle window in clock ticks */
+
int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
@@ -601,8 +606,8 @@ dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp,
* done before setting xlateall (similar to dsl_read())
*/
(void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp,
- buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
- &flags, &czb);
+ buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb);
}
static boolean_t
@@ -650,6 +655,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
const zbookmark_t *zb, dmu_tx_t *tx, arc_buf_t **bufp)
{
dsl_pool_t *dp = scn->scn_dp;
+ int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
int err;
if (BP_GET_LEVEL(bp) > 0) {
@@ -660,7 +666,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
err = arc_read_nolock(NULL, dp->dp_spa, bp,
arc_getbuf_func, bufp,
- ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+ ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
if (err) {
scn->scn_phys.scn_errors++;
return (err);
@@ -683,7 +689,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
err = arc_read_nolock(NULL, dp->dp_spa, bp,
arc_getbuf_func, bufp,
- ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+ ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
if (err) {
scn->scn_phys.scn_errors++;
return (err);
@@ -696,7 +702,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
err = arc_read_nolock(NULL, dp->dp_spa, bp,
arc_getbuf_func, bufp,
- ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+ ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
if (err) {
scn->scn_phys.scn_errors++;
return (err);
@@ -719,7 +725,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
err = arc_read_nolock(NULL, dp->dp_spa, bp,
arc_getbuf_func, bufp,
- ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+ ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
if (err) {
scn->scn_phys.scn_errors++;
return (err);
@@ -727,9 +733,6 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
osp = (*bufp)->b_data;
- if (DSL_SCAN_IS_SCRUB_RESILVER(scn))
- dsl_scan_zil(dp, &osp->os_zil_header);
-
dsl_scan_visitdnode(scn, ds, osp->os_type,
&osp->os_meta_dnode, *bufp, DMU_META_DNODE_OBJECT, tx);
@@ -1072,9 +1075,23 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
{
dsl_pool_t *dp = scn->scn_dp;
dsl_dataset_t *ds;
+ objset_t *os;
VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+ if (dmu_objset_from_ds(ds, &os))
+ goto out;
+
+ /*
+ * Only the ZIL in the head (non-snapshot) is valid. Even though
+ * snapshots can have ZIL block pointers (which may be the same
+ * BP as in the head), they must be ignored. So we traverse the
+ * ZIL here, rather than in scan_recurse(), because the regular
+ * snapshot block-sharing rules don't apply to it.
+ */
+ if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !dsl_dataset_is_snapshot(ds))
+ dsl_scan_zil(dp, &os->os_zil_header);
+
/*
* Iterate over the bps in this ds.
*/
@@ -1446,7 +1463,6 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
dsl_scan_setup_sync(scn, &func, tx);
}
-
if (!dsl_scan_active(scn) ||
spa_sync_pass(dp->dp_spa) > 1)
return;
@@ -1489,7 +1505,6 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
if (scn->scn_phys.scn_state != DSS_SCANNING)
return;
-
if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
scn->scn_phys.scn_ddt_class_max) {
zfs_dbgmsg("doing scan sync txg %llu; "
@@ -1644,8 +1659,9 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
spa_t *spa = dp->dp_spa;
uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
boolean_t needs_io;
- int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
+ int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
int zio_priority;
+ int scan_delay = 0;
if (phys_birth <= scn->scn_phys.scn_min_txg ||
phys_birth >= scn->scn_phys.scn_max_txg)
@@ -1658,10 +1674,12 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
zio_flags |= ZIO_FLAG_SCRUB;
zio_priority = ZIO_PRIORITY_SCRUB;
needs_io = B_TRUE;
+ scan_delay = zfs_scrub_delay;
} else if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) {
zio_flags |= ZIO_FLAG_RESILVER;
zio_priority = ZIO_PRIORITY_RESILVER;
needs_io = B_FALSE;
+ scan_delay = zfs_resilver_delay;
}
/* If it's an intent log block, failure is expected. */
@@ -1699,14 +1717,23 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
}
if (needs_io && !zfs_no_scrub_io) {
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t maxinflight = rvd->vdev_children * zfs_top_maxinflight;
void *data = zio_data_buf_alloc(size);
mutex_enter(&spa->spa_scrub_lock);
- while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight)
+ while (spa->spa_scrub_inflight >= maxinflight)
cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
spa->spa_scrub_inflight++;
mutex_exit(&spa->spa_scrub_lock);
+ /*
+ * If we're seeing recent (zfs_scan_idle) "important" I/Os
+ * then throttle our workload to limit the impact of a scan.
+ */
+ if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle)
+ delay(scan_delay);
+
zio_nowait(zio_read(NULL, spa, bp, data, size,
dsl_scan_scrub_done, NULL, zio_priority,
zio_flags, zb));
diff --git a/module/zfs/dsl_synctask.c b/module/zfs/dsl_synctask.c
index 832685b0f..b0818ce27 100644
--- a/module/zfs/dsl_synctask.c
+++ b/module/zfs/dsl_synctask.c
@@ -213,6 +213,8 @@ dsl_sync_task_do(dsl_pool_t *dp,
dsl_sync_task_group_t *dstg;
int err;
+ ASSERT(spa_writeable(dp->dp_spa));
+
dstg = dsl_sync_task_group_create(dp);
dsl_sync_task_create(dstg, checkfunc, syncfunc,
arg1, arg2, blocks_modified);
@@ -228,6 +230,9 @@ dsl_sync_task_do_nowait(dsl_pool_t *dp,
{
dsl_sync_task_group_t *dstg;
+ if (!spa_writeable(dp->dp_spa))
+ return;
+
dstg = dsl_sync_task_group_create(dp);
dsl_sync_task_create(dstg, checkfunc, syncfunc,
arg1, arg2, blocks_modified);
diff --git a/module/zfs/fm.c b/module/zfs/fm.c
index 78943eda8..4efcff4f4 100644
--- a/module/zfs/fm.c
+++ b/module/zfs/fm.c
@@ -384,6 +384,20 @@ fm_panic(const char *format, ...)
}
/*
+ * Simply tell the caller if fm_panicstr is set, i.e. an fma event has
+ * caused the panic. If so, something other than the default panic
+ * diagnosis method will diagnose the cause of the panic.
+ */
+int
+is_fm_panic(void)
+{
+	/*
+	 * Normalize to 0/1 so callers never see the raw fm_panicstr value.
+	 */
+	return (fm_panicstr ? 1 : 0);
+}
+
+/*
* Print any appropriate FMA banner message before the panic message. This
* function is called by panicsys() and prints the message for fm_panic().
* We print the message here so that it comes after the system is quiesced.
@@ -610,8 +624,8 @@ fm_nvlist_create(nv_alloc_t *nva)
if (nvlist_xalloc(&nvl, NV_UNIQUE_NAME, nvhdl) != 0) {
if (hdl_alloced) {
- kmem_free(nvhdl, sizeof (nv_alloc_t));
nv_alloc_fini(nvhdl);
+ kmem_free(nvhdl, sizeof (nv_alloc_t));
}
return (NULL);
}
diff --git a/module/zfs/include/sys/dbuf.h b/module/zfs/include/sys/dbuf.h
index 4c05806e3..cf1bbc030 100644
--- a/module/zfs/include/sys/dbuf.h
+++ b/module/zfs/include/sys/dbuf.h
@@ -32,6 +32,7 @@
#include <sys/arc.h>
#include <sys/zfs_context.h>
#include <sys/refcount.h>
+#include <sys/zrlock.h>
#ifdef __cplusplus
extern "C" {
@@ -82,9 +83,6 @@ struct dmu_tx;
* etc.
*/
-#define LIST_LINK_INACTIVE(link) \
- ((link)->list_next == NULL && (link)->list_prev == NULL)
-
struct dmu_buf_impl;
typedef enum override_states {
@@ -149,15 +147,17 @@ typedef struct dmu_buf_impl {
struct objset *db_objset;
/*
- * the dnode we belong to (NULL when evicted)
+ * handle to safely access the dnode we belong to (NULL when evicted)
*/
- struct dnode *db_dnode;
+ struct dnode_handle *db_dnode_handle;
/*
* our parent buffer; if the dnode points to us directly,
- * db_parent == db_dnode->dn_dbuf
+ * db_parent == db_dnode_handle->dnh_dnode->dn_dbuf
* only accessed by sync thread ???
* (NULL when evicted)
+ * May change from NULL to non-NULL under the protection of db_mtx
+ * (see dbuf_check_blkptr())
*/
struct dmu_buf_impl *db_parent;
@@ -284,24 +284,46 @@ void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,
void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx);
+#define DB_DNODE(_db) ((_db)->db_dnode_handle->dnh_dnode)
+#define DB_DNODE_LOCK(_db) ((_db)->db_dnode_handle->dnh_zrlock)
+#define DB_DNODE_ENTER(_db) (zrl_add(&DB_DNODE_LOCK(_db)))
+#define DB_DNODE_EXIT(_db) (zrl_remove(&DB_DNODE_LOCK(_db)))
+#define DB_DNODE_HELD(_db) (!zrl_is_zero(&DB_DNODE_LOCK(_db)))
+/*
+ * Statement macros: wrapped in do/while (0) so they stay a single statement
+ * after "if"/"else"; the dnode handle is entered only for the pointer read.
+ */
+#define DB_GET_SPA(_spa_p, _db) do { \
+	DB_DNODE_ENTER(_db); \
+	*(_spa_p) = DB_DNODE(_db)->dn_objset->os_spa; \
+	DB_DNODE_EXIT(_db); \
+_NOTE(CONSTCOND) } while (0)
+#define DB_GET_OBJSET(_os_p, _db) do { \
+	DB_DNODE_ENTER(_db); \
+	*(_os_p) = DB_DNODE(_db)->dn_objset; \
+	DB_DNODE_EXIT(_db); \
+_NOTE(CONSTCOND) } while (0)
+
void dbuf_init(void);
void dbuf_fini(void);
-#define DBUF_IS_METADATA(db) \
- ((db)->db_level > 0 || dmu_ot[(db)->db_dnode->dn_type].ot_metadata)
+boolean_t dbuf_is_metadata(dmu_buf_impl_t *db);
+
+#define DBUF_IS_METADATA(_db) \
+ (dbuf_is_metadata(_db))
-#define DBUF_GET_BUFC_TYPE(db) \
- (DBUF_IS_METADATA(db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
+#define DBUF_GET_BUFC_TYPE(_db) \
+ (DBUF_IS_METADATA(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
-#define DBUF_IS_CACHEABLE(db) \
- ((db)->db_objset->os_primary_cache == ZFS_CACHE_ALL || \
- (DBUF_IS_METADATA(db) && \
- ((db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA)))
+#define DBUF_IS_CACHEABLE(_db) \
+ ((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL || \
+ (DBUF_IS_METADATA(_db) && \
+ ((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA)))
-#define DBUF_IS_L2CACHEABLE(db) \
- ((db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL || \
- (DBUF_IS_METADATA(db) && \
- ((db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA)))
+#define DBUF_IS_L2CACHEABLE(_db) \
+ ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL || \
+ (DBUF_IS_METADATA(_db) && \
+ ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA)))
#ifdef ZFS_DEBUG
@@ -332,7 +354,7 @@ _NOTE(CONSTCOND) } while (0)
sprintf_blkptr(__blkbuf, bp); \
dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf); \
kmem_free(__blkbuf, BP_SPRINTF_LEN); \
- } \
+ } \
_NOTE(CONSTCOND) } while (0)
#define DBUF_VERIFY(db) dbuf_verify(db)
diff --git a/module/zfs/include/sys/dmu.h b/module/zfs/include/sys/dmu.h
index 83932f467..07f5949eb 100644
--- a/module/zfs/include/sys/dmu.h
+++ b/module/zfs/include/sys/dmu.h
@@ -192,8 +192,8 @@ int dmu_objset_clone(const char *name, struct dsl_dataset *clone_origin,
uint64_t flags);
int dmu_objset_destroy(const char *name, boolean_t defer);
int dmu_snapshots_destroy(char *fsname, char *snapname, boolean_t defer);
-int dmu_objset_snapshot(char *fsname, char *snapname, struct nvlist *props,
- boolean_t recursive);
+int dmu_objset_snapshot(char *fsname, char *snapname, char *tag,
+ struct nvlist *props, boolean_t recursive, boolean_t temporary, int fd);
int dmu_objset_rename(const char *name, const char *newname,
boolean_t recursive);
int dmu_objset_find(char *name, int func(const char *, void *), void *arg,
@@ -335,6 +335,7 @@ int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **);
int dmu_bonus_max(void);
int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *);
int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *);
+dmu_object_type_t dmu_get_bonustype(dmu_buf_t *);
int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *);
/*
@@ -721,9 +722,13 @@ typedef struct dmu_recv_cookie {
int dmu_recv_begin(char *tofs, char *tosnap, char *topds, struct drr_begin *,
boolean_t force, objset_t *origin, dmu_recv_cookie_t *);
-int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp);
+int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp,
+ int cleanup_fd, uint64_t *action_handlep);
int dmu_recv_end(dmu_recv_cookie_t *drc);
+int dmu_diff(objset_t *tosnap, objset_t *fromsnap, struct vnode *vp,
+ offset_t *off);
+
/* CRC64 table */
#define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */
extern uint64_t zfs_crc64_table[256];
diff --git a/module/zfs/include/sys/dmu_objset.h b/module/zfs/include/sys/dmu_objset.h
index 5c5119a20..c6d202e2e 100644
--- a/module/zfs/include/sys/dmu_objset.h
+++ b/module/zfs/include/sys/dmu_objset.h
@@ -40,6 +40,8 @@
extern "C" {
#endif
+extern krwlock_t os_lock;
+
struct dsl_dataset;
struct dmu_tx;
@@ -68,9 +70,15 @@ struct objset {
spa_t *os_spa;
arc_buf_t *os_phys_buf;
objset_phys_t *os_phys;
- dnode_t *os_meta_dnode;
- dnode_t *os_userused_dnode;
- dnode_t *os_groupused_dnode;
+ /*
+ * The following "special" dnodes have no parent and are exempt from
+ * dnode_move(), but they root their descendents in this objset using
+ * handles anyway, so that all access to dnodes from dbufs consistently
+ * uses handles.
+ */
+ dnode_handle_t os_meta_dnode;
+ dnode_handle_t os_userused_dnode;
+ dnode_handle_t os_groupused_dnode;
zilog_t *os_zil;
/* can change, under dsl_dir's locks: */
@@ -113,6 +121,9 @@ struct objset {
#define DMU_META_OBJSET 0
#define DMU_META_DNODE_OBJECT 0
#define DMU_OBJECT_IS_SPECIAL(obj) ((int64_t)(obj) <= 0)
+#define DMU_META_DNODE(os) ((os)->os_meta_dnode.dnh_dnode)
+#define DMU_USERUSED_DNODE(os) ((os)->os_userused_dnode.dnh_dnode)
+#define DMU_GROUPUSED_DNODE(os) ((os)->os_groupused_dnode.dnh_dnode)
#define DMU_OS_IS_L2CACHEABLE(os) \
((os)->os_secondary_cache == ZFS_CACHE_ALL || \
@@ -131,8 +142,8 @@ int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
int dmu_objset_clone(const char *name, struct dsl_dataset *clone_origin,
uint64_t flags);
int dmu_objset_destroy(const char *name, boolean_t defer);
-int dmu_objset_snapshot(char *fsname, char *snapname, nvlist_t *props,
- boolean_t recursive);
+int dmu_objset_snapshot(char *fsname, char *snapname, char *tag,
+ struct nvlist *props, boolean_t recursive, boolean_t temporary, int fd);
void dmu_objset_stats(objset_t *os, nvlist_t *nv);
void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);
void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
@@ -150,6 +161,7 @@ timestruc_t dmu_objset_snap_cmtime(objset_t *os);
/* called from dsl */
void dmu_objset_sync(objset_t *os, zio_t *zio, dmu_tx_t *tx);
boolean_t dmu_objset_is_dirty(objset_t *os, uint64_t txg);
+boolean_t dmu_objset_is_dirty_anywhere(objset_t *os);
objset_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds,
blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx);
int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp,
@@ -161,6 +173,9 @@ boolean_t dmu_objset_userused_enabled(objset_t *os);
int dmu_objset_userspace_upgrade(objset_t *os);
boolean_t dmu_objset_userspace_present(objset_t *os);
+void dmu_objset_init(void);
+void dmu_objset_fini(void);
+
#ifdef __cplusplus
}
#endif
diff --git a/module/zfs/include/sys/dmu_traverse.h b/module/zfs/include/sys/dmu_traverse.h
index 844e7f1ae..5b326cd99 100644
--- a/module/zfs/include/sys/dmu_traverse.h
+++ b/module/zfs/include/sys/dmu_traverse.h
@@ -49,6 +49,9 @@ typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
#define TRAVERSE_PREFETCH (TRAVERSE_PREFETCH_METADATA | TRAVERSE_PREFETCH_DATA)
#define TRAVERSE_HARD (1<<4)
+/* Special traverse error return value to indicate skipping of children */
+#define TRAVERSE_VISIT_NO_CHILDREN -1
+
int traverse_dataset(struct dsl_dataset *ds,
uint64_t txg_start, int flags, blkptr_cb_t func, void *arg);
int traverse_pool(spa_t *spa,
diff --git a/module/zfs/include/sys/dnode.h b/module/zfs/include/sys/dnode.h
index 8bae1602e..9ad4be36b 100644
--- a/module/zfs/include/sys/dnode.h
+++ b/module/zfs/include/sys/dnode.h
@@ -32,6 +32,7 @@
#include <sys/zio.h>
#include <sys/refcount.h>
#include <sys/dmu_zfetch.h>
+#include <sys/zrlock.h>
#ifdef __cplusplus
extern "C" {
@@ -156,6 +157,7 @@ typedef struct dnode {
struct objset *dn_objset;
uint64_t dn_object;
struct dmu_buf_impl *dn_dbuf;
+ struct dnode_handle *dn_handle;
dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */
/*
@@ -172,6 +174,7 @@ typedef struct dnode {
uint8_t dn_nlevels;
uint8_t dn_indblkshift;
uint8_t dn_datablkshift; /* zero if blksz not power of 2! */
+ uint8_t dn_moved; /* Has this dnode been moved? */
uint16_t dn_datablkszsec; /* in 512b sectors */
uint32_t dn_datablksz; /* in bytes */
uint64_t dn_maxblkid;
@@ -183,6 +186,9 @@ typedef struct dnode {
uint16_t dn_next_bonuslen[TXG_SIZE];
uint32_t dn_next_blksz[TXG_SIZE]; /* next block size in bytes */
+ /* protected by dn_dbufs_mtx; declared here to fill 32-bit hole */
+ uint32_t dn_dbufs_count; /* count of dn_dbufs */
+
/* protected by os_lock: */
list_node_t dn_dirty_link[TXG_SIZE]; /* next on dataset's dirty */
@@ -202,8 +208,11 @@ typedef struct dnode {
refcount_t dn_holds;
kmutex_t dn_dbufs_mtx;
- list_t dn_dbufs; /* linked list of descendent dbuf_t's */
+ list_t dn_dbufs; /* descendent dbufs */
+
+ /* protected by dn_struct_rwlock */
struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */
+
boolean_t dn_have_spill; /* have spill or are spilling */
/* parent IO for current sync write */
@@ -220,6 +229,22 @@ typedef struct dnode {
struct zfetch dn_zfetch;
} dnode_t;
+/*
+ * Adds a level of indirection between the dbuf and the dnode to avoid
+ * iterating descendent dbufs in dnode_move(). Handles are not allocated
+ * individually, but as an array of child dnodes in dnode_hold_impl().
+ */
+typedef struct dnode_handle {
+ /* Protects dnh_dnode from modification by dnode_move(). */
+ zrlock_t dnh_zrlock;
+ dnode_t *dnh_dnode; /* dnode currently reached through this handle */
+} dnode_handle_t;
+
+typedef struct dnode_children {
+ size_t dnc_count; /* number of children */
+ dnode_handle_t dnc_children[1]; /* child handles; sized dynamically */
+} dnode_children_t;
+
typedef struct free_range {
avl_node_t fr_node;
uint64_t fr_blkid;
@@ -227,8 +252,8 @@ typedef struct free_range {
} free_range_t;
dnode_t *dnode_special_open(struct objset *dd, dnode_phys_t *dnp,
- uint64_t object);
-void dnode_special_close(dnode_t *dn);
+ uint64_t object, dnode_handle_t *dnh);
+void dnode_special_close(dnode_handle_t *dnh);
void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx);
void dnode_setbonus_type(dnode_t *dn, dmu_object_type_t, dmu_tx_t *tx);
diff --git a/module/zfs/include/sys/dsl_dataset.h b/module/zfs/include/sys/dsl_dataset.h
index 58414e133..22733d070 100644
--- a/module/zfs/include/sys/dsl_dataset.h
+++ b/module/zfs/include/sys/dsl_dataset.h
@@ -162,6 +162,22 @@ struct dsl_ds_destroyarg {
boolean_t need_prep; /* do we need to retry due to EBUSY? */
};
+/*
+ * The max length of a temporary tag prefix is the number of hex digits
+ * required to express UINT64_MAX plus one for the hyphen.
+ */
+#define MAX_TAG_PREFIX_LEN 17
+
+struct dsl_ds_holdarg {
+ dsl_sync_task_group_t *dstg; /* task group; also the dataset hold tag */
+ char *htag; /* user hold tag (presumably the caller's htag) */
+ char *snapname; /* caller's snapname argument */
+ boolean_t recursive; /* caller's recursive flag */
+ boolean_t gotone;
+ boolean_t temphold; /* caller's temphold flag */
+ char failed[MAXPATHLEN]; /* dataset name (no @snap) of a failed task */
+};
+
#define dsl_dataset_is_snapshot(ds) \
((ds)->ds_phys->ds_num_children != 0)
@@ -182,6 +198,8 @@ void dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag);
boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok,
void *tag);
void dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *tag);
+void dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag,
+ minor_t minor);
uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname,
dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *);
uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
@@ -192,16 +210,19 @@ dsl_checkfunc_t dsl_dataset_destroy_check;
dsl_syncfunc_t dsl_dataset_destroy_sync;
dsl_checkfunc_t dsl_dataset_snapshot_check;
dsl_syncfunc_t dsl_dataset_snapshot_sync;
+dsl_syncfunc_t dsl_dataset_user_hold_sync;
int dsl_dataset_rename(char *name, const char *newname, boolean_t recursive);
int dsl_dataset_promote(const char *name, char *conflsnap);
int dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
boolean_t force);
int dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
- boolean_t recursive, boolean_t temphold);
+ boolean_t recursive, boolean_t temphold, int cleanup_fd);
+int dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag,
+ boolean_t temphold);
int dsl_dataset_user_release(char *dsname, char *snapname, char *htag,
boolean_t recursive);
int dsl_dataset_user_release_tmp(struct dsl_pool *dp, uint64_t dsobj,
- char *htag);
+ char *htag, boolean_t retry);
int dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp);
blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds);
diff --git a/module/zfs/include/sys/dsl_deleg.h b/module/zfs/include/sys/dsl_deleg.h
index a26a3f705..73c43bd23 100644
--- a/module/zfs/include/sys/dsl_deleg.h
+++ b/module/zfs/include/sys/dsl_deleg.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_DSL_DELEG_H
@@ -55,6 +54,7 @@ extern "C" {
#define ZFS_DELEG_PERM_GROUPUSED "groupused"
#define ZFS_DELEG_PERM_HOLD "hold"
#define ZFS_DELEG_PERM_RELEASE "release"
+#define ZFS_DELEG_PERM_DIFF "diff"
/*
* Note: the names of properties that are marked delegatable are also
@@ -64,6 +64,7 @@ extern "C" {
int dsl_deleg_get(const char *ddname, nvlist_t **nvp);
int dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset);
int dsl_deleg_access(const char *ddname, const char *perm, cred_t *cr);
+int dsl_deleg_access_impl(struct dsl_dataset *ds, const char *perm, cred_t *cr);
void dsl_deleg_set_create_perms(dsl_dir_t *dd, dmu_tx_t *tx, cred_t *cr);
int dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr);
int dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr);
diff --git a/module/zfs/include/sys/fm/protocol.h b/module/zfs/include/sys/fm/protocol.h
index c4103c48a..5eca760da 100644
--- a/module/zfs/include/sys/fm/protocol.h
+++ b/module/zfs/include/sys/fm/protocol.h
@@ -43,12 +43,13 @@ extern "C" {
#define FM_CLASS "class"
#define FM_VERSION "version"
-/* FM event class values */
+/* FM protocol category 1 class names */
#define FM_EREPORT_CLASS "ereport"
#define FM_FAULT_CLASS "fault"
#define FM_DEFECT_CLASS "defect"
#define FM_RSRC_CLASS "resource"
#define FM_LIST_EVENT "list"
+#define FM_IREPORT_CLASS "ireport"
/* FM list.* event class values */
#define FM_LIST_SUSPECT_CLASS FM_LIST_EVENT ".suspect"
@@ -72,6 +73,12 @@ extern "C" {
/* list.* event payload member names */
#define FM_LIST_EVENT_SIZE "list-sz"
+/* ireport.* event payload member names */
+#define FM_IREPORT_DETECTOR "detector"
+#define FM_IREPORT_UUID "uuid"
+#define FM_IREPORT_PRIORITY "pri"
+#define FM_IREPORT_ATTRIBUTES "attr"
+
/*
* list.suspect, isolated, updated, repaired and resolved
* versions/payload member names.
@@ -192,6 +199,7 @@ extern "C" {
#define FM_FMRI_SCHEME_PKG "pkg"
#define FM_FMRI_SCHEME_LEGACY "legacy-hc"
#define FM_FMRI_SCHEME_ZFS "zfs"
+#define FM_FMRI_SCHEME_SW "sw"
/* Scheme versions */
#define FMD_SCHEME_VERSION0 0
@@ -215,6 +223,8 @@ extern "C" {
#define FM_SVC_SCHEME_VERSION SVC_SCHEME_VERSION0
#define ZFS_SCHEME_VERSION0 0
#define FM_ZFS_SCHEME_VERSION ZFS_SCHEME_VERSION0
+#define SW_SCHEME_VERSION0 0
+#define FM_SW_SCHEME_VERSION SW_SCHEME_VERSION0
/* hc scheme member names */
#define FM_FMRI_HC_SERIAL_ID "serial"
@@ -299,6 +309,25 @@ extern "C" {
#define FM_FMRI_ZFS_POOL "pool"
#define FM_FMRI_ZFS_VDEV "vdev"
+/* sw scheme member names - extra indentation for members of an nvlist */
+#define FM_FMRI_SW_OBJ "object"
+#define FM_FMRI_SW_OBJ_PATH "path"
+#define FM_FMRI_SW_OBJ_ROOT "root"
+#define FM_FMRI_SW_OBJ_PKG "pkg"
+#define FM_FMRI_SW_SITE "site"
+#define FM_FMRI_SW_SITE_TOKEN "token"
+#define FM_FMRI_SW_SITE_MODULE "module"
+#define FM_FMRI_SW_SITE_FILE "file"
+#define FM_FMRI_SW_SITE_LINE "line"
+#define FM_FMRI_SW_SITE_FUNC "func"
+#define FM_FMRI_SW_CTXT "context"
+#define FM_FMRI_SW_CTXT_ORIGIN "origin"
+#define FM_FMRI_SW_CTXT_EXECNAME "execname"
+#define FM_FMRI_SW_CTXT_PID "pid"
+#define FM_FMRI_SW_CTXT_ZONE "zone"
+#define FM_FMRI_SW_CTXT_CTID "ctid"
+#define FM_FMRI_SW_CTXT_STACK "stack"
+
extern nv_alloc_t *fm_nva_xcreate(char *, size_t);
extern void fm_nva_xdestroy(nv_alloc_t *);
diff --git a/module/zfs/include/sys/fm/util.h b/module/zfs/include/sys/fm/util.h
index 4934814d8..37334101b 100644
--- a/module/zfs/include/sys/fm/util.h
+++ b/module/zfs/include/sys/fm/util.h
@@ -20,15 +20,12 @@
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_FM_UTIL_H
#define _SYS_FM_UTIL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -96,6 +93,7 @@ extern void fm_ereport_post(nvlist_t *, int);
extern void fm_payload_stack_add(nvlist_t *, const pc_t *, int);
+extern int is_fm_panic(void);	/* non-zero when fm_panicstr is set */
#endif /* _KERNEL */
#ifdef __cplusplus
diff --git a/module/zfs/include/sys/refcount.h b/module/zfs/include/sys/refcount.h
index bc3ade80f..1752c64e3 100644
--- a/module/zfs/include/sys/refcount.h
+++ b/module/zfs/include/sys/refcount.h
@@ -40,7 +40,7 @@ extern "C" {
*/
#define FTAG ((char *)__func__)
-#if defined(DEBUG) || !defined(_KERNEL)
+#ifdef ZFS_DEBUG
typedef struct reference {
list_node_t ref_link;
void *ref_holder;
@@ -67,11 +67,12 @@ int64_t refcount_add(refcount_t *rc, void *holder_tag);
int64_t refcount_remove(refcount_t *rc, void *holder_tag);
int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag);
int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag);
+void refcount_transfer(refcount_t *dst, refcount_t *src);
void refcount_init(void);
void refcount_fini(void);
-#else /* DEBUG */
+#else /* ZFS_DEBUG */
typedef struct refcount {
uint64_t rc_count;
@@ -97,7 +98,7 @@ typedef struct refcount {
#define refcount_init()
#define refcount_fini()
-#endif /* DEBUG */
+#endif /* ZFS_DEBUG */
#ifdef __cplusplus
}
diff --git a/module/zfs/include/sys/sa.h b/module/zfs/include/sys/sa.h
index e9a96a0f9..bc89fa07d 100644
--- a/module/zfs/include/sys/sa.h
+++ b/module/zfs/include/sys/sa.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_SA_H
@@ -141,7 +140,7 @@ dmu_buf_t *sa_get_db(sa_handle_t *);
uint64_t sa_handle_object(sa_handle_t *);
boolean_t sa_attr_would_spill(sa_handle_t *, sa_attr_type_t, int size);
void sa_register_update_callback(objset_t *, sa_update_cb_t *);
-sa_attr_type_t *sa_setup(objset_t *, uint64_t, sa_attr_reg_t *, int);
+int sa_setup(objset_t *, uint64_t, sa_attr_reg_t *, int, sa_attr_type_t **);
void sa_tear_down(objset_t *);
int sa_replace_all_by_template(sa_handle_t *, sa_bulk_attr_t *,
int, dmu_tx_t *);
diff --git a/module/zfs/include/sys/sa_impl.h b/module/zfs/include/sys/sa_impl.h
index 62497e702..6661e47cf 100644
--- a/module/zfs/include/sys/sa_impl.h
+++ b/module/zfs/include/sys/sa_impl.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_SA_IMPL_H
@@ -232,7 +231,7 @@ struct sa_handle {
((a == DMU_OT_SA) ? B_TRUE : B_FALSE)
#define SA_BONUSTYPE_FROM_DB(db) \
- (((dmu_buf_impl_t *)db)->db_dnode->dn_bonustype)
+ (dmu_get_bonustype((dmu_buf_t *)db))
#define SA_BLKPTR_SPACE (DN_MAX_BONUSLEN - sizeof (blkptr_t))
diff --git a/module/zfs/include/sys/spa.h b/module/zfs/include/sys/spa.h
index 41a40300e..456ec06dc 100644
--- a/module/zfs/include/sys/spa.h
+++ b/module/zfs/include/sys/spa.h
@@ -418,8 +418,8 @@ extern int spa_get_stats(const char *pool, nvlist_t **config,
extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props,
const char *history_str, nvlist_t *zplprops);
extern int spa_import_rootpool(char *devpath, char *devid);
-extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props);
-extern int spa_import_verbatim(const char *, nvlist_t *, nvlist_t *);
+extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props,
+ uint64_t flags);
extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
extern int spa_destroy(char *pool);
extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
@@ -602,6 +602,7 @@ extern objset_t *spa_meta_objset(spa_t *spa);
/* Miscellaneous support routines */
extern int spa_rename(const char *oldname, const char *newname);
+extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid);
extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid);
extern char *spa_strdup(const char *);
extern void spa_strfree(char *);
@@ -620,7 +621,6 @@ extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp);
extern boolean_t spa_has_slogs(spa_t *spa);
extern boolean_t spa_is_root(spa_t *spa);
extern boolean_t spa_writeable(spa_t *spa);
-extern void spa_rewind_data_to_nvlist(spa_t *spa, nvlist_t *to);
extern int spa_mode(spa_t *spa);
extern uint64_t strtonum(const char *str, char **nptr);
diff --git a/module/zfs/include/sys/spa_impl.h b/module/zfs/include/sys/spa_impl.h
index e2e1851ec..c965ffbbe 100644
--- a/module/zfs/include/sys/spa_impl.h
+++ b/module/zfs/include/sys/spa_impl.h
@@ -114,13 +114,14 @@ struct spa {
nvlist_t *spa_config; /* last synced config */
nvlist_t *spa_config_syncing; /* currently syncing config */
nvlist_t *spa_config_splitting; /* config for splitting */
+ nvlist_t *spa_load_info; /* info and errors from load */
uint64_t spa_config_txg; /* txg of last config change */
int spa_sync_pass; /* iterate-to-convergence */
pool_state_t spa_state; /* pool state */
int spa_inject_ref; /* injection references */
uint8_t spa_sync_on; /* sync threads are running */
spa_load_state_t spa_load_state; /* current load operation */
- boolean_t spa_load_verbatim; /* load the given config? */
+ uint64_t spa_import_flags; /* import specific flags */
taskq_t *spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES];
dsl_pool_t *spa_dsl_pool;
metaslab_class_t *spa_normal_class; /* normal data class */
@@ -130,6 +131,7 @@ struct spa {
uint64_t spa_freeze_txg; /* freeze pool at this txg */
uint64_t spa_load_max_txg; /* best initial ub_txg */
uint64_t spa_claim_max_txg; /* highest claimed birth txg */
+ timespec_t spa_loaded_ts; /* 1st successful open time */
objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */
txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */
vdev_t *spa_root_vdev; /* top-level vdev container */
@@ -146,9 +148,9 @@ struct spa {
uberblock_t spa_ubsync; /* last synced uberblock */
uberblock_t spa_uberblock; /* current uberblock */
boolean_t spa_extreme_rewind; /* rewind past deferred frees */
+ uint64_t spa_last_io; /* lbolt of last non-scan I/O */
kmutex_t spa_scrub_lock; /* resilver/scrub lock */
uint64_t spa_scrub_inflight; /* in-flight scrub I/Os */
- uint64_t spa_scrub_maxinflight; /* max in-flight scrub I/Os */
kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */
uint8_t spa_scrub_active; /* active or suspended? */
uint8_t spa_scrub_type; /* type of scrub we're doing */
diff --git a/module/zfs/include/sys/vdev_impl.h b/module/zfs/include/sys/vdev_impl.h
index 2b886bc58..161bd21f0 100644
--- a/module/zfs/include/sys/vdev_impl.h
+++ b/module/zfs/include/sys/vdev_impl.h
@@ -169,6 +169,7 @@ struct vdev {
uint64_t vdev_faulted; /* persistent faulted state */
uint64_t vdev_degraded; /* persistent degraded state */
uint64_t vdev_removed; /* persistent removed state */
+ uint64_t vdev_resilvering; /* persistent resilvering state */
uint64_t vdev_nparity; /* number of parity devices for raidz */
char *vdev_path; /* vdev path (if any) */
char *vdev_devid; /* vdev devid (if any) */
@@ -283,6 +284,7 @@ extern void vdev_remove_parent(vdev_t *cvd);
* vdev sync load and sync
*/
extern void vdev_load_log_state(vdev_t *nvd, vdev_t *ovd);
+extern boolean_t vdev_log_state_valid(vdev_t *vd);
extern void vdev_load(vdev_t *vd);
extern void vdev_sync(vdev_t *vd, uint64_t txg);
extern void vdev_sync_done(vdev_t *vd, uint64_t txg);
diff --git a/module/zfs/include/sys/zfs_acl.h b/module/zfs/include/sys/zfs_acl.h
index 72e868fab..c1a0aeebd 100644
--- a/module/zfs/include/sys/zfs_acl.h
+++ b/module/zfs/include/sys/zfs_acl.h
@@ -185,10 +185,6 @@ typedef struct zfs_acl_ids {
struct zfs_fuid_info *z_fuidp; /* for tracking fuids for log */
} zfs_acl_ids_t;
-#define ZFS_EXTERNAL_ACL(zp) \
- (zp->z_is_sa ? 0 : zfs_external_acl(zp))
-#define ZNODE_ACL_VERSION(zp) \
- (zp->z_is_sa ? ZFS_ACL_VERSION_FUID : zfs_znode_acl_version(zp))
/*
* Property values for acl_mode and acl_inherit.
*
@@ -222,7 +218,7 @@ int zfs_fastaccesschk_execute(struct znode *, cred_t *);
extern int zfs_zaccess_rwx(struct znode *, mode_t, int, cred_t *);
extern int zfs_zaccess_unix(struct znode *, mode_t, cred_t *);
extern int zfs_acl_access(struct znode *, int, cred_t *);
-int zfs_acl_chmod_setattr(struct znode *, zfs_acl_t **, uint64_t);
+void zfs_acl_chmod_setattr(struct znode *, zfs_acl_t **, uint64_t);
int zfs_zaccess_delete(struct znode *, struct znode *, cred_t *);
int zfs_zaccess_rename(struct znode *, struct znode *,
struct znode *, struct znode *, cred_t *cr);
diff --git a/module/zfs/include/sys/zfs_ioctl.h b/module/zfs/include/sys/zfs_ioctl.h
index b0cb4955e..84bf794fe 100644
--- a/module/zfs/include/sys/zfs_ioctl.h
+++ b/module/zfs/include/sys/zfs_ioctl.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_ZFS_IOCTL_H
@@ -31,6 +30,7 @@
#include <sys/zio.h>
#include <sys/dsl_deleg.h>
#include <sys/spa.h>
+#include <sys/zfs_stat.h>
#ifdef _KERNEL
#include <sys/nvpair.h>
@@ -199,6 +199,22 @@ typedef struct dmu_replay_record {
} drr_u;
} dmu_replay_record_t;
+/* diff record range types */
+typedef enum diff_type {
+ DDR_NONE = 0x1,
+ DDR_INUSE = 0x2,
+ DDR_FREE = 0x4
+} diff_type_t;
+
+/*
+ * The diff reports back ranges of free or in-use objects.
+ */
+typedef struct dmu_diff_record {
+ uint64_t ddr_type;
+ uint64_t ddr_first;
+ uint64_t ddr_last;
+} dmu_diff_record_t;
+
typedef struct zinject_record {
uint64_t zi_objset;
uint64_t zi_object;
@@ -265,6 +281,13 @@ typedef struct zfs_cmd {
zinject_record_t zc_inject_record;
boolean_t zc_defer_destroy;
boolean_t zc_temphold;
+ uint64_t zc_action_handle;
+ int zc_cleanup_fd;
+ uint8_t zc_pad[4]; /* alignment */
+ uint64_t zc_sendobj;
+ uint64_t zc_fromobj;
+ uint64_t zc_createtxg;
+ zfs_stat_t zc_stat;
} zfs_cmd_t;
typedef struct zfs_useracct {
@@ -274,8 +297,8 @@ typedef struct zfs_useracct {
uint64_t zu_space;
} zfs_useracct_t;
-#define ZVOL_MAX_MINOR (1 << 16)
-#define ZFS_MIN_MINOR (ZVOL_MAX_MINOR + 1)
+#define ZFSDEV_MAX_MINOR (1 << 16)
+#define ZFS_MIN_MINOR (ZFSDEV_MAX_MINOR + 1)
#define ZPOOL_EXPORT_AFTER_SPLIT 0x1
@@ -295,6 +318,28 @@ extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr);
extern int zfs_busy(void);
extern int zfs_unmount_snap(const char *, void *);
+/*
+ * ZFS minor numbers can refer to either a control device instance or
+ * a zvol. Depending on the value of zss_type, zss_data points to either
+ * a zvol_state_t or a zfs_onexit_t.
+ */
+enum zfs_soft_state_type {
+ ZSST_ZVOL,
+ ZSST_CTLDEV
+};
+
+typedef struct zfs_soft_state {
+ enum zfs_soft_state_type zss_type;
+ void *zss_data;
+} zfs_soft_state_t;
+
+extern void *zfsdev_get_soft_state(minor_t minor,
+ enum zfs_soft_state_type which);
+extern minor_t zfsdev_minor_alloc(void);
+
+extern void *zfsdev_state;
+extern kmutex_t zfsdev_state_lock;
+
#endif /* _KERNEL */
#ifdef __cplusplus
diff --git a/module/zfs/include/sys/zfs_onexit.h b/module/zfs/include/sys/zfs_onexit.h
new file mode 100644
index 000000000..4982bd4d0
--- /dev/null
+++ b/module/zfs/include/sys/zfs_onexit.h
@@ -0,0 +1,66 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_ZFS_ONEXIT_H
+#define _SYS_ZFS_ONEXIT_H
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+typedef struct zfs_onexit {
+ kmutex_t zo_lock;
+ list_t zo_actions;
+} zfs_onexit_t;
+
+typedef struct zfs_onexit_action_node {
+ list_node_t za_link;
+ void (*za_func)(void *);
+ void *za_data;
+} zfs_onexit_action_node_t;
+
+extern void zfs_onexit_init(zfs_onexit_t **zo);
+extern void zfs_onexit_destroy(zfs_onexit_t *zo);
+
+#endif /* _KERNEL */
+
+extern int zfs_onexit_fd_hold(int fd, minor_t *minorp);
+extern void zfs_onexit_fd_rele(int fd);
+extern int zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data,
+ uint64_t *action_handle);
+extern int zfs_onexit_del_cb(minor_t minor, uint64_t action_handle,
+ boolean_t fire);
+extern int zfs_onexit_cb_data(minor_t minor, uint64_t action_handle,
+ void **data);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZFS_ONEXIT_H */
diff --git a/module/zfs/include/sys/zfs_stat.h b/module/zfs/include/sys/zfs_stat.h
new file mode 100644
index 000000000..465aefaa2
--- /dev/null
+++ b/module/zfs/include/sys/zfs_stat.h
@@ -0,0 +1,56 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_FS_ZFS_STAT_H
+#define _SYS_FS_ZFS_STAT_H
+
+#ifdef _KERNEL
+#include <sys/isa_defs.h>
+#include <sys/types32.h>
+#include <sys/dmu.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * A limited number of zpl level stats are retrievable
+ * with an ioctl. zfs diff is the current consumer.
+ */
+typedef struct zfs_stat {
+ uint64_t zs_gen;
+ uint64_t zs_mode;
+ uint64_t zs_links;
+ uint64_t zs_ctime[2];
+} zfs_stat_t;
+
+extern int zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
+ char *buf, int len);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_STAT_H */
diff --git a/module/zfs/include/sys/zfs_vfsops.h b/module/zfs/include/sys/zfs_vfsops.h
index 86dcdacc0..38c87df43 100644
--- a/module/zfs/include/sys/zfs_vfsops.h
+++ b/module/zfs/include/sys/zfs_vfsops.h
@@ -79,6 +79,7 @@ struct zfsvfs {
kmutex_t z_lock;
uint64_t z_userquota_obj;
uint64_t z_groupquota_obj;
+ uint64_t z_replay_eof; /* New end of file - replay only */
sa_attr_type_t *z_attr_table; /* SA attr mapping->id */
#define ZFS_OBJ_MTX_SZ 64
kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */
diff --git a/module/zfs/include/sys/zfs_znode.h b/module/zfs/include/sys/zfs_znode.h
index 4781ee686..3e9621a0e 100644
--- a/module/zfs/include/sys/zfs_znode.h
+++ b/module/zfs/include/sys/zfs_znode.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_FS_ZFS_ZNODE_H
@@ -36,6 +35,7 @@
#include <sys/zfs_vfsops.h>
#include <sys/rrwlock.h>
#include <sys/zfs_sa.h>
+#include <sys/zfs_stat.h>
#endif
#include <sys/zfs_acl.h>
#include <sys/zil.h>
@@ -60,6 +60,8 @@ extern "C" {
#define ZFS_AV_QUARANTINED 0x0000020000000000
#define ZFS_AV_MODIFIED 0x0000040000000000
#define ZFS_REPARSE 0x0000080000000000
+#define ZFS_OFFLINE 0x0000100000000000
+#define ZFS_SPARSE 0x0000200000000000
#define ZFS_ATTR_SET(zp, attr, value, pflags, tx) \
{ \
@@ -188,17 +190,17 @@ typedef struct znode {
uint8_t z_unlinked; /* file has been unlinked */
uint8_t z_atime_dirty; /* atime needs to be synced */
uint8_t z_zn_prefetch; /* Prefetch znodes? */
+ uint8_t z_moved; /* Has this znode been moved? */
uint_t z_blksz; /* block size in bytes */
uint_t z_seq; /* modification sequence number */
uint64_t z_mapcnt; /* number of pages mapped to file */
- uint64_t z_last_itx; /* last ZIL itx on this znode */
uint64_t z_gen; /* generation (cached) */
uint64_t z_size; /* file size (cached) */
uint64_t z_atime[2]; /* atime (cached) */
uint64_t z_links; /* file links (cached) */
uint64_t z_pflags; /* pflags (cached) */
- uid_t z_uid; /* uid mapped (cached) */
- uid_t z_gid; /* gid mapped (cached) */
+ uint64_t z_uid; /* uid fuid (cached) */
+ uint64_t z_gid; /* gid fuid (cached) */
mode_t z_mode; /* mode (cached) */
uint32_t z_sync_cnt; /* synchronous open count */
kmutex_t z_acl_lock; /* acl data lock */
@@ -321,7 +323,8 @@ extern void zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
extern int zfs_log_create_txtype(zil_create_t, vsecattr_t *vsecp,
vattr_t *vap);
extern void zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
- znode_t *dzp, char *name);
+ znode_t *dzp, char *name, uint64_t foid);
+#define ZFS_NO_OBJECT 0 /* no object id */
extern void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
znode_t *dzp, znode_t *zp, char *name);
extern void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
diff --git a/module/zfs/include/sys/zil.h b/module/zfs/include/sys/zil.h
index 2f01cf922..a4c5575b2 100644
--- a/module/zfs/include/sys/zil.h
+++ b/module/zfs/include/sys/zil.h
@@ -169,18 +169,14 @@ typedef enum zil_create {
(txtype) == TX_ACL || \
(txtype) == TX_WRITE2)
-
/*
* Format of log records.
* The fields are carefully defined to allow them to be aligned
* and sized the same on sparc & intel architectures.
* Each log record has a common structure at the beginning.
*
- * Note, lrc_seq holds two different sequence numbers. Whilst in memory
- * it contains the transaction sequence number. The log record on
- * disk holds the sequence number of all log records which is used to
- * ensure we don't replay the same record. The two sequence numbers are
- * different because the transactions can now be pushed out of order.
+ * The log record on disk (lrc_seq) holds the sequence number of all log
+ * records which is used to ensure we don't replay the same record.
*/
typedef struct { /* common log record header */
uint64_t lrc_txtype; /* intent log transaction type */
@@ -371,6 +367,7 @@ typedef struct itx {
itx_wr_state_t itx_wr_state; /* write state */
uint8_t itx_sync; /* synchronous transaction */
uint64_t itx_sod; /* record size on disk */
+ uint64_t itx_oid; /* object id */
lr_t itx_lr; /* common part of log record */
/* followed by type-specific part of lr_xx_t and its immediate data */
} itx_t;
@@ -402,15 +399,15 @@ extern void zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx);
extern itx_t *zil_itx_create(uint64_t txtype, size_t lrsize);
extern void zil_itx_destroy(itx_t *itx);
-extern uint64_t zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
+extern void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
-extern void zil_commit(zilog_t *zilog, uint64_t seq, uint64_t oid);
+extern void zil_commit(zilog_t *zilog, uint64_t oid);
extern int zil_vdev_offline(const char *osname, void *txarg);
extern int zil_claim(const char *osname, void *txarg);
extern int zil_check_log_chain(const char *osname, void *txarg);
extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx);
-extern void zil_clean(zilog_t *zilog);
+extern void zil_clean(zilog_t *zilog, uint64_t synced_txg);
extern int zil_suspend(zilog_t *zilog);
extern void zil_resume(zilog_t *zilog);
diff --git a/module/zfs/include/sys/zil_impl.h b/module/zfs/include/sys/zil_impl.h
index 6560a7942..1d4c0cc6c 100644
--- a/module/zfs/include/sys/zil_impl.h
+++ b/module/zfs/include/sys/zil_impl.h
@@ -50,6 +50,28 @@ typedef struct lwb {
} lwb_t;
/*
+ * Intent log transaction lists
+ */
+typedef struct itxs {
+ list_t i_sync_list; /* list of synchronous itxs */
+ avl_tree_t i_async_tree; /* tree of foids for async itxs */
+} itxs_t;
+
+typedef struct itxg {
+ kmutex_t itxg_lock; /* lock for this structure */
+ uint64_t itxg_txg; /* txg for this chain */
+ uint64_t itxg_sod; /* total size on disk for this txg */
+ itxs_t *itxg_itxs; /* sync and async itxs */
+} itxg_t;
+
+/* for async nodes we build up an AVL tree of lists of async itxs per file */
+typedef struct itx_async_node {
+ uint64_t ia_foid; /* file object id */
+ list_t ia_list; /* list of async itxs for this foid */
+ avl_node_t ia_node; /* AVL tree linkage */
+} itx_async_node_t;
+
+/*
* Vdev flushing: during a zil_commit(), we build up an AVL tree of the vdevs
* we've touched so we know which ones need a write cache flush at the end.
*/
@@ -71,9 +93,7 @@ struct zilog {
objset_t *zl_os; /* object set we're logging */
zil_get_data_t *zl_get_data; /* callback to get object content */
zio_t *zl_root_zio; /* log writer root zio */
- uint64_t zl_itx_seq; /* next in-core itx sequence number */
uint64_t zl_lr_seq; /* on-disk log record sequence number */
- uint64_t zl_commit_seq; /* committed upto this number */
uint64_t zl_commit_lr_seq; /* last committed on-disk lr seq */
uint64_t zl_destroy_txg; /* txg of last zil_destroy() */
uint64_t zl_replayed_seq[TXG_SIZE]; /* last replayed rec seq */
@@ -93,10 +113,13 @@ struct zilog {
uint64_t zl_parse_lr_seq; /* highest lr seq on last parse */
uint64_t zl_parse_blk_count; /* number of blocks parsed */
uint64_t zl_parse_lr_count; /* number of log records parsed */
- list_t zl_itx_list; /* in-memory itx list */
+ uint64_t zl_next_batch; /* next batch number */
+ uint64_t zl_com_batch; /* committed batch number */
+ kcondvar_t zl_cv_batch[2]; /* batch condition variables */
+ itxg_t zl_itxg[TXG_SIZE]; /* intent log txg chains */
+ list_t zl_itx_commit_list; /* itx list to be committed */
uint64_t zl_itx_list_sz; /* total size of records on list */
uint64_t zl_cur_used; /* current commit log size used */
- uint64_t zl_prev_used; /* previous commit log size used */
list_t zl_lwb_list; /* in-flight log write list */
kmutex_t zl_vdev_lock; /* protects zl_vdev_tree */
avl_tree_t zl_vdev_tree; /* vdevs to flush in zil_commit() */
diff --git a/module/zfs/include/sys/zio.h b/module/zfs/include/sys/zio.h
index 0400c1702..97d8ec74d 100644
--- a/module/zfs/include/sys/zio.h
+++ b/module/zfs/include/sys/zio.h
@@ -147,7 +147,7 @@ enum zio_flag {
ZIO_FLAG_SELF_HEAL = 1 << 2,
ZIO_FLAG_RESILVER = 1 << 3,
ZIO_FLAG_SCRUB = 1 << 4,
- ZIO_FLAG_SCRUB_THREAD = 1 << 5,
+ ZIO_FLAG_SCAN_THREAD = 1 << 5,
#define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1)
diff --git a/module/zfs/include/sys/zrlock.h b/module/zfs/include/sys/zrlock.h
new file mode 100644
index 000000000..dcd63f7b5
--- /dev/null
+++ b/module/zfs/include/sys/zrlock.h
@@ -0,0 +1,66 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_ZRLOCK_H
+#define _SYS_ZRLOCK_H
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct zrlock {
+ kmutex_t zr_mtx;
+ volatile int32_t zr_refcount;
+ kcondvar_t zr_cv;
+ uint16_t zr_pad;
+#ifdef ZFS_DEBUG
+ kthread_t *zr_owner;
+ const char *zr_caller;
+#endif
+} zrlock_t;
+
+extern void zrl_init(zrlock_t *);
+extern void zrl_destroy(zrlock_t *);
+#ifdef ZFS_DEBUG
+#define zrl_add(_z) zrl_add_debug((_z), __func__)
+extern void zrl_add_debug(zrlock_t *, const char *);
+#else
+extern void zrl_add(zrlock_t *);
+#endif
+extern void zrl_remove(zrlock_t *);
+extern int zrl_tryenter(zrlock_t *);
+extern void zrl_exit(zrlock_t *);
+extern int zrl_is_zero(zrlock_t *);
+extern int zrl_is_locked(zrlock_t *);
+#ifdef ZFS_DEBUG
+extern kthread_t *zrl_owner(zrlock_t *);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZRLOCK_H */
diff --git a/module/zfs/lzjb.c b/module/zfs/lzjb.c
index 10952f472..ab3de51b7 100644
--- a/module/zfs/lzjb.c
+++ b/module/zfs/lzjb.c
@@ -20,12 +20,11 @@
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
- * We keep our own copy of this algorithm for 2 main reasons:
+ * We keep our own copy of this algorithm for 3 main reasons:
* 1. If we didn't, anyone modifying common/os/compress.c would
* directly break our on disk format
* 2. Our version of lzjb does not have a number of checks that the
@@ -33,8 +32,8 @@
* 3. We initialize the lempel to ensure deterministic results,
* so that identical blocks can always be deduplicated.
* In particular, we are adding the "feature" that compress() can
- * take a destination buffer size and return -1 if the data will not
- * compress to d_len or less.
+ * take a destination buffer size and return the compressed length, or the
+ * source length if compression would overflow the destination buffer.
*/
#include <sys/types.h>
diff --git a/module/zfs/refcount.c b/module/zfs/refcount.c
index 8358b4cee..600132f08 100644
--- a/module/zfs/refcount.c
+++ b/module/zfs/refcount.c
@@ -25,7 +25,7 @@
#include <sys/zfs_context.h>
#include <sys/refcount.h>
-#if defined(DEBUG) || !defined(_KERNEL)
+#ifdef ZFS_DEBUG
#ifdef _KERNEL
int reference_tracking_enable = FALSE; /* runs out of memory too easily */
@@ -189,4 +189,35 @@ refcount_remove(refcount_t *rc, void *holder)
return (refcount_remove_many(rc, 1, holder));
}
-#endif
+void
+refcount_transfer(refcount_t *dst, refcount_t *src)
+{
+ int64_t count, removed_count;
+ list_t list, removed;
+
+ list_create(&list, sizeof (reference_t),
+ offsetof(reference_t, ref_link));
+ list_create(&removed, sizeof (reference_t),
+ offsetof(reference_t, ref_link));
+
+ mutex_enter(&src->rc_mtx);
+ count = src->rc_count;
+ removed_count = src->rc_removed_count;
+ src->rc_count = 0;
+ src->rc_removed_count = 0;
+ list_move_tail(&list, &src->rc_list);
+ list_move_tail(&removed, &src->rc_removed);
+ mutex_exit(&src->rc_mtx);
+
+ mutex_enter(&dst->rc_mtx);
+ dst->rc_count += count;
+ dst->rc_removed_count += removed_count;
+ list_move_tail(&dst->rc_list, &list);
+ list_move_tail(&dst->rc_removed, &removed);
+ mutex_exit(&dst->rc_mtx);
+
+ list_destroy(&list);
+ list_destroy(&removed);
+}
+
+#endif /* ZFS_DEBUG */
diff --git a/module/zfs/sa.c b/module/zfs/sa.c
index a91b379f9..4cb4546b2 100644
--- a/module/zfs/sa.c
+++ b/module/zfs/sa.c
@@ -300,8 +300,8 @@ sa_layout_info_hash(sa_attr_type_t *attrs, int attr_count)
return (crc);
}
-static boolean_t
-sa_has_blkptr(sa_handle_t *hdl)
+static int
+sa_get_spill(sa_handle_t *hdl)
{
int rc;
if (hdl->sa_spill == NULL) {
@@ -312,7 +312,7 @@ sa_has_blkptr(sa_handle_t *hdl)
rc = 0;
}
- return (rc == 0 ? B_TRUE : B_FALSE);
+ return (rc);
}
/*
@@ -349,7 +349,8 @@ sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count,
buftypes |= SA_BONUS;
}
}
- if (bulk[i].sa_addr == NULL && sa_has_blkptr(hdl)) {
+ if (bulk[i].sa_addr == NULL &&
+ ((error = sa_get_spill(hdl)) == 0)) {
if (TOC_ATTR_PRESENT(
hdl->sa_spill_tab->sa_idx_tab[bulk[i].sa_attr])) {
SA_ATTR_INFO(sa, hdl->sa_spill_tab,
@@ -362,6 +363,10 @@ sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count,
}
}
}
+ if (error && error != ENOENT) {
+ return ((error == ECKSUM) ? EIO : error);
+ }
+
switch (data_op) {
case SA_LOOKUP:
if (bulk[i].sa_addr == NULL)
@@ -421,12 +426,10 @@ sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count,
char attr_name[8];
if (sa->sa_layout_attr_obj == 0) {
- int error;
sa->sa_layout_attr_obj = zap_create(os,
DMU_OT_SA_ATTR_LAYOUTS, DMU_OT_NONE, 0, tx);
- error = zap_add(os, sa->sa_master_obj, SA_LAYOUTS, 8, 1,
- &sa->sa_layout_attr_obj, tx);
- ASSERT3U(error, ==, 0);
+ VERIFY(zap_add(os, sa->sa_master_obj, SA_LAYOUTS, 8, 1,
+ &sa->sa_layout_attr_obj, tx) == 0);
}
(void) snprintf(attr_name, sizeof (attr_name),
@@ -667,10 +670,8 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
boolean_t dummy;
if (hdl->sa_spill == NULL) {
- int error;
- error = dmu_spill_hold_by_bonus(hdl->sa_bonus, NULL,
- &hdl->sa_spill);
- ASSERT3U(error, ==, 0);
+ VERIFY(dmu_spill_hold_by_bonus(hdl->sa_bonus, NULL,
+ &hdl->sa_spill) == 0);
}
dmu_buf_will_dirty(hdl->sa_spill, tx);
@@ -712,7 +713,7 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
length = attr_desc[i].sa_length;
if (buf_space < length) { /* switch to spill buffer */
- ASSERT(bonustype != DMU_OT_ZNODE);
+ VERIFY(bonustype == DMU_OT_SA);
if (buftype == SA_BONUS && !sa->sa_force_spill) {
sa_find_layout(hdl->sa_os, hash, attrs_start,
lot_count, tx, &lot);
@@ -746,6 +747,14 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
}
sa_find_layout(hdl->sa_os, hash, attrs_start, lot_count, tx, &lot);
+
+ /*
+ * Verify that old znodes always have layout number 0.
+ * Must be DMU_OT_SA for arbitrary layouts
+ */
+ VERIFY((bonustype == DMU_OT_ZNODE && lot->lot_num == 0) ||
+ (bonustype == DMU_OT_SA && lot->lot_num > 1));
+
if (bonustype == DMU_OT_SA) {
SA_SET_HDR(sahdr, lot->lot_num,
buftype == SA_BONUS ? hdrsize : spillhdrsize);
@@ -763,11 +772,6 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
if (!spilling) {
/*
* remove spill block that is no longer needed.
- * set sa_spill_remove to prevent sa_attr_op
- * from trying to retrieve spill block before its
- * been removed. The flag will be cleared if/when
- * the handle is destroyed recreated or
- * sa_build_layouts() needs to spill again.
*/
dmu_buf_rele(hdl->sa_spill, NULL);
hdl->sa_spill = NULL;
@@ -783,10 +787,31 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
}
static void
+sa_free_attr_table(sa_os_t *sa)
+{
+ int i;
+
+ if (sa->sa_attr_table == NULL)
+ return;
+
+ for (i = 0; i != sa->sa_num_attrs; i++) {
+ if (sa->sa_attr_table[i].sa_name)
+ kmem_free(sa->sa_attr_table[i].sa_name,
+ strlen(sa->sa_attr_table[i].sa_name) + 1);
+ }
+
+ kmem_free(sa->sa_attr_table,
+ sizeof (sa_attr_table_t) * sa->sa_num_attrs);
+
+ sa->sa_attr_table = NULL;
+}
+
+static int
sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count)
{
sa_os_t *sa = os->os_sa;
uint64_t sa_attr_count = 0;
+ uint64_t sa_reg_count;
int error = 0;
uint64_t attr_value;
sa_attr_table_t *tb;
@@ -800,8 +825,20 @@ sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count)
kmem_zalloc(count * sizeof (sa_attr_type_t), KM_SLEEP);
sa->sa_user_table_sz = count * sizeof (sa_attr_type_t);
- if (sa->sa_reg_attr_obj != 0)
- VERIFY(zap_count(os, sa->sa_reg_attr_obj, &sa_attr_count) == 0);
+ if (sa->sa_reg_attr_obj != 0) {
+ error = zap_count(os, sa->sa_reg_attr_obj,
+ &sa_attr_count);
+
+ /*
+ * Make sure we retrieved a count and that it isn't zero
+ */
+ if (error || (error == 0 && sa_attr_count == 0)) {
+ if (error == 0)
+ error = EINVAL;
+ goto bail;
+ }
+ sa_reg_count = sa_attr_count;
+ }
if (ostype == DMU_OST_ZFS && sa_attr_count == 0)
sa_attr_count += sa_legacy_attr_count;
@@ -830,7 +867,6 @@ sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count)
else
error = ENOENT;
switch (error) {
- default:
case ENOENT:
sa->sa_user_table[i] = (sa_attr_type_t)sa_attr_count;
sa_attr_count++;
@@ -838,11 +874,13 @@ sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count)
case 0:
sa->sa_user_table[i] = ATTR_NUM(attr_value);
break;
+ default:
+ goto bail;
}
}
- os->os_sa->sa_num_attrs = sa_attr_count;
- tb = os->os_sa->sa_attr_table =
+ sa->sa_num_attrs = sa_attr_count;
+ tb = sa->sa_attr_table =
kmem_zalloc(sizeof (sa_attr_table_t) * sa_attr_count, KM_SLEEP);
/*
@@ -853,7 +891,7 @@ sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count)
if (sa->sa_reg_attr_obj) {
for (zap_cursor_init(&zc, os, sa->sa_reg_attr_obj);
- zap_cursor_retrieve(&zc, &za) == 0;
+ (error = zap_cursor_retrieve(&zc, &za)) == 0;
zap_cursor_advance(&zc)) {
uint64_t value;
value = za.za_first_integer;
@@ -873,6 +911,15 @@ sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count)
strlen(za.za_name) +1);
}
zap_cursor_fini(&zc);
+ /*
+ * Make sure we processed the correct number of registered
+ * attributes
+ */
+ if (registered_count != sa_reg_count) {
+ ASSERT(error != 0);
+ goto bail;
+ }
+
}
if (ostype == DMU_OST_ZFS) {
@@ -908,18 +955,27 @@ sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count)
strlen(reg_attrs[i].sa_name) + 1);
}
- os->os_sa->sa_need_attr_registration =
+ sa->sa_need_attr_registration =
(sa_attr_count != registered_count);
+
+ return (0);
+bail:
+ kmem_free(sa->sa_user_table, count * sizeof (sa_attr_type_t));
+ sa->sa_user_table = NULL;
+ sa_free_attr_table(sa);
+ return ((error != 0) ? error : EINVAL);
}
-sa_attr_type_t *
-sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count)
+int
+sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count,
+ sa_attr_type_t **user_table)
{
zap_cursor_t zc;
zap_attribute_t za;
sa_os_t *sa;
dmu_objset_type_t ostype = dmu_objset_type(os);
sa_attr_type_t *tb;
+ int error;
mutex_enter(&os->os_lock);
if (os->os_sa) {
@@ -927,13 +983,15 @@ sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count)
mutex_exit(&os->os_lock);
tb = os->os_sa->sa_user_table;
mutex_exit(&os->os_sa->sa_lock);
- return (tb);
+ *user_table = tb;
+ return (0);
}
sa = kmem_zalloc(sizeof (sa_os_t), KM_SLEEP);
mutex_init(&sa->sa_lock, NULL, MUTEX_DEFAULT, NULL);
sa->sa_master_obj = sa_obj;
+ os->os_sa = sa;
mutex_enter(&sa->sa_lock);
mutex_exit(&os->os_lock);
avl_create(&sa->sa_layout_num_tree, layout_num_compare,
@@ -942,26 +1000,36 @@ sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count)
sizeof (sa_lot_t), offsetof(sa_lot_t, lot_hash_node));
if (sa_obj) {
- int error;
error = zap_lookup(os, sa_obj, SA_LAYOUTS,
8, 1, &sa->sa_layout_attr_obj);
- if (error != 0 && error != ENOENT) {
- return (NULL);
- }
+ if (error != 0 && error != ENOENT)
+ goto fail;
error = zap_lookup(os, sa_obj, SA_REGISTRY,
8, 1, &sa->sa_reg_attr_obj);
- if (error != 0 && error != ENOENT) {
- mutex_exit(&sa->sa_lock);
- return (NULL);
- }
+ if (error != 0 && error != ENOENT)
+ goto fail;
}
- os->os_sa = sa;
- sa_attr_table_setup(os, reg_attrs, count);
+ if ((error = sa_attr_table_setup(os, reg_attrs, count)) != 0)
+ goto fail;
if (sa->sa_layout_attr_obj != 0) {
+ uint64_t layout_count;
+
+ error = zap_count(os, sa->sa_layout_attr_obj,
+ &layout_count);
+
+ /*
+ * Layout number count should be > 0
+ */
+ if (error || (error == 0 && layout_count == 0)) {
+ if (error == 0)
+ error = EINVAL;
+ goto fail;
+ }
+
for (zap_cursor_init(&zc, os, sa->sa_layout_attr_obj);
- zap_cursor_retrieve(&zc, &za) == 0;
+ (error = zap_cursor_retrieve(&zc, &za)) == 0;
zap_cursor_advance(&zc)) {
sa_attr_type_t *lot_attrs;
uint64_t lot_num;
@@ -969,8 +1037,13 @@ sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count)
lot_attrs = kmem_zalloc(sizeof (sa_attr_type_t) *
za.za_num_integers, KM_SLEEP);
- VERIFY(zap_lookup(os, sa->sa_layout_attr_obj,
- za.za_name, 2, za.za_num_integers, lot_attrs) == 0);
+ if ((error = (zap_lookup(os, sa->sa_layout_attr_obj,
+ za.za_name, 2, za.za_num_integers,
+ lot_attrs))) != 0) {
+ kmem_free(lot_attrs, sizeof (sa_attr_type_t) *
+ za.za_num_integers);
+ break;
+ }
VERIFY(ddi_strtoull(za.za_name, NULL, 10,
(unsigned long long *)&lot_num) == 0);
@@ -982,6 +1055,15 @@ sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count)
za.za_num_integers);
}
zap_cursor_fini(&zc);
+
+ /*
+ * Make sure layout count matches number of entries added
+ * to AVL tree
+ */
+ if (avl_numnodes(&sa->sa_layout_num_tree) != layout_count) {
+ ASSERT(error != 0);
+ goto fail;
+ }
}
/* Add special layout number for old ZNODES */
@@ -994,8 +1076,17 @@ sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count)
(void) sa_add_layout_entry(os, sa_dummy_zpl_layout, 0, 1,
0, B_FALSE, NULL);
}
+ *user_table = os->os_sa->sa_user_table;
mutex_exit(&sa->sa_lock);
- return (os->os_sa->sa_user_table);
+ return (0);
+fail:
+ os->os_sa = NULL;
+ sa_free_attr_table(sa);
+ if (sa->sa_user_table)
+ kmem_free(sa->sa_user_table, sa->sa_user_table_sz);
+ mutex_exit(&sa->sa_lock);
+ kmem_free(sa, sizeof (sa_os_t));
+ return ((error == ECKSUM) ? EIO : error);
}
void
@@ -1004,20 +1095,12 @@ sa_tear_down(objset_t *os)
sa_os_t *sa = os->os_sa;
sa_lot_t *layout;
void *cookie;
- int i;
kmem_free(sa->sa_user_table, sa->sa_user_table_sz);
/* Free up attr table */
- for (i = 0; i != sa->sa_num_attrs; i++) {
- if (sa->sa_attr_table[i].sa_name)
- kmem_free(sa->sa_attr_table[i].sa_name,
- strlen(sa->sa_attr_table[i].sa_name) + 1);
- }
-
- kmem_free(sa->sa_attr_table,
- sizeof (sa_attr_table_t) * sa->sa_num_attrs);
+ sa_free_attr_table(sa);
cookie = NULL;
while (layout = avl_destroy_nodes(&sa->sa_layout_hash_tree, &cookie)) {
@@ -1361,11 +1444,9 @@ sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, uio_t *uio)
ASSERT(hdl);
mutex_enter(&hdl->sa_lock);
- if (sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL) == 0) {
+ if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) == 0) {
error = uiomove((void *)bulk.sa_addr, MIN(bulk.sa_size,
uio->uio_resid), UIO_READ, uio);
- } else {
- error = ENOENT;
}
mutex_exit(&hdl->sa_lock);
return (error);
@@ -1373,11 +1454,6 @@ sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, uio_t *uio)
}
#endif
-/*
- * Find an already existing TOC from given os and data
- * This is a special interface to be used by the ZPL for
- * finding the uid/gid/gen attributes.
- */
void *
sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, void *data)
{
@@ -1475,12 +1551,10 @@ sa_attr_register_sync(sa_handle_t *hdl, dmu_tx_t *tx)
}
if (sa->sa_reg_attr_obj == NULL) {
- int error;
sa->sa_reg_attr_obj = zap_create(hdl->sa_os,
DMU_OT_SA_ATTR_REGISTRATION, DMU_OT_NONE, 0, tx);
- error = zap_add(hdl->sa_os, sa->sa_master_obj,
- SA_REGISTRY, 8, 1, &sa->sa_reg_attr_obj, tx);
- ASSERT(error == 0);
+ VERIFY(zap_add(hdl->sa_os, sa->sa_master_obj,
+ SA_REGISTRY, 8, 1, &sa->sa_reg_attr_obj, tx) == 0);
}
for (i = 0; i != sa->sa_num_attrs; i++) {
if (sa->sa_attr_table[i].sa_registered)
@@ -1538,6 +1612,8 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
uint16_t buflen, dmu_tx_t *tx)
{
sa_os_t *sa = hdl->sa_os->os_sa;
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
+ dnode_t *dn;
sa_bulk_attr_t *attr_desc;
void *old_data[2];
int bonus_attr_count = 0;
@@ -1555,7 +1631,9 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
/* First make a copy of the old data */
- if (((dmu_buf_impl_t *)hdl->sa_bonus)->db_dnode->dn_bonuslen) {
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ if (dn->dn_bonuslen != 0) {
bonus_data_size = hdl->sa_bonus->db_size;
old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP);
bcopy(hdl->sa_bonus->db_data, old_data[0],
@@ -1564,16 +1642,21 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
} else {
old_data[0] = NULL;
}
+ DB_DNODE_EXIT(db);
/* Bring spill buffer online if it isn't currently */
- if (sa_has_blkptr(hdl)) {
+ if ((error = sa_get_spill(hdl)) == 0) {
spill_data_size = hdl->sa_spill->db_size;
old_data[1] = kmem_alloc(spill_data_size, KM_SLEEP);
bcopy(hdl->sa_spill->db_data, old_data[1],
hdl->sa_spill->db_size);
spill_attr_count =
hdl->sa_spill_tab->sa_layout->lot_attr_count;
+ } else if (error && error != ENOENT) {
+ if (old_data[0])
+ kmem_free(old_data[0], bonus_data_size);
+ return (error);
} else {
old_data[1] = NULL;
}
@@ -1722,6 +1805,7 @@ int
sa_size(sa_handle_t *hdl, sa_attr_type_t attr, int *size)
{
sa_bulk_attr_t bulk;
+ int error;
bulk.sa_data = NULL;
bulk.sa_attr = attr;
@@ -1729,9 +1813,9 @@ sa_size(sa_handle_t *hdl, sa_attr_type_t attr, int *size)
ASSERT(hdl);
mutex_enter(&hdl->sa_lock);
- if (sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) {
+ if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) != 0) {
mutex_exit(&hdl->sa_lock);
- return (ENOENT);
+ return (error);
}
*size = bulk.sa_size;
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index d7c5de0d3..b6190e4cf 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -116,6 +116,7 @@ static boolean_t spa_has_active_shared_spare(spa_t *spa);
static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
char **ereport);
+static void spa_vdev_resilver_done(spa_t *spa);
uint_t zio_taskq_batch_pct = 100; /* 1 thread per cpu in pset */
id_t zio_taskq_psrset_bind = PS_NONE;
@@ -180,6 +181,8 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
size - alloc, src);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
+ (spa_mode(spa) == FREAD), src);
cap = (size == 0) ? 0 : (alloc * 100 / size);
spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
@@ -529,7 +532,9 @@ spa_prop_set(spa_t *spa, nvlist_t *nvp)
nvpair_name(elem))) == ZPROP_INVAL)
return (EINVAL);
- if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT)
+ if (prop == ZPOOL_PROP_CACHEFILE ||
+ prop == ZPOOL_PROP_ALTROOT ||
+ prop == ZPOOL_PROP_READONLY)
continue;
need_sync = B_TRUE;
@@ -1284,33 +1289,131 @@ spa_check_removed(vdev_t *vd)
}
/*
- * Load the slog device state from the config object since it's possible
- * that the label does not contain the most up-to-date information.
+ * Validate the current config against the MOS config
*/
-void
-spa_load_log_state(spa_t *spa, nvlist_t *nv)
+static boolean_t
+spa_config_valid(spa_t *spa, nvlist_t *config)
{
- vdev_t *ovd, *rvd = spa->spa_root_vdev;
+ vdev_t *mrvd, *rvd = spa->spa_root_vdev;
+ nvlist_t *nv;
+
+ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
+
+ ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);
/*
- * Load the original root vdev tree from the passed config.
+ * If we're doing a normal import, then build up any additional
+ * diagnostic information about missing devices in this config.
+ * We'll pass this up to the user for further processing.
*/
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
- VERIFY(spa_config_parse(spa, &ovd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
+ if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
+ nvlist_t **child, *nv;
+ uint64_t idx = 0;
+
+ child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
+ KM_SLEEP);
+ VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+ vdev_t *mtvd = mrvd->vdev_child[c];
+
+ if (tvd->vdev_ops == &vdev_missing_ops &&
+ mtvd->vdev_ops != &vdev_missing_ops &&
+ mtvd->vdev_islog)
+ child[idx++] = vdev_config_generate(spa, mtvd,
+ B_FALSE, 0);
+ }
+ if (idx) {
+ VERIFY(nvlist_add_nvlist_array(nv,
+ ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
+ VERIFY(nvlist_add_nvlist(spa->spa_load_info,
+ ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0);
+
+ for (int i = 0; i < idx; i++)
+ nvlist_free(child[i]);
+ }
+ nvlist_free(nv);
+ kmem_free(child, rvd->vdev_children * sizeof (char **));
+ }
+
+ /*
+ * Compare the root vdev tree with the information we have
+ * from the MOS config (mrvd). Check each top-level vdev
+ * with the corresponding MOS config top-level (mtvd).
+ */
for (int c = 0; c < rvd->vdev_children; c++) {
- vdev_t *cvd = rvd->vdev_child[c];
- if (cvd->vdev_islog)
- vdev_load_log_state(cvd, ovd->vdev_child[c]);
+ vdev_t *tvd = rvd->vdev_child[c];
+ vdev_t *mtvd = mrvd->vdev_child[c];
+
+ /*
+ * Resolve any "missing" vdevs in the current configuration.
+ * If we find that the MOS config has more accurate information
+ * about the top-level vdev then use that vdev instead.
+ */
+ if (tvd->vdev_ops == &vdev_missing_ops &&
+ mtvd->vdev_ops != &vdev_missing_ops) {
+
+ if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG))
+ continue;
+
+ /*
+ * Device specific actions.
+ */
+ if (mtvd->vdev_islog) {
+ spa_set_log_state(spa, SPA_LOG_CLEAR);
+ } else {
+ /*
+ * XXX - once we have 'readonly' pool
+ * support we should be able to handle
+ * missing data devices by transitioning
+ * the pool to readonly.
+ */
+ continue;
+ }
+
+ /*
+ * Swap the missing vdev with the data we were
+ * able to obtain from the MOS config.
+ */
+ vdev_remove_child(rvd, tvd);
+ vdev_remove_child(mrvd, mtvd);
+
+ vdev_add_child(rvd, mtvd);
+ vdev_add_child(mrvd, tvd);
+
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ vdev_load(mtvd);
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+
+ vdev_reopen(rvd);
+ } else if (mtvd->vdev_islog) {
+ /*
+ * Load the slog device's state from the MOS config
+ * since it's possible that the label does not
+ * contain the most up-to-date information.
+ */
+ vdev_load_log_state(tvd, mtvd);
+ vdev_reopen(tvd);
+ }
}
- vdev_free(ovd);
+ vdev_free(mrvd);
spa_config_exit(spa, SCL_ALL, FTAG);
+
+ /*
+ * Ensure we were able to validate the config.
+ */
+ return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
}
/*
* Check for missing log devices
*/
-int
+static int
spa_check_logs(spa_t *spa)
{
switch (spa->spa_log_state) {
@@ -1474,9 +1577,19 @@ spa_load_verify(spa_t *spa)
if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
sle.sle_data_count <= policy.zrp_maxdata) {
+ int64_t loss = 0;
+
verify_ok = B_TRUE;
spa->spa_load_txg = spa->spa_uberblock.ub_txg;
spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
+
+ loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
+ VERIFY(nvlist_add_uint64(spa->spa_load_info,
+ ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
+ VERIFY(nvlist_add_int64(spa->spa_load_info,
+ ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
+ VERIFY(nvlist_add_uint64(spa->spa_load_info,
+ ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
} else {
spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
}
@@ -1635,13 +1748,21 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
KM_SLEEP) == 0);
}
+ gethrestime(&spa->spa_loaded_ts);
error = spa_load_impl(spa, pool_guid, config, state, type,
mosconfig, &ereport);
}
spa->spa_minref = refcount_count(&spa->spa_refcount);
- if (error && error != EBADF)
- zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
+ if (error) {
+ if (error != EEXIST) {
+ spa->spa_loaded_ts.tv_sec = 0;
+ spa->spa_loaded_ts.tv_nsec = 0;
+ }
+ if (error != EBADF) {
+ zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
+ }
+ }
spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
spa->spa_ena = 0;
@@ -1661,7 +1782,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
nvlist_t *nvroot = NULL;
vdev_t *rvd;
uberblock_t *ub = &spa->spa_uberblock;
- uint64_t config_cache_txg = spa->spa_config_txg;
+ uint64_t children, config_cache_txg = spa->spa_config_txg;
int orig_mode = spa->spa_mode;
int parse;
uint64_t obj;
@@ -1760,9 +1881,13 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
/*
* If the vdev guid sum doesn't match the uberblock, we have an
- * incomplete configuration.
+ * incomplete configuration. We first check to see if the pool
+ * is aware of the complete config (i.e. ZPOOL_CONFIG_VDEV_CHILDREN).
+ * If it is, defer the vdev_guid_sum check till later so we
+ * can handle missing vdevs.
*/
- if (mosconfig && type != SPA_IMPORT_ASSEMBLE &&
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
+ &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE &&
rvd->vdev_guid_sum != ub->ub_guid_sum)
return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
@@ -1982,13 +2107,6 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
spa_config_exit(spa, SCL_ALL, FTAG);
/*
- * Check the state of the root vdev. If it can't be opened, it
- * indicates one or more toplevel vdevs are faulted.
- */
- if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
- return (ENXIO);
-
- /*
* Load the DDTs (dedup tables).
*/
error = ddt_load(spa);
@@ -1997,16 +2115,12 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
spa_update_dspace(spa);
- if (state != SPA_LOAD_TRYIMPORT) {
- error = spa_load_verify(spa);
- if (error)
- return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
- error));
- }
-
/*
- * Load the intent log state and check log integrity. If we're
- * assembling a pool from a split, the log is not transferred over.
+ * Validate the config, using the MOS config to fill in any
+ * information which might be missing. If we fail to validate
+ * the config then declare the pool unfit for use. If we're
+ * assembling a pool from a split, the log is not transferred
+ * over.
*/
if (type != SPA_IMPORT_ASSEMBLE) {
nvlist_t *nvconfig;
@@ -2014,17 +2128,37 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
- VERIFY(nvlist_lookup_nvlist(nvconfig, ZPOOL_CONFIG_VDEV_TREE,
- &nvroot) == 0);
- spa_load_log_state(spa, nvroot);
+ if (!spa_config_valid(spa, nvconfig)) {
+ nvlist_free(nvconfig);
+ return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
+ ENXIO));
+ }
nvlist_free(nvconfig);
+ /*
+ * Now that we've validated the config, check the state of the
+ * root vdev. If it can't be opened, it indicates one or
+ * more toplevel vdevs are faulted.
+ */
+ if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
+ return (ENXIO);
+
if (spa_check_logs(spa)) {
*ereport = FM_EREPORT_ZFS_LOG_REPLAY;
return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
}
}
+ /*
+ * We've successfully opened the pool, verify that we're ready
+ * to start pushing transactions.
+ */
+ if (state != SPA_LOAD_TRYIMPORT) {
+ if (error = spa_load_verify(spa))
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
+ error));
+ }
+
if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
spa->spa_load_max_txg == UINT64_MAX)) {
dmu_tx_t *tx;
@@ -2066,12 +2200,13 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
* If the config cache is stale, or we have uninitialized
* metaslabs (see spa_vdev_add()), then update the config.
*
- * If spa_load_verbatim is true, trust the current
+ * If this is a verbatim import, trust the current
* in-core spa_config and update the disk labels.
*/
if (config_cache_txg != spa->spa_config_txg ||
- state == SPA_LOAD_IMPORT || spa->spa_load_verbatim ||
- state == SPA_LOAD_RECOVER)
+ state == SPA_LOAD_IMPORT ||
+ state == SPA_LOAD_RECOVER ||
+ (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
need_update = B_TRUE;
for (int c = 0; c < rvd->vdev_children; c++)
@@ -2110,12 +2245,14 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
static int
spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
{
+ int mode = spa->spa_mode;
+
spa_unload(spa);
spa_deactivate(spa);
spa->spa_load_max_txg--;
- spa_activate(spa, spa_mode_global);
+ spa_activate(spa, mode);
spa_async_suspend(spa);
return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig));
@@ -2173,9 +2310,6 @@ spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
rewind_error = spa_load_retry(spa, state, mosconfig);
}
- if (config)
- spa_rewind_data_to_nvlist(spa, config);
-
spa->spa_extreme_rewind = B_FALSE;
spa->spa_load_max_txg = UINT64_MAX;
@@ -2202,6 +2336,7 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
nvlist_t **config)
{
spa_t *spa;
+ spa_load_state_t state = SPA_LOAD_OPEN;
int error;
int locked = B_FALSE;
@@ -2225,7 +2360,6 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
}
if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
- spa_load_state_t state = SPA_LOAD_OPEN;
zpool_rewind_policy_t policy;
zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config,
@@ -2264,9 +2398,13 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
* information: the state of each vdev after the
* attempted vdev_open(). Return this to the user.
*/
- if (config != NULL && spa->spa_config)
+ if (config != NULL && spa->spa_config) {
VERIFY(nvlist_dup(spa->spa_config, config,
KM_SLEEP) == 0);
+ VERIFY(nvlist_add_nvlist(*config,
+ ZPOOL_CONFIG_LOAD_INFO,
+ spa->spa_load_info) == 0);
+ }
spa_unload(spa);
spa_deactivate(spa);
spa->spa_last_open_failed = error;
@@ -2275,15 +2413,22 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
*spapp = NULL;
return (error);
}
-
}
spa_open_ref(spa, tag);
-
if (config != NULL)
*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
+ /*
+ * If we've recovered the pool, pass back any information we
+ * gathered while doing the load.
+ */
+ if (state == SPA_LOAD_RECOVER) {
+ VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
+ spa->spa_load_info) == 0);
+ }
+
if (locked) {
spa->spa_last_open_failed = 0;
spa->spa_last_ubsync_txg = 0;
@@ -2459,6 +2604,13 @@ spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
if (*config != NULL) {
+ uint64_t loadtimes[2];
+
+ loadtimes[0] = spa->spa_loaded_ts.tv_sec;
+ loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
+ VERIFY(nvlist_add_uint64_array(*config,
+ ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);
+
VERIFY(nvlist_add_uint64(*config,
ZPOOL_CONFIG_ERRCOUNT,
spa_get_errlog_size(spa)) == 0);
@@ -3032,7 +3184,7 @@ spa_import_rootpool(char *devpath, char *devid)
spa = spa_add(pname, config, NULL);
spa->spa_is_root = B_TRUE;
- spa->spa_load_verbatim = B_TRUE;
+ spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
/*
* Build up a vdev tree based on the boot device's label config.
@@ -3081,7 +3233,8 @@ spa_import_rootpool(char *devpath, char *devid)
!bvd->vdev_isspare) {
cmn_err(CE_NOTE, "The boot device is currently spared. Please "
"try booting from '%s'",
- bvd->vdev_parent->vdev_child[1]->vdev_path);
+ bvd->vdev_parent->
+ vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path);
error = EINVAL;
goto out;
}
@@ -3101,48 +3254,17 @@ out:
#endif
/*
- * Take a pool and insert it into the namespace as if it had been loaded at
- * boot.
- */
-int
-spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props)
-{
- spa_t *spa;
- char *altroot = NULL;
-
- mutex_enter(&spa_namespace_lock);
- if (spa_lookup(pool) != NULL) {
- mutex_exit(&spa_namespace_lock);
- return (EEXIST);
- }
-
- (void) nvlist_lookup_string(props,
- zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
- spa = spa_add(pool, config, altroot);
-
- spa->spa_load_verbatim = B_TRUE;
-
- if (props != NULL)
- spa_configfile_set(spa, props, B_FALSE);
-
- spa_config_sync(spa, B_FALSE, B_TRUE);
-
- mutex_exit(&spa_namespace_lock);
- spa_history_log_version(spa, LOG_POOL_IMPORT);
-
- return (0);
-}
-
-/*
* Import a non-root pool into the system.
*/
int
-spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
+spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
{
spa_t *spa;
char *altroot = NULL;
spa_load_state_t state = SPA_LOAD_IMPORT;
zpool_rewind_policy_t policy;
+ uint64_t mode = spa_mode_global;
+ uint64_t readonly = B_FALSE;
int error;
nvlist_t *nvroot;
nvlist_t **spares, **l2cache;
@@ -3157,23 +3279,45 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
return (EEXIST);
}
- zpool_get_rewind_policy(config, &policy);
- if (policy.zrp_request & ZPOOL_DO_REWIND)
- state = SPA_LOAD_RECOVER;
-
/*
* Create and initialize the spa structure.
*/
(void) nvlist_lookup_string(props,
zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
+ (void) nvlist_lookup_uint64(props,
+ zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
+ if (readonly)
+ mode = FREAD;
spa = spa_add(pool, config, altroot);
- spa_activate(spa, spa_mode_global);
+ spa->spa_import_flags = flags;
+
+ /*
+ * Verbatim import - Take a pool and insert it into the namespace
+ * as if it had been loaded at boot.
+ */
+ if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
+ if (props != NULL)
+ spa_configfile_set(spa, props, B_FALSE);
+
+ spa_config_sync(spa, B_FALSE, B_TRUE);
+
+ mutex_exit(&spa_namespace_lock);
+ spa_history_log_version(spa, LOG_POOL_IMPORT);
+
+ return (0);
+ }
+
+ spa_activate(spa, mode);
/*
* Don't start async tasks until we know everything is healthy.
*/
spa_async_suspend(spa);
+ zpool_get_rewind_policy(config, &policy);
+ if (policy.zrp_request & ZPOOL_DO_REWIND)
+ state = SPA_LOAD_RECOVER;
+
/*
* Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig
* because the user-supplied config is actually the one to trust when
@@ -3181,14 +3325,16 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
*/
if (state != SPA_LOAD_RECOVER)
spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
+
error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg,
policy.zrp_request);
/*
- * Propagate anything learned about failing or best txgs
- * back to caller
+ * Propagate anything learned while loading the pool and pass it
+ * back to caller (i.e. rewind info, missing devices, etc).
*/
- spa_rewind_data_to_nvlist(spa, config);
+ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
+ spa->spa_load_info) == 0);
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
/*
@@ -3228,6 +3374,8 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
return (error);
}
+ spa_async_resume(spa);
+
/*
* Override any spares and level 2 cache devices as specified by
* the user, as these may have correct device names/devids, etc.
@@ -3278,8 +3426,6 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
}
- spa_async_resume(spa);
-
/*
* It's possible that the pool was expanded while it was exported.
* We kick off an async task to handle this for us.
@@ -3542,6 +3688,8 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
nvlist_t **spares, **l2cache;
uint_t nspares, nl2cache;
+ ASSERT(spa_writeable(spa));
+
txg = spa_vdev_enter(spa);
if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
@@ -3653,6 +3801,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
int newvd_isspare;
int error;
+ ASSERT(spa_writeable(spa));
+
txg = spa_vdev_enter(spa);
oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
@@ -3702,7 +3852,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
* spares.
*/
if (pvd->vdev_ops == &vdev_spare_ops &&
- pvd->vdev_child[1] == oldvd &&
+ oldvd->vdev_isspare &&
!spa_has_spare(spa, newvd->vdev_guid))
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
@@ -3714,13 +3864,15 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
* the same (spare replaces spare, non-spare replaces
* non-spare).
*/
- if (pvd->vdev_ops == &vdev_replacing_ops)
+ if (pvd->vdev_ops == &vdev_replacing_ops &&
+ spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
- else if (pvd->vdev_ops == &vdev_spare_ops &&
- newvd->vdev_isspare != oldvd->vdev_isspare)
+ } else if (pvd->vdev_ops == &vdev_spare_ops &&
+ newvd->vdev_isspare != oldvd->vdev_isspare) {
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
- else if (pvd->vdev_ops != &vdev_spare_ops &&
- newvd->vdev_isspare)
+ }
+
+ if (newvd->vdev_isspare)
pvops = &vdev_spare_ops;
else
pvops = &vdev_replacing_ops;
@@ -3755,6 +3907,9 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
}
}
+ /* mark the device being resilvered */
+ newvd->vdev_resilvering = B_TRUE;
+
/*
* If the parent is not a mirror, or if we're replacing, insert the new
* mirror/replacing/spare vdev above oldvd.
@@ -3823,6 +3978,9 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
spa_strfree(oldvdpath);
spa_strfree(newvdpath);
+ if (spa->spa_bootfs)
+ spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH);
+
return (0);
}
@@ -3840,9 +3998,10 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
vdev_t *vd, *pvd, *cvd, *tvd;
boolean_t unspare = B_FALSE;
uint64_t unspare_guid;
- size_t len;
char *vdpath;
+ ASSERT(spa_writeable(spa));
+
txg = spa_vdev_enter(spa);
vd = spa_lookup_by_guid(spa, guid, B_FALSE);
@@ -3872,18 +4031,11 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
return (spa_vdev_exit(spa, NULL, txg, EBUSY));
/*
- * If replace_done is specified, only remove this device if it's
- * the first child of a replacing vdev. For the 'spare' vdev, either
- * disk can be removed.
+ * Only 'replacing' or 'spare' vdevs can be replaced.
*/
- if (replace_done) {
- if (pvd->vdev_ops == &vdev_replacing_ops) {
- if (vd->vdev_id != 0)
- return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
- } else if (pvd->vdev_ops != &vdev_spare_ops) {
- return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
- }
- }
+ if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
+ pvd->vdev_ops != &vdev_spare_ops)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
spa_version(spa) >= SPA_VERSION_SPARES);
@@ -3910,16 +4062,22 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
* check to see if we changed the original vdev's path to have "/old"
* at the end in spa_vdev_attach(). If so, undo that change now.
*/
- if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id == 1 &&
- pvd->vdev_child[0]->vdev_path != NULL &&
- pvd->vdev_child[1]->vdev_path != NULL) {
- ASSERT(pvd->vdev_child[1] == vd);
- cvd = pvd->vdev_child[0];
- len = strlen(vd->vdev_path);
- if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
- strcmp(cvd->vdev_path + len, "/old") == 0) {
- spa_strfree(cvd->vdev_path);
- cvd->vdev_path = spa_strdup(vd->vdev_path);
+ if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
+ vd->vdev_path != NULL) {
+ size_t len = strlen(vd->vdev_path);
+
+ for (int c = 0; c < pvd->vdev_children; c++) {
+ cvd = pvd->vdev_child[c];
+
+ if (cvd == vd || cvd->vdev_path == NULL)
+ continue;
+
+ if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
+ strcmp(cvd->vdev_path + len, "/old") == 0) {
+ spa_strfree(cvd->vdev_path);
+ cvd->vdev_path = spa_strdup(vd->vdev_path);
+ break;
+ }
}
}
@@ -3929,7 +4087,8 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
* active spare list for the pool.
*/
if (pvd->vdev_ops == &vdev_spare_ops &&
- vd->vdev_id == 0 && pvd->vdev_child[1]->vdev_isspare)
+ vd->vdev_id == 0 &&
+ pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
unspare = B_TRUE;
/*
@@ -3951,7 +4110,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
/*
* Remember one of the remaining children so we can get tvd below.
*/
- cvd = pvd->vdev_child[0];
+ cvd = pvd->vdev_child[pvd->vdev_children - 1];
/*
* If we need to remove the remaining child from the list of hot spares,
@@ -3967,14 +4126,20 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
spa_spare_remove(cvd);
unspare_guid = cvd->vdev_guid;
(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
+ cvd->vdev_unspare = B_TRUE;
}
/*
* If the parent mirror/replacing vdev only has one child,
* the parent is no longer needed. Remove it from the tree.
*/
- if (pvd->vdev_children == 1)
+ if (pvd->vdev_children == 1) {
+ if (pvd->vdev_ops == &vdev_spare_ops)
+ cvd->vdev_unspare = B_FALSE;
vdev_remove_parent(cvd);
+ cvd->vdev_resilvering = B_FALSE;
+ }
+
/*
* We don't set tvd until now because the parent we just removed
@@ -4016,6 +4181,9 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
+ /* hang on to the spa before we release the lock */
+ spa_open_ref(spa, FTAG);
+
error = spa_vdev_exit(spa, vd, txg, 0);
spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, NULL,
@@ -4028,24 +4196,31 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
* list of every other pool.
*/
if (unspare) {
- spa_t *myspa = spa;
- spa = NULL;
+ spa_t *altspa = NULL;
+
mutex_enter(&spa_namespace_lock);
- while ((spa = spa_next(spa)) != NULL) {
- if (spa->spa_state != POOL_STATE_ACTIVE)
- continue;
- if (spa == myspa)
+ while ((altspa = spa_next(altspa)) != NULL) {
+ if (altspa->spa_state != POOL_STATE_ACTIVE ||
+ altspa == spa)
continue;
- spa_open_ref(spa, FTAG);
+
+ spa_open_ref(altspa, FTAG);
mutex_exit(&spa_namespace_lock);
- (void) spa_vdev_remove(spa, unspare_guid,
- B_TRUE);
+ (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
mutex_enter(&spa_namespace_lock);
- spa_close(spa, FTAG);
+ spa_close(altspa, FTAG);
}
mutex_exit(&spa_namespace_lock);
+
+ /* search the rest of the vdevs for spares to remove */
+ spa_vdev_resilver_done(spa);
}
+ /* all done with the spa; OK to release */
+ mutex_enter(&spa_namespace_lock);
+ spa_close(spa, FTAG);
+ mutex_exit(&spa_namespace_lock);
+
return (error);
}
@@ -4066,8 +4241,7 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
vdev_t *rvd, **vml = NULL; /* vdev modify list */
boolean_t activate_slog;
- if (!spa_writeable(spa))
- return (EROFS);
+ ASSERT(spa_writeable(spa));
txg = spa_vdev_enter(spa);
@@ -4484,6 +4658,8 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
int error = 0;
boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
+ ASSERT(spa_writeable(spa));
+
if (!locked)
txg = spa_vdev_enter(spa);
@@ -4593,11 +4769,18 @@ spa_vdev_resilver_done_hunt(vdev_t *vd)
}
/*
- * Check for a completed replacement.
+ * Check for a completed replacement. We always consider the first
+ * vdev in the list to be the oldest vdev, and the last one to be
+ * the newest (see spa_vdev_attach() for how that works). In
+ * the case where the newest vdev is faulted, we will not automatically
+ * remove it after a resilver completes. This is OK as it will require
+ * user intervention to determine which disk the admin wishes to keep.
*/
- if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
+ if (vd->vdev_ops == &vdev_replacing_ops) {
+ ASSERT(vd->vdev_children > 1);
+
+ newvd = vd->vdev_child[vd->vdev_children - 1];
oldvd = vd->vdev_child[0];
- newvd = vd->vdev_child[1];
if (vdev_dtl_empty(newvd, DTL_MISSING) &&
vdev_dtl_empty(newvd, DTL_OUTAGE) &&
@@ -4608,16 +4791,41 @@ spa_vdev_resilver_done_hunt(vdev_t *vd)
/*
* Check for a completed resilver with the 'unspare' flag set.
*/
- if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) {
- newvd = vd->vdev_child[0];
- oldvd = vd->vdev_child[1];
+ if (vd->vdev_ops == &vdev_spare_ops) {
+ vdev_t *first = vd->vdev_child[0];
+ vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
+
+ if (last->vdev_unspare) {
+ oldvd = first;
+ newvd = last;
+ } else if (first->vdev_unspare) {
+ oldvd = last;
+ newvd = first;
+ } else {
+ oldvd = NULL;
+ }
- if (newvd->vdev_unspare &&
+ if (oldvd != NULL &&
vdev_dtl_empty(newvd, DTL_MISSING) &&
vdev_dtl_empty(newvd, DTL_OUTAGE) &&
- !vdev_dtl_required(oldvd)) {
- newvd->vdev_unspare = 0;
+ !vdev_dtl_required(oldvd))
return (oldvd);
+
+ /*
+ * If there are more than two spares attached to a disk,
+ * and those spares are not required, then we want to
+ * attempt to free them up now so that they can be used
+ * by other pools. Once we're back down to a single
+ * disk+spare, we stop removing them.
+ */
+ if (vd->vdev_children > 2) {
+ newvd = vd->vdev_child[1];
+
+ if (newvd->vdev_isspare && last->vdev_isspare &&
+ vdev_dtl_empty(last, DTL_MISSING) &&
+ vdev_dtl_empty(last, DTL_OUTAGE) &&
+ !vdev_dtl_required(newvd))
+ return (newvd);
}
}
@@ -4644,9 +4852,9 @@ spa_vdev_resilver_done(spa_t *spa)
* we need to detach the parent's first child (the original hot
* spare) as well.
*/
- if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0) {
+ if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
+ ppvd->vdev_children == 2) {
ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
- ASSERT(ppvd->vdev_children == 2);
sguid = ppvd->vdev_child[1]->vdev_guid;
}
spa_config_exit(spa, SCL_ALL, FTAG);
@@ -4670,6 +4878,8 @@ spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
vdev_t *vd;
boolean_t sync = B_FALSE;
+ ASSERT(spa_writeable(spa));
+
spa_vdev_state_enter(spa, SCL_ALL);
if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
@@ -5115,9 +5325,11 @@ spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx)
ASSERT(spa->spa_root != NULL);
break;
+ case ZPOOL_PROP_READONLY:
case ZPOOL_PROP_CACHEFILE:
/*
- * 'cachefile' is also a non-persisitent property.
+ * 'readonly' and 'cachefile' are also non-persistent
+ * properties.
*/
break;
default:
@@ -5249,6 +5461,8 @@ spa_sync(spa_t *spa, uint64_t txg)
dmu_tx_t *tx;
int error;
+ VERIFY(spa_writeable(spa));
+
/*
* Lock out configuration changes.
*/
@@ -5467,7 +5681,8 @@ spa_sync_allpools(void)
spa_t *spa = NULL;
mutex_enter(&spa_namespace_lock);
while ((spa = spa_next(spa)) != NULL) {
- if (spa_state(spa) != POOL_STATE_ACTIVE || spa_suspended(spa))
+ if (spa_state(spa) != POOL_STATE_ACTIVE ||
+ !spa_writeable(spa) || spa_suspended(spa))
continue;
spa_open_ref(spa, FTAG);
mutex_exit(&spa_namespace_lock);
@@ -5547,6 +5762,8 @@ spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
void
spa_upgrade(spa_t *spa, uint64_t version)
{
+ ASSERT(spa_writeable(spa));
+
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
/*
diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c
index cdeda3f93..69d57f66d 100644
--- a/module/zfs/spa_config.c
+++ b/module/zfs/spa_config.c
@@ -304,24 +304,6 @@ spa_config_set(spa_t *spa, nvlist_t *config)
mutex_exit(&spa->spa_props_lock);
}
-/* Add discovered rewind info, if any to the provided nvlist */
-void
-spa_rewind_data_to_nvlist(spa_t *spa, nvlist_t *tonvl)
-{
- int64_t loss = 0;
-
- if (tonvl == NULL || spa->spa_load_txg == 0)
- return;
-
- VERIFY(nvlist_add_uint64(tonvl, ZPOOL_CONFIG_LOAD_TIME,
- spa->spa_load_txg_ts) == 0);
- if (spa->spa_last_ubsync_txg)
- loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
- VERIFY(nvlist_add_int64(tonvl, ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
- VERIFY(nvlist_add_uint64(tonvl, ZPOOL_CONFIG_LOAD_DATA_ERRORS,
- spa->spa_load_data_errors) == 0);
-}
-
/*
* Generate the pool's configuration based on the current in-core state.
* We infer whether to generate a complete config or just one top-level config
@@ -403,8 +385,7 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
/*
* Add the top-level config. We even add this on pools which
- * don't support holes in the namespace as older pools will
- * just ignore it.
+ * don't support holes in the namespace.
*/
vdev_top_config_generate(spa, config);
@@ -449,8 +430,6 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
kmem_free(dds, sizeof (ddt_stat_t));
}
- spa_rewind_data_to_nvlist(spa, config);
-
if (locked)
spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 52af7fcb7..1b54afb0b 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -478,6 +478,9 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path);
list_insert_head(&spa->spa_config_list, dp);
+ VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME,
+ KM_SLEEP) == 0);
+
if (config != NULL)
VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
@@ -516,6 +519,7 @@ spa_remove(spa_t *spa)
list_destroy(&spa->spa_config_list);
+ nvlist_free(spa->spa_load_info);
spa_config_set(spa, NULL);
refcount_destroy(&spa->spa_refcount);
@@ -886,10 +890,6 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
*/
vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);
- /*
- * If the config changed, notify the scrub that it must restart.
- * This will initiate a resilver if needed.
- */
if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) {
config_changed = B_TRUE;
spa->spa_config_generation++;
@@ -1078,12 +1078,12 @@ spa_rename(const char *name, const char *newname)
}
/*
- * Determine whether a pool with given pool_guid exists. If device_guid is
- * non-zero, determine whether the pool exists *and* contains a device with the
- * specified device_guid.
+ * Return the spa_t associated with given pool_guid, if it exists. If
+ * device_guid is non-zero, determine whether the pool exists *and* contains
+ * a device with the specified device_guid.
*/
-boolean_t
-spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
+spa_t *
+spa_by_guid(uint64_t pool_guid, uint64_t device_guid)
{
spa_t *spa;
avl_tree_t *t = &spa_namespace_avl;
@@ -1114,7 +1114,16 @@ spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
}
}
- return (spa != NULL);
+ return (spa);
+}
+
+/*
+ * Determine whether the given pool (and device, if device_guid != 0) exists.
+ */
+boolean_t
+spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
+{
+ return (spa_by_guid(pool_guid, device_guid) != NULL);
}
char *
diff --git a/module/zfs/txg.c b/module/zfs/txg.c
index f478ad0c6..9b308ca4e 100644
--- a/module/zfs/txg.c
+++ b/module/zfs/txg.c
@@ -37,7 +37,7 @@
static void txg_sync_thread(dsl_pool_t *dp);
static void txg_quiesce_thread(dsl_pool_t *dp);
-int zfs_txg_timeout = 30; /* max seconds worth of delta per txg */
+int zfs_txg_timeout = 5; /* max seconds worth of delta per txg */
/*
* Prepare the txg subsystem.
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index a61f29b8e..bac3e8605 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -207,9 +207,6 @@ vdev_add_child(vdev_t *pvd, vdev_t *cvd)
*/
for (; pvd != NULL; pvd = pvd->vdev_parent)
pvd->vdev_guid_sum += cvd->vdev_guid_sum;
-
- if (cvd->vdev_ops->vdev_op_leaf)
- cvd->vdev_spa->spa_scrub_maxinflight += zfs_scrub_limit;
}
void
@@ -244,9 +241,6 @@ vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
*/
for (; pvd != NULL; pvd = pvd->vdev_parent)
pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
-
- if (cvd->vdev_ops->vdev_op_leaf)
- cvd->vdev_spa->spa_scrub_maxinflight -= zfs_scrub_limit;
}
/*
@@ -524,6 +518,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
&vd->vdev_offline);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVERING,
+ &vd->vdev_resilvering);
+
/*
* When importing a pool, we want to ignore the persistent fault
* state, as the diagnosis made on another system may not be
@@ -1375,10 +1372,10 @@ vdev_validate(vdev_t *vd)
nvlist_free(label);
/*
- * If spa->spa_load_verbatim is true, no need to check the
+ * If this is a verbatim import, no need to check the
* state of the pool.
*/
- if (!spa->spa_load_verbatim &&
+ if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
spa_load_state(spa) == SPA_LOAD_OPEN &&
state != POOL_STATE_ACTIVE)
return (EBADF);
@@ -1544,6 +1541,7 @@ vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
ASSERT(vd == vd->vdev_top);
ASSERT(!vd->vdev_ishole);
ASSERT(ISP2(flags));
+ ASSERT(spa_writeable(vd->vdev_spa));
if (flags & VDD_METASLAB)
(void) txg_list_add(&vd->vdev_ms_list, arg, txg);
@@ -1599,6 +1597,7 @@ vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
ASSERT(t < DTL_TYPES);
ASSERT(vd != vd->vdev_spa->spa_root_vdev);
+ ASSERT(spa_writeable(vd->vdev_spa));
mutex_enter(sm->sm_lock);
if (!space_map_contains(sm, txg, size))
@@ -1855,6 +1854,9 @@ vdev_dtl_required(vdev_t *vd)
vd->vdev_cant_read = cant_read;
vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
+ if (!required && zio_injection_enabled)
+ required = !!zio_handle_device_injection(vd, NULL, ECHILD);
+
return (required);
}
@@ -2070,7 +2072,7 @@ vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
int
vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
{
- vdev_t *vd;
+ vdev_t *vd, *tvd;
spa_vdev_state_enter(spa, SCL_NONE);
@@ -2080,6 +2082,8 @@ vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
if (!vd->vdev_ops->vdev_op_leaf)
return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
+ tvd = vd->vdev_top;
+
/*
* We don't directly use the aux state here, but if we do a
* vdev_reopen(), we need this value to be present to remember why we
@@ -2099,7 +2103,7 @@ vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
* If this device has the only valid copy of the data, then
* back off and simply mark the vdev as degraded instead.
*/
- if (!vd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) {
+ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) {
vd->vdev_degraded = 1ULL;
vd->vdev_faulted = 0ULL;
@@ -2107,7 +2111,7 @@ vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
* If we reopen the device and it's not dead, only then do we
* mark it degraded.
*/
- vdev_reopen(vd);
+ vdev_reopen(tvd);
if (vdev_readable(vd))
vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux);
@@ -2349,15 +2353,15 @@ vdev_clear(spa_t *spa, vdev_t *vd)
*/
vd->vdev_forcefault = B_TRUE;
- vd->vdev_faulted = vd->vdev_degraded = 0;
+ vd->vdev_faulted = vd->vdev_degraded = 0ULL;
vd->vdev_cant_read = B_FALSE;
vd->vdev_cant_write = B_FALSE;
- vdev_reopen(vd);
+ vdev_reopen(vd == rvd ? rvd : vd->vdev_top);
vd->vdev_forcefault = B_FALSE;
- if (vd != rvd)
+ if (vd != rvd && vdev_writeable(vd->vdev_top))
vdev_state_dirty(vd->vdev_top);
if (vd->vdev_aux == NULL && !vdev_is_dead(vd))
@@ -2541,7 +2545,7 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
mutex_enter(&vd->vdev_stat_lock);
if (flags & ZIO_FLAG_IO_REPAIR) {
- if (flags & ZIO_FLAG_SCRUB_THREAD) {
+ if (flags & ZIO_FLAG_SCAN_THREAD) {
dsl_scan_phys_t *scn_phys =
&spa->spa_dsl_pool->dp_scan->scn_phys;
uint64_t *processed = &scn_phys->scn_processed;
@@ -2597,7 +2601,7 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
if (type == ZIO_TYPE_WRITE && txg != 0 &&
(!(flags & ZIO_FLAG_IO_REPAIR) ||
- (flags & ZIO_FLAG_SCRUB_THREAD) ||
+ (flags & ZIO_FLAG_SCAN_THREAD) ||
spa->spa_claiming)) {
/*
* This is either a normal write (not a repair), or it's
@@ -2616,7 +2620,7 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
*/
if (vd->vdev_ops->vdev_op_leaf) {
uint64_t commit_txg = txg;
- if (flags & ZIO_FLAG_SCRUB_THREAD) {
+ if (flags & ZIO_FLAG_SCAN_THREAD) {
ASSERT(flags & ZIO_FLAG_IO_REPAIR);
ASSERT(spa_sync_pass(spa) == 1);
vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
@@ -2699,6 +2703,8 @@ vdev_config_dirty(vdev_t *vd)
vdev_t *rvd = spa->spa_root_vdev;
int c;
+ ASSERT(spa_writeable(spa));
+
/*
* If this is an aux vdev (as with l2cache and spare devices), then we
* update the vdev config manually and set the sync flag.
@@ -2787,6 +2793,7 @@ vdev_state_dirty(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
+ ASSERT(spa_writeable(spa));
ASSERT(vd == vd->vdev_top);
/*
@@ -2944,12 +2951,13 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
vd->vdev_removed = B_TRUE;
} else if (state == VDEV_STATE_CANT_OPEN) {
/*
- * If we fail to open a vdev during an import, we mark it as
- * "not available", which signifies that it was never there to
- * begin with. Failure to open such a device is not considered
- * an error.
+ * If we fail to open a vdev during an import or recovery, we
+ * mark it as "not available", which signifies that it was
+ * never there to begin with. Failure to open such a device
+ * is not considered an error.
*/
- if (spa_load_state(spa) == SPA_LOAD_IMPORT &&
+ if ((spa_load_state(spa) == SPA_LOAD_IMPORT ||
+ spa_load_state(spa) == SPA_LOAD_RECOVER) &&
vd->vdev_ops->vdev_op_leaf)
vd->vdev_not_present = 1;
@@ -3042,32 +3050,52 @@ vdev_is_bootable(vdev_t *vd)
/*
* Load the state from the original vdev tree (ovd) which
* we've retrieved from the MOS config object. If the original
- * vdev was offline then we transfer that state to the device
- * in the current vdev tree (nvd).
+ * vdev was offline or faulted then we transfer that state to the
+ * device in the current vdev tree (nvd).
*/
void
vdev_load_log_state(vdev_t *nvd, vdev_t *ovd)
{
spa_t *spa = nvd->vdev_spa;
+ ASSERT(nvd->vdev_top->vdev_islog);
ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid);
for (int c = 0; c < nvd->vdev_children; c++)
vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]);
- if (nvd->vdev_ops->vdev_op_leaf && ovd->vdev_offline) {
+ if (nvd->vdev_ops->vdev_op_leaf) {
/*
- * It would be nice to call vdev_offline()
- * directly but the pool isn't fully loaded and
- * the txg threads have not been started yet.
+ * Restore the persistent vdev state
*/
nvd->vdev_offline = ovd->vdev_offline;
- vdev_reopen(nvd->vdev_top);
+ nvd->vdev_faulted = ovd->vdev_faulted;
+ nvd->vdev_degraded = ovd->vdev_degraded;
+ nvd->vdev_removed = ovd->vdev_removed;
}
}
/*
+ * Determine if a log device has valid content. If the vdev was
+ * removed or faulted in the MOS config then we know that
+ * the content on the log device has already been written to the pool.
+ */
+boolean_t
+vdev_log_state_valid(vdev_t *vd)
+{
+ if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted &&
+ !vd->vdev_removed)
+ return (B_TRUE);
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ if (vdev_log_state_valid(vd->vdev_child[c]))
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
+/*
* Expand a vdev if possible.
*/
void
diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c
index 75ec54534..c08ed8ba0 100644
--- a/module/zfs/vdev_label.c
+++ b/module/zfs/vdev_label.c
@@ -353,6 +353,9 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
if (vd->vdev_offline && !vd->vdev_tmpoffline)
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE,
B_TRUE) == 0);
+ if (vd->vdev_resilvering)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVERING,
+ B_TRUE) == 0);
if (vd->vdev_faulted)
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED,
B_TRUE) == 0);
@@ -571,6 +574,15 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason,
return (B_TRUE);
/*
+ * We can't rely on a pool's state if it's been imported
+ * read-only. Instead we look to see if the pool is marked
+ * read-only in the namespace and set the state to active.
+ */
+ if ((spa = spa_by_guid(pool_guid, device_guid)) != NULL &&
+ spa_mode(spa) == FREAD)
+ state = POOL_STATE_ACTIVE;
+
+ /*
* If the device is marked ACTIVE, then this device is in use by another
* pool on the system.
*/
diff --git a/module/zfs/zfs_acl.c b/module/zfs/zfs_acl.c
index 1181bd443..843b5ff06 100644
--- a/module/zfs/zfs_acl.c
+++ b/module/zfs/zfs_acl.c
@@ -327,19 +327,35 @@ static acl_ops_t zfs_acl_fuid_ops = {
* an external ACL and what version of ACL previously existed on the
* file. Would really be nice to not need this, sigh.
*/
-
uint64_t
zfs_external_acl(znode_t *zp)
{
zfs_acl_phys_t acl_phys;
+ int error;
if (zp->z_is_sa)
return (0);
- VERIFY(0 == sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zp->z_zfsvfs),
- &acl_phys, sizeof (acl_phys)));
+ /*
+ * Need to deal with a potential
+ * race where zfs_sa_upgrade could cause
+ * z_is_sa to change.
+ *
+ * If the lookup fails then the state of z_is_sa should have
+ * changed.
+ */
- return (acl_phys.z_acl_extern_obj);
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zp->z_zfsvfs),
+ &acl_phys, sizeof (acl_phys))) == 0)
+ return (acl_phys.z_acl_extern_obj);
+ else {
+ /*
+ * after upgrade the SA_ZPL_ZNODE_ACL should have been
+ * removed
+ */
+ VERIFY(zp->z_is_sa && error == ENOENT);
+ return (0);
+ }
}
/*
@@ -357,6 +373,7 @@ zfs_acl_znode_info(znode_t *zp, int *aclsize, int *aclcount,
int size;
int error;
+ ASSERT(MUTEX_HELD(&zp->z_acl_lock));
if (zp->z_is_sa) {
if ((error = sa_size(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zfsvfs),
&size)) != 0)
@@ -387,13 +404,31 @@ zfs_znode_acl_version(znode_t *zp)
{
zfs_acl_phys_t acl_phys;
- if (zp->z_is_sa) {
+ if (zp->z_is_sa)
return (ZFS_ACL_VERSION_FUID);
- } else {
- VERIFY(0 == sa_lookup(zp->z_sa_hdl,
+ else {
+ int error;
+
+ /*
+ * Need to deal with a potential
+ * race where zfs_sa_upgrade could cause
+ * z_is_sa to change.
+ *
+ * If the lookup fails then the state of z_is_sa should have
+ * changed.
+ */
+ if ((error = sa_lookup(zp->z_sa_hdl,
SA_ZPL_ZNODE_ACL(zp->z_zfsvfs),
- &acl_phys, sizeof (acl_phys)));
- return (acl_phys.z_acl_version);
+ &acl_phys, sizeof (acl_phys))) == 0)
+ return (acl_phys.z_acl_version);
+ else {
+ /*
+ * After upgrade SA_ZPL_ZNODE_ACL should have
+ * been removed.
+ */
+ VERIFY(zp->z_is_sa && error == ENOENT);
+ return (ZFS_ACL_VERSION_FUID);
+ }
}
}
@@ -1024,7 +1059,8 @@ zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp,
* create a new acl and leave any cached acl in place.
*/
static int
-zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify)
+zfs_acl_node_read(znode_t *zp, boolean_t have_lock, zfs_acl_t **aclpp,
+ boolean_t will_modify)
{
zfs_acl_t *aclp;
int aclsize;
@@ -1033,6 +1069,7 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify)
zfs_acl_phys_t znode_acl;
int version;
int error;
+ boolean_t drop_lock = B_FALSE;
ASSERT(MUTEX_HELD(&zp->z_acl_lock));
@@ -1041,11 +1078,23 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify)
return (0);
}
- version = ZNODE_ACL_VERSION(zp);
+ /*
+ * Close the race where the znode could be upgraded while trying to
+ * read the znode attributes.
+ *
+ * But this could only happen if the file isn't already an SA
+ * znode
+ */
+ if (!zp->z_is_sa && !have_lock) {
+ mutex_enter(&zp->z_lock);
+ drop_lock = B_TRUE;
+ }
+ version = zfs_znode_acl_version(zp);
if ((error = zfs_acl_znode_info(zp, &aclsize,
- &acl_count, &znode_acl)) != 0)
- return (error);
+ &acl_count, &znode_acl)) != 0) {
+ goto done;
+ }
aclp = zfs_acl_alloc(version);
@@ -1076,7 +1125,7 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify)
/* convert checksum errors into IO errors */
if (error == ECKSUM)
error = EIO;
- return (error);
+ goto done;
}
list_insert_head(&aclp->z_acl, aclnode);
@@ -1084,7 +1133,10 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify)
*aclpp = aclp;
if (!will_modify)
zp->z_acl_cached = aclp;
- return (0);
+done:
+ if (drop_lock)
+ mutex_exit(&zp->z_lock);
+ return (error);
}
/*ARGSUSED*/
@@ -1104,44 +1156,18 @@ zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen,
*length = cb->cb_acl_node->z_size;
}
-
-static int
-zfs_acl_get_owner_fuids(znode_t *zp, uint64_t *fuid, uint64_t *fgid)
-{
- int count = 0;
- sa_bulk_attr_t bulk[2];
- int error;
-
- if (IS_EPHEMERAL(zp->z_uid) || IS_EPHEMERAL(zp->z_gid)) {
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zp->z_zfsvfs), NULL,
- &fuid, sizeof (fuid));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zp->z_zfsvfs), NULL,
- &fgid, sizeof (fuid));
- if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
- return (error);
- }
- } else {
- *fuid = zp->z_uid;
- *fgid = zp->z_gid;
- }
- return (0);
-}
-
int
zfs_acl_chown_setattr(znode_t *zp)
{
int error;
zfs_acl_t *aclp;
- uint64_t fuid, fgid;
- if ((error = zfs_acl_get_owner_fuids(zp, &fuid, &fgid)) != 0)
- return (error);
+ ASSERT(MUTEX_HELD(&zp->z_lock));
+ ASSERT(MUTEX_HELD(&zp->z_acl_lock));
- mutex_enter(&zp->z_acl_lock);
- if ((error = zfs_acl_node_read(zp, &aclp, B_FALSE)) == 0)
+ if ((error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE)) == 0)
zp->z_mode = zfs_mode_compute(zp->z_mode, aclp,
- &zp->z_pflags, fuid, fgid);
- mutex_exit(&zp->z_acl_lock);
+ &zp->z_pflags, zp->z_uid, zp->z_gid);
return (error);
}
@@ -1163,14 +1189,11 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx)
sa_bulk_attr_t bulk[5];
uint64_t ctime[2];
int count = 0;
- uint64_t fuid, fgid;
mode = zp->z_mode;
- if ((error = zfs_acl_get_owner_fuids(zp, &fuid, &fgid)) != 0)
- return (error);
-
- mode = zfs_mode_compute(mode, aclp, &zp->z_pflags, fuid, fgid);
+ mode = zfs_mode_compute(mode, aclp, &zp->z_pflags,
+ zp->z_uid, zp->z_gid);
zp->z_mode = mode;
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
@@ -1482,18 +1505,17 @@ zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t mode, zfs_acl_t *aclp)
list_insert_tail(&aclp->z_acl, newnode);
}
-int
+void
zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode)
{
- mutex_enter(&zp->z_lock);
mutex_enter(&zp->z_acl_lock);
+ mutex_enter(&zp->z_lock);
*aclp = zfs_acl_alloc(zfs_acl_version_zp(zp));
(*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS;
zfs_acl_chmod(zp->z_zfsvfs, mode, *aclp);
- mutex_exit(&zp->z_acl_lock);
mutex_exit(&zp->z_lock);
+ mutex_exit(&zp->z_acl_lock);
ASSERT(*aclp);
- return (0);
}
/*
@@ -1660,7 +1682,6 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
gid_t gid;
boolean_t need_chmod = B_TRUE;
boolean_t inherited = B_FALSE;
- uint64_t parentgid;
bzero(acl_ids, sizeof (zfs_acl_ids_t));
acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode);
@@ -1682,12 +1703,6 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
ZFS_GROUP, &acl_ids->z_fuidp);
gid = vap->va_gid;
} else {
- if (IS_EPHEMERAL(dzp->z_gid))
- VERIFY(0 == sa_lookup(dzp->z_sa_hdl, SA_ZPL_GID(zfsvfs),
- &parentgid, sizeof (parentgid)));
- else
- parentgid = (uint64_t)dzp->z_gid;
-
acl_ids->z_fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER,
cr, &acl_ids->z_fuidp);
acl_ids->z_fgid = 0;
@@ -1696,7 +1711,7 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
(uint64_t)vap->va_gid,
cr, ZFS_GROUP, &acl_ids->z_fuidp);
gid = vap->va_gid;
- if (acl_ids->z_fgid != parentgid &&
+ if (acl_ids->z_fgid != dzp->z_gid &&
!groupmember(vap->va_gid, cr) &&
secpolicy_vnode_create_gid(cr) != 0)
acl_ids->z_fgid = 0;
@@ -1706,7 +1721,7 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
char *domain;
uint32_t rid;
- acl_ids->z_fgid = parentgid;
+ acl_ids->z_fgid = dzp->z_gid;
gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid,
cr, ZFS_GROUP);
@@ -1746,15 +1761,15 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
}
if (acl_ids->z_aclp == NULL) {
+ mutex_enter(&dzp->z_acl_lock);
mutex_enter(&dzp->z_lock);
if (!(flag & IS_ROOT_NODE) && (ZTOV(dzp)->v_type == VDIR &&
(dzp->z_pflags & ZFS_INHERIT_ACE)) &&
!(dzp->z_pflags & ZFS_XATTR)) {
- mutex_enter(&dzp->z_acl_lock);
- VERIFY(0 == zfs_acl_node_read(dzp, &paclp, B_FALSE));
+ VERIFY(0 == zfs_acl_node_read(dzp, B_TRUE,
+ &paclp, B_FALSE));
acl_ids->z_aclp = zfs_acl_inherit(zfsvfs,
vap->va_type, paclp, acl_ids->z_mode, &need_chmod);
- mutex_exit(&dzp->z_acl_lock);
inherited = B_TRUE;
} else {
acl_ids->z_aclp =
@@ -1762,6 +1777,7 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL;
}
mutex_exit(&dzp->z_lock);
+ mutex_exit(&dzp->z_acl_lock);
if (need_chmod) {
acl_ids->z_aclp->z_hints |= (vap->va_type == VDIR) ?
ZFS_ACL_AUTO_INHERIT : 0;
@@ -1824,7 +1840,7 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
mutex_enter(&zp->z_acl_lock);
- error = zfs_acl_node_read(zp, &aclp, B_FALSE);
+ error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE);
if (error != 0) {
mutex_exit(&zp->z_acl_lock);
return (error);
@@ -1970,6 +1986,7 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
zfs_acl_t *aclp;
zfs_fuid_info_t *fuidp = NULL;
boolean_t fuid_dirtied;
+ uint64_t acl_obj;
if (mask == 0)
return (ENOSYS);
@@ -1994,8 +2011,8 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
(zp->z_pflags & V4_ACL_WIDE_FLAGS);
}
top:
- mutex_enter(&zp->z_lock);
mutex_enter(&zp->z_acl_lock);
+ mutex_enter(&zp->z_lock);
tx = dmu_tx_create(zfsvfs->z_os);
@@ -2010,14 +2027,15 @@ top:
* upgrading then take out necessary DMU holds
*/
- if (ZFS_EXTERNAL_ACL(zp)) {
- if (zfsvfs->z_version <= ZPL_VERSION_SA &&
- ZNODE_ACL_VERSION(zp) <= ZFS_ACL_VERSION_INITIAL) {
- dmu_tx_hold_free(tx, ZFS_EXTERNAL_ACL(zp), 0,
+ if ((acl_obj = zfs_external_acl(zp)) != 0) {
+ if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
+ zfs_znode_acl_version(zp) <= ZFS_ACL_VERSION_INITIAL) {
+ dmu_tx_hold_free(tx, acl_obj, 0,
DMU_OBJECT_END);
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+ aclp->z_acl_bytes);
} else {
- dmu_tx_hold_write(tx, ZFS_EXTERNAL_ACL(zp),
- 0, aclp->z_acl_bytes);
+ dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes);
}
} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes);
@@ -2041,6 +2059,7 @@ top:
error = zfs_aclset_common(zp, aclp, cr, tx);
ASSERT(error == 0);
+ ASSERT(zp->z_acl_cached == NULL);
zp->z_acl_cached = aclp;
if (fuid_dirtied)
@@ -2052,8 +2071,8 @@ top:
zfs_fuid_info_free(fuidp);
dmu_tx_commit(tx);
done:
- mutex_exit(&zp->z_acl_lock);
mutex_exit(&zp->z_lock);
+ mutex_exit(&zp->z_acl_lock);
return (error);
}
@@ -2137,11 +2156,14 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode,
uint32_t deny_mask = 0;
zfs_ace_hdr_t *acep = NULL;
boolean_t checkit;
- uint64_t gowner;
+ uid_t gowner;
+ uid_t fowner;
+
+ zfs_fuid_map_ids(zp, cr, &fowner, &gowner);
mutex_enter(&zp->z_acl_lock);
- error = zfs_acl_node_read(zp, &aclp, B_FALSE);
+ error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE);
if (error != 0) {
mutex_exit(&zp->z_acl_lock);
return (error);
@@ -2149,12 +2171,6 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode,
ASSERT(zp->z_acl_cached);
- if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GID(zfsvfs),
- &gowner, sizeof (gowner))) != 0) {
- mutex_exit(&zp->z_acl_lock);
- return (error);
- }
-
while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask,
&iflags, &type)) {
uint32_t mask_matched;
@@ -2176,7 +2192,7 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode,
switch (entry_type) {
case ACE_OWNER:
- if (uid == zp->z_uid)
+ if (uid == fowner)
checkit = B_TRUE;
break;
case OWNING_GROUP:
@@ -2254,8 +2270,10 @@ zfs_has_access(znode_t *zp, cred_t *cr)
uint32_t have = ACE_ALL_PERMS;
if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) {
- return (secpolicy_vnode_any_access(cr, ZTOV(zp),
- zp->z_uid) == 0);
+ uid_t owner;
+
+ owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER);
+ return (secpolicy_vnode_any_access(cr, ZTOV(zp), owner) == 0);
}
return (B_TRUE);
}
@@ -2332,7 +2350,7 @@ zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr)
return (0);
}
- if (IS_EPHEMERAL(zdp->z_uid) != 0 || IS_EPHEMERAL(zdp->z_gid) != 0) {
+ if (FUID_INDEX(zdp->z_uid) != 0 || FUID_INDEX(zdp->z_gid) != 0) {
mutex_exit(&zdp->z_acl_lock);
goto slow;
}
@@ -2389,6 +2407,7 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
znode_t *xzp;
znode_t *check_zp = zp;
mode_t needed_bits;
+ uid_t owner;
is_attr = ((zp->z_pflags & ZFS_XATTR) && (ZTOV(zp)->v_type == VDIR));
@@ -2425,6 +2444,7 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
}
}
+ owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER);
/*
* Map the bits required to the standard vnode flags VREAD|VWRITE|VEXEC
* in needed_bits. Map the bits mapped by working_mode (currently
@@ -2436,7 +2456,7 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
working_mode = mode;
if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) &&
- zp->z_uid == crgetuid(cr))
+ owner == crgetuid(cr))
working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES);
if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS|
@@ -2452,7 +2472,7 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
&check_privs, skipaclchk, cr)) == 0) {
if (is_attr)
VN_RELE(ZTOV(xzp));
- return (secpolicy_vnode_access2(cr, ZTOV(zp), zp->z_uid,
+ return (secpolicy_vnode_access2(cr, ZTOV(zp), owner,
needed_bits, needed_bits));
}
@@ -2478,7 +2498,7 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
ASSERT(working_mode != 0);
if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) &&
- zp->z_uid == crgetuid(cr)))
+ owner == crgetuid(cr)))
working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES);
if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS|
@@ -2490,20 +2510,20 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
if (working_mode & ACE_EXECUTE)
checkmode |= VEXEC;
- error = secpolicy_vnode_access2(cr, ZTOV(check_zp), zp->z_uid,
+ error = secpolicy_vnode_access2(cr, ZTOV(check_zp), owner,
needed_bits & ~checkmode, needed_bits);
if (error == 0 && (working_mode & ACE_WRITE_OWNER))
- error = secpolicy_vnode_chown(cr, zp->z_uid);
+ error = secpolicy_vnode_chown(cr, owner);
if (error == 0 && (working_mode & ACE_WRITE_ACL))
- error = secpolicy_vnode_setdac(cr, zp->z_uid);
+ error = secpolicy_vnode_setdac(cr, owner);
if (error == 0 && (working_mode &
(ACE_DELETE|ACE_DELETE_CHILD)))
error = secpolicy_vnode_remove(cr);
if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) {
- error = secpolicy_vnode_chown(cr, zp->z_uid);
+ error = secpolicy_vnode_chown(cr, owner);
}
if (error == 0) {
/*
@@ -2515,7 +2535,7 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
}
}
} else if (error == 0) {
- error = secpolicy_vnode_access2(cr, ZTOV(zp), zp->z_uid,
+ error = secpolicy_vnode_access2(cr, ZTOV(zp), owner,
needed_bits, needed_bits);
}
@@ -2552,9 +2572,12 @@ zfs_delete_final_check(znode_t *zp, znode_t *dzp,
mode_t available_perms, cred_t *cr)
{
int error;
+ uid_t downer;
+
+ downer = zfs_fuid_map_id(dzp->z_zfsvfs, dzp->z_uid, cr, ZFS_OWNER);
error = secpolicy_vnode_access2(cr, ZTOV(dzp),
- dzp->z_uid, available_perms, VWRITE|VEXEC);
+ downer, available_perms, VWRITE|VEXEC);
if (error == 0)
error = zfs_sticky_remove_access(dzp, zp, cr);
diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c
index 362de4dd1..815f8895e 100644
--- a/module/zfs/zfs_ctldir.c
+++ b/module/zfs/zfs_ctldir.c
@@ -590,7 +590,7 @@ zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm)
ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
(void) strcat(newpath, nm);
refstr_rele(pathref);
- vfs_setmntpoint(vfsp, newpath);
+ vfs_setmntpoint(vfsp, newpath, 0);
pathref = vfs_getresource(vfsp);
(void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
@@ -599,7 +599,7 @@ zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm)
ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
(void) strcat(newpath, nm);
refstr_rele(pathref);
- vfs_setresource(vfsp, newpath);
+ vfs_setresource(vfsp, newpath, 0);
vfs_unlock(vfsp);
}
@@ -749,7 +749,8 @@ zfsctl_snapdir_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp,
return (err);
if (err == 0) {
- err = dmu_objset_snapshot(name, dirname, NULL, B_FALSE);
+ err = dmu_objset_snapshot(name, dirname, NULL, NULL,
+ B_FALSE, B_FALSE, -1);
if (err)
return (err);
err = lookupnameat(dirname, seg, follow, NULL, vpp, dvp);
diff --git a/module/zfs/zfs_dir.c b/module/zfs/zfs_dir.c
index 6d6666822..b06d29ab3 100644
--- a/module/zfs/zfs_dir.c
+++ b/module/zfs/zfs_dir.c
@@ -630,7 +630,7 @@ zfs_rmnode(znode_t *zp)
ASSERT(error == 0);
}
- acl_obj = ZFS_EXTERNAL_ACL(zp);
+ acl_obj = zfs_external_acl(zp);
/*
* Set up the final transaction.
@@ -1067,6 +1067,9 @@ int
zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
{
uid_t uid;
+ uid_t downer;
+ uid_t fowner;
+ zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
if (zdp->z_zfsvfs->z_replay)
return (0);
@@ -1074,7 +1077,10 @@ zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
if ((zdp->z_mode & S_ISVTX) == 0)
return (0);
- if ((uid = crgetuid(cr)) == zdp->z_uid || uid == zp->z_uid ||
+ downer = zfs_fuid_map_id(zfsvfs, zdp->z_uid, cr, ZFS_OWNER);
+ fowner = zfs_fuid_map_id(zfsvfs, zp->z_uid, cr, ZFS_OWNER);
+
+ if ((uid = crgetuid(cr)) == downer || uid == fowner ||
(ZTOV(zp)->v_type == VREG &&
zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0))
return (0);
diff --git a/module/zfs/zfs_fuid.c b/module/zfs/zfs_fuid.c
index 8c0424e84..a853f4d73 100644
--- a/module/zfs/zfs_fuid.c
+++ b/module/zfs/zfs_fuid.c
@@ -388,26 +388,8 @@ zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx)
void
zfs_fuid_map_ids(znode_t *zp, cred_t *cr, uid_t *uidp, uid_t *gidp)
{
- uint64_t fuid, fgid;
- sa_bulk_attr_t bulk[2];
- int count = 0;
-
- if (IS_EPHEMERAL(zp->z_uid) || IS_EPHEMERAL(zp->z_gid)) {
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zp->z_zfsvfs),
- NULL, &fuid, 8);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zp->z_zfsvfs),
- NULL, &fgid, 8);
- VERIFY(0 == sa_bulk_lookup(zp->z_sa_hdl, bulk, count));
- }
- if (IS_EPHEMERAL(zp->z_uid))
- *uidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER);
- else
- *uidp = zp->z_uid;
- if (IS_EPHEMERAL(zp->z_gid))
- *gidp = zfs_fuid_map_id(zp->z_zfsvfs,
- zp->z_gid, cr, ZFS_GROUP);
- else
- *gidp = zp->z_gid;
+ *uidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER);
+ *gidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_gid, cr, ZFS_GROUP);
}
uid_t
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index de5fb1e4c..1b63c9bf4 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -60,6 +60,7 @@
#include <sys/fs/zfs.h>
#include <sys/zfs_ctldir.h>
#include <sys/zfs_dir.h>
+#include <sys/zfs_onexit.h>
#include <sys/zvol.h>
#include <sys/dsl_scan.h>
#include <sharefs/share.h>
@@ -87,12 +88,18 @@ typedef enum {
DATASET_NAME
} zfs_ioc_namecheck_t;
+typedef enum {
+ POOL_CHECK_NONE = 1 << 0,
+ POOL_CHECK_SUSPENDED = 1 << 1,
+ POOL_CHECK_READONLY = 1 << 2
+} zfs_ioc_poolcheck_t;
+
typedef struct zfs_ioc_vec {
zfs_ioc_func_t *zvec_func;
zfs_secpolicy_func_t *zvec_secpolicy;
zfs_ioc_namecheck_t zvec_namecheck;
boolean_t zvec_his_log;
- boolean_t zvec_pool_check;
+ zfs_ioc_poolcheck_t zvec_pool_check;
} zfs_ioc_vec_t;
/* This array is indexed by zfs_userquota_prop_t */
@@ -281,9 +288,8 @@ zfs_secpolicy_read(zfs_cmd_t *zc, cred_t *cr)
}
static int
-zfs_dozonecheck(const char *dataset, cred_t *cr)
+zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr)
{
- uint64_t zoned;
int writable = 1;
/*
@@ -294,9 +300,6 @@ zfs_dozonecheck(const char *dataset, cred_t *cr)
!zone_dataset_visible(dataset, &writable))
return (ENOENT);
- if (dsl_prop_get_integer(dataset, "zoned", &zoned, NULL))
- return (ENOENT);
-
if (INGLOBALZONE(curproc)) {
/*
* If the fs is zoned, only root can access it from the
@@ -318,6 +321,32 @@ zfs_dozonecheck(const char *dataset, cred_t *cr)
return (0);
}
+static int
+zfs_dozonecheck(const char *dataset, cred_t *cr)
+{
+ uint64_t zoned;
+
+ if (dsl_prop_get_integer(dataset, "zoned", &zoned, NULL))
+ return (ENOENT);
+
+ return (zfs_dozonecheck_impl(dataset, zoned, cr));
+}
+
+static int
+zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr)
+{
+ uint64_t zoned;
+
+ rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
+ if (dsl_prop_get_ds(ds, "zoned", 8, 1, &zoned, NULL)) {
+ rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
+ return (ENOENT);
+ }
+ rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
+
+ return (zfs_dozonecheck_impl(dataset, zoned, cr));
+}
+
int
zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr)
{
@@ -332,6 +361,21 @@ zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr)
return (error);
}
+int
+zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds,
+ const char *perm, cred_t *cr)
+{
+ int error;
+
+ error = zfs_dozonecheck_ds(name, ds, cr);
+ if (error == 0) {
+ error = secpolicy_zfs(cr);
+ if (error)
+ error = dsl_deleg_access_impl(ds, perm, cr);
+ }
+ return (error);
+}
+
/*
* Policy for setting the security label property.
*
@@ -507,8 +551,38 @@ zfs_secpolicy_rollback(zfs_cmd_t *zc, cred_t *cr)
int
zfs_secpolicy_send(zfs_cmd_t *zc, cred_t *cr)
{
- return (zfs_secpolicy_write_perms(zc->zc_name,
- ZFS_DELEG_PERM_SEND, cr));
+ spa_t *spa;
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds;
+ char *cp;
+ int error;
+
+ /*
+ * Generate the current snapshot name from the given objsetid, then
+ * use that name for the secpolicy/zone checks.
+ */
+ cp = strchr(zc->zc_name, '@');
+ if (cp == NULL)
+ return (EINVAL);
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error)
+ return (error);
+
+ dp = spa_get_dsl(spa);
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds);
+ rw_exit(&dp->dp_config_rwlock);
+ spa_close(spa, FTAG);
+ if (error)
+ return (error);
+
+ dsl_dataset_name(ds, zc->zc_name);
+
+ error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds,
+ ZFS_DELEG_PERM_SEND, cr);
+ dsl_dataset_rele(ds, FTAG);
+
+ return (error);
}
static int
@@ -786,6 +860,22 @@ zfs_secpolicy_config(zfs_cmd_t *zc, cred_t *cr)
}
/*
+ * Policy for object to name lookups.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_diff(zfs_cmd_t *zc, cred_t *cr)
+{
+ int error;
+
+ if ((error = secpolicy_sys_config(cr, B_FALSE)) == 0)
+ return (0);
+
+ error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr);
+ return (error);
+}
+
+/*
* Policy for fault injection. Requires all privileges.
*/
/* ARGSUSED */
@@ -876,6 +966,33 @@ zfs_secpolicy_release(zfs_cmd_t *zc, cred_t *cr)
}
/*
+ * Policy for allowing temporary snapshots to be taken or released
+ */
+static int
+zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, cred_t *cr)
+{
+ /*
+ * A temporary snapshot is the same as a snapshot,
+ * hold, destroy and release all rolled into one.
+ * Delegated diff alone is sufficient that we allow this.
+ */
+ int error;
+
+ if ((error = zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_DIFF, cr)) == 0)
+ return (0);
+
+ error = zfs_secpolicy_snapshot(zc, cr);
+ if (!error)
+ error = zfs_secpolicy_hold(zc, cr);
+ if (!error)
+ error = zfs_secpolicy_release(zc, cr);
+ if (!error)
+ error = zfs_secpolicy_destroy(zc, cr);
+ return (error);
+}
+
+/*
* Returns the nvlist as specified by the user in the zfs_cmd_t.
*/
static int
@@ -1001,14 +1118,15 @@ getzfsvfs(const char *dsname, zfsvfs_t **zfvp)
* case its z_vfs will be NULL, and it will be opened as the owner.
*/
static int
-zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp)
+zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer)
{
int error = 0;
if (getzfsvfs(name, zfvp) != 0)
error = zfsvfs_create(name, zfvp);
if (error == 0) {
- rrw_enter(&(*zfvp)->z_teardown_lock, RW_READER, tag);
+ rrw_enter(&(*zfvp)->z_teardown_lock, (writer) ? RW_WRITER :
+ RW_READER, tag);
if ((*zfvp)->z_unmounted) {
/*
* XXX we could probably try again, since the unmounting
@@ -1137,13 +1255,15 @@ zfs_ioc_pool_import(zfs_cmd_t *zc)
if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 ||
guid != zc->zc_guid)
error = EINVAL;
- else if (zc->zc_cookie)
- error = spa_import_verbatim(zc->zc_name, config, props);
else
- error = spa_import(zc->zc_name, config, props);
+ error = spa_import(zc->zc_name, config, props, zc->zc_cookie);
- if (zc->zc_nvlist_dst != 0)
- (void) put_nvlist(zc, config);
+ if (zc->zc_nvlist_dst != 0) {
+ int err;
+
+ if ((err = put_nvlist(zc, config)) != 0)
+ error = err;
+ }
nvlist_free(config);
@@ -1366,6 +1486,35 @@ zfs_ioc_obj_to_path(zfs_cmd_t *zc)
return (error);
}
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_obj object to find
+ *
+ * outputs:
+ * zc_stat stats on object
+ * zc_value path to object
+ */
+static int
+zfs_ioc_obj_to_stats(zfs_cmd_t *zc)
+{
+ objset_t *os;
+ int error;
+
+ /* XXX reading from objset not owned */
+ if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0)
+ return (error);
+ if (dmu_objset_type(os) != DMU_OST_ZFS) {
+ dmu_objset_rele(os, FTAG);
+ return (EINVAL);
+ }
+ error = zfs_obj_to_stats(os, zc->zc_obj, &zc->zc_stat, zc->zc_value,
+ sizeof (zc->zc_value));
+ dmu_objset_rele(os, FTAG);
+
+ return (error);
+}
+
static int
zfs_ioc_vdev_add(zfs_cmd_t *zc)
{
@@ -1577,26 +1726,12 @@ zfs_ioc_vdev_setfru(zfs_cmd_t *zc)
return (error);
}
-/*
- * inputs:
- * zc_name name of filesystem
- * zc_nvlist_dst_size size of buffer for property nvlist
- *
- * outputs:
- * zc_objset_stats stats
- * zc_nvlist_dst property nvlist
- * zc_nvlist_dst_size size of property nvlist
- */
static int
-zfs_ioc_objset_stats(zfs_cmd_t *zc)
+zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os)
{
- objset_t *os = NULL;
- int error;
+ int error = 0;
nvlist_t *nv;
- if (error = dmu_objset_hold(zc->zc_name, FTAG, &os))
- return (error);
-
dmu_objset_fast_stat(os, &zc->zc_objset_stats);
if (zc->zc_nvlist_dst != 0 &&
@@ -1617,7 +1752,32 @@ zfs_ioc_objset_stats(zfs_cmd_t *zc)
nvlist_free(nv);
}
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_nvlist_dst_size size of buffer for property nvlist
+ *
+ * outputs:
+ * zc_objset_stats stats
+ * zc_nvlist_dst property nvlist
+ * zc_nvlist_dst_size size of property nvlist
+ */
+static int
+zfs_ioc_objset_stats(zfs_cmd_t *zc)
+{
+ objset_t *os = NULL;
+ int error;
+
+ if (error = dmu_objset_hold(zc->zc_name, FTAG, &os))
+ return (error);
+
+ error = zfs_ioc_objset_stats_impl(zc, os);
+
dmu_objset_rele(os, FTAG);
+
return (error);
}
@@ -1850,19 +2010,43 @@ top:
error = dmu_snapshot_list_next(os,
sizeof (zc->zc_name) - strlen(zc->zc_name),
- zc->zc_name + strlen(zc->zc_name), NULL, &zc->zc_cookie, NULL);
- dmu_objset_rele(os, FTAG);
+ zc->zc_name + strlen(zc->zc_name), &zc->zc_obj, &zc->zc_cookie,
+ NULL);
+
if (error == 0) {
- error = zfs_ioc_objset_stats(zc); /* fill in the stats */
- if (error == ENOENT) {
- /* We lost a race with destroy, get the next one. */
- *strchr(zc->zc_name, '@') = '\0';
- goto top;
+ dsl_dataset_t *ds;
+ dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool;
+
+ /*
+ * Since we probably don't have a hold on this snapshot,
+ * it's possible that the objsetid could have been destroyed
+ * and reused for a new objset. It's OK if this happens during
+ * a zfs send operation, since the new createtxg will be
+ * beyond the range we're interested in.
+ */
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ error = dsl_dataset_hold_obj(dp, zc->zc_obj, FTAG, &ds);
+ rw_exit(&dp->dp_config_rwlock);
+ if (error) {
+ if (error == ENOENT) {
+ /* Racing with destroy, get the next one. */
+ *strchr(zc->zc_name, '@') = '\0';
+ dmu_objset_rele(os, FTAG);
+ goto top;
+ }
+ } else {
+ objset_t *ossnap;
+
+ error = dmu_objset_from_ds(ds, &ossnap);
+ if (error == 0)
+ error = zfs_ioc_objset_stats_impl(zc, ossnap);
+ dsl_dataset_rele(ds, FTAG);
}
} else if (error == ENOENT) {
error = ESRCH;
}
+ dmu_objset_rele(os, FTAG);
/* if we failed, undo the @ that we tacked on to zc_name */
if (error)
*strchr(zc->zc_name, '@') = '\0';
@@ -1905,7 +2089,7 @@ zfs_prop_set_userquota(const char *dsname, nvpair_t *pair)
rid = valary[1];
quota = valary[2];
- err = zfsvfs_hold(dsname, FTAG, &zfsvfs);
+ err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_FALSE);
if (err == 0) {
err = zfs_set_userquota(zfsvfs, type, domain, rid, quota);
zfsvfs_rele(zfsvfs, FTAG);
@@ -1970,7 +2154,7 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source,
{
zfsvfs_t *zfsvfs;
- if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs)) != 0)
+ if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_TRUE)) != 0)
break;
err = zfs_set_version(zfsvfs, intval);
@@ -2872,8 +3056,8 @@ zfs_ioc_snapshot(zfs_cmd_t *zc)
goto out;
}
- error = dmu_objset_snapshot(zc->zc_name, zc->zc_value,
- nvprops, recursive);
+ error = dmu_objset_snapshot(zc->zc_name, zc->zc_value, NULL,
+ nvprops, recursive, B_FALSE, -1);
out:
nvlist_free(nvprops);
@@ -3342,11 +3526,14 @@ static boolean_t zfs_ioc_recv_inject_err;
* zc_cookie file descriptor to recv from
* zc_begin_record the BEGIN record of the stream (not byteswapped)
* zc_guid force flag
+ * zc_cleanup_fd cleanup-on-exit file descriptor
+ * zc_action_handle handle for this guid/ds mapping (or zero on first call)
*
* outputs:
* zc_cookie number of bytes read
* zc_nvlist_dst{_size} error for each unapplied received property
* zc_obj zprop_errflags_t
+ * zc_action_handle handle for this guid/ds mapping
*/
static int
zfs_ioc_recv(zfs_cmd_t *zc)
@@ -3475,7 +3662,8 @@ zfs_ioc_recv(zfs_cmd_t *zc)
}
off = fp->f_offset;
- error = dmu_recv_stream(&drc, fp->f_vnode, &off);
+ error = dmu_recv_stream(&drc, fp->f_vnode, &off, zc->zc_cleanup_fd,
+ &zc->zc_action_handle);
if (error == 0) {
zfsvfs_t *zfsvfs = NULL;
@@ -3567,9 +3755,10 @@ out:
/*
* inputs:
* zc_name name of snapshot to send
- * zc_value short name of incremental fromsnap (may be empty)
* zc_cookie file descriptor to send stream to
- * zc_obj fromorigin flag (mutually exclusive with zc_value)
+ * zc_obj fromorigin flag (mutually exclusive with zc_fromobj)
+ * zc_sendobj objsetid of snapshot to send
+ * zc_fromobj objsetid of incremental fromsnap (may be zero)
*
* outputs: none
*/
@@ -3581,34 +3770,55 @@ zfs_ioc_send(zfs_cmd_t *zc)
file_t *fp;
int error;
offset_t off;
+ dsl_dataset_t *ds;
+ dsl_dataset_t *dsfrom = NULL;
+ spa_t *spa;
+ dsl_pool_t *dp;
- error = dmu_objset_hold(zc->zc_name, FTAG, &tosnap);
+ error = spa_open(zc->zc_name, &spa, FTAG);
if (error)
return (error);
- if (zc->zc_value[0] != '\0') {
- char *buf;
- char *cp;
-
- buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
- (void) strncpy(buf, zc->zc_name, MAXPATHLEN);
- cp = strchr(buf, '@');
- if (cp)
- *(cp+1) = 0;
- (void) strncat(buf, zc->zc_value, MAXPATHLEN);
- error = dmu_objset_hold(buf, FTAG, &fromsnap);
- kmem_free(buf, MAXPATHLEN);
+ dp = spa_get_dsl(spa);
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds);
+ rw_exit(&dp->dp_config_rwlock);
+ if (error) {
+ spa_close(spa, FTAG);
+ return (error);
+ }
+
+ error = dmu_objset_from_ds(ds, &tosnap);
+ if (error) {
+ dsl_dataset_rele(ds, FTAG);
+ spa_close(spa, FTAG);
+ return (error);
+ }
+
+ if (zc->zc_fromobj != 0) {
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ error = dsl_dataset_hold_obj(dp, zc->zc_fromobj, FTAG, &dsfrom);
+ rw_exit(&dp->dp_config_rwlock);
+ spa_close(spa, FTAG);
+ if (error) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+ error = dmu_objset_from_ds(dsfrom, &fromsnap);
if (error) {
- dmu_objset_rele(tosnap, FTAG);
+ dsl_dataset_rele(dsfrom, FTAG);
+ dsl_dataset_rele(ds, FTAG);
return (error);
}
+ } else {
+ spa_close(spa, FTAG);
}
fp = getf(zc->zc_cookie);
if (fp == NULL) {
- dmu_objset_rele(tosnap, FTAG);
- if (fromsnap)
- dmu_objset_rele(fromsnap, FTAG);
+ dsl_dataset_rele(ds, FTAG);
+ if (dsfrom)
+ dsl_dataset_rele(dsfrom, FTAG);
return (EBADF);
}
@@ -3618,9 +3828,9 @@ zfs_ioc_send(zfs_cmd_t *zc)
if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
fp->f_offset = off;
releasef(zc->zc_cookie);
- if (fromsnap)
- dmu_objset_rele(fromsnap, FTAG);
- dmu_objset_rele(tosnap, FTAG);
+ if (dsfrom)
+ dsl_dataset_rele(dsfrom, FTAG);
+ dsl_dataset_rele(ds, FTAG);
return (error);
}
@@ -3717,7 +3927,10 @@ zfs_ioc_clear(zfs_cmd_t *zc)
error = spa_open_rewind(zc->zc_name, &spa, FTAG,
policy, &config);
if (config != NULL) {
- (void) put_nvlist(zc, config);
+ int err;
+
+ if ((err = put_nvlist(zc, config)) != 0)
+ error = err;
nvlist_free(config);
}
nvlist_free(policy);
@@ -3801,7 +4014,7 @@ zfs_ioc_userspace_one(zfs_cmd_t *zc)
if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS)
return (EINVAL);
- error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs);
+ error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE);
if (error)
return (error);
@@ -3832,7 +4045,7 @@ zfs_ioc_userspace_many(zfs_cmd_t *zc)
if (bufsize <= 0)
return (ENOMEM);
- int error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs);
+ int error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE);
if (error)
return (error);
@@ -4032,6 +4245,113 @@ ace_t full_access[] = {
};
/*
+ * inputs:
+ * zc_name name of containing filesystem
+ * zc_obj object # beyond which we want next in-use object #
+ *
+ * outputs:
+ * zc_obj next in-use object #
+ */
+static int
+zfs_ioc_next_obj(zfs_cmd_t *zc)
+{
+ objset_t *os = NULL;
+ int error;
+
+ error = dmu_objset_hold(zc->zc_name, FTAG, &os);
+ if (error)
+ return (error);
+
+ error = dmu_object_next(os, &zc->zc_obj, B_FALSE,
+ os->os_dsl_dataset->ds_phys->ds_prev_snap_txg);
+
+ dmu_objset_rele(os, FTAG);
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_value prefix name for snapshot
+ * zc_cleanup_fd cleanup-on-exit file descriptor for calling process
+ *
+ * outputs:
+ */
+static int
+zfs_ioc_tmp_snapshot(zfs_cmd_t *zc)
+{
+ char *snap_name;
+ int error;
+
+ snap_name = kmem_asprintf("%s-%016llx", zc->zc_value,
+ (u_longlong_t)ddi_get_lbolt64());
+
+ if (strlen(snap_name) >= MAXNAMELEN) {
+ strfree(snap_name);
+ return (E2BIG);
+ }
+
+ error = dmu_objset_snapshot(zc->zc_name, snap_name, snap_name,
+ NULL, B_FALSE, B_TRUE, zc->zc_cleanup_fd);
+ if (error != 0) {
+ strfree(snap_name);
+ return (error);
+ }
+
+ (void) strcpy(zc->zc_value, snap_name);
+ strfree(snap_name);
+ return (0);
+}
+
+/*
+ * inputs:
+ * zc_name name of "to" snapshot
+ * zc_value name of "from" snapshot
+ * zc_cookie file descriptor to write diff data on
+ *
+ * outputs:
+ * dmu_diff_record_t's to the file descriptor
+ */
+static int
+zfs_ioc_diff(zfs_cmd_t *zc)
+{
+ objset_t *fromsnap;
+ objset_t *tosnap;
+ file_t *fp;
+ offset_t off;
+ int error;
+
+ error = dmu_objset_hold(zc->zc_name, FTAG, &tosnap);
+ if (error)
+ return (error);
+
+ error = dmu_objset_hold(zc->zc_value, FTAG, &fromsnap);
+ if (error) {
+ dmu_objset_rele(tosnap, FTAG);
+ return (error);
+ }
+
+ fp = getf(zc->zc_cookie);
+ if (fp == NULL) {
+ dmu_objset_rele(fromsnap, FTAG);
+ dmu_objset_rele(tosnap, FTAG);
+ return (EBADF);
+ }
+
+ off = fp->f_offset;
+
+ error = dmu_diff(tosnap, fromsnap, fp->f_vnode, &off);
+
+ if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
+ fp->f_offset = off;
+ releasef(zc->zc_cookie);
+
+ dmu_objset_rele(fromsnap, FTAG);
+ dmu_objset_rele(tosnap, FTAG);
+ return (error);
+}
+
+/*
* Remove all ACL files in shares dir
*/
static int
@@ -4182,11 +4502,14 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc)
/*
* inputs:
- * zc_name name of filesystem
- * zc_value short name of snap
- * zc_string user-supplied tag for this reference
- * zc_cookie recursive flag
- * zc_temphold set if hold is temporary
+ * zc_name name of filesystem
+ * zc_value short name of snap
+ * zc_string user-supplied tag for this hold
+ * zc_cookie recursive flag
+ * zc_temphold set if hold is temporary
+ * zc_cleanup_fd cleanup-on-exit file descriptor for calling process
+ * zc_sendobj if non-zero, the objid for zc_name@zc_value
+ * zc_createtxg if zc_sendobj is non-zero, snap must have zc_createtxg
*
* outputs: none
*/
@@ -4194,22 +4517,76 @@ static int
zfs_ioc_hold(zfs_cmd_t *zc)
{
boolean_t recursive = zc->zc_cookie;
+ spa_t *spa;
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds;
+ int error;
+ minor_t minor = 0;
if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0)
return (EINVAL);
- return (dsl_dataset_user_hold(zc->zc_name, zc->zc_value,
- zc->zc_string, recursive, zc->zc_temphold));
+ if (zc->zc_sendobj == 0) {
+ return (dsl_dataset_user_hold(zc->zc_name, zc->zc_value,
+ zc->zc_string, recursive, zc->zc_temphold,
+ zc->zc_cleanup_fd));
+ }
+
+ if (recursive)
+ return (EINVAL);
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error)
+ return (error);
+
+ dp = spa_get_dsl(spa);
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds);
+ rw_exit(&dp->dp_config_rwlock);
+ spa_close(spa, FTAG);
+ if (error)
+ return (error);
+
+ /*
+ * Until we have a hold on this snapshot, it's possible that
+ * zc_sendobj could've been destroyed and reused as part
+ * of a later txg. Make sure we're looking at the right object.
+ */
+ if (zc->zc_createtxg != ds->ds_phys->ds_creation_txg) {
+ dsl_dataset_rele(ds, FTAG);
+ return (ENOENT);
+ }
+
+ if (zc->zc_cleanup_fd != -1 && zc->zc_temphold) {
+ error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor);
+ if (error) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+ }
+
+ error = dsl_dataset_user_hold_for_send(ds, zc->zc_string,
+ zc->zc_temphold);
+ if (minor != 0) {
+ if (error == 0) {
+ dsl_register_onexit_hold_cleanup(ds, zc->zc_string,
+ minor);
+ }
+ zfs_onexit_fd_rele(zc->zc_cleanup_fd);
+ }
+ dsl_dataset_rele(ds, FTAG);
+
+ return (error);
}
/*
* inputs:
- * zc_name name of dataset from which we're releasing a user reference
+ * zc_name name of dataset from which we're releasing a user hold
* zc_value short name of snap
- * zc_string user-supplied tag for this reference
+ * zc_string user-supplied tag for this hold
* zc_cookie recursive flag
*
- * outputs: none
+ * outputs: none
*/
static int
zfs_ioc_release(zfs_cmd_t *zc)
@@ -4251,132 +4628,264 @@ zfs_ioc_get_holds(zfs_cmd_t *zc)
*/
static zfs_ioc_vec_t zfs_ioc_vec[] = {
{ zfs_ioc_pool_create, zfs_secpolicy_config, POOL_NAME, B_FALSE,
- B_FALSE },
+ POOL_CHECK_NONE },
{ zfs_ioc_pool_destroy, zfs_secpolicy_config, POOL_NAME, B_FALSE,
- B_FALSE },
+ POOL_CHECK_NONE },
{ zfs_ioc_pool_import, zfs_secpolicy_config, POOL_NAME, B_TRUE,
- B_FALSE },
+ POOL_CHECK_NONE },
{ zfs_ioc_pool_export, zfs_secpolicy_config, POOL_NAME, B_FALSE,
- B_FALSE },
+ POOL_CHECK_NONE },
{ zfs_ioc_pool_configs, zfs_secpolicy_none, NO_NAME, B_FALSE,
- B_FALSE },
+ POOL_CHECK_NONE },
{ zfs_ioc_pool_stats, zfs_secpolicy_read, POOL_NAME, B_FALSE,
- B_FALSE },
+ POOL_CHECK_NONE },
{ zfs_ioc_pool_tryimport, zfs_secpolicy_config, NO_NAME, B_FALSE,
- B_FALSE },
+ POOL_CHECK_NONE },
{ zfs_ioc_pool_scan, zfs_secpolicy_config, POOL_NAME, B_TRUE,
- B_TRUE },
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
{ zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE,
- B_FALSE },
+ POOL_CHECK_READONLY },
{ zfs_ioc_pool_upgrade, zfs_secpolicy_config, POOL_NAME, B_TRUE,
- B_TRUE },
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
{ zfs_ioc_pool_get_history, zfs_secpolicy_config, POOL_NAME, B_FALSE,
- B_FALSE },
+ POOL_CHECK_NONE },
{ zfs_ioc_vdev_add, zfs_secpolicy_config, POOL_NAME, B_TRUE,
- B_TRUE },
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
{ zfs_ioc_vdev_remove, zfs_secpolicy_config, POOL_NAME, B_TRUE,
- B_TRUE },
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
{ zfs_ioc_vdev_set_state, zfs_secpolicy_config, POOL_NAME, B_TRUE,
- B_FALSE },
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
{ zfs_ioc_vdev_attach, zfs_secpolicy_config, POOL_NAME, B_TRUE,
- B_TRUE },
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
{ zfs_ioc_vdev_detach, zfs_secpolicy_config, POOL_NAME, B_TRUE,
- B_TRUE },
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
{ zfs_ioc_vdev_setpath, zfs_secpolicy_config, POOL_NAME, B_FALSE,
- B_TRUE },
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
{ zfs_ioc_vdev_setfru, zfs_secpolicy_config, POOL_NAME, B_FALSE,
- B_TRUE },
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
{ zfs_ioc_objset_stats, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
- B_TRUE },
+ POOL_CHECK_SUSPENDED },
{ zfs_ioc_objset_zplprops, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
- B_FALSE },
+ POOL_CHECK_NONE },
{ zfs_ioc_dataset_list_next, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
- B_TRUE },
+ POOL_CHECK_SUSPENDED },
{ zfs_ioc_snapshot_list_next, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
- B_TRUE },
- { zfs_ioc_set_prop, zfs_secpolicy_none, DATASET_NAME, B_TRUE, B_TRUE },
- { zfs_ioc_create, zfs_secpolicy_create, DATASET_NAME, B_TRUE, B_TRUE },
+ POOL_CHECK_SUSPENDED },
+ { zfs_ioc_set_prop, zfs_secpolicy_none, DATASET_NAME, B_TRUE,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
+ { zfs_ioc_create, zfs_secpolicy_create, DATASET_NAME, B_TRUE,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
{ zfs_ioc_destroy, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE,
- B_TRUE},
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
{ zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, B_TRUE,
- B_TRUE },
- { zfs_ioc_rename, zfs_secpolicy_rename, DATASET_NAME, B_TRUE, B_TRUE },
- { zfs_ioc_recv, zfs_secpolicy_receive, DATASET_NAME, B_TRUE, B_TRUE },
- { zfs_ioc_send, zfs_secpolicy_send, DATASET_NAME, B_TRUE, B_FALSE },
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
+ { zfs_ioc_rename, zfs_secpolicy_rename, DATASET_NAME, B_TRUE,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
+ { zfs_ioc_recv, zfs_secpolicy_receive, DATASET_NAME, B_TRUE,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
+ { zfs_ioc_send, zfs_secpolicy_send, DATASET_NAME, B_TRUE,
+ POOL_CHECK_NONE },
{ zfs_ioc_inject_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE,
- B_FALSE },
+ POOL_CHECK_NONE },
{ zfs_ioc_clear_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE,
- B_FALSE },
+ POOL_CHECK_NONE },
{ zfs_ioc_inject_list_next, zfs_secpolicy_inject, NO_NAME, B_FALSE,
- B_FALSE },
+ POOL_CHECK_NONE },
{ zfs_ioc_error_log, zfs_secpolicy_inject, POOL_NAME, B_FALSE,
- B_FALSE },
- { zfs_ioc_clear, zfs_secpolicy_config, POOL_NAME, B_TRUE, B_FALSE },
+ POOL_CHECK_NONE },
+ { zfs_ioc_clear, zfs_secpolicy_config, POOL_NAME, B_TRUE,
+ POOL_CHECK_NONE },
{ zfs_ioc_promote, zfs_secpolicy_promote, DATASET_NAME, B_TRUE,
- B_TRUE },
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
{ zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, DATASET_NAME,
- B_TRUE, B_TRUE },
+ B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
{ zfs_ioc_snapshot, zfs_secpolicy_snapshot, DATASET_NAME, B_TRUE,
- B_TRUE },
- { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_config, POOL_NAME, B_FALSE,
- B_FALSE },
- { zfs_ioc_obj_to_path, zfs_secpolicy_config, DATASET_NAME, B_FALSE,
- B_TRUE },
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
+ { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_diff, POOL_NAME, B_FALSE,
+ POOL_CHECK_NONE },
+ { zfs_ioc_obj_to_path, zfs_secpolicy_diff, DATASET_NAME, B_FALSE,
+ POOL_CHECK_SUSPENDED },
{ zfs_ioc_pool_set_props, zfs_secpolicy_config, POOL_NAME, B_TRUE,
- B_TRUE },
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
{ zfs_ioc_pool_get_props, zfs_secpolicy_read, POOL_NAME, B_FALSE,
- B_FALSE },
+ POOL_CHECK_NONE },
{ zfs_ioc_set_fsacl, zfs_secpolicy_fsacl, DATASET_NAME, B_TRUE,
- B_TRUE },
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
{ zfs_ioc_get_fsacl, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
- B_FALSE },
- { zfs_ioc_share, zfs_secpolicy_share, DATASET_NAME, B_FALSE, B_FALSE },
+ POOL_CHECK_NONE },
+ { zfs_ioc_share, zfs_secpolicy_share, DATASET_NAME, B_FALSE,
+ POOL_CHECK_NONE },
{ zfs_ioc_inherit_prop, zfs_secpolicy_inherit, DATASET_NAME, B_TRUE,
- B_TRUE },
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
{ zfs_ioc_smb_acl, zfs_secpolicy_smb_acl, DATASET_NAME, B_FALSE,
- B_FALSE },
- { zfs_ioc_userspace_one, zfs_secpolicy_userspace_one,
- DATASET_NAME, B_FALSE, B_FALSE },
- { zfs_ioc_userspace_many, zfs_secpolicy_userspace_many,
- DATASET_NAME, B_FALSE, B_FALSE },
+ POOL_CHECK_NONE },
+ { zfs_ioc_userspace_one, zfs_secpolicy_userspace_one, DATASET_NAME,
+ B_FALSE, POOL_CHECK_NONE },
+ { zfs_ioc_userspace_many, zfs_secpolicy_userspace_many, DATASET_NAME,
+ B_FALSE, POOL_CHECK_NONE },
{ zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade,
- DATASET_NAME, B_FALSE, B_TRUE },
- { zfs_ioc_hold, zfs_secpolicy_hold, DATASET_NAME, B_TRUE, B_TRUE },
+ DATASET_NAME, B_FALSE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
+ { zfs_ioc_hold, zfs_secpolicy_hold, DATASET_NAME, B_TRUE,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
{ zfs_ioc_release, zfs_secpolicy_release, DATASET_NAME, B_TRUE,
- B_TRUE },
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
{ zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
- B_TRUE },
+ POOL_CHECK_SUSPENDED },
{ zfs_ioc_objset_recvd_props, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
- B_FALSE },
+ POOL_CHECK_NONE },
{ zfs_ioc_vdev_split, zfs_secpolicy_config, POOL_NAME, B_TRUE,
- B_TRUE }
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
+ { zfs_ioc_next_obj, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
+ POOL_CHECK_NONE },
+ { zfs_ioc_diff, zfs_secpolicy_diff, DATASET_NAME, B_FALSE,
+ POOL_CHECK_NONE },
+ { zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot, DATASET_NAME,
+ B_FALSE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
+ { zfs_ioc_obj_to_stats, zfs_secpolicy_diff, DATASET_NAME, B_FALSE,
+ POOL_CHECK_SUSPENDED }
};
int
-pool_status_check(const char *name, zfs_ioc_namecheck_t type)
+pool_status_check(const char *name, zfs_ioc_namecheck_t type,
+ zfs_ioc_poolcheck_t check)
{
spa_t *spa;
int error;
ASSERT(type == POOL_NAME || type == DATASET_NAME);
+ if (check & POOL_CHECK_NONE)
+ return (0);
+
error = spa_open(name, &spa, FTAG);
if (error == 0) {
- if (spa_suspended(spa))
+ if ((check & POOL_CHECK_SUSPENDED) && spa_suspended(spa))
error = EAGAIN;
+ else if ((check & POOL_CHECK_READONLY) && !spa_writeable(spa))
+ error = EROFS;
spa_close(spa, FTAG);
}
return (error);
}
+/*
+ * Find a free minor number.
+ */
+minor_t
+zfsdev_minor_alloc(void)
+{
+ static minor_t last_minor;
+ minor_t m;
+
+ ASSERT(MUTEX_HELD(&zfsdev_state_lock));
+
+ for (m = last_minor + 1; m != last_minor; m++) {
+ if (m > ZFSDEV_MAX_MINOR)
+ m = 1;
+ if (ddi_get_soft_state(zfsdev_state, m) == NULL) {
+ last_minor = m;
+ return (m);
+ }
+ }
+
+ return (0);
+}
+
+static int
+zfs_ctldev_init(dev_t *devp)
+{
+ minor_t minor;
+ zfs_soft_state_t *zs;
+
+ ASSERT(MUTEX_HELD(&zfsdev_state_lock));
+ ASSERT(getminor(*devp) == 0);
+
+ minor = zfsdev_minor_alloc();
+ if (minor == 0)
+ return (ENXIO);
+
+ if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS)
+ return (EAGAIN);
+
+ *devp = makedevice(getemajor(*devp), minor);
+
+ zs = ddi_get_soft_state(zfsdev_state, minor);
+ zs->zss_type = ZSST_CTLDEV;
+ zfs_onexit_init((zfs_onexit_t **)&zs->zss_data);
+
+ return (0);
+}
+
+static void
+zfs_ctldev_destroy(zfs_onexit_t *zo, minor_t minor)
+{
+ ASSERT(MUTEX_HELD(&zfsdev_state_lock));
+
+ zfs_onexit_destroy(zo);
+ ddi_soft_state_free(zfsdev_state, minor);
+}
+
+void *
+zfsdev_get_soft_state(minor_t minor, enum zfs_soft_state_type which)
+{
+ zfs_soft_state_t *zp;
+
+ zp = ddi_get_soft_state(zfsdev_state, minor);
+ if (zp == NULL || zp->zss_type != which)
+ return (NULL);
+
+ return (zp->zss_data);
+}
+
+static int
+zfsdev_open(dev_t *devp, int flag, int otyp, cred_t *cr)
+{
+ int error = 0;
+
+ if (getminor(*devp) != 0)
+ return (zvol_open(devp, flag, otyp, cr));
+
+ /* This is the control device. Allocate a new minor if requested. */
+ if (flag & FEXCL) {
+ mutex_enter(&zfsdev_state_lock);
+ error = zfs_ctldev_init(devp);
+ mutex_exit(&zfsdev_state_lock);
+ }
+
+ return (error);
+}
+
+static int
+zfsdev_close(dev_t dev, int flag, int otyp, cred_t *cr)
+{
+ zfs_onexit_t *zo;
+ minor_t minor = getminor(dev);
+
+ if (minor == 0)
+ return (0);
+
+ mutex_enter(&zfsdev_state_lock);
+ zo = zfsdev_get_soft_state(minor, ZSST_CTLDEV);
+ if (zo == NULL) {
+ mutex_exit(&zfsdev_state_lock);
+ return (zvol_close(dev, flag, otyp, cr));
+ }
+ zfs_ctldev_destroy(zo, minor);
+ mutex_exit(&zfsdev_state_lock);
+
+ return (0);
+}
+
static int
zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
{
zfs_cmd_t *zc;
uint_t vec;
int error, rc;
+ minor_t minor = getminor(dev);
- if (getminor(dev) != 0)
+ if (minor != 0 &&
+ zfsdev_get_soft_state(minor, ZSST_CTLDEV) == NULL)
return (zvol_ioctl(dev, cmd, arg, flag, cr, rvalp));
vec = cmd - ZFS_IOC;
@@ -4405,17 +4914,17 @@ zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
case POOL_NAME:
if (pool_namecheck(zc->zc_name, NULL, NULL) != 0)
error = EINVAL;
- if (zfs_ioc_vec[vec].zvec_pool_check)
- error = pool_status_check(zc->zc_name,
- zfs_ioc_vec[vec].zvec_namecheck);
+ error = pool_status_check(zc->zc_name,
+ zfs_ioc_vec[vec].zvec_namecheck,
+ zfs_ioc_vec[vec].zvec_pool_check);
break;
case DATASET_NAME:
if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0)
error = EINVAL;
- if (zfs_ioc_vec[vec].zvec_pool_check)
- error = pool_status_check(zc->zc_name,
- zfs_ioc_vec[vec].zvec_namecheck);
+ error = pool_status_check(zc->zc_name,
+ zfs_ioc_vec[vec].zvec_namecheck,
+ zfs_ioc_vec[vec].zvec_pool_check);
break;
case NO_NAME:
@@ -4499,8 +5008,8 @@ zfs_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
* so most of the standard driver entry points are in zvol.c.
*/
static struct cb_ops zfs_cb_ops = {
- zvol_open, /* open */
- zvol_close, /* close */
+ zfsdev_open, /* open */
+ zfsdev_close, /* close */
zvol_strategy, /* strategy */
nodev, /* print */
zvol_dump, /* dump */
diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c
index bf9f37bca..26ab78279 100644
--- a/module/zfs/zfs_log.c
+++ b/module/zfs/zfs_log.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/types.h>
@@ -170,6 +169,12 @@ zfs_log_xvattr(lr_attr_t *lrattr, xvattr_t *xvap)
if (XVA_ISSET_REQ(xvap, XAT_REPARSE))
*attrs |= (xoap->xoa_reparse == 0) ? 0 :
XAT0_REPARSE;
+ if (XVA_ISSET_REQ(xvap, XAT_OFFLINE))
+ *attrs |= (xoap->xoa_offline == 0) ? 0 :
+ XAT0_OFFLINE;
+ if (XVA_ISSET_REQ(xvap, XAT_SPARSE))
+ *attrs |= (xoap->xoa_sparse == 0) ? 0 :
+ XAT0_SPARSE;
}
static void *
@@ -231,7 +236,6 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
zfs_fuid_info_t *fuidp, vattr_t *vap)
{
itx_t *itx;
- uint64_t seq;
lr_create_t *lr;
lr_acl_create_t *lracl;
size_t aclsize;
@@ -333,9 +337,7 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
*/
bcopy(name, end, namesize);
- seq = zil_itx_assign(zilog, itx, tx);
- dzp->z_last_itx = seq;
- zp->z_last_itx = seq;
+ zil_itx_assign(zilog, itx, tx);
}
/*
@@ -343,10 +345,9 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
*/
void
zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
- znode_t *dzp, char *name)
+ znode_t *dzp, char *name, uint64_t foid)
{
itx_t *itx;
- uint64_t seq;
lr_remove_t *lr;
size_t namesize = strlen(name) + 1;
@@ -358,8 +359,9 @@ zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
lr->lr_doid = dzp->z_id;
bcopy(name, (char *)(lr + 1), namesize);
- seq = zil_itx_assign(zilog, itx, tx);
- dzp->z_last_itx = seq;
+ itx->itx_oid = foid;
+
+ zil_itx_assign(zilog, itx, tx);
}
/*
@@ -370,7 +372,6 @@ zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
znode_t *dzp, znode_t *zp, char *name)
{
itx_t *itx;
- uint64_t seq;
lr_link_t *lr;
size_t namesize = strlen(name) + 1;
@@ -383,9 +384,7 @@ zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
lr->lr_link_obj = zp->z_id;
bcopy(name, (char *)(lr + 1), namesize);
- seq = zil_itx_assign(zilog, itx, tx);
- dzp->z_last_itx = seq;
- zp->z_last_itx = seq;
+ zil_itx_assign(zilog, itx, tx);
}
/*
@@ -396,7 +395,6 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
znode_t *dzp, znode_t *zp, char *name, char *link)
{
itx_t *itx;
- uint64_t seq;
lr_create_t *lr;
size_t namesize = strlen(name) + 1;
size_t linksize = strlen(link) + 1;
@@ -418,9 +416,7 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
bcopy(name, (char *)(lr + 1), namesize);
bcopy(link, (char *)(lr + 1) + namesize, linksize);
- seq = zil_itx_assign(zilog, itx, tx);
- dzp->z_last_itx = seq;
- zp->z_last_itx = seq;
+ zil_itx_assign(zilog, itx, tx);
}
/*
@@ -431,7 +427,6 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp)
{
itx_t *itx;
- uint64_t seq;
lr_rename_t *lr;
size_t snamesize = strlen(sname) + 1;
size_t dnamesize = strlen(dname) + 1;
@@ -445,11 +440,9 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
lr->lr_tdoid = tdzp->z_id;
bcopy(sname, (char *)(lr + 1), snamesize);
bcopy(dname, (char *)(lr + 1) + snamesize, dnamesize);
+ itx->itx_oid = szp->z_id;
- seq = zil_itx_assign(zilog, itx, tx);
- sdzp->z_last_itx = seq;
- tdzp->z_last_itx = seq;
- szp->z_last_itx = seq;
+ zil_itx_assign(zilog, itx, tx);
}
/*
@@ -520,13 +513,11 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
itx->itx_private = zp->z_zfsvfs;
- if ((zp->z_sync_cnt != 0) || (fsync_cnt != 0) ||
- (ioflag & (FSYNC | FDSYNC)))
- itx->itx_sync = B_TRUE;
- else
+ if (!(ioflag & (FSYNC | FDSYNC)) && (zp->z_sync_cnt == 0) &&
+ (fsync_cnt == 0))
itx->itx_sync = B_FALSE;
- zp->z_last_itx = zil_itx_assign(zilog, itx, tx);
+ zil_itx_assign(zilog, itx, tx);
off += len;
resid -= len;
@@ -541,7 +532,6 @@ zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
znode_t *zp, uint64_t off, uint64_t len)
{
itx_t *itx;
- uint64_t seq;
lr_truncate_t *lr;
if (zil_replaying(zilog, tx) || zp->z_unlinked)
@@ -554,8 +544,7 @@ zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
lr->lr_length = len;
itx->itx_sync = (zp->z_sync_cnt != 0);
- seq = zil_itx_assign(zilog, itx, tx);
- zp->z_last_itx = seq;
+ zil_itx_assign(zilog, itx, tx);
}
/*
@@ -566,7 +555,6 @@ zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp)
{
itx_t *itx;
- uint64_t seq;
lr_setattr_t *lr;
xvattr_t *xvap = (xvattr_t *)vap;
size_t recsize = sizeof (lr_setattr_t);
@@ -618,8 +606,7 @@ zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
(void) zfs_log_fuid_domains(fuidp, start);
itx->itx_sync = (zp->z_sync_cnt != 0);
- seq = zil_itx_assign(zilog, itx, tx);
- zp->z_last_itx = seq;
+ zil_itx_assign(zilog, itx, tx);
}
/*
@@ -630,7 +617,6 @@ zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
vsecattr_t *vsecp, zfs_fuid_info_t *fuidp)
{
itx_t *itx;
- uint64_t seq;
lr_acl_v0_t *lrv0;
lr_acl_t *lr;
int txtype;
@@ -686,6 +672,5 @@ zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
}
itx->itx_sync = (zp->z_sync_cnt != 0);
- seq = zil_itx_assign(zilog, itx, tx);
- zp->z_last_itx = seq;
+ zil_itx_assign(zilog, itx, tx);
}
diff --git a/module/zfs/zfs_onexit.c b/module/zfs/zfs_onexit.c
new file mode 100644
index 000000000..9706de2b4
--- /dev/null
+++ b/module/zfs/zfs_onexit.c
@@ -0,0 +1,246 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/open.h>
+#include <sys/kmem.h>
+#include <sys/conf.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/mkdev.h>
+#include <sys/zfs_onexit.h>
+#include <sys/zvol.h>
+
+/*
+ * ZFS kernel routines may add/delete callback routines to be invoked
+ * upon process exit (triggered via the close operation from the /dev/zfs
+ * driver).
+ *
+ * These cleanup callbacks are intended to allow for the accumulation
+ * of kernel state across multiple ioctls. User processes participate
+ * by opening ZFS_DEV with O_EXCL. This causes the ZFS driver to do a
+ * clone-open, generating a unique minor number. The process then passes
+ * along that file descriptor to each ioctl that might have a cleanup operation.
+ *
+ * Consumers of the onexit routines should call zfs_onexit_fd_hold() early
+ * on to validate the given fd and add a reference to its file table entry.
+ * This allows the consumer to do its work and then add a callback, knowing
+ * that zfs_onexit_add_cb() won't fail with EBADF. When finished, consumers
+ * should call zfs_onexit_fd_rele().
+ *
+ * A simple example is zfs_ioc_recv(), where we might create an AVL tree
+ * with dataset/GUID mappings and then reuse that tree on subsequent
+ * zfs_ioc_recv() calls.
+ *
+ * On the first zfs_ioc_recv() call, dmu_recv_stream() will kmem_alloc()
+ * the AVL tree and pass it along with a callback function to
+ * zfs_onexit_add_cb(). The zfs_onexit_add_cb() routine will register the
+ * callback and return an action handle.
+ *
+ * The action handle is then passed from user space to subsequent
+ * zfs_ioc_recv() calls, so that dmu_recv_stream() can fetch its AVL tree
+ * by calling zfs_onexit_cb_data() with the device minor number and
+ * action handle.
+ *
+ * If the user process exits abnormally, the callback is invoked implicitly
+ * as part of the driver close operation. Once the user space process is
+ * finished with the accumulated kernel state, it can also just call close(2)
+ * on the cleanup fd to trigger the cleanup callback.
+ */
+
+/*
+ * Allocate a zeroed onexit context (KM_SLEEP), store it through 'zop',
+ * and initialize its lock and its empty list of action nodes.
+ */
+void
+zfs_onexit_init(zfs_onexit_t **zop)
+{
+	zfs_onexit_t *ctx = kmem_zalloc(sizeof (zfs_onexit_t), KM_SLEEP);
+
+	*zop = ctx;
+	mutex_init(&ctx->zo_lock, NULL, MUTEX_DEFAULT, NULL);
+	list_create(&ctx->zo_actions, sizeof (zfs_onexit_action_node_t),
+	    offsetof(zfs_onexit_action_node_t, za_link));
+}
+
+/*
+ * Destroy an onexit context: for every registered action, call its
+ * callback on its data and free the node; then destroy the list and the
+ * lock and free the context itself.
+ */
+void
+zfs_onexit_destroy(zfs_onexit_t *zo)
+{
+ zfs_onexit_action_node_t *ap;
+
+ mutex_enter(&zo->zo_lock);
+ while ((ap = list_head(&zo->zo_actions)) != NULL) {
+ list_remove(&zo->zo_actions, ap);
+ /* zo_lock is not held while the callback runs and the node is freed. */
+ mutex_exit(&zo->zo_lock);
+ ap->za_func(ap->za_data);
+ kmem_free(ap, sizeof (zfs_onexit_action_node_t));
+ mutex_enter(&zo->zo_lock);
+ }
+ mutex_exit(&zo->zo_lock);
+
+ list_destroy(&zo->zo_actions);
+ mutex_destroy(&zo->zo_lock);
+ kmem_free(zo, sizeof (zfs_onexit_t));
+}
+
+/*
+ * Translate a minor number into its onexit context. Stores the result of
+ * the ZSST_CTLDEV soft-state lookup in *zo and returns 0 when one was
+ * found, or EBADF (with *zo set to NULL) when there is none.
+ */
+static int
+zfs_onexit_minor_to_state(minor_t minor, zfs_onexit_t **zo)
+{
+	zfs_onexit_t *found = zfsdev_get_soft_state(minor, ZSST_CTLDEV);
+
+	*zo = found;
+	return (found == NULL ? EBADF : 0);
+}
+
+/*
+ * Consumers might need to operate by minor number instead of fd, since
+ * they might be running in another thread (e.g. txg_sync_thread).
+ *
+ * On success (return 0) *minorp holds the minor number of the device
+ * behind 'fd' and the hold taken by getf() is still in place; the caller
+ * must call zfs_onexit_fd_rele() when finished using the minor number.
+ *
+ * On failure (EBADF: 'fd' is not an open file, or its minor has no
+ * ZSST_CTLDEV state) no hold is left behind, so the caller must not call
+ * zfs_onexit_fd_rele().
+ */
+int
+zfs_onexit_fd_hold(int fd, minor_t *minorp)
+{
+	file_t *fp;
+	zfs_onexit_t *zo;
+	int error;
+
+	fp = getf(fd);
+	if (fp == NULL)
+		return (EBADF);
+
+	*minorp = getminor(fp->f_vnode->v_rdev);
+	error = zfs_onexit_minor_to_state(*minorp, &zo);
+
+	/* Do not leak the getf() hold when 'fd' is not a cleanup fd. */
+	if (error != 0)
+		releasef(fd);
+
+	return (error);
+}
+
+/*
+ * Counterpart of zfs_onexit_fd_hold(): releases 'fd' by calling
+ * releasef() on it.
+ */
+void
+zfs_onexit_fd_rele(int fd)
+{
+ releasef(fd);
+}
+
+/*
+ * Add a callback to be invoked when the calling process exits.
+ *
+ * 'minor' must map to a ZSST_CTLDEV soft state; otherwise EBADF is
+ * returned and nothing is registered. On success a new action node
+ * holding 'func' and 'data' is appended to the context's action list and
+ * 0 is returned. If 'action_handle' is non-NULL it receives the address
+ * of the new node converted to a uint64_t.
+ */
+int
+zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data,
+ uint64_t *action_handle)
+{
+ zfs_onexit_t *zo;
+ zfs_onexit_action_node_t *ap;
+ int error;
+
+ error = zfs_onexit_minor_to_state(minor, &zo);
+ if (error)
+ return (error);
+
+ ap = kmem_alloc(sizeof (zfs_onexit_action_node_t), KM_SLEEP);
+ list_link_init(&ap->za_link);
+ ap->za_func = func;
+ ap->za_data = data;
+
+ mutex_enter(&zo->zo_lock);
+ list_insert_tail(&zo->zo_actions, ap);
+ mutex_exit(&zo->zo_lock);
+ if (action_handle)
+ *action_handle = (uint64_t)(uintptr_t)ap;
+
+ return (0);
+}
+
+/*
+ * Search the context's action list for the node whose address equals
+ * 'action_handle'. Returns that node, or NULL when no list entry matches.
+ * The caller must hold zo->zo_lock (asserted below).
+ */
+static zfs_onexit_action_node_t *
+zfs_onexit_find_cb(zfs_onexit_t *zo, uint64_t action_handle)
+{
+	zfs_onexit_action_node_t *wanted =
+	    (zfs_onexit_action_node_t *)(uintptr_t)action_handle;
+	list_t *actions = &zo->zo_actions;
+	zfs_onexit_action_node_t *node;
+
+	ASSERT(MUTEX_HELD(&zo->zo_lock));
+
+	/* The handle is only compared against list members, never used. */
+	node = list_head(actions);
+	while (node != NULL && node != wanted)
+		node = list_next(actions, node);
+
+	return (node);
+}
+
+/*
+ * Delete the callback, triggering it first if 'fire' is set.
+ *
+ * Returns EBADF when 'minor' has no ZSST_CTLDEV state, ENOENT when
+ * 'action_handle' does not match any registered action, and 0 once the
+ * action node has been removed from the list and freed.
+ */
+int
+zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, boolean_t fire)
+{
+ zfs_onexit_t *zo;
+ zfs_onexit_action_node_t *ap;
+ int error;
+
+ error = zfs_onexit_minor_to_state(minor, &zo);
+ if (error)
+ return (error);
+
+ mutex_enter(&zo->zo_lock);
+ ap = zfs_onexit_find_cb(zo, action_handle);
+ if (ap != NULL) {
+ list_remove(&zo->zo_actions, ap);
+ /* The node is off the list; run the callback without zo_lock held. */
+ mutex_exit(&zo->zo_lock);
+ if (fire)
+ ap->za_func(ap->za_data);
+ kmem_free(ap, sizeof (zfs_onexit_action_node_t));
+ } else {
+ mutex_exit(&zo->zo_lock);
+ error = ENOENT;
+ }
+
+ return (error);
+}
+
+/*
+ * Fetch the data pointer registered with a callback, so that consumers
+ * of the cleanup-on-exit interfaces can stash kernel data across system
+ * calls and find it again later.
+ *
+ * *data is set to NULL first. Returns EBADF when 'minor' has no
+ * ZSST_CTLDEV state, ENOENT when 'action_handle' matches no registered
+ * action, and 0 with *data set to the action's data otherwise.
+ */
+int
+zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, void **data)
+{
+	zfs_onexit_t *zo;
+	zfs_onexit_action_node_t *node;
+	int err;
+
+	*data = NULL;
+
+	if ((err = zfs_onexit_minor_to_state(minor, &zo)) != 0)
+		return (err);
+
+	mutex_enter(&zo->zo_lock);
+	node = zfs_onexit_find_cb(zo, action_handle);
+	if (node == NULL)
+		err = ENOENT;
+	else
+		*data = node->za_data;
+	mutex_exit(&zo->zo_lock);
+
+	return (err);
+}
diff --git a/module/zfs/zfs_replay.c b/module/zfs/zfs_replay.c
index f26009b02..9fb336856 100644
--- a/module/zfs/zfs_replay.c
+++ b/module/zfs/zfs_replay.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/types.h>
@@ -129,6 +128,10 @@ zfs_replay_xvattr(lr_attr_t *lrattr, xvattr_t *xvap)
bcopy(scanstamp, xoap->xoa_av_scanstamp, AV_SCANSTAMP_SZ);
if (XVA_ISSET_REQ(xvap, XAT_REPARSE))
xoap->xoa_reparse = ((*attrs & XAT0_REPARSE) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_OFFLINE))
+ xoap->xoa_offline = ((*attrs & XAT0_OFFLINE) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_SPARSE))
+ xoap->xoa_sparse = ((*attrs & XAT0_SPARSE) != 0);
}
static int
@@ -625,7 +628,7 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
znode_t *zp;
int error;
ssize_t resid;
- uint64_t orig_eof, eod, offset, length;
+ uint64_t eod, offset, length;
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
@@ -643,9 +646,20 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
offset = lr->lr_offset;
length = lr->lr_length;
- eod = offset + length; /* end of data for this write */
+ eod = offset + length; /* end of data for this write */
- orig_eof = zp->z_size;
+ /*
+ * This may be a write from a dmu_sync() for a whole block,
+ * and may extend beyond the current end of the file.
+ * We can't just replay what was written for this TX_WRITE as
+ * a future TX_WRITE2 may extend the eof and the data for that
+ * write needs to be there. So we write the whole block and
+ * reduce the eof. This needs to be done within the single dmu
+ * transaction created within vn_rdwr -> zfs_write. So a possible
+ * new end of file is passed through in zfsvfs->z_replay_eof
+ */
+
+ zfsvfs->z_replay_eof = 0; /* 0 means don't change end of file */
/* If it's a dmu_sync() block, write the whole block */
if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
@@ -654,23 +668,15 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
offset -= offset % blocksize;
length = blocksize;
}
+ if (zp->z_size < eod)
+ zfsvfs->z_replay_eof = eod;
}
error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, length, offset,
UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
- /*
- * This may be a write from a dmu_sync() for a whole block,
- * and may extend beyond the current end of the file.
- * We can't just replay what was written for this TX_WRITE as
- * a future TX_WRITE2 may extend the eof and the data for that
- * write needs to be there. So we write the whole block and
- * reduce the eof.
- */
- if (orig_eof < zp->z_size) /* file length grew ? */
- zp->z_size = eod;
-
VN_RELE(ZTOV(zp));
+ zfsvfs->z_replay_eof = 0; /* safety */
return (error);
}
@@ -694,10 +700,31 @@ zfs_replay_write2(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
return (error);
+top:
end = lr->lr_offset + lr->lr_length;
if (end > zp->z_size) {
- ASSERT3U(end - zp->z_size, <, zp->z_blksz);
+ dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
+
zp->z_size = end;
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ VN_RELE(ZTOV(zp));
+ if (error == ERESTART) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ return (error);
+ }
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+ (void *)&zp->z_size, sizeof (uint64_t), tx);
+
+ /* Ensure the replayed seq is updated */
+ (void) zil_replaying(zfsvfs->z_log, tx);
+
+ dmu_tx_commit(tx);
}
VN_RELE(ZTOV(zp));
diff --git a/module/zfs/zfs_sa.c b/module/zfs/zfs_sa.c
index 73a40aa4f..d141e43d7 100644
--- a/module/zfs/zfs_sa.c
+++ b/module/zfs/zfs_sa.c
@@ -125,6 +125,7 @@ zfs_sa_get_scanstamp(znode_t *zp, xvattr_t *xvap)
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
xoptattr_t *xoap;
+ ASSERT(MUTEX_HELD(&zp->z_lock));
VERIFY((xoap = xva_getxoptattr(xvap)) != NULL);
if (zp->z_is_sa) {
if (sa_lookup(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs),
@@ -158,6 +159,7 @@ zfs_sa_set_scanstamp(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
xoptattr_t *xoap;
+ ASSERT(MUTEX_HELD(&zp->z_lock));
VERIFY((xoap = xva_getxoptattr(xvap)) != NULL);
if (zp->z_is_sa)
VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs),
@@ -204,6 +206,7 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx)
uint64_t crtime[2], mtime[2], ctime[2];
zfs_acl_phys_t znode_acl;
char scanstamp[AV_SCANSTAMP_SZ];
+ boolean_t drop_lock = B_FALSE;
/*
* No upgrade if ACL isn't cached
@@ -214,6 +217,22 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx)
if (zp->z_acl_cached == NULL || ZTOV(zp)->v_type == VLNK)
return;
+ /*
+ * If the z_lock is held and we aren't the owner
+ * then just return since we don't want to deadlock
+ * trying to update the status of z_is_sa. This
+ * file can then be upgraded at a later time.
+ *
+ * Otherwise, we know we are doing the
+ * sa_update() that caused us to enter this function.
+ */
+ if (mutex_owner(&zp->z_lock) != curthread) {
+ if (mutex_tryenter(&zp->z_lock) == 0)
+ return;
+ else
+ drop_lock = B_TRUE;
+ }
+
/* First do a bulk query of the attributes that aren't cached */
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
@@ -228,7 +247,7 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx)
&znode_acl, 88);
if (sa_bulk_lookup_locked(hdl, bulk, count) != 0)
- return;
+ goto done;
/*
@@ -269,9 +288,10 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx)
locate.cb_aclp = zp->z_acl_cached;
SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_ACES(zfsvfs),
zfs_acl_data_locator, &locate, zp->z_acl_cached->z_acl_bytes);
+
if (xattr)
- SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_RDEV(zfsvfs),
- NULL, &rdev, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_XATTR(zfsvfs),
+ NULL, &xattr, 8);
/* if scanstamp then add scanstamp */
@@ -291,6 +311,9 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx)
znode_acl.z_acl_extern_obj, tx));
zp->z_is_sa = B_TRUE;
+done:
+ if (drop_lock)
+ mutex_exit(&zp->z_lock);
}
void
@@ -299,12 +322,11 @@ zfs_sa_upgrade_txholds(dmu_tx_t *tx, znode_t *zp)
if (!zp->z_zfsvfs->z_use_sa || zp->z_is_sa)
return;
- ASSERT(!zp->z_is_sa);
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
- if (ZFS_EXTERNAL_ACL(zp)) {
- dmu_tx_hold_free(tx, ZFS_EXTERNAL_ACL(zp), 0,
+ if (zfs_external_acl(zp)) {
+ dmu_tx_hold_free(tx, zfs_external_acl(zp), 0,
DMU_OBJECT_END);
}
}
diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c
index f68dde85f..cb8c1d086 100644
--- a/module/zfs/zfs_vfsops.c
+++ b/module/zfs/zfs_vfsops.c
@@ -166,7 +166,7 @@ zfs_sync(vfs_t *vfsp, short flag, cred_t *cr)
}
if (zfsvfs->z_log != NULL)
- zil_commit(zfsvfs->z_log, UINT64_MAX, 0);
+ zil_commit(zfsvfs->z_log, 0);
ZFS_EXIT(zfsvfs);
} else {
@@ -417,7 +417,8 @@ zfs_register_callbacks(vfs_t *vfsp)
* of mount options, we stash away the current values and
* restore them after we register the callbacks.
*/
- if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
+ if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
+ !spa_writeable(dmu_objset_spa(os))) {
readonly = B_TRUE;
do_readonly = B_TRUE;
} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
@@ -821,23 +822,14 @@ zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup)
{
uint64_t fuid;
uint64_t quotaobj;
- uid_t id;
quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
- id = isgroup ? zp->z_gid : zp->z_uid;
+ fuid = isgroup ? zp->z_gid : zp->z_uid;
if (quotaobj == 0 || zfsvfs->z_replay)
return (B_FALSE);
- if (IS_EPHEMERAL(id)) {
- VERIFY(0 == sa_lookup(zp->z_sa_hdl,
- isgroup ? SA_ZPL_GID(zfsvfs) : SA_ZPL_UID(zfsvfs),
- &fuid, sizeof (fuid)));
- } else {
- fuid = (uint64_t)id;
- }
-
return (zfs_fuid_overquota(zfsvfs, isgroup, fuid));
}
@@ -922,7 +914,10 @@ zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
sa_obj = 0;
}
- zfsvfs->z_attr_table = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END);
+ error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
+ &zfsvfs->z_attr_table);
+ if (error)
+ goto out;
if (zfsvfs->z_version >= ZPL_VERSION_SA)
sa_register_update_callback(os, zfs_sa_upgrade);
@@ -1043,12 +1038,15 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
* allocated and in the unlinked set, and there is an
* intent log record saying to allocate it.
*/
- if (zil_replay_disable) {
- zil_destroy(zfsvfs->z_log, B_FALSE);
- } else {
- zfsvfs->z_replay = B_TRUE;
- zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector);
- zfsvfs->z_replay = B_FALSE;
+ if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
+ if (zil_replay_disable) {
+ zil_destroy(zfsvfs->z_log, B_FALSE);
+ } else {
+ zfsvfs->z_replay = B_TRUE;
+ zil_replay(zfsvfs->z_os, zfsvfs,
+ zfs_replay_vector);
+ zfsvfs->z_replay = B_FALSE;
+ }
}
zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
}
@@ -1172,6 +1170,7 @@ zfs_domount(vfs_t *vfsp, char *osname)
goto out;
xattr_changed_cb(zfsvfs, pval);
zfsvfs->z_issnap = B_TRUE;
+ zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;
mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
@@ -1808,10 +1807,10 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
/*
* Evict cached data
*/
- if (dmu_objset_evict_dbufs(zfsvfs->z_os)) {
- txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
- (void) dmu_objset_evict_dbufs(zfsvfs->z_os);
- }
+ if (dmu_objset_is_dirty_anywhere(zfsvfs->z_os))
+ if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY))
+ txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
+ (void) dmu_objset_evict_dbufs(zfsvfs->z_os);
return (0);
}
@@ -2031,8 +2030,9 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname)
goto bail;
- zfsvfs->z_attr_table = sa_setup(zfsvfs->z_os, sa_obj,
- zfs_attr_table, ZPL_END);
+ if ((err = sa_setup(zfsvfs->z_os, sa_obj,
+ zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table)) != 0)
+ goto bail;
VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
@@ -2272,7 +2272,7 @@ static vfsdef_t vfw = {
MNTTYPE_ZFS,
zfs_vfsinit,
VSW_HASPROTO|VSW_CANRWRO|VSW_CANREMOUNT|VSW_VOLATILEDEV|VSW_STATS|
- VSW_XID,
+ VSW_XID|VSW_ZMOUNT,
&zfs_mntopts
};
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index aa43c065f..a0720079c 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -132,7 +132,7 @@
* (6) At the end of each vnode op, the DMU tx must always commit,
* regardless of whether there were any errors.
*
- * (7) After dropping all locks, invoke zil_commit(zilog, seq, foid)
+ * (7) After dropping all locks, invoke zil_commit(zilog, foid)
* to ensure that synchronous semantics are provided when necessary.
*
* In general, this is how things should be ordered in each vnode op:
@@ -164,7 +164,7 @@
* rw_exit(...); // drop locks
* zfs_dirent_unlock(dl); // unlock directory entry
* VN_RELE(...); // release held vnodes
- * zil_commit(zilog, seq, foid); // synchronous when necessary
+ * zil_commit(zilog, foid); // synchronous when necessary
* ZFS_EXIT(zfsvfs); // finished in zfs
* return (error); // done, report error
*/
@@ -490,7 +490,7 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
* If we're in FRSYNC mode, sync out this znode before reading it.
*/
if (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
+ zil_commit(zfsvfs->z_log, zp->z_id);
/*
* Lock the range against changes.
@@ -670,7 +670,7 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
(((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
xuio = (xuio_t *)uio;
else
- uio_prefaultpages(n, uio);
+ uio_prefaultpages(MIN(n, max_blksz), uio);
/*
* If in append mode, set the io offset pointer to eof.
@@ -866,6 +866,8 @@ again:
* been done, but that would still expose the ISUID/ISGID
* to another app after the partial write is committed.
*
+ * Note: we don't call zfs_fuid_map_id() here because
+ * user 0 is not an ephemeral uid.
*/
mutex_enter(&zp->z_acl_lock);
if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
@@ -893,6 +895,14 @@ again:
uio->uio_loffset);
ASSERT(error == 0);
}
+ /*
+ * If we are replaying and eof is non zero then force
+ * the file size to the specified eof. Note, there's no
+ * concurrency during replay.
+ */
+ if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
+ zp->z_size = zfsvfs->z_replay_eof;
+
error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
@@ -902,6 +912,9 @@ again:
break;
ASSERT(tx_bytes == nbytes);
n -= nbytes;
+
+ if (!xuio && n > 0)
+ uio_prefaultpages(MIN(n, max_blksz), uio);
}
zfs_range_unlock(rl);
@@ -917,7 +930,7 @@ again:
if (ioflag & (FSYNC | FDSYNC) ||
zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, zp->z_last_itx, zp->z_id);
+ zil_commit(zilog, zp->z_id);
ZFS_EXIT(zfsvfs);
return (0);
@@ -1356,6 +1369,8 @@ top:
error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
NULL, NULL);
if (error) {
+ if (have_acl)
+ zfs_acl_ids_free(&acl_ids);
if (strcmp(name, "..") == 0)
error = EISDIR;
ZFS_EXIT(zfsvfs);
@@ -1371,6 +1386,8 @@ top:
* to reference it.
*/
if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
+ if (have_acl)
+ zfs_acl_ids_free(&acl_ids);
goto out;
}
@@ -1381,6 +1398,8 @@ top:
if ((dzp->z_pflags & ZFS_XATTR) &&
(vap->va_type != VREG)) {
+ if (have_acl)
+ zfs_acl_ids_free(&acl_ids);
error = EINVAL;
goto out;
}
@@ -1440,6 +1459,10 @@ top:
} else {
int aflags = (flag & FAPPEND) ? V_APPEND : 0;
+ if (have_acl)
+ zfs_acl_ids_free(&acl_ids);
+ have_acl = B_FALSE;
+
/*
* A directory entry already exists for this name.
*/
@@ -1496,7 +1519,7 @@ out:
}
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, UINT64_MAX, 0);
+ zil_commit(zilog, 0);
ZFS_EXIT(zfsvfs);
return (error);
@@ -1527,12 +1550,13 @@ zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
int flags)
{
znode_t *zp, *dzp = VTOZ(dvp);
- znode_t *xzp = NULL;
+ znode_t *xzp;
vnode_t *vp;
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
zilog_t *zilog;
- uint64_t acl_obj, xattr_obj = 0;
+ uint64_t acl_obj, xattr_obj;
uint64_t xattr_obj_unlinked = 0;
+ uint64_t obj = 0;
zfs_dirlock_t *dl;
dmu_tx_t *tx;
boolean_t may_delete_now, delete_now = FALSE;
@@ -1554,6 +1578,8 @@ zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
}
top:
+ xattr_obj = 0;
+ xzp = NULL;
/*
* Attempt to lock directory; fail if entry doesn't exist.
*/
@@ -1596,6 +1622,7 @@ top:
* other holds on the vnode. So we dmu_tx_hold() the right things to
* allow for either case.
*/
+ obj = zp->z_id;
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
@@ -1612,16 +1639,17 @@ top:
/* are there any extended attributes? */
error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
&xattr_obj, sizeof (xattr_obj));
- if (xattr_obj) {
+ if (error == 0 && xattr_obj) {
error = zfs_zget(zfsvfs, xattr_obj, &xzp);
ASSERT3U(error, ==, 0);
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
}
- /* are there any additional acls */
- if ((acl_obj = ZFS_EXTERNAL_ACL(zp)) != 0 && may_delete_now)
+ mutex_enter(&zp->z_lock);
+ if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
+ mutex_exit(&zp->z_lock);
/* charge as an update -- would be nice not to charge at all */
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
@@ -1630,6 +1658,8 @@ top:
if (error) {
zfs_dirent_unlock(dl);
VN_RELE(vp);
+ if (xzp)
+ VN_RELE(ZTOV(xzp));
if (error == ERESTART) {
dmu_tx_wait(tx);
dmu_tx_abort(tx);
@@ -1654,13 +1684,18 @@ top:
if (unlinked) {
+ /*
+ * Hold z_lock so that we can make sure that the ACL obj
+ * hasn't changed. Could have been deleted due to
+ * zfs_sa_upgrade().
+ */
+ mutex_enter(&zp->z_lock);
mutex_enter(&vp->v_lock);
-
(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
&xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
delete_now = may_delete_now && !toobig &&
vp->v_count == 1 && !vn_has_cached_data(vp) &&
- xattr_obj == xattr_obj_unlinked && ZFS_EXTERNAL_ACL(zp) ==
+ xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) ==
acl_obj;
mutex_exit(&vp->v_lock);
}
@@ -1676,6 +1711,7 @@ top:
ASSERT3U(error, ==, 0);
mutex_exit(&xzp->z_lock);
zfs_unlinked_add(xzp, tx);
+
if (zp->z_is_sa)
error = sa_remove(zp->z_sa_hdl,
SA_ZPL_XATTR(zfsvfs), tx);
@@ -1685,7 +1721,6 @@ top:
sizeof (uint64_t), tx);
ASSERT3U(error, ==, 0);
}
- mutex_enter(&zp->z_lock);
mutex_enter(&vp->v_lock);
vp->v_count--;
ASSERT3U(vp->v_count, ==, 0);
@@ -1693,13 +1728,14 @@ top:
mutex_exit(&zp->z_lock);
zfs_znode_delete(zp, tx);
} else if (unlinked) {
+ mutex_exit(&zp->z_lock);
zfs_unlinked_add(zp, tx);
}
txtype = TX_REMOVE;
if (flags & FIGNORECASE)
txtype |= TX_CI;
- zfs_log_remove(zilog, tx, txtype, dzp, name);
+ zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
dmu_tx_commit(tx);
out:
@@ -1714,7 +1750,7 @@ out:
VN_RELE(ZTOV(xzp));
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, UINT64_MAX, 0);
+ zil_commit(zilog, 0);
ZFS_EXIT(zfsvfs);
return (error);
@@ -1896,7 +1932,7 @@ top:
zfs_dirent_unlock(dl);
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, UINT64_MAX, 0);
+ zil_commit(zilog, 0);
ZFS_EXIT(zfsvfs);
return (0);
@@ -2011,7 +2047,7 @@ top:
uint64_t txtype = TX_RMDIR;
if (flags & FIGNORECASE)
txtype |= TX_CI;
- zfs_log_remove(zilog, tx, txtype, dzp, name);
+ zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
}
dmu_tx_commit(tx);
@@ -2024,7 +2060,7 @@ out:
VN_RELE(vp);
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, UINT64_MAX, 0);
+ zil_commit(zilog, 0);
ZFS_EXIT(zfsvfs);
return (error);
@@ -2164,7 +2200,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp,
while (outcount < bytes_wanted) {
ino64_t objnum;
ushort_t reclen;
- off64_t *next;
+ off64_t *next = NULL;
/*
* Special case `.', `..', and `.zfs'.
@@ -2290,7 +2326,8 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp,
} else {
offset += 1;
}
- *next = offset;
+ if (next)
+ *next = offset;
}
zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
@@ -2343,7 +2380,7 @@ zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
- zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
+ zil_commit(zfsvfs->z_log, zp->z_id);
ZFS_EXIT(zfsvfs);
}
return (0);
@@ -2384,6 +2421,8 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
+ zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
+
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
@@ -2397,7 +2436,8 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
* Also, if we are the owner don't bother, since owner should
* always be allowed to read basic attributes of file.
*/
- if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && (zp->z_uid != crgetuid(cr))) {
+ if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
+ (vap->va_uid != crgetuid(cr))) {
if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
skipaclchk, cr)) {
ZFS_EXIT(zfsvfs);
@@ -2413,8 +2453,6 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
mutex_enter(&zp->z_lock);
vap->va_type = vp->v_type;
vap->va_mode = zp->z_mode & MODEMASK;
- vap->va_uid = zp->z_uid;
- vap->va_gid = zp->z_gid;
vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
vap->va_nodeid = zp->z_id;
if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
@@ -2515,6 +2553,22 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
XVA_SET_RTN(xvap, XAT_REPARSE);
}
+ if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
+ xoap->xoa_generation = zp->z_gen;
+ XVA_SET_RTN(xvap, XAT_GEN);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
+ xoap->xoa_offline =
+ ((zp->z_pflags & ZFS_OFFLINE) != 0);
+ XVA_SET_RTN(xvap, XAT_OFFLINE);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
+ xoap->xoa_sparse =
+ ((zp->z_pflags & ZFS_SPARSE) != 0);
+ XVA_SET_RTN(xvap, XAT_SPARSE);
+ }
}
ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
@@ -2570,7 +2624,7 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
int trim_mask = 0;
uint64_t new_mode;
uint64_t new_uid, new_gid;
- uint64_t xattr_obj = 0;
+ uint64_t xattr_obj;
uint64_t mtime[2], ctime[2];
znode_t *attrzp;
int need_policy = FALSE;
@@ -2578,7 +2632,7 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
zfs_fuid_info_t *fuidp = NULL;
xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
xoptattr_t *xoap;
- zfs_acl_t *aclp = NULL;
+ zfs_acl_t *aclp;
boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
boolean_t fuid_dirtied = B_FALSE;
sa_bulk_attr_t bulk[7], xattr_bulk[7];
@@ -2657,6 +2711,7 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
top:
attrzp = NULL;
+ aclp = NULL;
/* Can this be moved to before the top label? */
if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
@@ -2692,6 +2747,8 @@ top:
((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
XVA_ISSET_REQ(xvap, XAT_READONLY) ||
XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
+ XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
+ XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
@@ -2748,8 +2805,7 @@ top:
mutex_enter(&zp->z_lock);
oldva.va_mode = zp->z_mode;
- oldva.va_uid = zp->z_uid;
- oldva.va_gid = zp->z_gid;
+ zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
if (mask & AT_XVATTR) {
/*
* Update xvattr mask to include only those attributes
@@ -2880,10 +2936,10 @@ top:
mask = vap->va_mask;
if ((mask & (AT_UID | AT_GID))) {
- (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj,
- sizeof (xattr_obj));
+ err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+ &xattr_obj, sizeof (xattr_obj));
- if (xattr_obj) {
+ if (err == 0 && xattr_obj) {
err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
if (err)
goto out2;
@@ -2891,8 +2947,10 @@ top:
if (mask & AT_UID) {
new_uid = zfs_fuid_create(zfsvfs,
(uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
- if (vap->va_uid != zp->z_uid &&
+ if (new_uid != zp->z_uid &&
zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
+ if (attrzp)
+ VN_RELE(ZTOV(attrzp));
err = EDQUOT;
goto out2;
}
@@ -2903,6 +2961,8 @@ top:
cr, ZFS_GROUP, &fuidp);
if (new_gid != zp->z_gid &&
zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
+ if (attrzp)
+ VN_RELE(ZTOV(attrzp));
err = EDQUOT;
goto out2;
}
@@ -2912,32 +2972,33 @@ top:
if (mask & AT_MODE) {
uint64_t pmode = zp->z_mode;
+ uint64_t acl_obj;
new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
- if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
- goto out;
+ zfs_acl_chmod_setattr(zp, &aclp, new_mode);
- if (!zp->z_is_sa && ZFS_EXTERNAL_ACL(zp)) {
+ mutex_enter(&zp->z_lock);
+ if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
/*
* Are we upgrading ACL from old V0 format
* to V1 format?
*/
- if (zfsvfs->z_version <= ZPL_VERSION_FUID &&
- ZNODE_ACL_VERSION(zp) ==
+ if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
+ zfs_znode_acl_version(zp) ==
ZFS_ACL_VERSION_INITIAL) {
- dmu_tx_hold_free(tx,
- ZFS_EXTERNAL_ACL(zp), 0,
+ dmu_tx_hold_free(tx, acl_obj, 0,
DMU_OBJECT_END);
dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
0, aclp->z_acl_bytes);
} else {
- dmu_tx_hold_write(tx, ZFS_EXTERNAL_ACL(zp), 0,
+ dmu_tx_hold_write(tx, acl_obj, 0,
aclp->z_acl_bytes);
}
} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
0, aclp->z_acl_bytes);
}
+ mutex_exit(&zp->z_lock);
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
} else {
if ((mask & AT_XVATTR) &&
@@ -2973,12 +3034,17 @@ top:
* updated as a side-effect of calling this function.
*/
+
+ if (mask & (AT_UID|AT_GID|AT_MODE))
+ mutex_enter(&zp->z_acl_lock);
mutex_enter(&zp->z_lock);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
&zp->z_pflags, sizeof (zp->z_pflags));
if (attrzp) {
+ if (mask & (AT_UID|AT_GID|AT_MODE))
+ mutex_enter(&attrzp->z_acl_lock);
mutex_enter(&attrzp->z_lock);
SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
@@ -2990,26 +3056,24 @@ top:
if (mask & AT_UID) {
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
&new_uid, sizeof (new_uid));
- zp->z_uid = zfs_fuid_map_id(zfsvfs, new_uid,
- cr, ZFS_OWNER);
+ zp->z_uid = new_uid;
if (attrzp) {
SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
SA_ZPL_UID(zfsvfs), NULL, &new_uid,
sizeof (new_uid));
- attrzp->z_uid = zp->z_uid;
+ attrzp->z_uid = new_uid;
}
}
if (mask & AT_GID) {
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
NULL, &new_gid, sizeof (new_gid));
- zp->z_gid = zfs_fuid_map_id(zfsvfs, new_gid, cr,
- ZFS_GROUP);
+ zp->z_gid = new_gid;
if (attrzp) {
SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
SA_ZPL_GID(zfsvfs), NULL, &new_gid,
sizeof (new_gid));
- attrzp->z_gid = zp->z_gid;
+ attrzp->z_gid = new_gid;
}
}
if (!(mask & AT_MODE)) {
@@ -3026,20 +3090,18 @@ top:
}
if (mask & AT_MODE) {
- mutex_enter(&zp->z_acl_lock);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
&new_mode, sizeof (new_mode));
zp->z_mode = new_mode;
ASSERT3U((uintptr_t)aclp, !=, NULL);
err = zfs_aclset_common(zp, aclp, cr, tx);
ASSERT3U(err, ==, 0);
+ if (zp->z_acl_cached)
+ zfs_acl_free(zp->z_acl_cached);
zp->z_acl_cached = aclp;
aclp = NULL;
- mutex_exit(&zp->z_acl_lock);
}
- if (attrzp)
- mutex_exit(&attrzp->z_lock);
if (mask & AT_ATIME) {
ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
@@ -3118,7 +3180,14 @@ top:
zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
mutex_exit(&zp->z_lock);
+ if (mask & (AT_UID|AT_GID|AT_MODE))
+ mutex_exit(&zp->z_acl_lock);
+ if (attrzp) {
+ if (mask & (AT_UID|AT_GID|AT_MODE))
+ mutex_exit(&attrzp->z_acl_lock);
+ mutex_exit(&attrzp->z_lock);
+ }
out:
if (err == 0 && attrzp) {
err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
@@ -3145,10 +3214,9 @@ out:
dmu_tx_commit(tx);
}
-
out2:
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, UINT64_MAX, 0);
+ zil_commit(zilog, 0);
ZFS_EXIT(zfsvfs);
return (err);
@@ -3555,9 +3623,8 @@ top:
error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
if (error == 0) {
zfs_log_rename(zilog, tx, TX_RENAME |
- (flags & FIGNORECASE ? TX_CI : 0),
- sdzp, sdl->dl_name, tdzp, tdl->dl_name,
- szp);
+ (flags & FIGNORECASE ? TX_CI : 0), sdzp,
+ sdl->dl_name, tdzp, tdl->dl_name, szp);
/*
* Update path information for the target vnode
@@ -3600,7 +3667,7 @@ out:
VN_RELE(ZTOV(tzp));
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, UINT64_MAX, 0);
+ zil_commit(zilog, 0);
ZFS_EXIT(zfsvfs);
return (error);
@@ -3724,11 +3791,13 @@ top:
if (fuid_dirtied)
zfs_fuid_sync(zfsvfs, tx);
+ mutex_enter(&zp->z_lock);
if (zp->z_is_sa)
error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
link, len, tx);
else
zfs_sa_symlink(zp, link, len, tx);
+ mutex_exit(&zp->z_lock);
zp->z_size = len;
(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
@@ -3751,7 +3820,7 @@ top:
VN_RELE(ZTOV(zp));
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, UINT64_MAX, 0);
+ zil_commit(zilog, 0);
ZFS_EXIT(zfsvfs);
return (error);
@@ -3785,11 +3854,13 @@ zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
+ mutex_enter(&zp->z_lock);
if (zp->z_is_sa)
error = sa_lookup_uio(zp->z_sa_hdl,
SA_ZPL_SYMLINK(zfsvfs), uio);
else
error = zfs_sa_readlink(zp, uio);
+ mutex_exit(&zp->z_lock);
ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
@@ -3828,6 +3899,7 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
int error;
int zf = ZNEW;
uint64_t parent;
+ uid_t owner;
ASSERT(tdvp->v_type == VDIR);
@@ -3887,8 +3959,8 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
}
- if (szp->z_uid != crgetuid(cr) &&
- secpolicy_basic_link(cr) != 0) {
+ owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
+ if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
ZFS_EXIT(zfsvfs);
return (EPERM);
}
@@ -3944,7 +4016,7 @@ top:
}
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, UINT64_MAX, 0);
+ zil_commit(zilog, 0);
ZFS_EXIT(zfsvfs);
return (error);
@@ -4181,7 +4253,7 @@ zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
out:
zfs_range_unlock(rl);
if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zfsvfs->z_log, UINT64_MAX, zp->z_id);
+ zil_commit(zfsvfs->z_log, zp->z_id);
ZFS_EXIT(zfsvfs);
return (error);
}
@@ -4836,7 +4908,7 @@ zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
error = zfs_setacl(zp, vsecp, skipaclchk, cr);
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, UINT64_MAX, 0);
+ zil_commit(zilog, 0);
ZFS_EXIT(zfsvfs);
return (error);
diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c
index 24bd3ddcd..e1e4e9e03 100644
--- a/module/zfs/zfs_znode.c
+++ b/module/zfs/zfs_znode.c
@@ -63,6 +63,7 @@
#include <sys/zfs_znode.h>
#include <sys/sa.h>
#include <sys/zfs_sa.h>
+#include <sys/zfs_stat.h>
#include "zfs_prop.h"
#include "zfs_comutil.h"
@@ -81,9 +82,6 @@
#define ZNODE_STAT_ADD(stat) /* nothing */
#endif /* ZNODE_STATS */
-#define POINTER_IS_VALID(p) (!((uintptr_t)(p) & 0x3))
-#define POINTER_INVALIDATE(pp) (*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1))
-
/*
* Functions needed for userland (ie: libzpool) are not put under
* #ifdef_KERNEL; the rest of the functions have dependencies
@@ -136,6 +134,7 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
zp->z_dirlocks = NULL;
zp->z_acl_cached = NULL;
+ zp->z_moved = 0;
return (0);
}
@@ -196,7 +195,6 @@ zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
nzp->z_blksz = ozp->z_blksz;
nzp->z_seq = ozp->z_seq;
nzp->z_mapcnt = ozp->z_mapcnt;
- nzp->z_last_itx = ozp->z_last_itx;
nzp->z_gen = ozp->z_gen;
nzp->z_sync_cnt = ozp->z_sync_cnt;
nzp->z_is_sa = ozp->z_is_sa;
@@ -228,6 +226,12 @@ zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
*/
ozp->z_sa_hdl = NULL;
POINTER_INVALIDATE(&ozp->z_zfsvfs);
+
+ /*
+ * Record the move: z_moved is 1 on the new znode, (uint8_t)-1 on the old.
+ */
+ nzp->z_moved = 1;
+ ozp->z_moved = (uint8_t)-1;
}
/*ARGSUSED*/
@@ -478,6 +482,8 @@ zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
vattr.va_gid = crgetgid(kcred);
sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP);
+ ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs));
+ sharezp->z_moved = 0;
sharezp->z_unlinked = 0;
sharezp->z_atime_dirty = 0;
sharezp->z_zfsvfs = zfsvfs;
@@ -619,7 +625,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
vnode_t *vp;
uint64_t mode;
uint64_t parent;
- uint64_t uid, gid;
sa_bulk_attr_t bulk[9];
int count = 0;
@@ -627,6 +632,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
ASSERT(zp->z_dirlocks == NULL);
ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
+ zp->z_moved = 0;
/*
* Defer setting z_zfsvfs until the znode is ready to be a candidate for
@@ -636,7 +642,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
zp->z_unlinked = 0;
zp->z_atime_dirty = 0;
zp->z_mapcnt = 0;
- zp->z_last_itx = 0;
zp->z_id = db->db_object;
zp->z_blksz = blksz;
zp->z_seq = 0x7A4653;
@@ -659,9 +664,9 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
&zp->z_atime, 16);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
- &uid, 8);
+ &zp->z_uid, 8);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
- &gid, 8);
+ &zp->z_gid, 8);
if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0) {
if (hdl == NULL)
@@ -670,8 +675,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
return (NULL);
}
- zp->z_uid = zfs_fuid_map_id(zfsvfs, uid, CRED(), ZFS_OWNER);
- zp->z_gid = zfs_fuid_map_id(zfsvfs, gid, CRED(), ZFS_GROUP);
zp->z_mode = mode;
vp->v_vfsp = zfsvfs->z_parent->z_vfs;
@@ -705,7 +708,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
case VREG:
vp->v_flag |= VMODSORT;
if (parent == zfsvfs->z_shares_dir) {
- ASSERT(uid == 0 && gid == 0);
+ ASSERT(zp->z_uid == 0 && zp->z_gid == 0);
vn_setops(vp, zfs_sharevnodeops);
} else {
vn_setops(vp, zfs_fvnodeops);
@@ -759,7 +762,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
{
uint64_t crtime[2], atime[2], mtime[2], ctime[2];
uint64_t mode, size, links, parent, pflags;
- uint64_t dzp_pflags = 0;
+ uint64_t dzp_pflags = 0;
uint64_t rdev = 0;
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
dmu_buf_t *db;
@@ -794,7 +797,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
*/
/*
* There's currently no mechanism for pre-reading the blocks that will
- * be to needed allocate a new object, so we accept the small chance
+ * be needed to allocate a new object, so we accept the small chance
* that there will be an i/o error and we will fail one of the
* assertions below.
*/
@@ -1085,6 +1088,16 @@ zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
zp->z_pflags, tx);
XVA_SET_RTN(xvap, XAT_REPARSE);
}
+ if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
+ ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_OFFLINE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
+ ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_SPARSE);
+ }
}
int
@@ -1174,7 +1187,6 @@ zfs_rezget(znode_t *zp)
dmu_buf_t *db;
uint64_t obj_num = zp->z_id;
uint64_t mode;
- uint64_t uid, gid;
sa_bulk_attr_t bulk[8];
int err;
int count = 0;
@@ -1220,28 +1232,26 @@ zfs_rezget(znode_t *zp)
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
&zp->z_atime, sizeof (zp->z_atime));
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
- &uid, sizeof (uid));
+ &zp->z_uid, sizeof (zp->z_uid));
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
- &gid, sizeof (gid));
+ &zp->z_gid, sizeof (zp->z_gid));
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
&mode, sizeof (mode));
- zp->z_mode = mode;
-
if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
zfs_znode_dmu_fini(zp);
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
return (EIO);
}
+ zp->z_mode = mode;
+
if (gen != zp->z_gen) {
zfs_znode_dmu_fini(zp);
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
return (EIO);
}
- zp->z_uid = zfs_fuid_map_id(zfsvfs, uid, CRED(), ZFS_OWNER);
- zp->z_gid = zfs_fuid_map_id(zfsvfs, gid, CRED(), ZFS_GROUP);
zp->z_unlinked = (zp->z_links == 0);
zp->z_blksz = doi.doi_data_block_size;
@@ -1256,11 +1266,13 @@ zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
objset_t *os = zfsvfs->z_os;
uint64_t obj = zp->z_id;
- uint64_t acl_obj = ZFS_EXTERNAL_ACL(zp);
+ uint64_t acl_obj = zfs_external_acl(zp);
ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
- if (acl_obj)
+ if (acl_obj) {
+ VERIFY(!zp->z_is_sa);
VERIFY(0 == dmu_object_free(os, acl_obj, tx));
+ }
VERIFY(0 == dmu_object_free(os, obj, tx));
zfs_znode_dmu_fini(zp);
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
@@ -1562,6 +1574,8 @@ zfs_trunc(znode_t *zp, uint64_t end)
dmu_tx_t *tx;
rl_t *rl;
int error;
+ sa_bulk_attr_t bulk[2];
+ int count = 0;
/*
* We will change zp_size, lock the whole file.
@@ -1598,9 +1612,15 @@ top:
}
zp->z_size = end;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
+ NULL, &zp->z_size, sizeof (zp->z_size));
- VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs),
- &zp->z_size, sizeof (zp->z_size), tx));
+ if (end == 0) {
+ zp->z_pflags &= ~ZFS_SPARSE;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &zp->z_pflags, 8);
+ }
+ VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);
dmu_tx_commit(tx);
@@ -1805,6 +1825,8 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
vattr.va_gid = crgetgid(cr);
rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
+ ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
+ rootzp->z_moved = 0;
rootzp->z_unlinked = 0;
rootzp->z_atime_dirty = 0;
rootzp->z_is_sa = USE_SA(version, os);
@@ -1822,7 +1844,10 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
zfsvfs.z_use_sa = USE_SA(version, os);
zfsvfs.z_norm = norm;
- zfsvfs.z_attr_table = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END);
+ error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
+ &zfsvfs.z_attr_table);
+
+ ASSERT(error == 0);
/*
* Fold case on file systems that are always or sometimes case
@@ -1838,7 +1863,6 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
mutex_init(&zfsvfs.z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
- ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
rootzp->z_zfsvfs = &zfsvfs;
VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
cr, NULL, &acl_ids));
@@ -1868,78 +1892,121 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
#endif /* _KERNEL */
-/*
- * Given an object number, return its parent object number and whether
- * or not the object is an extended attribute directory.
- */
+/*
+ * Look up ZFS_SA_ATTRS in MASTER_NODE_OBJ and hand the resulting object
+ * number to sa_setup() to obtain the attribute table for osp.
+ *
+ * A missing ZFS_SA_ATTRS entry (ENOENT) is tolerated: sa_obj keeps its
+ * initial value of 0 and is passed to sa_setup() unchanged.  Any other
+ * zap_lookup() error, or the sa_setup() result, is returned to the caller.
+ */
static int
-zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir,
- sa_attr_type_t *sa_table)
+zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table)
+{
+ uint64_t sa_obj = 0;
+ int error;
+
+ error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj);
+ if (error != 0 && error != ENOENT)
+ return (error);
+
+ error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table);
+ return (error);
+}
+
+/*
+ * Hold the buffer for obj (sa_buf_hold) and create a private SA handle
+ * (SA_HDL_PRIVATE) for it.  On success returns 0 with *db and *hdlp set.
+ *
+ * Returns ENOTSUP, after dropping the buffer hold, when the bonus type is
+ * neither DMU_OT_SA nor DMU_OT_ZNODE, or is DMU_OT_ZNODE with a bonus
+ * smaller than znode_phys_t.  A sa_buf_hold() or sa_handle_get() failure
+ * is returned as is; in the latter case the buffer hold is dropped first.
+ *
+ * NOTE(review): the hold is tagged with this function's FTAG; confirm that
+ * whoever releases *db uses a matching tag.
+ */
+static int
+zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp,
+ dmu_buf_t **db)
{
- dmu_buf_t *db;
dmu_object_info_t doi;
int error;
- uint64_t parent;
- uint64_t pflags;
- uint64_t mode;
- sa_bulk_attr_t bulk[3];
- sa_handle_t *hdl;
- int count = 0;
- if ((error = sa_buf_hold(osp, obj, FTAG, &db)) != 0)
+ if ((error = sa_buf_hold(osp, obj, FTAG, db)) != 0)
return (error);
- dmu_object_info_from_db(db, &doi);
+ dmu_object_info_from_db(*db, &doi);
if ((doi.doi_bonus_type != DMU_OT_SA &&
doi.doi_bonus_type != DMU_OT_ZNODE) ||
doi.doi_bonus_type == DMU_OT_ZNODE &&
doi.doi_bonus_size < sizeof (znode_phys_t)) {
- sa_buf_rele(db, FTAG);
- return (EINVAL);
+ sa_buf_rele(*db, FTAG);
+ return (ENOTSUP);
}
- if ((error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE,
- &hdl)) != 0) {
- sa_buf_rele(db, FTAG);
+ error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp);
+ if (error != 0) {
+ sa_buf_rele(*db, FTAG);
return (error);
}
- SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT],
- NULL, &parent, 8);
+ return (0);
+}
+
+/*
+ * Destroy the SA handle hdl, then drop the hold on its buffer db.
+ *
+ * NOTE(review): the buffer is released with this function's FTAG, while
+ * zfs_grab_sa_handle() takes the hold with its own FTAG.  If FTAG expands
+ * to the enclosing function name the two tags differ; confirm this is
+ * safe when hold tracking is enabled.
+ */
+void
+zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db)
+{
+ sa_handle_destroy(hdl);
+ sa_buf_rele(db, FTAG);
+}
+
+/*
+ * Given an SA handle for an object, return its parent object number and
+ * whether or not the object is an extended attribute directory.
+ *
+ * The caller keeps ownership of hdl; it is neither destroyed nor released
+ * here.  If sa_bulk_lookup() fails its error is returned and *pobjp and
+ * *is_xattrdir are left untouched.
+ */
+static int
+zfs_obj_to_pobj(sa_handle_t *hdl, sa_attr_type_t *sa_table, uint64_t *pobjp,
+ int *is_xattrdir)
+{
+ uint64_t parent;
+ uint64_t pflags;
+ uint64_t mode;
+ sa_bulk_attr_t bulk[3];
+ int count = 0;
+ int error;
+
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL,
+ &parent, sizeof (parent));
SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL,
- &pflags, 8);
+ &pflags, sizeof (pflags));
SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
- &mode, 8);
+ &mode, sizeof (mode));
- if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0) {
- sa_buf_rele(db, FTAG);
- sa_handle_destroy(hdl);
+ if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0)
return (error);
- }
+
*pobjp = parent;
*is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode);
- sa_handle_destroy(hdl);
- sa_buf_rele(db, FTAG);
return (0);
}
-int
-zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
+/*
+ * Given an SA handle for an object, fill in sb with its ZPL-level
+ * statistics: mode (ZPL_MODE), generation (ZPL_GEN), link count
+ * (ZPL_LINKS) and ctime (ZPL_CTIME).  All four are fetched with a single
+ * sa_bulk_lookup(), whose result is returned.
+ */
+static int
+zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table,
+ zfs_stat_t *sb)
{
+ sa_bulk_attr_t bulk[4];
+ int count = 0;
+
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
+ &sb->zs_mode, sizeof (sb->zs_mode));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL,
+ &sb->zs_gen, sizeof (sb->zs_gen));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL,
+ &sb->zs_links, sizeof (sb->zs_links));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL,
+ &sb->zs_ctime, sizeof (sb->zs_ctime));
+
+ return (sa_bulk_lookup(hdl, bulk, count));
+}
+
+static int
+zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
+ sa_attr_type_t *sa_table, char *buf, int len)
+{
+ sa_handle_t *sa_hdl;
+ sa_handle_t *prevhdl = NULL;
+ dmu_buf_t *prevdb = NULL;
+ dmu_buf_t *sa_db = NULL;
char *path = buf + len - 1;
- sa_attr_type_t *sa_table;
int error;
- uint64_t sa_obj = 0;
*path = '\0';
-
- error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj);
-
- if (error != 0 && error != ENOENT)
- return (error);
-
- sa_table = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END);
+ sa_hdl = hdl;
for (;;) {
uint64_t pobj;
@@ -1947,8 +2014,11 @@ zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
size_t complen;
int is_xattrdir;
- if ((error = zfs_obj_to_pobj(osp, obj, &pobj,
- &is_xattrdir, sa_table)) != 0)
+ if (prevdb)
+ zfs_release_sa_handle(prevhdl, prevdb);
+
+ if ((error = zfs_obj_to_pobj(sa_hdl, sa_table, &pobj,
+ &is_xattrdir)) != 0)
break;
if (pobj == obj) {
@@ -1972,6 +2042,22 @@ zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
ASSERT(path >= buf);
bcopy(component, path, complen);
obj = pobj;
+
+ if (sa_hdl != hdl) {
+ prevhdl = sa_hdl;
+ prevdb = sa_db;
+ }
+ error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db);
+ if (error != 0) {
+ sa_hdl = prevhdl;
+ sa_db = prevdb;
+ break;
+ }
+ }
+
+ if (sa_hdl != NULL && sa_hdl != hdl) {
+ ASSERT(sa_db != NULL);
+ zfs_release_sa_handle(sa_hdl, sa_db);
}
if (error == 0)
@@ -1979,3 +2065,57 @@ zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
return (error);
}
+
+/*
+ * Resolve obj in osp to a path, written into buf (len bytes) by
+ * zfs_obj_to_path_impl().
+ *
+ * Sets up the SA attribute table, grabs a private SA handle for obj, runs
+ * the path walk and releases the handle again.  Returns 0 or the first
+ * error from any of those steps.
+ */
+int
+zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
+{
+ sa_attr_type_t *sa_table;
+ sa_handle_t *hdl;
+ dmu_buf_t *db;
+ int error;
+
+ error = zfs_sa_setup(osp, &sa_table);
+ if (error != 0)
+ return (error);
+
+ error = zfs_grab_sa_handle(osp, obj, &hdl, &db);
+ if (error != 0)
+ return (error);
+
+ error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
+
+ /* The handle is released whether or not the path walk succeeded. */
+ zfs_release_sa_handle(hdl, db);
+ return (error);
+}
+
+/*
+ * Like zfs_obj_to_path(), but first fills in sb with the object's
+ * ZPL-level statistics via zfs_obj_to_stats_impl().
+ *
+ * The same SA handle is used for the statistics lookup and the path walk
+ * and is released on every exit path after it was grabbed.  Returns 0 or
+ * the first error encountered; if the statistics lookup fails the path
+ * walk is not attempted.
+ *
+ * NOTE(review): buf + len - 1 is written unconditionally, so this assumes
+ * len > 0 -- confirm against callers.
+ */
+int
+zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
+ char *buf, int len)
+{
+ char *path = buf + len - 1;
+ sa_attr_type_t *sa_table;
+ sa_handle_t *hdl;
+ dmu_buf_t *db;
+ int error;
+
+ /* Terminate the last byte of buf up front, before any early return. */
+ *path = '\0';
+
+ error = zfs_sa_setup(osp, &sa_table);
+ if (error != 0)
+ return (error);
+
+ error = zfs_grab_sa_handle(osp, obj, &hdl, &db);
+ if (error != 0)
+ return (error);
+
+ error = zfs_obj_to_stats_impl(hdl, sa_table, sb);
+ if (error != 0) {
+ zfs_release_sa_handle(hdl, db);
+ return (error);
+ }
+
+ error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
+
+ zfs_release_sa_handle(hdl, db);
+ return (error);
+}
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index 4aa4d10b0..c66313ff6 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -34,7 +34,7 @@
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/dsl_dataset.h>
-#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_pool.h>
@@ -78,12 +78,21 @@ boolean_t zfs_nocacheflush = B_FALSE;
static kmem_cache_t *zil_lwb_cache;
-static boolean_t zil_empty(zilog_t *zilog);
+static void zil_async_to_sync(zilog_t *zilog, uint64_t foid);
#define LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \
sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused))
+/*
+ * ziltest is by and large an ugly hack, but very useful in
+ * checking replay without tedious work.
+ * When running ziltest we want to keep all itx's and so maintain
+ * a single list in the zl_itxg[] that uses a high txg: ZILTEST_TXG
+ * We subtract TXG_CONCURRENT_STATES to allow for common code.
+ */
+#define ZILTEST_TXG (UINT64_MAX - TXG_CONCURRENT_STATES)
+
static int
zil_bp_compare(const void *x1, const void *x2)
{
@@ -631,6 +640,7 @@ zil_check_log_chain(const char *osname, void *tx)
{
zilog_t *zilog;
objset_t *os;
+ blkptr_t *bp;
int error;
ASSERT(tx == NULL);
@@ -642,6 +652,29 @@ zil_check_log_chain(const char *osname, void *tx)
}
zilog = dmu_objset_zil(os);
+ bp = (blkptr_t *)&zilog->zl_header->zh_log;
+
+ /*
+ * Check the first block and determine if it's on a log device
+ * which may have been removed or faulted prior to loading this
+ * pool. If so, there's no point in checking the rest of the log
+ * as its content should have already been synced to the pool.
+ */
+ if (!BP_IS_HOLE(bp)) {
+ vdev_t *vd;
+ boolean_t valid = B_TRUE;
+
+ spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER);
+ vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0]));
+ if (vd->vdev_islog && vdev_is_dead(vd))
+ valid = vdev_log_state_valid(vd);
+ spa_config_exit(os->os_spa, SCL_STATE, FTAG);
+
+ if (!valid) {
+ dmu_objset_rele(os, FTAG);
+ return (0);
+ }
+ }
/*
* Because tx == NULL, zil_claim_log_block() will not actually claim
@@ -661,8 +694,8 @@ zil_check_log_chain(const char *osname, void *tx)
static int
zil_vdev_compare(const void *x1, const void *x2)
{
- uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
- uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;
+ const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
+ const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;
if (v1 < v2)
return (-1);
@@ -703,7 +736,7 @@ zil_add_block(zilog_t *zilog, const blkptr_t *bp)
mutex_exit(&zilog->zl_vdev_lock);
}
-void
+static void
zil_flush_vdevs(zilog_t *zilog)
{
spa_t *spa = zilog->zl_spa;
@@ -1045,6 +1078,7 @@ zil_itx_create(uint64_t txtype, size_t lrsize)
itx->itx_lr.lrc_reclen = lrsize;
itx->itx_sod = lrsize; /* if write & WR_NEED_COPY will be increased */
itx->itx_lr.lrc_seq = 0; /* defensive */
+ itx->itx_sync = B_TRUE; /* default is synchronous */
return (itx);
}
@@ -1055,190 +1089,362 @@ zil_itx_destroy(itx_t *itx)
kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen);
}
-uint64_t
-zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
+/*
+ * Free every itx hanging off itxs -- those on the sync list and those on
+ * the per-object lists in the async tree -- then the async tree nodes and
+ * finally itxs itself.  The itxs_t has already been detached so no locks
+ * are needed.
+ *
+ * Each itx is freed with the same size computation zil_itx_destroy() uses.
+ *
+ * NOTE(review): each ia_list gets a list_destroy() but i_sync_list does
+ * not before itxs is freed; confirm that is intentional.
+ */
+static void
+zil_itxg_clean(itxs_t *itxs)
{
- uint64_t seq;
+ itx_t *itx;
+ list_t *list;
+ avl_tree_t *t;
+ void *cookie;
+ itx_async_node_t *ian;
+
+ /* Drain and free the sync itxs. */
+ list = &itxs->i_sync_list;
+ while ((itx = list_head(list)) != NULL) {
+ list_remove(list, itx);
+ kmem_free(itx, offsetof(itx_t, itx_lr) +
+ itx->itx_lr.lrc_reclen);
+ }
- ASSERT(itx->itx_lr.lrc_seq == 0);
- ASSERT(!zilog->zl_replay);
+ /* Tear down the async tree node by node, freeing each node's itxs. */
+ cookie = NULL;
+ t = &itxs->i_async_tree;
+ while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
+ list = &ian->ia_list;
+ while ((itx = list_head(list)) != NULL) {
+ list_remove(list, itx);
+ kmem_free(itx, offsetof(itx_t, itx_lr) +
+ itx->itx_lr.lrc_reclen);
+ }
+ list_destroy(list);
+ kmem_free(ian, sizeof (itx_async_node_t));
+ }
+ avl_destroy(t);
- mutex_enter(&zilog->zl_lock);
- list_insert_tail(&zilog->zl_itx_list, itx);
- zilog->zl_itx_list_sz += itx->itx_sod;
- itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
- itx->itx_lr.lrc_seq = seq = ++zilog->zl_itx_seq;
- mutex_exit(&zilog->zl_lock);
+ kmem_free(itxs, sizeof (itxs_t));
+}
+
+/*
+ * Three-way comparison of two itx_async_node_t by ia_foid: returns -1, 0
+ * or 1 when x1's ia_foid is less than, equal to or greater than x2's.
+ * Presumably the comparator of the i_async_tree AVL tree -- the tree
+ * creation is not visible here, confirm.
+ */
+static int
+zil_aitx_compare(const void *x1, const void *x2)
+{
+ const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid;
+ const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid;
- return (seq);
+ if (o1 < o2)
+ return (-1);
+ if (o1 > o2)
+ return (1);
+
+ return (0);
}
/*
- * Free up all in-memory intent log transactions that have now been synced.
+ * Remove and free all async itxs queued for the object oid (must be
+ * non-zero).
+ *
+ * Walks the TXG_CONCURRENT_STATES txgs starting at otxg, which is
+ * ZILTEST_TXG while the pool is frozen (ziltest) and otherwise the txg
+ * after the last synced one.  Under each itxg_lock the object's itx list
+ * is moved onto a local list; the itxs are freed only after the locks
+ * have been dropped.  The tree node itself is left in place.
*/
static void
-zil_itx_clean(zilog_t *zilog)
+zil_remove_async(zilog_t *zilog, uint64_t oid)
{
- uint64_t synced_txg = spa_last_synced_txg(zilog->zl_spa);
- uint64_t freeze_txg = spa_freeze_txg(zilog->zl_spa);
+ uint64_t otxg, txg;
+ itx_async_node_t *ian;
+ avl_tree_t *t;
+ avl_index_t where;
list_t clean_list;
itx_t *itx;
+ ASSERT(oid != 0);
list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node));
- mutex_enter(&zilog->zl_lock);
- /* wait for a log writer to finish walking list */
- while (zilog->zl_writer) {
- cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
- }
+ if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
+ otxg = ZILTEST_TXG;
+ else
+ otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
- /*
- * Move the sync'd log transactions to a separate list so we can call
- * kmem_free without holding the zl_lock.
- *
- * There is no need to set zl_writer as we don't drop zl_lock here
- */
- while ((itx = list_head(&zilog->zl_itx_list)) != NULL &&
- itx->itx_lr.lrc_txg <= MIN(synced_txg, freeze_txg)) {
- list_remove(&zilog->zl_itx_list, itx);
- zilog->zl_itx_list_sz -= itx->itx_sod;
- list_insert_tail(&clean_list, itx);
- }
- cv_broadcast(&zilog->zl_cv_writer);
- mutex_exit(&zilog->zl_lock);
+ for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
+ itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
- /* destroy sync'd log transactions */
+ mutex_enter(&itxg->itxg_lock);
+ /* This slot currently belongs to a different txg; skip it. */
+ if (itxg->itxg_txg != txg) {
+ mutex_exit(&itxg->itxg_lock);
+ continue;
+ }
+
+ /*
+ * Locate the object node and move its itxs onto clean_list.
+ * NOTE(review): avl_find() is handed &oid rather than an
+ * itx_async_node_t; this relies on ia_foid being the first
+ * member of the node -- confirm.
+ */
+ t = &itxg->itxg_itxs->i_async_tree;
+ ian = avl_find(t, &oid, &where);
+ if (ian != NULL)
+ list_move_tail(&clean_list, &ian->ia_list);
+ mutex_exit(&itxg->itxg_lock);
+ }
while ((itx = list_head(&clean_list)) != NULL) {
list_remove(&clean_list, itx);
- zil_itx_destroy(itx);
+ kmem_free(itx, offsetof(itx_t, itx_lr) +
+ itx->itx_lr.lrc_reclen);
}
list_destroy(&clean_list);
}
+void
+zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
+{
+ uint64_t txg;
+ itxg_t *itxg;
+ itxs_t *itxs, *clean = NULL;
+
+ /*
+ * Object ids can be re-instantiated in the next txg so
+ * remove any async transactions to avoid future leaks.
+ * This can happen if an fsync occurs on the re-instantiated
+ * object for a WR_INDIRECT or WR_NEED_COPY write, which gets
+ * the new file data and flushes a write record for the old object.
+ */
+ if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_REMOVE)
+ zil_remove_async(zilog, itx->itx_oid);
+
+ /*
+ * Ensure the data of a renamed file is committed before the rename.
+ */
+ if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME)
+ zil_async_to_sync(zilog, itx->itx_oid);
+
+ if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX)
+ txg = ZILTEST_TXG;
+ else
+ txg = dmu_tx_get_txg(tx);
+
+ itxg = &zilog->zl_itxg[txg & TXG_MASK];
+ mutex_enter(&itxg->itxg_lock);
+ itxs = itxg->itxg_itxs;
+ if (itxg->itxg_txg != txg) {
+ if (itxs != NULL) {
+ /*
+ * The zil_clean callback hasn't got around to cleaning
+ * this itxg. Save the itxs for release below.
+ * This should be rare.
+ */
+ atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod);
+ itxg->itxg_sod = 0;
+ clean = itxg->itxg_itxs;
+ }
+ ASSERT(itxg->itxg_sod == 0);
+ itxg->itxg_txg = txg;
+ itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP);
+
+ list_create(&itxs->i_sync_list, sizeof (itx_t),
+ offsetof(itx_t, itx_node));
+ avl_create(&itxs->i_async_tree, zil_aitx_compare,
+ sizeof (itx_async_node_t),
+ offsetof(itx_async_node_t, ia_node));
+ }
+ if (itx->itx_sync) {
+ list_insert_tail(&itxs->i_sync_list, itx);
+ atomic_add_64(&zilog->zl_itx_list_sz, itx->itx_sod);
+ itxg->itxg_sod += itx->itx_sod;
+ } else {
+ avl_tree_t *t = &itxs->i_async_tree;
+ uint64_t foid = ((lr_ooo_t *)&itx->itx_lr)->lr_foid;
+ itx_async_node_t *ian;
+ avl_index_t where;
+
+ ian = avl_find(t, &foid, &where);
+ if (ian == NULL) {
+ ian = kmem_alloc(sizeof (itx_async_node_t), KM_SLEEP);
+ list_create(&ian->ia_list, sizeof (itx_t),
+ offsetof(itx_t, itx_node));
+ ian->ia_foid = foid;
+ avl_insert(t, ian, where);
+ }
+ list_insert_tail(&ian->ia_list, itx);
+ }
+
+ itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
+ mutex_exit(&itxg->itxg_lock);
+
+ /* Release the old itxs now we've dropped the lock */
+ if (clean != NULL)
+ zil_itxg_clean(clean);
+}
+
/*
* If there are any in-memory intent log transactions which have now been
* synced then start up a taskq to free them.
*/
void
-zil_clean(zilog_t *zilog)
+zil_clean(zilog_t *zilog, uint64_t synced_txg)
{
- itx_t *itx;
+ itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK];
+ itxs_t *clean_me;
- mutex_enter(&zilog->zl_lock);
- itx = list_head(&zilog->zl_itx_list);
- if ((itx != NULL) &&
- (itx->itx_lr.lrc_txg <= spa_last_synced_txg(zilog->zl_spa))) {
- (void) taskq_dispatch(zilog->zl_clean_taskq,
- (task_func_t *)zil_itx_clean, zilog, TQ_NOSLEEP);
+ mutex_enter(&itxg->itxg_lock);
+ if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) {
+ mutex_exit(&itxg->itxg_lock);
+ return;
+ }
+ ASSERT3U(itxg->itxg_txg, <=, synced_txg);
+ ASSERT(itxg->itxg_txg != 0);
+ ASSERT(zilog->zl_clean_taskq != NULL);
+ atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod);
+ itxg->itxg_sod = 0;
+ clean_me = itxg->itxg_itxs;
+ itxg->itxg_itxs = NULL;
+ itxg->itxg_txg = 0;
+ mutex_exit(&itxg->itxg_lock);
+ /*
+ * Preferably start a task queue to free up the old itxs but
+ * if taskq_dispatch can't allocate resources to do that then
+ * free it in-line. This should be rare. Note, using TQ_SLEEP
+ * created a bad performance problem.
+ */
+ if (taskq_dispatch(zilog->zl_clean_taskq,
+ (void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP) == NULL)
+ zil_itxg_clean(clean_me);
+}
+
+/*
+ * Get the list of itxs to commit into zl_itx_commit_list.
+ */
+static void
+zil_get_commit_list(zilog_t *zilog)
+{
+ uint64_t otxg, txg;
+ list_t *commit_list = &zilog->zl_itx_commit_list;
+ uint64_t push_sod = 0;
+
+ if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
+ otxg = ZILTEST_TXG;
+ else
+ otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
+
+ for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
+ itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
+
+ mutex_enter(&itxg->itxg_lock);
+ if (itxg->itxg_txg != txg) {
+ mutex_exit(&itxg->itxg_lock);
+ continue;
+ }
+
+ list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list);
+ push_sod += itxg->itxg_sod;
+ itxg->itxg_sod = 0;
+
+ mutex_exit(&itxg->itxg_lock);
+ }
+ atomic_add_64(&zilog->zl_itx_list_sz, -push_sod);
+}
+
+/*
+ * Move the async itxs for a specified object to commit into sync lists.
+ */
+static void
+zil_async_to_sync(zilog_t *zilog, uint64_t foid)
+{
+ uint64_t otxg, txg;
+ itx_async_node_t *ian;
+ avl_tree_t *t;
+ avl_index_t where;
+
+ if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
+ otxg = ZILTEST_TXG;
+ else
+ otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
+
+ for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
+ itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
+
+ mutex_enter(&itxg->itxg_lock);
+ if (itxg->itxg_txg != txg) {
+ mutex_exit(&itxg->itxg_lock);
+ continue;
+ }
+
+ /*
+ * If a foid is specified then find that node and append its
+ * list. Otherwise walk the tree appending all the lists
+ * to the sync list. We add to the end rather than the
+ * beginning to ensure the create has happened.
+ */
+ t = &itxg->itxg_itxs->i_async_tree;
+ if (foid != 0) {
+ ian = avl_find(t, &foid, &where);
+ if (ian != NULL) {
+ list_move_tail(&itxg->itxg_itxs->i_sync_list,
+ &ian->ia_list);
+ }
+ } else {
+ void *cookie = NULL;
+
+ while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
+ list_move_tail(&itxg->itxg_itxs->i_sync_list,
+ &ian->ia_list);
+ list_destroy(&ian->ia_list);
+ kmem_free(ian, sizeof (itx_async_node_t));
+ }
+ }
+ mutex_exit(&itxg->itxg_lock);
}
- mutex_exit(&zilog->zl_lock);
}
static void
-zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid)
+zil_commit_writer(zilog_t *zilog)
{
uint64_t txg;
- uint64_t commit_seq = 0;
- itx_t *itx, *itx_next;
+ itx_t *itx;
lwb_t *lwb;
- spa_t *spa;
+ spa_t *spa = zilog->zl_spa;
int error = 0;
- zilog->zl_writer = B_TRUE;
ASSERT(zilog->zl_root_zio == NULL);
- spa = zilog->zl_spa;
+
+ mutex_exit(&zilog->zl_lock);
+
+ zil_get_commit_list(zilog);
+
+ /*
+ * Return if there's nothing to commit before we dirty the fs by
+ * calling zil_create().
+ */
+ if (list_head(&zilog->zl_itx_commit_list) == NULL) {
+ mutex_enter(&zilog->zl_lock);
+ return;
+ }
if (zilog->zl_suspend) {
lwb = NULL;
} else {
lwb = list_tail(&zilog->zl_lwb_list);
- if (lwb == NULL) {
- /*
- * Return if there's nothing to flush before we
- * dirty the fs by calling zil_create()
- */
- if (list_is_empty(&zilog->zl_itx_list)) {
- zilog->zl_writer = B_FALSE;
- return;
- }
- mutex_exit(&zilog->zl_lock);
+ if (lwb == NULL)
lwb = zil_create(zilog);
- mutex_enter(&zilog->zl_lock);
- }
}
- ASSERT(lwb == NULL || lwb->lwb_zio == NULL);
- /* Loop through in-memory log transactions filling log blocks. */
DTRACE_PROBE1(zil__cw1, zilog_t *, zilog);
-
- for (itx = list_head(&zilog->zl_itx_list); itx; itx = itx_next) {
- /*
- * Save the next pointer. Even though we drop zl_lock below,
- * all threads that can remove itx list entries (other writers
- * and zil_itx_clean()) can't do so until they have zl_writer.
- */
- itx_next = list_next(&zilog->zl_itx_list, itx);
-
- /*
- * Determine whether to push this itx.
- * Push all transactions related to specified foid and
- * all other transactions except those that can be logged
- * out of order (TX_WRITE, TX_TRUNCATE, TX_SETATTR, TX_ACL)
- * for all other files.
- *
- * If foid == 0 (meaning "push all foids") or
- * itx->itx_sync is set (meaning O_[D]SYNC), push regardless.
- */
- if (foid != 0 && !itx->itx_sync &&
- TX_OOO(itx->itx_lr.lrc_txtype) &&
- ((lr_ooo_t *)&itx->itx_lr)->lr_foid != foid)
- continue; /* skip this record */
-
- if ((itx->itx_lr.lrc_seq > seq) &&
- ((lwb == NULL) || (LWB_EMPTY(lwb)) ||
- (lwb->lwb_nused + itx->itx_sod > lwb->lwb_sz)))
- break;
-
- list_remove(&zilog->zl_itx_list, itx);
- zilog->zl_itx_list_sz -= itx->itx_sod;
-
- mutex_exit(&zilog->zl_lock);
-
+ while (itx = list_head(&zilog->zl_itx_commit_list)) {
txg = itx->itx_lr.lrc_txg;
ASSERT(txg);
- if (txg > spa_last_synced_txg(spa) ||
- txg > spa_freeze_txg(spa))
+ if (txg > spa_last_synced_txg(spa) || txg > spa_freeze_txg(spa))
lwb = zil_lwb_commit(zilog, itx, lwb);
-
- zil_itx_destroy(itx);
-
- mutex_enter(&zilog->zl_lock);
+ list_remove(&zilog->zl_itx_commit_list, itx);
+ kmem_free(itx, offsetof(itx_t, itx_lr)
+ + itx->itx_lr.lrc_reclen);
}
DTRACE_PROBE1(zil__cw2, zilog_t *, zilog);
- /* determine commit sequence number */
- itx = list_head(&zilog->zl_itx_list);
- if (itx)
- commit_seq = itx->itx_lr.lrc_seq - 1;
- else
- commit_seq = zilog->zl_itx_seq;
- mutex_exit(&zilog->zl_lock);
/* write the last block out */
if (lwb != NULL && lwb->lwb_zio != NULL)
lwb = zil_lwb_write_start(zilog, lwb);
- zilog->zl_prev_used = zilog->zl_cur_used;
zilog->zl_cur_used = 0;
/*
* Wait if necessary for the log blocks to be on stable storage.
*/
if (zilog->zl_root_zio) {
- DTRACE_PROBE1(zil__cw3, zilog_t *, zilog);
error = zio_wait(zilog->zl_root_zio);
zilog->zl_root_zio = NULL;
- DTRACE_PROBE1(zil__cw4, zilog_t *, zilog);
zil_flush_vdevs(zilog);
}
@@ -1246,10 +1452,6 @@ zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid)
txg_wait_synced(zilog->zl_dmu_pool, 0);
mutex_enter(&zilog->zl_lock);
- zilog->zl_writer = B_FALSE;
-
- ASSERT3U(commit_seq, >=, zilog->zl_commit_seq);
- zilog->zl_commit_seq = commit_seq;
/*
* Remember the highest committed log sequence number for ztest.
@@ -1261,58 +1463,61 @@ zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid)
}
/*
- * Push zfs transactions to stable storage up to the supplied sequence number.
+ * Commit zfs transactions to stable storage.
* If foid is 0 push out all transactions, otherwise push only those
- * for that file or might have been used to create that file.
+ * for that object or those that might reference that object.
+ *
+ * itxs are committed in batches. In a heavily stressed zil there will be
+ * a commit writer thread who is writing out a bunch of itxs to the log
+ * for a set of committing threads (cthreads) in the same batch as the writer.
+ * Those cthreads are all waiting on the same cv for that batch.
+ *
+ * There will also be a different and growing batch of threads that are
+ * waiting to commit (qthreads). When the committing batch completes
+ * a transition occurs such that the cthreads exit and the qthreads become
+ * cthreads. One of the new cthreads becomes the writer thread for the
+ * batch. Any new threads arriving become new qthreads.
+ *
+ * Only 2 condition variables are needed, and no transition between
+ * the two cvs is needed. They just flip-flop between qthreads
+ * and cthreads.
+ *
+ * Using this scheme we can efficiently wake up only those threads
+ * that have been committed.
*/
void
-zil_commit(zilog_t *zilog, uint64_t seq, uint64_t foid)
+zil_commit(zilog_t *zilog, uint64_t foid)
{
- if (zilog->zl_sync == ZFS_SYNC_DISABLED || seq == 0)
- return;
+ uint64_t mybatch;
- mutex_enter(&zilog->zl_lock);
+ if (zilog->zl_sync == ZFS_SYNC_DISABLED)
+ return;
- seq = MIN(seq, zilog->zl_itx_seq); /* cap seq at largest itx seq */
+ /* move the async itxs for the foid to the sync queues */
+ zil_async_to_sync(zilog, foid);
+ mutex_enter(&zilog->zl_lock);
+ mybatch = zilog->zl_next_batch;
while (zilog->zl_writer) {
- cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
- if (seq <= zilog->zl_commit_seq) {
+ cv_wait(&zilog->zl_cv_batch[mybatch & 1], &zilog->zl_lock);
+ if (mybatch <= zilog->zl_com_batch) {
mutex_exit(&zilog->zl_lock);
return;
}
}
- zil_commit_writer(zilog, seq, foid); /* drops zl_lock */
- /* wake up others waiting on the commit */
- cv_broadcast(&zilog->zl_cv_writer);
- mutex_exit(&zilog->zl_lock);
-}
-
-/*
- * Report whether all transactions are committed.
- */
-static boolean_t
-zil_is_committed(zilog_t *zilog)
-{
- lwb_t *lwb;
- boolean_t committed;
-
- mutex_enter(&zilog->zl_lock);
- while (zilog->zl_writer)
- cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
+ zilog->zl_next_batch++;
+ zilog->zl_writer = B_TRUE;
+ zil_commit_writer(zilog);
+ zilog->zl_com_batch = mybatch;
+ zilog->zl_writer = B_FALSE;
+ mutex_exit(&zilog->zl_lock);
- if (!list_is_empty(&zilog->zl_itx_list))
- committed = B_FALSE; /* unpushed transactions */
- else if ((lwb = list_head(&zilog->zl_lwb_list)) == NULL)
- committed = B_TRUE; /* intent log never used */
- else if (list_next(&zilog->zl_lwb_list, lwb) != NULL)
- committed = B_FALSE; /* zil_sync() not done yet */
- else
- committed = B_TRUE; /* everything synced */
+ /* wake up one thread to become the next writer */
+ cv_signal(&zilog->zl_cv_batch[(mybatch+1) & 1]);
- mutex_exit(&zilog->zl_lock);
- return (committed);
+ /* wake up all threads waiting for this batch to be committed */
+ cv_broadcast(&zilog->zl_cv_batch[mybatch & 1]);
}
/*
@@ -1425,15 +1630,21 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys)
zilog->zl_destroy_txg = TXG_INITIAL - 1;
zilog->zl_logbias = dmu_objset_logbias(os);
zilog->zl_sync = dmu_objset_syncprop(os);
+ zilog->zl_next_batch = 1;
mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL);
- list_create(&zilog->zl_itx_list, sizeof (itx_t),
- offsetof(itx_t, itx_node));
+ for (int i = 0; i < TXG_SIZE; i++) {
+ mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL,
+ MUTEX_DEFAULT, NULL);
+ }
list_create(&zilog->zl_lwb_list, sizeof (lwb_t),
offsetof(lwb_t, lwb_node));
+ list_create(&zilog->zl_itx_commit_list, sizeof (itx_t),
+ offsetof(itx_t, itx_node));
+
mutex_init(&zilog->zl_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
avl_create(&zilog->zl_vdev_tree, zil_vdev_compare,
@@ -1441,6 +1652,8 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys)
cv_init(&zilog->zl_cv_writer, NULL, CV_DEFAULT, NULL);
cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL);
+ cv_init(&zilog->zl_cv_batch[0], NULL, CV_DEFAULT, NULL);
+ cv_init(&zilog->zl_cv_batch[1], NULL, CV_DEFAULT, NULL);
return (zilog);
}
@@ -1448,27 +1661,47 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys)
void
zil_free(zilog_t *zilog)
{
- lwb_t *lwb;
+ lwb_t *head_lwb;
zilog->zl_stop_sync = 1;
- while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
- list_remove(&zilog->zl_lwb_list, lwb);
- if (lwb->lwb_buf != NULL)
- zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
- kmem_cache_free(zil_lwb_cache, lwb);
+ /*
+ * After zil_close() there should only be one lwb with a buffer.
+ */
+ head_lwb = list_head(&zilog->zl_lwb_list);
+ if (head_lwb) {
+ ASSERT(head_lwb == list_tail(&zilog->zl_lwb_list));
+ list_remove(&zilog->zl_lwb_list, head_lwb);
+ zio_buf_free(head_lwb->lwb_buf, head_lwb->lwb_sz);
+ kmem_cache_free(zil_lwb_cache, head_lwb);
}
list_destroy(&zilog->zl_lwb_list);
avl_destroy(&zilog->zl_vdev_tree);
mutex_destroy(&zilog->zl_vdev_lock);
- ASSERT(list_head(&zilog->zl_itx_list) == NULL);
- list_destroy(&zilog->zl_itx_list);
+ ASSERT(list_is_empty(&zilog->zl_itx_commit_list));
+ list_destroy(&zilog->zl_itx_commit_list);
+
+ for (int i = 0; i < TXG_SIZE; i++) {
+ /*
+ * It's possible for an itx to be generated that doesn't dirty
+ * a txg (e.g. ztest TX_TRUNCATE). So there's no zil_clean()
+ * callback to remove the entry. We remove those here.
+ *
+ * Also free up the ziltest itxs.
+ */
+ if (zilog->zl_itxg[i].itxg_itxs)
+ zil_itxg_clean(zilog->zl_itxg[i].itxg_itxs);
+ mutex_destroy(&zilog->zl_itxg[i].itxg_lock);
+ }
+
mutex_destroy(&zilog->zl_lock);
cv_destroy(&zilog->zl_cv_writer);
cv_destroy(&zilog->zl_cv_suspend);
+ cv_destroy(&zilog->zl_cv_batch[0]);
+ cv_destroy(&zilog->zl_cv_batch[1]);
kmem_free(zilog, sizeof (zilog_t));
}
@@ -1494,26 +1727,28 @@ zil_open(objset_t *os, zil_get_data_t *get_data)
void
zil_close(zilog_t *zilog)
{
+ lwb_t *tail_lwb;
+ uint64_t txg = 0;
+
+ zil_commit(zilog, 0); /* commit all itx */
+
/*
- * If the log isn't already committed, mark the objset dirty
- * (so zil_sync() will be called) and wait for that txg to sync.
+ * The lwb_max_txg for the stubby lwb will reflect the last activity
+ * for the zil. After a txg_wait_synced() on the txg we know all the
+ * callbacks have occurred that may clean the zil. Only then can we
+ * destroy the zl_clean_taskq.
*/
- if (!zil_is_committed(zilog)) {
- uint64_t txg;
- dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);
- VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
- dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
- txg = dmu_tx_get_txg(tx);
- dmu_tx_commit(tx);
+ mutex_enter(&zilog->zl_lock);
+ tail_lwb = list_tail(&zilog->zl_lwb_list);
+ if (tail_lwb != NULL)
+ txg = tail_lwb->lwb_max_txg;
+ mutex_exit(&zilog->zl_lock);
+ if (txg)
txg_wait_synced(zilog->zl_dmu_pool, txg);
- }
taskq_destroy(zilog->zl_clean_taskq);
zilog->zl_clean_taskq = NULL;
zilog->zl_get_data = NULL;
-
- zil_itx_clean(zilog);
- ASSERT(list_head(&zilog->zl_itx_list) == NULL);
}
/*
@@ -1545,15 +1780,7 @@ zil_suspend(zilog_t *zilog)
zilog->zl_suspending = B_TRUE;
mutex_exit(&zilog->zl_lock);
- zil_commit(zilog, UINT64_MAX, 0);
-
- /*
- * Wait for any in-flight log writes to complete.
- */
- mutex_enter(&zilog->zl_lock);
- while (zilog->zl_writer)
- cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
- mutex_exit(&zilog->zl_lock);
+ zil_commit(zilog, 0);
zil_destroy(zilog, B_FALSE);
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 88d80af4e..1ba2330bd 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -2247,6 +2247,26 @@ zio_vdev_io_start(zio_t *zio)
return (vdev_mirror_ops.vdev_op_io_start(zio));
}
+ /*
+ * We keep track of time-sensitive I/Os so that the scan thread
+ * can quickly react to certain workloads. In particular, we care
+ * about non-scrubbing, top-level reads and writes with the following
+ * characteristics:
+ * - synchronous writes of user data to non-slog devices
+ * - any reads of user data
+ * When these conditions are met, adjust the timestamp of spa_last_io
+ * which allows the scan thread to adjust its workload accordingly.
+ */
+ if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
+ vd == vd->vdev_top && !vd->vdev_islog &&
+ zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
+ zio->io_txg != spa_syncing_txg(spa)) {
+ uint64_t old = spa->spa_last_io;
+ uint64_t new = ddi_get_lbolt64();
+ if (old != new)
+ (void) atomic_cas_64(&spa->spa_last_io, old, new);
+ }
+
align = 1ULL << vd->vdev_top->vdev_ashift;
if (P2PHASE(zio->io_size, align) != 0) {
@@ -2262,7 +2282,7 @@ zio_vdev_io_start(zio_t *zio)
ASSERT(P2PHASE(zio->io_offset, align) == 0);
ASSERT(P2PHASE(zio->io_size, align) == 0);
- ASSERT(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
+ VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
/*
* If this is a repair I/O, and there's no self-healing involved --
@@ -2744,6 +2764,7 @@ zio_done(zio_t *zio)
if ((zio->io_type == ZIO_TYPE_READ ||
zio->io_type == ZIO_TYPE_FREE) &&
+ !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
zio->io_error == ENXIO &&
spa_load_state(spa) == SPA_LOAD_NONE &&
spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c
index 16eaed668..9ae7d1f69 100644
--- a/module/zfs/zio_inject.c
+++ b/module/zfs/zio_inject.c
@@ -476,7 +476,6 @@ int
zio_clear_fault(int id)
{
inject_handler_t *handler;
- int ret;
rw_enter(&inject_lock, RW_WRITER);
@@ -486,18 +485,18 @@ zio_clear_fault(int id)
break;
if (handler == NULL) {
- ret = ENOENT;
- } else {
- list_remove(&inject_handlers, handler);
- spa_inject_delref(handler->zi_spa);
- kmem_free(handler, sizeof (inject_handler_t));
- atomic_add_32(&zio_injection_enabled, -1);
- ret = 0;
+ rw_exit(&inject_lock);
+ return (ENOENT);
}
+ list_remove(&inject_handlers, handler);
rw_exit(&inject_lock);
- return (ret);
+ spa_inject_delref(handler->zi_spa);
+ kmem_free(handler, sizeof (inject_handler_t));
+ atomic_add_32(&zio_injection_enabled, -1);
+
+ return (0);
}
void
diff --git a/module/zfs/zrlock.c b/module/zfs/zrlock.c
new file mode 100644
index 000000000..ec94b0855
--- /dev/null
+++ b/module/zfs/zrlock.c
@@ -0,0 +1,194 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * A Zero Reference Lock (ZRL) is a reference count that can lock out new
+ * references only when the count is zero and only without waiting if the count
+ * is not already zero. It is similar to a read-write lock in that it allows
+ * multiple readers and only a single writer, but it does not allow a writer to
+ * block while waiting for readers to exit, and therefore the question of
+ * reader/writer priority is moot (no WRWANT bit). Since the equivalent of
+ * rw_enter(&lock, RW_WRITER) is disallowed and only tryenter() is allowed, it
+ * is perfectly safe for the same reader to acquire the same lock multiple
+ * times. The fact that a ZRL is reentrant for readers (through multiple calls
+ * to zrl_add()) makes it convenient for determining whether something is
+ * actively referenced without the fuss of flagging lock ownership across
+ * function calls.
+ */
+#include <sys/zrlock.h>
+
+/*
+ * A ZRL can be locked only while there are zero references, so ZRL_LOCKED is
+ * treated as zero references.
+ */
+#define ZRL_LOCKED ((uint32_t)-1)
+#define ZRL_DESTROYED -2
+
+void
+zrl_init(zrlock_t *zrl)
+{
+ mutex_init(&zrl->zr_mtx, NULL, MUTEX_DEFAULT, NULL);
+ zrl->zr_refcount = 0;
+ cv_init(&zrl->zr_cv, NULL, CV_DEFAULT, NULL);
+#ifdef ZFS_DEBUG
+ zrl->zr_owner = NULL;
+ zrl->zr_caller = NULL;
+#endif
+}
+
+void
+zrl_destroy(zrlock_t *zrl)
+{
+ ASSERT(zrl->zr_refcount == 0);
+
+ mutex_destroy(&zrl->zr_mtx);
+ zrl->zr_refcount = ZRL_DESTROYED;
+ cv_destroy(&zrl->zr_cv);
+}
+
+void
+#ifdef ZFS_DEBUG
+zrl_add_debug(zrlock_t *zrl, const char *zc)
+#else
+zrl_add(zrlock_t *zrl)
+#endif
+{
+ uint32_t n = (uint32_t)zrl->zr_refcount;
+
+ while (n != ZRL_LOCKED) {
+ uint32_t cas = atomic_cas_32(
+ (uint32_t *)&zrl->zr_refcount, n, n + 1);
+ if (cas == n) {
+ ASSERT((int32_t)n >= 0);
+#ifdef ZFS_DEBUG
+ if (zrl->zr_owner == curthread) {
+ DTRACE_PROBE2(zrlock__reentry,
+ zrlock_t *, zrl, uint32_t, n);
+ }
+ zrl->zr_owner = curthread;
+ zrl->zr_caller = zc;
+#endif
+ return;
+ }
+ n = cas;
+ }
+
+ mutex_enter(&zrl->zr_mtx);
+ while (zrl->zr_refcount == ZRL_LOCKED) {
+ cv_wait(&zrl->zr_cv, &zrl->zr_mtx);
+ }
+ ASSERT(zrl->zr_refcount >= 0);
+ zrl->zr_refcount++;
+#ifdef ZFS_DEBUG
+ zrl->zr_owner = curthread;
+ zrl->zr_caller = zc;
+#endif
+ mutex_exit(&zrl->zr_mtx);
+}
+
+void
+zrl_remove(zrlock_t *zrl)
+{
+ uint32_t n;
+
+ n = atomic_dec_32_nv((uint32_t *)&zrl->zr_refcount);
+ ASSERT((int32_t)n >= 0);
+#ifdef ZFS_DEBUG
+ if (zrl->zr_owner == curthread) {
+ zrl->zr_owner = NULL;
+ zrl->zr_caller = NULL;
+ }
+#endif
+}
+
+int
+zrl_tryenter(zrlock_t *zrl)
+{
+ uint32_t n = (uint32_t)zrl->zr_refcount;
+
+ if (n == 0) {
+ uint32_t cas = atomic_cas_32(
+ (uint32_t *)&zrl->zr_refcount, 0, ZRL_LOCKED);
+ if (cas == 0) {
+#ifdef ZFS_DEBUG
+ ASSERT(zrl->zr_owner == NULL);
+ zrl->zr_owner = curthread;
+#endif
+ return (1);
+ }
+ }
+
+ ASSERT((int32_t)n > ZRL_DESTROYED);
+
+ return (0);
+}
+
+void
+zrl_exit(zrlock_t *zrl)
+{
+ ASSERT(zrl->zr_refcount == ZRL_LOCKED);
+
+ mutex_enter(&zrl->zr_mtx);
+#ifdef ZFS_DEBUG
+ ASSERT(zrl->zr_owner == curthread);
+ zrl->zr_owner = NULL;
+ membar_producer(); /* make sure the owner store happens first */
+#endif
+ zrl->zr_refcount = 0;
+ cv_broadcast(&zrl->zr_cv);
+ mutex_exit(&zrl->zr_mtx);
+}
+
+int
+zrl_refcount(zrlock_t *zrl)
+{
+ ASSERT(zrl->zr_refcount > ZRL_DESTROYED);
+
+ int n = (int)zrl->zr_refcount;
+ return (n <= 0 ? 0 : n);
+}
+
+int
+zrl_is_zero(zrlock_t *zrl)
+{
+ ASSERT(zrl->zr_refcount > ZRL_DESTROYED);
+
+ return (zrl->zr_refcount <= 0);
+}
+
+int
+zrl_is_locked(zrlock_t *zrl)
+{
+ ASSERT(zrl->zr_refcount > ZRL_DESTROYED);
+
+ return (zrl->zr_refcount == ZRL_LOCKED);
+}
+
+#ifdef ZFS_DEBUG
+kthread_t *
+zrl_owner(zrlock_t *zrl)
+{
+ return (zrl->zr_owner);
+}
+#endif