diff options
author | Brian Behlendorf <[email protected]> | 2010-08-26 14:24:34 -0700 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2010-08-26 14:24:34 -0700 |
commit | 572e285762521df27fe5b026f409ba1a21abb7ac (patch) | |
tree | f5d0e8e3bd3c0956d437251974b67d88fea46304 /module/zfs | |
parent | 1980602bfae0605d3231e79627b3e25c07991b0e (diff) |
Update to onnv_147
This is the last official OpenSolaris tag before the public
development tree was closed.
Diffstat (limited to 'module/zfs')
69 files changed, 5091 insertions, 1544 deletions
diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 8adb54dc6..a82718e8b 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -952,11 +952,6 @@ arc_cksum_compute(arc_buf_t *buf, boolean_t force) void arc_buf_thaw(arc_buf_t *buf) { - kmutex_t *hash_lock; - - hash_lock = HDR_LOCK(buf->b_hdr); - mutex_enter(hash_lock); - if (zfs_flags & ZFS_DEBUG_MODIFY) { if (buf->b_hdr->b_state != arc_anon) panic("modifying non-anon buffer!"); @@ -978,7 +973,6 @@ arc_buf_thaw(arc_buf_t *buf) } mutex_exit(&buf->b_hdr->b_freeze_lock); - mutex_exit(hash_lock); } void @@ -1750,6 +1744,7 @@ static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) { arc_buf_hdr_t *ab, *ab_prev; + arc_buf_hdr_t marker = { 0 }; list_t *list = &state->arcs_list[ARC_BUFC_DATA]; kmutex_t *hash_lock; uint64_t bytes_deleted = 0; @@ -1762,6 +1757,11 @@ top: ab_prev = list_prev(list, ab); if (spa && ab->b_spa != spa) continue; + + /* ignore markers */ + if (ab->b_spa == 0) + continue; + hash_lock = HDR_LOCK(ab); /* caller may be trying to modify this buffer, skip it */ if (MUTEX_HELD(hash_lock)) @@ -1788,15 +1788,21 @@ top: DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); if (bytes >= 0 && bytes_deleted >= bytes) break; - } else { - if (bytes < 0) { - mutex_exit(&state->arcs_mtx); - mutex_enter(hash_lock); - mutex_exit(hash_lock); - goto top; - } + } else if (bytes < 0) { + /* + * Insert a list marker and then wait for the + * hash lock to become available. Once its + * available, restart from where we left off. + */ + list_insert_after(list, ab, &marker); + mutex_exit(&state->arcs_mtx); + mutex_enter(hash_lock); + mutex_exit(hash_lock); + mutex_enter(&state->arcs_mtx); + ab_prev = list_prev(list, &marker); + list_remove(list, &marker); + } else bufs_skipped += 1; - } } mutex_exit(&state->arcs_mtx); @@ -1825,8 +1831,9 @@ arc_adjust(void) * Adjust MRU size */ - adjustment = MIN(arc_size - arc_c, - arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - arc_p); + adjustment = MIN((int64_t)(arc_size - arc_c), + (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - + arc_p)); if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment); @@ -2113,9 +2120,7 @@ arc_reclaim_thread(void) arc_no_grow = FALSE; } - if (2 * arc_c < arc_size + - arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size) - arc_adjust(); + arc_adjust(); if (arc_eviction_list != NULL) arc_do_user_evicts(); @@ -2159,6 +2164,7 @@ arc_adapt(int bytes, arc_state_t *state) if (state == arc_mru_ghost) { mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); + mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); } else if (state == arc_mfu_ghost) { @@ -2166,6 +2172,7 @@ arc_adapt(int bytes, arc_state_t *state) mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ? 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); + mult = MIN(mult, 10); delta = MIN(bytes * mult, arc_p); arc_p = MAX(arc_p_min, arc_p - delta); @@ -4438,6 +4445,16 @@ l2arc_feed_thread(void) ASSERT(spa != NULL); /* + * If the pool is read-only then force the feed thread to + * sleep a little longer. + */ + if (!spa_writeable(spa)) { + next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; + spa_config_exit(spa, SCL_L2ARC, dev); + continue; + } + + /* * Avoid contributing to memory pressure. */ if (arc_reclaim_needed()) { diff --git a/module/zfs/bpobj.c b/module/zfs/bpobj.c index f81c48aca..72be31235 100644 --- a/module/zfs/bpobj.c +++ b/module/zfs/bpobj.c @@ -113,16 +113,15 @@ bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object) ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ); ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR); + err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf); + if (err) + return (err); + bpo->bpo_os = os; bpo->bpo_object = object; bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT; bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0); bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1); - - err = dmu_bonus_hold(bpo->bpo_os, - bpo->bpo_object, bpo, &bpo->bpo_dbuf); - if (err) - return (err); bpo->bpo_phys = bpo->bpo_dbuf->db_data; return (0); } @@ -140,6 +139,7 @@ bpobj_close(bpobj_t *bpo) bpo->bpo_dbuf = NULL; bpo->bpo_phys = NULL; bpo->bpo_cached_dbuf = NULL; + bpo->bpo_object = 0; mutex_destroy(&bpo->bpo_lock); } @@ -210,8 +210,10 @@ bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx, ASSERT(bpo->bpo_havecomp); err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi); - if (err) + if (err) { + mutex_exit(&bpo->bpo_lock); return (err); + } epb = doi.doi_data_block_size / sizeof (uint64_t); for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) { @@ -252,7 +254,7 @@ bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx, &used_after, &comp_after, &uncomp_after)); bpo->bpo_phys->bpo_bytes -= used_before - used_after; ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0); - bpo->bpo_phys->bpo_comp -= comp_before - used_after; + bpo->bpo_phys->bpo_comp -= comp_before - comp_after; bpo->bpo_phys->bpo_uncomp -= uncomp_before - uncomp_after; } @@ -312,17 +314,17 @@ void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) { bpobj_t subbpo; - uint64_t used, comp, uncomp; + uint64_t used, comp, uncomp, subsubobjs; ASSERT(bpo->bpo_havesubobj); ASSERT(bpo->bpo_havecomp); VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj)); VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp)); - bpobj_close(&subbpo); if (used == 0) { /* No point in having an empty subobj. */ + bpobj_close(&subbpo); bpobj_free(bpo->bpo_os, subobj, tx); return; } @@ -338,10 +340,41 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), sizeof (subobj), &subobj, tx); bpo->bpo_phys->bpo_num_subobjs++; + + /* + * If subobj has only one block of subobjs, then move subobj's + * subobjs to bpo's subobj list directly. This reduces + * recursion in bpobj_iterate due to nested subobjs. + */ + subsubobjs = subbpo.bpo_phys->bpo_subobjs; + if (subsubobjs != 0) { + dmu_object_info_t doi; + + VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subsubobjs, &doi)); + if (doi.doi_max_offset == doi.doi_data_block_size) { + dmu_buf_t *subdb; + uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs; + + VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, subsubobjs, + 0, FTAG, &subdb, 0)); + dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, + bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), + numsubsub * sizeof (subobj), subdb->db_data, tx); + dmu_buf_rele(subdb, FTAG); + bpo->bpo_phys->bpo_num_subobjs += numsubsub; + + dmu_buf_will_dirty(subbpo.bpo_dbuf, tx); + subbpo.bpo_phys->bpo_subobjs = 0; + VERIFY3U(0, ==, dmu_object_free(bpo->bpo_os, + subsubobjs, tx)); + } + } bpo->bpo_phys->bpo_bytes += used; bpo->bpo_phys->bpo_comp += comp; bpo->bpo_phys->bpo_uncomp += uncomp; mutex_exit(&bpo->bpo_lock); + + bpobj_close(&subbpo); } void diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 42ae43997..9c4e0296d 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -217,6 +217,22 @@ dbuf_evict_user(dmu_buf_impl_t *db) db->db_evict_func = NULL; } +boolean_t +dbuf_is_metadata(dmu_buf_impl_t *db) +{ + if (db->db_level > 0) { + return (B_TRUE); + } else { + boolean_t is_metadata; + + DB_DNODE_ENTER(db); + is_metadata = dmu_ot[DB_DNODE(db)->dn_type].ot_metadata; + DB_DNODE_EXIT(db); + + return (is_metadata); + } +} + void dbuf_evict(dmu_buf_impl_t *db) { @@ -281,7 +297,7 @@ dbuf_fini(void) static void dbuf_verify(dmu_buf_impl_t *db) { - dnode_t *dn = db->db_dnode; + dnode_t *dn; dbuf_dirty_record_t *dr; ASSERT(MUTEX_HELD(&db->db_mtx)); @@ -290,6 +306,8 @@ dbuf_verify(dmu_buf_impl_t *db) return; ASSERT(db->db_objset != NULL); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); if (dn == NULL) { ASSERT(db->db_parent == NULL); ASSERT(db->db_blkptr == NULL); @@ -297,8 +315,9 @@ dbuf_verify(dmu_buf_impl_t *db) ASSERT3U(db->db.db_object, ==, dn->dn_object); ASSERT3P(db->db_objset, ==, dn->dn_objset); ASSERT3U(db->db_level, <, dn->dn_nlevels); - ASSERT(db->db_blkid == DMU_BONUS_BLKID || db->db_blkid == - DMU_SPILL_BLKID || list_head(&dn->dn_dbufs)); + ASSERT(db->db_blkid == DMU_BONUS_BLKID || + db->db_blkid == DMU_SPILL_BLKID || + !list_is_empty(&dn->dn_dbufs)); } if (db->db_blkid == DMU_BONUS_BLKID) { ASSERT(dn != NULL); @@ -355,7 +374,7 @@ dbuf_verify(dmu_buf_impl_t *db) * have the struct_rwlock. XXX indblksz no longer * grows. safe to do this now? */ - if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) { + if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) { ASSERT3P(db->db_blkptr, ==, ((blkptr_t *)db->db_parent->db.db_data + db->db_blkid % epb)); @@ -380,6 +399,7 @@ dbuf_verify(dmu_buf_impl_t *db) } } } + DB_DNODE_EXIT(db); } #endif @@ -424,8 +444,11 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db) mutex_enter(&db->db_mtx); if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) { int blksz = db->db.db_size; + spa_t *spa; + mutex_exit(&db->db_mtx); - abuf = arc_loan_buf(db->db_dnode->dn_objset->os_spa, blksz); + DB_GET_SPA(&spa, db); + abuf = arc_loan_buf(spa, blksz); bcopy(db->db.db_data, abuf->b_data, blksz); } else { abuf = db->db_buf; @@ -484,11 +507,14 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) static void dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) { - dnode_t *dn = db->db_dnode; + dnode_t *dn; + spa_t *spa; zbookmark_t zb; uint32_t aflags = ARC_NOWAIT; arc_buf_t *pbuf; + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); ASSERT(!refcount_is_zero(&db->db_holds)); /* We need the struct_rwlock to prevent db_blkptr from changing. */ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); @@ -506,6 +532,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) bzero(db->db.db_data, DN_MAX_BONUSLEN); if (bonuslen) bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); + DB_DNODE_EXIT(db); dbuf_update_data(db); db->db_state = DB_CACHED; mutex_exit(&db->db_mtx); @@ -524,6 +551,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa, db->db.db_size, db, type)); + DB_DNODE_EXIT(db); bzero(db->db.db_data, db->db.db_size); db->db_state = DB_CACHED; *flags |= DB_RF_CACHED; @@ -531,6 +559,9 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) return; } + spa = dn->dn_objset->os_spa; + DB_DNODE_EXIT(db); + db->db_state = DB_READ; mutex_exit(&db->db_mtx); @@ -549,7 +580,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) else pbuf = db->db_objset->os_phys_buf; - (void) dsl_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf, + (void) dsl_read(zio, spa, db->db_blkptr, pbuf, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, &aflags, &zb); @@ -563,6 +594,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) int err = 0; int havepzio = (zio != NULL); int prefetch; + dnode_t *dn; /* * We don't have to hold the mutex to check db_state because it @@ -573,46 +605,51 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) if (db->db_state == DB_NOFILL) return (EIO); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER); + rw_enter(&dn->dn_struct_rwlock, RW_READER); prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && - (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL && + (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL && DBUF_IS_CACHEABLE(db); mutex_enter(&db->db_mtx); if (db->db_state == DB_CACHED) { mutex_exit(&db->db_mtx); if (prefetch) - dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, + dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, db->db.db_size, TRUE); if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_exit(&db->db_dnode->dn_struct_rwlock); + rw_exit(&dn->dn_struct_rwlock); + DB_DNODE_EXIT(db); } else if (db->db_state == DB_UNCACHED) { - if (zio == NULL) { - zio = zio_root(db->db_dnode->dn_objset->os_spa, - NULL, NULL, ZIO_FLAG_CANFAIL); - } + spa_t *spa = dn->dn_objset->os_spa; + + if (zio == NULL) + zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); dbuf_read_impl(db, zio, &flags); /* dbuf_read_impl has dropped db_mtx for us */ if (prefetch) - dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, + dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, db->db.db_size, flags & DB_RF_CACHED); if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_exit(&db->db_dnode->dn_struct_rwlock); + rw_exit(&dn->dn_struct_rwlock); + DB_DNODE_EXIT(db); if (!havepzio) err = zio_wait(zio); } else { mutex_exit(&db->db_mtx); if (prefetch) - dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, + dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, db->db.db_size, TRUE); if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_exit(&db->db_dnode->dn_struct_rwlock); + rw_exit(&dn->dn_struct_rwlock); + DB_DNODE_EXIT(db); mutex_enter(&db->db_mtx); if ((flags & DB_RF_NEVERWAIT) == 0) { @@ -642,11 +679,12 @@ dbuf_noread(dmu_buf_impl_t *db) cv_wait(&db->db_changed, &db->db_mtx); if (db->db_state == DB_UNCACHED) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + spa_t *spa; ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); - dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa, - db->db.db_size, db, type)); + DB_GET_SPA(&spa, db); + dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type)); db->db_state = DB_FILL; } else if (db->db_state == DB_NOFILL) { dbuf_set_data(db, NULL); @@ -687,7 +725,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) /* * If the last dirty record for this dbuf has not yet synced * and its referencing the dbuf data, either: - * reset the reference to point to a new copy, + * reset the reference to point to a new copy, * or (if there a no active holders) * just null out the current db_data pointer. */ @@ -700,8 +738,10 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { int size = db->db.db_size; arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - dr->dt.dl.dr_data = arc_buf_alloc( - db->db_dnode->dn_objset->os_spa, size, db, type); + spa_t *spa; + + DB_GET_SPA(&spa, db); + dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type); bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); } else { dbuf_set_data(db, NULL); @@ -726,9 +766,12 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) ASSERT(db->db_data_pending != dr); /* free this block */ - if (!BP_IS_HOLE(bp)) - zio_free(db->db_dnode->dn_objset->os_spa, txg, bp); + if (!BP_IS_HOLE(bp)) { + spa_t *spa; + DB_GET_SPA(&spa, db); + zio_free(spa, txg, bp); + } dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; /* * Release the already-written buffer, so we leave it in @@ -865,10 +908,15 @@ dbuf_block_freeable(dmu_buf_impl_t *db) else if (db->db_blkptr) birth_txg = db->db_blkptr->blk_birth; - /* If we don't exist or are in a snapshot, we can't be freed */ + /* + * If we don't exist or are in a snapshot, we can't be freed. + * Don't pass the bp to dsl_dataset_block_freeable() since we + * are holding the db_mtx lock and might deadlock if we are + * prefetching a dedup-ed block. + */ if (birth_txg) return (ds == NULL || - dsl_dataset_block_freeable(ds, db->db_blkptr, birth_txg)); + dsl_dataset_block_freeable(ds, NULL, birth_txg)); else return (FALSE); } @@ -879,11 +927,15 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) arc_buf_t *buf, *obuf; int osize = db->db.db_size; arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + dnode_t *dn; ASSERT(db->db_blkid != DMU_BONUS_BLKID); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + /* XXX does *this* func really need the lock? */ - ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)); + ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); /* * This call to dbuf_will_dirty() with the dn_struct_rwlock held @@ -898,7 +950,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) dbuf_will_dirty(db, tx); /* create the data buffer for the new block */ - buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db, type); + buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type); /* copy old block data to the new block */ obuf = db->db_buf; @@ -918,15 +970,17 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) } mutex_exit(&db->db_mtx); - dnode_willuse_space(db->db_dnode, size-osize, tx); + dnode_willuse_space(dn, size-osize, tx); + DB_DNODE_EXIT(db); } void dbuf_release_bp(dmu_buf_impl_t *db) { - objset_t *os = db->db_dnode->dn_objset; + objset_t *os; zbookmark_t zb; + DB_GET_OBJSET(&os, db); ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); ASSERT(arc_released(os->os_phys_buf) || list_link_active(&os->os_dsl_dataset->ds_synced_link)); @@ -944,8 +998,8 @@ dbuf_release_bp(dmu_buf_impl_t *db) dbuf_dirty_record_t * dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { - dnode_t *dn = db->db_dnode; - objset_t *os = dn->dn_objset; + dnode_t *dn; + objset_t *os; dbuf_dirty_record_t **drp, *dr; int drop_struct_lock = FALSE; boolean_t do_free_accounting = B_FALSE; @@ -955,6 +1009,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) ASSERT(!refcount_is_zero(&db->db_holds)); DMU_TX_DIRTY_BUF(tx, db); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); /* * Shouldn't dirty a regular buffer in syncing context. Private * objects may be dirtied in syncing context, but only if they @@ -1009,6 +1065,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) drp = &dr->dr_next; if (dr && dr->dr_txg == tx->tx_txg) { + DB_DNODE_EXIT(db); + if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) { /* * If this buffer has already been written out, @@ -1044,6 +1102,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) * we already dirtied it in open context. Hence we must make * this assertion only if we're not already dirty. */ + os = dn->dn_objset; ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); ASSERT(db->db.db_size != 0); @@ -1132,6 +1191,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) list_insert_tail(&dn->dn_dirty_records[txgoff], dr); mutex_exit(&dn->dn_mtx); dnode_setdirty(dn, tx); + DB_DNODE_EXIT(db); return (dr); } else if (do_free_accounting) { blkptr_t *bp = db->db_blkptr; @@ -1145,6 +1205,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) * db_blkptr, but since this is just a guess, * it's OK if we get an odd answer. */ + ddt_prefetch(os->os_spa, bp); dnode_willuse_space(dn, -willfree, tx); } @@ -1193,8 +1254,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) } else { ASSERT(db->db_level+1 == dn->dn_nlevels); ASSERT(db->db_blkid < dn->dn_nblkptr); - ASSERT(db->db_parent == NULL || - db->db_parent == db->db_dnode->dn_dbuf); + ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf); mutex_enter(&dn->dn_mtx); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&dn->dn_dirty_records[txgoff], dr); @@ -1204,13 +1264,14 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) } dnode_setdirty(dn, tx); + DB_DNODE_EXIT(db); return (dr); } static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { - dnode_t *dn = db->db_dnode; + dnode_t *dn; uint64_t txg = tx->tx_txg; dbuf_dirty_record_t *dr, **drp; @@ -1231,6 +1292,9 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) ASSERT(dr->dr_txg == txg); ASSERT(dr->dr_dbuf == db); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + /* * If this buffer is currently held, we cannot undirty * it, since one of the current holders may be in the @@ -1243,6 +1307,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) mutex_enter(&dn->dn_mtx); dnode_clear_range(dn, db->db_blkid, 1, tx); mutex_exit(&dn->dn_mtx); + DB_DNODE_EXIT(db); return (0); } @@ -1264,6 +1329,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); mutex_exit(&dn->dn_mtx); } + DB_DNODE_EXIT(db); if (db->db_level == 0) { if (db->db_state != DB_NOFILL) { @@ -1309,8 +1375,10 @@ dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) ASSERT(tx->tx_txg != 0); ASSERT(!refcount_is_zero(&db->db_holds)); - if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) + DB_DNODE_ENTER(db); + if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) rf |= DB_RF_HAVESTRUCT; + DB_DNODE_EXIT(db); (void) dbuf_read(db, NULL, rf); (void) dbuf_dirty(db, tx); } @@ -1372,7 +1440,6 @@ void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) { ASSERT(!refcount_is_zero(&db->db_holds)); - ASSERT(db->db_dnode->dn_object != DMU_META_DNODE_OBJECT); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(db->db_level == 0); ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA); @@ -1436,7 +1503,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) * in this case. For callers from the DMU we will usually see: * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy() * For the arc callback, we will usually see: - * dbuf_do_evict()->dbuf_clear();dbuf_destroy() + * dbuf_do_evict()->dbuf_clear();dbuf_destroy() * Sometimes, though, we will get a mix of these two: * DMU: dbuf_clear()->arc_buf_evict() * ARC: dbuf_do_evict()->dbuf_destroy() @@ -1444,9 +1511,9 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) void dbuf_clear(dmu_buf_impl_t *db) { - dnode_t *dn = db->db_dnode; + dnode_t *dn; dmu_buf_impl_t *parent = db->db_parent; - dmu_buf_impl_t *dndb = dn->dn_dbuf; + dmu_buf_impl_t *dndb; int dbuf_gone = FALSE; ASSERT(MUTEX_HELD(&db->db_mtx)); @@ -1470,10 +1537,26 @@ dbuf_clear(dmu_buf_impl_t *db) db->db_state = DB_EVICTING; db->db_blkptr = NULL; + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + dndb = dn->dn_dbuf; if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { list_remove(&dn->dn_dbufs, db); + (void) atomic_dec_32_nv(&dn->dn_dbufs_count); + membar_producer(); + DB_DNODE_EXIT(db); + /* + * Decrementing the dbuf count means that the hold corresponding + * to the removed dbuf is no longer discounted in dnode_move(), + * so the dnode cannot be moved until after we release the hold. + * The membar_producer() ensures visibility of the decremented + * value in dnode_move(), since DB_DNODE_EXIT doesn't actually + * release any lock. + */ dnode_rele(dn, db); - db->db_dnode = NULL; + db->db_dnode_handle = NULL; + } else { + DB_DNODE_EXIT(db); } if (db->db_buf) @@ -1483,7 +1566,7 @@ dbuf_clear(dmu_buf_impl_t *db) mutex_exit(&db->db_mtx); /* - * If this dbuf is referened from an indirect dbuf, + * If this dbuf is referenced from an indirect dbuf, * decrement the ref count on the indirect dbuf. */ if (parent && parent != dndb) @@ -1575,7 +1658,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, db->db_blkid = blkid; db->db_last_dirty = NULL; db->db_dirtycnt = 0; - db->db_dnode = dn; + db->db_dnode_handle = dn->dn_handle; db->db_parent = parent; db->db_blkptr = blkptr; @@ -1632,6 +1715,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || refcount_count(&dn->dn_holds) > 0); (void) refcount_add(&dn->dn_holds, db); + (void) atomic_inc_32_nv(&dn->dn_dbufs_count); dprintf_dbuf(db, "db=%p\n", db); @@ -1671,15 +1755,24 @@ dbuf_destroy(dmu_buf_impl_t *db) * If this dbuf is still on the dn_dbufs list, * remove it from that list. */ - if (db->db_dnode) { - dnode_t *dn = db->db_dnode; + if (db->db_dnode_handle != NULL) { + dnode_t *dn; + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); mutex_enter(&dn->dn_dbufs_mtx); list_remove(&dn->dn_dbufs, db); + (void) atomic_dec_32_nv(&dn->dn_dbufs_count); mutex_exit(&dn->dn_dbufs_mtx); - + DB_DNODE_EXIT(db); + /* + * Decrementing the dbuf count means that the hold + * corresponding to the removed dbuf is no longer + * discounted in dnode_move(), so the dnode cannot be + * moved until after we release the hold. + */ dnode_rele(dn, db); - db->db_dnode = NULL; + db->db_dnode_handle = NULL; } dbuf_hash_remove(db); } @@ -1710,17 +1803,13 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid) /* dbuf_find() returns with db_mtx held */ if (db = dbuf_find(dn, 0, blkid)) { - if (refcount_count(&db->db_holds) > 0) { - /* - * This dbuf is active. We assume that it is - * already CACHED, or else about to be either - * read or filled. - */ - mutex_exit(&db->db_mtx); - return; - } + /* + * This dbuf is already in the cache. We assume that + * it is already CACHED, or else about to be either + * read or filled. + */ mutex_exit(&db->db_mtx); - db = NULL; + return; } if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) { @@ -1818,7 +1907,7 @@ top: arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); dbuf_set_data(db, - arc_buf_alloc(db->db_dnode->dn_objset->os_spa, + arc_buf_alloc(dn->dn_objset->os_spa, db->db.db_size, db, type)); bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, db->db.db_size); @@ -1834,7 +1923,7 @@ top: if (parent) dbuf_rele(parent, NULL); - ASSERT3P(db->db_dnode, ==, dn); + ASSERT3P(DB_DNODE(db), ==, dn); ASSERT3U(db->db_blkid, ==, blkid); ASSERT3U(db->db_level, ==, level); *dbp = db; @@ -1871,6 +1960,8 @@ int dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + if (db->db_blkid != DMU_SPILL_BLKID) return (ENOTSUP); if (blksz == 0) @@ -1880,9 +1971,12 @@ dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) else blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); - rw_enter(&db->db_dnode->dn_struct_rwlock, RW_WRITER); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dbuf_new_size(db, blksz, tx); - rw_exit(&db->db_dnode->dn_struct_rwlock); + rw_exit(&dn->dn_struct_rwlock); + DB_DNODE_EXIT(db); return (0); } @@ -1901,6 +1995,13 @@ dbuf_add_ref(dmu_buf_impl_t *db, void *tag) ASSERT(holds > 1); } +/* + * If you call dbuf_rele() you had better not be referencing the dnode handle + * unless you have some other direct or indirect hold on the dnode. (An indirect + * hold is a hold on one of the dnode's dbufs, including the bonus buffer.) + * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the + * dnode's parent dbuf evicting its dnode handles. + */ #pragma weak dmu_buf_rele = dbuf_rele void dbuf_rele(dmu_buf_impl_t *db, void *tag) @@ -1921,6 +2022,11 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) ASSERT(MUTEX_HELD(&db->db_mtx)); DBUF_VERIFY(db); + /* + * Remove the reference to the dbuf before removing its hold on the + * dnode so we can guarantee in dnode_move() that a referenced bonus + * buffer has a corresponding dnode hold. + */ holds = refcount_remove(&db->db_holds, tag); ASSERT(holds >= 0); @@ -1938,7 +2044,20 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) if (holds == 0) { if (db->db_blkid == DMU_BONUS_BLKID) { mutex_exit(&db->db_mtx); - dnode_rele(db->db_dnode, db); + + /* + * If the dnode moves here, we cannot cross this barrier + * until the move completes. + */ + DB_DNODE_ENTER(db); + (void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count); + DB_DNODE_EXIT(db); + /* + * The bonus buffer's dnode hold is no longer discounted + * in dnode_move(). The dnode cannot move until after + * the dnode_rele(). + */ + dnode_rele(DB_DNODE(db), db); } else if (db->db_buf == NULL) { /* * This is a special case: we never associated this @@ -2089,7 +2208,7 @@ static void dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; - dnode_t *dn = db->db_dnode; + dnode_t *dn; zio_t *zio; ASSERT(dmu_tx_is_syncing(tx)); @@ -2107,10 +2226,13 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) mutex_enter(&db->db_mtx); } ASSERT3U(db->db_state, ==, DB_CACHED); - ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); ASSERT(db->db_buf != NULL); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); dbuf_check_blkptr(dn, db); + DB_DNODE_EXIT(db); db->db_data_pending = dr; @@ -2130,8 +2252,8 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { arc_buf_t **datap = &dr->dt.dl.dr_data; dmu_buf_impl_t *db = dr->dr_dbuf; - dnode_t *dn = db->db_dnode; - objset_t *os = dn->dn_objset; + dnode_t *dn; + objset_t *os; uint64_t txg = tx->tx_txg; ASSERT(dmu_tx_is_syncing(tx)); @@ -2154,6 +2276,9 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) } DBUF_VERIFY(db); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + if (db->db_blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR; @@ -2173,6 +2298,8 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) ASSERT3U(db->db_level, ==, 0); ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); + DB_DNODE_EXIT(db); + if (*datap != db->db.db_data) { zio_buf_free(*datap, DN_MAX_BONUSLEN); arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); @@ -2191,6 +2318,8 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) return; } + os = dn->dn_objset; + /* * This function may have dropped the db_mtx lock allowing a dmu_sync * operation to sneak in. As a result, we need to ensure that we @@ -2200,7 +2329,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) dbuf_check_blkptr(dn, db); /* - * If this buffer is in the middle of an immdiate write, + * If this buffer is in the middle of an immediate write, * wait for the synchronous IO to complete. */ while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { @@ -2237,10 +2366,20 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) dbuf_write(dr, *datap, tx); ASSERT(!list_link_active(&dr->dr_dirty_node)); - if (dn->dn_object == DMU_META_DNODE_OBJECT) + if (dn->dn_object == DMU_META_DNODE_OBJECT) { list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr); - else + DB_DNODE_EXIT(db); + } else { + /* + * Although zio_nowait() does not "wait for an IO", it does + * initiate the IO. If this is an empty write it seems plausible + * that the IO could actually be completed before the nowait + * returns. We need to DB_DNODE_EXIT() first in case + * zio_nowait() invalidates the dbuf. + */ + DB_DNODE_EXIT(db); zio_nowait(dr->dr_zio); + } } void @@ -2274,9 +2413,9 @@ static void dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; + dnode_t *dn; blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; - dnode_t *dn = db->db_dnode; spa_t *spa = zio->io_spa; int64_t delta; uint64_t fill = 0; @@ -2284,12 +2423,15 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) ASSERT(db->db_blkptr == bp); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig); dnode_diduse_space(dn, delta - zio->io_prev_space_delta); zio->io_prev_space_delta = delta; if (BP_IS_HOLE(bp)) { ASSERT(bp->blk_fill == 0); + DB_DNODE_EXIT(db); return; } @@ -2303,7 +2445,6 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) #ifdef ZFS_DEBUG if (db->db_blkid == DMU_SPILL_BLKID) { - dnode_t *dn = db->db_dnode; ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && db->db_blkptr == &dn->dn_phys->dn_spill); @@ -2336,6 +2477,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) fill += ibp->blk_fill; } } + DB_DNODE_EXIT(db); bp->blk_fill = fill; @@ -2349,8 +2491,6 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) dmu_buf_impl_t *db = vdb; blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; - dnode_t *dn = db->db_dnode; - objset_t *os = dn->dn_objset; uint64_t txg = zio->io_txg; dbuf_dirty_record_t **drp, *dr; @@ -2360,8 +2500,13 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { ASSERT(BP_EQUAL(bp, bp_orig)); } else { - dsl_dataset_t *ds = os->os_dsl_dataset; - dmu_tx_t *tx = os->os_synctx; + objset_t *os; + dsl_dataset_t *ds; + dmu_tx_t *tx; + + DB_GET_OBJSET(&os, db); + ds = os->os_dsl_dataset; + tx = os->os_synctx; (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); dsl_dataset_block_born(ds, bp, tx); @@ -2382,10 +2527,14 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) #ifdef ZFS_DEBUG if (db->db_blkid == DMU_SPILL_BLKID) { - dnode_t *dn = db->db_dnode; + dnode_t *dn; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && db->db_blkptr == &dn->dn_phys->dn_spill); + DB_DNODE_EXIT(db); } #endif @@ -2400,6 +2549,10 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) arc_set_callback(db->db_buf, dbuf_do_evict, db); } } else { + dnode_t *dn; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); ASSERT(list_head(&dr->dt.di.dr_children) == NULL); ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); if (!BP_IS_HOLE(db->db_blkptr)) { @@ -2411,6 +2564,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) >> (db->db_level * epbs), >=, db->db_blkid); arc_set_callback(db->db_buf, dbuf_do_evict, db); } + DB_DNODE_EXIT(db); mutex_destroy(&dr->dt.di.dr_mtx); list_destroy(&dr->dt.di.dr_children); } @@ -2466,8 +2620,8 @@ static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; - dnode_t *dn = db->db_dnode; - objset_t *os = dn->dn_objset; + dnode_t *dn; + objset_t *os; dmu_buf_impl_t *parent = db->db_parent; uint64_t txg = tx->tx_txg; zbookmark_t zb; @@ -2475,6 +2629,10 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) zio_t *zio; int wp_flag = 0; + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + os = dn->dn_objset; + if (db->db_state != DB_NOFILL) { if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { /* @@ -2519,6 +2677,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); + DB_DNODE_EXIT(db); if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { ASSERT(db->db_state != DB_NOFILL); diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index 926b4df9a..718331496 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -36,6 +36,11 @@ #include <sys/zio_compress.h> #include <sys/dsl_scan.h> +/* + * Enable/disable prefetching of dedup-ed blocks which are going to be freed. + */ +int zfs_dedup_prefetch = 1; + static const ddt_ops_t *ddt_ops[DDT_TYPES] = { &ddt_zap_ops, }; @@ -456,9 +461,6 @@ ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total) if (ddo_total->ddo_count != 0) { ddo_total->ddo_dspace /= ddo_total->ddo_count; ddo_total->ddo_mspace /= ddo_total->ddo_count; - } else { - ASSERT(ddo_total->ddo_dspace == 0); - ASSERT(ddo_total->ddo_mspace == 0); } } @@ -730,13 +732,13 @@ ddt_prefetch(spa_t *spa, const blkptr_t *bp) ddt_t *ddt; ddt_entry_t dde; - if (!BP_GET_DEDUP(bp)) + if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp)) return; /* - * We remove the DDT once it's empty and only prefetch dedup blocks - * when there are entries in the DDT. Thus no locking is required - * as the DDT can't disappear on us. + * We only remove the DDT once all tables are empty and only + * prefetch dedup blocks when there are entries in the DDT. + * Thus no locking is required as the DDT can't disappear on us. */ ddt = ddt_select(spa, bp); ddt_key_fill(&dde.dde_key, bp); @@ -1072,11 +1074,15 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) } for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + uint64_t count = 0; + for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { + if (ddt_object_exists(ddt, type, class)) { + ddt_object_sync(ddt, type, class, tx); + count += ddt_object_count(ddt, type, class); + } + } for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { - if (!ddt_object_exists(ddt, type, class)) - continue; - ddt_object_sync(ddt, type, class, tx); - if (ddt_object_count(ddt, type, class) == 0) + if (count == 0 && ddt_object_exists(ddt, type, class)) ddt_object_destroy(ddt, type, class, tx); } } diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 5b87c81c6..39234eba5 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -133,7 +133,7 @@ dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, } dnode_rele(dn, FTAG); - *dbp = &db->db; + *dbp = &db->db; /* NULL db plus first field offset is NULL */ return (err); } @@ -144,31 +144,64 @@ dmu_bonus_max(void) } int -dmu_set_bonus(dmu_buf_t *db, int newsize, dmu_tx_t *tx) +dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx) { - dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + int error; - if (dn->dn_bonus != (dmu_buf_impl_t *)db) - return (EINVAL); - if (newsize < 0 || newsize > db->db_size) - return (EINVAL); - dnode_setbonuslen(dn, newsize, tx); - return (0); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + + if (dn->dn_bonus != db) { + error = EINVAL; + } else if (newsize < 0 || newsize > db_fake->db_size) { + error = EINVAL; + } else { + dnode_setbonuslen(dn, newsize, tx); + error = 0; + } + + DB_DNODE_EXIT(db); + return (error); } int -dmu_set_bonustype(dmu_buf_t *db, dmu_object_type_t type, dmu_tx_t *tx) +dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx) { - dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + int error; - if (type > DMU_OT_NUMTYPES) - return (EINVAL); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); - if (dn->dn_bonus != (dmu_buf_impl_t *)db) - return (EINVAL); + if (type > DMU_OT_NUMTYPES) { + error = EINVAL; + } else if (dn->dn_bonus != db) { + error = EINVAL; + } else { + dnode_setbonus_type(dn, type, tx); + error = 0; + } - dnode_setbonus_type(dn, type, tx); - return (0); + DB_DNODE_EXIT(db); + return (error); +} + +dmu_object_type_t +dmu_get_bonustype(dmu_buf_t *db_fake) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + dmu_object_type_t type; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + type = dn->dn_bonustype; + DB_DNODE_EXIT(db); + + return (type); } int @@ -208,11 +241,19 @@ dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) dbuf_create_bonus(dn); } db = dn->dn_bonus; - rw_exit(&dn->dn_struct_rwlock); /* as long as the bonus buf is held, the dnode will be held */ - if (refcount_add(&db->db_holds, tag) == 1) + if (refcount_add(&db->db_holds, tag) == 1) { VERIFY(dnode_add_ref(dn, db)); + (void) atomic_inc_32_nv(&dn->dn_dbufs_count); + } + + /* + * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's + * hold and incrementing the dbuf count to ensure that dnode_move() sees + * a dnode hold for every dbuf. + */ + rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); @@ -246,35 +287,56 @@ dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp) rw_exit(&dn->dn_struct_rwlock); ASSERT(db != NULL); - err = dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | flags); - *dbp = &db->db; + err = dbuf_read(db, NULL, flags); + if (err == 0) + *dbp = &db->db; + else + dbuf_rele(db, tag); return (err); } int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) { - dnode_t *dn = ((dmu_buf_impl_t *)bonus)->db_dnode; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; + dnode_t *dn; int err; - if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) - return (EINVAL); - rw_enter(&dn->dn_struct_rwlock, RW_READER); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + + if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) { + err = EINVAL; + } else { + rw_enter(&dn->dn_struct_rwlock, RW_READER); + + if (!dn->dn_have_spill) { + err = ENOENT; + } else { + err = dmu_spill_hold_by_dnode(dn, + DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp); + } - if (!dn->dn_have_spill) { rw_exit(&dn->dn_struct_rwlock); - return (ENOENT); } - err = dmu_spill_hold_by_dnode(dn, DB_RF_HAVESTRUCT, tag, dbp); - rw_exit(&dn->dn_struct_rwlock); + + DB_DNODE_EXIT(db); return (err); } int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) { - return (dmu_spill_hold_by_dnode(((dmu_buf_impl_t *)bonus)->db_dnode, - 0, tag, dbp)); + dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; + dnode_t *dn; + int err; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp); + DB_DNODE_EXIT(db); + + return (err); } /* @@ -396,14 +458,18 @@ dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, } int -dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, +dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) { - dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; int err; + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, numbufsp, dbpp, DMU_READ_PREFETCH); + DB_DNODE_EXIT(db); return (err); } @@ -436,7 +502,7 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) return; if (len == 0) { /* they're interested in the bonus buffer */ - dn = os->os_meta_dnode; + dn = DMU_META_DNODE(os); if (object == 0 || object >= DN_MAX_OBJECT) return; @@ -997,11 +1063,19 @@ int dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size, dmu_tx_t *tx) { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; + dnode_t *dn; + int err; + if (size == 0) return (0); - return (dmu_write_uio_dnode(((dmu_buf_impl_t *)zdb)->db_dnode, - uio, size, tx)); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + err = dmu_write_uio_dnode(dn, uio, size, tx); + DB_DNODE_EXIT(db); + + return (err); } int @@ -1087,9 +1161,11 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, arc_buf_t * dmu_request_arcbuf(dmu_buf_t *handle, int size) { - dnode_t *dn = ((dmu_buf_impl_t *)handle)->db_dnode; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle; + spa_t *spa; - return (arc_loan_buf(dn->dn_objset->os_spa, size)); + DB_GET_SPA(&spa, db); + return (arc_loan_buf(spa, size)); } /* @@ -1111,23 +1187,35 @@ void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, dmu_tx_t *tx) { - dnode_t *dn = ((dmu_buf_impl_t *)handle)->db_dnode; + dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle; + dnode_t *dn; dmu_buf_impl_t *db; uint32_t blksz = (uint32_t)arc_buf_size(buf); uint64_t blkid; + DB_DNODE_ENTER(dbuf); + dn = DB_DNODE(dbuf); rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, offset); VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL); rw_exit(&dn->dn_struct_rwlock); + DB_DNODE_EXIT(dbuf); if (offset == db->db.db_offset && blksz == db->db.db_size) { dbuf_assign_arcbuf(db, buf, tx); dbuf_rele(db, FTAG); } else { + objset_t *os; + uint64_t object; + + DB_DNODE_ENTER(dbuf); + dn = DB_DNODE(dbuf); + os = dn->dn_objset; + object = dn->dn_object; + DB_DNODE_EXIT(dbuf); + dbuf_rele(db, FTAG); - dmu_write(dn->dn_objset, dn->dn_object, offset, blksz, - buf->b_data, tx); + dmu_write(os, object, offset, blksz, buf->b_data, tx); dmu_return_arcbuf(buf); XUIOSTAT_BUMP(xuiostat_wbuf_copied); } @@ -1146,7 +1234,6 @@ dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) { dmu_sync_arg_t *dsa = varg; dmu_buf_t *db = dsa->dsa_zgd->zgd_db; - dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; blkptr_t *bp = zio->io_bp; if (zio->io_error == 0) { @@ -1157,7 +1244,6 @@ dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) */ BP_SET_LSIZE(bp, db->db_size); } else { - ASSERT(BP_GET_TYPE(bp) == dn->dn_type); ASSERT(BP_GET_LEVEL(bp) == 0); bp->blk_fill = 1; } @@ -1280,6 +1366,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) dmu_sync_arg_t *dsa; zbookmark_t zb; zio_prop_t zp; + dnode_t *dn; ASSERT(pio != NULL); ASSERT(BP_IS_HOLE(bp)); @@ -1288,7 +1375,10 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) SET_BOOKMARK(&zb, ds->ds_object, db->db.db_object, db->db_level, db->db_blkid); - dmu_write_policy(os, db->db_dnode, db->db_level, WP_DMU_SYNC, &zp); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp); + DB_DNODE_EXIT(db); /* * If we're frozen (running ziltest), we always need to generate a bp. @@ -1413,7 +1503,8 @@ void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) { dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET; - boolean_t ismd = (level > 0 || dmu_ot[type].ot_metadata); + boolean_t ismd = (level > 0 || dmu_ot[type].ot_metadata || + (wp & WP_SPILL)); enum zio_checksum checksum = os->os_checksum; enum zio_compress compress = os->os_compress; enum zio_checksum dedup_checksum = os->os_dedup_checksum; @@ -1569,9 +1660,13 @@ dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) * As above, but faster; can be used when you have a held dbuf in hand. */ void -dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi) +dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi) { - dmu_object_info_from_dnode(((dmu_buf_impl_t *)db)->db_dnode, doi); + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + DB_DNODE_ENTER(db); + dmu_object_info_from_dnode(DB_DNODE(db), doi); + DB_DNODE_EXIT(db); } /* @@ -1579,14 +1674,20 @@ dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi) * This is specifically optimized for zfs_getattr(). */ void -dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512) +dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize, + u_longlong_t *nblk512) { - dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); *blksize = dn->dn_datablksz; /* add 1 for dnode space */ *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT) + 1; + DB_DNODE_EXIT(db); } void @@ -1638,23 +1739,25 @@ void dmu_init(void) { zfs_dbgmsg_init(); - dbuf_init(); + sa_cache_init(); + xuio_stat_init(); + dmu_objset_init(); dnode_init(); + dbuf_init(); zfetch_init(); arc_init(); l2arc_init(); - xuio_stat_init(); - sa_cache_init(); } void dmu_fini(void) { + l2arc_fini(); arc_fini(); zfetch_fini(); - dnode_fini(); dbuf_fini(); - l2arc_fini(); + dnode_fini(); + dmu_objset_fini(); xuio_stat_fini(); sa_cache_fini(); zfs_dbgmsg_fini(); diff --git a/module/zfs/dmu_diff.c b/module/zfs/dmu_diff.c new file mode 100644 index 000000000..22340ebc5 --- /dev/null +++ b/module/zfs/dmu_diff.c @@ -0,0 +1,221 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include <sys/dmu.h> +#include <sys/dmu_impl.h> +#include <sys/dmu_tx.h> +#include <sys/dbuf.h> +#include <sys/dnode.h> +#include <sys/zfs_context.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_traverse.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_synctask.h> +#include <sys/zfs_ioctl.h> +#include <sys/zap.h> +#include <sys/zio_checksum.h> +#include <sys/zfs_znode.h> + +struct diffarg { + struct vnode *da_vp; /* file to which we are reporting */ + offset_t *da_offp; + int da_err; /* error that stopped diff search */ + dmu_diff_record_t da_ddr; +}; + +static int +write_record(struct diffarg *da) +{ + ssize_t resid; /* have to get resid to get detailed errno */ + + if (da->da_ddr.ddr_type == DDR_NONE) { + da->da_err = 0; + return (0); + } + + da->da_err = vn_rdwr(UIO_WRITE, da->da_vp, (caddr_t)&da->da_ddr, + sizeof (da->da_ddr), 0, UIO_SYSSPACE, FAPPEND, + RLIM64_INFINITY, CRED(), &resid); + *da->da_offp += sizeof (da->da_ddr); + return (da->da_err); +} + +static int +report_free_dnode_range(struct diffarg *da, uint64_t first, uint64_t last) +{ + ASSERT(first <= last); + if (da->da_ddr.ddr_type != DDR_FREE || + first != da->da_ddr.ddr_last + 1) { + if (write_record(da) != 0) + return (da->da_err); + da->da_ddr.ddr_type = DDR_FREE; + da->da_ddr.ddr_first = first; + da->da_ddr.ddr_last = last; + return (0); + } + da->da_ddr.ddr_last = last; + return (0); +} + +static int +report_dnode(struct diffarg *da, uint64_t object, dnode_phys_t *dnp) +{ + ASSERT(dnp != NULL); + if (dnp->dn_type == DMU_OT_NONE) + return (report_free_dnode_range(da, object, object)); + + if (da->da_ddr.ddr_type != DDR_INUSE || + object != da->da_ddr.ddr_last + 1) { + if (write_record(da) != 0) + return (da->da_err); + da->da_ddr.ddr_type = DDR_INUSE; + da->da_ddr.ddr_first = da->da_ddr.ddr_last = object; + return (0); + } + da->da_ddr.ddr_last = object; + return (0); +} + +#define DBP_SPAN(dnp, level) \ + (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ + (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) + +/* ARGSUSED */ +static int +diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, + const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) +{ + struct diffarg *da = arg; + int err = 0; + + if (issig(JUSTLOOKING) && issig(FORREAL)) + return (EINTR); + + if (zb->zb_object != DMU_META_DNODE_OBJECT) + return (0); + + if (bp == NULL) { + uint64_t span = DBP_SPAN(dnp, zb->zb_level); + uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; + + err = report_free_dnode_range(da, dnobj, + dnobj + (span >> DNODE_SHIFT) - 1); + if (err) + return (err); + } else if (zb->zb_level == 0) { + dnode_phys_t *blk; + arc_buf_t *abuf; + uint32_t aflags = ARC_WAIT; + int blksz = BP_GET_LSIZE(bp); + int i; + + if (dsl_read(NULL, spa, bp, pbuf, + arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_CANFAIL, &aflags, zb) != 0) + return (EIO); + + blk = abuf->b_data; + for (i = 0; i < blksz >> DNODE_SHIFT; i++) { + uint64_t dnobj = (zb->zb_blkid << + (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; + err = report_dnode(da, dnobj, blk+i); + if (err) + break; + } + (void) arc_buf_remove_ref(abuf, &abuf); + if (err) + return (err); + /* Don't care about the data blocks */ + return (TRAVERSE_VISIT_NO_CHILDREN); + } + return (0); +} + +int +dmu_diff(objset_t *tosnap, objset_t *fromsnap, struct vnode *vp, offset_t *offp) +{ + struct diffarg da; + dsl_dataset_t *ds = tosnap->os_dsl_dataset; + dsl_dataset_t *fromds = fromsnap->os_dsl_dataset; + dsl_dataset_t *findds; + dsl_dataset_t *relds; + int err = 0; + + /* make certain we are looking at snapshots */ + if (!dsl_dataset_is_snapshot(ds) || !dsl_dataset_is_snapshot(fromds)) + return (EINVAL); + + /* fromsnap must be earlier and from the same lineage as tosnap */ + if (fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg) + return (EXDEV); + + relds = NULL; + findds = ds; + + while (fromds->ds_dir != findds->ds_dir) { + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + if (!dsl_dir_is_clone(findds->ds_dir)) { + if (relds) + dsl_dataset_rele(relds, FTAG); + return (EXDEV); + } + + rw_enter(&dp->dp_config_rwlock, RW_READER); + err = dsl_dataset_hold_obj(dp, + findds->ds_dir->dd_phys->dd_origin_obj, FTAG, &findds); + rw_exit(&dp->dp_config_rwlock); + + if (relds) + dsl_dataset_rele(relds, FTAG); + + if (err) + return (EXDEV); + + relds = findds; + } + + if (relds) + dsl_dataset_rele(relds, FTAG); + + da.da_vp = vp; + da.da_offp = offp; + da.da_ddr.ddr_type = DDR_NONE; + da.da_ddr.ddr_first = da.da_ddr.ddr_last = 0; + da.da_err = 0; + + err = traverse_dataset(ds, fromds->ds_phys->ds_creation_txg, + TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, diff_cb, &da); + + if (err) { + da.da_err = err; + } else { + /* we set the da.da_err we return as side-effect */ + (void) write_record(&da); + } + + return (da.da_err); +} diff --git a/module/zfs/dmu_object.c b/module/zfs/dmu_object.c index 98228d403..8dff46048 100644 --- a/module/zfs/dmu_object.c +++ b/module/zfs/dmu_object.c @@ -33,7 +33,7 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, { uint64_t object; uint64_t L2_dnode_count = DNODES_PER_BLOCK << - (os->os_meta_dnode->dn_indblkshift - SPA_BLKPTRSHIFT); + (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT); dnode_t *dn = NULL; int restarted = B_FALSE; @@ -49,7 +49,7 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, */ if (P2PHASE(object, L2_dnode_count) == 0) { uint64_t offset = restarted ? object << DNODE_SHIFT : 0; - int error = dnode_next_offset(os->os_meta_dnode, + int error = dnode_next_offset(DMU_META_DNODE(os), DNODE_FIND_HOLE, &offset, 2, DNODES_PER_BLOCK >> 2, 0); restarted = B_TRUE; @@ -187,7 +187,7 @@ dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg) uint64_t offset = (*objectp + 1) << DNODE_SHIFT; int error; - error = dnode_next_offset(os->os_meta_dnode, + error = dnode_next_offset(DMU_META_DNODE(os), (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg); *objectp = offset >> DNODE_SHIFT; diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index 690e6ecde..7caebd979 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -41,8 +41,26 @@ #include <sys/zil.h> #include <sys/dmu_impl.h> #include <sys/zfs_ioctl.h> -#include <sys/sunddi.h> #include <sys/sa.h> +#include <sys/zfs_onexit.h> + +/* + * Needed to close a window in dnode_move() that allows the objset to be freed + * before it can be safely accessed. + */ +krwlock_t os_lock; + +void +dmu_objset_init(void) +{ + rw_init(&os_lock, NULL, RW_DEFAULT, NULL); +} + +void +dmu_objset_fini(void) +{ + rw_destroy(&os_lock); +} spa_t * dmu_objset_spa(objset_t *os) @@ -350,7 +368,8 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, os->os_secondary_cache = ZFS_CACHE_ALL; } - os->os_zil_header = os->os_phys->os_zil_header; + if (ds == NULL || !dsl_dataset_is_snapshot(ds)) + os->os_zil_header = os->os_phys->os_zil_header; os->os_zil = zil_alloc(os, &os->os_zil_header); for (i = 0; i < TXG_SIZE; i++) { @@ -368,13 +387,16 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); - os->os_meta_dnode = dnode_special_open(os, - &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT); + DMU_META_DNODE(os) = dnode_special_open(os, + &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT, + &os->os_meta_dnode); if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) { - os->os_userused_dnode = dnode_special_open(os, - &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT); - os->os_groupused_dnode = dnode_special_open(os, - &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT); + DMU_USERUSED_DNODE(os) = dnode_special_open(os, + &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT, + &os->os_userused_dnode); + DMU_GROUPUSED_DNODE(os) = dnode_special_open(os, + &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT, + &os->os_groupused_dnode); } /* @@ -401,7 +423,7 @@ dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp) *osp = ds->ds_objset; if (*osp == NULL) { err = dmu_objset_open_impl(dsl_dataset_get_spa(ds), - ds, &ds->ds_phys->ds_bp, osp); + ds, dsl_dataset_get_blkptr(ds), osp); } mutex_exit(&ds->ds_opening_lock); return (err); @@ -470,8 +492,8 @@ dmu_objset_evict_dbufs(objset_t *os) mutex_enter(&os->os_lock); /* process the mdn last, since the other dnodes have holds on it */ - list_remove(&os->os_dnodes, os->os_meta_dnode); - list_insert_tail(&os->os_dnodes, os->os_meta_dnode); + list_remove(&os->os_dnodes, DMU_META_DNODE(os)); + list_insert_tail(&os->os_dnodes, DMU_META_DNODE(os)); /* * Find the first dnode with holds. We have to do this dance @@ -497,8 +519,9 @@ dmu_objset_evict_dbufs(objset_t *os) mutex_enter(&os->os_lock); dn = next_dn; } + dn = list_head(&os->os_dnodes); mutex_exit(&os->os_lock); - return (list_head(&os->os_dnodes) != os->os_meta_dnode); + return (dn != DMU_META_DNODE(os)); } void @@ -539,16 +562,26 @@ dmu_objset_evict(objset_t *os) */ (void) dmu_objset_evict_dbufs(os); - dnode_special_close(os->os_meta_dnode); - if (os->os_userused_dnode) { - dnode_special_close(os->os_userused_dnode); - dnode_special_close(os->os_groupused_dnode); + dnode_special_close(&os->os_meta_dnode); + if (DMU_USERUSED_DNODE(os)) { + dnode_special_close(&os->os_userused_dnode); + dnode_special_close(&os->os_groupused_dnode); } zil_free(os->os_zil); ASSERT3P(list_head(&os->os_dnodes), ==, NULL); VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf) == 1); + + /* + * This is a barrier to prevent the objset from going away in + * dnode_move() until we can safely ensure that the objset is still in + * use. We consider the objset valid before the barrier and invalid + * after the barrier. + */ + rw_enter(&os_lock, RW_READER); + rw_exit(&os_lock); + mutex_destroy(&os->os_lock); mutex_destroy(&os->os_obj_lock); mutex_destroy(&os->os_user_ptr_lock); @@ -570,12 +603,12 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, dnode_t *mdn; ASSERT(dmu_tx_is_syncing(tx)); - if (ds) - mutex_enter(&ds->ds_opening_lock); - VERIFY(0 == dmu_objset_open_impl(spa, ds, bp, &os)); - if (ds) - mutex_exit(&ds->ds_opening_lock); - mdn = os->os_meta_dnode; + if (ds != NULL) + VERIFY(0 == dmu_objset_from_ds(ds, &os)); + else + VERIFY(0 == dmu_objset_open_impl(spa, NULL, bp, &os)); + + mdn = DMU_META_DNODE(os); dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT, DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx); @@ -663,34 +696,33 @@ static void dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dir_t *dd = arg1; + spa_t *spa = dd->dd_pool->dp_spa; struct oscarg *oa = arg2; - uint64_t dsobj; + uint64_t obj; ASSERT(dmu_tx_is_syncing(tx)); - dsobj = dsl_dataset_create_sync(dd, oa->lastname, + obj = dsl_dataset_create_sync(dd, oa->lastname, oa->clone_origin, oa->flags, oa->cr, tx); if (oa->clone_origin == NULL) { + dsl_pool_t *dp = dd->dd_pool; dsl_dataset_t *ds; blkptr_t *bp; objset_t *os; - VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, dsobj, - FTAG, &ds)); + VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, obj, FTAG, &ds)); bp = dsl_dataset_get_blkptr(ds); ASSERT(BP_IS_HOLE(bp)); - os = dmu_objset_create_impl(dsl_dataset_get_spa(ds), - ds, bp, oa->type, tx); + os = dmu_objset_create_impl(spa, ds, bp, oa->type, tx); if (oa->userfunc) oa->userfunc(os, oa->userarg, oa->cr, tx); dsl_dataset_rele(ds, FTAG); } - spa_history_log_internal(LOG_DS_CREATE, dd->dd_pool->dp_spa, - tx, "dataset = %llu", dsobj); + spa_history_log_internal(LOG_DS_CREATE, spa, tx, "dataset = %llu", obj); } int @@ -758,18 +790,8 @@ dmu_objset_destroy(const char *name, boolean_t defer) dsl_dataset_t *ds; int error; - /* - * dsl_dataset_destroy() can free any claimed-but-unplayed - * intent log, but if there is an active log, it has blocks that - * are allocated, but may not yet be reflected in the on-disk - * structure. Only the ZIL knows how to free them, so we have - * to call into it here. - */ error = dsl_dataset_own(name, B_TRUE, FTAG, &ds); if (error == 0) { - objset_t *os; - if (dmu_objset_from_ds(ds, &os) == 0) - zil_destroy(dmu_objset_zil(os), B_FALSE); error = dsl_dataset_destroy(ds, FTAG, defer); /* dsl_dataset_destroy() closes the ds. */ } @@ -780,9 +802,14 @@ dmu_objset_destroy(const char *name, boolean_t defer) struct snaparg { dsl_sync_task_group_t *dstg; char *snapname; + char *htag; char failed[MAXPATHLEN]; boolean_t recursive; + boolean_t needsuspend; + boolean_t temporary; nvlist_t *props; + struct dsl_ds_holdarg *ha; /* only needed in the temporary case */ + dsl_dataset_t *newds; }; static int @@ -790,11 +817,41 @@ snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) { objset_t *os = arg1; struct snaparg *sn = arg2; + int error; /* The props have already been checked by zfs_check_userprops(). */ - return (dsl_dataset_snapshot_check(os->os_dsl_dataset, - sn->snapname, tx)); + error = dsl_dataset_snapshot_check(os->os_dsl_dataset, + sn->snapname, tx); + if (error) + return (error); + + if (sn->temporary) { + /* + * Ideally we would just call + * dsl_dataset_user_hold_check() and + * dsl_dataset_destroy_check() here. However the + * dataset we want to hold and destroy is the snapshot + * that we just confirmed we can create, but it won't + * exist until after these checks are run. Do any + * checks we can here and if more checks are added to + * those routines in the future, similar checks may be + * necessary here. + */ + if (spa_version(os->os_spa) < SPA_VERSION_USERREFS) + return (ENOTSUP); + /* + * Not checking number of tags because the tag will be + * unique, as it will be the only tag. + */ + if (strlen(sn->htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN) + return (E2BIG); + + sn->ha = kmem_alloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); + sn->ha->temphold = B_TRUE; + sn->ha->htag = sn->htag; + } + return (error); } static void @@ -812,6 +869,19 @@ snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) pa.pa_source = ZPROP_SRC_LOCAL; dsl_props_set_sync(ds->ds_prev, &pa, tx); } + + if (sn->temporary) { + struct dsl_ds_destroyarg da; + + dsl_dataset_user_hold_sync(ds->ds_prev, sn->ha, tx); + kmem_free(sn->ha, sizeof (struct dsl_ds_holdarg)); + sn->ha = NULL; + sn->newds = ds->ds_prev; + + da.ds = ds->ds_prev; + da.defer = B_TRUE; + dsl_dataset_destroy_sync(&da, FTAG, tx); + } } static int @@ -857,29 +927,27 @@ dmu_objset_snapshot_one(const char *name, void *arg) return (sn->recursive ? 0 : EBUSY); } - /* - * NB: we need to wait for all in-flight changes to get to disk, - * so that we snapshot those changes. zil_suspend does this as - * a side effect. - */ - err = zil_suspend(dmu_objset_zil(os)); - if (err == 0) { - dsl_sync_task_create(sn->dstg, snapshot_check, - snapshot_sync, os, sn, 3); - } else { - dmu_objset_rele(os, sn); + if (sn->needsuspend) { + err = zil_suspend(dmu_objset_zil(os)); + if (err) { + dmu_objset_rele(os, sn); + return (err); + } } + dsl_sync_task_create(sn->dstg, snapshot_check, snapshot_sync, + os, sn, 3); - return (err); + return (0); } int -dmu_objset_snapshot(char *fsname, char *snapname, - nvlist_t *props, boolean_t recursive) +dmu_objset_snapshot(char *fsname, char *snapname, char *tag, + nvlist_t *props, boolean_t recursive, boolean_t temporary, int cleanup_fd) { dsl_sync_task_t *dst; struct snaparg sn; spa_t *spa; + minor_t minor; int err; (void) strcpy(sn.failed, fsname); @@ -888,10 +956,26 @@ dmu_objset_snapshot(char *fsname, char *snapname, if (err) return (err); + if (temporary) { + if (cleanup_fd < 0) { + spa_close(spa, FTAG); + return (EINVAL); + } + if ((err = zfs_onexit_fd_hold(cleanup_fd, &minor)) != 0) { + spa_close(spa, FTAG); + return (err); + } + } + sn.dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); sn.snapname = snapname; + sn.htag = tag; sn.props = props; sn.recursive = recursive; + sn.needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP); + sn.temporary = temporary; + sn.ha = NULL; + sn.newds = NULL; if (recursive) { err = dmu_objset_find(fsname, @@ -907,14 +991,20 @@ dmu_objset_snapshot(char *fsname, char *snapname, dst = list_next(&sn.dstg->dstg_tasks, dst)) { objset_t *os = dst->dst_arg1; dsl_dataset_t *ds = os->os_dsl_dataset; - if (dst->dst_err) + if (dst->dst_err) { dsl_dataset_name(ds, sn.failed); - zil_resume(dmu_objset_zil(os)); + } else if (temporary) { + dsl_register_onexit_hold_cleanup(sn.newds, tag, minor); + } + if (sn.needsuspend) + zil_resume(dmu_objset_zil(os)); dmu_objset_rele(os, &sn); } if (err) (void) strcpy(fsname, sn.failed); + if (temporary) + zfs_onexit_fd_rele(cleanup_fd); dsl_sync_task_group_destroy(sn.dstg); spa_close(spa, FTAG); return (err); @@ -1035,17 +1125,17 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) /* * Sync special dnodes - the parent IO for the sync is the root block */ - os->os_meta_dnode->dn_zio = zio; - dnode_sync(os->os_meta_dnode, tx); + DMU_META_DNODE(os)->dn_zio = zio; + dnode_sync(DMU_META_DNODE(os), tx); os->os_phys->os_flags = os->os_flags; - if (os->os_userused_dnode && - os->os_userused_dnode->dn_type != DMU_OT_NONE) { - os->os_userused_dnode->dn_zio = zio; - dnode_sync(os->os_userused_dnode, tx); - os->os_groupused_dnode->dn_zio = zio; - dnode_sync(os->os_groupused_dnode, tx); + if (DMU_USERUSED_DNODE(os) && + DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) { + DMU_USERUSED_DNODE(os)->dn_zio = zio; + dnode_sync(DMU_USERUSED_DNODE(os), tx); + DMU_GROUPUSED_DNODE(os)->dn_zio = zio; + dnode_sync(DMU_GROUPUSED_DNODE(os), tx); } txgoff = tx->tx_txg & TXG_MASK; @@ -1063,7 +1153,7 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx); dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx); - list = &os->os_meta_dnode->dn_dirty_records[txgoff]; + list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff]; while (dr = list_head(list)) { ASSERT(dr->dr_dbuf->db_level == 0); list_remove(list, dr); @@ -1085,7 +1175,16 @@ dmu_objset_is_dirty(objset_t *os, uint64_t txg) !list_is_empty(&os->os_free_dnodes[txg & TXG_MASK])); } -objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES]; +boolean_t +dmu_objset_is_dirty_anywhere(objset_t *os) +{ + for (int t = 0; t < TXG_SIZE; t++) + if (dmu_objset_is_dirty(os, t)) + return (B_TRUE); + return (B_FALSE); +} + +static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES]; void dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb) @@ -1097,8 +1196,8 @@ boolean_t dmu_objset_userused_enabled(objset_t *os) { return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE && - used_cbs[os->os_phys->os_type] && - os->os_userused_dnode); + used_cbs[os->os_phys->os_type] != NULL && + DMU_USERUSED_DNODE(os) != NULL); } static void @@ -1125,13 +1224,14 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os)); while (dn = list_head(list)) { + int flags; ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object)); ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE || dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED); /* Allocate the user/groupused objects if necessary. */ - if (os->os_userused_dnode->dn_type == DMU_OT_NONE) { + if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) { VERIFY(0 == zap_create_claim(os, DMU_USERUSED_OBJECT, DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); @@ -1148,18 +1248,19 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) * a bprewrite. */ - mutex_enter(&dn->dn_mtx); - ASSERT(dn->dn_id_flags); - if (dn->dn_id_flags & DN_ID_OLD_EXIST) { + flags = dn->dn_id_flags; + ASSERT(flags); + if (flags & DN_ID_OLD_EXIST) { do_userquota_update(os, dn->dn_oldused, dn->dn_oldflags, dn->dn_olduid, dn->dn_oldgid, B_TRUE, tx); } - if (dn->dn_id_flags & DN_ID_NEW_EXIST) { + if (flags & DN_ID_NEW_EXIST) { do_userquota_update(os, DN_USED_BYTES(dn->dn_phys), dn->dn_phys->dn_flags, dn->dn_newuid, dn->dn_newgid, B_FALSE, tx); } + mutex_enter(&dn->dn_mtx); dn->dn_oldused = 0; dn->dn_oldflags = 0; if (dn->dn_id_flags & DN_ID_NEW_EXIST) { @@ -1199,13 +1300,23 @@ dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx) if (dr->dr_txg == tx->tx_txg) break; - if (dr == NULL) + if (dr == NULL) { data = NULL; - else if (dr->dr_dbuf->db_dnode->dn_bonuslen == 0 && - dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID) - data = dr->dt.dl.dr_data->b_data; - else - data = dr->dt.dl.dr_data; + } else { + dnode_t *dn; + + DB_DNODE_ENTER(dr->dr_dbuf); + dn = DB_DNODE(dr->dr_dbuf); + + if (dn->dn_bonuslen == 0 && + dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID) + data = dr->dt.dl.dr_data->b_data; + else + data = dr->dt.dl.dr_data; + + DB_DNODE_EXIT(dr->dr_dbuf); + } + return (data); } @@ -1242,7 +1353,8 @@ dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx) if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) rf |= DB_RF_HAVESTRUCT; - error = dmu_spill_hold_by_dnode(dn, rf, + error = dmu_spill_hold_by_dnode(dn, + rf | DB_RF_MUST_SUCCEED, FTAG, (dmu_buf_t **)&db); ASSERT(error == 0); mutex_enter(&db->db_mtx); diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 6b00b73b4..e47d533a4 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -42,6 +42,7 @@ #include <zfs_fletcher.h> #include <sys/avl.h> #include <sys/ddt.h> +#include <sys/zfs_onexit.h> static char *dmu_recv_tag = "dmu_recv_tag"; @@ -573,6 +574,14 @@ recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx) if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds)) return (ETXTBSY); + /* new snapshot name must not exist */ + err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val); + if (err == 0) + return (EEXIST); + if (err != ENOENT) + return (err); + if (rbsa->fromguid) { /* if incremental, most recent snapshot must match fromguid */ if (ds->ds_prev == NULL) @@ -620,13 +629,6 @@ recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx) if (err != ENOENT) return (err); - /* new snapshot name must not exist */ - err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, - ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val); - if (err == 0) - return (EEXIST); - if (err != ENOENT) - return (err); return (0); } @@ -661,7 +663,6 @@ recv_existing_sync(void *arg1, void *arg2, dmu_tx_t *tx) dp->dp_spa, tx, "dataset = %lld", dsobj); } - static boolean_t dmu_recv_verify_features(dsl_dataset_t *ds, struct drr_begin *drrb) { @@ -786,7 +787,7 @@ dmu_recv_begin(char *tofs, char *tosnap, char *top_ds, struct drr_begin *drrb, return (err); if (dmu_recv_verify_features(ds, drrb)) { - dsl_dataset_rele(ds, dmu_recv_tag); + dsl_dataset_rele(ds, FTAG); return (ENOTSUP); } @@ -810,7 +811,7 @@ struct restorearg { uint64_t voff; int bufsize; /* amount of memory allocated for buf */ zio_cksum_t cksum; - avl_tree_t guid_to_ds_map; + avl_tree_t *guid_to_ds_map; }; typedef struct guid_map_entry { @@ -887,6 +888,21 @@ find_ds_by_guid(const char *name, void *arg) return (0); } +static void +free_guid_map_onexit(void *arg) +{ + avl_tree_t *ca = arg; + void *cookie = NULL; + guid_map_entry_t *gmep; + + while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) { + dsl_dataset_rele(gmep->gme_ds, ca); + kmem_free(gmep, sizeof (guid_map_entry_t)); + } + avl_destroy(ca); + kmem_free(ca, sizeof (avl_tree_t)); +} + static void * restore_read(struct restorearg *ra, int len) { @@ -1173,7 +1189,7 @@ restore_write_byref(struct restorearg *ra, objset_t *os, */ if (drrwbr->drr_toguid != drrwbr->drr_refguid) { gmesrch.guid = drrwbr->drr_refguid; - if ((gmep = avl_find(&ra->guid_to_ds_map, &gmesrch, + if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch, &where)) == NULL) { return (EINVAL); } @@ -1276,13 +1292,13 @@ restore_free(struct restorearg *ra, objset_t *os, * NB: callers *must* call dmu_recv_end() if this succeeds. */ int -dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp) +dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, + int cleanup_fd, uint64_t *action_handlep) { struct restorearg ra = { 0 }; dmu_replay_record_t *drr; objset_t *os; zio_cksum_t pcksum; - guid_map_entry_t *gmep; int featureflags; if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) @@ -1336,12 +1352,38 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp) /* if this stream is dedup'ed, set up the avl tree for guid mapping */ if (featureflags & DMU_BACKUP_FEATURE_DEDUP) { - avl_create(&ra.guid_to_ds_map, guid_compare, - sizeof (guid_map_entry_t), - offsetof(guid_map_entry_t, avlnode)); - (void) dmu_objset_find(drc->drc_top_ds, find_ds_by_guid, - (void *)&ra.guid_to_ds_map, - DS_FIND_CHILDREN); + minor_t minor; + + if (cleanup_fd == -1) { + ra.err = EBADF; + goto out; + } + ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor); + if (ra.err) { + cleanup_fd = -1; + goto out; + } + + if (*action_handlep == 0) { + ra.guid_to_ds_map = + kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); + avl_create(ra.guid_to_ds_map, guid_compare, + sizeof (guid_map_entry_t), + offsetof(guid_map_entry_t, avlnode)); + (void) dmu_objset_find(drc->drc_top_ds, find_ds_by_guid, + (void *)ra.guid_to_ds_map, + DS_FIND_CHILDREN); + ra.err = zfs_onexit_add_cb(minor, + free_guid_map_onexit, ra.guid_to_ds_map, + action_handlep); + if (ra.err) + goto out; + } else { + ra.err = zfs_onexit_cb_data(minor, *action_handlep, + (void **)&ra.guid_to_ds_map); + if (ra.err) + goto out; + } } /* @@ -1423,6 +1465,9 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp) ASSERT(ra.err != 0); out: + if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1)) + zfs_onexit_fd_rele(cleanup_fd); + if (ra.err != 0) { /* * destroy what we created, so we don't leave it in the @@ -1438,16 +1483,6 @@ out: } } - if (featureflags & DMU_BACKUP_FEATURE_DEDUP) { - void *cookie = NULL; - - while (gmep = avl_destroy_nodes(&ra.guid_to_ds_map, &cookie)) { - dsl_dataset_rele(gmep->gme_ds, &ra.guid_to_ds_map); - kmem_free(gmep, sizeof (guid_map_entry_t)); - } - avl_destroy(&ra.guid_to_ds_map); - } - kmem_free(ra.buf, ra.bufsize); *voffp = ra.voff; return (ra.err); diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index 429c76ae1..023f90e12 100644 --- a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -36,7 +36,9 @@ #include <sys/sa_impl.h> #include <sys/callb.h> -struct prefetch_data { +int zfs_pd_blks_max = 100; + +typedef struct prefetch_data { kmutex_t pd_mtx; kcondvar_t pd_cv; int pd_blks_max; @@ -44,27 +46,26 @@ struct prefetch_data { int pd_flags; boolean_t pd_cancel; boolean_t pd_exited; -}; +} prefetch_data_t; -struct traverse_data { +typedef struct traverse_data { spa_t *td_spa; uint64_t td_objset; blkptr_t *td_rootbp; uint64_t td_min_txg; int td_flags; - struct prefetch_data *td_pfd; + prefetch_data_t *td_pfd; blkptr_cb_t *td_func; void *td_arg; -}; +} traverse_data_t; -static int traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp, +static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, arc_buf_t *buf, uint64_t objset, uint64_t object); -/* ARGSUSED */ static int traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) { - struct traverse_data *td = arg; + traverse_data_t *td = arg; zbookmark_t zb; if (bp->blk_birth == 0) @@ -81,11 +82,10 @@ traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) return (0); } -/* ARGSUSED */ static int traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) { - struct traverse_data *td = arg; + traverse_data_t *td = arg; if (lrc->lrc_txtype == TX_WRITE) { lr_write_t *lr = (lr_write_t *)lrc; @@ -98,8 +98,8 @@ traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) if (claim_txg == 0 || bp->blk_birth < claim_txg) return (0); - SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid, ZB_ZIL_LEVEL, - lr->lr_offset / BP_GET_LSIZE(bp)); + SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid, + ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); (void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL, td->td_arg); @@ -108,7 +108,7 @@ traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) } static void -traverse_zil(struct traverse_data *td, zil_header_t *zh) +traverse_zil(traverse_data_t *td, zil_header_t *zh) { uint64_t claim_txg = zh->zh_claim_txg; zilog_t *zilog; @@ -129,13 +129,13 @@ traverse_zil(struct traverse_data *td, zil_header_t *zh) } static int -traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, +traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb) { zbookmark_t czb; int err = 0, lasterr = 0; arc_buf_t *buf = NULL; - struct prefetch_data *pd = td->td_pfd; + prefetch_data_t *pd = td->td_pfd; boolean_t hard = td->td_flags & TRAVERSE_HARD; if (bp->blk_birth == 0) { @@ -162,6 +162,8 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, if (td->td_flags & TRAVERSE_PRE) { err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp, td->td_arg); + if (err == TRAVERSE_VISIT_NO_CHILDREN) + return (0); if (err) return (err); } @@ -225,8 +227,6 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, return (err); osp = buf->b_data; - traverse_zil(td, &osp->os_zil_header); - dnp = &osp->os_meta_dnode; err = traverse_dnode(td, dnp, buf, zb->zb_objset, DMU_META_DNODE_OBJECT); @@ -262,7 +262,7 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, } static int -traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp, +traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, arc_buf_t *buf, uint64_t objset, uint64_t object) { int j, err = 0, lasterr = 0; @@ -300,7 +300,7 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) { - struct prefetch_data *pfd = arg; + prefetch_data_t *pfd = arg; uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; ASSERT(pfd->pd_blks_fetched >= 0); @@ -330,8 +330,8 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, static void traverse_prefetch_thread(void *arg) { - struct traverse_data *td_main = arg; - struct traverse_data td = *td_main; + traverse_data_t *td_main = arg; + traverse_data_t td = *td_main; zbookmark_t czb; td.td_func = traverse_prefetcher; @@ -353,16 +353,16 @@ traverse_prefetch_thread(void *arg) * in syncing context). */ static int -traverse_impl(spa_t *spa, uint64_t objset, blkptr_t *rootbp, +traverse_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *rootbp, uint64_t txg_start, int flags, blkptr_cb_t func, void *arg) { - struct traverse_data td; - struct prefetch_data pd = { 0 }; + traverse_data_t td; + prefetch_data_t pd = { 0 }; zbookmark_t czb; int err; td.td_spa = spa; - td.td_objset = objset; + td.td_objset = ds ? ds->ds_object : 0; td.td_rootbp = rootbp; td.td_min_txg = txg_start; td.td_func = func; @@ -370,17 +370,28 @@ traverse_impl(spa_t *spa, uint64_t objset, blkptr_t *rootbp, td.td_pfd = &pd; td.td_flags = flags; - pd.pd_blks_max = 100; + pd.pd_blks_max = zfs_pd_blks_max; pd.pd_flags = flags; mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL); + /* See comment on ZIL traversal in dsl_scan_visitds. */ + if (ds != NULL && !dsl_dataset_is_snapshot(ds)) { + objset_t *os; + + err = dmu_objset_from_ds(ds, &os); + if (err) + return (err); + + traverse_zil(&td, &os->os_zil_header); + } + if (!(flags & TRAVERSE_PREFETCH) || 0 == taskq_dispatch(system_taskq, traverse_prefetch_thread, &td, TQ_NOQUEUE)) pd.pd_exited = B_TRUE; - SET_BOOKMARK(&czb, objset, + SET_BOOKMARK(&czb, td.td_objset, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); err = traverse_visitbp(&td, NULL, NULL, rootbp, &czb); @@ -405,7 +416,7 @@ int traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags, blkptr_cb_t func, void *arg) { - return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds->ds_object, + return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, &ds->ds_phys->ds_bp, txg_start, flags, func, arg)); } @@ -423,7 +434,7 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags, boolean_t hard = (flags & TRAVERSE_HARD); /* visit the MOS */ - err = traverse_impl(spa, 0, spa_get_rootblkptr(spa), + err = traverse_impl(spa, NULL, spa_get_rootblkptr(spa), txg_start, flags, func, arg); if (err) return (err); diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 5fc062c16..bd5c71a22 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -186,7 +186,7 @@ dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db, ASSERT(level != 0); db = NULL; } else { - ASSERT(db->db_dnode == dn); + ASSERT(DB_DNODE(db) == dn); ASSERT(db->db_level == level); ASSERT(db->db.db_size == space); ASSERT(db->db_blkid == blkid); @@ -384,7 +384,7 @@ static void dmu_tx_count_dnode(dmu_tx_hold_t *txh) { dnode_t *dn = txh->txh_dnode; - dnode_t *mdn = txh->txh_tx->tx_objset->os_meta_dnode; + dnode_t *mdn = DMU_META_DNODE(txh->txh_tx->tx_objset); uint64_t space = mdn->dn_datablksz + ((mdn->dn_nlevels-1) << mdn->dn_indblkshift); @@ -787,18 +787,24 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) { dmu_tx_hold_t *txh; int match_object = FALSE, match_offset = FALSE; - dnode_t *dn = db->db_dnode; + dnode_t *dn; + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); ASSERT(tx->tx_txg != 0); ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset); ASSERT3U(dn->dn_object, ==, db->db.db_object); - if (tx->tx_anyobj) + if (tx->tx_anyobj) { + DB_DNODE_EXIT(db); return; + } /* XXX No checking on the meta dnode for now */ - if (db->db.db_object == DMU_META_DNODE_OBJECT) + if (db->db.db_object == DMU_META_DNODE_OBJECT) { + DB_DNODE_EXIT(db); return; + } for (txh = list_head(&tx->tx_holds); txh; txh = list_next(&tx->tx_holds, txh)) { @@ -870,9 +876,12 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) ASSERT(!"bad txh_type"); } } - if (match_object && match_offset) + if (match_object && match_offset) { + DB_DNODE_EXIT(db); return; + } } + DB_DNODE_EXIT(db); panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n", (u_longlong_t)db->db.db_object, db->db_level, (u_longlong_t)db->db_blkid); @@ -1355,9 +1364,19 @@ dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow) if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj) dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); - if (sa->sa_force_spill || may_grow || hdl->sa_spill || - ((dmu_buf_impl_t *)hdl->sa_bonus)->db_dnode->dn_have_spill) { + if (sa->sa_force_spill || may_grow || hdl->sa_spill) { ASSERT(tx->tx_txg == 0); dmu_tx_hold_spill(tx, object); + } else { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; + dnode_t *dn; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + if (dn->dn_have_spill) { + ASSERT(tx->tx_txg == 0); + dmu_tx_hold_spill(tx, object); + } + DB_DNODE_EXIT(db); } } diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index c16902d21..850dd5816 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -38,19 +38,33 @@ static int free_range_compar(const void *node1, const void *node2); static kmem_cache_t *dnode_cache; +/* + * Define DNODE_STATS to turn on statistic gathering. By default, it is only + * turned on when DEBUG is also defined. + */ +#ifdef DEBUG +#define DNODE_STATS +#endif /* DEBUG */ + +#ifdef DNODE_STATS +#define DNODE_STAT_ADD(stat) ((stat)++) +#else +#define DNODE_STAT_ADD(stat) /* nothing */ +#endif /* DNODE_STATS */ static dnode_phys_t dnode_phys_zero; int zfs_default_bs = SPA_MINBLOCKSHIFT; int zfs_default_ibs = DN_MAX_INDBLKSHIFT; +static kmem_cbrc_t dnode_move(void *, void *, size_t, void *); + /* ARGSUSED */ static int dnode_cons(void *arg, void *unused, int kmflag) { - int i; dnode_t *dn = arg; - bzero(dn, sizeof (dnode_t)); + int i; rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL); mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL); @@ -59,8 +73,18 @@ dnode_cons(void *arg, void *unused, int kmflag) refcount_create(&dn->dn_holds); refcount_create(&dn->dn_tx_holds); + list_link_init(&dn->dn_link); + + bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr)); + bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels)); + bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift)); + bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype)); + bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk)); + bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen)); + bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz)); for (i = 0; i < TXG_SIZE; i++) { + list_link_init(&dn->dn_dirty_link[i]); avl_create(&dn->dn_ranges[i], free_range_compar, sizeof (free_range_t), offsetof(struct free_range, fr_node)); @@ -69,9 +93,27 @@ dnode_cons(void *arg, void *unused, int kmflag) offsetof(dbuf_dirty_record_t, dr_dirty_node)); } + dn->dn_allocated_txg = 0; + dn->dn_free_txg = 0; + dn->dn_assigned_txg = 0; + dn->dn_dirtyctx = 0; + dn->dn_dirtyctx_firstset = NULL; + dn->dn_bonus = NULL; + dn->dn_have_spill = B_FALSE; + dn->dn_zio = NULL; + dn->dn_oldused = 0; + dn->dn_oldflags = 0; + dn->dn_olduid = 0; + dn->dn_oldgid = 0; + dn->dn_newuid = 0; + dn->dn_newgid = 0; + dn->dn_id_flags = 0; + + dn->dn_dbufs_count = 0; list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); + dn->dn_moved = 0; return (0); } @@ -88,27 +130,56 @@ dnode_dest(void *arg, void *unused) cv_destroy(&dn->dn_notxholds); refcount_destroy(&dn->dn_holds); refcount_destroy(&dn->dn_tx_holds); + ASSERT(!list_link_active(&dn->dn_link)); for (i = 0; i < TXG_SIZE; i++) { + ASSERT(!list_link_active(&dn->dn_dirty_link[i])); avl_destroy(&dn->dn_ranges[i]); list_destroy(&dn->dn_dirty_records[i]); + ASSERT3U(dn->dn_next_nblkptr[i], ==, 0); + ASSERT3U(dn->dn_next_nlevels[i], ==, 0); + ASSERT3U(dn->dn_next_indblkshift[i], ==, 0); + ASSERT3U(dn->dn_next_bonustype[i], ==, 0); + ASSERT3U(dn->dn_rm_spillblk[i], ==, 0); + ASSERT3U(dn->dn_next_bonuslen[i], ==, 0); + ASSERT3U(dn->dn_next_blksz[i], ==, 0); } + ASSERT3U(dn->dn_allocated_txg, ==, 0); + ASSERT3U(dn->dn_free_txg, ==, 0); + ASSERT3U(dn->dn_assigned_txg, ==, 0); + ASSERT3U(dn->dn_dirtyctx, ==, 0); + ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL); + ASSERT3P(dn->dn_bonus, ==, NULL); + ASSERT(!dn->dn_have_spill); + ASSERT3P(dn->dn_zio, ==, NULL); + ASSERT3U(dn->dn_oldused, ==, 0); + ASSERT3U(dn->dn_oldflags, ==, 0); + ASSERT3U(dn->dn_olduid, ==, 0); + ASSERT3U(dn->dn_oldgid, ==, 0); + ASSERT3U(dn->dn_newuid, ==, 0); + ASSERT3U(dn->dn_newgid, ==, 0); + ASSERT3U(dn->dn_id_flags, ==, 0); + + ASSERT3U(dn->dn_dbufs_count, ==, 0); list_destroy(&dn->dn_dbufs); } void dnode_init(void) { + ASSERT(dnode_cache == NULL); dnode_cache = kmem_cache_create("dnode_t", sizeof (dnode_t), 0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0); + kmem_cache_set_move(dnode_cache, dnode_move); } void dnode_fini(void) { kmem_cache_destroy(dnode_cache); + dnode_cache = NULL; } @@ -120,6 +191,7 @@ dnode_verify(dnode_t *dn) ASSERT(dn->dn_phys); ASSERT(dn->dn_objset); + ASSERT(dn->dn_handle->dnh_dnode == dn); ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES); @@ -298,18 +370,29 @@ dnode_setdblksz(dnode_t *dn, int size) static dnode_t * dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, - uint64_t object) + uint64_t object, dnode_handle_t *dnh) { dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP); - (void) dnode_cons(dn, NULL, 0); /* XXX */ - dn->dn_objset = os; + ASSERT(!POINTER_IS_VALID(dn->dn_objset)); + dn->dn_moved = 0; + + /* + * Defer setting dn_objset until the dnode is ready to be a candidate + * for the dnode_move() callback. + */ dn->dn_object = object; dn->dn_dbuf = db; + dn->dn_handle = dnh; dn->dn_phys = dnp; - if (dnp->dn_datablkszsec) + if (dnp->dn_datablkszsec) { dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); + } else { + dn->dn_datablksz = 0; + dn->dn_datablkszsec = 0; + dn->dn_datablkshift = 0; + } dn->dn_indblkshift = dnp->dn_indblkshift; dn->dn_nlevels = dnp->dn_nlevels; dn->dn_type = dnp->dn_type; @@ -325,45 +408,65 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, dmu_zfetch_init(&dn->dn_zfetch, dn); ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES); + mutex_enter(&os->os_lock); list_insert_head(&os->os_dnodes, dn); + membar_producer(); + /* + * Everything else must be valid before assigning dn_objset makes the + * dnode eligible for dnode_move(). + */ + dn->dn_objset = os; mutex_exit(&os->os_lock); arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER); return (dn); } +/* + * Caller must be holding the dnode handle, which is released upon return. + */ static void dnode_destroy(dnode_t *dn) { objset_t *os = dn->dn_objset; -#ifdef ZFS_DEBUG - int i; - - for (i = 0; i < TXG_SIZE; i++) { - ASSERT(!list_link_active(&dn->dn_dirty_link[i])); - ASSERT(NULL == list_head(&dn->dn_dirty_records[i])); - ASSERT(0 == avl_numnodes(&dn->dn_ranges[i])); - } - ASSERT(NULL == list_head(&dn->dn_dbufs)); -#endif ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0); mutex_enter(&os->os_lock); + POINTER_INVALIDATE(&dn->dn_objset); list_remove(&os->os_dnodes, dn); mutex_exit(&os->os_lock); - if (dn->dn_dirtyctx_firstset) { + /* the dnode can no longer move, so we can release the handle */ + zrl_remove(&dn->dn_handle->dnh_zrlock); + + dn->dn_allocated_txg = 0; + dn->dn_free_txg = 0; + dn->dn_assigned_txg = 0; + + dn->dn_dirtyctx = 0; + if (dn->dn_dirtyctx_firstset != NULL) { kmem_free(dn->dn_dirtyctx_firstset, 1); dn->dn_dirtyctx_firstset = NULL; } - dmu_zfetch_rele(&dn->dn_zfetch); - if (dn->dn_bonus) { + if (dn->dn_bonus != NULL) { mutex_enter(&dn->dn_bonus->db_mtx); dbuf_evict(dn->dn_bonus); dn->dn_bonus = NULL; } + dn->dn_zio = NULL; + + dn->dn_have_spill = B_FALSE; + dn->dn_oldused = 0; + dn->dn_oldflags = 0; + dn->dn_olduid = 0; + dn->dn_oldgid = 0; + dn->dn_newuid = 0; + dn->dn_newgid = 0; + dn->dn_id_flags = 0; + + dmu_zfetch_rele(&dn->dn_zfetch); kmem_cache_free(dnode_cache, dn); arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER); } @@ -408,6 +511,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); for (i = 0; i < TXG_SIZE; i++) { + ASSERT3U(dn->dn_next_nblkptr[i], ==, 0); ASSERT3U(dn->dn_next_nlevels[i], ==, 0); ASSERT3U(dn->dn_next_indblkshift[i], ==, 0); ASSERT3U(dn->dn_next_bonuslen[i], ==, 0); @@ -522,9 +626,304 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, mutex_exit(&dn->dn_mtx); } +#ifdef DNODE_STATS +static struct { + uint64_t dms_dnode_invalid; + uint64_t dms_dnode_recheck1; + uint64_t dms_dnode_recheck2; + uint64_t dms_dnode_special; + uint64_t dms_dnode_handle; + uint64_t dms_dnode_rwlock; + uint64_t dms_dnode_active; +} dnode_move_stats; +#endif /* DNODE_STATS */ + +static void +dnode_move_impl(dnode_t *odn, dnode_t *ndn) +{ + int i; + + ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock)); + ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx)); + ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx)); + ASSERT(!RW_LOCK_HELD(&odn->dn_zfetch.zf_rwlock)); + + /* Copy fields. */ + ndn->dn_objset = odn->dn_objset; + ndn->dn_object = odn->dn_object; + ndn->dn_dbuf = odn->dn_dbuf; + ndn->dn_handle = odn->dn_handle; + ndn->dn_phys = odn->dn_phys; + ndn->dn_type = odn->dn_type; + ndn->dn_bonuslen = odn->dn_bonuslen; + ndn->dn_bonustype = odn->dn_bonustype; + ndn->dn_nblkptr = odn->dn_nblkptr; + ndn->dn_checksum = odn->dn_checksum; + ndn->dn_compress = odn->dn_compress; + ndn->dn_nlevels = odn->dn_nlevels; + ndn->dn_indblkshift = odn->dn_indblkshift; + ndn->dn_datablkshift = odn->dn_datablkshift; + ndn->dn_datablkszsec = odn->dn_datablkszsec; + ndn->dn_datablksz = odn->dn_datablksz; + ndn->dn_maxblkid = odn->dn_maxblkid; + bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0], + sizeof (odn->dn_next_nblkptr)); + bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0], + sizeof (odn->dn_next_nlevels)); + bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0], + sizeof (odn->dn_next_indblkshift)); + bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0], + sizeof (odn->dn_next_bonustype)); + bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0], + sizeof (odn->dn_rm_spillblk)); + bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0], + sizeof (odn->dn_next_bonuslen)); + bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0], + sizeof (odn->dn_next_blksz)); + for (i = 0; i < TXG_SIZE; i++) { + list_move_tail(&ndn->dn_dirty_records[i], + &odn->dn_dirty_records[i]); + } + bcopy(&odn->dn_ranges[0], &ndn->dn_ranges[0], sizeof (odn->dn_ranges)); + ndn->dn_allocated_txg = odn->dn_allocated_txg; + ndn->dn_free_txg = odn->dn_free_txg; + ndn->dn_assigned_txg = odn->dn_assigned_txg; + ndn->dn_dirtyctx = odn->dn_dirtyctx; + ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset; + ASSERT(refcount_count(&odn->dn_tx_holds) == 0); + refcount_transfer(&ndn->dn_holds, &odn->dn_holds); + ASSERT(list_is_empty(&ndn->dn_dbufs)); + list_move_tail(&ndn->dn_dbufs, &odn->dn_dbufs); + ndn->dn_dbufs_count = odn->dn_dbufs_count; + ndn->dn_bonus = odn->dn_bonus; + ndn->dn_have_spill = odn->dn_have_spill; + ndn->dn_zio = odn->dn_zio; + ndn->dn_oldused = odn->dn_oldused; + ndn->dn_oldflags = odn->dn_oldflags; + ndn->dn_olduid = odn->dn_olduid; + ndn->dn_oldgid = odn->dn_oldgid; + ndn->dn_newuid = odn->dn_newuid; + ndn->dn_newgid = odn->dn_newgid; + ndn->dn_id_flags = odn->dn_id_flags; + dmu_zfetch_init(&ndn->dn_zfetch, NULL); + list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream); + ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode; + ndn->dn_zfetch.zf_stream_cnt = odn->dn_zfetch.zf_stream_cnt; + ndn->dn_zfetch.zf_alloc_fail = odn->dn_zfetch.zf_alloc_fail; + + /* + * Update back pointers. Updating the handle fixes the back pointer of + * every descendant dbuf as well as the bonus dbuf. + */ + ASSERT(ndn->dn_handle->dnh_dnode == odn); + ndn->dn_handle->dnh_dnode = ndn; + if (ndn->dn_zfetch.zf_dnode == odn) { + ndn->dn_zfetch.zf_dnode = ndn; + } + + /* + * Invalidate the original dnode by clearing all of its back pointers. + */ + odn->dn_dbuf = NULL; + odn->dn_handle = NULL; + list_create(&odn->dn_dbufs, sizeof (dmu_buf_impl_t), + offsetof(dmu_buf_impl_t, db_link)); + odn->dn_dbufs_count = 0; + odn->dn_bonus = NULL; + odn->dn_zfetch.zf_dnode = NULL; + + /* + * Set the low bit of the objset pointer to ensure that dnode_move() + * recognizes the dnode as invalid in any subsequent callback. + */ + POINTER_INVALIDATE(&odn->dn_objset); + + /* + * Satisfy the destructor. + */ + for (i = 0; i < TXG_SIZE; i++) { + list_create(&odn->dn_dirty_records[i], + sizeof (dbuf_dirty_record_t), + offsetof(dbuf_dirty_record_t, dr_dirty_node)); + odn->dn_ranges[i].avl_root = NULL; + odn->dn_ranges[i].avl_numnodes = 0; + odn->dn_next_nlevels[i] = 0; + odn->dn_next_indblkshift[i] = 0; + odn->dn_next_bonustype[i] = 0; + odn->dn_rm_spillblk[i] = 0; + odn->dn_next_bonuslen[i] = 0; + odn->dn_next_blksz[i] = 0; + } + odn->dn_allocated_txg = 0; + odn->dn_free_txg = 0; + odn->dn_assigned_txg = 0; + odn->dn_dirtyctx = 0; + odn->dn_dirtyctx_firstset = NULL; + odn->dn_have_spill = B_FALSE; + odn->dn_zio = NULL; + odn->dn_oldused = 0; + odn->dn_oldflags = 0; + odn->dn_olduid = 0; + odn->dn_oldgid = 0; + odn->dn_newuid = 0; + odn->dn_newgid = 0; + odn->dn_id_flags = 0; + + /* + * Mark the dnode. + */ + ndn->dn_moved = 1; + odn->dn_moved = (uint8_t)-1; +} + +#ifdef _KERNEL +/*ARGSUSED*/ +static kmem_cbrc_t +dnode_move(void *buf, void *newbuf, size_t size, void *arg) +{ + dnode_t *odn = buf, *ndn = newbuf; + objset_t *os; + int64_t refcount; + uint32_t dbufs; + + /* + * The dnode is on the objset's list of known dnodes if the objset + * pointer is valid. We set the low bit of the objset pointer when + * freeing the dnode to invalidate it, and the memory patterns written + * by kmem (baddcafe and deadbeef) set at least one of the two low bits. + * A newly created dnode sets the objset pointer last of all to indicate + * that the dnode is known and in a valid state to be moved by this + * function. + */ + os = odn->dn_objset; + if (!POINTER_IS_VALID(os)) { + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_invalid); + return (KMEM_CBRC_DONT_KNOW); + } + + /* + * Ensure that the objset does not go away during the move. + */ + rw_enter(&os_lock, RW_WRITER); + if (os != odn->dn_objset) { + rw_exit(&os_lock); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck1); + return (KMEM_CBRC_DONT_KNOW); + } + + /* + * If the dnode is still valid, then so is the objset. We know that no + * valid objset can be freed while we hold os_lock, so we can safely + * ensure that the objset remains in use. + */ + mutex_enter(&os->os_lock); + + /* + * Recheck the objset pointer in case the dnode was removed just before + * acquiring the lock. + */ + if (os != odn->dn_objset) { + mutex_exit(&os->os_lock); + rw_exit(&os_lock); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck2); + return (KMEM_CBRC_DONT_KNOW); + } + + /* + * At this point we know that as long as we hold os->os_lock, the dnode + * cannot be freed and fields within the dnode can be safely accessed. + * The objset listing this dnode cannot go away as long as this dnode is + * on its list. + */ + rw_exit(&os_lock); + if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) { + mutex_exit(&os->os_lock); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_special); + return (KMEM_CBRC_NO); + } + ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */ + + /* + * Lock the dnode handle to prevent the dnode from obtaining any new + * holds. This also prevents the descendant dbufs and the bonus dbuf + * from accessing the dnode, so that we can discount their holds. The + * handle is safe to access because we know that while the dnode cannot + * go away, neither can its handle. Once we hold dnh_zrlock, we can + * safely move any dnode referenced only by dbufs. + */ + if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) { + mutex_exit(&os->os_lock); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_handle); + return (KMEM_CBRC_LATER); + } + + /* + * Ensure a consistent view of the dnode's holds and the dnode's dbufs. + * We need to guarantee that there is a hold for every dbuf in order to + * determine whether the dnode is actively referenced. Falsely matching + * a dbuf to an active hold would lead to an unsafe move. It's possible + * that a thread already having an active dnode hold is about to add a + * dbuf, and we can't compare hold and dbuf counts while the add is in + * progress. + */ + if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) { + zrl_exit(&odn->dn_handle->dnh_zrlock); + mutex_exit(&os->os_lock); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_rwlock); + return (KMEM_CBRC_LATER); + } + + /* + * A dbuf may be removed (evicted) without an active dnode hold. In that + * case, the dbuf count is decremented under the handle lock before the + * dbuf's hold is released. This order ensures that if we count the hold + * after the dbuf is removed but before its hold is released, we will + * treat the unmatched hold as active and exit safely. If we count the + * hold before the dbuf is removed, the hold is discounted, and the + * removal is blocked until the move completes. + */ + refcount = refcount_count(&odn->dn_holds); + ASSERT(refcount >= 0); + dbufs = odn->dn_dbufs_count; + + /* We can't have more dbufs than dnode holds. */ + ASSERT3U(dbufs, <=, refcount); + DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount, + uint32_t, dbufs); + + if (refcount > dbufs) { + rw_exit(&odn->dn_struct_rwlock); + zrl_exit(&odn->dn_handle->dnh_zrlock); + mutex_exit(&os->os_lock); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_active); + return (KMEM_CBRC_LATER); + } + + rw_exit(&odn->dn_struct_rwlock); + + /* + * At this point we know that anyone with a hold on the dnode is not + * actively referencing it. The dnode is known and in a valid state to + * move. We're holding the locks needed to execute the critical section. + */ + dnode_move_impl(odn, ndn); + + list_link_replace(&odn->dn_link, &ndn->dn_link); + /* If the dnode was safe to move, the refcount cannot have changed. */ + ASSERT(refcount == refcount_count(&ndn->dn_holds)); + ASSERT(dbufs == ndn->dn_dbufs_count); + zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */ + mutex_exit(&os->os_lock); + + return (KMEM_CBRC_YES); +} +#endif /* _KERNEL */ + void -dnode_special_close(dnode_t *dn) +dnode_special_close(dnode_handle_t *dnh) { + dnode_t *dn = dnh->dnh_dnode; + /* * Wait for final references to the dnode to clear. This can * only happen if the arc is asyncronously evicting state that @@ -533,13 +932,19 @@ dnode_special_close(dnode_t *dn) */ while (refcount_count(&dn->dn_holds) > 0) delay(1); - dnode_destroy(dn); + zrl_add(&dnh->dnh_zrlock); + dnode_destroy(dn); /* implicit zrl_remove() */ + zrl_destroy(&dnh->dnh_zrlock); + dnh->dnh_dnode = NULL; } dnode_t * -dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object) +dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object, + dnode_handle_t *dnh) { - dnode_t *dn = dnode_create(os, dnp, NULL, object); + dnode_t *dn = dnode_create(os, dnp, NULL, object, dnh); + dnh->dnh_dnode = dn; + zrl_init(&dnh->dnh_zrlock); DNODE_VERIFY(dn); return (dn); } @@ -547,34 +952,43 @@ dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object) static void dnode_buf_pageout(dmu_buf_t *db, void *arg) { - dnode_t **children_dnodes = arg; + dnode_children_t *children_dnodes = arg; int i; int epb = db->db_size >> DNODE_SHIFT; + ASSERT(epb == children_dnodes->dnc_count); + for (i = 0; i < epb; i++) { - dnode_t *dn = children_dnodes[i]; - int n; + dnode_handle_t *dnh = &children_dnodes->dnc_children[i]; + dnode_t *dn; - if (dn == NULL) + /* + * The dnode handle lock guards against the dnode moving to + * another valid address, so there is no need here to guard + * against changes to or from NULL. + */ + if (dnh->dnh_dnode == NULL) { + zrl_destroy(&dnh->dnh_zrlock); continue; -#ifdef ZFS_DEBUG + } + + zrl_add(&dnh->dnh_zrlock); + dn = dnh->dnh_dnode; /* * If there are holds on this dnode, then there should * be holds on the dnode's containing dbuf as well; thus - * it wouldn't be eligable for eviction and this function + * it wouldn't be eligible for eviction and this function * would not have been called. */ ASSERT(refcount_is_zero(&dn->dn_holds)); - ASSERT(list_head(&dn->dn_dbufs) == NULL); ASSERT(refcount_is_zero(&dn->dn_tx_holds)); - for (n = 0; n < TXG_SIZE; n++) - ASSERT(!list_link_active(&dn->dn_dirty_link[n])); -#endif - children_dnodes[i] = NULL; - dnode_destroy(dn); + dnode_destroy(dn); /* implicit zrl_remove() */ + zrl_destroy(&dnh->dnh_zrlock); + dnh->dnh_dnode = NULL; } - kmem_free(children_dnodes, epb * sizeof (dnode_t *)); + kmem_free(children_dnodes, sizeof (dnode_children_t) + + (epb - 1) * sizeof (dnode_handle_t)); } /* @@ -593,7 +1007,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, uint64_t blk; dnode_t *mdn, *dn; dmu_buf_impl_t *db; - dnode_t **children_dnodes; + dnode_children_t *children_dnodes; + dnode_handle_t *dnh; /* * If you are holding the spa config lock as writer, you shouldn't @@ -603,12 +1018,11 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, */ ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 || (spa_is_root(os->os_spa) && - spa_config_held(os->os_spa, SCL_STATE, RW_WRITER) && - !spa_config_held(os->os_spa, SCL_ZIO, RW_WRITER))); + spa_config_held(os->os_spa, SCL_STATE, RW_WRITER))); if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) { dn = (object == DMU_USERUSED_OBJECT) ? - os->os_userused_dnode : os->os_groupused_dnode; + DMU_USERUSED_DNODE(os) : DMU_GROUPUSED_DNODE(os); if (dn == NULL) return (ENOENT); type = dn->dn_type; @@ -625,7 +1039,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, if (object == 0 || object >= DN_MAX_OBJECT) return (EINVAL); - mdn = os->os_meta_dnode; + mdn = DMU_META_DNODE(os); + ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT); DNODE_VERIFY(mdn); @@ -652,26 +1067,39 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, idx = object & (epb-1); + ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE); children_dnodes = dmu_buf_get_user(&db->db); if (children_dnodes == NULL) { - dnode_t **winner; - children_dnodes = kmem_zalloc(epb * sizeof (dnode_t *), - KM_SLEEP); + int i; + dnode_children_t *winner; + children_dnodes = kmem_alloc(sizeof (dnode_children_t) + + (epb - 1) * sizeof (dnode_handle_t), KM_SLEEP); + children_dnodes->dnc_count = epb; + dnh = &children_dnodes->dnc_children[0]; + for (i = 0; i < epb; i++) { + zrl_init(&dnh[i].dnh_zrlock); + dnh[i].dnh_dnode = NULL; + } if (winner = dmu_buf_set_user(&db->db, children_dnodes, NULL, dnode_buf_pageout)) { - kmem_free(children_dnodes, epb * sizeof (dnode_t *)); + kmem_free(children_dnodes, sizeof (dnode_children_t) + + (epb - 1) * sizeof (dnode_handle_t)); children_dnodes = winner; } } + ASSERT(children_dnodes->dnc_count == epb); - if ((dn = children_dnodes[idx]) == NULL) { - dnode_phys_t *dnp = (dnode_phys_t *)db->db.db_data+idx; + dnh = &children_dnodes->dnc_children[idx]; + zrl_add(&dnh->dnh_zrlock); + if ((dn = dnh->dnh_dnode) == NULL) { + dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx; dnode_t *winner; - dn = dnode_create(os, dnp, db, object); - winner = atomic_cas_ptr(&children_dnodes[idx], NULL, dn); + dn = dnode_create(os, phys, db, object, dnh); + winner = atomic_cas_ptr(&dnh->dnh_dnode, NULL, dn); if (winner != NULL) { - dnode_destroy(dn); + zrl_add(&dnh->dnh_zrlock); + dnode_destroy(dn); /* implicit zrl_remove() */ dn = winner; } } @@ -683,13 +1111,16 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, ((flag & DNODE_MUST_BE_FREE) && (type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) { mutex_exit(&dn->dn_mtx); + zrl_remove(&dnh->dnh_zrlock); dbuf_rele(db, FTAG); return (type == DMU_OT_NONE ? ENOENT : EEXIST); } mutex_exit(&dn->dn_mtx); if (refcount_add(&dn->dn_holds, tag) == 1) - dbuf_add_ref(db, dn); + dbuf_add_ref(db, dnh); + /* Now we can rely on the hold to prevent the dnode from moving. */ + zrl_remove(&dnh->dnh_zrlock); DNODE_VERIFY(dn); ASSERT3P(dn->dn_dbuf, ==, db); @@ -731,13 +1162,37 @@ void dnode_rele(dnode_t *dn, void *tag) { uint64_t refs; + /* Get while the hold prevents the dnode from moving. */ + dmu_buf_impl_t *db = dn->dn_dbuf; + dnode_handle_t *dnh = dn->dn_handle; mutex_enter(&dn->dn_mtx); refs = refcount_remove(&dn->dn_holds, tag); mutex_exit(&dn->dn_mtx); + + /* + * It's unsafe to release the last hold on a dnode by dnode_rele() or + * indirectly by dbuf_rele() while relying on the dnode handle to + * prevent the dnode from moving, since releasing the last hold could + * result in the dnode's parent dbuf evicting its dnode handles. For + * that reason anyone calling dnode_rele() or dbuf_rele() without some + * other direct or indirect hold on the dnode must first drop the dnode + * handle. + */ + ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread); + /* NOTE: the DNODE_DNODE does not have a dn_dbuf */ - if (refs == 0 && dn->dn_dbuf) - dbuf_rele(dn->dn_dbuf, dn); + if (refs == 0 && db != NULL) { + /* + * Another thread could add a hold to the dnode handle in + * dnode_hold_impl() while holding the parent dbuf. Since the + * hold on the parent dbuf prevents the handle from being + * destroyed, the hold on the handle is OK. We can't yet assert + * that the handle has zero references, but that will be + * asserted anyway when the handle gets destroyed. + */ + dbuf_rele(db, dnh); + } } void @@ -756,7 +1211,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) #ifdef ZFS_DEBUG mutex_enter(&dn->dn_mtx); ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg); - /* ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg); */ + ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg); mutex_exit(&dn->dn_mtx); #endif @@ -795,7 +1250,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) /* * The dnode maintains a hold on its containing dbuf as * long as there are holds on it. Each instantiated child - * dbuf maintaines a hold on the dnode. When the last child + * dbuf maintains a hold on the dnode. When the last child * drops its hold, the dnode will drop its hold on the * containing dbuf. We add a "dirty hold" here so that the * dnode will hang around after we finish processing its diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c index f9ec9f602..2ee990a3b 100644 --- a/module/zfs/dnode_sync.c +++ b/module/zfs/dnode_sync.c @@ -76,7 +76,11 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) if (child == NULL) continue; - ASSERT3P(child->db_dnode, ==, dn); +#ifdef DEBUG + DB_DNODE_ENTER(child); + ASSERT3P(DB_DNODE(child), ==, dn); + DB_DNODE_EXIT(child); +#endif /* DEBUG */ if (child->db_parent && child->db_parent != dn->dn_dbuf) { ASSERT(child->db_parent->db_level == db->db_level); ASSERT(child->db_blkptr != @@ -135,15 +139,18 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) int off, num; int i, err, epbs; uint64_t txg = tx->tx_txg; + dnode_t *dn; - epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; off = start - (db->db_blkid * 1<<epbs); num = end - start + 1; ASSERT3U(off, >=, 0); ASSERT3U(num, >=, 0); ASSERT3U(db->db_level, >, 0); - ASSERT3U(db->db.db_size, ==, 1<<db->db_dnode->dn_phys->dn_indblkshift); + ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT); ASSERT(db->db_blkptr != NULL); @@ -155,10 +162,10 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) ASSERT(db->db_level == 1); - rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER); - err = dbuf_hold_impl(db->db_dnode, db->db_level-1, + rw_enter(&dn->dn_struct_rwlock, RW_READER); + err = dbuf_hold_impl(dn, db->db_level-1, (db->db_blkid << epbs) + i, TRUE, FTAG, &child); - rw_exit(&db->db_dnode->dn_struct_rwlock); + rw_exit(&dn->dn_struct_rwlock); if (err == ENOENT) continue; ASSERT(err == 0); @@ -200,6 +207,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) dbuf_rele(child, FTAG); } + DB_DNODE_EXIT(db); } #endif @@ -209,7 +217,7 @@ static int free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, dmu_tx_t *tx) { - dnode_t *dn = db->db_dnode; + dnode_t *dn; blkptr_t *bp; dmu_buf_impl_t *subdb; uint64_t start, end, dbstart, dbend, i; @@ -230,7 +238,9 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, dbuf_release_bp(db); bp = (blkptr_t *)db->db.db_data; - epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; shift = (db->db_level - 1) * epbs; dbstart = db->db_blkid << epbs; start = blkid >> shift; @@ -253,6 +263,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, blocks_freed = free_blocks(dn, bp, end-start+1, tx); arc_buf_freeze(db->db_buf); ASSERT(all || blocks_freed == 0 || db->db_last_dirty); + DB_DNODE_EXIT(db); return (all ? ALL : blocks_freed); } @@ -272,6 +283,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, } dbuf_rele(subdb, FTAG); } + DB_DNODE_EXIT(db); arc_buf_freeze(db->db_buf); #ifdef ZFS_DEBUG bp -= (end-start)+1; @@ -375,7 +387,11 @@ dnode_evict_dbufs(dnode_t *dn) for (; db != ▮ db = list_head(&dn->dn_dbufs)) { list_remove(&dn->dn_dbufs, db); list_insert_tail(&dn->dn_dbufs, db); - ASSERT3P(db->db_dnode, ==, dn); +#ifdef DEBUG + DB_DNODE_ENTER(db); + ASSERT3P(DB_DNODE(db), ==, dn); + DB_DNODE_EXIT(db); +#endif /* DEBUG */ mutex_enter(&db->db_mtx); if (db->db_state == DB_EVICTING) { diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index ddd83576c..59ac4a609 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -37,15 +37,11 @@ #include <sys/zfs_ioctl.h> #include <sys/spa.h> #include <sys/zfs_znode.h> +#include <sys/zfs_onexit.h> #include <sys/zvol.h> #include <sys/dsl_scan.h> #include <sys/dsl_deadlist.h> -/* - * Enable/disable prefetching of dedup-ed blocks which are going to be freed. - */ -int zfs_dedup_prefetch = 1; - static char *dsl_reaper = "the grim reaper"; static dsl_checkfunc_t dsl_dataset_destroy_begin_check; @@ -253,8 +249,7 @@ dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp, if (blk_birth <= dsl_dataset_prev_snap_txg(ds)) return (B_FALSE); - if (zfs_dedup_prefetch && bp && BP_GET_DEDUP(bp)) - ddt_prefetch(dsl_dataset_get_spa(ds), bp); + ddt_prefetch(dsl_dataset_get_spa(ds), bp); return (B_TRUE); } @@ -372,6 +367,7 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, dmu_buf_t *dbuf; dsl_dataset_t *ds; int err; + dmu_object_info_t doi; ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || dsl_pool_sync_context(dp)); @@ -379,6 +375,12 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, err = dmu_bonus_hold(mos, dsobj, tag, &dbuf); if (err) return (err); + + /* Make sure dsobj has the correct object type. */ + dmu_object_info_from_db(dbuf, &doi); + if (doi.doi_type != DMU_OT_DSL_DATASET) + return (EINVAL); + ds = dmu_buf_get_user(dbuf); if (ds == NULL) { dsl_dataset_t *winner; @@ -881,6 +883,21 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, dsl_dir_close(dd, FTAG); + /* + * If we are creating a clone, make sure we zero out any stale + * data from the origin snapshots zil header. + */ + if (origin != NULL) { + dsl_dataset_t *ds; + objset_t *os; + + VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); + VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os)); + bzero(&os->os_zil_header, sizeof (os->os_zil_header)); + dsl_dataset_dirty(ds, tx); + dsl_dataset_rele(ds, FTAG); + } + return (dsobj); } @@ -1081,11 +1098,16 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) */ (void) dmu_free_object(os, obj); } + if (err != ESRCH) + goto out; /* - * We need to sync out all in-flight IO before we try to evict - * (the dataset evict func is trying to clear the cached entries - * for this dataset in the ARC). + * Only the ZIL knows how to free log blocks. + */ + zil_destroy(dmu_objset_zil(os), B_FALSE); + + /* + * Sync out all in-flight IO. */ txg_wait_synced(dd->dd_pool, 0); @@ -1103,9 +1125,6 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) count == 0); } - if (err != ESRCH) - goto out; - rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd); rw_exit(&dd->dd_pool->dp_config_rwlock); @@ -1356,6 +1375,11 @@ dsl_dataset_origin_check(struct dsl_ds_destroyarg *dsda, void *tag, return (0); } +/* + * If you add new checks here, you may need to add + * additional checks to the "temporary" case in + * snapshot_check() in dmu_objset.c. + */ /* ARGSUSED */ int dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) @@ -1597,21 +1621,23 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) dsl_pool_t *dp = ds->ds_dir->dd_pool; objset_t *mos = dp->dp_meta_objset; dsl_dataset_t *ds_prev = NULL; + boolean_t wont_destroy; uint64_t obj; - ASSERT(ds->ds_owner); + wont_destroy = (dsda->defer && + (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1)); + + ASSERT(ds->ds_owner || wont_destroy); ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1); ASSERT(ds->ds_prev == NULL || ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object); ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg); - if (dsda->defer) { + if (wont_destroy) { ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); - if (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1) { - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY; - return; - } + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY; + return; } /* signal any waiters that this dataset is going away */ @@ -1620,11 +1646,6 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) cv_broadcast(&ds->ds_exclusive_cv); mutex_exit(&ds->ds_lock); - if (ds->ds_objset) { - dmu_objset_evict(ds->ds_objset); - ds->ds_objset = NULL; - } - /* Remove our reservation */ if (ds->ds_reserved != 0) { dsl_prop_setarg_t psa; @@ -1850,6 +1871,15 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) } } + /* + * This must be done after the dsl_traverse(), because it will + * re-open the objset. + */ + if (ds->ds_objset) { + dmu_objset_evict(ds->ds_objset); + ds->ds_objset = NULL; + } + if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) { /* Erase the link in the dir */ dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); @@ -1928,7 +1958,7 @@ dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx) */ ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds)); asize = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved); - if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, FALSE)) + if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) return (ENOSPC); /* @@ -2224,8 +2254,21 @@ dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds) if (ds->ds_prev == NULL) return (B_FALSE); if (ds->ds_phys->ds_bp.blk_birth > - ds->ds_prev->ds_phys->ds_creation_txg) - return (B_TRUE); + ds->ds_prev->ds_phys->ds_creation_txg) { + objset_t *os, *os_prev; + /* + * It may be that only the ZIL differs, because it was + * reset in the head. Don't count that as being + * modified. + */ + if (dmu_objset_from_ds(ds, &os) != 0) + return (B_TRUE); + if (dmu_objset_from_ds(ds->ds_prev, &os_prev) != 0) + return (B_TRUE); + return (bcmp(&os->os_phys->os_meta_dnode, + &os_prev->os_phys->os_meta_dnode, + sizeof (os->os_phys->os_meta_dnode)) != 0); + } return (B_FALSE); } @@ -3144,9 +3187,14 @@ dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, ASSERT(clone->ds_owner); ASSERT(origin_head->ds_owner); retry: - /* Need exclusive access for the swap */ - rw_enter(&clone->ds_rwlock, RW_WRITER); - if (!rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) { + /* + * Need exclusive access for the swap. If we're swapping these + * datasets back after an error, we already hold the locks. + */ + if (!RW_WRITE_HELD(&clone->ds_rwlock)) + rw_enter(&clone->ds_rwlock, RW_WRITER); + if (!RW_WRITE_HELD(&origin_head->ds_rwlock) && + !rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) { rw_exit(&clone->ds_rwlock); rw_enter(&origin_head->ds_rwlock, RW_WRITER); if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) { @@ -3411,22 +3459,41 @@ dsl_dataset_set_reservation(const char *dsname, zprop_source_t source, return (err); } -struct dsl_ds_holdarg { - dsl_sync_task_group_t *dstg; - char *htag; - char *snapname; - boolean_t recursive; - boolean_t gotone; - boolean_t temphold; - char failed[MAXPATHLEN]; -}; +typedef struct zfs_hold_cleanup_arg { + dsl_pool_t *dp; + uint64_t dsobj; + char htag[MAXNAMELEN]; +} zfs_hold_cleanup_arg_t; + +static void +dsl_dataset_user_release_onexit(void *arg) +{ + zfs_hold_cleanup_arg_t *ca = arg; + + (void) dsl_dataset_user_release_tmp(ca->dp, ca->dsobj, ca->htag, + B_TRUE); + kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t)); +} + +void +dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag, + minor_t minor) +{ + zfs_hold_cleanup_arg_t *ca; + + ca = kmem_alloc(sizeof (zfs_hold_cleanup_arg_t), KM_SLEEP); + ca->dp = ds->ds_dir->dd_pool; + ca->dsobj = ds->ds_object; + (void) strlcpy(ca->htag, htag, sizeof (ca->htag)); + VERIFY3U(0, ==, zfs_onexit_add_cb(minor, + dsl_dataset_user_release_onexit, ca, NULL)); +} /* - * The max length of a temporary tag prefix is the number of hex digits - * required to express UINT64_MAX plus one for the hyphen. + * If you add new checks here, you may need to add + * additional checks to the "temporary" case in + * snapshot_check() in dmu_objset.c. */ -#define MAX_TAG_PREFIX_LEN 17 - static int dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx) { @@ -3461,7 +3528,7 @@ dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx) return (error); } -static void +void dsl_dataset_user_hold_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; @@ -3524,13 +3591,41 @@ dsl_dataset_user_hold_one(const char *dsname, void *arg) } int +dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag, + boolean_t temphold) +{ + struct dsl_ds_holdarg *ha; + int error; + + ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); + ha->htag = htag; + ha->temphold = temphold; + error = dsl_sync_task_do(ds->ds_dir->dd_pool, + dsl_dataset_user_hold_check, dsl_dataset_user_hold_sync, + ds, ha, 0); + kmem_free(ha, sizeof (struct dsl_ds_holdarg)); + + return (error); +} + +int dsl_dataset_user_hold(char *dsname, char *snapname, char *htag, - boolean_t recursive, boolean_t temphold) + boolean_t recursive, boolean_t temphold, int cleanup_fd) { struct dsl_ds_holdarg *ha; dsl_sync_task_t *dst; spa_t *spa; int error; + minor_t minor = 0; + + if (cleanup_fd != -1) { + /* Currently we only support cleanup-on-exit of tempholds. */ + if (!temphold) + return (EINVAL); + error = zfs_onexit_fd_hold(cleanup_fd, &minor); + if (error) + return (error); + } ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); @@ -3539,6 +3634,8 @@ dsl_dataset_user_hold(char *dsname, char *snapname, char *htag, error = spa_open(dsname, &spa, FTAG); if (error) { kmem_free(ha, sizeof (struct dsl_ds_holdarg)); + if (cleanup_fd != -1) + zfs_onexit_fd_rele(cleanup_fd); return (error); } @@ -3547,6 +3644,7 @@ dsl_dataset_user_hold(char *dsname, char *snapname, char *htag, ha->snapname = snapname; ha->recursive = recursive; ha->temphold = temphold; + if (recursive) { error = dmu_objset_find(dsname, dsl_dataset_user_hold_one, ha, DS_FIND_CHILDREN); @@ -3563,6 +3661,12 @@ dsl_dataset_user_hold(char *dsname, char *snapname, char *htag, if (dst->dst_err) { dsl_dataset_name(ds, ha->failed); *strchr(ha->failed, '@') = '\0'; + } else if (error == 0 && minor != 0 && temphold) { + /* + * If this hold is to be released upon process exit, + * register that action now. + */ + dsl_register_onexit_hold_cleanup(ds, htag, minor); } dsl_dataset_rele(ds, ha->dstg); } @@ -3574,8 +3678,11 @@ dsl_dataset_user_hold(char *dsname, char *snapname, char *htag, (void) strlcpy(dsname, ha->failed, sizeof (ha->failed)); dsl_sync_task_group_destroy(ha->dstg); + kmem_free(ha, sizeof (struct dsl_ds_holdarg)); spa_close(spa, FTAG); + if (cleanup_fd != -1) + zfs_onexit_fd_rele(cleanup_fd); return (error); } @@ -3667,11 +3774,6 @@ dsl_dataset_user_release_sync(void *arg1, void *tag, dmu_tx_t *tx) uint64_t refs; int error; - if (ds->ds_objset) { - dmu_objset_evict(ds->ds_objset); - ds->ds_objset = NULL; - } - mutex_enter(&ds->ds_lock); ds->ds_userrefs--; refs = ds->ds_userrefs; @@ -3831,10 +3933,12 @@ top: } /* - * Called at spa_load time to release a stale temporary user hold. + * Called at spa_load time (with retry == B_FALSE) to release a stale + * temporary user hold. Also called by the onexit code (with retry == B_TRUE). */ int -dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag) +dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag, + boolean_t retry) { dsl_dataset_t *ds; char *snap; @@ -3842,20 +3946,36 @@ dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag) int namelen; int error; - rw_enter(&dp->dp_config_rwlock, RW_READER); - error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); - rw_exit(&dp->dp_config_rwlock); - if (error) - return (error); - namelen = dsl_dataset_namelen(ds)+1; - name = kmem_alloc(namelen, KM_SLEEP); - dsl_dataset_name(ds, name); - dsl_dataset_rele(ds, FTAG); + do { + rw_enter(&dp->dp_config_rwlock, RW_READER); + error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); + rw_exit(&dp->dp_config_rwlock); + if (error) + return (error); + namelen = dsl_dataset_namelen(ds)+1; + name = kmem_alloc(namelen, KM_SLEEP); + dsl_dataset_name(ds, name); + dsl_dataset_rele(ds, FTAG); - snap = strchr(name, '@'); - *snap = '\0'; - ++snap; - return (dsl_dataset_user_release(name, snap, htag, B_FALSE)); + snap = strchr(name, '@'); + *snap = '\0'; + ++snap; + error = dsl_dataset_user_release(name, snap, htag, B_FALSE); + kmem_free(name, namelen); + + /* + * The object can't have been destroyed because we have a hold, + * but it might have been renamed, resulting in ENOENT. Retry + * if we've been requested to do so. + * + * It would be nice if we could use the dsobj all the way + * through and avoid ENOENT entirely. But we might need to + * unmount the snapshot, and there's currently no way to lookup + * a vfsp using a ZFS object id. + */ + } while ((error == ENOENT) && retry); + + return (error); } int diff --git a/module/zfs/dsl_deleg.c b/module/zfs/dsl_deleg.c index 85490c8d5..529fb052f 100644 --- a/module/zfs/dsl_deleg.c +++ b/module/zfs/dsl_deleg.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. */ /* @@ -528,9 +528,8 @@ dsl_load_user_sets(objset_t *mos, uint64_t zapobj, avl_tree_t *avl, * Check if user has requested permission. */ int -dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr) +dsl_deleg_access_impl(dsl_dataset_t *ds, const char *perm, cred_t *cr) { - dsl_dataset_t *ds; dsl_dir_t *dd; dsl_pool_t *dp; void *cookie; @@ -540,23 +539,15 @@ dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr) avl_tree_t permsets; perm_set_t *setnode; - error = dsl_dataset_hold(dsname, FTAG, &ds); - if (error) - return (error); - dp = ds->ds_dir->dd_pool; mos = dp->dp_meta_objset; - if (dsl_delegation_on(mos) == B_FALSE) { - dsl_dataset_rele(ds, FTAG); + if (dsl_delegation_on(mos) == B_FALSE) return (ECANCELED); - } if (spa_version(dmu_objset_spa(dp->dp_meta_objset)) < - SPA_VERSION_DELEGATED_PERMS) { - dsl_dataset_rele(ds, FTAG); + SPA_VERSION_DELEGATED_PERMS) return (EPERM); - } if (dsl_dataset_is_snapshot(ds)) { /* @@ -633,7 +624,6 @@ again: error = EPERM; success: rw_exit(&dp->dp_config_rwlock); - dsl_dataset_rele(ds, FTAG); cookie = NULL; while ((setnode = avl_destroy_nodes(&permsets, &cookie)) != NULL) @@ -642,6 +632,22 @@ success: return (error); } +int +dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr) +{ + dsl_dataset_t *ds; + int error; + + error = dsl_dataset_hold(dsname, FTAG, &ds); + if (error) + return (error); + + error = dsl_deleg_access_impl(ds, perm, cr); + dsl_dataset_rele(ds, FTAG); + + return (error); +} + /* * Other routines. */ diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 2cd21a102..700cc9628 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -42,7 +42,7 @@ int zfs_no_write_throttle = 0; int zfs_write_limit_shift = 3; /* 1/8th of physical memory */ -int zfs_txg_synctime_ms = 5000; /* target millisecs to sync a txg */ +int zfs_txg_synctime_ms = 1000; /* target millisecs to sync a txg */ uint64_t zfs_write_limit_min = 32 << 20; /* min write limit is 32MB */ uint64_t zfs_write_limit_max = 0; /* max data payload per txg */ @@ -451,7 +451,7 @@ dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg) while (ds = list_head(&dp->dp_synced_datasets)) { list_remove(&dp->dp_synced_datasets, ds); os = ds->ds_objset; - zil_clean(os->os_zil); + zil_clean(os->os_zil, txg); ASSERT(!dmu_objset_is_dirty(os, txg)); dmu_buf_rele(ds->ds_dbuf, ds); } @@ -768,7 +768,7 @@ dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp) *htag = '\0'; ++htag; dsobj = strtonum(za.za_name, NULL); - (void) dsl_dataset_user_release_tmp(dp, dsobj, htag); + (void) dsl_dataset_user_release_tmp(dp, dsobj, htag, B_FALSE); } zap_cursor_fini(&zc); } diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 23c37c7cc..56d410836 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -56,6 +56,11 @@ static scan_cb_t dsl_scan_remove_cb; static dsl_syncfunc_t dsl_scan_cancel_sync; static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx); +int zfs_top_maxinflight = 32; /* maximum I/Os per top-level */ +int zfs_resilver_delay = 2; /* number of ticks to delay resilver */ +int zfs_scrub_delay = 4; /* number of ticks to delay scrub */ +int zfs_scan_idle = 50; /* idle window in clock ticks */ + int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */ int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */ int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */ @@ -601,8 +606,8 @@ dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp, * done before setting xlateall (similar to dsl_read()) */ (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp, - buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, - &flags, &czb); + buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb); } static boolean_t @@ -650,6 +655,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, const zbookmark_t *zb, dmu_tx_t *tx, arc_buf_t **bufp) { dsl_pool_t *dp = scn->scn_dp; + int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD; int err; if (BP_GET_LEVEL(bp) > 0) { @@ -660,7 +666,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, err = arc_read_nolock(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); @@ -683,7 +689,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, err = arc_read_nolock(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); @@ -696,7 +702,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, err = arc_read_nolock(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); @@ -719,7 +725,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, err = arc_read_nolock(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); @@ -727,9 +733,6 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, osp = (*bufp)->b_data; - if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) - dsl_scan_zil(dp, &osp->os_zil_header); - dsl_scan_visitdnode(scn, ds, osp->os_type, &osp->os_meta_dnode, *bufp, DMU_META_DNODE_OBJECT, tx); @@ -1072,9 +1075,23 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; dsl_dataset_t *ds; + objset_t *os; VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); + if (dmu_objset_from_ds(ds, &os)) + goto out; + + /* + * Only the ZIL in the head (non-snapshot) is valid. Even though + * snapshots can have ZIL block pointers (which may be the same + * BP as in the head), they must be ignored. So we traverse the + * ZIL here, rather than in scan_recurse(), because the regular + * snapshot block-sharing rules don't apply to it. + */ + if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !dsl_dataset_is_snapshot(ds)) + dsl_scan_zil(dp, &os->os_zil_header); + /* * Iterate over the bps in this ds. */ @@ -1446,7 +1463,6 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) dsl_scan_setup_sync(scn, &func, tx); } - if (!dsl_scan_active(scn) || spa_sync_pass(dp->dp_spa) > 1) return; @@ -1489,7 +1505,6 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) if (scn->scn_phys.scn_state != DSS_SCANNING) return; - if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= scn->scn_phys.scn_ddt_class_max) { zfs_dbgmsg("doing scan sync txg %llu; " @@ -1644,8 +1659,9 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, spa_t *spa = dp->dp_spa; uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp); boolean_t needs_io; - int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; + int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; int zio_priority; + int scan_delay = 0; if (phys_birth <= scn->scn_phys.scn_min_txg || phys_birth >= scn->scn_phys.scn_max_txg) @@ -1658,10 +1674,12 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, zio_flags |= ZIO_FLAG_SCRUB; zio_priority = ZIO_PRIORITY_SCRUB; needs_io = B_TRUE; + scan_delay = zfs_scrub_delay; } else if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) { zio_flags |= ZIO_FLAG_RESILVER; zio_priority = ZIO_PRIORITY_RESILVER; needs_io = B_FALSE; + scan_delay = zfs_resilver_delay; } /* If it's an intent log block, failure is expected. */ @@ -1699,14 +1717,23 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, } if (needs_io && !zfs_no_scrub_io) { + vdev_t *rvd = spa->spa_root_vdev; + uint64_t maxinflight = rvd->vdev_children * zfs_top_maxinflight; void *data = zio_data_buf_alloc(size); mutex_enter(&spa->spa_scrub_lock); - while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) + while (spa->spa_scrub_inflight >= maxinflight) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); spa->spa_scrub_inflight++; mutex_exit(&spa->spa_scrub_lock); + /* + * If we're seeing recent (zfs_scan_idle) "important" I/Os + * then throttle our workload to limit the impact of a scan. + */ + if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle) + delay(scan_delay); + zio_nowait(zio_read(NULL, spa, bp, data, size, dsl_scan_scrub_done, NULL, zio_priority, zio_flags, zb)); diff --git a/module/zfs/dsl_synctask.c b/module/zfs/dsl_synctask.c index 832685b0f..b0818ce27 100644 --- a/module/zfs/dsl_synctask.c +++ b/module/zfs/dsl_synctask.c @@ -213,6 +213,8 @@ dsl_sync_task_do(dsl_pool_t *dp, dsl_sync_task_group_t *dstg; int err; + ASSERT(spa_writeable(dp->dp_spa)); + dstg = dsl_sync_task_group_create(dp); dsl_sync_task_create(dstg, checkfunc, syncfunc, arg1, arg2, blocks_modified); @@ -228,6 +230,9 @@ dsl_sync_task_do_nowait(dsl_pool_t *dp, { dsl_sync_task_group_t *dstg; + if (!spa_writeable(dp->dp_spa)) + return; + dstg = dsl_sync_task_group_create(dp); dsl_sync_task_create(dstg, checkfunc, syncfunc, arg1, arg2, blocks_modified); diff --git a/module/zfs/fm.c b/module/zfs/fm.c index 78943eda8..4efcff4f4 100644 --- a/module/zfs/fm.c +++ b/module/zfs/fm.c @@ -384,6 +384,20 @@ fm_panic(const char *format, ...) } /* + * Simply tell the caller if fm_panicstr is set, ie. an fma event has + * caused the panic. If so, something other than the default panic + * diagnosis method will diagnose the cause of the panic. + */ +int +is_fm_panic() +{ + if (fm_panicstr) + return (1); + else + return (0); +} + +/* * Print any appropriate FMA banner message before the panic message. This * function is called by panicsys() and prints the message for fm_panic(). * We print the message here so that it comes after the system is quiesced. @@ -610,8 +624,8 @@ fm_nvlist_create(nv_alloc_t *nva) if (nvlist_xalloc(&nvl, NV_UNIQUE_NAME, nvhdl) != 0) { if (hdl_alloced) { - kmem_free(nvhdl, sizeof (nv_alloc_t)); nv_alloc_fini(nvhdl); + kmem_free(nvhdl, sizeof (nv_alloc_t)); } return (NULL); } diff --git a/module/zfs/include/sys/dbuf.h b/module/zfs/include/sys/dbuf.h index 4c05806e3..cf1bbc030 100644 --- a/module/zfs/include/sys/dbuf.h +++ b/module/zfs/include/sys/dbuf.h @@ -32,6 +32,7 @@ #include <sys/arc.h> #include <sys/zfs_context.h> #include <sys/refcount.h> +#include <sys/zrlock.h> #ifdef __cplusplus extern "C" { @@ -82,9 +83,6 @@ struct dmu_tx; * etc. */ -#define LIST_LINK_INACTIVE(link) \ - ((link)->list_next == NULL && (link)->list_prev == NULL) - struct dmu_buf_impl; typedef enum override_states { @@ -149,15 +147,17 @@ typedef struct dmu_buf_impl { struct objset *db_objset; /* - * the dnode we belong to (NULL when evicted) + * handle to safely access the dnode we belong to (NULL when evicted) */ - struct dnode *db_dnode; + struct dnode_handle *db_dnode_handle; /* * our parent buffer; if the dnode points to us directly, - * db_parent == db_dnode->dn_dbuf + * db_parent == db_dnode_handle->dnh_dnode->dn_dbuf * only accessed by sync thread ??? * (NULL when evicted) + * May change from NULL to non-NULL under the protection of db_mtx + * (see dbuf_check_blkptr()) */ struct dmu_buf_impl *db_parent; @@ -284,24 +284,46 @@ void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end, void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx); +#define DB_DNODE(_db) ((_db)->db_dnode_handle->dnh_dnode) +#define DB_DNODE_LOCK(_db) ((_db)->db_dnode_handle->dnh_zrlock) +#define DB_DNODE_ENTER(_db) (zrl_add(&DB_DNODE_LOCK(_db))) +#define DB_DNODE_EXIT(_db) (zrl_remove(&DB_DNODE_LOCK(_db))) +#define DB_DNODE_HELD(_db) (!zrl_is_zero(&DB_DNODE_LOCK(_db))) +#define DB_GET_SPA(_spa_p, _db) { \ + dnode_t *__dn; \ + DB_DNODE_ENTER(_db); \ + __dn = DB_DNODE(_db); \ + *(_spa_p) = __dn->dn_objset->os_spa; \ + DB_DNODE_EXIT(_db); \ +} +#define DB_GET_OBJSET(_os_p, _db) { \ + dnode_t *__dn; \ + DB_DNODE_ENTER(_db); \ + __dn = DB_DNODE(_db); \ + *(_os_p) = __dn->dn_objset; \ + DB_DNODE_EXIT(_db); \ +} + void dbuf_init(void); void dbuf_fini(void); -#define DBUF_IS_METADATA(db) \ - ((db)->db_level > 0 || dmu_ot[(db)->db_dnode->dn_type].ot_metadata) +boolean_t dbuf_is_metadata(dmu_buf_impl_t *db); + +#define DBUF_IS_METADATA(_db) \ + (dbuf_is_metadata(_db)) -#define DBUF_GET_BUFC_TYPE(db) \ - (DBUF_IS_METADATA(db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA) +#define DBUF_GET_BUFC_TYPE(_db) \ + (DBUF_IS_METADATA(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA) -#define DBUF_IS_CACHEABLE(db) \ - ((db)->db_objset->os_primary_cache == ZFS_CACHE_ALL || \ - (DBUF_IS_METADATA(db) && \ - ((db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA))) +#define DBUF_IS_CACHEABLE(_db) \ + ((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL || \ + (DBUF_IS_METADATA(_db) && \ + ((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA))) -#define DBUF_IS_L2CACHEABLE(db) \ - ((db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL || \ - (DBUF_IS_METADATA(db) && \ - ((db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA))) +#define DBUF_IS_L2CACHEABLE(_db) \ + ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL || \ + (DBUF_IS_METADATA(_db) && \ + ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA))) #ifdef ZFS_DEBUG @@ -332,7 +354,7 @@ _NOTE(CONSTCOND) } while (0) sprintf_blkptr(__blkbuf, bp); \ dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf); \ kmem_free(__blkbuf, BP_SPRINTF_LEN); \ - } \ + } \ _NOTE(CONSTCOND) } while (0) #define DBUF_VERIFY(db) dbuf_verify(db) diff --git a/module/zfs/include/sys/dmu.h b/module/zfs/include/sys/dmu.h index 83932f467..07f5949eb 100644 --- a/module/zfs/include/sys/dmu.h +++ b/module/zfs/include/sys/dmu.h @@ -192,8 +192,8 @@ int dmu_objset_clone(const char *name, struct dsl_dataset *clone_origin, uint64_t flags); int dmu_objset_destroy(const char *name, boolean_t defer); int dmu_snapshots_destroy(char *fsname, char *snapname, boolean_t defer); -int dmu_objset_snapshot(char *fsname, char *snapname, struct nvlist *props, - boolean_t recursive); +int dmu_objset_snapshot(char *fsname, char *snapname, char *tag, + struct nvlist *props, boolean_t recursive, boolean_t temporary, int fd); int dmu_objset_rename(const char *name, const char *newname, boolean_t recursive); int dmu_objset_find(char *name, int func(const char *, void *), void *arg, @@ -335,6 +335,7 @@ int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **); int dmu_bonus_max(void); int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *); int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *); +dmu_object_type_t dmu_get_bonustype(dmu_buf_t *); int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *); /* @@ -721,9 +722,13 @@ typedef struct dmu_recv_cookie { int dmu_recv_begin(char *tofs, char *tosnap, char *topds, struct drr_begin *, boolean_t force, objset_t *origin, dmu_recv_cookie_t *); -int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp); +int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp, + int cleanup_fd, uint64_t *action_handlep); int dmu_recv_end(dmu_recv_cookie_t *drc); +int dmu_diff(objset_t *tosnap, objset_t *fromsnap, struct vnode *vp, + offset_t *off); + /* CRC64 table */ #define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */ extern uint64_t zfs_crc64_table[256]; diff --git a/module/zfs/include/sys/dmu_objset.h b/module/zfs/include/sys/dmu_objset.h index 5c5119a20..c6d202e2e 100644 --- a/module/zfs/include/sys/dmu_objset.h +++ b/module/zfs/include/sys/dmu_objset.h @@ -40,6 +40,8 @@ extern "C" { #endif +extern krwlock_t os_lock; + struct dsl_dataset; struct dmu_tx; @@ -68,9 +70,15 @@ struct objset { spa_t *os_spa; arc_buf_t *os_phys_buf; objset_phys_t *os_phys; - dnode_t *os_meta_dnode; - dnode_t *os_userused_dnode; - dnode_t *os_groupused_dnode; + /* + * The following "special" dnodes have no parent and are exempt from + * dnode_move(), but they root their descendents in this objset using + * handles anyway, so that all access to dnodes from dbufs consistently + * uses handles. + */ + dnode_handle_t os_meta_dnode; + dnode_handle_t os_userused_dnode; + dnode_handle_t os_groupused_dnode; zilog_t *os_zil; /* can change, under dsl_dir's locks: */ @@ -113,6 +121,9 @@ struct objset { #define DMU_META_OBJSET 0 #define DMU_META_DNODE_OBJECT 0 #define DMU_OBJECT_IS_SPECIAL(obj) ((int64_t)(obj) <= 0) +#define DMU_META_DNODE(os) ((os)->os_meta_dnode.dnh_dnode) +#define DMU_USERUSED_DNODE(os) ((os)->os_userused_dnode.dnh_dnode) +#define DMU_GROUPUSED_DNODE(os) ((os)->os_groupused_dnode.dnh_dnode) #define DMU_OS_IS_L2CACHEABLE(os) \ ((os)->os_secondary_cache == ZFS_CACHE_ALL || \ @@ -131,8 +142,8 @@ int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, int dmu_objset_clone(const char *name, struct dsl_dataset *clone_origin, uint64_t flags); int dmu_objset_destroy(const char *name, boolean_t defer); -int dmu_objset_snapshot(char *fsname, char *snapname, nvlist_t *props, - boolean_t recursive); +int dmu_objset_snapshot(char *fsname, char *snapname, char *tag, + struct nvlist *props, boolean_t recursive, boolean_t temporary, int fd); void dmu_objset_stats(objset_t *os, nvlist_t *nv); void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, @@ -150,6 +161,7 @@ timestruc_t dmu_objset_snap_cmtime(objset_t *os); /* called from dsl */ void dmu_objset_sync(objset_t *os, zio_t *zio, dmu_tx_t *tx); boolean_t dmu_objset_is_dirty(objset_t *os, uint64_t txg); +boolean_t dmu_objset_is_dirty_anywhere(objset_t *os); objset_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx); int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp, @@ -161,6 +173,9 @@ boolean_t dmu_objset_userused_enabled(objset_t *os); int dmu_objset_userspace_upgrade(objset_t *os); boolean_t dmu_objset_userspace_present(objset_t *os); +void dmu_objset_init(void); +void dmu_objset_fini(void); + #ifdef __cplusplus } #endif diff --git a/module/zfs/include/sys/dmu_traverse.h b/module/zfs/include/sys/dmu_traverse.h index 844e7f1ae..5b326cd99 100644 --- a/module/zfs/include/sys/dmu_traverse.h +++ b/module/zfs/include/sys/dmu_traverse.h @@ -49,6 +49,9 @@ typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, #define TRAVERSE_PREFETCH (TRAVERSE_PREFETCH_METADATA | TRAVERSE_PREFETCH_DATA) #define TRAVERSE_HARD (1<<4) +/* Special traverse error return value to indicate skipping of children */ +#define TRAVERSE_VISIT_NO_CHILDREN -1 + int traverse_dataset(struct dsl_dataset *ds, uint64_t txg_start, int flags, blkptr_cb_t func, void *arg); int traverse_pool(spa_t *spa, diff --git a/module/zfs/include/sys/dnode.h b/module/zfs/include/sys/dnode.h index 8bae1602e..9ad4be36b 100644 --- a/module/zfs/include/sys/dnode.h +++ b/module/zfs/include/sys/dnode.h @@ -32,6 +32,7 @@ #include <sys/zio.h> #include <sys/refcount.h> #include <sys/dmu_zfetch.h> +#include <sys/zrlock.h> #ifdef __cplusplus extern "C" { @@ -156,6 +157,7 @@ typedef struct dnode { struct objset *dn_objset; uint64_t dn_object; struct dmu_buf_impl *dn_dbuf; + struct dnode_handle *dn_handle; dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */ /* @@ -172,6 +174,7 @@ typedef struct dnode { uint8_t dn_nlevels; uint8_t dn_indblkshift; uint8_t dn_datablkshift; /* zero if blksz not power of 2! */ + uint8_t dn_moved; /* Has this dnode been moved? */ uint16_t dn_datablkszsec; /* in 512b sectors */ uint32_t dn_datablksz; /* in bytes */ uint64_t dn_maxblkid; @@ -183,6 +186,9 @@ typedef struct dnode { uint16_t dn_next_bonuslen[TXG_SIZE]; uint32_t dn_next_blksz[TXG_SIZE]; /* next block size in bytes */ + /* protected by dn_dbufs_mtx; declared here to fill 32-bit hole */ + uint32_t dn_dbufs_count; /* count of dn_dbufs */ + /* protected by os_lock: */ list_node_t dn_dirty_link[TXG_SIZE]; /* next on dataset's dirty */ @@ -202,8 +208,11 @@ typedef struct dnode { refcount_t dn_holds; kmutex_t dn_dbufs_mtx; - list_t dn_dbufs; /* linked list of descendent dbuf_t's */ + list_t dn_dbufs; /* descendent dbufs */ + + /* protected by dn_struct_rwlock */ struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */ + boolean_t dn_have_spill; /* have spill or are spilling */ /* parent IO for current sync write */ @@ -220,6 +229,22 @@ typedef struct dnode { struct zfetch dn_zfetch; } dnode_t; +/* + * Adds a level of indirection between the dbuf and the dnode to avoid + * iterating descendent dbufs in dnode_move(). Handles are not allocated + * individually, but as an array of child dnodes in dnode_hold_impl(). + */ +typedef struct dnode_handle { + /* Protects dnh_dnode from modification by dnode_move(). */ + zrlock_t dnh_zrlock; + dnode_t *dnh_dnode; +} dnode_handle_t; + +typedef struct dnode_children { + size_t dnc_count; /* number of children */ + dnode_handle_t dnc_children[1]; /* sized dynamically */ +} dnode_children_t; + typedef struct free_range { avl_node_t fr_node; uint64_t fr_blkid; @@ -227,8 +252,8 @@ typedef struct free_range { } free_range_t; dnode_t *dnode_special_open(struct objset *dd, dnode_phys_t *dnp, - uint64_t object); -void dnode_special_close(dnode_t *dn); + uint64_t object, dnode_handle_t *dnh); +void dnode_special_close(dnode_handle_t *dnh); void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx); void dnode_setbonus_type(dnode_t *dn, dmu_object_type_t, dmu_tx_t *tx); diff --git a/module/zfs/include/sys/dsl_dataset.h b/module/zfs/include/sys/dsl_dataset.h index 58414e133..22733d070 100644 --- a/module/zfs/include/sys/dsl_dataset.h +++ b/module/zfs/include/sys/dsl_dataset.h @@ -162,6 +162,22 @@ struct dsl_ds_destroyarg { boolean_t need_prep; /* do we need to retry due to EBUSY? */ }; +/* + * The max length of a temporary tag prefix is the number of hex digits + * required to express UINT64_MAX plus one for the hyphen. + */ +#define MAX_TAG_PREFIX_LEN 17 + +struct dsl_ds_holdarg { + dsl_sync_task_group_t *dstg; + char *htag; + char *snapname; + boolean_t recursive; + boolean_t gotone; + boolean_t temphold; + char failed[MAXPATHLEN]; +}; + #define dsl_dataset_is_snapshot(ds) \ ((ds)->ds_phys->ds_num_children != 0) @@ -182,6 +198,8 @@ void dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag); boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *tag); void dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *tag); +void dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag, + minor_t minor); uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname, dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *); uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, @@ -192,16 +210,19 @@ dsl_checkfunc_t dsl_dataset_destroy_check; dsl_syncfunc_t dsl_dataset_destroy_sync; dsl_checkfunc_t dsl_dataset_snapshot_check; dsl_syncfunc_t dsl_dataset_snapshot_sync; +dsl_syncfunc_t dsl_dataset_user_hold_sync; int dsl_dataset_rename(char *name, const char *newname, boolean_t recursive); int dsl_dataset_promote(const char *name, char *conflsnap); int dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, boolean_t force); int dsl_dataset_user_hold(char *dsname, char *snapname, char *htag, - boolean_t recursive, boolean_t temphold); + boolean_t recursive, boolean_t temphold, int cleanup_fd); +int dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag, + boolean_t temphold); int dsl_dataset_user_release(char *dsname, char *snapname, char *htag, boolean_t recursive); int dsl_dataset_user_release_tmp(struct dsl_pool *dp, uint64_t dsobj, - char *htag); + char *htag, boolean_t retry); int dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp); blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds); diff --git a/module/zfs/include/sys/dsl_deleg.h b/module/zfs/include/sys/dsl_deleg.h index a26a3f705..73c43bd23 100644 --- a/module/zfs/include/sys/dsl_deleg.h +++ b/module/zfs/include/sys/dsl_deleg.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_DSL_DELEG_H @@ -55,6 +54,7 @@ extern "C" { #define ZFS_DELEG_PERM_GROUPUSED "groupused" #define ZFS_DELEG_PERM_HOLD "hold" #define ZFS_DELEG_PERM_RELEASE "release" +#define ZFS_DELEG_PERM_DIFF "diff" /* * Note: the names of properties that are marked delegatable are also @@ -64,6 +64,7 @@ extern "C" { int dsl_deleg_get(const char *ddname, nvlist_t **nvp); int dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset); int dsl_deleg_access(const char *ddname, const char *perm, cred_t *cr); +int dsl_deleg_access_impl(struct dsl_dataset *ds, const char *perm, cred_t *cr); void dsl_deleg_set_create_perms(dsl_dir_t *dd, dmu_tx_t *tx, cred_t *cr); int dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr); int dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr); diff --git a/module/zfs/include/sys/fm/protocol.h b/module/zfs/include/sys/fm/protocol.h index c4103c48a..5eca760da 100644 --- a/module/zfs/include/sys/fm/protocol.h +++ b/module/zfs/include/sys/fm/protocol.h @@ -43,12 +43,13 @@ extern "C" { #define FM_CLASS "class" #define FM_VERSION "version" -/* FM event class values */ +/* FM protocol category 1 class names */ #define FM_EREPORT_CLASS "ereport" #define FM_FAULT_CLASS "fault" #define FM_DEFECT_CLASS "defect" #define FM_RSRC_CLASS "resource" #define FM_LIST_EVENT "list" +#define FM_IREPORT_CLASS "ireport" /* FM list.* event class values */ #define FM_LIST_SUSPECT_CLASS FM_LIST_EVENT ".suspect" @@ -72,6 +73,12 @@ extern "C" { /* list.* event payload member names */ #define FM_LIST_EVENT_SIZE "list-sz" +/* ireport.* event payload member names */ +#define FM_IREPORT_DETECTOR "detector" +#define FM_IREPORT_UUID "uuid" +#define FM_IREPORT_PRIORITY "pri" +#define FM_IREPORT_ATTRIBUTES "attr" + /* * list.suspect, isolated, updated, repaired and resolved * versions/payload member names. @@ -192,6 +199,7 @@ extern "C" { #define FM_FMRI_SCHEME_PKG "pkg" #define FM_FMRI_SCHEME_LEGACY "legacy-hc" #define FM_FMRI_SCHEME_ZFS "zfs" +#define FM_FMRI_SCHEME_SW "sw" /* Scheme versions */ #define FMD_SCHEME_VERSION0 0 @@ -215,6 +223,8 @@ extern "C" { #define FM_SVC_SCHEME_VERSION SVC_SCHEME_VERSION0 #define ZFS_SCHEME_VERSION0 0 #define FM_ZFS_SCHEME_VERSION ZFS_SCHEME_VERSION0 +#define SW_SCHEME_VERSION0 0 +#define FM_SW_SCHEME_VERSION SW_SCHEME_VERSION0 /* hc scheme member names */ #define FM_FMRI_HC_SERIAL_ID "serial" @@ -299,6 +309,25 @@ extern "C" { #define FM_FMRI_ZFS_POOL "pool" #define FM_FMRI_ZFS_VDEV "vdev" +/* sw scheme member names - extra indentation for members of an nvlist */ +#define FM_FMRI_SW_OBJ "object" +#define FM_FMRI_SW_OBJ_PATH "path" +#define FM_FMRI_SW_OBJ_ROOT "root" +#define FM_FMRI_SW_OBJ_PKG "pkg" +#define FM_FMRI_SW_SITE "site" +#define FM_FMRI_SW_SITE_TOKEN "token" +#define FM_FMRI_SW_SITE_MODULE "module" +#define FM_FMRI_SW_SITE_FILE "file" +#define FM_FMRI_SW_SITE_LINE "line" +#define FM_FMRI_SW_SITE_FUNC "func" +#define FM_FMRI_SW_CTXT "context" +#define FM_FMRI_SW_CTXT_ORIGIN "origin" +#define FM_FMRI_SW_CTXT_EXECNAME "execname" +#define FM_FMRI_SW_CTXT_PID "pid" +#define FM_FMRI_SW_CTXT_ZONE "zone" +#define FM_FMRI_SW_CTXT_CTID "ctid" +#define FM_FMRI_SW_CTXT_STACK "stack" + extern nv_alloc_t *fm_nva_xcreate(char *, size_t); extern void fm_nva_xdestroy(nv_alloc_t *); diff --git a/module/zfs/include/sys/fm/util.h b/module/zfs/include/sys/fm/util.h index 4934814d8..37334101b 100644 --- a/module/zfs/include/sys/fm/util.h +++ b/module/zfs/include/sys/fm/util.h @@ -20,15 +20,12 @@ */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_FM_UTIL_H #define _SYS_FM_UTIL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -96,6 +93,7 @@ extern void fm_ereport_post(nvlist_t *, int); extern void fm_payload_stack_add(nvlist_t *, const pc_t *, int); +extern int is_fm_panic(); #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/module/zfs/include/sys/refcount.h b/module/zfs/include/sys/refcount.h index bc3ade80f..1752c64e3 100644 --- a/module/zfs/include/sys/refcount.h +++ b/module/zfs/include/sys/refcount.h @@ -40,7 +40,7 @@ extern "C" { */ #define FTAG ((char *)__func__) -#if defined(DEBUG) || !defined(_KERNEL) +#ifdef ZFS_DEBUG typedef struct reference { list_node_t ref_link; void *ref_holder; @@ -67,11 +67,12 @@ int64_t refcount_add(refcount_t *rc, void *holder_tag); int64_t refcount_remove(refcount_t *rc, void *holder_tag); int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag); int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag); +void refcount_transfer(refcount_t *dst, refcount_t *src); void refcount_init(void); void refcount_fini(void); -#else /* DEBUG */ +#else /* ZFS_DEBUG */ typedef struct refcount { uint64_t rc_count; @@ -97,7 +98,7 @@ typedef struct refcount { #define refcount_init() #define refcount_fini() -#endif /* DEBUG */ +#endif /* ZFS_DEBUG */ #ifdef __cplusplus } diff --git a/module/zfs/include/sys/sa.h b/module/zfs/include/sys/sa.h index e9a96a0f9..bc89fa07d 100644 --- a/module/zfs/include/sys/sa.h +++ b/module/zfs/include/sys/sa.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_SA_H @@ -141,7 +140,7 @@ dmu_buf_t *sa_get_db(sa_handle_t *); uint64_t sa_handle_object(sa_handle_t *); boolean_t sa_attr_would_spill(sa_handle_t *, sa_attr_type_t, int size); void sa_register_update_callback(objset_t *, sa_update_cb_t *); -sa_attr_type_t *sa_setup(objset_t *, uint64_t, sa_attr_reg_t *, int); +int sa_setup(objset_t *, uint64_t, sa_attr_reg_t *, int, sa_attr_type_t **); void sa_tear_down(objset_t *); int sa_replace_all_by_template(sa_handle_t *, sa_bulk_attr_t *, int, dmu_tx_t *); diff --git a/module/zfs/include/sys/sa_impl.h b/module/zfs/include/sys/sa_impl.h index 62497e702..6661e47cf 100644 --- a/module/zfs/include/sys/sa_impl.h +++ b/module/zfs/include/sys/sa_impl.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_SA_IMPL_H @@ -232,7 +231,7 @@ struct sa_handle { ((a == DMU_OT_SA) ? B_TRUE : B_FALSE) #define SA_BONUSTYPE_FROM_DB(db) \ - (((dmu_buf_impl_t *)db)->db_dnode->dn_bonustype) + (dmu_get_bonustype((dmu_buf_t *)db)) #define SA_BLKPTR_SPACE (DN_MAX_BONUSLEN - sizeof (blkptr_t)) diff --git a/module/zfs/include/sys/spa.h b/module/zfs/include/sys/spa.h index 41a40300e..456ec06dc 100644 --- a/module/zfs/include/sys/spa.h +++ b/module/zfs/include/sys/spa.h @@ -418,8 +418,8 @@ extern int spa_get_stats(const char *pool, nvlist_t **config, extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props, const char *history_str, nvlist_t *zplprops); extern int spa_import_rootpool(char *devpath, char *devid); -extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props); -extern int spa_import_verbatim(const char *, nvlist_t *, nvlist_t *); +extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props, + uint64_t flags); extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); extern int spa_destroy(char *pool); extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, @@ -602,6 +602,7 @@ extern objset_t *spa_meta_objset(spa_t *spa); /* Miscellaneous support routines */ extern int spa_rename(const char *oldname, const char *newname); +extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid); extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid); extern char *spa_strdup(const char *); extern void spa_strfree(char *); @@ -620,7 +621,6 @@ extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp); extern boolean_t spa_has_slogs(spa_t *spa); extern boolean_t spa_is_root(spa_t *spa); extern boolean_t spa_writeable(spa_t *spa); -extern void spa_rewind_data_to_nvlist(spa_t *spa, nvlist_t *to); extern int spa_mode(spa_t *spa); extern uint64_t strtonum(const char *str, char **nptr); diff --git a/module/zfs/include/sys/spa_impl.h b/module/zfs/include/sys/spa_impl.h index e2e1851ec..c965ffbbe 100644 --- a/module/zfs/include/sys/spa_impl.h +++ b/module/zfs/include/sys/spa_impl.h @@ -114,13 +114,14 @@ struct spa { nvlist_t *spa_config; /* last synced config */ nvlist_t *spa_config_syncing; /* currently syncing config */ nvlist_t *spa_config_splitting; /* config for splitting */ + nvlist_t *spa_load_info; /* info and errors from load */ uint64_t spa_config_txg; /* txg of last config change */ int spa_sync_pass; /* iterate-to-convergence */ pool_state_t spa_state; /* pool state */ int spa_inject_ref; /* injection references */ uint8_t spa_sync_on; /* sync threads are running */ spa_load_state_t spa_load_state; /* current load operation */ - boolean_t spa_load_verbatim; /* load the given config? */ + uint64_t spa_import_flags; /* import specific flags */ taskq_t *spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES]; dsl_pool_t *spa_dsl_pool; metaslab_class_t *spa_normal_class; /* normal data class */ @@ -130,6 +131,7 @@ struct spa { uint64_t spa_freeze_txg; /* freeze pool at this txg */ uint64_t spa_load_max_txg; /* best initial ub_txg */ uint64_t spa_claim_max_txg; /* highest claimed birth txg */ + timespec_t spa_loaded_ts; /* 1st successful open time */ objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */ txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */ vdev_t *spa_root_vdev; /* top-level vdev container */ @@ -146,9 +148,9 @@ struct spa { uberblock_t spa_ubsync; /* last synced uberblock */ uberblock_t spa_uberblock; /* current uberblock */ boolean_t spa_extreme_rewind; /* rewind past deferred frees */ + uint64_t spa_last_io; /* lbolt of last non-scan I/O */ kmutex_t spa_scrub_lock; /* resilver/scrub lock */ uint64_t spa_scrub_inflight; /* in-flight scrub I/Os */ - uint64_t spa_scrub_maxinflight; /* max in-flight scrub I/Os */ kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */ uint8_t spa_scrub_active; /* active or suspended? */ uint8_t spa_scrub_type; /* type of scrub we're doing */ diff --git a/module/zfs/include/sys/vdev_impl.h b/module/zfs/include/sys/vdev_impl.h index 2b886bc58..161bd21f0 100644 --- a/module/zfs/include/sys/vdev_impl.h +++ b/module/zfs/include/sys/vdev_impl.h @@ -169,6 +169,7 @@ struct vdev { uint64_t vdev_faulted; /* persistent faulted state */ uint64_t vdev_degraded; /* persistent degraded state */ uint64_t vdev_removed; /* persistent removed state */ + uint64_t vdev_resilvering; /* persistent resilvering state */ uint64_t vdev_nparity; /* number of parity devices for raidz */ char *vdev_path; /* vdev path (if any) */ char *vdev_devid; /* vdev devid (if any) */ @@ -283,6 +284,7 @@ extern void vdev_remove_parent(vdev_t *cvd); * vdev sync load and sync */ extern void vdev_load_log_state(vdev_t *nvd, vdev_t *ovd); +extern boolean_t vdev_log_state_valid(vdev_t *vd); extern void vdev_load(vdev_t *vd); extern void vdev_sync(vdev_t *vd, uint64_t txg); extern void vdev_sync_done(vdev_t *vd, uint64_t txg); diff --git a/module/zfs/include/sys/zfs_acl.h b/module/zfs/include/sys/zfs_acl.h index 72e868fab..c1a0aeebd 100644 --- a/module/zfs/include/sys/zfs_acl.h +++ b/module/zfs/include/sys/zfs_acl.h @@ -185,10 +185,6 @@ typedef struct zfs_acl_ids { struct zfs_fuid_info *z_fuidp; /* for tracking fuids for log */ } zfs_acl_ids_t; -#define ZFS_EXTERNAL_ACL(zp) \ - (zp->z_is_sa ? 0 : zfs_external_acl(zp)) -#define ZNODE_ACL_VERSION(zp) \ - (zp->z_is_sa ? ZFS_ACL_VERSION_FUID : zfs_znode_acl_version(zp)) /* * Property values for acl_mode and acl_inherit. * @@ -222,7 +218,7 @@ int zfs_fastaccesschk_execute(struct znode *, cred_t *); extern int zfs_zaccess_rwx(struct znode *, mode_t, int, cred_t *); extern int zfs_zaccess_unix(struct znode *, mode_t, cred_t *); extern int zfs_acl_access(struct znode *, int, cred_t *); -int zfs_acl_chmod_setattr(struct znode *, zfs_acl_t **, uint64_t); +void zfs_acl_chmod_setattr(struct znode *, zfs_acl_t **, uint64_t); int zfs_zaccess_delete(struct znode *, struct znode *, cred_t *); int zfs_zaccess_rename(struct znode *, struct znode *, struct znode *, struct znode *, cred_t *cr); diff --git a/module/zfs/include/sys/zfs_ioctl.h b/module/zfs/include/sys/zfs_ioctl.h index b0cb4955e..84bf794fe 100644 --- a/module/zfs/include/sys/zfs_ioctl.h +++ b/module/zfs/include/sys/zfs_ioctl.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_ZFS_IOCTL_H @@ -31,6 +30,7 @@ #include <sys/zio.h> #include <sys/dsl_deleg.h> #include <sys/spa.h> +#include <sys/zfs_stat.h> #ifdef _KERNEL #include <sys/nvpair.h> @@ -199,6 +199,22 @@ typedef struct dmu_replay_record { } drr_u; } dmu_replay_record_t; +/* diff record range types */ +typedef enum diff_type { + DDR_NONE = 0x1, + DDR_INUSE = 0x2, + DDR_FREE = 0x4 +} diff_type_t; + +/* + * The diff reports back ranges of free or in-use objects. + */ +typedef struct dmu_diff_record { + uint64_t ddr_type; + uint64_t ddr_first; + uint64_t ddr_last; +} dmu_diff_record_t; + typedef struct zinject_record { uint64_t zi_objset; uint64_t zi_object; @@ -265,6 +281,13 @@ typedef struct zfs_cmd { zinject_record_t zc_inject_record; boolean_t zc_defer_destroy; boolean_t zc_temphold; + uint64_t zc_action_handle; + int zc_cleanup_fd; + uint8_t zc_pad[4]; /* alignment */ + uint64_t zc_sendobj; + uint64_t zc_fromobj; + uint64_t zc_createtxg; + zfs_stat_t zc_stat; } zfs_cmd_t; typedef struct zfs_useracct { @@ -274,8 +297,8 @@ typedef struct zfs_useracct { uint64_t zu_space; } zfs_useracct_t; -#define ZVOL_MAX_MINOR (1 << 16) -#define ZFS_MIN_MINOR (ZVOL_MAX_MINOR + 1) +#define ZFSDEV_MAX_MINOR (1 << 16) +#define ZFS_MIN_MINOR (ZFSDEV_MAX_MINOR + 1) #define ZPOOL_EXPORT_AFTER_SPLIT 0x1 @@ -295,6 +318,28 @@ extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr); extern int zfs_busy(void); extern int zfs_unmount_snap(const char *, void *); +/* + * ZFS minor numbers can refer to either a control device instance or + * a zvol. Depending on the value of zss_type, zss_data points to either + * a zvol_state_t or a zfs_onexit_t. + */ +enum zfs_soft_state_type { + ZSST_ZVOL, + ZSST_CTLDEV +}; + +typedef struct zfs_soft_state { + enum zfs_soft_state_type zss_type; + void *zss_data; +} zfs_soft_state_t; + +extern void *zfsdev_get_soft_state(minor_t minor, + enum zfs_soft_state_type which); +extern minor_t zfsdev_minor_alloc(void); + +extern void *zfsdev_state; +extern kmutex_t zfsdev_state_lock; + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/module/zfs/include/sys/zfs_onexit.h b/module/zfs/include/sys/zfs_onexit.h new file mode 100644 index 000000000..4982bd4d0 --- /dev/null +++ b/module/zfs/include/sys/zfs_onexit.h @@ -0,0 +1,66 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _SYS_ZFS_ONEXIT_H +#define _SYS_ZFS_ONEXIT_H + +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +typedef struct zfs_onexit { + kmutex_t zo_lock; + list_t zo_actions; +} zfs_onexit_t; + +typedef struct zfs_onexit_action_node { + list_node_t za_link; + void (*za_func)(void *); + void *za_data; +} zfs_onexit_action_node_t; + +extern void zfs_onexit_init(zfs_onexit_t **zo); +extern void zfs_onexit_destroy(zfs_onexit_t *zo); + +#endif + +extern int zfs_onexit_fd_hold(int fd, minor_t *minorp); +extern void zfs_onexit_fd_rele(int fd); +extern int zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, + uint64_t *action_handle); +extern int zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, + boolean_t fire); +extern int zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, + void **data); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFS_ONEXIT_H */ diff --git a/module/zfs/include/sys/zfs_stat.h b/module/zfs/include/sys/zfs_stat.h new file mode 100644 index 000000000..465aefaa2 --- /dev/null +++ b/module/zfs/include/sys/zfs_stat.h @@ -0,0 +1,56 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _SYS_FS_ZFS_STAT_H +#define _SYS_FS_ZFS_STAT_H + +#ifdef _KERNEL +#include <sys/isa_defs.h> +#include <sys/types32.h> +#include <sys/dmu.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * A limited number of zpl level stats are retrievable + * with an ioctl. zfs diff is the current consumer. + */ +typedef struct zfs_stat { + uint64_t zs_gen; + uint64_t zs_mode; + uint64_t zs_links; + uint64_t zs_ctime[2]; +} zfs_stat_t; + +extern int zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, + char *buf, int len); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_STAT_H */ diff --git a/module/zfs/include/sys/zfs_vfsops.h b/module/zfs/include/sys/zfs_vfsops.h index 86dcdacc0..38c87df43 100644 --- a/module/zfs/include/sys/zfs_vfsops.h +++ b/module/zfs/include/sys/zfs_vfsops.h @@ -79,6 +79,7 @@ struct zfsvfs { kmutex_t z_lock; uint64_t z_userquota_obj; uint64_t z_groupquota_obj; + uint64_t z_replay_eof; /* New end of file - replay only */ sa_attr_type_t *z_attr_table; /* SA attr mapping->id */ #define ZFS_OBJ_MTX_SZ 64 kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */ diff --git a/module/zfs/include/sys/zfs_znode.h b/module/zfs/include/sys/zfs_znode.h index 4781ee686..3e9621a0e 100644 --- a/module/zfs/include/sys/zfs_znode.h +++ b/module/zfs/include/sys/zfs_znode.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_FS_ZFS_ZNODE_H @@ -36,6 +35,7 @@ #include <sys/zfs_vfsops.h> #include <sys/rrwlock.h> #include <sys/zfs_sa.h> +#include <sys/zfs_stat.h> #endif #include <sys/zfs_acl.h> #include <sys/zil.h> @@ -60,6 +60,8 @@ extern "C" { #define ZFS_AV_QUARANTINED 0x0000020000000000 #define ZFS_AV_MODIFIED 0x0000040000000000 #define ZFS_REPARSE 0x0000080000000000 +#define ZFS_OFFLINE 0x0000100000000000 +#define ZFS_SPARSE 0x0000200000000000 #define ZFS_ATTR_SET(zp, attr, value, pflags, tx) \ { \ @@ -188,17 +190,17 @@ typedef struct znode { uint8_t z_unlinked; /* file has been unlinked */ uint8_t z_atime_dirty; /* atime needs to be synced */ uint8_t z_zn_prefetch; /* Prefetch znodes? */ + uint8_t z_moved; /* Has this znode been moved? */ uint_t z_blksz; /* block size in bytes */ uint_t z_seq; /* modification sequence number */ uint64_t z_mapcnt; /* number of pages mapped to file */ - uint64_t z_last_itx; /* last ZIL itx on this znode */ uint64_t z_gen; /* generation (cached) */ uint64_t z_size; /* file size (cached) */ uint64_t z_atime[2]; /* atime (cached) */ uint64_t z_links; /* file links (cached) */ uint64_t z_pflags; /* pflags (cached) */ - uid_t z_uid; /* uid mapped (cached) */ - uid_t z_gid; /* gid mapped (cached) */ + uint64_t z_uid; /* uid fuid (cached) */ + uint64_t z_gid; /* gid fuid (cached) */ mode_t z_mode; /* mode (cached) */ uint32_t z_sync_cnt; /* synchronous open count */ kmutex_t z_acl_lock; /* acl data lock */ @@ -321,7 +323,8 @@ extern void zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, extern int zfs_log_create_txtype(zil_create_t, vsecattr_t *vsecp, vattr_t *vap); extern void zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *dzp, char *name); + znode_t *dzp, char *name, uint64_t foid); +#define ZFS_NO_OBJECT 0 /* no object id */ extern void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *dzp, znode_t *zp, char *name); extern void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, diff --git a/module/zfs/include/sys/zil.h b/module/zfs/include/sys/zil.h index 2f01cf922..a4c5575b2 100644 --- a/module/zfs/include/sys/zil.h +++ b/module/zfs/include/sys/zil.h @@ -169,18 +169,14 @@ typedef enum zil_create { (txtype) == TX_ACL || \ (txtype) == TX_WRITE2) - /* * Format of log records. * The fields are carefully defined to allow them to be aligned * and sized the same on sparc & intel architectures. * Each log record has a common structure at the beginning. * - * Note, lrc_seq holds two different sequence numbers. Whilst in memory - * it contains the transaction sequence number. The log record on - * disk holds the sequence number of all log records which is used to - * ensure we don't replay the same record. The two sequence numbers are - * different because the transactions can now be pushed out of order. + * The log record on disk (lrc_seq) holds the sequence number of all log + * records which is used to ensure we don't replay the same record. */ typedef struct { /* common log record header */ uint64_t lrc_txtype; /* intent log transaction type */ @@ -371,6 +367,7 @@ typedef struct itx { itx_wr_state_t itx_wr_state; /* write state */ uint8_t itx_sync; /* synchronous transaction */ uint64_t itx_sod; /* record size on disk */ + uint64_t itx_oid; /* object id */ lr_t itx_lr; /* common part of log record */ /* followed by type-specific part of lr_xx_t and its immediate data */ } itx_t; @@ -402,15 +399,15 @@ extern void zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx); extern itx_t *zil_itx_create(uint64_t txtype, size_t lrsize); extern void zil_itx_destroy(itx_t *itx); -extern uint64_t zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx); +extern void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx); -extern void zil_commit(zilog_t *zilog, uint64_t seq, uint64_t oid); +extern void zil_commit(zilog_t *zilog, uint64_t oid); extern int zil_vdev_offline(const char *osname, void *txarg); extern int zil_claim(const char *osname, void *txarg); extern int zil_check_log_chain(const char *osname, void *txarg); extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx); -extern void zil_clean(zilog_t *zilog); +extern void zil_clean(zilog_t *zilog, uint64_t synced_txg); extern int zil_suspend(zilog_t *zilog); extern void zil_resume(zilog_t *zilog); diff --git a/module/zfs/include/sys/zil_impl.h b/module/zfs/include/sys/zil_impl.h index 6560a7942..1d4c0cc6c 100644 --- a/module/zfs/include/sys/zil_impl.h +++ b/module/zfs/include/sys/zil_impl.h @@ -50,6 +50,28 @@ typedef struct lwb { } lwb_t; /* + * Intent log transaction lists + */ +typedef struct itxs { + list_t i_sync_list; /* list of synchronous itxs */ + avl_tree_t i_async_tree; /* tree of foids for async itxs */ +} itxs_t; + +typedef struct itxg { + kmutex_t itxg_lock; /* lock for this structure */ + uint64_t itxg_txg; /* txg for this chain */ + uint64_t itxg_sod; /* total size on disk for this txg */ + itxs_t *itxg_itxs; /* sync and async itxs */ +} itxg_t; + +/* for async nodes we build up an AVL tree of lists of async itxs per file */ +typedef struct itx_async_node { + uint64_t ia_foid; /* file object id */ + list_t ia_list; /* list of async itxs for this foid */ + avl_node_t ia_node; /* AVL tree linkage */ +} itx_async_node_t; + +/* * Vdev flushing: during a zil_commit(), we build up an AVL tree of the vdevs * we've touched so we know which ones need a write cache flush at the end. */ @@ -71,9 +93,7 @@ struct zilog { objset_t *zl_os; /* object set we're logging */ zil_get_data_t *zl_get_data; /* callback to get object content */ zio_t *zl_root_zio; /* log writer root zio */ - uint64_t zl_itx_seq; /* next in-core itx sequence number */ uint64_t zl_lr_seq; /* on-disk log record sequence number */ - uint64_t zl_commit_seq; /* committed upto this number */ uint64_t zl_commit_lr_seq; /* last committed on-disk lr seq */ uint64_t zl_destroy_txg; /* txg of last zil_destroy() */ uint64_t zl_replayed_seq[TXG_SIZE]; /* last replayed rec seq */ @@ -93,10 +113,13 @@ struct zilog { uint64_t zl_parse_lr_seq; /* highest lr seq on last parse */ uint64_t zl_parse_blk_count; /* number of blocks parsed */ uint64_t zl_parse_lr_count; /* number of log records parsed */ - list_t zl_itx_list; /* in-memory itx list */ + uint64_t zl_next_batch; /* next batch number */ + uint64_t zl_com_batch; /* committed batch number */ + kcondvar_t zl_cv_batch[2]; /* batch condition variables */ + itxg_t zl_itxg[TXG_SIZE]; /* intent log txg chains */ + list_t zl_itx_commit_list; /* itx list to be committed */ uint64_t zl_itx_list_sz; /* total size of records on list */ uint64_t zl_cur_used; /* current commit log size used */ - uint64_t zl_prev_used; /* previous commit log size used */ list_t zl_lwb_list; /* in-flight log write list */ kmutex_t zl_vdev_lock; /* protects zl_vdev_tree */ avl_tree_t zl_vdev_tree; /* vdevs to flush in zil_commit() */ diff --git a/module/zfs/include/sys/zio.h b/module/zfs/include/sys/zio.h index 0400c1702..97d8ec74d 100644 --- a/module/zfs/include/sys/zio.h +++ b/module/zfs/include/sys/zio.h @@ -147,7 +147,7 @@ enum zio_flag { ZIO_FLAG_SELF_HEAL = 1 << 2, ZIO_FLAG_RESILVER = 1 << 3, ZIO_FLAG_SCRUB = 1 << 4, - ZIO_FLAG_SCRUB_THREAD = 1 << 5, + ZIO_FLAG_SCAN_THREAD = 1 << 5, #define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1) diff --git a/module/zfs/include/sys/zrlock.h b/module/zfs/include/sys/zrlock.h new file mode 100644 index 000000000..dcd63f7b5 --- /dev/null +++ b/module/zfs/include/sys/zrlock.h @@ -0,0 +1,66 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _SYS_ZRLOCK_H +#define _SYS_ZRLOCK_H + +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct zrlock { + kmutex_t zr_mtx; + volatile int32_t zr_refcount; + kcondvar_t zr_cv; + uint16_t zr_pad; +#ifdef ZFS_DEBUG + kthread_t *zr_owner; + const char *zr_caller; +#endif +} zrlock_t; + +extern void zrl_init(zrlock_t *); +extern void zrl_destroy(zrlock_t *); +#ifdef ZFS_DEBUG +#define zrl_add(_z) zrl_add_debug((_z), __func__) +extern void zrl_add_debug(zrlock_t *, const char *); +#else +extern void zrl_add(zrlock_t *); +#endif +extern void zrl_remove(zrlock_t *); +extern int zrl_tryenter(zrlock_t *); +extern void zrl_exit(zrlock_t *); +extern int zrl_is_zero(zrlock_t *); +extern int zrl_is_locked(zrlock_t *); +#ifdef ZFS_DEBUG +extern kthread_t *zrl_owner(zrlock_t *); +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZRLOCK_H */ diff --git a/module/zfs/lzjb.c b/module/zfs/lzjb.c index 10952f472..ab3de51b7 100644 --- a/module/zfs/lzjb.c +++ b/module/zfs/lzjb.c @@ -20,12 +20,11 @@ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ /* - * We keep our own copy of this algorithm for 2 main reasons: + * We keep our own copy of this algorithm for 3 main reasons: * 1. If we didn't, anyone modifying common/os/compress.c would * directly break our on disk format * 2. Our version of lzjb does not have a number of checks that the @@ -33,8 +32,8 @@ * 3. We initialize the lempel to ensure deterministic results, * so that identical blocks can always be deduplicated. * In particular, we are adding the "feature" that compress() can - * take a destination buffer size and return -1 if the data will not - * compress to d_len or less. + * take a destination buffer size and returns the compressed length, or the + * source length if compression would overflow the destination buffer. */ #include <sys/types.h> diff --git a/module/zfs/refcount.c b/module/zfs/refcount.c index 8358b4cee..600132f08 100644 --- a/module/zfs/refcount.c +++ b/module/zfs/refcount.c @@ -25,7 +25,7 @@ #include <sys/zfs_context.h> #include <sys/refcount.h> -#if defined(DEBUG) || !defined(_KERNEL) +#ifdef ZFS_DEBUG #ifdef _KERNEL int reference_tracking_enable = FALSE; /* runs out of memory too easily */ @@ -189,4 +189,35 @@ refcount_remove(refcount_t *rc, void *holder) return (refcount_remove_many(rc, 1, holder)); } -#endif +void +refcount_transfer(refcount_t *dst, refcount_t *src) +{ + int64_t count, removed_count; + list_t list, removed; + + list_create(&list, sizeof (reference_t), + offsetof(reference_t, ref_link)); + list_create(&removed, sizeof (reference_t), + offsetof(reference_t, ref_link)); + + mutex_enter(&src->rc_mtx); + count = src->rc_count; + removed_count = src->rc_removed_count; + src->rc_count = 0; + src->rc_removed_count = 0; + list_move_tail(&list, &src->rc_list); + list_move_tail(&removed, &src->rc_removed); + mutex_exit(&src->rc_mtx); + + mutex_enter(&dst->rc_mtx); + dst->rc_count += count; + dst->rc_removed_count += removed_count; + list_move_tail(&dst->rc_list, &list); + list_move_tail(&dst->rc_removed, &removed); + mutex_exit(&dst->rc_mtx); + + list_destroy(&list); + list_destroy(&removed); +} + +#endif /* ZFS_DEBUG */ diff --git a/module/zfs/sa.c b/module/zfs/sa.c index a91b379f9..4cb4546b2 100644 --- a/module/zfs/sa.c +++ b/module/zfs/sa.c @@ -300,8 +300,8 @@ sa_layout_info_hash(sa_attr_type_t *attrs, int attr_count) return (crc); } -static boolean_t -sa_has_blkptr(sa_handle_t *hdl) +static int +sa_get_spill(sa_handle_t *hdl) { int rc; if (hdl->sa_spill == NULL) { @@ -312,7 +312,7 @@ sa_has_blkptr(sa_handle_t *hdl) rc = 0; } - return (rc == 0 ? B_TRUE : B_FALSE); + return (rc); } /* @@ -349,7 +349,8 @@ sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count, buftypes |= SA_BONUS; } } - if (bulk[i].sa_addr == NULL && sa_has_blkptr(hdl)) { + if (bulk[i].sa_addr == NULL && + ((error = sa_get_spill(hdl)) == 0)) { if (TOC_ATTR_PRESENT( hdl->sa_spill_tab->sa_idx_tab[bulk[i].sa_attr])) { SA_ATTR_INFO(sa, hdl->sa_spill_tab, @@ -362,6 +363,10 @@ sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count, } } } + if (error && error != ENOENT) { + return ((error == ECKSUM) ? EIO : error); + } + switch (data_op) { case SA_LOOKUP: if (bulk[i].sa_addr == NULL) @@ -421,12 +426,10 @@ sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count, char attr_name[8]; if (sa->sa_layout_attr_obj == 0) { - int error; sa->sa_layout_attr_obj = zap_create(os, DMU_OT_SA_ATTR_LAYOUTS, DMU_OT_NONE, 0, tx); - error = zap_add(os, sa->sa_master_obj, SA_LAYOUTS, 8, 1, - &sa->sa_layout_attr_obj, tx); - ASSERT3U(error, ==, 0); + VERIFY(zap_add(os, sa->sa_master_obj, SA_LAYOUTS, 8, 1, + &sa->sa_layout_attr_obj, tx) == 0); } (void) snprintf(attr_name, sizeof (attr_name), @@ -667,10 +670,8 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, boolean_t dummy; if (hdl->sa_spill == NULL) { - int error; - error = dmu_spill_hold_by_bonus(hdl->sa_bonus, NULL, - &hdl->sa_spill); - ASSERT3U(error, ==, 0); + VERIFY(dmu_spill_hold_by_bonus(hdl->sa_bonus, NULL, + &hdl->sa_spill) == 0); } dmu_buf_will_dirty(hdl->sa_spill, tx); @@ -712,7 +713,7 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, length = attr_desc[i].sa_length; if (buf_space < length) { /* switch to spill buffer */ - ASSERT(bonustype != DMU_OT_ZNODE); + VERIFY(bonustype == DMU_OT_SA); if (buftype == SA_BONUS && !sa->sa_force_spill) { sa_find_layout(hdl->sa_os, hash, attrs_start, lot_count, tx, &lot); @@ -746,6 +747,14 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, } sa_find_layout(hdl->sa_os, hash, attrs_start, lot_count, tx, &lot); + + /* + * Verify that old znodes always have layout number 0. + * Must be DMU_OT_SA for arbitrary layouts + */ + VERIFY((bonustype == DMU_OT_ZNODE && lot->lot_num == 0) || + (bonustype == DMU_OT_SA && lot->lot_num > 1)); + if (bonustype == DMU_OT_SA) { SA_SET_HDR(sahdr, lot->lot_num, buftype == SA_BONUS ? hdrsize : spillhdrsize); @@ -763,11 +772,6 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, if (!spilling) { /* * remove spill block that is no longer needed. - * set sa_spill_remove to prevent sa_attr_op - * from trying to retrieve spill block before its - * been removed. The flag will be cleared if/when - * the handle is destroyed recreated or - * sa_build_layouts() needs to spill again. */ dmu_buf_rele(hdl->sa_spill, NULL); hdl->sa_spill = NULL; @@ -783,10 +787,31 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, } static void +sa_free_attr_table(sa_os_t *sa) +{ + int i; + + if (sa->sa_attr_table == NULL) + return; + + for (i = 0; i != sa->sa_num_attrs; i++) { + if (sa->sa_attr_table[i].sa_name) + kmem_free(sa->sa_attr_table[i].sa_name, + strlen(sa->sa_attr_table[i].sa_name) + 1); + } + + kmem_free(sa->sa_attr_table, + sizeof (sa_attr_table_t) * sa->sa_num_attrs); + + sa->sa_attr_table = NULL; +} + +static int sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count) { sa_os_t *sa = os->os_sa; uint64_t sa_attr_count = 0; + uint64_t sa_reg_count; int error = 0; uint64_t attr_value; sa_attr_table_t *tb; @@ -800,8 +825,20 @@ sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count) kmem_zalloc(count * sizeof (sa_attr_type_t), KM_SLEEP); sa->sa_user_table_sz = count * sizeof (sa_attr_type_t); - if (sa->sa_reg_attr_obj != 0) - VERIFY(zap_count(os, sa->sa_reg_attr_obj, &sa_attr_count) == 0); + if (sa->sa_reg_attr_obj != 0) { + error = zap_count(os, sa->sa_reg_attr_obj, + &sa_attr_count); + + /* + * Make sure we retrieved a count and that it isn't zero + */ + if (error || (error == 0 && sa_attr_count == 0)) { + if (error == 0) + error = EINVAL; + goto bail; + } + sa_reg_count = sa_attr_count; + } if (ostype == DMU_OST_ZFS && sa_attr_count == 0) sa_attr_count += sa_legacy_attr_count; @@ -830,7 +867,6 @@ sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count) else error = ENOENT; switch (error) { - default: case ENOENT: sa->sa_user_table[i] = (sa_attr_type_t)sa_attr_count; sa_attr_count++; @@ -838,11 +874,13 @@ sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count) case 0: sa->sa_user_table[i] = ATTR_NUM(attr_value); break; + default: + goto bail; } } - os->os_sa->sa_num_attrs = sa_attr_count; - tb = os->os_sa->sa_attr_table = + sa->sa_num_attrs = sa_attr_count; + tb = sa->sa_attr_table = kmem_zalloc(sizeof (sa_attr_table_t) * sa_attr_count, KM_SLEEP); /* @@ -853,7 +891,7 @@ sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count) if (sa->sa_reg_attr_obj) { for (zap_cursor_init(&zc, os, sa->sa_reg_attr_obj); - zap_cursor_retrieve(&zc, &za) == 0; + (error = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { uint64_t value; value = za.za_first_integer; @@ -873,6 +911,15 @@ sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count) strlen(za.za_name) +1); } zap_cursor_fini(&zc); + /* + * Make sure we processed the correct number of registered + * attributes + */ + if (registered_count != sa_reg_count) { + ASSERT(error != 0); + goto bail; + } + } if (ostype == DMU_OST_ZFS) { @@ -908,18 +955,27 @@ sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count) strlen(reg_attrs[i].sa_name) + 1); } - os->os_sa->sa_need_attr_registration = + sa->sa_need_attr_registration = (sa_attr_count != registered_count); + + return (0); +bail: + kmem_free(sa->sa_user_table, count * sizeof (sa_attr_type_t)); + sa->sa_user_table = NULL; + sa_free_attr_table(sa); + return ((error != 0) ? error : EINVAL); } -sa_attr_type_t * -sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count) +int +sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count, + sa_attr_type_t **user_table) { zap_cursor_t zc; zap_attribute_t za; sa_os_t *sa; dmu_objset_type_t ostype = dmu_objset_type(os); sa_attr_type_t *tb; + int error; mutex_enter(&os->os_lock); if (os->os_sa) { @@ -927,13 +983,15 @@ sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count) mutex_exit(&os->os_lock); tb = os->os_sa->sa_user_table; mutex_exit(&os->os_sa->sa_lock); - return (tb); + *user_table = tb; + return (0); } sa = kmem_zalloc(sizeof (sa_os_t), KM_SLEEP); mutex_init(&sa->sa_lock, NULL, MUTEX_DEFAULT, NULL); sa->sa_master_obj = sa_obj; + os->os_sa = sa; mutex_enter(&sa->sa_lock); mutex_exit(&os->os_lock); avl_create(&sa->sa_layout_num_tree, layout_num_compare, @@ -942,26 +1000,36 @@ sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count) sizeof (sa_lot_t), offsetof(sa_lot_t, lot_hash_node)); if (sa_obj) { - int error; error = zap_lookup(os, sa_obj, SA_LAYOUTS, 8, 1, &sa->sa_layout_attr_obj); - if (error != 0 && error != ENOENT) { - return (NULL); - } + if (error != 0 && error != ENOENT) + goto fail; error = zap_lookup(os, sa_obj, SA_REGISTRY, 8, 1, &sa->sa_reg_attr_obj); - if (error != 0 && error != ENOENT) { - mutex_exit(&sa->sa_lock); - return (NULL); - } + if (error != 0 && error != ENOENT) + goto fail; } - os->os_sa = sa; - sa_attr_table_setup(os, reg_attrs, count); + if ((error = sa_attr_table_setup(os, reg_attrs, count)) != 0) + goto fail; if (sa->sa_layout_attr_obj != 0) { + uint64_t layout_count; + + error = zap_count(os, sa->sa_layout_attr_obj, + &layout_count); + + /* + * Layout number count should be > 0 + */ + if (error || (error == 0 && layout_count == 0)) { + if (error == 0) + error = EINVAL; + goto fail; + } + for (zap_cursor_init(&zc, os, sa->sa_layout_attr_obj); - zap_cursor_retrieve(&zc, &za) == 0; + (error = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { sa_attr_type_t *lot_attrs; uint64_t lot_num; @@ -969,8 +1037,13 @@ sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count) lot_attrs = kmem_zalloc(sizeof (sa_attr_type_t) * za.za_num_integers, KM_SLEEP); - VERIFY(zap_lookup(os, sa->sa_layout_attr_obj, - za.za_name, 2, za.za_num_integers, lot_attrs) == 0); + if ((error = (zap_lookup(os, sa->sa_layout_attr_obj, + za.za_name, 2, za.za_num_integers, + lot_attrs))) != 0) { + kmem_free(lot_attrs, sizeof (sa_attr_type_t) * + za.za_num_integers); + break; + } VERIFY(ddi_strtoull(za.za_name, NULL, 10, (unsigned long long *)&lot_num) == 0); @@ -982,6 +1055,15 @@ sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count) za.za_num_integers); } zap_cursor_fini(&zc); + + /* + * Make sure layout count matches number of entries added + * to AVL tree + */ + if (avl_numnodes(&sa->sa_layout_num_tree) != layout_count) { + ASSERT(error != 0); + goto fail; + } } /* Add special layout number for old ZNODES */ @@ -994,8 +1076,17 @@ sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count) (void) sa_add_layout_entry(os, sa_dummy_zpl_layout, 0, 1, 0, B_FALSE, NULL); } + *user_table = os->os_sa->sa_user_table; mutex_exit(&sa->sa_lock); - return (os->os_sa->sa_user_table); + return (0); +fail: + os->os_sa = NULL; + sa_free_attr_table(sa); + if (sa->sa_user_table) + kmem_free(sa->sa_user_table, sa->sa_user_table_sz); + mutex_exit(&sa->sa_lock); + kmem_free(sa, sizeof (sa_os_t)); + return ((error == ECKSUM) ? EIO : error); } void @@ -1004,20 +1095,12 @@ sa_tear_down(objset_t *os) sa_os_t *sa = os->os_sa; sa_lot_t *layout; void *cookie; - int i; kmem_free(sa->sa_user_table, sa->sa_user_table_sz); /* Free up attr table */ - for (i = 0; i != sa->sa_num_attrs; i++) { - if (sa->sa_attr_table[i].sa_name) - kmem_free(sa->sa_attr_table[i].sa_name, - strlen(sa->sa_attr_table[i].sa_name) + 1); - } - - kmem_free(sa->sa_attr_table, - sizeof (sa_attr_table_t) * sa->sa_num_attrs); + sa_free_attr_table(sa); cookie = NULL; while (layout = avl_destroy_nodes(&sa->sa_layout_hash_tree, &cookie)) { @@ -1361,11 +1444,9 @@ sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, uio_t *uio) ASSERT(hdl); mutex_enter(&hdl->sa_lock); - if (sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL) == 0) { + if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) == 0) { error = uiomove((void *)bulk.sa_addr, MIN(bulk.sa_size, uio->uio_resid), UIO_READ, uio); - } else { - error = ENOENT; } mutex_exit(&hdl->sa_lock); return (error); @@ -1373,11 +1454,6 @@ sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, uio_t *uio) } #endif -/* - * Find an already existing TOC from given os and data - * This is a special interface to be used by the ZPL for - * finding the uid/gid/gen attributes. - */ void * sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, void *data) { @@ -1475,12 +1551,10 @@ sa_attr_register_sync(sa_handle_t *hdl, dmu_tx_t *tx) } if (sa->sa_reg_attr_obj == NULL) { - int error; sa->sa_reg_attr_obj = zap_create(hdl->sa_os, DMU_OT_SA_ATTR_REGISTRATION, DMU_OT_NONE, 0, tx); - error = zap_add(hdl->sa_os, sa->sa_master_obj, - SA_REGISTRY, 8, 1, &sa->sa_reg_attr_obj, tx); - ASSERT(error == 0); + VERIFY(zap_add(hdl->sa_os, sa->sa_master_obj, + SA_REGISTRY, 8, 1, &sa->sa_reg_attr_obj, tx) == 0); } for (i = 0; i != sa->sa_num_attrs; i++) { if (sa->sa_attr_table[i].sa_registered) @@ -1538,6 +1612,8 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, uint16_t buflen, dmu_tx_t *tx) { sa_os_t *sa = hdl->sa_os->os_sa; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; + dnode_t *dn; sa_bulk_attr_t *attr_desc; void *old_data[2]; int bonus_attr_count = 0; @@ -1555,7 +1631,9 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, /* First make of copy of the old data */ - if (((dmu_buf_impl_t *)hdl->sa_bonus)->db_dnode->dn_bonuslen) { + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + if (dn->dn_bonuslen != 0) { bonus_data_size = hdl->sa_bonus->db_size; old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP); bcopy(hdl->sa_bonus->db_data, old_data[0], @@ -1564,16 +1642,21 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, } else { old_data[0] = NULL; } + DB_DNODE_EXIT(db); /* Bring spill buffer online if it isn't currently */ - if (sa_has_blkptr(hdl)) { + if ((error = sa_get_spill(hdl)) == 0) { spill_data_size = hdl->sa_spill->db_size; old_data[1] = kmem_alloc(spill_data_size, KM_SLEEP); bcopy(hdl->sa_spill->db_data, old_data[1], hdl->sa_spill->db_size); spill_attr_count = hdl->sa_spill_tab->sa_layout->lot_attr_count; + } else if (error && error != ENOENT) { + if (old_data[0]) + kmem_free(old_data[0], bonus_data_size); + return (error); } else { old_data[1] = NULL; } @@ -1722,6 +1805,7 @@ int sa_size(sa_handle_t *hdl, sa_attr_type_t attr, int *size) { sa_bulk_attr_t bulk; + int error; bulk.sa_data = NULL; bulk.sa_attr = attr; @@ -1729,9 +1813,9 @@ sa_size(sa_handle_t *hdl, sa_attr_type_t attr, int *size) ASSERT(hdl); mutex_enter(&hdl->sa_lock); - if (sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) { + if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) != 0) { mutex_exit(&hdl->sa_lock); - return (ENOENT); + return (error); } *size = bulk.sa_size; diff --git a/module/zfs/spa.c b/module/zfs/spa.c index d7c5de0d3..b6190e4cf 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -116,6 +116,7 @@ static boolean_t spa_has_active_shared_spare(spa_t *spa); static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config, spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, char **ereport); +static void spa_vdev_resilver_done(spa_t *spa); uint_t zio_taskq_batch_pct = 100; /* 1 thread per cpu in pset */ id_t zio_taskq_psrset_bind = PS_NONE; @@ -180,6 +181,8 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, size - alloc, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, + (spa_mode(spa) == FREAD), src); cap = (size == 0) ? 0 : (alloc * 100 / size); spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); @@ -529,7 +532,9 @@ spa_prop_set(spa_t *spa, nvlist_t *nvp) nvpair_name(elem))) == ZPROP_INVAL) return (EINVAL); - if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT) + if (prop == ZPOOL_PROP_CACHEFILE || + prop == ZPOOL_PROP_ALTROOT || + prop == ZPOOL_PROP_READONLY) continue; need_sync = B_TRUE; @@ -1284,33 +1289,131 @@ spa_check_removed(vdev_t *vd) } /* - * Load the slog device state from the config object since it's possible - * that the label does not contain the most up-to-date information. + * Validate the current config against the MOS config */ -void -spa_load_log_state(spa_t *spa, nvlist_t *nv) +static boolean_t +spa_config_valid(spa_t *spa, nvlist_t *config) { - vdev_t *ovd, *rvd = spa->spa_root_vdev; + vdev_t *mrvd, *rvd = spa->spa_root_vdev; + nvlist_t *nv; + + VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0); + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); + + ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children); /* - * Load the original root vdev tree from the passed config. + * If we're doing a normal import, then build up any additional + * diagnostic information about missing devices in this config. + * We'll pass this up to the user for further processing. */ - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - VERIFY(spa_config_parse(spa, &ovd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); + if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { + nvlist_t **child, *nv; + uint64_t idx = 0; + + child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **), + KM_SLEEP); + VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + vdev_t *mtvd = mrvd->vdev_child[c]; + + if (tvd->vdev_ops == &vdev_missing_ops && + mtvd->vdev_ops != &vdev_missing_ops && + mtvd->vdev_islog) + child[idx++] = vdev_config_generate(spa, mtvd, + B_FALSE, 0); + } + if (idx) { + VERIFY(nvlist_add_nvlist_array(nv, + ZPOOL_CONFIG_CHILDREN, child, idx) == 0); + VERIFY(nvlist_add_nvlist(spa->spa_load_info, + ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0); + + for (int i = 0; i < idx; i++) + nvlist_free(child[i]); + } + nvlist_free(nv); + kmem_free(child, rvd->vdev_children * sizeof (char **)); + } + + /* + * Compare the root vdev tree with the information we have + * from the MOS config (mrvd). Check each top-level vdev + * with the corresponding MOS config top-level (mtvd). + */ for (int c = 0; c < rvd->vdev_children; c++) { - vdev_t *cvd = rvd->vdev_child[c]; - if (cvd->vdev_islog) - vdev_load_log_state(cvd, ovd->vdev_child[c]); + vdev_t *tvd = rvd->vdev_child[c]; + vdev_t *mtvd = mrvd->vdev_child[c]; + + /* + * Resolve any "missing" vdevs in the current configuration. + * If we find that the MOS config has more accurate information + * about the top-level vdev then use that vdev instead. + */ + if (tvd->vdev_ops == &vdev_missing_ops && + mtvd->vdev_ops != &vdev_missing_ops) { + + if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) + continue; + + /* + * Device specific actions. + */ + if (mtvd->vdev_islog) { + spa_set_log_state(spa, SPA_LOG_CLEAR); + } else { + /* + * XXX - once we have 'readonly' pool + * support we should be able to handle + * missing data devices by transitioning + * the pool to readonly. + */ + continue; + } + + /* + * Swap the missing vdev with the data we were + * able to obtain from the MOS config. + */ + vdev_remove_child(rvd, tvd); + vdev_remove_child(mrvd, mtvd); + + vdev_add_child(rvd, mtvd); + vdev_add_child(mrvd, tvd); + + spa_config_exit(spa, SCL_ALL, FTAG); + vdev_load(mtvd); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + + vdev_reopen(rvd); + } else if (mtvd->vdev_islog) { + /* + * Load the slog device's state from the MOS config + * since it's possible that the label does not + * contain the most up-to-date information. + */ + vdev_load_log_state(tvd, mtvd); + vdev_reopen(tvd); + } } - vdev_free(ovd); + vdev_free(mrvd); spa_config_exit(spa, SCL_ALL, FTAG); + + /* + * Ensure we were able to validate the config. + */ + return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum); } /* * Check for missing log devices */ -int +static int spa_check_logs(spa_t *spa) { switch (spa->spa_log_state) { @@ -1474,9 +1577,19 @@ spa_load_verify(spa_t *spa) if (!error && sle.sle_meta_count <= policy.zrp_maxmeta && sle.sle_data_count <= policy.zrp_maxdata) { + int64_t loss = 0; + verify_ok = B_TRUE; spa->spa_load_txg = spa->spa_uberblock.ub_txg; spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; + + loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; + VERIFY(nvlist_add_uint64(spa->spa_load_info, + ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0); + VERIFY(nvlist_add_int64(spa->spa_load_info, + ZPOOL_CONFIG_REWIND_TIME, loss) == 0); + VERIFY(nvlist_add_uint64(spa->spa_load_info, + ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0); } else { spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; } @@ -1635,13 +1748,21 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, KM_SLEEP) == 0); } + gethrestime(&spa->spa_loaded_ts); error = spa_load_impl(spa, pool_guid, config, state, type, mosconfig, &ereport); } spa->spa_minref = refcount_count(&spa->spa_refcount); - if (error && error != EBADF) - zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); + if (error) { + if (error != EEXIST) { + spa->spa_loaded_ts.tv_sec = 0; + spa->spa_loaded_ts.tv_nsec = 0; + } + if (error != EBADF) { + zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); + } + } spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; spa->spa_ena = 0; @@ -1661,7 +1782,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, nvlist_t *nvroot = NULL; vdev_t *rvd; uberblock_t *ub = &spa->spa_uberblock; - uint64_t config_cache_txg = spa->spa_config_txg; + uint64_t children, config_cache_txg = spa->spa_config_txg; int orig_mode = spa->spa_mode; int parse; uint64_t obj; @@ -1760,9 +1881,13 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, /* * If the vdev guid sum doesn't match the uberblock, we have an - * incomplete configuration. + * incomplete configuration. We first check to see if the pool + * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN). + * If it is, defer the vdev_guid_sum check till later so we + * can handle missing vdevs. */ - if (mosconfig && type != SPA_IMPORT_ASSEMBLE && + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, + &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE && rvd->vdev_guid_sum != ub->ub_guid_sum) return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); @@ -1982,13 +2107,6 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, spa_config_exit(spa, SCL_ALL, FTAG); /* - * Check the state of the root vdev. If it can't be opened, it - * indicates one or more toplevel vdevs are faulted. - */ - if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) - return (ENXIO); - - /* * Load the DDTs (dedup tables). */ error = ddt_load(spa); @@ -1997,16 +2115,12 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, spa_update_dspace(spa); - if (state != SPA_LOAD_TRYIMPORT) { - error = spa_load_verify(spa); - if (error) - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, - error)); - } - /* - * Load the intent log state and check log integrity. If we're - * assembling a pool from a split, the log is not transferred over. + * Validate the config, using the MOS config to fill in any + * information which might be missing. If we fail to validate + * the config then declare the pool unfit for use. If we're + * assembling a pool from a split, the log is not transferred + * over. */ if (type != SPA_IMPORT_ASSEMBLE) { nvlist_t *nvconfig; @@ -2014,17 +2128,37 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - VERIFY(nvlist_lookup_nvlist(nvconfig, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0); - spa_load_log_state(spa, nvroot); + if (!spa_config_valid(spa, nvconfig)) { + nvlist_free(nvconfig); + return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, + ENXIO)); + } nvlist_free(nvconfig); + /* + * Now that we've validate the config, check the state of the + * root vdev. If it can't be opened, it indicates one or + * more toplevel vdevs are faulted. + */ + if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) + return (ENXIO); + if (spa_check_logs(spa)) { *ereport = FM_EREPORT_ZFS_LOG_REPLAY; return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); } } + /* + * We've successfully opened the pool, verify that we're ready + * to start pushing transactions. + */ + if (state != SPA_LOAD_TRYIMPORT) { + if (error = spa_load_verify(spa)) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, + error)); + } + if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || spa->spa_load_max_txg == UINT64_MAX)) { dmu_tx_t *tx; @@ -2066,12 +2200,13 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, * If the config cache is stale, or we have uninitialized * metaslabs (see spa_vdev_add()), then update the config. * - * If spa_load_verbatim is true, trust the current + * If this is a verbatim import, trust the current * in-core spa_config and update the disk labels. */ if (config_cache_txg != spa->spa_config_txg || - state == SPA_LOAD_IMPORT || spa->spa_load_verbatim || - state == SPA_LOAD_RECOVER) + state == SPA_LOAD_IMPORT || + state == SPA_LOAD_RECOVER || + (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) need_update = B_TRUE; for (int c = 0; c < rvd->vdev_children; c++) @@ -2110,12 +2245,14 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, static int spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) { + int mode = spa->spa_mode; + spa_unload(spa); spa_deactivate(spa); spa->spa_load_max_txg--; - spa_activate(spa, spa_mode_global); + spa_activate(spa, mode); spa_async_suspend(spa); return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); @@ -2173,9 +2310,6 @@ spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, rewind_error = spa_load_retry(spa, state, mosconfig); } - if (config) - spa_rewind_data_to_nvlist(spa, config); - spa->spa_extreme_rewind = B_FALSE; spa->spa_load_max_txg = UINT64_MAX; @@ -2202,6 +2336,7 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, nvlist_t **config) { spa_t *spa; + spa_load_state_t state = SPA_LOAD_OPEN; int error; int locked = B_FALSE; @@ -2225,7 +2360,6 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, } if (spa->spa_state == POOL_STATE_UNINITIALIZED) { - spa_load_state_t state = SPA_LOAD_OPEN; zpool_rewind_policy_t policy; zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config, @@ -2264,9 +2398,13 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, * information: the state of each vdev after the * attempted vdev_open(). Return this to the user. */ - if (config != NULL && spa->spa_config) + if (config != NULL && spa->spa_config) { VERIFY(nvlist_dup(spa->spa_config, config, KM_SLEEP) == 0); + VERIFY(nvlist_add_nvlist(*config, + ZPOOL_CONFIG_LOAD_INFO, + spa->spa_load_info) == 0); + } spa_unload(spa); spa_deactivate(spa); spa->spa_last_open_failed = error; @@ -2275,15 +2413,22 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, *spapp = NULL; return (error); } - } spa_open_ref(spa, tag); - if (config != NULL) *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); + /* + * If we've recovered the pool, pass back any information we + * gathered while doing the load. + */ + if (state == SPA_LOAD_RECOVER) { + VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, + spa->spa_load_info) == 0); + } + if (locked) { spa->spa_last_open_failed = 0; spa->spa_last_ubsync_txg = 0; @@ -2459,6 +2604,13 @@ spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); if (*config != NULL) { + uint64_t loadtimes[2]; + + loadtimes[0] = spa->spa_loaded_ts.tv_sec; + loadtimes[1] = spa->spa_loaded_ts.tv_nsec; + VERIFY(nvlist_add_uint64_array(*config, + ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); + VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, spa_get_errlog_size(spa)) == 0); @@ -3032,7 +3184,7 @@ spa_import_rootpool(char *devpath, char *devid) spa = spa_add(pname, config, NULL); spa->spa_is_root = B_TRUE; - spa->spa_load_verbatim = B_TRUE; + spa->spa_import_flags = ZFS_IMPORT_VERBATIM; /* * Build up a vdev tree based on the boot device's label config. @@ -3081,7 +3233,8 @@ spa_import_rootpool(char *devpath, char *devid) !bvd->vdev_isspare) { cmn_err(CE_NOTE, "The boot device is currently spared. Please " "try booting from '%s'", - bvd->vdev_parent->vdev_child[1]->vdev_path); + bvd->vdev_parent-> + vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); error = EINVAL; goto out; } @@ -3101,48 +3254,17 @@ out: #endif /* - * Take a pool and insert it into the namespace as if it had been loaded at - * boot. - */ -int -spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props) -{ - spa_t *spa; - char *altroot = NULL; - - mutex_enter(&spa_namespace_lock); - if (spa_lookup(pool) != NULL) { - mutex_exit(&spa_namespace_lock); - return (EEXIST); - } - - (void) nvlist_lookup_string(props, - zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); - spa = spa_add(pool, config, altroot); - - spa->spa_load_verbatim = B_TRUE; - - if (props != NULL) - spa_configfile_set(spa, props, B_FALSE); - - spa_config_sync(spa, B_FALSE, B_TRUE); - - mutex_exit(&spa_namespace_lock); - spa_history_log_version(spa, LOG_POOL_IMPORT); - - return (0); -} - -/* * Import a non-root pool into the system. */ int -spa_import(const char *pool, nvlist_t *config, nvlist_t *props) +spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) { spa_t *spa; char *altroot = NULL; spa_load_state_t state = SPA_LOAD_IMPORT; zpool_rewind_policy_t policy; + uint64_t mode = spa_mode_global; + uint64_t readonly = B_FALSE; int error; nvlist_t *nvroot; nvlist_t **spares, **l2cache; @@ -3157,23 +3279,45 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props) return (EEXIST); } - zpool_get_rewind_policy(config, &policy); - if (policy.zrp_request & ZPOOL_DO_REWIND) - state = SPA_LOAD_RECOVER; - /* * Create and initialize the spa structure. */ (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); + (void) nvlist_lookup_uint64(props, + zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); + if (readonly) + mode = FREAD; spa = spa_add(pool, config, altroot); - spa_activate(spa, spa_mode_global); + spa->spa_import_flags = flags; + + /* + * Verbatim import - Take a pool and insert it into the namespace + * as if it had been loaded at boot. + */ + if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { + if (props != NULL) + spa_configfile_set(spa, props, B_FALSE); + + spa_config_sync(spa, B_FALSE, B_TRUE); + + mutex_exit(&spa_namespace_lock); + spa_history_log_version(spa, LOG_POOL_IMPORT); + + return (0); + } + + spa_activate(spa, mode); /* * Don't start async tasks until we know everything is healthy. */ spa_async_suspend(spa); + zpool_get_rewind_policy(config, &policy); + if (policy.zrp_request & ZPOOL_DO_REWIND) + state = SPA_LOAD_RECOVER; + /* * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig * because the user-supplied config is actually the one to trust when @@ -3181,14 +3325,16 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props) */ if (state != SPA_LOAD_RECOVER) spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; + error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, policy.zrp_request); /* - * Propagate anything learned about failing or best txgs - * back to caller + * Propagate anything learned while loading the pool and pass it + * back to caller (i.e. rewind info, missing devices, etc). */ - spa_rewind_data_to_nvlist(spa, config); + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, + spa->spa_load_info) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* @@ -3228,6 +3374,8 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props) return (error); } + spa_async_resume(spa); + /* * Override any spares and level 2 cache devices as specified by * the user, as these may have correct device names/devids, etc. @@ -3278,8 +3426,6 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props) spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); } - spa_async_resume(spa); - /* * It's possible that the pool was expanded while it was exported. * We kick off an async task to handle this for us. @@ -3542,6 +3688,8 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; + ASSERT(spa_writeable(spa)); + txg = spa_vdev_enter(spa); if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, @@ -3653,6 +3801,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) int newvd_isspare; int error; + ASSERT(spa_writeable(spa)); + txg = spa_vdev_enter(spa); oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); @@ -3702,7 +3852,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) * spares. */ if (pvd->vdev_ops == &vdev_spare_ops && - pvd->vdev_child[1] == oldvd && + oldvd->vdev_isspare && !spa_has_spare(spa, newvd->vdev_guid)) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); @@ -3714,13 +3864,15 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) * the same (spare replaces spare, non-spare replaces * non-spare). */ - if (pvd->vdev_ops == &vdev_replacing_ops) + if (pvd->vdev_ops == &vdev_replacing_ops && + spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); - else if (pvd->vdev_ops == &vdev_spare_ops && - newvd->vdev_isspare != oldvd->vdev_isspare) + } else if (pvd->vdev_ops == &vdev_spare_ops && + newvd->vdev_isspare != oldvd->vdev_isspare) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); - else if (pvd->vdev_ops != &vdev_spare_ops && - newvd->vdev_isspare) + } + + if (newvd->vdev_isspare) pvops = &vdev_spare_ops; else pvops = &vdev_replacing_ops; @@ -3755,6 +3907,9 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) } } + /* mark the device being resilvered */ + newvd->vdev_resilvering = B_TRUE; + /* * If the parent is not a mirror, or if we're replacing, insert the new * mirror/replacing/spare vdev above oldvd. @@ -3823,6 +3978,9 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) spa_strfree(oldvdpath); spa_strfree(newvdpath); + if (spa->spa_bootfs) + spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH); + return (0); } @@ -3840,9 +3998,10 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) vdev_t *vd, *pvd, *cvd, *tvd; boolean_t unspare = B_FALSE; uint64_t unspare_guid; - size_t len; char *vdpath; + ASSERT(spa_writeable(spa)); + txg = spa_vdev_enter(spa); vd = spa_lookup_by_guid(spa, guid, B_FALSE); @@ -3872,18 +4031,11 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); /* - * If replace_done is specified, only remove this device if it's - * the first child of a replacing vdev. For the 'spare' vdev, either - * disk can be removed. + * Only 'replacing' or 'spare' vdevs can be replaced. */ - if (replace_done) { - if (pvd->vdev_ops == &vdev_replacing_ops) { - if (vd->vdev_id != 0) - return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - } else if (pvd->vdev_ops != &vdev_spare_ops) { - return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - } - } + if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && + pvd->vdev_ops != &vdev_spare_ops) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); ASSERT(pvd->vdev_ops != &vdev_spare_ops || spa_version(spa) >= SPA_VERSION_SPARES); @@ -3910,16 +4062,22 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) * check to see if we changed the original vdev's path to have "/old" * at the end in spa_vdev_attach(). If so, undo that change now. */ - if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id == 1 && - pvd->vdev_child[0]->vdev_path != NULL && - pvd->vdev_child[1]->vdev_path != NULL) { - ASSERT(pvd->vdev_child[1] == vd); - cvd = pvd->vdev_child[0]; - len = strlen(vd->vdev_path); - if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && - strcmp(cvd->vdev_path + len, "/old") == 0) { - spa_strfree(cvd->vdev_path); - cvd->vdev_path = spa_strdup(vd->vdev_path); + if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && + vd->vdev_path != NULL) { + size_t len = strlen(vd->vdev_path); + + for (int c = 0; c < pvd->vdev_children; c++) { + cvd = pvd->vdev_child[c]; + + if (cvd == vd || cvd->vdev_path == NULL) + continue; + + if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && + strcmp(cvd->vdev_path + len, "/old") == 0) { + spa_strfree(cvd->vdev_path); + cvd->vdev_path = spa_strdup(vd->vdev_path); + break; + } } } @@ -3929,7 +4087,8 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) * active spare list for the pool. */ if (pvd->vdev_ops == &vdev_spare_ops && - vd->vdev_id == 0 && pvd->vdev_child[1]->vdev_isspare) + vd->vdev_id == 0 && + pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) unspare = B_TRUE; /* @@ -3951,7 +4110,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) /* * Remember one of the remaining children so we can get tvd below. */ - cvd = pvd->vdev_child[0]; + cvd = pvd->vdev_child[pvd->vdev_children - 1]; /* * If we need to remove the remaining child from the list of hot spares, @@ -3967,14 +4126,20 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) spa_spare_remove(cvd); unspare_guid = cvd->vdev_guid; (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); + cvd->vdev_unspare = B_TRUE; } /* * If the parent mirror/replacing vdev only has one child, * the parent is no longer needed. Remove it from the tree. */ - if (pvd->vdev_children == 1) + if (pvd->vdev_children == 1) { + if (pvd->vdev_ops == &vdev_spare_ops) + cvd->vdev_unspare = B_FALSE; vdev_remove_parent(cvd); + cvd->vdev_resilvering = B_FALSE; + } + /* * We don't set tvd until now because the parent we just removed @@ -4016,6 +4181,9 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); + /* hang on to the spa before we release the lock */ + spa_open_ref(spa, FTAG); + error = spa_vdev_exit(spa, vd, txg, 0); spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, NULL, @@ -4028,24 +4196,31 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) * list of every other pool. */ if (unspare) { - spa_t *myspa = spa; - spa = NULL; + spa_t *altspa = NULL; + mutex_enter(&spa_namespace_lock); - while ((spa = spa_next(spa)) != NULL) { - if (spa->spa_state != POOL_STATE_ACTIVE) - continue; - if (spa == myspa) + while ((altspa = spa_next(altspa)) != NULL) { + if (altspa->spa_state != POOL_STATE_ACTIVE || + altspa == spa) continue; - spa_open_ref(spa, FTAG); + + spa_open_ref(altspa, FTAG); mutex_exit(&spa_namespace_lock); - (void) spa_vdev_remove(spa, unspare_guid, - B_TRUE); + (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); mutex_enter(&spa_namespace_lock); - spa_close(spa, FTAG); + spa_close(altspa, FTAG); } mutex_exit(&spa_namespace_lock); + + /* search the rest of the vdevs for spares to remove */ + spa_vdev_resilver_done(spa); } + /* all done with the spa; OK to release */ + mutex_enter(&spa_namespace_lock); + spa_close(spa, FTAG); + mutex_exit(&spa_namespace_lock); + return (error); } @@ -4066,8 +4241,7 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, vdev_t *rvd, **vml = NULL; /* vdev modify list */ boolean_t activate_slog; - if (!spa_writeable(spa)) - return (EROFS); + ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); @@ -4484,6 +4658,8 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) int error = 0; boolean_t locked = MUTEX_HELD(&spa_namespace_lock); + ASSERT(spa_writeable(spa)); + if (!locked) txg = spa_vdev_enter(spa); @@ -4593,11 +4769,18 @@ spa_vdev_resilver_done_hunt(vdev_t *vd) } /* - * Check for a completed replacement. + * Check for a completed replacement. We always consider the first + * vdev in the list to be the oldest vdev, and the last one to be + * the newest (see spa_vdev_attach() for how that works). In + * the case where the newest vdev is faulted, we will not automatically + * remove it after a resilver completes. This is OK as it will require + * user intervention to determine which disk the admin wishes to keep. */ - if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { + if (vd->vdev_ops == &vdev_replacing_ops) { + ASSERT(vd->vdev_children > 1); + + newvd = vd->vdev_child[vd->vdev_children - 1]; oldvd = vd->vdev_child[0]; - newvd = vd->vdev_child[1]; if (vdev_dtl_empty(newvd, DTL_MISSING) && vdev_dtl_empty(newvd, DTL_OUTAGE) && @@ -4608,16 +4791,41 @@ spa_vdev_resilver_done_hunt(vdev_t *vd) /* * Check for a completed resilver with the 'unspare' flag set. */ - if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) { - newvd = vd->vdev_child[0]; - oldvd = vd->vdev_child[1]; + if (vd->vdev_ops == &vdev_spare_ops) { + vdev_t *first = vd->vdev_child[0]; + vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; + + if (last->vdev_unspare) { + oldvd = first; + newvd = last; + } else if (first->vdev_unspare) { + oldvd = last; + newvd = first; + } else { + oldvd = NULL; + } - if (newvd->vdev_unspare && + if (oldvd != NULL && vdev_dtl_empty(newvd, DTL_MISSING) && vdev_dtl_empty(newvd, DTL_OUTAGE) && - !vdev_dtl_required(oldvd)) { - newvd->vdev_unspare = 0; + !vdev_dtl_required(oldvd)) return (oldvd); + + /* + * If there are more than two spares attached to a disk, + * and those spares are not required, then we want to + * attempt to free them up now so that they can be used + * by other pools. Once we're back down to a single + * disk+spare, we stop removing them. + */ + if (vd->vdev_children > 2) { + newvd = vd->vdev_child[1]; + + if (newvd->vdev_isspare && last->vdev_isspare && + vdev_dtl_empty(last, DTL_MISSING) && + vdev_dtl_empty(last, DTL_OUTAGE) && + !vdev_dtl_required(newvd)) + return (newvd); } } @@ -4644,9 +4852,9 @@ spa_vdev_resilver_done(spa_t *spa) * we need to detach the parent's first child (the original hot * spare) as well. */ - if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0) { + if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && + ppvd->vdev_children == 2) { ASSERT(pvd->vdev_ops == &vdev_replacing_ops); - ASSERT(ppvd->vdev_children == 2); sguid = ppvd->vdev_child[1]->vdev_guid; } spa_config_exit(spa, SCL_ALL, FTAG); @@ -4670,6 +4878,8 @@ spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, vdev_t *vd; boolean_t sync = B_FALSE; + ASSERT(spa_writeable(spa)); + spa_vdev_state_enter(spa, SCL_ALL); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) @@ -5115,9 +5325,11 @@ spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) ASSERT(spa->spa_root != NULL); break; + case ZPOOL_PROP_READONLY: case ZPOOL_PROP_CACHEFILE: /* - * 'cachefile' is also a non-persisitent property. + * 'readonly' and 'cachefile' are also non-persisitent + * properties. */ break; default: @@ -5249,6 +5461,8 @@ spa_sync(spa_t *spa, uint64_t txg) dmu_tx_t *tx; int error; + VERIFY(spa_writeable(spa)); + /* * Lock out configuration changes. */ @@ -5467,7 +5681,8 @@ spa_sync_allpools(void) spa_t *spa = NULL; mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) { - if (spa_state(spa) != POOL_STATE_ACTIVE || spa_suspended(spa)) + if (spa_state(spa) != POOL_STATE_ACTIVE || + !spa_writeable(spa) || spa_suspended(spa)) continue; spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); @@ -5547,6 +5762,8 @@ spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) void spa_upgrade(spa_t *spa, uint64_t version) { + ASSERT(spa_writeable(spa)); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c index cdeda3f93..69d57f66d 100644 --- a/module/zfs/spa_config.c +++ b/module/zfs/spa_config.c @@ -304,24 +304,6 @@ spa_config_set(spa_t *spa, nvlist_t *config) mutex_exit(&spa->spa_props_lock); } -/* Add discovered rewind info, if any to the provided nvlist */ -void -spa_rewind_data_to_nvlist(spa_t *spa, nvlist_t *tonvl) -{ - int64_t loss = 0; - - if (tonvl == NULL || spa->spa_load_txg == 0) - return; - - VERIFY(nvlist_add_uint64(tonvl, ZPOOL_CONFIG_LOAD_TIME, - spa->spa_load_txg_ts) == 0); - if (spa->spa_last_ubsync_txg) - loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; - VERIFY(nvlist_add_int64(tonvl, ZPOOL_CONFIG_REWIND_TIME, loss) == 0); - VERIFY(nvlist_add_uint64(tonvl, ZPOOL_CONFIG_LOAD_DATA_ERRORS, - spa->spa_load_data_errors) == 0); -} - /* * Generate the pool's configuration based on the current in-core state. * We infer whether to generate a complete config or just one top-level config @@ -403,8 +385,7 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) /* * Add the top-level config. We even add this on pools which - * don't support holes in the namespace as older pools will - * just ignore it. + * don't support holes in the namespace. */ vdev_top_config_generate(spa, config); @@ -449,8 +430,6 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) kmem_free(dds, sizeof (ddt_stat_t)); } - spa_rewind_data_to_nvlist(spa, config); - if (locked) spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 52af7fcb7..1b54afb0b 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -478,6 +478,9 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path); list_insert_head(&spa->spa_config_list, dp); + VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME, + KM_SLEEP) == 0); + if (config != NULL) VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); @@ -516,6 +519,7 @@ spa_remove(spa_t *spa) list_destroy(&spa->spa_config_list); + nvlist_free(spa->spa_load_info); spa_config_set(spa, NULL); refcount_destroy(&spa->spa_refcount); @@ -886,10 +890,6 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag) */ vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE); - /* - * If the config changed, notify the scrub that it must restart. - * This will initiate a resilver if needed. - */ if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) { config_changed = B_TRUE; spa->spa_config_generation++; @@ -1078,12 +1078,12 @@ spa_rename(const char *name, const char *newname) } /* - * Determine whether a pool with given pool_guid exists. If device_guid is - * non-zero, determine whether the pool exists *and* contains a device with the - * specified device_guid. + * Return the spa_t associated with given pool_guid, if it exists. If + * device_guid is non-zero, determine whether the pool exists *and* contains + * a device with the specified device_guid. */ -boolean_t -spa_guid_exists(uint64_t pool_guid, uint64_t device_guid) +spa_t * +spa_by_guid(uint64_t pool_guid, uint64_t device_guid) { spa_t *spa; avl_tree_t *t = &spa_namespace_avl; @@ -1114,7 +1114,16 @@ spa_guid_exists(uint64_t pool_guid, uint64_t device_guid) } } - return (spa != NULL); + return (spa); +} + +/* + * Determine whether a pool with the given pool_guid exists. + */ +boolean_t +spa_guid_exists(uint64_t pool_guid, uint64_t device_guid) +{ + return (spa_by_guid(pool_guid, device_guid) != NULL); } char * diff --git a/module/zfs/txg.c b/module/zfs/txg.c index f478ad0c6..9b308ca4e 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -37,7 +37,7 @@ static void txg_sync_thread(dsl_pool_t *dp); static void txg_quiesce_thread(dsl_pool_t *dp); -int zfs_txg_timeout = 30; /* max seconds worth of delta per txg */ +int zfs_txg_timeout = 5; /* max seconds worth of delta per txg */ /* * Prepare the txg subsystem. diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index a61f29b8e..bac3e8605 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -207,9 +207,6 @@ vdev_add_child(vdev_t *pvd, vdev_t *cvd) */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum += cvd->vdev_guid_sum; - - if (cvd->vdev_ops->vdev_op_leaf) - cvd->vdev_spa->spa_scrub_maxinflight += zfs_scrub_limit; } void @@ -244,9 +241,6 @@ vdev_remove_child(vdev_t *pvd, vdev_t *cvd) */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum -= cvd->vdev_guid_sum; - - if (cvd->vdev_ops->vdev_op_leaf) - cvd->vdev_spa->spa_scrub_maxinflight -= zfs_scrub_limit; } /* @@ -524,6 +518,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &vd->vdev_offline); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVERING, + &vd->vdev_resilvering); + /* * When importing a pool, we want to ignore the persistent fault * state, as the diagnosis made on another system may not be @@ -1375,10 +1372,10 @@ vdev_validate(vdev_t *vd) nvlist_free(label); /* - * If spa->spa_load_verbatim is true, no need to check the + * If this is a verbatim import, no need to check the * state of the pool. */ - if (!spa->spa_load_verbatim && + if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) && spa_load_state(spa) == SPA_LOAD_OPEN && state != POOL_STATE_ACTIVE) return (EBADF); @@ -1544,6 +1541,7 @@ vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) ASSERT(vd == vd->vdev_top); ASSERT(!vd->vdev_ishole); ASSERT(ISP2(flags)); + ASSERT(spa_writeable(vd->vdev_spa)); if (flags & VDD_METASLAB) (void) txg_list_add(&vd->vdev_ms_list, arg, txg); @@ -1599,6 +1597,7 @@ vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); + ASSERT(spa_writeable(vd->vdev_spa)); mutex_enter(sm->sm_lock); if (!space_map_contains(sm, txg, size)) @@ -1855,6 +1854,9 @@ vdev_dtl_required(vdev_t *vd) vd->vdev_cant_read = cant_read; vdev_dtl_reassess(tvd, 0, 0, B_FALSE); + if (!required && zio_injection_enabled) + required = !!zio_handle_device_injection(vd, NULL, ECHILD); + return (required); } @@ -2070,7 +2072,7 @@ vdev_psize_to_asize(vdev_t *vd, uint64_t psize) int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) { - vdev_t *vd; + vdev_t *vd, *tvd; spa_vdev_state_enter(spa, SCL_NONE); @@ -2080,6 +2082,8 @@ vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + tvd = vd->vdev_top; + /* * We don't directly use the aux state here, but if we do a * vdev_reopen(), we need this value to be present to remember why we @@ -2099,7 +2103,7 @@ vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) * If this device has the only valid copy of the data, then * back off and simply mark the vdev as degraded instead. */ - if (!vd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { + if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { vd->vdev_degraded = 1ULL; vd->vdev_faulted = 0ULL; @@ -2107,7 +2111,7 @@ vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) * If we reopen the device and it's not dead, only then do we * mark it degraded. */ - vdev_reopen(vd); + vdev_reopen(tvd); if (vdev_readable(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); @@ -2349,15 +2353,15 @@ vdev_clear(spa_t *spa, vdev_t *vd) */ vd->vdev_forcefault = B_TRUE; - vd->vdev_faulted = vd->vdev_degraded = 0; + vd->vdev_faulted = vd->vdev_degraded = 0ULL; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; - vdev_reopen(vd); + vdev_reopen(vd == rvd ? rvd : vd->vdev_top); vd->vdev_forcefault = B_FALSE; - if (vd != rvd) + if (vd != rvd && vdev_writeable(vd->vdev_top)) vdev_state_dirty(vd->vdev_top); if (vd->vdev_aux == NULL && !vdev_is_dead(vd)) @@ -2541,7 +2545,7 @@ vdev_stat_update(zio_t *zio, uint64_t psize) mutex_enter(&vd->vdev_stat_lock); if (flags & ZIO_FLAG_IO_REPAIR) { - if (flags & ZIO_FLAG_SCRUB_THREAD) { + if (flags & ZIO_FLAG_SCAN_THREAD) { dsl_scan_phys_t *scn_phys = &spa->spa_dsl_pool->dp_scan->scn_phys; uint64_t *processed = &scn_phys->scn_processed; @@ -2597,7 +2601,7 @@ vdev_stat_update(zio_t *zio, uint64_t psize) if (type == ZIO_TYPE_WRITE && txg != 0 && (!(flags & ZIO_FLAG_IO_REPAIR) || - (flags & ZIO_FLAG_SCRUB_THREAD) || + (flags & ZIO_FLAG_SCAN_THREAD) || spa->spa_claiming)) { /* * This is either a normal write (not a repair), or it's @@ -2616,7 +2620,7 @@ vdev_stat_update(zio_t *zio, uint64_t psize) */ if (vd->vdev_ops->vdev_op_leaf) { uint64_t commit_txg = txg; - if (flags & ZIO_FLAG_SCRUB_THREAD) { + if (flags & ZIO_FLAG_SCAN_THREAD) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); ASSERT(spa_sync_pass(spa) == 1); vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); @@ -2699,6 +2703,8 @@ vdev_config_dirty(vdev_t *vd) vdev_t *rvd = spa->spa_root_vdev; int c; + ASSERT(spa_writeable(spa)); + /* * If this is an aux vdev (as with l2cache and spare devices), then we * update the vdev config manually and set the sync flag. @@ -2787,6 +2793,7 @@ vdev_state_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; + ASSERT(spa_writeable(spa)); ASSERT(vd == vd->vdev_top); /* @@ -2944,12 +2951,13 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) vd->vdev_removed = B_TRUE; } else if (state == VDEV_STATE_CANT_OPEN) { /* - * If we fail to open a vdev during an import, we mark it as - * "not available", which signifies that it was never there to - * begin with. Failure to open such a device is not considered - * an error. + * If we fail to open a vdev during an import or recovery, we + * mark it as "not available", which signifies that it was + * never there to begin with. Failure to open such a device + * is not considered an error. */ - if (spa_load_state(spa) == SPA_LOAD_IMPORT && + if ((spa_load_state(spa) == SPA_LOAD_IMPORT || + spa_load_state(spa) == SPA_LOAD_RECOVER) && vd->vdev_ops->vdev_op_leaf) vd->vdev_not_present = 1; @@ -3042,32 +3050,52 @@ vdev_is_bootable(vdev_t *vd) /* * Load the state from the original vdev tree (ovd) which * we've retrieved from the MOS config object. If the original - * vdev was offline then we transfer that state to the device - * in the current vdev tree (nvd). + * vdev was offline or faulted then we transfer that state to the + * device in the current vdev tree (nvd). */ void vdev_load_log_state(vdev_t *nvd, vdev_t *ovd) { spa_t *spa = nvd->vdev_spa; + ASSERT(nvd->vdev_top->vdev_islog); ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid); for (int c = 0; c < nvd->vdev_children; c++) vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]); - if (nvd->vdev_ops->vdev_op_leaf && ovd->vdev_offline) { + if (nvd->vdev_ops->vdev_op_leaf) { /* - * It would be nice to call vdev_offline() - * directly but the pool isn't fully loaded and - * the txg threads have not been started yet. + * Restore the persistent vdev state */ nvd->vdev_offline = ovd->vdev_offline; - vdev_reopen(nvd->vdev_top); + nvd->vdev_faulted = ovd->vdev_faulted; + nvd->vdev_degraded = ovd->vdev_degraded; + nvd->vdev_removed = ovd->vdev_removed; } } /* + * Determine if a log device has valid content. If the vdev was + * removed or faulted in the MOS config then we know that + * the content on the log device has already been written to the pool. + */ +boolean_t +vdev_log_state_valid(vdev_t *vd) +{ + if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && + !vd->vdev_removed) + return (B_TRUE); + + for (int c = 0; c < vd->vdev_children; c++) + if (vdev_log_state_valid(vd->vdev_child[c])) + return (B_TRUE); + + return (B_FALSE); +} + +/* * Expand a vdev if possible. */ void diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 75ec54534..c08ed8ba0 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -353,6 +353,9 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (vd->vdev_offline && !vd->vdev_tmpoffline) VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, B_TRUE) == 0); + if (vd->vdev_resilvering) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVERING, + B_TRUE) == 0); if (vd->vdev_faulted) VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, B_TRUE) == 0); @@ -571,6 +574,15 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, return (B_TRUE); /* + * We can't rely on a pool's state if it's been imported + * read-only. Instead we look to see if the pools is marked + * read-only in the namespace and set the state to active. + */ + if ((spa = spa_by_guid(pool_guid, device_guid)) != NULL && + spa_mode(spa) == FREAD) + state = POOL_STATE_ACTIVE; + + /* * If the device is marked ACTIVE, then this device is in use by another * pool on the system. */ diff --git a/module/zfs/zfs_acl.c b/module/zfs/zfs_acl.c index 1181bd443..843b5ff06 100644 --- a/module/zfs/zfs_acl.c +++ b/module/zfs/zfs_acl.c @@ -327,19 +327,35 @@ static acl_ops_t zfs_acl_fuid_ops = { * an external ACL and what version of ACL previously existed on the * file. Would really be nice to not need this, sigh. */ - uint64_t zfs_external_acl(znode_t *zp) { zfs_acl_phys_t acl_phys; + int error; if (zp->z_is_sa) return (0); - VERIFY(0 == sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zp->z_zfsvfs), - &acl_phys, sizeof (acl_phys))); + /* + * Need to deal with a potential + * race where zfs_sa_upgrade could cause + * z_isa_sa to change. + * + * If the lookup fails then the state of z_is_sa should have + * changed. + */ - return (acl_phys.z_acl_extern_obj); + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zp->z_zfsvfs), + &acl_phys, sizeof (acl_phys))) == 0) + return (acl_phys.z_acl_extern_obj); + else { + /* + * after upgrade the SA_ZPL_ZNODE_ACL should have been + * removed + */ + VERIFY(zp->z_is_sa && error == ENOENT); + return (0); + } } /* @@ -357,6 +373,7 @@ zfs_acl_znode_info(znode_t *zp, int *aclsize, int *aclcount, int size; int error; + ASSERT(MUTEX_HELD(&zp->z_acl_lock)); if (zp->z_is_sa) { if ((error = sa_size(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zfsvfs), &size)) != 0) @@ -387,13 +404,31 @@ zfs_znode_acl_version(znode_t *zp) { zfs_acl_phys_t acl_phys; - if (zp->z_is_sa) { + if (zp->z_is_sa) return (ZFS_ACL_VERSION_FUID); - } else { - VERIFY(0 == sa_lookup(zp->z_sa_hdl, + else { + int error; + + /* + * Need to deal with a potential + * race where zfs_sa_upgrade could cause + * z_isa_sa to change. + * + * If the lookup fails then the state of z_is_sa should have + * changed. + */ + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zp->z_zfsvfs), - &acl_phys, sizeof (acl_phys))); - return (acl_phys.z_acl_version); + &acl_phys, sizeof (acl_phys))) == 0) + return (acl_phys.z_acl_version); + else { + /* + * After upgrade SA_ZPL_ZNODE_ACL should have + * been removed. + */ + VERIFY(zp->z_is_sa && error == ENOENT); + return (ZFS_ACL_VERSION_FUID); + } } } @@ -1024,7 +1059,8 @@ zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp, * create a new acl and leave any cached acl in place. */ static int -zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify) +zfs_acl_node_read(znode_t *zp, boolean_t have_lock, zfs_acl_t **aclpp, + boolean_t will_modify) { zfs_acl_t *aclp; int aclsize; @@ -1033,6 +1069,7 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify) zfs_acl_phys_t znode_acl; int version; int error; + boolean_t drop_lock = B_FALSE; ASSERT(MUTEX_HELD(&zp->z_acl_lock)); @@ -1041,11 +1078,23 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify) return (0); } - version = ZNODE_ACL_VERSION(zp); + /* + * close race where znode could be upgrade while trying to + * read the znode attributes. + * + * But this could only happen if the file isn't already an SA + * znode + */ + if (!zp->z_is_sa && !have_lock) { + mutex_enter(&zp->z_lock); + drop_lock = B_TRUE; + } + version = zfs_znode_acl_version(zp); if ((error = zfs_acl_znode_info(zp, &aclsize, - &acl_count, &znode_acl)) != 0) - return (error); + &acl_count, &znode_acl)) != 0) { + goto done; + } aclp = zfs_acl_alloc(version); @@ -1076,7 +1125,7 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify) /* convert checksum errors into IO errors */ if (error == ECKSUM) error = EIO; - return (error); + goto done; } list_insert_head(&aclp->z_acl, aclnode); @@ -1084,7 +1133,10 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify) *aclpp = aclp; if (!will_modify) zp->z_acl_cached = aclp; - return (0); +done: + if (drop_lock) + mutex_exit(&zp->z_lock); + return (error); } /*ARGSUSED*/ @@ -1104,44 +1156,18 @@ zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen, *length = cb->cb_acl_node->z_size; } - -static int -zfs_acl_get_owner_fuids(znode_t *zp, uint64_t *fuid, uint64_t *fgid) -{ - int count = 0; - sa_bulk_attr_t bulk[2]; - int error; - - if (IS_EPHEMERAL(zp->z_uid) || IS_EPHEMERAL(zp->z_gid)) { - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zp->z_zfsvfs), NULL, - &fuid, sizeof (fuid)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zp->z_zfsvfs), NULL, - &fgid, sizeof (fuid)); - if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { - return (error); - } - } else { - *fuid = zp->z_uid; - *fgid = zp->z_gid; - } - return (0); -} - int zfs_acl_chown_setattr(znode_t *zp) { int error; zfs_acl_t *aclp; - uint64_t fuid, fgid; - if ((error = zfs_acl_get_owner_fuids(zp, &fuid, &fgid)) != 0) - return (error); + ASSERT(MUTEX_HELD(&zp->z_lock)); + ASSERT(MUTEX_HELD(&zp->z_acl_lock)); - mutex_enter(&zp->z_acl_lock); - if ((error = zfs_acl_node_read(zp, &aclp, B_FALSE)) == 0) + if ((error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE)) == 0) zp->z_mode = zfs_mode_compute(zp->z_mode, aclp, - &zp->z_pflags, fuid, fgid); - mutex_exit(&zp->z_acl_lock); + &zp->z_pflags, zp->z_uid, zp->z_gid); return (error); } @@ -1163,14 +1189,11 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx) sa_bulk_attr_t bulk[5]; uint64_t ctime[2]; int count = 0; - uint64_t fuid, fgid; mode = zp->z_mode; - if ((error = zfs_acl_get_owner_fuids(zp, &fuid, &fgid)) != 0) - return (error); - - mode = zfs_mode_compute(mode, aclp, &zp->z_pflags, fuid, fgid); + mode = zfs_mode_compute(mode, aclp, &zp->z_pflags, + zp->z_uid, zp->z_gid); zp->z_mode = mode; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, @@ -1482,18 +1505,17 @@ zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t mode, zfs_acl_t *aclp) list_insert_tail(&aclp->z_acl, newnode); } -int +void zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode) { - mutex_enter(&zp->z_lock); mutex_enter(&zp->z_acl_lock); + mutex_enter(&zp->z_lock); *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp)); (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS; zfs_acl_chmod(zp->z_zfsvfs, mode, *aclp); - mutex_exit(&zp->z_acl_lock); mutex_exit(&zp->z_lock); + mutex_exit(&zp->z_acl_lock); ASSERT(*aclp); - return (0); } /* @@ -1660,7 +1682,6 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, gid_t gid; boolean_t need_chmod = B_TRUE; boolean_t inherited = B_FALSE; - uint64_t parentgid; bzero(acl_ids, sizeof (zfs_acl_ids_t)); acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode); @@ -1682,12 +1703,6 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, ZFS_GROUP, &acl_ids->z_fuidp); gid = vap->va_gid; } else { - if (IS_EPHEMERAL(dzp->z_gid)) - VERIFY(0 == sa_lookup(dzp->z_sa_hdl, SA_ZPL_GID(zfsvfs), - &parentgid, sizeof (parentgid))); - else - parentgid = (uint64_t)dzp->z_gid; - acl_ids->z_fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER, cr, &acl_ids->z_fuidp); acl_ids->z_fgid = 0; @@ -1696,7 +1711,7 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, (uint64_t)vap->va_gid, cr, ZFS_GROUP, &acl_ids->z_fuidp); gid = vap->va_gid; - if (acl_ids->z_fgid != parentgid && + if (acl_ids->z_fgid != dzp->z_gid && !groupmember(vap->va_gid, cr) && secpolicy_vnode_create_gid(cr) != 0) acl_ids->z_fgid = 0; @@ -1706,7 +1721,7 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, char *domain; uint32_t rid; - acl_ids->z_fgid = parentgid; + acl_ids->z_fgid = dzp->z_gid; gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid, cr, ZFS_GROUP); @@ -1746,15 +1761,15 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, } if (acl_ids->z_aclp == NULL) { + mutex_enter(&dzp->z_acl_lock); mutex_enter(&dzp->z_lock); if (!(flag & IS_ROOT_NODE) && (ZTOV(dzp)->v_type == VDIR && (dzp->z_pflags & ZFS_INHERIT_ACE)) && !(dzp->z_pflags & ZFS_XATTR)) { - mutex_enter(&dzp->z_acl_lock); - VERIFY(0 == zfs_acl_node_read(dzp, &paclp, B_FALSE)); + VERIFY(0 == zfs_acl_node_read(dzp, B_TRUE, + &paclp, B_FALSE)); acl_ids->z_aclp = zfs_acl_inherit(zfsvfs, vap->va_type, paclp, acl_ids->z_mode, &need_chmod); - mutex_exit(&dzp->z_acl_lock); inherited = B_TRUE; } else { acl_ids->z_aclp = @@ -1762,6 +1777,7 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; } mutex_exit(&dzp->z_lock); + mutex_exit(&dzp->z_acl_lock); if (need_chmod) { acl_ids->z_aclp->z_hints |= (vap->va_type == VDIR) ? ZFS_ACL_AUTO_INHERIT : 0; @@ -1824,7 +1840,7 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) mutex_enter(&zp->z_acl_lock); - error = zfs_acl_node_read(zp, &aclp, B_FALSE); + error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE); if (error != 0) { mutex_exit(&zp->z_acl_lock); return (error); @@ -1970,6 +1986,7 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) zfs_acl_t *aclp; zfs_fuid_info_t *fuidp = NULL; boolean_t fuid_dirtied; + uint64_t acl_obj; if (mask == 0) return (ENOSYS); @@ -1994,8 +2011,8 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) (zp->z_pflags & V4_ACL_WIDE_FLAGS); } top: - mutex_enter(&zp->z_lock); mutex_enter(&zp->z_acl_lock); + mutex_enter(&zp->z_lock); tx = dmu_tx_create(zfsvfs->z_os); @@ -2010,14 +2027,15 @@ top: * upgrading then take out necessary DMU holds */ - if (ZFS_EXTERNAL_ACL(zp)) { - if (zfsvfs->z_version <= ZPL_VERSION_SA && - ZNODE_ACL_VERSION(zp) <= ZFS_ACL_VERSION_INITIAL) { - dmu_tx_hold_free(tx, ZFS_EXTERNAL_ACL(zp), 0, + if ((acl_obj = zfs_external_acl(zp)) != 0) { + if (zfsvfs->z_version >= ZPL_VERSION_FUID && + zfs_znode_acl_version(zp) <= ZFS_ACL_VERSION_INITIAL) { + dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + aclp->z_acl_bytes); } else { - dmu_tx_hold_write(tx, ZFS_EXTERNAL_ACL(zp), - 0, aclp->z_acl_bytes); + dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); } } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); @@ -2041,6 +2059,7 @@ top: error = zfs_aclset_common(zp, aclp, cr, tx); ASSERT(error == 0); + ASSERT(zp->z_acl_cached == NULL); zp->z_acl_cached = aclp; if (fuid_dirtied) @@ -2052,8 +2071,8 @@ top: zfs_fuid_info_free(fuidp); dmu_tx_commit(tx); done: - mutex_exit(&zp->z_acl_lock); mutex_exit(&zp->z_lock); + mutex_exit(&zp->z_acl_lock); return (error); } @@ -2137,11 +2156,14 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, uint32_t deny_mask = 0; zfs_ace_hdr_t *acep = NULL; boolean_t checkit; - uint64_t gowner; + uid_t gowner; + uid_t fowner; + + zfs_fuid_map_ids(zp, cr, &fowner, &gowner); mutex_enter(&zp->z_acl_lock); - error = zfs_acl_node_read(zp, &aclp, B_FALSE); + error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE); if (error != 0) { mutex_exit(&zp->z_acl_lock); return (error); @@ -2149,12 +2171,6 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, ASSERT(zp->z_acl_cached); - if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GID(zfsvfs), - &gowner, sizeof (gowner))) != 0) { - mutex_exit(&zp->z_acl_lock); - return (error); - } - while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, &iflags, &type)) { uint32_t mask_matched; @@ -2176,7 +2192,7 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, switch (entry_type) { case ACE_OWNER: - if (uid == zp->z_uid) + if (uid == fowner) checkit = B_TRUE; break; case OWNING_GROUP: @@ -2254,8 +2270,10 @@ zfs_has_access(znode_t *zp, cred_t *cr) uint32_t have = ACE_ALL_PERMS; if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) { - return (secpolicy_vnode_any_access(cr, ZTOV(zp), - zp->z_uid) == 0); + uid_t owner; + + owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER); + return (secpolicy_vnode_any_access(cr, ZTOV(zp), owner) == 0); } return (B_TRUE); } @@ -2332,7 +2350,7 @@ zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr) return (0); } - if (IS_EPHEMERAL(zdp->z_uid) != 0 || IS_EPHEMERAL(zdp->z_gid) != 0) { + if (FUID_INDEX(zdp->z_uid) != 0 || FUID_INDEX(zdp->z_gid) != 0) { mutex_exit(&zdp->z_acl_lock); goto slow; } @@ -2389,6 +2407,7 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) znode_t *xzp; znode_t *check_zp = zp; mode_t needed_bits; + uid_t owner; is_attr = ((zp->z_pflags & ZFS_XATTR) && (ZTOV(zp)->v_type == VDIR)); @@ -2425,6 +2444,7 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) } } + owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER); /* * Map the bits required to the standard vnode flags VREAD|VWRITE|VEXEC * in needed_bits. Map the bits mapped by working_mode (currently @@ -2436,7 +2456,7 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) working_mode = mode; if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) && - zp->z_uid == crgetuid(cr)) + owner == crgetuid(cr)) working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| @@ -2452,7 +2472,7 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) &check_privs, skipaclchk, cr)) == 0) { if (is_attr) VN_RELE(ZTOV(xzp)); - return (secpolicy_vnode_access2(cr, ZTOV(zp), zp->z_uid, + return (secpolicy_vnode_access2(cr, ZTOV(zp), owner, needed_bits, needed_bits)); } @@ -2478,7 +2498,7 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) ASSERT(working_mode != 0); if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) && - zp->z_uid == crgetuid(cr))) + owner == crgetuid(cr))) working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| @@ -2490,20 +2510,20 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) if (working_mode & ACE_EXECUTE) checkmode |= VEXEC; - error = secpolicy_vnode_access2(cr, ZTOV(check_zp), zp->z_uid, + error = secpolicy_vnode_access2(cr, ZTOV(check_zp), owner, needed_bits & ~checkmode, needed_bits); if (error == 0 && (working_mode & ACE_WRITE_OWNER)) - error = secpolicy_vnode_chown(cr, zp->z_uid); + error = secpolicy_vnode_chown(cr, owner); if (error == 0 && (working_mode & ACE_WRITE_ACL)) - error = secpolicy_vnode_setdac(cr, zp->z_uid); + error = secpolicy_vnode_setdac(cr, owner); if (error == 0 && (working_mode & (ACE_DELETE|ACE_DELETE_CHILD))) error = secpolicy_vnode_remove(cr); if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) { - error = secpolicy_vnode_chown(cr, zp->z_uid); + error = secpolicy_vnode_chown(cr, owner); } if (error == 0) { /* @@ -2515,7 +2535,7 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) } } } else if (error == 0) { - error = secpolicy_vnode_access2(cr, ZTOV(zp), zp->z_uid, + error = secpolicy_vnode_access2(cr, ZTOV(zp), owner, needed_bits, needed_bits); } @@ -2552,9 +2572,12 @@ zfs_delete_final_check(znode_t *zp, znode_t *dzp, mode_t available_perms, cred_t *cr) { int error; + uid_t downer; + + downer = zfs_fuid_map_id(dzp->z_zfsvfs, dzp->z_uid, cr, ZFS_OWNER); error = secpolicy_vnode_access2(cr, ZTOV(dzp), - dzp->z_uid, available_perms, VWRITE|VEXEC); + downer, available_perms, VWRITE|VEXEC); if (error == 0) error = zfs_sticky_remove_access(dzp, zp, cr); diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c index 362de4dd1..815f8895e 100644 --- a/module/zfs/zfs_ctldir.c +++ b/module/zfs/zfs_ctldir.c @@ -590,7 +590,7 @@ zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm) ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath)); (void) strcat(newpath, nm); refstr_rele(pathref); - vfs_setmntpoint(vfsp, newpath); + vfs_setmntpoint(vfsp, newpath, 0); pathref = vfs_getresource(vfsp); (void) strncpy(newpath, refstr_value(pathref), sizeof (newpath)); @@ -599,7 +599,7 @@ zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm) ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath)); (void) strcat(newpath, nm); refstr_rele(pathref); - vfs_setresource(vfsp, newpath); + vfs_setresource(vfsp, newpath, 0); vfs_unlock(vfsp); } @@ -749,7 +749,8 @@ zfsctl_snapdir_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, return (err); if (err == 0) { - err = dmu_objset_snapshot(name, dirname, NULL, B_FALSE); + err = dmu_objset_snapshot(name, dirname, NULL, NULL, + B_FALSE, B_FALSE, -1); if (err) return (err); err = lookupnameat(dirname, seg, follow, NULL, vpp, dvp); diff --git a/module/zfs/zfs_dir.c b/module/zfs/zfs_dir.c index 6d6666822..b06d29ab3 100644 --- a/module/zfs/zfs_dir.c +++ b/module/zfs/zfs_dir.c @@ -630,7 +630,7 @@ zfs_rmnode(znode_t *zp) ASSERT(error == 0); } - acl_obj = ZFS_EXTERNAL_ACL(zp); + acl_obj = zfs_external_acl(zp); /* * Set up the final transaction. @@ -1067,6 +1067,9 @@ int zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) { uid_t uid; + uid_t downer; + uid_t fowner; + zfsvfs_t *zfsvfs = zdp->z_zfsvfs; if (zdp->z_zfsvfs->z_replay) return (0); @@ -1074,7 +1077,10 @@ zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) if ((zdp->z_mode & S_ISVTX) == 0) return (0); - if ((uid = crgetuid(cr)) == zdp->z_uid || uid == zp->z_uid || + downer = zfs_fuid_map_id(zfsvfs, zdp->z_uid, cr, ZFS_OWNER); + fowner = zfs_fuid_map_id(zfsvfs, zp->z_uid, cr, ZFS_OWNER); + + if ((uid = crgetuid(cr)) == downer || uid == fowner || (ZTOV(zp)->v_type == VREG && zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0)) return (0); diff --git a/module/zfs/zfs_fuid.c b/module/zfs/zfs_fuid.c index 8c0424e84..a853f4d73 100644 --- a/module/zfs/zfs_fuid.c +++ b/module/zfs/zfs_fuid.c @@ -388,26 +388,8 @@ zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx) void zfs_fuid_map_ids(znode_t *zp, cred_t *cr, uid_t *uidp, uid_t *gidp) { - uint64_t fuid, fgid; - sa_bulk_attr_t bulk[2]; - int count = 0; - - if (IS_EPHEMERAL(zp->z_uid) || IS_EPHEMERAL(zp->z_gid)) { - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zp->z_zfsvfs), - NULL, &fuid, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zp->z_zfsvfs), - NULL, &fgid, 8); - VERIFY(0 == sa_bulk_lookup(zp->z_sa_hdl, bulk, count)); - } - if (IS_EPHEMERAL(zp->z_uid)) - *uidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER); - else - *uidp = zp->z_uid; - if (IS_EPHEMERAL(zp->z_gid)) - *gidp = zfs_fuid_map_id(zp->z_zfsvfs, - zp->z_gid, cr, ZFS_GROUP); - else - *gidp = zp->z_gid; + *uidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER); + *gidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_gid, cr, ZFS_GROUP); } uid_t diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index de5fb1e4c..1b63c9bf4 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -60,6 +60,7 @@ #include <sys/fs/zfs.h> #include <sys/zfs_ctldir.h> #include <sys/zfs_dir.h> +#include <sys/zfs_onexit.h> #include <sys/zvol.h> #include <sys/dsl_scan.h> #include <sharefs/share.h> @@ -87,12 +88,18 @@ typedef enum { DATASET_NAME } zfs_ioc_namecheck_t; +typedef enum { + POOL_CHECK_NONE = 1 << 0, + POOL_CHECK_SUSPENDED = 1 << 1, + POOL_CHECK_READONLY = 1 << 2 +} zfs_ioc_poolcheck_t; + typedef struct zfs_ioc_vec { zfs_ioc_func_t *zvec_func; zfs_secpolicy_func_t *zvec_secpolicy; zfs_ioc_namecheck_t zvec_namecheck; boolean_t zvec_his_log; - boolean_t zvec_pool_check; + zfs_ioc_poolcheck_t zvec_pool_check; } zfs_ioc_vec_t; /* This array is indexed by zfs_userquota_prop_t */ @@ -281,9 +288,8 @@ zfs_secpolicy_read(zfs_cmd_t *zc, cred_t *cr) } static int -zfs_dozonecheck(const char *dataset, cred_t *cr) +zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr) { - uint64_t zoned; int writable = 1; /* @@ -294,9 +300,6 @@ zfs_dozonecheck(const char *dataset, cred_t *cr) !zone_dataset_visible(dataset, &writable)) return (ENOENT); - if (dsl_prop_get_integer(dataset, "zoned", &zoned, NULL)) - return (ENOENT); - if (INGLOBALZONE(curproc)) { /* * If the fs is zoned, only root can access it from the @@ -318,6 +321,32 @@ zfs_dozonecheck(const char *dataset, cred_t *cr) return (0); } +static int +zfs_dozonecheck(const char *dataset, cred_t *cr) +{ + uint64_t zoned; + + if (dsl_prop_get_integer(dataset, "zoned", &zoned, NULL)) + return (ENOENT); + + return (zfs_dozonecheck_impl(dataset, zoned, cr)); +} + +static int +zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr) +{ + uint64_t zoned; + + rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); + if (dsl_prop_get_ds(ds, "zoned", 8, 1, &zoned, NULL)) { + rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); + return (ENOENT); + } + rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); + + return (zfs_dozonecheck_impl(dataset, zoned, cr)); +} + int zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) { @@ -332,6 +361,21 @@ zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) return (error); } +int +zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds, + const char *perm, cred_t *cr) +{ + int error; + + error = zfs_dozonecheck_ds(name, ds, cr); + if (error == 0) { + error = secpolicy_zfs(cr); + if (error) + error = dsl_deleg_access_impl(ds, perm, cr); + } + return (error); +} + /* * Policy for setting the security label property. * @@ -507,8 +551,38 @@ zfs_secpolicy_rollback(zfs_cmd_t *zc, cred_t *cr) int zfs_secpolicy_send(zfs_cmd_t *zc, cred_t *cr) { - return (zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_SEND, cr)); + spa_t *spa; + dsl_pool_t *dp; + dsl_dataset_t *ds; + char *cp; + int error; + + /* + * Generate the current snapshot name from the given objsetid, then + * use that name for the secpolicy/zone checks. + */ + cp = strchr(zc->zc_name, '@'); + if (cp == NULL) + return (EINVAL); + error = spa_open(zc->zc_name, &spa, FTAG); + if (error) + return (error); + + dp = spa_get_dsl(spa); + rw_enter(&dp->dp_config_rwlock, RW_READER); + error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds); + rw_exit(&dp->dp_config_rwlock); + spa_close(spa, FTAG); + if (error) + return (error); + + dsl_dataset_name(ds, zc->zc_name); + + error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds, + ZFS_DELEG_PERM_SEND, cr); + dsl_dataset_rele(ds, FTAG); + + return (error); } static int @@ -786,6 +860,22 @@ zfs_secpolicy_config(zfs_cmd_t *zc, cred_t *cr) } /* + * Policy for object to name lookups. + */ +/* ARGSUSED */ +static int +zfs_secpolicy_diff(zfs_cmd_t *zc, cred_t *cr) +{ + int error; + + if ((error = secpolicy_sys_config(cr, B_FALSE)) == 0) + return (0); + + error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr); + return (error); +} + +/* * Policy for fault injection. Requires all privileges. */ /* ARGSUSED */ @@ -876,6 +966,33 @@ zfs_secpolicy_release(zfs_cmd_t *zc, cred_t *cr) } /* + * Policy for allowing temporary snapshots to be taken or released + */ +static int +zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, cred_t *cr) +{ + /* + * A temporary snapshot is the same as a snapshot, + * hold, destroy and release all rolled into one. + * Delegated diff alone is sufficient that we allow this. + */ + int error; + + if ((error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_DIFF, cr)) == 0) + return (0); + + error = zfs_secpolicy_snapshot(zc, cr); + if (!error) + error = zfs_secpolicy_hold(zc, cr); + if (!error) + error = zfs_secpolicy_release(zc, cr); + if (!error) + error = zfs_secpolicy_destroy(zc, cr); + return (error); +} + +/* * Returns the nvlist as specified by the user in the zfs_cmd_t. */ static int @@ -1001,14 +1118,15 @@ getzfsvfs(const char *dsname, zfsvfs_t **zfvp) * case its z_vfs will be NULL, and it will be opened as the owner. */ static int -zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp) +zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer) { int error = 0; if (getzfsvfs(name, zfvp) != 0) error = zfsvfs_create(name, zfvp); if (error == 0) { - rrw_enter(&(*zfvp)->z_teardown_lock, RW_READER, tag); + rrw_enter(&(*zfvp)->z_teardown_lock, (writer) ? RW_WRITER : + RW_READER, tag); if ((*zfvp)->z_unmounted) { /* * XXX we could probably try again, since the unmounting @@ -1137,13 +1255,15 @@ zfs_ioc_pool_import(zfs_cmd_t *zc) if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || guid != zc->zc_guid) error = EINVAL; - else if (zc->zc_cookie) - error = spa_import_verbatim(zc->zc_name, config, props); else - error = spa_import(zc->zc_name, config, props); + error = spa_import(zc->zc_name, config, props, zc->zc_cookie); - if (zc->zc_nvlist_dst != 0) - (void) put_nvlist(zc, config); + if (zc->zc_nvlist_dst != 0) { + int err; + + if ((err = put_nvlist(zc, config)) != 0) + error = err; + } nvlist_free(config); @@ -1366,6 +1486,35 @@ zfs_ioc_obj_to_path(zfs_cmd_t *zc) return (error); } +/* + * inputs: + * zc_name name of filesystem + * zc_obj object to find + * + * outputs: + * zc_stat stats on object + * zc_value path to object + */ +static int +zfs_ioc_obj_to_stats(zfs_cmd_t *zc) +{ + objset_t *os; + int error; + + /* XXX reading from objset not owned */ + if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0) + return (error); + if (dmu_objset_type(os) != DMU_OST_ZFS) { + dmu_objset_rele(os, FTAG); + return (EINVAL); + } + error = zfs_obj_to_stats(os, zc->zc_obj, &zc->zc_stat, zc->zc_value, + sizeof (zc->zc_value)); + dmu_objset_rele(os, FTAG); + + return (error); +} + static int zfs_ioc_vdev_add(zfs_cmd_t *zc) { @@ -1577,26 +1726,12 @@ zfs_ioc_vdev_setfru(zfs_cmd_t *zc) return (error); } -/* - * inputs: - * zc_name name of filesystem - * zc_nvlist_dst_size size of buffer for property nvlist - * - * outputs: - * zc_objset_stats stats - * zc_nvlist_dst property nvlist - * zc_nvlist_dst_size size of property nvlist - */ static int -zfs_ioc_objset_stats(zfs_cmd_t *zc) +zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) { - objset_t *os = NULL; - int error; + int error = 0; nvlist_t *nv; - if (error = dmu_objset_hold(zc->zc_name, FTAG, &os)) - return (error); - dmu_objset_fast_stat(os, &zc->zc_objset_stats); if (zc->zc_nvlist_dst != 0 && @@ -1617,7 +1752,32 @@ zfs_ioc_objset_stats(zfs_cmd_t *zc) nvlist_free(nv); } + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_nvlist_dst_size size of buffer for property nvlist + * + * outputs: + * zc_objset_stats stats + * zc_nvlist_dst property nvlist + * zc_nvlist_dst_size size of property nvlist + */ +static int +zfs_ioc_objset_stats(zfs_cmd_t *zc) +{ + objset_t *os = NULL; + int error; + + if (error = dmu_objset_hold(zc->zc_name, FTAG, &os)) + return (error); + + error = zfs_ioc_objset_stats_impl(zc, os); + dmu_objset_rele(os, FTAG); + return (error); } @@ -1850,19 +2010,43 @@ top: error = dmu_snapshot_list_next(os, sizeof (zc->zc_name) - strlen(zc->zc_name), - zc->zc_name + strlen(zc->zc_name), NULL, &zc->zc_cookie, NULL); - dmu_objset_rele(os, FTAG); + zc->zc_name + strlen(zc->zc_name), &zc->zc_obj, &zc->zc_cookie, + NULL); + if (error == 0) { - error = zfs_ioc_objset_stats(zc); /* fill in the stats */ - if (error == ENOENT) { - /* We lost a race with destroy, get the next one. */ - *strchr(zc->zc_name, '@') = '\0'; - goto top; + dsl_dataset_t *ds; + dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool; + + /* + * Since we probably don't have a hold on this snapshot, + * it's possible that the objsetid could have been destroyed + * and reused for a new objset. It's OK if this happens during + * a zfs send operation, since the new createtxg will be + * beyond the range we're interested in. + */ + rw_enter(&dp->dp_config_rwlock, RW_READER); + error = dsl_dataset_hold_obj(dp, zc->zc_obj, FTAG, &ds); + rw_exit(&dp->dp_config_rwlock); + if (error) { + if (error == ENOENT) { + /* Racing with destroy, get the next one. */ + *strchr(zc->zc_name, '@') = '\0'; + dmu_objset_rele(os, FTAG); + goto top; + } + } else { + objset_t *ossnap; + + error = dmu_objset_from_ds(ds, &ossnap); + if (error == 0) + error = zfs_ioc_objset_stats_impl(zc, ossnap); + dsl_dataset_rele(ds, FTAG); } } else if (error == ENOENT) { error = ESRCH; } + dmu_objset_rele(os, FTAG); /* if we failed, undo the @ that we tacked on to zc_name */ if (error) *strchr(zc->zc_name, '@') = '\0'; @@ -1905,7 +2089,7 @@ zfs_prop_set_userquota(const char *dsname, nvpair_t *pair) rid = valary[1]; quota = valary[2]; - err = zfsvfs_hold(dsname, FTAG, &zfsvfs); + err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_FALSE); if (err == 0) { err = zfs_set_userquota(zfsvfs, type, domain, rid, quota); zfsvfs_rele(zfsvfs, FTAG); @@ -1970,7 +2154,7 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source, { zfsvfs_t *zfsvfs; - if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs)) != 0) + if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_TRUE)) != 0) break; err = zfs_set_version(zfsvfs, intval); @@ -2872,8 +3056,8 @@ zfs_ioc_snapshot(zfs_cmd_t *zc) goto out; } - error = dmu_objset_snapshot(zc->zc_name, zc->zc_value, - nvprops, recursive); + error = dmu_objset_snapshot(zc->zc_name, zc->zc_value, NULL, + nvprops, recursive, B_FALSE, -1); out: nvlist_free(nvprops); @@ -3342,11 +3526,14 @@ static boolean_t zfs_ioc_recv_inject_err; * zc_cookie file descriptor to recv from * zc_begin_record the BEGIN record of the stream (not byteswapped) * zc_guid force flag + * zc_cleanup_fd cleanup-on-exit file descriptor + * zc_action_handle handle for this guid/ds mapping (or zero on first call) * * outputs: * zc_cookie number of bytes read * zc_nvlist_dst{_size} error for each unapplied received property * zc_obj zprop_errflags_t + * zc_action_handle handle for this guid/ds mapping */ static int zfs_ioc_recv(zfs_cmd_t *zc) @@ -3475,7 +3662,8 @@ zfs_ioc_recv(zfs_cmd_t *zc) } off = fp->f_offset; - error = dmu_recv_stream(&drc, fp->f_vnode, &off); + error = dmu_recv_stream(&drc, fp->f_vnode, &off, zc->zc_cleanup_fd, + &zc->zc_action_handle); if (error == 0) { zfsvfs_t *zfsvfs = NULL; @@ -3567,9 +3755,10 @@ out: /* * inputs: * zc_name name of snapshot to send - * zc_value short name of incremental fromsnap (may be empty) * zc_cookie file descriptor to send stream to - * zc_obj fromorigin flag (mutually exclusive with zc_value) + * zc_obj fromorigin flag (mutually exclusive with zc_fromobj) + * zc_sendobj objsetid of snapshot to send + * zc_fromobj objsetid of incremental fromsnap (may be zero) * * outputs: none */ @@ -3581,34 +3770,55 @@ zfs_ioc_send(zfs_cmd_t *zc) file_t *fp; int error; offset_t off; + dsl_dataset_t *ds; + dsl_dataset_t *dsfrom = NULL; + spa_t *spa; + dsl_pool_t *dp; - error = dmu_objset_hold(zc->zc_name, FTAG, &tosnap); + error = spa_open(zc->zc_name, &spa, FTAG); if (error) return (error); - if (zc->zc_value[0] != '\0') { - char *buf; - char *cp; - - buf = kmem_alloc(MAXPATHLEN, KM_SLEEP); - (void) strncpy(buf, zc->zc_name, MAXPATHLEN); - cp = strchr(buf, '@'); - if (cp) - *(cp+1) = 0; - (void) strncat(buf, zc->zc_value, MAXPATHLEN); - error = dmu_objset_hold(buf, FTAG, &fromsnap); - kmem_free(buf, MAXPATHLEN); + dp = spa_get_dsl(spa); + rw_enter(&dp->dp_config_rwlock, RW_READER); + error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds); + rw_exit(&dp->dp_config_rwlock); + if (error) { + spa_close(spa, FTAG); + return (error); + } + + error = dmu_objset_from_ds(ds, &tosnap); + if (error) { + dsl_dataset_rele(ds, FTAG); + spa_close(spa, FTAG); + return (error); + } + + if (zc->zc_fromobj != 0) { + rw_enter(&dp->dp_config_rwlock, RW_READER); + error = dsl_dataset_hold_obj(dp, zc->zc_fromobj, FTAG, &dsfrom); + rw_exit(&dp->dp_config_rwlock); + spa_close(spa, FTAG); + if (error) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + error = dmu_objset_from_ds(dsfrom, &fromsnap); if (error) { - dmu_objset_rele(tosnap, FTAG); + dsl_dataset_rele(dsfrom, FTAG); + dsl_dataset_rele(ds, FTAG); return (error); } + } else { + spa_close(spa, FTAG); } fp = getf(zc->zc_cookie); if (fp == NULL) { - dmu_objset_rele(tosnap, FTAG); - if (fromsnap) - dmu_objset_rele(fromsnap, FTAG); + dsl_dataset_rele(ds, FTAG); + if (dsfrom) + dsl_dataset_rele(dsfrom, FTAG); return (EBADF); } @@ -3618,9 +3828,9 @@ zfs_ioc_send(zfs_cmd_t *zc) if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) fp->f_offset = off; releasef(zc->zc_cookie); - if (fromsnap) - dmu_objset_rele(fromsnap, FTAG); - dmu_objset_rele(tosnap, FTAG); + if (dsfrom) + dsl_dataset_rele(dsfrom, FTAG); + dsl_dataset_rele(ds, FTAG); return (error); } @@ -3717,7 +3927,10 @@ zfs_ioc_clear(zfs_cmd_t *zc) error = spa_open_rewind(zc->zc_name, &spa, FTAG, policy, &config); if (config != NULL) { - (void) put_nvlist(zc, config); + int err; + + if ((err = put_nvlist(zc, config)) != 0) + error = err; nvlist_free(config); } nvlist_free(policy); @@ -3801,7 +4014,7 @@ zfs_ioc_userspace_one(zfs_cmd_t *zc) if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (EINVAL); - error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs); + error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); if (error) return (error); @@ -3832,7 +4045,7 @@ zfs_ioc_userspace_many(zfs_cmd_t *zc) if (bufsize <= 0) return (ENOMEM); - int error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs); + int error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); if (error) return (error); @@ -4032,6 +4245,113 @@ ace_t full_access[] = { }; /* + * inputs: + * zc_name name of containing filesystem + * zc_obj object # beyond which we want next in-use object # + * + * outputs: + * zc_obj next in-use object # + */ +static int +zfs_ioc_next_obj(zfs_cmd_t *zc) +{ + objset_t *os = NULL; + int error; + + error = dmu_objset_hold(zc->zc_name, FTAG, &os); + if (error) + return (error); + + error = dmu_object_next(os, &zc->zc_obj, B_FALSE, + os->os_dsl_dataset->ds_phys->ds_prev_snap_txg); + + dmu_objset_rele(os, FTAG); + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_value prefix name for snapshot + * zc_cleanup_fd cleanup-on-exit file descriptor for calling process + * + * outputs: + */ +static int +zfs_ioc_tmp_snapshot(zfs_cmd_t *zc) +{ + char *snap_name; + int error; + + snap_name = kmem_asprintf("%s-%016llx", zc->zc_value, + (u_longlong_t)ddi_get_lbolt64()); + + if (strlen(snap_name) >= MAXNAMELEN) { + strfree(snap_name); + return (E2BIG); + } + + error = dmu_objset_snapshot(zc->zc_name, snap_name, snap_name, + NULL, B_FALSE, B_TRUE, zc->zc_cleanup_fd); + if (error != 0) { + strfree(snap_name); + return (error); + } + + (void) strcpy(zc->zc_value, snap_name); + strfree(snap_name); + return (0); +} + +/* + * inputs: + * zc_name name of "to" snapshot + * zc_value name of "from" snapshot + * zc_cookie file descriptor to write diff data on + * + * outputs: + * dmu_diff_record_t's to the file descriptor + */ +static int +zfs_ioc_diff(zfs_cmd_t *zc) +{ + objset_t *fromsnap; + objset_t *tosnap; + file_t *fp; + offset_t off; + int error; + + error = dmu_objset_hold(zc->zc_name, FTAG, &tosnap); + if (error) + return (error); + + error = dmu_objset_hold(zc->zc_value, FTAG, &fromsnap); + if (error) { + dmu_objset_rele(tosnap, FTAG); + return (error); + } + + fp = getf(zc->zc_cookie); + if (fp == NULL) { + dmu_objset_rele(fromsnap, FTAG); + dmu_objset_rele(tosnap, FTAG); + return (EBADF); + } + + off = fp->f_offset; + + error = dmu_diff(tosnap, fromsnap, fp->f_vnode, &off); + + if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) + fp->f_offset = off; + releasef(zc->zc_cookie); + + dmu_objset_rele(fromsnap, FTAG); + dmu_objset_rele(tosnap, FTAG); + return (error); +} + +/* * Remove all ACL files in shares dir */ static int @@ -4182,11 +4502,14 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc) /* * inputs: - * zc_name name of filesystem - * zc_value short name of snap - * zc_string user-supplied tag for this reference - * zc_cookie recursive flag - * zc_temphold set if hold is temporary + * zc_name name of filesystem + * zc_value short name of snap + * zc_string user-supplied tag for this hold + * zc_cookie recursive flag + * zc_temphold set if hold is temporary + * zc_cleanup_fd cleanup-on-exit file descriptor for calling process + * zc_sendobj if non-zero, the objid for zc_name@zc_value + * zc_createtxg if zc_sendobj is non-zero, snap must have zc_createtxg * * outputs: none */ @@ -4194,22 +4517,76 @@ static int zfs_ioc_hold(zfs_cmd_t *zc) { boolean_t recursive = zc->zc_cookie; + spa_t *spa; + dsl_pool_t *dp; + dsl_dataset_t *ds; + int error; + minor_t minor = 0; if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) return (EINVAL); - return (dsl_dataset_user_hold(zc->zc_name, zc->zc_value, - zc->zc_string, recursive, zc->zc_temphold)); + if (zc->zc_sendobj == 0) { + return (dsl_dataset_user_hold(zc->zc_name, zc->zc_value, + zc->zc_string, recursive, zc->zc_temphold, + zc->zc_cleanup_fd)); + } + + if (recursive) + return (EINVAL); + + error = spa_open(zc->zc_name, &spa, FTAG); + if (error) + return (error); + + dp = spa_get_dsl(spa); + rw_enter(&dp->dp_config_rwlock, RW_READER); + error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds); + rw_exit(&dp->dp_config_rwlock); + spa_close(spa, FTAG); + if (error) + return (error); + + /* + * Until we have a hold on this snapshot, it's possible that + * zc_sendobj could've been destroyed and reused as part + * of a later txg. Make sure we're looking at the right object. + */ + if (zc->zc_createtxg != ds->ds_phys->ds_creation_txg) { + dsl_dataset_rele(ds, FTAG); + return (ENOENT); + } + + if (zc->zc_cleanup_fd != -1 && zc->zc_temphold) { + error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor); + if (error) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + } + + error = dsl_dataset_user_hold_for_send(ds, zc->zc_string, + zc->zc_temphold); + if (minor != 0) { + if (error == 0) { + dsl_register_onexit_hold_cleanup(ds, zc->zc_string, + minor); + } + zfs_onexit_fd_rele(zc->zc_cleanup_fd); + } + dsl_dataset_rele(ds, FTAG); + + return (error); } /* * inputs: - * zc_name name of dataset from which we're releasing a user reference + * zc_name name of dataset from which we're releasing a user hold * zc_value short name of snap - * zc_string user-supplied tag for this reference + * zc_string user-supplied tag for this hold * zc_cookie recursive flag * - * outputs: none + * outputs: none */ static int zfs_ioc_release(zfs_cmd_t *zc) @@ -4251,132 +4628,264 @@ zfs_ioc_get_holds(zfs_cmd_t *zc) */ static zfs_ioc_vec_t zfs_ioc_vec[] = { { zfs_ioc_pool_create, zfs_secpolicy_config, POOL_NAME, B_FALSE, - B_FALSE }, + POOL_CHECK_NONE }, { zfs_ioc_pool_destroy, zfs_secpolicy_config, POOL_NAME, B_FALSE, - B_FALSE }, + POOL_CHECK_NONE }, { zfs_ioc_pool_import, zfs_secpolicy_config, POOL_NAME, B_TRUE, - B_FALSE }, + POOL_CHECK_NONE }, { zfs_ioc_pool_export, zfs_secpolicy_config, POOL_NAME, B_FALSE, - B_FALSE }, + POOL_CHECK_NONE }, { zfs_ioc_pool_configs, zfs_secpolicy_none, NO_NAME, B_FALSE, - B_FALSE }, + POOL_CHECK_NONE }, { zfs_ioc_pool_stats, zfs_secpolicy_read, POOL_NAME, B_FALSE, - B_FALSE }, + POOL_CHECK_NONE }, { zfs_ioc_pool_tryimport, zfs_secpolicy_config, NO_NAME, B_FALSE, - B_FALSE }, + POOL_CHECK_NONE }, { zfs_ioc_pool_scan, zfs_secpolicy_config, POOL_NAME, B_TRUE, - B_TRUE }, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, { zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE, - B_FALSE }, + POOL_CHECK_READONLY }, { zfs_ioc_pool_upgrade, zfs_secpolicy_config, POOL_NAME, B_TRUE, - B_TRUE }, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, { zfs_ioc_pool_get_history, zfs_secpolicy_config, POOL_NAME, B_FALSE, - B_FALSE }, + POOL_CHECK_NONE }, { zfs_ioc_vdev_add, zfs_secpolicy_config, POOL_NAME, B_TRUE, - B_TRUE }, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, { zfs_ioc_vdev_remove, zfs_secpolicy_config, POOL_NAME, B_TRUE, - B_TRUE }, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, { zfs_ioc_vdev_set_state, zfs_secpolicy_config, POOL_NAME, B_TRUE, - B_FALSE }, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, { zfs_ioc_vdev_attach, zfs_secpolicy_config, POOL_NAME, B_TRUE, - B_TRUE }, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, { zfs_ioc_vdev_detach, zfs_secpolicy_config, POOL_NAME, B_TRUE, - B_TRUE }, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, { zfs_ioc_vdev_setpath, zfs_secpolicy_config, POOL_NAME, B_FALSE, - B_TRUE }, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, { zfs_ioc_vdev_setfru, zfs_secpolicy_config, POOL_NAME, B_FALSE, - B_TRUE }, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, { zfs_ioc_objset_stats, zfs_secpolicy_read, DATASET_NAME, B_FALSE, - B_TRUE }, + POOL_CHECK_SUSPENDED }, { zfs_ioc_objset_zplprops, zfs_secpolicy_read, DATASET_NAME, B_FALSE, - B_FALSE }, + POOL_CHECK_NONE }, { zfs_ioc_dataset_list_next, zfs_secpolicy_read, DATASET_NAME, B_FALSE, - B_TRUE }, + POOL_CHECK_SUSPENDED }, { zfs_ioc_snapshot_list_next, zfs_secpolicy_read, DATASET_NAME, B_FALSE, - B_TRUE }, - { zfs_ioc_set_prop, zfs_secpolicy_none, DATASET_NAME, B_TRUE, B_TRUE }, - { zfs_ioc_create, zfs_secpolicy_create, DATASET_NAME, B_TRUE, B_TRUE }, + POOL_CHECK_SUSPENDED }, + { zfs_ioc_set_prop, zfs_secpolicy_none, DATASET_NAME, B_TRUE, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, + { zfs_ioc_create, zfs_secpolicy_create, DATASET_NAME, B_TRUE, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, { zfs_ioc_destroy, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE, - B_TRUE}, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, { zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, B_TRUE, - B_TRUE }, - { zfs_ioc_rename, zfs_secpolicy_rename, DATASET_NAME, B_TRUE, B_TRUE }, - { zfs_ioc_recv, zfs_secpolicy_receive, DATASET_NAME, B_TRUE, B_TRUE }, - { zfs_ioc_send, zfs_secpolicy_send, DATASET_NAME, B_TRUE, B_FALSE }, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, + { zfs_ioc_rename, zfs_secpolicy_rename, DATASET_NAME, B_TRUE, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, + { zfs_ioc_recv, zfs_secpolicy_receive, DATASET_NAME, B_TRUE, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, + { zfs_ioc_send, zfs_secpolicy_send, DATASET_NAME, B_TRUE, + POOL_CHECK_NONE }, { zfs_ioc_inject_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE, - B_FALSE }, + POOL_CHECK_NONE }, { zfs_ioc_clear_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE, - B_FALSE }, + POOL_CHECK_NONE }, { zfs_ioc_inject_list_next, zfs_secpolicy_inject, NO_NAME, B_FALSE, - B_FALSE }, + POOL_CHECK_NONE }, { zfs_ioc_error_log, zfs_secpolicy_inject, POOL_NAME, B_FALSE, - B_FALSE }, - { zfs_ioc_clear, zfs_secpolicy_config, POOL_NAME, B_TRUE, B_FALSE }, + POOL_CHECK_NONE }, + { zfs_ioc_clear, zfs_secpolicy_config, POOL_NAME, B_TRUE, + POOL_CHECK_NONE }, { zfs_ioc_promote, zfs_secpolicy_promote, DATASET_NAME, B_TRUE, - B_TRUE }, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, { zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, DATASET_NAME, - B_TRUE, B_TRUE }, + B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, { zfs_ioc_snapshot, zfs_secpolicy_snapshot, DATASET_NAME, B_TRUE, - B_TRUE }, - { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_config, POOL_NAME, B_FALSE, - B_FALSE }, - { zfs_ioc_obj_to_path, zfs_secpolicy_config, DATASET_NAME, B_FALSE, - B_TRUE }, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, + { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_diff, POOL_NAME, B_FALSE, + POOL_CHECK_NONE }, + { zfs_ioc_obj_to_path, zfs_secpolicy_diff, DATASET_NAME, B_FALSE, + POOL_CHECK_SUSPENDED }, { zfs_ioc_pool_set_props, zfs_secpolicy_config, POOL_NAME, B_TRUE, - B_TRUE }, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, { zfs_ioc_pool_get_props, zfs_secpolicy_read, POOL_NAME, B_FALSE, - B_FALSE }, + POOL_CHECK_NONE }, { zfs_ioc_set_fsacl, zfs_secpolicy_fsacl, DATASET_NAME, B_TRUE, - B_TRUE }, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, { zfs_ioc_get_fsacl, zfs_secpolicy_read, DATASET_NAME, B_FALSE, - B_FALSE }, - { zfs_ioc_share, zfs_secpolicy_share, DATASET_NAME, B_FALSE, B_FALSE }, + POOL_CHECK_NONE }, + { zfs_ioc_share, zfs_secpolicy_share, DATASET_NAME, B_FALSE, + POOL_CHECK_NONE }, { zfs_ioc_inherit_prop, zfs_secpolicy_inherit, DATASET_NAME, B_TRUE, - B_TRUE }, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, { zfs_ioc_smb_acl, zfs_secpolicy_smb_acl, DATASET_NAME, B_FALSE, - B_FALSE }, - { zfs_ioc_userspace_one, zfs_secpolicy_userspace_one, - DATASET_NAME, B_FALSE, B_FALSE }, - { zfs_ioc_userspace_many, zfs_secpolicy_userspace_many, - DATASET_NAME, B_FALSE, B_FALSE }, + POOL_CHECK_NONE }, + { zfs_ioc_userspace_one, zfs_secpolicy_userspace_one, DATASET_NAME, + B_FALSE, POOL_CHECK_NONE }, + { zfs_ioc_userspace_many, zfs_secpolicy_userspace_many, DATASET_NAME, + B_FALSE, POOL_CHECK_NONE }, { zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade, - DATASET_NAME, B_FALSE, B_TRUE }, - { zfs_ioc_hold, zfs_secpolicy_hold, DATASET_NAME, B_TRUE, B_TRUE }, + DATASET_NAME, B_FALSE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, + { zfs_ioc_hold, zfs_secpolicy_hold, DATASET_NAME, B_TRUE, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, { zfs_ioc_release, zfs_secpolicy_release, DATASET_NAME, B_TRUE, - B_TRUE }, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, { zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, B_FALSE, - B_TRUE }, + POOL_CHECK_SUSPENDED }, { zfs_ioc_objset_recvd_props, zfs_secpolicy_read, DATASET_NAME, B_FALSE, - B_FALSE }, + POOL_CHECK_NONE }, { zfs_ioc_vdev_split, zfs_secpolicy_config, POOL_NAME, B_TRUE, - B_TRUE } + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, + { zfs_ioc_next_obj, zfs_secpolicy_read, DATASET_NAME, B_FALSE, + POOL_CHECK_NONE }, + { zfs_ioc_diff, zfs_secpolicy_diff, DATASET_NAME, B_FALSE, + POOL_CHECK_NONE }, + { zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot, DATASET_NAME, + B_FALSE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, + { zfs_ioc_obj_to_stats, zfs_secpolicy_diff, DATASET_NAME, B_FALSE, + POOL_CHECK_SUSPENDED } }; int -pool_status_check(const char *name, zfs_ioc_namecheck_t type) +pool_status_check(const char *name, zfs_ioc_namecheck_t type, + zfs_ioc_poolcheck_t check) { spa_t *spa; int error; ASSERT(type == POOL_NAME || type == DATASET_NAME); + if (check & POOL_CHECK_NONE) + return (0); + error = spa_open(name, &spa, FTAG); if (error == 0) { - if (spa_suspended(spa)) + if ((check & POOL_CHECK_SUSPENDED) && spa_suspended(spa)) error = EAGAIN; + else if ((check & POOL_CHECK_READONLY) && !spa_writeable(spa)) + error = EROFS; spa_close(spa, FTAG); } return (error); } +/* + * Find a free minor number. + */ +minor_t +zfsdev_minor_alloc(void) +{ + static minor_t last_minor; + minor_t m; + + ASSERT(MUTEX_HELD(&zfsdev_state_lock)); + + for (m = last_minor + 1; m != last_minor; m++) { + if (m > ZFSDEV_MAX_MINOR) + m = 1; + if (ddi_get_soft_state(zfsdev_state, m) == NULL) { + last_minor = m; + return (m); + } + } + + return (0); +} + +static int +zfs_ctldev_init(dev_t *devp) +{ + minor_t minor; + zfs_soft_state_t *zs; + + ASSERT(MUTEX_HELD(&zfsdev_state_lock)); + ASSERT(getminor(*devp) == 0); + + minor = zfsdev_minor_alloc(); + if (minor == 0) + return (ENXIO); + + if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) + return (EAGAIN); + + *devp = makedevice(getemajor(*devp), minor); + + zs = ddi_get_soft_state(zfsdev_state, minor); + zs->zss_type = ZSST_CTLDEV; + zfs_onexit_init((zfs_onexit_t **)&zs->zss_data); + + return (0); +} + +static void +zfs_ctldev_destroy(zfs_onexit_t *zo, minor_t minor) +{ + ASSERT(MUTEX_HELD(&zfsdev_state_lock)); + + zfs_onexit_destroy(zo); + ddi_soft_state_free(zfsdev_state, minor); +} + +void * +zfsdev_get_soft_state(minor_t minor, enum zfs_soft_state_type which) +{ + zfs_soft_state_t *zp; + + zp = ddi_get_soft_state(zfsdev_state, minor); + if (zp == NULL || zp->zss_type != which) + return (NULL); + + return (zp->zss_data); +} + +static int +zfsdev_open(dev_t *devp, int flag, int otyp, cred_t *cr) +{ + int error = 0; + + if (getminor(*devp) != 0) + return (zvol_open(devp, flag, otyp, cr)); + + /* This is the control device. Allocate a new minor if requested. */ + if (flag & FEXCL) { + mutex_enter(&zfsdev_state_lock); + error = zfs_ctldev_init(devp); + mutex_exit(&zfsdev_state_lock); + } + + return (error); +} + +static int +zfsdev_close(dev_t dev, int flag, int otyp, cred_t *cr) +{ + zfs_onexit_t *zo; + minor_t minor = getminor(dev); + + if (minor == 0) + return (0); + + mutex_enter(&zfsdev_state_lock); + zo = zfsdev_get_soft_state(minor, ZSST_CTLDEV); + if (zo == NULL) { + mutex_exit(&zfsdev_state_lock); + return (zvol_close(dev, flag, otyp, cr)); + } + zfs_ctldev_destroy(zo, minor); + mutex_exit(&zfsdev_state_lock); + + return (0); +} + static int zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) { zfs_cmd_t *zc; uint_t vec; int error, rc; + minor_t minor = getminor(dev); - if (getminor(dev) != 0) + if (minor != 0 && + zfsdev_get_soft_state(minor, ZSST_CTLDEV) == NULL) return (zvol_ioctl(dev, cmd, arg, flag, cr, rvalp)); vec = cmd - ZFS_IOC; @@ -4405,17 +4914,17 @@ zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) case POOL_NAME: if (pool_namecheck(zc->zc_name, NULL, NULL) != 0) error = EINVAL; - if (zfs_ioc_vec[vec].zvec_pool_check) - error = pool_status_check(zc->zc_name, - zfs_ioc_vec[vec].zvec_namecheck); + error = pool_status_check(zc->zc_name, + zfs_ioc_vec[vec].zvec_namecheck, + zfs_ioc_vec[vec].zvec_pool_check); break; case DATASET_NAME: if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0) error = EINVAL; - if (zfs_ioc_vec[vec].zvec_pool_check) - error = pool_status_check(zc->zc_name, - zfs_ioc_vec[vec].zvec_namecheck); + error = pool_status_check(zc->zc_name, + zfs_ioc_vec[vec].zvec_namecheck, + zfs_ioc_vec[vec].zvec_pool_check); break; case NO_NAME: @@ -4499,8 +5008,8 @@ zfs_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) * so most of the standard driver entry points are in zvol.c. */ static struct cb_ops zfs_cb_ops = { - zvol_open, /* open */ - zvol_close, /* close */ + zfsdev_open, /* open */ + zfsdev_close, /* close */ zvol_strategy, /* strategy */ nodev, /* print */ zvol_dump, /* dump */ diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c index bf9f37bca..26ab78279 100644 --- a/module/zfs/zfs_log.c +++ b/module/zfs/zfs_log.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include <sys/types.h> @@ -170,6 +169,12 @@ zfs_log_xvattr(lr_attr_t *lrattr, xvattr_t *xvap) if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) *attrs |= (xoap->xoa_reparse == 0) ? 0 : XAT0_REPARSE; + if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) + *attrs |= (xoap->xoa_offline == 0) ? 0 : + XAT0_OFFLINE; + if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) + *attrs |= (xoap->xoa_sparse == 0) ? 0 : + XAT0_SPARSE; } static void * @@ -231,7 +236,6 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, zfs_fuid_info_t *fuidp, vattr_t *vap) { itx_t *itx; - uint64_t seq; lr_create_t *lr; lr_acl_create_t *lracl; size_t aclsize; @@ -333,9 +337,7 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, */ bcopy(name, end, namesize); - seq = zil_itx_assign(zilog, itx, tx); - dzp->z_last_itx = seq; - zp->z_last_itx = seq; + zil_itx_assign(zilog, itx, tx); } /* @@ -343,10 +345,9 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, */ void zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *dzp, char *name) + znode_t *dzp, char *name, uint64_t foid) { itx_t *itx; - uint64_t seq; lr_remove_t *lr; size_t namesize = strlen(name) + 1; @@ -358,8 +359,9 @@ zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, lr->lr_doid = dzp->z_id; bcopy(name, (char *)(lr + 1), namesize); - seq = zil_itx_assign(zilog, itx, tx); - dzp->z_last_itx = seq; + itx->itx_oid = foid; + + zil_itx_assign(zilog, itx, tx); } /* @@ -370,7 +372,6 @@ zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *dzp, znode_t *zp, char *name) { itx_t *itx; - uint64_t seq; lr_link_t *lr; size_t namesize = strlen(name) + 1; @@ -383,9 +384,7 @@ zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, lr->lr_link_obj = zp->z_id; bcopy(name, (char *)(lr + 1), namesize); - seq = zil_itx_assign(zilog, itx, tx); - dzp->z_last_itx = seq; - zp->z_last_itx = seq; + zil_itx_assign(zilog, itx, tx); } /* @@ -396,7 +395,6 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *dzp, znode_t *zp, char *name, char *link) { itx_t *itx; - uint64_t seq; lr_create_t *lr; size_t namesize = strlen(name) + 1; size_t linksize = strlen(link) + 1; @@ -418,9 +416,7 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, bcopy(name, (char *)(lr + 1), namesize); bcopy(link, (char *)(lr + 1) + namesize, linksize); - seq = zil_itx_assign(zilog, itx, tx); - dzp->z_last_itx = seq; - zp->z_last_itx = seq; + zil_itx_assign(zilog, itx, tx); } /* @@ -431,7 +427,6 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp) { itx_t *itx; - uint64_t seq; lr_rename_t *lr; size_t snamesize = strlen(sname) + 1; size_t dnamesize = strlen(dname) + 1; @@ -445,11 +440,9 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, lr->lr_tdoid = tdzp->z_id; bcopy(sname, (char *)(lr + 1), snamesize); bcopy(dname, (char *)(lr + 1) + snamesize, dnamesize); + itx->itx_oid = szp->z_id; - seq = zil_itx_assign(zilog, itx, tx); - sdzp->z_last_itx = seq; - tdzp->z_last_itx = seq; - szp->z_last_itx = seq; + zil_itx_assign(zilog, itx, tx); } /* @@ -520,13 +513,11 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, itx->itx_private = zp->z_zfsvfs; - if ((zp->z_sync_cnt != 0) || (fsync_cnt != 0) || - (ioflag & (FSYNC | FDSYNC))) - itx->itx_sync = B_TRUE; - else + if (!(ioflag & (FSYNC | FDSYNC)) && (zp->z_sync_cnt == 0) && + (fsync_cnt == 0)) itx->itx_sync = B_FALSE; - zp->z_last_itx = zil_itx_assign(zilog, itx, tx); + zil_itx_assign(zilog, itx, tx); off += len; resid -= len; @@ -541,7 +532,6 @@ zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, uint64_t off, uint64_t len) { itx_t *itx; - uint64_t seq; lr_truncate_t *lr; if (zil_replaying(zilog, tx) || zp->z_unlinked) @@ -554,8 +544,7 @@ zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, lr->lr_length = len; itx->itx_sync = (zp->z_sync_cnt != 0); - seq = zil_itx_assign(zilog, itx, tx); - zp->z_last_itx = seq; + zil_itx_assign(zilog, itx, tx); } /* @@ -566,7 +555,6 @@ zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp) { itx_t *itx; - uint64_t seq; lr_setattr_t *lr; xvattr_t *xvap = (xvattr_t *)vap; size_t recsize = sizeof (lr_setattr_t); @@ -618,8 +606,7 @@ zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, (void) zfs_log_fuid_domains(fuidp, start); itx->itx_sync = (zp->z_sync_cnt != 0); - seq = zil_itx_assign(zilog, itx, tx); - zp->z_last_itx = seq; + zil_itx_assign(zilog, itx, tx); } /* @@ -630,7 +617,6 @@ zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, vsecattr_t *vsecp, zfs_fuid_info_t *fuidp) { itx_t *itx; - uint64_t seq; lr_acl_v0_t *lrv0; lr_acl_t *lr; int txtype; @@ -686,6 +672,5 @@ zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, } itx->itx_sync = (zp->z_sync_cnt != 0); - seq = zil_itx_assign(zilog, itx, tx); - zp->z_last_itx = seq; + zil_itx_assign(zilog, itx, tx); } diff --git a/module/zfs/zfs_onexit.c b/module/zfs/zfs_onexit.c new file mode 100644 index 000000000..9706de2b4 --- /dev/null +++ b/module/zfs/zfs_onexit.c @@ -0,0 +1,246 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/open.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/zfs_ioctl.h> +#include <sys/mkdev.h> +#include <sys/zfs_onexit.h> +#include <sys/zvol.h> + +/* + * ZFS kernel routines may add/delete callback routines to be invoked + * upon process exit (triggered via the close operation from the /dev/zfs + * driver). + * + * These cleanup callbacks are intended to allow for the accumulation + * of kernel state across multiple ioctls. User processes participate + * by opening ZFS_DEV with O_EXCL. This causes the ZFS driver to do a + * clone-open, generating a unique minor number. The process then passes + * along that file descriptor to each ioctl that might have a cleanup operation. + * + * Consumers of the onexit routines should call zfs_onexit_fd_hold() early + * on to validate the given fd and add a reference to its file table entry. + * This allows the consumer to do its work and then add a callback, knowing + * that zfs_onexit_add_cb() won't fail with EBADF. When finished, consumers + * should call zfs_onexit_fd_rele(). + * + * A simple example is zfs_ioc_recv(), where we might create an AVL tree + * with dataset/GUID mappings and then reuse that tree on subsequent + * zfs_ioc_recv() calls. + * + * On the first zfs_ioc_recv() call, dmu_recv_stream() will kmem_alloc() + * the AVL tree and pass it along with a callback function to + * zfs_onexit_add_cb(). The zfs_onexit_add_cb() routine will register the + * callback and return an action handle. + * + * The action handle is then passed from user space to subsequent + * zfs_ioc_recv() calls, so that dmu_recv_stream() can fetch its AVL tree + * by calling zfs_onexit_cb_data() with the device minor number and + * action handle. + * + * If the user process exits abnormally, the callback is invoked implicitly + * as part of the driver close operation. Once the user space process is + * finished with the accumulated kernel state, it can also just call close(2) + * on the cleanup fd to trigger the cleanup callback. + */ + +void +zfs_onexit_init(zfs_onexit_t **zop) +{ + zfs_onexit_t *zo; + + zo = *zop = kmem_zalloc(sizeof (zfs_onexit_t), KM_SLEEP); + mutex_init(&zo->zo_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zo->zo_actions, sizeof (zfs_onexit_action_node_t), + offsetof(zfs_onexit_action_node_t, za_link)); +} + +void +zfs_onexit_destroy(zfs_onexit_t *zo) +{ + zfs_onexit_action_node_t *ap; + + mutex_enter(&zo->zo_lock); + while ((ap = list_head(&zo->zo_actions)) != NULL) { + list_remove(&zo->zo_actions, ap); + mutex_exit(&zo->zo_lock); + ap->za_func(ap->za_data); + kmem_free(ap, sizeof (zfs_onexit_action_node_t)); + mutex_enter(&zo->zo_lock); + } + mutex_exit(&zo->zo_lock); + + list_destroy(&zo->zo_actions); + mutex_destroy(&zo->zo_lock); + kmem_free(zo, sizeof (zfs_onexit_t)); +} + +static int +zfs_onexit_minor_to_state(minor_t minor, zfs_onexit_t **zo) +{ + *zo = zfsdev_get_soft_state(minor, ZSST_CTLDEV); + if (*zo == NULL) + return (EBADF); + + return (0); +} + +/* + * Consumers might need to operate by minor number instead of fd, since + * they might be running in another thread (e.g. txg_sync_thread). Callers + * of this function must call zfs_onexit_fd_rele() when they're finished + * using the minor number. + */ +int +zfs_onexit_fd_hold(int fd, minor_t *minorp) +{ + file_t *fp; + zfs_onexit_t *zo; + + fp = getf(fd); + if (fp == NULL) + return (EBADF); + + *minorp = getminor(fp->f_vnode->v_rdev); + return (zfs_onexit_minor_to_state(*minorp, &zo)); +} + +void +zfs_onexit_fd_rele(int fd) +{ + releasef(fd); +} + +/* + * Add a callback to be invoked when the calling process exits. + */ +int +zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, + uint64_t *action_handle) +{ + zfs_onexit_t *zo; + zfs_onexit_action_node_t *ap; + int error; + + error = zfs_onexit_minor_to_state(minor, &zo); + if (error) + return (error); + + ap = kmem_alloc(sizeof (zfs_onexit_action_node_t), KM_SLEEP); + list_link_init(&ap->za_link); + ap->za_func = func; + ap->za_data = data; + + mutex_enter(&zo->zo_lock); + list_insert_tail(&zo->zo_actions, ap); + mutex_exit(&zo->zo_lock); + if (action_handle) + *action_handle = (uint64_t)(uintptr_t)ap; + + return (0); +} + +static zfs_onexit_action_node_t * +zfs_onexit_find_cb(zfs_onexit_t *zo, uint64_t action_handle) +{ + zfs_onexit_action_node_t *match; + zfs_onexit_action_node_t *ap; + list_t *l; + + ASSERT(MUTEX_HELD(&zo->zo_lock)); + + match = (zfs_onexit_action_node_t *)(uintptr_t)action_handle; + l = &zo->zo_actions; + for (ap = list_head(l); ap != NULL; ap = list_next(l, ap)) { + if (match == ap) + break; + } + return (ap); +} + +/* + * Delete the callback, triggering it first if 'fire' is set. + */ +int +zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, boolean_t fire) +{ + zfs_onexit_t *zo; + zfs_onexit_action_node_t *ap; + int error; + + error = zfs_onexit_minor_to_state(minor, &zo); + if (error) + return (error); + + mutex_enter(&zo->zo_lock); + ap = zfs_onexit_find_cb(zo, action_handle); + if (ap != NULL) { + list_remove(&zo->zo_actions, ap); + mutex_exit(&zo->zo_lock); + if (fire) + ap->za_func(ap->za_data); + kmem_free(ap, sizeof (zfs_onexit_action_node_t)); + } else { + mutex_exit(&zo->zo_lock); + error = ENOENT; + } + + return (error); +} + +/* + * Return the data associated with this callback. This allows consumers + * of the cleanup-on-exit interfaces to stash kernel data across system + * calls, knowing that it will be cleaned up if the calling process exits. + */ +int +zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, void **data) +{ + zfs_onexit_t *zo; + zfs_onexit_action_node_t *ap; + int error; + + *data = NULL; + + error = zfs_onexit_minor_to_state(minor, &zo); + if (error) + return (error); + + mutex_enter(&zo->zo_lock); + ap = zfs_onexit_find_cb(zo, action_handle); + if (ap != NULL) + *data = ap->za_data; + else + error = ENOENT; + mutex_exit(&zo->zo_lock); + + return (error); +} diff --git a/module/zfs/zfs_replay.c b/module/zfs/zfs_replay.c index f26009b02..9fb336856 100644 --- a/module/zfs/zfs_replay.c +++ b/module/zfs/zfs_replay.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include <sys/types.h> @@ -129,6 +128,10 @@ zfs_replay_xvattr(lr_attr_t *lrattr, xvattr_t *xvap) bcopy(scanstamp, xoap->xoa_av_scanstamp, AV_SCANSTAMP_SZ); if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) xoap->xoa_reparse = ((*attrs & XAT0_REPARSE) != 0); + if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) + xoap->xoa_offline = ((*attrs & XAT0_OFFLINE) != 0); + if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) + xoap->xoa_sparse = ((*attrs & XAT0_SPARSE) != 0); } static int @@ -625,7 +628,7 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap) znode_t *zp; int error; ssize_t resid; - uint64_t orig_eof, eod, offset, length; + uint64_t eod, offset, length; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -643,9 +646,20 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap) offset = lr->lr_offset; length = lr->lr_length; - eod = offset + length; /* end of data for this write */ + eod = offset + length; /* end of data for this write */ - orig_eof = zp->z_size; + /* + * This may be a write from a dmu_sync() for a whole block, + * and may extend beyond the current end of the file. + * We can't just replay what was written for this TX_WRITE as + * a future TX_WRITE2 may extend the eof and the data for that + * write needs to be there. So we write the whole block and + * reduce the eof. This needs to be done within the single dmu + * transaction created within vn_rdwr -> zfs_write. So a possible + * new end of file is passed through in zfsvfs->z_replay_eof + */ + + zfsvfs->z_replay_eof = 0; /* 0 means don't change end of file */ /* If it's a dmu_sync() block, write the whole block */ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { @@ -654,23 +668,15 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap) offset -= offset % blocksize; length = blocksize; } + if (zp->z_size < eod) + zfsvfs->z_replay_eof = eod; } error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, length, offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); - /* - * This may be a write from a dmu_sync() for a whole block, - * and may extend beyond the current end of the file. - * We can't just replay what was written for this TX_WRITE as - * a future TX_WRITE2 may extend the eof and the data for that - * write needs to be there. So we write the whole block and - * reduce the eof. - */ - if (orig_eof < zp->z_size) /* file length grew ? */ - zp->z_size = eod; - VN_RELE(ZTOV(zp)); + zfsvfs->z_replay_eof = 0; /* safety */ return (error); } @@ -694,10 +700,31 @@ zfs_replay_write2(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap) if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) return (error); +top: end = lr->lr_offset + lr->lr_length; if (end > zp->z_size) { - ASSERT3U(end - zp->z_size, <, zp->z_blksz); + dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); + zp->z_size = end; + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + VN_RELE(ZTOV(zp)); + if (error == ERESTART) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + return (error); + } + (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), + (void *)&zp->z_size, sizeof (uint64_t), tx); + + /* Ensure the replayed seq is updated */ + (void) zil_replaying(zfsvfs->z_log, tx); + + dmu_tx_commit(tx); } VN_RELE(ZTOV(zp)); diff --git a/module/zfs/zfs_sa.c b/module/zfs/zfs_sa.c index 73a40aa4f..d141e43d7 100644 --- a/module/zfs/zfs_sa.c +++ b/module/zfs/zfs_sa.c @@ -125,6 +125,7 @@ zfs_sa_get_scanstamp(znode_t *zp, xvattr_t *xvap) zfsvfs_t *zfsvfs = zp->z_zfsvfs; xoptattr_t *xoap; + ASSERT(MUTEX_HELD(&zp->z_lock)); VERIFY((xoap = xva_getxoptattr(xvap)) != NULL); if (zp->z_is_sa) { if (sa_lookup(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs), @@ -158,6 +159,7 @@ zfs_sa_set_scanstamp(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) zfsvfs_t *zfsvfs = zp->z_zfsvfs; xoptattr_t *xoap; + ASSERT(MUTEX_HELD(&zp->z_lock)); VERIFY((xoap = xva_getxoptattr(xvap)) != NULL); if (zp->z_is_sa) VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs), @@ -204,6 +206,7 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx) uint64_t crtime[2], mtime[2], ctime[2]; zfs_acl_phys_t znode_acl; char scanstamp[AV_SCANSTAMP_SZ]; + boolean_t drop_lock = B_FALSE; /* * No upgrade if ACL isn't cached @@ -214,6 +217,22 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx) if (zp->z_acl_cached == NULL || ZTOV(zp)->v_type == VLNK) return; + /* + * If the z_lock is held and we aren't the owner + * the just return since we don't want to deadlock + * trying to update the status of z_is_sa. This + * file can then be upgraded at a later time. + * + * Otherwise, we know we are doing the + * sa_update() that caused us to enter this function. + */ + if (mutex_owner(&zp->z_lock) != curthread) { + if (mutex_tryenter(&zp->z_lock) == 0) + return; + else + drop_lock = B_TRUE; + } + /* First do a bulk query of the attributes that aren't cached */ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); @@ -228,7 +247,7 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx) &znode_acl, 88); if (sa_bulk_lookup_locked(hdl, bulk, count) != 0) - return; + goto done; /* @@ -269,9 +288,10 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx) locate.cb_aclp = zp->z_acl_cached; SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_ACES(zfsvfs), zfs_acl_data_locator, &locate, zp->z_acl_cached->z_acl_bytes); + if (xattr) - SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_RDEV(zfsvfs), - NULL, &rdev, 8); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_XATTR(zfsvfs), + NULL, &xattr, 8); /* if scanstamp then add scanstamp */ @@ -291,6 +311,9 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx) znode_acl.z_acl_extern_obj, tx)); zp->z_is_sa = B_TRUE; +done: + if (drop_lock) + mutex_exit(&zp->z_lock); } void @@ -299,12 +322,11 @@ zfs_sa_upgrade_txholds(dmu_tx_t *tx, znode_t *zp) if (!zp->z_zfsvfs->z_use_sa || zp->z_is_sa) return; - ASSERT(!zp->z_is_sa); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); - if (ZFS_EXTERNAL_ACL(zp)) { - dmu_tx_hold_free(tx, ZFS_EXTERNAL_ACL(zp), 0, + if (zfs_external_acl(zp)) { + dmu_tx_hold_free(tx, zfs_external_acl(zp), 0, DMU_OBJECT_END); } } diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c index f68dde85f..cb8c1d086 100644 --- a/module/zfs/zfs_vfsops.c +++ b/module/zfs/zfs_vfsops.c @@ -166,7 +166,7 @@ zfs_sync(vfs_t *vfsp, short flag, cred_t *cr) } if (zfsvfs->z_log != NULL) - zil_commit(zfsvfs->z_log, UINT64_MAX, 0); + zil_commit(zfsvfs->z_log, 0); ZFS_EXIT(zfsvfs); } else { @@ -417,7 +417,8 @@ zfs_register_callbacks(vfs_t *vfsp) * of mount options, we stash away the current values and * restore them after we register the callbacks. */ - if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) { + if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) || + !spa_writeable(dmu_objset_spa(os))) { readonly = B_TRUE; do_readonly = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { @@ -821,23 +822,14 @@ zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup) { uint64_t fuid; uint64_t quotaobj; - uid_t id; quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; - id = isgroup ? zp->z_gid : zp->z_uid; + fuid = isgroup ? zp->z_gid : zp->z_uid; if (quotaobj == 0 || zfsvfs->z_replay) return (B_FALSE); - if (IS_EPHEMERAL(id)) { - VERIFY(0 == sa_lookup(zp->z_sa_hdl, - isgroup ? SA_ZPL_GID(zfsvfs) : SA_ZPL_UID(zfsvfs), - &fuid, sizeof (fuid))); - } else { - fuid = (uint64_t)id; - } - return (zfs_fuid_overquota(zfsvfs, isgroup, fuid)); } @@ -922,7 +914,10 @@ zfsvfs_create(const char *osname, zfsvfs_t **zfvp) sa_obj = 0; } - zfsvfs->z_attr_table = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END); + error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, + &zfsvfs->z_attr_table); + if (error) + goto out; if (zfsvfs->z_version >= ZPL_VERSION_SA) sa_register_update_callback(os, zfs_sa_upgrade); @@ -1043,12 +1038,15 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) * allocated and in the unlinked set, and there is an * intent log record saying to allocate it. */ - if (zil_replay_disable) { - zil_destroy(zfsvfs->z_log, B_FALSE); - } else { - zfsvfs->z_replay = B_TRUE; - zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector); - zfsvfs->z_replay = B_FALSE; + if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { + if (zil_replay_disable) { + zil_destroy(zfsvfs->z_log, B_FALSE); + } else { + zfsvfs->z_replay = B_TRUE; + zil_replay(zfsvfs->z_os, zfsvfs, + zfs_replay_vector); + zfsvfs->z_replay = B_FALSE; + } } zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */ } @@ -1172,6 +1170,7 @@ zfs_domount(vfs_t *vfsp, char *osname) goto out; xattr_changed_cb(zfsvfs, pval); zfsvfs->z_issnap = B_TRUE; + zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); dmu_objset_set_user(zfsvfs->z_os, zfsvfs); @@ -1808,10 +1807,10 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) /* * Evict cached data */ - if (dmu_objset_evict_dbufs(zfsvfs->z_os)) { - txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); - (void) dmu_objset_evict_dbufs(zfsvfs->z_os); - } + if (dmu_objset_is_dirty_anywhere(zfsvfs->z_os)) + if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY)) + txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); + (void) dmu_objset_evict_dbufs(zfsvfs->z_os); return (0); } @@ -2031,8 +2030,9 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname) goto bail; - zfsvfs->z_attr_table = sa_setup(zfsvfs->z_os, sa_obj, - zfs_attr_table, ZPL_END); + if ((err = sa_setup(zfsvfs->z_os, sa_obj, + zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table)) != 0) + goto bail; VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); @@ -2272,7 +2272,7 @@ static vfsdef_t vfw = { MNTTYPE_ZFS, zfs_vfsinit, VSW_HASPROTO|VSW_CANRWRO|VSW_CANREMOUNT|VSW_VOLATILEDEV|VSW_STATS| - VSW_XID, + VSW_XID|VSW_ZMOUNT, &zfs_mntopts }; diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index aa43c065f..a0720079c 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -132,7 +132,7 @@ * (6) At the end of each vnode op, the DMU tx must always commit, * regardless of whether there were any errors. * - * (7) After dropping all locks, invoke zil_commit(zilog, seq, foid) + * (7) After dropping all locks, invoke zil_commit(zilog, foid) * to ensure that synchronous semantics are provided when necessary. * * In general, this is how things should be ordered in each vnode op: @@ -164,7 +164,7 @@ * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * VN_RELE(...); // release held vnodes - * zil_commit(zilog, seq, foid); // synchronous when necessary + * zil_commit(zilog, foid); // synchronous when necessary * ZFS_EXIT(zfsvfs); // finished in zfs * return (error); // done, report error */ @@ -490,7 +490,7 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) * If we're in FRSYNC mode, sync out this znode before reading it. */ if (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id); + zil_commit(zfsvfs->z_log, zp->z_id); /* * Lock the range against changes. @@ -670,7 +670,7 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) xuio = (xuio_t *)uio; else - uio_prefaultpages(n, uio); + uio_prefaultpages(MIN(n, max_blksz), uio); /* * If in append mode, set the io offset pointer to eof. @@ -866,6 +866,8 @@ again: * been done, but that would still expose the ISUID/ISGID * to another app after the partial write is committed. * + * Note: we don't call zfs_fuid_map_id() here because + * user 0 is not an ephemeral uid. */ mutex_enter(&zp->z_acl_lock); if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | @@ -893,6 +895,14 @@ again: uio->uio_loffset); ASSERT(error == 0); } + /* + * If we are replaying and eof is non zero then force + * the file size to the specified eof. Note, there's no + * concurrency during replay. + */ + if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) + zp->z_size = zfsvfs->z_replay_eof; + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); @@ -902,6 +912,9 @@ again: break; ASSERT(tx_bytes == nbytes); n -= nbytes; + + if (!xuio && n > 0) + uio_prefaultpages(MIN(n, max_blksz), uio); } zfs_range_unlock(rl); @@ -917,7 +930,7 @@ again: if (ioflag & (FSYNC | FDSYNC) || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, zp->z_last_itx, zp->z_id); + zil_commit(zilog, zp->z_id); ZFS_EXIT(zfsvfs); return (0); @@ -1356,6 +1369,8 @@ top: error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); if (error) { + if (have_acl) + zfs_acl_ids_free(&acl_ids); if (strcmp(name, "..") == 0) error = EISDIR; ZFS_EXIT(zfsvfs); @@ -1371,6 +1386,8 @@ top: * to reference it. */ if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { + if (have_acl) + zfs_acl_ids_free(&acl_ids); goto out; } @@ -1381,6 +1398,8 @@ top: if ((dzp->z_pflags & ZFS_XATTR) && (vap->va_type != VREG)) { + if (have_acl) + zfs_acl_ids_free(&acl_ids); error = EINVAL; goto out; } @@ -1440,6 +1459,10 @@ top: } else { int aflags = (flag & FAPPEND) ? V_APPEND : 0; + if (have_acl) + zfs_acl_ids_free(&acl_ids); + have_acl = B_FALSE; + /* * A directory entry already exists for this name. */ @@ -1496,7 +1519,7 @@ out: } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, UINT64_MAX, 0); + zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); @@ -1527,12 +1550,13 @@ zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct, int flags) { znode_t *zp, *dzp = VTOZ(dvp); - znode_t *xzp = NULL; + znode_t *xzp; vnode_t *vp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; - uint64_t acl_obj, xattr_obj = 0; + uint64_t acl_obj, xattr_obj; uint64_t xattr_obj_unlinked = 0; + uint64_t obj = 0; zfs_dirlock_t *dl; dmu_tx_t *tx; boolean_t may_delete_now, delete_now = FALSE; @@ -1554,6 +1578,8 @@ zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct, } top: + xattr_obj = 0; + xzp = NULL; /* * Attempt to lock directory; fail if entry doesn't exist. */ @@ -1596,6 +1622,7 @@ top: * other holds on the vnode. So we dmu_tx_hold() the right things to * allow for either case. */ + obj = zp->z_id; tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); @@ -1612,16 +1639,17 @@ top: /* are there any extended attributes? */ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); - if (xattr_obj) { + if (error == 0 && xattr_obj) { error = zfs_zget(zfsvfs, xattr_obj, &xzp); ASSERT3U(error, ==, 0); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); } - /* are there any additional acls */ - if ((acl_obj = ZFS_EXTERNAL_ACL(zp)) != 0 && may_delete_now) + mutex_enter(&zp->z_lock); + if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now) dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); + mutex_exit(&zp->z_lock); /* charge as an update -- would be nice not to charge at all */ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); @@ -1630,6 +1658,8 @@ top: if (error) { zfs_dirent_unlock(dl); VN_RELE(vp); + if (xzp) + VN_RELE(ZTOV(xzp)); if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); @@ -1654,13 +1684,18 @@ top: if (unlinked) { + /* + * Hold z_lock so that we can make sure that the ACL obj + * hasn't changed. Could have been deleted due to + * zfs_sa_upgrade(). + */ + mutex_enter(&zp->z_lock); mutex_enter(&vp->v_lock); - (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj_unlinked, sizeof (xattr_obj_unlinked)); delete_now = may_delete_now && !toobig && vp->v_count == 1 && !vn_has_cached_data(vp) && - xattr_obj == xattr_obj_unlinked && ZFS_EXTERNAL_ACL(zp) == + xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) == acl_obj; mutex_exit(&vp->v_lock); } @@ -1676,6 +1711,7 @@ top: ASSERT3U(error, ==, 0); mutex_exit(&xzp->z_lock); zfs_unlinked_add(xzp, tx); + if (zp->z_is_sa) error = sa_remove(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), tx); @@ -1685,7 +1721,6 @@ top: sizeof (uint64_t), tx); ASSERT3U(error, ==, 0); } - mutex_enter(&zp->z_lock); mutex_enter(&vp->v_lock); vp->v_count--; ASSERT3U(vp->v_count, ==, 0); @@ -1693,13 +1728,14 @@ top: mutex_exit(&zp->z_lock); zfs_znode_delete(zp, tx); } else if (unlinked) { + mutex_exit(&zp->z_lock); zfs_unlinked_add(zp, tx); } txtype = TX_REMOVE; if (flags & FIGNORECASE) txtype |= TX_CI; - zfs_log_remove(zilog, tx, txtype, dzp, name); + zfs_log_remove(zilog, tx, txtype, dzp, name, obj); dmu_tx_commit(tx); out: @@ -1714,7 +1750,7 @@ out: VN_RELE(ZTOV(xzp)); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, UINT64_MAX, 0); + zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); @@ -1896,7 +1932,7 @@ top: zfs_dirent_unlock(dl); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, UINT64_MAX, 0); + zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (0); @@ -2011,7 +2047,7 @@ top: uint64_t txtype = TX_RMDIR; if (flags & FIGNORECASE) txtype |= TX_CI; - zfs_log_remove(zilog, tx, txtype, dzp, name); + zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT); } dmu_tx_commit(tx); @@ -2024,7 +2060,7 @@ out: VN_RELE(vp); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, UINT64_MAX, 0); + zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); @@ -2164,7 +2200,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, while (outcount < bytes_wanted) { ino64_t objnum; ushort_t reclen; - off64_t *next; + off64_t *next = NULL; /* * Special case `.', `..', and `.zfs'. @@ -2290,7 +2326,8 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, } else { offset += 1; } - *next = offset; + if (next) + *next = offset; } zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ @@ -2343,7 +2380,7 @@ zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); - zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id); + zil_commit(zfsvfs->z_log, zp->z_id); ZFS_EXIT(zfsvfs); } return (0); @@ -2384,6 +2421,8 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); + zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); @@ -2397,7 +2436,8 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, * Also, if we are the owner don't bother, since owner should * always be allowed to read basic attributes of file. */ - if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && (zp->z_uid != crgetuid(cr))) { + if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && + (vap->va_uid != crgetuid(cr))) { if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, skipaclchk, cr)) { ZFS_EXIT(zfsvfs); @@ -2413,8 +2453,6 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, mutex_enter(&zp->z_lock); vap->va_type = vp->v_type; vap->va_mode = zp->z_mode & MODEMASK; - vap->va_uid = zp->z_uid; - vap->va_gid = zp->z_gid; vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev; vap->va_nodeid = zp->z_id; if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp)) @@ -2515,6 +2553,22 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0); XVA_SET_RTN(xvap, XAT_REPARSE); } + if (XVA_ISSET_REQ(xvap, XAT_GEN)) { + xoap->xoa_generation = zp->z_gen; + XVA_SET_RTN(xvap, XAT_GEN); + } + + if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { + xoap->xoa_offline = + ((zp->z_pflags & ZFS_OFFLINE) != 0); + XVA_SET_RTN(xvap, XAT_OFFLINE); + } + + if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { + xoap->xoa_sparse = + ((zp->z_pflags & ZFS_SPARSE) != 0); + XVA_SET_RTN(xvap, XAT_SPARSE); + } } ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime); @@ -2570,7 +2624,7 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, int trim_mask = 0; uint64_t new_mode; uint64_t new_uid, new_gid; - uint64_t xattr_obj = 0; + uint64_t xattr_obj; uint64_t mtime[2], ctime[2]; znode_t *attrzp; int need_policy = FALSE; @@ -2578,7 +2632,7 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, zfs_fuid_info_t *fuidp = NULL; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap; - zfs_acl_t *aclp = NULL; + zfs_acl_t *aclp; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; boolean_t fuid_dirtied = B_FALSE; sa_bulk_attr_t bulk[7], xattr_bulk[7]; @@ -2657,6 +2711,7 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, top: attrzp = NULL; + aclp = NULL; /* Can this be moved to before the top label? */ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { @@ -2692,6 +2747,8 @@ top: ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || XVA_ISSET_REQ(xvap, XAT_READONLY) || XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || + XVA_ISSET_REQ(xvap, XAT_OFFLINE) || + XVA_ISSET_REQ(xvap, XAT_SPARSE) || XVA_ISSET_REQ(xvap, XAT_CREATETIME) || XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, @@ -2748,8 +2805,7 @@ top: mutex_enter(&zp->z_lock); oldva.va_mode = zp->z_mode; - oldva.va_uid = zp->z_uid; - oldva.va_gid = zp->z_gid; + zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); if (mask & AT_XVATTR) { /* * Update xvattr mask to include only those attributes @@ -2880,10 +2936,10 @@ top: mask = vap->va_mask; if ((mask & (AT_UID | AT_GID))) { - (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, - sizeof (xattr_obj)); + err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj, sizeof (xattr_obj)); - if (xattr_obj) { + if (err == 0 && xattr_obj) { err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp); if (err) goto out2; @@ -2891,8 +2947,10 @@ top: if (mask & AT_UID) { new_uid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); - if (vap->va_uid != zp->z_uid && + if (new_uid != zp->z_uid && zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) { + if (attrzp) + VN_RELE(ZTOV(attrzp)); err = EDQUOT; goto out2; } @@ -2903,6 +2961,8 @@ top: cr, ZFS_GROUP, &fuidp); if (new_gid != zp->z_gid && zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) { + if (attrzp) + VN_RELE(ZTOV(attrzp)); err = EDQUOT; goto out2; } @@ -2912,32 +2972,33 @@ top: if (mask & AT_MODE) { uint64_t pmode = zp->z_mode; + uint64_t acl_obj; new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); - if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) - goto out; + zfs_acl_chmod_setattr(zp, &aclp, new_mode); - if (!zp->z_is_sa && ZFS_EXTERNAL_ACL(zp)) { + mutex_enter(&zp->z_lock); + if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { /* * Are we upgrading ACL from old V0 format * to V1 format? */ - if (zfsvfs->z_version <= ZPL_VERSION_FUID && - ZNODE_ACL_VERSION(zp) == + if (zfsvfs->z_version >= ZPL_VERSION_FUID && + zfs_znode_acl_version(zp) == ZFS_ACL_VERSION_INITIAL) { - dmu_tx_hold_free(tx, - ZFS_EXTERNAL_ACL(zp), 0, + dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } else { - dmu_tx_hold_write(tx, ZFS_EXTERNAL_ACL(zp), 0, + dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); } } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } + mutex_exit(&zp->z_lock); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); } else { if ((mask & AT_XVATTR) && @@ -2973,12 +3034,17 @@ top: * updated as a side-effect of calling this function. */ + + if (mask & (AT_UID|AT_GID|AT_MODE)) + mutex_enter(&zp->z_acl_lock); mutex_enter(&zp->z_lock); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); if (attrzp) { + if (mask & (AT_UID|AT_GID|AT_MODE)) + mutex_enter(&attrzp->z_acl_lock); mutex_enter(&attrzp->z_lock); SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, @@ -2990,26 +3056,24 @@ top: if (mask & AT_UID) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); - zp->z_uid = zfs_fuid_map_id(zfsvfs, new_uid, - cr, ZFS_OWNER); + zp->z_uid = new_uid; if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); - attrzp->z_uid = zp->z_uid; + attrzp->z_uid = new_uid; } } if (mask & AT_GID) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); - zp->z_gid = zfs_fuid_map_id(zfsvfs, new_gid, cr, - ZFS_GROUP); + zp->z_gid = new_gid; if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); - attrzp->z_gid = zp->z_gid; + attrzp->z_gid = new_gid; } } if (!(mask & AT_MODE)) { @@ -3026,20 +3090,18 @@ top: } if (mask & AT_MODE) { - mutex_enter(&zp->z_acl_lock); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); zp->z_mode = new_mode; ASSERT3U((uintptr_t)aclp, !=, NULL); err = zfs_aclset_common(zp, aclp, cr, tx); ASSERT3U(err, ==, 0); + if (zp->z_acl_cached) + zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = aclp; aclp = NULL; - mutex_exit(&zp->z_acl_lock); } - if (attrzp) - mutex_exit(&attrzp->z_lock); if (mask & AT_ATIME) { ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime); @@ -3118,7 +3180,14 @@ top: zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); mutex_exit(&zp->z_lock); + if (mask & (AT_UID|AT_GID|AT_MODE)) + mutex_exit(&zp->z_acl_lock); + if (attrzp) { + if (mask & (AT_UID|AT_GID|AT_MODE)) + mutex_exit(&attrzp->z_acl_lock); + mutex_exit(&attrzp->z_lock); + } out: if (err == 0 && attrzp) { err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, @@ -3145,10 +3214,9 @@ out: dmu_tx_commit(tx); } - out2: if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, UINT64_MAX, 0); + zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (err); @@ -3555,9 +3623,8 @@ top: error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); if (error == 0) { zfs_log_rename(zilog, tx, TX_RENAME | - (flags & FIGNORECASE ? TX_CI : 0), - sdzp, sdl->dl_name, tdzp, tdl->dl_name, - szp); + (flags & FIGNORECASE ? TX_CI : 0), sdzp, + sdl->dl_name, tdzp, tdl->dl_name, szp); /* * Update path information for the target vnode @@ -3600,7 +3667,7 @@ out: VN_RELE(ZTOV(tzp)); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, UINT64_MAX, 0); + zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); @@ -3724,11 +3791,13 @@ top: if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); + mutex_enter(&zp->z_lock); if (zp->z_is_sa) error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), link, len, tx); else zfs_sa_symlink(zp, link, len, tx); + mutex_exit(&zp->z_lock); zp->z_size = len; (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), @@ -3751,7 +3820,7 @@ top: VN_RELE(ZTOV(zp)); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, UINT64_MAX, 0); + zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); @@ -3785,11 +3854,13 @@ zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct) ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); + mutex_enter(&zp->z_lock); if (zp->z_is_sa) error = sa_lookup_uio(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), uio); else error = zfs_sa_readlink(zp, uio); + mutex_exit(&zp->z_lock); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); @@ -3828,6 +3899,7 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, int error; int zf = ZNEW; uint64_t parent; + uid_t owner; ASSERT(tdvp->v_type == VDIR); @@ -3887,8 +3959,8 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, } - if (szp->z_uid != crgetuid(cr) && - secpolicy_basic_link(cr) != 0) { + owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER); + if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) { ZFS_EXIT(zfsvfs); return (EPERM); } @@ -3944,7 +4016,7 @@ top: } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, UINT64_MAX, 0); + zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); @@ -4181,7 +4253,7 @@ zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr, out: zfs_range_unlock(rl); if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zfsvfs->z_log, UINT64_MAX, zp->z_id); + zil_commit(zfsvfs->z_log, zp->z_id); ZFS_EXIT(zfsvfs); return (error); } @@ -4836,7 +4908,7 @@ zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, error = zfs_setacl(zp, vsecp, skipaclchk, cr); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, UINT64_MAX, 0); + zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c index 24bd3ddcd..e1e4e9e03 100644 --- a/module/zfs/zfs_znode.c +++ b/module/zfs/zfs_znode.c @@ -63,6 +63,7 @@ #include <sys/zfs_znode.h> #include <sys/sa.h> #include <sys/zfs_sa.h> +#include <sys/zfs_stat.h> #include "zfs_prop.h" #include "zfs_comutil.h" @@ -81,9 +82,6 @@ #define ZNODE_STAT_ADD(stat) /* nothing */ #endif /* ZNODE_STATS */ -#define POINTER_IS_VALID(p) (!((uintptr_t)(p) & 0x3)) -#define POINTER_INVALIDATE(pp) (*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1)) - /* * Functions needed for userland (ie: libzpool) are not put under * #ifdef_KERNEL; the rest of the functions have dependencies @@ -136,6 +134,7 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) zp->z_dirlocks = NULL; zp->z_acl_cached = NULL; + zp->z_moved = 0; return (0); } @@ -196,7 +195,6 @@ zfs_znode_move_impl(znode_t *ozp, znode_t *nzp) nzp->z_blksz = ozp->z_blksz; nzp->z_seq = ozp->z_seq; nzp->z_mapcnt = ozp->z_mapcnt; - nzp->z_last_itx = ozp->z_last_itx; nzp->z_gen = ozp->z_gen; nzp->z_sync_cnt = ozp->z_sync_cnt; nzp->z_is_sa = ozp->z_is_sa; @@ -228,6 +226,12 @@ zfs_znode_move_impl(znode_t *ozp, znode_t *nzp) */ ozp->z_sa_hdl = NULL; POINTER_INVALIDATE(&ozp->z_zfsvfs); + + /* + * Mark the znode. + */ + nzp->z_moved = 1; + ozp->z_moved = (uint8_t)-1; } /*ARGSUSED*/ @@ -478,6 +482,8 @@ zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx) vattr.va_gid = crgetgid(kcred); sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP); + ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs)); + sharezp->z_moved = 0; sharezp->z_unlinked = 0; sharezp->z_atime_dirty = 0; sharezp->z_zfsvfs = zfsvfs; @@ -619,7 +625,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, vnode_t *vp; uint64_t mode; uint64_t parent; - uint64_t uid, gid; sa_bulk_attr_t bulk[9]; int count = 0; @@ -627,6 +632,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, ASSERT(zp->z_dirlocks == NULL); ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); + zp->z_moved = 0; /* * Defer setting z_zfsvfs until the znode is ready to be a candidate for @@ -636,7 +642,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, zp->z_unlinked = 0; zp->z_atime_dirty = 0; zp->z_mapcnt = 0; - zp->z_last_itx = 0; zp->z_id = db->db_object; zp->z_blksz = blksz; zp->z_seq = 0x7A4653; @@ -659,9 +664,9 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &zp->z_atime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, - &uid, 8); + &zp->z_uid, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, - &gid, 8); + &zp->z_gid, 8); if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0) { if (hdl == NULL) @@ -670,8 +675,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, return (NULL); } - zp->z_uid = zfs_fuid_map_id(zfsvfs, uid, CRED(), ZFS_OWNER); - zp->z_gid = zfs_fuid_map_id(zfsvfs, gid, CRED(), ZFS_GROUP); zp->z_mode = mode; vp->v_vfsp = zfsvfs->z_parent->z_vfs; @@ -705,7 +708,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, case VREG: vp->v_flag |= VMODSORT; if (parent == zfsvfs->z_shares_dir) { - ASSERT(uid == 0 && gid == 0); + ASSERT(zp->z_uid == 0 && zp->z_gid == 0); vn_setops(vp, zfs_sharevnodeops); } else { vn_setops(vp, zfs_fvnodeops); @@ -759,7 +762,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, { uint64_t crtime[2], atime[2], mtime[2], ctime[2]; uint64_t mode, size, links, parent, pflags; - uint64_t dzp_pflags = 0; + uint64_t dzp_pflags = 0; uint64_t rdev = 0; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; dmu_buf_t *db; @@ -794,7 +797,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, */ /* * There's currently no mechanism for pre-reading the blocks that will - * be to needed allocate a new object, so we accept the small chance + * be needed to allocate a new object, so we accept the small chance * that there will be an i/o error and we will fail one of the * assertions below. */ @@ -1085,6 +1088,16 @@ zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_REPARSE); } + if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { + ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_OFFLINE); + } + if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { + ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_SPARSE); + } } int @@ -1174,7 +1187,6 @@ zfs_rezget(znode_t *zp) dmu_buf_t *db; uint64_t obj_num = zp->z_id; uint64_t mode; - uint64_t uid, gid; sa_bulk_attr_t bulk[8]; int err; int count = 0; @@ -1220,28 +1232,26 @@ zfs_rezget(znode_t *zp) SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &zp->z_atime, sizeof (zp->z_atime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, - &uid, sizeof (uid)); + &zp->z_uid, sizeof (zp->z_uid)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, - &gid, sizeof (gid)); + &zp->z_gid, sizeof (zp->z_gid)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, sizeof (mode)); - zp->z_mode = mode; - if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) { zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (EIO); } + zp->z_mode = mode; + if (gen != zp->z_gen) { zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (EIO); } - zp->z_uid = zfs_fuid_map_id(zfsvfs, uid, CRED(), ZFS_OWNER); - zp->z_gid = zfs_fuid_map_id(zfsvfs, gid, CRED(), ZFS_GROUP); zp->z_unlinked = (zp->z_links == 0); zp->z_blksz = doi.doi_data_block_size; @@ -1256,11 +1266,13 @@ zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os = zfsvfs->z_os; uint64_t obj = zp->z_id; - uint64_t acl_obj = ZFS_EXTERNAL_ACL(zp); + uint64_t acl_obj = zfs_external_acl(zp); ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); - if (acl_obj) + if (acl_obj) { + VERIFY(!zp->z_is_sa); VERIFY(0 == dmu_object_free(os, acl_obj, tx)); + } VERIFY(0 == dmu_object_free(os, obj, tx)); zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); @@ -1562,6 +1574,8 @@ zfs_trunc(znode_t *zp, uint64_t end) dmu_tx_t *tx; rl_t *rl; int error; + sa_bulk_attr_t bulk[2]; + int count = 0; /* * We will change zp_size, lock the whole file. @@ -1598,9 +1612,15 @@ top: } zp->z_size = end; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), + NULL, &zp->z_size, sizeof (zp->z_size)); - VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs), - &zp->z_size, sizeof (zp->z_size), tx)); + if (end == 0) { + zp->z_pflags &= ~ZFS_SPARSE; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &zp->z_pflags, 8); + } + VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0); dmu_tx_commit(tx); @@ -1805,6 +1825,8 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) vattr.va_gid = crgetgid(cr); rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); + ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs)); + rootzp->z_moved = 0; rootzp->z_unlinked = 0; rootzp->z_atime_dirty = 0; rootzp->z_is_sa = USE_SA(version, os); @@ -1822,7 +1844,10 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) zfsvfs.z_use_sa = USE_SA(version, os); zfsvfs.z_norm = norm; - zfsvfs.z_attr_table = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END); + error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, + &zfsvfs.z_attr_table); + + ASSERT(error == 0); /* * Fold case on file systems that are always or sometimes case @@ -1838,7 +1863,6 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_init(&zfsvfs.z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); - ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs)); rootzp->z_zfsvfs = &zfsvfs; VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, cr, NULL, &acl_ids)); @@ -1868,78 +1892,121 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) #endif /* _KERNEL */ -/* - * Given an object number, return its parent object number and whether - * or not the object is an extended attribute directory. - */ static int -zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir, - sa_attr_type_t *sa_table) +zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table) +{ + uint64_t sa_obj = 0; + int error; + + error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); + if (error != 0 && error != ENOENT) + return (error); + + error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table); + return (error); +} + +static int +zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp, + dmu_buf_t **db) { - dmu_buf_t *db; dmu_object_info_t doi; int error; - uint64_t parent; - uint64_t pflags; - uint64_t mode; - sa_bulk_attr_t bulk[3]; - sa_handle_t *hdl; - int count = 0; - if ((error = sa_buf_hold(osp, obj, FTAG, &db)) != 0) + if ((error = sa_buf_hold(osp, obj, FTAG, db)) != 0) return (error); - dmu_object_info_from_db(db, &doi); + dmu_object_info_from_db(*db, &doi); if ((doi.doi_bonus_type != DMU_OT_SA && doi.doi_bonus_type != DMU_OT_ZNODE) || doi.doi_bonus_type == DMU_OT_ZNODE && doi.doi_bonus_size < sizeof (znode_phys_t)) { - sa_buf_rele(db, FTAG); - return (EINVAL); + sa_buf_rele(*db, FTAG); + return (ENOTSUP); } - if ((error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, - &hdl)) != 0) { - sa_buf_rele(db, FTAG); + error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp); + if (error != 0) { + sa_buf_rele(*db, FTAG); return (error); } - SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], - NULL, &parent, 8); + return (0); +} + +void +zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db) +{ + sa_handle_destroy(hdl); + sa_buf_rele(db, FTAG); +} + +/* + * Given an object number, return its parent object number and whether + * or not the object is an extended attribute directory. + */ +static int +zfs_obj_to_pobj(sa_handle_t *hdl, sa_attr_type_t *sa_table, uint64_t *pobjp, + int *is_xattrdir) +{ + uint64_t parent; + uint64_t pflags; + uint64_t mode; + sa_bulk_attr_t bulk[3]; + int count = 0; + int error; + + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL, + &parent, sizeof (parent)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL, - &pflags, 8); + &pflags, sizeof (pflags)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, - &mode, 8); + &mode, sizeof (mode)); - if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0) { - sa_buf_rele(db, FTAG); - sa_handle_destroy(hdl); + if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0) return (error); - } + *pobjp = parent; *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode); - sa_handle_destroy(hdl); - sa_buf_rele(db, FTAG); return (0); } -int -zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) +/* + * Given an object number, return some zpl level statistics + */ +static int +zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table, + zfs_stat_t *sb) { + sa_bulk_attr_t bulk[4]; + int count = 0; + + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, + &sb->zs_mode, sizeof (sb->zs_mode)); + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL, + &sb->zs_gen, sizeof (sb->zs_gen)); + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL, + &sb->zs_links, sizeof (sb->zs_links)); + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL, + &sb->zs_ctime, sizeof (sb->zs_ctime)); + + return (sa_bulk_lookup(hdl, bulk, count)); +} + +static int +zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl, + sa_attr_type_t *sa_table, char *buf, int len) +{ + sa_handle_t *sa_hdl; + sa_handle_t *prevhdl = NULL; + dmu_buf_t *prevdb = NULL; + dmu_buf_t *sa_db = NULL; char *path = buf + len - 1; - sa_attr_type_t *sa_table; int error; - uint64_t sa_obj = 0; *path = '\0'; - - error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); - - if (error != 0 && error != ENOENT) - return (error); - - sa_table = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END); + sa_hdl = hdl; for (;;) { uint64_t pobj; @@ -1947,8 +2014,11 @@ zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) size_t complen; int is_xattrdir; - if ((error = zfs_obj_to_pobj(osp, obj, &pobj, - &is_xattrdir, sa_table)) != 0) + if (prevdb) + zfs_release_sa_handle(prevhdl, prevdb); + + if ((error = zfs_obj_to_pobj(sa_hdl, sa_table, &pobj, + &is_xattrdir)) != 0) break; if (pobj == obj) { @@ -1972,6 +2042,22 @@ zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) ASSERT(path >= buf); bcopy(component, path, complen); obj = pobj; + + if (sa_hdl != hdl) { + prevhdl = sa_hdl; + prevdb = sa_db; + } + error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db); + if (error != 0) { + sa_hdl = prevhdl; + sa_db = prevdb; + break; + } + } + + if (sa_hdl != NULL && sa_hdl != hdl) { + ASSERT(sa_db != NULL); + zfs_release_sa_handle(sa_hdl, sa_db); } if (error == 0) @@ -1979,3 +2065,57 @@ zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) return (error); } + +int +zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) +{ + sa_attr_type_t *sa_table; + sa_handle_t *hdl; + dmu_buf_t *db; + int error; + + error = zfs_sa_setup(osp, &sa_table); + if (error != 0) + return (error); + + error = zfs_grab_sa_handle(osp, obj, &hdl, &db); + if (error != 0) + return (error); + + error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); + + zfs_release_sa_handle(hdl, db); + return (error); +} + +int +zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, + char *buf, int len) +{ + char *path = buf + len - 1; + sa_attr_type_t *sa_table; + sa_handle_t *hdl; + dmu_buf_t *db; + int error; + + *path = '\0'; + + error = zfs_sa_setup(osp, &sa_table); + if (error != 0) + return (error); + + error = zfs_grab_sa_handle(osp, obj, &hdl, &db); + if (error != 0) + return (error); + + error = zfs_obj_to_stats_impl(hdl, sa_table, sb); + if (error != 0) { + zfs_release_sa_handle(hdl, db); + return (error); + } + + error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); + + zfs_release_sa_handle(hdl, db); + return (error); +} diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 4aa4d10b0..c66313ff6 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -34,7 +34,7 @@ #include <sys/zil.h> #include <sys/zil_impl.h> #include <sys/dsl_dataset.h> -#include <sys/vdev.h> +#include <sys/vdev_impl.h> #include <sys/dmu_tx.h> #include <sys/dsl_pool.h> @@ -78,12 +78,21 @@ boolean_t zfs_nocacheflush = B_FALSE; static kmem_cache_t *zil_lwb_cache; -static boolean_t zil_empty(zilog_t *zilog); +static void zil_async_to_sync(zilog_t *zilog, uint64_t foid); #define LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \ sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused)) +/* + * ziltest is by and large an ugly hack, but very useful in + * checking replay without tedious work. + * When running ziltest we want to keep all itx's and so maintain + * a single list in the zl_itxg[] that uses a high txg: ZILTEST_TXG + * We subtract TXG_CONCURRENT_STATES to allow for common code. + */ +#define ZILTEST_TXG (UINT64_MAX - TXG_CONCURRENT_STATES) + static int zil_bp_compare(const void *x1, const void *x2) { @@ -631,6 +640,7 @@ zil_check_log_chain(const char *osname, void *tx) { zilog_t *zilog; objset_t *os; + blkptr_t *bp; int error; ASSERT(tx == NULL); @@ -642,6 +652,29 @@ zil_check_log_chain(const char *osname, void *tx) } zilog = dmu_objset_zil(os); + bp = (blkptr_t *)&zilog->zl_header->zh_log; + + /* + * Check the first block and determine if it's on a log device + * which may have been removed or faulted prior to loading this + * pool. If so, there's no point in checking the rest of the log + * as its content should have already been synced to the pool. + */ + if (!BP_IS_HOLE(bp)) { + vdev_t *vd; + boolean_t valid = B_TRUE; + + spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER); + vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0])); + if (vd->vdev_islog && vdev_is_dead(vd)) + valid = vdev_log_state_valid(vd); + spa_config_exit(os->os_spa, SCL_STATE, FTAG); + + if (!valid) { + dmu_objset_rele(os, FTAG); + return (0); + } + } /* * Because tx == NULL, zil_claim_log_block() will not actually claim @@ -661,8 +694,8 @@ zil_check_log_chain(const char *osname, void *tx) static int zil_vdev_compare(const void *x1, const void *x2) { - uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev; - uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev; + const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev; + const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev; if (v1 < v2) return (-1); @@ -703,7 +736,7 @@ zil_add_block(zilog_t *zilog, const blkptr_t *bp) mutex_exit(&zilog->zl_vdev_lock); } -void +static void zil_flush_vdevs(zilog_t *zilog) { spa_t *spa = zilog->zl_spa; @@ -1045,6 +1078,7 @@ zil_itx_create(uint64_t txtype, size_t lrsize) itx->itx_lr.lrc_reclen = lrsize; itx->itx_sod = lrsize; /* if write & WR_NEED_COPY will be increased */ itx->itx_lr.lrc_seq = 0; /* defensive */ + itx->itx_sync = B_TRUE; /* default is synchronous */ return (itx); } @@ -1055,190 +1089,362 @@ zil_itx_destroy(itx_t *itx) kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen); } -uint64_t -zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) +/* + * Free up the sync and async itxs. The itxs_t has already been detached + * so no locks are needed. + */ +static void +zil_itxg_clean(itxs_t *itxs) { - uint64_t seq; + itx_t *itx; + list_t *list; + avl_tree_t *t; + void *cookie; + itx_async_node_t *ian; + + list = &itxs->i_sync_list; + while ((itx = list_head(list)) != NULL) { + list_remove(list, itx); + kmem_free(itx, offsetof(itx_t, itx_lr) + + itx->itx_lr.lrc_reclen); + } - ASSERT(itx->itx_lr.lrc_seq == 0); - ASSERT(!zilog->zl_replay); + cookie = NULL; + t = &itxs->i_async_tree; + while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { + list = &ian->ia_list; + while ((itx = list_head(list)) != NULL) { + list_remove(list, itx); + kmem_free(itx, offsetof(itx_t, itx_lr) + + itx->itx_lr.lrc_reclen); + } + list_destroy(list); + kmem_free(ian, sizeof (itx_async_node_t)); + } + avl_destroy(t); - mutex_enter(&zilog->zl_lock); - list_insert_tail(&zilog->zl_itx_list, itx); - zilog->zl_itx_list_sz += itx->itx_sod; - itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx); - itx->itx_lr.lrc_seq = seq = ++zilog->zl_itx_seq; - mutex_exit(&zilog->zl_lock); + kmem_free(itxs, sizeof (itxs_t)); +} + +static int +zil_aitx_compare(const void *x1, const void *x2) +{ + const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid; + const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid; - return (seq); + if (o1 < o2) + return (-1); + if (o1 > o2) + return (1); + + return (0); } /* - * Free up all in-memory intent log transactions that have now been synced. + * Remove all async itx with the given oid. */ static void -zil_itx_clean(zilog_t *zilog) +zil_remove_async(zilog_t *zilog, uint64_t oid) { - uint64_t synced_txg = spa_last_synced_txg(zilog->zl_spa); - uint64_t freeze_txg = spa_freeze_txg(zilog->zl_spa); + uint64_t otxg, txg; + itx_async_node_t *ian; + avl_tree_t *t; + avl_index_t where; list_t clean_list; itx_t *itx; + ASSERT(oid != 0); list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node)); - mutex_enter(&zilog->zl_lock); - /* wait for a log writer to finish walking list */ - while (zilog->zl_writer) { - cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock); - } + if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ + otxg = ZILTEST_TXG; + else + otxg = spa_last_synced_txg(zilog->zl_spa) + 1; - /* - * Move the sync'd log transactions to a separate list so we can call - * kmem_free without holding the zl_lock. - * - * There is no need to set zl_writer as we don't drop zl_lock here - */ - while ((itx = list_head(&zilog->zl_itx_list)) != NULL && - itx->itx_lr.lrc_txg <= MIN(synced_txg, freeze_txg)) { - list_remove(&zilog->zl_itx_list, itx); - zilog->zl_itx_list_sz -= itx->itx_sod; - list_insert_tail(&clean_list, itx); - } - cv_broadcast(&zilog->zl_cv_writer); - mutex_exit(&zilog->zl_lock); + for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { + itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; - /* destroy sync'd log transactions */ + mutex_enter(&itxg->itxg_lock); + if (itxg->itxg_txg != txg) { + mutex_exit(&itxg->itxg_lock); + continue; + } + + /* + * Locate the object node and append its list. + */ + t = &itxg->itxg_itxs->i_async_tree; + ian = avl_find(t, &oid, &where); + if (ian != NULL) + list_move_tail(&clean_list, &ian->ia_list); + mutex_exit(&itxg->itxg_lock); + } while ((itx = list_head(&clean_list)) != NULL) { list_remove(&clean_list, itx); - zil_itx_destroy(itx); + kmem_free(itx, offsetof(itx_t, itx_lr) + + itx->itx_lr.lrc_reclen); } list_destroy(&clean_list); } +void +zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) +{ + uint64_t txg; + itxg_t *itxg; + itxs_t *itxs, *clean = NULL; + + /* + * Object ids can be re-instantiated in the next txg so + * remove any async transactions to avoid future leaks. + * This can happen if a fsync occurs on the re-instantiated + * object for a WR_INDIRECT or WR_NEED_COPY write, which gets + * the new file data and flushes a write record for the old object. + */ + if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_REMOVE) + zil_remove_async(zilog, itx->itx_oid); + + /* + * Ensure the data of a renamed file is committed before the rename. + */ + if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME) + zil_async_to_sync(zilog, itx->itx_oid); + + if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) + txg = ZILTEST_TXG; + else + txg = dmu_tx_get_txg(tx); + + itxg = &zilog->zl_itxg[txg & TXG_MASK]; + mutex_enter(&itxg->itxg_lock); + itxs = itxg->itxg_itxs; + if (itxg->itxg_txg != txg) { + if (itxs != NULL) { + /* + * The zil_clean callback hasn't got around to cleaning + * this itxg. Save the itxs for release below. + * This should be rare. + */ + atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod); + itxg->itxg_sod = 0; + clean = itxg->itxg_itxs; + } + ASSERT(itxg->itxg_sod == 0); + itxg->itxg_txg = txg; + itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP); + + list_create(&itxs->i_sync_list, sizeof (itx_t), + offsetof(itx_t, itx_node)); + avl_create(&itxs->i_async_tree, zil_aitx_compare, + sizeof (itx_async_node_t), + offsetof(itx_async_node_t, ia_node)); + } + if (itx->itx_sync) { + list_insert_tail(&itxs->i_sync_list, itx); + atomic_add_64(&zilog->zl_itx_list_sz, itx->itx_sod); + itxg->itxg_sod += itx->itx_sod; + } else { + avl_tree_t *t = &itxs->i_async_tree; + uint64_t foid = ((lr_ooo_t *)&itx->itx_lr)->lr_foid; + itx_async_node_t *ian; + avl_index_t where; + + ian = avl_find(t, &foid, &where); + if (ian == NULL) { + ian = kmem_alloc(sizeof (itx_async_node_t), KM_SLEEP); + list_create(&ian->ia_list, sizeof (itx_t), + offsetof(itx_t, itx_node)); + ian->ia_foid = foid; + avl_insert(t, ian, where); + } + list_insert_tail(&ian->ia_list, itx); + } + + itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx); + mutex_exit(&itxg->itxg_lock); + + /* Release the old itxs now we've dropped the lock */ + if (clean != NULL) + zil_itxg_clean(clean); +} + /* * If there are any in-memory intent log transactions which have now been * synced then start up a taskq to free them. */ void -zil_clean(zilog_t *zilog) +zil_clean(zilog_t *zilog, uint64_t synced_txg) { - itx_t *itx; + itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK]; + itxs_t *clean_me; - mutex_enter(&zilog->zl_lock); - itx = list_head(&zilog->zl_itx_list); - if ((itx != NULL) && - (itx->itx_lr.lrc_txg <= spa_last_synced_txg(zilog->zl_spa))) { - (void) taskq_dispatch(zilog->zl_clean_taskq, - (task_func_t *)zil_itx_clean, zilog, TQ_NOSLEEP); + mutex_enter(&itxg->itxg_lock); + if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) { + mutex_exit(&itxg->itxg_lock); + return; + } + ASSERT3U(itxg->itxg_txg, <=, synced_txg); + ASSERT(itxg->itxg_txg != 0); + ASSERT(zilog->zl_clean_taskq != NULL); + atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod); + itxg->itxg_sod = 0; + clean_me = itxg->itxg_itxs; + itxg->itxg_itxs = NULL; + itxg->itxg_txg = 0; + mutex_exit(&itxg->itxg_lock); + /* + * Preferably start a task queue to free up the old itxs but + * if taskq_dispatch can't allocate resources to do that then + * free it in-line. This should be rare. Note, using TQ_SLEEP + * created a bad performance problem. + */ + if (taskq_dispatch(zilog->zl_clean_taskq, + (void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP) == NULL) + zil_itxg_clean(clean_me); +} + +/* + * Get the list of itxs to commit into zl_itx_commit_list. + */ +static void +zil_get_commit_list(zilog_t *zilog) +{ + uint64_t otxg, txg; + list_t *commit_list = &zilog->zl_itx_commit_list; + uint64_t push_sod = 0; + + if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ + otxg = ZILTEST_TXG; + else + otxg = spa_last_synced_txg(zilog->zl_spa) + 1; + + for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { + itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; + + mutex_enter(&itxg->itxg_lock); + if (itxg->itxg_txg != txg) { + mutex_exit(&itxg->itxg_lock); + continue; + } + + list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list); + push_sod += itxg->itxg_sod; + itxg->itxg_sod = 0; + + mutex_exit(&itxg->itxg_lock); + } + atomic_add_64(&zilog->zl_itx_list_sz, -push_sod); +} + +/* + * Move the async itxs for a specified object to commit into sync lists. + */ +static void +zil_async_to_sync(zilog_t *zilog, uint64_t foid) +{ + uint64_t otxg, txg; + itx_async_node_t *ian; + avl_tree_t *t; + avl_index_t where; + + if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ + otxg = ZILTEST_TXG; + else + otxg = spa_last_synced_txg(zilog->zl_spa) + 1; + + for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { + itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; + + mutex_enter(&itxg->itxg_lock); + if (itxg->itxg_txg != txg) { + mutex_exit(&itxg->itxg_lock); + continue; + } + + /* + * If a foid is specified then find that node and append its + * list. Otherwise walk the tree appending all the lists + * to the sync list. We add to the end rather than the + * beginning to ensure the create has happened. + */ + t = &itxg->itxg_itxs->i_async_tree; + if (foid != 0) { + ian = avl_find(t, &foid, &where); + if (ian != NULL) { + list_move_tail(&itxg->itxg_itxs->i_sync_list, + &ian->ia_list); + } + } else { + void *cookie = NULL; + + while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { + list_move_tail(&itxg->itxg_itxs->i_sync_list, + &ian->ia_list); + list_destroy(&ian->ia_list); + kmem_free(ian, sizeof (itx_async_node_t)); + } + } + mutex_exit(&itxg->itxg_lock); } - mutex_exit(&zilog->zl_lock); } static void -zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid) +zil_commit_writer(zilog_t *zilog) { uint64_t txg; - uint64_t commit_seq = 0; - itx_t *itx, *itx_next; + itx_t *itx; lwb_t *lwb; - spa_t *spa; + spa_t *spa = zilog->zl_spa; int error = 0; - zilog->zl_writer = B_TRUE; ASSERT(zilog->zl_root_zio == NULL); - spa = zilog->zl_spa; + + mutex_exit(&zilog->zl_lock); + + zil_get_commit_list(zilog); + + /* + * Return if there's nothing to commit before we dirty the fs by + * calling zil_create(). + */ + if (list_head(&zilog->zl_itx_commit_list) == NULL) { + mutex_enter(&zilog->zl_lock); + return; + } if (zilog->zl_suspend) { lwb = NULL; } else { lwb = list_tail(&zilog->zl_lwb_list); - if (lwb == NULL) { - /* - * Return if there's nothing to flush before we - * dirty the fs by calling zil_create() - */ - if (list_is_empty(&zilog->zl_itx_list)) { - zilog->zl_writer = B_FALSE; - return; - } - mutex_exit(&zilog->zl_lock); + if (lwb == NULL) lwb = zil_create(zilog); - mutex_enter(&zilog->zl_lock); - } } - ASSERT(lwb == NULL || lwb->lwb_zio == NULL); - /* Loop through in-memory log transactions filling log blocks. */ DTRACE_PROBE1(zil__cw1, zilog_t *, zilog); - - for (itx = list_head(&zilog->zl_itx_list); itx; itx = itx_next) { - /* - * Save the next pointer. Even though we drop zl_lock below, - * all threads that can remove itx list entries (other writers - * and zil_itx_clean()) can't do so until they have zl_writer. - */ - itx_next = list_next(&zilog->zl_itx_list, itx); - - /* - * Determine whether to push this itx. - * Push all transactions related to specified foid and - * all other transactions except those that can be logged - * out of order (TX_WRITE, TX_TRUNCATE, TX_SETATTR, TX_ACL) - * for all other files. - * - * If foid == 0 (meaning "push all foids") or - * itx->itx_sync is set (meaning O_[D]SYNC), push regardless. - */ - if (foid != 0 && !itx->itx_sync && - TX_OOO(itx->itx_lr.lrc_txtype) && - ((lr_ooo_t *)&itx->itx_lr)->lr_foid != foid) - continue; /* skip this record */ - - if ((itx->itx_lr.lrc_seq > seq) && - ((lwb == NULL) || (LWB_EMPTY(lwb)) || - (lwb->lwb_nused + itx->itx_sod > lwb->lwb_sz))) - break; - - list_remove(&zilog->zl_itx_list, itx); - zilog->zl_itx_list_sz -= itx->itx_sod; - - mutex_exit(&zilog->zl_lock); - + while (itx = list_head(&zilog->zl_itx_commit_list)) { txg = itx->itx_lr.lrc_txg; ASSERT(txg); - if (txg > spa_last_synced_txg(spa) || - txg > spa_freeze_txg(spa)) + if (txg > spa_last_synced_txg(spa) || txg > spa_freeze_txg(spa)) lwb = zil_lwb_commit(zilog, itx, lwb); - - zil_itx_destroy(itx); - - mutex_enter(&zilog->zl_lock); + list_remove(&zilog->zl_itx_commit_list, itx); + kmem_free(itx, offsetof(itx_t, itx_lr) + + itx->itx_lr.lrc_reclen); } DTRACE_PROBE1(zil__cw2, zilog_t *, zilog); - /* determine commit sequence number */ - itx = list_head(&zilog->zl_itx_list); - if (itx) - commit_seq = itx->itx_lr.lrc_seq - 1; - else - commit_seq = zilog->zl_itx_seq; - mutex_exit(&zilog->zl_lock); /* write the last block out */ if (lwb != NULL && lwb->lwb_zio != NULL) lwb = zil_lwb_write_start(zilog, lwb); - zilog->zl_prev_used = zilog->zl_cur_used; zilog->zl_cur_used = 0; /* * Wait if necessary for the log blocks to be on stable storage. */ if (zilog->zl_root_zio) { - DTRACE_PROBE1(zil__cw3, zilog_t *, zilog); error = zio_wait(zilog->zl_root_zio); zilog->zl_root_zio = NULL; - DTRACE_PROBE1(zil__cw4, zilog_t *, zilog); zil_flush_vdevs(zilog); } @@ -1246,10 +1452,6 @@ zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid) txg_wait_synced(zilog->zl_dmu_pool, 0); mutex_enter(&zilog->zl_lock); - zilog->zl_writer = B_FALSE; - - ASSERT3U(commit_seq, >=, zilog->zl_commit_seq); - zilog->zl_commit_seq = commit_seq; /* * Remember the highest committed log sequence number for ztest. @@ -1261,58 +1463,61 @@ zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid) } /* - * Push zfs transactions to stable storage up to the supplied sequence number. + * Commit zfs transactions to stable storage. * If foid is 0 push out all transactions, otherwise push only those - * for that file or might have been used to create that file. + * for that object or might reference that object. + * + * itxs are committed in batches. In a heavily stressed zil there will be + * a commit writer thread who is writing out a bunch of itxs to the log + * for a set of committing threads (cthreads) in the same batch as the writer. + * Those cthreads are all waiting on the same cv for that batch. + * + * There will also be a different and growing batch of threads that are + * waiting to commit (qthreads). When the committing batch completes + * a transition occurs such that the cthreads exit and the qthreads become + * cthreads. One of the new cthreads becomes the writer thread for the + * batch. Any new threads arriving become new qthreads. + * + * Only 2 condition variables are needed and there's no transition + * between the two cvs needed. They just flip-flop between qthreads + * and cthreads. + * + * Using this scheme we can efficiently wakeup up only those threads + * that have been committed. */ void -zil_commit(zilog_t *zilog, uint64_t seq, uint64_t foid) +zil_commit(zilog_t *zilog, uint64_t foid) { - if (zilog->zl_sync == ZFS_SYNC_DISABLED || seq == 0) - return; + uint64_t mybatch; - mutex_enter(&zilog->zl_lock); + if (zilog->zl_sync == ZFS_SYNC_DISABLED) + return; - seq = MIN(seq, zilog->zl_itx_seq); /* cap seq at largest itx seq */ + /* move the async itxs for the foid to the sync queues */ + zil_async_to_sync(zilog, foid); + mutex_enter(&zilog->zl_lock); + mybatch = zilog->zl_next_batch; while (zilog->zl_writer) { - cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock); - if (seq <= zilog->zl_commit_seq) { + cv_wait(&zilog->zl_cv_batch[mybatch & 1], &zilog->zl_lock); + if (mybatch <= zilog->zl_com_batch) { mutex_exit(&zilog->zl_lock); return; } } - zil_commit_writer(zilog, seq, foid); /* drops zl_lock */ - /* wake up others waiting on the commit */ - cv_broadcast(&zilog->zl_cv_writer); - mutex_exit(&zilog->zl_lock); -} - -/* - * Report whether all transactions are committed. - */ -static boolean_t -zil_is_committed(zilog_t *zilog) -{ - lwb_t *lwb; - boolean_t committed; - - mutex_enter(&zilog->zl_lock); - while (zilog->zl_writer) - cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock); + zilog->zl_next_batch++; + zilog->zl_writer = B_TRUE; + zil_commit_writer(zilog); + zilog->zl_com_batch = mybatch; + zilog->zl_writer = B_FALSE; + mutex_exit(&zilog->zl_lock); - if (!list_is_empty(&zilog->zl_itx_list)) - committed = B_FALSE; /* unpushed transactions */ - else if ((lwb = list_head(&zilog->zl_lwb_list)) == NULL) - committed = B_TRUE; /* intent log never used */ - else if (list_next(&zilog->zl_lwb_list, lwb) != NULL) - committed = B_FALSE; /* zil_sync() not done yet */ - else - committed = B_TRUE; /* everything synced */ + /* wake up one thread to become the next writer */ + cv_signal(&zilog->zl_cv_batch[(mybatch+1) & 1]); - mutex_exit(&zilog->zl_lock); - return (committed); + /* wake up all threads waiting for this batch to be committed */ + cv_broadcast(&zilog->zl_cv_batch[mybatch & 1]); } /* @@ -1425,15 +1630,21 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys) zilog->zl_destroy_txg = TXG_INITIAL - 1; zilog->zl_logbias = dmu_objset_logbias(os); zilog->zl_sync = dmu_objset_syncprop(os); + zilog->zl_next_batch = 1; mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL); - list_create(&zilog->zl_itx_list, sizeof (itx_t), - offsetof(itx_t, itx_node)); + for (int i = 0; i < TXG_SIZE; i++) { + mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL, + MUTEX_DEFAULT, NULL); + } list_create(&zilog->zl_lwb_list, sizeof (lwb_t), offsetof(lwb_t, lwb_node)); + list_create(&zilog->zl_itx_commit_list, sizeof (itx_t), + offsetof(itx_t, itx_node)); + mutex_init(&zilog->zl_vdev_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&zilog->zl_vdev_tree, zil_vdev_compare, @@ -1441,6 +1652,8 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys) cv_init(&zilog->zl_cv_writer, NULL, CV_DEFAULT, NULL); cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL); + cv_init(&zilog->zl_cv_batch[0], NULL, CV_DEFAULT, NULL); + cv_init(&zilog->zl_cv_batch[1], NULL, CV_DEFAULT, NULL); return (zilog); } @@ -1448,27 +1661,47 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys) void zil_free(zilog_t *zilog) { - lwb_t *lwb; + lwb_t *head_lwb; zilog->zl_stop_sync = 1; - while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { - list_remove(&zilog->zl_lwb_list, lwb); - if (lwb->lwb_buf != NULL) - zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); - kmem_cache_free(zil_lwb_cache, lwb); + /* + * After zil_close() there should only be one lwb with a buffer. + */ + head_lwb = list_head(&zilog->zl_lwb_list); + if (head_lwb) { + ASSERT(head_lwb == list_tail(&zilog->zl_lwb_list)); + list_remove(&zilog->zl_lwb_list, head_lwb); + zio_buf_free(head_lwb->lwb_buf, head_lwb->lwb_sz); + kmem_cache_free(zil_lwb_cache, head_lwb); } list_destroy(&zilog->zl_lwb_list); avl_destroy(&zilog->zl_vdev_tree); mutex_destroy(&zilog->zl_vdev_lock); - ASSERT(list_head(&zilog->zl_itx_list) == NULL); - list_destroy(&zilog->zl_itx_list); + ASSERT(list_is_empty(&zilog->zl_itx_commit_list)); + list_destroy(&zilog->zl_itx_commit_list); + + for (int i = 0; i < TXG_SIZE; i++) { + /* + * It's possible for an itx to be generated that doesn't dirty + * a txg (e.g. ztest TX_TRUNCATE). So there's no zil_clean() + * callback to remove the entry. We remove those here. + * + * Also free up the ziltest itxs. + */ + if (zilog->zl_itxg[i].itxg_itxs) + zil_itxg_clean(zilog->zl_itxg[i].itxg_itxs); + mutex_destroy(&zilog->zl_itxg[i].itxg_lock); + } + mutex_destroy(&zilog->zl_lock); cv_destroy(&zilog->zl_cv_writer); cv_destroy(&zilog->zl_cv_suspend); + cv_destroy(&zilog->zl_cv_batch[0]); + cv_destroy(&zilog->zl_cv_batch[1]); kmem_free(zilog, sizeof (zilog_t)); } @@ -1494,26 +1727,28 @@ zil_open(objset_t *os, zil_get_data_t *get_data) void zil_close(zilog_t *zilog) { + lwb_t *tail_lwb; + uint64_t txg = 0; + + zil_commit(zilog, 0); /* commit all itx */ + /* - * If the log isn't already committed, mark the objset dirty - * (so zil_sync() will be called) and wait for that txg to sync. + * The lwb_max_txg for the stubby lwb will reflect the last activity + * for the zil. After a txg_wait_synced() on the txg we know all the + * callbacks have occurred that may clean the zil. Only then can we + * destroy the zl_clean_taskq. */ - if (!zil_is_committed(zilog)) { - uint64_t txg; - dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); - VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0); - dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); - txg = dmu_tx_get_txg(tx); - dmu_tx_commit(tx); + mutex_enter(&zilog->zl_lock); + tail_lwb = list_tail(&zilog->zl_lwb_list); + if (tail_lwb != NULL) + txg = tail_lwb->lwb_max_txg; + mutex_exit(&zilog->zl_lock); + if (txg) txg_wait_synced(zilog->zl_dmu_pool, txg); - } taskq_destroy(zilog->zl_clean_taskq); zilog->zl_clean_taskq = NULL; zilog->zl_get_data = NULL; - - zil_itx_clean(zilog); - ASSERT(list_head(&zilog->zl_itx_list) == NULL); } /* @@ -1545,15 +1780,7 @@ zil_suspend(zilog_t *zilog) zilog->zl_suspending = B_TRUE; mutex_exit(&zilog->zl_lock); - zil_commit(zilog, UINT64_MAX, 0); - - /* - * Wait for any in-flight log writes to complete. - */ - mutex_enter(&zilog->zl_lock); - while (zilog->zl_writer) - cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock); - mutex_exit(&zilog->zl_lock); + zil_commit(zilog, 0); zil_destroy(zilog, B_FALSE); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 88d80af4e..1ba2330bd 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -2247,6 +2247,26 @@ zio_vdev_io_start(zio_t *zio) return (vdev_mirror_ops.vdev_op_io_start(zio)); } + /* + * We keep track of time-sensitive I/Os so that the scan thread + * can quickly react to certain workloads. In particular, we care + * about non-scrubbing, top-level reads and writes with the following + * characteristics: + * - synchronous writes of user data to non-slog devices + * - any reads of user data + * When these conditions are met, adjust the timestamp of spa_last_io + * which allows the scan thread to adjust its workload accordingly. + */ + if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL && + vd == vd->vdev_top && !vd->vdev_islog && + zio->io_bookmark.zb_objset != DMU_META_OBJSET && + zio->io_txg != spa_syncing_txg(spa)) { + uint64_t old = spa->spa_last_io; + uint64_t new = ddi_get_lbolt64(); + if (old != new) + (void) atomic_cas_64(&spa->spa_last_io, old, new); + } + align = 1ULL << vd->vdev_top->vdev_ashift; if (P2PHASE(zio->io_size, align) != 0) { @@ -2262,7 +2282,7 @@ zio_vdev_io_start(zio_t *zio) ASSERT(P2PHASE(zio->io_offset, align) == 0); ASSERT(P2PHASE(zio->io_size, align) == 0); - ASSERT(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); + VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); /* * If this is a repair I/O, and there's no self-healing involved -- @@ -2744,6 +2764,7 @@ zio_done(zio_t *zio) if ((zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_FREE) && + !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_error == ENXIO && spa_load_state(spa) == SPA_LOAD_NONE && spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c index 16eaed668..9ae7d1f69 100644 --- a/module/zfs/zio_inject.c +++ b/module/zfs/zio_inject.c @@ -476,7 +476,6 @@ int zio_clear_fault(int id) { inject_handler_t *handler; - int ret; rw_enter(&inject_lock, RW_WRITER); @@ -486,18 +485,18 @@ zio_clear_fault(int id) break; if (handler == NULL) { - ret = ENOENT; - } else { - list_remove(&inject_handlers, handler); - spa_inject_delref(handler->zi_spa); - kmem_free(handler, sizeof (inject_handler_t)); - atomic_add_32(&zio_injection_enabled, -1); - ret = 0; + rw_exit(&inject_lock); + return (ENOENT); } + list_remove(&inject_handlers, handler); rw_exit(&inject_lock); - return (ret); + spa_inject_delref(handler->zi_spa); + kmem_free(handler, sizeof (inject_handler_t)); + atomic_add_32(&zio_injection_enabled, -1); + + return (0); } void diff --git a/module/zfs/zrlock.c b/module/zfs/zrlock.c new file mode 100644 index 000000000..ec94b0855 --- /dev/null +++ b/module/zfs/zrlock.c @@ -0,0 +1,194 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* + * A Zero Reference Lock (ZRL) is a reference count that can lock out new + * references only when the count is zero and only without waiting if the count + * is not already zero. It is similar to a read-write lock in that it allows + * multiple readers and only a single writer, but it does not allow a writer to + * block while waiting for readers to exit, and therefore the question of + * reader/writer priority is moot (no WRWANT bit). Since the equivalent of + * rw_enter(&lock, RW_WRITER) is disallowed and only tryenter() is allowed, it + * is perfectly safe for the same reader to acquire the same lock multiple + * times. The fact that a ZRL is reentrant for readers (through multiple calls + * to zrl_add()) makes it convenient for determining whether something is + * actively referenced without the fuss of flagging lock ownership across + * function calls. + */ +#include <sys/zrlock.h> + +/* + * A ZRL can be locked only while there are zero references, so ZRL_LOCKED is + * treated as zero references. + */ +#define ZRL_LOCKED ((uint32_t)-1) +#define ZRL_DESTROYED -2 + +void +zrl_init(zrlock_t *zrl) +{ + mutex_init(&zrl->zr_mtx, NULL, MUTEX_DEFAULT, NULL); + zrl->zr_refcount = 0; + cv_init(&zrl->zr_cv, NULL, CV_DEFAULT, NULL); +#ifdef ZFS_DEBUG + zrl->zr_owner = NULL; + zrl->zr_caller = NULL; +#endif +} + +void +zrl_destroy(zrlock_t *zrl) +{ + ASSERT(zrl->zr_refcount == 0); + + mutex_destroy(&zrl->zr_mtx); + zrl->zr_refcount = ZRL_DESTROYED; + cv_destroy(&zrl->zr_cv); +} + +void +#ifdef ZFS_DEBUG +zrl_add_debug(zrlock_t *zrl, const char *zc) +#else +zrl_add(zrlock_t *zrl) +#endif +{ + uint32_t n = (uint32_t)zrl->zr_refcount; + + while (n != ZRL_LOCKED) { + uint32_t cas = atomic_cas_32( + (uint32_t *)&zrl->zr_refcount, n, n + 1); + if (cas == n) { + ASSERT((int32_t)n >= 0); +#ifdef ZFS_DEBUG + if (zrl->zr_owner == curthread) { + DTRACE_PROBE2(zrlock__reentry, + zrlock_t *, zrl, uint32_t, n); + } + zrl->zr_owner = curthread; + zrl->zr_caller = zc; +#endif + return; + } + n = cas; + } + + mutex_enter(&zrl->zr_mtx); + while (zrl->zr_refcount == ZRL_LOCKED) { + cv_wait(&zrl->zr_cv, &zrl->zr_mtx); + } + ASSERT(zrl->zr_refcount >= 0); + zrl->zr_refcount++; +#ifdef ZFS_DEBUG + zrl->zr_owner = curthread; + zrl->zr_caller = zc; +#endif + mutex_exit(&zrl->zr_mtx); +} + +void +zrl_remove(zrlock_t *zrl) +{ + uint32_t n; + + n = atomic_dec_32_nv((uint32_t *)&zrl->zr_refcount); + ASSERT((int32_t)n >= 0); +#ifdef ZFS_DEBUG + if (zrl->zr_owner == curthread) { + zrl->zr_owner = NULL; + zrl->zr_caller = NULL; + } +#endif +} + +int +zrl_tryenter(zrlock_t *zrl) +{ + uint32_t n = (uint32_t)zrl->zr_refcount; + + if (n == 0) { + uint32_t cas = atomic_cas_32( + (uint32_t *)&zrl->zr_refcount, 0, ZRL_LOCKED); + if (cas == 0) { +#ifdef ZFS_DEBUG + ASSERT(zrl->zr_owner == NULL); + zrl->zr_owner = curthread; +#endif + return (1); + } + } + + ASSERT((int32_t)n > ZRL_DESTROYED); + + return (0); +} + +void +zrl_exit(zrlock_t *zrl) +{ + ASSERT(zrl->zr_refcount == ZRL_LOCKED); + + mutex_enter(&zrl->zr_mtx); +#ifdef ZFS_DEBUG + ASSERT(zrl->zr_owner == curthread); + zrl->zr_owner = NULL; + membar_producer(); /* make sure the owner store happens first */ +#endif + zrl->zr_refcount = 0; + cv_broadcast(&zrl->zr_cv); + mutex_exit(&zrl->zr_mtx); +} + +int +zrl_refcount(zrlock_t *zrl) +{ + ASSERT(zrl->zr_refcount > ZRL_DESTROYED); + + int n = (int)zrl->zr_refcount; + return (n <= 0 ? 0 : n); +} + +int +zrl_is_zero(zrlock_t *zrl) +{ + ASSERT(zrl->zr_refcount > ZRL_DESTROYED); + + return (zrl->zr_refcount <= 0); +} + +int +zrl_is_locked(zrlock_t *zrl) +{ + ASSERT(zrl->zr_refcount > ZRL_DESTROYED); + + return (zrl->zr_refcount == ZRL_LOCKED); +} + +#ifdef ZFS_DEBUG +kthread_t * +zrl_owner(zrlock_t *zrl) +{ + return (zrl->zr_owner); +} +#endif |