diff options
Diffstat (limited to 'module/zfs')
-rw-r--r-- | module/zfs/dbuf.c | 60 | ||||
-rw-r--r-- | module/zfs/dmu.c | 20 | ||||
-rw-r--r-- | module/zfs/dmu_object.c | 124 | ||||
-rw-r--r-- | module/zfs/dmu_objset.c | 47 | ||||
-rw-r--r-- | module/zfs/dmu_send.c | 42 | ||||
-rw-r--r-- | module/zfs/dmu_traverse.c | 8 | ||||
-rw-r--r-- | module/zfs/dmu_tx.c | 4 | ||||
-rw-r--r-- | module/zfs/dnode.c | 238 | ||||
-rw-r--r-- | module/zfs/dnode_sync.c | 20 | ||||
-rw-r--r-- | module/zfs/dsl_scan.c | 10 | ||||
-rw-r--r-- | module/zfs/sa.c | 21 | ||||
-rw-r--r-- | module/zfs/spa.c | 17 | ||||
-rw-r--r-- | module/zfs/spa_misc.c | 10 | ||||
-rw-r--r-- | module/zfs/zap.c | 10 | ||||
-rw-r--r-- | module/zfs/zap_micro.c | 59 | ||||
-rw-r--r-- | module/zfs/zfeature_common.c | 11 | ||||
-rw-r--r-- | module/zfs/zfs_acl.c | 2 | ||||
-rw-r--r-- | module/zfs/zfs_ioctl.c | 30 | ||||
-rw-r--r-- | module/zfs/zfs_log.c | 2 | ||||
-rw-r--r-- | module/zfs/zfs_replay.c | 32 | ||||
-rw-r--r-- | module/zfs/zfs_sa.c | 7 | ||||
-rw-r--r-- | module/zfs/zfs_znode.c | 35 | ||||
-rw-r--r-- | module/zfs/zil.c | 8 |
23 files changed, 663 insertions, 154 deletions
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 126748994..4bbbd0525 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -478,7 +478,6 @@ dbuf_verify(dmu_buf_impl_t *db) ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID); } else if (db->db_blkid == DMU_SPILL_BLKID) { ASSERT(dn != NULL); - ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); ASSERT0(db->db.db_offset); } else { ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); @@ -730,13 +729,18 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) ASSERT(db->db_buf == NULL); if (db->db_blkid == DMU_BONUS_BLKID) { + /* + * The bonus length stored in the dnode may be less than + * the maximum available space in the bonus buffer. + */ int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); + int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); ASSERT3U(bonuslen, <=, db->db.db_size); - db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); - arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); - if (bonuslen < DN_MAX_BONUSLEN) - bzero(db->db.db_data, DN_MAX_BONUSLEN); + db->db.db_data = zio_buf_alloc(max_bonuslen); + arc_space_consume(max_bonuslen, ARC_SPACE_OTHER); + if (bonuslen < max_bonuslen) + bzero(db->db.db_data, max_bonuslen); if (bonuslen) bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); DB_DNODE_EXIT(db); @@ -962,9 +966,11 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) ASSERT(dr->dr_txg >= txg - 2); if (db->db_blkid == DMU_BONUS_BLKID) { /* Note that the data bufs here are zio_bufs */ - dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); - arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); - bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); + dnode_t *dn = DB_DNODE(db); + int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); + dr->dt.dl.dr_data = zio_buf_alloc(bonuslen); + arc_space_consume(bonuslen, ARC_SPACE_OTHER); + bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen); } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { int size = 
db->db.db_size; arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); @@ -1858,8 +1864,10 @@ dbuf_clear(dmu_buf_impl_t *db) if (db->db_state == DB_CACHED) { ASSERT(db->db.db_data != NULL); if (db->db_blkid == DMU_BONUS_BLKID) { - zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); - arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); + int slots = DB_DNODE(db)->dn_num_slots; + int bonuslen = DN_SLOTS_TO_BONUSLEN(slots); + zio_buf_free(db->db.db_data, bonuslen); + arc_space_return(bonuslen, ARC_SPACE_OTHER); } db->db.db_data = NULL; db->db_state = DB_UNCACHED; @@ -1929,7 +1937,7 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, mutex_enter(&dn->dn_mtx); if (dn->dn_have_spill && (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) - *bpp = &dn->dn_phys->dn_spill; + *bpp = DN_SPILL_BLKPTR(dn->dn_phys); else *bpp = NULL; dbuf_add_ref(dn->dn_dbuf, NULL); @@ -2018,7 +2026,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, if (blkid == DMU_BONUS_BLKID) { ASSERT3P(parent, ==, dn->dn_dbuf); - db->db.db_size = DN_MAX_BONUSLEN - + db->db.db_size = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) - (dn->dn_nblkptr-1) * sizeof (blkptr_t); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); db->db.db_offset = DMU_BONUS_BLKID; @@ -2810,7 +2818,7 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) return; if (db->db_blkid == DMU_SPILL_BLKID) { - db->db_blkptr = &dn->dn_phys->dn_spill; + db->db_blkptr = DN_SPILL_BLKPTR(dn->dn_phys); BP_ZERO(db->db_blkptr); return; } @@ -2950,13 +2958,16 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) ASSERT(*datap != NULL); ASSERT0(db->db_level); - ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); + ASSERT3U(dn->dn_phys->dn_bonuslen, <=, + DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1)); bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); DB_DNODE_EXIT(db); if (*datap != db->db.db_data) { - zio_buf_free(*datap, DN_MAX_BONUSLEN); - arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); + int slots = 
DB_DNODE(db)->dn_num_slots; + int bonuslen = DN_SLOTS_TO_BONUSLEN(slots); + zio_buf_free(*datap, bonuslen); + arc_space_return(bonuslen, ARC_SPACE_OTHER); } db->db_data_pending = NULL; drp = &db->db_last_dirty; @@ -3107,7 +3118,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) if (db->db_blkid == DMU_SPILL_BLKID) { ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); ASSERT(!(BP_IS_HOLE(bp)) && - db->db_blkptr == &dn->dn_phys->dn_spill); + db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys)); } #endif @@ -3119,11 +3130,16 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) mutex_exit(&dn->dn_mtx); if (dn->dn_type == DMU_OT_DNODE) { - dnode_phys_t *dnp = db->db.db_data; - for (i = db->db.db_size >> DNODE_SHIFT; i > 0; - i--, dnp++) { - if (dnp->dn_type != DMU_OT_NONE) + i = 0; + while (i < db->db.db_size) { + dnode_phys_t *dnp = db->db.db_data + i; + + i += DNODE_MIN_SIZE; + if (dnp->dn_type != DMU_OT_NONE) { fill++; + i += dnp->dn_extra_slots * + DNODE_MIN_SIZE; + } } } else { if (BP_IS_HOLE(bp)) { @@ -3270,7 +3286,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) dn = DB_DNODE(db); ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && - db->db_blkptr == &dn->dn_phys->dn_spill); + db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys)); DB_DNODE_EXIT(db); } #endif diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index a423264c6..e1dfb41ff 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -180,7 +180,7 @@ dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, int dmu_bonus_max(void) { - return (DN_MAX_BONUSLEN); + return (DN_OLD_MAX_BONUSLEN); } int @@ -1853,6 +1853,7 @@ __dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) doi->doi_type = dn->dn_type; doi->doi_bonus_type = dn->dn_bonustype; doi->doi_bonus_size = dn->dn_bonuslen; + doi->doi_dnodesize = dn->dn_num_slots << DNODE_SHIFT; doi->doi_indirection = dn->dn_nlevels; doi->doi_checksum = dn->dn_checksum; doi->doi_compress = 
dn->dn_compress; @@ -1924,9 +1925,21 @@ dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize, dn = DB_DNODE(db); *blksize = dn->dn_datablksz; - /* add 1 for dnode space */ + /* add in number of slots used for the dnode itself */ *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >> - SPA_MINBLOCKSHIFT) + 1; + SPA_MINBLOCKSHIFT) + dn->dn_num_slots; + DB_DNODE_EXIT(db); +} + +void +dmu_object_dnsize_from_db(dmu_buf_t *db_fake, int *dnsize) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + *dnsize = dn->dn_num_slots << DNODE_SHIFT; DB_DNODE_EXIT(db); } @@ -2020,6 +2033,7 @@ EXPORT_SYMBOL(dmu_object_info); EXPORT_SYMBOL(dmu_object_info_from_dnode); EXPORT_SYMBOL(dmu_object_info_from_db); EXPORT_SYMBOL(dmu_object_size_from_db); +EXPORT_SYMBOL(dmu_object_dnsize_from_db); EXPORT_SYMBOL(dmu_object_set_blocksize); EXPORT_SYMBOL(dmu_object_set_checksum); EXPORT_SYMBOL(dmu_object_set_compress); diff --git a/module/zfs/dmu_object.c b/module/zfs/dmu_object.c index a5a53418b..e54043fc3 100644 --- a/module/zfs/dmu_object.c +++ b/module/zfs/dmu_object.c @@ -30,28 +30,55 @@ #include <sys/dnode.h> #include <sys/zap.h> #include <sys/zfeature.h> +#include <sys/dsl_dataset.h> uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { + return dmu_object_alloc_dnsize(os, ot, blocksize, bonustype, bonuslen, + 0, tx); +} + +uint64_t +dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) +{ uint64_t object; uint64_t L1_dnode_count = DNODES_PER_BLOCK << (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT); dnode_t *dn = NULL; + int dn_slots = dnodesize >> DNODE_SHIFT; + boolean_t restarted = B_FALSE; + + if (dn_slots == 0) { + dn_slots = DNODE_MIN_SLOTS; + } else { + ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS); + ASSERT3S(dn_slots, <=, 
DNODE_MAX_SLOTS); + } mutex_enter(&os->os_obj_lock); for (;;) { object = os->os_obj_next; /* * Each time we polish off a L1 bp worth of dnodes (2^12 - * objects), move to another L1 bp that's still reasonably - * sparse (at most 1/4 full). Look from the beginning at most - * once per txg, but after that keep looking from here. + * objects), move to another L1 bp that's still + * reasonably sparse (at most 1/4 full). Look from the + * beginning at most once per txg. If we still can't + * allocate from that L1 block, search for an empty L0 + * block, which will quickly skip to the end of the + * metadnode if the no nearby L0 blocks are empty. This + * fallback avoids a pathology where full dnode blocks + * containing large dnodes appear sparse because they + * have a low blk_fill, leading to many failed + * allocation attempts. In the long term a better + * mechanism to search for sparse metadnode regions, + * such as spacemaps, could be implemented. + * * os_scan_dnodes is set during txg sync if enough objects * have been freed since the previous rescan to justify - * backfilling again. If we can't find a suitable block, just - * keep going from here. + * backfilling again. * * Note that dmu_traverse depends on the behavior that we use * multiple blocks of the dnode object before going back to @@ -59,9 +86,10 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, * that property or find another solution to the issues * described in traverse_visitbp. */ - if (P2PHASE(object, L1_dnode_count) == 0) { uint64_t offset; + uint64_t blkfill; + int minlvl; int error; if (os->os_rescan_dnodes) { offset = 0; @@ -69,13 +97,15 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, } else { offset = object << DNODE_SHIFT; } + blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2; + minlvl = restarted ? 
1 : 2; + restarted = B_TRUE; error = dnode_next_offset(DMU_META_DNODE(os), - DNODE_FIND_HOLE, - &offset, 2, DNODES_PER_BLOCK >> 2, 0); + DNODE_FIND_HOLE, &offset, minlvl, blkfill, 0); if (error == 0) object = offset >> DNODE_SHIFT; } - os->os_obj_next = ++object; + os->os_obj_next = object + dn_slots; /* * XXX We should check for an i/o error here and return @@ -83,16 +113,22 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, * dmu_tx_assign(), but there is currently no mechanism * to do so. */ - (void) dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, + (void) dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots, FTAG, &dn); if (dn) break; if (dmu_object_next(os, &object, B_TRUE, 0) == 0) - os->os_obj_next = object - 1; + os->os_obj_next = object; + else + /* + * Skip to next known valid starting point for a dnode. + */ + os->os_obj_next = P2ROUNDUP(object + 1, + DNODES_PER_BLOCK); } - dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx); + dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx); dnode_rele(dn, FTAG); mutex_exit(&os->os_obj_lock); @@ -105,16 +141,33 @@ int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { + return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype, + bonuslen, 0, tx)); +} + +int +dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, + int blocksize, dmu_object_type_t bonustype, int bonuslen, + int dnodesize, dmu_tx_t *tx) +{ dnode_t *dn; + int dn_slots = dnodesize >> DNODE_SHIFT; int err; + if (dn_slots == 0) + dn_slots = DNODE_MIN_SLOTS; + ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS); + ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS); + if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx)) return (SET_ERROR(EBADF)); - err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, FTAG, &dn); + err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots, + FTAG, &dn); 
if (err) return (err); - dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx); + + dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx); dnode_rele(dn, FTAG); dmu_tx_add_new_object(tx, os, object); @@ -125,23 +178,34 @@ int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { + return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype, + bonuslen, 0, tx)); +} + +int +dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, + int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize, + dmu_tx_t *tx) +{ dnode_t *dn; + int dn_slots = dnodesize >> DNODE_SHIFT; int err; if (object == DMU_META_DNODE_OBJECT) return (SET_ERROR(EBADF)); - err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, + err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, FTAG, &dn); if (err) return (err); - dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, tx); + dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots, tx); dnode_rele(dn, FTAG); return (err); } + int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx) { @@ -150,7 +214,7 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx) ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); - err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, + err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, FTAG, &dn); if (err) return (err); @@ -171,9 +235,30 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx) int dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg) { - uint64_t offset = (*objectp + 1) << DNODE_SHIFT; + uint64_t offset; + dmu_object_info_t doi; + struct dsl_dataset *ds = os->os_dsl_dataset; + int dnodesize; int error; + /* + * Avoid expensive dnode hold if this dataset doesn't use large dnodes. 
+ */ + if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) { + error = dmu_object_info(os, *objectp, &doi); + if (error && !(error == EINVAL && *objectp == 0)) + return (SET_ERROR(error)); + else + dnodesize = doi.doi_dnodesize; + } else { + dnodesize = DNODE_MIN_SIZE; + } + + if (*objectp == 0) + offset = 1 << DNODE_SHIFT; + else + offset = (*objectp << DNODE_SHIFT) + dnodesize; + error = dnode_next_offset(DMU_META_DNODE(os), (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg); @@ -235,8 +320,11 @@ dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx) #if defined(_KERNEL) && defined(HAVE_SPL) EXPORT_SYMBOL(dmu_object_alloc); +EXPORT_SYMBOL(dmu_object_alloc_dnsize); EXPORT_SYMBOL(dmu_object_claim); +EXPORT_SYMBOL(dmu_object_claim_dnsize); EXPORT_SYMBOL(dmu_object_reclaim); +EXPORT_SYMBOL(dmu_object_reclaim_dnsize); EXPORT_SYMBOL(dmu_object_free); EXPORT_SYMBOL(dmu_object_next); EXPORT_SYMBOL(dmu_object_zapify); diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index 03b30dd3b..cdc897726 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -138,6 +138,12 @@ dmu_objset_id(objset_t *os) return (ds ? ds->ds_object : 0); } +uint64_t +dmu_objset_dnodesize(objset_t *os) +{ + return (os->os_dnodesize); +} + zfs_sync_type_t dmu_objset_syncprop(objset_t *os) { @@ -268,6 +274,34 @@ redundant_metadata_changed_cb(void *arg, uint64_t newval) } static void +dnodesize_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + switch (newval) { + case ZFS_DNSIZE_LEGACY: + os->os_dnodesize = DNODE_MIN_SIZE; + break; + case ZFS_DNSIZE_AUTO: + /* + * Choose a dnode size that will work well for most + * workloads if the user specified "auto". Future code + * improvements could dynamically select a dnode size + * based on observed workload patterns. 
+ */ + os->os_dnodesize = DNODE_MIN_SIZE * 2; + break; + case ZFS_DNSIZE_1K: + case ZFS_DNSIZE_2K: + case ZFS_DNSIZE_4K: + case ZFS_DNSIZE_8K: + case ZFS_DNSIZE_16K: + os->os_dnodesize = newval; + break; + } +} + +static void logbias_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; @@ -421,6 +455,11 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, zfs_prop_to_name(ZFS_PROP_RECORDSIZE), recordsize_changed_cb, os); } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_DNODESIZE), + dnodesize_changed_cb, os); + } } if (err != 0) { VERIFY(arc_buf_remove_ref(os->os_phys_buf, @@ -439,6 +478,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, os->os_sync = ZFS_SYNC_STANDARD; os->os_primary_cache = ZFS_CACHE_ALL; os->os_secondary_cache = ZFS_CACHE_ALL; + os->os_dnodesize = DNODE_MIN_SIZE; } if (ds == NULL || !ds->ds_is_snapshot) @@ -768,8 +808,8 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, mdn = DMU_META_DNODE(os); - dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT, - DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx); + dnode_allocate(mdn, DMU_OT_DNODE, DNODE_BLOCK_SIZE, DN_MAX_INDBLKSHIFT, + DMU_OT_NONE, 0, DNODE_MIN_SLOTS, tx); /* * We don't want to have to increase the meta-dnode's nlevels @@ -1202,7 +1242,7 @@ do_userquota_update(objset_t *os, uint64_t used, uint64_t flags, uint64_t user, uint64_t group, boolean_t subtract, dmu_tx_t *tx) { if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) { - int64_t delta = DNODE_SIZE + used; + int64_t delta = DNODE_MIN_SIZE + used; if (subtract) delta = -delta; VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT, @@ -2023,6 +2063,7 @@ EXPORT_SYMBOL(dmu_objset_find); EXPORT_SYMBOL(dmu_objset_byteswap); EXPORT_SYMBOL(dmu_objset_evict_dbufs); EXPORT_SYMBOL(dmu_objset_snap_cmtime); +EXPORT_SYMBOL(dmu_objset_dnodesize); EXPORT_SYMBOL(dmu_objset_sync); EXPORT_SYMBOL(dmu_objset_is_dirty); diff --git a/module/zfs/dmu_send.c 
b/module/zfs/dmu_send.c index 896a84b50..901386a5a 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -445,6 +445,7 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) drro->drr_bonustype = dnp->dn_bonustype; drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; drro->drr_bonuslen = dnp->dn_bonuslen; + drro->drr_dn_slots = dnp->dn_extra_slots + 1; drro->drr_checksumtype = dnp->dn_checksum; drro->drr_compress = dnp->dn_compress; drro->drr_toguid = dsp->dsa_toguid; @@ -570,7 +571,6 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) spa_t *spa = ds->ds_dir->dd_pool->dp_spa; dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE; int err = 0; - dnode_phys_t *blk; uint64_t dnobj; ASSERT3U(zb->zb_level, >=, 0); @@ -590,7 +590,8 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) { return (0); } else if (type == DMU_OT_DNODE) { - int blksz = BP_GET_LSIZE(bp); + dnode_phys_t *blk; + int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; int i; @@ -603,8 +604,8 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) return (SET_ERROR(EIO)); blk = abuf->b_data; - dnobj = zb->zb_blkid * (blksz >> DNODE_SHIFT); - for (i = 0; i < blksz >> DNODE_SHIFT; i++) { + dnobj = zb->zb_blkid * epb; + for (i = 0; i < epb; i += blk[i].dn_extra_slots + 1) { err = dump_dnode(dsa, dnobj + i, blk + i); if (err != 0) break; @@ -736,6 +737,8 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, if (large_block_ok && to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_BLOCKS]) featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS; + if (to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) + featureflags |= DMU_BACKUP_FEATURE_LARGE_DNODE; if (embedok && spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) { featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA; @@ -1252,6 +1255,15 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) 
!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS)) return (SET_ERROR(ENOTSUP)); + /* + * The receiving code doesn't know how to translate large dnodes + * to smaller ones, so the pool must have the LARGE_DNODE + * feature enabled if the stream has LARGE_DNODE. + */ + if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) && + !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_DNODE)) + return (SET_ERROR(ENOTSUP)); + error = dsl_dataset_hold(dp, tofs, FTAG, &ds); if (error == 0) { /* target fs already exists; recv into temp clone */ @@ -1658,7 +1670,8 @@ deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size) return (1); } else { return (1 + - ((DN_MAX_BONUSLEN - bonus_size) >> SPA_BLKPTRSHIFT)); + ((DN_OLD_MAX_BONUSLEN - + MIN(DN_OLD_MAX_BONUSLEN, bonus_size)) >> SPA_BLKPTRSHIFT)); } } @@ -1679,7 +1692,8 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || drro->drr_blksz < SPA_MINBLOCKSIZE || drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) || - drro->drr_bonuslen > DN_MAX_BONUSLEN) { + drro->drr_bonuslen > + DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(rwa->os)))) { return (SET_ERROR(EINVAL)); } @@ -1719,9 +1733,10 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, if (object == DMU_NEW_OBJECT) { /* currently free, want to be allocated */ - err = dmu_object_claim(rwa->os, drro->drr_object, + err = dmu_object_claim_dnsize(rwa->os, drro->drr_object, drro->drr_type, drro->drr_blksz, - drro->drr_bonustype, drro->drr_bonuslen, tx); + drro->drr_bonustype, drro->drr_bonuslen, + drro->drr_dn_slots << DNODE_SHIFT, tx); } else if (drro->drr_type != doi.doi_type || drro->drr_blksz != doi.doi_data_block_size || drro->drr_bonustype != doi.doi_bonus_type || @@ -1771,18 +1786,25 @@ receive_freeobjects(struct receive_writer_arg *rwa, if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) return (SET_ERROR(EINVAL)); - for (obj = 
drrfo->drr_firstobj; + for (obj = drrfo->drr_firstobj == 0 ? 1 : drrfo->drr_firstobj; obj < drrfo->drr_firstobj + drrfo->drr_numobjs; (void) dmu_object_next(rwa->os, &obj, FALSE, 0)) { + dmu_object_info_t doi; int err; - if (dmu_object_info(rwa->os, obj, NULL) != 0) + err = dmu_object_info(rwa->os, obj, &doi); + if (err == ENOENT) { + obj++; continue; + } else if (err != 0) { + return (err); + } err = dmu_free_long_object(rwa->os, obj); if (err != 0) return (err); } + return (0); } diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index bba9efe14..44ba74181 100644 --- a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -331,13 +331,13 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, goto post; cdnp = buf->b_data; - for (i = 0; i < epb; i++) { + for (i = 0; i < epb; i += cdnp[i].dn_extra_slots + 1) { prefetch_dnode_metadata(td, &cdnp[i], zb->zb_objset, zb->zb_blkid * epb + i); } /* recursively visitbp() blocks below this */ - for (i = 0; i < epb; i++) { + for (i = 0; i < epb; i += cdnp[i].dn_extra_slots + 1) { err = traverse_dnode(td, &cdnp[i], zb->zb_objset, zb->zb_blkid * epb + i); if (err != 0) @@ -439,7 +439,7 @@ prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp, if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); - traverse_prefetch_metadata(td, &dnp->dn_spill, &czb); + traverse_prefetch_metadata(td, DN_SPILL_BLKPTR(dnp), &czb); } } @@ -470,7 +470,7 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, if (err == 0 && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) { SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); - err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb); + err = traverse_visitbp(td, dnp, DN_SPILL_BLKPTR(dnp), &czb); } if (err == 0 && (td->td_flags & TRAVERSE_POST)) { diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 74e323dbd..ed29bfbc6 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -1586,7 
+1586,7 @@ dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object) } else { blkptr_t *bp; - bp = &dn->dn_phys->dn_spill; + bp = DN_SPILL_BLKPTR(dn->dn_phys); if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, bp, bp->blk_birth)) txh->txh_space_tooverwrite += SPA_OLD_MAXBLOCKSIZE; @@ -1618,7 +1618,7 @@ dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize) dmu_tx_sa_registration_hold(sa, tx); - if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill) + if (attrsize <= DN_OLD_MAX_BONUSLEN && !sa->sa_force_spill) return; (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT, diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 38bcecd46..975bd5fb8 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -248,6 +248,7 @@ dnode_verify(dnode_t *dn) } if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) { int i; + int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT); if (dn->dn_datablkshift) { ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT); @@ -258,12 +259,12 @@ dnode_verify(dnode_t *dn) ASSERT(DMU_OT_IS_VALID(dn->dn_type)); ASSERT3U(dn->dn_nblkptr, >=, 1); ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); - ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN); + ASSERT3U(dn->dn_bonuslen, <=, max_bonuslen); ASSERT3U(dn->dn_datablksz, ==, dn->dn_datablkszsec << SPA_MINBLOCKSHIFT); ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0); ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) + - dn->dn_bonuslen, <=, DN_MAX_BONUSLEN); + dn->dn_bonuslen, <=, max_bonuslen); for (i = 0; i < TXG_SIZE; i++) { ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels); } @@ -294,6 +295,7 @@ dnode_byteswap(dnode_phys_t *dnp) dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec); dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen); + dnp->dn_extra_slots = BSWAP_8(dnp->dn_extra_slots); dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid); dnp->dn_used = BSWAP_64(dnp->dn_used); @@ -320,7 +322,8 @@ 
dnode_byteswap(dnode_phys_t *dnp) * dnode buffer). */ int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t); - size_t len = DN_MAX_BONUSLEN - off; + int slots = dnp->dn_extra_slots + 1; + size_t len = DN_SLOTS_TO_BONUSLEN(slots) - off; dmu_object_byteswap_t byteswap; ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype)); byteswap = DMU_OT_BYTESWAP(dnp->dn_bonustype); @@ -329,23 +332,24 @@ dnode_byteswap(dnode_phys_t *dnp) /* Swap SPILL block if we have one */ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) - byteswap_uint64_array(&dnp->dn_spill, sizeof (blkptr_t)); - + byteswap_uint64_array(DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t)); } void dnode_buf_byteswap(void *vbuf, size_t size) { - dnode_phys_t *buf = vbuf; - int i; + int i = 0; ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT)); ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0); - size >>= DNODE_SHIFT; - for (i = 0; i < size; i++) { - dnode_byteswap(buf); - buf++; + while (i < size) { + dnode_phys_t *dnp = vbuf + i; + dnode_byteswap(dnp); + + i += DNODE_MIN_SIZE; + if (dnp->dn_type != DMU_OT_NONE) + i += dnp->dn_extra_slots * DNODE_MIN_SIZE; } } @@ -356,7 +360,7 @@ dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx) dnode_setdirty(dn, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - ASSERT3U(newsize, <=, DN_MAX_BONUSLEN - + ASSERT3U(newsize, <=, DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) - (dn->dn_nblkptr-1) * sizeof (blkptr_t)); dn->dn_bonuslen = newsize; if (newsize == 0) @@ -434,6 +438,7 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, dn->dn_compress = dnp->dn_compress; dn->dn_bonustype = dnp->dn_bonustype; dn->dn_bonuslen = dnp->dn_bonuslen; + dn->dn_num_slots = dnp->dn_extra_slots + 1; dn->dn_maxblkid = dnp->dn_maxblkid; dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0); dn->dn_id_flags = 0; @@ -534,10 +539,13 @@ dnode_destroy(dnode_t *dn) void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) + 
dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx) { int i; + ASSERT3U(dn_slots, >, 0); + ASSERT3U(dn_slots << DNODE_SHIFT, <=, + spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))); ASSERT3U(blocksize, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset))); if (blocksize == 0) @@ -550,8 +558,8 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT); - dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d\n", dn->dn_objset, - dn->dn_object, tx->tx_txg, blocksize, ibs); + dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d dn_slots=%d\n", + dn->dn_objset, dn->dn_object, tx->tx_txg, blocksize, ibs, dn_slots); ASSERT(dn->dn_type == DMU_OT_NONE); ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0); @@ -562,7 +570,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, (bonustype == DMU_OT_SA && bonuslen == 0) || (bonustype != DMU_OT_NONE && bonuslen != 0)); ASSERT(DMU_OT_IS_VALID(bonustype)); - ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN); + ASSERT3U(bonuslen, <=, DN_SLOTS_TO_BONUSLEN(dn_slots)); ASSERT(dn->dn_type == DMU_OT_NONE); ASSERT0(dn->dn_maxblkid); ASSERT0(dn->dn_allocated_txg); @@ -588,11 +596,15 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, dnode_setdblksz(dn, blocksize); dn->dn_indblkshift = ibs; dn->dn_nlevels = 1; + dn->dn_num_slots = dn_slots; if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */ dn->dn_nblkptr = 1; - else - dn->dn_nblkptr = 1 + - ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); + else { + dn->dn_nblkptr = MIN(DN_MAX_NBLKPTR, + 1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >> + SPA_BLKPTRSHIFT)); + } + dn->dn_bonustype = bonustype; dn->dn_bonuslen = bonuslen; dn->dn_checksum = ZIO_CHECKSUM_INHERIT; @@ -617,7 +629,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, - 
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) + dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx) { int nblkptr; @@ -631,7 +643,10 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, (bonustype != DMU_OT_NONE && bonuslen != 0) || (bonustype == DMU_OT_SA && bonuslen == 0)); ASSERT(DMU_OT_IS_VALID(bonustype)); - ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN); + ASSERT3U(bonuslen, <=, + DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset)))); + + dn_slots = dn_slots > 0 ? dn_slots : DNODE_MIN_SLOTS; /* clean up any unreferenced dbufs */ dnode_evict_dbufs(dn); @@ -654,7 +669,9 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */ nblkptr = 1; else - nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); + nblkptr = MIN(DN_MAX_NBLKPTR, + 1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >> + SPA_BLKPTRSHIFT)); if (dn->dn_bonustype != bonustype) dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype; if (dn->dn_nblkptr != nblkptr) @@ -672,6 +689,7 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, mutex_enter(&dn->dn_mtx); dn->dn_bonustype = bonustype; dn->dn_bonuslen = bonuslen; + dn->dn_num_slots = dn_slots; dn->dn_nblkptr = nblkptr; dn->dn_checksum = ZIO_CHECKSUM_INHERIT; dn->dn_compress = ZIO_COMPRESS_INHERIT; @@ -680,7 +698,8 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, /* fix up the bonus db_size */ if (dn->dn_bonus) { dn->dn_bonus->db.db_size = - DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t); + DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) - + (dn->dn_nblkptr-1) * sizeof (blkptr_t); ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size); } @@ -1053,24 +1072,150 @@ dnode_buf_pageout(void *dbu) } /* + * Return true if the given index is interior to a dnode already + * allocated in the block. That is, the index is neither free nor + * allocated, but is consumed by a large dnode. 
+ * + * The dnode_phys_t buffer may not be in sync with the in-core dnode + * structure, so we try to check the dnode structure first and fall back + * to the dnode_phys_t buffer if it doesn't exist. + */ +static boolean_t +dnode_is_consumed(dmu_buf_impl_t *db, int idx) +{ + dnode_handle_t *dnh; + dmu_object_type_t ot; + dnode_children_t *children_dnodes; + dnode_phys_t *dn_block; + int skip; + int i; + + children_dnodes = dmu_buf_get_user(&db->db); + dn_block = (dnode_phys_t *)db->db.db_data; + + for (i = 0; i < idx; i += skip) { + dnh = &children_dnodes->dnc_children[i]; + + zrl_add(&dnh->dnh_zrlock); + if (dnh->dnh_dnode != NULL) { + ot = dnh->dnh_dnode->dn_type; + skip = dnh->dnh_dnode->dn_num_slots; + } else { + ot = dn_block[i].dn_type; + skip = dn_block[i].dn_extra_slots + 1; + } + zrl_remove(&dnh->dnh_zrlock); + + if (ot == DMU_OT_NONE) + skip = 1; + } + + return (i > idx); +} + +/* + * Return true if the given index in the dnode block is a valid + * allocated dnode. That is, the index is not consumed by a large + * dnode and is not free. + * + * The dnode_phys_t buffer may not be in sync with the in-core dnode + * structure, so we try to check the dnode structure first and fall back + * to the dnode_phys_t buffer if it doesn't exist. + */ +static boolean_t +dnode_is_allocated(dmu_buf_impl_t *db, int idx) +{ + dnode_handle_t *dnh; + dmu_object_type_t ot; + dnode_children_t *children_dnodes; + dnode_phys_t *dn_block; + + if (dnode_is_consumed(db, idx)) + return (B_FALSE); + + children_dnodes = dmu_buf_get_user(&db->db); + dn_block = (dnode_phys_t *)db->db.db_data; + + dnh = &children_dnodes->dnc_children[idx]; + + zrl_add(&dnh->dnh_zrlock); + if (dnh->dnh_dnode != NULL) + ot = dnh->dnh_dnode->dn_type; + else + ot = dn_block[idx].dn_type; + zrl_remove(&dnh->dnh_zrlock); + + return (ot != DMU_OT_NONE); +} + +/* + * Return true if the given range of indices in the dnode block are + * free.
That is, the starting index is not consumed by a large dnode + * and none of the indices are allocated. + * + * The dnode_phys_t buffer may not be in sync with the in-core dnode + * structure, so we try to check the dnode structure first and fall back + * to the dnode_phys_t buffer if it doesn't exist. + */ +static boolean_t +dnode_is_free(dmu_buf_impl_t *db, int idx, int slots) +{ + dnode_handle_t *dnh; + dmu_object_type_t ot; + dnode_children_t *children_dnodes; + dnode_phys_t *dn_block; + int i; + + if (idx + slots > DNODES_PER_BLOCK) + return (B_FALSE); + + children_dnodes = dmu_buf_get_user(&db->db); + dn_block = (dnode_phys_t *)db->db.db_data; + + if (dnode_is_consumed(db, idx)) + return (B_FALSE); + + for (i = idx; i < idx + slots; i++) { + dnh = &children_dnodes->dnc_children[i]; + + zrl_add(&dnh->dnh_zrlock); + if (dnh->dnh_dnode != NULL) + ot = dnh->dnh_dnode->dn_type; + else + ot = dn_block[i].dn_type; + zrl_remove(&dnh->dnh_zrlock); + + if (ot != DMU_OT_NONE) + return (B_FALSE); + } + + return (B_TRUE); +} + +/* * errors: * EINVAL - invalid object number. + * ENOSPC - hole too small to fulfill "slots" request * EIO - i/o error. * succeeds even for free dnodes.
*/ int -dnode_hold_impl(objset_t *os, uint64_t object, int flag, +dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, void *tag, dnode_t **dnp) { - int epb, idx, err; + int epb, idx, err, i; int drop_struct_lock = FALSE; int type; uint64_t blk; dnode_t *mdn, *dn; dmu_buf_impl_t *db; dnode_children_t *children_dnodes; + dnode_phys_t *dn_block_begin; dnode_handle_t *dnh; + ASSERT(!(flag & DNODE_MUST_BE_ALLOCATED) || (slots == 0)); + ASSERT(!(flag & DNODE_MUST_BE_FREE) || (slots > 0)); + /* * If you are holding the spa config lock as writer, you shouldn't * be asking the DMU to do *anything* unless it's the root pool @@ -1126,12 +1271,9 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT); epb = db->db.db_size >> DNODE_SHIFT; - idx = object & (epb-1); - ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE); children_dnodes = dmu_buf_get_user(&db->db); if (children_dnodes == NULL) { - int i; dnode_children_t *winner; children_dnodes = kmem_zalloc(sizeof (dnode_children_t) + epb * sizeof (dnode_handle_t), KM_SLEEP); @@ -1156,21 +1298,28 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, } ASSERT(children_dnodes->dnc_count == epb); + idx = object & (epb - 1); + dn_block_begin = (dnode_phys_t *)db->db.db_data; + + if ((flag & DNODE_MUST_BE_FREE) && !dnode_is_free(db, idx, slots)) { + dbuf_rele(db, FTAG); + return (ENOSPC); + } else if ((flag & DNODE_MUST_BE_ALLOCATED) && + !dnode_is_allocated(db, idx)) { + dbuf_rele(db, FTAG); + return (ENOENT); + } + dnh = &children_dnodes->dnc_children[idx]; zrl_add(&dnh->dnh_zrlock); dn = dnh->dnh_dnode; - if (dn == NULL) { - dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx; - - dn = dnode_create(os, phys, db, object, dnh); - } + if (dn == NULL) + dn = dnode_create(os, dn_block_begin + idx, db, object, dnh); mutex_enter(&dn->dn_mtx); type = dn->dn_type; if (dn->dn_free_txg || - ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) || - ((flag & 
DNODE_MUST_BE_FREE) && - (type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) { + ((flag & DNODE_MUST_BE_FREE) && !refcount_is_zero(&dn->dn_holds))) { mutex_exit(&dn->dn_mtx); zrl_remove(&dnh->dnh_zrlock); dbuf_rele(db, FTAG); @@ -1198,7 +1347,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp) { - return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp)); + return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, tag, + dnp)); } /* @@ -1908,17 +2058,21 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, error = SET_ERROR(ESRCH); } else if (lvl == 0) { dnode_phys_t *dnp = data; - span = DNODE_SHIFT; + ASSERT(dn->dn_type == DMU_OT_DNODE); + ASSERT(!(flags & DNODE_FIND_BACKWARDS)); - for (i = (*offset >> span) & (blkfill - 1); - i >= 0 && i < blkfill; i += inc) { + for (i = (*offset >> DNODE_SHIFT) & (blkfill - 1); + i < blkfill; i += dnp[i].dn_extra_slots + 1) { if ((dnp[i].dn_type == DMU_OT_NONE) == hole) break; - *offset += (1ULL << span) * inc; } - if (i < 0 || i == blkfill) + + if (i == blkfill) error = SET_ERROR(ESRCH); + + *offset = (*offset & ~(DNODE_BLOCK_SIZE - 1)) + + (i << DNODE_SHIFT); } else { blkptr_t *bp = data; uint64_t start = *offset; diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c index bea7be186..54066e2e3 100644 --- a/module/zfs/dnode_sync.c +++ b/module/zfs/dnode_sync.c @@ -524,7 +524,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) ASSERT(dn->dn_free_txg > 0); if (dn->dn_allocated_txg != dn->dn_free_txg) dmu_buf_will_dirty(&dn->dn_dbuf->db, tx); - bzero(dn->dn_phys, sizeof (dnode_phys_t)); + bzero(dn->dn_phys, sizeof (dnode_phys_t) * dn->dn_num_slots); mutex_enter(&dn->dn_mtx); dn->dn_type = DMU_OT_NONE; @@ -559,7 +559,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) ASSERT(dmu_tx_is_syncing(tx)); ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg); ASSERT(dnp->dn_type != DMU_OT_NONE || - bcmp(dnp, 
&zerodn, DNODE_SIZE) == 0); + bcmp(dnp, &zerodn, DNODE_MIN_SIZE) == 0); DNODE_VERIFY(dn); ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf)); @@ -591,6 +591,9 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) dnp->dn_bonustype = dn->dn_bonustype; dnp->dn_bonuslen = dn->dn_bonuslen; } + + dnp->dn_extra_slots = dn->dn_num_slots - 1; + ASSERT(dnp->dn_nlevels > 1 || BP_IS_HOLE(&dnp->dn_blkptr[0]) || BP_IS_EMBEDDED(&dnp->dn_blkptr[0]) || @@ -623,7 +626,8 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) dnp->dn_bonuslen = 0; else dnp->dn_bonuslen = dn->dn_next_bonuslen[txgoff]; - ASSERT(dnp->dn_bonuslen <= DN_MAX_BONUSLEN); + ASSERT(dnp->dn_bonuslen <= + DN_SLOTS_TO_BONUSLEN(dnp->dn_extra_slots + 1)); dn->dn_next_bonuslen[txgoff] = 0; } @@ -662,7 +666,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) mutex_exit(&dn->dn_mtx); if (kill_spill) { - free_blocks(dn, &dn->dn_phys->dn_spill, 1, tx); + free_blocks(dn, DN_SPILL_BLKPTR(dn->dn_phys), 1, tx); mutex_enter(&dn->dn_mtx); dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR; mutex_exit(&dn->dn_mtx); @@ -687,6 +691,14 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) return; } + if (dn->dn_num_slots > DNODE_MIN_SLOTS) { + dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; + mutex_enter(&ds->ds_lock); + ds->ds_feature_activation_needed[SPA_FEATURE_LARGE_DNODE] = + B_TRUE; + mutex_exit(&ds->ds_lock); + } + if (dn->dn_next_nlevels[txgoff]) { dnode_increase_indirection(dn, tx); dn->dn_next_nlevels[txgoff] = 0; diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index b5e272fb9..72163521e 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -709,14 +709,18 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, scn->scn_phys.scn_errors++; return (err); } - for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) { + for (i = 0, cdnp = buf->b_data; i < epb; + i += cdnp->dn_extra_slots + 1, + cdnp += cdnp->dn_extra_slots + 1) { for (j = 0; j < cdnp->dn_nblkptr; j++) { blkptr_t *cbp = &cdnp->dn_blkptr[j]; 
dsl_scan_prefetch(scn, buf, cbp, zb->zb_objset, zb->zb_blkid * epb + i, j); } } - for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) { + for (i = 0, cdnp = buf->b_data; i < epb; + i += cdnp->dn_extra_slots + 1, + cdnp += cdnp->dn_extra_slots + 1) { dsl_scan_visitdnode(scn, ds, ostype, cdnp, zb->zb_blkid * epb + i, tx); } @@ -779,7 +783,7 @@ dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds, zbookmark_phys_t czb; SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object, 0, DMU_SPILL_BLKID); - dsl_scan_visitbp(&dnp->dn_spill, + dsl_scan_visitbp(DN_SPILL_BLKPTR(dnp), &czb, dnp, ds, scn, ostype, tx); } } diff --git a/module/zfs/sa.c b/module/zfs/sa.c index d6ac5fcc7..adc301512 100644 --- a/module/zfs/sa.c +++ b/module/zfs/sa.c @@ -33,6 +33,7 @@ #include <sys/dmu.h> #include <sys/dmu_impl.h> #include <sys/dmu_objset.h> +#include <sys/dmu_tx.h> #include <sys/dbuf.h> #include <sys/dnode.h> #include <sys/zap.h> @@ -553,12 +554,11 @@ sa_copy_data(sa_data_locator_t *func, void *datastart, void *target, int buflen) */ static int sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count, - dmu_buf_t *db, sa_buf_type_t buftype, int *index, int *total, - boolean_t *will_spill) + dmu_buf_t *db, sa_buf_type_t buftype, int full_space, int *index, + int *total, boolean_t *will_spill) { int var_size_count = 0; int i; - int full_space; int hdrsize; int extra_hdrsize; @@ -577,7 +577,6 @@ sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count, hdrsize = (SA_BONUSTYPE_FROM_DB(db) == DMU_OT_ZNODE) ? 0 : sizeof (sa_hdr_phys_t); - full_space = (buftype == SA_BONUS) ? 
DN_MAX_BONUSLEN : db->db_size; ASSERT(IS_P2ALIGNED(full_space, 8)); for (i = 0; i != attr_count; i++) { @@ -668,6 +667,7 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, void *data_start; sa_attr_type_t *attrs, *attrs_start; int i, lot_count; + int dnodesize; int spill_idx; int hdrsize; int spillhdrsize = 0; @@ -676,20 +676,23 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, sa_lot_t *lot; int len_idx; int spill_used; + int bonuslen; boolean_t spilling; dmu_buf_will_dirty(hdl->sa_bonus, tx); bonustype = SA_BONUSTYPE_FROM_DB(hdl->sa_bonus); + dmu_object_dnsize_from_db(hdl->sa_bonus, &dnodesize); + bonuslen = DN_BONUS_SIZE(dnodesize); /* first determine bonus header size and sum of all attributes */ hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus, - SA_BONUS, &spill_idx, &used, &spilling); + SA_BONUS, bonuslen, &spill_idx, &used, &spilling); if (used > SPA_OLD_MAXBLOCKSIZE) return (SET_ERROR(EFBIG)); - VERIFY(0 == dmu_set_bonus(hdl->sa_bonus, spilling ? - MIN(DN_MAX_BONUSLEN - sizeof (blkptr_t), used + hdrsize) : + VERIFY0(dmu_set_bonus(hdl->sa_bonus, spilling ? 
+ MIN(bonuslen - sizeof (blkptr_t), used + hdrsize) : used + hdrsize, tx)); ASSERT((bonustype == DMU_OT_ZNODE && spilling == 0) || @@ -706,8 +709,8 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, dmu_buf_will_dirty(hdl->sa_spill, tx); spillhdrsize = sa_find_sizes(sa, &attr_desc[spill_idx], - attr_count - spill_idx, hdl->sa_spill, SA_SPILL, &i, - &spill_used, &dummy); + attr_count - spill_idx, hdl->sa_spill, SA_SPILL, + hdl->sa_spill->db_size, &i, &spill_used, &dummy); if (spill_used > SPA_OLD_MAXBLOCKSIZE) return (SET_ERROR(EFBIG)); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index c23fd7a3a..d1aefe585 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -281,6 +281,14 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE); } + if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) { + spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, + DNODE_MAX_SIZE, ZPROP_SRC_NONE); + } else { + spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, + DNODE_MIN_SIZE, ZPROP_SRC_NONE); + } + if ((dp = list_head(&spa->spa_config_list)) != NULL) { if (dp->scd_path == NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, @@ -512,7 +520,8 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) /* * Must be ZPL, and its property settings * must be supported by GRUB (compression - * is not gzip, and large blocks are not used). + * is not gzip, and large blocks or large + * dnodes are not used). 
*/ if (dmu_objset_type(os) != DMU_OST_ZFS) { @@ -529,6 +538,12 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) &propval)) == 0 && propval > SPA_OLD_MAXBLOCKSIZE) { error = SET_ERROR(ENOTSUP); + } else if ((error = + dsl_prop_get_int_ds(dmu_objset_ds(os), + zfs_prop_to_name(ZFS_PROP_DNODESIZE), + &propval)) == 0 && + propval != ZFS_DNSIZE_LEGACY) { + error = SET_ERROR(ENOTSUP); } else { objnum = dmu_objset_id(os); } diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index e3e7e36fe..d1303b5c2 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -2000,6 +2000,15 @@ spa_maxblocksize(spa_t *spa) return (SPA_OLD_MAXBLOCKSIZE); } +int +spa_maxdnodesize(spa_t *spa) +{ + if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) + return (DNODE_MAX_SIZE); + else + return (DNODE_MIN_SIZE); +} + #if defined(_KERNEL) && defined(HAVE_SPL) /* Namespace manipulation */ EXPORT_SYMBOL(spa_lookup); @@ -2056,6 +2065,7 @@ EXPORT_SYMBOL(spa_bootfs); EXPORT_SYMBOL(spa_delegation); EXPORT_SYMBOL(spa_meta_objset); EXPORT_SYMBOL(spa_maxblocksize); +EXPORT_SYMBOL(spa_maxdnodesize); /* Miscellaneous support routines */ EXPORT_SYMBOL(spa_rename); diff --git a/module/zfs/zap.c b/module/zfs/zap.c index 454b4be62..9e4f05049 100644 --- a/module/zfs/zap.c +++ b/module/zfs/zap.c @@ -968,9 +968,17 @@ uint64_t zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj, const char *name, dmu_tx_t *tx) { + return (zap_create_link_dnsize(os, ot, parent_obj, name, 0, tx)); +} + +uint64_t +zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj, + const char *name, int dnodesize, dmu_tx_t *tx) +{ uint64_t new_obj; - VERIFY((new_obj = zap_create(os, ot, DMU_OT_NONE, 0, tx)) > 0); + VERIFY((new_obj = zap_create_dnsize(os, ot, DMU_OT_NONE, 0, + dnodesize, tx)) > 0); VERIFY0(zap_add(os, parent_obj, name, sizeof (uint64_t), 1, &new_obj, tx)); diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index 3faf27ce3..f3153cc18 100644 --- 
a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -630,8 +630,16 @@ int zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { - return (zap_create_claim_norm(os, obj, - 0, ot, bonustype, bonuslen, tx)); + return (zap_create_claim_dnsize(os, obj, ot, bonustype, bonuslen, + 0, tx)); +} + +int +zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) +{ + return (zap_create_claim_norm_dnsize(os, obj, + 0, ot, bonustype, bonuslen, dnodesize, tx)); } int @@ -639,9 +647,19 @@ zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { + return (zap_create_claim_norm_dnsize(os, obj, normflags, ot, bonustype, + bonuslen, 0, tx)); +} + +int +zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags, + dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, + int dnodesize, dmu_tx_t *tx) +{ int err; - err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx); + err = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen, + dnodesize, tx); if (err != 0) return (err); mzap_create_impl(os, obj, normflags, 0, tx); @@ -656,10 +674,27 @@ zap_create(objset_t *os, dmu_object_type_t ot, } uint64_t +zap_create_dnsize(objset_t *os, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) +{ + return (zap_create_norm_dnsize(os, 0, ot, bonustype, bonuslen, + dnodesize, tx)); +} + +uint64_t zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { - uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx); + return (zap_create_norm_dnsize(os, normflags, ot, bonustype, bonuslen, + 0, tx)); +} + +uint64_t +zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot, + 
dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) +{ + uint64_t obj = dmu_object_alloc_dnsize(os, ot, 0, bonustype, bonuslen, + dnodesize, tx); mzap_create_impl(os, obj, normflags, 0, tx); return (obj); @@ -670,7 +705,17 @@ zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { - uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx); + return (zap_create_flags_dnsize(os, normflags, flags, ot, + leaf_blockshift, indirect_blockshift, bonustype, bonuslen, 0, tx)); +} + +uint64_t +zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags, + dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) +{ + uint64_t obj = dmu_object_alloc_dnsize(os, ot, 0, bonustype, bonuslen, + dnodesize, tx); ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT && leaf_blockshift <= SPA_OLD_MAXBLOCKSHIFT && @@ -1458,10 +1503,14 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add, #if defined(_KERNEL) && defined(HAVE_SPL) EXPORT_SYMBOL(zap_create); +EXPORT_SYMBOL(zap_create_dnsize); EXPORT_SYMBOL(zap_create_norm); +EXPORT_SYMBOL(zap_create_norm_dnsize); EXPORT_SYMBOL(zap_create_flags); +EXPORT_SYMBOL(zap_create_flags_dnsize); EXPORT_SYMBOL(zap_create_claim); EXPORT_SYMBOL(zap_create_claim_norm); +EXPORT_SYMBOL(zap_create_claim_norm_dnsize); EXPORT_SYMBOL(zap_destroy); EXPORT_SYMBOL(zap_lookup); EXPORT_SYMBOL(zap_lookup_norm); diff --git a/module/zfs/zfeature_common.c b/module/zfs/zfeature_common.c index f57e5489c..3264f6235 100644 --- a/module/zfs/zfeature_common.c +++ b/module/zfs/zfeature_common.c @@ -242,4 +242,15 @@ zpool_feature_init(void) "Support for blocks larger than 128KB.", ZFEATURE_FLAG_PER_DATASET, large_blocks_deps); } + + { + static const spa_feature_t large_dnode_deps[] = { + 
SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_LARGE_DNODE, + "org.zfsonlinux:large_dnode", "large_dnode", + "Variable on-disk size of dnodes.", + ZFEATURE_FLAG_PER_DATASET, large_dnode_deps); + } } diff --git a/module/zfs/zfs_acl.c b/module/zfs/zfs_acl.c index 961083d4a..f820cdfd6 100644 --- a/module/zfs/zfs_acl.c +++ b/module/zfs/zfs_acl.c @@ -1394,7 +1394,7 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx) otype == DMU_OT_ACL ? DMU_OT_SYSACL : DMU_OT_NONE, otype == DMU_OT_ACL ? - DN_MAX_BONUSLEN : 0, tx); + DN_OLD_MAX_BONUSLEN : 0, tx); } else { (void) dmu_object_set_blocksize(zsb->z_os, aoid, aclp->z_acl_bytes, 0, tx); diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index c63af167a..30338ac14 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -3785,7 +3785,7 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) /* * If this is a bootable dataset then - * the we don't allow large (>128K) blocks, + * we don't allow large (>128K) blocks, * because GRUB doesn't support them. */ if (zfs_is_bootfs(dsname) && @@ -3813,6 +3813,34 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) } break; + case ZFS_PROP_DNODESIZE: + /* Dnode sizes above 512 need the feature to be enabled */ + if (nvpair_value_uint64(pair, &intval) == 0 && + intval != ZFS_DNSIZE_LEGACY) { + spa_t *spa; + + /* + * If this is a bootable dataset then + * we don't allow large (>512B) dnodes, + * because GRUB doesn't support them. 
+ */ + if (zfs_is_bootfs(dsname) && + intval != ZFS_DNSIZE_LEGACY) { + return (SET_ERROR(EDOM)); + } + + if ((err = spa_open(dsname, &spa, FTAG)) != 0) + return (err); + + if (!spa_feature_is_enabled(spa, + SPA_FEATURE_LARGE_DNODE)) { + spa_close(spa, FTAG); + return (SET_ERROR(ENOTSUP)); + } + spa_close(spa, FTAG); + } + break; + case ZFS_PROP_SHARESMB: if (zpl_earlier_version(dsname, ZPL_VERSION_FUID)) return (SET_ERROR(ENOTSUP)); diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c index 38d8de0eb..4d89cb04b 100644 --- a/module/zfs/zfs_log.c +++ b/module/zfs/zfs_log.c @@ -279,6 +279,8 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, lr = (lr_create_t *)&itx->itx_lr; lr->lr_doid = dzp->z_id; lr->lr_foid = zp->z_id; + /* Store dnode slot count in 8 bits above object id. */ + LR_FOID_SET_SLOTS(lr->lr_foid, zp->z_dnodesize >> DNODE_SHIFT); lr->lr_mode = zp->z_mode; if (!IS_EPHEMERAL(zp->z_uid)) { lr->lr_uid = (uint64_t)zp->z_uid; diff --git a/module/zfs/zfs_replay.c b/module/zfs/zfs_replay.c index b97a60ed8..54c175437 100644 --- a/module/zfs/zfs_replay.c +++ b/module/zfs/zfs_replay.c @@ -279,6 +279,8 @@ zfs_replay_create_acl(zfs_sb_t *zsb, lr_acl_create_t *lracl, boolean_t byteswap) void *fuidstart; size_t xvatlen = 0; uint64_t txtype; + uint64_t objid; + uint64_t dnodesize; int error; txtype = (lr->lr_common.lrc_txtype & ~TX_CI); @@ -304,19 +306,24 @@ zfs_replay_create_acl(zfs_sb_t *zsb, lr_acl_create_t *lracl, boolean_t byteswap) if ((error = zfs_zget(zsb, lr->lr_doid, &dzp)) != 0) return (error); + objid = LR_FOID_GET_OBJ(lr->lr_foid); + dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT; + xva_init(&xva); zfs_init_vattr(&xva.xva_vattr, ATTR_MODE | ATTR_UID | ATTR_GID, - lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid); + lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid); /* * All forms of zfs create (create, mkdir, mkxattrdir, symlink) * eventually end up in zfs_mknode(), which assigns the object's - * creation 
time and generation number. The generic zfs_create() - * doesn't have either concept, so we smuggle the values inside - * the vattr's otherwise unused va_ctime and va_nblocks fields. + * creation time, generation number, and dnode size. The generic + * zfs_create() has no concept of these attributes, so we smuggle + * the values inside the vattr's otherwise unused va_ctime, + * va_nblocks, and va_fsid fields. */ ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime); xva.xva_vattr.va_nblocks = lr->lr_gen; + xva.xva_vattr.va_fsid = dnodesize; error = dmu_object_info(zsb->z_os, lr->lr_foid, NULL); if (error != ENOENT) @@ -418,6 +425,8 @@ zfs_replay_create(zfs_sb_t *zsb, lr_create_t *lr, boolean_t byteswap) void *start; size_t xvatlen; uint64_t txtype; + uint64_t objid; + uint64_t dnodesize; int error; txtype = (lr->lr_common.lrc_txtype & ~TX_CI); @@ -431,21 +440,26 @@ zfs_replay_create(zfs_sb_t *zsb, lr_create_t *lr, boolean_t byteswap) if ((error = zfs_zget(zsb, lr->lr_doid, &dzp)) != 0) return (error); + objid = LR_FOID_GET_OBJ(lr->lr_foid); + dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT; + xva_init(&xva); zfs_init_vattr(&xva.xva_vattr, ATTR_MODE | ATTR_UID | ATTR_GID, - lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid); + lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid); /* * All forms of zfs create (create, mkdir, mkxattrdir, symlink) * eventually end up in zfs_mknode(), which assigns the object's - * creation time and generation number. The generic zfs_create() - * doesn't have either concept, so we smuggle the values inside - * the vattr's otherwise unused va_ctime and va_nblocks fields. + * creation time, generation number, and dnode slot count. The + * generic zfs_create() has no concept of these attributes, so + * we smuggle the values inside * the vattr's otherwise unused + * va_ctime, va_nblocks, and va_nlink fields. 
*/ ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime); xva.xva_vattr.va_nblocks = lr->lr_gen; + xva.xva_vattr.va_fsid = dnodesize; - error = dmu_object_info(zsb->z_os, lr->lr_foid, NULL); + error = dmu_object_info(zsb->z_os, objid, NULL); if (error != ENOENT) goto out; diff --git a/module/zfs/zfs_sa.c b/module/zfs/zfs_sa.c index f4841435b..f3eac51f8 100644 --- a/module/zfs/zfs_sa.c +++ b/module/zfs/zfs_sa.c @@ -97,8 +97,7 @@ zfs_sa_symlink(znode_t *zp, char *link, int len, dmu_tx_t *tx) dmu_buf_t *db = sa_get_db(zp->z_sa_hdl); if (ZFS_OLD_ZNODE_PHYS_SIZE + len <= dmu_bonus_max()) { - VERIFY(dmu_set_bonus(db, - len + ZFS_OLD_ZNODE_PHYS_SIZE, tx) == 0); + VERIFY0(dmu_set_bonus(db, len + ZFS_OLD_ZNODE_PHYS_SIZE, tx)); if (len) { bcopy(link, (caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, len); @@ -107,8 +106,8 @@ zfs_sa_symlink(znode_t *zp, char *link, int len, dmu_tx_t *tx) dmu_buf_t *dbp; zfs_grow_blocksize(zp, len, tx); - VERIFY(0 == dmu_buf_hold(ZTOZSB(zp)->z_os, - zp->z_id, 0, FTAG, &dbp, DMU_READ_NO_PREFETCH)); + VERIFY0(dmu_buf_hold(ZTOZSB(zp)->z_os, zp->z_id, 0, FTAG, &dbp, + DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(dbp, tx); diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c index 19cb414a9..310d4827b 100644 --- a/module/zfs/zfs_znode.c +++ b/module/zfs/zfs_znode.c @@ -62,6 +62,7 @@ #include <sys/dmu.h> #include <sys/dmu_objset.h> +#include <sys/dmu_tx.h> #include <sys/refcount.h> #include <sys/stat.h> #include <sys/zap.h> @@ -728,6 +729,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, timestruc_t now; uint64_t gen, obj; int bonuslen; + int dnodesize; sa_handle_t *sa_hdl; dmu_object_type_t obj_type; sa_bulk_attr_t *sa_attrs; @@ -739,15 +741,21 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, obj = vap->va_nodeid; now = vap->va_ctime; /* see zfs_replay_create() */ gen = vap->va_nblocks; /* ditto */ + dnodesize = vap->va_fsid; /* ditto */ } else { obj = 0; gethrestime(&now); gen = dmu_tx_get_txg(tx); + 
dnodesize = dmu_objset_dnodesize(zsb->z_os); } + if (dnodesize == 0) + dnodesize = DNODE_MIN_SIZE; + obj_type = zsb->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE; + bonuslen = (obj_type == DMU_OT_SA) ? - DN_MAX_BONUSLEN : ZFS_OLD_ZNODE_PHYS_SIZE; + DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE; /* * Create a new DMU object. @@ -760,23 +768,23 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, */ if (S_ISDIR(vap->va_mode)) { if (zsb->z_replay) { - VERIFY0(zap_create_claim_norm(zsb->z_os, obj, + VERIFY0(zap_create_claim_norm_dnsize(zsb->z_os, obj, zsb->z_norm, DMU_OT_DIRECTORY_CONTENTS, - obj_type, bonuslen, tx)); + obj_type, bonuslen, dnodesize, tx)); } else { - obj = zap_create_norm(zsb->z_os, + obj = zap_create_norm_dnsize(zsb->z_os, zsb->z_norm, DMU_OT_DIRECTORY_CONTENTS, - obj_type, bonuslen, tx); + obj_type, bonuslen, dnodesize, tx); } } else { if (zsb->z_replay) { - VERIFY0(dmu_object_claim(zsb->z_os, obj, + VERIFY0(dmu_object_claim_dnsize(zsb->z_os, obj, DMU_OT_PLAIN_FILE_CONTENTS, 0, - obj_type, bonuslen, tx)); + obj_type, bonuslen, dnodesize, tx)); } else { - obj = dmu_object_alloc(zsb->z_os, + obj = dmu_object_alloc_dnsize(zsb->z_os, DMU_OT_PLAIN_FILE_CONTENTS, 0, - obj_type, bonuslen, tx); + obj_type, bonuslen, dnodesize, tx); } } @@ -948,6 +956,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, (*zpp)->z_pflags = pflags; (*zpp)->z_mode = mode; + (*zpp)->z_dnodesize = dnodesize; if (obj_type == DMU_OT_ZNODE || acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) { @@ -1767,6 +1776,14 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) ASSERT(error == 0); /* + * Give dmu_object_alloc() a hint about where to start + * allocating new objects. Otherwise, since the metadnode's + * dnode_phys_t structure isn't initialized yet, dmu_object_next() + * would fail and we'd have to skip to the next dnode block. + */ + os->os_obj_next = moid + 1; + + /* * Set starting attributes. 
*/ version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os))); diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 289b23c7f..988ffec29 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -1372,7 +1372,8 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) itxg->itxg_sod += itx->itx_sod; } else { avl_tree_t *t = &itxs->i_async_tree; - uint64_t foid = ((lr_ooo_t *)&itx->itx_lr)->lr_foid; + uint64_t foid = + LR_FOID_GET_OBJ(((lr_ooo_t *)&itx->itx_lr)->lr_foid); itx_async_node_t *ian; avl_index_t where; @@ -1918,7 +1919,8 @@ zil_close(zilog_t *zilog) mutex_exit(&zilog->zl_lock); if (txg) txg_wait_synced(zilog->zl_dmu_pool, txg); - ASSERT(!zilog_is_dirty(zilog)); + if (txg < spa_freeze_txg(zilog->zl_spa)) + ASSERT(!zilog_is_dirty(zilog)); taskq_destroy(zilog->zl_clean_taskq); zilog->zl_clean_taskq = NULL; @@ -2122,7 +2124,7 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) */ if (TX_OOO(txtype)) { error = dmu_object_info(zilog->zl_os, - ((lr_ooo_t *)lr)->lr_foid, NULL); + LR_FOID_GET_OBJ(((lr_ooo_t *)lr)->lr_foid), NULL); if (error == ENOENT || error == EEXIST) return (0); } |