Diffstat (limited to 'module/zfs')
-rw-r--r--  module/zfs/dbuf.c              60
-rw-r--r--  module/zfs/dmu.c               20
-rw-r--r--  module/zfs/dmu_object.c       124
-rw-r--r--  module/zfs/dmu_objset.c        47
-rw-r--r--  module/zfs/dmu_send.c          42
-rw-r--r--  module/zfs/dmu_traverse.c       8
-rw-r--r--  module/zfs/dmu_tx.c             4
-rw-r--r--  module/zfs/dnode.c            238
-rw-r--r--  module/zfs/dnode_sync.c        20
-rw-r--r--  module/zfs/dsl_scan.c          10
-rw-r--r--  module/zfs/sa.c                21
-rw-r--r--  module/zfs/spa.c               17
-rw-r--r--  module/zfs/spa_misc.c          10
-rw-r--r--  module/zfs/zap.c               10
-rw-r--r--  module/zfs/zap_micro.c         59
-rw-r--r--  module/zfs/zfeature_common.c   11
-rw-r--r--  module/zfs/zfs_acl.c            2
-rw-r--r--  module/zfs/zfs_ioctl.c         30
-rw-r--r--  module/zfs/zfs_log.c            2
-rw-r--r--  module/zfs/zfs_replay.c        32
-rw-r--r--  module/zfs/zfs_sa.c             7
-rw-r--r--  module/zfs/zfs_znode.c         35
-rw-r--r--  module/zfs/zil.c                8
23 files changed, 663 insertions, 154 deletions
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 126748994..4bbbd0525 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -478,7 +478,6 @@ dbuf_verify(dmu_buf_impl_t *db)
ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
} else if (db->db_blkid == DMU_SPILL_BLKID) {
ASSERT(dn != NULL);
- ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
ASSERT0(db->db.db_offset);
} else {
ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
@@ -730,13 +729,18 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
ASSERT(db->db_buf == NULL);
if (db->db_blkid == DMU_BONUS_BLKID) {
+ /*
+ * The bonus length stored in the dnode may be less than
+ * the maximum available space in the bonus buffer.
+ */
int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
+ int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
ASSERT3U(bonuslen, <=, db->db.db_size);
- db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
- arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
- if (bonuslen < DN_MAX_BONUSLEN)
- bzero(db->db.db_data, DN_MAX_BONUSLEN);
+ db->db.db_data = zio_buf_alloc(max_bonuslen);
+ arc_space_consume(max_bonuslen, ARC_SPACE_OTHER);
+ if (bonuslen < max_bonuslen)
+ bzero(db->db.db_data, max_bonuslen);
if (bonuslen)
bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
DB_DNODE_EXIT(db);
@@ -962,9 +966,11 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
ASSERT(dr->dr_txg >= txg - 2);
if (db->db_blkid == DMU_BONUS_BLKID) {
/* Note that the data bufs here are zio_bufs */
- dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
- arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
- bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
+ dnode_t *dn = DB_DNODE(db);
+ int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
+ dr->dt.dl.dr_data = zio_buf_alloc(bonuslen);
+ arc_space_consume(bonuslen, ARC_SPACE_OTHER);
+ bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen);
} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
int size = db->db.db_size;
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
@@ -1858,8 +1864,10 @@ dbuf_clear(dmu_buf_impl_t *db)
if (db->db_state == DB_CACHED) {
ASSERT(db->db.db_data != NULL);
if (db->db_blkid == DMU_BONUS_BLKID) {
- zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
- arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
+ int slots = DB_DNODE(db)->dn_num_slots;
+ int bonuslen = DN_SLOTS_TO_BONUSLEN(slots);
+ zio_buf_free(db->db.db_data, bonuslen);
+ arc_space_return(bonuslen, ARC_SPACE_OTHER);
}
db->db.db_data = NULL;
db->db_state = DB_UNCACHED;
@@ -1929,7 +1937,7 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
mutex_enter(&dn->dn_mtx);
if (dn->dn_have_spill &&
(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
- *bpp = &dn->dn_phys->dn_spill;
+ *bpp = DN_SPILL_BLKPTR(dn->dn_phys);
else
*bpp = NULL;
dbuf_add_ref(dn->dn_dbuf, NULL);
@@ -2018,7 +2026,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
if (blkid == DMU_BONUS_BLKID) {
ASSERT3P(parent, ==, dn->dn_dbuf);
- db->db.db_size = DN_MAX_BONUSLEN -
+ db->db.db_size = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
(dn->dn_nblkptr-1) * sizeof (blkptr_t);
ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
db->db.db_offset = DMU_BONUS_BLKID;
@@ -2810,7 +2818,7 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
return;
if (db->db_blkid == DMU_SPILL_BLKID) {
- db->db_blkptr = &dn->dn_phys->dn_spill;
+ db->db_blkptr = DN_SPILL_BLKPTR(dn->dn_phys);
BP_ZERO(db->db_blkptr);
return;
}
@@ -2950,13 +2958,16 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
ASSERT(*datap != NULL);
ASSERT0(db->db_level);
- ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+ ASSERT3U(dn->dn_phys->dn_bonuslen, <=,
+ DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1));
bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
DB_DNODE_EXIT(db);
if (*datap != db->db.db_data) {
- zio_buf_free(*datap, DN_MAX_BONUSLEN);
- arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
+ int slots = DB_DNODE(db)->dn_num_slots;
+ int bonuslen = DN_SLOTS_TO_BONUSLEN(slots);
+ zio_buf_free(*datap, bonuslen);
+ arc_space_return(bonuslen, ARC_SPACE_OTHER);
}
db->db_data_pending = NULL;
drp = &db->db_last_dirty;
@@ -3107,7 +3118,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
if (db->db_blkid == DMU_SPILL_BLKID) {
ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
ASSERT(!(BP_IS_HOLE(bp)) &&
- db->db_blkptr == &dn->dn_phys->dn_spill);
+ db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));
}
#endif
@@ -3119,11 +3130,16 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
mutex_exit(&dn->dn_mtx);
if (dn->dn_type == DMU_OT_DNODE) {
- dnode_phys_t *dnp = db->db.db_data;
- for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
- i--, dnp++) {
- if (dnp->dn_type != DMU_OT_NONE)
+ i = 0;
+ while (i < db->db.db_size) {
+ dnode_phys_t *dnp = db->db.db_data + i;
+
+ i += DNODE_MIN_SIZE;
+ if (dnp->dn_type != DMU_OT_NONE) {
fill++;
+ i += dnp->dn_extra_slots *
+ DNODE_MIN_SIZE;
+ }
}
} else {
if (BP_IS_HOLE(bp)) {
@@ -3270,7 +3286,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
dn = DB_DNODE(db);
ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
- db->db_blkptr == &dn->dn_phys->dn_spill);
+ db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));
DB_DNODE_EXIT(db);
}
#endif
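
The dbuf changes above size the bonus buffer from the dnode's slot count instead of the fixed DN_MAX_BONUSLEN. A minimal sketch of the arithmetic, restating the large-dnode header macros for illustration (assumed values: a 64-byte dnode core and 128-byte block pointers):

    #define DNODE_SHIFT             9   /* 512-byte minimum dnode */
    #define DNODE_CORE_SIZE         64  /* fixed dnode header */
    #define SPA_BLKPTRSHIFT         7   /* 128-byte block pointers */
    #define DN_BONUS_SIZE(dnsize) \
            ((dnsize) - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT))
    #define DN_SLOTS_TO_BONUSLEN(slots) \
            DN_BONUS_SIZE((slots) << DNODE_SHIFT)

    /* A 1-slot (512-byte) dnode keeps the legacy 320-byte bonus area. */
    _Static_assert(DN_SLOTS_TO_BONUSLEN(1) == 320, "legacy bonus area");
    /* A 2-slot (1K) dnode grows the bonus area to 832 bytes. */
    _Static_assert(DN_SLOTS_TO_BONUSLEN(2) == 832, "1K-dnode bonus area");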
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index a423264c6..e1dfb41ff 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -180,7 +180,7 @@ dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
int
dmu_bonus_max(void)
{
- return (DN_MAX_BONUSLEN);
+ return (DN_OLD_MAX_BONUSLEN);
}
int
@@ -1853,6 +1853,7 @@ __dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
doi->doi_type = dn->dn_type;
doi->doi_bonus_type = dn->dn_bonustype;
doi->doi_bonus_size = dn->dn_bonuslen;
+ doi->doi_dnodesize = dn->dn_num_slots << DNODE_SHIFT;
doi->doi_indirection = dn->dn_nlevels;
doi->doi_checksum = dn->dn_checksum;
doi->doi_compress = dn->dn_compress;
@@ -1924,9 +1925,21 @@ dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize,
dn = DB_DNODE(db);
*blksize = dn->dn_datablksz;
- /* add 1 for dnode space */
+ /* add in number of slots used for the dnode itself */
*nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
- SPA_MINBLOCKSHIFT) + 1;
+ SPA_MINBLOCKSHIFT) + dn->dn_num_slots;
+ DB_DNODE_EXIT(db);
+}
+
+void
+dmu_object_dnsize_from_db(dmu_buf_t *db_fake, int *dnsize)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ *dnsize = dn->dn_num_slots << DNODE_SHIFT;
DB_DNODE_EXIT(db);
}
@@ -2020,6 +2033,7 @@ EXPORT_SYMBOL(dmu_object_info);
EXPORT_SYMBOL(dmu_object_info_from_dnode);
EXPORT_SYMBOL(dmu_object_info_from_db);
EXPORT_SYMBOL(dmu_object_size_from_db);
+EXPORT_SYMBOL(dmu_object_dnsize_from_db);
EXPORT_SYMBOL(dmu_object_set_blocksize);
EXPORT_SYMBOL(dmu_object_set_checksum);
EXPORT_SYMBOL(dmu_object_set_compress);
diff --git a/module/zfs/dmu_object.c b/module/zfs/dmu_object.c
index a5a53418b..e54043fc3 100644
--- a/module/zfs/dmu_object.c
+++ b/module/zfs/dmu_object.c
@@ -30,28 +30,55 @@
#include <sys/dnode.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
+#include <sys/dsl_dataset.h>
uint64_t
dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
+ return dmu_object_alloc_dnsize(os, ot, blocksize, bonustype, bonuslen,
+ 0, tx);
+}
+
+uint64_t
+dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
+{
uint64_t object;
uint64_t L1_dnode_count = DNODES_PER_BLOCK <<
(DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
dnode_t *dn = NULL;
+ int dn_slots = dnodesize >> DNODE_SHIFT;
+ boolean_t restarted = B_FALSE;
+
+ if (dn_slots == 0) {
+ dn_slots = DNODE_MIN_SLOTS;
+ } else {
+ ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
+ ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
+ }
mutex_enter(&os->os_obj_lock);
for (;;) {
object = os->os_obj_next;
/*
* Each time we polish off a L1 bp worth of dnodes (2^12
- * objects), move to another L1 bp that's still reasonably
- * sparse (at most 1/4 full). Look from the beginning at most
- * once per txg, but after that keep looking from here.
+ * objects), move to another L1 bp that's still
+ * reasonably sparse (at most 1/4 full). Look from the
+ * beginning at most once per txg. If we still can't
+ * allocate from that L1 block, search for an empty L0
+ * block, which will quickly skip to the end of the
+ * metadnode if no nearby L0 blocks are empty. This
+ * fallback avoids a pathology where full dnode blocks
+ * containing large dnodes appear sparse because they
+ * have a low blk_fill, leading to many failed
+ * allocation attempts. In the long term a better
+ * mechanism to search for sparse metadnode regions,
+ * such as spacemaps, could be implemented.
+ *
* os_scan_dnodes is set during txg sync if enough objects
* have been freed since the previous rescan to justify
- * backfilling again. If we can't find a suitable block, just
- * keep going from here.
+ * backfilling again.
*
* Note that dmu_traverse depends on the behavior that we use
* multiple blocks of the dnode object before going back to
@@ -59,9 +86,10 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
* that property or find another solution to the issues
* described in traverse_visitbp.
*/
-
if (P2PHASE(object, L1_dnode_count) == 0) {
uint64_t offset;
+ uint64_t blkfill;
+ int minlvl;
int error;
if (os->os_rescan_dnodes) {
offset = 0;
@@ -69,13 +97,15 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
} else {
offset = object << DNODE_SHIFT;
}
+ blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
+ minlvl = restarted ? 1 : 2;
+ restarted = B_TRUE;
error = dnode_next_offset(DMU_META_DNODE(os),
- DNODE_FIND_HOLE,
- &offset, 2, DNODES_PER_BLOCK >> 2, 0);
+ DNODE_FIND_HOLE, &offset, minlvl, blkfill, 0);
if (error == 0)
object = offset >> DNODE_SHIFT;
}
- os->os_obj_next = ++object;
+ os->os_obj_next = object + dn_slots;
/*
* XXX We should check for an i/o error here and return
@@ -83,16 +113,22 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
* dmu_tx_assign(), but there is currently no mechanism
* to do so.
*/
- (void) dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
+ (void) dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots,
FTAG, &dn);
if (dn)
break;
if (dmu_object_next(os, &object, B_TRUE, 0) == 0)
- os->os_obj_next = object - 1;
+ os->os_obj_next = object;
+ else
+ /*
+ * Skip to next known valid starting point for a dnode.
+ */
+ os->os_obj_next = P2ROUNDUP(object + 1,
+ DNODES_PER_BLOCK);
}
- dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx);
+ dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx);
dnode_rele(dn, FTAG);
mutex_exit(&os->os_obj_lock);
@@ -105,16 +141,33 @@ int
dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
+ return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype,
+ bonuslen, 0, tx));
+}
+
+int
+dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonustype, int bonuslen,
+ int dnodesize, dmu_tx_t *tx)
+{
dnode_t *dn;
+ int dn_slots = dnodesize >> DNODE_SHIFT;
int err;
+ if (dn_slots == 0)
+ dn_slots = DNODE_MIN_SLOTS;
+ ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
+ ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
+
if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
return (SET_ERROR(EBADF));
- err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, FTAG, &dn);
+ err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots,
+ FTAG, &dn);
if (err)
return (err);
- dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx);
+
+ dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx);
dnode_rele(dn, FTAG);
dmu_tx_add_new_object(tx, os, object);
@@ -125,23 +178,34 @@ int
dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
+ return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype,
+ bonuslen, 0, tx));
+}
+
+int
+dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize,
+ dmu_tx_t *tx)
+{
dnode_t *dn;
+ int dn_slots = dnodesize >> DNODE_SHIFT;
int err;
if (object == DMU_META_DNODE_OBJECT)
return (SET_ERROR(EBADF));
- err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED,
+ err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
FTAG, &dn);
if (err)
return (err);
- dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, tx);
+ dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots, tx);
dnode_rele(dn, FTAG);
return (err);
}
+
int
dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
@@ -150,7 +214,7 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
- err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED,
+ err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
FTAG, &dn);
if (err)
return (err);
@@ -171,9 +235,30 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
int
dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
{
- uint64_t offset = (*objectp + 1) << DNODE_SHIFT;
+ uint64_t offset;
+ dmu_object_info_t doi;
+ struct dsl_dataset *ds = os->os_dsl_dataset;
+ int dnodesize;
int error;
+ /*
+ * Avoid expensive dnode hold if this dataset doesn't use large dnodes.
+ */
+ if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) {
+ error = dmu_object_info(os, *objectp, &doi);
+ if (error && !(error == EINVAL && *objectp == 0))
+ return (SET_ERROR(error));
+ else
+ dnodesize = doi.doi_dnodesize;
+ } else {
+ dnodesize = DNODE_MIN_SIZE;
+ }
+
+ if (*objectp == 0)
+ offset = 1 << DNODE_SHIFT;
+ else
+ offset = (*objectp << DNODE_SHIFT) + dnodesize;
+
error = dnode_next_offset(DMU_META_DNODE(os),
(hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);
@@ -235,8 +320,11 @@ dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx)
#if defined(_KERNEL) && defined(HAVE_SPL)
EXPORT_SYMBOL(dmu_object_alloc);
+EXPORT_SYMBOL(dmu_object_alloc_dnsize);
EXPORT_SYMBOL(dmu_object_claim);
+EXPORT_SYMBOL(dmu_object_claim_dnsize);
EXPORT_SYMBOL(dmu_object_reclaim);
+EXPORT_SYMBOL(dmu_object_reclaim_dnsize);
EXPORT_SYMBOL(dmu_object_free);
EXPORT_SYMBOL(dmu_object_next);
EXPORT_SYMBOL(dmu_object_zapify);
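
As a usage sketch for the new allocation entry point (hypothetical caller, not part of this change): the desired dnode size is passed straight through, and a dnodesize of 0 falls back to DNODE_MIN_SLOTS as shown above.

    /*
     * Allocate a plain-file object backed by a 1K dnode (two slots),
     * assuming `os` is held and `tx` has already been assigned.
     * DMU_OT_SA with bonuslen 0 lets the SA layer size the bonus later.
     */
    uint64_t obj = dmu_object_alloc_dnsize(os, DMU_OT_PLAIN_FILE_CONTENTS,
        0, DMU_OT_SA, 0, 1024, tx);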
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
index 03b30dd3b..cdc897726 100644
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -138,6 +138,12 @@ dmu_objset_id(objset_t *os)
return (ds ? ds->ds_object : 0);
}
+uint64_t
+dmu_objset_dnodesize(objset_t *os)
+{
+ return (os->os_dnodesize);
+}
+
zfs_sync_type_t
dmu_objset_syncprop(objset_t *os)
{
@@ -268,6 +274,34 @@ redundant_metadata_changed_cb(void *arg, uint64_t newval)
}
static void
+dnodesize_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ switch (newval) {
+ case ZFS_DNSIZE_LEGACY:
+ os->os_dnodesize = DNODE_MIN_SIZE;
+ break;
+ case ZFS_DNSIZE_AUTO:
+ /*
+ * Choose a dnode size that will work well for most
+ * workloads if the user specified "auto". Future code
+ * improvements could dynamically select a dnode size
+ * based on observed workload patterns.
+ */
+ os->os_dnodesize = DNODE_MIN_SIZE * 2;
+ break;
+ case ZFS_DNSIZE_1K:
+ case ZFS_DNSIZE_2K:
+ case ZFS_DNSIZE_4K:
+ case ZFS_DNSIZE_8K:
+ case ZFS_DNSIZE_16K:
+ os->os_dnodesize = newval;
+ break;
+ }
+}
+
+static void
logbias_changed_cb(void *arg, uint64_t newval)
{
objset_t *os = arg;
@@ -421,6 +455,11 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
recordsize_changed_cb, os);
}
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_DNODESIZE),
+ dnodesize_changed_cb, os);
+ }
}
if (err != 0) {
VERIFY(arc_buf_remove_ref(os->os_phys_buf,
@@ -439,6 +478,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
os->os_sync = ZFS_SYNC_STANDARD;
os->os_primary_cache = ZFS_CACHE_ALL;
os->os_secondary_cache = ZFS_CACHE_ALL;
+ os->os_dnodesize = DNODE_MIN_SIZE;
}
if (ds == NULL || !ds->ds_is_snapshot)
@@ -768,8 +808,8 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
mdn = DMU_META_DNODE(os);
- dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
- DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx);
+ dnode_allocate(mdn, DMU_OT_DNODE, DNODE_BLOCK_SIZE, DN_MAX_INDBLKSHIFT,
+ DMU_OT_NONE, 0, DNODE_MIN_SLOTS, tx);
/*
* We don't want to have to increase the meta-dnode's nlevels
@@ -1202,7 +1242,7 @@ do_userquota_update(objset_t *os, uint64_t used, uint64_t flags,
uint64_t user, uint64_t group, boolean_t subtract, dmu_tx_t *tx)
{
if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) {
- int64_t delta = DNODE_SIZE + used;
+ int64_t delta = DNODE_MIN_SIZE + used;
if (subtract)
delta = -delta;
VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT,
@@ -2023,6 +2063,7 @@ EXPORT_SYMBOL(dmu_objset_find);
EXPORT_SYMBOL(dmu_objset_byteswap);
EXPORT_SYMBOL(dmu_objset_evict_dbufs);
EXPORT_SYMBOL(dmu_objset_snap_cmtime);
+EXPORT_SYMBOL(dmu_objset_dnodesize);
EXPORT_SYMBOL(dmu_objset_sync);
EXPORT_SYMBOL(dmu_objset_is_dirty);
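
For reference, the effective os_dnodesize produced by dnodesize_changed_cb() above for each property setting:

    /*
     * dnodesize property  ->  os_dnodesize
     *   legacy            ->  512  (DNODE_MIN_SIZE)
     *   auto              ->  1024 (2 * DNODE_MIN_SIZE, a static choice today)
     *   1k/2k/4k/8k/16k   ->  the literal size in bytes (newval)
     */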
diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c
index 896a84b50..901386a5a 100644
--- a/module/zfs/dmu_send.c
+++ b/module/zfs/dmu_send.c
@@ -445,6 +445,7 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
drro->drr_bonustype = dnp->dn_bonustype;
drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
drro->drr_bonuslen = dnp->dn_bonuslen;
+ drro->drr_dn_slots = dnp->dn_extra_slots + 1;
drro->drr_checksumtype = dnp->dn_checksum;
drro->drr_compress = dnp->dn_compress;
drro->drr_toguid = dsp->dsa_toguid;
@@ -570,7 +571,6 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
int err = 0;
- dnode_phys_t *blk;
uint64_t dnobj;
ASSERT3U(zb->zb_level, >=, 0);
@@ -590,7 +590,8 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
} else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
return (0);
} else if (type == DMU_OT_DNODE) {
- int blksz = BP_GET_LSIZE(bp);
+ dnode_phys_t *blk;
+ int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
arc_flags_t aflags = ARC_FLAG_WAIT;
arc_buf_t *abuf;
int i;
@@ -603,8 +604,8 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
return (SET_ERROR(EIO));
blk = abuf->b_data;
- dnobj = zb->zb_blkid * (blksz >> DNODE_SHIFT);
- for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
+ dnobj = zb->zb_blkid * epb;
+ for (i = 0; i < epb; i += blk[i].dn_extra_slots + 1) {
err = dump_dnode(dsa, dnobj + i, blk + i);
if (err != 0)
break;
@@ -736,6 +737,8 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
if (large_block_ok && to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_BLOCKS])
featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
+ if (to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE])
+ featureflags |= DMU_BACKUP_FEATURE_LARGE_DNODE;
if (embedok &&
spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
@@ -1252,6 +1255,15 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
return (SET_ERROR(ENOTSUP));
+ /*
+ * The receiving code doesn't know how to translate large dnodes
+ * to smaller ones, so the pool must have the LARGE_DNODE
+ * feature enabled if the stream has LARGE_DNODE.
+ */
+ if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) &&
+ !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_DNODE))
+ return (SET_ERROR(ENOTSUP));
+
error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
if (error == 0) {
/* target fs already exists; recv into temp clone */
@@ -1658,7 +1670,8 @@ deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size)
return (1);
} else {
return (1 +
- ((DN_MAX_BONUSLEN - bonus_size) >> SPA_BLKPTRSHIFT));
+ ((DN_OLD_MAX_BONUSLEN -
+ MIN(DN_OLD_MAX_BONUSLEN, bonus_size)) >> SPA_BLKPTRSHIFT));
}
}
@@ -1679,7 +1692,8 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
drro->drr_blksz < SPA_MINBLOCKSIZE ||
drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) ||
- drro->drr_bonuslen > DN_MAX_BONUSLEN) {
+ drro->drr_bonuslen >
+ DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(rwa->os)))) {
return (SET_ERROR(EINVAL));
}
@@ -1719,9 +1733,10 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
if (object == DMU_NEW_OBJECT) {
/* currently free, want to be allocated */
- err = dmu_object_claim(rwa->os, drro->drr_object,
+ err = dmu_object_claim_dnsize(rwa->os, drro->drr_object,
drro->drr_type, drro->drr_blksz,
- drro->drr_bonustype, drro->drr_bonuslen, tx);
+ drro->drr_bonustype, drro->drr_bonuslen,
+ drro->drr_dn_slots << DNODE_SHIFT, tx);
} else if (drro->drr_type != doi.doi_type ||
drro->drr_blksz != doi.doi_data_block_size ||
drro->drr_bonustype != doi.doi_bonus_type ||
@@ -1771,18 +1786,25 @@ receive_freeobjects(struct receive_writer_arg *rwa,
if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
return (SET_ERROR(EINVAL));
- for (obj = drrfo->drr_firstobj;
+ for (obj = drrfo->drr_firstobj == 0 ? 1 : drrfo->drr_firstobj;
obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
(void) dmu_object_next(rwa->os, &obj, FALSE, 0)) {
+ dmu_object_info_t doi;
int err;
- if (dmu_object_info(rwa->os, obj, NULL) != 0)
+ err = dmu_object_info(rwa->os, obj, &doi);
+ if (err == ENOENT) {
+ obj++;
continue;
+ } else if (err != 0) {
+ return (err);
+ }
err = dmu_free_long_object(rwa->os, obj);
if (err != 0)
return (err);
}
+
return (0);
}
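
A worked example of the deduce_nblkptr() arithmetic above, using the legacy 320-byte bonus area (DN_OLD_MAX_BONUSLEN) and 128-byte block pointers (SPA_BLKPTRSHIFT == 7):

    /*
     * bonus_type == DMU_OT_SA          -> 1 blkptr (bonus space maximized)
     * bonus_size == 64                 -> 1 + ((320 - 64) >> 7) == 3 blkptrs
     * bonus_size >= 320 (large dnode)  -> MIN() clamps it to 1 + 0 == 1
     */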
diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c
index bba9efe14..44ba74181 100644
--- a/module/zfs/dmu_traverse.c
+++ b/module/zfs/dmu_traverse.c
@@ -331,13 +331,13 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
goto post;
cdnp = buf->b_data;
- for (i = 0; i < epb; i++) {
+ for (i = 0; i < epb; i += cdnp[i].dn_extra_slots + 1) {
prefetch_dnode_metadata(td, &cdnp[i], zb->zb_objset,
zb->zb_blkid * epb + i);
}
/* recursively visitbp() blocks below this */
- for (i = 0; i < epb; i++) {
+ for (i = 0; i < epb; i += cdnp[i].dn_extra_slots + 1) {
err = traverse_dnode(td, &cdnp[i], zb->zb_objset,
zb->zb_blkid * epb + i);
if (err != 0)
@@ -439,7 +439,7 @@ prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp,
if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
- traverse_prefetch_metadata(td, &dnp->dn_spill, &czb);
+ traverse_prefetch_metadata(td, DN_SPILL_BLKPTR(dnp), &czb);
}
}
@@ -470,7 +470,7 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
if (err == 0 && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
- err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb);
+ err = traverse_visitbp(td, dnp, DN_SPILL_BLKPTR(dnp), &czb);
}
if (err == 0 && (td->td_flags & TRAVERSE_POST)) {
diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c
index 74e323dbd..ed29bfbc6 100644
--- a/module/zfs/dmu_tx.c
+++ b/module/zfs/dmu_tx.c
@@ -1586,7 +1586,7 @@ dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
} else {
blkptr_t *bp;
- bp = &dn->dn_phys->dn_spill;
+ bp = DN_SPILL_BLKPTR(dn->dn_phys);
if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
bp, bp->blk_birth))
txh->txh_space_tooverwrite += SPA_OLD_MAXBLOCKSIZE;
@@ -1618,7 +1618,7 @@ dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
dmu_tx_sa_registration_hold(sa, tx);
- if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill)
+ if (attrsize <= DN_OLD_MAX_BONUSLEN && !sa->sa_force_spill)
return;
(void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index 38bcecd46..975bd5fb8 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -248,6 +248,7 @@ dnode_verify(dnode_t *dn)
}
if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) {
int i;
+ int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT);
if (dn->dn_datablkshift) {
ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT);
@@ -258,12 +259,12 @@ dnode_verify(dnode_t *dn)
ASSERT(DMU_OT_IS_VALID(dn->dn_type));
ASSERT3U(dn->dn_nblkptr, >=, 1);
ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
- ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+ ASSERT3U(dn->dn_bonuslen, <=, max_bonuslen);
ASSERT3U(dn->dn_datablksz, ==,
dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0);
ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) +
- dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+ dn->dn_bonuslen, <=, max_bonuslen);
for (i = 0; i < TXG_SIZE; i++) {
ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels);
}
@@ -294,6 +295,7 @@ dnode_byteswap(dnode_phys_t *dnp)
dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec);
dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen);
+ dnp->dn_extra_slots = BSWAP_8(dnp->dn_extra_slots);
dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid);
dnp->dn_used = BSWAP_64(dnp->dn_used);
@@ -320,7 +322,8 @@ dnode_byteswap(dnode_phys_t *dnp)
* dnode buffer).
*/
int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t);
- size_t len = DN_MAX_BONUSLEN - off;
+ int slots = dnp->dn_extra_slots + 1;
+ size_t len = DN_SLOTS_TO_BONUSLEN(slots) - off;
dmu_object_byteswap_t byteswap;
ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype));
byteswap = DMU_OT_BYTESWAP(dnp->dn_bonustype);
@@ -329,23 +332,24 @@ dnode_byteswap(dnode_phys_t *dnp)
/* Swap SPILL block if we have one */
if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
- byteswap_uint64_array(&dnp->dn_spill, sizeof (blkptr_t));
-
+ byteswap_uint64_array(DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t));
}
void
dnode_buf_byteswap(void *vbuf, size_t size)
{
- dnode_phys_t *buf = vbuf;
- int i;
+ int i = 0;
ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);
- size >>= DNODE_SHIFT;
- for (i = 0; i < size; i++) {
- dnode_byteswap(buf);
- buf++;
+ while (i < size) {
+ dnode_phys_t *dnp = vbuf + i;
+ dnode_byteswap(dnp);
+
+ i += DNODE_MIN_SIZE;
+ if (dnp->dn_type != DMU_OT_NONE)
+ i += dnp->dn_extra_slots * DNODE_MIN_SIZE;
}
}
@@ -356,7 +360,7 @@ dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
dnode_setdirty(dn, tx);
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- ASSERT3U(newsize, <=, DN_MAX_BONUSLEN -
+ ASSERT3U(newsize, <=, DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
(dn->dn_nblkptr-1) * sizeof (blkptr_t));
dn->dn_bonuslen = newsize;
if (newsize == 0)
@@ -434,6 +438,7 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
dn->dn_compress = dnp->dn_compress;
dn->dn_bonustype = dnp->dn_bonustype;
dn->dn_bonuslen = dnp->dn_bonuslen;
+ dn->dn_num_slots = dnp->dn_extra_slots + 1;
dn->dn_maxblkid = dnp->dn_maxblkid;
dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
dn->dn_id_flags = 0;
@@ -534,10 +539,13 @@ dnode_destroy(dnode_t *dn)
void
dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+ dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx)
{
int i;
+ ASSERT3U(dn_slots, >, 0);
+ ASSERT3U(dn_slots << DNODE_SHIFT, <=,
+ spa_maxdnodesize(dmu_objset_spa(dn->dn_objset)));
ASSERT3U(blocksize, <=,
spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
if (blocksize == 0)
@@ -550,8 +558,8 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);
- dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d\n", dn->dn_objset,
- dn->dn_object, tx->tx_txg, blocksize, ibs);
+ dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d dn_slots=%d\n",
+ dn->dn_objset, dn->dn_object, tx->tx_txg, blocksize, ibs, dn_slots);
ASSERT(dn->dn_type == DMU_OT_NONE);
ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
@@ -562,7 +570,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
(bonustype == DMU_OT_SA && bonuslen == 0) ||
(bonustype != DMU_OT_NONE && bonuslen != 0));
ASSERT(DMU_OT_IS_VALID(bonustype));
- ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
+ ASSERT3U(bonuslen, <=, DN_SLOTS_TO_BONUSLEN(dn_slots));
ASSERT(dn->dn_type == DMU_OT_NONE);
ASSERT0(dn->dn_maxblkid);
ASSERT0(dn->dn_allocated_txg);
@@ -588,11 +596,15 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
dnode_setdblksz(dn, blocksize);
dn->dn_indblkshift = ibs;
dn->dn_nlevels = 1;
+ dn->dn_num_slots = dn_slots;
if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
dn->dn_nblkptr = 1;
- else
- dn->dn_nblkptr = 1 +
- ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+ else {
+ dn->dn_nblkptr = MIN(DN_MAX_NBLKPTR,
+ 1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
+ SPA_BLKPTRSHIFT));
+ }
+
dn->dn_bonustype = bonustype;
dn->dn_bonuslen = bonuslen;
dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
@@ -617,7 +629,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
void
dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+ dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx)
{
int nblkptr;
@@ -631,7 +643,10 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
(bonustype != DMU_OT_NONE && bonuslen != 0) ||
(bonustype == DMU_OT_SA && bonuslen == 0));
ASSERT(DMU_OT_IS_VALID(bonustype));
- ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
+ ASSERT3U(bonuslen, <=,
+ DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))));
+
+ dn_slots = dn_slots > 0 ? dn_slots : DNODE_MIN_SLOTS;
/* clean up any unreferenced dbufs */
dnode_evict_dbufs(dn);
@@ -654,7 +669,9 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
nblkptr = 1;
else
- nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+ nblkptr = MIN(DN_MAX_NBLKPTR,
+ 1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
+ SPA_BLKPTRSHIFT));
if (dn->dn_bonustype != bonustype)
dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype;
if (dn->dn_nblkptr != nblkptr)
@@ -672,6 +689,7 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
mutex_enter(&dn->dn_mtx);
dn->dn_bonustype = bonustype;
dn->dn_bonuslen = bonuslen;
+ dn->dn_num_slots = dn_slots;
dn->dn_nblkptr = nblkptr;
dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
dn->dn_compress = ZIO_COMPRESS_INHERIT;
@@ -680,7 +698,8 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
/* fix up the bonus db_size */
if (dn->dn_bonus) {
dn->dn_bonus->db.db_size =
- DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t);
+ DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
+ (dn->dn_nblkptr-1) * sizeof (blkptr_t);
ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size);
}
@@ -1053,24 +1072,150 @@ dnode_buf_pageout(void *dbu)
}
/*
+ * Return true if the given index is interior to a dnode already
+ * allocated in the block. That is, the index is neither free nor
+ * allocated, but is consumed by a large dnode.
+ *
+ * The dnode_phys_t buffer may not be in sync with the in-core dnode
+ * structure, so we try to check the dnode structure first and fall back
+ * to the dnode_phys_t buffer if it doesn't exist.
+ */
+static boolean_t
+dnode_is_consumed(dmu_buf_impl_t *db, int idx)
+{
+ dnode_handle_t *dnh;
+ dmu_object_type_t ot;
+ dnode_children_t *children_dnodes;
+ dnode_phys_t *dn_block;
+ int skip;
+ int i;
+
+ children_dnodes = dmu_buf_get_user(&db->db);
+ dn_block = (dnode_phys_t *)db->db.db_data;
+
+ for (i = 0; i < idx; i += skip) {
+ dnh = &children_dnodes->dnc_children[i];
+
+ zrl_add(&dnh->dnh_zrlock);
+ if (dnh->dnh_dnode != NULL) {
+ ot = dnh->dnh_dnode->dn_type;
+ skip = dnh->dnh_dnode->dn_num_slots;
+ } else {
+ ot = dn_block[i].dn_type;
+ skip = dn_block[i].dn_extra_slots + 1;
+ }
+ zrl_remove(&dnh->dnh_zrlock);
+
+ if (ot == DMU_OT_NONE)
+ skip = 1;
+ }
+
+ return (i > idx);
+}
+
+/*
+ * Return true if the given index in the dnode block is a valid
+ * allocated dnode. That is, the index is not consumed by a large
+ * dnode and is not free.
+ *
+ * The dnode_phys_t buffer may not be in sync with the in-core dnode
+ * structure, so we try to check the dnode structure first and fall back
+ * to the dnode_phys_t buffer if it doesn't exist.
+ */
+static boolean_t
+dnode_is_allocated(dmu_buf_impl_t *db, int idx)
+{
+ dnode_handle_t *dnh;
+ dmu_object_type_t ot;
+ dnode_children_t *children_dnodes;
+ dnode_phys_t *dn_block;
+
+ if (dnode_is_consumed(db, idx))
+ return (B_FALSE);
+
+ children_dnodes = dmu_buf_get_user(&db->db);
+ dn_block = (dnode_phys_t *)db->db.db_data;
+
+ dnh = &children_dnodes->dnc_children[idx];
+
+ zrl_add(&dnh->dnh_zrlock);
+ if (dnh->dnh_dnode != NULL)
+ ot = dnh->dnh_dnode->dn_type;
+ else
+ ot = dn_block[idx].dn_type;
+ zrl_remove(&dnh->dnh_zrlock);
+
+ return (ot != DMU_OT_NONE);
+}
+
+/*
+ * Return true if the given range of indices in the dnode block are
+ * free. That is, the starting index is not consumed by a large dnode
+ * and none of the indices are allocated.
+ *
+ * The dnode_phys_t buffer may not be in sync with the in-core dnode
+ * structure, so we try to check the dnode structure first and fall back
+ * to the dnode_phys_t buffer if it doesn't exist.
+ */
+static boolean_t
+dnode_is_free(dmu_buf_impl_t *db, int idx, int slots)
+{
+ dnode_handle_t *dnh;
+ dmu_object_type_t ot;
+ dnode_children_t *children_dnodes;
+ dnode_phys_t *dn_block;
+ int i;
+
+ if (idx + slots > DNODES_PER_BLOCK)
+ return (B_FALSE);
+
+ children_dnodes = dmu_buf_get_user(&db->db);
+ dn_block = (dnode_phys_t *)db->db.db_data;
+
+ if (dnode_is_consumed(db, idx))
+ return (B_FALSE);
+
+ for (i = idx; i < idx + slots; i++) {
+ dnh = &children_dnodes->dnc_children[i];
+
+ zrl_add(&dnh->dnh_zrlock);
+ if (dnh->dnh_dnode != NULL)
+ ot = dnh->dnh_dnode->dn_type;
+ else
+ ot = dn_block[i].dn_type;
+ zrl_remove(&dnh->dnh_zrlock);
+
+ if (ot != DMU_OT_NONE)
+ return (B_FALSE);
+ }
+
+ return (B_TRUE);
+}
+
+/*
* errors:
* EINVAL - invalid object number.
+ * ENOSPC - hole too small to fulfill "slots" request
* EIO - i/o error.
* succeeds even for free dnodes.
*/
int
-dnode_hold_impl(objset_t *os, uint64_t object, int flag,
+dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
void *tag, dnode_t **dnp)
{
- int epb, idx, err;
+ int epb, idx, err, i;
int drop_struct_lock = FALSE;
int type;
uint64_t blk;
dnode_t *mdn, *dn;
dmu_buf_impl_t *db;
dnode_children_t *children_dnodes;
+ dnode_phys_t *dn_block_begin;
dnode_handle_t *dnh;
+ ASSERT(!(flag & DNODE_MUST_BE_ALLOCATED) || (slots == 0));
+ ASSERT(!(flag & DNODE_MUST_BE_FREE) || (slots > 0));
+
/*
* If you are holding the spa config lock as writer, you shouldn't
* be asking the DMU to do *anything* unless it's the root pool
@@ -1126,12 +1271,9 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
epb = db->db.db_size >> DNODE_SHIFT;
- idx = object & (epb-1);
-
ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
children_dnodes = dmu_buf_get_user(&db->db);
if (children_dnodes == NULL) {
- int i;
dnode_children_t *winner;
children_dnodes = kmem_zalloc(sizeof (dnode_children_t) +
epb * sizeof (dnode_handle_t), KM_SLEEP);
@@ -1156,21 +1298,28 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
}
ASSERT(children_dnodes->dnc_count == epb);
+ idx = object & (epb - 1);
+ dn_block_begin = (dnode_phys_t *)db->db.db_data;
+
+ if ((flag & DNODE_MUST_BE_FREE) && !dnode_is_free(db, idx, slots)) {
+ dbuf_rele(db, FTAG);
+ return (ENOSPC);
+ } else if ((flag & DNODE_MUST_BE_ALLOCATED) &&
+ !dnode_is_allocated(db, idx)) {
+ dbuf_rele(db, FTAG);
+ return (ENOENT);
+ }
+
dnh = &children_dnodes->dnc_children[idx];
zrl_add(&dnh->dnh_zrlock);
dn = dnh->dnh_dnode;
- if (dn == NULL) {
- dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx;
-
- dn = dnode_create(os, phys, db, object, dnh);
- }
+ if (dn == NULL)
+ dn = dnode_create(os, dn_block_begin + idx, db, object, dnh);
mutex_enter(&dn->dn_mtx);
type = dn->dn_type;
if (dn->dn_free_txg ||
- ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ||
- ((flag & DNODE_MUST_BE_FREE) &&
- (type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) {
+ ((flag & DNODE_MUST_BE_FREE) && !refcount_is_zero(&dn->dn_holds))) {
mutex_exit(&dn->dn_mtx);
zrl_remove(&dnh->dnh_zrlock);
dbuf_rele(db, FTAG);
@@ -1198,7 +1347,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
int
dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp)
{
- return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp));
+ return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, tag,
+ dnp));
}
/*
@@ -1908,17 +2058,21 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
error = SET_ERROR(ESRCH);
} else if (lvl == 0) {
dnode_phys_t *dnp = data;
- span = DNODE_SHIFT;
+
ASSERT(dn->dn_type == DMU_OT_DNODE);
+ ASSERT(!(flags & DNODE_FIND_BACKWARDS));
- for (i = (*offset >> span) & (blkfill - 1);
- i >= 0 && i < blkfill; i += inc) {
+ for (i = (*offset >> DNODE_SHIFT) & (blkfill - 1);
+ i < blkfill; i += dnp[i].dn_extra_slots + 1) {
if ((dnp[i].dn_type == DMU_OT_NONE) == hole)
break;
- *offset += (1ULL << span) * inc;
}
- if (i < 0 || i == blkfill)
+
+ if (i == blkfill)
error = SET_ERROR(ESRCH);
+
+ *offset = (*offset & ~(DNODE_BLOCK_SIZE - 1)) +
+ (i << DNODE_SHIFT);
} else {
blkptr_t *bp = data;
uint64_t start = *offset;
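
The slot-walk pattern introduced here recurs throughout the change (dbuf_write_ready(), dnode_buf_byteswap(), do_dump(), traverse_visitbp(), dsl_scan_recurse()). A minimal standalone sketch, with a hypothetical visit() callback:

    /*
     * Walk one metadnode block: free slots advance by one, allocated
     * dnodes advance by their full width so interior slots are skipped.
     */
    static void
    walk_dnode_block(dnode_phys_t *dnp)    /* DNODES_PER_BLOCK entries */
    {
            int i = 0;

            while (i < DNODES_PER_BLOCK) {
                    if (dnp[i].dn_type == DMU_OT_NONE) {
                            i++;
                    } else {
                            visit(&dnp[i]); /* hypothetical callback */
                            i += dnp[i].dn_extra_slots + 1;
                    }
            }
    }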
diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c
index bea7be186..54066e2e3 100644
--- a/module/zfs/dnode_sync.c
+++ b/module/zfs/dnode_sync.c
@@ -524,7 +524,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
ASSERT(dn->dn_free_txg > 0);
if (dn->dn_allocated_txg != dn->dn_free_txg)
dmu_buf_will_dirty(&dn->dn_dbuf->db, tx);
- bzero(dn->dn_phys, sizeof (dnode_phys_t));
+ bzero(dn->dn_phys, sizeof (dnode_phys_t) * dn->dn_num_slots);
mutex_enter(&dn->dn_mtx);
dn->dn_type = DMU_OT_NONE;
@@ -559,7 +559,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
ASSERT(dmu_tx_is_syncing(tx));
ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
ASSERT(dnp->dn_type != DMU_OT_NONE ||
- bcmp(dnp, &zerodn, DNODE_SIZE) == 0);
+ bcmp(dnp, &zerodn, DNODE_MIN_SIZE) == 0);
DNODE_VERIFY(dn);
ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf));
@@ -591,6 +591,9 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
dnp->dn_bonustype = dn->dn_bonustype;
dnp->dn_bonuslen = dn->dn_bonuslen;
}
+
+ dnp->dn_extra_slots = dn->dn_num_slots - 1;
+
ASSERT(dnp->dn_nlevels > 1 ||
BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
BP_IS_EMBEDDED(&dnp->dn_blkptr[0]) ||
@@ -623,7 +626,8 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
dnp->dn_bonuslen = 0;
else
dnp->dn_bonuslen = dn->dn_next_bonuslen[txgoff];
- ASSERT(dnp->dn_bonuslen <= DN_MAX_BONUSLEN);
+ ASSERT(dnp->dn_bonuslen <=
+ DN_SLOTS_TO_BONUSLEN(dnp->dn_extra_slots + 1));
dn->dn_next_bonuslen[txgoff] = 0;
}
@@ -662,7 +666,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
mutex_exit(&dn->dn_mtx);
if (kill_spill) {
- free_blocks(dn, &dn->dn_phys->dn_spill, 1, tx);
+ free_blocks(dn, DN_SPILL_BLKPTR(dn->dn_phys), 1, tx);
mutex_enter(&dn->dn_mtx);
dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR;
mutex_exit(&dn->dn_mtx);
@@ -687,6 +691,14 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
return;
}
+ if (dn->dn_num_slots > DNODE_MIN_SLOTS) {
+ dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
+ mutex_enter(&ds->ds_lock);
+ ds->ds_feature_activation_needed[SPA_FEATURE_LARGE_DNODE] =
+ B_TRUE;
+ mutex_exit(&ds->ds_lock);
+ }
+
if (dn->dn_next_nlevels[txgoff]) {
dnode_increase_indirection(dn, tx);
dn->dn_next_nlevels[txgoff] = 0;
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index b5e272fb9..72163521e 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -709,14 +709,18 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
scn->scn_phys.scn_errors++;
return (err);
}
- for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
+ for (i = 0, cdnp = buf->b_data; i < epb;
+ i += cdnp->dn_extra_slots + 1,
+ cdnp += cdnp->dn_extra_slots + 1) {
for (j = 0; j < cdnp->dn_nblkptr; j++) {
blkptr_t *cbp = &cdnp->dn_blkptr[j];
dsl_scan_prefetch(scn, buf, cbp,
zb->zb_objset, zb->zb_blkid * epb + i, j);
}
}
- for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
+ for (i = 0, cdnp = buf->b_data; i < epb;
+ i += cdnp->dn_extra_slots + 1,
+ cdnp += cdnp->dn_extra_slots + 1) {
dsl_scan_visitdnode(scn, ds, ostype,
cdnp, zb->zb_blkid * epb + i, tx);
}
@@ -779,7 +783,7 @@ dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds,
zbookmark_phys_t czb;
SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
0, DMU_SPILL_BLKID);
- dsl_scan_visitbp(&dnp->dn_spill,
+ dsl_scan_visitbp(DN_SPILL_BLKPTR(dnp),
&czb, dnp, ds, scn, ostype, tx);
}
}
diff --git a/module/zfs/sa.c b/module/zfs/sa.c
index d6ac5fcc7..adc301512 100644
--- a/module/zfs/sa.c
+++ b/module/zfs/sa.c
@@ -33,6 +33,7 @@
#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
#include <sys/dbuf.h>
#include <sys/dnode.h>
#include <sys/zap.h>
@@ -553,12 +554,11 @@ sa_copy_data(sa_data_locator_t *func, void *datastart, void *target, int buflen)
*/
static int
sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count,
- dmu_buf_t *db, sa_buf_type_t buftype, int *index, int *total,
- boolean_t *will_spill)
+ dmu_buf_t *db, sa_buf_type_t buftype, int full_space, int *index,
+ int *total, boolean_t *will_spill)
{
int var_size_count = 0;
int i;
- int full_space;
int hdrsize;
int extra_hdrsize;
@@ -577,7 +577,6 @@ sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count,
hdrsize = (SA_BONUSTYPE_FROM_DB(db) == DMU_OT_ZNODE) ? 0 :
sizeof (sa_hdr_phys_t);
- full_space = (buftype == SA_BONUS) ? DN_MAX_BONUSLEN : db->db_size;
ASSERT(IS_P2ALIGNED(full_space, 8));
for (i = 0; i != attr_count; i++) {
@@ -668,6 +667,7 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
void *data_start;
sa_attr_type_t *attrs, *attrs_start;
int i, lot_count;
+ int dnodesize;
int spill_idx;
int hdrsize;
int spillhdrsize = 0;
@@ -676,20 +676,23 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
sa_lot_t *lot;
int len_idx;
int spill_used;
+ int bonuslen;
boolean_t spilling;
dmu_buf_will_dirty(hdl->sa_bonus, tx);
bonustype = SA_BONUSTYPE_FROM_DB(hdl->sa_bonus);
+ dmu_object_dnsize_from_db(hdl->sa_bonus, &dnodesize);
+ bonuslen = DN_BONUS_SIZE(dnodesize);
/* first determine bonus header size and sum of all attributes */
hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus,
- SA_BONUS, &spill_idx, &used, &spilling);
+ SA_BONUS, bonuslen, &spill_idx, &used, &spilling);
if (used > SPA_OLD_MAXBLOCKSIZE)
return (SET_ERROR(EFBIG));
- VERIFY(0 == dmu_set_bonus(hdl->sa_bonus, spilling ?
- MIN(DN_MAX_BONUSLEN - sizeof (blkptr_t), used + hdrsize) :
+ VERIFY0(dmu_set_bonus(hdl->sa_bonus, spilling ?
+ MIN(bonuslen - sizeof (blkptr_t), used + hdrsize) :
used + hdrsize, tx));
ASSERT((bonustype == DMU_OT_ZNODE && spilling == 0) ||
@@ -706,8 +709,8 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
dmu_buf_will_dirty(hdl->sa_spill, tx);
spillhdrsize = sa_find_sizes(sa, &attr_desc[spill_idx],
- attr_count - spill_idx, hdl->sa_spill, SA_SPILL, &i,
- &spill_used, &dummy);
+ attr_count - spill_idx, hdl->sa_spill, SA_SPILL,
+ hdl->sa_spill->db_size, &i, &spill_used, &dummy);
if (spill_used > SPA_OLD_MAXBLOCKSIZE)
return (SET_ERROR(EFBIG));
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index c23fd7a3a..d1aefe585 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -281,6 +281,14 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
}
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL,
+ DNODE_MAX_SIZE, ZPROP_SRC_NONE);
+ } else {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL,
+ DNODE_MIN_SIZE, ZPROP_SRC_NONE);
+ }
+
if ((dp = list_head(&spa->spa_config_list)) != NULL) {
if (dp->scd_path == NULL) {
spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
@@ -512,7 +520,8 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
/*
* Must be ZPL, and its property settings
* must be supported by GRUB (compression
- * is not gzip, and large blocks are not used).
+ * is not gzip, and large blocks or large
+ * dnodes are not used).
*/
if (dmu_objset_type(os) != DMU_OST_ZFS) {
@@ -529,6 +538,12 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
&propval)) == 0 &&
propval > SPA_OLD_MAXBLOCKSIZE) {
error = SET_ERROR(ENOTSUP);
+ } else if ((error =
+ dsl_prop_get_int_ds(dmu_objset_ds(os),
+ zfs_prop_to_name(ZFS_PROP_DNODESIZE),
+ &propval)) == 0 &&
+ propval != ZFS_DNSIZE_LEGACY) {
+ error = SET_ERROR(ENOTSUP);
} else {
objnum = dmu_objset_id(os);
}
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index e3e7e36fe..d1303b5c2 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -2000,6 +2000,15 @@ spa_maxblocksize(spa_t *spa)
return (SPA_OLD_MAXBLOCKSIZE);
}
+int
+spa_maxdnodesize(spa_t *spa)
+{
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE))
+ return (DNODE_MAX_SIZE);
+ else
+ return (DNODE_MIN_SIZE);
+}
+
#if defined(_KERNEL) && defined(HAVE_SPL)
/* Namespace manipulation */
EXPORT_SYMBOL(spa_lookup);
@@ -2056,6 +2065,7 @@ EXPORT_SYMBOL(spa_bootfs);
EXPORT_SYMBOL(spa_delegation);
EXPORT_SYMBOL(spa_meta_objset);
EXPORT_SYMBOL(spa_maxblocksize);
+EXPORT_SYMBOL(spa_maxdnodesize);
/* Miscellaneous support routines */
EXPORT_SYMBOL(spa_rename);
diff --git a/module/zfs/zap.c b/module/zfs/zap.c
index 454b4be62..9e4f05049 100644
--- a/module/zfs/zap.c
+++ b/module/zfs/zap.c
@@ -968,9 +968,17 @@ uint64_t
zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj,
const char *name, dmu_tx_t *tx)
{
+ return (zap_create_link_dnsize(os, ot, parent_obj, name, 0, tx));
+}
+
+uint64_t
+zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj,
+ const char *name, int dnodesize, dmu_tx_t *tx)
+{
uint64_t new_obj;
- VERIFY((new_obj = zap_create(os, ot, DMU_OT_NONE, 0, tx)) > 0);
+ VERIFY((new_obj = zap_create_dnsize(os, ot, DMU_OT_NONE, 0,
+ dnodesize, tx)) > 0);
VERIFY0(zap_add(os, parent_obj, name, sizeof (uint64_t), 1, &new_obj,
tx));
diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c
index 3faf27ce3..f3153cc18 100644
--- a/module/zfs/zap_micro.c
+++ b/module/zfs/zap_micro.c
@@ -630,8 +630,16 @@ int
zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
- return (zap_create_claim_norm(os, obj,
- 0, ot, bonustype, bonuslen, tx));
+ return (zap_create_claim_dnsize(os, obj, ot, bonustype, bonuslen,
+ 0, tx));
+}
+
+int
+zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
+{
+ return (zap_create_claim_norm_dnsize(os, obj,
+ 0, ot, bonustype, bonuslen, dnodesize, tx));
}
int
@@ -639,9 +647,19 @@ zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags,
dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
+ return (zap_create_claim_norm_dnsize(os, obj, normflags, ot, bonustype,
+ bonuslen, 0, tx));
+}
+
+int
+zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags,
+ dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen,
+ int dnodesize, dmu_tx_t *tx)
+{
int err;
- err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx);
+ err = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen,
+ dnodesize, tx);
if (err != 0)
return (err);
mzap_create_impl(os, obj, normflags, 0, tx);
@@ -656,10 +674,27 @@ zap_create(objset_t *os, dmu_object_type_t ot,
}
uint64_t
+zap_create_dnsize(objset_t *os, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
+{
+ return (zap_create_norm_dnsize(os, 0, ot, bonustype, bonuslen,
+ dnodesize, tx));
+}
+
+uint64_t
zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
- uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
+ return (zap_create_norm_dnsize(os, normflags, ot, bonustype, bonuslen,
+ 0, tx));
+}
+
+uint64_t
+zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
+{
+ uint64_t obj = dmu_object_alloc_dnsize(os, ot, 0, bonustype, bonuslen,
+ dnodesize, tx);
mzap_create_impl(os, obj, normflags, 0, tx);
return (obj);
@@ -670,7 +705,17 @@ zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
- uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
+ return (zap_create_flags_dnsize(os, normflags, flags, ot,
+ leaf_blockshift, indirect_blockshift, bonustype, bonuslen, 0, tx));
+}
+
+uint64_t
+zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags,
+ dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
+{
+ uint64_t obj = dmu_object_alloc_dnsize(os, ot, 0, bonustype, bonuslen,
+ dnodesize, tx);
ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT &&
leaf_blockshift <= SPA_OLD_MAXBLOCKSHIFT &&
@@ -1458,10 +1503,14 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
#if defined(_KERNEL) && defined(HAVE_SPL)
EXPORT_SYMBOL(zap_create);
+EXPORT_SYMBOL(zap_create_dnsize);
EXPORT_SYMBOL(zap_create_norm);
+EXPORT_SYMBOL(zap_create_norm_dnsize);
EXPORT_SYMBOL(zap_create_flags);
+EXPORT_SYMBOL(zap_create_flags_dnsize);
EXPORT_SYMBOL(zap_create_claim);
EXPORT_SYMBOL(zap_create_claim_norm);
+EXPORT_SYMBOL(zap_create_claim_norm_dnsize);
EXPORT_SYMBOL(zap_destroy);
EXPORT_SYMBOL(zap_lookup);
EXPORT_SYMBOL(zap_lookup_norm);
diff --git a/module/zfs/zfeature_common.c b/module/zfs/zfeature_common.c
index f57e5489c..3264f6235 100644
--- a/module/zfs/zfeature_common.c
+++ b/module/zfs/zfeature_common.c
@@ -242,4 +242,15 @@ zpool_feature_init(void)
"Support for blocks larger than 128KB.",
ZFEATURE_FLAG_PER_DATASET, large_blocks_deps);
}
+
+ {
+ static const spa_feature_t large_dnode_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_LARGE_DNODE,
+ "org.zfsonlinux:large_dnode", "large_dnode",
+ "Variable on-disk size of dnodes.",
+ ZFEATURE_FLAG_PER_DATASET, large_dnode_deps);
+ }
}
diff --git a/module/zfs/zfs_acl.c b/module/zfs/zfs_acl.c
index 961083d4a..f820cdfd6 100644
--- a/module/zfs/zfs_acl.c
+++ b/module/zfs/zfs_acl.c
@@ -1394,7 +1394,7 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx)
otype == DMU_OT_ACL ?
DMU_OT_SYSACL : DMU_OT_NONE,
otype == DMU_OT_ACL ?
- DN_MAX_BONUSLEN : 0, tx);
+ DN_OLD_MAX_BONUSLEN : 0, tx);
} else {
(void) dmu_object_set_blocksize(zsb->z_os,
aoid, aclp->z_acl_bytes, 0, tx);
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index c63af167a..30338ac14 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -3785,7 +3785,7 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
/*
* If this is a bootable dataset then
- * the we don't allow large (>128K) blocks,
+ * we don't allow large (>128K) blocks,
* because GRUB doesn't support them.
*/
if (zfs_is_bootfs(dsname) &&
@@ -3813,6 +3813,34 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
}
break;
+ case ZFS_PROP_DNODESIZE:
+ /* Dnode sizes above 512 need the feature to be enabled */
+ if (nvpair_value_uint64(pair, &intval) == 0 &&
+ intval != ZFS_DNSIZE_LEGACY) {
+ spa_t *spa;
+
+ /*
+ * If this is a bootable dataset then
+ * we don't allow large (>512B) dnodes,
+ * because GRUB doesn't support them.
+ */
+ if (zfs_is_bootfs(dsname) &&
+ intval != ZFS_DNSIZE_LEGACY) {
+ return (SET_ERROR(EDOM));
+ }
+
+ if ((err = spa_open(dsname, &spa, FTAG)) != 0)
+ return (err);
+
+ if (!spa_feature_is_enabled(spa,
+ SPA_FEATURE_LARGE_DNODE)) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ spa_close(spa, FTAG);
+ }
+ break;
+
case ZFS_PROP_SHARESMB:
if (zpl_earlier_version(dsname, ZPL_VERSION_FUID))
return (SET_ERROR(ENOTSUP));
diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c
index 38d8de0eb..4d89cb04b 100644
--- a/module/zfs/zfs_log.c
+++ b/module/zfs/zfs_log.c
@@ -279,6 +279,8 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
lr = (lr_create_t *)&itx->itx_lr;
lr->lr_doid = dzp->z_id;
lr->lr_foid = zp->z_id;
+ /* Store dnode slot count in 8 bits above object id. */
+ LR_FOID_SET_SLOTS(lr->lr_foid, zp->z_dnodesize >> DNODE_SHIFT);
lr->lr_mode = zp->z_mode;
if (!IS_EPHEMERAL(zp->z_uid)) {
lr->lr_uid = (uint64_t)zp->z_uid;
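
The lr_foid packing assumed above keeps the object number in the low 48 bits and the slot count in the top 8. A sketch of the accessors, mirroring the LR_FOID_* macros added to zil.h:

    #define LR_FOID_GET_OBJ(oid)    ((oid) & ((1ULL << 48) - 1))
    #define LR_FOID_GET_SLOTS(oid)  (((oid) >> 56) & 0xff)
    #define LR_FOID_SET_SLOTS(oid, x) \
            ((oid) = ((oid) & ~(0xffULL << 56)) | \
            (((uint64_t)(x) & 0xff) << 56))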
diff --git a/module/zfs/zfs_replay.c b/module/zfs/zfs_replay.c
index b97a60ed8..54c175437 100644
--- a/module/zfs/zfs_replay.c
+++ b/module/zfs/zfs_replay.c
@@ -279,6 +279,8 @@ zfs_replay_create_acl(zfs_sb_t *zsb, lr_acl_create_t *lracl, boolean_t byteswap)
void *fuidstart;
size_t xvatlen = 0;
uint64_t txtype;
+ uint64_t objid;
+ uint64_t dnodesize;
int error;
txtype = (lr->lr_common.lrc_txtype & ~TX_CI);
@@ -304,19 +306,24 @@ zfs_replay_create_acl(zfs_sb_t *zsb, lr_acl_create_t *lracl, boolean_t byteswap)
if ((error = zfs_zget(zsb, lr->lr_doid, &dzp)) != 0)
return (error);
+ objid = LR_FOID_GET_OBJ(lr->lr_foid);
+ dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT;
+
xva_init(&xva);
zfs_init_vattr(&xva.xva_vattr, ATTR_MODE | ATTR_UID | ATTR_GID,
- lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid);
+ lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid);
/*
* All forms of zfs create (create, mkdir, mkxattrdir, symlink)
* eventually end up in zfs_mknode(), which assigns the object's
- * creation time and generation number. The generic zfs_create()
- * doesn't have either concept, so we smuggle the values inside
- * the vattr's otherwise unused va_ctime and va_nblocks fields.
+ * creation time, generation number, and dnode size. The generic
+ * zfs_create() has no concept of these attributes, so we smuggle
+ * the values inside the vattr's otherwise unused va_ctime,
+ * va_nblocks, and va_fsid fields.
*/
ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime);
xva.xva_vattr.va_nblocks = lr->lr_gen;
+ xva.xva_vattr.va_fsid = dnodesize;
error = dmu_object_info(zsb->z_os, lr->lr_foid, NULL);
if (error != ENOENT)
@@ -418,6 +425,8 @@ zfs_replay_create(zfs_sb_t *zsb, lr_create_t *lr, boolean_t byteswap)
void *start;
size_t xvatlen;
uint64_t txtype;
+ uint64_t objid;
+ uint64_t dnodesize;
int error;
txtype = (lr->lr_common.lrc_txtype & ~TX_CI);
@@ -431,21 +440,26 @@ zfs_replay_create(zfs_sb_t *zsb, lr_create_t *lr, boolean_t byteswap)
if ((error = zfs_zget(zsb, lr->lr_doid, &dzp)) != 0)
return (error);
+ objid = LR_FOID_GET_OBJ(lr->lr_foid);
+ dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT;
+
xva_init(&xva);
zfs_init_vattr(&xva.xva_vattr, ATTR_MODE | ATTR_UID | ATTR_GID,
- lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid);
+ lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid);
/*
* All forms of zfs create (create, mkdir, mkxattrdir, symlink)
* eventually end up in zfs_mknode(), which assigns the object's
- * creation time and generation number. The generic zfs_create()
- * doesn't have either concept, so we smuggle the values inside
- * the vattr's otherwise unused va_ctime and va_nblocks fields.
+ * creation time, generation number, and dnode slot count. The
+ * generic zfs_create() has no concept of these attributes, so
+ * we smuggle the values inside the vattr's otherwise unused
+ * va_ctime, va_nblocks, and va_fsid fields.
*/
ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime);
xva.xva_vattr.va_nblocks = lr->lr_gen;
+ xva.xva_vattr.va_fsid = dnodesize;
- error = dmu_object_info(zsb->z_os, lr->lr_foid, NULL);
+ error = dmu_object_info(zsb->z_os, objid, NULL);
if (error != ENOENT)
goto out;
diff --git a/module/zfs/zfs_sa.c b/module/zfs/zfs_sa.c
index f4841435b..f3eac51f8 100644
--- a/module/zfs/zfs_sa.c
+++ b/module/zfs/zfs_sa.c
@@ -97,8 +97,7 @@ zfs_sa_symlink(znode_t *zp, char *link, int len, dmu_tx_t *tx)
dmu_buf_t *db = sa_get_db(zp->z_sa_hdl);
if (ZFS_OLD_ZNODE_PHYS_SIZE + len <= dmu_bonus_max()) {
- VERIFY(dmu_set_bonus(db,
- len + ZFS_OLD_ZNODE_PHYS_SIZE, tx) == 0);
+ VERIFY0(dmu_set_bonus(db, len + ZFS_OLD_ZNODE_PHYS_SIZE, tx));
if (len) {
bcopy(link, (caddr_t)db->db_data +
ZFS_OLD_ZNODE_PHYS_SIZE, len);
@@ -107,8 +106,8 @@ zfs_sa_symlink(znode_t *zp, char *link, int len, dmu_tx_t *tx)
dmu_buf_t *dbp;
zfs_grow_blocksize(zp, len, tx);
- VERIFY(0 == dmu_buf_hold(ZTOZSB(zp)->z_os,
- zp->z_id, 0, FTAG, &dbp, DMU_READ_NO_PREFETCH));
+ VERIFY0(dmu_buf_hold(ZTOZSB(zp)->z_os, zp->z_id, 0, FTAG, &dbp,
+ DMU_READ_NO_PREFETCH));
dmu_buf_will_dirty(dbp, tx);
diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c
index 19cb414a9..310d4827b 100644
--- a/module/zfs/zfs_znode.c
+++ b/module/zfs/zfs_znode.c
@@ -62,6 +62,7 @@
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
#include <sys/refcount.h>
#include <sys/stat.h>
#include <sys/zap.h>
@@ -728,6 +729,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
timestruc_t now;
uint64_t gen, obj;
int bonuslen;
+ int dnodesize;
sa_handle_t *sa_hdl;
dmu_object_type_t obj_type;
sa_bulk_attr_t *sa_attrs;
@@ -739,15 +741,21 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
obj = vap->va_nodeid;
now = vap->va_ctime; /* see zfs_replay_create() */
gen = vap->va_nblocks; /* ditto */
+ dnodesize = vap->va_fsid; /* ditto */
} else {
obj = 0;
gethrestime(&now);
gen = dmu_tx_get_txg(tx);
+ dnodesize = dmu_objset_dnodesize(zsb->z_os);
}
+ if (dnodesize == 0)
+ dnodesize = DNODE_MIN_SIZE;
+
obj_type = zsb->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;
+
bonuslen = (obj_type == DMU_OT_SA) ?
- DN_MAX_BONUSLEN : ZFS_OLD_ZNODE_PHYS_SIZE;
+ DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE;
/*
* Create a new DMU object.
@@ -760,23 +768,23 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
*/
if (S_ISDIR(vap->va_mode)) {
if (zsb->z_replay) {
- VERIFY0(zap_create_claim_norm(zsb->z_os, obj,
+ VERIFY0(zap_create_claim_norm_dnsize(zsb->z_os, obj,
zsb->z_norm, DMU_OT_DIRECTORY_CONTENTS,
- obj_type, bonuslen, tx));
+ obj_type, bonuslen, dnodesize, tx));
} else {
- obj = zap_create_norm(zsb->z_os,
+ obj = zap_create_norm_dnsize(zsb->z_os,
zsb->z_norm, DMU_OT_DIRECTORY_CONTENTS,
- obj_type, bonuslen, tx);
+ obj_type, bonuslen, dnodesize, tx);
}
} else {
if (zsb->z_replay) {
- VERIFY0(dmu_object_claim(zsb->z_os, obj,
+ VERIFY0(dmu_object_claim_dnsize(zsb->z_os, obj,
DMU_OT_PLAIN_FILE_CONTENTS, 0,
- obj_type, bonuslen, tx));
+ obj_type, bonuslen, dnodesize, tx));
} else {
- obj = dmu_object_alloc(zsb->z_os,
+ obj = dmu_object_alloc_dnsize(zsb->z_os,
DMU_OT_PLAIN_FILE_CONTENTS, 0,
- obj_type, bonuslen, tx);
+ obj_type, bonuslen, dnodesize, tx);
}
}
@@ -948,6 +956,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
(*zpp)->z_pflags = pflags;
(*zpp)->z_mode = mode;
+ (*zpp)->z_dnodesize = dnodesize;
if (obj_type == DMU_OT_ZNODE ||
acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
@@ -1767,6 +1776,14 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
ASSERT(error == 0);
/*
+ * Give dmu_object_alloc() a hint about where to start
+ * allocating new objects. Otherwise, since the metadnode's
+ * dnode_phys_t structure isn't initialized yet, dmu_object_next()
+ * would fail and we'd have to skip to the next dnode block.
+ */
+ os->os_obj_next = moid + 1;
+
+ /*
* Set starting attributes.
*/
version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
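
The replay round trip above, compressed for reference (names taken from the surrounding hunks): the slot count recorded at create time is recovered from the log record and smuggled to zfs_mknode() through the otherwise unused va_fsid field.

    /* zfs_replay_create(): recover the logged dnode size. */
    xva.xva_vattr.va_fsid = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT;

    /* zfs_mknode(): use it during replay, the dataset default otherwise. */
    dnodesize = zsb->z_replay ? vap->va_fsid :
        dmu_objset_dnodesize(zsb->z_os);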
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index 289b23c7f..988ffec29 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -1372,7 +1372,8 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
itxg->itxg_sod += itx->itx_sod;
} else {
avl_tree_t *t = &itxs->i_async_tree;
- uint64_t foid = ((lr_ooo_t *)&itx->itx_lr)->lr_foid;
+ uint64_t foid =
+ LR_FOID_GET_OBJ(((lr_ooo_t *)&itx->itx_lr)->lr_foid);
itx_async_node_t *ian;
avl_index_t where;
@@ -1918,7 +1919,8 @@ zil_close(zilog_t *zilog)
mutex_exit(&zilog->zl_lock);
if (txg)
txg_wait_synced(zilog->zl_dmu_pool, txg);
- ASSERT(!zilog_is_dirty(zilog));
+ if (txg < spa_freeze_txg(zilog->zl_spa))
+ ASSERT(!zilog_is_dirty(zilog));
taskq_destroy(zilog->zl_clean_taskq);
zilog->zl_clean_taskq = NULL;
@@ -2122,7 +2124,7 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
*/
if (TX_OOO(txtype)) {
error = dmu_object_info(zilog->zl_os,
- ((lr_ooo_t *)lr)->lr_foid, NULL);
+ LR_FOID_GET_OBJ(((lr_ooo_t *)lr)->lr_foid), NULL);
if (error == ENOENT || error == EEXIST)
return (0);
}