aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--cmd/zdb/zdb.c17
-rw-r--r--cmd/zdb/zdb_il.c6
-rw-r--r--cmd/ztest/ztest.c202
-rw-r--r--configure.ac1
-rw-r--r--include/sys/dmu.h13
-rw-r--r--include/sys/dmu_objset.h1
-rw-r--r--include/sys/dnode.h44
-rw-r--r--include/sys/dsl_dataset.h7
-rw-r--r--include/sys/fs/zfs.h12
-rw-r--r--include/sys/sa_impl.h2
-rw-r--r--include/sys/spa.h1
-rw-r--r--include/sys/zap.h19
-rw-r--r--include/sys/zfs_ioctl.h6
-rw-r--r--include/sys/zfs_znode.h1
-rw-r--r--include/sys/zil.h13
-rw-r--r--include/zfeature_common.h1
-rw-r--r--lib/libzfs/libzfs_dataset.c1
-rw-r--r--man/man5/zpool-features.525
-rw-r--r--man/man8/zfs.829
-rw-r--r--module/zcommon/zfs_prop.c14
-rw-r--r--module/zcommon/zpool_prop.c2
-rw-r--r--module/zfs/dbuf.c60
-rw-r--r--module/zfs/dmu.c20
-rw-r--r--module/zfs/dmu_object.c124
-rw-r--r--module/zfs/dmu_objset.c47
-rw-r--r--module/zfs/dmu_send.c42
-rw-r--r--module/zfs/dmu_traverse.c8
-rw-r--r--module/zfs/dmu_tx.c4
-rw-r--r--module/zfs/dnode.c238
-rw-r--r--module/zfs/dnode_sync.c20
-rw-r--r--module/zfs/dsl_scan.c10
-rw-r--r--module/zfs/sa.c21
-rw-r--r--module/zfs/spa.c17
-rw-r--r--module/zfs/spa_misc.c10
-rw-r--r--module/zfs/zap.c10
-rw-r--r--module/zfs/zap_micro.c59
-rw-r--r--module/zfs/zfeature_common.c11
-rw-r--r--module/zfs/zfs_acl.c2
-rw-r--r--module/zfs/zfs_ioctl.c30
-rw-r--r--module/zfs/zfs_log.c2
-rw-r--r--module/zfs/zfs_replay.c32
-rw-r--r--module/zfs/zfs_sa.c7
-rw-r--r--module/zfs/zfs_znode.c35
-rw-r--r--module/zfs/zil.c8
-rw-r--r--tests/runfiles/linux.run5
-rw-r--r--tests/zfs-tests/include/default.cfg.in2
-rw-r--r--tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg2
-rw-r--r--tests/zfs-tests/tests/functional/delegate/delegate_common.kshlib19
-rwxr-xr-xtests/zfs-tests/tests/functional/delegate/zfs_allow_010_pos.ksh2
-rwxr-xr-xtests/zfs-tests/tests/functional/delegate/zfs_allow_012_neg.ksh4
-rw-r--r--tests/zfs-tests/tests/functional/features/Makefile.am4
-rw-r--r--tests/zfs-tests/tests/functional/features/large_dnode/Makefile.am11
-rwxr-xr-xtests/zfs-tests/tests/functional/features/large_dnode/cleanup.ksh25
-rwxr-xr-xtests/zfs-tests/tests/functional/features/large_dnode/large_dnode_001_pos.ksh77
-rwxr-xr-xtests/zfs-tests/tests/functional/features/large_dnode/large_dnode_002_pos.ksh78
-rwxr-xr-xtests/zfs-tests/tests/functional/features/large_dnode/large_dnode_003_pos.ksh65
-rwxr-xr-xtests/zfs-tests/tests/functional/features/large_dnode/large_dnode_004_neg.ksh59
-rwxr-xr-xtests/zfs-tests/tests/functional/features/large_dnode/large_dnode_005_pos.ksh64
-rwxr-xr-xtests/zfs-tests/tests/functional/features/large_dnode/large_dnode_006_pos.ksh58
-rwxr-xr-xtests/zfs-tests/tests/functional/features/large_dnode/large_dnode_007_neg.ksh56
-rwxr-xr-xtests/zfs-tests/tests/functional/features/large_dnode/setup.ksh27
-rw-r--r--tests/zfs-tests/tests/functional/rsend/rsend.kshlib4
-rwxr-xr-xtests/zfs-tests/tests/functional/rsend/rsend_012_pos.ksh1
-rw-r--r--zfs-script-config.sh.in2
64 files changed, 1575 insertions, 224 deletions
diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 3311a6e6f..ee5e21f10 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -1880,15 +1880,15 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
dnode_t *dn;
void *bonus = NULL;
size_t bsize = 0;
- char iblk[32], dblk[32], lsize[32], asize[32], fill[32];
+ char iblk[32], dblk[32], lsize[32], asize[32], fill[32], dnsize[32];
char bonus_size[32];
char aux[50];
int error;
if (*print_header) {
- (void) printf("\n%10s %3s %5s %5s %5s %5s %6s %s\n",
- "Object", "lvl", "iblk", "dblk", "dsize", "lsize",
- "%full", "type");
+ (void) printf("\n%10s %3s %5s %5s %5s %6s %5s %6s %s\n",
+ "Object", "lvl", "iblk", "dblk", "dsize", "dnsize",
+ "lsize", "%full", "type");
*print_header = 0;
}
@@ -1910,6 +1910,7 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
zdb_nicenum(doi.doi_max_offset, lsize);
zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize);
zdb_nicenum(doi.doi_bonus_size, bonus_size);
+ zdb_nicenum(doi.doi_dnodesize, dnsize);
(void) sprintf(fill, "%6.2f", 100.0 * doi.doi_fill_count *
doi.doi_data_block_size / (object == 0 ? DNODES_PER_BLOCK : 1) /
doi.doi_max_offset);
@@ -1926,13 +1927,13 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
ZDB_COMPRESS_NAME(doi.doi_compress));
}
- (void) printf("%10lld %3u %5s %5s %5s %5s %6s %s%s\n",
+ (void) printf("%10lld %3u %5s %5s %5s %6s %5s %6s %s%s\n",
(u_longlong_t)object, doi.doi_indirection, iblk, dblk,
- asize, lsize, fill, ZDB_OT_NAME(doi.doi_type), aux);
+ asize, dnsize, lsize, fill, ZDB_OT_NAME(doi.doi_type), aux);
if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) {
- (void) printf("%10s %3s %5s %5s %5s %5s %6s %s\n",
- "", "", "", "", "", bonus_size, "bonus",
+ (void) printf("%10s %3s %5s %5s %5s %5s %5s %6s %s\n",
+ "", "", "", "", "", "", bonus_size, "bonus",
ZDB_OT_NAME(doi.doi_bonus_type));
}
diff --git a/cmd/zdb/zdb_il.c b/cmd/zdb/zdb_il.c
index 93b905793..1501e879d 100644
--- a/cmd/zdb/zdb_il.c
+++ b/cmd/zdb/zdb_il.c
@@ -80,8 +80,10 @@ zil_prt_rec_create(zilog_t *zilog, int txtype, lr_create_t *lr)
}
(void) printf("%s%s", prefix, ctime(&crtime));
- (void) printf("%sdoid %llu, foid %llu, mode %llo\n", prefix,
- (u_longlong_t)lr->lr_doid, (u_longlong_t)lr->lr_foid,
+ (void) printf("%sdoid %llu, foid %llu, slots %llu, mode %llo\n", prefix,
+ (u_longlong_t)lr->lr_doid,
+ (u_longlong_t)LR_FOID_GET_OBJ(lr->lr_foid),
+ (u_longlong_t)LR_FOID_GET_SLOTS(lr->lr_foid),
(longlong_t)lr->lr_mode);
(void) printf("%suid %llu, gid %llu, gen %llu, rdev 0x%llx\n", prefix,
(u_longlong_t)lr->lr_uid, (u_longlong_t)lr->lr_gid,
diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c
index e6d6e9803..ad15dea1d 100644
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
@@ -221,6 +221,7 @@ typedef struct ztest_block_tag {
uint64_t bt_magic;
uint64_t bt_objset;
uint64_t bt_object;
+ uint64_t bt_dnodesize;
uint64_t bt_offset;
uint64_t bt_gen;
uint64_t bt_txg;
@@ -258,6 +259,7 @@ typedef struct ztest_od {
dmu_object_type_t od_crtype;
uint64_t od_blocksize;
uint64_t od_crblocksize;
+ uint64_t od_crdnodesize;
uint64_t od_gen;
uint64_t od_crgen;
char od_name[MAXNAMELEN];
@@ -329,6 +331,7 @@ ztest_func_t ztest_split_pool;
ztest_func_t ztest_reguid;
ztest_func_t ztest_spa_upgrade;
ztest_func_t ztest_fletcher;
+ztest_func_t ztest_verify_dnode_bt;
uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */
uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */
@@ -375,6 +378,7 @@ ztest_info_t ztest_info[] = {
ZTI_INIT(ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime),
ZTI_INIT(ztest_vdev_aux_add_remove, 1, &ztest_opts.zo_vdevtime),
ZTI_INIT(ztest_fletcher, 1, &zopt_rarely),
+ ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes),
};
#define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t))
@@ -1028,6 +1032,36 @@ ztest_random_blocksize(void)
}
static int
+ztest_random_dnodesize(void)
+{
+ int slots;
+ int max_slots = spa_maxdnodesize(ztest_spa) >> DNODE_SHIFT;
+
+ if (max_slots == DNODE_MIN_SLOTS)
+ return (DNODE_MIN_SIZE);
+
+ /*
+ * Weight the random distribution more heavily toward smaller
+ * dnode sizes since that is more likely to reflect real-world
+ * usage.
+ */
+ ASSERT3U(max_slots, >, 4);
+ switch (ztest_random(10)) {
+ case 0:
+ slots = 5 + ztest_random(max_slots - 4);
+ break;
+ case 1 ... 4:
+ slots = 2 + ztest_random(3);
+ break;
+ default:
+ slots = 1;
+ break;
+ }
+
+ return (slots << DNODE_SHIFT);
+}
+
+static int
ztest_random_ibshift(void)
{
return (DN_MIN_INDBLKSHIFT +
@@ -1417,11 +1451,13 @@ ztest_pattern_match(void *buf, uint64_t size, uint64_t value)
static void
ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
- uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg)
+ uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg,
+ uint64_t crtxg)
{
bt->bt_magic = BT_MAGIC;
bt->bt_objset = dmu_objset_id(os);
bt->bt_object = object;
+ bt->bt_dnodesize = dnodesize;
bt->bt_offset = offset;
bt->bt_gen = gen;
bt->bt_txg = txg;
@@ -1430,11 +1466,13 @@ ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
static void
ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
- uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg)
+ uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg,
+ uint64_t crtxg)
{
ASSERT3U(bt->bt_magic, ==, BT_MAGIC);
ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os));
ASSERT3U(bt->bt_object, ==, object);
+ ASSERT3U(bt->bt_dnodesize, ==, dnodesize);
ASSERT3U(bt->bt_offset, ==, offset);
ASSERT3U(bt->bt_gen, <=, gen);
ASSERT3U(bt->bt_txg, <=, txg);
@@ -1456,6 +1494,52 @@ ztest_bt_bonus(dmu_buf_t *db)
}
/*
+ * Generate a token to fill up unused bonus buffer space. Try to make
+ * it unique to the objset, object, generation, and offset to verify
+ * that data is not getting overwritten by data from other dnodes.
+ */
+#define ZTEST_BONUS_FILL_TOKEN(obj, ds, gen, offset) \
+ (((ds) << 48) | ((gen) << 32) | ((obj) << 8) | (offset))
+
+/*
+ * Fill up the unused bonus buffer region before the block tag with a
+ * verifiable pattern. Filling the whole bonus area with non-zero data
+ * helps ensure that all dnode traversal code properly skips the
+ * interior regions of large dnodes.
+ */
+void
+ztest_fill_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj,
+ objset_t *os, uint64_t gen)
+{
+ uint64_t *bonusp;
+
+ ASSERT(IS_P2ALIGNED((char *)end - (char *)db->db_data, 8));
+
+ for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) {
+ uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os),
+ gen, bonusp - (uint64_t *)db->db_data);
+ *bonusp = token;
+ }
+}
+
+/*
+ * Verify that the unused area of a bonus buffer is filled with the
+ * expected tokens.
+ */
+void
+ztest_verify_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj,
+ objset_t *os, uint64_t gen)
+{
+ uint64_t *bonusp;
+
+ for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) {
+ uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os),
+ gen, bonusp - (uint64_t *)db->db_data);
+ VERIFY3U(*bonusp, ==, token);
+ }
+}
+
+/*
* ZIL logging ops
*/
@@ -1463,7 +1547,7 @@ ztest_bt_bonus(dmu_buf_t *db)
#define lrz_blocksize lr_uid
#define lrz_ibshift lr_gid
#define lrz_bonustype lr_rdev
-#define lrz_bonuslen lr_crtime[1]
+#define lrz_dnodesize lr_crtime[1]
static void
ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr)
@@ -1578,6 +1662,7 @@ ztest_replay_create(ztest_ds_t *zd, lr_create_t *lr, boolean_t byteswap)
dmu_tx_t *tx;
uint64_t txg;
int error = 0;
+ int bonuslen;
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
@@ -1600,26 +1685,27 @@ ztest_replay_create(ztest_ds_t *zd, lr_create_t *lr, boolean_t byteswap)
return (ENOSPC);
ASSERT(dmu_objset_zil(os)->zl_replay == !!lr->lr_foid);
+ bonuslen = DN_BONUS_SIZE(lr->lrz_dnodesize);
if (lr->lrz_type == DMU_OT_ZAP_OTHER) {
if (lr->lr_foid == 0) {
- lr->lr_foid = zap_create(os,
+ lr->lr_foid = zap_create_dnsize(os,
lr->lrz_type, lr->lrz_bonustype,
- lr->lrz_bonuslen, tx);
+ bonuslen, lr->lrz_dnodesize, tx);
} else {
- error = zap_create_claim(os, lr->lr_foid,
+ error = zap_create_claim_dnsize(os, lr->lr_foid,
lr->lrz_type, lr->lrz_bonustype,
- lr->lrz_bonuslen, tx);
+ bonuslen, lr->lrz_dnodesize, tx);
}
} else {
if (lr->lr_foid == 0) {
- lr->lr_foid = dmu_object_alloc(os,
+ lr->lr_foid = dmu_object_alloc_dnsize(os,
lr->lrz_type, 0, lr->lrz_bonustype,
- lr->lrz_bonuslen, tx);
+ bonuslen, lr->lrz_dnodesize, tx);
} else {
- error = dmu_object_claim(os, lr->lr_foid,
+ error = dmu_object_claim_dnsize(os, lr->lr_foid,
lr->lrz_type, 0, lr->lrz_bonustype,
- lr->lrz_bonuslen, tx);
+ bonuslen, lr->lrz_dnodesize, tx);
}
}
@@ -1639,7 +1725,9 @@ ztest_replay_create(ztest_ds_t *zd, lr_create_t *lr, boolean_t byteswap)
VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
bbt = ztest_bt_bonus(db);
dmu_buf_will_dirty(db, tx);
- ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_gen, txg, txg);
+ ztest_bt_generate(bbt, os, lr->lr_foid, lr->lrz_dnodesize, -1ULL,
+ lr->lr_gen, txg, txg);
+ ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, lr->lr_gen);
dmu_buf_rele(db, FTAG);
VERIFY3U(0, ==, zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1,
@@ -1785,7 +1873,7 @@ ztest_replay_write(ztest_ds_t *zd, lr_write_t *lr, boolean_t byteswap)
VERIFY(dmu_read(os, lr->lr_foid, offset,
sizeof (rbt), &rbt, prefetch) == 0);
if (rbt.bt_magic == BT_MAGIC) {
- ztest_bt_verify(&rbt, os, lr->lr_foid,
+ ztest_bt_verify(&rbt, os, lr->lr_foid, 0,
offset, gen, txg, crtxg);
}
}
@@ -1797,7 +1885,7 @@ ztest_replay_write(ztest_ds_t *zd, lr_write_t *lr, boolean_t byteswap)
* as it was when the write was generated.
*/
if (zd->zd_zilog->zl_replay) {
- ztest_bt_verify(bt, os, lr->lr_foid, offset,
+ ztest_bt_verify(bt, os, lr->lr_foid, 0, offset,
MAX(gen, bt->bt_gen), MAX(txg, lrtxg),
bt->bt_crtxg);
}
@@ -1806,7 +1894,8 @@ ztest_replay_write(ztest_ds_t *zd, lr_write_t *lr, boolean_t byteswap)
* Set the bt's gen/txg to the bonus buffer's gen/txg
* so that all of the usual ASSERTs will work.
*/
- ztest_bt_generate(bt, os, lr->lr_foid, offset, gen, txg, crtxg);
+ ztest_bt_generate(bt, os, lr->lr_foid, 0, offset, gen, txg,
+ crtxg);
}
if (abuf == NULL) {
@@ -1874,7 +1963,7 @@ ztest_replay_setattr(ztest_ds_t *zd, lr_setattr_t *lr, boolean_t byteswap)
dmu_tx_t *tx;
dmu_buf_t *db;
ztest_block_tag_t *bbt;
- uint64_t txg, lrtxg, crtxg;
+ uint64_t txg, lrtxg, crtxg, dnodesize;
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
@@ -1897,6 +1986,7 @@ ztest_replay_setattr(ztest_ds_t *zd, lr_setattr_t *lr, boolean_t byteswap)
ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
crtxg = bbt->bt_crtxg;
lrtxg = lr->lr_common.lrc_txg;
+ dnodesize = bbt->bt_dnodesize;
if (zd->zd_zilog->zl_replay) {
ASSERT(lr->lr_size != 0);
@@ -1915,7 +2005,7 @@ ztest_replay_setattr(ztest_ds_t *zd, lr_setattr_t *lr, boolean_t byteswap)
/*
* Verify that the current bonus buffer is not newer than our txg.
*/
- ztest_bt_verify(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode,
+ ztest_bt_verify(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode,
MAX(txg, lrtxg), crtxg);
dmu_buf_will_dirty(db, tx);
@@ -1925,8 +2015,9 @@ ztest_replay_setattr(ztest_ds_t *zd, lr_setattr_t *lr, boolean_t byteswap)
VERIFY0(dmu_set_bonus(db, lr->lr_size, tx));
bbt = ztest_bt_bonus(db);
- ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode, txg, crtxg);
-
+ ztest_bt_generate(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode,
+ txg, crtxg);
+ ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, bbt->bt_gen);
dmu_buf_rele(db, FTAG);
(void) ztest_log_setattr(zd, tx, lr);
@@ -2171,7 +2262,7 @@ ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count)
lr->lrz_blocksize = od->od_crblocksize;
lr->lrz_ibshift = ztest_random_ibshift();
lr->lrz_bonustype = DMU_OT_UINT64_OTHER;
- lr->lrz_bonuslen = dmu_bonus_max();
+ lr->lrz_dnodesize = od->od_crdnodesize;
lr->lr_gen = od->od_crgen;
lr->lr_crtime[0] = time(NULL);
@@ -2351,7 +2442,8 @@ ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset)
switch (io_type) {
case ZTEST_IO_WRITE_TAG:
- ztest_bt_generate(&wbt, zd->zd_os, object, offset, 0, 0, 0);
+ ztest_bt_generate(&wbt, zd->zd_os, object, doi.doi_dnodesize,
+ offset, 0, 0, 0);
(void) ztest_write(zd, object, offset, sizeof (wbt), &wbt);
break;
@@ -2414,13 +2506,15 @@ ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset)
*/
static void
ztest_od_init(ztest_od_t *od, uint64_t id, char *tag, uint64_t index,
- dmu_object_type_t type, uint64_t blocksize, uint64_t gen)
+ dmu_object_type_t type, uint64_t blocksize, uint64_t dnodesize,
+ uint64_t gen)
{
od->od_dir = ZTEST_DIROBJ;
od->od_object = 0;
od->od_crtype = type;
od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize();
+ od->od_crdnodesize = dnodesize ? dnodesize : ztest_random_dnodesize();
od->od_crgen = gen;
od->od_type = DMU_OT_NONE;
@@ -3740,7 +3834,8 @@ ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id)
batchsize = OD_ARRAY_SIZE;
for (b = 0; b < batchsize; b++)
- ztest_od_init(od + b, id, FTAG, b, DMU_OT_UINT64_OTHER, 0, 0);
+ ztest_od_init(od + b, id, FTAG, b, DMU_OT_UINT64_OTHER,
+ 0, 0, 0);
/*
* Destroy the previous batch of objects, create a new batch,
@@ -3809,8 +3904,9 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id)
/*
* Read the directory info. If it's the first time, set things up.
*/
- ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, chunksize);
- ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, chunksize);
+ ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, chunksize);
+ ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0,
+ chunksize);
if (ztest_object_init(zd, od, size, B_FALSE) != 0) {
umem_free(od, size);
@@ -4090,8 +4186,9 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
/*
* Read the directory info. If it's the first time, set things up.
*/
- ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0);
- ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, chunksize);
+ ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0);
+ ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0,
+ chunksize);
if (ztest_object_init(zd, od, size, B_FALSE) != 0) {
@@ -4299,7 +4396,7 @@ ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id)
* to verify that parallel writes to an object -- even to the
* same blocks within the object -- doesn't cause any trouble.
*/
- ztest_od_init(od, ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);
+ ztest_od_init(od, ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0);
if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0)
return;
@@ -4322,7 +4419,7 @@ ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id)
od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL);
- ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0);
+ ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0);
if (ztest_object_init(zd, od, sizeof (ztest_od_t),
!ztest_random(2)) != 0) {
@@ -4375,7 +4472,7 @@ ztest_zap(ztest_ds_t *zd, uint64_t id)
char *hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" };
od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL);
- ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0);
+ ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0);
if (ztest_object_init(zd, od, sizeof (ztest_od_t),
!ztest_random(2)) != 0)
@@ -4512,7 +4609,7 @@ ztest_fzap(ztest_ds_t *zd, uint64_t id)
int i;
od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL);
- ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0);
+ ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0);
if (ztest_object_init(zd, od, sizeof (ztest_od_t),
!ztest_random(2)) != 0)
@@ -4561,7 +4658,7 @@ ztest_zap_parallel(ztest_ds_t *zd, uint64_t id)
void *data;
od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL);
- ztest_od_init(od, ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0);
+ ztest_od_init(od, ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0, 0);
if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) {
umem_free(od, sizeof (ztest_od_t));
@@ -4750,7 +4847,7 @@ ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id)
int i, error = 0;
od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL);
- ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);
+ ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0);
if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) {
umem_free(od, sizeof (ztest_od_t));
@@ -4871,6 +4968,41 @@ ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id)
umem_free(od, sizeof (ztest_od_t));
}
+/*
+ * Visit each object in the dataset. Verify that its properties
+ * are consistent with what was stored in the block tag when it was created,
+ * and that its unused bonus buffer space has not been overwritten.
+ */
+void
+ztest_verify_dnode_bt(ztest_ds_t *zd, uint64_t id)
+{
+ objset_t *os = zd->zd_os;
+ uint64_t obj;
+ int err = 0;
+
+ for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) {
+ ztest_block_tag_t *bt = NULL;
+ dmu_object_info_t doi;
+ dmu_buf_t *db;
+
+ if (dmu_bonus_hold(os, obj, FTAG, &db) != 0)
+ continue;
+
+ dmu_object_info_from_db(db, &doi);
+ if (doi.doi_bonus_size >= sizeof (*bt))
+ bt = ztest_bt_bonus(db);
+
+ if (bt && bt->bt_magic == BT_MAGIC) {
+ ztest_bt_verify(bt, os, obj, doi.doi_dnodesize,
+ bt->bt_offset, bt->bt_gen, bt->bt_txg,
+ bt->bt_crtxg);
+ ztest_verify_unused_bonus(db, bt, obj, os, bt->bt_gen);
+ }
+
+ dmu_buf_rele(db, FTAG);
+ }
+}
+
/* ARGSUSED */
void
ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id)
@@ -5317,7 +5449,7 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
blocksize = MIN(blocksize, 2048); /* because we write so many */
od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL);
- ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0);
+ ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0);
if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) {
umem_free(od, sizeof (ztest_od_t));
@@ -6194,7 +6326,7 @@ ztest_freeze(void)
numloops++ < ztest_opts.zo_maxloops &&
metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) {
ztest_od_t od;
- ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);
+ ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0);
VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE));
ztest_io(zd, od.od_object,
ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
diff --git a/configure.ac b/configure.ac
index 99c25bc27..9264d6bc6 100644
--- a/configure.ac
+++ b/configure.ac
@@ -230,6 +230,7 @@ AC_CONFIG_FILES([
tests/zfs-tests/tests/functional/devices/Makefile
tests/zfs-tests/tests/functional/exec/Makefile
tests/zfs-tests/tests/functional/features/async_destroy/Makefile
+ tests/zfs-tests/tests/functional/features/large_dnode/Makefile
tests/zfs-tests/tests/functional/features/Makefile
tests/zfs-tests/tests/functional/grow_pool/Makefile
tests/zfs-tests/tests/functional/grow_replicas/Makefile
diff --git a/include/sys/dmu.h b/include/sys/dmu.h
index 64d28334e..74001d8f3 100644
--- a/include/sys/dmu.h
+++ b/include/sys/dmu.h
@@ -334,10 +334,19 @@ typedef struct dmu_buf {
*/
uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot,
int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
+uint64_t dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonus_type, int bonus_len,
+ int dnodesize, dmu_tx_t *tx);
int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
+int dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonus_type, int bonus_len,
+ int dnodesize, dmu_tx_t *tx);
int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *txp);
+int dmu_object_reclaim_dnsize(objset_t *os, uint64_t object,
+ dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype,
+ int bonuslen, int dnodesize, dmu_tx_t *txp);
/*
* Free an object from this objset.
@@ -756,6 +765,7 @@ typedef struct dmu_object_info {
uint8_t doi_compress;
uint8_t doi_nblkptr;
uint8_t doi_pad[4];
+ uint64_t doi_dnodesize;
uint64_t doi_physical_blocks_512; /* data + metadata, 512b blks */
uint64_t doi_max_offset;
uint64_t doi_fill_count; /* number of non-empty blocks */
@@ -797,6 +807,8 @@ void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi);
void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize,
u_longlong_t *nblk512);
+void dmu_object_dnsize_from_db(dmu_buf_t *db, int *dnsize);
+
typedef struct dmu_objset_stats {
uint64_t dds_num_clones; /* number of clones of this */
uint64_t dds_creation_txg;
@@ -854,6 +866,7 @@ extern struct dsl_dataset *dmu_objset_ds(objset_t *os);
extern void dmu_objset_name(objset_t *os, char *buf);
extern dmu_objset_type_t dmu_objset_type(objset_t *os);
extern uint64_t dmu_objset_id(objset_t *os);
+extern uint64_t dmu_objset_dnodesize(objset_t *os);
extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os);
extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os);
extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h
index 0f03699f1..81bb89bfc 100644
--- a/include/sys/dmu_objset.h
+++ b/include/sys/dmu_objset.h
@@ -88,6 +88,7 @@ struct objset {
list_node_t os_evicting_node;
/* can change, under dsl_dir's locks: */
+ uint64_t os_dnodesize; /* default dnode size for new objects */
enum zio_checksum os_checksum;
enum zio_compress os_compress;
uint8_t os_copies;
diff --git a/include/sys/dnode.h b/include/sys/dnode.h
index c5250d51a..cee4ea783 100644
--- a/include/sys/dnode.h
+++ b/include/sys/dnode.h
@@ -79,11 +79,18 @@ extern "C" {
/*
* Derived constants.
*/
-#define DNODE_SIZE (1 << DNODE_SHIFT)
-#define DN_MAX_NBLKPTR ((DNODE_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT)
-#define DN_MAX_BONUSLEN (DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT))
+#define DNODE_MIN_SIZE (1 << DNODE_SHIFT)
+#define DNODE_MAX_SIZE (1 << DNODE_BLOCK_SHIFT)
+#define DNODE_BLOCK_SIZE (1 << DNODE_BLOCK_SHIFT)
+#define DNODE_MIN_SLOTS (DNODE_MIN_SIZE >> DNODE_SHIFT)
+#define DNODE_MAX_SLOTS (DNODE_MAX_SIZE >> DNODE_SHIFT)
+#define DN_BONUS_SIZE(dnsize) ((dnsize) - DNODE_CORE_SIZE - \
+ (1 << SPA_BLKPTRSHIFT))
+#define DN_SLOTS_TO_BONUSLEN(slots) DN_BONUS_SIZE((slots) << DNODE_SHIFT)
+#define DN_OLD_MAX_BONUSLEN (DN_BONUS_SIZE(DNODE_MIN_SIZE))
+#define DN_MAX_NBLKPTR ((DNODE_MIN_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT)
#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT)
-#define DN_ZERO_BONUSLEN (DN_MAX_BONUSLEN + 1)
+#define DN_ZERO_BONUSLEN (DN_BONUS_SIZE(DNODE_MAX_SIZE) + 1)
#define DN_KILL_SPILLBLK (1)
#define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT)
@@ -131,7 +138,8 @@ typedef struct dnode_phys {
uint8_t dn_flags; /* DNODE_FLAG_* */
uint16_t dn_datablkszsec; /* data block size in 512b sectors */
uint16_t dn_bonuslen; /* length of dn_bonus */
- uint8_t dn_pad2[4];
+ uint8_t dn_extra_slots; /* # of subsequent slots consumed */
+ uint8_t dn_pad2[3];
/* accounting is protected by dn_dirty_mtx */
uint64_t dn_maxblkid; /* largest allocated block ID */
@@ -140,8 +148,11 @@ typedef struct dnode_phys {
uint64_t dn_pad3[4];
/*
- * The tail region is 448 bytes, and there are three ways to
- * look at it.
+ * The tail region is 448 bytes for a 512 byte dnode, and
+ * correspondingly larger for larger dnode sizes. The spill
+ * block pointer, when present, is always at the end of the tail
+ * region. There are three ways this space may be used, using
+ * a 512 byte dnode for this diagram:
*
* 0 64 128 192 256 320 384 448 (offset)
* +---------------+---------------+---------------+-------+
@@ -149,23 +160,27 @@ typedef struct dnode_phys {
* +---------------+---------------+---------------+-------+
* | dn_blkptr[0] | dn_bonus[0..319] |
* +---------------+-----------------------+---------------+
- * | dn_blkptr[0] | / | dn_spill |
+ * | dn_blkptr[0] | dn_bonus[0..191] | dn_spill |
* +---------------+-----------------------+---------------+
*/
union {
- blkptr_t dn_blkptr[1+DN_MAX_BONUSLEN/sizeof (blkptr_t)];
+ blkptr_t dn_blkptr[1+DN_OLD_MAX_BONUSLEN/sizeof (blkptr_t)];
struct {
blkptr_t __dn_ignore1;
- uint8_t dn_bonus[DN_MAX_BONUSLEN];
+ uint8_t dn_bonus[DN_OLD_MAX_BONUSLEN];
};
struct {
blkptr_t __dn_ignore2;
- uint8_t __dn_ignore3[DN_MAX_BONUSLEN-sizeof (blkptr_t)];
+ uint8_t __dn_ignore3[DN_OLD_MAX_BONUSLEN -
+ sizeof (blkptr_t)];
blkptr_t dn_spill;
};
};
} dnode_phys_t;
+#define DN_SPILL_BLKPTR(dnp) (blkptr_t *)((char *)(dnp) + \
+ (((dnp)->dn_extra_slots + 1) << DNODE_SHIFT) - (1 << SPA_BLKPTRSHIFT))
+
typedef struct dnode {
/*
* Protects the structure of the dnode, including the number of levels
@@ -202,6 +217,7 @@ typedef struct dnode {
uint32_t dn_datablksz; /* in bytes */
uint64_t dn_maxblkid;
uint8_t dn_next_type[TXG_SIZE];
+ uint8_t dn_num_slots; /* metadnode slots consumed on disk */
uint8_t dn_next_nblkptr[TXG_SIZE];
uint8_t dn_next_nlevels[TXG_SIZE];
uint8_t dn_next_indblkshift[TXG_SIZE];
@@ -299,7 +315,7 @@ void dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx);
int dnode_hold(struct objset *dd, uint64_t object,
void *ref, dnode_t **dnp);
-int dnode_hold_impl(struct objset *dd, uint64_t object, int flag,
+int dnode_hold_impl(struct objset *dd, uint64_t object, int flag, int dn_slots,
void *ref, dnode_t **dnp);
boolean_t dnode_add_ref(dnode_t *dn, void *ref);
void dnode_rele(dnode_t *dn, void *ref);
@@ -307,9 +323,9 @@ void dnode_rele_and_unlock(dnode_t *dn, void *tag);
void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
void dnode_sync(dnode_t *dn, dmu_tx_t *tx);
void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+ dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx);
void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+ dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx);
void dnode_free(dnode_t *dn, dmu_tx_t *tx);
void dnode_byteswap(dnode_phys_t *dnp);
void dnode_buf_byteswap(void *buf, size_t size);
diff --git a/include/sys/dsl_dataset.h b/include/sys/dsl_dataset.h
index a596642e3..f3e64a772 100644
--- a/include/sys/dsl_dataset.h
+++ b/include/sys/dsl_dataset.h
@@ -92,6 +92,13 @@ struct dsl_pool;
#define DS_FIELD_LARGE_BLOCKS "org.open-zfs:large_blocks"
/*
+ * This field is present (with value=0) if this dataset may contain large
+ * dnodes (>512B). If it is present, then this dataset is counted in the
+ * refcount of the SPA_FEATURE_LARGE_DNODE feature.
+ */
+#define DS_FIELD_LARGE_DNODE "org.zfsonlinux:large_dnode"
+
+/*
* DS_FLAG_CI_DATASET is set if the dataset contains a file system whose
* name lookups should be performed case-insensitively.
*/
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index 9de50e9a2..0935dca77 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -137,6 +137,7 @@ typedef enum {
ZFS_PROP_DEDUP,
ZFS_PROP_MLSLABEL,
ZFS_PROP_SYNC,
+ ZFS_PROP_DNODESIZE,
ZFS_PROP_REFRATIO,
ZFS_PROP_WRITTEN,
ZFS_PROP_CLONES,
@@ -204,6 +205,7 @@ typedef enum {
ZPOOL_PROP_LEAKED,
ZPOOL_PROP_MAXBLOCKSIZE,
ZPOOL_PROP_TNAME,
+ ZPOOL_PROP_MAXDNODESIZE,
ZPOOL_NUM_PROPS
} zpool_prop_t;
@@ -362,6 +364,16 @@ typedef enum {
} zfs_xattr_type_t;
typedef enum {
+ ZFS_DNSIZE_LEGACY = 0,
+ ZFS_DNSIZE_AUTO = 1,
+ ZFS_DNSIZE_1K = 1024,
+ ZFS_DNSIZE_2K = 2048,
+ ZFS_DNSIZE_4K = 4096,
+ ZFS_DNSIZE_8K = 8192,
+ ZFS_DNSIZE_16K = 16384
+} zfs_dnsize_type_t;
+
+typedef enum {
ZFS_REDUNDANT_METADATA_ALL,
ZFS_REDUNDANT_METADATA_MOST
} zfs_redundant_metadata_type_t;
diff --git a/include/sys/sa_impl.h b/include/sys/sa_impl.h
index 6f2f1db6d..b68b7610b 100644
--- a/include/sys/sa_impl.h
+++ b/include/sys/sa_impl.h
@@ -235,7 +235,7 @@ struct sa_handle {
#define SA_BONUSTYPE_FROM_DB(db) \
(dmu_get_bonustype((dmu_buf_t *)db))
-#define SA_BLKPTR_SPACE (DN_MAX_BONUSLEN - sizeof (blkptr_t))
+#define SA_BLKPTR_SPACE (DN_OLD_MAX_BONUSLEN - sizeof (blkptr_t))
#define SA_LAYOUT_NUM(x, type) \
((!IS_SA_BONUSTYPE(type) ? 0 : (((IS_SA_BONUSTYPE(type)) && \
diff --git a/include/sys/spa.h b/include/sys/spa.h
index 7c09caf6c..17bf76de8 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -822,6 +822,7 @@ extern boolean_t spa_is_root(spa_t *spa);
extern boolean_t spa_writeable(spa_t *spa);
extern boolean_t spa_has_pending_synctask(spa_t *spa);
extern int spa_maxblocksize(spa_t *spa);
+extern int spa_maxdnodesize(spa_t *spa);
extern void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp);
extern int spa_mode(spa_t *spa);
diff --git a/include/sys/zap.h b/include/sys/zap.h
index ed60b86db..70cbaae97 100644
--- a/include/sys/zap.h
+++ b/include/sys/zap.h
@@ -129,16 +129,30 @@ typedef enum zap_flags {
* MT_FIRST/MT_BEST matching will find entries that match without
* regard to case (eg. looking for "foo" can find an entry "Foo").
* Eventually, other flags will permit unicode normalization as well.
+ *
+ * dnodesize specifies the on-disk size of the dnode for the new zapobj.
+ * Valid values are multiples of 512 up to DNODE_MAX_SIZE.
*/
uint64_t zap_create(objset_t *ds, dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+uint64_t zap_create_dnsize(objset_t *ds, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx);
uint64_t zap_create_norm(objset_t *ds, int normflags, dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+uint64_t zap_create_norm_dnsize(objset_t *ds, int normflags,
+ dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen,
+ int dnodesize, dmu_tx_t *tx);
uint64_t zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+uint64_t zap_create_flags_dnsize(objset_t *os, int normflags,
+ zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift,
+ int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
+ int dnodesize, dmu_tx_t *tx);
uint64_t zap_create_link(objset_t *os, dmu_object_type_t ot,
uint64_t parent_obj, const char *name, dmu_tx_t *tx);
+uint64_t zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot,
+ uint64_t parent_obj, const char *name, int dnodesize, dmu_tx_t *tx);
/*
* Initialize an already-allocated object.
@@ -152,9 +166,14 @@ void mzap_create_impl(objset_t *os, uint64_t obj, int normflags,
*/
int zap_create_claim(objset_t *ds, uint64_t obj, dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+int zap_create_claim_dnsize(objset_t *ds, uint64_t obj, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx);
int zap_create_claim_norm(objset_t *ds, uint64_t obj,
int normflags, dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+int zap_create_claim_norm_dnsize(objset_t *ds, uint64_t obj,
+ int normflags, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx);
/*
* The zapobj passed in must be a valid ZAP object for all of the
diff --git a/include/sys/zfs_ioctl.h b/include/sys/zfs_ioctl.h
index ac70f690e..301f83232 100644
--- a/include/sys/zfs_ioctl.h
+++ b/include/sys/zfs_ioctl.h
@@ -98,6 +98,7 @@ typedef enum drr_headertype {
#define DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 (1<<17)
/* flag #18 is reserved for a Delphix feature */
#define DMU_BACKUP_FEATURE_LARGE_BLOCKS (1<<19)
+#define DMU_BACKUP_FEATURE_LARGE_DNODE (1<<20)
/*
* Mask of all supported backup features
@@ -105,7 +106,7 @@ typedef enum drr_headertype {
#define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \
DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL | \
DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 | \
- DMU_BACKUP_FEATURE_LARGE_BLOCKS)
+ DMU_BACKUP_FEATURE_LARGE_BLOCKS | DMU_BACKUP_FEATURE_LARGE_DNODE)
/* Are all features in the given flag word currently supported? */
#define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK))
@@ -173,7 +174,8 @@ typedef struct dmu_replay_record {
uint32_t drr_bonuslen;
uint8_t drr_checksumtype;
uint8_t drr_compress;
- uint8_t drr_pad[6];
+ uint8_t drr_dn_slots;
+ uint8_t drr_pad[5];
uint64_t drr_toguid;
/* bonus content follows */
} drr_object;
diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h
index 43b591915..2c09feee1 100644
--- a/include/sys/zfs_znode.h
+++ b/include/sys/zfs_znode.h
@@ -196,6 +196,7 @@ typedef struct znode {
uint_t z_blksz; /* block size in bytes */
uint_t z_seq; /* modification sequence number */
uint64_t z_mapcnt; /* number of pages mapped to file */
+ uint64_t z_dnodesize; /* dnode size */
uint64_t z_size; /* file size (cached) */
uint64_t z_links; /* file links (cached) */
uint64_t z_pflags; /* pflags (cached) */
diff --git a/include/sys/zil.h b/include/sys/zil.h
index 65b14f1cd..1f04d1833 100644
--- a/include/sys/zil.h
+++ b/include/sys/zil.h
@@ -173,6 +173,19 @@ typedef enum zil_create {
(txtype) == TX_WRITE2)
/*
+ * The number of dnode slots consumed by the object is stored in the 8
+ * unused upper bits of the object ID. The value stored on disk is the
+ * slot count minus 1, for compatibility with implementations that
+ * don't support large dnodes. A single-slot dnode therefore stores 0
+ * in those bits, which preserves the log record format for "small"
+ * dnodes.
+ */
+#define LR_FOID_GET_SLOTS(oid) (BF64_GET((oid), 56, 8) + 1)
+#define LR_FOID_SET_SLOTS(oid, x) BF64_SET((oid), 56, 8, (x) - 1)
+#define LR_FOID_GET_OBJ(oid) BF64_GET((oid), 0, DN_MAX_OBJECT_SHIFT)
+#define LR_FOID_SET_OBJ(oid, x) BF64_SET((oid), 0, DN_MAX_OBJECT_SHIFT, (x))
+
+/*
* Format of log records.
* The fields are carefully defined to allow them to be aligned
* and sized the same on sparc & intel architectures.
diff --git a/include/zfeature_common.h b/include/zfeature_common.h
index d481a28a8..41cfdf807 100644
--- a/include/zfeature_common.h
+++ b/include/zfeature_common.h
@@ -50,6 +50,7 @@ typedef enum spa_feature {
SPA_FEATURE_BOOKMARKS,
SPA_FEATURE_FS_SS_LIMIT,
SPA_FEATURE_LARGE_BLOCKS,
+ SPA_FEATURE_LARGE_DNODE,
SPA_FEATURES
} spa_feature_t;
diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c
index 87f79a532..07f2e75a6 100644
--- a/lib/libzfs/libzfs_dataset.c
+++ b/lib/libzfs/libzfs_dataset.c
@@ -1473,6 +1473,7 @@ zfs_setprop_error(libzfs_handle_t *hdl, zfs_prop_t prop, int err,
case ERANGE:
if (prop == ZFS_PROP_COMPRESSION ||
+ prop == ZFS_PROP_DNODESIZE ||
prop == ZFS_PROP_RECORDSIZE) {
(void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"property setting is not allowed on "
diff --git a/man/man5/zpool-features.5 b/man/man5/zpool-features.5
index 6d74c9a78..fa04d6e81 100644
--- a/man/man5/zpool-features.5
+++ b/man/man5/zpool-features.5
@@ -432,5 +432,30 @@ set larger than 128KB, and will return to being \fBenabled\fR once all
filesystems that have ever had their recordsize larger than 128KB are destroyed.
.RE
+.sp
+.ne 2
+.na
+\fB\fBlarge_dnode\fR\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID org.zfsonlinux:large_dnode
+READ\-ONLY COMPATIBLE no
+DEPENDENCIES extensible_dataset
+.TE
+
+The \fBlarge_dnode\fR feature allows the size of dnodes in a dataset to be
+set larger than 512B.
+
+This feature becomes \fBactive\fR once a dataset contains an object with
+a dnode larger than 512B, which occurs as a result of setting the
+\fBdnodesize\fR dataset property to a value other than \fBlegacy\fR. The
+feature will return to being \fBenabled\fR once all filesystems that
+have ever contained a dnode larger than 512B are destroyed. Large dnodes
+allow more data to be stored in the bonus buffer, thus potentially
+improving performance by avoiding the use of spill blocks.
+.RE
+
.SH "SEE ALSO"
\fBzpool\fR(8)
diff --git a/man/man8/zfs.8 b/man/man8/zfs.8
index 4e3fa54a8..2aca58b76 100644
--- a/man/man8/zfs.8
+++ b/man/man8/zfs.8
@@ -910,6 +910,35 @@ The values \fBon\fR and \fBoff\fR are equivalent to the \fBdev\fR and \fBnodev\f
.sp
.ne 2
.na
+\fB\fBdnodesize\fR=\fBlegacy\fR | \fBauto\fR | \fB1k\fR | \fB2k\fR | \fB4k\fR | \fB8k\fR | \fB16k\fR\fR
+.ad
+.sp .6
+.RS 4n
+Specifies a compatibility mode or literal value for the size of dnodes
+in the file system. The default value is \fBlegacy\fR. Setting this
+property to a value other than \fBlegacy\fR requires the
+\fBlarge_dnode\fR pool feature to be enabled.
+.sp
+Consider setting \fBdnodesize\fR to \fBauto\fR if the dataset uses the
+\fBxattr=sa\fR property setting and the workload makes heavy use of
+extended attributes. This may be applicable to SELinux-enabled systems,
+Lustre servers, and Samba servers, for example. Literal values are
+supported for cases where the optimal size is known in advance and for
+performance testing.
+.sp
+Leave \fBdnodesize\fR set to \fBlegacy\fR if you need to receive
+a \fBzfs send\fR stream of this dataset on a pool that doesn't enable
+the \fBlarge_dnode\fR feature, or if you need to import this pool on a
+system that doesn't support the \fBlarge_dnode\fR feature.
+.sp
+This property can also be referred to by its shortened column name,
+\fBdnsize\fR.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
\fB\fBexec\fR=\fBon\fR | \fBoff\fR\fR
.ad
.sp .6
diff --git a/module/zcommon/zfs_prop.c b/module/zcommon/zfs_prop.c
index 76d564b43..1dbeab084 100644
--- a/module/zcommon/zfs_prop.c
+++ b/module/zcommon/zfs_prop.c
@@ -211,6 +211,17 @@ zfs_prop_init(void)
{ NULL }
};
+ static zprop_index_t dnsize_table[] = {
+ { "legacy", ZFS_DNSIZE_LEGACY },
+ { "auto", ZFS_DNSIZE_AUTO },
+ { "1k", ZFS_DNSIZE_1K },
+ { "2k", ZFS_DNSIZE_2K },
+ { "4k", ZFS_DNSIZE_4K },
+ { "8k", ZFS_DNSIZE_8K },
+ { "16k", ZFS_DNSIZE_16K },
+ { NULL }
+ };
+
static zprop_index_t redundant_metadata_table[] = {
{ "all", ZFS_REDUNDANT_METADATA_ALL },
{ "most", ZFS_REDUNDANT_METADATA_MOST },
@@ -271,6 +282,9 @@ zfs_prop_init(void)
zprop_register_index(ZFS_PROP_XATTR, "xattr", ZFS_XATTR_DIR,
PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
"on | off | dir | sa", "XATTR", xattr_table);
+ zprop_register_index(ZFS_PROP_DNODESIZE, "dnodesize",
+ ZFS_DNSIZE_LEGACY, PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
+ "legacy | auto | 1k | 2k | 4k | 8k | 16k", "DNSIZE", dnsize_table);
/* inherit index (boolean) properties */
zprop_register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT,
diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c
index 910c56dcc..4a5836e5b 100644
--- a/module/zcommon/zpool_prop.c
+++ b/module/zcommon/zpool_prop.c
@@ -135,6 +135,8 @@ zpool_prop_init(void)
PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_POOL, "MAXBLOCKSIZE");
zprop_register_hidden(ZPOOL_PROP_TNAME, "tname", PROP_TYPE_STRING,
PROP_ONETIME, ZFS_TYPE_POOL, "TNAME");
+ zprop_register_hidden(ZPOOL_PROP_MAXDNODESIZE, "maxdnodesize",
+ PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_POOL, "MAXDNODESIZE");
}
/*
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 126748994..4bbbd0525 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -478,7 +478,6 @@ dbuf_verify(dmu_buf_impl_t *db)
ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
} else if (db->db_blkid == DMU_SPILL_BLKID) {
ASSERT(dn != NULL);
- ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
ASSERT0(db->db.db_offset);
} else {
ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
@@ -730,13 +729,18 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
ASSERT(db->db_buf == NULL);
if (db->db_blkid == DMU_BONUS_BLKID) {
+ /*
+ * The bonus length stored in the dnode may be less than
+ * the maximum available space in the bonus buffer.
+ */
int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
+ int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
ASSERT3U(bonuslen, <=, db->db.db_size);
- db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
- arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
- if (bonuslen < DN_MAX_BONUSLEN)
- bzero(db->db.db_data, DN_MAX_BONUSLEN);
+ db->db.db_data = zio_buf_alloc(max_bonuslen);
+ arc_space_consume(max_bonuslen, ARC_SPACE_OTHER);
+ if (bonuslen < max_bonuslen)
+ bzero(db->db.db_data, max_bonuslen);
if (bonuslen)
bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
DB_DNODE_EXIT(db);
@@ -962,9 +966,11 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
ASSERT(dr->dr_txg >= txg - 2);
if (db->db_blkid == DMU_BONUS_BLKID) {
/* Note that the data bufs here are zio_bufs */
- dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
- arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
- bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
+ dnode_t *dn = DB_DNODE(db);
+ int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
+ dr->dt.dl.dr_data = zio_buf_alloc(bonuslen);
+ arc_space_consume(bonuslen, ARC_SPACE_OTHER);
+ bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen);
} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
int size = db->db.db_size;
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
@@ -1858,8 +1864,10 @@ dbuf_clear(dmu_buf_impl_t *db)
if (db->db_state == DB_CACHED) {
ASSERT(db->db.db_data != NULL);
if (db->db_blkid == DMU_BONUS_BLKID) {
- zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
- arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
+ int slots = DB_DNODE(db)->dn_num_slots;
+ int bonuslen = DN_SLOTS_TO_BONUSLEN(slots);
+ zio_buf_free(db->db.db_data, bonuslen);
+ arc_space_return(bonuslen, ARC_SPACE_OTHER);
}
db->db.db_data = NULL;
db->db_state = DB_UNCACHED;
@@ -1929,7 +1937,7 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
mutex_enter(&dn->dn_mtx);
if (dn->dn_have_spill &&
(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
- *bpp = &dn->dn_phys->dn_spill;
+ *bpp = DN_SPILL_BLKPTR(dn->dn_phys);
else
*bpp = NULL;
dbuf_add_ref(dn->dn_dbuf, NULL);
@@ -2018,7 +2026,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
if (blkid == DMU_BONUS_BLKID) {
ASSERT3P(parent, ==, dn->dn_dbuf);
- db->db.db_size = DN_MAX_BONUSLEN -
+ db->db.db_size = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
(dn->dn_nblkptr-1) * sizeof (blkptr_t);
ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
db->db.db_offset = DMU_BONUS_BLKID;
@@ -2810,7 +2818,7 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
return;
if (db->db_blkid == DMU_SPILL_BLKID) {
- db->db_blkptr = &dn->dn_phys->dn_spill;
+ db->db_blkptr = DN_SPILL_BLKPTR(dn->dn_phys);
BP_ZERO(db->db_blkptr);
return;
}
@@ -2950,13 +2958,16 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
ASSERT(*datap != NULL);
ASSERT0(db->db_level);
- ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+ ASSERT3U(dn->dn_phys->dn_bonuslen, <=,
+ DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1));
bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
DB_DNODE_EXIT(db);
if (*datap != db->db.db_data) {
- zio_buf_free(*datap, DN_MAX_BONUSLEN);
- arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
+ int slots = DB_DNODE(db)->dn_num_slots;
+ int bonuslen = DN_SLOTS_TO_BONUSLEN(slots);
+ zio_buf_free(*datap, bonuslen);
+ arc_space_return(bonuslen, ARC_SPACE_OTHER);
}
db->db_data_pending = NULL;
drp = &db->db_last_dirty;
@@ -3107,7 +3118,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
if (db->db_blkid == DMU_SPILL_BLKID) {
ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
ASSERT(!(BP_IS_HOLE(bp)) &&
- db->db_blkptr == &dn->dn_phys->dn_spill);
+ db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));
}
#endif
@@ -3119,11 +3130,16 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
mutex_exit(&dn->dn_mtx);
if (dn->dn_type == DMU_OT_DNODE) {
- dnode_phys_t *dnp = db->db.db_data;
- for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
- i--, dnp++) {
- if (dnp->dn_type != DMU_OT_NONE)
+ i = 0;
+ while (i < db->db.db_size) {
+ dnode_phys_t *dnp = db->db.db_data + i;
+
+ i += DNODE_MIN_SIZE;
+ if (dnp->dn_type != DMU_OT_NONE) {
fill++;
+ i += dnp->dn_extra_slots *
+ DNODE_MIN_SIZE;
+ }
}
} else {
if (BP_IS_HOLE(bp)) {
@@ -3270,7 +3286,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
dn = DB_DNODE(db);
ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
- db->db_blkptr == &dn->dn_phys->dn_spill);
+ db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));
DB_DNODE_EXIT(db);
}
#endif
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index a423264c6..e1dfb41ff 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -180,7 +180,7 @@ dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
int
dmu_bonus_max(void)
{
- return (DN_MAX_BONUSLEN);
+ return (DN_OLD_MAX_BONUSLEN);
}
int
@@ -1853,6 +1853,7 @@ __dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
doi->doi_type = dn->dn_type;
doi->doi_bonus_type = dn->dn_bonustype;
doi->doi_bonus_size = dn->dn_bonuslen;
+ doi->doi_dnodesize = dn->dn_num_slots << DNODE_SHIFT;
doi->doi_indirection = dn->dn_nlevels;
doi->doi_checksum = dn->dn_checksum;
doi->doi_compress = dn->dn_compress;
@@ -1924,9 +1925,21 @@ dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize,
dn = DB_DNODE(db);
*blksize = dn->dn_datablksz;
- /* add 1 for dnode space */
+ /* add in number of slots used for the dnode itself */
*nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
- SPA_MINBLOCKSHIFT) + 1;
+ SPA_MINBLOCKSHIFT) + dn->dn_num_slots;
+ DB_DNODE_EXIT(db);
+}
+
+void
+dmu_object_dnsize_from_db(dmu_buf_t *db_fake, int *dnsize)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ *dnsize = dn->dn_num_slots << DNODE_SHIFT;
DB_DNODE_EXIT(db);
}
@@ -2020,6 +2033,7 @@ EXPORT_SYMBOL(dmu_object_info);
EXPORT_SYMBOL(dmu_object_info_from_dnode);
EXPORT_SYMBOL(dmu_object_info_from_db);
EXPORT_SYMBOL(dmu_object_size_from_db);
+EXPORT_SYMBOL(dmu_object_dnsize_from_db);
EXPORT_SYMBOL(dmu_object_set_blocksize);
EXPORT_SYMBOL(dmu_object_set_checksum);
EXPORT_SYMBOL(dmu_object_set_compress);
diff --git a/module/zfs/dmu_object.c b/module/zfs/dmu_object.c
index a5a53418b..e54043fc3 100644
--- a/module/zfs/dmu_object.c
+++ b/module/zfs/dmu_object.c
@@ -30,28 +30,55 @@
#include <sys/dnode.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
+#include <sys/dsl_dataset.h>
uint64_t
dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
+ return dmu_object_alloc_dnsize(os, ot, blocksize, bonustype, bonuslen,
+ 0, tx);
+}
+
+uint64_t
+dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
+{
uint64_t object;
uint64_t L1_dnode_count = DNODES_PER_BLOCK <<
(DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
dnode_t *dn = NULL;
+ int dn_slots = dnodesize >> DNODE_SHIFT;
+ boolean_t restarted = B_FALSE;
+
+ if (dn_slots == 0) {
+ dn_slots = DNODE_MIN_SLOTS;
+ } else {
+ ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
+ ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
+ }
mutex_enter(&os->os_obj_lock);
for (;;) {
object = os->os_obj_next;
/*
* Each time we polish off a L1 bp worth of dnodes (2^12
- * objects), move to another L1 bp that's still reasonably
- * sparse (at most 1/4 full). Look from the beginning at most
- * once per txg, but after that keep looking from here.
+ * objects), move to another L1 bp that's still
+ * reasonably sparse (at most 1/4 full). Look from the
+ * beginning at most once per txg. If we still can't
+ * allocate from that L1 block, search for an empty L0
+ * block, which will quickly skip to the end of the
+ * metadnode if no nearby L0 blocks are empty. This
+ * fallback avoids a pathology where full dnode blocks
+ * containing large dnodes appear sparse because they
+ * have a low blk_fill, leading to many failed
+ * allocation attempts. In the long term a better
+ * mechanism to search for sparse metadnode regions,
+ * such as spacemaps, could be implemented.
+ *
* os_scan_dnodes is set during txg sync if enough objects
* have been freed since the previous rescan to justify
- * backfilling again. If we can't find a suitable block, just
- * keep going from here.
+ * backfilling again.
*
* Note that dmu_traverse depends on the behavior that we use
* multiple blocks of the dnode object before going back to
@@ -59,9 +86,10 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
* that property or find another solution to the issues
* described in traverse_visitbp.
*/
-
if (P2PHASE(object, L1_dnode_count) == 0) {
uint64_t offset;
+ uint64_t blkfill;
+ int minlvl;
int error;
if (os->os_rescan_dnodes) {
offset = 0;
@@ -69,13 +97,15 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
} else {
offset = object << DNODE_SHIFT;
}
+ blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
+ minlvl = restarted ? 1 : 2;
+ restarted = B_TRUE;
error = dnode_next_offset(DMU_META_DNODE(os),
- DNODE_FIND_HOLE,
- &offset, 2, DNODES_PER_BLOCK >> 2, 0);
+ DNODE_FIND_HOLE, &offset, minlvl, blkfill, 0);
if (error == 0)
object = offset >> DNODE_SHIFT;
}
- os->os_obj_next = ++object;
+ os->os_obj_next = object + dn_slots;
/*
* XXX We should check for an i/o error here and return
@@ -83,16 +113,22 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
* dmu_tx_assign(), but there is currently no mechanism
* to do so.
*/
- (void) dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
+ (void) dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots,
FTAG, &dn);
if (dn)
break;
if (dmu_object_next(os, &object, B_TRUE, 0) == 0)
- os->os_obj_next = object - 1;
+ os->os_obj_next = object;
+ else
+ /*
+ * Skip to next known valid starting point for a dnode.
+ */
+ os->os_obj_next = P2ROUNDUP(object + 1,
+ DNODES_PER_BLOCK);
}
- dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx);
+ dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx);
dnode_rele(dn, FTAG);
mutex_exit(&os->os_obj_lock);
@@ -105,16 +141,33 @@ int
dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
+ return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype,
+ bonuslen, 0, tx));
+}
+
+int
+dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonustype, int bonuslen,
+ int dnodesize, dmu_tx_t *tx)
+{
dnode_t *dn;
+ int dn_slots = dnodesize >> DNODE_SHIFT;
int err;
+ if (dn_slots == 0)
+ dn_slots = DNODE_MIN_SLOTS;
+ ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
+ ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
+
if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
return (SET_ERROR(EBADF));
- err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, FTAG, &dn);
+ err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots,
+ FTAG, &dn);
if (err)
return (err);
- dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx);
+
+ dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx);
dnode_rele(dn, FTAG);
dmu_tx_add_new_object(tx, os, object);
@@ -125,23 +178,34 @@ int
dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
+ return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype,
+ bonuslen, 0, tx));
+}
+
+int
+dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize,
+ dmu_tx_t *tx)
+{
dnode_t *dn;
+ int dn_slots = dnodesize >> DNODE_SHIFT;
int err;
if (object == DMU_META_DNODE_OBJECT)
return (SET_ERROR(EBADF));
- err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED,
+ err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
FTAG, &dn);
if (err)
return (err);
- dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, tx);
+ dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots, tx);
dnode_rele(dn, FTAG);
return (err);
}
+
int
dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
@@ -150,7 +214,7 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
- err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED,
+ err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
FTAG, &dn);
if (err)
return (err);
@@ -171,9 +235,30 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
int
dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
{
- uint64_t offset = (*objectp + 1) << DNODE_SHIFT;
+ uint64_t offset;
+ dmu_object_info_t doi;
+ struct dsl_dataset *ds = os->os_dsl_dataset;
+ int dnodesize;
int error;
+ /*
+ * Avoid expensive dnode hold if this dataset doesn't use large dnodes.
+ */
+ if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) {
+ error = dmu_object_info(os, *objectp, &doi);
+ if (error && !(error == EINVAL && *objectp == 0))
+ return (SET_ERROR(error));
+ else
+ dnodesize = doi.doi_dnodesize;
+ } else {
+ dnodesize = DNODE_MIN_SIZE;
+ }
+
+ if (*objectp == 0)
+ offset = 1 << DNODE_SHIFT;
+ else
+ offset = (*objectp << DNODE_SHIFT) + dnodesize;
+
error = dnode_next_offset(DMU_META_DNODE(os),
(hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);
@@ -235,8 +320,11 @@ dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx)
#if defined(_KERNEL) && defined(HAVE_SPL)
EXPORT_SYMBOL(dmu_object_alloc);
+EXPORT_SYMBOL(dmu_object_alloc_dnsize);
EXPORT_SYMBOL(dmu_object_claim);
+EXPORT_SYMBOL(dmu_object_claim_dnsize);
EXPORT_SYMBOL(dmu_object_reclaim);
+EXPORT_SYMBOL(dmu_object_reclaim_dnsize);
EXPORT_SYMBOL(dmu_object_free);
EXPORT_SYMBOL(dmu_object_next);
EXPORT_SYMBOL(dmu_object_zapify);
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
index 03b30dd3b..cdc897726 100644
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -138,6 +138,12 @@ dmu_objset_id(objset_t *os)
return (ds ? ds->ds_object : 0);
}
+uint64_t
+dmu_objset_dnodesize(objset_t *os)
+{
+ return (os->os_dnodesize);
+}
+
zfs_sync_type_t
dmu_objset_syncprop(objset_t *os)
{
@@ -268,6 +274,34 @@ redundant_metadata_changed_cb(void *arg, uint64_t newval)
}
static void
+dnodesize_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ switch (newval) {
+ case ZFS_DNSIZE_LEGACY:
+ os->os_dnodesize = DNODE_MIN_SIZE;
+ break;
+ case ZFS_DNSIZE_AUTO:
+ /*
+ * Choose a dnode size that will work well for most
+ * workloads if the user specified "auto". Future code
+ * improvements could dynamically select a dnode size
+ * based on observed workload patterns.
+ */
+ os->os_dnodesize = DNODE_MIN_SIZE * 2;
+ break;
+ case ZFS_DNSIZE_1K:
+ case ZFS_DNSIZE_2K:
+ case ZFS_DNSIZE_4K:
+ case ZFS_DNSIZE_8K:
+ case ZFS_DNSIZE_16K:
+ os->os_dnodesize = newval;
+ break;
+ }
+}
+
+static void
logbias_changed_cb(void *arg, uint64_t newval)
{
objset_t *os = arg;
@@ -421,6 +455,11 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
recordsize_changed_cb, os);
}
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_DNODESIZE),
+ dnodesize_changed_cb, os);
+ }
}
if (err != 0) {
VERIFY(arc_buf_remove_ref(os->os_phys_buf,
@@ -439,6 +478,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
os->os_sync = ZFS_SYNC_STANDARD;
os->os_primary_cache = ZFS_CACHE_ALL;
os->os_secondary_cache = ZFS_CACHE_ALL;
+ os->os_dnodesize = DNODE_MIN_SIZE;
}
if (ds == NULL || !ds->ds_is_snapshot)
@@ -768,8 +808,8 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
mdn = DMU_META_DNODE(os);
- dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
- DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx);
+ dnode_allocate(mdn, DMU_OT_DNODE, DNODE_BLOCK_SIZE, DN_MAX_INDBLKSHIFT,
+ DMU_OT_NONE, 0, DNODE_MIN_SLOTS, tx);
/*
* We don't want to have to increase the meta-dnode's nlevels
@@ -1202,7 +1242,7 @@ do_userquota_update(objset_t *os, uint64_t used, uint64_t flags,
uint64_t user, uint64_t group, boolean_t subtract, dmu_tx_t *tx)
{
if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) {
- int64_t delta = DNODE_SIZE + used;
+ int64_t delta = DNODE_MIN_SIZE + used;
if (subtract)
delta = -delta;
VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT,
@@ -2023,6 +2063,7 @@ EXPORT_SYMBOL(dmu_objset_find);
EXPORT_SYMBOL(dmu_objset_byteswap);
EXPORT_SYMBOL(dmu_objset_evict_dbufs);
EXPORT_SYMBOL(dmu_objset_snap_cmtime);
+EXPORT_SYMBOL(dmu_objset_dnodesize);
EXPORT_SYMBOL(dmu_objset_sync);
EXPORT_SYMBOL(dmu_objset_is_dirty);
diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c
index 896a84b50..901386a5a 100644
--- a/module/zfs/dmu_send.c
+++ b/module/zfs/dmu_send.c
@@ -445,6 +445,7 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
drro->drr_bonustype = dnp->dn_bonustype;
drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
drro->drr_bonuslen = dnp->dn_bonuslen;
+ drro->drr_dn_slots = dnp->dn_extra_slots + 1;
drro->drr_checksumtype = dnp->dn_checksum;
drro->drr_compress = dnp->dn_compress;
drro->drr_toguid = dsp->dsa_toguid;
@@ -570,7 +571,6 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
int err = 0;
- dnode_phys_t *blk;
uint64_t dnobj;
ASSERT3U(zb->zb_level, >=, 0);
@@ -590,7 +590,8 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
} else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
return (0);
} else if (type == DMU_OT_DNODE) {
- int blksz = BP_GET_LSIZE(bp);
+ dnode_phys_t *blk;
+ int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
arc_flags_t aflags = ARC_FLAG_WAIT;
arc_buf_t *abuf;
int i;
@@ -603,8 +604,8 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
return (SET_ERROR(EIO));
blk = abuf->b_data;
- dnobj = zb->zb_blkid * (blksz >> DNODE_SHIFT);
- for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
+ dnobj = zb->zb_blkid * epb;
+ for (i = 0; i < epb; i += blk[i].dn_extra_slots + 1) {
err = dump_dnode(dsa, dnobj + i, blk + i);
if (err != 0)
break;
@@ -736,6 +737,8 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
if (large_block_ok && to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_BLOCKS])
featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
+ if (to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE])
+ featureflags |= DMU_BACKUP_FEATURE_LARGE_DNODE;
if (embedok &&
spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
@@ -1252,6 +1255,15 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
return (SET_ERROR(ENOTSUP));
+ /*
+ * The receiving code doesn't know how to translate large dnodes
+ * to smaller ones, so the pool must have the LARGE_DNODE
+ * feature enabled if the stream has LARGE_DNODE.
+ */
+ if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) &&
+ !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_DNODE))
+ return (SET_ERROR(ENOTSUP));
+
error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
if (error == 0) {
/* target fs already exists; recv into temp clone */
@@ -1658,7 +1670,8 @@ deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size)
return (1);
} else {
return (1 +
- ((DN_MAX_BONUSLEN - bonus_size) >> SPA_BLKPTRSHIFT));
+ ((DN_OLD_MAX_BONUSLEN -
+ MIN(DN_OLD_MAX_BONUSLEN, bonus_size)) >> SPA_BLKPTRSHIFT));
}
}
@@ -1679,7 +1692,8 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
drro->drr_blksz < SPA_MINBLOCKSIZE ||
drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) ||
- drro->drr_bonuslen > DN_MAX_BONUSLEN) {
+ drro->drr_bonuslen >
+ DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(rwa->os)))) {
return (SET_ERROR(EINVAL));
}
@@ -1719,9 +1733,10 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
if (object == DMU_NEW_OBJECT) {
/* currently free, want to be allocated */
- err = dmu_object_claim(rwa->os, drro->drr_object,
+ err = dmu_object_claim_dnsize(rwa->os, drro->drr_object,
drro->drr_type, drro->drr_blksz,
- drro->drr_bonustype, drro->drr_bonuslen, tx);
+ drro->drr_bonustype, drro->drr_bonuslen,
+ drro->drr_dn_slots << DNODE_SHIFT, tx);
} else if (drro->drr_type != doi.doi_type ||
drro->drr_blksz != doi.doi_data_block_size ||
drro->drr_bonustype != doi.doi_bonus_type ||
@@ -1771,18 +1786,25 @@ receive_freeobjects(struct receive_writer_arg *rwa,
if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
return (SET_ERROR(EINVAL));
- for (obj = drrfo->drr_firstobj;
+ for (obj = drrfo->drr_firstobj == 0 ? 1 : drrfo->drr_firstobj;
obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
(void) dmu_object_next(rwa->os, &obj, FALSE, 0)) {
+ dmu_object_info_t doi;
int err;
- if (dmu_object_info(rwa->os, obj, NULL) != 0)
+ err = dmu_object_info(rwa->os, obj, &doi);
+ if (err == ENOENT) {
+ obj++;
continue;
+ } else if (err != 0) {
+ return (err);
+ }
err = dmu_free_long_object(rwa->os, obj);
if (err != 0)
return (err);
}
+
return (0);
}
diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c
index bba9efe14..44ba74181 100644
--- a/module/zfs/dmu_traverse.c
+++ b/module/zfs/dmu_traverse.c
@@ -331,13 +331,13 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
goto post;
cdnp = buf->b_data;
- for (i = 0; i < epb; i++) {
+ for (i = 0; i < epb; i += cdnp[i].dn_extra_slots + 1) {
prefetch_dnode_metadata(td, &cdnp[i], zb->zb_objset,
zb->zb_blkid * epb + i);
}
/* recursively visitbp() blocks below this */
- for (i = 0; i < epb; i++) {
+ for (i = 0; i < epb; i += cdnp[i].dn_extra_slots + 1) {
err = traverse_dnode(td, &cdnp[i], zb->zb_objset,
zb->zb_blkid * epb + i);
if (err != 0)
@@ -439,7 +439,7 @@ prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp,
if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
- traverse_prefetch_metadata(td, &dnp->dn_spill, &czb);
+ traverse_prefetch_metadata(td, DN_SPILL_BLKPTR(dnp), &czb);
}
}
@@ -470,7 +470,7 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
if (err == 0 && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
- err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb);
+ err = traverse_visitbp(td, dnp, DN_SPILL_BLKPTR(dnp), &czb);
}
if (err == 0 && (td->td_flags & TRAVERSE_POST)) {
diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c
index 74e323dbd..ed29bfbc6 100644
--- a/module/zfs/dmu_tx.c
+++ b/module/zfs/dmu_tx.c
@@ -1586,7 +1586,7 @@ dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
} else {
blkptr_t *bp;
- bp = &dn->dn_phys->dn_spill;
+ bp = DN_SPILL_BLKPTR(dn->dn_phys);
if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
bp, bp->blk_birth))
txh->txh_space_tooverwrite += SPA_OLD_MAXBLOCKSIZE;
@@ -1618,7 +1618,7 @@ dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
dmu_tx_sa_registration_hold(sa, tx);
- if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill)
+ if (attrsize <= DN_OLD_MAX_BONUSLEN && !sa->sa_force_spill)
return;
(void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index 38bcecd46..975bd5fb8 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -248,6 +248,7 @@ dnode_verify(dnode_t *dn)
}
if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) {
int i;
+ int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT);
if (dn->dn_datablkshift) {
ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT);
@@ -258,12 +259,12 @@ dnode_verify(dnode_t *dn)
ASSERT(DMU_OT_IS_VALID(dn->dn_type));
ASSERT3U(dn->dn_nblkptr, >=, 1);
ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
- ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+ ASSERT3U(dn->dn_bonuslen, <=, max_bonuslen);
ASSERT3U(dn->dn_datablksz, ==,
dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0);
ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) +
- dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+ dn->dn_bonuslen, <=, max_bonuslen);
for (i = 0; i < TXG_SIZE; i++) {
ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels);
}
@@ -294,6 +295,7 @@ dnode_byteswap(dnode_phys_t *dnp)
dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec);
dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen);
+ dnp->dn_extra_slots = BSWAP_8(dnp->dn_extra_slots);
dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid);
dnp->dn_used = BSWAP_64(dnp->dn_used);
@@ -320,7 +322,8 @@ dnode_byteswap(dnode_phys_t *dnp)
* dnode buffer).
*/
int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t);
- size_t len = DN_MAX_BONUSLEN - off;
+ int slots = dnp->dn_extra_slots + 1;
+ size_t len = DN_SLOTS_TO_BONUSLEN(slots) - off;
dmu_object_byteswap_t byteswap;
ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype));
byteswap = DMU_OT_BYTESWAP(dnp->dn_bonustype);
@@ -329,23 +332,24 @@ dnode_byteswap(dnode_phys_t *dnp)
/* Swap SPILL block if we have one */
if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
- byteswap_uint64_array(&dnp->dn_spill, sizeof (blkptr_t));
-
+ byteswap_uint64_array(DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t));
}
void
dnode_buf_byteswap(void *vbuf, size_t size)
{
- dnode_phys_t *buf = vbuf;
- int i;
+ int i = 0;
ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);
- size >>= DNODE_SHIFT;
- for (i = 0; i < size; i++) {
- dnode_byteswap(buf);
- buf++;
+ while (i < size) {
+ dnode_phys_t *dnp = vbuf + i;
+ dnode_byteswap(dnp);
+
+ i += DNODE_MIN_SIZE;
+ if (dnp->dn_type != DMU_OT_NONE)
+ i += dnp->dn_extra_slots * DNODE_MIN_SIZE;
}
}
@@ -356,7 +360,7 @@ dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
dnode_setdirty(dn, tx);
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- ASSERT3U(newsize, <=, DN_MAX_BONUSLEN -
+ ASSERT3U(newsize, <=, DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
(dn->dn_nblkptr-1) * sizeof (blkptr_t));
dn->dn_bonuslen = newsize;
if (newsize == 0)
@@ -434,6 +438,7 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
dn->dn_compress = dnp->dn_compress;
dn->dn_bonustype = dnp->dn_bonustype;
dn->dn_bonuslen = dnp->dn_bonuslen;
+ dn->dn_num_slots = dnp->dn_extra_slots + 1;
dn->dn_maxblkid = dnp->dn_maxblkid;
dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
dn->dn_id_flags = 0;
@@ -534,10 +539,13 @@ dnode_destroy(dnode_t *dn)
void
dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+ dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx)
{
int i;
+ ASSERT3U(dn_slots, >, 0);
+ ASSERT3U(dn_slots << DNODE_SHIFT, <=,
+ spa_maxdnodesize(dmu_objset_spa(dn->dn_objset)));
ASSERT3U(blocksize, <=,
spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
if (blocksize == 0)
@@ -550,8 +558,8 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);
- dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d\n", dn->dn_objset,
- dn->dn_object, tx->tx_txg, blocksize, ibs);
+ dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d dn_slots=%d\n",
+ dn->dn_objset, dn->dn_object, tx->tx_txg, blocksize, ibs, dn_slots);
ASSERT(dn->dn_type == DMU_OT_NONE);
ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
@@ -562,7 +570,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
(bonustype == DMU_OT_SA && bonuslen == 0) ||
(bonustype != DMU_OT_NONE && bonuslen != 0));
ASSERT(DMU_OT_IS_VALID(bonustype));
- ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
+ ASSERT3U(bonuslen, <=, DN_SLOTS_TO_BONUSLEN(dn_slots));
ASSERT(dn->dn_type == DMU_OT_NONE);
ASSERT0(dn->dn_maxblkid);
ASSERT0(dn->dn_allocated_txg);
@@ -588,11 +596,15 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
dnode_setdblksz(dn, blocksize);
dn->dn_indblkshift = ibs;
dn->dn_nlevels = 1;
+ dn->dn_num_slots = dn_slots;
if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
dn->dn_nblkptr = 1;
- else
- dn->dn_nblkptr = 1 +
- ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+ else {
+ dn->dn_nblkptr = MIN(DN_MAX_NBLKPTR,
+ 1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
+ SPA_BLKPTRSHIFT));
+ }
+
dn->dn_bonustype = bonustype;
dn->dn_bonuslen = bonuslen;
dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
@@ -617,7 +629,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
void
dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+ dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx)
{
int nblkptr;
@@ -631,7 +643,10 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
(bonustype != DMU_OT_NONE && bonuslen != 0) ||
(bonustype == DMU_OT_SA && bonuslen == 0));
ASSERT(DMU_OT_IS_VALID(bonustype));
- ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
+ ASSERT3U(bonuslen, <=,
+ DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))));
+
+ dn_slots = dn_slots > 0 ? dn_slots : DNODE_MIN_SLOTS;
/* clean up any unreferenced dbufs */
dnode_evict_dbufs(dn);
@@ -654,7 +669,9 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
nblkptr = 1;
else
- nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+ nblkptr = MIN(DN_MAX_NBLKPTR,
+ 1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
+ SPA_BLKPTRSHIFT));
if (dn->dn_bonustype != bonustype)
dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype;
if (dn->dn_nblkptr != nblkptr)
@@ -672,6 +689,7 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
mutex_enter(&dn->dn_mtx);
dn->dn_bonustype = bonustype;
dn->dn_bonuslen = bonuslen;
+ dn->dn_num_slots = dn_slots;
dn->dn_nblkptr = nblkptr;
dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
dn->dn_compress = ZIO_COMPRESS_INHERIT;
@@ -680,7 +698,8 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
/* fix up the bonus db_size */
if (dn->dn_bonus) {
dn->dn_bonus->db.db_size =
- DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t);
+ DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
+ (dn->dn_nblkptr-1) * sizeof (blkptr_t);
ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size);
}
@@ -1053,24 +1072,150 @@ dnode_buf_pageout(void *dbu)
}
/*
+ * Return true if the given index is interior to a dnode already
+ * allocated in the block. That is, the index is neither free nor
+ * allocated, but is consumed by a large dnode.
+ *
+ * The dnode_phys_t buffer may not be in sync with the in-core dnode
+ * structure, so we try to check the dnode structure first and fall back
+ * to the dnode_phys_t buffer if it doesn't exist.
+ *
+ * The block is walked from slot 0 towards idx, stepping over each dnode
+ * by the number of slots it occupies (a free slot counts as one). If
+ * the walk steps past idx rather than landing on it, idx lies inside a
+ * multi-slot dnode that starts at an earlier index.
+ *
+ * db - dbuf for the dnode block; its user data is the dnode_children_t
+ * idx - slot index within the block to test
+ */
+static boolean_t
+dnode_is_consumed(dmu_buf_impl_t *db, int idx)
+{
+ dnode_handle_t *dnh;
+ dmu_object_type_t ot;
+ dnode_children_t *children_dnodes;
+ dnode_phys_t *dn_block;
+ int skip;
+ int i;
+
+ children_dnodes = dmu_buf_get_user(&db->db);
+ dn_block = (dnode_phys_t *)db->db.db_data;
+
+ for (i = 0; i < idx; i += skip) {
+ dnh = &children_dnodes->dnc_children[i];
+
+ zrl_add(&dnh->dnh_zrlock);
+ if (dnh->dnh_dnode != NULL) { /* prefer the in-core dnode */
+ ot = dnh->dnh_dnode->dn_type;
+ skip = dnh->dnh_dnode->dn_num_slots;
+ } else { /* no in-core dnode: use the buffer contents */
+ ot = dn_block[i].dn_type;
+ skip = dn_block[i].dn_extra_slots + 1;
+ }
+ zrl_remove(&dnh->dnh_zrlock);
+
+ if (ot == DMU_OT_NONE)
+ skip = 1; /* free slots are stepped over one at a time */
+ }
+
+ return (i > idx); /* overshot idx: it is inside an earlier dnode */
+}
+
+/*
+ * Return true if the given index in the dnode block is a valid
+ * allocated dnode. That is, the index is not consumed by a large
+ * dnode and is not free.
+ *
+ * The dnode_phys_t buffer may not be in sync with the in-core dnode
+ * structure, so we try to check the dnode structure first and fall back
+ * to the dnode_phys_t buffer if it doesn't exist.
+ *
+ * db - dbuf for the dnode block; its user data is the dnode_children_t
+ * idx - slot index within the block to test
+ */
+static boolean_t
+dnode_is_allocated(dmu_buf_impl_t *db, int idx)
+{
+ dnode_handle_t *dnh;
+ dmu_object_type_t ot;
+ dnode_children_t *children_dnodes;
+ dnode_phys_t *dn_block;
+
+ if (dnode_is_consumed(db, idx)) /* interior of another dnode */
+ return (B_FALSE);
+
+ children_dnodes = dmu_buf_get_user(&db->db);
+ dn_block = (dnode_phys_t *)db->db.db_data;
+
+ dnh = &children_dnodes->dnc_children[idx];
+
+ zrl_add(&dnh->dnh_zrlock);
+ if (dnh->dnh_dnode != NULL) /* prefer the in-core dnode's type */
+ ot = dnh->dnh_dnode->dn_type;
+ else
+ ot = dn_block[idx].dn_type;
+ zrl_remove(&dnh->dnh_zrlock);
+
+ return (ot != DMU_OT_NONE);
+}
+
+/*
+ * Return true if the given range of indices in the dnode block are
+ * free. That is, the starting index is not consumed by a large dnode
+ * and none of the indices are allocated.
+ *
+ * The dnode_phys_t buffer may not be in sync with the in-core dnode
+ * structure, so we try to check the dnode structure first and fall back
+ * to the dnode_phys_t buffer if it doesn't exist.
+ *
+ * A range that would extend past the last slot of the block
+ * (idx + slots > DNODES_PER_BLOCK) is reported as not free.
+ *
+ * db - dbuf for the dnode block; its user data is the dnode_children_t
+ * idx - first slot index of the range
+ * slots - number of consecutive slots that must be free
+ */
+static boolean_t
+dnode_is_free(dmu_buf_impl_t *db, int idx, int slots)
+{
+ dnode_handle_t *dnh;
+ dmu_object_type_t ot;
+ dnode_children_t *children_dnodes;
+ dnode_phys_t *dn_block;
+ int i;
+
+ if (idx + slots > DNODES_PER_BLOCK) /* range does not fit in block */
+ return (B_FALSE);
+
+ children_dnodes = dmu_buf_get_user(&db->db);
+ dn_block = (dnode_phys_t *)db->db.db_data;
+
+ if (dnode_is_consumed(db, idx)) /* idx is inside an earlier dnode */
+ return (B_FALSE);
+
+ for (i = idx; i < idx + slots; i++) {
+ dnh = &children_dnodes->dnc_children[i];
+
+ zrl_add(&dnh->dnh_zrlock);
+ if (dnh->dnh_dnode != NULL) /* prefer the in-core dnode's type */
+ ot = dnh->dnh_dnode->dn_type;
+ else
+ ot = dn_block[i].dn_type;
+ zrl_remove(&dnh->dnh_zrlock);
+
+ if (ot != DMU_OT_NONE) /* any allocated slot fails the range */
+ return (B_FALSE);
+ }
+
+ return (B_TRUE);
+}
+
+/*
* errors:
* EINVAL - invalid object number.
+ * ENOSPC - hole too small to fulfill "slots" request
* EIO - i/o error.
* succeeds even for free dnodes.
*/
int
-dnode_hold_impl(objset_t *os, uint64_t object, int flag,
+dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
void *tag, dnode_t **dnp)
{
- int epb, idx, err;
+ int epb, idx, err, i;
int drop_struct_lock = FALSE;
int type;
uint64_t blk;
dnode_t *mdn, *dn;
dmu_buf_impl_t *db;
dnode_children_t *children_dnodes;
+ dnode_phys_t *dn_block_begin;
dnode_handle_t *dnh;
+ ASSERT(!(flag & DNODE_MUST_BE_ALLOCATED) || (slots == 0));
+ ASSERT(!(flag & DNODE_MUST_BE_FREE) || (slots > 0));
+
/*
* If you are holding the spa config lock as writer, you shouldn't
* be asking the DMU to do *anything* unless it's the root pool
@@ -1126,12 +1271,9 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
epb = db->db.db_size >> DNODE_SHIFT;
- idx = object & (epb-1);
-
ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
children_dnodes = dmu_buf_get_user(&db->db);
if (children_dnodes == NULL) {
- int i;
dnode_children_t *winner;
children_dnodes = kmem_zalloc(sizeof (dnode_children_t) +
epb * sizeof (dnode_handle_t), KM_SLEEP);
@@ -1156,21 +1298,28 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
}
ASSERT(children_dnodes->dnc_count == epb);
+ idx = object & (epb - 1);
+ dn_block_begin = (dnode_phys_t *)db->db.db_data;
+
+ if ((flag & DNODE_MUST_BE_FREE) && !dnode_is_free(db, idx, slots)) {
+ dbuf_rele(db, FTAG);
+ return (ENOSPC);
+ } else if ((flag & DNODE_MUST_BE_ALLOCATED) &&
+ !dnode_is_allocated(db, idx)) {
+ dbuf_rele(db, FTAG);
+ return (ENOENT);
+ }
+
dnh = &children_dnodes->dnc_children[idx];
zrl_add(&dnh->dnh_zrlock);
dn = dnh->dnh_dnode;
- if (dn == NULL) {
- dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx;
-
- dn = dnode_create(os, phys, db, object, dnh);
- }
+ if (dn == NULL)
+ dn = dnode_create(os, dn_block_begin + idx, db, object, dnh);
mutex_enter(&dn->dn_mtx);
type = dn->dn_type;
if (dn->dn_free_txg ||
- ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ||
- ((flag & DNODE_MUST_BE_FREE) &&
- (type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) {
+ ((flag & DNODE_MUST_BE_FREE) && !refcount_is_zero(&dn->dn_holds))) {
mutex_exit(&dn->dn_mtx);
zrl_remove(&dnh->dnh_zrlock);
dbuf_rele(db, FTAG);
@@ -1198,7 +1347,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
int
dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp)
{
- return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp));
+ return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, tag,
+ dnp));
}
/*
@@ -1908,17 +2058,21 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
error = SET_ERROR(ESRCH);
} else if (lvl == 0) {
dnode_phys_t *dnp = data;
- span = DNODE_SHIFT;
+
ASSERT(dn->dn_type == DMU_OT_DNODE);
+ ASSERT(!(flags & DNODE_FIND_BACKWARDS));
- for (i = (*offset >> span) & (blkfill - 1);
- i >= 0 && i < blkfill; i += inc) {
+ for (i = (*offset >> DNODE_SHIFT) & (blkfill - 1);
+ i < blkfill; i += dnp[i].dn_extra_slots + 1) {
if ((dnp[i].dn_type == DMU_OT_NONE) == hole)
break;
- *offset += (1ULL << span) * inc;
}
- if (i < 0 || i == blkfill)
+
+ if (i == blkfill)
error = SET_ERROR(ESRCH);
+
+ *offset = (*offset & ~(DNODE_BLOCK_SIZE - 1)) +
+ (i << DNODE_SHIFT);
} else {
blkptr_t *bp = data;
uint64_t start = *offset;
diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c
index bea7be186..54066e2e3 100644
--- a/module/zfs/dnode_sync.c
+++ b/module/zfs/dnode_sync.c
@@ -524,7 +524,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
ASSERT(dn->dn_free_txg > 0);
if (dn->dn_allocated_txg != dn->dn_free_txg)
dmu_buf_will_dirty(&dn->dn_dbuf->db, tx);
- bzero(dn->dn_phys, sizeof (dnode_phys_t));
+ bzero(dn->dn_phys, sizeof (dnode_phys_t) * dn->dn_num_slots);
mutex_enter(&dn->dn_mtx);
dn->dn_type = DMU_OT_NONE;
@@ -559,7 +559,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
ASSERT(dmu_tx_is_syncing(tx));
ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
ASSERT(dnp->dn_type != DMU_OT_NONE ||
- bcmp(dnp, &zerodn, DNODE_SIZE) == 0);
+ bcmp(dnp, &zerodn, DNODE_MIN_SIZE) == 0);
DNODE_VERIFY(dn);
ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf));
@@ -591,6 +591,9 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
dnp->dn_bonustype = dn->dn_bonustype;
dnp->dn_bonuslen = dn->dn_bonuslen;
}
+
+ dnp->dn_extra_slots = dn->dn_num_slots - 1;
+
ASSERT(dnp->dn_nlevels > 1 ||
BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
BP_IS_EMBEDDED(&dnp->dn_blkptr[0]) ||
@@ -623,7 +626,8 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
dnp->dn_bonuslen = 0;
else
dnp->dn_bonuslen = dn->dn_next_bonuslen[txgoff];
- ASSERT(dnp->dn_bonuslen <= DN_MAX_BONUSLEN);
+ ASSERT(dnp->dn_bonuslen <=
+ DN_SLOTS_TO_BONUSLEN(dnp->dn_extra_slots + 1));
dn->dn_next_bonuslen[txgoff] = 0;
}
@@ -662,7 +666,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
mutex_exit(&dn->dn_mtx);
if (kill_spill) {
- free_blocks(dn, &dn->dn_phys->dn_spill, 1, tx);
+ free_blocks(dn, DN_SPILL_BLKPTR(dn->dn_phys), 1, tx);
mutex_enter(&dn->dn_mtx);
dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR;
mutex_exit(&dn->dn_mtx);
@@ -687,6 +691,14 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
return;
}
+ if (dn->dn_num_slots > DNODE_MIN_SLOTS) {
+ dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
+ mutex_enter(&ds->ds_lock);
+ ds->ds_feature_activation_needed[SPA_FEATURE_LARGE_DNODE] =
+ B_TRUE;
+ mutex_exit(&ds->ds_lock);
+ }
+
if (dn->dn_next_nlevels[txgoff]) {
dnode_increase_indirection(dn, tx);
dn->dn_next_nlevels[txgoff] = 0;
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index b5e272fb9..72163521e 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -709,14 +709,18 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
scn->scn_phys.scn_errors++;
return (err);
}
- for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
+ for (i = 0, cdnp = buf->b_data; i < epb;
+ i += cdnp->dn_extra_slots + 1,
+ cdnp += cdnp->dn_extra_slots + 1) {
for (j = 0; j < cdnp->dn_nblkptr; j++) {
blkptr_t *cbp = &cdnp->dn_blkptr[j];
dsl_scan_prefetch(scn, buf, cbp,
zb->zb_objset, zb->zb_blkid * epb + i, j);
}
}
- for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
+ for (i = 0, cdnp = buf->b_data; i < epb;
+ i += cdnp->dn_extra_slots + 1,
+ cdnp += cdnp->dn_extra_slots + 1) {
dsl_scan_visitdnode(scn, ds, ostype,
cdnp, zb->zb_blkid * epb + i, tx);
}
@@ -779,7 +783,7 @@ dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds,
zbookmark_phys_t czb;
SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
0, DMU_SPILL_BLKID);
- dsl_scan_visitbp(&dnp->dn_spill,
+ dsl_scan_visitbp(DN_SPILL_BLKPTR(dnp),
&czb, dnp, ds, scn, ostype, tx);
}
}
diff --git a/module/zfs/sa.c b/module/zfs/sa.c
index d6ac5fcc7..adc301512 100644
--- a/module/zfs/sa.c
+++ b/module/zfs/sa.c
@@ -33,6 +33,7 @@
#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
#include <sys/dbuf.h>
#include <sys/dnode.h>
#include <sys/zap.h>
@@ -553,12 +554,11 @@ sa_copy_data(sa_data_locator_t *func, void *datastart, void *target, int buflen)
*/
static int
sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count,
- dmu_buf_t *db, sa_buf_type_t buftype, int *index, int *total,
- boolean_t *will_spill)
+ dmu_buf_t *db, sa_buf_type_t buftype, int full_space, int *index,
+ int *total, boolean_t *will_spill)
{
int var_size_count = 0;
int i;
- int full_space;
int hdrsize;
int extra_hdrsize;
@@ -577,7 +577,6 @@ sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count,
hdrsize = (SA_BONUSTYPE_FROM_DB(db) == DMU_OT_ZNODE) ? 0 :
sizeof (sa_hdr_phys_t);
- full_space = (buftype == SA_BONUS) ? DN_MAX_BONUSLEN : db->db_size;
ASSERT(IS_P2ALIGNED(full_space, 8));
for (i = 0; i != attr_count; i++) {
@@ -668,6 +667,7 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
void *data_start;
sa_attr_type_t *attrs, *attrs_start;
int i, lot_count;
+ int dnodesize;
int spill_idx;
int hdrsize;
int spillhdrsize = 0;
@@ -676,20 +676,23 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
sa_lot_t *lot;
int len_idx;
int spill_used;
+ int bonuslen;
boolean_t spilling;
dmu_buf_will_dirty(hdl->sa_bonus, tx);
bonustype = SA_BONUSTYPE_FROM_DB(hdl->sa_bonus);
+ dmu_object_dnsize_from_db(hdl->sa_bonus, &dnodesize);
+ bonuslen = DN_BONUS_SIZE(dnodesize);
/* first determine bonus header size and sum of all attributes */
hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus,
- SA_BONUS, &spill_idx, &used, &spilling);
+ SA_BONUS, bonuslen, &spill_idx, &used, &spilling);
if (used > SPA_OLD_MAXBLOCKSIZE)
return (SET_ERROR(EFBIG));
- VERIFY(0 == dmu_set_bonus(hdl->sa_bonus, spilling ?
- MIN(DN_MAX_BONUSLEN - sizeof (blkptr_t), used + hdrsize) :
+ VERIFY0(dmu_set_bonus(hdl->sa_bonus, spilling ?
+ MIN(bonuslen - sizeof (blkptr_t), used + hdrsize) :
used + hdrsize, tx));
ASSERT((bonustype == DMU_OT_ZNODE && spilling == 0) ||
@@ -706,8 +709,8 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
dmu_buf_will_dirty(hdl->sa_spill, tx);
spillhdrsize = sa_find_sizes(sa, &attr_desc[spill_idx],
- attr_count - spill_idx, hdl->sa_spill, SA_SPILL, &i,
- &spill_used, &dummy);
+ attr_count - spill_idx, hdl->sa_spill, SA_SPILL,
+ hdl->sa_spill->db_size, &i, &spill_used, &dummy);
if (spill_used > SPA_OLD_MAXBLOCKSIZE)
return (SET_ERROR(EFBIG));
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index c23fd7a3a..d1aefe585 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -281,6 +281,14 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
}
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL,
+ DNODE_MAX_SIZE, ZPROP_SRC_NONE);
+ } else {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL,
+ DNODE_MIN_SIZE, ZPROP_SRC_NONE);
+ }
+
if ((dp = list_head(&spa->spa_config_list)) != NULL) {
if (dp->scd_path == NULL) {
spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
@@ -512,7 +520,8 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
/*
* Must be ZPL, and its property settings
* must be supported by GRUB (compression
- * is not gzip, and large blocks are not used).
+ * is not gzip, and large blocks or large
+ * dnodes are not used).
*/
if (dmu_objset_type(os) != DMU_OST_ZFS) {
@@ -529,6 +538,12 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
&propval)) == 0 &&
propval > SPA_OLD_MAXBLOCKSIZE) {
error = SET_ERROR(ENOTSUP);
+ } else if ((error =
+ dsl_prop_get_int_ds(dmu_objset_ds(os),
+ zfs_prop_to_name(ZFS_PROP_DNODESIZE),
+ &propval)) == 0 &&
+ propval != ZFS_DNSIZE_LEGACY) {
+ error = SET_ERROR(ENOTSUP);
} else {
objnum = dmu_objset_id(os);
}
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index e3e7e36fe..d1303b5c2 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -2000,6 +2000,15 @@ spa_maxblocksize(spa_t *spa)
return (SPA_OLD_MAXBLOCKSIZE);
}
+int
+spa_maxdnodesize(spa_t *spa)
+{
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE))
+ return (DNODE_MAX_SIZE); /* large_dnode feature is enabled */
+ else
+ return (DNODE_MIN_SIZE); /* feature not enabled on this pool */
+}
+
#if defined(_KERNEL) && defined(HAVE_SPL)
/* Namespace manipulation */
EXPORT_SYMBOL(spa_lookup);
@@ -2056,6 +2065,7 @@ EXPORT_SYMBOL(spa_bootfs);
EXPORT_SYMBOL(spa_delegation);
EXPORT_SYMBOL(spa_meta_objset);
EXPORT_SYMBOL(spa_maxblocksize);
+EXPORT_SYMBOL(spa_maxdnodesize);
/* Miscellaneous support routines */
EXPORT_SYMBOL(spa_rename);
diff --git a/module/zfs/zap.c b/module/zfs/zap.c
index 454b4be62..9e4f05049 100644
--- a/module/zfs/zap.c
+++ b/module/zfs/zap.c
@@ -968,9 +968,17 @@ uint64_t
zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj,
const char *name, dmu_tx_t *tx)
{
+ return (zap_create_link_dnsize(os, ot, parent_obj, name, 0, tx));
+}
+
+uint64_t
+zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj,
+ const char *name, int dnodesize, dmu_tx_t *tx)
+{
uint64_t new_obj;
- VERIFY((new_obj = zap_create(os, ot, DMU_OT_NONE, 0, tx)) > 0);
+ VERIFY((new_obj = zap_create_dnsize(os, ot, DMU_OT_NONE, 0,
+ dnodesize, tx)) > 0);
VERIFY0(zap_add(os, parent_obj, name, sizeof (uint64_t), 1, &new_obj,
tx));
diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c
index 3faf27ce3..f3153cc18 100644
--- a/module/zfs/zap_micro.c
+++ b/module/zfs/zap_micro.c
@@ -630,8 +630,16 @@ int
zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
- return (zap_create_claim_norm(os, obj,
- 0, ot, bonustype, bonuslen, tx));
+ return (zap_create_claim_dnsize(os, obj, ot, bonustype, bonuslen,
+ 0, tx));
+}
+
+int
+zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
+{
+ return (zap_create_claim_norm_dnsize(os, obj,
+ 0, ot, bonustype, bonuslen, dnodesize, tx));
}
int
@@ -639,9 +647,19 @@ zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags,
dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
+ return (zap_create_claim_norm_dnsize(os, obj, normflags, ot, bonustype,
+ bonuslen, 0, tx));
+}
+
+int
+zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags,
+ dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen,
+ int dnodesize, dmu_tx_t *tx)
+{
int err;
- err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx);
+ err = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen,
+ dnodesize, tx);
if (err != 0)
return (err);
mzap_create_impl(os, obj, normflags, 0, tx);
@@ -656,10 +674,27 @@ zap_create(objset_t *os, dmu_object_type_t ot,
}
uint64_t
+zap_create_dnsize(objset_t *os, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
+{
+ return (zap_create_norm_dnsize(os, 0, ot, bonustype, bonuslen,
+ dnodesize, tx));
+}
+
+uint64_t
zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
- uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
+ return (zap_create_norm_dnsize(os, normflags, ot, bonustype, bonuslen,
+ 0, tx));
+}
+
+uint64_t
+zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
+{
+ uint64_t obj = dmu_object_alloc_dnsize(os, ot, 0, bonustype, bonuslen,
+ dnodesize, tx);
mzap_create_impl(os, obj, normflags, 0, tx);
return (obj);
@@ -670,7 +705,17 @@ zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
- uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
+ return (zap_create_flags_dnsize(os, normflags, flags, ot,
+ leaf_blockshift, indirect_blockshift, bonustype, bonuslen, 0, tx));
+}
+
+uint64_t
+zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags,
+ dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
+{
+ uint64_t obj = dmu_object_alloc_dnsize(os, ot, 0, bonustype, bonuslen,
+ dnodesize, tx);
ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT &&
leaf_blockshift <= SPA_OLD_MAXBLOCKSHIFT &&
@@ -1458,10 +1503,14 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
#if defined(_KERNEL) && defined(HAVE_SPL)
EXPORT_SYMBOL(zap_create);
+EXPORT_SYMBOL(zap_create_dnsize);
EXPORT_SYMBOL(zap_create_norm);
+EXPORT_SYMBOL(zap_create_norm_dnsize);
EXPORT_SYMBOL(zap_create_flags);
+EXPORT_SYMBOL(zap_create_flags_dnsize);
EXPORT_SYMBOL(zap_create_claim);
EXPORT_SYMBOL(zap_create_claim_norm);
+EXPORT_SYMBOL(zap_create_claim_norm_dnsize);
EXPORT_SYMBOL(zap_destroy);
EXPORT_SYMBOL(zap_lookup);
EXPORT_SYMBOL(zap_lookup_norm);
diff --git a/module/zfs/zfeature_common.c b/module/zfs/zfeature_common.c
index f57e5489c..3264f6235 100644
--- a/module/zfs/zfeature_common.c
+++ b/module/zfs/zfeature_common.c
@@ -242,4 +242,15 @@ zpool_feature_init(void)
"Support for blocks larger than 128KB.",
ZFEATURE_FLAG_PER_DATASET, large_blocks_deps);
}
+
+ {
+ static const spa_feature_t large_dnode_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_LARGE_DNODE,
+ "org.zfsonlinux:large_dnode", "large_dnode",
+ "Variable on-disk size of dnodes.",
+ ZFEATURE_FLAG_PER_DATASET, large_dnode_deps);
+ }
}
diff --git a/module/zfs/zfs_acl.c b/module/zfs/zfs_acl.c
index 961083d4a..f820cdfd6 100644
--- a/module/zfs/zfs_acl.c
+++ b/module/zfs/zfs_acl.c
@@ -1394,7 +1394,7 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx)
otype == DMU_OT_ACL ?
DMU_OT_SYSACL : DMU_OT_NONE,
otype == DMU_OT_ACL ?
- DN_MAX_BONUSLEN : 0, tx);
+ DN_OLD_MAX_BONUSLEN : 0, tx);
} else {
(void) dmu_object_set_blocksize(zsb->z_os,
aoid, aclp->z_acl_bytes, 0, tx);
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index c63af167a..30338ac14 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -3785,7 +3785,7 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
/*
* If this is a bootable dataset then
- * the we don't allow large (>128K) blocks,
+ * we don't allow large (>128K) blocks,
* because GRUB doesn't support them.
*/
if (zfs_is_bootfs(dsname) &&
@@ -3813,6 +3813,34 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
}
break;
+ case ZFS_PROP_DNODESIZE:
+ /* Dnode sizes above 512 need the feature to be enabled */
+ if (nvpair_value_uint64(pair, &intval) == 0 &&
+ intval != ZFS_DNSIZE_LEGACY) {
+ spa_t *spa;
+
+ /*
+ * If this is a bootable dataset then
+ * we don't allow large (>512B) dnodes,
+ * because GRUB doesn't support them.
+ */
+ if (zfs_is_bootfs(dsname) &&
+ intval != ZFS_DNSIZE_LEGACY) {
+ return (SET_ERROR(EDOM));
+ }
+
+ if ((err = spa_open(dsname, &spa, FTAG)) != 0)
+ return (err);
+
+ if (!spa_feature_is_enabled(spa,
+ SPA_FEATURE_LARGE_DNODE)) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ spa_close(spa, FTAG);
+ }
+ break;
+
case ZFS_PROP_SHARESMB:
if (zpl_earlier_version(dsname, ZPL_VERSION_FUID))
return (SET_ERROR(ENOTSUP));
diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c
index 38d8de0eb..4d89cb04b 100644
--- a/module/zfs/zfs_log.c
+++ b/module/zfs/zfs_log.c
@@ -279,6 +279,8 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
lr = (lr_create_t *)&itx->itx_lr;
lr->lr_doid = dzp->z_id;
lr->lr_foid = zp->z_id;
+ /* Store dnode slot count in 8 bits above object id. */
+ LR_FOID_SET_SLOTS(lr->lr_foid, zp->z_dnodesize >> DNODE_SHIFT);
lr->lr_mode = zp->z_mode;
if (!IS_EPHEMERAL(zp->z_uid)) {
lr->lr_uid = (uint64_t)zp->z_uid;
diff --git a/module/zfs/zfs_replay.c b/module/zfs/zfs_replay.c
index b97a60ed8..54c175437 100644
--- a/module/zfs/zfs_replay.c
+++ b/module/zfs/zfs_replay.c
@@ -279,6 +279,8 @@ zfs_replay_create_acl(zfs_sb_t *zsb, lr_acl_create_t *lracl, boolean_t byteswap)
void *fuidstart;
size_t xvatlen = 0;
uint64_t txtype;
+ uint64_t objid;
+ uint64_t dnodesize;
int error;
txtype = (lr->lr_common.lrc_txtype & ~TX_CI);
@@ -304,19 +306,24 @@ zfs_replay_create_acl(zfs_sb_t *zsb, lr_acl_create_t *lracl, boolean_t byteswap)
if ((error = zfs_zget(zsb, lr->lr_doid, &dzp)) != 0)
return (error);
+ objid = LR_FOID_GET_OBJ(lr->lr_foid);
+ dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT;
+
xva_init(&xva);
zfs_init_vattr(&xva.xva_vattr, ATTR_MODE | ATTR_UID | ATTR_GID,
- lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid);
+ lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid);
/*
* All forms of zfs create (create, mkdir, mkxattrdir, symlink)
* eventually end up in zfs_mknode(), which assigns the object's
- * creation time and generation number. The generic zfs_create()
- * doesn't have either concept, so we smuggle the values inside
- * the vattr's otherwise unused va_ctime and va_nblocks fields.
+ * creation time, generation number, and dnode size. The generic
+ * zfs_create() has no concept of these attributes, so we smuggle
+ * the values inside the vattr's otherwise unused va_ctime,
+ * va_nblocks, and va_fsid fields.
*/
ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime);
xva.xva_vattr.va_nblocks = lr->lr_gen;
+ xva.xva_vattr.va_fsid = dnodesize;
error = dmu_object_info(zsb->z_os, lr->lr_foid, NULL);
if (error != ENOENT)
@@ -418,6 +425,8 @@ zfs_replay_create(zfs_sb_t *zsb, lr_create_t *lr, boolean_t byteswap)
void *start;
size_t xvatlen;
uint64_t txtype;
+ uint64_t objid;
+ uint64_t dnodesize;
int error;
txtype = (lr->lr_common.lrc_txtype & ~TX_CI);
@@ -431,21 +440,26 @@ zfs_replay_create(zfs_sb_t *zsb, lr_create_t *lr, boolean_t byteswap)
if ((error = zfs_zget(zsb, lr->lr_doid, &dzp)) != 0)
return (error);
+ objid = LR_FOID_GET_OBJ(lr->lr_foid);
+ dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT;
+
xva_init(&xva);
zfs_init_vattr(&xva.xva_vattr, ATTR_MODE | ATTR_UID | ATTR_GID,
- lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid);
+ lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid);
/*
* All forms of zfs create (create, mkdir, mkxattrdir, symlink)
* eventually end up in zfs_mknode(), which assigns the object's
- * creation time and generation number. The generic zfs_create()
- * doesn't have either concept, so we smuggle the values inside
- * the vattr's otherwise unused va_ctime and va_nblocks fields.
+ * creation time, generation number, and dnode size. The generic
+ * zfs_create() has no concept of these attributes, so we smuggle
+ * the values inside the vattr's otherwise unused va_ctime,
+ * va_nblocks, and va_fsid fields.
*/
ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime);
xva.xva_vattr.va_nblocks = lr->lr_gen;
+ xva.xva_vattr.va_fsid = dnodesize;
- error = dmu_object_info(zsb->z_os, lr->lr_foid, NULL);
+ error = dmu_object_info(zsb->z_os, objid, NULL);
if (error != ENOENT)
goto out;
diff --git a/module/zfs/zfs_sa.c b/module/zfs/zfs_sa.c
index f4841435b..f3eac51f8 100644
--- a/module/zfs/zfs_sa.c
+++ b/module/zfs/zfs_sa.c
@@ -97,8 +97,7 @@ zfs_sa_symlink(znode_t *zp, char *link, int len, dmu_tx_t *tx)
dmu_buf_t *db = sa_get_db(zp->z_sa_hdl);
if (ZFS_OLD_ZNODE_PHYS_SIZE + len <= dmu_bonus_max()) {
- VERIFY(dmu_set_bonus(db,
- len + ZFS_OLD_ZNODE_PHYS_SIZE, tx) == 0);
+ VERIFY0(dmu_set_bonus(db, len + ZFS_OLD_ZNODE_PHYS_SIZE, tx));
if (len) {
bcopy(link, (caddr_t)db->db_data +
ZFS_OLD_ZNODE_PHYS_SIZE, len);
@@ -107,8 +106,8 @@ zfs_sa_symlink(znode_t *zp, char *link, int len, dmu_tx_t *tx)
dmu_buf_t *dbp;
zfs_grow_blocksize(zp, len, tx);
- VERIFY(0 == dmu_buf_hold(ZTOZSB(zp)->z_os,
- zp->z_id, 0, FTAG, &dbp, DMU_READ_NO_PREFETCH));
+ VERIFY0(dmu_buf_hold(ZTOZSB(zp)->z_os, zp->z_id, 0, FTAG, &dbp,
+ DMU_READ_NO_PREFETCH));
dmu_buf_will_dirty(dbp, tx);
diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c
index 19cb414a9..310d4827b 100644
--- a/module/zfs/zfs_znode.c
+++ b/module/zfs/zfs_znode.c
@@ -62,6 +62,7 @@
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
#include <sys/refcount.h>
#include <sys/stat.h>
#include <sys/zap.h>
@@ -728,6 +729,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
timestruc_t now;
uint64_t gen, obj;
int bonuslen;
+ int dnodesize;
sa_handle_t *sa_hdl;
dmu_object_type_t obj_type;
sa_bulk_attr_t *sa_attrs;
@@ -739,15 +741,21 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
obj = vap->va_nodeid;
now = vap->va_ctime; /* see zfs_replay_create() */
gen = vap->va_nblocks; /* ditto */
+ dnodesize = vap->va_fsid; /* ditto */
} else {
obj = 0;
gethrestime(&now);
gen = dmu_tx_get_txg(tx);
+ dnodesize = dmu_objset_dnodesize(zsb->z_os);
}
+ if (dnodesize == 0)
+ dnodesize = DNODE_MIN_SIZE;
+
obj_type = zsb->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;
+
bonuslen = (obj_type == DMU_OT_SA) ?
- DN_MAX_BONUSLEN : ZFS_OLD_ZNODE_PHYS_SIZE;
+ DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE;
/*
* Create a new DMU object.
@@ -760,23 +768,23 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
*/
if (S_ISDIR(vap->va_mode)) {
if (zsb->z_replay) {
- VERIFY0(zap_create_claim_norm(zsb->z_os, obj,
+ VERIFY0(zap_create_claim_norm_dnsize(zsb->z_os, obj,
zsb->z_norm, DMU_OT_DIRECTORY_CONTENTS,
- obj_type, bonuslen, tx));
+ obj_type, bonuslen, dnodesize, tx));
} else {
- obj = zap_create_norm(zsb->z_os,
+ obj = zap_create_norm_dnsize(zsb->z_os,
zsb->z_norm, DMU_OT_DIRECTORY_CONTENTS,
- obj_type, bonuslen, tx);
+ obj_type, bonuslen, dnodesize, tx);
}
} else {
if (zsb->z_replay) {
- VERIFY0(dmu_object_claim(zsb->z_os, obj,
+ VERIFY0(dmu_object_claim_dnsize(zsb->z_os, obj,
DMU_OT_PLAIN_FILE_CONTENTS, 0,
- obj_type, bonuslen, tx));
+ obj_type, bonuslen, dnodesize, tx));
} else {
- obj = dmu_object_alloc(zsb->z_os,
+ obj = dmu_object_alloc_dnsize(zsb->z_os,
DMU_OT_PLAIN_FILE_CONTENTS, 0,
- obj_type, bonuslen, tx);
+ obj_type, bonuslen, dnodesize, tx);
}
}
@@ -948,6 +956,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
(*zpp)->z_pflags = pflags;
(*zpp)->z_mode = mode;
+ (*zpp)->z_dnodesize = dnodesize;
if (obj_type == DMU_OT_ZNODE ||
acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
@@ -1767,6 +1776,14 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
ASSERT(error == 0);
/*
+ * Give dmu_object_alloc() a hint about where to start
+ * allocating new objects. Otherwise, since the metadnode's
+ * dnode_phys_t structure isn't initialized yet, dmu_object_next()
+ * would fail and we'd have to skip to the next dnode block.
+ */
+ os->os_obj_next = moid + 1;
+
+ /*
* Set starting attributes.
*/
version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index 289b23c7f..988ffec29 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -1372,7 +1372,8 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
itxg->itxg_sod += itx->itx_sod;
} else {
avl_tree_t *t = &itxs->i_async_tree;
- uint64_t foid = ((lr_ooo_t *)&itx->itx_lr)->lr_foid;
+ uint64_t foid =
+ LR_FOID_GET_OBJ(((lr_ooo_t *)&itx->itx_lr)->lr_foid);
itx_async_node_t *ian;
avl_index_t where;
@@ -1918,7 +1919,8 @@ zil_close(zilog_t *zilog)
mutex_exit(&zilog->zl_lock);
if (txg)
txg_wait_synced(zilog->zl_dmu_pool, txg);
- ASSERT(!zilog_is_dirty(zilog));
+ if (txg < spa_freeze_txg(zilog->zl_spa))
+ ASSERT(!zilog_is_dirty(zilog));
taskq_destroy(zilog->zl_clean_taskq);
zilog->zl_clean_taskq = NULL;
@@ -2122,7 +2124,7 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
*/
if (TX_OOO(txtype)) {
error = dmu_object_info(zilog->zl_os,
- ((lr_ooo_t *)lr)->lr_foid, NULL);
+ LR_FOID_GET_OBJ(((lr_ooo_t *)lr)->lr_foid), NULL);
if (error == ENOENT || error == EEXIST)
return (0);
}
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
index 1f52d2815..003c513dd 100644
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@@ -411,6 +411,11 @@ tests = ['exec_001_pos']
[tests/functional/features/async_destroy]
tests = ['async_destroy_001_pos']
+[tests/functional/features/large_dnode]
+tests = ['large_dnode_001_pos', 'large_dnode_002_pos', 'large_dnode_003_pos',
+ 'large_dnode_004_neg', 'large_dnode_005_pos', 'large_dnode_006_pos',
+ 'large_dnode_007_neg']
+
# DISABLED: needs investigation
#[tests/functional/grow_pool]
#tests = ['grow_pool_001_pos']
diff --git a/tests/zfs-tests/include/default.cfg.in b/tests/zfs-tests/include/default.cfg.in
index 9474c489e..13317ea87 100644
--- a/tests/zfs-tests/include/default.cfg.in
+++ b/tests/zfs-tests/include/default.cfg.in
@@ -80,7 +80,7 @@ export READMMAP=${READMMAP:-${helperdir}/readmmap}
export RENAME_DIR=${RENAME_DIR:-${helperdir}/rename_dir}
export RM_LNKCNT_ZERO_FILE=${RM_LNKCNT_ZERO_FILE:-${helperdir}/rm_lnkcnt_zero_file}
export THREADSAPPEND=${THREADSAPPEND:-${helperdir}/threadsappend}
-export XATTRTEST=${XATTRTEST:-${helpdir}/xattrtest}
+export XATTRTEST=${XATTRTEST:-${helperdir}/xattrtest}
# ensure we're running in the C locale, since
# localised messages may result in test failures
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
index d1d89f1af..f7a1d9cb1 100644
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
@@ -35,7 +35,7 @@ typeset -a properties=("size" "capacity" "altroot" "health" "guid" "version"
"free" "allocated" "readonly" "comment" "expandsize" "freeing" "failmode"
"listsnapshots" "autoexpand" "fragmentation" "leaked" "ashift"
"feature@async_destroy" "feature@empty_bpobj" "feature@lz4_compress"
- "feature@large_blocks" "feature@filesystem_limits"
+ "feature@large_blocks" "feature@large_dnode" "feature@filesystem_limits"
"feature@spacemap_histogram" "feature@enabled_txg" "feature@hole_birth"
"feature@extensible_dataset" "feature@bookmarks" "feature@embedded_data")
else
diff --git a/tests/zfs-tests/tests/functional/delegate/delegate_common.kshlib b/tests/zfs-tests/tests/functional/delegate/delegate_common.kshlib
index c90329ce1..796cda929 100644
--- a/tests/zfs-tests/tests/functional/delegate/delegate_common.kshlib
+++ b/tests/zfs-tests/tests/functional/delegate/delegate_common.kshlib
@@ -250,6 +250,10 @@ function check_fs_perm
verify_fs_canmount $user $perm $fs
ret=$?
;;
+ dnodesize)
+ verify_fs_dnodesize $user $perm $fs
+ ret=$?
+ ;;
recordsize)
verify_fs_recordsize $user $perm $fs
ret=$?
@@ -1112,6 +1116,21 @@ function verify_fs_recordsize
return 0
}
+#
+# Check that $user can set the dnodesize property on $fs.
+# $perm is accepted so the signature matches the other verify_fs_*
+# helpers called from check_fs_perm, but it is not used here.
+# Returns 0 if the property reads back as the value set, 1 otherwise.
+#
+function verify_fs_dnodesize
+{
+	typeset user=$1
+	typeset perm=$2
+	typeset fs=$3
+	# typeset keeps $value local instead of leaking it to the caller.
+	typeset value="2k"
+
+	user_run $user $ZFS set dnodesize=$value $fs
+	if [[ $value != $(get_prop dnodesize $fs) ]]; then
+		return 1
+	fi
+
+	return 0
+}
+
function verify_fs_quota
{
typeset user=$1
diff --git a/tests/zfs-tests/tests/functional/delegate/zfs_allow_010_pos.ksh b/tests/zfs-tests/tests/functional/delegate/zfs_allow_010_pos.ksh
index 28b2b1dec..2e3be0cf5 100755
--- a/tests/zfs-tests/tests/functional/delegate/zfs_allow_010_pos.ksh
+++ b/tests/zfs-tests/tests/functional/delegate/zfs_allow_010_pos.ksh
@@ -70,6 +70,7 @@ set -A perms create true false \
allow true true \
quota true false \
reservation true true \
+ dnodesize true false \
recordsize true false \
checksum true true \
compression true true \
@@ -95,6 +96,7 @@ set -A perms create true false \
allow true true \
quota true false \
reservation true true \
+ dnodesize true false \
recordsize true false \
mountpoint true false \
checksum true true \
diff --git a/tests/zfs-tests/tests/functional/delegate/zfs_allow_012_neg.ksh b/tests/zfs-tests/tests/functional/delegate/zfs_allow_012_neg.ksh
index a64561b69..723fb9292 100755
--- a/tests/zfs-tests/tests/functional/delegate/zfs_allow_012_neg.ksh
+++ b/tests/zfs-tests/tests/functional/delegate/zfs_allow_012_neg.ksh
@@ -59,13 +59,13 @@ if is_linux; then
set -A perms create snapshot mount send allow quota reservation \
recordsize mountpoint checksum compression canmount atime \
devices exec volsize setuid readonly snapdir userprop \
- rollback clone rename promote \
+ rollback clone rename promote dnodesize \
zoned xattr receive destroy
else
set -A perms create snapshot mount send allow quota reservation \
recordsize mountpoint checksum compression canmount atime \
devices exec volsize setuid readonly snapdir userprop \
- aclmode aclinherit rollback clone rename promote \
+ aclmode aclinherit rollback clone rename promote dnodesize \
zoned xattr receive destroy sharenfs share
fi
diff --git a/tests/zfs-tests/tests/functional/features/Makefile.am b/tests/zfs-tests/tests/functional/features/Makefile.am
index 4c6142a2f..3657461e6 100644
--- a/tests/zfs-tests/tests/functional/features/Makefile.am
+++ b/tests/zfs-tests/tests/functional/features/Makefile.am
@@ -1 +1,3 @@
-SUBDIRS = async_destroy
+SUBDIRS = \
+ async_destroy \
+ large_dnode
diff --git a/tests/zfs-tests/tests/functional/features/large_dnode/Makefile.am b/tests/zfs-tests/tests/functional/features/large_dnode/Makefile.am
new file mode 100644
index 000000000..6afda5362
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/features/large_dnode/Makefile.am
@@ -0,0 +1,11 @@
+pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/features/large_dnode
+dist_pkgdata_SCRIPTS = \
+ cleanup.ksh \
+ setup.ksh \
+ large_dnode_001_pos.ksh \
+ large_dnode_002_pos.ksh \
+ large_dnode_003_pos.ksh \
+ large_dnode_004_neg.ksh \
+ large_dnode_005_pos.ksh \
+ large_dnode_006_pos.ksh \
+ large_dnode_007_neg.ksh
diff --git a/tests/zfs-tests/tests/functional/features/large_dnode/cleanup.ksh b/tests/zfs-tests/tests/functional/features/large_dnode/cleanup.ksh
new file mode 100755
index 000000000..61caf3910
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/features/large_dnode/cleanup.ksh
@@ -0,0 +1,25 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+default_cleanup
diff --git a/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_001_pos.ksh b/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_001_pos.ksh
new file mode 100755
index 000000000..a05acb97e
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_001_pos.ksh
@@ -0,0 +1,77 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Verify that the dnode sizes of newly created files are consistent
+# with the dnodesize dataset property.
+#
+# STRATEGY:
+# 1. Create a file system
+# 2. Set dnodesize to a legal literal value
+# 3. Create a file
+# 4. Repeat 2-3 for all legal literal values of dnodesize
+# 5. Unmount the file system
+# 6. Use zdb to check expected dnode sizes
+#
+
+TEST_FS=$TESTPOOL/large_dnode
+
+verify_runnable "both"
+
+function cleanup
+{
+ datasetexists $TEST_FS && log_must $ZFS destroy $TEST_FS
+}
+
+log_onexit cleanup
+log_assert "dnode sizes are consistent with dnodesize dataset property"
+
+log_must $ZFS create $TEST_FS
+
+set -A dnsizes "512" "1k" "2k" "4k" "8k" "16k"
+set -A inodes
+
+for ((i=0; i < ${#dnsizes[*]}; i++)) ; do
+ size=${dnsizes[$i]}
+ if [[ $size == "512" ]] ; then
+ size="legacy"
+ fi
+ file=/$TEST_FS/file.$size
+ log_must $ZFS set dnsize=$size $TEST_FS
+ touch $file
+ inodes[$i]=$(ls -li $file | awk '{print $1}')
+done
+
+log_must $ZFS umount $TEST_FS
+
+for ((i=0; i < ${#dnsizes[*]}; i++)) ; do
+ dnsize=$($ZDB -dddd $TEST_FS ${inodes[$i]} |
+ awk '/ZFS plain file/ {print $6}' | tr K k)
+ if [[ "$dnsize" != "${dnsizes[$i]}" ]]; then
+ log_fail "dnode size is $dnsize (expected ${dnsizes[$i]})"
+ fi
+done
+
+log_pass "dnode sizes are consistent with dnodesize dataset property"
diff --git a/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_002_pos.ksh b/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_002_pos.ksh
new file mode 100755
index 000000000..788e33a13
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_002_pos.ksh
@@ -0,0 +1,78 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Verify that extended attributes can use extra bonus space of a large
+# dnode without kicking in a spill block.
+#
+# STRATEGY:
+# 1. Create a file system with xattr=sa
+# 2. Set dnodesize to a legal literal value
+# 3. Create a file
+# 4. Store an xattr that fits within the dnode size
+# 5. Repeat 2-4 for all legal literal values of dnodesize
+# 6. Unmount the file system
+# 7. Use zdb to check for missing SPILL_BLKPTR flag
+#
+
+TEST_FS=$TESTPOOL/large_dnode
+
+verify_runnable "both"
+
+function cleanup
+{
+ datasetexists $TEST_FS && log_must $ZFS destroy $TEST_FS
+}
+
+log_onexit cleanup
+log_assert "extended attributes use extra bonus space of a large dnode"
+
+log_must $ZFS create -o xattr=sa $TEST_FS
+
+# Store dnode size minus 512 in an xattr
+set -A xattr_sizes "512" "1536" "3584" "7680" "15872"
+set -A prop_values "1k" "2k" "4k" "8k" "16k"
+set -A inodes
+
+for ((i=0; i < ${#prop_values[*]}; i++)) ; do
+ prop_val=${prop_values[$i]}
+ file=/$TEST_FS/file.$prop_val
+ log_must $ZFS set dnsize=$prop_val $TEST_FS
+ touch $file
+ xattr_size=${xattr_sizes[$i]}
+ xattr_name=user.foo
+ xattr_val=$(dd if=/dev/urandom bs=1 count=$xattr_size |
+ openssl enc -a -A)
+ log_must setfattr -n $xattr_name -v 0s$xattr_val $file
+ inodes[$i]=$(ls -li $file | awk '{print $1}')
+done
+
+log_must $ZFS umount $TEST_FS
+
+for ((i=0; i < ${#inodes[*]}; i++)) ; do
+ log_mustnot eval "$ZDB -dddd $TEST_FS ${inodes[$i]} | grep SPILL_BLKPTR"
+done
+
+log_pass "extended attributes use extra bonus space of a large dnode"
diff --git a/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_003_pos.ksh b/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_003_pos.ksh
new file mode 100755
index 000000000..3f5779ded
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_003_pos.ksh
@@ -0,0 +1,65 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+verify_runnable "both"
+
+function cleanup
+{
+ if datasetexists $LDNPOOL ; then
+ log_must $ZPOOL destroy -f $LDNPOOL
+ fi
+}
+
+log_onexit cleanup
+
+log_assert "feature correctly switches between enabled and active"
+
+LDNPOOL=ldnpool
+LDNFS=$LDNPOOL/large_dnode
+log_must $MKFILE 64M $TESTDIR/$LDNPOOL
+log_must $ZPOOL create $LDNPOOL $TESTDIR/$LDNPOOL
+
+
+state=$($ZPOOL list -Ho feature@large_dnode $LDNPOOL)
+if [[ "$state" != "enabled" ]]; then
+ log_fail "large_dnode has state $state (expected enabled)"
+fi
+
+log_must $ZFS create -o dnodesize=1k $LDNFS
+log_must touch /$LDNFS/foo
+log_must $ZFS unmount $LDNFS
+
+state=$($ZPOOL list -Ho feature@large_dnode $LDNPOOL)
+if [[ "$state" != "active" ]]; then
+ log_fail "large_dnode has state $state (expected active)"
+fi
+
+log_must $ZFS destroy $LDNFS
+
+state=$($ZPOOL list -Ho feature@large_dnode $LDNPOOL)
+if [[ "$state" != "enabled" ]]; then
+ log_fail "large_dnode has state $state (expected enabled)"
+fi
+
+log_pass
diff --git a/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_004_neg.ksh b/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_004_neg.ksh
new file mode 100755
index 000000000..26ed66994
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_004_neg.ksh
@@ -0,0 +1,59 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+verify_runnable "both"
+
+TEST_FS=$TESTPOOL/large_dnode
+TEST_SNAP=$TESTPOOL/large_dnode@ldnsnap
+TEST_STREAM=$TESTDIR/ldnsnap
+
+function cleanup
+{
+ if datasetexists $TEST_FS ; then
+ log_must $ZFS destroy -r $TEST_FS
+ fi
+
+ if datasetexists $LGCYPOOL ; then
+ log_must $ZPOOL destroy -f $LGCYPOOL
+ fi
+
+ rm -f $TEST_STREAM
+}
+
+log_onexit cleanup
+log_assert "zfs send stream with large dnodes not accepted by legacy pool"
+
+log_must $ZFS create -o dnodesize=1k $TEST_FS
+log_must touch /$TEST_FS/foo
+log_must $ZFS umount $TEST_FS
+log_must $ZFS snap $TEST_SNAP
+log_must eval "$ZFS send $TEST_SNAP > $TEST_STREAM"
+
+LGCYPOOL=ldnpool
+LGCYFS=$LGCYPOOL/legacy
+log_must $MKFILE 64M $TESTDIR/$LGCYPOOL
+log_must $ZPOOL create -d $LGCYPOOL $TESTDIR/$LGCYPOOL
+log_mustnot eval "$ZFS recv $LGCYFS < $TEST_STREAM"
+
+log_pass
diff --git a/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_005_pos.ksh b/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_005_pos.ksh
new file mode 100755
index 000000000..e03d1274f
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_005_pos.ksh
@@ -0,0 +1,64 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+verify_runnable "both"
+
+TEST_SEND_FS=$TESTPOOL/send_large_dnode
+TEST_RECV_FS=$TESTPOOL/recv_large_dnode
+TEST_SNAP=$TEST_SEND_FS@ldnsnap
+TEST_STREAM=$TESTDIR/ldnsnap
+TEST_FILE=foo
+
+
+function cleanup
+{
+ if datasetexists $TEST_SEND_FS ; then
+ log_must $ZFS destroy -r $TEST_SEND_FS
+ fi
+
+ if datasetexists $TEST_RECV_FS ; then
+ log_must $ZFS destroy -r $TEST_RECV_FS
+ fi
+
+ rm -f $TEST_STREAM
+}
+
+log_onexit cleanup
+
+log_assert "zfs send stream with large dnodes accepted by new pool"
+
+# Build a send stream from a dataset created with dnodesize=1k.
+log_must $ZFS create -o dnodesize=1k $TEST_SEND_FS
+log_must touch /$TEST_SEND_FS/$TEST_FILE
+log_must $ZFS umount $TEST_SEND_FS
+log_must $ZFS snap $TEST_SNAP
+# Redirect inside eval so that only the output of zfs send, and not
+# anything log_must itself prints, is written to $TEST_STREAM.
+log_must eval "$ZFS send $TEST_SNAP > $TEST_STREAM"
+
+# Receive the stream, then use zdb to check the dnode size of the
+# received file.
+log_must eval "$ZFS recv $TEST_RECV_FS < $TEST_STREAM"
+inode=$(ls -li /$TEST_RECV_FS/$TEST_FILE | awk '{print $1}')
+dnsize=$($ZDB -dddd $TEST_RECV_FS $inode | awk '/ZFS plain file/ {print $6}')
+if [[ "$dnsize" != "1K" ]]; then
+	log_fail "dnode size is $dnsize (expected 1K)"
+fi
+
+log_pass
diff --git a/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_006_pos.ksh b/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_006_pos.ksh
new file mode 100755
index 000000000..198b69bf7
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_006_pos.ksh
@@ -0,0 +1,58 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Run xattrtest on a dataset with large dnodes and xattr=sa
+# to stress xattr usage of the extra bonus space and verify
+# contents
+#
+
+TEST_FS=$TESTPOOL/large_dnode
+
+verify_runnable "both"
+
+function cleanup
+{
+ datasetexists $TEST_FS && log_must $ZFS destroy $TEST_FS
+}
+
+log_onexit cleanup
+log_assert "xattrtest runs cleanly on dataset with large dnodes"
+
+log_must $ZFS create $TEST_FS
+
+set -A xattr_sizes "512" "1536" "3584" "7680" "15872"
+set -A prop_values "1k" "2k" "4k" "8k" "16k"
+
+for ((i=0; i < ${#prop_values[*]}; i++)) ; do
+ prop_val=${prop_values[$i]}
+ dir=/$TEST_FS/$prop_val
+ xattr_size=${xattr_sizes[$i]}
+ log_must $ZFS set dnsize=$prop_val $TEST_FS
+ log_must mkdir $dir
+ log_must $XATTRTEST -R -y -s $xattr_size -f 1024 -p $dir
+done
+
+log_pass
diff --git a/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_007_neg.ksh b/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_007_neg.ksh
new file mode 100755
index 000000000..d6cc17e0b
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_007_neg.ksh
@@ -0,0 +1,56 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Verify that the dnodesize dataset property won't accept a value
+# other than "legacy" if the large_dnode feature is not enabled.
+#
+
+verify_runnable "both"
+
+function cleanup
+{
+ if datasetexists $LGCYPOOL ; then
+ log_must $ZPOOL destroy -f $LGCYPOOL
+ fi
+}
+
+log_onexit cleanup
+
+log_assert "values other than dnodesize=legacy rejected by legacy pool"
+
+set -A prop_vals "auto" "1k" "2k" "4k" "8k" "16k"
+
+LGCYPOOL=lgcypool
+LGCYFS=$LGCYPOOL/legacy
+log_must $MKFILE 64M $TESTDIR/$LGCYPOOL
+log_must $ZPOOL create -d $LGCYPOOL $TESTDIR/$LGCYPOOL
+log_must $ZFS create $LGCYFS
+
+for val in ${prop_vals[@]} ; do
+ log_mustnot $ZFS set dnodesize=$val $LGCYFS
+done
+
+log_pass
diff --git a/tests/zfs-tests/tests/functional/features/large_dnode/setup.ksh b/tests/zfs-tests/tests/functional/features/large_dnode/setup.ksh
new file mode 100755
index 000000000..d9b1a6ee8
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/features/large_dnode/setup.ksh
@@ -0,0 +1,27 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+DISK=${DISKS%% *}
+
+default_setup $DISK
diff --git a/tests/zfs-tests/tests/functional/rsend/rsend.kshlib b/tests/zfs-tests/tests/functional/rsend/rsend.kshlib
index 46f96fa61..91779cc78 100644
--- a/tests/zfs-tests/tests/functional/rsend/rsend.kshlib
+++ b/tests/zfs-tests/tests/functional/rsend/rsend.kshlib
@@ -201,8 +201,8 @@ function cmp_ds_prop
for item in "type" "origin" "volblocksize" "aclinherit" "aclmode" \
"atime" "canmount" "checksum" "compression" "copies" "devices" \
- "exec" "quota" "readonly" "recordsize" "reservation" "setuid" \
- "sharenfs" "snapdir" "version" "volsize" "xattr" "zoned" \
+ "dnodesize" "exec" "quota" "readonly" "recordsize" "reservation" \
+ "setuid" "sharenfs" "snapdir" "version" "volsize" "xattr" "zoned" \
"mountpoint";
do
$ZFS get -H -o property,value,source $item $dtst1 >> \
diff --git a/tests/zfs-tests/tests/functional/rsend/rsend_012_pos.ksh b/tests/zfs-tests/tests/functional/rsend/rsend_012_pos.ksh
index 4a590fab4..91cdd6e34 100755
--- a/tests/zfs-tests/tests/functional/rsend/rsend_012_pos.ksh
+++ b/tests/zfs-tests/tests/functional/rsend/rsend_012_pos.ksh
@@ -142,6 +142,7 @@ for fs in "$POOL" "$POOL/pclone" "$POOL/$FS" "$POOL/$FS/fs1" \
rand_set_prop $fs exec "on" "off"
rand_set_prop $fs quota "512M" "1024M"
rand_set_prop $fs recordsize "512" "2K" "8K" "32K" "128K"
+ rand_set_prop $fs dnodesize "legacy" "auto" "1k" "2k" "4k" "8k" "16k"
rand_set_prop $fs setuid "on" "off"
rand_set_prop $fs snapdir "hidden" "visible"
rand_set_prop $fs xattr "on" "off"
diff --git a/zfs-script-config.sh.in b/zfs-script-config.sh.in
index 2470c554a..e22cfd595 100644
--- a/zfs-script-config.sh.in
+++ b/zfs-script-config.sh.in
@@ -59,7 +59,7 @@ export READMMAP=${TESTSDIR}/zfs-tests/cmd/readmmap/readmmap
export RENAME_DIR=${TESTSDIR}/zfs-tests/cmd/rename_dir/rename_dir
export RM_LNKCNT_ZERO_FILE=${TESTSDIR}/zfs-tests/cmd/rm_lnkcnt_zero_file/rm_lnkcnt_zero_file
export THREADSAPPEND=${TESTSDIR}/zfs-tests/cmd/threadsappend/threadsappend
-export XATTRTEST=${TESTDIR}/zfs-tests/cmd/xattrtest
+export XATTRTEST=${TESTSDIR}/zfs-tests/cmd/xattrtest/xattrtest
export INTREE=1
export LDMOD=/sbin/insmod