diff options
author | Giuseppe Di Natale <[email protected]> | 2017-09-13 15:46:15 -0700 |
---|---|---|
committer | Tony Hutter <[email protected]> | 2017-09-13 15:46:15 -0700 |
commit | 45d1abc74d6bd4b09c573dd8db0d2571eb82220d (patch) | |
tree | 8d2afe50bd95ebaadde7410d3a566e1a10f8f4ec | |
parent | 89950722c627ad4470916c5fe94d200af72817b2 (diff) |
Improved dnode allocation and dnode_hold_impl() (#6611)
Refactor dmu_object_alloc_dnsize() and dnode_hold_impl() to simplify the
code, fix errors introduced by commit dbeb879 (PR #6117) interacting
badly with large dnodes, and improve performance.
* When allocating a new dnode in dmu_object_alloc_dnsize(), update the
percpu object ID for the core's metadnode chunk immediately. This
eliminates most lock contention when taking the hold and creating the
dnode.
* Correct detection of the chunk boundary to work properly with large
dnodes.
* Separate the dnode_hold_impl() code for the FREE case from the code for
the ALLOCATED case to make it easier to read.
* Fully populate the dnode handle array immediately after reading a
block of the metadnode from disk. Subsequently the dnode handle array
provides enough information to determine which dnode slots are in use
and which are free.
* Add several kstats to allow the behavior of the code to be examined.
* Verify dnode packing in large_dnode_008_pos.ksh. Since the test
performs only creates, it should leave very few holes in the metadnode.
* Add test large_dnode_009_pos.ksh, which performs concurrent creates
and deletes, to complement the existing test which does only creates.
With the above fixes, there is very little contention in a test of about
200,000 racing dnode allocations produced by tests 'large_dnode_008_pos'
and 'large_dnode_009_pos'.
name type data
dnode_hold_dbuf_hold 4 0
dnode_hold_dbuf_read 4 0
dnode_hold_alloc_hits 4 3804690
dnode_hold_alloc_misses 4 216
dnode_hold_alloc_interior 4 3
dnode_hold_alloc_lock_retry 4 0
dnode_hold_alloc_lock_misses 4 0
dnode_hold_alloc_type_none 4 0
dnode_hold_free_hits 4 203105
dnode_hold_free_misses 4 4
dnode_hold_free_lock_misses 4 0
dnode_hold_free_lock_retry 4 0
dnode_hold_free_overflow 4 0
dnode_hold_free_refcount 4 57
dnode_hold_free_txg 4 0
dnode_allocate 4 203154
dnode_reallocate 4 0
dnode_buf_evict 4 23918
dnode_alloc_next_chunk 4 4887
dnode_alloc_race 4 0
dnode_alloc_next_block 4 18
The performance is slightly improved for concurrent creates with
16+ threads, and unchanged for low thread counts.
Signed-off-by: Brian Behlendorf <[email protected]>
Signed-off-by: Olaf Faaland <[email protected]>
-rw-r--r-- | cmd/zdb/zdb.c | 36 | ||||
-rw-r--r-- | include/sys/dnode.h | 136 | ||||
-rw-r--r-- | module/zfs/dbuf_stats.c | 3 | ||||
-rw-r--r-- | module/zfs/dmu_object.c | 68 | ||||
-rw-r--r-- | module/zfs/dnode.c | 522 | ||||
-rw-r--r-- | tests/runfiles/linux.run | 2 | ||||
-rw-r--r-- | tests/zfs-tests/tests/functional/features/large_dnode/Makefile.am | 3 | ||||
-rwxr-xr-x | tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_008_pos.ksh | 19 | ||||
-rwxr-xr-x | tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_009_pos.ksh | 71 |
9 files changed, 609 insertions, 251 deletions
diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 21f8ea87c..1097501e8 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -1933,7 +1933,8 @@ static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = { }; static void -dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header) +dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header, + uint64_t *dnode_slots_used) { dmu_buf_t *db = NULL; dmu_object_info_t doi; @@ -1965,6 +1966,9 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header) } dmu_object_info_from_dnode(dn, &doi); + if (dnode_slots_used) + *dnode_slots_used = doi.doi_dnodesize / DNODE_MIN_SIZE; + zdb_nicenum(doi.doi_metadata_block_size, iblk); zdb_nicenum(doi.doi_data_block_size, dblk); zdb_nicenum(doi.doi_max_offset, lsize); @@ -2072,6 +2076,9 @@ dump_dir(objset_t *os) int verbosity = dump_opt['d']; int print_header = 1; int i, error; + uint64_t total_slots_used = 0; + uint64_t max_slot_used = 0; + uint64_t dnode_slots; dsl_pool_config_enter(dmu_objset_pool(os), FTAG); dmu_objset_fast_stat(os, &dds); @@ -2112,7 +2119,7 @@ dump_dir(objset_t *os) if (zopt_objects != 0) { for (i = 0; i < zopt_objects; i++) dump_object(os, zopt_object[i], verbosity, - &print_header); + &print_header, NULL); (void) printf("\n"); return; } @@ -2129,24 +2136,39 @@ dump_dir(objset_t *os) if (BP_IS_HOLE(os->os_rootbp)) return; - dump_object(os, 0, verbosity, &print_header); + dump_object(os, 0, verbosity, &print_header, NULL); object_count = 0; if (DMU_USERUSED_DNODE(os) != NULL && DMU_USERUSED_DNODE(os)->dn_type != 0) { - dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header); - dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header); + dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header, + NULL); + dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header, + NULL); } object = 0; while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) { - dump_object(os, object, verbosity, 
&print_header); + dump_object(os, object, verbosity, &print_header, &dnode_slots); object_count++; + total_slots_used += dnode_slots; + max_slot_used = object + dnode_slots - 1; } ASSERT3U(object_count, ==, usedobjs); (void) printf("\n"); + (void) printf(" Dnode slots:\n"); + (void) printf("\tTotal used: %10llu\n", + (u_longlong_t)total_slots_used); + (void) printf("\tMax used: %10llu\n", + (u_longlong_t)max_slot_used); + (void) printf("\tPercent empty: %10lf\n", + (double)(max_slot_used - total_slots_used)*100 / + (double)max_slot_used); + + (void) printf("\n"); + if (error != ESRCH) { (void) fprintf(stderr, "dmu_object_next() = %d\n", error); abort(); @@ -2610,7 +2632,7 @@ dump_path_impl(objset_t *os, uint64_t obj, char *name) return (dump_path_impl(os, child_obj, s + 1)); /*FALLTHROUGH*/ case DMU_OT_PLAIN_FILE_CONTENTS: - dump_object(os, child_obj, dump_opt['v'], &header); + dump_object(os, child_obj, dump_opt['v'], &header, NULL); return (0); default: (void) fprintf(stderr, "object %llu has non-file/directory " diff --git a/include/sys/dnode.h b/include/sys/dnode.h index d32855dcd..c7efe5593 100644 --- a/include/sys/dnode.h +++ b/include/sys/dnode.h @@ -100,6 +100,13 @@ extern "C" { #define DN_ZERO_BONUSLEN (DN_BONUS_SIZE(DNODE_MAX_SIZE) + 1) #define DN_KILL_SPILLBLK (1) +#define DN_SLOT_UNINIT ((void *)NULL) /* Uninitialized */ +#define DN_SLOT_FREE ((void *)1UL) /* Free slot */ +#define DN_SLOT_ALLOCATED ((void *)2UL) /* Allocated slot */ +#define DN_SLOT_INTERIOR ((void *)3UL) /* Interior allocated slot */ +#define DN_SLOT_IS_PTR(dn) ((void *)dn > DN_SLOT_INTERIOR) +#define DN_SLOT_IS_VALID(dn) ((void *)dn != NULL) + #define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT) #define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT) @@ -363,6 +370,135 @@ void dnode_evict_bonus(dnode_t *dn); ((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \ (_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA) +/* + * Used for dnodestats kstat. 
+ */ +typedef struct dnode_stats { + /* + * Number of failed attempts to hold a meta dnode dbuf. + */ + kstat_named_t dnode_hold_dbuf_hold; + /* + * Number of failed attempts to read a meta dnode dbuf. + */ + kstat_named_t dnode_hold_dbuf_read; + /* + * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was able + * to hold the requested object number which was allocated. This is + * the common case when looking up any allocated object number. + */ + kstat_named_t dnode_hold_alloc_hits; + /* + * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was not + * able to hold the request object number because it was not allocated. + */ + kstat_named_t dnode_hold_alloc_misses; + /* + * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was not + * able to hold the request object number because the object number + * refers to an interior large dnode slot. + */ + kstat_named_t dnode_hold_alloc_interior; + /* + * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) needed + * to retry acquiring slot zrl locks due to contention. + */ + kstat_named_t dnode_hold_alloc_lock_retry; + /* + * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) did not + * need to create the dnode because another thread did so after + * dropping the read lock but before acquiring the write lock. + */ + kstat_named_t dnode_hold_alloc_lock_misses; + /* + * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) found + * a free dnode instantiated by dnode_create() but not yet allocated + * by dnode_allocate(). + */ + kstat_named_t dnode_hold_alloc_type_none; + /* + * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was able + * to hold the requested range of free dnode slots. + */ + kstat_named_t dnode_hold_free_hits; + /* + * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was not + * able to hold the requested range of free dnode slots because + * at least one slot was allocated. 
+ */ + kstat_named_t dnode_hold_free_misses; + /* + * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was not + * able to hold the requested range of free dnode slots because + * after acquiring the zrl lock at least one slot was allocated. + */ + kstat_named_t dnode_hold_free_lock_misses; + /* + * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) needed + * to retry acquiring slot zrl locks due to contention. + */ + kstat_named_t dnode_hold_free_lock_retry; + /* + * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) requested + * a range of dnode slots which were held by another thread. + */ + kstat_named_t dnode_hold_free_refcount; + /* + * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) requested + * a range of dnode slots which would overflow the dnode_phys_t. + */ + kstat_named_t dnode_hold_free_overflow; + /* + * Number of times a dnode_hold(...) was attempted on a dnode + * which had already been unlinked in an earlier txg. + */ + kstat_named_t dnode_hold_free_txg; + /* + * Number of new dnodes allocated by dnode_allocate(). + */ + kstat_named_t dnode_allocate; + /* + * Number of dnodes re-allocated by dnode_reallocate(). + */ + kstat_named_t dnode_reallocate; + /* + * Number of meta dnode dbufs evicted. + */ + kstat_named_t dnode_buf_evict; + /* + * Number of times dmu_object_alloc*() reached the end of the existing + * object ID chunk and advanced to a new one. + */ + kstat_named_t dnode_alloc_next_chunk; + /* + * Number of times multiple threads attempted to allocate a dnode + * from the same block of free dnodes. + */ + kstat_named_t dnode_alloc_race; + /* + * Number of times dmu_object_alloc*() was forced to advance to the + * next meta dnode dbuf due to an error from dmu_object_next(). + */ + kstat_named_t dnode_alloc_next_block; + /* + * Statistics for tracking dnodes which have been moved. 
+ */ + kstat_named_t dnode_move_invalid; + kstat_named_t dnode_move_recheck1; + kstat_named_t dnode_move_recheck2; + kstat_named_t dnode_move_special; + kstat_named_t dnode_move_handle; + kstat_named_t dnode_move_rwlock; + kstat_named_t dnode_move_active; +} dnode_stats_t; + +extern dnode_stats_t dnode_stats; + +#define DNODE_STAT_INCR(stat, val) \ + atomic_add_64(&dnode_stats.stat.value.ui64, (val)); +#define DNODE_STAT_BUMP(stat) \ + DNODE_STAT_INCR(stat, 1); + #ifdef ZFS_DEBUG /* diff --git a/module/zfs/dbuf_stats.c b/module/zfs/dbuf_stats.c index ae8ba8682..1712c9c10 100644 --- a/module/zfs/dbuf_stats.c +++ b/module/zfs/dbuf_stats.c @@ -72,8 +72,7 @@ __dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db) if (db->db_buf) arc_buf_info(db->db_buf, &abi, zfs_dbuf_state_index); - if (dn) - __dmu_object_info_from_dnode(dn, &doi); + __dmu_object_info_from_dnode(dn, &doi); nwritten = snprintf(buf, size, "%-16s %-8llu %-8lld %-8lld %-8lld %-8llu %-8llu %-5d %-5d %-5lu | " diff --git a/module/zfs/dmu_object.c b/module/zfs/dmu_object.c index 14264ec30..e7412b750 100644 --- a/module/zfs/dmu_object.c +++ b/module/zfs/dmu_object.c @@ -93,7 +93,10 @@ dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize, * If we finished a chunk of dnodes, get a new one from * the global allocator. */ - if (P2PHASE(object, dnodes_per_chunk) == 0) { + if ((P2PHASE(object, dnodes_per_chunk) == 0) || + (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) < + dn_slots)) { + DNODE_STAT_BUMP(dnode_alloc_next_chunk); mutex_enter(&os->os_obj_lock); ASSERT0(P2PHASE(os->os_obj_next_chunk, dnodes_per_chunk)); @@ -158,6 +161,13 @@ dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize, } /* + * The value of (*cpuobj) before adding dn_slots is the object + * ID assigned to us. The value afterwards is the object ID + * assigned to whoever wants to do an allocation next. 
+ */ + object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots; + + /* * XXX We should check for an i/o error here and return * up to our caller. Actually we should pre-read it in * dmu_tx_assign(), but there is currently no mechanism @@ -177,21 +187,20 @@ dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize, rw_exit(&dn->dn_struct_rwlock); dmu_tx_add_new_object(tx, dn); dnode_rele(dn, FTAG); - - (void) atomic_swap_64(cpuobj, - object + dn_slots); return (object); } rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); + DNODE_STAT_BUMP(dnode_alloc_race); } + /* + * Skip to next known valid starting point on error. This + * is the start of the next block of dnodes. + */ if (dmu_object_next(os, &object, B_TRUE, 0) != 0) { - /* - * Skip to next known valid starting point for a - * dnode. - */ object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK); + DNODE_STAT_BUMP(dnode_alloc_next_block); } (void) atomic_swap_64(cpuobj, object); } @@ -304,24 +313,37 @@ dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg) if (*objectp == 0) { start_obj = 1; } else if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) { + uint64_t i = *objectp + 1; + uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1); + dmu_object_info_t doi; + /* - * For large_dnode datasets, scan from the beginning of the - * dnode block to find the starting offset. This is needed - * because objectp could be part of a large dnode so we can't - * assume it's a hole even if dmu_object_info() returns ENOENT. + * Scan through the remaining meta dnode block. The contents + * of each slot in the block are known so it can be quickly + * checked. If the block is exhausted without a match then + * hand off to dnode_next_offset() for further scanning. 
*/ - int epb = DNODE_BLOCK_SIZE >> DNODE_SHIFT; - int skip; - uint64_t i; - - for (i = *objectp & ~(epb - 1); i <= *objectp; i += skip) { - dmu_object_info_t doi; - + while (i <= last_obj) { error = dmu_object_info(os, i, &doi); - if (error) - skip = 1; - else - skip = doi.doi_dnodesize >> DNODE_SHIFT; + if (error == ENOENT) { + if (hole) { + *objectp = i; + return (0); + } else { + i++; + } + } else if (error == EEXIST) { + i++; + } else if (error == 0) { + if (hole) { + i += doi.doi_dnodesize >> DNODE_SHIFT; + } else { + *objectp = i; + return (0); + } + } else { + return (error); + } } start_obj = i; diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 614b5785a..e05a4d0a5 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -39,20 +39,39 @@ #include <sys/range_tree.h> #include <sys/trace_dnode.h> +dnode_stats_t dnode_stats = { + { "dnode_hold_dbuf_hold", KSTAT_DATA_UINT64 }, + { "dnode_hold_dbuf_read", KSTAT_DATA_UINT64 }, + { "dnode_hold_alloc_hits", KSTAT_DATA_UINT64 }, + { "dnode_hold_alloc_misses", KSTAT_DATA_UINT64 }, + { "dnode_hold_alloc_interior", KSTAT_DATA_UINT64 }, + { "dnode_hold_alloc_lock_retry", KSTAT_DATA_UINT64 }, + { "dnode_hold_alloc_lock_misses", KSTAT_DATA_UINT64 }, + { "dnode_hold_alloc_type_none", KSTAT_DATA_UINT64 }, + { "dnode_hold_free_hits", KSTAT_DATA_UINT64 }, + { "dnode_hold_free_misses", KSTAT_DATA_UINT64 }, + { "dnode_hold_free_lock_misses", KSTAT_DATA_UINT64 }, + { "dnode_hold_free_lock_retry", KSTAT_DATA_UINT64 }, + { "dnode_hold_free_overflow", KSTAT_DATA_UINT64 }, + { "dnode_hold_free_refcount", KSTAT_DATA_UINT64 }, + { "dnode_hold_free_txg", KSTAT_DATA_UINT64 }, + { "dnode_allocate", KSTAT_DATA_UINT64 }, + { "dnode_reallocate", KSTAT_DATA_UINT64 }, + { "dnode_buf_evict", KSTAT_DATA_UINT64 }, + { "dnode_alloc_next_chunk", KSTAT_DATA_UINT64 }, + { "dnode_alloc_race", KSTAT_DATA_UINT64 }, + { "dnode_alloc_next_block", KSTAT_DATA_UINT64 }, + { "dnode_move_invalid", KSTAT_DATA_UINT64 }, + { "dnode_move_recheck1", 
KSTAT_DATA_UINT64 }, + { "dnode_move_recheck2", KSTAT_DATA_UINT64 }, + { "dnode_move_special", KSTAT_DATA_UINT64 }, + { "dnode_move_handle", KSTAT_DATA_UINT64 }, + { "dnode_move_rwlock", KSTAT_DATA_UINT64 }, + { "dnode_move_active", KSTAT_DATA_UINT64 }, +}; + +static kstat_t *dnode_ksp; static kmem_cache_t *dnode_cache; -/* - * Define DNODE_STATS to turn on statistic gathering. By default, it is only - * turned on when DEBUG is also defined. - */ -#ifdef DEBUG -#define DNODE_STATS -#endif /* DEBUG */ - -#ifdef DNODE_STATS -#define DNODE_STAT_ADD(stat) ((stat)++) -#else -#define DNODE_STAT_ADD(stat) /* nothing */ -#endif /* DNODE_STATS */ ASSERTV(static dnode_phys_t dnode_phys_zero); @@ -203,11 +222,24 @@ dnode_init(void) dnode_cache = kmem_cache_create("dnode_t", sizeof (dnode_t), 0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0); kmem_cache_set_move(dnode_cache, dnode_move); + + dnode_ksp = kstat_create("zfs", 0, "dnodestats", "misc", + KSTAT_TYPE_NAMED, sizeof (dnode_stats) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + if (dnode_ksp != NULL) { + dnode_ksp->ks_data = &dnode_stats; + kstat_install(dnode_ksp); + } } void dnode_fini(void) { + if (dnode_ksp != NULL) { + kstat_delete(dnode_ksp); + dnode_ksp = NULL; + } + kmem_cache_destroy(dnode_cache); dnode_cache = NULL; } @@ -391,7 +423,7 @@ dnode_setdblksz(dnode_t *dn, int size) } static dnode_t * -dnode_create(objset_t *os, dnode_phys_t *dnp, int slots, dmu_buf_impl_t *db, +dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, uint64_t object, dnode_handle_t *dnh) { dnode_t *dn; @@ -424,26 +456,18 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, int slots, dmu_buf_impl_t *db, dn->dn_compress = dnp->dn_compress; dn->dn_bonustype = dnp->dn_bonustype; dn->dn_bonuslen = dnp->dn_bonuslen; + dn->dn_num_slots = dnp->dn_extra_slots + 1; dn->dn_maxblkid = dnp->dn_maxblkid; dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0); dn->dn_id_flags = 0; - if (slots && dn->dn_type == DMU_OT_NONE) - 
dn->dn_num_slots = slots; - else - dn->dn_num_slots = dnp->dn_extra_slots + 1; - dmu_zfetch_init(&dn->dn_zfetch, dn); ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type)); + ASSERT(zrl_is_locked(&dnh->dnh_zrlock)); + ASSERT(!DN_SLOT_IS_PTR(dnh->dnh_dnode)); mutex_enter(&os->os_lock); - if (dnh->dnh_dnode != NULL) { - /* Lost the allocation race. */ - mutex_exit(&os->os_lock); - kmem_cache_free(dnode_cache, dn); - return (dnh->dnh_dnode); - } /* * Exclude special dnodes from os_dnodes so an empty os_dnodes @@ -466,6 +490,7 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, int slots, dmu_buf_impl_t *db, mutex_exit(&os->os_lock); arc_space_consume(sizeof (dnode_t), ARC_SPACE_DNODE); + return (dn); } @@ -549,6 +574,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d dn_slots=%d\n", dn->dn_objset, dn->dn_object, tx->tx_txg, blocksize, ibs, dn_slots); + DNODE_STAT_BUMP(dnode_allocate); ASSERT(dn->dn_type == DMU_OT_NONE); ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0); @@ -636,6 +662,7 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset)))); dn_slots = dn_slots > 0 ? 
dn_slots : DNODE_MIN_SLOTS; + DNODE_STAT_BUMP(dnode_reallocate); /* clean up any unreferenced dbufs */ dnode_evict_dbufs(dn); @@ -697,18 +724,6 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, } #ifdef _KERNEL -#ifdef DNODE_STATS -static struct { - uint64_t dms_dnode_invalid; - uint64_t dms_dnode_recheck1; - uint64_t dms_dnode_recheck2; - uint64_t dms_dnode_special; - uint64_t dms_dnode_handle; - uint64_t dms_dnode_rwlock; - uint64_t dms_dnode_active; -} dnode_move_stats; -#endif /* DNODE_STATS */ - static void dnode_move_impl(dnode_t *odn, dnode_t *ndn) { @@ -866,7 +881,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg) */ os = odn->dn_objset; if (!POINTER_IS_VALID(os)) { - DNODE_STAT_ADD(dnode_move_stats.dms_dnode_invalid); + DNODE_STAT_BUMP(dnode_move_invalid); return (KMEM_CBRC_DONT_KNOW); } @@ -876,7 +891,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg) rw_enter(&os_lock, RW_WRITER); if (os != odn->dn_objset) { rw_exit(&os_lock); - DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck1); + DNODE_STAT_BUMP(dnode_move_recheck1); return (KMEM_CBRC_DONT_KNOW); } @@ -894,7 +909,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg) if (os != odn->dn_objset) { mutex_exit(&os->os_lock); rw_exit(&os_lock); - DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck2); + DNODE_STAT_BUMP(dnode_move_recheck2); return (KMEM_CBRC_DONT_KNOW); } @@ -907,7 +922,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg) rw_exit(&os_lock); if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) { mutex_exit(&os->os_lock); - DNODE_STAT_ADD(dnode_move_stats.dms_dnode_special); + DNODE_STAT_BUMP(dnode_move_special); return (KMEM_CBRC_NO); } ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */ @@ -922,7 +937,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg) */ if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) { mutex_exit(&os->os_lock); - DNODE_STAT_ADD(dnode_move_stats.dms_dnode_handle); + 
DNODE_STAT_BUMP(dnode_move_handle); return (KMEM_CBRC_LATER); } @@ -938,7 +953,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg) if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) { zrl_exit(&odn->dn_handle->dnh_zrlock); mutex_exit(&os->os_lock); - DNODE_STAT_ADD(dnode_move_stats.dms_dnode_rwlock); + DNODE_STAT_BUMP(dnode_move_rwlock); return (KMEM_CBRC_LATER); } @@ -964,7 +979,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg) rw_exit(&odn->dn_struct_rwlock); zrl_exit(&odn->dn_handle->dnh_zrlock); mutex_exit(&os->os_lock); - DNODE_STAT_ADD(dnode_move_stats.dms_dnode_active); + DNODE_STAT_BUMP(dnode_move_active); return (KMEM_CBRC_LATER); } @@ -988,6 +1003,78 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg) } #endif /* _KERNEL */ +static void +dnode_slots_hold(dnode_children_t *children, int idx, int slots) +{ + ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); + + for (int i = idx; i < idx + slots; i++) { + dnode_handle_t *dnh = &children->dnc_children[i]; + zrl_add(&dnh->dnh_zrlock); + } +} + +static void +dnode_slots_rele(dnode_children_t *children, int idx, int slots) +{ + ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); + + for (int i = idx; i < idx + slots; i++) { + dnode_handle_t *dnh = &children->dnc_children[i]; + + if (zrl_is_locked(&dnh->dnh_zrlock)) + zrl_exit(&dnh->dnh_zrlock); + else + zrl_remove(&dnh->dnh_zrlock); + } +} + +static int +dnode_slots_tryenter(dnode_children_t *children, int idx, int slots) +{ + ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); + + for (int i = idx; i < idx + slots; i++) { + dnode_handle_t *dnh = &children->dnc_children[i]; + + if (!zrl_tryenter(&dnh->dnh_zrlock)) { + for (int j = idx; j < i; j++) { + dnh = &children->dnc_children[j]; + zrl_exit(&dnh->dnh_zrlock); + } + + return (0); + } + } + + return (1); +} + +static void +dnode_set_slots(dnode_children_t *children, int idx, int slots, void *ptr) +{ + ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); + + for (int i = idx; i < idx + slots; 
i++) { + dnode_handle_t *dnh = &children->dnc_children[i]; + dnh->dnh_dnode = ptr; + } +} + +static boolean_t +dnode_check_slots(dnode_children_t *children, int idx, int slots, void *ptr) +{ + ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); + + for (int i = idx; i < idx + slots; i++) { + dnode_handle_t *dnh = &children->dnc_children[i]; + if (dnh->dnh_dnode != ptr) + return (B_FALSE); + } + + return (B_TRUE); +} + void dnode_special_close(dnode_handle_t *dnh) { @@ -995,7 +1082,7 @@ dnode_special_close(dnode_handle_t *dnh) /* * Wait for final references to the dnode to clear. This can - * only happen if the arc is asyncronously evicting state that + * only happen if the arc is asynchronously evicting state that * has a hold on this dnode while we are trying to evict this * dnode. */ @@ -1015,19 +1102,24 @@ dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object, { dnode_t *dn; - dn = dnode_create(os, dnp, 0, NULL, object, dnh); zrl_init(&dnh->dnh_zrlock); + zrl_tryenter(&dnh->dnh_zrlock); + + dn = dnode_create(os, dnp, NULL, object, dnh); DNODE_VERIFY(dn); + + zrl_exit(&dnh->dnh_zrlock); } static void dnode_buf_evict_async(void *dbu) { - dnode_children_t *children_dnodes = dbu; - int i; + dnode_children_t *dnc = dbu; - for (i = 0; i < children_dnodes->dnc_count; i++) { - dnode_handle_t *dnh = &children_dnodes->dnc_children[i]; + DNODE_STAT_BUMP(dnode_buf_evict); + + for (int i = 0; i < dnc->dnc_count; i++) { + dnode_handle_t *dnh = &dnc->dnc_children[i]; dnode_t *dn; /* @@ -1035,8 +1127,9 @@ dnode_buf_evict_async(void *dbu) * another valid address, so there is no need here to guard * against changes to or from NULL. 
*/ - if (dnh->dnh_dnode == NULL) { + if (!DN_SLOT_IS_PTR(dnh->dnh_dnode)) { zrl_destroy(&dnh->dnh_zrlock); + dnh->dnh_dnode = DN_SLOT_UNINIT; continue; } @@ -1051,150 +1144,37 @@ dnode_buf_evict_async(void *dbu) ASSERT(refcount_is_zero(&dn->dn_holds)); ASSERT(refcount_is_zero(&dn->dn_tx_holds)); - dnode_destroy(dn); /* implicit zrl_remove() */ + dnode_destroy(dn); /* implicit zrl_remove() for first slot */ zrl_destroy(&dnh->dnh_zrlock); - dnh->dnh_dnode = NULL; - } - kmem_free(children_dnodes, sizeof (dnode_children_t) + - children_dnodes->dnc_count * sizeof (dnode_handle_t)); -} - -/* - * Return true if the given index is interior to a dnode already - * allocated in the block. That is, the index is neither free nor - * allocated, but is consumed by a large dnode. - * - * The dnode_phys_t buffer may not be in sync with the in-core dnode - * structure, so we try to check the dnode structure first and fall back - * to the dnode_phys_t buffer it doesn't exist. When an in-code dnode - * exists we can always trust dn->dn_num_slots to be accurate, even for - * a held dnode which has not yet been fully allocated. - */ -static boolean_t -dnode_is_consumed(dnode_children_t *children, dnode_phys_t *dn_block, int idx) -{ - int skip, i; - - for (i = 0; i < idx; i += skip) { - dnode_handle_t *dnh = &children->dnc_children[i]; - - if (dnh->dnh_dnode != NULL) { - skip = dnh->dnh_dnode->dn_num_slots; - } else { - if (dn_block[i].dn_type != DMU_OT_NONE) - skip = dn_block[i].dn_extra_slots + 1; - else - skip = 1; - } - } - - return (i > idx); -} - -/* - * Return true if the given index in the dnode block is a valid - * allocated dnode. That is, the index is not consumed by a large - * dnode and is not free. - * - * The dnode_phys_t buffer may not be in sync with the in-core dnode - * structure, so we try to check the dnode structure first and fall back - * to the dnode_phys_t buffer it doesn't exist. 
- */ -static boolean_t -dnode_is_allocated(dnode_children_t *children, dnode_phys_t *dn_block, int idx) -{ - dnode_handle_t *dnh; - dmu_object_type_t ot; - - if (dnode_is_consumed(children, dn_block, idx)) - return (B_FALSE); - - dnh = &children->dnc_children[idx]; - if (dnh->dnh_dnode != NULL) - ot = dnh->dnh_dnode->dn_type; - else - ot = dn_block[idx].dn_type; - - return (ot != DMU_OT_NONE); -} - -/* - * Return true if the given range of indices in the dnode block are - * free. That is, the starting index is not consumed by a large dnode - * and none of the indices are allocated. - * - * The dnode_phys_t buffer may not be in sync with the in-core dnode - * structure, so we try to check the dnode structure first and fall back - * to the dnode_phys_t buffer it doesn't exist. - */ -static boolean_t -dnode_is_free(dnode_children_t *children, dnode_phys_t *dn_block, int idx, - int slots) -{ - if (idx + slots > DNODES_PER_BLOCK) - return (B_FALSE); - - if (dnode_is_consumed(children, dn_block, idx)) - return (B_FALSE); - - for (int i = idx; i < idx + slots; i++) { - dnode_handle_t *dnh = &children->dnc_children[i]; - dmu_object_type_t ot; - - if (dnh->dnh_dnode != NULL) { - if (dnh->dnh_dnode->dn_num_slots > 1) - return (B_FALSE); - - ot = dnh->dnh_dnode->dn_type; - } else { - ot = dn_block[i].dn_type; - } - - if (ot != DMU_OT_NONE) - return (B_FALSE); - } - - return (B_TRUE); -} - -static void -dnode_hold_slots(dnode_children_t *children, int idx, int slots) -{ - for (int i = idx; i < MIN(idx + slots, DNODES_PER_BLOCK); i++) { - dnode_handle_t *dnh = &children->dnc_children[i]; - zrl_add(&dnh->dnh_zrlock); - } -} - -static void -dnode_rele_slots(dnode_children_t *children, int idx, int slots) -{ - for (int i = idx; i < MIN(idx + slots, DNODES_PER_BLOCK); i++) { - dnode_handle_t *dnh = &children->dnc_children[i]; - zrl_remove(&dnh->dnh_zrlock); + dnh->dnh_dnode = DN_SLOT_UNINIT; } + kmem_free(dnc, sizeof (dnode_children_t) + + dnc->dnc_count * sizeof (dnode_handle_t)); 
} /* * errors: - * EINVAL - invalid object number. - * ENOSPC - hole too small to fulfill "slots" request - * ENOENT - the requested dnode is not allocated - * EIO - i/o error. + * EINVAL - Invalid object number or flags. + * ENOSPC - Hole too small to fulfill "slots" request (DNODE_MUST_BE_FREE) + * EEXIST - Refers to an allocated dnode (DNODE_MUST_BE_FREE) + * - Refers to an interior dnode slot (DNODE_MUST_BE_ALLOCATED) + * ENOENT - The requested dnode is not allocated (DNODE_MUST_BE_ALLOCATED) + * EIO - I/O error when reading the meta dnode dbuf. + * * succeeds even for free dnodes. */ int dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, void *tag, dnode_t **dnp) { - int epb, idx, err, i; + int epb, idx, err; int drop_struct_lock = FALSE; int type; uint64_t blk; dnode_t *mdn, *dn; dmu_buf_impl_t *db; - dnode_children_t *children_dnodes; - dnode_phys_t *dn_block_begin; + dnode_children_t *dnc; + dnode_phys_t *dn_block; dnode_handle_t *dnh; ASSERT(!(flag & DNODE_MUST_BE_ALLOCATED) || (slots == 0)); @@ -1244,10 +1224,13 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, db = dbuf_hold(mdn, blk, FTAG); if (drop_struct_lock) rw_exit(&mdn->dn_struct_rwlock); - if (db == NULL) + if (db == NULL) { + DNODE_STAT_BUMP(dnode_hold_dbuf_hold); return (SET_ERROR(EIO)); + } err = dbuf_read(db, NULL, DB_RF_CANFAIL); if (err) { + DNODE_STAT_BUMP(dnode_hold_dbuf_read); dbuf_rele(db, FTAG); return (err); } @@ -1255,72 +1238,179 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT); epb = db->db.db_size >> DNODE_SHIFT; + idx = object & (epb - 1); + dn_block = (dnode_phys_t *)db->db.db_data; + ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE); - children_dnodes = dmu_buf_get_user(&db->db); - if (children_dnodes == NULL) { + dnc = dmu_buf_get_user(&db->db); + dnh = NULL; + if (dnc == NULL) { dnode_children_t *winner; - children_dnodes = kmem_zalloc(sizeof (dnode_children_t) + + int skip = 
0; + + dnc = kmem_zalloc(sizeof (dnode_children_t) + epb * sizeof (dnode_handle_t), KM_SLEEP); - children_dnodes->dnc_count = epb; - dnh = &children_dnodes->dnc_children[0]; - for (i = 0; i < epb; i++) { + dnc->dnc_count = epb; + dnh = &dnc->dnc_children[0]; + + /* Initialize dnode slot status from dnode_phys_t */ + for (int i = 0; i < epb; i++) { zrl_init(&dnh[i].dnh_zrlock); + + if (skip) { + skip--; + continue; + } + + if (dn_block[i].dn_type != DMU_OT_NONE) { + int interior = dn_block[i].dn_extra_slots; + + dnode_set_slots(dnc, i, 1, DN_SLOT_ALLOCATED); + dnode_set_slots(dnc, i + 1, interior, + DN_SLOT_INTERIOR); + skip = interior; + } else { + dnh[i].dnh_dnode = DN_SLOT_FREE; + skip = 0; + } } - dmu_buf_init_user(&children_dnodes->dnc_dbu, NULL, + + dmu_buf_init_user(&dnc->dnc_dbu, NULL, dnode_buf_evict_async, NULL); - winner = dmu_buf_set_user(&db->db, &children_dnodes->dnc_dbu); + winner = dmu_buf_set_user(&db->db, &dnc->dnc_dbu); if (winner != NULL) { - for (i = 0; i < epb; i++) { + for (int i = 0; i < epb; i++) zrl_destroy(&dnh[i].dnh_zrlock); - } - kmem_free(children_dnodes, sizeof (dnode_children_t) + + kmem_free(dnc, sizeof (dnode_children_t) + epb * sizeof (dnode_handle_t)); - children_dnodes = winner; + dnc = winner; } } - ASSERT(children_dnodes->dnc_count == epb); - idx = object & (epb - 1); - dn_block_begin = (dnode_phys_t *)db->db.db_data; + ASSERT(dnc->dnc_count == epb); + dn = DN_SLOT_UNINIT; - dnode_hold_slots(children_dnodes, idx, slots); + if (flag & DNODE_MUST_BE_ALLOCATED) { + slots = 1; - if ((flag & DNODE_MUST_BE_FREE) && - !dnode_is_free(children_dnodes, dn_block_begin, idx, slots)) { - dnode_rele_slots(children_dnodes, idx, slots); - dbuf_rele(db, FTAG); - return (ENOSPC); - } else if ((flag & DNODE_MUST_BE_ALLOCATED) && - !dnode_is_allocated(children_dnodes, dn_block_begin, idx)) { - dnode_rele_slots(children_dnodes, idx, slots); + while (dn == DN_SLOT_UNINIT) { + dnode_slots_hold(dnc, idx, slots); + dnh = &dnc->dnc_children[idx]; + + 
if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) { + dn = dnh->dnh_dnode; + break; + } else if (dnh->dnh_dnode == DN_SLOT_INTERIOR) { + DNODE_STAT_BUMP(dnode_hold_alloc_interior); + dnode_slots_rele(dnc, idx, slots); + dbuf_rele(db, FTAG); + return (SET_ERROR(EEXIST)); + } else if (dnh->dnh_dnode != DN_SLOT_ALLOCATED) { + DNODE_STAT_BUMP(dnode_hold_alloc_misses); + dnode_slots_rele(dnc, idx, slots); + dbuf_rele(db, FTAG); + return (SET_ERROR(ENOENT)); + } + + dnode_slots_rele(dnc, idx, slots); + if (!dnode_slots_tryenter(dnc, idx, slots)) { + DNODE_STAT_BUMP(dnode_hold_alloc_lock_retry); + continue; + } + + /* + * Someone else won the race and called dnode_create() + * after we checked DN_SLOT_IS_PTR() above but before + * we acquired the lock. + */ + if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) { + DNODE_STAT_BUMP(dnode_hold_alloc_lock_misses); + dn = dnh->dnh_dnode; + } else { + dn = dnode_create(os, dn_block + idx, db, + object, dnh); + } + } + + mutex_enter(&dn->dn_mtx); + if (dn->dn_type == DMU_OT_NONE) { + DNODE_STAT_BUMP(dnode_hold_alloc_type_none); + mutex_exit(&dn->dn_mtx); + dnode_slots_rele(dnc, idx, slots); + dbuf_rele(db, FTAG); + return (SET_ERROR(ENOENT)); + } + + DNODE_STAT_BUMP(dnode_hold_alloc_hits); + } else if (flag & DNODE_MUST_BE_FREE) { + + if (idx + slots - 1 >= DNODES_PER_BLOCK) { + DNODE_STAT_BUMP(dnode_hold_free_overflow); + dbuf_rele(db, FTAG); + return (SET_ERROR(ENOSPC)); + } + + while (dn == DN_SLOT_UNINIT) { + dnode_slots_hold(dnc, idx, slots); + + if (!dnode_check_slots(dnc, idx, slots, DN_SLOT_FREE)) { + DNODE_STAT_BUMP(dnode_hold_free_misses); + dnode_slots_rele(dnc, idx, slots); + dbuf_rele(db, FTAG); + return (SET_ERROR(ENOSPC)); + } + + dnode_slots_rele(dnc, idx, slots); + if (!dnode_slots_tryenter(dnc, idx, slots)) { + DNODE_STAT_BUMP(dnode_hold_free_lock_retry); + continue; + } + + if (!dnode_check_slots(dnc, idx, slots, DN_SLOT_FREE)) { + DNODE_STAT_BUMP(dnode_hold_free_lock_misses); + dnode_slots_rele(dnc, idx, slots); + dbuf_rele(db, FTAG); + 
return (SET_ERROR(ENOSPC)); + } + + dnh = &dnc->dnc_children[idx]; + dn = dnode_create(os, dn_block + idx, db, object, dnh); + } + + mutex_enter(&dn->dn_mtx); + if (!refcount_is_zero(&dn->dn_holds)) { + DNODE_STAT_BUMP(dnode_hold_free_refcount); + mutex_exit(&dn->dn_mtx); + dnode_slots_rele(dnc, idx, slots); + dbuf_rele(db, FTAG); + return (SET_ERROR(EEXIST)); + } + + dnode_set_slots(dnc, idx + 1, slots - 1, DN_SLOT_INTERIOR); + DNODE_STAT_BUMP(dnode_hold_free_hits); + } else { dbuf_rele(db, FTAG); - return (ENOENT); + return (SET_ERROR(EINVAL)); } - dnh = &children_dnodes->dnc_children[idx]; - dn = dnh->dnh_dnode; - if (dn == NULL) - dn = dnode_create(os, dn_block_begin + idx, slots, db, - object, dnh); - - mutex_enter(&dn->dn_mtx); - type = dn->dn_type; - if (dn->dn_free_txg || - ((flag & DNODE_MUST_BE_FREE) && !refcount_is_zero(&dn->dn_holds))) { + if (dn->dn_free_txg) { + DNODE_STAT_BUMP(dnode_hold_free_txg); + type = dn->dn_type; mutex_exit(&dn->dn_mtx); - dnode_rele_slots(children_dnodes, idx, slots); + dnode_slots_rele(dnc, idx, slots); dbuf_rele(db, FTAG); return (type == DMU_OT_NONE ? ENOENT : EEXIST); } + if (refcount_add(&dn->dn_holds, tag) == 1) dbuf_add_ref(db, dnh); mutex_exit(&dn->dn_mtx); /* Now we can rely on the hold to prevent the dnode from moving. 
*/ - dnode_rele_slots(children_dnodes, idx, slots); + dnode_slots_rele(dnc, idx, slots); DNODE_VERIFY(dn); ASSERT3P(dn->dn_dbuf, ==, db); diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 2b99f22e5..472f85dd4 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -365,7 +365,7 @@ tests = ['async_destroy_001_pos'] [tests/functional/features/large_dnode] tests = ['large_dnode_001_pos', 'large_dnode_002_pos', 'large_dnode_003_pos', 'large_dnode_004_neg', 'large_dnode_005_pos', 'large_dnode_006_pos', - 'large_dnode_007_neg', 'large_dnode_008_pos'] + 'large_dnode_007_neg', 'large_dnode_008_pos', 'large_dnode_009_pos'] [tests/functional/grow_pool] tests = ['grow_pool_001_pos'] diff --git a/tests/zfs-tests/tests/functional/features/large_dnode/Makefile.am b/tests/zfs-tests/tests/functional/features/large_dnode/Makefile.am index 69ec5e18a..13ba3ab33 100644 --- a/tests/zfs-tests/tests/functional/features/large_dnode/Makefile.am +++ b/tests/zfs-tests/tests/functional/features/large_dnode/Makefile.am @@ -9,4 +9,5 @@ dist_pkgdata_SCRIPTS = \ large_dnode_005_pos.ksh \ large_dnode_006_pos.ksh \ large_dnode_007_neg.ksh \ - large_dnode_008_pos.ksh + large_dnode_008_pos.ksh \ + large_dnode_009_pos.ksh diff --git a/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_008_pos.ksh b/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_008_pos.ksh index 1f900b5ef..eac292cbe 100755 --- a/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_008_pos.ksh +++ b/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_008_pos.ksh @@ -42,6 +42,21 @@ function cleanup datasetexists $TEST_FS && log_must zfs destroy $TEST_FS } +function verify_dnode_packing +{ + zdb -dd $TEST_FS | grep -A 3 'Dnode slots' | awk ' + /Total used:/ {total_used=$NF} + /Max used:/ {max_used=$NF} + /Percent empty:/ {print total_used, max_used, int($NF)} + ' | while read total_used max_used pct_empty + do + log_note "total_used 
$total_used max_used $max_used pct_empty $pct_empty" + if [ $pct_empty -gt 5 ]; then + log_fail "Holes in dnode array: pct empty $pct_empty > 5" + fi + done +} + log_onexit cleanup log_assert "xattrtest runs concurrently on dataset with large dnodes" @@ -52,9 +67,11 @@ log_must zfs set xattr=sa $TEST_FS for ((i=0; i < 100; i++)); do dir="/$TEST_FS/dir.$i" log_must mkdir "$dir" - log_must eval "xattrtest -R -r -y -x 1 -f 1024 -k -p $dir &" + log_must eval "xattrtest -R -r -y -x 1 -f 1024 -k -p $dir >/dev/null 2>&1 &" done log_must wait +verify_dnode_packing + log_pass diff --git a/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_009_pos.ksh b/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_009_pos.ksh new file mode 100755 index 000000000..fa746c52e --- /dev/null +++ b/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_009_pos.ksh @@ -0,0 +1,71 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2017 by Lawrence Livermore National Security, LLC. +# Use is subject to license terms. +# + +. 
$STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Run many xattrtests on a dataset with large dnodes and xattr=sa to +# stress concurrent allocation of large dnodes. +# + +TEST_FS=$TESTPOOL/large_dnode + +verify_runnable "both" + +function cleanup +{ + datasetexists $TEST_FS && log_must zfs destroy $TEST_FS +} + +log_onexit cleanup +log_assert "xattrtest runs concurrently on dataset with large dnodes" + +log_must zfs create $TEST_FS +log_must zfs set dnsize=auto $TEST_FS +log_must zfs set xattr=sa $TEST_FS + +for ((i=0; i < 100; i++)); do + dir="/$TEST_FS/dir.$i" + log_must mkdir "$dir" + + do_unlink="" + if [ $((RANDOM % 2)) -eq 0 ]; then + do_unlink="-k -f 1024" + else + do_unlink="-f $((RANDOM % 1024))" + fi + log_must eval "xattrtest -R -r -y -x 1 $do_unlink -p $dir >/dev/null 2>&1 &" +done + +log_must wait + +log_must zpool export $TESTPOOL +log_must zpool import $TESTPOOL +log_must ls -lR "/$TEST_FS/" >/dev/null 2>&1 +log_must zdb -d $TESTPOOL +log_pass |