author    | Jitendra Patidar <[email protected]> | 2021-04-16 02:19:27 +0530
committer | Brian Behlendorf <[email protected]>  | 2021-04-19 15:22:57 -0700
commit    | 4c925936e3e392771a7c2820d847079e723bcf70 (patch)
tree      | db0900bc3f687c3a59e5ad10e67473a874664e89 /module
parent    | 15d3470c2e4d66b926cab047b403948cb95d91fa (diff)
ZFS traverse_visitbp optimization to limit prefetch
The traversal code, traverse_visitbp(), visits blocks recursively. An
indirect (non-L0) block of 128K can contain 1024 block pointers of 128
bytes each. In a full traverse, or an incremental traverse where all
blocks were modified, a large number of blocks pointed to by each
indirect block may be traversed. Since the traversal code prefetches the
blocks it will visit below an indirect block, this can queue a very
large number of async reads on the vdev queue. So, account for the
prefetches issued for blocks pointed to by an indirect block, and limit
how many prefetches are issued in one go.
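For a rough sense of scale, the following is an illustrative userspace
sketch, not ZFS code; it only restates the 128K / 128-byte arithmetic
quoted above:

#include <stdio.h>

int
main(void)
{
        const long indirect_bytes = 128 * 1024; /* 128K indirect block */
        const long bp_bytes = 128;              /* one block pointer */
        const long epb = indirect_bytes / bp_bytes; /* entries per block */

        /* Prefetches queued per indirect block visited, if unthrottled. */
        printf("pointers per indirect: %ld\n", epb);        /* 1024 */

        /* L0 blocks reachable below one fully-modified L2 indirect. */
        printf("L0 blocks under one L2: %ld\n", epb * epb); /* 1048576 */
        return (0);
}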
Module param:
zfs_traverse_indirect_prefetch_limit: Limit on the number of prefetches
issued while traversing an indirect block.
Local counters:
prefetched: Number of prefetches issued so far.
pidx: Index of the next block pointer to be prefetched.
ptidx: Index at which the next prefetch batch is triggered.
Keep "ptidx" somewhere in the middle of the blocks already prefetched,
so that the prefetch reads get enough of a time window before their
demand reads are issued.
Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Alexander Motin <[email protected]>
Signed-off-by: Jitendra Patidar <[email protected]>
Closes #11802
Closes #11803
Diffstat (limited to 'module')
-rw-r--r-- | module/zfs/dmu_traverse.c | 67
1 file changed, 53 insertions, 14 deletions
diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c
index 31db49dae..862c0bf40 100644
--- a/module/zfs/dmu_traverse.c
+++ b/module/zfs/dmu_traverse.c
@@ -41,6 +41,7 @@
 int32_t zfs_pd_bytes_max = 50 * 1024 * 1024;    /* 50MB */
 int32_t send_holes_without_birth_time = 1;
+int32_t zfs_traverse_indirect_prefetch_limit = 32;
 
 typedef struct prefetch_data {
         kmutex_t pd_mtx;
@@ -176,7 +177,10 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
         return (RESUME_SKIP_NONE);
 }
 
-static void
+/*
+ * Returns B_TRUE, if prefetch read is issued, otherwise B_FALSE.
+ */
+static boolean_t
 traverse_prefetch_metadata(traverse_data_t *td,
     const blkptr_t *bp, const zbookmark_phys_t *zb)
 {
@@ -184,18 +188,18 @@ traverse_prefetch_metadata(traverse_data_t *td,
         int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
 
         if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA))
-                return;
+                return (B_FALSE);
         /*
          * If we are in the process of resuming, don't prefetch, because
          * some children will not be needed (and in fact may have already
          * been freed).
          */
         if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume))
-                return;
+                return (B_FALSE);
         if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg)
-                return;
+                return (B_FALSE);
         if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)
-                return;
+                return (B_FALSE);
         ASSERT(!BP_IS_REDACTED(bp));
 
         if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
@@ -203,6 +207,7 @@ traverse_prefetch_metadata(traverse_data_t *td,
 
         (void) arc_read(NULL, td->td_spa, bp, NULL, NULL,
             ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
+        return (B_TRUE);
 }
 
 static boolean_t
@@ -295,7 +300,8 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
 
         if (BP_GET_LEVEL(bp) > 0) {
                 uint32_t flags = ARC_FLAG_WAIT;
-                int32_t i;
+                int32_t i, ptidx, pidx;
+                uint32_t prefetchlimit;
                 int32_t epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
                 zbookmark_phys_t *czb;
 
@@ -308,16 +314,46 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
 
                 czb = kmem_alloc(sizeof (zbookmark_phys_t), KM_SLEEP);
 
+                /*
+                 * When performing a traversal it is beneficial to
+                 * asynchronously read-ahead the upcoming indirect
+                 * blocks since they will be needed shortly. However,
+                 * since a 128k indirect (non-L0) block may contain up
+                 * to 1024 128-byte block pointers, its preferable to not
+                 * prefetch them all at once. Issuing a large number of
+                 * async reads may effect performance, and the earlier
+                 * the indirect blocks are prefetched the less likely
+                 * they are to still be resident in the ARC when needed.
+                 * Therefore, prefetching indirect blocks is limited to
+                 * zfs_traverse_indirect_prefetch_limit=32 blocks by
+                 * default.
+                 *
+                 * pidx: Index for which next prefetch to be issued.
+                 * ptidx: Index at which next prefetch to be triggered.
+                 */
+                ptidx = 0;
+                pidx = 1;
+                prefetchlimit = zfs_traverse_indirect_prefetch_limit;
                 for (i = 0; i < epb; i++) {
-                        SET_BOOKMARK(czb, zb->zb_objset, zb->zb_object,
-                            zb->zb_level - 1,
-                            zb->zb_blkid * epb + i);
-                        traverse_prefetch_metadata(td,
-                            &((blkptr_t *)buf->b_data)[i], czb);
-                }
+                        if (prefetchlimit && i == ptidx) {
+                                ASSERT3S(ptidx, <=, pidx);
+                                for (uint32_t prefetched = 0; pidx < epb &&
+                                    prefetched < prefetchlimit; pidx++) {
+                                        SET_BOOKMARK(czb, zb->zb_objset,
+                                            zb->zb_object, zb->zb_level - 1,
+                                            zb->zb_blkid * epb + pidx);
+                                        if (traverse_prefetch_metadata(td,
+                                            &((blkptr_t *)buf->b_data)[pidx],
+                                            czb) == B_TRUE) {
+                                                prefetched++;
+                                                if (prefetched ==
+                                                    MAX(prefetchlimit / 2, 1))
+                                                        ptidx = pidx;
+                                        }
+                                }
+                        }
 
-                /* recursively visitbp() blocks below this */
-                for (i = 0; i < epb; i++) {
+                        /* recursively visitbp() blocks below this */
                         SET_BOOKMARK(czb, zb->zb_objset, zb->zb_object,
                             zb->zb_level - 1,
                             zb->zb_blkid * epb + i);
@@ -777,6 +813,9 @@ EXPORT_SYMBOL(traverse_pool);
 ZFS_MODULE_PARAM(zfs, zfs_, pd_bytes_max, INT, ZMOD_RW,
         "Max number of bytes to prefetch");
 
+ZFS_MODULE_PARAM(zfs, zfs_, traverse_indirect_prefetch_limit, INT, ZMOD_RW,
+        "Traverse prefetch number of blocks pointed by indirect block");
+
 #if defined(_KERNEL)
 module_param_named(ignore_hole_birth, send_holes_without_birth_time, int, 0644);
 MODULE_PARM_DESC(ignore_hole_birth,