diff options
author | Alexander Motin <[email protected]> | 2023-08-07 16:54:41 -0400 |
---|---|---|
committer | GitHub <[email protected]> | 2023-08-07 13:54:41 -0700 |
commit | 6c94e64963139e5e598a9880fe73cd326fa5453b (patch) | |
tree | d9583b4496591c9852826f50a1c5c66eb21951dd /module | |
parent | a97b8fc2dd9ed85640b81b0514ff7424e26fc32e (diff) |
Refactor dmu_prefetch().
- Split dmu_prefetch_dnode() from dmu_prefetch() into a separate
function. It is quite inconvenient to read the code where len = 0
means dnode prefetch instead indirect/data prefetch. One function
doing both has no benefits, since the code paths are independent.
- Improve dmu_prefetch() handling of long block ranges. Instead
of limiting L0 data length to prefetch for to dmu_prefetch_max,
make dmu_prefetch_max limit the actual amount of prefetch at the
specified level, and, if there is more, prefetch all the rest at
higher indirection level. It should improve random access times
within the prefetched range of any length, reducing importance of
specific dmu_prefetch_max value.
Reviewed-by: Brian Behlendorf <[email protected]>
Signed-off-by: Alexander Motin <[email protected]>
Sponsored by: iXsystems, Inc.
Closes #15076
Diffstat (limited to 'module')
-rw-r--r-- | module/os/freebsd/zfs/zfs_vnops_os.c | 4 | ||||
-rw-r--r-- | module/os/linux/zfs/zfs_vnops_os.c | 7 | ||||
-rw-r--r-- | module/zfs/dmu.c | 103 | ||||
-rw-r--r-- | module/zfs/dsl_deadlist.c | 8 | ||||
-rw-r--r-- | module/zfs/spa_log_spacemap.c | 4 | ||||
-rw-r--r-- | module/zfs/zvol.c | 2 |
6 files changed, 71 insertions, 57 deletions
diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index 45cf6fdfc..e80e29c2d 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -1869,10 +1869,8 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp, ASSERT3S(outcount, <=, bufsize); - /* Prefetch znode */ if (prefetch) - dmu_prefetch(os, objnum, 0, 0, 0, - ZIO_PRIORITY_SYNC_READ); + dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ); /* * Move to the next entry, fill in the previous offset. diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 33baac9db..b7d44f344 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -1610,11 +1610,8 @@ zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr) if (done) break; - /* Prefetch znode */ - if (prefetch) { - dmu_prefetch(os, objnum, 0, 0, 0, - ZIO_PRIORITY_SYNC_READ); - } + if (prefetch) + dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ); /* * Move to the next entry, fill in the previous offset. diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 078811dbf..c2565b50b 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -694,75 +694,94 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, const void *tag) } /* - * Issue prefetch i/os for the given blocks. If level is greater than 0, the + * Issue prefetch I/Os for the given blocks. If level is greater than 0, the * indirect blocks prefetched will be those that point to the blocks containing - * the data starting at offset, and continuing to offset + len. + * the data starting at offset, and continuing to offset + len. If the range + * it too long, prefetch the first dmu_prefetch_max bytes as requested, while + * for the rest only a higher level, also fitting within dmu_prefetch_max. It + * should primarily help random reads, since for long sequential reads there is + * a speculative prefetcher. * * Note that if the indirect blocks above the blocks being prefetched are not - * in cache, they will be asynchronously read in. + * in cache, they will be asynchronously read in. Dnode read by dnode_hold() + * is currently synchronous. */ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, zio_priority_t pri) { dnode_t *dn; - uint64_t blkid; - int nblks, err; - - if (len == 0) { /* they're interested in the bonus buffer */ - dn = DMU_META_DNODE(os); + int64_t level2 = level; + uint64_t start, end, start2, end2; - if (object == 0 || object >= DN_MAX_OBJECT) - return; - - rw_enter(&dn->dn_struct_rwlock, RW_READER); - blkid = dbuf_whichblock(dn, level, - object * sizeof (dnode_phys_t)); - dbuf_prefetch(dn, level, blkid, pri, 0); - rw_exit(&dn->dn_struct_rwlock); + if (dmu_prefetch_max == 0 || len == 0) { + dmu_prefetch_dnode(os, object, pri); return; } - /* - * See comment before the definition of dmu_prefetch_max. - */ - len = MIN(len, dmu_prefetch_max); - - /* - * XXX - Note, if the dnode for the requested object is not - * already cached, we will do a *synchronous* read in the - * dnode_hold() call. The same is true for any indirects. - */ - err = dnode_hold(os, object, FTAG, &dn); - if (err != 0) + if (dnode_hold(os, object, FTAG, &dn) != 0) return; /* - * offset + len - 1 is the last byte we want to prefetch for, and offset - * is the first. Then dbuf_whichblk(dn, level, off + len - 1) is the - * last block we want to prefetch, and dbuf_whichblock(dn, level, - * offset) is the first. Then the number we need to prefetch is the - * last - first + 1. + * Depending on len we may do two prefetches: blocks [start, end) at + * level, and following blocks [start2, end2) at higher level2. */ rw_enter(&dn->dn_struct_rwlock, RW_READER); - if (level > 0 || dn->dn_datablkshift != 0) { - nblks = dbuf_whichblock(dn, level, offset + len - 1) - - dbuf_whichblock(dn, level, offset) + 1; + if (dn->dn_datablkshift != 0) { + /* + * The object has multiple blocks. Calculate the full range + * of blocks [start, end2) and then split it into two parts, + * so that the first [start, end) fits into dmu_prefetch_max. + */ + start = dbuf_whichblock(dn, level, offset); + end2 = dbuf_whichblock(dn, level, offset + len - 1) + 1; + uint8_t ibs = dn->dn_indblkshift; + uint8_t bs = (level == 0) ? dn->dn_datablkshift : ibs; + uint_t limit = P2ROUNDUP(dmu_prefetch_max, 1 << bs) >> bs; + start2 = end = MIN(end2, start + limit); + + /* + * Find level2 where [start2, end2) fits into dmu_prefetch_max. + */ + uint8_t ibps = ibs - SPA_BLKPTRSHIFT; + limit = P2ROUNDUP(dmu_prefetch_max, 1 << ibs) >> ibs; + do { + level2++; + start2 = P2ROUNDUP(start2, 1 << ibps) >> ibps; + end2 = P2ROUNDUP(end2, 1 << ibps) >> ibps; + } while (end2 - start2 > limit); } else { - nblks = (offset < dn->dn_datablksz); + /* There is only one block. Prefetch it or nothing. */ + start = start2 = end2 = 0; + end = start + (level == 0 && offset < dn->dn_datablksz); } - if (nblks != 0) { - blkid = dbuf_whichblock(dn, level, offset); - for (int i = 0; i < nblks; i++) - dbuf_prefetch(dn, level, blkid + i, pri, 0); - } + for (uint64_t i = start; i < end; i++) + dbuf_prefetch(dn, level, i, pri, 0); + for (uint64_t i = start2; i < end2; i++) + dbuf_prefetch(dn, level2, i, pri, 0); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); } /* + * Issue prefetch I/Os for the given object's dnode. + */ +void +dmu_prefetch_dnode(objset_t *os, uint64_t object, zio_priority_t pri) +{ + if (object == 0 || object >= DN_MAX_OBJECT) + return; + + dnode_t *dn = DMU_META_DNODE(os); + rw_enter(&dn->dn_struct_rwlock, RW_READER); + uint64_t blkid = dbuf_whichblock(dn, 0, object * sizeof (dnode_phys_t)); + dbuf_prefetch(dn, 0, blkid, pri, 0); + rw_exit(&dn->dn_struct_rwlock); +} + +/* * Get the next "chunk" of file data to free. We traverse the file from * the end so that the file gets shorter over time (if we crashes in the * middle, this will leave us in a better state). We find allocated file diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c index 47c234f76..2832294b6 100644 --- a/module/zfs/dsl_deadlist.c +++ b/module/zfs/dsl_deadlist.c @@ -173,8 +173,8 @@ dsl_deadlist_load_tree(dsl_deadlist_t *dl) * in parallel. Then open them all in a second pass. */ dle->dle_bpobj.bpo_object = za.za_first_integer; - dmu_prefetch(dl->dl_os, dle->dle_bpobj.bpo_object, - 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + dmu_prefetch_dnode(dl->dl_os, dle->dle_bpobj.bpo_object, + ZIO_PRIORITY_SYNC_READ); avl_add(&dl->dl_tree, dle); } @@ -235,8 +235,8 @@ dsl_deadlist_load_cache(dsl_deadlist_t *dl) * in parallel. Then open them all in a second pass. */ dlce->dlce_bpobj = za.za_first_integer; - dmu_prefetch(dl->dl_os, dlce->dlce_bpobj, - 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + dmu_prefetch_dnode(dl->dl_os, dlce->dlce_bpobj, + ZIO_PRIORITY_SYNC_READ); avl_add(&dl->dl_cache, dlce); } VERIFY3U(error, ==, ENOENT); diff --git a/module/zfs/spa_log_spacemap.c b/module/zfs/spa_log_spacemap.c index 2878e68c6..cf05158b6 100644 --- a/module/zfs/spa_log_spacemap.c +++ b/module/zfs/spa_log_spacemap.c @@ -1147,8 +1147,8 @@ spa_ld_log_sm_data(spa_t *spa) /* Prefetch log spacemaps dnodes. */ for (sls = avl_first(&spa->spa_sm_logs_by_txg); sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { - dmu_prefetch(spa_meta_objset(spa), sls->sls_sm_obj, - 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + dmu_prefetch_dnode(spa_meta_objset(spa), sls->sls_sm_obj, + ZIO_PRIORITY_SYNC_READ); } uint_t pn = 0; diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index cd4e6f0c7..547687f07 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -981,7 +981,7 @@ zvol_prefetch_minors_impl(void *arg) job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); if (job->error == 0) { - dmu_prefetch(os, ZVOL_OBJ, 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + dmu_prefetch_dnode(os, ZVOL_OBJ, ZIO_PRIORITY_SYNC_READ); dmu_objset_disown(os, B_TRUE, FTAG); } } |