diff options
author | Alexander Motin <[email protected]> | 2021-08-17 12:15:54 -0400 |
---|---|---|
committer | GitHub <[email protected]> | 2021-08-17 10:15:54 -0600 |
commit | 6b88b4b501a9e12dd6feee7e3cb2824bc70beeca (patch) | |
tree | 8dca8b6ea9f1d8ae628a454ec2ead51dcf422f46 /module/zfs/arc.c | |
parent | 72f0521abadd77ef06fb1d10c4f4f80af50ec279 (diff) |
Remove b_pabd/b_rabd allocation from arc_hdr_alloc()
When a header is allocated for full overwrite it is a waste of time
to allocate b_pabd/b_rabd for it, since arc_write() will free them
without ever being touched. If it is a read or a partial overwrite
then arc_read() and arc_hdr_decrypt() allocate them explicitly.
Reduced memory allocation in user threads also reduces ARC eviction
throttling there, proportionally increasing it in ZIO threads, that
is not good. To minimize or even avoid it introduce ARC allocation
reserve, allowing certain arc_get_data_abd() callers to allocate a
bit longer in situations where user threads will already throttle.
Reviewed-by: George Wilson <[email protected]>
Reviewed-by: Mark Maybee <[email protected]>
Signed-off-by: Alexander Motin <[email protected]>
Sponsored-By: iXsystems, Inc.
Closes #12398
Diffstat (limited to 'module/zfs/arc.c')
-rw-r--r-- | module/zfs/arc.c | 113 |
1 files changed, 65 insertions, 48 deletions
diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 41553fc57..227d0417c 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -834,12 +834,13 @@ static kcondvar_t l2arc_rebuild_thr_cv; enum arc_hdr_alloc_flags { ARC_HDR_ALLOC_RDATA = 0x1, ARC_HDR_DO_ADAPT = 0x2, + ARC_HDR_USE_RESERVE = 0x4, }; -static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *, boolean_t); +static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *, int); static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *); -static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *, boolean_t); +static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *, int); static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *); static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *); static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag); @@ -1854,7 +1855,8 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb) * and then loan a buffer from it, rather than allocating a * linear buffer and wrapping it in an abd later. */ - cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, B_TRUE); + cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, + ARC_HDR_DO_ADAPT); tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), @@ -3169,7 +3171,6 @@ arc_hdr_alloc_abd(arc_buf_hdr_t *hdr, int alloc_flags) { uint64_t size; boolean_t alloc_rdata = ((alloc_flags & ARC_HDR_ALLOC_RDATA) != 0); - boolean_t do_adapt = ((alloc_flags & ARC_HDR_DO_ADAPT) != 0); ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); ASSERT(HDR_HAS_L1HDR(hdr)); @@ -3180,14 +3181,14 @@ arc_hdr_alloc_abd(arc_buf_hdr_t *hdr, int alloc_flags) size = HDR_GET_PSIZE(hdr); ASSERT3P(hdr->b_crypt_hdr.b_rabd, ==, NULL); hdr->b_crypt_hdr.b_rabd = arc_get_data_abd(hdr, size, hdr, - do_adapt); + alloc_flags); ASSERT3P(hdr->b_crypt_hdr.b_rabd, !=, NULL); ARCSTAT_INCR(arcstat_raw_size, size); } else { size = arc_hdr_size(hdr); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, size, hdr, - do_adapt); + alloc_flags); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); } @@ -3233,13 +3234,34 @@ arc_hdr_free_abd(arc_buf_hdr_t *hdr, boolean_t free_rdata) ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); } +/* + * Allocate empty anonymous ARC header. The header will get its identity + * assigned and buffers attached later as part of read or write operations. + * + * In case of read arc_read() assigns header its identify (b_dva + b_birth), + * inserts it into ARC hash to become globally visible and allocates physical + * (b_pabd) or raw (b_rabd) ABD buffer to read into from disk. On disk read + * completion arc_read_done() allocates ARC buffer(s) as needed, potentially + * sharing one of them with the physical ABD buffer. + * + * In case of write arc_alloc_buf() allocates ARC buffer to be filled with + * data. Then after compression and/or encryption arc_write_ready() allocates + * and fills (or potentially shares) physical (b_pabd) or raw (b_rabd) ABD + * buffer. On disk write completion arc_write_done() assigns the header its + * new identity (b_dva + b_birth) and inserts into ARC hash. + * + * In case of partial overwrite the old data is read first as described. Then + * arc_release() either allocates new anonymous ARC header and moves the ARC + * buffer to it, or reuses the old ARC header by discarding its identity and + * removing it from ARC hash. After buffer modification normal write process + * follows as described. + */ static arc_buf_hdr_t * arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, boolean_t protected, enum zio_compress compression_type, uint8_t complevel, - arc_buf_contents_t type, boolean_t alloc_rdata) + arc_buf_contents_t type) { arc_buf_hdr_t *hdr; - int flags = ARC_HDR_DO_ADAPT; VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA); if (protected) { @@ -3247,7 +3269,6 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, } else { hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); } - flags |= alloc_rdata ? ARC_HDR_ALLOC_RDATA : 0; ASSERT(HDR_EMPTY(hdr)); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); @@ -3271,12 +3292,6 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, hdr->b_l1hdr.b_bufcnt = 0; hdr->b_l1hdr.b_buf = NULL; - /* - * Allocate the hdr's buffer. This will contain either - * the compressed or uncompressed data depending on the block - * it references and compressed arc enablement. - */ - arc_hdr_alloc_abd(hdr, flags); ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); return (hdr); @@ -3558,7 +3573,7 @@ arc_buf_t * arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size) { arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, - B_FALSE, ZIO_COMPRESS_OFF, 0, type, B_FALSE); + B_FALSE, ZIO_COMPRESS_OFF, 0, type); arc_buf_t *buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_FALSE, @@ -3582,7 +3597,7 @@ arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS); arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, - B_FALSE, compression_type, complevel, ARC_BUFC_DATA, B_FALSE); + B_FALSE, compression_type, complevel, ARC_BUFC_DATA); arc_buf_t *buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, @@ -3590,16 +3605,12 @@ arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, arc_buf_thaw(buf); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); - if (!arc_buf_is_shared(buf)) { - /* - * To ensure that the hdr has the correct data in it if we call - * arc_untransform() on this buf before it's been written to - * disk, it's easiest if we just set up sharing between the - * buf and the hdr. - */ - arc_hdr_free_abd(hdr, B_FALSE); - arc_share_buf(hdr, buf); - } + /* + * To ensure that the hdr has the correct data in it if we call + * arc_untransform() on this buf before it's been written to disk, + * it's easiest if we just set up sharing between the buf and the hdr. + */ + arc_share_buf(hdr, buf); return (buf); } @@ -3621,7 +3632,7 @@ arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder, ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS); hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_TRUE, - compression_type, complevel, type, B_TRUE); + compression_type, complevel, type); hdr->b_crypt_hdr.b_dsobj = dsobj; hdr->b_crypt_hdr.b_ot = ot; @@ -5119,7 +5130,7 @@ arc_adapt(int bytes, arc_state_t *state) * zfs_arc_overflow_shift. */ static arc_ovf_level_t -arc_is_overflowing(void) +arc_is_overflowing(boolean_t use_reserve) { /* Always allow at least one block of overflow */ int64_t overflow = MAX(SPA_MAXBLOCKSIZE, @@ -5136,17 +5147,19 @@ arc_is_overflowing(void) */ int64_t over = aggsum_lower_bound(&arc_sums.arcstat_size) - arc_c - overflow / 2; + if (!use_reserve) + overflow /= 2; return (over < 0 ? ARC_OVF_NONE : over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE); } static abd_t * arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag, - boolean_t do_adapt) + int alloc_flags) { arc_buf_contents_t type = arc_buf_type(hdr); - arc_get_data_impl(hdr, size, tag, do_adapt); + arc_get_data_impl(hdr, size, tag, alloc_flags); if (type == ARC_BUFC_METADATA) { return (abd_alloc(size, B_TRUE)); } else { @@ -5160,7 +5173,7 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) { arc_buf_contents_t type = arc_buf_type(hdr); - arc_get_data_impl(hdr, size, tag, B_TRUE); + arc_get_data_impl(hdr, size, tag, ARC_HDR_DO_ADAPT); if (type == ARC_BUFC_METADATA) { return (zio_buf_alloc(size)); } else { @@ -5177,9 +5190,9 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) * of ARC behavior and settings. See arc_lowmem_init(). */ void -arc_wait_for_eviction(uint64_t amount) +arc_wait_for_eviction(uint64_t amount, boolean_t use_reserve) { - switch (arc_is_overflowing()) { + switch (arc_is_overflowing(use_reserve)) { case ARC_OVF_NONE: return; case ARC_OVF_SOME: @@ -5256,12 +5269,12 @@ arc_wait_for_eviction(uint64_t amount) */ static void arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag, - boolean_t do_adapt) + int alloc_flags) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); - if (do_adapt) + if (alloc_flags & ARC_HDR_DO_ADAPT) arc_adapt(size, state); /* @@ -5277,7 +5290,8 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag, * ensure that that progress is also made towards getting arc_size * under arc_c. See the comment above zfs_arc_eviction_pct. */ - arc_wait_for_eviction(size * zfs_arc_eviction_pct / 100); + arc_wait_for_eviction(size * zfs_arc_eviction_pct / 100, + alloc_flags & ARC_HDR_USE_RESERVE); VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { @@ -6087,8 +6101,7 @@ top: arc_buf_hdr_t *exists = NULL; arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, - BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), 0, type, - encrypted_read); + BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), 0, type); if (!embedded_bp) { hdr->b_dva = *BP_IDENTITY(bp); @@ -6102,6 +6115,7 @@ top: arc_hdr_destroy(hdr); goto top; /* restart the IO request */ } + alloc_flags |= ARC_HDR_DO_ADAPT; } else { /* * This block is in the ghost cache or encrypted data @@ -6149,9 +6163,9 @@ top: */ arc_adapt(arc_hdr_size(hdr), hdr->b_l1hdr.b_state); arc_access(hdr, hash_lock); - arc_hdr_alloc_abd(hdr, alloc_flags); } + arc_hdr_alloc_abd(hdr, alloc_flags); if (encrypted_read) { ASSERT(HDR_HAS_RABD(hdr)); size = HDR_GET_PSIZE(hdr); @@ -6673,7 +6687,7 @@ arc_release(arc_buf_t *buf, void *tag) * buffer which will be freed in arc_write(). */ nhdr = arc_hdr_alloc(spa, psize, lsize, protected, - compress, hdr->b_complevel, type, HDR_HAS_RABD(hdr)); + compress, hdr->b_complevel, type); ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL); ASSERT0(nhdr->b_l1hdr.b_bufcnt); ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt)); @@ -6853,7 +6867,8 @@ arc_write_ready(zio_t *zio) if (ARC_BUF_ENCRYPTED(buf)) { ASSERT3U(psize, >, 0); ASSERT(ARC_BUF_COMPRESSED(buf)); - arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT|ARC_HDR_ALLOC_RDATA); + arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | ARC_HDR_ALLOC_RDATA | + ARC_HDR_USE_RESERVE); abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); } else if (!abd_size_alloc_linear(arc_buf_size(buf)) || !arc_can_share(hdr, buf)) { @@ -6864,17 +6879,19 @@ arc_write_ready(zio_t *zio) */ if (BP_IS_ENCRYPTED(bp)) { ASSERT3U(psize, >, 0); - arc_hdr_alloc_abd(hdr, - ARC_HDR_DO_ADAPT|ARC_HDR_ALLOC_RDATA); + arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | + ARC_HDR_ALLOC_RDATA | ARC_HDR_USE_RESERVE); abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); } else if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF && !ARC_BUF_COMPRESSED(buf)) { ASSERT3U(psize, >, 0); - arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT); + arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | + ARC_HDR_USE_RESERVE); abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize); } else { ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr)); - arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT); + arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | + ARC_HDR_USE_RESERVE); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, arc_buf_size(buf)); } @@ -8696,7 +8713,7 @@ l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb) */ if (BP_IS_ENCRYPTED(bp)) { abd_t *eabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, - B_TRUE); + ARC_HDR_DO_ADAPT | ARC_HDR_USE_RESERVE); zio_crypt_decode_params_bp(bp, salt, iv); zio_crypt_decode_mac_bp(bp, mac); @@ -8733,7 +8750,7 @@ l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb) if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, - B_TRUE); + ARC_HDR_DO_ADAPT | ARC_HDR_USE_RESERVE); void *tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), |