From ef56b0780c80ebb0b1e637b8b8c79530a8ab3201 Mon Sep 17 00:00:00 2001 From: Andriy Gapon Date: Fri, 12 Jun 2015 21:20:29 +0200 Subject: Account for ashift when gathering buffers to be written to l2arc device If we don't account for that, then we might end up overwriting disk area of buffers that have not been evicted yet, because l2arc_evict operates in terms of disk addresses. The discrepancy between the write size calculation and the actual increment to l2ad_hand was introduced in commit 3a17a7a9. The change that introduced l2ad_hand alignment was almost correct as the write size was accumulated as a sum of rounded buffer sizes. See commit illumos/illumos-gate@e14bb32. Also, we now consistently use asize / a_sz for the allocated size and psize / p_sz for the physical size. The latter accounts for a possible size reduction because of the compression, whereas the former accounts for a possible subsequent size expansion because of the alignment requirements. The code still assumes that either underlying storage subsystems or hardware is able to do read-modify-write when an L2ARC buffer size is not a multiple of a disk's block size. This is true for 4KB sector disks that provide 512B sector emulation, but may not be true in general. In other words, we currently do not have any code to make sure that an L2ARC buffer, whether compressed or not, which is used for physical I/O has a suitable size. Note that currently the cache device utilization is calculated based on the physical size, not the allocated size. The same applies to l2_asize kstat. That is wrong, but this commit does not fix that. The accounting problem was introduced partially in commit 3a17a7a9 and partially in 3038a2b (accounting became consistent but in favour of the wrong size). Porting Notes: Reworked to be C90 compatible and the 'write_psize' variable was removed because it is now unused. References: https://reviews.csiden.org/r/229/ https://reviews.freebsd.org/D2764 Ported-by: kernelOfTruth Signed-off-by: Brian Behlendorf Closes #3400 Closes #3433 Closes #3451 --- module/zfs/arc.c | 47 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 13 deletions(-) (limited to 'module/zfs') diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 5d5bcbe2b..16d970672 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -5789,8 +5789,8 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, boolean_t *headroom_boost) { arc_buf_hdr_t *hdr, *hdr_prev, *head; - uint64_t write_asize, write_psize, write_sz, headroom, - buf_compress_minsz; + uint64_t write_asize, write_sz, headroom, buf_compress_minsz, + stats_size; void *buf_data; boolean_t full; l2arc_write_callback_t *cb; @@ -5805,7 +5805,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, *headroom_boost = B_FALSE; pio = NULL; - write_sz = write_asize = write_psize = 0; + write_sz = write_asize = 0; full = B_FALSE; head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); head->b_flags |= ARC_FLAG_L2_WRITE_HEAD; @@ -5842,6 +5842,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, for (; hdr; hdr = hdr_prev) { kmutex_t *hash_lock; uint64_t buf_sz; + uint64_t buf_a_sz; if (arc_warm == B_FALSE) hdr_prev = multilist_sublist_next(mls, hdr); @@ -5870,7 +5871,15 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, continue; } - if ((write_sz + hdr->b_size) > target_sz) { + /* + * Assume that the buffer is not going to be compressed + * and could take more space on disk because of a larger + * disk block size. + */ + buf_sz = hdr->b_size; + buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); + + if ((write_asize + buf_a_sz) > target_sz) { full = B_TRUE; mutex_exit(hash_lock); break; @@ -5935,8 +5944,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, * using it to denote the header's state change. */ hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET; - - buf_sz = hdr->b_size; hdr->b_flags |= ARC_FLAG_HAS_L2HDR; mutex_enter(&dev->l2ad_mtx); @@ -5953,6 +5960,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, mutex_exit(hash_lock); write_sz += buf_sz; + write_asize += buf_a_sz; } multilist_sublist_unlock(mls); @@ -5971,6 +5979,19 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, mutex_enter(&dev->l2ad_mtx); + /* + * Note that elsewhere in this file arcstat_l2_asize + * and the used space on l2ad_vdev are updated using b_asize, + * which is not necessarily rounded up to the device block size. + * Too keep accounting consistent we do the same here as well: + * stats_size accumulates the sum of b_asize of the written buffers, + * while write_asize accumulates the sum of b_asize rounded up + * to the device block size. + * The latter sum is used only to validate the corectness of the code. + */ + stats_size = 0; + write_asize = 0; + /* * Now start writing the buffers. We're starting at the write head * and work backwards, retracing the course of the buffer selector @@ -6024,7 +6045,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, /* Compression may have squashed the buffer to zero length. */ if (buf_sz != 0) { - uint64_t buf_p_sz; + uint64_t buf_a_sz; wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, @@ -6035,14 +6056,14 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, zio_t *, wzio); (void) zio_nowait(wzio); - write_asize += buf_sz; + stats_size += buf_sz; /* * Keep the clock hand suitably device-aligned. */ - buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); - write_psize += buf_p_sz; - dev->l2ad_hand += buf_p_sz; + buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); + write_asize += buf_a_sz; + dev->l2ad_hand += buf_a_sz; } } @@ -6052,8 +6073,8 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, ARCSTAT_BUMP(arcstat_l2_writes_sent); ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize); ARCSTAT_INCR(arcstat_l2_size, write_sz); - ARCSTAT_INCR(arcstat_l2_asize, write_asize); - vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0); + ARCSTAT_INCR(arcstat_l2_asize, stats_size); + vdev_space_update(dev->l2ad_vdev, stats_size, 0, 0); /* * Bump device hand to the device start if it is approaching the end. -- cgit v1.2.3