author | Dan Kimmel <[email protected]> | 2016-07-11 13:45:52 -0400
committer | Brian Behlendorf <[email protected]> | 2016-09-13 09:58:58 -0700
commit | 2aa34383b9362200e19b22ca4a22ea97d70d9ddf (patch)
tree | 81f822652c27bf7452fe30457d6c3d1aab9164a4 /module/zfs/arc.c
parent | d3c2ae1c0806b183a315e3d43cc8018cfdca79b5 (diff)
DLPX-40252 integrate EP-476 compressed zfs send/receive
Authored by: Dan Kimmel <[email protected]>
Reviewed by: Tom Caputi <[email protected]>
Reviewed by: Brian Behlendorf <[email protected]>
Ported by: David Quigley <[email protected]>
Issue #5078
Diffstat (limited to 'module/zfs/arc.c')
-rwxr-xr-x | module/zfs/arc.c | 762
1 file changed, 486 insertions, 276 deletions
diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 43f0bfa4a..ee95f0f8d 100755 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -77,10 +77,10 @@ * A new reference to a cache buffer can be obtained in two * ways: 1) via a hash table lookup using the DVA as a key, * or 2) via one of the ARC lists. The arc_read() interface - * uses method 1, while the internal arc algorithms for + * uses method 1, while the internal ARC algorithms for * adjusting the cache use method 2. We therefore provide two * types of locks: 1) the hash table lock array, and 2) the - * arc list locks. + * ARC list locks. * * Buffers do not have their own mutexes, rather they rely on the * hash table mutexes for the bulk of their protection (i.e. most @@ -93,21 +93,12 @@ * buf_hash_remove() expects the appropriate hash mutex to be * already held before it is invoked. * - * Each arc state also has a mutex which is used to protect the + * Each ARC state also has a mutex which is used to protect the * buffer list associated with the state. When attempting to - * obtain a hash table lock while holding an arc list lock you + * obtain a hash table lock while holding an ARC list lock you * must use: mutex_tryenter() to avoid deadlock. Also note that * the active state mutex must be held before the ghost state mutex. * - * Arc buffers may have an associated eviction callback function. - * This function will be invoked prior to removing the buffer (e.g. - * in arc_do_user_evicts()). Note however that the data associated - * with the buffer may be evicted prior to the callback. The callback - * must be made with *no locks held* (to prevent deadlock). Additionally, - * the users of callbacks must ensure that their private data is - * protected from simultaneous callbacks from arc_clear_callback() - * and arc_do_user_evicts(). - * * It as also possible to register a callback which is run when the * arc_meta_limit is reached and no buffers can be safely evicted. In * this case the arc user should drop a reference on some arc buffers so @@ -144,67 +135,81 @@ * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within * the arc_buf_hdr_t that will point to the data block in memory. A block can * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC - * caches data in two ways -- in a list of arc buffers (arc_buf_t) and + * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and * also in the arc_buf_hdr_t's private physical data block pointer (b_pdata). - * Each arc buffer (arc_buf_t) is being actively accessed by a specific ARC - * consumer, and always contains uncompressed data. The ARC will provide - * references to this data and will keep it cached until it is no longer in - * use. Typically, the arc will try to cache only the L1ARC's physical data - * block and will aggressively evict any arc_buf_t that is no longer referenced. - * The amount of memory consumed by the arc_buf_t's can be seen via the + * + * The L1ARC's data pointer may or may not be uncompressed. The ARC has the + * ability to store the physical data (b_pdata) associated with the DVA of the + * arc_buf_hdr_t. Since the b_pdata is a copy of the on-disk physical block, + * it will match its on-disk compression characteristics. This behavior can be + * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the + * compressed ARC functionality is disabled, the b_pdata will point to an + * uncompressed version of the on-disk data. + * + * Data in the L1ARC is not accessed by consumers of the ARC directly. 
Each + * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it. + * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC + * consumer. The ARC will provide references to this data and will keep it + * cached until it is no longer in use. The ARC caches only the L1ARC's physical + * data block and will evict any arc_buf_t that is no longer referenced. The + * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the * "overhead_size" kstat. * + * Depending on the consumer, an arc_buf_t can be requested in uncompressed or + * compressed form. The typical case is that consumers will want uncompressed + * data, and when that happens a new data buffer is allocated where the data is + * decompressed for them to use. Currently the only consumer who wants + * compressed arc_buf_t's is "zfs send", when it streams data exactly as it + * exists on disk. When this happens, the arc_buf_t's data buffer is shared + * with the arc_buf_hdr_t. * - * arc_buf_hdr_t - * +-----------+ - * | | - * | | - * | | - * +-----------+ - * l2arc_buf_hdr_t| | - * | | - * +-----------+ - * l1arc_buf_hdr_t| | - * | | arc_buf_t - * | b_buf +------------>+---------+ arc_buf_t - * | | |b_next +---->+---------+ - * | b_pdata +-+ |---------| |b_next +-->NULL - * +-----------+ | | | +---------+ - * | |b_data +-+ | | - * | +---------+ | |b_data +-+ - * +->+------+ | +---------+ | - * (potentially) | | | | - * compressed | | | | - * data +------+ | v - * +->+------+ +------+ - * uncompressed | | | | - * data | | | | - * +------+ +------+ + * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The + * first one is owned by a compressed send consumer (and therefore references + * the same compressed data buffer as the arc_buf_hdr_t) and the second could be + * used by any other consumer (and has its own uncompressed copy of the data + * buffer). * - * The L1ARC's data pointer, however, may or may not be uncompressed. The - * ARC has the ability to store the physical data (b_pdata) associated with - * the DVA of the arc_buf_hdr_t. Since the b_pdata is a copy of the on-disk - * physical block, it will match its on-disk compression characteristics. - * If the block on-disk is compressed, then the physical data block - * in the cache will also be compressed and vice-versa. This behavior - * can be disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the - * compressed ARC functionality is disabled, the b_pdata will point to an - * uncompressed version of the on-disk data. + * arc_buf_hdr_t + * +-----------+ + * | fields | + * | common to | + * | L1- and | + * | L2ARC | + * +-----------+ + * | l2arc_buf_hdr_t + * | | + * +-----------+ + * | l1arc_buf_hdr_t + * | | arc_buf_t + * | b_buf +------------>+-----------+ arc_buf_t + * | b_pdata +-+ |b_next +---->+-----------+ + * +-----------+ | |-----------| |b_next +-->NULL + * | |b_comp = T | +-----------+ + * | |b_data +-+ |b_comp = F | + * | +-----------+ | |b_data +-+ + * +->+------+ | +-----------+ | + * compressed | | | | + * data | |<--------------+ | uncompressed + * +------+ compressed, | data + * shared +-->+------+ + * data | | + * | | + * +------+ * * When a consumer reads a block, the ARC must first look to see if the - * arc_buf_hdr_t is cached. If the hdr is cached and already has an arc_buf_t, - * then an additional arc_buf_t is allocated and the uncompressed data is - * bcopied from the existing arc_buf_t. 
If the hdr is cached but does not - * have an arc_buf_t, then the ARC allocates a new arc_buf_t and decompresses - * the b_pdata contents into the arc_buf_t's b_data. If the arc_buf_hdr_t's - * b_pdata is not compressed, then the block is shared with the newly - * allocated arc_buf_t. This block sharing only occurs with one arc_buf_t - * in the arc buffer chain. Sharing the block reduces the memory overhead - * required when the hdr is caching uncompressed blocks or the compressed - * arc functionality has been disabled via 'zfs_compressed_arc_enabled'. + * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new + * arc_buf_t and either copies uncompressed data into a new data buffer from an + * existing uncompressed arc_buf_t, decompresses the hdr's b_pdata buffer into a + * new data buffer, or shares the hdr's b_pdata buffer, depending on whether the + * hdr is compressed and the desired compression characteristics of the + * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the + * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be + * the last buffer in the hdr's b_buf list, however a shared compressed buf can + * be anywhere in the hdr's list. * * The diagram below shows an example of an uncompressed ARC hdr that is - * sharing its data with an arc_buf_t: + * sharing its data with an arc_buf_t (note that the shared uncompressed buf is + * the last element in the buf list): * * arc_buf_hdr_t * +-----------+ @@ -233,20 +238,24 @@ * | +------+ | * +---------------------------------+ * - * Writing to the arc requires that the ARC first discard the b_pdata + * Writing to the ARC requires that the ARC first discard the hdr's b_pdata * since the physical block is about to be rewritten. The new data contents - * will be contained in the arc_buf_t (uncompressed). As the I/O pipeline - * performs the write, it may compress the data before writing it to disk. - * The ARC will be called with the transformed data and will bcopy the - * transformed on-disk block into a newly allocated b_pdata. + * will be contained in the arc_buf_t. As the I/O pipeline performs the write, + * it may compress the data before writing it to disk. The ARC will be called + * with the transformed data and will bcopy the transformed on-disk block into + * a newly allocated b_pdata. Writes are always done into buffers which have + * either been loaned (and hence are new and don't have other readers) or + * buffers which have been released (and hence have their own hdr, if there + * were originally other readers of the buf's original hdr). This ensures that + * the ARC only needs to update a single buf and its hdr after a write occurs. * * When the L2ARC is in use, it will also take advantage of the b_pdata. The * L2ARC will always write the contents of b_pdata to the L2ARC. This means - * that when compressed arc is enabled that the L2ARC blocks are identical + * that when compressed ARC is enabled that the L2ARC blocks are identical * to the on-disk block in the main data pool. This provides a significant * advantage since the ARC can leverage the bp's checksum when reading from the * L2ARC to determine if the contents are valid. However, if the compressed - * arc is disabled, then the L2ARC's block must be transformed to look + * ARC is disabled, then the L2ARC's block must be transformed to look * like the physical block in the main data pool before comparing the * checksum and determining its validity. 
*/ @@ -853,6 +862,8 @@ static taskq_t *arc_prune_taskq; HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp)); #define ARC_BUF_LAST(buf) ((buf)->b_next == NULL) +#define ARC_BUF_SHARED(buf) ((buf)->b_prop_flags & ARC_BUF_FLAG_SHARED) +#define ARC_BUF_COMPRESSED(buf) ((buf)->b_prop_flags & ARC_BUF_FLAG_COMPRESSED) /* * Other sizes @@ -935,7 +946,7 @@ static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ static uint64_t l2arc_ndev; /* number of devices */ typedef struct l2arc_read_callback { - arc_buf_hdr_t *l2rcb_hdr; /* read buffer */ + arc_buf_hdr_t *l2rcb_hdr; /* read header */ blkptr_t l2rcb_bp; /* original blkptr */ zbookmark_phys_t l2rcb_zb; /* original bookmark */ int l2rcb_flags; /* original flags */ @@ -1289,12 +1300,39 @@ retry: #define ARC_MINTIME (hz>>4) /* 62 ms */ +/* + * This is the size that the buf occupies in memory. If the buf is compressed, + * it will correspond to the compressed size. You should use this method of + * getting the buf size unless you explicitly need the logical size. + */ +uint64_t +arc_buf_size(arc_buf_t *buf) +{ + return (ARC_BUF_COMPRESSED(buf) ? + HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr)); +} + +uint64_t +arc_buf_lsize(arc_buf_t *buf) +{ + return (HDR_GET_LSIZE(buf->b_hdr)); +} + +enum zio_compress +arc_get_compression(arc_buf_t *buf) +{ + return (ARC_BUF_COMPRESSED(buf) ? + HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF); +} + static inline boolean_t arc_buf_is_shared(arc_buf_t *buf) { boolean_t shared = (buf->b_data != NULL && buf->b_data == buf->b_hdr->b_l1hdr.b_pdata); IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); + IMPLY(shared, ARC_BUF_SHARED(buf)); + IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf)); return (shared); } @@ -1326,7 +1364,8 @@ arc_cksum_verify(arc_buf_t *buf) mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } - fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), &zc); + + fletcher_2_native(buf->b_data, arc_buf_size(buf), &zc); if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc)) panic("buffer modified while frozen!"); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); @@ -1411,14 +1450,22 @@ arc_cksum_compute(arc_buf_t *buf) return; ASSERT(HDR_HAS_L1HDR(hdr)); + mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum != NULL) { + ASSERT(!ARC_BUF_COMPRESSED(buf) || hdr->b_l1hdr.b_bufcnt > 1); + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); + return; + } else if (ARC_BUF_COMPRESSED(buf)) { + ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } + + ASSERT(!ARC_BUF_COMPRESSED(buf)); hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); - fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), + fletcher_2_native(buf->b_data, arc_buf_size(buf), hdr->b_l1hdr.b_freeze_cksum); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); arc_buf_watch(buf); @@ -1450,7 +1497,7 @@ arc_buf_watch(arc_buf_t *buf) { #ifndef _KERNEL if (arc_watch) - ASSERT0(mprotect(buf->b_data, HDR_GET_LSIZE(buf->b_hdr), + ASSERT0(mprotect(buf->b_data, arc_buf_size(buf), PROT_READ)); #endif } @@ -1468,6 +1515,12 @@ arc_buf_type(arc_buf_hdr_t *hdr) return (type); } +boolean_t +arc_is_metadata(arc_buf_t *buf) +{ + return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0); +} + static uint32_t arc_bufc_to_flags(arc_buf_contents_t type) { @@ -1489,14 +1542,23 @@ arc_buf_thaw(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; + ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + if (zfs_flags & ZFS_DEBUG_MODIFY) { - if (hdr->b_l1hdr.b_state != arc_anon) - panic("modifying 
non-anon buffer!"); - if (HDR_IO_IN_PROGRESS(hdr)) - panic("modifying buffer while i/o in progress!"); arc_cksum_verify(buf); } + /* + * Compressed buffers do not manipulate the b_freeze_cksum or + * allocate b_thawed. + */ + if (ARC_BUF_COMPRESSED(buf)) { + ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || + hdr->b_l1hdr.b_bufcnt > 1); + return; + } + ASSERT(HDR_HAS_L1HDR(hdr)); arc_cksum_free(hdr); arc_buf_unwatch(buf); @@ -1511,6 +1573,12 @@ arc_buf_freeze(arc_buf_t *buf) if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; + if (ARC_BUF_COMPRESSED(buf)) { + ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || + hdr->b_l1hdr.b_bufcnt > 1); + return; + } + hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); @@ -1519,7 +1587,6 @@ arc_buf_freeze(arc_buf_t *buf) hdr->b_l1hdr.b_state == arc_anon); arc_cksum_compute(buf); mutex_exit(hash_lock); - } /* @@ -1576,16 +1643,14 @@ arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp) } } -static int +int arc_decompress(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap; int error; - if (arc_buf_is_shared(buf)) { - ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); - } else if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) { + if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) { /* * The arc_buf_hdr_t is either not compressed or is * associated with an embedded block or a hole in which @@ -1593,11 +1658,31 @@ arc_decompress(arc_buf_t *buf) */ IMPLY(HDR_COMPRESSION_ENABLED(hdr), HDR_GET_PSIZE(hdr) == 0 || HDR_GET_PSIZE(hdr) == HDR_GET_LSIZE(hdr)); - ASSERT(!HDR_SHARED_DATA(hdr)); - bcopy(hdr->b_l1hdr.b_pdata, buf->b_data, HDR_GET_LSIZE(hdr)); + if (!arc_buf_is_shared(buf)) { + bcopy(hdr->b_l1hdr.b_pdata, buf->b_data, + HDR_GET_LSIZE(hdr)); + } } else { - ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr)); + + /* + * If the buf is compressed and sharing data with hdr, unlink + * its data buf from the header and make it uncompressed. + */ + if (ARC_BUF_COMPRESSED(buf)) { + buf->b_prop_flags &= + ~(ARC_BUF_FLAG_SHARED | ARC_BUF_FLAG_COMPRESSED); + buf->b_data = + arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); + arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); + + /* + * Previously this buf was shared so overhead was 0, so + * just add new overhead. 
+ */ + ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); + } + error = zio_decompress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pdata, buf->b_data, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); @@ -1644,7 +1729,6 @@ static void arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) { arc_buf_contents_t type = arc_buf_type(hdr); - uint64_t lsize = HDR_GET_LSIZE(hdr); arc_buf_t *buf; ASSERT(HDR_HAS_L1HDR(hdr)); @@ -1653,7 +1737,8 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); - (void) refcount_add_many(&state->arcs_esize[type], lsize, hdr); + (void) refcount_add_many(&state->arcs_esize[type], + HDR_GET_LSIZE(hdr), hdr); return; } @@ -1663,11 +1748,11 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) arc_hdr_size(hdr), hdr); } for (buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { - if (arc_buf_is_shared(buf)) { - ASSERT(ARC_BUF_LAST(buf)); + if (arc_buf_is_shared(buf)) continue; - } - (void) refcount_add_many(&state->arcs_esize[type], lsize, buf); + ASSERT3U(HDR_GET_LSIZE(hdr), ==, arc_buf_size(buf)); + (void) refcount_add_many(&state->arcs_esize[type], + arc_buf_size(buf), buf); } } @@ -1677,10 +1762,9 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) * so that we can add and remove them from the refcount individually. */ static void -arc_evitable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) +arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) { arc_buf_contents_t type = arc_buf_type(hdr); - uint64_t lsize = HDR_GET_LSIZE(hdr); arc_buf_t *buf; ASSERT(HDR_HAS_L1HDR(hdr)); @@ -1690,7 +1774,7 @@ arc_evitable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); (void) refcount_remove_many(&state->arcs_esize[type], - lsize, hdr); + HDR_GET_LSIZE(hdr), hdr); return; } @@ -1700,12 +1784,11 @@ arc_evitable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) arc_hdr_size(hdr), hdr); } for (buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { - if (arc_buf_is_shared(buf)) { - ASSERT(ARC_BUF_LAST(buf)); + if (arc_buf_is_shared(buf)) continue; - } + ASSERT3U(HDR_GET_LSIZE(hdr), ==, arc_buf_size(buf)); (void) refcount_remove_many(&state->arcs_esize[type], - lsize, buf); + arc_buf_size(buf), buf); } } @@ -1735,7 +1818,7 @@ add_reference(arc_buf_hdr_t *hdr, void *tag) if (state != arc_l2c_only) { multilist_remove(&state->arcs_list[arc_buf_type(hdr)], hdr); - arc_evitable_space_decrement(hdr, state); + arc_evictable_space_decrement(hdr, state); } /* remove the prefetch flag if we get a reference */ arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); @@ -1872,7 +1955,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); update_old = B_TRUE; } - arc_evitable_space_decrement(hdr, old_state); + arc_evictable_space_decrement(hdr, old_state); } if (new_state != arc_anon && new_state != arc_l2c_only) { /* @@ -1935,13 +2018,13 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, * add to the refcount if the arc_buf_t is * not shared. 
*/ - if (arc_buf_is_shared(buf)) { - ASSERT(ARC_BUF_LAST(buf)); + if (arc_buf_is_shared(buf)) continue; - } + ASSERT3U(HDR_GET_LSIZE(hdr), ==, + arc_buf_size(buf)); (void) refcount_add_many(&new_state->arcs_size, - HDR_GET_LSIZE(hdr), buf); + arc_buf_size(buf), buf); } ASSERT3U(bufcnt, ==, buffers); @@ -1958,6 +2041,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(old_state)) { ASSERT0(bufcnt); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); /* * When moving a header off of a ghost state, @@ -1969,7 +2053,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, (void) refcount_remove_many(&old_state->arcs_size, HDR_GET_LSIZE(hdr), hdr); - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); } else { arc_buf_t *buf; uint32_t buffers = 0; @@ -1991,13 +2074,13 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, * add to the refcount if the arc_buf_t is * not shared. */ - if (arc_buf_is_shared(buf)) { - ASSERT(ARC_BUF_LAST(buf)); + if (arc_buf_is_shared(buf)) continue; - } + ASSERT3U(HDR_GET_LSIZE(hdr), ==, + arc_buf_size(buf)); (void) refcount_remove_many( - &old_state->arcs_size, HDR_GET_LSIZE(hdr), + &old_state->arcs_size, arc_buf_size(buf), buf); } ASSERT3U(bufcnt, ==, buffers); @@ -2098,11 +2181,11 @@ arc_space_return(uint64_t space, arc_space_type_t type) } /* - * Allocate an initial buffer for this hdr, subsequent buffers will - * use arc_buf_clone(). + * Allocate either the first buffer for this hdr, or a compressed buffer for + * this hdr. Subsequent non-compressed buffers use arc_buf_clone(). */ static arc_buf_t * -arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag) +arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed) { arc_buf_t *buf; @@ -2111,9 +2194,6 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag) VERIFY(hdr->b_type == ARC_BUFC_DATA || hdr->b_type == ARC_BUFC_METADATA); - ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - ASSERT0(hdr->b_l1hdr.b_bufcnt); hdr->b_l1hdr.b_mru_hits = 0; hdr->b_l1hdr.b_mru_ghost_hits = 0; hdr->b_l1hdr.b_mfu_hits = 0; @@ -2123,7 +2203,7 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag) buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; - buf->b_next = NULL; + buf->b_next = hdr->b_l1hdr.b_buf; add_reference(hdr, tag); @@ -2134,19 +2214,30 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag) ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); /* - * If the hdr's data can be shared (no byteswapping, hdr is - * uncompressed, hdr's data is not currently being written to the - * L2ARC write) then we share the data buffer and set the appropriate - * bit in the hdr's b_flags to indicate the hdr is sharing it's - * b_pdata with the arc_buf_t. Otherwise, we allocate a new buffer to - * store the buf's data. + * If the hdr's data can be shared (no byteswapping, hdr compression + * matches the requested buf compression) then we share the data buffer + * and set the appropriate bit in the hdr's b_flags to indicate + * the hdr is sharing it's b_pdata with the arc_buf_t. Otherwise, we + * allocate a new buffer to store the buf's data. 
*/ - if (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS && - HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF && !HDR_L2_WRITING(hdr)) { + if (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS && compressed && + HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { + ASSERT(!HDR_SHARED_DATA(hdr)); + buf->b_data = hdr->b_l1hdr.b_pdata; + buf->b_prop_flags = + ARC_BUF_FLAG_SHARED | ARC_BUF_FLAG_COMPRESSED; + arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); + } else if (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS && + !compressed && HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) { + ASSERT(!HDR_SHARED_DATA(hdr)); + ASSERT(ARC_BUF_LAST(buf)); buf->b_data = hdr->b_l1hdr.b_pdata; + buf->b_prop_flags = ARC_BUF_FLAG_SHARED; arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); } else { + ASSERT(!compressed); buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); + buf->b_prop_flags = 0; ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); } @@ -2170,10 +2261,12 @@ arc_buf_clone(arc_buf_t *from) ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(hdr->b_l1hdr.b_state != arc_anon); + ASSERT(!ARC_BUF_COMPRESSED(from)); buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; + buf->b_prop_flags = 0; buf->b_next = hdr->b_l1hdr.b_buf; hdr->b_l1hdr.b_buf = buf; buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); @@ -2193,16 +2286,27 @@ static char *arc_onloan_tag = "onloan"; * freed. */ arc_buf_t * -arc_loan_buf(spa_t *spa, uint64_t size) +arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size) { - arc_buf_t *buf; - - buf = arc_alloc_buf(spa, size, arc_onloan_tag, ARC_BUFC_DATA); + arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag, + is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size); atomic_add_64(&arc_loaned_bytes, size); return (buf); } +arc_buf_t * +arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize, + enum zio_compress compression_type) +{ + arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag, + psize, lsize, compression_type); + + atomic_add_64(&arc_loaned_bytes, psize); + return (buf); +} + + /* * Return a loaned arc buffer to the arc. 
*/ @@ -2216,7 +2320,7 @@ arc_return_buf(arc_buf_t *buf, void *tag) (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); - atomic_add_64(&arc_loaned_bytes, -HDR_GET_LSIZE(hdr)); + atomic_add_64(&arc_loaned_bytes, -arc_buf_size(buf)); } /* Detach an arc_buf from a dbuf (tag) */ @@ -2230,7 +2334,7 @@ arc_loan_inuse_buf(arc_buf_t *buf, void *tag) (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); - atomic_add_64(&arc_loaned_bytes, HDR_GET_LSIZE(hdr)); + atomic_add_64(&arc_loaned_bytes, -arc_buf_size(buf)); } static void @@ -2287,6 +2391,7 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) refcount_transfer_ownership(&hdr->b_l1hdr.b_state->arcs_size, buf, hdr); hdr->b_l1hdr.b_pdata = buf->b_data; arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); + buf->b_prop_flags |= ARC_BUF_FLAG_SHARED; /* * Since we've transferred ownership to the hdr we need @@ -2295,7 +2400,7 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) */ ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); - ARCSTAT_INCR(arcstat_overhead_size, -HDR_GET_LSIZE(hdr)); + ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf)); } static void @@ -2313,6 +2418,7 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) refcount_transfer_ownership(&hdr->b_l1hdr.b_state->arcs_size, hdr, buf); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); hdr->b_l1hdr.b_pdata = NULL; + buf->b_prop_flags &= ~ARC_BUF_FLAG_SHARED; /* * Since the buffer is no longer shared between @@ -2320,21 +2426,59 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) */ ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); - ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); + ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); } /* - * Free up buf->b_data and if 'remove' is set, then pull the - * arc_buf_t off of the the arc_buf_hdr_t's list and free it. + * Remove an arc_buf_t from the hdr's buf list and return the last + * arc_buf_t on the list. If no buffers remain on the list then return + * NULL. + */ +static arc_buf_t * +arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf) +{ + arc_buf_t **bufp = &hdr->b_l1hdr.b_buf; + arc_buf_t *lastbuf = NULL; + + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + + /* + * Remove the buf from the hdr list and locate the last + * remaining buffer on the list. + */ + while (*bufp != NULL) { + if (*bufp == buf) + *bufp = buf->b_next; + + /* + * If we've removed a buffer in the middle of + * the list then update the lastbuf and update + * bufp. + */ + if (*bufp != NULL) { + lastbuf = *bufp; + bufp = &(*bufp)->b_next; + } + } + buf->b_next = NULL; + ASSERT3P(lastbuf, !=, buf); + IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL); + IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL); + IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf)); + + return (lastbuf); +} + +/* + * Free up buf->b_data and pull the arc_buf_t off of the the arc_buf_hdr_t's + * list and free it. 
*/ static void -arc_buf_destroy_impl(arc_buf_t *buf, boolean_t remove) +arc_buf_destroy_impl(arc_buf_t *buf) { - arc_buf_t **bufp; + arc_buf_t *lastbuf; arc_buf_hdr_t *hdr = buf->b_hdr; - arc_buf_t *lastbuf = NULL; - uint64_t size = HDR_GET_LSIZE(hdr); - boolean_t destroyed_buf_is_shared = arc_buf_is_shared(buf); /* * Free up the data associated with the buf but only @@ -2349,14 +2493,15 @@ arc_buf_destroy_impl(arc_buf_t *buf, boolean_t remove) */ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); - arc_cksum_verify(buf); + if (!ARC_BUF_COMPRESSED(buf)) { + arc_cksum_verify(buf); + } arc_buf_unwatch(buf); - if (destroyed_buf_is_shared) { - ASSERT(ARC_BUF_LAST(buf)); - ASSERT(HDR_SHARED_DATA(hdr)); + if (arc_buf_is_shared(buf)) { arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); } else { + uint64_t size = arc_buf_size(buf); arc_free_data_buf(hdr, buf->b_data, size, buf); ARCSTAT_INCR(arcstat_overhead_size, -size); } @@ -2366,53 +2511,53 @@ arc_buf_destroy_impl(arc_buf_t *buf, boolean_t remove) hdr->b_l1hdr.b_bufcnt -= 1; } - /* only remove the buf if requested */ - if (!remove) - return; - - /* remove the buf from the hdr list */ - bufp = &hdr->b_l1hdr.b_buf; - while (*bufp != NULL) { - if (*bufp == buf) - *bufp = buf->b_next; + lastbuf = arc_buf_remove(hdr, buf); + if (ARC_BUF_COMPRESSED(buf)) { /* - * If we've removed a buffer in the middle of - * the list then update the lastbuf and update - * bufp. + * For compressed, shared buffers we don't need to do anything + * special so take the opportunity to ensure that compressed + * buffers must be shared. The hdr has already been marked as + * not shared and we already cleared b_data, so just check the + * flag on the buf. */ - if (*bufp != NULL) { - lastbuf = *bufp; - bufp = &(*bufp)->b_next; - } - } - buf->b_next = NULL; - ASSERT3P(lastbuf, !=, buf); + VERIFY(ARC_BUF_SHARED(buf)); + } else if (ARC_BUF_SHARED(buf)) { + ASSERT(!ARC_BUF_COMPRESSED(buf)); - /* - * If the current arc_buf_t is sharing its data - * buffer with the hdr, then reassign the hdr's - * b_pdata to share it with the new buffer at the end - * of the list. The shared buffer is always the last one - * on the hdr's buffer list. - */ - if (destroyed_buf_is_shared && lastbuf != NULL) { - ASSERT(ARC_BUF_LAST(buf)); - ASSERT(ARC_BUF_LAST(lastbuf)); - VERIFY(!arc_buf_is_shared(lastbuf)); + /* + * If the current arc_buf_t is sharing its data + * buffer with the hdr, then reassign the hdr's + * b_pdata to share it with the new buffer at the end + * of the list. The shared buffer is always the last one + * on the hdr's buffer list. + */ + if (lastbuf != NULL) { + VERIFY(!arc_buf_is_shared(lastbuf)); - ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); - arc_hdr_free_pdata(hdr); + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + arc_hdr_free_pdata(hdr); + /* + * We must setup a new shared block between the + * last buffer and the hdr. The data would have + * been allocated by the arc buf so we need to transfer + * ownership to the hdr since it's now being shared. + */ + arc_share_buf(hdr, lastbuf); + } + } else if (HDR_SHARED_DATA(hdr)) { /* - * We must setup a new shared block between the - * last buffer and the hdr. The data would have - * been allocated by the arc buf so we need to transfer - * ownership to the hdr since it's now being shared. + * Uncompressed shared buffers are always at the end + * of the list. Compressed buffers don't have the + * same requirements. 
This makes it hard to + * simply assert that the lastbuf is shared so + * we rely on the hdr's compression flags to determine + * if we have a compressed, shared buffer. */ - arc_share_buf(hdr, lastbuf); - } else if (HDR_SHARED_DATA(hdr)) { - ASSERT(arc_buf_is_shared(lastbuf)); + ASSERT3P(lastbuf, !=, NULL); + ASSERT(arc_buf_is_shared(lastbuf) || + HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); } if (hdr->b_l1hdr.b_bufcnt == 0) @@ -2467,11 +2612,10 @@ arc_hdr_free_pdata(arc_buf_hdr_t *hdr) static arc_buf_hdr_t * arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, - enum zio_compress compress, arc_buf_contents_t type) + enum zio_compress compression_type, arc_buf_contents_t type) { arc_buf_hdr_t *hdr; - ASSERT3U(lsize, >, 0); VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA); hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); @@ -2483,7 +2627,7 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, hdr->b_type = type; hdr->b_flags = 0; arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR); - arc_hdr_set_compress(hdr, compress); + arc_hdr_set_compress(hdr, compression_type); hdr->b_l1hdr.b_state = arc_anon; hdr->b_l1hdr.b_arc_access = 0; @@ -2604,14 +2748,42 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) * The buf is returned thawed since we expect the consumer to modify it. */ arc_buf_t * -arc_alloc_buf(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type) +arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size) { arc_buf_t *buf; arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, ZIO_COMPRESS_OFF, type); ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); - buf = arc_buf_alloc_impl(hdr, tag); + + buf = arc_buf_alloc_impl(hdr, tag, B_FALSE); + arc_buf_thaw(buf); + + return (buf); +} + +/* + * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this + * for bufs containing metadata. + */ +arc_buf_t * +arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, + enum zio_compress compression_type) +{ + arc_buf_hdr_t *hdr; + arc_buf_t *buf; + ASSERT3U(lsize, >, 0); + ASSERT3U(lsize, >=, psize); + ASSERT(compression_type > ZIO_COMPRESS_OFF); + ASSERT(compression_type < ZIO_COMPRESS_FUNCTIONS); + + hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, + compression_type, ARC_BUFC_DATA); + ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); + + buf = arc_buf_alloc_impl(hdr, tag, B_TRUE); arc_buf_thaw(buf); + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); + return (buf); } @@ -2678,7 +2850,7 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) arc_cksum_free(hdr); while (hdr->b_l1hdr.b_buf != NULL) - arc_buf_destroy_impl(hdr->b_l1hdr.b_buf, B_TRUE); + arc_buf_destroy_impl(hdr->b_l1hdr.b_buf); if (hdr->b_l1hdr.b_pdata != NULL) { arc_hdr_free_pdata(hdr); @@ -2717,16 +2889,10 @@ arc_buf_destroy(arc_buf_t *buf, void* tag) ASSERT3P(buf->b_data, !=, NULL); (void) remove_reference(hdr, hash_lock, tag); - arc_buf_destroy_impl(buf, B_TRUE); + arc_buf_destroy_impl(buf); mutex_exit(hash_lock); } -uint64_t -arc_buf_size(arc_buf_t *buf) -{ - return (HDR_GET_LSIZE(buf->b_hdr)); -} - /* * Evict the arc_buf_hdr that is provided as a parameter. 
The resultant * state of the header is dependent on its state prior to entering this @@ -2770,7 +2936,6 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); if (HDR_HAS_L2HDR(hdr)) { ASSERT(hdr->b_l1hdr.b_pdata == NULL); /* @@ -2785,7 +2950,6 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) hdr = arc_hdr_realloc(hdr, hdr_full_cache, hdr_l2only_cache); } else { - ASSERT(hdr->b_l1hdr.b_pdata == NULL); arc_change_state(arc_anon, hdr, hash_lock); arc_hdr_destroy(hdr); } @@ -2814,7 +2978,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) if (buf->b_data != NULL) bytes_evicted += HDR_GET_LSIZE(hdr); mutex_exit(&buf->b_evict_lock); - arc_buf_destroy_impl(buf, B_TRUE); + arc_buf_destroy_impl(buf); } if (HDR_HAS_L2HDR(hdr)) { @@ -3325,7 +3489,7 @@ arc_adjust_meta_only(void) /* * Similar to the above, we want to evict enough bytes to get us * below the meta limit, but not so much as to drop us below the - * space alloted to the MFU (which is defined as arc_c - arc_p). + * space allotted to the MFU (which is defined as arc_c - arc_p). */ target = MIN((int64_t)(arc_meta_used - arc_meta_limit), (int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p))); @@ -4449,7 +4613,7 @@ void arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) { if (zio == NULL || zio->io_error == 0) - bcopy(buf->b_data, arg, HDR_GET_LSIZE(buf->b_hdr)); + bcopy(buf->b_data, arg, arc_buf_size(buf)); arc_buf_destroy(buf, arg); } @@ -4487,11 +4651,11 @@ static void arc_read_done(zio_t *zio) { arc_buf_hdr_t *hdr = zio->io_private; - arc_buf_t *abuf = NULL; /* buffer we're assigning to callback */ kmutex_t *hash_lock = NULL; arc_callback_t *callback_list, *acb; - int freeable = B_FALSE; - + boolean_t freeable = B_FALSE; + arc_buf_t *decomp_buf = NULL; + int callback_cnt = 0; /* * The hdr was inserted into hash-table and removed from lists * prior to starting I/O. We should find this header, since @@ -4549,39 +4713,45 @@ arc_read_done(zio_t *zio) arc_access(hdr, hash_lock); } - /* create copies of the data buffer for the callers */ - for (acb = callback_list; acb; acb = acb->acb_next) { - if (acb->acb_done != NULL) { - /* - * If we're here, then this must be a demand read - * since prefetch requests don't have callbacks. - * If a read request has a callback (i.e. acb_done is - * not NULL), then we decompress the data for the - * first request and clone the rest. This avoids - * having to waste cpu resources decompressing data - * that nobody is explicitly waiting to read. - */ - if (abuf == NULL) { - acb->acb_buf = arc_buf_alloc_impl(hdr, - acb->acb_private); + /* create buffers for the callers. only decompress the data once. */ + for (acb = callback_list; acb != NULL; acb = acb->acb_next) { + if (!acb->acb_done) + continue; + + /* + * If we're here, then this must be a demand read + * since prefetch requests don't have callbacks. + * If a read request has a callback (i.e. acb_done is + * not NULL), then we decompress the data for the + * first request and clone the rest. This avoids + * having to waste cpu resources decompressing data + * that nobody is explicitly waiting to read. 
+ */ + + callback_cnt++; + if (acb->acb_compressed && !HDR_SHARED_DATA(hdr) && + HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && + hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) { + acb->acb_buf = arc_buf_alloc_impl(hdr, + acb->acb_private, B_TRUE); + } else { + if (decomp_buf == NULL) { + decomp_buf = arc_buf_alloc_impl(hdr, + acb->acb_private, B_FALSE); if (zio->io_error == 0) { zio->io_error = - arc_decompress(acb->acb_buf); + arc_decompress(decomp_buf); } - abuf = acb->acb_buf; + acb->acb_buf = decomp_buf; } else { add_reference(hdr, acb->acb_private); - acb->acb_buf = arc_buf_clone(abuf); + acb->acb_buf = arc_buf_clone(decomp_buf); } } } hdr->b_l1hdr.b_acb = NULL; arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); - if (abuf == NULL) { - /* - * This buffer didn't have a callback so it must - * be a prefetch. - */ + if (callback_cnt == 0) { ASSERT(HDR_PREFETCH(hdr)); ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); @@ -4666,6 +4836,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, kmutex_t *hash_lock = NULL; zio_t *rzio; uint64_t guid = spa_load_guid(spa); + boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW) != 0; int rc = 0; ASSERT(!BP_IS_EMBEDDED(bp) || @@ -4766,19 +4937,43 @@ top: ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp)); /* - * If this block is already in use, create a new - * copy of the data so that we will be guaranteed - * that arc_release() will always succeed. + * If we're doing a raw read, the header hasn't been + * shared yet, the header contains compressed data, and + * the data does not need to be byteswapped, use the + * header's b_pdata as the new buf's b_data. Otherwise, + * we'll either need to clone an existing decompressed + * buf or decompress the data ourselves. */ - buf = hdr->b_l1hdr.b_buf; - if (buf == NULL) { - ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); - ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); - buf = arc_buf_alloc_impl(hdr, private); - VERIFY0(arc_decompress(buf)); + if (compressed_read && !HDR_SHARED_DATA(hdr) && + HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && + hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) { + buf = arc_buf_alloc_impl(hdr, private, B_TRUE); } else { - add_reference(hdr, private); - buf = arc_buf_clone(buf); + /* search for a decompressed buf */ + for (buf = hdr->b_l1hdr.b_buf; buf != NULL; + buf = buf->b_next) { + if (!ARC_BUF_COMPRESSED(buf)) + break; + } + + if (buf == NULL) { + /* there could be one compressed buf */ + IMPLY(HDR_SHARED_DATA(hdr), + refcount_count( + &hdr->b_l1hdr.b_refcnt) == 1); + /* otherwise there won't be any */ + IMPLY(!HDR_SHARED_DATA(hdr), + refcount_count( + &hdr->b_l1hdr.b_refcnt) == 0); + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, + ==, NULL); + buf = arc_buf_alloc_impl(hdr, private, + B_FALSE); + VERIFY0(arc_decompress(buf)); + } else { + add_reference(hdr, private); + buf = arc_buf_clone(buf); + } } ASSERT3P(buf->b_data, !=, NULL); @@ -4851,6 +5046,7 @@ top: ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); /* * This is a delicate dance that we play here. 
@@ -4891,6 +5087,7 @@ top: acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; + acb->acb_compressed = compressed_read; ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); hdr->b_l1hdr.b_acb = acb; @@ -5175,7 +5372,7 @@ arc_release(arc_buf_t *buf, void *tag) ASSERT3P(state, !=, arc_anon); /* this buffer is not on any list */ - ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0); + ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0); if (HDR_HAS_L2HDR(hdr)) { mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); @@ -5199,7 +5396,6 @@ arc_release(arc_buf_t *buf, void *tag) */ if (hdr->b_l1hdr.b_bufcnt > 1) { arc_buf_hdr_t *nhdr; - arc_buf_t **bufp; uint64_t spa = hdr->b_spa; uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t lsize = HDR_GET_LSIZE(hdr); @@ -5211,35 +5407,15 @@ arc_release(arc_buf_t *buf, void *tag) ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); (void) remove_reference(hdr, hash_lock, tag); - if (arc_buf_is_shared(buf)) { - ASSERT(HDR_SHARED_DATA(hdr)); + if (arc_buf_is_shared(buf)) ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); - ASSERT(ARC_BUF_LAST(buf)); - } /* * Pull the data off of this hdr and attach it to * a new anonymous hdr. Also find the last buffer * in the hdr's buffer list. */ - bufp = &hdr->b_l1hdr.b_buf; - while (*bufp != NULL) { - if (*bufp == buf) { - *bufp = buf->b_next; - } - - /* - * If we've removed a buffer in the middle of - * the list then update the lastbuf and update - * bufp. - */ - if (*bufp != NULL) { - lastbuf = *bufp; - bufp = &(*bufp)->b_next; - } - } - buf->b_next = NULL; - ASSERT3P(lastbuf, !=, buf); + lastbuf = arc_buf_remove(hdr, buf); ASSERT3P(lastbuf, !=, NULL); /* @@ -5250,7 +5426,6 @@ arc_release(arc_buf_t *buf, void *tag) */ if (arc_buf_is_shared(buf)) { ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); - ASSERT(ARC_BUF_LAST(lastbuf)); VERIFY(!arc_buf_is_shared(lastbuf)); /* @@ -5260,21 +5435,46 @@ arc_release(arc_buf_t *buf, void *tag) * on the arc_buf_t list. */ arc_unshare_buf(hdr, buf); - arc_share_buf(hdr, lastbuf); + + /* + * If the buf we removed was compressed, then + * we need to allocate a new compressed block for the + * hdr and copy the data over. Otherwise, the + * buffer was uncompressed and we can now share + * the data with the lastbuf. + */ + if (ARC_BUF_COMPRESSED(buf)) { + ASSERT(!ARC_BUF_COMPRESSED(lastbuf)); + arc_hdr_alloc_pdata(hdr); + bcopy(buf->b_data, hdr->b_l1hdr.b_pdata, psize); + } else { + ASSERT(!ARC_BUF_COMPRESSED(lastbuf)); + arc_share_buf(hdr, lastbuf); + } VERIFY3P(lastbuf->b_data, !=, NULL); } else if (HDR_SHARED_DATA(hdr)) { - ASSERT(arc_buf_is_shared(lastbuf)); + /* + * Uncompressed shared buffers are always at the end + * of the list. Compressed buffers don't have the + * same requirements. This makes it hard to + * simply assert that the lastbuf is shared so + * we rely on the hdr's compression flags to determine + * if we have a compressed, shared buffer. 
+ */ + ASSERT(arc_buf_is_shared(lastbuf) || + HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); + ASSERT(!ARC_BUF_SHARED(buf)); } ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); ASSERT3P(state, !=, arc_l2c_only); (void) refcount_remove_many(&state->arcs_size, - HDR_GET_LSIZE(hdr), buf); + arc_buf_size(buf), buf); if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { ASSERT3P(state, !=, arc_l2c_only); (void) refcount_remove_many(&state->arcs_esize[type], - HDR_GET_LSIZE(hdr), buf); + arc_buf_size(buf), buf); } hdr->b_l1hdr.b_bufcnt -= 1; @@ -5368,15 +5568,13 @@ arc_write_ready(zio_t *zio) /* * If we're reexecuting this zio because the pool suspended, then * cleanup any state that was previously set the first time the - * callback as invoked. + * callback was invoked. */ if (zio->io_flags & ZIO_FLAG_REEXECUTED) { arc_cksum_free(hdr); arc_buf_unwatch(buf); if (hdr->b_l1hdr.b_pdata != NULL) { if (arc_buf_is_shared(buf)) { - ASSERT(HDR_SHARED_DATA(hdr)); - arc_unshare_buf(hdr, buf); } else { arc_hdr_free_pdata(hdr); @@ -5412,19 +5610,27 @@ arc_write_ready(zio_t *zio) * arc thus the on-disk block may or may not match what we maintain * in the hdr's b_pdata field. */ - if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && + !ARC_BUF_COMPRESSED(buf)) { ASSERT(BP_GET_COMPRESS(zio->io_bp) != ZIO_COMPRESS_OFF); ASSERT3U(psize, >, 0); arc_hdr_alloc_pdata(hdr); bcopy(zio->io_data, hdr->b_l1hdr.b_pdata, psize); } else { ASSERT3P(buf->b_data, ==, zio->io_orig_data); - ASSERT3U(zio->io_orig_size, ==, HDR_GET_LSIZE(hdr)); + ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf)); ASSERT3U(hdr->b_l1hdr.b_byteswap, ==, DMU_BSWAP_NUMFUNCS); ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT(!arc_buf_is_shared(buf)); ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + if (ARC_BUF_COMPRESSED(buf)) { + ASSERT3U(zio->io_orig_size, ==, HDR_GET_PSIZE(hdr)); + } else { + ASSERT3U(zio->io_orig_size, ==, HDR_GET_LSIZE(hdr)); + } + EQUIV(HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF, + ARC_BUF_COMPRESSED(buf)); /* * This hdr is not compressed so we're able to share @@ -5561,6 +5767,10 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); if (l2arc) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); + if (ARC_BUF_COMPRESSED(buf)) { + ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_OFF); + zio_flags |= ZIO_FLAG_RAW; + } callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); callback->awcb_ready = ready; callback->awcb_children_ready = children_ready; @@ -5581,7 +5791,6 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, * buf will take sole ownership of the block. */ if (arc_buf_is_shared(buf)) { - ASSERT(ARC_BUF_LAST(buf)); arc_unshare_buf(hdr, buf); } else { arc_hdr_free_pdata(hdr); @@ -5592,7 +5801,8 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, ASSERT(!arc_buf_is_shared(buf)); ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); - zio = zio_write(pio, spa, txg, bp, buf->b_data, HDR_GET_LSIZE(hdr), zp, + zio = zio_write(pio, spa, txg, bp, buf->b_data, + HDR_GET_LSIZE(hdr), arc_buf_size(buf), zp, arc_write_ready, (children_ready != NULL) ? arc_write_children_ready : NULL, arc_write_physdone, arc_write_done, callback, |
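For readers skimming the new block-comment rules in the hunks above, the following standalone sketch models two of the behaviors this commit introduces: arc_buf_size() returning the physical size for compressed bufs, and the check in arc_buf_alloc_impl() that decides whether a new arc_buf_t may share the hdr's b_pdata rather than getting its own copy. It is illustrative only: model_hdr_t, model_buf_t, model_buf_size() and can_share_with_hdr() are simplified stand-ins invented for this sketch, not the real arc_buf_hdr_t/arc_buf_t code, and the actual patch additionally enforces that a shared uncompressed buf is the last entry on the hdr's b_buf list (ARC_BUF_LAST) and honors the zfs_compressed_arc_enabled tunable.

/*
 * Minimal userland model of two rules added by this commit, using
 * hypothetical simplified types (not the kernel structures):
 *
 *   1. A compressed arc_buf_t occupies psize bytes in memory, an
 *      uncompressed one lsize bytes (cf. arc_buf_size()).
 *   2. A newly allocated buf may share the hdr's b_pdata only when no
 *      byteswap is needed and the requested compression matches the
 *      hdr's on-disk compression (cf. arc_buf_alloc_impl()).
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef struct model_hdr {
	uint64_t	psize;		/* physical (on-disk) size */
	uint64_t	lsize;		/* logical (uncompressed) size */
	bool		compressed;	/* b_pdata holds compressed data */
	bool		needs_byteswap;	/* data not in native byte order */
} model_hdr_t;

typedef struct model_buf {
	const model_hdr_t *hdr;
	bool		compressed;	/* models ARC_BUF_FLAG_COMPRESSED */
} model_buf_t;

/* Rule 1: in-memory footprint of a buf. */
static uint64_t
model_buf_size(const model_buf_t *buf)
{
	return (buf->compressed ? buf->hdr->psize : buf->hdr->lsize);
}

/* Rule 2: may a new buf share the hdr's b_pdata instead of copying? */
static bool
can_share_with_hdr(const model_hdr_t *hdr, bool want_compressed)
{
	if (hdr->needs_byteswap)
		return (false);
	/* a compressed consumer (e.g. "zfs send") needs a compressed hdr */
	if (want_compressed)
		return (hdr->compressed);
	/* an uncompressed consumer can only share an uncompressed b_pdata */
	return (!hdr->compressed);
}

int
main(void)
{
	model_hdr_t hdr = {
		.psize = 4096, .lsize = 131072,
		.compressed = true, .needs_byteswap = false
	};
	model_buf_t raw = { .hdr = &hdr, .compressed = true };
	model_buf_t plain = { .hdr = &hdr, .compressed = false };

	printf("raw buf:   %llu bytes, shares b_pdata: %s\n",
	    (unsigned long long)model_buf_size(&raw),
	    can_share_with_hdr(&hdr, true) ? "yes" : "no");
	printf("plain buf: %llu bytes, shares b_pdata: %s\n",
	    (unsigned long long)model_buf_size(&plain),
	    can_share_with_hdr(&hdr, false) ? "yes" : "no");
	return (0);
}

Run against a compressed header (psize 4K, lsize 128K), the model reports that a raw consumer's buf costs 4K and shares the hdr's data, while an uncompressed consumer's buf costs 128K and gets its own decompressed copy, which is the memory trade-off the overhead_size kstat discussion above is describing.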