diff options
author | Serapheim Dimitropoulos <[email protected]> | 2017-08-04 09:30:49 -0700 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2018-07-05 12:02:34 -0700 |
commit | 4d044c4c1d68ed518fe37eea61a4cc77048940fb (patch) | |
tree | e9e7f15b37f046a508ae038246e3808e17eed625 /include/sys | |
parent | 4e82b4be78b0febb2a6add5dc070f34b27a4b786 (diff) |
OpenZFS 9238 - ZFS Spacemap Encoding V2
Motivation
==========
The current space map encoding has the following disadvantages:
[1] Assuming a 512-byte sector size, each entry can represent a segment of at most 16MB.
This makes the encoding very inefficient for large regions of space.
[2] As vdev-wide space maps have started to be used by new features (i.e.
device removal, zpool checkpoint), we've started imposing limits on the
vdevs that can be used with them based on the maximum addressable offset
(currently 64PB for a top-level vdev).
New encoding
============
The layout can be found in space_map.h, and it remains backwards compatible with
the old one. The introduced two-word entry format, besides extending the limits
imposed by the single-entry layout, also includes a vdev field and some extra
padding after its prefix.
The extra padding after the prefix is reserved for future use (e.g.
new prefixes for future encodings or new fields for flags). The new vdev field
not only makes the space maps more self-descriptive, but also opens the doors
for pool-wide space maps (expected to be used in the log spacemap project).
One final important note is that the number of bits used for vdevs in blkptrs
is reduced from 32 to 24. That was decided as we don't know of any setups that
use more than 16M vdevs for the time being and we wanted to fit the vdev field
in the space map. In addition, that gives us some extra bits in dva_t.
Other references:
=================
The new encoding is also discussed towards the end of the Log Space Map
presentation from 2017's OpenZFS summit.
Link: https://www.youtube.com/watch?v=jj2IxRkl5bQ
Authored by: Serapheim Dimitropoulos <[email protected]>
Reviewed by: Matt Ahrens <[email protected]>
Reviewed by: George Wilson <[email protected]>
Reviewed by: Brian Behlendorf <[email protected]>
Approved by: Gordon Ross <[email protected]>
Ported-by: Tim Chase <[email protected]>
Signed-off-by: Tim Chase <[email protected]>
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/90a56e6d
OpenZFS-issue: https://www.illumos.org/issues/9238
Closes #7665
Diffstat (limited to 'include/sys')
-rw-r--r-- | include/sys/spa.h | 12 | ||||
-rw-r--r-- | include/sys/space_map.h | 114 |
2 files changed, 90 insertions, 36 deletions
diff --git a/include/sys/spa.h b/include/sys/spa.h index b6483e11b..4a3fc71f7 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -153,6 +153,7 @@ _NOTE(CONSTCOND) } while (0) #define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */ #define SPA_COMPRESSBITS 7 +#define SPA_VDEVBITS 24 /* * All SPA data is represented by 128-bit data virtual addresses (DVAs). @@ -177,15 +178,15 @@ typedef struct zio_cksum_salt { * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 0 | vdev1 | GRID | ASIZE | + * 0 | pad | vdev1 | GRID | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 1 |G| offset1 | * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 2 | vdev2 | GRID | ASIZE | + * 2 | pad | vdev2 | GRID | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 3 |G| offset2 | * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 4 | vdev3 | GRID | ASIZE | + * 4 | pad | vdev3 | GRID | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 5 |G| offset3 | * +-------+-------+-------+-------+-------+-------+-------+-------+ @@ -443,8 +444,9 @@ typedef struct blkptr { #define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8) #define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x) -#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, 32) -#define DVA_SET_VDEV(dva, x) BF64_SET((dva)->dva_word[0], 32, 32, x) +#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, SPA_VDEVBITS) +#define DVA_SET_VDEV(dva, x) \ + BF64_SET((dva)->dva_word[0], 32, SPA_VDEVBITS, x) #define DVA_GET_OFFSET(dva) \ BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0) diff --git a/include/sys/space_map.h b/include/sys/space_map.h index 98b87269c..64c97bb4d 100644 --- a/include/sys/space_map.h +++ b/include/sys/space_map.h @@ -93,50 +93,100 @@ typedef struct space_map { /* * debug entry * - * 1 3 10 50 - * 
,---+--------+------------+---------------------------------. - * | 1 | action | syncpass | txg (lower bits) | - * `---+--------+------------+---------------------------------' - * 63 62 60 59 50 49 0 + * 2 2 10 50 + * +-----+-----+------------+----------------------------------+ + * | 1 0 | act | syncpass | txg (lower bits) | + * +-----+-----+------------+----------------------------------+ + * 63 62 61 60 59 50 49 0 * * - * non-debug entry + * one-word entry * * 1 47 1 15 - * ,-----------------------------------------------------------. + * +-----------------------------------------------------------+ * | 0 | offset (sm_shift units) | type | run | - * `-----------------------------------------------------------' - * 63 62 17 16 15 0 + * +-----------------------------------------------------------+ + * 63 62 16 15 14 0 + * + * + * two-word entry + * + * 2 2 36 24 + * +-----+-----+---------------------------+-------------------+ + * | 1 1 | pad | run | vdev | + * +-----+-----+---------------------------+-------------------+ + * 63 62 61 60 59 24 23 0 + * + * 1 63 + * +------+----------------------------------------------------+ + * | type | offset | + * +------+----------------------------------------------------+ + * 63 62 0 + * + * Note that a two-word entry will not straddle a block boundary. + * If necessary, the last word of a block will be padded with a + * debug entry (with act = syncpass = txg = 0). 
*/ -/* All this stuff takes and returns bytes */ -#define SM_RUN_DECODE(x) (BF64_DECODE(x, 0, 15) + 1) -#define SM_RUN_ENCODE(x) BF64_ENCODE((x) - 1, 0, 15) -#define SM_TYPE_DECODE(x) BF64_DECODE(x, 15, 1) -#define SM_TYPE_ENCODE(x) BF64_ENCODE(x, 15, 1) -#define SM_OFFSET_DECODE(x) BF64_DECODE(x, 16, 47) -#define SM_OFFSET_ENCODE(x) BF64_ENCODE(x, 16, 47) -#define SM_DEBUG_DECODE(x) BF64_DECODE(x, 63, 1) -#define SM_DEBUG_ENCODE(x) BF64_ENCODE(x, 63, 1) +typedef enum { + SM_ALLOC, + SM_FREE +} maptype_t; + +typedef struct space_map_entry { + maptype_t sme_type; + uint32_t sme_vdev; /* max is 2^24-1; SM_NO_VDEVID if not present */ + uint64_t sme_offset; /* max is 2^63-1; units of sm_shift */ + uint64_t sme_run; /* max is 2^36; units of sm_shift */ +} space_map_entry_t; + +#define SM_NO_VDEVID (1 << SPA_VDEVBITS) -#define SM_DEBUG_ACTION_DECODE(x) BF64_DECODE(x, 60, 3) -#define SM_DEBUG_ACTION_ENCODE(x) BF64_ENCODE(x, 60, 3) +/* one-word entry constants */ +#define SM_DEBUG_PREFIX 2 +#define SM_OFFSET_BITS 47 +#define SM_RUN_BITS 15 +/* two-word entry constants */ +#define SM2_PREFIX 3 +#define SM2_OFFSET_BITS 63 +#define SM2_RUN_BITS 36 + +#define SM_PREFIX_DECODE(x) BF64_DECODE(x, 62, 2) +#define SM_PREFIX_ENCODE(x) BF64_ENCODE(x, 62, 2) + +#define SM_DEBUG_ACTION_DECODE(x) BF64_DECODE(x, 60, 2) +#define SM_DEBUG_ACTION_ENCODE(x) BF64_ENCODE(x, 60, 2) #define SM_DEBUG_SYNCPASS_DECODE(x) BF64_DECODE(x, 50, 10) #define SM_DEBUG_SYNCPASS_ENCODE(x) BF64_ENCODE(x, 50, 10) - #define SM_DEBUG_TXG_DECODE(x) BF64_DECODE(x, 0, 50) #define SM_DEBUG_TXG_ENCODE(x) BF64_ENCODE(x, 0, 50) -#define SM_RUN_MAX SM_RUN_DECODE(~0ULL) - -typedef enum { - SM_ALLOC, - SM_FREE -} maptype_t; - -typedef int (*sm_cb_t)(maptype_t type, uint64_t offset, uint64_t size, - void *arg); +#define SM_OFFSET_DECODE(x) BF64_DECODE(x, 16, SM_OFFSET_BITS) +#define SM_OFFSET_ENCODE(x) BF64_ENCODE(x, 16, SM_OFFSET_BITS) +#define SM_TYPE_DECODE(x) BF64_DECODE(x, 15, 1) +#define SM_TYPE_ENCODE(x) 
BF64_ENCODE(x, 15, 1) +#define SM_RUN_DECODE(x) (BF64_DECODE(x, 0, SM_RUN_BITS) + 1) +#define SM_RUN_ENCODE(x) BF64_ENCODE((x) - 1, 0, SM_RUN_BITS) +#define SM_RUN_MAX SM_RUN_DECODE(~0ULL) +#define SM_OFFSET_MAX SM_OFFSET_DECODE(~0ULL) + +#define SM2_RUN_DECODE(x) (BF64_DECODE(x, SPA_VDEVBITS, SM2_RUN_BITS) + 1) +#define SM2_RUN_ENCODE(x) BF64_ENCODE((x) - 1, SPA_VDEVBITS, SM2_RUN_BITS) +#define SM2_VDEV_DECODE(x) BF64_DECODE(x, 0, SPA_VDEVBITS) +#define SM2_VDEV_ENCODE(x) BF64_ENCODE(x, 0, SPA_VDEVBITS) +#define SM2_TYPE_DECODE(x) BF64_DECODE(x, SM2_OFFSET_BITS, 1) +#define SM2_TYPE_ENCODE(x) BF64_ENCODE(x, SM2_OFFSET_BITS, 1) +#define SM2_OFFSET_DECODE(x) BF64_DECODE(x, 0, SM2_OFFSET_BITS) +#define SM2_OFFSET_ENCODE(x) BF64_ENCODE(x, 0, SM2_OFFSET_BITS) +#define SM2_RUN_MAX SM2_RUN_DECODE(~0ULL) +#define SM2_OFFSET_MAX SM2_OFFSET_DECODE(~0ULL) + +boolean_t sm_entry_is_debug(uint64_t e); +boolean_t sm_entry_is_single_word(uint64_t e); +boolean_t sm_entry_is_double_word(uint64_t e); + +typedef int (*sm_cb_t)(space_map_entry_t *sme, void *arg); int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype); int space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg); @@ -154,7 +204,9 @@ uint64_t space_map_allocated(space_map_t *sm); uint64_t space_map_length(space_map_t *sm); void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, - dmu_tx_t *tx); + uint64_t vdev_id, dmu_tx_t *tx); +uint64_t space_map_estimate_optimal_size(space_map_t *sm, range_tree_t *rt, + uint64_t vdev_id); void space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx); uint64_t space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx); void space_map_free(space_map_t *sm, dmu_tx_t *tx); |