From 4d044c4c1d68ed518fe37eea61a4cc77048940fb Mon Sep 17 00:00:00 2001 From: Serapheim Dimitropoulos Date: Fri, 4 Aug 2017 09:30:49 -0700 Subject: OpenZFS 9238 - ZFS Spacemap Encoding V2 Motivation ========== The current space map encoding has the following disadvantages: [1] Assuming 512 sector size each entry can represent at most 16MB for a segment. This makes the encoding very inefficient for large regions of space. [2] As vdev-wide space maps have started to be used by new features (i.e. device removal, zpool checkpoint) we've started imposing limits in the vdevs that can be used with them based on the maximum addressable offset (currently 64PB for a top-level vdev). New encoding ============ The layout can be found at space_map.h and it remains backwards compatible with the old one. The introduced two-word entry format, besides extending the limits imposed by the single-entry layout, also includes a vdev field and some extra padding after its prefix. The extra padding after the prefix should is reserved for future usage (e.g. new prefixes for future encodings or new fields for flags). The new vdev field not only makes the space maps more self-descriptive, but also opens the doors for pool-wide space maps (expected to be used in the log spacemap project). One final important note is that the number of bits used for vdevs is reduced to 24 bits for blkptrs. That was decided as we don't know of any setups that use more than 16M vdevs for the time being and we wanted to fit the vdev field in the space map. In addition that gives us some extra bits in dva_t. Other references: ================= The new encoding is also discussed towards the end of the Log Space Map presentation from 2017's OpenZFS summit. Link: https://www.youtube.com/watch?v=jj2IxRkl5bQ Authored by: Serapheim Dimitropoulos Reviewed by: Matt Ahrens Reviewed by: George Wilson Reviewed by: Brian Behlendorf Approved by: Gordon Ross Ported-by: Tim Chase Signed-off-by: Tim Chase OpenZFS-commit: https://github.com/openzfs/openzfs/commit/90a56e6d OpenZFS-issue: https://www.illumos.org/issues/9238 Closes #7665 --- module/zfs/spa_checkpoint.c | 50 +++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 24 deletions(-) (limited to 'module/zfs/spa_checkpoint.c') diff --git a/module/zfs/spa_checkpoint.c b/module/zfs/spa_checkpoint.c index 544658821..6f7e9ab83 100644 --- a/module/zfs/spa_checkpoint.c +++ b/module/zfs/spa_checkpoint.c @@ -203,13 +203,12 @@ typedef struct spa_checkpoint_discard_sync_callback_arg { } spa_checkpoint_discard_sync_callback_arg_t; static int -spa_checkpoint_discard_sync_callback(maptype_t type, uint64_t offset, - uint64_t size, void *arg) +spa_checkpoint_discard_sync_callback(space_map_entry_t *sme, void *arg) { spa_checkpoint_discard_sync_callback_arg_t *sdc = arg; vdev_t *vd = sdc->sdc_vd; - metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; - uint64_t end = offset + size; + metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift]; + uint64_t end = sme->sme_offset + sme->sme_run; if (sdc->sdc_entry_limit == 0) return (EINTR); @@ -224,8 +223,8 @@ spa_checkpoint_discard_sync_callback(maptype_t type, uint64_t offset, * metaslab boundaries. So if needed we could add code * that handles metaslab-crossing segments in the future. */ - VERIFY3U(type, ==, SM_FREE); - VERIFY3U(offset, >=, ms->ms_start); + VERIFY3U(sme->sme_type, ==, SM_FREE); + VERIFY3U(sme->sme_offset, >=, ms->ms_start); VERIFY3U(end, <=, ms->ms_start + ms->ms_size); /* @@ -237,14 +236,15 @@ spa_checkpoint_discard_sync_callback(maptype_t type, uint64_t offset, mutex_enter(&ms->ms_lock); if (range_tree_is_empty(ms->ms_freeing)) vdev_dirty(vd, VDD_METASLAB, ms, sdc->sdc_txg); - range_tree_add(ms->ms_freeing, offset, size); + range_tree_add(ms->ms_freeing, sme->sme_offset, sme->sme_run); mutex_exit(&ms->ms_lock); - ASSERT3U(vd->vdev_spa->spa_checkpoint_info.sci_dspace, >=, size); - ASSERT3U(vd->vdev_stat.vs_checkpoint_space, >=, size); + ASSERT3U(vd->vdev_spa->spa_checkpoint_info.sci_dspace, >=, + sme->sme_run); + ASSERT3U(vd->vdev_stat.vs_checkpoint_space, >=, sme->sme_run); - vd->vdev_spa->spa_checkpoint_info.sci_dspace -= size; - vd->vdev_stat.vs_checkpoint_space -= size; + vd->vdev_spa->spa_checkpoint_info.sci_dspace -= sme->sme_run; + vd->vdev_stat.vs_checkpoint_space -= sme->sme_run; sdc->sdc_entry_limit--; return (0); @@ -291,12 +291,13 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx) * Thus, we set the maximum entries that the space map callback * will be applied to be half the entries that could fit in the * imposed memory limit. + * + * Note that since this is a conservative estimate we also + * assume the worst case scenario in our computation where each + * entry is two-word. */ uint64_t max_entry_limit = - (zfs_spa_discard_memory_limit / sizeof (uint64_t)) >> 1; - - uint64_t entries_in_sm = - space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t); + (zfs_spa_discard_memory_limit / (2 * sizeof (uint64_t))) >> 1; /* * Iterate from the end of the space map towards the beginning, @@ -320,14 +321,15 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx) spa_checkpoint_discard_sync_callback_arg_t sdc; sdc.sdc_vd = vd; sdc.sdc_txg = tx->tx_txg; - sdc.sdc_entry_limit = MIN(entries_in_sm, max_entry_limit); + sdc.sdc_entry_limit = max_entry_limit; - uint64_t entries_before = entries_in_sm; + uint64_t words_before = + space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t); error = space_map_incremental_destroy(vd->vdev_checkpoint_sm, spa_checkpoint_discard_sync_callback, &sdc, tx); - uint64_t entries_after = + uint64_t words_after = space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t); #ifdef ZFS_DEBUG @@ -335,9 +337,9 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx) #endif zfs_dbgmsg("discarding checkpoint: txg %llu, vdev id %d, " - "deleted %llu entries - %llu entries are left", - tx->tx_txg, vd->vdev_id, (entries_before - entries_after), - entries_after); + "deleted %llu words - %llu words are left", + tx->tx_txg, vd->vdev_id, (words_before - words_after), + words_after); if (error != EINTR) { if (error != 0) { @@ -346,15 +348,15 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx) "space map of vdev %llu\n", error, vd->vdev_id); } - ASSERT0(entries_after); + ASSERT0(words_after); ASSERT0(vd->vdev_checkpoint_sm->sm_alloc); - ASSERT0(vd->vdev_checkpoint_sm->sm_length); + ASSERT0(space_map_length(vd->vdev_checkpoint_sm)); space_map_free(vd->vdev_checkpoint_sm, tx); space_map_close(vd->vdev_checkpoint_sm); vd->vdev_checkpoint_sm = NULL; - VERIFY0(zap_remove(vd->vdev_spa->spa_meta_objset, + VERIFY0(zap_remove(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, tx)); } } -- cgit v1.2.3