Diffstat (limited to 'module')
-rw-r--r-- | module/zfs/arc.c          |  123
-rw-r--r-- | module/zfs/dbuf.c         |   22
-rw-r--r-- | module/zfs/ddt.c          |   17
-rw-r--r-- | module/zfs/dmu_traverse.c |    3
-rw-r--r-- | module/zfs/dsl_pool.c     |    4
-rw-r--r-- | module/zfs/dsl_scan.c     | 2680
-rw-r--r-- | module/zfs/metaslab.c     |   82
-rw-r--r-- | module/zfs/range_tree.c   |  324
-rw-r--r-- | module/zfs/spa.c          |    6
-rw-r--r-- | module/zfs/spa_misc.c     |   10
-rw-r--r-- | module/zfs/vdev.c         |   16
-rw-r--r-- | module/zfs/vdev_queue.c   |    2
-rw-r--r-- | module/zfs/zap.c          |    2
-rw-r--r-- | module/zfs/zio.c          |   22
14 files changed, 2669 insertions, 644 deletions
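
The core of this change (module/zfs/dsl_scan.c, described in the "Grand theory statement" comment further down) is that scrub and resilver reads are no longer issued in logical traversal order; they are queued per top-level vdev and issued in LBA order. The following standalone C sketch shows only that reordering idea, using a flat array and qsort(3) instead of the AVL and range trees the patch actually uses; pending_io_t and issue_read() are illustrative names, not part of the patch.

/*
 * Simplified, standalone sketch (not ZFS code): queue reads discovered in
 * logical order, then issue them sorted by on-disk offset so the physical
 * disk sees mostly sequential I/O.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct {
	uint64_t offset;	/* on-disk (DVA) offset of the block */
	uint64_t asize;		/* allocated size to read */
} pending_io_t;

static int
offset_compare(const void *a, const void *b)
{
	const pending_io_t *ioa = a, *iob = b;

	if (ioa->offset < iob->offset)
		return (-1);
	return (ioa->offset > iob->offset);
}

static void
issue_read(const pending_io_t *io)
{
	/* stand-in for issuing the zio built from the reconstructed blkptr */
	printf("read offset=%llu asize=%llu\n",
	    (unsigned long long)io->offset, (unsigned long long)io->asize);
}

int
main(void)
{
	/* blocks discovered in logical (metadata traversal) order */
	pending_io_t queue[] = {
		{ 9 << 20, 128 << 10 },
		{ 1 << 20, 128 << 10 },
		{ 5 << 20, 128 << 10 },
	};
	size_t nio = sizeof (queue) / sizeof (queue[0]);

	/* issue in LBA order so reads approach sequential throughput */
	qsort(queue, nio, sizeof (pending_io_t), offset_compare);
	for (size_t i = 0; i < nio; i++)
		issue_read(&queue[i]);
	return (0);
}
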
diff --git a/module/zfs/arc.c b/module/zfs/arc.c index cd343b04e..698357632 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -357,7 +357,8 @@ int arc_no_grow_shift = 5; * minimum lifespan of a prefetch block in clock ticks * (initialized in arc_init()) */ -static int arc_min_prefetch_lifespan; +static int arc_min_prefetch_ms; +static int arc_min_prescient_prefetch_ms; /* * If this percent of memory is free, don't throttle. @@ -407,7 +408,8 @@ unsigned long zfs_arc_dnode_limit_percent = 10; * These tunables are Linux specific */ unsigned long zfs_arc_sys_free = 0; -int zfs_arc_min_prefetch_lifespan = 0; +int zfs_arc_min_prefetch_ms = 0; +int zfs_arc_min_prescient_prefetch_ms = 0; int zfs_arc_p_aggressive_disable = 1; int zfs_arc_p_dampener_disable = 1; int zfs_arc_meta_prune = 10000; @@ -663,6 +665,7 @@ typedef struct arc_stats { kstat_named_t arcstat_meta_min; kstat_named_t arcstat_sync_wait_for_async; kstat_named_t arcstat_demand_hit_predictive_prefetch; + kstat_named_t arcstat_demand_hit_prescient_prefetch; kstat_named_t arcstat_need_free; kstat_named_t arcstat_sys_free; kstat_named_t arcstat_raw_size; @@ -762,6 +765,7 @@ static arc_stats_t arc_stats = { { "arc_meta_min", KSTAT_DATA_UINT64 }, { "sync_wait_for_async", KSTAT_DATA_UINT64 }, { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 }, + { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 }, { "arc_need_free", KSTAT_DATA_UINT64 }, { "arc_sys_free", KSTAT_DATA_UINT64 }, { "arc_raw_size", KSTAT_DATA_UINT64 } @@ -861,6 +865,8 @@ static taskq_t *arc_prune_taskq; #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) +#define HDR_PRESCIENT_PREFETCH(hdr) \ + ((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) #define HDR_COMPRESSION_ENABLED(hdr) \ ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC) @@ -3778,6 +3784,8 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) { arc_state_t *evicted_state, *state; int64_t bytes_evicted = 0; + int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ? + arc_min_prescient_prefetch_ms : arc_min_prefetch_ms; ASSERT(MUTEX_HELD(hash_lock)); ASSERT(HDR_HAS_L1HDR(hdr)); @@ -3831,8 +3839,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) /* prefetch buffers have a minimum lifespan */ if (HDR_IO_IN_PROGRESS(hdr) || ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && - ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < - arc_min_prefetch_lifespan)) { + ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < min_lifetime * hz)) { ARCSTAT_BUMP(arcstat_evict_skip); return (bytes_evicted); } @@ -5492,13 +5499,15 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) * - move the buffer to the head of the list if this is * another prefetch (to make it less likely to be evicted). */ - if (HDR_PREFETCH(hdr)) { + if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { /* link protected by hash lock */ ASSERT(multilist_link_active( &hdr->b_l1hdr.b_arc_node)); } else { - arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); + arc_hdr_clear_flags(hdr, + ARC_FLAG_PREFETCH | + ARC_FLAG_PRESCIENT_PREFETCH); atomic_inc_32(&hdr->b_l1hdr.b_mru_hits); ARCSTAT_BUMP(arcstat_mru_hits); } @@ -5532,10 +5541,13 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) * MFU state. 
*/ - if (HDR_PREFETCH(hdr)) { + if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { new_state = arc_mru; - if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) - arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); + if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) { + arc_hdr_clear_flags(hdr, + ARC_FLAG_PREFETCH | + ARC_FLAG_PRESCIENT_PREFETCH); + } DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } else { new_state = arc_mfu; @@ -5557,11 +5569,7 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) * If it was a prefetch, we will explicitly move it to * the head of the list now. */ - if ((HDR_PREFETCH(hdr)) != 0) { - ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - /* link protected by hash_lock */ - ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node)); - } + atomic_inc_32(&hdr->b_l1hdr.b_mfu_hits); ARCSTAT_BUMP(arcstat_mfu_hits); hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); @@ -5573,12 +5581,11 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) * MFU state. */ - if (HDR_PREFETCH(hdr)) { + if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { /* * This is a prefetch access... * move this block back to the MRU state. */ - ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); new_state = arc_mru; } @@ -5605,20 +5612,25 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) /* a generic arc_read_done_func_t which you can use */ /* ARGSUSED */ void -arc_bcopy_func(zio_t *zio, int error, arc_buf_t *buf, void *arg) +arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, + arc_buf_t *buf, void *arg) { - if (error == 0) - bcopy(buf->b_data, arg, arc_buf_size(buf)); + if (buf == NULL) + return; + + bcopy(buf->b_data, arg, arc_buf_size(buf)); arc_buf_destroy(buf, arg); } /* a generic arc_read_done_func_t */ +/* ARGSUSED */ void -arc_getbuf_func(zio_t *zio, int error, arc_buf_t *buf, void *arg) +arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, + arc_buf_t *buf, void *arg) { arc_buf_t **bufp = arg; - if (error != 0) { - arc_buf_destroy(buf, arg); + + if (buf == NULL) { *bufp = NULL; } else { *bufp = buf; @@ -5652,7 +5664,6 @@ arc_read_done(zio_t *zio) arc_callback_t *callback_list; arc_callback_t *acb; boolean_t freeable = B_FALSE; - boolean_t no_zio_error = (zio->io_error == 0); /* * The hdr was inserted into hash-table and removed from lists @@ -5699,7 +5710,7 @@ arc_read_done(zio_t *zio) } } - if (no_zio_error) { + if (zio->io_error == 0) { /* byteswap if necessary */ if (BP_SHOULD_BYTESWAP(zio->io_bp)) { if (BP_GET_LEVEL(zio->io_bp) > 0) { @@ -5720,7 +5731,8 @@ arc_read_done(zio_t *zio) callback_list = hdr->b_l1hdr.b_acb; ASSERT3P(callback_list, !=, NULL); - if (hash_lock && no_zio_error && hdr->b_l1hdr.b_state == arc_anon) { + if (hash_lock && zio->io_error == 0 && + hdr->b_l1hdr.b_state == arc_anon) { /* * Only call arc_access on anonymous buffers. 
This is because * if we've issued an I/O for an evicted buffer, we've already @@ -5741,13 +5753,19 @@ arc_read_done(zio_t *zio) if (!acb->acb_done) continue; - /* This is a demand read since prefetches don't use callbacks */ callback_cnt++; + if (zio->io_error != 0) + continue; + int error = arc_buf_alloc_impl(hdr, zio->io_spa, acb->acb_dsobj, acb->acb_private, acb->acb_encrypted, - acb->acb_compressed, acb->acb_noauth, no_zio_error, + acb->acb_compressed, acb->acb_noauth, B_TRUE, &acb->acb_buf); + if (error != 0) { + arc_buf_destroy(acb->acb_buf, acb->acb_private); + acb->acb_buf = NULL; + } /* * Assert non-speculative zios didn't fail because an @@ -5770,9 +5788,8 @@ arc_read_done(zio_t *zio) } } - if (no_zio_error) { + if (zio->io_error == 0) zio->io_error = error; - } } hdr->b_l1hdr.b_acb = NULL; arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); @@ -5782,7 +5799,7 @@ arc_read_done(zio_t *zio) ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || callback_list != NULL); - if (no_zio_error) { + if (zio->io_error == 0) { arc_hdr_verify(hdr, zio->io_bp); } else { arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); @@ -5816,8 +5833,8 @@ arc_read_done(zio_t *zio) /* execute each callback and free its structure */ while ((acb = callback_list) != NULL) { if (acb->acb_done) { - acb->acb_done(zio, zio->io_error, acb->acb_buf, - acb->acb_private); + acb->acb_done(zio, &zio->io_bookmark, zio->io_bp, + acb->acb_buf, acb->acb_private); } if (acb->acb_zio_dummy != NULL) { @@ -5974,12 +5991,25 @@ top: arc_hdr_clear_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); } + + if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) { + ARCSTAT_BUMP( + arcstat_demand_hit_prescient_prefetch); + arc_hdr_clear_flags(hdr, + ARC_FLAG_PRESCIENT_PREFETCH); + } + ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp)); /* Get a buf with the desired data in it. 
*/ rc = arc_buf_alloc_impl(hdr, spa, zb->zb_objset, private, encrypted_read, compressed_read, noauth_read, B_TRUE, &buf); + if (rc != 0) { + arc_buf_destroy(buf, private); + buf = NULL; + } + ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) || rc == 0); } else if (*arc_flags & ARC_FLAG_PREFETCH && refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { @@ -5987,6 +6017,8 @@ top: } DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); + if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) + arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); if (*arc_flags & ARC_FLAG_L2CACHE) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); mutex_exit(hash_lock); @@ -5996,7 +6028,7 @@ top: data, metadata, hits); if (done) - done(NULL, rc, buf, private); + done(NULL, zb, bp, buf, private); } else { uint64_t lsize = BP_GET_LSIZE(bp); uint64_t psize = BP_GET_PSIZE(bp); @@ -6112,6 +6144,8 @@ top: if (*arc_flags & ARC_FLAG_PREFETCH && refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); + if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) + arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); if (*arc_flags & ARC_FLAG_L2CACHE) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); if (BP_IS_AUTHENTICATED(bp)) @@ -7223,9 +7257,15 @@ arc_tuning_update(void) if (zfs_arc_p_min_shift) arc_p_min_shift = zfs_arc_p_min_shift; - /* Valid range: 1 - N ticks */ - if (zfs_arc_min_prefetch_lifespan) - arc_min_prefetch_lifespan = zfs_arc_min_prefetch_lifespan; + /* Valid range: 1 - N ms */ + if (zfs_arc_min_prefetch_ms) + arc_min_prefetch_ms = zfs_arc_min_prefetch_ms; + + /* Valid range: 1 - N ms */ + if (zfs_arc_min_prescient_prefetch_ms) { + arc_min_prescient_prefetch_ms = + zfs_arc_min_prescient_prefetch_ms; + } /* Valid range: 0 - 100 */ if ((zfs_arc_lotsfree_percent >= 0) && @@ -7368,7 +7408,8 @@ arc_init(void) cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL); /* Convert seconds to clock ticks */ - arc_min_prefetch_lifespan = 1 * hz; + arc_min_prefetch_ms = 1; + arc_min_prescient_prefetch_ms = 6; #ifdef _KERNEL /* @@ -9006,8 +9047,12 @@ MODULE_PARM_DESC(zfs_arc_average_blocksize, "Target average block size"); module_param(zfs_compressed_arc_enabled, int, 0644); MODULE_PARM_DESC(zfs_compressed_arc_enabled, "Disable compressed arc buffers"); -module_param(zfs_arc_min_prefetch_lifespan, int, 0644); -MODULE_PARM_DESC(zfs_arc_min_prefetch_lifespan, "Min life of prefetch block"); +module_param(zfs_arc_min_prefetch_ms, int, 0644); +MODULE_PARM_DESC(zfs_arc_min_prefetch_ms, "Min life of prefetch block in ms"); + +module_param(zfs_arc_min_prescient_prefetch_ms, int, 0644); +MODULE_PARM_DESC(zfs_arc_min_prescient_prefetch_ms, + "Min life of prescient prefetched block in ms"); module_param(l2arc_write_max, ulong, 0644); MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval"); diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 64c1a68af..190d0656a 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -973,7 +973,8 @@ dbuf_whichblock(const dnode_t *dn, const int64_t level, const uint64_t offset) } static void -dbuf_read_done(zio_t *zio, int err, arc_buf_t *buf, void *vdb) +dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, + arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; @@ -987,19 +988,22 @@ dbuf_read_done(zio_t *zio, int err, arc_buf_t *buf, void *vdb) ASSERT(db->db.db_data == NULL); if (db->db_level == 0 && db->db_freed_in_flight) { /* we were freed in flight; disregard any error */ + if (buf == NULL) { + buf = arc_alloc_buf(db->db_objset->os_spa, + db, 
DBUF_GET_BUFC_TYPE(db), db->db.db_size); + } arc_release(buf, db); bzero(buf->b_data, db->db.db_size); arc_buf_freeze(buf); db->db_freed_in_flight = FALSE; dbuf_set_data(db, buf); db->db_state = DB_CACHED; - } else if (err == 0) { + } else if (buf != NULL) { dbuf_set_data(db, buf); db->db_state = DB_CACHED; } else { ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT3P(db->db_buf, ==, NULL); - arc_buf_destroy(buf, db); db->db_state = DB_UNCACHED; } cv_broadcast(&db->db_changed); @@ -2512,7 +2516,8 @@ dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp) * prefetch if the next block down is our target. */ static void -dbuf_prefetch_indirect_done(zio_t *zio, int err, arc_buf_t *abuf, void *private) +dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, + const blkptr_t *iobp, arc_buf_t *abuf, void *private) { dbuf_prefetch_arg_t *dpa = private; @@ -2551,13 +2556,18 @@ dbuf_prefetch_indirect_done(zio_t *zio, int err, arc_buf_t *abuf, void *private) dbuf_rele(db, FTAG); } - dpa->dpa_curlevel--; + if (abuf == NULL) { + kmem_free(dpa, sizeof (*dpa)); + return; + } + dpa->dpa_curlevel--; uint64_t nextblkid = dpa->dpa_zb.zb_blkid >> (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level)); blkptr_t *bp = ((blkptr_t *)abuf->b_data) + P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs); - if (BP_IS_HOLE(bp) || err != 0) { + + if (BP_IS_HOLE(bp)) { kmem_free(dpa, sizeof (*dpa)); } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) { ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid); diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index 00b0a0b9e..24516834f 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -1172,14 +1172,26 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) void ddt_sync(spa_t *spa, uint64_t txg) { + dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; dmu_tx_t *tx; - zio_t *rio = zio_root(spa, NULL, NULL, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); + zio_t *rio; ASSERT(spa_syncing_txg(spa) == txg); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + rio = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); + + /* + * This function may cause an immediate scan of ddt blocks (see + * the comment above dsl_scan_ddt() for details). We set the + * scan's root zio here so that we can wait for any scan IOs in + * addition to the regular ddt IOs. 
+ */ + ASSERT3P(scn->scn_zio_root, ==, NULL); + scn->scn_zio_root = rio; + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; if (ddt == NULL) @@ -1189,6 +1201,7 @@ ddt_sync(spa_t *spa, uint64_t txg) } (void) zio_wait(rio); + scn->scn_zio_root = NULL; dmu_tx_commit(tx); } diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index 64e7d2f77..280e0ee34 100644 --- a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -520,7 +520,8 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, { prefetch_data_t *pfd = arg; int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; - arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; + arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH | + ARC_FLAG_PRESCIENT_PREFETCH; ASSERT(pfd->pd_bytes_fetched >= 0); if (bp == NULL) diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 43fd90861..86863fad8 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -390,8 +390,10 @@ dsl_pool_close(dsl_pool_t *dp) mutex_destroy(&dp->dp_lock); cv_destroy(&dp->dp_spaceavail_cv); taskq_destroy(dp->dp_iput_taskq); - if (dp->dp_blkstats) + if (dp->dp_blkstats) { + mutex_destroy(&dp->dp_blkstats->zab_lock); vmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); + } kmem_free(dp, sizeof (dsl_pool_t)); } diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index b0aec5332..52c700f11 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -50,33 +50,141 @@ #include <sys/sa_impl.h> #include <sys/zfeature.h> #include <sys/abd.h> +#include <sys/range_tree.h> #ifdef _KERNEL #include <sys/zfs_vfsops.h> #endif +/* + * Grand theory statement on scan queue sorting + * + * Scanning is implemented by recursively traversing all indirection levels + * in an object and reading all blocks referenced from said objects. This + * results in us approximately traversing the object from lowest logical + * offset to the highest. For best performance, we would want the logical + * blocks to be physically contiguous. However, this is frequently not the + * case with pools given the allocation patterns of copy-on-write filesystems. + * So instead, we put the I/Os into a reordering queue and issue them in a + * way that will most benefit physical disks (LBA-order). + * + * Queue management: + * + * Ideally, we would want to scan all metadata and queue up all block I/O + * prior to starting to issue it, because that allows us to do an optimal + * sorting job. This can however consume large amounts of memory. Therefore + * we continuously monitor the size of the queues and constrain them to 5% + * (zfs_scan_mem_lim_fact) of physmem. If the queues grow larger than this + * limit, we clear out a few of the largest extents at the head of the queues + * to make room for more scanning. Hopefully, these extents will be fairly + * large and contiguous, allowing us to approach sequential I/O throughput + * even without a fully sorted tree. + * + * Metadata scanning takes place in dsl_scan_visit(), which is called from + * dsl_scan_sync() every spa_sync(). If we have either fully scanned all + * metadata on the pool, or we need to make room in memory because our + * queues are too large, dsl_scan_visit() is postponed and + * scan_io_queues_run() is called from dsl_scan_sync() instead. This implies + * that metadata scanning and queued I/O issuing are mutually exclusive. 
This + * allows us to provide maximum sequential I/O throughput for the majority of + * I/O's issued since sequential I/O performance is significantly negatively + * impacted if it is interleaved with random I/O. + * + * Implementation Notes + * + * One side effect of the queued scanning algorithm is that the scanning code + * needs to be notified whenever a block is freed. This is needed to allow + * the scanning code to remove these I/Os from the issuing queue. Additionally, + * we do not attempt to queue gang blocks to be issued sequentially since this + * is very hard to do and would have an extremely limitted performance benefit. + * Instead, we simply issue gang I/Os as soon as we find them using the legacy + * algorithm. + * + * Backwards compatibility + * + * This new algorithm is backwards compatible with the legacy on-disk data + * structures (and therefore does not require a new feature flag). + * Periodically during scanning (see zfs_scan_checkpoint_intval), the scan + * will stop scanning metadata (in logical order) and wait for all outstanding + * sorted I/O to complete. Once this is done, we write out a checkpoint + * bookmark, indicating that we have scanned everything logically before it. + * If the pool is imported on a machine without the new sorting algorithm, + * the scan simply resumes from the last checkpoint using the legacy algorithm. + */ + typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_phys_t *); static scan_cb_t dsl_scan_scrub_cb; -static void dsl_scan_cancel_sync(void *, dmu_tx_t *); -static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *); -static boolean_t dsl_scan_restarting(dsl_scan_t *, dmu_tx_t *); -int zfs_top_maxinflight = 32; /* maximum I/Os per top-level */ -int zfs_resilver_delay = 2; /* number of ticks to delay resilver */ -int zfs_scrub_delay = 4; /* number of ticks to delay scrub */ -int zfs_scan_idle = 50; /* idle window in clock ticks */ +static int scan_ds_queue_compare(const void *a, const void *b); +static int scan_prefetch_queue_compare(const void *a, const void *b); +static void scan_ds_queue_clear(dsl_scan_t *scn); +static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, + uint64_t *txg); +static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg); +static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj); +static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx); + +extern int zfs_vdev_async_write_active_min_dirty_percent; + +/* + * By default zfs will check to ensure it is not over the hard memory + * limit before each txg. If finer-grained control of this is needed + * this value can be set to 1 to enable checking before scanning each + * block. + */ +int zfs_scan_strict_mem_lim = B_FALSE; + +/* + * Maximum number of parallelly executed bytes per leaf vdev. We attempt + * to strike a balance here between keeping the vdev queues full of I/Os + * at all times and not overflowing the queues to cause long latency, + * which would cause long txg sync times. No matter what, we will not + * overload the drives with I/O, since that is protected by + * zfs_vdev_scrub_max_active. + */ +unsigned long zfs_scan_vdev_limit = 4 << 20; + +int zfs_scan_issue_strategy = 0; +int zfs_scan_legacy = B_FALSE; /* don't queue & sort zios, go direct */ +uint64_t zfs_scan_max_ext_gap = 2 << 20; /* in bytes */ + +/* + * fill_weight is non-tunable at runtime, so we copy it at module init from + * zfs_scan_fill_weight. 
Runtime adjustments to zfs_scan_fill_weight would + * break queue sorting. + */ +int zfs_scan_fill_weight = 3; +static uint64_t fill_weight; + +/* See dsl_scan_should_clear() for details on the memory limit tunables */ +uint64_t zfs_scan_mem_lim_min = 16 << 20; /* bytes */ +uint64_t zfs_scan_mem_lim_soft_max = 128 << 20; /* bytes */ +int zfs_scan_mem_lim_fact = 20; /* fraction of physmem */ +int zfs_scan_mem_lim_soft_fact = 20; /* fraction of mem lim above */ -int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */ +int zfs_scrub_min_time_ms = 1000; /* min millisecs to scrub per txg */ int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */ int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */ +int zfs_scan_checkpoint_intval = 7200; /* in seconds */ int zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ int zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */ enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; -int dsl_scan_delay_completion = B_FALSE; /* set to delay scan completion */ /* max number of blocks to free in a single TXG */ unsigned long zfs_free_max_blocks = 100000; +/* + * We wait a few txgs after importing a pool to begin scanning so that + * the import / mounting code isn't held up by scrub / resilver IO. + * Unfortunately, it is a bit difficult to determine exactly how long + * this will take since userspace will trigger fs mounts asynchronously + * and the kernel will create zvol minors asynchronously. As a result, + * the value provided here is a bit arbitrary, but represents a + * reasonable estimate of how many txgs it will take to finish fully + * importing a pool + */ +#define SCAN_IMPORT_WAIT_TXGS 5 + #define DSL_SCAN_IS_SCRUB_RESILVER(scn) \ ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \ (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER) @@ -93,6 +201,163 @@ static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = { dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */ }; +/* In core node for the scn->scn_queue. Represents a dataset to be scanned */ +typedef struct { + uint64_t sds_dsobj; + uint64_t sds_txg; + avl_node_t sds_node; +} scan_ds_t; + +/* + * This controls what conditions are placed on dsl_scan_sync_state(): + * SYNC_OPTIONAL) write out scn_phys iff scn_bytes_pending == 0 + * SYNC_MANDATORY) write out scn_phys always. scn_bytes_pending must be 0. + * SYNC_CACHED) if scn_bytes_pending == 0, write out scn_phys. Otherwise + * write out the scn_phys_cached version. + * See dsl_scan_sync_state for details. + */ +typedef enum { + SYNC_OPTIONAL, + SYNC_MANDATORY, + SYNC_CACHED +} state_sync_type_t; + +/* + * This struct represents the minimum information needed to reconstruct a + * zio for sequential scanning. This is useful because many of these will + * accumulate in the sequential IO queues before being issued, so saving + * memory matters here. 
+ */ +typedef struct scan_io { + /* fields from blkptr_t */ + uint64_t sio_offset; + uint64_t sio_blk_prop; + uint64_t sio_phys_birth; + uint64_t sio_birth; + zio_cksum_t sio_cksum; + uint32_t sio_asize; + + /* fields from zio_t */ + int sio_flags; + zbookmark_phys_t sio_zb; + + /* members for queue sorting */ + union { + avl_node_t sio_addr_node; /* link into issueing queue */ + list_node_t sio_list_node; /* link for issuing to disk */ + } sio_nodes; +} scan_io_t; + +struct dsl_scan_io_queue { + dsl_scan_t *q_scn; /* associated dsl_scan_t */ + vdev_t *q_vd; /* top-level vdev that this queue represents */ + + /* trees used for sorting I/Os and extents of I/Os */ + range_tree_t *q_exts_by_addr; + avl_tree_t q_exts_by_size; + avl_tree_t q_sios_by_addr; + + /* members for zio rate limiting */ + uint64_t q_maxinflight_bytes; + uint64_t q_inflight_bytes; + kcondvar_t q_zio_cv; /* used under vd->vdev_scan_io_queue_lock */ + + /* per txg statistics */ + uint64_t q_total_seg_size_this_txg; + uint64_t q_segs_this_txg; + uint64_t q_total_zio_size_this_txg; + uint64_t q_zios_this_txg; +}; + +/* private data for dsl_scan_prefetch_cb() */ +typedef struct scan_prefetch_ctx { + refcount_t spc_refcnt; /* refcount for memory management */ + dsl_scan_t *spc_scn; /* dsl_scan_t for the pool */ + boolean_t spc_root; /* is this prefetch for an objset? */ + uint8_t spc_indblkshift; /* dn_indblkshift of current dnode */ + uint16_t spc_datablkszsec; /* dn_idatablkszsec of current dnode */ +} scan_prefetch_ctx_t; + +/* private data for dsl_scan_prefetch() */ +typedef struct scan_prefetch_issue_ctx { + avl_node_t spic_avl_node; /* link into scn->scn_prefetch_queue */ + scan_prefetch_ctx_t *spic_spc; /* spc for the callback */ + blkptr_t spic_bp; /* bp to prefetch */ + zbookmark_phys_t spic_zb; /* bookmark to prefetch */ +} scan_prefetch_issue_ctx_t; + +static void scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, + const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue); +static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, + scan_io_t *sio); + +static dsl_scan_io_queue_t *scan_io_queue_create(vdev_t *vd); +static void scan_io_queues_destroy(dsl_scan_t *scn); + +static kmem_cache_t *sio_cache; + +void +scan_init(void) +{ + /* + * This is used in ext_size_compare() to weight segments + * based on how sparse they are. This cannot be changed + * mid-scan and the tree comparison functions don't currently + * have a mechansim for passing additional context to the + * compare functions. 
Thus we store this value globally and + * we only allow it to be set at module intiailization time + */ + fill_weight = zfs_scan_fill_weight; + + sio_cache = kmem_cache_create("sio_cache", + sizeof (scan_io_t), 0, NULL, NULL, NULL, NULL, NULL, 0); +} + +void +scan_fini(void) +{ + kmem_cache_destroy(sio_cache); +} + +static inline boolean_t +dsl_scan_is_running(const dsl_scan_t *scn) +{ + return (scn->scn_phys.scn_state == DSS_SCANNING); +} + +boolean_t +dsl_scan_resilvering(dsl_pool_t *dp) +{ + return (dsl_scan_is_running(dp->dp_scan) && + dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER); +} + +static inline void +sio2bp(const scan_io_t *sio, blkptr_t *bp, uint64_t vdev_id) +{ + bzero(bp, sizeof (*bp)); + DVA_SET_ASIZE(&bp->blk_dva[0], sio->sio_asize); + DVA_SET_VDEV(&bp->blk_dva[0], vdev_id); + DVA_SET_OFFSET(&bp->blk_dva[0], sio->sio_offset); + bp->blk_prop = sio->sio_blk_prop; + bp->blk_phys_birth = sio->sio_phys_birth; + bp->blk_birth = sio->sio_birth; + bp->blk_fill = 1; /* we always only work with data pointers */ + bp->blk_cksum = sio->sio_cksum; +} + +static inline void +bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i) +{ + /* we discard the vdev id, since we can deduce it from the queue */ + sio->sio_offset = DVA_GET_OFFSET(&bp->blk_dva[dva_i]); + sio->sio_asize = DVA_GET_ASIZE(&bp->blk_dva[dva_i]); + sio->sio_blk_prop = bp->blk_prop; + sio->sio_phys_birth = bp->blk_phys_birth; + sio->sio_birth = bp->blk_birth; + sio->sio_cksum = bp->blk_cksum; +} + int dsl_scan_init(dsl_pool_t *dp, uint64_t txg) { @@ -113,6 +378,13 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY); + bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys)); + avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t), + offsetof(scan_ds_t, sds_node)); + avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare, + sizeof (scan_prefetch_issue_ctx_t), + offsetof(scan_prefetch_issue_ctx_t, spic_avl_node)); + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, "scrub_func", sizeof (uint64_t), 1, &f); if (err == 0) { @@ -123,7 +395,7 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) scn->scn_restart_txg = txg; zfs_dbgmsg("old-style scrub was in progress; " "restarting new-style scrub in txg %llu", - scn->scn_restart_txg); + (longlong_t)scn->scn_restart_txg); /* * Load the queue obj from the old location so that it @@ -157,7 +429,7 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) scn->scn_async_destroying) { spa->spa_errata = ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY; - return (SET_ERROR(EOVERFLOW)); + return (EOVERFLOW); } bcopy(zaptmp, &scn->scn_phys, @@ -177,7 +449,14 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) else if (err) return (err); - if (scn->scn_phys.scn_state == DSS_SCANNING && + /* + * We might be restarting after a reboot, so jump the issued + * counter to how far we've scanned. We know we're consistent + * up to here. 
+ */ + scn->scn_issued_before_pass = scn->scn_phys.scn_examined; + + if (dsl_scan_is_running(scn) && spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) { /* * A new-type scrub was in progress on an old @@ -189,8 +468,24 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) scn->scn_restart_txg = txg; zfs_dbgmsg("new-style scrub was modified " "by old software; restarting in txg %llu", - scn->scn_restart_txg); + (longlong_t)scn->scn_restart_txg); + } + } + + /* reload the queue into the in-core state */ + if (scn->scn_phys.scn_queue_obj != 0) { + zap_cursor_t zc; + zap_attribute_t za; + + for (zap_cursor_init(&zc, dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj); + zap_cursor_retrieve(&zc, &za) == 0; + (void) zap_cursor_advance(&zc)) { + scan_ds_queue_insert(scn, + zfs_strtonum(za.za_name, NULL), + za.za_first_integer); } + zap_cursor_fini(&zc); } spa_scan_stat_init(spa); @@ -200,19 +495,116 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) void dsl_scan_fini(dsl_pool_t *dp) { - if (dp->dp_scan) { + if (dp->dp_scan != NULL) { + dsl_scan_t *scn = dp->dp_scan; + + if (scn->scn_taskq != NULL) + taskq_destroy(scn->scn_taskq); + scan_ds_queue_clear(scn); + avl_destroy(&scn->scn_queue); + avl_destroy(&scn->scn_prefetch_queue); + kmem_free(dp->dp_scan, sizeof (dsl_scan_t)); dp->dp_scan = NULL; } } +static boolean_t +dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx) +{ + return (scn->scn_restart_txg != 0 && + scn->scn_restart_txg <= tx->tx_txg); +} + +boolean_t +dsl_scan_scrubbing(const dsl_pool_t *dp) +{ + dsl_scan_phys_t *scn_phys = &dp->dp_scan->scn_phys; + + return (scn_phys->scn_state == DSS_SCANNING && + scn_phys->scn_func == POOL_SCAN_SCRUB); +} + +boolean_t +dsl_scan_is_paused_scrub(const dsl_scan_t *scn) +{ + return (dsl_scan_scrubbing(scn->scn_dp) && + scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED); +} + +/* + * Writes out a persistent dsl_scan_phys_t record to the pool directory. + * Because we can be running in the block sorting algorithm, we do not always + * want to write out the record, only when it is "safe" to do so. This safety + * condition is achieved by making sure that the sorting queues are empty + * (scn_bytes_pending == 0). When this condition is not true, the sync'd state + * is inconsistent with how much actual scanning progress has been made. The + * kind of sync to be performed is specified by the sync_type argument. If the + * sync is optional, we only sync if the queues are empty. If the sync is + * mandatory, we do a hard ASSERT to make sure that the queues are empty. The + * third possible state is a "cached" sync. This is done in response to: + * 1) The dataset that was in the last sync'd dsl_scan_phys_t having been + * destroyed, so we wouldn't be able to restart scanning from it. + * 2) The snapshot that was in the last sync'd dsl_scan_phys_t having been + * superseded by a newer snapshot. + * 3) The dataset that was in the last sync'd dsl_scan_phys_t having been + * swapped with its clone. + * In all cases, a cached sync simply rewrites the last record we've written, + * just slightly modified. For the modifications that are performed to the + * last written dsl_scan_phys_t, see dsl_scan_ds_destroyed, + * dsl_scan_ds_snapshotted and dsl_scan_ds_clone_swapped. 
+ */ +static void +dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type) +{ + int i; + spa_t *spa = scn->scn_dp->dp_spa; + + ASSERT(sync_type != SYNC_MANDATORY || scn->scn_bytes_pending == 0); + if (scn->scn_bytes_pending == 0) { + for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) { + vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; + dsl_scan_io_queue_t *q = vd->vdev_scan_io_queue; + + if (q == NULL) + continue; + + mutex_enter(&vd->vdev_scan_io_queue_lock); + ASSERT3P(avl_first(&q->q_sios_by_addr), ==, NULL); + ASSERT3P(avl_first(&q->q_exts_by_size), ==, NULL); + ASSERT3P(range_tree_first(q->q_exts_by_addr), ==, NULL); + mutex_exit(&vd->vdev_scan_io_queue_lock); + } + + if (scn->scn_phys.scn_queue_obj != 0) + scan_ds_queue_sync(scn, tx); + VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, + &scn->scn_phys, tx)); + bcopy(&scn->scn_phys, &scn->scn_phys_cached, + sizeof (scn->scn_phys)); + + if (scn->scn_checkpointing) + zfs_dbgmsg("finish scan checkpoint"); + + scn->scn_checkpointing = B_FALSE; + scn->scn_last_checkpoint = ddi_get_lbolt(); + } else if (sync_type == SYNC_CACHED) { + VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, + &scn->scn_phys_cached, tx)); + } +} + /* ARGSUSED */ static int dsl_scan_setup_check(void *arg, dmu_tx_t *tx) { dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; - if (scn->scn_phys.scn_state == DSS_SCANNING) + if (dsl_scan_is_running(scn)) return (SET_ERROR(EBUSY)); return (0); @@ -227,7 +619,7 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) dsl_pool_t *dp = scn->scn_dp; spa_t *spa = dp->dp_spa; - ASSERT(scn->scn_phys.scn_state != DSS_SCANNING); + ASSERT(!dsl_scan_is_running(scn)); ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS); bzero(&scn->scn_phys, sizeof (scn->scn_phys)); scn->scn_phys.scn_func = *funcp; @@ -238,8 +630,11 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) scn->scn_phys.scn_start_time = gethrestime_sec(); scn->scn_phys.scn_errors = 0; scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc; + scn->scn_issued_before_pass = 0; scn->scn_restart_txg = 0; scn->scn_done_txg = 0; + scn->scn_last_checkpoint = 0; + scn->scn_checkpointing = B_FALSE; spa_scan_stat_init(spa); if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { @@ -272,8 +667,10 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) if (dp->dp_blkstats == NULL) { dp->dp_blkstats = vmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP); + mutex_init(&dp->dp_blkstats->zab_lock, NULL, + MUTEX_DEFAULT, NULL); } - bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); + bzero(&dp->dp_blkstats->zab_type, sizeof (dp->dp_blkstats->zab_type)); if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) ot = DMU_OT_ZAP_OTHER; @@ -281,13 +678,52 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx); - dsl_scan_sync_state(scn, tx); + bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys)); + + dsl_scan_sync_state(scn, tx, SYNC_MANDATORY); spa_history_log_internal(spa, "scan setup", tx, "func=%u mintxg=%llu maxtxg=%llu", *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg); } +/* + * Called by the ZFS_IOC_POOL_SCAN ioctl to start a scrub or resilver. + * Can also be called to resume a paused scrub. 
+ */ +int +dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) +{ + spa_t *spa = dp->dp_spa; + dsl_scan_t *scn = dp->dp_scan; + + /* + * Purge all vdev caches and probe all devices. We do this here + * rather than in sync context because this requires a writer lock + * on the spa_config lock, which we can't do from sync context. The + * spa_scrub_reopen flag indicates that vdev_open() should not + * attempt to start another scrub. + */ + spa_vdev_state_enter(spa, SCL_NONE); + spa->spa_scrub_reopen = B_TRUE; + vdev_reopen(spa->spa_root_vdev); + spa->spa_scrub_reopen = B_FALSE; + (void) spa_vdev_state_exit(spa, NULL, 0); + + if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) { + /* got scrub start cmd, resume paused scrub */ + int err = dsl_scrub_set_pause_resume(scn->scn_dp, + POOL_SCRUB_NORMAL); + if (err == 0) + return (ECANCELED); + + return (SET_ERROR(err)); + } + + return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check, + dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_NONE)); +} + /* ARGSUSED */ static void dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) @@ -315,10 +751,11 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) } if (scn->scn_phys.scn_queue_obj != 0) { - VERIFY(0 == dmu_object_free(dp->dp_meta_objset, + VERIFY0(dmu_object_free(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, tx)); scn->scn_phys.scn_queue_obj = 0; } + scan_ds_queue_clear(scn); scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED; @@ -326,13 +763,22 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) * If we were "restarted" from a stopped state, don't bother * with anything else. */ - if (scn->scn_phys.scn_state != DSS_SCANNING) + if (!dsl_scan_is_running(scn)) { + ASSERT(!scn->scn_is_sorted); return; + } - if (complete) - scn->scn_phys.scn_state = DSS_FINISHED; - else - scn->scn_phys.scn_state = DSS_CANCELED; + if (scn->scn_is_sorted) { + scan_io_queues_destroy(scn); + scn->scn_is_sorted = B_FALSE; + + if (scn->scn_taskq != NULL) { + taskq_destroy(scn->scn_taskq); + scn->scn_taskq = NULL; + } + } + + scn->scn_phys.scn_state = complete ? 
DSS_FINISHED : DSS_CANCELED; if (dsl_scan_restarting(scn, tx)) spa_history_log_internal(spa, "scan aborted, restarting", tx, @@ -345,12 +791,6 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) "errors=%llu", spa_get_errlog_size(spa)); if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { - mutex_enter(&spa->spa_scrub_lock); - while (spa->spa_scrub_inflight > 0) { - cv_wait(&spa->spa_scrub_io_cv, - &spa->spa_scrub_lock); - } - mutex_exit(&spa->spa_scrub_lock); spa->spa_scrub_started = B_FALSE; spa->spa_scrub_active = B_FALSE; @@ -379,6 +819,8 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) if (spa->spa_errata == ZPOOL_ERRATA_ZOL_2094_SCRUB) spa->spa_errata = 0; + + ASSERT(!dsl_scan_is_running(scn)); } /* ARGSUSED */ @@ -387,7 +829,7 @@ dsl_scan_cancel_check(void *arg, dmu_tx_t *tx) { dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; - if (scn->scn_phys.scn_state != DSS_SCANNING) + if (!dsl_scan_is_running(scn)) return (SET_ERROR(ENOENT)); return (0); } @@ -399,7 +841,7 @@ dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx) dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; dsl_scan_done(scn, B_FALSE, tx); - dsl_scan_sync_state(scn, tx); + dsl_scan_sync_state(scn, tx, SYNC_MANDATORY); } int @@ -409,16 +851,6 @@ dsl_scan_cancel(dsl_pool_t *dp) dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED)); } -boolean_t -dsl_scan_is_paused_scrub(const dsl_scan_t *scn) -{ - if (dsl_scan_scrubbing(scn->scn_dp) && - scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED) - return (B_TRUE); - - return (B_FALSE); -} - static int dsl_scrub_pause_resume_check(void *arg, dmu_tx_t *tx) { @@ -453,7 +885,7 @@ dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx) /* can't pause a scrub when there is no in-progress scrub */ spa->spa_scan_pass_scrub_pause = gethrestime_sec(); scn->scn_phys.scn_flags |= DSF_SCRUB_PAUSED; - dsl_scan_sync_state(scn, tx); + dsl_scan_sync_state(scn, tx, SYNC_CACHED); } else { ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL); if (dsl_scan_is_paused_scrub(scn)) { @@ -466,7 +898,7 @@ dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx) gethrestime_sec() - spa->spa_scan_pass_scrub_pause; spa->spa_scan_pass_scrub_pause = 0; scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED; - dsl_scan_sync_state(scn, tx); + dsl_scan_sync_state(scn, tx, SYNC_CACHED); } } } @@ -482,25 +914,25 @@ dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd) ZFS_SPACE_CHECK_RESERVED)); } -boolean_t -dsl_scan_scrubbing(const dsl_pool_t *dp) -{ - dsl_scan_t *scn = dp->dp_scan; - if (scn->scn_phys.scn_state == DSS_SCANNING && - scn->scn_phys.scn_func == POOL_SCAN_SCRUB) - return (B_TRUE); +/* start a new scan, or restart an existing one. 
*/ +void +dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg) +{ + if (txg == 0) { + dmu_tx_t *tx; + tx = dmu_tx_create_dd(dp->dp_mos_dir); + VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT)); - return (B_FALSE); + txg = dmu_tx_get_txg(tx); + dp->dp_scan->scn_restart_txg = txg; + dmu_tx_commit(tx); + } else { + dp->dp_scan->scn_restart_txg = txg; + } + zfs_dbgmsg("restarting resilver txg=%llu", (longlong_t)txg); } -static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, - dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn, - dmu_objset_type_t ostype, dmu_tx_t *tx); -inline __attribute__((always_inline)) static void dsl_scan_visitdnode( - dsl_scan_t *, dsl_dataset_t *ds, dmu_objset_type_t ostype, - dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx); - void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp) { @@ -514,25 +946,169 @@ dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp) zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags)); } -static uint64_t -dsl_scan_ds_maxtxg(dsl_dataset_t *ds) +static int +scan_ds_queue_compare(const void *a, const void *b) { - uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg; - if (ds->ds_is_snapshot) - return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg)); - return (smt); + const scan_ds_t *sds_a = a, *sds_b = b; + + if (sds_a->sds_dsobj < sds_b->sds_dsobj) + return (-1); + if (sds_a->sds_dsobj == sds_b->sds_dsobj) + return (0); + return (1); } static void -dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx) +scan_ds_queue_clear(dsl_scan_t *scn) { - VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, - &scn->scn_phys, tx)); + void *cookie = NULL; + scan_ds_t *sds; + while ((sds = avl_destroy_nodes(&scn->scn_queue, &cookie)) != NULL) { + kmem_free(sds, sizeof (*sds)); + } } -extern int zfs_vdev_async_write_active_min_dirty_percent; +static boolean_t +scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, uint64_t *txg) +{ + scan_ds_t srch, *sds; + + srch.sds_dsobj = dsobj; + sds = avl_find(&scn->scn_queue, &srch, NULL); + if (sds != NULL && txg != NULL) + *txg = sds->sds_txg; + return (sds != NULL); +} + +static void +scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg) +{ + scan_ds_t *sds; + avl_index_t where; + + sds = kmem_zalloc(sizeof (*sds), KM_SLEEP); + sds->sds_dsobj = dsobj; + sds->sds_txg = txg; + + VERIFY3P(avl_find(&scn->scn_queue, sds, &where), ==, NULL); + avl_insert(&scn->scn_queue, sds, where); +} + +static void +scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj) +{ + scan_ds_t srch, *sds; + + srch.sds_dsobj = dsobj; + + sds = avl_find(&scn->scn_queue, &srch, NULL); + VERIFY(sds != NULL); + avl_remove(&scn->scn_queue, sds); + kmem_free(sds, sizeof (*sds)); +} + +static void +scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx) +{ + dsl_pool_t *dp = scn->scn_dp; + spa_t *spa = dp->dp_spa; + dmu_object_type_t ot = (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) ? 
+ DMU_OT_SCAN_QUEUE : DMU_OT_ZAP_OTHER; + + ASSERT0(scn->scn_bytes_pending); + ASSERT(scn->scn_phys.scn_queue_obj != 0); + + VERIFY0(dmu_object_free(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, tx)); + scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, ot, + DMU_OT_NONE, 0, tx); + for (scan_ds_t *sds = avl_first(&scn->scn_queue); + sds != NULL; sds = AVL_NEXT(&scn->scn_queue, sds)) { + VERIFY0(zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, sds->sds_dsobj, + sds->sds_txg, tx)); + } +} + +/* + * Computes the memory limit state that we're currently in. A sorted scan + * needs quite a bit of memory to hold the sorting queue, so we need to + * reasonably constrain the size so it doesn't impact overall system + * performance. We compute two limits: + * 1) Hard memory limit: if the amount of memory used by the sorting + * queues on a pool gets above this value, we stop the metadata + * scanning portion and start issuing the queued up and sorted + * I/Os to reduce memory usage. + * This limit is calculated as a fraction of physmem (by default 5%). + * We constrain the lower bound of the hard limit to an absolute + * minimum of zfs_scan_mem_lim_min (default: 16 MiB). We also constrain + * the upper bound to 5% of the total pool size - no chance we'll + * ever need that much memory, but just to keep the value in check. + * 2) Soft memory limit: once we hit the hard memory limit, we start + * issuing I/O to reduce queue memory usage, but we don't want to + * completely empty out the queues, since we might be able to find I/Os + * that will fill in the gaps of our non-sequential IOs at some point + * in the future. So we stop the issuing of I/Os once the amount of + * memory used drops below the soft limit (at which point we stop issuing + * I/O and start scanning metadata again). + * + * This limit is calculated by subtracting a fraction of the hard + * limit from the hard limit. By default this fraction is 5%, so + * the soft limit is 95% of the hard limit. We cap the size of the + * difference between the hard and soft limits at an absolute + * maximum of zfs_scan_mem_lim_soft_max (default: 128 MiB) - this is + * sufficient to not cause too frequent switching between the + * metadata scan and I/O issue (even at 2k recordsize, 128 MiB's + * worth of queues is about 1.2 GiB of on-pool data, so scanning + * that should take at least a decent fraction of a second). 
+ */ +static boolean_t +dsl_scan_should_clear(dsl_scan_t *scn) +{ + vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev; + uint64_t mlim_hard, mlim_soft, mused; + uint64_t alloc = metaslab_class_get_alloc(spa_normal_class( + scn->scn_dp->dp_spa)); + + mlim_hard = MAX((physmem / zfs_scan_mem_lim_fact) * PAGESIZE, + zfs_scan_mem_lim_min); + mlim_hard = MIN(mlim_hard, alloc / 20); + mlim_soft = mlim_hard - MIN(mlim_hard / zfs_scan_mem_lim_soft_fact, + zfs_scan_mem_lim_soft_max); + mused = 0; + for (uint64_t i = 0; i < rvd->vdev_children; i++) { + vdev_t *tvd = rvd->vdev_child[i]; + dsl_scan_io_queue_t *queue; + + mutex_enter(&tvd->vdev_scan_io_queue_lock); + queue = tvd->vdev_scan_io_queue; + if (queue != NULL) { + /* #extents in exts_by_size = # in exts_by_addr */ + mused += avl_numnodes(&queue->q_exts_by_size) * + sizeof (range_seg_t) + + avl_numnodes(&queue->q_sios_by_addr) * + sizeof (scan_io_t); + } + mutex_exit(&tvd->vdev_scan_io_queue_lock); + } + + dprintf("current scan memory usage: %llu bytes\n", (longlong_t)mused); + + if (mused == 0) + ASSERT0(scn->scn_bytes_pending); + + /* + * If we are above our hard limit, we need to clear out memory. + * If we are below our soft limit, we need to accumulate sequential IOs. + * Otherwise, we should keep doing whatever we are currently doing. + */ + if (mused >= mlim_hard) + return (B_TRUE); + else if (mused < mlim_soft) + return (B_FALSE); + else + return (scn->scn_clearing); +} static boolean_t dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb) @@ -553,27 +1129,32 @@ dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb) /* * We suspend if: - * - we have scanned for the maximum time: an entire txg - * timeout (default 5 sec) - * or * - we have scanned for at least the minimum time (default 1 sec * for scrub, 3 sec for resilver), and either we have sufficient * dirty data that we are starting to write more quickly - * (default 30%), or someone is explicitly waiting for this txg - * to complete. + * (default 30%), someone is explicitly waiting for this txg + * to complete, or we have used up all of the time in the txg + * timeout (default 5 sec). * or * - the spa is shutting down because this pool is being exported * or the machine is rebooting. + * or + * - the scan queue has reached its memory use limit */ - int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? - zfs_resilver_min_time_ms : zfs_scan_min_time_ms; - uint64_t elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; + uint64_t curr_time_ns = gethrtime(); + uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time; + uint64_t sync_time_ns = curr_time_ns - + scn->scn_dp->dp_spa->spa_sync_starttime; int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max; - if (elapsed_nanosecs / NANOSEC >= zfs_txg_timeout || - (NSEC2MSEC(elapsed_nanosecs) > mintime && - (txg_sync_waiting(scn->scn_dp) || - dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent)) || - spa_shutting_down(scn->scn_dp->dp_spa)) { + int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? 
+ zfs_resilver_min_time_ms : zfs_scrub_min_time_ms; + + if ((NSEC2MSEC(scan_time_ns) > mintime && + (dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent || + txg_sync_waiting(scn->scn_dp) || + NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) || + spa_shutting_down(scn->scn_dp->dp_spa) || + (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn))) { if (zb) { dprintf("suspending at bookmark %llx/%llx/%llx/%llx\n", (longlong_t)zb->zb_objset, @@ -581,12 +1162,16 @@ dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb) (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid); scn->scn_phys.scn_bookmark = *zb; + } else { + dsl_scan_phys_t *scnp = &scn->scn_phys; + + dprintf("suspending at at DDT bookmark " + "%llx/%llx/%llx/%llx\n", + (longlong_t)scnp->scn_ddt_bookmark.ddb_class, + (longlong_t)scnp->scn_ddt_bookmark.ddb_type, + (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum, + (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor); } - dprintf("suspending at DDT bookmark %llx/%llx/%llx/%llx\n", - (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class, - (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type, - (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum, - (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor); scn->scn_suspending = B_TRUE; return (B_TRUE); } @@ -683,32 +1268,283 @@ dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh) zil_free(zilog); } -/* ARGSUSED */ +/* + * We compare scan_prefetch_issue_ctx_t's based on their bookmarks. The idea + * here is to sort the AVL tree by the order each block will be needed. + */ +static int +scan_prefetch_queue_compare(const void *a, const void *b) +{ + const scan_prefetch_issue_ctx_t *spic_a = a, *spic_b = b; + const scan_prefetch_ctx_t *spc_a = spic_a->spic_spc; + const scan_prefetch_ctx_t *spc_b = spic_b->spic_spc; + + return (zbookmark_compare(spc_a->spc_datablkszsec, + spc_a->spc_indblkshift, spc_b->spc_datablkszsec, + spc_b->spc_indblkshift, &spic_a->spic_zb, &spic_b->spic_zb)); +} + static void -dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp, - uint64_t objset, uint64_t object, uint64_t blkid) +scan_prefetch_ctx_rele(scan_prefetch_ctx_t *spc, void *tag) { - zbookmark_phys_t czb; - arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; - int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD; + if (refcount_remove(&spc->spc_refcnt, tag) == 0) { + refcount_destroy(&spc->spc_refcnt); + kmem_free(spc, sizeof (scan_prefetch_ctx_t)); + } +} + +static scan_prefetch_ctx_t * +scan_prefetch_ctx_create(dsl_scan_t *scn, dnode_phys_t *dnp, void *tag) +{ + scan_prefetch_ctx_t *spc; + + spc = kmem_alloc(sizeof (scan_prefetch_ctx_t), KM_SLEEP); + refcount_create(&spc->spc_refcnt); + refcount_add(&spc->spc_refcnt, tag); + spc->spc_scn = scn; + if (dnp != NULL) { + spc->spc_datablkszsec = dnp->dn_datablkszsec; + spc->spc_indblkshift = dnp->dn_indblkshift; + spc->spc_root = B_FALSE; + } else { + spc->spc_datablkszsec = 0; + spc->spc_indblkshift = 0; + spc->spc_root = B_TRUE; + } + + return (spc); +} + +static void +scan_prefetch_ctx_add_ref(scan_prefetch_ctx_t *spc, void *tag) +{ + refcount_add(&spc->spc_refcnt, tag); +} + +static boolean_t +dsl_scan_check_prefetch_resume(scan_prefetch_ctx_t *spc, + const zbookmark_phys_t *zb) +{ + zbookmark_phys_t *last_zb = &spc->spc_scn->scn_prefetch_bookmark; + dnode_phys_t tmp_dnp; + dnode_phys_t *dnp = (spc->spc_root) ? 
NULL : &tmp_dnp; + + if (zb->zb_objset != last_zb->zb_objset) + return (B_TRUE); + if ((int64_t)zb->zb_object < 0) + return (B_FALSE); + + tmp_dnp.dn_datablkszsec = spc->spc_datablkszsec; + tmp_dnp.dn_indblkshift = spc->spc_indblkshift; + + if (zbookmark_subtree_completed(dnp, zb, last_zb)) + return (B_TRUE); + + return (B_FALSE); +} + +static void +dsl_scan_prefetch(scan_prefetch_ctx_t *spc, blkptr_t *bp, zbookmark_phys_t *zb) +{ + avl_index_t idx; + dsl_scan_t *scn = spc->spc_scn; + spa_t *spa = scn->scn_dp->dp_spa; + scan_prefetch_issue_ctx_t *spic; if (zfs_no_scrub_prefetch) return; - if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_min_txg || - (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)) + if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg || + (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE && + BP_GET_TYPE(bp) != DMU_OT_OBJSET)) return; - if (BP_IS_PROTECTED(bp)) { - ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_DNODE); - ASSERT3U(BP_GET_LEVEL(bp), ==, 0); - zio_flags |= ZIO_FLAG_RAW; + if (dsl_scan_check_prefetch_resume(spc, zb)) + return; + + scan_prefetch_ctx_add_ref(spc, scn); + spic = kmem_alloc(sizeof (scan_prefetch_issue_ctx_t), KM_SLEEP); + spic->spic_spc = spc; + spic->spic_bp = *bp; + spic->spic_zb = *zb; + + /* + * Add the IO to the queue of blocks to prefetch. This allows us to + * prioritize blocks that we will need first for the main traversal + * thread. + */ + mutex_enter(&spa->spa_scrub_lock); + if (avl_find(&scn->scn_prefetch_queue, spic, &idx) != NULL) { + /* this block is already queued for prefetch */ + kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); + scan_prefetch_ctx_rele(spc, scn); + mutex_exit(&spa->spa_scrub_lock); + return; + } + + avl_insert(&scn->scn_prefetch_queue, spic, idx); + cv_broadcast(&spa->spa_scrub_io_cv); + mutex_exit(&spa->spa_scrub_lock); +} + +static void +dsl_scan_prefetch_dnode(dsl_scan_t *scn, dnode_phys_t *dnp, + uint64_t objset, uint64_t object) +{ + int i; + zbookmark_phys_t zb; + scan_prefetch_ctx_t *spc; + + if (dnp->dn_nblkptr == 0 && !(dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) + return; + + SET_BOOKMARK(&zb, objset, object, 0, 0); + + spc = scan_prefetch_ctx_create(scn, dnp, FTAG); + + for (i = 0; i < dnp->dn_nblkptr; i++) { + zb.zb_level = BP_GET_LEVEL(&dnp->dn_blkptr[i]); + zb.zb_blkid = i; + dsl_scan_prefetch(spc, &dnp->dn_blkptr[i], &zb); + } + + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + zb.zb_level = 0; + zb.zb_blkid = DMU_SPILL_BLKID; + dsl_scan_prefetch(spc, DN_SPILL_BLKPTR(dnp), &zb); + } + + scan_prefetch_ctx_rele(spc, FTAG); +} + +void +dsl_scan_prefetch_cb(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, + arc_buf_t *buf, void *private) +{ + scan_prefetch_ctx_t *spc = private; + dsl_scan_t *scn = spc->spc_scn; + spa_t *spa = scn->scn_dp->dp_spa; + + /* broadcast that the IO has completed for rate limitting purposes */ + mutex_enter(&spa->spa_scrub_lock); + ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp)); + spa->spa_scrub_inflight -= BP_GET_PSIZE(bp); + cv_broadcast(&spa->spa_scrub_io_cv); + mutex_exit(&spa->spa_scrub_lock); + + /* if there was an error or we are done prefetching, just cleanup */ + if (buf == NULL || scn->scn_suspending) + goto out; + + if (BP_GET_LEVEL(bp) > 0) { + int i; + blkptr_t *cbp; + int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; + zbookmark_phys_t czb; + + for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { + SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, + zb->zb_level - 1, zb->zb_blkid * epb + i); + 
dsl_scan_prefetch(spc, cbp, &czb); + } + } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { + dnode_phys_t *cdnp; + int i; + int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; + + for (i = 0, cdnp = buf->b_data; i < epb; + i += cdnp->dn_extra_slots + 1, + cdnp += cdnp->dn_extra_slots + 1) { + dsl_scan_prefetch_dnode(scn, cdnp, + zb->zb_objset, zb->zb_blkid * epb + i); + } + } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { + objset_phys_t *osp = buf->b_data; + + dsl_scan_prefetch_dnode(scn, &osp->os_meta_dnode, + zb->zb_objset, DMU_META_DNODE_OBJECT); + + if (OBJSET_BUF_HAS_USERUSED(buf)) { + dsl_scan_prefetch_dnode(scn, + &osp->os_groupused_dnode, zb->zb_objset, + DMU_GROUPUSED_OBJECT); + dsl_scan_prefetch_dnode(scn, + &osp->os_userused_dnode, zb->zb_objset, + DMU_USERUSED_OBJECT); + } } - SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid); +out: + if (buf != NULL) + arc_buf_destroy(buf, private); + scan_prefetch_ctx_rele(spc, scn); +} + +/* ARGSUSED */ +static void +dsl_scan_prefetch_thread(void *arg) +{ + dsl_scan_t *scn = arg; + spa_t *spa = scn->scn_dp->dp_spa; + scan_prefetch_issue_ctx_t *spic; + + /* loop until we are told to stop */ + while (!scn->scn_prefetch_stop) { + arc_flags_t flags = ARC_FLAG_NOWAIT | + ARC_FLAG_PRESCIENT_PREFETCH | ARC_FLAG_PREFETCH; + int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD; + + mutex_enter(&spa->spa_scrub_lock); + + /* + * Wait until we have an IO to issue and are not above our + * maximum in flight limit. + */ + while (!scn->scn_prefetch_stop && + (avl_numnodes(&scn->scn_prefetch_queue) == 0 || + spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)) { + cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); + } - (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp, - NULL, NULL, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, &czb); + /* recheck if we should stop since we waited for the cv */ + if (scn->scn_prefetch_stop) { + mutex_exit(&spa->spa_scrub_lock); + break; + } + + /* remove the prefetch IO from the tree */ + spic = avl_first(&scn->scn_prefetch_queue); + spa->spa_scrub_inflight += BP_GET_PSIZE(&spic->spic_bp); + avl_remove(&scn->scn_prefetch_queue, spic); + + mutex_exit(&spa->spa_scrub_lock); + + if (BP_IS_PROTECTED(&spic->spic_bp)) { + ASSERT(BP_GET_TYPE(&spic->spic_bp) == DMU_OT_DNODE || + BP_GET_TYPE(&spic->spic_bp) == DMU_OT_OBJSET); + ASSERT3U(BP_GET_LEVEL(&spic->spic_bp), ==, 0); + zio_flags |= ZIO_FLAG_RAW; + } + + /* issue the prefetch asynchronously */ + (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, + &spic->spic_bp, dsl_scan_prefetch_cb, spic->spic_spc, + ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, &spic->spic_zb); + + kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); + } + + ASSERT(scn->scn_prefetch_stop); + + /* free any prefetches we didn't get to complete */ + mutex_enter(&spa->spa_scrub_lock); + while ((spic = avl_first(&scn->scn_prefetch_queue)) != NULL) { + avl_remove(&scn->scn_prefetch_queue, spic); + scan_prefetch_ctx_rele(spic->spic_spc, scn); + kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); + } + ASSERT0(avl_numnodes(&scn->scn_prefetch_queue)); + mutex_exit(&spa->spa_scrub_lock); } static boolean_t @@ -747,6 +1583,13 @@ dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp, return (B_FALSE); } +static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, + dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn, + dmu_objset_type_t ostype, dmu_tx_t *tx); +inline __attribute__((always_inline)) static void dsl_scan_visitdnode( + dsl_scan_t *, dsl_dataset_t *ds, 
dmu_objset_type_t ostype, + dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx); + /* * Return nonzero on i/o error. * Return new buf to write out in *bufp. @@ -774,10 +1617,6 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, return (err); } for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { - dsl_scan_prefetch(scn, buf, cbp, zb->zb_objset, - zb->zb_object, zb->zb_blkid * epb + i); - } - for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { zbookmark_phys_t czb; SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, @@ -790,7 +1629,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { arc_flags_t flags = ARC_FLAG_WAIT; dnode_phys_t *cdnp; - int i, j; + int i; int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; arc_buf_t *buf; @@ -808,15 +1647,6 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, for (i = 0, cdnp = buf->b_data; i < epb; i += cdnp->dn_extra_slots + 1, cdnp += cdnp->dn_extra_slots + 1) { - for (j = 0; j < cdnp->dn_nblkptr; j++) { - blkptr_t *cbp = &cdnp->dn_blkptr[j]; - dsl_scan_prefetch(scn, buf, cbp, - zb->zb_objset, zb->zb_blkid * epb + i, j); - } - } - for (i = 0, cdnp = buf->b_data; i < epb; - i += cdnp->dn_extra_slots + 1, - cdnp += cdnp->dn_extra_slots + 1) { dsl_scan_visitdnode(scn, ds, ostype, cdnp, zb->zb_blkid * epb + i, tx); } @@ -843,8 +1673,8 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, /* * We also always visit user/group accounting * objects, and never skip them, even if we are - * suspending. This is necessary so that the space - * deltas from this txg get integrated. + * suspending. This is necessary so that the + * space deltas from this txg get integrated. */ dsl_scan_visitdnode(scn, ds, osp->os_type, &osp->os_groupused_dnode, @@ -894,21 +1724,13 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, dmu_objset_type_t ostype, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; - blkptr_t *bp_toread; - - bp_toread = kmem_alloc(sizeof (blkptr_t), KM_SLEEP); - *bp_toread = *bp; - - /* ASSERT(pbuf == NULL || arc_released(pbuf)); */ + blkptr_t *bp_toread = NULL; if (dsl_scan_check_suspend(scn, zb)) - goto out; + return; if (dsl_scan_check_resume(scn, dnp, zb)) - goto out; - - if (BP_IS_HOLE(bp)) - goto out; + return; scn->scn_visited_this_txg++; @@ -919,14 +1741,24 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, * if required to debug an issue in dsl_scan_visitbp(). * * dprintf_bp(bp, - * "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx bp=%p", - * ds, ds ? ds->ds_object : 0, - * zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid, - * bp); + * "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx bp=%p", + * ds, ds ? 
ds->ds_object : 0, + * zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid, + * bp); */ - if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) - goto out; + if (BP_IS_HOLE(bp)) { + scn->scn_holes_this_txg++; + return; + } + + if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) { + scn->scn_lt_min_this_txg++; + return; + } + + bp_toread = kmem_alloc(sizeof (blkptr_t), KM_SLEEP); + *bp_toread = *bp; if (dsl_scan_recurse(scn, ds, ostype, dnp, bp_toread, zb, tx) != 0) goto out; @@ -938,6 +1770,7 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, */ if (ddt_class_contains(dp->dp_spa, scn->scn_phys.scn_ddt_class_max, bp)) { + scn->scn_ddt_contained_this_txg++; goto out; } @@ -948,9 +1781,13 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, * Don't scan it now unless we need to because something * under it was modified. */ - if (BP_PHYSICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_max_txg) { - scan_funcs[scn->scn_phys.scn_func](dp, bp, zb); + if (BP_PHYSICAL_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) { + scn->scn_gt_max_this_txg++; + goto out; } + + scan_funcs[scn->scn_phys.scn_func](dp, bp, zb); + out: kmem_free(bp_toread, sizeof (blkptr_t)); } @@ -960,26 +1797,33 @@ dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) { zbookmark_phys_t zb; + scan_prefetch_ctx_t *spc; SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); - dsl_scan_visitbp(bp, &zb, NULL, - ds, scn, DMU_OST_NONE, tx); + + if (ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) { + SET_BOOKMARK(&scn->scn_prefetch_bookmark, + zb.zb_objset, 0, 0, 0); + } else { + scn->scn_prefetch_bookmark = scn->scn_phys.scn_bookmark; + } + + scn->scn_objsets_visited_this_txg++; + + spc = scan_prefetch_ctx_create(scn, NULL, FTAG); + dsl_scan_prefetch(spc, bp, &zb); + scan_prefetch_ctx_rele(spc, FTAG); + + dsl_scan_visitbp(bp, &zb, NULL, ds, scn, DMU_OST_NONE, tx); dprintf_ds(ds, "finished scan%s", ""); } -void -dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) +static void +ds_destroyed_scn_phys(dsl_dataset_t *ds, dsl_scan_phys_t *scn_phys) { - dsl_pool_t *dp = ds->ds_dir->dd_pool; - dsl_scan_t *scn = dp->dp_scan; - uint64_t mintxg; - - if (scn->scn_phys.scn_state != DSS_SCANNING) - return; - - if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) { + if (scn_phys->scn_bookmark.zb_objset == ds->ds_object) { if (ds->ds_is_snapshot) { /* * Note: @@ -991,23 +1835,57 @@ dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) * ignore it when we retraverse it in * dsl_scan_visitds(). */ - scn->scn_phys.scn_bookmark.zb_objset = + scn_phys->scn_bookmark.zb_objset = dsl_dataset_phys(ds)->ds_next_snap_obj; zfs_dbgmsg("destroying ds %llu; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds->ds_object, (u_longlong_t)dsl_dataset_phys(ds)-> ds_next_snap_obj); - scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN; + scn_phys->scn_flags |= DSF_VISIT_DS_AGAIN; } else { - SET_BOOKMARK(&scn->scn_phys.scn_bookmark, + SET_BOOKMARK(&scn_phys->scn_bookmark, ZB_DESTROYED_OBJSET, 0, 0, 0); zfs_dbgmsg("destroying ds %llu; currently traversing; " "reset bookmark to -1,0,0,0", (u_longlong_t)ds->ds_object); } - } else if (zap_lookup_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) { + } +} + +/* + * Invoked when a dataset is destroyed. We need to make sure that: + * + * 1) If it is the dataset that was currently being scanned, we write + * a new dsl_scan_phys_t and marking the objset reference in it + * as destroyed. 
+ * 2) Remove it from the work queue, if it was present. + * + * If the dataset was actually a snapshot, instead of marking the dataset + * as destroyed, we instead substitute the next snapshot in line. + */ +void +dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + dsl_scan_t *scn = dp->dp_scan; + uint64_t mintxg; + + if (!dsl_scan_is_running(scn)) + return; + + ds_destroyed_scn_phys(ds, &scn->scn_phys); + ds_destroyed_scn_phys(ds, &scn->scn_phys_cached); + + if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) { + scan_ds_queue_remove(scn, ds->ds_object); + if (ds->ds_is_snapshot) + scan_ds_queue_insert(scn, + dsl_dataset_phys(ds)->ds_next_snap_obj, mintxg); + } + + if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, + ds->ds_object, &mintxg) == 0) { ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1); VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); @@ -1036,9 +1914,28 @@ dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) * dsl_scan_sync() should be called after this, and should sync * out our changed state, but just to be safe, do it here. */ - dsl_scan_sync_state(scn, tx); + dsl_scan_sync_state(scn, tx, SYNC_CACHED); +} + +static void +ds_snapshotted_bookmark(dsl_dataset_t *ds, zbookmark_phys_t *scn_bookmark) +{ + if (scn_bookmark->zb_objset == ds->ds_object) { + scn_bookmark->zb_objset = + dsl_dataset_phys(ds)->ds_prev_snap_obj; + zfs_dbgmsg("snapshotting ds %llu; currently traversing; " + "reset zb_objset to %llu", + (u_longlong_t)ds->ds_object, + (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); + } } +/* + * Called when a dataset is snapshotted. If we were currently traversing + * this snapshot, we reset our bookmark to point at the newly created + * snapshot. We also modify our work queue to remove the old snapshot and + * replace with the new one. 
+ */ void dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) { @@ -1046,20 +1943,22 @@ dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) dsl_scan_t *scn = dp->dp_scan; uint64_t mintxg; - if (scn->scn_phys.scn_state != DSS_SCANNING) + if (!dsl_scan_is_running(scn)) return; ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0); - if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) { - scn->scn_phys.scn_bookmark.zb_objset = - dsl_dataset_phys(ds)->ds_prev_snap_obj; - zfs_dbgmsg("snapshotting ds %llu; currently traversing; " - "reset zb_objset to %llu", - (u_longlong_t)ds->ds_object, - (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); - } else if (zap_lookup_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) { + ds_snapshotted_bookmark(ds, &scn->scn_phys.scn_bookmark); + ds_snapshotted_bookmark(ds, &scn->scn_phys_cached.scn_bookmark); + + if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) { + scan_ds_queue_remove(scn, ds->ds_object); + scan_ds_queue_insert(scn, + dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg); + } + + if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, + ds->ds_object, &mintxg) == 0) { VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); VERIFY(zap_add_int_key(dp->dp_meta_objset, @@ -1070,37 +1969,59 @@ dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) (u_longlong_t)ds->ds_object, (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); } - dsl_scan_sync_state(scn, tx); + + dsl_scan_sync_state(scn, tx, SYNC_CACHED); } -void -dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) +static void +ds_clone_swapped_bookmark(dsl_dataset_t *ds1, dsl_dataset_t *ds2, + zbookmark_phys_t *scn_bookmark) { - dsl_pool_t *dp = ds1->ds_dir->dd_pool; - dsl_scan_t *scn = dp->dp_scan; - uint64_t mintxg; - - if (scn->scn_phys.scn_state != DSS_SCANNING) - return; - - if (scn->scn_phys.scn_bookmark.zb_objset == ds1->ds_object) { - scn->scn_phys.scn_bookmark.zb_objset = ds2->ds_object; + if (scn_bookmark->zb_objset == ds1->ds_object) { + scn_bookmark->zb_objset = ds2->ds_object; zfs_dbgmsg("clone_swap ds %llu; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds1->ds_object, (u_longlong_t)ds2->ds_object); - } else if (scn->scn_phys.scn_bookmark.zb_objset == ds2->ds_object) { - scn->scn_phys.scn_bookmark.zb_objset = ds1->ds_object; + } else if (scn_bookmark->zb_objset == ds2->ds_object) { + scn_bookmark->zb_objset = ds1->ds_object; zfs_dbgmsg("clone_swap ds %llu; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds2->ds_object, (u_longlong_t)ds1->ds_object); } +} + +/* + * Called when a parent dataset and its clone are swapped. If we were + * currently traversing the dataset, we need to switch to traversing the + * newly promoted parent. 
+ */ +void +dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds1->ds_dir->dd_pool; + dsl_scan_t *scn = dp->dp_scan; + uint64_t mintxg; + + if (!dsl_scan_is_running(scn)) + return; + + ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys.scn_bookmark); + ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys_cached.scn_bookmark); + + if (scan_ds_queue_contains(scn, ds1->ds_object, &mintxg)) { + scan_ds_queue_remove(scn, ds1->ds_object); + scan_ds_queue_insert(scn, ds2->ds_object, mintxg); + } + if (scan_ds_queue_contains(scn, ds2->ds_object, &mintxg)) { + scan_ds_queue_remove(scn, ds2->ds_object); + scan_ds_queue_insert(scn, ds1->ds_object, mintxg); + } if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds1->ds_object, &mintxg) == 0) { int err; - ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, @@ -1118,8 +2039,9 @@ dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) "replacing with %llu", (u_longlong_t)ds1->ds_object, (u_longlong_t)ds2->ds_object); - } else if (zap_lookup_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) { + } + if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, + ds2->ds_object, &mintxg) == 0) { ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, @@ -1132,31 +2054,26 @@ dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) (u_longlong_t)ds1->ds_object); } - dsl_scan_sync_state(scn, tx); + dsl_scan_sync_state(scn, tx, SYNC_CACHED); } -struct enqueue_clones_arg { - dmu_tx_t *tx; - uint64_t originobj; -}; - /* ARGSUSED */ static int enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { - struct enqueue_clones_arg *eca = arg; + uint64_t originobj = *(uint64_t *)arg; dsl_dataset_t *ds; int err; dsl_scan_t *scn = dp->dp_scan; - if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != eca->originobj) + if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != originobj) return (0); err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); if (err) return (err); - while (dsl_dataset_phys(ds)->ds_prev_snap_obj != eca->originobj) { + while (dsl_dataset_phys(ds)->ds_prev_snap_obj != originobj) { dsl_dataset_t *prev; err = dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); @@ -1166,9 +2083,8 @@ enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) return (err); ds = prev; } - VERIFY(zap_add_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, ds->ds_object, - dsl_dataset_phys(ds)->ds_prev_snap_txg, eca->tx) == 0); + scan_ds_queue_insert(scn, ds->ds_object, + dsl_dataset_phys(ds)->ds_prev_snap_txg); dsl_dataset_rele(ds, FTAG); return (0); } @@ -1214,9 +2130,9 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) dsl_dataset_name(ds, dsname); zfs_dbgmsg("scanning dataset %llu (%s) is unnecessary because " "cur_min_txg (%llu) >= max_txg (%llu)", - dsobj, dsname, - scn->scn_phys.scn_cur_min_txg, - scn->scn_phys.scn_max_txg); + (longlong_t)dsobj, dsname, + (longlong_t)scn->scn_phys.scn_cur_min_txg, + (longlong_t)scn->scn_phys.scn_max_txg); kmem_free(dsname, MAXNAMELEN); goto out; @@ -1232,7 +2148,7 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) * ZIL here, rather than in scan_recurse(), 
because the regular * snapshot block-sharing rules don't apply to it. */ - if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !ds->ds_is_snapshot) + if (!ds->ds_is_snapshot) dsl_scan_zil(dp, &os->os_zil_header); /* @@ -1266,9 +2182,8 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) { zfs_dbgmsg("incomplete pass; visiting again"); scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN; - VERIFY(zap_add_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, ds->ds_object, - scn->scn_phys.scn_cur_max_txg, tx) == 0); + scan_ds_queue_insert(scn, ds->ds_object, + scn->scn_phys.scn_cur_max_txg); goto out; } @@ -1276,10 +2191,9 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) * Add descendent datasets to work queue. */ if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) { - VERIFY(zap_add_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, + scan_ds_queue_insert(scn, dsl_dataset_phys(ds)->ds_next_snap_obj, - dsl_dataset_phys(ds)->ds_creation_txg, tx) == 0); + dsl_dataset_phys(ds)->ds_creation_txg); } if (dsl_dataset_phys(ds)->ds_num_children > 1) { boolean_t usenext = B_FALSE; @@ -1300,17 +2214,21 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) } if (usenext) { - VERIFY0(zap_join_key(dp->dp_meta_objset, - dsl_dataset_phys(ds)->ds_next_clones_obj, - scn->scn_phys.scn_queue_obj, - dsl_dataset_phys(ds)->ds_creation_txg, tx)); + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, dp->dp_meta_objset, + dsl_dataset_phys(ds)->ds_next_clones_obj); + zap_cursor_retrieve(&zc, &za) == 0; + (void) zap_cursor_advance(&zc)) { + scan_ds_queue_insert(scn, + zfs_strtonum(za.za_name, NULL), + dsl_dataset_phys(ds)->ds_creation_txg); + } + zap_cursor_fini(&zc); } else { - struct enqueue_clones_arg eca; - eca.tx = tx; - eca.originobj = ds->ds_object; - VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, - enqueue_clones_cb, &eca, DS_FIND_CHILDREN)); + enqueue_clones_cb, &ds->ds_object, + DS_FIND_CHILDREN)); } } @@ -1322,7 +2240,6 @@ out: static int enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { - dmu_tx_t *tx = arg; dsl_dataset_t *ds; int err; dsl_scan_t *scn = dp->dp_scan; @@ -1352,12 +2269,37 @@ enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) ds = prev; } - VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, - ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg, tx) == 0); + scan_ds_queue_insert(scn, ds->ds_object, + dsl_dataset_phys(ds)->ds_prev_snap_txg); dsl_dataset_rele(ds, FTAG); return (0); } +/* ARGSUSED */ +void +dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, + ddt_entry_t *dde, dmu_tx_t *tx) +{ + const ddt_key_t *ddk = &dde->dde_key; + ddt_phys_t *ddp = dde->dde_phys; + blkptr_t bp; + zbookmark_phys_t zb = { 0 }; + int p; + + if (scn->scn_phys.scn_state != DSS_SCANNING) + return; + + for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + if (ddp->ddp_phys_birth == 0 || + ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg) + continue; + ddt_bp_create(checksum, ddk, ddp, &bp); + + scn->scn_visited_this_txg++; + scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb); + } +} + /* * Scrub/dedup interaction. 
* @@ -1432,36 +2374,20 @@ dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx) ddb->ddb_class > scn->scn_phys.scn_ddt_class_max); } -/* ARGSUSED */ -void -dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, - ddt_entry_t *dde, dmu_tx_t *tx) +static uint64_t +dsl_scan_ds_maxtxg(dsl_dataset_t *ds) { - const ddt_key_t *ddk = &dde->dde_key; - ddt_phys_t *ddp = dde->dde_phys; - blkptr_t bp; - zbookmark_phys_t zb = { 0 }; - - if (scn->scn_phys.scn_state != DSS_SCANNING) - return; - - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { - if (ddp->ddp_phys_birth == 0 || - ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg) - continue; - ddt_bp_create(checksum, ddk, ddp, &bp); - - scn->scn_visited_this_txg++; - scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb); - } + uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg; + if (ds->ds_is_snapshot) + return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg)); + return (smt); } static void dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) { + scan_ds_t *sds; dsl_pool_t *dp = scn->scn_dp; - zap_cursor_t *zc; - zap_attribute_t *za; if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= scn->scn_phys.scn_ddt_class_max) { @@ -1485,7 +2411,7 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, - enqueue_cb, tx, DS_FIND_CHILDREN)); + enqueue_cb, NULL, DS_FIND_CHILDREN)); } else { dsl_scan_visitds(scn, dp->dp_origin_snap->ds_object, tx); @@ -1493,42 +2419,42 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) ASSERT(!scn->scn_suspending); } else if (scn->scn_phys.scn_bookmark.zb_objset != ZB_DESTROYED_OBJSET) { + uint64_t dsobj = scn->scn_phys.scn_bookmark.zb_objset; /* - * If we were suspended, continue from here. Note if the + * If we were suspended, continue from here. Note if the * ds we were suspended on was deleted, the zb_objset may * be -1, so we will skip this and find a new objset * below. */ - dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx); + dsl_scan_visitds(scn, dsobj, tx); if (scn->scn_suspending) return; } /* - * In case we were suspended right at the end of the ds, zero the + * In case we suspended right at the end of the ds, zero the * bookmark so we don't think that we're still trying to resume. */ bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_phys_t)); - zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); - za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); - /* keep pulling things out of the zap-object-as-queue */ - while (zap_cursor_init(zc, dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj), - zap_cursor_retrieve(zc, za) == 0) { + /* + * Keep pulling things out of the dataset avl queue. Updates to the + * persistent zap-object-as-queue happen only at checkpoints. 
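A minimal user-space sketch of the pattern described above, not part of the patch: work is pulled from an in-memory queue as the scan proceeds, and the persistent copy is only rewritten when a checkpoint is taken. The names and values (toy_scan_ds_t, toy_checkpoint, the dataset numbers) are made up for illustration.

#include <stdio.h>
#include <stdint.h>

typedef struct {
	uint64_t dsobj;		/* dataset object number */
	uint64_t mintxg;	/* lower txg bound recorded for that dataset */
} toy_scan_ds_t;

/* Stand-in for rewriting the on-disk copy of the queue at a checkpoint. */
static void
toy_checkpoint(const toy_scan_ds_t *q, int n)
{
	printf("checkpoint: persisting %d queued datasets\n", n);
	for (int i = 0; i < n; i++)
		printf("  ds %llu mintxg %llu\n",
		    (unsigned long long)q[i].dsobj,
		    (unsigned long long)q[i].mintxg);
}

int
main(void)
{
	toy_scan_ds_t queue[] = { { 54, 1000 }, { 61, 1203 }, { 77, 0 } };
	int n = 3;

	/* Drain the in-memory queue; nothing is written out per dataset. */
	while (n > 0) {
		printf("visiting ds %llu (mintxg %llu)\n",
		    (unsigned long long)queue[0].dsobj,
		    (unsigned long long)queue[0].mintxg);
		for (int i = 1; i < n; i++)	/* dequeue the head */
			queue[i - 1] = queue[i];
		n--;

		if (n == 1)	/* pretend a checkpoint falls here */
			toy_checkpoint(queue, n);
	}
	return (0);
}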
+ */ + while ((sds = avl_first(&scn->scn_queue)) != NULL) { dsl_dataset_t *ds; - uint64_t dsobj; + uint64_t dsobj = sds->sds_dsobj; + uint64_t txg = sds->sds_txg; - dsobj = zfs_strtonum(za->za_name, NULL); - VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, dsobj, tx)); + /* dequeue and free the ds from the queue */ + scan_ds_queue_remove(scn, dsobj); + sds = NULL; - /* Set up min/max txg */ + /* set up min / max txg */ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); - if (za->za_first_integer != 0) { + if (txg != 0) { scn->scn_phys.scn_cur_min_txg = - MAX(scn->scn_phys.scn_min_txg, - za->za_first_integer); + MAX(scn->scn_phys.scn_min_txg, txg); } else { scn->scn_phys.scn_cur_min_txg = MAX(scn->scn_phys.scn_min_txg, @@ -1538,14 +2464,360 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) dsl_dataset_rele(ds, FTAG); dsl_scan_visitds(scn, dsobj, tx); - zap_cursor_fini(zc); if (scn->scn_suspending) - goto out; + return; } - zap_cursor_fini(zc); -out: - kmem_free(za, sizeof (zap_attribute_t)); - kmem_free(zc, sizeof (zap_cursor_t)); + + /* No more objsets to fetch, we're done */ + scn->scn_phys.scn_bookmark.zb_objset = ZB_DESTROYED_OBJSET; + ASSERT0(scn->scn_suspending); +} + +static uint64_t +dsl_scan_count_leaves(vdev_t *vd) +{ + uint64_t i, leaves = 0; + + /* we only count leaves that belong to the main pool and are readable */ + if (vd->vdev_islog || vd->vdev_isspare || + vd->vdev_isl2cache || !vdev_readable(vd)) + return (0); + + if (vd->vdev_ops->vdev_op_leaf) + return (1); + + for (i = 0; i < vd->vdev_children; i++) { + leaves += dsl_scan_count_leaves(vd->vdev_child[i]); + } + + return (leaves); +} + +static void +scan_io_queues_update_zio_stats(dsl_scan_io_queue_t *q, const blkptr_t *bp) +{ + int i; + uint64_t cur_size = 0; + + for (i = 0; i < BP_GET_NDVAS(bp); i++) { + cur_size += DVA_GET_ASIZE(&bp->blk_dva[i]); + } + + q->q_total_zio_size_this_txg += cur_size; + q->q_zios_this_txg++; +} + +static void +scan_io_queues_update_seg_stats(dsl_scan_io_queue_t *q, uint64_t start, + uint64_t end) +{ + q->q_total_seg_size_this_txg += end - start; + q->q_segs_this_txg++; +} + +static boolean_t +scan_io_queue_check_suspend(dsl_scan_t *scn) +{ + /* See comment in dsl_scan_check_suspend() */ + uint64_t curr_time_ns = gethrtime(); + uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time; + uint64_t sync_time_ns = curr_time_ns - + scn->scn_dp->dp_spa->spa_sync_starttime; + int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max; + int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? + zfs_resilver_min_time_ms : zfs_scrub_min_time_ms; + + return ((NSEC2MSEC(scan_time_ns) > mintime && + (dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent || + txg_sync_waiting(scn->scn_dp) || + NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) || + spa_shutting_down(scn->scn_dp->dp_spa)); +} + +/* + * Given a list of scan_io_t's in io_list, this issues the io's out to + * disk. This consumes the io_list and frees the scan_io_t's. This is + * called when emptying queues, either when we're up against the memory + * limit or when we have finished scanning. Returns B_TRUE if we stopped + * processing the list before we finished. Any zios that were not issued + * will remain in the io_list. 
+ */ +static boolean_t +scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list) +{ + dsl_scan_t *scn = queue->q_scn; + scan_io_t *sio; + int64_t bytes_issued = 0; + boolean_t suspended = B_FALSE; + + while ((sio = list_head(io_list)) != NULL) { + blkptr_t bp; + + if (scan_io_queue_check_suspend(scn)) { + suspended = B_TRUE; + break; + } + + sio2bp(sio, &bp, queue->q_vd->vdev_id); + bytes_issued += sio->sio_asize; + scan_exec_io(scn->scn_dp, &bp, sio->sio_flags, + &sio->sio_zb, queue); + (void) list_remove_head(io_list); + scan_io_queues_update_zio_stats(queue, &bp); + kmem_cache_free(sio_cache, sio); + } + + atomic_add_64(&scn->scn_bytes_pending, -bytes_issued); + + return (suspended); +} + +/* + * This function removes sios from an IO queue which reside within a given + * range_seg_t and inserts them (in offset order) into a list. Note that + * we only ever return a maximum of 32 sios at once. If there are more sios + * to process within this segment that did not make it onto the list we + * return B_TRUE and otherwise B_FALSE. + */ +static boolean_t +scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) +{ + scan_io_t srch_sio, *sio, *next_sio; + avl_index_t idx; + uint_t num_sios = 0; + int64_t bytes_issued = 0; + + ASSERT(rs != NULL); + ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); + + srch_sio.sio_offset = rs->rs_start; + + /* + * The exact start of the extent might not contain any matching zios, + * so if that's the case, examine the next one in the tree. + */ + sio = avl_find(&queue->q_sios_by_addr, &srch_sio, &idx); + if (sio == NULL) + sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER); + + while (sio != NULL && sio->sio_offset < rs->rs_end && num_sios <= 32) { + ASSERT3U(sio->sio_offset, >=, rs->rs_start); + ASSERT3U(sio->sio_offset + sio->sio_asize, <=, rs->rs_end); + + next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio); + avl_remove(&queue->q_sios_by_addr, sio); + + bytes_issued += sio->sio_asize; + num_sios++; + list_insert_tail(list, sio); + sio = next_sio; + } + + /* + * We limit the number of sios we process at once to 32 to avoid + * biting off more than we can chew. If we didn't take everything + * in the segment we update it to reflect the work we were able to + * complete. Otherwise, we remove it from the range tree entirely. + */ + if (sio != NULL && sio->sio_offset < rs->rs_end) { + range_tree_adjust_fill(queue->q_exts_by_addr, rs, + -bytes_issued); + range_tree_resize_segment(queue->q_exts_by_addr, rs, + sio->sio_offset, rs->rs_end - sio->sio_offset); + + return (B_TRUE); + } else { + range_tree_remove(queue->q_exts_by_addr, rs->rs_start, + rs->rs_end - rs->rs_start); + return (B_FALSE); + } +} + +/* + * This is called from the queue emptying thread and selects the next + * extent from which we are to issue io's. The behavior of this function + * depends on the state of the scan, the current memory consumption and + * whether or not we are performing a scan shutdown. + * 1) We select extents in an elevator algorithm (LBA-order) if the scan + * needs to perform a checkpoint + * 2) We select the largest available extent if we are up against the + * memory limit. + * 3) Otherwise we don't select any extents. 
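A small stand-alone sketch of the three selection cases listed above, not taken from the patch; the extents, scores, flags, and the toy_fetch_ext name are made up for illustration.

#include <stdio.h>
#include <stdint.h>

typedef struct {
	uint64_t start;	/* extent LBA */
	uint64_t score;	/* size/fill based sort key, as in ext_size_compare */
} toy_ext_t;

static const toy_ext_t *
toy_fetch_ext(const toy_ext_t *exts, int n, int checkpointing, int clearing)
{
	const toy_ext_t *best = NULL;

	if (n == 0 || (!checkpointing && !clearing))
		return (NULL);			/* case 3: issue nothing yet */

	for (int i = 0; i < n; i++) {
		if (best == NULL ||
		    (checkpointing ? exts[i].start < best->start :
		    exts[i].score > best->score))
			best = &exts[i];
	}
	return (best);	/* case 1: lowest LBA; case 2: best-scored extent */
}

int
main(void)
{
	toy_ext_t exts[] = { { 4096, 10 }, { 1048576, 80 }, { 262144, 95 } };

	printf("checkpointing -> extent at %llu\n",
	    (unsigned long long)toy_fetch_ext(exts, 3, 1, 1)->start);
	printf("clearing      -> extent at %llu\n",
	    (unsigned long long)toy_fetch_ext(exts, 3, 0, 1)->start);
	printf("idle          -> %s\n",
	    toy_fetch_ext(exts, 3, 0, 0) == NULL ? "none" : "extent");
	return (0);
}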
+ */ +static range_seg_t * +scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue) +{ + dsl_scan_t *scn = queue->q_scn; + + ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); + ASSERT(scn->scn_is_sorted); + + /* handle tunable overrides */ + if (scn->scn_checkpointing || scn->scn_clearing) { + if (zfs_scan_issue_strategy == 1) { + return (range_tree_first(queue->q_exts_by_addr)); + } else if (zfs_scan_issue_strategy == 2) { + return (avl_first(&queue->q_exts_by_size)); + } + } + + /* + * During normal clearing, we want to issue our largest segments + * first, keeping IO as sequential as possible, and leaving the + * smaller extents for later with the hope that they might eventually + * grow to larger sequential segments. However, when the scan is + * checkpointing, no new extents will be added to the sorting queue, + * so the way we are sorted now is as good as it will ever get. + * In this case, we instead switch to issuing extents in LBA order. + */ + if (scn->scn_checkpointing) { + return (range_tree_first(queue->q_exts_by_addr)); + } else if (scn->scn_clearing) { + return (avl_first(&queue->q_exts_by_size)); + } else { + return (NULL); + } +} + +static void +scan_io_queues_run_one(void *arg) +{ + dsl_scan_io_queue_t *queue = arg; + kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock; + boolean_t suspended = B_FALSE; + range_seg_t *rs = NULL; + scan_io_t *sio = NULL; + list_t sio_list; + uint64_t bytes_per_leaf = zfs_scan_vdev_limit; + uint64_t nr_leaves = dsl_scan_count_leaves(queue->q_vd); + + ASSERT(queue->q_scn->scn_is_sorted); + + list_create(&sio_list, sizeof (scan_io_t), + offsetof(scan_io_t, sio_nodes.sio_list_node)); + mutex_enter(q_lock); + + /* calculate maximum in-flight bytes for this txg (min 1MB) */ + queue->q_maxinflight_bytes = + MAX(nr_leaves * bytes_per_leaf, 1ULL << 20); + + /* reset per-queue scan statistics for this txg */ + queue->q_total_seg_size_this_txg = 0; + queue->q_segs_this_txg = 0; + queue->q_total_zio_size_this_txg = 0; + queue->q_zios_this_txg = 0; + + /* loop until we run out of time or sios */ + while ((rs = scan_io_queue_fetch_ext(queue)) != NULL) { + uint64_t seg_start = 0, seg_end = 0; + boolean_t more_left = B_TRUE; + + ASSERT(list_is_empty(&sio_list)); + + /* loop while we still have sios left to process in this rs */ + while (more_left) { + scan_io_t *first_sio, *last_sio; + + /* + * We have selected which extent needs to be + * processed next. Gather up the corresponding sios. + */ + more_left = scan_io_queue_gather(queue, rs, &sio_list); + ASSERT(!list_is_empty(&sio_list)); + first_sio = list_head(&sio_list); + last_sio = list_tail(&sio_list); + + seg_end = last_sio->sio_offset + last_sio->sio_asize; + if (seg_start == 0) + seg_start = first_sio->sio_offset; + + /* + * Issuing sios can take a long time so drop the + * queue lock. The sio queue won't be updated by + * other threads since we're in syncing context so + * we can be sure that our trees will remain exactly + * as we left them. + */ + mutex_exit(q_lock); + suspended = scan_io_queue_issue(queue, &sio_list); + mutex_enter(q_lock); + + if (suspended) + break; + } + + /* update statistics for debugging purposes */ + scan_io_queues_update_seg_stats(queue, seg_start, seg_end); + + if (suspended) + break; + } + + /* + * If we were suspended in the middle of processing, + * requeue any unfinished sios and exit. 
+ */
+	while ((sio = list_head(&sio_list)) != NULL) {
+		list_remove(&sio_list, sio);
+		scan_io_queue_insert_impl(queue, sio);
+	}
+
+	mutex_exit(q_lock);
+	list_destroy(&sio_list);
+}
+
+/*
+ * Performs an emptying run on all scan queues in the pool. This just
+ * punches out one thread per top-level vdev, each of which processes
+ * only that vdev's scan queue. We can parallelize the I/O here because
+ * we know that each queue's io's only affect its own top-level vdev.
+ *
+ * This function waits for the queue runs to complete, and must be
+ * called from dsl_scan_sync (or in general, syncing context).
+ */
+static void
+scan_io_queues_run(dsl_scan_t *scn)
+{
+	spa_t *spa = scn->scn_dp->dp_spa;
+
+	ASSERT(scn->scn_is_sorted);
+	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
+
+	if (scn->scn_bytes_pending == 0)
+		return;
+
+	if (scn->scn_taskq == NULL) {
+		int nthreads = spa->spa_root_vdev->vdev_children;
+
+		/*
+		 * We need to make this taskq *always* execute as many
+		 * threads in parallel as we have top-level vdevs and no
+		 * less, otherwise strange serialization of the calls to
+		 * scan_io_queues_run_one can occur during spa_sync runs
+		 * and that significantly impacts performance.
+		 */
+		scn->scn_taskq = taskq_create("dsl_scan_iss", nthreads,
+		    minclsyspri, nthreads, nthreads, TASKQ_PREPOPULATE);
+	}
+
+	for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
+		vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
+
+		mutex_enter(&vd->vdev_scan_io_queue_lock);
+		if (vd->vdev_scan_io_queue != NULL) {
+			VERIFY(taskq_dispatch(scn->scn_taskq,
+			    scan_io_queues_run_one, vd->vdev_scan_io_queue,
+			    TQ_SLEEP) != TASKQID_INVALID);
+		}
+		mutex_exit(&vd->vdev_scan_io_queue_lock);
+	}
+
+	/*
+	 * Wait for the queues to finish issuing their IOs for this run
+	 * before we return. There may still be IOs in flight at this
+	 * point.
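A user-space analogy for the per-vdev parallelism described in scan_io_queues_run() above, not the kernel taskq API: one worker per queue, no state shared between workers, and a join standing in for taskq_wait(). The queue sizes and names are invented for illustration.

#include <pthread.h>
#include <stdio.h>
#include <stdint.h>

#define	NQUEUES	4	/* stand-in for the number of top-level vdevs */
#define	NIOS	8	/* queued scan IOs per toy queue */

typedef struct {
	uint64_t bytes[NIOS];	/* sizes of this queue's pending scan IOs */
	uint64_t issued;	/* written only by this queue's own worker */
} toy_queue_t;

static void *
queue_worker(void *arg)
{
	toy_queue_t *q = arg;

	/* "Issue" this queue's IOs; no state is shared across workers. */
	for (int i = 0; i < NIOS; i++)
		q->issued += q->bytes[i];
	return (NULL);
}

int
main(void)
{
	toy_queue_t queues[NQUEUES];
	pthread_t workers[NQUEUES];

	for (int i = 0; i < NQUEUES; i++) {
		queues[i].issued = 0;
		for (int j = 0; j < NIOS; j++)
			queues[i].bytes[j] = (uint64_t)(i + 1) * 4096;
	}

	/* One worker per queue, like one taskq entry per top-level vdev. */
	for (int i = 0; i < NQUEUES; i++)
		(void) pthread_create(&workers[i], NULL, queue_worker,
		    &queues[i]);

	/* Analogous to taskq_wait(): wait for every queue run to finish. */
	for (int i = 0; i < NQUEUES; i++) {
		(void) pthread_join(workers[i], NULL);
		printf("queue %d issued %llu bytes\n", i,
		    (unsigned long long)queues[i].issued);
	}
	return (0);
}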
+ */
+	taskq_wait(scn->scn_taskq);
 }
 
 static boolean_t
@@ -1586,6 +2858,41 @@ dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 	return (0);
 }
 
+static void
+dsl_scan_update_stats(dsl_scan_t *scn)
+{
+	spa_t *spa = scn->scn_dp->dp_spa;
+	uint64_t i;
+	uint64_t seg_size_total = 0, zio_size_total = 0;
+	uint64_t seg_count_total = 0, zio_count_total = 0;
+
+	for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
+		vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
+		dsl_scan_io_queue_t *queue = vd->vdev_scan_io_queue;
+
+		if (queue == NULL)
+			continue;
+
+		seg_size_total += queue->q_total_seg_size_this_txg;
+		zio_size_total += queue->q_total_zio_size_this_txg;
+		seg_count_total += queue->q_segs_this_txg;
+		zio_count_total += queue->q_zios_this_txg;
+	}
+
+	if (seg_count_total == 0 || zio_count_total == 0) {
+		scn->scn_avg_seg_size_this_txg = 0;
+		scn->scn_avg_zio_size_this_txg = 0;
+		scn->scn_segs_this_txg = 0;
+		scn->scn_zios_this_txg = 0;
+		return;
+	}
+
+	scn->scn_avg_seg_size_this_txg = seg_size_total / seg_count_total;
+	scn->scn_avg_zio_size_this_txg = zio_size_total / zio_count_total;
+	scn->scn_segs_this_txg = seg_count_total;
+	scn->scn_zios_this_txg = zio_count_total;
+}
+
 boolean_t
 dsl_scan_active(dsl_scan_t *scn)
 {
@@ -1596,8 +2903,7 @@ dsl_scan_active(dsl_scan_t *scn)
 		return (B_FALSE);
 	if (spa_shutting_down(spa))
 		return (B_FALSE);
-	if ((scn->scn_phys.scn_state == DSS_SCANNING &&
-	    !dsl_scan_is_paused_scrub(scn)) ||
+	if ((dsl_scan_is_running(scn) && !dsl_scan_is_paused_scrub(scn)) ||
 	    (scn->scn_async_destroying && !scn->scn_async_stalled))
 		return (B_TRUE);
 
@@ -1608,13 +2914,60 @@ dsl_scan_active(dsl_scan_t *scn)
 	return (used != 0);
 }
 
-/* Called whenever a txg syncs. */
+static boolean_t
+dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
+    uint64_t phys_birth)
+{
+	vdev_t *vd;
+
+	if (DVA_GET_GANG(dva)) {
+		/*
+		 * Gang members may be spread across multiple
+		 * vdevs, so the best estimate we have is the
+		 * scrub range, which has already been checked.
+		 * XXX -- it would be better to change our
+		 * allocation policy to ensure that all
+		 * gang members reside on the same vdev.
+		 */
+		return (B_TRUE);
+	}
+
+	vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
+
+	/*
+	 * Check if the txg falls within the range which must be
+	 * resilvered. DVAs outside this range can always be skipped.
+	 */
+	if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
+		return (B_FALSE);
+
+	/*
+	 * Check if the top-level vdev must resilver this offset.
+	 * When the offset does not intersect with a dirty leaf DTL
+	 * then it may be possible to skip the resilver IO. The psize
+	 * is provided instead of asize to simplify the check for RAIDZ.
+	 */
+	if (!vdev_dtl_need_resilver(vd, DVA_GET_OFFSET(dva), psize))
+		return (B_FALSE);
+
+	return (B_TRUE);
+}
+
+/*
+ * This is the primary entry point for scans that is called from syncing
+ * context. Scans must happen entirely during syncing context so that we
+ * can guarantee that blocks we are currently scanning will not change out
+ * from under us. While a scan is active, this function controls how quickly
+ * transaction groups proceed, instead of the normal handling provided by
+ * txg_sync_thread().
+ */ void dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) { + int err = 0; dsl_scan_t *scn = dp->dp_scan; spa_t *spa = dp->dp_spa; - int err = 0; + state_sync_type_t sync_type = SYNC_OPTIONAL; /* * Check for scn_restart_txg before checking spa_load_state, so @@ -1627,14 +2980,14 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) func = POOL_SCAN_RESILVER; zfs_dbgmsg("restarting scan func=%u txg=%llu", - func, tx->tx_txg); + func, (longlong_t)tx->tx_txg); dsl_scan_setup_sync(&func, tx); } /* * Only process scans in sync pass 1. */ - if (spa_sync_pass(dp->dp_spa) > 1) + if (spa_sync_pass(spa) > 1) return; /* @@ -1651,7 +3004,17 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) if (!scn->scn_async_stalled && !dsl_scan_active(scn)) return; + /* reset scan statistics */ scn->scn_visited_this_txg = 0; + scn->scn_holes_this_txg = 0; + scn->scn_lt_min_this_txg = 0; + scn->scn_gt_max_this_txg = 0; + scn->scn_ddt_contained_this_txg = 0; + scn->scn_objsets_visited_this_txg = 0; + scn->scn_avg_seg_size_this_txg = 0; + scn->scn_segs_this_txg = 0; + scn->scn_avg_zio_size_this_txg = 0; + scn->scn_zios_this_txg = 0; scn->scn_suspending = B_FALSE; scn->scn_sync_start_time = gethrtime(); spa->spa_scrub_active = B_TRUE; @@ -1664,13 +3027,14 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) * blocks than to scrub them. */ if (zfs_free_bpobj_enabled && - spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) { + spa_version(spa) >= SPA_VERSION_DEADLISTS) { scn->scn_is_bptree = B_FALSE; - scn->scn_zio_root = zio_root(dp->dp_spa, NULL, + scn->scn_zio_root = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); err = bpobj_iterate(&dp->dp_free_bpobj, dsl_scan_free_block_cb, scn, tx); - VERIFY3U(0, ==, zio_wait(scn->scn_zio_root)); + VERIFY0(zio_wait(scn->scn_zio_root)); + scn->scn_zio_root = NULL; if (err != 0 && err != ERESTART) zfs_panic_recover("error %u from bpobj_iterate()", err); @@ -1679,11 +3043,12 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { ASSERT(scn->scn_async_destroying); scn->scn_is_bptree = B_TRUE; - scn->scn_zio_root = zio_root(dp->dp_spa, NULL, + scn->scn_zio_root = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); err = bptree_iterate(dp->dp_meta_objset, dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx); VERIFY0(zio_wait(scn->scn_zio_root)); + scn->scn_zio_root = NULL; if (err == EIO || err == ECKSUM) { err = 0; @@ -1770,110 +3135,189 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes); } - if (scn->scn_phys.scn_state != DSS_SCANNING) + if (!dsl_scan_is_running(scn) || dsl_scan_is_paused_scrub(scn)) return; - if (scn->scn_done_txg == tx->tx_txg) { - ASSERT(!scn->scn_suspending); - /* finished with scan. */ - zfs_dbgmsg("txg %llu scan complete", tx->tx_txg); - dsl_scan_done(scn, B_TRUE, tx); - ASSERT3U(spa->spa_scrub_inflight, ==, 0); - dsl_scan_sync_state(scn, tx); + /* + * Wait a few txgs after importing to begin scanning so that + * we can get the pool imported quickly. + */ + if (spa->spa_syncing_txg < spa->spa_first_txg + SCAN_IMPORT_WAIT_TXGS) return; - } - if (dsl_scan_is_paused_scrub(scn)) - return; + /* + * It is possible to switch from unsorted to sorted at any time, + * but afterwards the scan will remain sorted unless reloaded from + * a checkpoint after a reboot. 
+ */
+	if (!zfs_scan_legacy) {
+		scn->scn_is_sorted = B_TRUE;
+		if (scn->scn_last_checkpoint == 0)
+			scn->scn_last_checkpoint = ddi_get_lbolt();
+	}
 
-	if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
-	    scn->scn_phys.scn_ddt_class_max) {
-		zfs_dbgmsg("doing scan sync txg %llu; "
-		    "ddt bm=%llu/%llu/%llu/%llx",
-		    (longlong_t)tx->tx_txg,
-		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
-		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
-		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
-		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
-		ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0);
-		ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0);
-		ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0);
-		ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0);
+	/*
+	 * For sorted scans, determine what kind of work we will be doing
+	 * this txg based on our memory limitations and whether or not we
+	 * need to perform a checkpoint.
+	 */
+	if (scn->scn_is_sorted) {
+		/*
+		 * If we are over our checkpoint interval, set scn_clearing
+		 * so that we can begin checkpointing immediately. The
+		 * checkpoint allows us to save a consistent bookmark
+		 * representing how much data we have scrubbed so far.
+		 * Otherwise, use the memory limit to determine if we should
+		 * scan for metadata or start issuing scrub IOs. We accumulate
+		 * metadata until we hit our hard memory limit at which point
+		 * we issue scrub IOs until we are at our soft memory limit.
+		 */
+		if (scn->scn_checkpointing ||
+		    ddi_get_lbolt() - scn->scn_last_checkpoint >
+		    SEC_TO_TICK(zfs_scan_checkpoint_intval)) {
+			if (!scn->scn_checkpointing)
+				zfs_dbgmsg("begin scan checkpoint");
+
+			scn->scn_checkpointing = B_TRUE;
+			scn->scn_clearing = B_TRUE;
+		} else {
+			boolean_t should_clear = dsl_scan_should_clear(scn);
+			if (should_clear && !scn->scn_clearing) {
+				zfs_dbgmsg("begin scan clearing");
+				scn->scn_clearing = B_TRUE;
+			} else if (!should_clear && scn->scn_clearing) {
+				zfs_dbgmsg("finish scan clearing");
+				scn->scn_clearing = B_FALSE;
+			}
+		}
 	} else {
-		zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu",
-		    (longlong_t)tx->tx_txg,
-		    (longlong_t)scn->scn_phys.scn_bookmark.zb_objset,
-		    (longlong_t)scn->scn_phys.scn_bookmark.zb_object,
-		    (longlong_t)scn->scn_phys.scn_bookmark.zb_level,
-		    (longlong_t)scn->scn_phys.scn_bookmark.zb_blkid);
+		ASSERT0(scn->scn_checkpointing);
+		ASSERT0(scn->scn_clearing);
 	}
 
-	scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
-	    NULL, ZIO_FLAG_CANFAIL);
-	dsl_pool_config_enter(dp, FTAG);
-	dsl_scan_visit(scn, tx);
-	dsl_pool_config_exit(dp, FTAG);
-	(void) zio_wait(scn->scn_zio_root);
-	scn->scn_zio_root = NULL;
+	if (!scn->scn_clearing && scn->scn_done_txg == 0) {
+		/* Need to scan metadata for more blocks to scrub */
+		dsl_scan_phys_t *scnp = &scn->scn_phys;
+		taskqid_t prefetch_tqid;
+		uint64_t bytes_per_leaf = zfs_scan_vdev_limit;
+		uint64_t nr_leaves = dsl_scan_count_leaves(spa->spa_root_vdev);
 
-	zfs_dbgmsg("visited %llu blocks in %llums",
-	    (longlong_t)scn->scn_visited_this_txg,
-	    (longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time));
+		/*
+		 * Calculate the max number of in-flight bytes for pool-wide
+		 * scanning operations (minimum 1MB). Limits for the issuing
+		 * phase are done per top-level vdev and are handled separately.
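A small sketch, not part of the patch, of the in-flight limit computed just below, i.e. MAX(nr_leaves * bytes_per_leaf, 1 MiB); the leaf counts, the per-leaf byte limit, and the scan_maxinflight name are assumed values and names for illustration only, not defaults read from the source.

#include <stdio.h>
#include <stdint.h>

static uint64_t
scan_maxinflight(uint64_t nr_leaves, uint64_t bytes_per_leaf)
{
	uint64_t limit = nr_leaves * bytes_per_leaf;

	/* MAX(nr_leaves * bytes_per_leaf, 1 MiB), as in the patch */
	return (limit > (1ULL << 20) ? limit : (1ULL << 20));
}

int
main(void)
{
	/* e.g. 12 readable leaves at an assumed 4 MiB per leaf: 48 MiB */
	printf("%llu bytes\n",
	    (unsigned long long)scan_maxinflight(12, 4ULL << 20));
	/* a single leaf with a tiny budget is still clamped up to 1 MiB */
	printf("%llu bytes\n",
	    (unsigned long long)scan_maxinflight(1, 64ULL << 10));
	return (0);
}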
+ */ + scn->scn_maxinflight_bytes = + MAX(nr_leaves * bytes_per_leaf, 1ULL << 20); + + if (scnp->scn_ddt_bookmark.ddb_class <= + scnp->scn_ddt_class_max) { + ASSERT(ZB_IS_ZERO(&scnp->scn_bookmark)); + zfs_dbgmsg("doing scan sync txg %llu; " + "ddt bm=%llu/%llu/%llu/%llx", + (longlong_t)tx->tx_txg, + (longlong_t)scnp->scn_ddt_bookmark.ddb_class, + (longlong_t)scnp->scn_ddt_bookmark.ddb_type, + (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum, + (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor); + } else { + zfs_dbgmsg("doing scan sync txg %llu; " + "bm=%llu/%llu/%llu/%llu", + (longlong_t)tx->tx_txg, + (longlong_t)scnp->scn_bookmark.zb_objset, + (longlong_t)scnp->scn_bookmark.zb_object, + (longlong_t)scnp->scn_bookmark.zb_level, + (longlong_t)scnp->scn_bookmark.zb_blkid); + } - if (!scn->scn_suspending) { - scn->scn_done_txg = tx->tx_txg + 1; - zfs_dbgmsg("txg %llu traversal complete, waiting till txg %llu", - tx->tx_txg, scn->scn_done_txg); - } + scn->scn_zio_root = zio_root(dp->dp_spa, NULL, + NULL, ZIO_FLAG_CANFAIL); - if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { - mutex_enter(&spa->spa_scrub_lock); - while (spa->spa_scrub_inflight > 0) { - cv_wait(&spa->spa_scrub_io_cv, - &spa->spa_scrub_lock); - } - mutex_exit(&spa->spa_scrub_lock); - } + scn->scn_prefetch_stop = B_FALSE; + prefetch_tqid = taskq_dispatch(dp->dp_sync_taskq, + dsl_scan_prefetch_thread, scn, TQ_SLEEP); + ASSERT(prefetch_tqid != TASKQID_INVALID); - dsl_scan_sync_state(scn, tx); -} + dsl_pool_config_enter(dp, FTAG); + dsl_scan_visit(scn, tx); + dsl_pool_config_exit(dp, FTAG); -/* - * This will start a new scan, or restart an existing one. - */ -void -dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg) -{ - if (txg == 0) { - dmu_tx_t *tx; - tx = dmu_tx_create_dd(dp->dp_mos_dir); - VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT)); + mutex_enter(&dp->dp_spa->spa_scrub_lock); + scn->scn_prefetch_stop = B_TRUE; + cv_broadcast(&spa->spa_scrub_io_cv); + mutex_exit(&dp->dp_spa->spa_scrub_lock); - txg = dmu_tx_get_txg(tx); - dp->dp_scan->scn_restart_txg = txg; - dmu_tx_commit(tx); - } else { - dp->dp_scan->scn_restart_txg = txg; + taskq_wait_id(dp->dp_sync_taskq, prefetch_tqid); + (void) zio_wait(scn->scn_zio_root); + scn->scn_zio_root = NULL; + + zfs_dbgmsg("scan visited %llu blocks in %llums " + "(%llu os's, %llu holes, %llu < mintxg, " + "%llu in ddt, %llu > maxtxg)", + (longlong_t)scn->scn_visited_this_txg, + (longlong_t)NSEC2MSEC(gethrtime() - + scn->scn_sync_start_time), + (longlong_t)scn->scn_objsets_visited_this_txg, + (longlong_t)scn->scn_holes_this_txg, + (longlong_t)scn->scn_lt_min_this_txg, + (longlong_t)scn->scn_ddt_contained_this_txg, + (longlong_t)scn->scn_gt_max_this_txg); + + if (!scn->scn_suspending) { + ASSERT0(avl_numnodes(&scn->scn_queue)); + scn->scn_done_txg = tx->tx_txg + 1; + if (scn->scn_is_sorted) { + scn->scn_checkpointing = B_TRUE; + scn->scn_clearing = B_TRUE; + } + zfs_dbgmsg("scan complete txg %llu", + (longlong_t)tx->tx_txg); + } + } else if (scn->scn_is_sorted && scn->scn_bytes_pending != 0) { + /* need to issue scrubbing IOs from per-vdev queues */ + scn->scn_zio_root = zio_root(dp->dp_spa, NULL, + NULL, ZIO_FLAG_CANFAIL); + scan_io_queues_run(scn); + (void) zio_wait(scn->scn_zio_root); + scn->scn_zio_root = NULL; + + /* calculate and dprintf the current memory usage */ + (void) dsl_scan_should_clear(scn); + dsl_scan_update_stats(scn); + + zfs_dbgmsg("scan issued %llu blocks (%llu segs) in %llums " + "(avg_block_size = %llu, avg_seg_size = %llu)", + (longlong_t)scn->scn_zios_this_txg, + 
(longlong_t)scn->scn_segs_this_txg, + (longlong_t)NSEC2MSEC(gethrtime() - + scn->scn_sync_start_time), + (longlong_t)scn->scn_avg_zio_size_this_txg, + (longlong_t)scn->scn_avg_seg_size_this_txg); + } else if (scn->scn_done_txg != 0 && scn->scn_done_txg <= tx->tx_txg) { + /* Finished with everything. Mark the scrub as complete */ + zfs_dbgmsg("scan issuing complete txg %llu", + (longlong_t)tx->tx_txg); + ASSERT3U(scn->scn_done_txg, !=, 0); + ASSERT0(spa->spa_scrub_inflight); + ASSERT0(scn->scn_bytes_pending); + dsl_scan_done(scn, B_TRUE, tx); + sync_type = SYNC_MANDATORY; } - zfs_dbgmsg("restarting resilver txg=%llu", txg); -} -boolean_t -dsl_scan_resilvering(dsl_pool_t *dp) -{ - return (dp->dp_scan->scn_phys.scn_state == DSS_SCANNING && - dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER); + dsl_scan_sync_state(scn, tx, sync_type); } -/* - * scrub consumers - */ - static void -count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp) +count_block(dsl_scan_t *scn, zfs_all_blkstats_t *zab, const blkptr_t *bp) { int i; + /* update the spa's stats on how many bytes we have issued */ + for (i = 0; i < BP_GET_NDVAS(bp); i++) { + atomic_add_64(&scn->scn_dp->dp_spa->spa_scan_pass_issued, + DVA_GET_ASIZE(&bp->blk_dva[i])); + } + /* * If we resume after a reboot, zab will be NULL; don't record * incomplete stats in that case. @@ -1881,6 +3325,8 @@ count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp) if (zab == NULL) return; + mutex_enter(&zab->zab_lock); + for (i = 0; i < 4; i++) { int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL; @@ -1916,63 +3362,97 @@ count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp) break; } } + + mutex_exit(&zab->zab_lock); } static void -dsl_scan_scrub_done(zio_t *zio) +scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio) { - spa_t *spa = zio->io_spa; - - abd_free(zio->io_abd); + avl_index_t idx; + int64_t asize = sio->sio_asize; + dsl_scan_t *scn = queue->q_scn; - mutex_enter(&spa->spa_scrub_lock); - spa->spa_scrub_inflight--; - cv_broadcast(&spa->spa_scrub_io_cv); + ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); - if (zio->io_error && (zio->io_error != ECKSUM || - !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) { - spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++; + if (avl_find(&queue->q_sios_by_addr, sio, &idx) != NULL) { + /* block is already scheduled for reading */ + atomic_add_64(&scn->scn_bytes_pending, -asize); + kmem_cache_free(sio_cache, sio); + return; } - mutex_exit(&spa->spa_scrub_lock); + avl_insert(&queue->q_sios_by_addr, sio, idx); + range_tree_add(queue->q_exts_by_addr, sio->sio_offset, asize); } -static boolean_t -dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize, - uint64_t phys_birth) +/* + * Given all the info we got from our metadata scanning process, we + * construct a scan_io_t and insert it into the scan sorting queue. The + * I/O must already be suitable for us to process. This is controlled + * by dsl_scan_enqueue(). + */ +static void +scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i, + int zio_flags, const zbookmark_phys_t *zb) { - vdev_t *vd; + dsl_scan_t *scn = queue->q_scn; + scan_io_t *sio = kmem_cache_alloc(sio_cache, KM_SLEEP); - if (DVA_GET_GANG(dva)) { - /* - * Gang members may be spread across multiple - * vdevs, so the best estimate we have is the - * scrub range, which has already been checked. 
- * XXX -- it would be better to change our - * allocation policy to ensure that all - * gang members reside on the same vdev. - */ - return (B_TRUE); - } + ASSERT0(BP_IS_GANG(bp)); + ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); - vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); + bp2sio(bp, sio, dva_i); + sio->sio_flags = zio_flags; + sio->sio_zb = *zb; /* - * Check if the txg falls within the range which must be - * resilvered. DVAs outside this range can always be skipped. + * Increment the bytes pending counter now so that we can't + * get an integer underflow in case the worker processes the + * zio before we get to incrementing this counter. */ - if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) - return (B_FALSE); + atomic_add_64(&scn->scn_bytes_pending, sio->sio_asize); + + scan_io_queue_insert_impl(queue, sio); +} + +/* + * Given a set of I/O parameters as discovered by the metadata traversal + * process, attempts to place the I/O into the sorted queues (if allowed), + * or immediately executes the I/O. + */ +static void +dsl_scan_enqueue(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, + const zbookmark_phys_t *zb) +{ + spa_t *spa = dp->dp_spa; + + ASSERT(!BP_IS_EMBEDDED(bp)); /* - * Check if the top-level vdev must resilver this offset. - * When the offset does not intersect with a dirty leaf DTL - * then it may be possible to skip the resilver IO. The psize - * is provided instead of asize to simplify the check for RAIDZ. + * Gang blocks are hard to issue sequentially, so we just issue them + * here immediately instead of queuing them. */ - if (!vdev_dtl_need_resilver(vd, DVA_GET_OFFSET(dva), psize)) - return (B_FALSE); + if (!dp->dp_scan->scn_is_sorted || BP_IS_GANG(bp)) { + scan_exec_io(dp, bp, zio_flags, zb, NULL); + return; + } - return (B_TRUE); + for (int i = 0; i < BP_GET_NDVAS(bp); i++) { + dva_t dva; + vdev_t *vdev; + + dva = bp->blk_dva[i]; + vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&dva)); + ASSERT(vdev != NULL); + + mutex_enter(&vdev->vdev_scan_io_queue_lock); + if (vdev->vdev_scan_io_queue == NULL) + vdev->vdev_scan_io_queue = scan_io_queue_create(vdev); + ASSERT(dp->dp_scan != NULL); + scan_io_queue_insert(vdev->vdev_scan_io_queue, bp, + i, zio_flags, zb); + mutex_exit(&vdev->vdev_scan_io_queue_lock); + } } static int @@ -1980,32 +3460,29 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, const blkptr_t *bp, const zbookmark_phys_t *zb) { dsl_scan_t *scn = dp->dp_scan; - size_t psize = BP_GET_PSIZE(bp); spa_t *spa = dp->dp_spa; uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp); + size_t psize = BP_GET_PSIZE(bp); boolean_t needs_io = B_FALSE; int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; - int scan_delay = 0; if (phys_birth <= scn->scn_phys.scn_min_txg || phys_birth >= scn->scn_phys.scn_max_txg) return (0); - count_block(dp->dp_blkstats, bp); - - if (BP_IS_EMBEDDED(bp)) + if (BP_IS_EMBEDDED(bp)) { + count_block(scn, dp->dp_blkstats, bp); return (0); + } ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn)); if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) { zio_flags |= ZIO_FLAG_SCRUB; needs_io = B_TRUE; - scan_delay = zfs_scrub_delay; } else { ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER); zio_flags |= ZIO_FLAG_RESILVER; needs_io = B_FALSE; - scan_delay = zfs_resilver_delay; } /* If it's an intent log block, failure is expected. 
*/ @@ -2029,91 +3506,348 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, } if (needs_io && !zfs_no_scrub_io) { - vdev_t *rvd = spa->spa_root_vdev; - uint64_t maxinflight = rvd->vdev_children * zfs_top_maxinflight; + dsl_scan_enqueue(dp, bp, zio_flags, zb); + } else { + count_block(scn, dp->dp_blkstats, bp); + } + + /* do not relocate this block */ + return (0); +} + +static void +dsl_scan_scrub_done(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + blkptr_t *bp = zio->io_bp; + dsl_scan_io_queue_t *queue = zio->io_private; + + abd_free(zio->io_abd); + + if (queue == NULL) { + mutex_enter(&spa->spa_scrub_lock); + ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp)); + spa->spa_scrub_inflight -= BP_GET_PSIZE(bp); + cv_broadcast(&spa->spa_scrub_io_cv); + mutex_exit(&spa->spa_scrub_lock); + } else { + mutex_enter(&queue->q_vd->vdev_scan_io_queue_lock); + ASSERT3U(queue->q_inflight_bytes, >=, BP_GET_PSIZE(bp)); + queue->q_inflight_bytes -= BP_GET_PSIZE(bp); + cv_broadcast(&queue->q_zio_cv); + mutex_exit(&queue->q_vd->vdev_scan_io_queue_lock); + } + + if (zio->io_error && (zio->io_error != ECKSUM || + !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) { + atomic_inc_64(&spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors); + } +} +/* + * Given a scanning zio's information, executes the zio. The zio need + * not necessarily be only sortable, this function simply executes the + * zio, no matter what it is. The optional queue argument allows the + * caller to specify that they want per top level vdev IO rate limiting + * instead of the legacy global limiting. + */ +static void +scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, + const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue) +{ + spa_t *spa = dp->dp_spa; + dsl_scan_t *scn = dp->dp_scan; + size_t size = BP_GET_PSIZE(bp); + abd_t *data = abd_alloc_for_io(size, B_FALSE); + + if (queue == NULL) { mutex_enter(&spa->spa_scrub_lock); - while (spa->spa_scrub_inflight >= maxinflight) + while (spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); - spa->spa_scrub_inflight++; + spa->spa_scrub_inflight += BP_GET_PSIZE(bp); mutex_exit(&spa->spa_scrub_lock); + } else { + kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock; - /* - * If we're seeing recent (zfs_scan_idle) "important" I/Os - * then throttle our workload to limit the impact of a scan. - */ - if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle) - delay(scan_delay); + mutex_enter(q_lock); + while (queue->q_inflight_bytes >= queue->q_maxinflight_bytes) + cv_wait(&queue->q_zio_cv, q_lock); + queue->q_inflight_bytes += BP_GET_PSIZE(bp); + mutex_exit(q_lock); + } + + count_block(scn, dp->dp_blkstats, bp); + zio_nowait(zio_read(scn->scn_zio_root, spa, bp, data, size, + dsl_scan_scrub_done, queue, ZIO_PRIORITY_SCRUB, zio_flags, zb)); +} - zio_nowait(zio_read(NULL, spa, bp, - abd_alloc_for_io(psize, B_FALSE), - psize, dsl_scan_scrub_done, NULL, - ZIO_PRIORITY_SCRUB, zio_flags, zb)); +/* + * This is the primary extent sorting algorithm. We balance two parameters: + * 1) how many bytes of I/O are in an extent + * 2) how well the extent is filled with I/O (as a fraction of its total size) + * Since we allow extents to have gaps between their constituent I/Os, it's + * possible to have a fairly large extent that contains the same amount of + * I/O bytes than a much smaller extent, which just packs the I/O more tightly. 
+ * The algorithm sorts based on a score calculated from the extent's size, + * the relative fill volume (in %) and a "fill weight" parameter that controls + * the split between whether we prefer larger extents or more well populated + * extents: + * + * SCORE = FILL_IN_BYTES + (FILL_IN_PERCENT * FILL_IN_BYTES * FILL_WEIGHT) + * + * Example: + * 1) assume extsz = 64 MiB + * 2) assume fill = 32 MiB (extent is half full) + * 3) assume fill_weight = 3 + * 4) SCORE = 32M + (((32M * 100) / 64M) * 3 * 32M) / 100 + * SCORE = 32M + (50 * 3 * 32M) / 100 + * SCORE = 32M + (4800M / 100) + * SCORE = 32M + 48M + * ^ ^ + * | +--- final total relative fill-based score + * +--------- final total fill-based score + * SCORE = 80M + * + * As can be seen, at fill_ratio=3, the algorithm is slightly biased towards + * extents that are more completely filled (in a 3:2 ratio) vs just larger. + * Note that as an optimization, we replace multiplication and division by + * 100 with bitshifting by 7 (which effecitvely multiplies and divides by 128). + */ +static int +ext_size_compare(const void *x, const void *y) +{ + const range_seg_t *rsa = x, *rsb = y; + uint64_t sa = rsa->rs_end - rsa->rs_start, + sb = rsb->rs_end - rsb->rs_start; + uint64_t score_a, score_b; + + score_a = rsa->rs_fill + ((((rsa->rs_fill << 7) / sa) * + fill_weight * rsa->rs_fill) >> 7); + score_b = rsb->rs_fill + ((((rsb->rs_fill << 7) / sb) * + fill_weight * rsb->rs_fill) >> 7); + + if (score_a > score_b) + return (-1); + if (score_a == score_b) { + if (rsa->rs_start < rsb->rs_start) + return (-1); + if (rsa->rs_start == rsb->rs_start) + return (0); + return (1); } + return (1); +} - /* do not relocate this block */ - return (0); +/* + * Comparator for the q_sios_by_addr tree. Sorting is simply performed + * based on LBA-order (from lowest to highest). + */ +static int +sio_addr_compare(const void *x, const void *y) +{ + const scan_io_t *a = x, *b = y; + + if (a->sio_offset < b->sio_offset) + return (-1); + if (a->sio_offset == b->sio_offset) + return (0); + return (1); +} + +/* IO queues are created on demand when they are needed. */ +static dsl_scan_io_queue_t * +scan_io_queue_create(vdev_t *vd) +{ + dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan; + dsl_scan_io_queue_t *q = kmem_zalloc(sizeof (*q), KM_SLEEP); + + q->q_scn = scn; + q->q_vd = vd; + cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL); + q->q_exts_by_addr = range_tree_create_impl(&rt_avl_ops, + &q->q_exts_by_size, ext_size_compare, + &q->q_vd->vdev_scan_io_queue_lock, zfs_scan_max_ext_gap); + avl_create(&q->q_sios_by_addr, sio_addr_compare, + sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node)); + + return (q); } /* - * Called by the ZFS_IOC_POOL_SCAN ioctl to start a scrub or resilver. - * Can also be called to resume a paused scrub. + * Destroys a scan queue and all segments and scan_io_t's contained in it. + * No further execution of I/O occurs, anything pending in the queue is + * simply freed without being executed. 
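A stand-alone cross-check, not part of the patch, of the scoring rule used by ext_size_compare() above. It reproduces the 80 MiB figure from the worked example in the comment and shows the bias toward better-filled extents; fill_weight = 3 is taken from that example, and the ext_score name is invented here.

#include <stdio.h>
#include <stdint.h>

static uint64_t
ext_score(uint64_t ext_size, uint64_t fill, uint64_t fill_weight)
{
	/* SCORE = fill + (fill%-of-extent * fill_weight * fill), via >> 7 */
	return (fill + ((((fill << 7) / ext_size) * fill_weight * fill) >> 7));
}

int
main(void)
{
	uint64_t mib = 1024 * 1024;

	/* 64 MiB extent, half full: 32M + 48M = 80 MiB, as in the comment */
	printf("half-full 64 MiB extent:       %llu MiB\n",
	    (unsigned long long)(ext_score(64 * mib, 32 * mib, 3) / mib));

	/* the same 32 MiB of I/O packed into a 48 MiB extent scores higher */
	printf("two-thirds-full 48 MiB extent: %llu MiB\n",
	    (unsigned long long)(ext_score(48 * mib, 32 * mib, 3) / mib));
	return (0);
}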
 */
-int
-dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
+void
+dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue)
 {
-	spa_t *spa = dp->dp_spa;
-	dsl_scan_t *scn = dp->dp_scan;
+	dsl_scan_t *scn = queue->q_scn;
+	scan_io_t *sio;
+	void *cookie = NULL;
+	int64_t bytes_dequeued = 0;
+
+	ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
+
+	while ((sio = avl_destroy_nodes(&queue->q_sios_by_addr, &cookie)) !=
+	    NULL) {
+		ASSERT(range_tree_contains(queue->q_exts_by_addr,
+		    sio->sio_offset, sio->sio_asize));
+		bytes_dequeued += sio->sio_asize;
+		kmem_cache_free(sio_cache, sio);
+	}
 
-	/*
-	 * Purge all vdev caches and probe all devices. We do this here
-	 * rather than in sync context because this requires a writer lock
-	 * on the spa_config lock, which we can't do from sync context. The
-	 * spa_scrub_reopen flag indicates that vdev_open() should not
-	 * attempt to start another scrub.
-	 */
-	spa_vdev_state_enter(spa, SCL_NONE);
-	spa->spa_scrub_reopen = B_TRUE;
-	vdev_reopen(spa->spa_root_vdev);
-	spa->spa_scrub_reopen = B_FALSE;
-	(void) spa_vdev_state_exit(spa, NULL, 0);
+	atomic_add_64(&scn->scn_bytes_pending, -bytes_dequeued);
+	range_tree_vacate(queue->q_exts_by_addr, NULL, queue);
+	range_tree_destroy(queue->q_exts_by_addr);
+	avl_destroy(&queue->q_sios_by_addr);
+	cv_destroy(&queue->q_zio_cv);
 
-	if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) {
-		/* got scrub start cmd, resume paused scrub */
-		int err = dsl_scrub_set_pause_resume(scn->scn_dp,
-		    POOL_SCRUB_NORMAL);
-		if (err == 0)
-			return (SET_ERROR(ECANCELED));
+	kmem_free(queue, sizeof (*queue));
+}
 
-		return (SET_ERROR(err));
+/*
+ * Properly transfers a dsl_scan_io_queue_t from `svd' to `tvd'. This is
+ * called on behalf of vdev_top_transfer when creating or destroying
+ * a mirror vdev due to zpool attach/detach.
+ */
+void
+dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd)
+{
+	mutex_enter(&svd->vdev_scan_io_queue_lock);
+	mutex_enter(&tvd->vdev_scan_io_queue_lock);
+
+	VERIFY3P(tvd->vdev_scan_io_queue, ==, NULL);
+	tvd->vdev_scan_io_queue = svd->vdev_scan_io_queue;
+	svd->vdev_scan_io_queue = NULL;
+	if (tvd->vdev_scan_io_queue != NULL) {
+		tvd->vdev_scan_io_queue->q_vd = tvd;
+		range_tree_set_lock(tvd->vdev_scan_io_queue->q_exts_by_addr,
+		    &tvd->vdev_scan_io_queue_lock);
 	}
 
-	return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check,
-	    dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_NONE));
+	mutex_exit(&tvd->vdev_scan_io_queue_lock);
+	mutex_exit(&svd->vdev_scan_io_queue_lock);
 }
 
-static boolean_t
-dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx)
+static void
+scan_io_queues_destroy(dsl_scan_t *scn)
 {
-	return (scn->scn_restart_txg != 0 &&
-	    scn->scn_restart_txg <= tx->tx_txg);
+	vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev;
+
+	for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+		vdev_t *tvd = rvd->vdev_child[i];
+
+		mutex_enter(&tvd->vdev_scan_io_queue_lock);
+		if (tvd->vdev_scan_io_queue != NULL)
+			dsl_scan_io_queue_destroy(tvd->vdev_scan_io_queue);
+		tvd->vdev_scan_io_queue = NULL;
+		mutex_exit(&tvd->vdev_scan_io_queue_lock);
+	}
 }
 
-#if defined(_KERNEL) && defined(HAVE_SPL)
-module_param(zfs_top_maxinflight, int, 0644);
-MODULE_PARM_DESC(zfs_top_maxinflight, "Max I/Os per top-level");
+static void
+dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i)
+{
+	dsl_pool_t *dp = spa->spa_dsl_pool;
+	dsl_scan_t *scn = dp->dp_scan;
+	vdev_t *vdev;
+	kmutex_t *q_lock;
+	dsl_scan_io_queue_t *queue;
+	scan_io_t srch, *sio;
+	avl_index_t idx;
+	uint64_t start, size;
+
+	vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[dva_i]));
+	ASSERT(vdev != NULL);
+	q_lock = &vdev->vdev_scan_io_queue_lock;
+	queue = vdev->vdev_scan_io_queue;
+
+	mutex_enter(q_lock);
+	if (queue == NULL) {
+		mutex_exit(q_lock);
+		return;
+	}
+
+	bp2sio(bp, &srch, dva_i);
+	start = srch.sio_offset;
+	size = srch.sio_asize;
+
+	/*
+	 * We can find the zio in two states:
+	 * 1) Cold, just sitting in the queue of zio's to be issued at
+	 *	some point in the future. In this case, all we do is
+	 *	remove the zio from the q_sios_by_addr tree, decrement
+	 *	its data volume from the containing range_seg_t and
+	 *	resort the q_exts_by_size tree to reflect that the
+	 *	range_seg_t has lost some of its 'fill'. We don't shorten
+	 *	the range_seg_t - this is usually rare enough not to be
+	 *	worth the extra hassle of trying to keep track of precise
+	 *	extent boundaries.
+	 * 2) Hot, where the zio is currently in-flight in
+	 *	dsl_scan_issue_ios. In this case, we can't simply
+	 *	reach in and stop the in-flight zio's, so we instead
+	 *	block the caller. Eventually, dsl_scan_issue_ios will
+	 *	be done with issuing the zio's it gathered and will
+	 *	signal us.
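
The search key used above comes from bp2sio(), which is defined earlier in this patch and not shown in this hunk. As a rough, hypothetical sketch of only the offset/size portion of that conversion (the in-tree bp2sio()/sio2bp() pair also round-trips the remaining block-pointer fields needed to reissue the read later), it amounts to reading the DVA with the standard accessor macros:

/*
 * Hypothetical sketch only -- approximates the offset/size part of the
 * bp2sio() conversion referenced above, using the DVA accessor macros
 * from <sys/spa.h>.
 */
static void
example_bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i)
{
	sio->sio_offset = DVA_GET_OFFSET(&bp->blk_dva[dva_i]);
	sio->sio_asize = DVA_GET_ASIZE(&bp->blk_dva[dva_i]);
}
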
+ */ + sio = avl_find(&queue->q_sios_by_addr, &srch, &idx); + if (sio != NULL) { + int64_t asize = sio->sio_asize; + blkptr_t tmpbp; + + /* Got it while it was cold in the queue */ + ASSERT3U(start, ==, sio->sio_offset); + ASSERT3U(size, ==, asize); + avl_remove(&queue->q_sios_by_addr, sio); -module_param(zfs_resilver_delay, int, 0644); -MODULE_PARM_DESC(zfs_resilver_delay, "Number of ticks to delay resilver"); + ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size)); + range_tree_remove_fill(queue->q_exts_by_addr, start, size); + + /* + * We only update scn_bytes_pending in the cold path, + * otherwise it will already have been accounted for as + * part of the zio's execution. + */ + atomic_add_64(&scn->scn_bytes_pending, -asize); -module_param(zfs_scrub_delay, int, 0644); -MODULE_PARM_DESC(zfs_scrub_delay, "Number of ticks to delay scrub"); + /* count the block as though we issued it */ + sio2bp(sio, &tmpbp, dva_i); + count_block(scn, dp->dp_blkstats, &tmpbp); -module_param(zfs_scan_idle, int, 0644); -MODULE_PARM_DESC(zfs_scan_idle, "Idle window in clock ticks"); + kmem_cache_free(sio_cache, sio); + } + mutex_exit(q_lock); +} -module_param(zfs_scan_min_time_ms, int, 0644); -MODULE_PARM_DESC(zfs_scan_min_time_ms, "Min millisecs to scrub per txg"); +/* + * Callback invoked when a zio_free() zio is executing. This needs to be + * intercepted to prevent the zio from deallocating a particular portion + * of disk space and it then getting reallocated and written to, while we + * still have it queued up for processing. + */ +void +dsl_scan_freed(spa_t *spa, const blkptr_t *bp) +{ + dsl_pool_t *dp = spa->spa_dsl_pool; + dsl_scan_t *scn = dp->dp_scan; + + ASSERT(!BP_IS_EMBEDDED(bp)); + ASSERT(scn != NULL); + if (!dsl_scan_is_running(scn)) + return; + + for (int i = 0; i < BP_GET_NDVAS(bp); i++) + dsl_scan_freed_dva(spa, bp, i); +} + +#if defined(_KERNEL) && defined(HAVE_SPL) +/* CSTYLED */ +module_param(zfs_scan_vdev_limit, ulong, 0644); +MODULE_PARM_DESC(zfs_scan_vdev_limit, + "Max bytes in flight per leaf vdev for scrubs and resilvers"); + +module_param(zfs_scrub_min_time_ms, int, 0644); +MODULE_PARM_DESC(zfs_scrub_min_time_ms, "Min millisecs to scrub per txg"); module_param(zfs_free_min_time_ms, int, 0644); MODULE_PARM_DESC(zfs_free_min_time_ms, "Min millisecs to free per txg"); @@ -2133,4 +3867,30 @@ MODULE_PARM_DESC(zfs_free_max_blocks, "Max number of blocks freed in one txg"); module_param(zfs_free_bpobj_enabled, int, 0644); MODULE_PARM_DESC(zfs_free_bpobj_enabled, "Enable processing of the free_bpobj"); + +module_param(zfs_scan_mem_lim_fact, int, 0644); +MODULE_PARM_DESC(zfs_scan_mem_lim_fact, "Fraction of RAM for scan hard limit"); + +module_param(zfs_scan_issue_strategy, int, 0644); +MODULE_PARM_DESC(zfs_scan_issue_strategy, + "IO issuing strategy during scrubbing. 
0 = default, 1 = LBA, 2 = size"); + +module_param(zfs_scan_legacy, int, 0644); +MODULE_PARM_DESC(zfs_scan_legacy, "Scrub using legacy non-sequential method"); + +module_param(zfs_scan_checkpoint_intval, int, 0644); +MODULE_PARM_DESC(zfs_scan_checkpoint_intval, + "Scan progress on-disk checkpointing interval"); + +module_param(zfs_scan_mem_lim_soft_fact, int, 0644); +MODULE_PARM_DESC(zfs_scan_mem_lim_soft_fact, + "Fraction of hard limit used as soft limit"); + +module_param(zfs_scan_strict_mem_lim, int, 0644); +MODULE_PARM_DESC(zfs_scan_strict_mem_lim, + "Tunable to attempt to reduce lock contention"); + +module_param(zfs_scan_fill_weight, int, 0644); +MODULE_PARM_DESC(zfs_scan_fill_weight, + "Tunable to adjust bias towards more filled segments during scans"); #endif diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 5dc9ed60d..6320fd388 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -972,85 +972,6 @@ metaslab_rangesize_compare(const void *x1, const void *x2) } /* - * Create any block allocator specific components. The current allocators - * rely on using both a size-ordered range_tree_t and an array of uint64_t's. - */ -static void -metaslab_rt_create(range_tree_t *rt, void *arg) -{ - metaslab_t *msp = arg; - - ASSERT3P(rt->rt_arg, ==, msp); - ASSERT(msp->ms_tree == NULL); - - avl_create(&msp->ms_size_tree, metaslab_rangesize_compare, - sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node)); -} - -/* - * Destroy the block allocator specific components. - */ -static void -metaslab_rt_destroy(range_tree_t *rt, void *arg) -{ - metaslab_t *msp = arg; - - ASSERT3P(rt->rt_arg, ==, msp); - ASSERT3P(msp->ms_tree, ==, rt); - ASSERT0(avl_numnodes(&msp->ms_size_tree)); - - avl_destroy(&msp->ms_size_tree); -} - -static void -metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg) -{ - metaslab_t *msp = arg; - - ASSERT3P(rt->rt_arg, ==, msp); - ASSERT3P(msp->ms_tree, ==, rt); - VERIFY(!msp->ms_condensing); - avl_add(&msp->ms_size_tree, rs); -} - -static void -metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg) -{ - metaslab_t *msp = arg; - - ASSERT3P(rt->rt_arg, ==, msp); - ASSERT3P(msp->ms_tree, ==, rt); - VERIFY(!msp->ms_condensing); - avl_remove(&msp->ms_size_tree, rs); -} - -static void -metaslab_rt_vacate(range_tree_t *rt, void *arg) -{ - metaslab_t *msp = arg; - - ASSERT3P(rt->rt_arg, ==, msp); - ASSERT3P(msp->ms_tree, ==, rt); - - /* - * Normally one would walk the tree freeing nodes along the way. - * Since the nodes are shared with the range trees we can avoid - * walking all nodes and just reinitialize the avl tree. The nodes - * will be freed by the range tree, so we don't want to free them here. - */ - avl_create(&msp->ms_size_tree, metaslab_rangesize_compare, - sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node)); -} - -static range_tree_ops_t metaslab_rt_ops = { - metaslab_rt_create, - metaslab_rt_destroy, - metaslab_rt_add, - metaslab_rt_remove, - metaslab_rt_vacate -}; - -/* * ========================================================================== * Common allocator routines * ========================================================================== @@ -1425,7 +1346,8 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, * addition of new space; and for debugging, it ensures that we'd * data fault on any attempt to use this metaslab before it's ready. 
*/ - ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms, &ms->ms_lock); + ms->ms_tree = range_tree_create_impl(&rt_avl_ops, &ms->ms_size_tree, + metaslab_rangesize_compare, &ms->ms_lock, 0); metaslab_group_add(mg, ms); metaslab_set_fragmentation(ms); diff --git a/module/zfs/range_tree.c b/module/zfs/range_tree.c index ebef7f447..01ef463ec 100644 --- a/module/zfs/range_tree.c +++ b/module/zfs/range_tree.c @@ -33,8 +33,58 @@ #include <sys/zio.h> #include <sys/range_tree.h> +/* + * Range trees are tree-based data structures that can be used to + * track free space or generally any space allocation information. + * A range tree keeps track of individual segments and automatically + * provides facilities such as adjacent extent merging and extent + * splitting in response to range add/remove requests. + * + * A range tree starts out completely empty, with no segments in it. + * Adding an allocation via range_tree_add to the range tree can either: + * 1) create a new extent + * 2) extend an adjacent extent + * 3) merge two adjacent extents + * Conversely, removing an allocation via range_tree_remove can: + * 1) completely remove an extent + * 2) shorten an extent (if the allocation was near one of its ends) + * 3) split an extent into two extents, in effect punching a hole + * + * A range tree is also capable of 'bridging' gaps when adding + * allocations. This is useful for cases when close proximity of + * allocations is an important detail that needs to be represented + * in the range tree. See range_tree_set_gap(). The default behavior + * is not to bridge gaps (i.e. the maximum allowed gap size is 0). + * + * In order to traverse a range tree, use either the range_tree_walk() + * or range_tree_vacate() functions. + * + * To obtain more accurate information on individual segment + * operations that the range tree performs "under the hood", you can + * specify a set of callbacks by passing a range_tree_ops_t structure + * to the range_tree_create function. Any callbacks that are non-NULL + * are then called at the appropriate times. + * + * The range tree code also supports a special variant of range trees + * that can bridge small gaps between segments. This kind of tree is used + * by the dsl scanning code to group I/Os into mostly sequential chunks to + * optimize disk performance. The code here attempts to do this with as + * little memory and computational overhead as possible. One limitation of + * this implementation is that segments of range trees with gaps can only + * support removing complete segments. + */ + kmem_cache_t *range_seg_cache; +/* Generic ops for managing an AVL tree alongside a range tree */ +struct range_tree_ops rt_avl_ops = { + .rtop_create = rt_avl_create, + .rtop_destroy = rt_avl_destroy, + .rtop_add = rt_avl_add, + .rtop_remove = rt_avl_remove, + .rtop_vacate = rt_avl_vacate, +}; + void range_tree_init(void) { @@ -75,6 +125,18 @@ range_tree_stat_verify(range_tree_t *rt) } } +/* + * Changes out the lock used by the range tree. Useful when you are moving + * the range tree between containing structures without having to recreate + * it. Both the old and new locks must be held by the caller. 
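
To make the gap-bridging behavior described in the range tree overview above concrete, here is a contrived sketch (not part of the patch; the offsets and lock are invented for illustration) of how a tree created with a 4 KiB gap merges two nearby additions:

/*
 * Contrived illustration, assuming the in-tree <sys/range_tree.h>
 * interfaces shown in this change.
 */
static kmutex_t rt_lock;		/* assume mutex_init()ed elsewhere */

static void
example_gap_bridging(void)
{
	/* no callbacks or secondary tree; bridge gaps of up to 0x1000 bytes */
	range_tree_t *rt = range_tree_create_impl(NULL, NULL, NULL,
	    &rt_lock, 0x1000);

	mutex_enter(&rt_lock);
	range_tree_add(rt, 0x20000, 0x2000);	/* segment [0x20000, 0x22000) */
	range_tree_add(rt, 0x22800, 0x2000);	/* 0x800 gap <= rt_gap: bridged */
	/*
	 * The tree now holds one segment [0x20000, 0x24800) with
	 * rs_fill == 0x4000; range_tree_space() reports 0x4800 because
	 * the 0x800 of bridged gap is counted as well.
	 */
	mutex_exit(&rt_lock);
}
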
+ */ +void +range_tree_set_lock(range_tree_t *rt, kmutex_t *lp) +{ + ASSERT(MUTEX_HELD(rt->rt_lock) && MUTEX_HELD(lp)); + rt->rt_lock = lp; +} + static void range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs) { @@ -121,31 +183,38 @@ range_tree_seg_compare(const void *x1, const void *x2) } range_tree_t * -range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp) +range_tree_create_impl(range_tree_ops_t *ops, void *arg, + int (*avl_compare) (const void *, const void *), kmutex_t *lp, uint64_t gap) { - range_tree_t *rt; - - rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP); + range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP); avl_create(&rt->rt_root, range_tree_seg_compare, sizeof (range_seg_t), offsetof(range_seg_t, rs_node)); rt->rt_lock = lp; rt->rt_ops = ops; + rt->rt_gap = gap; rt->rt_arg = arg; + rt->rt_avl_compare = avl_compare; - if (rt->rt_ops != NULL) + if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL) rt->rt_ops->rtop_create(rt, rt->rt_arg); return (rt); } +range_tree_t * +range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp) +{ + return (range_tree_create_impl(ops, arg, NULL, lp, 0)); +} + void range_tree_destroy(range_tree_t *rt) { VERIFY0(rt->rt_space); - if (rt->rt_ops != NULL) + if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL) rt->rt_ops->rtop_destroy(rt, rt->rt_arg); avl_destroy(&rt->rt_root); @@ -153,40 +222,102 @@ range_tree_destroy(range_tree_t *rt) } void -range_tree_add(void *arg, uint64_t start, uint64_t size) +range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta) +{ + ASSERT(MUTEX_HELD(rt->rt_lock)); + + ASSERT3U(rs->rs_fill + delta, !=, 0); + ASSERT3U(rs->rs_fill + delta, <=, rs->rs_end - rs->rs_start); + + if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) + rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); + rs->rs_fill += delta; + if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) + rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); +} + +static void +range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) { range_tree_t *rt = arg; avl_index_t where; range_seg_t rsearch, *rs_before, *rs_after, *rs; - uint64_t end = start + size; + uint64_t end = start + size, gap = rt->rt_gap; + uint64_t bridge_size = 0; boolean_t merge_before, merge_after; ASSERT(MUTEX_HELD(rt->rt_lock)); - VERIFY(size != 0); + ASSERT3U(size, !=, 0); + ASSERT3U(fill, <=, size); rsearch.rs_start = start; rsearch.rs_end = end; rs = avl_find(&rt->rt_root, &rsearch, &where); - if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end) { + if (gap == 0 && rs != NULL && + rs->rs_start <= start && rs->rs_end >= end) { zfs_panic_recover("zfs: allocating allocated segment" - "(offset=%llu size=%llu)\n", - (longlong_t)start, (longlong_t)size); + "(offset=%llu size=%llu) of (offset=%llu size=%llu)\n", + (longlong_t)start, (longlong_t)size, + (longlong_t)rs->rs_start, + (longlong_t)rs->rs_end - rs->rs_start); + return; + } + + /* + * If this is a gap-supporting range tree, it is possible that we + * are inserting into an existing segment. In this case simply + * bump the fill count and call the remove / add callbacks. If the + * new range will extend an existing segment, we remove the + * existing one, apply the new extent to it and re-insert it using + * the normal code paths. 
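
Continuing the contrived gap-tree sketch shown after the range tree overview: an addition that falls entirely inside an existing segment takes the fill-bump path just described rather than creating a new segment (illustrative only, not part of the patch).

	/*
	 * rt still holds the single segment [0x20000, 0x24800) with
	 * rs_fill == 0x4000.  The addition below lies entirely inside it,
	 * so only range_tree_adjust_fill() runs: rs_fill becomes 0x4800
	 * and range_tree_space() is unchanged.
	 */
	mutex_enter(&rt_lock);
	range_tree_add(rt, 0x22000, 0x800);
	mutex_exit(&rt_lock);
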
+ */ + if (rs != NULL) { + ASSERT3U(gap, !=, 0); + if (rs->rs_start <= start && rs->rs_end >= end) { + range_tree_adjust_fill(rt, rs, fill); + return; + } + + avl_remove(&rt->rt_root, rs); + if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) + rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); + + range_tree_stat_decr(rt, rs); + rt->rt_space -= rs->rs_end - rs->rs_start; + + fill += rs->rs_fill; + start = MIN(start, rs->rs_start); + end = MAX(end, rs->rs_end); + size = end - start; + + range_tree_add_impl(rt, start, size, fill); + + kmem_cache_free(range_seg_cache, rs); return; } - /* Make sure we don't overlap with either of our neighbors */ - VERIFY(rs == NULL); + ASSERT3P(rs, ==, NULL); + /* + * Determine whether or not we will have to merge with our neighbors. + * If gap != 0, we might need to merge with our neighbors even if we + * aren't directly touching. + */ rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE); rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER); - merge_before = (rs_before != NULL && rs_before->rs_end == start); - merge_after = (rs_after != NULL && rs_after->rs_start == end); + merge_before = (rs_before != NULL && rs_before->rs_end >= start - gap); + merge_after = (rs_after != NULL && rs_after->rs_start <= end + gap); + + if (merge_before && gap != 0) + bridge_size += start - rs_before->rs_end; + if (merge_after && gap != 0) + bridge_size += rs_after->rs_start - end; if (merge_before && merge_after) { avl_remove(&rt->rt_root, rs_before); - if (rt->rt_ops != NULL) { + if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) { rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg); rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg); } @@ -194,43 +325,59 @@ range_tree_add(void *arg, uint64_t start, uint64_t size) range_tree_stat_decr(rt, rs_before); range_tree_stat_decr(rt, rs_after); + rs_after->rs_fill += rs_before->rs_fill + fill; rs_after->rs_start = rs_before->rs_start; kmem_cache_free(range_seg_cache, rs_before); rs = rs_after; } else if (merge_before) { - if (rt->rt_ops != NULL) + if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg); range_tree_stat_decr(rt, rs_before); + rs_before->rs_fill += fill; rs_before->rs_end = end; rs = rs_before; } else if (merge_after) { - if (rt->rt_ops != NULL) + if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg); range_tree_stat_decr(rt, rs_after); + rs_after->rs_fill += fill; rs_after->rs_start = start; rs = rs_after; } else { rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP); + + rs->rs_fill = fill; rs->rs_start = start; rs->rs_end = end; avl_insert(&rt->rt_root, rs, where); } - if (rt->rt_ops != NULL) + if (gap != 0) + ASSERT3U(rs->rs_fill, <=, rs->rs_end - rs->rs_start); + else + ASSERT3U(rs->rs_fill, ==, rs->rs_end - rs->rs_start); + + if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); range_tree_stat_incr(rt, rs); - rt->rt_space += size; + rt->rt_space += size + bridge_size; } void -range_tree_remove(void *arg, uint64_t start, uint64_t size) +range_tree_add(void *arg, uint64_t start, uint64_t size) +{ + range_tree_add_impl(arg, start, size, size); +} + +static void +range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size, + boolean_t do_fill) { - range_tree_t *rt = arg; avl_index_t where; range_seg_t rsearch, *rs, *newseg; uint64_t end = start + size; @@ -251,6 +398,34 @@ range_tree_remove(void *arg, uint64_t start, uint64_t size) 
(longlong_t)start, (longlong_t)size); return; } + + /* + * Range trees with gap support must only remove complete segments + * from the tree. This allows us to maintain accurate fill accounting + * and to ensure that bridged sections are not leaked. If we need to + * remove less than the full segment, we can only adjust the fill count. + */ + if (rt->rt_gap != 0) { + if (do_fill) { + if (rs->rs_fill == size) { + start = rs->rs_start; + end = rs->rs_end; + size = end - start; + } else { + range_tree_adjust_fill(rt, rs, -size); + return; + } + } else if (rs->rs_start != start || rs->rs_end != end) { + zfs_panic_recover("zfs: freeing partial segment of " + "gap tree (offset=%llu size=%llu) of " + "(offset=%llu size=%llu)", + (longlong_t)start, (longlong_t)size, + (longlong_t)rs->rs_start, + (longlong_t)rs->rs_end - rs->rs_start); + return; + } + } + VERIFY3U(rs->rs_start, <=, start); VERIFY3U(rs->rs_end, >=, end); @@ -259,19 +434,20 @@ range_tree_remove(void *arg, uint64_t start, uint64_t size) range_tree_stat_decr(rt, rs); - if (rt->rt_ops != NULL) + if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); if (left_over && right_over) { newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP); newseg->rs_start = end; newseg->rs_end = rs->rs_end; + newseg->rs_fill = newseg->rs_end - newseg->rs_start; range_tree_stat_incr(rt, newseg); rs->rs_end = start; avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER); - if (rt->rt_ops != NULL) + if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg); } else if (left_over) { rs->rs_end = start; @@ -284,15 +460,55 @@ range_tree_remove(void *arg, uint64_t start, uint64_t size) } if (rs != NULL) { + /* + * The fill of the leftover segment will always be equal to + * the size, since we do not support removing partial segments + * of range trees with gaps. 
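
Continuing the same contrived sketch: on a gap tree, removing less than a segment's total fill goes through range_tree_remove_fill(), which only decrements rs_fill, while range_tree_remove() is reserved for dropping an exact, complete segment; anything else trips the zfs_panic_recover() check above. Illustrative only, not part of the patch.

	mutex_enter(&rt_lock);
	/* rs_fill: 0x4800 -> 0x2800; the segment itself stays in place */
	range_tree_remove_fill(rt, 0x20000, 0x2000);
	/* exact segment bounds, so the whole segment is dropped */
	range_tree_remove(rt, 0x20000, 0x4800);
	mutex_exit(&rt_lock);
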
+ */ + rs->rs_fill = rs->rs_end - rs->rs_start; range_tree_stat_incr(rt, rs); - if (rt->rt_ops != NULL) + if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); } rt->rt_space -= size; } +void +range_tree_remove(void *arg, uint64_t start, uint64_t size) +{ + range_tree_remove_impl(arg, start, size, B_FALSE); +} + +void +range_tree_remove_fill(range_tree_t *rt, uint64_t start, uint64_t size) +{ + range_tree_remove_impl(rt, start, size, B_TRUE); +} + +void +range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs, + uint64_t newstart, uint64_t newsize) +{ + int64_t delta = newsize - (rs->rs_end - rs->rs_start); + + ASSERT(MUTEX_HELD(rt->rt_lock)); + + range_tree_stat_decr(rt, rs); + if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) + rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); + + rs->rs_start = newstart; + rs->rs_end = newstart + newsize; + + range_tree_stat_incr(rt, rs); + if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) + rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); + + rt->rt_space += delta; +} + static range_seg_t * range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size) { @@ -308,7 +524,7 @@ range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size) return (avl_find(&rt->rt_root, &rsearch, &where)); } -static range_seg_t * +range_seg_t * range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size) { range_seg_t *rs = range_tree_find_impl(rt, start, size); @@ -373,7 +589,7 @@ range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg) ASSERT(MUTEX_HELD(rt->rt_lock)); - if (rt->rt_ops != NULL) + if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL) rt->rt_ops->rtop_vacate(rt, rt->rt_arg); while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) { @@ -397,8 +613,60 @@ range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg) func(arg, rs->rs_start, rs->rs_end - rs->rs_start); } +range_seg_t * +range_tree_first(range_tree_t *rt) +{ + ASSERT(MUTEX_HELD(rt->rt_lock)); + return (avl_first(&rt->rt_root)); +} + uint64_t range_tree_space(range_tree_t *rt) { return (rt->rt_space); } + +/* Generic range tree functions for maintaining segments in an AVL tree. */ +void +rt_avl_create(range_tree_t *rt, void *arg) +{ + avl_tree_t *tree = arg; + + avl_create(tree, rt->rt_avl_compare, sizeof (range_seg_t), + offsetof(range_seg_t, rs_pp_node)); +} + +void +rt_avl_destroy(range_tree_t *rt, void *arg) +{ + avl_tree_t *tree = arg; + + ASSERT0(avl_numnodes(tree)); + avl_destroy(tree); +} + +void +rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg) +{ + avl_tree_t *tree = arg; + avl_add(tree, rs); +} + +void +rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg) +{ + avl_tree_t *tree = arg; + avl_remove(tree, rs); +} + +void +rt_avl_vacate(range_tree_t *rt, void *arg) +{ + /* + * Normally one would walk the tree freeing nodes along the way. + * Since the nodes are shared with the range trees we can avoid + * walking all nodes and just reinitialize the avl tree. The nodes + * will be freed by the range tree, so we don't want to free them here. 
+ */ + rt_avl_create(rt, arg); +} diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 0604742ab..e06190f9d 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1996,7 +1996,7 @@ spa_load_verify_done(zio_t *zio) } mutex_enter(&spa->spa_scrub_lock); - spa->spa_scrub_inflight--; + spa->spa_load_verify_ios--; cv_broadcast(&spa->spa_scrub_io_cv); mutex_exit(&spa->spa_scrub_lock); } @@ -2030,9 +2030,9 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, size_t size = BP_GET_PSIZE(bp); mutex_enter(&spa->spa_scrub_lock); - while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight) + while (spa->spa_load_verify_ios >= spa_load_verify_maxinflight) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); - spa->spa_scrub_inflight++; + spa->spa_load_verify_ios++; mutex_exit(&spa->spa_scrub_lock); zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size, diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 9a3290e95..116b0ebd9 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -1892,6 +1892,7 @@ spa_init(int mode) zpool_feature_init(); spa_config_load(); l2arc_start(); + scan_init(); qat_init(); } @@ -1915,6 +1916,7 @@ spa_fini(void) unique_fini(); refcount_fini(); fm_fini(); + scan_fini(); qat_fini(); avl_destroy(&spa_namespace_avl); @@ -2016,6 +2018,7 @@ spa_scan_stat_init(spa_t *spa) spa->spa_scan_pass_scrub_pause = 0; spa->spa_scan_pass_scrub_spent_paused = 0; spa->spa_scan_pass_exam = 0; + spa->spa_scan_pass_issued = 0; vdev_scan_stat_init(spa->spa_root_vdev); } @@ -2033,18 +2036,21 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) /* data stored on disk */ ps->pss_func = scn->scn_phys.scn_func; + ps->pss_state = scn->scn_phys.scn_state; ps->pss_start_time = scn->scn_phys.scn_start_time; ps->pss_end_time = scn->scn_phys.scn_end_time; ps->pss_to_examine = scn->scn_phys.scn_to_examine; - ps->pss_examined = scn->scn_phys.scn_examined; ps->pss_to_process = scn->scn_phys.scn_to_process; ps->pss_processed = scn->scn_phys.scn_processed; ps->pss_errors = scn->scn_phys.scn_errors; - ps->pss_state = scn->scn_phys.scn_state; + ps->pss_examined = scn->scn_phys.scn_examined; + ps->pss_issued = + scn->scn_issued_before_pass + spa->spa_scan_pass_issued; /* data not stored on disk */ ps->pss_pass_start = spa->spa_scan_pass_start; ps->pss_pass_exam = spa->spa_scan_pass_exam; + ps->pss_pass_issued = spa->spa_scan_pass_issued; ps->pss_pass_scrub_pause = spa->spa_scan_pass_scrub_pause; ps->pss_pass_scrub_spent_paused = spa->spa_scan_pass_scrub_spent_paused; diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 2df0040af..9edeaf525 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -360,6 +360,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL); for (int t = 0; t < DTL_TYPES; t++) { vd->vdev_dtl[t] = range_tree_create(NULL, NULL, @@ -648,6 +649,18 @@ vdev_free(vdev_t *vd) spa_t *spa = vd->vdev_spa; /* + * Scan queues are normally destroyed at the end of a scan. If the + * queue exists here, that implies the vdev is being removed while + * the scan is still running. 
+ */ + if (vd->vdev_scan_io_queue != NULL) { + mutex_enter(&vd->vdev_scan_io_queue_lock); + dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue); + vd->vdev_scan_io_queue = NULL; + mutex_exit(&vd->vdev_scan_io_queue_lock); + } + + /* * vdev_free() implies closing the vdev first. This is simpler than * trying to ensure complicated semantics for all callers. */ @@ -723,6 +736,7 @@ vdev_free(vdev_t *vd) mutex_destroy(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_stat_lock); mutex_destroy(&vd->vdev_probe_lock); + mutex_destroy(&vd->vdev_scan_io_queue_lock); zfs_ratelimit_fini(&vd->vdev_delay_rl); zfs_ratelimit_fini(&vd->vdev_checksum_rl); @@ -800,6 +814,8 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) tvd->vdev_islog = svd->vdev_islog; svd->vdev_islog = 0; + + dsl_scan_io_queue_vdev_xfer(svd, tvd); } static void diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index 36a4bf629..792642952 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -169,7 +169,7 @@ int zfs_vdev_async_write_active_max_dirty_percent = 60; * we include spans of optional I/Os to aid aggregation at the disk even when * they aren't able to help us aggregate at this level. */ -int zfs_vdev_aggregation_limit = SPA_OLD_MAXBLOCKSIZE; +int zfs_vdev_aggregation_limit = 1 << 20; int zfs_vdev_read_gap_limit = 32 << 10; int zfs_vdev_write_gap_limit = 4 << 10; diff --git a/module/zfs/zap.c b/module/zfs/zap.c index 6d1b860cc..2f6aed667 100644 --- a/module/zfs/zap.c +++ b/module/zfs/zap.c @@ -1070,7 +1070,7 @@ zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj, } err = zap_add(os, intoobj, za.za_name, 8, 1, &value, tx); - if (err) + if (err != 0) break; } zap_cursor_fini(&zc); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 4cfda7a9e..311f79e23 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -39,6 +39,7 @@ #include <sys/ddt.h> #include <sys/blkptr.h> #include <sys/zfeature.h> +#include <sys/dsl_scan.h> #include <sys/metaslab_impl.h> #include <sys/time.h> #include <sys/trace_zio.h> @@ -1050,6 +1051,7 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, metaslab_check_free(spa, bp); arc_freed(spa, bp); + dsl_scan_freed(spa, bp); /* * GANG and DEDUP blocks can induce a read (for the gang block header, @@ -3333,26 +3335,6 @@ zio_vdev_io_start(zio_t *zio) ASSERT3P(zio->io_logical, !=, zio); - /* - * We keep track of time-sensitive I/Os so that the scan thread - * can quickly react to certain workloads. In particular, we care - * about non-scrubbing, top-level reads and writes with the following - * characteristics: - * - synchronous writes of user data to non-slog devices - * - any reads of user data - * When these conditions are met, adjust the timestamp of spa_last_io - * which allows the scan thread to adjust its workload accordingly. - */ - if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL && - vd == vd->vdev_top && !vd->vdev_islog && - zio->io_bookmark.zb_objset != DMU_META_OBJSET && - zio->io_txg != spa_syncing_txg(spa)) { - uint64_t old = spa->spa_last_io; - uint64_t new = ddi_get_lbolt64(); - if (old != new) - (void) atomic_cas_64(&spa->spa_last_io, old, new); - } - align = 1ULL << vd->vdev_top->vdev_ashift; if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && |