Sequential scrub and resilvers

Currently, scrubs and resilvers can take an extremely long time to complete. This is largely due to the fact that zfs scans process pools in logical order, as determined by each block's bookmark. This makes sense from a simplicity perspective, but blocks in zfs are often scattered randomly across disks, particularly due to zfs's copy-on-write mechanisms. This patch improves performance by splitting scrubs and resilvers into a metadata scanning phase and an IO issuing phase. The metadata scan reads through the structure of the pool and gathers an in-memory queue of I/Os, sorted by size and offset on disk. The issuing phase will then issue the scrub I/Os as sequentially as possible, greatly improving performance. This patch also updates and cleans up some of the scan code which has not been updated in several years. Reviewed-by: Brian Behlendorf <[email protected]> Authored-by: Saso Kiselkov <[email protected]> Authored-by: Alek Pinchuk <[email protected]> Authored-by: Tom Caputi <[email protected]> Signed-off-by: Tom Caputi <[email protected]> Closes #3625 Closes #6256
author: Tom Caputi <[email protected]> 2017-11-15 20:27:01 -0500
committer: Brian Behlendorf <[email protected]> 2017-11-15 17:27:01 -0800
commit: d4a72f23863382bdf6d0ae33196f5b5decbc48fd (patch)
tree: 1084ea930b9a1ef46e58d1757943ab3ad66c22c4 /module
parent: e301113c17673a290098850830cf2e6d1a1fcbe3 (diff)
14 files changed, 2669 insertions, 644 deletions
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index cd343b04e..698357632 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -357,7 +357,8 @@ int			arc_no_grow_shift = 5;
  * minimum lifespan of a prefetch block in clock ticks
  * (initialized in arc_init())
  */
-static int		arc_min_prefetch_lifespan;
+static int		arc_min_prefetch_ms;
+static int		arc_min_prescient_prefetch_ms;
 
 /*
  * If this percent of memory is free, don't throttle.
@@ -407,7 +408,8 @@ unsigned long zfs_arc_dnode_limit_percent = 10;
  * These tunables are Linux specific
  */
 unsigned long zfs_arc_sys_free = 0;
-int zfs_arc_min_prefetch_lifespan = 0;
+int zfs_arc_min_prefetch_ms = 0;
+int zfs_arc_min_prescient_prefetch_ms = 0;
 int zfs_arc_p_aggressive_disable = 1;
 int zfs_arc_p_dampener_disable = 1;
 int zfs_arc_meta_prune = 10000;
@@ -663,6 +665,7 @@ typedef struct arc_stats {
 	kstat_named_t arcstat_meta_min;
 	kstat_named_t arcstat_sync_wait_for_async;
 	kstat_named_t arcstat_demand_hit_predictive_prefetch;
+	kstat_named_t arcstat_demand_hit_prescient_prefetch;
 	kstat_named_t arcstat_need_free;
 	kstat_named_t arcstat_sys_free;
 	kstat_named_t arcstat_raw_size;
@@ -762,6 +765,7 @@ static arc_stats_t arc_stats = {
 	{ "arc_meta_min",		KSTAT_DATA_UINT64 },
 	{ "sync_wait_for_async",	KSTAT_DATA_UINT64 },
 	{ "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
+	{ "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 },
 	{ "arc_need_free",		KSTAT_DATA_UINT64 },
 	{ "arc_sys_free",		KSTAT_DATA_UINT64 },
 	{ "arc_raw_size",		KSTAT_DATA_UINT64 }
@@ -861,6 +865,8 @@ static taskq_t *arc_prune_taskq;
 #define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
 #define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_FLAG_IO_ERROR)
 #define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_FLAG_PREFETCH)
+#define	HDR_PRESCIENT_PREFETCH(hdr)	\
+	((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH)
 #define	HDR_COMPRESSION_ENABLED(hdr)	\
 	((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)
 
@@ -3778,6 +3784,8 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
 {
 	arc_state_t *evicted_state, *state;
 	int64_t bytes_evicted = 0;
+	int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ?
+	    arc_min_prescient_prefetch_ms : arc_min_prefetch_ms;
 
 	ASSERT(MUTEX_HELD(hash_lock));
 	ASSERT(HDR_HAS_L1HDR(hdr));
@@ -3831,8 +3839,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
 	/* prefetch buffers have a minimum lifespan */
 	if (HDR_IO_IN_PROGRESS(hdr) ||
 	    ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
-	    ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
-	    arc_min_prefetch_lifespan)) {
+	    ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < min_lifetime * hz)) {
 		ARCSTAT_BUMP(arcstat_evict_skip);
 		return (bytes_evicted);
 	}
@@ -5492,13 +5499,15 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
 		 * - move the buffer to the head of the list if this is
 		 *   another prefetch (to make it less likely to be evicted).
 		 */
-		if (HDR_PREFETCH(hdr)) {
+		if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
 			if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
 				/* link protected by hash lock */
 				ASSERT(multilist_link_active(
 				    &hdr->b_l1hdr.b_arc_node));
 			} else {
-				arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
+				arc_hdr_clear_flags(hdr,
+				    ARC_FLAG_PREFETCH |
+				    ARC_FLAG_PRESCIENT_PREFETCH);
 				atomic_inc_32(&hdr->b_l1hdr.b_mru_hits);
 				ARCSTAT_BUMP(arcstat_mru_hits);
 			}
@@ -5532,10 +5541,13 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
 		 * MFU state.
 		 */
 
-		if (HDR_PREFETCH(hdr)) {
+		if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
 			new_state = arc_mru;
-			if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0)
-				arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
+			if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) {
+				arc_hdr_clear_flags(hdr,
+				    ARC_FLAG_PREFETCH |
+				    ARC_FLAG_PRESCIENT_PREFETCH);
+			}
 			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
 		} else {
 			new_state = arc_mfu;
@@ -5557,11 +5569,7 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
 		 * If it was a prefetch, we will explicitly move it to
 		 * the head of the list now.
 		 */
-		if ((HDR_PREFETCH(hdr)) != 0) {
-			ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
-			/* link protected by hash_lock */
-			ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node));
-		}
+
 		atomic_inc_32(&hdr->b_l1hdr.b_mfu_hits);
 		ARCSTAT_BUMP(arcstat_mfu_hits);
 		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
@@ -5573,12 +5581,11 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
 		 * MFU state.
 		 */
 
-		if (HDR_PREFETCH(hdr)) {
+		if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
 			/*
 			 * This is a prefetch access...
 			 * move this block back to the MRU state.
 			 */
-			ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
 			new_state = arc_mru;
 		}
 
@@ -5605,20 +5612,25 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
 /* a generic arc_read_done_func_t which you can use */
 /* ARGSUSED */
 void
-arc_bcopy_func(zio_t *zio, int error, arc_buf_t *buf, void *arg)
+arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
+    arc_buf_t *buf, void *arg)
 {
-	if (error == 0)
-		bcopy(buf->b_data, arg, arc_buf_size(buf));
+	if (buf == NULL)
+		return;
+
+	bcopy(buf->b_data, arg, arc_buf_size(buf));
 	arc_buf_destroy(buf, arg);
 }
 
 /* a generic arc_read_done_func_t */
+/* ARGSUSED */
 void
-arc_getbuf_func(zio_t *zio, int error, arc_buf_t *buf, void *arg)
+arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
+    arc_buf_t *buf, void *arg)
 {
 	arc_buf_t **bufp = arg;
-	if (error != 0) {
-		arc_buf_destroy(buf, arg);
+
+	if (buf == NULL) {
 		*bufp = NULL;
 	} else {
 		*bufp = buf;
@@ -5652,7 +5664,6 @@ arc_read_done(zio_t *zio)
 	arc_callback_t	*callback_list;
 	arc_callback_t	*acb;
 	boolean_t	freeable = B_FALSE;
-	boolean_t	no_zio_error = (zio->io_error == 0);
 
 	/*
 	 * The hdr was inserted into hash-table and removed from lists
@@ -5699,7 +5710,7 @@ arc_read_done(zio_t *zio)
 		}
 	}
 
-	if (no_zio_error) {
+	if (zio->io_error == 0) {
 		/* byteswap if necessary */
 		if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
 			if (BP_GET_LEVEL(zio->io_bp) > 0) {
@@ -5720,7 +5731,8 @@ arc_read_done(zio_t *zio)
 	callback_list = hdr->b_l1hdr.b_acb;
 	ASSERT3P(callback_list, !=, NULL);
 
-	if (hash_lock && no_zio_error && hdr->b_l1hdr.b_state == arc_anon) {
+	if (hash_lock && zio->io_error == 0 &&
+	    hdr->b_l1hdr.b_state == arc_anon) {
 		/*
 		 * Only call arc_access on anonymous buffers.  This is because
 		 * if we've issued an I/O for an evicted buffer, we've already
@@ -5741,13 +5753,19 @@ arc_read_done(zio_t *zio)
 		if (!acb->acb_done)
 			continue;
 
-		/* This is a demand read since prefetches don't use callbacks */
 		callback_cnt++;
 
+		if (zio->io_error != 0)
+			continue;
+
 		int error = arc_buf_alloc_impl(hdr, zio->io_spa,
 		    acb->acb_dsobj, acb->acb_private, acb->acb_encrypted,
-		    acb->acb_compressed, acb->acb_noauth, no_zio_error,
+		    acb->acb_compressed, acb->acb_noauth, B_TRUE,
 		    &acb->acb_buf);
+		if (error != 0) {
+			arc_buf_destroy(acb->acb_buf, acb->acb_private);
+			acb->acb_buf = NULL;
+		}
 
 		/*
 		 * Assert non-speculative zios didn't fail because an
@@ -5770,9 +5788,8 @@ arc_read_done(zio_t *zio)
 			}
 		}
 
-		if (no_zio_error) {
+		if (zio->io_error == 0)
 			zio->io_error = error;
-		}
 	}
 	hdr->b_l1hdr.b_acb = NULL;
 	arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
@@ -5782,7 +5799,7 @@ arc_read_done(zio_t *zio)
 	ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
 	    callback_list != NULL);
 
-	if (no_zio_error) {
+	if (zio->io_error == 0) {
 		arc_hdr_verify(hdr, zio->io_bp);
 	} else {
 		arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
@@ -5816,8 +5833,8 @@ arc_read_done(zio_t *zio)
 	/* execute each callback and free its structure */
 	while ((acb = callback_list) != NULL) {
 		if (acb->acb_done) {
-			acb->acb_done(zio, zio->io_error, acb->acb_buf,
-			    acb->acb_private);
+			acb->acb_done(zio, &zio->io_bookmark, zio->io_bp,
+			    acb->acb_buf, acb->acb_private);
 		}
 
 		if (acb->acb_zio_dummy != NULL) {
@@ -5974,12 +5991,25 @@ top:
 				arc_hdr_clear_flags(hdr,
 				    ARC_FLAG_PREDICTIVE_PREFETCH);
 			}
+
+			if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) {
+				ARCSTAT_BUMP(
+				    arcstat_demand_hit_prescient_prefetch);
+				arc_hdr_clear_flags(hdr,
+				    ARC_FLAG_PRESCIENT_PREFETCH);
+			}
+
 			ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp));
 
 			/* Get a buf with the desired data in it. */
 			rc = arc_buf_alloc_impl(hdr, spa, zb->zb_objset,
 			    private, encrypted_read, compressed_read,
 			    noauth_read, B_TRUE, &buf);
+			if (rc != 0) {
+				arc_buf_destroy(buf, private);
+				buf = NULL;
+			}
+
 			ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) || rc == 0);
 		} else if (*arc_flags & ARC_FLAG_PREFETCH &&
 		    refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
@@ -5987,6 +6017,8 @@ top:
 		}
 		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
 		arc_access(hdr, hash_lock);
+		if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
+			arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
 		if (*arc_flags & ARC_FLAG_L2CACHE)
 			arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
 		mutex_exit(hash_lock);
@@ -5996,7 +6028,7 @@ top:
 		    data, metadata, hits);
 
 		if (done)
-			done(NULL, rc, buf, private);
+			done(NULL, zb, bp, buf, private);
 	} else {
 		uint64_t lsize = BP_GET_LSIZE(bp);
 		uint64_t psize = BP_GET_PSIZE(bp);
@@ -6112,6 +6144,8 @@ top:
 		if (*arc_flags & ARC_FLAG_PREFETCH &&
 		    refcount_is_zero(&hdr->b_l1hdr.b_refcnt))
 			arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
+		if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
+			arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
 		if (*arc_flags & ARC_FLAG_L2CACHE)
 			arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
 		if (BP_IS_AUTHENTICATED(bp))
@@ -7223,9 +7257,15 @@ arc_tuning_update(void)
 	if (zfs_arc_p_min_shift)
 		arc_p_min_shift = zfs_arc_p_min_shift;
 
-	/* Valid range: 1 - N ticks */
-	if (zfs_arc_min_prefetch_lifespan)
-		arc_min_prefetch_lifespan = zfs_arc_min_prefetch_lifespan;
+	/* Valid range: 1 - N ms */
+	if (zfs_arc_min_prefetch_ms)
+		arc_min_prefetch_ms = zfs_arc_min_prefetch_ms;
+
+	/* Valid range: 1 - N ms */
+	if (zfs_arc_min_prescient_prefetch_ms) {
+		arc_min_prescient_prefetch_ms =
+		    zfs_arc_min_prescient_prefetch_ms;
+	}
 
 	/* Valid range: 0 - 100 */
 	if ((zfs_arc_lotsfree_percent >= 0) &&
@@ -7368,7 +7408,8 @@ arc_init(void)
 	cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL);
 
 	/* Convert seconds to clock ticks */
-	arc_min_prefetch_lifespan = 1 * hz;
+	arc_min_prefetch_ms = 1;
+	arc_min_prescient_prefetch_ms = 6;
 
 #ifdef _KERNEL
 	/*
@@ -9006,8 +9047,12 @@ MODULE_PARM_DESC(zfs_arc_average_blocksize, "Target average block size");
 module_param(zfs_compressed_arc_enabled, int, 0644);
 MODULE_PARM_DESC(zfs_compressed_arc_enabled, "Disable compressed arc buffers");
 
-module_param(zfs_arc_min_prefetch_lifespan, int, 0644);
-MODULE_PARM_DESC(zfs_arc_min_prefetch_lifespan, "Min life of prefetch block");
+module_param(zfs_arc_min_prefetch_ms, int, 0644);
+MODULE_PARM_DESC(zfs_arc_min_prefetch_ms, "Min life of prefetch block in ms");
+
+module_param(zfs_arc_min_prescient_prefetch_ms, int, 0644);
+MODULE_PARM_DESC(zfs_arc_min_prescient_prefetch_ms,
+	"Min life of prescient prefetched block in ms");
 
 module_param(l2arc_write_max, ulong, 0644);
 MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval");
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 64c1a68af..190d0656a 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -973,7 +973,8 @@ dbuf_whichblock(const dnode_t *dn, const int64_t level, const uint64_t offset)
 }
 
 static void
-dbuf_read_done(zio_t *zio, int err, arc_buf_t *buf, void *vdb)
+dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
+    arc_buf_t *buf, void *vdb)
 {
 	dmu_buf_impl_t *db = vdb;
 
@@ -987,19 +988,22 @@ dbuf_read_done(zio_t *zio, int err, arc_buf_t *buf, void *vdb)
 	ASSERT(db->db.db_data == NULL);
 	if (db->db_level == 0 && db->db_freed_in_flight) {
 		/* we were freed in flight; disregard any error */
+		if (buf == NULL) {
+			buf = arc_alloc_buf(db->db_objset->os_spa,
+			    db, DBUF_GET_BUFC_TYPE(db), db->db.db_size);
+		}
 		arc_release(buf, db);
 		bzero(buf->b_data, db->db.db_size);
 		arc_buf_freeze(buf);
 		db->db_freed_in_flight = FALSE;
 		dbuf_set_data(db, buf);
 		db->db_state = DB_CACHED;
-	} else if (err == 0) {
+	} else if (buf != NULL) {
 		dbuf_set_data(db, buf);
 		db->db_state = DB_CACHED;
 	} else {
 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 		ASSERT3P(db->db_buf, ==, NULL);
-		arc_buf_destroy(buf, db);
 		db->db_state = DB_UNCACHED;
 	}
 	cv_broadcast(&db->db_changed);
@@ -2512,7 +2516,8 @@ dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
  * prefetch if the next block down is our target.
  */
 static void
-dbuf_prefetch_indirect_done(zio_t *zio, int err, arc_buf_t *abuf, void *private)
+dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
+    const blkptr_t *iobp, arc_buf_t *abuf, void *private)
 {
 	dbuf_prefetch_arg_t *dpa = private;
 
@@ -2551,13 +2556,18 @@ dbuf_prefetch_indirect_done(zio_t *zio, int err, arc_buf_t *abuf, void *private)
 		dbuf_rele(db, FTAG);
 	}
 
-	dpa->dpa_curlevel--;
+	if (abuf == NULL) {
+		kmem_free(dpa, sizeof (*dpa));
+		return;
+	}
 
+	dpa->dpa_curlevel--;
 	uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
 	    (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
 	blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
 	    P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
-	if (BP_IS_HOLE(bp) || err != 0) {
+
+	if (BP_IS_HOLE(bp)) {
 		kmem_free(dpa, sizeof (*dpa));
 	} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
 		ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
index 00b0a0b9e..24516834f 100644
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@ -1172,14 +1172,26 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
 void
 ddt_sync(spa_t *spa, uint64_t txg)
 {
+	dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
 	dmu_tx_t *tx;
-	zio_t *rio = zio_root(spa, NULL, NULL,
-	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
+	zio_t *rio;
 
 	ASSERT(spa_syncing_txg(spa) == txg);
 
 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
 
+	rio = zio_root(spa, NULL, NULL,
+	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
+
+	/*
+	 * This function may cause an immediate scan of ddt blocks (see
+	 * the comment above dsl_scan_ddt() for details). We set the
+	 * scan's root zio here so that we can wait for any scan IOs in
+	 * addition to the regular ddt IOs.
+	 */
+	ASSERT3P(scn->scn_zio_root, ==, NULL);
+	scn->scn_zio_root = rio;
+
 	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 		ddt_t *ddt = spa->spa_ddt[c];
 		if (ddt == NULL)
@@ -1189,6 +1201,7 @@ ddt_sync(spa_t *spa, uint64_t txg)
 	}
 
 	(void) zio_wait(rio);
+	scn->scn_zio_root = NULL;
 
 	dmu_tx_commit(tx);
 }
diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c
index 64e7d2f77..280e0ee34 100644
--- a/module/zfs/dmu_traverse.c
+++ b/module/zfs/dmu_traverse.c
@@ -520,7 +520,8 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 {
 	prefetch_data_t *pfd = arg;
 	int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
-	arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
+	arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
+	    ARC_FLAG_PRESCIENT_PREFETCH;
 
 	ASSERT(pfd->pd_bytes_fetched >= 0);
 	if (bp == NULL)
diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c
index 43fd90861..86863fad8 100644
--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c
@@ -390,8 +390,10 @@ dsl_pool_close(dsl_pool_t *dp)
 	mutex_destroy(&dp->dp_lock);
 	cv_destroy(&dp->dp_spaceavail_cv);
 	taskq_destroy(dp->dp_iput_taskq);
-	if (dp->dp_blkstats)
+	if (dp->dp_blkstats) {
+		mutex_destroy(&dp->dp_blkstats->zab_lock);
 		vmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
+	}
 	kmem_free(dp, sizeof (dsl_pool_t));
 }
 
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index b0aec5332..52c700f11 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -50,33 +50,141 @@
 #include <sys/sa_impl.h>
 #include <sys/zfeature.h>
 #include <sys/abd.h>
+#include <sys/range_tree.h>
 #ifdef _KERNEL
 #include <sys/zfs_vfsops.h>
 #endif
 
+/*
+ * Grand theory statement on scan queue sorting
+ *
+ * Scanning is implemented by recursively traversing all indirection levels
+ * in an object and reading all blocks referenced from said objects. This
+ * results in us approximately traversing the object from lowest logical
+ * offset to the highest. For best performance, we would want the logical
+ * blocks to be physically contiguous. However, this is frequently not the
+ * case with pools given the allocation patterns of copy-on-write filesystems.
+ * So instead, we put the I/Os into a reordering queue and issue them in a
+ * way that will most benefit physical disks (LBA-order).
+ *
+ * Queue management:
+ *
+ * Ideally, we would want to scan all metadata and queue up all block I/O
+ * prior to starting to issue it, because that allows us to do an optimal
+ * sorting job. This can however consume large amounts of memory. Therefore
+ * we continuously monitor the size of the queues and constrain them to 5%
+ * (1/zfs_scan_mem_lim_fact) of physmem. If the queues grow larger than this
+ * limit, we clear out a few of the largest extents at the head of the queues
+ * to make room for more scanning. Hopefully, these extents will be fairly
+ * large and contiguous, allowing us to approach sequential I/O throughput
+ * even without a fully sorted tree.
+ *
+ * Metadata scanning takes place in dsl_scan_visit(), which is called from
+ * dsl_scan_sync() every spa_sync(). If we have either fully scanned all
+ * metadata on the pool, or we need to make room in memory because our
+ * queues are too large, dsl_scan_visit() is postponed and
+ * scan_io_queues_run() is called from dsl_scan_sync() instead. This implies
+ * that metadata scanning and queued I/O issuing are mutually exclusive. This
+ * allows us to provide maximum sequential I/O throughput for the majority of
+ * I/O's issued since sequential I/O performance is significantly negatively
+ * impacted if it is interleaved with random I/O.
+ *
+ * Implementation Notes
+ *
+ * One side effect of the queued scanning algorithm is that the scanning code
+ * needs to be notified whenever a block is freed. This is needed to allow
+ * the scanning code to remove these I/Os from the issuing queue. Additionally,
+ * we do not attempt to queue gang blocks to be issued sequentially since this
+ * is very hard to do and would have an extremely limited performance benefit.
+ * Instead, we simply issue gang I/Os as soon as we find them using the legacy
+ * algorithm.
+ *
+ * Backwards compatibility
+ *
+ * This new algorithm is backwards compatible with the legacy on-disk data
+ * structures (and therefore does not require a new feature flag).
+ * Periodically during scanning (see zfs_scan_checkpoint_intval), the scan
+ * will stop scanning metadata (in logical order) and wait for all outstanding
+ * sorted I/O to complete. Once this is done, we write out a checkpoint
+ * bookmark, indicating that we have scanned everything logically before it.
+ * If the pool is imported on a machine without the new sorting algorithm,
+ * the scan simply resumes from the last checkpoint using the legacy algorithm.
+ */
+
 typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *,
     const zbookmark_phys_t *);
 
 static scan_cb_t dsl_scan_scrub_cb;
-static void dsl_scan_cancel_sync(void *, dmu_tx_t *);
-static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *);
-static boolean_t dsl_scan_restarting(dsl_scan_t *, dmu_tx_t *);
 
-int zfs_top_maxinflight = 32;		/* maximum I/Os per top-level */
-int zfs_resilver_delay = 2;		/* number of ticks to delay resilver */
-int zfs_scrub_delay = 4;		/* number of ticks to delay scrub */
-int zfs_scan_idle = 50;			/* idle window in clock ticks */
+static int scan_ds_queue_compare(const void *a, const void *b);
+static int scan_prefetch_queue_compare(const void *a, const void *b);
+static void scan_ds_queue_clear(dsl_scan_t *scn);
+static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj,
+    uint64_t *txg);
+static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg);
+static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj);
+static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx);
+
+extern int zfs_vdev_async_write_active_min_dirty_percent;
+
+/*
+ * By default zfs will check to ensure it is not over the hard memory
+ * limit before each txg. If finer-grained control of this is needed
+ * this value can be set to 1 to enable checking before scanning each
+ * block.
+ */
+int zfs_scan_strict_mem_lim = B_FALSE;
+
+/*
+ * Maximum number of bytes issued in parallel per leaf vdev. We attempt
+ * to strike a balance here between keeping the vdev queues full of I/Os
+ * at all times and not overflowing the queues to cause long latency,
+ * which would cause long txg sync times. No matter what, we will not
+ * overload the drives with I/O, since that is protected by
+ * zfs_vdev_scrub_max_active.
+ */
+unsigned long zfs_scan_vdev_limit = 4 << 20;
+
+int zfs_scan_issue_strategy = 0;
+int zfs_scan_legacy = B_FALSE; /* don't queue & sort zios, go direct */
+uint64_t zfs_scan_max_ext_gap = 2 << 20; /* in bytes */
+
+/*
+ * fill_weight is non-tunable at runtime, so we copy it at module init from
+ * zfs_scan_fill_weight. Runtime adjustments to zfs_scan_fill_weight would
+ * break queue sorting.
+ */
+int zfs_scan_fill_weight = 3;
+static uint64_t fill_weight;
+
+/* See dsl_scan_should_clear() for details on the memory limit tunables */
+uint64_t zfs_scan_mem_lim_min = 16 << 20;	/* bytes */
+uint64_t zfs_scan_mem_lim_soft_max = 128 << 20;	/* bytes */
+int zfs_scan_mem_lim_fact = 20;		/* fraction of physmem */
+int zfs_scan_mem_lim_soft_fact = 20;	/* fraction of mem lim above */
 
-int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
+int zfs_scrub_min_time_ms = 1000; /* min millisecs to scrub per txg */
 int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
 int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
+int zfs_scan_checkpoint_intval = 7200; /* in seconds */
 int zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
 int zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */
 enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
-int dsl_scan_delay_completion = B_FALSE; /* set to delay scan completion */
 /* max number of blocks to free in a single TXG */
 unsigned long zfs_free_max_blocks = 100000;
 
+/*
+ * We wait a few txgs after importing a pool to begin scanning so that
+ * the import / mounting code isn't held up by scrub / resilver IO.
+ * Unfortunately, it is a bit difficult to determine exactly how long
+ * this will take since userspace will trigger fs mounts asynchronously
+ * and the kernel will create zvol minors asynchronously. As a result,
+ * the value provided here is a bit arbitrary, but represents a
+ * reasonable estimate of how many txgs it will take to finish fully
+ * importing a pool
+ */
+#define	SCAN_IMPORT_WAIT_TXGS 		5
+
 #define	DSL_SCAN_IS_SCRUB_RESILVER(scn) \
 	((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
 	(scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)
@@ -93,6 +201,163 @@ static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
 	dsl_scan_scrub_cb,	/* POOL_SCAN_RESILVER */
 };
 
+/* In core node for the scn->scn_queue. Represents a dataset to be scanned */
+typedef struct {
+	uint64_t	sds_dsobj;
+	uint64_t	sds_txg;
+	avl_node_t	sds_node;
+} scan_ds_t;
+
+/*
+ * This controls what conditions are placed on dsl_scan_sync_state():
+ * SYNC_OPTIONAL) write out scn_phys iff scn_bytes_pending == 0
+ * SYNC_MANDATORY) write out scn_phys always. scn_bytes_pending must be 0.
+ * SYNC_CACHED) if scn_bytes_pending == 0, write out scn_phys. Otherwise
+ *	write out the scn_phys_cached version.
+ * See dsl_scan_sync_state for details.
+ */
+typedef enum {
+	SYNC_OPTIONAL,
+	SYNC_MANDATORY,
+	SYNC_CACHED
+} state_sync_type_t;
+
+/*
+ * This struct represents the minimum information needed to reconstruct a
+ * zio for sequential scanning. This is useful because many of these will
+ * accumulate in the sequential IO queues before being issued, so saving
+ * memory matters here.
+ */
+typedef struct scan_io {
+	/* fields from blkptr_t */
+	uint64_t		sio_offset;
+	uint64_t		sio_blk_prop;
+	uint64_t		sio_phys_birth;
+	uint64_t		sio_birth;
+	zio_cksum_t		sio_cksum;
+	uint32_t		sio_asize;
+
+	/* fields from zio_t */
+	int			sio_flags;
+	zbookmark_phys_t	sio_zb;
+
+	/* members for queue sorting */
+	union {
+		avl_node_t	sio_addr_node; /* link into issuing queue */
+		list_node_t	sio_list_node; /* link for issuing to disk */
+	} sio_nodes;
+} scan_io_t;
+
+struct dsl_scan_io_queue {
+	dsl_scan_t	*q_scn; /* associated dsl_scan_t */
+	vdev_t		*q_vd; /* top-level vdev that this queue represents */
+
+	/* trees used for sorting I/Os and extents of I/Os */
+	range_tree_t	*q_exts_by_addr;
+	avl_tree_t	q_exts_by_size;
+	avl_tree_t	q_sios_by_addr;
+
+	/* members for zio rate limiting */
+	uint64_t	q_maxinflight_bytes;
+	uint64_t	q_inflight_bytes;
+	kcondvar_t	q_zio_cv; /* used under vd->vdev_scan_io_queue_lock */
+
+	/* per txg statistics */
+	uint64_t	q_total_seg_size_this_txg;
+	uint64_t	q_segs_this_txg;
+	uint64_t	q_total_zio_size_this_txg;
+	uint64_t	q_zios_this_txg;
+};
+
+/* private data for dsl_scan_prefetch_cb() */
+typedef struct scan_prefetch_ctx {
+	refcount_t spc_refcnt;		/* refcount for memory management */
+	dsl_scan_t *spc_scn;		/* dsl_scan_t for the pool */
+	boolean_t spc_root;		/* is this prefetch for an objset? */
+	uint8_t spc_indblkshift;	/* dn_indblkshift of current dnode */
+	uint16_t spc_datablkszsec;	/* dn_datablkszsec of current dnode */
+} scan_prefetch_ctx_t;
+
+/* private data for dsl_scan_prefetch() */
+typedef struct scan_prefetch_issue_ctx {
+	avl_node_t spic_avl_node;	/* link into scn->scn_prefetch_queue */
+	scan_prefetch_ctx_t *spic_spc;	/* spc for the callback */
+	blkptr_t spic_bp;		/* bp to prefetch */
+	zbookmark_phys_t spic_zb;	/* bookmark to prefetch */
+} scan_prefetch_issue_ctx_t;
+
+static void scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
+    const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue);
+static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue,
+    scan_io_t *sio);
+
+static dsl_scan_io_queue_t *scan_io_queue_create(vdev_t *vd);
+static void scan_io_queues_destroy(dsl_scan_t *scn);
+
+static kmem_cache_t *sio_cache;
+
+void
+scan_init(void)
+{
+	/*
+	 * This is used in ext_size_compare() to weight segments
+	 * based on how sparse they are. This cannot be changed
+	 * mid-scan and the tree comparison functions don't currently
+	 * have a mechanism for passing additional context to the
+	 * compare functions. Thus we store this value globally and
+	 * we only allow it to be set at module initialization time.
+	 */
+	fill_weight = zfs_scan_fill_weight;
+
+	sio_cache = kmem_cache_create("sio_cache",
+	    sizeof (scan_io_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+}
+
+void
+scan_fini(void)
+{
+	kmem_cache_destroy(sio_cache);
+}
+
+static inline boolean_t
+dsl_scan_is_running(const dsl_scan_t *scn)
+{
+	return (scn->scn_phys.scn_state == DSS_SCANNING);
+}
+
+boolean_t
+dsl_scan_resilvering(dsl_pool_t *dp)
+{
+	return (dsl_scan_is_running(dp->dp_scan) &&
+	    dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER);
+}
+
+static inline void
+sio2bp(const scan_io_t *sio, blkptr_t *bp, uint64_t vdev_id)
+{
+	bzero(bp, sizeof (*bp));
+	DVA_SET_ASIZE(&bp->blk_dva[0], sio->sio_asize);
+	DVA_SET_VDEV(&bp->blk_dva[0], vdev_id);
+	DVA_SET_OFFSET(&bp->blk_dva[0], sio->sio_offset);
+	bp->blk_prop = sio->sio_blk_prop;
+	bp->blk_phys_birth = sio->sio_phys_birth;
+	bp->blk_birth = sio->sio_birth;
+	bp->blk_fill = 1;	/* we always only work with data pointers */
+	bp->blk_cksum = sio->sio_cksum;
+}
+
+static inline void
+bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i)
+{
+	/* we discard the vdev id, since we can deduce it from the queue */
+	sio->sio_offset = DVA_GET_OFFSET(&bp->blk_dva[dva_i]);
+	sio->sio_asize = DVA_GET_ASIZE(&bp->blk_dva[dva_i]);
+	sio->sio_blk_prop = bp->blk_prop;
+	sio->sio_phys_birth = bp->blk_phys_birth;
+	sio->sio_birth = bp->blk_birth;
+	sio->sio_cksum = bp->blk_cksum;
+}
+
 int
 dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
 {
@@ -113,6 +378,13 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
 	scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa,
 	    SPA_FEATURE_ASYNC_DESTROY);
 
+	bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys));
+	avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t),
+	    offsetof(scan_ds_t, sds_node));
+	avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare,
+	    sizeof (scan_prefetch_issue_ctx_t),
+	    offsetof(scan_prefetch_issue_ctx_t, spic_avl_node));
+
 	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    "scrub_func", sizeof (uint64_t), 1, &f);
 	if (err == 0) {
@@ -123,7 +395,7 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
 		scn->scn_restart_txg = txg;
 		zfs_dbgmsg("old-style scrub was in progress; "
 		    "restarting new-style scrub in txg %llu",
-		    scn->scn_restart_txg);
+		    (longlong_t)scn->scn_restart_txg);
 
 		/*
 		 * Load the queue obj from the old location so that it
@@ -157,7 +429,7 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
 				    scn->scn_async_destroying) {
 					spa->spa_errata =
 					    ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY;
-					return (SET_ERROR(EOVERFLOW));
+					return (EOVERFLOW);
 				}
 
 				bcopy(zaptmp, &scn->scn_phys,
@@ -177,7 +449,14 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
 		else if (err)
 			return (err);
 
-		if (scn->scn_phys.scn_state == DSS_SCANNING &&
+		/*
+		 * We might be restarting after a reboot, so jump the issued
+		 * counter to how far we've scanned. We know we're consistent
+		 * up to here.
+		 */
+		scn->scn_issued_before_pass = scn->scn_phys.scn_examined;
+
+		if (dsl_scan_is_running(scn) &&
 		    spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) {
 			/*
 			 * A new-type scrub was in progress on an old
@@ -189,8 +468,24 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
 			scn->scn_restart_txg = txg;
 			zfs_dbgmsg("new-style scrub was modified "
 			    "by old software; restarting in txg %llu",
-			    scn->scn_restart_txg);
+			    (longlong_t)scn->scn_restart_txg);
+		}
+	}
+
+	/* reload the queue into the in-core state */
+	if (scn->scn_phys.scn_queue_obj != 0) {
+		zap_cursor_t zc;
+		zap_attribute_t za;
+
+		for (zap_cursor_init(&zc, dp->dp_meta_objset,
+		    scn->scn_phys.scn_queue_obj);
+		    zap_cursor_retrieve(&zc, &za) == 0;
+		    (void) zap_cursor_advance(&zc)) {
+			scan_ds_queue_insert(scn,
+			    zfs_strtonum(za.za_name, NULL),
+			    za.za_first_integer);
 		}
+		zap_cursor_fini(&zc);
 	}
 
 	spa_scan_stat_init(spa);
@@ -200,19 +495,116 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
 void
 dsl_scan_fini(dsl_pool_t *dp)
 {
-	if (dp->dp_scan) {
+	if (dp->dp_scan != NULL) {
+		dsl_scan_t *scn = dp->dp_scan;
+
+		if (scn->scn_taskq != NULL)
+			taskq_destroy(scn->scn_taskq);
+		scan_ds_queue_clear(scn);
+		avl_destroy(&scn->scn_queue);
+		avl_destroy(&scn->scn_prefetch_queue);
+
 		kmem_free(dp->dp_scan, sizeof (dsl_scan_t));
 		dp->dp_scan = NULL;
 	}
 }
 
+static boolean_t
+dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+	return (scn->scn_restart_txg != 0 &&
+	    scn->scn_restart_txg <= tx->tx_txg);
+}
+
+boolean_t
+dsl_scan_scrubbing(const dsl_pool_t *dp)
+{
+	dsl_scan_phys_t *scn_phys = &dp->dp_scan->scn_phys;
+
+	return (scn_phys->scn_state == DSS_SCANNING &&
+	    scn_phys->scn_func == POOL_SCAN_SCRUB);
+}
+
+boolean_t
+dsl_scan_is_paused_scrub(const dsl_scan_t *scn)
+{
+	return (dsl_scan_scrubbing(scn->scn_dp) &&
+	    scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED);
+}
+
+/*
+ * Writes out a persistent dsl_scan_phys_t record to the pool directory.
+ * Because we can be running in the block sorting algorithm, we do not always
+ * want to write out the record, only when it is "safe" to do so. This safety
+ * condition is achieved by making sure that the sorting queues are empty
+ * (scn_bytes_pending == 0). When this condition is not true, the sync'd state
+ * is inconsistent with how much actual scanning progress has been made. The
+ * kind of sync to be performed is specified by the sync_type argument. If the
+ * sync is optional, we only sync if the queues are empty. If the sync is
+ * mandatory, we do a hard ASSERT to make sure that the queues are empty. The
+ * third possible kind is a "cached" sync. This is done in response to:
+ * 1) The dataset that was in the last sync'd dsl_scan_phys_t having been
+ *	destroyed, so we wouldn't be able to restart scanning from it.
+ * 2) The snapshot that was in the last sync'd dsl_scan_phys_t having been
+ *	superseded by a newer snapshot.
+ * 3) The dataset that was in the last sync'd dsl_scan_phys_t having been
+ *	swapped with its clone.
+ * If the queues are empty a cached sync behaves like any other sync; otherwise
+ * it rewrites the last record we've written (scn_phys_cached), just slightly
+ * modified. For those modifications see dsl_scan_ds_destroyed,
+ * dsl_scan_ds_snapshotted and dsl_scan_ds_clone_swapped.
+ */
+static void
+dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type)
+{
+	int i;
+	spa_t *spa = scn->scn_dp->dp_spa;
+
+	ASSERT(sync_type != SYNC_MANDATORY || scn->scn_bytes_pending == 0);
+	if (scn->scn_bytes_pending == 0) {
+		for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
+			vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
+			dsl_scan_io_queue_t *q = vd->vdev_scan_io_queue;
+
+			if (q == NULL)
+				continue;
+
+			mutex_enter(&vd->vdev_scan_io_queue_lock);
+			ASSERT3P(avl_first(&q->q_sios_by_addr), ==, NULL);
+			ASSERT3P(avl_first(&q->q_exts_by_size), ==, NULL);
+			ASSERT3P(range_tree_first(q->q_exts_by_addr), ==, NULL);
+			mutex_exit(&vd->vdev_scan_io_queue_lock);
+		}
+
+		if (scn->scn_phys.scn_queue_obj != 0)
+			scan_ds_queue_sync(scn, tx);
+		VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
+		    DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
+		    &scn->scn_phys, tx));
+		bcopy(&scn->scn_phys, &scn->scn_phys_cached,
+		    sizeof (scn->scn_phys));
+
+		if (scn->scn_checkpointing)
+			zfs_dbgmsg("finish scan checkpoint");
+
+		scn->scn_checkpointing = B_FALSE;
+		scn->scn_last_checkpoint = ddi_get_lbolt();
+	} else if (sync_type == SYNC_CACHED) {
+		VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
+		    DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
+		    &scn->scn_phys_cached, tx));
+	}
+}
+
 /* ARGSUSED */
 static int
 dsl_scan_setup_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
 
-	if (scn->scn_phys.scn_state == DSS_SCANNING)
+	if (dsl_scan_is_running(scn))
 		return (SET_ERROR(EBUSY));
 
 	return (0);
@@ -227,7 +619,7 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
 	dsl_pool_t *dp = scn->scn_dp;
 	spa_t *spa = dp->dp_spa;
 
-	ASSERT(scn->scn_phys.scn_state != DSS_SCANNING);
+	ASSERT(!dsl_scan_is_running(scn));
 	ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS);
 	bzero(&scn->scn_phys, sizeof (scn->scn_phys));
 	scn->scn_phys.scn_func = *funcp;
@@ -238,8 +630,11 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
 	scn->scn_phys.scn_start_time = gethrestime_sec();
 	scn->scn_phys.scn_errors = 0;
 	scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc;
+	scn->scn_issued_before_pass = 0;
 	scn->scn_restart_txg = 0;
 	scn->scn_done_txg = 0;
+	scn->scn_last_checkpoint = 0;
+	scn->scn_checkpointing = B_FALSE;
 	spa_scan_stat_init(spa);
 
 	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
@@ -272,8 +667,10 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
 	if (dp->dp_blkstats == NULL) {
 		dp->dp_blkstats =
 		    vmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
+		mutex_init(&dp->dp_blkstats->zab_lock, NULL,
+		    MUTEX_DEFAULT, NULL);
 	}
-	bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
+	bzero(&dp->dp_blkstats->zab_type, sizeof (dp->dp_blkstats->zab_type));
 
 	if (spa_version(spa) < SPA_VERSION_DSL_SCRUB)
 		ot = DMU_OT_ZAP_OTHER;
@@ -281,13 +678,52 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
 	scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset,
 	    ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx);
 
-	dsl_scan_sync_state(scn, tx);
+	bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys));
+
+	dsl_scan_sync_state(scn, tx, SYNC_MANDATORY);
 
 	spa_history_log_internal(spa, "scan setup", tx,
 	    "func=%u mintxg=%llu maxtxg=%llu",
 	    *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg);
 }
 
+/*
+ * Called by the ZFS_IOC_POOL_SCAN ioctl to start a scrub or resilver. A scrub
+ * request on a paused scrub resumes it instead and returns ECANCELED.
+ */
+int
+dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
+{
+	spa_t *spa = dp->dp_spa;
+	dsl_scan_t *scn = dp->dp_scan;
+
+	/*
+	 * Purge all vdev caches and probe all devices.  We do this here
+	 * rather than in sync context because this requires a writer lock
+	 * on the spa_config lock, which we can't do from sync context.  The
+	 * spa_scrub_reopen flag indicates that vdev_open() should not
+	 * attempt to start another scrub.
+	 */
+	spa_vdev_state_enter(spa, SCL_NONE);
+	spa->spa_scrub_reopen = B_TRUE;
+	vdev_reopen(spa->spa_root_vdev);
+	spa->spa_scrub_reopen = B_FALSE;
+	(void) spa_vdev_state_exit(spa, NULL, 0);
+
+	if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) {
+		/* got scrub start cmd, resume paused scrub */
+		int err = dsl_scrub_set_pause_resume(scn->scn_dp,
+		    POOL_SCRUB_NORMAL);
+		if (err == 0)
+			return (SET_ERROR(ECANCELED));
+
+		return (SET_ERROR(err));
+	}
+
+	return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check,
+	    dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_NONE));
+}
+
 /* ARGSUSED */
 static void
 dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
@@ -315,10 +751,11 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
 	}
 
 	if (scn->scn_phys.scn_queue_obj != 0) {
-		VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
+		VERIFY0(dmu_object_free(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, tx));
 		scn->scn_phys.scn_queue_obj = 0;
 	}
+	scan_ds_queue_clear(scn);
 
 	scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED;
 
@@ -326,13 +763,22 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
 	 * If we were "restarted" from a stopped state, don't bother
 	 * with anything else.
 	 */
-	if (scn->scn_phys.scn_state != DSS_SCANNING)
+	if (!dsl_scan_is_running(scn)) {
+		ASSERT(!scn->scn_is_sorted);
 		return;
+	}
 
-	if (complete)
-		scn->scn_phys.scn_state = DSS_FINISHED;
-	else
-		scn->scn_phys.scn_state = DSS_CANCELED;
+	if (scn->scn_is_sorted) {
+		scan_io_queues_destroy(scn);
+		scn->scn_is_sorted = B_FALSE;
+
+		if (scn->scn_taskq != NULL) {
+			taskq_destroy(scn->scn_taskq);
+			scn->scn_taskq = NULL;
+		}
+	}
+
+	scn->scn_phys.scn_state = complete ? DSS_FINISHED : DSS_CANCELED;
 
 	if (dsl_scan_restarting(scn, tx))
 		spa_history_log_internal(spa, "scan aborted, restarting", tx,
@@ -345,12 +791,6 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
 		    "errors=%llu", spa_get_errlog_size(spa));
 
 	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
-		mutex_enter(&spa->spa_scrub_lock);
-		while (spa->spa_scrub_inflight > 0) {
-			cv_wait(&spa->spa_scrub_io_cv,
-			    &spa->spa_scrub_lock);
-		}
-		mutex_exit(&spa->spa_scrub_lock);
 		spa->spa_scrub_started = B_FALSE;
 		spa->spa_scrub_active = B_FALSE;
 
@@ -379,6 +819,8 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
 
 	if (spa->spa_errata == ZPOOL_ERRATA_ZOL_2094_SCRUB)
 		spa->spa_errata = 0;
+
+	ASSERT(!dsl_scan_is_running(scn));
 }
 
 /* ARGSUSED */
@@ -387,7 +829,7 @@ dsl_scan_cancel_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
 
-	if (scn->scn_phys.scn_state != DSS_SCANNING)
+	if (!dsl_scan_is_running(scn))
 		return (SET_ERROR(ENOENT));
 	return (0);
 }
@@ -399,7 +841,7 @@ dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx)
 	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
 
 	dsl_scan_done(scn, B_FALSE, tx);
-	dsl_scan_sync_state(scn, tx);
+	dsl_scan_sync_state(scn, tx, SYNC_MANDATORY);
 }
 
 int
@@ -409,16 +851,6 @@ dsl_scan_cancel(dsl_pool_t *dp)
 	    dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED));
 }
 
-boolean_t
-dsl_scan_is_paused_scrub(const dsl_scan_t *scn)
-{
-	if (dsl_scan_scrubbing(scn->scn_dp) &&
-	    scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED)
-		return (B_TRUE);
-
-	return (B_FALSE);
-}
-
 static int
 dsl_scrub_pause_resume_check(void *arg, dmu_tx_t *tx)
 {
@@ -453,7 +885,7 @@ dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx)
 		/* can't pause a scrub when there is no in-progress scrub */
 		spa->spa_scan_pass_scrub_pause = gethrestime_sec();
 		scn->scn_phys.scn_flags |= DSF_SCRUB_PAUSED;
-		dsl_scan_sync_state(scn, tx);
+		dsl_scan_sync_state(scn, tx, SYNC_CACHED);
 	} else {
 		ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL);
 		if (dsl_scan_is_paused_scrub(scn)) {
@@ -466,7 +898,7 @@ dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx)
 			    gethrestime_sec() - spa->spa_scan_pass_scrub_pause;
 			spa->spa_scan_pass_scrub_pause = 0;
 			scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED;
-			dsl_scan_sync_state(scn, tx);
+			dsl_scan_sync_state(scn, tx, SYNC_CACHED);
 		}
 	}
 }
@@ -482,25 +914,25 @@ dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd)
 	    ZFS_SPACE_CHECK_RESERVED));
 }
 
-boolean_t
-dsl_scan_scrubbing(const dsl_pool_t *dp)
-{
-	dsl_scan_t *scn = dp->dp_scan;
 
-	if (scn->scn_phys.scn_state == DSS_SCANNING &&
-	    scn->scn_phys.scn_func == POOL_SCAN_SCRUB)
-		return (B_TRUE);
+/* start a new scan, or restart an existing one. */
+void
+dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg)
+{
+	if (txg == 0) {
+		dmu_tx_t *tx;
+		tx = dmu_tx_create_dd(dp->dp_mos_dir);
+		VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
 
-	return (B_FALSE);
+		txg = dmu_tx_get_txg(tx);
+		dp->dp_scan->scn_restart_txg = txg;
+		dmu_tx_commit(tx);
+	} else {
+		dp->dp_scan->scn_restart_txg = txg;
+	}
+	zfs_dbgmsg("restarting resilver txg=%llu", (longlong_t)txg);
 }
 
-static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
-    dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
-    dmu_objset_type_t ostype, dmu_tx_t *tx);
-inline __attribute__((always_inline)) static void dsl_scan_visitdnode(
-    dsl_scan_t *, dsl_dataset_t *ds, dmu_objset_type_t ostype,
-    dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx);
-
 void
 dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp)
 {
@@ -514,25 +946,169 @@ dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
 	zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags));
 }
 
-static uint64_t
-dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
+static int
+scan_ds_queue_compare(const void *a, const void *b)
 {
-	uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
-	if (ds->ds_is_snapshot)
-		return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg));
-	return (smt);
+	const scan_ds_t *sds_a = a, *sds_b = b;
+
+	if (sds_a->sds_dsobj < sds_b->sds_dsobj)
+		return (-1);
+	if (sds_a->sds_dsobj == sds_b->sds_dsobj)
+		return (0);
+	return (1);
 }
 
 static void
-dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx)
+scan_ds_queue_clear(dsl_scan_t *scn)
 {
-	VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
-	    DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
-	    &scn->scn_phys, tx));
+	void *cookie = NULL;
+	scan_ds_t *sds;
+	while ((sds = avl_destroy_nodes(&scn->scn_queue, &cookie)) != NULL) {
+		kmem_free(sds, sizeof (*sds));
+	}
 }
 
-extern int zfs_vdev_async_write_active_min_dirty_percent;
+static boolean_t
+scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, uint64_t *txg)
+{
+	scan_ds_t srch, *sds;
+
+	srch.sds_dsobj = dsobj;
+	sds = avl_find(&scn->scn_queue, &srch, NULL);
+	if (sds != NULL && txg != NULL)
+		*txg = sds->sds_txg;
+	return (sds != NULL);
+}
+
+static void
+scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg)
+{
+	scan_ds_t *sds;
+	avl_index_t where;
+
+	sds = kmem_zalloc(sizeof (*sds), KM_SLEEP);
+	sds->sds_dsobj = dsobj;
+	sds->sds_txg = txg;
+
+	VERIFY3P(avl_find(&scn->scn_queue, sds, &where), ==, NULL);
+	avl_insert(&scn->scn_queue, sds, where);
+}
+
+static void
+scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj)
+{
+	scan_ds_t srch, *sds;
+
+	srch.sds_dsobj = dsobj;
+
+	sds = avl_find(&scn->scn_queue, &srch, NULL);
+	VERIFY(sds != NULL);
+	avl_remove(&scn->scn_queue, sds);
+	kmem_free(sds, sizeof (*sds));
+}
+
+static void
+scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = scn->scn_dp;
+	spa_t *spa = dp->dp_spa;
+	dmu_object_type_t ot = (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) ?
+	    DMU_OT_SCAN_QUEUE : DMU_OT_ZAP_OTHER;
+
+	ASSERT0(scn->scn_bytes_pending);
+	ASSERT(scn->scn_phys.scn_queue_obj != 0);
+
+	VERIFY0(dmu_object_free(dp->dp_meta_objset,
+	    scn->scn_phys.scn_queue_obj, tx));
+	scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, ot,
+	    DMU_OT_NONE, 0, tx);
+	for (scan_ds_t *sds = avl_first(&scn->scn_queue);
+	    sds != NULL; sds = AVL_NEXT(&scn->scn_queue, sds)) {
+		VERIFY0(zap_add_int_key(dp->dp_meta_objset,
+		    scn->scn_phys.scn_queue_obj, sds->sds_dsobj,
+		    sds->sds_txg, tx));
+	}
+}
+
+/*
+ * Computes the memory limit state that we're currently in. A sorted scan
+ * needs quite a bit of memory to hold the sorting queue, so we need to
+ * reasonably constrain the size so it doesn't impact overall system
+ * performance. We compute two limits:
+ * 1) Hard memory limit: if the amount of memory used by the sorting
+ *	queues on a pool gets above this value, we stop the metadata
+ *	scanning portion and start issuing the queued up and sorted
+ *	I/Os to reduce memory usage.
+ *	This limit is calculated as a fraction of physmem (by default 5%).
+ *	We constrain the lower bound of the hard limit to an absolute
+ *	minimum of zfs_scan_mem_lim_min (default: 16 MiB). We also constrain
+ *	the upper bound to 5% of the space allocated in the pool's normal
+ *	class - we'll never need that much, but it keeps the value in check.
+ * 2) Soft memory limit: once we hit the hard memory limit, we start
+ *	issuing I/O to reduce queue memory usage, but we don't want to
+ *	completely empty out the queues, since we might be able to find I/Os
+ *	that will fill in the gaps of our non-sequential IOs at some point
+ *	in the future. So we stop the issuing of I/Os once the amount of
+ *	memory used drops below the soft limit (at which point we stop issuing
+ *	I/O and start scanning metadata again).
+ *
+ *	This limit is calculated by subtracting a fraction of the hard
+ *	limit from the hard limit. By default this fraction is 5%, so
+ *	the soft limit is 95% of the hard limit. We cap the size of the
+ *	difference between the hard and soft limits at an absolute
+ *	maximum of zfs_scan_mem_lim_soft_max (default: 128 MiB) - this is
+ *	sufficient to not cause too frequent switching between the
+ *	metadata scan and I/O issue (even at 2k recordsize, 128 MiB's
+ *	worth of queues is about 1.2 GiB of on-pool data, so scanning
+ *	that should take at least a decent fraction of a second).
+ */
+static boolean_t
+dsl_scan_should_clear(dsl_scan_t *scn)
+{
+	vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev;
+	uint64_t mlim_hard, mlim_soft, mused;
+	uint64_t alloc = metaslab_class_get_alloc(spa_normal_class(
+	    scn->scn_dp->dp_spa));
+
+	mlim_hard = MAX((physmem / zfs_scan_mem_lim_fact) * PAGESIZE,
+	    zfs_scan_mem_lim_min);
+	mlim_hard = MIN(mlim_hard, alloc / 20);
+	mlim_soft = mlim_hard - MIN(mlim_hard / zfs_scan_mem_lim_soft_fact,
+	    zfs_scan_mem_lim_soft_max);
+	mused = 0;
+	for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+		vdev_t *tvd = rvd->vdev_child[i];
+		dsl_scan_io_queue_t *queue;
+
+		mutex_enter(&tvd->vdev_scan_io_queue_lock);
+		queue = tvd->vdev_scan_io_queue;
+		if (queue != NULL) {
+			/* #extents in exts_by_size = # in exts_by_addr */
+			mused += avl_numnodes(&queue->q_exts_by_size) *
+			    sizeof (range_seg_t) +
+			    avl_numnodes(&queue->q_sios_by_addr) *
+			    sizeof (scan_io_t);
+		}
+		mutex_exit(&tvd->vdev_scan_io_queue_lock);
+	}
+
+	dprintf("current scan memory usage: %llu bytes\n", (longlong_t)mused);
+
+	if (mused == 0)
+		ASSERT0(scn->scn_bytes_pending);
+
+	/*
+	 * If we are above our hard limit, we need to clear out memory.
+	 * If we are below our soft limit, we need to accumulate sequential IOs.
+	 * Otherwise, we should keep doing whatever we are currently doing.
+	 */
+	if (mused >= mlim_hard)
+		return (B_TRUE);
+	else if (mused < mlim_soft)
+		return (B_FALSE);
+	else
+		return (scn->scn_clearing);
+}
 
 static boolean_t
 dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
@@ -553,27 +1129,32 @@ dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
 
 	/*
 	 * We suspend if:
-	 *  - we have scanned for the maximum time: an entire txg
-	 *    timeout (default 5 sec)
-	 *  or
 	 *  - we have scanned for at least the minimum time (default 1 sec
 	 *    for scrub, 3 sec for resilver), and either we have sufficient
 	 *    dirty data that we are starting to write more quickly
-	 *    (default 30%), or someone is explicitly waiting for this txg
-	 *    to complete.
+	 *    (default 30%), someone is explicitly waiting for this txg
+	 *    to complete, or we have used up all of the time in the txg
+	 *    timeout (default 5 sec).
 	 *  or
 	 *  - the spa is shutting down because this pool is being exported
 	 *    or the machine is rebooting.
+	 *  or
+	 *  - the scan queue has reached its memory use limit
 	 */
-	int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
-	    zfs_resilver_min_time_ms : zfs_scan_min_time_ms;
-	uint64_t elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
+	uint64_t curr_time_ns = gethrtime();
+	uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time;
+	uint64_t sync_time_ns = curr_time_ns -
+	    scn->scn_dp->dp_spa->spa_sync_starttime;
 	int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max;
-	if (elapsed_nanosecs / NANOSEC >= zfs_txg_timeout ||
-	    (NSEC2MSEC(elapsed_nanosecs) > mintime &&
-	    (txg_sync_waiting(scn->scn_dp) ||
-	    dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent)) ||
-	    spa_shutting_down(scn->scn_dp->dp_spa)) {
+	int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
+	    zfs_resilver_min_time_ms : zfs_scrub_min_time_ms;
+
+	if ((NSEC2MSEC(scan_time_ns) > mintime &&
+	    (dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent ||
+	    txg_sync_waiting(scn->scn_dp) ||
+	    NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) ||
+	    spa_shutting_down(scn->scn_dp->dp_spa) ||
+	    (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn))) {
 		if (zb) {
 			dprintf("suspending at bookmark %llx/%llx/%llx/%llx\n",
 			    (longlong_t)zb->zb_objset,
@@ -581,12 +1162,16 @@ dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
 			    (longlong_t)zb->zb_level,
 			    (longlong_t)zb->zb_blkid);
 			scn->scn_phys.scn_bookmark = *zb;
+		} else {
+			dsl_scan_phys_t *scnp = &scn->scn_phys;
+
+			dprintf("suspending at at DDT bookmark "
+			    "%llx/%llx/%llx/%llx\n",
+			    (longlong_t)scnp->scn_ddt_bookmark.ddb_class,
+			    (longlong_t)scnp->scn_ddt_bookmark.ddb_type,
+			    (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum,
+			    (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor);
 		}
-		dprintf("suspending at DDT bookmark %llx/%llx/%llx/%llx\n",
-		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
-		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
-		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
-		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
 		scn->scn_suspending = B_TRUE;
 		return (B_TRUE);
 	}
@@ -683,32 +1268,283 @@ dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
 	zil_free(zilog);
 }
 
-/* ARGSUSED */
+/*
+ * We compare scan_prefetch_issue_ctx_t's based on their bookmarks. The idea
+ * here is to sort the AVL tree by the order each block will be needed.
+ */
+static int
+scan_prefetch_queue_compare(const void *a, const void *b)
+{
+	const scan_prefetch_issue_ctx_t *spic_a = a, *spic_b = b;
+	const scan_prefetch_ctx_t *spc_a = spic_a->spic_spc;
+	const scan_prefetch_ctx_t *spc_b = spic_b->spic_spc;
+
+	return (zbookmark_compare(spc_a->spc_datablkszsec,
+	    spc_a->spc_indblkshift, spc_b->spc_datablkszsec,
+	    spc_b->spc_indblkshift, &spic_a->spic_zb, &spic_b->spic_zb));
+}
+
 static void
-dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp,
-    uint64_t objset, uint64_t object, uint64_t blkid)
+scan_prefetch_ctx_rele(scan_prefetch_ctx_t *spc, void *tag)
 {
-	zbookmark_phys_t czb;
-	arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
-	int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
+	if (refcount_remove(&spc->spc_refcnt, tag) == 0) {
+		refcount_destroy(&spc->spc_refcnt);
+		kmem_free(spc, sizeof (scan_prefetch_ctx_t));
+	}
+}
+
+static scan_prefetch_ctx_t *
+scan_prefetch_ctx_create(dsl_scan_t *scn, dnode_phys_t *dnp, void *tag)
+{
+	scan_prefetch_ctx_t *spc;
+
+	spc = kmem_alloc(sizeof (scan_prefetch_ctx_t), KM_SLEEP);
+	refcount_create(&spc->spc_refcnt);
+	refcount_add(&spc->spc_refcnt, tag);
+	spc->spc_scn = scn;
+	if (dnp != NULL) {
+		spc->spc_datablkszsec = dnp->dn_datablkszsec;
+		spc->spc_indblkshift = dnp->dn_indblkshift;
+		spc->spc_root = B_FALSE;
+	} else {
+		spc->spc_datablkszsec = 0;
+		spc->spc_indblkshift = 0;
+		spc->spc_root = B_TRUE;
+	}
+
+	return (spc);
+}
+
+static void
+scan_prefetch_ctx_add_ref(scan_prefetch_ctx_t *spc, void *tag)
+{
+	refcount_add(&spc->spc_refcnt, tag);
+}
+
+static boolean_t
+dsl_scan_check_prefetch_resume(scan_prefetch_ctx_t *spc,
+    const zbookmark_phys_t *zb)
+{
+	zbookmark_phys_t *last_zb = &spc->spc_scn->scn_prefetch_bookmark;
+	dnode_phys_t tmp_dnp;
+	dnode_phys_t *dnp = (spc->spc_root) ? NULL : &tmp_dnp;
+
+	if (zb->zb_objset != last_zb->zb_objset)
+		return (B_TRUE);
+	if ((int64_t)zb->zb_object < 0)
+		return (B_FALSE);
+
+	tmp_dnp.dn_datablkszsec = spc->spc_datablkszsec;
+	tmp_dnp.dn_indblkshift = spc->spc_indblkshift;
+
+	if (zbookmark_subtree_completed(dnp, zb, last_zb))
+		return (B_TRUE);
+
+	return (B_FALSE);
+}
+
+static void
+dsl_scan_prefetch(scan_prefetch_ctx_t *spc, blkptr_t *bp, zbookmark_phys_t *zb)
+{
+	avl_index_t idx;
+	dsl_scan_t *scn = spc->spc_scn;
+	spa_t *spa = scn->scn_dp->dp_spa;
+	scan_prefetch_issue_ctx_t *spic;
 
 	if (zfs_no_scrub_prefetch)
 		return;
 
-	if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_min_txg ||
-	    (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE))
+	if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg ||
+	    (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE &&
+	    BP_GET_TYPE(bp) != DMU_OT_OBJSET))
 		return;
 
-	if (BP_IS_PROTECTED(bp)) {
-		ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_DNODE);
-		ASSERT3U(BP_GET_LEVEL(bp), ==, 0);
-		zio_flags |= ZIO_FLAG_RAW;
+	if (dsl_scan_check_prefetch_resume(spc, zb))
+		return;
+
+	scan_prefetch_ctx_add_ref(spc, scn);
+	spic = kmem_alloc(sizeof (scan_prefetch_issue_ctx_t), KM_SLEEP);
+	spic->spic_spc = spc;
+	spic->spic_bp = *bp;
+	spic->spic_zb = *zb;
+
+	/*
+	 * Add the IO to the queue of blocks to prefetch. This allows us to
+	 * prioritize blocks that we will need first for the main traversal
+	 * thread.
+	 */
+	mutex_enter(&spa->spa_scrub_lock);
+	if (avl_find(&scn->scn_prefetch_queue, spic, &idx) != NULL) {
+		/* this block is already queued for prefetch */
+		kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
+		scan_prefetch_ctx_rele(spc, scn);
+		mutex_exit(&spa->spa_scrub_lock);
+		return;
+	}
+
+	avl_insert(&scn->scn_prefetch_queue, spic, idx);
+	cv_broadcast(&spa->spa_scrub_io_cv);
+	mutex_exit(&spa->spa_scrub_lock);
+}
+
+static void
+dsl_scan_prefetch_dnode(dsl_scan_t *scn, dnode_phys_t *dnp,
+    uint64_t objset, uint64_t object)
+{
+	int i;
+	zbookmark_phys_t zb;
+	scan_prefetch_ctx_t *spc;
+
+	if (dnp->dn_nblkptr == 0 && !(dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
+		return;
+
+	SET_BOOKMARK(&zb, objset, object, 0, 0);
+
+	spc = scan_prefetch_ctx_create(scn, dnp, FTAG);
+
+	for (i = 0; i < dnp->dn_nblkptr; i++) {
+		zb.zb_level = BP_GET_LEVEL(&dnp->dn_blkptr[i]);
+		zb.zb_blkid = i;
+		dsl_scan_prefetch(spc, &dnp->dn_blkptr[i], &zb);
+	}
+
+	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+		zb.zb_level = 0;
+		zb.zb_blkid = DMU_SPILL_BLKID;
+		dsl_scan_prefetch(spc, DN_SPILL_BLKPTR(dnp), &zb);
+	}
+
+	scan_prefetch_ctx_rele(spc, FTAG);
+}
+
+void
+dsl_scan_prefetch_cb(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
+    arc_buf_t *buf, void *private)
+{
+	scan_prefetch_ctx_t *spc = private;
+	dsl_scan_t *scn = spc->spc_scn;
+	spa_t *spa = scn->scn_dp->dp_spa;
+
+	/* broadcast that the IO has completed for rate limiting purposes */
+	mutex_enter(&spa->spa_scrub_lock);
+	ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp));
+	spa->spa_scrub_inflight -= BP_GET_PSIZE(bp);
+	cv_broadcast(&spa->spa_scrub_io_cv);
+	mutex_exit(&spa->spa_scrub_lock);
+
+	/* if there was an error or we are done prefetching, just clean up */
+	if (buf == NULL || scn->scn_suspending)
+		goto out;
+
+	if (BP_GET_LEVEL(bp) > 0) {
+		int i;
+		blkptr_t *cbp;
+		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
+		zbookmark_phys_t czb;
+
+		for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
+			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
+			    zb->zb_level - 1, zb->zb_blkid * epb + i);
+			dsl_scan_prefetch(spc, cbp, &czb);
+		}
+	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
+		dnode_phys_t *cdnp;
+		int i;
+		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
+
+		for (i = 0, cdnp = buf->b_data; i < epb;
+		    i += cdnp->dn_extra_slots + 1,
+		    cdnp += cdnp->dn_extra_slots + 1) {
+			dsl_scan_prefetch_dnode(scn, cdnp,
+			    zb->zb_objset, zb->zb_blkid * epb + i);
+		}
+	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
+		objset_phys_t *osp = buf->b_data;
+
+		dsl_scan_prefetch_dnode(scn, &osp->os_meta_dnode,
+		    zb->zb_objset, DMU_META_DNODE_OBJECT);
+
+		if (OBJSET_BUF_HAS_USERUSED(buf)) {
+			dsl_scan_prefetch_dnode(scn,
+			    &osp->os_groupused_dnode, zb->zb_objset,
+			    DMU_GROUPUSED_OBJECT);
+			dsl_scan_prefetch_dnode(scn,
+			    &osp->os_userused_dnode, zb->zb_objset,
+			    DMU_USERUSED_OBJECT);
+		}
 	}
 
-	SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);
+out:
+	if (buf != NULL)
+		arc_buf_destroy(buf, private);
+	scan_prefetch_ctx_rele(spc, scn);
+}
+
+/* Issues queued prefetch reads until scn->scn_prefetch_stop is set. */
+static void
+dsl_scan_prefetch_thread(void *arg)
+{
+	dsl_scan_t *scn = arg;
+	spa_t *spa = scn->scn_dp->dp_spa;
+	scan_prefetch_issue_ctx_t *spic;
+
+	/* loop until we are told to stop */
+	while (!scn->scn_prefetch_stop) {
+		arc_flags_t flags = ARC_FLAG_NOWAIT |
+		    ARC_FLAG_PRESCIENT_PREFETCH | ARC_FLAG_PREFETCH;
+		int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
+
+		mutex_enter(&spa->spa_scrub_lock);
+
+		/*
+		 * Wait until we have an IO to issue and are not above our
+		 * maximum in flight limit.
+		 */
+		while (!scn->scn_prefetch_stop &&
+		    (avl_numnodes(&scn->scn_prefetch_queue) == 0 ||
+		    spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)) {
+			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+		}
 
-	(void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp,
-	    NULL, NULL, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, &czb);
+		/* recheck if we should stop since we waited for the cv */
+		if (scn->scn_prefetch_stop) {
+			mutex_exit(&spa->spa_scrub_lock);
+			break;
+		}
+
+		/* remove the prefetch IO from the tree */
+		spic = avl_first(&scn->scn_prefetch_queue);
+		spa->spa_scrub_inflight += BP_GET_PSIZE(&spic->spic_bp);
+		avl_remove(&scn->scn_prefetch_queue, spic);
+
+		mutex_exit(&spa->spa_scrub_lock);
+
+		if (BP_IS_PROTECTED(&spic->spic_bp)) {
+			ASSERT(BP_GET_TYPE(&spic->spic_bp) == DMU_OT_DNODE ||
+			    BP_GET_TYPE(&spic->spic_bp) == DMU_OT_OBJSET);
+			ASSERT3U(BP_GET_LEVEL(&spic->spic_bp), ==, 0);
+			zio_flags |= ZIO_FLAG_RAW;
+		}
+
+		/* issue the prefetch asynchronously */
+		(void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa,
+		    &spic->spic_bp, dsl_scan_prefetch_cb, spic->spic_spc,
+		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, &spic->spic_zb);
+
+		kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
+	}
+
+	ASSERT(scn->scn_prefetch_stop);
+
+	/* free any queued prefetches we never got to issue */
+	mutex_enter(&spa->spa_scrub_lock);
+	while ((spic = avl_first(&scn->scn_prefetch_queue)) != NULL) {
+		avl_remove(&scn->scn_prefetch_queue, spic);
+		scan_prefetch_ctx_rele(spic->spic_spc, scn);
+		kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
+	}
+	ASSERT0(avl_numnodes(&scn->scn_prefetch_queue));
+	mutex_exit(&spa->spa_scrub_lock);
 }
 
 static boolean_t
@@ -747,6 +1583,13 @@ dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
 	return (B_FALSE);
 }
 
+static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
+    dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
+    dmu_objset_type_t ostype, dmu_tx_t *tx);
+inline __attribute__((always_inline)) static void dsl_scan_visitdnode(
+    dsl_scan_t *, dsl_dataset_t *ds, dmu_objset_type_t ostype,
+    dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx);
+
 /*
  * Return nonzero on i/o error.
  * Return new buf to write out in *bufp.
@@ -774,10 +1617,6 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
 			return (err);
 		}
 		for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
-			dsl_scan_prefetch(scn, buf, cbp, zb->zb_objset,
-			    zb->zb_object, zb->zb_blkid * epb + i);
-		}
-		for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
 			zbookmark_phys_t czb;
 
 			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
@@ -790,7 +1629,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
 	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
 		arc_flags_t flags = ARC_FLAG_WAIT;
 		dnode_phys_t *cdnp;
-		int i, j;
+		int i;
 		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
 		arc_buf_t *buf;
 
@@ -808,15 +1647,6 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
 		for (i = 0, cdnp = buf->b_data; i < epb;
 		    i += cdnp->dn_extra_slots + 1,
 		    cdnp += cdnp->dn_extra_slots + 1) {
-			for (j = 0; j < cdnp->dn_nblkptr; j++) {
-				blkptr_t *cbp = &cdnp->dn_blkptr[j];
-				dsl_scan_prefetch(scn, buf, cbp,
-				    zb->zb_objset, zb->zb_blkid * epb + i, j);
-			}
-		}
-		for (i = 0, cdnp = buf->b_data; i < epb;
-		    i += cdnp->dn_extra_slots + 1,
-		    cdnp += cdnp->dn_extra_slots + 1) {
 			dsl_scan_visitdnode(scn, ds, ostype,
 			    cdnp, zb->zb_blkid * epb + i, tx);
 		}
@@ -843,8 +1673,8 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
 			/*
 			 * We also always visit user/group accounting
 			 * objects, and never skip them, even if we are
-			 * suspending.  This is necessary so that the space
-			 * deltas from this txg get integrated.
+			 * suspending. This is necessary so that the
+			 * space deltas from this txg get integrated.
 			 */
 			dsl_scan_visitdnode(scn, ds, osp->os_type,
 			    &osp->os_groupused_dnode,
@@ -894,21 +1724,13 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
     dmu_objset_type_t ostype, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = scn->scn_dp;
-	blkptr_t *bp_toread;
-
-	bp_toread = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
-	*bp_toread = *bp;
-
-	/* ASSERT(pbuf == NULL || arc_released(pbuf)); */
+	blkptr_t *bp_toread = NULL;
 
 	if (dsl_scan_check_suspend(scn, zb))
-		goto out;
+		return;
 
 	if (dsl_scan_check_resume(scn, dnp, zb))
-		goto out;
-
-	if (BP_IS_HOLE(bp))
-		goto out;
+		return;
 
 	scn->scn_visited_this_txg++;
 
@@ -919,14 +1741,24 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
 	 * if required to debug an issue in dsl_scan_visitbp().
 	 *
 	 * dprintf_bp(bp,
-	 *    "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx bp=%p",
-	 *    ds, ds ? ds->ds_object : 0,
-	 *    zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
-	 *    bp);
+	 *     "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx bp=%p",
+	 *     ds, ds ? ds->ds_object : 0,
+	 *     zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
+	 *     bp);
 	 */
 
-	if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
-		goto out;
+	if (BP_IS_HOLE(bp)) {
+		scn->scn_holes_this_txg++;
+		return;
+	}
+
+	if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) {
+		scn->scn_lt_min_this_txg++;
+		return;
+	}
+
+	bp_toread = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
+	*bp_toread = *bp;
 
 	if (dsl_scan_recurse(scn, ds, ostype, dnp, bp_toread, zb, tx) != 0)
 		goto out;
@@ -938,6 +1770,7 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
 	 */
 	if (ddt_class_contains(dp->dp_spa,
 	    scn->scn_phys.scn_ddt_class_max, bp)) {
+		scn->scn_ddt_contained_this_txg++;
 		goto out;
 	}
 
@@ -948,9 +1781,13 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
 	 * Don't scan it now unless we need to because something
 	 * under it was modified.
 	 */
-	if (BP_PHYSICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_max_txg) {
-		scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
+	if (BP_PHYSICAL_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) {
+		scn->scn_gt_max_this_txg++;
+		goto out;
 	}
+
+	scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
+
 out:
 	kmem_free(bp_toread, sizeof (blkptr_t));
 }
@@ -960,26 +1797,33 @@ dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp,
     dmu_tx_t *tx)
 {
 	zbookmark_phys_t zb;
+	scan_prefetch_ctx_t *spc;
 
 	SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
 	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
-	dsl_scan_visitbp(bp, &zb, NULL,
-	    ds, scn, DMU_OST_NONE, tx);
+
+	if (ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) {
+		SET_BOOKMARK(&scn->scn_prefetch_bookmark,
+		    zb.zb_objset, 0, 0, 0);
+	} else {
+		scn->scn_prefetch_bookmark = scn->scn_phys.scn_bookmark;
+	}
+
+	scn->scn_objsets_visited_this_txg++;
+
+	spc = scan_prefetch_ctx_create(scn, NULL, FTAG);
+	dsl_scan_prefetch(spc, bp, &zb);
+	scan_prefetch_ctx_rele(spc, FTAG);
+
+	dsl_scan_visitbp(bp, &zb, NULL, ds, scn, DMU_OST_NONE, tx);
 
 	dprintf_ds(ds, "finished scan%s", "");
 }
 
-void
-dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
+static void
+ds_destroyed_scn_phys(dsl_dataset_t *ds, dsl_scan_phys_t *scn_phys)
 {
-	dsl_pool_t *dp = ds->ds_dir->dd_pool;
-	dsl_scan_t *scn = dp->dp_scan;
-	uint64_t mintxg;
-
-	if (scn->scn_phys.scn_state != DSS_SCANNING)
-		return;
-
-	if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
+	if (scn_phys->scn_bookmark.zb_objset == ds->ds_object) {
 		if (ds->ds_is_snapshot) {
 			/*
 			 * Note:
@@ -991,23 +1835,57 @@ dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
 			 *    ignore it when we retraverse it in
 			 *    dsl_scan_visitds().
 			 */
-			scn->scn_phys.scn_bookmark.zb_objset =
+			scn_phys->scn_bookmark.zb_objset =
 			    dsl_dataset_phys(ds)->ds_next_snap_obj;
 			zfs_dbgmsg("destroying ds %llu; currently traversing; "
 			    "reset zb_objset to %llu",
 			    (u_longlong_t)ds->ds_object,
 			    (u_longlong_t)dsl_dataset_phys(ds)->
 			    ds_next_snap_obj);
-			scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN;
+			scn_phys->scn_flags |= DSF_VISIT_DS_AGAIN;
 		} else {
-			SET_BOOKMARK(&scn->scn_phys.scn_bookmark,
+			SET_BOOKMARK(&scn_phys->scn_bookmark,
 			    ZB_DESTROYED_OBJSET, 0, 0, 0);
 			zfs_dbgmsg("destroying ds %llu; currently traversing; "
 			    "reset bookmark to -1,0,0,0",
 			    (u_longlong_t)ds->ds_object);
 		}
-	} else if (zap_lookup_int_key(dp->dp_meta_objset,
-	    scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
+	}
+}
+
+/*
+ * Invoked when a dataset is destroyed. We need to make sure that:
+ *
+ * 1) If it is the dataset that was currently being scanned, we write
+ *	a new dsl_scan_phys_t and mark the objset reference in it
+ *	as destroyed.
+ * 2) Remove it from the work queue, if it was present.
+ *
+ * If the dataset was actually a snapshot, instead of marking the dataset
+ * as destroyed, we instead substitute the next snapshot in line.
+ */
+void
+dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+	dsl_scan_t *scn = dp->dp_scan;
+	uint64_t mintxg;
+
+	if (!dsl_scan_is_running(scn))
+		return;
+
+	ds_destroyed_scn_phys(ds, &scn->scn_phys);
+	ds_destroyed_scn_phys(ds, &scn->scn_phys_cached);
+
+	if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) {
+		scan_ds_queue_remove(scn, ds->ds_object);
+		if (ds->ds_is_snapshot)
+			scan_ds_queue_insert(scn,
+			    dsl_dataset_phys(ds)->ds_next_snap_obj, mintxg);
+	}
+
+	if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
+	    ds->ds_object, &mintxg) == 0) {
 		ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
 		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
@@ -1036,9 +1914,28 @@ dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
 	 * dsl_scan_sync() should be called after this, and should sync
 	 * out our changed state, but just to be safe, do it here.
 	 */
-	dsl_scan_sync_state(scn, tx);
+	dsl_scan_sync_state(scn, tx, SYNC_CACHED);
+}
+
+/*
+ * Helper for dsl_scan_ds_snapshotted(): if the given scan bookmark points
+ * at ds, redirect it to ds's previous snapshot object (ds_prev_snap_obj)
+ * and log the change. A bookmark that references any other objset is left
+ * untouched.
+ */
+static void
+ds_snapshotted_bookmark(dsl_dataset_t *ds, zbookmark_phys_t *scn_bookmark)
+{
+	if (scn_bookmark->zb_objset == ds->ds_object) {
+		scn_bookmark->zb_objset =
+		    dsl_dataset_phys(ds)->ds_prev_snap_obj;
+		zfs_dbgmsg("snapshotting ds %llu; currently traversing; "
+		    "reset zb_objset to %llu",
+		    (u_longlong_t)ds->ds_object,
+		    (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
+	}
 }
 
+/*
+ * Called when a dataset is snapshotted. If we were currently traversing
+ * this dataset, we reset our bookmark to point at the newly created
+ * snapshot. We also modify our work queue to remove the dataset and
+ * replace it with the new snapshot.
+ */
 void
 dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
@@ -1046,20 +1943,22 @@ dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
 	dsl_scan_t *scn = dp->dp_scan;
 	uint64_t mintxg;
 
-	if (scn->scn_phys.scn_state != DSS_SCANNING)
+	if (!dsl_scan_is_running(scn))
 		return;
 
 	ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
 
-	if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
-		scn->scn_phys.scn_bookmark.zb_objset =
-		    dsl_dataset_phys(ds)->ds_prev_snap_obj;
-		zfs_dbgmsg("snapshotting ds %llu; currently traversing; "
-		    "reset zb_objset to %llu",
-		    (u_longlong_t)ds->ds_object,
-		    (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
-	} else if (zap_lookup_int_key(dp->dp_meta_objset,
-	    scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
+	ds_snapshotted_bookmark(ds, &scn->scn_phys.scn_bookmark);
+	ds_snapshotted_bookmark(ds, &scn->scn_phys_cached.scn_bookmark);
+
+	if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) {
+		scan_ds_queue_remove(scn, ds->ds_object);
+		scan_ds_queue_insert(scn,
+		    dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg);
+	}
+
+	if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
+	    ds->ds_object, &mintxg) == 0) {
 		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
 		VERIFY(zap_add_int_key(dp->dp_meta_objset,
@@ -1070,37 +1969,59 @@ dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
 		    (u_longlong_t)ds->ds_object,
 		    (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
 	}
-	dsl_scan_sync_state(scn, tx);
+
+	dsl_scan_sync_state(scn, tx, SYNC_CACHED);
 }
 
-void
-dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
+static void
+ds_clone_swapped_bookmark(dsl_dataset_t *ds1, dsl_dataset_t *ds2,
+    zbookmark_phys_t *scn_bookmark)
 {
-	dsl_pool_t *dp = ds1->ds_dir->dd_pool;
-	dsl_scan_t *scn = dp->dp_scan;
-	uint64_t mintxg;
-
-	if (scn->scn_phys.scn_state != DSS_SCANNING)
-		return;
-
-	if (scn->scn_phys.scn_bookmark.zb_objset == ds1->ds_object) {
-		scn->scn_phys.scn_bookmark.zb_objset = ds2->ds_object;
+	if (scn_bookmark->zb_objset == ds1->ds_object) {
+		scn_bookmark->zb_objset = ds2->ds_object;
 		zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
 		    "reset zb_objset to %llu",
 		    (u_longlong_t)ds1->ds_object,
 		    (u_longlong_t)ds2->ds_object);
-	} else if (scn->scn_phys.scn_bookmark.zb_objset == ds2->ds_object) {
-		scn->scn_phys.scn_bookmark.zb_objset = ds1->ds_object;
+	} else if (scn_bookmark->zb_objset == ds2->ds_object) {
+		scn_bookmark->zb_objset = ds1->ds_object;
 		zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
 		    "reset zb_objset to %llu",
 		    (u_longlong_t)ds2->ds_object,
 		    (u_longlong_t)ds1->ds_object);
 	}
+}
+
+/*
+ * Called when a parent dataset and its clone are swapped. If we were
+ * currently traversing the dataset, we need to switch to traversing the
+ * newly promoted parent.
+ */
+void
+dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = ds1->ds_dir->dd_pool;
+	dsl_scan_t *scn = dp->dp_scan;
+	uint64_t mintxg;
+
+	if (!dsl_scan_is_running(scn))
+		return;
+
+	ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys.scn_bookmark);
+	ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys_cached.scn_bookmark);
+
+	/*
+	 * Swap the two datasets in the in-memory work queue. Both
+	 * memberships are sampled before the queue is modified: testing
+	 * for ds2 only after ds1 has been replaced by ds2 would always
+	 * succeed and immediately move the entry back to ds1, undoing
+	 * the swap. If both (or neither) are queued there is nothing to
+	 * exchange, and inserting an already-present object is avoided.
+	 *
+	 * NOTE(review): the on-disk ZAP updates that follow use the same
+	 * back-to-back lookup pattern; confirm whether they need the same
+	 * treatment.
+	 */
+	{
+		uint64_t mintxg2;
+		boolean_t ds1_queued = scan_ds_queue_contains(scn,
+		    ds1->ds_object, &mintxg);
+		boolean_t ds2_queued = scan_ds_queue_contains(scn,
+		    ds2->ds_object, &mintxg2);
+
+		if (ds1_queued && !ds2_queued) {
+			scan_ds_queue_remove(scn, ds1->ds_object);
+			scan_ds_queue_insert(scn, ds2->ds_object, mintxg);
+		} else if (ds2_queued && !ds1_queued) {
+			scan_ds_queue_remove(scn, ds2->ds_object);
+			scan_ds_queue_insert(scn, ds1->ds_object, mintxg2);
+		}
+	}
 
 	if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
 	    ds1->ds_object, &mintxg) == 0) {
 		int err;
-
 		ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
 		ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
 		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
@@ -1118,8 +2039,9 @@ dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
 		    "replacing with %llu",
 		    (u_longlong_t)ds1->ds_object,
 		    (u_longlong_t)ds2->ds_object);
-	} else if (zap_lookup_int_key(dp->dp_meta_objset,
-	    scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) {
+	}
+	if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
+	    ds2->ds_object, &mintxg) == 0) {
 		ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
 		ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
 		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
@@ -1132,31 +2054,26 @@ dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
 		    (u_longlong_t)ds1->ds_object);
 	}
 
-	dsl_scan_sync_state(scn, tx);
+	dsl_scan_sync_state(scn, tx, SYNC_CACHED);
 }
 
-struct enqueue_clones_arg {
-	dmu_tx_t *tx;
-	uint64_t originobj;
-};
-
 /* ARGSUSED */
 static int
 enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
 {
-	struct enqueue_clones_arg *eca = arg;
+	uint64_t originobj = *(uint64_t *)arg;
 	dsl_dataset_t *ds;
 	int err;
 	dsl_scan_t *scn = dp->dp_scan;
 
-	if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != eca->originobj)
+	if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != originobj)
 		return (0);
 
 	err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
 	if (err)
 		return (err);
 
-	while (dsl_dataset_phys(ds)->ds_prev_snap_obj != eca->originobj) {
+	while (dsl_dataset_phys(ds)->ds_prev_snap_obj != originobj) {
 		dsl_dataset_t *prev;
 		err = dsl_dataset_hold_obj(dp,
 		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
@@ -1166,9 +2083,8 @@ enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
 			return (err);
 		ds = prev;
 	}
-	VERIFY(zap_add_int_key(dp->dp_meta_objset,
-	    scn->scn_phys.scn_queue_obj, ds->ds_object,
-	    dsl_dataset_phys(ds)->ds_prev_snap_txg, eca->tx) == 0);
+	scan_ds_queue_insert(scn, ds->ds_object,
+	    dsl_dataset_phys(ds)->ds_prev_snap_txg);
 	dsl_dataset_rele(ds, FTAG);
 	return (0);
 }
@@ -1214,9 +2130,9 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
 		dsl_dataset_name(ds, dsname);
 		zfs_dbgmsg("scanning dataset %llu (%s) is unnecessary because "
 		    "cur_min_txg (%llu) >= max_txg (%llu)",
-		    dsobj, dsname,
-		    scn->scn_phys.scn_cur_min_txg,
-		    scn->scn_phys.scn_max_txg);
+		    (longlong_t)dsobj, dsname,
+		    (longlong_t)scn->scn_phys.scn_cur_min_txg,
+		    (longlong_t)scn->scn_phys.scn_max_txg);
 		kmem_free(dsname, MAXNAMELEN);
 
 		goto out;
@@ -1232,7 +2148,7 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
 	 * ZIL here, rather than in scan_recurse(), because the regular
 	 * snapshot block-sharing rules don't apply to it.
 	 */
-	if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !ds->ds_is_snapshot)
+	if (!ds->ds_is_snapshot)
 		dsl_scan_zil(dp, &os->os_zil_header);
 
 	/*
@@ -1266,9 +2182,8 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
 	if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) {
 		zfs_dbgmsg("incomplete pass; visiting again");
 		scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN;
-		VERIFY(zap_add_int_key(dp->dp_meta_objset,
-		    scn->scn_phys.scn_queue_obj, ds->ds_object,
-		    scn->scn_phys.scn_cur_max_txg, tx) == 0);
+		scan_ds_queue_insert(scn, ds->ds_object,
+		    scn->scn_phys.scn_cur_max_txg);
 		goto out;
 	}
 
@@ -1276,10 +2191,9 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
 	 * Add descendent datasets to work queue.
 	 */
 	if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
-		VERIFY(zap_add_int_key(dp->dp_meta_objset,
-		    scn->scn_phys.scn_queue_obj,
+		scan_ds_queue_insert(scn,
 		    dsl_dataset_phys(ds)->ds_next_snap_obj,
-		    dsl_dataset_phys(ds)->ds_creation_txg, tx) == 0);
+		    dsl_dataset_phys(ds)->ds_creation_txg);
 	}
 	if (dsl_dataset_phys(ds)->ds_num_children > 1) {
 		boolean_t usenext = B_FALSE;
@@ -1300,17 +2214,21 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
 		}
 
 		if (usenext) {
-			VERIFY0(zap_join_key(dp->dp_meta_objset,
-			    dsl_dataset_phys(ds)->ds_next_clones_obj,
-			    scn->scn_phys.scn_queue_obj,
-			    dsl_dataset_phys(ds)->ds_creation_txg, tx));
+			zap_cursor_t zc;
+			zap_attribute_t za;
+			for (zap_cursor_init(&zc, dp->dp_meta_objset,
+			    dsl_dataset_phys(ds)->ds_next_clones_obj);
+			    zap_cursor_retrieve(&zc, &za) == 0;
+			    (void) zap_cursor_advance(&zc)) {
+				scan_ds_queue_insert(scn,
+				    zfs_strtonum(za.za_name, NULL),
+				    dsl_dataset_phys(ds)->ds_creation_txg);
+			}
+			zap_cursor_fini(&zc);
 		} else {
-			struct enqueue_clones_arg eca;
-			eca.tx = tx;
-			eca.originobj = ds->ds_object;
-
 			VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
-			    enqueue_clones_cb, &eca, DS_FIND_CHILDREN));
+			    enqueue_clones_cb, &ds->ds_object,
+			    DS_FIND_CHILDREN));
 		}
 	}
 
@@ -1322,7 +2240,6 @@ out:
 static int
 enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
 {
-	dmu_tx_t *tx = arg;
 	dsl_dataset_t *ds;
 	int err;
 	dsl_scan_t *scn = dp->dp_scan;
@@ -1352,12 +2269,37 @@ enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
 		ds = prev;
 	}
 
-	VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
-	    ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg, tx) == 0);
+	scan_ds_queue_insert(scn, ds->ds_object,
+	    dsl_dataset_phys(ds)->ds_prev_snap_txg);
 	dsl_dataset_rele(ds, FTAG);
 	return (0);
 }
 
+/*
+ * Scan the blocks described by a single dedup table entry. Does nothing
+ * unless the scan state is DSS_SCANNING. Otherwise, for each of the
+ * DDT_PHYS_TYPES physical variants in dde whose ddp_phys_birth is non-zero
+ * and not later than scn_max_txg, a block pointer is built with
+ * ddt_bp_create() and passed to the active scan function together with an
+ * all-zero bookmark. The tx argument is unused.
+ */
+/* ARGSUSED */
+void
+dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
+    ddt_entry_t *dde, dmu_tx_t *tx)
+{
+	const ddt_key_t *ddk = &dde->dde_key;
+	ddt_phys_t *ddp = dde->dde_phys;
+	blkptr_t bp;
+	zbookmark_phys_t zb = { 0 };
+	int p;
+
+	if (scn->scn_phys.scn_state != DSS_SCANNING)
+		return;
+
+	for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+		/* skip unused variants and ones born after the scan range */
+		if (ddp->ddp_phys_birth == 0 ||
+		    ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg)
+			continue;
+		ddt_bp_create(checksum, ddk, ddp, &bp);
+
+		scn->scn_visited_this_txg++;
+		scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);
+	}
+}
+
 /*
  * Scrub/dedup interaction.
  *
@@ -1432,36 +2374,20 @@ dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
 	    ddb->ddb_class > scn->scn_phys.scn_ddt_class_max);
 }
 
-/* ARGSUSED */
-void
-dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
-    ddt_entry_t *dde, dmu_tx_t *tx)
+static uint64_t
+dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
 {
-	const ddt_key_t *ddk = &dde->dde_key;
-	ddt_phys_t *ddp = dde->dde_phys;
-	blkptr_t bp;
-	zbookmark_phys_t zb = { 0 };
-
-	if (scn->scn_phys.scn_state != DSS_SCANNING)
-		return;
-
-	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
-		if (ddp->ddp_phys_birth == 0 ||
-		    ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg)
-			continue;
-		ddt_bp_create(checksum, ddk, ddp, &bp);
-
-		scn->scn_visited_this_txg++;
-		scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);
-	}
+	uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
+	if (ds->ds_is_snapshot)
+		return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg));
+	return (smt);
 }
 
 static void
 dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
 {
+	scan_ds_t *sds;
 	dsl_pool_t *dp = scn->scn_dp;
-	zap_cursor_t *zc;
-	zap_attribute_t *za;
 
 	if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
 	    scn->scn_phys.scn_ddt_class_max) {
@@ -1485,7 +2411,7 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
 
 		if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
 			VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
-			    enqueue_cb, tx, DS_FIND_CHILDREN));
+			    enqueue_cb, NULL, DS_FIND_CHILDREN));
 		} else {
 			dsl_scan_visitds(scn,
 			    dp->dp_origin_snap->ds_object, tx);
@@ -1493,42 +2419,42 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
 		ASSERT(!scn->scn_suspending);
 	} else if (scn->scn_phys.scn_bookmark.zb_objset !=
 	    ZB_DESTROYED_OBJSET) {
+		uint64_t dsobj = scn->scn_phys.scn_bookmark.zb_objset;
 		/*
-		 * If we were suspended, continue from here.  Note if the
+		 * If we were suspended, continue from here. Note if the
 		 * ds we were suspended on was deleted, the zb_objset may
 		 * be -1, so we will skip this and find a new objset
 		 * below.
 		 */
-		dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx);
+		dsl_scan_visitds(scn, dsobj, tx);
 		if (scn->scn_suspending)
 			return;
 	}
 
 	/*
-	 * In case we were suspended right at the end of the ds, zero the
+	 * In case we suspended right at the end of the ds, zero the
 	 * bookmark so we don't think that we're still trying to resume.
 	 */
 	bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_phys_t));
-	zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
-	za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
 
-	/* keep pulling things out of the zap-object-as-queue */
-	while (zap_cursor_init(zc, dp->dp_meta_objset,
-	    scn->scn_phys.scn_queue_obj),
-	    zap_cursor_retrieve(zc, za) == 0) {
+	/*
+	 * Keep pulling things out of the dataset avl queue. Updates to the
+	 * persistent zap-object-as-queue happen only at checkpoints.
+	 */
+	while ((sds = avl_first(&scn->scn_queue)) != NULL) {
 		dsl_dataset_t *ds;
-		uint64_t dsobj;
+		uint64_t dsobj = sds->sds_dsobj;
+		uint64_t txg = sds->sds_txg;
 
-		dsobj = zfs_strtonum(za->za_name, NULL);
-		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
-		    scn->scn_phys.scn_queue_obj, dsobj, tx));
+		/* dequeue and free the ds from the queue */
+		scan_ds_queue_remove(scn, dsobj);
+		sds = NULL;
 
-		/* Set up min/max txg */
+		/* set up min / max txg */
 		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
-		if (za->za_first_integer != 0) {
+		if (txg != 0) {
 			scn->scn_phys.scn_cur_min_txg =
-			    MAX(scn->scn_phys.scn_min_txg,
-			    za->za_first_integer);
+			    MAX(scn->scn_phys.scn_min_txg, txg);
 		} else {
 			scn->scn_phys.scn_cur_min_txg =
 			    MAX(scn->scn_phys.scn_min_txg,
@@ -1538,14 +2464,360 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
 		dsl_dataset_rele(ds, FTAG);
 
 		dsl_scan_visitds(scn, dsobj, tx);
-		zap_cursor_fini(zc);
 		if (scn->scn_suspending)
-			goto out;
+			return;
 	}
-	zap_cursor_fini(zc);
-out:
-	kmem_free(za, sizeof (zap_attribute_t));
-	kmem_free(zc, sizeof (zap_cursor_t));
+
+	/* No more objsets to fetch, we're done */
+	scn->scn_phys.scn_bookmark.zb_objset = ZB_DESTROYED_OBJSET;
+	ASSERT0(scn->scn_suspending);
+}
+
+/*
+ * Recursively count the leaf vdevs at or below vd. A vdev that is a log,
+ * spare or l2cache device, or that fails vdev_readable(), contributes
+ * zero, and nothing beneath it is examined.
+ */
+static uint64_t
+dsl_scan_count_leaves(vdev_t *vd)
+{
+	uint64_t i, leaves = 0;
+
+	/* we only count leaves that belong to the main pool and are readable */
+	if (vd->vdev_islog || vd->vdev_isspare ||
+	    vd->vdev_isl2cache || !vdev_readable(vd))
+		return (0);
+
+	if (vd->vdev_ops->vdev_op_leaf)
+		return (1);
+
+	/* interior vdev: sum the counts of all children */
+	for (i = 0; i < vd->vdev_children; i++) {
+		leaves += dsl_scan_count_leaves(vd->vdev_child[i]);
+	}
+
+	return (leaves);
+}
+
+/*
+ * Record one zio in the queue's per-txg statistics: the allocated sizes of
+ * all of bp's DVAs are summed and added to q_total_zio_size_this_txg, and
+ * q_zios_this_txg is incremented.
+ */
+static void
+scan_io_queues_update_zio_stats(dsl_scan_io_queue_t *q, const blkptr_t *bp)
+{
+	int i;
+	uint64_t cur_size = 0;
+
+	for (i = 0; i < BP_GET_NDVAS(bp); i++) {
+		cur_size += DVA_GET_ASIZE(&bp->blk_dva[i]);
+	}
+
+	q->q_total_zio_size_this_txg += cur_size;
+	q->q_zios_this_txg++;
+}
+
+/*
+ * Record one segment in the queue's per-txg statistics: (end - start) is
+ * added to q_total_seg_size_this_txg and q_segs_this_txg is incremented.
+ */
+static void
+scan_io_queues_update_seg_stats(dsl_scan_io_queue_t *q, uint64_t start,
+    uint64_t end)
+{
+	q->q_total_seg_size_this_txg += end - start;
+	q->q_segs_this_txg++;
+}
+
+/*
+ * Decide whether the I/O issuing phase should stop for this txg.
+ *
+ * Returns B_TRUE if the pool is shutting down, or if the scan has been
+ * running in this sync pass for longer than the minimum time
+ * (zfs_resilver_min_time_ms for resilvers, zfs_scrub_min_time_ms
+ * otherwise) and at least one of the following holds:
+ *  - dirty data has reached zfs_vdev_async_write_active_min_dirty_percent
+ *    of zfs_dirty_data_max,
+ *  - a txg sync is waiting, or
+ *  - at least zfs_txg_timeout seconds have passed since
+ *    spa_sync_starttime.
+ */
+static boolean_t
+scan_io_queue_check_suspend(dsl_scan_t *scn)
+{
+	/* See comment in dsl_scan_check_suspend() */
+	uint64_t curr_time_ns = gethrtime();
+	uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time;
+	uint64_t sync_time_ns = curr_time_ns -
+	    scn->scn_dp->dp_spa->spa_sync_starttime;
+	int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max;
+	int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
+	    zfs_resilver_min_time_ms : zfs_scrub_min_time_ms;
+
+	return ((NSEC2MSEC(scan_time_ns) > mintime &&
+	    (dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent ||
+	    txg_sync_waiting(scn->scn_dp) ||
+	    NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) ||
+	    spa_shutting_down(scn->scn_dp->dp_spa));
+}
+
+/*
+ * Given a list of scan_io_t's in io_list, this issues the io's out to
+ * disk. This consumes the io_list and frees the scan_io_t's. This is
+ * called when emptying queues, either when we're up against the memory
+ * limit or when we have finished scanning. Returns B_TRUE if we stopped
+ * processing the list before we finished. Any zios that were not issued
+ * will remain in the io_list.
+ *
+ * The sum of sio_asize over the sios that were issued is subtracted from
+ * scn_bytes_pending before returning.
+ */
+static boolean_t
+scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list)
+{
+	dsl_scan_t *scn = queue->q_scn;
+	scan_io_t *sio;
+	int64_t bytes_issued = 0;
+	boolean_t suspended = B_FALSE;
+
+	while ((sio = list_head(io_list)) != NULL) {
+		blkptr_t bp;
+
+		/* re-check before every sio so we can stop mid-list */
+		if (scan_io_queue_check_suspend(scn)) {
+			suspended = B_TRUE;
+			break;
+		}
+
+		/* build a bp for this sio on this queue's vdev and issue it */
+		sio2bp(sio, &bp, queue->q_vd->vdev_id);
+		bytes_issued += sio->sio_asize;
+		scan_exec_io(scn->scn_dp, &bp, sio->sio_flags,
+		    &sio->sio_zb, queue);
+		(void) list_remove_head(io_list);
+		scan_io_queues_update_zio_stats(queue, &bp);
+		kmem_cache_free(sio_cache, sio);
+	}
+
+	/* whatever was issued above is no longer pending */
+	atomic_add_64(&scn->scn_bytes_pending, -bytes_issued);
+
+	return (suspended);
+}
+
+/*
+ * This function removes sios from an IO queue which reside within a given
+ * range_seg_t and inserts them (in offset order) into a list. Note that
+ * we only ever return a maximum of 32 sios at once. If there are more sios
+ * to process within this segment that did not make it onto the list we
+ * return B_TRUE and otherwise B_FALSE.
+ *
+ * The caller must hold the queue's vdev_scan_io_queue_lock. On a B_TRUE
+ * return the segment has been resized to begin at the first sio that was
+ * left behind; on a B_FALSE return the segment's whole range has been
+ * removed from q_exts_by_addr.
+ */
+static boolean_t
+scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list)
+{
+	scan_io_t srch_sio, *sio, *next_sio;
+	avl_index_t idx;
+	uint_t num_sios = 0;
+	int64_t bytes_issued = 0;
+
+	ASSERT(rs != NULL);
+	ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
+
+	srch_sio.sio_offset = rs->rs_start;
+
+	/*
+	 * The exact start of the extent might not contain any matching zios,
+	 * so if that's the case, examine the next one in the tree.
+	 */
+	sio = avl_find(&queue->q_sios_by_addr, &srch_sio, &idx);
+	if (sio == NULL)
+		sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER);
+
+	/*
+	 * Stop once 32 sios have been gathered; the bound is exclusive so
+	 * that the batch size matches the documented maximum.
+	 */
+	while (sio != NULL && sio->sio_offset < rs->rs_end && num_sios < 32) {
+		ASSERT3U(sio->sio_offset, >=, rs->rs_start);
+		ASSERT3U(sio->sio_offset + sio->sio_asize, <=, rs->rs_end);
+
+		/* fetch the successor before unlinking the current node */
+		next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio);
+		avl_remove(&queue->q_sios_by_addr, sio);
+
+		bytes_issued += sio->sio_asize;
+		num_sios++;
+		list_insert_tail(list, sio);
+		sio = next_sio;
+	}
+
+	/*
+	 * We limit the number of sios we process at once to 32 to avoid
+	 * biting off more than we can chew. If we didn't take everything
+	 * in the segment we update it to reflect the work we were able to
+	 * complete. Otherwise, we remove it from the range tree entirely.
+	 */
+	if (sio != NULL && sio->sio_offset < rs->rs_end) {
+		range_tree_adjust_fill(queue->q_exts_by_addr, rs,
+		    -bytes_issued);
+		range_tree_resize_segment(queue->q_exts_by_addr, rs,
+		    sio->sio_offset, rs->rs_end - sio->sio_offset);
+
+		return (B_TRUE);
+	} else {
+		range_tree_remove(queue->q_exts_by_addr, rs->rs_start,
+		    rs->rs_end - rs->rs_start);
+		return (B_FALSE);
+	}
+}
+
+/*
+ * This is called from the queue emptying thread and selects the next
+ * extent from which we are to issue io's. The behavior of this function
+ * depends on the state of the scan, the current memory consumption and
+ * whether or not we are performing a scan shutdown.
+ * 1) We select extents in an elevator algorithm (LBA-order) if the scan
+ *	needs to perform a checkpoint
+ * 2) We select the largest available extent if we are up against the
+ *	memory limit.
+ * 3) Otherwise we don't select any extents.
+ *
+ * The zfs_scan_issue_strategy tunable overrides the choice between 1) and
+ * 2) whenever the scan is checkpointing or clearing: a value of 1 always
+ * takes the first extent of q_exts_by_addr and a value of 2 always takes
+ * the first extent of q_exts_by_size. Any other value leaves the default
+ * policy in place.
+ *
+ * The caller must hold the queue's vdev_scan_io_queue_lock. Returns NULL
+ * when no extent should be issued.
+ */
+static range_seg_t *
+scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue)
+{
+	dsl_scan_t *scn = queue->q_scn;
+
+	ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
+	ASSERT(scn->scn_is_sorted);
+
+	/* handle tunable overrides */
+	if (scn->scn_checkpointing || scn->scn_clearing) {
+		if (zfs_scan_issue_strategy == 1) {
+			return (range_tree_first(queue->q_exts_by_addr));
+		} else if (zfs_scan_issue_strategy == 2) {
+			return (avl_first(&queue->q_exts_by_size));
+		}
+	}
+
+	/*
+	 * During normal clearing, we want to issue our largest segments
+	 * first, keeping IO as sequential as possible, and leaving the
+	 * smaller extents for later with the hope that they might eventually
+	 * grow to larger sequential segments. However, when the scan is
+	 * checkpointing, no new extents will be added to the sorting queue,
+	 * so the way we are sorted now is as good as it will ever get.
+	 * In this case, we instead switch to issuing extents in LBA order.
+	 */
+	if (scn->scn_checkpointing) {
+		return (range_tree_first(queue->q_exts_by_addr));
+	} else if (scn->scn_clearing) {
+		return (avl_first(&queue->q_exts_by_size));
+	} else {
+		return (NULL);
+	}
+}
+
+/*
+ * Taskq callback, dispatched by scan_io_queues_run(), that performs one
+ * emptying run on a single scan I/O queue. arg is the dsl_scan_io_queue_t
+ * to process.
+ *
+ * The per-txg statistics of the queue are reset, then extents are taken
+ * one at a time from scan_io_queue_fetch_ext(). Each extent is drained in
+ * batches: scan_io_queue_gather() collects sios under the queue lock and
+ * scan_io_queue_issue() issues them with the lock dropped. The run ends
+ * when no further extent is selected or when issuing reports that the
+ * scan must suspend; in the latter case the sios that were gathered but
+ * not issued are put back with scan_io_queue_insert_impl().
+ */
+static void
+scan_io_queues_run_one(void *arg)
+{
+	dsl_scan_io_queue_t *queue = arg;
+	kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock;
+	boolean_t suspended = B_FALSE;
+	range_seg_t *rs = NULL;
+	scan_io_t *sio = NULL;
+	list_t sio_list;
+	uint64_t bytes_per_leaf = zfs_scan_vdev_limit;
+	uint64_t nr_leaves = dsl_scan_count_leaves(queue->q_vd);
+
+	ASSERT(queue->q_scn->scn_is_sorted);
+
+	list_create(&sio_list, sizeof (scan_io_t),
+	    offsetof(scan_io_t, sio_nodes.sio_list_node));
+	mutex_enter(q_lock);
+
+	/* calculate maximum in-flight bytes for this txg (min 1MB) */
+	queue->q_maxinflight_bytes =
+	    MAX(nr_leaves * bytes_per_leaf, 1ULL << 20);
+
+	/* reset per-queue scan statistics for this txg */
+	queue->q_total_seg_size_this_txg = 0;
+	queue->q_segs_this_txg = 0;
+	queue->q_total_zio_size_this_txg = 0;
+	queue->q_zios_this_txg = 0;
+
+	/* loop until we run out of time or sios */
+	while ((rs = scan_io_queue_fetch_ext(queue)) != NULL) {
+		uint64_t seg_start = 0, seg_end = 0;
+		boolean_t more_left = B_TRUE;
+
+		ASSERT(list_is_empty(&sio_list));
+
+		/* loop while we still have sios left to process in this rs */
+		while (more_left) {
+			scan_io_t *first_sio, *last_sio;
+
+			/*
+			 * We have selected which extent needs to be
+			 * processed next. Gather up the corresponding sios.
+			 */
+			more_left = scan_io_queue_gather(queue, rs, &sio_list);
+			ASSERT(!list_is_empty(&sio_list));
+			first_sio = list_head(&sio_list);
+			last_sio = list_tail(&sio_list);
+
+			/*
+			 * Track the span covered so far for the statistics:
+			 * the end advances with every batch, the start is
+			 * only taken while it still holds its initial 0.
+			 */
+			seg_end = last_sio->sio_offset + last_sio->sio_asize;
+			if (seg_start == 0)
+				seg_start = first_sio->sio_offset;
+
+			/*
+			 * Issuing sios can take a long time so drop the
+			 * queue lock. The sio queue won't be updated by
+			 * other threads since we're in syncing context so
+			 * we can be sure that our trees will remain exactly
+			 * as we left them.
+			 */
+			mutex_exit(q_lock);
+			suspended = scan_io_queue_issue(queue, &sio_list);
+			mutex_enter(q_lock);
+
+			if (suspended)
+				break;
+		}
+
+		/* update statistics for debugging purposes */
+		scan_io_queues_update_seg_stats(queue, seg_start, seg_end);
+
+		if (suspended)
+			break;
+	}
+
+	/*
+	 * If we were suspended in the middle of processing,
+	 * requeue any unfinished sios and exit.
+	 */
+	while ((sio = list_head(&sio_list)) != NULL) {
+		list_remove(&sio_list, sio);
+		scan_io_queue_insert_impl(queue, sio);
+	}
+
+	mutex_exit(q_lock);
+	list_destroy(&sio_list);
+}
+
+/*
+ * Performs an emptying run on all scan queues in the pool. This just
+ * punches out one thread per top-level vdev, each of which processes
+ * only that vdev's scan queue. We can parallelize the I/O here because
+ * we know that each queue's io's only affect its own top-level vdev.
+ *
+ * This function waits for the queue runs to complete, and must be
+ * called from dsl_scan_sync (or in general, syncing context).
+ *
+ * Returns immediately if scn_bytes_pending is zero. The taskq used for
+ * the runs is created on first use and kept in scn_taskq.
+ */
+static void
+scan_io_queues_run(dsl_scan_t *scn)
+{
+	spa_t *spa = scn->scn_dp->dp_spa;
+
+	ASSERT(scn->scn_is_sorted);
+	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
+
+	/* nothing is queued, so there is nothing to issue */
+	if (scn->scn_bytes_pending == 0)
+		return;
+
+	if (scn->scn_taskq == NULL) {
+		int nthreads = spa->spa_root_vdev->vdev_children;
+
+		/*
+		 * We need to make this taskq *always* execute as many
+		 * threads in parallel as we have top-level vdevs and no
+		 * less, otherwise strange serialization of the calls to
+		 * scan_io_queues_run_one can occur during spa_sync runs
+		 * and that significantly impacts performance.
+		 */
+		scn->scn_taskq = taskq_create("dsl_scan_iss", nthreads,
+		    minclsyspri, nthreads, nthreads, TASKQ_PREPOPULATE);
+	}
+
+	/* dispatch one run for every top-level vdev that has a queue */
+	for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
+		vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
+
+		mutex_enter(&vd->vdev_scan_io_queue_lock);
+		if (vd->vdev_scan_io_queue != NULL) {
+			VERIFY(taskq_dispatch(scn->scn_taskq,
+			    scan_io_queues_run_one, vd->vdev_scan_io_queue,
+			    TQ_SLEEP) != TASKQID_INVALID);
+		}
+		mutex_exit(&vd->vdev_scan_io_queue_lock);
+	}
+
+	/*
+	 * Wait for the queues to finish issuing their IOs for this run
+	 * before we return. There may still be IOs in flight at this
+	 * point.
+	 */
+	taskq_wait(scn->scn_taskq);
 }
 
 static boolean_t
@@ -1586,6 +2858,41 @@ dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 	return (0);
 }
 
+static void
+dsl_scan_update_stats(dsl_scan_t *scn)
+{
+	spa_t *spa = scn->scn_dp->dp_spa;
+	uint64_t i;
+	uint64_t seg_size_total = 0, zio_size_total = 0;
+	uint64_t seg_count_total = 0, zio_count_total = 0;
+
+	for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
+		vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
+		dsl_scan_io_queue_t *queue = vd->vdev_scan_io_queue;
+
+		if (queue == NULL)
+			continue;
+
+		seg_size_total += queue->q_total_seg_size_this_txg;
+		zio_size_total += queue->q_total_zio_size_this_txg;
+		seg_count_total += queue->q_segs_this_txg;
+		zio_count_total += queue->q_zios_this_txg;
+	}
+
+	if (seg_count_total == 0 || zio_count_total == 0) {
+		scn->scn_avg_seg_size_this_txg = 0;
+		scn->scn_avg_zio_size_this_txg = 0;
+		scn->scn_segs_this_txg = 0;
+		scn->scn_zios_this_txg = 0;
+		return;
+	}
+
+	scn->scn_avg_seg_size_this_txg = seg_size_total / seg_count_total;
+	scn->scn_avg_zio_size_this_txg = zio_size_total / zio_count_total;
+	scn->scn_segs_this_txg = seg_count_total;
+	scn->scn_zios_this_txg = zio_count_total;
+}
+
 boolean_t
 dsl_scan_active(dsl_scan_t *scn)
 {
@@ -1596,8 +2903,7 @@ dsl_scan_active(dsl_scan_t *scn)
 		return (B_FALSE);
 	if (spa_shutting_down(spa))
 		return (B_FALSE);
-	if ((scn->scn_phys.scn_state == DSS_SCANNING &&
-	    !dsl_scan_is_paused_scrub(scn)) ||
+	if ((dsl_scan_is_running(scn) && !dsl_scan_is_paused_scrub(scn)) ||
 	    (scn->scn_async_destroying && !scn->scn_async_stalled))
 		return (B_TRUE);
 
@@ -1608,13 +2914,60 @@ dsl_scan_active(dsl_scan_t *scn)
 	return (used != 0);
 }
 
-/* Called whenever a txg syncs. */
+static boolean_t
+dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
+    uint64_t phys_birth)
+{
+	vdev_t *vd;
+
+	if (DVA_GET_GANG(dva)) {
+		/*
+		 * Gang members may be spread across multiple
+		 * vdevs, so the best estimate we have is the
+		 * scrub range, which has already been checked.
+		 * XXX -- it would be better to change our
+		 * allocation policy to ensure that all
+		 * gang members reside on the same vdev.
+		 */
+		return (B_TRUE);
+	}
+
+	vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
+
+	/*
+	 * Check if the txg falls within the range which must be
+	 * resilvered.  DVAs outside this range can always be skipped.
+	 */
+	if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
+		return (B_FALSE);
+
+	/*
+	 * Check if the top-level vdev must resilver this offset.
+	 * When the offset does not intersect with a dirty leaf DTL
+	 * then it may be possible to skip the resilver IO.  The psize
+	 * is provided instead of asize to simplify the check for RAIDZ.
+	 */
+	if (!vdev_dtl_need_resilver(vd, DVA_GET_OFFSET(dva), psize))
+		return (B_FALSE);
+
+	return (B_TRUE);
+}
+
+/*
+ * This is the primary entry point for scans that is called from syncing
+ * context. Scans must happen entirely during syncing context so that we
+ * can guarantee that blocks we are currently scanning will not change out
+ * from under us. While a scan is active, this function controls how quickly
+ * transaction groups proceed, instead of the normal handling provided by
+ * txg_sync_thread().
+ */
 void
 dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 {
+	int err = 0;
 	dsl_scan_t *scn = dp->dp_scan;
 	spa_t *spa = dp->dp_spa;
-	int err = 0;
+	state_sync_type_t sync_type = SYNC_OPTIONAL;
 
 	/*
 	 * Check for scn_restart_txg before checking spa_load_state, so
@@ -1627,14 +2980,14 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 		if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
 			func = POOL_SCAN_RESILVER;
 		zfs_dbgmsg("restarting scan func=%u txg=%llu",
-		    func, tx->tx_txg);
+		    func, (longlong_t)tx->tx_txg);
 		dsl_scan_setup_sync(&func, tx);
 	}
 
 	/*
 	 * Only process scans in sync pass 1.
 	 */
-	if (spa_sync_pass(dp->dp_spa) > 1)
+	if (spa_sync_pass(spa) > 1)
 		return;
 
 	/*
@@ -1651,7 +3004,17 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 	if (!scn->scn_async_stalled && !dsl_scan_active(scn))
 		return;
 
+	/* reset scan statistics */
 	scn->scn_visited_this_txg = 0;
+	scn->scn_holes_this_txg = 0;
+	scn->scn_lt_min_this_txg = 0;
+	scn->scn_gt_max_this_txg = 0;
+	scn->scn_ddt_contained_this_txg = 0;
+	scn->scn_objsets_visited_this_txg = 0;
+	scn->scn_avg_seg_size_this_txg = 0;
+	scn->scn_segs_this_txg = 0;
+	scn->scn_avg_zio_size_this_txg = 0;
+	scn->scn_zios_this_txg = 0;
 	scn->scn_suspending = B_FALSE;
 	scn->scn_sync_start_time = gethrtime();
 	spa->spa_scrub_active = B_TRUE;
@@ -1664,13 +3027,14 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 	 * blocks than to scrub them.
 	 */
 	if (zfs_free_bpobj_enabled &&
-	    spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
+	    spa_version(spa) >= SPA_VERSION_DEADLISTS) {
 		scn->scn_is_bptree = B_FALSE;
-		scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+		scn->scn_zio_root = zio_root(spa, NULL,
 		    NULL, ZIO_FLAG_MUSTSUCCEED);
 		err = bpobj_iterate(&dp->dp_free_bpobj,
 		    dsl_scan_free_block_cb, scn, tx);
-		VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
+		VERIFY0(zio_wait(scn->scn_zio_root));
+		scn->scn_zio_root = NULL;
 
 		if (err != 0 && err != ERESTART)
 			zfs_panic_recover("error %u from bpobj_iterate()", err);
@@ -1679,11 +3043,12 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 	if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
 		ASSERT(scn->scn_async_destroying);
 		scn->scn_is_bptree = B_TRUE;
-		scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+		scn->scn_zio_root = zio_root(spa, NULL,
 		    NULL, ZIO_FLAG_MUSTSUCCEED);
 		err = bptree_iterate(dp->dp_meta_objset,
 		    dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx);
 		VERIFY0(zio_wait(scn->scn_zio_root));
+		scn->scn_zio_root = NULL;
 
 		if (err == EIO || err == ECKSUM) {
 			err = 0;
@@ -1770,110 +3135,189 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 		ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes);
 	}
 
-	if (scn->scn_phys.scn_state != DSS_SCANNING)
+	if (!dsl_scan_is_running(scn) || dsl_scan_is_paused_scrub(scn))
 		return;
 
-	if (scn->scn_done_txg == tx->tx_txg) {
-		ASSERT(!scn->scn_suspending);
-		/* finished with scan. */
-		zfs_dbgmsg("txg %llu scan complete", tx->tx_txg);
-		dsl_scan_done(scn, B_TRUE, tx);
-		ASSERT3U(spa->spa_scrub_inflight, ==, 0);
-		dsl_scan_sync_state(scn, tx);
+	/*
+	 * Wait a few txgs after importing to begin scanning so that
+	 * we can get the pool imported quickly.
+	 */
+	if (spa->spa_syncing_txg < spa->spa_first_txg + SCAN_IMPORT_WAIT_TXGS)
 		return;
-	}
 
-	if (dsl_scan_is_paused_scrub(scn))
-		return;
+	/*
+	 * It is possible to switch from unsorted to sorted at any time,
+	 * but afterwards the scan will remain sorted unless reloaded from
+	 * a checkpoint after a reboot.
+	 */
+	if (!zfs_scan_legacy) {
+		scn->scn_is_sorted = B_TRUE;
+		if (scn->scn_last_checkpoint == 0)
+			scn->scn_last_checkpoint = ddi_get_lbolt();
+	}
 
-	if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
-	    scn->scn_phys.scn_ddt_class_max) {
-		zfs_dbgmsg("doing scan sync txg %llu; "
-		    "ddt bm=%llu/%llu/%llu/%llx",
-		    (longlong_t)tx->tx_txg,
-		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
-		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
-		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
-		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
-		ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0);
-		ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0);
-		ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0);
-		ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0);
+	/*
+	 * For sorted scans, determine what kind of work we will be doing
+	 * this txg based on our memory limitations and whether or not we
+	 * need to perform a checkpoint.
+	 */
+	if (scn->scn_is_sorted) {
+		/*
+		 * If we are over our checkpoint interval, set scn_clearing
+		 * so that we can begin checkpointing immediately. The
+		 * checkpoint allows us to save a consistent bookmark
+		 * representing how much data we have scrubbed so far.
+		 * Otherwise, use the memory limit to determine if we should
+		 * scan for metadata or start issuing scrub IOs. We accumulate
+		 * metadata until we hit our hard memory limit at which point
+		 * we issue scrub IOs until we are at our soft memory limit.
+		 */
+		if (scn->scn_checkpointing ||
+		    ddi_get_lbolt() - scn->scn_last_checkpoint >
+		    SEC_TO_TICK(zfs_scan_checkpoint_intval)) {
+			if (!scn->scn_checkpointing)
+				zfs_dbgmsg("begin scan checkpoint");
+
+			scn->scn_checkpointing = B_TRUE;
+			scn->scn_clearing = B_TRUE;
+		} else {
+			boolean_t should_clear = dsl_scan_should_clear(scn);
+			if (should_clear && !scn->scn_clearing) {
+				zfs_dbgmsg("begin scan clearing");
+				scn->scn_clearing = B_TRUE;
+			} else if (!should_clear && scn->scn_clearing) {
+				zfs_dbgmsg("finish scan clearing");
+				scn->scn_clearing = B_FALSE;
+			}
+		}
 	} else {
-		zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu",
-		    (longlong_t)tx->tx_txg,
-		    (longlong_t)scn->scn_phys.scn_bookmark.zb_objset,
-		    (longlong_t)scn->scn_phys.scn_bookmark.zb_object,
-		    (longlong_t)scn->scn_phys.scn_bookmark.zb_level,
-		    (longlong_t)scn->scn_phys.scn_bookmark.zb_blkid);
+		ASSERT0(scn->scn_checkpointing);
+		ASSERT0(scn->scn_clearing);
 	}
 
-	scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
-	    NULL, ZIO_FLAG_CANFAIL);
-	dsl_pool_config_enter(dp, FTAG);
-	dsl_scan_visit(scn, tx);
-	dsl_pool_config_exit(dp, FTAG);
-	(void) zio_wait(scn->scn_zio_root);
-	scn->scn_zio_root = NULL;
+	if (!scn->scn_clearing && scn->scn_done_txg == 0) {
+		/* Need to scan metadata for more blocks to scrub */
+		dsl_scan_phys_t *scnp = &scn->scn_phys;
+		taskqid_t prefetch_tqid;
+		uint64_t bytes_per_leaf = zfs_scan_vdev_limit;
+		uint64_t nr_leaves = dsl_scan_count_leaves(spa->spa_root_vdev);
 
-	zfs_dbgmsg("visited %llu blocks in %llums",
-	    (longlong_t)scn->scn_visited_this_txg,
-	    (longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time));
+		/*
+		 * Calculate the max number of in-flight bytes for pool-wide
+		 * scanning operations (minimum 1MB). Limits for the issuing
+		 * phase are done per top-level vdev and are handled separately.
+		 */
+		scn->scn_maxinflight_bytes =
+		    MAX(nr_leaves * bytes_per_leaf, 1ULL << 20);
+
+		if (scnp->scn_ddt_bookmark.ddb_class <=
+		    scnp->scn_ddt_class_max) {
+			ASSERT(ZB_IS_ZERO(&scnp->scn_bookmark));
+			zfs_dbgmsg("doing scan sync txg %llu; "
+			    "ddt bm=%llu/%llu/%llu/%llx",
+			    (longlong_t)tx->tx_txg,
+			    (longlong_t)scnp->scn_ddt_bookmark.ddb_class,
+			    (longlong_t)scnp->scn_ddt_bookmark.ddb_type,
+			    (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum,
+			    (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor);
+		} else {
+			zfs_dbgmsg("doing scan sync txg %llu; "
+			    "bm=%llu/%llu/%llu/%llu",
+			    (longlong_t)tx->tx_txg,
+			    (longlong_t)scnp->scn_bookmark.zb_objset,
+			    (longlong_t)scnp->scn_bookmark.zb_object,
+			    (longlong_t)scnp->scn_bookmark.zb_level,
+			    (longlong_t)scnp->scn_bookmark.zb_blkid);
+		}
 
-	if (!scn->scn_suspending) {
-		scn->scn_done_txg = tx->tx_txg + 1;
-		zfs_dbgmsg("txg %llu traversal complete, waiting till txg %llu",
-		    tx->tx_txg, scn->scn_done_txg);
-	}
+		scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+		    NULL, ZIO_FLAG_CANFAIL);
 
-	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
-		mutex_enter(&spa->spa_scrub_lock);
-		while (spa->spa_scrub_inflight > 0) {
-			cv_wait(&spa->spa_scrub_io_cv,
-			    &spa->spa_scrub_lock);
-		}
-		mutex_exit(&spa->spa_scrub_lock);
-	}
+		scn->scn_prefetch_stop = B_FALSE;
+		prefetch_tqid = taskq_dispatch(dp->dp_sync_taskq,
+		    dsl_scan_prefetch_thread, scn, TQ_SLEEP);
+		ASSERT(prefetch_tqid != TASKQID_INVALID);
 
-	dsl_scan_sync_state(scn, tx);
-}
+		dsl_pool_config_enter(dp, FTAG);
+		dsl_scan_visit(scn, tx);
+		dsl_pool_config_exit(dp, FTAG);
 
-/*
- * This will start a new scan, or restart an existing one.
- */
-void
-dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg)
-{
-	if (txg == 0) {
-		dmu_tx_t *tx;
-		tx = dmu_tx_create_dd(dp->dp_mos_dir);
-		VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
+		mutex_enter(&dp->dp_spa->spa_scrub_lock);
+		scn->scn_prefetch_stop = B_TRUE;
+		cv_broadcast(&spa->spa_scrub_io_cv);
+		mutex_exit(&dp->dp_spa->spa_scrub_lock);
 
-		txg = dmu_tx_get_txg(tx);
-		dp->dp_scan->scn_restart_txg = txg;
-		dmu_tx_commit(tx);
-	} else {
-		dp->dp_scan->scn_restart_txg = txg;
+		taskq_wait_id(dp->dp_sync_taskq, prefetch_tqid);
+		(void) zio_wait(scn->scn_zio_root);
+		scn->scn_zio_root = NULL;
+
+		zfs_dbgmsg("scan visited %llu blocks in %llums "
+		    "(%llu os's, %llu holes, %llu < mintxg, "
+		    "%llu in ddt, %llu > maxtxg)",
+		    (longlong_t)scn->scn_visited_this_txg,
+		    (longlong_t)NSEC2MSEC(gethrtime() -
+		    scn->scn_sync_start_time),
+		    (longlong_t)scn->scn_objsets_visited_this_txg,
+		    (longlong_t)scn->scn_holes_this_txg,
+		    (longlong_t)scn->scn_lt_min_this_txg,
+		    (longlong_t)scn->scn_ddt_contained_this_txg,
+		    (longlong_t)scn->scn_gt_max_this_txg);
+
+		if (!scn->scn_suspending) {
+			ASSERT0(avl_numnodes(&scn->scn_queue));
+			scn->scn_done_txg = tx->tx_txg + 1;
+			if (scn->scn_is_sorted) {
+				scn->scn_checkpointing = B_TRUE;
+				scn->scn_clearing = B_TRUE;
+			}
+			zfs_dbgmsg("scan complete txg %llu",
+			    (longlong_t)tx->tx_txg);
+		}
+	} else if (scn->scn_is_sorted && scn->scn_bytes_pending != 0) {
+		/* need to issue scrubbing IOs from per-vdev queues */
+		scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+		    NULL, ZIO_FLAG_CANFAIL);
+		scan_io_queues_run(scn);
+		(void) zio_wait(scn->scn_zio_root);
+		scn->scn_zio_root = NULL;
+
+		/* calculate and dprintf the current memory usage */
+		(void) dsl_scan_should_clear(scn);
+		dsl_scan_update_stats(scn);
+
+		zfs_dbgmsg("scan issued %llu blocks (%llu segs) in %llums "
+		    "(avg_block_size = %llu, avg_seg_size = %llu)",
+		    (longlong_t)scn->scn_zios_this_txg,
+		    (longlong_t)scn->scn_segs_this_txg,
+		    (longlong_t)NSEC2MSEC(gethrtime() -
+		    scn->scn_sync_start_time),
+		    (longlong_t)scn->scn_avg_zio_size_this_txg,
+		    (longlong_t)scn->scn_avg_seg_size_this_txg);
+	} else if (scn->scn_done_txg != 0 && scn->scn_done_txg <= tx->tx_txg) {
+		/* Finished with everything. Mark the scrub as complete */
+		zfs_dbgmsg("scan issuing complete txg %llu",
+		    (longlong_t)tx->tx_txg);
+		ASSERT3U(scn->scn_done_txg, !=, 0);
+		ASSERT0(spa->spa_scrub_inflight);
+		ASSERT0(scn->scn_bytes_pending);
+		dsl_scan_done(scn, B_TRUE, tx);
+		sync_type = SYNC_MANDATORY;
 	}
-	zfs_dbgmsg("restarting resilver txg=%llu", txg);
-}
 
-boolean_t
-dsl_scan_resilvering(dsl_pool_t *dp)
-{
-	return (dp->dp_scan->scn_phys.scn_state == DSS_SCANNING &&
-	    dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER);
+	dsl_scan_sync_state(scn, tx, sync_type);
 }
 
-/*
- * scrub consumers
- */
-
 static void
-count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
+count_block(dsl_scan_t *scn, zfs_all_blkstats_t *zab, const blkptr_t *bp)
 {
 	int i;
 
+	/* update the spa's stats on how many bytes we have issued */
+	for (i = 0; i < BP_GET_NDVAS(bp); i++) {
+		atomic_add_64(&scn->scn_dp->dp_spa->spa_scan_pass_issued,
+		    DVA_GET_ASIZE(&bp->blk_dva[i]));
+	}
+
 	/*
 	 * If we resume after a reboot, zab will be NULL; don't record
 	 * incomplete stats in that case.
@@ -1881,6 +3325,8 @@ count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
 	if (zab == NULL)
 		return;
 
+	mutex_enter(&zab->zab_lock);
+
 	for (i = 0; i < 4; i++) {
 		int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
 		int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
@@ -1916,63 +3362,97 @@ count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
 			break;
 		}
 	}
+
+	mutex_exit(&zab->zab_lock);
 }
 
 static void
-dsl_scan_scrub_done(zio_t *zio)
+scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio)
 {
-	spa_t *spa = zio->io_spa;
-
-	abd_free(zio->io_abd);
+	avl_index_t idx;
+	int64_t asize = sio->sio_asize;
+	dsl_scan_t *scn = queue->q_scn;
 
-	mutex_enter(&spa->spa_scrub_lock);
-	spa->spa_scrub_inflight--;
-	cv_broadcast(&spa->spa_scrub_io_cv);
+	ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
 
-	if (zio->io_error && (zio->io_error != ECKSUM ||
-	    !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) {
-		spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++;
+	if (avl_find(&queue->q_sios_by_addr, sio, &idx) != NULL) {
+		/* block is already scheduled for reading */
+		atomic_add_64(&scn->scn_bytes_pending, -asize);
+		kmem_cache_free(sio_cache, sio);
+		return;
 	}
-	mutex_exit(&spa->spa_scrub_lock);
+	avl_insert(&queue->q_sios_by_addr, sio, idx);
+	range_tree_add(queue->q_exts_by_addr, sio->sio_offset, asize);
 }
 
-static boolean_t
-dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
-    uint64_t phys_birth)
+/*
+ * Given all the info we got from our metadata scanning process, we
+ * construct a scan_io_t and insert it into the scan sorting queue. The
+ * I/O must already be suitable for us to process. This is controlled
+ * by dsl_scan_enqueue().
+ */
+static void
+scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i,
+    int zio_flags, const zbookmark_phys_t *zb)
 {
-	vdev_t *vd;
+	dsl_scan_t *scn = queue->q_scn;
+	scan_io_t *sio = kmem_cache_alloc(sio_cache, KM_SLEEP);
 
-	if (DVA_GET_GANG(dva)) {
-		/*
-		 * Gang members may be spread across multiple
-		 * vdevs, so the best estimate we have is the
-		 * scrub range, which has already been checked.
-		 * XXX -- it would be better to change our
-		 * allocation policy to ensure that all
-		 * gang members reside on the same vdev.
-		 */
-		return (B_TRUE);
-	}
+	ASSERT0(BP_IS_GANG(bp));
+	ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
 
-	vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
+	bp2sio(bp, sio, dva_i);
+	sio->sio_flags = zio_flags;
+	sio->sio_zb = *zb;
 
 	/*
-	 * Check if the txg falls within the range which must be
-	 * resilvered.  DVAs outside this range can always be skipped.
+	 * Increment the bytes pending counter now so that we can't
+	 * get an integer underflow in case the worker processes the
+	 * zio before we get to incrementing this counter.
 	 */
-	if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
-		return (B_FALSE);
+	atomic_add_64(&scn->scn_bytes_pending, sio->sio_asize);
+
+	scan_io_queue_insert_impl(queue, sio);
+}
+
+/*
+ * Given a set of I/O parameters as discovered by the metadata traversal
+ * process, attempts to place the I/O into the sorted queues (if allowed),
+ * or immediately executes the I/O.
+ */
+static void
+dsl_scan_enqueue(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
+    const zbookmark_phys_t *zb)
+{
+	spa_t *spa = dp->dp_spa;
+
+	ASSERT(!BP_IS_EMBEDDED(bp));
 
 	/*
-	 * Check if the top-level vdev must resilver this offset.
-	 * When the offset does not intersect with a dirty leaf DTL
-	 * then it may be possible to skip the resilver IO.  The psize
-	 * is provided instead of asize to simplify the check for RAIDZ.
+	 * Gang blocks are hard to issue sequentially, so we just issue them
+	 * here immediately instead of queuing them.
 	 */
-	if (!vdev_dtl_need_resilver(vd, DVA_GET_OFFSET(dva), psize))
-		return (B_FALSE);
+	if (!dp->dp_scan->scn_is_sorted || BP_IS_GANG(bp)) {
+		scan_exec_io(dp, bp, zio_flags, zb, NULL);
+		return;
+	}
 
-	return (B_TRUE);
+	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
+		dva_t dva;
+		vdev_t *vdev;
+
+		dva = bp->blk_dva[i];
+		vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&dva));
+		ASSERT(vdev != NULL);
+
+		mutex_enter(&vdev->vdev_scan_io_queue_lock);
+		if (vdev->vdev_scan_io_queue == NULL)
+			vdev->vdev_scan_io_queue = scan_io_queue_create(vdev);
+		ASSERT(dp->dp_scan != NULL);
+		scan_io_queue_insert(vdev->vdev_scan_io_queue, bp,
+		    i, zio_flags, zb);
+		mutex_exit(&vdev->vdev_scan_io_queue_lock);
+	}
 }
 
 static int
@@ -1980,32 +3460,29 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
     const blkptr_t *bp, const zbookmark_phys_t *zb)
 {
 	dsl_scan_t *scn = dp->dp_scan;
-	size_t psize = BP_GET_PSIZE(bp);
 	spa_t *spa = dp->dp_spa;
 	uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
+	size_t psize = BP_GET_PSIZE(bp);
 	boolean_t needs_io = B_FALSE;
 	int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
-	int scan_delay = 0;
 
 	if (phys_birth <= scn->scn_phys.scn_min_txg ||
 	    phys_birth >= scn->scn_phys.scn_max_txg)
 		return (0);
 
-	count_block(dp->dp_blkstats, bp);
-
-	if (BP_IS_EMBEDDED(bp))
+	if (BP_IS_EMBEDDED(bp)) {
+		count_block(scn, dp->dp_blkstats, bp);
 		return (0);
+	}
 
 	ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
 	if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
 		zio_flags |= ZIO_FLAG_SCRUB;
 		needs_io = B_TRUE;
-		scan_delay = zfs_scrub_delay;
 	} else {
 		ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER);
 		zio_flags |= ZIO_FLAG_RESILVER;
 		needs_io = B_FALSE;
-		scan_delay = zfs_resilver_delay;
 	}
 
 	/* If it's an intent log block, failure is expected. */
@@ -2029,91 +3506,348 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
 	}
 
 	if (needs_io && !zfs_no_scrub_io) {
-		vdev_t *rvd = spa->spa_root_vdev;
-		uint64_t maxinflight = rvd->vdev_children * zfs_top_maxinflight;
+		dsl_scan_enqueue(dp, bp, zio_flags, zb);
+	} else {
+		count_block(scn, dp->dp_blkstats, bp);
+	}
+
+	/* do not relocate this block */
+	return (0);
+}
+
+static void
+dsl_scan_scrub_done(zio_t *zio)
+{
+	spa_t *spa = zio->io_spa;
+	blkptr_t *bp = zio->io_bp;
+	dsl_scan_io_queue_t *queue = zio->io_private;
+
+	abd_free(zio->io_abd);
+
+	if (queue == NULL) {
+		mutex_enter(&spa->spa_scrub_lock);
+		ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp));
+		spa->spa_scrub_inflight -= BP_GET_PSIZE(bp);
+		cv_broadcast(&spa->spa_scrub_io_cv);
+		mutex_exit(&spa->spa_scrub_lock);
+	} else {
+		mutex_enter(&queue->q_vd->vdev_scan_io_queue_lock);
+		ASSERT3U(queue->q_inflight_bytes, >=, BP_GET_PSIZE(bp));
+		queue->q_inflight_bytes -= BP_GET_PSIZE(bp);
+		cv_broadcast(&queue->q_zio_cv);
+		mutex_exit(&queue->q_vd->vdev_scan_io_queue_lock);
+	}
+
+	if (zio->io_error && (zio->io_error != ECKSUM ||
+	    !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) {
+		atomic_inc_64(&spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors);
+	}
+}
 
+/*
+ * Given a scanning zio's information, executes the zio. The zio need
+ * not necessarily be only sortable, this function simply executes the
+ * zio, no matter what it is. The optional queue argument allows the
+ * caller to specify that they want per top level vdev IO rate limiting
+ * instead of the legacy global limiting.
+ */
+static void
+scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
+    const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue)
+{
+	spa_t *spa = dp->dp_spa;
+	dsl_scan_t *scn = dp->dp_scan;
+	size_t size = BP_GET_PSIZE(bp);
+	abd_t *data = abd_alloc_for_io(size, B_FALSE);
+
+	if (queue == NULL) {
 		mutex_enter(&spa->spa_scrub_lock);
-		while (spa->spa_scrub_inflight >= maxinflight)
+		while (spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)
 			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
-		spa->spa_scrub_inflight++;
+		spa->spa_scrub_inflight += BP_GET_PSIZE(bp);
 		mutex_exit(&spa->spa_scrub_lock);
+	} else {
+		kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock;
 
-		/*
-		 * If we're seeing recent (zfs_scan_idle) "important" I/Os
-		 * then throttle our workload to limit the impact of a scan.
-		 */
-		if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle)
-			delay(scan_delay);
+		mutex_enter(q_lock);
+		while (queue->q_inflight_bytes >= queue->q_maxinflight_bytes)
+			cv_wait(&queue->q_zio_cv, q_lock);
+		queue->q_inflight_bytes += BP_GET_PSIZE(bp);
+		mutex_exit(q_lock);
+	}
+
+	count_block(scn, dp->dp_blkstats, bp);
+	zio_nowait(zio_read(scn->scn_zio_root, spa, bp, data, size,
+	    dsl_scan_scrub_done, queue, ZIO_PRIORITY_SCRUB, zio_flags, zb));
+}
 
-		zio_nowait(zio_read(NULL, spa, bp,
-		    abd_alloc_for_io(psize, B_FALSE),
-		    psize, dsl_scan_scrub_done, NULL,
-		    ZIO_PRIORITY_SCRUB, zio_flags, zb));
+/*
+ * This is the primary extent sorting algorithm. We balance two parameters:
+ * 1) how many bytes of I/O are in an extent
+ * 2) how well the extent is filled with I/O (as a fraction of its total size)
+ * Since we allow extents to have gaps between their constituent I/Os, it's
+ * possible to have a fairly large extent that contains the same amount of
+ * I/O bytes as a much smaller extent, which just packs the I/O more tightly.
+ * The algorithm sorts based on a score calculated from the extent's size,
+ * the relative fill volume (in %) and a "fill weight" parameter that controls
+ * the split between whether we prefer larger extents or more well populated
+ * extents:
+ *
+ * SCORE = FILL_IN_BYTES + (FILL_IN_PERCENT * FILL_IN_BYTES * FILL_WEIGHT)
+ *
+ * Example:
+ * 1) assume extsz = 64 MiB
+ * 2) assume fill = 32 MiB (extent is half full)
+ * 3) assume fill_weight = 3
+ * 4)	SCORE = 32M + (((32M * 100) / 64M) * 3 * 32M) / 100
+ *	SCORE = 32M + (50 * 3 * 32M) / 100
+ *	SCORE = 32M + (4800M / 100)
+ *	SCORE = 32M + 48M
+ *	         ^     ^
+ *	         |     +--- final total relative fill-based score
+ *	         +--------- final total fill-based score
+ *	SCORE = 80M
+ *
+ * As can be seen, at fill_weight=3, the algorithm is slightly biased towards
+ * extents that are more completely filled (in a 3:2 ratio) vs just larger.
+ * Note that as an optimization, we replace multiplication and division by
+ * 100 with bitshifting by 7 (which effectively multiplies and divides by 128).
+ */
+static int
+ext_size_compare(const void *x, const void *y)
+{
+	const range_seg_t *rsa = x, *rsb = y;
+	uint64_t sa = rsa->rs_end - rsa->rs_start,
+	    sb = rsb->rs_end - rsb->rs_start;
+	uint64_t score_a, score_b;
+
+	score_a = rsa->rs_fill + ((((rsa->rs_fill << 7) / sa) *
+	    fill_weight * rsa->rs_fill) >> 7);
+	score_b = rsb->rs_fill + ((((rsb->rs_fill << 7) / sb) *
+	    fill_weight * rsb->rs_fill) >> 7);
+
+	if (score_a > score_b)
+		return (-1);
+	if (score_a == score_b) {
+		if (rsa->rs_start < rsb->rs_start)
+			return (-1);
+		if (rsa->rs_start == rsb->rs_start)
+			return (0);
+		return (1);
 	}
+	return (1);
+}
 
-	/* do not relocate this block */
-	return (0);
+/*
+ * Comparator for the q_sios_by_addr tree. Sorting is simply performed
+ * based on LBA-order (from lowest to highest).
+ */
+static int
+sio_addr_compare(const void *x, const void *y)
+{
+	const scan_io_t *a = x, *b = y;
+
+	if (a->sio_offset < b->sio_offset)
+		return (-1);
+	if (a->sio_offset == b->sio_offset)
+		return (0);
+	return (1);
+}
+
+/* IO queues are created on demand when they are needed. */
+static dsl_scan_io_queue_t *
+scan_io_queue_create(vdev_t *vd)
+{
+	dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan;
+	dsl_scan_io_queue_t *q = kmem_zalloc(sizeof (*q), KM_SLEEP);
+
+	q->q_scn = scn;
+	q->q_vd = vd;
+	cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL);
+	q->q_exts_by_addr = range_tree_create_impl(&rt_avl_ops,
+	    &q->q_exts_by_size, ext_size_compare,
+	    &q->q_vd->vdev_scan_io_queue_lock, zfs_scan_max_ext_gap);
+	avl_create(&q->q_sios_by_addr, sio_addr_compare,
+	    sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node));
+
+	return (q);
 }
 
 /*
- * Called by the ZFS_IOC_POOL_SCAN ioctl to start a scrub or resilver.
- * Can also be called to resume a paused scrub.
+ * Destroys a scan queue and all segments and scan_io_t's contained in it.
+ * No further execution of I/O occurs, anything pending in the queue is
+ * simply freed without being executed.
  */
-int
-dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
+void
+dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue)
 {
-	spa_t *spa = dp->dp_spa;
-	dsl_scan_t *scn = dp->dp_scan;
+	dsl_scan_t *scn = queue->q_scn;
+	scan_io_t *sio;
+	void *cookie = NULL;
+	int64_t bytes_dequeued = 0;
+
+	ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
+
+	while ((sio = avl_destroy_nodes(&queue->q_sios_by_addr, &cookie)) !=
+	    NULL) {
+		ASSERT(range_tree_contains(queue->q_exts_by_addr,
+		    sio->sio_offset, sio->sio_asize));
+		bytes_dequeued += sio->sio_asize;
+		kmem_cache_free(sio_cache, sio);
+	}
 
-	/*
-	 * Purge all vdev caches and probe all devices.  We do this here
-	 * rather than in sync context because this requires a writer lock
-	 * on the spa_config lock, which we can't do from sync context.  The
-	 * spa_scrub_reopen flag indicates that vdev_open() should not
-	 * attempt to start another scrub.
-	 */
-	spa_vdev_state_enter(spa, SCL_NONE);
-	spa->spa_scrub_reopen = B_TRUE;
-	vdev_reopen(spa->spa_root_vdev);
-	spa->spa_scrub_reopen = B_FALSE;
-	(void) spa_vdev_state_exit(spa, NULL, 0);
+	atomic_add_64(&scn->scn_bytes_pending, -bytes_dequeued);
+	range_tree_vacate(queue->q_exts_by_addr, NULL, queue);
+	range_tree_destroy(queue->q_exts_by_addr);
+	avl_destroy(&queue->q_sios_by_addr);
+	cv_destroy(&queue->q_zio_cv);
 
-	if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) {
-		/* got scrub start cmd, resume paused scrub */
-		int err = dsl_scrub_set_pause_resume(scn->scn_dp,
-		    POOL_SCRUB_NORMAL);
-		if (err == 0)
-			return (SET_ERROR(ECANCELED));
+	kmem_free(queue, sizeof (*queue));
+}
 
-		return (SET_ERROR(err));
+/*
+ * Properly transfers a dsl_scan_io_queue_t from `svd' to `tvd'. This is
+ * called on behalf of vdev_top_transfer when creating or destroying
+ * a mirror vdev due to zpool attach/detach.
+ */
+void
+dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd)
+{
+	mutex_enter(&svd->vdev_scan_io_queue_lock);
+	mutex_enter(&tvd->vdev_scan_io_queue_lock);
+
+	VERIFY3P(tvd->vdev_scan_io_queue, ==, NULL);
+	tvd->vdev_scan_io_queue = svd->vdev_scan_io_queue;
+	svd->vdev_scan_io_queue = NULL;
+	if (tvd->vdev_scan_io_queue != NULL) {
+		tvd->vdev_scan_io_queue->q_vd = tvd;
+		range_tree_set_lock(tvd->vdev_scan_io_queue->q_exts_by_addr,
+		    &tvd->vdev_scan_io_queue_lock);
 	}
 
-	return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check,
-	    dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_NONE));
+	mutex_exit(&tvd->vdev_scan_io_queue_lock);
+	mutex_exit(&svd->vdev_scan_io_queue_lock);
 }
 
-static boolean_t
-dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx)
+static void
+scan_io_queues_destroy(dsl_scan_t *scn)
 {
-	return (scn->scn_restart_txg != 0 &&
-	    scn->scn_restart_txg <= tx->tx_txg);
+	vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev;
+
+	for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+		vdev_t *tvd = rvd->vdev_child[i];
+
+		mutex_enter(&tvd->vdev_scan_io_queue_lock);
+		if (tvd->vdev_scan_io_queue != NULL)
+			dsl_scan_io_queue_destroy(tvd->vdev_scan_io_queue);
+		tvd->vdev_scan_io_queue = NULL;
+		mutex_exit(&tvd->vdev_scan_io_queue_lock);
+	}
 }
 
-#if defined(_KERNEL) && defined(HAVE_SPL)
-module_param(zfs_top_maxinflight, int, 0644);
-MODULE_PARM_DESC(zfs_top_maxinflight, "Max I/Os per top-level");
+static void
+dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i)
+{
+	dsl_pool_t *dp = spa->spa_dsl_pool;
+	dsl_scan_t *scn = dp->dp_scan;
+	vdev_t *vdev;
+	kmutex_t *q_lock;
+	dsl_scan_io_queue_t *queue;
+	scan_io_t srch, *sio;
+	avl_index_t idx;
+	uint64_t start, size;
+
+	vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[dva_i]));
+	ASSERT(vdev != NULL);
+	q_lock = &vdev->vdev_scan_io_queue_lock;
+	queue = vdev->vdev_scan_io_queue;
+
+	mutex_enter(q_lock);
+	if (queue == NULL) {
+		mutex_exit(q_lock);
+		return;
+	}
+
+	bp2sio(bp, &srch, dva_i);
+	start = srch.sio_offset;
+	size = srch.sio_asize;
+
+	/*
+	 * We can find the zio in two states:
+	 * 1) Cold, just sitting in the queue of zio's to be issued at
+	 *	some point in the future. In this case, all we do is
+	 *	remove the zio from the q_sios_by_addr tree, decrement
+	 *	its data volume from the containing range_seg_t and
+	 *	resort the q_exts_by_size tree to reflect that the
+	 *	range_seg_t has lost some of its 'fill'. We don't shorten
+	 *	the range_seg_t - this is usually rare enough not to be
+	 *	worth the extra hassle of trying to keep track of precise
+	 *	extent boundaries.
+	 * 2) Hot, where the zio is currently in-flight in
+	 *	dsl_scan_issue_ios. In this case, we can't simply
+	 *	reach in and stop the in-flight zio's, so we instead
+	 *	block the caller. Eventually, dsl_scan_issue_ios will
+	 *	be done with issuing the zio's it gathered and will
+	 *	signal us.
+	 */
+	sio = avl_find(&queue->q_sios_by_addr, &srch, &idx);
+	if (sio != NULL) {
+		int64_t asize = sio->sio_asize;
+		blkptr_t tmpbp;
+
+		/* Got it while it was cold in the queue */
+		ASSERT3U(start, ==, sio->sio_offset);
+		ASSERT3U(size, ==, asize);
+		avl_remove(&queue->q_sios_by_addr, sio);
 
-module_param(zfs_resilver_delay, int, 0644);
-MODULE_PARM_DESC(zfs_resilver_delay, "Number of ticks to delay resilver");
+		ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size));
+		range_tree_remove_fill(queue->q_exts_by_addr, start, size);
+
+		/*
+		 * We only update scn_bytes_pending in the cold path,
+		 * otherwise it will already have been accounted for as
+		 * part of the zio's execution.
+		 */
+		atomic_add_64(&scn->scn_bytes_pending, -asize);
 
-module_param(zfs_scrub_delay, int, 0644);
-MODULE_PARM_DESC(zfs_scrub_delay, "Number of ticks to delay scrub");
+		/* count the block as though we issued it */
+		sio2bp(sio, &tmpbp, dva_i);
+		count_block(scn, dp->dp_blkstats, &tmpbp);
 
-module_param(zfs_scan_idle, int, 0644);
-MODULE_PARM_DESC(zfs_scan_idle, "Idle window in clock ticks");
+		kmem_cache_free(sio_cache, sio);
+	}
+	mutex_exit(q_lock);
+}
 
-module_param(zfs_scan_min_time_ms, int, 0644);
-MODULE_PARM_DESC(zfs_scan_min_time_ms, "Min millisecs to scrub per txg");
+/*
+ * Callback invoked when a zio_free() zio is executing. This needs to be
+ * intercepted to prevent the zio from deallocating a particular portion
+ * of disk space and it then getting reallocated and written to, while we
+ * still have it queued up for processing.
+ */
+void
+dsl_scan_freed(spa_t *spa, const blkptr_t *bp)
+{
+	dsl_pool_t *dp = spa->spa_dsl_pool;
+	dsl_scan_t *scn = dp->dp_scan;
+
+	ASSERT(!BP_IS_EMBEDDED(bp));
+	ASSERT(scn != NULL);
+	if (!dsl_scan_is_running(scn))
+		return;
+
+	for (int i = 0; i < BP_GET_NDVAS(bp); i++)
+		dsl_scan_freed_dva(spa, bp, i);
+}
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+/* CSTYLED */
+module_param(zfs_scan_vdev_limit, ulong, 0644);
+MODULE_PARM_DESC(zfs_scan_vdev_limit,
+	"Max bytes in flight per leaf vdev for scrubs and resilvers");
+
+module_param(zfs_scrub_min_time_ms, int, 0644);
+MODULE_PARM_DESC(zfs_scrub_min_time_ms, "Min millisecs to scrub per txg");
 
 module_param(zfs_free_min_time_ms, int, 0644);
 MODULE_PARM_DESC(zfs_free_min_time_ms, "Min millisecs to free per txg");
@@ -2133,4 +3867,30 @@ MODULE_PARM_DESC(zfs_free_max_blocks, "Max number of blocks freed in one txg");
 
 module_param(zfs_free_bpobj_enabled, int, 0644);
 MODULE_PARM_DESC(zfs_free_bpobj_enabled, "Enable processing of the free_bpobj");
+
+module_param(zfs_scan_mem_lim_fact, int, 0644);
+MODULE_PARM_DESC(zfs_scan_mem_lim_fact, "Fraction of RAM for scan hard limit");
+
+module_param(zfs_scan_issue_strategy, int, 0644);
+MODULE_PARM_DESC(zfs_scan_issue_strategy,
+	"IO issuing strategy during scrubbing. 0 = default, 1 = LBA, 2 = size");
+
+module_param(zfs_scan_legacy, int, 0644);
+MODULE_PARM_DESC(zfs_scan_legacy, "Scrub using legacy non-sequential method");
+
+module_param(zfs_scan_checkpoint_intval, int, 0644);
+MODULE_PARM_DESC(zfs_scan_checkpoint_intval,
+	"Scan progress on-disk checkpointing interval");
+
+module_param(zfs_scan_mem_lim_soft_fact, int, 0644);
+MODULE_PARM_DESC(zfs_scan_mem_lim_soft_fact,
+	"Fraction of hard limit used as soft limit");
+
+module_param(zfs_scan_strict_mem_lim, int, 0644);
+MODULE_PARM_DESC(zfs_scan_strict_mem_lim,
+	"Tunable to attempt to reduce lock contention");
+
+module_param(zfs_scan_fill_weight, int, 0644);
+MODULE_PARM_DESC(zfs_scan_fill_weight,
+	"Tunable to adjust bias towards more filled segments during scans");
 #endif
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 5dc9ed60d..6320fd388 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -972,85 +972,6 @@ metaslab_rangesize_compare(const void *x1, const void *x2)
 }
 
 /*
- * Create any block allocator specific components. The current allocators
- * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
- */
-static void
-metaslab_rt_create(range_tree_t *rt, void *arg)
-{
-	metaslab_t *msp = arg;
-
-	ASSERT3P(rt->rt_arg, ==, msp);
-	ASSERT(msp->ms_tree == NULL);
-
-	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
-	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
-}
-
-/*
- * Destroy the block allocator specific components.
- */
-static void
-metaslab_rt_destroy(range_tree_t *rt, void *arg)
-{
-	metaslab_t *msp = arg;
-
-	ASSERT3P(rt->rt_arg, ==, msp);
-	ASSERT3P(msp->ms_tree, ==, rt);
-	ASSERT0(avl_numnodes(&msp->ms_size_tree));
-
-	avl_destroy(&msp->ms_size_tree);
-}
-
-static void
-metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
-{
-	metaslab_t *msp = arg;
-
-	ASSERT3P(rt->rt_arg, ==, msp);
-	ASSERT3P(msp->ms_tree, ==, rt);
-	VERIFY(!msp->ms_condensing);
-	avl_add(&msp->ms_size_tree, rs);
-}
-
-static void
-metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
-{
-	metaslab_t *msp = arg;
-
-	ASSERT3P(rt->rt_arg, ==, msp);
-	ASSERT3P(msp->ms_tree, ==, rt);
-	VERIFY(!msp->ms_condensing);
-	avl_remove(&msp->ms_size_tree, rs);
-}
-
-static void
-metaslab_rt_vacate(range_tree_t *rt, void *arg)
-{
-	metaslab_t *msp = arg;
-
-	ASSERT3P(rt->rt_arg, ==, msp);
-	ASSERT3P(msp->ms_tree, ==, rt);
-
-	/*
-	 * Normally one would walk the tree freeing nodes along the way.
-	 * Since the nodes are shared with the range trees we can avoid
-	 * walking all nodes and just reinitialize the avl tree. The nodes
-	 * will be freed by the range tree, so we don't want to free them here.
-	 */
-	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
-	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
-}
-
-static range_tree_ops_t metaslab_rt_ops = {
-	metaslab_rt_create,
-	metaslab_rt_destroy,
-	metaslab_rt_add,
-	metaslab_rt_remove,
-	metaslab_rt_vacate
-};
-
-/*
  * ==========================================================================
  * Common allocator routines
  * ==========================================================================
@@ -1425,7 +1346,8 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
 	 * addition of new space; and for debugging, it ensures that we'd
 	 * data fault on any attempt to use this metaslab before it's ready.
 	 */
-	ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms, &ms->ms_lock);
+	ms->ms_tree = range_tree_create_impl(&rt_avl_ops, &ms->ms_size_tree,
+	    metaslab_rangesize_compare, &ms->ms_lock, 0);
 	metaslab_group_add(mg, ms);
 
 	metaslab_set_fragmentation(ms);
diff --git a/module/zfs/range_tree.c b/module/zfs/range_tree.c
index ebef7f447..01ef463ec 100644
--- a/module/zfs/range_tree.c
+++ b/module/zfs/range_tree.c
@@ -33,8 +33,58 @@
 #include <sys/zio.h>
 #include <sys/range_tree.h>
 
+/*
+ * Range trees are tree-based data structures that can be used to
+ * track free space or generally any space allocation information.
+ * A range tree keeps track of individual segments and automatically
+ * provides facilities such as adjacent extent merging and extent
+ * splitting in response to range add/remove requests.
+ *
+ * A range tree starts out completely empty, with no segments in it.
+ * Adding an allocation via range_tree_add to the range tree can either:
+ * 1) create a new extent
+ * 2) extend an adjacent extent
+ * 3) merge two adjacent extents
+ * Conversely, removing an allocation via range_tree_remove can:
+ * 1) completely remove an extent
+ * 2) shorten an extent (if the allocation was near one of its ends)
+ * 3) split an extent into two extents, in effect punching a hole
+ *
+ * A range tree is also capable of 'bridging' gaps when adding
+ * allocations. This is useful for cases when close proximity of
+ * allocations is an important detail that needs to be represented
+ * in the range tree. See range_tree_set_gap(). The default behavior
+ * is not to bridge gaps (i.e. the maximum allowed gap size is 0).
+ *
+ * In order to traverse a range tree, use either the range_tree_walk()
+ * or range_tree_vacate() functions.
+ *
+ * To obtain more accurate information on individual segment
+ * operations that the range tree performs "under the hood", you can
+ * specify a set of callbacks by passing a range_tree_ops_t structure
+ * to the range_tree_create function. Any callbacks that are non-NULL
+ * are then called at the appropriate times.
+ *
+ * The range tree code also supports a special variant of range trees
+ * that can bridge small gaps between segments. This kind of tree is used
+ * by the dsl scanning code to group I/Os into mostly sequential chunks to
+ * optimize disk performance. The code here attempts to do this with as
+ * little memory and computational overhead as possible. One limitation of
+ * this implementation is that a range tree with gap support can only have
+ * complete segments removed from it (see range_tree_remove_fill()).
+ */
+
 kmem_cache_t *range_seg_cache;
 
+/* Generic ops keeping a caller-supplied AVL tree in sync with a range tree */
+struct range_tree_ops rt_avl_ops = {
+	.rtop_create = rt_avl_create,
+	.rtop_destroy = rt_avl_destroy,
+	.rtop_add = rt_avl_add,
+	.rtop_remove = rt_avl_remove,
+	.rtop_vacate = rt_avl_vacate,
+};
+
 void
 range_tree_init(void)
 {
@@ -75,6 +125,18 @@ range_tree_stat_verify(range_tree_t *rt)
 	}
 }
 
+/*
+ * Changes out the lock used by the range tree, e.g. when moving the tree
+ * between containing structures without recreating it. The caller must hold
+ * both the old and the new lock; neither is acquired or released here.
+ */
+void
+range_tree_set_lock(range_tree_t *rt, kmutex_t *lp)
+{
+	ASSERT(MUTEX_HELD(rt->rt_lock) && MUTEX_HELD(lp));
+	rt->rt_lock = lp;
+}
+
 static void
 range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs)
 {
@@ -121,31 +183,38 @@ range_tree_seg_compare(const void *x1, const void *x2)
 }
 
 range_tree_t *
-range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp)
+range_tree_create_impl(range_tree_ops_t *ops, void *arg,
+    int (*avl_compare) (const void *, const void *), kmutex_t *lp, uint64_t gap)
 {
-	range_tree_t *rt;
-
-	rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);
+	range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);
 
 	avl_create(&rt->rt_root, range_tree_seg_compare,
 	    sizeof (range_seg_t), offsetof(range_seg_t, rs_node));
 
 	rt->rt_lock = lp;
 	rt->rt_ops = ops;
+	rt->rt_gap = gap;
 	rt->rt_arg = arg;
+	rt->rt_avl_compare = avl_compare;
 
-	if (rt->rt_ops != NULL)
+	if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL)
 		rt->rt_ops->rtop_create(rt, rt->rt_arg);
 
 	return (rt);
 }
 
+range_tree_t *
+range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp)
+{	/* Plain tree: NULL avl_compare and gap 0, i.e. no gap bridging. */
+	return (range_tree_create_impl(ops, arg, NULL, lp, 0));
+}
+
 void
 range_tree_destroy(range_tree_t *rt)
 {
 	VERIFY0(rt->rt_space);
 
-	if (rt->rt_ops != NULL)
+	if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL)
 		rt->rt_ops->rtop_destroy(rt, rt->rt_arg);
 
 	avl_destroy(&rt->rt_root);
@@ -153,40 +222,102 @@ range_tree_destroy(range_tree_t *rt)
 }
 
 void
-range_tree_add(void *arg, uint64_t start, uint64_t size)
+range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta)
+{	/* Adds delta (may be negative) to rs_fill; caller holds rt_lock. */
+	ASSERT(MUTEX_HELD(rt->rt_lock));
+	/* The adjusted fill must stay within (0, rs_end - rs_start]. */
+	ASSERT3U(rs->rs_fill + delta, !=, 0);
+	ASSERT3U(rs->rs_fill + delta, <=, rs->rs_end - rs->rs_start);
+	/* Cycle rs through the remove/add ops around the rs_fill change. */
+	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
+		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
+	rs->rs_fill += delta;
+	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
+		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
+}
+
+static void
+range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill)
 {
 	range_tree_t *rt = arg;
 	avl_index_t where;
 	range_seg_t rsearch, *rs_before, *rs_after, *rs;
-	uint64_t end = start + size;
+	uint64_t end = start + size, gap = rt->rt_gap;
+	uint64_t bridge_size = 0;
 	boolean_t merge_before, merge_after;
 
 	ASSERT(MUTEX_HELD(rt->rt_lock));
-	VERIFY(size != 0);
+	ASSERT3U(size, !=, 0);
+	ASSERT3U(fill, <=, size);
 
 	rsearch.rs_start = start;
 	rsearch.rs_end = end;
 	rs = avl_find(&rt->rt_root, &rsearch, &where);
 
-	if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end) {
+	if (gap == 0 && rs != NULL &&
+	    rs->rs_start <= start && rs->rs_end >= end) {
 		zfs_panic_recover("zfs: allocating allocated segment"
-		    "(offset=%llu size=%llu)\n",
-		    (longlong_t)start, (longlong_t)size);
+		    "(offset=%llu size=%llu) of (offset=%llu size=%llu)\n",
+		    (longlong_t)start, (longlong_t)size,
+		    (longlong_t)rs->rs_start,
+		    (longlong_t)rs->rs_end - rs->rs_start);
+		return;
+	}
+
+	/*
+	 * If this is a gap-supporting range tree, it is possible that we
+	 * are inserting into an existing segment. In this case simply
+	 * bump the fill count and call the remove / add callbacks. If the
+	 * new range will extend an existing segment, we remove the
+	 * existing one, apply the new extent to it and re-insert it using
+	 * the normal code paths.
+	 */
+	if (rs != NULL) {
+		ASSERT3U(gap, !=, 0);
+		if (rs->rs_start <= start && rs->rs_end >= end) {
+			range_tree_adjust_fill(rt, rs, fill);
+			return;
+		}
+
+		avl_remove(&rt->rt_root, rs);
+		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
+			rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
+
+		range_tree_stat_decr(rt, rs);
+		rt->rt_space -= rs->rs_end - rs->rs_start;
+
+		fill += rs->rs_fill;
+		start = MIN(start, rs->rs_start);
+		end = MAX(end, rs->rs_end);
+		size = end - start;
+
+		range_tree_add_impl(rt, start, size, fill);
+
+		kmem_cache_free(range_seg_cache, rs);
 		return;
 	}
 
-	/* Make sure we don't overlap with either of our neighbors */
-	VERIFY(rs == NULL);
+	ASSERT3P(rs, ==, NULL);
 
+	/*
+	 * Determine whether or not we will have to merge with our neighbors.
+	 * If gap != 0, we might need to merge with our neighbors even if we
+	 * aren't directly touching.
+	 */
 	rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE);
 	rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER);
 
-	merge_before = (rs_before != NULL && rs_before->rs_end == start);
-	merge_after = (rs_after != NULL && rs_after->rs_start == end);
+	merge_before = (rs_before != NULL && rs_before->rs_end >= start - gap);
+	merge_after = (rs_after != NULL && rs_after->rs_start <= end + gap);
+
+	if (merge_before && gap != 0)
+		bridge_size += start - rs_before->rs_end;
+	if (merge_after && gap != 0)
+		bridge_size += rs_after->rs_start - end;
 
 	if (merge_before && merge_after) {
 		avl_remove(&rt->rt_root, rs_before);
-		if (rt->rt_ops != NULL) {
+		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) {
 			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
 			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
 		}
@@ -194,43 +325,59 @@ range_tree_add(void *arg, uint64_t start, uint64_t size)
 		range_tree_stat_decr(rt, rs_before);
 		range_tree_stat_decr(rt, rs_after);
 
+		rs_after->rs_fill += rs_before->rs_fill + fill;
 		rs_after->rs_start = rs_before->rs_start;
 		kmem_cache_free(range_seg_cache, rs_before);
 		rs = rs_after;
 	} else if (merge_before) {
-		if (rt->rt_ops != NULL)
+		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
 			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
 
 		range_tree_stat_decr(rt, rs_before);
 
+		rs_before->rs_fill += fill;
 		rs_before->rs_end = end;
 		rs = rs_before;
 	} else if (merge_after) {
-		if (rt->rt_ops != NULL)
+		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
 			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
 
 		range_tree_stat_decr(rt, rs_after);
 
+		rs_after->rs_fill += fill;
 		rs_after->rs_start = start;
 		rs = rs_after;
 	} else {
 		rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
+
+		rs->rs_fill = fill;
 		rs->rs_start = start;
 		rs->rs_end = end;
 		avl_insert(&rt->rt_root, rs, where);
 	}
 
-	if (rt->rt_ops != NULL)
+	if (gap != 0)
+		ASSERT3U(rs->rs_fill, <=, rs->rs_end - rs->rs_start);
+	else
+		ASSERT3U(rs->rs_fill, ==, rs->rs_end - rs->rs_start);
+
+	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
 		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
 
 	range_tree_stat_incr(rt, rs);
-	rt->rt_space += size;
+	rt->rt_space += size + bridge_size;
 }
 
 void
-range_tree_remove(void *arg, uint64_t start, uint64_t size)
+range_tree_add(void *arg, uint64_t start, uint64_t size)
+{	/* A plain add is completely filled, so fill is passed as size. */
+	range_tree_add_impl(arg, start, size, size);
+}
+
+static void
+range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size,
+    boolean_t do_fill)
 {
-	range_tree_t *rt = arg;
 	avl_index_t where;
 	range_seg_t rsearch, *rs, *newseg;
 	uint64_t end = start + size;
@@ -251,6 +398,34 @@ range_tree_remove(void *arg, uint64_t start, uint64_t size)
 		    (longlong_t)start, (longlong_t)size);
 		return;
 	}
+
+	/*
+	 * Range trees with gap support must only remove complete segments
+	 * from the tree. This allows us to maintain accurate fill accounting
+	 * and to ensure that bridged sections are not leaked. If we need to
+	 * remove less than the full segment, we can only adjust the fill count.
+	 */
+	if (rt->rt_gap != 0) {
+		if (do_fill) {
+			if (rs->rs_fill == size) {
+				start = rs->rs_start;
+				end = rs->rs_end;
+				size = end - start;
+			} else {
+				range_tree_adjust_fill(rt, rs, -size);
+				return;
+			}
+		} else if (rs->rs_start != start || rs->rs_end != end) {
+			zfs_panic_recover("zfs: freeing partial segment of "
+			    "gap tree (offset=%llu size=%llu) of "
+			    "(offset=%llu size=%llu)",
+			    (longlong_t)start, (longlong_t)size,
+			    (longlong_t)rs->rs_start,
+			    (longlong_t)rs->rs_end - rs->rs_start);
+			return;
+		}
+	}
+
 	VERIFY3U(rs->rs_start, <=, start);
 	VERIFY3U(rs->rs_end, >=, end);
 
@@ -259,19 +434,20 @@ range_tree_remove(void *arg, uint64_t start, uint64_t size)
 
 	range_tree_stat_decr(rt, rs);
 
-	if (rt->rt_ops != NULL)
+	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
 		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
 
 	if (left_over && right_over) {
 		newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
 		newseg->rs_start = end;
 		newseg->rs_end = rs->rs_end;
+		newseg->rs_fill = newseg->rs_end - newseg->rs_start;
 		range_tree_stat_incr(rt, newseg);
 
 		rs->rs_end = start;
 
 		avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER);
-		if (rt->rt_ops != NULL)
+		if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
 			rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg);
 	} else if (left_over) {
 		rs->rs_end = start;
@@ -284,15 +460,55 @@ range_tree_remove(void *arg, uint64_t start, uint64_t size)
 	}
 
 	if (rs != NULL) {
+		/*
+		 * The fill of the leftover segment will always be equal to
+		 * the size, since we do not support removing partial segments
+		 * of range trees with gaps.
+		 */
+		rs->rs_fill = rs->rs_end - rs->rs_start;
 		range_tree_stat_incr(rt, rs);
 
-		if (rt->rt_ops != NULL)
+		if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
 			rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
 	}
 
 	rt->rt_space -= size;
 }
 
+void
+range_tree_remove(void *arg, uint64_t start, uint64_t size)
+{	/* do_fill is B_FALSE: a gap tree rejects partial-segment removal. */
+	range_tree_remove_impl(arg, start, size, B_FALSE);
+}
+
+void
+range_tree_remove_fill(range_tree_t *rt, uint64_t start, uint64_t size)
+{	/* do_fill is B_TRUE: a gap tree treats size as fill to take away. */
+	range_tree_remove_impl(rt, start, size, B_TRUE);
+}
+
+void
+range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs,
+    uint64_t newstart, uint64_t newsize)
+{	/* Moves rs to [newstart, newstart + newsize). */
+	int64_t delta = newsize - (rs->rs_end - rs->rs_start);
+
+	ASSERT(MUTEX_HELD(rt->rt_lock));
+	/* Drop rs from the stats and ops while its bounds are changed. */
+	range_tree_stat_decr(rt, rs);
+	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
+		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
+	/* NOTE(review): rs_fill is not adjusted here -- confirm callers. */
+	rs->rs_start = newstart;
+	rs->rs_end = newstart + newsize;
+
+	range_tree_stat_incr(rt, rs);
+	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
+		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
+	/* Apply the size difference to the tree's total space. */
+	rt->rt_space += delta;
+}
+
 static range_seg_t *
 range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
 {
@@ -308,7 +524,7 @@ range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
 	return (avl_find(&rt->rt_root, &rsearch, &where));
 }
 
-static range_seg_t *
+range_seg_t *
 range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size)
 {
 	range_seg_t *rs = range_tree_find_impl(rt, start, size);
@@ -373,7 +589,7 @@ range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg)
 
 	ASSERT(MUTEX_HELD(rt->rt_lock));
 
-	if (rt->rt_ops != NULL)
+	if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL)
 		rt->rt_ops->rtop_vacate(rt, rt->rt_arg);
 
 	while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) {
@@ -397,8 +613,60 @@ range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg)
 		func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
 }
 
+range_seg_t *
+range_tree_first(range_tree_t *rt)
+{	/* First segment in rt_root's sort order, as given by avl_first(). */
+	ASSERT(MUTEX_HELD(rt->rt_lock));
+	return (avl_first(&rt->rt_root));
+}
+
 uint64_t
 range_tree_space(range_tree_t *rt)
 {
 	return (rt->rt_space);
 }
+
+/* Generic range tree functions for maintaining segments in an AVL tree. */
+void
+rt_avl_create(range_tree_t *rt, void *arg)
+{
+	avl_tree_t *tree = arg;
+	/* Sorted by rt_avl_compare; segments link in through rs_pp_node. */
+	avl_create(tree, rt->rt_avl_compare, sizeof (range_seg_t),
+	    offsetof(range_seg_t, rs_pp_node));
+}
+
+void
+rt_avl_destroy(range_tree_t *rt, void *arg)
+{
+	avl_tree_t *tree = arg;
+	/* Every segment must already have been removed from the tree. */
+	ASSERT0(avl_numnodes(tree));
+	avl_destroy(tree);
+}
+
+void
+rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg)
+{	/* rtop_add hook: link rs into the AVL tree passed as arg. */
+	avl_tree_t *tree = arg;
+	avl_add(tree, rs);
+}
+
+void
+rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
+{	/* rtop_remove hook: unlink rs from the AVL tree passed as arg. */
+	avl_tree_t *tree = arg;
+	avl_remove(tree, rs);
+}
+
+void
+rt_avl_vacate(range_tree_t *rt, void *arg)
+{
+	/*
+	 * Normally one would walk the tree freeing nodes along the way.
+	 * Since the nodes are shared with the range tree, we can skip the
+	 * walk and simply reinitialize the AVL tree via rt_avl_create().
+	 * The range tree frees the nodes, so they must not be freed here.
+	 */
+	rt_avl_create(rt, arg);
+}
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 0604742ab..e06190f9d 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -1996,7 +1996,7 @@ spa_load_verify_done(zio_t *zio)
 	}
 
 	mutex_enter(&spa->spa_scrub_lock);
-	spa->spa_scrub_inflight--;
+	spa->spa_load_verify_ios--;
 	cv_broadcast(&spa->spa_scrub_io_cv);
 	mutex_exit(&spa->spa_scrub_lock);
 }
@@ -2030,9 +2030,9 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 	size_t size = BP_GET_PSIZE(bp);
 
 	mutex_enter(&spa->spa_scrub_lock);
-	while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight)
+	while (spa->spa_load_verify_ios >= spa_load_verify_maxinflight)
 		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
-	spa->spa_scrub_inflight++;
+	spa->spa_load_verify_ios++;
 	mutex_exit(&spa->spa_scrub_lock);
 
 	zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size,
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 9a3290e95..116b0ebd9 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -1892,6 +1892,7 @@ spa_init(int mode)
 	zpool_feature_init();
 	spa_config_load();
 	l2arc_start();
+	scan_init();
 	qat_init();
 }
 
@@ -1915,6 +1916,7 @@ spa_fini(void)
 	unique_fini();
 	refcount_fini();
 	fm_fini();
+	scan_fini();
 	qat_fini();
 
 	avl_destroy(&spa_namespace_avl);
@@ -2016,6 +2018,7 @@ spa_scan_stat_init(spa_t *spa)
 		spa->spa_scan_pass_scrub_pause = 0;
 	spa->spa_scan_pass_scrub_spent_paused = 0;
 	spa->spa_scan_pass_exam = 0;
+	spa->spa_scan_pass_issued = 0;
 	vdev_scan_stat_init(spa->spa_root_vdev);
 }
 
@@ -2033,18 +2036,21 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
 
 	/* data stored on disk */
 	ps->pss_func = scn->scn_phys.scn_func;
+	ps->pss_state = scn->scn_phys.scn_state;
 	ps->pss_start_time = scn->scn_phys.scn_start_time;
 	ps->pss_end_time = scn->scn_phys.scn_end_time;
 	ps->pss_to_examine = scn->scn_phys.scn_to_examine;
-	ps->pss_examined = scn->scn_phys.scn_examined;
 	ps->pss_to_process = scn->scn_phys.scn_to_process;
 	ps->pss_processed = scn->scn_phys.scn_processed;
 	ps->pss_errors = scn->scn_phys.scn_errors;
-	ps->pss_state = scn->scn_phys.scn_state;
+	ps->pss_examined = scn->scn_phys.scn_examined;
+	ps->pss_issued =
+	    scn->scn_issued_before_pass + spa->spa_scan_pass_issued;
 
 	/* data not stored on disk */
 	ps->pss_pass_start = spa->spa_scan_pass_start;
 	ps->pss_pass_exam = spa->spa_scan_pass_exam;
+	ps->pss_pass_issued = spa->spa_scan_pass_issued;
 	ps->pss_pass_scrub_pause = spa->spa_scan_pass_scrub_pause;
 	ps->pss_pass_scrub_spent_paused = spa->spa_scan_pass_scrub_spent_paused;
 
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 2df0040af..9edeaf525 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -360,6 +360,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
 	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	for (int t = 0; t < DTL_TYPES; t++) {
 		vd->vdev_dtl[t] = range_tree_create(NULL, NULL,
@@ -648,6 +649,18 @@ vdev_free(vdev_t *vd)
 	spa_t *spa = vd->vdev_spa;
 
 	/*
+	 * Scan queues are normally destroyed at the end of a scan. If the
+	 * queue exists here, that implies the vdev is being removed while
+	 * the scan is still running.
+	 */
+	if (vd->vdev_scan_io_queue != NULL) {
+		mutex_enter(&vd->vdev_scan_io_queue_lock);
+		dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue);
+		vd->vdev_scan_io_queue = NULL;
+		mutex_exit(&vd->vdev_scan_io_queue_lock);
+	}
+
+	/*
 	 * vdev_free() implies closing the vdev first.  This is simpler than
 	 * trying to ensure complicated semantics for all callers.
 	 */
@@ -723,6 +736,7 @@ vdev_free(vdev_t *vd)
 	mutex_destroy(&vd->vdev_dtl_lock);
 	mutex_destroy(&vd->vdev_stat_lock);
 	mutex_destroy(&vd->vdev_probe_lock);
+	mutex_destroy(&vd->vdev_scan_io_queue_lock);
 
 	zfs_ratelimit_fini(&vd->vdev_delay_rl);
 	zfs_ratelimit_fini(&vd->vdev_checksum_rl);
@@ -800,6 +814,8 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
 
 	tvd->vdev_islog = svd->vdev_islog;
 	svd->vdev_islog = 0;
+
+	dsl_scan_io_queue_vdev_xfer(svd, tvd);
 }
 
 static void
diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c
index 36a4bf629..792642952 100644
--- a/module/zfs/vdev_queue.c
+++ b/module/zfs/vdev_queue.c
@@ -169,7 +169,7 @@ int zfs_vdev_async_write_active_max_dirty_percent = 60;
  * we include spans of optional I/Os to aid aggregation at the disk even when
  * they aren't able to help us aggregate at this level.
  */
-int zfs_vdev_aggregation_limit = SPA_OLD_MAXBLOCKSIZE;
+int zfs_vdev_aggregation_limit = 1 << 20;
 int zfs_vdev_read_gap_limit = 32 << 10;
 int zfs_vdev_write_gap_limit = 4 << 10;
 
diff --git a/module/zfs/zap.c b/module/zfs/zap.c
index 6d1b860cc..2f6aed667 100644
--- a/module/zfs/zap.c
+++ b/module/zfs/zap.c
@@ -1070,7 +1070,7 @@ zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj,
 		}
 		err = zap_add(os, intoobj, za.za_name,
 		    8, 1, &value, tx);
-		if (err)
+		if (err != 0)
 			break;
 	}
 	zap_cursor_fini(&zc);
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 4cfda7a9e..311f79e23 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -39,6 +39,7 @@
 #include <sys/ddt.h>
 #include <sys/blkptr.h>
 #include <sys/zfeature.h>
+#include <sys/dsl_scan.h>
 #include <sys/metaslab_impl.h>
 #include <sys/time.h>
 #include <sys/trace_zio.h>
@@ -1050,6 +1051,7 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 
 	metaslab_check_free(spa, bp);
 	arc_freed(spa, bp);
+	dsl_scan_freed(spa, bp);
 
 	/*
 	 * GANG and DEDUP blocks can induce a read (for the gang block header,
@@ -3333,26 +3335,6 @@ zio_vdev_io_start(zio_t *zio)
 
 	ASSERT3P(zio->io_logical, !=, zio);
 
-	/*
-	 * We keep track of time-sensitive I/Os so that the scan thread
-	 * can quickly react to certain workloads.  In particular, we care
-	 * about non-scrubbing, top-level reads and writes with the following
-	 * characteristics:
-	 *	- synchronous writes of user data to non-slog devices
-	 *	- any reads of user data
-	 * When these conditions are met, adjust the timestamp of spa_last_io
-	 * which allows the scan thread to adjust its workload accordingly.
-	 */
-	if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
-	    vd == vd->vdev_top && !vd->vdev_islog &&
-	    zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
-	    zio->io_txg != spa_syncing_txg(spa)) {
-		uint64_t old = spa->spa_last_io;
-		uint64_t new = ddi_get_lbolt64();
-		if (old != new)
-			(void) atomic_cas_64(&spa->spa_last_io, old, new);
-	}
-
 	align = 1ULL << vd->vdev_top->vdev_ashift;
 
 	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
author	Tom Caputi <[email protected]>	2017-11-15 20:27:01 -0500
committer	Brian Behlendorf <[email protected]>	2017-11-15 17:27:01 -0800
commit	d4a72f23863382bdf6d0ae33196f5b5decbc48fd (patch)
tree	1084ea930b9a1ef46e58d1757943ab3ad66c22c4 /module
parent	e301113c17673a290098850830cf2e6d1a1fcbe3 (diff)