author     Tom Caputi <[email protected]>          2017-11-15 20:27:01 -0500
committer  Brian Behlendorf <[email protected]>  2017-11-15 17:27:01 -0800
commit     d4a72f23863382bdf6d0ae33196f5b5decbc48fd (patch)
tree       1084ea930b9a1ef46e58d1757943ab3ad66c22c4 /module/zfs/dbuf.c
parent     e301113c17673a290098850830cf2e6d1a1fcbe3 (diff)
Sequential scrub and resilvers
Currently, scrubs and resilvers can take an extremely long time to
complete. This is largely due to the fact that zfs scans process pools
in logical order, as determined by each block's bookmark. This makes
sense from a simplicity perspective, but blocks in zfs are often
scattered randomly across disks, particularly due to zfs's
copy-on-write mechanisms.

This patch improves performance by splitting scrubs and resilvers into
a metadata scanning phase and an IO issuing phase. The metadata scan
reads through the structure of the pool and gathers an in-memory queue
of I/Os, sorted by size and offset on disk. The issuing phase will then
issue the scrub I/Os as sequentially as possible, greatly improving
performance.

This patch also updates and cleans up some of the scan code which has
not been updated in several years.

Reviewed-by: Brian Behlendorf <[email protected]>
Authored-by: Saso Kiselkov <[email protected]>
Authored-by: Alek Pinchuk <[email protected]>
Authored-by: Tom Caputi <[email protected]>
Signed-off-by: Tom Caputi <[email protected]>
Closes #3625
Closes #6256
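
The two-phase approach described above can be illustrated with a small,
hypothetical sketch: it is not the dsl_scan.c implementation, and the
scan_io_t fields, helper names, and qsort-based ordering here are
simplified stand-ins. Phase 1 queues block records while walking
metadata; phase 2 sorts the queue by on-disk offset and issues the
reads in that order.

/*
 * Hedged sketch of "scan metadata first, issue sorted I/O later".
 * Not ZFS code; all names and types are illustrative only.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

typedef struct scan_io {
	uint64_t sio_offset;	/* block's offset on the vdev */
	uint64_t sio_size;	/* block's allocated size */
} scan_io_t;

static int
sio_compare(const void *a, const void *b)
{
	const scan_io_t *l = a, *r = b;

	if (l->sio_offset < r->sio_offset)
		return (-1);
	return (l->sio_offset > r->sio_offset);
}

/* Phase 1: the metadata scan queues I/Os instead of issuing them. */
static void
scan_enqueue(scan_io_t *queue, size_t *count, uint64_t offset, uint64_t size)
{
	queue[*count].sio_offset = offset;
	queue[*count].sio_size = size;
	(*count)++;
}

/* Phase 2: issue the queued I/Os in (mostly) sequential disk order. */
static void
scan_issue(scan_io_t *queue, size_t count)
{
	qsort(queue, count, sizeof (scan_io_t), sio_compare);
	for (size_t i = 0; i < count; i++) {
		printf("scrub read: offset=%llu size=%llu\n",
		    (unsigned long long)queue[i].sio_offset,
		    (unsigned long long)queue[i].sio_size);
	}
}

int
main(void)
{
	scan_io_t queue[4];
	size_t count = 0;

	/* Blocks discovered in logical (bookmark) order... */
	scan_enqueue(queue, &count, 1 << 20, 131072);
	scan_enqueue(queue, &count, 4096, 131072);
	scan_enqueue(queue, &count, 1 << 18, 16384);
	scan_enqueue(queue, &count, 8192, 4096);

	/* ...are issued in physical (offset) order. */
	scan_issue(queue, count);
	return (0);
}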
Diffstat (limited to 'module/zfs/dbuf.c')
-rw-r--r--  module/zfs/dbuf.c  22
1 file changed, 16 insertions(+), 6 deletions(-)
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 64c1a68af..190d0656a 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -973,7 +973,8 @@ dbuf_whichblock(const dnode_t *dn, const int64_t level, const uint64_t offset)
}
static void
-dbuf_read_done(zio_t *zio, int err, arc_buf_t *buf, void *vdb)
+dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
+ arc_buf_t *buf, void *vdb)
{
dmu_buf_impl_t *db = vdb;
@@ -987,19 +988,22 @@ dbuf_read_done(zio_t *zio, int err, arc_buf_t *buf, void *vdb)
ASSERT(db->db.db_data == NULL);
if (db->db_level == 0 && db->db_freed_in_flight) {
/* we were freed in flight; disregard any error */
+ if (buf == NULL) {
+ buf = arc_alloc_buf(db->db_objset->os_spa,
+ db, DBUF_GET_BUFC_TYPE(db), db->db.db_size);
+ }
arc_release(buf, db);
bzero(buf->b_data, db->db.db_size);
arc_buf_freeze(buf);
db->db_freed_in_flight = FALSE;
dbuf_set_data(db, buf);
db->db_state = DB_CACHED;
- } else if (err == 0) {
+ } else if (buf != NULL) {
dbuf_set_data(db, buf);
db->db_state = DB_CACHED;
} else {
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT3P(db->db_buf, ==, NULL);
- arc_buf_destroy(buf, db);
db->db_state = DB_UNCACHED;
}
cv_broadcast(&db->db_changed);
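
The hunk above also shows the callback convention this patch adopts for
dbuf_read_done(): the explicit err argument is gone, the bookmark and
block pointer are passed in instead, and a NULL arc_buf_t now signals
that the read failed. The following is a standalone model of that
convention only; the types are dummy stand-ins, not the real ZFS
headers, and read_done() is a hypothetical name.

/*
 * Minimal sketch of "NULL buffer means the read failed".  Illustrative
 * only; arc_buf_t here is a simplified stand-in.
 */
#include <stdio.h>
#include <stdlib.h>

typedef struct arc_buf { void *b_data; size_t b_size; } arc_buf_t;

/* Callback shape mirroring the new done-callback arguments, with dummy types. */
typedef void (read_done_func_t)(void *zio, const void *zb, const void *bp,
    arc_buf_t *buf, void *arg);

static read_done_func_t read_done;

static void
read_done(void *zio, const void *zb, const void *bp, arc_buf_t *buf, void *arg)
{
	const char *what = arg;

	if (buf != NULL) {
		/* Success: a buffer was delivered; cache it. */
		printf("%s: cached %zu bytes\n", what, buf->b_size);
	} else {
		/* Failure: no buffer was allocated, nothing to destroy. */
		printf("%s: read failed, leaving uncached\n", what);
	}
}

int
main(void)
{
	arc_buf_t buf = { .b_data = calloc(1, 4096), .b_size = 4096 };

	read_done(NULL, NULL, NULL, &buf, "db[0]");	/* successful read */
	read_done(NULL, NULL, NULL, NULL, "db[1]");	/* failed read */
	free(buf.b_data);
	return (0);
}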
@@ -2512,7 +2516,8 @@ dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
* prefetch if the next block down is our target.
*/
static void
-dbuf_prefetch_indirect_done(zio_t *zio, int err, arc_buf_t *abuf, void *private)
+dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
+ const blkptr_t *iobp, arc_buf_t *abuf, void *private)
{
dbuf_prefetch_arg_t *dpa = private;
@@ -2551,13 +2556,18 @@ dbuf_prefetch_indirect_done(zio_t *zio, int err, arc_buf_t *abuf, void *private)
dbuf_rele(db, FTAG);
}
- dpa->dpa_curlevel--;
+ if (abuf == NULL) {
+ kmem_free(dpa, sizeof (*dpa));
+ return;
+ }
+ dpa->dpa_curlevel--;
uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
(dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
- if (BP_IS_HOLE(bp) || err != 0) {
+
+ if (BP_IS_HOLE(bp)) {
kmem_free(dpa, sizeof (*dpa));
} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);