diff options
author | Paul Dagnelie <[email protected]> | 2015-12-22 02:31:57 +0100 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2016-01-08 15:08:19 -0800 |
commit | fcff0f35bd522076bdda7491c88a91cc0aa531a3 (patch) | |
tree | 63e2e9db6fce37f64559cdaaf7247d2f51e85d2d /module/zfs | |
parent | 00af2ff6f219b4f73aebaaf9496cf5ea4b6728a3 (diff) |
Illumos 5960, 5925
5960 zfs recv should prefetch indirect blocks
5925 zfs receive -o origin=
Reviewed by: Prakash Surya <[email protected]>
Reviewed by: Matthew Ahrens <[email protected]>
References:
https://www.illumos.org/issues/5960
https://www.illumos.org/issues/5925
https://github.com/illumos/illumos-gate/commit/a2cdcdd
Porting notes:
- [lib/libzfs/libzfs_sendrecv.c]
- b8864a2 Fix gcc cast warnings
- 325f023 Add linux kernel device support
- 5c3f61e Increase Linux pipe buffer size on 'zfs receive'
- [module/zfs/zfs_vnops.c]
- 3558fd7 Prototype/structure update for Linux
- c12e3a5 Restructure zfs_readdir() to fix regressions
- [module/zfs/zvol.c]
- Function @zvol_map_block() isn't needed in ZoL
- 9965059 Prefetch start and end of volumes
- [module/zfs/dmu.c]
- Fixed ISO C90 - mixed declarations and code
- Function dmu_prefetch() 'int i' is initialized before
the following code block (c90 vs. c99)
- [module/zfs/dbuf.c]
- fc5bb51 Fix stack dbuf_hold_impl()
- 9b67f60 Illumos 4757, 4913
- 34229a2 Reduce stack usage for recursive traverse_visitbp()
- [module/zfs/dmu_send.c]
- Fixed ISO C90 - mixed declarations and code
- b58986e Use large stacks when available
- 241b541 Illumos 5959 - clean up per-dataset feature count code
- 77aef6f Use vmem_alloc() for nvlists
- 00b4602 Add linux kernel memory support
Ported-by: kernelOfTruth [email protected]
Signed-off-by: Brian Behlendorf <[email protected]>
Diffstat (limited to 'module/zfs')
-rw-r--r-- | module/zfs/Makefile.in | 1 | ||||
-rw-r--r-- | module/zfs/bptree.c | 2 | ||||
-rw-r--r-- | module/zfs/bqueue.c | 111 | ||||
-rw-r--r-- | module/zfs/dbuf.c | 292 | ||||
-rw-r--r-- | module/zfs/dmu.c | 44 | ||||
-rw-r--r-- | module/zfs/dmu_diff.c | 2 | ||||
-rw-r--r-- | module/zfs/dmu_object.c | 5 | ||||
-rw-r--r-- | module/zfs/dmu_send.c | 811 | ||||
-rw-r--r-- | module/zfs/dmu_traverse.c | 28 | ||||
-rw-r--r-- | module/zfs/dmu_tx.c | 6 | ||||
-rw-r--r-- | module/zfs/dmu_zfetch.c | 3 | ||||
-rw-r--r-- | module/zfs/dnode.c | 18 | ||||
-rw-r--r-- | module/zfs/dnode_sync.c | 6 | ||||
-rw-r--r-- | module/zfs/dsl_dataset.c | 28 | ||||
-rw-r--r-- | module/zfs/dsl_destroy.c | 2 | ||||
-rw-r--r-- | module/zfs/dsl_scan.c | 3 | ||||
-rw-r--r-- | module/zfs/spa.c | 2 | ||||
-rw-r--r-- | module/zfs/space_map.c | 4 | ||||
-rw-r--r-- | module/zfs/zap.c | 13 | ||||
-rw-r--r-- | module/zfs/zfs_vfsops.c | 2 | ||||
-rw-r--r-- | module/zfs/zfs_vnops.c | 3 | ||||
-rw-r--r-- | module/zfs/zio.c | 141 | ||||
-rw-r--r-- | module/zfs/zvol.c | 5 |
23 files changed, 1201 insertions, 331 deletions
diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index 55f8cef16..d3a0206c9 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -14,6 +14,7 @@ $(MODULE)-objs += bpobj.o $(MODULE)-objs += dbuf.o $(MODULE)-objs += dbuf_stats.o $(MODULE)-objs += bptree.o +$(MODULE)-objs += bqueue.o $(MODULE)-objs += ddt.o $(MODULE)-objs += ddt_zap.o $(MODULE)-objs += dmu.o diff --git a/module/zfs/bptree.c b/module/zfs/bptree.c index 9f62d7b91..c0edc9d76 100644 --- a/module/zfs/bptree.c +++ b/module/zfs/bptree.c @@ -156,7 +156,7 @@ bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, int err; struct bptree_args *ba = arg; - if (BP_IS_HOLE(bp)) + if (bp == NULL || BP_IS_HOLE(bp)) return (0); err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx); diff --git a/module/zfs/bqueue.c b/module/zfs/bqueue.c new file mode 100644 index 000000000..1ddc697b5 --- /dev/null +++ b/module/zfs/bqueue.c @@ -0,0 +1,111 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014 by Delphix. All rights reserved. + */ + +#include <sys/bqueue.h> +#include <sys/zfs_context.h> + +static inline bqueue_node_t * +obj2node(bqueue_t *q, void *data) +{ + return ((bqueue_node_t *)((char *)data + q->bq_node_offset)); +} + +/* + * Initialize a blocking queue The maximum capacity of the queue is set to + * size. Types that want to be stored in a bqueue must contain a bqueue_node_t, + * and offset should give its offset from the start of the struct. Return 0 on + * success, or -1 on failure. + */ +int +bqueue_init(bqueue_t *q, uint64_t size, size_t node_offset) +{ + list_create(&q->bq_list, node_offset + sizeof (bqueue_node_t), + node_offset + offsetof(bqueue_node_t, bqn_node)); + cv_init(&q->bq_add_cv, NULL, CV_DEFAULT, NULL); + cv_init(&q->bq_pop_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&q->bq_lock, NULL, MUTEX_DEFAULT, NULL); + q->bq_node_offset = node_offset; + q->bq_size = 0; + q->bq_maxsize = size; + return (0); +} + +/* + * Destroy a blocking queue. This function asserts that there are no + * elements in the queue, and no one is blocked on the condition + * variables. + */ +void +bqueue_destroy(bqueue_t *q) +{ + ASSERT0(q->bq_size); + cv_destroy(&q->bq_add_cv); + cv_destroy(&q->bq_pop_cv); + mutex_destroy(&q->bq_lock); + list_destroy(&q->bq_list); +} + +/* + * Add data to q, consuming size units of capacity. If there is insufficient + * capacity to consume size units, block until capacity exists. Asserts size is + * > 0. + */ +void +bqueue_enqueue(bqueue_t *q, void *data, uint64_t item_size) +{ + ASSERT3U(item_size, >, 0); + ASSERT3U(item_size, <, q->bq_maxsize); + mutex_enter(&q->bq_lock); + obj2node(q, data)->bqn_size = item_size; + while (q->bq_size + item_size > q->bq_maxsize) { + cv_wait(&q->bq_add_cv, &q->bq_lock); + } + q->bq_size += item_size; + list_insert_tail(&q->bq_list, data); + cv_signal(&q->bq_pop_cv); + mutex_exit(&q->bq_lock); +} +/* + * Take the first element off of q. If there are no elements on the queue, wait + * until one is put there. Return the removed element. + */ +void * +bqueue_dequeue(bqueue_t *q) +{ + void *ret; + uint64_t item_size; + mutex_enter(&q->bq_lock); + while (q->bq_size == 0) { + cv_wait(&q->bq_pop_cv, &q->bq_lock); + } + ret = list_remove_head(&q->bq_list); + item_size = obj2node(q, ret)->bqn_size; + q->bq_size -= item_size; + mutex_exit(&q->bq_lock); + cv_signal(&q->bq_add_cv); + return (ret); +} + +/* + * Returns true if the space used is 0. + */ +boolean_t +bqueue_empty(bqueue_t *q) +{ + return (q->bq_size == 0); +} diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index fe0ffa2d8..e08dcc4a3 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -51,7 +51,8 @@ struct dbuf_hold_impl_data { dnode_t *dh_dn; uint8_t dh_level; uint64_t dh_blkid; - int dh_fail_sparse; + boolean_t dh_fail_sparse; + boolean_t dh_fail_uncached; void *dh_tag; dmu_buf_impl_t **dh_dbp; /* Local variables */ @@ -65,8 +66,9 @@ struct dbuf_hold_impl_data { }; static void __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh, - dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, - void *tag, dmu_buf_impl_t **dbp, int depth); + dnode_t *dn, uint8_t level, uint64_t blkid, boolean_t fail_sparse, + boolean_t fail_uncached, + void *tag, dmu_buf_impl_t **dbp, int depth); static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh); /* @@ -604,11 +606,35 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db) return (abuf); } +/* + * Calculate which level n block references the data at the level 0 offset + * provided. + */ uint64_t -dbuf_whichblock(dnode_t *dn, uint64_t offset) +dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset) { - if (dn->dn_datablkshift) { - return (offset >> dn->dn_datablkshift); + if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) { + /* + * The level n blkid is equal to the level 0 blkid divided by + * the number of level 0s in a level n block. + * + * The level 0 blkid is offset >> datablkshift = + * offset / 2^datablkshift. + * + * The number of level 0s in a level n is the number of block + * pointers in an indirect block, raised to the power of level. + * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level = + * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)). + * + * Thus, the level n blkid is: offset / + * ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT))) + * = offset / 2^(datablkshift + level * + * (indblkshift - SPA_BLKPTRSHIFT)) + * = offset >> (datablkshift + level * + * (indblkshift - SPA_BLKPTRSHIFT)) + */ + return (offset >> (dn->dn_datablkshift + level * + (dn->dn_indblkshift - SPA_BLKPTRSHIFT))); } else { ASSERT3U(offset, <, dn->dn_datablksz); return (0); @@ -1786,6 +1812,12 @@ dbuf_clear(dmu_buf_impl_t *db) dbuf_rele(parent, db); } +/* + * Note: While bpp will always be updated if the function returns success, + * parentp will not be updated if the dnode does not have dn_dbuf filled in; + * this happens when the dnode is the meta-dnode, or a userused or groupused + * object. + */ __attribute__((always_inline)) static inline int dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, @@ -1828,12 +1860,12 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, /* this block is referenced from an indirect block */ int err; if (dh == NULL) { - err = dbuf_hold_impl(dn, level+1, blkid >> epbs, - fail_sparse, NULL, parentp); + err = dbuf_hold_impl(dn, level+1, + blkid >> epbs, fail_sparse, FALSE, NULL, parentp); } else { __dbuf_hold_impl_init(dh + 1, dn, dh->dh_level + 1, - blkid >> epbs, fail_sparse, NULL, - parentp, dh->dh_depth + 1); + blkid >> epbs, fail_sparse, FALSE, NULL, + parentp, dh->dh_depth + 1); err = __dbuf_hold_impl(dh + 1); } if (err) @@ -2011,11 +2043,102 @@ dbuf_destroy(dmu_buf_impl_t *db) arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); } +typedef struct dbuf_prefetch_arg { + spa_t *dpa_spa; /* The spa to issue the prefetch in. */ + zbookmark_phys_t dpa_zb; /* The target block to prefetch. */ + int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */ + int dpa_curlevel; /* The current level that we're reading */ + zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */ + zio_t *dpa_zio; /* The parent zio_t for all prefetches. */ + arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */ +} dbuf_prefetch_arg_t; + +/* + * Actually issue the prefetch read for the block given. + */ +static void +dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp) +{ + arc_flags_t aflags; + if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) + return; + + aflags = dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; + + ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); + ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level); + ASSERT(dpa->dpa_zio != NULL); + (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL, + dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, + &aflags, &dpa->dpa_zb); +} + +/* + * Called when an indirect block above our prefetch target is read in. This + * will either read in the next indirect block down the tree or issue the actual + * prefetch if the next block down is our target. + */ +static void +dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private) +{ + dbuf_prefetch_arg_t *dpa = private; + uint64_t nextblkid; + blkptr_t *bp; + + ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel); + ASSERT3S(dpa->dpa_curlevel, >, 0); + if (zio != NULL) { + ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel); + ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); + ASSERT3P(zio->io_spa, ==, dpa->dpa_spa); + } + + dpa->dpa_curlevel--; + + nextblkid = dpa->dpa_zb.zb_blkid >> + (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level)); + bp = ((blkptr_t *)abuf->b_data) + + P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs); + if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) { + kmem_free(dpa, sizeof (*dpa)); + } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) { + ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid); + dbuf_issue_final_prefetch(dpa, bp); + kmem_free(dpa, sizeof (*dpa)); + } else { + arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; + zbookmark_phys_t zb; + + ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); + + SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset, + dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid); + + (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, + bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, + &iter_aflags, &zb); + } + (void) arc_buf_remove_ref(abuf, private); +} + +/* + * Issue prefetch reads for the given block on the given level. If the indirect + * blocks above that block are not in memory, we will read them in + * asynchronously. As a result, this call never blocks waiting for a read to + * complete. + */ void -dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio) +dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, + arc_flags_t aflags) { - dmu_buf_impl_t *db = NULL; - blkptr_t *bp = NULL; + blkptr_t bp; + int epbs, nlevels, curlevel; + uint64_t curblkid; + dmu_buf_impl_t *db; + zio_t *pio; + dbuf_prefetch_arg_t *dpa; + dsl_dataset_t *ds; ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); @@ -2023,35 +2146,104 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio) if (dnode_block_freed(dn, blkid)) return; - /* dbuf_find() returns with db_mtx held */ - if ((db = dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid))) { + /* + * This dnode hasn't been written to disk yet, so there's nothing to + * prefetch. + */ + nlevels = dn->dn_phys->dn_nlevels; + if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0) + return; + + epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level)) + return; + + db = dbuf_find(dn->dn_objset, dn->dn_object, + level, blkid); + if (db != NULL) { + mutex_exit(&db->db_mtx); /* - * This dbuf is already in the cache. We assume that - * it is already CACHED, or else about to be either - * read or filled. + * This dbuf already exists. It is either CACHED, or + * (we assume) about to be read or filled. */ - mutex_exit(&db->db_mtx); return; } - if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp, NULL) == 0) { - if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { - dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; - arc_flags_t aflags = - ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; - zbookmark_phys_t zb; + /* + * Find the closest ancestor (indirect block) of the target block + * that is present in the cache. In this indirect block, we will + * find the bp that is at curlevel, curblkid. + */ + curlevel = level; + curblkid = blkid; + while (curlevel < nlevels - 1) { + int parent_level = curlevel + 1; + uint64_t parent_blkid = curblkid >> epbs; + dmu_buf_impl_t *db; + + if (dbuf_hold_impl(dn, parent_level, parent_blkid, + FALSE, TRUE, FTAG, &db) == 0) { + blkptr_t *bpp = db->db_buf->b_data; + bp = bpp[P2PHASE(curblkid, 1 << epbs)]; + dbuf_rele(db, FTAG); + break; + } + + curlevel = parent_level; + curblkid = parent_blkid; + } - SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, - dn->dn_object, 0, blkid); + if (curlevel == nlevels - 1) { + /* No cached indirect blocks found. */ + ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr); + bp = dn->dn_phys->dn_blkptr[curblkid]; + } + if (BP_IS_HOLE(&bp)) + return; - (void) arc_read(NULL, dn->dn_objset->os_spa, - bp, NULL, NULL, prio, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, - &aflags, &zb); - } - if (db) - dbuf_rele(db, NULL); + ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp)); + + pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL, + ZIO_FLAG_CANFAIL); + + dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP); + ds = dn->dn_objset->os_dsl_dataset; + SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, + dn->dn_object, level, blkid); + dpa->dpa_curlevel = curlevel; + dpa->dpa_prio = prio; + dpa->dpa_aflags = aflags; + dpa->dpa_spa = dn->dn_objset->os_spa; + dpa->dpa_epbs = epbs; + dpa->dpa_zio = pio; + + /* + * If we have the indirect just above us, no need to do the asynchronous + * prefetch chain; we'll just run the last step ourselves. If we're at + * a higher level, though, we want to issue the prefetches for all the + * indirect blocks asynchronously, so we can go on with whatever we were + * doing. + */ + if (curlevel == level) { + ASSERT3U(curblkid, ==, blkid); + dbuf_issue_final_prefetch(dpa, &bp); + kmem_free(dpa, sizeof (*dpa)); + } else { + arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; + zbookmark_phys_t zb; + + SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, + dn->dn_object, curlevel, curblkid); + (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, + &bp, dbuf_prefetch_indirect_done, dpa, prio, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, + &iter_aflags, &zb); } + /* + * We use pio here instead of dpa_zio since it's possible that + * dpa may have already been freed. + */ + zio_nowait(pio); } #define DBUF_HOLD_IMPL_MAX_DEPTH 20 @@ -2079,6 +2271,9 @@ top: if (dh->dh_db == NULL) { dh->dh_bp = NULL; + if (dh->dh_fail_uncached) + return (SET_ERROR(ENOENT)); + ASSERT3P(dh->dh_parent, ==, NULL); dh->dh_err = dbuf_findbp(dh->dh_dn, dh->dh_level, dh->dh_blkid, dh->dh_fail_sparse, &dh->dh_parent, @@ -2099,6 +2294,11 @@ top: dh->dh_parent, dh->dh_bp); } + if (dh->dh_fail_uncached && dh->dh_db->db_state != DB_CACHED) { + mutex_exit(&dh->dh_db->db_mtx); + return (SET_ERROR(ENOENT)); + } + if (dh->dh_db->db_buf && refcount_is_zero(&dh->dh_db->db_holds)) { arc_buf_add_ref(dh->dh_db->db_buf, dh->dh_db); if (dh->dh_db->db_buf->b_data == NULL) { @@ -2159,7 +2359,8 @@ top: * on the stack for 20 levels of recursion. */ int -dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, +dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, + boolean_t fail_sparse, boolean_t fail_uncached, void *tag, dmu_buf_impl_t **dbp) { struct dbuf_hold_impl_data *dh; @@ -2167,7 +2368,8 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, dh = kmem_zalloc(sizeof (struct dbuf_hold_impl_data) * DBUF_HOLD_IMPL_MAX_DEPTH, KM_SLEEP); - __dbuf_hold_impl_init(dh, dn, level, blkid, fail_sparse, tag, dbp, 0); + __dbuf_hold_impl_init(dh, dn, level, blkid, fail_sparse, + fail_uncached, tag, dbp, 0); error = __dbuf_hold_impl(dh); @@ -2179,13 +2381,17 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, static void __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh, - dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, - void *tag, dmu_buf_impl_t **dbp, int depth) + dnode_t *dn, uint8_t level, uint64_t blkid, + boolean_t fail_sparse, boolean_t fail_uncached, + void *tag, dmu_buf_impl_t **dbp, int depth) { dh->dh_dn = dn; dh->dh_level = level; dh->dh_blkid = blkid; + dh->dh_fail_sparse = fail_sparse; + dh->dh_fail_uncached = fail_uncached; + dh->dh_tag = tag; dh->dh_dbp = dbp; dh->dh_depth = depth; @@ -2194,16 +2400,14 @@ __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh, dmu_buf_impl_t * dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) { - dmu_buf_impl_t *db; - int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db); - return (err ? NULL : db); + return (dbuf_hold_level(dn, 0, blkid, tag)); } dmu_buf_impl_t * dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) { dmu_buf_impl_t *db; - int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db); + int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db); return (err ? NULL : db); } @@ -2531,8 +2735,8 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) if (parent == NULL) { mutex_exit(&db->db_mtx); rw_enter(&dn->dn_struct_rwlock, RW_READER); - (void) dbuf_hold_impl(dn, db->db_level+1, - db->db_blkid >> epbs, FALSE, db, &parent); + parent = dbuf_hold_level(dn, db->db_level + 1, + db->db_blkid >> epbs, db); rw_exit(&dn->dn_struct_rwlock); mutex_enter(&db->db_mtx); db->db_parent = parent; diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index f4027af9c..b4133f0e4 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -138,7 +138,7 @@ dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); - blkid = dbuf_whichblock(dn, offset); + blkid = dbuf_whichblock(dn, 0, offset); rw_enter(&dn->dn_struct_rwlock, RW_READER); db = dbuf_hold(dn, blkid, tag); rw_exit(&dn->dn_struct_rwlock); @@ -421,7 +421,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); - blkid = dbuf_whichblock(dn, offset); + blkid = dbuf_whichblock(dn, 0, offset); for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag); if (db == NULL) { @@ -522,17 +522,16 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) } /* - * Issue prefetch i/os for the given blocks. + * Issue prefetch i/os for the given blocks. If level is greater than 0, the + * indirect blocks prefeteched will be those that point to the blocks containing + * the data starting at offset, and continuing to offset + len. * - * Note: The assumption is that we *know* these blocks will be needed - * almost immediately. Therefore, the prefetch i/os will be issued at - * ZIO_PRIORITY_SYNC_READ - * - * Note: indirect blocks and other metadata will be read synchronously, - * causing this function to block if they are not already cached. + * Note that if the indirect blocks above the blocks being prefetched are not in + * cache, they will be asychronously read in. */ void -dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) +dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, + uint64_t len, zio_priority_t pri) { dnode_t *dn; uint64_t blkid; @@ -548,8 +547,9 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) return; rw_enter(&dn->dn_struct_rwlock, RW_READER); - blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t)); - dbuf_prefetch(dn, blkid, ZIO_PRIORITY_SYNC_READ); + blkid = dbuf_whichblock(dn, level, + object * sizeof (dnode_phys_t)); + dbuf_prefetch(dn, level, blkid, pri, 0); rw_exit(&dn->dn_struct_rwlock); return; } @@ -564,10 +564,16 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) return; rw_enter(&dn->dn_struct_rwlock, RW_READER); - if (dn->dn_datablkshift) { - int blkshift = dn->dn_datablkshift; - nblks = (P2ROUNDUP(offset + len, 1 << blkshift) - - P2ALIGN(offset, 1 << blkshift)) >> blkshift; + /* + * offset + len - 1 is the last byte we want to prefetch for, and offset + * is the first. Then dbuf_whichblk(dn, level, off + len - 1) is the + * last block we want to prefetch, and dbuf_whichblock(dn, level, + * offset) is the first. Then the number we need to prefetch is the + * last - first + 1. + */ + if (level > 0 || dn->dn_datablkshift != 0) { + nblks = dbuf_whichblock(dn, level, offset + len - 1) - + dbuf_whichblock(dn, level, offset) + 1; } else { nblks = (offset < dn->dn_datablksz); } @@ -575,9 +581,9 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) if (nblks != 0) { int i; - blkid = dbuf_whichblock(dn, offset); + blkid = dbuf_whichblock(dn, level, offset); for (i = 0; i < nblks; i++) - dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_SYNC_READ); + dbuf_prefetch(dn, level, blkid + i, pri, 0); } rw_exit(&dn->dn_struct_rwlock); @@ -1293,7 +1299,7 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, DB_DNODE_ENTER(dbuf); dn = DB_DNODE(dbuf); rw_enter(&dn->dn_struct_rwlock, RW_READER); - blkid = dbuf_whichblock(dn, offset); + blkid = dbuf_whichblock(dn, 0, offset); VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL); rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(dbuf); diff --git a/module/zfs/dmu_diff.c b/module/zfs/dmu_diff.c index 91415d0d2..7665d1ca5 100644 --- a/module/zfs/dmu_diff.c +++ b/module/zfs/dmu_diff.c @@ -115,7 +115,7 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, if (issig(JUSTLOOKING) && issig(FORREAL)) return (SET_ERROR(EINTR)); - if (zb->zb_object != DMU_META_DNODE_OBJECT) + if (bp == NULL || zb->zb_object != DMU_META_DNODE_OBJECT) return (0); if (BP_IS_HOLE(bp)) { diff --git a/module/zfs/dmu_object.c b/module/zfs/dmu_object.c index c0e8d7b7f..127a6757f 100644 --- a/module/zfs/dmu_object.c +++ b/module/zfs/dmu_object.c @@ -148,6 +148,11 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx) return (0); } +/* + * Return (in *objectp) the next object which is allocated (or a hole) + * after *object, taking into account only objects that may have been modified + * after the specified txg. + */ int dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg) { diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 2502b2c97..88d27c867 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -53,13 +53,38 @@ #include <sys/blkptr.h> #include <sys/dsl_bookmark.h> #include <sys/zfeature.h> +#include <sys/bqueue.h> /* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */ int zfs_send_corrupt_data = B_FALSE; +int zfs_send_queue_length = 16 * 1024 * 1024; +int zfs_recv_queue_length = 16 * 1024 * 1024; static char *dmu_recv_tag = "dmu_recv_tag"; static const char *recv_clone_name = "%recv"; +#define BP_SPAN(datablkszsec, indblkshift, level) \ + (((uint64_t)datablkszsec) << (SPA_MINBLOCKSHIFT + \ + (level) * (indblkshift - SPA_BLKPTRSHIFT))) + +struct send_thread_arg { + bqueue_t q; + dsl_dataset_t *ds; /* Dataset to traverse */ + uint64_t fromtxg; /* Traverse from this txg */ + int flags; /* flags to pass to traverse_dataset */ + int error_code; + boolean_t cancel; +}; + +struct send_block_record { + boolean_t eos_marker; /* Marks the end of the stream */ + blkptr_t bp; + zbookmark_phys_t zb; + uint8_t indblkshift; + uint16_t datablkszsec; + bqueue_node_t ln; +}; + typedef struct dump_bytes_io { dmu_sendarg_t *dbi_dsp; void *dbi_buf; @@ -466,47 +491,108 @@ backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp) return (B_FALSE); } -#define BP_SPAN(dnp, level) \ - (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ - (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) +/* + * This is the callback function to traverse_dataset that acts as the worker + * thread for dmu_send_impl. + */ +/*ARGSUSED*/ +static int +send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg) +{ + struct send_thread_arg *sta = arg; + struct send_block_record *record; + uint64_t record_size; + int err = 0; + + if (sta->cancel) + return (SET_ERROR(EINTR)); -/* ARGSUSED */ + if (bp == NULL) { + ASSERT3U(zb->zb_level, ==, ZB_DNODE_LEVEL); + return (0); + } else if (zb->zb_level < 0) { + return (0); + } + + record = kmem_zalloc(sizeof (struct send_block_record), KM_SLEEP); + record->eos_marker = B_FALSE; + record->bp = *bp; + record->zb = *zb; + record->indblkshift = dnp->dn_indblkshift; + record->datablkszsec = dnp->dn_datablkszsec; + record_size = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; + bqueue_enqueue(&sta->q, record, record_size); + + return (err); +} + +/* + * This function kicks off the traverse_dataset. It also handles setting the + * error code of the thread in case something goes wrong, and pushes the End of + * Stream record when the traverse_dataset call has finished. If there is no + * dataset to traverse, the thread immediately pushes End of Stream marker. + */ +static void +send_traverse_thread(void *arg) +{ + struct send_thread_arg *st_arg = arg; + int err; + struct send_block_record *data; + + if (st_arg->ds != NULL) { + err = traverse_dataset(st_arg->ds, st_arg->fromtxg, + st_arg->flags, send_cb, arg); + if (err != EINTR) + st_arg->error_code = err; + } + data = kmem_zalloc(sizeof (*data), KM_SLEEP); + data->eos_marker = B_TRUE; + bqueue_enqueue(&st_arg->q, data, 1); +} + +/* + * This function actually handles figuring out what kind of record needs to be + * dumped, reading the data (which has hopefully been prefetched), and calling + * the appropriate helper function. + */ static int -backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) +do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) { - dmu_sendarg_t *dsp = arg; + dsl_dataset_t *ds = dmu_objset_ds(dsa->dsa_os); + const blkptr_t *bp = &data->bp; + const zbookmark_phys_t *zb = &data->zb; + uint8_t indblkshift = data->indblkshift; + uint16_t dblkszsec = data->datablkszsec; + spa_t *spa = ds->ds_dir->dd_pool->dp_spa; dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE; int err = 0; + dnode_phys_t *blk; + uint64_t dnobj; - if (issig(JUSTLOOKING) && issig(FORREAL)) - return (SET_ERROR(EINTR)); + ASSERT3U(zb->zb_level, >=, 0); if (zb->zb_object != DMU_META_DNODE_OBJECT && DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { return (0); - } else if (zb->zb_level == ZB_ZIL_LEVEL) { - /* - * If we are sending a non-snapshot (which is allowed on - * read-only pools), it may have a ZIL, which must be ignored. - */ - return (0); } else if (BP_IS_HOLE(bp) && zb->zb_object == DMU_META_DNODE_OBJECT) { - uint64_t span = BP_SPAN(dnp, zb->zb_level); + uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level); uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; - err = dump_freeobjects(dsp, dnobj, span >> DNODE_SHIFT); + err = dump_freeobjects(dsa, dnobj, span >> DNODE_SHIFT); } else if (BP_IS_HOLE(bp)) { - uint64_t span = BP_SPAN(dnp, zb->zb_level); - err = dump_free(dsp, zb->zb_object, zb->zb_blkid * span, span); + uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level); + uint64_t offset = zb->zb_blkid * span; + err = dump_free(dsa, zb->zb_object, offset, span); } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) { return (0); } else if (type == DMU_OT_DNODE) { - dnode_phys_t *blk; - int i; int blksz = BP_GET_LSIZE(bp); arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; + int i; + + ASSERT0(zb->zb_level); if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, @@ -514,10 +600,9 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, return (SET_ERROR(EIO)); blk = abuf->b_data; + dnobj = zb->zb_blkid * (blksz >> DNODE_SHIFT); for (i = 0; i < blksz >> DNODE_SHIFT; i++) { - uint64_t dnobj = (zb->zb_blkid << - (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; - err = dump_dnode(dsp, dnobj, blk+i); + err = dump_dnode(dsa, dnobj + i, blk + i); if (err != 0) break; } @@ -532,20 +617,21 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, &aflags, zb) != 0) return (SET_ERROR(EIO)); - err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data); + err = dump_spill(dsa, zb->zb_object, blksz, abuf->b_data); (void) arc_buf_remove_ref(abuf, &abuf); - } else if (backup_do_embed(dsp, bp)) { + } else if (backup_do_embed(dsa, bp)) { /* it's an embedded level-0 block of a regular object */ - int blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; - err = dump_write_embedded(dsp, zb->zb_object, + int blksz = dblkszsec << SPA_MINBLOCKSHIFT; + ASSERT0(zb->zb_level); + err = dump_write_embedded(dsa, zb->zb_object, zb->zb_blkid * blksz, blksz, bp); - } else { /* it's a level-0 block of a regular object */ - uint64_t offset; + } else { + /* it's a level-0 block of a regular object */ arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; - int blksz = BP_GET_LSIZE(bp); + int blksz = dblkszsec << SPA_MINBLOCKSHIFT; + uint64_t offset; - ASSERT3U(blksz, ==, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); ASSERT0(zb->zb_level); if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, @@ -566,20 +652,20 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, offset = zb->zb_blkid * blksz; - if (!(dsp->dsa_featureflags & + if (!(dsa->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && blksz > SPA_OLD_MAXBLOCKSIZE) { char *buf = abuf->b_data; while (blksz > 0 && err == 0) { int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE); - err = dump_write(dsp, type, zb->zb_object, + err = dump_write(dsa, type, zb->zb_object, offset, n, NULL, buf); offset += n; buf += n; blksz -= n; } } else { - err = dump_write(dsp, type, zb->zb_object, + err = dump_write(dsa, type, zb->zb_object, offset, blksz, bp, abuf->b_data); } (void) arc_buf_remove_ref(abuf, &abuf); @@ -590,11 +676,24 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, } /* - * Releases dp using the specified tag. + * Pop the new data off the queue, and free the old data. + */ +static struct send_block_record * +get_next_record(bqueue_t *bq, struct send_block_record *data) +{ + struct send_block_record *tmp = bqueue_dequeue(bq); + kmem_free(data, sizeof (*data)); + return (tmp); +} + +/* + * Actually do the bulk of the work in a zfs send. + * + * Note: Releases dp using the specified tag. */ static int -dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, - zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok, +dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, + zfs_bookmark_phys_t *ancestor_zb, boolean_t is_clone, boolean_t embedok, boolean_t large_block_ok, int outfd, vnode_t *vp, offset_t *off) { objset_t *os; @@ -603,8 +702,10 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, int err; uint64_t fromtxg = 0; uint64_t featureflags = 0; + struct send_thread_arg to_arg; + struct send_block_record *to_data; - err = dmu_objset_from_ds(ds, &os); + err = dmu_objset_from_ds(to_ds, &os); if (err != 0) { dsl_pool_rele(dp, tag); return (err); @@ -630,35 +731,34 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, } #endif - if (large_block_ok && ds->ds_feature_inuse[SPA_FEATURE_LARGE_BLOCKS]) + if (large_block_ok && to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_BLOCKS]) featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS; if (embedok && spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) { featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA; if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4; - } else { - embedok = B_FALSE; } DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo, featureflags); drr->drr_u.drr_begin.drr_creation_time = - dsl_dataset_phys(ds)->ds_creation_time; + dsl_dataset_phys(to_ds)->ds_creation_time; drr->drr_u.drr_begin.drr_type = dmu_objset_type(os); if (is_clone) drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; - drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(ds)->ds_guid; - if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) + drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid; + if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET) drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA; - if (fromzb != NULL) { - drr->drr_u.drr_begin.drr_fromguid = fromzb->zbm_guid; - fromtxg = fromzb->zbm_creation_txg; + if (ancestor_zb != NULL) { + drr->drr_u.drr_begin.drr_fromguid = + ancestor_zb->zbm_guid; + fromtxg = ancestor_zb->zbm_creation_txg; } - dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname); - if (!ds->ds_is_snapshot) { + dsl_dataset_name(to_ds, drr->drr_u.drr_begin.drr_toname); + if (!to_ds->ds_is_snapshot) { (void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--", sizeof (drr->drr_u.drr_begin.drr_toname)); } @@ -671,16 +771,16 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, dsp->dsa_proc = curproc; dsp->dsa_os = os; dsp->dsa_off = off; - dsp->dsa_toguid = dsl_dataset_phys(ds)->ds_guid; + dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid; dsp->dsa_pending_op = PENDING_NONE; - dsp->dsa_incremental = (fromzb != NULL); + dsp->dsa_incremental = (ancestor_zb != NULL); dsp->dsa_featureflags = featureflags; - mutex_enter(&ds->ds_sendstream_lock); - list_insert_head(&ds->ds_sendstreams, dsp); - mutex_exit(&ds->ds_sendstream_lock); + mutex_enter(&to_ds->ds_sendstream_lock); + list_insert_head(&to_ds->ds_sendstreams, dsp); + mutex_exit(&to_ds->ds_sendstream_lock); - dsl_dataset_long_hold(ds, FTAG); + dsl_dataset_long_hold(to_ds, FTAG); dsl_pool_rele(dp, tag); if (dump_record(dsp, NULL, 0) != 0) { @@ -688,8 +788,40 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, goto out; } - err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH, - backup_cb, dsp); + err = bqueue_init(&to_arg.q, zfs_send_queue_length, + offsetof(struct send_block_record, ln)); + to_arg.error_code = 0; + to_arg.cancel = B_FALSE; + to_arg.ds = to_ds; + to_arg.fromtxg = fromtxg; + to_arg.flags = TRAVERSE_PRE | TRAVERSE_PREFETCH; + (void) thread_create(NULL, 0, send_traverse_thread, &to_arg, 0, curproc, + TS_RUN, minclsyspri); + + to_data = bqueue_dequeue(&to_arg.q); + + while (!to_data->eos_marker && err == 0) { + err = do_dump(dsp, to_data); + to_data = get_next_record(&to_arg.q, to_data); + if (issig(JUSTLOOKING) && issig(FORREAL)) + err = EINTR; + } + + if (err != 0) { + to_arg.cancel = B_TRUE; + while (!to_data->eos_marker) { + to_data = get_next_record(&to_arg.q, to_data); + } + } + kmem_free(to_data, sizeof (*to_data)); + + bqueue_destroy(&to_arg.q); + + if (err == 0 && to_arg.error_code != 0) + err = to_arg.error_code; + + if (err != 0) + goto out; if (dsp->dsa_pending_op != PENDING_NONE) if (dump_record(dsp, NULL, 0) != 0) @@ -706,20 +838,18 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc; drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid; - if (dump_record(dsp, NULL, 0) != 0) { + if (dump_record(dsp, NULL, 0) != 0) err = dsp->dsa_err; - goto out; - } out: - mutex_enter(&ds->ds_sendstream_lock); - list_remove(&ds->ds_sendstreams, dsp); - mutex_exit(&ds->ds_sendstream_lock); + mutex_enter(&to_ds->ds_sendstream_lock); + list_remove(&to_ds->ds_sendstreams, dsp); + mutex_exit(&to_ds->ds_sendstream_lock); kmem_free(drr, sizeof (dmu_replay_record_t)); kmem_free(dsp, sizeof (dmu_sendarg_t)); - dsl_dataset_long_rele(ds, FTAG); + dsl_dataset_long_rele(to_ds, FTAG); return (err); } @@ -1139,7 +1269,8 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) * If it's a non-clone incremental, we are missing the * target fs, so fail the recv. */ - if (fromguid != 0 && !(flags & DRR_FLAG_CLONE)) + if (fromguid != 0 && !(flags & DRR_FLAG_CLONE || + drba->drba_origin)) return (SET_ERROR(ENOENT)); /* Open the parent of tofs */ @@ -1313,21 +1444,57 @@ dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, &drba, 5, ZFS_SPACE_CHECK_NORMAL)); } -struct restorearg { +struct receive_record_arg { + dmu_replay_record_t header; + void *payload; /* Pointer to a buffer containing the payload */ + /* + * If the record is a write, pointer to the arc_buf_t containing the + * payload. + */ + arc_buf_t *write_buf; + int payload_size; + boolean_t eos_marker; /* Marks the end of the stream */ + bqueue_node_t node; +}; + +struct receive_writer_arg { objset_t *os; - int err; boolean_t byteswap; - vnode_t *vp; - uint64_t voff; - int bufsize; /* amount of memory allocated for buf */ + bqueue_t q; + /* + * These three args are used to signal to the main thread that we're + * done. + */ + kmutex_t mutex; + kcondvar_t cv; + boolean_t done; + int err; + /* A map from guid to dataset to help handle dedup'd streams. */ + avl_tree_t *guid_to_ds_map; +}; - dmu_replay_record_t *drr; - dmu_replay_record_t *next_drr; - char *buf; +struct receive_arg { + objset_t *os; + vnode_t *vp; /* The vnode to read the stream from */ + uint64_t voff; /* The current offset in the stream */ + /* + * A record that has had its payload read in, but hasn't yet been handed + * off to the worker thread. + */ + struct receive_record_arg *rrd; + /* A record that has had its header read in, but not its payload. */ + struct receive_record_arg *next_rrd; zio_cksum_t cksum; zio_cksum_t prev_cksum; + int err; + boolean_t byteswap; + /* Sorted list of objects not to issue prefetches for. */ + list_t ignore_obj_list; +}; - avl_tree_t *guid_to_ds_map; +struct receive_ign_obj_node { + list_node_t node; + uint64_t object; }; typedef struct guid_map_entry { @@ -1366,13 +1533,12 @@ free_guid_map_onexit(void *arg) } static int -restore_read(struct restorearg *ra, int len, void *buf) +receive_read(struct receive_arg *ra, int len, void *buf) { int done = 0; /* some things will require 8-byte alignment, so everything must */ ASSERT0(len % 8); - ASSERT3U(len, <=, ra->bufsize); while (done < len) { ssize_t resid; @@ -1493,7 +1659,8 @@ deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size) } noinline static int -restore_object(struct restorearg *ra, struct drr_object *drro, void *data) +receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, + void *data) { dmu_object_info_t doi; dmu_tx_t *tx; @@ -1507,12 +1674,12 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data) drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || drro->drr_blksz < SPA_MINBLOCKSIZE || - drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(ra->os)) || + drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) || drro->drr_bonuslen > DN_MAX_BONUSLEN) { return (SET_ERROR(EINVAL)); } - err = dmu_object_info(ra->os, drro->drr_object, &doi); + err = dmu_object_info(rwa->os, drro->drr_object, &doi); if (err != 0 && err != ENOENT) return (SET_ERROR(EINVAL)); @@ -1531,14 +1698,14 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data) if (drro->drr_blksz != doi.doi_data_block_size || nblkptr < doi.doi_nblkptr) { - err = dmu_free_long_range(ra->os, drro->drr_object, + err = dmu_free_long_range(rwa->os, drro->drr_object, 0, DMU_OBJECT_END); if (err != 0) return (SET_ERROR(EINVAL)); } } - tx = dmu_tx_create(ra->os); + tx = dmu_tx_create(rwa->os); dmu_tx_hold_bonus(tx, object); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { @@ -1548,7 +1715,7 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data) if (object == DMU_NEW_OBJECT) { /* currently free, want to be allocated */ - err = dmu_object_claim(ra->os, drro->drr_object, + err = dmu_object_claim(rwa->os, drro->drr_object, drro->drr_type, drro->drr_blksz, drro->drr_bonustype, drro->drr_bonuslen, tx); } else if (drro->drr_type != doi.doi_type || @@ -1556,7 +1723,7 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data) drro->drr_bonustype != doi.doi_bonus_type || drro->drr_bonuslen != doi.doi_bonus_size) { /* currently allocated, but with different properties */ - err = dmu_object_reclaim(ra->os, drro->drr_object, + err = dmu_object_reclaim(rwa->os, drro->drr_object, drro->drr_type, drro->drr_blksz, drro->drr_bonustype, drro->drr_bonuslen, tx); } @@ -1565,20 +1732,20 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data) return (SET_ERROR(EINVAL)); } - dmu_object_set_checksum(ra->os, drro->drr_object, + dmu_object_set_checksum(rwa->os, drro->drr_object, drro->drr_checksumtype, tx); - dmu_object_set_compress(ra->os, drro->drr_object, + dmu_object_set_compress(rwa->os, drro->drr_object, drro->drr_compress, tx); if (data != NULL) { dmu_buf_t *db; - VERIFY0(dmu_bonus_hold(ra->os, drro->drr_object, FTAG, &db)); + VERIFY0(dmu_bonus_hold(rwa->os, drro->drr_object, FTAG, &db)); dmu_buf_will_dirty(db, tx); ASSERT3U(db->db_size, >=, drro->drr_bonuslen); bcopy(data, db->db_data, drro->drr_bonuslen); - if (ra->byteswap) { + if (rwa->byteswap) { dmu_object_byteswap_t byteswap = DMU_OT_BYTESWAP(drro->drr_bonustype); dmu_ot_byteswap[byteswap].ob_func(db->db_data, @@ -1592,7 +1759,7 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data) /* ARGSUSED */ noinline static int -restore_freeobjects(struct restorearg *ra, +receive_freeobjects(struct receive_writer_arg *rwa, struct drr_freeobjects *drrfo) { uint64_t obj; @@ -1602,13 +1769,13 @@ restore_freeobjects(struct restorearg *ra, for (obj = drrfo->drr_firstobj; obj < drrfo->drr_firstobj + drrfo->drr_numobjs; - (void) dmu_object_next(ra->os, &obj, FALSE, 0)) { + (void) dmu_object_next(rwa->os, &obj, FALSE, 0)) { int err; - if (dmu_object_info(ra->os, obj, NULL) != 0) + if (dmu_object_info(rwa->os, obj, NULL) != 0) continue; - err = dmu_free_long_object(ra->os, obj); + err = dmu_free_long_object(rwa->os, obj); if (err != 0) return (err); } @@ -1616,7 +1783,8 @@ restore_freeobjects(struct restorearg *ra, } noinline static int -restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf) +receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw, + arc_buf_t *abuf) { dmu_tx_t *tx; dmu_buf_t *bonus; @@ -1626,10 +1794,10 @@ restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf) !DMU_OT_IS_VALID(drrw->drr_type)) return (SET_ERROR(EINVAL)); - if (dmu_object_info(ra->os, drrw->drr_object, NULL) != 0) + if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); - tx = dmu_tx_create(ra->os); + tx = dmu_tx_create(rwa->os); dmu_tx_hold_write(tx, drrw->drr_object, drrw->drr_offset, drrw->drr_length); @@ -1638,14 +1806,14 @@ restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf) dmu_tx_abort(tx); return (err); } - if (ra->byteswap) { + if (rwa->byteswap) { dmu_object_byteswap_t byteswap = DMU_OT_BYTESWAP(drrw->drr_type); dmu_ot_byteswap[byteswap].ob_func(abuf->b_data, drrw->drr_length); } - if (dmu_bonus_hold(ra->os, drrw->drr_object, FTAG, &bonus) != 0) + if (dmu_bonus_hold(rwa->os, drrw->drr_object, FTAG, &bonus) != 0) return (SET_ERROR(EINVAL)); dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx); dmu_tx_commit(tx); @@ -1661,7 +1829,8 @@ restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf) * data from the stream to fulfill this write. */ static int -restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr) +receive_write_byref(struct receive_writer_arg *rwa, + struct drr_write_byref *drrwbr) { dmu_tx_t *tx; int err; @@ -1680,14 +1849,14 @@ restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr) */ if (drrwbr->drr_toguid != drrwbr->drr_refguid) { gmesrch.guid = drrwbr->drr_refguid; - if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch, + if ((gmep = avl_find(rwa->guid_to_ds_map, &gmesrch, &where)) == NULL) { return (SET_ERROR(EINVAL)); } if (dmu_objset_from_ds(gmep->gme_ds, &ref_os)) return (SET_ERROR(EINVAL)); } else { - ref_os = ra->os; + ref_os = rwa->os; } err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, @@ -1695,7 +1864,7 @@ restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr) if (err != 0) return (err); - tx = dmu_tx_create(ra->os); + tx = dmu_tx_create(rwa->os); dmu_tx_hold_write(tx, drrwbr->drr_object, drrwbr->drr_offset, drrwbr->drr_length); @@ -1704,7 +1873,7 @@ restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr) dmu_tx_abort(tx); return (err); } - dmu_write(ra->os, drrwbr->drr_object, + dmu_write(rwa->os, drrwbr->drr_object, drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); dmu_buf_rele(dbp, FTAG); dmu_tx_commit(tx); @@ -1712,7 +1881,7 @@ restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr) } static int -restore_write_embedded(struct restorearg *ra, +receive_write_embedded(struct receive_writer_arg *rwa, struct drr_write_embedded *drrwnp, void *data) { dmu_tx_t *tx; @@ -1729,7 +1898,7 @@ restore_write_embedded(struct restorearg *ra, if (drrwnp->drr_compression >= ZIO_COMPRESS_FUNCTIONS) return (EINVAL); - tx = dmu_tx_create(ra->os); + tx = dmu_tx_create(rwa->os); dmu_tx_hold_write(tx, drrwnp->drr_object, drrwnp->drr_offset, drrwnp->drr_length); @@ -1739,36 +1908,37 @@ restore_write_embedded(struct restorearg *ra, return (err); } - dmu_write_embedded(ra->os, drrwnp->drr_object, + dmu_write_embedded(rwa->os, drrwnp->drr_object, drrwnp->drr_offset, data, drrwnp->drr_etype, drrwnp->drr_compression, drrwnp->drr_lsize, drrwnp->drr_psize, - ra->byteswap ^ ZFS_HOST_BYTEORDER, tx); + rwa->byteswap ^ ZFS_HOST_BYTEORDER, tx); dmu_tx_commit(tx); return (0); } static int -restore_spill(struct restorearg *ra, struct drr_spill *drrs, void *data) +receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, + void *data) { dmu_tx_t *tx; dmu_buf_t *db, *db_spill; int err; if (drrs->drr_length < SPA_MINBLOCKSIZE || - drrs->drr_length > spa_maxblocksize(dmu_objset_spa(ra->os))) + drrs->drr_length > spa_maxblocksize(dmu_objset_spa(rwa->os))) return (SET_ERROR(EINVAL)); - if (dmu_object_info(ra->os, drrs->drr_object, NULL) != 0) + if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); - VERIFY0(dmu_bonus_hold(ra->os, drrs->drr_object, FTAG, &db)); + VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db)); if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) { dmu_buf_rele(db, FTAG); return (err); } - tx = dmu_tx_create(ra->os); + tx = dmu_tx_create(rwa->os); dmu_tx_hold_spill(tx, db->db_object); @@ -1795,7 +1965,7 @@ restore_spill(struct restorearg *ra, struct drr_spill *drrs, void *data) /* ARGSUSED */ noinline static int -restore_free(struct restorearg *ra, struct drr_free *drrf) +receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf) { int err; @@ -1803,11 +1973,12 @@ restore_free(struct restorearg *ra, struct drr_free *drrf) drrf->drr_offset + drrf->drr_length < drrf->drr_offset) return (SET_ERROR(EINVAL)); - if (dmu_object_info(ra->os, drrf->drr_object, NULL) != 0) + if (dmu_object_info(rwa->os, drrf->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); - err = dmu_free_long_range(ra->os, drrf->drr_object, + err = dmu_free_long_range(rwa->os, drrf->drr_object, drrf->drr_offset, drrf->drr_length); + return (err); } @@ -1822,7 +1993,7 @@ dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) } static void -restore_cksum(struct restorearg *ra, int len, void *buf) +receive_cksum(struct receive_arg *ra, int len, void *buf) { if (ra->byteswap) { fletcher_4_incremental_byteswap(buf, len, &ra->cksum); @@ -1832,32 +2003,44 @@ restore_cksum(struct restorearg *ra, int len, void *buf) } /* - * If len != 0, read payload into buf. - * Read next record's header into ra->next_drr. + * Read the payload into a buffer of size len, and update the current record's + * payload field. + * Allocate ra->next_rrd and read the next record's header into + * ra->next_rrd->header. * Verify checksum of payload and next record. */ static int -restore_read_payload_and_next_header(struct restorearg *ra, int len, void *buf) +receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf) { int err; zio_cksum_t cksum_orig; zio_cksum_t *cksump; if (len != 0) { - ASSERT3U(len, <=, ra->bufsize); - err = restore_read(ra, len, buf); + ASSERT3U(len, <=, SPA_MAXBLOCKSIZE); + ra->rrd->payload = buf; + ra->rrd->payload_size = len; + err = receive_read(ra, len, ra->rrd->payload); if (err != 0) return (err); - restore_cksum(ra, len, buf); + receive_cksum(ra, len, ra->rrd->payload); } ra->prev_cksum = ra->cksum; - err = restore_read(ra, sizeof (*ra->next_drr), ra->next_drr); - if (err != 0) + ra->next_rrd = kmem_zalloc(sizeof (*ra->next_rrd), KM_SLEEP); + err = receive_read(ra, sizeof (ra->next_rrd->header), + &ra->next_rrd->header); + if (err != 0) { + kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); + ra->next_rrd = NULL; return (err); - if (ra->next_drr->drr_type == DRR_BEGIN) + } + if (ra->next_rrd->header.drr_type == DRR_BEGIN) { + kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); + ra->next_rrd = NULL; return (SET_ERROR(EINVAL)); + } /* * Note: checksum is of everything up to but not including the @@ -1865,107 +2048,248 @@ restore_read_payload_and_next_header(struct restorearg *ra, int len, void *buf) */ ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); - restore_cksum(ra, + receive_cksum(ra, offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), - ra->next_drr); + &ra->next_rrd->header); - cksum_orig = ra->next_drr->drr_u.drr_checksum.drr_checksum; - cksump = &ra->next_drr->drr_u.drr_checksum.drr_checksum; + cksum_orig = ra->next_rrd->header.drr_u.drr_checksum.drr_checksum; + cksump = &ra->next_rrd->header.drr_u.drr_checksum.drr_checksum; if (ra->byteswap) - byteswap_record(ra->next_drr); + byteswap_record(&ra->next_rrd->header); if ((!ZIO_CHECKSUM_IS_ZERO(cksump)) && - !ZIO_CHECKSUM_EQUAL(ra->cksum, *cksump)) + !ZIO_CHECKSUM_EQUAL(ra->cksum, *cksump)) { + kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); + ra->next_rrd = NULL; return (SET_ERROR(ECKSUM)); + } - restore_cksum(ra, sizeof (cksum_orig), &cksum_orig); + receive_cksum(ra, sizeof (cksum_orig), &cksum_orig); return (0); } +/* + * Issue the prefetch reads for any necessary indirect blocks. + * + * We use the object ignore list to tell us whether or not to issue prefetches + * for a given object. We do this for both correctness (in case the blocksize + * of an object has changed) and performance (if the object doesn't exist, don't + * needlessly try to issue prefetches). We also trim the list as we go through + * the stream to prevent it from growing to an unbounded size. + * + * The object numbers within will always be in sorted order, and any write + * records we see will also be in sorted order, but they're not sorted with + * respect to each other (i.e. we can get several object records before + * receiving each object's write records). As a result, once we've reached a + * given object number, we can safely remove any reference to lower object + * numbers in the ignore list. In practice, we receive up to 32 object records + * before receiving write records, so the list can have up to 32 nodes in it. + */ +/* ARGSUSED */ +static void +receive_read_prefetch(struct receive_arg *ra, + uint64_t object, uint64_t offset, uint64_t length) +{ + struct receive_ign_obj_node *node = list_head(&ra->ignore_obj_list); + while (node != NULL && node->object < object) { + VERIFY3P(node, ==, list_remove_head(&ra->ignore_obj_list)); + kmem_free(node, sizeof (*node)); + node = list_head(&ra->ignore_obj_list); + } + if (node == NULL || node->object > object) { + dmu_prefetch(ra->os, object, 1, offset, length, + ZIO_PRIORITY_SYNC_READ); + } +} + +/* + * Read records off the stream, issuing any necessary prefetches. + */ static int -restore_process_record(struct restorearg *ra) +receive_read_record(struct receive_arg *ra) { int err; - switch (ra->drr->drr_type) { + switch (ra->rrd->header.drr_type) { case DRR_OBJECT: { - struct drr_object *drro = &ra->drr->drr_u.drr_object; - err = restore_read_payload_and_next_header(ra, - P2ROUNDUP(drro->drr_bonuslen, 8), ra->buf); - if (err != 0) + struct drr_object *drro = &ra->rrd->header.drr_u.drr_object; + uint32_t size = P2ROUNDUP(drro->drr_bonuslen, 8); + void *buf = kmem_zalloc(size, KM_SLEEP); + dmu_object_info_t doi; + err = receive_read_payload_and_next_header(ra, size, buf); + if (err != 0) { + kmem_free(buf, size); return (err); - return (restore_object(ra, drro, ra->buf)); + } + err = dmu_object_info(ra->os, drro->drr_object, &doi); + /* + * See receive_read_prefetch for an explanation why we're + * storing this object in the ignore_obj_list. + */ + if (err == ENOENT || + (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) { + struct receive_ign_obj_node *node = + kmem_zalloc(sizeof (*node), + KM_SLEEP); + node->object = drro->drr_object; +#ifdef ZFS_DEBUG + { + struct receive_ign_obj_node *last_object = + list_tail(&ra->ignore_obj_list); + uint64_t last_objnum = (last_object != NULL ? + last_object->object : 0); + ASSERT3U(node->object, >, last_objnum); + } +#endif + list_insert_tail(&ra->ignore_obj_list, node); + err = 0; + } + return (err); } case DRR_FREEOBJECTS: { - struct drr_freeobjects *drrfo = - &ra->drr->drr_u.drr_freeobjects; - err = restore_read_payload_and_next_header(ra, 0, NULL); - if (err != 0) - return (err); - return (restore_freeobjects(ra, drrfo)); + err = receive_read_payload_and_next_header(ra, 0, NULL); + return (err); } case DRR_WRITE: { - struct drr_write *drrw = &ra->drr->drr_u.drr_write; + struct drr_write *drrw = &ra->rrd->header.drr_u.drr_write; arc_buf_t *abuf = arc_loan_buf(dmu_objset_spa(ra->os), drrw->drr_length); - err = restore_read_payload_and_next_header(ra, + err = receive_read_payload_and_next_header(ra, drrw->drr_length, abuf->b_data); - if (err != 0) - return (err); - err = restore_write(ra, drrw, abuf); - /* if restore_write() is successful, it consumes the arc_buf */ - if (err != 0) + if (err != 0) { dmu_return_arcbuf(abuf); + return (err); + } + ra->rrd->write_buf = abuf; + receive_read_prefetch(ra, drrw->drr_object, drrw->drr_offset, + drrw->drr_length); return (err); } case DRR_WRITE_BYREF: { - struct drr_write_byref *drrwbr = - &ra->drr->drr_u.drr_write_byref; - err = restore_read_payload_and_next_header(ra, 0, NULL); - if (err != 0) - return (err); - return (restore_write_byref(ra, drrwbr)); + struct drr_write_byref *drrwb = + &ra->rrd->header.drr_u.drr_write_byref; + err = receive_read_payload_and_next_header(ra, 0, NULL); + receive_read_prefetch(ra, drrwb->drr_object, drrwb->drr_offset, + drrwb->drr_length); + return (err); } case DRR_WRITE_EMBEDDED: { struct drr_write_embedded *drrwe = - &ra->drr->drr_u.drr_write_embedded; - err = restore_read_payload_and_next_header(ra, - P2ROUNDUP(drrwe->drr_psize, 8), ra->buf); - if (err != 0) + &ra->rrd->header.drr_u.drr_write_embedded; + uint32_t size = P2ROUNDUP(drrwe->drr_psize, 8); + void *buf = kmem_zalloc(size, KM_SLEEP); + + err = receive_read_payload_and_next_header(ra, size, buf); + if (err != 0) { + kmem_free(buf, size); return (err); - return (restore_write_embedded(ra, drrwe, ra->buf)); + } + + receive_read_prefetch(ra, drrwe->drr_object, drrwe->drr_offset, + drrwe->drr_length); + return (err); } case DRR_FREE: { - struct drr_free *drrf = &ra->drr->drr_u.drr_free; - err = restore_read_payload_and_next_header(ra, 0, NULL); - if (err != 0) - return (err); - return (restore_free(ra, drrf)); + /* + * It might be beneficial to prefetch indirect blocks here, but + * we don't really have the data to decide for sure. + */ + err = receive_read_payload_and_next_header(ra, 0, NULL); + return (err); } case DRR_END: { - struct drr_end *drre = &ra->drr->drr_u.drr_end; + struct drr_end *drre = &ra->rrd->header.drr_u.drr_end; if (!ZIO_CHECKSUM_EQUAL(ra->prev_cksum, drre->drr_checksum)) return (SET_ERROR(EINVAL)); return (0); } case DRR_SPILL: { - struct drr_spill *drrs = &ra->drr->drr_u.drr_spill; - err = restore_read_payload_and_next_header(ra, - drrs->drr_length, ra->buf); + struct drr_spill *drrs = &ra->rrd->header.drr_u.drr_spill; + void *buf = kmem_zalloc(drrs->drr_length, KM_SLEEP); + err = receive_read_payload_and_next_header(ra, drrs->drr_length, + buf); if (err != 0) - return (err); - return (restore_spill(ra, drrs, ra->buf)); + kmem_free(buf, drrs->drr_length); + return (err); + } + default: + return (SET_ERROR(EINVAL)); + } +} + +/* + * Commit the records to the pool. + */ +static int +receive_process_record(struct receive_writer_arg *rwa, + struct receive_record_arg *rrd) +{ + int err; + + switch (rrd->header.drr_type) { + case DRR_OBJECT: + { + struct drr_object *drro = &rrd->header.drr_u.drr_object; + err = receive_object(rwa, drro, rrd->payload); + kmem_free(rrd->payload, rrd->payload_size); + rrd->payload = NULL; + return (err); + } + case DRR_FREEOBJECTS: + { + struct drr_freeobjects *drrfo = + &rrd->header.drr_u.drr_freeobjects; + return (receive_freeobjects(rwa, drrfo)); + } + case DRR_WRITE: + { + struct drr_write *drrw = &rrd->header.drr_u.drr_write; + err = receive_write(rwa, drrw, rrd->write_buf); + /* if receive_write() is successful, it consumes the arc_buf */ + if (err != 0) + dmu_return_arcbuf(rrd->write_buf); + rrd->write_buf = NULL; + rrd->payload = NULL; + return (err); + } + case DRR_WRITE_BYREF: + { + struct drr_write_byref *drrwbr = + &rrd->header.drr_u.drr_write_byref; + return (receive_write_byref(rwa, drrwbr)); + } + case DRR_WRITE_EMBEDDED: + { + struct drr_write_embedded *drrwe = + &rrd->header.drr_u.drr_write_embedded; + err = receive_write_embedded(rwa, drrwe, rrd->payload); + kmem_free(rrd->payload, rrd->payload_size); + rrd->payload = NULL; + return (err); + } + case DRR_FREE: + { + struct drr_free *drrf = &rrd->header.drr_u.drr_free; + return (receive_free(rwa, drrf)); + } + case DRR_SPILL: + { + struct drr_spill *drrs = &rrd->header.drr_u.drr_spill; + err = receive_spill(rwa, drrs, rrd->payload); + kmem_free(rrd->payload, rrd->payload_size); + rrd->payload = NULL; + return (err); } default: return (SET_ERROR(EINVAL)); @@ -1973,6 +2297,50 @@ restore_process_record(struct restorearg *ra) } /* + * dmu_recv_stream's worker thread; pull records off the queue, and then call + * receive_process_record When we're done, signal the main thread and exit. + */ +static void +receive_writer_thread(void *arg) +{ + struct receive_writer_arg *rwa = arg; + struct receive_record_arg *rrd; + for (rrd = bqueue_dequeue(&rwa->q); !rrd->eos_marker; + rrd = bqueue_dequeue(&rwa->q)) { + /* + * If there's an error, the main thread will stop putting things + * on the queue, but we need to clear everything in it before we + * can exit. + */ + if (rwa->err == 0) { + rwa->err = receive_process_record(rwa, rrd); + } else if (rrd->write_buf != NULL) { + dmu_return_arcbuf(rrd->write_buf); + rrd->write_buf = NULL; + rrd->payload = NULL; + } else if (rrd->payload != NULL) { + kmem_free(rrd->payload, rrd->payload_size); + rrd->payload = NULL; + } + kmem_free(rrd, sizeof (*rrd)); + } + kmem_free(rrd, sizeof (*rrd)); + mutex_enter(&rwa->mutex); + rwa->done = B_TRUE; + cv_signal(&rwa->cv); + mutex_exit(&rwa->mutex); +} + +/* + * Read in the stream's records, one by one, and apply them to the pool. There + * are two threads involved; the thread that calls this function will spin up a + * worker thread, read the records off the stream one by one, and issue + * prefetches for any necessary indirect blocks. It will then push the records + * onto an internal blocking queue. The worker thread will pull the records off + * the queue, and actually write the data into the DMU. This way, the worker + * thread doesn't have to wait for reads to complete, since everything it needs + * (the indirect blocks) will be prefetched. + * * NB: callers *must* call dmu_recv_end() if this succeeds. */ int @@ -1980,17 +2348,17 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, int cleanup_fd, uint64_t *action_handlep) { int err = 0; - struct restorearg ra = { 0 }; + struct receive_arg ra = { 0 }; + struct receive_writer_arg rwa = { 0 }; int featureflags; + struct receive_ign_obj_node *n; ra.byteswap = drc->drc_byteswap; ra.cksum = drc->drc_cksum; ra.vp = vp; ra.voff = *voffp; - ra.bufsize = SPA_MAXBLOCKSIZE; - ra.drr = kmem_alloc(sizeof (*ra.drr), KM_SLEEP); - ra.buf = vmem_alloc(ra.bufsize, KM_SLEEP); - ra.next_drr = kmem_alloc(sizeof (*ra.next_drr), KM_SLEEP); + list_create(&ra.ignore_obj_list, sizeof (struct receive_ign_obj_node), + offsetof(struct receive_ign_obj_node, node)); /* these were verified in dmu_recv_begin */ ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==, @@ -2021,48 +2389,92 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, } if (*action_handlep == 0) { - ra.guid_to_ds_map = + rwa.guid_to_ds_map = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); - avl_create(ra.guid_to_ds_map, guid_compare, + avl_create(rwa.guid_to_ds_map, guid_compare, sizeof (guid_map_entry_t), offsetof(guid_map_entry_t, avlnode)); err = zfs_onexit_add_cb(minor, - free_guid_map_onexit, ra.guid_to_ds_map, + free_guid_map_onexit, rwa.guid_to_ds_map, action_handlep); if (ra.err != 0) goto out; } else { err = zfs_onexit_cb_data(minor, *action_handlep, - (void **)&ra.guid_to_ds_map); + (void **)&rwa.guid_to_ds_map); if (ra.err != 0) goto out; } - drc->drc_guid_to_ds_map = ra.guid_to_ds_map; + drc->drc_guid_to_ds_map = rwa.guid_to_ds_map; } - err = restore_read_payload_and_next_header(&ra, 0, NULL); - if (err != 0) + err = receive_read_payload_and_next_header(&ra, 0, NULL); + if (err) goto out; - for (;;) { - void *tmp; + (void) bqueue_init(&rwa.q, zfs_recv_queue_length, + offsetof(struct receive_record_arg, node)); + cv_init(&rwa.cv, NULL, CV_DEFAULT, NULL); + mutex_init(&rwa.mutex, NULL, MUTEX_DEFAULT, NULL); + rwa.os = ra.os; + rwa.byteswap = drc->drc_byteswap; + + (void) thread_create(NULL, 0, receive_writer_thread, &rwa, 0, curproc, + TS_RUN, minclsyspri); + /* + * We're reading rwa.err without locks, which is safe since we are the + * only reader, and the worker thread is the only writer. It's ok if we + * miss a write for an iteration or two of the loop, since the writer + * thread will keep freeing records we send it until we send it an eos + * marker. + * + * We can leave this loop in 3 ways: First, if rwa.err is + * non-zero. In that case, the writer thread will free the rrd we just + * pushed. Second, if we're interrupted; in that case, either it's the + * first loop and ra.rrd was never allocated, or it's later, and ra.rrd + * has been handed off to the writer thread who will free it. Finally, + * if receive_read_record fails or we're at the end of the stream, then + * we free ra.rrd and exit. + */ + while (rwa.err == 0) { if (issig(JUSTLOOKING) && issig(FORREAL)) { err = SET_ERROR(EINTR); break; } - tmp = ra.next_drr; - ra.next_drr = ra.drr; - ra.drr = tmp; + ASSERT3P(ra.rrd, ==, NULL); + ra.rrd = ra.next_rrd; + ra.next_rrd = NULL; + /* Allocates and loads header into ra.next_rrd */ + err = receive_read_record(&ra); - /* process ra.drr, read in ra.next_drr */ - err = restore_process_record(&ra); - if (err != 0) - break; - if (ra.drr->drr_type == DRR_END) + if (ra.rrd->header.drr_type == DRR_END || err != 0) { + kmem_free(ra.rrd, sizeof (*ra.rrd)); + ra.rrd = NULL; break; + } + + bqueue_enqueue(&rwa.q, ra.rrd, + sizeof (struct receive_record_arg) + ra.rrd->payload_size); + ra.rrd = NULL; + } + if (ra.next_rrd == NULL) + ra.next_rrd = kmem_zalloc(sizeof (*ra.next_rrd), KM_SLEEP); + ra.next_rrd->eos_marker = B_TRUE; + bqueue_enqueue(&rwa.q, ra.next_rrd, 1); + + mutex_enter(&rwa.mutex); + while (!rwa.done) { + cv_wait(&rwa.cv, &rwa.mutex); } + mutex_exit(&rwa.mutex); + + cv_destroy(&rwa.cv); + mutex_destroy(&rwa.mutex); + bqueue_destroy(&rwa.q); + if (err == 0) + err = rwa.err; out: if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1)) @@ -2076,10 +2488,13 @@ out: dmu_recv_cleanup_ds(drc); } - kmem_free(ra.drr, sizeof (*ra.drr)); - vmem_free(ra.buf, ra.bufsize); - kmem_free(ra.next_drr, sizeof (*ra.next_drr)); *voffp = ra.voff; + + for (n = list_remove_head(&ra.ignore_obj_list); n != NULL; + n = list_remove_head(&ra.ignore_obj_list)) { + kmem_free(n, sizeof (*n)); + } + list_destroy(&ra.ignore_obj_list); return (err); } diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index 12d099bfd..07164161b 100644 --- a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -157,7 +157,7 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp, * If we already visited this bp & everything below, * don't bother doing it again. */ - if (zbookmark_is_before(dnp, zb, td->td_resume)) + if (zbookmark_subtree_completed(dnp, zb, td->td_resume)) return (RESUME_SKIP_ALL); /* @@ -428,6 +428,17 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, int j, err = 0; zbookmark_phys_t czb; + if (td->td_flags & TRAVERSE_PRE) { + SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL, + ZB_DNODE_BLKID); + err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp, + td->td_arg); + if (err == TRAVERSE_VISIT_NO_CHILDREN) + return (0); + if (err != 0) + return (err); + } + for (j = 0; j < dnp->dn_nblkptr; j++) { SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb); @@ -435,10 +446,21 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, break; } - if (err == 0 && dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + if (err == 0 && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) { SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb); } + + if (err == 0 && (td->td_flags & TRAVERSE_POST)) { + SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL, + ZB_DNODE_BLKID); + err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp, + td->td_arg); + if (err == TRAVERSE_VISIT_NO_CHILDREN) + return (0); + if (err != 0) + return (err); + } return (err); } @@ -451,6 +473,8 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; ASSERT(pfd->pd_bytes_fetched >= 0); + if (bp == NULL) + return (0); if (pfd->pd_cancel) return (SET_ERROR(EINTR)); diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 5ae429f70..016defe79 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -332,7 +332,8 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) dmu_buf_impl_t *db; rw_enter(&dn->dn_struct_rwlock, RW_READER); - err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db); + err = dbuf_hold_impl(dn, 0, start, + FALSE, FALSE, FTAG, &db); rw_exit(&dn->dn_struct_rwlock); if (err) { @@ -533,7 +534,8 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) blkoff = P2PHASE(blkid, epb); tochk = MIN(epb - blkoff, nblks); - err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf); + err = dbuf_hold_impl(dn, 1, blkid >> epbs, + FALSE, FALSE, FTAG, &dbuf); if (err) { txh->txh_tx->tx_err = err; break; diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c index 8ff2f0509..9d41bdc0f 100644 --- a/module/zfs/dmu_zfetch.c +++ b/module/zfs/dmu_zfetch.c @@ -293,7 +293,8 @@ dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks) fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks); for (i = 0; i < fetchsz; i++) { - dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_ASYNC_READ); + dbuf_prefetch(dn, 0, blkid + i, ZIO_PRIORITY_ASYNC_READ, + ARC_FLAG_PREFETCH); } return (fetchsz); diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 1d587b6f5..190e66b74 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -1112,7 +1112,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, drop_struct_lock = TRUE; } - blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t)); + blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t)); db = dbuf_hold(mdn, blk, FTAG); if (drop_struct_lock) @@ -1409,7 +1409,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) goto fail; /* resize the old block */ - err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db); + err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db); if (err == 0) dbuf_new_size(db, size, tx); else if (err != ENOENT) @@ -1582,8 +1582,8 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) ASSERT3U(blkoff + head, ==, blksz); if (len < head) head = len; - if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE, - FTAG, &db) == 0) { + if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off), + TRUE, FALSE, FTAG, &db) == 0) { caddr_t data; /* don't dirty if it isn't on disk and isn't dirty */ @@ -1620,8 +1620,8 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) if (tail) { if (len < tail) tail = len; - if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len), - TRUE, FTAG, &db) == 0) { + if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len), + TRUE, FALSE, FTAG, &db) == 0) { /* don't dirty if not on disk and not dirty */ if (db->db_last_dirty || (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { @@ -1853,7 +1853,7 @@ dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx) */ static int dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, - int lvl, uint64_t blkfill, uint64_t txg) + int lvl, uint64_t blkfill, uint64_t txg) { dmu_buf_impl_t *db = NULL; void *data = NULL; @@ -1875,8 +1875,8 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, epb = dn->dn_phys->dn_nblkptr; data = dn->dn_phys->dn_blkptr; } else { - uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl); - error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db); + uint64_t blkid = dbuf_whichblock(dn, lvl, *offset); + error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db); if (error) { if (error != ENOENT) return (error); diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c index df5c8e4ee..b47395a1e 100644 --- a/module/zfs/dnode_sync.c +++ b/module/zfs/dnode_sync.c @@ -192,7 +192,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) rw_enter(&dn->dn_struct_rwlock, RW_READER); err = dbuf_hold_impl(dn, db->db_level-1, - (db->db_blkid << epbs) + i, TRUE, FTAG, &child); + (db->db_blkid << epbs) + i, TRUE, FALSE, FTAG, &child); rw_exit(&dn->dn_struct_rwlock); if (err == ENOENT) continue; @@ -288,7 +288,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, continue; rw_enter(&dn->dn_struct_rwlock, RW_READER); VERIFY0(dbuf_hold_impl(dn, db->db_level - 1, - i, B_TRUE, FTAG, &subdb)); + i, TRUE, FALSE, FTAG, &subdb)); rw_exit(&dn->dn_struct_rwlock); ASSERT3P(bp, ==, subdb->db_blkptr); @@ -362,7 +362,7 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks, continue; rw_enter(&dn->dn_struct_rwlock, RW_READER); VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i, - TRUE, FTAG, &db)); + TRUE, FALSE, FTAG, &db)); rw_exit(&dn->dn_struct_rwlock); free_children(db, blkid, nblks, tx); diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 829452b1d..edc5ea17a 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -547,6 +547,7 @@ dsl_dataset_hold(dsl_pool_t *dp, const char *name, const char *snapname; uint64_t obj; int err = 0; + dsl_dataset_t *ds; err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname); if (err != 0) @@ -555,36 +556,37 @@ dsl_dataset_hold(dsl_pool_t *dp, const char *name, ASSERT(dsl_pool_config_held(dp)); obj = dsl_dir_phys(dd)->dd_head_dataset_obj; if (obj != 0) - err = dsl_dataset_hold_obj(dp, obj, tag, dsp); + err = dsl_dataset_hold_obj(dp, obj, tag, &ds); else err = SET_ERROR(ENOENT); /* we may be looking for a snapshot */ if (err == 0 && snapname != NULL) { - dsl_dataset_t *ds; + dsl_dataset_t *snap_ds; if (*snapname++ != '@') { - dsl_dataset_rele(*dsp, tag); + dsl_dataset_rele(ds, tag); dsl_dir_rele(dd, FTAG); return (SET_ERROR(ENOENT)); } dprintf("looking for snapshot '%s'\n", snapname); - err = dsl_dataset_snap_lookup(*dsp, snapname, &obj); + err = dsl_dataset_snap_lookup(ds, snapname, &obj); if (err == 0) - err = dsl_dataset_hold_obj(dp, obj, tag, &ds); - dsl_dataset_rele(*dsp, tag); + err = dsl_dataset_hold_obj(dp, obj, tag, &snap_ds); + dsl_dataset_rele(ds, tag); if (err == 0) { - mutex_enter(&ds->ds_lock); - if (ds->ds_snapname[0] == 0) - (void) strlcpy(ds->ds_snapname, snapname, - sizeof (ds->ds_snapname)); - mutex_exit(&ds->ds_lock); - *dsp = ds; + mutex_enter(&snap_ds->ds_lock); + if (snap_ds->ds_snapname[0] == 0) + (void) strlcpy(snap_ds->ds_snapname, snapname, + sizeof (snap_ds->ds_snapname)); + mutex_exit(&snap_ds->ds_lock); + ds = snap_ds; } } - + if (err == 0) + *dsp = ds; dsl_dir_rele(dd, FTAG); return (err); } diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c index e45f46d8d..d0015d1bd 100644 --- a/module/zfs/dsl_destroy.c +++ b/module/zfs/dsl_destroy.c @@ -560,7 +560,7 @@ kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, struct killarg *ka = arg; dmu_tx_t *tx = ka->tx; - if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) + if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) return (0); if (zb->zb_level == ZB_ZIL_LEVEL) { diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index b989e7633..295b8df8b 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -619,7 +619,8 @@ dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp, * If we already visited this bp & everything below (in * a prior txg sync), don't bother doing it again. */ - if (zbookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark)) + if (zbookmark_subtree_completed(dnp, zb, + &scn->scn_phys.scn_bookmark)) return (B_TRUE); /* diff --git a/module/zfs/spa.c b/module/zfs/spa.c index b4831a724..d7b800adf 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1921,7 +1921,7 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, size_t size; void *data; - if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) + if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) return (0); /* * Note: normally this routine will not be called if diff --git a/module/zfs/space_map.c b/module/zfs/space_map.c index b3aa469bf..126fd6bee 100644 --- a/module/zfs/space_map.c +++ b/module/zfs/space_map.c @@ -76,8 +76,8 @@ space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype) mutex_exit(sm->sm_lock); if (end > bufsize) { - dmu_prefetch(sm->sm_os, space_map_object(sm), bufsize, - end - bufsize); + dmu_prefetch(sm->sm_os, space_map_object(sm), 0, bufsize, + end - bufsize, ZIO_PRIORITY_SYNC_READ); } mutex_enter(sm->sm_lock); diff --git a/module/zfs/zap.c b/module/zfs/zap.c index cf0b92c7d..189956af0 100644 --- a/module/zfs/zap.c +++ b/module/zfs/zap.c @@ -162,8 +162,9 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2); tbl->zt_nextblk = newblk; ASSERT0(tbl->zt_blks_copied); - dmu_prefetch(zap->zap_objset, zap->zap_object, - tbl->zt_blk << bs, tbl->zt_numblks << bs); + dmu_prefetch(zap->zap_objset, zap->zap_object, 0, + tbl->zt_blk << bs, tbl->zt_numblks << bs, + ZIO_PRIORITY_SYNC_READ); } /* @@ -949,7 +950,8 @@ fzap_prefetch(zap_name_t *zn) if (zap_idx_to_blk(zap, idx, &blk) != 0) return; bs = FZAP_BLOCK_SHIFT(zap); - dmu_prefetch(zap->zap_objset, zap->zap_object, blk << bs, 1 << bs); + dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs, + ZIO_PRIORITY_SYNC_READ); } /* @@ -1295,9 +1297,10 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs) } else { int b; - dmu_prefetch(zap->zap_objset, zap->zap_object, + dmu_prefetch(zap->zap_objset, zap->zap_object, 0, zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs, - zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs); + zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs, + ZIO_PRIORITY_SYNC_READ); for (b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks; b++) { diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c index f105d9aed..bc4fa85fa 100644 --- a/module/zfs/zfs_vfsops.c +++ b/module/zfs/zfs_vfsops.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 944f0ad3d..174c918ab 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -2118,7 +2118,8 @@ zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr) /* Prefetch znode */ if (prefetch) { - dmu_prefetch(os, objnum, 0, 0); + dmu_prefetch(os, objnum, 0, 0, 0, + ZIO_PRIORITY_SYNC_READ); } /* diff --git a/module/zfs/zio.c b/module/zfs/zio.c index fef56ef7b..3271f8207 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -63,6 +63,9 @@ int zio_delay_max = ZIO_DELAY_MAX; #define ZIO_PIPELINE_CONTINUE 0x100 #define ZIO_PIPELINE_STOP 0x101 +#define BP_SPANB(indblkshift, level) \ + (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT))) +#define COMPARE_META_LEVEL 0x80000000ul /* * The following actions directly effect the spa's sync-to-convergence logic. * The values below define the sync pass when we start performing the action. @@ -3450,39 +3453,129 @@ static zio_pipe_stage_t *zio_pipeline[] = { zio_done }; -/* dnp is the dnode for zb1->zb_object */ -boolean_t -zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_phys_t *zb1, - const zbookmark_phys_t *zb2) -{ - uint64_t zb1nextL0, zb2thisobj; - ASSERT(zb1->zb_objset == zb2->zb_objset); - ASSERT(zb2->zb_level == 0); - /* The objset_phys_t isn't before anything. */ - if (dnp == NULL) - return (B_FALSE); - zb1nextL0 = (zb1->zb_blkid + 1) << - ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); +/* + * Compare two zbookmark_phys_t's to see which we would reach first in a + * pre-order traversal of the object tree. + * + * This is simple in every case aside from the meta-dnode object. For all other + * objects, we traverse them in order (object 1 before object 2, and so on). + * However, all of these objects are traversed while traversing object 0, since + * the data it points to is the list of objects. Thus, we need to convert to a + * canonical representation so we can compare meta-dnode bookmarks to + * non-meta-dnode bookmarks. + * + * We do this by calculating "equivalents" for each field of the zbookmark. + * zbookmarks outside of the meta-dnode use their own object and level, and + * calculate the level 0 equivalent (the first L0 blkid that is contained in the + * blocks this bookmark refers to) by multiplying their blkid by their span + * (the number of L0 blocks contained within one block at their level). + * zbookmarks inside the meta-dnode calculate their object equivalent + * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use + * level + 1<<31 (any value larger than a level could ever be) for their level. + * This causes them to always compare before a bookmark in their object + * equivalent, compare appropriately to bookmarks in other objects, and to + * compare appropriately to other bookmarks in the meta-dnode. + */ +int +zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2, + const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2) +{ + /* + * These variables represent the "equivalent" values for the zbookmark, + * after converting zbookmarks inside the meta dnode to their + * normal-object equivalents. + */ + uint64_t zb1obj, zb2obj; + uint64_t zb1L0, zb2L0; + uint64_t zb1level, zb2level; + + if (zb1->zb_object == zb2->zb_object && + zb1->zb_level == zb2->zb_level && + zb1->zb_blkid == zb2->zb_blkid) + return (0); - zb2thisobj = zb2->zb_object ? zb2->zb_object : - zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); + /* + * BP_SPANB calculates the span in blocks. + */ + zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level); + zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level); if (zb1->zb_object == DMU_META_DNODE_OBJECT) { - uint64_t nextobj = zb1nextL0 * - (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; - return (nextobj <= zb2thisobj); + zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); + zb1L0 = 0; + zb1level = zb1->zb_level + COMPARE_META_LEVEL; + } else { + zb1obj = zb1->zb_object; + zb1level = zb1->zb_level; } - if (zb1->zb_object < zb2thisobj) - return (B_TRUE); - if (zb1->zb_object > zb2thisobj) - return (B_FALSE); - if (zb2->zb_object == DMU_META_DNODE_OBJECT) + if (zb2->zb_object == DMU_META_DNODE_OBJECT) { + zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); + zb2L0 = 0; + zb2level = zb2->zb_level + COMPARE_META_LEVEL; + } else { + zb2obj = zb2->zb_object; + zb2level = zb2->zb_level; + } + + /* Now that we have a canonical representation, do the comparison. */ + if (zb1obj != zb2obj) + return (zb1obj < zb2obj ? -1 : 1); + else if (zb1L0 != zb2L0) + return (zb1L0 < zb2L0 ? -1 : 1); + else if (zb1level != zb2level) + return (zb1level > zb2level ? -1 : 1); + /* + * This can (theoretically) happen if the bookmarks have the same object + * and level, but different blkids, if the block sizes are not the same. + * There is presently no way to change the indirect block sizes + */ + return (0); +} + +/* + * This function checks the following: given that last_block is the place that + * our traversal stopped last time, does that guarantee that we've visited + * every node under subtree_root? Therefore, we can't just use the raw output + * of zbookmark_compare. We have to pass in a modified version of + * subtree_root; by incrementing the block id, and then checking whether + * last_block is before or equal to that, we can tell whether or not having + * visited last_block implies that all of subtree_root's children have been + * visited. + */ +boolean_t +zbookmark_subtree_completed(const dnode_phys_t *dnp, + const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block) +{ + zbookmark_phys_t mod_zb = *subtree_root; + mod_zb.zb_blkid++; + ASSERT(last_block->zb_level == 0); + + /* The objset_phys_t isn't before anything. */ + if (dnp == NULL) return (B_FALSE); - return (zb1nextL0 <= zb2->zb_blkid); + + /* + * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the + * data block size in sectors, because that variable is only used if + * the bookmark refers to a block in the meta-dnode. Since we don't + * know without examining it what object it refers to, and there's no + * harm in passing in this value in other cases, we always pass it in. + * + * We pass in 0 for the indirect block size shift because zb2 must be + * level 0. The indirect block size is only used to calculate the span + * of the bookmark, but since the bookmark must be level 0, the span is + * always 1, so the math works out. + * + * If you make changes to how the zbookmark_compare code works, be sure + * to make sure that this code still works afterwards. + */ + return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift, + 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb, + last_block) <= 0); } #if defined(_KERNEL) && defined(HAVE_SPL) diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 8208d29f3..1e27d81d6 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -1397,8 +1397,9 @@ __zvol_create_minor(const char *name, boolean_t ignore_snapdev) */ len = MIN(MAX(zvol_prefetch_bytes, 0), SPA_MAXBLOCKSIZE); if (len > 0) { - dmu_prefetch(os, ZVOL_OBJ, 0, len); - dmu_prefetch(os, ZVOL_OBJ, volsize - len, len); + dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ); + dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len, + ZIO_PRIORITY_SYNC_READ); } zv->zv_objset = NULL; |