diff options
author | Paul Dagnelie <[email protected]> | 2015-12-22 02:31:57 +0100 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2016-01-08 15:08:19 -0800 |
commit | fcff0f35bd522076bdda7491c88a91cc0aa531a3 (patch) | |
tree | 63e2e9db6fce37f64559cdaaf7247d2f51e85d2d /module/zfs/zio.c | |
parent | 00af2ff6f219b4f73aebaaf9496cf5ea4b6728a3 (diff) |
Illumos 5960, 5925
5960 zfs recv should prefetch indirect blocks
5925 zfs receive -o origin=
Reviewed by: Prakash Surya <[email protected]>
Reviewed by: Matthew Ahrens <[email protected]>
References:
https://www.illumos.org/issues/5960
https://www.illumos.org/issues/5925
https://github.com/illumos/illumos-gate/commit/a2cdcdd
Porting notes:
- [lib/libzfs/libzfs_sendrecv.c]
- b8864a2 Fix gcc cast warnings
- 325f023 Add linux kernel device support
- 5c3f61e Increase Linux pipe buffer size on 'zfs receive'
- [module/zfs/zfs_vnops.c]
- 3558fd7 Prototype/structure update for Linux
- c12e3a5 Restructure zfs_readdir() to fix regressions
- [module/zfs/zvol.c]
- Function @zvol_map_block() isn't needed in ZoL
- 9965059 Prefetch start and end of volumes
- [module/zfs/dmu.c]
- Fixed ISO C90 - mixed declarations and code
- Function dmu_prefetch() 'int i' is initialized before
the following code block (c90 vs. c99)
- [module/zfs/dbuf.c]
- fc5bb51 Fix stack dbuf_hold_impl()
- 9b67f60 Illumos 4757, 4913
- 34229a2 Reduce stack usage for recursive traverse_visitbp()
- [module/zfs/dmu_send.c]
- Fixed ISO C90 - mixed declarations and code
- b58986e Use large stacks when available
- 241b541 Illumos 5959 - clean up per-dataset feature count code
- 77aef6f Use vmem_alloc() for nvlists
- 00b4602 Add linux kernel memory support
Ported-by: kernelOfTruth [email protected]
Signed-off-by: Brian Behlendorf <[email protected]>
Diffstat (limited to 'module/zfs/zio.c')
-rw-r--r-- | module/zfs/zio.c | 141 |
1 files changed, 117 insertions, 24 deletions
diff --git a/module/zfs/zio.c b/module/zfs/zio.c index fef56ef7b..3271f8207 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -63,6 +63,9 @@ int zio_delay_max = ZIO_DELAY_MAX; #define ZIO_PIPELINE_CONTINUE 0x100 #define ZIO_PIPELINE_STOP 0x101 +#define BP_SPANB(indblkshift, level) \ + (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT))) +#define COMPARE_META_LEVEL 0x80000000ul /* * The following actions directly effect the spa's sync-to-convergence logic. * The values below define the sync pass when we start performing the action. @@ -3450,39 +3453,129 @@ static zio_pipe_stage_t *zio_pipeline[] = { zio_done }; -/* dnp is the dnode for zb1->zb_object */ -boolean_t -zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_phys_t *zb1, - const zbookmark_phys_t *zb2) -{ - uint64_t zb1nextL0, zb2thisobj; - ASSERT(zb1->zb_objset == zb2->zb_objset); - ASSERT(zb2->zb_level == 0); - /* The objset_phys_t isn't before anything. */ - if (dnp == NULL) - return (B_FALSE); - zb1nextL0 = (zb1->zb_blkid + 1) << - ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); +/* + * Compare two zbookmark_phys_t's to see which we would reach first in a + * pre-order traversal of the object tree. + * + * This is simple in every case aside from the meta-dnode object. For all other + * objects, we traverse them in order (object 1 before object 2, and so on). + * However, all of these objects are traversed while traversing object 0, since + * the data it points to is the list of objects. Thus, we need to convert to a + * canonical representation so we can compare meta-dnode bookmarks to + * non-meta-dnode bookmarks. + * + * We do this by calculating "equivalents" for each field of the zbookmark. + * zbookmarks outside of the meta-dnode use their own object and level, and + * calculate the level 0 equivalent (the first L0 blkid that is contained in the + * blocks this bookmark refers to) by multiplying their blkid by their span + * (the number of L0 blocks contained within one block at their level). + * zbookmarks inside the meta-dnode calculate their object equivalent + * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use + * level + 1<<31 (any value larger than a level could ever be) for their level. + * This causes them to always compare before a bookmark in their object + * equivalent, compare appropriately to bookmarks in other objects, and to + * compare appropriately to other bookmarks in the meta-dnode. + */ +int +zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2, + const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2) +{ + /* + * These variables represent the "equivalent" values for the zbookmark, + * after converting zbookmarks inside the meta dnode to their + * normal-object equivalents. + */ + uint64_t zb1obj, zb2obj; + uint64_t zb1L0, zb2L0; + uint64_t zb1level, zb2level; + + if (zb1->zb_object == zb2->zb_object && + zb1->zb_level == zb2->zb_level && + zb1->zb_blkid == zb2->zb_blkid) + return (0); - zb2thisobj = zb2->zb_object ? zb2->zb_object : - zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); + /* + * BP_SPANB calculates the span in blocks. + */ + zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level); + zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level); if (zb1->zb_object == DMU_META_DNODE_OBJECT) { - uint64_t nextobj = zb1nextL0 * - (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; - return (nextobj <= zb2thisobj); + zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); + zb1L0 = 0; + zb1level = zb1->zb_level + COMPARE_META_LEVEL; + } else { + zb1obj = zb1->zb_object; + zb1level = zb1->zb_level; } - if (zb1->zb_object < zb2thisobj) - return (B_TRUE); - if (zb1->zb_object > zb2thisobj) - return (B_FALSE); - if (zb2->zb_object == DMU_META_DNODE_OBJECT) + if (zb2->zb_object == DMU_META_DNODE_OBJECT) { + zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); + zb2L0 = 0; + zb2level = zb2->zb_level + COMPARE_META_LEVEL; + } else { + zb2obj = zb2->zb_object; + zb2level = zb2->zb_level; + } + + /* Now that we have a canonical representation, do the comparison. */ + if (zb1obj != zb2obj) + return (zb1obj < zb2obj ? -1 : 1); + else if (zb1L0 != zb2L0) + return (zb1L0 < zb2L0 ? -1 : 1); + else if (zb1level != zb2level) + return (zb1level > zb2level ? -1 : 1); + /* + * This can (theoretically) happen if the bookmarks have the same object + * and level, but different blkids, if the block sizes are not the same. + * There is presently no way to change the indirect block sizes + */ + return (0); +} + +/* + * This function checks the following: given that last_block is the place that + * our traversal stopped last time, does that guarantee that we've visited + * every node under subtree_root? Therefore, we can't just use the raw output + * of zbookmark_compare. We have to pass in a modified version of + * subtree_root; by incrementing the block id, and then checking whether + * last_block is before or equal to that, we can tell whether or not having + * visited last_block implies that all of subtree_root's children have been + * visited. + */ +boolean_t +zbookmark_subtree_completed(const dnode_phys_t *dnp, + const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block) +{ + zbookmark_phys_t mod_zb = *subtree_root; + mod_zb.zb_blkid++; + ASSERT(last_block->zb_level == 0); + + /* The objset_phys_t isn't before anything. */ + if (dnp == NULL) return (B_FALSE); - return (zb1nextL0 <= zb2->zb_blkid); + + /* + * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the + * data block size in sectors, because that variable is only used if + * the bookmark refers to a block in the meta-dnode. Since we don't + * know without examining it what object it refers to, and there's no + * harm in passing in this value in other cases, we always pass it in. + * + * We pass in 0 for the indirect block size shift because zb2 must be + * level 0. The indirect block size is only used to calculate the span + * of the bookmark, but since the bookmark must be level 0, the span is + * always 1, so the math works out. + * + * If you make changes to how the zbookmark_compare code works, be sure + * to make sure that this code still works afterwards. + */ + return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift, + 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb, + last_block) <= 0); } #if defined(_KERNEL) && defined(HAVE_SPL) |