author     Matthew Ahrens <[email protected]>      2014-11-03 12:15:08 -0800
committer  Brian Behlendorf <[email protected]>    2015-05-11 12:23:16 -0700
commit     f1512ee61e2f22186ac16481a09d86112b2d6788 (patch)
tree       6098f841f27955f13d4c2a53b2a4826b5f3bacda /module/zfs/dmu_send.c
parent     3df293404a102398445fc013b67250073db9004e (diff)
Illumos 5027 - zfs large block support
5027 zfs large block support
Reviewed by: Alek Pinchuk <[email protected]>
Reviewed by: George Wilson <[email protected]>
Reviewed by: Josef 'Jeff' Sipek <[email protected]>
Reviewed by: Richard Elling <[email protected]>
Reviewed by: Saso Kiselkov <[email protected]>
Reviewed by: Brian Behlendorf <[email protected]>
Approved by: Dan McDonald <[email protected]>

References:
  https://www.illumos.org/issues/5027
  https://github.com/illumos/illumos-gate/commit/b515258

Porting Notes:

* Included in this patch is a tiny ISP2() cleanup in zio_init() from
  Illumos 5255.

* Unlike the upstream Illumos commit, this patch does not impose an
  arbitrary 128K block size limit on volumes. Volumes, like filesystems,
  are limited by the zfs_max_recordsize=1M module option.

* By default the maximum record size is limited to 1M by the module
  option zfs_max_recordsize. This value may be safely increased up to
  16M, which is the largest block size supported by the on-disk format.
  At the moment, 1M blocks clearly offer a significant performance
  improvement, but the benefits of going beyond this for the majority
  of workloads are less clear.

* The Illumos version of this patch increased DMU_MAX_ACCESS to 32M.
  This was determined not to be large enough when using 16M blocks
  because the zfs_make_xattrdir() function will fail (EFBIG) when
  assigning a TX. This was immediately observed under Linux because all
  newly created files must have a security xattr created, and that was
  failing. Therefore, we've set DMU_MAX_ACCESS to 64M.

* On 32-bit platforms a hard limit of 1M is set for blocks due to the
  limited virtual address space. We should be able to relax this once
  the ABD patches are merged.

Ported-by: Brian Behlendorf <[email protected]>
Closes #354
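The porting notes above layer a tunable cap (zfs_max_recordsize) under the on-disk maximum. As a hedged illustration of that layering only: a minimal sketch of how a requested record size could be validated. check_recordsize() is a hypothetical helper, not part of this patch; zfs_max_recordsize, SPA_MINBLOCKSIZE, SPA_MAXBLOCKSIZE, ISP2(), and SET_ERROR() are the names used by the patch and its headers.

	/*
	 * Sketch only (hypothetical helper): validate a requested
	 * record size against both the on-disk format limits and the
	 * zfs_max_recordsize module option described above.
	 */
	extern int zfs_max_recordsize;	/* module option, default 1M */

	static int
	check_recordsize(uint64_t size)
	{
		/* must be a power of 2 within the on-disk format's limits */
		if (size < SPA_MINBLOCKSIZE || size > SPA_MAXBLOCKSIZE ||
		    !ISP2(size))
			return (SET_ERROR(EINVAL));
		/* and within the administrator-configured cap */
		if (size > zfs_max_recordsize)
			return (SET_ERROR(EINVAL));
		return (0);
	}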
Diffstat (limited to 'module/zfs/dmu_send.c')
-rw-r--r--  module/zfs/dmu_send.c | 83
1 file changed, 63 insertions(+), 20 deletions(-)
diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c
index c1cac2e67..e26d344d6 100644
--- a/module/zfs/dmu_send.c
+++ b/module/zfs/dmu_send.c
@@ -234,11 +234,12 @@ dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
drrw->drr_offset = offset;
drrw->drr_length = blksz;
drrw->drr_toguid = dsp->dsa_toguid;
- if (BP_IS_EMBEDDED(bp)) {
+ if (bp == NULL || BP_IS_EMBEDDED(bp)) {
/*
- * There's no pre-computed checksum of embedded BP's, so
- * (like fletcher4-checksummed blocks) userland will have
- * to compute a dedup-capable checksum itself.
+ * There's no pre-computed checksum for partial-block
+ * writes or embedded BP's, so (like
+ * fletcher4-checksummed blocks) userland will have to
+ * compute a dedup-capable checksum itself.
*/
drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
} else {
@@ -400,6 +401,10 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
drro->drr_compress = dnp->dn_compress;
drro->drr_toguid = dsp->dsa_toguid;
+ if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+ drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE)
+ drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE;
+
if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
return (SET_ERROR(EINTR));
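When the stream lacks DMU_BACKUP_FEATURE_LARGE_BLOCKS, the dnode record advertises at most SPA_OLD_MAXBLOCKSIZE; backup_cb (two hunks below) then emits the level-0 data in matching chunks. The clamp above, restated as a standalone helper for clarity; clamp_blksz() is a hypothetical name used only for illustration:

	/*
	 * Sketch: the block-size clamp from dump_dnode() as a helper
	 * (hypothetical name).
	 */
	static uint32_t
	clamp_blksz(uint64_t featureflags, uint32_t blksz)
	{
		if (!(featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
		    blksz > SPA_OLD_MAXBLOCKSIZE)
			return (SPA_OLD_MAXBLOCKSIZE);
		return (blksz);
	}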
@@ -517,6 +522,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
zb->zb_blkid * blksz, blksz, bp);
} else { /* it's a level-0 block of a regular object */
uint32_t aflags = ARC_WAIT;
+ uint64_t offset;
arc_buf_t *abuf;
int blksz = BP_GET_LSIZE(bp);
@@ -539,8 +545,24 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
}
}
- err = dump_write(dsp, type, zb->zb_object, zb->zb_blkid * blksz,
- blksz, bp, abuf->b_data);
+ offset = zb->zb_blkid * blksz;
+
+ if (!(dsp->dsa_featureflags &
+ DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+ blksz > SPA_OLD_MAXBLOCKSIZE) {
+ char *buf = abuf->b_data;
+ while (blksz > 0 && err == 0) {
+ int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE);
+ err = dump_write(dsp, type, zb->zb_object,
+ offset, n, NULL, buf);
+ offset += n;
+ buf += n;
+ blksz -= n;
+ }
+ } else {
+ err = dump_write(dsp, type, zb->zb_object,
+ offset, blksz, bp, abuf->b_data);
+ }
(void) arc_buf_remove_ref(abuf, &abuf);
}
@@ -554,7 +576,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
static int
dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok,
- int outfd, vnode_t *vp, offset_t *off)
+ boolean_t large_block_ok, int outfd, vnode_t *vp, offset_t *off)
{
objset_t *os;
dmu_replay_record_t *drr;
@@ -589,6 +611,8 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
}
#endif
+ if (large_block_ok && ds->ds_large_blocks)
+ featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
if (embedok &&
spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
@@ -684,7 +708,8 @@ out:
int
dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
- boolean_t embedok, int outfd, vnode_t *vp, offset_t *off)
+ boolean_t embedok, boolean_t large_block_ok,
+ int outfd, vnode_t *vp, offset_t *off)
{
dsl_pool_t *dp;
dsl_dataset_t *ds;
@@ -719,18 +744,19 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;
is_clone = (fromds->ds_dir != ds->ds_dir);
dsl_dataset_rele(fromds, FTAG);
- err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok,
- outfd, vp, off);
+ err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
+ embedok, large_block_ok, outfd, vp, off);
} else {
- err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok,
- outfd, vp, off);
+ err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
+ embedok, large_block_ok, outfd, vp, off);
}
dsl_dataset_rele(ds, FTAG);
return (err);
}
int
-dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
+dmu_send(const char *tosnap, const char *fromsnap,
+ boolean_t embedok, boolean_t large_block_ok,
int outfd, vnode_t *vp, offset_t *off)
{
dsl_pool_t *dp;
@@ -797,11 +823,11 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
dsl_pool_rele(dp, FTAG);
return (err);
}
- err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok,
- outfd, vp, off);
+ err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
+ embedok, large_block_ok, outfd, vp, off);
} else {
- err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok,
- outfd, vp, off);
+ err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
+ embedok, large_block_ok, outfd, vp, off);
}
if (owned)
dsl_dataset_disown(ds, FTAG);
@@ -1000,6 +1026,15 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
return (SET_ERROR(ENOTSUP));
+ /*
+ * The receiving code doesn't know how to translate large blocks
+ * to smaller ones, so the pool must have the LARGE_BLOCKS
+ * feature enabled if the stream has LARGE_BLOCKS.
+ */
+ if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+ !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
+ return (SET_ERROR(ENOTSUP));
+
error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
if (error == 0) {
/* target fs already exists; recv into temp clone */
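The sender decides per-stream whether large records may appear (large_block_ok); the receiver, as the comment above explains, must refuse streams it cannot store. A sketch of gating a receive on the stream's feature flags, using the DMU_GET_FEATUREFLAGS macro seen in the next hunk; recv_check_features() and its surrounding context are hypothetical:

	/*
	 * Sketch (hypothetical helper): gate a receive on the
	 * BEGIN record's feature flags.
	 */
	static int
	recv_check_features(spa_t *spa, struct drr_begin *drrb)
	{
		uint64_t featureflags =
		    DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);

		/* large blocks cannot be split on receive */
		if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
		    !spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS))
			return (SET_ERROR(ENOTSUP));
		return (0);
	}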
@@ -1125,6 +1160,13 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
}
VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds));
+ if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
+ DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+ !newds->ds_large_blocks) {
+ dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
+ newds->ds_large_blocks = B_TRUE;
+ }
+
dmu_buf_will_dirty(newds->ds_dbuf, tx);
dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT;
@@ -1250,6 +1292,7 @@ restore_read(struct restorearg *ra, int len, char *buf)
/* some things will require 8-byte alignment, so everything must */
ASSERT0(len % 8);
+ ASSERT3U(len, <=, ra->bufsize);
while (done < len) {
ssize_t resid;
@@ -1391,7 +1434,7 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
drro->drr_blksz < SPA_MINBLOCKSIZE ||
- drro->drr_blksz > SPA_MAXBLOCKSIZE ||
+ drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(os)) ||
drro->drr_bonuslen > DN_MAX_BONUSLEN) {
return (SET_ERROR(EINVAL));
}
@@ -1665,7 +1708,7 @@ restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs)
int err;
if (drrs->drr_length < SPA_MINBLOCKSIZE ||
- drrs->drr_length > SPA_MAXBLOCKSIZE)
+ drrs->drr_length > spa_maxblocksize(dmu_objset_spa(os)))
return (SET_ERROR(EINVAL));
data = restore_read(ra, drrs->drr_length, NULL);
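Both receive-path checks above rely on spa_maxblocksize(), which this patch adds outside this file (in spa_misc.c). Consistent with its use here, it collapses to the old 128K limit unless the pool's LARGE_BLOCKS feature is enabled; a sketch of that behavior, reconstructed from these call sites rather than quoted from the patch:

	/*
	 * Sketch of spa_maxblocksize() behavior, consistent with the
	 * checks above: the pool-wide block size limit depends on
	 * whether the large_blocks feature is enabled.
	 */
	uint64_t
	spa_maxblocksize(spa_t *spa)
	{
		if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS))
			return (SPA_MAXBLOCKSIZE);
		else
			return (SPA_OLD_MAXBLOCKSIZE);
	}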
@@ -1752,7 +1795,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
ra.cksum = drc->drc_cksum;
ra.vp = vp;
ra.voff = *voffp;
- ra.bufsize = 1<<20;
+ ra.bufsize = SPA_MAXBLOCKSIZE;
ra.buf = vmem_alloc(ra.bufsize, KM_SLEEP);
/* these were verified in dmu_recv_begin */