author     Matthew Ahrens <[email protected]>      2014-11-03 12:15:08 -0800
committer  Brian Behlendorf <[email protected]>    2015-05-11 12:23:16 -0700
commit     f1512ee61e2f22186ac16481a09d86112b2d6788
tree       6098f841f27955f13d4c2a53b2a4826b5f3bacda /module/zfs/dmu_send.c
parent     3df293404a102398445fc013b67250073db9004e
Illumos 5027 - zfs large block support
5027 zfs large block support
Reviewed by: Alek Pinchuk <[email protected]>
Reviewed by: George Wilson <[email protected]>
Reviewed by: Josef 'Jeff' Sipek <[email protected]>
Reviewed by: Richard Elling <[email protected]>
Reviewed by: Saso Kiselkov <[email protected]>
Reviewed by: Brian Behlendorf <[email protected]>
Approved by: Dan McDonald <[email protected]>
References:
https://www.illumos.org/issues/5027
https://github.com/illumos/illumos-gate/commit/b515258
Porting Notes:
* Included in this patch is a tiny ISP2() cleanup in zio_init() from
Illumos 5255; a minimal sketch of the macro follows these notes.
* Unlike the upstream Illumos commit, this patch does not impose an
arbitrary 128K block size limit on volumes. Volumes, like filesystems,
are limited by the zfs_max_recordsize=1M module option.
* By default the maximum record size is limited to 1M by the module
option zfs_max_recordsize. This value may be safely increased up to
16M, the largest block size supported by the on-disk format. At the
moment, 1M blocks clearly offer a significant performance improvement,
but the benefits of going beyond this are less clear for the majority
of workloads; a sketch of the implied bounds check follows these notes.
* The illumos version of this patch increased DMU_MAX_ACCESS to 32M.
This was determined not to be large enough when using 16M blocks
because the zfs_make_xattrdir() function will fail (EFBIG) when
assigning a TX. This was immediately observed under Linux because
every newly created file must have a security xattr created, and
that was failing. Therefore, we've set DMU_MAX_ACCESS to 64M (see
the DMU_MAX_ACCESS sketch after these notes).
* On 32-bit platforms a hard limit of 1M is set for blocks due
to the limited virtual address space. We should be able to relax
this once the ABD patches are merged.
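
As a point of reference for the ISP2() note above: ISP2() is the
illumos "is power of 2" macro. The snippet below is a minimal,
self-contained sketch of it; the surrounding loop is illustrative
only and is not the actual zio_init() source.

#include <stdio.h>

/* illumos-style "is power of 2" test; note it is also true for x == 0 */
#define	ISP2(x)	(((x) & ((x) - 1)) == 0)

int
main(void)
{
	unsigned long sizes[] = { 512, 4096, 12288, 131072, 1048576 };
	unsigned long i;

	for (i = 0; i < sizeof (sizes) / sizeof (sizes[0]); i++) {
		/* zio_init() applies this style of check to cache sizes */
		printf("%lu is%s a power of 2\n", sizes[i],
		    ISP2(sizes[i]) ? "" : " not");
	}
	return (0);
}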
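The zfs_max_recordsize note above implies a bounds check whenever a
new record size or volblocksize is requested. The sketch below shows
the shape of such a check; the function name check_recordsize() and
the inlined constants are illustrative assumptions, not the actual
property-validation code.

#include <errno.h>

#define	SPA_MINBLOCKSIZE	512
#define	ISP2(x)			(((x) & ((x) - 1)) == 0)

/* module option: 1M by default, may be raised to 16M per the note above */
static unsigned long zfs_max_recordsize = 1024 * 1024;

/* hypothetical validator for a requested recordsize/volblocksize */
static int
check_recordsize(unsigned long newsize)
{
	/* must be a power of 2 in [SPA_MINBLOCKSIZE, zfs_max_recordsize] */
	if (newsize < SPA_MINBLOCKSIZE || newsize > zfs_max_recordsize ||
	    !ISP2(newsize))
		return (EINVAL);
	return (0);
}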
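For the DMU_MAX_ACCESS note above, the change amounts to raising a
single limit so that assigning a TX for a 16M block (plus the security
xattr every new file gets under Linux) no longer returns EFBIG. The
values below come from the note itself; the surrounding header context
is omitted.

/* illumos version of this patch: */
/* #define DMU_MAX_ACCESS (32 * 1024 * 1024) */

/* Linux port: doubled so a 16M-block TX assignment cannot hit EFBIG */
#define	DMU_MAX_ACCESS	(64 * 1024 * 1024)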
Ported-by: Brian Behlendorf <[email protected]>
Closes #354
Diffstat (limited to 'module/zfs/dmu_send.c')
-rw-r--r--	module/zfs/dmu_send.c	83
1 file changed, 63 insertions, 20 deletions
diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c
index c1cac2e67..e26d344d6 100644
--- a/module/zfs/dmu_send.c
+++ b/module/zfs/dmu_send.c
@@ -234,11 +234,12 @@ dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
 	drrw->drr_offset = offset;
 	drrw->drr_length = blksz;
 	drrw->drr_toguid = dsp->dsa_toguid;
-	if (BP_IS_EMBEDDED(bp)) {
+	if (bp == NULL || BP_IS_EMBEDDED(bp)) {
 		/*
-		 * There's no pre-computed checksum of embedded BP's, so
-		 * (like fletcher4-checkummed blocks) userland will have
-		 * to compute a dedup-capable checksum itself.
+		 * There's no pre-computed checksum for partial-block
+		 * writes or embedded BP's, so (like
+		 * fletcher4-checkummed blocks) userland will have to
+		 * compute a dedup-capable checksum itself.
 		 */
 		drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
 	} else {
@@ -400,6 +401,10 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
 	drro->drr_compress = dnp->dn_compress;
 	drro->drr_toguid = dsp->dsa_toguid;
 
+	if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+	    drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE)
+		drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE;
+
 	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
 		return (SET_ERROR(EINTR));
 
@@ -517,6 +522,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 		    zb->zb_blkid * blksz, blksz, bp);
 	} else { /* it's a level-0 block of a regular object */
 		uint32_t aflags = ARC_WAIT;
+		uint64_t offset;
 		arc_buf_t *abuf;
 		int blksz = BP_GET_LSIZE(bp);
 
@@ -539,8 +545,24 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 			}
 		}
 
-		err = dump_write(dsp, type, zb->zb_object, zb->zb_blkid * blksz,
-		    blksz, bp, abuf->b_data);
+		offset = zb->zb_blkid * blksz;
+
+		if (!(dsp->dsa_featureflags &
+		    DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+		    blksz > SPA_OLD_MAXBLOCKSIZE) {
+			char *buf = abuf->b_data;
+			while (blksz > 0 && err == 0) {
+				int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE);
+				err = dump_write(dsp, type, zb->zb_object,
+				    offset, n, NULL, buf);
+				offset += n;
+				buf += n;
+				blksz -= n;
+			}
+		} else {
+			err = dump_write(dsp, type, zb->zb_object,
+			    offset, blksz, bp, abuf->b_data);
+		}
 		(void) arc_buf_remove_ref(abuf, &abuf);
 	}
 
@@ -554,7 +576,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 static int
 dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
     zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok,
-    int outfd, vnode_t *vp, offset_t *off)
+    boolean_t large_block_ok, int outfd, vnode_t *vp, offset_t *off)
 {
 	objset_t *os;
 	dmu_replay_record_t *drr;
@@ -589,6 +611,8 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
 	}
 #endif
 
+	if (large_block_ok && ds->ds_large_blocks)
+		featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
 	if (embedok &&
 	    spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
 		featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
@@ -684,7 +708,8 @@ out:
 
 int
 dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
-    boolean_t embedok, int outfd, vnode_t *vp, offset_t *off)
+    boolean_t embedok, boolean_t large_block_ok,
+    int outfd, vnode_t *vp, offset_t *off)
 {
 	dsl_pool_t *dp;
 	dsl_dataset_t *ds;
@@ -719,18 +744,19 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
 		zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;
 		is_clone = (fromds->ds_dir != ds->ds_dir);
 		dsl_dataset_rele(fromds, FTAG);
-		err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok,
-		    outfd, vp, off);
+		err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
+		    embedok, large_block_ok, outfd, vp, off);
 	} else {
-		err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok,
-		    outfd, vp, off);
+		err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
+		    embedok, large_block_ok, outfd, vp, off);
 	}
 	dsl_dataset_rele(ds, FTAG);
 	return (err);
 }
 
 int
-dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
+dmu_send(const char *tosnap, const char *fromsnap,
+    boolean_t embedok, boolean_t large_block_ok,
     int outfd, vnode_t *vp, offset_t *off)
 {
 	dsl_pool_t *dp;
@@ -797,11 +823,11 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
 			dsl_pool_rele(dp, FTAG);
 			return (err);
 		}
-		err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok,
-		    outfd, vp, off);
+		err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
+		    embedok, large_block_ok, outfd, vp, off);
 	} else {
-		err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok,
-		    outfd, vp, off);
+		err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
+		    embedok, large_block_ok, outfd, vp, off);
 	}
 	if (owned)
 		dsl_dataset_disown(ds, FTAG);
@@ -1000,6 +1026,15 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
 	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
 		return (SET_ERROR(ENOTSUP));
 
+	/*
+	 * The receiving code doesn't know how to translate large blocks
+	 * to smaller ones, so the pool must have the LARGE_BLOCKS
+	 * feature enabled if the stream has LARGE_BLOCKS.
+	 */
+	if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
+		return (SET_ERROR(ENOTSUP));
+
 	error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
 	if (error == 0) {
 		/* target fs already exists; recv into temp clone */
@@ -1125,6 +1160,13 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
 	}
 	VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds));
 
+	if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
+	    DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+	    !newds->ds_large_blocks) {
+		dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
+		newds->ds_large_blocks = B_TRUE;
+	}
+
 	dmu_buf_will_dirty(newds->ds_dbuf, tx);
 	dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT;
 
@@ -1250,6 +1292,7 @@ restore_read(struct restorearg *ra, int len, char *buf)
 
 	/* some things will require 8-byte alignment, so everything must */
 	ASSERT0(len % 8);
+	ASSERT3U(len, <=, ra->bufsize);
 
 	while (done < len) {
 		ssize_t resid;
@@ -1391,7 +1434,7 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
 	    drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
 	    P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
 	    drro->drr_blksz < SPA_MINBLOCKSIZE ||
-	    drro->drr_blksz > SPA_MAXBLOCKSIZE ||
+	    drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(os)) ||
 	    drro->drr_bonuslen > DN_MAX_BONUSLEN) {
 		return (SET_ERROR(EINVAL));
 	}
@@ -1665,7 +1708,7 @@ restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs)
 	int err;
 
 	if (drrs->drr_length < SPA_MINBLOCKSIZE ||
-	    drrs->drr_length > SPA_MAXBLOCKSIZE)
+	    drrs->drr_length > spa_maxblocksize(dmu_objset_spa(os)))
 		return (SET_ERROR(EINVAL));
 
 	data = restore_read(ra, drrs->drr_length, NULL);
@@ -1752,7 +1795,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
 	ra.cksum = drc->drc_cksum;
 	ra.vp = vp;
 	ra.voff = *voffp;
-	ra.bufsize = 1<<20;
+	ra.bufsize = SPA_MAXBLOCKSIZE;
 	ra.buf = vmem_alloc(ra.bufsize, KM_SLEEP);
 
 	/* these were verified in dmu_recv_begin */
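
To make the backup_cb() change above easier to follow in isolation:
when the receiver has not negotiated DMU_BACKUP_FEATURE_LARGE_BLOCKS,
any block larger than SPA_OLD_MAXBLOCKSIZE is emitted as a run of
old-style records with bp == NULL, so userland must recompute a
dedup-capable checksum (per the dump_write() comment change). The
following is a standalone sketch of that loop; emit_write() is a
stand-in for dump_write() and simply reports what would be sent.

#include <stdio.h>
#include <stdint.h>

#define	SPA_OLD_MAXBLOCKSIZE	(128 * 1024)
#define	MIN(a, b)		((a) < (b) ? (a) : (b))

static int
emit_write(uint64_t object, uint64_t offset, int len, const void *bp)
{
	printf("object %llu: write %d bytes at offset %llu (%s checksum)\n",
	    (unsigned long long)object, len, (unsigned long long)offset,
	    bp == NULL ? "no precomputed" : "precomputed");
	return (0);
}

int
main(void)
{
	uint64_t object = 42;
	uint64_t blkid = 3;
	int blksz = 1024 * 1024;	/* a 1M source block */
	uint64_t offset = blkid * blksz;
	int large_blocks_ok = 0;	/* receiver lacks LARGE_BLOCKS */
	int err = 0;

	if (!large_blocks_ok && blksz > SPA_OLD_MAXBLOCKSIZE) {
		/* split into old-style maximum-sized records */
		while (blksz > 0 && err == 0) {
			int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE);
			err = emit_write(object, offset, n, NULL);
			offset += n;
			blksz -= n;
		}
	} else {
		err = emit_write(object, offset, blksz, &object /* fake bp */);
	}
	return (err);
}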