From f1512ee61e2f22186ac16481a09d86112b2d6788 Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Mon, 3 Nov 2014 12:15:08 -0800 Subject: Illumos 5027 - zfs large block support 5027 zfs large block support Reviewed by: Alek Pinchuk Reviewed by: George Wilson Reviewed by: Josef 'Jeff' Sipek Reviewed by: Richard Elling Reviewed by: Saso Kiselkov Reviewed by: Brian Behlendorf Approved by: Dan McDonald References: https://www.illumos.org/issues/5027 https://github.com/illumos/illumos-gate/commit/b515258 Porting Notes: * Included in this patch is a tiny ISP2() cleanup in zio_init() from Illumos 5255. * Unlike the upstream Illumos commit, this patch does not impose an arbitrary 128K block size limit on volumes. Volumes, like filesystems, are limited by the zfs_max_recordsize=1M module option. * By default the maximum record size is limited to 1M by the module option zfs_max_recordsize. This value may be safely increased up to 16M, which is the largest block size supported by the on-disk format. At the moment, 1M blocks clearly offer a significant performance improvement, but the benefits of going beyond this for the majority of workloads are less clear. * The illumos version of this patch increased DMU_MAX_ACCESS to 32M. This was determined not to be large enough when using 16M blocks because the zfs_make_xattrdir() function will fail (EFBIG) when assigning a TX. This was immediately observed under Linux because all newly created files must have a security xattr created and that was failing. Therefore, we've set DMU_MAX_ACCESS to 64M. * On 32-bit platforms a hard limit of 1M is set for blocks due to the limited virtual address space. We should be able to relax this once the ABD patches are merged. Ported-by: Brian Behlendorf Closes #354 --- module/zcommon/zfs_prop.c | 4 +- module/zcommon/zpool_prop.c | 2 + module/zfs/arc.c | 4 +- module/zfs/bpobj.c | 5 +- module/zfs/bptree.c | 2 +- module/zfs/dbuf.c | 6 +-- module/zfs/dmu_objset.c | 16 ++++++ module/zfs/dmu_send.c | 83 +++++++++++++++++++++++-------- module/zfs/dmu_tx.c | 24 ++++++--- module/zfs/dnode.c | 10 ++-- module/zfs/dsl_dataset.c | 114 ++++++++++++++++++++++++++++++++++++++++++- module/zfs/dsl_deadlist.c | 8 +-- module/zfs/dsl_destroy.c | 7 +++ module/zfs/dsl_pool.c | 4 +- module/zfs/sa.c | 6 +-- module/zfs/spa.c | 26 ++++++++-- module/zfs/spa_history.c | 2 +- module/zfs/spa_misc.c | 10 ++++ module/zfs/vdev.c | 6 +-- module/zfs/vdev_disk.c | 4 +- module/zfs/vdev_queue.c | 2 +- module/zfs/zap_micro.c | 16 +++--- module/zfs/zfeature_common.c | 14 +++++- module/zfs/zfs_ioctl.c | 54 +++++++++++++++++--- module/zfs/zfs_log.c | 2 +- module/zfs/zfs_vfsops.c | 9 ++-- module/zfs/zfs_vnops.c | 8 ++- module/zfs/zfs_znode.c | 8 ++- module/zfs/zil.c | 11 +++-- module/zfs/zio.c | 29 ++++++++--- 30 files changed, 396 insertions(+), 100 deletions(-) diff --git a/module/zcommon/zfs_prop.c b/module/zcommon/zfs_prop.c index 70f1d93df..aaebab444 100644 --- a/module/zcommon/zfs_prop.c +++ b/module/zcommon/zfs_prop.c @@ -426,8 +426,8 @@ zfs_prop_init(void) /* inherit number properties */ zprop_register_number(ZFS_PROP_RECORDSIZE, "recordsize", - SPA_MAXBLOCKSIZE, PROP_INHERIT, - ZFS_TYPE_FILESYSTEM, "512 to 128k, power of 2", "RECSIZE"); + SPA_OLD_MAXBLOCKSIZE, PROP_INHERIT, + ZFS_TYPE_FILESYSTEM, "512 to 1M, power of 2", "RECSIZE"); /* hidden properties */ zprop_register_hidden(ZFS_PROP_CREATETXG, "createtxg", PROP_TYPE_NUMBER, diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c index e5f69c815..910c56dcc 100644 ---
a/module/zcommon/zpool_prop.c +++ b/module/zcommon/zpool_prop.c @@ -131,6 +131,8 @@ zpool_prop_init(void) /* hidden properties */ zprop_register_hidden(ZPOOL_PROP_NAME, "name", PROP_TYPE_STRING, PROP_READONLY, ZFS_TYPE_POOL, "NAME"); + zprop_register_hidden(ZPOOL_PROP_MAXBLOCKSIZE, "maxblocksize", + PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_POOL, "MAXBLOCKSIZE"); zprop_register_hidden(ZPOOL_PROP_TNAME, "tname", PROP_TYPE_STRING, PROP_ONETIME, ZFS_TYPE_POOL, "TNAME"); } diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 6975ada62..1699ea7e7 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -1329,7 +1329,7 @@ arc_buf_alloc(spa_t *spa, uint64_t size, void *tag, arc_buf_contents_t type) arc_buf_hdr_t *hdr; arc_buf_t *buf; - VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); + VERIFY3U(size, <=, spa_maxblocksize(spa)); hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); ASSERT(BUF_EMPTY(hdr)); hdr->b_size = size; @@ -3289,7 +3289,7 @@ top: * Gracefully handle a damaged logical block size as a * checksum error by passing a dummy zio to the done callback. */ - if (size > SPA_MAXBLOCKSIZE) { + if (size > spa_maxblocksize(spa)) { if (done) { rzio = zio_null(pio, spa, NULL, NULL, NULL, zio_flags); diff --git a/module/zfs/bpobj.c b/module/zfs/bpobj.c index 25767e83f..ebfdc2e7a 100644 --- a/module/zfs/bpobj.c +++ b/module/zfs/bpobj.c @@ -43,7 +43,7 @@ bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx) if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) { ASSERT0(dp->dp_empty_bpobj); dp->dp_empty_bpobj = - bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx); + bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx); VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1, @@ -399,7 +399,8 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) dmu_buf_will_dirty(bpo->bpo_dbuf, tx); if (bpo->bpo_phys->bpo_subobjs == 0) { bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os, - DMU_OT_BPOBJ_SUBOBJ, SPA_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx); + DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE, + DMU_OT_NONE, 0, tx); } ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi)); diff --git a/module/zfs/bptree.c b/module/zfs/bptree.c index d6ea9d7c6..9f62d7b91 100644 --- a/module/zfs/bptree.c +++ b/module/zfs/bptree.c @@ -65,7 +65,7 @@ bptree_alloc(objset_t *os, dmu_tx_t *tx) bptree_phys_t *bt; obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA, - SPA_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA, + SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA, sizeof (bptree_phys_t), tx); /* diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index cd74ce3e8..7d8adcd73 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -2216,10 +2216,8 @@ dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) return (SET_ERROR(ENOTSUP)); if (blksz == 0) blksz = SPA_MINBLOCKSIZE; - if (blksz > SPA_MAXBLOCKSIZE) - blksz = SPA_MAXBLOCKSIZE; - else - blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); + ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset))); + blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); DB_DNODE_ENTER(db); dn = DB_DNODE(db); diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index 6d42bf8a2..ae4e1dd21 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -256,6 +256,14 @@ logbias_changed_cb(void *arg, uint64_t newval) zil_set_logbias(os->os_zil, newval); } +static void +recordsize_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + os->os_recordsize = newval; +} + void dmu_objset_byteswap(void *buf, size_t size) { @@ -385,6 +393,11 
@@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, ZFS_PROP_REDUNDANT_METADATA), redundant_metadata_changed_cb, os); } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_RECORDSIZE), + recordsize_changed_cb, os); + } } if (err != 0) { VERIFY(arc_buf_remove_ref(os->os_phys_buf, @@ -660,6 +673,9 @@ dmu_objset_evict(objset_t *os) VERIFY0(dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_REDUNDANT_METADATA), redundant_metadata_changed_cb, os)); + VERIFY0(dsl_prop_unregister(ds, + zfs_prop_to_name(ZFS_PROP_RECORDSIZE), + recordsize_changed_cb, os)); } VERIFY0(dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE), diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index c1cac2e67..e26d344d6 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -234,11 +234,12 @@ dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, drrw->drr_offset = offset; drrw->drr_length = blksz; drrw->drr_toguid = dsp->dsa_toguid; - if (BP_IS_EMBEDDED(bp)) { + if (bp == NULL || BP_IS_EMBEDDED(bp)) { /* - * There's no pre-computed checksum of embedded BP's, so - * (like fletcher4-checkummed blocks) userland will have - * to compute a dedup-capable checksum itself. + * There's no pre-computed checksum for partial-block + * writes or embedded BP's, so (like + * fletcher4-checksummed blocks) userland will have to + * compute a dedup-capable checksum itself. */ drrw->drr_checksumtype = ZIO_CHECKSUM_OFF; } else { @@ -400,6 +401,10 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) drro->drr_compress = dnp->dn_compress; drro->drr_toguid = dsp->dsa_toguid; + if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && + drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE) + drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE; + if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) return (SET_ERROR(EINTR)); @@ -517,6 +522,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, zb->zb_blkid * blksz, blksz, bp); } else { /* it's a level-0 block of a regular object */ uint32_t aflags = ARC_WAIT; + uint64_t offset; arc_buf_t *abuf; int blksz = BP_GET_LSIZE(bp); @@ -539,8 +545,24 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, } } - err = dump_write(dsp, type, zb->zb_object, zb->zb_blkid * blksz, - blksz, bp, abuf->b_data); + offset = zb->zb_blkid * blksz; + + if (!(dsp->dsa_featureflags & + DMU_BACKUP_FEATURE_LARGE_BLOCKS) && + blksz > SPA_OLD_MAXBLOCKSIZE) { + char *buf = abuf->b_data; + while (blksz > 0 && err == 0) { + int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE); + err = dump_write(dsp, type, zb->zb_object, + offset, n, NULL, buf); + offset += n; + buf += n; + blksz -= n; + } + } else { + err = dump_write(dsp, type, zb->zb_object, + offset, blksz, bp, abuf->b_data); + } (void) arc_buf_remove_ref(abuf, &abuf); } @@ -554,7 +576,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, static int dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok, - int outfd, vnode_t *vp, offset_t *off) + boolean_t large_block_ok, int outfd, vnode_t *vp, offset_t *off) { objset_t *os; dmu_replay_record_t *drr; @@ -589,6 +611,8 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, } #endif + if (large_block_ok && ds->ds_large_blocks) + featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS; if (embedok && spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) { featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA; @@ -684,7 +708,8 @@ out: int dmu_send_obj(const char
*pool, uint64_t tosnap, uint64_t fromsnap, - boolean_t embedok, int outfd, vnode_t *vp, offset_t *off) + boolean_t embedok, boolean_t large_block_ok, + int outfd, vnode_t *vp, offset_t *off) { dsl_pool_t *dp; dsl_dataset_t *ds; @@ -719,18 +744,19 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid; is_clone = (fromds->ds_dir != ds->ds_dir); dsl_dataset_rele(fromds, FTAG); - err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok, - outfd, vp, off); + err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, + embedok, large_block_ok, outfd, vp, off); } else { - err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok, - outfd, vp, off); + err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, + embedok, large_block_ok, outfd, vp, off); } dsl_dataset_rele(ds, FTAG); return (err); } int -dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, +dmu_send(const char *tosnap, const char *fromsnap, + boolean_t embedok, boolean_t large_block_ok, int outfd, vnode_t *vp, offset_t *off) { dsl_pool_t *dp; @@ -797,11 +823,11 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, dsl_pool_rele(dp, FTAG); return (err); } - err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok, - outfd, vp, off); + err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, + embedok, large_block_ok, outfd, vp, off); } else { - err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok, - outfd, vp, off); + err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, + embedok, large_block_ok, outfd, vp, off); } if (owned) dsl_dataset_disown(ds, FTAG); @@ -1000,6 +1026,15 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) return (SET_ERROR(ENOTSUP)); + /* + * The receiving code doesn't know how to translate large blocks + * to smaller ones, so the pool must have the LARGE_BLOCKS + * feature enabled if the stream has LARGE_BLOCKS. 
+ */ + if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && + !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS)) + return (SET_ERROR(ENOTSUP)); + error = dsl_dataset_hold(dp, tofs, FTAG, &ds); if (error == 0) { /* target fs already exists; recv into temp clone */ @@ -1125,6 +1160,13 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) } VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds)); + if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & + DMU_BACKUP_FEATURE_LARGE_BLOCKS) && + !newds->ds_large_blocks) { + dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx); + newds->ds_large_blocks = B_TRUE; + } + dmu_buf_will_dirty(newds->ds_dbuf, tx); dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT; @@ -1250,6 +1292,7 @@ restore_read(struct restorearg *ra, int len, char *buf) /* some things will require 8-byte alignment, so everything must */ ASSERT0(len % 8); + ASSERT3U(len, <=, ra->bufsize); while (done < len) { ssize_t resid; @@ -1391,7 +1434,7 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || drro->drr_blksz < SPA_MINBLOCKSIZE || - drro->drr_blksz > SPA_MAXBLOCKSIZE || + drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(os)) || drro->drr_bonuslen > DN_MAX_BONUSLEN) { return (SET_ERROR(EINVAL)); } @@ -1665,7 +1708,7 @@ restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs) int err; if (drrs->drr_length < SPA_MINBLOCKSIZE || - drrs->drr_length > SPA_MAXBLOCKSIZE) + drrs->drr_length > spa_maxblocksize(dmu_objset_spa(os))) return (SET_ERROR(EINVAL)); data = restore_read(ra, drrs->drr_length, NULL); @@ -1752,7 +1795,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, ra.cksum = drc->drc_cksum; ra.vp = vp; ra.voff = *voffp; - ra.bufsize = 1<<20; + ra.bufsize = SPA_MAXBLOCKSIZE; ra.buf = vmem_alloc(ra.bufsize, KM_SLEEP); /* these were verified in dmu_recv_begin */ diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 89f45a781..62a8d471e 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -241,7 +241,7 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) return; min_bs = SPA_MINBLOCKSHIFT; - max_bs = SPA_MAXBLOCKSHIFT; + max_bs = highbit64(txh->txh_tx->tx_objset->os_recordsize) - 1; min_ibs = DN_MIN_INDBLKSHIFT; max_ibs = DN_MAX_INDBLKSHIFT; @@ -310,6 +310,14 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) */ ASSERT(dn->dn_datablkshift != 0); min_bs = max_bs = dn->dn_datablkshift; + } else { + /* + * The blocksize can increase up to the recordsize, + * or if it is already more than the recordsize, + * up to the next power of 2. 
+ */ + min_bs = highbit64(dn->dn_datablksz - 1); + max_bs = MAX(max_bs, highbit64(dn->dn_datablksz - 1)); } /* @@ -745,11 +753,11 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) bp = &dn->dn_phys->dn_blkptr[0]; if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, bp, bp->blk_birth)) - txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE; + txh->txh_space_tooverwrite += MZAP_MAX_BLKSZ; else - txh->txh_space_towrite += SPA_MAXBLOCKSIZE; + txh->txh_space_towrite += MZAP_MAX_BLKSZ; if (!BP_IS_HOLE(bp)) - txh->txh_space_tounref += SPA_MAXBLOCKSIZE; + txh->txh_space_tounref += MZAP_MAX_BLKSZ; return; } @@ -1546,18 +1554,18 @@ dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object) /* If blkptr doesn't exist then add space to towrite */ if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) { - txh->txh_space_towrite += SPA_MAXBLOCKSIZE; + txh->txh_space_towrite += SPA_OLD_MAXBLOCKSIZE; } else { blkptr_t *bp; bp = &dn->dn_phys->dn_spill; if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, bp, bp->blk_birth)) - txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE; + txh->txh_space_tooverwrite += SPA_OLD_MAXBLOCKSIZE; else - txh->txh_space_towrite += SPA_MAXBLOCKSIZE; + txh->txh_space_towrite += SPA_OLD_MAXBLOCKSIZE; if (!BP_IS_HOLE(bp)) - txh->txh_space_tounref += SPA_MAXBLOCKSIZE; + txh->txh_space_tounref += SPA_OLD_MAXBLOCKSIZE; } } diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 235884931..e1ea165aa 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -540,10 +540,10 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, { int i; + ASSERT3U(blocksize, <=, + spa_maxblocksize(dmu_objset_spa(dn->dn_objset))); if (blocksize == 0) blocksize = 1 << zfs_default_bs; - else if (blocksize > SPA_MAXBLOCKSIZE) - blocksize = SPA_MAXBLOCKSIZE; else blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE); @@ -624,7 +624,8 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int nblkptr; ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE); - ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE); + ASSERT3U(blocksize, <=, + spa_maxblocksize(dmu_objset_spa(dn->dn_objset))); ASSERT0(blocksize % SPA_MINBLOCKSIZE); ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); ASSERT(tx->tx_txg != 0); @@ -1377,10 +1378,9 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) dmu_buf_impl_t *db; int err; + ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset))); if (size == 0) size = SPA_MINBLOCKSIZE; - if (size > SPA_MAXBLOCKSIZE) - size = SPA_MAXBLOCKSIZE; else size = P2ROUNDUP(size, SPA_MINBLOCKSIZE); diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index b444fca64..9a66c6552 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -51,6 +51,17 @@ #include #include +/* + * The SPA supports block sizes up to 16MB. However, very large blocks + * can have an impact on i/o latency (e.g. tying up a spinning disk for + * ~300ms), and also potentially on the memory allocator. Therefore, + * we do not allow the recordsize to be set larger than zfs_max_recordsize + * (default 1MB). Larger blocks can be created by changing this tunable, + * and pools with larger blocks can always be imported and used, regardless + * of this setting. 
+ */ +int zfs_max_recordsize = 1 * 1024 * 1024; + #define SWITCH64(x, y) \ { \ uint64_t __tmp = (x); \ @@ -60,8 +71,6 @@ #define DS_REF_MAX (1ULL << 62) -#define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE - extern inline dsl_dataset_phys_t *dsl_dataset_phys(dsl_dataset_t *ds); /* @@ -117,6 +126,8 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) dsl_dataset_phys(ds)->ds_compressed_bytes += compressed; dsl_dataset_phys(ds)->ds_uncompressed_bytes += uncompressed; dsl_dataset_phys(ds)->ds_unique_bytes += used; + if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) + ds->ds_need_large_blocks = B_TRUE; mutex_exit(&ds->ds_lock); dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta, compressed, uncompressed, tx); @@ -414,6 +425,14 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t), offsetof(dmu_sendarg_t, dsa_link)); + if (doi.doi_type == DMU_OTN_ZAP_METADATA) { + err = zap_contains(mos, dsobj, DS_FIELD_LARGE_BLOCKS); + if (err == 0) + ds->ds_large_blocks = B_TRUE; + else + ASSERT3U(err, ==, ENOENT); + } + if (err == 0) { err = dsl_dir_hold_obj(dp, dsl_dataset_phys(ds)->ds_dir_obj, NULL, ds, @@ -730,6 +749,9 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, dsphys->ds_flags |= dsl_dataset_phys(origin)->ds_flags & (DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET); + if (origin->ds_large_blocks) + dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx); + dmu_buf_will_dirty(origin->ds_dbuf, tx); dsl_dataset_phys(origin)->ds_num_children++; @@ -1253,6 +1275,9 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, dsphys->ds_bp = dsl_dataset_phys(ds)->ds_bp; dmu_buf_rele(dbuf, FTAG); + if (ds->ds_large_blocks) + dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx); + ASSERT3U(ds->ds_prev != 0, ==, dsl_dataset_phys(ds)->ds_prev_snap_obj != 0); if (ds->ds_prev) { @@ -1541,6 +1566,11 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid; dmu_objset_sync(ds->ds_objset, zio, tx); + + if (ds->ds_need_large_blocks && !ds->ds_large_blocks) { + dsl_dataset_activate_large_blocks_sync_impl(ds->ds_object, tx); + ds->ds_large_blocks = B_TRUE; + } } static void @@ -3222,6 +3252,77 @@ dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, return (err); } +static int +dsl_dataset_activate_large_blocks_check(void *arg, dmu_tx_t *tx) +{ + const char *dsname = arg; + dsl_dataset_t *ds; + dsl_pool_t *dp = dmu_tx_pool(tx); + int error = 0; + + if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS)) + return (SET_ERROR(ENOTSUP)); + + ASSERT(spa_feature_is_enabled(dp->dp_spa, + SPA_FEATURE_EXTENSIBLE_DATASET)); + + error = dsl_dataset_hold(dp, dsname, FTAG, &ds); + if (error != 0) + return (error); + + if (ds->ds_large_blocks) + error = EALREADY; + dsl_dataset_rele(ds, FTAG); + + return (error); +} + +void +dsl_dataset_activate_large_blocks_sync_impl(uint64_t dsobj, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset; + uint64_t zero = 0; + + spa_feature_incr(spa, SPA_FEATURE_LARGE_BLOCKS, tx); + dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx); + + VERIFY0(zap_add(mos, dsobj, DS_FIELD_LARGE_BLOCKS, + sizeof (zero), 1, &zero, tx)); +} + +static void +dsl_dataset_activate_large_blocks_sync(void *arg, dmu_tx_t *tx) +{ + const char *dsname = arg; + dsl_dataset_t *ds; + + VERIFY0(dsl_dataset_hold(dmu_tx_pool(tx), dsname, FTAG, &ds)); + + 
dsl_dataset_activate_large_blocks_sync_impl(ds->ds_object, tx); + ASSERT(!ds->ds_large_blocks); + ds->ds_large_blocks = B_TRUE; + dsl_dataset_rele(ds, FTAG); +} + +int +dsl_dataset_activate_large_blocks(const char *dsname) +{ + int error; + + error = dsl_sync_task(dsname, + dsl_dataset_activate_large_blocks_check, + dsl_dataset_activate_large_blocks_sync, (void *)dsname, + 1, ZFS_SPACE_CHECK_RESERVED); + + /* + * EALREADY indicates that this dataset already supports large blocks. + */ + if (error == EALREADY) + error = 0; + return (error); +} + /* * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline. * For example, they could both be snapshots of the same filesystem, and @@ -3275,6 +3376,15 @@ dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx) } #if defined(_KERNEL) && defined(HAVE_SPL) +#if defined(_LP64) +module_param(zfs_max_recordsize, int, 0644); +MODULE_PARM_DESC(zfs_max_recordsize, "Max allowed record size"); +#else +/* Limited to 1M on 32-bit platforms due to lack of virtual address space */ +module_param(zfs_max_recordsize, int, 0444); +MODULE_PARM_DESC(zfs_max_recordsize, "Max allowed record size"); +#endif + EXPORT_SYMBOL(dsl_dataset_hold); EXPORT_SYMBOL(dsl_dataset_hold_obj); EXPORT_SYMBOL(dsl_dataset_own); diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c index 55810f5c3..8da77ebd7 100644 --- a/module/zfs/dsl_deadlist.c +++ b/module/zfs/dsl_deadlist.c @@ -148,7 +148,7 @@ uint64_t dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx) { if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) - return (bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx)); + return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx)); return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR, sizeof (dsl_deadlist_phys_t), tx)); } @@ -185,7 +185,7 @@ dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, { if (dle->dle_bpobj.bpo_object == dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { - uint64_t obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx); + uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); bpobj_close(&dle->dle_bpobj); bpobj_decr_empty(dl->dl_os, tx); VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); @@ -259,7 +259,7 @@ dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) dle = kmem_alloc(sizeof (*dle), KM_SLEEP); dle->dle_mintxg = mintxg; - obj = bpobj_alloc_empty(dl->dl_os, SPA_MAXBLOCKSIZE, tx); + obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); avl_add(&dl->dl_tree, dle); @@ -344,7 +344,7 @@ dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg, if (dle->dle_mintxg >= maxtxg) break; - obj = bpobj_alloc_empty(dl->dl_os, SPA_MAXBLOCKSIZE, tx); + obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj, dle->dle_mintxg, obj, tx)); } diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c index 4623f5dd5..0e2238f99 100644 --- a/module/zfs/dsl_destroy.c +++ b/module/zfs/dsl_destroy.c @@ -277,6 +277,10 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) obj = ds->ds_object; + if (ds->ds_large_blocks) { + ASSERT0(zap_contains(mos, obj, DS_FIELD_LARGE_BLOCKS)); + spa_feature_decr(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS, tx); + } if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { ASSERT3P(ds->ds_prev, ==, NULL); VERIFY0(dsl_dataset_hold_obj(dp, @@ -738,6 +742,9 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) ASSERT0(ds->ds_reserved); } + if 
(ds->ds_large_blocks) + spa_feature_decr(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS, tx); + dsl_scan_ds_destroyed(ds, tx); obj = ds->ds_object; diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index fe1a4d8b7..13961918e 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -372,7 +372,7 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) FREE_DIR_NAME, &dp->dp_free_dir)); /* create and open the free_bplist */ - obj = bpobj_alloc(dp->dp_meta_objset, SPA_MAXBLOCKSIZE, tx); + obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx); VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0); VERIFY0(bpobj_open(&dp->dp_free_bpobj, @@ -804,7 +804,7 @@ dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx) * subobj support. So call dmu_object_alloc() directly. */ obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ, - SPA_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx); + SPA_OLD_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx); VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx)); VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj)); diff --git a/module/zfs/sa.c b/module/zfs/sa.c index 8e857e129..a3fcea87b 100644 --- a/module/zfs/sa.c +++ b/module/zfs/sa.c @@ -504,7 +504,7 @@ sa_resize_spill(sa_handle_t *hdl, uint32_t size, dmu_tx_t *tx) if (size == 0) { blocksize = SPA_MINBLOCKSIZE; - } else if (size > SPA_MAXBLOCKSIZE) { + } else if (size > SPA_OLD_MAXBLOCKSIZE) { ASSERT(0); return (SET_ERROR(EFBIG)); } else { @@ -693,7 +693,7 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus, SA_BONUS, &spill_idx, &used, &spilling); - if (used > SPA_MAXBLOCKSIZE) + if (used > SPA_OLD_MAXBLOCKSIZE) return (SET_ERROR(EFBIG)); VERIFY(0 == dmu_set_bonus(hdl->sa_bonus, spilling ? @@ -717,7 +717,7 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, attr_count - spill_idx, hdl->sa_spill, SA_SPILL, &i, &spill_used, &dummy); - if (spill_used > SPA_MAXBLOCKSIZE) + if (spill_used > SPA_OLD_MAXBLOCKSIZE) return (SET_ERROR(EFBIG)); if (BUF_SPACE_NEEDED(spill_used, spillhdrsize) > diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 3312c301c..2a9ef9ce5 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -266,6 +266,14 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 0, ZPROP_SRC_LOCAL); + if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { + spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, + MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE); + } else { + spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, + SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE); + } + if ((dp = list_head(&spa->spa_config_list)) != NULL) { if (dp->scd_path == NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, @@ -482,7 +490,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) if (!error) { objset_t *os; - uint64_t compress; + uint64_t propval; if (strval == NULL || strval[0] == '\0') { objnum = zpool_prop_default_numeric( @@ -494,15 +502,25 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) if (error) break; - /* Must be ZPL and not gzip compressed. */ + /* + * Must be ZPL, and its property settings + * must be supported by GRUB (compression + * is not gzip, and large blocks are not used). 
+ */ if (dmu_objset_type(os) != DMU_OST_ZFS) { error = SET_ERROR(ENOTSUP); } else if ((error = dsl_prop_get_int_ds(dmu_objset_ds(os), zfs_prop_to_name(ZFS_PROP_COMPRESSION), - &compress)) == 0 && - !BOOTFS_COMPRESS_VALID(compress)) { + &propval)) == 0 && + !BOOTFS_COMPRESS_VALID(propval)) { + error = SET_ERROR(ENOTSUP); + } else if ((error = + dsl_prop_get_int_ds(dmu_objset_ds(os), + zfs_prop_to_name(ZFS_PROP_RECORDSIZE), + &propval)) == 0 && + propval > SPA_OLD_MAXBLOCKSIZE) { error = SET_ERROR(ENOTSUP); } else { objnum = dmu_objset_id(os); diff --git a/module/zfs/spa_history.c b/module/zfs/spa_history.c index 1041c8f57..01aa4641e 100644 --- a/module/zfs/spa_history.c +++ b/module/zfs/spa_history.c @@ -89,7 +89,7 @@ spa_history_create_obj(spa_t *spa, dmu_tx_t *tx) ASSERT(spa->spa_history == 0); spa->spa_history = dmu_object_alloc(mos, DMU_OT_SPA_HISTORY, - SPA_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS, + SPA_OLD_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS, sizeof (spa_history_phys_t), tx); VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 8e5fa683e..b46b2d0a4 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -1985,6 +1985,15 @@ spa_debug_enabled(spa_t *spa) return (spa->spa_debug); } +int +spa_maxblocksize(spa_t *spa) +{ + if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) + return (SPA_MAXBLOCKSIZE); + else + return (SPA_OLD_MAXBLOCKSIZE); +} + #if defined(_KERNEL) && defined(HAVE_SPL) /* Namespace manipulation */ EXPORT_SYMBOL(spa_lookup); @@ -2040,6 +2049,7 @@ EXPORT_SYMBOL(spa_suspended); EXPORT_SYMBOL(spa_bootfs); EXPORT_SYMBOL(spa_delegation); EXPORT_SYMBOL(spa_meta_objset); +EXPORT_SYMBOL(spa_maxblocksize); /* Miscellaneous support routines */ EXPORT_SYMBOL(spa_rename); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 52198261e..fe6631925 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -847,9 +847,9 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) /* * Compute the raidz-deflation ratio. Note, we hard-code - * in 128k (1 << 17) because it is the current "typical" blocksize. - * Even if SPA_MAXBLOCKSIZE changes, this algorithm must never change, - * or we will inconsistently account for existing bp's. + * in 128k (1 << 17) because it is the "typical" blocksize. + * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change, + * otherwise it would inconsistently account for existing bp's. */ vd->vdev_deflate_ratio = (1 << 17) / (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c index db13b7bc4..0196f3945 100644 --- a/module/zfs/vdev_disk.c +++ b/module/zfs/vdev_disk.c @@ -552,9 +552,9 @@ retry: goto retry; } - dr->dr_bio[i] = bio_alloc(GFP_NOIO, - bio_nr_pages(bio_ptr, bio_size)); /* bio_alloc() with __GFP_WAIT never returns NULL */ + dr->dr_bio[i] = bio_alloc(GFP_NOIO, + MIN(bio_nr_pages(bio_ptr, bio_size), BIO_MAX_PAGES)); if (unlikely(dr->dr_bio[i] == NULL)) { vdev_disk_dio_free(dr); return (ENOMEM); diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index cf0301649..a0d6fc4e3 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -167,7 +167,7 @@ int zfs_vdev_async_write_active_max_dirty_percent = 60; * we include spans of optional I/Os to aid aggregation at the disk even when * they aren't able to help us aggregate at this level. 
*/ -int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE; +int zfs_vdev_aggregation_limit = SPA_OLD_MAXBLOCKSIZE; int zfs_vdev_read_gap_limit = 32 << 10; int zfs_vdev_write_gap_limit = 4 << 10; diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index 5af98c2bc..8d920c2fa 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -34,6 +34,7 @@ #include #include #include +#include #ifdef _KERNEL #include @@ -654,9 +655,9 @@ zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx); ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT && - leaf_blockshift <= SPA_MAXBLOCKSHIFT && + leaf_blockshift <= SPA_OLD_MAXBLOCKSHIFT && indirect_blockshift >= SPA_MINBLOCKSHIFT && - indirect_blockshift <= SPA_MAXBLOCKSHIFT); + indirect_blockshift <= SPA_OLD_MAXBLOCKSHIFT); VERIFY(dmu_object_set_blocksize(os, obj, 1ULL << leaf_blockshift, indirect_blockshift, tx) == 0); @@ -1347,7 +1348,6 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add, zap_t *zap; int err = 0; - /* * Since, we don't have a name, we cannot figure out which blocks will * be affected in this operation. So, account for the worst case : @@ -1360,7 +1360,7 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add, * large microzap results in a promotion to fatzap. */ if (name == NULL) { - *towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE; + *towrite += (3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE; return (err); } @@ -1384,7 +1384,7 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add, /* * We treat this case as similar to (name == NULL) */ - *towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE; + *towrite += (3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE; } } else { /* @@ -1403,12 +1403,12 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add, * ptrtbl blocks */ if (dmu_buf_freeable(zap->zap_dbuf)) - *tooverwrite += SPA_MAXBLOCKSIZE; + *tooverwrite += MZAP_MAX_BLKSZ; else - *towrite += SPA_MAXBLOCKSIZE; + *towrite += MZAP_MAX_BLKSZ; if (add) { - *towrite += 4 * SPA_MAXBLOCKSIZE; + *towrite += 4 * MZAP_MAX_BLKSZ; } } diff --git a/module/zfs/zfeature_common.c b/module/zfs/zfeature_common.c index 461456275..609a72ab3 100644 --- a/module/zfs/zfeature_common.c +++ b/module/zfs/zfeature_common.c @@ -56,7 +56,8 @@ valid_char(char c, boolean_t after_colon) { return ((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || - c == (after_colon ? '_' : '.')); + (after_colon && c == '_') || + (!after_colon && (c == '.' 
|| c == '-'))); } /* @@ -230,4 +231,15 @@ zpool_feature_init(void) "com.delphix:embedded_data", "embedded_data", "Blocks which compress very well use even less space.", B_FALSE, B_TRUE, B_TRUE, NULL); + + { + static const spa_feature_t large_blocks_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_LARGE_BLOCKS, + "org.open-zfs:large_blocks", "large_blocks", + "Support for blocks larger than 128KB.", B_FALSE, B_FALSE, B_FALSE, + large_blocks_deps); + } } diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index caaa3a21c..f5137d09e 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -2392,7 +2392,7 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source, const char *propname = nvpair_name(pair); zfs_prop_t prop = zfs_name_to_prop(propname); uint64_t intval; - int err; + int err = -1; if (prop == ZPROP_INVAL) { if (zfs_prop_userquota(propname)) @@ -3790,8 +3790,7 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) * the SPA supports it. We ignore any errors here since * we'll catch them later. */ - if (nvpair_type(pair) == DATA_TYPE_UINT64 && - nvpair_value_uint64(pair, &intval) == 0) { + if (nvpair_value_uint64(pair, &intval) == 0) { if (intval >= ZIO_COMPRESS_GZIP_1 && intval <= ZIO_COMPRESS_GZIP_9 && zfs_earlier_version(dsname, @@ -3842,6 +3841,42 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) return (SET_ERROR(ENOTSUP)); break; + case ZFS_PROP_RECORDSIZE: + /* Record sizes above 128k need the feature to be enabled */ + if (nvpair_value_uint64(pair, &intval) == 0 && + intval > SPA_OLD_MAXBLOCKSIZE) { + spa_t *spa; + + /* + * If this is a bootable dataset then + * we don't allow large (>128K) blocks, + * because GRUB doesn't support them. + */ + if (zfs_is_bootfs(dsname) && + intval > SPA_OLD_MAXBLOCKSIZE) { + return (SET_ERROR(EDOM)); + } + + /* + * We don't allow setting the property above 1MB, + * unless the tunable has been changed. + */ + if (intval > zfs_max_recordsize || + intval > SPA_MAXBLOCKSIZE) + return (SET_ERROR(EDOM)); + + if ((err = spa_open(dsname, &spa, FTAG)) != 0) + return (err); + + if (!spa_feature_is_enabled(spa, + SPA_FEATURE_LARGE_BLOCKS)) { + spa_close(spa, FTAG); + return (SET_ERROR(ENOTSUP)); + } + spa_close(spa, FTAG); + } + break; + case ZFS_PROP_SHARESMB: if (zpl_earlier_version(dsname, ZPL_VERSION_FUID)) return (SET_ERROR(ENOTSUP)); @@ -4221,7 +4256,7 @@ out: * zc_fromobj objsetid of incremental fromsnap (may be zero) * zc_guid if set, estimate size of stream only. zc_cookie is ignored. * output size in zc_objset_type.
- * zc_flags if =1, WRITE_EMBEDDED records are permitted + * zc_flags lzc_send_flags * * outputs: * zc_objset_type estimated size, if zc_guid is set @@ -4233,6 +4268,7 @@ zfs_ioc_send(zfs_cmd_t *zc) offset_t off; boolean_t estimate = (zc->zc_guid != 0); boolean_t embedok = (zc->zc_flags & 0x1); + boolean_t large_block_ok = (zc->zc_flags & 0x2); if (zc->zc_obj != 0) { dsl_pool_t *dp; @@ -4294,7 +4330,8 @@ zfs_ioc_send(zfs_cmd_t *zc) off = fp->f_offset; error = dmu_send_obj(zc->zc_name, zc->zc_sendobj, - zc->zc_fromobj, embedok, zc->zc_cookie, fp->f_vnode, &off); + zc->zc_fromobj, embedok, large_block_ok, + zc->zc_cookie, fp->f_vnode, &off); if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) fp->f_offset = off; @@ -5160,6 +5197,8 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl) * innvl: { * "fd" -> file descriptor to write stream to (int32) * (optional) "fromsnap" -> full snap name to send an incremental from + * (optional) "largeblockok" -> (value ignored) + * indicates that blocks > 128KB are permitted * (optional) "embedok" -> (value ignored) * presence indicates DRR_WRITE_EMBEDDED records are permitted * } @@ -5175,6 +5214,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) char *fromname = NULL; int fd; file_t *fp; + boolean_t largeblockok; boolean_t embedok; error = nvlist_lookup_int32(innvl, "fd", &fd); @@ -5183,13 +5223,15 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) (void) nvlist_lookup_string(innvl, "fromsnap", &fromname); + largeblockok = nvlist_exists(innvl, "largeblockok"); embedok = nvlist_exists(innvl, "embedok"); if ((fp = getf(fd)) == NULL) return (SET_ERROR(EBADF)); off = fp->f_offset; - error = dmu_send(snapname, fromname, embedok, fd, fp->f_vnode, &off); + error = dmu_send(snapname, fromname, embedok, largeblockok, + fd, fp->f_vnode, &off); if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) fp->f_offset = off; diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c index cfce83138..38d8de0eb 100644 --- a/module/zfs/zfs_log.c +++ b/module/zfs/zfs_log.c @@ -492,7 +492,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, * If the write would overflow the largest block then split it. */ if (write_state != WR_INDIRECT && resid > ZIL_MAX_LOG_DATA) - len = SPA_MAXBLOCKSIZE >> 1; + len = SPA_OLD_MAXBLOCKSIZE >> 1; else len = resid; diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c index 59f73776c..e86b21aee 100644 --- a/module/zfs/zfs_vfsops.c +++ b/module/zfs/zfs_vfsops.c @@ -188,10 +188,9 @@ static void blksz_changed_cb(void *arg, uint64_t newval) { zfs_sb_t *zsb = arg; - - if (newval < SPA_MINBLOCKSIZE || - newval > SPA_MAXBLOCKSIZE || !ISP2(newval)) - newval = SPA_MAXBLOCKSIZE; + ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zsb->z_os))); + ASSERT3U(newval, >=, SPA_MINBLOCKSIZE); + ASSERT(ISP2(newval)); zsb->z_max_blksz = newval; } @@ -672,7 +671,7 @@ zfs_sb_create(const char *osname, zfs_sb_t **zsbp) */ zsb->z_sb = NULL; zsb->z_parent = zsb; - zsb->z_max_blksz = SPA_MAXBLOCKSIZE; + zsb->z_max_blksz = SPA_OLD_MAXBLOCKSIZE; zsb->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; zsb->z_os = os; diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 5ce8a1e98..19a4132e4 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -771,8 +771,14 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) uint64_t new_blksz; if (zp->z_blksz > max_blksz) { + /* + * File's blocksize is already larger than the + * "recordsize" property. 
Only let it grow to + * the next power of 2. + */ ASSERT(!ISP2(zp->z_blksz)); - new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE); + new_blksz = MIN(end_size, + 1 << highbit64(zp->z_blksz)); } else { new_blksz = MIN(end_size, max_blksz); } diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c index a3d64fe01..f25ad0fc6 100644 --- a/module/zfs/zfs_znode.c +++ b/module/zfs/zfs_znode.c @@ -61,6 +61,7 @@ #endif /* _KERNEL */ #include +#include #include #include #include @@ -1304,8 +1305,13 @@ zfs_extend(znode_t *zp, uint64_t end) * We are growing the file past the current block size. */ if (zp->z_blksz > ZTOZSB(zp)->z_max_blksz) { + /* + * File's blocksize is already larger than the + * "recordsize" property. Only let it grow to + * the next power of 2. + */ ASSERT(!ISP2(zp->z_blksz)); - newblksz = MIN(end, SPA_MAXBLOCKSIZE); + newblksz = MIN(end, 1 << highbit64(zp->z_blksz)); } else { newblksz = MIN(end, ZTOZSB(zp)->z_max_blksz); } diff --git a/module/zfs/zil.c b/module/zfs/zil.c index be1a86b24..3ae171e59 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -243,6 +243,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) { error = SET_ERROR(ECKSUM); } else { + ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE); bcopy(lr, dst, len); *end = (char *)dst + len; *nbp = zilc->zc_next_blk; @@ -257,6 +258,8 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, (zilc->zc_nused > (size - sizeof (*zilc)))) { error = SET_ERROR(ECKSUM); } else { + ASSERT3U(zilc->zc_nused, <=, + SPA_OLD_MAXBLOCKSIZE); bcopy(lr, dst, zilc->zc_nused); *end = (char *)dst + zilc->zc_nused; *nbp = zilc->zc_next_blk; @@ -342,7 +345,7 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, * If the log has been claimed, stop if we encounter a sequence * number greater than the highest claimed sequence number. */ - lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE); + lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE); zil_bp_tree_init(zilog); for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) { @@ -389,7 +392,7 @@ done: (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq)); zil_bp_tree_fini(zilog); - zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE); + zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE); return (error); } @@ -941,7 +944,7 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb) * * These must be a multiple of 4KB. Note only the amount used (again * aligned to 4KB) actually gets written. However, we can't always just - * allocate SPA_MAXBLOCKSIZE as the slog space could be exhausted. + * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted. */ uint64_t zil_block_buckets[] = { 4096, /* non TX_WRITE */ @@ -1023,7 +1026,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb) continue; zil_blksz = zil_block_buckets[i]; if (zil_blksz == UINT64_MAX) - zil_blksz = SPA_MAXBLOCKSIZE; + zil_blksz = SPA_OLD_MAXBLOCKSIZE; zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz; for (i = 0; i < ZIL_PREV_BLKS; i++) zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 9204df2b2..2b338f2a7 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -24,6 +24,7 @@ * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. */ +#include #include #include #include @@ -107,9 +108,8 @@ zio_init(void) /* * For small buffers, we want a cache for each multiple of - * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache - * for each quarter-power of 2. 
For large buffers, we want - * a cache for each multiple of PAGESIZE. + * SPA_MINBLOCKSIZE. For larger buffers, we want a cache + * for each quarter-power of 2. */ for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { size_t size = (c + 1) << SPA_MINBLOCKSHIFT; @@ -117,7 +117,16 @@ zio_init(void) size_t align = 0; size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0; - while (p2 & (p2 - 1)) +#ifdef _ILP32 + /* + * Cache size limited to 1M on 32-bit platforms until ARC + * buffers no longer require virtual address space. + */ + if (size > zfs_max_recordsize) + break; +#endif + + while (!ISP2(p2)) p2 &= p2 - 1; #ifndef _KERNEL @@ -132,10 +141,8 @@ #endif if (size <= 4 * SPA_MINBLOCKSIZE) { align = SPA_MINBLOCKSIZE; - } else if (IS_P2ALIGNED(size, PAGESIZE)) { - align = PAGESIZE; } else if (IS_P2ALIGNED(size, p2 >> 2)) { - align = p2 >> 2; + align = MIN(p2 >> 2, PAGESIZE); } if (align != 0) { @@ -174,6 +181,14 @@ zio_fini(void) kmem_cache_t *last_data_cache = NULL; for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { +#ifdef _ILP32 + /* + * Cache size limited to 1M on 32-bit platforms until ARC + * buffers no longer require virtual address space. + */ + if (((c + 1) << SPA_MINBLOCKSHIFT) > zfs_max_recordsize) + break; +#endif if (zio_buf_cache[c] != last_cache) { last_cache = zio_buf_cache[c]; kmem_cache_destroy(zio_buf_cache[c]);
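
The backup_cb() hunk above carries the send-side compatibility logic: when the stream was not opened with DMU_BACKUP_FEATURE_LARGE_BLOCKS, a level-0 block larger than 128K is emitted as a run of SPA_OLD_MAXBLOCKSIZE writes, each passed bp == NULL so dump_write() marks the partial records ZIO_CHECKSUM_OFF. Below is a minimal, self-contained sketch of just that splitting arithmetic; it is not part of the patch, and emit_write() and send_chunked() are hypothetical stand-ins for dump_write() and the backup_cb() hunk.

#include <stdio.h>
#include <stdint.h>

#define SPA_OLD_MAXBLOCKSIZE	(128 * 1024)	/* pre-feature 128K limit */

/* Hypothetical stand-in for dump_write(); prints one record header. */
static int
emit_write(uint64_t object, uint64_t offset, int len, const void *bp)
{
	printf("DRR_WRITE obj=%llu off=%llu len=%d checksum=%s\n",
	    (unsigned long long)object, (unsigned long long)offset, len,
	    bp == NULL ? "off (partial block)" : "inherited from bp");
	return (0);
}

/* Mirrors the splitting arithmetic added to backup_cb(). */
static int
send_chunked(uint64_t object, uint64_t offset, char *buf, int blksz,
    const void *bp, int large_block_ok)
{
	int err = 0;

	if (!large_block_ok && blksz > SPA_OLD_MAXBLOCKSIZE) {
		/* Old-format receiver: split, no dedup-capable checksum. */
		while (blksz > 0 && err == 0) {
			int n = blksz < SPA_OLD_MAXBLOCKSIZE ?
			    blksz : SPA_OLD_MAXBLOCKSIZE;
			err = emit_write(object, offset, n, NULL);
			offset += n;	/* advance the stream offset */
			buf += n;	/* advance the payload cursor */
			blksz -= n;
		}
		return (err);
	}
	return (emit_write(object, offset, blksz, bp));
}

int
main(void)
{
	static char block[1024 * 1024];		/* one 1M data block */

	/* Without DMU_BACKUP_FEATURE_LARGE_BLOCKS: eight 128K records. */
	(void) send_chunked(7, 0, block, sizeof (block), block, 0);
	/* With the feature flag negotiated: a single 1M record. */
	(void) send_chunked(7, 0, block, sizeof (block), block, 1);
	return (0);
}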
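
In the same spirit, the ZFS_PROP_RECORDSIZE case added to zfs_check_settable(), together with the new spa_maxblocksize() and the zfs_max_recordsize tunable, defines when a recordsize value is accepted. The predicate below condenses that rule as a sketch under stated assumptions: large_blocks_enabled and is_bootfs are hypothetical flags standing in for spa_feature_is_enabled(..., SPA_FEATURE_LARGE_BLOCKS) and zfs_is_bootfs(), and the power-of-2 requirement comes from the recordsize property definition rather than from this ioctl check.

#include <stdio.h>
#include <stdint.h>

#define SPA_MINBLOCKSIZE	512
#define SPA_OLD_MAXBLOCKSIZE	(128 * 1024)		/* 128K */
#define SPA_MAXBLOCKSIZE	(16 * 1024 * 1024)	/* 16M on-disk limit */

static uint64_t zfs_max_recordsize = 1024 * 1024;	/* module tunable */

static int
recordsize_ok(uint64_t rs, int large_blocks_enabled, int is_bootfs)
{
	/* "512 to 1M, power of 2" per the property registration. */
	if (rs < SPA_MINBLOCKSIZE || rs > SPA_MAXBLOCKSIZE ||
	    (rs & (rs - 1)) != 0)
		return (0);
	if (rs <= SPA_OLD_MAXBLOCKSIZE)
		return (1);			/* always permitted */
	if (is_bootfs)
		return (0);			/* GRUB can't read it (EDOM) */
	if (rs > zfs_max_recordsize)
		return (0);			/* above the tunable (EDOM) */
	return (large_blocks_enabled);		/* feature required (ENOTSUP) */
}

int
main(void)
{
	printf("128K, feature off: %d\n", recordsize_ok(128 << 10, 0, 0));
	printf("1M,   feature on:  %d\n", recordsize_ok(1 << 20, 1, 0));
	printf("1M,   feature off: %d\n", recordsize_ok(1 << 20, 0, 0));
	printf("1M,   bootfs:      %d\n", recordsize_ok(1 << 20, 1, 1));
	return (0);
}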