diff options
author | Matthew Ahrens <[email protected]> | 2014-11-03 12:15:08 -0800 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2015-05-11 12:23:16 -0700 |
commit | f1512ee61e2f22186ac16481a09d86112b2d6788 (patch) | |
tree | 6098f841f27955f13d4c2a53b2a4826b5f3bacda /cmd | |
parent | 3df293404a102398445fc013b67250073db9004e (diff) |
Illumos 5027 - zfs large block support
5027 zfs large block support
Reviewed by: Alek Pinchuk <[email protected]>
Reviewed by: George Wilson <[email protected]>
Reviewed by: Josef 'Jeff' Sipek <[email protected]>
Reviewed by: Richard Elling <[email protected]>
Reviewed by: Saso Kiselkov <[email protected]>
Reviewed by: Brian Behlendorf <[email protected]>
Approved by: Dan McDonald <[email protected]>
References:
https://www.illumos.org/issues/5027
https://github.com/illumos/illumos-gate/commit/b515258
Porting Notes:
* Included in this patch is a tiny ISP2() cleanup in zio_init() from
Illumos 5255.
* Unlike the upstream Illumos commit this patch does not impose an
arbitrary 128K block size limit on volumes. Volumes, like filesystems,
are limited by the zfs_max_recordsize=1M module option.
* By default the maximum record size is limited to 1M by the module
option zfs_max_recordsize. This value may be safely increased up to
16M which is the largest block size supported by the on-disk format.
At the moment, 1M blocks clearly offer a significant performance
improvement but the benefits of going beyond this for the majority
of workloads are less clear.
* The illumos version of this patch increased DMU_MAX_ACCESS to 32M.
This was determined not to be large enough when using 16M blocks
because the zfs_make_xattrdir() function will fail (EFBIG) when
assigning a TX. This was immediately observed under Linux because
all newly created files must have a security xattr created and
that was failing. Therefore, we've set DMU_MAX_ACCESS to 64M.
* On 32-bit platforms a hard limit of 1M is set for blocks due
to the limited virtual address space. We should be able to relax
this one the ABD patches are merged.
Ported-by: Brian Behlendorf <[email protected]>
Closes #354
Diffstat (limited to 'cmd')
-rw-r--r-- | cmd/zdb/zdb.c | 32 | ||||
-rw-r--r-- | cmd/zfs/zfs_main.c | 11 | ||||
-rw-r--r-- | cmd/zstreamdump/zstreamdump.c | 19 | ||||
-rw-r--r-- | cmd/ztest/ztest.c | 13 |
4 files changed, 61 insertions, 14 deletions
diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 25f44212a..ab6155054 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -2185,6 +2185,8 @@ dump_label(const char *dev) (void) close(fd); } +static uint64_t num_large_blocks; + /*ARGSUSED*/ static int dump_one_dir(const char *dsname, void *arg) @@ -2197,6 +2199,8 @@ dump_one_dir(const char *dsname, void *arg) (void) printf("Could not open %s, error %d\n", dsname, error); return (0); } + if (dmu_objset_ds(os)->ds_large_blocks) + num_large_blocks++; dump_dir(os); dmu_objset_disown(os, FTAG); fuid_table_destroy(); @@ -2207,7 +2211,7 @@ dump_one_dir(const char *dsname, void *arg) /* * Block statistics. */ -#define PSIZE_HISTO_SIZE (SPA_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1) +#define PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2) typedef struct zdb_blkstats { uint64_t zb_asize; uint64_t zb_lsize; @@ -2273,7 +2277,15 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, zb->zb_lsize += BP_GET_LSIZE(bp); zb->zb_psize += BP_GET_PSIZE(bp); zb->zb_count++; - zb->zb_psize_histogram[BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT]++; + + /* + * The histogram is only big enough to record blocks up to + * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last, + * "other", bucket. + */ + int idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT; + idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1); + zb->zb_psize_histogram[idx]++; zb->zb_gangs += BP_COUNT_GANG(bp); @@ -2979,6 +2991,7 @@ dump_zpool(spa_t *spa) dump_metaslab_groups(spa); if (dump_opt['d'] || dump_opt['i']) { + uint64_t refcount; dump_dir(dp->dp_meta_objset); if (dump_opt['d'] >= 3) { dump_bpobj(&spa->spa_deferred_bpobj, @@ -2998,8 +3011,21 @@ dump_zpool(spa_t *spa) } (void) dmu_objset_find(spa_name(spa), dump_one_dir, NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); + + (void) feature_get_refcount(spa, + &spa_feature_table[SPA_FEATURE_LARGE_BLOCKS], &refcount); + if (num_large_blocks != refcount) { + (void) printf("large_blocks feature refcount mismatch: " + "expected %lld != actual %lld\n", + (longlong_t)num_large_blocks, + (longlong_t)refcount); + rc = 2; + } else { + (void) printf("Verified large_blocks feature refcount " + "is correct (%llu)\n", (longlong_t)refcount); + } } - if (dump_opt['b'] || dump_opt['c']) + if (rc == 0 && (dump_opt['b'] || dump_opt['c'])) rc = dump_block_stats(spa); if (rc == 0) diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 50ac59fba..32b9239b7 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -258,9 +258,9 @@ get_usage(zfs_help_t idx) case HELP_ROLLBACK: return (gettext("\trollback [-rRf] <snapshot>\n")); case HELP_SEND: - return (gettext("\tsend [-DnPpRrve] [-[iI] snapshot] " + return (gettext("\tsend [-DnPpRvLe] [-[iI] snapshot] " "<snapshot>\n" - "\tsend [-e] [-i snapshot|bookmark] " + "\tsend [-Le] [-i snapshot|bookmark] " "<filesystem|volume|snapshot>\n")); case HELP_SET: return (gettext("\tset <property=value> " @@ -3683,7 +3683,7 @@ zfs_do_send(int argc, char **argv) boolean_t extraverbose = B_FALSE; /* check options */ - while ((c = getopt(argc, argv, ":i:I:RDpvnPe")) != -1) { + while ((c = getopt(argc, argv, ":i:I:RDpvnPLe")) != -1) { switch (c) { case 'i': if (fromname) @@ -3718,6 +3718,9 @@ zfs_do_send(int argc, char **argv) case 'n': flags.dryrun = B_TRUE; break; + case 'L': + flags.largeblock = B_TRUE; + break; case 'e': flags.embed_data = B_TRUE; break; @@ -3774,6 +3777,8 @@ zfs_do_send(int argc, char **argv) if (zhp == NULL) return (1); + if (flags.largeblock) + lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK; if (flags.embed_data) lzc_flags |= LZC_SEND_FLAG_EMBED_DATA; diff --git a/cmd/zstreamdump/zstreamdump.c b/cmd/zstreamdump/zstreamdump.c index c51a9c806..176dd66b2 100644 --- a/cmd/zstreamdump/zstreamdump.c +++ b/cmd/zstreamdump/zstreamdump.c @@ -56,7 +56,6 @@ uint64_t total_stream_len = 0; FILE *send_stream = 0; boolean_t do_byteswap = B_FALSE; boolean_t do_cksum = B_TRUE; -#define INITIAL_BUFLEN (1<<20) static void usage(void) @@ -69,6 +68,18 @@ usage(void) exit(1); } +static void * +safe_malloc(size_t size) +{ + void *rv = malloc(size); + if (rv == NULL) { + (void) fprintf(stderr, "ERROR; failed to allocate %u bytes\n", + (unsigned)size); + abort(); + } + return (rv); +} + /* * ssread - send stream read. * @@ -160,7 +171,7 @@ print_block(char *buf, int length) int main(int argc, char *argv[]) { - char *buf = malloc(INITIAL_BUFLEN); + char *buf = safe_malloc(SPA_MAXBLOCKSIZE); uint64_t drr_record_count[DRR_NUMTYPES] = { 0 }; uint64_t total_records = 0; dmu_replay_record_t thedrr; @@ -308,9 +319,9 @@ main(int argc, char *argv[]) nvlist_t *nv; int sz = drr->drr_payloadlen; - if (sz > INITIAL_BUFLEN) { + if (sz > SPA_MAXBLOCKSIZE) { free(buf); - buf = malloc(sz); + buf = safe_malloc(sz); } (void) ssread(buf, sz, &zc); if (ferror(send_stream)) diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 0602a7ec5..6b939bdb6 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -1040,9 +1040,14 @@ ztest_spa_get_ashift(void) { static int ztest_random_blocksize(void) { - // Choose a block size >= the ashift. - uint64_t block_shift = - ztest_random(SPA_MAXBLOCKSHIFT - ztest_spa_get_ashift() + 1); + /* + * Choose a block size >= the ashift. + * If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks. + */ + int maxbs = SPA_OLD_MAXBLOCKSHIFT; + if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE) + maxbs = 20; + uint64_t block_shift = ztest_random(maxbs - ztest_spa_get_ashift() + 1); return (1 << (SPA_MINBLOCKSHIFT + block_shift)); } @@ -4972,7 +4977,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) char *path0; char *pathrand; size_t fsize; - int bshift = SPA_MAXBLOCKSHIFT + 2; /* don't scrog all labels */ + int bshift = SPA_OLD_MAXBLOCKSHIFT + 2; /* don't scrog all labels */ int iters = 1000; int maxfaults; int mirror_save; |