diff options
author | Brian Behlendorf <[email protected]> | 2016-11-30 14:48:16 -0700 |
---|---|---|
committer | GitHub <[email protected]> | 2016-11-30 14:48:16 -0700 |
commit | 7657defc48b7c47a8bf0c8f21c78783d293dc5ed (patch) | |
tree | ec6ebdcc7289bc707076205314737cf04fc3bfc0 /cmd | |
parent | ce43e88dd65509a4cf62c4acc76619e571d8518a (diff) | |
parent | 982957483450d53683681f456d1c84cfeb56afad (diff) |
Introduce ARC Buffer Data (ABD)
ZFS currently uses ARC buffers which are backed by virtual memory.
While functional, there are some major problems with this approach
which can be observed on all OpenZFS platforms. ABD was designed
to address these issues and includes contributions from OpenZFS
developers from multiple platforms.
While all OpenZFS platforms will benefit from ABD, this functionality
is critical for Linux. Unlike the other OpenZFS platforms, the Linux
kernel discourages extensive use of virtual memory. The provided
interfaces are not optimized for frequent allocations from the virtual
address space. To maintain good performance a kmem cache is
used which contains relatively long lived slabs backed by virtual
memory. The downside to the approach is that those slabs can
become highly fragmented resulting in an inefficient use of memory.
Another issue is that on 32-bit systems the available virtual
address space in the kernel is only a small fraction of total
system memory. This means the ARC size is highly constrained
which hurts performance and makes allocating memory difficult
and OOMs more likely.
ABD is designed to address these issues by using scatter lists
of pages for data buffers. This removes the need for slabs
which resolves the fragmentation issue. It also allows high
memory pages to be allocated which alleviates the virtual
address space pressure on 32-bit systems.
For metadata buffers, which are small, linear ABDs are allocated
from the slab. This is preferable because there are many places
in the code which expect to be able to read from a given offset
in the buffer. Using linear ABDs means none of that code needs
to be modified. The majority of these buffers are allocated with
kmalloc so there's minimal impact on the virtual address space.
Tested-by: Kash Pande <[email protected]>
Tested-by: kernelOfTruth <[email protected]>
Tested-by: RageLtMan <rageltman@sempervictus>
Tested-by: DHE <[email protected]>
Reviewed-by: Chunwei Chen <[email protected]>
Reviewed-by: Dan Kimmel <[email protected]>
Reviewed-by: David Quigley <[email protected]>
Reviewed-by: Gvozden Neskovic <[email protected]>
Reviewed-by: Tom Caputi <[email protected]>
Reviewed-by: Isaac Huang <[email protected]>
Reviewed-by: Jinshan Xiong <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Closes #3441
Closes #5135
Diffstat (limited to 'cmd')
-rw-r--r-- | cmd/raidz_test/raidz_bench.c | 8 | ||||
-rw-r--r-- | cmd/raidz_test/raidz_test.c | 53 | ||||
-rw-r--r-- | cmd/raidz_test/raidz_test.h | 6 | ||||
-rw-r--r-- | cmd/zdb/zdb.c | 48 | ||||
-rw-r--r-- | cmd/zdb/zdb_il.c | 59 | ||||
-rw-r--r-- | cmd/ztest/ztest.c | 18 |
6 files changed, 110 insertions, 82 deletions
diff --git a/cmd/raidz_test/raidz_bench.c b/cmd/raidz_test/raidz_bench.c index f1710ccc7..7a18902eb 100644 --- a/cmd/raidz_test/raidz_bench.c +++ b/cmd/raidz_test/raidz_bench.c @@ -53,18 +53,18 @@ bench_init_raidz_map(void) /* * To permit larger column sizes these have to be done - * allocated using aligned alloc instead of zio_data_buf_alloc + * allocated using aligned alloc instead of zio_abd_buf_alloc */ - zio_bench.io_data = raidz_alloc(max_data_size); + zio_bench.io_abd = raidz_alloc(max_data_size); - init_zio_data(&zio_bench); + init_zio_abd(&zio_bench); } static void bench_fini_raidz_maps(void) { /* tear down golden zio */ - raidz_free(zio_bench.io_data, max_data_size); + raidz_free(zio_bench.io_abd, max_data_size); bzero(&zio_bench, sizeof (zio_t)); } diff --git a/cmd/raidz_test/raidz_test.c b/cmd/raidz_test/raidz_test.c index 0019ae84a..3e0a089fd 100644 --- a/cmd/raidz_test/raidz_test.c +++ b/cmd/raidz_test/raidz_test.c @@ -181,10 +181,10 @@ static void process_options(int argc, char **argv) } } -#define DATA_COL(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_data) +#define DATA_COL(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_abd) #define DATA_COL_SIZE(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_size) -#define CODE_COL(rm, i) ((rm)->rm_col[(i)].rc_data) +#define CODE_COL(rm, i) ((rm)->rm_col[(i)].rc_abd) #define CODE_COL_SIZE(rm, i) ((rm)->rm_col[(i)].rc_size) static int @@ -195,10 +195,9 @@ cmp_code(raidz_test_opts_t *opts, const raidz_map_t *rm, const int parity) VERIFY(parity >= 1 && parity <= 3); for (i = 0; i < parity; i++) { - if (0 != memcmp(CODE_COL(rm, i), CODE_COL(opts->rm_golden, i), - CODE_COL_SIZE(rm, i))) { + if (abd_cmp(CODE_COL(rm, i), CODE_COL(opts->rm_golden, i)) + != 0) { ret++; - LOG_OPT(D_DEBUG, opts, "\nParity block [%d] different!\n", i); } @@ -213,8 +212,8 @@ cmp_data(raidz_test_opts_t *opts, raidz_map_t *rm) int dcols = opts->rm_golden->rm_cols - raidz_parity(opts->rm_golden); for (i = 0; i < dcols; i++) { - if (0 != 
memcmp(DATA_COL(opts->rm_golden, i), DATA_COL(rm, i), - DATA_COL_SIZE(opts->rm_golden, i))) { + if (abd_cmp(DATA_COL(opts->rm_golden, i), DATA_COL(rm, i)) + != 0) { ret++; LOG_OPT(D_DEBUG, opts, @@ -224,37 +223,41 @@ cmp_data(raidz_test_opts_t *opts, raidz_map_t *rm) return (ret); } +static int +init_rand(void *data, size_t size, void *private) +{ + int i; + int *dst = (int *) data; + + for (i = 0; i < size / sizeof (int); i++) + dst[i] = rand_data[i]; + + return (0); +} + static void corrupt_colums(raidz_map_t *rm, const int *tgts, const int cnt) { int i; - int *dst; raidz_col_t *col; for (i = 0; i < cnt; i++) { col = &rm->rm_col[tgts[i]]; - dst = col->rc_data; - for (i = 0; i < col->rc_size / sizeof (int); i++) - dst[i] = rand(); + abd_iterate_func(col->rc_abd, 0, col->rc_size, init_rand, NULL); } } void -init_zio_data(zio_t *zio) +init_zio_abd(zio_t *zio) { - int i; - int *dst = (int *) zio->io_data; - - for (i = 0; i < zio->io_size / sizeof (int); i++) { - dst[i] = rand_data[i]; - } + abd_iterate_func(zio->io_abd, 0, zio->io_size, init_rand, NULL); } static void fini_raidz_map(zio_t **zio, raidz_map_t **rm) { vdev_raidz_map_free(*rm); - raidz_free((*zio)->io_data, (*zio)->io_size); + raidz_free((*zio)->io_abd, (*zio)->io_size); umem_free(*zio, sizeof (zio_t)); *zio = NULL; @@ -279,11 +282,11 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const int parity) opts->zio_golden->io_offset = zio_test->io_offset = opts->rto_offset; opts->zio_golden->io_size = zio_test->io_size = opts->rto_dsize; - opts->zio_golden->io_data = raidz_alloc(opts->rto_dsize); - zio_test->io_data = raidz_alloc(opts->rto_dsize); + opts->zio_golden->io_abd = raidz_alloc(opts->rto_dsize); + zio_test->io_abd = raidz_alloc(opts->rto_dsize); - init_zio_data(opts->zio_golden); - init_zio_data(zio_test); + init_zio_abd(opts->zio_golden); + init_zio_abd(zio_test); VERIFY0(vdev_raidz_impl_set("original")); @@ -326,8 +329,8 @@ init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity) 
(*zio)->io_offset = 0; (*zio)->io_size = alloc_dsize; - (*zio)->io_data = raidz_alloc(alloc_dsize); - init_zio_data(*zio); + (*zio)->io_abd = raidz_alloc(alloc_dsize); + init_zio_abd(*zio); rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift, total_ncols, parity); diff --git a/cmd/raidz_test/raidz_test.h b/cmd/raidz_test/raidz_test.h index b279d82f2..a7fd26b8b 100644 --- a/cmd/raidz_test/raidz_test.h +++ b/cmd/raidz_test/raidz_test.h @@ -104,11 +104,11 @@ static inline size_t ilog2(size_t a) #define SEP "----------------\n" -#define raidz_alloc(size) zio_data_buf_alloc(size) -#define raidz_free(p, size) zio_data_buf_free(p, size) +#define raidz_alloc(size) abd_alloc(size, B_FALSE) +#define raidz_free(p, size) abd_free(p) -void init_zio_data(zio_t *zio); +void init_zio_abd(zio_t *zio); void run_raidz_benchmark(void); diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index b9b0b29bc..8379cec3e 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -59,6 +59,7 @@ #include <sys/arc.h> #include <sys/ddt.h> #include <sys/zfeature.h> +#include <sys/abd.h> #include <zfs_comutil.h> #include <libzfs.h> @@ -2464,7 +2465,7 @@ zdb_blkptr_done(zio_t *zio) zdb_cb_t *zcb = zio->io_private; zbookmark_phys_t *zb = &zio->io_bookmark; - zio_data_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_abd); mutex_enter(&spa->spa_scrub_lock); spa->spa_scrub_inflight--; @@ -2530,7 +2531,7 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, if (!BP_IS_EMBEDDED(bp) && (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) { size_t size = BP_GET_PSIZE(bp); - void *data = zio_data_buf_alloc(size); + abd_t *abd = abd_alloc(size, B_FALSE); int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW; /* If it's an intent log block, failure is expected. 
*/ @@ -2543,7 +2544,7 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, spa->spa_scrub_inflight++; mutex_exit(&spa->spa_scrub_lock); - zio_nowait(zio_read(NULL, spa, bp, data, size, + zio_nowait(zio_read(NULL, spa, bp, abd, size, zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb)); } @@ -3321,6 +3322,13 @@ name: return (NULL); } +/* ARGSUSED */ +static int +random_get_pseudo_bytes_cb(void *buf, size_t len, void *unused) +{ + return (random_get_pseudo_bytes(buf, len)); +} + /* * Read a block from a pool and print it out. The syntax of the * block descriptor is: @@ -3352,7 +3360,8 @@ zdb_read_block(char *thing, spa_t *spa) uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0; zio_t *zio; vdev_t *vd; - void *pbuf, *lbuf, *buf; + abd_t *pabd; + void *lbuf, *buf; char *s, *p, *dup, *vdev, *flagstr; int i, error; @@ -3425,8 +3434,7 @@ zdb_read_block(char *thing, spa_t *spa) psize = size; lsize = size; - /* Some 4K native devices require 4K buffer alignment */ - pbuf = umem_alloc_aligned(SPA_MAXBLOCKSIZE, PAGESIZE, UMEM_NOFAIL); + pabd = abd_alloc_linear(SPA_MAXBLOCKSIZE, B_FALSE); lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); BP_ZERO(bp); @@ -3454,15 +3462,15 @@ zdb_read_block(char *thing, spa_t *spa) /* * Treat this as a normal block read. */ - zio_nowait(zio_read(zio, spa, bp, pbuf, psize, NULL, NULL, + zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL)); } else { /* * Treat this as a vdev child I/O. 
*/ - zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pbuf, psize, - ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, + zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd, + psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL, NULL)); @@ -3485,13 +3493,13 @@ zdb_read_block(char *thing, spa_t *spa) void *pbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); - bcopy(pbuf, pbuf2, psize); + abd_copy_to_buf(pbuf2, pabd, psize); - VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf + psize, - SPA_MAXBLOCKSIZE - psize) == 0); + VERIFY0(abd_iterate_func(pabd, psize, SPA_MAXBLOCKSIZE - psize, + random_get_pseudo_bytes_cb, NULL)); - VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize, - SPA_MAXBLOCKSIZE - psize) == 0); + VERIFY0(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize, + SPA_MAXBLOCKSIZE - psize)); /* * XXX - On the one hand, with SPA_MAXBLOCKSIZE at 16MB, @@ -3506,10 +3514,10 @@ zdb_read_block(char *thing, spa_t *spa) "Trying %05llx -> %05llx (%s)\n", (u_longlong_t)psize, (u_longlong_t)lsize, zio_compress_table[c].ci_name); - if (zio_decompress_data(c, pbuf, lbuf, - psize, lsize) == 0 && - zio_decompress_data(c, pbuf2, lbuf2, - psize, lsize) == 0 && + if (zio_decompress_data(c, pabd, + lbuf, psize, lsize) == 0 && + zio_decompress_data_buf(c, pbuf2, + lbuf2, psize, lsize) == 0 && bcmp(lbuf, lbuf2, lsize) == 0) break; } @@ -3527,7 +3535,7 @@ zdb_read_block(char *thing, spa_t *spa) buf = lbuf; size = lsize; } else { - buf = pbuf; + buf = abd_to_buf(pabd); size = psize; } @@ -3545,7 +3553,7 @@ zdb_read_block(char *thing, spa_t *spa) zdb_dump_block(thing, buf, size, flags); out: - umem_free(pbuf, SPA_MAXBLOCKSIZE); + abd_free(pabd); umem_free(lbuf, SPA_MAXBLOCKSIZE); free(dup); } diff --git a/cmd/zdb/zdb_il.c b/cmd/zdb/zdb_il.c index 1501e879d..190bfee86 100644 --- a/cmd/zdb/zdb_il.c +++ 
b/cmd/zdb/zdb_il.c @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. */ /* @@ -42,6 +42,7 @@ #include <sys/resource.h> #include <sys/zil.h> #include <sys/zil_impl.h> +#include <sys/abd.h> extern uint8_t dump_opt[256]; @@ -120,13 +121,29 @@ zil_prt_rec_rename(zilog_t *zilog, int txtype, lr_rename_t *lr) } /* ARGSUSED */ +static int +zil_prt_rec_write_cb(void *data, size_t len, void *unused) +{ + char *cdata = data; + int i; + + for (i = 0; i < len; i++) { + if (isprint(*cdata)) + (void) printf("%c ", *cdata); + else + (void) printf("%2X", *cdata); + cdata++; + } + return (0); +} + +/* ARGSUSED */ static void zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr) { - char *data, *dlimit; + abd_t *data; blkptr_t *bp = &lr->lr_blkptr; zbookmark_phys_t zb; - char *buf; int verbose = MAX(dump_opt['d'], dump_opt['i']); int error; @@ -137,9 +154,6 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr) if (txtype == TX_WRITE2 || verbose < 5) return; - if ((buf = malloc(SPA_MAXBLOCKSIZE)) == NULL) - return; - if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { (void) printf("%shas blkptr, %s\n", prefix, !BP_IS_HOLE(bp) && @@ -150,43 +164,38 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr) if (BP_IS_HOLE(bp)) { (void) printf("\t\t\tLSIZE 0x%llx\n", (u_longlong_t)BP_GET_LSIZE(bp)); - bzero(buf, SPA_MAXBLOCKSIZE); (void) printf("%s<hole>\n", prefix); - goto exit; + return; } if (bp->blk_birth < zilog->zl_header->zh_claim_txg) { (void) printf("%s<block already committed>\n", prefix); - goto exit; + return; } SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); + data = abd_alloc(BP_GET_LSIZE(bp), B_FALSE); error = zio_wait(zio_read(NULL, zilog->zl_spa, - bp, buf, BP_GET_LSIZE(bp), NULL, NULL, + bp, data, BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb)); if (error) 
- goto exit; - data = buf; + goto out; } else { - data = (char *)(lr + 1); + /* data is stored after the end of the lr_write record */ + data = abd_alloc(lr->lr_length, B_FALSE); + abd_copy_from_buf(data, lr + 1, lr->lr_length); } - dlimit = data + MIN(lr->lr_length, - (verbose < 6 ? 20 : SPA_MAXBLOCKSIZE)); - (void) printf("%s", prefix); - while (data < dlimit) { - if (isprint(*data)) - (void) printf("%c ", *data); - else - (void) printf("%2hhX", *data); - data++; - } + (void) abd_iterate_func(data, + 0, MIN(lr->lr_length, (verbose < 6 ? 20 : SPA_MAXBLOCKSIZE)), + zil_prt_rec_write_cb, NULL); (void) printf("\n"); -exit: - free(buf); + +out: + abd_free(data); } /* ARGSUSED */ diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 2e4dae3a9..cab0ef734 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -114,6 +114,7 @@ #include <sys/refcount.h> #include <sys/zfeature.h> #include <sys/dsl_userhold.h> +#include <sys/abd.h> #include <stdio.h> #include <stdio_ext.h> #include <stdlib.h> @@ -193,6 +194,7 @@ extern uint64_t metaslab_gang_bang; extern uint64_t metaslab_df_alloc_threshold; extern int metaslab_preload_limit; extern boolean_t zfs_compressed_arc_enabled; +extern int zfs_abd_scatter_enabled; static ztest_shared_opts_t *ztest_shared_opts; static ztest_shared_opts_t ztest_opts; @@ -5444,7 +5446,7 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) enum zio_checksum checksum = spa_dedup_checksum(spa); dmu_buf_t *db; dmu_tx_t *tx; - void *buf; + abd_t *abd; blkptr_t blk; int copies = 2 * ZIO_DEDUPDITTO_MIN; int i; @@ -5525,14 +5527,14 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) * Damage the block. Dedup-ditto will save us when we read it later. 
*/ psize = BP_GET_PSIZE(&blk); - buf = zio_buf_alloc(psize); - ztest_pattern_set(buf, psize, ~pattern); + abd = abd_alloc_linear(psize, B_TRUE); + ztest_pattern_set(abd_to_buf(abd), psize, ~pattern); (void) zio_wait(zio_rewrite(NULL, spa, 0, &blk, - buf, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, + abd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL)); - zio_buf_free(buf, psize); + abd_free(abd); (void) rw_unlock(&ztest_name_lock); umem_free(od, sizeof (ztest_od_t)); @@ -5965,6 +5967,12 @@ ztest_resume_thread(void *arg) */ if (ztest_random(10) == 0) zfs_compressed_arc_enabled = ztest_random(2); + + /* + * Periodically change the zfs_abd_scatter_enabled setting. + */ + if (ztest_random(10) == 0) + zfs_abd_scatter_enabled = ztest_random(2); } thread_exit(); |