diff options
59 files changed, 5011 insertions, 2245 deletions
diff --git a/cmd/raidz_test/raidz_bench.c b/cmd/raidz_test/raidz_bench.c index f1710ccc7..7a18902eb 100644 --- a/cmd/raidz_test/raidz_bench.c +++ b/cmd/raidz_test/raidz_bench.c @@ -53,18 +53,18 @@ bench_init_raidz_map(void) /* * To permit larger column sizes these have to be done - * allocated using aligned alloc instead of zio_data_buf_alloc + * allocated using aligned alloc instead of zio_abd_buf_alloc */ - zio_bench.io_data = raidz_alloc(max_data_size); + zio_bench.io_abd = raidz_alloc(max_data_size); - init_zio_data(&zio_bench); + init_zio_abd(&zio_bench); } static void bench_fini_raidz_maps(void) { /* tear down golden zio */ - raidz_free(zio_bench.io_data, max_data_size); + raidz_free(zio_bench.io_abd, max_data_size); bzero(&zio_bench, sizeof (zio_t)); } diff --git a/cmd/raidz_test/raidz_test.c b/cmd/raidz_test/raidz_test.c index 0019ae84a..3e0a089fd 100644 --- a/cmd/raidz_test/raidz_test.c +++ b/cmd/raidz_test/raidz_test.c @@ -181,10 +181,10 @@ static void process_options(int argc, char **argv) } } -#define DATA_COL(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_data) +#define DATA_COL(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_abd) #define DATA_COL_SIZE(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_size) -#define CODE_COL(rm, i) ((rm)->rm_col[(i)].rc_data) +#define CODE_COL(rm, i) ((rm)->rm_col[(i)].rc_abd) #define CODE_COL_SIZE(rm, i) ((rm)->rm_col[(i)].rc_size) static int @@ -195,10 +195,9 @@ cmp_code(raidz_test_opts_t *opts, const raidz_map_t *rm, const int parity) VERIFY(parity >= 1 && parity <= 3); for (i = 0; i < parity; i++) { - if (0 != memcmp(CODE_COL(rm, i), CODE_COL(opts->rm_golden, i), - CODE_COL_SIZE(rm, i))) { + if (abd_cmp(CODE_COL(rm, i), CODE_COL(opts->rm_golden, i)) + != 0) { ret++; - LOG_OPT(D_DEBUG, opts, "\nParity block [%d] different!\n", i); } @@ -213,8 +212,8 @@ cmp_data(raidz_test_opts_t *opts, raidz_map_t *rm) int dcols = opts->rm_golden->rm_cols - raidz_parity(opts->rm_golden); for (i = 0; i < dcols; i++) { - if (0 != 
memcmp(DATA_COL(opts->rm_golden, i), DATA_COL(rm, i), - DATA_COL_SIZE(opts->rm_golden, i))) { + if (abd_cmp(DATA_COL(opts->rm_golden, i), DATA_COL(rm, i)) + != 0) { ret++; LOG_OPT(D_DEBUG, opts, @@ -224,37 +223,41 @@ cmp_data(raidz_test_opts_t *opts, raidz_map_t *rm) return (ret); } +static int +init_rand(void *data, size_t size, void *private) +{ + int i; + int *dst = (int *) data; + + for (i = 0; i < size / sizeof (int); i++) + dst[i] = rand_data[i]; + + return (0); +} + static void corrupt_colums(raidz_map_t *rm, const int *tgts, const int cnt) { int i; - int *dst; raidz_col_t *col; for (i = 0; i < cnt; i++) { col = &rm->rm_col[tgts[i]]; - dst = col->rc_data; - for (i = 0; i < col->rc_size / sizeof (int); i++) - dst[i] = rand(); + abd_iterate_func(col->rc_abd, 0, col->rc_size, init_rand, NULL); } } void -init_zio_data(zio_t *zio) +init_zio_abd(zio_t *zio) { - int i; - int *dst = (int *) zio->io_data; - - for (i = 0; i < zio->io_size / sizeof (int); i++) { - dst[i] = rand_data[i]; - } + abd_iterate_func(zio->io_abd, 0, zio->io_size, init_rand, NULL); } static void fini_raidz_map(zio_t **zio, raidz_map_t **rm) { vdev_raidz_map_free(*rm); - raidz_free((*zio)->io_data, (*zio)->io_size); + raidz_free((*zio)->io_abd, (*zio)->io_size); umem_free(*zio, sizeof (zio_t)); *zio = NULL; @@ -279,11 +282,11 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const int parity) opts->zio_golden->io_offset = zio_test->io_offset = opts->rto_offset; opts->zio_golden->io_size = zio_test->io_size = opts->rto_dsize; - opts->zio_golden->io_data = raidz_alloc(opts->rto_dsize); - zio_test->io_data = raidz_alloc(opts->rto_dsize); + opts->zio_golden->io_abd = raidz_alloc(opts->rto_dsize); + zio_test->io_abd = raidz_alloc(opts->rto_dsize); - init_zio_data(opts->zio_golden); - init_zio_data(zio_test); + init_zio_abd(opts->zio_golden); + init_zio_abd(zio_test); VERIFY0(vdev_raidz_impl_set("original")); @@ -326,8 +329,8 @@ init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity) 
(*zio)->io_offset = 0; (*zio)->io_size = alloc_dsize; - (*zio)->io_data = raidz_alloc(alloc_dsize); - init_zio_data(*zio); + (*zio)->io_abd = raidz_alloc(alloc_dsize); + init_zio_abd(*zio); rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift, total_ncols, parity); diff --git a/cmd/raidz_test/raidz_test.h b/cmd/raidz_test/raidz_test.h index b279d82f2..a7fd26b8b 100644 --- a/cmd/raidz_test/raidz_test.h +++ b/cmd/raidz_test/raidz_test.h @@ -104,11 +104,11 @@ static inline size_t ilog2(size_t a) #define SEP "----------------\n" -#define raidz_alloc(size) zio_data_buf_alloc(size) -#define raidz_free(p, size) zio_data_buf_free(p, size) +#define raidz_alloc(size) abd_alloc(size, B_FALSE) +#define raidz_free(p, size) abd_free(p) -void init_zio_data(zio_t *zio); +void init_zio_abd(zio_t *zio); void run_raidz_benchmark(void); diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index b9b0b29bc..8379cec3e 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -59,6 +59,7 @@ #include <sys/arc.h> #include <sys/ddt.h> #include <sys/zfeature.h> +#include <sys/abd.h> #include <zfs_comutil.h> #include <libzfs.h> @@ -2464,7 +2465,7 @@ zdb_blkptr_done(zio_t *zio) zdb_cb_t *zcb = zio->io_private; zbookmark_phys_t *zb = &zio->io_bookmark; - zio_data_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_abd); mutex_enter(&spa->spa_scrub_lock); spa->spa_scrub_inflight--; @@ -2530,7 +2531,7 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, if (!BP_IS_EMBEDDED(bp) && (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) { size_t size = BP_GET_PSIZE(bp); - void *data = zio_data_buf_alloc(size); + abd_t *abd = abd_alloc(size, B_FALSE); int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW; /* If it's an intent log block, failure is expected. 
*/ @@ -2543,7 +2544,7 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, spa->spa_scrub_inflight++; mutex_exit(&spa->spa_scrub_lock); - zio_nowait(zio_read(NULL, spa, bp, data, size, + zio_nowait(zio_read(NULL, spa, bp, abd, size, zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb)); } @@ -3321,6 +3322,13 @@ name: return (NULL); } +/* ARGSUSED */ +static int +random_get_pseudo_bytes_cb(void *buf, size_t len, void *unused) +{ + return (random_get_pseudo_bytes(buf, len)); +} + /* * Read a block from a pool and print it out. The syntax of the * block descriptor is: @@ -3352,7 +3360,8 @@ zdb_read_block(char *thing, spa_t *spa) uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0; zio_t *zio; vdev_t *vd; - void *pbuf, *lbuf, *buf; + abd_t *pabd; + void *lbuf, *buf; char *s, *p, *dup, *vdev, *flagstr; int i, error; @@ -3425,8 +3434,7 @@ zdb_read_block(char *thing, spa_t *spa) psize = size; lsize = size; - /* Some 4K native devices require 4K buffer alignment */ - pbuf = umem_alloc_aligned(SPA_MAXBLOCKSIZE, PAGESIZE, UMEM_NOFAIL); + pabd = abd_alloc_linear(SPA_MAXBLOCKSIZE, B_FALSE); lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); BP_ZERO(bp); @@ -3454,15 +3462,15 @@ zdb_read_block(char *thing, spa_t *spa) /* * Treat this as a normal block read. */ - zio_nowait(zio_read(zio, spa, bp, pbuf, psize, NULL, NULL, + zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL)); } else { /* * Treat this as a vdev child I/O. 
*/ - zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pbuf, psize, - ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, + zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd, + psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL, NULL)); @@ -3485,13 +3493,13 @@ zdb_read_block(char *thing, spa_t *spa) void *pbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); - bcopy(pbuf, pbuf2, psize); + abd_copy_to_buf(pbuf2, pabd, psize); - VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf + psize, - SPA_MAXBLOCKSIZE - psize) == 0); + VERIFY0(abd_iterate_func(pabd, psize, SPA_MAXBLOCKSIZE - psize, + random_get_pseudo_bytes_cb, NULL)); - VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize, - SPA_MAXBLOCKSIZE - psize) == 0); + VERIFY0(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize, + SPA_MAXBLOCKSIZE - psize)); /* * XXX - On the one hand, with SPA_MAXBLOCKSIZE at 16MB, @@ -3506,10 +3514,10 @@ zdb_read_block(char *thing, spa_t *spa) "Trying %05llx -> %05llx (%s)\n", (u_longlong_t)psize, (u_longlong_t)lsize, zio_compress_table[c].ci_name); - if (zio_decompress_data(c, pbuf, lbuf, - psize, lsize) == 0 && - zio_decompress_data(c, pbuf2, lbuf2, - psize, lsize) == 0 && + if (zio_decompress_data(c, pabd, + lbuf, psize, lsize) == 0 && + zio_decompress_data_buf(c, pbuf2, + lbuf2, psize, lsize) == 0 && bcmp(lbuf, lbuf2, lsize) == 0) break; } @@ -3527,7 +3535,7 @@ zdb_read_block(char *thing, spa_t *spa) buf = lbuf; size = lsize; } else { - buf = pbuf; + buf = abd_to_buf(pabd); size = psize; } @@ -3545,7 +3553,7 @@ zdb_read_block(char *thing, spa_t *spa) zdb_dump_block(thing, buf, size, flags); out: - umem_free(pbuf, SPA_MAXBLOCKSIZE); + abd_free(pabd); umem_free(lbuf, SPA_MAXBLOCKSIZE); free(dup); } diff --git a/cmd/zdb/zdb_il.c b/cmd/zdb/zdb_il.c index 1501e879d..190bfee86 100644 --- a/cmd/zdb/zdb_il.c +++ 
b/cmd/zdb/zdb_il.c @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. */ /* @@ -42,6 +42,7 @@ #include <sys/resource.h> #include <sys/zil.h> #include <sys/zil_impl.h> +#include <sys/abd.h> extern uint8_t dump_opt[256]; @@ -120,13 +121,29 @@ zil_prt_rec_rename(zilog_t *zilog, int txtype, lr_rename_t *lr) } /* ARGSUSED */ +static int +zil_prt_rec_write_cb(void *data, size_t len, void *unused) +{ + char *cdata = data; + int i; + + for (i = 0; i < len; i++) { + if (isprint(*cdata)) + (void) printf("%c ", *cdata); + else + (void) printf("%2X", *cdata); + cdata++; + } + return (0); +} + +/* ARGSUSED */ static void zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr) { - char *data, *dlimit; + abd_t *data; blkptr_t *bp = &lr->lr_blkptr; zbookmark_phys_t zb; - char *buf; int verbose = MAX(dump_opt['d'], dump_opt['i']); int error; @@ -137,9 +154,6 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr) if (txtype == TX_WRITE2 || verbose < 5) return; - if ((buf = malloc(SPA_MAXBLOCKSIZE)) == NULL) - return; - if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { (void) printf("%shas blkptr, %s\n", prefix, !BP_IS_HOLE(bp) && @@ -150,43 +164,38 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr) if (BP_IS_HOLE(bp)) { (void) printf("\t\t\tLSIZE 0x%llx\n", (u_longlong_t)BP_GET_LSIZE(bp)); - bzero(buf, SPA_MAXBLOCKSIZE); (void) printf("%s<hole>\n", prefix); - goto exit; + return; } if (bp->blk_birth < zilog->zl_header->zh_claim_txg) { (void) printf("%s<block already committed>\n", prefix); - goto exit; + return; } SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); + data = abd_alloc(BP_GET_LSIZE(bp), B_FALSE); error = zio_wait(zio_read(NULL, zilog->zl_spa, - bp, buf, BP_GET_LSIZE(bp), NULL, NULL, + bp, data, BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb)); if (error) 
- goto exit; - data = buf; + goto out; } else { - data = (char *)(lr + 1); + /* data is stored after the end of the lr_write record */ + data = abd_alloc(lr->lr_length, B_FALSE); + abd_copy_from_buf(data, lr + 1, lr->lr_length); } - dlimit = data + MIN(lr->lr_length, - (verbose < 6 ? 20 : SPA_MAXBLOCKSIZE)); - (void) printf("%s", prefix); - while (data < dlimit) { - if (isprint(*data)) - (void) printf("%c ", *data); - else - (void) printf("%2hhX", *data); - data++; - } + (void) abd_iterate_func(data, + 0, MIN(lr->lr_length, (verbose < 6 ? 20 : SPA_MAXBLOCKSIZE)), + zil_prt_rec_write_cb, NULL); (void) printf("\n"); -exit: - free(buf); + +out: + abd_free(data); } /* ARGSUSED */ diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 2e4dae3a9..cab0ef734 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -114,6 +114,7 @@ #include <sys/refcount.h> #include <sys/zfeature.h> #include <sys/dsl_userhold.h> +#include <sys/abd.h> #include <stdio.h> #include <stdio_ext.h> #include <stdlib.h> @@ -193,6 +194,7 @@ extern uint64_t metaslab_gang_bang; extern uint64_t metaslab_df_alloc_threshold; extern int metaslab_preload_limit; extern boolean_t zfs_compressed_arc_enabled; +extern int zfs_abd_scatter_enabled; static ztest_shared_opts_t *ztest_shared_opts; static ztest_shared_opts_t ztest_opts; @@ -5444,7 +5446,7 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) enum zio_checksum checksum = spa_dedup_checksum(spa); dmu_buf_t *db; dmu_tx_t *tx; - void *buf; + abd_t *abd; blkptr_t blk; int copies = 2 * ZIO_DEDUPDITTO_MIN; int i; @@ -5525,14 +5527,14 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) * Damage the block. Dedup-ditto will save us when we read it later. 
*/ psize = BP_GET_PSIZE(&blk); - buf = zio_buf_alloc(psize); - ztest_pattern_set(buf, psize, ~pattern); + abd = abd_alloc_linear(psize, B_TRUE); + ztest_pattern_set(abd_to_buf(abd), psize, ~pattern); (void) zio_wait(zio_rewrite(NULL, spa, 0, &blk, - buf, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, + abd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL)); - zio_buf_free(buf, psize); + abd_free(abd); (void) rw_unlock(&ztest_name_lock); umem_free(od, sizeof (ztest_od_t)); @@ -5965,6 +5967,12 @@ ztest_resume_thread(void *arg) */ if (ztest_random(10) == 0) zfs_compressed_arc_enabled = ztest_random(2); + + /* + * Periodically change the zfs_abd_scatter_enabled setting. + */ + if (ztest_random(10) == 0) + zfs_abd_scatter_enabled = ztest_random(2); } thread_exit(); diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am index 37df6e1d2..956643801 100644 --- a/include/sys/Makefile.am +++ b/include/sys/Makefile.am @@ -1,6 +1,7 @@ SUBDIRS = fm fs crypto sysevent COMMON_H = \ + $(top_srcdir)/include/sys/abd.h \ $(top_srcdir)/include/sys/arc.h \ $(top_srcdir)/include/sys/arc_impl.h \ $(top_srcdir)/include/sys/avl.h \ diff --git a/include/sys/abd.h b/include/sys/abd.h new file mode 100644 index 000000000..d2db7e199 --- /dev/null +++ b/include/sys/abd.h @@ -0,0 +1,179 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2016 by Delphix. All rights reserved. + */ + +#ifndef _ABD_H +#define _ABD_H + +#include <sys/isa_defs.h> +#include <sys/int_types.h> +#include <sys/debug.h> +#include <sys/refcount.h> +#ifdef _KERNEL +#include <linux/mm.h> +#include <linux/bio.h> +#include <sys/uio.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum abd_flags { + ABD_FLAG_LINEAR = 1 << 0, /* is buffer linear (or scattered)? */ + ABD_FLAG_OWNER = 1 << 1, /* does it own its data buffers? */ + ABD_FLAG_META = 1 << 2, /* does this represent FS metadata? */ + ABD_FLAG_MULTI_ZONE = 1 << 3, /* pages split over memory zones */ + ABD_FLAG_MULTI_CHUNK = 1 << 4 /* pages split over multiple chunks */ +} abd_flags_t; + +typedef struct abd { + abd_flags_t abd_flags; + uint_t abd_size; /* excludes scattered abd_offset */ + struct abd *abd_parent; + refcount_t abd_children; + union { + struct abd_scatter { + uint_t abd_offset; + uint_t abd_nents; + struct scatterlist *abd_sgl; + } abd_scatter; + struct abd_linear { + void *abd_buf; + } abd_linear; + } abd_u; +} abd_t; + +typedef int abd_iter_func_t(void *buf, size_t len, void *private); +typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *private); + +extern int zfs_abd_scatter_enabled; + +static inline boolean_t +abd_is_linear(abd_t *abd) +{ + return ((abd->abd_flags & ABD_FLAG_LINEAR) != 0); +} + +/* + * Allocations and deallocations + */ + +abd_t *abd_alloc(size_t, boolean_t); +abd_t *abd_alloc_linear(size_t, boolean_t); +abd_t *abd_alloc_for_io(size_t, boolean_t); +abd_t *abd_alloc_sametype(abd_t *, size_t); +void abd_free(abd_t *); +abd_t *abd_get_offset(abd_t *, size_t); +abd_t 
*abd_get_offset_size(abd_t *, size_t, size_t); +abd_t *abd_get_from_buf(void *, size_t); +void abd_put(abd_t *); + +/* + * Conversion to and from a normal buffer + */ + +void *abd_to_buf(abd_t *); +void *abd_borrow_buf(abd_t *, size_t); +void *abd_borrow_buf_copy(abd_t *, size_t); +void abd_return_buf(abd_t *, void *, size_t); +void abd_return_buf_copy(abd_t *, void *, size_t); +void abd_take_ownership_of_buf(abd_t *, boolean_t); +void abd_release_ownership_of_buf(abd_t *); + +/* + * ABD operations + */ + +int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *); +int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t, + abd_iter_func2_t *, void *); +void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t); +void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t); +void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t); +int abd_cmp(abd_t *, abd_t *); +int abd_cmp_buf_off(abd_t *, const void *, size_t, size_t); +void abd_zero_off(abd_t *, size_t, size_t); + +#if defined(_KERNEL) && defined(HAVE_SPL) +unsigned int abd_scatter_bio_map_off(struct bio *, abd_t *, unsigned int, + size_t); +unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t); +#endif + +void abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, + ssize_t csize, ssize_t dsize, const unsigned parity, + void (*func_raidz_gen)(void **, const void *, size_t, size_t)); +void abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, + ssize_t tsize, const unsigned parity, + void (*func_raidz_rec)(void **t, const size_t tsize, void **c, + const unsigned *mul), + const unsigned *mul); + +/* + * Wrappers for calls with offsets of 0 + */ + +static inline void +abd_copy(abd_t *dabd, abd_t *sabd, size_t size) +{ + abd_copy_off(dabd, sabd, 0, 0, size); +} + +static inline void +abd_copy_from_buf(abd_t *abd, void *buf, size_t size) +{ + abd_copy_from_buf_off(abd, buf, 0, size); +} + +static inline void +abd_copy_to_buf(void* buf, abd_t *abd, size_t size) +{ + 
abd_copy_to_buf_off(buf, abd, 0, size); +} + +static inline int +abd_cmp_buf(abd_t *abd, void *buf, size_t size) +{ + return (abd_cmp_buf_off(abd, buf, 0, size)); +} + +static inline void +abd_zero(abd_t *abd, size_t size) +{ + abd_zero_off(abd, 0, size); +} + +/* + * Module lifecycle + */ + +void abd_init(void); +void abd_fini(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _ABD_H */ diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index d2dc527fe..f5b7cb42a 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -166,7 +166,7 @@ typedef struct l1arc_buf_hdr { refcount_t b_refcnt; arc_callback_t *b_acb; - void *b_pdata; + abd_t *b_pabd; } l1arc_buf_hdr_t; typedef struct l2arc_dev { diff --git a/include/sys/ddt.h b/include/sys/ddt.h index 3befcb844..667795f96 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2016 by Delphix. All rights reserved. */ #ifndef _SYS_DDT_H @@ -35,6 +36,8 @@ extern "C" { #endif +struct abd; + /* * On-disk DDT formats, in the desired search order (newest version first). */ @@ -108,7 +111,7 @@ struct ddt_entry { ddt_key_t dde_key; ddt_phys_t dde_phys[DDT_PHYS_TYPES]; zio_t *dde_lead_zio[DDT_PHYS_TYPES]; - void *dde_repair_data; + struct abd *dde_repair_abd; enum ddt_type dde_type; enum ddt_class dde_class; uint8_t dde_loading; diff --git a/include/sys/spa.h b/include/sys/spa.h index 3d0b962e6..d679e53d6 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -416,15 +416,17 @@ _NOTE(CONSTCOND) } while (0) #define BP_GET_FILL(bp) (BP_IS_EMBEDDED(bp) ? 1 : (bp)->blk_fill) +#define BP_IS_METADATA(bp) \ + (BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) + #define BP_GET_ASIZE(bp) \ (BP_IS_EMBEDDED(bp) ? 
0 : \ DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ DVA_GET_ASIZE(&(bp)->blk_dva[2])) -#define BP_GET_UCSIZE(bp) \ - ((BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) ? \ - BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)) +#define BP_GET_UCSIZE(bp) \ + (BP_IS_METADATA(bp) ? BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)) #define BP_GET_NDVAS(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ @@ -569,8 +571,7 @@ _NOTE(CONSTCOND) } while (0) } #define BP_GET_BUFC_TYPE(bp) \ - (((BP_GET_LEVEL(bp) > 0) || (DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))) ? \ - ARC_BUFC_METADATA : ARC_BUFC_DATA) + (BP_IS_METADATA(bp) ? ARC_BUFC_METADATA : ARC_BUFC_DATA) typedef enum spa_import_type { SPA_IMPORT_EXISTING, @@ -585,7 +586,6 @@ extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot, size_t buflen); extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props, nvlist_t *zplprops); -extern int spa_import_rootpool(char *devpath, char *devid); extern int spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags); extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index b9a2d181b..d7f11a2b8 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -53,6 +53,7 @@ extern "C" { typedef struct vdev_queue vdev_queue_t; typedef struct vdev_cache vdev_cache_t; typedef struct vdev_cache_entry vdev_cache_entry_t; +struct abd; extern int zfs_vdev_queue_depth_pct; extern uint32_t zfs_vdev_async_write_max_active; @@ -87,7 +88,7 @@ typedef const struct vdev_ops { * Virtual device properties */ struct vdev_cache_entry { - char *ve_data; + struct abd *ve_abd; uint64_t ve_offset; clock_t ve_lastused; avl_node_t ve_offset_node; diff --git a/include/sys/vdev_raidz_impl.h b/include/sys/vdev_raidz_impl.h index 735b67764..b4663b97c 100644 --- a/include/sys/vdev_raidz_impl.h +++ b/include/sys/vdev_raidz_impl.h @@ -28,6 +28,7 @@ #include <sys/types.h> #include <sys/debug.h> #include 
<sys/kstat.h> +#include <sys/abd.h> #ifdef __cplusplus extern "C" { @@ -104,7 +105,7 @@ typedef struct raidz_col { size_t rc_devidx; /* child device index for I/O */ size_t rc_offset; /* device offset */ size_t rc_size; /* I/O size */ - void *rc_data; /* I/O data */ + abd_t *rc_abd; /* I/O data */ void *rc_gdata; /* used to store the "good" version */ int rc_error; /* I/O error for this device */ unsigned int rc_tried; /* Did we attempt this I/O column? */ @@ -121,7 +122,7 @@ typedef struct raidz_map { size_t rm_firstdatacol; /* First data column/parity count */ size_t rm_nskip; /* Skipped sectors for padding */ size_t rm_skipstart; /* Column index of padding start */ - void *rm_datacopy; /* rm_asize-buffer of copied data */ + abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */ size_t rm_reports; /* # of referencing checksum reports */ unsigned int rm_freed; /* map no longer has referencing ZIO */ unsigned int rm_ecksuminjected; /* checksum error was injected */ diff --git a/include/sys/zio.h b/include/sys/zio.h index 864e8b2be..6c5153dcf 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -301,6 +301,7 @@ typedef void zio_cksum_free_f(void *cbdata, size_t size); struct zio_bad_cksum; /* defined in zio_checksum.h */ struct dnode_phys; +struct abd; struct zio_cksum_report { struct zio_cksum_report *zcr_next; @@ -333,12 +334,12 @@ typedef struct zio_gang_node { } zio_gang_node_t; typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp, - zio_gang_node_t *gn, void *data); + zio_gang_node_t *gn, struct abd *data, uint64_t offset); -typedef void zio_transform_func_t(zio_t *zio, void *data, uint64_t size); +typedef void zio_transform_func_t(zio_t *zio, struct abd *data, uint64_t size); typedef struct zio_transform { - void *zt_orig_data; + struct abd *zt_orig_abd; uint64_t zt_orig_size; uint64_t zt_bufsize; zio_transform_func_t *zt_transform; @@ -396,8 +397,8 @@ struct zio { uint64_t io_lsize; /* Data represented by this I/O */ - void *io_data; - void 
*io_orig_data; + struct abd *io_abd; + struct abd *io_orig_abd; uint64_t io_size; uint64_t io_orig_size; @@ -455,19 +456,19 @@ extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, extern zio_t *zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags); -extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data, - uint64_t lsize, zio_done_func_t *done, void *private, +extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, + struct abd *data, uint64_t lsize, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb); extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - void *data, uint64_t size, uint64_t psize, const zio_prop_t *zp, + struct abd *data, uint64_t size, uint64_t psize, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *children_ready, zio_done_func_t *physdone, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb); extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - void *data, uint64_t size, zio_done_func_t *done, void *private, + struct abd *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb); extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, @@ -483,12 +484,12 @@ extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, zio_done_func_t *done, void *private, enum zio_flag flags); extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, - uint64_t size, void *data, int checksum, + uint64_t size, struct abd *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels); extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, - uint64_t size, void *data, int checksum, + uint64_t size, struct abd *data, int checksum, zio_done_func_t 
*done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels); @@ -517,21 +518,20 @@ extern void *zio_buf_alloc(size_t size); extern void zio_buf_free(void *buf, size_t size); extern void *zio_data_buf_alloc(size_t size); extern void zio_data_buf_free(void *buf, size_t size); -extern void *zio_buf_alloc_flags(size_t size, int flags); -extern void zio_push_transform(zio_t *zio, void *data, uint64_t size, +extern void zio_push_transform(zio_t *zio, struct abd *abd, uint64_t size, uint64_t bufsize, zio_transform_func_t *transform); extern void zio_pop_transforms(zio_t *zio); extern void zio_resubmit_stage_async(void *); extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, - uint64_t offset, void *data, uint64_t size, int type, + uint64_t offset, struct abd *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private); extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, - void *data, uint64_t size, int type, zio_priority_t priority, + struct abd *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private); extern void zio_vdev_io_bypass(zio_t *zio); diff --git a/include/sys/zio_checksum.h b/include/sys/zio_checksum.h index b4c2c8c08..a6cafc9b2 100644 --- a/include/sys/zio_checksum.h +++ b/include/sys/zio_checksum.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014, 2015 by Delphix. All rights reserved. + * Copyright (c) 2014, 2016 by Delphix. All rights reserved. * Copyright Saso Kiselkov 2013, All rights reserved. */ @@ -34,12 +34,12 @@ extern "C" { #endif +struct abd; + /* * Signature for checksum functions. 
*/ -typedef void zio_checksum_func_t(const void *, uint64_t, const void *, - zio_cksum_t *); -typedef void zio_checksum_t(const void *data, uint64_t size, +typedef void zio_checksum_t(struct abd *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp); typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt); typedef void zio_checksum_tmpl_free_t(void *ctx_template); @@ -83,28 +83,28 @@ extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS]; /* * Checksum routines. */ -extern zio_checksum_t zio_checksum_SHA256; -extern zio_checksum_t zio_checksum_SHA512_native; -extern zio_checksum_t zio_checksum_SHA512_byteswap; +extern zio_checksum_t abd_checksum_SHA256; +extern zio_checksum_t abd_checksum_SHA512_native; +extern zio_checksum_t abd_checksum_SHA512_byteswap; /* Skein */ -extern zio_checksum_t zio_checksum_skein_native; -extern zio_checksum_t zio_checksum_skein_byteswap; -extern zio_checksum_tmpl_init_t zio_checksum_skein_tmpl_init; -extern zio_checksum_tmpl_free_t zio_checksum_skein_tmpl_free; +extern zio_checksum_t abd_checksum_skein_native; +extern zio_checksum_t abd_checksum_skein_byteswap; +extern zio_checksum_tmpl_init_t abd_checksum_skein_tmpl_init; +extern zio_checksum_tmpl_free_t abd_checksum_skein_tmpl_free; /* Edon-R */ -extern zio_checksum_t zio_checksum_edonr_native; -extern zio_checksum_t zio_checksum_edonr_byteswap; -extern zio_checksum_tmpl_init_t zio_checksum_edonr_tmpl_init; -extern zio_checksum_tmpl_free_t zio_checksum_edonr_tmpl_free; +extern zio_checksum_t abd_checksum_edonr_native; +extern zio_checksum_t abd_checksum_edonr_byteswap; +extern zio_checksum_tmpl_init_t abd_checksum_edonr_tmpl_init; +extern zio_checksum_tmpl_free_t abd_checksum_edonr_tmpl_free; extern int zio_checksum_equal(spa_t *, blkptr_t *, enum zio_checksum, void *, uint64_t, uint64_t, zio_bad_cksum_t *); -extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, - void *data, uint64_t size); +extern void 
zio_checksum_compute(zio_t *, enum zio_checksum, + struct abd *, uint64_t); extern int zio_checksum_error_impl(spa_t *, blkptr_t *, enum zio_checksum, - void *, uint64_t, uint64_t, zio_bad_cksum_t *); + struct abd *, uint64_t, uint64_t, zio_bad_cksum_t *); extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out); extern enum zio_checksum spa_dedup_checksum(spa_t *spa); extern void zio_checksum_templates_free(spa_t *spa); diff --git a/include/sys/zio_compress.h b/include/sys/zio_compress.h index da58ef7aa..1642823d3 100644 --- a/include/sys/zio_compress.h +++ b/include/sys/zio_compress.h @@ -22,12 +22,14 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright (c) 2015 by Delphix. All rights reserved. + * Copyright (c) 2015, 2016 by Delphix. All rights reserved. */ #ifndef _SYS_ZIO_COMPRESS_H #define _SYS_ZIO_COMPRESS_H +#include <sys/abd.h> + #ifdef __cplusplus extern "C" { #endif @@ -60,13 +62,20 @@ typedef int zio_decompress_func_t(void *src, void *dst, size_t s_len, size_t d_len, int); /* + * Common signature for all zio decompress functions using an ABD as input. + * This is helpful if you have both compressed ARC and scatter ABDs enabled, + * but is not a requirement for all compression algorithms. + */ +typedef int zio_decompress_abd_func_t(abd_t *src, void *dst, + size_t s_len, size_t d_len, int); +/* * Information about each compression function. 
*/ typedef const struct zio_compress_info { - zio_compress_func_t *ci_compress; /* compression function */ - zio_decompress_func_t *ci_decompress; /* decompression function */ - int ci_level; /* level parameter */ - char *ci_name; /* algorithm name */ + char *ci_name; + int ci_level; + zio_compress_func_t *ci_compress; + zio_decompress_func_t *ci_decompress; } zio_compress_info_t; extern zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS]; @@ -96,13 +105,16 @@ extern size_t lz4_compress_zfs(void *src, void *dst, size_t s_len, size_t d_len, int level); extern int lz4_decompress_zfs(void *src, void *dst, size_t s_len, size_t d_len, int level); - +extern int lz4_decompress_abd(abd_t *src, void *dst, size_t s_len, size_t d_len, + int level); /* * Compress and decompress data if necessary. */ -extern size_t zio_compress_data(enum zio_compress c, void *src, void *dst, +extern size_t zio_compress_data(enum zio_compress c, abd_t *src, void *dst, size_t s_len); -extern int zio_decompress_data(enum zio_compress c, void *src, void *dst, +extern int zio_decompress_data(enum zio_compress c, abd_t *src, void *dst, + size_t s_len, size_t d_len); +extern int zio_decompress_data_buf(enum zio_compress c, void *src, void *dst, size_t s_len, size_t d_len); #ifdef __cplusplus diff --git a/include/zfs_fletcher.h b/include/zfs_fletcher.h index 633606d14..5c7a61c56 100644 --- a/include/zfs_fletcher.h +++ b/include/zfs_fletcher.h @@ -48,15 +48,16 @@ extern "C" { * checksum method is added. This method will ignore last (size % 4) bytes of * the data buffer. 
*/ +void fletcher_init(zio_cksum_t *); void fletcher_2_native(const void *, uint64_t, const void *, zio_cksum_t *); void fletcher_2_byteswap(const void *, uint64_t, const void *, zio_cksum_t *); void fletcher_4_native(const void *, uint64_t, const void *, zio_cksum_t *); +int fletcher_2_incremental_native(void *, size_t, void *); +int fletcher_2_incremental_byteswap(void *, size_t, void *); void fletcher_4_native_varsize(const void *, uint64_t, zio_cksum_t *); void fletcher_4_byteswap(const void *, uint64_t, const void *, zio_cksum_t *); -void fletcher_4_incremental_native(const void *, uint64_t, - zio_cksum_t *); -void fletcher_4_incremental_byteswap(const void *, uint64_t, - zio_cksum_t *); +int fletcher_4_incremental_native(void *, size_t, void *); +int fletcher_4_incremental_byteswap(void *, size_t, void *); int fletcher_4_impl_set(const char *selector); void fletcher_4_init(void); void fletcher_4_fini(void); diff --git a/lib/libspl/Makefile.am b/lib/libspl/Makefile.am index afd64fcca..3c99529f1 100644 --- a/lib/libspl/Makefile.am +++ b/lib/libspl/Makefile.am @@ -24,6 +24,7 @@ USER_C = \ getmntany.c \ list.c \ mkdirp.c \ + page.c \ strlcat.c \ strlcpy.c \ strnlen.c \ diff --git a/lib/libspl/include/sys/param.h b/lib/libspl/include/sys/param.h index 9f362dd8b..c22d508f9 100644 --- a/lib/libspl/include/sys/param.h +++ b/lib/libspl/include/sys/param.h @@ -57,8 +57,11 @@ #define MAXUID UINT32_MAX /* max user id */ #define MAXPROJID MAXUID /* max project id */ -#ifndef PAGESIZE -#define PAGESIZE (sysconf(_SC_PAGESIZE)) +#ifdef PAGESIZE +#undef PAGESIZE #endif /* PAGESIZE */ +extern size_t spl_pagesize(void); +#define PAGESIZE (spl_pagesize()) + #endif diff --git a/lib/libspl/page.c b/lib/libspl/page.c new file mode 100644 index 000000000..06d9fcfa0 --- /dev/null +++ b/lib/libspl/page.c @@ -0,0 +1,34 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the 
"License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include <unistd.h> + +size_t pagesize = 0; + +size_t +spl_pagesize(void) +{ + if (pagesize == 0) + pagesize = sysconf(_SC_PAGESIZE); + + return (pagesize); +} diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index 8926d1173..2334245c1 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -366,11 +366,12 @@ cksummer(void *arg) if (ZIO_CHECKSUM_EQUAL(drrw->drr_key.ddk_cksum, zero_cksum) || !DRR_IS_DEDUP_CAPABLE(drrw->drr_checksumflags)) { - SHA256_CTX ctx; + SHA2_CTX ctx; zio_cksum_t tmpsha256; - zio_checksum_SHA256(buf, - payload_size, &ctx, &tmpsha256); + SHA2Init(SHA256, &ctx); + SHA2Update(&ctx, buf, payload_size); + SHA2Final(&tmpsha256, &ctx); drrw->drr_key.ddk_cksum.zc_word[0] = BE_64(tmpsha256.zc_word[0]); diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index b02555708..40c460284 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -33,6 +33,7 @@ KERNEL_C = \ zfs_uio.c \ zpool_prop.c \ zprop_common.c \ + abd.c \ arc.c \ blkptr.c \ bplist.c \ diff --git a/module/zcommon/zfs_fletcher.c b/module/zcommon/zfs_fletcher.c index 9c2f9c00f..fb0a14991 100644 --- a/module/zcommon/zfs_fletcher.c +++ b/module/zcommon/zfs_fletcher.c @@ -28,6 +28,10 @@ */ /* + * Copyright (c) 2016 by Delphix. 
All rights reserved. + */ + +/* * Fletcher Checksums * ------------------ * @@ -219,14 +223,26 @@ static boolean_t fletcher_4_initialized = B_FALSE; /*ARGSUSED*/ void -fletcher_2_native(const void *buf, uint64_t size, - const void *ctx_template, zio_cksum_t *zcp) +fletcher_init(zio_cksum_t *zcp) +{ + ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); +} + +int +fletcher_2_incremental_native(void *buf, size_t size, void *data) { + zio_cksum_t *zcp = data; + const uint64_t *ip = buf; const uint64_t *ipend = ip + (size / sizeof (uint64_t)); uint64_t a0, b0, a1, b1; - for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) { + a0 = zcp->zc_word[0]; + a1 = zcp->zc_word[1]; + b0 = zcp->zc_word[2]; + b1 = zcp->zc_word[3]; + + for (; ip < ipend; ip += 2) { a0 += ip[0]; a1 += ip[1]; b0 += a0; @@ -234,18 +250,33 @@ fletcher_2_native(const void *buf, uint64_t size, } ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); + return (0); } /*ARGSUSED*/ void -fletcher_2_byteswap(const void *buf, uint64_t size, +fletcher_2_native(const void *buf, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { + fletcher_init(zcp); + (void) fletcher_2_incremental_native((void *) buf, size, zcp); +} + +int +fletcher_2_incremental_byteswap(void *buf, size_t size, void *data) +{ + zio_cksum_t *zcp = data; + const uint64_t *ip = buf; const uint64_t *ipend = ip + (size / sizeof (uint64_t)); uint64_t a0, b0, a1, b1; - for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) { + a0 = zcp->zc_word[0]; + a1 = zcp->zc_word[1]; + b0 = zcp->zc_word[2]; + b1 = zcp->zc_word[3]; + + for (; ip < ipend; ip += 2) { a0 += BSWAP_64(ip[0]); a1 += BSWAP_64(ip[1]); b0 += a0; @@ -253,6 +284,16 @@ fletcher_2_byteswap(const void *buf, uint64_t size, } ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); + return (0); +} + +/*ARGSUSED*/ +void +fletcher_2_byteswap(const void *buf, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + fletcher_init(zcp); + (void) fletcher_2_incremental_byteswap((void *) buf, size, zcp); } static void @@ -523,25 +564,28 @@ 
fletcher_4_incremental_impl(boolean_t native, const void *buf, uint64_t size, } } -void -fletcher_4_incremental_native(const void *buf, uint64_t size, zio_cksum_t *zcp) +int +fletcher_4_incremental_native(void *buf, size_t size, void *data) { + zio_cksum_t *zcp = data; /* Use scalar impl to directly update cksum of small blocks */ if (size < SPA_MINBLOCKSIZE) fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size); else fletcher_4_incremental_impl(B_TRUE, buf, size, zcp); + return (0); } -void -fletcher_4_incremental_byteswap(const void *buf, uint64_t size, - zio_cksum_t *zcp) +int +fletcher_4_incremental_byteswap(void *buf, size_t size, void *data) { + zio_cksum_t *zcp = data; /* Use scalar impl to directly update cksum of small blocks */ if (size < SPA_MINBLOCKSIZE) fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp, buf, size); else fletcher_4_incremental_impl(B_FALSE, buf, size, zcp); + return (0); } @@ -607,6 +651,9 @@ fletcher_4_kstat_addr(kstat_t *ksp, loff_t n) #define FLETCHER_4_BENCH_NS (MSEC2NSEC(50)) /* 50ms */ +typedef void fletcher_checksum_func_t(const void *, uint64_t, const void *, + zio_cksum_t *); + static void fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size) { @@ -618,8 +665,9 @@ fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size) zio_cksum_t zc; uint32_t i, l, sel_save = IMPL_READ(fletcher_4_impl_chosen); - zio_checksum_func_t *fletcher_4_test = native ? fletcher_4_native : - fletcher_4_byteswap; + + fletcher_checksum_func_t *fletcher_4_test = native ? 
+ fletcher_4_native : fletcher_4_byteswap; for (i = 0; i < fletcher_4_supp_impls_cnt; i++) { struct fletcher_4_kstat *stat = &fletcher_4_stat_data[i]; @@ -769,6 +817,9 @@ module_param_call(zfs_fletcher_4_impl, fletcher_4_param_set, fletcher_4_param_get, NULL, 0644); MODULE_PARM_DESC(zfs_fletcher_4_impl, "Select fletcher 4 implementation."); +EXPORT_SYMBOL(fletcher_init); +EXPORT_SYMBOL(fletcher_2_incremental_native); +EXPORT_SYMBOL(fletcher_2_incremental_byteswap); EXPORT_SYMBOL(fletcher_4_init); EXPORT_SYMBOL(fletcher_4_fini); EXPORT_SYMBOL(fletcher_2_native); diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index 5ad319f32..6712b9b3c 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -7,6 +7,7 @@ EXTRA_CFLAGS = $(ZFS_MODULE_CFLAGS) @KERNELCPPFLAGS@ obj-$(CONFIG_ZFS) := $(MODULE).o +$(MODULE)-objs += abd.o $(MODULE)-objs += arc.o $(MODULE)-objs += blkptr.o $(MODULE)-objs += bplist.o diff --git a/module/zfs/abd.c b/module/zfs/abd.c new file mode 100644 index 000000000..ffee9a5f8 --- /dev/null +++ b/module/zfs/abd.c @@ -0,0 +1,1543 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. 
+ * Copyright (c) 2016 by Delphix. All rights reserved. + */ + +/* + * ARC buffer data (ABD). + * + * ABDs are an abstract data structure for the ARC which can use two + * different ways of storing the underlying data: + * + * (a) Linear buffer. In this case, all the data in the ABD is stored in one + * contiguous buffer in memory (from a zio_[data_]buf_* kmem cache). + * + * +-------------------+ + * | ABD (linear) | + * | abd_flags = ... | + * | abd_size = ... | +--------------------------------+ + * | abd_buf ------------->| raw buffer of size abd_size | + * +-------------------+ +--------------------------------+ + * no abd_chunks + * + * (b) Scattered buffer. In this case, the data in the ABD is split into + * equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers + * to the chunks recorded in an array at the end of the ABD structure. + * + * +-------------------+ + * | ABD (scattered) | + * | abd_flags = ... | + * | abd_size = ... | + * | abd_offset = 0 | +-----------+ + * | abd_chunks[0] ----------------------------->| chunk 0 | + * | abd_chunks[1] ---------------------+ +-----------+ + * | ... | | +-----------+ + * | abd_chunks[N-1] ---------+ +------->| chunk 1 | + * +-------------------+ | +-----------+ + * | ... + * | +-----------+ + * +----------------->| chunk N-1 | + * +-----------+ + * + * Linear buffers act exactly like normal buffers and are always mapped into the + * kernel's virtual memory space, while scattered ABD data chunks are allocated + * as physical pages and then mapped in only while they are actually being + * accessed through one of the abd_* library functions. Using scattered ABDs + * provides several benefits: + * + * (1) They avoid use of kmem_*, preventing performance problems where running + * kmem_reap on very large memory systems never finishes and causes + * constant TLB shootdowns. 
+ * + * (2) Fragmentation is less of an issue since when we are at the limit of + * allocatable space, we won't have to search around for a long free + * hole in the VA space for large ARC allocations. Each chunk is mapped in + * individually, so even if we weren't using segkpm (see next point) we + * wouldn't need to worry about finding a contiguous address range. + * + * (3) Use of segkpm will avoid the need for map / unmap / TLB shootdown costs + * on each ABD access. (If segkpm isn't available then we use all linear + * ABDs to avoid this penalty.) See seg_kpm.c for more details. + * + * It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to + * B_FALSE. However, it is not possible to use scattered ABDs if segkpm is not + * available, which is the case on all 32-bit systems and any 64-bit systems + * where kpm_enable is turned off. + * + * In addition to directly allocating a linear or scattered ABD, it is also + * possible to create an ABD by requesting the "sub-ABD" starting at an offset + * within an existing ABD. In linear buffers this is simple (set abd_buf of + * the new ABD to the starting point within the original raw buffer), but + * scattered ABDs are a little more complex. The new ABD makes a copy of the + * relevant abd_chunks pointers (but not the underlying data). However, to + * provide arbitrary rather than only chunk-aligned starting offsets, it also + * tracks an abd_offset field which represents the starting point of the data + * within the first chunk in abd_chunks. For both linear and scattered ABDs, + * creating an offset ABD marks the original ABD as the offset's parent, and the + * original ABD's abd_children refcount is incremented. This data allows us to + * ensure the root ABD isn't deleted before its children. 
+ * + * Most consumers should never need to know what type of ABD they're using -- + * the ABD public API ensures that it's possible to transparently switch from + * using a linear ABD to a scattered one when doing so would be beneficial. + * + * If you need to use the data within an ABD directly, if you know it's linear + * (because you allocated it) you can use abd_to_buf() to access the underlying + * raw buffer. Otherwise, you should use one of the abd_borrow_buf* functions + * which will allocate a raw buffer if necessary. Use the abd_return_buf* + * functions to return any raw buffers that are no longer necessary when you're + * done using them. + * + * There are a variety of ABD APIs that implement basic buffer operations: + * compare, copy, read, write, and fill with zeroes. If you need a custom + * function which progressively accesses the whole ABD, use the abd_iterate_* + * functions. + */ + +#include <sys/abd.h> +#include <sys/param.h> +#include <sys/zio.h> +#include <sys/zfs_context.h> +#include <sys/zfs_znode.h> +#ifdef _KERNEL +#include <linux/scatterlist.h> +#include <linux/kmap_compat.h> +#else +#define MAX_ORDER 1 +#endif + +typedef struct abd_stats { + kstat_named_t abdstat_struct_size; + kstat_named_t abdstat_linear_cnt; + kstat_named_t abdstat_linear_data_size; + kstat_named_t abdstat_scatter_cnt; + kstat_named_t abdstat_scatter_data_size; + kstat_named_t abdstat_scatter_chunk_waste; + kstat_named_t abdstat_scatter_orders[MAX_ORDER]; + kstat_named_t abdstat_scatter_page_multi_chunk; + kstat_named_t abdstat_scatter_page_multi_zone; + kstat_named_t abdstat_scatter_page_alloc_retry; + kstat_named_t abdstat_scatter_sg_table_retry; +} abd_stats_t; + +static abd_stats_t abd_stats = { + /* Amount of memory occupied by all of the abd_t struct allocations */ + { "struct_size", KSTAT_DATA_UINT64 }, + /* + * The number of linear ABDs which are currently allocated, excluding + * ABDs which don't own their data (for instance the ones which were + * 
allocated through abd_get_offset() and abd_get_from_buf()). If an + * ABD takes ownership of its buf then it will become tracked. + */ + { "linear_cnt", KSTAT_DATA_UINT64 }, + /* Amount of data stored in all linear ABDs tracked by linear_cnt */ + { "linear_data_size", KSTAT_DATA_UINT64 }, + /* + * The number of scatter ABDs which are currently allocated, excluding + * ABDs which don't own their data (for instance the ones which were + * allocated through abd_get_offset()). + */ + { "scatter_cnt", KSTAT_DATA_UINT64 }, + /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */ + { "scatter_data_size", KSTAT_DATA_UINT64 }, + /* + * The amount of space wasted at the end of the last chunk across all + * scatter ABDs tracked by scatter_cnt. + */ + { "scatter_chunk_waste", KSTAT_DATA_UINT64 }, + /* + * The number of compound allocations of a given order. These + * allocations are spread over all currently allocated ABDs, and + * act as a measure of memory fragmentation. + */ + { { "scatter_order_N", KSTAT_DATA_UINT64 } }, + /* + * The number of scatter ABDs which contain multiple chunks. + * ABDs are preferentially allocated from the minimum number of + * contiguous multi-page chunks, a single chunk is optimal. + */ + { "scatter_page_multi_chunk", KSTAT_DATA_UINT64 }, + /* + * The number of scatter ABDs which are split across memory zones. + * ABDs are preferentially allocated using pages from a single zone. + */ + { "scatter_page_multi_zone", KSTAT_DATA_UINT64 }, + /* + * The total number of retries encountered when attempting to + * allocate the pages to populate the scatter ABD. + */ + { "scatter_page_alloc_retry", KSTAT_DATA_UINT64 }, + /* + * The total number of retries encountered when attempting to + * allocate the sg table for an ABD. 
+ */ + { "scatter_sg_table_retry", KSTAT_DATA_UINT64 }, +}; + +#define ABDSTAT(stat) (abd_stats.stat.value.ui64) +#define ABDSTAT_INCR(stat, val) \ + atomic_add_64(&abd_stats.stat.value.ui64, (val)) +#define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1) +#define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1) + +#define ABD_SCATTER(abd) (abd->abd_u.abd_scatter) +#define ABD_BUF(abd) (abd->abd_u.abd_linear.abd_buf) +#define abd_for_each_sg(abd, sg, n, i) \ + for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i) + +/* see block comment above for description */ +int zfs_abd_scatter_enabled = B_TRUE; +unsigned zfs_abd_scatter_max_order = MAX_ORDER - 1; + +static kmem_cache_t *abd_cache = NULL; +static kstat_t *abd_ksp; + +static inline size_t +abd_chunkcnt_for_bytes(size_t size) +{ + return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE); +} + +#ifdef _KERNEL +#ifndef CONFIG_HIGHMEM + +#ifndef __GFP_RECLAIM +#define __GFP_RECLAIM __GFP_WAIT +#endif + +static unsigned long +abd_alloc_chunk(int nid, gfp_t gfp, unsigned int order) +{ + struct page *page; + + page = alloc_pages_node(nid, gfp, order); + if (!page) + return (0); + + return ((unsigned long) page_address(page)); +} + +/* + * The goal is to minimize fragmentation by preferentially populating ABDs + * with higher order compound pages from a single zone. Allocation size is + * progressively decreased until it can be satisfied without performing + * reclaim or compaction. When necessary this function will degenerate to + * allocating individual pages and allowing reclaim to satisfy allocations. 
+ */ +static void +abd_alloc_pages(abd_t *abd, size_t size) +{ + struct list_head pages; + struct sg_table table; + struct scatterlist *sg; + struct page *page, *tmp_page; + gfp_t gfp = __GFP_NOWARN | GFP_NOIO; + gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM; + int max_order = MIN(zfs_abd_scatter_max_order, MAX_ORDER - 1); + int nr_pages = abd_chunkcnt_for_bytes(size); + int chunks = 0, zones = 0; + size_t remaining_size; + int nid = NUMA_NO_NODE; + int alloc_pages = 0; + int order; + + INIT_LIST_HEAD(&pages); + + while (alloc_pages < nr_pages) { + unsigned long paddr; + unsigned chunk_pages; + + order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order); + chunk_pages = (1U << order); + + paddr = abd_alloc_chunk(nid, order ? gfp_comp : gfp, order); + if (paddr == 0) { + if (order == 0) { + ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry); + schedule_timeout_interruptible(1); + } else { + max_order = MAX(0, order - 1); + } + continue; + } + + page = virt_to_page(paddr); + list_add_tail(&page->lru, &pages); + + if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid)) + zones++; + + nid = page_to_nid(page); + ABDSTAT_BUMP(abdstat_scatter_orders[order]); + chunks++; + alloc_pages += chunk_pages; + } + + ASSERT3S(alloc_pages, ==, nr_pages); + + while (sg_alloc_table(&table, chunks, gfp)) { + ABDSTAT_BUMP(abdstat_scatter_sg_table_retry); + schedule_timeout_interruptible(1); + } + + sg = table.sgl; + remaining_size = size; + list_for_each_entry_safe(page, tmp_page, &pages, lru) { + size_t sg_size = MIN(PAGESIZE << compound_order(page), + remaining_size); + sg_set_page(sg, page, sg_size, 0); + remaining_size -= sg_size; + + sg = sg_next(sg); + list_del(&page->lru); + } + + if (chunks > 1) { + ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); + abd->abd_flags |= ABD_FLAG_MULTI_CHUNK; + + if (zones) { + ABDSTAT_BUMP(abdstat_scatter_page_multi_zone); + abd->abd_flags |= ABD_FLAG_MULTI_ZONE; + } + } + + ABD_SCATTER(abd).abd_sgl = table.sgl; + 
ABD_SCATTER(abd).abd_nents = table.nents; +} +#else +/* + * Allocate N individual pages to construct a scatter ABD. This function + * makes no attempt to request contiguous pages and requires the minimal + * number of kernel interfaces. It's designed for maximum compatibility. + */ +static void +abd_alloc_pages(abd_t *abd, size_t size) +{ + struct scatterlist *sg; + struct sg_table table; + struct page *page; + gfp_t gfp = __GFP_NOWARN | GFP_NOIO; + int nr_pages = abd_chunkcnt_for_bytes(size); + int i; + + while (sg_alloc_table(&table, nr_pages, gfp)) { + ABDSTAT_BUMP(abdstat_scatter_sg_table_retry); + schedule_timeout_interruptible(1); + } + + ASSERT3U(table.nents, ==, nr_pages); + ABD_SCATTER(abd).abd_sgl = table.sgl; + ABD_SCATTER(abd).abd_nents = nr_pages; + + abd_for_each_sg(abd, sg, nr_pages, i) { + while ((page = __page_cache_alloc(gfp)) == NULL) { + ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry); + schedule_timeout_interruptible(1); + } + + ABDSTAT_BUMP(abdstat_scatter_orders[0]); + sg_set_page(sg, page, PAGESIZE, 0); + } + + if (nr_pages > 1) { + ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); + abd->abd_flags |= ABD_FLAG_MULTI_CHUNK; + } +} +#endif /* !CONFIG_HIGHMEM */ + +static void +abd_free_pages(abd_t *abd) +{ + struct scatterlist *sg; + struct sg_table table; + struct page *page; + int nr_pages = ABD_SCATTER(abd).abd_nents; + int order, i, j; + + if (abd->abd_flags & ABD_FLAG_MULTI_ZONE) + ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_zone); + + if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK) + ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk); + + abd_for_each_sg(abd, sg, nr_pages, i) { + for (j = 0; j < sg->length; ) { + page = nth_page(sg_page(sg), j >> PAGE_SHIFT); + order = compound_order(page); + __free_pages(page, order); + j += (PAGESIZE << order); + ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]); + } + } + + table.sgl = ABD_SCATTER(abd).abd_sgl; + table.nents = table.orig_nents = nr_pages; + sg_free_table(&table); +} + +#else /* _KERNEL */ + 
+#ifndef PAGE_SHIFT +#define PAGE_SHIFT (highbit64(PAGESIZE)-1) +#endif + +struct page; + +#define kpm_enable 1 +#define abd_alloc_chunk(o) \ + ((struct page *) umem_alloc_aligned(PAGESIZE << (o), 64, KM_SLEEP)) +#define abd_free_chunk(chunk, o) umem_free(chunk, PAGESIZE << (o)) +#define zfs_kmap_atomic(chunk, km) ((void *)chunk) +#define zfs_kunmap_atomic(addr, km) do { (void)(addr); } while (0) +#define local_irq_save(flags) do { (void)(flags); } while (0) +#define local_irq_restore(flags) do { (void)(flags); } while (0) +#define nth_page(pg, i) \ + ((struct page *)((void *)(pg) + (i) * PAGESIZE)) + +struct scatterlist { + struct page *page; + int length; + int end; +}; + +static void +sg_init_table(struct scatterlist *sg, int nr) { + memset(sg, 0, nr * sizeof (struct scatterlist)); + sg[nr - 1].end = 1; +} + +#define for_each_sg(sgl, sg, nr, i) \ + for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg)) + +static inline void +sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len, + unsigned int offset) +{ + /* currently we don't use offset */ + ASSERT(offset == 0); + sg->page = page; + sg->length = len; +} + +static inline struct page * +sg_page(struct scatterlist *sg) +{ + return (sg->page); +} + +static inline struct scatterlist * +sg_next(struct scatterlist *sg) +{ + if (sg->end) + return (NULL); + + return (sg + 1); +} + +static void +abd_alloc_pages(abd_t *abd, size_t size) +{ + unsigned nr_pages = abd_chunkcnt_for_bytes(size); + struct scatterlist *sg; + int i; + + ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages * + sizeof (struct scatterlist), KM_SLEEP); + sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages); + + abd_for_each_sg(abd, sg, nr_pages, i) { + struct page *p = abd_alloc_chunk(0); + sg_set_page(sg, p, PAGESIZE, 0); + } + ABD_SCATTER(abd).abd_nents = nr_pages; +} + +static void +abd_free_pages(abd_t *abd) +{ + int i, n = ABD_SCATTER(abd).abd_nents; + struct scatterlist *sg; + int j; + + abd_for_each_sg(abd, sg, n, i) { + 
for (j = 0; j < sg->length; j += PAGESIZE) { + struct page *p = nth_page(sg_page(sg), j>>PAGE_SHIFT); + abd_free_chunk(p, 0); + } + } + + vmem_free(ABD_SCATTER(abd).abd_sgl, n * sizeof (struct scatterlist)); +} + +#endif /* _KERNEL */ + +void +abd_init(void) +{ + int i; + + abd_cache = kmem_cache_create("abd_t", sizeof (abd_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + + abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED, + sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); + if (abd_ksp != NULL) { + abd_ksp->ks_data = &abd_stats; + kstat_install(abd_ksp); + + for (i = 0; i < MAX_ORDER; i++) { + snprintf(abd_stats.abdstat_scatter_orders[i].name, + KSTAT_STRLEN, "scatter_order_%d", i); + abd_stats.abdstat_scatter_orders[i].data_type = + KSTAT_DATA_UINT64; + } + } +} + +void +abd_fini(void) +{ + if (abd_ksp != NULL) { + kstat_delete(abd_ksp); + abd_ksp = NULL; + } + + if (abd_cache) { + kmem_cache_destroy(abd_cache); + abd_cache = NULL; + } +} + +static inline void +abd_verify(abd_t *abd) +{ + ASSERT3U(abd->abd_size, >, 0); + ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE); + ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR | + ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE | + ABD_FLAG_MULTI_CHUNK)); + IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER)); + IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER); + if (abd_is_linear(abd)) { + ASSERT3P(abd->abd_u.abd_linear.abd_buf, !=, NULL); + } else { + size_t n; + int i; + struct scatterlist *sg; + + ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0); + ASSERT3U(ABD_SCATTER(abd).abd_offset, <, + ABD_SCATTER(abd).abd_sgl->length); + n = ABD_SCATTER(abd).abd_nents; + abd_for_each_sg(abd, sg, n, i) { + ASSERT3P(sg_page(sg), !=, NULL); + } + } +} + +static inline abd_t * +abd_alloc_struct(void) +{ + abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE); + + ASSERT3P(abd, !=, NULL); + ABDSTAT_INCR(abdstat_struct_size, sizeof (abd_t)); + + return 
(abd); +} + +static inline void +abd_free_struct(abd_t *abd) +{ + kmem_cache_free(abd_cache, abd); + ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t)); +} + +/* + * Allocate an ABD, along with its own underlying data buffers. Use this if you + * don't care whether the ABD is linear or not. + */ +abd_t * +abd_alloc(size_t size, boolean_t is_metadata) +{ + abd_t *abd; + + if (!zfs_abd_scatter_enabled || size <= PAGESIZE) + return (abd_alloc_linear(size, is_metadata)); + + VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); + + abd = abd_alloc_struct(); + abd->abd_flags = ABD_FLAG_OWNER; + abd_alloc_pages(abd, size); + + if (is_metadata) { + abd->abd_flags |= ABD_FLAG_META; + } + abd->abd_size = size; + abd->abd_parent = NULL; + refcount_create(&abd->abd_children); + + abd->abd_u.abd_scatter.abd_offset = 0; + + ABDSTAT_BUMP(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, size); + ABDSTAT_INCR(abdstat_scatter_chunk_waste, + P2ROUNDUP(size, PAGESIZE) - size); + + return (abd); +} + +static void +abd_free_scatter(abd_t *abd) +{ + abd_free_pages(abd); + + refcount_destroy(&abd->abd_children); + ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size); + ABDSTAT_INCR(abdstat_scatter_chunk_waste, + abd->abd_size - P2ROUNDUP(abd->abd_size, PAGESIZE)); + + abd_free_struct(abd); +} + +/* + * Allocate an ABD that must be linear, along with its own underlying data + * buffer. Only use this when it would be very annoying to write your ABD + * consumer with a scattered ABD. 
+ */ +abd_t * +abd_alloc_linear(size_t size, boolean_t is_metadata) +{ + abd_t *abd = abd_alloc_struct(); + + VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); + + abd->abd_flags = ABD_FLAG_LINEAR | ABD_FLAG_OWNER; + if (is_metadata) { + abd->abd_flags |= ABD_FLAG_META; + } + abd->abd_size = size; + abd->abd_parent = NULL; + refcount_create(&abd->abd_children); + + if (is_metadata) { + abd->abd_u.abd_linear.abd_buf = zio_buf_alloc(size); + } else { + abd->abd_u.abd_linear.abd_buf = zio_data_buf_alloc(size); + } + + ABDSTAT_BUMP(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, size); + + return (abd); +} + +static void +abd_free_linear(abd_t *abd) +{ + if (abd->abd_flags & ABD_FLAG_META) { + zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size); + } else { + zio_data_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size); + } + + refcount_destroy(&abd->abd_children); + ABDSTAT_BUMPDOWN(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); + + abd_free_struct(abd); +} + +/* + * Free an ABD. Only use this on ABDs allocated with abd_alloc() or + * abd_alloc_linear(). + */ +void +abd_free(abd_t *abd) +{ + abd_verify(abd); + ASSERT3P(abd->abd_parent, ==, NULL); + ASSERT(abd->abd_flags & ABD_FLAG_OWNER); + if (abd_is_linear(abd)) + abd_free_linear(abd); + else + abd_free_scatter(abd); +} + +/* + * Allocate an ABD of the same format (same metadata flag, same scatterize + * setting) as another ABD. 
+ */ +abd_t * +abd_alloc_sametype(abd_t *sabd, size_t size) +{ + boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0; + if (abd_is_linear(sabd)) { + return (abd_alloc_linear(size, is_metadata)); + } else { + return (abd_alloc(size, is_metadata)); + } +} + +/* + * If we're going to use this ABD for doing I/O using the block layer, the + * consumer of the ABD data doesn't care if it's scattered or not, and we don't + * plan to store this ABD in memory for a long period of time, we should + * allocate the ABD type that requires the least data copying to do the I/O. + * + * On Illumos this is linear ABDs, however if ldi_strategy() can ever issue I/Os + * using a scatter/gather list we should switch to that and replace this call + * with vanilla abd_alloc(). + * + * On Linux the optimal thing to do would be to use abd_get_offset() and + * construct a new ABD which shares the original pages thereby eliminating + * the copy. But for the moment a new linear ABD is allocated until this + * performance optimization can be implemented. + */ +abd_t * +abd_alloc_for_io(size_t size, boolean_t is_metadata) +{ + return (abd_alloc_linear(size, is_metadata)); +} + +/* + * Allocate a new ABD to point to offset off of sabd. It shares the underlying + * buffer data with sabd. Use abd_put() to free. sabd must not be freed while + * any derived ABDs exist. + */ +static inline abd_t * +abd_get_offset_impl(abd_t *sabd, size_t off, size_t size) +{ + abd_t *abd; + + abd_verify(sabd); + ASSERT3U(off, <=, sabd->abd_size); + + if (abd_is_linear(sabd)) { + abd = abd_alloc_struct(); + + /* + * Even if this buf is filesystem metadata, we only track that + * if we own the underlying data buffer, which is not true in + * this case. Therefore, we don't ever use ABD_FLAG_META here. 
+ */ + abd->abd_flags = ABD_FLAG_LINEAR; + + abd->abd_u.abd_linear.abd_buf = + (char *)sabd->abd_u.abd_linear.abd_buf + off; + } else { + int i; + struct scatterlist *sg; + size_t new_offset = sabd->abd_u.abd_scatter.abd_offset + off; + + abd = abd_alloc_struct(); + + /* + * Even if this buf is filesystem metadata, we only track that + * if we own the underlying data buffer, which is not true in + * this case. Therefore, we don't ever use ABD_FLAG_META here. + */ + abd->abd_flags = 0; + + abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) { + if (new_offset < sg->length) + break; + new_offset -= sg->length; + } + + ABD_SCATTER(abd).abd_sgl = sg; + ABD_SCATTER(abd).abd_offset = new_offset; + ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i; + } + + abd->abd_size = size; + abd->abd_parent = sabd; + refcount_create(&abd->abd_children); + (void) refcount_add_many(&sabd->abd_children, abd->abd_size, abd); + + return (abd); +} + +abd_t * +abd_get_offset(abd_t *sabd, size_t off) +{ + size_t size = sabd->abd_size > off ? sabd->abd_size - off : 0; + + VERIFY3U(size, >, 0); + + return (abd_get_offset_impl(sabd, off, size)); +} + +abd_t * +abd_get_offset_size(abd_t *sabd, size_t off, size_t size) +{ + ASSERT3U(off + size, <=, sabd->abd_size); + + return (abd_get_offset_impl(sabd, off, size)); +} + +/* + * Allocate a linear ABD structure for buf. You must free this with abd_put() + * since the resulting ABD doesn't own its own buffer. + */ +abd_t * +abd_get_from_buf(void *buf, size_t size) +{ + abd_t *abd = abd_alloc_struct(); + + VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); + + /* + * Even if this buf is filesystem metadata, we only track that if we + * own the underlying data buffer, which is not true in this case. + * Therefore, we don't ever use ABD_FLAG_META here. 
+ */ + abd->abd_flags = ABD_FLAG_LINEAR; + abd->abd_size = size; + abd->abd_parent = NULL; + refcount_create(&abd->abd_children); + + abd->abd_u.abd_linear.abd_buf = buf; + + return (abd); +} + +/* + * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not + * free the underlying scatterlist or buffer. + */ +void +abd_put(abd_t *abd) +{ + abd_verify(abd); + ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); + + if (abd->abd_parent != NULL) { + (void) refcount_remove_many(&abd->abd_parent->abd_children, + abd->abd_size, abd); + } + + refcount_destroy(&abd->abd_children); + abd_free_struct(abd); +} + +/* + * Get the raw buffer associated with a linear ABD. + */ +void * +abd_to_buf(abd_t *abd) +{ + ASSERT(abd_is_linear(abd)); + abd_verify(abd); + return (abd->abd_u.abd_linear.abd_buf); +} + +/* + * Borrow a raw buffer from an ABD without copying the contents of the ABD + * into the buffer. If the ABD is scattered, this will allocate a raw buffer + * whose contents are undefined. To copy over the existing data in the ABD, use + * abd_borrow_buf_copy() instead. + */ +void * +abd_borrow_buf(abd_t *abd, size_t n) +{ + void *buf; + abd_verify(abd); + ASSERT3U(abd->abd_size, >=, n); + if (abd_is_linear(abd)) { + buf = abd_to_buf(abd); + } else { + buf = zio_buf_alloc(n); + } + (void) refcount_add_many(&abd->abd_children, n, buf); + + return (buf); +} + +void * +abd_borrow_buf_copy(abd_t *abd, size_t n) +{ + void *buf = abd_borrow_buf(abd, n); + if (!abd_is_linear(abd)) { + abd_copy_to_buf(buf, abd, n); + } + return (buf); +} + +/* + * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will + * not change the contents of the ABD and will ASSERT that you didn't modify + * the buffer since it was borrowed. If you want any changes you made to buf to + * be copied back to abd, use abd_return_buf_copy() instead. 
+ */
+void
+abd_return_buf(abd_t *abd, void *buf, size_t n)
+{
+	abd_verify(abd);
+	ASSERT3U(abd->abd_size, >=, n);
+	if (abd_is_linear(abd)) {
+		ASSERT3P(buf, ==, abd_to_buf(abd)); /* was lent in place */
+	} else {
+		ASSERT0(abd_cmp_buf(abd, buf, n)); /* buf must be unmodified */
+		zio_buf_free(buf, n);
+	}
+	(void) refcount_remove_many(&abd->abd_children, n, buf); /* end loan */
+}
+
+void
+abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
+{
+	if (!abd_is_linear(abd)) {
+		abd_copy_from_buf(abd, buf, n);	/* write buf back first */
+	}
+	abd_return_buf(abd, buf, n);
+}
+
+/*
+ * Give this ABD ownership of the buffer that it's storing. Can only be used on
+ * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated
+ * with abd_alloc_linear() which subsequently released ownership of their buf
+ * with abd_release_ownership_of_buf().
+ *
+ * Sets ABD_FLAG_OWNER, and ABD_FLAG_META as well when is_metadata is true,
+ * then bumps the linear ABD count and adds abd_size to the linear data size
+ * statistic.  abd_release_ownership_of_buf() below is the inverse: it clears
+ * both flags and reverses the two statistic updates.
+ */
+void
+abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata)
+{
+	ASSERT(abd_is_linear(abd));
+	ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
+	abd_verify(abd);
+
+	abd->abd_flags |= ABD_FLAG_OWNER;
+	if (is_metadata) {
+		abd->abd_flags |= ABD_FLAG_META;
+	}
+
+	ABDSTAT_BUMP(abdstat_linear_cnt);
+	ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
+}
+
+void
+abd_release_ownership_of_buf(abd_t *abd)
+{
+	ASSERT(abd_is_linear(abd));
+	ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
+	abd_verify(abd);
+
+	abd->abd_flags &= ~ABD_FLAG_OWNER;
+	/* Disable this flag since we no longer own the data buffer */
+	abd->abd_flags &= ~ABD_FLAG_META;
+
+	ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
+	ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
+}
+
+#ifndef HAVE_1ARG_KMAP_ATOMIC
+#define	NR_KM_TYPE (6)	/* number of entries in km_table */
+#ifdef _KERNEL
+int km_table[NR_KM_TYPE] = {	/* indexed by abd_iter.iter_km */
+	KM_USER0,
+	KM_USER1,
+	KM_BIO_SRC_IRQ,
+	KM_BIO_DST_IRQ,
+	KM_PTE0,
+	KM_PTE1,
+};
+#endif
+#endif
+
+struct abd_iter {
+	/* public interface */
+	void		*iter_mapaddr;	/* addr corresponding to iter_pos */
+	size_t		iter_mapsize;	/* length of data valid at mapaddr */
+
+	/* private */
+	abd_t		*iter_abd;	/* ABD being iterated through */
+	size_t		iter_pos;	/* bytes advanced so far in iter_abd */
+	size_t		iter_offset;	/* offset in current sg/abd_buf, */
+					/* abd_offset included */
+	struct scatterlist *iter_sg;	/* current sg */
+#ifndef HAVE_1ARG_KMAP_ATOMIC
+	int		iter_km;	/* KM_* for kmap_atomic */
+#endif
+};
+
+/*
+ * Initialize the abd_iter.
+ *
+ * The iterator starts unmapped at position 0.  For a scatter ABD it starts
+ * at the ABD's first scatterlist entry with the ABD's own abd_offset as the
+ * initial in-entry offset; for a linear ABD the offset is 0 and there is no
+ * sg.  km_type is only recorded (and range-checked against NR_KM_TYPE) when
+ * HAVE_1ARG_KMAP_ATOMIC is not defined; otherwise it is unused.
+ */
+static void
+abd_iter_init(struct abd_iter *aiter, abd_t *abd, int km_type)
+{
+	abd_verify(abd);
+	aiter->iter_abd = abd;
+	aiter->iter_mapaddr = NULL;
+	aiter->iter_mapsize = 0;
+	aiter->iter_pos = 0;
+	if (abd_is_linear(abd)) {
+		aiter->iter_offset = 0;
+		aiter->iter_sg = NULL;
+	} else {
+		aiter->iter_offset = ABD_SCATTER(abd).abd_offset;
+		aiter->iter_sg = ABD_SCATTER(abd).abd_sgl;
+	}
+#ifndef HAVE_1ARG_KMAP_ATOMIC
+	ASSERT3U(km_type, <, NR_KM_TYPE);
+	aiter->iter_km = km_type;
+#endif
+}
+
+/*
+ * Advance the iterator by a certain amount. Cannot be called when a chunk is
+ * in use. This can be safely called when the aiter has already exhausted, in
+ * which case this does nothing.
+ *
+ * For a scatter ABD the in-entry offset is then normalized by stepping
+ * iter_sg forward past every entry the new offset has moved beyond; running
+ * off the end of the list leaves iter_sg NULL with a zero offset (asserted).
+ */
+static void
+abd_iter_advance(struct abd_iter *aiter, size_t amount)
+{
+	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+	ASSERT0(aiter->iter_mapsize);
+
+	/* There's nothing left to advance to, so do nothing */
+	if (aiter->iter_pos == aiter->iter_abd->abd_size)
+		return;
+
+	aiter->iter_pos += amount;
+	aiter->iter_offset += amount;
+	if (!abd_is_linear(aiter->iter_abd)) {
+		while (aiter->iter_offset >= aiter->iter_sg->length) {
+			aiter->iter_offset -= aiter->iter_sg->length;
+			aiter->iter_sg = sg_next(aiter->iter_sg);
+			if (aiter->iter_sg == NULL) {
+				ASSERT0(aiter->iter_offset);
+				break;
+			}
+		}
+	}
+}
+
+/*
+ * Map the current chunk into aiter. This can be safely called when the aiter
+ * has already exhausted, in which case this does nothing.
+ *
+ * On return iter_mapaddr points at the byte for the current position and
+ * iter_mapsize says how many bytes are valid there: the rest of the buffer
+ * for a linear ABD, or the rest of the current scatterlist entry (clipped to
+ * the ABD's size) for a scatter ABD, which is mapped with zfs_kmap_atomic().
+ */
+static void
+abd_iter_map(struct abd_iter *aiter)
+{
+	void *paddr;		/* base address the offset is applied to */
+	size_t offset = 0;
+
+	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+	ASSERT0(aiter->iter_mapsize);
+
+	/* There's nothing left to iterate over, so do nothing */
+	if (aiter->iter_pos == aiter->iter_abd->abd_size)
+		return;
+
+	if (abd_is_linear(aiter->iter_abd)) {
+		ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
+		offset = aiter->iter_offset;
+		aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
+		paddr = aiter->iter_abd->abd_u.abd_linear.abd_buf;
+	} else {
+		offset = aiter->iter_offset;
+		aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset,
+		    aiter->iter_abd->abd_size - aiter->iter_pos);
+
+		paddr = zfs_kmap_atomic(sg_page(aiter->iter_sg),
+		    km_table[aiter->iter_km]);
+	}
+
+	aiter->iter_mapaddr = (char *)paddr + offset;
+}
+
+/*
+ * Unmap the current chunk from aiter. This can be safely called when the aiter
+ * has already exhausted, in which case this does nothing.
+ *
+ * For a scatter ABD the address handed back to zfs_kunmap_atomic() is
+ * iter_mapaddr minus iter_offset, i.e. the base address abd_iter_map()
+ * obtained before adding the offset.  iter_mapaddr and iter_mapsize are
+ * reset so the iterator may be advanced or mapped again.
+ *
+ * NOTE(review): the subtraction is done on a void *, which depends on the
+ * GNU C void-pointer arithmetic extension - confirm that every compiler
+ * used for this file accepts it.
+ */
+static void
+abd_iter_unmap(struct abd_iter *aiter)
+{
+	/* There's nothing left to unmap, so do nothing */
+	if (aiter->iter_pos == aiter->iter_abd->abd_size)
+		return;
+
+	if (!abd_is_linear(aiter->iter_abd)) {
+		/* LINTED E_FUNC_SET_NOT_USED */
+		zfs_kunmap_atomic(aiter->iter_mapaddr - aiter->iter_offset,
+		    km_table[aiter->iter_km]);
+	}
+
+	ASSERT3P(aiter->iter_mapaddr, !=, NULL);
+	ASSERT3U(aiter->iter_mapsize, >, 0);
+
+	aiter->iter_mapaddr = NULL;
+	aiter->iter_mapsize = 0;
+}
+
+int
+abd_iterate_func(abd_t *abd, size_t off, size_t size,
+    abd_iter_func_t *func, void *private)
+{
+	int ret = 0;		/* last value returned by func */
+	struct abd_iter aiter;
+
+	abd_verify(abd);
+	ASSERT3U(off + size, <=, abd->abd_size);
+
+	abd_iter_init(&aiter, abd, 0);
+	abd_iter_advance(&aiter, off);	/* begin the walk at byte off */
+
+	while (size > 0) {
+		size_t len;
+		abd_iter_map(&aiter);
+
+		len = MIN(aiter.iter_mapsize, size); /* clip to bytes left */
+		ASSERT3U(len, >, 0);
+
+		ret = func(aiter.iter_mapaddr, len, private);
+
+		abd_iter_unmap(&aiter);	/* must unmap before advancing */
+
+		if (ret != 0)		/* non-zero from func ends the walk */
+			break;
+
+		size -= len;
+		abd_iter_advance(&aiter, len);
+	}
+
+	return (ret);
+}
+
+struct buf_arg {
+	void *arg_buf;		/* cursor into the caller's flat buffer */
+};
+
+static int
+abd_copy_to_buf_off_cb(void *buf, size_t size, void *private)
+{
+	struct buf_arg *ba_ptr = private;
+
+	(void) memcpy(ba_ptr->arg_buf, buf, size);
+	ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; /* next chunk */
+
+	return (0);
+}
+
+/*
+ * Copy abd to buf. (off is the offset in abd.)
+ * size is the number of bytes copied; buf is filled from its start.
+ */
+void
+abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size)
+{
+	struct buf_arg ba_ptr = { buf };
+
+	(void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb,
+	    &ba_ptr);
+}
+
+static int
+abd_cmp_buf_off_cb(void *buf, size_t size, void *private)
+{
+	int ret;
+	struct buf_arg *ba_ptr = private;
+
+	ret = memcmp(buf, ba_ptr->arg_buf, size);
+	ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; /* next chunk */
+
+	return (ret);
+}
+
+/*
+ * Compare the contents of abd to buf. (off is the offset in abd.)
+ * size is the number of bytes compared.  Returns 0 when they match,
+ * otherwise the non-zero memcmp() result of the first chunk that differs.
+ */
+int
+abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
+{
+	struct buf_arg ba_ptr = { (void *) buf };
+
+	return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr));
+}
+
+static int
+abd_copy_from_buf_off_cb(void *buf, size_t size, void *private)
+{
+	struct buf_arg *ba_ptr = private;
+
+	(void) memcpy(buf, ba_ptr->arg_buf, size);
+	ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; /* next chunk */
+
+	return (0);
+}
+
+/*
+ * Copy from buf to abd. (off is the offset in abd.)
+ * size is the number of bytes copied; buf is read from its start.
+ */
+void
+abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
+{
+	struct buf_arg ba_ptr = { (void *) buf };
+
+	(void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb,
+	    &ba_ptr);
+}
+
+/*ARGSUSED*/
+static int
+abd_zero_off_cb(void *buf, size_t size, void *private)
+{
+	(void) memset(buf, 0, size);
+	return (0);
+}
+
+/*
+ * Zero out size bytes of the abd, starting at offset off.
+ */
+void
+abd_zero_off(abd_t *abd, size_t off, size_t size)
+{
+	(void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL);
+}
+
+/*
+ * Iterate over two ABDs and call func incrementally on the two ABDs' data in
+ * equal-sized chunks (passed to func as raw buffers). func could be called many
+ * times during this iteration.
+ *
+ * doff and soff are the starting offsets into dabd and sabd, and size is the
+ * number of bytes to process from each; both ranges must lie inside their
+ * ABD (asserted).  Each chunk is the largest run that is contiguous in both
+ * mappings.  The walk stops as soon as func returns non-zero and that value
+ * is returned; otherwise the result is 0.  The two iterators are given km
+ * types 0 (dabd) and 1 (sabd) and are unmapped in the reverse order of
+ * mapping.
+ *
+ * NOTE(review): the in-loop ASSERT only requires one of dlen/slen to be
+ * non-zero, yet len is their minimum; a zero len would make no progress.
+ * Presumably both are meant to be non-zero - confirm.
+ */
+int
+abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff,
+    size_t size, abd_iter_func2_t *func, void *private)
+{
+	int ret = 0;
+	struct abd_iter daiter, saiter;
+
+	abd_verify(dabd);
+	abd_verify(sabd);
+
+	ASSERT3U(doff + size, <=, dabd->abd_size);
+	ASSERT3U(soff + size, <=, sabd->abd_size);
+
+	abd_iter_init(&daiter, dabd, 0);
+	abd_iter_init(&saiter, sabd, 1);
+	abd_iter_advance(&daiter, doff);
+	abd_iter_advance(&saiter, soff);
+
+	while (size > 0) {
+		size_t dlen, slen, len;
+		abd_iter_map(&daiter);
+		abd_iter_map(&saiter);
+
+		dlen = MIN(daiter.iter_mapsize, size);
+		slen = MIN(saiter.iter_mapsize, size);
+		len = MIN(dlen, slen);	/* largest run valid in both maps */
+		ASSERT(dlen > 0 || slen > 0);
+
+		ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len,
+		    private);
+
+		abd_iter_unmap(&saiter);
+		abd_iter_unmap(&daiter);
+
+		if (ret != 0)
+			break;
+
+		size -= len;
+		abd_iter_advance(&daiter, len);
+		abd_iter_advance(&saiter, len);
+	}
+
+	return (ret);
+}
+
+/*ARGSUSED*/
+static int
+abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private)
+{
+	(void) memcpy(dbuf, sbuf, size);
+	return (0);
+}
+
+/*
+ * Copy from sabd to dabd starting from soff and doff.
+ * size is the number of bytes copied.
+ */
+void
+abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size)
+{
+	(void) abd_iterate_func2(dabd, sabd, doff, soff, size,
+	    abd_copy_off_cb, NULL);
+}
+
+/*ARGSUSED*/
+static int
+abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private)
+{
+	return (memcmp(bufa, bufb, size));
+}
+
+/*
+ * Compares the contents of two ABDs.
+ * The ABDs must be the same size (asserted).  Returns 0 when they are equal,
+ * otherwise the non-zero memcmp() result of the first chunk that differs.
+ */ +int +abd_cmp(abd_t *dabd, abd_t *sabd) +{ + ASSERT3U(dabd->abd_size, ==, sabd->abd_size); + return (abd_iterate_func2(dabd, sabd, 0, 0, dabd->abd_size, + abd_cmp_cb, NULL)); +} + +/* + * Iterate over code ABDs and a data ABD and call @func_raidz_gen. + * + * @cabds parity ABDs, must have equal size + * @dabd data ABD. Can be NULL (in this case @dsize = 0) + * @func_raidz_gen should be implemented so that its behaviour + * is the same when taking linear and when taking scatter + */ +void +abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, + ssize_t csize, ssize_t dsize, const unsigned parity, + void (*func_raidz_gen)(void **, const void *, size_t, size_t)) +{ + int i; + ssize_t len, dlen; + struct abd_iter caiters[3]; + struct abd_iter daiter; + void *caddrs[3]; + unsigned long flags; + + ASSERT3U(parity, <=, 3); + + for (i = 0; i < parity; i++) + abd_iter_init(&caiters[i], cabds[i], i); + + if (dabd) + abd_iter_init(&daiter, dabd, i); + + ASSERT3S(dsize, >=, 0); + + local_irq_save(flags); + while (csize > 0) { + len = csize; + + if (dabd && dsize > 0) + abd_iter_map(&daiter); + + for (i = 0; i < parity; i++) { + abd_iter_map(&caiters[i]); + caddrs[i] = caiters[i].iter_mapaddr; + } + + switch (parity) { + case 3: + len = MIN(caiters[2].iter_mapsize, len); + case 2: + len = MIN(caiters[1].iter_mapsize, len); + case 1: + len = MIN(caiters[0].iter_mapsize, len); + } + + /* must be progressive */ + ASSERT3S(len, >, 0); + + if (dabd && dsize > 0) { + /* this needs precise iter.length */ + len = MIN(daiter.iter_mapsize, len); + dlen = len; + } else + dlen = 0; + + /* must be progressive */ + ASSERT3S(len, >, 0); + /* + * The iterated function likely will not do well if each + * segment except the last one is not multiple of 512 (raidz). 
+ */ + ASSERT3U(((uint64_t)len & 511ULL), ==, 0); + + func_raidz_gen(caddrs, daiter.iter_mapaddr, len, dlen); + + for (i = parity-1; i >= 0; i--) { + abd_iter_unmap(&caiters[i]); + abd_iter_advance(&caiters[i], len); + } + + if (dabd && dsize > 0) { + abd_iter_unmap(&daiter); + abd_iter_advance(&daiter, dlen); + dsize -= dlen; + } + + csize -= len; + + ASSERT3S(dsize, >=, 0); + ASSERT3S(csize, >=, 0); + } + local_irq_restore(flags); +} + +/* + * Iterate over code ABDs and data reconstruction target ABDs and call + * @func_raidz_rec. Function maps at most 6 pages atomically. + * + * @cabds parity ABDs, must have equal size + * @tabds rec target ABDs, at most 3 + * @tsize size of data target columns + * @func_raidz_rec expects syndrome data in target columns. Function + * reconstructs data and overwrites target columns. + */ +void +abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, + ssize_t tsize, const unsigned parity, + void (*func_raidz_rec)(void **t, const size_t tsize, void **c, + const unsigned *mul), + const unsigned *mul) +{ + int i; + ssize_t len; + struct abd_iter citers[3]; + struct abd_iter xiters[3]; + void *caddrs[3], *xaddrs[3]; + unsigned long flags; + + ASSERT3U(parity, <=, 3); + + for (i = 0; i < parity; i++) { + abd_iter_init(&citers[i], cabds[i], 2*i); + abd_iter_init(&xiters[i], tabds[i], 2*i+1); + } + + local_irq_save(flags); + while (tsize > 0) { + + for (i = 0; i < parity; i++) { + abd_iter_map(&citers[i]); + abd_iter_map(&xiters[i]); + caddrs[i] = citers[i].iter_mapaddr; + xaddrs[i] = xiters[i].iter_mapaddr; + } + + len = tsize; + switch (parity) { + case 3: + len = MIN(xiters[2].iter_mapsize, len); + len = MIN(citers[2].iter_mapsize, len); + case 2: + len = MIN(xiters[1].iter_mapsize, len); + len = MIN(citers[1].iter_mapsize, len); + case 1: + len = MIN(xiters[0].iter_mapsize, len); + len = MIN(citers[0].iter_mapsize, len); + } + /* must be progressive */ + ASSERT3S(len, >, 0); + /* + * The iterated function likely will not do well if each 
+ * segment except the last one is not multiple of 512 (raidz). + */ + ASSERT3U(((uint64_t)len & 511ULL), ==, 0); + + func_raidz_rec(xaddrs, len, caddrs, mul); + + for (i = parity-1; i >= 0; i--) { + abd_iter_unmap(&xiters[i]); + abd_iter_unmap(&citers[i]); + abd_iter_advance(&xiters[i], len); + abd_iter_advance(&citers[i], len); + } + + tsize -= len; + ASSERT3S(tsize, >=, 0); + } + local_irq_restore(flags); +} + +#if defined(_KERNEL) && defined(HAVE_SPL) +/* + * bio_nr_pages for ABD. + * @off is the offset in @abd + */ +unsigned long +abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off) +{ + unsigned long pos; + + if (abd_is_linear(abd)) + pos = (unsigned long)abd_to_buf(abd) + off; + else + pos = abd->abd_u.abd_scatter.abd_offset + off; + + return ((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) + - (pos >> PAGE_SHIFT); +} + +/* + * bio_map for scatter ABD. + * @off is the offset in @abd + * Remaining IO size is returned + */ +unsigned int +abd_scatter_bio_map_off(struct bio *bio, abd_t *abd, + unsigned int io_size, size_t off) +{ + int i; + struct abd_iter aiter; + + ASSERT(!abd_is_linear(abd)); + ASSERT3U(io_size, <=, abd->abd_size - off); + + abd_iter_init(&aiter, abd, 0); + abd_iter_advance(&aiter, off); + + for (i = 0; i < bio->bi_max_vecs; i++) { + struct page *pg; + size_t len, sgoff, pgoff; + struct scatterlist *sg; + + if (io_size <= 0) + break; + + sg = aiter.iter_sg; + sgoff = aiter.iter_offset; + pgoff = sgoff & (PAGESIZE - 1); + len = MIN(io_size, PAGESIZE - pgoff); + ASSERT(len > 0); + + pg = nth_page(sg_page(sg), sgoff >> PAGE_SHIFT); + if (bio_add_page(bio, pg, len, pgoff) != len) + break; + + io_size -= len; + abd_iter_advance(&aiter, len); + } + + return (io_size); +} + +/* Tunable Parameters */ +module_param(zfs_abd_scatter_enabled, int, 0644); +MODULE_PARM_DESC(zfs_abd_scatter_enabled, + "Toggle whether ABD allocations must be linear."); +module_param(zfs_abd_scatter_max_order, uint, 0644); +MODULE_PARM_DESC(zfs_abd_scatter_max_order, + 
"Maximum order allocation used for a scatter ABD."); +#endif diff --git a/module/zfs/arc.c b/module/zfs/arc.c index e3e933044..e54a7cc59 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -136,14 +136,14 @@ * the arc_buf_hdr_t that will point to the data block in memory. A block can * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and - * also in the arc_buf_hdr_t's private physical data block pointer (b_pdata). + * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd). * * The L1ARC's data pointer may or may not be uncompressed. The ARC has the - * ability to store the physical data (b_pdata) associated with the DVA of the - * arc_buf_hdr_t. Since the b_pdata is a copy of the on-disk physical block, + * ability to store the physical data (b_pabd) associated with the DVA of the + * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block, * it will match its on-disk compression characteristics. This behavior can be * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the - * compressed ARC functionality is disabled, the b_pdata will point to an + * compressed ARC functionality is disabled, the b_pabd will point to an * uncompressed version of the on-disk data. * * Data in the L1ARC is not accessed by consumers of the ARC directly. Each @@ -182,7 +182,7 @@ * | l1arc_buf_hdr_t * | | arc_buf_t * | b_buf +------------>+-----------+ arc_buf_t - * | b_pdata +-+ |b_next +---->+-----------+ + * | b_pabd +-+ |b_next +---->+-----------+ * +-----------+ | |-----------| |b_next +-->NULL * | |b_comp = T | +-----------+ * | |b_data +-+ |b_comp = F | @@ -199,8 +199,8 @@ * When a consumer reads a block, the ARC must first look to see if the * arc_buf_hdr_t is cached. 
If the hdr is cached then the ARC allocates a new * arc_buf_t and either copies uncompressed data into a new data buffer from an - * existing uncompressed arc_buf_t, decompresses the hdr's b_pdata buffer into a - * new data buffer, or shares the hdr's b_pdata buffer, depending on whether the + * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a + * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the * hdr is compressed and the desired compression characteristics of the * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be @@ -224,7 +224,7 @@ * | | arc_buf_t (shared) * | b_buf +------------>+---------+ arc_buf_t * | | |b_next +---->+---------+ - * | b_pdata +-+ |---------| |b_next +-->NULL + * | b_pabd +-+ |---------| |b_next +-->NULL * +-----------+ | | | +---------+ * | |b_data +-+ | | * | +---------+ | |b_data +-+ @@ -238,19 +238,19 @@ * | +------+ | * +---------------------------------+ * - * Writing to the ARC requires that the ARC first discard the hdr's b_pdata + * Writing to the ARC requires that the ARC first discard the hdr's b_pabd * since the physical block is about to be rewritten. The new data contents * will be contained in the arc_buf_t. As the I/O pipeline performs the write, * it may compress the data before writing it to disk. The ARC will be called * with the transformed data and will bcopy the transformed on-disk block into - * a newly allocated b_pdata. Writes are always done into buffers which have + * a newly allocated b_pabd. Writes are always done into buffers which have * either been loaned (and hence are new and don't have other readers) or * buffers which have been released (and hence have their own hdr, if there * were originally other readers of the buf's original hdr). This ensures that * the ARC only needs to update a single buf and its hdr after a write occurs. 
* - * When the L2ARC is in use, it will also take advantage of the b_pdata. The - * L2ARC will always write the contents of b_pdata to the L2ARC. This means + * When the L2ARC is in use, it will also take advantage of the b_pabd. The + * L2ARC will always write the contents of b_pabd to the L2ARC. This means * that when compressed ARC is enabled that the L2ARC blocks are identical * to the on-disk block in the main data pool. This provides a significant * advantage since the ARC can leverage the bp's checksum when reading from the @@ -271,7 +271,9 @@ #include <sys/vdev.h> #include <sys/vdev_impl.h> #include <sys/dsl_pool.h> +#include <sys/zio_checksum.h> #include <sys/multilist.h> +#include <sys/abd.h> #ifdef _KERNEL #include <sys/vmsystm.h> #include <vm/anon.h> @@ -315,7 +317,7 @@ int zfs_arc_num_sublists_per_state = 0; /* number of seconds before growing cache again */ static int arc_grow_retry = 5; -/* shift of arc_c for calculating overflow limit in arc_get_data_buf */ +/* shift of arc_c for calculating overflow limit in arc_get_data_impl */ int zfs_arc_overflow_shift = 8; /* shift of arc_c for calculating both min and max arc_p */ @@ -455,13 +457,13 @@ typedef struct arc_stats { kstat_named_t arcstat_c_max; kstat_named_t arcstat_size; /* - * Number of compressed bytes stored in the arc_buf_hdr_t's b_pdata. + * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd. * Note that the compressed bytes may match the uncompressed bytes * if the block is either not compressed or compressed arc is disabled. */ kstat_named_t arcstat_compressed_size; /* - * Uncompressed size of the data stored in b_pdata. If compressed + * Uncompressed size of the data stored in b_pabd. If compressed * arc is disabled then this value will be identical to the stat * above. 
*/ @@ -960,7 +962,7 @@ typedef struct l2arc_read_callback { typedef struct l2arc_data_free { /* protected by l2arc_free_on_write_mtx */ - void *l2df_data; + abd_t *l2df_abd; size_t l2df_size; arc_buf_contents_t l2df_type; list_node_t l2df_list_node; @@ -970,10 +972,14 @@ static kmutex_t l2arc_feed_thr_lock; static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; +static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *); static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *); +static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *); +static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *); static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *); -static void arc_hdr_free_pdata(arc_buf_hdr_t *hdr); -static void arc_hdr_alloc_pdata(arc_buf_hdr_t *); +static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag); +static void arc_hdr_free_pabd(arc_buf_hdr_t *); +static void arc_hdr_alloc_pabd(arc_buf_hdr_t *); static void arc_access(arc_buf_hdr_t *, kmutex_t *); static boolean_t arc_is_overflowing(void); static void arc_buf_watch(arc_buf_t *); @@ -1336,7 +1342,9 @@ static inline boolean_t arc_buf_is_shared(arc_buf_t *buf) { boolean_t shared = (buf->b_data != NULL && - buf->b_data == buf->b_hdr->b_l1hdr.b_pdata); + buf->b_hdr->b_l1hdr.b_pabd != NULL && + abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) && + buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd)); IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); IMPLY(shared, ARC_BUF_SHARED(buf)); IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf)); @@ -1376,8 +1384,6 @@ arc_cksum_verify(arc_buf_t *buf) return; if (ARC_BUF_COMPRESSED(buf)) { - ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || - hdr->b_l1hdr.b_bufcnt > 1); return; } @@ -1424,7 +1430,8 @@ arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) cbuf = zio_buf_alloc(HDR_GET_PSIZE(hdr)); lsize = HDR_GET_LSIZE(hdr); - csize = zio_compress_data(compress, zio->io_data, cbuf, lsize); + 
csize = zio_compress_data(compress, zio->io_abd, cbuf, lsize); + ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr)); if (csize < HDR_GET_PSIZE(hdr)) { /* @@ -1459,7 +1466,7 @@ arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) * logical I/O size and not just a gang fragment. */ valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp, - BP_GET_CHECKSUM(zio->io_bp), zio->io_data, zio->io_size, + BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size, zio->io_offset, NULL) == 0); zio_pop_transforms(zio); return (valid_cksum); @@ -1483,18 +1490,9 @@ arc_cksum_compute(arc_buf_t *buf) mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum != NULL) { - ASSERT(!ARC_BUF_COMPRESSED(buf) || hdr->b_l1hdr.b_bufcnt > 1); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } else if (ARC_BUF_COMPRESSED(buf)) { - /* - * Since the checksum doesn't apply to compressed buffers, we - * only keep a checksum if there are uncompressed buffers. - * Therefore there must be another buffer, which is - * uncompressed. - */ - IMPLY(hdr->b_l1hdr.b_freeze_cksum != NULL, - hdr->b_l1hdr.b_bufcnt > 1); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } @@ -1589,8 +1587,6 @@ arc_buf_thaw(arc_buf_t *buf) * allocate b_thawed. 
*/ if (ARC_BUF_COMPRESSED(buf)) { - ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || - hdr->b_l1hdr.b_bufcnt > 1); return; } @@ -1609,8 +1605,6 @@ arc_buf_freeze(arc_buf_t *buf) return; if (ARC_BUF_COMPRESSED(buf)) { - ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || - hdr->b_l1hdr.b_bufcnt > 1); return; } @@ -1740,7 +1734,7 @@ arc_buf_fill(arc_buf_t *buf, boolean_t compressed) if (hdr_compressed == compressed) { if (!arc_buf_is_shared(buf)) { - bcopy(hdr->b_l1hdr.b_pdata, buf->b_data, + abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd, arc_buf_size(buf)); } } else { @@ -1792,7 +1786,7 @@ arc_buf_fill(arc_buf_t *buf, boolean_t compressed) return (0); } else { int error = zio_decompress_data(HDR_GET_COMPRESS(hdr), - hdr->b_l1hdr.b_pdata, buf->b_data, + hdr->b_l1hdr.b_pabd, buf->b_data, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); /* @@ -1829,7 +1823,7 @@ arc_decompress(arc_buf_t *buf) } /* - * Return the size of the block, b_pdata, that is stored in the arc_buf_hdr_t. + * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t. 
*/ static uint64_t arc_hdr_size(arc_buf_hdr_t *hdr) @@ -1862,14 +1856,14 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) if (GHOST_STATE(state)) { ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); (void) refcount_add_many(&state->arcs_esize[type], HDR_GET_LSIZE(hdr), hdr); return; } ASSERT(!GHOST_STATE(state)); - if (hdr->b_l1hdr.b_pdata != NULL) { + if (hdr->b_l1hdr.b_pabd != NULL) { (void) refcount_add_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); } @@ -1897,14 +1891,14 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) if (GHOST_STATE(state)) { ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); (void) refcount_remove_many(&state->arcs_esize[type], HDR_GET_LSIZE(hdr), hdr); return; } ASSERT(!GHOST_STATE(state)); - if (hdr->b_l1hdr.b_pdata != NULL) { + if (hdr->b_l1hdr.b_pabd != NULL) { (void) refcount_remove_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); } @@ -2051,7 +2045,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, old_state = hdr->b_l1hdr.b_state; refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); bufcnt = hdr->b_l1hdr.b_bufcnt; - update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pdata != NULL); + update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL); } else { old_state = arc_l2c_only; refcnt = 0; @@ -2120,7 +2114,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, */ (void) refcount_add_many(&new_state->arcs_size, HDR_GET_LSIZE(hdr), hdr); - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); } else { arc_buf_t *buf; uint32_t buffers = 0; @@ -2150,7 +2144,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, } ASSERT3U(bufcnt, ==, buffers); - if (hdr->b_l1hdr.b_pdata != NULL) { + if (hdr->b_l1hdr.b_pabd != 
NULL) { (void) refcount_add_many(&new_state->arcs_size, arc_hdr_size(hdr), hdr); } else { @@ -2163,7 +2157,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(old_state)) { ASSERT0(bufcnt); - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); /* * When moving a header off of a ghost state, @@ -2204,7 +2198,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, buf); } ASSERT3U(bufcnt, ==, buffers); - ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); (void) refcount_remove_many( &old_state->arcs_size, arc_hdr_size(hdr), hdr); } @@ -2302,7 +2296,7 @@ arc_space_return(uint64_t space, arc_space_type_t type) /* * Given a hdr and a buf, returns whether that buf can share its b_data buffer - * with the hdr's b_pdata. + * with the hdr's b_pabd. */ static boolean_t arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf) @@ -2397,17 +2391,20 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed, * set the appropriate bit in the hdr's b_flags to indicate the hdr is * allocate a new buffer to store the buf's data. * - * There is one additional restriction here because we're sharing - * hdr -> buf instead of the usual buf -> hdr: the hdr can't be actively - * involved in an L2ARC write, because if this buf is used by an - * arc_write() then the hdr's data buffer will be released when the + * There are two additional restrictions here because we're sharing + * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be + * actively involved in an L2ARC write, because if this buf is used by + * an arc_write() then the hdr's data buffer will be released when the * write completes, even though the L2ARC write might still be using it. + * Second, the hdr's ABD must be linear so that the buf's user doesn't + * need to be ABD-aware. 
*/ - can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr); + can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) && + abd_is_linear(hdr->b_l1hdr.b_pabd); /* Set up b_data and sharing */ if (can_share) { - buf->b_data = hdr->b_l1hdr.b_pdata; + buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd); buf->b_flags |= ARC_BUF_FLAG_SHARED; arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); } else { @@ -2492,11 +2489,11 @@ arc_loan_inuse_buf(arc_buf_t *buf, void *tag) } static void -l2arc_free_data_on_write(void *data, size_t size, arc_buf_contents_t type) +l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type) { l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP); - df->l2df_data = data; + df->l2df_abd = abd; df->l2df_size = size; df->l2df_type = type; mutex_enter(&l2arc_free_on_write_mtx); @@ -2521,7 +2518,7 @@ arc_hdr_free_on_write(arc_buf_hdr_t *hdr) } (void) refcount_remove_many(&state->arcs_size, size, hdr); - l2arc_free_data_on_write(hdr->b_l1hdr.b_pdata, size, type); + l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type); } /* @@ -2533,7 +2530,7 @@ static void arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { ASSERT(arc_can_share(hdr, buf)); - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); /* @@ -2542,7 +2539,9 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) * the refcount whenever an arc_buf_t is shared. 
*/ refcount_transfer_ownership(&hdr->b_l1hdr.b_state->arcs_size, buf, hdr); - hdr->b_l1hdr.b_pdata = buf->b_data; + hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf)); + abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd, + HDR_ISTYPE_METADATA(hdr)); arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); buf->b_flags |= ARC_BUF_FLAG_SHARED; @@ -2560,7 +2559,7 @@ static void arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { ASSERT(arc_buf_is_shared(buf)); - ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); /* @@ -2569,7 +2568,9 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) */ refcount_transfer_ownership(&hdr->b_l1hdr.b_state->arcs_size, hdr, buf); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); - hdr->b_l1hdr.b_pdata = NULL; + abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd); + abd_put(hdr->b_l1hdr.b_pabd); + hdr->b_l1hdr.b_pabd = NULL; buf->b_flags &= ~ARC_BUF_FLAG_SHARED; /* @@ -2665,7 +2666,7 @@ arc_buf_destroy_impl(arc_buf_t *buf) if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) { /* * If the current arc_buf_t is sharing its data buffer with the - * hdr, then reassign the hdr's b_pdata to share it with the new + * hdr, then reassign the hdr's b_pabd to share it with the new * buffer at the end of the list. The shared buffer is always * the last one on the hdr's buffer list. 
* @@ -2680,8 +2681,8 @@ arc_buf_destroy_impl(arc_buf_t *buf) /* hdr is uncompressed so can't have compressed buf */ VERIFY(!ARC_BUF_COMPRESSED(lastbuf)); - ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); - arc_hdr_free_pdata(hdr); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); + arc_hdr_free_pabd(hdr); /* * We must setup a new shared block between the @@ -2714,26 +2715,26 @@ arc_buf_destroy_impl(arc_buf_t *buf) } static void -arc_hdr_alloc_pdata(arc_buf_hdr_t *hdr) +arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr) { ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!HDR_SHARED_DATA(hdr)); - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); - hdr->b_l1hdr.b_pdata = arc_get_data_buf(hdr, arc_hdr_size(hdr), hdr); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); + hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr); hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; - ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); } static void -arc_hdr_free_pdata(arc_buf_hdr_t *hdr) +arc_hdr_free_pabd(arc_buf_hdr_t *hdr) { ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); /* * If the hdr is currently being written to the l2arc then @@ -2745,10 +2746,10 @@ arc_hdr_free_pdata(arc_buf_hdr_t *hdr) arc_hdr_free_on_write(hdr); ARCSTAT_BUMP(arcstat_l2_free_on_write); } else { - arc_free_data_buf(hdr, hdr->b_l1hdr.b_pdata, + arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, arc_hdr_size(hdr), hdr); } - hdr->b_l1hdr.b_pdata = NULL; + hdr->b_l1hdr.b_pabd = NULL; hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); @@ -2784,7 +2785,7 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, * the compressed or uncompressed data depending on the block * it references and compressed arc enablement. 
*/ - arc_hdr_alloc_pdata(hdr); + arc_hdr_alloc_pabd(hdr); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); return (hdr); @@ -2824,7 +2825,7 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) nhdr->b_l1hdr.b_state = arc_l2c_only; /* Verify previous threads set to NULL before freeing */ - ASSERT3P(nhdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL); } else { ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT0(hdr->b_l1hdr.b_bufcnt); @@ -2842,11 +2843,11 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) /* * A buffer must not be moved into the arc_l2c_only * state if it's not finished being written out to the - * l2arc device. Otherwise, the b_l1hdr.b_pdata field + * l2arc device. Otherwise, the b_l1hdr.b_pabd field * might try to be accessed, even though it was removed. */ VERIFY(!HDR_L2_WRITING(hdr)); - VERIFY3P(hdr->b_l1hdr.b_pdata, ==, NULL); + VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL); arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR); } @@ -2931,6 +2932,18 @@ arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, arc_buf_thaw(buf); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); + if (!arc_buf_is_shared(buf)) { + /* + * To ensure that the hdr has the correct data in it if we call + * arc_decompress() on this buf before it's been written to + * disk, it's easiest if we just set up sharing between the + * buf and the hdr. 
+ */ + ASSERT(!abd_is_linear(hdr->b_l1hdr.b_pabd)); + arc_hdr_free_pabd(hdr); + arc_share_buf(hdr, buf); + } + return (buf); } @@ -2999,9 +3012,8 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) while (hdr->b_l1hdr.b_buf != NULL) arc_buf_destroy_impl(hdr->b_l1hdr.b_buf); - if (hdr->b_l1hdr.b_pdata != NULL) { - arc_hdr_free_pdata(hdr); - } + if (hdr->b_l1hdr.b_pabd != NULL) + arc_hdr_free_pabd(hdr); } ASSERT3P(hdr->b_hash_next, ==, NULL); @@ -3068,7 +3080,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) /* * l2arc_write_buffers() relies on a header's L1 portion - * (i.e. its b_pdata field) during its write phase. + * (i.e. its b_pabd field) during its write phase. * Thus, we cannot push a header onto the arc_l2c_only * state (removing its L1 piece) until the header is * done being written to the l2arc. @@ -3084,7 +3096,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); if (HDR_HAS_L2HDR(hdr)) { - ASSERT(hdr->b_l1hdr.b_pdata == NULL); + ASSERT(hdr->b_l1hdr.b_pabd == NULL); /* * This buffer is cached on the 2nd Level ARC; * don't destroy the header. @@ -3149,9 +3161,9 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) * If this hdr is being evicted and has a compressed * buffer then we discard it here before we change states. * This ensures that the accounting is updated correctly - * in arc_free_data_buf(). + * in arc_free_data_impl(). */ - arc_hdr_free_pdata(hdr); + arc_hdr_free_pabd(hdr); arc_change_state(evicted_state, hdr, hash_lock); ASSERT(HDR_IN_HASH_TABLE(hdr)); @@ -3249,7 +3261,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, * thread. 
If we used cv_broadcast, we could * wake up "too many" threads causing arc_size * to significantly overflow arc_c; since - * arc_get_data_buf() doesn't check for overflow + * arc_get_data_impl() doesn't check for overflow * when it's woken up (it doesn't because it's * possible for the ARC to be overflowing while * full of un-evictable buffers, and the @@ -4154,13 +4166,13 @@ arc_kmem_reap_now(void) } /* - * Threads can block in arc_get_data_buf() waiting for this thread to evict + * Threads can block in arc_get_data_impl() waiting for this thread to evict * enough data and signal them to proceed. When this happens, the threads in - * arc_get_data_buf() are sleeping while holding the hash lock for their + * arc_get_data_impl() are sleeping while holding the hash lock for their * particular arc header. Thus, we must be careful to never sleep on a * hash lock in this thread. This is to prevent the following deadlock: * - * - Thread A sleeps on CV in arc_get_data_buf() holding hash lock "L", + * - Thread A sleeps on CV in arc_get_data_impl() holding hash lock "L", * waiting for the reclaim thread to signal it. * * - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter, @@ -4509,18 +4521,45 @@ arc_is_overflowing(void) return (arc_size >= arc_c + overflow); } +static abd_t * +arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag) +{ + arc_buf_contents_t type = arc_buf_type(hdr); + + arc_get_data_impl(hdr, size, tag); + if (type == ARC_BUFC_METADATA) { + return (abd_alloc(size, B_TRUE)); + } else { + ASSERT(type == ARC_BUFC_DATA); + return (abd_alloc(size, B_FALSE)); + } +} + +static void * +arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) +{ + arc_buf_contents_t type = arc_buf_type(hdr); + + arc_get_data_impl(hdr, size, tag); + if (type == ARC_BUFC_METADATA) { + return (zio_buf_alloc(size)); + } else { + ASSERT(type == ARC_BUFC_DATA); + return (zio_data_buf_alloc(size)); + } +} + /* * Allocate a block and return it to the caller. 
If we are hitting the * hard limit for the cache size, we must sleep, waiting for the eviction * thread to catch up. If we're past the target size but below the hard * limit, we'll only signal the reclaim thread and continue on. */ -static void * -arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) +static void +arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) { - void *datap = NULL; - arc_state_t *state = hdr->b_l1hdr.b_state; - arc_buf_contents_t type = arc_buf_type(hdr); + arc_state_t *state = hdr->b_l1hdr.b_state; + arc_buf_contents_t type = arc_buf_type(hdr); arc_adapt(size, state); @@ -4562,11 +4601,8 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { - datap = zio_buf_alloc(size); arc_space_consume(size, ARC_SPACE_META); } else { - ASSERT(type == ARC_BUFC_DATA); - datap = zio_data_buf_alloc(size); arc_space_consume(size, ARC_SPACE_DATA); } @@ -4602,14 +4638,34 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) refcount_count(&arc_mru->arcs_size) > arc_p)) arc_p = MIN(arc_c, arc_p + size); } - return (datap); +} + +static void +arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag) +{ + arc_free_data_impl(hdr, size, tag); + abd_free(abd); +} + +static void +arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag) +{ + arc_buf_contents_t type = arc_buf_type(hdr); + + arc_free_data_impl(hdr, size, tag); + if (type == ARC_BUFC_METADATA) { + zio_buf_free(buf, size); + } else { + ASSERT(type == ARC_BUFC_DATA); + zio_data_buf_free(buf, size); + } } /* * Free the arc data buffer. 
*/ static void -arc_free_data_buf(arc_buf_hdr_t *hdr, void *data, uint64_t size, void *tag) +arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); @@ -4626,11 +4682,9 @@ arc_free_data_buf(arc_buf_hdr_t *hdr, void *data, uint64_t size, void *tag) VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { - zio_buf_free(data, size); arc_space_return(size, ARC_SPACE_META); } else { ASSERT(type == ARC_BUFC_DATA); - zio_data_buf_free(data, size); arc_space_return(size, ARC_SPACE_DATA); } } @@ -4912,7 +4966,7 @@ arc_read_done(zio_t *zio) if (callback_cnt == 0) { ASSERT(HDR_PREFETCH(hdr)); ASSERT0(hdr->b_l1hdr.b_bufcnt); - ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); } ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || @@ -5009,7 +5063,7 @@ top: hdr = buf_hash_find(guid, bp, &hash_lock); } - if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pdata != NULL) { + if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pabd != NULL) { arc_buf_t *buf = NULL; *arc_flags |= ARC_FLAG_CACHED; @@ -5161,7 +5215,7 @@ top: hdr_full_cache); } - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); @@ -5179,9 +5233,9 @@ top: * avoid hitting an assert in remove_reference(). 
*/ arc_access(hdr, hash_lock); - arc_hdr_alloc_pdata(hdr); + arc_hdr_alloc_pabd(hdr); } - ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); size = arc_hdr_size(hdr); /* @@ -5285,7 +5339,7 @@ top: ASSERT3U(HDR_GET_COMPRESS(hdr), !=, ZIO_COMPRESS_EMPTY); rzio = zio_read_phys(pio, vd, addr, - size, hdr->b_l1hdr.b_pdata, + size, hdr->b_l1hdr.b_pabd, ZIO_CHECKSUM_OFF, l2arc_read_done, cb, priority, zio_flags | ZIO_FLAG_DONT_CACHE | @@ -5325,7 +5379,7 @@ top: } } - rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pdata, size, + rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pabd, size, arc_read_done, hdr, priority, zio_flags, zb); if (*arc_flags & ARC_FLAG_WAIT) { @@ -5557,16 +5611,17 @@ arc_release(arc_buf_t *buf, void *tag) arc_unshare_buf(hdr, buf); /* - * Now we need to recreate the hdr's b_pdata. Since we + * Now we need to recreate the hdr's b_pabd. Since we * have lastbuf handy, we try to share with it, but if - * we can't then we allocate a new b_pdata and copy the + * we can't then we allocate a new b_pabd and copy the * data from buf into it. */ if (arc_can_share(hdr, lastbuf)) { arc_share_buf(hdr, lastbuf); } else { - arc_hdr_alloc_pdata(hdr); - bcopy(buf->b_data, hdr->b_l1hdr.b_pdata, psize); + arc_hdr_alloc_pabd(hdr); + abd_copy_from_buf(hdr->b_l1hdr.b_pabd, + buf->b_data, psize); } VERIFY3P(lastbuf->b_data, !=, NULL); } else if (HDR_SHARED_DATA(hdr)) { @@ -5582,7 +5637,7 @@ arc_release(arc_buf_t *buf, void *tag) HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); ASSERT(!ARC_BUF_SHARED(buf)); } - ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); ASSERT3P(state, !=, arc_l2c_only); (void) refcount_remove_many(&state->arcs_size, @@ -5601,7 +5656,7 @@ arc_release(arc_buf_t *buf, void *tag) mutex_exit(hash_lock); /* - * Allocate a new hdr. The new hdr will contain a b_pdata + * Allocate a new hdr. The new hdr will contain a b_pabd * buffer which will be freed in arc_write(). 
*/ nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type); @@ -5677,6 +5732,7 @@ arc_write_ready(zio_t *zio) arc_buf_hdr_t *hdr = buf->b_hdr; uint64_t psize = BP_IS_HOLE(zio->io_bp) ? 0 : BP_GET_PSIZE(zio->io_bp); enum zio_compress compress; + fstrans_cookie_t cookie = spl_fstrans_mark(); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); @@ -5690,15 +5746,15 @@ arc_write_ready(zio_t *zio) if (zio->io_flags & ZIO_FLAG_REEXECUTED) { arc_cksum_free(hdr); arc_buf_unwatch(buf); - if (hdr->b_l1hdr.b_pdata != NULL) { + if (hdr->b_l1hdr.b_pabd != NULL) { if (arc_buf_is_shared(buf)) { arc_unshare_buf(hdr, buf); } else { - arc_hdr_free_pdata(hdr); + arc_hdr_free_pabd(hdr); } } } - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT(!arc_buf_is_shared(buf)); @@ -5720,33 +5776,47 @@ arc_write_ready(zio_t *zio) arc_hdr_set_compress(hdr, compress); /* - * If the hdr is compressed, then copy the compressed - * zio contents into arc_buf_hdr_t. Otherwise, copy the original - * data buf into the hdr. Ideally, we would like to always copy the - * io_data into b_pdata but the user may have disabled compressed - * arc thus the on-disk block may or may not match what we maintain - * in the hdr's b_pdata field. + * Fill the hdr with data. If the hdr is compressed, the data we want + * is available from the zio, otherwise we can take it from the buf. + * + * We might be able to share the buf's data with the hdr here. However, + * doing so would cause the ARC to be full of linear ABDs if we write a + * lot of shareable data. As a compromise, we check whether scattered + * ABDs are allowed, and assume that if they are then the user wants + * the ARC to be primarily filled with them regardless of the data being + * written. Therefore, if they're allowed then we allocate one and copy + * the data into it; otherwise, we share the data directly if we can. 
*/ - if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && - !ARC_BUF_COMPRESSED(buf)) { - ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=, ZIO_COMPRESS_OFF); - ASSERT3U(psize, >, 0); - arc_hdr_alloc_pdata(hdr); - bcopy(zio->io_data, hdr->b_l1hdr.b_pdata, psize); + if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) { + arc_hdr_alloc_pabd(hdr); + + /* + * Ideally, we would always copy the io_abd into b_pabd, but the + * user may have disabled compressed ARC, thus we must check the + * hdr's compression setting rather than the io_bp's. + */ + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { + ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=, + ZIO_COMPRESS_OFF); + ASSERT3U(psize, >, 0); + + abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize); + } else { + ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr)); + + abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, + arc_buf_size(buf)); + } } else { - ASSERT3P(buf->b_data, ==, zio->io_orig_data); + ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd)); ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf)); ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); - /* - * This hdr is not compressed so we're able to share - * the arc_buf_t data buffer with the hdr. - */ arc_share_buf(hdr, buf); - ASSERT0(bcmp(zio->io_orig_data, hdr->b_l1hdr.b_pdata, - HDR_GET_LSIZE(hdr))); } + arc_hdr_verify(hdr, zio->io_bp); + spl_fstrans_unmark(cookie); } static void @@ -5850,6 +5920,7 @@ arc_write_done(zio_t *zio) ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); callback->awcb_done(zio, buf, callback->awcb_private); + abd_put(zio->io_abd); kmem_free(callback, sizeof (arc_write_callback_t)); } @@ -5886,10 +5957,10 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, callback->awcb_buf = buf; /* - * The hdr's b_pdata is now stale, free it now. A new data block + * The hdr's b_pabd is now stale, free it now. A new data block * will be allocated when the zio pipeline calls arc_write_ready(). 
*/ - if (hdr->b_l1hdr.b_pdata != NULL) { + if (hdr->b_l1hdr.b_pabd != NULL) { /* * If the buf is currently sharing the data block with * the hdr then we need to break that relationship here. @@ -5899,15 +5970,16 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, if (arc_buf_is_shared(buf)) { arc_unshare_buf(hdr, buf); } else { - arc_hdr_free_pdata(hdr); + arc_hdr_free_pabd(hdr); } VERIFY3P(buf->b_data, !=, NULL); arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF); } ASSERT(!arc_buf_is_shared(buf)); - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); - zio = zio_write(pio, spa, txg, bp, buf->b_data, + zio = zio_write(pio, spa, txg, bp, + abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)), HDR_GET_LSIZE(hdr), arc_buf_size(buf), zp, arc_write_ready, (children_ready != NULL) ? arc_write_children_ready : NULL, @@ -6768,13 +6840,8 @@ l2arc_do_free_on_write(void) for (df = list_tail(buflist); df; df = df_prev) { df_prev = list_prev(buflist, df); - ASSERT3P(df->l2df_data, !=, NULL); - if (df->l2df_type == ARC_BUFC_METADATA) { - zio_buf_free(df->l2df_data, df->l2df_size); - } else { - ASSERT(df->l2df_type == ARC_BUFC_DATA); - zio_data_buf_free(df->l2df_data, df->l2df_size); - } + ASSERT3P(df->l2df_abd, !=, NULL); + abd_free(df->l2df_abd); list_remove(buflist, df); kmem_free(df, sizeof (l2arc_data_free_t)); } @@ -6928,12 +6995,12 @@ l2arc_read_done(zio_t *zio) mutex_enter(hash_lock); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - ASSERT3P(zio->io_data, !=, NULL); + ASSERT3P(zio->io_abd, !=, NULL); /* * Check this survived the L2ARC journey. 
*/ - ASSERT3P(zio->io_data, ==, hdr->b_l1hdr.b_pdata); + ASSERT3P(zio->io_abd, ==, hdr->b_l1hdr.b_pabd); zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ @@ -6967,7 +7034,7 @@ l2arc_read_done(zio_t *zio) ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp, - hdr->b_l1hdr.b_pdata, zio->io_size, arc_read_done, + hdr->b_l1hdr.b_pabd, zio->io_size, arc_read_done, hdr, zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); } @@ -7191,7 +7258,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) for (; hdr; hdr = hdr_prev) { kmutex_t *hash_lock; uint64_t asize, size; - void *to_write; + abd_t *to_write; if (arc_warm == B_FALSE) hdr_prev = multilist_sublist_next(mls, hdr); @@ -7264,7 +7331,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); - ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); ASSERT3U(arc_hdr_size(hdr), >, 0); size = arc_hdr_size(hdr); @@ -7280,18 +7347,13 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) * add it to the l2arc_free_on_write queue. 
*/ if (!HDR_SHARED_DATA(hdr)) { - to_write = hdr->b_l1hdr.b_pdata; + to_write = hdr->b_l1hdr.b_pabd; } else { - arc_buf_contents_t type = arc_buf_type(hdr); - if (type == ARC_BUFC_METADATA) { - to_write = zio_buf_alloc(size); - } else { - ASSERT3U(type, ==, ARC_BUFC_DATA); - to_write = zio_data_buf_alloc(size); - } - - bcopy(hdr->b_l1hdr.b_pdata, to_write, size); - l2arc_free_data_on_write(to_write, size, type); + to_write = abd_alloc_for_io(size, + HDR_ISTYPE_METADATA(hdr)); + abd_copy(to_write, hdr->b_l1hdr.b_pabd, size); + l2arc_free_abd_on_write(to_write, size, + arc_buf_type(hdr)); } wzio = zio_write_phys(pio, dev->l2ad_vdev, hdr->b_l2hdr.b_daddr, size, to_write, diff --git a/module/zfs/blkptr.c b/module/zfs/blkptr.c index d56e19996..99accfa0f 100644 --- a/module/zfs/blkptr.c +++ b/module/zfs/blkptr.c @@ -14,7 +14,7 @@ */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 1d8c0518a..6e7a5a0fb 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -46,6 +46,7 @@ #include <sys/range_tree.h> #include <sys/trace_dbuf.h> #include <sys/callb.h> +#include <sys/abd.h> struct dbuf_hold_impl_data { /* Function arguments */ @@ -3709,6 +3710,9 @@ dbuf_write_override_done(zio_t *zio) mutex_exit(&db->db_mtx); dbuf_write_done(zio, NULL, db); + + if (zio->io_abd != NULL) + abd_put(zio->io_abd); } /* Issue I/O to commit a dirty buffer to disk. */ @@ -3801,7 +3805,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) * The BP for this block has been provided by open context * (by dmu_sync() or dmu_buf_write_embedded()). */ - void *contents = (data != NULL) ? data->b_data : NULL; + abd_t *contents = (data != NULL) ? 
+ abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL; dr->dr_zio = zio_write(zio, os->os_spa, txg, &dr->dr_bp_copy, contents, db->db.db_size, db->db.db_size, diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index 09a3536f5..cbec70057 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -36,6 +36,7 @@ #include <sys/zio_checksum.h> #include <sys/zio_compress.h> #include <sys/dsl_scan.h> +#include <sys/abd.h> static kmem_cache_t *ddt_cache; static kmem_cache_t *ddt_entry_cache; @@ -706,9 +707,8 @@ ddt_free(ddt_entry_t *dde) for (p = 0; p < DDT_PHYS_TYPES; p++) ASSERT(dde->dde_lead_zio[p] == NULL); - if (dde->dde_repair_data != NULL) - zio_buf_free(dde->dde_repair_data, - DDK_GET_PSIZE(&dde->dde_key)); + if (dde->dde_repair_abd != NULL) + abd_free(dde->dde_repair_abd); cv_destroy(&dde->dde_cv); kmem_cache_free(ddt_entry_cache, dde); @@ -1002,7 +1002,7 @@ ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde) ddt_enter(ddt); - if (dde->dde_repair_data != NULL && spa_writeable(ddt->ddt_spa) && + if (dde->dde_repair_abd != NULL && spa_writeable(ddt->ddt_spa) && avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL) avl_insert(&ddt->ddt_repair_tree, dde, where); else @@ -1040,7 +1040,7 @@ ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio) continue; ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk, - rdde->dde_repair_data, DDK_GET_PSIZE(rddk), NULL, NULL, + rdde->dde_repair_abd, DDK_GET_PSIZE(rddk), NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL)); } diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index a817fdbce..b5ddec2d9 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -47,6 +47,7 @@ #include <sys/zio_compress.h> 
#include <sys/sa.h> #include <sys/zfeature.h> +#include <sys/abd.h> #ifdef _KERNEL #include <sys/vmsystm.h> #include <sys/zfs_znode.h> @@ -1513,6 +1514,7 @@ dmu_sync_late_arrival_done(zio_t *zio) dsa->dsa_done(dsa->dsa_zgd, zio->io_error); + abd_put(zio->io_abd); kmem_free(dsa, sizeof (*dsa)); } @@ -1537,11 +1539,11 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, dsa->dsa_zgd = zgd; dsa->dsa_tx = tx; - zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), - zgd->zgd_bp, zgd->zgd_db->db_data, zgd->zgd_db->db_size, - zgd->zgd_db->db_size, zp, dmu_sync_late_arrival_ready, NULL, - NULL, dmu_sync_late_arrival_done, dsa, ZIO_PRIORITY_SYNC_WRITE, - ZIO_FLAG_CANFAIL, zb)); + zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp, + abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size), + zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp, + dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done, + dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); return (0); } @@ -2062,6 +2064,7 @@ byteswap_uint8_array(void *vbuf, size_t size) void dmu_init(void) { + abd_init(); zfs_dbgmsg_init(); sa_cache_init(); xuio_stat_init(); @@ -2087,6 +2090,7 @@ dmu_fini(void) xuio_stat_fini(); sa_cache_fini(); zfs_dbgmsg_fini(); + abd_fini(); } #if defined(_KERNEL) && defined(HAVE_SPL) diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index f9414ea3a..af6208e4d 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -166,7 +166,7 @@ dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len) { ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); - fletcher_4_incremental_native(dsp->dsa_drr, + (void) fletcher_4_incremental_native(dsp->dsa_drr, offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), &dsp->dsa_zc); if (dsp->dsa_drr->drr_type == DRR_BEGIN) { @@ -179,13 +179,13 @@ dump_record(dmu_sendarg_t *dsp, void 
*payload, int payload_len) if (dsp->dsa_drr->drr_type == DRR_END) { dsp->dsa_sent_end = B_TRUE; } - fletcher_4_incremental_native(&dsp->dsa_drr-> + (void) fletcher_4_incremental_native(&dsp->dsa_drr-> drr_u.drr_checksum.drr_checksum, sizeof (zio_cksum_t), &dsp->dsa_zc); if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) return (SET_ERROR(EINTR)); if (payload_len != 0) { - fletcher_4_incremental_native(payload, payload_len, + (void) fletcher_4_incremental_native(payload, payload_len, &dsp->dsa_zc); if (dump_bytes(dsp, payload, payload_len) != 0) return (SET_ERROR(EINTR)); @@ -1786,11 +1786,11 @@ dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { drc->drc_byteswap = B_TRUE; - fletcher_4_incremental_byteswap(drr_begin, + (void) fletcher_4_incremental_byteswap(drr_begin, sizeof (dmu_replay_record_t), &drc->drc_cksum); byteswap_record(drr_begin); } else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) { - fletcher_4_incremental_native(drr_begin, + (void) fletcher_4_incremental_native(drr_begin, sizeof (dmu_replay_record_t), &drc->drc_cksum); } else { return (SET_ERROR(EINVAL)); @@ -2470,9 +2470,9 @@ static void receive_cksum(struct receive_arg *ra, int len, void *buf) { if (ra->byteswap) { - fletcher_4_incremental_byteswap(buf, len, &ra->cksum); + (void) fletcher_4_incremental_byteswap(buf, len, &ra->cksum); } else { - fletcher_4_incremental_native(buf, len, &ra->cksum); + (void) fletcher_4_incremental_native(buf, len, &ra->cksum); } } diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 41b3ce79b..fd7a53bc9 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2016 by Delphix. All rights reserved. 
* Copyright 2016 Gary Mills */ @@ -47,6 +47,7 @@ #include <sys/sa.h> #include <sys/sa_impl.h> #include <sys/zfeature.h> +#include <sys/abd.h> #ifdef _KERNEL #include <sys/zfs_vfsops.h> #endif @@ -1820,7 +1821,7 @@ dsl_scan_scrub_done(zio_t *zio) { spa_t *spa = zio->io_spa; - zio_data_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_abd); mutex_enter(&spa->spa_scrub_lock); spa->spa_scrub_inflight--; @@ -1904,7 +1905,6 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, if (needs_io && !zfs_no_scrub_io) { vdev_t *rvd = spa->spa_root_vdev; uint64_t maxinflight = rvd->vdev_children * zfs_top_maxinflight; - void *data = zio_data_buf_alloc(size); mutex_enter(&spa->spa_scrub_lock); while (spa->spa_scrub_inflight >= maxinflight) @@ -1919,9 +1919,9 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle) delay(scan_delay); - zio_nowait(zio_read(NULL, spa, bp, data, size, - dsl_scan_scrub_done, NULL, ZIO_PRIORITY_SCRUB, - zio_flags, zb)); + zio_nowait(zio_read(NULL, spa, bp, + abd_alloc_for_io(size, B_FALSE), size, dsl_scan_scrub_done, + NULL, ZIO_PRIORITY_SCRUB, zio_flags, zb)); } /* do not relocate this block */ diff --git a/module/zfs/edonr_zfs.c b/module/zfs/edonr_zfs.c index 3c7d98656..e92da6d6c 100644 --- a/module/zfs/edonr_zfs.c +++ b/module/zfs/edonr_zfs.c @@ -22,20 +22,32 @@ * Copyright 2013 Saso Kiselkov. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2016 by Delphix. All rights reserved. + */ #include <sys/zfs_context.h> #include <sys/zio.h> #include <sys/edonr.h> #include <sys/zfs_context.h> /* For CTASSERT() */ +#include <sys/abd.h> #define EDONR_MODE 512 #define EDONR_BLOCK_SIZE EdonR512_BLOCK_SIZE +static int +edonr_incremental(void *buf, size_t size, void *arg) +{ + EdonRState *ctx = arg; + EdonRUpdate(ctx, buf, size * 8); + return (0); +} + /* * Native zio_checksum interface for the Edon-R hash function. 
*/ /*ARGSUSED*/ void -zio_checksum_edonr_native(const void *buf, uint64_t size, +abd_checksum_edonr_native(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { uint8_t digest[EDONR_MODE / 8]; @@ -43,7 +55,7 @@ zio_checksum_edonr_native(const void *buf, uint64_t size, ASSERT(ctx_template != NULL); bcopy(ctx_template, &ctx, sizeof (ctx)); - EdonRUpdate(&ctx, buf, size * 8); + (void) abd_iterate_func(abd, 0, size, edonr_incremental, &ctx); EdonRFinal(&ctx, digest); bcopy(digest, zcp->zc_word, sizeof (zcp->zc_word)); } @@ -52,12 +64,12 @@ zio_checksum_edonr_native(const void *buf, uint64_t size, * Byteswapped zio_checksum interface for the Edon-R hash function. */ void -zio_checksum_edonr_byteswap(const void *buf, uint64_t size, +abd_checksum_edonr_byteswap(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { zio_cksum_t tmp; - zio_checksum_edonr_native(buf, size, ctx_template, &tmp); + abd_checksum_edonr_native(abd, size, ctx_template, &tmp); zcp->zc_word[0] = BSWAP_64(zcp->zc_word[0]); zcp->zc_word[1] = BSWAP_64(zcp->zc_word[1]); zcp->zc_word[2] = BSWAP_64(zcp->zc_word[2]); @@ -65,7 +77,7 @@ zio_checksum_edonr_byteswap(const void *buf, uint64_t size, } void * -zio_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt) +abd_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt) { EdonRState *ctx; uint8_t salt_block[EDONR_BLOCK_SIZE]; @@ -94,7 +106,7 @@ zio_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt) } void -zio_checksum_edonr_tmpl_free(void *ctx_template) +abd_checksum_edonr_tmpl_free(void *ctx_template) { EdonRState *ctx = ctx_template; diff --git a/module/zfs/sha256.c b/module/zfs/sha256.c index c8a4882f8..23a97aa3d 100644 --- a/module/zfs/sha256.c +++ b/module/zfs/sha256.c @@ -24,30 +24,39 @@ */ /* * Copyright 2013 Saso Kiselkov. All rights reserved. + * Copyright (c) 2016 by Delphix. All rights reserved. 
*/ #include <sys/zfs_context.h> #include <sys/zio.h> -#include <sys/zio_checksum.h> #include <sys/sha2.h> +#include <sys/abd.h> + +static int +sha_incremental(void *buf, size_t size, void *arg) +{ + SHA2_CTX *ctx = arg; + SHA2Update(ctx, buf, size); + return (0); +} /*ARGSUSED*/ void -zio_checksum_SHA256(const void *buf, uint64_t size, +abd_checksum_SHA256(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { SHA2_CTX ctx; zio_cksum_t tmp; SHA2Init(SHA256, &ctx); - SHA2Update(&ctx, buf, size); + (void) abd_iterate_func(abd, 0, size, sha_incremental, &ctx); SHA2Final(&tmp, &ctx); /* * A prior implementation of this function had a * private SHA256 implementation always wrote things out in * Big Endian and there wasn't a byteswap variant of it. - * To preseve on disk compatibility we need to force that - * behaviour. + * To preserve on disk compatibility we need to force that + * behavior. */ zcp->zc_word[0] = BE_64(tmp.zc_word[0]); zcp->zc_word[1] = BE_64(tmp.zc_word[1]); @@ -57,24 +66,24 @@ zio_checksum_SHA256(const void *buf, uint64_t size, /*ARGSUSED*/ void -zio_checksum_SHA512_native(const void *buf, uint64_t size, +abd_checksum_SHA512_native(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { SHA2_CTX ctx; SHA2Init(SHA512_256, &ctx); - SHA2Update(&ctx, buf, size); + (void) abd_iterate_func(abd, 0, size, sha_incremental, &ctx); SHA2Final(zcp, &ctx); } /*ARGSUSED*/ void -zio_checksum_SHA512_byteswap(const void *buf, uint64_t size, +abd_checksum_SHA512_byteswap(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { zio_cksum_t tmp; - zio_checksum_SHA512_native(buf, size, ctx_template, &tmp); + abd_checksum_SHA512_native(abd, size, ctx_template, &tmp); zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]); zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]); zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]); diff --git a/module/zfs/skein_zfs.c b/module/zfs/skein_zfs.c index 659234039..8deb84b26 100644 --- a/module/zfs/skein_zfs.c +++ 
b/module/zfs/skein_zfs.c @@ -20,42 +20,52 @@ */ /* * Copyright 2013 Saso Kiselkov. All rights reserved. + * Copyright (c) 2016 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> #include <sys/zio.h> #include <sys/skein.h> +#include <sys/abd.h> + +static int +skein_incremental(void *buf, size_t size, void *arg) +{ + Skein_512_Ctxt_t *ctx = arg; + (void) Skein_512_Update(ctx, buf, size); + return (0); +} /* * Computes a native 256-bit skein MAC checksum. Please note that this * function requires the presence of a ctx_template that should be allocated - * using zio_checksum_skein_tmpl_init. + * using abd_checksum_skein_tmpl_init. */ /*ARGSUSED*/ void -zio_checksum_skein_native(const void *buf, uint64_t size, +abd_checksum_skein_native(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { Skein_512_Ctxt_t ctx; ASSERT(ctx_template != NULL); bcopy(ctx_template, &ctx, sizeof (ctx)); - (void) Skein_512_Update(&ctx, buf, size); + (void) abd_iterate_func(abd, 0, size, skein_incremental, &ctx); (void) Skein_512_Final(&ctx, (uint8_t *)zcp); bzero(&ctx, sizeof (ctx)); } /* - * Byteswapped version of zio_checksum_skein_native. This just invokes + * Byteswapped version of abd_checksum_skein_native. This just invokes * the native checksum function and byteswaps the resulting checksum (since * skein is internally endian-insensitive). */ void -zio_checksum_skein_byteswap(const void *buf, uint64_t size, +abd_checksum_skein_byteswap(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { zio_cksum_t tmp; - zio_checksum_skein_native(buf, size, ctx_template, &tmp); + abd_checksum_skein_native(abd, size, ctx_template, &tmp); zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]); zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]); zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]); @@ -67,7 +77,7 @@ zio_checksum_skein_byteswap(const void *buf, uint64_t size, * computations and returns a pointer to it. 
*/ void * -zio_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt) +abd_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt) { Skein_512_Ctxt_t *ctx; @@ -82,7 +92,7 @@ zio_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt) * zio_checksum_skein_tmpl_init. */ void -zio_checksum_skein_tmpl_free(void *ctx_template) +abd_checksum_skein_tmpl_free(void *ctx_template) { Skein_512_Ctxt_t *ctx = ctx_template; diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 05e15a2e6..c55225a10 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1963,6 +1963,7 @@ spa_load_verify_done(zio_t *zio) int error = zio->io_error; spa_t *spa = zio->io_spa; + abd_free(zio->io_abd); if (error) { if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && type != DMU_OT_INTENT_LOG) @@ -1970,7 +1971,6 @@ spa_load_verify_done(zio_t *zio) else atomic_inc_64(&sle->sle_data_count); } - zio_data_buf_free(zio->io_data, zio->io_size); mutex_enter(&spa->spa_scrub_lock); spa->spa_scrub_inflight--; @@ -1993,7 +1993,6 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, { zio_t *rio; size_t size; - void *data; if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) return (0); @@ -2004,12 +2003,11 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, */ if (!spa_load_verify_metadata) return (0); - if (BP_GET_BUFC_TYPE(bp) == ARC_BUFC_DATA && !spa_load_verify_data) + if (!BP_IS_METADATA(bp) && !spa_load_verify_data) return (0); rio = arg; size = BP_GET_PSIZE(bp); - data = zio_data_buf_alloc(size); mutex_enter(&spa->spa_scrub_lock); while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight) @@ -2017,7 +2015,7 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, spa->spa_scrub_inflight++; mutex_exit(&spa->spa_scrub_lock); - zio_nowait(zio_read(rio, spa, bp, data, size, + zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size, spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | 
ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index db44d2ae1..8fc1a8d28 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -43,6 +43,7 @@ #include <sys/arc.h> #include <sys/zil.h> #include <sys/dsl_scan.h> +#include <sys/abd.h> #include <sys/zvol.h> #include <sys/zfs_ratelimit.h> @@ -999,16 +1000,16 @@ vdev_probe_done(zio_t *zio) vps->vps_readable = 1; if (zio->io_error == 0 && spa_writeable(spa)) { zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, - zio->io_offset, zio->io_size, zio->io_data, + zio->io_offset, zio->io_size, zio->io_abd, ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); } else { - zio_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_abd); } } else if (zio->io_type == ZIO_TYPE_WRITE) { if (zio->io_error == 0) vps->vps_writeable = 1; - zio_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_abd); } else if (zio->io_type == ZIO_TYPE_NULL) { zio_t *pio; zio_link_t *zl; @@ -1126,8 +1127,8 @@ vdev_probe(vdev_t *vd, zio_t *zio) for (l = 1; l < VDEV_LABELS; l++) { zio_nowait(zio_read_phys(pio, vd, vdev_label_offset(vd->vdev_psize, l, - offsetof(vdev_label_t, vl_pad2)), - VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE), + offsetof(vdev_label_t, vl_pad2)), VDEV_PAD_SIZE, + abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE), ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); } diff --git a/module/zfs/vdev_cache.c b/module/zfs/vdev_cache.c index 321ea4a2f..ec701097b 100644 --- a/module/zfs/vdev_cache.c +++ b/module/zfs/vdev_cache.c @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -31,6 +31,7 @@ #include <sys/vdev_impl.h> #include <sys/zio.h> #include <sys/kstat.h> +#include <sys/abd.h> /* * Virtual device read-ahead caching. 
@@ -136,12 +137,12 @@ static void vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve) { ASSERT(MUTEX_HELD(&vc->vc_lock)); - ASSERT(ve->ve_fill_io == NULL); - ASSERT(ve->ve_data != NULL); + ASSERT3P(ve->ve_fill_io, ==, NULL); + ASSERT3P(ve->ve_abd, !=, NULL); avl_remove(&vc->vc_lastused_tree, ve); avl_remove(&vc->vc_offset_tree, ve); - zio_buf_free(ve->ve_data, VCBS); + abd_free(ve->ve_abd); kmem_free(ve, sizeof (vdev_cache_entry_t)); } @@ -171,14 +172,14 @@ vdev_cache_allocate(zio_t *zio) ve = avl_first(&vc->vc_lastused_tree); if (ve->ve_fill_io != NULL) return (NULL); - ASSERT(ve->ve_hits != 0); + ASSERT3U(ve->ve_hits, !=, 0); vdev_cache_evict(vc, ve); } ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP); ve->ve_offset = offset; ve->ve_lastused = ddi_get_lbolt(); - ve->ve_data = zio_buf_alloc(VCBS); + ve->ve_abd = abd_alloc_for_io(VCBS, B_TRUE); avl_add(&vc->vc_offset_tree, ve); avl_add(&vc->vc_lastused_tree, ve); @@ -192,7 +193,7 @@ vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio) uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS); ASSERT(MUTEX_HELD(&vc->vc_lock)); - ASSERT(ve->ve_fill_io == NULL); + ASSERT3P(ve->ve_fill_io, ==, NULL); if (ve->ve_lastused != ddi_get_lbolt()) { avl_remove(&vc->vc_lastused_tree, ve); @@ -201,7 +202,7 @@ vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio) } ve->ve_hits++; - bcopy(ve->ve_data + cache_phase, zio->io_data, zio->io_size); + abd_copy_off(zio->io_abd, ve->ve_abd, 0, cache_phase, zio->io_size); } /* @@ -216,16 +217,16 @@ vdev_cache_fill(zio_t *fio) zio_t *pio; zio_link_t *zl; - ASSERT(fio->io_size == VCBS); + ASSERT3U(fio->io_size, ==, VCBS); /* * Add data to the cache. 
*/ mutex_enter(&vc->vc_lock); - ASSERT(ve->ve_fill_io == fio); - ASSERT(ve->ve_offset == fio->io_offset); - ASSERT(ve->ve_data == fio->io_data); + ASSERT3P(ve->ve_fill_io, ==, fio); + ASSERT3U(ve->ve_offset, ==, fio->io_offset); + ASSERT3P(ve->ve_abd, ==, fio->io_abd); ve->ve_fill_io = NULL; @@ -256,7 +257,7 @@ vdev_cache_read(zio_t *zio) zio_t *fio; ASSERTV(uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS)); - ASSERT(zio->io_type == ZIO_TYPE_READ); + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); if (zio->io_flags & ZIO_FLAG_DONT_CACHE) return (B_FALSE); @@ -270,7 +271,7 @@ vdev_cache_read(zio_t *zio) if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS)) return (B_FALSE); - ASSERT(cache_phase + zio->io_size <= VCBS); + ASSERT3U(cache_phase + zio->io_size, <=, VCBS); mutex_enter(&vc->vc_lock); @@ -309,7 +310,7 @@ vdev_cache_read(zio_t *zio) } fio = zio_vdev_delegated_io(zio->io_vd, cache_offset, - ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW, + ve->ve_abd, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW, ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve); ve->ve_fill_io = fio; @@ -337,7 +338,7 @@ vdev_cache_write(zio_t *zio) uint64_t max_offset = P2ROUNDUP(io_end, VCBS); avl_index_t where; - ASSERT(zio->io_type == ZIO_TYPE_WRITE); + ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); mutex_enter(&vc->vc_lock); @@ -354,8 +355,8 @@ vdev_cache_write(zio_t *zio) if (ve->ve_fill_io != NULL) { ve->ve_missed_update = 1; } else { - bcopy((char *)zio->io_data + start - io_start, - ve->ve_data + start - ve->ve_offset, end - start); + abd_copy_off(ve->ve_abd, zio->io_abd, start - io_start, + start - ve->ve_offset, end - start); } ve = AVL_NEXT(&vc->vc_offset_tree, ve); } diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c index ce65760ee..ae6ed4de9 100644 --- a/module/zfs/vdev_disk.c +++ b/module/zfs/vdev_disk.c @@ -30,6 +30,7 @@ #include <sys/spa.h> #include <sys/vdev_disk.h> #include <sys/vdev_impl.h> +#include <sys/abd.h> #include <sys/fs/zfs.h> #include <sys/zio.h> #include 
<sys/sunldi.h> @@ -411,6 +412,7 @@ vdev_disk_dio_put(dio_request_t *dr) ASSERT3S(zio->io_error, >=, 0); if (zio->io_error) vdev_disk_error(zio); + zio_delay_interrupt(zio); } } @@ -434,17 +436,10 @@ BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error) #endif } - /* Drop reference aquired by __vdev_disk_physio */ + /* Drop reference acquired by __vdev_disk_physio */ rc = vdev_disk_dio_put(dr); } -static inline unsigned long -bio_nr_pages(void *bio_ptr, unsigned int bio_size) -{ - return ((((unsigned long)bio_ptr + bio_size + PAGE_SIZE - 1) >> - PAGE_SHIFT) - ((unsigned long)bio_ptr >> PAGE_SHIFT)); -} - static unsigned int bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size) { @@ -484,6 +479,15 @@ bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size) return (bio_size); } +static unsigned int +bio_map_abd_off(struct bio *bio, abd_t *abd, unsigned int size, size_t off) +{ + if (abd_is_linear(abd)) + return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, size)); + + return (abd_scatter_bio_map_off(bio, abd, size, off)); +} + #ifndef bio_set_op_attrs #define bio_set_op_attrs(bio, rw, flags) \ do { (bio)->bi_rw |= (rw)|(flags); } while (0) @@ -516,11 +520,11 @@ vdev_submit_bio(struct bio *bio) } static int -__vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr, - size_t kbuf_size, uint64_t kbuf_offset, int rw, int flags) +__vdev_disk_physio(struct block_device *bdev, zio_t *zio, + size_t io_size, uint64_t io_offset, int rw, int flags) { dio_request_t *dr; - caddr_t bio_ptr; + uint64_t abd_offset; uint64_t bio_offset; int bio_size, bio_count = 16; int i = 0, error = 0; @@ -528,7 +532,8 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr, struct blk_plug plug; #endif - ASSERT3U(kbuf_offset + kbuf_size, <=, bdev->bd_inode->i_size); + ASSERT(zio != NULL); + ASSERT3U(io_offset + io_size, <=, bdev->bd_inode->i_size); retry: dr = vdev_disk_dio_alloc(bio_count); @@ -547,9 +552,10 @@ retry: * their volume block 
size to match the maximum request size and * the common case will be one bio per vdev IO request. */ - bio_ptr = kbuf_ptr; - bio_offset = kbuf_offset; - bio_size = kbuf_size; + + abd_offset = 0; + bio_offset = io_offset; + bio_size = io_size; for (i = 0; i <= dr->dr_bio_count; i++) { /* Finished constructing bio's for given buffer */ @@ -569,7 +575,8 @@ retry: /* bio_alloc() with __GFP_WAIT never returns NULL */ dr->dr_bio[i] = bio_alloc(GFP_NOIO, - MIN(bio_nr_pages(bio_ptr, bio_size), BIO_MAX_PAGES)); + MIN(abd_nr_pages_off(zio->io_abd, bio_size, abd_offset), + BIO_MAX_PAGES)); if (unlikely(dr->dr_bio[i] == NULL)) { vdev_disk_dio_free(dr); return (ENOMEM); @@ -585,10 +592,11 @@ retry: bio_set_op_attrs(dr->dr_bio[i], rw, flags); /* Remaining size is returned to become the new size */ - bio_size = bio_map(dr->dr_bio[i], bio_ptr, bio_size); + bio_size = bio_map_abd_off(dr->dr_bio[i], zio->io_abd, + bio_size, abd_offset); /* Advance in buffer and construct another bio if needed */ - bio_ptr += BIO_BI_SIZE(dr->dr_bio[i]); + abd_offset += BIO_BI_SIZE(dr->dr_bio[i]); bio_offset += BIO_BI_SIZE(dr->dr_bio[i]); } @@ -730,7 +738,7 @@ vdev_disk_io_start(zio_t *zio) } zio->io_target_timestamp = zio_handle_io_delay(zio); - error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_data, + error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_size, zio->io_offset, rw, flags); if (error) { zio->io_error = error; diff --git a/module/zfs/vdev_file.c b/module/zfs/vdev_file.c index a0a23598b..c78f2f421 100644 --- a/module/zfs/vdev_file.c +++ b/module/zfs/vdev_file.c @@ -31,6 +31,7 @@ #include <sys/zio.h> #include <sys/fs/zfs.h> #include <sys/fm/fs/zfs.h> +#include <sys/abd.h> /* * Virtual device vector for files. 
@@ -150,11 +151,21 @@ vdev_file_io_strategy(void *arg) vdev_t *vd = zio->io_vd; vdev_file_t *vf = vd->vdev_tsd; ssize_t resid; + void *buf; + + if (zio->io_type == ZIO_TYPE_READ) + buf = abd_borrow_buf(zio->io_abd, zio->io_size); + else + buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size); zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ? - UIO_READ : UIO_WRITE, vf->vf_vnode, zio->io_data, - zio->io_size, zio->io_offset, UIO_SYSSPACE, - 0, RLIM64_INFINITY, kcred, &resid); + UIO_READ : UIO_WRITE, vf->vf_vnode, buf, zio->io_size, + zio->io_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); + + if (zio->io_type == ZIO_TYPE_READ) + abd_return_buf_copy(zio->io_abd, buf, zio->io_size); + else + abd_return_buf(zio->io_abd, buf, zio->io_size); if (resid != 0 && zio->io_error == 0) zio->io_error = SET_ERROR(ENOSPC); diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 4edbfa41e..7a3a0e8a0 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -145,6 +145,7 @@ #include <sys/metaslab.h> #include <sys/zio.h> #include <sys/dsl_scan.h> +#include <sys/abd.h> #include <sys/fs/zfs.h> /* @@ -178,7 +179,7 @@ vdev_label_number(uint64_t psize, uint64_t offset) } static void -vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, +vdev_label_read(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, int flags) { ASSERT(spa_config_held(zio->io_spa, SCL_STATE_ALL, RW_WRITER) == @@ -192,7 +193,7 @@ vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, } static void -vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, +vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, int flags) { ASSERT(spa_config_held(zio->io_spa, SCL_ALL, RW_WRITER) == SCL_ALL || @@ -587,6 +588,7 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg) spa_t *spa = vd->vdev_spa; 
nvlist_t *config = NULL; vdev_phys_t *vp; + abd_t *vp_abd; zio_t *zio; uint64_t best_txg = 0; int error = 0; @@ -599,7 +601,8 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg) if (!vdev_readable(vd)) return (NULL); - vp = zio_buf_alloc(sizeof (vdev_phys_t)); + vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); + vp = abd_to_buf(vp_abd); retry: for (l = 0; l < VDEV_LABELS; l++) { @@ -607,7 +610,7 @@ retry: zio = zio_root(spa, NULL, NULL, flags); - vdev_label_read(zio, vd, l, vp, + vdev_label_read(zio, vd, l, vp_abd, offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), NULL, NULL, flags); @@ -646,7 +649,7 @@ retry: goto retry; } - zio_buf_free(vp, sizeof (vdev_phys_t)); + abd_free(vp_abd); return (config); } @@ -782,8 +785,10 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) spa_t *spa = vd->vdev_spa; nvlist_t *label; vdev_phys_t *vp; - char *pad2; + abd_t *vp_abd; + abd_t *pad2; uberblock_t *ub; + abd_t *ub_abd; zio_t *zio; char *buf; size_t buflen; @@ -867,8 +872,9 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) /* * Initialize its label. */ - vp = zio_buf_alloc(sizeof (vdev_phys_t)); - bzero(vp, sizeof (vdev_phys_t)); + vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); + abd_zero(vp_abd, sizeof (vdev_phys_t)); + vp = abd_to_buf(vp_abd); /* * Generate a label describing the pool and our top-level vdev. @@ -928,7 +934,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP); if (error != 0) { nvlist_free(label); - zio_buf_free(vp, sizeof (vdev_phys_t)); + abd_free(vp_abd); /* EFAULT means nvlist_pack ran out of room */ return (error == EFAULT ? ENAMETOOLONG : EINVAL); } @@ -936,14 +942,15 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) /* * Initialize uberblock template. 
*/ - ub = zio_buf_alloc(VDEV_UBERBLOCK_RING); - bzero(ub, VDEV_UBERBLOCK_RING); - *ub = spa->spa_uberblock; + ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_RING, B_TRUE); + abd_zero(ub_abd, VDEV_UBERBLOCK_RING); + abd_copy_from_buf(ub_abd, &spa->spa_uberblock, sizeof (uberblock_t)); + ub = abd_to_buf(ub_abd); ub->ub_txg = 0; /* Initialize the 2nd padding area. */ - pad2 = zio_buf_alloc(VDEV_PAD_SIZE); - bzero(pad2, VDEV_PAD_SIZE); + pad2 = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE); + abd_zero(pad2, VDEV_PAD_SIZE); /* * Write everything in parallel. @@ -953,7 +960,7 @@ retry: for (l = 0; l < VDEV_LABELS; l++) { - vdev_label_write(zio, vd, l, vp, + vdev_label_write(zio, vd, l, vp_abd, offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), NULL, NULL, flags); @@ -966,7 +973,7 @@ retry: offsetof(vdev_label_t, vl_pad2), VDEV_PAD_SIZE, NULL, NULL, flags); - vdev_label_write(zio, vd, l, ub, + vdev_label_write(zio, vd, l, ub_abd, offsetof(vdev_label_t, vl_uberblock), VDEV_UBERBLOCK_RING, NULL, NULL, flags); } @@ -979,9 +986,9 @@ retry: } nvlist_free(label); - zio_buf_free(pad2, VDEV_PAD_SIZE); - zio_buf_free(ub, VDEV_UBERBLOCK_RING); - zio_buf_free(vp, sizeof (vdev_phys_t)); + abd_free(pad2); + abd_free(ub_abd); + abd_free(vp_abd); /* * If this vdev hasn't been previously identified as a spare, then we @@ -1039,7 +1046,7 @@ vdev_uberblock_load_done(zio_t *zio) vdev_t *vd = zio->io_vd; spa_t *spa = zio->io_spa; zio_t *rio = zio->io_private; - uberblock_t *ub = zio->io_data; + uberblock_t *ub = abd_to_buf(zio->io_abd); struct ubl_cbdata *cbp = rio->io_private; ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd)); @@ -1060,7 +1067,7 @@ vdev_uberblock_load_done(zio_t *zio) mutex_exit(&rio->io_lock); } - zio_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_abd); } static void @@ -1076,8 +1083,8 @@ vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags, for (l = 0; l < VDEV_LABELS; l++) { for (n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { vdev_label_read(zio, vd, l, - 
zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)), - VDEV_UBERBLOCK_OFFSET(vd, n), + abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd), + B_TRUE), VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), vdev_uberblock_load_done, zio, flags); } @@ -1144,7 +1151,7 @@ vdev_uberblock_sync_done(zio_t *zio) static void vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags) { - uberblock_t *ubbuf; + abd_t *ub_abd; int c, l, n; for (c = 0; c < vd->vdev_children; c++) @@ -1158,17 +1165,18 @@ vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags) n = ub->ub_txg & (VDEV_UBERBLOCK_COUNT(vd) - 1); - ubbuf = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)); - bzero(ubbuf, VDEV_UBERBLOCK_SIZE(vd)); - *ubbuf = *ub; + /* Copy the uberblock_t into the ABD */ + ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE); + abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd)); + abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t)); for (l = 0; l < VDEV_LABELS; l++) - vdev_label_write(zio, vd, l, ubbuf, + vdev_label_write(zio, vd, l, ub_abd, VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), vdev_uberblock_sync_done, zio->io_private, flags | ZIO_FLAG_DONT_PROPAGATE); - zio_buf_free(ubbuf, VDEV_UBERBLOCK_SIZE(vd)); + abd_free(ub_abd); } /* Sync the uberblocks to all vdevs in svd[] */ @@ -1245,6 +1253,7 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags) { nvlist_t *label; vdev_phys_t *vp; + abd_t *vp_abd; char *buf; size_t buflen; int c; @@ -1263,15 +1272,16 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags) */ label = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE); - vp = zio_buf_alloc(sizeof (vdev_phys_t)); - bzero(vp, sizeof (vdev_phys_t)); + vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); + abd_zero(vp_abd, sizeof (vdev_phys_t)); + vp = abd_to_buf(vp_abd); buf = vp->vp_nvlist; buflen = sizeof (vp->vp_nvlist); if (!nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP)) { for (; l < VDEV_LABELS; l += 2) { - 
vdev_label_write(zio, vd, l, vp, + vdev_label_write(zio, vd, l, vp_abd, offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), vdev_label_sync_done, zio->io_private, @@ -1279,7 +1289,7 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags) } } - zio_buf_free(vp, sizeof (vdev_phys_t)); + abd_free(vp_abd); nvlist_free(label); } diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index 780311195..2b9081168 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -31,6 +31,7 @@ #include <sys/spa.h> #include <sys/vdev_impl.h> #include <sys/zio.h> +#include <sys/abd.h> #include <sys/fs/zfs.h> /* @@ -272,13 +273,13 @@ vdev_mirror_scrub_done(zio_t *zio) while ((pio = zio_walk_parents(zio, &zl)) != NULL) { mutex_enter(&pio->io_lock); ASSERT3U(zio->io_size, >=, pio->io_size); - bcopy(zio->io_data, pio->io_data, pio->io_size); + abd_copy(pio->io_abd, zio->io_abd, pio->io_size); mutex_exit(&pio->io_lock); } mutex_exit(&zio->io_lock); } - zio_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_abd); mc->mc_error = zio->io_error; mc->mc_tried = 1; @@ -433,7 +434,8 @@ vdev_mirror_io_start(zio_t *zio) mc = &mm->mm_child[c]; zio_nowait(zio_vdev_child_io(zio, zio->io_bp, mc->mc_vd, mc->mc_offset, - zio_buf_alloc(zio->io_size), zio->io_size, + abd_alloc_sametype(zio->io_abd, + zio->io_size), zio->io_size, zio->io_type, zio->io_priority, 0, vdev_mirror_scrub_done, mc)); } @@ -458,7 +460,7 @@ vdev_mirror_io_start(zio_t *zio) while (children--) { mc = &mm->mm_child[c]; zio_nowait(zio_vdev_child_io(zio, zio->io_bp, - mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size, + mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size, zio->io_type, zio->io_priority, 0, vdev_mirror_child_done, mc)); c++; @@ -543,7 +545,7 @@ vdev_mirror_io_done(zio_t *zio) mc = &mm->mm_child[c]; zio_vdev_io_redone(zio); zio_nowait(zio_vdev_child_io(zio, zio->io_bp, - mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size, + mc->mc_vd, mc->mc_offset, zio->io_abd, 
zio->io_size, ZIO_TYPE_READ, zio->io_priority, 0, vdev_mirror_child_done, mc)); return; @@ -584,7 +586,7 @@ vdev_mirror_io_done(zio_t *zio) zio_nowait(zio_vdev_child_io(zio, zio->io_bp, mc->mc_vd, mc->mc_offset, - zio->io_data, zio->io_size, + zio->io_abd, zio->io_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_IO_REPAIR | (unexpected_errors ? ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index 8f394eef5..91ef106b4 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -37,6 +37,7 @@ #include <sys/spa.h> #include <sys/spa_impl.h> #include <sys/kstat.h> +#include <sys/abd.h> /* * ZFS I/O Scheduler @@ -496,12 +497,12 @@ vdev_queue_agg_io_done(zio_t *aio) zio_t *pio; zio_link_t *zl = NULL; while ((pio = zio_walk_parents(aio, &zl)) != NULL) { - bcopy((char *)aio->io_data + (pio->io_offset - - aio->io_offset), pio->io_data, pio->io_size); + abd_copy_off(pio->io_abd, aio->io_abd, + 0, pio->io_offset - aio->io_offset, pio->io_size); } } - zio_buf_free(aio->io_data, aio->io_size); + abd_free(aio->io_abd); } /* @@ -523,7 +524,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) boolean_t stretch = B_FALSE; avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type); enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; - void *buf; + abd_t *abd; limit = MAX(MIN(zfs_vdev_aggregation_limit, spa_maxblocksize(vq->vq_vdev->vdev_spa)), 0); @@ -626,12 +627,12 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) size = IO_SPAN(first, last); ASSERT3U(size, <=, limit); - buf = zio_buf_alloc_flags(size, KM_NOSLEEP); - if (buf == NULL) + abd = abd_alloc_for_io(size, B_TRUE); + if (abd == NULL) return (NULL); aio = zio_vdev_delegated_io(first->io_vd, first->io_offset, - buf, size, first->io_type, zio->io_priority, + abd, size, first->io_type, zio->io_priority, flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL); aio->io_timestamp = first->io_timestamp; @@ -644,12 +645,11 @@ 
vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) if (dio->io_flags & ZIO_FLAG_NODATA) { ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE); - bzero((char *)aio->io_data + (dio->io_offset - - aio->io_offset), dio->io_size); + abd_zero_off(aio->io_abd, + dio->io_offset - aio->io_offset, dio->io_size); } else if (dio->io_type == ZIO_TYPE_WRITE) { - bcopy(dio->io_data, (char *)aio->io_data + - (dio->io_offset - aio->io_offset), - dio->io_size); + abd_copy_off(aio->io_abd, dio->io_abd, + dio->io_offset - aio->io_offset, 0, dio->io_size); } zio_add_child(dio, aio); diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index d1b415367..a92d3cbaa 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -30,6 +30,7 @@ #include <sys/vdev_impl.h> #include <sys/zio.h> #include <sys/zio_checksum.h> +#include <sys/abd.h> #include <sys/fs/zfs.h> #include <sys/fm/fs/zfs.h> #include <sys/vdev_raidz.h> @@ -136,7 +137,7 @@ vdev_raidz_map_free(raidz_map_t *rm) size_t size; for (c = 0; c < rm->rm_firstdatacol; c++) { - zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); + abd_free(rm->rm_col[c].rc_abd); if (rm->rm_col[c].rc_gdata != NULL) zio_buf_free(rm->rm_col[c].rc_gdata, @@ -144,11 +145,13 @@ vdev_raidz_map_free(raidz_map_t *rm) } size = 0; - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + abd_put(rm->rm_col[c].rc_abd); size += rm->rm_col[c].rc_size; + } - if (rm->rm_datacopy != NULL) - zio_buf_free(rm->rm_datacopy, size); + if (rm->rm_abd_copy != NULL) + abd_free(rm->rm_abd_copy); kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); } @@ -185,7 +188,7 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) size_t x; const char *good = NULL; - const char *bad = rm->rm_col[c].rc_data; + char *bad; if (good_data == NULL) { zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); @@ -199,8 +202,9 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) * data 
never changes for a given logical ZIO) */ if (rm->rm_col[0].rc_gdata == NULL) { - char *bad_parity[VDEV_RAIDZ_MAXPARITY]; + abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY]; char *buf; + int offset; /* * Set up the rm_col[]s to generate the parity for @@ -208,15 +212,20 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) * replacing them with buffers to hold the result. */ for (x = 0; x < rm->rm_firstdatacol; x++) { - bad_parity[x] = rm->rm_col[x].rc_data; - rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata = + bad_parity[x] = rm->rm_col[x].rc_abd; + rm->rm_col[x].rc_gdata = zio_buf_alloc(rm->rm_col[x].rc_size); + rm->rm_col[x].rc_abd = + abd_get_from_buf(rm->rm_col[x].rc_gdata, + rm->rm_col[x].rc_size); } /* fill in the data columns from good_data */ buf = (char *)good_data; for (; x < rm->rm_cols; x++) { - rm->rm_col[x].rc_data = buf; + abd_put(rm->rm_col[x].rc_abd); + rm->rm_col[x].rc_abd = abd_get_from_buf(buf, + rm->rm_col[x].rc_size); buf += rm->rm_col[x].rc_size; } @@ -226,13 +235,18 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) vdev_raidz_generate_parity(rm); /* restore everything back to its original state */ - for (x = 0; x < rm->rm_firstdatacol; x++) - rm->rm_col[x].rc_data = bad_parity[x]; + for (x = 0; x < rm->rm_firstdatacol; x++) { + abd_put(rm->rm_col[x].rc_abd); + rm->rm_col[x].rc_abd = bad_parity[x]; + } - buf = rm->rm_datacopy; + offset = 0; for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { - rm->rm_col[x].rc_data = buf; - buf += rm->rm_col[x].rc_size; + abd_put(rm->rm_col[x].rc_abd); + rm->rm_col[x].rc_abd = abd_get_offset_size( + rm->rm_abd_copy, offset, + rm->rm_col[x].rc_size); + offset += rm->rm_col[x].rc_size; } } @@ -246,8 +260,10 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) good += rm->rm_col[x].rc_size; } + bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size); /* we drop the ereport if it ends up that the data was good */ 
zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); + abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size); } /* @@ -260,7 +276,7 @@ static void vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) { size_t c = (size_t)(uintptr_t)arg; - caddr_t buf; + size_t offset; raidz_map_t *rm = zio->io_vsd; size_t size; @@ -274,7 +290,7 @@ vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) rm->rm_reports++; ASSERT3U(rm->rm_reports, >, 0); - if (rm->rm_datacopy != NULL) + if (rm->rm_abd_copy != NULL) return; /* @@ -290,17 +306,21 @@ vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) size += rm->rm_col[c].rc_size; - buf = rm->rm_datacopy = zio_buf_alloc(size); + rm->rm_abd_copy = + abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size); - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { raidz_col_t *col = &rm->rm_col[c]; + abd_t *tmp = abd_get_offset_size(rm->rm_abd_copy, offset, + col->rc_size); - bcopy(col->rc_data, buf, col->rc_size); - col->rc_data = buf; + abd_copy(tmp, col->rc_abd, col->rc_size); + abd_put(col->rc_abd); + col->rc_abd = tmp; - buf += col->rc_size; + offset += col->rc_size; } - ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size); + ASSERT3U(offset, ==, size); } static const zio_vsd_ops_t vdev_raidz_vsd_ops = { @@ -329,6 +349,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, /* The starting byte offset on each child vdev. 
*/ uint64_t o = (b / dcols) << unit_shift; uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; + uint64_t off = 0; /* * "Quotient": The number of data sectors for this stripe on all but @@ -373,7 +394,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, rm->rm_missingdata = 0; rm->rm_missingparity = 0; rm->rm_firstdatacol = nparity; - rm->rm_datacopy = NULL; + rm->rm_abd_copy = NULL; rm->rm_reports = 0; rm->rm_freed = 0; rm->rm_ecksuminjected = 0; @@ -389,7 +410,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, } rm->rm_col[c].rc_devidx = col; rm->rm_col[c].rc_offset = coff; - rm->rm_col[c].rc_data = NULL; + rm->rm_col[c].rc_abd = NULL; rm->rm_col[c].rc_gdata = NULL; rm->rm_col[c].rc_error = 0; rm->rm_col[c].rc_tried = 0; @@ -412,13 +433,18 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, ASSERT3U(rm->rm_nskip, <=, nparity); for (c = 0; c < rm->rm_firstdatacol; c++) - rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); + rm->rm_col[c].rc_abd = + abd_alloc_linear(rm->rm_col[c].rc_size, B_FALSE); - rm->rm_col[c].rc_data = zio->io_data; + rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, 0, + rm->rm_col[c].rc_size); + off = rm->rm_col[c].rc_size; - for (c = c + 1; c < acols; c++) - rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + - rm->rm_col[c - 1].rc_size; + for (c = c + 1; c < acols; c++) { + rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, off, + rm->rm_col[c].rc_size); + off += rm->rm_col[c].rc_size; + } /* * If all data stored spans all columns, there's a danger that parity @@ -464,29 +490,84 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, return (rm); } +struct pqr_struct { + uint64_t *p; + uint64_t *q; + uint64_t *r; +}; + +static int +vdev_raidz_p_func(void *buf, size_t size, void *private) +{ + struct pqr_struct *pqr = private; + const uint64_t *src = buf; + int i, cnt = size / sizeof (src[0]); + + ASSERT(pqr->p && 
!pqr->q && !pqr->r); + + for (i = 0; i < cnt; i++, src++, pqr->p++) + *pqr->p ^= *src; + + return (0); +} + +static int +vdev_raidz_pq_func(void *buf, size_t size, void *private) +{ + struct pqr_struct *pqr = private; + const uint64_t *src = buf; + uint64_t mask; + int i, cnt = size / sizeof (src[0]); + + ASSERT(pqr->p && pqr->q && !pqr->r); + + for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { + *pqr->p ^= *src; + VDEV_RAIDZ_64MUL_2(*pqr->q, mask); + *pqr->q ^= *src; + } + + return (0); +} + +static int +vdev_raidz_pqr_func(void *buf, size_t size, void *private) +{ + struct pqr_struct *pqr = private; + const uint64_t *src = buf; + uint64_t mask; + int i, cnt = size / sizeof (src[0]); + + ASSERT(pqr->p && pqr->q && pqr->r); + + for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { + *pqr->p ^= *src; + VDEV_RAIDZ_64MUL_2(*pqr->q, mask); + *pqr->q ^= *src; + VDEV_RAIDZ_64MUL_4(*pqr->r, mask); + *pqr->r ^= *src; + } + + return (0); +} + static void vdev_raidz_generate_parity_p(raidz_map_t *rm) { - uint64_t *p, *src, pcount, ccount, i; + uint64_t *p; int c; - - pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + abd_t *src; for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_data; - p = rm->rm_col[VDEV_RAIDZ_P].rc_data; - ccount = rm->rm_col[c].rc_size / sizeof (src[0]); + src = rm->rm_col[c].rc_abd; + p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); if (c == rm->rm_firstdatacol) { - ASSERT(ccount == pcount); - for (i = 0; i < ccount; i++, src++, p++) { - *p = *src; - } + abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); } else { - ASSERT(ccount <= pcount); - for (i = 0; i < ccount; i++, src++, p++) { - *p ^= *src; - } + struct pqr_struct pqr = { p, NULL, NULL }; + (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, + vdev_raidz_p_func, &pqr); } } } @@ -494,50 +575,43 @@ vdev_raidz_generate_parity_p(raidz_map_t *rm) static void vdev_raidz_generate_parity_pq(raidz_map_t *rm) { - uint64_t *p, *q, *src, pcnt, ccnt, 
mask, i; + uint64_t *p, *q, pcnt, ccnt, mask, i; int c; + abd_t *src; - pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == rm->rm_col[VDEV_RAIDZ_Q].rc_size); for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_data; - p = rm->rm_col[VDEV_RAIDZ_P].rc_data; - q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; + src = rm->rm_col[c].rc_abd; + p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); + q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); - ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); + ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); if (c == rm->rm_firstdatacol) { - ASSERT(ccnt == pcnt || ccnt == 0); - for (i = 0; i < ccnt; i++, src++, p++, q++) { - *p = *src; - *q = *src; - } - for (; i < pcnt; i++, src++, p++, q++) { - *p = 0; - *q = 0; - } + abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); + (void) memcpy(q, p, rm->rm_col[c].rc_size); } else { - ASSERT(ccnt <= pcnt); - - /* - * Apply the algorithm described above by multiplying - * the previous result and adding in the new value. - */ - for (i = 0; i < ccnt; i++, src++, p++, q++) { - *p ^= *src; + struct pqr_struct pqr = { p, q, NULL }; + (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, + vdev_raidz_pq_func, &pqr); + } - VDEV_RAIDZ_64MUL_2(*q, mask); - *q ^= *src; + if (c == rm->rm_firstdatacol) { + for (i = ccnt; i < pcnt; i++) { + p[i] = 0; + q[i] = 0; } + } else { /* * Treat short columns as though they are full of 0s. * Note that there's therefore nothing needed for P. 
*/ - for (; i < pcnt; i++, q++) { - VDEV_RAIDZ_64MUL_2(*q, mask); + for (i = ccnt; i < pcnt; i++) { + VDEV_RAIDZ_64MUL_2(q[i], mask); } } } @@ -546,59 +620,48 @@ vdev_raidz_generate_parity_pq(raidz_map_t *rm) static void vdev_raidz_generate_parity_pqr(raidz_map_t *rm) { - uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i; + uint64_t *p, *q, *r, pcnt, ccnt, mask, i; int c; + abd_t *src; - pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == rm->rm_col[VDEV_RAIDZ_Q].rc_size); ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == rm->rm_col[VDEV_RAIDZ_R].rc_size); for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_data; - p = rm->rm_col[VDEV_RAIDZ_P].rc_data; - q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; - r = rm->rm_col[VDEV_RAIDZ_R].rc_data; + src = rm->rm_col[c].rc_abd; + p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); + q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); + r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd); - ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); + ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); if (c == rm->rm_firstdatacol) { - ASSERT(ccnt == pcnt || ccnt == 0); - for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { - *p = *src; - *q = *src; - *r = *src; - } - for (; i < pcnt; i++, src++, p++, q++, r++) { - *p = 0; - *q = 0; - *r = 0; - } + abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); + (void) memcpy(q, p, rm->rm_col[c].rc_size); + (void) memcpy(r, p, rm->rm_col[c].rc_size); } else { - ASSERT(ccnt <= pcnt); - - /* - * Apply the algorithm described above by multiplying - * the previous result and adding in the new value. 
- */ - for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { - *p ^= *src; - - VDEV_RAIDZ_64MUL_2(*q, mask); - *q ^= *src; + struct pqr_struct pqr = { p, q, r }; + (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, + vdev_raidz_pqr_func, &pqr); + } - VDEV_RAIDZ_64MUL_4(*r, mask); - *r ^= *src; + if (c == rm->rm_firstdatacol) { + for (i = ccnt; i < pcnt; i++) { + p[i] = 0; + q[i] = 0; + r[i] = 0; } - + } else { /* * Treat short columns as though they are full of 0s. * Note that there's therefore nothing needed for P. */ - for (; i < pcnt; i++, q++, r++) { - VDEV_RAIDZ_64MUL_2(*q, mask); - VDEV_RAIDZ_64MUL_4(*r, mask); + for (i = ccnt; i < pcnt; i++) { + VDEV_RAIDZ_64MUL_2(q[i], mask); + VDEV_RAIDZ_64MUL_4(r[i], mask); } } } @@ -630,40 +693,159 @@ vdev_raidz_generate_parity(raidz_map_t *rm) } } +/* ARGSUSED */ +static int +vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private) +{ + uint64_t *dst = dbuf; + uint64_t *src = sbuf; + int cnt = size / sizeof (src[0]); + int i; + + for (i = 0; i < cnt; i++) { + dst[i] ^= src[i]; + } + + return (0); +} + +/* ARGSUSED */ +static int +vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size, + void *private) +{ + uint64_t *dst = dbuf; + uint64_t *src = sbuf; + uint64_t mask; + int cnt = size / sizeof (dst[0]); + int i; + + for (i = 0; i < cnt; i++, dst++, src++) { + VDEV_RAIDZ_64MUL_2(*dst, mask); + *dst ^= *src; + } + + return (0); +} + +/* ARGSUSED */ +static int +vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private) +{ + uint64_t *dst = buf; + uint64_t mask; + int cnt = size / sizeof (dst[0]); + int i; + + for (i = 0; i < cnt; i++, dst++) { + /* same operation as vdev_raidz_reconst_q_pre_func() on dst */ + VDEV_RAIDZ_64MUL_2(*dst, mask); + } + + return (0); +} + +struct reconst_q_struct { + uint64_t *q; + int exp; +}; + +static int +vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private) +{ + struct reconst_q_struct *rq = private; + uint64_t *dst = buf; + 
int cnt = size / sizeof (dst[0]); + int i; + + for (i = 0; i < cnt; i++, dst++, rq->q++) { + int j; + uint8_t *b; + + *dst ^= *rq->q; + for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { + *b = vdev_raidz_exp2(*b, rq->exp); + } + } + + return (0); +} + +struct reconst_pq_struct { + uint8_t *p; + uint8_t *q; + uint8_t *pxy; + uint8_t *qxy; + int aexp; + int bexp; +}; + +static int +vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private) +{ + struct reconst_pq_struct *rpq = private; + uint8_t *xd = xbuf; + uint8_t *yd = ybuf; + int i; + + for (i = 0; i < size; + i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) { + *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ + vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); + *yd = *rpq->p ^ *rpq->pxy ^ *xd; + } + + return (0); +} + +static int +vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private) +{ + struct reconst_pq_struct *rpq = private; + uint8_t *xd = xbuf; + int i; + + for (i = 0; i < size; + i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) { + /* same operation as vdev_raidz_reconst_pq_func() on xd */ + *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ + vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); + } + + return (0); +} + static int vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) { - uint64_t *dst, *src, xcount, ccount, count, i; int x = tgts[0]; int c; + abd_t *dst, *src; ASSERT(ntgts == 1); ASSERT(x >= rm->rm_firstdatacol); ASSERT(x < rm->rm_cols); - xcount = rm->rm_col[x].rc_size / sizeof (src[0]); - ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); - ASSERT(xcount > 0); + ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size); + ASSERT(rm->rm_col[x].rc_size > 0); - src = rm->rm_col[VDEV_RAIDZ_P].rc_data; - dst = rm->rm_col[x].rc_data; - for (i = 0; i < xcount; i++, dst++, src++) { - *dst = *src; - } + src = rm->rm_col[VDEV_RAIDZ_P].rc_abd; + dst = rm->rm_col[x].rc_abd; + + 
abd_copy_from_buf(dst, abd_to_buf(src), rm->rm_col[x].rc_size); for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_data; - dst = rm->rm_col[x].rc_data; + uint64_t size = MIN(rm->rm_col[x].rc_size, + rm->rm_col[c].rc_size); + + src = rm->rm_col[c].rc_abd; + dst = rm->rm_col[x].rc_abd; if (c == x) continue; - ccount = rm->rm_col[c].rc_size / sizeof (src[0]); - count = MIN(ccount, xcount); - - for (i = 0; i < count; i++, dst++, src++) { - *dst ^= *src; - } + (void) abd_iterate_func2(dst, src, 0, 0, size, + vdev_raidz_reconst_p_func, NULL); } return (1 << VDEV_RAIDZ_P); @@ -672,57 +854,46 @@ vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) static int vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) { - uint64_t *dst, *src, xcount, ccount, count, mask, i; - uint8_t *b; int x = tgts[0]; - int c, j, exp; + int c, exp; + abd_t *dst, *src; + struct reconst_q_struct rq; ASSERT(ntgts == 1); - xcount = rm->rm_col[x].rc_size / sizeof (src[0]); - ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); + ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size); for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_data; - dst = rm->rm_col[x].rc_data; - - if (c == x) - ccount = 0; - else - ccount = rm->rm_col[c].rc_size / sizeof (src[0]); + uint64_t size = (c == x) ? 
0 : MIN(rm->rm_col[x].rc_size, + rm->rm_col[c].rc_size); - count = MIN(ccount, xcount); + src = rm->rm_col[c].rc_abd; + dst = rm->rm_col[x].rc_abd; if (c == rm->rm_firstdatacol) { - for (i = 0; i < count; i++, dst++, src++) { - *dst = *src; - } - for (; i < xcount; i++, dst++) { - *dst = 0; - } + abd_copy(dst, src, size); + if (rm->rm_col[x].rc_size > size) + abd_zero_off(dst, size, + rm->rm_col[x].rc_size - size); } else { - for (i = 0; i < count; i++, dst++, src++) { - VDEV_RAIDZ_64MUL_2(*dst, mask); - *dst ^= *src; - } - - for (; i < xcount; i++, dst++) { - VDEV_RAIDZ_64MUL_2(*dst, mask); - } + ASSERT3U(size, <=, rm->rm_col[x].rc_size); + (void) abd_iterate_func2(dst, src, 0, 0, size, + vdev_raidz_reconst_q_pre_func, NULL); + (void) abd_iterate_func(dst, + size, rm->rm_col[x].rc_size - size, + vdev_raidz_reconst_q_pre_tail_func, NULL); } } - src = rm->rm_col[VDEV_RAIDZ_Q].rc_data; - dst = rm->rm_col[x].rc_data; + src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; + dst = rm->rm_col[x].rc_abd; exp = 255 - (rm->rm_cols - 1 - x); + rq.q = abd_to_buf(src); + rq.exp = exp; - for (i = 0; i < xcount; i++, dst++, src++) { - *dst ^= *src; - for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { - *b = vdev_raidz_exp2(*b, exp); - } - } + (void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size, + vdev_raidz_reconst_q_post_func, &rq); return (1 << VDEV_RAIDZ_Q); } @@ -730,11 +901,13 @@ vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) static int vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) { - uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; - void *pdata, *qdata; - uint64_t xsize, ysize, i; + uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; + abd_t *pdata, *qdata; + uint64_t xsize, ysize; int x = tgts[0]; int y = tgts[1]; + abd_t *xd, *yd; + struct reconst_pq_struct rpq; ASSERT(ntgts == 2); ASSERT(x < y); @@ -750,15 +923,15 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) * parity so we make those columns appear to be full 
of zeros by * setting their lengths to zero. */ - pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data; - qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data; + pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd; + qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; xsize = rm->rm_col[x].rc_size; ysize = rm->rm_col[y].rc_size; - rm->rm_col[VDEV_RAIDZ_P].rc_data = - zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size); - rm->rm_col[VDEV_RAIDZ_Q].rc_data = - zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size); + rm->rm_col[VDEV_RAIDZ_P].rc_abd = + abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE); + rm->rm_col[VDEV_RAIDZ_Q].rc_abd = + abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); rm->rm_col[x].rc_size = 0; rm->rm_col[y].rc_size = 0; @@ -767,12 +940,12 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) rm->rm_col[x].rc_size = xsize; rm->rm_col[y].rc_size = ysize; - p = pdata; - q = qdata; - pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data; - qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data; - xd = rm->rm_col[x].rc_data; - yd = rm->rm_col[y].rc_data; + p = abd_to_buf(pdata); + q = abd_to_buf(qdata); + pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); + qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); + xd = rm->rm_col[x].rc_abd; + yd = rm->rm_col[y].rc_abd; /* * We now have: @@ -796,24 +969,27 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; - for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) { - *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^ - vdev_raidz_exp2(*q ^ *qxy, bexp); + ASSERT3U(xsize, >=, ysize); + rpq.p = p; + rpq.q = q; + rpq.pxy = pxy; + rpq.qxy = qxy; + rpq.aexp = aexp; + rpq.bexp = bexp; - if (i < ysize) - *yd = *p ^ *pxy ^ *xd; - } + (void) abd_iterate_func2(xd, yd, 0, 0, ysize, + vdev_raidz_reconst_pq_func, &rpq); + (void) abd_iterate_func(xd, ysize, xsize - ysize, + vdev_raidz_reconst_pq_tail_func, &rpq); - zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, - 
rm->rm_col[VDEV_RAIDZ_P].rc_size); - zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data, - rm->rm_col[VDEV_RAIDZ_Q].rc_size); + abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd); + abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); /* * Restore the saved parity data. */ - rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata; - rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata; + rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata; + rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata; return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); } @@ -1131,7 +1307,7 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, c = used[i]; ASSERT3U(c, <, rm->rm_cols); - src = rm->rm_col[c].rc_data; + src = abd_to_buf(rm->rm_col[c].rc_abd); ccount = rm->rm_col[c].rc_size; for (j = 0; j < nmissing; j++) { cc = missing[j] + rm->rm_firstdatacol; @@ -1139,7 +1315,7 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, ASSERT3U(cc, <, rm->rm_cols); ASSERT3U(cc, !=, c); - dst[j] = rm->rm_col[cc].rc_data; + dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd); dcount[j] = rm->rm_col[cc].rc_size; } @@ -1187,8 +1363,25 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; uint8_t *used; + abd_t **bufs = NULL; + int code = 0; + /* + * Matrix reconstruction can't use scatter ABDs yet, so we allocate + * temporary linear ABDs. 
+ */ + if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) { + bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE); + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + raidz_col_t *col = &rm->rm_col[c]; + + bufs[c] = col->rc_abd; + col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE); + abd_copy(col->rc_abd, bufs[c], col->rc_size); + } + } n = rm->rm_cols - rm->rm_firstdatacol; @@ -1275,6 +1468,20 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) kmem_free(p, psize); + /* + * copy back from temporary linear abds and free them + */ + if (bufs) { + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + raidz_col_t *col = &rm->rm_col[c]; + + abd_copy(bufs[c], col->rc_abd, col->rc_size); + abd_free(col->rc_abd); + col->rc_abd = bufs[c]; + } + kmem_free(bufs, rm->rm_cols * sizeof (abd_t *)); + } + return (code); } @@ -1321,7 +1528,6 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) dt = &tgts[nbadparity]; - /* Reconstruct using the new math implementation */ ret = vdev_raidz_math_reconstruct(rm, parity_valid, dt, nbaddata); if (ret != RAIDZ_ORIGINAL_IMPL) @@ -1479,7 +1685,7 @@ vdev_raidz_io_start(zio_t *zio) rc = &rm->rm_col[c]; cvd = vd->vdev_child[rc->rc_devidx]; zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_data, rc->rc_size, + rc->rc_offset, rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, rc)); } @@ -1536,7 +1742,7 @@ vdev_raidz_io_start(zio_t *zio) if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_data, rc->rc_size, + rc->rc_offset, rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, rc)); } @@ -1552,6 +1758,7 @@ vdev_raidz_io_start(zio_t *zio) static void raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data) { + void *buf; vdev_t *vd = 
zio->io_vd->vdev_child[rc->rc_devidx]; if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { @@ -1565,9 +1772,11 @@ raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data) zbc.zbc_has_cksum = 0; zbc.zbc_injected = rm->rm_ecksuminjected; + buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size); zfs_ereport_post_checksum(zio->io_spa, vd, zio, - rc->rc_offset, rc->rc_size, rc->rc_data, bad_data, + rc->rc_offset, rc->rc_size, buf, bad_data, &zbc); + abd_return_buf(rc->rc_abd, buf, rc->rc_size); } } @@ -1616,7 +1825,7 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) if (!rc->rc_tried || rc->rc_error != 0) continue; orig[c] = zio_buf_alloc(rc->rc_size); - bcopy(rc->rc_data, orig[c], rc->rc_size); + abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size); } vdev_raidz_generate_parity(rm); @@ -1625,7 +1834,7 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) rc = &rm->rm_col[c]; if (!rc->rc_tried || rc->rc_error != 0) continue; - if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { + if (bcmp(orig[c], abd_to_buf(rc->rc_abd), rc->rc_size) != 0) { raidz_checksum_error(zio, rc, orig[c]); rc->rc_error = SET_ERROR(ECKSUM); ret++; @@ -1728,7 +1937,8 @@ vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) ASSERT3S(c, >=, 0); ASSERT3S(c, <, rm->rm_cols); rc = &rm->rm_col[c]; - bcopy(rc->rc_data, orig[i], rc->rc_size); + abd_copy_to_buf(orig[i], rc->rc_abd, + rc->rc_size); } /* @@ -1758,7 +1968,8 @@ vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) for (i = 0; i < n; i++) { c = tgts[i]; rc = &rm->rm_col[c]; - bcopy(orig[i], rc->rc_data, rc->rc_size); + abd_copy_from_buf(rc->rc_abd, orig[i], + rc->rc_size); } do { @@ -1997,7 +2208,7 @@ vdev_raidz_io_done(zio_t *zio) continue; zio_nowait(zio_vdev_child_io(zio, NULL, vd->vdev_child[rc->rc_devidx], - rc->rc_offset, rc->rc_data, rc->rc_size, + rc->rc_offset, rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, rc)); } while (++c < rm->rm_cols); @@ -2077,7 +2288,7 @@ done: continue; 
zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_data, rc->rc_size, + rc->rc_offset, rc->rc_abd, rc->rc_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_IO_REPAIR | (unexpected_errors ? ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); diff --git a/module/zfs/vdev_raidz_math.c b/module/zfs/vdev_raidz_math.c index 33c05dadd..c050c9099 100644 --- a/module/zfs/vdev_raidz_math.c +++ b/module/zfs/vdev_raidz_math.c @@ -61,7 +61,7 @@ const raidz_impl_ops_t *raidz_all_maths[] = { &vdev_raidz_avx512f_impl, #endif #if defined(__x86_64) && defined(HAVE_AVX512BW) /* only x86_64 for now */ - &vdev_raidz_avx512bw_impl, + // &vdev_raidz_avx512bw_impl, #endif #if defined(__aarch64__) &vdev_raidz_aarch64_neon_impl, @@ -240,17 +240,17 @@ int vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid, const int *dt, const int nbaddata) { - raidz_rec_f rec_data = NULL; + raidz_rec_f rec_fn = NULL; switch (raidz_parity(rm)) { case PARITY_P: - rec_data = reconstruct_fun_p_sel(rm, parity_valid, nbaddata); + rec_fn = reconstruct_fun_p_sel(rm, parity_valid, nbaddata); break; case PARITY_PQ: - rec_data = reconstruct_fun_pq_sel(rm, parity_valid, nbaddata); + rec_fn = reconstruct_fun_pq_sel(rm, parity_valid, nbaddata); break; case PARITY_PQR: - rec_data = reconstruct_fun_pqr_sel(rm, parity_valid, nbaddata); + rec_fn = reconstruct_fun_pqr_sel(rm, parity_valid, nbaddata); break; default: cmn_err(CE_PANIC, "invalid RAID-Z configuration %d", @@ -258,10 +258,10 @@ vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid, break; } - if (rec_data == NULL) + if (rec_fn == NULL) return (RAIDZ_ORIGINAL_IMPL); else - return (rec_data(rm, dt)); + return (rec_fn(rm, dt)); } const char *raidz_gen_name[] = { @@ -471,13 +471,12 @@ vdev_raidz_math_init(void) return; #endif - /* Fake an zio and run the benchmark on it */ + /* Fake an zio and run the benchmark on a warmed up buffer */ bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP); bench_zio->io_offset = 0; 
bench_zio->io_size = BENCH_ZIO_SIZE; /* only data columns */ - bench_zio->io_data = zio_data_buf_alloc(BENCH_ZIO_SIZE); - VERIFY(bench_zio->io_data); - memset(bench_zio->io_data, 0xAA, BENCH_ZIO_SIZE); /* warm up */ + bench_zio->io_abd = abd_alloc_linear(BENCH_ZIO_SIZE, B_TRUE); + memset(abd_to_buf(bench_zio->io_abd), 0xAA, BENCH_ZIO_SIZE); /* Benchmark parity generation methods */ for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) { @@ -501,7 +500,7 @@ vdev_raidz_math_init(void) vdev_raidz_map_free(bench_rm); /* cleanup the bench zio */ - zio_data_buf_free(bench_zio->io_data, BENCH_ZIO_SIZE); + abd_free(bench_zio->io_abd); kmem_free(bench_zio, sizeof (zio_t)); /* install kstats for all impl */ diff --git a/module/zfs/vdev_raidz_math_aarch64_neon.c b/module/zfs/vdev_raidz_math_aarch64_neon.c index f6a433f10..c7b8afd38 100644 --- a/module/zfs/vdev_raidz_math_aarch64_neon.c +++ b/module/zfs/vdev_raidz_math_aarch64_neon.c @@ -23,11 +23,38 @@ */ #include <sys/isa_defs.h> +#include <sys/types.h> #if defined(__aarch64__) #include "vdev_raidz_math_aarch64_neon_common.h" +#define SYN_STRIDE 4 + +#define ZERO_STRIDE 4 +#define ZERO_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_33_36() +#define ZERO_D 0, 1, 2, 3 + +#define COPY_STRIDE 4 +#define COPY_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_33_36() +#define COPY_D 0, 1, 2, 3 + +#define ADD_STRIDE 4 +#define ADD_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_33_36() +#define ADD_D 0, 1, 2, 3 + +#define MUL_STRIDE 4 +#define MUL_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_33_36() +#define MUL_D 0, 1, 2, 3 + #define GEN_P_DEFINE() \ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_33_36() @@ -38,15 +65,12 @@ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_4_5() \ GEN_X_DEFINE_6_7() \ - GEN_X_DEFINE_8_9() \ - GEN_X_DEFINE_10_11() \ GEN_X_DEFINE_16() \ GEN_X_DEFINE_17() \ GEN_X_DEFINE_33_36() #define GEN_PQ_STRIDE 4 #define GEN_PQ_D 0, 1, 2, 3 -#define GEN_PQ_P 4, 5, 6, 7 -#define GEN_PQ_Q 8, 9, 10, 11 +#define GEN_PQ_C 4, 5, 6, 7 #define 
GEN_PQR_DEFINE() \ GEN_X_DEFINE_0_3() \ @@ -54,69 +78,115 @@ GEN_X_DEFINE_6_7() \ GEN_X_DEFINE_16() \ GEN_X_DEFINE_17() \ - GEN_X_DEFINE_31() \ - GEN_X_DEFINE_32() \ GEN_X_DEFINE_33_36() -#define GEN_PQR_STRIDE 2 -#define GEN_PQR_D 0, 1 -#define GEN_PQR_P 2, 3 -#define GEN_PQR_Q 4, 5 -#define GEN_PQR_R 6, 7 +#define GEN_PQR_STRIDE 4 +#define GEN_PQR_D 0, 1, 2, 3 +#define GEN_PQR_C 4, 5, 6, 7 -#define REC_P_DEFINE() \ - GEN_X_DEFINE_0_3() \ +#define SYN_Q_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ GEN_X_DEFINE_33_36() -#define REC_P_STRIDE 4 -#define REC_P_X 0, 1, 2, 3 +#define SYN_Q_STRIDE 4 +#define SYN_Q_D 0, 1, 2, 3 +#define SYN_Q_X 4, 5, 6, 7 -#define REC_Q_DEFINE() \ +#define SYN_R_DEFINE() \ GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ GEN_X_DEFINE_16() \ GEN_X_DEFINE_17() \ GEN_X_DEFINE_33_36() -#define REC_Q_STRIDE 4 -#define REC_Q_X 0, 1, 2, 3 +#define SYN_R_STRIDE 4 +#define SYN_R_D 0, 1, 2, 3 +#define SYN_R_X 4, 5, 6, 7 -#define REC_R_DEFINE() \ +#define SYN_PQ_DEFINE() \ GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ GEN_X_DEFINE_16() \ GEN_X_DEFINE_17() \ GEN_X_DEFINE_33_36() -#define REC_R_STRIDE 4 -#define REC_R_X 0, 1, 2, 3 +#define SYN_PQ_STRIDE 4 +#define SYN_PQ_D 0, 1, 2, 3 +#define SYN_PQ_X 4, 5, 6, 7 #define REC_PQ_DEFINE() \ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_4_5() \ - GEN_X_DEFINE_16() \ - GEN_X_DEFINE_17() \ GEN_X_DEFINE_31() \ GEN_X_DEFINE_32() \ GEN_X_DEFINE_33_36() #define REC_PQ_STRIDE 2 #define REC_PQ_X 0, 1 #define REC_PQ_Y 2, 3 -#define REC_PQ_D 4, 5 +#define REC_PQ_T 4, 5 + +#define SYN_PR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define SYN_PR_STRIDE 4 +#define SYN_PR_D 0, 1, 2, 3 +#define SYN_PR_X 4, 5, 6, 7 -#define REC_PR_DEFINE() REC_PQ_DEFINE() +#define REC_PR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + 
GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_31() \ + GEN_X_DEFINE_32() \ + GEN_X_DEFINE_33_36() #define REC_PR_STRIDE 2 #define REC_PR_X 0, 1 #define REC_PR_Y 2, 3 -#define REC_PR_D 4, 5 +#define REC_PR_T 4, 5 + +#define SYN_QR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define SYN_QR_STRIDE 4 +#define SYN_QR_D 0, 1, 2, 3 +#define SYN_QR_X 4, 5, 6, 7 -#define REC_QR_DEFINE() REC_PQ_DEFINE() +#define REC_QR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_31() \ + GEN_X_DEFINE_32() \ + GEN_X_DEFINE_33_36() #define REC_QR_STRIDE 2 #define REC_QR_X 0, 1 #define REC_QR_Y 2, 3 -#define REC_QR_D 4, 5 +#define REC_QR_T 4, 5 -#define REC_PQR_DEFINE() \ +#define SYN_PQR_DEFINE() \ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_4_5() \ GEN_X_DEFINE_6_7() \ - GEN_X_DEFINE_8_9() \ GEN_X_DEFINE_16() \ GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define SYN_PQR_STRIDE 4 +#define SYN_PQR_D 0, 1, 2, 3 +#define SYN_PQR_X 4, 5, 6, 7 + +#define REC_PQR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_8_9() \ GEN_X_DEFINE_31() \ GEN_X_DEFINE_32() \ GEN_X_DEFINE_33_36() @@ -124,7 +194,6 @@ #define REC_PQR_X 0, 1 #define REC_PQR_Y 2, 3 #define REC_PQR_Z 4, 5 -#define REC_PQR_D 6, 7 #define REC_PQR_XS 6, 7 #define REC_PQR_YS 8, 9 diff --git a/module/zfs/vdev_raidz_math_aarch64_neon_common.h b/module/zfs/vdev_raidz_math_aarch64_neon_common.h index 08dbddaea..cb9ff86c1 100644 --- a/module/zfs/vdev_raidz_math_aarch64_neon_common.h +++ b/module/zfs/vdev_raidz_math_aarch64_neon_common.h @@ -125,7 +125,7 @@ #define ASM_BUG() ASSERT(0) -#define OFFSET(ptr, val) (((unsigned char *)ptr)+val) +#define OFFSET(ptr, val) (((unsigned char *)(ptr))+val) extern const uint8_t gf_clmul_mod_lt[4*256][16]; @@ -135,20 +135,6 @@ typedef struct v { uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE))); } v_t; -#define PREFETCHNTA(ptr, offset) \ -{ \ - 
__asm( \ - "prfm pstl1strm, %[MEM]\n" \ - : : [MEM] "Q" (*(ptr + offset))); \ -} - -#define PREFETCH(ptr, offset) \ -{ \ - __asm( \ - "prfm pldl1keep, %[MEM]\n" \ - : : [MEM] "Q" (*(ptr + offset))); \ -} - #define XOR_ACC(src, r...) \ { \ switch (REG_CNT(r)) { \ @@ -242,6 +228,19 @@ typedef struct v { #define ZERO(r...) \ { \ switch (REG_CNT(r)) { \ + case 8: \ + __asm( \ + "eor " VR0(r) ".16b," VR0(r) ".16b," VR0(r) ".16b\n" \ + "eor " VR1(r) ".16b," VR1(r) ".16b," VR1(r) ".16b\n" \ + "eor " VR2(r) ".16b," VR2(r) ".16b," VR2(r) ".16b\n" \ + "eor " VR3(r) ".16b," VR3(r) ".16b," VR3(r) ".16b\n" \ + "eor " VR4(r) ".16b," VR4(r) ".16b," VR4(r) ".16b\n" \ + "eor " VR5(r) ".16b," VR5(r) ".16b," VR5(r) ".16b\n" \ + "eor " VR6(r) ".16b," VR6(r) ".16b," VR6(r) ".16b\n" \ + "eor " VR7(r) ".16b," VR7(r) ".16b," VR7(r) ".16b\n" \ + : WVR0(r), WVR1(r), WVR2(r), WVR3(r), \ + WVR4(r), WVR5(r), WVR6(r), WVR7(r)); \ + break; \ case 4: \ __asm( \ "eor " VR0(r) ".16b," VR0(r) ".16b," VR0(r) ".16b\n" \ diff --git a/module/zfs/vdev_raidz_math_aarch64_neonx2.c b/module/zfs/vdev_raidz_math_aarch64_neonx2.c index d8d1f1bce..f8688a06a 100644 --- a/module/zfs/vdev_raidz_math_aarch64_neonx2.c +++ b/module/zfs/vdev_raidz_math_aarch64_neonx2.c @@ -28,111 +28,179 @@ #include "vdev_raidz_math_aarch64_neon_common.h" -#define GEN_P_DEFINE() \ +#define SYN_STRIDE 4 + +#define ZERO_STRIDE 8 +#define ZERO_DEFINE() \ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_4_5() \ GEN_X_DEFINE_6_7() -#define GEN_P_STRIDE 8 -#define GEN_P_P 0, 1, 2, 3, 4, 5, 6, 7 +#define ZERO_D 0, 1, 2, 3, 4, 5, 6, 7 -#define GEN_PQ_DEFINE() \ +#define COPY_STRIDE 8 +#define COPY_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() +#define COPY_D 0, 1, 2, 3, 4, 5, 6, 7 + +#define ADD_STRIDE 8 +#define ADD_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() +#define ADD_D 0, 1, 2, 3, 4, 5, 6, 7 + +#define MUL_STRIDE 4 +#define MUL_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_33_36() 
+#define MUL_D 0, 1, 2, 3 + +#define GEN_P_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_33_36() +#define GEN_P_STRIDE 4 +#define GEN_P_P 0, 1, 2, 3 + +#define GEN_PQ_DEFINE() \ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_4_5() \ GEN_X_DEFINE_6_7() \ - GEN_X_DEFINE_8_9() \ - GEN_X_DEFINE_10_11() \ GEN_X_DEFINE_16() \ GEN_X_DEFINE_17() \ GEN_X_DEFINE_33_36() #define GEN_PQ_STRIDE 4 #define GEN_PQ_D 0, 1, 2, 3 -#define GEN_PQ_P 4, 5, 6, 7 -#define GEN_PQ_Q 8, 9, 10, 11 +#define GEN_PQ_C 4, 5, 6, 7 #define GEN_PQR_DEFINE() \ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_4_5() \ GEN_X_DEFINE_6_7() \ - GEN_X_DEFINE_8_9() \ - GEN_X_DEFINE_22_23() \ - GEN_X_DEFINE_24_27() \ GEN_X_DEFINE_16() \ GEN_X_DEFINE_17() \ GEN_X_DEFINE_33_36() #define GEN_PQR_STRIDE 4 #define GEN_PQR_D 0, 1, 2, 3 -#define GEN_PQR_P 4, 5, 6, 7 -#define GEN_PQR_Q 8, 9, 22, 23 -#define GEN_PQR_R 24, 25, 26, 27 +#define GEN_PQR_C 4, 5, 6, 7 -#define REC_P_DEFINE() \ +#define SYN_Q_DEFINE() \ GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ GEN_X_DEFINE_33_36() -#define REC_P_STRIDE 4 -#define REC_P_X 0, 1, 2, 3 +#define SYN_Q_STRIDE 4 +#define SYN_Q_D 0, 1, 2, 3 +#define SYN_Q_X 4, 5, 6, 7 -#define REC_Q_DEFINE() \ +#define SYN_R_DEFINE() \ GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ GEN_X_DEFINE_16() \ GEN_X_DEFINE_17() \ GEN_X_DEFINE_33_36() -#define REC_Q_STRIDE 4 -#define REC_Q_X 0, 1, 2, 3 +#define SYN_R_STRIDE 4 +#define SYN_R_D 0, 1, 2, 3 +#define SYN_R_X 4, 5, 6, 7 -#define REC_R_DEFINE() \ +#define SYN_PQ_DEFINE() \ GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ GEN_X_DEFINE_16() \ GEN_X_DEFINE_17() \ GEN_X_DEFINE_33_36() -#define REC_R_STRIDE 4 -#define REC_R_X 0, 1, 2, 3 +#define SYN_PQ_STRIDE 4 +#define SYN_PQ_D 0, 1, 2, 3 +#define SYN_PQ_X 4, 5, 6, 7 #define REC_PQ_DEFINE() \ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_4_5() \ GEN_X_DEFINE_6_7() \ GEN_X_DEFINE_8_9() \ - GEN_X_DEFINE_16() \ - GEN_X_DEFINE_17() \ 
GEN_X_DEFINE_22_23() \ GEN_X_DEFINE_33_36() #define REC_PQ_STRIDE 4 #define REC_PQ_X 0, 1, 2, 3 #define REC_PQ_Y 4, 5, 6, 7 -#define REC_PQ_D 8, 9, 22, 23 +#define REC_PQ_T 8, 9, 22, 23 -#define REC_PR_DEFINE() REC_PQ_DEFINE() +#define SYN_PR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define SYN_PR_STRIDE 4 +#define SYN_PR_D 0, 1, 2, 3 +#define SYN_PR_X 4, 5, 6, 7 + +#define REC_PR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_8_9() \ + GEN_X_DEFINE_22_23() \ + GEN_X_DEFINE_33_36() #define REC_PR_STRIDE 4 #define REC_PR_X 0, 1, 2, 3 #define REC_PR_Y 4, 5, 6, 7 -#define REC_PR_D 8, 9, 22, 23 +#define REC_PR_T 8, 9, 22, 23 -#define REC_QR_DEFINE() REC_PQ_DEFINE() +#define SYN_QR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define SYN_QR_STRIDE 4 +#define SYN_QR_D 0, 1, 2, 3 +#define SYN_QR_X 4, 5, 6, 7 + +#define REC_QR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_8_9() \ + GEN_X_DEFINE_22_23() \ + GEN_X_DEFINE_33_36() #define REC_QR_STRIDE 4 #define REC_QR_X 0, 1, 2, 3 #define REC_QR_Y 4, 5, 6, 7 -#define REC_QR_D 8, 9, 22, 23 +#define REC_QR_T 8, 9, 22, 23 -#define REC_PQR_DEFINE() \ +#define SYN_PQR_DEFINE() \ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_4_5() \ GEN_X_DEFINE_6_7() \ - GEN_X_DEFINE_8_9() \ GEN_X_DEFINE_16() \ GEN_X_DEFINE_17() \ - GEN_X_DEFINE_22_23() \ - GEN_X_DEFINE_24_27() \ - GEN_X_DEFINE_28_30() \ - GEN_X_DEFINE_31() \ GEN_X_DEFINE_33_36() -#define REC_PQR_STRIDE 4 -#define REC_PQR_X 0, 1, 2, 3 -#define REC_PQR_Y 4, 5, 6, 7 -#define REC_PQR_Z 8, 9, 22, 23 -#define REC_PQR_D 24, 25, 26, 27 -#define REC_PQR_XS 24, 25, 26, 27 -#define REC_PQR_YS 28, 29, 30, 31 +#define SYN_PQR_STRIDE 4 +#define SYN_PQR_D 0, 1, 2, 3 +#define SYN_PQR_X 4, 5, 6, 7 +#define 
REC_PQR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_8_9() \ + GEN_X_DEFINE_31() \ + GEN_X_DEFINE_32() \ + GEN_X_DEFINE_33_36() +#define REC_PQR_STRIDE 2 +#define REC_PQR_X 0, 1 +#define REC_PQR_Y 2, 3 +#define REC_PQR_Z 4, 5 +#define REC_PQR_XS 6, 7 +#define REC_PQR_YS 8, 9 #include <sys/vdev_raidz_impl.h> #include "vdev_raidz_math_impl.h" diff --git a/module/zfs/vdev_raidz_math_avx2.c b/module/zfs/vdev_raidz_math_avx2.c index 90c94c77c..07113a235 100644 --- a/module/zfs/vdev_raidz_math_avx2.c +++ b/module/zfs/vdev_raidz_math_avx2.c @@ -21,7 +21,6 @@ /* * Copyright (C) 2016 Gvozden Nešković. All rights reserved. */ - #include <sys/isa_defs.h> #if defined(__x86_64) && defined(HAVE_AVX2) @@ -66,19 +65,6 @@ typedef struct v { uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE))); } v_t; -#define PREFETCHNTA(ptr, offset) \ -{ \ - __asm( \ - "prefetchnta " #offset "(%[MEM])\n" \ - : : [MEM] "r" (ptr)); \ -} - -#define PREFETCH(ptr, offset) \ -{ \ - __asm( \ - "prefetcht0 " #offset "(%[MEM])\n" \ - : : [MEM] "r" (ptr)); \ -} #define XOR_ACC(src, r...) \ { \ @@ -122,25 +108,7 @@ typedef struct v { } \ } -#define ZERO(r...) \ -{ \ - switch (REG_CNT(r)) { \ - case 4: \ - __asm( \ - "vpxor %" VR0(r) ", %" VR0(r)", %" VR0(r) "\n" \ - "vpxor %" VR1(r) ", %" VR1(r)", %" VR1(r) "\n" \ - "vpxor %" VR2(r) ", %" VR2(r)", %" VR2(r) "\n" \ - "vpxor %" VR3(r) ", %" VR3(r)", %" VR3(r)); \ - break; \ - case 2: \ - __asm( \ - "vpxor %" VR0(r) ", %" VR0(r)", %" VR0(r) "\n" \ - "vpxor %" VR1(r) ", %" VR1(r)", %" VR1(r)); \ - break; \ - default: \ - ASM_BUG(); \ - } \ -} +#define ZERO(r...) XOR(r, r) #define COPY(r...) 
\ { \ @@ -335,59 +303,86 @@ static const uint8_t __attribute__((aligned(32))) _mul_mask = 0x0F; kfpu_end(); \ } -#define GEN_P_DEFINE() {} + +#define SYN_STRIDE 4 + +#define ZERO_STRIDE 4 +#define ZERO_DEFINE() {} +#define ZERO_D 0, 1, 2, 3 + +#define COPY_STRIDE 4 +#define COPY_DEFINE() {} +#define COPY_D 0, 1, 2, 3 + +#define ADD_STRIDE 4 +#define ADD_DEFINE() {} +#define ADD_D 0, 1, 2, 3 + +#define MUL_STRIDE 4 +#define MUL_DEFINE() {} +#define MUL_D 0, 1, 2, 3 + #define GEN_P_STRIDE 4 +#define GEN_P_DEFINE() {} #define GEN_P_P 0, 1, 2, 3 -#define GEN_PQ_DEFINE() {} #define GEN_PQ_STRIDE 4 +#define GEN_PQ_DEFINE() {} #define GEN_PQ_D 0, 1, 2, 3 -#define GEN_PQ_P 4, 5, 6, 7 -#define GEN_PQ_Q 8, 9, 10, 11 +#define GEN_PQ_C 4, 5, 6, 7 +#define GEN_PQR_STRIDE 4 #define GEN_PQR_DEFINE() {} -#define GEN_PQR_STRIDE 2 -#define GEN_PQR_D 0, 1 -#define GEN_PQR_P 2, 3 -#define GEN_PQR_Q 4, 5 -#define GEN_PQR_R 6, 7 +#define GEN_PQR_D 0, 1, 2, 3 +#define GEN_PQR_C 4, 5, 6, 7 -#define REC_P_DEFINE() {} -#define REC_P_STRIDE 4 -#define REC_P_X 0, 1, 2, 3 +#define SYN_Q_DEFINE() {} +#define SYN_Q_D 0, 1, 2, 3 +#define SYN_Q_X 4, 5, 6, 7 -#define REC_Q_DEFINE() {} -#define REC_Q_STRIDE 4 -#define REC_Q_X 0, 1, 2, 3 +#define SYN_R_DEFINE() {} +#define SYN_R_D 0, 1, 2, 3 +#define SYN_R_X 4, 5, 6, 7 -#define REC_R_DEFINE() {} -#define REC_R_STRIDE 4 -#define REC_R_X 0, 1, 2, 3 +#define SYN_PQ_DEFINE() {} +#define SYN_PQ_D 0, 1, 2, 3 +#define SYN_PQ_X 4, 5, 6, 7 -#define REC_PQ_DEFINE() {} #define REC_PQ_STRIDE 2 +#define REC_PQ_DEFINE() {} #define REC_PQ_X 0, 1 #define REC_PQ_Y 2, 3 -#define REC_PQ_D 4, 5 +#define REC_PQ_T 4, 5 + +#define SYN_PR_DEFINE() {} +#define SYN_PR_D 0, 1, 2, 3 +#define SYN_PR_X 4, 5, 6, 7 -#define REC_PR_DEFINE() {} #define REC_PR_STRIDE 2 +#define REC_PR_DEFINE() {} #define REC_PR_X 0, 1 #define REC_PR_Y 2, 3 -#define REC_PR_D 4, 5 +#define REC_PR_T 4, 5 + +#define SYN_QR_DEFINE() {} +#define SYN_QR_D 0, 1, 2, 3 +#define SYN_QR_X 4, 5, 6, 7 -#define 
REC_QR_DEFINE() {} #define REC_QR_STRIDE 2 +#define REC_QR_DEFINE() {} #define REC_QR_X 0, 1 #define REC_QR_Y 2, 3 -#define REC_QR_D 4, 5 +#define REC_QR_T 4, 5 + +#define SYN_PQR_DEFINE() {} +#define SYN_PQR_D 0, 1, 2, 3 +#define SYN_PQR_X 4, 5, 6, 7 -#define REC_PQR_DEFINE() {} #define REC_PQR_STRIDE 2 +#define REC_PQR_DEFINE() {} #define REC_PQR_X 0, 1 #define REC_PQR_Y 2, 3 #define REC_PQR_Z 4, 5 -#define REC_PQR_D 6, 7 #define REC_PQR_XS 6, 7 #define REC_PQR_YS 8, 9 diff --git a/module/zfs/vdev_raidz_math_avx512bw.c b/module/zfs/vdev_raidz_math_avx512bw.c index bcbe657d0..465d1e569 100644 --- a/module/zfs/vdev_raidz_math_avx512bw.c +++ b/module/zfs/vdev_raidz_math_avx512bw.c @@ -24,7 +24,7 @@ #include <sys/isa_defs.h> -#if defined(__x86_64) && defined(HAVE_AVX512BW) +#if 0 // defined(__x86_64) && defined(HAVE_AVX512BW) #include <sys/types.h> #include <linux/simd_x86.h> @@ -345,6 +345,22 @@ static const uint8_t __attribute__((aligned(32))) _mul_mask = 0x0F; kfpu_end(); \ } +#define ZERO_STRIDE 4 +#define ZERO_DEFINE() {} +#define ZERO_D 0, 1, 2, 3 + +#define COPY_STRIDE 4 +#define COPY_DEFINE() {} +#define COPY_D 0, 1, 2, 3 + +#define ADD_STRIDE 4 +#define ADD_DEFINE() {} +#define ADD_D 0, 1, 2, 3 + +#define MUL_STRIDE 4 +#define MUL_DEFINE() {} +#define MUL_D 0, 1, 2, 3 + #define GEN_P_DEFINE() {} #define GEN_P_STRIDE 4 #define GEN_P_P 0, 1, 2, 3 diff --git a/module/zfs/vdev_raidz_math_avx512f.c b/module/zfs/vdev_raidz_math_avx512f.c index cc3868bce..0b6108c10 100644 --- a/module/zfs/vdev_raidz_math_avx512f.c +++ b/module/zfs/vdev_raidz_math_avx512f.c @@ -20,6 +20,7 @@ */ /* * Copyright (C) 2016 Romain Dolbeau. All rights reserved. + * Copyright (C) 2016 Gvozden Nešković. All rights reserved. */ #include <sys/isa_defs.h> @@ -74,29 +75,12 @@ #define _R_23(_0, _1, REG2, REG3, ...) REG2, REG3 #define R_23(REG...) 
_R_23(REG, 1, 2, 3) -#define ASM_BUG() ASSERT(0) - -extern const uint8_t gf_clmul_mod_lt[4*256][16]; - #define ELEM_SIZE 64 typedef struct v { uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE))); } v_t; -#define PREFETCHNTA(ptr, offset) \ -{ \ - __asm( \ - "prefetchnta " #offset "(%[MEM])\n" \ - : : [MEM] "r" (ptr)); \ -} - -#define PREFETCH(ptr, offset) \ -{ \ - __asm( \ - "prefetcht0 " #offset "(%[MEM])\n" \ - : : [MEM] "r" (ptr)); \ -} #define XOR_ACC(src, r...) \ { \ @@ -109,14 +93,6 @@ typedef struct v { "vpxorq 0xc0(%[SRC]), %%" VR3(r)", %%" VR3(r) "\n" \ : : [SRC] "r" (src)); \ break; \ - case 2: \ - __asm( \ - "vpxorq 0x00(%[SRC]), %%" VR0(r)", %%" VR0(r) "\n" \ - "vpxorq 0x40(%[SRC]), %%" VR1(r)", %%" VR1(r) "\n" \ - : : [SRC] "r" (src)); \ - break; \ - default: \ - ASM_BUG(); \ } \ } @@ -135,30 +111,12 @@ typedef struct v { "vpxorq %" VR0(r) ", %" VR2(r)", %" VR2(r) "\n" \ "vpxorq %" VR1(r) ", %" VR3(r)", %" VR3(r)); \ break; \ - default: \ - ASM_BUG(); \ } \ } -#define ZERO(r...) \ -{ \ - switch (REG_CNT(r)) { \ - case 4: \ - __asm( \ - "vpxorq %" VR0(r) ", %" VR0(r)", %" VR0(r) "\n" \ - "vpxorq %" VR1(r) ", %" VR1(r)", %" VR1(r) "\n" \ - "vpxorq %" VR2(r) ", %" VR2(r)", %" VR2(r) "\n" \ - "vpxorq %" VR3(r) ", %" VR3(r)", %" VR3(r)); \ - break; \ - case 2: \ - __asm( \ - "vpxorq %" VR0(r) ", %" VR0(r)", %" VR0(r) "\n" \ - "vpxorq %" VR1(r) ", %" VR1(r)", %" VR1(r)); \ - break; \ - default: \ - ASM_BUG(); \ - } \ -} + +#define ZERO(r...) XOR(r, r) + #define COPY(r...) 
\ { \ @@ -175,8 +133,6 @@ typedef struct v { "vmovdqa64 %" VR0(r) ", %" VR2(r) "\n" \ "vmovdqa64 %" VR1(r) ", %" VR3(r)); \ break; \ - default: \ - ASM_BUG(); \ } \ } @@ -191,14 +147,6 @@ typedef struct v { "vmovdqa64 0xc0(%[SRC]), %%" VR3(r) "\n" \ : : [SRC] "r" (src)); \ break; \ - case 2: \ - __asm( \ - "vmovdqa64 0x00(%[SRC]), %%" VR0(r) "\n" \ - "vmovdqa64 0x40(%[SRC]), %%" VR1(r) "\n" \ - : : [SRC] "r" (src)); \ - break; \ - default: \ - ASM_BUG(); \ } \ } @@ -213,31 +161,17 @@ typedef struct v { "vmovdqa64 %%" VR3(r) ", 0xc0(%[DST])\n" \ : : [DST] "r" (dst)); \ break; \ - case 2: \ - __asm( \ - "vmovdqa64 %%" VR0(r) ", 0x00(%[DST])\n" \ - "vmovdqa64 %%" VR1(r) ", 0x40(%[DST])\n" \ - : : [DST] "r" (dst)); \ - break; \ - default: \ - ASM_BUG(); \ } \ } -#define FLUSH() \ -{ \ - __asm("vzeroupper"); \ -} - #define MUL2_SETUP() \ { \ - __asm("vmovq %0, %%xmm14" :: "r"(0x1d1d1d1d1d1d1d1d)); \ - __asm("vpbroadcastq %xmm14, %zmm14"); \ - __asm("vmovq %0, %%xmm13" :: "r"(0x8080808080808080)); \ - __asm("vpbroadcastq %xmm13, %zmm13"); \ - __asm("vmovq %0, %%xmm12" :: "r"(0xfefefefefefefefe)); \ - __asm("vpbroadcastq %xmm12, %zmm12"); \ - __asm("vpxorq %zmm0, %zmm0 ,%zmm0"); \ + __asm("vmovq %0, %%xmm31" :: "r"(0x1d1d1d1d1d1d1d1d)); \ + __asm("vpbroadcastq %xmm31, %zmm31"); \ + __asm("vmovq %0, %%xmm30" :: "r"(0x8080808080808080)); \ + __asm("vpbroadcastq %xmm30, %zmm30"); \ + __asm("vmovq %0, %%xmm29" :: "r"(0xfefefefefefefefe)); \ + __asm("vpbroadcastq %xmm29, %zmm29"); \ } #define _MUL2(r...) 
\ @@ -245,23 +179,21 @@ typedef struct v { switch (REG_CNT(r)) { \ case 2: \ __asm( \ - "vpandq %" VR0(r)", %zmm13, %zmm10\n" \ - "vpandq %" VR1(r)", %zmm13, %zmm11\n" \ - "vpsrlq $7, %zmm10, %zmm30\n" \ - "vpsrlq $7, %zmm11, %zmm31\n" \ - "vpsllq $1, %zmm10, %zmm10\n" \ - "vpsllq $1, %zmm11, %zmm11\n" \ - "vpsubq %zmm30, %zmm10, %zmm10\n" \ - "vpsubq %zmm31, %zmm11, %zmm11\n" \ + "vpandq %" VR0(r)", %zmm30, %zmm26\n" \ + "vpandq %" VR1(r)", %zmm30, %zmm25\n" \ + "vpsrlq $7, %zmm26, %zmm28\n" \ + "vpsrlq $7, %zmm25, %zmm27\n" \ + "vpsllq $1, %zmm26, %zmm26\n" \ + "vpsllq $1, %zmm25, %zmm25\n" \ + "vpsubq %zmm28, %zmm26, %zmm26\n" \ + "vpsubq %zmm27, %zmm25, %zmm25\n" \ "vpsllq $1, %" VR0(r)", %" VR0(r) "\n" \ "vpsllq $1, %" VR1(r)", %" VR1(r) "\n" \ - "vpandq %zmm10, %zmm14, %zmm10\n" \ - "vpandq %zmm11, %zmm14, %zmm11\n" \ - "vpternlogd $0x6c,%zmm12, %zmm10, %" VR0(r) "\n" \ - "vpternlogd $0x6c,%zmm12, %zmm11, %" VR1(r)); \ + "vpandq %zmm26, %zmm31, %zmm26\n" \ + "vpandq %zmm25, %zmm31, %zmm25\n" \ + "vpternlogd $0x6c,%zmm29, %zmm26, %" VR0(r) "\n" \ + "vpternlogd $0x6c,%zmm29, %zmm25, %" VR1(r)); \ break; \ - default: \ - ASM_BUG(); \ } \ } @@ -275,8 +207,6 @@ typedef struct v { case 2: \ _MUL2(r); \ break; \ - default: \ - ASM_BUG(); \ } \ } @@ -286,216 +216,249 @@ typedef struct v { MUL2(r); \ } -/* - * Must match the init above - */ -#define _0f "zmm0" -#define _as "zmm14" -#define _bs "zmm13" -#define _ltmod "zmm12" -#define _ltmul "zmm11" -#define _ta "zmm10" -#define _tb "zmm15" -/* - * Must be in the first 16, otherwise an EVEX pshufb is generated - * Must match above - */ -#define _asYlo "ymm14" -#define _bsYlo "ymm13" -#define _ltmodYlo "ymm12" -#define _ltmulYlo "ymm11" -#define _taYlo "ymm10" -#define _tbYlo "ymm15" +/* General multiplication by adding powers of two */ -/* - * Must be in the first 16, otherwise an EVEX pshufb is generated - * ... 
- */ -#define _asYhi "ymm9" -#define _bsYhi "ymm8" -#define _ltmodYhi "ymm7" -#define _ltmulYhi "ymm6" -#define _taYhi "ymm5" -#define _tbYhi "ymm4" +#define _mul_x2_in 21, 22 +#define _mul_x2_acc 23, 24 -/* - * This uses a pair of AVX2 pshufb to emulate the missing AVX512 pshufb. - * AVX512BW has the full pshufb - * To get VEX pshufb (AVX2, supported in KNL) instead of EVEX pshufb - * (AVX512BW, not supported on KNL, probably also requiring AVX51VL - * since we use a 256 bits version), all registers in parameters to - * pshufb must be among ymm0-ymm15, since only EVEX can encore - * ymm16-ymm31 - * This is a bit hackish, but short of encoding the instruction in - * binary, how do we force the use of AVX2 pshufb ? - * Note that the other way round (forcing AVX512) is easy, just encode - * k0 as the mask register (k0 is all-1). - */ -#define _MULx2(c, r...) \ +#define _MUL_PARAM(x, in, acc) \ { \ - switch (REG_CNT(r)) { \ - case 2: \ - __asm( \ - "vmovq %[c0f], %%xmm0\n" \ - "vpbroadcastq %%xmm0, %%" _0f "\n" \ - /* upper bits */ \ - "vbroadcasti32x4 0x00(%[lt]), %%" _ltmod "\n" \ - "vbroadcasti32x4 0x10(%[lt]), %%" _ltmul "\n" \ - \ - "vpsrad $0x4, %%" VR0(r) ", %%"_as "\n" \ - "vpsrad $0x4, %%" VR1(r) ", %%"_bs "\n" \ - "vpandq %%" _0f ", %%" VR0(r) ", %%" VR0(r) "\n" \ - "vpandq %%" _0f ", %%" VR1(r) ", %%" VR1(r) "\n" \ - "vpandq %%" _0f ", %%" _as ", %%" _as "\n" \ - "vpandq %%" _0f ", %%" _bs ", %%" _bs "\n" \ - \ - "vextracti64x4 $1,%%" _ltmod ",%%" _ltmodYhi"\n" \ - \ - "vextracti64x4 $1,%%" _as ",%%" _asYhi"\n" \ - "vpshufb %%" _asYlo ", %%" _ltmodYlo ", %%" _taYlo "\n" \ - "vpshufb %%" _asYhi ", %%" _ltmodYhi ", %%" _taYhi "\n" \ - "vinserti64x4 $1,%%" _taYhi ",%%" _ta ",%%" _ta "\n" \ - \ - "vextracti64x4 $1,%%" _bs ",%%" _bsYhi"\n" \ - "vpshufb %%" _bsYlo ", %%" _ltmodYlo ", %%" _tbYlo "\n" \ - "vpshufb %%" _bsYhi ", %%" _ltmodYhi ", %%" _tbYhi "\n" \ - "vinserti64x4 $1,%%" _tbYhi ",%%" _tb ",%%" _tb "\n" \ - \ - "vextracti64x4 $1,%%" _ltmul ",%%" 
_ltmulYhi"\n" \ - \ - "vpshufb %%" _asYlo ", %%" _ltmulYlo ", %%" _asYlo "\n" \ - "vpshufb %%" _asYhi ", %%" _ltmulYhi ", %%" _asYhi "\n" \ - "vinserti64x4 $1,%%" _asYhi ",%%" _as ",%%" _as "\n" \ - \ - "vpshufb %%" _bsYlo ", %%" _ltmulYlo ", %%" _bsYlo "\n" \ - "vpshufb %%" _bsYhi ", %%" _ltmulYhi ", %%" _bsYhi "\n" \ - "vinserti64x4 $1,%%" _bsYhi ",%%" _bs ",%%" _bs "\n" \ - \ - /* lower bits */ \ - "vbroadcasti32x4 0x20(%[lt]), %%" _ltmod "\n" \ - "vbroadcasti32x4 0x30(%[lt]), %%" _ltmul "\n" \ - \ - "vpxorq %%" _ta ", %%" _as ", %%" _as "\n" \ - "vpxorq %%" _tb ", %%" _bs ", %%" _bs "\n" \ - \ - "vextracti64x4 $1,%%" _ltmod ",%%" _ltmodYhi"\n" \ - \ - "vextracti64x4 $0,%%" VR0(r) ",%%" "ymm1" "\n" \ - "vextracti64x4 $1,%%" VR0(r) ",%%" _asYhi"\n" \ - "vpshufb %%" "ymm1" ", %%" _ltmodYlo ", %%" _taYlo "\n" \ - "vpshufb %%" _asYhi ", %%" _ltmodYhi ", %%" _taYhi "\n" \ - "vinserti64x4 $1,%%" _taYhi ",%%" _ta ",%%" _ta "\n" \ - \ - "vextracti64x4 $0,%%" VR1(r) ",%%" "ymm2" "\n" \ - "vextracti64x4 $1,%%" VR1(r) ",%%" _bsYhi"\n" \ - "vpshufb %%" "ymm2" ", %%" _ltmodYlo ", %%" _tbYlo "\n" \ - "vpshufb %%" _bsYhi ", %%" _ltmodYhi ", %%" _tbYhi "\n" \ - "vinserti64x4 $1,%%" _tbYhi ",%%" _tb ",%%" _tb "\n" \ - \ - "vextracti64x4 $1,%%" _ltmul ",%%" _ltmulYhi"\n" \ - \ - "vpshufb %%" "ymm1" ", %%" _ltmulYlo ", %%" "ymm1" "\n" \ - "vpshufb %%" _asYhi ", %%" _ltmulYhi ", %%" _asYhi "\n" \ - "vinserti64x4 $1,%%" _asYhi ",%%" "zmm1" ",%%" VR0(r) "\n" \ - \ - "vpshufb %%" "ymm2" ", %%" _ltmulYlo ", %%" "ymm2" "\n" \ - "vpshufb %%" _bsYhi ", %%" _ltmulYhi ", %%" _bsYhi "\n" \ - "vinserti64x4 $1,%%" _bsYhi ",%%" "zmm2" ",%%" VR1(r) "\n" \ - \ - "vpxorq %%" _ta ", %%" VR0(r) ", %%" VR0(r) "\n" \ - "vpxorq %%" _as ", %%" VR0(r) ", %%" VR0(r) "\n" \ - "vpxorq %%" _tb ", %%" VR1(r) ", %%" VR1(r) "\n" \ - "vpxorq %%" _bs ", %%" VR1(r) ", %%" VR1(r) "\n" \ - : : [c0f] "r" (0x0f0f0f0f0f0f0f0f), \ - [lt] "r" (gf_clmul_mod_lt[4*(c)])); \ - break; \ - default: \ - ASM_BUG(); \ - } \ + if 
(x & 0x01) { COPY(in, acc); } else { ZERO(acc); } \ + if (x & 0xfe) { MUL2(in); } \ + if (x & 0x02) { XOR(in, acc); } \ + if (x & 0xfc) { MUL2(in); } \ + if (x & 0x04) { XOR(in, acc); } \ + if (x & 0xf8) { MUL2(in); } \ + if (x & 0x08) { XOR(in, acc); } \ + if (x & 0xf0) { MUL2(in); } \ + if (x & 0x10) { XOR(in, acc); } \ + if (x & 0xe0) { MUL2(in); } \ + if (x & 0x20) { XOR(in, acc); } \ + if (x & 0xc0) { MUL2(in); } \ + if (x & 0x40) { XOR(in, acc); } \ + if (x & 0x80) { MUL2(in); XOR(in, acc); } \ } -#define MUL(c, r...) \ +#define MUL_x2_DEFINE(x) \ +static void \ +mul_x2_ ## x(void) { _MUL_PARAM(x, _mul_x2_in, _mul_x2_acc); } + + +MUL_x2_DEFINE(0); MUL_x2_DEFINE(1); MUL_x2_DEFINE(2); MUL_x2_DEFINE(3); +MUL_x2_DEFINE(4); MUL_x2_DEFINE(5); MUL_x2_DEFINE(6); MUL_x2_DEFINE(7); +MUL_x2_DEFINE(8); MUL_x2_DEFINE(9); MUL_x2_DEFINE(10); MUL_x2_DEFINE(11); +MUL_x2_DEFINE(12); MUL_x2_DEFINE(13); MUL_x2_DEFINE(14); MUL_x2_DEFINE(15); +MUL_x2_DEFINE(16); MUL_x2_DEFINE(17); MUL_x2_DEFINE(18); MUL_x2_DEFINE(19); +MUL_x2_DEFINE(20); MUL_x2_DEFINE(21); MUL_x2_DEFINE(22); MUL_x2_DEFINE(23); +MUL_x2_DEFINE(24); MUL_x2_DEFINE(25); MUL_x2_DEFINE(26); MUL_x2_DEFINE(27); +MUL_x2_DEFINE(28); MUL_x2_DEFINE(29); MUL_x2_DEFINE(30); MUL_x2_DEFINE(31); +MUL_x2_DEFINE(32); MUL_x2_DEFINE(33); MUL_x2_DEFINE(34); MUL_x2_DEFINE(35); +MUL_x2_DEFINE(36); MUL_x2_DEFINE(37); MUL_x2_DEFINE(38); MUL_x2_DEFINE(39); +MUL_x2_DEFINE(40); MUL_x2_DEFINE(41); MUL_x2_DEFINE(42); MUL_x2_DEFINE(43); +MUL_x2_DEFINE(44); MUL_x2_DEFINE(45); MUL_x2_DEFINE(46); MUL_x2_DEFINE(47); +MUL_x2_DEFINE(48); MUL_x2_DEFINE(49); MUL_x2_DEFINE(50); MUL_x2_DEFINE(51); +MUL_x2_DEFINE(52); MUL_x2_DEFINE(53); MUL_x2_DEFINE(54); MUL_x2_DEFINE(55); +MUL_x2_DEFINE(56); MUL_x2_DEFINE(57); MUL_x2_DEFINE(58); MUL_x2_DEFINE(59); +MUL_x2_DEFINE(60); MUL_x2_DEFINE(61); MUL_x2_DEFINE(62); MUL_x2_DEFINE(63); +MUL_x2_DEFINE(64); MUL_x2_DEFINE(65); MUL_x2_DEFINE(66); MUL_x2_DEFINE(67); +MUL_x2_DEFINE(68); MUL_x2_DEFINE(69); MUL_x2_DEFINE(70); 
MUL_x2_DEFINE(71); +MUL_x2_DEFINE(72); MUL_x2_DEFINE(73); MUL_x2_DEFINE(74); MUL_x2_DEFINE(75); +MUL_x2_DEFINE(76); MUL_x2_DEFINE(77); MUL_x2_DEFINE(78); MUL_x2_DEFINE(79); +MUL_x2_DEFINE(80); MUL_x2_DEFINE(81); MUL_x2_DEFINE(82); MUL_x2_DEFINE(83); +MUL_x2_DEFINE(84); MUL_x2_DEFINE(85); MUL_x2_DEFINE(86); MUL_x2_DEFINE(87); +MUL_x2_DEFINE(88); MUL_x2_DEFINE(89); MUL_x2_DEFINE(90); MUL_x2_DEFINE(91); +MUL_x2_DEFINE(92); MUL_x2_DEFINE(93); MUL_x2_DEFINE(94); MUL_x2_DEFINE(95); +MUL_x2_DEFINE(96); MUL_x2_DEFINE(97); MUL_x2_DEFINE(98); MUL_x2_DEFINE(99); +MUL_x2_DEFINE(100); MUL_x2_DEFINE(101); MUL_x2_DEFINE(102); MUL_x2_DEFINE(103); +MUL_x2_DEFINE(104); MUL_x2_DEFINE(105); MUL_x2_DEFINE(106); MUL_x2_DEFINE(107); +MUL_x2_DEFINE(108); MUL_x2_DEFINE(109); MUL_x2_DEFINE(110); MUL_x2_DEFINE(111); +MUL_x2_DEFINE(112); MUL_x2_DEFINE(113); MUL_x2_DEFINE(114); MUL_x2_DEFINE(115); +MUL_x2_DEFINE(116); MUL_x2_DEFINE(117); MUL_x2_DEFINE(118); MUL_x2_DEFINE(119); +MUL_x2_DEFINE(120); MUL_x2_DEFINE(121); MUL_x2_DEFINE(122); MUL_x2_DEFINE(123); +MUL_x2_DEFINE(124); MUL_x2_DEFINE(125); MUL_x2_DEFINE(126); MUL_x2_DEFINE(127); +MUL_x2_DEFINE(128); MUL_x2_DEFINE(129); MUL_x2_DEFINE(130); MUL_x2_DEFINE(131); +MUL_x2_DEFINE(132); MUL_x2_DEFINE(133); MUL_x2_DEFINE(134); MUL_x2_DEFINE(135); +MUL_x2_DEFINE(136); MUL_x2_DEFINE(137); MUL_x2_DEFINE(138); MUL_x2_DEFINE(139); +MUL_x2_DEFINE(140); MUL_x2_DEFINE(141); MUL_x2_DEFINE(142); MUL_x2_DEFINE(143); +MUL_x2_DEFINE(144); MUL_x2_DEFINE(145); MUL_x2_DEFINE(146); MUL_x2_DEFINE(147); +MUL_x2_DEFINE(148); MUL_x2_DEFINE(149); MUL_x2_DEFINE(150); MUL_x2_DEFINE(151); +MUL_x2_DEFINE(152); MUL_x2_DEFINE(153); MUL_x2_DEFINE(154); MUL_x2_DEFINE(155); +MUL_x2_DEFINE(156); MUL_x2_DEFINE(157); MUL_x2_DEFINE(158); MUL_x2_DEFINE(159); +MUL_x2_DEFINE(160); MUL_x2_DEFINE(161); MUL_x2_DEFINE(162); MUL_x2_DEFINE(163); +MUL_x2_DEFINE(164); MUL_x2_DEFINE(165); MUL_x2_DEFINE(166); MUL_x2_DEFINE(167); +MUL_x2_DEFINE(168); MUL_x2_DEFINE(169); MUL_x2_DEFINE(170); 
MUL_x2_DEFINE(171); +MUL_x2_DEFINE(172); MUL_x2_DEFINE(173); MUL_x2_DEFINE(174); MUL_x2_DEFINE(175); +MUL_x2_DEFINE(176); MUL_x2_DEFINE(177); MUL_x2_DEFINE(178); MUL_x2_DEFINE(179); +MUL_x2_DEFINE(180); MUL_x2_DEFINE(181); MUL_x2_DEFINE(182); MUL_x2_DEFINE(183); +MUL_x2_DEFINE(184); MUL_x2_DEFINE(185); MUL_x2_DEFINE(186); MUL_x2_DEFINE(187); +MUL_x2_DEFINE(188); MUL_x2_DEFINE(189); MUL_x2_DEFINE(190); MUL_x2_DEFINE(191); +MUL_x2_DEFINE(192); MUL_x2_DEFINE(193); MUL_x2_DEFINE(194); MUL_x2_DEFINE(195); +MUL_x2_DEFINE(196); MUL_x2_DEFINE(197); MUL_x2_DEFINE(198); MUL_x2_DEFINE(199); +MUL_x2_DEFINE(200); MUL_x2_DEFINE(201); MUL_x2_DEFINE(202); MUL_x2_DEFINE(203); +MUL_x2_DEFINE(204); MUL_x2_DEFINE(205); MUL_x2_DEFINE(206); MUL_x2_DEFINE(207); +MUL_x2_DEFINE(208); MUL_x2_DEFINE(209); MUL_x2_DEFINE(210); MUL_x2_DEFINE(211); +MUL_x2_DEFINE(212); MUL_x2_DEFINE(213); MUL_x2_DEFINE(214); MUL_x2_DEFINE(215); +MUL_x2_DEFINE(216); MUL_x2_DEFINE(217); MUL_x2_DEFINE(218); MUL_x2_DEFINE(219); +MUL_x2_DEFINE(220); MUL_x2_DEFINE(221); MUL_x2_DEFINE(222); MUL_x2_DEFINE(223); +MUL_x2_DEFINE(224); MUL_x2_DEFINE(225); MUL_x2_DEFINE(226); MUL_x2_DEFINE(227); +MUL_x2_DEFINE(228); MUL_x2_DEFINE(229); MUL_x2_DEFINE(230); MUL_x2_DEFINE(231); +MUL_x2_DEFINE(232); MUL_x2_DEFINE(233); MUL_x2_DEFINE(234); MUL_x2_DEFINE(235); +MUL_x2_DEFINE(236); MUL_x2_DEFINE(237); MUL_x2_DEFINE(238); MUL_x2_DEFINE(239); +MUL_x2_DEFINE(240); MUL_x2_DEFINE(241); MUL_x2_DEFINE(242); MUL_x2_DEFINE(243); +MUL_x2_DEFINE(244); MUL_x2_DEFINE(245); MUL_x2_DEFINE(246); MUL_x2_DEFINE(247); +MUL_x2_DEFINE(248); MUL_x2_DEFINE(249); MUL_x2_DEFINE(250); MUL_x2_DEFINE(251); +MUL_x2_DEFINE(252); MUL_x2_DEFINE(253); MUL_x2_DEFINE(254); MUL_x2_DEFINE(255); + + +typedef void (*mul_fn_ptr_t)(void); + +static const mul_fn_ptr_t __attribute__((aligned(256))) +gf_x2_mul_fns[256] = { + mul_x2_0, mul_x2_1, mul_x2_2, mul_x2_3, mul_x2_4, mul_x2_5, + mul_x2_6, mul_x2_7, mul_x2_8, mul_x2_9, mul_x2_10, mul_x2_11, + mul_x2_12, mul_x2_13, 
mul_x2_14, mul_x2_15, mul_x2_16, mul_x2_17, + mul_x2_18, mul_x2_19, mul_x2_20, mul_x2_21, mul_x2_22, mul_x2_23, + mul_x2_24, mul_x2_25, mul_x2_26, mul_x2_27, mul_x2_28, mul_x2_29, + mul_x2_30, mul_x2_31, mul_x2_32, mul_x2_33, mul_x2_34, mul_x2_35, + mul_x2_36, mul_x2_37, mul_x2_38, mul_x2_39, mul_x2_40, mul_x2_41, + mul_x2_42, mul_x2_43, mul_x2_44, mul_x2_45, mul_x2_46, mul_x2_47, + mul_x2_48, mul_x2_49, mul_x2_50, mul_x2_51, mul_x2_52, mul_x2_53, + mul_x2_54, mul_x2_55, mul_x2_56, mul_x2_57, mul_x2_58, mul_x2_59, + mul_x2_60, mul_x2_61, mul_x2_62, mul_x2_63, mul_x2_64, mul_x2_65, + mul_x2_66, mul_x2_67, mul_x2_68, mul_x2_69, mul_x2_70, mul_x2_71, + mul_x2_72, mul_x2_73, mul_x2_74, mul_x2_75, mul_x2_76, mul_x2_77, + mul_x2_78, mul_x2_79, mul_x2_80, mul_x2_81, mul_x2_82, mul_x2_83, + mul_x2_84, mul_x2_85, mul_x2_86, mul_x2_87, mul_x2_88, mul_x2_89, + mul_x2_90, mul_x2_91, mul_x2_92, mul_x2_93, mul_x2_94, mul_x2_95, + mul_x2_96, mul_x2_97, mul_x2_98, mul_x2_99, mul_x2_100, mul_x2_101, + mul_x2_102, mul_x2_103, mul_x2_104, mul_x2_105, mul_x2_106, mul_x2_107, + mul_x2_108, mul_x2_109, mul_x2_110, mul_x2_111, mul_x2_112, mul_x2_113, + mul_x2_114, mul_x2_115, mul_x2_116, mul_x2_117, mul_x2_118, mul_x2_119, + mul_x2_120, mul_x2_121, mul_x2_122, mul_x2_123, mul_x2_124, mul_x2_125, + mul_x2_126, mul_x2_127, mul_x2_128, mul_x2_129, mul_x2_130, mul_x2_131, + mul_x2_132, mul_x2_133, mul_x2_134, mul_x2_135, mul_x2_136, mul_x2_137, + mul_x2_138, mul_x2_139, mul_x2_140, mul_x2_141, mul_x2_142, mul_x2_143, + mul_x2_144, mul_x2_145, mul_x2_146, mul_x2_147, mul_x2_148, mul_x2_149, + mul_x2_150, mul_x2_151, mul_x2_152, mul_x2_153, mul_x2_154, mul_x2_155, + mul_x2_156, mul_x2_157, mul_x2_158, mul_x2_159, mul_x2_160, mul_x2_161, + mul_x2_162, mul_x2_163, mul_x2_164, mul_x2_165, mul_x2_166, mul_x2_167, + mul_x2_168, mul_x2_169, mul_x2_170, mul_x2_171, mul_x2_172, mul_x2_173, + mul_x2_174, mul_x2_175, mul_x2_176, mul_x2_177, mul_x2_178, mul_x2_179, + mul_x2_180, mul_x2_181, mul_x2_182, 
mul_x2_183, mul_x2_184, mul_x2_185, + mul_x2_186, mul_x2_187, mul_x2_188, mul_x2_189, mul_x2_190, mul_x2_191, + mul_x2_192, mul_x2_193, mul_x2_194, mul_x2_195, mul_x2_196, mul_x2_197, + mul_x2_198, mul_x2_199, mul_x2_200, mul_x2_201, mul_x2_202, mul_x2_203, + mul_x2_204, mul_x2_205, mul_x2_206, mul_x2_207, mul_x2_208, mul_x2_209, + mul_x2_210, mul_x2_211, mul_x2_212, mul_x2_213, mul_x2_214, mul_x2_215, + mul_x2_216, mul_x2_217, mul_x2_218, mul_x2_219, mul_x2_220, mul_x2_221, + mul_x2_222, mul_x2_223, mul_x2_224, mul_x2_225, mul_x2_226, mul_x2_227, + mul_x2_228, mul_x2_229, mul_x2_230, mul_x2_231, mul_x2_232, mul_x2_233, + mul_x2_234, mul_x2_235, mul_x2_236, mul_x2_237, mul_x2_238, mul_x2_239, + mul_x2_240, mul_x2_241, mul_x2_242, mul_x2_243, mul_x2_244, mul_x2_245, + mul_x2_246, mul_x2_247, mul_x2_248, mul_x2_249, mul_x2_250, mul_x2_251, + mul_x2_252, mul_x2_253, mul_x2_254, mul_x2_255 +}; + +#define MUL(c, r...) \ { \ switch (REG_CNT(r)) { \ case 4: \ - _MULx2(c, R_01(r)); \ - _MULx2(c, R_23(r)); \ - break; \ - case 2: \ - _MULx2(c, R_01(r)); \ - break; \ - default: \ - ASM_BUG(); \ + COPY(R_01(r), _mul_x2_in); \ + gf_x2_mul_fns[c](); \ + COPY(_mul_x2_acc, R_01(r)); \ + COPY(R_23(r), _mul_x2_in); \ + gf_x2_mul_fns[c](); \ + COPY(_mul_x2_acc, R_23(r)); \ } \ } + #define raidz_math_begin() kfpu_begin() -#define raidz_math_end() \ -{ \ - FLUSH(); \ - kfpu_end(); \ -} +#define raidz_math_end() kfpu_end() + + +#define SYN_STRIDE 4 + +#define ZERO_STRIDE 4 +#define ZERO_DEFINE() {} +#define ZERO_D 0, 1, 2, 3 + +#define COPY_STRIDE 4 +#define COPY_DEFINE() {} +#define COPY_D 0, 1, 2, 3 + +#define ADD_STRIDE 4 +#define ADD_DEFINE() {} +#define ADD_D 0, 1, 2, 3 + +#define MUL_STRIDE 4 +#define MUL_DEFINE() MUL2_SETUP() +#define MUL_D 0, 1, 2, 3 -/* - * This use zmm16-zmm31 registers to free up zmm0-zmm15 - * to use with the AVX2 pshufb, see above - */ -#define GEN_P_DEFINE() {} #define GEN_P_STRIDE 4 -#define GEN_P_P 20, 21, 22, 23 +#define GEN_P_DEFINE() {} +#define 
GEN_P_P 0, 1, 2, 3 -#define GEN_PQ_DEFINE() {} #define GEN_PQ_STRIDE 4 -#define GEN_PQ_D 20, 21, 22, 23 -#define GEN_PQ_P 24, 25, 26, 27 -#define GEN_PQ_Q 28, 29, 3, 4 +#define GEN_PQ_DEFINE() {} +#define GEN_PQ_D 0, 1, 2, 3 +#define GEN_PQ_C 4, 5, 6, 7 +#define GEN_PQR_STRIDE 4 #define GEN_PQR_DEFINE() {} -#define GEN_PQR_STRIDE 2 -#define GEN_PQR_D 20, 21 -#define GEN_PQR_P 22, 23 -#define GEN_PQR_Q 24, 25 -#define GEN_PQR_R 26, 27 - -#define REC_P_DEFINE() {} -#define REC_P_STRIDE 4 -#define REC_P_X 20, 21, 22, 23 - -#define REC_Q_DEFINE() {} -#define REC_Q_STRIDE 4 -#define REC_Q_X 20, 21, 22, 23 - -#define REC_R_DEFINE() {} -#define REC_R_STRIDE 4 -#define REC_R_X 20, 21, 22, 23 - -#define REC_PQ_DEFINE() {} -#define REC_PQ_STRIDE 2 -#define REC_PQ_X 20, 21 -#define REC_PQ_Y 22, 23 -#define REC_PQ_D 24, 25 - -#define REC_PR_DEFINE() {} -#define REC_PR_STRIDE 2 -#define REC_PR_X 20, 21 -#define REC_PR_Y 22, 23 -#define REC_PR_D 24, 25 - -#define REC_QR_DEFINE() {} -#define REC_QR_STRIDE 2 -#define REC_QR_X 20, 21 -#define REC_QR_Y 22, 23 -#define REC_QR_D 24, 25 - -#define REC_PQR_DEFINE() {} -#define REC_PQR_STRIDE 2 -#define REC_PQR_X 20, 21 -#define REC_PQR_Y 22, 23 -#define REC_PQR_Z 24, 25 -#define REC_PQR_D 26, 27 -#define REC_PQR_XS 26, 27 -#define REC_PQR_YS 28, 29 +#define GEN_PQR_D 0, 1, 2, 3 +#define GEN_PQR_C 4, 5, 6, 7 + +#define SYN_Q_DEFINE() {} +#define SYN_Q_D 0, 1, 2, 3 +#define SYN_Q_X 4, 5, 6, 7 + +#define SYN_R_DEFINE() {} +#define SYN_R_D 0, 1, 2, 3 +#define SYN_R_X 4, 5, 6, 7 + +#define SYN_PQ_DEFINE() {} +#define SYN_PQ_D 0, 1, 2, 3 +#define SYN_PQ_X 4, 5, 6, 7 + +#define REC_PQ_STRIDE 4 +#define REC_PQ_DEFINE() MUL2_SETUP() +#define REC_PQ_X 0, 1, 2, 3 +#define REC_PQ_Y 4, 5, 6, 7 +#define REC_PQ_T 8, 9, 10, 11 + +#define SYN_PR_DEFINE() {} +#define SYN_PR_D 0, 1, 2, 3 +#define SYN_PR_X 4, 5, 6, 7 + +#define REC_PR_STRIDE 4 +#define REC_PR_DEFINE() MUL2_SETUP() +#define REC_PR_X 0, 1, 2, 3 +#define REC_PR_Y 4, 5, 6, 7 +#define REC_PR_T 
8, 9, 10, 11 + +#define SYN_QR_DEFINE() {} +#define SYN_QR_D 0, 1, 2, 3 +#define SYN_QR_X 4, 5, 6, 7 + +#define REC_QR_STRIDE 4 +#define REC_QR_DEFINE() MUL2_SETUP() +#define REC_QR_X 0, 1, 2, 3 +#define REC_QR_Y 4, 5, 6, 7 +#define REC_QR_T 8, 9, 10, 11 + +#define SYN_PQR_DEFINE() {} +#define SYN_PQR_D 0, 1, 2, 3 +#define SYN_PQR_X 4, 5, 6, 7 + +#define REC_PQR_STRIDE 4 +#define REC_PQR_DEFINE() MUL2_SETUP() +#define REC_PQR_X 0, 1, 2, 3 +#define REC_PQR_Y 4, 5, 6, 7 +#define REC_PQR_Z 8, 9, 10, 11 +#define REC_PQR_XS 12, 13, 14, 15 +#define REC_PQR_YS 16, 17, 18, 19 #include <sys/vdev_raidz_impl.h> @@ -508,6 +471,7 @@ static boolean_t raidz_will_avx512f_work(void) { return (zfs_avx_available() && + zfs_avx2_available() && zfs_avx512f_available()); } diff --git a/module/zfs/vdev_raidz_math_impl.h b/module/zfs/vdev_raidz_math_impl.h index 70257ee49..171380524 100644 --- a/module/zfs/vdev_raidz_math_impl.h +++ b/module/zfs/vdev_raidz_math_impl.h @@ -32,250 +32,14 @@ #define noinline __attribute__((noinline)) #endif -/* Calculate data offset in raidz column, offset is in bytes */ -#define COL_OFF(col, off) ((v_t *)(((char *)(col)->rc_data) + (off))) - -/* - * PARITY CALCULATION - * An optimized function is called for a full length of data columns - * If RAIDZ map contains remainder columns (shorter columns) the same function - * is called for reminder of full columns. - * - * GEN_[P|PQ|PQR]_BLOCK() functions are designed to be efficiently in-lined by - * the compiler. This removes a lot of conditionals from the inside loop which - * makes the code faster, especially for vectorized code. - * They are also highly parametrized, allowing for each implementation to define - * most optimal stride, and register allocation. 
- */ - -static raidz_inline void -GEN_P_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, - const int ncols) -{ - int c; - size_t ioff; - raidz_col_t * const pcol = raidz_col_p(rm, CODE_P); - raidz_col_t *col; - - GEN_P_DEFINE(); - - for (ioff = off; ioff < end; ioff += (GEN_P_STRIDE * sizeof (v_t))) { - LOAD(COL_OFF(&(rm->rm_col[1]), ioff), GEN_P_P); - - for (c = 2; c < ncols; c++) { - col = &rm->rm_col[c]; - XOR_ACC(COL_OFF(col, ioff), GEN_P_P); - } - - STORE(COL_OFF(pcol, ioff), GEN_P_P); - } -} - -/* - * Generate P parity (RAIDZ1) - * - * @rm RAIDZ map - */ -static raidz_inline void -raidz_generate_p_impl(raidz_map_t * const rm) -{ - const int ncols = raidz_ncols(rm); - const size_t psize = raidz_big_size(rm); - const size_t short_size = raidz_short_size(rm); - - raidz_math_begin(); - - /* short_size */ - GEN_P_BLOCK(rm, 0, short_size, ncols); - - /* fullcols */ - GEN_P_BLOCK(rm, short_size, psize, raidz_nbigcols(rm)); - - raidz_math_end(); -} - -static raidz_inline void -GEN_PQ_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, - const int ncols, const int nbigcols) -{ - int c; - size_t ioff; - raidz_col_t * const pcol = raidz_col_p(rm, CODE_P); - raidz_col_t * const qcol = raidz_col_p(rm, CODE_Q); - raidz_col_t *col; - - GEN_PQ_DEFINE(); - - MUL2_SETUP(); - - for (ioff = off; ioff < end; ioff += (GEN_PQ_STRIDE * sizeof (v_t))) { - LOAD(COL_OFF(&rm->rm_col[2], ioff), GEN_PQ_P); - COPY(GEN_PQ_P, GEN_PQ_Q); - - for (c = 3; c < nbigcols; c++) { - col = &rm->rm_col[c]; - LOAD(COL_OFF(col, ioff), GEN_PQ_D); - MUL2(GEN_PQ_Q); - XOR(GEN_PQ_D, GEN_PQ_P); - XOR(GEN_PQ_D, GEN_PQ_Q); - } - - STORE(COL_OFF(pcol, ioff), GEN_PQ_P); - - for (; c < ncols; c++) - MUL2(GEN_PQ_Q); - - STORE(COL_OFF(qcol, ioff), GEN_PQ_Q); - } -} - -/* - * Generate PQ parity (RAIDZ2) - * - * @rm RAIDZ map - */ -static raidz_inline void -raidz_generate_pq_impl(raidz_map_t * const rm) -{ - const int ncols = raidz_ncols(rm); - const size_t psize = raidz_big_size(rm); - 
const size_t short_size = raidz_short_size(rm); - - raidz_math_begin(); - - /* short_size */ - GEN_PQ_BLOCK(rm, 0, short_size, ncols, ncols); - - /* fullcols */ - GEN_PQ_BLOCK(rm, short_size, psize, ncols, raidz_nbigcols(rm)); - - raidz_math_end(); -} - - -static raidz_inline void -GEN_PQR_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, - const int ncols, const int nbigcols) -{ - int c; - size_t ioff; - raidz_col_t *col; - raidz_col_t * const pcol = raidz_col_p(rm, CODE_P); - raidz_col_t * const qcol = raidz_col_p(rm, CODE_Q); - raidz_col_t * const rcol = raidz_col_p(rm, CODE_R); - - GEN_PQR_DEFINE(); - - MUL2_SETUP(); - - for (ioff = off; ioff < end; ioff += (GEN_PQR_STRIDE * sizeof (v_t))) { - LOAD(COL_OFF(&rm->rm_col[3], ioff), GEN_PQR_P); - COPY(GEN_PQR_P, GEN_PQR_Q); - COPY(GEN_PQR_P, GEN_PQR_R); - - for (c = 4; c < nbigcols; c++) { - col = &rm->rm_col[c]; - LOAD(COL_OFF(col, ioff), GEN_PQR_D); - MUL2(GEN_PQR_Q); - MUL4(GEN_PQR_R); - XOR(GEN_PQR_D, GEN_PQR_P); - XOR(GEN_PQR_D, GEN_PQR_Q); - XOR(GEN_PQR_D, GEN_PQR_R); - } - - STORE(COL_OFF(pcol, ioff), GEN_PQR_P); - - for (; c < ncols; c++) { - MUL2(GEN_PQR_Q); - MUL4(GEN_PQR_R); - } - - STORE(COL_OFF(qcol, ioff), GEN_PQR_Q); - STORE(COL_OFF(rcol, ioff), GEN_PQR_R); - } -} - - -/* - * Generate PQR parity (RAIDZ3) - * - * @rm RAIDZ map - */ -static raidz_inline void -raidz_generate_pqr_impl(raidz_map_t * const rm) -{ - const int ncols = raidz_ncols(rm); - const size_t psize = raidz_big_size(rm); - const size_t short_size = raidz_short_size(rm); - - raidz_math_begin(); - - /* short_size */ - GEN_PQR_BLOCK(rm, 0, short_size, ncols, ncols); - - /* fullcols */ - GEN_PQR_BLOCK(rm, short_size, psize, ncols, raidz_nbigcols(rm)); - - raidz_math_end(); -} - -/* - * DATA RECONSTRUCTION - * - * Data reconstruction process consists of two phases: - * - Syndrome calculation - * - Data reconstruction - * - * Syndrome is calculated by generating parity using available data columns - * and zeros in places of 
erasure. Existing parity is added to corresponding - * syndrome value to obtain the [P|Q|R]syn values from equation: - * P = Psyn + Dx + Dy + Dz - * Q = Qsyn + 2^x * Dx + 2^y * Dy + 2^z * Dz - * R = Rsyn + 4^x * Dx + 4^y * Dy + 4^z * Dz - * - * For data reconstruction phase, the corresponding equations are solved - * for missing data (Dx, Dy, Dz). This generally involves multiplying known - * symbols by an coefficient and adding them together. The multiplication - * constant coefficients are calculated ahead of the operation in - * raidz_rec_[q|r|pq|pq|qr|pqr]_coeff() functions. - * - * IMPLEMENTATION NOTE: RAID-Z block can have complex geometry, with "big" - * and "short" columns. - * For this reason, reconstruction is performed in minimum of - * two steps. First, from offset 0 to short_size, then from short_size to - * short_size. Calculation functions REC_[*]_BLOCK() are implemented to work - * over both ranges. The split also enables removal of conditional expressions - * from loop bodies, improving throughput of SIMD implementations. - * For the best performance, all functions marked with raidz_inline attribute - * must be inlined by compiler. - * - * parity data - * columns columns - * <----------> <------------------> - * x y <----+ missing columns (x, y) - * | | - * +---+---+---+---+-v-+---+-v-+---+ ^ 0 - * | | | | | | | | | | - * | | | | | | | | | | - * | P | Q | R | D | D | D | D | D | | - * | | | | 0 | 1 | 2 | 3 | 4 | | - * | | | | | | | | | v - * | | | | | +---+---+---+ ^ short_size - * | | | | | | | - * +---+---+---+---+---+ v big_size - * <------------------> <----------> - * big columns short columns - * - */ - /* * Functions calculate multiplication constants for data reconstruction. * Coefficients depend on RAIDZ geometry, indexes of failed child vdevs, and * used parity columns for reconstruction. * @rm RAIDZ map * @tgtidx array of missing data indexes - * @coeff output array of coefficients. 
Array must be user - * provided and must hold minimum MUL_CNT values + * @coeff output array of coefficients. Array must be provided by + * user and must hold minimum MUL_CNT values. */ static noinline void raidz_rec_q_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) @@ -383,240 +147,602 @@ raidz_rec_pqr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) coeff[MUL_PQR_YQ] = yd; } +/* + * Method for zeroing a buffer (can be implemented using SIMD). + * This method is used by multiple for gen/rec functions. + * + * @dc Destination buffer + * @dsize Destination buffer size + * @private Unused + */ +static int +raidz_zero_abd_cb(void *dc, size_t dsize, void *private) +{ + v_t *dst = (v_t *) dc; + size_t i; + + ZERO_DEFINE(); + + (void) private; /* unused */ + + ZERO(ZERO_D); + + for (i = 0; i < dsize / sizeof (v_t); i += (2 * ZERO_STRIDE)) { + STORE(dst + i, ZERO_D); + STORE(dst + i + ZERO_STRIDE, ZERO_D); + } + + return (0); +} + +#define raidz_zero(dabd, size) \ +{ \ + abd_iterate_func(dabd, 0, size, raidz_zero_abd_cb, NULL); \ +} /* - * Reconstruction using P parity - * @rm RAIDZ map - * @off starting offset - * @end ending offset - * @x missing data column - * @ncols number of column + * Method for copying two buffers (can be implemented using SIMD). + * This method is used by multiple for gen/rec functions. 
+ * + * @dc Destination buffer + * @sc Source buffer + * @dsize Destination buffer size + * @ssize Source buffer size + * @private Unused */ -static raidz_inline void -REC_P_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, - const int x, const int ncols) +static int +raidz_copy_abd_cb(void *dc, void *sc, size_t size, void *private) { - int c; - size_t ioff; - const size_t firstdc = raidz_parity(rm); - raidz_col_t * const pcol = raidz_col_p(rm, CODE_P); - raidz_col_t * const xcol = raidz_col_p(rm, x); - raidz_col_t *col; + v_t *dst = (v_t *) dc; + const v_t *src = (v_t *) sc; + size_t i; - REC_P_DEFINE(); + COPY_DEFINE(); - for (ioff = off; ioff < end; ioff += (REC_P_STRIDE * sizeof (v_t))) { - LOAD(COL_OFF(pcol, ioff), REC_P_X); + (void) private; /* unused */ - for (c = firstdc; c < x; c++) { - col = &rm->rm_col[c]; - XOR_ACC(COL_OFF(col, ioff), REC_P_X); - } + for (i = 0; i < size / sizeof (v_t); i += (2 * COPY_STRIDE)) { + LOAD(src + i, COPY_D); + STORE(dst + i, COPY_D); + + LOAD(src + i + COPY_STRIDE, COPY_D); + STORE(dst + i + COPY_STRIDE, COPY_D); + } + + return (0); +} - for (c++; c < ncols; c++) { - col = &rm->rm_col[c]; - XOR_ACC(COL_OFF(col, ioff), REC_P_X); - } - STORE(COL_OFF(xcol, ioff), REC_P_X); +#define raidz_copy(dabd, sabd, size) \ +{ \ + abd_iterate_func2(dabd, sabd, 0, 0, size, raidz_copy_abd_cb, NULL);\ +} + +/* + * Method for adding (XORing) two buffers. + * Source and destination are XORed together and result is stored in + * destination buffer. This method is used by multiple for gen/rec functions. 
+ * + * @dc Destination buffer + * @sc Source buffer + * @dsize Destination buffer size + * @ssize Source buffer size + * @private Unused + */ +static int +raidz_add_abd_cb(void *dc, void *sc, size_t size, void *private) +{ + v_t *dst = (v_t *) dc; + const v_t *src = (v_t *) sc; + size_t i; + + ADD_DEFINE(); + + (void) private; /* unused */ + + for (i = 0; i < size / sizeof (v_t); i += (2 * ADD_STRIDE)) { + LOAD(dst + i, ADD_D); + XOR_ACC(src + i, ADD_D); + STORE(dst + i, ADD_D); + + LOAD(dst + i + ADD_STRIDE, ADD_D); + XOR_ACC(src + i + ADD_STRIDE, ADD_D); + STORE(dst + i + ADD_STRIDE, ADD_D); } + + return (0); +} + +#define raidz_add(dabd, sabd, size) \ +{ \ + abd_iterate_func2(dabd, sabd, 0, 0, size, raidz_add_abd_cb, NULL);\ } /* - * Reconstruct single data column using P parity - * @rec_method REC_P_BLOCK() + * Method for multiplying a buffer with a constant in GF(2^8). + * Symbols from buffer are multiplied by a constant and result is stored + * back in the same buffer. * - * @rm RAIDZ map - * @tgtidx array of missing data indexes + * @dc In/Out data buffer. 
+ * @size Size of the buffer + * @private pointer to the multiplication constant (unsigned) */ -static raidz_inline int -raidz_reconstruct_p_impl(raidz_map_t *rm, const int *tgtidx) +static int +raidz_mul_abd_cb(void *dc, size_t size, void *private) +{ + const unsigned mul = *((unsigned *) private); + v_t *d = (v_t *) dc; + size_t i; + + MUL_DEFINE(); + + for (i = 0; i < size / sizeof (v_t); i += (2 * MUL_STRIDE)) { + LOAD(d + i, MUL_D); + MUL(mul, MUL_D); + STORE(d + i, MUL_D); + + LOAD(d + i + MUL_STRIDE, MUL_D); + MUL(mul, MUL_D); + STORE(d + i + MUL_STRIDE, MUL_D); + } + + return (0); +} + + +/* + * Syndrome generation/update macros + * + * Require LOAD(), XOR(), STORE(), MUL2(), and MUL4() macros + */ +#define P_D_SYNDROME(D, T, t) \ +{ \ + LOAD((t), T); \ + XOR(D, T); \ + STORE((t), T); \ +} + +#define Q_D_SYNDROME(D, T, t) \ +{ \ + LOAD((t), T); \ + MUL2(T); \ + XOR(D, T); \ + STORE((t), T); \ +} + +#define Q_SYNDROME(T, t) \ +{ \ + LOAD((t), T); \ + MUL2(T); \ + STORE((t), T); \ +} + +#define R_D_SYNDROME(D, T, t) \ +{ \ + LOAD((t), T); \ + MUL4(T); \ + XOR(D, T); \ + STORE((t), T); \ +} + +#define R_SYNDROME(T, t) \ +{ \ + LOAD((t), T); \ + MUL4(T); \ + STORE((t), T); \ +} + + +/* + * PARITY CALCULATION + * + * Macros *_SYNDROME are used for parity/syndrome calculation. + * *_D_SYNDROME() macros are used to calculate syndrome between 0 and + * length of data column, and *_SYNDROME() macros are only for updating + * the parity/syndrome if data column is shorter. + * + * P parity is calculated using raidz_add_abd(). 
+ */ + +/* + * Generate P parity (RAIDZ1) + * + * @rm RAIDZ map + */ +static raidz_inline void +raidz_generate_p_impl(raidz_map_t * const rm) { - const int x = tgtidx[TARGET_X]; - const int ncols = raidz_ncols(rm); - const int nbigcols = raidz_nbigcols(rm); - const size_t xsize = raidz_col_size(rm, x); - const size_t short_size = raidz_short_size(rm); + size_t c; + const size_t ncols = raidz_ncols(rm); + const size_t psize = rm->rm_col[CODE_P].rc_size; + abd_t *pabd = rm->rm_col[CODE_P].rc_abd; + size_t size; + abd_t *dabd; raidz_math_begin(); - /* 0 - short_size */ - REC_P_BLOCK(rm, 0, short_size, x, ncols); + /* start with first data column */ + raidz_copy(pabd, rm->rm_col[1].rc_abd, psize); - /* short_size - xsize */ - REC_P_BLOCK(rm, short_size, xsize, x, nbigcols); + for (c = 2; c < ncols; c++) { + dabd = rm->rm_col[c].rc_abd; + size = rm->rm_col[c].rc_size; - raidz_math_end(); + /* add data column */ + raidz_add(pabd, dabd, size); + } - return (1 << CODE_P); + raidz_math_end(); } + /* - * Reconstruct using Q parity + * Generate PQ parity (RAIDZ2) + * The function is called per data column. 
+ * + * @c array of pointers to parity (code) columns + * @dc pointer to data column + * @csize size of parity columns + * @dsize size of data column */ +static void +raidz_gen_pq_add(void **c, const void *dc, const size_t csize, + const size_t dsize) +{ + v_t *p = (v_t *) c[0]; + v_t *q = (v_t *) c[1]; + const v_t *d = (v_t *) dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); + const v_t * const qend = q + (csize / sizeof (v_t)); + + GEN_PQ_DEFINE(); -#define REC_Q_SYN_UPDATE() MUL2(REC_Q_X) + MUL2_SETUP(); -#define REC_Q_INNER_LOOP(c) \ -{ \ - col = &rm->rm_col[c]; \ - REC_Q_SYN_UPDATE(); \ - XOR_ACC(COL_OFF(col, ioff), REC_Q_X); \ + for (; d < dend; d += GEN_PQ_STRIDE, p += GEN_PQ_STRIDE, + q += GEN_PQ_STRIDE) { + LOAD(d, GEN_PQ_D); + P_D_SYNDROME(GEN_PQ_D, GEN_PQ_C, p); + Q_D_SYNDROME(GEN_PQ_D, GEN_PQ_C, q); + } + for (; q < qend; q += GEN_PQ_STRIDE) { + Q_SYNDROME(GEN_PQ_C, q); + } } + /* - * Reconstruction using Q parity - * @rm RAIDZ map - * @off starting offset - * @end ending offset - * @x missing data column - * @coeff multiplication coefficients - * @ncols number of column - * @nbigcols number of big columns + * Generate PQ parity (RAIDZ2) + * + * @rm RAIDZ map */ static raidz_inline void -REC_Q_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, - const int x, const unsigned *coeff, const int ncols, const int nbigcols) +raidz_generate_pq_impl(raidz_map_t * const rm) { - int c; - size_t ioff = 0; - const size_t firstdc = raidz_parity(rm); - raidz_col_t * const qcol = raidz_col_p(rm, CODE_Q); - raidz_col_t * const xcol = raidz_col_p(rm, x); - raidz_col_t *col; + size_t c; + const size_t ncols = raidz_ncols(rm); + const size_t csize = rm->rm_col[CODE_P].rc_size; + size_t dsize; + abd_t *dabd; + abd_t *cabds[] = { + rm->rm_col[CODE_P].rc_abd, + rm->rm_col[CODE_Q].rc_abd + }; - REC_Q_DEFINE(); + raidz_math_begin(); - for (ioff = off; ioff < end; ioff += (REC_Q_STRIDE * sizeof (v_t))) { - MUL2_SETUP(); + raidz_copy(cabds[CODE_P], 
rm->rm_col[2].rc_abd, csize); + raidz_copy(cabds[CODE_Q], rm->rm_col[2].rc_abd, csize); - ZERO(REC_Q_X); + for (c = 3; c < ncols; c++) { + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; - if (ncols == nbigcols) { - for (c = firstdc; c < x; c++) - REC_Q_INNER_LOOP(c); + abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 2, + raidz_gen_pq_add); + } - REC_Q_SYN_UPDATE(); - for (c++; c < nbigcols; c++) - REC_Q_INNER_LOOP(c); - } else { - for (c = firstdc; c < nbigcols; c++) { - REC_Q_SYN_UPDATE(); - if (x != c) { - col = &rm->rm_col[c]; - XOR_ACC(COL_OFF(col, ioff), REC_Q_X); - } - } - for (; c < ncols; c++) - REC_Q_SYN_UPDATE(); - } + raidz_math_end(); +} - XOR_ACC(COL_OFF(qcol, ioff), REC_Q_X); - MUL(coeff[MUL_Q_X], REC_Q_X); - STORE(COL_OFF(xcol, ioff), REC_Q_X); + +/* + * Generate PQR parity (RAIDZ3) + * The function is called per data column. + * + * @c array of pointers to parity (code) columns + * @dc pointer to data column + * @csize size of parity columns + * @dsize size of data column + */ +static void +raidz_gen_pqr_add(void **c, const void *dc, const size_t csize, + const size_t dsize) +{ + v_t *p = (v_t *) c[0]; + v_t *q = (v_t *) c[1]; + v_t *r = (v_t *) c[CODE_R]; + const v_t *d = (v_t *) dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); + const v_t * const qend = q + (csize / sizeof (v_t)); + + GEN_PQR_DEFINE(); + + MUL2_SETUP(); + + for (; d < dend; d += GEN_PQR_STRIDE, p += GEN_PQR_STRIDE, + q += GEN_PQR_STRIDE, r += GEN_PQR_STRIDE) { + LOAD(d, GEN_PQR_D); + P_D_SYNDROME(GEN_PQR_D, GEN_PQR_C, p); + Q_D_SYNDROME(GEN_PQR_D, GEN_PQR_C, q); + R_D_SYNDROME(GEN_PQR_D, GEN_PQR_C, r); + } + for (; q < qend; q += GEN_PQR_STRIDE, r += GEN_PQR_STRIDE) { + Q_SYNDROME(GEN_PQR_C, q); + R_SYNDROME(GEN_PQR_C, r); } } + /* - * Reconstruct single data column using Q parity - * @rec_method REC_Q_BLOCK() + * Generate PQR parity (RAIDZ2) + * + * @rm RAIDZ map + */ +static raidz_inline void +raidz_generate_pqr_impl(raidz_map_t * const rm) +{ + size_t 
c; + const size_t ncols = raidz_ncols(rm); + const size_t csize = rm->rm_col[CODE_P].rc_size; + size_t dsize; + abd_t *dabd; + abd_t *cabds[] = { + rm->rm_col[CODE_P].rc_abd, + rm->rm_col[CODE_Q].rc_abd, + rm->rm_col[CODE_R].rc_abd + }; + + raidz_math_begin(); + + raidz_copy(cabds[CODE_P], rm->rm_col[3].rc_abd, csize); + raidz_copy(cabds[CODE_Q], rm->rm_col[3].rc_abd, csize); + raidz_copy(cabds[CODE_R], rm->rm_col[3].rc_abd, csize); + + for (c = 4; c < ncols; c++) { + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; + + abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 3, + raidz_gen_pqr_add); + } + + raidz_math_end(); +} + + +/* + * DATA RECONSTRUCTION + * + * Data reconstruction process consists of two phases: + * - Syndrome calculation + * - Data reconstruction + * + * Syndrome is calculated by generating parity using available data columns + * and zeros in places of erasure. Existing parity is added to corresponding + * syndrome value to obtain the [P|Q|R]syn values from equation: + * P = Psyn + Dx + Dy + Dz + * Q = Qsyn + 2^x * Dx + 2^y * Dy + 2^z * Dz + * R = Rsyn + 4^x * Dx + 4^y * Dy + 4^z * Dz + * + * For data reconstruction phase, the corresponding equations are solved + * for missing data (Dx, Dy, Dz). This generally involves multiplying known + * symbols by an coefficient and adding them together. The multiplication + * constant coefficients are calculated ahead of the operation in + * raidz_rec_[q|r|pq|pq|qr|pqr]_coeff() functions. + * + * IMPLEMENTATION NOTE: RAID-Z block can have complex geometry, with "big" + * and "short" columns. + * For this reason, reconstruction is performed in minimum of + * two steps. First, from offset 0 to short_size, then from short_size to + * short_size. Calculation functions REC_[*]_BLOCK() are implemented to work + * over both ranges. The split also enables removal of conditional expressions + * from loop bodies, improving throughput of SIMD implementations. 
+ * For the best performance, all functions marked with raidz_inline attribute + * must be inlined by compiler. + * + * parity data + * columns columns + * <----------> <------------------> + * x y <----+ missing columns (x, y) + * | | + * +---+---+---+---+-v-+---+-v-+---+ ^ 0 + * | | | | | | | | | | + * | | | | | | | | | | + * | P | Q | R | D | D | D | D | D | | + * | | | | 0 | 1 | 2 | 3 | 4 | | + * | | | | | | | | | v + * | | | | | +---+---+---+ ^ short_size + * | | | | | | | + * +---+---+---+---+---+ v big_size + * <------------------> <----------> + * big columns short columns + * + */ + + + + +/* + * Reconstruct single data column using P parity + * + * @syn_method raidz_add_abd() + * @rec_method not applicable * * @rm RAIDZ map * @tgtidx array of missing data indexes */ static raidz_inline int -raidz_reconstruct_q_impl(raidz_map_t *rm, const int *tgtidx) +raidz_reconstruct_p_impl(raidz_map_t *rm, const int *tgtidx) { - const int x = tgtidx[TARGET_X]; - const int ncols = raidz_ncols(rm); - const int nbigcols = raidz_nbigcols(rm); - const size_t xsize = raidz_col_size(rm, x); - const size_t short_size = raidz_short_size(rm); - unsigned coeff[MUL_CNT]; - - raidz_rec_q_coeff(rm, tgtidx, coeff); + size_t c; + const size_t firstdc = raidz_parity(rm); + const size_t ncols = raidz_ncols(rm); + const size_t x = tgtidx[TARGET_X]; + const size_t xsize = rm->rm_col[x].rc_size; + abd_t *xabd = rm->rm_col[x].rc_abd; + size_t size; + abd_t *dabd; raidz_math_begin(); - /* 0 - short_size */ - REC_Q_BLOCK(rm, 0, short_size, x, coeff, ncols, ncols); + /* copy P into target */ + raidz_copy(xabd, rm->rm_col[CODE_P].rc_abd, xsize); - /* short_size - xsize */ - REC_Q_BLOCK(rm, short_size, xsize, x, coeff, ncols, nbigcols); + /* generate p_syndrome */ + for (c = firstdc; c < ncols; c++) { + if (c == x) + continue; + + dabd = rm->rm_col[c].rc_abd; + size = MIN(rm->rm_col[c].rc_size, xsize); + + raidz_add(xabd, dabd, size); + } raidz_math_end(); - return (1 << CODE_Q); + return (1 << 
CODE_P); } + /* - * Reconstruct using R parity + * Generate Q syndrome (Qsyn) + * + * @xc array of pointers to syndrome columns + * @dc data column (NULL if missing) + * @xsize size of syndrome columns + * @dsize size of data column (0 if missing) */ +static void +raidz_syn_q_abd(void **xc, const void *dc, const size_t xsize, + const size_t dsize) +{ + v_t *x = (v_t *) xc[TARGET_X]; + const v_t *d = (v_t *) dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); + const v_t * const xend = x + (xsize / sizeof (v_t)); -#define REC_R_SYN_UPDATE() MUL4(REC_R_X) -#define REC_R_INNER_LOOP(c) \ -{ \ - col = &rm->rm_col[c]; \ - REC_R_SYN_UPDATE(); \ - XOR_ACC(COL_OFF(col, ioff), REC_R_X); \ + SYN_Q_DEFINE(); + + MUL2_SETUP(); + + for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE) { + LOAD(d, SYN_Q_D); + Q_D_SYNDROME(SYN_Q_D, SYN_Q_X, x); + } + for (; x < xend; x += SYN_STRIDE) { + Q_SYNDROME(SYN_Q_X, x); + } } + /* - * Reconstruction using R parity + * Reconstruct single data column using Q parity + * + * @syn_method raidz_add_abd() + * @rec_method raidz_mul_abd_cb() + * * @rm RAIDZ map - * @off starting offset - * @end ending offset - * @x missing data column - * @coeff multiplication coefficients - * @ncols number of column - * @nbigcols number of big columns + * @tgtidx array of missing data indexes */ -static raidz_inline void -REC_R_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, - const int x, const unsigned *coeff, const int ncols, const int nbigcols) +static raidz_inline int +raidz_reconstruct_q_impl(raidz_map_t *rm, const int *tgtidx) { - int c; - size_t ioff = 0; + size_t c; + size_t dsize; + abd_t *dabd; const size_t firstdc = raidz_parity(rm); - raidz_col_t * const rcol = raidz_col_p(rm, CODE_R); - raidz_col_t * const xcol = raidz_col_p(rm, x); - raidz_col_t *col; + const size_t ncols = raidz_ncols(rm); + const size_t x = tgtidx[TARGET_X]; + abd_t *xabd = rm->rm_col[x].rc_abd; + const size_t xsize = rm->rm_col[x].rc_size; + abd_t *tabds[] = 
{ xabd }; - REC_R_DEFINE(); - - for (ioff = off; ioff < end; ioff += (REC_R_STRIDE * sizeof (v_t))) { - MUL2_SETUP(); + unsigned coeff[MUL_CNT]; + raidz_rec_q_coeff(rm, tgtidx, coeff); - ZERO(REC_R_X); + raidz_math_begin(); - if (ncols == nbigcols) { - for (c = firstdc; c < x; c++) - REC_R_INNER_LOOP(c); + /* Start with first data column if present */ + if (firstdc != x) { + raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + } else { + raidz_zero(xabd, xsize); + } - REC_R_SYN_UPDATE(); - for (c++; c < nbigcols; c++) - REC_R_INNER_LOOP(c); + /* generate q_syndrome */ + for (c = firstdc+1; c < ncols; c++) { + if (c == x) { + dabd = NULL; + dsize = 0; } else { - for (c = firstdc; c < nbigcols; c++) { - REC_R_SYN_UPDATE(); - if (c != x) { - col = &rm->rm_col[c]; - XOR_ACC(COL_OFF(col, ioff), REC_R_X); - } - } - for (; c < ncols; c++) - REC_R_SYN_UPDATE(); + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; } - XOR_ACC(COL_OFF(rcol, ioff), REC_R_X); - MUL(coeff[MUL_R_X], REC_R_X); - STORE(COL_OFF(xcol, ioff), REC_R_X); + abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1, + raidz_syn_q_abd); } + + /* add Q to the syndrome */ + raidz_add(xabd, rm->rm_col[CODE_Q].rc_abd, xsize); + + /* transform the syndrome */ + abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void*) coeff); + + raidz_math_end(); + + return (1 << CODE_Q); } + +/* + * Generate R syndrome (Rsyn) + * + * @xc array of pointers to syndrome columns + * @dc data column (NULL if missing) + * @tsize size of syndrome columns + * @dsize size of data column (0 if missing) + */ +static void +raidz_syn_r_abd(void **xc, const void *dc, const size_t tsize, + const size_t dsize) +{ + v_t *x = (v_t *) xc[TARGET_X]; + const v_t *d = (v_t *) dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); + const v_t * const xend = x + (tsize / sizeof (v_t)); + + SYN_R_DEFINE(); + + MUL2_SETUP(); + + for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE) { + LOAD(d, SYN_R_D); + R_D_SYNDROME(SYN_R_D, SYN_R_X, x); + 
} + for (; x < xend; x += SYN_STRIDE) { + R_SYNDROME(SYN_R_X, x); + } +} + + /* * Reconstruct single data column using R parity - * @rec_method REC_R_BLOCK() + * + * @syn_method raidz_add_abd() + * @rec_method raidz_mul_abd_cb() * * @rm RAIDZ map * @tgtidx array of missing data indexes @@ -624,122 +750,136 @@ REC_R_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, static raidz_inline int raidz_reconstruct_r_impl(raidz_map_t *rm, const int *tgtidx) { - const int x = tgtidx[TARGET_X]; - const int ncols = raidz_ncols(rm); - const int nbigcols = raidz_nbigcols(rm); - const size_t xsize = raidz_col_size(rm, x); - const size_t short_size = raidz_short_size(rm); - unsigned coeff[MUL_CNT]; + size_t c; + size_t dsize; + abd_t *dabd; + const size_t firstdc = raidz_parity(rm); + const size_t ncols = raidz_ncols(rm); + const size_t x = tgtidx[TARGET_X]; + const size_t xsize = rm->rm_col[x].rc_size; + abd_t *xabd = rm->rm_col[x].rc_abd; + abd_t *tabds[] = { xabd }; + unsigned coeff[MUL_CNT]; raidz_rec_r_coeff(rm, tgtidx, coeff); raidz_math_begin(); - /* 0 - short_size */ - REC_R_BLOCK(rm, 0, short_size, x, coeff, ncols, ncols); + /* Start with first data column if present */ + if (firstdc != x) { + raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + } else { + raidz_zero(xabd, xsize); + } + + + /* generate q_syndrome */ + for (c = firstdc+1; c < ncols; c++) { + if (c == x) { + dabd = NULL; + dsize = 0; + } else { + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; + } + + abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1, + raidz_syn_r_abd); + } + + /* add R to the syndrome */ + raidz_add(xabd, rm->rm_col[CODE_R].rc_abd, xsize); - /* short_size - xsize */ - REC_R_BLOCK(rm, short_size, xsize, x, coeff, ncols, nbigcols); + /* transform the syndrome */ + abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void *)coeff); raidz_math_end(); return (1 << CODE_R); } + /* - * Reconstruct using PQ parity + * Generate P and Q syndromes + * + * @xc array of 
pointers to syndrome columns + * @dc data column (NULL if missing) + * @tsize size of syndrome columns + * @dsize size of data column (0 if missing) */ +static void +raidz_syn_pq_abd(void **tc, const void *dc, const size_t tsize, + const size_t dsize) +{ + v_t *x = (v_t *) tc[TARGET_X]; + v_t *y = (v_t *) tc[TARGET_Y]; + const v_t *d = (v_t *) dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); + const v_t * const yend = y + (tsize / sizeof (v_t)); + + SYN_PQ_DEFINE(); -#define REC_PQ_SYN_UPDATE() MUL2(REC_PQ_Y) -#define REC_PQ_INNER_LOOP(c) \ -{ \ - col = &rm->rm_col[c]; \ - LOAD(COL_OFF(col, ioff), REC_PQ_D); \ - REC_PQ_SYN_UPDATE(); \ - XOR(REC_PQ_D, REC_PQ_X); \ - XOR(REC_PQ_D, REC_PQ_Y); \ + MUL2_SETUP(); + + for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE) { + LOAD(d, SYN_PQ_D); + P_D_SYNDROME(SYN_PQ_D, SYN_PQ_X, x); + Q_D_SYNDROME(SYN_PQ_D, SYN_PQ_X, y); + } + for (; y < yend; y += SYN_STRIDE) { + Q_SYNDROME(SYN_PQ_X, y); + } } /* - * Reconstruction using PQ parity - * @rm RAIDZ map - * @off starting offset - * @end ending offset - * @x missing data column - * @y missing data column - * @coeff multiplication coefficients - * @ncols number of column - * @nbigcols number of big columns - * @calcy calculate second data column + * Reconstruct data using PQ parity and PQ syndromes + * + * @tc syndrome/result columns + * @tsize size of syndrome/result columns + * @c parity columns + * @mul array of multiplication constants */ -static raidz_inline void -REC_PQ_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, - const int x, const int y, const unsigned *coeff, const int ncols, - const int nbigcols, const boolean_t calcy) +static void +raidz_rec_pq_abd(void **tc, const size_t tsize, void **c, + const unsigned *mul) { - int c; - size_t ioff = 0; - const size_t firstdc = raidz_parity(rm); - raidz_col_t * const pcol = raidz_col_p(rm, CODE_P); - raidz_col_t * const qcol = raidz_col_p(rm, CODE_Q); - raidz_col_t * const xcol = 
raidz_col_p(rm, x); - raidz_col_t * const ycol = raidz_col_p(rm, y); - raidz_col_t *col; + v_t *x = (v_t *) tc[TARGET_X]; + v_t *y = (v_t *) tc[TARGET_Y]; + const v_t * const xend = x + (tsize / sizeof (v_t)); + const v_t *p = (v_t *) c[CODE_P]; + const v_t *q = (v_t *) c[CODE_Q]; REC_PQ_DEFINE(); - for (ioff = off; ioff < end; ioff += (REC_PQ_STRIDE * sizeof (v_t))) { - LOAD(COL_OFF(pcol, ioff), REC_PQ_X); - ZERO(REC_PQ_Y); - MUL2_SETUP(); - - if (ncols == nbigcols) { - for (c = firstdc; c < x; c++) - REC_PQ_INNER_LOOP(c); + for (; x < xend; x += REC_PQ_STRIDE, y += REC_PQ_STRIDE, + p += REC_PQ_STRIDE, q += REC_PQ_STRIDE) { + LOAD(x, REC_PQ_X); + LOAD(y, REC_PQ_Y); - REC_PQ_SYN_UPDATE(); - for (c++; c < y; c++) - REC_PQ_INNER_LOOP(c); - - REC_PQ_SYN_UPDATE(); - for (c++; c < nbigcols; c++) - REC_PQ_INNER_LOOP(c); - } else { - for (c = firstdc; c < nbigcols; c++) { - REC_PQ_SYN_UPDATE(); - if (c != x && c != y) { - col = &rm->rm_col[c]; - LOAD(COL_OFF(col, ioff), REC_PQ_D); - XOR(REC_PQ_D, REC_PQ_X); - XOR(REC_PQ_D, REC_PQ_Y); - } - } - for (; c < ncols; c++) - REC_PQ_SYN_UPDATE(); - } - - XOR_ACC(COL_OFF(qcol, ioff), REC_PQ_Y); + XOR_ACC(p, REC_PQ_X); + XOR_ACC(q, REC_PQ_Y); /* Save Pxy */ - COPY(REC_PQ_X, REC_PQ_D); + COPY(REC_PQ_X, REC_PQ_T); /* Calc X */ - MUL(coeff[MUL_PQ_X], REC_PQ_X); - MUL(coeff[MUL_PQ_Y], REC_PQ_Y); + MUL(mul[MUL_PQ_X], REC_PQ_X); + MUL(mul[MUL_PQ_Y], REC_PQ_Y); XOR(REC_PQ_Y, REC_PQ_X); - STORE(COL_OFF(xcol, ioff), REC_PQ_X); + STORE(x, REC_PQ_X); - if (calcy) { - /* Calc Y */ - XOR(REC_PQ_D, REC_PQ_X); - STORE(COL_OFF(ycol, ioff), REC_PQ_X); - } + /* Calc Y */ + XOR(REC_PQ_T, REC_PQ_X); + STORE(y, REC_PQ_X); } } + /* * Reconstruct two data columns using PQ parity - * @rec_method REC_PQ_BLOCK() + * + * @syn_method raidz_syn_pq_abd() + * @rec_method raidz_rec_pq_abd() * * @rm RAIDZ map * @tgtidx array of missing data indexes @@ -747,126 +887,156 @@ REC_PQ_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, static raidz_inline 
int raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx) { - const int x = tgtidx[TARGET_X]; - const int y = tgtidx[TARGET_Y]; - const int ncols = raidz_ncols(rm); - const int nbigcols = raidz_nbigcols(rm); - const size_t xsize = raidz_col_size(rm, x); - const size_t ysize = raidz_col_size(rm, y); - const size_t short_size = raidz_short_size(rm); - unsigned coeff[MUL_CNT]; + size_t c; + size_t dsize; + abd_t *dabd; + const size_t firstdc = raidz_parity(rm); + const size_t ncols = raidz_ncols(rm); + const size_t x = tgtidx[TARGET_X]; + const size_t y = tgtidx[TARGET_Y]; + const size_t xsize = rm->rm_col[x].rc_size; + const size_t ysize = rm->rm_col[y].rc_size; + abd_t *xabd = rm->rm_col[x].rc_abd; + abd_t *yabd = rm->rm_col[y].rc_abd; + abd_t *tabds[2] = { xabd, yabd }; + abd_t *cabds[] = { + rm->rm_col[CODE_P].rc_abd, + rm->rm_col[CODE_Q].rc_abd + }; + unsigned coeff[MUL_CNT]; raidz_rec_pq_coeff(rm, tgtidx, coeff); + /* + * Check if some of targets is shorter then others + * In this case, shorter target needs to be replaced with + * new buffer so that syndrome can be calculated. 
+ */ + if (ysize < xsize) { + yabd = abd_alloc(xsize, B_FALSE); + tabds[1] = yabd; + } + raidz_math_begin(); - /* 0 - short_size */ - REC_PQ_BLOCK(rm, 0, short_size, x, y, coeff, ncols, ncols, B_TRUE); + /* Start with first data column if present */ + if (firstdc != x) { + raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); + } else { + raidz_zero(xabd, xsize); + raidz_zero(yabd, xsize); + } - /* short_size - xsize */ - REC_PQ_BLOCK(rm, short_size, xsize, x, y, coeff, ncols, nbigcols, - xsize == ysize); + /* generate q_syndrome */ + for (c = firstdc+1; c < ncols; c++) { + if (c == x || c == y) { + dabd = NULL; + dsize = 0; + } else { + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; + } + + abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, + raidz_syn_pq_abd); + } + + abd_raidz_rec_iterate(cabds, tabds, xsize, 2, raidz_rec_pq_abd, coeff); + + /* Copy shorter targets back to the original abd buffer */ + if (ysize < xsize) + raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize); raidz_math_end(); + if (ysize < xsize) + abd_free(yabd); + return ((1 << CODE_P) | (1 << CODE_Q)); } + /* - * Reconstruct using PR parity + * Generate P and R syndromes + * + * @xc array of pointers to syndrome columns + * @dc data column (NULL if missing) + * @tsize size of syndrome columns + * @dsize size of data column (0 if missing) */ +static void +raidz_syn_pr_abd(void **c, const void *dc, const size_t tsize, + const size_t dsize) +{ + v_t *x = (v_t *) c[TARGET_X]; + v_t *y = (v_t *) c[TARGET_Y]; + const v_t *d = (v_t *) dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); + const v_t * const yend = y + (tsize / sizeof (v_t)); -#define REC_PR_SYN_UPDATE() MUL4(REC_PR_Y) -#define REC_PR_INNER_LOOP(c) \ -{ \ - col = &rm->rm_col[c]; \ - LOAD(COL_OFF(col, ioff), REC_PR_D); \ - REC_PR_SYN_UPDATE(); \ - XOR(REC_PR_D, REC_PR_X); \ - XOR(REC_PR_D, REC_PR_Y); \ + SYN_PR_DEFINE(); + + MUL2_SETUP(); + + for (; d < dend; d += 
SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE) { + LOAD(d, SYN_PR_D); + P_D_SYNDROME(SYN_PR_D, SYN_PR_X, x); + R_D_SYNDROME(SYN_PR_D, SYN_PR_X, y); + } + for (; y < yend; y += SYN_STRIDE) { + R_SYNDROME(SYN_PR_X, y); + } } /* - * Reconstruction using PR parity - * @rm RAIDZ map - * @off starting offset - * @end ending offset - * @x missing data column - * @y missing data column - * @coeff multiplication coefficients - * @ncols number of column - * @nbigcols number of big columns - * @calcy calculate second data column + * Reconstruct data using PR parity and PR syndromes + * + * @tc syndrome/result columns + * @tsize size of syndrome/result columns + * @c parity columns + * @mul array of multiplication constants */ -static raidz_inline void -REC_PR_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, - const int x, const int y, const unsigned *coeff, const int ncols, - const int nbigcols, const boolean_t calcy) +static void +raidz_rec_pr_abd(void **t, const size_t tsize, void **c, + const unsigned *mul) { - int c; - size_t ioff; - const size_t firstdc = raidz_parity(rm); - raidz_col_t * const pcol = raidz_col_p(rm, CODE_P); - raidz_col_t * const rcol = raidz_col_p(rm, CODE_R); - raidz_col_t * const xcol = raidz_col_p(rm, x); - raidz_col_t * const ycol = raidz_col_p(rm, y); - raidz_col_t *col; + v_t *x = (v_t *) t[TARGET_X]; + v_t *y = (v_t *) t[TARGET_Y]; + const v_t * const xend = x + (tsize / sizeof (v_t)); + const v_t *p = (v_t *) c[CODE_P]; + const v_t *q = (v_t *) c[CODE_Q]; REC_PR_DEFINE(); - for (ioff = off; ioff < end; ioff += (REC_PR_STRIDE * sizeof (v_t))) { - LOAD(COL_OFF(pcol, ioff), REC_PR_X); - ZERO(REC_PR_Y); - MUL2_SETUP(); - - if (ncols == nbigcols) { - for (c = firstdc; c < x; c++) - REC_PR_INNER_LOOP(c); - - REC_PR_SYN_UPDATE(); - for (c++; c < y; c++) - REC_PR_INNER_LOOP(c); - - REC_PR_SYN_UPDATE(); - for (c++; c < nbigcols; c++) - REC_PR_INNER_LOOP(c); - } else { - for (c = firstdc; c < nbigcols; c++) { - REC_PR_SYN_UPDATE(); - if 
(c != x && c != y) { - col = &rm->rm_col[c]; - LOAD(COL_OFF(col, ioff), REC_PR_D); - XOR(REC_PR_D, REC_PR_X); - XOR(REC_PR_D, REC_PR_Y); - } - } - for (; c < ncols; c++) - REC_PR_SYN_UPDATE(); - } - - XOR_ACC(COL_OFF(rcol, ioff), REC_PR_Y); + for (; x < xend; x += REC_PR_STRIDE, y += REC_PR_STRIDE, + p += REC_PR_STRIDE, q += REC_PR_STRIDE) { + LOAD(x, REC_PR_X); + LOAD(y, REC_PR_Y); + XOR_ACC(p, REC_PR_X); + XOR_ACC(q, REC_PR_Y); /* Save Pxy */ - COPY(REC_PR_X, REC_PR_D); + COPY(REC_PR_X, REC_PR_T); /* Calc X */ - MUL(coeff[MUL_PR_X], REC_PR_X); - MUL(coeff[MUL_PR_Y], REC_PR_Y); + MUL(mul[MUL_PR_X], REC_PR_X); + MUL(mul[MUL_PR_Y], REC_PR_Y); XOR(REC_PR_Y, REC_PR_X); - STORE(COL_OFF(xcol, ioff), REC_PR_X); + STORE(x, REC_PR_X); - if (calcy) { - /* Calc Y */ - XOR(REC_PR_D, REC_PR_X); - STORE(COL_OFF(ycol, ioff), REC_PR_X); - } + /* Calc Y */ + XOR(REC_PR_T, REC_PR_X); + STORE(y, REC_PR_X); } } /* * Reconstruct two data columns using PR parity - * @rec_method REC_PR_BLOCK() + * + * @syn_method raidz_syn_pr_abd() + * @rec_method raidz_rec_pr_abd() * * @rm RAIDZ map * @tgtidx array of missing data indexes @@ -874,134 +1044,162 @@ REC_PR_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, static raidz_inline int raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx) { - const int x = tgtidx[TARGET_X]; - const int y = tgtidx[TARGET_Y]; - const int ncols = raidz_ncols(rm); - const int nbigcols = raidz_nbigcols(rm); - const size_t xsize = raidz_col_size(rm, x); - const size_t ysize = raidz_col_size(rm, y); - const size_t short_size = raidz_short_size(rm); + size_t c; + size_t dsize; + abd_t *dabd; + const size_t firstdc = raidz_parity(rm); + const size_t ncols = raidz_ncols(rm); + const size_t x = tgtidx[0]; + const size_t y = tgtidx[1]; + const size_t xsize = rm->rm_col[x].rc_size; + const size_t ysize = rm->rm_col[y].rc_size; + abd_t *xabd = rm->rm_col[x].rc_abd; + abd_t *yabd = rm->rm_col[y].rc_abd; + abd_t *tabds[2] = { xabd, yabd }; + abd_t 
*cabds[] = { + rm->rm_col[CODE_P].rc_abd, + rm->rm_col[CODE_R].rc_abd + }; unsigned coeff[MUL_CNT]; - raidz_rec_pr_coeff(rm, tgtidx, coeff); + /* + * Check if some of the targets are shorter than others. + * They need to be replaced with a new buffer so that syndrome can + * be calculated on full length. + */ + if (ysize < xsize) { + yabd = abd_alloc(xsize, B_FALSE); + tabds[1] = yabd; + } + raidz_math_begin(); - /* 0 - short_size */ - REC_PR_BLOCK(rm, 0, short_size, x, y, coeff, ncols, ncols, B_TRUE); + /* Start with first data column if present */ + if (firstdc != x) { + raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); + } else { + raidz_zero(xabd, xsize); + raidz_zero(yabd, xsize); + } + + /* generate q_syndrome */ + for (c = firstdc+1; c < ncols; c++) { + if (c == x || c == y) { + dabd = NULL; + dsize = 0; + } else { + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; + } + + abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, + raidz_syn_pr_abd); + } + + abd_raidz_rec_iterate(cabds, tabds, xsize, 2, raidz_rec_pr_abd, coeff); - /* short_size - xsize */ - REC_PR_BLOCK(rm, short_size, xsize, x, y, coeff, ncols, nbigcols, - xsize == ysize); + /* + * Copy shorter targets back to the original abd buffer + */ + if (ysize < xsize) + raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize); raidz_math_end(); - return ((1 << CODE_P) | (1 << CODE_R)); + if (ysize < xsize) + abd_free(yabd); + + return ((1 << CODE_P) | (1 << CODE_Q)); } /* - * Reconstruct using QR parity + * Generate Q and R syndromes + * + * @xc array of pointers to syndrome columns + * @dc data column (NULL if missing) + * @tsize size of syndrome columns + * @dsize size of data column (0 if missing) */ +static void +raidz_syn_qr_abd(void **c, const void *dc, const size_t tsize, + const size_t dsize) +{ + v_t *x = (v_t *) c[TARGET_X]; + v_t *y = (v_t *) c[TARGET_Y]; + const v_t * const xend = x + (tsize / sizeof (v_t)); + const v_t *d = (v_t *) dc; + 
const v_t * const dend = d + (dsize / sizeof (v_t)); -#define REC_QR_SYN_UPDATE() \ -{ \ - MUL2(REC_QR_X); \ - MUL4(REC_QR_Y); \ -} + SYN_QR_DEFINE(); + + MUL2_SETUP(); -#define REC_QR_INNER_LOOP(c) \ -{ \ - col = &rm->rm_col[c]; \ - LOAD(COL_OFF(col, ioff), REC_QR_D); \ - REC_QR_SYN_UPDATE(); \ - XOR(REC_QR_D, REC_QR_X); \ - XOR(REC_QR_D, REC_QR_Y); \ + for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE) { + LOAD(d, SYN_PQ_D); + Q_D_SYNDROME(SYN_QR_D, SYN_QR_X, x); + R_D_SYNDROME(SYN_QR_D, SYN_QR_X, y); + } + for (; x < xend; x += SYN_STRIDE, y += SYN_STRIDE) { + Q_SYNDROME(SYN_QR_X, x); + R_SYNDROME(SYN_QR_X, y); + } } + /* - * Reconstruction using QR parity - * @rm RAIDZ map - * @off starting offset - * @end ending offset - * @x missing data column - * @y missing data column - * @coeff multiplication coefficients - * @ncols number of column - * @nbigcols number of big columns - * @calcy calculate second data column + * Reconstruct data using QR parity and QR syndromes + * + * @tc syndrome/result columns + * @tsize size of syndrome/result columns + * @c parity columns + * @mul array of multiplication constants */ -static raidz_inline void -REC_QR_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, - const int x, const int y, const unsigned *coeff, const int ncols, - const int nbigcols, const boolean_t calcy) +static void +raidz_rec_qr_abd(void **t, const size_t tsize, void **c, + const unsigned *mul) { - int c; - size_t ioff; - const size_t firstdc = raidz_parity(rm); - raidz_col_t * const qcol = raidz_col_p(rm, CODE_Q); - raidz_col_t * const rcol = raidz_col_p(rm, CODE_R); - raidz_col_t * const xcol = raidz_col_p(rm, x); - raidz_col_t * const ycol = raidz_col_p(rm, y); - raidz_col_t *col; + v_t *x = (v_t *) t[TARGET_X]; + v_t *y = (v_t *) t[TARGET_Y]; + const v_t * const xend = x + (tsize / sizeof (v_t)); + const v_t *p = (v_t *) c[CODE_P]; + const v_t *q = (v_t *) c[CODE_Q]; REC_QR_DEFINE(); - for (ioff = off; ioff < end; ioff += 
(REC_QR_STRIDE * sizeof (v_t))) { - MUL2_SETUP(); - ZERO(REC_QR_X); - ZERO(REC_QR_Y); + for (; x < xend; x += REC_QR_STRIDE, y += REC_QR_STRIDE, + p += REC_QR_STRIDE, q += REC_QR_STRIDE) { + LOAD(x, REC_QR_X); + LOAD(y, REC_QR_Y); - if (ncols == nbigcols) { - for (c = firstdc; c < x; c++) - REC_QR_INNER_LOOP(c); + XOR_ACC(p, REC_QR_X); + XOR_ACC(q, REC_QR_Y); - REC_QR_SYN_UPDATE(); - for (c++; c < y; c++) - REC_QR_INNER_LOOP(c); - - REC_QR_SYN_UPDATE(); - for (c++; c < nbigcols; c++) - REC_QR_INNER_LOOP(c); - } else { - for (c = firstdc; c < nbigcols; c++) { - REC_QR_SYN_UPDATE(); - if (c != x && c != y) { - col = &rm->rm_col[c]; - LOAD(COL_OFF(col, ioff), REC_QR_D); - XOR(REC_QR_D, REC_QR_X); - XOR(REC_QR_D, REC_QR_Y); - } - } - for (; c < ncols; c++) - REC_QR_SYN_UPDATE(); - } - - XOR_ACC(COL_OFF(qcol, ioff), REC_QR_X); - XOR_ACC(COL_OFF(rcol, ioff), REC_QR_Y); - - /* Save Qxy */ - COPY(REC_QR_X, REC_QR_D); + /* Save Pxy */ + COPY(REC_QR_X, REC_QR_T); /* Calc X */ - MUL(coeff[MUL_QR_XQ], REC_QR_X); /* X = Q * xqm */ - XOR(REC_QR_Y, REC_QR_X); /* X = R ^ X */ - MUL(coeff[MUL_QR_X], REC_QR_X); /* X = X * xm */ - STORE(COL_OFF(xcol, ioff), REC_QR_X); - - if (calcy) { - /* Calc Y */ - MUL(coeff[MUL_QR_YQ], REC_QR_D); /* X = Q * xqm */ - XOR(REC_QR_Y, REC_QR_D); /* X = R ^ X */ - MUL(coeff[MUL_QR_Y], REC_QR_D); /* X = X * xm */ - STORE(COL_OFF(ycol, ioff), REC_QR_D); - } + MUL(mul[MUL_QR_XQ], REC_QR_X); /* X = Q * xqm */ + XOR(REC_QR_Y, REC_QR_X); /* X = R ^ X */ + MUL(mul[MUL_QR_X], REC_QR_X); /* X = X * xm */ + STORE(x, REC_QR_X); + + /* Calc Y */ + MUL(mul[MUL_QR_YQ], REC_QR_T); /* X = Q * xqm */ + XOR(REC_QR_Y, REC_QR_T); /* X = R ^ X */ + MUL(mul[MUL_QR_Y], REC_QR_T); /* X = X * xm */ + STORE(y, REC_QR_T); } } + /* * Reconstruct two data columns using QR parity - * @rec_method REC_QR_BLOCK() + * + * @syn_method raidz_syn_qr_abd() + * @rec_method raidz_rec_qr_abd() * * @rm RAIDZ map * @tgtidx array of missing data indexes @@ -1009,158 +1207,182 @@ 
REC_QR_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, static raidz_inline int raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx) { - const int x = tgtidx[TARGET_X]; - const int y = tgtidx[TARGET_Y]; - const int ncols = raidz_ncols(rm); - const int nbigcols = raidz_nbigcols(rm); - const size_t xsize = raidz_col_size(rm, x); - const size_t ysize = raidz_col_size(rm, y); - const size_t short_size = raidz_short_size(rm); + size_t c; + size_t dsize; + abd_t *dabd; + const size_t firstdc = raidz_parity(rm); + const size_t ncols = raidz_ncols(rm); + const size_t x = tgtidx[TARGET_X]; + const size_t y = tgtidx[TARGET_Y]; + const size_t xsize = rm->rm_col[x].rc_size; + const size_t ysize = rm->rm_col[y].rc_size; + abd_t *xabd = rm->rm_col[x].rc_abd; + abd_t *yabd = rm->rm_col[y].rc_abd; + abd_t *tabds[2] = { xabd, yabd }; + abd_t *cabds[] = { + rm->rm_col[CODE_Q].rc_abd, + rm->rm_col[CODE_R].rc_abd + }; unsigned coeff[MUL_CNT]; - raidz_rec_qr_coeff(rm, tgtidx, coeff); + /* + * Check if some of the targets are shorter than others. + * In this case, the shorter target needs to be replaced with + * a new buffer so that the syndrome can be calculated. 
+ */ + if (ysize < xsize) { + yabd = abd_alloc(xsize, B_FALSE); + tabds[1] = yabd; + } + raidz_math_begin(); - /* 0 - short_size */ - REC_QR_BLOCK(rm, 0, short_size, x, y, coeff, ncols, ncols, B_TRUE); + /* Start with first data column if present */ + if (firstdc != x) { + raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); + } else { + raidz_zero(xabd, xsize); + raidz_zero(yabd, xsize); + } + + /* generate q_syndrome */ + for (c = firstdc+1; c < ncols; c++) { + if (c == x || c == y) { + dabd = NULL; + dsize = 0; + } else { + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; + } + + abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, + raidz_syn_qr_abd); + } + + abd_raidz_rec_iterate(cabds, tabds, xsize, 2, raidz_rec_qr_abd, coeff); - /* short_size - xsize */ - REC_QR_BLOCK(rm, short_size, xsize, x, y, coeff, ncols, nbigcols, - xsize == ysize); + /* + * Copy shorter targets back to the original abd buffer + */ + if (ysize < xsize) + raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize); raidz_math_end(); + if (ysize < xsize) + abd_free(yabd); + + return ((1 << CODE_Q) | (1 << CODE_R)); } + /* - * Reconstruct using PQR parity + * Generate P, Q, and R syndromes + * + * @xc array of pointers to syndrome columns + * @dc data column (NULL if missing) + * @tsize size of syndrome columns + * @dsize size of data column (0 if missing) */ +static void +raidz_syn_pqr_abd(void **c, const void *dc, const size_t tsize, + const size_t dsize) +{ + v_t *x = (v_t *) c[TARGET_X]; + v_t *y = (v_t *) c[TARGET_Y]; + v_t *z = (v_t *) c[TARGET_Z]; + const v_t * const yend = y + (tsize / sizeof (v_t)); + const v_t *d = (v_t *) dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); -#define REC_PQR_SYN_UPDATE() \ -{ \ - MUL2(REC_PQR_Y); \ - MUL4(REC_PQR_Z); \ -} + SYN_PQR_DEFINE(); -#define REC_PQR_INNER_LOOP(c) \ -{ \ - col = &rm->rm_col[(c)]; \ - LOAD(COL_OFF(col, ioff), REC_PQR_D); \ - REC_PQR_SYN_UPDATE(); \ - 
XOR(REC_PQR_D, REC_PQR_X); \ - XOR(REC_PQR_D, REC_PQR_Y); \ - XOR(REC_PQR_D, REC_PQR_Z); \ + MUL2_SETUP(); + + for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE, + z += SYN_STRIDE) { + LOAD(d, SYN_PQR_D); + P_D_SYNDROME(SYN_PQR_D, SYN_PQR_X, x) + Q_D_SYNDROME(SYN_PQR_D, SYN_PQR_X, y); + R_D_SYNDROME(SYN_PQR_D, SYN_PQR_X, z); + } + for (; y < yend; y += SYN_STRIDE, z += SYN_STRIDE) { + Q_SYNDROME(SYN_PQR_X, y); + R_SYNDROME(SYN_PQR_X, z); + } } + /* - * Reconstruction using PQR parity - * @rm RAIDZ map - * @off starting offset - * @end ending offset - * @x missing data column - * @y missing data column - * @z missing data column - * @coeff multiplication coefficients - * @ncols number of column - * @nbigcols number of big columns - * @calcy calculate second data column - * @calcz calculate third data column + * Reconstruct data using PRQ parity and PQR syndromes + * + * @tc syndrome/result columns + * @tsize size of syndrome/result columns + * @c parity columns + * @mul array of multiplication constants */ -static raidz_inline void -REC_PQR_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, - const int x, const int y, const int z, const unsigned *coeff, - const int ncols, const int nbigcols, const boolean_t calcy, - const boolean_t calcz) +static void +raidz_rec_pqr_abd(void **t, const size_t tsize, void **c, + const unsigned * const mul) { - int c; - size_t ioff; - const size_t firstdc = raidz_parity(rm); - raidz_col_t * const pcol = raidz_col_p(rm, CODE_P); - raidz_col_t * const qcol = raidz_col_p(rm, CODE_Q); - raidz_col_t * const rcol = raidz_col_p(rm, CODE_R); - raidz_col_t * const xcol = raidz_col_p(rm, x); - raidz_col_t * const ycol = raidz_col_p(rm, y); - raidz_col_t * const zcol = raidz_col_p(rm, z); - raidz_col_t *col; + v_t *x = (v_t *) t[TARGET_X]; + v_t *y = (v_t *) t[TARGET_Y]; + v_t *z = (v_t *) t[TARGET_Z]; + const v_t * const xend = x + (tsize / sizeof (v_t)); + const v_t *p = (v_t *) c[CODE_P]; + const v_t *q = (v_t 
*) c[CODE_Q]; + const v_t *r = (v_t *) c[CODE_R]; REC_PQR_DEFINE(); - for (ioff = off; ioff < end; ioff += (REC_PQR_STRIDE * sizeof (v_t))) { - MUL2_SETUP(); - LOAD(COL_OFF(pcol, ioff), REC_PQR_X); - ZERO(REC_PQR_Y); - ZERO(REC_PQR_Z); + for (; x < xend; x += REC_PQR_STRIDE, y += REC_PQR_STRIDE, + z += REC_PQR_STRIDE, p += REC_PQR_STRIDE, q += REC_PQR_STRIDE, + r += REC_PQR_STRIDE) { + LOAD(x, REC_PQR_X); + LOAD(y, REC_PQR_Y); + LOAD(z, REC_PQR_Z); - if (ncols == nbigcols) { - for (c = firstdc; c < x; c++) - REC_PQR_INNER_LOOP(c); - - REC_PQR_SYN_UPDATE(); - for (c++; c < y; c++) - REC_PQR_INNER_LOOP(c); - - REC_PQR_SYN_UPDATE(); - for (c++; c < z; c++) - REC_PQR_INNER_LOOP(c); - - REC_PQR_SYN_UPDATE(); - for (c++; c < nbigcols; c++) - REC_PQR_INNER_LOOP(c); - } else { - for (c = firstdc; c < nbigcols; c++) { - REC_PQR_SYN_UPDATE(); - if (c != x && c != y && c != z) { - col = &rm->rm_col[c]; - LOAD(COL_OFF(col, ioff), REC_PQR_D); - XOR(REC_PQR_D, REC_PQR_X); - XOR(REC_PQR_D, REC_PQR_Y); - XOR(REC_PQR_D, REC_PQR_Z); - } - } - for (; c < ncols; c++) - REC_PQR_SYN_UPDATE(); - } - - XOR_ACC(COL_OFF(qcol, ioff), REC_PQR_Y); - XOR_ACC(COL_OFF(rcol, ioff), REC_PQR_Z); + XOR_ACC(p, REC_PQR_X); + XOR_ACC(q, REC_PQR_Y); + XOR_ACC(r, REC_PQR_Z); /* Save Pxyz and Qxyz */ COPY(REC_PQR_X, REC_PQR_XS); COPY(REC_PQR_Y, REC_PQR_YS); /* Calc X */ - MUL(coeff[MUL_PQR_XP], REC_PQR_X); /* Xp = Pxyz * xp */ - MUL(coeff[MUL_PQR_XQ], REC_PQR_Y); /* Xq = Qxyz * xq */ + MUL(mul[MUL_PQR_XP], REC_PQR_X); /* Xp = Pxyz * xp */ + MUL(mul[MUL_PQR_XQ], REC_PQR_Y); /* Xq = Qxyz * xq */ XOR(REC_PQR_Y, REC_PQR_X); - MUL(coeff[MUL_PQR_XR], REC_PQR_Z); /* Xr = Rxyz * xr */ + MUL(mul[MUL_PQR_XR], REC_PQR_Z); /* Xr = Rxyz * xr */ XOR(REC_PQR_Z, REC_PQR_X); /* X = Xp + Xq + Xr */ - STORE(COL_OFF(xcol, ioff), REC_PQR_X); - - if (calcy) { - /* Calc Y */ - XOR(REC_PQR_X, REC_PQR_XS); /* Pyz = Pxyz + X */ - MUL(coeff[MUL_PQR_YU], REC_PQR_X); /* Xq = X * upd_q */ - XOR(REC_PQR_X, REC_PQR_YS); /* Qyz = Qxyz + 
Xq */ - COPY(REC_PQR_XS, REC_PQR_X); /* restore Pyz */ - MUL(coeff[MUL_PQR_YP], REC_PQR_X); /* Yp = Pyz * yp */ - MUL(coeff[MUL_PQR_YQ], REC_PQR_YS); /* Yq = Qyz * yq */ - XOR(REC_PQR_X, REC_PQR_YS); /* Y = Yp + Yq */ - STORE(COL_OFF(ycol, ioff), REC_PQR_YS); - } - - if (calcz) { - /* Calc Z */ - XOR(REC_PQR_XS, REC_PQR_YS); /* Z = Pz = Pyz + Y */ - STORE(COL_OFF(zcol, ioff), REC_PQR_YS); - } + STORE(x, REC_PQR_X); + + /* Calc Y */ + XOR(REC_PQR_X, REC_PQR_XS); /* Pyz = Pxyz + X */ + MUL(mul[MUL_PQR_YU], REC_PQR_X); /* Xq = X * upd_q */ + XOR(REC_PQR_X, REC_PQR_YS); /* Qyz = Qxyz + Xq */ + COPY(REC_PQR_XS, REC_PQR_X); /* restore Pyz */ + MUL(mul[MUL_PQR_YP], REC_PQR_X); /* Yp = Pyz * yp */ + MUL(mul[MUL_PQR_YQ], REC_PQR_YS); /* Yq = Qyz * yq */ + XOR(REC_PQR_X, REC_PQR_YS); /* Y = Yp + Yq */ + STORE(y, REC_PQR_YS); + + /* Calc Z */ + XOR(REC_PQR_XS, REC_PQR_YS); /* Z = Pz = Pyz + Y */ + STORE(z, REC_PQR_YS); } } + /* * Reconstruct three data columns using PQR parity - * @rec_method REC_PQR_BLOCK() + * + * @syn_method raidz_syn_pqr_abd() + * @rec_method raidz_rec_pqr_abd() * * @rm RAIDZ map * @tgtidx array of missing data indexes @@ -1168,31 +1390,87 @@ REC_PQR_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, static raidz_inline int raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx) { - const int x = tgtidx[TARGET_X]; - const int y = tgtidx[TARGET_Y]; - const int z = tgtidx[TARGET_Z]; - const int ncols = raidz_ncols(rm); - const int nbigcols = raidz_nbigcols(rm); - const size_t xsize = raidz_col_size(rm, x); - const size_t ysize = raidz_col_size(rm, y); - const size_t zsize = raidz_col_size(rm, z); - const size_t short_size = raidz_short_size(rm); + size_t c; + size_t dsize; + abd_t *dabd; + const size_t firstdc = raidz_parity(rm); + const size_t ncols = raidz_ncols(rm); + const size_t x = tgtidx[TARGET_X]; + const size_t y = tgtidx[TARGET_Y]; + const size_t z = tgtidx[TARGET_Z]; + const size_t xsize = rm->rm_col[x].rc_size; + const 
size_t ysize = rm->rm_col[y].rc_size; + const size_t zsize = rm->rm_col[z].rc_size; + abd_t *xabd = rm->rm_col[x].rc_abd; + abd_t *yabd = rm->rm_col[y].rc_abd; + abd_t *zabd = rm->rm_col[z].rc_abd; + abd_t *tabds[] = { xabd, yabd, zabd }; + abd_t *cabds[] = { + rm->rm_col[CODE_P].rc_abd, + rm->rm_col[CODE_Q].rc_abd, + rm->rm_col[CODE_R].rc_abd + }; unsigned coeff[MUL_CNT]; - raidz_rec_pqr_coeff(rm, tgtidx, coeff); + /* + * Check if some of the targets are shorter than others. + * In this case, the shorter target needs to be replaced with + * a new buffer so that the syndrome can be calculated. + */ + if (ysize < xsize) { + yabd = abd_alloc(xsize, B_FALSE); + tabds[1] = yabd; + } + if (zsize < xsize) { + zabd = abd_alloc(xsize, B_FALSE); + tabds[2] = zabd; + } + raidz_math_begin(); - /* 0 - short_size */ - REC_PQR_BLOCK(rm, 0, short_size, x, y, z, coeff, ncols, ncols, - B_TRUE, B_TRUE); + /* Start with first data column if present */ + if (firstdc != x) { + raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(zabd, rm->rm_col[firstdc].rc_abd, xsize); + } else { + raidz_zero(xabd, xsize); + raidz_zero(yabd, xsize); + raidz_zero(zabd, xsize); + } + + /* generate q_syndrome */ + for (c = firstdc+1; c < ncols; c++) { + if (c == x || c == y || c == z) { + dabd = NULL; + dsize = 0; + } else { + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; + } - /* short_size - xsize */ - REC_PQR_BLOCK(rm, short_size, xsize, x, y, z, coeff, ncols, nbigcols, - xsize == ysize, xsize == zsize); + abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 3, + raidz_syn_pqr_abd); + } + + abd_raidz_rec_iterate(cabds, tabds, xsize, 3, raidz_rec_pqr_abd, coeff); + + /* + * Copy shorter targets back to the original abd buffer + */ + if (ysize < xsize) + raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize); + if (zsize < xsize) + raidz_copy(rm->rm_col[z].rc_abd, zabd, zsize); raidz_math_end(); + if (ysize < xsize) + abd_free(yabd); + if (zsize < 
xsize) + abd_free(zabd); + return ((1 << CODE_P) | (1 << CODE_Q) | (1 << CODE_R)); } diff --git a/module/zfs/vdev_raidz_math_scalar.c b/module/zfs/vdev_raidz_math_scalar.c index 993d406e6..a693bff63 100644 --- a/module/zfs/vdev_raidz_math_scalar.c +++ b/module/zfs/vdev_raidz_math_scalar.c @@ -154,71 +154,96 @@ static const struct { #define raidz_math_begin() {} #define raidz_math_end() {} -#define GEN_P_DEFINE() v_t p0 -#define GEN_P_STRIDE 1 -#define GEN_P_P p0 - -#define GEN_PQ_DEFINE() v_t d0, p0, q0 -#define GEN_PQ_STRIDE 1 -#define GEN_PQ_D d0 -#define GEN_PQ_P p0 -#define GEN_PQ_Q q0 - -#define GEN_PQR_DEFINE() v_t d0, p0, q0, r0 -#define GEN_PQR_STRIDE 1 -#define GEN_PQR_D d0 -#define GEN_PQR_P p0 -#define GEN_PQR_Q q0 -#define GEN_PQR_R r0 - -#define REC_P_DEFINE() v_t x0 -#define REC_P_STRIDE 1 -#define REC_P_X x0 - -#define REC_Q_DEFINE() v_t x0 -#define REC_Q_STRIDE 1 -#define REC_Q_X x0 - -#define REC_R_DEFINE() v_t x0 -#define REC_R_STRIDE 1 -#define REC_R_X x0 - -#define REC_PQ_DEFINE() v_t x0, y0, d0 -#define REC_PQ_STRIDE 1 -#define REC_PQ_X x0 -#define REC_PQ_Y y0 -#define REC_PQ_D d0 - -#define REC_PR_DEFINE() v_t x0, y0, d0 -#define REC_PR_STRIDE 1 -#define REC_PR_X x0 -#define REC_PR_Y y0 -#define REC_PR_D d0 - -#define REC_QR_DEFINE() v_t x0, y0, d0 -#define REC_QR_STRIDE 1 -#define REC_QR_X x0 -#define REC_QR_Y y0 -#define REC_QR_D d0 - -#define REC_PQR_DEFINE() v_t x0, y0, z0, d0, t0 -#define REC_PQR_STRIDE 1 -#define REC_PQR_X x0 -#define REC_PQR_Y y0 -#define REC_PQR_Z z0 -#define REC_PQR_D d0 -#define REC_PQR_XS d0 -#define REC_PQR_YS t0 +#define SYN_STRIDE 1 -#include "vdev_raidz_math_impl.h" +#define ZERO_DEFINE() v_t d0 +#define ZERO_STRIDE 1 +#define ZERO_D d0 -/* - * If compiled with -O0, gcc doesn't do any stack frame coalescing - * and -Wframe-larger-than=1024 is triggered in debug mode. - * Starting with gcc 4.8, new opt level -Og is introduced for debugging, which - * does not trigger this warning. 
- */ -#pragma GCC diagnostic ignored "-Wframe-larger-than=" +#define COPY_DEFINE() v_t d0 +#define COPY_STRIDE 1 +#define COPY_D d0 + +#define ADD_DEFINE() v_t d0 +#define ADD_STRIDE 1 +#define ADD_D d0 + +#define MUL_DEFINE() v_t d0 +#define MUL_STRIDE 1 +#define MUL_D d0 + +#define GEN_P_STRIDE 1 +#define GEN_P_DEFINE() v_t p0 +#define GEN_P_P p0 + +#define GEN_PQ_STRIDE 1 +#define GEN_PQ_DEFINE() v_t d0, c0 +#define GEN_PQ_D d0 +#define GEN_PQ_C c0 + +#define GEN_PQR_STRIDE 1 +#define GEN_PQR_DEFINE() v_t d0, c0 +#define GEN_PQR_D d0 +#define GEN_PQR_C c0 + +#define SYN_Q_DEFINE() v_t d0, x0 +#define SYN_Q_D d0 +#define SYN_Q_X x0 + + +#define SYN_R_DEFINE() v_t d0, x0 +#define SYN_R_D d0 +#define SYN_R_X x0 + + +#define SYN_PQ_DEFINE() v_t d0, x0 +#define SYN_PQ_D d0 +#define SYN_PQ_X x0 + + +#define REC_PQ_STRIDE 1 +#define REC_PQ_DEFINE() v_t x0, y0, t0 +#define REC_PQ_X x0 +#define REC_PQ_Y y0 +#define REC_PQ_T t0 + + +#define SYN_PR_DEFINE() v_t d0, x0 +#define SYN_PR_D d0 +#define SYN_PR_X x0 + +#define REC_PR_STRIDE 1 +#define REC_PR_DEFINE() v_t x0, y0, t0 +#define REC_PR_X x0 +#define REC_PR_Y y0 +#define REC_PR_T t0 + + +#define SYN_QR_DEFINE() v_t d0, x0 +#define SYN_QR_D d0 +#define SYN_QR_X x0 + + +#define REC_QR_STRIDE 1 +#define REC_QR_DEFINE() v_t x0, y0, t0 +#define REC_QR_X x0 +#define REC_QR_Y y0 +#define REC_QR_T t0 + + +#define SYN_PQR_DEFINE() v_t d0, x0 +#define SYN_PQR_D d0 +#define SYN_PQR_X x0 + +#define REC_PQR_STRIDE 1 +#define REC_PQR_DEFINE() v_t x0, y0, z0, xs0, ys0 +#define REC_PQR_X x0 +#define REC_PQR_Y y0 +#define REC_PQR_Z z0 +#define REC_PQR_XS xs0 +#define REC_PQR_YS ys0 + +#include "vdev_raidz_math_impl.h" DEFINE_GEN_METHODS(scalar); DEFINE_REC_METHODS(scalar); diff --git a/module/zfs/vdev_raidz_math_sse2.c b/module/zfs/vdev_raidz_math_sse2.c index 6fc81215a..9985da273 100644 --- a/module/zfs/vdev_raidz_math_sse2.c +++ b/module/zfs/vdev_raidz_math_sse2.c @@ -58,9 +58,6 @@ typedef struct v { uint8_t b[ELEM_SIZE] 
__attribute__((aligned(ELEM_SIZE))); } v_t; -#define PREFETCHNTA(ptr, offset) {} -#define PREFETCH(ptr, offset) {} - #define XOR_ACC(src, r...) \ { \ switch (REG_CNT(r)) { \ @@ -106,27 +103,8 @@ typedef struct v { break; \ } \ } -#define ZERO(r...) \ -{ \ - switch (REG_CNT(r)) { \ - case 4: \ - __asm( \ - "pxor %" VR0(r) ", %" VR0(r) "\n" \ - "pxor %" VR1(r) ", %" VR1(r) "\n" \ - "pxor %" VR2(r) ", %" VR2(r) "\n" \ - "pxor %" VR3(r) ", %" VR3(r)); \ - break; \ - case 2: \ - __asm( \ - "pxor %" VR0(r) ", %" VR0(r) "\n" \ - "pxor %" VR1(r) ", %" VR1(r)); \ - break; \ - case 1: \ - __asm( \ - "pxor %" VR0(r) ", %" VR0(r)); \ - break; \ - } \ -} + +#define ZERO(r...) XOR(r, r) #define COPY(r...) \ { \ @@ -236,6 +214,10 @@ typedef struct v { #define MUL2(r...) \ { \ switch (REG_CNT(r)) { \ + case 4: \ + _MUL2_x2(VR0(r), VR1(r)); \ + _MUL2_x2(VR2(r), VR3(r)); \ + break; \ case 2: \ _MUL2_x2(VR0(r), VR1(r)); \ break; \ @@ -255,7 +237,7 @@ typedef struct v { #define _MUL_PARAM(x, in, acc) \ { \ - if (x & 0x01) { COPY(in, acc); } else { XOR(acc, acc); } \ + if (x & 0x01) { COPY(in, acc); } else { ZERO(acc); } \ if (x & 0xfe) { MUL2(in); } \ if (x & 0x02) { XOR(in, acc); } \ if (x & 0xfc) { MUL2(in); } \ @@ -271,8 +253,8 @@ typedef struct v { if (x & 0x80) { MUL2(in); XOR(in, acc); } \ } -#define _mul_x1_in 9 -#define _mul_x1_acc 11 +#define _mul_x1_in 11 +#define _mul_x1_acc 12 #define MUL_x1_DEFINE(x) \ static void \ @@ -533,61 +515,87 @@ gf_x2_mul_fns[256] = { #define raidz_math_begin() kfpu_begin() #define raidz_math_end() kfpu_end() -#define GEN_P_DEFINE() {} +#define SYN_STRIDE 4 + +#define ZERO_STRIDE 4 +#define ZERO_DEFINE() {} +#define ZERO_D 0, 1, 2, 3 + +#define COPY_STRIDE 4 +#define COPY_DEFINE() {} +#define COPY_D 0, 1, 2, 3 + +#define ADD_STRIDE 4 +#define ADD_DEFINE() {} +#define ADD_D 0, 1, 2, 3 + +#define MUL_STRIDE 2 +#define MUL_DEFINE() MUL2_SETUP() +#define MUL_D 0, 1 + #define GEN_P_STRIDE 4 +#define GEN_P_DEFINE() {} #define GEN_P_P 0, 1, 2, 3 
+#define GEN_PQ_STRIDE 4 #define GEN_PQ_DEFINE() {} -#define GEN_PQ_STRIDE 2 -#define GEN_PQ_D 0, 1 -#define GEN_PQ_P 2, 3 -#define GEN_PQ_Q 4, 5 +#define GEN_PQ_D 0, 1, 2, 3 +#define GEN_PQ_C 4, 5, 6, 7 +#define GEN_PQR_STRIDE 4 #define GEN_PQR_DEFINE() {} -#define GEN_PQR_STRIDE 2 -#define GEN_PQR_D 0, 1 -#define GEN_PQR_P 2, 3 -#define GEN_PQR_Q 4, 5 -#define GEN_PQR_R 6, 7 +#define GEN_PQR_D 0, 1, 2, 3 +#define GEN_PQR_C 4, 5, 6, 7 -#define REC_P_DEFINE() {} -#define REC_P_STRIDE 4 -#define REC_P_X 0, 1, 2, 3 +#define SYN_Q_DEFINE() {} +#define SYN_Q_D 0, 1, 2, 3 +#define SYN_Q_X 4, 5, 6, 7 -#define REC_Q_DEFINE() {} -#define REC_Q_STRIDE 2 -#define REC_Q_X 0, 1 +#define SYN_R_DEFINE() {} +#define SYN_R_D 0, 1, 2, 3 +#define SYN_R_X 4, 5, 6, 7 -#define REC_R_DEFINE() {} -#define REC_R_STRIDE 2 -#define REC_R_X 0, 1 +#define SYN_PQ_DEFINE() {} +#define SYN_PQ_D 0, 1, 2, 3 +#define SYN_PQ_X 4, 5, 6, 7 -#define REC_PQ_DEFINE() {} #define REC_PQ_STRIDE 2 +#define REC_PQ_DEFINE() MUL2_SETUP() #define REC_PQ_X 0, 1 #define REC_PQ_Y 2, 3 -#define REC_PQ_D 4, 5 +#define REC_PQ_T 4, 5 + +#define SYN_PR_DEFINE() {} +#define SYN_PR_D 0, 1, 2, 3 +#define SYN_PR_X 4, 5, 6, 7 -#define REC_PR_DEFINE() {} #define REC_PR_STRIDE 2 +#define REC_PR_DEFINE() MUL2_SETUP() #define REC_PR_X 0, 1 #define REC_PR_Y 2, 3 -#define REC_PR_D 4, 5 +#define REC_PR_T 4, 5 + +#define SYN_QR_DEFINE() {} +#define SYN_QR_D 0, 1, 2, 3 +#define SYN_QR_X 4, 5, 6, 7 -#define REC_QR_DEFINE() {} #define REC_QR_STRIDE 2 +#define REC_QR_DEFINE() MUL2_SETUP() #define REC_QR_X 0, 1 #define REC_QR_Y 2, 3 -#define REC_QR_D 4, 5 +#define REC_QR_T 4, 5 + +#define SYN_PQR_DEFINE() {} +#define SYN_PQR_D 0, 1, 2, 3 +#define SYN_PQR_X 4, 5, 6, 7 -#define REC_PQR_DEFINE() {} #define REC_PQR_STRIDE 1 +#define REC_PQR_DEFINE() MUL2_SETUP() #define REC_PQR_X 0 #define REC_PQR_Y 1 #define REC_PQR_Z 2 -#define REC_PQR_D 3 -#define REC_PQR_XS 4 -#define REC_PQR_YS 5 +#define REC_PQR_XS 3 +#define REC_PQR_YS 4 #include 
<sys/vdev_raidz_impl.h> diff --git a/module/zfs/vdev_raidz_math_ssse3.c b/module/zfs/vdev_raidz_math_ssse3.c index d93441349..cebb0fe2b 100644 --- a/module/zfs/vdev_raidz_math_ssse3.c +++ b/module/zfs/vdev_raidz_math_ssse3.c @@ -66,19 +66,6 @@ typedef struct v { uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE))); } v_t; -#define PREFETCHNTA(ptr, offset) \ -{ \ - __asm( \ - "prefetchnta " #offset "(%[MEM])\n" \ - : : [MEM] "r" (ptr)); \ -} - -#define PREFETCH(ptr, offset) \ -{ \ - __asm( \ - "prefetcht0 " #offset "(%[MEM])\n" \ - : : [MEM] "r" (ptr)); \ -} #define XOR_ACC(src, r...) \ { \ @@ -122,25 +109,7 @@ typedef struct v { } \ } -#define ZERO(r...) \ -{ \ - switch (REG_CNT(r)) { \ - case 4: \ - __asm( \ - "pxor %" VR0(r) ", %" VR0(r) "\n" \ - "pxor %" VR1(r) ", %" VR1(r) "\n" \ - "pxor %" VR2(r) ", %" VR2(r) "\n" \ - "pxor %" VR3(r) ", %" VR3(r)); \ - break; \ - case 2: \ - __asm( \ - "pxor %" VR0(r) ", %" VR0(r) "\n" \ - "pxor %" VR1(r) ", %" VR1(r)); \ - break; \ - default: \ - ASM_BUG(); \ - } \ -} +#define ZERO(r...) XOR(r, r) #define COPY(r...) 
\ { \ @@ -337,59 +306,86 @@ typedef struct v { #define raidz_math_begin() kfpu_begin() #define raidz_math_end() kfpu_end() -#define GEN_P_DEFINE() {} + +#define SYN_STRIDE 4 + +#define ZERO_STRIDE 4 +#define ZERO_DEFINE() {} +#define ZERO_D 0, 1, 2, 3 + +#define COPY_STRIDE 4 +#define COPY_DEFINE() {} +#define COPY_D 0, 1, 2, 3 + +#define ADD_STRIDE 4 +#define ADD_DEFINE() {} +#define ADD_D 0, 1, 2, 3 + +#define MUL_STRIDE 4 +#define MUL_DEFINE() {} +#define MUL_D 0, 1, 2, 3 + #define GEN_P_STRIDE 4 +#define GEN_P_DEFINE() {} #define GEN_P_P 0, 1, 2, 3 -#define GEN_PQ_DEFINE() {} #define GEN_PQ_STRIDE 4 +#define GEN_PQ_DEFINE() {} #define GEN_PQ_D 0, 1, 2, 3 -#define GEN_PQ_P 4, 5, 6, 7 -#define GEN_PQ_Q 8, 9, 10, 11 +#define GEN_PQ_C 4, 5, 6, 7 +#define GEN_PQR_STRIDE 4 #define GEN_PQR_DEFINE() {} -#define GEN_PQR_STRIDE 2 -#define GEN_PQR_D 0, 1 -#define GEN_PQR_P 2, 3 -#define GEN_PQR_Q 4, 5 -#define GEN_PQR_R 6, 7 +#define GEN_PQR_D 0, 1, 2, 3 +#define GEN_PQR_C 4, 5, 6, 7 -#define REC_P_DEFINE() {} -#define REC_P_STRIDE 4 -#define REC_P_X 0, 1, 2, 3 +#define SYN_Q_DEFINE() {} +#define SYN_Q_D 0, 1, 2, 3 +#define SYN_Q_X 4, 5, 6, 7 -#define REC_Q_DEFINE() {} -#define REC_Q_STRIDE 4 -#define REC_Q_X 0, 1, 2, 3 +#define SYN_R_DEFINE() {} +#define SYN_R_D 0, 1, 2, 3 +#define SYN_R_X 4, 5, 6, 7 -#define REC_R_DEFINE() {} -#define REC_R_STRIDE 4 -#define REC_R_X 0, 1, 2, 3 +#define SYN_PQ_DEFINE() {} +#define SYN_PQ_D 0, 1, 2, 3 +#define SYN_PQ_X 4, 5, 6, 7 -#define REC_PQ_DEFINE() {} #define REC_PQ_STRIDE 2 +#define REC_PQ_DEFINE() {} #define REC_PQ_X 0, 1 #define REC_PQ_Y 2, 3 -#define REC_PQ_D 4, 5 +#define REC_PQ_T 4, 5 + +#define SYN_PR_DEFINE() {} +#define SYN_PR_D 0, 1, 2, 3 +#define SYN_PR_X 4, 5, 6, 7 -#define REC_PR_DEFINE() {} #define REC_PR_STRIDE 2 +#define REC_PR_DEFINE() {} #define REC_PR_X 0, 1 #define REC_PR_Y 2, 3 -#define REC_PR_D 4, 5 +#define REC_PR_T 4, 5 + +#define SYN_QR_DEFINE() {} +#define SYN_QR_D 0, 1, 2, 3 +#define SYN_QR_X 4, 5, 6, 7 
-#define REC_QR_DEFINE() {} #define REC_QR_STRIDE 2 +#define REC_QR_DEFINE() {} #define REC_QR_X 0, 1 #define REC_QR_Y 2, 3 -#define REC_QR_D 4, 5 +#define REC_QR_T 4, 5 + +#define SYN_PQR_DEFINE() {} +#define SYN_PQR_D 0, 1, 2, 3 +#define SYN_PQR_X 4, 5, 6, 7 -#define REC_PQR_DEFINE() {} #define REC_PQR_STRIDE 2 +#define REC_PQR_DEFINE() {} #define REC_PQR_X 0, 1 #define REC_PQR_Y 2, 3 #define REC_PQR_Z 4, 5 -#define REC_PQR_D 6, 7 #define REC_PQR_XS 6, 7 #define REC_PQR_YS 8, 9 @@ -419,7 +415,8 @@ const raidz_impl_ops_t vdev_raidz_ssse3_impl = { #endif /* defined(__x86_64) && defined(HAVE_SSSE3) */ -#if defined(__x86_64) && (defined(HAVE_SSSE3) || defined(HAVE_AVX2)) +#if defined(__x86_64) +#if defined(HAVE_SSSE3) || defined(HAVE_AVX2) || defined(HAVE_AVX512BW) const uint8_t __attribute__((aligned(256))) gf_clmul_mod_lt[4*256][16] = { @@ -2473,4 +2470,5 @@ __attribute__((aligned(256))) gf_clmul_mod_lt[4*256][16] = { 0xf8, 0x07, 0x06, 0xf9, 0x04, 0xfb, 0xfa, 0x05 } }; -#endif /* defined(__x86_64) && (defined(HAVE_SSSE3) || defined(HAVE_AVX2)) */ +#endif /* defined(HAVE_SSSE3) || defined(HAVE_AVX2) || defined(HAVE_AVX512BW) */ +#endif /* defined(__x86_64) */ diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 760f0a891..b2d07166e 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -40,6 +40,7 @@ #include <sys/dsl_pool.h> #include <sys/metaslab.h> #include <sys/trace_zil.h> +#include <sys/abd.h> /* * The zfs intent log (ZIL) saves transaction records of system calls @@ -878,6 +879,7 @@ zil_lwb_write_done(zio_t *zio) * one in zil_commit_writer(). zil_sync() will only remove * the lwb if lwb_buf is null. 
*/ + abd_put(zio->io_abd); zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); mutex_enter(&zilog->zl_lock); lwb->lwb_zio = NULL; @@ -914,12 +916,14 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb) /* Lock so zil_sync() doesn't fastwrite_unmark after zio is created */ mutex_enter(&zilog->zl_lock); if (lwb->lwb_zio == NULL) { + abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, + BP_GET_LSIZE(&lwb->lwb_blk)); if (!lwb->lwb_fastwrite) { metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk); lwb->lwb_fastwrite = 1; } lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa, - 0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk), + 0, &lwb->lwb_blk, lwb_abd, BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_FASTWRITE, &zb); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 223c20abe..b608ed6ea 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -42,6 +42,7 @@ #include <sys/metaslab_impl.h> #include <sys/time.h> #include <sys/trace_zio.h> +#include <sys/abd.h> /* * ========================================================================== @@ -67,6 +68,11 @@ kmem_cache_t *zio_cache; kmem_cache_t *zio_link_cache; kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; +#if defined(ZFS_DEBUG) && !defined(_KERNEL) +uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; +uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; +#endif + int zio_delay_max = ZIO_DELAY_MAX; #define ZIO_PIPELINE_CONTINUE 0x100 @@ -212,6 +218,13 @@ zio_fini(void) if (((c + 1) << SPA_MINBLOCKSHIFT) > zfs_max_recordsize) break; #endif +#if defined(ZFS_DEBUG) && !defined(_KERNEL) + if (zio_buf_cache_allocs[c] != zio_buf_cache_frees[c]) + (void) printf("zio_fini: [%d] %llu != %llu\n", + (int)((c + 1) << SPA_MINBLOCKSHIFT), + (long long unsigned)zio_buf_cache_allocs[c], + (long long 
unsigned)zio_buf_cache_frees[c]); +#endif if (zio_buf_cache[c] != last_cache) { last_cache = zio_buf_cache[c]; kmem_cache_destroy(zio_buf_cache[c]); @@ -251,6 +264,9 @@ zio_buf_alloc(size_t size) size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); +#if defined(ZFS_DEBUG) && !defined(_KERNEL) + atomic_add_64(&zio_buf_cache_allocs[c], 1); +#endif return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); } @@ -271,26 +287,15 @@ zio_data_buf_alloc(size_t size) return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); } -/* - * Use zio_buf_alloc_flags when specific allocation flags are needed. e.g. - * passing KM_NOSLEEP when it is acceptable for an allocation to fail. - */ -void * -zio_buf_alloc_flags(size_t size, int flags) -{ - size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; - - VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); - - return (kmem_cache_alloc(zio_buf_cache[c], flags)); -} - void zio_buf_free(void *buf, size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); +#if defined(ZFS_DEBUG) && !defined(_KERNEL) + atomic_add_64(&zio_buf_cache_frees[c], 1); +#endif kmem_cache_free(zio_buf_cache[c], buf); } @@ -311,12 +316,18 @@ zio_data_buf_free(void *buf, size_t size) * ========================================================================== */ void -zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, +zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize, zio_transform_func_t *transform) { zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); - zt->zt_orig_data = zio->io_data; + /* + * Ensure that anyone expecting this zio to contain a linear ABD isn't + * going to get a nasty surprise when they try to access the data. 
+ */ + IMPLY(abd_is_linear(zio->io_abd), abd_is_linear(data)); + + zt->zt_orig_abd = zio->io_abd; zt->zt_orig_size = zio->io_size; zt->zt_bufsize = bufsize; zt->zt_transform = transform; @@ -324,7 +335,7 @@ zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, zt->zt_next = zio->io_transform_stack; zio->io_transform_stack = zt; - zio->io_data = data; + zio->io_abd = data; zio->io_size = size; } @@ -336,12 +347,12 @@ zio_pop_transforms(zio_t *zio) while ((zt = zio->io_transform_stack) != NULL) { if (zt->zt_transform != NULL) zt->zt_transform(zio, - zt->zt_orig_data, zt->zt_orig_size); + zt->zt_orig_abd, zt->zt_orig_size); if (zt->zt_bufsize != 0) - zio_buf_free(zio->io_data, zt->zt_bufsize); + abd_free(zio->io_abd); - zio->io_data = zt->zt_orig_data; + zio->io_abd = zt->zt_orig_abd; zio->io_size = zt->zt_orig_size; zio->io_transform_stack = zt->zt_next; @@ -355,21 +366,26 @@ zio_pop_transforms(zio_t *zio) * ========================================================================== */ static void -zio_subblock(zio_t *zio, void *data, uint64_t size) +zio_subblock(zio_t *zio, abd_t *data, uint64_t size) { ASSERT(zio->io_size > size); if (zio->io_type == ZIO_TYPE_READ) - bcopy(zio->io_data, data, size); + abd_copy(data, zio->io_abd, size); } static void -zio_decompress(zio_t *zio, void *data, uint64_t size) +zio_decompress(zio_t *zio, abd_t *data, uint64_t size) { - if (zio->io_error == 0 && - zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), - zio->io_data, data, zio->io_size, size) != 0) - zio->io_error = SET_ERROR(EIO); + if (zio->io_error == 0) { + void *tmp = abd_borrow_buf(data, size); + int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), + zio->io_abd, tmp, zio->io_size, size); + abd_return_buf_copy(data, tmp, size); + + if (ret != 0) + zio->io_error = SET_ERROR(EIO); + } } /* @@ -552,7 +568,7 @@ zio_timestamp_compare(const void *x1, const void *x2) */ static zio_t * zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 
- void *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done, + abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done, void *private, zio_type_t type, zio_priority_t priority, enum zio_flag flags, vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb, enum zio_stage stage, @@ -611,7 +627,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_priority = priority; zio->io_vd = vd; zio->io_offset = offset; - zio->io_orig_data = zio->io_data = data; + zio->io_orig_abd = zio->io_abd = data; zio->io_orig_size = zio->io_size = psize; zio->io_lsize = lsize; zio->io_orig_flags = zio->io_flags = flags; @@ -755,7 +771,7 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp) zio_t * zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, - void *data, uint64_t size, zio_done_func_t *done, void *private, + abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) { zio_t *zio; @@ -773,7 +789,7 @@ zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, zio_t * zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - void *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp, + abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *children_ready, zio_done_func_t *physdone, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, @@ -814,7 +830,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, } zio_t * -zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, +zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb) { @@ -967,7 +983,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, zio_t * zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, - void *data, int 
checksum, zio_done_func_t *done, void *private, + abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels) { zio_t *zio; @@ -988,7 +1004,7 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_t * zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, - void *data, int checksum, zio_done_func_t *done, void *private, + abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels) { zio_t *zio; @@ -1011,8 +1027,9 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, * Therefore, we must make a local copy in case the data is * being written to multiple places in parallel. */ - void *wbuf = zio_buf_alloc(size); - bcopy(data, wbuf, size); + abd_t *wbuf = abd_alloc_sametype(data, size); + abd_copy(wbuf, data, size); + zio_push_transform(zio, wbuf, size, size, NULL); } @@ -1024,7 +1041,7 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, */ zio_t * zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, - void *data, uint64_t size, int type, zio_priority_t priority, + abd_t *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private) { enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; @@ -1090,7 +1107,7 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, } zio_t * -zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, +zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private) { @@ -1151,14 +1168,17 @@ zio_read_bp_init(zio_t *zio) !(zio->io_flags & ZIO_FLAG_RAW)) { uint64_t psize = BP_IS_EMBEDDED(bp) ? 
BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); - void *cbuf = zio_buf_alloc(psize); - - zio_push_transform(zio, cbuf, psize, psize, zio_decompress); + zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize), + psize, psize, zio_decompress); } if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { + int psize = BPE_GET_PSIZE(bp); + void *data = abd_borrow_buf(zio->io_abd, psize); + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - decode_embedded_bp_compressed(bp, zio->io_data); + decode_embedded_bp_compressed(bp, data); + abd_return_buf_copy(zio->io_abd, data, psize); } else { ASSERT(!BP_IS_EMBEDDED(bp)); } @@ -1299,7 +1319,7 @@ zio_write_compress(zio_t *zio) /* If it's a compressed write that is not raw, compress the buffer. */ if (compress != ZIO_COMPRESS_OFF && psize == lsize) { void *cbuf = zio_buf_alloc(lsize); - psize = zio_compress_data(compress, zio->io_data, cbuf, lsize); + psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize); if (psize == 0 || psize == lsize) { compress = ZIO_COMPRESS_OFF; zio_buf_free(cbuf, lsize); @@ -1337,9 +1357,11 @@ zio_write_compress(zio_t *zio) zio_buf_free(cbuf, lsize); psize = lsize; } else { - bzero((char *)cbuf + psize, rounded - psize); + abd_t *cdata = abd_get_from_buf(cbuf, lsize); + abd_take_ownership_of_buf(cdata, B_TRUE); + abd_zero_off(cdata, psize, rounded - psize); psize = rounded; - zio_push_transform(zio, cbuf, + zio_push_transform(zio, cdata, psize, lsize, NULL); } } @@ -1942,26 +1964,38 @@ zio_resume_wait(spa_t *spa) * ========================================================================== */ +static void +zio_gang_issue_func_done(zio_t *zio) +{ + abd_put(zio->io_abd); +} + static zio_t * -zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) +zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, + uint64_t offset) { if (gn != NULL) return (pio); - return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp), - NULL, NULL, pio->io_priority, 
ZIO_GANG_CHILD_FLAGS(pio), + return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset), + BP_GET_PSIZE(bp), zio_gang_issue_func_done, + NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark)); } -zio_t * -zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) +static zio_t * +zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, + uint64_t offset) { zio_t *zio; if (gn != NULL) { + abd_t *gbh_abd = + abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, - gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority, - ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL, + pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), + &pio->io_bookmark); /* * As we rewrite each gang header, the pipeline will compute * a new gang block header checksum for it; but no one will @@ -1972,8 +2006,12 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) * this is just good hygiene.) 
*/ if (gn != pio->io_gang_leader->io_gang_tree) { + abd_t *buf = abd_get_offset(data, offset); + zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), - data, BP_GET_PSIZE(bp)); + buf, BP_GET_PSIZE(bp)); + + abd_put(buf); } /* * If we are here to damage data for testing purposes, @@ -1983,7 +2021,8 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } else { zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, - data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority, + abd_get_offset(data, offset), BP_GET_PSIZE(bp), + zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); } @@ -1991,16 +2030,18 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) } /* ARGSUSED */ -zio_t * -zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) +static zio_t * +zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, + uint64_t offset) { return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, ZIO_GANG_CHILD_FLAGS(pio))); } /* ARGSUSED */ -zio_t * -zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) +static zio_t * +zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, + uint64_t offset) { return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); @@ -2064,13 +2105,14 @@ static void zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) { zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); + abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); ASSERT(gio->io_gang_leader == gio); ASSERT(BP_IS_GANG(bp)); - zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh, - SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, - gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); + zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE, + zio_gang_tree_assemble_done, gn, gio->io_priority, + 
ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); } static void @@ -2087,13 +2129,16 @@ zio_gang_tree_assemble_done(zio_t *zio) if (zio->io_error) return; + /* this ABD was created from a linear buf in zio_gang_tree_assemble */ if (BP_SHOULD_BYTESWAP(bp)) - byteswap_uint64_array(zio->io_data, zio->io_size); + byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size); - ASSERT(zio->io_data == gn->gn_gbh); + ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh); ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); + abd_put(zio->io_abd); + for (g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (!BP_IS_GANG(gbp)) @@ -2103,7 +2148,8 @@ zio_gang_tree_assemble_done(zio_t *zio) } static void -zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) +zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data, + uint64_t offset) { zio_t *gio = pio->io_gang_leader; zio_t *zio; @@ -2117,7 +2163,7 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) * If you're a gang header, your data is in gn->gn_gbh. * If you're a gang member, your data is in 'data' and gn == NULL. 
*/ - zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data); + zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset); if (gn != NULL) { ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); @@ -2126,13 +2172,14 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (BP_IS_HOLE(gbp)) continue; - zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data); - data = (char *)data + BP_GET_PSIZE(gbp); + zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data, + offset); + offset += BP_GET_PSIZE(gbp); } } if (gn == gio->io_gang_tree) - ASSERT3P((char *)gio->io_data + gio->io_size, ==, data); + ASSERT3U(gio->io_size, ==, offset); if (zio != pio) zio_nowait(zio); @@ -2165,7 +2212,8 @@ zio_gang_issue(zio_t *zio) ASSERT(zio->io_child_type > ZIO_CHILD_GANG); if (zio->io_child_error[ZIO_CHILD_GANG] == 0) - zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data); + zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd, + 0); else zio_gang_tree_free(&zio->io_gang_tree); @@ -2205,6 +2253,12 @@ zio_write_gang_member_ready(zio_t *zio) mutex_exit(&pio->io_lock); } +static void +zio_write_gang_done(zio_t *zio) +{ + abd_put(zio->io_abd); +} + static int zio_write_gang_block(zio_t *pio) { @@ -2215,6 +2269,7 @@ zio_write_gang_block(zio_t *pio) zio_t *zio; zio_gang_node_t *gn, **gnpp; zio_gbh_phys_t *gbh; + abd_t *gbh_abd; uint64_t txg = pio->io_txg; uint64_t resid = pio->io_size; uint64_t lsize; @@ -2275,12 +2330,14 @@ zio_write_gang_block(zio_t *pio) gn = zio_gang_node_alloc(gnpp); gbh = gn->gn_gbh; bzero(gbh, SPA_GANGBLOCKSIZE); + gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE); /* * Create the gang header. 
*/ - zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL, - pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, + zio_write_gang_done, NULL, pio->io_priority, + ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); /* * Create and nowait the gang children. @@ -2302,9 +2359,9 @@ zio_write_gang_block(zio_t *pio) zp.zp_nopwrite = B_FALSE; cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g], - (char *)pio->io_data + (pio->io_size - resid), lsize, - lsize, &zp, zio_write_gang_member_ready, NULL, NULL, NULL, - &gn->gn_child[g], pio->io_priority, + abd_get_offset(pio->io_abd, pio->io_size - resid), lsize, + lsize, &zp, zio_write_gang_member_ready, NULL, NULL, + zio_write_gang_done, &gn->gn_child[g], pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { @@ -2320,7 +2377,6 @@ zio_write_gang_block(zio_t *pio) zp.zp_copies, cio, flags)); } zio_nowait(cio); - } /* @@ -2423,10 +2479,11 @@ zio_ddt_child_read_done(zio_t *zio) ddp = ddt_phys_select(dde, bp); if (zio->io_error == 0) ddt_phys_clear(ddp); /* this ddp doesn't need repair */ - if (zio->io_error == 0 && dde->dde_repair_data == NULL) - dde->dde_repair_data = zio->io_data; + + if (zio->io_error == 0 && dde->dde_repair_abd == NULL) + dde->dde_repair_abd = zio->io_abd; else - zio_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_abd); mutex_exit(&pio->io_lock); } @@ -2459,16 +2516,16 @@ zio_ddt_read_start(zio_t *zio) ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, &blk); zio_nowait(zio_read(zio, zio->io_spa, &blk, - zio_buf_alloc(zio->io_size), zio->io_size, - zio_ddt_child_read_done, dde, zio->io_priority, - ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, - &zio->io_bookmark)); + abd_alloc_for_io(zio->io_size, B_TRUE), + zio->io_size, zio_ddt_child_read_done, dde, + zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) | + ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark)); } 
return (ZIO_PIPELINE_CONTINUE); } zio_nowait(zio_read(zio, zio->io_spa, bp, - zio->io_data, zio->io_size, NULL, NULL, zio->io_priority, + zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); return (ZIO_PIPELINE_CONTINUE); @@ -2498,8 +2555,9 @@ zio_ddt_read_done(zio_t *zio) zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (ZIO_PIPELINE_STOP); } - if (dde->dde_repair_data != NULL) { - bcopy(dde->dde_repair_data, zio->io_data, zio->io_size); + if (dde->dde_repair_abd != NULL) { + abd_copy(zio->io_abd, dde->dde_repair_abd, + zio->io_size); zio->io_child_error[ZIO_CHILD_DDT] = 0; } ddt_repair_done(ddt, dde); @@ -2537,12 +2595,10 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) if (lio != NULL && do_raw) { return (lio->io_size != zio->io_size || - bcmp(zio->io_data, lio->io_data, - zio->io_size) != 0); + abd_cmp(zio->io_abd, lio->io_abd) != 0); } else if (lio != NULL) { return (lio->io_orig_size != zio->io_orig_size || - bcmp(zio->io_orig_data, lio->io_orig_data, - zio->io_orig_size) != 0); + abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0); } } @@ -2552,7 +2608,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) if (ddp->ddp_phys_birth != 0 && do_raw) { blkptr_t blk = *zio->io_bp; uint64_t psize; - void *tmpbuf; + abd_t *tmpabd; int error; ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); @@ -2563,19 +2619,19 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) ddt_exit(ddt); - tmpbuf = zio_buf_alloc(psize); + tmpabd = abd_alloc_for_io(psize, B_TRUE); - error = zio_wait(zio_read(NULL, spa, &blk, tmpbuf, + error = zio_wait(zio_read(NULL, spa, &blk, tmpabd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_RAW, &zio->io_bookmark)); if (error == 0) { - if (bcmp(tmpbuf, zio->io_data, psize) != 0) + if (abd_cmp(tmpabd, zio->io_abd) != 0) error = SET_ERROR(ENOENT); } - zio_buf_free(tmpbuf, psize); + abd_free(tmpabd); ddt_enter(ddt); return 
(error != 0); } else if (ddp->ddp_phys_birth != 0) { @@ -2597,7 +2653,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) &aflags, &zio->io_bookmark); if (error == 0) { - if (bcmp(abuf->b_data, zio->io_orig_data, + if (abd_cmp_buf(zio->io_orig_abd, abuf->b_data, zio->io_orig_size) != 0) error = SET_ERROR(ENOENT); arc_buf_destroy(abuf, &abuf); @@ -2762,12 +2818,12 @@ zio_ddt_write(zio_t *zio) return (ZIO_PIPELINE_CONTINUE); } - dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, + dio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, zio->io_orig_size, zio->io_orig_size, &czp, NULL, NULL, NULL, zio_ddt_ditto_write_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); - zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL); + zio_push_transform(dio, zio->io_abd, zio->io_size, 0, NULL); dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; } @@ -2784,13 +2840,13 @@ zio_ddt_write(zio_t *zio) ddt_phys_fill(ddp, bp); ddt_phys_addref(ddp); } else { - cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, + cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, zio->io_orig_size, zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL, NULL, zio_ddt_child_write_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); - zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL); + zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL); dde->dde_lead_zio[p] = cio; } @@ -3130,11 +3186,11 @@ zio_vdev_io_start(zio_t *zio) P2PHASE(zio->io_size, align) != 0) { /* Transform logical writes to be a full physical block size. 
*/ uint64_t asize = P2ROUNDUP(zio->io_size, align); - char *abuf = zio_buf_alloc(asize); + abd_t *abuf = abd_alloc_sametype(zio->io_abd, asize); ASSERT(vd == vd->vdev_top); if (zio->io_type == ZIO_TYPE_WRITE) { - bcopy(zio->io_data, abuf, zio->io_size); - bzero(abuf + zio->io_size, asize - zio->io_size); + abd_copy(abuf, zio->io_abd, zio->io_size); + abd_zero_off(abuf, zio->io_size, asize - zio->io_size); } zio_push_transform(zio, abuf, asize, asize, zio_subblock); } @@ -3264,7 +3320,7 @@ zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) { void *buf = zio_buf_alloc(zio->io_size); - bcopy(zio->io_data, buf, zio->io_size); + abd_copy_to_buf(buf, zio->io_abd, zio->io_size); zcr->zcr_cbinfo = zio->io_size; zcr->zcr_cbdata = buf; @@ -3398,7 +3454,7 @@ zio_checksum_generate(zio_t *zio) } } - zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size); + zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size); return (ZIO_PIPELINE_CONTINUE); } @@ -3537,7 +3593,7 @@ zio_ready(zio_t *zio) if (BP_IS_GANG(bp)) { zio->io_flags &= ~ZIO_FLAG_NODATA; } else { - ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE); + ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE); zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } } @@ -3616,6 +3672,7 @@ zio_done(zio_t *zio) * Always attempt to keep stack usage minimal here since * we can be called recurisvely up to 19 levels deep. 
*/ + uint64_t psize = zio->io_size; zio_t *pio, *pio_next; int c, w; zio_link_t *zl = NULL; @@ -3696,28 +3753,35 @@ zio_done(zio_t *zio) while (zio->io_cksum_report != NULL) { zio_cksum_report_t *zcr = zio->io_cksum_report; uint64_t align = zcr->zcr_align; - uint64_t asize = P2ROUNDUP(zio->io_size, align); - char *abuf = zio->io_data; - - if (asize != zio->io_size) { - abuf = zio_buf_alloc(asize); - bcopy(zio->io_data, abuf, zio->io_size); - bzero(abuf+zio->io_size, asize-zio->io_size); + uint64_t asize = P2ROUNDUP(psize, align); + char *abuf = NULL; + abd_t *adata = zio->io_abd; + + if (asize != psize) { + adata = abd_alloc_linear(asize, B_TRUE); + abd_copy(adata, zio->io_abd, psize); + abd_zero_off(adata, psize, asize - psize); } + if (adata != NULL) + abuf = abd_borrow_buf_copy(adata, asize); + zio->io_cksum_report = zcr->zcr_next; zcr->zcr_next = NULL; zcr->zcr_finish(zcr, abuf); zfs_ereport_free_checksum(zcr); - if (asize != zio->io_size) - zio_buf_free(abuf, asize); + if (adata != NULL) + abd_return_buf(adata, abuf, asize); + + if (asize != psize) + abd_free(adata); } } zio_pop_transforms(zio); /* note: may set zio->io_error */ - vdev_stat_update(zio, zio->io_size); + vdev_stat_update(zio, psize); /* * If this I/O is attached to a particular vdev is slow, exceeding @@ -4098,7 +4162,6 @@ zbookmark_subtree_completed(const dnode_phys_t *dnp, EXPORT_SYMBOL(zio_type_name); EXPORT_SYMBOL(zio_buf_alloc); EXPORT_SYMBOL(zio_data_buf_alloc); -EXPORT_SYMBOL(zio_buf_alloc_flags); EXPORT_SYMBOL(zio_buf_free); EXPORT_SYMBOL(zio_data_buf_free); diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c index d3d2f05a8..37116f049 100644 --- a/module/zfs/zio_checksum.c +++ b/module/zfs/zio_checksum.c @@ -20,8 +20,8 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. 
All rights reserved. */ #include <sys/zfs_context.h> @@ -30,6 +30,7 @@ #include <sys/zio.h> #include <sys/zio_checksum.h> #include <sys/zil.h> +#include <sys/abd.h> #include <zfs_fletcher.h> /* @@ -92,45 +93,85 @@ /*ARGSUSED*/ static void -zio_checksum_off(const void *buf, uint64_t size, - const void *ctx_template, zio_cksum_t *zcp) +abd_checksum_off(abd_t *abd, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) { ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); } +/*ARGSUSED*/ +void +abd_fletcher_2_native(abd_t *abd, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + fletcher_init(zcp); + (void) abd_iterate_func(abd, 0, size, + fletcher_2_incremental_native, zcp); +} + +/*ARGSUSED*/ +void +abd_fletcher_2_byteswap(abd_t *abd, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + fletcher_init(zcp); + (void) abd_iterate_func(abd, 0, size, + fletcher_2_incremental_byteswap, zcp); +} + +/*ARGSUSED*/ +void +abd_fletcher_4_native(abd_t *abd, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + fletcher_init(zcp); + (void) abd_iterate_func(abd, 0, size, + fletcher_4_incremental_native, zcp); +} + +/*ARGSUSED*/ +void +abd_fletcher_4_byteswap(abd_t *abd, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + fletcher_init(zcp); + (void) abd_iterate_func(abd, 0, size, + fletcher_4_incremental_byteswap, zcp); +} + zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { {{NULL, NULL}, NULL, NULL, 0, "inherit"}, {{NULL, NULL}, NULL, NULL, 0, "on"}, - {{zio_checksum_off, zio_checksum_off}, + {{abd_checksum_off, abd_checksum_off}, NULL, NULL, 0, "off"}, - {{zio_checksum_SHA256, zio_checksum_SHA256}, + {{abd_checksum_SHA256, abd_checksum_SHA256}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"}, - {{zio_checksum_SHA256, zio_checksum_SHA256}, + {{abd_checksum_SHA256, abd_checksum_SHA256}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"}, - {{fletcher_2_native, 
fletcher_2_byteswap}, + {{abd_fletcher_2_native, abd_fletcher_2_byteswap}, NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog"}, - {{fletcher_2_native, fletcher_2_byteswap}, + {{abd_fletcher_2_native, abd_fletcher_2_byteswap}, NULL, NULL, 0, "fletcher2"}, - {{fletcher_4_native, fletcher_4_byteswap}, + {{abd_fletcher_4_native, abd_fletcher_4_byteswap}, NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"}, - {{zio_checksum_SHA256, zio_checksum_SHA256}, + {{abd_checksum_SHA256, abd_checksum_SHA256}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | ZCHECKSUM_FLAG_NOPWRITE, "sha256"}, - {{fletcher_4_native, fletcher_4_byteswap}, + {{abd_fletcher_4_native, abd_fletcher_4_byteswap}, NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog2"}, - {{zio_checksum_off, zio_checksum_off}, + {{abd_checksum_off, abd_checksum_off}, NULL, NULL, 0, "noparity"}, - {{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap}, + {{abd_checksum_SHA512_native, abd_checksum_SHA512_byteswap}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | ZCHECKSUM_FLAG_NOPWRITE, "sha512"}, - {{zio_checksum_skein_native, zio_checksum_skein_byteswap}, - zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free, + {{abd_checksum_skein_native, abd_checksum_skein_byteswap}, + abd_checksum_skein_tmpl_init, abd_checksum_skein_tmpl_free, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"}, - {{zio_checksum_edonr_native, zio_checksum_edonr_byteswap}, - zio_checksum_edonr_tmpl_init, zio_checksum_edonr_tmpl_free, + {{abd_checksum_edonr_native, abd_checksum_edonr_byteswap}, + abd_checksum_edonr_tmpl_init, abd_checksum_edonr_tmpl_free, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"}, }; @@ -251,7 +292,7 @@ zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa) */ void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, - void *data, uint64_t size) + abd_t *abd, uint64_t size) { blkptr_t *bp = zio->io_bp; 
uint64_t offset = zio->io_offset; @@ -266,6 +307,7 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { zio_eck_t *eck; + void *data = abd_to_buf(abd); if (checksum == ZIO_CHECKSUM_ZILOG2) { zil_chain_t *zilc = data; @@ -283,18 +325,18 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, else bp->blk_cksum = eck->zec_cksum; eck->zec_magic = ZEC_MAGIC; - ci->ci_func[0](data, size, spa->spa_cksum_tmpls[checksum], + ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum], &cksum); eck->zec_cksum = cksum; } else { - ci->ci_func[0](data, size, spa->spa_cksum_tmpls[checksum], + ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum], &bp->blk_cksum); } } int zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum, - void *data, uint64_t size, uint64_t offset, zio_bad_cksum_t *info) + abd_t *abd, uint64_t size, uint64_t offset, zio_bad_cksum_t *info) { zio_checksum_info_t *ci = &zio_checksum_table[checksum]; int byteswap; @@ -308,25 +350,32 @@ zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum, if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { zio_eck_t *eck; zio_cksum_t verifier; + size_t eck_offset; + uint64_t data_size = size; + void *data = abd_borrow_buf_copy(abd, data_size); if (checksum == ZIO_CHECKSUM_ZILOG2) { zil_chain_t *zilc = data; uint64_t nused; eck = &zilc->zc_eck; - if (eck->zec_magic == ZEC_MAGIC) + if (eck->zec_magic == ZEC_MAGIC) { nused = zilc->zc_nused; - else if (eck->zec_magic == BSWAP_64(ZEC_MAGIC)) + } else if (eck->zec_magic == BSWAP_64(ZEC_MAGIC)) { nused = BSWAP_64(zilc->zc_nused); - else + } else { + abd_return_buf(abd, data, data_size); return (SET_ERROR(ECKSUM)); + } - if (nused > size) + if (nused > data_size) { + abd_return_buf(abd, data, data_size); return (SET_ERROR(ECKSUM)); + } size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t); } else { - eck = (zio_eck_t *)((char *)data + size) - 1; + eck = (zio_eck_t *)((char *)data 
+ data_size) - 1; } if (checksum == ZIO_CHECKSUM_GANG_HEADER) @@ -341,11 +390,15 @@ zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum, if (byteswap) byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); + eck_offset = (size_t)(&eck->zec_cksum) - (size_t)data; expected_cksum = eck->zec_cksum; eck->zec_cksum = verifier; - ci->ci_func[byteswap](data, size, + abd_return_buf_copy(abd, data, data_size); + + ci->ci_func[byteswap](abd, size, spa->spa_cksum_tmpls[checksum], &actual_cksum); - eck->zec_cksum = expected_cksum; + abd_copy_from_buf_off(abd, &expected_cksum, + eck_offset, sizeof (zio_cksum_t)); if (byteswap) { byteswap_uint64_array(&expected_cksum, @@ -354,7 +407,7 @@ zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum, } else { byteswap = BP_SHOULD_BYTESWAP(bp); expected_cksum = bp->blk_cksum; - ci->ci_func[byteswap](data, size, + ci->ci_func[byteswap](abd, size, spa->spa_cksum_tmpls[checksum], &actual_cksum); } @@ -383,7 +436,7 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) uint64_t size = (bp == NULL ? zio->io_size : (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp))); uint64_t offset = zio->io_offset; - void *data = zio->io_data; + abd_t *data = zio->io_abd; spa_t *spa = zio->io_spa; error = zio_checksum_error_impl(spa, bp, checksum, data, size, diff --git a/module/zfs/zio_compress.c b/module/zfs/zio_compress.c index 6b8d6c39b..7e44d16e4 100644 --- a/module/zfs/zio_compress.c +++ b/module/zfs/zio_compress.c @@ -28,7 +28,7 @@ */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -41,24 +41,23 @@ /* * Compression vectors. 
*/ - zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { - {NULL, NULL, 0, "inherit"}, - {NULL, NULL, 0, "on"}, - {NULL, NULL, 0, "uncompressed"}, - {lzjb_compress, lzjb_decompress, 0, "lzjb"}, - {NULL, NULL, 0, "empty"}, - {gzip_compress, gzip_decompress, 1, "gzip-1"}, - {gzip_compress, gzip_decompress, 2, "gzip-2"}, - {gzip_compress, gzip_decompress, 3, "gzip-3"}, - {gzip_compress, gzip_decompress, 4, "gzip-4"}, - {gzip_compress, gzip_decompress, 5, "gzip-5"}, - {gzip_compress, gzip_decompress, 6, "gzip-6"}, - {gzip_compress, gzip_decompress, 7, "gzip-7"}, - {gzip_compress, gzip_decompress, 8, "gzip-8"}, - {gzip_compress, gzip_decompress, 9, "gzip-9"}, - {zle_compress, zle_decompress, 64, "zle"}, - {lz4_compress_zfs, lz4_decompress_zfs, 0, "lz4"}, + {"inherit", 0, NULL, NULL}, + {"on", 0, NULL, NULL}, + {"uncompressed", 0, NULL, NULL}, + {"lzjb", 0, lzjb_compress, lzjb_decompress}, + {"empty", 0, NULL, NULL}, + {"gzip-1", 1, gzip_compress, gzip_decompress}, + {"gzip-2", 2, gzip_compress, gzip_decompress}, + {"gzip-3", 3, gzip_compress, gzip_decompress}, + {"gzip-4", 4, gzip_compress, gzip_decompress}, + {"gzip-5", 5, gzip_compress, gzip_decompress}, + {"gzip-6", 6, gzip_compress, gzip_decompress}, + {"gzip-7", 7, gzip_compress, gzip_decompress}, + {"gzip-8", 8, gzip_compress, gzip_decompress}, + {"gzip-9", 9, gzip_compress, gzip_decompress}, + {"zle", 64, zle_compress, zle_decompress}, + {"lz4", 0, lz4_compress_zfs, lz4_decompress_zfs} }; enum zio_compress @@ -85,12 +84,26 @@ zio_compress_select(spa_t *spa, enum zio_compress child, return (result); } +/*ARGSUSED*/ +static int +zio_compress_zeroed_cb(void *data, size_t len, void *private) +{ + uint64_t *end = (uint64_t *)((char *)data + len); + uint64_t *word; + + for (word = data; word < end; word++) + if (*word != 0) + return (1); + + return (0); +} + size_t -zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len) +zio_compress_data(enum zio_compress c, abd_t *src, void *dst, 
size_t s_len) { - uint64_t *word, *word_end; size_t c_len, d_len; zio_compress_info_t *ci = &zio_compress_table[c]; + void *tmp; ASSERT((uint_t)c < ZIO_COMPRESS_FUNCTIONS); ASSERT((uint_t)c == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL); @@ -99,12 +112,7 @@ zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len) * If the data is all zeroes, we don't even need to allocate * a block for it. We indicate this by returning zero size. */ - word_end = (uint64_t *)((char *)src + s_len); - for (word = src; word < word_end; word++) - if (*word != 0) - break; - - if (word == word_end) + if (abd_iterate_func(src, 0, s_len, zio_compress_zeroed_cb, NULL) == 0) return (0); if (c == ZIO_COMPRESS_EMPTY) @@ -112,7 +120,11 @@ zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len) /* Compress at least 12.5% */ d_len = s_len - (s_len >> 3); - c_len = ci->ci_compress(src, dst, s_len, d_len, ci->ci_level); + + /* No compression algorithms can read from ABDs directly */ + tmp = abd_borrow_buf_copy(src, s_len); + c_len = ci->ci_compress(tmp, dst, s_len, d_len, ci->ci_level); + abd_return_buf(src, tmp, s_len); if (c_len > d_len) return (s_len); @@ -122,13 +134,23 @@ zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len) } int -zio_decompress_data(enum zio_compress c, void *src, void *dst, +zio_decompress_data_buf(enum zio_compress c, void *src, void *dst, size_t s_len, size_t d_len) { zio_compress_info_t *ci = &zio_compress_table[c]; - if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL) return (SET_ERROR(EINVAL)); return (ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level)); } + +int +zio_decompress_data(enum zio_compress c, abd_t *src, void *dst, + size_t s_len, size_t d_len) +{ + void *tmp = abd_borrow_buf_copy(src, s_len); + int ret = zio_decompress_data_buf(c, tmp, dst, s_len, d_len); + abd_return_buf(src, tmp, s_len); + + return (ret); +} |