aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--cmd/raidz_test/raidz_bench.c8
-rw-r--r--cmd/raidz_test/raidz_test.c53
-rw-r--r--cmd/raidz_test/raidz_test.h6
-rw-r--r--cmd/zdb/zdb.c48
-rw-r--r--cmd/zdb/zdb_il.c59
-rw-r--r--cmd/zpool/zpool_iter.c2
-rw-r--r--cmd/zpool/zpool_main.c6
-rw-r--r--cmd/ztest/ztest.c18
-rw-r--r--include/sys/Makefile.am1
-rw-r--r--include/sys/abd.h179
-rw-r--r--include/sys/arc_impl.h2
-rw-r--r--include/sys/ddt.h5
-rw-r--r--include/sys/sa.h2
-rw-r--r--include/sys/spa.h24
-rw-r--r--include/sys/spa_impl.h1
-rw-r--r--include/sys/vdev_impl.h3
-rw-r--r--include/sys/vdev_raidz_impl.h5
-rw-r--r--include/sys/zio.h30
-rw-r--r--include/sys/zio_checksum.h36
-rw-r--r--include/sys/zio_compress.h28
-rw-r--r--include/sys/zio_impl.h3
-rw-r--r--include/zfs_fletcher.h9
-rw-r--r--lib/libspl/Makefile.am1
-rw-r--r--lib/libspl/include/sys/param.h7
-rw-r--r--lib/libspl/page.c34
-rw-r--r--lib/libzfs/libzfs_sendrecv.c7
-rw-r--r--lib/libzpool/Makefile.am1
-rw-r--r--man/man5/zfs-module-parameters.52
-rw-r--r--module/zcommon/zfs_fletcher.c75
-rw-r--r--module/zfs/Makefile.in1
-rw-r--r--module/zfs/abd.c1543
-rw-r--r--module/zfs/arc.c388
-rw-r--r--module/zfs/blkptr.c2
-rw-r--r--module/zfs/dbuf.c23
-rw-r--r--module/zfs/ddt.c25
-rw-r--r--module/zfs/dmu.c14
-rw-r--r--module/zfs/dmu_send.c14
-rw-r--r--module/zfs/dsl_scan.c12
-rw-r--r--module/zfs/edonr_zfs.c24
-rw-r--r--module/zfs/sa.c25
-rw-r--r--module/zfs/sha256.c27
-rw-r--r--module/zfs/skein_zfs.c26
-rw-r--r--module/zfs/spa.c8
-rw-r--r--module/zfs/spa_misc.c3
-rw-r--r--module/zfs/spa_stats.c50
-rw-r--r--module/zfs/space_map.c8
-rw-r--r--module/zfs/txg.c37
-rw-r--r--module/zfs/vdev.c11
-rw-r--r--module/zfs/vdev_cache.c37
-rw-r--r--module/zfs/vdev_disk.c46
-rw-r--r--module/zfs/vdev_file.c17
-rw-r--r--module/zfs/vdev_label.c76
-rw-r--r--module/zfs/vdev_mirror.c14
-rw-r--r--module/zfs/vdev_queue.c24
-rw-r--r--module/zfs/vdev_raidz.c603
-rw-r--r--module/zfs/vdev_raidz_math.c23
-rw-r--r--module/zfs/vdev_raidz_math_aarch64_neon.c131
-rw-r--r--module/zfs/vdev_raidz_math_aarch64_neon_common.h29
-rw-r--r--module/zfs/vdev_raidz_math_aarch64_neonx2.c154
-rw-r--r--module/zfs/vdev_raidz_math_avx2.c113
-rw-r--r--module/zfs/vdev_raidz_math_avx512bw.c18
-rw-r--r--module/zfs/vdev_raidz_math_avx512f.c530
-rw-r--r--module/zfs/vdev_raidz_math_impl.h1842
-rw-r--r--module/zfs/vdev_raidz_math_scalar.c151
-rw-r--r--module/zfs/vdev_raidz_math_sse2.c120
-rw-r--r--module/zfs/vdev_raidz_math_ssse3.c118
-rw-r--r--module/zfs/zap_micro.c6
-rw-r--r--module/zfs/zfs_sa.c8
-rw-r--r--module/zfs/zil.c6
-rw-r--r--module/zfs/zio.c297
-rw-r--r--module/zfs/zio_checksum.c113
-rw-r--r--module/zfs/zio_compress.c80
-rw-r--r--module/zfs/zio_inject.c2
-rw-r--r--tests/runfiles/linux.run6
-rwxr-xr-xtests/zfs-tests/tests/functional/cli_root/zfs_set/mountpoint_003_pos.ksh50
-rwxr-xr-xtests/zfs-tests/tests/functional/cli_root/zfs_set/ro_props_001_pos.ksh2
-rwxr-xr-xtests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_004_pos.ksh5
77 files changed, 5160 insertions, 2357 deletions
diff --git a/cmd/raidz_test/raidz_bench.c b/cmd/raidz_test/raidz_bench.c
index f1710ccc7..7a18902eb 100644
--- a/cmd/raidz_test/raidz_bench.c
+++ b/cmd/raidz_test/raidz_bench.c
@@ -53,18 +53,18 @@ bench_init_raidz_map(void)
/*
* To permit larger column sizes these have to be done
- * allocated using aligned alloc instead of zio_data_buf_alloc
+ * allocated using aligned alloc instead of zio_abd_buf_alloc
*/
- zio_bench.io_data = raidz_alloc(max_data_size);
+ zio_bench.io_abd = raidz_alloc(max_data_size);
- init_zio_data(&zio_bench);
+ init_zio_abd(&zio_bench);
}
static void
bench_fini_raidz_maps(void)
{
/* tear down golden zio */
- raidz_free(zio_bench.io_data, max_data_size);
+ raidz_free(zio_bench.io_abd, max_data_size);
bzero(&zio_bench, sizeof (zio_t));
}
diff --git a/cmd/raidz_test/raidz_test.c b/cmd/raidz_test/raidz_test.c
index 0019ae84a..3e0a089fd 100644
--- a/cmd/raidz_test/raidz_test.c
+++ b/cmd/raidz_test/raidz_test.c
@@ -181,10 +181,10 @@ static void process_options(int argc, char **argv)
}
}
-#define DATA_COL(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_data)
+#define DATA_COL(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_abd)
#define DATA_COL_SIZE(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_size)
-#define CODE_COL(rm, i) ((rm)->rm_col[(i)].rc_data)
+#define CODE_COL(rm, i) ((rm)->rm_col[(i)].rc_abd)
#define CODE_COL_SIZE(rm, i) ((rm)->rm_col[(i)].rc_size)
static int
@@ -195,10 +195,9 @@ cmp_code(raidz_test_opts_t *opts, const raidz_map_t *rm, const int parity)
VERIFY(parity >= 1 && parity <= 3);
for (i = 0; i < parity; i++) {
- if (0 != memcmp(CODE_COL(rm, i), CODE_COL(opts->rm_golden, i),
- CODE_COL_SIZE(rm, i))) {
+ if (abd_cmp(CODE_COL(rm, i), CODE_COL(opts->rm_golden, i))
+ != 0) {
ret++;
-
LOG_OPT(D_DEBUG, opts,
"\nParity block [%d] different!\n", i);
}
@@ -213,8 +212,8 @@ cmp_data(raidz_test_opts_t *opts, raidz_map_t *rm)
int dcols = opts->rm_golden->rm_cols - raidz_parity(opts->rm_golden);
for (i = 0; i < dcols; i++) {
- if (0 != memcmp(DATA_COL(opts->rm_golden, i), DATA_COL(rm, i),
- DATA_COL_SIZE(opts->rm_golden, i))) {
+ if (abd_cmp(DATA_COL(opts->rm_golden, i), DATA_COL(rm, i))
+ != 0) {
ret++;
LOG_OPT(D_DEBUG, opts,
@@ -224,37 +223,41 @@ cmp_data(raidz_test_opts_t *opts, raidz_map_t *rm)
return (ret);
}
+static int
+init_rand(void *data, size_t size, void *private)
+{
+ int i;
+ int *dst = (int *) data;
+
+ for (i = 0; i < size / sizeof (int); i++)
+ dst[i] = rand_data[i];
+
+ return (0);
+}
+
static void
corrupt_colums(raidz_map_t *rm, const int *tgts, const int cnt)
{
int i;
- int *dst;
raidz_col_t *col;
for (i = 0; i < cnt; i++) {
col = &rm->rm_col[tgts[i]];
- dst = col->rc_data;
- for (i = 0; i < col->rc_size / sizeof (int); i++)
- dst[i] = rand();
+ abd_iterate_func(col->rc_abd, 0, col->rc_size, init_rand, NULL);
}
}
void
-init_zio_data(zio_t *zio)
+init_zio_abd(zio_t *zio)
{
- int i;
- int *dst = (int *) zio->io_data;
-
- for (i = 0; i < zio->io_size / sizeof (int); i++) {
- dst[i] = rand_data[i];
- }
+ abd_iterate_func(zio->io_abd, 0, zio->io_size, init_rand, NULL);
}
static void
fini_raidz_map(zio_t **zio, raidz_map_t **rm)
{
vdev_raidz_map_free(*rm);
- raidz_free((*zio)->io_data, (*zio)->io_size);
+ raidz_free((*zio)->io_abd, (*zio)->io_size);
umem_free(*zio, sizeof (zio_t));
*zio = NULL;
@@ -279,11 +282,11 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const int parity)
opts->zio_golden->io_offset = zio_test->io_offset = opts->rto_offset;
opts->zio_golden->io_size = zio_test->io_size = opts->rto_dsize;
- opts->zio_golden->io_data = raidz_alloc(opts->rto_dsize);
- zio_test->io_data = raidz_alloc(opts->rto_dsize);
+ opts->zio_golden->io_abd = raidz_alloc(opts->rto_dsize);
+ zio_test->io_abd = raidz_alloc(opts->rto_dsize);
- init_zio_data(opts->zio_golden);
- init_zio_data(zio_test);
+ init_zio_abd(opts->zio_golden);
+ init_zio_abd(zio_test);
VERIFY0(vdev_raidz_impl_set("original"));
@@ -326,8 +329,8 @@ init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity)
(*zio)->io_offset = 0;
(*zio)->io_size = alloc_dsize;
- (*zio)->io_data = raidz_alloc(alloc_dsize);
- init_zio_data(*zio);
+ (*zio)->io_abd = raidz_alloc(alloc_dsize);
+ init_zio_abd(*zio);
rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift,
total_ncols, parity);
diff --git a/cmd/raidz_test/raidz_test.h b/cmd/raidz_test/raidz_test.h
index b279d82f2..a7fd26b8b 100644
--- a/cmd/raidz_test/raidz_test.h
+++ b/cmd/raidz_test/raidz_test.h
@@ -104,11 +104,11 @@ static inline size_t ilog2(size_t a)
#define SEP "----------------\n"
-#define raidz_alloc(size) zio_data_buf_alloc(size)
-#define raidz_free(p, size) zio_data_buf_free(p, size)
+#define raidz_alloc(size) abd_alloc(size, B_FALSE)
+#define raidz_free(p, size) abd_free(p)
-void init_zio_data(zio_t *zio);
+void init_zio_abd(zio_t *zio);
void run_raidz_benchmark(void);
diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index b9b0b29bc..8379cec3e 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -59,6 +59,7 @@
#include <sys/arc.h>
#include <sys/ddt.h>
#include <sys/zfeature.h>
+#include <sys/abd.h>
#include <zfs_comutil.h>
#include <libzfs.h>
@@ -2464,7 +2465,7 @@ zdb_blkptr_done(zio_t *zio)
zdb_cb_t *zcb = zio->io_private;
zbookmark_phys_t *zb = &zio->io_bookmark;
- zio_data_buf_free(zio->io_data, zio->io_size);
+ abd_free(zio->io_abd);
mutex_enter(&spa->spa_scrub_lock);
spa->spa_scrub_inflight--;
@@ -2530,7 +2531,7 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
if (!BP_IS_EMBEDDED(bp) &&
(dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) {
size_t size = BP_GET_PSIZE(bp);
- void *data = zio_data_buf_alloc(size);
+ abd_t *abd = abd_alloc(size, B_FALSE);
int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;
/* If it's an intent log block, failure is expected. */
@@ -2543,7 +2544,7 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
spa->spa_scrub_inflight++;
mutex_exit(&spa->spa_scrub_lock);
- zio_nowait(zio_read(NULL, spa, bp, data, size,
+ zio_nowait(zio_read(NULL, spa, bp, abd, size,
zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb));
}
@@ -3321,6 +3322,13 @@ name:
return (NULL);
}
+/* ARGSUSED */
+static int
+random_get_pseudo_bytes_cb(void *buf, size_t len, void *unused)
+{
+ return (random_get_pseudo_bytes(buf, len));
+}
+
/*
* Read a block from a pool and print it out. The syntax of the
* block descriptor is:
@@ -3352,7 +3360,8 @@ zdb_read_block(char *thing, spa_t *spa)
uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0;
zio_t *zio;
vdev_t *vd;
- void *pbuf, *lbuf, *buf;
+ abd_t *pabd;
+ void *lbuf, *buf;
char *s, *p, *dup, *vdev, *flagstr;
int i, error;
@@ -3425,8 +3434,7 @@ zdb_read_block(char *thing, spa_t *spa)
psize = size;
lsize = size;
- /* Some 4K native devices require 4K buffer alignment */
- pbuf = umem_alloc_aligned(SPA_MAXBLOCKSIZE, PAGESIZE, UMEM_NOFAIL);
+ pabd = abd_alloc_linear(SPA_MAXBLOCKSIZE, B_FALSE);
lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
BP_ZERO(bp);
@@ -3454,15 +3462,15 @@ zdb_read_block(char *thing, spa_t *spa)
/*
* Treat this as a normal block read.
*/
- zio_nowait(zio_read(zio, spa, bp, pbuf, psize, NULL, NULL,
+ zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL,
ZIO_PRIORITY_SYNC_READ,
ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL));
} else {
/*
* Treat this as a vdev child I/O.
*/
- zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pbuf, psize,
- ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
+ zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd,
+ psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE |
ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL, NULL));
@@ -3485,13 +3493,13 @@ zdb_read_block(char *thing, spa_t *spa)
void *pbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
- bcopy(pbuf, pbuf2, psize);
+ abd_copy_to_buf(pbuf2, pabd, psize);
- VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf + psize,
- SPA_MAXBLOCKSIZE - psize) == 0);
+ VERIFY0(abd_iterate_func(pabd, psize, SPA_MAXBLOCKSIZE - psize,
+ random_get_pseudo_bytes_cb, NULL));
- VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize,
- SPA_MAXBLOCKSIZE - psize) == 0);
+ VERIFY0(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize,
+ SPA_MAXBLOCKSIZE - psize));
/*
* XXX - On the one hand, with SPA_MAXBLOCKSIZE at 16MB,
@@ -3506,10 +3514,10 @@ zdb_read_block(char *thing, spa_t *spa)
"Trying %05llx -> %05llx (%s)\n",
(u_longlong_t)psize, (u_longlong_t)lsize,
zio_compress_table[c].ci_name);
- if (zio_decompress_data(c, pbuf, lbuf,
- psize, lsize) == 0 &&
- zio_decompress_data(c, pbuf2, lbuf2,
- psize, lsize) == 0 &&
+ if (zio_decompress_data(c, pabd,
+ lbuf, psize, lsize) == 0 &&
+ zio_decompress_data_buf(c, pbuf2,
+ lbuf2, psize, lsize) == 0 &&
bcmp(lbuf, lbuf2, lsize) == 0)
break;
}
@@ -3527,7 +3535,7 @@ zdb_read_block(char *thing, spa_t *spa)
buf = lbuf;
size = lsize;
} else {
- buf = pbuf;
+ buf = abd_to_buf(pabd);
size = psize;
}
@@ -3545,7 +3553,7 @@ zdb_read_block(char *thing, spa_t *spa)
zdb_dump_block(thing, buf, size, flags);
out:
- umem_free(pbuf, SPA_MAXBLOCKSIZE);
+ abd_free(pabd);
umem_free(lbuf, SPA_MAXBLOCKSIZE);
free(dup);
}
diff --git a/cmd/zdb/zdb_il.c b/cmd/zdb/zdb_il.c
index 1501e879d..190bfee86 100644
--- a/cmd/zdb/zdb_il.c
+++ b/cmd/zdb/zdb_il.c
@@ -25,7 +25,7 @@
*/
/*
- * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
*/
/*
@@ -42,6 +42,7 @@
#include <sys/resource.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
+#include <sys/abd.h>
extern uint8_t dump_opt[256];
@@ -120,13 +121,29 @@ zil_prt_rec_rename(zilog_t *zilog, int txtype, lr_rename_t *lr)
}
/* ARGSUSED */
+static int
+zil_prt_rec_write_cb(void *data, size_t len, void *unused)
+{
+ char *cdata = data;
+ int i;
+
+ for (i = 0; i < len; i++) {
+ if (isprint(*cdata))
+ (void) printf("%c ", *cdata);
+ else
+ (void) printf("%2X", *cdata);
+ cdata++;
+ }
+ return (0);
+}
+
+/* ARGSUSED */
static void
zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
{
- char *data, *dlimit;
+ abd_t *data;
blkptr_t *bp = &lr->lr_blkptr;
zbookmark_phys_t zb;
- char *buf;
int verbose = MAX(dump_opt['d'], dump_opt['i']);
int error;
@@ -137,9 +154,6 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
if (txtype == TX_WRITE2 || verbose < 5)
return;
- if ((buf = malloc(SPA_MAXBLOCKSIZE)) == NULL)
- return;
-
if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
(void) printf("%shas blkptr, %s\n", prefix,
!BP_IS_HOLE(bp) &&
@@ -150,43 +164,38 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
if (BP_IS_HOLE(bp)) {
(void) printf("\t\t\tLSIZE 0x%llx\n",
(u_longlong_t)BP_GET_LSIZE(bp));
- bzero(buf, SPA_MAXBLOCKSIZE);
(void) printf("%s<hole>\n", prefix);
- goto exit;
+ return;
}
if (bp->blk_birth < zilog->zl_header->zh_claim_txg) {
(void) printf("%s<block already committed>\n", prefix);
- goto exit;
+ return;
}
SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os),
lr->lr_foid, ZB_ZIL_LEVEL,
lr->lr_offset / BP_GET_LSIZE(bp));
+ data = abd_alloc(BP_GET_LSIZE(bp), B_FALSE);
error = zio_wait(zio_read(NULL, zilog->zl_spa,
- bp, buf, BP_GET_LSIZE(bp), NULL, NULL,
+ bp, data, BP_GET_LSIZE(bp), NULL, NULL,
ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb));
if (error)
- goto exit;
- data = buf;
+ goto out;
} else {
- data = (char *)(lr + 1);
+ /* data is stored after the end of the lr_write record */
+ data = abd_alloc(lr->lr_length, B_FALSE);
+ abd_copy_from_buf(data, lr + 1, lr->lr_length);
}
- dlimit = data + MIN(lr->lr_length,
- (verbose < 6 ? 20 : SPA_MAXBLOCKSIZE));
-
(void) printf("%s", prefix);
- while (data < dlimit) {
- if (isprint(*data))
- (void) printf("%c ", *data);
- else
- (void) printf("%2hhX", *data);
- data++;
- }
+ (void) abd_iterate_func(data,
+ 0, MIN(lr->lr_length, (verbose < 6 ? 20 : SPA_MAXBLOCKSIZE)),
+ zil_prt_rec_write_cb, NULL);
(void) printf("\n");
-exit:
- free(buf);
+
+out:
+ abd_free(data);
}
/* ARGSUSED */
diff --git a/cmd/zpool/zpool_iter.c b/cmd/zpool/zpool_iter.c
index b397c7ec2..ac64e977e 100644
--- a/cmd/zpool/zpool_iter.c
+++ b/cmd/zpool/zpool_iter.c
@@ -447,7 +447,7 @@ vdev_cmd_data_list_t *
all_pools_for_each_vdev_run(int argc, char **argv, char *cmd)
{
vdev_cmd_data_list_t *vcdl;
- vcdl = safe_malloc(sizeof (vcdl));
+ vcdl = safe_malloc(sizeof (vdev_cmd_data_list_t));
vcdl->cmd = cmd;
/* Gather our list of all vdevs in all pools */
diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index e50acbd2a..0216484b5 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -1422,7 +1422,7 @@ max_width(zpool_handle_t *zhp, nvlist_t *nv, int depth, int max,
uint_t c, children;
int ret;
- name = zpool_vdev_name(g_zfs, zhp, nv, name_flags | VDEV_NAME_TYPE_ID);
+ name = zpool_vdev_name(g_zfs, zhp, nv, name_flags);
if (strlen(name) + depth > max)
max = strlen(name) + depth;
@@ -2066,7 +2066,7 @@ show_import(nvlist_t *config)
(void) printf(gettext(" config:\n\n"));
- cb.cb_namewidth = max_width(NULL, nvroot, 0, 0, 0);
+ cb.cb_namewidth = max_width(NULL, nvroot, 0, 0, VDEV_NAME_TYPE_ID);
if (cb.cb_namewidth < 10)
cb.cb_namewidth = 10;
@@ -5995,7 +5995,7 @@ status_callback(zpool_handle_t *zhp, void *data)
print_scan_status(ps);
cbp->cb_namewidth = max_width(zhp, nvroot, 0, 0,
- cbp->cb_name_flags);
+ cbp->cb_name_flags | VDEV_NAME_TYPE_ID);
if (cbp->cb_namewidth < 10)
cbp->cb_namewidth = 10;
diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c
index 2e4dae3a9..cab0ef734 100644
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
@@ -114,6 +114,7 @@
#include <sys/refcount.h>
#include <sys/zfeature.h>
#include <sys/dsl_userhold.h>
+#include <sys/abd.h>
#include <stdio.h>
#include <stdio_ext.h>
#include <stdlib.h>
@@ -193,6 +194,7 @@ extern uint64_t metaslab_gang_bang;
extern uint64_t metaslab_df_alloc_threshold;
extern int metaslab_preload_limit;
extern boolean_t zfs_compressed_arc_enabled;
+extern int zfs_abd_scatter_enabled;
static ztest_shared_opts_t *ztest_shared_opts;
static ztest_shared_opts_t ztest_opts;
@@ -5444,7 +5446,7 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
enum zio_checksum checksum = spa_dedup_checksum(spa);
dmu_buf_t *db;
dmu_tx_t *tx;
- void *buf;
+ abd_t *abd;
blkptr_t blk;
int copies = 2 * ZIO_DEDUPDITTO_MIN;
int i;
@@ -5525,14 +5527,14 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
* Damage the block. Dedup-ditto will save us when we read it later.
*/
psize = BP_GET_PSIZE(&blk);
- buf = zio_buf_alloc(psize);
- ztest_pattern_set(buf, psize, ~pattern);
+ abd = abd_alloc_linear(psize, B_TRUE);
+ ztest_pattern_set(abd_to_buf(abd), psize, ~pattern);
(void) zio_wait(zio_rewrite(NULL, spa, 0, &blk,
- buf, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
+ abd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL));
- zio_buf_free(buf, psize);
+ abd_free(abd);
(void) rw_unlock(&ztest_name_lock);
umem_free(od, sizeof (ztest_od_t));
@@ -5965,6 +5967,12 @@ ztest_resume_thread(void *arg)
*/
if (ztest_random(10) == 0)
zfs_compressed_arc_enabled = ztest_random(2);
+
+ /*
+ * Periodically change the zfs_abd_scatter_enabled setting.
+ */
+ if (ztest_random(10) == 0)
+ zfs_abd_scatter_enabled = ztest_random(2);
}
thread_exit();
diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am
index 37df6e1d2..956643801 100644
--- a/include/sys/Makefile.am
+++ b/include/sys/Makefile.am
@@ -1,6 +1,7 @@
SUBDIRS = fm fs crypto sysevent
COMMON_H = \
+ $(top_srcdir)/include/sys/abd.h \
$(top_srcdir)/include/sys/arc.h \
$(top_srcdir)/include/sys/arc_impl.h \
$(top_srcdir)/include/sys/avl.h \
diff --git a/include/sys/abd.h b/include/sys/abd.h
new file mode 100644
index 000000000..d2db7e199
--- /dev/null
+++ b/include/sys/abd.h
@@ -0,0 +1,179 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+#ifndef _ABD_H
+#define _ABD_H
+
+#include <sys/isa_defs.h>
+#include <sys/int_types.h>
+#include <sys/debug.h>
+#include <sys/refcount.h>
+#ifdef _KERNEL
+#include <linux/mm.h>
+#include <linux/bio.h>
+#include <sys/uio.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum abd_flags {
+ ABD_FLAG_LINEAR = 1 << 0, /* is buffer linear (or scattered)? */
+ ABD_FLAG_OWNER = 1 << 1, /* does it own its data buffers? */
+ ABD_FLAG_META = 1 << 2, /* does this represent FS metadata? */
+ ABD_FLAG_MULTI_ZONE = 1 << 3, /* pages split over memory zones */
+ ABD_FLAG_MULTI_CHUNK = 1 << 4 /* pages split over multiple chunks */
+} abd_flags_t;
+
+typedef struct abd {
+ abd_flags_t abd_flags;
+ uint_t abd_size; /* excludes scattered abd_offset */
+ struct abd *abd_parent;
+ refcount_t abd_children;
+ union {
+ struct abd_scatter {
+ uint_t abd_offset;
+ uint_t abd_nents;
+ struct scatterlist *abd_sgl;
+ } abd_scatter;
+ struct abd_linear {
+ void *abd_buf;
+ } abd_linear;
+ } abd_u;
+} abd_t;
+
+typedef int abd_iter_func_t(void *buf, size_t len, void *private);
+typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *private);
+
+extern int zfs_abd_scatter_enabled;
+
+static inline boolean_t
+abd_is_linear(abd_t *abd)
+{
+ return ((abd->abd_flags & ABD_FLAG_LINEAR) != 0);
+}
+
+/*
+ * Allocations and deallocations
+ */
+
+abd_t *abd_alloc(size_t, boolean_t);
+abd_t *abd_alloc_linear(size_t, boolean_t);
+abd_t *abd_alloc_for_io(size_t, boolean_t);
+abd_t *abd_alloc_sametype(abd_t *, size_t);
+void abd_free(abd_t *);
+abd_t *abd_get_offset(abd_t *, size_t);
+abd_t *abd_get_offset_size(abd_t *, size_t, size_t);
+abd_t *abd_get_from_buf(void *, size_t);
+void abd_put(abd_t *);
+
+/*
+ * Conversion to and from a normal buffer
+ */
+
+void *abd_to_buf(abd_t *);
+void *abd_borrow_buf(abd_t *, size_t);
+void *abd_borrow_buf_copy(abd_t *, size_t);
+void abd_return_buf(abd_t *, void *, size_t);
+void abd_return_buf_copy(abd_t *, void *, size_t);
+void abd_take_ownership_of_buf(abd_t *, boolean_t);
+void abd_release_ownership_of_buf(abd_t *);
+
+/*
+ * ABD operations
+ */
+
+int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *);
+int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t,
+ abd_iter_func2_t *, void *);
+void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t);
+void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t);
+void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t);
+int abd_cmp(abd_t *, abd_t *);
+int abd_cmp_buf_off(abd_t *, const void *, size_t, size_t);
+void abd_zero_off(abd_t *, size_t, size_t);
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+unsigned int abd_scatter_bio_map_off(struct bio *, abd_t *, unsigned int,
+ size_t);
+unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t);
+#endif
+
+void abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd,
+ ssize_t csize, ssize_t dsize, const unsigned parity,
+ void (*func_raidz_gen)(void **, const void *, size_t, size_t));
+void abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds,
+ ssize_t tsize, const unsigned parity,
+ void (*func_raidz_rec)(void **t, const size_t tsize, void **c,
+ const unsigned *mul),
+ const unsigned *mul);
+
+/*
+ * Wrappers for calls with offsets of 0
+ */
+
+static inline void
+abd_copy(abd_t *dabd, abd_t *sabd, size_t size)
+{
+ abd_copy_off(dabd, sabd, 0, 0, size);
+}
+
+static inline void
+abd_copy_from_buf(abd_t *abd, void *buf, size_t size)
+{
+ abd_copy_from_buf_off(abd, buf, 0, size);
+}
+
+static inline void
+abd_copy_to_buf(void* buf, abd_t *abd, size_t size)
+{
+ abd_copy_to_buf_off(buf, abd, 0, size);
+}
+
+static inline int
+abd_cmp_buf(abd_t *abd, void *buf, size_t size)
+{
+ return (abd_cmp_buf_off(abd, buf, 0, size));
+}
+
+static inline void
+abd_zero(abd_t *abd, size_t size)
+{
+ abd_zero_off(abd, 0, size);
+}
+
+/*
+ * Module lifecycle
+ */
+
+void abd_init(void);
+void abd_fini(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ABD_H */
diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h
index d2dc527fe..f5b7cb42a 100644
--- a/include/sys/arc_impl.h
+++ b/include/sys/arc_impl.h
@@ -166,7 +166,7 @@ typedef struct l1arc_buf_hdr {
refcount_t b_refcnt;
arc_callback_t *b_acb;
- void *b_pdata;
+ abd_t *b_pabd;
} l1arc_buf_hdr_t;
typedef struct l2arc_dev {
diff --git a/include/sys/ddt.h b/include/sys/ddt.h
index 3befcb844..667795f96 100644
--- a/include/sys/ddt.h
+++ b/include/sys/ddt.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
*/
#ifndef _SYS_DDT_H
@@ -35,6 +36,8 @@
extern "C" {
#endif
+struct abd;
+
/*
* On-disk DDT formats, in the desired search order (newest version first).
*/
@@ -108,7 +111,7 @@ struct ddt_entry {
ddt_key_t dde_key;
ddt_phys_t dde_phys[DDT_PHYS_TYPES];
zio_t *dde_lead_zio[DDT_PHYS_TYPES];
- void *dde_repair_data;
+ struct abd *dde_repair_abd;
enum ddt_type dde_type;
enum ddt_class dde_class;
uint8_t dde_loading;
diff --git a/include/sys/sa.h b/include/sys/sa.h
index 01d24662a..b7ed9fe38 100644
--- a/include/sys/sa.h
+++ b/include/sys/sa.h
@@ -134,8 +134,6 @@ int sa_bulk_lookup(sa_handle_t *, sa_bulk_attr_t *, int count);
int sa_bulk_lookup_locked(sa_handle_t *, sa_bulk_attr_t *, int count);
int sa_bulk_update(sa_handle_t *, sa_bulk_attr_t *, int count, dmu_tx_t *);
int sa_size(sa_handle_t *, sa_attr_type_t, int *);
-int sa_update_from_cb(sa_handle_t *, sa_attr_type_t,
- uint32_t buflen, sa_data_locator_t *, void *userdata, dmu_tx_t *);
void sa_object_info(sa_handle_t *, dmu_object_info_t *);
void sa_object_size(sa_handle_t *, uint32_t *, u_longlong_t *);
void *sa_get_userdata(sa_handle_t *);
diff --git a/include/sys/spa.h b/include/sys/spa.h
index 3d0b962e6..58520118e 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -416,15 +416,17 @@ _NOTE(CONSTCOND) } while (0)
#define BP_GET_FILL(bp) (BP_IS_EMBEDDED(bp) ? 1 : (bp)->blk_fill)
+#define BP_IS_METADATA(bp) \
+ (BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))
+
#define BP_GET_ASIZE(bp) \
(BP_IS_EMBEDDED(bp) ? 0 : \
DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
DVA_GET_ASIZE(&(bp)->blk_dva[2]))
-#define BP_GET_UCSIZE(bp) \
- ((BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) ? \
- BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
+#define BP_GET_UCSIZE(bp) \
+ (BP_IS_METADATA(bp) ? BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
#define BP_GET_NDVAS(bp) \
(BP_IS_EMBEDDED(bp) ? 0 : \
@@ -569,8 +571,7 @@ _NOTE(CONSTCOND) } while (0)
}
#define BP_GET_BUFC_TYPE(bp) \
- (((BP_GET_LEVEL(bp) > 0) || (DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))) ? \
- ARC_BUFC_METADATA : ARC_BUFC_DATA)
+ (BP_IS_METADATA(bp) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
typedef enum spa_import_type {
SPA_IMPORT_EXISTING,
@@ -585,7 +586,6 @@ extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot,
size_t buflen);
extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props,
nvlist_t *zplprops);
-extern int spa_import_rootpool(char *devpath, char *devid);
extern int spa_import(char *pool, nvlist_t *config, nvlist_t *props,
uint64_t flags);
extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
@@ -725,6 +725,13 @@ typedef enum txg_state {
TXG_STATE_COMMITTED = 5,
} txg_state_t;
+typedef struct txg_stat {
+ vdev_stat_t vs1;
+ vdev_stat_t vs2;
+ uint64_t txg;
+ uint64_t ndirty;
+} txg_stat_t;
+
extern void spa_stats_init(spa_t *spa);
extern void spa_stats_destroy(spa_t *spa);
extern void spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb,
@@ -732,8 +739,9 @@ extern void spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb,
extern void spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time);
extern int spa_txg_history_set(spa_t *spa, uint64_t txg,
txg_state_t completed_state, hrtime_t completed_time);
-extern int spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread,
- uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t ndirty);
+extern txg_stat_t *spa_txg_history_init_io(spa_t *, uint64_t,
+ struct dsl_pool *);
+extern void spa_txg_history_fini_io(spa_t *, txg_stat_t *);
extern void spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs);
/* Pool configuration locks */
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index 88bde98dc..a1bf14eac 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -239,6 +239,7 @@ struct spa {
uint64_t spa_autoexpand; /* lun expansion on/off */
ddt_t *spa_ddt[ZIO_CHECKSUM_FUNCTIONS]; /* in-core DDTs */
uint64_t spa_ddt_stat_object; /* DDT statistics */
+ uint64_t spa_dedup_dspace; /* Cache get_dedup_dspace() */
uint64_t spa_dedup_ditto; /* dedup ditto threshold */
uint64_t spa_dedup_checksum; /* default dedup checksum */
uint64_t spa_dspace; /* dspace in normal class */
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index b9a2d181b..d7f11a2b8 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -53,6 +53,7 @@ extern "C" {
typedef struct vdev_queue vdev_queue_t;
typedef struct vdev_cache vdev_cache_t;
typedef struct vdev_cache_entry vdev_cache_entry_t;
+struct abd;
extern int zfs_vdev_queue_depth_pct;
extern uint32_t zfs_vdev_async_write_max_active;
@@ -87,7 +88,7 @@ typedef const struct vdev_ops {
* Virtual device properties
*/
struct vdev_cache_entry {
- char *ve_data;
+ struct abd *ve_abd;
uint64_t ve_offset;
clock_t ve_lastused;
avl_node_t ve_offset_node;
diff --git a/include/sys/vdev_raidz_impl.h b/include/sys/vdev_raidz_impl.h
index 735b67764..b4663b97c 100644
--- a/include/sys/vdev_raidz_impl.h
+++ b/include/sys/vdev_raidz_impl.h
@@ -28,6 +28,7 @@
#include <sys/types.h>
#include <sys/debug.h>
#include <sys/kstat.h>
+#include <sys/abd.h>
#ifdef __cplusplus
extern "C" {
@@ -104,7 +105,7 @@ typedef struct raidz_col {
size_t rc_devidx; /* child device index for I/O */
size_t rc_offset; /* device offset */
size_t rc_size; /* I/O size */
- void *rc_data; /* I/O data */
+ abd_t *rc_abd; /* I/O data */
void *rc_gdata; /* used to store the "good" version */
int rc_error; /* I/O error for this device */
unsigned int rc_tried; /* Did we attempt this I/O column? */
@@ -121,7 +122,7 @@ typedef struct raidz_map {
size_t rm_firstdatacol; /* First data column/parity count */
size_t rm_nskip; /* Skipped sectors for padding */
size_t rm_skipstart; /* Column index of padding start */
- void *rm_datacopy; /* rm_asize-buffer of copied data */
+ abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */
size_t rm_reports; /* # of referencing checksum reports */
unsigned int rm_freed; /* map no longer has referencing ZIO */
unsigned int rm_ecksuminjected; /* checksum error was injected */
diff --git a/include/sys/zio.h b/include/sys/zio.h
index 864e8b2be..6c5153dcf 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -301,6 +301,7 @@ typedef void zio_cksum_free_f(void *cbdata, size_t size);
struct zio_bad_cksum; /* defined in zio_checksum.h */
struct dnode_phys;
+struct abd;
struct zio_cksum_report {
struct zio_cksum_report *zcr_next;
@@ -333,12 +334,12 @@ typedef struct zio_gang_node {
} zio_gang_node_t;
typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp,
- zio_gang_node_t *gn, void *data);
+ zio_gang_node_t *gn, struct abd *data, uint64_t offset);
-typedef void zio_transform_func_t(zio_t *zio, void *data, uint64_t size);
+typedef void zio_transform_func_t(zio_t *zio, struct abd *data, uint64_t size);
typedef struct zio_transform {
- void *zt_orig_data;
+ struct abd *zt_orig_abd;
uint64_t zt_orig_size;
uint64_t zt_bufsize;
zio_transform_func_t *zt_transform;
@@ -396,8 +397,8 @@ struct zio {
uint64_t io_lsize;
/* Data represented by this I/O */
- void *io_data;
- void *io_orig_data;
+ struct abd *io_abd;
+ struct abd *io_orig_abd;
uint64_t io_size;
uint64_t io_orig_size;
@@ -455,19 +456,19 @@ extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
extern zio_t *zio_root(spa_t *spa,
zio_done_func_t *done, void *private, enum zio_flag flags);
-extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data,
- uint64_t lsize, zio_done_func_t *done, void *private,
+extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
+ struct abd *data, uint64_t lsize, zio_done_func_t *done, void *private,
zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb);
extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- void *data, uint64_t size, uint64_t psize, const zio_prop_t *zp,
+ struct abd *data, uint64_t size, uint64_t psize, const zio_prop_t *zp,
zio_done_func_t *ready, zio_done_func_t *children_ready,
zio_done_func_t *physdone, zio_done_func_t *done,
void *private, zio_priority_t priority, enum zio_flag flags,
const zbookmark_phys_t *zb);
extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- void *data, uint64_t size, zio_done_func_t *done, void *private,
+ struct abd *data, uint64_t size, zio_done_func_t *done, void *private,
zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb);
extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies,
@@ -483,12 +484,12 @@ extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
zio_done_func_t *done, void *private, enum zio_flag flags);
extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
- uint64_t size, void *data, int checksum,
+ uint64_t size, struct abd *data, int checksum,
zio_done_func_t *done, void *private, zio_priority_t priority,
enum zio_flag flags, boolean_t labels);
extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
- uint64_t size, void *data, int checksum,
+ uint64_t size, struct abd *data, int checksum,
zio_done_func_t *done, void *private, zio_priority_t priority,
enum zio_flag flags, boolean_t labels);
@@ -517,21 +518,20 @@ extern void *zio_buf_alloc(size_t size);
extern void zio_buf_free(void *buf, size_t size);
extern void *zio_data_buf_alloc(size_t size);
extern void zio_data_buf_free(void *buf, size_t size);
-extern void *zio_buf_alloc_flags(size_t size, int flags);
-extern void zio_push_transform(zio_t *zio, void *data, uint64_t size,
+extern void zio_push_transform(zio_t *zio, struct abd *abd, uint64_t size,
uint64_t bufsize, zio_transform_func_t *transform);
extern void zio_pop_transforms(zio_t *zio);
extern void zio_resubmit_stage_async(void *);
extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
- uint64_t offset, void *data, uint64_t size, int type,
+ uint64_t offset, struct abd *data, uint64_t size, int type,
zio_priority_t priority, enum zio_flag flags,
zio_done_func_t *done, void *private);
extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset,
- void *data, uint64_t size, int type, zio_priority_t priority,
+ struct abd *data, uint64_t size, int type, zio_priority_t priority,
enum zio_flag flags, zio_done_func_t *done, void *private);
extern void zio_vdev_io_bypass(zio_t *zio);
diff --git a/include/sys/zio_checksum.h b/include/sys/zio_checksum.h
index b4c2c8c08..a6cafc9b2 100644
--- a/include/sys/zio_checksum.h
+++ b/include/sys/zio_checksum.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
* Copyright Saso Kiselkov 2013, All rights reserved.
*/
@@ -34,12 +34,12 @@
extern "C" {
#endif
+struct abd;
+
/*
* Signature for checksum functions.
*/
-typedef void zio_checksum_func_t(const void *, uint64_t, const void *,
- zio_cksum_t *);
-typedef void zio_checksum_t(const void *data, uint64_t size,
+typedef void zio_checksum_t(struct abd *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp);
typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
typedef void zio_checksum_tmpl_free_t(void *ctx_template);
@@ -83,28 +83,28 @@ extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS];
/*
* Checksum routines.
*/
-extern zio_checksum_t zio_checksum_SHA256;
-extern zio_checksum_t zio_checksum_SHA512_native;
-extern zio_checksum_t zio_checksum_SHA512_byteswap;
+extern zio_checksum_t abd_checksum_SHA256;
+extern zio_checksum_t abd_checksum_SHA512_native;
+extern zio_checksum_t abd_checksum_SHA512_byteswap;
/* Skein */
-extern zio_checksum_t zio_checksum_skein_native;
-extern zio_checksum_t zio_checksum_skein_byteswap;
-extern zio_checksum_tmpl_init_t zio_checksum_skein_tmpl_init;
-extern zio_checksum_tmpl_free_t zio_checksum_skein_tmpl_free;
+extern zio_checksum_t abd_checksum_skein_native;
+extern zio_checksum_t abd_checksum_skein_byteswap;
+extern zio_checksum_tmpl_init_t abd_checksum_skein_tmpl_init;
+extern zio_checksum_tmpl_free_t abd_checksum_skein_tmpl_free;
/* Edon-R */
-extern zio_checksum_t zio_checksum_edonr_native;
-extern zio_checksum_t zio_checksum_edonr_byteswap;
-extern zio_checksum_tmpl_init_t zio_checksum_edonr_tmpl_init;
-extern zio_checksum_tmpl_free_t zio_checksum_edonr_tmpl_free;
+extern zio_checksum_t abd_checksum_edonr_native;
+extern zio_checksum_t abd_checksum_edonr_byteswap;
+extern zio_checksum_tmpl_init_t abd_checksum_edonr_tmpl_init;
+extern zio_checksum_tmpl_free_t abd_checksum_edonr_tmpl_free;
extern int zio_checksum_equal(spa_t *, blkptr_t *, enum zio_checksum,
void *, uint64_t, uint64_t, zio_bad_cksum_t *);
-extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
- void *data, uint64_t size);
+extern void zio_checksum_compute(zio_t *, enum zio_checksum,
+ struct abd *, uint64_t);
extern int zio_checksum_error_impl(spa_t *, blkptr_t *, enum zio_checksum,
- void *, uint64_t, uint64_t, zio_bad_cksum_t *);
+ struct abd *, uint64_t, uint64_t, zio_bad_cksum_t *);
extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out);
extern enum zio_checksum spa_dedup_checksum(spa_t *spa);
extern void zio_checksum_templates_free(spa_t *spa);
diff --git a/include/sys/zio_compress.h b/include/sys/zio_compress.h
index da58ef7aa..1642823d3 100644
--- a/include/sys/zio_compress.h
+++ b/include/sys/zio_compress.h
@@ -22,12 +22,14 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright (c) 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2015, 2016 by Delphix. All rights reserved.
*/
#ifndef _SYS_ZIO_COMPRESS_H
#define _SYS_ZIO_COMPRESS_H
+#include <sys/abd.h>
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -60,13 +62,20 @@ typedef int zio_decompress_func_t(void *src, void *dst,
size_t s_len, size_t d_len, int);
/*
+ * Common signature for all zio decompress functions using an ABD as input.
+ * This is helpful if you have both compressed ARC and scatter ABDs enabled,
+ * but is not a requirement for all compression algorithms.
+ */
+typedef int zio_decompress_abd_func_t(abd_t *src, void *dst,
+ size_t s_len, size_t d_len, int);
+/*
* Information about each compression function.
*/
typedef const struct zio_compress_info {
- zio_compress_func_t *ci_compress; /* compression function */
- zio_decompress_func_t *ci_decompress; /* decompression function */
- int ci_level; /* level parameter */
- char *ci_name; /* algorithm name */
+ char *ci_name;
+ int ci_level;
+ zio_compress_func_t *ci_compress;
+ zio_decompress_func_t *ci_decompress;
} zio_compress_info_t;
extern zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS];
@@ -96,13 +105,16 @@ extern size_t lz4_compress_zfs(void *src, void *dst, size_t s_len, size_t d_len,
int level);
extern int lz4_decompress_zfs(void *src, void *dst, size_t s_len, size_t d_len,
int level);
-
+extern int lz4_decompress_abd(abd_t *src, void *dst, size_t s_len, size_t d_len,
+ int level);
/*
* Compress and decompress data if necessary.
*/
-extern size_t zio_compress_data(enum zio_compress c, void *src, void *dst,
+extern size_t zio_compress_data(enum zio_compress c, abd_t *src, void *dst,
size_t s_len);
-extern int zio_decompress_data(enum zio_compress c, void *src, void *dst,
+extern int zio_decompress_data(enum zio_compress c, abd_t *src, void *dst,
+ size_t s_len, size_t d_len);
+extern int zio_decompress_data_buf(enum zio_compress c, void *src, void *dst,
size_t s_len, size_t d_len);
#ifdef __cplusplus
diff --git a/include/sys/zio_impl.h b/include/sys/zio_impl.h
index a36749a30..4d56e9066 100644
--- a/include/sys/zio_impl.h
+++ b/include/sys/zio_impl.h
@@ -30,9 +30,6 @@
#ifndef _ZIO_IMPL_H
#define _ZIO_IMPL_H
-#include <sys/zfs_context.h>
-#include <sys/zio.h>
-
#ifdef __cplusplus
extern "C" {
#endif
diff --git a/include/zfs_fletcher.h b/include/zfs_fletcher.h
index 633606d14..5c7a61c56 100644
--- a/include/zfs_fletcher.h
+++ b/include/zfs_fletcher.h
@@ -48,15 +48,16 @@ extern "C" {
* checksum method is added. This method will ignore last (size % 4) bytes of
* the data buffer.
*/
+void fletcher_init(zio_cksum_t *);
void fletcher_2_native(const void *, uint64_t, const void *, zio_cksum_t *);
void fletcher_2_byteswap(const void *, uint64_t, const void *, zio_cksum_t *);
void fletcher_4_native(const void *, uint64_t, const void *, zio_cksum_t *);
+int fletcher_2_incremental_native(void *, size_t, void *);
+int fletcher_2_incremental_byteswap(void *, size_t, void *);
void fletcher_4_native_varsize(const void *, uint64_t, zio_cksum_t *);
void fletcher_4_byteswap(const void *, uint64_t, const void *, zio_cksum_t *);
-void fletcher_4_incremental_native(const void *, uint64_t,
- zio_cksum_t *);
-void fletcher_4_incremental_byteswap(const void *, uint64_t,
- zio_cksum_t *);
+int fletcher_4_incremental_native(void *, size_t, void *);
+int fletcher_4_incremental_byteswap(void *, size_t, void *);
int fletcher_4_impl_set(const char *selector);
void fletcher_4_init(void);
void fletcher_4_fini(void);
diff --git a/lib/libspl/Makefile.am b/lib/libspl/Makefile.am
index afd64fcca..3c99529f1 100644
--- a/lib/libspl/Makefile.am
+++ b/lib/libspl/Makefile.am
@@ -24,6 +24,7 @@ USER_C = \
getmntany.c \
list.c \
mkdirp.c \
+ page.c \
strlcat.c \
strlcpy.c \
strnlen.c \
diff --git a/lib/libspl/include/sys/param.h b/lib/libspl/include/sys/param.h
index 9f362dd8b..c22d508f9 100644
--- a/lib/libspl/include/sys/param.h
+++ b/lib/libspl/include/sys/param.h
@@ -57,8 +57,11 @@
#define MAXUID UINT32_MAX /* max user id */
#define MAXPROJID MAXUID /* max project id */
-#ifndef PAGESIZE
-#define PAGESIZE (sysconf(_SC_PAGESIZE))
+#ifdef PAGESIZE
+#undef PAGESIZE
#endif /* PAGESIZE */
+extern size_t spl_pagesize(void);
+#define PAGESIZE (spl_pagesize())
+
#endif
diff --git a/lib/libspl/page.c b/lib/libspl/page.c
new file mode 100644
index 000000000..06d9fcfa0
--- /dev/null
+++ b/lib/libspl/page.c
@@ -0,0 +1,34 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#include <unistd.h>
+
+size_t pagesize = 0;
+
+size_t
+spl_pagesize(void)
+{
+ if (pagesize == 0)
+ pagesize = sysconf(_SC_PAGESIZE);
+
+ return (pagesize);
+}
diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c
index 8926d1173..2334245c1 100644
--- a/lib/libzfs/libzfs_sendrecv.c
+++ b/lib/libzfs/libzfs_sendrecv.c
@@ -366,11 +366,12 @@ cksummer(void *arg)
if (ZIO_CHECKSUM_EQUAL(drrw->drr_key.ddk_cksum,
zero_cksum) ||
!DRR_IS_DEDUP_CAPABLE(drrw->drr_checksumflags)) {
- SHA256_CTX ctx;
+ SHA2_CTX ctx;
zio_cksum_t tmpsha256;
- zio_checksum_SHA256(buf,
- payload_size, &ctx, &tmpsha256);
+ SHA2Init(SHA256, &ctx);
+ SHA2Update(&ctx, buf, payload_size);
+ SHA2Final(&tmpsha256, &ctx);
drrw->drr_key.ddk_cksum.zc_word[0] =
BE_64(tmpsha256.zc_word[0]);
diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am
index b02555708..40c460284 100644
--- a/lib/libzpool/Makefile.am
+++ b/lib/libzpool/Makefile.am
@@ -33,6 +33,7 @@ KERNEL_C = \
zfs_uio.c \
zpool_prop.c \
zprop_common.c \
+ abd.c \
arc.c \
blkptr.c \
bplist.c \
diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5
index 24398fea0..5ed200081 100644
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@@ -1863,7 +1863,7 @@ Default value: \fB30,000\fR.
Throttle block allocations in the ZIO pipeline. This allows for
dynamic allocation distribution when devices are imbalanced.
.sp
-Default value: \fB0\fR.
+Default value: \fB1\fR.
.RE
.sp
diff --git a/module/zcommon/zfs_fletcher.c b/module/zcommon/zfs_fletcher.c
index 9c2f9c00f..fb0a14991 100644
--- a/module/zcommon/zfs_fletcher.c
+++ b/module/zcommon/zfs_fletcher.c
@@ -28,6 +28,10 @@
*/
/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+/*
* Fletcher Checksums
* ------------------
*
@@ -219,14 +223,26 @@ static boolean_t fletcher_4_initialized = B_FALSE;
/*ARGSUSED*/
void
-fletcher_2_native(const void *buf, uint64_t size,
- const void *ctx_template, zio_cksum_t *zcp)
+fletcher_init(zio_cksum_t *zcp)
+{
+ ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
+}
+
+int
+fletcher_2_incremental_native(void *buf, size_t size, void *data)
{
+ zio_cksum_t *zcp = data;
+
const uint64_t *ip = buf;
const uint64_t *ipend = ip + (size / sizeof (uint64_t));
uint64_t a0, b0, a1, b1;
- for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) {
+ a0 = zcp->zc_word[0];
+ a1 = zcp->zc_word[1];
+ b0 = zcp->zc_word[2];
+ b1 = zcp->zc_word[3];
+
+ for (; ip < ipend; ip += 2) {
a0 += ip[0];
a1 += ip[1];
b0 += a0;
@@ -234,18 +250,33 @@ fletcher_2_native(const void *buf, uint64_t size,
}
ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
+ return (0);
}
/*ARGSUSED*/
void
-fletcher_2_byteswap(const void *buf, uint64_t size,
+fletcher_2_native(const void *buf, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
+ fletcher_init(zcp);
+ (void) fletcher_2_incremental_native((void *) buf, size, zcp);
+}
+
+int
+fletcher_2_incremental_byteswap(void *buf, size_t size, void *data)
+{
+ zio_cksum_t *zcp = data;
+
const uint64_t *ip = buf;
const uint64_t *ipend = ip + (size / sizeof (uint64_t));
uint64_t a0, b0, a1, b1;
- for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) {
+ a0 = zcp->zc_word[0];
+ a1 = zcp->zc_word[1];
+ b0 = zcp->zc_word[2];
+ b1 = zcp->zc_word[3];
+
+ for (; ip < ipend; ip += 2) {
a0 += BSWAP_64(ip[0]);
a1 += BSWAP_64(ip[1]);
b0 += a0;
@@ -253,6 +284,16 @@ fletcher_2_byteswap(const void *buf, uint64_t size,
}
ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
+ return (0);
+}
+
+/*ARGSUSED*/
+void
+fletcher_2_byteswap(const void *buf, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ fletcher_init(zcp);
+ (void) fletcher_2_incremental_byteswap((void *) buf, size, zcp);
}
static void
@@ -523,25 +564,28 @@ fletcher_4_incremental_impl(boolean_t native, const void *buf, uint64_t size,
}
}
-void
-fletcher_4_incremental_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
+int
+fletcher_4_incremental_native(void *buf, size_t size, void *data)
{
+ zio_cksum_t *zcp = data;
/* Use scalar impl to directly update cksum of small blocks */
if (size < SPA_MINBLOCKSIZE)
fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size);
else
fletcher_4_incremental_impl(B_TRUE, buf, size, zcp);
+ return (0);
}
-void
-fletcher_4_incremental_byteswap(const void *buf, uint64_t size,
- zio_cksum_t *zcp)
+int
+fletcher_4_incremental_byteswap(void *buf, size_t size, void *data)
{
+ zio_cksum_t *zcp = data;
/* Use scalar impl to directly update cksum of small blocks */
if (size < SPA_MINBLOCKSIZE)
fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp, buf, size);
else
fletcher_4_incremental_impl(B_FALSE, buf, size, zcp);
+ return (0);
}
@@ -607,6 +651,9 @@ fletcher_4_kstat_addr(kstat_t *ksp, loff_t n)
#define FLETCHER_4_BENCH_NS (MSEC2NSEC(50)) /* 50ms */
+typedef void fletcher_checksum_func_t(const void *, uint64_t, const void *,
+ zio_cksum_t *);
+
static void
fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size)
{
@@ -618,8 +665,9 @@ fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size)
zio_cksum_t zc;
uint32_t i, l, sel_save = IMPL_READ(fletcher_4_impl_chosen);
- zio_checksum_func_t *fletcher_4_test = native ? fletcher_4_native :
- fletcher_4_byteswap;
+
+ fletcher_checksum_func_t *fletcher_4_test = native ?
+ fletcher_4_native : fletcher_4_byteswap;
for (i = 0; i < fletcher_4_supp_impls_cnt; i++) {
struct fletcher_4_kstat *stat = &fletcher_4_stat_data[i];
@@ -769,6 +817,9 @@ module_param_call(zfs_fletcher_4_impl,
fletcher_4_param_set, fletcher_4_param_get, NULL, 0644);
MODULE_PARM_DESC(zfs_fletcher_4_impl, "Select fletcher 4 implementation.");
+EXPORT_SYMBOL(fletcher_init);
+EXPORT_SYMBOL(fletcher_2_incremental_native);
+EXPORT_SYMBOL(fletcher_2_incremental_byteswap);
EXPORT_SYMBOL(fletcher_4_init);
EXPORT_SYMBOL(fletcher_4_fini);
EXPORT_SYMBOL(fletcher_2_native);
diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in
index 5ad319f32..6712b9b3c 100644
--- a/module/zfs/Makefile.in
+++ b/module/zfs/Makefile.in
@@ -7,6 +7,7 @@ EXTRA_CFLAGS = $(ZFS_MODULE_CFLAGS) @KERNELCPPFLAGS@
obj-$(CONFIG_ZFS) := $(MODULE).o
+$(MODULE)-objs += abd.o
$(MODULE)-objs += arc.o
$(MODULE)-objs += blkptr.o
$(MODULE)-objs += bplist.o
diff --git a/module/zfs/abd.c b/module/zfs/abd.c
new file mode 100644
index 000000000..d14564b33
--- /dev/null
+++ b/module/zfs/abd.c
@@ -0,0 +1,1543 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+/*
+ * ARC buffer data (ABD).
+ *
+ * ABDs are an abstract data structure for the ARC which can use two
+ * different ways of storing the underlying data:
+ *
+ * (a) Linear buffer. In this case, all the data in the ABD is stored in one
+ * contiguous buffer in memory (from a zio_[data_]buf_* kmem cache).
+ *
+ * +-------------------+
+ * | ABD (linear) |
+ * | abd_flags = ... |
+ * | abd_size = ... | +--------------------------------+
+ * | abd_buf ------------->| raw buffer of size abd_size |
+ * +-------------------+ +--------------------------------+
+ * no abd_chunks
+ *
+ * (b) Scattered buffer. In this case, the data in the ABD is split into
+ * equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers
+ * to the chunks recorded in an array at the end of the ABD structure.
+ *
+ * +-------------------+
+ * | ABD (scattered) |
+ * | abd_flags = ... |
+ * | abd_size = ... |
+ * | abd_offset = 0 | +-----------+
+ * | abd_chunks[0] ----------------------------->| chunk 0 |
+ * | abd_chunks[1] ---------------------+ +-----------+
+ * | ... | | +-----------+
+ * | abd_chunks[N-1] ---------+ +------->| chunk 1 |
+ * +-------------------+ | +-----------+
+ * | ...
+ * | +-----------+
+ * +----------------->| chunk N-1 |
+ * +-----------+
+ *
+ * Linear buffers act exactly like normal buffers and are always mapped into the
+ * kernel's virtual memory space, while scattered ABD data chunks are allocated
+ * as physical pages and then mapped in only while they are actually being
+ * accessed through one of the abd_* library functions. Using scattered ABDs
+ * provides several benefits:
+ *
+ * (1) They avoid use of kmem_*, preventing performance problems where running
+ * kmem_reap on very large memory systems never finishes and causes
+ * constant TLB shootdowns.
+ *
+ * (2) Fragmentation is less of an issue since when we are at the limit of
+ * allocatable space, we won't have to search around for a long free
+ * hole in the VA space for large ARC allocations. Each chunk is mapped in
+ * individually, so even if we weren't using segkpm (see next point) we
+ * wouldn't need to worry about finding a contiguous address range.
+ *
+ * (3) Use of segkpm will avoid the need for map / unmap / TLB shootdown costs
+ * on each ABD access. (If segkpm isn't available then we use all linear
+ * ABDs to avoid this penalty.) See seg_kpm.c for more details.
+ *
+ * It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to
+ * B_FALSE. However, it is not possible to use scattered ABDs if segkpm is not
+ * available, which is the case on all 32-bit systems and any 64-bit systems
+ * where kpm_enable is turned off.
+ *
+ * In addition to directly allocating a linear or scattered ABD, it is also
+ * possible to create an ABD by requesting the "sub-ABD" starting at an offset
+ * within an existing ABD. In linear buffers this is simple (set abd_buf of
+ * the new ABD to the starting point within the original raw buffer), but
+ * scattered ABDs are a little more complex. The new ABD makes a copy of the
+ * relevant abd_chunks pointers (but not the underlying data). However, to
+ * provide arbitrary rather than only chunk-aligned starting offsets, it also
+ * tracks an abd_offset field which represents the starting point of the data
+ * within the first chunk in abd_chunks. For both linear and scattered ABDs,
+ * creating an offset ABD marks the original ABD as the offset's parent, and the
+ * original ABD's abd_children refcount is incremented. This data allows us to
+ * ensure the root ABD isn't deleted before its children.
+ *
+ * Most consumers should never need to know what type of ABD they're using --
+ * the ABD public API ensures that it's possible to transparently switch from
+ * using a linear ABD to a scattered one when doing so would be beneficial.
+ *
+ * If you need to use the data within an ABD directly, if you know it's linear
+ * (because you allocated it) you can use abd_to_buf() to access the underlying
+ * raw buffer. Otherwise, you should use one of the abd_borrow_buf* functions
+ * which will allocate a raw buffer if necessary. Use the abd_return_buf*
+ * functions to return any raw buffers that are no longer necessary when you're
+ * done using them.
+ *
+ * There are a variety of ABD APIs that implement basic buffer operations:
+ * compare, copy, read, write, and fill with zeroes. If you need a custom
+ * function which progressively accesses the whole ABD, use the abd_iterate_*
+ * functions.
+ */
+
+#include <sys/abd.h>
+#include <sys/param.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_znode.h>
+#ifdef _KERNEL
+#include <linux/scatterlist.h>
+#include <linux/kmap_compat.h>
+#else
+#define MAX_ORDER 1
+#endif
+
+typedef struct abd_stats {
+ kstat_named_t abdstat_struct_size;
+ kstat_named_t abdstat_linear_cnt;
+ kstat_named_t abdstat_linear_data_size;
+ kstat_named_t abdstat_scatter_cnt;
+ kstat_named_t abdstat_scatter_data_size;
+ kstat_named_t abdstat_scatter_chunk_waste;
+ kstat_named_t abdstat_scatter_orders[MAX_ORDER];
+ kstat_named_t abdstat_scatter_page_multi_chunk;
+ kstat_named_t abdstat_scatter_page_multi_zone;
+ kstat_named_t abdstat_scatter_page_alloc_retry;
+ kstat_named_t abdstat_scatter_sg_table_retry;
+} abd_stats_t;
+
+static abd_stats_t abd_stats = {
+ /* Amount of memory occupied by all of the abd_t struct allocations */
+ { "struct_size", KSTAT_DATA_UINT64 },
+ /*
+ * The number of linear ABDs which are currently allocated, excluding
+ * ABDs which don't own their data (for instance the ones which were
+ * allocated through abd_get_offset() and abd_get_from_buf()). If an
+ * ABD takes ownership of its buf then it will become tracked.
+ */
+ { "linear_cnt", KSTAT_DATA_UINT64 },
+ /* Amount of data stored in all linear ABDs tracked by linear_cnt */
+ { "linear_data_size", KSTAT_DATA_UINT64 },
+ /*
+ * The number of scatter ABDs which are currently allocated, excluding
+ * ABDs which don't own their data (for instance the ones which were
+ * allocated through abd_get_offset()).
+ */
+ { "scatter_cnt", KSTAT_DATA_UINT64 },
+ /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
+ { "scatter_data_size", KSTAT_DATA_UINT64 },
+ /*
+ * The amount of space wasted at the end of the last chunk across all
+ * scatter ABDs tracked by scatter_cnt.
+ */
+ { "scatter_chunk_waste", KSTAT_DATA_UINT64 },
+ /*
+ * The number of compound allocations of a given order. These
+ * allocations are spread over all currently allocated ABDs, and
+ * act as a measure of memory fragmentation.
+ */
+ { { "scatter_order_N", KSTAT_DATA_UINT64 } },
+ /*
+ * The number of scatter ABDs which contain multiple chunks.
+ * ABDs are preferentially allocated from the minimum number of
+ * contiguous multi-page chunks, a single chunk is optimal.
+ */
+ { "scatter_page_multi_chunk", KSTAT_DATA_UINT64 },
+ /*
+ * The number of scatter ABDs which are split across memory zones.
+ * ABDs are preferentially allocated using pages from a single zone.
+ */
+ { "scatter_page_multi_zone", KSTAT_DATA_UINT64 },
+ /*
+ * The total number of retries encountered when attempting to
+ * allocate the pages to populate the scatter ABD.
+ */
+ { "scatter_page_alloc_retry", KSTAT_DATA_UINT64 },
+ /*
+ * The total number of retries encountered when attempting to
+ * allocate the sg table for an ABD.
+ */
+ { "scatter_sg_table_retry", KSTAT_DATA_UINT64 },
+};
+
+#define ABDSTAT(stat) (abd_stats.stat.value.ui64)
+#define ABDSTAT_INCR(stat, val) \
+ atomic_add_64(&abd_stats.stat.value.ui64, (val))
+#define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1)
+#define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1)
+
+#define ABD_SCATTER(abd) (abd->abd_u.abd_scatter)
+#define ABD_BUF(abd) (abd->abd_u.abd_linear.abd_buf)
+#define abd_for_each_sg(abd, sg, n, i) \
+ for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i)
+
+/* see block comment above for description */
+int zfs_abd_scatter_enabled = B_TRUE;
+unsigned zfs_abd_scatter_max_order = MAX_ORDER - 1;
+
+static kmem_cache_t *abd_cache = NULL;
+static kstat_t *abd_ksp;
+
+static inline size_t
+abd_chunkcnt_for_bytes(size_t size)
+{
+ return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE);
+}
+
+#ifdef _KERNEL
+#ifndef CONFIG_HIGHMEM
+
+#ifndef __GFP_RECLAIM
+#define __GFP_RECLAIM __GFP_WAIT
+#endif
+
+static unsigned long
+abd_alloc_chunk(int nid, gfp_t gfp, unsigned int order)
+{
+ struct page *page;
+
+ page = alloc_pages_node(nid, gfp, order);
+ if (!page)
+ return (0);
+
+ return ((unsigned long) page_address(page));
+}
+
+/*
+ * The goal is to minimize fragmentation by preferentially populating ABDs
+ * with higher order compound pages from a single zone. Allocation size is
+ * progressively decreased until it can be satisfied without performing
+ * reclaim or compaction. When necessary this function will degenerate to
+ * allocating individual pages and allowing reclaim to satisfy allocations.
+ */
+static void
+abd_alloc_pages(abd_t *abd, size_t size)
+{
+ struct list_head pages;
+ struct sg_table table;
+ struct scatterlist *sg;
+ struct page *page, *tmp_page;
+ gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
+ gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM;
+ int max_order = MIN(zfs_abd_scatter_max_order, MAX_ORDER - 1);
+ int nr_pages = abd_chunkcnt_for_bytes(size);
+ int chunks = 0, zones = 0;
+ size_t remaining_size;
+ int nid = NUMA_NO_NODE;
+ int alloc_pages = 0;
+ int order;
+
+ INIT_LIST_HEAD(&pages);
+
+ while (alloc_pages < nr_pages) {
+ unsigned long paddr;
+ unsigned chunk_pages;
+
+ order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order);
+ chunk_pages = (1U << order);
+
+ paddr = abd_alloc_chunk(nid, order ? gfp_comp : gfp, order);
+ if (paddr == 0) {
+ if (order == 0) {
+ ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
+ schedule_timeout_interruptible(1);
+ } else {
+ max_order = MAX(0, order - 1);
+ }
+ continue;
+ }
+
+ page = virt_to_page(paddr);
+ list_add_tail(&page->lru, &pages);
+
+ if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid))
+ zones++;
+
+ nid = page_to_nid(page);
+ ABDSTAT_BUMP(abdstat_scatter_orders[order]);
+ chunks++;
+ alloc_pages += chunk_pages;
+ }
+
+ ASSERT3S(alloc_pages, ==, nr_pages);
+
+ while (sg_alloc_table(&table, chunks, gfp)) {
+ ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
+ schedule_timeout_interruptible(1);
+ }
+
+ sg = table.sgl;
+ remaining_size = size;
+ list_for_each_entry_safe(page, tmp_page, &pages, lru) {
+ size_t sg_size = MIN(PAGESIZE << compound_order(page),
+ remaining_size);
+ sg_set_page(sg, page, sg_size, 0);
+ remaining_size -= sg_size;
+
+ sg = sg_next(sg);
+ list_del(&page->lru);
+ }
+
+ if (chunks > 1) {
+ ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
+ abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
+
+ if (zones) {
+ ABDSTAT_BUMP(abdstat_scatter_page_multi_zone);
+ abd->abd_flags |= ABD_FLAG_MULTI_ZONE;
+ }
+ }
+
+ ABD_SCATTER(abd).abd_sgl = table.sgl;
+ ABD_SCATTER(abd).abd_nents = table.nents;
+}
+#else
+/*
+ * Allocate N individual pages to construct a scatter ABD. This function
+ * makes no attempt to request contiguous pages and requires the minimal
+ * number of kernel interfaces. It's designed for maximum compatibility.
+ */
+static void
+abd_alloc_pages(abd_t *abd, size_t size)
+{
+ struct scatterlist *sg;
+ struct sg_table table;
+ struct page *page;
+ gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
+ int nr_pages = abd_chunkcnt_for_bytes(size);
+ int i;
+
+ while (sg_alloc_table(&table, nr_pages, gfp)) {
+ ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
+ schedule_timeout_interruptible(1);
+ }
+
+ ASSERT3U(table.nents, ==, nr_pages);
+ ABD_SCATTER(abd).abd_sgl = table.sgl;
+ ABD_SCATTER(abd).abd_nents = nr_pages;
+
+ abd_for_each_sg(abd, sg, nr_pages, i) {
+ while ((page = __page_cache_alloc(gfp)) == NULL) {
+ ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
+ schedule_timeout_interruptible(1);
+ }
+
+ ABDSTAT_BUMP(abdstat_scatter_orders[0]);
+ sg_set_page(sg, page, PAGESIZE, 0);
+ }
+
+ if (nr_pages > 1) {
+ ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
+ abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
+ }
+}
+#endif /* !CONFIG_HIGHMEM */
+
+static void
+abd_free_pages(abd_t *abd)
+{
+ struct scatterlist *sg;
+ struct sg_table table;
+ struct page *page;
+ int nr_pages = ABD_SCATTER(abd).abd_nents;
+ int order, i, j;
+
+ if (abd->abd_flags & ABD_FLAG_MULTI_ZONE)
+ ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_zone);
+
+ if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK)
+ ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk);
+
+ abd_for_each_sg(abd, sg, nr_pages, i) {
+ for (j = 0; j < sg->length; ) {
+ page = nth_page(sg_page(sg), j >> PAGE_SHIFT);
+ order = compound_order(page);
+ __free_pages(page, order);
+ j += (PAGESIZE << order);
+ ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]);
+ }
+ }
+
+ table.sgl = ABD_SCATTER(abd).abd_sgl;
+ table.nents = table.orig_nents = nr_pages;
+ sg_free_table(&table);
+}
+
+#else /* _KERNEL */
+
+#ifndef PAGE_SHIFT
+#define PAGE_SHIFT (highbit64(PAGESIZE)-1)
+#endif
+
+struct page;
+
+#define kpm_enable 1
+#define abd_alloc_chunk(o) \
+ ((struct page *) umem_alloc_aligned(PAGESIZE << (o), 64, KM_SLEEP))
+#define abd_free_chunk(chunk, o) umem_free(chunk, PAGESIZE << (o))
+#define zfs_kmap_atomic(chunk, km) ((void *)chunk)
+#define zfs_kunmap_atomic(addr, km) do { (void)(addr); } while (0)
+#define local_irq_save(flags) do { (void)(flags); } while (0)
+#define local_irq_restore(flags) do { (void)(flags); } while (0)
+#define nth_page(pg, i) \
+ ((struct page *)((void *)(pg) + (i) * PAGESIZE))
+
+struct scatterlist {
+ struct page *page;
+ int length;
+ int end;
+};
+
+static void
+sg_init_table(struct scatterlist *sg, int nr) {
+ memset(sg, 0, nr * sizeof (struct scatterlist));
+ sg[nr - 1].end = 1;
+}
+
+#define for_each_sg(sgl, sg, nr, i) \
+ for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg))
+
+static inline void
+sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len,
+ unsigned int offset)
+{
+ /* currently we don't use offset */
+ ASSERT(offset == 0);
+ sg->page = page;
+ sg->length = len;
+}
+
+static inline struct page *
+sg_page(struct scatterlist *sg)
+{
+ return (sg->page);
+}
+
+static inline struct scatterlist *
+sg_next(struct scatterlist *sg)
+{
+ if (sg->end)
+ return (NULL);
+
+ return (sg + 1);
+}
+
+static void
+abd_alloc_pages(abd_t *abd, size_t size)
+{
+ unsigned nr_pages = abd_chunkcnt_for_bytes(size);
+ struct scatterlist *sg;
+ int i;
+
+ ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages *
+ sizeof (struct scatterlist), KM_SLEEP);
+ sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages);
+
+ abd_for_each_sg(abd, sg, nr_pages, i) {
+ struct page *p = abd_alloc_chunk(0);
+ sg_set_page(sg, p, PAGESIZE, 0);
+ }
+ ABD_SCATTER(abd).abd_nents = nr_pages;
+}
+
+static void
+abd_free_pages(abd_t *abd)
+{
+ int i, n = ABD_SCATTER(abd).abd_nents;
+ struct scatterlist *sg;
+ int j;
+
+ abd_for_each_sg(abd, sg, n, i) {
+ for (j = 0; j < sg->length; j += PAGESIZE) {
+ struct page *p = nth_page(sg_page(sg), j>>PAGE_SHIFT);
+ abd_free_chunk(p, 0);
+ }
+ }
+
+ vmem_free(ABD_SCATTER(abd).abd_sgl, n * sizeof (struct scatterlist));
+}
+
+#endif /* _KERNEL */
+
+void
+abd_init(void)
+{
+ int i;
+
+ abd_cache = kmem_cache_create("abd_t", sizeof (abd_t),
+ 0, NULL, NULL, NULL, NULL, NULL, 0);
+
+ abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
+ sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+ if (abd_ksp != NULL) {
+ abd_ksp->ks_data = &abd_stats;
+ kstat_install(abd_ksp);
+
+ for (i = 0; i < MAX_ORDER; i++) {
+ snprintf(abd_stats.abdstat_scatter_orders[i].name,
+ KSTAT_STRLEN, "scatter_order_%d", i);
+ abd_stats.abdstat_scatter_orders[i].data_type =
+ KSTAT_DATA_UINT64;
+ }
+ }
+}
+
+void
+abd_fini(void)
+{
+ if (abd_ksp != NULL) {
+ kstat_delete(abd_ksp);
+ abd_ksp = NULL;
+ }
+
+ if (abd_cache) {
+ kmem_cache_destroy(abd_cache);
+ abd_cache = NULL;
+ }
+}
+
+static inline void
+abd_verify(abd_t *abd)
+{
+ ASSERT3U(abd->abd_size, >, 0);
+ ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
+ ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
+ ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE |
+ ABD_FLAG_MULTI_CHUNK));
+ IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
+ IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
+ if (abd_is_linear(abd)) {
+ ASSERT3P(abd->abd_u.abd_linear.abd_buf, !=, NULL);
+ } else {
+ size_t n;
+ int i;
+ struct scatterlist *sg;
+
+ ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0);
+ ASSERT3U(ABD_SCATTER(abd).abd_offset, <,
+ ABD_SCATTER(abd).abd_sgl->length);
+ n = ABD_SCATTER(abd).abd_nents;
+ abd_for_each_sg(abd, sg, n, i) {
+ ASSERT3P(sg_page(sg), !=, NULL);
+ }
+ }
+}
+
+static inline abd_t *
+abd_alloc_struct(void)
+{
+ abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE);
+
+ ASSERT3P(abd, !=, NULL);
+ ABDSTAT_INCR(abdstat_struct_size, sizeof (abd_t));
+
+ return (abd);
+}
+
+static inline void
+abd_free_struct(abd_t *abd)
+{
+ kmem_cache_free(abd_cache, abd);
+ ABDSTAT_INCR(abdstat_struct_size, -sizeof (abd_t));
+}
+
+/*
+ * Allocate an ABD, along with its own underlying data buffers. Use this if you
+ * don't care whether the ABD is linear or not.
+ */
+abd_t *
+abd_alloc(size_t size, boolean_t is_metadata)
+{
+ abd_t *abd;
+
+ if (!zfs_abd_scatter_enabled || size <= PAGESIZE)
+ return (abd_alloc_linear(size, is_metadata));
+
+ VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
+
+ abd = abd_alloc_struct();
+ abd->abd_flags = ABD_FLAG_OWNER;
+ abd_alloc_pages(abd, size);
+
+ if (is_metadata) {
+ abd->abd_flags |= ABD_FLAG_META;
+ }
+ abd->abd_size = size;
+ abd->abd_parent = NULL;
+ refcount_create(&abd->abd_children);
+
+ abd->abd_u.abd_scatter.abd_offset = 0;
+
+ ABDSTAT_BUMP(abdstat_scatter_cnt);
+ ABDSTAT_INCR(abdstat_scatter_data_size, size);
+ ABDSTAT_INCR(abdstat_scatter_chunk_waste,
+ P2ROUNDUP(size, PAGESIZE) - size);
+
+ return (abd);
+}
+
+static void
+abd_free_scatter(abd_t *abd)
+{
+ abd_free_pages(abd);
+
+ refcount_destroy(&abd->abd_children);
+ ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
+ ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
+ ABDSTAT_INCR(abdstat_scatter_chunk_waste,
+ abd->abd_size - P2ROUNDUP(abd->abd_size, PAGESIZE));
+
+ abd_free_struct(abd);
+}
+
+/*
+ * Allocate an ABD that must be linear, along with its own underlying data
+ * buffer. Only use this when it would be very annoying to write your ABD
+ * consumer with a scattered ABD.
+ */
+abd_t *
+abd_alloc_linear(size_t size, boolean_t is_metadata)
+{
+ abd_t *abd = abd_alloc_struct();
+
+ VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
+
+ abd->abd_flags = ABD_FLAG_LINEAR | ABD_FLAG_OWNER;
+ if (is_metadata) {
+ abd->abd_flags |= ABD_FLAG_META;
+ }
+ abd->abd_size = size;
+ abd->abd_parent = NULL;
+ refcount_create(&abd->abd_children);
+
+ if (is_metadata) {
+ abd->abd_u.abd_linear.abd_buf = zio_buf_alloc(size);
+ } else {
+ abd->abd_u.abd_linear.abd_buf = zio_data_buf_alloc(size);
+ }
+
+ ABDSTAT_BUMP(abdstat_linear_cnt);
+ ABDSTAT_INCR(abdstat_linear_data_size, size);
+
+ return (abd);
+}
+
+static void
+abd_free_linear(abd_t *abd)
+{
+ if (abd->abd_flags & ABD_FLAG_META) {
+ zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
+ } else {
+ zio_data_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
+ }
+
+ refcount_destroy(&abd->abd_children);
+ ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
+ ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
+
+ abd_free_struct(abd);
+}
+
+/*
+ * Free an ABD. Only use this on ABDs allocated with abd_alloc() or
+ * abd_alloc_linear().
+ */
+void
+abd_free(abd_t *abd)
+{
+ abd_verify(abd);
+ ASSERT3P(abd->abd_parent, ==, NULL);
+ ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
+ if (abd_is_linear(abd))
+ abd_free_linear(abd);
+ else
+ abd_free_scatter(abd);
+}
+
+/*
+ * Allocate an ABD of the same format (same metadata flag, same scatterize
+ * setting) as another ABD.
+ */
+abd_t *
+abd_alloc_sametype(abd_t *sabd, size_t size)
+{
+ boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0;
+ if (abd_is_linear(sabd)) {
+ return (abd_alloc_linear(size, is_metadata));
+ } else {
+ return (abd_alloc(size, is_metadata));
+ }
+}
+
+/*
+ * If we're going to use this ABD for doing I/O using the block layer, the
+ * consumer of the ABD data doesn't care if it's scattered or not, and we don't
+ * plan to store this ABD in memory for a long period of time, we should
+ * allocate the ABD type that requires the least data copying to do the I/O.
+ *
+ * On Illumos this is linear ABDs, however if ldi_strategy() can ever issue I/Os
+ * using a scatter/gather list we should switch to that and replace this call
+ * with vanilla abd_alloc().
+ *
+ * On Linux the optimal thing to do would be to use abd_get_offset() and
+ * construct a new ABD which shares the original pages thereby eliminating
+ * the copy. But for the moment a new linear ABD is allocated until this
+ * performance optimization can be implemented.
+ */
+abd_t *
+abd_alloc_for_io(size_t size, boolean_t is_metadata)
+{
+ return (abd_alloc_linear(size, is_metadata));
+}
+
+/*
+ * Allocate a new ABD to point to offset off of sabd. It shares the underlying
+ * buffer data with sabd. Use abd_put() to free. sabd must not be freed while
+ * any derived ABDs exist.
+ */
+static inline abd_t *
+abd_get_offset_impl(abd_t *sabd, size_t off, size_t size)
+{
+ abd_t *abd;
+
+ abd_verify(sabd);
+ ASSERT3U(off, <=, sabd->abd_size);
+
+ if (abd_is_linear(sabd)) {
+ abd = abd_alloc_struct();
+
+ /*
+ * Even if this buf is filesystem metadata, we only track that
+ * if we own the underlying data buffer, which is not true in
+ * this case. Therefore, we don't ever use ABD_FLAG_META here.
+ */
+ abd->abd_flags = ABD_FLAG_LINEAR;
+
+ abd->abd_u.abd_linear.abd_buf =
+ (char *)sabd->abd_u.abd_linear.abd_buf + off;
+ } else {
+ int i;
+ struct scatterlist *sg;
+ size_t new_offset = sabd->abd_u.abd_scatter.abd_offset + off;
+
+ abd = abd_alloc_struct();
+
+ /*
+ * Even if this buf is filesystem metadata, we only track that
+ * if we own the underlying data buffer, which is not true in
+ * this case. Therefore, we don't ever use ABD_FLAG_META here.
+ */
+ abd->abd_flags = 0;
+
+ abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) {
+ if (new_offset < sg->length)
+ break;
+ new_offset -= sg->length;
+ }
+
+ ABD_SCATTER(abd).abd_sgl = sg;
+ ABD_SCATTER(abd).abd_offset = new_offset;
+ ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i;
+ }
+
+ abd->abd_size = size;
+ abd->abd_parent = sabd;
+ refcount_create(&abd->abd_children);
+ (void) refcount_add_many(&sabd->abd_children, abd->abd_size, abd);
+
+ return (abd);
+}
+
+abd_t *
+abd_get_offset(abd_t *sabd, size_t off)
+{
+ size_t size = sabd->abd_size > off ? sabd->abd_size - off : 0;
+
+ VERIFY3U(size, >, 0);
+
+ return (abd_get_offset_impl(sabd, off, size));
+}
+
+abd_t *
+abd_get_offset_size(abd_t *sabd, size_t off, size_t size)
+{
+ ASSERT3U(off + size, <=, sabd->abd_size);
+
+ return (abd_get_offset_impl(sabd, off, size));
+}
+
+/*
+ * Allocate a linear ABD structure for buf. You must free this with abd_put()
+ * since the resulting ABD doesn't own its own buffer.
+ */
+abd_t *
+abd_get_from_buf(void *buf, size_t size)
+{
+ abd_t *abd = abd_alloc_struct();
+
+ VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
+
+ /*
+ * Even if this buf is filesystem metadata, we only track that if we
+ * own the underlying data buffer, which is not true in this case.
+ * Therefore, we don't ever use ABD_FLAG_META here.
+ */
+ abd->abd_flags = ABD_FLAG_LINEAR;
+ abd->abd_size = size;
+ abd->abd_parent = NULL;
+ refcount_create(&abd->abd_children);
+
+ abd->abd_u.abd_linear.abd_buf = buf;
+
+ return (abd);
+}
+
+/*
+ * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not
+ * free the underlying scatterlist or buffer.
+ */
+void
+abd_put(abd_t *abd)
+{
+ abd_verify(abd);
+ ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
+
+ if (abd->abd_parent != NULL) {
+ (void) refcount_remove_many(&abd->abd_parent->abd_children,
+ abd->abd_size, abd);
+ }
+
+ refcount_destroy(&abd->abd_children);
+ abd_free_struct(abd);
+}
+
+/*
+ * Get the raw buffer associated with a linear ABD.
+ */
+void *
+abd_to_buf(abd_t *abd)
+{
+ ASSERT(abd_is_linear(abd));
+ abd_verify(abd);
+ return (abd->abd_u.abd_linear.abd_buf);
+}
+
+/*
+ * Borrow a raw buffer from an ABD without copying the contents of the ABD
+ * into the buffer. If the ABD is scattered, this will allocate a raw buffer
+ * whose contents are undefined. To copy over the existing data in the ABD, use
+ * abd_borrow_buf_copy() instead.
+ */
+void *
+abd_borrow_buf(abd_t *abd, size_t n)
+{
+ void *buf;
+ abd_verify(abd);
+ ASSERT3U(abd->abd_size, >=, n);
+ if (abd_is_linear(abd)) {
+ buf = abd_to_buf(abd);
+ } else {
+ buf = zio_buf_alloc(n);
+ }
+ (void) refcount_add_many(&abd->abd_children, n, buf);
+
+ return (buf);
+}
+
+void *
+abd_borrow_buf_copy(abd_t *abd, size_t n)
+{
+ void *buf = abd_borrow_buf(abd, n);
+ if (!abd_is_linear(abd)) {
+ abd_copy_to_buf(buf, abd, n);
+ }
+ return (buf);
+}
+
+/*
+ * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will
+ * not change the contents of the ABD and will ASSERT that you didn't modify
+ * the buffer since it was borrowed. If you want any changes you made to buf to
+ * be copied back to abd, use abd_return_buf_copy() instead.
+ */
+void
+abd_return_buf(abd_t *abd, void *buf, size_t n)
+{
+ abd_verify(abd);
+ ASSERT3U(abd->abd_size, >=, n);
+ if (abd_is_linear(abd)) {
+ ASSERT3P(buf, ==, abd_to_buf(abd));
+ } else {
+ ASSERT0(abd_cmp_buf(abd, buf, n));
+ zio_buf_free(buf, n);
+ }
+ (void) refcount_remove_many(&abd->abd_children, n, buf);
+}
+
+void
+abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
+{
+ if (!abd_is_linear(abd)) {
+ abd_copy_from_buf(abd, buf, n);
+ }
+ abd_return_buf(abd, buf, n);
+}
+
+/*
+ * Give this ABD ownership of the buffer that it's storing. Can only be used on
+ * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated
+ * with abd_alloc_linear() which subsequently released ownership of their buf
+ * with abd_release_ownership_of_buf().
+ */
+void
+abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata)
+{
+ ASSERT(abd_is_linear(abd));
+ ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
+ abd_verify(abd);
+
+ abd->abd_flags |= ABD_FLAG_OWNER;
+ if (is_metadata) {
+ abd->abd_flags |= ABD_FLAG_META;
+ }
+
+ ABDSTAT_BUMP(abdstat_linear_cnt);
+ ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
+}
+
+void
+abd_release_ownership_of_buf(abd_t *abd)
+{
+ ASSERT(abd_is_linear(abd));
+ ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
+ abd_verify(abd);
+
+ abd->abd_flags &= ~ABD_FLAG_OWNER;
+ /* Disable this flag since we no longer own the data buffer */
+ abd->abd_flags &= ~ABD_FLAG_META;
+
+ ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
+ ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
+}
+
+#ifndef HAVE_1ARG_KMAP_ATOMIC
+#define NR_KM_TYPE (6)
+#ifdef _KERNEL
+int km_table[NR_KM_TYPE] = {
+ KM_USER0,
+ KM_USER1,
+ KM_BIO_SRC_IRQ,
+ KM_BIO_DST_IRQ,
+ KM_PTE0,
+ KM_PTE1,
+};
+#endif
+#endif
+
+struct abd_iter {
+ /* public interface */
+ void *iter_mapaddr; /* addr corresponding to iter_pos */
+ size_t iter_mapsize; /* length of data valid at mapaddr */
+
+ /* private */
+ abd_t *iter_abd; /* ABD being iterated through */
+ size_t iter_pos;
+ size_t iter_offset; /* offset in current sg/abd_buf, */
+ /* abd_offset included */
+ struct scatterlist *iter_sg; /* current sg */
+#ifndef HAVE_1ARG_KMAP_ATOMIC
+ int iter_km; /* KM_* for kmap_atomic */
+#endif
+};
+
+/*
+ * Initialize the abd_iter.
+ */
+static void
+abd_iter_init(struct abd_iter *aiter, abd_t *abd, int km_type)
+{
+ abd_verify(abd);
+ aiter->iter_abd = abd;
+ aiter->iter_mapaddr = NULL;
+ aiter->iter_mapsize = 0;
+ aiter->iter_pos = 0;
+ if (abd_is_linear(abd)) {
+ aiter->iter_offset = 0;
+ aiter->iter_sg = NULL;
+ } else {
+ aiter->iter_offset = ABD_SCATTER(abd).abd_offset;
+ aiter->iter_sg = ABD_SCATTER(abd).abd_sgl;
+ }
+#ifndef HAVE_1ARG_KMAP_ATOMIC
+ ASSERT3U(km_type, <, NR_KM_TYPE);
+ aiter->iter_km = km_type;
+#endif
+}
+
+/*
+ * Advance the iterator by a certain amount. Cannot be called when a chunk is
+ * in use. This can be safely called when the aiter has already exhausted, in
+ * which case this does nothing.
+ */
+static void
+abd_iter_advance(struct abd_iter *aiter, size_t amount)
+{
+ ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+ ASSERT0(aiter->iter_mapsize);
+
+ /* There's nothing left to advance to, so do nothing */
+ if (aiter->iter_pos == aiter->iter_abd->abd_size)
+ return;
+
+ aiter->iter_pos += amount;
+ aiter->iter_offset += amount;
+ if (!abd_is_linear(aiter->iter_abd)) {
+ while (aiter->iter_offset >= aiter->iter_sg->length) {
+ aiter->iter_offset -= aiter->iter_sg->length;
+ aiter->iter_sg = sg_next(aiter->iter_sg);
+ if (aiter->iter_sg == NULL) {
+ ASSERT0(aiter->iter_offset);
+ break;
+ }
+ }
+ }
+}
+
+/*
+ * Map the current chunk into aiter. This can be safely called when the aiter
+ * has already exhausted, in which case this does nothing.
+ */
+static void
+abd_iter_map(struct abd_iter *aiter)
+{
+ void *paddr;
+ size_t offset = 0;
+
+ ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+ ASSERT0(aiter->iter_mapsize);
+
+ /* There's nothing left to iterate over, so do nothing */
+ if (aiter->iter_pos == aiter->iter_abd->abd_size)
+ return;
+
+ if (abd_is_linear(aiter->iter_abd)) {
+ ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
+ offset = aiter->iter_offset;
+ aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
+ paddr = aiter->iter_abd->abd_u.abd_linear.abd_buf;
+ } else {
+ offset = aiter->iter_offset;
+ aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset,
+ aiter->iter_abd->abd_size - aiter->iter_pos);
+
+ paddr = zfs_kmap_atomic(sg_page(aiter->iter_sg),
+ km_table[aiter->iter_km]);
+ }
+
+ aiter->iter_mapaddr = (char *)paddr + offset;
+}
+
+/*
+ * Unmap the current chunk from aiter. This can be safely called when the aiter
+ * has already exhausted, in which case this does nothing.
+ */
+static void
+abd_iter_unmap(struct abd_iter *aiter)
+{
+ /* There's nothing left to unmap, so do nothing */
+ if (aiter->iter_pos == aiter->iter_abd->abd_size)
+ return;
+
+ if (!abd_is_linear(aiter->iter_abd)) {
+ /* LINTED E_FUNC_SET_NOT_USED */
+ zfs_kunmap_atomic(aiter->iter_mapaddr - aiter->iter_offset,
+ km_table[aiter->iter_km]);
+ }
+
+ ASSERT3P(aiter->iter_mapaddr, !=, NULL);
+ ASSERT3U(aiter->iter_mapsize, >, 0);
+
+ aiter->iter_mapaddr = NULL;
+ aiter->iter_mapsize = 0;
+}
+
+int
+abd_iterate_func(abd_t *abd, size_t off, size_t size,
+ abd_iter_func_t *func, void *private)
+{
+ int ret = 0;
+ struct abd_iter aiter;
+
+ abd_verify(abd);
+ ASSERT3U(off + size, <=, abd->abd_size);
+
+ abd_iter_init(&aiter, abd, 0);
+ abd_iter_advance(&aiter, off);
+
+ while (size > 0) {
+ size_t len;
+ abd_iter_map(&aiter);
+
+ len = MIN(aiter.iter_mapsize, size);
+ ASSERT3U(len, >, 0);
+
+ ret = func(aiter.iter_mapaddr, len, private);
+
+ abd_iter_unmap(&aiter);
+
+ if (ret != 0)
+ break;
+
+ size -= len;
+ abd_iter_advance(&aiter, len);
+ }
+
+ return (ret);
+}
+
+struct buf_arg {
+ void *arg_buf;
+};
+
+static int
+abd_copy_to_buf_off_cb(void *buf, size_t size, void *private)
+{
+ struct buf_arg *ba_ptr = private;
+
+ (void) memcpy(ba_ptr->arg_buf, buf, size);
+ ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
+
+ return (0);
+}
+
+/*
+ * Copy abd to buf. (off is the offset in abd.)
+ */
+void
+abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size)
+{
+ struct buf_arg ba_ptr = { buf };
+
+ (void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb,
+ &ba_ptr);
+}
+
+static int
+abd_cmp_buf_off_cb(void *buf, size_t size, void *private)
+{
+ int ret;
+ struct buf_arg *ba_ptr = private;
+
+ ret = memcmp(buf, ba_ptr->arg_buf, size);
+ ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
+
+ return (ret);
+}
+
+/*
+ * Compare the contents of abd to buf. (off is the offset in abd.)
+ */
+int
+abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
+{
+ struct buf_arg ba_ptr = { (void *) buf };
+
+ return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr));
+}
+
+static int
+abd_copy_from_buf_off_cb(void *buf, size_t size, void *private)
+{
+ struct buf_arg *ba_ptr = private;
+
+ (void) memcpy(buf, ba_ptr->arg_buf, size);
+ ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
+
+ return (0);
+}
+
+/*
+ * Copy from buf to abd. (off is the offset in abd.)
+ */
+void
+abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
+{
+ struct buf_arg ba_ptr = { (void *) buf };
+
+ (void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb,
+ &ba_ptr);
+}
+
+/*ARGSUSED*/
+static int
+abd_zero_off_cb(void *buf, size_t size, void *private)
+{
+ (void) memset(buf, 0, size);
+ return (0);
+}
+
+/*
+ * Zero out the abd from a particular offset to the end.
+ */
+void
+abd_zero_off(abd_t *abd, size_t off, size_t size)
+{
+ (void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL);
+}
+
+/*
+ * Iterate over two ABDs and call func incrementally on the two ABDs' data in
+ * equal-sized chunks (passed to func as raw buffers). func could be called many
+ * times during this iteration.
+ */
+int
+abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff,
+ size_t size, abd_iter_func2_t *func, void *private)
+{
+ int ret = 0;
+ struct abd_iter daiter, saiter;
+
+ abd_verify(dabd);
+ abd_verify(sabd);
+
+ ASSERT3U(doff + size, <=, dabd->abd_size);
+ ASSERT3U(soff + size, <=, sabd->abd_size);
+
+ abd_iter_init(&daiter, dabd, 0);
+ abd_iter_init(&saiter, sabd, 1);
+ abd_iter_advance(&daiter, doff);
+ abd_iter_advance(&saiter, soff);
+
+ while (size > 0) {
+ size_t dlen, slen, len;
+ abd_iter_map(&daiter);
+ abd_iter_map(&saiter);
+
+ dlen = MIN(daiter.iter_mapsize, size);
+ slen = MIN(saiter.iter_mapsize, size);
+ len = MIN(dlen, slen);
+ ASSERT(dlen > 0 || slen > 0);
+
+ ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len,
+ private);
+
+ abd_iter_unmap(&saiter);
+ abd_iter_unmap(&daiter);
+
+ if (ret != 0)
+ break;
+
+ size -= len;
+ abd_iter_advance(&daiter, len);
+ abd_iter_advance(&saiter, len);
+ }
+
+ return (ret);
+}
+
+/*ARGSUSED*/
+static int
+abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private)
+{
+ (void) memcpy(dbuf, sbuf, size);
+ return (0);
+}
+
+/*
+ * Copy from sabd to dabd starting from soff and doff.
+ */
+void
+abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size)
+{
+ (void) abd_iterate_func2(dabd, sabd, doff, soff, size,
+ abd_copy_off_cb, NULL);
+}
+
+/*ARGSUSED*/
+static int
+abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private)
+{
+ return (memcmp(bufa, bufb, size));
+}
+
+/*
+ * Compares the contents of two ABDs.
+ */
+int
+abd_cmp(abd_t *dabd, abd_t *sabd)
+{
+ ASSERT3U(dabd->abd_size, ==, sabd->abd_size);
+ return (abd_iterate_func2(dabd, sabd, 0, 0, dabd->abd_size,
+ abd_cmp_cb, NULL));
+}
+
+/*
+ * Iterate over code ABDs and a data ABD and call @func_raidz_gen.
+ *
+ * @cabds parity ABDs, must have equal size
+ * @dabd data ABD. Can be NULL (in this case @dsize = 0)
+ * @func_raidz_gen should be implemented so that its behaviour
+ * is the same when taking linear and when taking scatter
+ */
+void
+abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd,
+ ssize_t csize, ssize_t dsize, const unsigned parity,
+ void (*func_raidz_gen)(void **, const void *, size_t, size_t))
+{
+ int i;
+ ssize_t len, dlen;
+ struct abd_iter caiters[3];
+ struct abd_iter daiter;
+ void *caddrs[3];
+ unsigned long flags;
+
+ ASSERT3U(parity, <=, 3);
+
+ for (i = 0; i < parity; i++)
+ abd_iter_init(&caiters[i], cabds[i], i);
+
+ if (dabd)
+ abd_iter_init(&daiter, dabd, i);
+
+ ASSERT3S(dsize, >=, 0);
+
+ local_irq_save(flags);
+ while (csize > 0) {
+ len = csize;
+
+ if (dabd && dsize > 0)
+ abd_iter_map(&daiter);
+
+ for (i = 0; i < parity; i++) {
+ abd_iter_map(&caiters[i]);
+ caddrs[i] = caiters[i].iter_mapaddr;
+ }
+
+ switch (parity) {
+ case 3:
+ len = MIN(caiters[2].iter_mapsize, len);
+ case 2:
+ len = MIN(caiters[1].iter_mapsize, len);
+ case 1:
+ len = MIN(caiters[0].iter_mapsize, len);
+ }
+
+ /* must be progressive */
+ ASSERT3S(len, >, 0);
+
+ if (dabd && dsize > 0) {
+ /* this needs precise iter.length */
+ len = MIN(daiter.iter_mapsize, len);
+ dlen = len;
+ } else
+ dlen = 0;
+
+ /* must be progressive */
+ ASSERT3S(len, >, 0);
+ /*
+ * The iterated function likely will not do well if each
+ * segment except the last one is not multiple of 512 (raidz).
+ */
+ ASSERT3U(((uint64_t)len & 511ULL), ==, 0);
+
+ func_raidz_gen(caddrs, daiter.iter_mapaddr, len, dlen);
+
+ for (i = parity-1; i >= 0; i--) {
+ abd_iter_unmap(&caiters[i]);
+ abd_iter_advance(&caiters[i], len);
+ }
+
+ if (dabd && dsize > 0) {
+ abd_iter_unmap(&daiter);
+ abd_iter_advance(&daiter, dlen);
+ dsize -= dlen;
+ }
+
+ csize -= len;
+
+ ASSERT3S(dsize, >=, 0);
+ ASSERT3S(csize, >=, 0);
+ }
+ local_irq_restore(flags);
+}
+
+/*
+ * Iterate over code ABDs and data reconstruction target ABDs and call
+ * @func_raidz_rec. Function maps at most 6 pages atomically.
+ *
+ * @cabds parity ABDs, must have equal size
+ * @tabds rec target ABDs, at most 3
+ * @tsize size of data target columns
+ * @func_raidz_rec expects syndrome data in target columns. Function
+ * reconstructs data and overwrites target columns.
+ */
+void
+abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds,
+ ssize_t tsize, const unsigned parity,
+ void (*func_raidz_rec)(void **t, const size_t tsize, void **c,
+ const unsigned *mul),
+ const unsigned *mul)
+{
+ int i;
+ ssize_t len;
+ struct abd_iter citers[3];
+ struct abd_iter xiters[3];
+ void *caddrs[3], *xaddrs[3];
+ unsigned long flags;
+
+ ASSERT3U(parity, <=, 3);
+
+ for (i = 0; i < parity; i++) {
+ abd_iter_init(&citers[i], cabds[i], 2*i);
+ abd_iter_init(&xiters[i], tabds[i], 2*i+1);
+ }
+
+ local_irq_save(flags);
+ while (tsize > 0) {
+
+ for (i = 0; i < parity; i++) {
+ abd_iter_map(&citers[i]);
+ abd_iter_map(&xiters[i]);
+ caddrs[i] = citers[i].iter_mapaddr;
+ xaddrs[i] = xiters[i].iter_mapaddr;
+ }
+
+ len = tsize;
+ switch (parity) {
+ case 3:
+ len = MIN(xiters[2].iter_mapsize, len);
+ len = MIN(citers[2].iter_mapsize, len);
+ case 2:
+ len = MIN(xiters[1].iter_mapsize, len);
+ len = MIN(citers[1].iter_mapsize, len);
+ case 1:
+ len = MIN(xiters[0].iter_mapsize, len);
+ len = MIN(citers[0].iter_mapsize, len);
+ }
+ /* must be progressive */
+ ASSERT3S(len, >, 0);
+ /*
+ * The iterated function likely will not do well if each
+ * segment except the last one is not multiple of 512 (raidz).
+ */
+ ASSERT3U(((uint64_t)len & 511ULL), ==, 0);
+
+ func_raidz_rec(xaddrs, len, caddrs, mul);
+
+ for (i = parity-1; i >= 0; i--) {
+ abd_iter_unmap(&xiters[i]);
+ abd_iter_unmap(&citers[i]);
+ abd_iter_advance(&xiters[i], len);
+ abd_iter_advance(&citers[i], len);
+ }
+
+ tsize -= len;
+ ASSERT3S(tsize, >=, 0);
+ }
+ local_irq_restore(flags);
+}
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+/*
+ * bio_nr_pages for ABD.
+ * @off is the offset in @abd
+ */
+unsigned long
+abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off)
+{
+ unsigned long pos;
+
+ if (abd_is_linear(abd))
+ pos = (unsigned long)abd_to_buf(abd) + off;
+ else
+ pos = abd->abd_u.abd_scatter.abd_offset + off;
+
+ return ((pos + size + PAGESIZE - 1) >> PAGE_SHIFT)
+ - (pos >> PAGE_SHIFT);
+}
+
+/*
+ * bio_map for scatter ABD.
+ * @off is the offset in @abd
+ * Remaining IO size is returned
+ */
+unsigned int
+abd_scatter_bio_map_off(struct bio *bio, abd_t *abd,
+ unsigned int io_size, size_t off)
+{
+ int i;
+ struct abd_iter aiter;
+
+ ASSERT(!abd_is_linear(abd));
+ ASSERT3U(io_size, <=, abd->abd_size - off);
+
+ abd_iter_init(&aiter, abd, 0);
+ abd_iter_advance(&aiter, off);
+
+ for (i = 0; i < bio->bi_max_vecs; i++) {
+ struct page *pg;
+ size_t len, sgoff, pgoff;
+ struct scatterlist *sg;
+
+ if (io_size <= 0)
+ break;
+
+ sg = aiter.iter_sg;
+ sgoff = aiter.iter_offset;
+ pgoff = sgoff & (PAGESIZE - 1);
+ len = MIN(io_size, PAGESIZE - pgoff);
+ ASSERT(len > 0);
+
+ pg = nth_page(sg_page(sg), sgoff >> PAGE_SHIFT);
+ if (bio_add_page(bio, pg, len, pgoff) != len)
+ break;
+
+ io_size -= len;
+ abd_iter_advance(&aiter, len);
+ }
+
+ return (io_size);
+}
+
+/* Tunable Parameters */
+module_param(zfs_abd_scatter_enabled, int, 0644);
+MODULE_PARM_DESC(zfs_abd_scatter_enabled,
+ "Toggle whether ABD allocations must be linear.");
+module_param(zfs_abd_scatter_max_order, uint, 0644);
+MODULE_PARM_DESC(zfs_abd_scatter_max_order,
+ "Maximum order allocation used for a scatter ABD.");
+#endif
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index e3e933044..e54a7cc59 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -136,14 +136,14 @@
* the arc_buf_hdr_t that will point to the data block in memory. A block can
* only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC
* caches data in two ways -- in a list of ARC buffers (arc_buf_t) and
- * also in the arc_buf_hdr_t's private physical data block pointer (b_pdata).
+ * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd).
*
* The L1ARC's data pointer may or may not be uncompressed. The ARC has the
- * ability to store the physical data (b_pdata) associated with the DVA of the
- * arc_buf_hdr_t. Since the b_pdata is a copy of the on-disk physical block,
+ * ability to store the physical data (b_pabd) associated with the DVA of the
+ * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block,
* it will match its on-disk compression characteristics. This behavior can be
* disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
- * compressed ARC functionality is disabled, the b_pdata will point to an
+ * compressed ARC functionality is disabled, the b_pabd will point to an
* uncompressed version of the on-disk data.
*
* Data in the L1ARC is not accessed by consumers of the ARC directly. Each
@@ -182,7 +182,7 @@
* | l1arc_buf_hdr_t
* | | arc_buf_t
* | b_buf +------------>+-----------+ arc_buf_t
- * | b_pdata +-+ |b_next +---->+-----------+
+ * | b_pabd +-+ |b_next +---->+-----------+
* +-----------+ | |-----------| |b_next +-->NULL
* | |b_comp = T | +-----------+
* | |b_data +-+ |b_comp = F |
@@ -199,8 +199,8 @@
* When a consumer reads a block, the ARC must first look to see if the
* arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new
* arc_buf_t and either copies uncompressed data into a new data buffer from an
- * existing uncompressed arc_buf_t, decompresses the hdr's b_pdata buffer into a
- * new data buffer, or shares the hdr's b_pdata buffer, depending on whether the
+ * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a
+ * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the
* hdr is compressed and the desired compression characteristics of the
* arc_buf_t consumer. If the arc_buf_t ends up sharing data with the
* arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be
@@ -224,7 +224,7 @@
* | | arc_buf_t (shared)
* | b_buf +------------>+---------+ arc_buf_t
* | | |b_next +---->+---------+
- * | b_pdata +-+ |---------| |b_next +-->NULL
+ * | b_pabd +-+ |---------| |b_next +-->NULL
* +-----------+ | | | +---------+
* | |b_data +-+ | |
* | +---------+ | |b_data +-+
@@ -238,19 +238,19 @@
* | +------+ |
* +---------------------------------+
*
- * Writing to the ARC requires that the ARC first discard the hdr's b_pdata
+ * Writing to the ARC requires that the ARC first discard the hdr's b_pabd
* since the physical block is about to be rewritten. The new data contents
* will be contained in the arc_buf_t. As the I/O pipeline performs the write,
* it may compress the data before writing it to disk. The ARC will be called
* with the transformed data and will bcopy the transformed on-disk block into
- * a newly allocated b_pdata. Writes are always done into buffers which have
+ * a newly allocated b_pabd. Writes are always done into buffers which have
* either been loaned (and hence are new and don't have other readers) or
* buffers which have been released (and hence have their own hdr, if there
* were originally other readers of the buf's original hdr). This ensures that
* the ARC only needs to update a single buf and its hdr after a write occurs.
*
- * When the L2ARC is in use, it will also take advantage of the b_pdata. The
- * L2ARC will always write the contents of b_pdata to the L2ARC. This means
+ * When the L2ARC is in use, it will also take advantage of the b_pabd. The
+ * L2ARC will always write the contents of b_pabd to the L2ARC. This means
* that when compressed ARC is enabled that the L2ARC blocks are identical
* to the on-disk block in the main data pool. This provides a significant
* advantage since the ARC can leverage the bp's checksum when reading from the
@@ -271,7 +271,9 @@
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/dsl_pool.h>
+#include <sys/zio_checksum.h>
#include <sys/multilist.h>
+#include <sys/abd.h>
#ifdef _KERNEL
#include <sys/vmsystm.h>
#include <vm/anon.h>
@@ -315,7 +317,7 @@ int zfs_arc_num_sublists_per_state = 0;
/* number of seconds before growing cache again */
static int arc_grow_retry = 5;
-/* shift of arc_c for calculating overflow limit in arc_get_data_buf */
+/* shift of arc_c for calculating overflow limit in arc_get_data_impl */
int zfs_arc_overflow_shift = 8;
/* shift of arc_c for calculating both min and max arc_p */
@@ -455,13 +457,13 @@ typedef struct arc_stats {
kstat_named_t arcstat_c_max;
kstat_named_t arcstat_size;
/*
- * Number of compressed bytes stored in the arc_buf_hdr_t's b_pdata.
+ * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd.
* Note that the compressed bytes may match the uncompressed bytes
* if the block is either not compressed or compressed arc is disabled.
*/
kstat_named_t arcstat_compressed_size;
/*
- * Uncompressed size of the data stored in b_pdata. If compressed
+ * Uncompressed size of the data stored in b_pabd. If compressed
* arc is disabled then this value will be identical to the stat
* above.
*/
@@ -960,7 +962,7 @@ typedef struct l2arc_read_callback {
typedef struct l2arc_data_free {
/* protected by l2arc_free_on_write_mtx */
- void *l2df_data;
+ abd_t *l2df_abd;
size_t l2df_size;
arc_buf_contents_t l2df_type;
list_node_t l2df_list_node;
@@ -970,10 +972,14 @@ static kmutex_t l2arc_feed_thr_lock;
static kcondvar_t l2arc_feed_thr_cv;
static uint8_t l2arc_thread_exit;
+static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *);
static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *);
+static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *);
+static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *);
static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *);
-static void arc_hdr_free_pdata(arc_buf_hdr_t *hdr);
-static void arc_hdr_alloc_pdata(arc_buf_hdr_t *);
+static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag);
+static void arc_hdr_free_pabd(arc_buf_hdr_t *);
+static void arc_hdr_alloc_pabd(arc_buf_hdr_t *);
static void arc_access(arc_buf_hdr_t *, kmutex_t *);
static boolean_t arc_is_overflowing(void);
static void arc_buf_watch(arc_buf_t *);
@@ -1336,7 +1342,9 @@ static inline boolean_t
arc_buf_is_shared(arc_buf_t *buf)
{
boolean_t shared = (buf->b_data != NULL &&
- buf->b_data == buf->b_hdr->b_l1hdr.b_pdata);
+ buf->b_hdr->b_l1hdr.b_pabd != NULL &&
+ abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) &&
+ buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd));
IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr));
IMPLY(shared, ARC_BUF_SHARED(buf));
IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf));
@@ -1376,8 +1384,6 @@ arc_cksum_verify(arc_buf_t *buf)
return;
if (ARC_BUF_COMPRESSED(buf)) {
- ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
- hdr->b_l1hdr.b_bufcnt > 1);
return;
}
@@ -1424,7 +1430,8 @@ arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
cbuf = zio_buf_alloc(HDR_GET_PSIZE(hdr));
lsize = HDR_GET_LSIZE(hdr);
- csize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
+ csize = zio_compress_data(compress, zio->io_abd, cbuf, lsize);
+
ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr));
if (csize < HDR_GET_PSIZE(hdr)) {
/*
@@ -1459,7 +1466,7 @@ arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
* logical I/O size and not just a gang fragment.
*/
valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp,
- BP_GET_CHECKSUM(zio->io_bp), zio->io_data, zio->io_size,
+ BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size,
zio->io_offset, NULL) == 0);
zio_pop_transforms(zio);
return (valid_cksum);
@@ -1483,18 +1490,9 @@ arc_cksum_compute(arc_buf_t *buf)
mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
- ASSERT(!ARC_BUF_COMPRESSED(buf) || hdr->b_l1hdr.b_bufcnt > 1);
mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
return;
} else if (ARC_BUF_COMPRESSED(buf)) {
- /*
- * Since the checksum doesn't apply to compressed buffers, we
- * only keep a checksum if there are uncompressed buffers.
- * Therefore there must be another buffer, which is
- * uncompressed.
- */
- IMPLY(hdr->b_l1hdr.b_freeze_cksum != NULL,
- hdr->b_l1hdr.b_bufcnt > 1);
mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
return;
}
@@ -1589,8 +1587,6 @@ arc_buf_thaw(arc_buf_t *buf)
* allocate b_thawed.
*/
if (ARC_BUF_COMPRESSED(buf)) {
- ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
- hdr->b_l1hdr.b_bufcnt > 1);
return;
}
@@ -1609,8 +1605,6 @@ arc_buf_freeze(arc_buf_t *buf)
return;
if (ARC_BUF_COMPRESSED(buf)) {
- ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
- hdr->b_l1hdr.b_bufcnt > 1);
return;
}
@@ -1740,7 +1734,7 @@ arc_buf_fill(arc_buf_t *buf, boolean_t compressed)
if (hdr_compressed == compressed) {
if (!arc_buf_is_shared(buf)) {
- bcopy(hdr->b_l1hdr.b_pdata, buf->b_data,
+ abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd,
arc_buf_size(buf));
}
} else {
@@ -1792,7 +1786,7 @@ arc_buf_fill(arc_buf_t *buf, boolean_t compressed)
return (0);
} else {
int error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
- hdr->b_l1hdr.b_pdata, buf->b_data,
+ hdr->b_l1hdr.b_pabd, buf->b_data,
HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
/*
@@ -1829,7 +1823,7 @@ arc_decompress(arc_buf_t *buf)
}
/*
- * Return the size of the block, b_pdata, that is stored in the arc_buf_hdr_t.
+ * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t.
*/
static uint64_t
arc_hdr_size(arc_buf_hdr_t *hdr)
@@ -1862,14 +1856,14 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
if (GHOST_STATE(state)) {
ASSERT0(hdr->b_l1hdr.b_bufcnt);
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
- ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
(void) refcount_add_many(&state->arcs_esize[type],
HDR_GET_LSIZE(hdr), hdr);
return;
}
ASSERT(!GHOST_STATE(state));
- if (hdr->b_l1hdr.b_pdata != NULL) {
+ if (hdr->b_l1hdr.b_pabd != NULL) {
(void) refcount_add_many(&state->arcs_esize[type],
arc_hdr_size(hdr), hdr);
}
@@ -1897,14 +1891,14 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
if (GHOST_STATE(state)) {
ASSERT0(hdr->b_l1hdr.b_bufcnt);
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
- ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
(void) refcount_remove_many(&state->arcs_esize[type],
HDR_GET_LSIZE(hdr), hdr);
return;
}
ASSERT(!GHOST_STATE(state));
- if (hdr->b_l1hdr.b_pdata != NULL) {
+ if (hdr->b_l1hdr.b_pabd != NULL) {
(void) refcount_remove_many(&state->arcs_esize[type],
arc_hdr_size(hdr), hdr);
}
@@ -2051,7 +2045,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
old_state = hdr->b_l1hdr.b_state;
refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt);
bufcnt = hdr->b_l1hdr.b_bufcnt;
- update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pdata != NULL);
+ update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL);
} else {
old_state = arc_l2c_only;
refcnt = 0;
@@ -2120,7 +2114,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
*/
(void) refcount_add_many(&new_state->arcs_size,
HDR_GET_LSIZE(hdr), hdr);
- ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
} else {
arc_buf_t *buf;
uint32_t buffers = 0;
@@ -2150,7 +2144,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
}
ASSERT3U(bufcnt, ==, buffers);
- if (hdr->b_l1hdr.b_pdata != NULL) {
+ if (hdr->b_l1hdr.b_pabd != NULL) {
(void) refcount_add_many(&new_state->arcs_size,
arc_hdr_size(hdr), hdr);
} else {
@@ -2163,7 +2157,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
ASSERT(HDR_HAS_L1HDR(hdr));
if (GHOST_STATE(old_state)) {
ASSERT0(bufcnt);
- ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
/*
* When moving a header off of a ghost state,
@@ -2204,7 +2198,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
buf);
}
ASSERT3U(bufcnt, ==, buffers);
- ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
(void) refcount_remove_many(
&old_state->arcs_size, arc_hdr_size(hdr), hdr);
}
@@ -2302,7 +2296,7 @@ arc_space_return(uint64_t space, arc_space_type_t type)
/*
* Given a hdr and a buf, returns whether that buf can share its b_data buffer
- * with the hdr's b_pdata.
+ * with the hdr's b_pabd.
*/
static boolean_t
arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf)
@@ -2397,17 +2391,20 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed,
* set the appropriate bit in the hdr's b_flags to indicate the hdr is
* allocate a new buffer to store the buf's data.
*
- * There is one additional restriction here because we're sharing
- * hdr -> buf instead of the usual buf -> hdr: the hdr can't be actively
- * involved in an L2ARC write, because if this buf is used by an
- * arc_write() then the hdr's data buffer will be released when the
+ * There are two additional restrictions here because we're sharing
+ * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be
+ * actively involved in an L2ARC write, because if this buf is used by
+ * an arc_write() then the hdr's data buffer will be released when the
* write completes, even though the L2ARC write might still be using it.
+ * Second, the hdr's ABD must be linear so that the buf's user doesn't
+ * need to be ABD-aware.
*/
- can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr);
+ can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) &&
+ abd_is_linear(hdr->b_l1hdr.b_pabd);
/* Set up b_data and sharing */
if (can_share) {
- buf->b_data = hdr->b_l1hdr.b_pdata;
+ buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd);
buf->b_flags |= ARC_BUF_FLAG_SHARED;
arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
} else {
@@ -2492,11 +2489,11 @@ arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
}
static void
-l2arc_free_data_on_write(void *data, size_t size, arc_buf_contents_t type)
+l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type)
{
l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP);
- df->l2df_data = data;
+ df->l2df_abd = abd;
df->l2df_size = size;
df->l2df_type = type;
mutex_enter(&l2arc_free_on_write_mtx);
@@ -2521,7 +2518,7 @@ arc_hdr_free_on_write(arc_buf_hdr_t *hdr)
}
(void) refcount_remove_many(&state->arcs_size, size, hdr);
- l2arc_free_data_on_write(hdr->b_l1hdr.b_pdata, size, type);
+ l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type);
}
/*
@@ -2533,7 +2530,7 @@ static void
arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
{
ASSERT(arc_can_share(hdr, buf));
- ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
/*
@@ -2542,7 +2539,9 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
* the refcount whenever an arc_buf_t is shared.
*/
refcount_transfer_ownership(&hdr->b_l1hdr.b_state->arcs_size, buf, hdr);
- hdr->b_l1hdr.b_pdata = buf->b_data;
+ hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf));
+ abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd,
+ HDR_ISTYPE_METADATA(hdr));
arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
buf->b_flags |= ARC_BUF_FLAG_SHARED;
@@ -2560,7 +2559,7 @@ static void
arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
{
ASSERT(arc_buf_is_shared(buf));
- ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
/*
@@ -2569,7 +2568,9 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
*/
refcount_transfer_ownership(&hdr->b_l1hdr.b_state->arcs_size, hdr, buf);
arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
- hdr->b_l1hdr.b_pdata = NULL;
+ abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd);
+ abd_put(hdr->b_l1hdr.b_pabd);
+ hdr->b_l1hdr.b_pabd = NULL;
buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
/*
@@ -2665,7 +2666,7 @@ arc_buf_destroy_impl(arc_buf_t *buf)
if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
/*
* If the current arc_buf_t is sharing its data buffer with the
- * hdr, then reassign the hdr's b_pdata to share it with the new
+ * hdr, then reassign the hdr's b_pabd to share it with the new
* buffer at the end of the list. The shared buffer is always
* the last one on the hdr's buffer list.
*
@@ -2680,8 +2681,8 @@ arc_buf_destroy_impl(arc_buf_t *buf)
/* hdr is uncompressed so can't have compressed buf */
VERIFY(!ARC_BUF_COMPRESSED(lastbuf));
- ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
- arc_hdr_free_pdata(hdr);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+ arc_hdr_free_pabd(hdr);
/*
* We must setup a new shared block between the
@@ -2714,26 +2715,26 @@ arc_buf_destroy_impl(arc_buf_t *buf)
}
static void
-arc_hdr_alloc_pdata(arc_buf_hdr_t *hdr)
+arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr)
{
ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT(!HDR_SHARED_DATA(hdr));
- ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
- hdr->b_l1hdr.b_pdata = arc_get_data_buf(hdr, arc_hdr_size(hdr), hdr);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+ hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr);
hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
- ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
}
static void
-arc_hdr_free_pdata(arc_buf_hdr_t *hdr)
+arc_hdr_free_pabd(arc_buf_hdr_t *hdr)
{
ASSERT(HDR_HAS_L1HDR(hdr));
- ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
/*
* If the hdr is currently being written to the l2arc then
@@ -2745,10 +2746,10 @@ arc_hdr_free_pdata(arc_buf_hdr_t *hdr)
arc_hdr_free_on_write(hdr);
ARCSTAT_BUMP(arcstat_l2_free_on_write);
} else {
- arc_free_data_buf(hdr, hdr->b_l1hdr.b_pdata,
+ arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
arc_hdr_size(hdr), hdr);
}
- hdr->b_l1hdr.b_pdata = NULL;
+ hdr->b_l1hdr.b_pabd = NULL;
hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
@@ -2784,7 +2785,7 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
* the compressed or uncompressed data depending on the block
* it references and compressed arc enablement.
*/
- arc_hdr_alloc_pdata(hdr);
+ arc_hdr_alloc_pabd(hdr);
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
return (hdr);
@@ -2824,7 +2825,7 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
nhdr->b_l1hdr.b_state = arc_l2c_only;
/* Verify previous threads set to NULL before freeing */
- ASSERT3P(nhdr->b_l1hdr.b_pdata, ==, NULL);
+ ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL);
} else {
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
ASSERT0(hdr->b_l1hdr.b_bufcnt);
@@ -2842,11 +2843,11 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
/*
* A buffer must not be moved into the arc_l2c_only
* state if it's not finished being written out to the
- * l2arc device. Otherwise, the b_l1hdr.b_pdata field
+ * l2arc device. Otherwise, the b_l1hdr.b_pabd field
* might try to be accessed, even though it was removed.
*/
VERIFY(!HDR_L2_WRITING(hdr));
- VERIFY3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+ VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL);
arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR);
}
@@ -2931,6 +2932,18 @@ arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize,
arc_buf_thaw(buf);
ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
+ if (!arc_buf_is_shared(buf)) {
+ /*
+ * To ensure that the hdr has the correct data in it if we call
+ * arc_decompress() on this buf before it's been written to
+ * disk, it's easiest if we just set up sharing between the
+ * buf and the hdr.
+ */
+ ASSERT(!abd_is_linear(hdr->b_l1hdr.b_pabd));
+ arc_hdr_free_pabd(hdr);
+ arc_share_buf(hdr, buf);
+ }
+
return (buf);
}
@@ -2999,9 +3012,8 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
while (hdr->b_l1hdr.b_buf != NULL)
arc_buf_destroy_impl(hdr->b_l1hdr.b_buf);
- if (hdr->b_l1hdr.b_pdata != NULL) {
- arc_hdr_free_pdata(hdr);
- }
+ if (hdr->b_l1hdr.b_pabd != NULL)
+ arc_hdr_free_pabd(hdr);
}
ASSERT3P(hdr->b_hash_next, ==, NULL);
@@ -3068,7 +3080,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
/*
* l2arc_write_buffers() relies on a header's L1 portion
- * (i.e. its b_pdata field) during its write phase.
+ * (i.e. its b_pabd field) during it's write phase.
* Thus, we cannot push a header onto the arc_l2c_only
* state (removing its L1 piece) until the header is
* done being written to the l2arc.
@@ -3084,7 +3096,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
if (HDR_HAS_L2HDR(hdr)) {
- ASSERT(hdr->b_l1hdr.b_pdata == NULL);
+ ASSERT(hdr->b_l1hdr.b_pabd == NULL);
/*
* This buffer is cached on the 2nd Level ARC;
* don't destroy the header.
@@ -3149,9 +3161,9 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
* If this hdr is being evicted and has a compressed
* buffer then we discard it here before we change states.
* This ensures that the accounting is updated correctly
- * in arc_free_data_buf().
+ * in arc_free_data_impl().
*/
- arc_hdr_free_pdata(hdr);
+ arc_hdr_free_pabd(hdr);
arc_change_state(evicted_state, hdr, hash_lock);
ASSERT(HDR_IN_HASH_TABLE(hdr));
@@ -3249,7 +3261,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
* thread. If we used cv_broadcast, we could
* wake up "too many" threads causing arc_size
* to significantly overflow arc_c; since
- * arc_get_data_buf() doesn't check for overflow
+ * arc_get_data_impl() doesn't check for overflow
* when it's woken up (it doesn't because it's
* possible for the ARC to be overflowing while
* full of un-evictable buffers, and the
@@ -4154,13 +4166,13 @@ arc_kmem_reap_now(void)
}
/*
- * Threads can block in arc_get_data_buf() waiting for this thread to evict
+ * Threads can block in arc_get_data_impl() waiting for this thread to evict
* enough data and signal them to proceed. When this happens, the threads in
- * arc_get_data_buf() are sleeping while holding the hash lock for their
+ * arc_get_data_impl() are sleeping while holding the hash lock for their
* particular arc header. Thus, we must be careful to never sleep on a
* hash lock in this thread. This is to prevent the following deadlock:
*
- * - Thread A sleeps on CV in arc_get_data_buf() holding hash lock "L",
+ * - Thread A sleeps on CV in arc_get_data_impl() holding hash lock "L",
* waiting for the reclaim thread to signal it.
*
* - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter,
@@ -4509,18 +4521,45 @@ arc_is_overflowing(void)
return (arc_size >= arc_c + overflow);
}
+static abd_t *
+arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
+{
+ arc_buf_contents_t type = arc_buf_type(hdr);
+
+ arc_get_data_impl(hdr, size, tag);
+ if (type == ARC_BUFC_METADATA) {
+ return (abd_alloc(size, B_TRUE));
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ return (abd_alloc(size, B_FALSE));
+ }
+}
+
+static void *
+arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
+{
+ arc_buf_contents_t type = arc_buf_type(hdr);
+
+ arc_get_data_impl(hdr, size, tag);
+ if (type == ARC_BUFC_METADATA) {
+ return (zio_buf_alloc(size));
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ return (zio_data_buf_alloc(size));
+ }
+}
+
/*
* Allocate a block and return it to the caller. If we are hitting the
* hard limit for the cache size, we must sleep, waiting for the eviction
* thread to catch up. If we're past the target size but below the hard
* limit, we'll only signal the reclaim thread and continue on.
*/
-static void *
-arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
+static void
+arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
{
- void *datap = NULL;
- arc_state_t *state = hdr->b_l1hdr.b_state;
- arc_buf_contents_t type = arc_buf_type(hdr);
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+ arc_buf_contents_t type = arc_buf_type(hdr);
arc_adapt(size, state);
@@ -4562,11 +4601,8 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
VERIFY3U(hdr->b_type, ==, type);
if (type == ARC_BUFC_METADATA) {
- datap = zio_buf_alloc(size);
arc_space_consume(size, ARC_SPACE_META);
} else {
- ASSERT(type == ARC_BUFC_DATA);
- datap = zio_data_buf_alloc(size);
arc_space_consume(size, ARC_SPACE_DATA);
}
@@ -4602,14 +4638,34 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
refcount_count(&arc_mru->arcs_size) > arc_p))
arc_p = MIN(arc_c, arc_p + size);
}
- return (datap);
+}
+
+static void
+arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag)
+{
+ arc_free_data_impl(hdr, size, tag);
+ abd_free(abd);
+}
+
+static void
+arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag)
+{
+ arc_buf_contents_t type = arc_buf_type(hdr);
+
+ arc_free_data_impl(hdr, size, tag);
+ if (type == ARC_BUFC_METADATA) {
+ zio_buf_free(buf, size);
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ zio_data_buf_free(buf, size);
+ }
}
/*
* Free the arc data buffer.
*/
static void
-arc_free_data_buf(arc_buf_hdr_t *hdr, void *data, uint64_t size, void *tag)
+arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
{
arc_state_t *state = hdr->b_l1hdr.b_state;
arc_buf_contents_t type = arc_buf_type(hdr);
@@ -4626,11 +4682,9 @@ arc_free_data_buf(arc_buf_hdr_t *hdr, void *data, uint64_t size, void *tag)
VERIFY3U(hdr->b_type, ==, type);
if (type == ARC_BUFC_METADATA) {
- zio_buf_free(data, size);
arc_space_return(size, ARC_SPACE_META);
} else {
ASSERT(type == ARC_BUFC_DATA);
- zio_data_buf_free(data, size);
arc_space_return(size, ARC_SPACE_DATA);
}
}
@@ -4912,7 +4966,7 @@ arc_read_done(zio_t *zio)
if (callback_cnt == 0) {
ASSERT(HDR_PREFETCH(hdr));
ASSERT0(hdr->b_l1hdr.b_bufcnt);
- ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
}
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
@@ -5009,7 +5063,7 @@ top:
hdr = buf_hash_find(guid, bp, &hash_lock);
}
- if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pdata != NULL) {
+ if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pabd != NULL) {
arc_buf_t *buf = NULL;
*arc_flags |= ARC_FLAG_CACHED;
@@ -5161,7 +5215,7 @@ top:
hdr_full_cache);
}
- ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state));
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
@@ -5179,9 +5233,9 @@ top:
* avoid hitting an assert in remove_reference().
*/
arc_access(hdr, hash_lock);
- arc_hdr_alloc_pdata(hdr);
+ arc_hdr_alloc_pabd(hdr);
}
- ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
size = arc_hdr_size(hdr);
/*
@@ -5285,7 +5339,7 @@ top:
ASSERT3U(HDR_GET_COMPRESS(hdr), !=,
ZIO_COMPRESS_EMPTY);
rzio = zio_read_phys(pio, vd, addr,
- size, hdr->b_l1hdr.b_pdata,
+ size, hdr->b_l1hdr.b_pabd,
ZIO_CHECKSUM_OFF,
l2arc_read_done, cb, priority,
zio_flags | ZIO_FLAG_DONT_CACHE |
@@ -5325,7 +5379,7 @@ top:
}
}
- rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pdata, size,
+ rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pabd, size,
arc_read_done, hdr, priority, zio_flags, zb);
if (*arc_flags & ARC_FLAG_WAIT) {
@@ -5557,16 +5611,17 @@ arc_release(arc_buf_t *buf, void *tag)
arc_unshare_buf(hdr, buf);
/*
- * Now we need to recreate the hdr's b_pdata. Since we
+ * Now we need to recreate the hdr's b_pabd. Since we
* have lastbuf handy, we try to share with it, but if
- * we can't then we allocate a new b_pdata and copy the
+ * we can't then we allocate a new b_pabd and copy the
* data from buf into it.
*/
if (arc_can_share(hdr, lastbuf)) {
arc_share_buf(hdr, lastbuf);
} else {
- arc_hdr_alloc_pdata(hdr);
- bcopy(buf->b_data, hdr->b_l1hdr.b_pdata, psize);
+ arc_hdr_alloc_pabd(hdr);
+ abd_copy_from_buf(hdr->b_l1hdr.b_pabd,
+ buf->b_data, psize);
}
VERIFY3P(lastbuf->b_data, !=, NULL);
} else if (HDR_SHARED_DATA(hdr)) {
@@ -5582,7 +5637,7 @@ arc_release(arc_buf_t *buf, void *tag)
HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
ASSERT(!ARC_BUF_SHARED(buf));
}
- ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
ASSERT3P(state, !=, arc_l2c_only);
(void) refcount_remove_many(&state->arcs_size,
@@ -5601,7 +5656,7 @@ arc_release(arc_buf_t *buf, void *tag)
mutex_exit(hash_lock);
/*
- * Allocate a new hdr. The new hdr will contain a b_pdata
+ * Allocate a new hdr. The new hdr will contain a b_pabd
* buffer which will be freed in arc_write().
*/
nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type);
@@ -5677,6 +5732,7 @@ arc_write_ready(zio_t *zio)
arc_buf_hdr_t *hdr = buf->b_hdr;
uint64_t psize = BP_IS_HOLE(zio->io_bp) ? 0 : BP_GET_PSIZE(zio->io_bp);
enum zio_compress compress;
+ fstrans_cookie_t cookie = spl_fstrans_mark();
ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
@@ -5690,15 +5746,15 @@ arc_write_ready(zio_t *zio)
if (zio->io_flags & ZIO_FLAG_REEXECUTED) {
arc_cksum_free(hdr);
arc_buf_unwatch(buf);
- if (hdr->b_l1hdr.b_pdata != NULL) {
+ if (hdr->b_l1hdr.b_pabd != NULL) {
if (arc_buf_is_shared(buf)) {
arc_unshare_buf(hdr, buf);
} else {
- arc_hdr_free_pdata(hdr);
+ arc_hdr_free_pabd(hdr);
}
}
}
- ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
ASSERT(!HDR_SHARED_DATA(hdr));
ASSERT(!arc_buf_is_shared(buf));
@@ -5720,33 +5776,47 @@ arc_write_ready(zio_t *zio)
arc_hdr_set_compress(hdr, compress);
/*
- * If the hdr is compressed, then copy the compressed
- * zio contents into arc_buf_hdr_t. Otherwise, copy the original
- * data buf into the hdr. Ideally, we would like to always copy the
- * io_data into b_pdata but the user may have disabled compressed
- * arc thus the on-disk block may or may not match what we maintain
- * in the hdr's b_pdata field.
+ * Fill the hdr with data. If the hdr is compressed, the data we want
+ * is available from the zio, otherwise we can take it from the buf.
+ *
+ * We might be able to share the buf's data with the hdr here. However,
+ * doing so would cause the ARC to be full of linear ABDs if we write a
+ * lot of shareable data. As a compromise, we check whether scattered
+ * ABDs are allowed, and assume that if they are then the user wants
+ * the ARC to be primarily filled with them regardless of the data being
+ * written. Therefore, if they're allowed then we allocate one and copy
+ * the data into it; otherwise, we share the data directly if we can.
*/
- if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
- !ARC_BUF_COMPRESSED(buf)) {
- ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=, ZIO_COMPRESS_OFF);
- ASSERT3U(psize, >, 0);
- arc_hdr_alloc_pdata(hdr);
- bcopy(zio->io_data, hdr->b_l1hdr.b_pdata, psize);
+ if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) {
+ arc_hdr_alloc_pabd(hdr);
+
+ /*
+ * Ideally, we would always copy the io_abd into b_pabd, but the
+ * user may have disabled compressed ARC, thus we must check the
+ * hdr's compression setting rather than the io_bp's.
+ */
+ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) {
+ ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=,
+ ZIO_COMPRESS_OFF);
+ ASSERT3U(psize, >, 0);
+
+ abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize);
+ } else {
+ ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr));
+
+ abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data,
+ arc_buf_size(buf));
+ }
} else {
- ASSERT3P(buf->b_data, ==, zio->io_orig_data);
+ ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd));
ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf));
ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
- /*
- * This hdr is not compressed so we're able to share
- * the arc_buf_t data buffer with the hdr.
- */
arc_share_buf(hdr, buf);
- ASSERT0(bcmp(zio->io_orig_data, hdr->b_l1hdr.b_pdata,
- HDR_GET_LSIZE(hdr)));
}
+
arc_hdr_verify(hdr, zio->io_bp);
+ spl_fstrans_unmark(cookie);
}
static void
@@ -5850,6 +5920,7 @@ arc_write_done(zio_t *zio)
ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
callback->awcb_done(zio, buf, callback->awcb_private);
+ abd_put(zio->io_abd);
kmem_free(callback, sizeof (arc_write_callback_t));
}
@@ -5886,10 +5957,10 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
callback->awcb_buf = buf;
/*
- * The hdr's b_pdata is now stale, free it now. A new data block
+ * The hdr's b_pabd is now stale, free it now. A new data block
* will be allocated when the zio pipeline calls arc_write_ready().
*/
- if (hdr->b_l1hdr.b_pdata != NULL) {
+ if (hdr->b_l1hdr.b_pabd != NULL) {
/*
* If the buf is currently sharing the data block with
* the hdr then we need to break that relationship here.
@@ -5899,15 +5970,16 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
if (arc_buf_is_shared(buf)) {
arc_unshare_buf(hdr, buf);
} else {
- arc_hdr_free_pdata(hdr);
+ arc_hdr_free_pabd(hdr);
}
VERIFY3P(buf->b_data, !=, NULL);
arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF);
}
ASSERT(!arc_buf_is_shared(buf));
- ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
- zio = zio_write(pio, spa, txg, bp, buf->b_data,
+ zio = zio_write(pio, spa, txg, bp,
+ abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)),
HDR_GET_LSIZE(hdr), arc_buf_size(buf), zp,
arc_write_ready,
(children_ready != NULL) ? arc_write_children_ready : NULL,
@@ -6768,13 +6840,8 @@ l2arc_do_free_on_write(void)
for (df = list_tail(buflist); df; df = df_prev) {
df_prev = list_prev(buflist, df);
- ASSERT3P(df->l2df_data, !=, NULL);
- if (df->l2df_type == ARC_BUFC_METADATA) {
- zio_buf_free(df->l2df_data, df->l2df_size);
- } else {
- ASSERT(df->l2df_type == ARC_BUFC_DATA);
- zio_data_buf_free(df->l2df_data, df->l2df_size);
- }
+ ASSERT3P(df->l2df_abd, !=, NULL);
+ abd_free(df->l2df_abd);
list_remove(buflist, df);
kmem_free(df, sizeof (l2arc_data_free_t));
}
@@ -6928,12 +6995,12 @@ l2arc_read_done(zio_t *zio)
mutex_enter(hash_lock);
ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
- ASSERT3P(zio->io_data, !=, NULL);
+ ASSERT3P(zio->io_abd, !=, NULL);
/*
* Check this survived the L2ARC journey.
*/
- ASSERT3P(zio->io_data, ==, hdr->b_l1hdr.b_pdata);
+ ASSERT3P(zio->io_abd, ==, hdr->b_l1hdr.b_pabd);
zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */
@@ -6967,7 +7034,7 @@ l2arc_read_done(zio_t *zio)
ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp,
- hdr->b_l1hdr.b_pdata, zio->io_size, arc_read_done,
+ hdr->b_l1hdr.b_pabd, zio->io_size, arc_read_done,
hdr, zio->io_priority, cb->l2rcb_flags,
&cb->l2rcb_zb));
}
@@ -7191,7 +7258,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
for (; hdr; hdr = hdr_prev) {
kmutex_t *hash_lock;
uint64_t asize, size;
- void *to_write;
+ abd_t *to_write;
if (arc_warm == B_FALSE)
hdr_prev = multilist_sublist_next(mls, hdr);
@@ -7264,7 +7331,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
- ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
ASSERT3U(arc_hdr_size(hdr), >, 0);
size = arc_hdr_size(hdr);
@@ -7280,18 +7347,13 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
* add it to the l2arc_free_on_write queue.
*/
if (!HDR_SHARED_DATA(hdr)) {
- to_write = hdr->b_l1hdr.b_pdata;
+ to_write = hdr->b_l1hdr.b_pabd;
} else {
- arc_buf_contents_t type = arc_buf_type(hdr);
- if (type == ARC_BUFC_METADATA) {
- to_write = zio_buf_alloc(size);
- } else {
- ASSERT3U(type, ==, ARC_BUFC_DATA);
- to_write = zio_data_buf_alloc(size);
- }
-
- bcopy(hdr->b_l1hdr.b_pdata, to_write, size);
- l2arc_free_data_on_write(to_write, size, type);
+ to_write = abd_alloc_for_io(size,
+ HDR_ISTYPE_METADATA(hdr));
+ abd_copy(to_write, hdr->b_l1hdr.b_pabd, size);
+ l2arc_free_abd_on_write(to_write, size,
+ arc_buf_type(hdr));
}
wzio = zio_write_phys(pio, dev->l2ad_vdev,
hdr->b_l2hdr.b_daddr, size, to_write,
diff --git a/module/zfs/blkptr.c b/module/zfs/blkptr.c
index d56e19996..99accfa0f 100644
--- a/module/zfs/blkptr.c
+++ b/module/zfs/blkptr.c
@@ -14,7 +14,7 @@
*/
/*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 1d8c0518a..cfa4fd1fc 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
@@ -46,6 +46,7 @@
#include <sys/range_tree.h>
#include <sys/trace_dbuf.h>
#include <sys/callb.h>
+#include <sys/abd.h>
struct dbuf_hold_impl_data {
/* Function arguments */
@@ -1019,7 +1020,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
ASSERT3U(bonuslen, <=, db->db.db_size);
- db->db.db_data = zio_buf_alloc(max_bonuslen);
+ db->db.db_data = kmem_alloc(max_bonuslen, KM_SLEEP);
arc_space_consume(max_bonuslen, ARC_SPACE_BONUS);
if (bonuslen < max_bonuslen)
bzero(db->db.db_data, max_bonuslen);
@@ -1131,10 +1132,9 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
*/
ASSERT(dr->dr_txg >= txg - 2);
if (db->db_blkid == DMU_BONUS_BLKID) {
- /* Note that the data bufs here are zio_bufs */
dnode_t *dn = DB_DNODE(db);
int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
- dr->dt.dl.dr_data = zio_buf_alloc(bonuslen);
+ dr->dt.dl.dr_data = kmem_alloc(bonuslen, KM_SLEEP);
arc_space_consume(bonuslen, ARC_SPACE_BONUS);
bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen);
} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
@@ -1207,7 +1207,8 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
} else if (db->db_state == DB_UNCACHED) {
spa_t *spa = dn->dn_objset->os_spa;
- if (zio == NULL)
+ if (zio == NULL &&
+ db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr))
zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
err = dbuf_read_impl(db, zio, flags);
@@ -1221,7 +1222,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
rw_exit(&dn->dn_struct_rwlock);
DB_DNODE_EXIT(db);
- if (!err && !havepzio)
+ if (!err && !havepzio && zio != NULL)
err = zio_wait(zio);
} else {
/*
@@ -2156,7 +2157,7 @@ dbuf_destroy(dmu_buf_impl_t *db)
int slots = DB_DNODE(db)->dn_num_slots;
int bonuslen = DN_SLOTS_TO_BONUSLEN(slots);
ASSERT(db->db.db_data != NULL);
- zio_buf_free(db->db.db_data, bonuslen);
+ kmem_free(db->db.db_data, bonuslen);
arc_space_return(bonuslen, ARC_SPACE_BONUS);
db->db_state = DB_UNCACHED;
}
@@ -3303,7 +3304,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
if (*datap != db->db.db_data) {
int slots = DB_DNODE(db)->dn_num_slots;
int bonuslen = DN_SLOTS_TO_BONUSLEN(slots);
- zio_buf_free(*datap, bonuslen);
+ kmem_free(*datap, bonuslen);
arc_space_return(bonuslen, ARC_SPACE_BONUS);
}
db->db_data_pending = NULL;
@@ -3709,6 +3710,9 @@ dbuf_write_override_done(zio_t *zio)
mutex_exit(&db->db_mtx);
dbuf_write_done(zio, NULL, db);
+
+ if (zio->io_abd != NULL)
+ abd_put(zio->io_abd);
}
/* Issue I/O to commit a dirty buffer to disk. */
@@ -3801,7 +3805,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
* The BP for this block has been provided by open context
* (by dmu_sync() or dmu_buf_write_embedded()).
*/
- void *contents = (data != NULL) ? data->b_data : NULL;
+ abd_t *contents = (data != NULL) ?
+ abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL;
dr->dr_zio = zio_write(zio, os->os_spa, txg,
&dr->dr_bp_copy, contents, db->db.db_size, db->db.db_size,
diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
index 09a3536f5..75ab7f5b2 100644
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -36,6 +36,7 @@
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dsl_scan.h>
+#include <sys/abd.h>
static kmem_cache_t *ddt_cache;
static kmem_cache_t *ddt_entry_cache;
@@ -528,10 +529,17 @@ ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total)
uint64_t
ddt_get_dedup_dspace(spa_t *spa)
{
- ddt_stat_t dds_total = { 0 };
+ ddt_stat_t dds_total;
+
+ if (spa->spa_dedup_dspace != ~0ULL)
+ return (spa->spa_dedup_dspace);
+
+ bzero(&dds_total, sizeof (ddt_stat_t));
+ /* Calculate and cache the stats */
ddt_get_dedup_stats(spa, &dds_total);
- return (dds_total.dds_ref_dsize - dds_total.dds_dsize);
+ spa->spa_dedup_dspace = dds_total.dds_ref_dsize - dds_total.dds_dsize;
+ return (spa->spa_dedup_dspace);
}
uint64_t
@@ -706,9 +714,8 @@ ddt_free(ddt_entry_t *dde)
for (p = 0; p < DDT_PHYS_TYPES; p++)
ASSERT(dde->dde_lead_zio[p] == NULL);
- if (dde->dde_repair_data != NULL)
- zio_buf_free(dde->dde_repair_data,
- DDK_GET_PSIZE(&dde->dde_key));
+ if (dde->dde_repair_abd != NULL)
+ abd_free(dde->dde_repair_abd);
cv_destroy(&dde->dde_cv);
kmem_cache_free(ddt_entry_cache, dde);
@@ -915,6 +922,7 @@ ddt_load(spa_t *spa)
*/
bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
sizeof (ddt->ddt_histogram));
+ spa->spa_dedup_dspace = ~0ULL;
}
return (0);
@@ -1002,7 +1010,7 @@ ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
ddt_enter(ddt);
- if (dde->dde_repair_data != NULL && spa_writeable(ddt->ddt_spa) &&
+ if (dde->dde_repair_abd != NULL && spa_writeable(ddt->ddt_spa) &&
avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
avl_insert(&ddt->ddt_repair_tree, dde, where);
else
@@ -1040,7 +1048,7 @@ ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
continue;
ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
- rdde->dde_repair_data, DDK_GET_PSIZE(rddk), NULL, NULL,
+ rdde->dde_repair_abd, DDK_GET_PSIZE(rddk), NULL, NULL,
ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL));
}
@@ -1182,6 +1190,7 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
sizeof (ddt->ddt_histogram));
+ spa->spa_dedup_dspace = ~0ULL;
}
void
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index a817fdbce..b5ddec2d9 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -47,6 +47,7 @@
#include <sys/zio_compress.h>
#include <sys/sa.h>
#include <sys/zfeature.h>
+#include <sys/abd.h>
#ifdef _KERNEL
#include <sys/vmsystm.h>
#include <sys/zfs_znode.h>
@@ -1513,6 +1514,7 @@ dmu_sync_late_arrival_done(zio_t *zio)
dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
+ abd_put(zio->io_abd);
kmem_free(dsa, sizeof (*dsa));
}
@@ -1537,11 +1539,11 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
dsa->dsa_zgd = zgd;
dsa->dsa_tx = tx;
- zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx),
- zgd->zgd_bp, zgd->zgd_db->db_data, zgd->zgd_db->db_size,
- zgd->zgd_db->db_size, zp, dmu_sync_late_arrival_ready, NULL,
- NULL, dmu_sync_late_arrival_done, dsa, ZIO_PRIORITY_SYNC_WRITE,
- ZIO_FLAG_CANFAIL, zb));
+ zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
+ abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size),
+ zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp,
+ dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done,
+ dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
return (0);
}
@@ -2062,6 +2064,7 @@ byteswap_uint8_array(void *vbuf, size_t size)
void
dmu_init(void)
{
+ abd_init();
zfs_dbgmsg_init();
sa_cache_init();
xuio_stat_init();
@@ -2087,6 +2090,7 @@ dmu_fini(void)
xuio_stat_fini();
sa_cache_fini();
zfs_dbgmsg_fini();
+ abd_fini();
}
#if defined(_KERNEL) && defined(HAVE_SPL)
diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c
index f9414ea3a..af6208e4d 100644
--- a/module/zfs/dmu_send.c
+++ b/module/zfs/dmu_send.c
@@ -166,7 +166,7 @@ dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len)
{
ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
- fletcher_4_incremental_native(dsp->dsa_drr,
+ (void) fletcher_4_incremental_native(dsp->dsa_drr,
offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
&dsp->dsa_zc);
if (dsp->dsa_drr->drr_type == DRR_BEGIN) {
@@ -179,13 +179,13 @@ dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len)
if (dsp->dsa_drr->drr_type == DRR_END) {
dsp->dsa_sent_end = B_TRUE;
}
- fletcher_4_incremental_native(&dsp->dsa_drr->
+ (void) fletcher_4_incremental_native(&dsp->dsa_drr->
drr_u.drr_checksum.drr_checksum,
sizeof (zio_cksum_t), &dsp->dsa_zc);
if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
return (SET_ERROR(EINTR));
if (payload_len != 0) {
- fletcher_4_incremental_native(payload, payload_len,
+ (void) fletcher_4_incremental_native(payload, payload_len,
&dsp->dsa_zc);
if (dump_bytes(dsp, payload, payload_len) != 0)
return (SET_ERROR(EINTR));
@@ -1786,11 +1786,11 @@ dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin,
if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
drc->drc_byteswap = B_TRUE;
- fletcher_4_incremental_byteswap(drr_begin,
+ (void) fletcher_4_incremental_byteswap(drr_begin,
sizeof (dmu_replay_record_t), &drc->drc_cksum);
byteswap_record(drr_begin);
} else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) {
- fletcher_4_incremental_native(drr_begin,
+ (void) fletcher_4_incremental_native(drr_begin,
sizeof (dmu_replay_record_t), &drc->drc_cksum);
} else {
return (SET_ERROR(EINVAL));
@@ -2470,9 +2470,9 @@ static void
receive_cksum(struct receive_arg *ra, int len, void *buf)
{
if (ra->byteswap) {
- fletcher_4_incremental_byteswap(buf, len, &ra->cksum);
+ (void) fletcher_4_incremental_byteswap(buf, len, &ra->cksum);
} else {
- fletcher_4_incremental_native(buf, len, &ra->cksum);
+ (void) fletcher_4_incremental_native(buf, len, &ra->cksum);
}
}
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index 41b3ce79b..fd7a53bc9 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
* Copyright 2016 Gary Mills
*/
@@ -47,6 +47,7 @@
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/zfeature.h>
+#include <sys/abd.h>
#ifdef _KERNEL
#include <sys/zfs_vfsops.h>
#endif
@@ -1820,7 +1821,7 @@ dsl_scan_scrub_done(zio_t *zio)
{
spa_t *spa = zio->io_spa;
- zio_data_buf_free(zio->io_data, zio->io_size);
+ abd_free(zio->io_abd);
mutex_enter(&spa->spa_scrub_lock);
spa->spa_scrub_inflight--;
@@ -1904,7 +1905,6 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
if (needs_io && !zfs_no_scrub_io) {
vdev_t *rvd = spa->spa_root_vdev;
uint64_t maxinflight = rvd->vdev_children * zfs_top_maxinflight;
- void *data = zio_data_buf_alloc(size);
mutex_enter(&spa->spa_scrub_lock);
while (spa->spa_scrub_inflight >= maxinflight)
@@ -1919,9 +1919,9 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle)
delay(scan_delay);
- zio_nowait(zio_read(NULL, spa, bp, data, size,
- dsl_scan_scrub_done, NULL, ZIO_PRIORITY_SCRUB,
- zio_flags, zb));
+ zio_nowait(zio_read(NULL, spa, bp,
+ abd_alloc_for_io(size, B_FALSE), size, dsl_scan_scrub_done,
+ NULL, ZIO_PRIORITY_SCRUB, zio_flags, zb));
}
/* do not relocate this block */
diff --git a/module/zfs/edonr_zfs.c b/module/zfs/edonr_zfs.c
index 3c7d98656..e92da6d6c 100644
--- a/module/zfs/edonr_zfs.c
+++ b/module/zfs/edonr_zfs.c
@@ -22,20 +22,32 @@
* Copyright 2013 Saso Kiselkov. All rights reserved.
* Use is subject to license terms.
*/
+/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
#include <sys/zfs_context.h>
#include <sys/zio.h>
#include <sys/edonr.h>
#include <sys/zfs_context.h> /* For CTASSERT() */
+#include <sys/abd.h>
#define EDONR_MODE 512
#define EDONR_BLOCK_SIZE EdonR512_BLOCK_SIZE
+static int
+edonr_incremental(void *buf, size_t size, void *arg)
+{
+ EdonRState *ctx = arg;
+ EdonRUpdate(ctx, buf, size * 8);
+ return (0);
+}
+
/*
* Native zio_checksum interface for the Edon-R hash function.
*/
/*ARGSUSED*/
void
-zio_checksum_edonr_native(const void *buf, uint64_t size,
+abd_checksum_edonr_native(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
uint8_t digest[EDONR_MODE / 8];
@@ -43,7 +55,7 @@ zio_checksum_edonr_native(const void *buf, uint64_t size,
ASSERT(ctx_template != NULL);
bcopy(ctx_template, &ctx, sizeof (ctx));
- EdonRUpdate(&ctx, buf, size * 8);
+ (void) abd_iterate_func(abd, 0, size, edonr_incremental, &ctx);
EdonRFinal(&ctx, digest);
bcopy(digest, zcp->zc_word, sizeof (zcp->zc_word));
}
@@ -52,12 +64,12 @@ zio_checksum_edonr_native(const void *buf, uint64_t size,
* Byteswapped zio_checksum interface for the Edon-R hash function.
*/
void
-zio_checksum_edonr_byteswap(const void *buf, uint64_t size,
+abd_checksum_edonr_byteswap(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
zio_cksum_t tmp;
- zio_checksum_edonr_native(buf, size, ctx_template, &tmp);
+ abd_checksum_edonr_native(abd, size, ctx_template, &tmp);
zcp->zc_word[0] = BSWAP_64(zcp->zc_word[0]);
zcp->zc_word[1] = BSWAP_64(zcp->zc_word[1]);
zcp->zc_word[2] = BSWAP_64(zcp->zc_word[2]);
@@ -65,7 +77,7 @@ zio_checksum_edonr_byteswap(const void *buf, uint64_t size,
}
void *
-zio_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt)
+abd_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt)
{
EdonRState *ctx;
uint8_t salt_block[EDONR_BLOCK_SIZE];
@@ -94,7 +106,7 @@ zio_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt)
}
void
-zio_checksum_edonr_tmpl_free(void *ctx_template)
+abd_checksum_edonr_tmpl_free(void *ctx_template)
{
EdonRState *ctx = ctx_template;
diff --git a/module/zfs/sa.c b/module/zfs/sa.c
index 0df3b4755..46aacfb1b 100644
--- a/module/zfs/sa.c
+++ b/module/zfs/sa.c
@@ -1690,7 +1690,7 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
if ((error = sa_get_spill(hdl)) == 0) {
spill_data_size = hdl->sa_spill->db_size;
- old_data[1] = zio_buf_alloc(spill_data_size);
+ old_data[1] = vmem_alloc(spill_data_size, KM_SLEEP);
bcopy(hdl->sa_spill->db_data, old_data[1],
hdl->sa_spill->db_size);
spill_attr_count =
@@ -1773,7 +1773,7 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
if (old_data[0])
kmem_free(old_data[0], bonus_data_size);
if (old_data[1])
- zio_buf_free(old_data[1], spill_data_size);
+ vmem_free(old_data[1], spill_data_size);
kmem_free(attr_desc, sizeof (sa_bulk_attr_t) * attr_count);
return (error);
@@ -1846,26 +1846,6 @@ sa_update(sa_handle_t *hdl, sa_attr_type_t type,
return (error);
}
-int
-sa_update_from_cb(sa_handle_t *hdl, sa_attr_type_t attr,
- uint32_t buflen, sa_data_locator_t *locator, void *userdata, dmu_tx_t *tx)
-{
- int error;
- sa_bulk_attr_t bulk;
-
- VERIFY3U(buflen, <=, SA_ATTR_MAX_LEN);
-
- bulk.sa_attr = attr;
- bulk.sa_data = userdata;
- bulk.sa_data_func = locator;
- bulk.sa_length = buflen;
-
- mutex_enter(&hdl->sa_lock);
- error = sa_bulk_update_impl(hdl, &bulk, 1, tx);
- mutex_exit(&hdl->sa_lock);
- return (error);
-}
-
/*
* Return size of an attribute
*/
@@ -2044,7 +2024,6 @@ EXPORT_SYMBOL(sa_bulk_lookup);
EXPORT_SYMBOL(sa_bulk_lookup_locked);
EXPORT_SYMBOL(sa_bulk_update);
EXPORT_SYMBOL(sa_size);
-EXPORT_SYMBOL(sa_update_from_cb);
EXPORT_SYMBOL(sa_object_info);
EXPORT_SYMBOL(sa_object_size);
EXPORT_SYMBOL(sa_get_userdata);
diff --git a/module/zfs/sha256.c b/module/zfs/sha256.c
index c8a4882f8..23a97aa3d 100644
--- a/module/zfs/sha256.c
+++ b/module/zfs/sha256.c
@@ -24,30 +24,39 @@
*/
/*
* Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/zio.h>
-#include <sys/zio_checksum.h>
#include <sys/sha2.h>
+#include <sys/abd.h>
+
+static int
+sha_incremental(void *buf, size_t size, void *arg)
+{
+ SHA2_CTX *ctx = arg;
+ SHA2Update(ctx, buf, size);
+ return (0);
+}
/*ARGSUSED*/
void
-zio_checksum_SHA256(const void *buf, uint64_t size,
+abd_checksum_SHA256(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
SHA2_CTX ctx;
zio_cksum_t tmp;
SHA2Init(SHA256, &ctx);
- SHA2Update(&ctx, buf, size);
+ (void) abd_iterate_func(abd, 0, size, sha_incremental, &ctx);
SHA2Final(&tmp, &ctx);
/*
* A prior implementation of this function had a
* private SHA256 implementation always wrote things out in
* Big Endian and there wasn't a byteswap variant of it.
- * To preseve on disk compatibility we need to force that
- * behaviour.
+ * To preserve on disk compatibility we need to force that
+ * behavior.
*/
zcp->zc_word[0] = BE_64(tmp.zc_word[0]);
zcp->zc_word[1] = BE_64(tmp.zc_word[1]);
@@ -57,24 +66,24 @@ zio_checksum_SHA256(const void *buf, uint64_t size,
/*ARGSUSED*/
void
-zio_checksum_SHA512_native(const void *buf, uint64_t size,
+abd_checksum_SHA512_native(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
SHA2_CTX ctx;
SHA2Init(SHA512_256, &ctx);
- SHA2Update(&ctx, buf, size);
+ (void) abd_iterate_func(abd, 0, size, sha_incremental, &ctx);
SHA2Final(zcp, &ctx);
}
/*ARGSUSED*/
void
-zio_checksum_SHA512_byteswap(const void *buf, uint64_t size,
+abd_checksum_SHA512_byteswap(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
zio_cksum_t tmp;
- zio_checksum_SHA512_native(buf, size, ctx_template, &tmp);
+ abd_checksum_SHA512_native(abd, size, ctx_template, &tmp);
zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]);
zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]);
zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]);
diff --git a/module/zfs/skein_zfs.c b/module/zfs/skein_zfs.c
index 659234039..8deb84b26 100644
--- a/module/zfs/skein_zfs.c
+++ b/module/zfs/skein_zfs.c
@@ -20,42 +20,52 @@
*/
/*
* Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/zio.h>
#include <sys/skein.h>
+#include <sys/abd.h>
+
+static int
+skein_incremental(void *buf, size_t size, void *arg)
+{
+ Skein_512_Ctxt_t *ctx = arg;
+ (void) Skein_512_Update(ctx, buf, size);
+ return (0);
+}
/*
* Computes a native 256-bit skein MAC checksum. Please note that this
* function requires the presence of a ctx_template that should be allocated
- * using zio_checksum_skein_tmpl_init.
+ * using abd_checksum_skein_tmpl_init.
*/
/*ARGSUSED*/
void
-zio_checksum_skein_native(const void *buf, uint64_t size,
+abd_checksum_skein_native(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
Skein_512_Ctxt_t ctx;
ASSERT(ctx_template != NULL);
bcopy(ctx_template, &ctx, sizeof (ctx));
- (void) Skein_512_Update(&ctx, buf, size);
+ (void) abd_iterate_func(abd, 0, size, skein_incremental, &ctx);
(void) Skein_512_Final(&ctx, (uint8_t *)zcp);
bzero(&ctx, sizeof (ctx));
}
/*
- * Byteswapped version of zio_checksum_skein_native. This just invokes
+ * Byteswapped version of abd_checksum_skein_native. This just invokes
* the native checksum function and byteswaps the resulting checksum (since
* skein is internally endian-insensitive).
*/
void
-zio_checksum_skein_byteswap(const void *buf, uint64_t size,
+abd_checksum_skein_byteswap(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
zio_cksum_t tmp;
- zio_checksum_skein_native(buf, size, ctx_template, &tmp);
+ abd_checksum_skein_native(abd, size, ctx_template, &tmp);
zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]);
zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]);
zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]);
@@ -67,7 +77,7 @@ zio_checksum_skein_byteswap(const void *buf, uint64_t size,
* computations and returns a pointer to it.
*/
void *
-zio_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt)
+abd_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt)
{
Skein_512_Ctxt_t *ctx;
@@ -82,7 +92,7 @@ zio_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt)
* zio_checksum_skein_tmpl_init.
*/
void
-zio_checksum_skein_tmpl_free(void *ctx_template)
+abd_checksum_skein_tmpl_free(void *ctx_template)
{
Skein_512_Ctxt_t *ctx = ctx_template;
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 9f652dc32..5203ea826 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -1963,6 +1963,7 @@ spa_load_verify_done(zio_t *zio)
int error = zio->io_error;
spa_t *spa = zio->io_spa;
+ abd_free(zio->io_abd);
if (error) {
if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
type != DMU_OT_INTENT_LOG)
@@ -1970,7 +1971,6 @@ spa_load_verify_done(zio_t *zio)
else
atomic_inc_64(&sle->sle_data_count);
}
- zio_data_buf_free(zio->io_data, zio->io_size);
mutex_enter(&spa->spa_scrub_lock);
spa->spa_scrub_inflight--;
@@ -1993,7 +1993,6 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
{
zio_t *rio;
size_t size;
- void *data;
if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
return (0);
@@ -2004,12 +2003,11 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
*/
if (!spa_load_verify_metadata)
return (0);
- if (BP_GET_BUFC_TYPE(bp) == ARC_BUFC_DATA && !spa_load_verify_data)
+ if (!BP_IS_METADATA(bp) && !spa_load_verify_data)
return (0);
rio = arg;
size = BP_GET_PSIZE(bp);
- data = zio_data_buf_alloc(size);
mutex_enter(&spa->spa_scrub_lock);
while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight)
@@ -2017,7 +2015,7 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
spa->spa_scrub_inflight++;
mutex_exit(&spa->spa_scrub_lock);
- zio_nowait(zio_read(rio, spa, bp, data, size,
+ zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size,
spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 42bd25980..8ae5fb559 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -635,6 +635,9 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
spa->spa_min_ashift = INT_MAX;
spa->spa_max_ashift = 0;
+ /* Reset cached value */
+ spa->spa_dedup_dspace = ~0ULL;
+
/*
* As a pool is being created, treat all features as disabled by
* setting SPA_FEATURE_DISABLED for all entries in the feature
diff --git a/module/zfs/spa_stats.c b/module/zfs/spa_stats.c
index c3d1c3b95..20d7dc9d1 100644
--- a/module/zfs/spa_stats.c
+++ b/module/zfs/spa_stats.c
@@ -474,7 +474,7 @@ spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state,
/*
* Set txg IO stats.
*/
-int
+static int
spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread,
uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t ndirty)
{
@@ -503,6 +503,54 @@ spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread,
return (error);
}
+txg_stat_t *
+spa_txg_history_init_io(spa_t *spa, uint64_t txg, dsl_pool_t *dp)
+{
+ txg_stat_t *ts;
+
+ if (zfs_txg_history == 0)
+ return (NULL);
+
+ ts = kmem_alloc(sizeof (txg_stat_t), KM_SLEEP);
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_READER);
+ vdev_get_stats(spa->spa_root_vdev, &ts->vs1);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ ts->txg = txg;
+ ts->ndirty = dp->dp_dirty_pertxg[txg & TXG_MASK];
+
+ spa_txg_history_set(spa, txg, TXG_STATE_WAIT_FOR_SYNC, gethrtime());
+
+ return (ts);
+}
+
+void
+spa_txg_history_fini_io(spa_t *spa, txg_stat_t *ts)
+{
+ if (ts == NULL)
+ return;
+
+ if (zfs_txg_history == 0) {
+ kmem_free(ts, sizeof (txg_stat_t));
+ return;
+ }
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_READER);
+ vdev_get_stats(spa->spa_root_vdev, &ts->vs2);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ spa_txg_history_set(spa, ts->txg, TXG_STATE_SYNCED, gethrtime());
+ spa_txg_history_set_io(spa, ts->txg,
+ ts->vs2.vs_bytes[ZIO_TYPE_READ] - ts->vs1.vs_bytes[ZIO_TYPE_READ],
+ ts->vs2.vs_bytes[ZIO_TYPE_WRITE] - ts->vs1.vs_bytes[ZIO_TYPE_WRITE],
+ ts->vs2.vs_ops[ZIO_TYPE_READ] - ts->vs1.vs_ops[ZIO_TYPE_READ],
+ ts->vs2.vs_ops[ZIO_TYPE_WRITE] - ts->vs1.vs_ops[ZIO_TYPE_WRITE],
+ ts->ndirty);
+
+ kmem_free(ts, sizeof (txg_stat_t));
+}
+
/*
* ==========================================================================
* SPA TX Assign Histogram Routines
diff --git a/module/zfs/space_map.c b/module/zfs/space_map.c
index 126fd6bee..87e90f219 100644
--- a/module/zfs/space_map.c
+++ b/module/zfs/space_map.c
@@ -72,7 +72,7 @@ space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
}
bufsize = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE);
- entry_map = zio_buf_alloc(bufsize);
+ entry_map = vmem_alloc(bufsize, KM_SLEEP);
mutex_exit(sm->sm_lock);
if (end > bufsize) {
@@ -128,7 +128,7 @@ space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
else
range_tree_vacate(rt, NULL, NULL);
- zio_buf_free(entry_map, bufsize);
+ vmem_free(entry_map, bufsize);
return (error);
}
@@ -272,7 +272,7 @@ space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
expected_entries = space_map_entries(sm, rt);
- entry_map = zio_buf_alloc(sm->sm_blksz);
+ entry_map = vmem_alloc(sm->sm_blksz, KM_SLEEP);
entry_map_end = entry_map + (sm->sm_blksz / sizeof (uint64_t));
entry = entry_map;
@@ -335,7 +335,7 @@ space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
VERIFY3U(range_tree_space(rt), ==, rt_space);
VERIFY3U(range_tree_space(rt), ==, total);
- zio_buf_free(entry_map, sm->sm_blksz);
+ vmem_free(entry_map, sm->sm_blksz);
}
static int
diff --git a/module/zfs/txg.c b/module/zfs/txg.c
index 9dda58c28..043547e97 100644
--- a/module/zfs/txg.c
+++ b/module/zfs/txg.c
@@ -481,22 +481,17 @@ txg_sync_thread(dsl_pool_t *dp)
spa_t *spa = dp->dp_spa;
tx_state_t *tx = &dp->dp_tx;
callb_cpr_t cpr;
- vdev_stat_t *vs1, *vs2;
clock_t start, delta;
(void) spl_fstrans_mark();
txg_thread_enter(tx, &cpr);
- vs1 = kmem_alloc(sizeof (vdev_stat_t), KM_SLEEP);
- vs2 = kmem_alloc(sizeof (vdev_stat_t), KM_SLEEP);
-
start = delta = 0;
for (;;) {
- clock_t timer, timeout;
+ clock_t timeout = zfs_txg_timeout * hz;
+ clock_t timer;
uint64_t txg;
- uint64_t ndirty;
-
- timeout = zfs_txg_timeout * hz;
+ txg_stat_t *ts;
/*
* We sync when we're scanning, there's someone waiting
@@ -527,15 +522,8 @@ txg_sync_thread(dsl_pool_t *dp)
txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
}
- if (tx->tx_exiting) {
- kmem_free(vs2, sizeof (vdev_stat_t));
- kmem_free(vs1, sizeof (vdev_stat_t));
+ if (tx->tx_exiting)
txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);
- }
-
- spa_config_enter(spa, SCL_ALL, FTAG, RW_READER);
- vdev_get_stats(spa->spa_root_vdev, vs1);
- spa_config_exit(spa, SCL_ALL, FTAG);
/*
* Consume the quiesced txg which has been handed off to
@@ -546,16 +534,13 @@ txg_sync_thread(dsl_pool_t *dp)
tx->tx_quiesced_txg = 0;
tx->tx_syncing_txg = txg;
DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg);
+ ts = spa_txg_history_init_io(spa, txg, dp);
cv_broadcast(&tx->tx_quiesce_more_cv);
dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
mutex_exit(&tx->tx_sync_lock);
- spa_txg_history_set(spa, txg, TXG_STATE_WAIT_FOR_SYNC,
- gethrtime());
- ndirty = dp->dp_dirty_pertxg[txg & TXG_MASK];
-
start = ddi_get_lbolt();
spa_sync(spa, txg);
delta = ddi_get_lbolt() - start;
@@ -564,23 +549,13 @@ txg_sync_thread(dsl_pool_t *dp)
tx->tx_synced_txg = txg;
tx->tx_syncing_txg = 0;
DTRACE_PROBE2(txg__synced, dsl_pool_t *, dp, uint64_t, txg);
+ spa_txg_history_fini_io(spa, ts);
cv_broadcast(&tx->tx_sync_done_cv);
/*
* Dispatch commit callbacks to worker threads.
*/
txg_dispatch_callbacks(dp, txg);
-
- spa_config_enter(spa, SCL_ALL, FTAG, RW_READER);
- vdev_get_stats(spa->spa_root_vdev, vs2);
- spa_config_exit(spa, SCL_ALL, FTAG);
- spa_txg_history_set_io(spa, txg,
- vs2->vs_bytes[ZIO_TYPE_READ]-vs1->vs_bytes[ZIO_TYPE_READ],
- vs2->vs_bytes[ZIO_TYPE_WRITE]-vs1->vs_bytes[ZIO_TYPE_WRITE],
- vs2->vs_ops[ZIO_TYPE_READ]-vs1->vs_ops[ZIO_TYPE_READ],
- vs2->vs_ops[ZIO_TYPE_WRITE]-vs1->vs_ops[ZIO_TYPE_WRITE],
- ndirty);
- spa_txg_history_set(spa, txg, TXG_STATE_SYNCED, gethrtime());
}
}
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index db44d2ae1..8fc1a8d28 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -43,6 +43,7 @@
#include <sys/arc.h>
#include <sys/zil.h>
#include <sys/dsl_scan.h>
+#include <sys/abd.h>
#include <sys/zvol.h>
#include <sys/zfs_ratelimit.h>
@@ -999,16 +1000,16 @@ vdev_probe_done(zio_t *zio)
vps->vps_readable = 1;
if (zio->io_error == 0 && spa_writeable(spa)) {
zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd,
- zio->io_offset, zio->io_size, zio->io_data,
+ zio->io_offset, zio->io_size, zio->io_abd,
ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
} else {
- zio_buf_free(zio->io_data, zio->io_size);
+ abd_free(zio->io_abd);
}
} else if (zio->io_type == ZIO_TYPE_WRITE) {
if (zio->io_error == 0)
vps->vps_writeable = 1;
- zio_buf_free(zio->io_data, zio->io_size);
+ abd_free(zio->io_abd);
} else if (zio->io_type == ZIO_TYPE_NULL) {
zio_t *pio;
zio_link_t *zl;
@@ -1126,8 +1127,8 @@ vdev_probe(vdev_t *vd, zio_t *zio)
for (l = 1; l < VDEV_LABELS; l++) {
zio_nowait(zio_read_phys(pio, vd,
vdev_label_offset(vd->vdev_psize, l,
- offsetof(vdev_label_t, vl_pad2)),
- VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE),
+ offsetof(vdev_label_t, vl_pad2)), VDEV_PAD_SIZE,
+ abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE),
ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
}
diff --git a/module/zfs/vdev_cache.c b/module/zfs/vdev_cache.c
index 321ea4a2f..ec701097b 100644
--- a/module/zfs/vdev_cache.c
+++ b/module/zfs/vdev_cache.c
@@ -23,7 +23,7 @@
* Use is subject to license terms.
*/
/*
- * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -31,6 +31,7 @@
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/kstat.h>
+#include <sys/abd.h>
/*
* Virtual device read-ahead caching.
@@ -136,12 +137,12 @@ static void
vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve)
{
ASSERT(MUTEX_HELD(&vc->vc_lock));
- ASSERT(ve->ve_fill_io == NULL);
- ASSERT(ve->ve_data != NULL);
+ ASSERT3P(ve->ve_fill_io, ==, NULL);
+ ASSERT3P(ve->ve_abd, !=, NULL);
avl_remove(&vc->vc_lastused_tree, ve);
avl_remove(&vc->vc_offset_tree, ve);
- zio_buf_free(ve->ve_data, VCBS);
+ abd_free(ve->ve_abd);
kmem_free(ve, sizeof (vdev_cache_entry_t));
}
@@ -171,14 +172,14 @@ vdev_cache_allocate(zio_t *zio)
ve = avl_first(&vc->vc_lastused_tree);
if (ve->ve_fill_io != NULL)
return (NULL);
- ASSERT(ve->ve_hits != 0);
+ ASSERT3U(ve->ve_hits, !=, 0);
vdev_cache_evict(vc, ve);
}
ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP);
ve->ve_offset = offset;
ve->ve_lastused = ddi_get_lbolt();
- ve->ve_data = zio_buf_alloc(VCBS);
+ ve->ve_abd = abd_alloc_for_io(VCBS, B_TRUE);
avl_add(&vc->vc_offset_tree, ve);
avl_add(&vc->vc_lastused_tree, ve);
@@ -192,7 +193,7 @@ vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio)
uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
ASSERT(MUTEX_HELD(&vc->vc_lock));
- ASSERT(ve->ve_fill_io == NULL);
+ ASSERT3P(ve->ve_fill_io, ==, NULL);
if (ve->ve_lastused != ddi_get_lbolt()) {
avl_remove(&vc->vc_lastused_tree, ve);
@@ -201,7 +202,7 @@ vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio)
}
ve->ve_hits++;
- bcopy(ve->ve_data + cache_phase, zio->io_data, zio->io_size);
+ abd_copy_off(zio->io_abd, ve->ve_abd, 0, cache_phase, zio->io_size);
}
/*
@@ -216,16 +217,16 @@ vdev_cache_fill(zio_t *fio)
zio_t *pio;
zio_link_t *zl;
- ASSERT(fio->io_size == VCBS);
+ ASSERT3U(fio->io_size, ==, VCBS);
/*
* Add data to the cache.
*/
mutex_enter(&vc->vc_lock);
- ASSERT(ve->ve_fill_io == fio);
- ASSERT(ve->ve_offset == fio->io_offset);
- ASSERT(ve->ve_data == fio->io_data);
+ ASSERT3P(ve->ve_fill_io, ==, fio);
+ ASSERT3U(ve->ve_offset, ==, fio->io_offset);
+ ASSERT3P(ve->ve_abd, ==, fio->io_abd);
ve->ve_fill_io = NULL;
@@ -256,7 +257,7 @@ vdev_cache_read(zio_t *zio)
zio_t *fio;
ASSERTV(uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS));
- ASSERT(zio->io_type == ZIO_TYPE_READ);
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
return (B_FALSE);
@@ -270,7 +271,7 @@ vdev_cache_read(zio_t *zio)
if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS))
return (B_FALSE);
- ASSERT(cache_phase + zio->io_size <= VCBS);
+ ASSERT3U(cache_phase + zio->io_size, <=, VCBS);
mutex_enter(&vc->vc_lock);
@@ -309,7 +310,7 @@ vdev_cache_read(zio_t *zio)
}
fio = zio_vdev_delegated_io(zio->io_vd, cache_offset,
- ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW,
+ ve->ve_abd, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW,
ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve);
ve->ve_fill_io = fio;
@@ -337,7 +338,7 @@ vdev_cache_write(zio_t *zio)
uint64_t max_offset = P2ROUNDUP(io_end, VCBS);
avl_index_t where;
- ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
mutex_enter(&vc->vc_lock);
@@ -354,8 +355,8 @@ vdev_cache_write(zio_t *zio)
if (ve->ve_fill_io != NULL) {
ve->ve_missed_update = 1;
} else {
- bcopy((char *)zio->io_data + start - io_start,
- ve->ve_data + start - ve->ve_offset, end - start);
+ abd_copy_off(ve->ve_abd, zio->io_abd, start - io_start,
+ start - ve->ve_offset, end - start);
}
ve = AVL_NEXT(&vc->vc_offset_tree, ve);
}
diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c
index ce65760ee..ae6ed4de9 100644
--- a/module/zfs/vdev_disk.c
+++ b/module/zfs/vdev_disk.c
@@ -30,6 +30,7 @@
#include <sys/spa.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
+#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <sys/sunldi.h>
@@ -411,6 +412,7 @@ vdev_disk_dio_put(dio_request_t *dr)
ASSERT3S(zio->io_error, >=, 0);
if (zio->io_error)
vdev_disk_error(zio);
+
zio_delay_interrupt(zio);
}
}
@@ -434,17 +436,10 @@ BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
#endif
}
- /* Drop reference aquired by __vdev_disk_physio */
+ /* Drop reference acquired by __vdev_disk_physio */
rc = vdev_disk_dio_put(dr);
}
-static inline unsigned long
-bio_nr_pages(void *bio_ptr, unsigned int bio_size)
-{
- return ((((unsigned long)bio_ptr + bio_size + PAGE_SIZE - 1) >>
- PAGE_SHIFT) - ((unsigned long)bio_ptr >> PAGE_SHIFT));
-}
-
static unsigned int
bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size)
{
@@ -484,6 +479,15 @@ bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size)
return (bio_size);
}
+static unsigned int
+bio_map_abd_off(struct bio *bio, abd_t *abd, unsigned int size, size_t off)
+{
+ if (abd_is_linear(abd))
+ return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, size));
+
+ return (abd_scatter_bio_map_off(bio, abd, size, off));
+}
+
#ifndef bio_set_op_attrs
#define bio_set_op_attrs(bio, rw, flags) \
do { (bio)->bi_rw |= (rw)|(flags); } while (0)
@@ -516,11 +520,11 @@ vdev_submit_bio(struct bio *bio)
}
static int
-__vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr,
- size_t kbuf_size, uint64_t kbuf_offset, int rw, int flags)
+__vdev_disk_physio(struct block_device *bdev, zio_t *zio,
+ size_t io_size, uint64_t io_offset, int rw, int flags)
{
dio_request_t *dr;
- caddr_t bio_ptr;
+ uint64_t abd_offset;
uint64_t bio_offset;
int bio_size, bio_count = 16;
int i = 0, error = 0;
@@ -528,7 +532,8 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr,
struct blk_plug plug;
#endif
- ASSERT3U(kbuf_offset + kbuf_size, <=, bdev->bd_inode->i_size);
+ ASSERT(zio != NULL);
+ ASSERT3U(io_offset + io_size, <=, bdev->bd_inode->i_size);
retry:
dr = vdev_disk_dio_alloc(bio_count);
@@ -547,9 +552,10 @@ retry:
* their volume block size to match the maximum request size and
* the common case will be one bio per vdev IO request.
*/
- bio_ptr = kbuf_ptr;
- bio_offset = kbuf_offset;
- bio_size = kbuf_size;
+
+ abd_offset = 0;
+ bio_offset = io_offset;
+ bio_size = io_size;
for (i = 0; i <= dr->dr_bio_count; i++) {
/* Finished constructing bio's for given buffer */
@@ -569,7 +575,8 @@ retry:
/* bio_alloc() with __GFP_WAIT never returns NULL */
dr->dr_bio[i] = bio_alloc(GFP_NOIO,
- MIN(bio_nr_pages(bio_ptr, bio_size), BIO_MAX_PAGES));
+ MIN(abd_nr_pages_off(zio->io_abd, bio_size, abd_offset),
+ BIO_MAX_PAGES));
if (unlikely(dr->dr_bio[i] == NULL)) {
vdev_disk_dio_free(dr);
return (ENOMEM);
@@ -585,10 +592,11 @@ retry:
bio_set_op_attrs(dr->dr_bio[i], rw, flags);
/* Remaining size is returned to become the new size */
- bio_size = bio_map(dr->dr_bio[i], bio_ptr, bio_size);
+ bio_size = bio_map_abd_off(dr->dr_bio[i], zio->io_abd,
+ bio_size, abd_offset);
/* Advance in buffer and construct another bio if needed */
- bio_ptr += BIO_BI_SIZE(dr->dr_bio[i]);
+ abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
}
@@ -730,7 +738,7 @@ vdev_disk_io_start(zio_t *zio)
}
zio->io_target_timestamp = zio_handle_io_delay(zio);
- error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_data,
+ error = __vdev_disk_physio(vd->vd_bdev, zio,
zio->io_size, zio->io_offset, rw, flags);
if (error) {
zio->io_error = error;
diff --git a/module/zfs/vdev_file.c b/module/zfs/vdev_file.c
index a0a23598b..c78f2f421 100644
--- a/module/zfs/vdev_file.c
+++ b/module/zfs/vdev_file.c
@@ -31,6 +31,7 @@
#include <sys/zio.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>
+#include <sys/abd.h>
/*
* Virtual device vector for files.
@@ -150,11 +151,21 @@ vdev_file_io_strategy(void *arg)
vdev_t *vd = zio->io_vd;
vdev_file_t *vf = vd->vdev_tsd;
ssize_t resid;
+ void *buf;
+
+ if (zio->io_type == ZIO_TYPE_READ)
+ buf = abd_borrow_buf(zio->io_abd, zio->io_size);
+ else
+ buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ?
- UIO_READ : UIO_WRITE, vf->vf_vnode, zio->io_data,
- zio->io_size, zio->io_offset, UIO_SYSSPACE,
- 0, RLIM64_INFINITY, kcred, &resid);
+ UIO_READ : UIO_WRITE, vf->vf_vnode, buf, zio->io_size,
+ zio->io_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
+
+ if (zio->io_type == ZIO_TYPE_READ)
+ abd_return_buf_copy(zio->io_abd, buf, zio->io_size);
+ else
+ abd_return_buf(zio->io_abd, buf, zio->io_size);
if (resid != 0 && zio->io_error == 0)
zio->io_error = SET_ERROR(ENOSPC);
diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c
index 4edbfa41e..7a3a0e8a0 100644
--- a/module/zfs/vdev_label.c
+++ b/module/zfs/vdev_label.c
@@ -145,6 +145,7 @@
#include <sys/metaslab.h>
#include <sys/zio.h>
#include <sys/dsl_scan.h>
+#include <sys/abd.h>
#include <sys/fs/zfs.h>
/*
@@ -178,7 +179,7 @@ vdev_label_number(uint64_t psize, uint64_t offset)
}
static void
-vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
+vdev_label_read(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset,
uint64_t size, zio_done_func_t *done, void *private, int flags)
{
ASSERT(spa_config_held(zio->io_spa, SCL_STATE_ALL, RW_WRITER) ==
@@ -192,7 +193,7 @@ vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
}
static void
-vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
+vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset,
uint64_t size, zio_done_func_t *done, void *private, int flags)
{
ASSERT(spa_config_held(zio->io_spa, SCL_ALL, RW_WRITER) == SCL_ALL ||
@@ -587,6 +588,7 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg)
spa_t *spa = vd->vdev_spa;
nvlist_t *config = NULL;
vdev_phys_t *vp;
+ abd_t *vp_abd;
zio_t *zio;
uint64_t best_txg = 0;
int error = 0;
@@ -599,7 +601,8 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg)
if (!vdev_readable(vd))
return (NULL);
- vp = zio_buf_alloc(sizeof (vdev_phys_t));
+ vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
+ vp = abd_to_buf(vp_abd);
retry:
for (l = 0; l < VDEV_LABELS; l++) {
@@ -607,7 +610,7 @@ retry:
zio = zio_root(spa, NULL, NULL, flags);
- vdev_label_read(zio, vd, l, vp,
+ vdev_label_read(zio, vd, l, vp_abd,
offsetof(vdev_label_t, vl_vdev_phys),
sizeof (vdev_phys_t), NULL, NULL, flags);
@@ -646,7 +649,7 @@ retry:
goto retry;
}
- zio_buf_free(vp, sizeof (vdev_phys_t));
+ abd_free(vp_abd);
return (config);
}
@@ -782,8 +785,10 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
spa_t *spa = vd->vdev_spa;
nvlist_t *label;
vdev_phys_t *vp;
- char *pad2;
+ abd_t *vp_abd;
+ abd_t *pad2;
uberblock_t *ub;
+ abd_t *ub_abd;
zio_t *zio;
char *buf;
size_t buflen;
@@ -867,8 +872,9 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
/*
* Initialize its label.
*/
- vp = zio_buf_alloc(sizeof (vdev_phys_t));
- bzero(vp, sizeof (vdev_phys_t));
+ vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
+ abd_zero(vp_abd, sizeof (vdev_phys_t));
+ vp = abd_to_buf(vp_abd);
/*
* Generate a label describing the pool and our top-level vdev.
@@ -928,7 +934,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP);
if (error != 0) {
nvlist_free(label);
- zio_buf_free(vp, sizeof (vdev_phys_t));
+ abd_free(vp_abd);
/* EFAULT means nvlist_pack ran out of room */
return (error == EFAULT ? ENAMETOOLONG : EINVAL);
}
@@ -936,14 +942,15 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
/*
* Initialize uberblock template.
*/
- ub = zio_buf_alloc(VDEV_UBERBLOCK_RING);
- bzero(ub, VDEV_UBERBLOCK_RING);
- *ub = spa->spa_uberblock;
+ ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_RING, B_TRUE);
+ abd_zero(ub_abd, VDEV_UBERBLOCK_RING);
+ abd_copy_from_buf(ub_abd, &spa->spa_uberblock, sizeof (uberblock_t));
+ ub = abd_to_buf(ub_abd);
ub->ub_txg = 0;
/* Initialize the 2nd padding area. */
- pad2 = zio_buf_alloc(VDEV_PAD_SIZE);
- bzero(pad2, VDEV_PAD_SIZE);
+ pad2 = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE);
+ abd_zero(pad2, VDEV_PAD_SIZE);
/*
* Write everything in parallel.
@@ -953,7 +960,7 @@ retry:
for (l = 0; l < VDEV_LABELS; l++) {
- vdev_label_write(zio, vd, l, vp,
+ vdev_label_write(zio, vd, l, vp_abd,
offsetof(vdev_label_t, vl_vdev_phys),
sizeof (vdev_phys_t), NULL, NULL, flags);
@@ -966,7 +973,7 @@ retry:
offsetof(vdev_label_t, vl_pad2),
VDEV_PAD_SIZE, NULL, NULL, flags);
- vdev_label_write(zio, vd, l, ub,
+ vdev_label_write(zio, vd, l, ub_abd,
offsetof(vdev_label_t, vl_uberblock),
VDEV_UBERBLOCK_RING, NULL, NULL, flags);
}
@@ -979,9 +986,9 @@ retry:
}
nvlist_free(label);
- zio_buf_free(pad2, VDEV_PAD_SIZE);
- zio_buf_free(ub, VDEV_UBERBLOCK_RING);
- zio_buf_free(vp, sizeof (vdev_phys_t));
+ abd_free(pad2);
+ abd_free(ub_abd);
+ abd_free(vp_abd);
/*
* If this vdev hasn't been previously identified as a spare, then we
@@ -1039,7 +1046,7 @@ vdev_uberblock_load_done(zio_t *zio)
vdev_t *vd = zio->io_vd;
spa_t *spa = zio->io_spa;
zio_t *rio = zio->io_private;
- uberblock_t *ub = zio->io_data;
+ uberblock_t *ub = abd_to_buf(zio->io_abd);
struct ubl_cbdata *cbp = rio->io_private;
ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd));
@@ -1060,7 +1067,7 @@ vdev_uberblock_load_done(zio_t *zio)
mutex_exit(&rio->io_lock);
}
- zio_buf_free(zio->io_data, zio->io_size);
+ abd_free(zio->io_abd);
}
static void
@@ -1076,8 +1083,8 @@ vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags,
for (l = 0; l < VDEV_LABELS; l++) {
for (n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
vdev_label_read(zio, vd, l,
- zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)),
- VDEV_UBERBLOCK_OFFSET(vd, n),
+ abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd),
+ B_TRUE), VDEV_UBERBLOCK_OFFSET(vd, n),
VDEV_UBERBLOCK_SIZE(vd),
vdev_uberblock_load_done, zio, flags);
}
@@ -1144,7 +1151,7 @@ vdev_uberblock_sync_done(zio_t *zio)
static void
vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags)
{
- uberblock_t *ubbuf;
+ abd_t *ub_abd;
int c, l, n;
for (c = 0; c < vd->vdev_children; c++)
@@ -1158,17 +1165,18 @@ vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags)
n = ub->ub_txg & (VDEV_UBERBLOCK_COUNT(vd) - 1);
- ubbuf = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd));
- bzero(ubbuf, VDEV_UBERBLOCK_SIZE(vd));
- *ubbuf = *ub;
+ /* Copy the uberblock_t into the ABD */
+ ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
+ abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd));
+ abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t));
for (l = 0; l < VDEV_LABELS; l++)
- vdev_label_write(zio, vd, l, ubbuf,
+ vdev_label_write(zio, vd, l, ub_abd,
VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd),
vdev_uberblock_sync_done, zio->io_private,
flags | ZIO_FLAG_DONT_PROPAGATE);
- zio_buf_free(ubbuf, VDEV_UBERBLOCK_SIZE(vd));
+ abd_free(ub_abd);
}
/* Sync the uberblocks to all vdevs in svd[] */
@@ -1245,6 +1253,7 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags)
{
nvlist_t *label;
vdev_phys_t *vp;
+ abd_t *vp_abd;
char *buf;
size_t buflen;
int c;
@@ -1263,15 +1272,16 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags)
*/
label = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE);
- vp = zio_buf_alloc(sizeof (vdev_phys_t));
- bzero(vp, sizeof (vdev_phys_t));
+ vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
+ abd_zero(vp_abd, sizeof (vdev_phys_t));
+ vp = abd_to_buf(vp_abd);
buf = vp->vp_nvlist;
buflen = sizeof (vp->vp_nvlist);
if (!nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP)) {
for (; l < VDEV_LABELS; l += 2) {
- vdev_label_write(zio, vd, l, vp,
+ vdev_label_write(zio, vd, l, vp_abd,
offsetof(vdev_label_t, vl_vdev_phys),
sizeof (vdev_phys_t),
vdev_label_sync_done, zio->io_private,
@@ -1279,7 +1289,7 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags)
}
}
- zio_buf_free(vp, sizeof (vdev_phys_t));
+ abd_free(vp_abd);
nvlist_free(label);
}
diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c
index 780311195..2b9081168 100644
--- a/module/zfs/vdev_mirror.c
+++ b/module/zfs/vdev_mirror.c
@@ -31,6 +31,7 @@
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
+#include <sys/abd.h>
#include <sys/fs/zfs.h>
/*
@@ -272,13 +273,13 @@ vdev_mirror_scrub_done(zio_t *zio)
while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
mutex_enter(&pio->io_lock);
ASSERT3U(zio->io_size, >=, pio->io_size);
- bcopy(zio->io_data, pio->io_data, pio->io_size);
+ abd_copy(pio->io_abd, zio->io_abd, pio->io_size);
mutex_exit(&pio->io_lock);
}
mutex_exit(&zio->io_lock);
}
- zio_buf_free(zio->io_data, zio->io_size);
+ abd_free(zio->io_abd);
mc->mc_error = zio->io_error;
mc->mc_tried = 1;
@@ -433,7 +434,8 @@ vdev_mirror_io_start(zio_t *zio)
mc = &mm->mm_child[c];
zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
mc->mc_vd, mc->mc_offset,
- zio_buf_alloc(zio->io_size), zio->io_size,
+ abd_alloc_sametype(zio->io_abd,
+ zio->io_size), zio->io_size,
zio->io_type, zio->io_priority, 0,
vdev_mirror_scrub_done, mc));
}
@@ -458,7 +460,7 @@ vdev_mirror_io_start(zio_t *zio)
while (children--) {
mc = &mm->mm_child[c];
zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
- mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size,
+ mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
zio->io_type, zio->io_priority, 0,
vdev_mirror_child_done, mc));
c++;
@@ -543,7 +545,7 @@ vdev_mirror_io_done(zio_t *zio)
mc = &mm->mm_child[c];
zio_vdev_io_redone(zio);
zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
- mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size,
+ mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
ZIO_TYPE_READ, zio->io_priority, 0,
vdev_mirror_child_done, mc));
return;
@@ -584,7 +586,7 @@ vdev_mirror_io_done(zio_t *zio)
zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
mc->mc_vd, mc->mc_offset,
- zio->io_data, zio->io_size,
+ zio->io_abd, zio->io_size,
ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c
index 8f394eef5..91ef106b4 100644
--- a/module/zfs/vdev_queue.c
+++ b/module/zfs/vdev_queue.c
@@ -37,6 +37,7 @@
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/kstat.h>
+#include <sys/abd.h>
/*
* ZFS I/O Scheduler
@@ -496,12 +497,12 @@ vdev_queue_agg_io_done(zio_t *aio)
zio_t *pio;
zio_link_t *zl = NULL;
while ((pio = zio_walk_parents(aio, &zl)) != NULL) {
- bcopy((char *)aio->io_data + (pio->io_offset -
- aio->io_offset), pio->io_data, pio->io_size);
+ abd_copy_off(pio->io_abd, aio->io_abd,
+ 0, pio->io_offset - aio->io_offset, pio->io_size);
}
}
- zio_buf_free(aio->io_data, aio->io_size);
+ abd_free(aio->io_abd);
}
/*
@@ -523,7 +524,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
boolean_t stretch = B_FALSE;
avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type);
enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
- void *buf;
+ abd_t *abd;
limit = MAX(MIN(zfs_vdev_aggregation_limit,
spa_maxblocksize(vq->vq_vdev->vdev_spa)), 0);
@@ -626,12 +627,12 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
size = IO_SPAN(first, last);
ASSERT3U(size, <=, limit);
- buf = zio_buf_alloc_flags(size, KM_NOSLEEP);
- if (buf == NULL)
+ abd = abd_alloc_for_io(size, B_TRUE);
+ if (abd == NULL)
return (NULL);
aio = zio_vdev_delegated_io(first->io_vd, first->io_offset,
- buf, size, first->io_type, zio->io_priority,
+ abd, size, first->io_type, zio->io_priority,
flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
vdev_queue_agg_io_done, NULL);
aio->io_timestamp = first->io_timestamp;
@@ -644,12 +645,11 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
if (dio->io_flags & ZIO_FLAG_NODATA) {
ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE);
- bzero((char *)aio->io_data + (dio->io_offset -
- aio->io_offset), dio->io_size);
+ abd_zero_off(aio->io_abd,
+ dio->io_offset - aio->io_offset, dio->io_size);
} else if (dio->io_type == ZIO_TYPE_WRITE) {
- bcopy(dio->io_data, (char *)aio->io_data +
- (dio->io_offset - aio->io_offset),
- dio->io_size);
+ abd_copy_off(aio->io_abd, dio->io_abd,
+ dio->io_offset - aio->io_offset, 0, dio->io_size);
}
zio_add_child(dio, aio);
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index d1b415367..a92d3cbaa 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -30,6 +30,7 @@
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
+#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>
#include <sys/vdev_raidz.h>
@@ -136,7 +137,7 @@ vdev_raidz_map_free(raidz_map_t *rm)
size_t size;
for (c = 0; c < rm->rm_firstdatacol; c++) {
- zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
+ abd_free(rm->rm_col[c].rc_abd);
if (rm->rm_col[c].rc_gdata != NULL)
zio_buf_free(rm->rm_col[c].rc_gdata,
@@ -144,11 +145,13 @@ vdev_raidz_map_free(raidz_map_t *rm)
}
size = 0;
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ abd_put(rm->rm_col[c].rc_abd);
size += rm->rm_col[c].rc_size;
+ }
- if (rm->rm_datacopy != NULL)
- zio_buf_free(rm->rm_datacopy, size);
+ if (rm->rm_abd_copy != NULL)
+ abd_free(rm->rm_abd_copy);
kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}
@@ -185,7 +188,7 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
size_t x;
const char *good = NULL;
- const char *bad = rm->rm_col[c].rc_data;
+ char *bad;
if (good_data == NULL) {
zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
@@ -199,8 +202,9 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
* data never changes for a given logical ZIO)
*/
if (rm->rm_col[0].rc_gdata == NULL) {
- char *bad_parity[VDEV_RAIDZ_MAXPARITY];
+ abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];
char *buf;
+ int offset;
/*
* Set up the rm_col[]s to generate the parity for
@@ -208,15 +212,20 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
* replacing them with buffers to hold the result.
*/
for (x = 0; x < rm->rm_firstdatacol; x++) {
- bad_parity[x] = rm->rm_col[x].rc_data;
- rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
+ bad_parity[x] = rm->rm_col[x].rc_abd;
+ rm->rm_col[x].rc_gdata =
zio_buf_alloc(rm->rm_col[x].rc_size);
+ rm->rm_col[x].rc_abd =
+ abd_get_from_buf(rm->rm_col[x].rc_gdata,
+ rm->rm_col[x].rc_size);
}
/* fill in the data columns from good_data */
buf = (char *)good_data;
for (; x < rm->rm_cols; x++) {
- rm->rm_col[x].rc_data = buf;
+ abd_put(rm->rm_col[x].rc_abd);
+ rm->rm_col[x].rc_abd = abd_get_from_buf(buf,
+ rm->rm_col[x].rc_size);
buf += rm->rm_col[x].rc_size;
}
@@ -226,13 +235,18 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
vdev_raidz_generate_parity(rm);
/* restore everything back to its original state */
- for (x = 0; x < rm->rm_firstdatacol; x++)
- rm->rm_col[x].rc_data = bad_parity[x];
+ for (x = 0; x < rm->rm_firstdatacol; x++) {
+ abd_put(rm->rm_col[x].rc_abd);
+ rm->rm_col[x].rc_abd = bad_parity[x];
+ }
- buf = rm->rm_datacopy;
+ offset = 0;
for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
- rm->rm_col[x].rc_data = buf;
- buf += rm->rm_col[x].rc_size;
+ abd_put(rm->rm_col[x].rc_abd);
+ rm->rm_col[x].rc_abd = abd_get_offset_size(
+ rm->rm_abd_copy, offset,
+ rm->rm_col[x].rc_size);
+ offset += rm->rm_col[x].rc_size;
}
}
@@ -246,8 +260,10 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
good += rm->rm_col[x].rc_size;
}
+ bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size);
/* we drop the ereport if it ends up that the data was good */
zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
+ abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size);
}
/*
@@ -260,7 +276,7 @@ static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
size_t c = (size_t)(uintptr_t)arg;
- caddr_t buf;
+ size_t offset;
raidz_map_t *rm = zio->io_vsd;
size_t size;
@@ -274,7 +290,7 @@ vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
rm->rm_reports++;
ASSERT3U(rm->rm_reports, >, 0);
- if (rm->rm_datacopy != NULL)
+ if (rm->rm_abd_copy != NULL)
return;
/*
@@ -290,17 +306,21 @@ vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
size += rm->rm_col[c].rc_size;
- buf = rm->rm_datacopy = zio_buf_alloc(size);
+ rm->rm_abd_copy =
+ abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size);
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
raidz_col_t *col = &rm->rm_col[c];
+ abd_t *tmp = abd_get_offset_size(rm->rm_abd_copy, offset,
+ col->rc_size);
- bcopy(col->rc_data, buf, col->rc_size);
- col->rc_data = buf;
+ abd_copy(tmp, col->rc_abd, col->rc_size);
+ abd_put(col->rc_abd);
+ col->rc_abd = tmp;
- buf += col->rc_size;
+ offset += col->rc_size;
}
- ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
+ ASSERT3U(offset, ==, size);
}
static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
@@ -329,6 +349,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
/* The starting byte offset on each child vdev. */
uint64_t o = (b / dcols) << unit_shift;
uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
+ uint64_t off = 0;
/*
* "Quotient": The number of data sectors for this stripe on all but
@@ -373,7 +394,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
rm->rm_missingdata = 0;
rm->rm_missingparity = 0;
rm->rm_firstdatacol = nparity;
- rm->rm_datacopy = NULL;
+ rm->rm_abd_copy = NULL;
rm->rm_reports = 0;
rm->rm_freed = 0;
rm->rm_ecksuminjected = 0;
@@ -389,7 +410,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
}
rm->rm_col[c].rc_devidx = col;
rm->rm_col[c].rc_offset = coff;
- rm->rm_col[c].rc_data = NULL;
+ rm->rm_col[c].rc_abd = NULL;
rm->rm_col[c].rc_gdata = NULL;
rm->rm_col[c].rc_error = 0;
rm->rm_col[c].rc_tried = 0;
@@ -412,13 +433,18 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
ASSERT3U(rm->rm_nskip, <=, nparity);
for (c = 0; c < rm->rm_firstdatacol; c++)
- rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
+ rm->rm_col[c].rc_abd =
+ abd_alloc_linear(rm->rm_col[c].rc_size, B_FALSE);
- rm->rm_col[c].rc_data = zio->io_data;
+ rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, 0,
+ rm->rm_col[c].rc_size);
+ off = rm->rm_col[c].rc_size;
- for (c = c + 1; c < acols; c++)
- rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
- rm->rm_col[c - 1].rc_size;
+ for (c = c + 1; c < acols; c++) {
+ rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, off,
+ rm->rm_col[c].rc_size);
+ off += rm->rm_col[c].rc_size;
+ }
/*
* If all data stored spans all columns, there's a danger that parity
@@ -464,29 +490,84 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
return (rm);
}
+struct pqr_struct {
+ uint64_t *p;
+ uint64_t *q;
+ uint64_t *r;
+};
+
+static int
+vdev_raidz_p_func(void *buf, size_t size, void *private)
+{
+ struct pqr_struct *pqr = private;
+ const uint64_t *src = buf;
+ int i, cnt = size / sizeof (src[0]);
+
+ ASSERT(pqr->p && !pqr->q && !pqr->r);
+
+ for (i = 0; i < cnt; i++, src++, pqr->p++)
+ *pqr->p ^= *src;
+
+ return (0);
+}
+
+static int
+vdev_raidz_pq_func(void *buf, size_t size, void *private)
+{
+ struct pqr_struct *pqr = private;
+ const uint64_t *src = buf;
+ uint64_t mask;
+ int i, cnt = size / sizeof (src[0]);
+
+ ASSERT(pqr->p && pqr->q && !pqr->r);
+
+ for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
+ *pqr->p ^= *src;
+ VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
+ *pqr->q ^= *src;
+ }
+
+ return (0);
+}
+
+static int
+vdev_raidz_pqr_func(void *buf, size_t size, void *private)
+{
+ struct pqr_struct *pqr = private;
+ const uint64_t *src = buf;
+ uint64_t mask;
+ int i, cnt = size / sizeof (src[0]);
+
+ ASSERT(pqr->p && pqr->q && pqr->r);
+
+ for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
+ *pqr->p ^= *src;
+ VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
+ *pqr->q ^= *src;
+ VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
+ *pqr->r ^= *src;
+ }
+
+ return (0);
+}
+
static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
- uint64_t *p, *src, pcount, ccount, i;
+ uint64_t *p;
int c;
-
- pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
+ abd_t *src;
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- src = rm->rm_col[c].rc_data;
- p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
- ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
+ src = rm->rm_col[c].rc_abd;
+ p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
if (c == rm->rm_firstdatacol) {
- ASSERT(ccount == pcount);
- for (i = 0; i < ccount; i++, src++, p++) {
- *p = *src;
- }
+ abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
} else {
- ASSERT(ccount <= pcount);
- for (i = 0; i < ccount; i++, src++, p++) {
- *p ^= *src;
- }
+ struct pqr_struct pqr = { p, NULL, NULL };
+ (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
+ vdev_raidz_p_func, &pqr);
}
}
}
@@ -494,50 +575,43 @@ vdev_raidz_generate_parity_p(raidz_map_t *rm)
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
- uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
+ uint64_t *p, *q, pcnt, ccnt, mask, i;
int c;
+ abd_t *src;
- pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
+ pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
rm->rm_col[VDEV_RAIDZ_Q].rc_size);
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- src = rm->rm_col[c].rc_data;
- p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
- q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
+ src = rm->rm_col[c].rc_abd;
+ p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
+ q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
- ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
+ ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
if (c == rm->rm_firstdatacol) {
- ASSERT(ccnt == pcnt || ccnt == 0);
- for (i = 0; i < ccnt; i++, src++, p++, q++) {
- *p = *src;
- *q = *src;
- }
- for (; i < pcnt; i++, src++, p++, q++) {
- *p = 0;
- *q = 0;
- }
+ abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
+ (void) memcpy(q, p, rm->rm_col[c].rc_size);
} else {
- ASSERT(ccnt <= pcnt);
-
- /*
- * Apply the algorithm described above by multiplying
- * the previous result and adding in the new value.
- */
- for (i = 0; i < ccnt; i++, src++, p++, q++) {
- *p ^= *src;
+ struct pqr_struct pqr = { p, q, NULL };
+ (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
+ vdev_raidz_pq_func, &pqr);
+ }
- VDEV_RAIDZ_64MUL_2(*q, mask);
- *q ^= *src;
+ if (c == rm->rm_firstdatacol) {
+ for (i = ccnt; i < pcnt; i++) {
+ p[i] = 0;
+ q[i] = 0;
}
+ } else {
/*
* Treat short columns as though they are full of 0s.
* Note that there's therefore nothing needed for P.
*/
- for (; i < pcnt; i++, q++) {
- VDEV_RAIDZ_64MUL_2(*q, mask);
+ for (i = ccnt; i < pcnt; i++) {
+ VDEV_RAIDZ_64MUL_2(q[i], mask);
}
}
}
@@ -546,59 +620,48 @@ vdev_raidz_generate_parity_pq(raidz_map_t *rm)
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
- uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
+ uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
int c;
+ abd_t *src;
- pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
+ pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
rm->rm_col[VDEV_RAIDZ_Q].rc_size);
ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
rm->rm_col[VDEV_RAIDZ_R].rc_size);
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- src = rm->rm_col[c].rc_data;
- p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
- q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
- r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
+ src = rm->rm_col[c].rc_abd;
+ p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
+ q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
+ r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);
- ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
+ ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
if (c == rm->rm_firstdatacol) {
- ASSERT(ccnt == pcnt || ccnt == 0);
- for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
- *p = *src;
- *q = *src;
- *r = *src;
- }
- for (; i < pcnt; i++, src++, p++, q++, r++) {
- *p = 0;
- *q = 0;
- *r = 0;
- }
+ abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
+ (void) memcpy(q, p, rm->rm_col[c].rc_size);
+ (void) memcpy(r, p, rm->rm_col[c].rc_size);
} else {
- ASSERT(ccnt <= pcnt);
-
- /*
- * Apply the algorithm described above by multiplying
- * the previous result and adding in the new value.
- */
- for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
- *p ^= *src;
-
- VDEV_RAIDZ_64MUL_2(*q, mask);
- *q ^= *src;
+ struct pqr_struct pqr = { p, q, r };
+ (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
+ vdev_raidz_pqr_func, &pqr);
+ }
- VDEV_RAIDZ_64MUL_4(*r, mask);
- *r ^= *src;
+ if (c == rm->rm_firstdatacol) {
+ for (i = ccnt; i < pcnt; i++) {
+ p[i] = 0;
+ q[i] = 0;
+ r[i] = 0;
}
-
+ } else {
/*
* Treat short columns as though they are full of 0s.
* Note that there's therefore nothing needed for P.
*/
- for (; i < pcnt; i++, q++, r++) {
- VDEV_RAIDZ_64MUL_2(*q, mask);
- VDEV_RAIDZ_64MUL_4(*r, mask);
+ for (i = ccnt; i < pcnt; i++) {
+ VDEV_RAIDZ_64MUL_2(q[i], mask);
+ VDEV_RAIDZ_64MUL_4(r[i], mask);
}
}
}
@@ -630,40 +693,159 @@ vdev_raidz_generate_parity(raidz_map_t *rm)
}
}
+/* ARGSUSED */
+static int
+vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
+{
+ uint64_t *dst = dbuf;
+ uint64_t *src = sbuf;
+ int cnt = size / sizeof (src[0]);
+ int i;
+
+ for (i = 0; i < cnt; i++) {
+ dst[i] ^= src[i];
+ }
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
+ void *private)
+{
+ uint64_t *dst = dbuf;
+ uint64_t *src = sbuf;
+ uint64_t mask;
+ int cnt = size / sizeof (dst[0]);
+ int i;
+
+ for (i = 0; i < cnt; i++, dst++, src++) {
+ VDEV_RAIDZ_64MUL_2(*dst, mask);
+ *dst ^= *src;
+ }
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
+{
+ uint64_t *dst = buf;
+ uint64_t mask;
+ int cnt = size / sizeof (dst[0]);
+ int i;
+
+ for (i = 0; i < cnt; i++, dst++) {
+ /* same operation as vdev_raidz_reconst_q_pre_func() on dst */
+ VDEV_RAIDZ_64MUL_2(*dst, mask);
+ }
+
+ return (0);
+}
+
+struct reconst_q_struct {
+ uint64_t *q;
+ int exp;
+};
+
+static int
+vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
+{
+ struct reconst_q_struct *rq = private;
+ uint64_t *dst = buf;
+ int cnt = size / sizeof (dst[0]);
+ int i;
+
+ for (i = 0; i < cnt; i++, dst++, rq->q++) {
+ int j;
+ uint8_t *b;
+
+ *dst ^= *rq->q;
+ for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
+ *b = vdev_raidz_exp2(*b, rq->exp);
+ }
+ }
+
+ return (0);
+}
+
+struct reconst_pq_struct {
+ uint8_t *p;
+ uint8_t *q;
+ uint8_t *pxy;
+ uint8_t *qxy;
+ int aexp;
+ int bexp;
+};
+
+static int
+vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
+{
+ struct reconst_pq_struct *rpq = private;
+ uint8_t *xd = xbuf;
+ uint8_t *yd = ybuf;
+ int i;
+
+ for (i = 0; i < size;
+ i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
+ *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
+ vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
+ *yd = *rpq->p ^ *rpq->pxy ^ *xd;
+ }
+
+ return (0);
+}
+
+static int
+vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
+{
+ struct reconst_pq_struct *rpq = private;
+ uint8_t *xd = xbuf;
+ int i;
+
+ for (i = 0; i < size;
+ i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
+ /* same operation as vdev_raidz_reconst_pq_func() on xd */
+ *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
+ vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
+ }
+
+ return (0);
+}
+
static int
vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
{
- uint64_t *dst, *src, xcount, ccount, count, i;
int x = tgts[0];
int c;
+ abd_t *dst, *src;
ASSERT(ntgts == 1);
ASSERT(x >= rm->rm_firstdatacol);
ASSERT(x < rm->rm_cols);
- xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
- ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
- ASSERT(xcount > 0);
+ ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
+ ASSERT(rm->rm_col[x].rc_size > 0);
- src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
- dst = rm->rm_col[x].rc_data;
- for (i = 0; i < xcount; i++, dst++, src++) {
- *dst = *src;
- }
+ src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
+ dst = rm->rm_col[x].rc_abd;
+
+ abd_copy_from_buf(dst, abd_to_buf(src), rm->rm_col[x].rc_size);
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- src = rm->rm_col[c].rc_data;
- dst = rm->rm_col[x].rc_data;
+ uint64_t size = MIN(rm->rm_col[x].rc_size,
+ rm->rm_col[c].rc_size);
+
+ src = rm->rm_col[c].rc_abd;
+ dst = rm->rm_col[x].rc_abd;
if (c == x)
continue;
- ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
- count = MIN(ccount, xcount);
-
- for (i = 0; i < count; i++, dst++, src++) {
- *dst ^= *src;
- }
+ (void) abd_iterate_func2(dst, src, 0, 0, size,
+ vdev_raidz_reconst_p_func, NULL);
}
return (1 << VDEV_RAIDZ_P);
@@ -672,57 +854,46 @@ vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
- uint64_t *dst, *src, xcount, ccount, count, mask, i;
- uint8_t *b;
int x = tgts[0];
- int c, j, exp;
+ int c, exp;
+ abd_t *dst, *src;
+ struct reconst_q_struct rq;
ASSERT(ntgts == 1);
- xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
- ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
+ ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- src = rm->rm_col[c].rc_data;
- dst = rm->rm_col[x].rc_data;
-
- if (c == x)
- ccount = 0;
- else
- ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
+ uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
+ rm->rm_col[c].rc_size);
- count = MIN(ccount, xcount);
+ src = rm->rm_col[c].rc_abd;
+ dst = rm->rm_col[x].rc_abd;
if (c == rm->rm_firstdatacol) {
- for (i = 0; i < count; i++, dst++, src++) {
- *dst = *src;
- }
- for (; i < xcount; i++, dst++) {
- *dst = 0;
- }
+ abd_copy(dst, src, size);
+ if (rm->rm_col[x].rc_size > size)
+ abd_zero_off(dst, size,
+ rm->rm_col[x].rc_size - size);
} else {
- for (i = 0; i < count; i++, dst++, src++) {
- VDEV_RAIDZ_64MUL_2(*dst, mask);
- *dst ^= *src;
- }
-
- for (; i < xcount; i++, dst++) {
- VDEV_RAIDZ_64MUL_2(*dst, mask);
- }
+ ASSERT3U(size, <=, rm->rm_col[x].rc_size);
+ (void) abd_iterate_func2(dst, src, 0, 0, size,
+ vdev_raidz_reconst_q_pre_func, NULL);
+ (void) abd_iterate_func(dst,
+ size, rm->rm_col[x].rc_size - size,
+ vdev_raidz_reconst_q_pre_tail_func, NULL);
}
}
- src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
- dst = rm->rm_col[x].rc_data;
+ src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
+ dst = rm->rm_col[x].rc_abd;
exp = 255 - (rm->rm_cols - 1 - x);
+ rq.q = abd_to_buf(src);
+ rq.exp = exp;
- for (i = 0; i < xcount; i++, dst++, src++) {
- *dst ^= *src;
- for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
- *b = vdev_raidz_exp2(*b, exp);
- }
- }
+ (void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
+ vdev_raidz_reconst_q_post_func, &rq);
return (1 << VDEV_RAIDZ_Q);
}
@@ -730,11 +901,13 @@ vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
- uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
- void *pdata, *qdata;
- uint64_t xsize, ysize, i;
+ uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
+ abd_t *pdata, *qdata;
+ uint64_t xsize, ysize;
int x = tgts[0];
int y = tgts[1];
+ abd_t *xd, *yd;
+ struct reconst_pq_struct rpq;
ASSERT(ntgts == 2);
ASSERT(x < y);
@@ -750,15 +923,15 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
* parity so we make those columns appear to be full of zeros by
* setting their lengths to zero.
*/
- pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
- qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
+ pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
+ qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
xsize = rm->rm_col[x].rc_size;
ysize = rm->rm_col[y].rc_size;
- rm->rm_col[VDEV_RAIDZ_P].rc_data =
- zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
- rm->rm_col[VDEV_RAIDZ_Q].rc_data =
- zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
+ rm->rm_col[VDEV_RAIDZ_P].rc_abd =
+ abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
+ rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
+ abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
rm->rm_col[x].rc_size = 0;
rm->rm_col[y].rc_size = 0;
@@ -767,12 +940,12 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
rm->rm_col[x].rc_size = xsize;
rm->rm_col[y].rc_size = ysize;
- p = pdata;
- q = qdata;
- pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
- qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
- xd = rm->rm_col[x].rc_data;
- yd = rm->rm_col[y].rc_data;
+ p = abd_to_buf(pdata);
+ q = abd_to_buf(qdata);
+ pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
+ qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
+ xd = rm->rm_col[x].rc_abd;
+ yd = rm->rm_col[y].rc_abd;
/*
* We now have:
@@ -796,24 +969,27 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
- for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
- *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
- vdev_raidz_exp2(*q ^ *qxy, bexp);
+ ASSERT3U(xsize, >=, ysize);
+ rpq.p = p;
+ rpq.q = q;
+ rpq.pxy = pxy;
+ rpq.qxy = qxy;
+ rpq.aexp = aexp;
+ rpq.bexp = bexp;
- if (i < ysize)
- *yd = *p ^ *pxy ^ *xd;
- }
+ (void) abd_iterate_func2(xd, yd, 0, 0, ysize,
+ vdev_raidz_reconst_pq_func, &rpq);
+ (void) abd_iterate_func(xd, ysize, xsize - ysize,
+ vdev_raidz_reconst_pq_tail_func, &rpq);
- zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
- rm->rm_col[VDEV_RAIDZ_P].rc_size);
- zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
- rm->rm_col[VDEV_RAIDZ_Q].rc_size);
+ abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
+ abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
/*
* Restore the saved parity data.
*/
- rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
- rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
+ rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
+ rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;
return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}
@@ -1131,7 +1307,7 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
c = used[i];
ASSERT3U(c, <, rm->rm_cols);
- src = rm->rm_col[c].rc_data;
+ src = abd_to_buf(rm->rm_col[c].rc_abd);
ccount = rm->rm_col[c].rc_size;
for (j = 0; j < nmissing; j++) {
cc = missing[j] + rm->rm_firstdatacol;
@@ -1139,7 +1315,7 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
ASSERT3U(cc, <, rm->rm_cols);
ASSERT3U(cc, !=, c);
- dst[j] = rm->rm_col[cc].rc_data;
+ dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
dcount[j] = rm->rm_col[cc].rc_size;
}
@@ -1187,8 +1363,25 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
uint8_t *used;
+ abd_t **bufs = NULL;
+
int code = 0;
+ /*
+ * Matrix reconstruction can't use scatter ABDs yet, so we allocate
+ * temporary linear ABDs.
+ */
+ if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
+ bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);
+
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ raidz_col_t *col = &rm->rm_col[c];
+
+ bufs[c] = col->rc_abd;
+ col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
+ abd_copy(col->rc_abd, bufs[c], col->rc_size);
+ }
+ }
n = rm->rm_cols - rm->rm_firstdatacol;
@@ -1275,6 +1468,20 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
kmem_free(p, psize);
+ /*
+ * copy back from temporary linear abds and free them
+ */
+ if (bufs) {
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ raidz_col_t *col = &rm->rm_col[c];
+
+ abd_copy(bufs[c], col->rc_abd, col->rc_size);
+ abd_free(col->rc_abd);
+ col->rc_abd = bufs[c];
+ }
+ kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
+ }
+
return (code);
}
@@ -1321,7 +1528,6 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
dt = &tgts[nbadparity];
-
/* Reconstruct using the new math implementation */
ret = vdev_raidz_math_reconstruct(rm, parity_valid, dt, nbaddata);
if (ret != RAIDZ_ORIGINAL_IMPL)
@@ -1479,7 +1685,7 @@ vdev_raidz_io_start(zio_t *zio)
rc = &rm->rm_col[c];
cvd = vd->vdev_child[rc->rc_devidx];
zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
- rc->rc_offset, rc->rc_data, rc->rc_size,
+ rc->rc_offset, rc->rc_abd, rc->rc_size,
zio->io_type, zio->io_priority, 0,
vdev_raidz_child_done, rc));
}
@@ -1536,7 +1742,7 @@ vdev_raidz_io_start(zio_t *zio)
if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
(zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
- rc->rc_offset, rc->rc_data, rc->rc_size,
+ rc->rc_offset, rc->rc_abd, rc->rc_size,
zio->io_type, zio->io_priority, 0,
vdev_raidz_child_done, rc));
}
@@ -1552,6 +1758,7 @@ vdev_raidz_io_start(zio_t *zio)
static void
raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
{
+ void *buf;
vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
@@ -1565,9 +1772,11 @@ raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
zbc.zbc_has_cksum = 0;
zbc.zbc_injected = rm->rm_ecksuminjected;
+ buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size);
zfs_ereport_post_checksum(zio->io_spa, vd, zio,
- rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
+ rc->rc_offset, rc->rc_size, buf, bad_data,
&zbc);
+ abd_return_buf(rc->rc_abd, buf, rc->rc_size);
}
}
@@ -1616,7 +1825,7 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
if (!rc->rc_tried || rc->rc_error != 0)
continue;
orig[c] = zio_buf_alloc(rc->rc_size);
- bcopy(rc->rc_data, orig[c], rc->rc_size);
+ abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size);
}
vdev_raidz_generate_parity(rm);
@@ -1625,7 +1834,7 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
rc = &rm->rm_col[c];
if (!rc->rc_tried || rc->rc_error != 0)
continue;
- if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
+ if (bcmp(orig[c], abd_to_buf(rc->rc_abd), rc->rc_size) != 0) {
raidz_checksum_error(zio, rc, orig[c]);
rc->rc_error = SET_ERROR(ECKSUM);
ret++;
@@ -1728,7 +1937,8 @@ vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
ASSERT3S(c, >=, 0);
ASSERT3S(c, <, rm->rm_cols);
rc = &rm->rm_col[c];
- bcopy(rc->rc_data, orig[i], rc->rc_size);
+ abd_copy_to_buf(orig[i], rc->rc_abd,
+ rc->rc_size);
}
/*
@@ -1758,7 +1968,8 @@ vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
for (i = 0; i < n; i++) {
c = tgts[i];
rc = &rm->rm_col[c];
- bcopy(orig[i], rc->rc_data, rc->rc_size);
+ abd_copy_from_buf(rc->rc_abd, orig[i],
+ rc->rc_size);
}
do {
@@ -1997,7 +2208,7 @@ vdev_raidz_io_done(zio_t *zio)
continue;
zio_nowait(zio_vdev_child_io(zio, NULL,
vd->vdev_child[rc->rc_devidx],
- rc->rc_offset, rc->rc_data, rc->rc_size,
+ rc->rc_offset, rc->rc_abd, rc->rc_size,
zio->io_type, zio->io_priority, 0,
vdev_raidz_child_done, rc));
} while (++c < rm->rm_cols);
@@ -2077,7 +2288,7 @@ done:
continue;
zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
- rc->rc_offset, rc->rc_data, rc->rc_size,
+ rc->rc_offset, rc->rc_abd, rc->rc_size,
ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
diff --git a/module/zfs/vdev_raidz_math.c b/module/zfs/vdev_raidz_math.c
index 33c05dadd..c050c9099 100644
--- a/module/zfs/vdev_raidz_math.c
+++ b/module/zfs/vdev_raidz_math.c
@@ -61,7 +61,7 @@ const raidz_impl_ops_t *raidz_all_maths[] = {
&vdev_raidz_avx512f_impl,
#endif
#if defined(__x86_64) && defined(HAVE_AVX512BW) /* only x86_64 for now */
- &vdev_raidz_avx512bw_impl,
+ // &vdev_raidz_avx512bw_impl,
#endif
#if defined(__aarch64__)
&vdev_raidz_aarch64_neon_impl,
@@ -240,17 +240,17 @@ int
vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid,
const int *dt, const int nbaddata)
{
- raidz_rec_f rec_data = NULL;
+ raidz_rec_f rec_fn = NULL;
switch (raidz_parity(rm)) {
case PARITY_P:
- rec_data = reconstruct_fun_p_sel(rm, parity_valid, nbaddata);
+ rec_fn = reconstruct_fun_p_sel(rm, parity_valid, nbaddata);
break;
case PARITY_PQ:
- rec_data = reconstruct_fun_pq_sel(rm, parity_valid, nbaddata);
+ rec_fn = reconstruct_fun_pq_sel(rm, parity_valid, nbaddata);
break;
case PARITY_PQR:
- rec_data = reconstruct_fun_pqr_sel(rm, parity_valid, nbaddata);
+ rec_fn = reconstruct_fun_pqr_sel(rm, parity_valid, nbaddata);
break;
default:
cmn_err(CE_PANIC, "invalid RAID-Z configuration %d",
@@ -258,10 +258,10 @@ vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid,
break;
}
- if (rec_data == NULL)
+ if (rec_fn == NULL)
return (RAIDZ_ORIGINAL_IMPL);
else
- return (rec_data(rm, dt));
+ return (rec_fn(rm, dt));
}
const char *raidz_gen_name[] = {
@@ -471,13 +471,12 @@ vdev_raidz_math_init(void)
return;
#endif
- /* Fake an zio and run the benchmark on it */
+ /* Fake an zio and run the benchmark on a warmed up buffer */
bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP);
bench_zio->io_offset = 0;
bench_zio->io_size = BENCH_ZIO_SIZE; /* only data columns */
- bench_zio->io_data = zio_data_buf_alloc(BENCH_ZIO_SIZE);
- VERIFY(bench_zio->io_data);
- memset(bench_zio->io_data, 0xAA, BENCH_ZIO_SIZE); /* warm up */
+ bench_zio->io_abd = abd_alloc_linear(BENCH_ZIO_SIZE, B_TRUE);
+ memset(abd_to_buf(bench_zio->io_abd), 0xAA, BENCH_ZIO_SIZE);
/* Benchmark parity generation methods */
for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
@@ -501,7 +500,7 @@ vdev_raidz_math_init(void)
vdev_raidz_map_free(bench_rm);
/* cleanup the bench zio */
- zio_data_buf_free(bench_zio->io_data, BENCH_ZIO_SIZE);
+ abd_free(bench_zio->io_abd);
kmem_free(bench_zio, sizeof (zio_t));
/* install kstats for all impl */
diff --git a/module/zfs/vdev_raidz_math_aarch64_neon.c b/module/zfs/vdev_raidz_math_aarch64_neon.c
index f6a433f10..c7b8afd38 100644
--- a/module/zfs/vdev_raidz_math_aarch64_neon.c
+++ b/module/zfs/vdev_raidz_math_aarch64_neon.c
@@ -23,11 +23,38 @@
*/
#include <sys/isa_defs.h>
+#include <sys/types.h>
#if defined(__aarch64__)
#include "vdev_raidz_math_aarch64_neon_common.h"
+#define SYN_STRIDE 4
+
+#define ZERO_STRIDE 4
+#define ZERO_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_33_36()
+#define ZERO_D 0, 1, 2, 3
+
+#define COPY_STRIDE 4
+#define COPY_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_33_36()
+#define COPY_D 0, 1, 2, 3
+
+#define ADD_STRIDE 4
+#define ADD_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_33_36()
+#define ADD_D 0, 1, 2, 3
+
+#define MUL_STRIDE 4
+#define MUL_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_33_36()
+#define MUL_D 0, 1, 2, 3
+
#define GEN_P_DEFINE() \
GEN_X_DEFINE_0_3() \
GEN_X_DEFINE_33_36()
@@ -38,15 +65,12 @@
GEN_X_DEFINE_0_3() \
GEN_X_DEFINE_4_5() \
GEN_X_DEFINE_6_7() \
- GEN_X_DEFINE_8_9() \
- GEN_X_DEFINE_10_11() \
GEN_X_DEFINE_16() \
GEN_X_DEFINE_17() \
GEN_X_DEFINE_33_36()
#define GEN_PQ_STRIDE 4
#define GEN_PQ_D 0, 1, 2, 3
-#define GEN_PQ_P 4, 5, 6, 7
-#define GEN_PQ_Q 8, 9, 10, 11
+#define GEN_PQ_C 4, 5, 6, 7
#define GEN_PQR_DEFINE() \
GEN_X_DEFINE_0_3() \
@@ -54,69 +78,115 @@
GEN_X_DEFINE_6_7() \
GEN_X_DEFINE_16() \
GEN_X_DEFINE_17() \
- GEN_X_DEFINE_31() \
- GEN_X_DEFINE_32() \
GEN_X_DEFINE_33_36()
-#define GEN_PQR_STRIDE 2
-#define GEN_PQR_D 0, 1
-#define GEN_PQR_P 2, 3
-#define GEN_PQR_Q 4, 5
-#define GEN_PQR_R 6, 7
+#define GEN_PQR_STRIDE 4
+#define GEN_PQR_D 0, 1, 2, 3
+#define GEN_PQR_C 4, 5, 6, 7
-#define REC_P_DEFINE() \
- GEN_X_DEFINE_0_3() \
+#define SYN_Q_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
GEN_X_DEFINE_33_36()
-#define REC_P_STRIDE 4
-#define REC_P_X 0, 1, 2, 3
+#define SYN_Q_STRIDE 4
+#define SYN_Q_D 0, 1, 2, 3
+#define SYN_Q_X 4, 5, 6, 7
-#define REC_Q_DEFINE() \
+#define SYN_R_DEFINE() \
GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
GEN_X_DEFINE_16() \
GEN_X_DEFINE_17() \
GEN_X_DEFINE_33_36()
-#define REC_Q_STRIDE 4
-#define REC_Q_X 0, 1, 2, 3
+#define SYN_R_STRIDE 4
+#define SYN_R_D 0, 1, 2, 3
+#define SYN_R_X 4, 5, 6, 7
-#define REC_R_DEFINE() \
+#define SYN_PQ_DEFINE() \
GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
GEN_X_DEFINE_16() \
GEN_X_DEFINE_17() \
GEN_X_DEFINE_33_36()
-#define REC_R_STRIDE 4
-#define REC_R_X 0, 1, 2, 3
+#define SYN_PQ_STRIDE 4
+#define SYN_PQ_D 0, 1, 2, 3
+#define SYN_PQ_X 4, 5, 6, 7
#define REC_PQ_DEFINE() \
GEN_X_DEFINE_0_3() \
GEN_X_DEFINE_4_5() \
- GEN_X_DEFINE_16() \
- GEN_X_DEFINE_17() \
GEN_X_DEFINE_31() \
GEN_X_DEFINE_32() \
GEN_X_DEFINE_33_36()
#define REC_PQ_STRIDE 2
#define REC_PQ_X 0, 1
#define REC_PQ_Y 2, 3
-#define REC_PQ_D 4, 5
+#define REC_PQ_T 4, 5
+
+#define SYN_PR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define SYN_PR_STRIDE 4
+#define SYN_PR_D 0, 1, 2, 3
+#define SYN_PR_X 4, 5, 6, 7
-#define REC_PR_DEFINE() REC_PQ_DEFINE()
+#define REC_PR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_31() \
+ GEN_X_DEFINE_32() \
+ GEN_X_DEFINE_33_36()
#define REC_PR_STRIDE 2
#define REC_PR_X 0, 1
#define REC_PR_Y 2, 3
-#define REC_PR_D 4, 5
+#define REC_PR_T 4, 5
+
+#define SYN_QR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define SYN_QR_STRIDE 4
+#define SYN_QR_D 0, 1, 2, 3
+#define SYN_QR_X 4, 5, 6, 7
-#define REC_QR_DEFINE() REC_PQ_DEFINE()
+#define REC_QR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_31() \
+ GEN_X_DEFINE_32() \
+ GEN_X_DEFINE_33_36()
#define REC_QR_STRIDE 2
#define REC_QR_X 0, 1
#define REC_QR_Y 2, 3
-#define REC_QR_D 4, 5
+#define REC_QR_T 4, 5
-#define REC_PQR_DEFINE() \
+#define SYN_PQR_DEFINE() \
GEN_X_DEFINE_0_3() \
GEN_X_DEFINE_4_5() \
GEN_X_DEFINE_6_7() \
- GEN_X_DEFINE_8_9() \
GEN_X_DEFINE_16() \
GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define SYN_PQR_STRIDE 4
+#define SYN_PQR_D 0, 1, 2, 3
+#define SYN_PQR_X 4, 5, 6, 7
+
+#define REC_PQR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_8_9() \
GEN_X_DEFINE_31() \
GEN_X_DEFINE_32() \
GEN_X_DEFINE_33_36()
@@ -124,7 +194,6 @@
#define REC_PQR_X 0, 1
#define REC_PQR_Y 2, 3
#define REC_PQR_Z 4, 5
-#define REC_PQR_D 6, 7
#define REC_PQR_XS 6, 7
#define REC_PQR_YS 8, 9
diff --git a/module/zfs/vdev_raidz_math_aarch64_neon_common.h b/module/zfs/vdev_raidz_math_aarch64_neon_common.h
index 08dbddaea..cb9ff86c1 100644
--- a/module/zfs/vdev_raidz_math_aarch64_neon_common.h
+++ b/module/zfs/vdev_raidz_math_aarch64_neon_common.h
@@ -125,7 +125,7 @@
#define ASM_BUG() ASSERT(0)
-#define OFFSET(ptr, val) (((unsigned char *)ptr)+val)
+#define OFFSET(ptr, val) (((unsigned char *)(ptr))+val)
extern const uint8_t gf_clmul_mod_lt[4*256][16];
@@ -135,20 +135,6 @@ typedef struct v {
uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE)));
} v_t;
-#define PREFETCHNTA(ptr, offset) \
-{ \
- __asm( \
- "prfm pstl1strm, %[MEM]\n" \
- : : [MEM] "Q" (*(ptr + offset))); \
-}
-
-#define PREFETCH(ptr, offset) \
-{ \
- __asm( \
- "prfm pldl1keep, %[MEM]\n" \
- : : [MEM] "Q" (*(ptr + offset))); \
-}
-
#define XOR_ACC(src, r...) \
{ \
switch (REG_CNT(r)) { \
@@ -242,6 +228,19 @@ typedef struct v {
#define ZERO(r...) \
{ \
switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "eor " VR0(r) ".16b," VR0(r) ".16b," VR0(r) ".16b\n" \
+ "eor " VR1(r) ".16b," VR1(r) ".16b," VR1(r) ".16b\n" \
+ "eor " VR2(r) ".16b," VR2(r) ".16b," VR2(r) ".16b\n" \
+ "eor " VR3(r) ".16b," VR3(r) ".16b," VR3(r) ".16b\n" \
+ "eor " VR4(r) ".16b," VR4(r) ".16b," VR4(r) ".16b\n" \
+ "eor " VR5(r) ".16b," VR5(r) ".16b," VR5(r) ".16b\n" \
+ "eor " VR6(r) ".16b," VR6(r) ".16b," VR6(r) ".16b\n" \
+ "eor " VR7(r) ".16b," VR7(r) ".16b," VR7(r) ".16b\n" \
+ : WVR0(r), WVR1(r), WVR2(r), WVR3(r), \
+ WVR4(r), WVR5(r), WVR6(r), WVR7(r)); \
+ break; \
case 4: \
__asm( \
"eor " VR0(r) ".16b," VR0(r) ".16b," VR0(r) ".16b\n" \
diff --git a/module/zfs/vdev_raidz_math_aarch64_neonx2.c b/module/zfs/vdev_raidz_math_aarch64_neonx2.c
index d8d1f1bce..f8688a06a 100644
--- a/module/zfs/vdev_raidz_math_aarch64_neonx2.c
+++ b/module/zfs/vdev_raidz_math_aarch64_neonx2.c
@@ -28,111 +28,179 @@
#include "vdev_raidz_math_aarch64_neon_common.h"
-#define GEN_P_DEFINE() \
+#define SYN_STRIDE 4
+
+#define ZERO_STRIDE 8
+#define ZERO_DEFINE() \
GEN_X_DEFINE_0_3() \
GEN_X_DEFINE_4_5() \
GEN_X_DEFINE_6_7()
-#define GEN_P_STRIDE 8
-#define GEN_P_P 0, 1, 2, 3, 4, 5, 6, 7
+#define ZERO_D 0, 1, 2, 3, 4, 5, 6, 7
-#define GEN_PQ_DEFINE() \
+#define COPY_STRIDE 8
+#define COPY_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7()
+#define COPY_D 0, 1, 2, 3, 4, 5, 6, 7
+
+#define ADD_STRIDE 8
+#define ADD_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7()
+#define ADD_D 0, 1, 2, 3, 4, 5, 6, 7
+
+#define MUL_STRIDE 4
+#define MUL_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_33_36()
+#define MUL_D 0, 1, 2, 3
+
+#define GEN_P_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_33_36()
+#define GEN_P_STRIDE 4
+#define GEN_P_P 0, 1, 2, 3
+
+#define GEN_PQ_DEFINE() \
GEN_X_DEFINE_0_3() \
GEN_X_DEFINE_4_5() \
GEN_X_DEFINE_6_7() \
- GEN_X_DEFINE_8_9() \
- GEN_X_DEFINE_10_11() \
GEN_X_DEFINE_16() \
GEN_X_DEFINE_17() \
GEN_X_DEFINE_33_36()
#define GEN_PQ_STRIDE 4
#define GEN_PQ_D 0, 1, 2, 3
-#define GEN_PQ_P 4, 5, 6, 7
-#define GEN_PQ_Q 8, 9, 10, 11
+#define GEN_PQ_C 4, 5, 6, 7
#define GEN_PQR_DEFINE() \
GEN_X_DEFINE_0_3() \
GEN_X_DEFINE_4_5() \
GEN_X_DEFINE_6_7() \
- GEN_X_DEFINE_8_9() \
- GEN_X_DEFINE_22_23() \
- GEN_X_DEFINE_24_27() \
GEN_X_DEFINE_16() \
GEN_X_DEFINE_17() \
GEN_X_DEFINE_33_36()
#define GEN_PQR_STRIDE 4
#define GEN_PQR_D 0, 1, 2, 3
-#define GEN_PQR_P 4, 5, 6, 7
-#define GEN_PQR_Q 8, 9, 22, 23
-#define GEN_PQR_R 24, 25, 26, 27
+#define GEN_PQR_C 4, 5, 6, 7
-#define REC_P_DEFINE() \
+#define SYN_Q_DEFINE() \
GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
GEN_X_DEFINE_33_36()
-#define REC_P_STRIDE 4
-#define REC_P_X 0, 1, 2, 3
+#define SYN_Q_STRIDE 4
+#define SYN_Q_D 0, 1, 2, 3
+#define SYN_Q_X 4, 5, 6, 7
-#define REC_Q_DEFINE() \
+#define SYN_R_DEFINE() \
GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
GEN_X_DEFINE_16() \
GEN_X_DEFINE_17() \
GEN_X_DEFINE_33_36()
-#define REC_Q_STRIDE 4
-#define REC_Q_X 0, 1, 2, 3
+#define SYN_R_STRIDE 4
+#define SYN_R_D 0, 1, 2, 3
+#define SYN_R_X 4, 5, 6, 7
-#define REC_R_DEFINE() \
+#define SYN_PQ_DEFINE() \
GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
GEN_X_DEFINE_16() \
GEN_X_DEFINE_17() \
GEN_X_DEFINE_33_36()
-#define REC_R_STRIDE 4
-#define REC_R_X 0, 1, 2, 3
+#define SYN_PQ_STRIDE 4
+#define SYN_PQ_D 0, 1, 2, 3
+#define SYN_PQ_X 4, 5, 6, 7
#define REC_PQ_DEFINE() \
GEN_X_DEFINE_0_3() \
GEN_X_DEFINE_4_5() \
GEN_X_DEFINE_6_7() \
GEN_X_DEFINE_8_9() \
- GEN_X_DEFINE_16() \
- GEN_X_DEFINE_17() \
GEN_X_DEFINE_22_23() \
GEN_X_DEFINE_33_36()
#define REC_PQ_STRIDE 4
#define REC_PQ_X 0, 1, 2, 3
#define REC_PQ_Y 4, 5, 6, 7
-#define REC_PQ_D 8, 9, 22, 23
+#define REC_PQ_T 8, 9, 22, 23
-#define REC_PR_DEFINE() REC_PQ_DEFINE()
+#define SYN_PR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define SYN_PR_STRIDE 4
+#define SYN_PR_D 0, 1, 2, 3
+#define SYN_PR_X 4, 5, 6, 7
+
+#define REC_PR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_8_9() \
+ GEN_X_DEFINE_22_23() \
+ GEN_X_DEFINE_33_36()
#define REC_PR_STRIDE 4
#define REC_PR_X 0, 1, 2, 3
#define REC_PR_Y 4, 5, 6, 7
-#define REC_PR_D 8, 9, 22, 23
+#define REC_PR_T 8, 9, 22, 23
-#define REC_QR_DEFINE() REC_PQ_DEFINE()
+#define SYN_QR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define SYN_QR_STRIDE 4
+#define SYN_QR_D 0, 1, 2, 3
+#define SYN_QR_X 4, 5, 6, 7
+
+#define REC_QR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_8_9() \
+ GEN_X_DEFINE_22_23() \
+ GEN_X_DEFINE_33_36()
#define REC_QR_STRIDE 4
#define REC_QR_X 0, 1, 2, 3
#define REC_QR_Y 4, 5, 6, 7
-#define REC_QR_D 8, 9, 22, 23
+#define REC_QR_T 8, 9, 22, 23
-#define REC_PQR_DEFINE() \
+#define SYN_PQR_DEFINE() \
GEN_X_DEFINE_0_3() \
GEN_X_DEFINE_4_5() \
GEN_X_DEFINE_6_7() \
- GEN_X_DEFINE_8_9() \
GEN_X_DEFINE_16() \
GEN_X_DEFINE_17() \
- GEN_X_DEFINE_22_23() \
- GEN_X_DEFINE_24_27() \
- GEN_X_DEFINE_28_30() \
- GEN_X_DEFINE_31() \
GEN_X_DEFINE_33_36()
-#define REC_PQR_STRIDE 4
-#define REC_PQR_X 0, 1, 2, 3
-#define REC_PQR_Y 4, 5, 6, 7
-#define REC_PQR_Z 8, 9, 22, 23
-#define REC_PQR_D 24, 25, 26, 27
-#define REC_PQR_XS 24, 25, 26, 27
-#define REC_PQR_YS 28, 29, 30, 31
+#define SYN_PQR_STRIDE 4
+#define SYN_PQR_D 0, 1, 2, 3
+#define SYN_PQR_X 4, 5, 6, 7
+#define REC_PQR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_8_9() \
+ GEN_X_DEFINE_31() \
+ GEN_X_DEFINE_32() \
+ GEN_X_DEFINE_33_36()
+#define REC_PQR_STRIDE 2
+#define REC_PQR_X 0, 1
+#define REC_PQR_Y 2, 3
+#define REC_PQR_Z 4, 5
+#define REC_PQR_XS 6, 7
+#define REC_PQR_YS 8, 9
#include <sys/vdev_raidz_impl.h>
#include "vdev_raidz_math_impl.h"
diff --git a/module/zfs/vdev_raidz_math_avx2.c b/module/zfs/vdev_raidz_math_avx2.c
index 90c94c77c..07113a235 100644
--- a/module/zfs/vdev_raidz_math_avx2.c
+++ b/module/zfs/vdev_raidz_math_avx2.c
@@ -21,7 +21,6 @@
/*
* Copyright (C) 2016 Gvozden Nešković. All rights reserved.
*/
-
#include <sys/isa_defs.h>
#if defined(__x86_64) && defined(HAVE_AVX2)
@@ -66,19 +65,6 @@ typedef struct v {
uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE)));
} v_t;
-#define PREFETCHNTA(ptr, offset) \
-{ \
- __asm( \
- "prefetchnta " #offset "(%[MEM])\n" \
- : : [MEM] "r" (ptr)); \
-}
-
-#define PREFETCH(ptr, offset) \
-{ \
- __asm( \
- "prefetcht0 " #offset "(%[MEM])\n" \
- : : [MEM] "r" (ptr)); \
-}
#define XOR_ACC(src, r...) \
{ \
@@ -122,25 +108,7 @@ typedef struct v {
} \
}
-#define ZERO(r...) \
-{ \
- switch (REG_CNT(r)) { \
- case 4: \
- __asm( \
- "vpxor %" VR0(r) ", %" VR0(r)", %" VR0(r) "\n" \
- "vpxor %" VR1(r) ", %" VR1(r)", %" VR1(r) "\n" \
- "vpxor %" VR2(r) ", %" VR2(r)", %" VR2(r) "\n" \
- "vpxor %" VR3(r) ", %" VR3(r)", %" VR3(r)); \
- break; \
- case 2: \
- __asm( \
- "vpxor %" VR0(r) ", %" VR0(r)", %" VR0(r) "\n" \
- "vpxor %" VR1(r) ", %" VR1(r)", %" VR1(r)); \
- break; \
- default: \
- ASM_BUG(); \
- } \
-}
+#define ZERO(r...) XOR(r, r)
#define COPY(r...) \
{ \
@@ -335,59 +303,86 @@ static const uint8_t __attribute__((aligned(32))) _mul_mask = 0x0F;
kfpu_end(); \
}
-#define GEN_P_DEFINE() {}
+
+#define SYN_STRIDE 4
+
+#define ZERO_STRIDE 4
+#define ZERO_DEFINE() {}
+#define ZERO_D 0, 1, 2, 3
+
+#define COPY_STRIDE 4
+#define COPY_DEFINE() {}
+#define COPY_D 0, 1, 2, 3
+
+#define ADD_STRIDE 4
+#define ADD_DEFINE() {}
+#define ADD_D 0, 1, 2, 3
+
+#define MUL_STRIDE 4
+#define MUL_DEFINE() {}
+#define MUL_D 0, 1, 2, 3
+
#define GEN_P_STRIDE 4
+#define GEN_P_DEFINE() {}
#define GEN_P_P 0, 1, 2, 3
-#define GEN_PQ_DEFINE() {}
#define GEN_PQ_STRIDE 4
+#define GEN_PQ_DEFINE() {}
#define GEN_PQ_D 0, 1, 2, 3
-#define GEN_PQ_P 4, 5, 6, 7
-#define GEN_PQ_Q 8, 9, 10, 11
+#define GEN_PQ_C 4, 5, 6, 7
+#define GEN_PQR_STRIDE 4
#define GEN_PQR_DEFINE() {}
-#define GEN_PQR_STRIDE 2
-#define GEN_PQR_D 0, 1
-#define GEN_PQR_P 2, 3
-#define GEN_PQR_Q 4, 5
-#define GEN_PQR_R 6, 7
+#define GEN_PQR_D 0, 1, 2, 3
+#define GEN_PQR_C 4, 5, 6, 7
-#define REC_P_DEFINE() {}
-#define REC_P_STRIDE 4
-#define REC_P_X 0, 1, 2, 3
+#define SYN_Q_DEFINE() {}
+#define SYN_Q_D 0, 1, 2, 3
+#define SYN_Q_X 4, 5, 6, 7
-#define REC_Q_DEFINE() {}
-#define REC_Q_STRIDE 4
-#define REC_Q_X 0, 1, 2, 3
+#define SYN_R_DEFINE() {}
+#define SYN_R_D 0, 1, 2, 3
+#define SYN_R_X 4, 5, 6, 7
-#define REC_R_DEFINE() {}
-#define REC_R_STRIDE 4
-#define REC_R_X 0, 1, 2, 3
+#define SYN_PQ_DEFINE() {}
+#define SYN_PQ_D 0, 1, 2, 3
+#define SYN_PQ_X 4, 5, 6, 7
-#define REC_PQ_DEFINE() {}
#define REC_PQ_STRIDE 2
+#define REC_PQ_DEFINE() {}
#define REC_PQ_X 0, 1
#define REC_PQ_Y 2, 3
-#define REC_PQ_D 4, 5
+#define REC_PQ_T 4, 5
+
+#define SYN_PR_DEFINE() {}
+#define SYN_PR_D 0, 1, 2, 3
+#define SYN_PR_X 4, 5, 6, 7
-#define REC_PR_DEFINE() {}
#define REC_PR_STRIDE 2
+#define REC_PR_DEFINE() {}
#define REC_PR_X 0, 1
#define REC_PR_Y 2, 3
-#define REC_PR_D 4, 5
+#define REC_PR_T 4, 5
+
+#define SYN_QR_DEFINE() {}
+#define SYN_QR_D 0, 1, 2, 3
+#define SYN_QR_X 4, 5, 6, 7
-#define REC_QR_DEFINE() {}
#define REC_QR_STRIDE 2
+#define REC_QR_DEFINE() {}
#define REC_QR_X 0, 1
#define REC_QR_Y 2, 3
-#define REC_QR_D 4, 5
+#define REC_QR_T 4, 5
+
+#define SYN_PQR_DEFINE() {}
+#define SYN_PQR_D 0, 1, 2, 3
+#define SYN_PQR_X 4, 5, 6, 7
-#define REC_PQR_DEFINE() {}
#define REC_PQR_STRIDE 2
+#define REC_PQR_DEFINE() {}
#define REC_PQR_X 0, 1
#define REC_PQR_Y 2, 3
#define REC_PQR_Z 4, 5
-#define REC_PQR_D 6, 7
#define REC_PQR_XS 6, 7
#define REC_PQR_YS 8, 9
diff --git a/module/zfs/vdev_raidz_math_avx512bw.c b/module/zfs/vdev_raidz_math_avx512bw.c
index bcbe657d0..465d1e569 100644
--- a/module/zfs/vdev_raidz_math_avx512bw.c
+++ b/module/zfs/vdev_raidz_math_avx512bw.c
@@ -24,7 +24,7 @@
#include <sys/isa_defs.h>
-#if defined(__x86_64) && defined(HAVE_AVX512BW)
+#if 0 // defined(__x86_64) && defined(HAVE_AVX512BW)
#include <sys/types.h>
#include <linux/simd_x86.h>
@@ -345,6 +345,22 @@ static const uint8_t __attribute__((aligned(32))) _mul_mask = 0x0F;
kfpu_end(); \
}
+#define ZERO_STRIDE 4
+#define ZERO_DEFINE() {}
+#define ZERO_D 0, 1, 2, 3
+
+#define COPY_STRIDE 4
+#define COPY_DEFINE() {}
+#define COPY_D 0, 1, 2, 3
+
+#define ADD_STRIDE 4
+#define ADD_DEFINE() {}
+#define ADD_D 0, 1, 2, 3
+
+#define MUL_STRIDE 4
+#define MUL_DEFINE() {}
+#define MUL_D 0, 1, 2, 3
+
#define GEN_P_DEFINE() {}
#define GEN_P_STRIDE 4
#define GEN_P_P 0, 1, 2, 3
diff --git a/module/zfs/vdev_raidz_math_avx512f.c b/module/zfs/vdev_raidz_math_avx512f.c
index cc3868bce..0b6108c10 100644
--- a/module/zfs/vdev_raidz_math_avx512f.c
+++ b/module/zfs/vdev_raidz_math_avx512f.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (C) 2016 Romain Dolbeau. All rights reserved.
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
*/
#include <sys/isa_defs.h>
@@ -74,29 +75,12 @@
#define _R_23(_0, _1, REG2, REG3, ...) REG2, REG3
#define R_23(REG...) _R_23(REG, 1, 2, 3)
-#define ASM_BUG() ASSERT(0)
-
-extern const uint8_t gf_clmul_mod_lt[4*256][16];
-
#define ELEM_SIZE 64
typedef struct v {
uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE)));
} v_t;
-#define PREFETCHNTA(ptr, offset) \
-{ \
- __asm( \
- "prefetchnta " #offset "(%[MEM])\n" \
- : : [MEM] "r" (ptr)); \
-}
-
-#define PREFETCH(ptr, offset) \
-{ \
- __asm( \
- "prefetcht0 " #offset "(%[MEM])\n" \
- : : [MEM] "r" (ptr)); \
-}
#define XOR_ACC(src, r...) \
{ \
@@ -109,14 +93,6 @@ typedef struct v {
"vpxorq 0xc0(%[SRC]), %%" VR3(r)", %%" VR3(r) "\n" \
: : [SRC] "r" (src)); \
break; \
- case 2: \
- __asm( \
- "vpxorq 0x00(%[SRC]), %%" VR0(r)", %%" VR0(r) "\n" \
- "vpxorq 0x40(%[SRC]), %%" VR1(r)", %%" VR1(r) "\n" \
- : : [SRC] "r" (src)); \
- break; \
- default: \
- ASM_BUG(); \
} \
}
@@ -135,30 +111,12 @@ typedef struct v {
"vpxorq %" VR0(r) ", %" VR2(r)", %" VR2(r) "\n" \
"vpxorq %" VR1(r) ", %" VR3(r)", %" VR3(r)); \
break; \
- default: \
- ASM_BUG(); \
} \
}
-#define ZERO(r...) \
-{ \
- switch (REG_CNT(r)) { \
- case 4: \
- __asm( \
- "vpxorq %" VR0(r) ", %" VR0(r)", %" VR0(r) "\n" \
- "vpxorq %" VR1(r) ", %" VR1(r)", %" VR1(r) "\n" \
- "vpxorq %" VR2(r) ", %" VR2(r)", %" VR2(r) "\n" \
- "vpxorq %" VR3(r) ", %" VR3(r)", %" VR3(r)); \
- break; \
- case 2: \
- __asm( \
- "vpxorq %" VR0(r) ", %" VR0(r)", %" VR0(r) "\n" \
- "vpxorq %" VR1(r) ", %" VR1(r)", %" VR1(r)); \
- break; \
- default: \
- ASM_BUG(); \
- } \
-}
+
+#define ZERO(r...) XOR(r, r)
+
#define COPY(r...) \
{ \
@@ -175,8 +133,6 @@ typedef struct v {
"vmovdqa64 %" VR0(r) ", %" VR2(r) "\n" \
"vmovdqa64 %" VR1(r) ", %" VR3(r)); \
break; \
- default: \
- ASM_BUG(); \
} \
}
@@ -191,14 +147,6 @@ typedef struct v {
"vmovdqa64 0xc0(%[SRC]), %%" VR3(r) "\n" \
: : [SRC] "r" (src)); \
break; \
- case 2: \
- __asm( \
- "vmovdqa64 0x00(%[SRC]), %%" VR0(r) "\n" \
- "vmovdqa64 0x40(%[SRC]), %%" VR1(r) "\n" \
- : : [SRC] "r" (src)); \
- break; \
- default: \
- ASM_BUG(); \
} \
}
@@ -213,31 +161,17 @@ typedef struct v {
"vmovdqa64 %%" VR3(r) ", 0xc0(%[DST])\n" \
: : [DST] "r" (dst)); \
break; \
- case 2: \
- __asm( \
- "vmovdqa64 %%" VR0(r) ", 0x00(%[DST])\n" \
- "vmovdqa64 %%" VR1(r) ", 0x40(%[DST])\n" \
- : : [DST] "r" (dst)); \
- break; \
- default: \
- ASM_BUG(); \
} \
}
-#define FLUSH() \
-{ \
- __asm("vzeroupper"); \
-}
-
#define MUL2_SETUP() \
{ \
- __asm("vmovq %0, %%xmm14" :: "r"(0x1d1d1d1d1d1d1d1d)); \
- __asm("vpbroadcastq %xmm14, %zmm14"); \
- __asm("vmovq %0, %%xmm13" :: "r"(0x8080808080808080)); \
- __asm("vpbroadcastq %xmm13, %zmm13"); \
- __asm("vmovq %0, %%xmm12" :: "r"(0xfefefefefefefefe)); \
- __asm("vpbroadcastq %xmm12, %zmm12"); \
- __asm("vpxorq %zmm0, %zmm0 ,%zmm0"); \
+ __asm("vmovq %0, %%xmm31" :: "r"(0x1d1d1d1d1d1d1d1d)); \
+ __asm("vpbroadcastq %xmm31, %zmm31"); \
+ __asm("vmovq %0, %%xmm30" :: "r"(0x8080808080808080)); \
+ __asm("vpbroadcastq %xmm30, %zmm30"); \
+ __asm("vmovq %0, %%xmm29" :: "r"(0xfefefefefefefefe)); \
+ __asm("vpbroadcastq %xmm29, %zmm29"); \
}
#define _MUL2(r...) \
@@ -245,23 +179,21 @@ typedef struct v {
switch (REG_CNT(r)) { \
case 2: \
__asm( \
- "vpandq %" VR0(r)", %zmm13, %zmm10\n" \
- "vpandq %" VR1(r)", %zmm13, %zmm11\n" \
- "vpsrlq $7, %zmm10, %zmm30\n" \
- "vpsrlq $7, %zmm11, %zmm31\n" \
- "vpsllq $1, %zmm10, %zmm10\n" \
- "vpsllq $1, %zmm11, %zmm11\n" \
- "vpsubq %zmm30, %zmm10, %zmm10\n" \
- "vpsubq %zmm31, %zmm11, %zmm11\n" \
+ "vpandq %" VR0(r)", %zmm30, %zmm26\n" \
+ "vpandq %" VR1(r)", %zmm30, %zmm25\n" \
+ "vpsrlq $7, %zmm26, %zmm28\n" \
+ "vpsrlq $7, %zmm25, %zmm27\n" \
+ "vpsllq $1, %zmm26, %zmm26\n" \
+ "vpsllq $1, %zmm25, %zmm25\n" \
+ "vpsubq %zmm28, %zmm26, %zmm26\n" \
+ "vpsubq %zmm27, %zmm25, %zmm25\n" \
"vpsllq $1, %" VR0(r)", %" VR0(r) "\n" \
"vpsllq $1, %" VR1(r)", %" VR1(r) "\n" \
- "vpandq %zmm10, %zmm14, %zmm10\n" \
- "vpandq %zmm11, %zmm14, %zmm11\n" \
- "vpternlogd $0x6c,%zmm12, %zmm10, %" VR0(r) "\n" \
- "vpternlogd $0x6c,%zmm12, %zmm11, %" VR1(r)); \
+ "vpandq %zmm26, %zmm31, %zmm26\n" \
+ "vpandq %zmm25, %zmm31, %zmm25\n" \
+ "vpternlogd $0x6c,%zmm29, %zmm26, %" VR0(r) "\n" \
+ "vpternlogd $0x6c,%zmm29, %zmm25, %" VR1(r)); \
break; \
- default: \
- ASM_BUG(); \
} \
}
@@ -275,8 +207,6 @@ typedef struct v {
case 2: \
_MUL2(r); \
break; \
- default: \
- ASM_BUG(); \
} \
}
@@ -286,216 +216,249 @@ typedef struct v {
MUL2(r); \
}
-/*
- * Must match the init above
- */
-#define _0f "zmm0"
-#define _as "zmm14"
-#define _bs "zmm13"
-#define _ltmod "zmm12"
-#define _ltmul "zmm11"
-#define _ta "zmm10"
-#define _tb "zmm15"
-/*
- * Must be in the first 16, otherwise an EVEX pshufb is generated
- * Must match above
- */
-#define _asYlo "ymm14"
-#define _bsYlo "ymm13"
-#define _ltmodYlo "ymm12"
-#define _ltmulYlo "ymm11"
-#define _taYlo "ymm10"
-#define _tbYlo "ymm15"
+/* General multiplication by adding powers of two */
-/*
- * Must be in the first 16, otherwise an EVEX pshufb is generated
- * ...
- */
-#define _asYhi "ymm9"
-#define _bsYhi "ymm8"
-#define _ltmodYhi "ymm7"
-#define _ltmulYhi "ymm6"
-#define _taYhi "ymm5"
-#define _tbYhi "ymm4"
+#define _mul_x2_in 21, 22
+#define _mul_x2_acc 23, 24
-/*
- * This uses a pair of AVX2 pshufb to emulate the missing AVX512 pshufb.
- * AVX512BW has the full pshufb
- * To get VEX pshufb (AVX2, supported in KNL) instead of EVEX pshufb
- * (AVX512BW, not supported on KNL, probably also requiring AVX51VL
- * since we use a 256 bits version), all registers in parameters to
- * pshufb must be among ymm0-ymm15, since only EVEX can encore
- * ymm16-ymm31
- * This is a bit hackish, but short of encoding the instruction in
- * binary, how do we force the use of AVX2 pshufb ?
- * Note that the other way round (forcing AVX512) is easy, just encode
- * k0 as the mask register (k0 is all-1).
- */
-#define _MULx2(c, r...) \
+#define _MUL_PARAM(x, in, acc) \
{ \
- switch (REG_CNT(r)) { \
- case 2: \
- __asm( \
- "vmovq %[c0f], %%xmm0\n" \
- "vpbroadcastq %%xmm0, %%" _0f "\n" \
- /* upper bits */ \
- "vbroadcasti32x4 0x00(%[lt]), %%" _ltmod "\n" \
- "vbroadcasti32x4 0x10(%[lt]), %%" _ltmul "\n" \
- \
- "vpsrad $0x4, %%" VR0(r) ", %%"_as "\n" \
- "vpsrad $0x4, %%" VR1(r) ", %%"_bs "\n" \
- "vpandq %%" _0f ", %%" VR0(r) ", %%" VR0(r) "\n" \
- "vpandq %%" _0f ", %%" VR1(r) ", %%" VR1(r) "\n" \
- "vpandq %%" _0f ", %%" _as ", %%" _as "\n" \
- "vpandq %%" _0f ", %%" _bs ", %%" _bs "\n" \
- \
- "vextracti64x4 $1,%%" _ltmod ",%%" _ltmodYhi"\n" \
- \
- "vextracti64x4 $1,%%" _as ",%%" _asYhi"\n" \
- "vpshufb %%" _asYlo ", %%" _ltmodYlo ", %%" _taYlo "\n" \
- "vpshufb %%" _asYhi ", %%" _ltmodYhi ", %%" _taYhi "\n" \
- "vinserti64x4 $1,%%" _taYhi ",%%" _ta ",%%" _ta "\n" \
- \
- "vextracti64x4 $1,%%" _bs ",%%" _bsYhi"\n" \
- "vpshufb %%" _bsYlo ", %%" _ltmodYlo ", %%" _tbYlo "\n" \
- "vpshufb %%" _bsYhi ", %%" _ltmodYhi ", %%" _tbYhi "\n" \
- "vinserti64x4 $1,%%" _tbYhi ",%%" _tb ",%%" _tb "\n" \
- \
- "vextracti64x4 $1,%%" _ltmul ",%%" _ltmulYhi"\n" \
- \
- "vpshufb %%" _asYlo ", %%" _ltmulYlo ", %%" _asYlo "\n" \
- "vpshufb %%" _asYhi ", %%" _ltmulYhi ", %%" _asYhi "\n" \
- "vinserti64x4 $1,%%" _asYhi ",%%" _as ",%%" _as "\n" \
- \
- "vpshufb %%" _bsYlo ", %%" _ltmulYlo ", %%" _bsYlo "\n" \
- "vpshufb %%" _bsYhi ", %%" _ltmulYhi ", %%" _bsYhi "\n" \
- "vinserti64x4 $1,%%" _bsYhi ",%%" _bs ",%%" _bs "\n" \
- \
- /* lower bits */ \
- "vbroadcasti32x4 0x20(%[lt]), %%" _ltmod "\n" \
- "vbroadcasti32x4 0x30(%[lt]), %%" _ltmul "\n" \
- \
- "vpxorq %%" _ta ", %%" _as ", %%" _as "\n" \
- "vpxorq %%" _tb ", %%" _bs ", %%" _bs "\n" \
- \
- "vextracti64x4 $1,%%" _ltmod ",%%" _ltmodYhi"\n" \
- \
- "vextracti64x4 $0,%%" VR0(r) ",%%" "ymm1" "\n" \
- "vextracti64x4 $1,%%" VR0(r) ",%%" _asYhi"\n" \
- "vpshufb %%" "ymm1" ", %%" _ltmodYlo ", %%" _taYlo "\n" \
- "vpshufb %%" _asYhi ", %%" _ltmodYhi ", %%" _taYhi "\n" \
- "vinserti64x4 $1,%%" _taYhi ",%%" _ta ",%%" _ta "\n" \
- \
- "vextracti64x4 $0,%%" VR1(r) ",%%" "ymm2" "\n" \
- "vextracti64x4 $1,%%" VR1(r) ",%%" _bsYhi"\n" \
- "vpshufb %%" "ymm2" ", %%" _ltmodYlo ", %%" _tbYlo "\n" \
- "vpshufb %%" _bsYhi ", %%" _ltmodYhi ", %%" _tbYhi "\n" \
- "vinserti64x4 $1,%%" _tbYhi ",%%" _tb ",%%" _tb "\n" \
- \
- "vextracti64x4 $1,%%" _ltmul ",%%" _ltmulYhi"\n" \
- \
- "vpshufb %%" "ymm1" ", %%" _ltmulYlo ", %%" "ymm1" "\n" \
- "vpshufb %%" _asYhi ", %%" _ltmulYhi ", %%" _asYhi "\n" \
- "vinserti64x4 $1,%%" _asYhi ",%%" "zmm1" ",%%" VR0(r) "\n" \
- \
- "vpshufb %%" "ymm2" ", %%" _ltmulYlo ", %%" "ymm2" "\n" \
- "vpshufb %%" _bsYhi ", %%" _ltmulYhi ", %%" _bsYhi "\n" \
- "vinserti64x4 $1,%%" _bsYhi ",%%" "zmm2" ",%%" VR1(r) "\n" \
- \
- "vpxorq %%" _ta ", %%" VR0(r) ", %%" VR0(r) "\n" \
- "vpxorq %%" _as ", %%" VR0(r) ", %%" VR0(r) "\n" \
- "vpxorq %%" _tb ", %%" VR1(r) ", %%" VR1(r) "\n" \
- "vpxorq %%" _bs ", %%" VR1(r) ", %%" VR1(r) "\n" \
- : : [c0f] "r" (0x0f0f0f0f0f0f0f0f), \
- [lt] "r" (gf_clmul_mod_lt[4*(c)])); \
- break; \
- default: \
- ASM_BUG(); \
- } \
+ if (x & 0x01) { COPY(in, acc); } else { ZERO(acc); } \
+ if (x & 0xfe) { MUL2(in); } \
+ if (x & 0x02) { XOR(in, acc); } \
+ if (x & 0xfc) { MUL2(in); } \
+ if (x & 0x04) { XOR(in, acc); } \
+ if (x & 0xf8) { MUL2(in); } \
+ if (x & 0x08) { XOR(in, acc); } \
+ if (x & 0xf0) { MUL2(in); } \
+ if (x & 0x10) { XOR(in, acc); } \
+ if (x & 0xe0) { MUL2(in); } \
+ if (x & 0x20) { XOR(in, acc); } \
+ if (x & 0xc0) { MUL2(in); } \
+ if (x & 0x40) { XOR(in, acc); } \
+ if (x & 0x80) { MUL2(in); XOR(in, acc); } \
}
-#define MUL(c, r...) \
+#define MUL_x2_DEFINE(x) \
+static void \
+mul_x2_ ## x(void) { _MUL_PARAM(x, _mul_x2_in, _mul_x2_acc); }
+
+
+MUL_x2_DEFINE(0); MUL_x2_DEFINE(1); MUL_x2_DEFINE(2); MUL_x2_DEFINE(3);
+MUL_x2_DEFINE(4); MUL_x2_DEFINE(5); MUL_x2_DEFINE(6); MUL_x2_DEFINE(7);
+MUL_x2_DEFINE(8); MUL_x2_DEFINE(9); MUL_x2_DEFINE(10); MUL_x2_DEFINE(11);
+MUL_x2_DEFINE(12); MUL_x2_DEFINE(13); MUL_x2_DEFINE(14); MUL_x2_DEFINE(15);
+MUL_x2_DEFINE(16); MUL_x2_DEFINE(17); MUL_x2_DEFINE(18); MUL_x2_DEFINE(19);
+MUL_x2_DEFINE(20); MUL_x2_DEFINE(21); MUL_x2_DEFINE(22); MUL_x2_DEFINE(23);
+MUL_x2_DEFINE(24); MUL_x2_DEFINE(25); MUL_x2_DEFINE(26); MUL_x2_DEFINE(27);
+MUL_x2_DEFINE(28); MUL_x2_DEFINE(29); MUL_x2_DEFINE(30); MUL_x2_DEFINE(31);
+MUL_x2_DEFINE(32); MUL_x2_DEFINE(33); MUL_x2_DEFINE(34); MUL_x2_DEFINE(35);
+MUL_x2_DEFINE(36); MUL_x2_DEFINE(37); MUL_x2_DEFINE(38); MUL_x2_DEFINE(39);
+MUL_x2_DEFINE(40); MUL_x2_DEFINE(41); MUL_x2_DEFINE(42); MUL_x2_DEFINE(43);
+MUL_x2_DEFINE(44); MUL_x2_DEFINE(45); MUL_x2_DEFINE(46); MUL_x2_DEFINE(47);
+MUL_x2_DEFINE(48); MUL_x2_DEFINE(49); MUL_x2_DEFINE(50); MUL_x2_DEFINE(51);
+MUL_x2_DEFINE(52); MUL_x2_DEFINE(53); MUL_x2_DEFINE(54); MUL_x2_DEFINE(55);
+MUL_x2_DEFINE(56); MUL_x2_DEFINE(57); MUL_x2_DEFINE(58); MUL_x2_DEFINE(59);
+MUL_x2_DEFINE(60); MUL_x2_DEFINE(61); MUL_x2_DEFINE(62); MUL_x2_DEFINE(63);
+MUL_x2_DEFINE(64); MUL_x2_DEFINE(65); MUL_x2_DEFINE(66); MUL_x2_DEFINE(67);
+MUL_x2_DEFINE(68); MUL_x2_DEFINE(69); MUL_x2_DEFINE(70); MUL_x2_DEFINE(71);
+MUL_x2_DEFINE(72); MUL_x2_DEFINE(73); MUL_x2_DEFINE(74); MUL_x2_DEFINE(75);
+MUL_x2_DEFINE(76); MUL_x2_DEFINE(77); MUL_x2_DEFINE(78); MUL_x2_DEFINE(79);
+MUL_x2_DEFINE(80); MUL_x2_DEFINE(81); MUL_x2_DEFINE(82); MUL_x2_DEFINE(83);
+MUL_x2_DEFINE(84); MUL_x2_DEFINE(85); MUL_x2_DEFINE(86); MUL_x2_DEFINE(87);
+MUL_x2_DEFINE(88); MUL_x2_DEFINE(89); MUL_x2_DEFINE(90); MUL_x2_DEFINE(91);
+MUL_x2_DEFINE(92); MUL_x2_DEFINE(93); MUL_x2_DEFINE(94); MUL_x2_DEFINE(95);
+MUL_x2_DEFINE(96); MUL_x2_DEFINE(97); MUL_x2_DEFINE(98); MUL_x2_DEFINE(99);
+MUL_x2_DEFINE(100); MUL_x2_DEFINE(101); MUL_x2_DEFINE(102); MUL_x2_DEFINE(103);
+MUL_x2_DEFINE(104); MUL_x2_DEFINE(105); MUL_x2_DEFINE(106); MUL_x2_DEFINE(107);
+MUL_x2_DEFINE(108); MUL_x2_DEFINE(109); MUL_x2_DEFINE(110); MUL_x2_DEFINE(111);
+MUL_x2_DEFINE(112); MUL_x2_DEFINE(113); MUL_x2_DEFINE(114); MUL_x2_DEFINE(115);
+MUL_x2_DEFINE(116); MUL_x2_DEFINE(117); MUL_x2_DEFINE(118); MUL_x2_DEFINE(119);
+MUL_x2_DEFINE(120); MUL_x2_DEFINE(121); MUL_x2_DEFINE(122); MUL_x2_DEFINE(123);
+MUL_x2_DEFINE(124); MUL_x2_DEFINE(125); MUL_x2_DEFINE(126); MUL_x2_DEFINE(127);
+MUL_x2_DEFINE(128); MUL_x2_DEFINE(129); MUL_x2_DEFINE(130); MUL_x2_DEFINE(131);
+MUL_x2_DEFINE(132); MUL_x2_DEFINE(133); MUL_x2_DEFINE(134); MUL_x2_DEFINE(135);
+MUL_x2_DEFINE(136); MUL_x2_DEFINE(137); MUL_x2_DEFINE(138); MUL_x2_DEFINE(139);
+MUL_x2_DEFINE(140); MUL_x2_DEFINE(141); MUL_x2_DEFINE(142); MUL_x2_DEFINE(143);
+MUL_x2_DEFINE(144); MUL_x2_DEFINE(145); MUL_x2_DEFINE(146); MUL_x2_DEFINE(147);
+MUL_x2_DEFINE(148); MUL_x2_DEFINE(149); MUL_x2_DEFINE(150); MUL_x2_DEFINE(151);
+MUL_x2_DEFINE(152); MUL_x2_DEFINE(153); MUL_x2_DEFINE(154); MUL_x2_DEFINE(155);
+MUL_x2_DEFINE(156); MUL_x2_DEFINE(157); MUL_x2_DEFINE(158); MUL_x2_DEFINE(159);
+MUL_x2_DEFINE(160); MUL_x2_DEFINE(161); MUL_x2_DEFINE(162); MUL_x2_DEFINE(163);
+MUL_x2_DEFINE(164); MUL_x2_DEFINE(165); MUL_x2_DEFINE(166); MUL_x2_DEFINE(167);
+MUL_x2_DEFINE(168); MUL_x2_DEFINE(169); MUL_x2_DEFINE(170); MUL_x2_DEFINE(171);
+MUL_x2_DEFINE(172); MUL_x2_DEFINE(173); MUL_x2_DEFINE(174); MUL_x2_DEFINE(175);
+MUL_x2_DEFINE(176); MUL_x2_DEFINE(177); MUL_x2_DEFINE(178); MUL_x2_DEFINE(179);
+MUL_x2_DEFINE(180); MUL_x2_DEFINE(181); MUL_x2_DEFINE(182); MUL_x2_DEFINE(183);
+MUL_x2_DEFINE(184); MUL_x2_DEFINE(185); MUL_x2_DEFINE(186); MUL_x2_DEFINE(187);
+MUL_x2_DEFINE(188); MUL_x2_DEFINE(189); MUL_x2_DEFINE(190); MUL_x2_DEFINE(191);
+MUL_x2_DEFINE(192); MUL_x2_DEFINE(193); MUL_x2_DEFINE(194); MUL_x2_DEFINE(195);
+MUL_x2_DEFINE(196); MUL_x2_DEFINE(197); MUL_x2_DEFINE(198); MUL_x2_DEFINE(199);
+MUL_x2_DEFINE(200); MUL_x2_DEFINE(201); MUL_x2_DEFINE(202); MUL_x2_DEFINE(203);
+MUL_x2_DEFINE(204); MUL_x2_DEFINE(205); MUL_x2_DEFINE(206); MUL_x2_DEFINE(207);
+MUL_x2_DEFINE(208); MUL_x2_DEFINE(209); MUL_x2_DEFINE(210); MUL_x2_DEFINE(211);
+MUL_x2_DEFINE(212); MUL_x2_DEFINE(213); MUL_x2_DEFINE(214); MUL_x2_DEFINE(215);
+MUL_x2_DEFINE(216); MUL_x2_DEFINE(217); MUL_x2_DEFINE(218); MUL_x2_DEFINE(219);
+MUL_x2_DEFINE(220); MUL_x2_DEFINE(221); MUL_x2_DEFINE(222); MUL_x2_DEFINE(223);
+MUL_x2_DEFINE(224); MUL_x2_DEFINE(225); MUL_x2_DEFINE(226); MUL_x2_DEFINE(227);
+MUL_x2_DEFINE(228); MUL_x2_DEFINE(229); MUL_x2_DEFINE(230); MUL_x2_DEFINE(231);
+MUL_x2_DEFINE(232); MUL_x2_DEFINE(233); MUL_x2_DEFINE(234); MUL_x2_DEFINE(235);
+MUL_x2_DEFINE(236); MUL_x2_DEFINE(237); MUL_x2_DEFINE(238); MUL_x2_DEFINE(239);
+MUL_x2_DEFINE(240); MUL_x2_DEFINE(241); MUL_x2_DEFINE(242); MUL_x2_DEFINE(243);
+MUL_x2_DEFINE(244); MUL_x2_DEFINE(245); MUL_x2_DEFINE(246); MUL_x2_DEFINE(247);
+MUL_x2_DEFINE(248); MUL_x2_DEFINE(249); MUL_x2_DEFINE(250); MUL_x2_DEFINE(251);
+MUL_x2_DEFINE(252); MUL_x2_DEFINE(253); MUL_x2_DEFINE(254); MUL_x2_DEFINE(255);
+
+
+typedef void (*mul_fn_ptr_t)(void);
+
+static const mul_fn_ptr_t __attribute__((aligned(256)))
+gf_x2_mul_fns[256] = {
+ mul_x2_0, mul_x2_1, mul_x2_2, mul_x2_3, mul_x2_4, mul_x2_5,
+ mul_x2_6, mul_x2_7, mul_x2_8, mul_x2_9, mul_x2_10, mul_x2_11,
+ mul_x2_12, mul_x2_13, mul_x2_14, mul_x2_15, mul_x2_16, mul_x2_17,
+ mul_x2_18, mul_x2_19, mul_x2_20, mul_x2_21, mul_x2_22, mul_x2_23,
+ mul_x2_24, mul_x2_25, mul_x2_26, mul_x2_27, mul_x2_28, mul_x2_29,
+ mul_x2_30, mul_x2_31, mul_x2_32, mul_x2_33, mul_x2_34, mul_x2_35,
+ mul_x2_36, mul_x2_37, mul_x2_38, mul_x2_39, mul_x2_40, mul_x2_41,
+ mul_x2_42, mul_x2_43, mul_x2_44, mul_x2_45, mul_x2_46, mul_x2_47,
+ mul_x2_48, mul_x2_49, mul_x2_50, mul_x2_51, mul_x2_52, mul_x2_53,
+ mul_x2_54, mul_x2_55, mul_x2_56, mul_x2_57, mul_x2_58, mul_x2_59,
+ mul_x2_60, mul_x2_61, mul_x2_62, mul_x2_63, mul_x2_64, mul_x2_65,
+ mul_x2_66, mul_x2_67, mul_x2_68, mul_x2_69, mul_x2_70, mul_x2_71,
+ mul_x2_72, mul_x2_73, mul_x2_74, mul_x2_75, mul_x2_76, mul_x2_77,
+ mul_x2_78, mul_x2_79, mul_x2_80, mul_x2_81, mul_x2_82, mul_x2_83,
+ mul_x2_84, mul_x2_85, mul_x2_86, mul_x2_87, mul_x2_88, mul_x2_89,
+ mul_x2_90, mul_x2_91, mul_x2_92, mul_x2_93, mul_x2_94, mul_x2_95,
+ mul_x2_96, mul_x2_97, mul_x2_98, mul_x2_99, mul_x2_100, mul_x2_101,
+ mul_x2_102, mul_x2_103, mul_x2_104, mul_x2_105, mul_x2_106, mul_x2_107,
+ mul_x2_108, mul_x2_109, mul_x2_110, mul_x2_111, mul_x2_112, mul_x2_113,
+ mul_x2_114, mul_x2_115, mul_x2_116, mul_x2_117, mul_x2_118, mul_x2_119,
+ mul_x2_120, mul_x2_121, mul_x2_122, mul_x2_123, mul_x2_124, mul_x2_125,
+ mul_x2_126, mul_x2_127, mul_x2_128, mul_x2_129, mul_x2_130, mul_x2_131,
+ mul_x2_132, mul_x2_133, mul_x2_134, mul_x2_135, mul_x2_136, mul_x2_137,
+ mul_x2_138, mul_x2_139, mul_x2_140, mul_x2_141, mul_x2_142, mul_x2_143,
+ mul_x2_144, mul_x2_145, mul_x2_146, mul_x2_147, mul_x2_148, mul_x2_149,
+ mul_x2_150, mul_x2_151, mul_x2_152, mul_x2_153, mul_x2_154, mul_x2_155,
+ mul_x2_156, mul_x2_157, mul_x2_158, mul_x2_159, mul_x2_160, mul_x2_161,
+ mul_x2_162, mul_x2_163, mul_x2_164, mul_x2_165, mul_x2_166, mul_x2_167,
+ mul_x2_168, mul_x2_169, mul_x2_170, mul_x2_171, mul_x2_172, mul_x2_173,
+ mul_x2_174, mul_x2_175, mul_x2_176, mul_x2_177, mul_x2_178, mul_x2_179,
+ mul_x2_180, mul_x2_181, mul_x2_182, mul_x2_183, mul_x2_184, mul_x2_185,
+ mul_x2_186, mul_x2_187, mul_x2_188, mul_x2_189, mul_x2_190, mul_x2_191,
+ mul_x2_192, mul_x2_193, mul_x2_194, mul_x2_195, mul_x2_196, mul_x2_197,
+ mul_x2_198, mul_x2_199, mul_x2_200, mul_x2_201, mul_x2_202, mul_x2_203,
+ mul_x2_204, mul_x2_205, mul_x2_206, mul_x2_207, mul_x2_208, mul_x2_209,
+ mul_x2_210, mul_x2_211, mul_x2_212, mul_x2_213, mul_x2_214, mul_x2_215,
+ mul_x2_216, mul_x2_217, mul_x2_218, mul_x2_219, mul_x2_220, mul_x2_221,
+ mul_x2_222, mul_x2_223, mul_x2_224, mul_x2_225, mul_x2_226, mul_x2_227,
+ mul_x2_228, mul_x2_229, mul_x2_230, mul_x2_231, mul_x2_232, mul_x2_233,
+ mul_x2_234, mul_x2_235, mul_x2_236, mul_x2_237, mul_x2_238, mul_x2_239,
+ mul_x2_240, mul_x2_241, mul_x2_242, mul_x2_243, mul_x2_244, mul_x2_245,
+ mul_x2_246, mul_x2_247, mul_x2_248, mul_x2_249, mul_x2_250, mul_x2_251,
+ mul_x2_252, mul_x2_253, mul_x2_254, mul_x2_255
+};
+
+#define MUL(c, r...) \
{ \
switch (REG_CNT(r)) { \
case 4: \
- _MULx2(c, R_01(r)); \
- _MULx2(c, R_23(r)); \
- break; \
- case 2: \
- _MULx2(c, R_01(r)); \
- break; \
- default: \
- ASM_BUG(); \
+ COPY(R_01(r), _mul_x2_in); \
+ gf_x2_mul_fns[c](); \
+ COPY(_mul_x2_acc, R_01(r)); \
+ COPY(R_23(r), _mul_x2_in); \
+ gf_x2_mul_fns[c](); \
+ COPY(_mul_x2_acc, R_23(r)); \
} \
}
+
#define raidz_math_begin() kfpu_begin()
-#define raidz_math_end() \
-{ \
- FLUSH(); \
- kfpu_end(); \
-}
+#define raidz_math_end() kfpu_end()
+
+
+#define SYN_STRIDE 4
+
+#define ZERO_STRIDE 4
+#define ZERO_DEFINE() {}
+#define ZERO_D 0, 1, 2, 3
+
+#define COPY_STRIDE 4
+#define COPY_DEFINE() {}
+#define COPY_D 0, 1, 2, 3
+
+#define ADD_STRIDE 4
+#define ADD_DEFINE() {}
+#define ADD_D 0, 1, 2, 3
+
+#define MUL_STRIDE 4
+#define MUL_DEFINE() MUL2_SETUP()
+#define MUL_D 0, 1, 2, 3
-/*
- * This use zmm16-zmm31 registers to free up zmm0-zmm15
- * to use with the AVX2 pshufb, see above
- */
-#define GEN_P_DEFINE() {}
#define GEN_P_STRIDE 4
-#define GEN_P_P 20, 21, 22, 23
+#define GEN_P_DEFINE() {}
+#define GEN_P_P 0, 1, 2, 3
-#define GEN_PQ_DEFINE() {}
#define GEN_PQ_STRIDE 4
-#define GEN_PQ_D 20, 21, 22, 23
-#define GEN_PQ_P 24, 25, 26, 27
-#define GEN_PQ_Q 28, 29, 3, 4
+#define GEN_PQ_DEFINE() {}
+#define GEN_PQ_D 0, 1, 2, 3
+#define GEN_PQ_C 4, 5, 6, 7
+#define GEN_PQR_STRIDE 4
#define GEN_PQR_DEFINE() {}
-#define GEN_PQR_STRIDE 2
-#define GEN_PQR_D 20, 21
-#define GEN_PQR_P 22, 23
-#define GEN_PQR_Q 24, 25
-#define GEN_PQR_R 26, 27
-
-#define REC_P_DEFINE() {}
-#define REC_P_STRIDE 4
-#define REC_P_X 20, 21, 22, 23
-
-#define REC_Q_DEFINE() {}
-#define REC_Q_STRIDE 4
-#define REC_Q_X 20, 21, 22, 23
-
-#define REC_R_DEFINE() {}
-#define REC_R_STRIDE 4
-#define REC_R_X 20, 21, 22, 23
-
-#define REC_PQ_DEFINE() {}
-#define REC_PQ_STRIDE 2
-#define REC_PQ_X 20, 21
-#define REC_PQ_Y 22, 23
-#define REC_PQ_D 24, 25
-
-#define REC_PR_DEFINE() {}
-#define REC_PR_STRIDE 2
-#define REC_PR_X 20, 21
-#define REC_PR_Y 22, 23
-#define REC_PR_D 24, 25
-
-#define REC_QR_DEFINE() {}
-#define REC_QR_STRIDE 2
-#define REC_QR_X 20, 21
-#define REC_QR_Y 22, 23
-#define REC_QR_D 24, 25
-
-#define REC_PQR_DEFINE() {}
-#define REC_PQR_STRIDE 2
-#define REC_PQR_X 20, 21
-#define REC_PQR_Y 22, 23
-#define REC_PQR_Z 24, 25
-#define REC_PQR_D 26, 27
-#define REC_PQR_XS 26, 27
-#define REC_PQR_YS 28, 29
+#define GEN_PQR_D 0, 1, 2, 3
+#define GEN_PQR_C 4, 5, 6, 7
+
+#define SYN_Q_DEFINE() {}
+#define SYN_Q_D 0, 1, 2, 3
+#define SYN_Q_X 4, 5, 6, 7
+
+#define SYN_R_DEFINE() {}
+#define SYN_R_D 0, 1, 2, 3
+#define SYN_R_X 4, 5, 6, 7
+
+#define SYN_PQ_DEFINE() {}
+#define SYN_PQ_D 0, 1, 2, 3
+#define SYN_PQ_X 4, 5, 6, 7
+
+#define REC_PQ_STRIDE 4
+#define REC_PQ_DEFINE() MUL2_SETUP()
+#define REC_PQ_X 0, 1, 2, 3
+#define REC_PQ_Y 4, 5, 6, 7
+#define REC_PQ_T 8, 9, 10, 11
+
+#define SYN_PR_DEFINE() {}
+#define SYN_PR_D 0, 1, 2, 3
+#define SYN_PR_X 4, 5, 6, 7
+
+#define REC_PR_STRIDE 4
+#define REC_PR_DEFINE() MUL2_SETUP()
+#define REC_PR_X 0, 1, 2, 3
+#define REC_PR_Y 4, 5, 6, 7
+#define REC_PR_T 8, 9, 10, 11
+
+#define SYN_QR_DEFINE() {}
+#define SYN_QR_D 0, 1, 2, 3
+#define SYN_QR_X 4, 5, 6, 7
+
+#define REC_QR_STRIDE 4
+#define REC_QR_DEFINE() MUL2_SETUP()
+#define REC_QR_X 0, 1, 2, 3
+#define REC_QR_Y 4, 5, 6, 7
+#define REC_QR_T 8, 9, 10, 11
+
+#define SYN_PQR_DEFINE() {}
+#define SYN_PQR_D 0, 1, 2, 3
+#define SYN_PQR_X 4, 5, 6, 7
+
+#define REC_PQR_STRIDE 4
+#define REC_PQR_DEFINE() MUL2_SETUP()
+#define REC_PQR_X 0, 1, 2, 3
+#define REC_PQR_Y 4, 5, 6, 7
+#define REC_PQR_Z 8, 9, 10, 11
+#define REC_PQR_XS 12, 13, 14, 15
+#define REC_PQR_YS 16, 17, 18, 19
#include <sys/vdev_raidz_impl.h>
@@ -508,6 +471,7 @@ static boolean_t
raidz_will_avx512f_work(void)
{
return (zfs_avx_available() &&
+ zfs_avx2_available() &&
zfs_avx512f_available());
}
diff --git a/module/zfs/vdev_raidz_math_impl.h b/module/zfs/vdev_raidz_math_impl.h
index 70257ee49..171380524 100644
--- a/module/zfs/vdev_raidz_math_impl.h
+++ b/module/zfs/vdev_raidz_math_impl.h
@@ -32,250 +32,14 @@
#define noinline __attribute__((noinline))
#endif
-/* Calculate data offset in raidz column, offset is in bytes */
-#define COL_OFF(col, off) ((v_t *)(((char *)(col)->rc_data) + (off)))
-
-/*
- * PARITY CALCULATION
- * An optimized function is called for a full length of data columns
- * If RAIDZ map contains remainder columns (shorter columns) the same function
- * is called for reminder of full columns.
- *
- * GEN_[P|PQ|PQR]_BLOCK() functions are designed to be efficiently in-lined by
- * the compiler. This removes a lot of conditionals from the inside loop which
- * makes the code faster, especially for vectorized code.
- * They are also highly parametrized, allowing for each implementation to define
- * most optimal stride, and register allocation.
- */
-
-static raidz_inline void
-GEN_P_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end,
- const int ncols)
-{
- int c;
- size_t ioff;
- raidz_col_t * const pcol = raidz_col_p(rm, CODE_P);
- raidz_col_t *col;
-
- GEN_P_DEFINE();
-
- for (ioff = off; ioff < end; ioff += (GEN_P_STRIDE * sizeof (v_t))) {
- LOAD(COL_OFF(&(rm->rm_col[1]), ioff), GEN_P_P);
-
- for (c = 2; c < ncols; c++) {
- col = &rm->rm_col[c];
- XOR_ACC(COL_OFF(col, ioff), GEN_P_P);
- }
-
- STORE(COL_OFF(pcol, ioff), GEN_P_P);
- }
-}
-
-/*
- * Generate P parity (RAIDZ1)
- *
- * @rm RAIDZ map
- */
-static raidz_inline void
-raidz_generate_p_impl(raidz_map_t * const rm)
-{
- const int ncols = raidz_ncols(rm);
- const size_t psize = raidz_big_size(rm);
- const size_t short_size = raidz_short_size(rm);
-
- raidz_math_begin();
-
- /* short_size */
- GEN_P_BLOCK(rm, 0, short_size, ncols);
-
- /* fullcols */
- GEN_P_BLOCK(rm, short_size, psize, raidz_nbigcols(rm));
-
- raidz_math_end();
-}
-
-static raidz_inline void
-GEN_PQ_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end,
- const int ncols, const int nbigcols)
-{
- int c;
- size_t ioff;
- raidz_col_t * const pcol = raidz_col_p(rm, CODE_P);
- raidz_col_t * const qcol = raidz_col_p(rm, CODE_Q);
- raidz_col_t *col;
-
- GEN_PQ_DEFINE();
-
- MUL2_SETUP();
-
- for (ioff = off; ioff < end; ioff += (GEN_PQ_STRIDE * sizeof (v_t))) {
- LOAD(COL_OFF(&rm->rm_col[2], ioff), GEN_PQ_P);
- COPY(GEN_PQ_P, GEN_PQ_Q);
-
- for (c = 3; c < nbigcols; c++) {
- col = &rm->rm_col[c];
- LOAD(COL_OFF(col, ioff), GEN_PQ_D);
- MUL2(GEN_PQ_Q);
- XOR(GEN_PQ_D, GEN_PQ_P);
- XOR(GEN_PQ_D, GEN_PQ_Q);
- }
-
- STORE(COL_OFF(pcol, ioff), GEN_PQ_P);
-
- for (; c < ncols; c++)
- MUL2(GEN_PQ_Q);
-
- STORE(COL_OFF(qcol, ioff), GEN_PQ_Q);
- }
-}
-
-/*
- * Generate PQ parity (RAIDZ2)
- *
- * @rm RAIDZ map
- */
-static raidz_inline void
-raidz_generate_pq_impl(raidz_map_t * const rm)
-{
- const int ncols = raidz_ncols(rm);
- const size_t psize = raidz_big_size(rm);
- const size_t short_size = raidz_short_size(rm);
-
- raidz_math_begin();
-
- /* short_size */
- GEN_PQ_BLOCK(rm, 0, short_size, ncols, ncols);
-
- /* fullcols */
- GEN_PQ_BLOCK(rm, short_size, psize, ncols, raidz_nbigcols(rm));
-
- raidz_math_end();
-}
-
-
-static raidz_inline void
-GEN_PQR_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end,
- const int ncols, const int nbigcols)
-{
- int c;
- size_t ioff;
- raidz_col_t *col;
- raidz_col_t * const pcol = raidz_col_p(rm, CODE_P);
- raidz_col_t * const qcol = raidz_col_p(rm, CODE_Q);
- raidz_col_t * const rcol = raidz_col_p(rm, CODE_R);
-
- GEN_PQR_DEFINE();
-
- MUL2_SETUP();
-
- for (ioff = off; ioff < end; ioff += (GEN_PQR_STRIDE * sizeof (v_t))) {
- LOAD(COL_OFF(&rm->rm_col[3], ioff), GEN_PQR_P);
- COPY(GEN_PQR_P, GEN_PQR_Q);
- COPY(GEN_PQR_P, GEN_PQR_R);
-
- for (c = 4; c < nbigcols; c++) {
- col = &rm->rm_col[c];
- LOAD(COL_OFF(col, ioff), GEN_PQR_D);
- MUL2(GEN_PQR_Q);
- MUL4(GEN_PQR_R);
- XOR(GEN_PQR_D, GEN_PQR_P);
- XOR(GEN_PQR_D, GEN_PQR_Q);
- XOR(GEN_PQR_D, GEN_PQR_R);
- }
-
- STORE(COL_OFF(pcol, ioff), GEN_PQR_P);
-
- for (; c < ncols; c++) {
- MUL2(GEN_PQR_Q);
- MUL4(GEN_PQR_R);
- }
-
- STORE(COL_OFF(qcol, ioff), GEN_PQR_Q);
- STORE(COL_OFF(rcol, ioff), GEN_PQR_R);
- }
-}
-
-
-/*
- * Generate PQR parity (RAIDZ3)
- *
- * @rm RAIDZ map
- */
-static raidz_inline void
-raidz_generate_pqr_impl(raidz_map_t * const rm)
-{
- const int ncols = raidz_ncols(rm);
- const size_t psize = raidz_big_size(rm);
- const size_t short_size = raidz_short_size(rm);
-
- raidz_math_begin();
-
- /* short_size */
- GEN_PQR_BLOCK(rm, 0, short_size, ncols, ncols);
-
- /* fullcols */
- GEN_PQR_BLOCK(rm, short_size, psize, ncols, raidz_nbigcols(rm));
-
- raidz_math_end();
-}
-
-/*
- * DATA RECONSTRUCTION
- *
- * Data reconstruction process consists of two phases:
- * - Syndrome calculation
- * - Data reconstruction
- *
- * Syndrome is calculated by generating parity using available data columns
- * and zeros in places of erasure. Existing parity is added to corresponding
- * syndrome value to obtain the [P|Q|R]syn values from equation:
- * P = Psyn + Dx + Dy + Dz
- * Q = Qsyn + 2^x * Dx + 2^y * Dy + 2^z * Dz
- * R = Rsyn + 4^x * Dx + 4^y * Dy + 4^z * Dz
- *
- * For data reconstruction phase, the corresponding equations are solved
- * for missing data (Dx, Dy, Dz). This generally involves multiplying known
- * symbols by an coefficient and adding them together. The multiplication
- * constant coefficients are calculated ahead of the operation in
- * raidz_rec_[q|r|pq|pq|qr|pqr]_coeff() functions.
- *
- * IMPLEMENTATION NOTE: RAID-Z block can have complex geometry, with "big"
- * and "short" columns.
- * For this reason, reconstruction is performed in minimum of
- * two steps. First, from offset 0 to short_size, then from short_size to
- * short_size. Calculation functions REC_[*]_BLOCK() are implemented to work
- * over both ranges. The split also enables removal of conditional expressions
- * from loop bodies, improving throughput of SIMD implementations.
- * For the best performance, all functions marked with raidz_inline attribute
- * must be inlined by compiler.
- *
- * parity data
- * columns columns
- * <----------> <------------------>
- * x y <----+ missing columns (x, y)
- * | |
- * +---+---+---+---+-v-+---+-v-+---+ ^ 0
- * | | | | | | | | | |
- * | | | | | | | | | |
- * | P | Q | R | D | D | D | D | D | |
- * | | | | 0 | 1 | 2 | 3 | 4 | |
- * | | | | | | | | | v
- * | | | | | +---+---+---+ ^ short_size
- * | | | | | | |
- * +---+---+---+---+---+ v big_size
- * <------------------> <---------->
- * big columns short columns
- *
- */
-
/*
* Functions calculate multiplication constants for data reconstruction.
* Coefficients depend on RAIDZ geometry, indexes of failed child vdevs, and
* used parity columns for reconstruction.
* @rm RAIDZ map
* @tgtidx array of missing data indexes
- * @coeff output array of coefficients. Array must be user
- * provided and must hold minimum MUL_CNT values
+ * @coeff output array of coefficients. Array must be provided by
+ * user and must hold minimum MUL_CNT values.
*/
static noinline void
raidz_rec_q_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
@@ -383,240 +147,602 @@ raidz_rec_pqr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
coeff[MUL_PQR_YQ] = yd;
}
+/*
+ * Method for zeroing a buffer (can be implemented using SIMD).
+ * This method is used by multiple for gen/rec functions.
+ *
+ * @dc Destination buffer
+ * @dsize Destination buffer size
+ * @private Unused
+ */
+static int
+raidz_zero_abd_cb(void *dc, size_t dsize, void *private)
+{
+ v_t *dst = (v_t *) dc;
+ size_t i;
+
+ ZERO_DEFINE();
+
+ (void) private; /* unused */
+
+ ZERO(ZERO_D);
+
+ for (i = 0; i < dsize / sizeof (v_t); i += (2 * ZERO_STRIDE)) {
+ STORE(dst + i, ZERO_D);
+ STORE(dst + i + ZERO_STRIDE, ZERO_D);
+ }
+
+ return (0);
+}
+
+#define raidz_zero(dabd, size) \
+{ \
+ abd_iterate_func(dabd, 0, size, raidz_zero_abd_cb, NULL); \
+}
/*
- * Reconstruction using P parity
- * @rm RAIDZ map
- * @off starting offset
- * @end ending offset
- * @x missing data column
- * @ncols number of column
+ * Method for copying two buffers (can be implemented using SIMD).
+ * This method is used by multiple for gen/rec functions.
+ *
+ * @dc Destination buffer
+ * @sc Source buffer
+ * @dsize Destination buffer size
+ * @ssize Source buffer size
+ * @private Unused
*/
-static raidz_inline void
-REC_P_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end,
- const int x, const int ncols)
+static int
+raidz_copy_abd_cb(void *dc, void *sc, size_t size, void *private)
{
- int c;
- size_t ioff;
- const size_t firstdc = raidz_parity(rm);
- raidz_col_t * const pcol = raidz_col_p(rm, CODE_P);
- raidz_col_t * const xcol = raidz_col_p(rm, x);
- raidz_col_t *col;
+ v_t *dst = (v_t *) dc;
+ const v_t *src = (v_t *) sc;
+ size_t i;
- REC_P_DEFINE();
+ COPY_DEFINE();
- for (ioff = off; ioff < end; ioff += (REC_P_STRIDE * sizeof (v_t))) {
- LOAD(COL_OFF(pcol, ioff), REC_P_X);
+ (void) private; /* unused */
- for (c = firstdc; c < x; c++) {
- col = &rm->rm_col[c];
- XOR_ACC(COL_OFF(col, ioff), REC_P_X);
- }
+ for (i = 0; i < size / sizeof (v_t); i += (2 * COPY_STRIDE)) {
+ LOAD(src + i, COPY_D);
+ STORE(dst + i, COPY_D);
+
+ LOAD(src + i + COPY_STRIDE, COPY_D);
+ STORE(dst + i + COPY_STRIDE, COPY_D);
+ }
+
+ return (0);
+}
- for (c++; c < ncols; c++) {
- col = &rm->rm_col[c];
- XOR_ACC(COL_OFF(col, ioff), REC_P_X);
- }
- STORE(COL_OFF(xcol, ioff), REC_P_X);
+#define raidz_copy(dabd, sabd, size) \
+{ \
+ abd_iterate_func2(dabd, sabd, 0, 0, size, raidz_copy_abd_cb, NULL);\
+}
+
+/*
+ * Method for adding (XORing) two buffers.
+ * Source and destination are XORed together and result is stored in
+ * destination buffer. This method is used by multiple for gen/rec functions.
+ *
+ * @dc Destination buffer
+ * @sc Source buffer
+ * @dsize Destination buffer size
+ * @ssize Source buffer size
+ * @private Unused
+ */
+static int
+raidz_add_abd_cb(void *dc, void *sc, size_t size, void *private)
+{
+ v_t *dst = (v_t *) dc;
+ const v_t *src = (v_t *) sc;
+ size_t i;
+
+ ADD_DEFINE();
+
+ (void) private; /* unused */
+
+ for (i = 0; i < size / sizeof (v_t); i += (2 * ADD_STRIDE)) {
+ LOAD(dst + i, ADD_D);
+ XOR_ACC(src + i, ADD_D);
+ STORE(dst + i, ADD_D);
+
+ LOAD(dst + i + ADD_STRIDE, ADD_D);
+ XOR_ACC(src + i + ADD_STRIDE, ADD_D);
+ STORE(dst + i + ADD_STRIDE, ADD_D);
}
+
+ return (0);
+}
+
+#define raidz_add(dabd, sabd, size) \
+{ \
+ abd_iterate_func2(dabd, sabd, 0, 0, size, raidz_add_abd_cb, NULL);\
}
/*
- * Reconstruct single data column using P parity
- * @rec_method REC_P_BLOCK()
+ * Method for multiplying a buffer with a constant in GF(2^8).
+ * Symbols from buffer are multiplied by a constant and result is stored
+ * back in the same buffer.
*
- * @rm RAIDZ map
- * @tgtidx array of missing data indexes
+ * @dc In/Out data buffer.
+ * @size Size of the buffer
+ * @private pointer to the multiplication constant (unsigned)
*/
-static raidz_inline int
-raidz_reconstruct_p_impl(raidz_map_t *rm, const int *tgtidx)
+static int
+raidz_mul_abd_cb(void *dc, size_t size, void *private)
+{
+ const unsigned mul = *((unsigned *) private);
+ v_t *d = (v_t *) dc;
+ size_t i;
+
+ MUL_DEFINE();
+
+ for (i = 0; i < size / sizeof (v_t); i += (2 * MUL_STRIDE)) {
+ LOAD(d + i, MUL_D);
+ MUL(mul, MUL_D);
+ STORE(d + i, MUL_D);
+
+ LOAD(d + i + MUL_STRIDE, MUL_D);
+ MUL(mul, MUL_D);
+ STORE(d + i + MUL_STRIDE, MUL_D);
+ }
+
+ return (0);
+}
+
+
+/*
+ * Syndrome generation/update macros
+ *
+ * Require LOAD(), XOR(), STORE(), MUL2(), and MUL4() macros
+ */
+#define P_D_SYNDROME(D, T, t) \
+{ \
+ LOAD((t), T); \
+ XOR(D, T); \
+ STORE((t), T); \
+}
+
+#define Q_D_SYNDROME(D, T, t) \
+{ \
+ LOAD((t), T); \
+ MUL2(T); \
+ XOR(D, T); \
+ STORE((t), T); \
+}
+
+#define Q_SYNDROME(T, t) \
+{ \
+ LOAD((t), T); \
+ MUL2(T); \
+ STORE((t), T); \
+}
+
+#define R_D_SYNDROME(D, T, t) \
+{ \
+ LOAD((t), T); \
+ MUL4(T); \
+ XOR(D, T); \
+ STORE((t), T); \
+}
+
+#define R_SYNDROME(T, t) \
+{ \
+ LOAD((t), T); \
+ MUL4(T); \
+ STORE((t), T); \
+}
+
+
+/*
+ * PARITY CALCULATION
+ *
+ * Macros *_SYNDROME are used for parity/syndrome calculation.
+ * *_D_SYNDROME() macros are used to calculate syndrome between 0 and
+ * length of data column, and *_SYNDROME() macros are only for updating
+ * the parity/syndrome if data column is shorter.
+ *
+ * P parity is calculated using raidz_add_abd().
+ */
+
+/*
+ * Generate P parity (RAIDZ1)
+ *
+ * @rm RAIDZ map
+ */
+static raidz_inline void
+raidz_generate_p_impl(raidz_map_t * const rm)
{
- const int x = tgtidx[TARGET_X];
- const int ncols = raidz_ncols(rm);
- const int nbigcols = raidz_nbigcols(rm);
- const size_t xsize = raidz_col_size(rm, x);
- const size_t short_size = raidz_short_size(rm);
+ size_t c;
+ const size_t ncols = raidz_ncols(rm);
+ const size_t psize = rm->rm_col[CODE_P].rc_size;
+ abd_t *pabd = rm->rm_col[CODE_P].rc_abd;
+ size_t size;
+ abd_t *dabd;
raidz_math_begin();
- /* 0 - short_size */
- REC_P_BLOCK(rm, 0, short_size, x, ncols);
+ /* start with first data column */
+ raidz_copy(pabd, rm->rm_col[1].rc_abd, psize);
- /* short_size - xsize */
- REC_P_BLOCK(rm, short_size, xsize, x, nbigcols);
+ for (c = 2; c < ncols; c++) {
+ dabd = rm->rm_col[c].rc_abd;
+ size = rm->rm_col[c].rc_size;
- raidz_math_end();
+ /* add data column */
+ raidz_add(pabd, dabd, size);
+ }
- return (1 << CODE_P);
+ raidz_math_end();
}
+
/*
- * Reconstruct using Q parity
+ * Generate PQ parity (RAIDZ2)
+ * The function is called per data column.
+ *
+ * @c array of pointers to parity (code) columns
+ * @dc pointer to data column
+ * @csize size of parity columns
+ * @dsize size of data column
*/
+static void
+raidz_gen_pq_add(void **c, const void *dc, const size_t csize,
+ const size_t dsize)
+{
+ v_t *p = (v_t *) c[0];
+ v_t *q = (v_t *) c[1];
+ const v_t *d = (v_t *) dc;
+ const v_t * const dend = d + (dsize / sizeof (v_t));
+ const v_t * const qend = q + (csize / sizeof (v_t));
+
+ GEN_PQ_DEFINE();
-#define REC_Q_SYN_UPDATE() MUL2(REC_Q_X)
+ MUL2_SETUP();
-#define REC_Q_INNER_LOOP(c) \
-{ \
- col = &rm->rm_col[c]; \
- REC_Q_SYN_UPDATE(); \
- XOR_ACC(COL_OFF(col, ioff), REC_Q_X); \
+ for (; d < dend; d += GEN_PQ_STRIDE, p += GEN_PQ_STRIDE,
+ q += GEN_PQ_STRIDE) {
+ LOAD(d, GEN_PQ_D);
+ P_D_SYNDROME(GEN_PQ_D, GEN_PQ_C, p);
+ Q_D_SYNDROME(GEN_PQ_D, GEN_PQ_C, q);
+ }
+ for (; q < qend; q += GEN_PQ_STRIDE) {
+ Q_SYNDROME(GEN_PQ_C, q);
+ }
}
+
/*
- * Reconstruction using Q parity
- * @rm RAIDZ map
- * @off starting offset
- * @end ending offset
- * @x missing data column
- * @coeff multiplication coefficients
- * @ncols number of column
- * @nbigcols number of big columns
+ * Generate PQ parity (RAIDZ2)
+ *
+ * @rm RAIDZ map
*/
static raidz_inline void
-REC_Q_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end,
- const int x, const unsigned *coeff, const int ncols, const int nbigcols)
+raidz_generate_pq_impl(raidz_map_t * const rm)
{
- int c;
- size_t ioff = 0;
- const size_t firstdc = raidz_parity(rm);
- raidz_col_t * const qcol = raidz_col_p(rm, CODE_Q);
- raidz_col_t * const xcol = raidz_col_p(rm, x);
- raidz_col_t *col;
+ size_t c;
+ const size_t ncols = raidz_ncols(rm);
+ const size_t csize = rm->rm_col[CODE_P].rc_size;
+ size_t dsize;
+ abd_t *dabd;
+ abd_t *cabds[] = {
+ rm->rm_col[CODE_P].rc_abd,
+ rm->rm_col[CODE_Q].rc_abd
+ };
- REC_Q_DEFINE();
+ raidz_math_begin();
- for (ioff = off; ioff < end; ioff += (REC_Q_STRIDE * sizeof (v_t))) {
- MUL2_SETUP();
+ raidz_copy(cabds[CODE_P], rm->rm_col[2].rc_abd, csize);
+ raidz_copy(cabds[CODE_Q], rm->rm_col[2].rc_abd, csize);
- ZERO(REC_Q_X);
+ for (c = 3; c < ncols; c++) {
+ dabd = rm->rm_col[c].rc_abd;
+ dsize = rm->rm_col[c].rc_size;
- if (ncols == nbigcols) {
- for (c = firstdc; c < x; c++)
- REC_Q_INNER_LOOP(c);
+ abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 2,
+ raidz_gen_pq_add);
+ }
- REC_Q_SYN_UPDATE();
- for (c++; c < nbigcols; c++)
- REC_Q_INNER_LOOP(c);
- } else {
- for (c = firstdc; c < nbigcols; c++) {
- REC_Q_SYN_UPDATE();
- if (x != c) {
- col = &rm->rm_col[c];
- XOR_ACC(COL_OFF(col, ioff), REC_Q_X);
- }
- }
- for (; c < ncols; c++)
- REC_Q_SYN_UPDATE();
- }
+ raidz_math_end();
+}
- XOR_ACC(COL_OFF(qcol, ioff), REC_Q_X);
- MUL(coeff[MUL_Q_X], REC_Q_X);
- STORE(COL_OFF(xcol, ioff), REC_Q_X);
+
+/*
+ * Generate PQR parity (RAIDZ3)
+ * The function is called per data column.
+ *
+ * @c array of pointers to parity (code) columns
+ * @dc pointer to data column
+ * @csize size of parity columns
+ * @dsize size of data column
+ */
+static void
+raidz_gen_pqr_add(void **c, const void *dc, const size_t csize,
+ const size_t dsize)
+{
+ v_t *p = (v_t *) c[0];
+ v_t *q = (v_t *) c[1];
+ v_t *r = (v_t *) c[CODE_R];
+ const v_t *d = (v_t *) dc;
+ const v_t * const dend = d + (dsize / sizeof (v_t));
+ const v_t * const qend = q + (csize / sizeof (v_t));
+
+ GEN_PQR_DEFINE();
+
+ MUL2_SETUP();
+
+ for (; d < dend; d += GEN_PQR_STRIDE, p += GEN_PQR_STRIDE,
+ q += GEN_PQR_STRIDE, r += GEN_PQR_STRIDE) {
+ LOAD(d, GEN_PQR_D);
+ P_D_SYNDROME(GEN_PQR_D, GEN_PQR_C, p);
+ Q_D_SYNDROME(GEN_PQR_D, GEN_PQR_C, q);
+ R_D_SYNDROME(GEN_PQR_D, GEN_PQR_C, r);
+ }
+ for (; q < qend; q += GEN_PQR_STRIDE, r += GEN_PQR_STRIDE) {
+ Q_SYNDROME(GEN_PQR_C, q);
+ R_SYNDROME(GEN_PQR_C, r);
}
}
+
/*
- * Reconstruct single data column using Q parity
- * @rec_method REC_Q_BLOCK()
+ * Generate PQR parity (RAIDZ2)
+ *
+ * @rm RAIDZ map
+ */
+static raidz_inline void
+raidz_generate_pqr_impl(raidz_map_t * const rm)
+{
+ size_t c;
+ const size_t ncols = raidz_ncols(rm);
+ const size_t csize = rm->rm_col[CODE_P].rc_size;
+ size_t dsize;
+ abd_t *dabd;
+ abd_t *cabds[] = {
+ rm->rm_col[CODE_P].rc_abd,
+ rm->rm_col[CODE_Q].rc_abd,
+ rm->rm_col[CODE_R].rc_abd
+ };
+
+ raidz_math_begin();
+
+ raidz_copy(cabds[CODE_P], rm->rm_col[3].rc_abd, csize);
+ raidz_copy(cabds[CODE_Q], rm->rm_col[3].rc_abd, csize);
+ raidz_copy(cabds[CODE_R], rm->rm_col[3].rc_abd, csize);
+
+ for (c = 4; c < ncols; c++) {
+ dabd = rm->rm_col[c].rc_abd;
+ dsize = rm->rm_col[c].rc_size;
+
+ abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 3,
+ raidz_gen_pqr_add);
+ }
+
+ raidz_math_end();
+}
+
+
+/*
+ * DATA RECONSTRUCTION
+ *
+ * Data reconstruction process consists of two phases:
+ * - Syndrome calculation
+ * - Data reconstruction
+ *
+ * Syndrome is calculated by generating parity using available data columns
+ * and zeros in places of erasure. Existing parity is added to corresponding
+ * syndrome value to obtain the [P|Q|R]syn values from equation:
+ * P = Psyn + Dx + Dy + Dz
+ * Q = Qsyn + 2^x * Dx + 2^y * Dy + 2^z * Dz
+ * R = Rsyn + 4^x * Dx + 4^y * Dy + 4^z * Dz
+ *
+ * For data reconstruction phase, the corresponding equations are solved
+ * for missing data (Dx, Dy, Dz). This generally involves multiplying known
+ * symbols by an coefficient and adding them together. The multiplication
+ * constant coefficients are calculated ahead of the operation in
+ * raidz_rec_[q|r|pq|pq|qr|pqr]_coeff() functions.
+ *
+ * IMPLEMENTATION NOTE: RAID-Z block can have complex geometry, with "big"
+ * and "short" columns.
+ * For this reason, reconstruction is performed in minimum of
+ * two steps. First, from offset 0 to short_size, then from short_size to
+ * short_size. Calculation functions REC_[*]_BLOCK() are implemented to work
+ * over both ranges. The split also enables removal of conditional expressions
+ * from loop bodies, improving throughput of SIMD implementations.
+ * For the best performance, all functions marked with raidz_inline attribute
+ * must be inlined by compiler.
+ *
+ * parity data
+ * columns columns
+ * <----------> <------------------>
+ * x y <----+ missing columns (x, y)
+ * | |
+ * +---+---+---+---+-v-+---+-v-+---+ ^ 0
+ * | | | | | | | | | |
+ * | | | | | | | | | |
+ * | P | Q | R | D | D | D | D | D | |
+ * | | | | 0 | 1 | 2 | 3 | 4 | |
+ * | | | | | | | | | v
+ * | | | | | +---+---+---+ ^ short_size
+ * | | | | | | |
+ * +---+---+---+---+---+ v big_size
+ * <------------------> <---------->
+ * big columns short columns
+ *
+ */
+
+
+
+
+/*
+ * Reconstruct single data column using P parity
+ *
+ * @syn_method raidz_add_abd()
+ * @rec_method not applicable
*
* @rm RAIDZ map
* @tgtidx array of missing data indexes
*/
static raidz_inline int
-raidz_reconstruct_q_impl(raidz_map_t *rm, const int *tgtidx)
+raidz_reconstruct_p_impl(raidz_map_t *rm, const int *tgtidx)
{
- const int x = tgtidx[TARGET_X];
- const int ncols = raidz_ncols(rm);
- const int nbigcols = raidz_nbigcols(rm);
- const size_t xsize = raidz_col_size(rm, x);
- const size_t short_size = raidz_short_size(rm);
- unsigned coeff[MUL_CNT];
-
- raidz_rec_q_coeff(rm, tgtidx, coeff);
+ size_t c;
+ const size_t firstdc = raidz_parity(rm);
+ const size_t ncols = raidz_ncols(rm);
+ const size_t x = tgtidx[TARGET_X];
+ const size_t xsize = rm->rm_col[x].rc_size;
+ abd_t *xabd = rm->rm_col[x].rc_abd;
+ size_t size;
+ abd_t *dabd;
raidz_math_begin();
- /* 0 - short_size */
- REC_Q_BLOCK(rm, 0, short_size, x, coeff, ncols, ncols);
+ /* copy P into target */
+ raidz_copy(xabd, rm->rm_col[CODE_P].rc_abd, xsize);
- /* short_size - xsize */
- REC_Q_BLOCK(rm, short_size, xsize, x, coeff, ncols, nbigcols);
+ /* generate p_syndrome */
+ for (c = firstdc; c < ncols; c++) {
+ if (c == x)
+ continue;
+
+ dabd = rm->rm_col[c].rc_abd;
+ size = MIN(rm->rm_col[c].rc_size, xsize);
+
+ raidz_add(xabd, dabd, size);
+ }
raidz_math_end();
- return (1 << CODE_Q);
+ return (1 << CODE_P);
}
+
/*
- * Reconstruct using R parity
+ * Generate Q syndrome (Qsyn)
+ *
+ * @xc array of pointers to syndrome columns
+ * @dc data column (NULL if missing)
+ * @xsize size of syndrome columns
+ * @dsize size of data column (0 if missing)
*/
+static void
+raidz_syn_q_abd(void **xc, const void *dc, const size_t xsize,
+ const size_t dsize)
+{
+ v_t *x = (v_t *) xc[TARGET_X];
+ const v_t *d = (v_t *) dc;
+ const v_t * const dend = d + (dsize / sizeof (v_t));
+ const v_t * const xend = x + (xsize / sizeof (v_t));
-#define REC_R_SYN_UPDATE() MUL4(REC_R_X)
-#define REC_R_INNER_LOOP(c) \
-{ \
- col = &rm->rm_col[c]; \
- REC_R_SYN_UPDATE(); \
- XOR_ACC(COL_OFF(col, ioff), REC_R_X); \
+ SYN_Q_DEFINE();
+
+ MUL2_SETUP();
+
+ for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE) {
+ LOAD(d, SYN_Q_D);
+ Q_D_SYNDROME(SYN_Q_D, SYN_Q_X, x);
+ }
+ for (; x < xend; x += SYN_STRIDE) {
+ Q_SYNDROME(SYN_Q_X, x);
+ }
}
+
/*
- * Reconstruction using R parity
+ * Reconstruct single data column using Q parity
+ *
+ * @syn_method raidz_add_abd()
+ * @rec_method raidz_mul_abd_cb()
+ *
* @rm RAIDZ map
- * @off starting offset
- * @end ending offset
- * @x missing data column
- * @coeff multiplication coefficients
- * @ncols number of column
- * @nbigcols number of big columns
+ * @tgtidx array of missing data indexes
*/
-static raidz_inline void
-REC_R_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end,
- const int x, const unsigned *coeff, const int ncols, const int nbigcols)
+static raidz_inline int
+raidz_reconstruct_q_impl(raidz_map_t *rm, const int *tgtidx)
{
- int c;
- size_t ioff = 0;
+ size_t c;
+ size_t dsize;
+ abd_t *dabd;
const size_t firstdc = raidz_parity(rm);
- raidz_col_t * const rcol = raidz_col_p(rm, CODE_R);
- raidz_col_t * const xcol = raidz_col_p(rm, x);
- raidz_col_t *col;
+ const size_t ncols = raidz_ncols(rm);
+ const size_t x = tgtidx[TARGET_X];
+ abd_t *xabd = rm->rm_col[x].rc_abd;
+ const size_t xsize = rm->rm_col[x].rc_size;
+ abd_t *tabds[] = { xabd };
- REC_R_DEFINE();
-
- for (ioff = off; ioff < end; ioff += (REC_R_STRIDE * sizeof (v_t))) {
- MUL2_SETUP();
+ unsigned coeff[MUL_CNT];
+ raidz_rec_q_coeff(rm, tgtidx, coeff);
- ZERO(REC_R_X);
+ raidz_math_begin();
- if (ncols == nbigcols) {
- for (c = firstdc; c < x; c++)
- REC_R_INNER_LOOP(c);
+ /* Start with first data column if present */
+ if (firstdc != x) {
+ raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
+ } else {
+ raidz_zero(xabd, xsize);
+ }
- REC_R_SYN_UPDATE();
- for (c++; c < nbigcols; c++)
- REC_R_INNER_LOOP(c);
+ /* generate q_syndrome */
+ for (c = firstdc+1; c < ncols; c++) {
+ if (c == x) {
+ dabd = NULL;
+ dsize = 0;
} else {
- for (c = firstdc; c < nbigcols; c++) {
- REC_R_SYN_UPDATE();
- if (c != x) {
- col = &rm->rm_col[c];
- XOR_ACC(COL_OFF(col, ioff), REC_R_X);
- }
- }
- for (; c < ncols; c++)
- REC_R_SYN_UPDATE();
+ dabd = rm->rm_col[c].rc_abd;
+ dsize = rm->rm_col[c].rc_size;
}
- XOR_ACC(COL_OFF(rcol, ioff), REC_R_X);
- MUL(coeff[MUL_R_X], REC_R_X);
- STORE(COL_OFF(xcol, ioff), REC_R_X);
+ abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1,
+ raidz_syn_q_abd);
}
+
+ /* add Q to the syndrome */
+ raidz_add(xabd, rm->rm_col[CODE_Q].rc_abd, xsize);
+
+ /* transform the syndrome */
+ abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void*) coeff);
+
+ raidz_math_end();
+
+ return (1 << CODE_Q);
}
+
+/*
+ * Generate R syndrome (Rsyn)
+ *
+ * @xc array of pointers to syndrome columns
+ * @dc data column (NULL if missing)
+ * @tsize size of syndrome columns
+ * @dsize size of data column (0 if missing)
+ */
+static void
+raidz_syn_r_abd(void **xc, const void *dc, const size_t tsize,
+ const size_t dsize)
+{
+ v_t *x = (v_t *) xc[TARGET_X];
+ const v_t *d = (v_t *) dc;
+ const v_t * const dend = d + (dsize / sizeof (v_t));
+ const v_t * const xend = x + (tsize / sizeof (v_t));
+
+ SYN_R_DEFINE();
+
+ MUL2_SETUP();
+
+ for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE) {
+ LOAD(d, SYN_R_D);
+ R_D_SYNDROME(SYN_R_D, SYN_R_X, x);
+ }
+ for (; x < xend; x += SYN_STRIDE) {
+ R_SYNDROME(SYN_R_X, x);
+ }
+}
+
+
/*
* Reconstruct single data column using R parity
- * @rec_method REC_R_BLOCK()
+ *
+ * @syn_method raidz_add_abd()
+ * @rec_method raidz_mul_abd_cb()
*
* @rm RAIDZ map
* @tgtidx array of missing data indexes
@@ -624,122 +750,136 @@ REC_R_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end,
static raidz_inline int
raidz_reconstruct_r_impl(raidz_map_t *rm, const int *tgtidx)
{
- const int x = tgtidx[TARGET_X];
- const int ncols = raidz_ncols(rm);
- const int nbigcols = raidz_nbigcols(rm);
- const size_t xsize = raidz_col_size(rm, x);
- const size_t short_size = raidz_short_size(rm);
- unsigned coeff[MUL_CNT];
+ size_t c;
+ size_t dsize;
+ abd_t *dabd;
+ const size_t firstdc = raidz_parity(rm);
+ const size_t ncols = raidz_ncols(rm);
+ const size_t x = tgtidx[TARGET_X];
+ const size_t xsize = rm->rm_col[x].rc_size;
+ abd_t *xabd = rm->rm_col[x].rc_abd;
+ abd_t *tabds[] = { xabd };
+ unsigned coeff[MUL_CNT];
raidz_rec_r_coeff(rm, tgtidx, coeff);
raidz_math_begin();
- /* 0 - short_size */
- REC_R_BLOCK(rm, 0, short_size, x, coeff, ncols, ncols);
+ /* Start with first data column if present */
+ if (firstdc != x) {
+ raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
+ } else {
+ raidz_zero(xabd, xsize);
+ }
+
+
+ /* generate q_syndrome */
+ for (c = firstdc+1; c < ncols; c++) {
+ if (c == x) {
+ dabd = NULL;
+ dsize = 0;
+ } else {
+ dabd = rm->rm_col[c].rc_abd;
+ dsize = rm->rm_col[c].rc_size;
+ }
+
+ abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1,
+ raidz_syn_r_abd);
+ }
+
+ /* add R to the syndrome */
+ raidz_add(xabd, rm->rm_col[CODE_R].rc_abd, xsize);
- /* short_size - xsize */
- REC_R_BLOCK(rm, short_size, xsize, x, coeff, ncols, nbigcols);
+ /* transform the syndrome */
+ abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void *)coeff);
raidz_math_end();
return (1 << CODE_R);
}
+
/*
- * Reconstruct using PQ parity
+ * Generate P and Q syndromes
+ *
+ * @xc array of pointers to syndrome columns
+ * @dc data column (NULL if missing)
+ * @tsize size of syndrome columns
+ * @dsize size of data column (0 if missing)
*/
+static void
+raidz_syn_pq_abd(void **tc, const void *dc, const size_t tsize,
+ const size_t dsize)
+{
+ v_t *x = (v_t *) tc[TARGET_X];
+ v_t *y = (v_t *) tc[TARGET_Y];
+ const v_t *d = (v_t *) dc;
+ const v_t * const dend = d + (dsize / sizeof (v_t));
+ const v_t * const yend = y + (tsize / sizeof (v_t));
+
+ SYN_PQ_DEFINE();
-#define REC_PQ_SYN_UPDATE() MUL2(REC_PQ_Y)
-#define REC_PQ_INNER_LOOP(c) \
-{ \
- col = &rm->rm_col[c]; \
- LOAD(COL_OFF(col, ioff), REC_PQ_D); \
- REC_PQ_SYN_UPDATE(); \
- XOR(REC_PQ_D, REC_PQ_X); \
- XOR(REC_PQ_D, REC_PQ_Y); \
+ MUL2_SETUP();
+
+ for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE) {
+ LOAD(d, SYN_PQ_D);
+ P_D_SYNDROME(SYN_PQ_D, SYN_PQ_X, x);
+ Q_D_SYNDROME(SYN_PQ_D, SYN_PQ_X, y);
+ }
+ for (; y < yend; y += SYN_STRIDE) {
+ Q_SYNDROME(SYN_PQ_X, y);
+ }
}
/*
- * Reconstruction using PQ parity
- * @rm RAIDZ map
- * @off starting offset
- * @end ending offset
- * @x missing data column
- * @y missing data column
- * @coeff multiplication coefficients
- * @ncols number of column
- * @nbigcols number of big columns
- * @calcy calculate second data column
+ * Reconstruct data using PQ parity and PQ syndromes
+ *
+ * @tc syndrome/result columns
+ * @tsize size of syndrome/result columns
+ * @c parity columns
+ * @mul array of multiplication constants
*/
-static raidz_inline void
-REC_PQ_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end,
- const int x, const int y, const unsigned *coeff, const int ncols,
- const int nbigcols, const boolean_t calcy)
+static void
+raidz_rec_pq_abd(void **tc, const size_t tsize, void **c,
+ const unsigned *mul)
{
- int c;
- size_t ioff = 0;
- const size_t firstdc = raidz_parity(rm);
- raidz_col_t * const pcol = raidz_col_p(rm, CODE_P);
- raidz_col_t * const qcol = raidz_col_p(rm, CODE_Q);
- raidz_col_t * const xcol = raidz_col_p(rm, x);
- raidz_col_t * const ycol = raidz_col_p(rm, y);
- raidz_col_t *col;
+ v_t *x = (v_t *) tc[TARGET_X];
+ v_t *y = (v_t *) tc[TARGET_Y];
+ const v_t * const xend = x + (tsize / sizeof (v_t));
+ const v_t *p = (v_t *) c[CODE_P];
+ const v_t *q = (v_t *) c[CODE_Q];
REC_PQ_DEFINE();
- for (ioff = off; ioff < end; ioff += (REC_PQ_STRIDE * sizeof (v_t))) {
- LOAD(COL_OFF(pcol, ioff), REC_PQ_X);
- ZERO(REC_PQ_Y);
- MUL2_SETUP();
-
- if (ncols == nbigcols) {
- for (c = firstdc; c < x; c++)
- REC_PQ_INNER_LOOP(c);
+ for (; x < xend; x += REC_PQ_STRIDE, y += REC_PQ_STRIDE,
+ p += REC_PQ_STRIDE, q += REC_PQ_STRIDE) {
+ LOAD(x, REC_PQ_X);
+ LOAD(y, REC_PQ_Y);
- REC_PQ_SYN_UPDATE();
- for (c++; c < y; c++)
- REC_PQ_INNER_LOOP(c);
-
- REC_PQ_SYN_UPDATE();
- for (c++; c < nbigcols; c++)
- REC_PQ_INNER_LOOP(c);
- } else {
- for (c = firstdc; c < nbigcols; c++) {
- REC_PQ_SYN_UPDATE();
- if (c != x && c != y) {
- col = &rm->rm_col[c];
- LOAD(COL_OFF(col, ioff), REC_PQ_D);
- XOR(REC_PQ_D, REC_PQ_X);
- XOR(REC_PQ_D, REC_PQ_Y);
- }
- }
- for (; c < ncols; c++)
- REC_PQ_SYN_UPDATE();
- }
-
- XOR_ACC(COL_OFF(qcol, ioff), REC_PQ_Y);
+ XOR_ACC(p, REC_PQ_X);
+ XOR_ACC(q, REC_PQ_Y);
/* Save Pxy */
- COPY(REC_PQ_X, REC_PQ_D);
+ COPY(REC_PQ_X, REC_PQ_T);
/* Calc X */
- MUL(coeff[MUL_PQ_X], REC_PQ_X);
- MUL(coeff[MUL_PQ_Y], REC_PQ_Y);
+ MUL(mul[MUL_PQ_X], REC_PQ_X);
+ MUL(mul[MUL_PQ_Y], REC_PQ_Y);
XOR(REC_PQ_Y, REC_PQ_X);
- STORE(COL_OFF(xcol, ioff), REC_PQ_X);
+ STORE(x, REC_PQ_X);
- if (calcy) {
- /* Calc Y */
- XOR(REC_PQ_D, REC_PQ_X);
- STORE(COL_OFF(ycol, ioff), REC_PQ_X);
- }
+ /* Calc Y */
+ XOR(REC_PQ_T, REC_PQ_X);
+ STORE(y, REC_PQ_X);
}
}
+
/*
* Reconstruct two data columns using PQ parity
- * @rec_method REC_PQ_BLOCK()
+ *
+ * @syn_method raidz_syn_pq_abd()
+ * @rec_method raidz_rec_pq_abd()
*
* @rm RAIDZ map
* @tgtidx array of missing data indexes
@@ -747,126 +887,156 @@ REC_PQ_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end,
static raidz_inline int
raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx)
{
- const int x = tgtidx[TARGET_X];
- const int y = tgtidx[TARGET_Y];
- const int ncols = raidz_ncols(rm);
- const int nbigcols = raidz_nbigcols(rm);
- const size_t xsize = raidz_col_size(rm, x);
- const size_t ysize = raidz_col_size(rm, y);
- const size_t short_size = raidz_short_size(rm);
- unsigned coeff[MUL_CNT];
+ size_t c;
+ size_t dsize;
+ abd_t *dabd;
+ const size_t firstdc = raidz_parity(rm);
+ const size_t ncols = raidz_ncols(rm);
+ const size_t x = tgtidx[TARGET_X];
+ const size_t y = tgtidx[TARGET_Y];
+ const size_t xsize = rm->rm_col[x].rc_size;
+ const size_t ysize = rm->rm_col[y].rc_size;
+ abd_t *xabd = rm->rm_col[x].rc_abd;
+ abd_t *yabd = rm->rm_col[y].rc_abd;
+ abd_t *tabds[2] = { xabd, yabd };
+ abd_t *cabds[] = {
+ rm->rm_col[CODE_P].rc_abd,
+ rm->rm_col[CODE_Q].rc_abd
+ };
+ unsigned coeff[MUL_CNT];
raidz_rec_pq_coeff(rm, tgtidx, coeff);
+ /*
+ * Check if some of targets is shorter then others
+ * In this case, shorter target needs to be replaced with
+ * new buffer so that syndrome can be calculated.
+ */
+ if (ysize < xsize) {
+ yabd = abd_alloc(xsize, B_FALSE);
+ tabds[1] = yabd;
+ }
+
raidz_math_begin();
- /* 0 - short_size */
- REC_PQ_BLOCK(rm, 0, short_size, x, y, coeff, ncols, ncols, B_TRUE);
+ /* Start with first data column if present */
+ if (firstdc != x) {
+ raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
+ raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize);
+ } else {
+ raidz_zero(xabd, xsize);
+ raidz_zero(yabd, xsize);
+ }
- /* short_size - xsize */
- REC_PQ_BLOCK(rm, short_size, xsize, x, y, coeff, ncols, nbigcols,
- xsize == ysize);
+ /* generate q_syndrome */
+ for (c = firstdc+1; c < ncols; c++) {
+ if (c == x || c == y) {
+ dabd = NULL;
+ dsize = 0;
+ } else {
+ dabd = rm->rm_col[c].rc_abd;
+ dsize = rm->rm_col[c].rc_size;
+ }
+
+ abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2,
+ raidz_syn_pq_abd);
+ }
+
+ abd_raidz_rec_iterate(cabds, tabds, xsize, 2, raidz_rec_pq_abd, coeff);
+
+ /* Copy shorter targets back to the original abd buffer */
+ if (ysize < xsize)
+ raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize);
raidz_math_end();
+ if (ysize < xsize)
+ abd_free(yabd);
+
return ((1 << CODE_P) | (1 << CODE_Q));
}
+
/*
- * Reconstruct using PR parity
+ * Generate P and R syndromes
+ *
+ * @xc array of pointers to syndrome columns
+ * @dc data column (NULL if missing)
+ * @tsize size of syndrome columns
+ * @dsize size of data column (0 if missing)
*/
+static void
+raidz_syn_pr_abd(void **c, const void *dc, const size_t tsize,
+ const size_t dsize)
+{
+ v_t *x = (v_t *) c[TARGET_X];
+ v_t *y = (v_t *) c[TARGET_Y];
+ const v_t *d = (v_t *) dc;
+ const v_t * const dend = d + (dsize / sizeof (v_t));
+ const v_t * const yend = y + (tsize / sizeof (v_t));
-#define REC_PR_SYN_UPDATE() MUL4(REC_PR_Y)
-#define REC_PR_INNER_LOOP(c) \
-{ \
- col = &rm->rm_col[c]; \
- LOAD(COL_OFF(col, ioff), REC_PR_D); \
- REC_PR_SYN_UPDATE(); \
- XOR(REC_PR_D, REC_PR_X); \
- XOR(REC_PR_D, REC_PR_Y); \
+ SYN_PR_DEFINE();
+
+ MUL2_SETUP();
+
+ for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE) {
+ LOAD(d, SYN_PR_D);
+ P_D_SYNDROME(SYN_PR_D, SYN_PR_X, x);
+ R_D_SYNDROME(SYN_PR_D, SYN_PR_X, y);
+ }
+ for (; y < yend; y += SYN_STRIDE) {
+ R_SYNDROME(SYN_PR_X, y);
+ }
}
/*
- * Reconstruction using PR parity
- * @rm RAIDZ map
- * @off starting offset
- * @end ending offset
- * @x missing data column
- * @y missing data column
- * @coeff multiplication coefficients
- * @ncols number of column
- * @nbigcols number of big columns
- * @calcy calculate second data column
+ * Reconstruct data using PR parity and PR syndromes
+ *
+ * @tc syndrome/result columns
+ * @tsize size of syndrome/result columns
+ * @c parity columns
+ * @mul array of multiplication constants
*/
-static raidz_inline void
-REC_PR_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end,
- const int x, const int y, const unsigned *coeff, const int ncols,
- const int nbigcols, const boolean_t calcy)
+static void
+raidz_rec_pr_abd(void **t, const size_t tsize, void **c,
+ const unsigned *mul)
{
- int c;
- size_t ioff;
- const size_t firstdc = raidz_parity(rm);
- raidz_col_t * const pcol = raidz_col_p(rm, CODE_P);
- raidz_col_t * const rcol = raidz_col_p(rm, CODE_R);
- raidz_col_t * const xcol = raidz_col_p(rm, x);
- raidz_col_t * const ycol = raidz_col_p(rm, y);
- raidz_col_t *col;
+ v_t *x = (v_t *) t[TARGET_X];
+ v_t *y = (v_t *) t[TARGET_Y];
+ const v_t * const xend = x + (tsize / sizeof (v_t));
+ const v_t *p = (v_t *) c[CODE_P];
+ const v_t *q = (v_t *) c[CODE_Q];
REC_PR_DEFINE();
- for (ioff = off; ioff < end; ioff += (REC_PR_STRIDE * sizeof (v_t))) {
- LOAD(COL_OFF(pcol, ioff), REC_PR_X);
- ZERO(REC_PR_Y);
- MUL2_SETUP();
-
- if (ncols == nbigcols) {
- for (c = firstdc; c < x; c++)
- REC_PR_INNER_LOOP(c);
-
- REC_PR_SYN_UPDATE();
- for (c++; c < y; c++)
- REC_PR_INNER_LOOP(c);
-
- REC_PR_SYN_UPDATE();
- for (c++; c < nbigcols; c++)
- REC_PR_INNER_LOOP(c);
- } else {
- for (c = firstdc; c < nbigcols; c++) {
- REC_PR_SYN_UPDATE();
- if (c != x && c != y) {
- col = &rm->rm_col[c];
- LOAD(COL_OFF(col, ioff), REC_PR_D);
- XOR(REC_PR_D, REC_PR_X);
- XOR(REC_PR_D, REC_PR_Y);
- }
- }
- for (; c < ncols; c++)
- REC_PR_SYN_UPDATE();
- }
-
- XOR_ACC(COL_OFF(rcol, ioff), REC_PR_Y);
+ for (; x < xend; x += REC_PR_STRIDE, y += REC_PR_STRIDE,
+ p += REC_PR_STRIDE, q += REC_PR_STRIDE) {
+ LOAD(x, REC_PR_X);
+ LOAD(y, REC_PR_Y);
+ XOR_ACC(p, REC_PR_X);
+ XOR_ACC(q, REC_PR_Y);
/* Save Pxy */
- COPY(REC_PR_X, REC_PR_D);
+ COPY(REC_PR_X, REC_PR_T);
/* Calc X */
- MUL(coeff[MUL_PR_X], REC_PR_X);
- MUL(coeff[MUL_PR_Y], REC_PR_Y);
+ MUL(mul[MUL_PR_X], REC_PR_X);
+ MUL(mul[MUL_PR_Y], REC_PR_Y);
XOR(REC_PR_Y, REC_PR_X);
- STORE(COL_OFF(xcol, ioff), REC_PR_X);
+ STORE(x, REC_PR_X);
- if (calcy) {
- /* Calc Y */
- XOR(REC_PR_D, REC_PR_X);
- STORE(COL_OFF(ycol, ioff), REC_PR_X);
- }
+ /* Calc Y */
+ XOR(REC_PR_T, REC_PR_X);
+ STORE(y, REC_PR_X);
}
}
/*
* Reconstruct two data columns using PR parity
- * @rec_method REC_PR_BLOCK()
+ *
+ * @syn_method raidz_syn_pr_abd()
+ * @rec_method raidz_rec_pr_abd()
*
* @rm RAIDZ map
* @tgtidx array of missing data indexes
@@ -874,134 +1044,162 @@ REC_PR_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end,
static raidz_inline int
raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx)
{
- const int x = tgtidx[TARGET_X];
- const int y = tgtidx[TARGET_Y];
- const int ncols = raidz_ncols(rm);
- const int nbigcols = raidz_nbigcols(rm);
- const size_t xsize = raidz_col_size(rm, x);
- const size_t ysize = raidz_col_size(rm, y);
- const size_t short_size = raidz_short_size(rm);
+ size_t c;
+ size_t dsize;
+ abd_t *dabd;
+ const size_t firstdc = raidz_parity(rm);
+ const size_t ncols = raidz_ncols(rm);
+ const size_t x = tgtidx[0];
+ const size_t y = tgtidx[1];
+ const size_t xsize = rm->rm_col[x].rc_size;
+ const size_t ysize = rm->rm_col[y].rc_size;
+ abd_t *xabd = rm->rm_col[x].rc_abd;
+ abd_t *yabd = rm->rm_col[y].rc_abd;
+ abd_t *tabds[2] = { xabd, yabd };
+ abd_t *cabds[] = {
+ rm->rm_col[CODE_P].rc_abd,
+ rm->rm_col[CODE_R].rc_abd
+ };
unsigned coeff[MUL_CNT];
-
raidz_rec_pr_coeff(rm, tgtidx, coeff);
+ /*
+ * Check if some of targets are shorter then others.
+ * They need to be replaced with a new buffer so that syndrome can
+ * be calculated on full length.
+ */
+ if (ysize < xsize) {
+ yabd = abd_alloc(xsize, B_FALSE);
+ tabds[1] = yabd;
+ }
+
raidz_math_begin();
- /* 0 - short_size */
- REC_PR_BLOCK(rm, 0, short_size, x, y, coeff, ncols, ncols, B_TRUE);
+ /* Start with first data column if present */
+ if (firstdc != x) {
+ raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
+ raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize);
+ } else {
+ raidz_zero(xabd, xsize);
+ raidz_zero(yabd, xsize);
+ }
+
+ /* generate q_syndrome */
+ for (c = firstdc+1; c < ncols; c++) {
+ if (c == x || c == y) {
+ dabd = NULL;
+ dsize = 0;
+ } else {
+ dabd = rm->rm_col[c].rc_abd;
+ dsize = rm->rm_col[c].rc_size;
+ }
+
+ abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2,
+ raidz_syn_pr_abd);
+ }
+
+ abd_raidz_rec_iterate(cabds, tabds, xsize, 2, raidz_rec_pr_abd, coeff);
- /* short_size - xsize */
- REC_PR_BLOCK(rm, short_size, xsize, x, y, coeff, ncols, nbigcols,
- xsize == ysize);
+ /*
+ * Copy shorter targets back to the original abd buffer
+ */
+ if (ysize < xsize)
+ raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize);
raidz_math_end();
- return ((1 << CODE_P) | (1 << CODE_R));
+ if (ysize < xsize)
+ abd_free(yabd);
+
+ return ((1 << CODE_P) | (1 << CODE_Q));
}
/*
- * Reconstruct using QR parity
+ * Generate Q and R syndromes
+ *
+ * @xc array of pointers to syndrome columns
+ * @dc data column (NULL if missing)
+ * @tsize size of syndrome columns
+ * @dsize size of data column (0 if missing)
*/
+static void
+raidz_syn_qr_abd(void **c, const void *dc, const size_t tsize,
+ const size_t dsize)
+{
+ v_t *x = (v_t *) c[TARGET_X];
+ v_t *y = (v_t *) c[TARGET_Y];
+ const v_t * const xend = x + (tsize / sizeof (v_t));
+ const v_t *d = (v_t *) dc;
+ const v_t * const dend = d + (dsize / sizeof (v_t));
-#define REC_QR_SYN_UPDATE() \
-{ \
- MUL2(REC_QR_X); \
- MUL4(REC_QR_Y); \
-}
+ SYN_QR_DEFINE();
+
+ MUL2_SETUP();
-#define REC_QR_INNER_LOOP(c) \
-{ \
- col = &rm->rm_col[c]; \
- LOAD(COL_OFF(col, ioff), REC_QR_D); \
- REC_QR_SYN_UPDATE(); \
- XOR(REC_QR_D, REC_QR_X); \
- XOR(REC_QR_D, REC_QR_Y); \
+ for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE) {
+ LOAD(d, SYN_PQ_D);
+ Q_D_SYNDROME(SYN_QR_D, SYN_QR_X, x);
+ R_D_SYNDROME(SYN_QR_D, SYN_QR_X, y);
+ }
+ for (; x < xend; x += SYN_STRIDE, y += SYN_STRIDE) {
+ Q_SYNDROME(SYN_QR_X, x);
+ R_SYNDROME(SYN_QR_X, y);
+ }
}
+
/*
- * Reconstruction using QR parity
- * @rm RAIDZ map
- * @off starting offset
- * @end ending offset
- * @x missing data column
- * @y missing data column
- * @coeff multiplication coefficients
- * @ncols number of column
- * @nbigcols number of big columns
- * @calcy calculate second data column
+ * Reconstruct data using QR parity and QR syndromes
+ *
+ * @tc syndrome/result columns
+ * @tsize size of syndrome/result columns
+ * @c parity columns
+ * @mul array of multiplication constants
*/
-static raidz_inline void
-REC_QR_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end,
- const int x, const int y, const unsigned *coeff, const int ncols,
- const int nbigcols, const boolean_t calcy)
+static void
+raidz_rec_qr_abd(void **t, const size_t tsize, void **c,
+ const unsigned *mul)
{
- int c;
- size_t ioff;
- const size_t firstdc = raidz_parity(rm);
- raidz_col_t * const qcol = raidz_col_p(rm, CODE_Q);
- raidz_col_t * const rcol = raidz_col_p(rm, CODE_R);
- raidz_col_t * const xcol = raidz_col_p(rm, x);
- raidz_col_t * const ycol = raidz_col_p(rm, y);
- raidz_col_t *col;
+ v_t *x = (v_t *) t[TARGET_X];
+ v_t *y = (v_t *) t[TARGET_Y];
+ const v_t * const xend = x + (tsize / sizeof (v_t));
+ const v_t *p = (v_t *) c[CODE_P];
+ const v_t *q = (v_t *) c[CODE_Q];
REC_QR_DEFINE();
- for (ioff = off; ioff < end; ioff += (REC_QR_STRIDE * sizeof (v_t))) {
- MUL2_SETUP();
- ZERO(REC_QR_X);
- ZERO(REC_QR_Y);
+ for (; x < xend; x += REC_QR_STRIDE, y += REC_QR_STRIDE,
+ p += REC_QR_STRIDE, q += REC_QR_STRIDE) {
+ LOAD(x, REC_QR_X);
+ LOAD(y, REC_QR_Y);
- if (ncols == nbigcols) {
- for (c = firstdc; c < x; c++)
- REC_QR_INNER_LOOP(c);
+ XOR_ACC(p, REC_QR_X);
+ XOR_ACC(q, REC_QR_Y);
- REC_QR_SYN_UPDATE();
- for (c++; c < y; c++)
- REC_QR_INNER_LOOP(c);
-
- REC_QR_SYN_UPDATE();
- for (c++; c < nbigcols; c++)
- REC_QR_INNER_LOOP(c);
- } else {
- for (c = firstdc; c < nbigcols; c++) {
- REC_QR_SYN_UPDATE();
- if (c != x && c != y) {
- col = &rm->rm_col[c];
- LOAD(COL_OFF(col, ioff), REC_QR_D);
- XOR(REC_QR_D, REC_QR_X);
- XOR(REC_QR_D, REC_QR_Y);
- }
- }
- for (; c < ncols; c++)
- REC_QR_SYN_UPDATE();
- }
-
- XOR_ACC(COL_OFF(qcol, ioff), REC_QR_X);
- XOR_ACC(COL_OFF(rcol, ioff), REC_QR_Y);
-
- /* Save Qxy */
- COPY(REC_QR_X, REC_QR_D);
+ /* Save Pxy */
+ COPY(REC_QR_X, REC_QR_T);
/* Calc X */
- MUL(coeff[MUL_QR_XQ], REC_QR_X); /* X = Q * xqm */
- XOR(REC_QR_Y, REC_QR_X); /* X = R ^ X */
- MUL(coeff[MUL_QR_X], REC_QR_X); /* X = X * xm */
- STORE(COL_OFF(xcol, ioff), REC_QR_X);
-
- if (calcy) {
- /* Calc Y */
- MUL(coeff[MUL_QR_YQ], REC_QR_D); /* X = Q * xqm */
- XOR(REC_QR_Y, REC_QR_D); /* X = R ^ X */
- MUL(coeff[MUL_QR_Y], REC_QR_D); /* X = X * xm */
- STORE(COL_OFF(ycol, ioff), REC_QR_D);
- }
+ MUL(mul[MUL_QR_XQ], REC_QR_X); /* X = Q * xqm */
+ XOR(REC_QR_Y, REC_QR_X); /* X = R ^ X */
+ MUL(mul[MUL_QR_X], REC_QR_X); /* X = X * xm */
+ STORE(x, REC_QR_X);
+
+ /* Calc Y */
+ MUL(mul[MUL_QR_YQ], REC_QR_T); /* X = Q * xqm */
+ XOR(REC_QR_Y, REC_QR_T); /* X = R ^ X */
+ MUL(mul[MUL_QR_Y], REC_QR_T); /* X = X * xm */
+ STORE(y, REC_QR_T);
}
}
+
/*
* Reconstruct two data columns using QR parity
- * @rec_method REC_QR_BLOCK()
+ *
+ * @syn_method raidz_syn_qr_abd()
+ * @rec_method raidz_rec_qr_abd()
*
* @rm RAIDZ map
* @tgtidx array of missing data indexes
@@ -1009,158 +1207,182 @@ REC_QR_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end,
static raidz_inline int
raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx)
{
- const int x = tgtidx[TARGET_X];
- const int y = tgtidx[TARGET_Y];
- const int ncols = raidz_ncols(rm);
- const int nbigcols = raidz_nbigcols(rm);
- const size_t xsize = raidz_col_size(rm, x);
- const size_t ysize = raidz_col_size(rm, y);
- const size_t short_size = raidz_short_size(rm);
+ size_t c;
+ size_t dsize;
+ abd_t *dabd;
+ const size_t firstdc = raidz_parity(rm);
+ const size_t ncols = raidz_ncols(rm);
+ const size_t x = tgtidx[TARGET_X];
+ const size_t y = tgtidx[TARGET_Y];
+ const size_t xsize = rm->rm_col[x].rc_size;
+ const size_t ysize = rm->rm_col[y].rc_size;
+ abd_t *xabd = rm->rm_col[x].rc_abd;
+ abd_t *yabd = rm->rm_col[y].rc_abd;
+ abd_t *tabds[2] = { xabd, yabd };
+ abd_t *cabds[] = {
+ rm->rm_col[CODE_Q].rc_abd,
+ rm->rm_col[CODE_R].rc_abd
+ };
unsigned coeff[MUL_CNT];
-
raidz_rec_qr_coeff(rm, tgtidx, coeff);
+ /*
+ * Check if some of targets is shorter then others
+ * In this case, shorter target needs to be replaced with
+ * new buffer so that syndrome can be calculated.
+ */
+ if (ysize < xsize) {
+ yabd = abd_alloc(xsize, B_FALSE);
+ tabds[1] = yabd;
+ }
+
raidz_math_begin();
- /* 0 - short_size */
- REC_QR_BLOCK(rm, 0, short_size, x, y, coeff, ncols, ncols, B_TRUE);
+ /* Start with first data column if present */
+ if (firstdc != x) {
+ raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
+ raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize);
+ } else {
+ raidz_zero(xabd, xsize);
+ raidz_zero(yabd, xsize);
+ }
+
+ /* generate q_syndrome */
+ for (c = firstdc+1; c < ncols; c++) {
+ if (c == x || c == y) {
+ dabd = NULL;
+ dsize = 0;
+ } else {
+ dabd = rm->rm_col[c].rc_abd;
+ dsize = rm->rm_col[c].rc_size;
+ }
+
+ abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2,
+ raidz_syn_qr_abd);
+ }
+
+ abd_raidz_rec_iterate(cabds, tabds, xsize, 2, raidz_rec_qr_abd, coeff);
- /* short_size - xsize */
- REC_QR_BLOCK(rm, short_size, xsize, x, y, coeff, ncols, nbigcols,
- xsize == ysize);
+ /*
+ * Copy shorter targets back to the original abd buffer
+ */
+ if (ysize < xsize)
+ raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize);
raidz_math_end();
+ if (ysize < xsize)
+ abd_free(yabd);
+
+
return ((1 << CODE_Q) | (1 << CODE_R));
}
+
/*
- * Reconstruct using PQR parity
+ * Generate P, Q, and R syndromes
+ *
+ * @xc array of pointers to syndrome columns
+ * @dc data column (NULL if missing)
+ * @tsize size of syndrome columns
+ * @dsize size of data column (0 if missing)
*/
+static void
+raidz_syn_pqr_abd(void **c, const void *dc, const size_t tsize,
+ const size_t dsize)
+{
+ v_t *x = (v_t *) c[TARGET_X];
+ v_t *y = (v_t *) c[TARGET_Y];
+ v_t *z = (v_t *) c[TARGET_Z];
+ const v_t * const yend = y + (tsize / sizeof (v_t));
+ const v_t *d = (v_t *) dc;
+ const v_t * const dend = d + (dsize / sizeof (v_t));
-#define REC_PQR_SYN_UPDATE() \
-{ \
- MUL2(REC_PQR_Y); \
- MUL4(REC_PQR_Z); \
-}
+ SYN_PQR_DEFINE();
-#define REC_PQR_INNER_LOOP(c) \
-{ \
- col = &rm->rm_col[(c)]; \
- LOAD(COL_OFF(col, ioff), REC_PQR_D); \
- REC_PQR_SYN_UPDATE(); \
- XOR(REC_PQR_D, REC_PQR_X); \
- XOR(REC_PQR_D, REC_PQR_Y); \
- XOR(REC_PQR_D, REC_PQR_Z); \
+ MUL2_SETUP();
+
+ for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE,
+ z += SYN_STRIDE) {
+ LOAD(d, SYN_PQR_D);
+ P_D_SYNDROME(SYN_PQR_D, SYN_PQR_X, x)
+ Q_D_SYNDROME(SYN_PQR_D, SYN_PQR_X, y);
+ R_D_SYNDROME(SYN_PQR_D, SYN_PQR_X, z);
+ }
+ for (; y < yend; y += SYN_STRIDE, z += SYN_STRIDE) {
+ Q_SYNDROME(SYN_PQR_X, y);
+ R_SYNDROME(SYN_PQR_X, z);
+ }
}
+
/*
- * Reconstruction using PQR parity
- * @rm RAIDZ map
- * @off starting offset
- * @end ending offset
- * @x missing data column
- * @y missing data column
- * @z missing data column
- * @coeff multiplication coefficients
- * @ncols number of column
- * @nbigcols number of big columns
- * @calcy calculate second data column
- * @calcz calculate third data column
+ * Reconstruct data using PRQ parity and PQR syndromes
+ *
+ * @tc syndrome/result columns
+ * @tsize size of syndrome/result columns
+ * @c parity columns
+ * @mul array of multiplication constants
*/
-static raidz_inline void
-REC_PQR_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end,
- const int x, const int y, const int z, const unsigned *coeff,
- const int ncols, const int nbigcols, const boolean_t calcy,
- const boolean_t calcz)
+static void
+raidz_rec_pqr_abd(void **t, const size_t tsize, void **c,
+ const unsigned * const mul)
{
- int c;
- size_t ioff;
- const size_t firstdc = raidz_parity(rm);
- raidz_col_t * const pcol = raidz_col_p(rm, CODE_P);
- raidz_col_t * const qcol = raidz_col_p(rm, CODE_Q);
- raidz_col_t * const rcol = raidz_col_p(rm, CODE_R);
- raidz_col_t * const xcol = raidz_col_p(rm, x);
- raidz_col_t * const ycol = raidz_col_p(rm, y);
- raidz_col_t * const zcol = raidz_col_p(rm, z);
- raidz_col_t *col;
+ v_t *x = (v_t *) t[TARGET_X];
+ v_t *y = (v_t *) t[TARGET_Y];
+ v_t *z = (v_t *) t[TARGET_Z];
+ const v_t * const xend = x + (tsize / sizeof (v_t));
+ const v_t *p = (v_t *) c[CODE_P];
+ const v_t *q = (v_t *) c[CODE_Q];
+ const v_t *r = (v_t *) c[CODE_R];
REC_PQR_DEFINE();
- for (ioff = off; ioff < end; ioff += (REC_PQR_STRIDE * sizeof (v_t))) {
- MUL2_SETUP();
- LOAD(COL_OFF(pcol, ioff), REC_PQR_X);
- ZERO(REC_PQR_Y);
- ZERO(REC_PQR_Z);
+ for (; x < xend; x += REC_PQR_STRIDE, y += REC_PQR_STRIDE,
+ z += REC_PQR_STRIDE, p += REC_PQR_STRIDE, q += REC_PQR_STRIDE,
+ r += REC_PQR_STRIDE) {
+ LOAD(x, REC_PQR_X);
+ LOAD(y, REC_PQR_Y);
+ LOAD(z, REC_PQR_Z);
- if (ncols == nbigcols) {
- for (c = firstdc; c < x; c++)
- REC_PQR_INNER_LOOP(c);
-
- REC_PQR_SYN_UPDATE();
- for (c++; c < y; c++)
- REC_PQR_INNER_LOOP(c);
-
- REC_PQR_SYN_UPDATE();
- for (c++; c < z; c++)
- REC_PQR_INNER_LOOP(c);
-
- REC_PQR_SYN_UPDATE();
- for (c++; c < nbigcols; c++)
- REC_PQR_INNER_LOOP(c);
- } else {
- for (c = firstdc; c < nbigcols; c++) {
- REC_PQR_SYN_UPDATE();
- if (c != x && c != y && c != z) {
- col = &rm->rm_col[c];
- LOAD(COL_OFF(col, ioff), REC_PQR_D);
- XOR(REC_PQR_D, REC_PQR_X);
- XOR(REC_PQR_D, REC_PQR_Y);
- XOR(REC_PQR_D, REC_PQR_Z);
- }
- }
- for (; c < ncols; c++)
- REC_PQR_SYN_UPDATE();
- }
-
- XOR_ACC(COL_OFF(qcol, ioff), REC_PQR_Y);
- XOR_ACC(COL_OFF(rcol, ioff), REC_PQR_Z);
+ XOR_ACC(p, REC_PQR_X);
+ XOR_ACC(q, REC_PQR_Y);
+ XOR_ACC(r, REC_PQR_Z);
/* Save Pxyz and Qxyz */
COPY(REC_PQR_X, REC_PQR_XS);
COPY(REC_PQR_Y, REC_PQR_YS);
/* Calc X */
- MUL(coeff[MUL_PQR_XP], REC_PQR_X); /* Xp = Pxyz * xp */
- MUL(coeff[MUL_PQR_XQ], REC_PQR_Y); /* Xq = Qxyz * xq */
+ MUL(mul[MUL_PQR_XP], REC_PQR_X); /* Xp = Pxyz * xp */
+ MUL(mul[MUL_PQR_XQ], REC_PQR_Y); /* Xq = Qxyz * xq */
XOR(REC_PQR_Y, REC_PQR_X);
- MUL(coeff[MUL_PQR_XR], REC_PQR_Z); /* Xr = Rxyz * xr */
+ MUL(mul[MUL_PQR_XR], REC_PQR_Z); /* Xr = Rxyz * xr */
XOR(REC_PQR_Z, REC_PQR_X); /* X = Xp + Xq + Xr */
- STORE(COL_OFF(xcol, ioff), REC_PQR_X);
-
- if (calcy) {
- /* Calc Y */
- XOR(REC_PQR_X, REC_PQR_XS); /* Pyz = Pxyz + X */
- MUL(coeff[MUL_PQR_YU], REC_PQR_X); /* Xq = X * upd_q */
- XOR(REC_PQR_X, REC_PQR_YS); /* Qyz = Qxyz + Xq */
- COPY(REC_PQR_XS, REC_PQR_X); /* restore Pyz */
- MUL(coeff[MUL_PQR_YP], REC_PQR_X); /* Yp = Pyz * yp */
- MUL(coeff[MUL_PQR_YQ], REC_PQR_YS); /* Yq = Qyz * yq */
- XOR(REC_PQR_X, REC_PQR_YS); /* Y = Yp + Yq */
- STORE(COL_OFF(ycol, ioff), REC_PQR_YS);
- }
-
- if (calcz) {
- /* Calc Z */
- XOR(REC_PQR_XS, REC_PQR_YS); /* Z = Pz = Pyz + Y */
- STORE(COL_OFF(zcol, ioff), REC_PQR_YS);
- }
+ STORE(x, REC_PQR_X);
+
+ /* Calc Y */
+ XOR(REC_PQR_X, REC_PQR_XS); /* Pyz = Pxyz + X */
+ MUL(mul[MUL_PQR_YU], REC_PQR_X); /* Xq = X * upd_q */
+ XOR(REC_PQR_X, REC_PQR_YS); /* Qyz = Qxyz + Xq */
+ COPY(REC_PQR_XS, REC_PQR_X); /* restore Pyz */
+ MUL(mul[MUL_PQR_YP], REC_PQR_X); /* Yp = Pyz * yp */
+ MUL(mul[MUL_PQR_YQ], REC_PQR_YS); /* Yq = Qyz * yq */
+ XOR(REC_PQR_X, REC_PQR_YS); /* Y = Yp + Yq */
+ STORE(y, REC_PQR_YS);
+
+ /* Calc Z */
+ XOR(REC_PQR_XS, REC_PQR_YS); /* Z = Pz = Pyz + Y */
+ STORE(z, REC_PQR_YS);
}
}
+
/*
* Reconstruct three data columns using PQR parity
- * @rec_method REC_PQR_BLOCK()
+ *
+ * @syn_method raidz_syn_pqr_abd()
+ * @rec_method raidz_rec_pqr_abd()
*
* @rm RAIDZ map
* @tgtidx array of missing data indexes
@@ -1168,31 +1390,87 @@ REC_PQR_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end,
static raidz_inline int
raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx)
{
- const int x = tgtidx[TARGET_X];
- const int y = tgtidx[TARGET_Y];
- const int z = tgtidx[TARGET_Z];
- const int ncols = raidz_ncols(rm);
- const int nbigcols = raidz_nbigcols(rm);
- const size_t xsize = raidz_col_size(rm, x);
- const size_t ysize = raidz_col_size(rm, y);
- const size_t zsize = raidz_col_size(rm, z);
- const size_t short_size = raidz_short_size(rm);
+ size_t c;
+ size_t dsize;
+ abd_t *dabd;
+ const size_t firstdc = raidz_parity(rm);
+ const size_t ncols = raidz_ncols(rm);
+ const size_t x = tgtidx[TARGET_X];
+ const size_t y = tgtidx[TARGET_Y];
+ const size_t z = tgtidx[TARGET_Z];
+ const size_t xsize = rm->rm_col[x].rc_size;
+ const size_t ysize = rm->rm_col[y].rc_size;
+ const size_t zsize = rm->rm_col[z].rc_size;
+ abd_t *xabd = rm->rm_col[x].rc_abd;
+ abd_t *yabd = rm->rm_col[y].rc_abd;
+ abd_t *zabd = rm->rm_col[z].rc_abd;
+ abd_t *tabds[] = { xabd, yabd, zabd };
+ abd_t *cabds[] = {
+ rm->rm_col[CODE_P].rc_abd,
+ rm->rm_col[CODE_Q].rc_abd,
+ rm->rm_col[CODE_R].rc_abd
+ };
unsigned coeff[MUL_CNT];
-
raidz_rec_pqr_coeff(rm, tgtidx, coeff);
+ /*
+ * Check if some of targets is shorter then others
+ * In this case, shorter target needs to be replaced with
+ * new buffer so that syndrome can be calculated.
+ */
+ if (ysize < xsize) {
+ yabd = abd_alloc(xsize, B_FALSE);
+ tabds[1] = yabd;
+ }
+ if (zsize < xsize) {
+ zabd = abd_alloc(xsize, B_FALSE);
+ tabds[2] = zabd;
+ }
+
raidz_math_begin();
- /* 0 - short_size */
- REC_PQR_BLOCK(rm, 0, short_size, x, y, z, coeff, ncols, ncols,
- B_TRUE, B_TRUE);
+ /* Start with first data column if present */
+ if (firstdc != x) {
+ raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
+ raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize);
+ raidz_copy(zabd, rm->rm_col[firstdc].rc_abd, xsize);
+ } else {
+ raidz_zero(xabd, xsize);
+ raidz_zero(yabd, xsize);
+ raidz_zero(zabd, xsize);
+ }
+
+ /* generate q_syndrome */
+ for (c = firstdc+1; c < ncols; c++) {
+ if (c == x || c == y || c == z) {
+ dabd = NULL;
+ dsize = 0;
+ } else {
+ dabd = rm->rm_col[c].rc_abd;
+ dsize = rm->rm_col[c].rc_size;
+ }
- /* short_size - xsize */
- REC_PQR_BLOCK(rm, short_size, xsize, x, y, z, coeff, ncols, nbigcols,
- xsize == ysize, xsize == zsize);
+ abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 3,
+ raidz_syn_pqr_abd);
+ }
+
+ abd_raidz_rec_iterate(cabds, tabds, xsize, 3, raidz_rec_pqr_abd, coeff);
+
+ /*
+ * Copy shorter targets back to the original abd buffer
+ */
+ if (ysize < xsize)
+ raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize);
+ if (zsize < xsize)
+ raidz_copy(rm->rm_col[z].rc_abd, zabd, zsize);
raidz_math_end();
+ if (ysize < xsize)
+ abd_free(yabd);
+ if (zsize < xsize)
+ abd_free(zabd);
+
return ((1 << CODE_P) | (1 << CODE_Q) | (1 << CODE_R));
}
diff --git a/module/zfs/vdev_raidz_math_scalar.c b/module/zfs/vdev_raidz_math_scalar.c
index 993d406e6..a693bff63 100644
--- a/module/zfs/vdev_raidz_math_scalar.c
+++ b/module/zfs/vdev_raidz_math_scalar.c
@@ -154,71 +154,96 @@ static const struct {
#define raidz_math_begin() {}
#define raidz_math_end() {}
-#define GEN_P_DEFINE() v_t p0
-#define GEN_P_STRIDE 1
-#define GEN_P_P p0
-
-#define GEN_PQ_DEFINE() v_t d0, p0, q0
-#define GEN_PQ_STRIDE 1
-#define GEN_PQ_D d0
-#define GEN_PQ_P p0
-#define GEN_PQ_Q q0
-
-#define GEN_PQR_DEFINE() v_t d0, p0, q0, r0
-#define GEN_PQR_STRIDE 1
-#define GEN_PQR_D d0
-#define GEN_PQR_P p0
-#define GEN_PQR_Q q0
-#define GEN_PQR_R r0
-
-#define REC_P_DEFINE() v_t x0
-#define REC_P_STRIDE 1
-#define REC_P_X x0
-
-#define REC_Q_DEFINE() v_t x0
-#define REC_Q_STRIDE 1
-#define REC_Q_X x0
-
-#define REC_R_DEFINE() v_t x0
-#define REC_R_STRIDE 1
-#define REC_R_X x0
-
-#define REC_PQ_DEFINE() v_t x0, y0, d0
-#define REC_PQ_STRIDE 1
-#define REC_PQ_X x0
-#define REC_PQ_Y y0
-#define REC_PQ_D d0
-
-#define REC_PR_DEFINE() v_t x0, y0, d0
-#define REC_PR_STRIDE 1
-#define REC_PR_X x0
-#define REC_PR_Y y0
-#define REC_PR_D d0
-
-#define REC_QR_DEFINE() v_t x0, y0, d0
-#define REC_QR_STRIDE 1
-#define REC_QR_X x0
-#define REC_QR_Y y0
-#define REC_QR_D d0
-
-#define REC_PQR_DEFINE() v_t x0, y0, z0, d0, t0
-#define REC_PQR_STRIDE 1
-#define REC_PQR_X x0
-#define REC_PQR_Y y0
-#define REC_PQR_Z z0
-#define REC_PQR_D d0
-#define REC_PQR_XS d0
-#define REC_PQR_YS t0
+#define SYN_STRIDE 1
-#include "vdev_raidz_math_impl.h"
+#define ZERO_DEFINE() v_t d0
+#define ZERO_STRIDE 1
+#define ZERO_D d0
-/*
- * If compiled with -O0, gcc doesn't do any stack frame coalescing
- * and -Wframe-larger-than=1024 is triggered in debug mode.
- * Starting with gcc 4.8, new opt level -Og is introduced for debugging, which
- * does not trigger this warning.
- */
-#pragma GCC diagnostic ignored "-Wframe-larger-than="
+#define COPY_DEFINE() v_t d0
+#define COPY_STRIDE 1
+#define COPY_D d0
+
+#define ADD_DEFINE() v_t d0
+#define ADD_STRIDE 1
+#define ADD_D d0
+
+#define MUL_DEFINE() v_t d0
+#define MUL_STRIDE 1
+#define MUL_D d0
+
+#define GEN_P_STRIDE 1
+#define GEN_P_DEFINE() v_t p0
+#define GEN_P_P p0
+
+#define GEN_PQ_STRIDE 1
+#define GEN_PQ_DEFINE() v_t d0, c0
+#define GEN_PQ_D d0
+#define GEN_PQ_C c0
+
+#define GEN_PQR_STRIDE 1
+#define GEN_PQR_DEFINE() v_t d0, c0
+#define GEN_PQR_D d0
+#define GEN_PQR_C c0
+
+#define SYN_Q_DEFINE() v_t d0, x0
+#define SYN_Q_D d0
+#define SYN_Q_X x0
+
+
+#define SYN_R_DEFINE() v_t d0, x0
+#define SYN_R_D d0
+#define SYN_R_X x0
+
+
+#define SYN_PQ_DEFINE() v_t d0, x0
+#define SYN_PQ_D d0
+#define SYN_PQ_X x0
+
+
+#define REC_PQ_STRIDE 1
+#define REC_PQ_DEFINE() v_t x0, y0, t0
+#define REC_PQ_X x0
+#define REC_PQ_Y y0
+#define REC_PQ_T t0
+
+
+#define SYN_PR_DEFINE() v_t d0, x0
+#define SYN_PR_D d0
+#define SYN_PR_X x0
+
+#define REC_PR_STRIDE 1
+#define REC_PR_DEFINE() v_t x0, y0, t0
+#define REC_PR_X x0
+#define REC_PR_Y y0
+#define REC_PR_T t0
+
+
+#define SYN_QR_DEFINE() v_t d0, x0
+#define SYN_QR_D d0
+#define SYN_QR_X x0
+
+
+#define REC_QR_STRIDE 1
+#define REC_QR_DEFINE() v_t x0, y0, t0
+#define REC_QR_X x0
+#define REC_QR_Y y0
+#define REC_QR_T t0
+
+
+#define SYN_PQR_DEFINE() v_t d0, x0
+#define SYN_PQR_D d0
+#define SYN_PQR_X x0
+
+#define REC_PQR_STRIDE 1
+#define REC_PQR_DEFINE() v_t x0, y0, z0, xs0, ys0
+#define REC_PQR_X x0
+#define REC_PQR_Y y0
+#define REC_PQR_Z z0
+#define REC_PQR_XS xs0
+#define REC_PQR_YS ys0
+
+#include "vdev_raidz_math_impl.h"
DEFINE_GEN_METHODS(scalar);
DEFINE_REC_METHODS(scalar);
diff --git a/module/zfs/vdev_raidz_math_sse2.c b/module/zfs/vdev_raidz_math_sse2.c
index 6fc81215a..9985da273 100644
--- a/module/zfs/vdev_raidz_math_sse2.c
+++ b/module/zfs/vdev_raidz_math_sse2.c
@@ -58,9 +58,6 @@ typedef struct v {
uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE)));
} v_t;
-#define PREFETCHNTA(ptr, offset) {}
-#define PREFETCH(ptr, offset) {}
-
#define XOR_ACC(src, r...) \
{ \
switch (REG_CNT(r)) { \
@@ -106,27 +103,8 @@ typedef struct v {
break; \
} \
}
-#define ZERO(r...) \
-{ \
- switch (REG_CNT(r)) { \
- case 4: \
- __asm( \
- "pxor %" VR0(r) ", %" VR0(r) "\n" \
- "pxor %" VR1(r) ", %" VR1(r) "\n" \
- "pxor %" VR2(r) ", %" VR2(r) "\n" \
- "pxor %" VR3(r) ", %" VR3(r)); \
- break; \
- case 2: \
- __asm( \
- "pxor %" VR0(r) ", %" VR0(r) "\n" \
- "pxor %" VR1(r) ", %" VR1(r)); \
- break; \
- case 1: \
- __asm( \
- "pxor %" VR0(r) ", %" VR0(r)); \
- break; \
- } \
-}
+
+#define ZERO(r...) XOR(r, r)
#define COPY(r...) \
{ \
@@ -236,6 +214,10 @@ typedef struct v {
#define MUL2(r...) \
{ \
switch (REG_CNT(r)) { \
+ case 4: \
+ _MUL2_x2(VR0(r), VR1(r)); \
+ _MUL2_x2(VR2(r), VR3(r)); \
+ break; \
case 2: \
_MUL2_x2(VR0(r), VR1(r)); \
break; \
@@ -255,7 +237,7 @@ typedef struct v {
#define _MUL_PARAM(x, in, acc) \
{ \
- if (x & 0x01) { COPY(in, acc); } else { XOR(acc, acc); } \
+ if (x & 0x01) { COPY(in, acc); } else { ZERO(acc); } \
if (x & 0xfe) { MUL2(in); } \
if (x & 0x02) { XOR(in, acc); } \
if (x & 0xfc) { MUL2(in); } \
@@ -271,8 +253,8 @@ typedef struct v {
if (x & 0x80) { MUL2(in); XOR(in, acc); } \
}
-#define _mul_x1_in 9
-#define _mul_x1_acc 11
+#define _mul_x1_in 11
+#define _mul_x1_acc 12
#define MUL_x1_DEFINE(x) \
static void \
@@ -533,61 +515,87 @@ gf_x2_mul_fns[256] = {
#define raidz_math_begin() kfpu_begin()
#define raidz_math_end() kfpu_end()
-#define GEN_P_DEFINE() {}
+#define SYN_STRIDE 4
+
+#define ZERO_STRIDE 4
+#define ZERO_DEFINE() {}
+#define ZERO_D 0, 1, 2, 3
+
+#define COPY_STRIDE 4
+#define COPY_DEFINE() {}
+#define COPY_D 0, 1, 2, 3
+
+#define ADD_STRIDE 4
+#define ADD_DEFINE() {}
+#define ADD_D 0, 1, 2, 3
+
+#define MUL_STRIDE 2
+#define MUL_DEFINE() MUL2_SETUP()
+#define MUL_D 0, 1
+
#define GEN_P_STRIDE 4
+#define GEN_P_DEFINE() {}
#define GEN_P_P 0, 1, 2, 3
+#define GEN_PQ_STRIDE 4
#define GEN_PQ_DEFINE() {}
-#define GEN_PQ_STRIDE 2
-#define GEN_PQ_D 0, 1
-#define GEN_PQ_P 2, 3
-#define GEN_PQ_Q 4, 5
+#define GEN_PQ_D 0, 1, 2, 3
+#define GEN_PQ_C 4, 5, 6, 7
+#define GEN_PQR_STRIDE 4
#define GEN_PQR_DEFINE() {}
-#define GEN_PQR_STRIDE 2
-#define GEN_PQR_D 0, 1
-#define GEN_PQR_P 2, 3
-#define GEN_PQR_Q 4, 5
-#define GEN_PQR_R 6, 7
+#define GEN_PQR_D 0, 1, 2, 3
+#define GEN_PQR_C 4, 5, 6, 7
-#define REC_P_DEFINE() {}
-#define REC_P_STRIDE 4
-#define REC_P_X 0, 1, 2, 3
+#define SYN_Q_DEFINE() {}
+#define SYN_Q_D 0, 1, 2, 3
+#define SYN_Q_X 4, 5, 6, 7
-#define REC_Q_DEFINE() {}
-#define REC_Q_STRIDE 2
-#define REC_Q_X 0, 1
+#define SYN_R_DEFINE() {}
+#define SYN_R_D 0, 1, 2, 3
+#define SYN_R_X 4, 5, 6, 7
-#define REC_R_DEFINE() {}
-#define REC_R_STRIDE 2
-#define REC_R_X 0, 1
+#define SYN_PQ_DEFINE() {}
+#define SYN_PQ_D 0, 1, 2, 3
+#define SYN_PQ_X 4, 5, 6, 7
-#define REC_PQ_DEFINE() {}
#define REC_PQ_STRIDE 2
+#define REC_PQ_DEFINE() MUL2_SETUP()
#define REC_PQ_X 0, 1
#define REC_PQ_Y 2, 3
-#define REC_PQ_D 4, 5
+#define REC_PQ_T 4, 5
+
+#define SYN_PR_DEFINE() {}
+#define SYN_PR_D 0, 1, 2, 3
+#define SYN_PR_X 4, 5, 6, 7
-#define REC_PR_DEFINE() {}
#define REC_PR_STRIDE 2
+#define REC_PR_DEFINE() MUL2_SETUP()
#define REC_PR_X 0, 1
#define REC_PR_Y 2, 3
-#define REC_PR_D 4, 5
+#define REC_PR_T 4, 5
+
+#define SYN_QR_DEFINE() {}
+#define SYN_QR_D 0, 1, 2, 3
+#define SYN_QR_X 4, 5, 6, 7
-#define REC_QR_DEFINE() {}
#define REC_QR_STRIDE 2
+#define REC_QR_DEFINE() MUL2_SETUP()
#define REC_QR_X 0, 1
#define REC_QR_Y 2, 3
-#define REC_QR_D 4, 5
+#define REC_QR_T 4, 5
+
+#define SYN_PQR_DEFINE() {}
+#define SYN_PQR_D 0, 1, 2, 3
+#define SYN_PQR_X 4, 5, 6, 7
-#define REC_PQR_DEFINE() {}
#define REC_PQR_STRIDE 1
+#define REC_PQR_DEFINE() MUL2_SETUP()
#define REC_PQR_X 0
#define REC_PQR_Y 1
#define REC_PQR_Z 2
-#define REC_PQR_D 3
-#define REC_PQR_XS 4
-#define REC_PQR_YS 5
+#define REC_PQR_XS 3
+#define REC_PQR_YS 4
#include <sys/vdev_raidz_impl.h>
diff --git a/module/zfs/vdev_raidz_math_ssse3.c b/module/zfs/vdev_raidz_math_ssse3.c
index d93441349..cebb0fe2b 100644
--- a/module/zfs/vdev_raidz_math_ssse3.c
+++ b/module/zfs/vdev_raidz_math_ssse3.c
@@ -66,19 +66,6 @@ typedef struct v {
uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE)));
} v_t;
-#define PREFETCHNTA(ptr, offset) \
-{ \
- __asm( \
- "prefetchnta " #offset "(%[MEM])\n" \
- : : [MEM] "r" (ptr)); \
-}
-
-#define PREFETCH(ptr, offset) \
-{ \
- __asm( \
- "prefetcht0 " #offset "(%[MEM])\n" \
- : : [MEM] "r" (ptr)); \
-}
#define XOR_ACC(src, r...) \
{ \
@@ -122,25 +109,7 @@ typedef struct v {
} \
}
-#define ZERO(r...) \
-{ \
- switch (REG_CNT(r)) { \
- case 4: \
- __asm( \
- "pxor %" VR0(r) ", %" VR0(r) "\n" \
- "pxor %" VR1(r) ", %" VR1(r) "\n" \
- "pxor %" VR2(r) ", %" VR2(r) "\n" \
- "pxor %" VR3(r) ", %" VR3(r)); \
- break; \
- case 2: \
- __asm( \
- "pxor %" VR0(r) ", %" VR0(r) "\n" \
- "pxor %" VR1(r) ", %" VR1(r)); \
- break; \
- default: \
- ASM_BUG(); \
- } \
-}
+#define ZERO(r...) XOR(r, r)
#define COPY(r...) \
{ \
@@ -337,59 +306,86 @@ typedef struct v {
#define raidz_math_begin() kfpu_begin()
#define raidz_math_end() kfpu_end()
-#define GEN_P_DEFINE() {}
+
+#define SYN_STRIDE 4
+
+#define ZERO_STRIDE 4
+#define ZERO_DEFINE() {}
+#define ZERO_D 0, 1, 2, 3
+
+#define COPY_STRIDE 4
+#define COPY_DEFINE() {}
+#define COPY_D 0, 1, 2, 3
+
+#define ADD_STRIDE 4
+#define ADD_DEFINE() {}
+#define ADD_D 0, 1, 2, 3
+
+#define MUL_STRIDE 4
+#define MUL_DEFINE() {}
+#define MUL_D 0, 1, 2, 3
+
#define GEN_P_STRIDE 4
+#define GEN_P_DEFINE() {}
#define GEN_P_P 0, 1, 2, 3
-#define GEN_PQ_DEFINE() {}
#define GEN_PQ_STRIDE 4
+#define GEN_PQ_DEFINE() {}
#define GEN_PQ_D 0, 1, 2, 3
-#define GEN_PQ_P 4, 5, 6, 7
-#define GEN_PQ_Q 8, 9, 10, 11
+#define GEN_PQ_C 4, 5, 6, 7
+#define GEN_PQR_STRIDE 4
#define GEN_PQR_DEFINE() {}
-#define GEN_PQR_STRIDE 2
-#define GEN_PQR_D 0, 1
-#define GEN_PQR_P 2, 3
-#define GEN_PQR_Q 4, 5
-#define GEN_PQR_R 6, 7
+#define GEN_PQR_D 0, 1, 2, 3
+#define GEN_PQR_C 4, 5, 6, 7
-#define REC_P_DEFINE() {}
-#define REC_P_STRIDE 4
-#define REC_P_X 0, 1, 2, 3
+#define SYN_Q_DEFINE() {}
+#define SYN_Q_D 0, 1, 2, 3
+#define SYN_Q_X 4, 5, 6, 7
-#define REC_Q_DEFINE() {}
-#define REC_Q_STRIDE 4
-#define REC_Q_X 0, 1, 2, 3
+#define SYN_R_DEFINE() {}
+#define SYN_R_D 0, 1, 2, 3
+#define SYN_R_X 4, 5, 6, 7
-#define REC_R_DEFINE() {}
-#define REC_R_STRIDE 4
-#define REC_R_X 0, 1, 2, 3
+#define SYN_PQ_DEFINE() {}
+#define SYN_PQ_D 0, 1, 2, 3
+#define SYN_PQ_X 4, 5, 6, 7
-#define REC_PQ_DEFINE() {}
#define REC_PQ_STRIDE 2
+#define REC_PQ_DEFINE() {}
#define REC_PQ_X 0, 1
#define REC_PQ_Y 2, 3
-#define REC_PQ_D 4, 5
+#define REC_PQ_T 4, 5
+
+#define SYN_PR_DEFINE() {}
+#define SYN_PR_D 0, 1, 2, 3
+#define SYN_PR_X 4, 5, 6, 7
-#define REC_PR_DEFINE() {}
#define REC_PR_STRIDE 2
+#define REC_PR_DEFINE() {}
#define REC_PR_X 0, 1
#define REC_PR_Y 2, 3
-#define REC_PR_D 4, 5
+#define REC_PR_T 4, 5
+
+#define SYN_QR_DEFINE() {}
+#define SYN_QR_D 0, 1, 2, 3
+#define SYN_QR_X 4, 5, 6, 7
-#define REC_QR_DEFINE() {}
#define REC_QR_STRIDE 2
+#define REC_QR_DEFINE() {}
#define REC_QR_X 0, 1
#define REC_QR_Y 2, 3
-#define REC_QR_D 4, 5
+#define REC_QR_T 4, 5
+
+#define SYN_PQR_DEFINE() {}
+#define SYN_PQR_D 0, 1, 2, 3
+#define SYN_PQR_X 4, 5, 6, 7
-#define REC_PQR_DEFINE() {}
#define REC_PQR_STRIDE 2
+#define REC_PQR_DEFINE() {}
#define REC_PQR_X 0, 1
#define REC_PQR_Y 2, 3
#define REC_PQR_Z 4, 5
-#define REC_PQR_D 6, 7
#define REC_PQR_XS 6, 7
#define REC_PQR_YS 8, 9
@@ -419,7 +415,8 @@ const raidz_impl_ops_t vdev_raidz_ssse3_impl = {
#endif /* defined(__x86_64) && defined(HAVE_SSSE3) */
-#if defined(__x86_64) && (defined(HAVE_SSSE3) || defined(HAVE_AVX2))
+#if defined(__x86_64)
+#if defined(HAVE_SSSE3) || defined(HAVE_AVX2) || defined(HAVE_AVX512BW)
const uint8_t
__attribute__((aligned(256))) gf_clmul_mod_lt[4*256][16] = {
@@ -2473,4 +2470,5 @@ __attribute__((aligned(256))) gf_clmul_mod_lt[4*256][16] = {
0xf8, 0x07, 0x06, 0xf9, 0x04, 0xfb, 0xfa, 0x05 }
};
-#endif /* defined(__x86_64) && (defined(HAVE_SSSE3) || defined(HAVE_AVX2)) */
+#endif /* defined(HAVE_SSSE3) || defined(HAVE_AVX2) || defined(HAVE_AVX512BW) */
+#endif /* defined(__x86_64) */
diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c
index a15e22c5b..131c33e29 100644
--- a/module/zfs/zap_micro.c
+++ b/module/zfs/zap_micro.c
@@ -584,7 +584,7 @@ mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags)
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
sz = zap->zap_dbuf->db_size;
- mzp = zio_buf_alloc(sz);
+ mzp = vmem_alloc(sz, KM_SLEEP);
bcopy(zap->zap_dbuf->db_data, mzp, sz);
nchunks = zap->zap_m.zap_num_chunks;
@@ -592,7 +592,7 @@ mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags)
err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
1ULL << fzap_default_block_shift, 0, tx);
if (err) {
- zio_buf_free(mzp, sz);
+ vmem_free(mzp, sz);
return (err);
}
}
@@ -619,7 +619,7 @@ mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags)
if (err)
break;
}
- zio_buf_free(mzp, sz);
+ vmem_free(mzp, sz);
*zapp = zap;
return (err);
}
diff --git a/module/zfs/zfs_sa.c b/module/zfs/zfs_sa.c
index 33b767808..e32786902 100644
--- a/module/zfs/zfs_sa.c
+++ b/module/zfs/zfs_sa.c
@@ -203,13 +203,13 @@ zfs_sa_get_xattr(znode_t *zp)
return (error);
}
- obj = zio_buf_alloc(size);
+ obj = vmem_alloc(size, KM_SLEEP);
error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DXATTR(zsb), obj, size);
if (error == 0)
error = nvlist_unpack(obj, size, &zp->z_xattr_cached, KM_SLEEP);
- zio_buf_free(obj, size);
+ vmem_free(obj, size);
return (error);
}
@@ -233,7 +233,7 @@ zfs_sa_set_xattr(znode_t *zp)
if (error)
goto out;
- obj = zio_buf_alloc(size);
+ obj = vmem_alloc(size, KM_SLEEP);
error = nvlist_pack(zp->z_xattr_cached, &obj, &size,
NV_ENCODE_XDR, KM_SLEEP);
@@ -253,7 +253,7 @@ zfs_sa_set_xattr(znode_t *zp)
dmu_tx_commit(tx);
}
out_free:
- zio_buf_free(obj, size);
+ vmem_free(obj, size);
out:
return (error);
}
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index 760f0a891..b2d07166e 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -40,6 +40,7 @@
#include <sys/dsl_pool.h>
#include <sys/metaslab.h>
#include <sys/trace_zil.h>
+#include <sys/abd.h>
/*
* The zfs intent log (ZIL) saves transaction records of system calls
@@ -878,6 +879,7 @@ zil_lwb_write_done(zio_t *zio)
* one in zil_commit_writer(). zil_sync() will only remove
* the lwb if lwb_buf is null.
*/
+ abd_put(zio->io_abd);
zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
mutex_enter(&zilog->zl_lock);
lwb->lwb_zio = NULL;
@@ -914,12 +916,14 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
/* Lock so zil_sync() doesn't fastwrite_unmark after zio is created */
mutex_enter(&zilog->zl_lock);
if (lwb->lwb_zio == NULL) {
+ abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf,
+ BP_GET_LSIZE(&lwb->lwb_blk));
if (!lwb->lwb_fastwrite) {
metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk);
lwb->lwb_fastwrite = 1;
}
lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
- 0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk),
+ 0, &lwb->lwb_blk, lwb_abd, BP_GET_LSIZE(&lwb->lwb_blk),
zil_lwb_write_done, lwb, ZIO_PRIORITY_SYNC_WRITE,
ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
ZIO_FLAG_FASTWRITE, &zb);
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 223c20abe..e9d08093e 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -42,6 +42,7 @@
#include <sys/metaslab_impl.h>
#include <sys/time.h>
#include <sys/trace_zio.h>
+#include <sys/abd.h>
/*
* ==========================================================================
@@ -56,7 +57,7 @@ const char *zio_type_name[ZIO_TYPES] = {
"z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl"
};
-int zio_dva_throttle_enabled = B_FALSE;
+int zio_dva_throttle_enabled = B_TRUE;
/*
* ==========================================================================
@@ -67,6 +68,11 @@ kmem_cache_t *zio_cache;
kmem_cache_t *zio_link_cache;
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+#if defined(ZFS_DEBUG) && !defined(_KERNEL)
+uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+#endif
+
int zio_delay_max = ZIO_DELAY_MAX;
#define ZIO_PIPELINE_CONTINUE 0x100
@@ -212,6 +218,13 @@ zio_fini(void)
if (((c + 1) << SPA_MINBLOCKSHIFT) > zfs_max_recordsize)
break;
#endif
+#if defined(ZFS_DEBUG) && !defined(_KERNEL)
+ if (zio_buf_cache_allocs[c] != zio_buf_cache_frees[c])
+ (void) printf("zio_fini: [%d] %llu != %llu\n",
+ (int)((c + 1) << SPA_MINBLOCKSHIFT),
+ (long long unsigned)zio_buf_cache_allocs[c],
+ (long long unsigned)zio_buf_cache_frees[c]);
+#endif
if (zio_buf_cache[c] != last_cache) {
last_cache = zio_buf_cache[c];
kmem_cache_destroy(zio_buf_cache[c]);
@@ -251,6 +264,9 @@ zio_buf_alloc(size_t size)
size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+#if defined(ZFS_DEBUG) && !defined(_KERNEL)
+ atomic_add_64(&zio_buf_cache_allocs[c], 1);
+#endif
return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
}
@@ -271,26 +287,15 @@ zio_data_buf_alloc(size_t size)
return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
}
-/*
- * Use zio_buf_alloc_flags when specific allocation flags are needed. e.g.
- * passing KM_NOSLEEP when it is acceptable for an allocation to fail.
- */
-void *
-zio_buf_alloc_flags(size_t size, int flags)
-{
- size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
-
- VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
-
- return (kmem_cache_alloc(zio_buf_cache[c], flags));
-}
-
void
zio_buf_free(void *buf, size_t size)
{
size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+#if defined(ZFS_DEBUG) && !defined(_KERNEL)
+ atomic_add_64(&zio_buf_cache_frees[c], 1);
+#endif
kmem_cache_free(zio_buf_cache[c], buf);
}
@@ -311,12 +316,18 @@ zio_data_buf_free(void *buf, size_t size)
* ==========================================================================
*/
void
-zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
+zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize,
zio_transform_func_t *transform)
{
zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
- zt->zt_orig_data = zio->io_data;
+ /*
+ * Ensure that anyone expecting this zio to contain a linear ABD isn't
+ * going to get a nasty surprise when they try to access the data.
+ */
+ IMPLY(abd_is_linear(zio->io_abd), abd_is_linear(data));
+
+ zt->zt_orig_abd = zio->io_abd;
zt->zt_orig_size = zio->io_size;
zt->zt_bufsize = bufsize;
zt->zt_transform = transform;
@@ -324,7 +335,7 @@ zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
zt->zt_next = zio->io_transform_stack;
zio->io_transform_stack = zt;
- zio->io_data = data;
+ zio->io_abd = data;
zio->io_size = size;
}
@@ -336,12 +347,12 @@ zio_pop_transforms(zio_t *zio)
while ((zt = zio->io_transform_stack) != NULL) {
if (zt->zt_transform != NULL)
zt->zt_transform(zio,
- zt->zt_orig_data, zt->zt_orig_size);
+ zt->zt_orig_abd, zt->zt_orig_size);
if (zt->zt_bufsize != 0)
- zio_buf_free(zio->io_data, zt->zt_bufsize);
+ abd_free(zio->io_abd);
- zio->io_data = zt->zt_orig_data;
+ zio->io_abd = zt->zt_orig_abd;
zio->io_size = zt->zt_orig_size;
zio->io_transform_stack = zt->zt_next;
@@ -355,21 +366,26 @@ zio_pop_transforms(zio_t *zio)
* ==========================================================================
*/
static void
-zio_subblock(zio_t *zio, void *data, uint64_t size)
+zio_subblock(zio_t *zio, abd_t *data, uint64_t size)
{
ASSERT(zio->io_size > size);
if (zio->io_type == ZIO_TYPE_READ)
- bcopy(zio->io_data, data, size);
+ abd_copy(data, zio->io_abd, size);
}
static void
-zio_decompress(zio_t *zio, void *data, uint64_t size)
+zio_decompress(zio_t *zio, abd_t *data, uint64_t size)
{
- if (zio->io_error == 0 &&
- zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
- zio->io_data, data, zio->io_size, size) != 0)
- zio->io_error = SET_ERROR(EIO);
+ if (zio->io_error == 0) {
+ void *tmp = abd_borrow_buf(data, size);
+ int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
+ zio->io_abd, tmp, zio->io_size, size);
+ abd_return_buf_copy(data, tmp, size);
+
+ if (ret != 0)
+ zio->io_error = SET_ERROR(EIO);
+ }
}
/*
@@ -552,7 +568,7 @@ zio_timestamp_compare(const void *x1, const void *x2)
*/
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
- void *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done,
+ abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done,
void *private, zio_type_t type, zio_priority_t priority,
enum zio_flag flags, vdev_t *vd, uint64_t offset,
const zbookmark_phys_t *zb, enum zio_stage stage,
@@ -611,7 +627,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio->io_priority = priority;
zio->io_vd = vd;
zio->io_offset = offset;
- zio->io_orig_data = zio->io_data = data;
+ zio->io_orig_abd = zio->io_abd = data;
zio->io_orig_size = zio->io_size = psize;
zio->io_lsize = lsize;
zio->io_orig_flags = zio->io_flags = flags;
@@ -755,7 +771,7 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp)
zio_t *
zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
- void *data, uint64_t size, zio_done_func_t *done, void *private,
+ abd_t *data, uint64_t size, zio_done_func_t *done, void *private,
zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
{
zio_t *zio;
@@ -773,7 +789,7 @@ zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- void *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp,
+ abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp,
zio_done_func_t *ready, zio_done_func_t *children_ready,
zio_done_func_t *physdone, zio_done_func_t *done,
void *private, zio_priority_t priority, enum zio_flag flags,
@@ -814,7 +830,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
}
zio_t *
-zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
+zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data,
uint64_t size, zio_done_func_t *done, void *private,
zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
{
@@ -967,7 +983,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
- void *data, int checksum, zio_done_func_t *done, void *private,
+ abd_t *data, int checksum, zio_done_func_t *done, void *private,
zio_priority_t priority, enum zio_flag flags, boolean_t labels)
{
zio_t *zio;
@@ -988,7 +1004,7 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
- void *data, int checksum, zio_done_func_t *done, void *private,
+ abd_t *data, int checksum, zio_done_func_t *done, void *private,
zio_priority_t priority, enum zio_flag flags, boolean_t labels)
{
zio_t *zio;
@@ -1011,8 +1027,9 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
* Therefore, we must make a local copy in case the data is
* being written to multiple places in parallel.
*/
- void *wbuf = zio_buf_alloc(size);
- bcopy(data, wbuf, size);
+ abd_t *wbuf = abd_alloc_sametype(data, size);
+ abd_copy(wbuf, data, size);
+
zio_push_transform(zio, wbuf, size, size, NULL);
}
@@ -1024,7 +1041,7 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
*/
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
- void *data, uint64_t size, int type, zio_priority_t priority,
+ abd_t *data, uint64_t size, int type, zio_priority_t priority,
enum zio_flag flags, zio_done_func_t *done, void *private)
{
enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
@@ -1090,7 +1107,7 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
}
zio_t *
-zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
+zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size,
int type, zio_priority_t priority, enum zio_flag flags,
zio_done_func_t *done, void *private)
{
@@ -1151,14 +1168,17 @@ zio_read_bp_init(zio_t *zio)
!(zio->io_flags & ZIO_FLAG_RAW)) {
uint64_t psize =
BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
- void *cbuf = zio_buf_alloc(psize);
-
- zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
+ zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize),
+ psize, psize, zio_decompress);
}
if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
+ int psize = BPE_GET_PSIZE(bp);
+ void *data = abd_borrow_buf(zio->io_abd, psize);
+
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
- decode_embedded_bp_compressed(bp, zio->io_data);
+ decode_embedded_bp_compressed(bp, data);
+ abd_return_buf_copy(zio->io_abd, data, psize);
} else {
ASSERT(!BP_IS_EMBEDDED(bp));
}
@@ -1299,7 +1319,7 @@ zio_write_compress(zio_t *zio)
/* If it's a compressed write that is not raw, compress the buffer. */
if (compress != ZIO_COMPRESS_OFF && psize == lsize) {
void *cbuf = zio_buf_alloc(lsize);
- psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
+ psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize);
if (psize == 0 || psize == lsize) {
compress = ZIO_COMPRESS_OFF;
zio_buf_free(cbuf, lsize);
@@ -1337,9 +1357,11 @@ zio_write_compress(zio_t *zio)
zio_buf_free(cbuf, lsize);
psize = lsize;
} else {
- bzero((char *)cbuf + psize, rounded - psize);
+ abd_t *cdata = abd_get_from_buf(cbuf, lsize);
+ abd_take_ownership_of_buf(cdata, B_TRUE);
+ abd_zero_off(cdata, psize, rounded - psize);
psize = rounded;
- zio_push_transform(zio, cbuf,
+ zio_push_transform(zio, cdata,
psize, lsize, NULL);
}
}
@@ -1942,26 +1964,38 @@ zio_resume_wait(spa_t *spa)
* ==========================================================================
*/
+static void
+zio_gang_issue_func_done(zio_t *zio)
+{
+ abd_put(zio->io_abd);
+}
+
static zio_t *
-zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
+zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
+ uint64_t offset)
{
if (gn != NULL)
return (pio);
- return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
- NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
+ return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset),
+ BP_GET_PSIZE(bp), zio_gang_issue_func_done,
+ NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
&pio->io_bookmark));
}
-zio_t *
-zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
+static zio_t *
+zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
+ uint64_t offset)
{
zio_t *zio;
if (gn != NULL) {
+ abd_t *gbh_abd =
+ abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
- gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
- ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
+ gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL,
+ pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
+ &pio->io_bookmark);
/*
* As we rewrite each gang header, the pipeline will compute
* a new gang block header checksum for it; but no one will
@@ -1972,8 +2006,12 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
* this is just good hygiene.)
*/
if (gn != pio->io_gang_leader->io_gang_tree) {
+ abd_t *buf = abd_get_offset(data, offset);
+
zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
- data, BP_GET_PSIZE(bp));
+ buf, BP_GET_PSIZE(bp));
+
+ abd_put(buf);
}
/*
* If we are here to damage data for testing purposes,
@@ -1983,7 +2021,8 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
} else {
zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
- data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
+ abd_get_offset(data, offset), BP_GET_PSIZE(bp),
+ zio_gang_issue_func_done, NULL, pio->io_priority,
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
}
@@ -1991,16 +2030,18 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
}
/* ARGSUSED */
-zio_t *
-zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
+static zio_t *
+zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
+ uint64_t offset)
{
return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
ZIO_GANG_CHILD_FLAGS(pio)));
}
/* ARGSUSED */
-zio_t *
-zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
+static zio_t *
+zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
+ uint64_t offset)
{
return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
@@ -2064,13 +2105,14 @@ static void
zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
{
zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
+ abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
ASSERT(gio->io_gang_leader == gio);
ASSERT(BP_IS_GANG(bp));
- zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
- SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
- gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
+ zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE,
+ zio_gang_tree_assemble_done, gn, gio->io_priority,
+ ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
}
static void
@@ -2087,13 +2129,16 @@ zio_gang_tree_assemble_done(zio_t *zio)
if (zio->io_error)
return;
+ /* this ABD was created from a linear buf in zio_gang_tree_assemble */
if (BP_SHOULD_BYTESWAP(bp))
- byteswap_uint64_array(zio->io_data, zio->io_size);
+ byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size);
- ASSERT(zio->io_data == gn->gn_gbh);
+ ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh);
ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
+ abd_put(zio->io_abd);
+
for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
if (!BP_IS_GANG(gbp))
@@ -2103,7 +2148,8 @@ zio_gang_tree_assemble_done(zio_t *zio)
}
static void
-zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
+zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data,
+ uint64_t offset)
{
zio_t *gio = pio->io_gang_leader;
zio_t *zio;
@@ -2117,7 +2163,7 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
* If you're a gang header, your data is in gn->gn_gbh.
* If you're a gang member, your data is in 'data' and gn == NULL.
*/
- zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);
+ zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset);
if (gn != NULL) {
ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
@@ -2126,13 +2172,14 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
if (BP_IS_HOLE(gbp))
continue;
- zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
- data = (char *)data + BP_GET_PSIZE(gbp);
+ zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data,
+ offset);
+ offset += BP_GET_PSIZE(gbp);
}
}
if (gn == gio->io_gang_tree)
- ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);
+ ASSERT3U(gio->io_size, ==, offset);
if (zio != pio)
zio_nowait(zio);
@@ -2165,7 +2212,8 @@ zio_gang_issue(zio_t *zio)
ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
- zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
+ zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd,
+ 0);
else
zio_gang_tree_free(&zio->io_gang_tree);
@@ -2205,6 +2253,12 @@ zio_write_gang_member_ready(zio_t *zio)
mutex_exit(&pio->io_lock);
}
+static void
+zio_write_gang_done(zio_t *zio)
+{
+ abd_put(zio->io_abd);
+}
+
static int
zio_write_gang_block(zio_t *pio)
{
@@ -2215,6 +2269,7 @@ zio_write_gang_block(zio_t *pio)
zio_t *zio;
zio_gang_node_t *gn, **gnpp;
zio_gbh_phys_t *gbh;
+ abd_t *gbh_abd;
uint64_t txg = pio->io_txg;
uint64_t resid = pio->io_size;
uint64_t lsize;
@@ -2275,12 +2330,14 @@ zio_write_gang_block(zio_t *pio)
gn = zio_gang_node_alloc(gnpp);
gbh = gn->gn_gbh;
bzero(gbh, SPA_GANGBLOCKSIZE);
+ gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE);
/*
* Create the gang header.
*/
- zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
- pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
+ zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE,
+ zio_write_gang_done, NULL, pio->io_priority,
+ ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
/*
* Create and nowait the gang children.
@@ -2302,9 +2359,9 @@ zio_write_gang_block(zio_t *pio)
zp.zp_nopwrite = B_FALSE;
cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
- (char *)pio->io_data + (pio->io_size - resid), lsize,
- lsize, &zp, zio_write_gang_member_ready, NULL, NULL, NULL,
- &gn->gn_child[g], pio->io_priority,
+ abd_get_offset(pio->io_abd, pio->io_size - resid), lsize,
+ lsize, &zp, zio_write_gang_member_ready, NULL, NULL,
+ zio_write_gang_done, &gn->gn_child[g], pio->io_priority,
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
@@ -2320,7 +2377,6 @@ zio_write_gang_block(zio_t *pio)
zp.zp_copies, cio, flags));
}
zio_nowait(cio);
-
}
/*
@@ -2423,10 +2479,11 @@ zio_ddt_child_read_done(zio_t *zio)
ddp = ddt_phys_select(dde, bp);
if (zio->io_error == 0)
ddt_phys_clear(ddp); /* this ddp doesn't need repair */
- if (zio->io_error == 0 && dde->dde_repair_data == NULL)
- dde->dde_repair_data = zio->io_data;
+
+ if (zio->io_error == 0 && dde->dde_repair_abd == NULL)
+ dde->dde_repair_abd = zio->io_abd;
else
- zio_buf_free(zio->io_data, zio->io_size);
+ abd_free(zio->io_abd);
mutex_exit(&pio->io_lock);
}
@@ -2459,16 +2516,16 @@ zio_ddt_read_start(zio_t *zio)
ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
&blk);
zio_nowait(zio_read(zio, zio->io_spa, &blk,
- zio_buf_alloc(zio->io_size), zio->io_size,
- zio_ddt_child_read_done, dde, zio->io_priority,
- ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
- &zio->io_bookmark));
+ abd_alloc_for_io(zio->io_size, B_TRUE),
+ zio->io_size, zio_ddt_child_read_done, dde,
+ zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) |
+ ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark));
}
return (ZIO_PIPELINE_CONTINUE);
}
zio_nowait(zio_read(zio, zio->io_spa, bp,
- zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
+ zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority,
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
return (ZIO_PIPELINE_CONTINUE);
@@ -2498,8 +2555,9 @@ zio_ddt_read_done(zio_t *zio)
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
return (ZIO_PIPELINE_STOP);
}
- if (dde->dde_repair_data != NULL) {
- bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
+ if (dde->dde_repair_abd != NULL) {
+ abd_copy(zio->io_abd, dde->dde_repair_abd,
+ zio->io_size);
zio->io_child_error[ZIO_CHILD_DDT] = 0;
}
ddt_repair_done(ddt, dde);
@@ -2537,12 +2595,10 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
if (lio != NULL && do_raw) {
return (lio->io_size != zio->io_size ||
- bcmp(zio->io_data, lio->io_data,
- zio->io_size) != 0);
+ abd_cmp(zio->io_abd, lio->io_abd) != 0);
} else if (lio != NULL) {
return (lio->io_orig_size != zio->io_orig_size ||
- bcmp(zio->io_orig_data, lio->io_orig_data,
- zio->io_orig_size) != 0);
+ abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0);
}
}
@@ -2552,7 +2608,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
if (ddp->ddp_phys_birth != 0 && do_raw) {
blkptr_t blk = *zio->io_bp;
uint64_t psize;
- void *tmpbuf;
+ abd_t *tmpabd;
int error;
ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
@@ -2563,19 +2619,19 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
ddt_exit(ddt);
- tmpbuf = zio_buf_alloc(psize);
+ tmpabd = abd_alloc_for_io(psize, B_TRUE);
- error = zio_wait(zio_read(NULL, spa, &blk, tmpbuf,
+ error = zio_wait(zio_read(NULL, spa, &blk, tmpabd,
psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
ZIO_FLAG_RAW, &zio->io_bookmark));
if (error == 0) {
- if (bcmp(tmpbuf, zio->io_data, psize) != 0)
+ if (abd_cmp(tmpabd, zio->io_abd) != 0)
error = SET_ERROR(ENOENT);
}
- zio_buf_free(tmpbuf, psize);
+ abd_free(tmpabd);
ddt_enter(ddt);
return (error != 0);
} else if (ddp->ddp_phys_birth != 0) {
@@ -2597,7 +2653,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
&aflags, &zio->io_bookmark);
if (error == 0) {
- if (bcmp(abuf->b_data, zio->io_orig_data,
+ if (abd_cmp_buf(zio->io_orig_abd, abuf->b_data,
zio->io_orig_size) != 0)
error = SET_ERROR(ENOENT);
arc_buf_destroy(abuf, &abuf);
@@ -2762,12 +2818,12 @@ zio_ddt_write(zio_t *zio)
return (ZIO_PIPELINE_CONTINUE);
}
- dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
+ dio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
zio->io_orig_size, zio->io_orig_size, &czp, NULL, NULL,
NULL, zio_ddt_ditto_write_done, dde, zio->io_priority,
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
- zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
+ zio_push_transform(dio, zio->io_abd, zio->io_size, 0, NULL);
dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
}
@@ -2784,13 +2840,13 @@ zio_ddt_write(zio_t *zio)
ddt_phys_fill(ddp, bp);
ddt_phys_addref(ddp);
} else {
- cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
+ cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
zio->io_orig_size, zio->io_orig_size, zp,
zio_ddt_child_write_ready, NULL, NULL,
zio_ddt_child_write_done, dde, zio->io_priority,
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
- zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
+ zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL);
dde->dde_lead_zio[p] = cio;
}
@@ -3130,11 +3186,11 @@ zio_vdev_io_start(zio_t *zio)
P2PHASE(zio->io_size, align) != 0) {
/* Transform logical writes to be a full physical block size. */
uint64_t asize = P2ROUNDUP(zio->io_size, align);
- char *abuf = zio_buf_alloc(asize);
+ abd_t *abuf = abd_alloc_sametype(zio->io_abd, asize);
ASSERT(vd == vd->vdev_top);
if (zio->io_type == ZIO_TYPE_WRITE) {
- bcopy(zio->io_data, abuf, zio->io_size);
- bzero(abuf + zio->io_size, asize - zio->io_size);
+ abd_copy(abuf, zio->io_abd, zio->io_size);
+ abd_zero_off(abuf, zio->io_size, asize - zio->io_size);
}
zio_push_transform(zio, abuf, asize, asize, zio_subblock);
}
@@ -3264,7 +3320,7 @@ zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
{
void *buf = zio_buf_alloc(zio->io_size);
- bcopy(zio->io_data, buf, zio->io_size);
+ abd_copy_to_buf(buf, zio->io_abd, zio->io_size);
zcr->zcr_cbinfo = zio->io_size;
zcr->zcr_cbdata = buf;
@@ -3398,7 +3454,7 @@ zio_checksum_generate(zio_t *zio)
}
}
- zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size);
+ zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size);
return (ZIO_PIPELINE_CONTINUE);
}
@@ -3537,7 +3593,7 @@ zio_ready(zio_t *zio)
if (BP_IS_GANG(bp)) {
zio->io_flags &= ~ZIO_FLAG_NODATA;
} else {
- ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE);
+ ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE);
zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
}
}
@@ -3616,6 +3672,7 @@ zio_done(zio_t *zio)
* Always attempt to keep stack usage minimal here since
* we can be called recurisvely up to 19 levels deep.
*/
+ uint64_t psize = zio->io_size;
zio_t *pio, *pio_next;
int c, w;
zio_link_t *zl = NULL;
@@ -3696,28 +3753,35 @@ zio_done(zio_t *zio)
while (zio->io_cksum_report != NULL) {
zio_cksum_report_t *zcr = zio->io_cksum_report;
uint64_t align = zcr->zcr_align;
- uint64_t asize = P2ROUNDUP(zio->io_size, align);
- char *abuf = zio->io_data;
-
- if (asize != zio->io_size) {
- abuf = zio_buf_alloc(asize);
- bcopy(zio->io_data, abuf, zio->io_size);
- bzero(abuf+zio->io_size, asize-zio->io_size);
+ uint64_t asize = P2ROUNDUP(psize, align);
+ char *abuf = NULL;
+ abd_t *adata = zio->io_abd;
+
+ if (asize != psize) {
+ adata = abd_alloc_linear(asize, B_TRUE);
+ abd_copy(adata, zio->io_abd, psize);
+ abd_zero_off(adata, psize, asize - psize);
}
+ if (adata != NULL)
+ abuf = abd_borrow_buf_copy(adata, asize);
+
zio->io_cksum_report = zcr->zcr_next;
zcr->zcr_next = NULL;
zcr->zcr_finish(zcr, abuf);
zfs_ereport_free_checksum(zcr);
- if (asize != zio->io_size)
- zio_buf_free(abuf, asize);
+ if (adata != NULL)
+ abd_return_buf(adata, abuf, asize);
+
+ if (asize != psize)
+ abd_free(adata);
}
}
zio_pop_transforms(zio); /* note: may set zio->io_error */
- vdev_stat_update(zio, zio->io_size);
+ vdev_stat_update(zio, psize);
/*
* If this I/O is attached to a particular vdev is slow, exceeding
@@ -4098,7 +4162,6 @@ zbookmark_subtree_completed(const dnode_phys_t *dnp,
EXPORT_SYMBOL(zio_type_name);
EXPORT_SYMBOL(zio_buf_alloc);
EXPORT_SYMBOL(zio_data_buf_alloc);
-EXPORT_SYMBOL(zio_buf_alloc_flags);
EXPORT_SYMBOL(zio_buf_free);
EXPORT_SYMBOL(zio_data_buf_free);
diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c
index d3d2f05a8..37116f049 100644
--- a/module/zfs/zio_checksum.c
+++ b/module/zfs/zio_checksum.c
@@ -20,8 +20,8 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -30,6 +30,7 @@
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zil.h>
+#include <sys/abd.h>
#include <zfs_fletcher.h>
/*
@@ -92,45 +93,85 @@
/*ARGSUSED*/
static void
-zio_checksum_off(const void *buf, uint64_t size,
- const void *ctx_template, zio_cksum_t *zcp)
+abd_checksum_off(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
{
ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}
+/*ARGSUSED*/
+void
+abd_fletcher_2_native(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ fletcher_init(zcp);
+ (void) abd_iterate_func(abd, 0, size,
+ fletcher_2_incremental_native, zcp);
+}
+
+/*ARGSUSED*/
+void
+abd_fletcher_2_byteswap(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ fletcher_init(zcp);
+ (void) abd_iterate_func(abd, 0, size,
+ fletcher_2_incremental_byteswap, zcp);
+}
+
+/*ARGSUSED*/
+void
+abd_fletcher_4_native(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ fletcher_init(zcp);
+ (void) abd_iterate_func(abd, 0, size,
+ fletcher_4_incremental_native, zcp);
+}
+
+/*ARGSUSED*/
+void
+abd_fletcher_4_byteswap(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ fletcher_init(zcp);
+ (void) abd_iterate_func(abd, 0, size,
+ fletcher_4_incremental_byteswap, zcp);
+}
+
zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
{{NULL, NULL}, NULL, NULL, 0, "inherit"},
{{NULL, NULL}, NULL, NULL, 0, "on"},
- {{zio_checksum_off, zio_checksum_off},
+ {{abd_checksum_off, abd_checksum_off},
NULL, NULL, 0, "off"},
- {{zio_checksum_SHA256, zio_checksum_SHA256},
+ {{abd_checksum_SHA256, abd_checksum_SHA256},
NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
"label"},
- {{zio_checksum_SHA256, zio_checksum_SHA256},
+ {{abd_checksum_SHA256, abd_checksum_SHA256},
NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
"gang_header"},
- {{fletcher_2_native, fletcher_2_byteswap},
+ {{abd_fletcher_2_native, abd_fletcher_2_byteswap},
NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
- {{fletcher_2_native, fletcher_2_byteswap},
+ {{abd_fletcher_2_native, abd_fletcher_2_byteswap},
NULL, NULL, 0, "fletcher2"},
- {{fletcher_4_native, fletcher_4_byteswap},
+ {{abd_fletcher_4_native, abd_fletcher_4_byteswap},
NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"},
- {{zio_checksum_SHA256, zio_checksum_SHA256},
+ {{abd_checksum_SHA256, abd_checksum_SHA256},
NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
ZCHECKSUM_FLAG_NOPWRITE, "sha256"},
- {{fletcher_4_native, fletcher_4_byteswap},
+ {{abd_fletcher_4_native, abd_fletcher_4_byteswap},
NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog2"},
- {{zio_checksum_off, zio_checksum_off},
+ {{abd_checksum_off, abd_checksum_off},
NULL, NULL, 0, "noparity"},
- {{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap},
+ {{abd_checksum_SHA512_native, abd_checksum_SHA512_byteswap},
NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
ZCHECKSUM_FLAG_NOPWRITE, "sha512"},
- {{zio_checksum_skein_native, zio_checksum_skein_byteswap},
- zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
+ {{abd_checksum_skein_native, abd_checksum_skein_byteswap},
+ abd_checksum_skein_tmpl_init, abd_checksum_skein_tmpl_free,
ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
- {{zio_checksum_edonr_native, zio_checksum_edonr_byteswap},
- zio_checksum_edonr_tmpl_init, zio_checksum_edonr_tmpl_free,
+ {{abd_checksum_edonr_native, abd_checksum_edonr_byteswap},
+ abd_checksum_edonr_tmpl_init, abd_checksum_edonr_tmpl_free,
ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED |
ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
};
@@ -251,7 +292,7 @@ zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
*/
void
zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
- void *data, uint64_t size)
+ abd_t *abd, uint64_t size)
{
blkptr_t *bp = zio->io_bp;
uint64_t offset = zio->io_offset;
@@ -266,6 +307,7 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
zio_eck_t *eck;
+ void *data = abd_to_buf(abd);
if (checksum == ZIO_CHECKSUM_ZILOG2) {
zil_chain_t *zilc = data;
@@ -283,18 +325,18 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
else
bp->blk_cksum = eck->zec_cksum;
eck->zec_magic = ZEC_MAGIC;
- ci->ci_func[0](data, size, spa->spa_cksum_tmpls[checksum],
+ ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum],
&cksum);
eck->zec_cksum = cksum;
} else {
- ci->ci_func[0](data, size, spa->spa_cksum_tmpls[checksum],
+ ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum],
&bp->blk_cksum);
}
}
int
zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum,
- void *data, uint64_t size, uint64_t offset, zio_bad_cksum_t *info)
+ abd_t *abd, uint64_t size, uint64_t offset, zio_bad_cksum_t *info)
{
zio_checksum_info_t *ci = &zio_checksum_table[checksum];
int byteswap;
@@ -308,25 +350,32 @@ zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum,
if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
zio_eck_t *eck;
zio_cksum_t verifier;
+ size_t eck_offset;
+ uint64_t data_size = size;
+ void *data = abd_borrow_buf_copy(abd, data_size);
if (checksum == ZIO_CHECKSUM_ZILOG2) {
zil_chain_t *zilc = data;
uint64_t nused;
eck = &zilc->zc_eck;
- if (eck->zec_magic == ZEC_MAGIC)
+ if (eck->zec_magic == ZEC_MAGIC) {
nused = zilc->zc_nused;
- else if (eck->zec_magic == BSWAP_64(ZEC_MAGIC))
+ } else if (eck->zec_magic == BSWAP_64(ZEC_MAGIC)) {
nused = BSWAP_64(zilc->zc_nused);
- else
+ } else {
+ abd_return_buf(abd, data, data_size);
return (SET_ERROR(ECKSUM));
+ }
- if (nused > size)
+ if (nused > data_size) {
+ abd_return_buf(abd, data, data_size);
return (SET_ERROR(ECKSUM));
+ }
size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t);
} else {
- eck = (zio_eck_t *)((char *)data + size) - 1;
+ eck = (zio_eck_t *)((char *)data + data_size) - 1;
}
if (checksum == ZIO_CHECKSUM_GANG_HEADER)
@@ -341,11 +390,15 @@ zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum,
if (byteswap)
byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
+ eck_offset = (size_t)(&eck->zec_cksum) - (size_t)data;
expected_cksum = eck->zec_cksum;
eck->zec_cksum = verifier;
- ci->ci_func[byteswap](data, size,
+ abd_return_buf_copy(abd, data, data_size);
+
+ ci->ci_func[byteswap](abd, size,
spa->spa_cksum_tmpls[checksum], &actual_cksum);
- eck->zec_cksum = expected_cksum;
+ abd_copy_from_buf_off(abd, &expected_cksum,
+ eck_offset, sizeof (zio_cksum_t));
if (byteswap) {
byteswap_uint64_array(&expected_cksum,
@@ -354,7 +407,7 @@ zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum,
} else {
byteswap = BP_SHOULD_BYTESWAP(bp);
expected_cksum = bp->blk_cksum;
- ci->ci_func[byteswap](data, size,
+ ci->ci_func[byteswap](abd, size,
spa->spa_cksum_tmpls[checksum], &actual_cksum);
}
@@ -383,7 +436,7 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
uint64_t size = (bp == NULL ? zio->io_size :
(BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
uint64_t offset = zio->io_offset;
- void *data = zio->io_data;
+ abd_t *data = zio->io_abd;
spa_t *spa = zio->io_spa;
error = zio_checksum_error_impl(spa, bp, checksum, data, size,
diff --git a/module/zfs/zio_compress.c b/module/zfs/zio_compress.c
index 6b8d6c39b..7e44d16e4 100644
--- a/module/zfs/zio_compress.c
+++ b/module/zfs/zio_compress.c
@@ -28,7 +28,7 @@
*/
/*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -41,24 +41,23 @@
/*
* Compression vectors.
*/
-
zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
- {NULL, NULL, 0, "inherit"},
- {NULL, NULL, 0, "on"},
- {NULL, NULL, 0, "uncompressed"},
- {lzjb_compress, lzjb_decompress, 0, "lzjb"},
- {NULL, NULL, 0, "empty"},
- {gzip_compress, gzip_decompress, 1, "gzip-1"},
- {gzip_compress, gzip_decompress, 2, "gzip-2"},
- {gzip_compress, gzip_decompress, 3, "gzip-3"},
- {gzip_compress, gzip_decompress, 4, "gzip-4"},
- {gzip_compress, gzip_decompress, 5, "gzip-5"},
- {gzip_compress, gzip_decompress, 6, "gzip-6"},
- {gzip_compress, gzip_decompress, 7, "gzip-7"},
- {gzip_compress, gzip_decompress, 8, "gzip-8"},
- {gzip_compress, gzip_decompress, 9, "gzip-9"},
- {zle_compress, zle_decompress, 64, "zle"},
- {lz4_compress_zfs, lz4_decompress_zfs, 0, "lz4"},
+ {"inherit", 0, NULL, NULL},
+ {"on", 0, NULL, NULL},
+ {"uncompressed", 0, NULL, NULL},
+ {"lzjb", 0, lzjb_compress, lzjb_decompress},
+ {"empty", 0, NULL, NULL},
+ {"gzip-1", 1, gzip_compress, gzip_decompress},
+ {"gzip-2", 2, gzip_compress, gzip_decompress},
+ {"gzip-3", 3, gzip_compress, gzip_decompress},
+ {"gzip-4", 4, gzip_compress, gzip_decompress},
+ {"gzip-5", 5, gzip_compress, gzip_decompress},
+ {"gzip-6", 6, gzip_compress, gzip_decompress},
+ {"gzip-7", 7, gzip_compress, gzip_decompress},
+ {"gzip-8", 8, gzip_compress, gzip_decompress},
+ {"gzip-9", 9, gzip_compress, gzip_decompress},
+ {"zle", 64, zle_compress, zle_decompress},
+ {"lz4", 0, lz4_compress_zfs, lz4_decompress_zfs}
};
enum zio_compress
@@ -85,12 +84,26 @@ zio_compress_select(spa_t *spa, enum zio_compress child,
return (result);
}
+/*ARGSUSED*/
+static int
+zio_compress_zeroed_cb(void *data, size_t len, void *private)
+{
+ uint64_t *end = (uint64_t *)((char *)data + len);
+ uint64_t *word;
+
+ for (word = data; word < end; word++)
+ if (*word != 0)
+ return (1);
+
+ return (0);
+}
+
size_t
-zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len)
+zio_compress_data(enum zio_compress c, abd_t *src, void *dst, size_t s_len)
{
- uint64_t *word, *word_end;
size_t c_len, d_len;
zio_compress_info_t *ci = &zio_compress_table[c];
+ void *tmp;
ASSERT((uint_t)c < ZIO_COMPRESS_FUNCTIONS);
ASSERT((uint_t)c == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL);
@@ -99,12 +112,7 @@ zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len)
* If the data is all zeroes, we don't even need to allocate
* a block for it. We indicate this by returning zero size.
*/
- word_end = (uint64_t *)((char *)src + s_len);
- for (word = src; word < word_end; word++)
- if (*word != 0)
- break;
-
- if (word == word_end)
+ if (abd_iterate_func(src, 0, s_len, zio_compress_zeroed_cb, NULL) == 0)
return (0);
if (c == ZIO_COMPRESS_EMPTY)
@@ -112,7 +120,11 @@ zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len)
/* Compress at least 12.5% */
d_len = s_len - (s_len >> 3);
- c_len = ci->ci_compress(src, dst, s_len, d_len, ci->ci_level);
+
+ /* No compression algorithms can read from ABDs directly */
+ tmp = abd_borrow_buf_copy(src, s_len);
+ c_len = ci->ci_compress(tmp, dst, s_len, d_len, ci->ci_level);
+ abd_return_buf(src, tmp, s_len);
if (c_len > d_len)
return (s_len);
@@ -122,13 +134,23 @@ zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len)
}
int
-zio_decompress_data(enum zio_compress c, void *src, void *dst,
+zio_decompress_data_buf(enum zio_compress c, void *src, void *dst,
size_t s_len, size_t d_len)
{
zio_compress_info_t *ci = &zio_compress_table[c];
-
if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL)
return (SET_ERROR(EINVAL));
return (ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level));
}
+
+int
+zio_decompress_data(enum zio_compress c, abd_t *src, void *dst,
+ size_t s_len, size_t d_len)
+{
+ void *tmp = abd_borrow_buf_copy(src, s_len);
+ int ret = zio_decompress_data_buf(c, tmp, dst, s_len, d_len);
+ abd_return_buf(src, tmp, s_len);
+
+ return (ret);
+}
diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c
index 61b7d25e6..0e8e9d932 100644
--- a/module/zfs/zio_inject.c
+++ b/module/zfs/zio_inject.c
@@ -41,7 +41,7 @@
*/
#include <sys/arc.h>
-#include <sys/zio_impl.h>
+#include <sys/zio.h>
#include <sys/zfs_ioctl.h>
#include <sys/vdev_impl.h>
#include <sys/dmu_objset.h>
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
index d3c090b89..c187960dc 100644
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@@ -167,9 +167,6 @@ tests = ['zfs_send_001_pos', 'zfs_send_002_pos', 'zfs_send_003_pos',
'zfs_send_004_neg', 'zfs_send_005_pos', 'zfs_send_006_pos',
'zfs_send_007_pos']
-# DISABLED:
-# mountpoint_003_pos - needs investigation
-# ro_props_001_pos - https://github.com/zfsonlinux/zfs/issues/5201
[tests/functional/cli_root/zfs_set]
tests = ['cache_001_pos', 'cache_002_neg', 'canmount_001_pos',
'canmount_002_pos', 'canmount_003_pos', 'canmount_004_pos',
@@ -178,7 +175,8 @@ tests = ['cache_001_pos', 'cache_002_neg', 'canmount_001_pos',
'share_mount_001_neg', 'snapdir_001_pos', 'onoffs_001_pos',
'user_property_001_pos', 'user_property_003_neg', 'readonly_001_pos',
'user_property_004_pos', 'version_001_neg', 'zfs_set_001_neg',
- 'zfs_set_002_neg', 'zfs_set_003_neg', 'property_alias_001_pos']
+ 'zfs_set_002_neg', 'zfs_set_003_neg', 'property_alias_001_pos',
+ 'ro_props_001_pos', 'mountpoint_003_pos']
# DISABLED: Tests need to be updated for Linux share behavior
# zfs_share_005_pos - needs investigation, probably unsupported NFS share format
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_set/mountpoint_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_set/mountpoint_003_pos.ksh
index ce427c105..b160c8104 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zfs_set/mountpoint_003_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zfs_set/mountpoint_003_pos.ksh
@@ -61,13 +61,13 @@ log_onexit cleanup
#
if is_linux; then
set -A args \
- "dev" "/dev/" "nodev" "/nodev/" \
- "exec" "/exec/" "noexec" "/noexec/" \
- "mand" "/mand/" "nomand" "/nomand/" \
- "ro" "read only" "rw" "read/write" \
- "suid" "/suid/" "nosuid" "/nosuid/" \
- "xattr" "/xattr/" "noxattr" "/noxattr/" \
- "atime" "/atime/" "noatime" "/noatime/"
+ "nodev" "dev" \
+ "noexec" "exec" \
+ "mand" "nomand" \
+ "ro" "rw" \
+ "nosuid" "suid" \
+ "xattr" "noxattr" \
+ "atime" "noatime"
else
set -A args \
"devices" "/devices/" "nodevices" "/nodevices/" \
@@ -90,26 +90,40 @@ typeset i=0
while ((i < ${#args[@]})); do
if is_linux; then
log_must $MOUNT -t zfs -o ${args[$i]} $testfs $tmpmnt
+
+ msg=$($MOUNT | $GREP "$tmpmnt ")
+
+ $ECHO $msg | $GREP "${args[((i))]}" > /dev/null 2>&1
+ if (($? != 0)) ; then
+ $ECHO $msg | $GREP "${args[((i-1))]}" > /dev/null 2>&1
+ if (($? == 0)) ; then
+ log_fail "Expected option: ${args[((i))]} \n" \
+ "Real option: $msg"
+ fi
+ fi
+
+ log_must $UMOUNT $tmpmnt
+ ((i += 1))
else
log_must $MOUNT -F zfs -o ${args[$i]} $testfs $tmpmnt
- fi
- msg=$($MOUNT | $GREP "^$tmpmnt ")
- if ! is_linux; then
+ msg=$($MOUNT | $GREP "^$tmpmnt ")
+
# In LZ, a user with all zone privileges can never "devices"
if ! is_global_zone && [[ ${args[$i]} == devices ]] ; then
args[((i+1))]="/nodevices/"
fi
- fi
- $ECHO $msg | $GREP "${args[((i+1))]}" > /dev/null 2>&1
- if (($? != 0)) ; then
- log_fail "Expected option: ${args[((i+1))]} \n" \
- "Real option: $msg"
- fi
+ $ECHO $msg | $GREP "${args[((i+1))]}" > /dev/null 2>&1
+ if (($? != 0)) ; then
+ log_fail "Expected option: ${args[((i+1))]} \n" \
+ "Real option: $msg"
+ fi
+
- log_must $UMOUNT $tmpmnt
- ((i += 2))
+ log_must $UMOUNT $tmpmnt
+ ((i += 2))
+ fi
done
log_pass "With legacy mount, FSType-specific option works well passed."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_set/ro_props_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_set/ro_props_001_pos.ksh
index c9a739109..1f3c0cf09 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zfs_set/ro_props_001_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zfs_set/ro_props_001_pos.ksh
@@ -72,6 +72,8 @@ log_onexit cleanup
# Create filesystem and volume's snapshot
create_snapshot $TESTPOOL/$TESTFS $TESTSNAP
create_snapshot $TESTPOOL/$TESTVOL $TESTSNAP
+sync_pool $TESTPOOL
+$SLEEP 5
typeset -i i=0
typeset -i j=0
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_004_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_004_pos.ksh
index ca07769ec..16e27a0ca 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_004_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_004_pos.ksh
@@ -48,6 +48,11 @@
verify_runnable "global"
+# See issue: https://github.com/zfsonlinux/zfs/issues/5444
+if is_32bit; then
+ log_unsupported "Test case fails on 32-bit systems"
+fi
+
log_assert "Resilver prevent scrub from starting until the resilver completes"
log_must $ZPOOL detach $TESTPOOL $DISK2