summaryrefslogtreecommitdiffstats
path: root/module/zfs/vdev_raidz.c
diff options
context:
space:
mode:
authorBrian Behlendorf <[email protected]>2016-11-30 14:48:16 -0700
committerGitHub <[email protected]>2016-11-30 14:48:16 -0700
commit7657defc48b7c47a8bf0c8f21c78783d293dc5ed (patch)
treeec6ebdcc7289bc707076205314737cf04fc3bfc0 /module/zfs/vdev_raidz.c
parentce43e88dd65509a4cf62c4acc76619e571d8518a (diff)
parent982957483450d53683681f456d1c84cfeb56afad (diff)
Introduce ARC Buffer Data (ABD)
ZFS currently uses ARC buffers which are backed by virtual memory. While functional, there are some major problems with this approach which can be observed on all OpenZFS platforms. ABD was designed to address these issues and includes contributions from OpenZFS developers from multiple platforms. While all OpenZFS platforms will benefit from ABD this functionality is critical for Linux. Unlike the other OpenZFS platforms the Linux kernel discourages extensive use of virtual memory. The provided interfaces are not optimized for frequent allocations from the virtual address space. To maintain good performance a kmem cache is used which contains relatively long lived slabs backed by virtual memory. The downside to the approach is that those slabs can become highly fragmented resulting in an inefficient use of memory. Another issue is that on 32-bit systems the available virtual address space in the kernel is only a small fraction of total system memory. This means the ARC size is highly constrained which hurts performance and make allocating memory difficult and OOMs more likely. ABD is designed to address these issues by using scatter lists of pages for data buffers. This removes the need for slabs which resolves the fragmentation issue. It also allows high memory pages to be allocated which alleviates the virtual address space pressure on 32-bit systems. For metadata buffers, which are small, linear ABDs are allocated from the slab. This is preferable because there are many places in the code which expect to be able to read from a given offset in the buffer. Using linear ABDs means none of that code needs to be modified. The majority of these buffers are allocated with kmalloc so there's minimal impact of the virtual address space. Tested-by: Kash Pande <[email protected]> Tested-by: kernelOfTruth <[email protected]> Tested-by: RageLtMan <rageltman@sempervictus> Tested-by: DHE <[email protected]> Reviewed-by: Chunwei Chen <[email protected]> Reviewed-by: Dan Kimmel <[email protected]> Reviewed-by: David Quigley <[email protected]> Reviewed-by: Gvozden Neskovic <[email protected]> Reviewed-by: Tom Caputi <[email protected]> Reviewed-by: Isaac Huang <[email protected]> Reviewed-by: Jinshan Xiong <[email protected]> Signed-off-by: Brian Behlendorf <[email protected]> Closes #3441 Closes #5135
Diffstat (limited to 'module/zfs/vdev_raidz.c')
-rw-r--r--module/zfs/vdev_raidz.c603
1 files changed, 407 insertions, 196 deletions
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index d1b415367..a92d3cbaa 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -30,6 +30,7 @@
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
+#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>
#include <sys/vdev_raidz.h>
@@ -136,7 +137,7 @@ vdev_raidz_map_free(raidz_map_t *rm)
size_t size;
for (c = 0; c < rm->rm_firstdatacol; c++) {
- zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
+ abd_free(rm->rm_col[c].rc_abd);
if (rm->rm_col[c].rc_gdata != NULL)
zio_buf_free(rm->rm_col[c].rc_gdata,
@@ -144,11 +145,13 @@ vdev_raidz_map_free(raidz_map_t *rm)
}
size = 0;
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ abd_put(rm->rm_col[c].rc_abd);
size += rm->rm_col[c].rc_size;
+ }
- if (rm->rm_datacopy != NULL)
- zio_buf_free(rm->rm_datacopy, size);
+ if (rm->rm_abd_copy != NULL)
+ abd_free(rm->rm_abd_copy);
kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}
@@ -185,7 +188,7 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
size_t x;
const char *good = NULL;
- const char *bad = rm->rm_col[c].rc_data;
+ char *bad;
if (good_data == NULL) {
zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
@@ -199,8 +202,9 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
* data never changes for a given logical ZIO)
*/
if (rm->rm_col[0].rc_gdata == NULL) {
- char *bad_parity[VDEV_RAIDZ_MAXPARITY];
+ abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];
char *buf;
+ int offset;
/*
* Set up the rm_col[]s to generate the parity for
@@ -208,15 +212,20 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
* replacing them with buffers to hold the result.
*/
for (x = 0; x < rm->rm_firstdatacol; x++) {
- bad_parity[x] = rm->rm_col[x].rc_data;
- rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
+ bad_parity[x] = rm->rm_col[x].rc_abd;
+ rm->rm_col[x].rc_gdata =
zio_buf_alloc(rm->rm_col[x].rc_size);
+ rm->rm_col[x].rc_abd =
+ abd_get_from_buf(rm->rm_col[x].rc_gdata,
+ rm->rm_col[x].rc_size);
}
/* fill in the data columns from good_data */
buf = (char *)good_data;
for (; x < rm->rm_cols; x++) {
- rm->rm_col[x].rc_data = buf;
+ abd_put(rm->rm_col[x].rc_abd);
+ rm->rm_col[x].rc_abd = abd_get_from_buf(buf,
+ rm->rm_col[x].rc_size);
buf += rm->rm_col[x].rc_size;
}
@@ -226,13 +235,18 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
vdev_raidz_generate_parity(rm);
/* restore everything back to its original state */
- for (x = 0; x < rm->rm_firstdatacol; x++)
- rm->rm_col[x].rc_data = bad_parity[x];
+ for (x = 0; x < rm->rm_firstdatacol; x++) {
+ abd_put(rm->rm_col[x].rc_abd);
+ rm->rm_col[x].rc_abd = bad_parity[x];
+ }
- buf = rm->rm_datacopy;
+ offset = 0;
for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
- rm->rm_col[x].rc_data = buf;
- buf += rm->rm_col[x].rc_size;
+ abd_put(rm->rm_col[x].rc_abd);
+ rm->rm_col[x].rc_abd = abd_get_offset_size(
+ rm->rm_abd_copy, offset,
+ rm->rm_col[x].rc_size);
+ offset += rm->rm_col[x].rc_size;
}
}
@@ -246,8 +260,10 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
good += rm->rm_col[x].rc_size;
}
+ bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size);
/* we drop the ereport if it ends up that the data was good */
zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
+ abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size);
}
/*
@@ -260,7 +276,7 @@ static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
size_t c = (size_t)(uintptr_t)arg;
- caddr_t buf;
+ size_t offset;
raidz_map_t *rm = zio->io_vsd;
size_t size;
@@ -274,7 +290,7 @@ vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
rm->rm_reports++;
ASSERT3U(rm->rm_reports, >, 0);
- if (rm->rm_datacopy != NULL)
+ if (rm->rm_abd_copy != NULL)
return;
/*
@@ -290,17 +306,21 @@ vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
size += rm->rm_col[c].rc_size;
- buf = rm->rm_datacopy = zio_buf_alloc(size);
+ rm->rm_abd_copy =
+ abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size);
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
raidz_col_t *col = &rm->rm_col[c];
+ abd_t *tmp = abd_get_offset_size(rm->rm_abd_copy, offset,
+ col->rc_size);
- bcopy(col->rc_data, buf, col->rc_size);
- col->rc_data = buf;
+ abd_copy(tmp, col->rc_abd, col->rc_size);
+ abd_put(col->rc_abd);
+ col->rc_abd = tmp;
- buf += col->rc_size;
+ offset += col->rc_size;
}
- ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
+ ASSERT3U(offset, ==, size);
}
static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
@@ -329,6 +349,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
/* The starting byte offset on each child vdev. */
uint64_t o = (b / dcols) << unit_shift;
uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
+ uint64_t off = 0;
/*
* "Quotient": The number of data sectors for this stripe on all but
@@ -373,7 +394,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
rm->rm_missingdata = 0;
rm->rm_missingparity = 0;
rm->rm_firstdatacol = nparity;
- rm->rm_datacopy = NULL;
+ rm->rm_abd_copy = NULL;
rm->rm_reports = 0;
rm->rm_freed = 0;
rm->rm_ecksuminjected = 0;
@@ -389,7 +410,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
}
rm->rm_col[c].rc_devidx = col;
rm->rm_col[c].rc_offset = coff;
- rm->rm_col[c].rc_data = NULL;
+ rm->rm_col[c].rc_abd = NULL;
rm->rm_col[c].rc_gdata = NULL;
rm->rm_col[c].rc_error = 0;
rm->rm_col[c].rc_tried = 0;
@@ -412,13 +433,18 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
ASSERT3U(rm->rm_nskip, <=, nparity);
for (c = 0; c < rm->rm_firstdatacol; c++)
- rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
+ rm->rm_col[c].rc_abd =
+ abd_alloc_linear(rm->rm_col[c].rc_size, B_FALSE);
- rm->rm_col[c].rc_data = zio->io_data;
+ rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, 0,
+ rm->rm_col[c].rc_size);
+ off = rm->rm_col[c].rc_size;
- for (c = c + 1; c < acols; c++)
- rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
- rm->rm_col[c - 1].rc_size;
+ for (c = c + 1; c < acols; c++) {
+ rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, off,
+ rm->rm_col[c].rc_size);
+ off += rm->rm_col[c].rc_size;
+ }
/*
* If all data stored spans all columns, there's a danger that parity
@@ -464,29 +490,84 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
return (rm);
}
+struct pqr_struct {
+ uint64_t *p;
+ uint64_t *q;
+ uint64_t *r;
+};
+
+static int
+vdev_raidz_p_func(void *buf, size_t size, void *private)
+{
+ struct pqr_struct *pqr = private;
+ const uint64_t *src = buf;
+ int i, cnt = size / sizeof (src[0]);
+
+ ASSERT(pqr->p && !pqr->q && !pqr->r);
+
+ for (i = 0; i < cnt; i++, src++, pqr->p++)
+ *pqr->p ^= *src;
+
+ return (0);
+}
+
+static int
+vdev_raidz_pq_func(void *buf, size_t size, void *private)
+{
+ struct pqr_struct *pqr = private;
+ const uint64_t *src = buf;
+ uint64_t mask;
+ int i, cnt = size / sizeof (src[0]);
+
+ ASSERT(pqr->p && pqr->q && !pqr->r);
+
+ for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
+ *pqr->p ^= *src;
+ VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
+ *pqr->q ^= *src;
+ }
+
+ return (0);
+}
+
+static int
+vdev_raidz_pqr_func(void *buf, size_t size, void *private)
+{
+ struct pqr_struct *pqr = private;
+ const uint64_t *src = buf;
+ uint64_t mask;
+ int i, cnt = size / sizeof (src[0]);
+
+ ASSERT(pqr->p && pqr->q && pqr->r);
+
+ for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
+ *pqr->p ^= *src;
+ VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
+ *pqr->q ^= *src;
+ VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
+ *pqr->r ^= *src;
+ }
+
+ return (0);
+}
+
static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
- uint64_t *p, *src, pcount, ccount, i;
+ uint64_t *p;
int c;
-
- pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
+ abd_t *src;
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- src = rm->rm_col[c].rc_data;
- p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
- ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
+ src = rm->rm_col[c].rc_abd;
+ p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
if (c == rm->rm_firstdatacol) {
- ASSERT(ccount == pcount);
- for (i = 0; i < ccount; i++, src++, p++) {
- *p = *src;
- }
+ abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
} else {
- ASSERT(ccount <= pcount);
- for (i = 0; i < ccount; i++, src++, p++) {
- *p ^= *src;
- }
+ struct pqr_struct pqr = { p, NULL, NULL };
+ (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
+ vdev_raidz_p_func, &pqr);
}
}
}
@@ -494,50 +575,43 @@ vdev_raidz_generate_parity_p(raidz_map_t *rm)
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
- uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
+ uint64_t *p, *q, pcnt, ccnt, mask, i;
int c;
+ abd_t *src;
- pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
+ pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
rm->rm_col[VDEV_RAIDZ_Q].rc_size);
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- src = rm->rm_col[c].rc_data;
- p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
- q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
+ src = rm->rm_col[c].rc_abd;
+ p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
+ q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
- ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
+ ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
if (c == rm->rm_firstdatacol) {
- ASSERT(ccnt == pcnt || ccnt == 0);
- for (i = 0; i < ccnt; i++, src++, p++, q++) {
- *p = *src;
- *q = *src;
- }
- for (; i < pcnt; i++, src++, p++, q++) {
- *p = 0;
- *q = 0;
- }
+ abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
+ (void) memcpy(q, p, rm->rm_col[c].rc_size);
} else {
- ASSERT(ccnt <= pcnt);
-
- /*
- * Apply the algorithm described above by multiplying
- * the previous result and adding in the new value.
- */
- for (i = 0; i < ccnt; i++, src++, p++, q++) {
- *p ^= *src;
+ struct pqr_struct pqr = { p, q, NULL };
+ (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
+ vdev_raidz_pq_func, &pqr);
+ }
- VDEV_RAIDZ_64MUL_2(*q, mask);
- *q ^= *src;
+ if (c == rm->rm_firstdatacol) {
+ for (i = ccnt; i < pcnt; i++) {
+ p[i] = 0;
+ q[i] = 0;
}
+ } else {
/*
* Treat short columns as though they are full of 0s.
* Note that there's therefore nothing needed for P.
*/
- for (; i < pcnt; i++, q++) {
- VDEV_RAIDZ_64MUL_2(*q, mask);
+ for (i = ccnt; i < pcnt; i++) {
+ VDEV_RAIDZ_64MUL_2(q[i], mask);
}
}
}
@@ -546,59 +620,48 @@ vdev_raidz_generate_parity_pq(raidz_map_t *rm)
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
- uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
+ uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
int c;
+ abd_t *src;
- pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
+ pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
rm->rm_col[VDEV_RAIDZ_Q].rc_size);
ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
rm->rm_col[VDEV_RAIDZ_R].rc_size);
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- src = rm->rm_col[c].rc_data;
- p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
- q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
- r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
+ src = rm->rm_col[c].rc_abd;
+ p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
+ q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
+ r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);
- ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
+ ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
if (c == rm->rm_firstdatacol) {
- ASSERT(ccnt == pcnt || ccnt == 0);
- for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
- *p = *src;
- *q = *src;
- *r = *src;
- }
- for (; i < pcnt; i++, src++, p++, q++, r++) {
- *p = 0;
- *q = 0;
- *r = 0;
- }
+ abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
+ (void) memcpy(q, p, rm->rm_col[c].rc_size);
+ (void) memcpy(r, p, rm->rm_col[c].rc_size);
} else {
- ASSERT(ccnt <= pcnt);
-
- /*
- * Apply the algorithm described above by multiplying
- * the previous result and adding in the new value.
- */
- for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
- *p ^= *src;
-
- VDEV_RAIDZ_64MUL_2(*q, mask);
- *q ^= *src;
+ struct pqr_struct pqr = { p, q, r };
+ (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
+ vdev_raidz_pqr_func, &pqr);
+ }
- VDEV_RAIDZ_64MUL_4(*r, mask);
- *r ^= *src;
+ if (c == rm->rm_firstdatacol) {
+ for (i = ccnt; i < pcnt; i++) {
+ p[i] = 0;
+ q[i] = 0;
+ r[i] = 0;
}
-
+ } else {
/*
* Treat short columns as though they are full of 0s.
* Note that there's therefore nothing needed for P.
*/
- for (; i < pcnt; i++, q++, r++) {
- VDEV_RAIDZ_64MUL_2(*q, mask);
- VDEV_RAIDZ_64MUL_4(*r, mask);
+ for (i = ccnt; i < pcnt; i++) {
+ VDEV_RAIDZ_64MUL_2(q[i], mask);
+ VDEV_RAIDZ_64MUL_4(r[i], mask);
}
}
}
@@ -630,40 +693,159 @@ vdev_raidz_generate_parity(raidz_map_t *rm)
}
}
+/* ARGSUSED */
+static int
+vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
+{
+ uint64_t *dst = dbuf;
+ uint64_t *src = sbuf;
+ int cnt = size / sizeof (src[0]);
+ int i;
+
+ for (i = 0; i < cnt; i++) {
+ dst[i] ^= src[i];
+ }
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
+ void *private)
+{
+ uint64_t *dst = dbuf;
+ uint64_t *src = sbuf;
+ uint64_t mask;
+ int cnt = size / sizeof (dst[0]);
+ int i;
+
+ for (i = 0; i < cnt; i++, dst++, src++) {
+ VDEV_RAIDZ_64MUL_2(*dst, mask);
+ *dst ^= *src;
+ }
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
+{
+ uint64_t *dst = buf;
+ uint64_t mask;
+ int cnt = size / sizeof (dst[0]);
+ int i;
+
+ for (i = 0; i < cnt; i++, dst++) {
+ /* same operation as vdev_raidz_reconst_q_pre_func() on dst */
+ VDEV_RAIDZ_64MUL_2(*dst, mask);
+ }
+
+ return (0);
+}
+
+struct reconst_q_struct {
+ uint64_t *q;
+ int exp;
+};
+
+static int
+vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
+{
+ struct reconst_q_struct *rq = private;
+ uint64_t *dst = buf;
+ int cnt = size / sizeof (dst[0]);
+ int i;
+
+ for (i = 0; i < cnt; i++, dst++, rq->q++) {
+ int j;
+ uint8_t *b;
+
+ *dst ^= *rq->q;
+ for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
+ *b = vdev_raidz_exp2(*b, rq->exp);
+ }
+ }
+
+ return (0);
+}
+
+struct reconst_pq_struct {
+ uint8_t *p;
+ uint8_t *q;
+ uint8_t *pxy;
+ uint8_t *qxy;
+ int aexp;
+ int bexp;
+};
+
+static int
+vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
+{
+ struct reconst_pq_struct *rpq = private;
+ uint8_t *xd = xbuf;
+ uint8_t *yd = ybuf;
+ int i;
+
+ for (i = 0; i < size;
+ i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
+ *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
+ vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
+ *yd = *rpq->p ^ *rpq->pxy ^ *xd;
+ }
+
+ return (0);
+}
+
+static int
+vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
+{
+ struct reconst_pq_struct *rpq = private;
+ uint8_t *xd = xbuf;
+ int i;
+
+ for (i = 0; i < size;
+ i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
+ /* same operation as vdev_raidz_reconst_pq_func() on xd */
+ *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
+ vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
+ }
+
+ return (0);
+}
+
static int
vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
{
- uint64_t *dst, *src, xcount, ccount, count, i;
int x = tgts[0];
int c;
+ abd_t *dst, *src;
ASSERT(ntgts == 1);
ASSERT(x >= rm->rm_firstdatacol);
ASSERT(x < rm->rm_cols);
- xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
- ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
- ASSERT(xcount > 0);
+ ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
+ ASSERT(rm->rm_col[x].rc_size > 0);
- src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
- dst = rm->rm_col[x].rc_data;
- for (i = 0; i < xcount; i++, dst++, src++) {
- *dst = *src;
- }
+ src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
+ dst = rm->rm_col[x].rc_abd;
+
+ abd_copy_from_buf(dst, abd_to_buf(src), rm->rm_col[x].rc_size);
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- src = rm->rm_col[c].rc_data;
- dst = rm->rm_col[x].rc_data;
+ uint64_t size = MIN(rm->rm_col[x].rc_size,
+ rm->rm_col[c].rc_size);
+
+ src = rm->rm_col[c].rc_abd;
+ dst = rm->rm_col[x].rc_abd;
if (c == x)
continue;
- ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
- count = MIN(ccount, xcount);
-
- for (i = 0; i < count; i++, dst++, src++) {
- *dst ^= *src;
- }
+ (void) abd_iterate_func2(dst, src, 0, 0, size,
+ vdev_raidz_reconst_p_func, NULL);
}
return (1 << VDEV_RAIDZ_P);
@@ -672,57 +854,46 @@ vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
- uint64_t *dst, *src, xcount, ccount, count, mask, i;
- uint8_t *b;
int x = tgts[0];
- int c, j, exp;
+ int c, exp;
+ abd_t *dst, *src;
+ struct reconst_q_struct rq;
ASSERT(ntgts == 1);
- xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
- ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
+ ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- src = rm->rm_col[c].rc_data;
- dst = rm->rm_col[x].rc_data;
-
- if (c == x)
- ccount = 0;
- else
- ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
+ uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
+ rm->rm_col[c].rc_size);
- count = MIN(ccount, xcount);
+ src = rm->rm_col[c].rc_abd;
+ dst = rm->rm_col[x].rc_abd;
if (c == rm->rm_firstdatacol) {
- for (i = 0; i < count; i++, dst++, src++) {
- *dst = *src;
- }
- for (; i < xcount; i++, dst++) {
- *dst = 0;
- }
+ abd_copy(dst, src, size);
+ if (rm->rm_col[x].rc_size > size)
+ abd_zero_off(dst, size,
+ rm->rm_col[x].rc_size - size);
} else {
- for (i = 0; i < count; i++, dst++, src++) {
- VDEV_RAIDZ_64MUL_2(*dst, mask);
- *dst ^= *src;
- }
-
- for (; i < xcount; i++, dst++) {
- VDEV_RAIDZ_64MUL_2(*dst, mask);
- }
+ ASSERT3U(size, <=, rm->rm_col[x].rc_size);
+ (void) abd_iterate_func2(dst, src, 0, 0, size,
+ vdev_raidz_reconst_q_pre_func, NULL);
+ (void) abd_iterate_func(dst,
+ size, rm->rm_col[x].rc_size - size,
+ vdev_raidz_reconst_q_pre_tail_func, NULL);
}
}
- src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
- dst = rm->rm_col[x].rc_data;
+ src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
+ dst = rm->rm_col[x].rc_abd;
exp = 255 - (rm->rm_cols - 1 - x);
+ rq.q = abd_to_buf(src);
+ rq.exp = exp;
- for (i = 0; i < xcount; i++, dst++, src++) {
- *dst ^= *src;
- for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
- *b = vdev_raidz_exp2(*b, exp);
- }
- }
+ (void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
+ vdev_raidz_reconst_q_post_func, &rq);
return (1 << VDEV_RAIDZ_Q);
}
@@ -730,11 +901,13 @@ vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
- uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
- void *pdata, *qdata;
- uint64_t xsize, ysize, i;
+ uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
+ abd_t *pdata, *qdata;
+ uint64_t xsize, ysize;
int x = tgts[0];
int y = tgts[1];
+ abd_t *xd, *yd;
+ struct reconst_pq_struct rpq;
ASSERT(ntgts == 2);
ASSERT(x < y);
@@ -750,15 +923,15 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
* parity so we make those columns appear to be full of zeros by
* setting their lengths to zero.
*/
- pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
- qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
+ pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
+ qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
xsize = rm->rm_col[x].rc_size;
ysize = rm->rm_col[y].rc_size;
- rm->rm_col[VDEV_RAIDZ_P].rc_data =
- zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
- rm->rm_col[VDEV_RAIDZ_Q].rc_data =
- zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
+ rm->rm_col[VDEV_RAIDZ_P].rc_abd =
+ abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
+ rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
+ abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
rm->rm_col[x].rc_size = 0;
rm->rm_col[y].rc_size = 0;
@@ -767,12 +940,12 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
rm->rm_col[x].rc_size = xsize;
rm->rm_col[y].rc_size = ysize;
- p = pdata;
- q = qdata;
- pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
- qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
- xd = rm->rm_col[x].rc_data;
- yd = rm->rm_col[y].rc_data;
+ p = abd_to_buf(pdata);
+ q = abd_to_buf(qdata);
+ pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
+ qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
+ xd = rm->rm_col[x].rc_abd;
+ yd = rm->rm_col[y].rc_abd;
/*
* We now have:
@@ -796,24 +969,27 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
- for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
- *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
- vdev_raidz_exp2(*q ^ *qxy, bexp);
+ ASSERT3U(xsize, >=, ysize);
+ rpq.p = p;
+ rpq.q = q;
+ rpq.pxy = pxy;
+ rpq.qxy = qxy;
+ rpq.aexp = aexp;
+ rpq.bexp = bexp;
- if (i < ysize)
- *yd = *p ^ *pxy ^ *xd;
- }
+ (void) abd_iterate_func2(xd, yd, 0, 0, ysize,
+ vdev_raidz_reconst_pq_func, &rpq);
+ (void) abd_iterate_func(xd, ysize, xsize - ysize,
+ vdev_raidz_reconst_pq_tail_func, &rpq);
- zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
- rm->rm_col[VDEV_RAIDZ_P].rc_size);
- zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
- rm->rm_col[VDEV_RAIDZ_Q].rc_size);
+ abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
+ abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
/*
* Restore the saved parity data.
*/
- rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
- rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
+ rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
+ rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;
return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}
@@ -1131,7 +1307,7 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
c = used[i];
ASSERT3U(c, <, rm->rm_cols);
- src = rm->rm_col[c].rc_data;
+ src = abd_to_buf(rm->rm_col[c].rc_abd);
ccount = rm->rm_col[c].rc_size;
for (j = 0; j < nmissing; j++) {
cc = missing[j] + rm->rm_firstdatacol;
@@ -1139,7 +1315,7 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
ASSERT3U(cc, <, rm->rm_cols);
ASSERT3U(cc, !=, c);
- dst[j] = rm->rm_col[cc].rc_data;
+ dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
dcount[j] = rm->rm_col[cc].rc_size;
}
@@ -1187,8 +1363,25 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
uint8_t *used;
+ abd_t **bufs = NULL;
+
int code = 0;
+ /*
+ * Matrix reconstruction can't use scatter ABDs yet, so we allocate
+ * temporary linear ABDs.
+ */
+ if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
+ bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);
+
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ raidz_col_t *col = &rm->rm_col[c];
+
+ bufs[c] = col->rc_abd;
+ col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
+ abd_copy(col->rc_abd, bufs[c], col->rc_size);
+ }
+ }
n = rm->rm_cols - rm->rm_firstdatacol;
@@ -1275,6 +1468,20 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
kmem_free(p, psize);
+ /*
+ * copy back from temporary linear abds and free them
+ */
+ if (bufs) {
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ raidz_col_t *col = &rm->rm_col[c];
+
+ abd_copy(bufs[c], col->rc_abd, col->rc_size);
+ abd_free(col->rc_abd);
+ col->rc_abd = bufs[c];
+ }
+ kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
+ }
+
return (code);
}
@@ -1321,7 +1528,6 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
dt = &tgts[nbadparity];
-
/* Reconstruct using the new math implementation */
ret = vdev_raidz_math_reconstruct(rm, parity_valid, dt, nbaddata);
if (ret != RAIDZ_ORIGINAL_IMPL)
@@ -1479,7 +1685,7 @@ vdev_raidz_io_start(zio_t *zio)
rc = &rm->rm_col[c];
cvd = vd->vdev_child[rc->rc_devidx];
zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
- rc->rc_offset, rc->rc_data, rc->rc_size,
+ rc->rc_offset, rc->rc_abd, rc->rc_size,
zio->io_type, zio->io_priority, 0,
vdev_raidz_child_done, rc));
}
@@ -1536,7 +1742,7 @@ vdev_raidz_io_start(zio_t *zio)
if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
(zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
- rc->rc_offset, rc->rc_data, rc->rc_size,
+ rc->rc_offset, rc->rc_abd, rc->rc_size,
zio->io_type, zio->io_priority, 0,
vdev_raidz_child_done, rc));
}
@@ -1552,6 +1758,7 @@ vdev_raidz_io_start(zio_t *zio)
static void
raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
{
+ void *buf;
vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
@@ -1565,9 +1772,11 @@ raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
zbc.zbc_has_cksum = 0;
zbc.zbc_injected = rm->rm_ecksuminjected;
+ buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size);
zfs_ereport_post_checksum(zio->io_spa, vd, zio,
- rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
+ rc->rc_offset, rc->rc_size, buf, bad_data,
&zbc);
+ abd_return_buf(rc->rc_abd, buf, rc->rc_size);
}
}
@@ -1616,7 +1825,7 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
if (!rc->rc_tried || rc->rc_error != 0)
continue;
orig[c] = zio_buf_alloc(rc->rc_size);
- bcopy(rc->rc_data, orig[c], rc->rc_size);
+ abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size);
}
vdev_raidz_generate_parity(rm);
@@ -1625,7 +1834,7 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
rc = &rm->rm_col[c];
if (!rc->rc_tried || rc->rc_error != 0)
continue;
- if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
+ if (bcmp(orig[c], abd_to_buf(rc->rc_abd), rc->rc_size) != 0) {
raidz_checksum_error(zio, rc, orig[c]);
rc->rc_error = SET_ERROR(ECKSUM);
ret++;
@@ -1728,7 +1937,8 @@ vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
ASSERT3S(c, >=, 0);
ASSERT3S(c, <, rm->rm_cols);
rc = &rm->rm_col[c];
- bcopy(rc->rc_data, orig[i], rc->rc_size);
+ abd_copy_to_buf(orig[i], rc->rc_abd,
+ rc->rc_size);
}
/*
@@ -1758,7 +1968,8 @@ vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
for (i = 0; i < n; i++) {
c = tgts[i];
rc = &rm->rm_col[c];
- bcopy(orig[i], rc->rc_data, rc->rc_size);
+ abd_copy_from_buf(rc->rc_abd, orig[i],
+ rc->rc_size);
}
do {
@@ -1997,7 +2208,7 @@ vdev_raidz_io_done(zio_t *zio)
continue;
zio_nowait(zio_vdev_child_io(zio, NULL,
vd->vdev_child[rc->rc_devidx],
- rc->rc_offset, rc->rc_data, rc->rc_size,
+ rc->rc_offset, rc->rc_abd, rc->rc_size,
zio->io_type, zio->io_priority, 0,
vdev_raidz_child_done, rc));
} while (++c < rm->rm_cols);
@@ -2077,7 +2288,7 @@ done:
continue;
zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
- rc->rc_offset, rc->rc_data, rc->rc_size,
+ rc->rc_offset, rc->rc_abd, rc->rc_size,
ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));