Merge commit 'refs/top-bases/fix-dbuf_dirty_record_t' into fix-dbuf_dirty_record_t

author: Brian Behlendorf <[email protected]> 2009-08-18 12:09:14 -0700
committer: Brian Behlendorf <[email protected]> 2009-08-18 12:09:14 -0700
commit: b330ab77dfa9ec1d3844bcb25cdf5eb02fbe318d (patch)
tree: 157afe7ded2d31f2bebbe5f5235b1eaf3432b1f3 /module/zfs/vdev_raidz.c
parent: 5b629493e04f13db9d0e29bf7d5a85769be91459 (diff)
parent: 45d1cae3b8c949ecc391dd7a5b81963b34c71c29 (diff)
1 files changed, 996 insertions, 283 deletions
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index 92753d871..b3074173e 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -35,12 +35,27 @@
 /*
  * Virtual device vector for RAID-Z.
  *
- * This vdev supports both single and double parity. For single parity, we
- * use a simple XOR of all the data columns. For double parity, we use both
- * the simple XOR as well as a technique described in "The mathematics of
- * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8),
- * over the integers expressable in a single byte. Briefly, the operations on
- * the field are defined as follows:
+ * This vdev supports single, double, and triple parity. For single parity,
+ * we use a simple XOR of all the data columns. For double or triple parity,
+ * we use a special case of Reed-Solomon coding. This extends the
+ * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
+ * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
+ * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
+ * former is also based. The latter is designed to provide higher performance
+ * for writes.
+ *
+ * Note that the Plank paper claimed to support arbitrary N+M, but was then
+ * amended six years later identifying a critical flaw that invalidates its
+ * claims. Nevertheless, the technique can be adapted to work for up to
+ * triple parity. For additional parity, the amendment "Note: Correction to
+ * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
+ * is viable, but the additional complexity means that write performance will
+ * suffer.
+ *
+ * All of the methods above operate on a Galois field, defined over the
+ * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements
+ * can be expressed with a single byte. Briefly, the operations on the
+ * field are defined as follows:
  *
  *   o addition (+) is represented by a bitwise XOR
  *   o subtraction (-) is therefore identical to addition: A + B = A - B
@@ -55,22 +70,32 @@
  *	(A * 2)_0 = A_7
  *
  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
+ * As an aside, this multiplication is derived from the error correcting
+ * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
  *
  * Observe that any number in the field (except for 0) can be expressed as a
  * power of 2 -- a generator for the field. We store a table of the powers of
  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
- * than field addition). The inverse of a field element A (A^-1) is A^254.
+ * than field addition). The inverse of a field element A (A^-1) is therefore
+ * A ^ (255 - 1) = A^254.
  *
- * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1,
- * can be expressed by field operations:
+ * The up-to-three parity columns, P, Q, R over several data columns,
+ * D_0, ... D_n-1, can be expressed by field operations:
  *
  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
+ *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
+ *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
  *
- * See the reconstruction code below for how P and Q can used individually or
- * in concert to recover missing data columns.
+ * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival
+ * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
+ * independent coefficients. (There are no additional coefficients that have
+ * this property which is why the uncorrected Plank method breaks down.)
+ *
+ * See the reconstruction code below for how P, Q and R can used individually
+ * or in concert to recover missing data columns.
  */
 
 typedef struct raidz_col {
@@ -84,21 +109,49 @@ typedef struct raidz_col {
 } raidz_col_t;
 
 typedef struct raidz_map {
-	uint64_t rm_cols;		/* Column count */
+	uint64_t rm_cols;		/* Regular column count */
+	uint64_t rm_scols;		/* Count including skipped columns */
 	uint64_t rm_bigcols;		/* Number of oversized columns */
 	uint64_t rm_asize;		/* Actual total I/O size */
 	uint64_t rm_missingdata;	/* Count of missing data devices */
 	uint64_t rm_missingparity;	/* Count of missing parity devices */
 	uint64_t rm_firstdatacol;	/* First data column/parity count */
+	uint64_t rm_skipped;		/* Skipped sectors for padding */
 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
 } raidz_map_t;
 
 #define	VDEV_RAIDZ_P		0
 #define	VDEV_RAIDZ_Q		1
+#define	VDEV_RAIDZ_R		2
+#define	VDEV_RAIDZ_MAXPARITY	3
+
+#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
+#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
+
+/*
+ * We provide a mechanism to perform the field multiplication operation on a
+ * 64-bit value all at once rather than a byte at a time. This works by
+ * creating a mask from the top bit in each byte and using that to
+ * conditionally apply the XOR of 0x1d.
+ */
+#define	VDEV_RAIDZ_64MUL_2(x, mask) \
+{ \
+	(mask) = (x) & 0x8080808080808080ULL; \
+	(mask) = ((mask) << 1) - ((mask) >> 7); \
+	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
+	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
+}
 
-#define	VDEV_RAIDZ_MAXPARITY	2
+#define	VDEV_RAIDZ_64MUL_4(x, mask) \
+{ \
+	VDEV_RAIDZ_64MUL_2((x), mask); \
+	VDEV_RAIDZ_64MUL_2((x), mask); \
+}
 
-#define	VDEV_RAIDZ_MUL_2(a)	(((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0))
+/*
+ * Force reconstruction to use the general purpose method.
+ */
+int vdev_raidz_default_to_general;
 
 /*
  * These two tables represent powers and logs of 2 in the Galois field defined
@@ -201,7 +254,7 @@ vdev_raidz_map_free(zio_t *zio)
 	for (c = 0; c < rm->rm_firstdatacol; c++)
 		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
 
-	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols]));
+	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
 }
 
 static raidz_map_t *
@@ -213,24 +266,35 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
 	uint64_t s = zio->io_size >> unit_shift;
 	uint64_t f = b % dcols;
 	uint64_t o = (b / dcols) << unit_shift;
-	uint64_t q, r, c, bc, col, acols, coff, devidx;
+	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
 
 	q = s / (dcols - nparity);
 	r = s - q * (dcols - nparity);
 	bc = (r == 0 ? 0 : r + nparity);
+	tot = s + nparity * (q + (r == 0 ? 0 : 1));
+
+	if (q == 0) {
+		acols = bc;
+		scols = MIN(dcols, roundup(bc, nparity + 1));
+	} else {
+		acols = dcols;
+		scols = dcols;
+	}
 
-	acols = (q == 0 ? bc : dcols);
+	ASSERT3U(acols, <=, scols);
 
-	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP);
+	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
 
 	rm->rm_cols = acols;
+	rm->rm_scols = scols;
 	rm->rm_bigcols = bc;
-	rm->rm_asize = 0;
 	rm->rm_missingdata = 0;
 	rm->rm_missingparity = 0;
 	rm->rm_firstdatacol = nparity;
 
-	for (c = 0; c < acols; c++) {
+	asize = 0;
+
+	for (c = 0; c < scols; c++) {
 		col = f + c;
 		coff = o;
 		if (col >= dcols) {
@@ -239,15 +303,26 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
 		}
 		rm->rm_col[c].rc_devidx = col;
 		rm->rm_col[c].rc_offset = coff;
-		rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift;
 		rm->rm_col[c].rc_data = NULL;
 		rm->rm_col[c].rc_error = 0;
 		rm->rm_col[c].rc_tried = 0;
 		rm->rm_col[c].rc_skipped = 0;
-		rm->rm_asize += rm->rm_col[c].rc_size;
+
+		if (c >= acols)
+			rm->rm_col[c].rc_size = 0;
+		else if (c < bc)
+			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
+		else
+			rm->rm_col[c].rc_size = q << unit_shift;
+
+		asize += rm->rm_col[c].rc_size;
 	}
 
-	rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift);
+	ASSERT3U(asize, ==, tot << unit_shift);
+	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
+	rm->rm_skipped = roundup(tot, nparity + 1) - tot;
+	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_skipped << unit_shift);
+	ASSERT3U(rm->rm_skipped, <=, nparity);
 
 	for (c = 0; c < rm->rm_firstdatacol; c++)
 		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
@@ -305,12 +380,12 @@ vdev_raidz_generate_parity_p(raidz_map_t *rm)
 
 		if (c == rm->rm_firstdatacol) {
 			ASSERT(ccount == pcount);
-			for (i = 0; i < ccount; i++, p++, src++) {
+			for (i = 0; i < ccount; i++, src++, p++) {
 				*p = *src;
 			}
 		} else {
 			ASSERT(ccount <= pcount);
-			for (i = 0; i < ccount; i++, p++, src++) {
+			for (i = 0; i < ccount; i++, src++, p++) {
 				*p ^= *src;
 			}
 		}
@@ -320,10 +395,10 @@ vdev_raidz_generate_parity_p(raidz_map_t *rm)
 static void
 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
 {
-	uint64_t *q, *p, *src, pcount, ccount, mask, i;
+	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
 	int c;
 
-	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
+	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 
@@ -331,55 +406,138 @@ vdev_raidz_generate_parity_pq(raidz_map_t *rm)
 		src = rm->rm_col[c].rc_data;
 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
-		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
+
+		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
 
 		if (c == rm->rm_firstdatacol) {
-			ASSERT(ccount == pcount || ccount == 0);
-			for (i = 0; i < ccount; i++, p++, q++, src++) {
-				*q = *src;
+			ASSERT(ccnt == pcnt || ccnt == 0);
+			for (i = 0; i < ccnt; i++, src++, p++, q++) {
 				*p = *src;
+				*q = *src;
 			}
-			for (; i < pcount; i++, p++, q++, src++) {
-				*q = 0;
+			for (; i < pcnt; i++, src++, p++, q++) {
 				*p = 0;
+				*q = 0;
 			}
 		} else {
-			ASSERT(ccount <= pcount);
+			ASSERT(ccnt <= pcnt);
 
 			/*
-			 * Rather than multiplying each byte individually (as
-			 * described above), we are able to handle 8 at once
-			 * by generating a mask based on the high bit in each
-			 * byte and using that to conditionally XOR in 0x1d.
+			 * Apply the algorithm described above by multiplying
+			 * the previous result and adding in the new value.
 			 */
-			for (i = 0; i < ccount; i++, p++, q++, src++) {
-				mask = *q & 0x8080808080808080ULL;
-				mask = (mask << 1) - (mask >> 7);
-				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
-				    (mask & 0x1d1d1d1d1d1d1d1dULL);
+			for (i = 0; i < ccnt; i++, src++, p++, q++) {
+				*p ^= *src;
+
+				VDEV_RAIDZ_64MUL_2(*q, mask);
 				*q ^= *src;
+			}
+
+			/*
+			 * Treat short columns as though they are full of 0s.
+			 * Note that there's therefore nothing needed for P.
+			 */
+			for (; i < pcnt; i++, q++) {
+				VDEV_RAIDZ_64MUL_2(*q, mask);
+			}
+		}
+	}
+}
+
+static void
+vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
+{
+	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
+	int c;
+
+	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
+	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
+	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
+	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
+	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
+
+	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+		src = rm->rm_col[c].rc_data;
+		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
+		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
+		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
+
+		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
+
+		if (c == rm->rm_firstdatacol) {
+			ASSERT(ccnt == pcnt || ccnt == 0);
+			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
+				*p = *src;
+				*q = *src;
+				*r = *src;
+			}
+			for (; i < pcnt; i++, src++, p++, q++, r++) {
+				*p = 0;
+				*q = 0;
+				*r = 0;
+			}
+		} else {
+			ASSERT(ccnt <= pcnt);
+
+			/*
+			 * Apply the algorithm described above by multiplying
+			 * the previous result and adding in the new value.
+			 */
+			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
 				*p ^= *src;
+
+				VDEV_RAIDZ_64MUL_2(*q, mask);
+				*q ^= *src;
+
+				VDEV_RAIDZ_64MUL_4(*r, mask);
+				*r ^= *src;
 			}
 
 			/*
 			 * Treat short columns as though they are full of 0s.
+			 * Note that there's therefore nothing needed for P.
 			 */
-			for (; i < pcount; i++, q++) {
-				mask = *q & 0x8080808080808080ULL;
-				mask = (mask << 1) - (mask >> 7);
-				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
-				    (mask & 0x1d1d1d1d1d1d1d1dULL);
+			for (; i < pcnt; i++, q++, r++) {
+				VDEV_RAIDZ_64MUL_2(*q, mask);
+				VDEV_RAIDZ_64MUL_4(*r, mask);
 			}
 		}
 	}
 }
 
+/*
+ * Generate RAID parity in the first virtual columns according to the number of
+ * parity columns available.
+ */
 static void
-vdev_raidz_reconstruct_p(raidz_map_t *rm, int x)
+vdev_raidz_generate_parity(raidz_map_t *rm)
+{
+	switch (rm->rm_firstdatacol) {
+	case 1:
+		vdev_raidz_generate_parity_p(rm);
+		break;
+	case 2:
+		vdev_raidz_generate_parity_pq(rm);
+		break;
+	case 3:
+		vdev_raidz_generate_parity_pqr(rm);
+		break;
+	default:
+		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
+	}
+}
+
+static int
+vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
 {
 	uint64_t *dst, *src, xcount, ccount, count, i;
+	int x = tgts[0];
 	int c;
 
+	ASSERT(ntgts == 1);
+	ASSERT(x >= rm->rm_firstdatacol);
+	ASSERT(x < rm->rm_cols);
+
 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
 	ASSERT(xcount > 0);
@@ -404,15 +562,20 @@ vdev_raidz_reconstruct_p(raidz_map_t *rm, int x)
 			*dst ^= *src;
 		}
 	}
+
+	return (1 << VDEV_RAIDZ_P);
 }
 
-static void
-vdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
+static int
+vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
 {
 	uint64_t *dst, *src, xcount, ccount, count, mask, i;
 	uint8_t *b;
+	int x = tgts[0];
 	int c, j, exp;
 
+	ASSERT(ntgts == 1);
+
 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
 
@@ -436,23 +599,13 @@ vdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
 			}
 
 		} else {
-			/*
-			 * For an explanation of this, see the comment in
-			 * vdev_raidz_generate_parity_pq() above.
-			 */
 			for (i = 0; i < count; i++, dst++, src++) {
-				mask = *dst & 0x8080808080808080ULL;
-				mask = (mask << 1) - (mask >> 7);
-				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
-				    (mask & 0x1d1d1d1d1d1d1d1dULL);
+				VDEV_RAIDZ_64MUL_2(*dst, mask);
 				*dst ^= *src;
 			}
 
 			for (; i < xcount; i++, dst++) {
-				mask = *dst & 0x8080808080808080ULL;
-				mask = (mask << 1) - (mask >> 7);
-				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
-				    (mask & 0x1d1d1d1d1d1d1d1dULL);
+				VDEV_RAIDZ_64MUL_2(*dst, mask);
 			}
 		}
 	}
@@ -467,15 +620,20 @@ vdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
 			*b = vdev_raidz_exp2(*b, exp);
 		}
 	}
+
+	return (1 << VDEV_RAIDZ_Q);
 }
 
-static void
-vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y)
+static int
+vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
 {
 	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
 	void *pdata, *qdata;
 	uint64_t xsize, ysize, i;
+	int x = tgts[0];
+	int y = tgts[1];
 
+	ASSERT(ntgts == 2);
 	ASSERT(x < y);
 	ASSERT(x >= rm->rm_firstdatacol);
 	ASSERT(y < rm->rm_cols);
@@ -553,15 +711,554 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y)
 	 */
 	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
 	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
+
+	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
+}
+
+/* BEGIN CSTYLED */
+/*
+ * In the general case of reconstruction, we must solve the system of linear
+ * equations defined by the coeffecients used to generate parity as well as
+ * the contents of the data and parity disks. This can be expressed with
+ * vectors for the original data (D) and the actual data (d) and parity (p)
+ * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
+ *
+ *            __   __                     __     __
+ *            |     |         __     __   |  p_0  |
+ *            |  V  |         |  D_0  |   | p_m-1 |
+ *            |     |    x    |   :   | = |  d_0  |
+ *            |  I  |         | D_n-1 |   |   :   |
+ *            |     |         ~~     ~~   | d_n-1 |
+ *            ~~   ~~                     ~~     ~~
+ *
+ * I is simply a square identity matrix of size n, and V is a vandermonde
+ * matrix defined by the coeffecients we chose for the various parity columns
+ * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
+ * computation as well as linear separability.
+ *
+ *      __               __               __     __
+ *      |   1   ..  1 1 1 |               |  p_0  |
+ *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
+ *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
+ *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
+ *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
+ *      |   :       : : : |   |   :   |   |  d_2  |
+ *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
+ *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
+ *      |   0   ..  0 0 1 |               | d_n-1 |
+ *      ~~               ~~               ~~     ~~
+ *
+ * Note that I, V, d, and p are known. To compute D, we must invert the
+ * matrix and use the known data and parity values to reconstruct the unknown
+ * data values. We begin by removing the rows in V|I and d|p that correspond
+ * to failed or missing columns; we then make V|I square (n x n) and d|p
+ * sized n by removing rows corresponding to unused parity from the bottom up
+ * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
+ * using Gauss-Jordan elimination. In the example below we use m=3 parity
+ * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
+ *           __                               __
+ *           |  1   1   1   1   1   1   1   1  |
+ *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
+ *           |  19 205 116  29  64  16  4   1  |      / /
+ *           |  1   0   0   0   0   0   0   0  |     / /
+ *           |  0   1   0   0   0   0   0   0  | <--' /
+ *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
+ *           |  0   0   0   1   0   0   0   0  |
+ *           |  0   0   0   0   1   0   0   0  |
+ *           |  0   0   0   0   0   1   0   0  |
+ *           |  0   0   0   0   0   0   1   0  |
+ *           |  0   0   0   0   0   0   0   1  |
+ *           ~~                               ~~
+ *           __                               __
+ *           |  1   1   1   1   1   1   1   1  |
+ *           | 128  64  32  16  8   4   2   1  |
+ *           |  19 205 116  29  64  16  4   1  |
+ *           |  1   0   0   0   0   0   0   0  |
+ *           |  0   1   0   0   0   0   0   0  |
+ *  (V|I)' = |  0   0   1   0   0   0   0   0  |
+ *           |  0   0   0   1   0   0   0   0  |
+ *           |  0   0   0   0   1   0   0   0  |
+ *           |  0   0   0   0   0   1   0   0  |
+ *           |  0   0   0   0   0   0   1   0  |
+ *           |  0   0   0   0   0   0   0   1  |
+ *           ~~                               ~~
+ *
+ * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
+ * have carefully chosen the seed values 1, 2, and 4 to ensure that this
+ * matrix is not singular.
+ * __                                                                 __
+ * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
+ * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
+ * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
+ * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
+ * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
+ * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
+ * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
+ * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
+ * ~~                                                                 ~~
+ * __                                                                 __
+ * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
+ * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
+ * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
+ * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
+ * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
+ * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
+ * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
+ * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
+ * ~~                                                                 ~~
+ * __                                                                 __
+ * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
+ * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
+ * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
+ * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
+ * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
+ * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
+ * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
+ * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
+ * ~~                                                                 ~~
+ * __                                                                 __
+ * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
+ * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
+ * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
+ * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
+ * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
+ * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
+ * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
+ * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
+ * ~~                                                                 ~~
+ * __                                                                 __
+ * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
+ * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
+ * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
+ * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
+ * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
+ * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
+ * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
+ * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
+ * ~~                                                                 ~~
+ * __                                                                 __
+ * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
+ * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
+ * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
+ * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
+ * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
+ * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
+ * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
+ * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
+ * ~~                                                                 ~~
+ *                   __                               __
+ *                   |  0   0   1   0   0   0   0   0  |
+ *                   | 167 100  5   41 159 169 217 208 |
+ *                   | 166 100  4   40 158 168 216 209 |
+ *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
+ *                   |  0   0   0   0   1   0   0   0  |
+ *                   |  0   0   0   0   0   1   0   0  |
+ *                   |  0   0   0   0   0   0   1   0  |
+ *                   |  0   0   0   0   0   0   0   1  |
+ *                   ~~                               ~~
+ *
+ * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
+ * of the missing data.
+ *
+ * As is apparent from the example above, the only non-trivial rows in the
+ * inverse matrix correspond to the data disks that we're trying to
+ * reconstruct. Indeed, those are the only rows we need as the others would
+ * only be useful for reconstructing data known or assumed to be valid. For
+ * that reason, we only build the coefficients in the rows that correspond to
+ * targeted columns.
+ */
+/* END CSTYLED */
+
+static void
+vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
+    uint8_t **rows)
+{
+	int i, j;
+	int pow;
+
+	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
+
+	/*
+	 * Fill in the missing rows of interest.
+	 */
+	for (i = 0; i < nmap; i++) {
+		ASSERT3S(0, <=, map[i]);
+		ASSERT3S(map[i], <=, 2);
+
+		pow = map[i] * n;
+		if (pow > 255)
+			pow -= 255;
+		ASSERT(pow <= 255);
+
+		for (j = 0; j < n; j++) {
+			pow -= map[i];
+			if (pow < 0)
+				pow += 255;
+			rows[i][j] = vdev_raidz_pow2[pow];
+		}
+	}
+}
+
+static void
+vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
+    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
+{
+	int i, j, ii, jj;
+	uint8_t log;
+
+	/*
+	 * Assert that the first nmissing entries from the array of used
+	 * columns correspond to parity columns and that subsequent entries
+	 * correspond to data columns.
+	 */
+	for (i = 0; i < nmissing; i++) {
+		ASSERT3S(used[i], <, rm->rm_firstdatacol);
+	}
+	for (; i < n; i++) {
+		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
+	}
+
+	/*
+	 * First initialize the storage where we'll compute the inverse rows.
+	 */
+	for (i = 0; i < nmissing; i++) {
+		for (j = 0; j < n; j++) {
+			invrows[i][j] = (i == j) ? 1 : 0;
+		}
+	}
+
+	/*
+	 * Subtract all trivial rows from the rows of consequence.
+	 */
+	for (i = 0; i < nmissing; i++) {
+		for (j = nmissing; j < n; j++) {
+			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
+			jj = used[j] - rm->rm_firstdatacol;
+			ASSERT3S(jj, <, n);
+			invrows[i][j] = rows[i][jj];
+			rows[i][jj] = 0;
+		}
+	}
+
+	/*
+	 * For each of the rows of interest, we must normalize it and subtract
+	 * a multiple of it from the other rows.
+	 */
+	for (i = 0; i < nmissing; i++) {
+		for (j = 0; j < missing[i]; j++) {
+			ASSERT3U(rows[i][j], ==, 0);
+		}
+		ASSERT3U(rows[i][missing[i]], !=, 0);
+
+		/*
+		 * Compute the inverse of the first element and multiply each
+		 * element in the row by that value.
+		 */
+		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
+
+		for (j = 0; j < n; j++) {
+			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
+			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
+		}
+
+		for (ii = 0; ii < nmissing; ii++) {
+			if (i == ii)
+				continue;
+
+			ASSERT3U(rows[ii][missing[i]], !=, 0);
+
+			log = vdev_raidz_log2[rows[ii][missing[i]]];
+
+			for (j = 0; j < n; j++) {
+				rows[ii][j] ^=
+				    vdev_raidz_exp2(rows[i][j], log);
+				invrows[ii][j] ^=
+				    vdev_raidz_exp2(invrows[i][j], log);
+			}
+		}
+	}
+
+	/*
+	 * Verify that the data that is left in the rows are properly part of
+	 * an identity matrix.
+	 */
+	for (i = 0; i < nmissing; i++) {
+		for (j = 0; j < n; j++) {
+			if (j == missing[i]) {
+				ASSERT3U(rows[i][j], ==, 1);
+			} else {
+				ASSERT3U(rows[i][j], ==, 0);
+			}
+		}
+	}
 }
 
+static void
+vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
+    int *missing, uint8_t **invrows, const uint8_t *used)
+{
+	int i, j, x, cc, c;
+	uint8_t *src;
+	uint64_t ccount;
+	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
+	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
+	uint8_t log, val;
+	int ll;
+	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
+	uint8_t *p, *pp;
+	size_t psize;
+
+	psize = sizeof (invlog[0][0]) * n * nmissing;
+	p = kmem_alloc(psize, KM_SLEEP);
+
+	for (pp = p, i = 0; i < nmissing; i++) {
+		invlog[i] = pp;
+		pp += n;
+	}
+
+	for (i = 0; i < nmissing; i++) {
+		for (j = 0; j < n; j++) {
+			ASSERT3U(invrows[i][j], !=, 0);
+			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
+		}
+	}
+
+	for (i = 0; i < n; i++) {
+		c = used[i];
+		ASSERT3U(c, <, rm->rm_cols);
+
+		src = rm->rm_col[c].rc_data;
+		ccount = rm->rm_col[c].rc_size;
+		for (j = 0; j < nmissing; j++) {
+			cc = missing[j] + rm->rm_firstdatacol;
+			ASSERT3U(cc, >=, rm->rm_firstdatacol);
+			ASSERT3U(cc, <, rm->rm_cols);
+			ASSERT3U(cc, !=, c);
+
+			dst[j] = rm->rm_col[cc].rc_data;
+			dcount[j] = rm->rm_col[cc].rc_size;
+		}
+
+		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
+
+		for (x = 0; x < ccount; x++, src++) {
+			if (*src != 0)
+				log = vdev_raidz_log2[*src];
+
+			for (cc = 0; cc < nmissing; cc++) {
+				if (x >= dcount[cc])
+					continue;
+
+				if (*src == 0) {
+					val = 0;
+				} else {
+					if ((ll = log + invlog[cc][i]) >= 255)
+						ll -= 255;
+					val = vdev_raidz_pow2[ll];
+				}
+
+				if (i == 0)
+					dst[cc][x] = val;
+				else
+					dst[cc][x] ^= val;
+			}
+		}
+	}
+
+	kmem_free(p, psize);
+}
+
+static int
+vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
+{
+	int n, i, c, t, tt;
+	int nmissing_rows;
+	int missing_rows[VDEV_RAIDZ_MAXPARITY];
+	int parity_map[VDEV_RAIDZ_MAXPARITY];
+
+	uint8_t *p, *pp;
+	size_t psize;
+
+	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
+	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
+	uint8_t *used;
+
+	int code = 0;
+
+
+	n = rm->rm_cols - rm->rm_firstdatacol;
+
+	/*
+	 * Figure out which data columns are missing.
+	 */
+	nmissing_rows = 0;
+	for (t = 0; t < ntgts; t++) {
+		if (tgts[t] >= rm->rm_firstdatacol) {
+			missing_rows[nmissing_rows++] =
+			    tgts[t] - rm->rm_firstdatacol;
+		}
+	}
+
+	/*
+	 * Figure out which parity columns to use to help generate the missing
+	 * data columns.
+	 */
+	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
+		ASSERT(tt < ntgts);
+		ASSERT(c < rm->rm_firstdatacol);
+
+		/*
+		 * Skip any targeted parity columns.
+		 */
+		if (c == tgts[tt]) {
+			tt++;
+			continue;
+		}
+
+		code |= 1 << c;
+
+		parity_map[i] = c;
+		i++;
+	}
+
+	ASSERT(code != 0);
+	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
+
+	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
+	    nmissing_rows * n + sizeof (used[0]) * n;
+	p = kmem_alloc(psize, KM_SLEEP);
+
+	for (pp = p, i = 0; i < nmissing_rows; i++) {
+		rows[i] = pp;
+		pp += n;
+		invrows[i] = pp;
+		pp += n;
+	}
+	used = pp;
+
+	for (i = 0; i < nmissing_rows; i++) {
+		used[i] = parity_map[i];
+	}
+
+	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+		if (tt < nmissing_rows &&
+		    c == missing_rows[tt] + rm->rm_firstdatacol) {
+			tt++;
+			continue;
+		}
+
+		ASSERT3S(i, <, n);
+		used[i] = c;
+		i++;
+	}
+
+	/*
+	 * Initialize the interesting rows of the matrix.
+	 */
+	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
+
+	/*
+	 * Invert the matrix.
+	 */
+	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
+	    invrows, used);
+
+	/*
+	 * Reconstruct the missing data using the generated matrix.
+	 */
+	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
+	    invrows, used);
+
+	kmem_free(p, psize);
+
+	return (code);
+}
+
+static int
+vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
+{
+	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
+	int ntgts;
+	int i, c;
+	int code;
+	int nbadparity, nbaddata;
+	int parity_valid[VDEV_RAIDZ_MAXPARITY];
+
+	/*
+	 * The tgts list must already be sorted.
+	 */
+	for (i = 1; i < nt; i++) {
+		ASSERT(t[i] > t[i - 1]);
+	}
+
+	nbadparity = rm->rm_firstdatacol;
+	nbaddata = rm->rm_cols - nbadparity;
+	ntgts = 0;
+	for (i = 0, c = 0; c < rm->rm_cols; c++) {
+		if (c < rm->rm_firstdatacol)
+			parity_valid[c] = B_FALSE;
+
+		if (i < nt && c == t[i]) {
+			tgts[ntgts++] = c;
+			i++;
+		} else if (rm->rm_col[c].rc_error != 0) {
+			tgts[ntgts++] = c;
+		} else if (c >= rm->rm_firstdatacol) {
+			nbaddata--;
+		} else {
+			parity_valid[c] = B_TRUE;
+			nbadparity--;
+		}
+	}
+
+	ASSERT(ntgts >= nt);
+	ASSERT(nbaddata >= 0);
+	ASSERT(nbaddata + nbadparity == ntgts);
+
+	dt = &tgts[nbadparity];
+
+	/*
+	 * See if we can use any of our optimized reconstruction routines.
+	 */
+	if (!vdev_raidz_default_to_general) {
+		switch (nbaddata) {
+		case 1:
+			if (parity_valid[VDEV_RAIDZ_P])
+				return (vdev_raidz_reconstruct_p(rm, dt, 1));
+
+			ASSERT(rm->rm_firstdatacol > 1);
+
+			if (parity_valid[VDEV_RAIDZ_Q])
+				return (vdev_raidz_reconstruct_q(rm, dt, 1));
+
+			ASSERT(rm->rm_firstdatacol > 2);
+			break;
+
+		case 2:
+			ASSERT(rm->rm_firstdatacol > 1);
+
+			if (parity_valid[VDEV_RAIDZ_P] &&
+			    parity_valid[VDEV_RAIDZ_Q])
+				return (vdev_raidz_reconstruct_pq(rm, dt, 2));
+
+			ASSERT(rm->rm_firstdatacol > 2);
+
+			break;
+		}
+	}
+
+	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
+	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
+	ASSERT(code > 0);
+	return (code);
+}
 
 static int
 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
 {
 	vdev_t *cvd;
 	uint64_t nparity = vd->vdev_nparity;
-	int c, error;
+	int c;
 	int lasterror = 0;
 	int numerrors = 0;
 
@@ -573,11 +1270,13 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
 		return (EINVAL);
 	}
 
+	vdev_open_children(vd);
+
 	for (c = 0; c < vd->vdev_children; c++) {
 		cvd = vd->vdev_child[c];
 
-		if ((error = vdev_open(cvd)) != 0) {
-			lasterror = error;
+		if (cvd->vdev_open_error != 0) {
+			lasterror = cvd->vdev_open_error;
 			numerrors++;
 			continue;
 		}
@@ -639,7 +1338,7 @@ vdev_raidz_io_start(zio_t *zio)
 	blkptr_t *bp = zio->io_bp;
 	raidz_map_t *rm;
 	raidz_col_t *rc;
-	int c;
+	int c, i;
 
 	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
 	    vd->vdev_nparity);
@@ -647,13 +1346,7 @@ vdev_raidz_io_start(zio_t *zio)
 	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
 
 	if (zio->io_type == ZIO_TYPE_WRITE) {
-		/*
-		 * Generate RAID parity in the first virtual columns.
-		 */
-		if (rm->rm_firstdatacol == 1)
-			vdev_raidz_generate_parity_p(rm);
-		else
-			vdev_raidz_generate_parity_pq(rm);
+		vdev_raidz_generate_parity(rm);
 
 		for (c = 0; c < rm->rm_cols; c++) {
 			rc = &rm->rm_col[c];
@@ -664,6 +1357,23 @@ vdev_raidz_io_start(zio_t *zio)
 			    vdev_raidz_child_done, rc));
 		}
 
+		/*
+		 * Generate optional I/Os for any skipped sectors to improve
+		 * aggregation contiguity.
+		 */
+		for (c = rm->rm_bigcols, i = 0; i < rm->rm_skipped; c++, i++) {
+			ASSERT(c <= rm->rm_scols);
+			if (c == rm->rm_scols)
+				c = 0;
+			rc = &rm->rm_col[c];
+			cvd = vd->vdev_child[rc->rc_devidx];
+			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+			    rc->rc_offset + rc->rc_size, NULL,
+			    1 << tvd->vdev_ashift,
+			    zio->io_type, zio->io_priority,
+			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
+		}
+
 		return (ZIO_PIPELINE_CONTINUE);
 	}
 
@@ -671,8 +1381,7 @@ vdev_raidz_io_start(zio_t *zio)
 
 	/*
 	 * Iterate over the columns in reverse order so that we hit the parity
-	 * last -- any errors along the way will force us to read the parity
-	 * data.
+	 * last -- any errors along the way will force us to read the parity.
 	 */
 	for (c = rm->rm_cols - 1; c >= 0; c--) {
 		rc = &rm->rm_col[c];
@@ -748,10 +1457,7 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
 		bcopy(rc->rc_data, orig[c], rc->rc_size);
 	}
 
-	if (rm->rm_firstdatacol == 1)
-		vdev_raidz_generate_parity_p(rm);
-	else
-		vdev_raidz_generate_parity_pq(rm);
+	vdev_raidz_generate_parity(rm);
 
 	for (c = 0; c < rm->rm_firstdatacol; c++) {
 		rc = &rm->rm_col[c];
@@ -768,9 +1474,10 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
 	return (ret);
 }
 
-static uint64_t raidz_corrected_p;
-static uint64_t raidz_corrected_q;
-static uint64_t raidz_corrected_pq;
+/*
+ * Keep statistics on all the ways that we used parity to correct data.
+ */
+static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
 
 static int
 vdev_raidz_worst_error(raidz_map_t *rm)
@@ -783,19 +1490,176 @@ vdev_raidz_worst_error(raidz_map_t *rm)
 	return (error);
 }
 
+/*
+ * Iterate over all combinations of bad data and attempt a reconstruction.
+ * Note that the algorithm below is non-optimal because it doesn't take into
+ * account how reconstruction is actually performed. For example, with
+ * triple-parity RAID-Z the reconstruction procedure is the same if column 4
+ * is targeted as invalid as if columns 1 and 4 are targeted since in both
+ * cases we'd only use parity information in column 0.
+ */
+static int
+vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
+{
+	raidz_map_t *rm = zio->io_vsd;
+	raidz_col_t *rc;
+	void *orig[VDEV_RAIDZ_MAXPARITY];
+	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
+	int *tgts = &tstore[1];
+	int current, next, i, c, n;
+	int code, ret = 0;
+
+	ASSERT(total_errors < rm->rm_firstdatacol);
+
+	/*
+	 * This simplifies one edge condition.
+	 */
+	tgts[-1] = -1;
+
+	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
+		/*
+		 * Initialize the targets array by finding the first n columns
+		 * that contain no error.
+		 *
+		 * If there were no data errors, we need to ensure that we're
+		 * always explicitly attempting to reconstruct at least one
+		 * data column. To do this, we simply push the highest target
+		 * up into the data columns.
+		 */
+		for (c = 0, i = 0; i < n; i++) {
+			if (i == n - 1 && data_errors == 0 &&
+			    c < rm->rm_firstdatacol) {
+				c = rm->rm_firstdatacol;
+			}
+
+			while (rm->rm_col[c].rc_error != 0) {
+				c++;
+				ASSERT3S(c, <, rm->rm_cols);
+			}
+
+			tgts[i] = c++;
+		}
+
+		/*
+		 * Setting tgts[n] simplifies the other edge condition.
+		 */
+		tgts[n] = rm->rm_cols;
+
+		/*
+		 * These buffers were allocated in previous iterations.
+		 */
+		for (i = 0; i < n - 1; i++) {
+			ASSERT(orig[i] != NULL);
+		}
+
+		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
+
+		current = 0;
+		next = tgts[current];
+
+		while (current != n) {
+			tgts[current] = next;
+			current = 0;
+
+			/*
+			 * Save off the original data that we're going to
+			 * attempt to reconstruct.
+			 */
+			for (i = 0; i < n; i++) {
+				ASSERT(orig[i] != NULL);
+				c = tgts[i];
+				ASSERT3S(c, >=, 0);
+				ASSERT3S(c, <, rm->rm_cols);
+				rc = &rm->rm_col[c];
+				bcopy(rc->rc_data, orig[i], rc->rc_size);
+			}
+
+			/*
+			 * Attempt a reconstruction and exit the outer loop on
+			 * success.
+			 */
+			code = vdev_raidz_reconstruct(rm, tgts, n);
+			if (zio_checksum_error(zio) == 0) {
+				atomic_inc_64(&raidz_corrected[code]);
+
+				for (i = 0; i < n; i++) {
+					c = tgts[i];
+					rc = &rm->rm_col[c];
+					ASSERT(rc->rc_error == 0);
+					if (rc->rc_tried)
+						raidz_checksum_error(zio, rc);
+					rc->rc_error = ECKSUM;
+				}
+
+				ret = code;
+				goto done;
+			}
+
+			/*
+			 * Restore the original data.
+			 */
+			for (i = 0; i < n; i++) {
+				c = tgts[i];
+				rc = &rm->rm_col[c];
+				bcopy(orig[i], rc->rc_data, rc->rc_size);
+			}
+
+			do {
+				/*
+				 * Find the next valid column after the current
+				 * position..
+				 */
+				for (next = tgts[current] + 1;
+				    next < rm->rm_cols &&
+				    rm->rm_col[next].rc_error != 0; next++)
+					continue;
+
+				ASSERT(next <= tgts[current + 1]);
+
+				/*
+				 * If that spot is available, we're done here.
+				 */
+				if (next != tgts[current + 1])
+					break;
+
+				/*
+				 * Otherwise, find the next valid column after
+				 * the previous position.
+				 */
+				for (c = tgts[current - 1] + 1;
+				    rm->rm_col[c].rc_error != 0; c++)
+					continue;
+
+				tgts[current] = c;
+				current++;
+
+			} while (current != n);
+		}
+	}
+	n--;
+done:
+	for (i = 0; i < n; i++) {
+		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
+	}
+
+	return (ret);
+}
+
 static void
 vdev_raidz_io_done(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 	vdev_t *cvd;
 	raidz_map_t *rm = zio->io_vsd;
-	raidz_col_t *rc, *rc1;
+	raidz_col_t *rc;
 	int unexpected_errors = 0;
 	int parity_errors = 0;
 	int parity_untried = 0;
 	int data_errors = 0;
 	int total_errors = 0;
-	int n, c, c1;
+	int n, c;
+	int tgts[VDEV_RAIDZ_MAXPARITY];
+	int code;
 
 	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
 
@@ -859,8 +1723,7 @@ vdev_raidz_io_done(zio_t *zio)
 	 * any errors.
 	 */
 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
-		switch (data_errors) {
-		case 0:
+		if (data_errors == 0) {
 			if (zio_checksum_error(zio) == 0) {
 				/*
 				 * If we read parity information (unnecessarily
@@ -880,9 +1743,7 @@ vdev_raidz_io_done(zio_t *zio)
 				}
 				goto done;
 			}
-			break;
-
-		case 1:
+		} else {
 			/*
 			 * We either attempt to read all the parity columns or
 			 * none of them. If we didn't try to read parity, we
@@ -894,45 +1755,38 @@ vdev_raidz_io_done(zio_t *zio)
 			ASSERT(parity_errors < rm->rm_firstdatacol);
 
 			/*
-			 * Find the column that reported the error.
+			 * Identify the data columns that reported an error.
 			 */
+			n = 0;
 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 				rc = &rm->rm_col[c];
-				if (rc->rc_error != 0)
-					break;
+				if (rc->rc_error != 0) {
+					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
+					tgts[n++] = c;
+				}
 			}
-			ASSERT(c != rm->rm_cols);
-			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
-			    rc->rc_error == ESTALE);
 
-			if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
-				vdev_raidz_reconstruct_p(rm, c);
-			} else {
-				ASSERT(rm->rm_firstdatacol > 1);
-				vdev_raidz_reconstruct_q(rm, c);
-			}
+			ASSERT(rm->rm_firstdatacol >= n);
+
+			code = vdev_raidz_reconstruct(rm, tgts, n);
 
 			if (zio_checksum_error(zio) == 0) {
-				if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0)
-					atomic_inc_64(&raidz_corrected_p);
-				else
-					atomic_inc_64(&raidz_corrected_q);
+				atomic_inc_64(&raidz_corrected[code]);
 
 				/*
-				 * If there's more than one parity disk that
-				 * was successfully read, confirm that the
-				 * other parity disk produced the correct data.
-				 * This routine is suboptimal in that it
-				 * regenerates both the parity we wish to test
-				 * as well as the parity we just used to
-				 * perform the reconstruction, but this should
-				 * be a relatively uncommon case, and can be
-				 * optimized if it becomes a problem.
-				 * We also regenerate parity when resilvering
-				 * so we can write it out to the failed device
-				 * later.
+				 * If we read more parity disks than were used
+				 * for reconstruction, confirm that the other
+				 * parity disks produced correct data. This
+				 * routine is suboptimal in that it regenerates
+				 * the parity that we already used in addition
+				 * to the parity that we're attempting to
+				 * verify, but this should be a relatively
+				 * uncommon case, and can be optimized if it
+				 * becomes a problem. Note that we regenerate
+				 * parity when resilvering so we can write it
+				 * out to failed devices later.
 				 */
-				if (parity_errors < rm->rm_firstdatacol - 1 ||
+				if (parity_errors < rm->rm_firstdatacol - n ||
 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
 					n = raidz_parity_verify(zio, rm);
 					unexpected_errors += n;
@@ -942,46 +1796,6 @@ vdev_raidz_io_done(zio_t *zio)
 
 				goto done;
 			}
-			break;
-
-		case 2:
-			/*
-			 * Two data column errors require double parity.
-			 */
-			ASSERT(rm->rm_firstdatacol == 2);
-
-			/*
-			 * Find the two columns that reported errors.
-			 */
-			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
-				rc = &rm->rm_col[c];
-				if (rc->rc_error != 0)
-					break;
-			}
-			ASSERT(c != rm->rm_cols);
-			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
-			    rc->rc_error == ESTALE);
-
-			for (c1 = c++; c < rm->rm_cols; c++) {
-				rc = &rm->rm_col[c];
-				if (rc->rc_error != 0)
-					break;
-			}
-			ASSERT(c != rm->rm_cols);
-			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
-			    rc->rc_error == ESTALE);
-
-			vdev_raidz_reconstruct_pq(rm, c1, c);
-
-			if (zio_checksum_error(zio) == 0) {
-				atomic_inc_64(&raidz_corrected_pq);
-				goto done;
-			}
-			break;
-
-		default:
-			ASSERT(rm->rm_firstdatacol <= 2);
-			ASSERT(0);
 		}
 	}
 
@@ -1020,8 +1834,10 @@ vdev_raidz_io_done(zio_t *zio)
 	 * errors we detected, and we've attempted to read all columns. There
 	 * must, therefore, be one or more additional problems -- silent errors
 	 * resulting in invalid data rather than explicit I/O errors resulting
-	 * in absent data. Before we attempt combinatorial reconstruction make
-	 * sure we have a chance of coming up with the right answer.
+	 * in absent data. We check if there is enough additional data to
+	 * possibly reconstruct the data and then perform combinatorial
+	 * reconstruction over all possible combinations. If that fails,
+	 * we're cooked.
 	 */
 	if (total_errors >= rm->rm_firstdatacol) {
 		zio->io_error = vdev_raidz_worst_error(rm);
@@ -1032,133 +1848,30 @@ vdev_raidz_io_done(zio_t *zio)
 		 */
 		if (total_errors == rm->rm_firstdatacol)
 			zio->io_error = zio_worst_error(zio->io_error, ECKSUM);
-		goto done;
-	}
 
-	if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
+	} else if ((code = vdev_raidz_combrec(zio, total_errors,
+	    data_errors)) != 0) {
 		/*
-		 * Attempt to reconstruct the data from parity P.
+		 * If we didn't use all the available parity for the
+		 * combinatorial reconstruction, verify that the remaining
+		 * parity is correct.
 		 */
-		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
-			void *orig;
-			rc = &rm->rm_col[c];
-
-			orig = zio_buf_alloc(rc->rc_size);
-			bcopy(rc->rc_data, orig, rc->rc_size);
-			vdev_raidz_reconstruct_p(rm, c);
-
-			if (zio_checksum_error(zio) == 0) {
-				zio_buf_free(orig, rc->rc_size);
-				atomic_inc_64(&raidz_corrected_p);
-
-				/*
-				 * If this child didn't know that it returned
-				 * bad data, inform it.
-				 */
-				if (rc->rc_tried && rc->rc_error == 0)
-					raidz_checksum_error(zio, rc);
-				rc->rc_error = ECKSUM;
-				goto done;
-			}
-
-			bcopy(orig, rc->rc_data, rc->rc_size);
-			zio_buf_free(orig, rc->rc_size);
-		}
-	}
-
-	if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
+		if (code != (1 << rm->rm_firstdatacol) - 1)
+			(void) raidz_parity_verify(zio, rm);
+	} else {
 		/*
-		 * Attempt to reconstruct the data from parity Q.
+		 * All combinations failed to checksum. Generate checksum
+		 * ereports for all children.
 		 */
-		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
-			void *orig;
-			rc = &rm->rm_col[c];
-
-			orig = zio_buf_alloc(rc->rc_size);
-			bcopy(rc->rc_data, orig, rc->rc_size);
-			vdev_raidz_reconstruct_q(rm, c);
-
-			if (zio_checksum_error(zio) == 0) {
-				zio_buf_free(orig, rc->rc_size);
-				atomic_inc_64(&raidz_corrected_q);
-
-				/*
-				 * If this child didn't know that it returned
-				 * bad data, inform it.
-				 */
-				if (rc->rc_tried && rc->rc_error == 0)
-					raidz_checksum_error(zio, rc);
-				rc->rc_error = ECKSUM;
-				goto done;
-			}
-
-			bcopy(orig, rc->rc_data, rc->rc_size);
-			zio_buf_free(orig, rc->rc_size);
-		}
-	}
+		zio->io_error = ECKSUM;
 
-	if (rm->rm_firstdatacol > 1 &&
-	    rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 &&
-	    rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
-		/*
-		 * Attempt to reconstruct the data from both P and Q.
-		 */
-		for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) {
-			void *orig, *orig1;
-			rc = &rm->rm_col[c];
-
-			orig = zio_buf_alloc(rc->rc_size);
-			bcopy(rc->rc_data, orig, rc->rc_size);
-
-			for (c1 = c + 1; c1 < rm->rm_cols; c1++) {
-				rc1 = &rm->rm_col[c1];
-
-				orig1 = zio_buf_alloc(rc1->rc_size);
-				bcopy(rc1->rc_data, orig1, rc1->rc_size);
-
-				vdev_raidz_reconstruct_pq(rm, c, c1);
-
-				if (zio_checksum_error(zio) == 0) {
-					zio_buf_free(orig, rc->rc_size);
-					zio_buf_free(orig1, rc1->rc_size);
-					atomic_inc_64(&raidz_corrected_pq);
-
-					/*
-					 * If these children didn't know they
-					 * returned bad data, inform them.
-					 */
-					if (rc->rc_tried && rc->rc_error == 0)
-						raidz_checksum_error(zio, rc);
-					if (rc1->rc_tried && rc1->rc_error == 0)
-						raidz_checksum_error(zio, rc1);
-
-					rc->rc_error = ECKSUM;
-					rc1->rc_error = ECKSUM;
-
-					goto done;
-				}
-
-				bcopy(orig1, rc1->rc_data, rc1->rc_size);
-				zio_buf_free(orig1, rc1->rc_size);
+		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+			for (c = 0; c < rm->rm_cols; c++) {
+				rc = &rm->rm_col[c];
+				zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
+				    zio->io_spa, vd->vdev_child[rc->rc_devidx],
+				    zio, rc->rc_offset, rc->rc_size);
 			}
-
-			bcopy(orig, rc->rc_data, rc->rc_size);
-			zio_buf_free(orig, rc->rc_size);
-		}
-	}
-
-	/*
-	 * All combinations failed to checksum. Generate checksum ereports for
-	 * all children.
-	 */
-	zio->io_error = ECKSUM;
-
-	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
-		for (c = 0; c < rm->rm_cols; c++) {
-			rc = &rm->rm_col[c];
-			zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
-			    zio->io_spa, vd->vdev_child[rc->rc_devidx], zio,
-			    rc->rc_offset, rc->rc_size);
 		}
 	}
author	Brian Behlendorf <[email protected]>	2009-08-18 12:09:14 -0700
committer	Brian Behlendorf <[email protected]>	2009-08-18 12:09:14 -0700
commit	b330ab77dfa9ec1d3844bcb25cdf5eb02fbe318d (patch)
tree	157afe7ded2d31f2bebbe5f5235b1eaf3432b1f3 /module/zfs/vdev_raidz.c
parent	5b629493e04f13db9d0e29bf7d5a85769be91459 (diff)
parent	45d1cae3b8c949ecc391dd7a5b81963b34c71c29 (diff)