summaryrefslogtreecommitdiffstats
path: root/module/zfs/vdev_raidz.c
diff options
context:
space:
mode:
Diffstat (limited to 'module/zfs/vdev_raidz.c')
-rw-r--r--module/zfs/vdev_raidz.c65
1 files changed, 65 insertions, 0 deletions
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index 130ec575e..d2dfd5b43 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -431,23 +431,50 @@ static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
vdev_raidz_cksum_report
};
+/*
+ * Divides the IO evenly across all child vdevs; usually, dcols is
+ * the number of children in the target vdev.
+ */
static raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
uint64_t nparity)
{
raidz_map_t *rm;
+ /* The starting RAIDZ (parent) vdev sector of the block. */
uint64_t b = zio->io_offset >> unit_shift;
+ /* The zio's size in units of the vdev's minimum sector size. */
uint64_t s = zio->io_size >> unit_shift;
+ /* The first column for this stripe. */
uint64_t f = b % dcols;
+ /* The starting byte offset on each child vdev. */
uint64_t o = (b / dcols) << unit_shift;
uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
+ /*
+ * "Quotient": The number of data sectors for this stripe on each child
+ * vdev, not counting the extra sector that only the "big column" child
+ * vdevs hold for the "remainder" data.
+ */
q = s / (dcols - nparity);
+
+ /*
+ * "Remainder": The number of partial stripe data sectors in this I/O.
+ * This will add a sector to some, but not all, child vdevs.
+ */
r = s - q * (dcols - nparity);
+
+ /* The number of "big columns" - those which contain remainder data. */
bc = (r == 0 ? 0 : r + nparity);
+
+ /*
+ * The total number of data and parity sectors associated with
+ * this I/O.
+ */
tot = s + nparity * (q + (r == 0 ? 0 : 1));
+ /* acols: The number of columns that will be accessed. */
+ /* scols: The number of columns that will be accessed or skipped. */
if (q == 0) {
+ /* Our I/O request doesn't span all child vdevs. */
acols = bc;
scols = MIN(dcols, roundup(bc, nparity + 1));
} else {
@@ -1521,6 +1548,23 @@ vdev_raidz_child_done(zio_t *zio)
rc->rc_skipped = 0;
}
+/*
+ * Start an IO operation on a RAIDZ VDev
+ *
+ * Outline:
+ * - For write operations:
+ * 1. Generate the parity data
+ * 2. Create child zio write operations to each column's vdev, for both
+ * data and parity.
+ * 3. If the column skips any sectors for padding, create optional dummy
+ * write zio children for those areas to improve aggregation continuity.
+ * - For read operations:
+ * 1. Create child zio read operations to each data column's vdev to read
+ * the range of data required for the zio.
+ * 2. If this is a scrub or resilver operation, or if any of the data
+ * vdevs have had errors, then create zio read operations to the parity
+ * columns' VDevs as well.
+ */
static int
vdev_raidz_io_start(zio_t *zio)
{
@@ -1864,6 +1908,27 @@ done:
return (ret);
}
+/*
+ * Complete an IO operation on a RAIDZ VDev
+ *
+ * Outline:
+ * - For write operations:
+ * 1. Check for errors on the child IOs.
+ * 2. Return, setting an error code if too few child VDevs were written
+ * for the data to be reconstructed later. Note that partial writes
+ * are considered successful if they can be reconstructed at all.
+ * - For read operations:
+ * 1. Check for errors on the child IOs.
+ * 2. If data errors occurred:
+ * a. Try to reassemble the data from the parity available.
+ * b. If we haven't yet read the parity drives, read them now.
+ * c. If all parity drives have been read but the data still doesn't
+ * reassemble with a correct checksum, then try combinatorial
+ * reconstruction.
+ * d. If that doesn't work, return an error.
+ * 3. If there were unexpected errors or this is a resilver operation,
+ * rewrite the vdevs that had errors.
+ */
static void
vdev_raidz_io_done(zio_t *zio)
{