aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBrian Atkinson <[email protected]>2021-11-09 12:51:33 -0700
committerGitHub <[email protected]>2021-11-09 12:51:33 -0700
commit345196be182ad6efe07ea21372117243ba6d7910 (patch)
treec6d2ea022a463ef0f0bf6019e25bd65ffc8b0b97
parent453c63e9b74cea42d45e0bd30c0771f98a7ce60d (diff)
Single IO issue for raidz writes with skip sector
In order to reduce contention on the vq_lock, optional skip sectors for Raidz writes can be placed into a single IO request. This is done by padding out the linear ABD for a parity column to contain the skip sector and by creating gang ABD to contain the data and skip sector for data columns. The vdev_raidz_map_alloc() function now contains specific functions for both reads and write to allocate the ABD's that will be issued down to the VDEV chldren. Reviewed-by: Brian Behlendorf <[email protected]> Reviewed-by: Alexander Motin <[email protected]> Reviewed-By: Mark Maybee <[email protected]> Signed-off-by: Brian Atkinson <[email protected]> Closes #12333
-rw-r--r--module/zfs/vdev_raidz.c172
1 files changed, 135 insertions, 37 deletions
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index 1feebf708..7e7202ec1 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -174,6 +174,114 @@ const zio_vsd_ops_t vdev_raidz_vsd_ops = {
.vsd_free = vdev_raidz_map_free_vsd,
};
+static void
+vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift)
+{
+ int c;
+ int nwrapped = 0;
+ uint64_t off = 0;
+ raidz_row_t *rr = rm->rm_row[0];
+
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
+ ASSERT3U(rm->rm_nrows, ==, 1);
+
+ /*
+ * Pad any parity columns with additional space to account for skip
+ * sectors.
+ */
+ if (rm->rm_skipstart < rr->rr_firstdatacol) {
+ ASSERT0(rm->rm_skipstart);
+ nwrapped = rm->rm_nskip;
+ } else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) {
+ nwrapped =
+ (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols;
+ }
+
+ /*
+ * Optional single skip sectors (rc_size == 0) will be handled in
+ * vdev_raidz_io_start_write().
+ */
+ int skipped = rr->rr_scols - rr->rr_cols;
+
+ /* Allocate buffers for the parity columns */
+ for (c = 0; c < rr->rr_firstdatacol; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+
+ /*
+ * Parity columns will pad out a linear ABD to account for
+ * the skip sector. A linear ABD is used here because
+ * parity calculations use the ABD buffer directly to calculate
+ * parity. This avoids doing a memcpy back to the ABD after the
+ * parity has been calculated. By issuing the parity column
+ * with the skip sector we can reduce contention on the child
+ * VDEV queue locks (vq_lock).
+ */
+ if (c < nwrapped) {
+ rc->rc_abd = abd_alloc_linear(
+ rc->rc_size + (1ULL << ashift), B_FALSE);
+ abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift);
+ skipped++;
+ } else {
+ rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
+ }
+ }
+
+ for (off = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct,
+ zio->io_abd, off, rc->rc_size);
+
+ /*
+ * Generate I/O for skip sectors to improve aggregation
+ * continuity. We will use gang ABD's to reduce contention
+ * on the child VDEV queue locks (vq_lock) by issuing
+ * a single I/O that contains the data and skip sector.
+ *
+ * It is important to make sure that rc_size is not updated
+ * even though we are adding a skip sector to the ABD. When
+ * calculating the parity in vdev_raidz_generate_parity_row()
+ * the rc_size is used to iterate through the ABD's. We can
+ * not have zero'd out skip sectors used for calculating
+ * parity for raidz, because those same sectors are not used
+ * during reconstruction.
+ */
+ if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) {
+ rc->rc_abd = abd_alloc_gang();
+ abd_gang_add(rc->rc_abd, abd, B_TRUE);
+ abd_gang_add(rc->rc_abd,
+ abd_get_zeros(1ULL << ashift), B_TRUE);
+ skipped++;
+ } else {
+ rc->rc_abd = abd;
+ }
+ off += rc->rc_size;
+ }
+
+ ASSERT3U(off, ==, zio->io_size);
+ ASSERT3S(skipped, ==, rm->rm_nskip);
+}
+
+static void
+vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm)
+{
+ int c;
+ raidz_row_t *rr = rm->rm_row[0];
+
+ ASSERT3U(rm->rm_nrows, ==, 1);
+
+ /* Allocate buffers for the parity columns */
+ for (c = 0; c < rr->rr_firstdatacol; c++)
+ rr->rr_col[c].rc_abd =
+ abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE);
+
+ for (uint64_t off = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
+ zio->io_abd, off, rc->rc_size);
+ off += rc->rc_size;
+ }
+}
+
/*
* Divides the IO evenly across all child vdevs; usually, dcols is
* the number of children in the target vdev.
@@ -287,17 +395,6 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
rm->rm_nskip = roundup(tot, nparity + 1) - tot;
rm->rm_skipstart = bc;
- for (c = 0; c < rr->rr_firstdatacol; c++)
- rr->rr_col[c].rc_abd =
- abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE);
-
- for (uint64_t off = 0; c < acols; c++) {
- raidz_col_t *rc = &rr->rr_col[c];
- rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
- zio->io_abd, off, rc->rc_size);
- off += rc->rc_size;
- }
-
/*
* If all data stored spans all columns, there's a danger that parity
* will always be on the same device and, since parity isn't read
@@ -333,6 +430,12 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
rm->rm_skipstart = 1;
}
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ vdev_raidz_map_alloc_write(zio, rm, ashift);
+ } else {
+ vdev_raidz_map_alloc_read(zio, rm);
+ }
+
/* init RAIDZ parity ops */
rm->rm_ops = vdev_raidz_math_get_ops();
@@ -1482,6 +1585,7 @@ vdev_raidz_child_done(zio_t *zio)
{
raidz_col_t *rc = zio->io_private;
+ ASSERT3P(rc->rc_abd, !=, NULL);
rc->rc_error = zio->io_error;
rc->rc_tried = 1;
rc->rc_skipped = 0;
@@ -1525,40 +1629,34 @@ vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift)
{
vdev_t *vd = zio->io_vd;
raidz_map_t *rm = zio->io_vsd;
- int c, i;
vdev_raidz_generate_parity_row(rm, rr);
- for (int c = 0; c < rr->rr_cols; c++) {
+ for (int c = 0; c < rr->rr_scols; c++) {
raidz_col_t *rc = &rr->rr_col[c];
- if (rc->rc_size == 0)
- continue;
+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
/* Verify physical to logical translation */
vdev_raidz_io_verify(vd, rr, c);
- zio_nowait(zio_vdev_child_io(zio, NULL,
- vd->vdev_child[rc->rc_devidx], rc->rc_offset,
- rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority,
- 0, vdev_raidz_child_done, rc));
- }
-
- /*
- * Generate optional I/Os for skip sectors to improve aggregation
- * contiguity.
- */
- for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
- ASSERT(c <= rr->rr_scols);
- if (c == rr->rr_scols)
- c = 0;
-
- raidz_col_t *rc = &rr->rr_col[c];
- vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
-
- zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
- rc->rc_offset + rc->rc_size, NULL, 1ULL << ashift,
- zio->io_type, zio->io_priority,
- ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
+ if (rc->rc_size > 0) {
+ ASSERT3P(rc->rc_abd, !=, NULL);
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ rc->rc_offset, rc->rc_abd,
+ abd_get_size(rc->rc_abd), zio->io_type,
+ zio->io_priority, 0, vdev_raidz_child_done, rc));
+ } else {
+ /*
+ * Generate optional write for skip sector to improve
+ * aggregation contiguity.
+ */
+ ASSERT3P(rc->rc_abd, ==, NULL);
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ rc->rc_offset, NULL, 1ULL << ashift,
+ zio->io_type, zio->io_priority,
+ ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL,
+ NULL));
+ }
}
}