From 428870ff734fdaccc342b33fc53cf94724409a46 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 28 May 2010 13:45:14 -0700 Subject: Update core ZFS code from build 121 to build 141. --- module/zfs/vdev_raidz.c | 299 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 261 insertions(+), 38 deletions(-) (limited to 'module/zfs/vdev_raidz.c') diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index b3074173e..4b0f5602c 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include @@ -103,6 +102,7 @@ typedef struct raidz_col { uint64_t rc_offset; /* device offset */ uint64_t rc_size; /* I/O size */ void *rc_data; /* I/O data */ + void *rc_gdata; /* used to store the "good" version */ int rc_error; /* I/O error for this device */ uint8_t rc_tried; /* Did we attempt this I/O column? */ uint8_t rc_skipped; /* Did we skip this I/O column? */ @@ -116,14 +116,18 @@ typedef struct raidz_map { uint64_t rm_missingdata; /* Count of missing data devices */ uint64_t rm_missingparity; /* Count of missing parity devices */ uint64_t rm_firstdatacol; /* First data column/parity count */ - uint64_t rm_skipped; /* Skipped sectors for padding */ + uint64_t rm_nskip; /* Skipped sectors for padding */ + uint64_t rm_skipstart; /* Column index of padding start */ + void *rm_datacopy; /* rm_asize-buffer of copied data */ + uintptr_t rm_reports; /* # of referencing checksum reports */ + uint8_t rm_freed; /* map no longer has referencing ZIO */ + uint8_t rm_ecksuminjected; /* checksum error was injected */ raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ } raidz_map_t; #define VDEV_RAIDZ_P 0 #define VDEV_RAIDZ_Q 1 #define VDEV_RAIDZ_R 2 -#define VDEV_RAIDZ_MAXPARITY 3 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) @@ -226,6 +230,8 @@ static const uint8_t vdev_raidz_log2[256] = { 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, }; +static void vdev_raidz_generate_parity(raidz_map_t *rm); + /* * Multiply a given number by 2 raised to the given power. */ @@ -246,17 +252,184 @@ vdev_raidz_exp2(uint_t a, int exp) } static void -vdev_raidz_map_free(zio_t *zio) +vdev_raidz_map_free(raidz_map_t *rm) { - raidz_map_t *rm = zio->io_vsd; int c; + size_t size; - for (c = 0; c < rm->rm_firstdatacol; c++) + for (c = 0; c < rm->rm_firstdatacol; c++) { zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); + if (rm->rm_col[c].rc_gdata != NULL) + zio_buf_free(rm->rm_col[c].rc_gdata, + rm->rm_col[c].rc_size); + } + + size = 0; + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) + size += rm->rm_col[c].rc_size; + + if (rm->rm_datacopy != NULL) + zio_buf_free(rm->rm_datacopy, size); + kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); } +static void +vdev_raidz_map_free_vsd(zio_t *zio) +{ + raidz_map_t *rm = zio->io_vsd; + + ASSERT3U(rm->rm_freed, ==, 0); + rm->rm_freed = 1; + + if (rm->rm_reports == 0) + vdev_raidz_map_free(rm); +} + +/*ARGSUSED*/ +static void +vdev_raidz_cksum_free(void *arg, size_t ignored) +{ + raidz_map_t *rm = arg; + + ASSERT3U(rm->rm_reports, >, 0); + + if (--rm->rm_reports == 0 && rm->rm_freed != 0) + vdev_raidz_map_free(rm); +} + +static void +vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) +{ + raidz_map_t *rm = zcr->zcr_cbdata; + size_t c = zcr->zcr_cbinfo; + size_t x; + + const char *good = NULL; + const char *bad = rm->rm_col[c].rc_data; + + if (good_data == NULL) { + zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); + return; + } + + if (c < rm->rm_firstdatacol) { + /* + * The first time through, calculate the parity blocks for + * the good data (this relies on the fact that the good + * data never changes for a given logical ZIO) + */ + if (rm->rm_col[0].rc_gdata == NULL) { + char *bad_parity[VDEV_RAIDZ_MAXPARITY]; + char *buf; + + /* + * Set up the rm_col[]s to generate the parity for + * good_data, first saving the parity bufs and + * replacing them with buffers to hold the result. + */ + for (x = 0; x < rm->rm_firstdatacol; x++) { + bad_parity[x] = rm->rm_col[x].rc_data; + rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata = + zio_buf_alloc(rm->rm_col[x].rc_size); + } + + /* fill in the data columns from good_data */ + buf = (char *)good_data; + for (; x < rm->rm_cols; x++) { + rm->rm_col[x].rc_data = buf; + buf += rm->rm_col[x].rc_size; + } + + /* + * Construct the parity from the good data. + */ + vdev_raidz_generate_parity(rm); + + /* restore everything back to its original state */ + for (x = 0; x < rm->rm_firstdatacol; x++) + rm->rm_col[x].rc_data = bad_parity[x]; + + buf = rm->rm_datacopy; + for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { + rm->rm_col[x].rc_data = buf; + buf += rm->rm_col[x].rc_size; + } + } + + ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL); + good = rm->rm_col[c].rc_gdata; + } else { + /* adjust good_data to point at the start of our column */ + good = good_data; + + for (x = rm->rm_firstdatacol; x < c; x++) + good += rm->rm_col[x].rc_size; + } + + /* we drop the ereport if it ends up that the data was good */ + zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); +} + +/* + * Invoked indirectly by zfs_ereport_start_checksum(), called + * below when our read operation fails completely. The main point + * is to keep a copy of everything we read from disk, so that at + * vdev_raidz_cksum_finish() time we can compare it with the good data. + */ +static void +vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) +{ + size_t c = (size_t)(uintptr_t)arg; + caddr_t buf; + + raidz_map_t *rm = zio->io_vsd; + size_t size; + + /* set up the report and bump the refcount */ + zcr->zcr_cbdata = rm; + zcr->zcr_cbinfo = c; + zcr->zcr_finish = vdev_raidz_cksum_finish; + zcr->zcr_free = vdev_raidz_cksum_free; + + rm->rm_reports++; + ASSERT3U(rm->rm_reports, >, 0); + + if (rm->rm_datacopy != NULL) + return; + + /* + * It's the first time we're called for this raidz_map_t, so we need + * to copy the data aside; there's no guarantee that our zio's buffer + * won't be re-used for something else. + * + * Our parity data is already in separate buffers, so there's no need + * to copy them. + */ + + size = 0; + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) + size += rm->rm_col[c].rc_size; + + buf = rm->rm_datacopy = zio_buf_alloc(size); + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + raidz_col_t *col = &rm->rm_col[c]; + + bcopy(col->rc_data, buf, col->rc_size); + col->rc_data = buf; + + buf += col->rc_size; + } + ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size); +} + +static const zio_vsd_ops_t vdev_raidz_vsd_ops = { + vdev_raidz_map_free_vsd, + vdev_raidz_cksum_report +}; + static raidz_map_t * vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, uint64_t nparity) @@ -288,9 +461,14 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, rm->rm_cols = acols; rm->rm_scols = scols; rm->rm_bigcols = bc; + rm->rm_skipstart = bc; rm->rm_missingdata = 0; rm->rm_missingparity = 0; rm->rm_firstdatacol = nparity; + rm->rm_datacopy = NULL; + rm->rm_reports = 0; + rm->rm_freed = 0; + rm->rm_ecksuminjected = 0; asize = 0; @@ -304,6 +482,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, rm->rm_col[c].rc_devidx = col; rm->rm_col[c].rc_offset = coff; rm->rm_col[c].rc_data = NULL; + rm->rm_col[c].rc_gdata = NULL; rm->rm_col[c].rc_error = 0; rm->rm_col[c].rc_tried = 0; rm->rm_col[c].rc_skipped = 0; @@ -320,9 +499,9 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, ASSERT3U(asize, ==, tot << unit_shift); rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); - rm->rm_skipped = roundup(tot, nparity + 1) - tot; - ASSERT3U(rm->rm_asize - asize, ==, rm->rm_skipped << unit_shift); - ASSERT3U(rm->rm_skipped, <=, nparity); + rm->rm_nskip = roundup(tot, nparity + 1) - tot; + ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); + ASSERT3U(rm->rm_nskip, <=, nparity); for (c = 0; c < rm->rm_firstdatacol; c++) rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); @@ -347,6 +526,11 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, * Unfortunately, this decision created an implicit on-disk format * requirement that we need to support for all eternity, but only * for single-parity RAID-Z. + * + * If we intend to skip a sector in the zeroth column for padding + * we must make sure to note this swap. We will never intend to + * skip the first column since at least one data and one parity + * column must appear in each row. */ ASSERT(rm->rm_cols >= 2); ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); @@ -358,10 +542,13 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; rm->rm_col[1].rc_devidx = devidx; rm->rm_col[1].rc_offset = o; + + if (rm->rm_skipstart == 0) + rm->rm_skipstart = 1; } zio->io_vsd = rm; - zio->io_vsd_free = vdev_raidz_map_free; + zio->io_vsd_ops = &vdev_raidz_vsd_ops; return (rm); } @@ -1335,7 +1522,6 @@ vdev_raidz_io_start(zio_t *zio) vdev_t *vd = zio->io_vd; vdev_t *tvd = vd->vdev_top; vdev_t *cvd; - blkptr_t *bp = zio->io_bp; raidz_map_t *rm; raidz_col_t *rc; int c, i; @@ -1361,7 +1547,7 @@ vdev_raidz_io_start(zio_t *zio) * Generate optional I/Os for any skipped sectors to improve * aggregation contiguity. */ - for (c = rm->rm_bigcols, i = 0; i < rm->rm_skipped; c++, i++) { + for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { ASSERT(c <= rm->rm_scols); if (c == rm->rm_scols) c = 0; @@ -1396,7 +1582,7 @@ vdev_raidz_io_start(zio_t *zio) rc->rc_skipped = 1; continue; } - if (vdev_dtl_contains(cvd, DTL_MISSING, bp->blk_birth, 1)) { + if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { if (c >= rm->rm_firstdatacol) rm->rm_missingdata++; else @@ -1417,23 +1603,47 @@ vdev_raidz_io_start(zio_t *zio) return (ZIO_PIPELINE_CONTINUE); } + /* * Report a checksum error for a child of a RAID-Z device. */ static void -raidz_checksum_error(zio_t *zio, raidz_col_t *rc) +raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data) { vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + zio_bad_cksum_t zbc; + raidz_map_t *rm = zio->io_vsd; + mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_checksum_errors++; mutex_exit(&vd->vdev_stat_lock); + + zbc.zbc_has_cksum = 0; + zbc.zbc_injected = rm->rm_ecksuminjected; + + zfs_ereport_post_checksum(zio->io_spa, vd, zio, + rc->rc_offset, rc->rc_size, rc->rc_data, bad_data, + &zbc); } +} + +/* + * We keep track of whether or not there were any injected errors, so that + * any ereports we generate can note it. + */ +static int +raidz_checksum_verify(zio_t *zio) +{ + zio_bad_cksum_t zbc; + raidz_map_t *rm = zio->io_vsd; + + int ret = zio_checksum_error(zio, &zbc); + if (ret != 0 && zbc.zbc_injected != 0) + rm->rm_ecksuminjected = 1; - if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) - zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, - zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size); + return (ret); } /* @@ -1464,7 +1674,7 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) if (!rc->rc_tried || rc->rc_error != 0) continue; if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { - raidz_checksum_error(zio, rc); + raidz_checksum_error(zio, rc, orig[c]); rc->rc_error = ECKSUM; ret++; } @@ -1579,7 +1789,7 @@ vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) * success. */ code = vdev_raidz_reconstruct(rm, tgts, n); - if (zio_checksum_error(zio) == 0) { + if (raidz_checksum_verify(zio) == 0) { atomic_inc_64(&raidz_corrected[code]); for (i = 0; i < n; i++) { @@ -1587,7 +1797,8 @@ vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) rc = &rm->rm_col[c]; ASSERT(rc->rc_error == 0); if (rc->rc_tried) - raidz_checksum_error(zio, rc); + raidz_checksum_error(zio, rc, + orig[i]); rc->rc_error = ECKSUM; } @@ -1724,7 +1935,7 @@ vdev_raidz_io_done(zio_t *zio) */ if (total_errors <= rm->rm_firstdatacol - parity_untried) { if (data_errors == 0) { - if (zio_checksum_error(zio) == 0) { + if (raidz_checksum_verify(zio) == 0) { /* * If we read parity information (unnecessarily * as it happens since no reconstruction was @@ -1770,7 +1981,7 @@ vdev_raidz_io_done(zio_t *zio) code = vdev_raidz_reconstruct(rm, tgts, n); - if (zio_checksum_error(zio) == 0) { + if (raidz_checksum_verify(zio) == 0) { atomic_inc_64(&raidz_corrected[code]); /* @@ -1839,18 +2050,11 @@ vdev_raidz_io_done(zio_t *zio) * reconstruction over all possible combinations. If that fails, * we're cooked. */ - if (total_errors >= rm->rm_firstdatacol) { + if (total_errors > rm->rm_firstdatacol) { zio->io_error = vdev_raidz_worst_error(rm); - /* - * If there were exactly as many device errors as parity - * columns, yet we couldn't reconstruct the data, then at - * least one device must have returned bad data silently. - */ - if (total_errors == rm->rm_firstdatacol) - zio->io_error = zio_worst_error(zio->io_error, ECKSUM); - } else if ((code = vdev_raidz_combrec(zio, total_errors, - data_errors)) != 0) { + } else if (total_errors < rm->rm_firstdatacol && + (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) { /* * If we didn't use all the available parity for the * combinatorial reconstruction, verify that the remaining @@ -1860,17 +2064,34 @@ vdev_raidz_io_done(zio_t *zio) (void) raidz_parity_verify(zio, rm); } else { /* - * All combinations failed to checksum. Generate checksum - * ereports for all children. + * We're here because either: + * + * total_errors == rm_first_datacol, or + * vdev_raidz_combrec() failed + * + * In either case, there is enough bad data to prevent + * reconstruction. + * + * Start checksum ereports for all children which haven't + * failed, and the IO wasn't speculative. */ zio->io_error = ECKSUM; if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { for (c = 0; c < rm->rm_cols; c++) { rc = &rm->rm_col[c]; - zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, - zio->io_spa, vd->vdev_child[rc->rc_devidx], - zio, rc->rc_offset, rc->rc_size); + if (rc->rc_error == 0) { + zio_bad_cksum_t zbc; + zbc.zbc_has_cksum = 0; + zbc.zbc_injected = + rm->rm_ecksuminjected; + + zfs_ereport_start_checksum( + zio->io_spa, + vd->vdev_child[rc->rc_devidx], + zio, rc->rc_offset, rc->rc_size, + (void *)(uintptr_t)c, &zbc); + } } } } @@ -1918,6 +2139,8 @@ vdev_ops_t vdev_raidz_ops = { vdev_raidz_io_start, vdev_raidz_io_done, vdev_raidz_state_change, + NULL, + NULL, VDEV_TYPE_RAIDZ, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; -- cgit v1.2.3