diff options
author | Tom Caputi <[email protected]> | 2019-03-15 17:14:31 -0400 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2019-03-15 14:14:31 -0700 |
commit | ab7615d92c9bf4bdbbc6a675724b763d16f05280 (patch) | |
tree | 4a61231e1be8887feb8cf3ccdae070a016cc45b7 | |
parent | 2bbec1c910a24bf61c6f41e0762e50face4b8907 (diff) |
Multiple DVA Scrubbing Fix
Currently, there is an issue in the sequential scrub code which
prevents self healing from working in some cases. The scrub code
will split up all DVA copies of a bp and issue each of them
separately. The problem is that, since each of the DVAs is no
longer associated with the others, the self healing code doesn't
have the opportunity to repair problems that show up in one of the
DVAs with the data from the others.
This patch fixes this issue by ensuring that all IOs issued by the
sequential scrub code include all DVAs. Initially, only the first
DVA of each is attempted. If an issue arises, the IO is retried
with all available copies, giving the self healing code a chance
to correct the issue.
To test this change, this patch also adds the ability for zinject
to specify individual DVAs to inject read errors into. We then
add a new test case that utilizes this functionality to ensure
scrubs and self-healing reads can handle and transparently fix
issues with individual copies of blocks.
Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Matt Ahrens <[email protected]>
Signed-off-by: Tom Caputi <[email protected]>
Closes #8453
-rw-r--r-- | cmd/zinject/zinject.c | 123 | ||||
-rw-r--r-- | include/sys/zfs_ioctl.h | 4 | ||||
-rw-r--r-- | man/man8/zinject.8 | 14 | ||||
-rw-r--r-- | module/zfs/dsl_scan.c | 195 | ||||
-rw-r--r-- | module/zfs/vdev_mirror.c | 25 | ||||
-rw-r--r-- | module/zfs/zio_inject.c | 45 | ||||
-rw-r--r-- | tests/runfiles/linux.run | 2 | ||||
-rw-r--r-- | tests/zfs-tests/tests/functional/cli_root/zpool_scrub/Makefile.am | 3 | ||||
-rwxr-xr-x | tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_multiple_copies.ksh | 77 | ||||
-rwxr-xr-x | tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_offline_device.ksh | 22 |
10 files changed, 415 insertions, 95 deletions
diff --git a/cmd/zinject/zinject.c b/cmd/zinject/zinject.c index 54740104a..393e50a28 100644 --- a/cmd/zinject/zinject.c +++ b/cmd/zinject/zinject.c @@ -291,8 +291,8 @@ usage(void) "\t\tspecified by the remaining tuple. Each number is in\n" "\t\thexadecimal, and only one block can be specified.\n" "\n" - "\tzinject [-q] <-t type> [-e errno] [-l level] [-r range]\n" - "\t [-a] [-m] [-u] [-f freq] <object>\n" + "\tzinject [-q] <-t type> [-C dvas] [-e errno] [-l level]\n" + "\t\t[-r range] [-a] [-m] [-u] [-f freq] <object>\n" "\n" "\t\tInject an error into the object specified by the '-t' option\n" "\t\tand the object descriptor. The 'object' parameter is\n" @@ -300,7 +300,10 @@ usage(void) "\n" "\t\t-q\tQuiet mode. Only print out the handler number added.\n" "\t\t-e\tInject a specific error. Must be one of 'io',\n" - "\t\t\t'checksum', 'decompress', or decrypt. Default is 'io'.\n" + "\t\t\t'checksum', 'decompress', or 'decrypt'. Default is 'io'.\n" + "\t\t-C\tInject the given error only into specific DVAs. The\n" + "\t\t\tDVAs should be specified as a list of 0-indexed DVAs\n" + "\t\t\tseparated by commas (ex. '0,2').\n" "\t\t-l\tInject error at a particular block level. Default is " "0.\n" "\t\t-m\tAutomatically remount underlying filesystem.\n" @@ -361,17 +364,20 @@ print_data_handler(int id, const char *pool, zinject_record_t *record, return (0); if (*count == 0) { - (void) printf("%3s %-15s %-6s %-6s %-8s %3s %-15s\n", - "ID", "POOL", "OBJSET", "OBJECT", "TYPE", "LVL", "RANGE"); + (void) printf("%3s %-15s %-6s %-6s %-8s %3s %-4s " + "%-15s\n", "ID", "POOL", "OBJSET", "OBJECT", "TYPE", + "LVL", "DVAs", "RANGE"); (void) printf("--- --------------- ------ " - "------ -------- --- ---------------\n"); + "------ -------- --- ---- ---------------\n"); } *count += 1; - (void) printf("%3d %-15s %-6llu %-6llu %-8s %3d ", id, pool, - (u_longlong_t)record->zi_objset, (u_longlong_t)record->zi_object, - type_to_name(record->zi_type), record->zi_level); + (void) printf("%3d %-15s %-6llu %-6llu %-8s %-3d 0x%02x ", + id, pool, (u_longlong_t)record->zi_objset, + (u_longlong_t)record->zi_object, type_to_name(record->zi_type), + record->zi_level, record->zi_dvas); + if (record->zi_start == 0 && record->zi_end == -1ULL) @@ -602,6 +608,7 @@ register_handler(const char *pool, int flags, zinject_record_t *record, (void) printf(" range: [%llu, %llu)\n", (u_longlong_t)record->zi_start, (u_longlong_t)record->zi_end); + (void) printf(" dvas: 0x%x\n", record->zi_dvas); } } @@ -674,6 +681,59 @@ parse_frequency(const char *str, uint32_t *percent) return (0); } +/* + * This function converts a string specifier for DVAs into a bit mask. + * The dva's provided by the user should be 0 indexed and separated by + * a comma. For example: + * "1" -> 0b0010 (0x2) + * "0,1" -> 0b0011 (0x3) + * "0,1,2" -> 0b0111 (0x7) + */ +static int +parse_dvas(const char *str, uint32_t *dvas_out) +{ + const char *c = str; + uint32_t mask = 0; + boolean_t need_delim = B_FALSE; + + /* max string length is 5 ("0,1,2") */ + if (strlen(str) > 5 || strlen(str) == 0) + return (EINVAL); + + while (*c != '\0') { + switch (*c) { + case '0': + case '1': + case '2': + /* check for pipe between DVAs */ + if (need_delim) + return (EINVAL); + + /* check if this DVA has been set already */ + if (mask & (1 << ((*c) - '0'))) + return (EINVAL); + + mask |= (1 << ((*c) - '0')); + need_delim = B_TRUE; + break; + case ',': + need_delim = B_FALSE; + break; + default: + /* check for invalid character */ + return (EINVAL); + } + c++; + } + + /* check for dangling delimiter */ + if (!need_delim) + return (EINVAL); + + *dvas_out = mask; + return (0); +} + int main(int argc, char **argv) { @@ -700,6 +760,7 @@ main(int argc, char **argv) int dur_secs = 0; int ret; int flags = 0; + uint32_t dvas = 0; if ((g_zfs = libzfs_init()) == NULL) { (void) fprintf(stderr, "%s", libzfs_error_init(errno)); @@ -730,7 +791,7 @@ main(int argc, char **argv) } while ((c = getopt(argc, argv, - ":aA:b:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) { + ":aA:b:C:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) { switch (c) { case 'a': flags |= ZINJECT_FLUSH_ARC; @@ -754,6 +815,17 @@ main(int argc, char **argv) case 'c': cancel = optarg; break; + case 'C': + ret = parse_dvas(optarg, &dvas); + if (ret != 0) { + (void) fprintf(stderr, "invalid DVA list '%s': " + "DVAs should be 0 indexed and separated by " + "commas.\n", optarg); + usage(); + libzfs_fini(g_zfs); + return (1); + } + break; case 'd': device = optarg; break; @@ -937,7 +1009,7 @@ main(int argc, char **argv) */ if (raw != NULL || range != NULL || type != TYPE_INVAL || level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED || - record.zi_freq > 0) { + record.zi_freq > 0 || dvas != 0) { (void) fprintf(stderr, "cancel (-c) incompatible with " "any other options\n"); usage(); @@ -972,7 +1044,8 @@ main(int argc, char **argv) * for doing injection, so handle it separately here. */ if (raw != NULL || range != NULL || type != TYPE_INVAL || - level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED) { + level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED || + dvas != 0) { (void) fprintf(stderr, "device (-d) incompatible with " "data error injection\n"); usage(); @@ -1020,7 +1093,7 @@ main(int argc, char **argv) } else if (raw != NULL) { if (range != NULL || type != TYPE_INVAL || level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED || - record.zi_freq > 0) { + record.zi_freq > 0 || dvas != 0) { (void) fprintf(stderr, "raw (-b) format with " "any other options\n"); usage(); @@ -1055,7 +1128,8 @@ main(int argc, char **argv) error = EIO; } else if (record.zi_cmd == ZINJECT_PANIC) { if (raw != NULL || range != NULL || type != TYPE_INVAL || - level != 0 || device != NULL || record.zi_freq > 0) { + level != 0 || device != NULL || record.zi_freq > 0 || + dvas != 0) { (void) fprintf(stderr, "panic (-p) incompatible with " "other options\n"); usage(); @@ -1076,6 +1150,15 @@ main(int argc, char **argv) record.zi_type = atoi(argv[1]); dataset[0] = '\0'; } else if (record.zi_cmd == ZINJECT_IGNORED_WRITES) { + if (raw != NULL || range != NULL || type != TYPE_INVAL || + level != 0 || record.zi_freq > 0 || dvas != 0) { + (void) fprintf(stderr, "hardware failure (-I) " + "incompatible with other options\n"); + usage(); + libzfs_fini(g_zfs); + return (2); + } + if (nowrites == 0) { (void) fprintf(stderr, "-s or -g meaningless " "without -I (ignore writes)\n"); @@ -1136,6 +1219,18 @@ main(int argc, char **argv) return (1); } + if (dvas != 0) { + if (error == EACCES || error == EINVAL) { + (void) fprintf(stderr, "the '-c' option may " + "not be used with logical data errors " + "'decrypt' and 'decompress'\n"); + libzfs_fini(g_zfs); + return (1); + } + + record.zi_dvas = dvas; + } + if (error == EACCES) { if (type != TYPE_DATA) { (void) fprintf(stderr, "decryption errors " diff --git a/include/sys/zfs_ioctl.h b/include/sys/zfs_ioctl.h index f8c65f581..bb5b48c91 100644 --- a/include/sys/zfs_ioctl.h +++ b/include/sys/zfs_ioctl.h @@ -367,7 +367,7 @@ typedef struct zinject_record { uint64_t zi_timer; uint64_t zi_nlanes; uint32_t zi_cmd; - uint32_t zi_pad; + uint32_t zi_dvas; } zinject_record_t; #define ZINJECT_NULL 0x1 @@ -386,6 +386,8 @@ typedef struct zinject_record { #define ZI_PERCENTAGE_MIN 4294UL #define ZI_PERCENTAGE_MAX UINT32_MAX +#define ZI_NO_DVA (-1) + typedef enum zinject_type { ZINJECT_UNINITIALIZED, ZINJECT_DATA_FAULT, diff --git a/man/man8/zinject.8 b/man/man8/zinject.8 index e97db839b..f02e78ca2 100644 --- a/man/man8/zinject.8 +++ b/man/man8/zinject.8 @@ -85,13 +85,13 @@ Simulate a hardware failure that fails to honor a cache flush. .B "zinject \-p \fIfunction\fB \fIpool\fB Panic inside the specified function. .TP -.B "zinject \-t data [\-e \fIdevice_error\fB] [\-f \fIfrequency\fB] [\-l \fIlevel\fB] [\-r \fIrange\fB] [\-amq] \fIpath\fB" +.B "zinject \-t data [\-C \fIdvas\fB] [\-e \fIdevice_error\fB] [\-f \fIfrequency\fB] [\-l \fIlevel\fB] [\-r \fIrange\fB] [\-amq] \fIpath\fB" Force an error into the contents of a file. .TP -.B "zinject \-t dnode [\-e \fIdevice_error\fB] [\-f \fIfrequency\fB] [\-l \fIlevel\fB] [\-amq] \fIpath\fB" +.B "zinject \-t dnode [\-C \fIdvas\fB] [\-e \fIdevice_error\fB] [\-f \fIfrequency\fB] [\-l \fIlevel\fB] [\-amq] \fIpath\fB" Force an error into the metadnode for a file or directory. .TP -.B "zinject \-t \fImos_type\fB [\-e \fIdevice_error\fB] [\-f \fIfrequency\fB] [\-l \fIlevel\fB] [\-r \fIrange\fB] [\-amqu] \fIpool\fB" +.B "zinject \-t \fImos_type\fB [\-C \fIdvas\fB] [\-e \fIdevice_error\fB] [\-f \fIfrequency\fB] [\-l \fIlevel\fB] [\-r \fIrange\fB] [\-amqu] \fIpool\fB" Force an error into the MOS of a pool. .SH OPTIONS .TP @@ -102,6 +102,14 @@ Flush the ARC before injection. Force an error into the pool at this bookmark tuple. Each number is in hexadecimal, and only one block can be specified. .TP +.BI "\-C" " dvas" +Inject the given error only into specific DVAs. The mask should be +specified as a list of 0-indexed DVAs separated by commas (ex. '0,2'). This +option is not applicable to logical data errors such as +.BR "decompress" +and +.BR "decrypt" . +.TP .BI "\-d" " vdev" A vdev specified by path or GUID. .TP diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 3180ce65a..eee122aa6 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -236,24 +236,43 @@ typedef enum { */ typedef struct scan_io { /* fields from blkptr_t */ - uint64_t sio_offset; uint64_t sio_blk_prop; uint64_t sio_phys_birth; uint64_t sio_birth; zio_cksum_t sio_cksum; - uint32_t sio_asize; + uint32_t sio_nr_dvas; /* fields from zio_t */ - int sio_flags; + uint32_t sio_flags; zbookmark_phys_t sio_zb; /* members for queue sorting */ union { - avl_node_t sio_addr_node; /* link into issueing queue */ + avl_node_t sio_addr_node; /* link into issuing queue */ list_node_t sio_list_node; /* link for issuing to disk */ } sio_nodes; + + /* + * There may be up to SPA_DVAS_PER_BP DVAs here from the bp, + * depending on how many were in the original bp. Only the + * first DVA is really used for sorting and issuing purposes. + * The other DVAs (if provided) simply exist so that the zio + * layer can find additional copies to repair from in the + * event of an error. This array must go at the end of the + * struct to allow this for the variable number of elements. + */ + dva_t sio_dva[0]; } scan_io_t; +#define SIO_SET_OFFSET(sio, x) DVA_SET_OFFSET(&(sio)->sio_dva[0], x) +#define SIO_SET_ASIZE(sio, x) DVA_SET_ASIZE(&(sio)->sio_dva[0], x) +#define SIO_GET_OFFSET(sio) DVA_GET_OFFSET(&(sio)->sio_dva[0]) +#define SIO_GET_ASIZE(sio) DVA_GET_ASIZE(&(sio)->sio_dva[0]) +#define SIO_GET_END_OFFSET(sio) \ + (SIO_GET_OFFSET(sio) + SIO_GET_ASIZE(sio)) +#define SIO_GET_MUSED(sio) \ + (sizeof (scan_io_t) + ((sio)->sio_nr_dvas * sizeof (dva_t))) + struct dsl_scan_io_queue { dsl_scan_t *q_scn; /* associated dsl_scan_t */ vdev_t *q_vd; /* top-level vdev that this queue represents */ @@ -262,6 +281,7 @@ struct dsl_scan_io_queue { range_tree_t *q_exts_by_addr; avl_tree_t q_exts_by_size; avl_tree_t q_sios_by_addr; + uint64_t q_sio_memused; /* members for zio rate limiting */ uint64_t q_maxinflight_bytes; @@ -300,7 +320,27 @@ static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, static dsl_scan_io_queue_t *scan_io_queue_create(vdev_t *vd); static void scan_io_queues_destroy(dsl_scan_t *scn); -static kmem_cache_t *sio_cache; +static kmem_cache_t *sio_cache[SPA_DVAS_PER_BP]; + +/* sio->sio_nr_dvas must be set so we know which cache to free from */ +static void +sio_free(scan_io_t *sio) +{ + ASSERT3U(sio->sio_nr_dvas, >, 0); + ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP); + + kmem_cache_free(sio_cache[sio->sio_nr_dvas - 1], sio); +} + +/* It is up to the caller to set sio->sio_nr_dvas for freeing */ +static scan_io_t * +sio_alloc(unsigned short nr_dvas) +{ + ASSERT3U(nr_dvas, >, 0); + ASSERT3U(nr_dvas, <=, SPA_DVAS_PER_BP); + + return (kmem_cache_alloc(sio_cache[nr_dvas - 1], KM_SLEEP)); +} void scan_init(void) @@ -315,14 +355,22 @@ scan_init(void) */ fill_weight = zfs_scan_fill_weight; - sio_cache = kmem_cache_create("sio_cache", - sizeof (scan_io_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + for (int i = 0; i < SPA_DVAS_PER_BP; i++) { + char name[36]; + + (void) sprintf(name, "sio_cache_%d", i); + sio_cache[i] = kmem_cache_create(name, + (sizeof (scan_io_t) + ((i + 1) * sizeof (dva_t))), + 0, NULL, NULL, NULL, NULL, NULL, 0); + } } void scan_fini(void) { - kmem_cache_destroy(sio_cache); + for (int i = 0; i < SPA_DVAS_PER_BP; i++) { + kmem_cache_destroy(sio_cache[i]); + } } static inline boolean_t @@ -339,29 +387,39 @@ dsl_scan_resilvering(dsl_pool_t *dp) } static inline void -sio2bp(const scan_io_t *sio, blkptr_t *bp, uint64_t vdev_id) +sio2bp(const scan_io_t *sio, blkptr_t *bp) { bzero(bp, sizeof (*bp)); - DVA_SET_ASIZE(&bp->blk_dva[0], sio->sio_asize); - DVA_SET_VDEV(&bp->blk_dva[0], vdev_id); - DVA_SET_OFFSET(&bp->blk_dva[0], sio->sio_offset); bp->blk_prop = sio->sio_blk_prop; bp->blk_phys_birth = sio->sio_phys_birth; bp->blk_birth = sio->sio_birth; bp->blk_fill = 1; /* we always only work with data pointers */ bp->blk_cksum = sio->sio_cksum; + + ASSERT3U(sio->sio_nr_dvas, >, 0); + ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP); + + bcopy(sio->sio_dva, bp->blk_dva, sio->sio_nr_dvas * sizeof (dva_t)); } static inline void bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i) { - /* we discard the vdev id, since we can deduce it from the queue */ - sio->sio_offset = DVA_GET_OFFSET(&bp->blk_dva[dva_i]); - sio->sio_asize = DVA_GET_ASIZE(&bp->blk_dva[dva_i]); sio->sio_blk_prop = bp->blk_prop; sio->sio_phys_birth = bp->blk_phys_birth; sio->sio_birth = bp->blk_birth; sio->sio_cksum = bp->blk_cksum; + sio->sio_nr_dvas = BP_GET_NDVAS(bp); + + /* + * Copy the DVAs to the sio. We need all copies of the block so + * that the self healing code can use the alternate copies if the + * first is corrupted. We want the DVA at index dva_i to be first + * in the sio since this is the primary one that we want to issue. + */ + for (int i = 0, j = dva_i; i < sio->sio_nr_dvas; i++, j++) { + sio->sio_dva[i] = bp->blk_dva[j % sio->sio_nr_dvas]; + } } int @@ -1176,11 +1234,9 @@ dsl_scan_should_clear(dsl_scan_t *scn) mutex_enter(&tvd->vdev_scan_io_queue_lock); queue = tvd->vdev_scan_io_queue; if (queue != NULL) { - /* #extents in exts_by_size = # in exts_by_addr */ + /* # extents in exts_by_size = # in exts_by_addr */ mused += avl_numnodes(&queue->q_exts_by_size) * - sizeof (range_seg_t) + - avl_numnodes(&queue->q_sios_by_addr) * - sizeof (scan_io_t); + sizeof (range_seg_t) + queue->q_sio_memused; } mutex_exit(&tvd->vdev_scan_io_queue_lock); } @@ -2693,13 +2749,13 @@ scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list) break; } - sio2bp(sio, &bp, queue->q_vd->vdev_id); - bytes_issued += sio->sio_asize; + sio2bp(sio, &bp); + bytes_issued += SIO_GET_ASIZE(sio); scan_exec_io(scn->scn_dp, &bp, sio->sio_flags, &sio->sio_zb, queue); (void) list_remove_head(io_list); scan_io_queues_update_zio_stats(queue, &bp); - kmem_cache_free(sio_cache, sio); + sio_free(sio); } atomic_add_64(&scn->scn_bytes_pending, -bytes_issued); @@ -2717,7 +2773,7 @@ scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list) static boolean_t scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) { - scan_io_t srch_sio, *sio, *next_sio; + scan_io_t *srch_sio, *sio, *next_sio; avl_index_t idx; uint_t num_sios = 0; int64_t bytes_issued = 0; @@ -2725,24 +2781,30 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) ASSERT(rs != NULL); ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); - srch_sio.sio_offset = rs->rs_start; + srch_sio = sio_alloc(1); + srch_sio->sio_nr_dvas = 1; + SIO_SET_OFFSET(srch_sio, rs->rs_start); /* * The exact start of the extent might not contain any matching zios, * so if that's the case, examine the next one in the tree. */ - sio = avl_find(&queue->q_sios_by_addr, &srch_sio, &idx); + sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx); + sio_free(srch_sio); + if (sio == NULL) sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER); - while (sio != NULL && sio->sio_offset < rs->rs_end && num_sios <= 32) { - ASSERT3U(sio->sio_offset, >=, rs->rs_start); - ASSERT3U(sio->sio_offset + sio->sio_asize, <=, rs->rs_end); + while (sio != NULL && + SIO_GET_OFFSET(sio) < rs->rs_end && num_sios <= 32) { + ASSERT3U(SIO_GET_OFFSET(sio), >=, rs->rs_start); + ASSERT3U(SIO_GET_END_OFFSET(sio), <=, rs->rs_end); next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio); avl_remove(&queue->q_sios_by_addr, sio); + queue->q_sio_memused -= SIO_GET_MUSED(sio); - bytes_issued += sio->sio_asize; + bytes_issued += SIO_GET_ASIZE(sio); num_sios++; list_insert_tail(list, sio); sio = next_sio; @@ -2754,11 +2816,11 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) * in the segment we update it to reflect the work we were able to * complete. Otherwise, we remove it from the range tree entirely. */ - if (sio != NULL && sio->sio_offset < rs->rs_end) { + if (sio != NULL && SIO_GET_OFFSET(sio) < rs->rs_end) { range_tree_adjust_fill(queue->q_exts_by_addr, rs, -bytes_issued); range_tree_resize_segment(queue->q_exts_by_addr, rs, - sio->sio_offset, rs->rs_end - sio->sio_offset); + SIO_GET_OFFSET(sio), rs->rs_end - SIO_GET_OFFSET(sio)); return (B_TRUE); } else { @@ -2862,9 +2924,9 @@ scan_io_queues_run_one(void *arg) first_sio = list_head(&sio_list); last_sio = list_tail(&sio_list); - seg_end = last_sio->sio_offset + last_sio->sio_asize; + seg_end = SIO_GET_END_OFFSET(last_sio); if (seg_start == 0) - seg_start = first_sio->sio_offset; + seg_start = SIO_GET_OFFSET(first_sio); /* * Issuing sios can take a long time so drop the @@ -3567,10 +3629,23 @@ count_block(dsl_scan_t *scn, zfs_all_blkstats_t *zab, const blkptr_t *bp) { int i; - /* update the spa's stats on how many bytes we have issued */ - for (i = 0; i < BP_GET_NDVAS(bp); i++) { + /* + * Update the spa's stats on how many bytes we have issued. + * Sequential scrubs create a zio for each DVA of the bp. Each + * of these will include all DVAs for repair purposes, but the + * zio code will only try the first one unless there is an issue. + * Therefore, we should only count the first DVA for these IOs. + */ + if (scn->scn_is_sorted) { atomic_add_64(&scn->scn_dp->dp_spa->spa_scan_pass_issued, - DVA_GET_ASIZE(&bp->blk_dva[i])); + DVA_GET_ASIZE(&bp->blk_dva[0])); + } else { + spa_t *spa = scn->scn_dp->dp_spa; + + for (i = 0; i < BP_GET_NDVAS(bp); i++) { + atomic_add_64(&spa->spa_scan_pass_issued, + DVA_GET_ASIZE(&bp->blk_dva[i])); + } } /* @@ -3625,7 +3700,7 @@ static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio) { avl_index_t idx; - int64_t asize = sio->sio_asize; + int64_t asize = SIO_GET_ASIZE(sio); dsl_scan_t *scn = queue->q_scn; ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); @@ -3633,11 +3708,12 @@ scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio) if (avl_find(&queue->q_sios_by_addr, sio, &idx) != NULL) { /* block is already scheduled for reading */ atomic_add_64(&scn->scn_bytes_pending, -asize); - kmem_cache_free(sio_cache, sio); + sio_free(sio); return; } avl_insert(&queue->q_sios_by_addr, sio, idx); - range_tree_add(queue->q_exts_by_addr, sio->sio_offset, asize); + queue->q_sio_memused += SIO_GET_MUSED(sio); + range_tree_add(queue->q_exts_by_addr, SIO_GET_OFFSET(sio), asize); } /* @@ -3651,7 +3727,7 @@ scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i, int zio_flags, const zbookmark_phys_t *zb) { dsl_scan_t *scn = queue->q_scn; - scan_io_t *sio = kmem_cache_alloc(sio_cache, KM_SLEEP); + scan_io_t *sio = sio_alloc(BP_GET_NDVAS(bp)); ASSERT0(BP_IS_GANG(bp)); ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); @@ -3665,7 +3741,7 @@ scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i, * get an integer underflow in case the worker processes the * zio before we get to incrementing this counter. */ - atomic_add_64(&scn->scn_bytes_pending, sio->sio_asize); + atomic_add_64(&scn->scn_bytes_pending, SIO_GET_ASIZE(sio)); scan_io_queue_insert_impl(queue, sio); } @@ -3905,11 +3981,7 @@ sio_addr_compare(const void *x, const void *y) { const scan_io_t *a = x, *b = y; - if (a->sio_offset < b->sio_offset) - return (-1); - if (a->sio_offset == b->sio_offset) - return (0); - return (1); + return (AVL_CMP(SIO_GET_OFFSET(a), SIO_GET_OFFSET(b))); } /* IO queues are created on demand when they are needed. */ @@ -3921,6 +3993,7 @@ scan_io_queue_create(vdev_t *vd) q->q_scn = scn; q->q_vd = vd; + q->q_sio_memused = 0; cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL); q->q_exts_by_addr = range_tree_create_impl(&rt_avl_ops, &q->q_exts_by_size, ext_size_compare, zfs_scan_max_ext_gap); @@ -3948,11 +4021,13 @@ dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue) while ((sio = avl_destroy_nodes(&queue->q_sios_by_addr, &cookie)) != NULL) { ASSERT(range_tree_contains(queue->q_exts_by_addr, - sio->sio_offset, sio->sio_asize)); - bytes_dequeued += sio->sio_asize; - kmem_cache_free(sio_cache, sio); + SIO_GET_OFFSET(sio), SIO_GET_ASIZE(sio))); + bytes_dequeued += SIO_GET_ASIZE(sio); + queue->q_sio_memused -= SIO_GET_MUSED(sio); + sio_free(sio); } + ASSERT0(queue->q_sio_memused); atomic_add_64(&scn->scn_bytes_pending, -bytes_dequeued); range_tree_vacate(queue->q_exts_by_addr, NULL, queue); range_tree_destroy(queue->q_exts_by_addr); @@ -4007,7 +4082,7 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i) vdev_t *vdev; kmutex_t *q_lock; dsl_scan_io_queue_t *queue; - scan_io_t srch, *sio; + scan_io_t *srch_sio, *sio; avl_index_t idx; uint64_t start, size; @@ -4022,9 +4097,10 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i) return; } - bp2sio(bp, &srch, dva_i); - start = srch.sio_offset; - size = srch.sio_asize; + srch_sio = sio_alloc(BP_GET_NDVAS(bp)); + bp2sio(bp, srch_sio, dva_i); + start = SIO_GET_OFFSET(srch_sio); + size = SIO_GET_ASIZE(srch_sio); /* * We can find the zio in two states: @@ -4044,15 +4120,18 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i) * be done with issuing the zio's it gathered and will * signal us. */ - sio = avl_find(&queue->q_sios_by_addr, &srch, &idx); + sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx); + sio_free(srch_sio); + if (sio != NULL) { - int64_t asize = sio->sio_asize; + int64_t asize = SIO_GET_ASIZE(sio); blkptr_t tmpbp; /* Got it while it was cold in the queue */ - ASSERT3U(start, ==, sio->sio_offset); + ASSERT3U(start, ==, SIO_GET_OFFSET(sio)); ASSERT3U(size, ==, asize); avl_remove(&queue->q_sios_by_addr, sio); + queue->q_sio_memused -= SIO_GET_MUSED(sio); ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size)); range_tree_remove_fill(queue->q_exts_by_addr, start, size); @@ -4065,10 +4144,10 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i) atomic_add_64(&scn->scn_bytes_pending, -asize); /* count the block as though we issued it */ - sio2bp(sio, &tmpbp, dva_i); + sio2bp(sio, &tmpbp); count_block(scn, dp->dp_blkstats, &tmpbp); - kmem_cache_free(sio_cache, sio); + sio_free(sio); } mutex_exit(q_lock); } diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index a92d956cd..a095e0977 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -254,11 +254,36 @@ vdev_mirror_map_init(zio_t *zio) if (vd == NULL) { dva_t *dva = zio->io_bp->blk_dva; spa_t *spa = zio->io_spa; + dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; dva_t dva_copy[SPA_DVAS_PER_BP]; c = BP_GET_NDVAS(zio->io_bp); /* + * The sequential scrub code sorts and issues all DVAs + * of a bp separately. Each of these IOs includes all + * original DVA copies so that repairs can be performed + * in the event of an error, but we only actually want + * to check the first DVA since the others will be + * checked by their respective sorted IOs. Only if we + * hit an error will we try all DVAs upon retrying. + * + * Note: This check is safe even if the user switches + * from a legacy scrub to a sequential one in the middle + * of processing, since scn_is_sorted isn't updated until + * all outstanding IOs from the previous scrub pass + * complete. + */ + if ((zio->io_flags & ZIO_FLAG_SCRUB) && + !(zio->io_flags & ZIO_FLAG_IO_RETRY) && + dsl_scan_scrubbing(spa->spa_dsl_pool) && + scn->scn_is_sorted) { + c = 1; + } else { + c = BP_GET_NDVAS(zio->io_bp); + } + + /* * If we do not trust the pool config, some DVAs might be * invalid or point to vdevs that do not exist. We skip them. */ diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c index 7a7401ecd..78896d3dc 100644 --- a/module/zfs/zio_inject.c +++ b/module/zfs/zio_inject.c @@ -124,7 +124,7 @@ freq_triggered(uint32_t frequency) * Returns true if the given record matches the I/O in progress. */ static boolean_t -zio_match_handler(const zbookmark_phys_t *zb, uint64_t type, +zio_match_handler(const zbookmark_phys_t *zb, uint64_t type, int dva, zinject_record_t *record, int error) { /* @@ -148,8 +148,10 @@ zio_match_handler(const zbookmark_phys_t *zb, uint64_t type, zb->zb_level == record->zi_level && zb->zb_blkid >= record->zi_start && zb->zb_blkid <= record->zi_end && - error == record->zi_error) + (record->zi_dvas == 0 || (record->zi_dvas & (1ULL << dva))) && + error == record->zi_error) { return (freq_triggered(record->zi_freq)); + } return (B_FALSE); } @@ -199,7 +201,8 @@ zio_handle_decrypt_injection(spa_t *spa, const zbookmark_phys_t *zb, handler->zi_record.zi_cmd != ZINJECT_DECRYPT_FAULT) continue; - if (zio_match_handler(zb, type, &handler->zi_record, error)) { + if (zio_match_handler(zb, type, ZI_NO_DVA, + &handler->zi_record, error)) { ret = error; break; } @@ -210,6 +213,37 @@ zio_handle_decrypt_injection(spa_t *spa, const zbookmark_phys_t *zb, } /* + * If this is a physical I/O for a vdev child determine which DVA it is + * for. We iterate backwards through the DVAs matching on the offset so + * that we end up with ZI_NO_DVA (-1) if we don't find a match. + */ +static int +zio_match_dva(zio_t *zio) +{ + int i = ZI_NO_DVA; + + if (zio->io_bp != NULL && zio->io_vd != NULL && + zio->io_child_type == ZIO_CHILD_VDEV) { + for (i = BP_GET_NDVAS(zio->io_bp) - 1; i >= 0; i--) { + dva_t *dva = &zio->io_bp->blk_dva[i]; + uint64_t off = DVA_GET_OFFSET(dva); + vdev_t *vd = vdev_lookup_top(zio->io_spa, + DVA_GET_VDEV(dva)); + + /* Compensate for vdev label added to leaves */ + if (zio->io_vd->vdev_ops->vdev_op_leaf) + off += VDEV_LABEL_START_SIZE; + + if (zio->io_vd == vd && zio->io_offset == off) + break; + } + } + + return (i); +} + + +/* * Determine if the I/O in question should return failure. Returns the errno * to be returned to the caller. */ @@ -235,15 +269,14 @@ zio_handle_fault_injection(zio_t *zio, int error) for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { - if (zio->io_spa != handler->zi_spa || handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT) continue; - /* If this handler matches, return EIO */ + /* If this handler matches, return the specified error */ if (zio_match_handler(&zio->io_logical->io_bookmark, zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE, - &handler->zi_record, error)) { + zio_match_dva(zio), &handler->zi_record, error)) { ret = error; break; } diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 8a3b4d4ee..93f0c03aa 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -444,7 +444,7 @@ tags = ['functional', 'cli_root', 'zpool_resilver'] tests = ['zpool_scrub_001_neg', 'zpool_scrub_002_pos', 'zpool_scrub_003_pos', 'zpool_scrub_004_pos', 'zpool_scrub_005_pos', 'zpool_scrub_encrypted_unloaded', 'zpool_scrub_print_repairing', - 'zpool_scrub_offline_device'] + 'zpool_scrub_offline_device', 'zpool_scrub_multiple_copies'] tags = ['functional', 'cli_root', 'zpool_scrub'] [tests/functional/cli_root/zpool_set] diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/Makefile.am index 69f4b5d3b..e2dfd9d64 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/Makefile.am @@ -9,7 +9,8 @@ dist_pkgdata_SCRIPTS = \ zpool_scrub_005_pos.ksh \ zpool_scrub_encrypted_unloaded.ksh \ zpool_scrub_offline_device.ksh \ - zpool_scrub_print_repairing.ksh + zpool_scrub_print_repairing.ksh \ + zpool_scrub_multiple_copies.ksh dist_pkgdata_DATA = \ zpool_scrub.cfg diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_multiple_copies.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_multiple_copies.ksh new file mode 100755 index 000000000..2dd33c99c --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_multiple_copies.ksh @@ -0,0 +1,77 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019 Datto, Inc. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Scrubs and self-healing should be able to repair data from additional +# copies that may be stored. +# +# +# STRATEGY: +# 1. Create a dataset with copies=3 +# 2. Write a file to the dataset +# 3. zinject errors into the first and second DVAs of that file +# 4. Scrub and verify the scrub repaired all errors +# 7. Read the file normally to check that self healing also works +# 8. Remove the zinject handler +# 9. Scrub again and confirm 0 bytes were scrubbed +# + +verify_runnable "global" + +function cleanup +{ + destroy_dataset $TESTPOOL/$TESTFS2 + log_must zinject -c all +} +log_onexit cleanup + +log_assert "Scrubs and self healing must work with additional copies" + +log_must zfs create -o copies=3 $TESTPOOL/$TESTFS2 +typeset mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS2) +log_must mkfile 10m $mntpnt/file +log_must zpool sync $TESTPOOL + +log_must zinject -a -t data -C 0,1 -e io $mntpnt/file + +log_must zpool scrub $TESTPOOL +log_must wait_scrubbed $TESTPOOL + +log_must check_pool_status $TESTPOOL "scan" "with 0 errors" +log_must check_pool_status $TESTPOOL "errors" "No known data errors" + +log_must dd if=$mntpnt/file of=/dev/null bs=1M iflag=fullblock +log_must check_pool_status $TESTPOOL "errors" "No known data errors" + +log_must zinject -c all + +log_must zpool scrub $TESTPOOL +log_must wait_scrubbed $TESTPOOL + +zpool status + +log_must check_pool_status $TESTPOOL "errors" "No known data errors" +log_must check_pool_status $TESTPOOL "scan" "with 0 errors" +log_must check_pool_status $TESTPOOL "scan" "repaired 0B" + +log_pass "Scrubs and self healing work with additional copies" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_offline_device.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_offline_device.ksh index fdf315dea..7a07e6433 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_offline_device.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_offline_device.ksh @@ -49,7 +49,7 @@ verify_runnable "global" function cleanup { - poolexists $TESTPOOL && destroy_pool $TESTPOOL + poolexists $TESTPOOL2 && destroy_pool $TESTPOOL2 log_must rm -f $DISK1 $DISK2 $DISK3 $DISK4 } @@ -103,31 +103,31 @@ log_must truncate -s $DEVSIZE $DISK1 log_must truncate -s $DEVSIZE $DISK2 log_must truncate -s $DEVSIZE $DISK3 log_must truncate -s $DEVSIZE $DISK4 -poolexists $TESTPOOL && destroy_pool $TESTPOOL -log_must zpool create -O mountpoint=$TESTDIR $TESTPOOL \ +poolexists $TESTPOOL2 && destroy_pool $TESTPOOL2 +log_must zpool create -O mountpoint=$TESTDIR $TESTPOOL2 \ raidz2 $DISK1 $DISK2 $DISK3 $DISK4 # 2. Offline the first device -zpool_do_sync 'offline' $TESTPOOL $DISK1 +zpool_do_sync 'offline' $TESTPOOL2 $DISK1 # 3. Write to the pool log_must mkfile $FILESIZE "$TESTDIR/data.bin" # 4. Scrub the pool -zpool_scrub_sync $TESTPOOL +zpool_scrub_sync $TESTPOOL2 # 5. Online the first device and offline the second device -zpool_do_sync 'online' $TESTPOOL $DISK1 -zpool_do_sync 'offline' $TESTPOOL $DISK2 -log_must wait_for_resilver_end $TESTPOOL $RESILVER_TIMEOUT +zpool_do_sync 'online' $TESTPOOL2 $DISK1 +zpool_do_sync 'offline' $TESTPOOL2 $DISK2 +log_must wait_for_resilver_end $TESTPOOL2 $RESILVER_TIMEOUT # 6. Scrub the pool again -zpool_scrub_sync $TESTPOOL +zpool_scrub_sync $TESTPOOL2 # 7. Verify data integrity -cksum=$(zpool status $TESTPOOL | awk 'L{print $NF;L=0} /CKSUM$/{L=1}') +cksum=$(zpool status $TESTPOOL2 | awk 'L{print $NF;L=0} /CKSUM$/{L=1}') if [[ $cksum != 0 ]]; then - log_fail "Unexpected CKSUM errors found on $TESTPOOL ($cksum)" + log_fail "Unexpected CKSUM errors found on $TESTPOOL2 ($cksum)" fi log_pass "Scrubbing a pool with offline devices correctly preserves DTLs" |