summaryrefslogtreecommitdiffstats
path: root/module/zfs/vdev.c
diff options
context:
space:
mode:
authorBrian Behlendorf <[email protected]>2010-05-28 13:45:14 -0700
committerBrian Behlendorf <[email protected]>2010-05-28 13:45:14 -0700
commit428870ff734fdaccc342b33fc53cf94724409a46 (patch)
tree164e83c0ceda52a843795ed7cd9e95637d02c177 /module/zfs/vdev.c
parent6119cb885a976e175a6e827894accf657ff1984f (diff)
Update core ZFS code from build 121 to build 141.
Diffstat (limited to 'module/zfs/vdev.c')
-rw-r--r--module/zfs/vdev.c679
1 files changed, 510 insertions, 169 deletions
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index bb5024f98..a61f29b8e 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -40,6 +39,7 @@
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/zil.h>
+#include <sys/dsl_scan.h>
/*
* Virtual device management.
@@ -54,6 +54,7 @@ static vdev_ops_t *vdev_ops_table[] = {
&vdev_disk_ops,
&vdev_file_ops,
&vdev_missing_ops,
+ &vdev_hole_ops,
NULL
};
@@ -281,7 +282,7 @@ vdev_compact_children(vdev_t *pvd)
/*
* Allocate and minimally initialize a vdev_t.
*/
-static vdev_t *
+vdev_t *
vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
{
vdev_t *vd;
@@ -293,21 +294,18 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
spa->spa_root_vdev = vd;
}
- if (guid == 0) {
+ if (guid == 0 && ops != &vdev_hole_ops) {
if (spa->spa_root_vdev == vd) {
/*
* The root vdev's guid will also be the pool guid,
* which must be unique among all pools.
*/
- while (guid == 0 || spa_guid_exists(guid, 0))
- guid = spa_get_random(-1ULL);
+ guid = spa_generate_guid(NULL);
} else {
/*
* Any other vdev's guid must be unique within the pool.
*/
- while (guid == 0 ||
- spa_guid_exists(spa_guid(spa), guid))
- guid = spa_get_random(-1ULL);
+ guid = spa_generate_guid(spa);
}
ASSERT(!spa_guid_exists(spa_guid(spa), guid));
}
@@ -318,6 +316,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
vd->vdev_guid_sum = guid;
vd->vdev_ops = ops;
vd->vdev_state = VDEV_STATE_CLOSED;
+ vd->vdev_ishole = (ops == &vdev_hole_ops);
mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -397,6 +396,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
return (ENOTSUP);
+ if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
+ return (ENOTSUP);
+
/*
* Set the nparity property for RAID-Z vdevs.
*/
@@ -404,10 +406,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
if (ops == &vdev_raidz_ops) {
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
&nparity) == 0) {
- /*
- * Currently, we can only support 3 parity devices.
- */
- if (nparity == 0 || nparity > 3)
+ if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
return (EINVAL);
/*
* Previous versions could only support 1 or 2 parity
@@ -472,15 +471,33 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
/*
+ * Retrieve the vdev creation time.
+ */
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
+ &vd->vdev_crtxg);
+
+ /*
* If we're a top-level vdev, try to load the allocation parameters.
*/
- if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) {
+ if (parent && !parent->vdev_parent &&
+ (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
&vd->vdev_ms_array);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
&vd->vdev_ms_shift);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
&vd->vdev_asize);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
+ &vd->vdev_removing);
+ }
+
+ if (parent && !parent->vdev_parent) {
+ ASSERT(alloctype == VDEV_ALLOC_LOAD ||
+ alloctype == VDEV_ALLOC_ADD ||
+ alloctype == VDEV_ALLOC_SPLIT ||
+ alloctype == VDEV_ALLOC_ROOTPOOL);
+ vd->vdev_mg = metaslab_group_create(islog ?
+ spa_log_class(spa) : spa_normal_class(spa), vd);
}
/*
@@ -510,15 +527,27 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
/*
* When importing a pool, we want to ignore the persistent fault
* state, as the diagnosis made on another system may not be
- * valid in the current context.
+ * valid in the current context. Local vdevs will
+ * remain in the faulted state.
*/
- if (spa->spa_load_state == SPA_LOAD_OPEN) {
+ if (spa_load_state(spa) == SPA_LOAD_OPEN) {
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
&vd->vdev_faulted);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
&vd->vdev_degraded);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
&vd->vdev_removed);
+
+ if (vd->vdev_faulted || vd->vdev_degraded) {
+ char *aux;
+
+ vd->vdev_label_aux =
+ VDEV_AUX_ERR_EXCEEDED;
+ if (nvlist_lookup_string(nv,
+ ZPOOL_CONFIG_AUX_STATE, &aux) == 0 &&
+ strcmp(aux, "external") == 0)
+ vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
+ }
}
}
@@ -544,6 +573,7 @@ vdev_free(vdev_t *vd)
vdev_close(vd);
ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
+ ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
/*
* Free all children.
@@ -557,8 +587,10 @@ vdev_free(vdev_t *vd)
/*
* Discard allocation state.
*/
- if (vd == vd->vdev_top)
+ if (vd->vdev_mg != NULL) {
vdev_metaslab_fini(vd);
+ metaslab_group_destroy(vd->vdev_mg);
+ }
ASSERT3U(vd->vdev_stat.vs_space, ==, 0);
ASSERT3U(vd->vdev_stat.vs_dspace, ==, 0);
@@ -705,6 +737,7 @@ vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
mvd->vdev_min_asize = cvd->vdev_min_asize;
mvd->vdev_ashift = cvd->vdev_ashift;
mvd->vdev_state = cvd->vdev_state;
+ mvd->vdev_crtxg = cvd->vdev_crtxg;
vdev_remove_child(pvd, cvd);
vdev_add_child(pvd, mvd);
@@ -746,6 +779,7 @@ vdev_remove_parent(vdev_t *cvd)
*/
if (mvd->vdev_top == mvd) {
uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid;
+ cvd->vdev_orig_guid = cvd->vdev_guid;
cvd->vdev_guid += guid_delta;
cvd->vdev_guid_sum += guid_delta;
}
@@ -765,16 +799,22 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
{
spa_t *spa = vd->vdev_spa;
objset_t *mos = spa->spa_meta_objset;
- metaslab_class_t *mc;
uint64_t m;
uint64_t oldc = vd->vdev_ms_count;
uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
metaslab_t **mspp;
int error;
- if (vd->vdev_ms_shift == 0) /* not being allocated from yet */
+ ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));
+
+ /*
+ * This vdev is not being allocated from yet or is a hole.
+ */
+ if (vd->vdev_ms_shift == 0)
return (0);
+ ASSERT(!vd->vdev_ishole);
+
/*
* Compute the raidz-deflation ratio. Note, we hard-code
* in 128k (1 << 17) because it is the current "typical" blocksize.
@@ -786,14 +826,6 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
ASSERT(oldc <= newc);
- if (vd->vdev_islog)
- mc = spa->spa_log_class;
- else
- mc = spa->spa_normal_class;
-
- if (vd->vdev_mg == NULL)
- vd->vdev_mg = metaslab_group_create(mc, vd);
-
mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
if (oldc != 0) {
@@ -828,6 +860,20 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
}
+ if (txg == 0)
+ spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
+
+ /*
+ * If the vdev is being removed we don't activate
+ * the metaslabs since we want to ensure that no new
+ * allocations are performed on this device.
+ */
+ if (oldc == 0 && !vd->vdev_removing)
+ metaslab_group_activate(vd->vdev_mg);
+
+ if (txg == 0)
+ spa_config_exit(spa, SCL_ALLOC, FTAG);
+
return (0);
}
@@ -838,6 +884,7 @@ vdev_metaslab_fini(vdev_t *vd)
uint64_t count = vd->vdev_ms_count;
if (vd->vdev_ms != NULL) {
+ metaslab_group_passivate(vd->vdev_mg);
for (m = 0; m < count; m++)
if (vd->vdev_ms[m] != NULL)
metaslab_fini(vd->vdev_ms[m]);
@@ -965,6 +1012,10 @@ vdev_probe(vdev_t *vd, zio_t *zio)
vdev_probe_done, vps,
vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);
+ /*
+ * We can't change the vdev state in this context, so we
+ * kick off an async task to do it on our behalf.
+ */
if (zio != NULL) {
vd->vdev_probe_wanted = B_TRUE;
spa_async_request(spa, SPA_ASYNC_PROBE);
@@ -1007,12 +1058,35 @@ vdev_open_child(void *arg)
vd->vdev_open_thread = NULL;
}
+boolean_t
+vdev_uses_zvols(vdev_t *vd)
+{
+ if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR,
+ strlen(ZVOL_DIR)) == 0)
+ return (B_TRUE);
+ for (int c = 0; c < vd->vdev_children; c++)
+ if (vdev_uses_zvols(vd->vdev_child[c]))
+ return (B_TRUE);
+ return (B_FALSE);
+}
+
void
vdev_open_children(vdev_t *vd)
{
taskq_t *tq;
int children = vd->vdev_children;
+ /*
+ * in order to handle pools on top of zvols, do the opens
+ * in a single thread so that the same thread holds the
+ * spa_namespace_lock
+ */
+ if (vdev_uses_zvols(vd)) {
+ for (int c = 0; c < children; c++)
+ vd->vdev_child[c]->vdev_open_error =
+ vdev_open(vd->vdev_child[c]);
+ return;
+ }
tq = taskq_create("vdev_open", children, minclsyspri,
children, children, TASKQ_PREPOPULATE);
@@ -1046,10 +1120,16 @@ vdev_open(vdev_t *vd)
vd->vdev_cant_write = B_FALSE;
vd->vdev_min_asize = vdev_get_min_asize(vd);
+ /*
+ * If this vdev is not removed, check its fault status. If it's
+ * faulted, bail out of the open.
+ */
if (!vd->vdev_removed && vd->vdev_faulted) {
ASSERT(vd->vdev_children == 0);
+ ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
+ vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
- VDEV_AUX_ERR_EXCEEDED);
+ vd->vdev_label_aux);
return (ENXIO);
} else if (vd->vdev_offline) {
ASSERT(vd->vdev_children == 0);
@@ -1059,6 +1139,11 @@ vdev_open(vdev_t *vd)
error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift);
+ /*
+ * Reset the vdev_reopening flag so that we actually close
+ * the vdev on error.
+ */
+ vd->vdev_reopening = B_FALSE;
if (zio_injection_enabled && error == 0)
error = zio_handle_device_injection(vd, NULL, ENXIO);
@@ -1074,14 +1159,33 @@ vdev_open(vdev_t *vd)
vd->vdev_removed = B_FALSE;
+ /*
+ * Recheck the faulted flag now that we have confirmed that
+ * the vdev is accessible. If we're faulted, bail.
+ */
+ if (vd->vdev_faulted) {
+ ASSERT(vd->vdev_children == 0);
+ ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
+ vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
+ vd->vdev_label_aux);
+ return (ENXIO);
+ }
+
if (vd->vdev_degraded) {
ASSERT(vd->vdev_children == 0);
vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
VDEV_AUX_ERR_EXCEEDED);
} else {
- vd->vdev_state = VDEV_STATE_HEALTHY;
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);
}
+ /*
+ * For hole or missing vdevs we just return success.
+ */
+ if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
+ return (0);
+
for (int c = 0; c < vd->vdev_children; c++) {
if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
@@ -1157,8 +1261,8 @@ vdev_open(vdev_t *vd)
*/
if (vd->vdev_ops->vdev_op_leaf &&
(error = zio_wait(vdev_probe(vd, NULL))) != 0) {
- vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_IO_FAILURE);
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
+ VDEV_AUX_ERR_EXCEEDED);
return (error);
}
@@ -1189,7 +1293,7 @@ vdev_validate(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
nvlist_t *label;
- uint64_t guid, top_guid;
+ uint64_t guid = 0, top_guid;
uint64_t state;
for (int c = 0; c < vd->vdev_children; c++)
@@ -1202,6 +1306,8 @@ vdev_validate(vdev_t *vd)
* overwrite the previous state.
*/
if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
+ uint64_t aux_guid = 0;
+ nvlist_t *nvl;
if ((label = vdev_label_read_config(vd)) == NULL) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
@@ -1209,6 +1315,18 @@ vdev_validate(vdev_t *vd)
return (0);
}
+ /*
+ * Determine if this vdev has been split off into another
+ * pool. If so, then refuse to open it.
+ */
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
+ &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_SPLIT_POOL);
+ nvlist_free(label);
+ return (0);
+ }
+
if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
&guid) != 0 || guid != spa_guid(spa)) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
@@ -1217,6 +1335,11 @@ vdev_validate(vdev_t *vd)
return (0);
}
+ if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
+ != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
+ &aux_guid) != 0)
+ aux_guid = 0;
+
/*
* If this vdev just became a top-level vdev because its
* sibling was detached, it will have adopted the parent's
@@ -1224,12 +1347,16 @@ vdev_validate(vdev_t *vd)
* Fortunately, either version of the label will have the
* same top guid, so if we're a top-level vdev, we can
* safely compare to that instead.
+ *
+ * If we split this vdev off instead, then we also check the
+ * original pool's guid. We don't want to consider the vdev
+ * corrupt if it is partway through a split operation.
*/
if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
&guid) != 0 ||
nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID,
&top_guid) != 0 ||
- (vd->vdev_guid != guid &&
+ ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) &&
(vd->vdev_guid != top_guid || vd != vd->vdev_top))) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
@@ -1252,7 +1379,7 @@ vdev_validate(vdev_t *vd)
* state of the pool.
*/
if (!spa->spa_load_verbatim &&
- spa->spa_load_state == SPA_LOAD_OPEN &&
+ spa_load_state(spa) == SPA_LOAD_OPEN &&
state != POOL_STATE_ACTIVE)
return (EBADF);
@@ -1275,9 +1402,17 @@ void
vdev_close(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
+ vdev_t *pvd = vd->vdev_parent;
ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+ /*
+ * If our parent is reopening, then we are as well, unless we are
+ * going offline.
+ */
+ if (pvd != NULL && pvd->vdev_reopening)
+ vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline);
+
vd->vdev_ops->vdev_op_close(vd);
vdev_cache_purge(vd);
@@ -1297,12 +1432,49 @@ vdev_close(vdev_t *vd)
}
void
+vdev_hold(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(spa_is_root(spa));
+ if (spa->spa_state == POOL_STATE_UNINITIALIZED)
+ return;
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_hold(vd->vdev_child[c]);
+
+ if (vd->vdev_ops->vdev_op_leaf)
+ vd->vdev_ops->vdev_op_hold(vd);
+}
+
+void
+vdev_rele(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(spa_is_root(spa));
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_rele(vd->vdev_child[c]);
+
+ if (vd->vdev_ops->vdev_op_leaf)
+ vd->vdev_ops->vdev_op_rele(vd);
+}
+
+/*
+ * Reopen all interior vdevs and any unopened leaves. We don't actually
+ * reopen leaf vdevs which had previously been opened as they might deadlock
+ * on the spa_config_lock. Instead we only obtain the leaf's physical size.
+ * If the leaf has never been opened then open it, as usual.
+ */
+void
vdev_reopen(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+ /* set the reopening flag unless we're taking the vdev offline */
+ vd->vdev_reopening = !vd->vdev_offline;
vdev_close(vd);
(void) vdev_open(vd);
@@ -1370,6 +1542,7 @@ void
vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
{
ASSERT(vd == vd->vdev_top);
+ ASSERT(!vd->vdev_ishole);
ASSERT(ISP2(flags));
if (flags & VDD_METASLAB)
@@ -1385,7 +1558,7 @@ vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
* DTLs.
*
* A vdev's DTL (dirty time log) is the set of transaction groups for which
- * the vdev has less than perfect replication. There are three kinds of DTL:
+ * the vdev has less than perfect replication. There are four kinds of DTL:
*
* DTL_MISSING: txgs for which the vdev has no valid copies of the data
*
@@ -1479,14 +1652,16 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
vdev_dtl_reassess(vd->vdev_child[c], txg,
scrub_txg, scrub_done);
- if (vd == spa->spa_root_vdev)
+ if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux)
return;
if (vd->vdev_ops->vdev_op_leaf) {
+ dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
+
mutex_enter(&vd->vdev_dtl_lock);
if (scrub_txg != 0 &&
- (spa->spa_scrub_started || spa->spa_scrub_errors == 0)) {
- /* XXX should check scrub_done? */
+ (spa->spa_scrub_started ||
+ (scn && scn->scn_phys.scn_errors == 0))) {
/*
* We completed a scrub up to scrub_txg. If we
* did it without rebooting, then the scrub dtl
@@ -1534,6 +1709,8 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
mutex_enter(&vd->vdev_dtl_lock);
for (int t = 0; t < DTL_TYPES; t++) {
+ /* account for child's outage in parent's missing map */
+ int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
if (t == DTL_SCRUB)
continue; /* leaf vdevs only */
if (t == DTL_PARTIAL)
@@ -1546,7 +1723,7 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
for (int c = 0; c < vd->vdev_children; c++) {
vdev_t *cvd = vd->vdev_child[c];
mutex_enter(&cvd->vdev_dtl_lock);
- space_map_ref_add_map(&reftree, &cvd->vdev_dtl[t], 1);
+ space_map_ref_add_map(&reftree, &cvd->vdev_dtl[s], 1);
mutex_exit(&cvd->vdev_dtl_lock);
}
space_map_ref_generate_map(&reftree, &vd->vdev_dtl[t], minref);
@@ -1569,6 +1746,8 @@ vdev_dtl_load(vdev_t *vd)
if (smo->smo_object == 0)
return (0);
+ ASSERT(!vd->vdev_ishole);
+
if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0)
return (error);
@@ -1596,6 +1775,8 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg)
dmu_buf_t *db;
dmu_tx_t *tx;
+ ASSERT(!vd->vdev_ishole);
+
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
if (vd->vdev_detached) {
@@ -1732,7 +1913,7 @@ vdev_load(vdev_t *vd)
/*
* If this is a top-level vdev, initialize its metaslabs.
*/
- if (vd == vd->vdev_top &&
+ if (vd == vd->vdev_top && !vd->vdev_ishole &&
(vd->vdev_ashift == 0 || vd->vdev_asize == 0 ||
vdev_metaslab_init(vd, 0) != 0))
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
@@ -1789,12 +1970,54 @@ vdev_validate_aux(vdev_t *vd)
}
void
+vdev_remove(vdev_t *vd, uint64_t txg)
+{
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa->spa_meta_objset;
+ dmu_tx_t *tx;
+
+ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
+
+ if (vd->vdev_dtl_smo.smo_object) {
+ ASSERT3U(vd->vdev_dtl_smo.smo_alloc, ==, 0);
+ (void) dmu_object_free(mos, vd->vdev_dtl_smo.smo_object, tx);
+ vd->vdev_dtl_smo.smo_object = 0;
+ }
+
+ if (vd->vdev_ms != NULL) {
+ for (int m = 0; m < vd->vdev_ms_count; m++) {
+ metaslab_t *msp = vd->vdev_ms[m];
+
+ if (msp == NULL || msp->ms_smo.smo_object == 0)
+ continue;
+
+ ASSERT3U(msp->ms_smo.smo_alloc, ==, 0);
+ (void) dmu_object_free(mos, msp->ms_smo.smo_object, tx);
+ msp->ms_smo.smo_object = 0;
+ }
+ }
+
+ if (vd->vdev_ms_array) {
+ (void) dmu_object_free(mos, vd->vdev_ms_array, tx);
+ vd->vdev_ms_array = 0;
+ vd->vdev_ms_shift = 0;
+ }
+ dmu_tx_commit(tx);
+}
+
+void
vdev_sync_done(vdev_t *vd, uint64_t txg)
{
metaslab_t *msp;
+ boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
+
+ ASSERT(!vd->vdev_ishole);
while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
metaslab_sync_done(msp, txg);
+
+ if (reassess)
+ metaslab_sync_reassess(vd->vdev_mg);
}
void
@@ -1805,6 +2028,8 @@ vdev_sync(vdev_t *vd, uint64_t txg)
metaslab_t *msp;
dmu_tx_t *tx;
+ ASSERT(!vd->vdev_ishole);
+
if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) {
ASSERT(vd == vd->vdev_top);
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
@@ -1815,6 +2040,12 @@ vdev_sync(vdev_t *vd, uint64_t txg)
dmu_tx_commit(tx);
}
+ /*
+ * Remove the metadata associated with this vdev once it's empty.
+ */
+ if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing)
+ vdev_remove(vd, txg);
+
while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
metaslab_sync(msp, txg);
(void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
@@ -1837,11 +2068,11 @@ vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
* not be opened, and no I/O is attempted.
*/
int
-vdev_fault(spa_t *spa, uint64_t guid)
+vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
{
vdev_t *vd;
- spa_vdev_state_enter(spa);
+ spa_vdev_state_enter(spa, SCL_NONE);
if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
return (spa_vdev_state_exit(spa, NULL, ENODEV));
@@ -1850,18 +2081,25 @@ vdev_fault(spa_t *spa, uint64_t guid)
return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
/*
+ * We don't directly use the aux state here, but if we do a
+ * vdev_reopen(), we need this value to be present to remember why we
+ * were faulted.
+ */
+ vd->vdev_label_aux = aux;
+
+ /*
* Faulted state takes precedence over degraded.
*/
+ vd->vdev_delayed_close = B_FALSE;
vd->vdev_faulted = 1ULL;
vd->vdev_degraded = 0ULL;
- vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, VDEV_AUX_ERR_EXCEEDED);
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux);
/*
- * If marking the vdev as faulted cause the top-level vdev to become
- * unavailable, then back off and simply mark the vdev as degraded
- * instead.
+ * If this device has the only valid copy of the data, then
+ * back off and simply mark the vdev as degraded instead.
*/
- if (vdev_is_dead(vd->vdev_top) && vd->vdev_aux == NULL) {
+ if (!vd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) {
vd->vdev_degraded = 1ULL;
vd->vdev_faulted = 0ULL;
@@ -1871,10 +2109,8 @@ vdev_fault(spa_t *spa, uint64_t guid)
*/
vdev_reopen(vd);
- if (vdev_readable(vd)) {
- vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
- VDEV_AUX_ERR_EXCEEDED);
- }
+ if (vdev_readable(vd))
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux);
}
return (spa_vdev_state_exit(spa, vd, 0));
@@ -1886,11 +2122,11 @@ vdev_fault(spa_t *spa, uint64_t guid)
* as I/O is concerned.
*/
int
-vdev_degrade(spa_t *spa, uint64_t guid)
+vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux)
{
vdev_t *vd;
- spa_vdev_state_enter(spa);
+ spa_vdev_state_enter(spa, SCL_NONE);
if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
return (spa_vdev_state_exit(spa, NULL, ENODEV));
@@ -1907,7 +2143,7 @@ vdev_degrade(spa_t *spa, uint64_t guid)
vd->vdev_degraded = 1ULL;
if (!vdev_is_dead(vd))
vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
- VDEV_AUX_ERR_EXCEEDED);
+ aux);
return (spa_vdev_state_exit(spa, vd, 0));
}
@@ -1923,7 +2159,7 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
{
vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;
- spa_vdev_state_enter(spa);
+ spa_vdev_state_enter(spa, SCL_NONE);
if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
return (spa_vdev_state_exit(spa, NULL, ENODEV));
@@ -1969,13 +2205,16 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
return (spa_vdev_state_exit(spa, vd, 0));
}
-int
-vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
+static int
+vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
{
vdev_t *vd, *tvd;
- int error;
+ int error = 0;
+ uint64_t generation;
+ metaslab_group_t *mg;
- spa_vdev_state_enter(spa);
+top:
+ spa_vdev_state_enter(spa, SCL_ALLOC);
if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
return (spa_vdev_state_exit(spa, NULL, ENODEV));
@@ -1984,6 +2223,8 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
tvd = vd->vdev_top;
+ mg = tvd->vdev_mg;
+ generation = spa->spa_config_generation + 1;
/*
* If the device isn't already offline, try to offline it.
@@ -1999,6 +2240,37 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
return (spa_vdev_state_exit(spa, NULL, EBUSY));
/*
+ * If the top-level is a slog and it has had allocations
+ * then proceed. We check that the vdev's metaslab group
+ * is not NULL since it's possible that we may have just
+ * added this vdev but not yet initialized its metaslabs.
+ */
+ if (tvd->vdev_islog && mg != NULL) {
+ /*
+ * Prevent any future allocations.
+ */
+ metaslab_group_passivate(mg);
+ (void) spa_vdev_state_exit(spa, vd, 0);
+
+ error = spa_offline_log(spa);
+
+ spa_vdev_state_enter(spa, SCL_ALLOC);
+
+ /*
+ * Check to see if the config has changed.
+ */
+ if (error || generation != spa->spa_config_generation) {
+ metaslab_group_activate(mg);
+ if (error)
+ return (spa_vdev_state_exit(spa,
+ vd, error));
+ (void) spa_vdev_state_exit(spa, vd, 0);
+ goto top;
+ }
+ ASSERT3U(tvd->vdev_stat.vs_alloc, ==, 0);
+ }
+
+ /*
* Offline this device and reopen its top-level vdev.
* If the top-level vdev is a log device then just offline
* it. Otherwise, if this action results in the top-level
@@ -2013,28 +2285,30 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
vdev_reopen(tvd);
return (spa_vdev_state_exit(spa, NULL, EBUSY));
}
+
+ /*
+ * Add the device back into the metaslab rotor so that
+ * once we online the device it's open for business.
+ */
+ if (tvd->vdev_islog && mg != NULL)
+ metaslab_group_activate(mg);
}
vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);
- if (!tvd->vdev_islog || !vdev_is_dead(tvd))
- return (spa_vdev_state_exit(spa, vd, 0));
+ return (spa_vdev_state_exit(spa, vd, 0));
+}
- (void) spa_vdev_state_exit(spa, vd, 0);
+int
+vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
+{
+ int error;
- error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
- NULL, DS_FIND_CHILDREN);
- if (error) {
- (void) vdev_online(spa, guid, 0, NULL);
- return (error);
- }
- /*
- * If we successfully offlined the log device then we need to
- * sync out the current txg so that the "stubby" block can be
- * removed by zil_sync().
- */
- txg_wait_synced(spa->spa_dsl_pool, 0);
- return (0);
+ mutex_enter(&spa->spa_vdev_top_lock);
+ error = vdev_offline_locked(spa, guid, flags);
+ mutex_exit(&spa->spa_vdev_top_lock);
+
+ return (error);
}
/*
@@ -2068,12 +2342,21 @@ vdev_clear(spa_t *spa, vdev_t *vd)
if (vd->vdev_faulted || vd->vdev_degraded ||
!vdev_readable(vd) || !vdev_writeable(vd)) {
+ /*
+ * When reopening in reponse to a clear event, it may be due to
+ * a fmadm repair request. In this case, if the device is
+ * still broken, we want to still post the ereport again.
+ */
+ vd->vdev_forcefault = B_TRUE;
+
vd->vdev_faulted = vd->vdev_degraded = 0;
vd->vdev_cant_read = B_FALSE;
vd->vdev_cant_write = B_FALSE;
vdev_reopen(vd);
+ vd->vdev_forcefault = B_FALSE;
+
if (vd != rvd)
vdev_state_dirty(vd->vdev_top);
@@ -2082,12 +2365,30 @@ vdev_clear(spa_t *spa, vdev_t *vd)
spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR);
}
+
+ /*
+ * When clearing a FMA-diagnosed fault, we always want to
+ * unspare the device, as we assume that the original spare was
+ * done in response to the FMA fault.
+ */
+ if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
+ vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
+ vd->vdev_parent->vdev_child[0] == vd)
+ vd->vdev_unspare = B_TRUE;
}
boolean_t
vdev_is_dead(vdev_t *vd)
{
- return (vd->vdev_state < VDEV_STATE_DEGRADED);
+ /*
+ * Holes and missing devices are always considered "dead".
+ * This simplifies the code since we don't have to check for
+ * these types of devices in the various code paths.
+ * Instead we rely on the fact that we skip over dead devices
+ * before issuing I/O to them.
+ */
+ return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole ||
+ vd->vdev_ops == &vdev_missing_ops);
}
boolean_t
@@ -2116,7 +2417,7 @@ vdev_allocatable(vdev_t *vd)
* we're asking two separate questions about it.
*/
return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
- !vd->vdev_cant_write);
+ !vd->vdev_cant_write && !vd->vdev_ishole);
}
boolean_t
@@ -2146,7 +2447,6 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
mutex_enter(&vd->vdev_stat_lock);
bcopy(&vd->vdev_stat, vs, sizeof (*vs));
- vs->vs_scrub_errors = vd->vdev_spa->spa_scrub_errors;
vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
vs->vs_state = vd->vdev_state;
vs->vs_rsize = vdev_get_min_asize(vd);
@@ -2168,7 +2468,7 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
vs->vs_ops[t] += cvs->vs_ops[t];
vs->vs_bytes[t] += cvs->vs_bytes[t];
}
- vs->vs_scrub_examined += cvs->vs_scrub_examined;
+ cvs->vs_scan_removing = cvd->vdev_removing;
mutex_exit(&vd->vdev_stat_lock);
}
}
@@ -2185,6 +2485,19 @@ vdev_clear_stats(vdev_t *vd)
}
void
+vdev_scan_stat_init(vdev_t *vd)
+{
+ vdev_stat_t *vs = &vd->vdev_stat;
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_scan_stat_init(vd->vdev_child[c]);
+
+ mutex_enter(&vd->vdev_stat_lock);
+ vs->vs_scan_processed = 0;
+ mutex_exit(&vd->vdev_stat_lock);
+}
+
+void
vdev_stat_update(zio_t *zio, uint64_t psize)
{
spa_t *spa = zio->io_spa;
@@ -2228,8 +2541,17 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
mutex_enter(&vd->vdev_stat_lock);
if (flags & ZIO_FLAG_IO_REPAIR) {
- if (flags & ZIO_FLAG_SCRUB_THREAD)
- vs->vs_scrub_repaired += psize;
+ if (flags & ZIO_FLAG_SCRUB_THREAD) {
+ dsl_scan_phys_t *scn_phys =
+ &spa->spa_dsl_pool->dp_scan->scn_phys;
+ uint64_t *processed = &scn_phys->scn_processed;
+
+ /* XXX cleanup? */
+ if (vd->vdev_ops->vdev_op_leaf)
+ atomic_add_64(processed, psize);
+ vs->vs_scan_processed += psize;
+ }
+
if (flags & ZIO_FLAG_SELF_HEAL)
vs->vs_self_healed += psize;
}
@@ -2254,6 +2576,14 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
!(zio->io_flags & ZIO_FLAG_IO_RETRY))
return;
+ /*
+ * Intent logs writes won't propagate their error to the root
+ * I/O so don't mark these types of failures as pool-level
+ * errors.
+ */
+ if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
+ return;
+
mutex_enter(&vd->vdev_stat_lock);
if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) {
if (zio->io_error == ECKSUM)
@@ -2267,14 +2597,17 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
if (type == ZIO_TYPE_WRITE && txg != 0 &&
(!(flags & ZIO_FLAG_IO_REPAIR) ||
- (flags & ZIO_FLAG_SCRUB_THREAD))) {
+ (flags & ZIO_FLAG_SCRUB_THREAD) ||
+ spa->spa_claiming)) {
/*
- * This is either a normal write (not a repair), or it's a
- * repair induced by the scrub thread. In the normal case,
- * we commit the DTL change in the same txg as the block
- * was born. In the scrub-induced repair case, we know that
- * scrubs run in first-pass syncing context, so we commit
- * the DTL change in spa->spa_syncing_txg.
+ * This is either a normal write (not a repair), or it's
+ * a repair induced by the scrub thread, or it's a repair
+ * made by zil_claim() during spa_load() in the first txg.
+ * In the normal case, we commit the DTL change in the same
+ * txg as the block was born. In the scrub-induced repair
+ * case, we know that scrubs run in first-pass syncing context,
+ * so we commit the DTL change in spa_syncing_txg(spa).
+ * In the zil_claim() case, we commit in spa_first_txg(spa).
*
* We currently do not make DTL entries for failed spontaneous
* self-healing writes triggered by normal (non-scrubbing)
@@ -2287,9 +2620,12 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
ASSERT(flags & ZIO_FLAG_IO_REPAIR);
ASSERT(spa_sync_pass(spa) == 1);
vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
- commit_txg = spa->spa_syncing_txg;
+ commit_txg = spa_syncing_txg(spa);
+ } else if (spa->spa_claiming) {
+ ASSERT(flags & ZIO_FLAG_IO_REPAIR);
+ commit_txg = spa_first_txg(spa);
}
- ASSERT(commit_txg >= spa->spa_syncing_txg);
+ ASSERT(commit_txg >= spa_syncing_txg(spa));
if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
return;
for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
@@ -2301,45 +2637,19 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
}
}
-void
-vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
-{
- vdev_stat_t *vs = &vd->vdev_stat;
-
- for (int c = 0; c < vd->vdev_children; c++)
- vdev_scrub_stat_update(vd->vdev_child[c], type, complete);
-
- mutex_enter(&vd->vdev_stat_lock);
-
- if (type == POOL_SCRUB_NONE) {
- /*
- * Update completion and end time. Leave everything else alone
- * so we can report what happened during the previous scrub.
- */
- vs->vs_scrub_complete = complete;
- vs->vs_scrub_end = gethrestime_sec();
- } else {
- vs->vs_scrub_type = type;
- vs->vs_scrub_complete = 0;
- vs->vs_scrub_examined = 0;
- vs->vs_scrub_repaired = 0;
- vs->vs_scrub_start = gethrestime_sec();
- vs->vs_scrub_end = 0;
- }
-
- mutex_exit(&vd->vdev_stat_lock);
-}
-
/*
- * Update the in-core space usage stats for this vdev and the root vdev.
+ * Update the in-core space usage stats for this vdev, its metaslab class,
+ * and the root vdev.
*/
void
-vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta,
- boolean_t update_root)
+vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
+ int64_t space_delta)
{
int64_t dspace_delta = space_delta;
spa_t *spa = vd->vdev_spa;
vdev_t *rvd = spa->spa_root_vdev;
+ metaslab_group_t *mg = vd->vdev_mg;
+ metaslab_class_t *mc = mg ? mg->mg_class : NULL;
ASSERT(vd == vd->vdev_top);
@@ -2355,28 +2665,26 @@ vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta,
vd->vdev_deflate_ratio;
mutex_enter(&vd->vdev_stat_lock);
- vd->vdev_stat.vs_space += space_delta;
vd->vdev_stat.vs_alloc += alloc_delta;
+ vd->vdev_stat.vs_space += space_delta;
vd->vdev_stat.vs_dspace += dspace_delta;
mutex_exit(&vd->vdev_stat_lock);
- if (update_root) {
- ASSERT(rvd == vd->vdev_parent);
- ASSERT(vd->vdev_ms_count != 0);
-
- /*
- * Don't count non-normal (e.g. intent log) space as part of
- * the pool's capacity.
- */
- if (vd->vdev_mg->mg_class != spa->spa_normal_class)
- return;
-
+ if (mc == spa_normal_class(spa)) {
mutex_enter(&rvd->vdev_stat_lock);
- rvd->vdev_stat.vs_space += space_delta;
rvd->vdev_stat.vs_alloc += alloc_delta;
+ rvd->vdev_stat.vs_space += space_delta;
rvd->vdev_stat.vs_dspace += dspace_delta;
mutex_exit(&rvd->vdev_stat_lock);
}
+
+ if (mc != NULL) {
+ ASSERT(rvd == vd->vdev_parent);
+ ASSERT(vd->vdev_ms_count != 0);
+
+ metaslab_class_space_update(mc,
+ alloc_delta, defer_delta, space_delta, dspace_delta);
+ }
}
/*
@@ -2428,7 +2736,7 @@ vdev_config_dirty(vdev_t *vd)
* sketchy, but it will work.
*/
nvlist_free(aux[c]);
- aux[c] = vdev_config_generate(spa, vd, B_TRUE, B_FALSE, B_TRUE);
+ aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0);
return;
}
@@ -2449,7 +2757,8 @@ vdev_config_dirty(vdev_t *vd)
} else {
ASSERT(vd == vd->vdev_top);
- if (!list_link_active(&vd->vdev_config_dirty_node))
+ if (!list_link_active(&vd->vdev_config_dirty_node) &&
+ !vd->vdev_ishole)
list_insert_head(&spa->spa_config_dirty_list, vd);
}
}
@@ -2490,7 +2799,7 @@ vdev_state_dirty(vdev_t *vd)
(dsl_pool_sync_context(spa_get_dsl(spa)) &&
spa_config_held(spa, SCL_STATE, RW_READER)));
- if (!list_link_active(&vd->vdev_state_dirty_node))
+ if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole)
list_insert_head(&spa->spa_state_dirty_list, vd);
}
@@ -2523,6 +2832,12 @@ vdev_propagate_state(vdev_t *vd)
for (int c = 0; c < vd->vdev_children; c++) {
child = vd->vdev_child[c];
+ /*
+ * Don't factor holes into the decision.
+ */
+ if (child->vdev_ishole)
+ continue;
+
if (!vdev_readable(child) ||
(!vdev_writeable(child) && spa_writeable(spa))) {
/*
@@ -2586,15 +2901,31 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
/*
* If we are setting the vdev state to anything but an open state, then
- * always close the underlying device. Otherwise, we keep accessible
- * but invalid devices open forever. We don't call vdev_close() itself,
- * because that implies some extra checks (offline, etc) that we don't
- * want here. This is limited to leaf devices, because otherwise
- * closing the device will affect other children.
+ * always close the underlying device unless the device has requested
+ * a delayed close (i.e. we're about to remove or fault the device).
+ * Otherwise, we keep accessible but invalid devices open forever.
+ * We don't call vdev_close() itself, because that implies some extra
+ * checks (offline, etc) that we don't want here. This is limited to
+ * leaf devices, because otherwise closing the device will affect other
+ * children.
*/
- if (vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf)
+ if (!vd->vdev_delayed_close && vdev_is_dead(vd) &&
+ vd->vdev_ops->vdev_op_leaf)
vd->vdev_ops->vdev_op_close(vd);
+ /*
+ * If we have brought this vdev back into service, we need
+ * to notify fmd so that it can gracefully repair any outstanding
+ * cases due to a missing device. We do this in all cases, even those
+ * that probably don't correlate to a repaired fault. This is sure to
+ * catch all cases, and we let the zfs-retire agent sort it out. If
+ * this is a transient state it's OK, as the retire agent will
+ * double-check the state of the vdev before repairing it.
+ */
+ if (state == VDEV_STATE_HEALTHY && vd->vdev_ops->vdev_op_leaf &&
+ vd->vdev_prevstate != state)
+ zfs_post_state_change(spa, vd);
+
if (vd->vdev_removed &&
state == VDEV_STATE_CANT_OPEN &&
(aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
@@ -2610,11 +2941,6 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
vd->vdev_state = VDEV_STATE_REMOVED;
vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
} else if (state == VDEV_STATE_REMOVED) {
- /*
- * Indicate to the ZFS DE that this device has been removed, and
- * any recent errors should be ignored.
- */
- zfs_post_remove(spa, vd);
vd->vdev_removed = B_TRUE;
} else if (state == VDEV_STATE_CANT_OPEN) {
/*
@@ -2623,7 +2949,7 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
* begin with. Failure to open such a device is not considered
* an error.
*/
- if (spa->spa_load_state == SPA_LOAD_IMPORT &&
+ if (spa_load_state(spa) == SPA_LOAD_IMPORT &&
vd->vdev_ops->vdev_op_leaf)
vd->vdev_not_present = 1;
@@ -2666,9 +2992,6 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
case VDEV_AUX_BAD_LABEL:
class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
break;
- case VDEV_AUX_IO_FAILURE:
- class = FM_EREPORT_ZFS_IO_FAILURE;
- break;
default:
class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
}
@@ -2716,32 +3039,31 @@ vdev_is_bootable(vdev_t *vd)
return (B_TRUE);
}
+/*
+ * Load the state from the original vdev tree (ovd) which
+ * we've retrieved from the MOS config object. If the original
+ * vdev was offline then we transfer that state to the device
+ * in the current vdev tree (nvd).
+ */
void
-vdev_load_log_state(vdev_t *vd, nvlist_t *nv)
+vdev_load_log_state(vdev_t *nvd, vdev_t *ovd)
{
- uint_t children;
- nvlist_t **child;
- uint64_t val;
- spa_t *spa = vd->vdev_spa;
+ spa_t *spa = nvd->vdev_spa;
- if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
- &child, &children) == 0) {
- for (int c = 0; c < children; c++)
- vdev_load_log_state(vd->vdev_child[c], child[c]);
- }
+ ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+ ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid);
- if (vd->vdev_ops->vdev_op_leaf && nvlist_lookup_uint64(nv,
- ZPOOL_CONFIG_OFFLINE, &val) == 0 && val) {
+ for (int c = 0; c < nvd->vdev_children; c++)
+ vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]);
+ if (nvd->vdev_ops->vdev_op_leaf && ovd->vdev_offline) {
/*
* It would be nice to call vdev_offline()
* directly but the pool isn't fully loaded and
* the txg threads have not been started yet.
*/
- spa_config_enter(spa, SCL_STATE_ALL, FTAG, RW_WRITER);
- vd->vdev_offline = val;
- vdev_reopen(vd->vdev_top);
- spa_config_exit(spa, SCL_STATE_ALL, FTAG);
+ nvd->vdev_offline = ovd->vdev_offline;
+ vdev_reopen(nvd->vdev_top);
}
}
@@ -2759,3 +3081,22 @@ vdev_expand(vdev_t *vd, uint64_t txg)
vdev_config_dirty(vd);
}
}
+
+/*
+ * Split a vdev.
+ */
+void
+vdev_split(vdev_t *vd)
+{
+ vdev_t *cvd, *pvd = vd->vdev_parent;
+
+ vdev_remove_child(pvd, vd);
+ vdev_compact_children(pvd);
+
+ cvd = pvd->vdev_child[0];
+ if (pvd->vdev_children == 1) {
+ vdev_remove_parent(cvd);
+ cvd->vdev_splitting = B_TRUE;
+ }
+ vdev_propagate_state(cvd);
+}