Diffstat (limited to 'module/zfs/vdev.c')
-rw-r--r--  module/zfs/vdev.c | 353
1 file changed, 247 insertions(+), 106 deletions(-)
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index e41e79ab8..38f36e52f 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -40,6 +40,7 @@
#include <sys/dsl_dir.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_rebuild.h>
+#include <sys/vdev_draid.h>
#include <sys/uberblock_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
@@ -51,6 +52,7 @@
#include <sys/arc.h>
#include <sys/zil.h>
#include <sys/dsl_scan.h>
+#include <sys/vdev_raidz.h>
#include <sys/abd.h>
#include <sys/vdev_initialize.h>
#include <sys/vdev_trim.h>
@@ -193,6 +195,8 @@ vdev_dbgmsg_print_tree(vdev_t *vd, int indent)
static vdev_ops_t *vdev_ops_table[] = {
&vdev_root_ops,
&vdev_raidz_ops,
+ &vdev_draid_ops,
+ &vdev_draid_spare_ops,
&vdev_mirror_ops,
&vdev_replacing_ops,
&vdev_spare_ops,
@@ -221,10 +225,11 @@ vdev_getops(const char *type)
/* ARGSUSED */
void
-vdev_default_xlate(vdev_t *vd, const range_seg64_t *in, range_seg64_t *res)
+vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
+ range_seg64_t *physical_rs, range_seg64_t *remain_rs)
{
- res->rs_start = in->rs_start;
- res->rs_end = in->rs_end;
+ physical_rs->rs_start = logical_rs->rs_start;
+ physical_rs->rs_end = logical_rs->rs_end;
}
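
The new remain_rs out parameter is what allows a single logical range to translate to multiple discontiguous physical ranges. The default op above is an identity mapping, but a sketch of a hypothetical non-default implementation shows the intended contract: return the first contiguous chunk in physical_rs and leave any untranslated tail in remain_rs for the caller to iterate on (all names below are illustrative, and the physical offsets are left equal to the logical ones to keep the sketch short):

/*
 * Sketch: vdev_op_xlate for a hypothetical striped op.  Translate
 * up to the next stripe boundary into physical_rs; whatever logical
 * space remains past that boundary goes into remain_rs.  When the
 * range fits in one stripe, remain_rs comes back empty
 * (rs_start == rs_end) and iteration stops.
 */
static void
my_stripe_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
    range_seg64_t *physical_rs, range_seg64_t *remain_rs)
{
	uint64_t stripe = 1ULL << 20;	/* hypothetical 1M stripe width */
	uint64_t end = MIN(logical_rs->rs_end,
	    P2ROUNDUP(logical_rs->rs_start + 1, stripe));

	physical_rs->rs_start = logical_rs->rs_start;
	physical_rs->rs_end = end;

	remain_rs->rs_start = end;
	remain_rs->rs_end = MAX(logical_rs->rs_end, end);
}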
/*
@@ -264,6 +269,12 @@ vdev_default_asize(vdev_t *vd, uint64_t psize)
return (asize);
}
+uint64_t
+vdev_default_min_asize(vdev_t *vd)
+{
+ return (vd->vdev_min_asize);
+}
+
/*
* Get the minimum allocatable size. We define the allocatable size as
* the vdev's asize rounded to the nearest metaslab. This allows us to
@@ -289,15 +300,7 @@ vdev_get_min_asize(vdev_t *vd)
if (vd == vd->vdev_top)
return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));
- /*
- * The allocatable space for a raidz vdev is N * sizeof(smallest child),
- * so each child must provide at least 1/Nth of its asize.
- */
- if (pvd->vdev_ops == &vdev_raidz_ops)
- return ((pvd->vdev_min_asize + pvd->vdev_children - 1) /
- pvd->vdev_children);
-
- return (pvd->vdev_min_asize);
+ return (pvd->vdev_ops->vdev_op_min_asize(pvd));
}
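
With the raidz special case gone, every op supplies its own vdev_op_min_asize callback; ops without special requirements can simply point it at vdev_default_min_asize() above. A raidz-style callback that preserves the removed 1/Nth calculation might look like the following sketch (the function would live in the per-op file, e.g. vdev_raidz.c; shown here only as an illustration):

/*
 * Sketch: per-child minimum asize for a raidz-style vdev.  The
 * allocatable space is N * sizeof(smallest child), so each child
 * must provide at least 1/Nth of the parent's minimum asize,
 * rounded up, mirroring the logic removed from vdev_get_min_asize().
 */
static uint64_t
vdev_raidz_min_asize(vdev_t *vd)
{
	return ((vd->vdev_min_asize + vd->vdev_children - 1) /
	    vd->vdev_children);
}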
void
@@ -309,6 +312,48 @@ vdev_set_min_asize(vdev_t *vd)
vdev_set_min_asize(vd->vdev_child[c]);
}
+/*
+ * Get the minimum allocation size for the top-level vdev.
+ */
+uint64_t
+vdev_get_min_alloc(vdev_t *vd)
+{
+ uint64_t min_alloc = 1ULL << vd->vdev_ashift;
+
+ if (vd->vdev_ops->vdev_op_min_alloc != NULL)
+ min_alloc = vd->vdev_ops->vdev_op_min_alloc(vd);
+
+ return (min_alloc);
+}
+
+/*
+ * Get the parity level for a top-level vdev.
+ */
+uint64_t
+vdev_get_nparity(vdev_t *vd)
+{
+ uint64_t nparity = 0;
+
+ if (vd->vdev_ops->vdev_op_nparity != NULL)
+ nparity = vd->vdev_ops->vdev_op_nparity(vd);
+
+ return (nparity);
+}
+
+/*
+ * Get the number of data disks for a top-level vdev.
+ */
+uint64_t
+vdev_get_ndisks(vdev_t *vd)
+{
+ uint64_t ndisks = 1;
+
+ if (vd->vdev_ops->vdev_op_ndisks != NULL)
+ ndisks = vd->vdev_ops->vdev_op_ndisks(vd);
+
+ return (ndisks);
+}
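
Together these accessors let generic code query a top-level vdev's geometry without reaching into raidz- or dRAID-private state. A hedged usage sketch (vdev_data_width() is a hypothetical helper, and it ignores dRAID distributed spares):

/*
 * Sketch: derive the data width of a top-level vdev from the new
 * accessors.  A 6-wide raidz2 yields ndisks == 6 and nparity == 2,
 * so a width of 4; a plain disk or file vdev falls back to the
 * defaults (ndisks == 1, nparity == 0) for a width of 1.
 */
static uint64_t
vdev_data_width(vdev_t *tvd)
{
	ASSERT3P(tvd, ==, tvd->vdev_top);
	return (vdev_get_ndisks(tvd) - vdev_get_nparity(tvd));
}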
+
vdev_t *
vdev_lookup_top(spa_t *spa, uint64_t vdev)
{
@@ -551,6 +596,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
list_link_init(&vd->vdev_initialize_node);
list_link_init(&vd->vdev_leaf_node);
list_link_init(&vd->vdev_trim_node);
+
mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL);
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -569,9 +615,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL);
mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&vd->vdev_rebuild_io_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL);
- cv_init(&vd->vdev_rebuild_io_cv, NULL, CV_DEFAULT, NULL);
for (int t = 0; t < DTL_TYPES; t++) {
vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0,
@@ -600,7 +644,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
{
vdev_ops_t *ops;
char *type;
- uint64_t guid = 0, islog, nparity;
+ uint64_t guid = 0, islog;
vdev_t *vd;
vdev_indirect_config_t *vic;
char *tmp = NULL;
@@ -657,48 +701,13 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
return (SET_ERROR(ENOTSUP));
- /*
- * Set the nparity property for RAID-Z vdevs.
- */
- nparity = -1ULL;
- if (ops == &vdev_raidz_ops) {
- if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
- &nparity) == 0) {
- if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
- return (SET_ERROR(EINVAL));
- /*
- * Previous versions could only support 1 or 2 parity
- * device.
- */
- if (nparity > 1 &&
- spa_version(spa) < SPA_VERSION_RAIDZ2)
- return (SET_ERROR(ENOTSUP));
- if (nparity > 2 &&
- spa_version(spa) < SPA_VERSION_RAIDZ3)
- return (SET_ERROR(ENOTSUP));
- } else {
- /*
- * We require the parity to be specified for SPAs that
- * support multiple parity levels.
- */
- if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
- return (SET_ERROR(EINVAL));
- /*
- * Otherwise, we default to 1 parity device for RAID-Z.
- */
- nparity = 1;
- }
- } else {
- nparity = 0;
- }
- ASSERT(nparity != -1ULL);
-
- /*
- * If creating a top-level vdev, check for allocation classes input
- */
if (top_level && alloctype == VDEV_ALLOC_ADD) {
char *bias;
+ /*
+ * If creating a top-level vdev, check for allocation
+ * classes input.
+ */
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
&bias) == 0) {
alloc_bias = vdev_derive_alloc_bias(bias);
@@ -710,13 +719,32 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
return (SET_ERROR(ENOTSUP));
}
}
+
+ /* spa_vdev_add() expects the feature to be enabled */
+ if (ops == &vdev_draid_ops &&
+ spa->spa_load_state != SPA_LOAD_CREATE &&
+ !spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) {
+ return (SET_ERROR(ENOTSUP));
+ }
}
- vd = vdev_alloc_common(spa, id, guid, ops);
- vic = &vd->vdev_indirect_config;
+ /*
+ * Initialize the vdev specific data. This is done before calling
+ * vdev_alloc_common() since the init may fail, and doing it first
+ * simplifies the error reporting and cleanup code paths.
+ */
+ void *tsd = NULL;
+ if (ops->vdev_op_init != NULL) {
+ rc = ops->vdev_op_init(spa, nv, &tsd);
+ if (rc != 0) {
+ return (rc);
+ }
+ }
+ vd = vdev_alloc_common(spa, id, guid, ops);
+ vd->vdev_tsd = tsd;
vd->vdev_islog = islog;
- vd->vdev_nparity = nparity;
+
if (top_level && alloc_bias != VDEV_BIAS_NONE)
vd->vdev_alloc_bias = alloc_bias;
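
The tsd ("type-specific data") handoff lets an op validate its nvlist configuration before any vdev_t exists, which is exactly why the init call precedes vdev_alloc_common(). A minimal sketch of an init/fini pair, assuming only the signatures implied by the call sites above (my_vdev_*, my_vdev_cfg_t, and the "my_ndisks" nvpair are all hypothetical):

/*
 * Sketch: allocate and validate private state from the config
 * nvlist.  On success the state is handed back through *tsd and
 * later attached to vd->vdev_tsd by vdev_alloc().
 */
typedef struct my_vdev_cfg {
	uint64_t cfg_ndisks;
} my_vdev_cfg_t;

static int
my_vdev_init(spa_t *spa, nvlist_t *nv, void **tsd)
{
	my_vdev_cfg_t *cfg;
	uint64_t ndisks;

	if (nvlist_lookup_uint64(nv, "my_ndisks", &ndisks) != 0)
		return (SET_ERROR(EINVAL));

	cfg = kmem_zalloc(sizeof (*cfg), KM_SLEEP);
	cfg->cfg_ndisks = ndisks;
	*tsd = cfg;

	return (0);
}

/* Sketch: released from vdev_free() via the new vdev_op_fini hook. */
static void
my_vdev_fini(vdev_t *vd)
{
	kmem_free(vd->vdev_tsd, sizeof (my_vdev_cfg_t));
}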
@@ -756,6 +784,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
&vd->vdev_wholedisk) != 0)
vd->vdev_wholedisk = -1ULL;
+ vic = &vd->vdev_indirect_config;
+
ASSERT0(vic->vic_mapping_object);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
&vic->vic_mapping_object);
@@ -937,6 +967,9 @@ vdev_free(vdev_t *vd)
ASSERT(vd->vdev_child == NULL);
ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
+ if (vd->vdev_ops->vdev_op_fini != NULL)
+ vd->vdev_ops->vdev_op_fini(vd);
+
/*
* Discard allocation state.
*/
@@ -1028,9 +1061,7 @@ vdev_free(vdev_t *vd)
cv_destroy(&vd->vdev_trim_io_cv);
mutex_destroy(&vd->vdev_rebuild_lock);
- mutex_destroy(&vd->vdev_rebuild_io_lock);
cv_destroy(&vd->vdev_rebuild_cv);
- cv_destroy(&vd->vdev_rebuild_io_cv);
zfs_ratelimit_fini(&vd->vdev_delay_rl);
zfs_ratelimit_fini(&vd->vdev_checksum_rl);
@@ -1161,7 +1192,8 @@ vdev_top_update(vdev_t *tvd, vdev_t *vd)
}
/*
- * Add a mirror/replacing vdev above an existing vdev.
+ * Add a mirror/replacing vdev above an existing vdev. There is no need to
+ * call .vdev_op_init() since mirror/replacing vdevs do not have private state.
*/
vdev_t *
vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
@@ -1296,6 +1328,10 @@ vdev_metaslab_group_create(vdev_t *vd)
spa->spa_max_ashift = vd->vdev_ashift;
if (vd->vdev_ashift < spa->spa_min_ashift)
spa->spa_min_ashift = vd->vdev_ashift;
+
+ uint64_t min_alloc = vdev_get_min_alloc(vd);
+ if (min_alloc < spa->spa_min_alloc)
+ spa->spa_min_alloc = min_alloc;
}
}
}
@@ -1622,39 +1658,67 @@ vdev_uses_zvols(vdev_t *vd)
return (B_FALSE);
}
-void
-vdev_open_children(vdev_t *vd)
+/*
+ * Returns B_TRUE if the passed child should be opened.
+ */
+static boolean_t
+vdev_default_open_children_func(vdev_t *vd)
+{
+ return (B_TRUE);
+}
+
+/*
+ * Open the requested child vdevs. If any of the leaf vdevs are using
+ * a ZFS volume, then do the opens in a single thread. This avoids a
+ * deadlock when the current thread is holding the spa_namespace_lock.
+ */
+static void
+vdev_open_children_impl(vdev_t *vd, vdev_open_children_func_t *open_func)
{
- taskq_t *tq;
int children = vd->vdev_children;
- /*
- * in order to handle pools on top of zvols, do the opens
- * in a single thread so that the same thread holds the
- * spa_namespace_lock
- */
- if (vdev_uses_zvols(vd)) {
-retry_sync:
- for (int c = 0; c < children; c++)
- vd->vdev_child[c]->vdev_open_error =
- vdev_open(vd->vdev_child[c]);
- } else {
- tq = taskq_create("vdev_open", children, minclsyspri,
- children, children, TASKQ_PREPOPULATE);
- if (tq == NULL)
- goto retry_sync;
+ taskq_t *tq = taskq_create("vdev_open", children, minclsyspri,
+ children, children, TASKQ_PREPOPULATE);
+ vd->vdev_nonrot = B_TRUE;
- for (int c = 0; c < children; c++)
+ for (int c = 0; c < children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (open_func(cvd) == B_FALSE)
+ continue;
+
+ if (tq == NULL || vdev_uses_zvols(vd)) {
+ cvd->vdev_open_error = vdev_open(cvd);
+ } else {
VERIFY(taskq_dispatch(tq, vdev_open_child,
- vd->vdev_child[c], TQ_SLEEP) != TASKQID_INVALID);
+ cvd, TQ_SLEEP) != TASKQID_INVALID);
+ }
+ vd->vdev_nonrot &= cvd->vdev_nonrot;
+ }
+
+ if (tq != NULL) {
+ taskq_wait(tq);
taskq_destroy(tq);
}
+}
- vd->vdev_nonrot = B_TRUE;
+/*
+ * Open all child vdevs.
+ */
+void
+vdev_open_children(vdev_t *vd)
+{
+ vdev_open_children_impl(vd, vdev_default_open_children_func);
+}
- for (int c = 0; c < children; c++)
- vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot;
+/*
+ * Conditionally open a subset of child vdevs.
+ */
+void
+vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func)
+{
+ vdev_open_children_impl(vd, open_func);
}
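
The open_func predicate is what makes the subset variant useful: an op can skip certain children on the first pass and open them later once the rest of the tree is ready. A sketch of one such predicate (vdev_open_non_spare_func() is illustrative, not the predicate the dRAID code actually installs):

/*
 * Sketch: open every child except distributed spares, which a
 * caller could then open in a second
 * vdev_open_children_subset() pass.
 */
static boolean_t
vdev_open_non_spare_func(vdev_t *vd)
{
	return (vd->vdev_ops != &vdev_draid_spare_ops);
}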
/*
@@ -1953,6 +2017,16 @@ vdev_open(vdev_t *vd)
}
/*
+ * Track the minimum allocation size.
+ */
+ if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
+ vd->vdev_islog == 0 && vd->vdev_aux == NULL) {
+ uint64_t min_alloc = vdev_get_min_alloc(vd);
+ if (min_alloc < spa->spa_min_alloc)
+ spa->spa_min_alloc = min_alloc;
+ }
+
+ /*
* If this is a leaf vdev, assess whether a resilver is needed.
* But don't do this if we are doing a reopen for a scrub, since
* this would just restart the scrub we are already doing.
@@ -2278,7 +2352,9 @@ vdev_close(vdev_t *vd)
vdev_t *pvd = vd->vdev_parent;
spa_t *spa __maybe_unused = vd->vdev_spa;
- ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+ ASSERT(vd != NULL);
+ ASSERT(vd->vdev_open_thread == curthread ||
+ spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
/*
* If our parent is reopening, then we are as well, unless we are
@@ -2606,10 +2682,26 @@ vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
}
/*
- * Returns B_TRUE if vdev determines offset needs to be resilvered.
+ * Check if the txg falls within the range that must be
+ * resilvered. DVAs outside this range can always be skipped.
+ */
+boolean_t
+vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
+ uint64_t phys_birth)
+{
+ /* Set by sequential resilver. */
+ if (phys_birth == TXG_UNKNOWN)
+ return (B_TRUE);
+
+ return (vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1));
+}
+
+/*
+ * Returns B_TRUE if the vdev determines the DVA needs to be resilvered.
*/
boolean_t
-vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
+vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
+ uint64_t phys_birth)
{
ASSERT(vd != vd->vdev_spa->spa_root_vdev);
@@ -2617,7 +2709,8 @@ vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
vd->vdev_ops->vdev_op_leaf)
return (B_TRUE);
- return (vd->vdev_ops->vdev_op_need_resilver(vd, offset, psize));
+ return (vd->vdev_ops->vdev_op_need_resilver(vd, dva, psize,
+ phys_birth));
}
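
Passing the full DVA rather than a bare offset lets geometry-aware ops rule out I/Os that cannot touch a degraded child before consulting the DTL at all. A sketch of how such an op might layer on the default, assuming a hypothetical my_offset_touches_degraded_child() helper:

/*
 * Sketch: a redundancy-aware vdev_op_need_resilver.  First check
 * whether the DVA maps onto any degraded child; only then fall
 * back to the default txg/DTL test above.
 */
static boolean_t
my_vdev_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
    uint64_t phys_birth)
{
	if (!my_offset_touches_degraded_child(vd, DVA_GET_OFFSET(dva),
	    psize))
		return (B_FALSE);

	return (vdev_default_need_resilver(vd, dva, psize, phys_birth));
}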
/*
@@ -2862,8 +2955,8 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
continue; /* leaf vdevs only */
if (t == DTL_PARTIAL)
minref = 1; /* i.e. non-zero */
- else if (vd->vdev_nparity != 0)
- minref = vd->vdev_nparity + 1; /* RAID-Z */
+ else if (vdev_get_nparity(vd) != 0)
+ minref = vdev_get_nparity(vd) + 1; /* RAID-Z, dRAID */
else
minref = vd->vdev_children; /* any kind of mirror */
space_reftree_create(&reftree);
@@ -3727,6 +3820,9 @@ top:
if (!vd->vdev_ops->vdev_op_leaf)
return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
+ if (vd->vdev_ops == &vdev_draid_spare_ops)
+ return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
+
tvd = vd->vdev_top;
mg = tvd->vdev_mg;
generation = spa->spa_config_generation + 1;
@@ -3971,6 +4067,13 @@ vdev_accessible(vdev_t *vd, zio_t *zio)
static void
vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs)
{
+ /*
+ * Exclude the dRAID spare when aggregating to avoid double counting
+ * the ops and bytes. These IOs are counted by the physical leaves.
+ */
+ if (cvd->vdev_ops == &vdev_draid_spare_ops)
+ return;
+
for (int t = 0; t < VS_ZIO_TYPES; t++) {
vs->vs_ops[t] += cvs->vs_ops[t];
vs->vs_bytes[t] += cvs->vs_bytes[t];
@@ -4063,7 +4166,6 @@ vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
vdev_get_child_stat(cvd, vs, cvs);
if (vsx)
vdev_get_child_stat_ex(cvd, vsx, cvsx);
-
}
} else {
/*
@@ -4248,7 +4350,9 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
/*
* Repair is the result of a rebuild issued by the
- * rebuild thread (vdev_rebuild_thread).
+ * rebuild thread (vdev_rebuild_thread). To avoid
+ * double counting repaired bytes, the virtual dRAID
+ * spare vdev is excluded from the processed bytes.
*/
if (zio->io_priority == ZIO_PRIORITY_REBUILD) {
vdev_t *tvd = vd->vdev_top;
@@ -4256,8 +4360,10 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt;
- if (vd->vdev_ops->vdev_op_leaf)
+ if (vd->vdev_ops->vdev_op_leaf &&
+ vd->vdev_ops != &vdev_draid_spare_ops) {
atomic_add_64(rebuilt, psize);
+ }
vs->vs_rebuild_processed += psize;
}
@@ -4981,31 +5087,42 @@ vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx)
vdev_resilver_needed(vd, NULL, NULL));
}
+boolean_t
+vdev_xlate_is_empty(range_seg64_t *rs)
+{
+ return (rs->rs_start == rs->rs_end);
+}
+
/*
- * Translate a logical range to the physical range for the specified vdev_t.
- * This function is initially called with a leaf vdev and will walk each
- * parent vdev until it reaches a top-level vdev. Once the top-level is
- * reached the physical range is initialized and the recursive function
- * begins to unwind. As it unwinds it calls the parent's vdev specific
- * translation function to do the real conversion.
+ * Translate a logical range to the first contiguous physical range for the
+ * specified vdev_t. This function is initially called with a leaf vdev and
+ * will walk each parent vdev until it reaches a top-level vdev. Once the
+ * top-level is reached the physical range is initialized and the recursive
+ * function begins to unwind. As it unwinds it calls the parent's vdev
+ * specific translation function to do the real conversion.
*/
void
vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
- range_seg64_t *physical_rs)
+ range_seg64_t *physical_rs, range_seg64_t *remain_rs)
{
/*
* Walk up the vdev tree
*/
if (vd != vd->vdev_top) {
- vdev_xlate(vd->vdev_parent, logical_rs, physical_rs);
+ vdev_xlate(vd->vdev_parent, logical_rs, physical_rs,
+ remain_rs);
} else {
/*
- * We've reached the top-level vdev, initialize the
- * physical range to the logical range and start to
- * unwind.
+ * We've reached the top-level vdev, initialize the physical
+ * range to the logical range and set an empty remaining
+ * range then start to unwind.
*/
physical_rs->rs_start = logical_rs->rs_start;
physical_rs->rs_end = logical_rs->rs_end;
+
+ remain_rs->rs_start = logical_rs->rs_start;
+ remain_rs->rs_end = logical_rs->rs_start;
+
return;
}
@@ -5015,16 +5132,40 @@ vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
/*
* As this recursive function unwinds, translate the logical
- * range into its physical components by calling the
- * vdev specific translate function.
+ * range into its physical and any remaining components by calling
+ * the vdev specific translate function.
*/
range_seg64_t intermediate = { 0 };
- pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate);
+ pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate, remain_rs);
physical_rs->rs_start = intermediate.rs_start;
physical_rs->rs_end = intermediate.rs_end;
}
+void
+vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs,
+ vdev_xlate_func_t *func, void *arg)
+{
+ range_seg64_t iter_rs = *logical_rs;
+ range_seg64_t physical_rs;
+ range_seg64_t remain_rs;
+
+ while (!vdev_xlate_is_empty(&iter_rs)) {
+
+ vdev_xlate(vd, &iter_rs, &physical_rs, &remain_rs);
+
+ /*
+ * With raidz and dRAID, it's possible that the logical range
+ * does not live on this leaf vdev. Call the provided function
+ * only when the translated physical range is non-empty.
+ */
+ if (!vdev_xlate_is_empty(&physical_rs))
+ func(arg, &physical_rs);
+
+ iter_rs = remain_rs;
+ }
+}
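
vdev_xlate_walk() turns the physical_rs/remain_rs pair into a plain iteration for callers such as trim and initialize. A usage sketch, assuming hypothetical my_sum_cb() and my_physical_size() helpers that total the physical bytes a logical range occupies on one leaf:

/* Sketch: callback matching the func(arg, &physical_rs) call above. */
static void
my_sum_cb(void *arg, range_seg64_t *physical_rs)
{
	uint64_t *sum = arg;
	*sum += physical_rs->rs_end - physical_rs->rs_start;
}

static uint64_t
my_physical_size(vdev_t *leaf_vd, uint64_t start, uint64_t end)
{
	uint64_t bytes = 0;
	range_seg64_t logical_rs = {
		.rs_start = start,
		.rs_end = end,
	};

	/* Visits each contiguous physical segment exactly once. */
	vdev_xlate_walk(leaf_vd, &logical_rs, my_sum_cb, &bytes);
	return (bytes);
}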
+
/*
* Look at the vdev tree and determine whether any devices are currently being
* replaced.