Diffstat (limited to 'module/zfs/vdev.c')
-rw-r--r-- | module/zfs/vdev.c | 353
1 file changed, 247 insertions, 106 deletions
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index e41e79ab8..38f36e52f 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -40,6 +40,7 @@
 #include <sys/dsl_dir.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_rebuild.h>
+#include <sys/vdev_draid.h>
 #include <sys/uberblock_impl.h>
 #include <sys/metaslab.h>
 #include <sys/metaslab_impl.h>
@@ -51,6 +52,7 @@
 #include <sys/arc.h>
 #include <sys/zil.h>
 #include <sys/dsl_scan.h>
+#include <sys/vdev_raidz.h>
 #include <sys/abd.h>
 #include <sys/vdev_initialize.h>
 #include <sys/vdev_trim.h>
@@ -193,6 +195,8 @@ vdev_dbgmsg_print_tree(vdev_t *vd, int indent)
 static vdev_ops_t *vdev_ops_table[] = {
     &vdev_root_ops,
     &vdev_raidz_ops,
+    &vdev_draid_ops,
+    &vdev_draid_spare_ops,
     &vdev_mirror_ops,
     &vdev_replacing_ops,
     &vdev_spare_ops,
@@ -221,10 +225,11 @@ vdev_getops(const char *type)
 /* ARGSUSED */
 void
-vdev_default_xlate(vdev_t *vd, const range_seg64_t *in, range_seg64_t *res)
+vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
+    range_seg64_t *physical_rs, range_seg64_t *remain_rs)
 {
-    res->rs_start = in->rs_start;
-    res->rs_end = in->rs_end;
+    physical_rs->rs_start = logical_rs->rs_start;
+    physical_rs->rs_end = logical_rs->rs_end;
 }
 /*
@@ -264,6 +269,12 @@ vdev_default_asize(vdev_t *vd, uint64_t psize)
     return (asize);
 }
+uint64_t
+vdev_default_min_asize(vdev_t *vd)
+{
+    return (vd->vdev_min_asize);
+}
+
 /*
  * Get the minimum allocatable size. We define the allocatable size as
  * the vdev's asize rounded to the nearest metaslab. This allows us to
@@ -289,15 +300,7 @@ vdev_get_min_asize(vdev_t *vd)
     if (vd == vd->vdev_top)
         return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));
-    /*
-     * The allocatable space for a raidz vdev is N * sizeof(smallest child),
-     * so each child must provide at least 1/Nth of its asize.
-     */
-    if (pvd->vdev_ops == &vdev_raidz_ops)
-        return ((pvd->vdev_min_asize + pvd->vdev_children - 1) /
-            pvd->vdev_children);
-
-    return (pvd->vdev_min_asize);
+    return (pvd->vdev_ops->vdev_op_min_asize(pvd));
 }
 void
@@ -309,6 +312,48 @@ vdev_set_min_asize(vdev_t *vd)
         vdev_set_min_asize(vd->vdev_child[c]);
 }
+/*
+ * Get the minimal allocation size for the top-level vdev.
+ */
+uint64_t
+vdev_get_min_alloc(vdev_t *vd)
+{
+    uint64_t min_alloc = 1ULL << vd->vdev_ashift;
+
+    if (vd->vdev_ops->vdev_op_min_alloc != NULL)
+        min_alloc = vd->vdev_ops->vdev_op_min_alloc(vd);
+
+    return (min_alloc);
+}
+
+/*
+ * Get the parity level for a top-level vdev.
+ */
+uint64_t
+vdev_get_nparity(vdev_t *vd)
+{
+    uint64_t nparity = 0;
+
+    if (vd->vdev_ops->vdev_op_nparity != NULL)
+        nparity = vd->vdev_ops->vdev_op_nparity(vd);
+
+    return (nparity);
+}
+
+/*
+ * Get the number of data disks for a top-level vdev.
+ */
+uint64_t
+vdev_get_ndisks(vdev_t *vd)
+{
+    uint64_t ndisks = 1;
+
+    if (vd->vdev_ops->vdev_op_ndisks != NULL)
+        ndisks = vd->vdev_ops->vdev_op_ndisks(vd);
+
+    return (ndisks);
+}
+
 vdev_t *
 vdev_lookup_top(spa_t *spa, uint64_t vdev)
 {
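Editor's note: the raidz special case removed from vdev_get_min_asize() above now lives behind the new optional vdev_op_min_asize callback, alongside vdev_op_min_alloc, vdev_op_nparity, and vdev_op_ndisks. As a hedged sketch only, a raidz-style callback could reproduce the removed formula (each child provides at least 1/Nth of the parent's minimum asize); the function name here is hypothetical and the real callbacks are defined in vdev_raidz.c and vdev_draid.c.

/*
 * Sketch of a raidz-style vdev_op_min_asize() callback, re-creating the
 * formula deleted from vdev_get_min_asize() above.  Illustrative only.
 */
static uint64_t
example_raidz_min_asize(vdev_t *pvd)
{
    return ((pvd->vdev_min_asize + pvd->vdev_children - 1) /
        pvd->vdev_children);
}

Vdev types that do not supply the hook simply fall back to vdev_default_min_asize() or, for vdev_get_min_alloc(), to 1 << vdev_ashift.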
@@ -551,6 +596,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
     list_link_init(&vd->vdev_initialize_node);
     list_link_init(&vd->vdev_leaf_node);
     list_link_init(&vd->vdev_trim_node);
+
     mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL);
     mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
     mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -569,9 +615,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
     cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL);
     mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL);
-    mutex_init(&vd->vdev_rebuild_io_lock, NULL, MUTEX_DEFAULT, NULL);
     cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL);
-    cv_init(&vd->vdev_rebuild_io_cv, NULL, CV_DEFAULT, NULL);
     for (int t = 0; t < DTL_TYPES; t++) {
         vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0,
@@ -600,7 +644,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
 {
     vdev_ops_t *ops;
     char *type;
-    uint64_t guid = 0, islog, nparity;
+    uint64_t guid = 0, islog;
     vdev_t *vd;
     vdev_indirect_config_t *vic;
     char *tmp = NULL;
@@ -657,48 +701,13 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
     if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
         return (SET_ERROR(ENOTSUP));
-    /*
-     * Set the nparity property for RAID-Z vdevs.
-     */
-    nparity = -1ULL;
-    if (ops == &vdev_raidz_ops) {
-        if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
-            &nparity) == 0) {
-            if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
-                return (SET_ERROR(EINVAL));
-            /*
-             * Previous versions could only support 1 or 2 parity
-             * device.
-             */
-            if (nparity > 1 &&
-                spa_version(spa) < SPA_VERSION_RAIDZ2)
-                return (SET_ERROR(ENOTSUP));
-            if (nparity > 2 &&
-                spa_version(spa) < SPA_VERSION_RAIDZ3)
-                return (SET_ERROR(ENOTSUP));
-        } else {
-            /*
-             * We require the parity to be specified for SPAs that
-             * support multiple parity levels.
-             */
-            if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
-                return (SET_ERROR(EINVAL));
-            /*
-             * Otherwise, we default to 1 parity device for RAID-Z.
-             */
-            nparity = 1;
-        }
-    } else {
-        nparity = 0;
-    }
-    ASSERT(nparity != -1ULL);
-
-    /*
-     * If creating a top-level vdev, check for allocation classes input
-     */
     if (top_level && alloctype == VDEV_ALLOC_ADD) {
         char *bias;
+        /*
+         * If creating a top-level vdev, check for allocation
+         * classes input.
+         */
         if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
             &bias) == 0) {
             alloc_bias = vdev_derive_alloc_bias(bias);
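Editor's note: with the RAID-Z parity parsing removed from vdev_alloc(), a vdev type that carries private configuration now validates its own nvlist through the vdev_op_init callback invoked in the next hunk. A minimal sketch, assuming a hypothetical parity vdev type; the struct, names, and error handling are illustrative, and the real implementations are in vdev_raidz.c and vdev_draid.c.

/*
 * Hypothetical vdev_op_init() following the (spa, nv, &tsd) convention
 * used by vdev_alloc() below: parse and validate nparity from the nvlist
 * and stash it in private per-vdev state.
 */
typedef struct example_parity_cfg {
    uint64_t pc_nparity;
} example_parity_cfg_t;

static int
example_parity_init(spa_t *spa, nvlist_t *nv, void **tsd)
{
    uint64_t nparity;

    if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) != 0)
        return (SET_ERROR(EINVAL));

    if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
        return (SET_ERROR(EINVAL));

    example_parity_cfg_t *cfg = kmem_zalloc(sizeof (*cfg), KM_SLEEP);
    cfg->pc_nparity = nparity;
    *tsd = cfg;

    return (0);
}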
@@ -710,13 +719,32 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
                 return (SET_ERROR(ENOTSUP));
             }
         }
+
+        /* spa_vdev_add() expects feature to be enabled */
+        if (ops == &vdev_draid_ops &&
+            spa->spa_load_state != SPA_LOAD_CREATE &&
+            !spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) {
+            return (SET_ERROR(ENOTSUP));
+        }
     }
-    vd = vdev_alloc_common(spa, id, guid, ops);
-    vic = &vd->vdev_indirect_config;
+    /*
+     * Initialize the vdev specific data. This is done before calling
+     * vdev_alloc_common() since it may fail and this simplifies the
+     * error reporting and cleanup code paths.
+     */
+    void *tsd = NULL;
+    if (ops->vdev_op_init != NULL) {
+        rc = ops->vdev_op_init(spa, nv, &tsd);
+        if (rc != 0) {
+            return (rc);
+        }
+    }
+    vd = vdev_alloc_common(spa, id, guid, ops);
+    vd->vdev_tsd = tsd;
     vd->vdev_islog = islog;
-    vd->vdev_nparity = nparity;
+
     if (top_level && alloc_bias != VDEV_BIAS_NONE)
         vd->vdev_alloc_bias = alloc_bias;
@@ -756,6 +784,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
         &vd->vdev_wholedisk) != 0)
         vd->vdev_wholedisk = -1ULL;
+    vic = &vd->vdev_indirect_config;
+
     ASSERT0(vic->vic_mapping_object);
     (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
         &vic->vic_mapping_object);
@@ -937,6 +967,9 @@ vdev_free(vdev_t *vd)
     ASSERT(vd->vdev_child == NULL);
     ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
+    if (vd->vdev_ops->vdev_op_fini != NULL)
+        vd->vdev_ops->vdev_op_fini(vd);
+
     /*
      * Discard allocation state.
      */
@@ -1028,9 +1061,7 @@ vdev_free(vdev_t *vd)
     cv_destroy(&vd->vdev_trim_io_cv);
     mutex_destroy(&vd->vdev_rebuild_lock);
-    mutex_destroy(&vd->vdev_rebuild_io_lock);
     cv_destroy(&vd->vdev_rebuild_cv);
-    cv_destroy(&vd->vdev_rebuild_io_cv);
     zfs_ratelimit_fini(&vd->vdev_delay_rl);
     zfs_ratelimit_fini(&vd->vdev_checksum_rl);
@@ -1161,7 +1192,8 @@ vdev_top_update(vdev_t *tvd, vdev_t *vd)
 }
 /*
- * Add a mirror/replacing vdev above an existing vdev.
+ * Add a mirror/replacing vdev above an existing vdev. There is no need to
+ * call .vdev_op_init() since mirror/replacing vdevs do not have private state.
  */
 vdev_t *
 vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
@@ -1296,6 +1328,10 @@ vdev_metaslab_group_create(vdev_t *vd)
             spa->spa_max_ashift = vd->vdev_ashift;
         if (vd->vdev_ashift < spa->spa_min_ashift)
             spa->spa_min_ashift = vd->vdev_ashift;
+
+        uint64_t min_alloc = vdev_get_min_alloc(vd);
+        if (min_alloc < spa->spa_min_alloc)
+            spa->spa_min_alloc = min_alloc;
         }
     }
 }
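Editor's note: the vdev_op_fini() hook called from vdev_free() above is where whatever vdev_op_init() attached to vd->vdev_tsd gets released. Completing the hypothetical sketch from the earlier note (again, illustrative names only, not the raidz or dRAID code):

/*
 * Hypothetical vdev_op_fini() matching example_parity_init(): free the
 * private state hanging off vd->vdev_tsd.
 */
static void
example_parity_fini(vdev_t *vd)
{
    if (vd->vdev_tsd != NULL) {
        kmem_free(vd->vdev_tsd, sizeof (example_parity_cfg_t));
        vd->vdev_tsd = NULL;
    }
}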
@@ -1622,39 +1658,67 @@ vdev_uses_zvols(vdev_t *vd)
     return (B_FALSE);
 }
-void
-vdev_open_children(vdev_t *vd)
+/*
+ * Returns B_TRUE if the passed child should be opened.
+ */
+static boolean_t
+vdev_default_open_children_func(vdev_t *vd)
+{
+    return (B_TRUE);
+}
+
+/*
+ * Open the requested child vdevs. If any of the leaf vdevs are using
+ * a ZFS volume then do the opens in a single thread. This avoids a
+ * deadlock when the current thread is holding the spa_namespace_lock.
+ */
+static void
+vdev_open_children_impl(vdev_t *vd, vdev_open_children_func_t *open_func)
 {
-    taskq_t *tq;
     int children = vd->vdev_children;
-    /*
-     * in order to handle pools on top of zvols, do the opens
-     * in a single thread so that the same thread holds the
-     * spa_namespace_lock
-     */
-    if (vdev_uses_zvols(vd)) {
-retry_sync:
-        for (int c = 0; c < children; c++)
-            vd->vdev_child[c]->vdev_open_error =
-                vdev_open(vd->vdev_child[c]);
-    } else {
-        tq = taskq_create("vdev_open", children, minclsyspri,
-            children, children, TASKQ_PREPOPULATE);
-        if (tq == NULL)
-            goto retry_sync;
+    taskq_t *tq = taskq_create("vdev_open", children, minclsyspri,
+        children, children, TASKQ_PREPOPULATE);
+    vd->vdev_nonrot = B_TRUE;
-        for (int c = 0; c < children; c++)
+    for (int c = 0; c < children; c++) {
+        vdev_t *cvd = vd->vdev_child[c];
+
+        if (open_func(cvd) == B_FALSE)
+            continue;
+
+        if (tq == NULL || vdev_uses_zvols(vd)) {
+            cvd->vdev_open_error = vdev_open(cvd);
+        } else {
             VERIFY(taskq_dispatch(tq, vdev_open_child,
-                vd->vdev_child[c], TQ_SLEEP) != TASKQID_INVALID);
+                cvd, TQ_SLEEP) != TASKQID_INVALID);
+        }
+        vd->vdev_nonrot &= cvd->vdev_nonrot;
+    }
+
+    if (tq != NULL) {
+        taskq_wait(tq);
         taskq_destroy(tq);
     }
 }
-    vd->vdev_nonrot = B_TRUE;
+/*
+ * Open all child vdevs.
+ */
+void
+vdev_open_children(vdev_t *vd)
+{
+    vdev_open_children_impl(vd, vdev_default_open_children_func);
+}
-    for (int c = 0; c < children; c++)
-        vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot;
+/*
+ * Conditionally open a subset of child vdevs.
+ */
+void
+vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func)
+{
+    vdev_open_children_impl(vd, open_func);
 }
 /*
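Editor's note: vdev_open_children_subset() lets a caller open only the children selected by a predicate. A minimal sketch of such a predicate, assuming a caller that only wants leaf children opened; the filter and call site are illustrative and not taken from this commit.

/*
 * Illustrative vdev_open_children_func_t predicate: open only children
 * which are themselves leaf vdevs.
 */
static boolean_t
example_open_leaves_only(vdev_t *cvd)
{
    return (cvd->vdev_ops->vdev_op_leaf);
}

/* Usage sketch: open just the leaf children of 'vd'. */
    vdev_open_children_subset(vd, example_open_leaves_only);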
@@ -1953,6 +2017,16 @@ vdev_open(vdev_t *vd)
     }
     /*
+     * Track the minimum allocation size.
+     */
+    if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
+        vd->vdev_islog == 0 && vd->vdev_aux == NULL) {
+        uint64_t min_alloc = vdev_get_min_alloc(vd);
+        if (min_alloc < spa->spa_min_alloc)
+            spa->spa_min_alloc = min_alloc;
+    }
+
+    /*
      * If this is a leaf vdev, assess whether a resilver is needed.
      * But don't do this if we are doing a reopen for a scrub, since
      * this would just restart the scrub we are already doing.
@@ -2278,7 +2352,9 @@ vdev_close(vdev_t *vd)
     vdev_t *pvd = vd->vdev_parent;
     spa_t *spa __maybe_unused = vd->vdev_spa;
-    ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+    ASSERT(vd != NULL);
+    ASSERT(vd->vdev_open_thread == curthread ||
+        spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
     /*
      * If our parent is reopening, then we are as well, unless we are
@@ -2606,10 +2682,26 @@ vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
 }
 /*
- * Returns B_TRUE if vdev determines offset needs to be resilvered.
+ * Check if the txg falls within the range which must be
+ * resilvered. DVAs outside this range can always be skipped.
+ */
+boolean_t
+vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
+    uint64_t phys_birth)
+{
+    /* Set by sequential resilver. */
+    if (phys_birth == TXG_UNKNOWN)
+        return (B_TRUE);
+
+    return (vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1));
+}
+
+/*
+ * Returns B_TRUE if the vdev determines the DVA needs to be resilvered.
  */
 boolean_t
-vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
+vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
+    uint64_t phys_birth)
 {
     ASSERT(vd != vd->vdev_spa->spa_root_vdev);
@@ -2617,7 +2709,8 @@ vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
         vd->vdev_ops->vdev_op_leaf)
         return (B_TRUE);
-    return (vd->vdev_ops->vdev_op_need_resilver(vd, offset, psize));
+    return (vd->vdev_ops->vdev_op_need_resilver(vd, dva, psize,
+        phys_birth));
 }
 /*
@@ -2862,8 +2955,8 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
             continue;    /* leaf vdevs only */
         if (t == DTL_PARTIAL)
             minref = 1;    /* i.e. non-zero */
-        else if (vd->vdev_nparity != 0)
-            minref = vd->vdev_nparity + 1;    /* RAID-Z */
+        else if (vdev_get_nparity(vd) != 0)
+            minref = vdev_get_nparity(vd) + 1;    /* RAID-Z, dRAID */
         else
             minref = vd->vdev_children;    /* any kind of mirror */
         space_reftree_create(&reftree);
@@ -3727,6 +3820,9 @@ top:
     if (!vd->vdev_ops->vdev_op_leaf)
         return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
+    if (vd->vdev_ops == &vdev_draid_spare_ops)
+        return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
+
     tvd = vd->vdev_top;
     mg = tvd->vdev_mg;
     generation = spa->spa_config_generation + 1;
@@ -3971,6 +4067,13 @@ vdev_accessible(vdev_t *vd, zio_t *zio)
 static void
 vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs)
 {
+    /*
+     * Exclude the dRAID spare when aggregating to avoid double counting
+     * the ops and bytes. These IOs are counted by the physical leaves.
+     */
+    if (cvd->vdev_ops == &vdev_draid_spare_ops)
+        return;
+
     for (int t = 0; t < VS_ZIO_TYPES; t++) {
         vs->vs_ops[t] += cvs->vs_ops[t];
         vs->vs_bytes[t] += cvs->vs_bytes[t];
@@ -4063,7 +4166,6 @@ vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
             vdev_get_child_stat(cvd, vs, cvs);
             if (vsx)
                 vdev_get_child_stat_ex(cvd, vsx, cvsx);
-
         }
     } else {
         /*
@@ -4248,7 +4350,9 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
             /*
              * Repair is the result of a rebuild issued by the
-             * rebuild thread (vdev_rebuild_thread).
+             * rebuild thread (vdev_rebuild_thread). To avoid
+             * double counting repaired bytes, the virtual dRAID
+             * spare vdev is excluded from the processed bytes.
              */
             if (zio->io_priority == ZIO_PRIORITY_REBUILD) {
                 vdev_t *tvd = vd->vdev_top;
                 vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
                 uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt;
-                if (vd->vdev_ops->vdev_op_leaf)
+                if (vd->vdev_ops->vdev_op_leaf &&
+                    vd->vdev_ops != &vdev_draid_spare_ops) {
                     atomic_add_64(rebuilt, psize);
+                }
                 vs->vs_rebuild_processed += psize;
             }
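Editor's note: the resilver check now takes the DVA and physical birth txg instead of a bare offset, so a vdev type (notably dRAID) can decide per-DVA whether repair is needed. A hedged caller sketch, loosely modeled on the scrub/resilver path; the function and variable names are illustrative only.

/*
 * Illustrative caller: ask each top-level vdev whether any DVA of a block
 * pointer still needs to be resilvered.
 */
static boolean_t
example_bp_needs_resilver(spa_t *spa, const blkptr_t *bp)
{
    for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
        const dva_t *dva = &bp->blk_dva[d];
        vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));

        if (vd != NULL && vdev_dtl_need_resilver(vd, dva,
            DVA_GET_ASIZE(dva), BP_PHYSICAL_BIRTH(bp)))
            return (B_TRUE);
    }
    return (B_FALSE);
}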
@@ -4981,31 +5087,42 @@ vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx)
         vdev_resilver_needed(vd, NULL, NULL));
 }
+boolean_t
+vdev_xlate_is_empty(range_seg64_t *rs)
+{
+    return (rs->rs_start == rs->rs_end);
+}
+
 /*
- * Translate a logical range to the physical range for the specified vdev_t.
- * This function is initially called with a leaf vdev and will walk each
- * parent vdev until it reaches a top-level vdev. Once the top-level is
- * reached the physical range is initialized and the recursive function
- * begins to unwind. As it unwinds it calls the parent's vdev specific
- * translation function to do the real conversion.
+ * Translate a logical range to the first contiguous physical range for the
+ * specified vdev_t. This function is initially called with a leaf vdev and
+ * will walk each parent vdev until it reaches a top-level vdev. Once the
+ * top-level is reached the physical range is initialized and the recursive
+ * function begins to unwind. As it unwinds it calls the parent's vdev
+ * specific translation function to do the real conversion.
  */
 void
 vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
-    range_seg64_t *physical_rs)
+    range_seg64_t *physical_rs, range_seg64_t *remain_rs)
 {
     /*
      * Walk up the vdev tree
      */
     if (vd != vd->vdev_top) {
-        vdev_xlate(vd->vdev_parent, logical_rs, physical_rs);
+        vdev_xlate(vd->vdev_parent, logical_rs, physical_rs,
+            remain_rs);
     } else {
         /*
-         * We've reached the top-level vdev, initialize the
-         * physical range to the logical range and start to
-         * unwind.
+         * We've reached the top-level vdev, initialize the physical
+         * range to the logical range and set an empty remaining
+         * range then start to unwind.
          */
         physical_rs->rs_start = logical_rs->rs_start;
         physical_rs->rs_end = logical_rs->rs_end;
+
+        remain_rs->rs_start = logical_rs->rs_start;
+        remain_rs->rs_end = logical_rs->rs_start;
+
         return;
     }
@@ -5015,16 +5132,40 @@ vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
     /*
      * As this recursive function unwinds, translate the logical
-     * range into its physical components by calling the
-     * vdev specific translate function.
+     * range into its physical and any remaining components by calling
+     * the vdev specific translate function.
      */
     range_seg64_t intermediate = { 0 };
-    pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate);
+    pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate, remain_rs);
     physical_rs->rs_start = intermediate.rs_start;
     physical_rs->rs_end = intermediate.rs_end;
 }
+void
+vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs,
+    vdev_xlate_func_t *func, void *arg)
+{
+    range_seg64_t iter_rs = *logical_rs;
+    range_seg64_t physical_rs;
+    range_seg64_t remain_rs;
+
+    while (!vdev_xlate_is_empty(&iter_rs)) {
+
+        vdev_xlate(vd, &iter_rs, &physical_rs, &remain_rs);
+
+        /*
+         * With raidz and dRAID, it's possible that the logical range
+         * does not live on this leaf vdev. Only call the provided
+         * function when there is a non-zero physical size.
+         */
+        if (!vdev_xlate_is_empty(&physical_rs))
+            func(arg, &physical_rs);
+
+        iter_rs = remain_rs;
+    }
+}
+
 /*
  * Look at the vdev tree and determine whether any devices are currently being
  * replaced.
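Editor's note: vdev_xlate_walk() is what callers use when a logical range may map to several discontiguous physical ranges, as it can on raidz and dRAID. A minimal usage sketch; the callback and helper names are illustrative and not part of this change.

/*
 * Illustrative vdev_xlate_walk() callback: accumulate the total physical
 * size of every contiguous segment a logical range maps to on a leaf vdev.
 */
static void
example_xlate_sum(void *arg, range_seg64_t *physical_rs)
{
    uint64_t *sum = arg;

    *sum += physical_rs->rs_end - physical_rs->rs_start;
}

/* Caller sketch: 'lvd' is a leaf vdev, 'start'/'size' describe a logical range. */
static uint64_t
example_physical_size(vdev_t *lvd, uint64_t start, uint64_t size)
{
    range_seg64_t logical_rs = {
        .rs_start = start,
        .rs_end = start + size,
    };
    uint64_t sum = 0;

    vdev_xlate_walk(lvd, &logical_rs, example_xlate_sum, &sum);
    return (sum);
}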