aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--cmd/zpool/zpool_main.c9
-rw-r--r--include/libzfs.h1
-rw-r--r--include/sys/fs/zfs.h2
-rw-r--r--include/sys/spa.h6
-rw-r--r--include/sys/spa_impl.h13
-rw-r--r--include/sys/vdev.h9
-rw-r--r--include/sys/vdev_impl.h1
-rw-r--r--include/sys/zfs_context.h1
-rw-r--r--lib/libzfs/libzfs_import.c19
-rw-r--r--lib/libzfs/libzfs_pool.c5
-rw-r--r--lib/libzpool/kernel.c10
-rw-r--r--man/man5/zfs-module-parameters.524
-rw-r--r--module/zfs/spa.c902
-rw-r--r--module/zfs/spa_config.c3
-rw-r--r--module/zfs/spa_misc.c30
-rw-r--r--module/zfs/vdev.c464
-rw-r--r--module/zfs/vdev_label.c7
-rw-r--r--module/zfs/vdev_mirror.c59
-rw-r--r--module/zfs/vdev_root.c41
-rw-r--r--module/zfs/zio.c49
-rw-r--r--tests/runfiles/linux.run15
-rw-r--r--tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am11
-rwxr-xr-xtests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_device_added.ksh76
-rwxr-xr-xtests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_device_removed.ksh145
-rwxr-xr-xtests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_device_replaced.ksh166
-rwxr-xr-xtests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_mirror_attached.ksh72
-rwxr-xr-xtests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_mirror_detached.ksh70
-rwxr-xr-xtests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_shared_device.ksh113
-rwxr-xr-xtests/zfs-tests/tests/functional/cli_root/zpool_import/import_devices_missing.ksh122
-rwxr-xr-xtests/zfs-tests/tests/functional/cli_root/zpool_import/import_paths_changed.ksh98
-rwxr-xr-xtests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_config_changed.ksh239
-rwxr-xr-xtests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_device_replaced.ksh186
-rwxr-xr-xtests/zfs-tests/tests/functional/cli_root/zpool_import/setup.ksh24
-rw-r--r--tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg26
-rw-r--r--tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib376
35 files changed, 2841 insertions, 553 deletions
diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index 453fb2131..d44589303 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -1820,6 +1820,10 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
(void) printf(gettext("currently in use"));
break;
+ case VDEV_AUX_CHILDREN_OFFLINE:
+ (void) printf(gettext("all children offline"));
+ break;
+
default:
(void) printf(gettext("corrupted data"));
break;
@@ -1919,6 +1923,10 @@ print_import_config(status_cbdata_t *cb, const char *name, nvlist_t *nv,
(void) printf(gettext("currently in use"));
break;
+ case VDEV_AUX_CHILDREN_OFFLINE:
+ (void) printf(gettext("all children offline"));
+ break;
+
default:
(void) printf(gettext("corrupted data"));
break;
@@ -2752,6 +2760,7 @@ zpool_do_import(int argc, char **argv)
idata.guid = searchguid;
idata.cachefile = cachefile;
idata.scan = do_scan;
+ idata.policy = policy;
pools = zpool_search_import(g_zfs, &idata);
diff --git a/include/libzfs.h b/include/libzfs.h
index cbaaa13a2..45eb5c904 100644
--- a/include/libzfs.h
+++ b/include/libzfs.h
@@ -413,6 +413,7 @@ typedef struct importargs {
int unique : 1; /* does 'poolname' already exist? */
int exists : 1; /* set on return if pool already exists */
int scan : 1; /* prefer scanning to libblkid cache */
+ nvlist_t *policy; /* rewind policy (rewind txg, etc.) */
} importargs_t;
extern nvlist_t *zpool_search_import(libzfs_handle_t *, importargs_t *);
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index de3b729eb..fa4eb2721 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -704,6 +704,7 @@ typedef struct zpool_rewind_policy {
#define ZPOOL_CONFIG_VDEV_TOP_ZAP "com.delphix:vdev_zap_top"
#define ZPOOL_CONFIG_VDEV_LEAF_ZAP "com.delphix:vdev_zap_leaf"
#define ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS "com.delphix:has_per_vdev_zaps"
+#define ZPOOL_CONFIG_CACHEFILE "cachefile" /* not stored on disk */
#define ZPOOL_CONFIG_MMP_STATE "mmp_state" /* not stored on disk */
#define ZPOOL_CONFIG_MMP_TXG "mmp_txg" /* not stored on disk */
#define ZPOOL_CONFIG_MMP_HOSTNAME "mmp_hostname" /* not stored on disk */
@@ -811,6 +812,7 @@ typedef enum vdev_aux {
VDEV_AUX_BAD_ASHIFT, /* vdev ashift is invalid */
VDEV_AUX_EXTERNAL_PERSIST, /* persistent forced fault */
VDEV_AUX_ACTIVE, /* vdev active on a different host */
+ VDEV_AUX_CHILDREN_OFFLINE, /* all children are offline */
} vdev_aux_t;
/*
diff --git a/include/sys/spa.h b/include/sys/spa.h
index 1172468ad..8a3938e86 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -410,6 +410,7 @@ typedef enum bp_embedded_type {
#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */
#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */
+#define SPA_SYNC_MIN_VDEVS 3 /* min vdevs to update during sync */
/*
* A block is a hole when it has either 1) never been written to, or
@@ -1015,11 +1016,16 @@ extern boolean_t spa_has_pending_synctask(spa_t *spa);
extern int spa_maxblocksize(spa_t *spa);
extern int spa_maxdnodesize(spa_t *spa);
extern void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp);
+extern boolean_t zfs_dva_valid(spa_t *spa, const dva_t *dva,
+ const blkptr_t *bp);
typedef void (*spa_remap_cb_t)(uint64_t vdev, uint64_t offset, uint64_t size,
void *arg);
extern boolean_t spa_remap_blkptr(spa_t *spa, blkptr_t *bp,
spa_remap_cb_t callback, void *arg);
extern uint64_t spa_get_last_removal_txg(spa_t *spa);
+extern boolean_t spa_trust_config(spa_t *spa);
+extern uint64_t spa_missing_tvds_allowed(spa_t *spa);
+extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing);
extern boolean_t spa_multihost(spa_t *spa);
extern unsigned long spa_get_hostid(void);
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index 90d929268..cd214c29c 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -184,6 +184,15 @@ typedef enum spa_all_vdev_zap_action {
AVZ_ACTION_INITIALIZE
} spa_avz_action_t;
+typedef enum spa_config_source {
+ SPA_CONFIG_SRC_NONE = 0,
+ SPA_CONFIG_SRC_SCAN, /* scan of path (default: /dev/dsk) */
+ SPA_CONFIG_SRC_CACHEFILE, /* any cachefile */
+ SPA_CONFIG_SRC_TRYIMPORT, /* returned from call to tryimport */
+ SPA_CONFIG_SRC_SPLIT, /* new pool in a pool split */
+ SPA_CONFIG_SRC_MOS /* MOS, but not always from right txg */
+} spa_config_source_t;
+
struct spa {
/*
* Fields protected by spa_namespace_lock.
@@ -202,6 +211,8 @@ struct spa {
uint8_t spa_sync_on; /* sync threads are running */
spa_load_state_t spa_load_state; /* current load operation */
boolean_t spa_indirect_vdevs_loaded; /* mappings loaded? */
+ boolean_t spa_trust_config; /* do we trust vdev tree? */
+ spa_config_source_t spa_config_source; /* where config comes from? */
uint64_t spa_import_flags; /* import specific flags */
spa_taskqs_t spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES];
dsl_pool_t *spa_dsl_pool;
@@ -263,6 +274,8 @@ struct spa {
int spa_async_suspended; /* async tasks suspended */
kcondvar_t spa_async_cv; /* wait for thread_exit() */
uint16_t spa_async_tasks; /* async task mask */
+ uint64_t spa_missing_tvds; /* unopenable tvds on load */
+ uint64_t spa_missing_tvds_allowed; /* allow loading spa? */
spa_removing_phys_t spa_removing_phys;
spa_vdev_removal_t *spa_vdev_removal;
diff --git a/include/sys/vdev.h b/include/sys/vdev.h
index a9b99331b..161e30ae7 100644
--- a/include/sys/vdev.h
+++ b/include/sys/vdev.h
@@ -48,9 +48,12 @@ typedef enum vdev_dtl_type {
extern int zfs_nocacheflush;
extern void vdev_dbgmsg(vdev_t *vd, const char *fmt, ...);
+extern void vdev_dbgmsg_print_tree(vdev_t *, int);
extern int vdev_open(vdev_t *);
extern void vdev_open_children(vdev_t *);
-extern int vdev_validate(vdev_t *, boolean_t);
+extern int vdev_validate(vdev_t *);
+extern int vdev_copy_path_strict(vdev_t *, vdev_t *);
+extern void vdev_copy_path_relaxed(vdev_t *, vdev_t *);
extern void vdev_close(vdev_t *);
extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace);
extern void vdev_reopen(vdev_t *);
@@ -100,6 +103,7 @@ extern void vdev_scan_stat_init(vdev_t *vd);
extern void vdev_propagate_state(vdev_t *vd);
extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state,
vdev_aux_t aux);
+extern boolean_t vdev_children_are_offline(vdev_t *vd);
extern void vdev_space_update(vdev_t *vd,
int64_t alloc_delta, int64_t defer_delta, int64_t space_delta);
@@ -145,7 +149,8 @@ typedef enum vdev_config_flag {
VDEV_CONFIG_SPARE = 1 << 0,
VDEV_CONFIG_L2CACHE = 1 << 1,
VDEV_CONFIG_REMOVING = 1 << 2,
- VDEV_CONFIG_MOS = 1 << 3
+ VDEV_CONFIG_MOS = 1 << 3,
+ VDEV_CONFIG_MISSING = 1 << 4
} vdev_config_flag_t;
extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config);
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index b933f9ab8..e28994613 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -437,7 +437,6 @@ extern void vdev_remove_parent(vdev_t *cvd);
/*
* vdev sync load and sync
*/
-extern void vdev_load_log_state(vdev_t *nvd, vdev_t *ovd);
extern boolean_t vdev_log_state_valid(vdev_t *vd);
extern int vdev_load(vdev_t *vd);
extern int vdev_dtl_load(vdev_t *vd);
diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h
index 2e311cffd..37bdc533c 100644
--- a/include/sys/zfs_context.h
+++ b/include/sys/zfs_context.h
@@ -674,6 +674,7 @@ typedef struct callb_cpr {
#define zone_dataset_visible(x, y) (1)
#define INGLOBALZONE(z) (1)
+extern uint32_t zone_get_hostid(void *zonep);
extern char *kmem_vasprintf(const char *fmt, va_list adx);
extern char *kmem_asprintf(const char *fmt, ...);
diff --git a/lib/libzfs/libzfs_import.c b/lib/libzfs/libzfs_import.c
index cc9a52a3e..68b5988cd 100644
--- a/lib/libzfs/libzfs_import.c
+++ b/lib/libzfs/libzfs_import.c
@@ -897,7 +897,8 @@ vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id)
* return to the user.
*/
static nvlist_t *
-get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok)
+get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok,
+ nvlist_t *policy)
{
pool_entry_t *pe;
vdev_entry_t *ve;
@@ -1230,6 +1231,12 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok)
continue;
}
+ if (policy != NULL) {
+ if (nvlist_add_nvlist(config, ZPOOL_REWIND_POLICY,
+ policy) != 0)
+ goto nomem;
+ }
+
if ((nvl = refresh_config(hdl, config)) == NULL) {
nvlist_free(config);
config = NULL;
@@ -2080,7 +2087,7 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg)
free(cache);
pthread_mutex_destroy(&lock);
- ret = get_configs(hdl, &pools, iarg->can_be_active);
+ ret = get_configs(hdl, &pools, iarg->can_be_active, iarg->policy);
for (pe = pools.pools; pe != NULL; pe = penext) {
penext = pe->pe_next;
@@ -2209,6 +2216,14 @@ zpool_find_import_cached(libzfs_handle_t *hdl, const char *cachefile,
if (active)
continue;
+ if (nvlist_add_string(src, ZPOOL_CONFIG_CACHEFILE,
+ cachefile) != 0) {
+ (void) no_memory(hdl);
+ nvlist_free(raw);
+ nvlist_free(pools);
+ return (NULL);
+ }
+
if ((dst = refresh_config(hdl, src)) == NULL) {
nvlist_free(raw);
nvlist_free(pools);
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index 2d94cd320..d082a5f66 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -1935,8 +1935,9 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
nvlist_lookup_nvlist(nvinfo,
ZPOOL_CONFIG_MISSING_DEVICES, &missing) == 0) {
(void) printf(dgettext(TEXT_DOMAIN,
- "The devices below are missing, use "
- "'-m' to import the pool anyway:\n"));
+ "The devices below are missing or "
+ "corrupted, use '-m' to import the pool "
+ "anyway:\n"));
print_vdev_tree(hdl, NULL, missing, 2);
(void) printf("\n");
}
diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c
index 09e69ef6d..f3e84975c 100644
--- a/lib/libzpool/kernel.c
+++ b/lib/libzpool/kernel.c
@@ -297,6 +297,16 @@ rw_tryenter(krwlock_t *rwlp, krw_t rw)
return (0);
}
+/* ARGSUSED */
+uint32_t
+zone_get_hostid(void *zonep)
+{
+ /*
+ * Userland hostid emulation: hw_serial parsed as decimal.
+ */
+ return (strtoul(hw_serial, NULL, 10));
+}
+
int
rw_tryupgrade(krwlock_t *rwlp)
{
diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5
index 822146a7a..886dffce8 100644
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@@ -354,6 +354,18 @@ Default value: \fB24\fR.
.sp
.ne 2
.na
+\fBspa_load_print_vdev_tree\fR (int)
+.ad
+.RS 12n
+Whether to print the vdev tree in the debugging message buffer during pool import.
+Use 0 to disable and 1 to enable.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
\fBspa_load_verify_data\fR (int)
.ad
.RS 12n
@@ -704,6 +716,18 @@ Default value: \fB0\fR.
.sp
.ne 2
.na
+\fBzfs_max_missing_tvds\fR (ulong)
+.ad
+.RS 12n
+Number of missing top-level vdevs which will be allowed during
+pool import (only in read-only mode).
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
\fBzfs_multilist_num_sublists\fR (int)
.ad
.RS 12n
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 80f0c6f36..3177f9649 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -157,9 +157,8 @@ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
static void spa_sync_version(void *arg, dmu_tx_t *tx);
static void spa_sync_props(void *arg, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);
-static inline int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
- spa_load_state_t state, spa_import_type_t type, boolean_t trust_config,
- char **ereport);
+static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport,
+ boolean_t reloading);
static void spa_vdev_resilver_done(spa_t *spa);
uint_t zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */
@@ -182,6 +181,54 @@ boolean_t spa_load_verify_dryrun = B_FALSE;
#define TRYIMPORT_NAME "$import"
/*
+ * For debugging purposes: print out vdev tree during pool import.
+ */
+int spa_load_print_vdev_tree = B_FALSE;
+
+/*
+ * A non-zero value for zfs_max_missing_tvds means that we allow importing
+ * pools with missing top-level vdevs. This is strictly intended for advanced
+ * pool recovery cases since missing data is almost inevitable. Pools with
+ * missing devices can only be imported read-only for safety reasons, and their
+ * fail-mode will be automatically set to "continue".
+ *
+ * With 1 missing vdev we should be able to import the pool and mount all
+ * datasets. User data that was not modified after the missing device has been
+ * added should be recoverable. This means that snapshots created prior to the
+ * addition of that device should be completely intact.
+ *
+ * With 2 missing vdevs, some datasets may fail to mount since there are
+ * dataset statistics that are stored as regular metadata. Some data might be
+ * recoverable if those vdevs were added recently.
+ *
+ * With 3 or more missing vdevs, the pool is severely damaged and MOS entries
+ * may be missing entirely. Chances of data recovery are very low. Note that
+ * there are also risks of performing an inadvertent rewind as we might be
+ * missing all the vdevs with the latest uberblocks.
+ */
+unsigned long zfs_max_missing_tvds = 0;
+
+/*
+ * The parameters below are similar to zfs_max_missing_tvds but are only
+ * intended for a preliminary open of the pool with an untrusted config which
+ * might be incomplete or out-dated.
+ *
+ * We are more tolerant for pools opened from a cachefile since we could have
+ * an out-dated cachefile where a device removal was not registered.
+ * We could have set the limit arbitrarily high but in the case where devices
+ * are really missing we would want to return the proper error codes; we chose
+ * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available
+ * and we get a chance to retrieve the trusted config.
+ */
+uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1;
+/*
+ * In the case where config was assembled by scanning device paths (/dev/dsk
+ * by default) we are less tolerant since all the existing devices should have
+ * been detected and we want spa_load to return the right error codes.
+ */
+uint64_t zfs_max_missing_tvds_scan = 0;
+
+/*
* ==========================================================================
* SPA properties routines
* ==========================================================================
@@ -1757,13 +1804,34 @@ load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
}
/*
+ * Concrete top-level vdevs that are not missing and are not logs. At every
+ * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds.
+ */
+static uint64_t
+spa_healthy_core_tvds(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t tvds = 0;
+
+ for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+ vdev_t *vd = rvd->vdev_child[i];
+ if (vd->vdev_islog)
+ continue;
+ if (vdev_is_concrete(vd) && !vdev_is_dead(vd))
+ tvds++;
+ }
+
+ return (tvds);
+}
+
+/*
* Checks to see if the given vdev could not be opened, in which case we post a
* sysevent to notify the autoreplace code that the device has been removed.
*/
static void
spa_check_removed(vdev_t *vd)
{
- for (int c = 0; c < vd->vdev_children; c++)
+ for (uint64_t c = 0; c < vd->vdev_children; c++)
spa_check_removed(vd->vdev_child[c]);
if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
@@ -1773,38 +1841,14 @@ spa_check_removed(vdev_t *vd)
}
}
-static void
-spa_config_valid_zaps(vdev_t *vd, vdev_t *mvd)
-{
- ASSERT3U(vd->vdev_children, ==, mvd->vdev_children);
-
- vd->vdev_top_zap = mvd->vdev_top_zap;
- vd->vdev_leaf_zap = mvd->vdev_leaf_zap;
-
- for (uint64_t i = 0; i < vd->vdev_children; i++) {
- spa_config_valid_zaps(vd->vdev_child[i], mvd->vdev_child[i]);
- }
-}
-
-/*
- * Validate the current config against the MOS config
- */
-static boolean_t
-spa_config_valid(spa_t *spa, nvlist_t *config)
+static int
+spa_check_for_missing_logs(spa_t *spa)
{
- vdev_t *mrvd, *rvd = spa->spa_root_vdev;
- nvlist_t *nv;
-
- VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);
-
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
- VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
-
- ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);
+ vdev_t *rvd = spa->spa_root_vdev;
/*
* If we're doing a normal import, then build up any additional
- * diagnostic information about missing devices in this config.
+ * diagnostic information about missing log devices.
* We'll pass this up to the user for further processing.
*/
if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
@@ -1815,109 +1859,52 @@ spa_config_valid(spa_t *spa, nvlist_t *config)
KM_SLEEP);
VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- for (int c = 0; c < rvd->vdev_children; c++) {
+ for (uint64_t c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
- vdev_t *mtvd = mrvd->vdev_child[c];
- if (tvd->vdev_ops == &vdev_missing_ops &&
- mtvd->vdev_ops != &vdev_missing_ops &&
- mtvd->vdev_islog)
- child[idx++] = vdev_config_generate(spa, mtvd,
- B_FALSE, 0);
+ /*
+ * We consider a device as missing only if it failed
+ * to open (i.e. offline or faulted is not considered
+ * as missing).
+ */
+ if (tvd->vdev_islog &&
+ tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
+ child[idx++] = vdev_config_generate(spa, tvd,
+ B_FALSE, VDEV_CONFIG_MISSING);
+ }
}
- if (idx) {
- VERIFY(nvlist_add_nvlist_array(nv,
- ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
- VERIFY(nvlist_add_nvlist(spa->spa_load_info,
- ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0);
+ if (idx > 0) {
+ fnvlist_add_nvlist_array(nv,
+ ZPOOL_CONFIG_CHILDREN, child, idx);
+ fnvlist_add_nvlist(spa->spa_load_info,
+ ZPOOL_CONFIG_MISSING_DEVICES, nv);
- for (int i = 0; i < idx; i++)
+ for (uint64_t i = 0; i < idx; i++)
nvlist_free(child[i]);
}
nvlist_free(nv);
kmem_free(child, rvd->vdev_children * sizeof (char **));
- }
- /*
- * Compare the root vdev tree with the information we have
- * from the MOS config (mrvd). Check each top-level vdev
- * with the corresponding MOS config top-level (mtvd).
- */
- for (int c = 0; c < rvd->vdev_children; c++) {
- vdev_t *tvd = rvd->vdev_child[c];
- vdev_t *mtvd = mrvd->vdev_child[c];
-
- /*
- * Resolve any "missing" vdevs in the current configuration.
- * Also trust the MOS config about any "indirect" vdevs.
- * If we find that the MOS config has more accurate information
- * about the top-level vdev then use that vdev instead.
- */
- if ((tvd->vdev_ops == &vdev_missing_ops &&
- mtvd->vdev_ops != &vdev_missing_ops) ||
- (mtvd->vdev_ops == &vdev_indirect_ops &&
- tvd->vdev_ops != &vdev_indirect_ops)) {
-
- /*
- * Device specific actions.
- */
- if (mtvd->vdev_islog) {
- if (!(spa->spa_import_flags &
- ZFS_IMPORT_MISSING_LOG)) {
- continue;
- }
+ if (idx > 0) {
+ spa_load_failed(spa, "some log devices are missing");
+ return (SET_ERROR(ENXIO));
+ }
+ } else {
+ for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+ if (tvd->vdev_islog &&
+ tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
spa_set_log_state(spa, SPA_LOG_CLEAR);
- } else if (mtvd->vdev_ops != &vdev_indirect_ops) {
- continue;
- }
-
- /*
- * Swap the missing vdev with the data we were
- * able to obtain from the MOS config.
- */
- vdev_remove_child(rvd, tvd);
- vdev_remove_child(mrvd, mtvd);
-
- vdev_add_child(rvd, mtvd);
- vdev_add_child(mrvd, tvd);
-
- vdev_reopen(rvd);
- } else {
- if (mtvd->vdev_islog) {
- /*
- * Load the slog device's state from the MOS
- * config since it's possible that the label
- * does not contain the most up-to-date
- * information.
- */
- vdev_load_log_state(tvd, mtvd);
- vdev_reopen(tvd);
+ spa_load_note(spa, "some log devices are "
+ "missing, ZIL is dropped.");
+ break;
}
-
- /*
- * Per-vdev ZAP info is stored exclusively in the MOS.
- */
- spa_config_valid_zaps(tvd, mtvd);
}
-
- /*
- * Never trust this info from userland; always use what's
- * in the MOS. This prevents it from getting out of sync
- * with the rest of the info in the MOS.
- */
- tvd->vdev_removing = mtvd->vdev_removing;
- tvd->vdev_indirect_config = mtvd->vdev_indirect_config;
}
- vdev_free(mrvd);
- spa_config_exit(spa, SCL_ALL, FTAG);
-
- /*
- * Ensure we were able to validate the config.
- */
- return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
+ return (0);
}
/*
@@ -2311,53 +2298,15 @@ spa_try_repair(spa_t *spa, nvlist_t *config)
}
static int
-spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
- boolean_t trust_config)
+spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type)
{
- nvlist_t *config = spa->spa_config;
char *ereport = FM_EREPORT_ZFS_POOL;
- char *comment;
int error;
- uint64_t pool_guid;
- nvlist_t *nvl;
-
- if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
- return (SET_ERROR(EINVAL));
-
- ASSERT(spa->spa_comment == NULL);
- if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
- spa->spa_comment = spa_strdup(comment);
-
- /*
- * Versioning wasn't explicitly added to the label until later, so if
- * it's not present treat it as the initial version.
- */
- if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
- &spa->spa_ubsync.ub_version) != 0)
- spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
-
- (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
- &spa->spa_config_txg);
-
- if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
- spa_guid_exists(pool_guid, 0)) {
- error = SET_ERROR(EEXIST);
- } else {
- spa->spa_config_guid = pool_guid;
- if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
- &nvl) == 0) {
- VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting,
- KM_SLEEP) == 0);
- }
-
- nvlist_free(spa->spa_load_info);
- spa->spa_load_info = fnvlist_alloc();
+ spa->spa_load_state = state;
- gethrestime(&spa->spa_loaded_ts);
- error = spa_load_impl(spa, pool_guid, config, state, type,
- trust_config, &ereport);
- }
+ gethrestime(&spa->spa_loaded_ts);
+ error = spa_load_impl(spa, type, &ereport, B_FALSE);
/*
* Don't count references from objsets that are already closed
@@ -2611,13 +2560,80 @@ out:
}
static int
-spa_ld_parse_config(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
- spa_import_type_t type)
+spa_verify_host(spa_t *spa, nvlist_t *mos_config)
+{
+ uint64_t hostid;
+ char *hostname;
+ uint64_t myhostid = 0;
+
+ if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config,
+ ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
+ hostname = fnvlist_lookup_string(mos_config,
+ ZPOOL_CONFIG_HOSTNAME);
+
+ myhostid = zone_get_hostid(NULL);
+
+ if (hostid != 0 && myhostid != 0 && hostid != myhostid) {
+ cmn_err(CE_WARN, "pool '%s' could not be "
+ "loaded as it was last accessed by "
+ "another system (host: %s hostid: 0x%llx). "
+ "See: http://illumos.org/msg/ZFS-8000-EY",
+ spa_name(spa), hostname, (u_longlong_t)hostid);
+ spa_load_failed(spa, "hostid verification failed: pool "
+ "last accessed by host: %s (hostid: 0x%llx)",
+ hostname, (u_longlong_t)hostid);
+ return (SET_ERROR(EBADF));
+ }
+ }
+
+ return (0);
+}
+
+static int
+spa_ld_parse_config(spa_t *spa, spa_import_type_t type)
{
int error = 0;
- nvlist_t *nvtree = NULL;
+ nvlist_t *nvtree, *nvl, *config = spa->spa_config;
int parse;
vdev_t *rvd;
+ uint64_t pool_guid;
+ char *comment;
+
+ /*
+ * Versioning wasn't explicitly added to the label until later, so if
+ * it's not present treat it as the initial version.
+ */
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
+ &spa->spa_ubsync.ub_version) != 0)
+ spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
+
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
+ spa_load_failed(spa, "invalid config provided: '%s' missing",
+ ZPOOL_CONFIG_POOL_GUID);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if ((spa->spa_load_state == SPA_LOAD_IMPORT || spa->spa_load_state ==
+ SPA_LOAD_TRYIMPORT) && spa_guid_exists(pool_guid, 0)) {
+ spa_load_failed(spa, "a pool with guid %llu is already open",
+ (u_longlong_t)pool_guid);
+ return (SET_ERROR(EEXIST));
+ }
+
+ spa->spa_config_guid = pool_guid;
+
+ nvlist_free(spa->spa_load_info);
+ spa->spa_load_info = fnvlist_alloc();
+
+ ASSERT(spa->spa_comment == NULL);
+ if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
+ spa->spa_comment = spa_strdup(comment);
+
+ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+ &spa->spa_config_txg);
+
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0)
+ spa->spa_config_splitting = fnvlist_dup(nvl);
if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) {
spa_load_failed(spa, "invalid config provided: '%s' missing",
@@ -2625,9 +2641,6 @@ spa_ld_parse_config(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
return (SET_ERROR(EINVAL));
}
- parse = (type == SPA_IMPORT_EXISTING ?
- VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
-
/*
* Create "The Godfather" zio to hold all async IOs
*/
@@ -2645,6 +2658,8 @@ spa_ld_parse_config(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
* configuration requires knowing the version number.
*/
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ parse = (type == SPA_IMPORT_EXISTING ?
+ VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse);
spa_config_exit(spa, SCL_ALL, FTAG);
@@ -2665,71 +2680,105 @@ spa_ld_parse_config(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
return (0);
}
+/*
+ * Recursively open all vdevs in the vdev tree. This function is called twice:
+ * first with the untrusted config, then with the trusted config.
+ */
static int
spa_ld_open_vdevs(spa_t *spa)
{
int error = 0;
+ /*
+ * spa_missing_tvds_allowed defines how many top-level vdevs can be
+ * missing/unopenable for the root vdev to be still considered openable.
+ */
+ if (spa->spa_trust_config) {
+ spa->spa_missing_tvds_allowed = zfs_max_missing_tvds;
+ } else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) {
+ spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile;
+ } else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) {
+ spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan;
+ } else {
+ spa->spa_missing_tvds_allowed = 0;
+ }
+
+ spa->spa_missing_tvds_allowed =
+ MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed);
+
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
error = vdev_open(spa->spa_root_vdev);
spa_config_exit(spa, SCL_ALL, FTAG);
+
+ if (spa->spa_missing_tvds != 0) {
+ spa_load_note(spa, "vdev tree has %llu missing top-level "
+ "vdevs.", (u_longlong_t)spa->spa_missing_tvds);
+ if (spa->spa_trust_config && (spa->spa_mode & FWRITE)) {
+ /*
+ * Although theoretically we could allow users to open
+ * incomplete pools in RW mode, we'd need to add a lot
+ * of extra logic (e.g. adjust pool space to account
+ * for missing vdevs).
+ * This limitation also prevents users from accidentally
+ * opening the pool in RW mode during data recovery and
+ * damaging it further.
+ */
+ spa_load_note(spa, "pools with missing top-level "
+ "vdevs can only be opened in read-only mode.");
+ error = SET_ERROR(ENXIO);
+ } else {
+ spa_load_note(spa, "current settings allow for maximum "
+ "%llu missing top-level vdevs at this stage.",
+ (u_longlong_t)spa->spa_missing_tvds_allowed);
+ }
+ }
if (error != 0) {
spa_load_failed(spa, "unable to open vdev tree [error=%d]",
error);
}
+ if (spa->spa_missing_tvds != 0 || error != 0)
+ vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2);
return (error);
}
+/*
+ * We need to validate the vdev labels against the configuration that
+ * we have in hand. This function is called twice: first with an untrusted
+ * config, then with a trusted config. The validation is more strict when the
+ * config is trusted.
+ */
static int
-spa_ld_validate_vdevs(spa_t *spa, spa_import_type_t type,
- boolean_t trust_config)
+spa_ld_validate_vdevs(spa_t *spa)
{
int error = 0;
vdev_t *rvd = spa->spa_root_vdev;
- /*
- * We need to validate the vdev labels against the configuration that
- * we have in hand, which is dependent on the setting of trust_config.
- * If trust_config is true then we're validating the vdev labels based
- * on that config. Otherwise, we're validating against the cached
- * config (zpool.cache) that was read when we loaded the zfs module, and
- * then later we will recursively call spa_load() and validate against
- * the vdev config.
- *
- * If we're assembling a new pool that's been split off from an
- * existing pool, the labels haven't yet been updated so we skip
- * validation for now.
- */
- if (type != SPA_IMPORT_ASSEMBLE) {
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
- error = vdev_validate(rvd, trust_config);
- spa_config_exit(spa, SCL_ALL, FTAG);
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ error = vdev_validate(rvd);
+ spa_config_exit(spa, SCL_ALL, FTAG);
- if (error != 0) {
- spa_load_failed(spa, "vdev_validate failed [error=%d]",
- error);
- return (error);
- }
+ if (error != 0) {
+ spa_load_failed(spa, "vdev_validate failed [error=%d]", error);
+ return (error);
+ }
- if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
- spa_load_failed(spa, "cannot open vdev tree after "
- "invalidating some vdevs");
- return (SET_ERROR(ENXIO));
- }
+ if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
+ spa_load_failed(spa, "cannot open vdev tree after invalidating "
+ "some vdevs");
+ vdev_dbgmsg_print_tree(rvd, 2);
+ return (SET_ERROR(ENXIO));
}
return (0);
}
static int
-spa_ld_select_uberblock(spa_t *spa, nvlist_t *config, spa_import_type_t type,
- boolean_t trust_config)
+spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
{
vdev_t *rvd = spa->spa_root_vdev;
nvlist_t *label;
uberblock_t *ub = &spa->spa_uberblock;
- uint64_t children;
boolean_t activity_check = B_FALSE;
/*
@@ -2755,7 +2804,8 @@ spa_ld_select_uberblock(spa_t *spa, nvlist_t *config, spa_import_type_t type,
* pool is truly inactive and can be safely imported. Prevent
* hosts which don't have a hostid set from importing the pool.
*/
- activity_check = spa_activity_check_required(spa, ub, label, config);
+ activity_check = spa_activity_check_required(spa, ub, label,
+ spa->spa_config);
if (activity_check) {
if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay &&
spa_get_hostid() == 0) {
@@ -2765,7 +2815,7 @@ spa_ld_select_uberblock(spa_t *spa, nvlist_t *config, spa_import_type_t type,
return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
}
- int error = spa_activity_check(spa, ub, config);
+ int error = spa_activity_check(spa, ub, spa->spa_config);
if (error) {
nvlist_free(label);
return (error);
@@ -2851,26 +2901,9 @@ spa_ld_select_uberblock(spa_t *spa, nvlist_t *config, spa_import_type_t type,
nvlist_free(unsup_feat);
}
- /*
- * If the vdev guid sum doesn't match the uberblock, we have an
- * incomplete configuration. We first check to see if the pool
- * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN).
- * If it is, defer the vdev_guid_sum check till later so we
- * can handle missing vdevs.
- */
- if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
- &children) != 0 && trust_config && type != SPA_IMPORT_ASSEMBLE &&
- rvd->vdev_guid_sum != ub->ub_guid_sum) {
- spa_load_failed(spa, "guid sum in config doesn't match guid "
- "sum in uberblock (%llu != %llu)",
- (u_longlong_t)rvd->vdev_guid_sum,
- (u_longlong_t)ub->ub_guid_sum);
- return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
- }
-
if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
- spa_try_repair(spa, config);
+ spa_try_repair(spa, spa->spa_config);
spa_config_exit(spa, SCL_ALL, FTAG);
nvlist_free(spa->spa_config_splitting);
spa->spa_config_splitting = NULL;
@@ -2909,49 +2942,167 @@ spa_ld_open_rootbp(spa_t *spa)
}
static int
-spa_ld_validate_config(spa_t *spa, spa_import_type_t type)
+spa_ld_load_trusted_config(spa_t *spa, spa_import_type_t type,
+ boolean_t reloading)
{
- vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *mrvd, *rvd = spa->spa_root_vdev;
+ nvlist_t *nv, *mos_config, *policy;
+ int error = 0, copy_error;
+ uint64_t healthy_tvds, healthy_tvds_mos;
+ uint64_t mos_config_txg;
if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE)
!= 0)
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
/*
- * Validate the config, using the MOS config to fill in any
- * information which might be missing. If we fail to validate
- * the config then declare the pool unfit for use. If we're
- * assembling a pool from a split, the log is not transferred
- * over.
+ * If we're assembling a pool from a split, the config provided is
+ * already trusted so there is nothing to do.
*/
- if (type != SPA_IMPORT_ASSEMBLE) {
- nvlist_t *mos_config;
- if (load_nvlist(spa, spa->spa_config_object, &mos_config)
- != 0) {
- spa_load_failed(spa, "unable to retrieve MOS config");
- return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
- }
+ if (type == SPA_IMPORT_ASSEMBLE)
+ return (0);
+
+ healthy_tvds = spa_healthy_core_tvds(spa);
- if (!spa_config_valid(spa, mos_config)) {
+ if (load_nvlist(spa, spa->spa_config_object, &mos_config)
+ != 0) {
+ spa_load_failed(spa, "unable to retrieve MOS config");
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ /*
+ * If we are doing an open, the pool owner has not been verified yet,
+ * so do the verification here.
+ */
+ if (spa->spa_load_state == SPA_LOAD_OPEN) {
+ error = spa_verify_host(spa, mos_config);
+ if (error != 0) {
nvlist_free(mos_config);
- spa_load_failed(spa, "mismatch between config provided "
- "and config stored in MOS");
- return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
- ENXIO));
+ return (error);
}
- nvlist_free(mos_config);
+ }
+
+ nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE);
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+
+ /*
+ * Build a new vdev tree from the trusted config
+ */
+ VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
+
+ /*
+ * Vdev paths in the MOS may be obsolete. If the untrusted config was
+ * obtained by scanning /dev/dsk, then it will have the right vdev
+ * paths. We update the trusted MOS config with this information.
+ * We first try to copy the paths with vdev_copy_path_strict, which
+ * succeeds only when both configs have exactly the same vdev tree.
+ * If that fails, we fall back to a more flexible method that has a
+ * best effort policy.
+ */
+ copy_error = vdev_copy_path_strict(rvd, mrvd);
+ if (copy_error != 0 || spa_load_print_vdev_tree) {
+ spa_load_note(spa, "provided vdev tree:");
+ vdev_dbgmsg_print_tree(rvd, 2);
+ spa_load_note(spa, "MOS vdev tree:");
+ vdev_dbgmsg_print_tree(mrvd, 2);
+ }
+ if (copy_error != 0) {
+ spa_load_note(spa, "vdev_copy_path_strict failed, falling "
+ "back to vdev_copy_path_relaxed");
+ vdev_copy_path_relaxed(rvd, mrvd);
+ }
+
+ vdev_close(rvd);
+ vdev_free(rvd);
+ spa->spa_root_vdev = mrvd;
+ rvd = mrvd;
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ /*
+ * We will use spa_config if we decide to reload the spa or if spa_load
+ * fails and we rewind. We must thus regenerate the config using the
+ * MOS information with the updated paths. Rewind policy is an import
+ * setting and is not in the MOS. We copy it over to our new, trusted
+ * config.
+ */
+ mos_config_txg = fnvlist_lookup_uint64(mos_config,
+ ZPOOL_CONFIG_POOL_TXG);
+ nvlist_free(mos_config);
+ mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE);
+ if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_REWIND_POLICY,
+ &policy) == 0)
+ fnvlist_add_nvlist(mos_config, ZPOOL_REWIND_POLICY, policy);
+ spa_config_set(spa, mos_config);
+ spa->spa_config_source = SPA_CONFIG_SRC_MOS;
+
+ /*
+ * Now that we got the config from the MOS, we should be more strict
+ * in checking blkptrs and can make assumptions about the consistency
+ * of the vdev tree. spa_trust_config must be set to true before opening
+ * vdevs in order for them to be writeable.
+ */
+ spa->spa_trust_config = B_TRUE;
+
+ /*
+ * Open and validate the new vdev tree
+ */
+ error = spa_ld_open_vdevs(spa);
+ if (error != 0)
+ return (error);
+
+ error = spa_ld_validate_vdevs(spa);
+ if (error != 0)
+ return (error);
+
+ if (copy_error != 0 || spa_load_print_vdev_tree) {
+ spa_load_note(spa, "final vdev tree:");
+ vdev_dbgmsg_print_tree(rvd, 2);
+ }
+
+ if (spa->spa_load_state != SPA_LOAD_TRYIMPORT &&
+ !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) {
/*
- * Now that we've validated the config, check the state of the
- * root vdev. If it can't be opened, it indicates one or
- * more toplevel vdevs are faulted.
+ * Sanity check to make sure that we are indeed loading the
+ * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds
+ * in the config provided and they happened to be the only ones
+ * to have the latest uberblock, we could involuntarily perform
+ * an extreme rewind.
*/
- if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
- spa_load_failed(spa, "some top vdevs are unavailable");
- return (SET_ERROR(ENXIO));
+ healthy_tvds_mos = spa_healthy_core_tvds(spa);
+ if (healthy_tvds_mos - healthy_tvds >=
+ SPA_SYNC_MIN_VDEVS) {
+ spa_load_note(spa, "config provided misses too many "
+ "top-level vdevs compared to MOS (%lld vs %lld). ",
+ (u_longlong_t)healthy_tvds,
+ (u_longlong_t)healthy_tvds_mos);
+ spa_load_note(spa, "vdev tree:");
+ vdev_dbgmsg_print_tree(rvd, 2);
+ if (reloading) {
+ spa_load_failed(spa, "config was already "
+ "provided from MOS. Aborting.");
+ return (spa_vdev_err(rvd,
+ VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+ spa_load_note(spa, "spa must be reloaded using MOS "
+ "config");
+ return (SET_ERROR(EAGAIN));
}
}
+ error = spa_check_for_missing_logs(spa);
+ if (error != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
+
+ if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) {
+ spa_load_failed(spa, "uberblock guid sum doesn't match MOS "
+ "guid sum (%llu != %llu)",
+ (u_longlong_t)spa->spa_uberblock.ub_guid_sum,
+ (u_longlong_t)rvd->vdev_guid_sum);
+ return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
+ ENXIO));
+ }
+
return (0);
}
@@ -3118,47 +3269,6 @@ spa_ld_load_special_directories(spa_t *spa)
}
static int
-spa_ld_prepare_for_reload(spa_t *spa, int orig_mode)
-{
- vdev_t *rvd = spa->spa_root_vdev;
-
- uint64_t hostid;
- nvlist_t *policy = NULL;
- nvlist_t *mos_config;
-
- if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) {
- spa_load_failed(spa, "unable to retrieve MOS config");
- return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
- }
-
- if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config,
- ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
- char *hostname;
- unsigned long myhostid = 0;
-
- VERIFY(nvlist_lookup_string(mos_config,
- ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
-
- myhostid = spa_get_hostid();
- if (hostid && myhostid && hostid != myhostid) {
- nvlist_free(mos_config);
- return (SET_ERROR(EBADF));
- }
- }
- if (nvlist_lookup_nvlist(spa->spa_config,
- ZPOOL_REWIND_POLICY, &policy) == 0)
- VERIFY(nvlist_add_nvlist(mos_config,
- ZPOOL_REWIND_POLICY, policy) == 0);
-
- spa_config_set(spa, mos_config);
- spa_unload(spa);
- spa_deactivate(spa);
- spa_activate(spa, orig_mode);
-
- return (0);
-}
-
-static int
spa_ld_get_props(spa_t *spa)
{
int error = 0;
@@ -3286,6 +3396,19 @@ spa_ld_get_props(spa_t *spa)
spa->spa_autoreplace = (autoreplace != 0);
}
+ /*
+ * If we are importing a pool with missing top-level vdevs,
+ * we enforce that the pool doesn't panic or get suspended on
+ * error since the likelihood of missing data is extremely high.
+ */
+ if (spa->spa_missing_tvds > 0 &&
+ spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE &&
+ spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
+ spa_load_note(spa, "forcing failmode to 'continue' "
+ "as some top level vdevs are missing");
+ spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE;
+ }
+
return (0);
}
@@ -3428,9 +3551,15 @@ spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, char **ereport)
if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) {
boolean_t missing = spa_check_logs(spa);
if (missing) {
- *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
- spa_load_failed(spa, "spa_check_logs failed");
- return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
+ if (spa->spa_missing_tvds != 0) {
+ spa_load_note(spa, "spa_check_logs failed "
+ "so dropping the logs");
+ } else {
+ *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
+ spa_load_failed(spa, "spa_check_logs failed");
+ return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG,
+ ENXIO));
+ }
}
}
@@ -3486,7 +3615,8 @@ spa_ld_claim_log_blocks(spa_t *spa)
}
static void
-spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg)
+spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg,
+ boolean_t reloading)
{
vdev_t *rvd = spa->spa_root_vdev;
int need_update = B_FALSE;
@@ -3498,7 +3628,7 @@ spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg)
* If this is a verbatim import, trust the current
* in-core spa_config and update the disk labels.
*/
- if (config_cache_txg != spa->spa_config_txg ||
+ if (reloading || config_cache_txg != spa->spa_config_txg ||
spa->spa_load_state == SPA_LOAD_IMPORT ||
spa->spa_load_state == SPA_LOAD_RECOVER ||
(spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
@@ -3516,6 +3646,24 @@ spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg)
spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}
+/*
+ * Run spa_unload() and spa_deactivate() on the spa, then spa_activate() it
+ * again with the mode it had on entry, so that the load can be redone.
+ * spa_load_impl() calls this when spa_ld_load_trusted_config() returns
+ * EAGAIN, just before re-invoking itself with 'reloading' set to B_TRUE.
+ */
+static void
+spa_ld_prepare_for_reload(spa_t *spa)
+{
+ /* Read before the teardown so it can be handed back to spa_activate(). */
+ int mode = spa->spa_mode;
+ int async_suspended = spa->spa_async_suspended;
+
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_activate(spa, mode);
+
+ /*
+ * We save the value of spa_async_suspended as it gets reset to 0 by
+ * spa_unload(). We want to restore it back to the original value before
+ * returning as we might be calling spa_async_resume() later.
+ */
+ spa->spa_async_suspended = async_suspended;
+}
+
/*
* Load an existing storage pool, using the config provided. This config
* describes which vdevs are part of the pool and is later validated against
@@ -3523,32 +3671,35 @@ spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg)
* config stored in the MOS.
*/
static int
-spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
- spa_load_state_t state, spa_import_type_t type, boolean_t trust_config,
- char **ereport)
+spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport,
+ boolean_t reloading)
{
int error = 0;
- uint64_t config_cache_txg = spa->spa_config_txg;
- int orig_mode = spa->spa_mode;
boolean_t missing_feat_write = B_FALSE;
ASSERT(MUTEX_HELD(&spa_namespace_lock));
-
- spa->spa_load_state = state;
- spa_load_note(spa, "LOADING");
+ ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
/*
- * If this is an untrusted config, first access the pool in read-only
- * mode. We will then retrieve a trusted copy of the config from the MOS
- * and use it to reopen the pool in read-write mode.
+ * Never trust the config that is provided unless we are assembling
+ * a pool following a split.
+ * This means don't trust blkptrs and the vdev tree in general. This
+ * also effectively puts the spa in read-only mode since
+ * spa_writeable() checks for spa_trust_config to be true.
+ * We will later load a trusted config from the MOS.
*/
- if (!trust_config)
- spa->spa_mode = FREAD;
+ if (type != SPA_IMPORT_ASSEMBLE)
+ spa->spa_trust_config = B_FALSE;
+
+ if (reloading)
+ spa_load_note(spa, "RELOADING");
+ else
+ spa_load_note(spa, "LOADING");
/*
* Parse the config provided to create a vdev tree.
*/
- error = spa_ld_parse_config(spa, pool_guid, config, type);
+ error = spa_ld_parse_config(spa, type);
if (error != 0)
return (error);
@@ -3566,10 +3717,15 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
/*
* Read the label of each vdev and make sure that the GUIDs stored
* there match the GUIDs in the config provided.
+ * If we're assembling a new pool that's been split off from an
+ * existing pool, the labels haven't yet been updated so we skip
+ * validation for now.
*/
- error = spa_ld_validate_vdevs(spa, type, trust_config);
- if (error != 0)
- return (error);
+ if (type != SPA_IMPORT_ASSEMBLE) {
+ error = spa_ld_validate_vdevs(spa);
+ if (error != 0)
+ return (error);
+ }
/*
* Read vdev labels to find the best uberblock (i.e. latest, unless
@@ -3578,7 +3734,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
* label with the best uberblock and verify that our version of zfs
* supports them all.
*/
- error = spa_ld_select_uberblock(spa, config, type, trust_config);
+ error = spa_ld_select_uberblock(spa, type);
if (error != 0)
return (error);
@@ -3592,13 +3748,21 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
return (error);
/*
- * Retrieve the config stored in the MOS and use it to validate the
- * config provided. Also extract some information from the MOS config
- * to update our vdev tree.
+ * Retrieve the trusted config stored in the MOS and use it to create
+ * a new, exact version of the vdev tree, then reopen all vdevs.
*/
- error = spa_ld_validate_config(spa, type);
- if (error != 0)
+ error = spa_ld_load_trusted_config(spa, type, reloading);
+ if (error == EAGAIN) {
+ VERIFY(!reloading);
+ /*
+ * Redo the loading process with the trusted config if it is
+ * too different from the untrusted config.
+ */
+ spa_ld_prepare_for_reload(spa);
+ return (spa_load_impl(spa, type, ereport, B_TRUE));
+ } else if (error != 0) {
return (error);
+ }
/*
* Retrieve the mapping of indirect vdevs. Those vdevs were removed
@@ -3629,19 +3793,6 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
return (error);
/*
- * If the config provided is not trusted, discard it and use the config
- * from the MOS to reload the pool.
- */
- if (!trust_config) {
- error = spa_ld_prepare_for_reload(spa, orig_mode);
- if (error != 0)
- return (error);
-
- spa_load_note(spa, "RELOADING");
- return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE));
- }
-
- /*
* Retrieve pool properties from the MOS.
*/
error = spa_ld_get_props(spa);
@@ -3677,7 +3828,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
return (error);
if (missing_feat_write) {
- ASSERT(state == SPA_LOAD_TRYIMPORT);
+ ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT);
/*
* At this point, we know that we can open the pool in
@@ -3709,9 +3860,11 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
* pool. If we are importing the pool in read-write mode, a few
* additional steps must be performed to finish the import.
*/
- if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
+ if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER ||
spa->spa_load_max_txg == UINT64_MAX)) {
- ASSERT(state != SPA_LOAD_TRYIMPORT);
+ uint64_t config_cache_txg = spa->spa_config_txg;
+
+ ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT);
/*
* Traverse the ZIL and claim all blocks.
@@ -3739,7 +3892,8 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
* next sync, we would update the config stored in vdev labels
* and the cachefile (by default /etc/zfs/zpool.cache).
*/
- spa_ld_check_for_config_update(spa, config_cache_txg);
+ spa_ld_check_for_config_update(spa, config_cache_txg,
+ reloading);
/*
* Check all DTLs to see if anything needs resilvering.
@@ -3776,7 +3930,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
}
static int
-spa_load_retry(spa_t *spa, spa_load_state_t state, int trust_config)
+spa_load_retry(spa_t *spa, spa_load_state_t state)
{
int mode = spa->spa_mode;
@@ -3791,7 +3945,7 @@ spa_load_retry(spa_t *spa, spa_load_state_t state, int trust_config)
spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu",
(u_longlong_t)spa->spa_load_max_txg);
- return (spa_load(spa, state, SPA_IMPORT_EXISTING, trust_config));
+ return (spa_load(spa, state, SPA_IMPORT_EXISTING));
}
/*
@@ -3802,8 +3956,8 @@ spa_load_retry(spa_t *spa, spa_load_state_t state, int trust_config)
* spa_load().
*/
static int
-spa_load_best(spa_t *spa, spa_load_state_t state, int trust_config,
- uint64_t max_request, int rewind_flags)
+spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request,
+ int rewind_flags)
{
nvlist_t *loadinfo = NULL;
nvlist_t *config = NULL;
@@ -3820,8 +3974,7 @@ spa_load_best(spa_t *spa, spa_load_state_t state, int trust_config,
spa->spa_extreme_rewind = B_TRUE;
}
- load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
- trust_config);
+ load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING);
if (load_error == 0)
return (0);
@@ -3862,7 +4015,7 @@ spa_load_best(spa_t *spa, spa_load_state_t state, int trust_config,
spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
if (spa->spa_load_max_txg < safe_rewind_txg)
spa->spa_extreme_rewind = B_TRUE;
- rewind_error = spa_load_retry(spa, state, trust_config);
+ rewind_error = spa_load_retry(spa, state);
}
spa->spa_extreme_rewind = B_FALSE;
@@ -3944,9 +4097,10 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
if (state != SPA_LOAD_RECOVER)
spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
+ spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
zfs_dbgmsg("spa_open_common: opening %s", pool);
- error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
+ error = spa_load_best(spa, state, policy.zrp_txg,
policy.zrp_request);
if (error == EBADF) {
@@ -4863,18 +5017,16 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
if (policy.zrp_request & ZPOOL_DO_REWIND)
state = SPA_LOAD_RECOVER;
- /*
- * Pass off the heavy lifting to spa_load(). Pass TRUE for trust_config
- * because the user-supplied config is actually the one to trust when
- * doing an import.
- */
- if (state != SPA_LOAD_RECOVER)
- spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
+ spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT;
- zfs_dbgmsg("spa_import: importing %s%s", pool,
- (state == SPA_LOAD_RECOVER) ? " (RECOVERY MODE)" : "");
- error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg,
- policy.zrp_request);
+ if (state != SPA_LOAD_RECOVER) {
+ spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
+ zfs_dbgmsg("spa_import: importing %s", pool);
+ } else {
+ zfs_dbgmsg("spa_import: importing %s, max_txg=%lld "
+ "(RECOVERY MODE)", pool, (longlong_t)policy.zrp_txg);
+ }
+ error = spa_load_best(spa, state, policy.zrp_txg, policy.zrp_request);
/*
* Propagate anything learned while loading the pool and pass it
@@ -4988,10 +5140,11 @@ nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
nvlist_t *config = NULL;
- char *poolname;
+ char *poolname, *cachefile;
spa_t *spa;
uint64_t state;
int error;
+ zpool_rewind_policy_t policy;
if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
return (NULL);
@@ -5006,14 +5159,30 @@ spa_tryimport(nvlist_t *tryconfig)
spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
spa_activate(spa, FREAD);
- zfs_dbgmsg("spa_tryimport: importing %s", poolname);
-
/*
- * Pass off the heavy lifting to spa_load().
- * Pass TRUE for trust_config because the user-supplied config
- * is actually the one to trust when doing an import.
+ * Rewind pool if a max txg was provided. Note that even though we
+ * retrieve the complete rewind policy, only the rewind txg is relevant
+ * for tryimport.
*/
- error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE);
+ zpool_get_rewind_policy(spa->spa_config, &policy);
+ if (policy.zrp_txg != UINT64_MAX) {
+ spa->spa_load_max_txg = policy.zrp_txg;
+ spa->spa_extreme_rewind = B_TRUE;
+ zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld",
+ poolname, (longlong_t)policy.zrp_txg);
+ } else {
+ zfs_dbgmsg("spa_tryimport: importing %s", poolname);
+ }
+
+ if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile)
+ == 0) {
+ zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile);
+ spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
+ } else {
+ spa->spa_config_source = SPA_CONFIG_SRC_SCAN;
+ }
+
+ error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING);
/*
* If 'tryconfig' was at least parsable, return the current config.
@@ -6033,8 +6202,10 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
spa_activate(newspa, spa_mode_global);
spa_async_suspend(newspa);
+ newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT;
+
/* create the new pool from the disks of the original pool */
- error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
+ error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE);
if (error)
goto out;
@@ -7337,7 +7508,7 @@ spa_sync(spa_t *spa, uint64_t txg)
spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
if (list_is_empty(&spa->spa_config_dirty_list)) {
- vdev_t *svd[SPA_DVAS_PER_BP];
+ vdev_t *svd[SPA_SYNC_MIN_VDEVS];
int svdcount = 0;
int children = rvd->vdev_children;
int c0 = spa_get_random(children);
@@ -7348,7 +7519,7 @@ spa_sync(spa_t *spa, uint64_t txg)
!vdev_is_concrete(vd))
continue;
svd[svdcount++] = vd;
- if (svdcount == SPA_DVAS_PER_BP)
+ if (svdcount == SPA_SYNC_MIN_VDEVS)
break;
}
error = vdev_config_sync(svd, svdcount, txg);
@@ -7692,9 +7863,20 @@ module_param(spa_load_verify_data, int, 0644);
MODULE_PARM_DESC(spa_load_verify_data,
"Set to traverse data on pool import");
+module_param(spa_load_print_vdev_tree, int, 0644);
+MODULE_PARM_DESC(spa_load_print_vdev_tree,
+ "Print vdev tree to zfs_dbgmsg during pool import");
+
/* CSTYLED */
module_param(zio_taskq_batch_pct, uint, 0444);
MODULE_PARM_DESC(zio_taskq_batch_pct,
"Percentage of CPUs to run an IO worker thread");
+/* BEGIN CSTYLED */
+module_param(zfs_max_missing_tvds, ulong, 0644);
+MODULE_PARM_DESC(zfs_max_missing_tvds,
+ "Allow importing pool with up to this number of missing top-level vdevs"
+ " (in read-only mode)");
+/* END CSTYLED */
+
#endif
diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c
index 4e9fd6c57..50bba2345 100644
--- a/module/zfs/spa_config.c
+++ b/module/zfs/spa_config.c
@@ -393,7 +393,8 @@ void
spa_config_set(spa_t *spa, nvlist_t *config)
{
mutex_enter(&spa->spa_props_lock);
- nvlist_free(spa->spa_config);
+ if (spa->spa_config != NULL && spa->spa_config != config)
+ nvlist_free(spa->spa_config);
spa->spa_config = config;
mutex_exit(&spa->spa_props_lock);
}
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 6f4db76c8..e0edba155 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -384,7 +384,8 @@ spa_load_failed(spa_t *spa, const char *fmt, ...)
(void) vsnprintf(buf, sizeof (buf), fmt, adx);
va_end(adx);
- zfs_dbgmsg("spa_load(%s): FAILED: %s", spa->spa_name, buf);
+ zfs_dbgmsg("spa_load(%s, config %s): FAILED: %s", spa->spa_name,
+ spa->spa_trust_config ? "trusted" : "untrusted", buf);
}
/*PRINTFLIKE2*/
@@ -398,7 +399,8 @@ spa_load_note(spa_t *spa, const char *fmt, ...)
(void) vsnprintf(buf, sizeof (buf), fmt, adx);
va_end(adx);
- zfs_dbgmsg("spa_load(%s): %s", spa->spa_name, buf);
+ zfs_dbgmsg("spa_load(%s, config %s): %s", spa->spa_name,
+ spa->spa_trust_config ? "trusted" : "untrusted", buf);
}
/*
@@ -637,6 +639,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
spa->spa_load_max_txg = UINT64_MAX;
spa->spa_proc = &p0;
spa->spa_proc_state = SPA_PROC_NONE;
+ spa->spa_trust_config = B_TRUE;
spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms);
spa->spa_deadman_ziotime = MSEC2NSEC(zfs_deadman_ziotime_ms);
@@ -2052,7 +2055,7 @@ spa_is_root(spa_t *spa)
boolean_t
spa_writeable(spa_t *spa)
{
- return (!!(spa->spa_mode & FWRITE));
+ return (!!(spa->spa_mode & FWRITE) && spa->spa_trust_config);
}
/*
@@ -2233,6 +2236,24 @@ spa_get_hostid(void)
return (myhostid);
}
+/*
+ * Return spa->spa_trust_config. Within this change the flag is initialized
+ * to B_TRUE by spa_add(), cleared by spa_load_impl() for every import type
+ * other than SPA_IMPORT_ASSEMBLE, and set again by
+ * spa_ld_load_trusted_config() once the config from the MOS has been
+ * installed. spa_writeable() returns B_FALSE while the flag is clear.
+ */
+boolean_t
+spa_trust_config(spa_t *spa)
+{
+ return (spa->spa_trust_config);
+}
+
+/*
+ * Return spa->spa_missing_tvds_allowed.
+ * NOTE(review): presumably the number of missing top-level vdevs tolerated
+ * for this load; the code that sets the field is not visible here, so
+ * confirm against its writer.
+ */
+uint64_t
+spa_missing_tvds_allowed(spa_t *spa)
+{
+ return (spa->spa_missing_tvds_allowed);
+}
+
+/*
+ * Record 'missing' in spa->spa_missing_tvds. The load path consults the
+ * field: when it is non-zero, spa_ld_get_props() may force failmode to
+ * 'continue', and spa_ld_verify_logs() only notes a spa_check_logs()
+ * failure instead of failing the load.
+ */
+void
+spa_set_missing_tvds(spa_t *spa, uint64_t missing)
+{
+ spa->spa_missing_tvds = missing;
+}
+
#if defined(_KERNEL) && defined(HAVE_SPL)
#include <linux/mod_compat.h>
@@ -2338,6 +2359,9 @@ EXPORT_SYMBOL(spa_is_root);
EXPORT_SYMBOL(spa_writeable);
EXPORT_SYMBOL(spa_mode);
EXPORT_SYMBOL(spa_namespace_lock);
+EXPORT_SYMBOL(spa_trust_config);
+EXPORT_SYMBOL(spa_missing_tvds_allowed);
+EXPORT_SYMBOL(spa_set_missing_tvds);
/* BEGIN CSTYLED */
module_param(zfs_flags, uint, 0644);
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 3654919fc..ad53c0c89 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -74,6 +74,8 @@ unsigned int zfs_checksums_per_second = 20;
*/
int zfs_scan_ignore_errors = 0;
+int vdev_validate_skip = B_FALSE;
+
/*PRINTFLIKE2*/
void
vdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
@@ -96,6 +98,57 @@ vdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
}
}
+/*
+ * Log the vdev tree rooted at 'vd' through zfs_dbgmsg(), one line per vdev.
+ * A vdev is printed before its children, and each level of children is
+ * indented two columns further than its parent, starting at 'indent'.
+ */
+void
+vdev_dbgmsg_print_tree(vdev_t *vd, int indent)
+{
+ char state[20];
+
+ /*
+ * Hole and missing vdevs get a short line (id and type only) and are
+ * not descended into.
+ * NOTE(review): "%u" assumes vdev_id is an unsigned int; its
+ * declaration is not visible here, so confirm the type matches.
+ */
+ if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) {
+ zfs_dbgmsg("%*svdev %u: %s", indent, "", vd->vdev_id,
+ vd->vdev_ops->vdev_op_type);
+ return;
+ }
+
+ /*
+ * Translate vdev_state into a readable name; values without a case
+ * below are printed numerically.
+ */
+ switch (vd->vdev_state) {
+ case VDEV_STATE_UNKNOWN:
+ (void) snprintf(state, sizeof (state), "unknown");
+ break;
+ case VDEV_STATE_CLOSED:
+ (void) snprintf(state, sizeof (state), "closed");
+ break;
+ case VDEV_STATE_OFFLINE:
+ (void) snprintf(state, sizeof (state), "offline");
+ break;
+ case VDEV_STATE_REMOVED:
+ (void) snprintf(state, sizeof (state), "removed");
+ break;
+ case VDEV_STATE_CANT_OPEN:
+ (void) snprintf(state, sizeof (state), "can't open");
+ break;
+ case VDEV_STATE_FAULTED:
+ (void) snprintf(state, sizeof (state), "faulted");
+ break;
+ case VDEV_STATE_DEGRADED:
+ (void) snprintf(state, sizeof (state), "degraded");
+ break;
+ case VDEV_STATE_HEALTHY:
+ (void) snprintf(state, sizeof (state), "healthy");
+ break;
+ default:
+ (void) snprintf(state, sizeof (state), "<state %u>",
+ (uint_t)vd->vdev_state);
+ }
+
+ /*
+ * "%*s" with an empty string emits 'indent' spaces. Log vdevs are
+ * tagged " (log)" and a NULL path is shown as "N/A".
+ */
+ zfs_dbgmsg("%*svdev %u: %s%s, guid: %llu, path: %s, %s", indent,
+ "", vd->vdev_id, vd->vdev_ops->vdev_op_type,
+ vd->vdev_islog ? " (log)" : "",
+ (u_longlong_t)vd->vdev_guid,
+ vd->vdev_path ? vd->vdev_path : "N/A", state);
+
+ /* Recurse into every child, two columns deeper. */
+ for (uint64_t i = 0; i < vd->vdev_children; i++)
+ vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2);
+}
+
/*
* Virtual device management.
*/
@@ -1424,8 +1477,13 @@ vdev_open(vdev_t *vd)
vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
vd->vdev_removed = B_FALSE;
- vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
- vd->vdev_stat.vs_aux);
+ if (vd->vdev_stat.vs_aux == VDEV_AUX_CHILDREN_OFFLINE) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE,
+ vd->vdev_stat.vs_aux);
+ } else {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ vd->vdev_stat.vs_aux);
+ }
return (error);
}
@@ -1596,29 +1654,29 @@ vdev_open(vdev_t *vd)
/*
* Called once the vdevs are all opened, this routine validates the label
- * contents. This needs to be done before vdev_load() so that we don't
+ * contents. This needs to be done before vdev_load() so that we don't
* inadvertently do repair I/Os to the wrong device.
*
- * If 'strict' is false ignore the spa guid check. This is necessary because
- * if the machine crashed during a re-guid the new guid might have been written
- * to all of the vdev labels, but not the cached config. The strict check
- * will be performed when the pool is opened again using the mos config.
- *
* This function will only return failure if one of the vdevs indicates that it
* has since been destroyed or exported. This is only possible if
* /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state
* will be updated but the function will return 0.
*/
int
-vdev_validate(vdev_t *vd, boolean_t strict)
+vdev_validate(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
nvlist_t *label;
- uint64_t guid = 0, top_guid;
+ uint64_t guid = 0, aux_guid = 0, top_guid;
uint64_t state;
+ nvlist_t *nvl;
+ uint64_t txg;
- for (int c = 0; c < vd->vdev_children; c++)
- if (vdev_validate(vd->vdev_child[c], strict) != 0)
+ if (vdev_validate_skip)
+ return (0);
+
+ for (uint64_t c = 0; c < vd->vdev_children; c++)
+ if (vdev_validate(vd->vdev_child[c]) != 0)
return (SET_ERROR(EBADF));
/*
@@ -1626,115 +1684,276 @@ vdev_validate(vdev_t *vd, boolean_t strict)
* any further validation. Otherwise, label I/O will fail and we will
* overwrite the previous state.
*/
- if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
- uint64_t aux_guid = 0;
- nvlist_t *nvl;
- uint64_t txg = spa_last_synced_txg(spa) != 0 ?
- spa_last_synced_txg(spa) : -1ULL;
+ if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd))
+ return (0);
- if ((label = vdev_label_read_config(vd, txg)) == NULL) {
- vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_BAD_LABEL);
- vdev_dbgmsg(vd, "vdev_validate: failed reading config");
- return (0);
- }
+ /*
+ * If we are performing an extreme rewind, we allow for a label that
+ * was modified at a point after the current txg.
+ */
+ if (spa->spa_extreme_rewind || spa_last_synced_txg(spa) == 0)
+ txg = UINT64_MAX;
+ else
+ txg = spa_last_synced_txg(spa);
- /*
- * Determine if this vdev has been split off into another
- * pool. If so, then refuse to open it.
- */
- if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
- &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
- vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_SPLIT_POOL);
- nvlist_free(label);
- vdev_dbgmsg(vd, "vdev_validate: vdev split into other "
- "pool");
- return (0);
- }
+ if ((label = vdev_label_read_config(vd, txg)) == NULL) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_BAD_LABEL);
+ vdev_dbgmsg(vd, "vdev_validate: failed reading config");
+ return (0);
+ }
- if (strict && (nvlist_lookup_uint64(label,
- ZPOOL_CONFIG_POOL_GUID, &guid) != 0 ||
- guid != spa_guid(spa))) {
- vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- nvlist_free(label);
- vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid "
- "doesn't match config (%llu != %llu)",
- (u_longlong_t)guid,
- (u_longlong_t)spa_guid(spa));
- return (0);
- }
+ /*
+ * Determine if this vdev has been split off into another
+ * pool. If so, then refuse to open it.
+ */
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
+ &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_SPLIT_POOL);
+ nvlist_free(label);
+ vdev_dbgmsg(vd, "vdev_validate: vdev split into other pool");
+ return (0);
+ }
- if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
- != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
- &aux_guid) != 0)
- aux_guid = 0;
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
+ ZPOOL_CONFIG_POOL_GUID);
+ return (0);
+ }
- /*
- * If this vdev just became a top-level vdev because its
- * sibling was detached, it will have adopted the parent's
- * vdev guid -- but the label may or may not be on disk yet.
- * Fortunately, either version of the label will have the
- * same top guid, so if we're a top-level vdev, we can
- * safely compare to that instead.
- *
- * If we split this vdev off instead, then we also check the
- * original pool's guid. We don't want to consider the vdev
- * corrupt if it is partway through a split operation.
- */
- if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
- &guid) != 0 ||
- nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID,
- &top_guid) != 0 ||
- ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) &&
- (vd->vdev_guid != top_guid || vd != vd->vdev_top))) {
- vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- nvlist_free(label);
- vdev_dbgmsg(vd, "vdev_validate: config guid doesn't "
- "match label guid (%llu != %llu)",
- (u_longlong_t)vd->vdev_guid, (u_longlong_t)guid);
- return (0);
+ /*
+ * If config is not trusted then ignore the spa guid check. This is
+ * necessary because if the machine crashed during a re-guid the new
+ * guid might have been written to all of the vdev labels, but not the
+ * cached config. The check will be performed again once we have the
+ * trusted config from the MOS.
+ */
+ if (spa->spa_trust_config && guid != spa_guid(spa)) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid doesn't "
+ "match config (%llu != %llu)", (u_longlong_t)guid,
+ (u_longlong_t)spa_guid(spa));
+ return (0);
+ }
+
+ if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
+ != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
+ &aux_guid) != 0)
+ aux_guid = 0;
+
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
+ ZPOOL_CONFIG_GUID);
+ return (0);
+ }
+
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid)
+ != 0) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
+ ZPOOL_CONFIG_TOP_GUID);
+ return (0);
+ }
+
+ /*
+ * If this vdev just became a top-level vdev because its sibling was
+ * detached, it will have adopted the parent's vdev guid -- but the
+ * label may or may not be on disk yet. Fortunately, either version
+ * of the label will have the same top guid, so if we're a top-level
+ * vdev, we can safely compare to that instead.
+ * However, if the config comes from a cachefile that failed to update
+ * after the detach, a top-level vdev will appear as a non top-level
+ * vdev in the config. Also relax the constraints if we perform an
+ * extreme rewind.
+ *
+ * If we split this vdev off instead, then we also check the
+ * original pool's guid. We don't want to consider the vdev
+ * corrupt if it is partway through a split operation.
+ */
+ if (vd->vdev_guid != guid && vd->vdev_guid != aux_guid) {
+ boolean_t mismatch = B_FALSE;
+ if (spa->spa_trust_config && !spa->spa_extreme_rewind) {
+ if (vd != vd->vdev_top || vd->vdev_guid != top_guid)
+ mismatch = B_TRUE;
+ } else {
+ if (vd->vdev_guid != top_guid &&
+ vd->vdev_top->vdev_guid != guid)
+ mismatch = B_TRUE;
}
- if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
- &state) != 0) {
+ if (mismatch) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
nvlist_free(label);
- vdev_dbgmsg(vd, "vdev_validate: '%s' missing",
- ZPOOL_CONFIG_POOL_STATE);
+ vdev_dbgmsg(vd, "vdev_validate: config guid "
+ "doesn't match label guid");
+ vdev_dbgmsg(vd, "CONFIG: guid %llu, top_guid %llu",
+ (u_longlong_t)vd->vdev_guid,
+ (u_longlong_t)vd->vdev_top->vdev_guid);
+ vdev_dbgmsg(vd, "LABEL: guid %llu, top_guid %llu, "
+ "aux_guid %llu", (u_longlong_t)guid,
+ (u_longlong_t)top_guid, (u_longlong_t)aux_guid);
return (0);
}
+ }
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
+ &state) != 0) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
nvlist_free(label);
+ vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
+ ZPOOL_CONFIG_POOL_STATE);
+ return (0);
+ }
- /*
- * If this is a verbatim import, no need to check the
- * state of the pool.
- */
- if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
- spa_load_state(spa) == SPA_LOAD_OPEN &&
- state != POOL_STATE_ACTIVE) {
- vdev_dbgmsg(vd, "vdev_validate: invalid pool state "
- "(%llu) for spa %s", (u_longlong_t)state,
- spa->spa_name);
- return (SET_ERROR(EBADF));
+ nvlist_free(label);
+
+ /*
+ * If this is a verbatim import, no need to check the
+ * state of the pool.
+ */
+ if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
+ spa_load_state(spa) == SPA_LOAD_OPEN &&
+ state != POOL_STATE_ACTIVE) {
+ vdev_dbgmsg(vd, "vdev_validate: invalid pool state (%llu) "
+ "for spa %s", (u_longlong_t)state, spa->spa_name);
+ return (SET_ERROR(EBADF));
+ }
+
+ /*
+ * If we were able to open and validate a vdev that was
+ * previously marked permanently unavailable, clear that state
+ * now.
+ */
+ if (vd->vdev_not_present)
+ vd->vdev_not_present = 0;
+
+ return (0);
+}
+
+static void
+vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd)
+{
+ if (svd->vdev_path != NULL && dvd->vdev_path != NULL) {
+ if (strcmp(svd->vdev_path, dvd->vdev_path) != 0) {
+ zfs_dbgmsg("vdev_copy_path: vdev %llu: path changed "
+ "from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid,
+ dvd->vdev_path, svd->vdev_path);
+ spa_strfree(dvd->vdev_path);
+ dvd->vdev_path = spa_strdup(svd->vdev_path);
}
+ } else if (svd->vdev_path != NULL) {
+ dvd->vdev_path = spa_strdup(svd->vdev_path);
+ zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'",
+ (u_longlong_t)dvd->vdev_guid, dvd->vdev_path);
+ }
+}
- /*
- * If we were able to open and validate a vdev that was
- * previously marked permanently unavailable, clear that state
- * now.
- */
- if (vd->vdev_not_present)
- vd->vdev_not_present = 0;
+/*
+ * Recursively copy vdev paths from one vdev to another. Source and destination
+ * vdev trees must have the same geometry, otherwise an error is returned.
+ * Intended to copy paths from userland config into MOS config.
+ */
+int
+vdev_copy_path_strict(vdev_t *svd, vdev_t *dvd)
+{
+ if ((svd->vdev_ops == &vdev_missing_ops) ||
+ (svd->vdev_ishole && dvd->vdev_ishole) ||
+ (dvd->vdev_ops == &vdev_indirect_ops))
+ return (0);
+
+ if (svd->vdev_ops != dvd->vdev_ops) {
+ vdev_dbgmsg(svd, "vdev_copy_path: vdev type mismatch: %s != %s",
+ svd->vdev_ops->vdev_op_type, dvd->vdev_ops->vdev_op_type);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (svd->vdev_guid != dvd->vdev_guid) {
+ vdev_dbgmsg(svd, "vdev_copy_path: guids mismatch (%llu != "
+ "%llu)", (u_longlong_t)svd->vdev_guid,
+ (u_longlong_t)dvd->vdev_guid);
+ return (SET_ERROR(EINVAL));
}
+ if (svd->vdev_children != dvd->vdev_children) {
+ vdev_dbgmsg(svd, "vdev_copy_path: children count mismatch: "
+ "%llu != %llu", (u_longlong_t)svd->vdev_children,
+ (u_longlong_t)dvd->vdev_children);
+ return (SET_ERROR(EINVAL));
+ }
+
+ for (uint64_t i = 0; i < svd->vdev_children; i++) {
+ int error = vdev_copy_path_strict(svd->vdev_child[i],
+ dvd->vdev_child[i]);
+ if (error != 0)
+ return (error);
+ }
+
+ if (svd->vdev_ops->vdev_op_leaf)
+ vdev_copy_path_impl(svd, dvd);
+
return (0);
}
+static void
+vdev_copy_path_search(vdev_t *stvd, vdev_t *dvd)
+{
+ ASSERT(stvd->vdev_top == stvd);
+ ASSERT3U(stvd->vdev_id, ==, dvd->vdev_top->vdev_id);
+
+ for (uint64_t i = 0; i < dvd->vdev_children; i++) {
+ vdev_copy_path_search(stvd, dvd->vdev_child[i]);
+ }
+
+ if (!dvd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(dvd))
+ return;
+
+ /*
+ * The idea here is that while a vdev can shift positions within
+ * a top vdev (when replacing, attaching mirror, etc.) it cannot
+ * step outside of it.
+ */
+ vdev_t *vd = vdev_lookup_by_guid(stvd, dvd->vdev_guid);
+
+ if (vd == NULL || vd->vdev_ops != dvd->vdev_ops)
+ return;
+
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+ vdev_copy_path_impl(vd, dvd);
+}
+
+/*
+ * Recursively copy vdev paths from one root vdev to another. Source and
+ * destination vdev trees may differ in geometry. For each destination leaf
+ * vdev, search for a vdev with the same guid and top vdev id in the source.
+ * Intended to copy paths from userland config into MOS config.
+ */
+void
+vdev_copy_path_relaxed(vdev_t *srvd, vdev_t *drvd)
+{
+ uint64_t children = MIN(srvd->vdev_children, drvd->vdev_children);
+ ASSERT(srvd->vdev_ops == &vdev_root_ops);
+ ASSERT(drvd->vdev_ops == &vdev_root_ops);
+
+ for (uint64_t i = 0; i < children; i++) {
+ vdev_copy_path_search(srvd->vdev_child[i],
+ drvd->vdev_child[i]);
+ }
+}
+
/*
* Close a virtual device.
*/
@@ -1828,7 +2047,7 @@ vdev_reopen(vdev_t *vd)
!l2arc_vdev_present(vd))
l2arc_add_vdev(spa, vd);
} else {
- (void) vdev_validate(vd, B_TRUE);
+ (void) vdev_validate(vd);
}
/*
@@ -3873,6 +4092,19 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
vdev_propagate_state(vd->vdev_parent);
}
+boolean_t
+vdev_children_are_offline(vdev_t *vd)
+{
+ ASSERT(!vd->vdev_ops->vdev_op_leaf);
+
+ for (uint64_t i = 0; i < vd->vdev_children; i++) {
+ if (vd->vdev_child[i]->vdev_state != VDEV_STATE_OFFLINE)
+ return (B_FALSE);
+ }
+
+ return (B_TRUE);
+}
+
/*
* Check the vdev configuration to ensure that it's capable of supporting
* a root pool. We do not support partial configuration.
@@ -3909,34 +4141,6 @@ vdev_is_concrete(vdev_t *vd)
}
/*
- * Load the state from the original vdev tree (ovd) which
- * we've retrieved from the MOS config object. If the original
- * vdev was offline or faulted then we transfer that state to the
- * device in the current vdev tree (nvd).
- */
-void
-vdev_load_log_state(vdev_t *nvd, vdev_t *ovd)
-{
- ASSERT(nvd->vdev_top->vdev_islog);
- ASSERT(spa_config_held(nvd->vdev_spa,
- SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
- ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid);
-
- for (int c = 0; c < nvd->vdev_children; c++)
- vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]);
-
- if (nvd->vdev_ops->vdev_op_leaf) {
- /*
- * Restore the persistent vdev state
- */
- nvd->vdev_offline = ovd->vdev_offline;
- nvd->vdev_faulted = ovd->vdev_faulted;
- nvd->vdev_degraded = ovd->vdev_degraded;
- nvd->vdev_removed = ovd->vdev_removed;
- }
-}
-
-/*
* Determine if a log device has valid content. If the vdev was
* removed or faulted in the MOS config then we know that
* the content on the log device has already been written to the pool.
@@ -4051,5 +4255,9 @@ module_param(zfs_checksums_per_second, uint, 0644);
module_param(zfs_scan_ignore_errors, int, 0644);
MODULE_PARM_DESC(zfs_scan_ignore_errors,
"Ignore errors during resilver/scrub");
+
+module_param(vdev_validate_skip, int, 0644);
+MODULE_PARM_DESC(vdev_validate_skip,
+ "Bypass vdev_validate()");
/* END CSTYLED */
#endif
diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c
index ad334fe8c..85d133a5a 100644
--- a/module/zfs/vdev_label.c
+++ b/module/zfs/vdev_label.c
@@ -412,7 +412,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
vd->vdev_wholedisk);
- if (vd->vdev_not_present)
+ if (vd->vdev_not_present && !(flags & VDEV_CONFIG_MISSING))
fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1);
if (vd->vdev_isspare)
@@ -1209,6 +1209,11 @@ vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config)
"txg %llu", spa->spa_name, (u_longlong_t)ub->ub_txg);
*config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg);
+ if (*config == NULL && spa->spa_extreme_rewind) {
+ vdev_dbgmsg(cb.ubl_vd, "failed to read label config. "
+ "Trying again without txg restrictions.");
+ *config = vdev_label_read_config(cb.ubl_vd, UINT64_MAX);
+ }
if (*config == NULL) {
vdev_dbgmsg(cb.ubl_vd, "failed to read label config");
}
diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c
index 4b01f317b..1c591cd64 100644
--- a/module/zfs/vdev_mirror.c
+++ b/module/zfs/vdev_mirror.c
@@ -251,9 +251,33 @@ vdev_mirror_map_init(zio_t *zio)
if (vd == NULL) {
dva_t *dva = zio->io_bp->blk_dva;
spa_t *spa = zio->io_spa;
+ dva_t dva_copy[SPA_DVAS_PER_BP];
- mm = vdev_mirror_map_alloc(BP_GET_NDVAS(zio->io_bp), B_FALSE,
- B_TRUE);
+ c = BP_GET_NDVAS(zio->io_bp);
+
+ /*
+ * If we do not trust the pool config, some DVAs might be
+ * invalid or point to vdevs that do not exist. We skip them.
+ */
+ if (!spa_trust_config(spa)) {
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+ int j = 0;
+ for (int i = 0; i < c; i++) {
+ if (zfs_dva_valid(spa, &dva[i], zio->io_bp))
+ dva_copy[j++] = dva[i];
+ }
+ if (j == 0) {
+ zio->io_vsd = NULL;
+ zio->io_error = ENXIO;
+ return (NULL);
+ }
+ if (j < c) {
+ dva = dva_copy;
+ c = j;
+ }
+ }
+
+ mm = vdev_mirror_map_alloc(c, B_FALSE, B_TRUE);
for (c = 0; c < mm->mm_children; c++) {
mc = &mm->mm_child[c];
@@ -305,7 +329,10 @@ vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
}
if (numerrors == vd->vdev_children) {
- vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
+ if (vdev_children_are_offline(vd))
+ vd->vdev_stat.vs_aux = VDEV_AUX_CHILDREN_OFFLINE;
+ else
+ vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
return (lasterror);
}
@@ -485,6 +512,13 @@ vdev_mirror_io_start(zio_t *zio)
mm = vdev_mirror_map_init(zio);
+ if (mm == NULL) {
+ ASSERT(!spa_trust_config(zio->io_spa));
+ ASSERT(zio->io_type == ZIO_TYPE_READ);
+ zio_execute(zio);
+ return;
+ }
+
if (zio->io_type == ZIO_TYPE_READ) {
if (zio->io_bp != NULL &&
(zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) {
@@ -558,6 +592,9 @@ vdev_mirror_io_done(zio_t *zio)
int good_copies = 0;
int unexpected_errors = 0;
+ if (mm == NULL)
+ return;
+
for (c = 0; c < mm->mm_children; c++) {
mc = &mm->mm_child[c];
@@ -677,13 +714,19 @@ vdev_mirror_io_done(zio_t *zio)
static void
vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
{
- if (faulted == vd->vdev_children)
- vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_NO_REPLICAS);
- else if (degraded + faulted != 0)
+ if (faulted == vd->vdev_children) {
+ if (vdev_children_are_offline(vd)) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_OFFLINE,
+ VDEV_AUX_CHILDREN_OFFLINE);
+ } else {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_NO_REPLICAS);
+ }
+ } else if (degraded + faulted != 0) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
- else
+ } else {
vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+ }
}
vdev_ops_t vdev_mirror_ops = {
diff --git a/module/zfs/vdev_root.c b/module/zfs/vdev_root.c
index 8ac9ce187..9f86cbfa4 100644
--- a/module/zfs/vdev_root.c
+++ b/module/zfs/vdev_root.c
@@ -37,6 +37,23 @@
* Virtual device vector for the pool's root vdev.
*/
+static uint64_t
+vdev_root_core_tvds(vdev_t *vd)
+{
+ uint64_t tvds = 0;
+
+ for (uint64_t c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (!cvd->vdev_ishole && !cvd->vdev_islog &&
+ cvd->vdev_ops != &vdev_indirect_ops) {
+ tvds++;
+ }
+ }
+
+ return (tvds);
+}
+
/*
* We should be able to tolerate one failure with absolutely no damage
* to our metadata. Two failures will take out space maps, a bunch of
@@ -46,17 +63,28 @@
* probably fine. Adding bean counters during alloc/free can make this
* future guesswork more accurate.
*/
-static int
-too_many_errors(vdev_t *vd, int numerrors)
+static boolean_t
+too_many_errors(vdev_t *vd, uint64_t numerrors)
{
- ASSERT3U(numerrors, <=, vd->vdev_children);
- return (numerrors > 0);
+ uint64_t tvds;
+
+ if (numerrors == 0)
+ return (B_FALSE);
+
+ tvds = vdev_root_core_tvds(vd);
+ ASSERT3U(numerrors, <=, tvds);
+
+ if (numerrors == tvds)
+ return (B_TRUE);
+
+ return (numerrors > spa_missing_tvds_allowed(vd->vdev_spa));
}
static int
vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
uint64_t *ashift)
{
+ spa_t *spa = vd->vdev_spa;
int lasterror = 0;
int numerrors = 0;
@@ -76,6 +104,9 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
}
}
+ if (spa_load_state(spa) != SPA_LOAD_NONE)
+ spa_set_missing_tvds(spa, numerrors);
+
if (too_many_errors(vd, numerrors)) {
vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
return (lasterror);
@@ -101,7 +132,7 @@ vdev_root_state_change(vdev_t *vd, int faulted, int degraded)
if (too_many_errors(vd, faulted)) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_NO_REPLICAS);
- } else if (degraded) {
+ } else if (degraded || faulted) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
} else {
vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 6822505f1..81ae65c31 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -879,6 +879,13 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp)
}
/*
+ * Do not verify individual DVAs if the config is not trusted. This
+ * will be done once the zio is executed in vdev_mirror_map_init.
+ */
+ if (!spa->spa_trust_config)
+ return;
+
+ /*
* Pool-specific checks.
*
* Note: it would be nice to verify that the blk_birth and
@@ -928,6 +935,36 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp)
}
}
+boolean_t
+zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp)
+{
+ uint64_t vdevid = DVA_GET_VDEV(dva);
+
+ if (vdevid >= spa->spa_root_vdev->vdev_children)
+ return (B_FALSE);
+
+ vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
+ if (vd == NULL)
+ return (B_FALSE);
+
+ if (vd->vdev_ops == &vdev_hole_ops)
+ return (B_FALSE);
+
+ if (vd->vdev_ops == &vdev_missing_ops) {
+ return (B_FALSE);
+ }
+
+ uint64_t offset = DVA_GET_OFFSET(dva);
+ uint64_t asize = DVA_GET_ASIZE(dva);
+
+ if (BP_IS_GANG(bp))
+ asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+ if (offset + asize > vd->vdev_asize)
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
zio_t *
zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
abd_t *data, uint64_t size, zio_done_func_t *done, void *private,
@@ -3473,14 +3510,18 @@ zio_vdev_io_start(zio_t *zio)
}
ASSERT3P(zio->io_logical, !=, zio);
- if (zio->io_type == ZIO_TYPE_WRITE && zio->io_vd->vdev_removing) {
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ ASSERT(spa->spa_trust_config);
+
/*
* Note: the code can handle other kinds of writes,
* but we don't expect them.
*/
- ASSERT(zio->io_flags &
- (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL |
- ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE));
+ if (zio->io_vd->vdev_removing) {
+ ASSERT(zio->io_flags &
+ (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL |
+ ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE));
+ }
}
align = 1ULL << vd->vdev_top->vdev_ashift;
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
index eecac8f6e..0260eb884 100644
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@@ -364,11 +364,22 @@ tests = ['zpool_import_001_pos', 'zpool_import_002_pos',
'zpool_import_012_pos', 'zpool_import_013_neg', 'zpool_import_014_pos',
'zpool_import_015_pos',
'zpool_import_features_001_pos', 'zpool_import_features_002_neg',
- 'zpool_import_features_003_pos','zpool_import_missing_001_pos',
+ 'zpool_import_features_003_pos', 'zpool_import_missing_001_pos',
'zpool_import_missing_002_pos',
'zpool_import_rename_001_pos', 'zpool_import_all_001_pos',
'zpool_import_encrypted', 'zpool_import_encrypted_load',
- 'zpool_import_errata3']
+ 'zpool_import_errata3',
+ 'import_cache_device_added',
+ 'import_cache_device_removed',
+ 'import_cache_device_replaced',
+ 'import_cache_mirror_attached',
+ 'import_cache_mirror_detached',
+ 'import_cache_shared_device',
+ 'import_devices_missing',
+ 'import_paths_changed',
+ 'import_rewind_config_changed',
+ 'import_rewind_device_replaced']
+
tags = ['functional', 'cli_root', 'zpool_import']
[tests/functional/cli_root/zpool_labelclear]
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am
index 8aa34f33c..97a15a20d 100644
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am
@@ -2,6 +2,17 @@ pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_root/zpool_impo
dist_pkgdata_SCRIPTS = \
setup.ksh \
cleanup.ksh \
+ zpool_import.kshlib \
+ import_cache_device_added.ksh \
+ import_cache_device_removed.ksh \
+ import_cache_device_replaced.ksh \
+ import_cache_mirror_attached.ksh \
+ import_cache_mirror_detached.ksh \
+ import_cache_shared_device.ksh \
+ import_devices_missing.ksh \
+ import_paths_changed.ksh \
+ import_rewind_config_changed.ksh \
+ import_rewind_device_replaced.ksh \
zpool_import_001_pos.ksh \
zpool_import_002_pos.ksh \
zpool_import_003_pos.ksh \
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_device_added.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_device_added.ksh
new file mode 100755
index 000000000..bda6b891b
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_device_added.ksh
@@ -0,0 +1,76 @@
+#!/usr/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib
+
+#
+# DESCRIPTION:
+# A pool should be importable using an outdated cachefile that is unaware
+# that one or two top-level vdevs were added.
+#
+# STRATEGY:
+# 1. Create a pool with some devices and an alternate cachefile.
+# 2. Backup the cachefile.
+# 3. Add a device/mirror/raidz to the pool.
+# 4. Export the pool.
+# 5. Verify that we can import the pool using the backed-up cachefile.
+#
+
+verify_runnable "global"
+
+log_onexit cleanup
+
+function test_add_vdevs
+{
+ typeset poolcreate="$1"
+ typeset addvdevs="$2"
+ typeset poolcheck="$3"
+
+ log_note "$0: pool '$poolcreate', add $addvdevs."
+
+ log_must zpool create -o cachefile=$CPATH $TESTPOOL1 $poolcreate
+
+ log_must cp $CPATH $CPATHBKP
+
+ log_must zpool add -f $TESTPOOL1 $addvdevs
+
+ log_must zpool export $TESTPOOL1
+
+ log_must zpool import -c $CPATHBKP $TESTPOOL1
+ log_must check_pool_config $TESTPOOL1 "$poolcheck"
+
+ # Cleanup
+ log_must zpool destroy $TESTPOOL1
+ log_must rm -f $CPATH $CPATHBKP
+
+ log_note ""
+}
+
+test_add_vdevs "$VDEV0" "$VDEV1" "$VDEV0 $VDEV1"
+test_add_vdevs "$VDEV0 $VDEV1" "$VDEV2" "$VDEV0 $VDEV1 $VDEV2"
+test_add_vdevs "$VDEV0" "$VDEV1 $VDEV2" "$VDEV0 $VDEV1 $VDEV2"
+test_add_vdevs "$VDEV0" "mirror $VDEV1 $VDEV2" \
+ "$VDEV0 mirror $VDEV1 $VDEV2"
+test_add_vdevs "mirror $VDEV0 $VDEV1" "mirror $VDEV2 $VDEV3" \
+ "mirror $VDEV0 $VDEV1 mirror $VDEV2 $VDEV3"
+test_add_vdevs "$VDEV0" "raidz $VDEV1 $VDEV2 $VDEV3" \
+ "$VDEV0 raidz $VDEV1 $VDEV2 $VDEV3"
+test_add_vdevs "$VDEV0" "log $VDEV1" "$VDEV0 log $VDEV1"
+test_add_vdevs "$VDEV0 log $VDEV1" "$VDEV2" "$VDEV0 $VDEV2 log $VDEV1"
+test_add_vdevs "$VDEV0" "$VDEV1 log $VDEV2" "$VDEV0 $VDEV1 log $VDEV2"
+
+log_pass "zpool import -c cachefile_unaware_of_add passed."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_device_removed.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_device_removed.ksh
new file mode 100755
index 000000000..1d878b7a2
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_device_removed.ksh
@@ -0,0 +1,145 @@
+#!/usr/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib
+
+#
+# DESCRIPTION:
+# A pool should be importable using an outdated cachefile that is unaware
+# that one or more vdevs were removed.
+#
+# STRATEGY:
+# 1. Create a pool with some devices and an alternate cachefile.
+# 2. Backup the cachefile.
+# 3. Remove device(s) from the pool and delete their backing files.
+# 4. (Optionally) Add device(s) to pool.
+# 5. Export the pool.
+# 6. Verify that we can import the pool using the backed-up cachefile.
+#
+
+verify_runnable "global"
+
+function custom_cleanup
+{
+ cleanup
+}
+
+log_onexit custom_cleanup
+
+function test_remove_vdev
+{
+ typeset poolcreate="$1"
+ typeset removevdev="$2"
+ typeset poolcheck="$3"
+
+ log_note "$0: pool '$poolcreate', remove $2."
+
+ log_must zpool create -o cachefile=$CPATH $TESTPOOL1 $poolcreate
+
+ log_must cp $CPATH $CPATHBKP
+
+ log_must zpool remove $TESTPOOL1 $removevdev
+ log_must wait_for_pool_config $TESTPOOL1 "$poolcheck"
+ log_must rm $removevdev
+
+ log_must zpool export $TESTPOOL1
+
+ log_must zpool import -c $CPATHBKP $TESTPOOL1
+ log_must check_pool_config $TESTPOOL1 "$poolcheck"
+
+ # Cleanup
+ log_must zpool destroy $TESTPOOL1
+ log_must rm -f $CPATH $CPATHBKP
+ log_must mkfile $FILE_SIZE $removevdev
+
+ log_note ""
+}
+
+#
+# We have to remove top-level non-log vdevs one by one, else there is a high
+# chance pool will report busy and command will fail for the second vdev.
+#
+function test_remove_two_vdevs
+{
+ log_note "$0."
+ log_must zpool create -o cachefile=$CPATH $TESTPOOL1 \
+ $VDEV0 $VDEV1 $VDEV2 $VDEV3 $VDEV4
+
+ log_must cp $CPATH $CPATHBKP
+
+ log_must zpool remove $TESTPOOL1 $VDEV4
+ log_must wait_for_pool_config $TESTPOOL1 \
+ "$VDEV0 $VDEV1 $VDEV2 $VDEV3"
+ log_must zpool remove $TESTPOOL1 $VDEV3
+ log_must wait_for_pool_config $TESTPOOL1 "$VDEV0 $VDEV1 $VDEV2"
+ log_must rm $VDEV3 $VDEV4
+
+ log_must zpool export $TESTPOOL1
+
+ log_must zpool import -c $CPATHBKP $TESTPOOL1
+ log_must check_pool_config $TESTPOOL1 "$VDEV0 $VDEV1 $VDEV2"
+
+ # Cleanup
+ log_must zpool destroy $TESTPOOL1
+ log_must rm -f $CPATH $CPATHBKP
+ log_must mkfile $FILE_SIZE $VDEV3 $VDEV4
+
+ log_note ""
+}
+
+#
+# We want to test the case where a hole created by a log device is filled
+# by a regular device
+#
+function test_remove_log_then_add_vdev
+{
+ log_note "$0."
+ log_must zpool create -o cachefile=$CPATH $TESTPOOL1 \
+ $VDEV0 $VDEV1 $VDEV2 log $VDEV3
+
+ log_must cp $CPATH $CPATHBKP
+
+ log_must zpool remove $TESTPOOL1 $VDEV1
+ log_must wait_for_pool_config $TESTPOOL1 "$VDEV0 $VDEV2 log $VDEV3"
+ log_must zpool remove $TESTPOOL1 $VDEV3
+ log_must check_pool_config $TESTPOOL1 "$VDEV0 $VDEV2"
+ log_must rm $VDEV1 $VDEV3
+ log_must zpool add $TESTPOOL1 $VDEV4
+
+ log_must zpool export $TESTPOOL1
+
+ log_must zpool import -c $CPATHBKP $TESTPOOL1
+ log_must check_pool_config $TESTPOOL1 "$VDEV0 $VDEV2 $VDEV4"
+
+ # Cleanup
+ log_must zpool destroy $TESTPOOL1
+ log_must rm -f $CPATH $CPATHBKP
+ log_must mkfile $FILE_SIZE $VDEV1 $VDEV3
+
+ log_note ""
+}
+
+test_remove_vdev "$VDEV0 $VDEV1 $VDEV2" "$VDEV2" "$VDEV0 $VDEV1"
+test_remove_vdev "$VDEV0 $VDEV1 $VDEV2" "$VDEV1" "$VDEV0 $VDEV2"
+test_remove_vdev "$VDEV0 log $VDEV1" "$VDEV1" "$VDEV0"
+test_remove_vdev "$VDEV0 log $VDEV1 $VDEV2" "$VDEV1 $VDEV2" "$VDEV0"
+test_remove_vdev "$VDEV0 $VDEV1 $VDEV2 log $VDEV3" "$VDEV2" \
+ "$VDEV0 $VDEV1 log $VDEV3"
+test_remove_two_vdevs
+test_remove_log_then_add_vdev
+
+log_pass "zpool import -c cachefile_unaware_of_remove passed."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_device_replaced.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_device_replaced.ksh
new file mode 100755
index 000000000..f2888a5bb
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_device_replaced.ksh
@@ -0,0 +1,166 @@
+#!/usr/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib
+
+#
+# DESCRIPTION:
+# A pool should be importable using an outdated cachefile that is unaware
+# of a zpool replace operation at different stages in time.
+#
+# STRATEGY:
+# 1. Create a pool with some devices and an alternate cachefile.
+# 2. Backup the cachefile.
+# 3. Initiate device replacement, backup cachefile again and export pool.
+# Special care must be taken so that resilvering doesn't complete
+# before we exported the pool.
+# 4. Verify that we can import the pool using the first cachefile backup.
+# (Test 1. cachefile: pre-replace, pool: resilvering)
+# 5. Wait for the resilvering to finish and export the pool.
+# 6. Verify that we can import the pool using the first cachefile backup.
+# (Test 2. cachefile: pre-replace, pool: post-replace)
+# 7. Export the pool.
+# 8. Verify that we can import the pool using the second cachefile backup.
+# (Test 3. cachefile: resilvering, pool: post-replace)
+#
+# STRATEGY TO SLOW DOWN RESILVERING:
+# 1. Reduce zfs_txg_timeout, which controls how long can we resilver for
+# each sync.
+# 2. Add data to pool
+# 3. Re-import the pool so that data isn't cached
+# 4. Use zinject to slow down device I/O
+# 5. Trigger the resilvering
+# 6. Use spa freeze to stop writing to the pool.
+# 7. Clear zinject events (needed to export the pool)
+# 8. Export the pool
+#
+
+verify_runnable "global"
+
+ZFS_TXG_TIMEOUT=""
+
+function custom_cleanup
+{
+ # Revert zfs_txg_timeout to defaults, but only when the original value
+ # was actually saved: ZFS_TXG_TIMEOUT is still empty if the test exits
+ # before reading the tunable.
+ [[ -n $ZFS_TXG_TIMEOUT ]] &&
+ log_must set_zfs_txg_timeout $ZFS_TXG_TIMEOUT
+
+ # Clear zinject events that an interrupted test may have left behind.
+ zinject -c all
+ cleanup
+}
+
+log_onexit custom_cleanup
+
+#
+# Create a pool, start replacing one device while resilvering is slowed
+# down, and import the pool with outdated cachefiles at three points in time.
+#
+# $1 poolcreate     - vdev specification passed to 'zpool create'
+# $2 replacevdev    - device to be replaced
+# $3 replaceby      - device that takes its place
+# $4 poolfinalstate - pool config expected once the replacement completed
+# $5 zinjectdevices - devices that get a zinject I/O delay
+# $6 earlyremove    - run as a command (callers pass true or false): when
+#                     it succeeds $replacevdev is deleted before Test 1,
+#                     otherwise after it
+# $7 writedata      - second argument handed to write_some_data
+#
+function test_replacing_vdevs
+{
+ typeset poolcreate="$1"
+ typeset replacevdev="$2"
+ typeset replaceby="$3"
+ typeset poolfinalstate="$4"
+ typeset zinjectdevices="$5"
+ typeset earlyremove="$6"
+ typeset writedata="$7"
+
+ log_note "$0: pool '$poolcreate', replace $replacevdev by $replaceby."
+
+ log_must zpool create -o cachefile=$CPATH $TESTPOOL1 $poolcreate
+
+ # Cachefile: pool in pre-replace state
+ log_must cp $CPATH $CPATHBKP
+
+ # Steps to ensure resilvering happens very slowly.
+ log_must write_some_data $TESTPOOL1 $writedata
+ log_must zpool export $TESTPOOL1
+ # Put the saved cachefile back at $CPATH and re-import from it, keeping
+ # $CPATH as the pool's cachefile so it can be copied again below.
+ log_must cp $CPATHBKP $CPATH
+ log_must zpool import -c $CPATH -o cachefile=$CPATH $TESTPOOL1
+ typeset device
+ for device in $zinjectdevices ; do
+ log_must zinject -d $device -D 200:1 $TESTPOOL1 > /dev/null
+ done
+ log_must zpool replace $TESTPOOL1 $replacevdev $replaceby
+
+ # Cachefile: pool in resilvering state
+ log_must cp $CPATH $CPATHBKP2
+
+ # We must disable zinject in order to export the pool, so we freeze
+ # it first to prevent writing out subsequent resilvering progress.
+ log_must zpool freeze $TESTPOOL1
+ # Confirm pool is still replacing
+ log_must pool_is_replacing $TESTPOOL1
+ log_must zinject -c all > /dev/null
+ log_must zpool export $TESTPOOL1
+
+ # $earlyremove is executed as a command inside a subshell; the old
+ # device is deleted now only when that command succeeds.
+ ( $earlyremove ) && log_must rm $replacevdev
+
+ ############################################################
+ # Test 1. Cachefile: pre-replace, pool: resilvering
+ ############################################################
+ log_must cp $CPATHBKP $CPATH
+ log_must zpool import -c $CPATH $TESTPOOL1
+
+ # Wait for resilvering to finish
+ log_must wait_for_pool_config $TESTPOOL1 "$poolfinalstate"
+ log_must zpool export $TESTPOOL1
+
+ # Late removal: delete the old device here if it was not deleted above.
+ ( ! $earlyremove ) && log_must rm $replacevdev
+
+ ############################################################
+ # Test 2. Cachefile: pre-replace, pool: post-replace
+ ############################################################
+ log_must zpool import -c $CPATHBKP $TESTPOOL1
+ log_must check_pool_config $TESTPOOL1 "$poolfinalstate"
+ log_must zpool export $TESTPOOL1
+
+ ############################################################
+ # Test 3. Cachefile: resilvering, pool: post-replace
+ ############################################################
+ log_must zpool import -c $CPATHBKP2 $TESTPOOL1
+ log_must check_pool_config $TESTPOOL1 "$poolfinalstate"
+
+ # Cleanup
+ log_must zpool destroy $TESTPOOL1
+ log_must rm -f $CPATH $CPATHBKP $CPATHBKP2
+ # Recreate the device file that was deleted during the test.
+ log_must mkfile $FILE_SIZE $replacevdev
+
+ log_note ""
+}
+
+# We set zfs_txg_timeout to 1 to reduce resilvering time at each sync.
+# The previous value is kept in ZFS_TXG_TIMEOUT so that it can be put back
+# at the end of the script and by custom_cleanup.
+ZFS_TXG_TIMEOUT=$(get_zfs_txg_timeout)
+set_zfs_txg_timeout 1
+
+# Arguments, in order: pool layout, device to replace, replacement device,
+# expected final layout, devices slowed down with zinject, earlyremove,
+# writedata.
+test_replacing_vdevs "$VDEV0 $VDEV1" \
+ "$VDEV1" "$VDEV2" \
+ "$VDEV0 $VDEV2" \
+ "$VDEV0 $VDEV1" \
+ false 20
+
+test_replacing_vdevs "mirror $VDEV0 $VDEV1" \
+ "$VDEV1" "$VDEV2" \
+ "mirror $VDEV0 $VDEV2" \
+ "$VDEV0 $VDEV1" \
+ true 10
+
+test_replacing_vdevs "raidz $VDEV0 $VDEV1 $VDEV2" \
+ "$VDEV1" "$VDEV3" \
+ "raidz $VDEV0 $VDEV3 $VDEV2" \
+ "$VDEV0 $VDEV1 $VDEV2" \
+ true 20
+
+# Put the saved zfs_txg_timeout value back.
+set_zfs_txg_timeout $ZFS_TXG_TIMEOUT
+
+log_pass "zpool import -c cachefile_unaware_of_replace passed."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_mirror_attached.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_mirror_attached.ksh
new file mode 100755
index 000000000..987b745b9
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_mirror_attached.ksh
@@ -0,0 +1,72 @@
+#!/usr/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib
+
+#
+# DESCRIPTION:
+# A pool should be importable using an outdated cachefile that misses a
+# mirror that was attached.
+#
+# STRATEGY:
+# 1. Create a pool with some devices and an alternate cachefile.
+# 2. Backup the cachefile.
+# 3. Attach a mirror to one of the devices in the pool.
+# 4. Export the pool.
+# 5. Verify that we can import the pool using the backed-up cachefile.
+#
+
+verify_runnable "global"
+
+log_onexit cleanup
+
+#
+# Attach a device to a pool and import the pool with a cachefile that was
+# saved before the attach.
+#
+# $1 poolcreate - vdev specification passed to 'zpool create'
+# $2 attachto   - existing device the new device is attached to
+# $3 attachvdev - device being attached
+# $4 poolcheck  - pool config expected after the import
+#
+function test_attach_vdev
+{
+ typeset poolcreate="$1"
+ typeset attachto="$2"
+ typeset attachvdev="$3"
+ typeset poolcheck="$4"
+
+ log_note "$0: pool '$poolcreate', attach $attachvdev to $attachto."
+
+ log_must zpool create -o cachefile=$CPATH $TESTPOOL1 $poolcreate
+
+ # Back up the cachefile before the attach, so the copy does not list
+ # $attachvdev.
+ log_must cp $CPATH $CPATHBKP
+
+ log_must zpool attach $TESTPOOL1 $attachto $attachvdev
+
+ log_must zpool export $TESTPOOL1
+
+ # Import with the outdated cachefile and verify the resulting layout.
+ log_must zpool import -c $CPATHBKP $TESTPOOL1
+ log_must check_pool_config $TESTPOOL1 "$poolcheck"
+
+ # Cleanup
+ log_must zpool destroy $TESTPOOL1
+ log_must rm -f $CPATH $CPATHBKP
+
+ log_note ""
+}
+
+# Arguments, in order: pool layout, device to attach to, device to attach,
+# layout expected after importing with the outdated cachefile.
+test_attach_vdev "$VDEV0" "$VDEV0" "$VDEV4" "mirror $VDEV0 $VDEV4"
+test_attach_vdev "$VDEV0 $VDEV1" "$VDEV1" "$VDEV4" \
+ "$VDEV0 mirror $VDEV1 $VDEV4"
+test_attach_vdev "mirror $VDEV0 $VDEV1" "$VDEV0" "$VDEV4" \
+ "mirror $VDEV0 $VDEV1 $VDEV4"
+test_attach_vdev "$VDEV0 log $VDEV1" "$VDEV1" "$VDEV4" \
+ "$VDEV0 log mirror $VDEV1 $VDEV4"
+
+log_pass "zpool import -c cachefile_unaware_of_attach passed."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_mirror_detached.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_mirror_detached.ksh
new file mode 100755
index 000000000..85ec51673
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_mirror_detached.ksh
@@ -0,0 +1,70 @@
+#!/usr/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib
+
+#
+# DESCRIPTION:
+# A pool should be importable using an outdated cachefile that is unaware
+# that a mirror was detached.
+#
+# STRATEGY:
+# 1. Create a pool with some devices mirrored and an alternate cachefile.
+# 2. Backup the cachefile.
+# 3. Detach a mirror from the pool.
+# 4. Export the pool.
+# 5. Verify that we can import the pool using the backed-up cachefile.
+#
+
+verify_runnable "global"
+
+log_onexit cleanup
+
+#
+# Detach $VDEV4 from a pool and import the pool with a cachefile that was
+# saved before the detach.
+#
+# $1 poolcreate - vdev specification passed to 'zpool create'; every caller
+#                 includes $VDEV4, which is the device this function detaches
+# $2 poolcheck  - pool config expected after the import
+#
+function test_detach_vdev
+{
+ typeset poolcreate="$1"
+ typeset poolcheck="$2"
+
+ log_note "$0: pool '$poolcreate', detach $VDEV4."
+
+ log_must zpool create -o cachefile=$CPATH $TESTPOOL1 $poolcreate
+
+ # Back up the cachefile while it still lists $VDEV4.
+ log_must cp $CPATH $CPATHBKP
+
+ log_must zpool detach $TESTPOOL1 $VDEV4
+ # Delete the detached device's file before the import below.
+ log_must rm -f $VDEV4
+
+ log_must zpool export $TESTPOOL1
+
+ # Import with the outdated cachefile and verify the resulting layout.
+ log_must zpool import -c $CPATHBKP $TESTPOOL1
+ log_must check_pool_config $TESTPOOL1 "$poolcheck"
+
+ # Cleanup
+ log_must zpool destroy $TESTPOOL1
+ log_must rm -f $CPATH $CPATHBKP
+ # Recreate the device file that was deleted above.
+ log_must mkfile $FILE_SIZE $VDEV4
+
+ log_note ""
+}
+
+# Arguments, in order: pool layout (must contain $VDEV4, which gets
+# detached), layout expected after importing with the outdated cachefile.
+test_detach_vdev "mirror $VDEV0 $VDEV4" "$VDEV0"
+test_detach_vdev "mirror $VDEV0 $VDEV4 mirror $VDEV1 $VDEV2" \
+ "$VDEV0 mirror $VDEV1 $VDEV2"
+test_detach_vdev "mirror $VDEV0 $VDEV1 $VDEV4" "mirror $VDEV0 $VDEV1"
+test_detach_vdev "$VDEV0 log mirror $VDEV1 $VDEV4" "$VDEV0 log $VDEV1"
+
+log_pass "zpool import -c cachefile_unaware_of_detach passed."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_shared_device.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_shared_device.ksh
new file mode 100755
index 000000000..66225c11b
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_shared_device.ksh
@@ -0,0 +1,113 @@
+#!/usr/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib
+
+#
+# DESCRIPTION:
+# A pool should not try to write to a device that doesn't belong to it
+# anymore, even if the device is in its cachefile.
+#
+# STRATEGY:
+# 1. Create pool1 with some devices and an alternate cachefile.
+# 2. Backup the cachefile.
+# 3. Export pool1.
+# 4. Create pool2 using a device that belongs to pool1.
+# 5. Export pool2.
+# 6. Compute checksum of the shared device.
+# 7. Import pool1 and write some data to it.
+# 8. Verify that the checksum of the shared device hasn't changed.
+#
+
+verify_runnable "global"
+
+function custom_cleanup
+{
+ # Get rid of the second pool this test creates, then run cleanup
+ # (defined outside this file).
+ destroy_pool $TESTPOOL2
+ cleanup
+}
+
+log_onexit custom_cleanup
+
+#
+# Print the md5sum output for the device given as $1.
+# Calls log_fail when md5sum does not succeed.
+#
+function dev_checksum
+{
+ typeset target="$1"
+ typeset digest
+
+ log_note "Compute checksum of '$target'"
+
+ # The assignment's exit status is the one of md5sum itself.
+ if ! digest=$(md5sum $target); then
+ log_fail "Failed to compute checksum of '$target'"
+ return 1
+ fi
+
+ echo "$digest"
+ return 0
+}
+
+#
+# Verify that a pool imported from an outdated cachefile does not write to
+# a device that has since been taken over by another pool.
+#
+# $1 pool1       - vdev specification of the first pool
+# $2 pool2       - vdev specification of the second pool, which reuses one
+#                  of pool1's devices
+# $3 sharedvdev  - the device present in both specifications
+# $4 importflags - optional extra flags for importing the first pool
+#
+function test_shared_device
+{
+ typeset pool1="$1"
+ typeset pool2="$2"
+ typeset sharedvdev="$3"
+ typeset importflags="${4:-}"
+
+ log_note "$0: pool1 '$pool1', pool2 '$pool2' takes $sharedvdev."
+
+ log_must zpool create -o cachefile=$CPATH $TESTPOOL1 $pool1
+
+ # Keep a cachefile that still lists $sharedvdev as part of pool1.
+ log_must cp $CPATH $CPATHBKP
+
+ log_must zpool export $TESTPOOL1
+
+ # -f because the devices carry labels of the exported first pool.
+ log_must zpool create -f $TESTPOOL2 $pool2
+
+ log_must zpool export $TESTPOOL2
+
+ # Checksum the shared device while neither pool is imported.
+ typeset checksum1=$(dev_checksum $sharedvdev)
+
+ log_must zpool import -c $CPATHBKP $importflags $TESTPOOL1
+
+ log_must write_some_data $TESTPOOL1 2
+
+ log_must zpool destroy $TESTPOOL1
+
+ typeset checksum2=$(dev_checksum $sharedvdev)
+
+ # Quote the right-hand side so that it is compared literally instead
+ # of being interpreted as a pattern.
+ if [[ $checksum1 == "$checksum2" ]]; then
+ # log_note, not log_pos: log_pos runs its arguments as a command.
+ log_note "Device hasn't been modified by original pool"
+ else
+ log_fail "Device has been modified by original pool." \
+ "Checksum mismatch: $checksum1 != $checksum2."
+ fi
+
+ # Cleanup
+ log_must zpool import -d $DEVICE_DIR $TESTPOOL2
+ log_must zpool destroy $TESTPOOL2
+ log_must rm -f $CPATH $CPATHBKP
+
+ log_note ""
+}
+
+# Arguments, in order: first pool's layout, second pool's layout, the device
+# both layouts share, optional import flags for the first pool.
+test_shared_device "mirror $VDEV0 $VDEV1" "mirror $VDEV1 $VDEV2" "$VDEV1"
+test_shared_device "mirror $VDEV0 $VDEV1 $VDEV2" "mirror $VDEV2 $VDEV3" \
+ "$VDEV2"
+test_shared_device "raidz $VDEV0 $VDEV1 $VDEV2" "$VDEV2" "$VDEV2"
+test_shared_device "$VDEV0 log $VDEV1" "$VDEV2 log $VDEV1" "$VDEV1" "-m"
+
+log_pass "Pool doesn't write to a device it doesn't own anymore."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_devices_missing.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_devices_missing.ksh
new file mode 100755
index 000000000..74b736aef
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_devices_missing.ksh
@@ -0,0 +1,122 @@
+#!/usr/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib
+
+#
+# DESCRIPTION:
+# A pool should be importable when up to 2 top-level devices are missing.
+#
+# STRATEGY:
+# 1. Create a pool.
+# 2. Write some data to the pool and checksum it.
+# 3. Add one or more devices.
+# 4. Write more data to the pool and checksum it.
+# 5. Export the pool.
+# 6. Move added devices out of the devices directory.
+# 7. Import the pool with missing devices.
+# 8. Verify that the first batch of data is intact.
+# 9. Verify that accessing the second batch of data doesn't suspend pool.
+# 10. Export the pool, move back missing devices, Re-import the pool.
+# 11. Verify that all the data is intact.
+#
+
+verify_runnable "global"
+
+function custom_cleanup
+{
+ # Put back the values that test_devices_missing itself restores at the
+ # end of a successful run (1, 1 and 0), in case the test exited early.
+ log_must set_spa_load_verify_metadata 1
+ log_must set_spa_load_verify_data 1
+ log_must set_zfs_max_missing_tvds 0
+ log_must rm -rf $BACKUP_DEVICE_DIR
+ # Highly damaged pools may fail to be destroyed, so we export them.
+ poolexists $TESTPOOL1 && log_must zpool export $TESTPOOL1
+ cleanup
+}
+
+log_onexit custom_cleanup
+
+#
+# Import a pool while some of its top-level devices are missing, check the
+# data written before those devices were added, then bring the devices back
+# and check all the data.
+#
+# $1 poolcreate   - vdev specification passed to 'zpool create'
+# $2 addvdevs     - vdev specification passed to 'zpool add'
+# $3 missingvdevs - device files moved out of $DEVICE_DIR before the import
+# $4 missingtvds  - integer handed to set_zfs_max_missing_tvds
+#
+function test_devices_missing
+{
+ typeset poolcreate="$1"
+ typeset addvdevs="$2"
+ typeset missingvdevs="$3"
+ typeset -i missingtvds="$4"
+
+ log_note "$0: pool '$poolcreate', adding $addvdevs, then" \
+ "moving away $missingvdevs."
+
+ log_must zpool create $TESTPOOL1 $poolcreate
+
+ # First batch of data: written before the extra devices exist.
+ log_must generate_data $TESTPOOL1 $MD5FILE "first"
+
+ log_must zpool add $TESTPOOL1 $addvdevs
+
+ # Second batch of data: written after the extra devices were added.
+ log_must generate_data $TESTPOOL1 $MD5FILE2 "second"
+
+ log_must zpool export $TESTPOOL1
+
+ log_must mv $missingvdevs $BACKUP_DEVICE_DIR
+
+ # Tell zfs that it is ok to import a pool with missing top-level vdevs
+ log_must set_zfs_max_missing_tvds $missingtvds
+ # Missing devices means that data or metadata may be corrupted.
+ # Metadata verification is only switched off when more than one
+ # top-level vdev is missing; data verification always is.
+ (( missingtvds > 1 )) && log_must set_spa_load_verify_metadata 0
+ log_must set_spa_load_verify_data 0
+ log_must zpool import -o readonly=on -d $DEVICE_DIR $TESTPOOL1
+
+ log_must verify_data_md5sums $MD5FILE
+
+ log_note "Try reading second batch of data, make sure pool doesn't" \
+ "get suspended."
+ # The result is ignored on purpose: the reads may fail, they just must
+ # not suspend the pool. The second batch is recorded in $MD5FILE2.
+ verify_data_md5sums $MD5FILE2 >/dev/null 2>&1
+
+ log_must zpool export $TESTPOOL1
+
+ # Map the original device paths to their location in the backup
+ # directory and move the files back.
+ typeset newpaths=$(echo "$missingvdevs" | \
+ sed "s:$DEVICE_DIR:$BACKUP_DEVICE_DIR:g")
+ log_must mv $newpaths $DEVICE_DIR
+ log_must set_spa_load_verify_metadata 1
+ log_must set_spa_load_verify_data 1
+ log_must set_zfs_max_missing_tvds 0
+ log_must zpool import -d $DEVICE_DIR $TESTPOOL1
+
+ log_must verify_data_md5sums $MD5FILE
+ log_must verify_data_md5sums $MD5FILE2
+
+ # Cleanup
+ log_must zpool destroy $TESTPOOL1
+
+ log_note ""
+}
+
+# Directory the "missing" device files are moved to during each test.
+log_must mkdir -p $BACKUP_DEVICE_DIR
+
+# Arguments, in order: pool layout, devices to add, device files to move
+# away, value for zfs_max_missing_tvds.
+test_devices_missing "$VDEV0" "$VDEV1" "$VDEV1" 1
+test_devices_missing "$VDEV0" "$VDEV1 $VDEV2" "$VDEV1" 1
+test_devices_missing "mirror $VDEV0 $VDEV1" "mirror $VDEV2 $VDEV3" \
+ "$VDEV2 $VDEV3" 1
+test_devices_missing "$VDEV0 log $VDEV1" "$VDEV2" "$VDEV2" 1
+
+#
+# Note that we are testing for 2 non-consecutive missing devices.
+# Missing consecutive devices results in missing metadata, and missing
+# metadata can cause the root dataset to fail to mount.
+#
+test_devices_missing "$VDEV0" "$VDEV1 $VDEV2 $VDEV3" "$VDEV1 $VDEV3" 2
+
+log_pass "zpool import succeeded with missing devices."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_paths_changed.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_paths_changed.ksh
new file mode 100755
index 000000000..457eb6a14
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_paths_changed.ksh
@@ -0,0 +1,98 @@
+#!/usr/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib
+
+#
+# DESCRIPTION:
+# A pool should be importable even if device paths have changed.
+#
+# STRATEGY:
+# 1. Create a pool.
+# 2. Export the pool.
+# 3. Change the paths of some of the devices.
+# 4. Verify that we can import the pool in a healthy state.
+#
+
+verify_runnable "global"
+
+log_onexit cleanup
+
+#
+# Rename some of an exported pool's device files and verify that the pool
+# still imports in a healthy state.
+#
+# $1 poolcreate    - vdev specification passed to 'zpool create'
+# $2 pathstochange - device files that get a "_new" suffix while the pool
+#                    is exported
+#
+function test_new_paths
+{
+ typeset poolcreate="$1"
+ typeset pathstochange="$2"
+ # Keep the loop variable local to this function.
+ typeset dev
+
+ log_note "$0: pool '$poolcreate', changing paths of $pathstochange."
+
+ log_must zpool create $TESTPOOL1 $poolcreate
+
+ log_must zpool export $TESTPOOL1
+
+ for dev in $pathstochange; do
+ log_must mv $dev "${dev}_new"
+ done
+
+ log_must zpool import -d $DEVICE_DIR $TESTPOOL1
+ log_must check_pool_healthy $TESTPOOL1
+
+ # Cleanup
+ log_must zpool destroy $TESTPOOL1
+ # Give the device files their original names back.
+ for dev in $pathstochange; do
+ log_must mv "${dev}_new" $dev
+ done
+
+ log_note ""
+}
+
+#
+# Swap the paths of two device files of an exported pool and verify that
+# the pool still imports in a healthy state.
+#
+# $1 poolcreate  - vdev specification passed to 'zpool create'
+# $2 pathtoswap1 - first device file
+# $3 pathtoswap2 - second device file
+#
+function test_swap_paths
+{
+ typeset poolcreate="$1"
+ typeset pathtoswap1="$2"
+ typeset pathtoswap2="$3"
+
+ log_note "$0: pool '$poolcreate', swapping paths of $pathtoswap1" \
+ "and $pathtoswap2."
+
+ log_must zpool create $TESTPOOL1 $poolcreate
+
+ log_must zpool export $TESTPOOL1
+
+ # Three-way rename through a temporary name to exchange the two files.
+ log_must mv $pathtoswap2 "$pathtoswap2.tmp"
+ log_must mv $pathtoswap1 "$pathtoswap2"
+ log_must mv "$pathtoswap2.tmp" $pathtoswap1
+
+ log_must zpool import -d $DEVICE_DIR $TESTPOOL1
+ log_must check_pool_healthy $TESTPOOL1
+
+ # Cleanup
+ # Note: the two files are not swapped back.
+ log_must zpool destroy $TESTPOOL1
+
+ log_note ""
+}
+
+# test_new_paths arguments: pool layout, device files to rename.
+test_new_paths "$VDEV0 $VDEV1" "$VDEV0 $VDEV1"
+test_new_paths "mirror $VDEV0 $VDEV1" "$VDEV0 $VDEV1"
+test_new_paths "$VDEV0 log $VDEV1" "$VDEV1"
+test_new_paths "raidz $VDEV0 $VDEV1 $VDEV2" "$VDEV1"
+
+# test_swap_paths arguments: pool layout, the two device files to exchange.
+test_swap_paths "$VDEV0 $VDEV1" "$VDEV0" "$VDEV1"
+test_swap_paths "raidz $VDEV0 $VDEV1 $VDEV2" "$VDEV0" "$VDEV1"
+test_swap_paths "mirror $VDEV0 $VDEV1 mirror $VDEV2 $VDEV3" \
+ "$VDEV0" "$VDEV2"
+
+log_pass "zpool import succeeded after changing device paths."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_config_changed.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_config_changed.ksh
new file mode 100755
index 000000000..92d814015
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_config_changed.ksh
@@ -0,0 +1,239 @@
+#!/usr/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib
+
+#
+# DESCRIPTION:
+# It should be possible to rewind a pool beyond a configuration change.
+#
+# STRATEGY:
+# 1. Create a pool.
+# 2. Generate files and remember their md5sum.
+# 3. Note last synced txg.
+# 4. Take a snapshot to make sure old blocks are not overwritten.
+# 5. Perform zpool add/attach/detach/remove operation.
+# 6. Change device paths if requested and re-import pool.
+# 7. Overwrite the files.
+# 8. Export the pool.
+# 9. Verify that we can rewind the pool to the noted txg.
+# 10. Verify that the files are readable and retain their old data.
+#
+# DISCLAIMER:
+# This test can fail since nothing guarantees that old MOS blocks aren't
+# overwritten. Snapshots protect datasets and data files but not the MOS.
+# sync_some_data_a_few_times interleaves file data and MOS data for a few
+# txgs, thus increasing the odds that some txgs will have their MOS data
+# left untouched.
+#
+
+verify_runnable "global"
+
+function custom_cleanup
+{
+ # The script sets vdev_validate_skip to 1 around its last test case;
+ # make sure it is back to 0 even if that test case exits early.
+ set_vdev_validate_skip 0
+ cleanup
+}
+
+log_onexit custom_cleanup
+
+#
+# Shared body of the rewind tests: create a pool, record a txg, change the
+# pool configuration, overwrite the data, then rewind the pool to the
+# recorded txg and verify both its layout and the original data.
+#
+# $1 poolcreate - vdev specification passed to 'zpool create'
+# $2 addvdevs   - vdev specification for 'zpool add' (may be empty)
+# $3 attachargs - arguments for 'zpool attach' (optional)
+# $4 detachvdev - device for 'zpool detach' (optional)
+# $5 removevdev - device for 'zpool remove' (optional)
+# $6 finalpool  - pool config to wait for after the removal; required
+#                 whenever removevdev is given
+#
+# Also reads the global $pathstochange: when non-empty, the listed device
+# files are renamed with a "_new" suffix while the pool is exported.
+#
+function test_common
+{
+ typeset poolcreate="$1"
+ typeset addvdevs="$2"
+ typeset attachargs="${3:-}"
+ typeset detachvdev="${4:-}"
+ typeset removevdev="${5:-}"
+ typeset finalpool="${6:-}"
+ # Keep the loop variable local to this function.
+ typeset dev
+
+ # The rewound pool is expected to have the layout it was created with.
+ typeset poolcheck="$poolcreate"
+
+ log_must zpool create $TESTPOOL1 $poolcreate
+
+ log_must generate_data $TESTPOOL1 $MD5FILE
+
+ # syncing a few times while writing new data increases the odds that MOS
+ # metadata for some of the txgs will survive
+ log_must sync_some_data_a_few_times $TESTPOOL1
+ typeset txg
+ txg=$(get_last_txg_synced $TESTPOOL1)
+ log_must zfs snapshot -r $TESTPOOL1@snap1
+
+ #
+ # Perform config change operations
+ #
+ # The parameter is stored in addvdevs, so that is the name to test and
+ # expand; otherwise the add is never performed.
+ if [[ -n $addvdevs ]]; then
+ log_must zpool add -f $TESTPOOL1 $addvdevs
+ fi
+ if [[ -n $attachargs ]]; then
+ log_must zpool attach $TESTPOOL1 $attachargs
+ fi
+ if [[ -n $detachvdev ]]; then
+ log_must zpool detach $TESTPOOL1 $detachvdev
+ fi
+ if [[ -n $removevdev ]]; then
+ [[ -z $finalpool ]] &&
+ log_fail "Must provide final pool status!"
+ log_must zpool remove $TESTPOOL1 $removevdev
+ log_must wait_for_pool_config $TESTPOOL1 "$finalpool"
+ fi
+ if [[ -n $pathstochange ]]; then
+ #
+ # Change device paths and re-import pool to update labels
+ #
+ zpool export $TESTPOOL1
+ for dev in $pathstochange; do
+ log_must mv $dev "${dev}_new"
+ # Expect the renamed path in the rewound pool's config.
+ poolcheck=$(echo "$poolcheck" | \
+ sed "s:$dev:${dev}_new:g")
+ done
+ zpool import -d $DEVICE_DIR $TESTPOOL1
+ fi
+
+ log_must overwrite_data $TESTPOOL1 ""
+
+ log_must zpool export $TESTPOOL1
+
+ # Rewind to the txg recorded before the configuration change.
+ log_must zpool import -d $DEVICE_DIR -T $txg $TESTPOOL1
+ log_must check_pool_config $TESTPOOL1 "$poolcheck"
+
+ log_must verify_data_md5sums $MD5FILE
+
+ # Cleanup
+ log_must zpool destroy $TESTPOOL1
+ if [[ -n $pathstochange ]]; then
+ for dev in $pathstochange; do
+ log_must mv "${dev}_new" $dev
+ done
+ fi
+ # Fast way to clear vdev labels
+ log_must zpool create -f $TESTPOOL2 $VDEV0 $VDEV1 $VDEV2 $VDEV3 $VDEV4
+ log_must zpool destroy $TESTPOOL2
+
+ log_note ""
+}
+
+#
+# Rewind scenario: add the vdevs in $2 to a pool created from $1.
+#
+function test_add_vdevs
+{
+ typeset layout="$1"
+ typeset extra="$2"
+
+ log_note "$0: pool '$layout', add $extra."
+
+ # Only the first two test_common arguments are used for an add.
+ test_common "$layout" "$extra"
+}
+
+#
+# Rewind scenario: attach device $3 to device $2 in a pool created from $1.
+#
+function test_attach_vdev
+{
+ typeset layout="$1"
+ typeset target="$2"
+ typeset newdev="$3"
+ # 'zpool attach' arguments as test_common expects them in one string.
+ typeset attachspec="$target $newdev"
+
+ log_note "$0: pool '$layout', attach $newdev to $target."
+
+ test_common "$layout" "" "$attachspec"
+}
+
+#
+# Rewind scenario: detach device $2 from a pool created from $1.
+#
+function test_detach_vdev
+{
+ typeset layout="$1"
+ typeset olddev="$2"
+
+ log_note "$0: pool '$layout', detach $olddev."
+
+ # No add and no attach: those test_common arguments stay empty.
+ test_common "$layout" "" "" "$olddev"
+}
+
+#
+# Rewind scenario: attach device $3 to device $2 in a pool created from $1,
+# then detach device $4.
+#
+function test_attach_detach_vdev
+{
+ typeset layout="$1"
+ typeset target="$2"
+ typeset newdev="$3"
+ typeset olddev="$4"
+ # 'zpool attach' arguments as test_common expects them in one string.
+ typeset attachspec="$target $newdev"
+
+ log_note "$0: pool '$layout', attach $newdev to $target," \
+ "then detach $olddev."
+
+ test_common "$layout" "" "$attachspec" "$olddev"
+}
+
+#
+# Rewind scenario: remove device $2 from a pool created from $1; $3 is the
+# pool config to wait for once the removal has been issued.
+#
+function test_remove_vdev
+{
+ typeset layout="$1"
+ typeset victim="$2"
+ typeset afterremove="$3"
+
+ log_note "$0: pool '$layout', remove $victim."
+
+ # No add, attach or detach: those test_common arguments stay empty.
+ test_common "$layout" "" "" "" "$victim" "$afterremove"
+}
+
+# Record txg history
+is_linux && log_must set_tunable32 zfs_txg_history 100
+
+# Make the devices bigger to reduce chances of overwriting MOS metadata.
+increase_device_sizes $(( FILE_SIZE * 4 ))
+
+# Part of the rewind test is to see how it reacts to path changes
+# (test_common reads this variable; it is changed again further down).
+typeset pathstochange="$VDEV0 $VDEV1 $VDEV2 $VDEV3"
+
+log_note " == test rewind after device addition == "
+
+test_add_vdevs "$VDEV0" "$VDEV1"
+test_add_vdevs "$VDEV0 $VDEV1" "$VDEV2"
+test_add_vdevs "$VDEV0" "$VDEV1 $VDEV2"
+test_add_vdevs "mirror $VDEV0 $VDEV1" "mirror $VDEV2 $VDEV3"
+test_add_vdevs "$VDEV0" "raidz $VDEV1 $VDEV2 $VDEV3"
+test_add_vdevs "$VDEV0" "log $VDEV1"
+test_add_vdevs "$VDEV0 log $VDEV1" "$VDEV2"
+
+log_note " == test rewind after device attach == "
+
+test_attach_vdev "$VDEV0" "$VDEV0" "$VDEV1"
+test_attach_vdev "mirror $VDEV0 $VDEV1" "$VDEV0" "$VDEV2"
+test_attach_vdev "$VDEV0 $VDEV1" "$VDEV0" "$VDEV2"
+
+log_note " == test rewind after device removal == "
+
+# Once we remove a device it will be overlooked in the device scan, so we must
+# preserve its original path
+pathstochange="$VDEV0 $VDEV2"
+test_remove_vdev "$VDEV0 $VDEV1 $VDEV2" "$VDEV1" "$VDEV0 $VDEV2"
+
+#
+# Path change and detach are incompatible. Detach changes the guid of the vdev
+# so we have no direct way to link the new path to an existing vdev.
+#
+pathstochange=""
+
+log_note " == test rewind after device detach == "
+
+test_detach_vdev "mirror $VDEV0 $VDEV1" "$VDEV1"
+test_detach_vdev "mirror $VDEV0 $VDEV1 mirror $VDEV2 $VDEV3" "$VDEV1"
+test_detach_vdev "$VDEV0 log mirror $VDEV1 $VDEV2" "$VDEV2"
+
+log_note " == test rewind after device attach followed by device detach == "
+
+#
+# We need to disable vdev validation since once we detach VDEV1, VDEV0 will
+# inherit the mirror tvd's guid and lose its original guid.
+#
+set_vdev_validate_skip 1
+test_attach_detach_vdev "$VDEV0" "$VDEV0" "$VDEV1" "$VDEV1"
+set_vdev_validate_skip 0
+
+log_pass "zpool import rewind after configuration change passed."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_device_replaced.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_device_replaced.ksh
new file mode 100755
index 000000000..5ff1c47f3
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_device_replaced.ksh
@@ -0,0 +1,186 @@
+#!/usr/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib
+
+#
+# DESCRIPTION:
+# It should be possible to rewind a pool beyond a device replacement.
+#
+# STRATEGY:
+# 1. Create a pool.
+# 2. Generate files and remember their md5sum.
+# 3. Sync a few times and note last synced txg.
+# 4. Take a snapshot to make sure old blocks are not overwritten.
+# 5. Initiate device replacement and export the pool. Special care must
+# be taken so that resilvering doesn't complete before the export.
+# 6. Test 1: Rewind pool to noted txg and then verify data checksums.
+# Import it read-only so that we do not overwrite blocks in later txgs.
+# 7. Re-import pool at latest txg and let the replacement finish.
+# 8. Export the pool and remove the new device - we shouldn't need it.
+# 9. Test 2: Rewind pool to noted txg and then verify data checksums.
+#
+# STRATEGY TO SLOW DOWN RESILVERING:
+# 1. Reduce zfs_txg_timeout, which controls how long can we resilver for
+# each sync.
+# 2. Add data to pool
+# 3. Re-import the pool so that data isn't cached
+# 4. Use zinject to slow down device I/O
+# 5. Trigger the resilvering
+# 6. Use spa freeze to stop writing to the pool.
+# 7. Clear zinject events (needed to export the pool)
+# 8. Export the pool
+#
+# DISCLAIMER:
+# This test can fail since nothing guarantees that old MOS blocks aren't
+# overwritten. Snapshots protect datasets and data files but not the MOS.
+# sync_some_data_a_few_times interleaves file data and MOS data for a few
+# txgs, thus increasing the odds that some txgs will have their MOS data
+# left untouched.
+#
+
+verify_runnable "global"
+
+ZFS_TXG_TIMEOUT=""
+
+function custom_cleanup
+{
+ # Revert zfs_txg_timeout to defaults, but only when the original value
+ # was actually saved: ZFS_TXG_TIMEOUT is still empty if the test exits
+ # before reading the tunable.
+ [[ -n $ZFS_TXG_TIMEOUT ]] &&
+ log_must set_zfs_txg_timeout $ZFS_TXG_TIMEOUT
+ log_must rm -rf $BACKUP_DEVICE_DIR
+ # Clear zinject events that an interrupted test may have left behind.
+ zinject -c all
+ cleanup
+}
+
+log_onexit custom_cleanup
+
+#
+# Rewind a pool to a txg recorded before a device replacement, once while
+# the replacement is still resilvering and once after it has completed.
+#
+# $1 poolcreate     - vdev specification passed to 'zpool create'
+# $2 replacevdev    - device to be replaced
+# $3 replaceby      - device that takes its place
+# $4 poolfinalstate - pool config expected once the replacement completed
+# $5 zinjectdevices - devices that get a zinject I/O delay
+# $6 writedata      - second argument handed to write_some_data
+#
+function test_replace_vdev
+{
+ typeset poolcreate="$1"
+ typeset replacevdev="$2"
+ typeset replaceby="$3"
+ typeset poolfinalstate="$4"
+ typeset zinjectdevices="$5"
+ typeset writedata="$6"
+
+ log_note "$0: pool '$poolcreate', replace $replacevdev by $replaceby."
+
+ log_must zpool create $TESTPOOL1 $poolcreate
+
+ # generate data and checksum it
+ log_must generate_data $TESTPOOL1 $MD5FILE
+
+ # add more data so that resilver takes longer
+ log_must write_some_data $TESTPOOL1 $writedata
+
+ # Syncing a few times while writing new data increases the odds that
+ # MOS metadata for some of the txgs will survive.
+ log_must sync_some_data_a_few_times $TESTPOOL1
+ typeset txg
+ txg=$(get_last_txg_synced $TESTPOOL1)
+ log_must zfs snapshot -r $TESTPOOL1@snap1
+
+ # This should not free original data.
+ log_must overwrite_data $TESTPOOL1 ""
+
+ # Steps to ensure resilvering happens very slowly.
+ log_must zpool export $TESTPOOL1
+ log_must zpool import -d $DEVICE_DIR $TESTPOOL1
+ typeset device
+ for device in $zinjectdevices ; do
+ log_must zinject -d $device -D 200:1 $TESTPOOL1 > /dev/null
+ done
+ log_must zpool replace $TESTPOOL1 $replacevdev $replaceby
+
+ # We must disable zinject in order to export the pool, so we freeze
+ # it first to prevent writing out subsequent resilvering progress.
+ log_must zpool freeze $TESTPOOL1
+ # Confirm pool is still replacing
+ log_must pool_is_replacing $TESTPOOL1
+ log_must zinject -c all > /dev/null
+ log_must zpool export $TESTPOOL1
+
+ ############################################################
+ # Test 1: rewind while device is resilvering.
+ # Import read only to avoid overwriting more recent blocks.
+ ############################################################
+ log_must zpool import -d $DEVICE_DIR -o readonly=on -T $txg $TESTPOOL1
+ # At the recorded txg the pool still has the layout it was created with.
+ log_must check_pool_config $TESTPOOL1 "$poolcreate"
+
+ log_must verify_data_md5sums $MD5FILE
+
+ log_must zpool export $TESTPOOL1
+
+ # Import pool at latest txg to finish the resilvering
+ log_must zpool import -d $DEVICE_DIR $TESTPOOL1
+ log_must overwrite_data $TESTPOOL1 ""
+ log_must wait_for_pool_config $TESTPOOL1 "$poolfinalstate"
+ log_must zpool export $TESTPOOL1
+
+ # Move out the new device
+ log_must mv $replaceby $BACKUP_DEVICE_DIR/
+
+ ############################################################
+ # Test 2: rewind after device has been replaced.
+ # Import read-write since we won't need the pool anymore.
+ ############################################################
+ log_must zpool import -d $DEVICE_DIR -T $txg $TESTPOOL1
+ log_must check_pool_config $TESTPOOL1 "$poolcreate"
+
+ log_must verify_data_md5sums $MD5FILE
+
+ # Cleanup
+ log_must zpool destroy $TESTPOOL1
+ # Restore the device we moved out
+ log_must mv "$BACKUP_DEVICE_DIR/$(basename $replaceby)" $DEVICE_DIR/
+ # Fast way to clear vdev labels
+ log_must zpool create -f $TESTPOOL2 $VDEV0 $VDEV1 $VDEV2 $VDEV3 $VDEV4
+ log_must zpool destroy $TESTPOOL2
+
+ log_note ""
+}
+
+# Record txg history
+is_linux && log_must set_tunable32 zfs_txg_history 100
+
+# Directory the replacement device is moved to before the second rewind.
+log_must mkdir -p $BACKUP_DEVICE_DIR
+# Make the devices bigger to reduce chances of overwriting MOS metadata.
+increase_device_sizes $(( FILE_SIZE * 4 ))
+
+# We set zfs_txg_timeout to 1 to reduce resilvering time at each sync.
+# The previous value is kept in ZFS_TXG_TIMEOUT so that it can be put back
+# at the end of the script and by custom_cleanup.
+ZFS_TXG_TIMEOUT=$(get_zfs_txg_timeout)
+set_zfs_txg_timeout 1
+
+# Arguments, in order: pool layout, device to replace, replacement device,
+# expected final layout, devices slowed down with zinject, writedata.
+test_replace_vdev "$VDEV0 $VDEV1" \
+ "$VDEV1" "$VDEV2" \
+ "$VDEV0 $VDEV2" \
+ "$VDEV0 $VDEV1" 15
+
+test_replace_vdev "mirror $VDEV0 $VDEV1" \
+ "$VDEV1" "$VDEV2" \
+ "mirror $VDEV0 $VDEV2" \
+ "$VDEV0 $VDEV1" 10
+
+test_replace_vdev "raidz $VDEV0 $VDEV1 $VDEV2" \
+ "$VDEV1" "$VDEV3" \
+ "raidz $VDEV0 $VDEV3 $VDEV2" \
+ "$VDEV0 $VDEV1 $VDEV2" 10
+
+# Put the saved zfs_txg_timeout value back.
+set_zfs_txg_timeout $ZFS_TXG_TIMEOUT
+
+log_pass "zpool import rewind after device replacement passed."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/setup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/setup.ksh
index 142771de6..d81e66636 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/setup.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/setup.ksh
@@ -69,26 +69,14 @@ log_must zfs create $TESTPOOL/$TESTFS
log_must zfs set mountpoint=$TESTDIR $TESTPOOL/$TESTFS
DISK2="$(echo $DISKS | nawk '{print $2}')"
-if is_mpath_device $DISK2; then
- echo "y" | newfs -v $DEV_DSKDIR/$DISK2 >/dev/null 2>&1
- (( $? != 0 )) &&
- log_untested "Unable to setup a $NEWFS_DEFAULT_FS file system"
+echo "y" | newfs -v $DEV_DSKDIR/$DISK2 >/dev/null 2>&1
+(( $? != 0 )) &&
+ log_untested "Unable to setup a $NEWFS_DEFAULT_FS file system"
- [[ ! -d $DEVICE_DIR ]] && \
- log_must mkdir -p $DEVICE_DIR
+[[ ! -d $DEVICE_DIR ]] && \
+ log_must mkdir -p $DEVICE_DIR
- log_must mount $DEV_DSKDIR/$DISK2 $DEVICE_DIR
-else
- log_must set_partition 0 "" $FS_SIZE $ZFS_DISK2
- echo "y" | newfs -v $DEV_DSKDIR/$ZFSSIDE_DISK2 >/dev/null 2>&1
- (( $? != 0 )) &&
- log_untested "Unable to setup a $NEWFS_DEFAULT_FS file system"
-
- [[ ! -d $DEVICE_DIR ]] && \
- log_must mkdir -p $DEVICE_DIR
-
- log_must mount $DEV_DSKDIR/$ZFSSIDE_DISK2 $DEVICE_DIR
-fi
+log_must mount $DEV_DSKDIR/$DISK2 $DEVICE_DIR
i=0
while (( i < $MAX_NUM )); do
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg
index 648f82c2b..20f43cefa 100644
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg
@@ -25,7 +25,7 @@
#
#
-# Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+# Copyright (c) 2012, 2016 by Delphix. All rights reserved.
#
. $STF_SUITE/include/libtest.shlib
@@ -57,10 +57,8 @@ case "${#disk_array[*]}" in
if ( is_mpath_device $ZFS_DISK1 ) && [[ -z $(echo $ZFS_DISK1 | awk 'substr($1,18,1)\
~ /^[[:digit:]]+$/') ]] || ( is_real_device $ZFS_DISK1 ); then
ZFSSIDE_DISK1=${ZFS_DISK1}1
- ZFSSIDE_DISK2=${ZFS_DISK2}2
elif ( is_mpath_device $ZFS_DISK1 || is_loop_device $ZFS_DISK1 ); then
ZFSSIDE_DISK1=${ZFS_DISK1}p1
- ZFSSIDE_DISK2=${ZFS_DISK2}p2
else
log_fail "$ZFS_DISK1 not supported for partitioning."
fi
@@ -71,7 +69,6 @@ case "${#disk_array[*]}" in
ZFS_DISK1=${disk_array[0]}
ZFSSIDE_DISK1=${ZFS_DISK1}s0
ZFS_DISK2=${disk_array[0]}
- ZFSSIDE_DISK2=${ZFS_DISK2}s1
fi
;;
*)
@@ -96,14 +93,6 @@ case "${#disk_array[*]}" in
log_fail "$ZFS_DISK1 not supported for partitioning."
fi
ZFS_DISK2=${disk_array[1]}
- if ( is_mpath_device $ZFS_DISK2 ) && [[ -z $(echo $ZFS_DISK2 | awk 'substr($1,18,1)\
- ~ /^[[:digit:]]+$/') ]] || ( is_real_device $ZFS_DISK2 ); then
- ZFSSIDE_DISK2=${ZFS_DISK2}1
- elif ( is_mpath_device $ZFS_DISK2 || is_loop_device $ZFS_DISK2 ); then
- ZFSSIDE_DISK2=${ZFS_DISK2}p1
- else
- log_fail "$ZFS_DISK2 not supported for partitioning."
- fi
else
export DEV_DSKDIR="/dev"
PRIMARY_SLICE=2
@@ -111,15 +100,14 @@ case "${#disk_array[*]}" in
ZFS_DISK1=${disk_array[0]}
ZFSSIDE_DISK1=${ZFS_DISK1}s0
ZFS_DISK2=${disk_array[1]}
- ZFSSIDE_DISK2=${ZFS_DISK2}s0
fi
;;
esac
-export DISK_COUNT ZFS_DISK1 ZFSSIDE_DISK1 ZFS_DISK2 ZFSSIDE_DISK2
+export DISK_COUNT ZFS_DISK1 ZFSSIDE_DISK1 ZFS_DISK2
-export FS_SIZE="$((($MINVDEVSIZE / (1024 * 1024)) * 16))m"
-export FILE_SIZE="$(($MINVDEVSIZE / 2))"
+export FS_SIZE="$((($MINVDEVSIZE / (1024 * 1024)) * 32))m"
+export FILE_SIZE="$((MINVDEVSIZE))"
export SLICE_SIZE="$((($MINVDEVSIZE / (1024 * 1024)) * 2))m"
export MAX_NUM=5
export GROUP_NUM=3
@@ -129,6 +117,12 @@ export DEVICE_FILE=disk
export DEVICE_ARCHIVE=archive_import-test
export MYTESTFILE=$STF_SUITE/include/libtest.shlib
+export CPATH=/var/tmp/cachefile.$$
+export CPATHBKP=/var/tmp/cachefile.$$.bkp
+export CPATHBKP2=/var/tmp/cachefile.$$.bkp2
+export MD5FILE=/var/tmp/md5sums.$$
+export MD5FILE2=/var/tmp/md5sums.$$.2
+
typeset -i num=0
while (( num < $GROUP_NUM )); do
DEVICE_FILES="$DEVICE_FILES ${DEVICE_DIR}/${DEVICE_FILE}$num"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib
new file mode 100644
index 000000000..bc89d8159
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib
@@ -0,0 +1,376 @@
+#!/usr/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.cfg
+
+#
+# Default cleanup routine for the zpool_import tests.
+#
+# Destroys $TESTPOOL1, removes the cachefile and md5sum scratch files,
+# empties $DEVICE_DIR and recreates $MAX_NUM device files of $FILE_SIZE in
+# it. On Linux the zfs_txg_history tunable is finally set back to 0.
+#
+function cleanup
+{
+	typeset -i idx
+
+	destroy_pool $TESTPOOL1
+	log_must rm -f $CPATH $CPATHBKP $CPATHBKP2 $MD5FILE $MD5FILE2
+
+	# Start over with pristine backing files.
+	log_must rm -rf $DEVICE_DIR/*
+	for (( idx = 0; idx < $MAX_NUM; idx++ )); do
+		log_must mkfile $FILE_SIZE ${DEVICE_DIR}/${DEVICE_FILE}$idx
+	done
+
+	is_linux && set_tunable32 "zfs_txg_history" 0
+}
+
+#
+# Write a bit of data and sync several times.
+# This function is intended to be used by zpool rewind tests.
+#
+# $1: pool name; files are written to /$pool/tmpfile_<n>
+# $2: loop bound (default 10). The loop counts from 0 to $2 inclusive, so
+#     $2 + 1 files are written, each one followed by a sync_pool.
+#
+# Always returns 0; the result of dd is not checked.
+#
+function sync_some_data_a_few_times
+{
+	typeset pool=$1
+	typeset -i a_few_times=${2:-10}
+	# Keep the loop counter local so a caller's own "i" is not clobbered.
+	typeset -i i
+
+	typeset file="/$pool/tmpfile"
+	for i in {0..$a_few_times}; do
+		dd if=/dev/urandom of=${file}_$i bs=128k count=10
+		sync_pool "$pool"
+	done
+
+	return 0
+}
+
+#
+# Just write a moderate amount of data to the pool.
+#
+# $1: pool name
+# $2: number of 10 MB files to write (default 10, i.e. 100 MB in total)
+#
+# Creates the dataset $pool/fillerds and fills it with files of random data.
+# Returns 1 if the dataset cannot be created or if a write fails, 0 otherwise.
+#
+function write_some_data
+{
+	typeset pool=$1
+	typeset files10mb=${2:-10}
+	# Keep the loop counter local so a caller's own "i" is not clobbered.
+	typeset -i i
+
+	typeset ds="$pool/fillerds"
+	zfs create $ds || return 1
+
+	# Each file is 80 blocks of 128k, i.e. 10 MB.
+	typeset file="/$ds/fillerfile"
+	for i in {1..$files10mb}; do
+		dd if=/dev/urandom of=$file.$i bs=128k count=80 || return 1
+	done
+
+	return 0
+}
+
+#
+# Create/overwrite a few datasets with files.
+# Apply md5sum on all the files and store checksums in a file.
+#
+# pool: name of the pool; files live under /$pool/<datasetname><n>
+# newdata: "true" to create the datasets first (and sync the pool after each
+#	one), "false" to overwrite the files of already existing datasets.
+# md5file: file where to store md5sums; it is removed first. When empty, no
+#	checksums are recorded.
+# datasetname: base name for datasets
+#
+# 3 datasets of 5 files each are written, 10 blocks of 128k per file.
+# Always returns 0; the results of dd and md5sum are not checked.
+#
+function _generate_data_common
+{
+	typeset pool=$1
+	typeset newdata=$2
+	typeset md5file=$3
+	typeset datasetname=$4
+
+	typeset -i datasets=3
+	typeset -i files=5
+	typeset -i blocks=10
+	# Loop counters are local so that they cannot clobber a caller's i/j.
+	typeset -i i j
+	typeset file
+
+	[[ -n $md5file ]] && rm -f $md5file
+	for i in {1..$datasets}; do
+		( $newdata ) && log_must zfs create "$pool/$datasetname$i"
+		for j in {1..$files}; do
+			file="/$pool/$datasetname$i/file$j"
+			dd if=/dev/urandom of=$file bs=128k count=$blocks > /dev/null
+			[[ -n $md5file ]] && md5sum $file >> $md5file
+		done
+		( $newdata ) && sync_pool "$pool"
+	done
+
+	return 0
+}
+
+#
+# Create new datasets in a pool, fill them with files of random data and
+# record the md5sums of those files.
+#
+# $1: pool name
+# $2: file where to store the md5sums
+# $3: base name for the datasets (default: ds)
+#
+function generate_data
+{
+	typeset poolname=$1 sumfile="$2" dsbase=${3:-ds}
+
+	_generate_data_common $poolname true "$sumfile" $dsbase
+}
+
+#
+# Overwrite the files of already existing datasets with new random data and
+# record the md5sums of the new contents.
+#
+# $1: pool name
+# $2: file where to store the md5sums
+# $3: base name of the datasets (default: ds)
+#
+function overwrite_data
+{
+	typeset pool=$1
+	typeset md5file="$2"
+	typeset datasetname=${3:-ds}
+
+	# Use the named local instead of the raw positional parameter.
+	_generate_data_common $pool false "$md5file" $datasetname
+}
+
+#
+# Check every file listed in the md5sum file $1 against its recorded sum.
+#
+# Returns 1 (after logging a note) when $1 is not an existing regular file,
+# otherwise the exit status of "md5sum -c --quiet".
+#
+function verify_data_md5sums
+{
+	typeset md5file=$1
+
+	if [[ -f $md5file ]]; then
+		md5sum -c --quiet $md5file
+		return $?
+	fi
+
+	log_note "md5 sums file '$md5file' doesn't exist"
+	return 1
+}
+
+#
+# Recreate the $MAX_NUM device files of DEVICE_DIR with mkfile, using $1 as
+# their new size.
+#
+function increase_device_sizes
+{
+	typeset newfilesize=$1
+	typeset -i devnum
+
+	for (( devnum = 0; devnum < $MAX_NUM; devnum++ )); do
+		log_must mkfile $newfilesize ${DEVICE_DIR}/${DEVICE_FILE}$devnum
+	done
+}
+
+#
+# Translate vdev names returned by zpool status into more generic names.
+#
+# eg: mirror-2 --> mirror
+#     raidz1-0 --> raidz
+#     logs     --> log
+#
+# Any other name is printed unchanged. Always returns 0.
+#
+function _translate_vdev
+{
+	typeset vdev=$1
+	typeset word
+
+	typeset keywords="mirror replacing raidz1 raidz2 raidz3 indirect"
+	for word in $keywords; do
+		# grep -E instead of the obsolescent egrep; -q replaces the
+		# redirection of the match to /dev/null.
+		if echo $vdev | grep -qE "^${word}-[0-9]+\$"; then
+			vdev=$word
+			break
+		fi
+	done
+
+	# Names that differ from the keyword used on the zpool command line.
+	case $vdev in
+	logs)	echo "log" ;;
+	raidz1)	echo "raidz" ;;
+	*)	echo $vdev ;;
+	esac
+
+	return 0
+}
+
+#
+# Check that pool configuration returned by zpool status matches expected
+# configuration. Format for the check string is same as the vdev arguments for
+# creating a pool
+# Pass -q as first argument for quiet mode (no log_note on failure).
+#
+# eg: check_pool_config pool1 "mirror c0t0d0s0 c0t1d0s0 log c1t1d0s0"
+#
+# Returns 0 when the configuration matches, 1 when it differs or when
+# zpool status fails.
+#
+function check_pool_config
+{
+	typeset logfailure=true
+	if [[ $1 == '-q' ]]; then
+		logfailure=false
+		shift
+	fi
+
+	typeset poolname=$1
+	typeset expected=$2
+
+	typeset status
+	status=$(zpool status $poolname 2>&1)
+	if [[ $? -ne 0 ]]; then
+		if ( $logfailure ); then
+			log_note "zpool status $poolname failed: $status"
+		fi
+		return 1
+	fi
+
+	typeset actual=""
+	typeset began=false
+	typeset line vdev
+	#
+	# Collect the first column of every line between the "NAME" header and
+	# the next empty line.
+	#
+	# $status is handed to printf as an argument, never as the format:
+	# zpool status output may contain '%' (e.g. in the scan progress line)
+	# or backslashes, which would otherwise be interpreted.
+	#
+	# NOTE(review): "actual" is read after the loop, which relies on the
+	# shell running the last stage of a pipeline in the current process
+	# (as ksh does).
+	#
+	printf '%s\n' "$status" | while read -r line; do
+		vdev=$(echo "$line" | awk '{print $1}')
+		if ( ! $began ) && [[ $vdev == NAME ]]; then
+			began=true
+			continue
+		fi
+		( $began ) && [[ -z $vdev ]] && break;
+
+		if ( $began ); then
+			# The first entry is the pool name itself: keep it as is.
+			[[ -z $actual ]] && actual="$vdev" && continue
+			vdev=$(_translate_vdev $vdev)
+			actual="$actual $vdev"
+		fi
+	done
+
+	expected="$poolname $expected"
+
+	if [[ "$actual" != "$expected" ]]; then
+		if ( $logfailure ); then
+			log_note "expected pool vdevs:"
+			log_note "> '$expected'"
+			log_note "actual pool vdevs:"
+			log_note "> '$actual'"
+		fi
+		return 1
+	fi
+
+	return 0
+}
+
+#
+# Poll check_pool_config() until the pool configuration reported by zpool
+# status matches the expected one, or until a timeout in seconds (default 60)
+# has elapsed.
+#
+# eg: wait_for_pool_config pool1 "mirror c0t0d0s0 c0t1d0s0" 60
+#
+# Returns 0 as soon as the configuration matches; once the timeout expired,
+# returns the result of one last, non-quiet, check_pool_config call.
+#
+function wait_for_pool_config
+{
+	typeset poolname=$1
+	typeset expectedconfig="$2"
+	typeset -i deadline=$(( ${3:-60} + $(date +%s) ))
+
+	# Quiet checks every 3 seconds while the deadline is not reached.
+	until (( $(date +%s) >= deadline )); do
+		check_pool_config -q $poolname "$expectedconfig" && return 0
+		sleep 3
+	done
+
+	# Out of time: the final check logs the mismatch, if any.
+	check_pool_config $poolname "$expectedconfig"
+}
+
+#
+# Succeed when the state reported by zpool status for the pool is ONLINE.
+#
+# The state is the second field of the zpool status line(s) that contain the
+# pool name, the "pool:" line excepted. Returns 1, after logging a note, when
+# zpool status fails or when that state is not ONLINE.
+#
+function check_pool_healthy
+{
+	typeset pool=$1
+	typeset status
+
+	if ! status=$(zpool status $pool 2>&1); then
+		log_note "zpool status $pool failed: $status"
+		return 1
+	fi
+
+	status=$(echo "$status" | grep "$pool" | grep -v "pool:" | \
+	    awk '{print $2}')
+	[[ $status == "ONLINE" ]] && return 0
+
+	log_note "Invalid zpool status for '$pool': '$status'" \
+	    "!= 'ONLINE'"
+	return 1
+}
+
+#
+# Tell whether a device is currently being replaced in the pool: succeeds
+# (returns 0) when zpool status prints a line containing both "replacing"
+# and "ONLINE".
+#
+function pool_is_replacing
+{
+	typeset pool=$1
+
+	# The status of the last grep is the status of the function.
+	zpool status $pool | grep "replacing" | grep "ONLINE" > /dev/null
+}
+
+#
+# Set the "vdev_validate_skip" tunable to $1 (through set_tunable32).
+#
+function set_vdev_validate_skip
+{
+ set_tunable32 "vdev_validate_skip" "$1"
+}
+
+#
+# Query the "zfs_txg_timeout" tunable through get_tunable.
+# NOTE(review): presumably prints the current value on stdout, as callers
+# capture it with $(...) - confirm against get_tunable.
+#
+function get_zfs_txg_timeout
+{
+ get_tunable "zfs_txg_timeout"
+}
+
+#
+# Set the "zfs_txg_timeout" tunable to $1 (through set_tunable32).
+#
+function set_zfs_txg_timeout
+{
+ set_tunable32 "zfs_txg_timeout" "$1"
+}
+
+#
+# Set the "spa_load_verify_metadata" tunable to $1 (through set_tunable32).
+#
+function set_spa_load_verify_metadata
+{
+ set_tunable32 "spa_load_verify_metadata" "$1"
+}
+
+#
+# Set the "spa_load_verify_data" tunable to $1 (through set_tunable32).
+#
+function set_spa_load_verify_data
+{
+ set_tunable32 "spa_load_verify_data" "$1"
+}
+
+#
+# Set the "zfs_max_missing_tvds" tunable to $1 (through set_tunable32).
+#
+function set_zfs_max_missing_tvds
+{
+ set_tunable32 "zfs_max_missing_tvds" "$1"
+}
+
+#
+# Find the last txg that was synced in an active pool and print it.
+#
+# On Linux the txg is read from the pool's txgs kstat
+# (/proc/spl/kstat/zfs/<pool>/txgs): the first column of the last entry,
+# among the lines returned by tail, whose third column is "C" (presumably
+# the committed state - confirm against the kstat format). 0 is printed
+# when there is no such entry.
+# Elsewhere mdb is used to read spa_ubsync.ub_txg from the pool's spa_t.
+#
+# Returns 1 when mdb fails or when the pool cannot be found.
+#
+function get_last_txg_synced
+{
+	typeset pool=$1
+
+	if is_linux; then
+		# Declared local: without typeset the assignment would leak
+		# into, and possibly overwrite, a "txg" of the calling test.
+		typeset txg
+		txg=$(tail "/proc/spl/kstat/zfs/$pool/txgs" |
+		    awk '$3=="C" {print $1}' | tail -1)
+		[[ "$txg" ]] || txg=0
+		echo $txg
+		return 0
+	fi
+
+	typeset spas
+	spas=$(mdb -k -e "::spa")
+	[[ $? -ne 0 ]] && return 1
+
+	# Look for the address of the spa_t whose name (3rd column of ::spa)
+	# is the requested pool.
+	typeset spa=""
+	typeset line
+	print "$spas\n" | while read line; do
+		typeset poolname=$(echo "$line" | awk '{print $3}')
+		typeset addr=$(echo "$line" | awk '{print $1}')
+		if [[ $poolname == $pool ]]; then
+			spa=$addr
+			break
+		fi
+	done
+	if [[ -z $spa ]]; then
+		log_fail "Couldn't find pool '$pool'"
+		return 1
+	fi
+	typeset mdbcmd="$spa::print spa_t spa_ubsync.ub_txg | ::eval '.=E'"
+	typeset -i txg
+	txg=$(mdb -k -e "$mdbcmd")
+	[[ $? -ne 0 ]] && return 1
+
+	echo $txg
+	return 0
+}