aboutsummaryrefslogtreecommitdiffstats
path: root/lib
diff options
context:
space:
mode:
authorPavel Zakharov <[email protected]>2016-07-22 10:39:36 -0400
committerBrian Behlendorf <[email protected]>2018-05-08 21:35:27 -0700
commit6cb8e5306d9696d155ae7a808f56c4e46d69b64c (patch)
treec5c1f331a6341281fd3e9b7310e1618eab2c005c /lib
parentafd2f7b7117ff8bf23afa70ecae86ec0c1a1461e (diff)
OpenZFS 9075 - Improve ZFS pool import/load process and corrupted pool recovery
Some work has been done lately to improve the debugability of the ZFS pool load (and import) process. This includes: 7638 Refactor spa_load_impl into several functions 8961 SPA load/import should tell us why it failed 7277 zdb should be able to print zfs_dbgmsg's To iterate on top of that, there's a few changes that were made to make the import process more resilient and crash free. One of the first tasks during the pool load process is to parse a config provided from userland that describes what devices the pool is composed of. A vdev tree is generated from that config, and then all the vdevs are opened. The Meta Object Set (MOS) of the pool is accessed, and several metadata objects that are necessary to load the pool are read. The exact configuration of the pool is also stored inside the MOS. Since the configuration provided from userland is external and might not accurately describe the vdev tree of the pool at the txg that is being loaded, it cannot be relied upon to safely operate the pool. For that reason, the configuration in the MOS is read early on. In the past, the two configurations were compared together and if there was a mismatch then the load process was aborted and an error was returned. The latter was a good way to ensure a pool does not get corrupted, however it made the pool load process needlessly fragile in cases where the vdev configuration changed or the userland configuration was outdated. Since the MOS is stored in 3 copies, the configuration provided by userland doesn't have to be perfect in order to read its contents. Hence, a new approach has been adopted: The pool is first opened with the untrusted userland configuration just so that the real configuration can be read from the MOS. The trusted MOS configuration is then used to generate a new vdev tree and the pool is re-opened. When the pool is opened with an untrusted configuration, writes are disabled to avoid accidentally damaging it. During reads, some sanity checks are performed on block pointers to see if each DVA points to a known vdev; when the configuration is untrusted, instead of panicking the system if those checks fail we simply avoid issuing reads to the invalid DVAs. This new two-step pool load process now allows rewinding pools accross vdev tree changes such as device replacement, addition, etc. Loading a pool from an external config file in a clustering environment also becomes much safer now since the pool will import even if the config is outdated and didn't, for instance, register a recent device addition. With this code in place, it became relatively easy to implement a long-sought-after feature: the ability to import a pool with missing top level (i.e. non-redundant) devices. Note that since this almost guarantees some loss of data, this feature is for now restricted to a read-only import. Porting notes (ZTS): * Fix 'make dist' target in zpool_import * The maximum path length allowed by tar is 99 characters. Several of the new test cases exceeded this limit resulting in them not being included in the tarball. Shorten the names slightly. * Set/get tunables using accessor functions. * Get last synced txg via the "zfs_txg_history" mechanism. * Clear zinject handlers in cleanup for import_cache_device_replaced and import_rewind_device_replaced in order that the zpool can be exported if there is an error. * Increase FILESIZE to 8G in zfs-test.sh to allow for a larger ext4 file system to be created on ZFS_DISK2. Also, there's no need to partition ZFS_DISK2 at all. The partitioning had already been disabled for multipath devices. Among other things, the partitioning steals some space from the ext4 file system, makes it difficult to accurately calculate the paramters to parted and can make some of the tests fail. * Increase FS_SIZE and FILE_SIZE in the zpool_import test configuration now that FILESIZE is larger. * Write more data in order that device evacuation take lonnger in a couple tests. * Use mkdir -p to avoid errors when the directory already exists. * Remove use of sudo in import_rewind_config_changed. Authored by: Pavel Zakharov <[email protected]> Reviewed by: George Wilson <[email protected]> Reviewed by: Matthew Ahrens <[email protected]> Reviewed by: Andrew Stormont <[email protected]> Approved by: Hans Rosenfeld <[email protected]> Ported-by: Tim Chase <[email protected]> Signed-off-by: Tim Chase <[email protected]> OpenZFS-issue: https://illumos.org/issues/9075 OpenZFS-commit: https://github.com/openzfs/openzfs/commit/619c0123 Closes #7459
Diffstat (limited to 'lib')
-rw-r--r--lib/libzfs/libzfs_import.c19
-rw-r--r--lib/libzfs/libzfs_pool.c5
-rw-r--r--lib/libzpool/kernel.c10
3 files changed, 30 insertions, 4 deletions
diff --git a/lib/libzfs/libzfs_import.c b/lib/libzfs/libzfs_import.c
index cc9a52a3e..68b5988cd 100644
--- a/lib/libzfs/libzfs_import.c
+++ b/lib/libzfs/libzfs_import.c
@@ -897,7 +897,8 @@ vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id)
* return to the user.
*/
static nvlist_t *
-get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok)
+get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok,
+ nvlist_t *policy)
{
pool_entry_t *pe;
vdev_entry_t *ve;
@@ -1230,6 +1231,12 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok)
continue;
}
+ if (policy != NULL) {
+ if (nvlist_add_nvlist(config, ZPOOL_REWIND_POLICY,
+ policy) != 0)
+ goto nomem;
+ }
+
if ((nvl = refresh_config(hdl, config)) == NULL) {
nvlist_free(config);
config = NULL;
@@ -2080,7 +2087,7 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg)
free(cache);
pthread_mutex_destroy(&lock);
- ret = get_configs(hdl, &pools, iarg->can_be_active);
+ ret = get_configs(hdl, &pools, iarg->can_be_active, iarg->policy);
for (pe = pools.pools; pe != NULL; pe = penext) {
penext = pe->pe_next;
@@ -2209,6 +2216,14 @@ zpool_find_import_cached(libzfs_handle_t *hdl, const char *cachefile,
if (active)
continue;
+ if (nvlist_add_string(src, ZPOOL_CONFIG_CACHEFILE,
+ cachefile) != 0) {
+ (void) no_memory(hdl);
+ nvlist_free(raw);
+ nvlist_free(pools);
+ return (NULL);
+ }
+
if ((dst = refresh_config(hdl, src)) == NULL) {
nvlist_free(raw);
nvlist_free(pools);
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index 2d94cd320..d082a5f66 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -1935,8 +1935,9 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
nvlist_lookup_nvlist(nvinfo,
ZPOOL_CONFIG_MISSING_DEVICES, &missing) == 0) {
(void) printf(dgettext(TEXT_DOMAIN,
- "The devices below are missing, use "
- "'-m' to import the pool anyway:\n"));
+ "The devices below are missing or "
+ "corrupted, use '-m' to import the pool "
+ "anyway:\n"));
print_vdev_tree(hdl, NULL, missing, 2);
(void) printf("\n");
}
diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c
index 09e69ef6d..f3e84975c 100644
--- a/lib/libzpool/kernel.c
+++ b/lib/libzpool/kernel.c
@@ -297,6 +297,16 @@ rw_tryenter(krwlock_t *rwlp, krw_t rw)
return (0);
}
+/* ARGSUSED */
+uint32_t
+zone_get_hostid(void *zonep)
+{
+ /*
+ * We're emulating the system's hostid in userland.
+ */
+ return (strtoul(hw_serial, NULL, 10));
+}
+
int
rw_tryupgrade(krwlock_t *rwlp)
{