diff options
author | Brian Behlendorf <[email protected]> | 2009-08-18 11:43:27 -0700 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2009-08-18 11:43:27 -0700 |
commit | 45d1cae3b8c949ecc391dd7a5b81963b34c71c29 (patch) | |
tree | 69b1f860eb1f9b1ebdef392760814c5cc089f345 | |
parent | 9babb37438b58e77bad04e820d5702e15b79e6a6 (diff) |
Rebase master to b121
64 files changed, 3005 insertions, 811 deletions
diff --git a/ZFS.RELEASE b/ZFS.RELEASE index 8b7cfb53c..0960bf493 100644 --- a/ZFS.RELEASE +++ b/ZFS.RELEASE @@ -1 +1 @@ -http://dlc.sun.com/osol/on/downloads/b117/on-src.tar.bz2 +http://dlc.sun.com/osol/on/downloads/b121/on-src.tar.bz2 diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index a310fc38b..292bb519a 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -818,6 +818,8 @@ dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size) (u_longlong_t)ds->ds_snapnames_zapobj); (void) printf("\t\tnum_children = %llu\n", (u_longlong_t)ds->ds_num_children); + (void) printf("\t\tuserrefs_obj = %llu\n", + (u_longlong_t)ds->ds_userrefs_obj); (void) printf("\t\tcreation_time = %s", ctime(&crtime)); (void) printf("\t\tcreation_txg = %llu\n", (u_longlong_t)ds->ds_creation_txg); @@ -1049,6 +1051,7 @@ static object_viewer_t *object_viewer[DMU_OT_NUMTYPES] = { dump_zap, /* DSL scrub queue */ dump_zap, /* ZFS user/group used */ dump_zap, /* ZFS user/group quota */ + dump_zap, /* snapshot refcount tags */ }; static void diff --git a/cmd/zfs/zfs_iter.c b/cmd/zfs/zfs_iter.c index ca5c2b232..04dd2bdeb 100644 --- a/cmd/zfs/zfs_iter.c +++ b/cmd/zfs/zfs_iter.c @@ -362,7 +362,7 @@ zfs_for_each(int argc, char **argv, int flags, zfs_type_t types, cb.cb_types = types; cb.cb_depth_limit = limit; /* - * If cb_proplist is provided then in the zfs_handles created we + * If cb_proplist is provided then in the zfs_handles created we * retain only those properties listed in cb_proplist and sortcol. * The rest are pruned. So, the caller should make sure that no other * properties other than those listed in cb_proplist/sortcol are diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 0752a4772..1fbd8bc65 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -80,6 +80,8 @@ static int zfs_do_receive(int argc, char **argv); static int zfs_do_promote(int argc, char **argv); static int zfs_do_userspace(int argc, char **argv); static int zfs_do_python(int argc, char **argv); +static int zfs_do_hold(int argc, char **argv); +static int zfs_do_release(int argc, char **argv); /* * Enable a reasonable set of defaults for libumem debugging on DEBUG builds. @@ -121,7 +123,10 @@ typedef enum { HELP_ALLOW, HELP_UNALLOW, HELP_USERSPACE, - HELP_GROUPSPACE + HELP_GROUPSPACE, + HELP_HOLD, + HELP_HOLDS, + HELP_RELEASE } zfs_help_t; typedef struct zfs_command { @@ -169,6 +174,10 @@ static zfs_command_t command_table[] = { { "allow", zfs_do_python, HELP_ALLOW }, { NULL }, { "unallow", zfs_do_python, HELP_UNALLOW }, + { NULL }, + { "hold", zfs_do_hold, HELP_HOLD }, + { "holds", zfs_do_python, HELP_HOLDS }, + { "release", zfs_do_release, HELP_RELEASE }, }; #define NCOMMAND (sizeof (command_table) / sizeof (command_table[0])) @@ -189,7 +198,8 @@ get_usage(zfs_help_t idx) "-V <size> <volume>\n")); case HELP_DESTROY: return (gettext("\tdestroy [-rRf] " - "<filesystem|volume|snapshot>\n")); + "<filesystem|volume|snapshot>\n" + "\tdestroy -d [-r] <filesystem|volume|snapshot>\n")); case HELP_GET: return (gettext("\tget [-rHp] [-d max] " "[-o field[,...]] [-s source[,...]]\n" @@ -236,7 +246,7 @@ get_usage(zfs_help_t idx) return (gettext("\tunmount [-f] " "<-a | filesystem|mountpoint>\n")); case HELP_UNSHARE: - return (gettext("\tunshare [-f] " + return (gettext("\tunshare " "<-a | filesystem|mountpoint>\n")); case HELP_ALLOW: return (gettext("\tallow <filesystem|volume>\n" @@ -266,6 +276,12 @@ get_usage(zfs_help_t idx) return (gettext("\tgroupspace [-hniHpU] [-o field[,...]] " "[-sS field] ... [-t type[,...]]\n" "\t <filesystem|snapshot>\n")); + case HELP_HOLD: + return (gettext("\thold [-r] <tag> <snapshot> ...\n")); + case HELP_HOLDS: + return (gettext("\tholds [-r] <snapshot> ...\n")); + case HELP_RELEASE: + return (gettext("\trelease [-r] <tag> <snapshot> ...\n")); } abort(); @@ -769,11 +785,13 @@ badusage: } /* - * zfs destroy [-rf] <fs, snap, vol> + * zfs destroy [-rRf] <fs, snap, vol> + * zfs destroy -d [-r] <fs, snap, vol> * * -r Recursively destroy all children * -R Recursively destroy all dependents, including clones * -f Force unmounting of any dependents + * -d If we can't destroy now, mark for deferred destruction * * Destroys the given dataset. By default, it will unmount any filesystems, * and refuse to destroy a dataset that has any dependents. A dependent can @@ -789,6 +807,7 @@ typedef struct destroy_cbdata { boolean_t cb_closezhp; zfs_handle_t *cb_target; char *cb_snapname; + boolean_t cb_defer_destroy; } destroy_cbdata_t; /* @@ -869,7 +888,7 @@ destroy_callback(zfs_handle_t *zhp, void *data) * Bail out on the first error. */ if (zfs_unmount(zhp, NULL, cbp->cb_force ? MS_FORCE : 0) != 0 || - zfs_destroy(zhp) != 0) { + zfs_destroy(zhp, cbp->cb_defer_destroy) != 0) { zfs_close(zhp); return (-1); } @@ -923,8 +942,11 @@ zfs_do_destroy(int argc, char **argv) char *cp; /* check options */ - while ((c = getopt(argc, argv, "frR")) != -1) { + while ((c = getopt(argc, argv, "dfrR")) != -1) { switch (c) { + case 'd': + cb.cb_defer_destroy = B_TRUE; + break; case 'f': cb.cb_force = 1; break; @@ -956,6 +978,9 @@ zfs_do_destroy(int argc, char **argv) usage(B_FALSE); } + if (cb.cb_defer_destroy && cb.cb_doclones) + usage(B_FALSE); + /* * If we are doing recursive destroy of a snapshot, then the * named snapshot may not exist. Go straight to libzfs. @@ -977,7 +1002,7 @@ zfs_do_destroy(int argc, char **argv) } } - ret = zfs_destroy_snaps(zhp, cp); + ret = zfs_destroy_snaps(zhp, cp, cb.cb_defer_destroy); zfs_close(zhp); if (ret) { (void) fprintf(stderr, @@ -986,7 +1011,6 @@ zfs_do_destroy(int argc, char **argv) return (ret != 0); } - /* Open the given dataset */ if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET)) == NULL) return (1); @@ -1014,15 +1038,15 @@ zfs_do_destroy(int argc, char **argv) * Check for any dependents and/or clones. */ cb.cb_first = B_TRUE; - if (!cb.cb_doclones && + if (!cb.cb_doclones && !cb.cb_defer_destroy && zfs_iter_dependents(zhp, B_TRUE, destroy_check_dependent, &cb) != 0) { zfs_close(zhp); return (1); } - if (cb.cb_error || - zfs_iter_dependents(zhp, B_FALSE, destroy_callback, &cb) != 0) { + if (cb.cb_error || (!cb.cb_defer_destroy && + (zfs_iter_dependents(zhp, B_FALSE, destroy_callback, &cb) != 0))) { zfs_close(zhp); return (1); } @@ -1035,7 +1059,6 @@ zfs_do_destroy(int argc, char **argv) if (destroy_callback(zhp, &cb) != 0) return (1); - return (0); } @@ -1613,7 +1636,7 @@ zfs_do_upgrade(int argc, char **argv) (void) printf(gettext(" 1 Initial ZFS filesystem version\n")); (void) printf(gettext(" 2 Enhanced directory entries\n")); (void) printf(gettext(" 3 Case insensitive and File system " - "unique identifer (FUID)\n")); + "unique identifier (FUID)\n")); (void) printf(gettext(" 4 userquota, groupquota " "properties\n")); (void) printf(gettext("\nFor more information on a particular " @@ -2651,6 +2674,108 @@ zfs_do_receive(int argc, char **argv) return (err != 0); } +static int +zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding) +{ + int errors = 0; + int i; + const char *tag; + boolean_t recursive = B_FALSE; + int c; + int (*func)(zfs_handle_t *, const char *, const char *, boolean_t); + + /* check options */ + while ((c = getopt(argc, argv, "r")) != -1) { + switch (c) { + case 'r': + recursive = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* check number of arguments */ + if (argc < 2) + usage(B_FALSE); + + tag = argv[0]; + --argc; + ++argv; + + if (holding) { + if (tag[0] == '.') { + /* tags starting with '.' are reserved for libzfs */ + (void) fprintf(stderr, + gettext("tag may not start with '.'\n")); + usage(B_FALSE); + } + func = zfs_hold; + } else { + func = zfs_release; + } + + for (i = 0; i < argc; ++i) { + zfs_handle_t *zhp; + char parent[ZFS_MAXNAMELEN]; + const char *delim; + char *path = argv[i]; + + delim = strchr(path, '@'); + if (delim == NULL) { + (void) fprintf(stderr, + gettext("'%s' is not a snapshot\n"), path); + ++errors; + continue; + } + (void) strncpy(parent, path, delim - path); + parent[delim - path] = '\0'; + + zhp = zfs_open(g_zfs, parent, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); + if (zhp == NULL) { + ++errors; + continue; + } + if (func(zhp, delim+1, tag, recursive) != 0) + ++errors; + zfs_close(zhp); + } + + return (errors != 0); +} + +/* + * zfs hold [-r] <tag> <snap> ... + * + * -r Recursively hold + * + * Apply a user-hold with the given tag to the list of snapshots. + */ +static int +zfs_do_hold(int argc, char **argv) +{ + return (zfs_do_hold_rele_impl(argc, argv, B_TRUE)); +} + +/* + * zfs release [-r] <tag> <snap> ... + * + * -r Recursively release + * + * Release a user-hold with the given tag from the list of snapshots. + */ +static int +zfs_do_release(int argc, char **argv) +{ + return (zfs_do_hold_rele_impl(argc, argv, B_FALSE)); +} + typedef struct get_all_cbdata { zfs_handle_t **cb_handles; size_t cb_alloc; diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index c8a33df4d..c9b092e44 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -1469,7 +1469,7 @@ show_import(nvlist_t *config) */ static int do_import(nvlist_t *config, const char *newname, const char *mntopts, - int force, nvlist_t *props, boolean_t allowfaulted) + int force, nvlist_t *props, boolean_t do_verbatim) { zpool_handle_t *zhp; char *name; @@ -1522,14 +1522,14 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts, } } - if (zpool_import_props(g_zfs, config, newname, props, - allowfaulted) != 0) + if (zpool_import_props(g_zfs, config, newname, props, do_verbatim) != 0) return (1); if (newname != NULL) name = (char *)newname; - verify((zhp = zpool_open_canfail(g_zfs, name)) != NULL); + if ((zhp = zpool_open_canfail(g_zfs, name)) == NULL) + return (1); if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL && zpool_enable_datasets(zhp, mntopts, 0) != 0) { @@ -1566,7 +1566,8 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts, * -F Import even in the presence of faulted vdevs. This is an * intentionally undocumented option for testing purposes, and * treats the pool configuration as complete, leaving any bad - * vdevs in the FAULTED state. + * vdevs in the FAULTED state. In other words, it does verbatim + * import. * * -a Import all pools found. * @@ -1595,7 +1596,7 @@ zpool_do_import(int argc, char **argv) nvlist_t *found_config; nvlist_t *props = NULL; boolean_t first; - boolean_t allow_faulted = B_FALSE; + boolean_t do_verbatim = B_FALSE; uint64_t pool_state; char *cachefile = NULL; @@ -1628,7 +1629,7 @@ zpool_do_import(int argc, char **argv) do_force = B_TRUE; break; case 'F': - allow_faulted = B_TRUE; + do_verbatim = B_TRUE; break; case 'o': if ((propval = strchr(optarg, '=')) != NULL) { @@ -1778,7 +1779,7 @@ zpool_do_import(int argc, char **argv) if (do_all) err |= do_import(config, NULL, mntopts, - do_force, props, allow_faulted); + do_force, props, do_verbatim); else show_import(config); } else if (searchname != NULL) { @@ -1826,7 +1827,7 @@ zpool_do_import(int argc, char **argv) err = B_TRUE; } else { err |= do_import(found_config, argc == 1 ? NULL : - argv[1], mntopts, do_force, props, allow_faulted); + argv[1], mntopts, do_force, props, do_verbatim); } } @@ -3117,6 +3118,17 @@ status_callback(zpool_handle_t *zhp, void *data) "replace'.\n")); break; + case ZPOOL_STATUS_REMOVED_DEV: + (void) printf(gettext("status: One or more devices has " + "been removed by the administrator.\n\tSufficient " + "replicas exist for the pool to continue functioning in " + "a\n\tdegraded state.\n")); + (void) printf(gettext("action: Online the device using " + "'zpool online' or replace the device with\n\t'zpool " + "replace'.\n")); + break; + + case ZPOOL_STATUS_RESILVERING: (void) printf(gettext("status: One or more devices is " "currently being resilvered. The pool will\n\tcontinue " @@ -3539,6 +3551,8 @@ zpool_do_upgrade(int argc, char **argv) (void) printf(gettext(" 14 passthrough-x aclinherit\n")); (void) printf(gettext(" 15 user/group space accounting\n")); (void) printf(gettext(" 16 stmf property support\n")); + (void) printf(gettext(" 17 Triple-parity RAID-Z\n")); + (void) printf(gettext(" 18 snapshot user holds\n")); (void) printf(gettext("For more information on a particular " "version, including supported releases, see:\n\n")); (void) printf("http://www.opensolaris.org/os/community/zfs/" @@ -3624,6 +3638,8 @@ char *hist_event_table[LOG_END] = { "refquota set", "refreservation set", "pool scrub done", + "user hold", + "user release", }; /* diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c index 10007c149..621519197 100644 --- a/cmd/zpool/zpool_vdev.c +++ b/cmd/zpool/zpool_vdev.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -67,6 +67,7 @@ #include <libdiskmgt.h> #include <libintl.h> #include <libnvpair.h> +#include <limits.h> #include <stdio.h> #include <string.h> #include <unistd.h> @@ -1093,20 +1094,35 @@ check_in_use(nvlist_t *config, nvlist_t *nv, int force, int isreplacing, } static const char * -is_grouping(const char *type, int *mindev) +is_grouping(const char *type, int *mindev, int *maxdev) { - if (strcmp(type, "raidz") == 0 || strcmp(type, "raidz1") == 0) { - if (mindev != NULL) - *mindev = 2; - return (VDEV_TYPE_RAIDZ); - } + if (strncmp(type, "raidz", 5) == 0) { + const char *p = type + 5; + char *end; + long nparity; + + if (*p == '\0') { + nparity = 1; + } else if (*p == '0') { + return (NULL); /* no zero prefixes allowed */ + } else { + errno = 0; + nparity = strtol(p, &end, 10); + if (errno != 0 || nparity < 1 || nparity >= 255 || + *end != '\0') + return (NULL); + } - if (strcmp(type, "raidz2") == 0) { if (mindev != NULL) - *mindev = 3; + *mindev = nparity + 1; + if (maxdev != NULL) + *maxdev = 255; return (VDEV_TYPE_RAIDZ); } + if (maxdev != NULL) + *maxdev = INT_MAX; + if (strcmp(type, "mirror") == 0) { if (mindev != NULL) *mindev = 2; @@ -1144,7 +1160,7 @@ nvlist_t * construct_spec(int argc, char **argv) { nvlist_t *nvroot, *nv, **top, **spares, **l2cache; - int t, toplevels, mindev, nspares, nlogs, nl2cache; + int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache; const char *type; uint64_t is_log; boolean_t seen_logs; @@ -1166,7 +1182,7 @@ construct_spec(int argc, char **argv) * If it's a mirror or raidz, the subsequent arguments are * its leaves -- until we encounter the next mirror or raidz. */ - if ((type = is_grouping(argv[0], &mindev)) != NULL) { + if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) { nvlist_t **child = NULL; int c, children = 0; @@ -1223,7 +1239,7 @@ construct_spec(int argc, char **argv) } for (c = 1; c < argc; c++) { - if (is_grouping(argv[c], NULL) != NULL) + if (is_grouping(argv[c], NULL, NULL) != NULL) break; children++; child = realloc(child, @@ -1243,6 +1259,13 @@ construct_spec(int argc, char **argv) return (NULL); } + if (children > maxdev) { + (void) fprintf(stderr, gettext("invalid vdev " + "specification: %s supports no more than " + "%d devices\n"), argv[0], maxdev); + return (NULL); + } + argc -= c; argv += c; diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 746db0c07..5f49fd5a0 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -479,7 +479,7 @@ process_options(int argc, char **argv) zopt_raidz = MAX(1, value); break; case 'R': - zopt_raidz_parity = MIN(MAX(value, 1), 2); + zopt_raidz_parity = MIN(MAX(value, 1), 3); break; case 'd': zopt_datasets = MAX(1, value); @@ -1387,7 +1387,7 @@ ztest_destroy_cb(char *name, void *arg) /* * Destroy the dataset. */ - error = dmu_objset_destroy(name); + error = dmu_objset_destroy(name, B_FALSE); if (error) { (void) dmu_objset_open(name, DMU_OST_OTHER, DS_MODE_USER | DS_MODE_READONLY, &os); @@ -1560,7 +1560,7 @@ ztest_dmu_objset_create_destroy(ztest_args_t *za) zil_close(zilog); dmu_objset_close(os); - error = dmu_objset_destroy(name); + error = dmu_objset_destroy(name, B_FALSE); if (error) fatal(0, "dmu_objset_destroy(%s) = %d", name, error); @@ -1583,7 +1583,7 @@ ztest_dmu_snapshot_create_destroy(ztest_args_t *za) (void) snprintf(snapname, 100, "%s@%llu", osname, (u_longlong_t)za->za_instance); - error = dmu_objset_destroy(snapname); + error = dmu_objset_destroy(snapname, B_FALSE); if (error != 0 && error != ENOENT) fatal(0, "dmu_objset_destroy() = %d", error); error = dmu_objset_snapshot(osname, strchr(snapname, '@')+1, @@ -1614,19 +1614,19 @@ ztest_dsl_dataset_cleanup(char *osname, uint64_t curval) (void) snprintf(clone2name, 100, "%s/c2_%llu", osname, curval); (void) snprintf(snap3name, 100, "%s@s3_%llu", clone1name, curval); - error = dmu_objset_destroy(clone2name); + error = dmu_objset_destroy(clone2name, B_FALSE); if (error && error != ENOENT) fatal(0, "dmu_objset_destroy(%s) = %d", clone2name, error); - error = dmu_objset_destroy(snap3name); + error = dmu_objset_destroy(snap3name, B_FALSE); if (error && error != ENOENT) fatal(0, "dmu_objset_destroy(%s) = %d", snap3name, error); - error = dmu_objset_destroy(snap2name); + error = dmu_objset_destroy(snap2name, B_FALSE); if (error && error != ENOENT) fatal(0, "dmu_objset_destroy(%s) = %d", snap2name, error); - error = dmu_objset_destroy(clone1name); + error = dmu_objset_destroy(clone1name, B_FALSE); if (error && error != ENOENT) fatal(0, "dmu_objset_destroy(%s) = %d", clone1name, error); - error = dmu_objset_destroy(snap1name); + error = dmu_objset_destroy(snap1name, B_FALSE); if (error && error != ENOENT) fatal(0, "dmu_objset_destroy(%s) = %d", snap1name, error); } diff --git a/lib/libzfs/include/libzfs.h b/lib/libzfs/include/libzfs.h index 8d00a3151..f19e398f6 100644 --- a/lib/libzfs/include/libzfs.h +++ b/lib/libzfs/include/libzfs.h @@ -117,6 +117,8 @@ enum { EZFS_NOTSUP, /* ops not supported on this dataset */ EZFS_ACTIVE_SPARE, /* pool has active shared spare devices */ EZFS_UNPLAYED_LOGS, /* log device has unplayed logs */ + EZFS_REFTAG_RELE, /* snapshot release: tag not found */ + EZFS_REFTAG_HOLD, /* snapshot hold: tag already exists */ EZFS_UNKNOWN }; @@ -286,6 +288,7 @@ typedef enum { ZPOOL_STATUS_VERSION_OLDER, /* older on-disk version */ ZPOOL_STATUS_RESILVERING, /* device being resilvered */ ZPOOL_STATUS_OFFLINE_DEV, /* device online */ + ZPOOL_STATUS_REMOVED_DEV, /* removed device */ /* * Finally, the following indicates a healthy pool. @@ -454,8 +457,8 @@ extern int zfs_iter_snapshots(zfs_handle_t *, zfs_iter_f, void *); extern int zfs_create(libzfs_handle_t *, const char *, zfs_type_t, nvlist_t *); extern int zfs_create_ancestors(libzfs_handle_t *, const char *); -extern int zfs_destroy(zfs_handle_t *); -extern int zfs_destroy_snaps(zfs_handle_t *, char *); +extern int zfs_destroy(zfs_handle_t *, boolean_t); +extern int zfs_destroy_snaps(zfs_handle_t *, char *, boolean_t); extern int zfs_clone(zfs_handle_t *, const char *, nvlist_t *); extern int zfs_snapshot(libzfs_handle_t *, const char *, boolean_t, nvlist_t *); extern int zfs_rollback(zfs_handle_t *, zfs_handle_t *, boolean_t); @@ -463,6 +466,8 @@ extern int zfs_rename(zfs_handle_t *, const char *, boolean_t); extern int zfs_send(zfs_handle_t *, const char *, const char *, boolean_t, boolean_t, boolean_t, boolean_t, int); extern int zfs_promote(zfs_handle_t *); +extern int zfs_hold(zfs_handle_t *, const char *, const char *, boolean_t); +extern int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t); typedef int (*zfs_userspace_cb_t)(void *arg, const char *domain, uid_t rid, uint64_t space); diff --git a/lib/libzfs/libzfs_changelist.c b/lib/libzfs/libzfs_changelist.c index 6fa196710..ff438b3d7 100644 --- a/lib/libzfs/libzfs_changelist.c +++ b/lib/libzfs/libzfs_changelist.c @@ -508,6 +508,14 @@ change_one(zfs_handle_t *zhp, void *data) &idx); uu_list_insert(clp->cl_list, cn, idx); } else { + /* + * Add this child to beginning of the list. Children + * below this one in the hierarchy will get added above + * this one in the list. This produces a list in + * reverse dataset name order. + * This is necessary when the original mountpoint + * is legacy or none. + */ ASSERT(!clp->cl_alldependents); verify(uu_list_insert_before(clp->cl_list, uu_list_first(clp->cl_list), cn) == 0); @@ -574,6 +582,7 @@ changelist_gather(zfs_handle_t *zhp, zfs_prop_t prop, int gather_flags, zfs_handle_t *temp; char property[ZFS_MAXPROPLEN]; uu_compare_fn_t *compare = NULL; + boolean_t legacy = B_FALSE; if ((clp = zfs_alloc(zhp->zfs_hdl, sizeof (prop_changelist_t))) == NULL) return (NULL); @@ -586,8 +595,19 @@ changelist_gather(zfs_handle_t *zhp, zfs_prop_t prop, int gather_flags, if (prop == ZFS_PROP_NAME || prop == ZFS_PROP_ZONED || prop == ZFS_PROP_MOUNTPOINT || prop == ZFS_PROP_SHARENFS || prop == ZFS_PROP_SHARESMB) { - compare = compare_mountpoints; - clp->cl_sorted = B_TRUE; + + if (zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, + property, sizeof (property), + NULL, NULL, 0, B_FALSE) == 0 && + (strcmp(property, "legacy") == 0 || + strcmp(property, "none") == 0)) { + + legacy = B_TRUE; + } + if (!legacy) { + compare = compare_mountpoints; + clp->cl_sorted = B_TRUE; + } } clp->cl_pool = uu_list_pool_create("changelist_pool", @@ -695,6 +715,12 @@ changelist_gather(zfs_handle_t *zhp, zfs_prop_t prop, int gather_flags, (void) uu_list_find(clp->cl_list, cn, NULL, &idx); uu_list_insert(clp->cl_list, cn, idx); } else { + /* + * Add the target dataset to the end of the list. + * The list is not really unsorted. The list will be + * in reverse dataset name order. This is necessary + * when the original mountpoint is legacy or none. + */ verify(uu_list_insert_after(clp->cl_list, uu_list_last(clp->cl_list), cn) == 0); } @@ -703,11 +729,7 @@ changelist_gather(zfs_handle_t *zhp, zfs_prop_t prop, int gather_flags, * If the mountpoint property was previously 'legacy', or 'none', * record it as the behavior of changelist_postfix() will be different. */ - if ((clp->cl_prop == ZFS_PROP_MOUNTPOINT) && - (zfs_prop_get(zhp, prop, property, sizeof (property), - NULL, NULL, 0, B_FALSE) == 0 && - (strcmp(property, "legacy") == 0 || - strcmp(property, "none") == 0))) { + if ((clp->cl_prop == ZFS_PROP_MOUNTPOINT) && legacy) { /* * do not automatically mount ex-legacy datasets if * we specifically set canmount to noauto diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c index ac9122605..ab9ba6b80 100644 --- a/lib/libzfs/libzfs_dataset.c +++ b/lib/libzfs/libzfs_dataset.c @@ -47,6 +47,7 @@ #include <ucred.h> #include <idmap.h> #include <aclutils.h> +#include <directory.h> #include <sys/spa.h> #include <sys/zap.h> @@ -1674,21 +1675,13 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src, (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_OBJSET_ZPLPROPS, &zc)) { zcmd_free_nvlists(&zc); - zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, - "unable to get %s property"), - zfs_prop_to_name(prop)); - return (zfs_error(zhp->zfs_hdl, EZFS_BADVERSION, - dgettext(TEXT_DOMAIN, "internal error"))); + return (-1); } if (zcmd_read_dst_nvlist(zhp->zfs_hdl, &zc, &zplprops) != 0 || nvlist_lookup_uint64(zplprops, zfs_prop_to_name(prop), val) != 0) { zcmd_free_nvlists(&zc); - zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, - "unable to get %s property"), - zfs_prop_to_name(prop)); - return (zfs_error(zhp->zfs_hdl, EZFS_NOMEM, - dgettext(TEXT_DOMAIN, "internal error"))); + return (-1); } if (zplprops) nvlist_free(zplprops); @@ -2074,6 +2067,7 @@ userquota_propname_decode(const char *propname, boolean_t zoned, { zfs_userquota_prop_t type; char *cp, *end; + char *numericsid = NULL; boolean_t isuser; domain[0] = '\0'; @@ -2096,33 +2090,41 @@ userquota_propname_decode(const char *propname, boolean_t zoned, if (strchr(cp, '@')) { /* * It's a SID name (eg "user@domain") that needs to be - * turned into S-1-domainID-RID. There should be a - * better way to do this, but for now just translate it - * to the (possibly ephemeral) uid and then back to the - * SID. This is like getsidname(noresolve=TRUE). + * turned into S-1-domainID-RID. */ - uid_t id; - idmap_rid_t rid; - char *mapdomain; - + directory_error_t e; if (zoned && getzoneid() == GLOBAL_ZONEID) return (ENOENT); - if (sid_to_id(cp, isuser, &id) != 0) + if (isuser) { + e = directory_sid_from_user_name(NULL, + cp, &numericsid); + } else { + e = directory_sid_from_group_name(NULL, + cp, &numericsid); + } + if (e != NULL) { + directory_error_free(e); return (ENOENT); - if (idmap_id_to_numeric_domain_rid(id, isuser, - &mapdomain, &rid) != 0) + } + if (numericsid == NULL) return (ENOENT); - (void) strlcpy(domain, mapdomain, domainlen); - *ridp = rid; - } else if (strncmp(cp, "S-1-", 4) == 0) { + cp = numericsid; + /* will be further decoded below */ + } + + if (strncmp(cp, "S-1-", 4) == 0) { /* It's a numeric SID (eg "S-1-234-567-89") */ - (void) strcpy(domain, cp); + (void) strlcpy(domain, cp, domainlen); cp = strrchr(domain, '-'); *cp = '\0'; cp++; errno = 0; *ridp = strtoull(cp, &end, 10); + if (numericsid) { + free(numericsid); + numericsid = NULL; + } if (errno != 0 || *end != '\0') return (EINVAL); } else if (!isdigit(*cp)) { @@ -2158,13 +2160,14 @@ userquota_propname_decode(const char *propname, boolean_t zoned, if (idmap_id_to_numeric_domain_rid(id, isuser, &mapdomain, &rid) != 0) return (ENOENT); - (void) strcpy(domain, mapdomain); + (void) strlcpy(domain, mapdomain, domainlen); *ridp = rid; } else { *ridp = id; } } + ASSERT3P(numericsid, ==, NULL); return (0); } @@ -2763,7 +2766,7 @@ zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type, * isn't mounted, and that there are no active dependents. */ int -zfs_destroy(zfs_handle_t *zhp) +zfs_destroy(zfs_handle_t *zhp, boolean_t defer) { zfs_cmd_t zc = { 0 }; @@ -2787,6 +2790,7 @@ zfs_destroy(zfs_handle_t *zhp) zc.zc_objset_type = DMU_OST_ZFS; } + zc.zc_defer_destroy = defer; if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_DESTROY, &zc) != 0) { return (zfs_standard_error_fmt(zhp->zfs_hdl, errno, dgettext(TEXT_DOMAIN, "cannot destroy '%s'"), @@ -2843,7 +2847,7 @@ zfs_remove_link_cb(zfs_handle_t *zhp, void *arg) * Destroys all snapshots with the given name in zhp & descendants. */ int -zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname) +zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname, boolean_t defer) { zfs_cmd_t zc = { 0 }; int ret; @@ -2860,6 +2864,7 @@ zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname) (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value)); + zc.zc_defer_destroy = defer; ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_DESTROY_SNAPS, &zc); if (ret != 0) { @@ -3275,7 +3280,7 @@ rollback_destroy(zfs_handle_t *zhp, void *data) logstr = zhp->zfs_hdl->libzfs_log_str; zhp->zfs_hdl->libzfs_log_str = NULL; - cbp->cb_error |= zfs_destroy(zhp); + cbp->cb_error |= zfs_destroy(zhp, B_FALSE); zhp->zfs_hdl->libzfs_log_str = logstr; } } else { @@ -3289,7 +3294,7 @@ rollback_destroy(zfs_handle_t *zhp, void *data) zfs_close(zhp); return (0); } - if (zfs_destroy(zhp) != 0) + if (zfs_destroy(zhp, B_FALSE) != 0) cbp->cb_error = B_TRUE; else changelist_remove(clp, zhp->zfs_name); @@ -4089,3 +4094,79 @@ zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type, return (error); } + +int +zfs_hold(zfs_handle_t *zhp, const char *snapname, const char *tag, + boolean_t recursive) +{ + zfs_cmd_t zc = { 0 }; + libzfs_handle_t *hdl = zhp->zfs_hdl; + + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value)); + (void) strlcpy(zc.zc_string, tag, sizeof (zc.zc_string)); + zc.zc_cookie = recursive; + + if (zfs_ioctl(hdl, ZFS_IOC_HOLD, &zc) != 0) { + char errbuf[ZFS_MAXNAMELEN+32]; + + /* + * if it was recursive, the one that actually failed will be in + * zc.zc_name. + */ + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot hold '%s@%s'"), zc.zc_name, snapname); + switch (errno) { + case ENOTSUP: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool must be upgraded")); + return (zfs_error(hdl, EZFS_BADVERSION, errbuf)); + case EINVAL: + return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); + case EEXIST: + return (zfs_error(hdl, EZFS_REFTAG_HOLD, errbuf)); + default: + return (zfs_standard_error_fmt(hdl, errno, errbuf)); + } + } + + return (0); +} + +int +zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag, + boolean_t recursive) +{ + zfs_cmd_t zc = { 0 }; + libzfs_handle_t *hdl = zhp->zfs_hdl; + + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value)); + (void) strlcpy(zc.zc_string, tag, sizeof (zc.zc_string)); + zc.zc_cookie = recursive; + + if (zfs_ioctl(hdl, ZFS_IOC_RELEASE, &zc) != 0) { + char errbuf[ZFS_MAXNAMELEN+32]; + + /* + * if it was recursive, the one that actually failed will be in + * zc.zc_name. + */ + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot release '%s@%s'"), zc.zc_name, snapname); + switch (errno) { + case ESRCH: + return (zfs_error(hdl, EZFS_REFTAG_RELE, errbuf)); + case ENOTSUP: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool must be upgraded")); + return (zfs_error(hdl, EZFS_BADVERSION, errbuf)); + case EINVAL: + return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); + default: + return (zfs_standard_error_fmt(hdl, errno, errbuf)); + } + } + + return (0); +} diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index 612a09914..1ffb6294c 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -113,6 +113,9 @@ fsavl_destroy(avl_tree_t *avl) free(avl); } +/* + * Given an nvlist, produce an avl tree of snapshots, ordered by guid + */ static avl_tree_t * fsavl_create(nvlist_t *fss) { @@ -243,7 +246,9 @@ send_iterate_prop(zfs_handle_t *zhp, nvlist_t *nv) continue; verify(nvpair_value_nvlist(elem, &propnv) == 0); - if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION) { + if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION || + prop == ZFS_PROP_REFQUOTA || + prop == ZFS_PROP_REFRESERVATION) { /* these guys are modifyable, but have no source */ uint64_t value; verify(nvlist_lookup_uint64(propnv, @@ -274,6 +279,11 @@ send_iterate_prop(zfs_handle_t *zhp, nvlist_t *nv) } } +/* + * recursively generate nvlists describing datasets. See comment + * for the data structure send_data_t above for description of contents + * of the nvlist. + */ static int send_iterate_fs(zfs_handle_t *zhp, void *arg) { @@ -689,9 +699,20 @@ again: } /* - * Dumps a backup of tosnap, incremental from fromsnap if it isn't NULL. - * If 'doall', dump all intermediate snaps. - * If 'replicate', dump special header and do recursively. + * Generate a send stream for the dataset identified by the argument zhp. + * + * The content of the send stream is the snapshot identified by + * 'tosnap'. Incremental streams are requested in two ways: + * - from the snapshot identified by "fromsnap" (if non-null) or + * - from the origin of the dataset identified by zhp, which must + * be a clone. In this case, "fromsnap" is null and "fromorigin" + * is TRUE. + * + * The send stream is recursive (i.e. dumps a hierarchy of snapshots) and + * uses a special header (with a version field of DMU_BACKUP_HEADER_VERSION) + * if "replicate" is set. If "doall" is set, dump all the intermediate + * snapshots. The DMU_BACKUP_HEADER_VERSION header is used in the "doall" + * case too. */ int zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, @@ -900,11 +921,12 @@ recv_rename(libzfs_handle_t *hdl, const char *name, const char *tryname, if (err) return (err); + zc.zc_objset_type = DMU_OST_ZFS; + (void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name)); + if (tryname) { (void) strcpy(newname, tryname); - zc.zc_objset_type = DMU_OST_ZFS; - (void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_value, tryname, sizeof (zc.zc_value)); if (flags.verbose) { @@ -959,12 +981,18 @@ recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen, int err = 0; prop_changelist_t *clp; zfs_handle_t *zhp; + boolean_t defer = B_FALSE; + int spa_version; zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); if (zhp == NULL) return (-1); clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, flags.force ? MS_FORCE : 0); + if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT && + zfs_spa_version(zhp, &spa_version) == 0 && + spa_version >= SPA_VERSION_USERREFS) + defer = B_TRUE; zfs_close(zhp); if (clp == NULL) return (-1); @@ -973,12 +1001,12 @@ recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen, return (err); zc.zc_objset_type = DMU_OST_ZFS; + zc.zc_defer_destroy = defer; (void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name)); if (flags.verbose) (void) printf("attempting destroy %s\n", zc.zc_name); err = ioctl(hdl->libzfs_fd, ZFS_IOC_DESTROY, &zc); - if (err == 0) { if (flags.verbose) (void) printf("success\n"); @@ -988,7 +1016,12 @@ recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen, (void) changelist_postfix(clp); changelist_free(clp); - if (err != 0) + /* + * Deferred destroy should always succeed. Since we can't tell + * if it destroyed the dataset or just marked it for deferred + * destroy, always do the rename just in case. + */ + if (err != 0 || defer) err = recv_rename(hdl, name, NULL, baselen, newname, flags); return (err); @@ -1775,11 +1808,13 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, /* We can't do online recv in this case */ clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, 0); if (clp == NULL) { + zfs_close(zhp); zcmd_free_nvlists(&zc); return (-1); } if (changelist_prefix(clp) != 0) { changelist_free(clp); + zfs_close(zhp); zcmd_free_nvlists(&zc); return (-1); } @@ -1936,7 +1971,8 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, * (if created, or if we tore them down to do an incremental * restore), and the /dev links for the new snapshot (if * created). Also mount any children of the target filesystem - * if we did an incremental receive. + * if we did a replication receive (indicated by stream_avl + * being non-NULL). */ cp = strchr(zc.zc_value, '@'); if (cp && (ioctl_err == 0 || !newfs)) { @@ -1952,7 +1988,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, if (err == 0 && ioctl_err == 0) err = zvol_create_link(hdl, zc.zc_value); - } else if (newfs) { + } else if (newfs || stream_avl) { /* * Track the first/top of hierarchy fs, * for mounting and sharing later. diff --git a/lib/libzfs/libzfs_status.c b/lib/libzfs/libzfs_status.c index c7eb04e74..44faf02af 100644 --- a/lib/libzfs/libzfs_status.c +++ b/lib/libzfs/libzfs_status.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -104,6 +104,13 @@ vdev_offlined(uint64_t state, uint64_t aux, uint64_t errs) return (state == VDEV_STATE_OFFLINE); } +/* ARGSUSED */ +static int +vdev_removed(uint64_t state, uint64_t aux, uint64_t errs) +{ + return (state == VDEV_STATE_REMOVED); +} + /* * Detect if any leaf devices that have seen errors or could not be opened. */ @@ -276,6 +283,12 @@ check_status(nvlist_t *config, boolean_t isimport) return (ZPOOL_STATUS_OFFLINE_DEV); /* + * Removed device + */ + if (find_vdev_problem(nvroot, vdev_removed)) + return (ZPOOL_STATUS_REMOVED_DEV); + + /* * Currently resilvering */ if (!vs->vs_scrub_complete && vs->vs_scrub_type == POOL_SCRUB_RESILVER) diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index 30829d50d..4da0fb44b 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -213,6 +213,11 @@ libzfs_error_description(libzfs_handle_t *hdl) case EZFS_UNPLAYED_LOGS: return (dgettext(TEXT_DOMAIN, "log device has unplayed intent " "logs")); + case EZFS_REFTAG_RELE: + return (dgettext(TEXT_DOMAIN, "no such tag on this dataset")); + case EZFS_REFTAG_HOLD: + return (dgettext(TEXT_DOMAIN, "tag already exists on this " + "dataset")); case EZFS_UNKNOWN: return (dgettext(TEXT_DOMAIN, "unknown error")); default: diff --git a/module/zcommon/include/sys/fs/zfs.h b/module/zcommon/include/sys/fs/zfs.h index 6651b140c..86b36a8ae 100644 --- a/module/zcommon/include/sys/fs/zfs.h +++ b/module/zcommon/include/sys/fs/zfs.h @@ -18,6 +18,7 @@ * * CDDL HEADER END */ + /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. @@ -26,6 +27,8 @@ #ifndef _SYS_FS_ZFS_H #define _SYS_FS_ZFS_H +#include <sys/time.h> + #ifdef __cplusplus extern "C" { #endif @@ -111,6 +114,8 @@ typedef enum { ZFS_PROP_USEDREFRESERV, ZFS_PROP_USERACCOUNTING, /* not exposed to the user */ ZFS_PROP_STMF_SHAREINFO, /* not exposed to the user */ + ZFS_PROP_DEFER_DESTROY, + ZFS_PROP_USERREFS, ZFS_NUM_PROPS } zfs_prop_t; @@ -280,14 +285,16 @@ typedef enum zfs_cache_type { #define SPA_VERSION_14 14ULL #define SPA_VERSION_15 15ULL #define SPA_VERSION_16 16ULL +#define SPA_VERSION_17 17ULL +#define SPA_VERSION_18 18ULL /* * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk * format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*}, * and do the appropriate changes. Also bump the version number in * usr/src/grub/capability. */ -#define SPA_VERSION SPA_VERSION_16 -#define SPA_VERSION_STRING "16" +#define SPA_VERSION SPA_VERSION_18 +#define SPA_VERSION_STRING "18" /* * Symbolic names for the changes that caused a SPA_VERSION switch. @@ -303,7 +310,7 @@ typedef enum zfs_cache_type { #define SPA_VERSION_INITIAL SPA_VERSION_1 #define SPA_VERSION_DITTO_BLOCKS SPA_VERSION_2 #define SPA_VERSION_SPARES SPA_VERSION_3 -#define SPA_VERSION_RAID6 SPA_VERSION_3 +#define SPA_VERSION_RAIDZ2 SPA_VERSION_3 #define SPA_VERSION_BPLIST_ACCOUNT SPA_VERSION_3 #define SPA_VERSION_RAIDZ_DEFLATE SPA_VERSION_3 #define SPA_VERSION_DNODE_BYTES SPA_VERSION_3 @@ -325,6 +332,8 @@ typedef enum zfs_cache_type { #define SPA_VERSION_PASSTHROUGH_X SPA_VERSION_14 #define SPA_VERSION_USERSPACE SPA_VERSION_15 #define SPA_VERSION_STMF_PROP SPA_VERSION_16 +#define SPA_VERSION_RAIDZ3 SPA_VERSION_17 +#define SPA_VERSION_USERREFS SPA_VERSION_18 /* * ZPL version - rev'd whenever an incompatible on-disk format change @@ -601,7 +610,10 @@ typedef enum zfs_ioc { ZFS_IOC_SMB_ACL, ZFS_IOC_USERSPACE_ONE, ZFS_IOC_USERSPACE_MANY, - ZFS_IOC_USERSPACE_UPGRADE + ZFS_IOC_USERSPACE_UPGRADE, + ZFS_IOC_HOLD, + ZFS_IOC_RELEASE, + ZFS_IOC_GET_HOLDS } zfs_ioc_t; /* @@ -715,6 +727,8 @@ typedef enum history_internal_events { LOG_DS_REFQUOTA, LOG_DS_REFRESERV, LOG_POOL_SCRUB_DONE, + LOG_DS_USER_HOLD, + LOG_DS_USER_RELEASE, LOG_END } history_internal_events_t; diff --git a/module/zcommon/include/zfs_deleg.h b/module/zcommon/include/zfs_deleg.h index cdbbd83de..e90cd0d5f 100644 --- a/module/zcommon/include/zfs_deleg.h +++ b/module/zcommon/include/zfs_deleg.h @@ -61,6 +61,8 @@ typedef enum { ZFS_DELEG_NOTE_GROUPQUOTA, ZFS_DELEG_NOTE_USERUSED, ZFS_DELEG_NOTE_GROUPUSED, + ZFS_DELEG_NOTE_HOLD, + ZFS_DELEG_NOTE_RELEASE, ZFS_DELEG_NOTE_NONE } zfs_deleg_note_t; diff --git a/module/zcommon/zfs_deleg.c b/module/zcommon/zfs_deleg.c index 2964cae5d..35f81b584 100644 --- a/module/zcommon/zfs_deleg.c +++ b/module/zcommon/zfs_deleg.c @@ -67,6 +67,8 @@ zfs_deleg_perm_tab_t zfs_deleg_perm_tab[] = { {ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_NOTE_GROUPQUOTA }, {ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_NOTE_USERUSED }, {ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_NOTE_GROUPUSED }, + {ZFS_DELEG_PERM_HOLD, ZFS_DELEG_NOTE_HOLD }, + {ZFS_DELEG_PERM_RELEASE, ZFS_DELEG_NOTE_RELEASE }, {NULL, ZFS_DELEG_NOTE_NONE } }; diff --git a/module/zcommon/zfs_namecheck.c b/module/zcommon/zfs_namecheck.c index 45730c6fc..5cfafea47 100644 --- a/module/zcommon/zfs_namecheck.c +++ b/module/zcommon/zfs_namecheck.c @@ -59,7 +59,7 @@ valid_char(char c) * Snapshot names must be made up of alphanumeric characters plus the following * characters: * - * [-_.:] + * [-_.: ] */ int snapshot_namecheck(const char *path, namecheck_err_t *why, char *what) diff --git a/module/zcommon/zfs_prop.c b/module/zcommon/zfs_prop.c index 05d830633..6a3284609 100644 --- a/module/zcommon/zfs_prop.c +++ b/module/zcommon/zfs_prop.c @@ -235,6 +235,9 @@ zfs_prop_init(void) /* readonly index (boolean) properties */ register_index(ZFS_PROP_MOUNTED, "mounted", 0, PROP_READONLY, ZFS_TYPE_FILESYSTEM, "yes | no", "MOUNTED", boolean_table); + register_index(ZFS_PROP_DEFER_DESTROY, "defer_destroy", 0, + PROP_READONLY, ZFS_TYPE_SNAPSHOT, "yes | no", "DEFER_DESTROY", + boolean_table); /* set once index properties */ register_index(ZFS_PROP_NORMALIZE, "normalization", 0, @@ -286,6 +289,8 @@ zfs_prop_init(void) register_number(ZFS_PROP_USEDREFRESERV, "usedbyrefreservation", 0, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "USEDREFRESERV"); + register_number(ZFS_PROP_USERREFS, "userrefs", 0, PROP_READONLY, + ZFS_TYPE_SNAPSHOT, "<count>", "USERREFS"); /* default number properties */ register_number(ZFS_PROP_QUOTA, "quota", 0, PROP_DEFAULT, diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 785c7c621..d86468202 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -87,6 +87,7 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { { zap_byteswap, TRUE, "scrub work queue" }, { zap_byteswap, TRUE, "ZFS user/group used" }, { zap_byteswap, TRUE, "ZFS user/group quota" }, + { zap_byteswap, TRUE, "snapshot refcount tags"}, }; int @@ -195,7 +196,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, ASSERT(length <= DMU_MAX_ACCESS); - dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT; + dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT; if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz) dbuf_flags |= DB_RF_NOPREFETCH; @@ -212,6 +213,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, os_dsl_dataset->ds_object, (longlong_t)dn->dn_object, dn->dn_datablksz, (longlong_t)offset, (longlong_t)length); + rw_exit(&dn->dn_struct_rwlock); return (EIO); } nblks = 1; @@ -234,9 +236,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, } /* initiate async i/o */ if (read) { - rw_exit(&dn->dn_struct_rwlock); (void) dbuf_read(db, zio, dbuf_flags); - rw_enter(&dn->dn_struct_rwlock, RW_READER); } dbp[i] = &db->db; } @@ -376,56 +376,51 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) dnode_rele(dn, FTAG); } +/* + * Get the next "chunk" of file data to free. We traverse the file from + * the end so that the file gets shorter over time (if we crashes in the + * middle, this will leave us in a better state). We find allocated file + * data by simply searching the allocated level 1 indirects. + */ static int -get_next_chunk(dnode_t *dn, uint64_t *offset, uint64_t limit) +get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t limit) { - uint64_t len = *offset - limit; - uint64_t chunk_len = dn->dn_datablksz * DMU_MAX_DELETEBLKCNT; - uint64_t subchunk = + uint64_t len = *start - limit; + uint64_t blkcnt = 0; + uint64_t maxblks = DMU_MAX_ACCESS / (1ULL << (dn->dn_indblkshift + 1)); + uint64_t iblkrange = dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT); - ASSERT(limit <= *offset); + ASSERT(limit <= *start); - if (len <= chunk_len) { - *offset = limit; + if (len <= iblkrange * maxblks) { + *start = limit; return (0); } + ASSERT(ISP2(iblkrange)); - ASSERT(ISP2(subchunk)); - - while (*offset > limit) { - uint64_t initial_offset = P2ROUNDUP(*offset, subchunk); - uint64_t delta; + while (*start > limit && blkcnt < maxblks) { int err; - /* skip over allocated data */ + /* find next allocated L1 indirect */ err = dnode_next_offset(dn, - DNODE_FIND_HOLE|DNODE_FIND_BACKWARDS, offset, 1, 1, 0); - if (err == ESRCH) - *offset = limit; - else if (err) - return (err); + DNODE_FIND_BACKWARDS, start, 2, 1, 0); - ASSERT3U(*offset, <=, initial_offset); - *offset = P2ALIGN(*offset, subchunk); - delta = initial_offset - *offset; - if (delta >= chunk_len) { - *offset += delta - chunk_len; + /* if there are no more, then we are done */ + if (err == ESRCH) { + *start = limit; return (0); - } - chunk_len -= delta; - - /* skip over unallocated data */ - err = dnode_next_offset(dn, - DNODE_FIND_BACKWARDS, offset, 1, 1, 0); - if (err == ESRCH) - *offset = limit; - else if (err) + } else if (err) { return (err); + } + blkcnt += 1; - if (*offset < limit) - *offset = limit; - ASSERT3U(*offset, <, initial_offset); + /* reset offset to end of "next" block back */ + *start = P2ALIGN(*start, iblkrange); + if (*start <= limit) + *start = limit; + else + *start -= 1; } return (0); } @@ -548,7 +543,7 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, { dnode_t *dn; dmu_buf_t **dbp; - int numbufs, i, err; + int numbufs, err; err = dnode_hold(os->os, object, FTAG, &dn); if (err) @@ -559,7 +554,7 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, * block. If we ever do the tail block optimization, we will need to * handle that here as well. */ - if (dn->dn_datablkshift == 0) { + if (dn->dn_maxblkid == 0) { int newsz = offset > dn->dn_datablksz ? 0 : MIN(size, dn->dn_datablksz - offset); bzero((char *)buf + newsz, size - newsz); @@ -568,6 +563,7 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, while (size > 0) { uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); + int i; /* * NB: we could do this block-at-a-time, but it's nice @@ -803,9 +799,6 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); - if (err) - break; - offset += tocpy; size -= tocpy; } diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index e962c4b88..5a9d25b77 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -679,7 +679,7 @@ dmu_objset_create(const char *name, dmu_objset_type_t type, } int -dmu_objset_destroy(const char *name) +dmu_objset_destroy(const char *name, boolean_t defer) { objset_t *os; int error; @@ -696,7 +696,7 @@ dmu_objset_destroy(const char *name) dsl_dataset_t *ds = os->os->os_dsl_dataset; zil_destroy(dmu_objset_zil(os), B_FALSE); - error = dsl_dataset_destroy(ds, os); + error = dsl_dataset_destroy(ds, os, defer); /* * dsl_dataset_destroy() closes the ds. */ @@ -1130,7 +1130,7 @@ dmu_objset_userspace_upgrade(objset_t *os) */ for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { - dmu_tx_t *tx = dmu_tx_create(os); + dmu_tx_t *tx; dmu_buf_t *db; int objerr; @@ -1140,6 +1140,7 @@ dmu_objset_userspace_upgrade(objset_t *os) objerr = dmu_bonus_hold(os, obj, FTAG, &db); if (objerr) continue; + tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, obj); objerr = dmu_tx_assign(tx, TXG_WAIT); if (objerr) { diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 9ca3999dd..ce59aac50 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -393,6 +393,7 @@ recv_full_existing_check(void *arg1, void *arg2, dmu_tx_t *tx) dsl_dataset_t *ds = arg1; struct recvbeginsyncarg *rbsa = arg2; int err; + struct dsl_ds_destroyarg dsda = {0}; /* must be a head ds */ if (ds->ds_phys->ds_next_snap_obj != 0) @@ -402,7 +403,8 @@ recv_full_existing_check(void *arg1, void *arg2, dmu_tx_t *tx) if (dsl_dir_is_clone(ds->ds_dir)) return (EINVAL); - err = dsl_dataset_destroy_check(ds, rbsa->tag, tx); + dsda.ds = ds; + err = dsl_dataset_destroy_check(&dsda, rbsa->tag, tx); if (err) return (err); @@ -427,13 +429,16 @@ recv_full_existing_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dsl_dir_t *dd = ds->ds_dir; uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; uint64_t dsobj; + struct dsl_ds_destroyarg dsda = {0}; /* * NB: caller must provide an extra hold on the dsl_dir_t, so it * won't go away when dsl_dataset_destroy_sync() closes the * dataset. */ - dsl_dataset_destroy_sync(ds, rbsa->tag, cr, tx); + dsda.ds = ds; + dsl_dataset_destroy_sync(&dsda, rbsa->tag, cr, tx); + ASSERT3P(dsda.rm_origin, ==, NULL); dsobj = dsl_dataset_create_sync_dd(dd, rbsa->origin, flags, tx); @@ -483,7 +488,7 @@ recv_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx) /* ARGSUSED */ static void -recv_online_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +recv_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { dsl_dataset_t *ohds = arg1; struct recvbeginsyncarg *rbsa = arg2; @@ -513,27 +518,13 @@ recv_online_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dp->dp_spa, tx, cr, "dataset = %lld", dsobj); } -/* ARGSUSED */ -static void -recv_offline_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; - - spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC, - ds->ds_dir->dd_pool->dp_spa, tx, cr, "dataset = %lld", - ds->ds_object); -} - /* * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin() * succeeds; otherwise we will leak the holds on the datasets. */ int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, - boolean_t force, objset_t *origin, boolean_t online, dmu_recv_cookie_t *drc) + boolean_t force, objset_t *origin, dmu_recv_cookie_t *drc) { int err = 0; boolean_t byteswap; @@ -582,36 +573,8 @@ dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, /* * Process the begin in syncing context. */ - if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE) && !online) { - /* offline incremental receive */ - err = dsl_dataset_own(tofs, 0, dmu_recv_tag, &ds); - if (err) - return (err); - - /* - * Only do the rollback if the most recent snapshot - * matches the incremental source - */ - if (force) { - if (ds->ds_prev == NULL || - ds->ds_prev->ds_phys->ds_guid != - rbsa.fromguid) { - dsl_dataset_disown(ds, dmu_recv_tag); - return (ENODEV); - } - (void) dsl_dataset_rollback(ds, DMU_OST_NONE); - } - rbsa.force = B_FALSE; - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - recv_incremental_check, - recv_offline_incremental_sync, ds, &rbsa, 1); - if (err) { - dsl_dataset_disown(ds, dmu_recv_tag); - return (err); - } - drc->drc_logical_ds = drc->drc_real_ds = ds; - } else if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE)) { - /* online incremental receive */ + if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE)) { + /* incremental receive */ /* tmp clone name is: tofs/%tosnap" */ (void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname), @@ -622,11 +585,18 @@ dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, if (err) return (err); + /* must not have an incremental recv already in progress */ + if (!mutex_tryenter(&ds->ds_recvlock)) { + dsl_dataset_rele(ds, dmu_recv_tag); + return (EBUSY); + } + rbsa.force = force; err = dsl_sync_task_do(ds->ds_dir->dd_pool, recv_incremental_check, - recv_online_incremental_sync, ds, &rbsa, 5); + recv_incremental_sync, ds, &rbsa, 5); if (err) { + mutex_exit(&ds->ds_recvlock); dsl_dataset_rele(ds, dmu_recv_tag); return (err); } @@ -931,26 +901,6 @@ restore_free(struct restorearg *ra, objset_t *os, return (err); } -void -dmu_recv_abort_cleanup(dmu_recv_cookie_t *drc) -{ - if (drc->drc_newfs || drc->drc_real_ds != drc->drc_logical_ds) { - /* - * online incremental or new fs: destroy the fs (which - * may be a clone) that we created - */ - (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag); - if (drc->drc_real_ds != drc->drc_logical_ds) - dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag); - } else { - /* - * offline incremental: rollback to most recent snapshot. - */ - (void) dsl_dataset_rollback(drc->drc_real_ds, DMU_OST_NONE); - dsl_dataset_disown(drc->drc_real_ds, dmu_recv_tag); - } -} - /* * NB: callers *must* call dmu_recv_end() if this succeeds. */ @@ -1078,11 +1028,17 @@ out: if (ra.err != 0) { /* - * rollback or destroy what we created, so we don't - * leave it in the restoring state. + * destroy what we created, so we don't leave it in the + * inconsistent restoring state. */ txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0); - dmu_recv_abort_cleanup(drc); + + (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, + B_FALSE); + if (drc->drc_real_ds != drc->drc_logical_ds) { + mutex_exit(&drc->drc_logical_ds->ds_recvlock); + dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag); + } } kmem_free(ra.buf, ra.bufsize); @@ -1149,7 +1105,9 @@ dmu_recv_end(dmu_recv_cookie_t *drc) dsl_dataset_rele(ds, dmu_recv_tag); } /* dsl_dataset_destroy() will disown the ds */ - (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag); + (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, + B_FALSE); + mutex_exit(&drc->drc_logical_ds->ds_recvlock); if (err) return (err); } @@ -1163,7 +1121,8 @@ dmu_recv_end(dmu_recv_cookie_t *drc) if (err) { if (drc->drc_newfs) { ASSERT(ds == drc->drc_real_ds); - (void) dsl_dataset_destroy(ds, dmu_recv_tag); + (void) dsl_dataset_destroy(ds, dmu_recv_tag, + B_FALSE); return (err); } else { (void) dsl_dataset_rollback(ds, DMU_OST_NONE); diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index af2b049a7..c6fbeeef0 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -697,8 +697,7 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) } err = zap_count_write(&dn->dn_objset->os, dn->dn_object, name, add, - &txh->txh_space_towrite, &txh->txh_space_tooverwrite, - txh->txh_dnode->dn_datablkshift); + &txh->txh_space_towrite, &txh->txh_space_tooverwrite); /* * If the modified blocks are scattered to the four winds, diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index cf49b97f1..d82e72a14 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -1260,6 +1260,22 @@ dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx) dmu_tx_willuse_space(tx, space); } +/* + * This function scans a block at the indicated "level" looking for + * a hole or data (depending on 'flags'). If level > 0, then we are + * scanning an indirect block looking at its pointers. If level == 0, + * then we are looking at a block of dnodes. If we don't find what we + * are looking for in the block, we return ESRCH. Otherwise, return + * with *offset pointing to the beginning (if searching forwards) or + * end (if searching backwards) of the range covered by the block + * pointer we matched on (or dnode). + * + * The basic search algorithm used below by dnode_next_offset() is to + * use this function to search up the block tree (widen the search) until + * we find something (i.e., we don't return ESRCH) and then search back + * down the tree (narrow the search) until we reach our original search + * level. + */ static int dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, int lvl, uint64_t blkfill, uint64_t txg) @@ -1330,6 +1346,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, error = ESRCH; } else { blkptr_t *bp = data; + uint64_t start = *offset; span = (lvl - 1) * epbs + dn->dn_datablkshift; minfill = 0; maxfill = blkfill << ((lvl - 1) * epbs); @@ -1339,18 +1356,25 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, else minfill++; - for (i = (*offset >> span) & ((1ULL << epbs) - 1); + *offset = *offset >> span; + for (i = BF64_GET(*offset, 0, epbs); i >= 0 && i < epb; i += inc) { if (bp[i].blk_fill >= minfill && bp[i].blk_fill <= maxfill && (hole || bp[i].blk_birth > txg)) break; - if (inc < 0 && *offset < (1ULL << span)) - *offset = 0; - else - *offset += (1ULL << span) * inc; + if (inc > 0 || *offset > 0) + *offset += inc; + } + *offset = *offset << span; + if (inc < 0) { + /* traversing backwards; position offset at the end */ + ASSERT3U(*offset, <=, start); + *offset = MIN(*offset + (1ULL << span) - 1, start); + } else if (*offset < start) { + *offset = start; } - if (i < 0 || i == epb) + if (i < 0 || i >= epb) error = ESRCH; } diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 0fe7eb583..edc36e72b 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -39,6 +39,7 @@ #include <sys/spa.h> #include <sys/zfs_znode.h> #include <sys/sunddi.h> +#include <sys/zvol.h> static char *dsl_reaper = "the grim reaper"; @@ -262,6 +263,7 @@ dsl_dataset_evict(dmu_buf_t *db, void *dsv) ASSERT(!list_link_active(&ds->ds_synced_link)); mutex_destroy(&ds->ds_lock); + mutex_destroy(&ds->ds_recvlock); mutex_destroy(&ds->ds_opening_lock); mutex_destroy(&ds->ds_deadlist.bpl_lock); rw_destroy(&ds->ds_rwlock); @@ -359,6 +361,7 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, ds->ds_phys = dbuf->db_data; mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ds->ds_deadlist.bpl_lock, NULL, MUTEX_DEFAULT, NULL); @@ -377,6 +380,7 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, * just opened it. */ mutex_destroy(&ds->ds_lock); + mutex_destroy(&ds->ds_recvlock); mutex_destroy(&ds->ds_opening_lock); mutex_destroy(&ds->ds_deadlist.bpl_lock); rw_destroy(&ds->ds_rwlock); @@ -406,8 +410,15 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, dsl_dataset_rele(origin, FTAG); } } - } else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) { - err = dsl_dataset_get_snapname(ds); + } else { + if (zfs_flags & ZFS_DEBUG_SNAPNAMES) + err = dsl_dataset_get_snapname(ds); + if (err == 0 && ds->ds_phys->ds_userrefs_obj != 0) { + err = zap_count( + ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_phys->ds_userrefs_obj, + &ds->ds_userrefs); + } } if (err == 0 && !dsl_dataset_is_snapshot(ds)) { @@ -448,6 +459,7 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, dsl_dataset_drop_ref(ds->ds_prev, ds); dsl_dir_close(ds->ds_dir, ds); mutex_destroy(&ds->ds_lock); + mutex_destroy(&ds->ds_recvlock); mutex_destroy(&ds->ds_opening_lock); mutex_destroy(&ds->ds_deadlist.bpl_lock); rw_destroy(&ds->ds_rwlock); @@ -845,6 +857,7 @@ struct destroyarg { dsl_sync_task_group_t *dstg; char *snapname; char *failed; + boolean_t defer; }; static int @@ -852,23 +865,30 @@ dsl_snapshot_destroy_one(char *name, void *arg) { struct destroyarg *da = arg; dsl_dataset_t *ds; - char *cp; int err; - - (void) strcat(name, "@"); - (void) strcat(name, da->snapname); - err = dsl_dataset_own(name, DS_MODE_READONLY | DS_MODE_INCONSISTENT, + char *dsname; + size_t buflen; + + /* alloc a buffer to hold name@snapname, plus the terminating NULL */ + buflen = strlen(name) + strlen(da->snapname) + 2; + dsname = kmem_alloc(buflen, KM_SLEEP); + (void) snprintf(dsname, buflen, "%s@%s", name, da->snapname); + err = dsl_dataset_own(dsname, DS_MODE_READONLY | DS_MODE_INCONSISTENT, da->dstg, &ds); - cp = strchr(name, '@'); - *cp = '\0'; + kmem_free(dsname, buflen); if (err == 0) { + struct dsl_ds_destroyarg *dsda; + dsl_dataset_make_exclusive(ds, da->dstg); if (ds->ds_user_ptr) { ds->ds_user_evict_func(ds, ds->ds_user_ptr); ds->ds_user_ptr = NULL; } + dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg), KM_SLEEP); + dsda->ds = ds; + dsda->defer = da->defer; dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check, - dsl_dataset_destroy_sync, ds, da->dstg, 0); + dsl_dataset_destroy_sync, dsda, da->dstg, 0); } else if (err == ENOENT) { err = 0; } else { @@ -882,7 +902,7 @@ dsl_snapshot_destroy_one(char *name, void *arg) */ #pragma weak dmu_snapshots_destroy = dsl_snapshots_destroy int -dsl_snapshots_destroy(char *fsname, char *snapname) +dsl_snapshots_destroy(char *fsname, char *snapname, boolean_t defer) { int err; struct destroyarg da; @@ -895,6 +915,7 @@ dsl_snapshots_destroy(char *fsname, char *snapname) da.dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); da.snapname = snapname; da.failed = fsname; + da.defer = defer; err = dmu_objset_find(fsname, dsl_snapshot_destroy_one, &da, DS_FIND_CHILDREN); @@ -904,7 +925,9 @@ dsl_snapshots_destroy(char *fsname, char *snapname) for (dst = list_head(&da.dstg->dstg_tasks); dst; dst = list_next(&da.dstg->dstg_tasks, dst)) { - dsl_dataset_t *ds = dst->dst_arg1; + struct dsl_ds_destroyarg *dsda = dst->dst_arg1; + dsl_dataset_t *ds = dsda->ds; + /* * Return the file system name that triggered the error */ @@ -912,7 +935,9 @@ dsl_snapshots_destroy(char *fsname, char *snapname) dsl_dataset_name(ds, fsname); *strchr(fsname, '@') = '\0'; } + ASSERT3P(dsda->rm_origin, ==, NULL); dsl_dataset_disown(ds, da.dstg); + kmem_free(dsda, sizeof (struct dsl_ds_destroyarg)); } dsl_sync_task_group_destroy(da.dstg); @@ -920,18 +945,100 @@ dsl_snapshots_destroy(char *fsname, char *snapname) return (err); } +static boolean_t +dsl_dataset_might_destroy_origin(dsl_dataset_t *ds) +{ + boolean_t might_destroy = B_FALSE; + + mutex_enter(&ds->ds_lock); + if (ds->ds_phys->ds_num_children == 2 && ds->ds_userrefs == 0 && + DS_IS_DEFER_DESTROY(ds)) + might_destroy = B_TRUE; + mutex_exit(&ds->ds_lock); + + return (might_destroy); +} + +#ifdef _KERNEL +static int +dsl_dataset_zvol_cleanup(dsl_dataset_t *ds, const char *name) +{ + int error; + objset_t *os; + + error = dmu_objset_open_ds(ds, DMU_OST_ANY, &os); + if (error) + return (error); + + if (dmu_objset_type(os) == DMU_OST_ZVOL) + error = zvol_remove_minor(name); + dmu_objset_close(os); + + return (error); +} +#endif + +/* + * If we're removing a clone, and these three conditions are true: + * 1) the clone's origin has no other children + * 2) the clone's origin has no user references + * 3) the clone's origin has been marked for deferred destruction + * Then, prepare to remove the origin as part of this sync task group. + */ +static int +dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag) +{ + dsl_dataset_t *ds = dsda->ds; + dsl_dataset_t *origin = ds->ds_prev; + + if (dsl_dataset_might_destroy_origin(origin)) { + char *name; + int namelen; + int error; + + namelen = dsl_dataset_namelen(origin) + 1; + name = kmem_alloc(namelen, KM_SLEEP); + dsl_dataset_name(origin, name); +#ifdef _KERNEL + error = zfs_unmount_snap(name, NULL); + if (error) { + kmem_free(name, namelen); + return (error); + } + error = dsl_dataset_zvol_cleanup(origin, name); + if (error) { + kmem_free(name, namelen); + return (error); + } +#endif + error = dsl_dataset_own(name, + DS_MODE_READONLY | DS_MODE_INCONSISTENT, + tag, &origin); + kmem_free(name, namelen); + if (error) + return (error); + dsda->rm_origin = origin; + dsl_dataset_make_exclusive(origin, tag); + } + + return (0); +} + /* * ds must be opened as OWNER. On return (whether successful or not), * ds will be closed and caller can no longer dereference it. */ int -dsl_dataset_destroy(dsl_dataset_t *ds, void *tag) +dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) { int err; dsl_sync_task_group_t *dstg; objset_t *os; dsl_dir_t *dd; uint64_t obj; + struct dsl_ds_destroyarg dsda = {0}; + + dsda.ds = ds; if (dsl_dataset_is_snapshot(ds)) { /* Destroying a snapshot is simpler */ @@ -941,9 +1048,12 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag) ds->ds_user_evict_func(ds, ds->ds_user_ptr); ds->ds_user_ptr = NULL; } + /* NOTE: defer is always B_FALSE for non-snapshots */ + dsda.defer = defer; err = dsl_sync_task_do(ds->ds_dir->dd_pool, dsl_dataset_destroy_check, dsl_dataset_destroy_sync, - ds, tag, 0); + &dsda, tag, 0); + ASSERT3P(dsda.rm_origin, ==, NULL); goto out; } @@ -1024,13 +1134,45 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag) ds->ds_user_evict_func(ds, ds->ds_user_ptr); ds->ds_user_ptr = NULL; } - dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool); - dsl_sync_task_create(dstg, dsl_dataset_destroy_check, - dsl_dataset_destroy_sync, ds, tag, 0); - dsl_sync_task_create(dstg, dsl_dir_destroy_check, - dsl_dir_destroy_sync, dd, FTAG, 0); - err = dsl_sync_task_group_wait(dstg); - dsl_sync_task_group_destroy(dstg); + + /* + * If we're removing a clone, we might also need to remove its + * origin. + */ + do { + dsda.need_prep = B_FALSE; + if (dsl_dir_is_clone(dd)) { + err = dsl_dataset_origin_rm_prep(&dsda, tag); + if (err) { + dsl_dir_close(dd, FTAG); + goto out; + } + } + + dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool); + dsl_sync_task_create(dstg, dsl_dataset_destroy_check, + dsl_dataset_destroy_sync, &dsda, tag, 0); + dsl_sync_task_create(dstg, dsl_dir_destroy_check, + dsl_dir_destroy_sync, dd, FTAG, 0); + err = dsl_sync_task_group_wait(dstg); + dsl_sync_task_group_destroy(dstg); + + /* + * We could be racing against 'zfs release' or 'zfs destroy -d' + * on the origin snap, in which case we can get EBUSY if we + * needed to destroy the origin snap but were not ready to + * do so. + */ + if (dsda.need_prep) { + ASSERT(err == EBUSY); + ASSERT(dsl_dir_is_clone(dd)); + ASSERT(dsda.rm_origin == NULL); + } + } while (dsda.need_prep); + + if (dsda.rm_origin != NULL) + dsl_dataset_disown(dsda.rm_origin, tag); + /* if it is successful, dsl_dir_destroy_sync will close the dd */ if (err) dsl_dir_close(dd, FTAG); @@ -1211,7 +1353,8 @@ dsl_dataset_rollback_check(void *arg1, void *arg2, dmu_tx_t *tx) /* * We can only roll back to emptyness if it is a ZPL objset. */ - if (*ost != DMU_OST_ZFS && ds->ds_phys->ds_prev_snap_txg == 0) + if (*ost != DMU_OST_ZFS && + ds->ds_phys->ds_prev_snap_txg < TXG_INITIAL) return (EINVAL); /* @@ -1316,6 +1459,7 @@ dsl_dataset_rollback_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) } else { objset_impl_t *osi; + ASSERT(*ost != DMU_OST_ZVOL); ASSERT3U(ds->ds_phys->ds_used_bytes, ==, 0); ASSERT3U(ds->ds_phys->ds_compressed_bytes, ==, 0); ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, ==, 0); @@ -1385,18 +1529,63 @@ dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) cr, "dataset = %llu", ds->ds_object); } +static int +dsl_dataset_origin_check(struct dsl_ds_destroyarg *dsda, void *tag, + dmu_tx_t *tx) +{ + dsl_dataset_t *ds = dsda->ds; + dsl_dataset_t *ds_prev = ds->ds_prev; + + if (dsl_dataset_might_destroy_origin(ds_prev)) { + struct dsl_ds_destroyarg ndsda = {0}; + + /* + * If we're not prepared to remove the origin, don't remove + * the clone either. + */ + if (dsda->rm_origin == NULL) { + dsda->need_prep = B_TRUE; + return (EBUSY); + } + + ndsda.ds = ds_prev; + ndsda.is_origin_rm = B_TRUE; + return (dsl_dataset_destroy_check(&ndsda, tag, tx)); + } + + /* + * If we're not going to remove the origin after all, + * undo the open context setup. + */ + if (dsda->rm_origin != NULL) { + dsl_dataset_disown(dsda->rm_origin, tag); + dsda->rm_origin = NULL; + } + + return (0); +} + /* ARGSUSED */ int dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; + struct dsl_ds_destroyarg *dsda = arg1; + dsl_dataset_t *ds = dsda->ds; /* we have an owner hold, so noone else can destroy us */ ASSERT(!DSL_DATASET_IS_DESTROYED(ds)); - /* Can't delete a branch point. */ - if (ds->ds_phys->ds_num_children > 1) - return (EEXIST); + /* + * Only allow deferred destroy on pools that support it. + * NOTE: deferred destroy is only supported on snapshots. + */ + if (dsda->defer) { + if (spa_version(ds->ds_dir->dd_pool->dp_spa) < + SPA_VERSION_USERREFS) + return (ENOTSUP); + ASSERT(dsl_dataset_is_snapshot(ds)); + return (0); + } /* * Can't delete a head dataset if there are snapshots of it. @@ -1414,6 +1603,31 @@ dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) return (EAGAIN); + if (dsl_dataset_is_snapshot(ds)) { + /* + * If this snapshot has an elevated user reference count, + * we can't destroy it yet. + */ + if (ds->ds_userrefs > 0 && !dsda->releasing) + return (EBUSY); + + mutex_enter(&ds->ds_lock); + /* + * Can't delete a branch point. However, if we're destroying + * a clone and removing its origin due to it having a user + * hold count of 0 and having been marked for deferred destroy, + * it's OK for the origin to have a single clone. + */ + if (ds->ds_phys->ds_num_children > + (dsda->is_origin_rm ? 2 : 1)) { + mutex_exit(&ds->ds_lock); + return (EEXIST); + } + mutex_exit(&ds->ds_lock); + } else if (dsl_dir_is_clone(ds->ds_dir)) { + return (dsl_dataset_origin_check(dsda, arg2, tx)); + } + /* XXX we should do some i/o error checking... */ return (0); } @@ -1461,7 +1675,8 @@ dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag) void dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; + struct dsl_ds_destroyarg *dsda = arg1; + dsl_dataset_t *ds = dsda->ds; zio_t *zio; int err; int after_branch_point = FALSE; @@ -1471,11 +1686,20 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) uint64_t obj; ASSERT(ds->ds_owner); - ASSERT3U(ds->ds_phys->ds_num_children, <=, 1); + ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1); ASSERT(ds->ds_prev == NULL || ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object); ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg); + if (dsda->defer) { + ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); + if (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1) { + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY; + return; + } + } + /* signal any waiters that this dataset is going away */ mutex_enter(&ds->ds_lock); ds->ds_owner = dsl_reaper; @@ -1521,6 +1745,20 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) /* This clone is toast. */ ASSERT(ds_prev->ds_phys->ds_num_children > 1); ds_prev->ds_phys->ds_num_children--; + + /* + * If the clone's origin has no other clones, no + * user holds, and has been marked for deferred + * deletion, then we should have done the necessary + * destroy setup for it. + */ + if (ds_prev->ds_phys->ds_num_children == 1 && + ds_prev->ds_userrefs == 0 && + DS_IS_DEFER_DESTROY(ds_prev)) { + ASSERT3P(dsda->rm_origin, !=, NULL); + } else { + ASSERT3P(dsda->rm_origin, ==, NULL); + } } else if (!after_branch_point) { ds_prev->ds_phys->ds_next_snap_obj = ds->ds_phys->ds_next_snap_obj; @@ -1733,10 +1971,32 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) } if (ds->ds_phys->ds_props_obj != 0) VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx)); + if (ds->ds_phys->ds_userrefs_obj != 0) + VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx)); dsl_dir_close(ds->ds_dir, ds); ds->ds_dir = NULL; dsl_dataset_drain_refs(ds, tag); VERIFY(0 == dmu_object_free(mos, obj, tx)); + + if (dsda->rm_origin) { + /* + * Remove the origin of the clone we just destroyed. + */ + dsl_dataset_t *origin = ds->ds_prev; + struct dsl_ds_destroyarg ndsda = {0}; + + ASSERT3P(origin, ==, dsda->rm_origin); + if (origin->ds_user_ptr) { + origin->ds_user_evict_func(origin, origin->ds_user_ptr); + origin->ds_user_ptr = NULL; + } + + dsl_dataset_rele(origin, tag); + ds->ds_prev = NULL; + + ndsda.ds = origin; + dsl_dataset_destroy_sync(&ndsda, tag, cr, tx); + } } static int @@ -1951,6 +2211,9 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) ds->ds_reserved); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID, ds->ds_phys->ds_guid); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS, ds->ds_userrefs); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY, + DS_IS_DEFER_DESTROY(ds) ? 1 : 0); if (ds->ds_phys->ds_next_snap_obj) { /* @@ -3019,7 +3282,7 @@ dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) ds->ds_quota = new_quota; - dsl_prop_set_uint64_sync(ds->ds_dir, "refquota", new_quota, cr, tx); + dsl_dir_prop_set_uint64_sync(ds->ds_dir, "refquota", new_quota, cr, tx); spa_history_internal_log(LOG_DS_REFQUOTA, ds->ds_dir->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu ", @@ -3114,7 +3377,7 @@ dsl_dataset_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx); mutex_exit(&ds->ds_dir->dd_lock); - dsl_prop_set_uint64_sync(ds->ds_dir, "refreservation", + dsl_dir_prop_set_uint64_sync(ds->ds_dir, "refreservation", new_reservation, cr, tx); spa_history_internal_log(LOG_DS_REFRESERV, @@ -3138,3 +3401,421 @@ dsl_dataset_set_reservation(const char *dsname, uint64_t reservation) dsl_dataset_rele(ds, FTAG); return (err); } + +static int +dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + char *htag = arg2; + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + int error = 0; + + if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) + return (ENOTSUP); + + if (!dsl_dataset_is_snapshot(ds)) + return (EINVAL); + + if (strlen(htag) >= ZAP_MAXNAMELEN) + return (ENAMETOOLONG); + + /* tags must be unique */ + mutex_enter(&ds->ds_lock); + if (ds->ds_phys->ds_userrefs_obj) { + error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, htag, + 8, 1, tx); + if (error == 0) + error = EEXIST; + else if (error == ENOENT) + error = 0; + } + mutex_exit(&ds->ds_lock); + + return (error); +} + +static void +dsl_dataset_user_hold_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + char *htag = arg2; + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + time_t now = gethrestime_sec(); + uint64_t zapobj; + + mutex_enter(&ds->ds_lock); + if (ds->ds_phys->ds_userrefs_obj == 0) { + /* + * This is the first user hold for this dataset. Create + * the userrefs zap object. + */ + dmu_buf_will_dirty(ds->ds_dbuf, tx); + zapobj = ds->ds_phys->ds_userrefs_obj = + zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx); + } else { + zapobj = ds->ds_phys->ds_userrefs_obj; + } + ds->ds_userrefs++; + mutex_exit(&ds->ds_lock); + + VERIFY(0 == zap_add(mos, zapobj, htag, 8, 1, &now, tx)); + + spa_history_internal_log(LOG_DS_USER_HOLD, + ds->ds_dir->dd_pool->dp_spa, tx, cr, "<%s> dataset = %llu", + htag, ds->ds_object); +} + +struct dsl_ds_holdarg { + dsl_sync_task_group_t *dstg; + char *htag; + char *snapname; + boolean_t recursive; + char failed[MAXPATHLEN]; +}; + +static int +dsl_dataset_user_hold_one(char *dsname, void *arg) +{ + struct dsl_ds_holdarg *ha = arg; + dsl_dataset_t *ds; + int error; + char *name; + size_t buflen; + + /* alloc a buffer to hold dsname@snapname plus terminating NULL */ + buflen = strlen(dsname) + strlen(ha->snapname) + 2; + name = kmem_alloc(buflen, KM_SLEEP); + (void) snprintf(name, buflen, "%s@%s", dsname, ha->snapname); + error = dsl_dataset_hold(name, ha->dstg, &ds); + kmem_free(name, buflen); + if (error == 0) { + dsl_sync_task_create(ha->dstg, dsl_dataset_user_hold_check, + dsl_dataset_user_hold_sync, ds, ha->htag, 0); + } else if (error == ENOENT && ha->recursive) { + error = 0; + } else { + (void) strcpy(ha->failed, dsname); + } + return (error); +} + +int +dsl_dataset_user_hold(char *dsname, char *snapname, char *htag, + boolean_t recursive) +{ + struct dsl_ds_holdarg *ha; + dsl_sync_task_t *dst; + spa_t *spa; + int error; + + ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); + + (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); + + error = spa_open(dsname, &spa, FTAG); + if (error) { + kmem_free(ha, sizeof (struct dsl_ds_holdarg)); + return (error); + } + + ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); + ha->htag = htag; + ha->snapname = snapname; + ha->recursive = recursive; + if (recursive) { + error = dmu_objset_find(dsname, dsl_dataset_user_hold_one, + ha, DS_FIND_CHILDREN); + } else { + error = dsl_dataset_user_hold_one(dsname, ha); + } + if (error == 0) + error = dsl_sync_task_group_wait(ha->dstg); + + for (dst = list_head(&ha->dstg->dstg_tasks); dst; + dst = list_next(&ha->dstg->dstg_tasks, dst)) { + dsl_dataset_t *ds = dst->dst_arg1; + + if (dst->dst_err) { + dsl_dataset_name(ds, ha->failed); + *strchr(ha->failed, '@') = '\0'; + } + dsl_dataset_rele(ds, ha->dstg); + } + + if (error) + (void) strcpy(dsname, ha->failed); + + dsl_sync_task_group_destroy(ha->dstg); + kmem_free(ha, sizeof (struct dsl_ds_holdarg)); + spa_close(spa, FTAG); + return (error); +} + +struct dsl_ds_releasearg { + dsl_dataset_t *ds; + const char *htag; + boolean_t own; /* do we own or just hold ds? */ +}; + +static int +dsl_dataset_release_might_destroy(dsl_dataset_t *ds, const char *htag, + boolean_t *might_destroy) +{ + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + uint64_t zapobj; + uint64_t tmp; + int error; + + *might_destroy = B_FALSE; + + mutex_enter(&ds->ds_lock); + zapobj = ds->ds_phys->ds_userrefs_obj; + if (zapobj == 0) { + /* The tag can't possibly exist */ + mutex_exit(&ds->ds_lock); + return (ESRCH); + } + + /* Make sure the tag exists */ + error = zap_lookup(mos, zapobj, htag, 8, 1, &tmp); + if (error) { + mutex_exit(&ds->ds_lock); + if (error == ENOENT) + error = ESRCH; + return (error); + } + + if (ds->ds_userrefs == 1 && ds->ds_phys->ds_num_children == 1 && + DS_IS_DEFER_DESTROY(ds)) + *might_destroy = B_TRUE; + + mutex_exit(&ds->ds_lock); + return (0); +} + +static int +dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx) +{ + struct dsl_ds_releasearg *ra = arg1; + dsl_dataset_t *ds = ra->ds; + boolean_t might_destroy; + int error; + + if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) + return (ENOTSUP); + + error = dsl_dataset_release_might_destroy(ds, ra->htag, &might_destroy); + if (error) + return (error); + + if (might_destroy) { + struct dsl_ds_destroyarg dsda = {0}; + + if (dmu_tx_is_syncing(tx)) { + /* + * If we're not prepared to remove the snapshot, + * we can't allow the release to happen right now. + */ + if (!ra->own) + return (EBUSY); + if (ds->ds_user_ptr) { + ds->ds_user_evict_func(ds, ds->ds_user_ptr); + ds->ds_user_ptr = NULL; + } + } + dsda.ds = ds; + dsda.releasing = B_TRUE; + return (dsl_dataset_destroy_check(&dsda, tag, tx)); + } + + return (0); +} + +static void +dsl_dataset_user_release_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) +{ + struct dsl_ds_releasearg *ra = arg1; + dsl_dataset_t *ds = ra->ds; + spa_t *spa = ds->ds_dir->dd_pool->dp_spa; + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + uint64_t zapobj; + uint64_t dsobj = ds->ds_object; + uint64_t refs; + + mutex_enter(&ds->ds_lock); + ds->ds_userrefs--; + refs = ds->ds_userrefs; + mutex_exit(&ds->ds_lock); + zapobj = ds->ds_phys->ds_userrefs_obj; + VERIFY(0 == zap_remove(mos, zapobj, ra->htag, tx)); + if (ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 && + DS_IS_DEFER_DESTROY(ds)) { + struct dsl_ds_destroyarg dsda = {0}; + + ASSERT(ra->own); + dsda.ds = ds; + dsda.releasing = B_TRUE; + /* We already did the destroy_check */ + dsl_dataset_destroy_sync(&dsda, tag, cr, tx); + } + + spa_history_internal_log(LOG_DS_USER_RELEASE, + spa, tx, cr, "<%s> %lld dataset = %llu", + ra->htag, (longlong_t)refs, dsobj); +} + +static int +dsl_dataset_user_release_one(char *dsname, void *arg) +{ + struct dsl_ds_holdarg *ha = arg; + struct dsl_ds_releasearg *ra; + dsl_dataset_t *ds; + int error; + void *dtag = ha->dstg; + char *name; + size_t buflen; + boolean_t own = B_FALSE; + boolean_t might_destroy; + + if (strlen(ha->htag) >= ZAP_MAXNAMELEN) + return (ENAMETOOLONG); + + /* alloc a buffer to hold dsname@snapname, plus the terminating NULL */ + buflen = strlen(dsname) + strlen(ha->snapname) + 2; + name = kmem_alloc(buflen, KM_SLEEP); + (void) snprintf(name, buflen, "%s@%s", dsname, ha->snapname); + error = dsl_dataset_hold(name, dtag, &ds); + kmem_free(name, buflen); + if (error == ENOENT && ha->recursive) + return (0); + (void) strcpy(ha->failed, dsname); + if (error) + return (error); + + ASSERT(dsl_dataset_is_snapshot(ds)); + + error = dsl_dataset_release_might_destroy(ds, ha->htag, &might_destroy); + if (error) { + dsl_dataset_rele(ds, dtag); + return (error); + } + + if (might_destroy) { +#ifdef _KERNEL + error = zfs_unmount_snap(name, NULL); + if (error) { + dsl_dataset_rele(ds, dtag); + return (error); + } + error = dsl_dataset_zvol_cleanup(ds, name); + if (error) { + dsl_dataset_rele(ds, dtag); + return (error); + } +#endif + if (!dsl_dataset_tryown(ds, + DS_MODE_READONLY | DS_MODE_INCONSISTENT, dtag)) { + dsl_dataset_rele(ds, dtag); + return (EBUSY); + } else { + own = B_TRUE; + dsl_dataset_make_exclusive(ds, dtag); + } + } + + ra = kmem_alloc(sizeof (struct dsl_ds_releasearg), KM_SLEEP); + ra->ds = ds; + ra->htag = ha->htag; + ra->own = own; + dsl_sync_task_create(ha->dstg, dsl_dataset_user_release_check, + dsl_dataset_user_release_sync, ra, dtag, 0); + + return (0); +} + +int +dsl_dataset_user_release(char *dsname, char *snapname, char *htag, + boolean_t recursive) +{ + struct dsl_ds_holdarg *ha; + dsl_sync_task_t *dst; + spa_t *spa; + int error; + + ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); + + (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); + + error = spa_open(dsname, &spa, FTAG); + if (error) { + kmem_free(ha, sizeof (struct dsl_ds_holdarg)); + return (error); + } + + ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); + ha->htag = htag; + ha->snapname = snapname; + ha->recursive = recursive; + if (recursive) { + error = dmu_objset_find(dsname, dsl_dataset_user_release_one, + ha, DS_FIND_CHILDREN); + } else { + error = dsl_dataset_user_release_one(dsname, ha); + } + if (error == 0) + error = dsl_sync_task_group_wait(ha->dstg); + + for (dst = list_head(&ha->dstg->dstg_tasks); dst; + dst = list_next(&ha->dstg->dstg_tasks, dst)) { + struct dsl_ds_releasearg *ra = dst->dst_arg1; + dsl_dataset_t *ds = ra->ds; + + if (dst->dst_err) + dsl_dataset_name(ds, ha->failed); + + if (ra->own) + dsl_dataset_disown(ds, ha->dstg); + else + dsl_dataset_rele(ds, ha->dstg); + + kmem_free(ra, sizeof (struct dsl_ds_releasearg)); + } + + if (error) + (void) strcpy(dsname, ha->failed); + + dsl_sync_task_group_destroy(ha->dstg); + kmem_free(ha, sizeof (struct dsl_ds_holdarg)); + spa_close(spa, FTAG); + return (error); +} + +int +dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp) +{ + dsl_dataset_t *ds; + int err; + + err = dsl_dataset_hold(dsname, FTAG, &ds); + if (err) + return (err); + + VERIFY(0 == nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP)); + if (ds->ds_phys->ds_userrefs_obj != 0) { + zap_attribute_t *za; + zap_cursor_t zc; + + za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); + for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_phys->ds_userrefs_obj); + zap_cursor_retrieve(&zc, za) == 0; + zap_cursor_advance(&zc)) { + VERIFY(0 == nvlist_add_uint64(*nvp, za->za_name, + za->za_first_integer)); + } + zap_cursor_fini(&zc); + kmem_free(za, sizeof (zap_attribute_t)); + } + dsl_dataset_rele(ds, FTAG); + return (0); +} diff --git a/module/zfs/dsl_deleg.c b/module/zfs/dsl_deleg.c index da5d15787..5d76ff5f9 100644 --- a/module/zfs/dsl_deleg.c +++ b/module/zfs/dsl_deleg.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -66,8 +66,6 @@ * The ZAP OBJ is referred to as the jump object. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/dmu.h> #include <sys/dmu_objset.h> #include <sys/dmu_tx.h> @@ -540,7 +538,7 @@ dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr) dsl_pool_t *dp; void *cookie; int error; - char checkflag = ZFS_DELEG_LOCAL; + char checkflag; objset_t *mos; avl_tree_t permsets; perm_set_t *setnode; @@ -563,6 +561,16 @@ dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr) return (EPERM); } + if (dsl_dataset_is_snapshot(ds)) { + /* + * Snapshots are treated as descendents only, + * local permissions do not apply. + */ + checkflag = ZFS_DELEG_DESCENDENT; + } else { + checkflag = ZFS_DELEG_LOCAL; + } + avl_create(&permsets, perm_set_compare, sizeof (perm_set_t), offsetof(perm_set_t, p_node)); diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c index f19653d92..2f312ae34 100644 --- a/module/zfs/dsl_dir.c +++ b/module/zfs/dsl_dir.c @@ -96,7 +96,6 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, #endif if (dd == NULL) { dsl_dir_t *winner; - int err; dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP); dd->dd_object = ddobj; diff --git a/module/zfs/dsl_prop.c b/module/zfs/dsl_prop.c index 664ccff45..bfc0fa87e 100644 --- a/module/zfs/dsl_prop.c +++ b/module/zfs/dsl_prop.c @@ -442,7 +442,7 @@ dsl_props_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) } void -dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, +dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, cred_t *cr, dmu_tx_t *tx) { objset_t *mos = dd->dd_pool->dp_meta_objset; diff --git a/module/zfs/dsl_scrub.c b/module/zfs/dsl_scrub.c index 8a802b53a..03ebb90bb 100644 --- a/module/zfs/dsl_scrub.c +++ b/module/zfs/dsl_scrub.c @@ -1024,6 +1024,8 @@ dsl_pool_scrub_clean_cb(dsl_pool_t *dp, int dsl_pool_scrub_clean(dsl_pool_t *dp) { + spa_t *spa = dp->dp_spa; + /* * Purge all vdev caches. We do this here rather than in sync * context because this requires a writer lock on the spa_config @@ -1031,11 +1033,11 @@ dsl_pool_scrub_clean(dsl_pool_t *dp) * spa_scrub_reopen flag indicates that vdev_open() should not * attempt to start another scrub. */ - spa_config_enter(dp->dp_spa, SCL_ALL, FTAG, RW_WRITER); - dp->dp_spa->spa_scrub_reopen = B_TRUE; - vdev_reopen(dp->dp_spa->spa_root_vdev); - dp->dp_spa->spa_scrub_reopen = B_FALSE; - spa_config_exit(dp->dp_spa, SCL_ALL, FTAG); + spa_vdev_state_enter(spa); + spa->spa_scrub_reopen = B_TRUE; + vdev_reopen(spa->spa_root_vdev); + spa->spa_scrub_reopen = B_FALSE; + (void) spa_vdev_state_exit(spa, NULL, 0); return (dsl_pool_scrub_setup(dp, SCRUB_FUNC_CLEAN)); } diff --git a/module/zfs/include/sys/dmu.h b/module/zfs/include/sys/dmu.h index a363d7032..3ff71b3b7 100644 --- a/module/zfs/include/sys/dmu.h +++ b/module/zfs/include/sys/dmu.h @@ -117,6 +117,7 @@ typedef enum dmu_object_type { DMU_OT_SCRUB_QUEUE, /* ZAP */ DMU_OT_USERGROUP_USED, /* ZAP */ DMU_OT_USERGROUP_QUOTA, /* ZAP */ + DMU_OT_USERREFS, /* ZAP */ DMU_OT_NUMTYPES } dmu_object_type_t; @@ -174,8 +175,8 @@ int dmu_objset_evict_dbufs(objset_t *os); int dmu_objset_create(const char *name, dmu_objset_type_t type, objset_t *clone_parent, uint64_t flags, void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg); -int dmu_objset_destroy(const char *name); -int dmu_snapshots_destroy(char *fsname, char *snapname); +int dmu_objset_destroy(const char *name, boolean_t defer); +int dmu_snapshots_destroy(char *fsname, char *snapname, boolean_t defer); int dmu_objset_rollback(objset_t *os); int dmu_objset_snapshot(char *fsname, char *snapname, struct nvlist *props, boolean_t recursive); @@ -646,10 +647,9 @@ typedef struct dmu_recv_cookie { } dmu_recv_cookie_t; int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *, - boolean_t force, objset_t *origin, boolean_t online, dmu_recv_cookie_t *); + boolean_t force, objset_t *origin, dmu_recv_cookie_t *); int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp); int dmu_recv_end(dmu_recv_cookie_t *drc); -void dmu_recv_abort_cleanup(dmu_recv_cookie_t *drc); /* CRC64 table */ #define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */ diff --git a/module/zfs/include/sys/dmu_impl.h b/module/zfs/include/sys/dmu_impl.h index 96ce688e1..3868a5816 100644 --- a/module/zfs/include/sys/dmu_impl.h +++ b/module/zfs/include/sys/dmu_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -211,10 +211,11 @@ extern "C" { * ds_lock * protects: * ds_user_ptr - * ds_user_evice_func + * ds_user_evict_func * ds_open_refcount * ds_snapname * ds_phys accounting + * ds_phys userrefs zapobj * ds_reserved * held from: * dsl_dataset_* diff --git a/module/zfs/include/sys/dmu_objset.h b/module/zfs/include/sys/dmu_objset.h index 82cb6ad7d..052cb8dd9 100644 --- a/module/zfs/include/sys/dmu_objset.h +++ b/module/zfs/include/sys/dmu_objset.h @@ -117,7 +117,7 @@ void dmu_objset_close(objset_t *os); int dmu_objset_create(const char *name, dmu_objset_type_t type, objset_t *clone_parent, uint64_t flags, void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg); -int dmu_objset_destroy(const char *name); +int dmu_objset_destroy(const char *name, boolean_t defer); int dmu_objset_rollback(objset_t *os); int dmu_objset_snapshot(char *fsname, char *snapname, nvlist_t *props, boolean_t recursive); diff --git a/module/zfs/include/sys/dsl_dataset.h b/module/zfs/include/sys/dsl_dataset.h index a1c2896e3..b51036d38 100644 --- a/module/zfs/include/sys/dsl_dataset.h +++ b/module/zfs/include/sys/dsl_dataset.h @@ -63,6 +63,14 @@ typedef void dsl_dataset_evict_func_t(struct dsl_dataset *, void *); #define DS_FLAG_UNIQUE_ACCURATE (1ULL<<2) /* + * DS_FLAG_DEFER_DESTROY is set after 'zfs destroy -d' has been called + * on a dataset. This allows the dataset to be destroyed using 'zfs release'. + */ +#define DS_FLAG_DEFER_DESTROY (1ULL<<3) +#define DS_IS_DEFER_DESTROY(ds) \ + ((ds)->ds_phys->ds_flags & DS_FLAG_DEFER_DESTROY) + +/* * DS_FLAG_CI_DATASET is set if the dataset contains a file system whose * name lookups should be performed case-insensitively. */ @@ -93,7 +101,8 @@ typedef struct dsl_dataset_phys { blkptr_t ds_bp; uint64_t ds_next_clones_obj; /* DMU_OT_DSL_CLONES */ uint64_t ds_props_obj; /* DMU_OT_DSL_PROPS for snaps */ - uint64_t ds_pad[6]; /* pad out to 320 bytes for good measure */ + uint64_t ds_userrefs_obj; /* DMU_OT_USERREFS */ + uint64_t ds_pad[5]; /* pad out to 320 bytes for good measure */ } dsl_dataset_phys_t; typedef struct dsl_dataset { @@ -111,6 +120,9 @@ typedef struct dsl_dataset { /* has internal locking: */ bplist_t ds_deadlist; + /* to protect against multiple concurrent incremental recv */ + kmutex_t ds_recvlock; + /* protected by lock on pool's dp_dirty_datasets list */ txg_node_t ds_dirty_link; list_node_t ds_synced_link; @@ -122,6 +134,7 @@ typedef struct dsl_dataset { kmutex_t ds_lock; void *ds_user_ptr; dsl_dataset_evict_func_t *ds_user_evict_func; + uint64_t ds_userrefs; /* * ds_owner is protected by the ds_rwlock and the ds_lock @@ -143,6 +156,15 @@ typedef struct dsl_dataset { char ds_snapname[MAXNAMELEN]; } dsl_dataset_t; +struct dsl_ds_destroyarg { + dsl_dataset_t *ds; /* ds to destroy */ + dsl_dataset_t *rm_origin; /* also remove our origin? */ + boolean_t is_origin_rm; /* set if removing origin snap */ + boolean_t defer; /* destroy -d requested? */ + boolean_t releasing; /* destroying due to release? */ + boolean_t need_prep; /* do we need to retry due to EBUSY? */ +}; + #define dsl_dataset_is_snapshot(ds) \ ((ds)->ds_phys->ds_num_children != 0) @@ -167,8 +189,8 @@ uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname, dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *); uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, uint64_t flags, dmu_tx_t *tx); -int dsl_dataset_destroy(dsl_dataset_t *ds, void *tag); -int dsl_snapshots_destroy(char *fsname, char *snapname); +int dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer); +int dsl_snapshots_destroy(char *fsname, char *snapname, boolean_t defer); dsl_checkfunc_t dsl_dataset_destroy_check; dsl_syncfunc_t dsl_dataset_destroy_sync; dsl_checkfunc_t dsl_dataset_snapshot_check; @@ -178,6 +200,11 @@ int dsl_dataset_rename(char *name, const char *newname, boolean_t recursive); int dsl_dataset_promote(const char *name); int dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, boolean_t force); +int dsl_dataset_user_hold(char *dsname, char *snapname, char *htag, + boolean_t recursive); +int dsl_dataset_user_release(char *dsname, char *snapname, char *htag, + boolean_t recursive); +int dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp); void *dsl_dataset_set_user_ptr(dsl_dataset_t *ds, void *p, dsl_dataset_evict_func_t func); diff --git a/module/zfs/include/sys/dsl_deleg.h b/module/zfs/include/sys/dsl_deleg.h index b064c9228..a26a3f705 100644 --- a/module/zfs/include/sys/dsl_deleg.h +++ b/module/zfs/include/sys/dsl_deleg.h @@ -53,6 +53,8 @@ extern "C" { #define ZFS_DELEG_PERM_GROUPQUOTA "groupquota" #define ZFS_DELEG_PERM_USERUSED "userused" #define ZFS_DELEG_PERM_GROUPUSED "groupused" +#define ZFS_DELEG_PERM_HOLD "hold" +#define ZFS_DELEG_PERM_RELEASE "release" /* * Note: the names of properties that are marked delegatable are also diff --git a/module/zfs/include/sys/dsl_prop.h b/module/zfs/include/sys/dsl_prop.h index 26018a46d..5afaa1f0d 100644 --- a/module/zfs/include/sys/dsl_prop.h +++ b/module/zfs/include/sys/dsl_prop.h @@ -69,7 +69,7 @@ dsl_syncfunc_t dsl_props_set_sync; int dsl_prop_set(const char *ddname, const char *propname, int intsz, int numints, const void *buf); int dsl_props_set(const char *dsname, nvlist_t *nvl); -void dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, +void dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, cred_t *cr, dmu_tx_t *tx); void dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value); diff --git a/module/zfs/include/sys/spa.h b/module/zfs/include/sys/spa.h index c7ae4022e..0a4d55097 100644 --- a/module/zfs/include/sys/spa.h +++ b/module/zfs/include/sys/spa.h @@ -500,8 +500,9 @@ extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read, char *his_buf); extern int spa_history_log(spa_t *spa, const char *his_buf, history_log_type_t what); -void spa_history_internal_log(history_internal_events_t event, spa_t *spa, - dmu_tx_t *tx, cred_t *cr, const char *fmt, ...); +extern void spa_history_internal_log(history_internal_events_t event, + spa_t *spa, dmu_tx_t *tx, cred_t *cr, const char *fmt, ...); +extern void spa_history_log_version(spa_t *spa, history_internal_events_t evt); /* error handling */ struct zbookmark; diff --git a/module/zfs/include/sys/spa_impl.h b/module/zfs/include/sys/spa_impl.h index 12999ee9e..84da68488 100644 --- a/module/zfs/include/sys/spa_impl.h +++ b/module/zfs/include/sys/spa_impl.h @@ -105,6 +105,7 @@ struct spa { int spa_inject_ref; /* injection references */ uint8_t spa_sync_on; /* sync threads are running */ spa_load_state_t spa_load_state; /* current load operation */ + boolean_t spa_load_verbatim; /* load the given config? */ taskq_t *spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES]; dsl_pool_t *spa_dsl_pool; metaslab_class_t *spa_normal_class; /* normal data class */ diff --git a/module/zfs/include/sys/vdev.h b/module/zfs/include/sys/vdev.h index 71b9b12d6..7e53f62d2 100644 --- a/module/zfs/include/sys/vdev.h +++ b/module/zfs/include/sys/vdev.h @@ -47,6 +47,7 @@ typedef enum vdev_dtl_type { extern boolean_t zfs_nocacheflush; extern int vdev_open(vdev_t *); +extern void vdev_open_children(vdev_t *vd); extern int vdev_validate(vdev_t *); extern void vdev_close(vdev_t *); extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace); diff --git a/module/zfs/include/sys/vdev_impl.h b/module/zfs/include/sys/vdev_impl.h index 8240b66ac..23780430d 100644 --- a/module/zfs/include/sys/vdev_impl.h +++ b/module/zfs/include/sys/vdev_impl.h @@ -127,6 +127,8 @@ struct vdev { space_map_t vdev_dtl[DTL_TYPES]; /* in-core dirty time logs */ vdev_stat_t vdev_stat; /* virtual device statistics */ boolean_t vdev_expanding; /* expand the vdev? */ + int vdev_open_error; /* error on last open */ + kthread_t *vdev_open_thread; /* thread opening children */ /* * Top-level vdev state. diff --git a/module/zfs/include/sys/zap.h b/module/zfs/include/sys/zap.h index de2053812..967174be4 100644 --- a/module/zfs/include/sys/zap.h +++ b/module/zfs/include/sys/zap.h @@ -182,8 +182,7 @@ int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name, boolean_t *normalization_conflictp); int zap_count_write(objset_t *os, uint64_t zapobj, const char *name, - int add, uint64_t *towrite, uint64_t *tooverwrite, - uint64_t dn_datablkshift); + int add, uint64_t *towrite, uint64_t *tooverwrite); /* * Create an attribute with the given name and value. diff --git a/module/zfs/include/sys/zfs_acl.h b/module/zfs/include/sys/zfs_acl.h index f5e5aa7f4..3488962e2 100644 --- a/module/zfs/include/sys/zfs_acl.h +++ b/module/zfs/include/sys/zfs_acl.h @@ -203,6 +203,7 @@ void zfs_oldace_byteswap(ace_t *, int); void zfs_ace_byteswap(void *, size_t, boolean_t); extern boolean_t zfs_has_access(struct znode *zp, cred_t *cr); extern int zfs_zaccess(struct znode *, int, int, boolean_t, cred_t *); +int zfs_fastaccesschk_execute(struct znode *, cred_t *); extern int zfs_zaccess_rwx(struct znode *, mode_t, int, cred_t *); extern int zfs_zaccess_unix(struct znode *, mode_t, cred_t *); extern int zfs_acl_access(struct znode *, int, cred_t *); diff --git a/module/zfs/include/sys/zfs_ioctl.h b/module/zfs/include/sys/zfs_ioctl.h index 1e9f35155..3a3e6e711 100644 --- a/module/zfs/include/sys/zfs_ioctl.h +++ b/module/zfs/include/sys/zfs_ioctl.h @@ -165,6 +165,7 @@ typedef struct zfs_cmd { dmu_objset_stats_t zc_objset_stats; struct drr_begin zc_begin_record; zinject_record_t zc_inject_record; + boolean_t zc_defer_destroy; } zfs_cmd_t; typedef struct zfs_useracct { diff --git a/module/zfs/include/sys/zfs_vfsops.h b/module/zfs/include/sys/zfs_vfsops.h index b8ed7b27f..28555232b 100644 --- a/module/zfs/include/sys/zfs_vfsops.h +++ b/module/zfs/include/sys/zfs_vfsops.h @@ -73,7 +73,6 @@ struct zfsvfs { boolean_t z_vscan; /* virus scan on/off */ boolean_t z_use_fuids; /* version allows fuids */ boolean_t z_replay; /* set during ZIL replay */ - kmutex_t z_online_recv_lock; /* held while recv in progress */ uint64_t z_version; /* ZPL version */ uint64_t z_shares_dir; /* hidden shares dir */ kmutex_t z_lock; diff --git a/module/zfs/include/sys/zfs_znode.h b/module/zfs/include/sys/zfs_znode.h index 69f4b50f5..5db5b8d51 100644 --- a/module/zfs/include/sys/zfs_znode.h +++ b/module/zfs/include/sys/zfs_znode.h @@ -77,6 +77,7 @@ extern "C" { #define ZFS_ACL_DEFAULTED 0x20 /* ACL should be defaulted */ #define ZFS_ACL_AUTO_INHERIT 0x40 /* ACL should be inherited */ #define ZFS_BONUS_SCANSTAMP 0x80 /* Scanstamp in bonus area */ +#define ZFS_NO_EXECS_DENIED 0x100 /* exec was given to everyone */ /* * Is ID ephemeral? @@ -200,6 +201,7 @@ typedef struct znode { uint64_t z_gen; /* generation (same as zp_gen) */ uint32_t z_sync_cnt; /* synchronous open count */ kmutex_t z_acl_lock; /* acl data lock */ + zfs_acl_t *z_acl_cached; /* cached acl */ list_node_t z_link_node; /* all znodes in fs link */ /* * These are dmu managed fields. diff --git a/module/zfs/include/sys/zio.h b/module/zfs/include/sys/zio.h index 5c51717c1..e47d8f468 100644 --- a/module/zfs/include/sys/zio.h +++ b/module/zfs/include/sys/zio.h @@ -143,6 +143,8 @@ enum zio_compress { #define ZIO_FLAG_GODFATHER 0x080000 #define ZIO_FLAG_TRYHARD 0x100000 +#define ZIO_FLAG_NODATA 0x200000 +#define ZIO_FLAG_OPTIONAL 0x400000 #define ZIO_FLAG_GANG_INHERIT \ (ZIO_FLAG_CANFAIL | \ @@ -161,7 +163,9 @@ enum zio_compress { ZIO_FLAG_IO_REPAIR | \ ZIO_FLAG_IO_RETRY | \ ZIO_FLAG_PROBE | \ - ZIO_FLAG_TRYHARD) + ZIO_FLAG_TRYHARD | \ + ZIO_FLAG_NODATA | \ + ZIO_FLAG_OPTIONAL) #define ZIO_FLAG_AGG_INHERIT \ (ZIO_FLAG_DONT_AGGREGATE | \ diff --git a/module/zfs/rrwlock.c b/module/zfs/rrwlock.c index 710685dbc..4cef53f95 100644 --- a/module/zfs/rrwlock.c +++ b/module/zfs/rrwlock.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/refcount.h> #include <sys/rrwlock.h> @@ -118,7 +116,7 @@ rrn_find_and_remove(rrwlock_t *rrl) rrw_node_t *prev = NULL; if (refcount_count(&rrl->rr_linked_rcount) == 0) - return (NULL); + return (B_FALSE); for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) { if (rn->rn_rrl == rrl) { @@ -159,6 +157,14 @@ static void rrw_enter_read(rrwlock_t *rrl, void *tag) { mutex_enter(&rrl->rr_lock); +#if !defined(DEBUG) && defined(_KERNEL) + if (!rrl->rr_writer && !rrl->rr_writer_wanted) { + rrl->rr_anon_rcount.rc_count++; + mutex_exit(&rrl->rr_lock); + return; + } + DTRACE_PROBE(zfs__rrwfastpath__rdmiss); +#endif ASSERT(rrl->rr_writer != curthread); ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0); @@ -208,19 +214,28 @@ void rrw_exit(rrwlock_t *rrl, void *tag) { mutex_enter(&rrl->rr_lock); +#if !defined(DEBUG) && defined(_KERNEL) + if (!rrl->rr_writer && rrl->rr_linked_rcount.rc_count == 0) { + rrl->rr_anon_rcount.rc_count--; + if (rrl->rr_anon_rcount.rc_count == 0) + cv_broadcast(&rrl->rr_cv); + mutex_exit(&rrl->rr_lock); + return; + } + DTRACE_PROBE(zfs__rrwfastpath__exitmiss); +#endif ASSERT(!refcount_is_zero(&rrl->rr_anon_rcount) || !refcount_is_zero(&rrl->rr_linked_rcount) || rrl->rr_writer != NULL); if (rrl->rr_writer == NULL) { - if (rrn_find_and_remove(rrl)) { - if (refcount_remove(&rrl->rr_linked_rcount, tag) == 0) - cv_broadcast(&rrl->rr_cv); - - } else { - if (refcount_remove(&rrl->rr_anon_rcount, tag) == 0) - cv_broadcast(&rrl->rr_cv); - } + int64_t count; + if (rrn_find_and_remove(rrl)) + count = refcount_remove(&rrl->rr_linked_rcount, tag); + else + count = refcount_remove(&rrl->rr_anon_rcount, tag); + if (count == 0) + cv_broadcast(&rrl->rr_cv); } else { ASSERT(rrl->rr_writer == curthread); ASSERT(refcount_is_zero(&rrl->rr_anon_rcount) && diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 6a95b399b..d7ed23e6f 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1574,9 +1574,12 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) /* * If the config cache is stale, or we have uninitialized * metaslabs (see spa_vdev_add()), then update the config. + * + * If spa_load_verbatim is true, trust the current + * in-core spa_config and update the disk labels. */ if (config_cache_txg != spa->spa_config_txg || - state == SPA_LOAD_IMPORT) + state == SPA_LOAD_IMPORT || spa->spa_load_verbatim) need_update = B_TRUE; for (int c = 0; c < rvd->vdev_children; c++) @@ -2271,6 +2274,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); + spa_history_log_version(spa, LOG_POOL_CREATE); spa->spa_minref = refcount_count(&spa->spa_refcount); @@ -2404,6 +2408,7 @@ spa_import_rootpool(char *devpath, char *devid) spa = spa_add(pname, NULL); spa->spa_is_root = B_TRUE; + spa->spa_load_verbatim = B_TRUE; /* * Build up a vdev tree based on the boot device's label config. @@ -2459,6 +2464,7 @@ spa_import_rootpool(char *devpath, char *devid) VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); error = 0; + spa_history_log_version(spa, LOG_POOL_IMPORT); out: spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_free(rvd); @@ -2491,6 +2497,8 @@ spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props) zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); spa = spa_add(pool, altroot); + spa->spa_load_verbatim = B_TRUE; + VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); if (props != NULL) @@ -2499,6 +2507,7 @@ spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props) spa_config_sync(spa, B_FALSE, B_TRUE); mutex_exit(&spa_namespace_lock); + spa_history_log_version(spa, LOG_POOL_IMPORT); return (0); } @@ -2624,7 +2633,7 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props) /* * Update the config cache to include the newly-imported pool. */ - spa_config_update_common(spa, SPA_CONFIG_UPDATE_POOL, B_FALSE); + spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); } /* @@ -2634,6 +2643,7 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props) spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); mutex_exit(&spa_namespace_lock); + spa_history_log_version(spa, LOG_POOL_IMPORT); return (0); } @@ -2991,7 +3001,6 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) vdev_t *rvd = spa->spa_root_vdev; vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; vdev_ops_t *pvops; - dmu_tx_t *tx; char *oldvdpath, *newvdpath; int newvd_isspare; int error; @@ -3147,17 +3156,11 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); - tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - if (dmu_tx_assign(tx, TXG_WAIT) == 0) { - spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, tx, - CRED(), "%s vdev=%s %s vdev=%s", - replacing && newvd_isspare ? "spare in" : - replacing ? "replace" : "attach", newvdpath, - replacing ? "for" : "to", oldvdpath); - dmu_tx_commit(tx); - } else { - dmu_tx_abort(tx); - } + spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, NULL, + CRED(), "%s vdev=%s %s vdev=%s", + replacing && newvd_isspare ? "spare in" : + replacing ? "replace" : "attach", newvdpath, + replacing ? "for" : "to", oldvdpath); spa_strfree(oldvdpath); spa_strfree(newvdpath); @@ -3747,19 +3750,11 @@ spa_async_thread(spa_t *spa) * then log an internal history event. */ if (space_update) { - dmu_tx_t *tx; - - tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - if (dmu_tx_assign(tx, TXG_WAIT) == 0) { - spa_history_internal_log(LOG_POOL_VDEV_ONLINE, - spa, tx, CRED(), - "pool '%s' size: %llu(+%llu)", - spa_name(spa), spa_get_space(spa), - space_update); - dmu_tx_commit(tx); - } else { - dmu_tx_abort(tx); - } + spa_history_internal_log(LOG_POOL_VDEV_ONLINE, + spa, NULL, CRED(), + "pool '%s' size: %llu(+%llu)", + spa_name(spa), spa_get_space(spa), + space_update); } } diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c index 7103e179b..b2063bba1 100644 --- a/module/zfs/spa_config.c +++ b/module/zfs/spa_config.c @@ -209,7 +209,7 @@ spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent) ASSERT(MUTEX_HELD(&spa_namespace_lock)); - if (rootdir == NULL) + if (rootdir == NULL || !(spa_mode_global & FWRITE)) return; /* @@ -394,23 +394,12 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) } /* - * For a pool that's not currently a booting rootpool, update all disk labels, - * generate a fresh config based on the current in-core state, and sync the - * global config cache. - */ -void -spa_config_update(spa_t *spa, int what) -{ - spa_config_update_common(spa, what, FALSE); -} - -/* * Update all disk labels, generate a fresh config based on the current * in-core state, and sync the global config cache (do not sync the config * cache if this is a booting rootpool). */ void -spa_config_update_common(spa_t *spa, int what, boolean_t isroot) +spa_config_update(spa_t *spa, int what) { vdev_t *rvd = spa->spa_root_vdev; uint64_t txg; @@ -447,9 +436,9 @@ spa_config_update_common(spa_t *spa, int what, boolean_t isroot) /* * Update the global config cache to reflect the new mosconfig. */ - if (!isroot) + if (!spa->spa_is_root) spa_config_sync(spa, B_FALSE, what != SPA_CONFIG_UPDATE_POOL); if (what == SPA_CONFIG_UPDATE_POOL) - spa_config_update_common(spa, SPA_CONFIG_UPDATE_VDEVS, isroot); + spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS); } diff --git a/module/zfs/spa_history.c b/module/zfs/spa_history.c index 97d97d847..b77ac4208 100644 --- a/module/zfs/spa_history.c +++ b/module/zfs/spa_history.c @@ -390,13 +390,12 @@ spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf) return (err); } -void -spa_history_internal_log(history_internal_events_t event, spa_t *spa, - dmu_tx_t *tx, cred_t *cr, const char *fmt, ...) +static void +log_internal(history_internal_events_t event, spa_t *spa, + dmu_tx_t *tx, cred_t *cr, const char *fmt, va_list adx) { history_arg_t *hap; char *str; - va_list adx; /* * If this is part of creating a pool, not everything is @@ -408,9 +407,7 @@ spa_history_internal_log(history_internal_events_t event, spa_t *spa, hap = kmem_alloc(sizeof (history_arg_t), KM_SLEEP); str = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP); - va_start(adx, fmt); (void) vsnprintf(str, HIS_MAX_RECORD_LEN, fmt, adx); - va_end(adx); hap->ha_log_type = LOG_INTERNAL; hap->ha_history_str = str; @@ -425,3 +422,48 @@ spa_history_internal_log(history_internal_events_t event, spa_t *spa, } /* spa_history_log_sync() will free hap and str */ } + +void +spa_history_internal_log(history_internal_events_t event, spa_t *spa, + dmu_tx_t *tx, cred_t *cr, const char *fmt, ...) +{ + dmu_tx_t *htx = tx; + va_list adx; + + /* create a tx if we didn't get one */ + if (tx == NULL) { + htx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + if (dmu_tx_assign(htx, TXG_WAIT) != 0) { + dmu_tx_abort(htx); + return; + } + } + + va_start(adx, fmt); + log_internal(event, spa, htx, cr, fmt, adx); + va_end(adx); + + /* if we didn't get a tx from the caller, commit the one we made */ + if (tx == NULL) + dmu_tx_commit(htx); +} + +void +spa_history_log_version(spa_t *spa, history_internal_events_t event) +{ +#ifdef _KERNEL + uint64_t current_vers = spa_version(spa); + + if (current_vers >= SPA_VERSION_ZPOOL_HISTORY) { + spa_history_internal_log(event, spa, NULL, CRED(), + "pool spa %llu; zfs spa %llu; zpl %d; uts %s %s %s %s", + (u_longlong_t)current_vers, SPA_VERSION, ZPL_VERSION, + utsname.nodename, utsname.release, utsname.version, + utsname.machine); + } + cmn_err(CE_CONT, "!%s version %llu pool %s using %llu", + event == LOG_POOL_IMPORT ? "imported" : + event == LOG_POOL_CREATE ? "created" : "accessed", + (u_longlong_t)current_vers, spa_name(spa), SPA_VERSION); +#endif +} diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index aea3f5625..8150ac937 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -310,8 +310,12 @@ spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw) void spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw) { + int wlocks_held = 0; + for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; + if (scl->scl_writer == curthread) + wlocks_held |= (1 << i); if (!(locks & (1 << i))) continue; mutex_enter(&scl->scl_lock); @@ -331,6 +335,7 @@ spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw) (void) refcount_add(&scl->scl_count, tag); mutex_exit(&scl->scl_lock); } + ASSERT(wlocks_held <= locks); } void diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 3fa677e05..bb5024f98 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -405,22 +405,26 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { /* - * Currently, we can only support 2 parity devices. + * Currently, we can only support 3 parity devices. */ - if (nparity == 0 || nparity > 2) + if (nparity == 0 || nparity > 3) return (EINVAL); /* - * Older versions can only support 1 parity device. + * Previous versions could only support 1 or 2 parity + * device. */ - if (nparity == 2 && - spa_version(spa) < SPA_VERSION_RAID6) + if (nparity > 1 && + spa_version(spa) < SPA_VERSION_RAIDZ2) + return (ENOTSUP); + if (nparity > 2 && + spa_version(spa) < SPA_VERSION_RAIDZ3) return (ENOTSUP); } else { /* * We require the parity to be specified for SPAs that * support multiple parity levels. */ - if (spa_version(spa) >= SPA_VERSION_RAID6) + if (spa_version(spa) >= SPA_VERSION_RAIDZ2) return (EINVAL); /* * Otherwise, we default to 1 parity device for RAID-Z. @@ -993,6 +997,32 @@ vdev_probe(vdev_t *vd, zio_t *zio) return (NULL); } +static void +vdev_open_child(void *arg) +{ + vdev_t *vd = arg; + + vd->vdev_open_thread = curthread; + vd->vdev_open_error = vdev_open(vd); + vd->vdev_open_thread = NULL; +} + +void +vdev_open_children(vdev_t *vd) +{ + taskq_t *tq; + int children = vd->vdev_children; + + tq = taskq_create("vdev_open", children, minclsyspri, + children, children, TASKQ_PREPOPULATE); + + for (int c = 0; c < children; c++) + VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c], + TQ_SLEEP) != NULL); + + taskq_destroy(tq); +} + /* * Prepare a virtual device for access. */ @@ -1005,8 +1035,8 @@ vdev_open(vdev_t *vd) uint64_t asize, psize; uint64_t ashift = 0; - ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); - + ASSERT(vd->vdev_open_thread == curthread || + spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || vd->vdev_state == VDEV_STATE_CANT_OPEN || vd->vdev_state == VDEV_STATE_OFFLINE); @@ -1217,7 +1247,12 @@ vdev_validate(vdev_t *vd) nvlist_free(label); - if (spa->spa_load_state == SPA_LOAD_OPEN && + /* + * If spa->spa_load_verbatim is true, no need to check the + * state of the pool. + */ + if (!spa->spa_load_verbatim && + spa->spa_load_state == SPA_LOAD_OPEN && state != POOL_STATE_ACTIVE) return (EBADF); diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 48d5fc232..06cb72012 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -246,8 +246,10 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, * into a crufty old storage pool. */ ASSERT(vd->vdev_nparity == 1 || - (vd->vdev_nparity == 2 && - spa_version(spa) >= SPA_VERSION_RAID6)); + (vd->vdev_nparity <= 2 && + spa_version(spa) >= SPA_VERSION_RAIDZ2) || + (vd->vdev_nparity <= 3 && + spa_version(spa) >= SPA_VERSION_RAIDZ3)); /* * Note that we'll add the nparity tag even on storage pools @@ -642,8 +644,8 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) /* * Initialize uberblock template. */ - ub = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)); - bzero(ub, VDEV_UBERBLOCK_SIZE(vd)); + ub = zio_buf_alloc(VDEV_UBERBLOCK_RING); + bzero(ub, VDEV_UBERBLOCK_RING); *ub = spa->spa_uberblock; ub->ub_txg = 0; @@ -672,11 +674,9 @@ retry: offsetof(vdev_label_t, vl_pad2), VDEV_PAD_SIZE, NULL, NULL, flags); - for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { - vdev_label_write(zio, vd, l, ub, - VDEV_UBERBLOCK_OFFSET(vd, n), - VDEV_UBERBLOCK_SIZE(vd), NULL, NULL, flags); - } + vdev_label_write(zio, vd, l, ub, + offsetof(vdev_label_t, vl_uberblock), + VDEV_UBERBLOCK_RING, NULL, NULL, flags); } error = zio_wait(zio); @@ -688,7 +688,7 @@ retry: nvlist_free(label); zio_buf_free(pad2, VDEV_PAD_SIZE); - zio_buf_free(ub, VDEV_UBERBLOCK_SIZE(vd)); + zio_buf_free(ub, VDEV_UBERBLOCK_RING); zio_buf_free(vp, sizeof (vdev_phys_t)); /* diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index fff7e0842..836386d42 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -124,21 +124,21 @@ vdev_mirror_map_alloc(zio_t *zio) static int vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) { - vdev_t *cvd; - uint64_t c; int numerrors = 0; - int ret, lasterror = 0; + int lasterror = 0; if (vd->vdev_children == 0) { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; return (EINVAL); } - for (c = 0; c < vd->vdev_children; c++) { - cvd = vd->vdev_child[c]; + vdev_open_children(vd); - if ((ret = vdev_open(cvd)) != 0) { - lasterror = ret; + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (cvd->vdev_open_error) { + lasterror = cvd->vdev_open_error; numerrors++; continue; } @@ -158,9 +158,7 @@ vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) static void vdev_mirror_close(vdev_t *vd) { - uint64_t c; - - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) vdev_close(vd->vdev_child[c]); } diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index 5e57a1513..9867d0970 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -24,7 +24,7 @@ */ #include <sys/zfs_context.h> -#include <sys/spa.h> +#include <sys/spa_impl.h> #include <sys/vdev_impl.h> #include <sys/zio.h> #include <sys/avl.h> @@ -48,11 +48,14 @@ int zfs_vdev_time_shift = 6; int zfs_vdev_ramp_rate = 2; /* - * To reduce IOPs, we aggregate small adjacent i/os into one large i/o. - * For read i/os, we also aggregate across small adjacency gaps. + * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O. + * For read I/Os, we also aggregate across small adjacency gaps; for writes + * we include spans of optional I/Os to aid aggregation at the disk even when + * they aren't able to help us aggregate at this level. */ int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE; int zfs_vdev_read_gap_limit = 32 << 10; +int zfs_vdev_write_gap_limit = 4 << 10; /* * Virtual device vector for disk I/O scheduling. @@ -172,12 +175,14 @@ vdev_queue_agg_io_done(zio_t *aio) static zio_t * vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) { - zio_t *fio, *lio, *aio, *dio, *nio; + zio_t *fio, *lio, *aio, *dio, *nio, *mio; avl_tree_t *t; int flags; uint64_t maxspan = zfs_vdev_aggregation_limit; uint64_t maxgap; + int stretch; +again: ASSERT(MUTEX_HELD(&vq->vq_lock)); if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit || @@ -192,21 +197,88 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) { /* - * We can aggregate I/Os that are adjacent and of the - * same flavor, as expressed by the AGG_INHERIT flags. - * The latter is necessary so that certain attributes - * of the I/O, such as whether it's a normal I/O or a - * scrub/resilver, can be preserved in the aggregate. + * We can aggregate I/Os that are sufficiently adjacent and of + * the same flavor, as expressed by the AGG_INHERIT flags. + * The latter requirement is necessary so that certain + * attributes of the I/O, such as whether it's a normal I/O + * or a scrub/resilver, can be preserved in the aggregate. + * We can include optional I/Os, but don't allow them + * to begin a range as they add no benefit in that situation. + */ + + /* + * We keep track of the last non-optional I/O. + */ + mio = (fio->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : fio; + + /* + * Walk backwards through sufficiently contiguous I/Os + * recording the last non-option I/O. */ while ((dio = AVL_PREV(t, fio)) != NULL && (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && - IO_SPAN(dio, lio) <= maxspan && IO_GAP(dio, fio) <= maxgap) + IO_SPAN(dio, lio) <= maxspan && + IO_GAP(dio, fio) <= maxgap) { fio = dio; + if (mio == NULL && !(fio->io_flags & ZIO_FLAG_OPTIONAL)) + mio = fio; + } + /* + * Skip any initial optional I/Os. + */ + while ((fio->io_flags & ZIO_FLAG_OPTIONAL) && fio != lio) { + fio = AVL_NEXT(t, fio); + ASSERT(fio != NULL); + } + + /* + * Walk forward through sufficiently contiguous I/Os. + */ while ((dio = AVL_NEXT(t, lio)) != NULL && (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && - IO_SPAN(fio, dio) <= maxspan && IO_GAP(lio, dio) <= maxgap) + IO_SPAN(fio, dio) <= maxspan && + IO_GAP(lio, dio) <= maxgap) { lio = dio; + if (!(lio->io_flags & ZIO_FLAG_OPTIONAL)) + mio = lio; + } + + /* + * Now that we've established the range of the I/O aggregation + * we must decide what to do with trailing optional I/Os. + * For reads, there's nothing to do. While we are unable to + * aggregate further, it's possible that a trailing optional + * I/O would allow the underlying device to aggregate with + * subsequent I/Os. We must therefore determine if the next + * non-optional I/O is close enough to make aggregation + * worthwhile. + */ + stretch = B_FALSE; + if (t != &vq->vq_read_tree && mio != NULL) { + nio = lio; + while ((dio = AVL_NEXT(t, nio)) != NULL && + IO_GAP(nio, dio) == 0 && + IO_GAP(mio, dio) <= zfs_vdev_write_gap_limit) { + nio = dio; + if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) { + stretch = B_TRUE; + break; + } + } + } + + if (stretch) { + /* This may be a no-op. */ + VERIFY((dio = AVL_NEXT(t, lio)) != NULL); + dio->io_flags &= ~ZIO_FLAG_OPTIONAL; + } else { + while (lio != mio && lio != fio) { + ASSERT(lio->io_flags & ZIO_FLAG_OPTIONAL); + lio = AVL_PREV(t, lio); + ASSERT(lio != NULL); + } + } } if (fio != lio) { @@ -225,10 +297,15 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) ASSERT(dio->io_type == aio->io_type); ASSERT(dio->io_vdev_tree == t); - if (dio->io_type == ZIO_TYPE_WRITE) + if (dio->io_flags & ZIO_FLAG_NODATA) { + ASSERT(dio->io_type == ZIO_TYPE_WRITE); + bzero((char *)aio->io_data + (dio->io_offset - + aio->io_offset), dio->io_size); + } else if (dio->io_type == ZIO_TYPE_WRITE) { bcopy(dio->io_data, (char *)aio->io_data + (dio->io_offset - aio->io_offset), dio->io_size); + } zio_add_child(dio, aio); vdev_queue_io_remove(vq, dio); @@ -244,6 +321,20 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) ASSERT(fio->io_vdev_tree == t); vdev_queue_io_remove(vq, fio); + /* + * If the I/O is or was optional and therefore has no data, we need to + * simply discard it. We need to drop the vdev queue's lock to avoid a + * deadlock that we could encounter since this I/O will complete + * immediately. + */ + if (fio->io_flags & ZIO_FLAG_NODATA) { + mutex_exit(&vq->vq_lock); + zio_vdev_io_bypass(fio); + zio_execute(fio); + mutex_enter(&vq->vq_lock); + goto again; + } + avl_add(&vq->vq_pending_tree, fio); return (fio); diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 92753d871..b3074173e 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -35,12 +35,27 @@ /* * Virtual device vector for RAID-Z. * - * This vdev supports both single and double parity. For single parity, we - * use a simple XOR of all the data columns. For double parity, we use both - * the simple XOR as well as a technique described in "The mathematics of - * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8), - * over the integers expressable in a single byte. Briefly, the operations on - * the field are defined as follows: + * This vdev supports single, double, and triple parity. For single parity, + * we use a simple XOR of all the data columns. For double or triple parity, + * we use a special case of Reed-Solomon coding. This extends the + * technique described in "The mathematics of RAID-6" by H. Peter Anvin by + * drawing on the system described in "A Tutorial on Reed-Solomon Coding for + * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the + * former is also based. The latter is designed to provide higher performance + * for writes. + * + * Note that the Plank paper claimed to support arbitrary N+M, but was then + * amended six years later identifying a critical flaw that invalidates its + * claims. Nevertheless, the technique can be adapted to work for up to + * triple parity. For additional parity, the amendment "Note: Correction to + * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding + * is viable, but the additional complexity means that write performance will + * suffer. + * + * All of the methods above operate on a Galois field, defined over the + * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements + * can be expressed with a single byte. Briefly, the operations on the + * field are defined as follows: * * o addition (+) is represented by a bitwise XOR * o subtraction (-) is therefore identical to addition: A + B = A - B @@ -55,22 +70,32 @@ * (A * 2)_0 = A_7 * * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). + * As an aside, this multiplication is derived from the error correcting + * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1. * * Observe that any number in the field (except for 0) can be expressed as a * power of 2 -- a generator for the field. We store a table of the powers of * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather - * than field addition). The inverse of a field element A (A^-1) is A^254. + * than field addition). The inverse of a field element A (A^-1) is therefore + * A ^ (255 - 1) = A^254. * - * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1, - * can be expressed by field operations: + * The up-to-three parity columns, P, Q, R over several data columns, + * D_0, ... D_n-1, can be expressed by field operations: * * P = D_0 + D_1 + ... + D_n-2 + D_n-1 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 + * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1 + * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1 * - * See the reconstruction code below for how P and Q can used individually or - * in concert to recover missing data columns. + * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival + * XOR operation, and 2 and 4 can be computed quickly and generate linearly- + * independent coefficients. (There are no additional coefficients that have + * this property which is why the uncorrected Plank method breaks down.) + * + * See the reconstruction code below for how P, Q and R can used individually + * or in concert to recover missing data columns. */ typedef struct raidz_col { @@ -84,21 +109,49 @@ typedef struct raidz_col { } raidz_col_t; typedef struct raidz_map { - uint64_t rm_cols; /* Column count */ + uint64_t rm_cols; /* Regular column count */ + uint64_t rm_scols; /* Count including skipped columns */ uint64_t rm_bigcols; /* Number of oversized columns */ uint64_t rm_asize; /* Actual total I/O size */ uint64_t rm_missingdata; /* Count of missing data devices */ uint64_t rm_missingparity; /* Count of missing parity devices */ uint64_t rm_firstdatacol; /* First data column/parity count */ + uint64_t rm_skipped; /* Skipped sectors for padding */ raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ } raidz_map_t; #define VDEV_RAIDZ_P 0 #define VDEV_RAIDZ_Q 1 +#define VDEV_RAIDZ_R 2 +#define VDEV_RAIDZ_MAXPARITY 3 + +#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) +#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) + +/* + * We provide a mechanism to perform the field multiplication operation on a + * 64-bit value all at once rather than a byte at a time. This works by + * creating a mask from the top bit in each byte and using that to + * conditionally apply the XOR of 0x1d. + */ +#define VDEV_RAIDZ_64MUL_2(x, mask) \ +{ \ + (mask) = (x) & 0x8080808080808080ULL; \ + (mask) = ((mask) << 1) - ((mask) >> 7); \ + (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ + ((mask) & 0x1d1d1d1d1d1d1d1d); \ +} -#define VDEV_RAIDZ_MAXPARITY 2 +#define VDEV_RAIDZ_64MUL_4(x, mask) \ +{ \ + VDEV_RAIDZ_64MUL_2((x), mask); \ + VDEV_RAIDZ_64MUL_2((x), mask); \ +} -#define VDEV_RAIDZ_MUL_2(a) (((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0)) +/* + * Force reconstruction to use the general purpose method. + */ +int vdev_raidz_default_to_general; /* * These two tables represent powers and logs of 2 in the Galois field defined @@ -201,7 +254,7 @@ vdev_raidz_map_free(zio_t *zio) for (c = 0; c < rm->rm_firstdatacol; c++) zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); - kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols])); + kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); } static raidz_map_t * @@ -213,24 +266,35 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, uint64_t s = zio->io_size >> unit_shift; uint64_t f = b % dcols; uint64_t o = (b / dcols) << unit_shift; - uint64_t q, r, c, bc, col, acols, coff, devidx; + uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; q = s / (dcols - nparity); r = s - q * (dcols - nparity); bc = (r == 0 ? 0 : r + nparity); + tot = s + nparity * (q + (r == 0 ? 0 : 1)); + + if (q == 0) { + acols = bc; + scols = MIN(dcols, roundup(bc, nparity + 1)); + } else { + acols = dcols; + scols = dcols; + } - acols = (q == 0 ? bc : dcols); + ASSERT3U(acols, <=, scols); - rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP); + rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); rm->rm_cols = acols; + rm->rm_scols = scols; rm->rm_bigcols = bc; - rm->rm_asize = 0; rm->rm_missingdata = 0; rm->rm_missingparity = 0; rm->rm_firstdatacol = nparity; - for (c = 0; c < acols; c++) { + asize = 0; + + for (c = 0; c < scols; c++) { col = f + c; coff = o; if (col >= dcols) { @@ -239,15 +303,26 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, } rm->rm_col[c].rc_devidx = col; rm->rm_col[c].rc_offset = coff; - rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift; rm->rm_col[c].rc_data = NULL; rm->rm_col[c].rc_error = 0; rm->rm_col[c].rc_tried = 0; rm->rm_col[c].rc_skipped = 0; - rm->rm_asize += rm->rm_col[c].rc_size; + + if (c >= acols) + rm->rm_col[c].rc_size = 0; + else if (c < bc) + rm->rm_col[c].rc_size = (q + 1) << unit_shift; + else + rm->rm_col[c].rc_size = q << unit_shift; + + asize += rm->rm_col[c].rc_size; } - rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift); + ASSERT3U(asize, ==, tot << unit_shift); + rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); + rm->rm_skipped = roundup(tot, nparity + 1) - tot; + ASSERT3U(rm->rm_asize - asize, ==, rm->rm_skipped << unit_shift); + ASSERT3U(rm->rm_skipped, <=, nparity); for (c = 0; c < rm->rm_firstdatacol; c++) rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); @@ -305,12 +380,12 @@ vdev_raidz_generate_parity_p(raidz_map_t *rm) if (c == rm->rm_firstdatacol) { ASSERT(ccount == pcount); - for (i = 0; i < ccount; i++, p++, src++) { + for (i = 0; i < ccount; i++, src++, p++) { *p = *src; } } else { ASSERT(ccount <= pcount); - for (i = 0; i < ccount; i++, p++, src++) { + for (i = 0; i < ccount; i++, src++, p++) { *p ^= *src; } } @@ -320,10 +395,10 @@ vdev_raidz_generate_parity_p(raidz_map_t *rm) static void vdev_raidz_generate_parity_pq(raidz_map_t *rm) { - uint64_t *q, *p, *src, pcount, ccount, mask, i; + uint64_t *p, *q, *src, pcnt, ccnt, mask, i; int c; - pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == rm->rm_col[VDEV_RAIDZ_Q].rc_size); @@ -331,55 +406,138 @@ vdev_raidz_generate_parity_pq(raidz_map_t *rm) src = rm->rm_col[c].rc_data; p = rm->rm_col[VDEV_RAIDZ_P].rc_data; q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; - ccount = rm->rm_col[c].rc_size / sizeof (src[0]); + + ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); if (c == rm->rm_firstdatacol) { - ASSERT(ccount == pcount || ccount == 0); - for (i = 0; i < ccount; i++, p++, q++, src++) { - *q = *src; + ASSERT(ccnt == pcnt || ccnt == 0); + for (i = 0; i < ccnt; i++, src++, p++, q++) { *p = *src; + *q = *src; } - for (; i < pcount; i++, p++, q++, src++) { - *q = 0; + for (; i < pcnt; i++, src++, p++, q++) { *p = 0; + *q = 0; } } else { - ASSERT(ccount <= pcount); + ASSERT(ccnt <= pcnt); /* - * Rather than multiplying each byte individually (as - * described above), we are able to handle 8 at once - * by generating a mask based on the high bit in each - * byte and using that to conditionally XOR in 0x1d. + * Apply the algorithm described above by multiplying + * the previous result and adding in the new value. */ - for (i = 0; i < ccount; i++, p++, q++, src++) { - mask = *q & 0x8080808080808080ULL; - mask = (mask << 1) - (mask >> 7); - *q = ((*q << 1) & 0xfefefefefefefefeULL) ^ - (mask & 0x1d1d1d1d1d1d1d1dULL); + for (i = 0; i < ccnt; i++, src++, p++, q++) { + *p ^= *src; + + VDEV_RAIDZ_64MUL_2(*q, mask); *q ^= *src; + } + + /* + * Treat short columns as though they are full of 0s. + * Note that there's therefore nothing needed for P. + */ + for (; i < pcnt; i++, q++) { + VDEV_RAIDZ_64MUL_2(*q, mask); + } + } + } +} + +static void +vdev_raidz_generate_parity_pqr(raidz_map_t *rm) +{ + uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i; + int c; + + pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == + rm->rm_col[VDEV_RAIDZ_Q].rc_size); + ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == + rm->rm_col[VDEV_RAIDZ_R].rc_size); + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + src = rm->rm_col[c].rc_data; + p = rm->rm_col[VDEV_RAIDZ_P].rc_data; + q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; + r = rm->rm_col[VDEV_RAIDZ_R].rc_data; + + ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); + + if (c == rm->rm_firstdatacol) { + ASSERT(ccnt == pcnt || ccnt == 0); + for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { + *p = *src; + *q = *src; + *r = *src; + } + for (; i < pcnt; i++, src++, p++, q++, r++) { + *p = 0; + *q = 0; + *r = 0; + } + } else { + ASSERT(ccnt <= pcnt); + + /* + * Apply the algorithm described above by multiplying + * the previous result and adding in the new value. + */ + for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { *p ^= *src; + + VDEV_RAIDZ_64MUL_2(*q, mask); + *q ^= *src; + + VDEV_RAIDZ_64MUL_4(*r, mask); + *r ^= *src; } /* * Treat short columns as though they are full of 0s. + * Note that there's therefore nothing needed for P. */ - for (; i < pcount; i++, q++) { - mask = *q & 0x8080808080808080ULL; - mask = (mask << 1) - (mask >> 7); - *q = ((*q << 1) & 0xfefefefefefefefeULL) ^ - (mask & 0x1d1d1d1d1d1d1d1dULL); + for (; i < pcnt; i++, q++, r++) { + VDEV_RAIDZ_64MUL_2(*q, mask); + VDEV_RAIDZ_64MUL_4(*r, mask); } } } } +/* + * Generate RAID parity in the first virtual columns according to the number of + * parity columns available. + */ static void -vdev_raidz_reconstruct_p(raidz_map_t *rm, int x) +vdev_raidz_generate_parity(raidz_map_t *rm) +{ + switch (rm->rm_firstdatacol) { + case 1: + vdev_raidz_generate_parity_p(rm); + break; + case 2: + vdev_raidz_generate_parity_pq(rm); + break; + case 3: + vdev_raidz_generate_parity_pqr(rm); + break; + default: + cmn_err(CE_PANIC, "invalid RAID-Z configuration"); + } +} + +static int +vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) { uint64_t *dst, *src, xcount, ccount, count, i; + int x = tgts[0]; int c; + ASSERT(ntgts == 1); + ASSERT(x >= rm->rm_firstdatacol); + ASSERT(x < rm->rm_cols); + xcount = rm->rm_col[x].rc_size / sizeof (src[0]); ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); ASSERT(xcount > 0); @@ -404,15 +562,20 @@ vdev_raidz_reconstruct_p(raidz_map_t *rm, int x) *dst ^= *src; } } + + return (1 << VDEV_RAIDZ_P); } -static void -vdev_raidz_reconstruct_q(raidz_map_t *rm, int x) +static int +vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) { uint64_t *dst, *src, xcount, ccount, count, mask, i; uint8_t *b; + int x = tgts[0]; int c, j, exp; + ASSERT(ntgts == 1); + xcount = rm->rm_col[x].rc_size / sizeof (src[0]); ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); @@ -436,23 +599,13 @@ vdev_raidz_reconstruct_q(raidz_map_t *rm, int x) } } else { - /* - * For an explanation of this, see the comment in - * vdev_raidz_generate_parity_pq() above. - */ for (i = 0; i < count; i++, dst++, src++) { - mask = *dst & 0x8080808080808080ULL; - mask = (mask << 1) - (mask >> 7); - *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^ - (mask & 0x1d1d1d1d1d1d1d1dULL); + VDEV_RAIDZ_64MUL_2(*dst, mask); *dst ^= *src; } for (; i < xcount; i++, dst++) { - mask = *dst & 0x8080808080808080ULL; - mask = (mask << 1) - (mask >> 7); - *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^ - (mask & 0x1d1d1d1d1d1d1d1dULL); + VDEV_RAIDZ_64MUL_2(*dst, mask); } } } @@ -467,15 +620,20 @@ vdev_raidz_reconstruct_q(raidz_map_t *rm, int x) *b = vdev_raidz_exp2(*b, exp); } } + + return (1 << VDEV_RAIDZ_Q); } -static void -vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y) +static int +vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) { uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; void *pdata, *qdata; uint64_t xsize, ysize, i; + int x = tgts[0]; + int y = tgts[1]; + ASSERT(ntgts == 2); ASSERT(x < y); ASSERT(x >= rm->rm_firstdatacol); ASSERT(y < rm->rm_cols); @@ -553,15 +711,554 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y) */ rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata; rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata; + + return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); +} + +/* BEGIN CSTYLED */ +/* + * In the general case of reconstruction, we must solve the system of linear + * equations defined by the coeffecients used to generate parity as well as + * the contents of the data and parity disks. This can be expressed with + * vectors for the original data (D) and the actual data (d) and parity (p) + * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): + * + * __ __ __ __ + * | | __ __ | p_0 | + * | V | | D_0 | | p_m-1 | + * | | x | : | = | d_0 | + * | I | | D_n-1 | | : | + * | | ~~ ~~ | d_n-1 | + * ~~ ~~ ~~ ~~ + * + * I is simply a square identity matrix of size n, and V is a vandermonde + * matrix defined by the coeffecients we chose for the various parity columns + * (1, 2, 4). Note that these values were chosen both for simplicity, speedy + * computation as well as linear separability. + * + * __ __ __ __ + * | 1 .. 1 1 1 | | p_0 | + * | 2^n-1 .. 4 2 1 | __ __ | : | + * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | + * | 1 .. 0 0 0 | | D_1 | | d_0 | + * | 0 .. 0 0 0 | x | D_2 | = | d_1 | + * | : : : : | | : | | d_2 | + * | 0 .. 1 0 0 | | D_n-1 | | : | + * | 0 .. 0 1 0 | ~~ ~~ | : | + * | 0 .. 0 0 1 | | d_n-1 | + * ~~ ~~ ~~ ~~ + * + * Note that I, V, d, and p are known. To compute D, we must invert the + * matrix and use the known data and parity values to reconstruct the unknown + * data values. We begin by removing the rows in V|I and d|p that correspond + * to failed or missing columns; we then make V|I square (n x n) and d|p + * sized n by removing rows corresponding to unused parity from the bottom up + * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' + * using Gauss-Jordan elimination. In the example below we use m=3 parity + * columns, n=8 data columns, with errors in d_1, d_2, and p_1: + * __ __ + * | 1 1 1 1 1 1 1 1 | + * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks + * | 19 205 116 29 64 16 4 1 | / / + * | 1 0 0 0 0 0 0 0 | / / + * | 0 1 0 0 0 0 0 0 | <--' / + * (V|I) = | 0 0 1 0 0 0 0 0 | <---' + * | 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 1 1 1 1 1 1 1 | + * | 128 64 32 16 8 4 2 1 | + * | 19 205 116 29 64 16 4 1 | + * | 1 0 0 0 0 0 0 0 | + * | 0 1 0 0 0 0 0 0 | + * (V|I)' = | 0 0 1 0 0 0 0 0 | + * | 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * + * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We + * have carefully chosen the seed values 1, 2, and 4 to ensure that this + * matrix is not singular. + * __ __ + * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | + * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | + * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | + * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | + * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | + * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | + * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 0 0 1 0 0 0 0 0 | + * | 167 100 5 41 159 169 217 208 | + * | 166 100 4 40 158 168 216 209 | + * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * + * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values + * of the missing data. + * + * As is apparent from the example above, the only non-trivial rows in the + * inverse matrix correspond to the data disks that we're trying to + * reconstruct. Indeed, those are the only rows we need as the others would + * only be useful for reconstructing data known or assumed to be valid. For + * that reason, we only build the coefficients in the rows that correspond to + * targeted columns. + */ +/* END CSTYLED */ + +static void +vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, + uint8_t **rows) +{ + int i, j; + int pow; + + ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); + + /* + * Fill in the missing rows of interest. + */ + for (i = 0; i < nmap; i++) { + ASSERT3S(0, <=, map[i]); + ASSERT3S(map[i], <=, 2); + + pow = map[i] * n; + if (pow > 255) + pow -= 255; + ASSERT(pow <= 255); + + for (j = 0; j < n; j++) { + pow -= map[i]; + if (pow < 0) + pow += 255; + rows[i][j] = vdev_raidz_pow2[pow]; + } + } +} + +static void +vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, + uint8_t **rows, uint8_t **invrows, const uint8_t *used) +{ + int i, j, ii, jj; + uint8_t log; + + /* + * Assert that the first nmissing entries from the array of used + * columns correspond to parity columns and that subsequent entries + * correspond to data columns. + */ + for (i = 0; i < nmissing; i++) { + ASSERT3S(used[i], <, rm->rm_firstdatacol); + } + for (; i < n; i++) { + ASSERT3S(used[i], >=, rm->rm_firstdatacol); + } + + /* + * First initialize the storage where we'll compute the inverse rows. + */ + for (i = 0; i < nmissing; i++) { + for (j = 0; j < n; j++) { + invrows[i][j] = (i == j) ? 1 : 0; + } + } + + /* + * Subtract all trivial rows from the rows of consequence. + */ + for (i = 0; i < nmissing; i++) { + for (j = nmissing; j < n; j++) { + ASSERT3U(used[j], >=, rm->rm_firstdatacol); + jj = used[j] - rm->rm_firstdatacol; + ASSERT3S(jj, <, n); + invrows[i][j] = rows[i][jj]; + rows[i][jj] = 0; + } + } + + /* + * For each of the rows of interest, we must normalize it and subtract + * a multiple of it from the other rows. + */ + for (i = 0; i < nmissing; i++) { + for (j = 0; j < missing[i]; j++) { + ASSERT3U(rows[i][j], ==, 0); + } + ASSERT3U(rows[i][missing[i]], !=, 0); + + /* + * Compute the inverse of the first element and multiply each + * element in the row by that value. + */ + log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; + + for (j = 0; j < n; j++) { + rows[i][j] = vdev_raidz_exp2(rows[i][j], log); + invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); + } + + for (ii = 0; ii < nmissing; ii++) { + if (i == ii) + continue; + + ASSERT3U(rows[ii][missing[i]], !=, 0); + + log = vdev_raidz_log2[rows[ii][missing[i]]]; + + for (j = 0; j < n; j++) { + rows[ii][j] ^= + vdev_raidz_exp2(rows[i][j], log); + invrows[ii][j] ^= + vdev_raidz_exp2(invrows[i][j], log); + } + } + } + + /* + * Verify that the data that is left in the rows are properly part of + * an identity matrix. + */ + for (i = 0; i < nmissing; i++) { + for (j = 0; j < n; j++) { + if (j == missing[i]) { + ASSERT3U(rows[i][j], ==, 1); + } else { + ASSERT3U(rows[i][j], ==, 0); + } + } + } } +static void +vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, + int *missing, uint8_t **invrows, const uint8_t *used) +{ + int i, j, x, cc, c; + uint8_t *src; + uint64_t ccount; + uint8_t *dst[VDEV_RAIDZ_MAXPARITY]; + uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; + uint8_t log, val; + int ll; + uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; + uint8_t *p, *pp; + size_t psize; + + psize = sizeof (invlog[0][0]) * n * nmissing; + p = kmem_alloc(psize, KM_SLEEP); + + for (pp = p, i = 0; i < nmissing; i++) { + invlog[i] = pp; + pp += n; + } + + for (i = 0; i < nmissing; i++) { + for (j = 0; j < n; j++) { + ASSERT3U(invrows[i][j], !=, 0); + invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; + } + } + + for (i = 0; i < n; i++) { + c = used[i]; + ASSERT3U(c, <, rm->rm_cols); + + src = rm->rm_col[c].rc_data; + ccount = rm->rm_col[c].rc_size; + for (j = 0; j < nmissing; j++) { + cc = missing[j] + rm->rm_firstdatacol; + ASSERT3U(cc, >=, rm->rm_firstdatacol); + ASSERT3U(cc, <, rm->rm_cols); + ASSERT3U(cc, !=, c); + + dst[j] = rm->rm_col[cc].rc_data; + dcount[j] = rm->rm_col[cc].rc_size; + } + + ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); + + for (x = 0; x < ccount; x++, src++) { + if (*src != 0) + log = vdev_raidz_log2[*src]; + + for (cc = 0; cc < nmissing; cc++) { + if (x >= dcount[cc]) + continue; + + if (*src == 0) { + val = 0; + } else { + if ((ll = log + invlog[cc][i]) >= 255) + ll -= 255; + val = vdev_raidz_pow2[ll]; + } + + if (i == 0) + dst[cc][x] = val; + else + dst[cc][x] ^= val; + } + } + } + + kmem_free(p, psize); +} + +static int +vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) +{ + int n, i, c, t, tt; + int nmissing_rows; + int missing_rows[VDEV_RAIDZ_MAXPARITY]; + int parity_map[VDEV_RAIDZ_MAXPARITY]; + + uint8_t *p, *pp; + size_t psize; + + uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; + uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; + uint8_t *used; + + int code = 0; + + + n = rm->rm_cols - rm->rm_firstdatacol; + + /* + * Figure out which data columns are missing. + */ + nmissing_rows = 0; + for (t = 0; t < ntgts; t++) { + if (tgts[t] >= rm->rm_firstdatacol) { + missing_rows[nmissing_rows++] = + tgts[t] - rm->rm_firstdatacol; + } + } + + /* + * Figure out which parity columns to use to help generate the missing + * data columns. + */ + for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { + ASSERT(tt < ntgts); + ASSERT(c < rm->rm_firstdatacol); + + /* + * Skip any targeted parity columns. + */ + if (c == tgts[tt]) { + tt++; + continue; + } + + code |= 1 << c; + + parity_map[i] = c; + i++; + } + + ASSERT(code != 0); + ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); + + psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * + nmissing_rows * n + sizeof (used[0]) * n; + p = kmem_alloc(psize, KM_SLEEP); + + for (pp = p, i = 0; i < nmissing_rows; i++) { + rows[i] = pp; + pp += n; + invrows[i] = pp; + pp += n; + } + used = pp; + + for (i = 0; i < nmissing_rows; i++) { + used[i] = parity_map[i]; + } + + for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + if (tt < nmissing_rows && + c == missing_rows[tt] + rm->rm_firstdatacol) { + tt++; + continue; + } + + ASSERT3S(i, <, n); + used[i] = c; + i++; + } + + /* + * Initialize the interesting rows of the matrix. + */ + vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); + + /* + * Invert the matrix. + */ + vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, + invrows, used); + + /* + * Reconstruct the missing data using the generated matrix. + */ + vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, + invrows, used); + + kmem_free(p, psize); + + return (code); +} + +static int +vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) +{ + int tgts[VDEV_RAIDZ_MAXPARITY], *dt; + int ntgts; + int i, c; + int code; + int nbadparity, nbaddata; + int parity_valid[VDEV_RAIDZ_MAXPARITY]; + + /* + * The tgts list must already be sorted. + */ + for (i = 1; i < nt; i++) { + ASSERT(t[i] > t[i - 1]); + } + + nbadparity = rm->rm_firstdatacol; + nbaddata = rm->rm_cols - nbadparity; + ntgts = 0; + for (i = 0, c = 0; c < rm->rm_cols; c++) { + if (c < rm->rm_firstdatacol) + parity_valid[c] = B_FALSE; + + if (i < nt && c == t[i]) { + tgts[ntgts++] = c; + i++; + } else if (rm->rm_col[c].rc_error != 0) { + tgts[ntgts++] = c; + } else if (c >= rm->rm_firstdatacol) { + nbaddata--; + } else { + parity_valid[c] = B_TRUE; + nbadparity--; + } + } + + ASSERT(ntgts >= nt); + ASSERT(nbaddata >= 0); + ASSERT(nbaddata + nbadparity == ntgts); + + dt = &tgts[nbadparity]; + + /* + * See if we can use any of our optimized reconstruction routines. + */ + if (!vdev_raidz_default_to_general) { + switch (nbaddata) { + case 1: + if (parity_valid[VDEV_RAIDZ_P]) + return (vdev_raidz_reconstruct_p(rm, dt, 1)); + + ASSERT(rm->rm_firstdatacol > 1); + + if (parity_valid[VDEV_RAIDZ_Q]) + return (vdev_raidz_reconstruct_q(rm, dt, 1)); + + ASSERT(rm->rm_firstdatacol > 2); + break; + + case 2: + ASSERT(rm->rm_firstdatacol > 1); + + if (parity_valid[VDEV_RAIDZ_P] && + parity_valid[VDEV_RAIDZ_Q]) + return (vdev_raidz_reconstruct_pq(rm, dt, 2)); + + ASSERT(rm->rm_firstdatacol > 2); + + break; + } + } + + code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); + ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); + ASSERT(code > 0); + return (code); +} static int vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) { vdev_t *cvd; uint64_t nparity = vd->vdev_nparity; - int c, error; + int c; int lasterror = 0; int numerrors = 0; @@ -573,11 +1270,13 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) return (EINVAL); } + vdev_open_children(vd); + for (c = 0; c < vd->vdev_children; c++) { cvd = vd->vdev_child[c]; - if ((error = vdev_open(cvd)) != 0) { - lasterror = error; + if (cvd->vdev_open_error != 0) { + lasterror = cvd->vdev_open_error; numerrors++; continue; } @@ -639,7 +1338,7 @@ vdev_raidz_io_start(zio_t *zio) blkptr_t *bp = zio->io_bp; raidz_map_t *rm; raidz_col_t *rc; - int c; + int c, i; rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, vd->vdev_nparity); @@ -647,13 +1346,7 @@ vdev_raidz_io_start(zio_t *zio) ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); if (zio->io_type == ZIO_TYPE_WRITE) { - /* - * Generate RAID parity in the first virtual columns. - */ - if (rm->rm_firstdatacol == 1) - vdev_raidz_generate_parity_p(rm); - else - vdev_raidz_generate_parity_pq(rm); + vdev_raidz_generate_parity(rm); for (c = 0; c < rm->rm_cols; c++) { rc = &rm->rm_col[c]; @@ -664,6 +1357,23 @@ vdev_raidz_io_start(zio_t *zio) vdev_raidz_child_done, rc)); } + /* + * Generate optional I/Os for any skipped sectors to improve + * aggregation contiguity. + */ + for (c = rm->rm_bigcols, i = 0; i < rm->rm_skipped; c++, i++) { + ASSERT(c <= rm->rm_scols); + if (c == rm->rm_scols) + c = 0; + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset + rc->rc_size, NULL, + 1 << tvd->vdev_ashift, + zio->io_type, zio->io_priority, + ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); + } + return (ZIO_PIPELINE_CONTINUE); } @@ -671,8 +1381,7 @@ vdev_raidz_io_start(zio_t *zio) /* * Iterate over the columns in reverse order so that we hit the parity - * last -- any errors along the way will force us to read the parity - * data. + * last -- any errors along the way will force us to read the parity. */ for (c = rm->rm_cols - 1; c >= 0; c--) { rc = &rm->rm_col[c]; @@ -748,10 +1457,7 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) bcopy(rc->rc_data, orig[c], rc->rc_size); } - if (rm->rm_firstdatacol == 1) - vdev_raidz_generate_parity_p(rm); - else - vdev_raidz_generate_parity_pq(rm); + vdev_raidz_generate_parity(rm); for (c = 0; c < rm->rm_firstdatacol; c++) { rc = &rm->rm_col[c]; @@ -768,9 +1474,10 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) return (ret); } -static uint64_t raidz_corrected_p; -static uint64_t raidz_corrected_q; -static uint64_t raidz_corrected_pq; +/* + * Keep statistics on all the ways that we used parity to correct data. + */ +static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY]; static int vdev_raidz_worst_error(raidz_map_t *rm) @@ -783,19 +1490,176 @@ vdev_raidz_worst_error(raidz_map_t *rm) return (error); } +/* + * Iterate over all combinations of bad data and attempt a reconstruction. + * Note that the algorithm below is non-optimal because it doesn't take into + * account how reconstruction is actually performed. For example, with + * triple-parity RAID-Z the reconstruction procedure is the same if column 4 + * is targeted as invalid as if columns 1 and 4 are targeted since in both + * cases we'd only use parity information in column 0. + */ +static int +vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) +{ + raidz_map_t *rm = zio->io_vsd; + raidz_col_t *rc; + void *orig[VDEV_RAIDZ_MAXPARITY]; + int tstore[VDEV_RAIDZ_MAXPARITY + 2]; + int *tgts = &tstore[1]; + int current, next, i, c, n; + int code, ret = 0; + + ASSERT(total_errors < rm->rm_firstdatacol); + + /* + * This simplifies one edge condition. + */ + tgts[-1] = -1; + + for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { + /* + * Initialize the targets array by finding the first n columns + * that contain no error. + * + * If there were no data errors, we need to ensure that we're + * always explicitly attempting to reconstruct at least one + * data column. To do this, we simply push the highest target + * up into the data columns. + */ + for (c = 0, i = 0; i < n; i++) { + if (i == n - 1 && data_errors == 0 && + c < rm->rm_firstdatacol) { + c = rm->rm_firstdatacol; + } + + while (rm->rm_col[c].rc_error != 0) { + c++; + ASSERT3S(c, <, rm->rm_cols); + } + + tgts[i] = c++; + } + + /* + * Setting tgts[n] simplifies the other edge condition. + */ + tgts[n] = rm->rm_cols; + + /* + * These buffers were allocated in previous iterations. + */ + for (i = 0; i < n - 1; i++) { + ASSERT(orig[i] != NULL); + } + + orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size); + + current = 0; + next = tgts[current]; + + while (current != n) { + tgts[current] = next; + current = 0; + + /* + * Save off the original data that we're going to + * attempt to reconstruct. + */ + for (i = 0; i < n; i++) { + ASSERT(orig[i] != NULL); + c = tgts[i]; + ASSERT3S(c, >=, 0); + ASSERT3S(c, <, rm->rm_cols); + rc = &rm->rm_col[c]; + bcopy(rc->rc_data, orig[i], rc->rc_size); + } + + /* + * Attempt a reconstruction and exit the outer loop on + * success. + */ + code = vdev_raidz_reconstruct(rm, tgts, n); + if (zio_checksum_error(zio) == 0) { + atomic_inc_64(&raidz_corrected[code]); + + for (i = 0; i < n; i++) { + c = tgts[i]; + rc = &rm->rm_col[c]; + ASSERT(rc->rc_error == 0); + if (rc->rc_tried) + raidz_checksum_error(zio, rc); + rc->rc_error = ECKSUM; + } + + ret = code; + goto done; + } + + /* + * Restore the original data. + */ + for (i = 0; i < n; i++) { + c = tgts[i]; + rc = &rm->rm_col[c]; + bcopy(orig[i], rc->rc_data, rc->rc_size); + } + + do { + /* + * Find the next valid column after the current + * position.. + */ + for (next = tgts[current] + 1; + next < rm->rm_cols && + rm->rm_col[next].rc_error != 0; next++) + continue; + + ASSERT(next <= tgts[current + 1]); + + /* + * If that spot is available, we're done here. + */ + if (next != tgts[current + 1]) + break; + + /* + * Otherwise, find the next valid column after + * the previous position. + */ + for (c = tgts[current - 1] + 1; + rm->rm_col[c].rc_error != 0; c++) + continue; + + tgts[current] = c; + current++; + + } while (current != n); + } + } + n--; +done: + for (i = 0; i < n; i++) { + zio_buf_free(orig[i], rm->rm_col[0].rc_size); + } + + return (ret); +} + static void vdev_raidz_io_done(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_t *cvd; raidz_map_t *rm = zio->io_vsd; - raidz_col_t *rc, *rc1; + raidz_col_t *rc; int unexpected_errors = 0; int parity_errors = 0; int parity_untried = 0; int data_errors = 0; int total_errors = 0; - int n, c, c1; + int n, c; + int tgts[VDEV_RAIDZ_MAXPARITY]; + int code; ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ @@ -859,8 +1723,7 @@ vdev_raidz_io_done(zio_t *zio) * any errors. */ if (total_errors <= rm->rm_firstdatacol - parity_untried) { - switch (data_errors) { - case 0: + if (data_errors == 0) { if (zio_checksum_error(zio) == 0) { /* * If we read parity information (unnecessarily @@ -880,9 +1743,7 @@ vdev_raidz_io_done(zio_t *zio) } goto done; } - break; - - case 1: + } else { /* * We either attempt to read all the parity columns or * none of them. If we didn't try to read parity, we @@ -894,45 +1755,38 @@ vdev_raidz_io_done(zio_t *zio) ASSERT(parity_errors < rm->rm_firstdatacol); /* - * Find the column that reported the error. + * Identify the data columns that reported an error. */ + n = 0; for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { rc = &rm->rm_col[c]; - if (rc->rc_error != 0) - break; + if (rc->rc_error != 0) { + ASSERT(n < VDEV_RAIDZ_MAXPARITY); + tgts[n++] = c; + } } - ASSERT(c != rm->rm_cols); - ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || - rc->rc_error == ESTALE); - if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) { - vdev_raidz_reconstruct_p(rm, c); - } else { - ASSERT(rm->rm_firstdatacol > 1); - vdev_raidz_reconstruct_q(rm, c); - } + ASSERT(rm->rm_firstdatacol >= n); + + code = vdev_raidz_reconstruct(rm, tgts, n); if (zio_checksum_error(zio) == 0) { - if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) - atomic_inc_64(&raidz_corrected_p); - else - atomic_inc_64(&raidz_corrected_q); + atomic_inc_64(&raidz_corrected[code]); /* - * If there's more than one parity disk that - * was successfully read, confirm that the - * other parity disk produced the correct data. - * This routine is suboptimal in that it - * regenerates both the parity we wish to test - * as well as the parity we just used to - * perform the reconstruction, but this should - * be a relatively uncommon case, and can be - * optimized if it becomes a problem. - * We also regenerate parity when resilvering - * so we can write it out to the failed device - * later. + * If we read more parity disks than were used + * for reconstruction, confirm that the other + * parity disks produced correct data. This + * routine is suboptimal in that it regenerates + * the parity that we already used in addition + * to the parity that we're attempting to + * verify, but this should be a relatively + * uncommon case, and can be optimized if it + * becomes a problem. Note that we regenerate + * parity when resilvering so we can write it + * out to failed devices later. */ - if (parity_errors < rm->rm_firstdatacol - 1 || + if (parity_errors < rm->rm_firstdatacol - n || (zio->io_flags & ZIO_FLAG_RESILVER)) { n = raidz_parity_verify(zio, rm); unexpected_errors += n; @@ -942,46 +1796,6 @@ vdev_raidz_io_done(zio_t *zio) goto done; } - break; - - case 2: - /* - * Two data column errors require double parity. - */ - ASSERT(rm->rm_firstdatacol == 2); - - /* - * Find the two columns that reported errors. - */ - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - if (rc->rc_error != 0) - break; - } - ASSERT(c != rm->rm_cols); - ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || - rc->rc_error == ESTALE); - - for (c1 = c++; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - if (rc->rc_error != 0) - break; - } - ASSERT(c != rm->rm_cols); - ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || - rc->rc_error == ESTALE); - - vdev_raidz_reconstruct_pq(rm, c1, c); - - if (zio_checksum_error(zio) == 0) { - atomic_inc_64(&raidz_corrected_pq); - goto done; - } - break; - - default: - ASSERT(rm->rm_firstdatacol <= 2); - ASSERT(0); } } @@ -1020,8 +1834,10 @@ vdev_raidz_io_done(zio_t *zio) * errors we detected, and we've attempted to read all columns. There * must, therefore, be one or more additional problems -- silent errors * resulting in invalid data rather than explicit I/O errors resulting - * in absent data. Before we attempt combinatorial reconstruction make - * sure we have a chance of coming up with the right answer. + * in absent data. We check if there is enough additional data to + * possibly reconstruct the data and then perform combinatorial + * reconstruction over all possible combinations. If that fails, + * we're cooked. */ if (total_errors >= rm->rm_firstdatacol) { zio->io_error = vdev_raidz_worst_error(rm); @@ -1032,133 +1848,30 @@ vdev_raidz_io_done(zio_t *zio) */ if (total_errors == rm->rm_firstdatacol) zio->io_error = zio_worst_error(zio->io_error, ECKSUM); - goto done; - } - if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) { + } else if ((code = vdev_raidz_combrec(zio, total_errors, + data_errors)) != 0) { /* - * Attempt to reconstruct the data from parity P. + * If we didn't use all the available parity for the + * combinatorial reconstruction, verify that the remaining + * parity is correct. */ - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - void *orig; - rc = &rm->rm_col[c]; - - orig = zio_buf_alloc(rc->rc_size); - bcopy(rc->rc_data, orig, rc->rc_size); - vdev_raidz_reconstruct_p(rm, c); - - if (zio_checksum_error(zio) == 0) { - zio_buf_free(orig, rc->rc_size); - atomic_inc_64(&raidz_corrected_p); - - /* - * If this child didn't know that it returned - * bad data, inform it. - */ - if (rc->rc_tried && rc->rc_error == 0) - raidz_checksum_error(zio, rc); - rc->rc_error = ECKSUM; - goto done; - } - - bcopy(orig, rc->rc_data, rc->rc_size); - zio_buf_free(orig, rc->rc_size); - } - } - - if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) { + if (code != (1 << rm->rm_firstdatacol) - 1) + (void) raidz_parity_verify(zio, rm); + } else { /* - * Attempt to reconstruct the data from parity Q. + * All combinations failed to checksum. Generate checksum + * ereports for all children. */ - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - void *orig; - rc = &rm->rm_col[c]; - - orig = zio_buf_alloc(rc->rc_size); - bcopy(rc->rc_data, orig, rc->rc_size); - vdev_raidz_reconstruct_q(rm, c); - - if (zio_checksum_error(zio) == 0) { - zio_buf_free(orig, rc->rc_size); - atomic_inc_64(&raidz_corrected_q); - - /* - * If this child didn't know that it returned - * bad data, inform it. - */ - if (rc->rc_tried && rc->rc_error == 0) - raidz_checksum_error(zio, rc); - rc->rc_error = ECKSUM; - goto done; - } - - bcopy(orig, rc->rc_data, rc->rc_size); - zio_buf_free(orig, rc->rc_size); - } - } + zio->io_error = ECKSUM; - if (rm->rm_firstdatacol > 1 && - rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 && - rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) { - /* - * Attempt to reconstruct the data from both P and Q. - */ - for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) { - void *orig, *orig1; - rc = &rm->rm_col[c]; - - orig = zio_buf_alloc(rc->rc_size); - bcopy(rc->rc_data, orig, rc->rc_size); - - for (c1 = c + 1; c1 < rm->rm_cols; c1++) { - rc1 = &rm->rm_col[c1]; - - orig1 = zio_buf_alloc(rc1->rc_size); - bcopy(rc1->rc_data, orig1, rc1->rc_size); - - vdev_raidz_reconstruct_pq(rm, c, c1); - - if (zio_checksum_error(zio) == 0) { - zio_buf_free(orig, rc->rc_size); - zio_buf_free(orig1, rc1->rc_size); - atomic_inc_64(&raidz_corrected_pq); - - /* - * If these children didn't know they - * returned bad data, inform them. - */ - if (rc->rc_tried && rc->rc_error == 0) - raidz_checksum_error(zio, rc); - if (rc1->rc_tried && rc1->rc_error == 0) - raidz_checksum_error(zio, rc1); - - rc->rc_error = ECKSUM; - rc1->rc_error = ECKSUM; - - goto done; - } - - bcopy(orig1, rc1->rc_data, rc1->rc_size); - zio_buf_free(orig1, rc1->rc_size); + if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + for (c = 0; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, + zio->io_spa, vd->vdev_child[rc->rc_devidx], + zio, rc->rc_offset, rc->rc_size); } - - bcopy(orig, rc->rc_data, rc->rc_size); - zio_buf_free(orig, rc->rc_size); - } - } - - /* - * All combinations failed to checksum. Generate checksum ereports for - * all children. - */ - zio->io_error = ECKSUM; - - if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { - for (c = 0; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, - zio->io_spa, vd->vdev_child[rc->rc_devidx], zio, - rc->rc_offset, rc->rc_size); } } diff --git a/module/zfs/vdev_root.c b/module/zfs/vdev_root.c index 88383f002..524c8e606 100644 --- a/module/zfs/vdev_root.c +++ b/module/zfs/vdev_root.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -52,7 +52,6 @@ too_many_errors(vdev_t *vd, int numerrors) static int vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) { - int c; int lasterror = 0; int numerrors = 0; @@ -61,15 +60,14 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) return (EINVAL); } - for (c = 0; c < vd->vdev_children; c++) { + vdev_open_children(vd); + + for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; - int error; - if ((error = vdev_open(cvd)) != 0 && - !cvd->vdev_islog) { - lasterror = error; + if (cvd->vdev_open_error && !cvd->vdev_islog) { + lasterror = cvd->vdev_open_error; numerrors++; - continue; } } @@ -87,9 +85,7 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) static void vdev_root_close(vdev_t *vd) { - int c; - - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) vdev_close(vd->vdev_child[c]); } diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index fbc93b423..528d31d5e 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -1068,7 +1068,7 @@ zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) int zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add, - uint64_t *towrite, uint64_t *tooverwrite, uint64_t dn_datablkshift) + uint64_t *towrite, uint64_t *tooverwrite) { zap_t *zap; int err = 0; @@ -1113,28 +1113,28 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add, *towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE; } } else { - if (!add) { - if (dmu_buf_freeable(zap->zap_dbuf)) - *tooverwrite += SPA_MAXBLOCKSIZE; - else - *towrite += SPA_MAXBLOCKSIZE; - } else { - /* - * We are here if we are adding and (name != NULL). - * It is hard to find out if this add will promote this - * microzap to fatzap. Hence, we assume the worst case - * and account for the blocks assuming this microzap - * would be promoted to a fatzap. - * - * 1 block overwritten : header block - * 4 new blocks written : 2 new split leaf, 2 grown - * ptrtbl blocks - */ - if (dmu_buf_freeable(zap->zap_dbuf)) - *tooverwrite += 1 << dn_datablkshift; - else - *towrite += 1 << dn_datablkshift; - *towrite += 4 << dn_datablkshift; + /* + * We are here if (name != NULL) and this is a micro-zap. + * We account for the header block depending on whether it + * is freeable. + * + * Incase of an add-operation it is hard to find out + * if this add will promote this microzap to fatzap. + * Hence, we consider the worst case and account for the + * blocks assuming this microzap would be promoted to a + * fatzap. + * + * 1 block overwritten : header block + * 4 new blocks written : 2 new split leaf, 2 grown + * ptrtbl blocks + */ + if (dmu_buf_freeable(zap->zap_dbuf)) + *tooverwrite += SPA_MAXBLOCKSIZE; + else + *towrite += SPA_MAXBLOCKSIZE; + + if (add) { + *towrite += 4 * SPA_MAXBLOCKSIZE; } } diff --git a/module/zfs/zfs_acl.c b/module/zfs/zfs_acl.c index 734bd8395..12ffe9f30 100644 --- a/module/zfs/zfs_acl.c +++ b/module/zfs/zfs_acl.c @@ -93,6 +93,8 @@ #define ZFS_ACL_WIDE_FLAGS (V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\ ZFS_ACL_OBJ_ACE) +#define ALL_MODE_EXECS (S_IXUSR | S_IXGRP | S_IXOTH) + static uint16_t zfs_ace_v0_get_type(void *acep) { @@ -781,6 +783,7 @@ zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp) uint64_t who; uint16_t iflags, type; uint32_t access_mask; + boolean_t an_exec_denied = B_FALSE; mode = (zp->z_phys->zp_mode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX)); @@ -905,8 +908,32 @@ zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp) } } } + } else { + /* + * Only care if this IDENTIFIER_GROUP or + * USER ACE denies execute access to someone, + * mode is not affected + */ + if ((access_mask & ACE_EXECUTE) && type == DENY) + an_exec_denied = B_TRUE; } } + + /* + * Failure to allow is effectively a deny, so execute permission + * is denied if it was never mentioned or if we explicitly + * weren't allowed it. + */ + if (!an_exec_denied && + ((seen & ALL_MODE_EXECS) != ALL_MODE_EXECS || + (mode & ALL_MODE_EXECS) != ALL_MODE_EXECS)) + an_exec_denied = B_TRUE; + + if (an_exec_denied) + zp->z_phys->zp_flags &= ~ZFS_NO_EXECS_DENIED; + else + zp->z_phys->zp_flags |= ZFS_NO_EXECS_DENIED; + return (mode); } @@ -946,7 +973,8 @@ zfs_acl_node_read_internal(znode_t *zp, boolean_t will_modify) } /* - * Read an external acl object. + * Read an external acl object. If the intent is to modify, always + * create a new acl and leave any cached acl in place. */ static int zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify) @@ -960,8 +988,15 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify) ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + if (zp->z_acl_cached && !will_modify) { + *aclpp = zp->z_acl_cached; + return (0); + } + if (zp->z_phys->zp_acl.z_acl_extern_obj == 0) { *aclpp = zfs_acl_node_read_internal(zp, will_modify); + if (!will_modify) + zp->z_acl_cached = *aclpp; return (0); } @@ -995,6 +1030,8 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify) } *aclpp = aclp; + if (!will_modify) + zp->z_acl_cached = aclp; return (0); } @@ -1019,11 +1056,16 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx) dmu_buf_will_dirty(zp->z_dbuf, tx); + if (zp->z_acl_cached) { + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = NULL; + } + zphys->zp_mode = zfs_mode_compute(zp, aclp); /* - * Decide which opbject type to use. If we are forced to - * use old ACL format than transform ACL into zfs_oldace_t + * Decide which object type to use. If we are forced to + * use old ACL format then transform ACL into zfs_oldace_t * layout. */ if (!zfsvfs->z_use_fuids) { @@ -1869,7 +1911,6 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, mutex_exit(&dzp->z_acl_lock); acl_ids->z_aclp = zfs_acl_inherit(zfsvfs, vap->va_type, paclp, acl_ids->z_mode, &need_chmod); - zfs_acl_free(paclp); } else { acl_ids->z_aclp = zfs_acl_alloc(zfs_acl_version_zp(dzp)); @@ -1998,8 +2039,6 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) mutex_exit(&zp->z_acl_lock); - zfs_acl_free(aclp); - return (0); } @@ -2095,11 +2134,6 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) aclp->z_hints |= (zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS); } top: - if (error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)) { - zfs_acl_free(aclp); - return (error); - } - mutex_enter(&zp->z_lock); mutex_enter(&zp->z_acl_lock); @@ -2145,6 +2179,7 @@ top: error = zfs_aclset_common(zp, aclp, cr, tx); ASSERT(error == 0); + zp->z_acl_cached = aclp; if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); @@ -2154,7 +2189,6 @@ top: if (fuidp) zfs_fuid_info_free(fuidp); - zfs_acl_free(aclp); dmu_tx_commit(tx); done: mutex_exit(&zp->z_acl_lock); @@ -2301,7 +2335,6 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, checkit = B_TRUE; break; } else { - zfs_acl_free(aclp); mutex_exit(&zp->z_acl_lock); return (EIO); } @@ -2321,7 +2354,6 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, uint32_t, mask_matched); if (anyaccess) { mutex_exit(&zp->z_acl_lock); - zfs_acl_free(aclp); return (0); } } @@ -2334,7 +2366,6 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, } mutex_exit(&zp->z_acl_lock); - zfs_acl_free(aclp); /* Put the found 'denies' back on the working mode */ if (deny_mask) { @@ -2366,8 +2397,7 @@ zfs_has_access(znode_t *zp, cred_t *cr) secpolicy_vnode_access(cr, ZTOV(zp), owner, VREAD) == 0 || secpolicy_vnode_access(cr, ZTOV(zp), owner, VWRITE) == 0 || secpolicy_vnode_access(cr, ZTOV(zp), owner, VEXEC) == 0 || - secpolicy_vnode_chown(cr, B_TRUE) == 0 || - secpolicy_vnode_chown(cr, B_FALSE) == 0 || + secpolicy_vnode_chown(cr, owner) == 0 || secpolicy_vnode_setdac(cr, owner) == 0 || secpolicy_vnode_remove(cr) == 0); } @@ -2421,6 +2451,78 @@ zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs, check_privs, B_FALSE, cr)); } +int +zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr) +{ + boolean_t owner = B_FALSE; + boolean_t groupmbr = B_FALSE; + boolean_t is_attr; + uid_t fowner; + uid_t gowner; + uid_t uid = crgetuid(cr); + int error; + + if (zdp->z_phys->zp_flags & ZFS_AV_QUARANTINED) + return (EACCES); + + is_attr = ((zdp->z_phys->zp_flags & ZFS_XATTR) && + (ZTOV(zdp)->v_type == VDIR)); + if (is_attr) + goto slow; + + mutex_enter(&zdp->z_acl_lock); + + if (zdp->z_phys->zp_flags & ZFS_NO_EXECS_DENIED) { + mutex_exit(&zdp->z_acl_lock); + return (0); + } + + if (FUID_INDEX(zdp->z_phys->zp_uid) != 0 || + FUID_INDEX(zdp->z_phys->zp_gid) != 0) { + mutex_exit(&zdp->z_acl_lock); + goto slow; + } + + fowner = (uid_t)zdp->z_phys->zp_uid; + gowner = (uid_t)zdp->z_phys->zp_gid; + + if (uid == fowner) { + owner = B_TRUE; + if (zdp->z_phys->zp_mode & S_IXUSR) { + mutex_exit(&zdp->z_acl_lock); + return (0); + } else { + mutex_exit(&zdp->z_acl_lock); + goto slow; + } + } + if (groupmember(gowner, cr)) { + groupmbr = B_TRUE; + if (zdp->z_phys->zp_mode & S_IXGRP) { + mutex_exit(&zdp->z_acl_lock); + return (0); + } else { + mutex_exit(&zdp->z_acl_lock); + goto slow; + } + } + if (!owner && !groupmbr) { + if (zdp->z_phys->zp_mode & S_IXOTH) { + mutex_exit(&zdp->z_acl_lock); + return (0); + } + } + + mutex_exit(&zdp->z_acl_lock); + +slow: + DTRACE_PROBE(zfs__fastpath__execute__access__miss); + ZFS_ENTER(zdp->z_zfsvfs); + error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr); + ZFS_EXIT(zdp->z_zfsvfs); + return (error); +} + /* * Determine whether Access should be granted/denied, invoking least * priv subsytem when a deny is determined. @@ -2515,7 +2617,7 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) owner, checkmode); if (error == 0 && (working_mode & ACE_WRITE_OWNER)) - error = secpolicy_vnode_chown(cr, B_TRUE); + error = secpolicy_vnode_chown(cr, owner); if (error == 0 && (working_mode & ACE_WRITE_ACL)) error = secpolicy_vnode_setdac(cr, owner); @@ -2524,7 +2626,7 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) error = secpolicy_vnode_remove(cr); if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) { - error = secpolicy_vnode_chown(cr, B_FALSE); + error = secpolicy_vnode_chown(cr, owner); } if (error == 0) { /* diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c index 27c2c51a3..c6c719871 100644 --- a/module/zfs/zfs_ctldir.c +++ b/module/zfs/zfs_ctldir.c @@ -700,7 +700,7 @@ zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr, if (err) avl_add(&sdp->sd_snaps, sep); else - err = dmu_objset_destroy(snapname); + err = dmu_objset_destroy(snapname, B_FALSE); } else { err = ENOENT; } diff --git a/module/zfs/zfs_fuid.c b/module/zfs/zfs_fuid.c index 8e481dffb..e704b1ca9 100644 --- a/module/zfs/zfs_fuid.c +++ b/module/zfs/zfs_fuid.c @@ -353,6 +353,7 @@ retry: rw_exit(&zfsvfs->z_fuid_lock); return (retidx); } else { + rw_exit(&zfsvfs->z_fuid_lock); return (-1); } } diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 07dd03c35..9cb40816d 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -761,6 +761,20 @@ zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, cred_t *cr) return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION, cr)); } +static int +zfs_secpolicy_hold(zfs_cmd_t *zc, cred_t *cr) +{ + return (zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_HOLD, cr)); +} + +static int +zfs_secpolicy_release(zfs_cmd_t *zc, cred_t *cr) +{ + return (zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_RELEASE, cr)); +} + /* * Returns the nvlist as specified by the user in the zfs_cmd_t. */ @@ -2466,7 +2480,7 @@ zfs_ioc_create(zfs_cmd_t *zc) */ if (error == 0) { if ((error = zfs_set_prop_nvlist(zc->zc_name, nvprops)) != 0) - (void) dmu_objset_destroy(zc->zc_name); + (void) dmu_objset_destroy(zc->zc_name, B_FALSE); } nvlist_free(nvprops); return (error); @@ -2553,8 +2567,9 @@ zfs_unmount_snap(char *name, void *arg) /* * inputs: - * zc_name name of filesystem - * zc_value short name of snapshot + * zc_name name of filesystem + * zc_value short name of snapshot + * zc_defer_destroy mark for deferred destroy * * outputs: none */ @@ -2569,13 +2584,15 @@ zfs_ioc_destroy_snaps(zfs_cmd_t *zc) zfs_unmount_snap, zc->zc_value, DS_FIND_CHILDREN); if (err) return (err); - return (dmu_snapshots_destroy(zc->zc_name, zc->zc_value)); + return (dmu_snapshots_destroy(zc->zc_name, zc->zc_value, + zc->zc_defer_destroy)); } /* * inputs: * zc_name name of dataset to destroy * zc_objset_type type of objset + * zc_defer_destroy mark for deferred destroy * * outputs: none */ @@ -2588,7 +2605,7 @@ zfs_ioc_destroy(zfs_cmd_t *zc) return (err); } - return (dmu_objset_destroy(zc->zc_name)); + return (dmu_objset_destroy(zc->zc_name, zc->zc_defer_destroy)); } /* @@ -2708,7 +2725,6 @@ zfs_ioc_recv(zfs_cmd_t *zc) file_t *fp; objset_t *os; dmu_recv_cookie_t drc; - zfsvfs_t *zfsvfs = NULL; boolean_t force = (boolean_t)zc->zc_guid; int error, fd; offset_t off; @@ -2740,25 +2756,12 @@ zfs_ioc_recv(zfs_cmd_t *zc) return (EBADF); } - if (getzfsvfs(tofs, &zfsvfs) == 0) { - if (!mutex_tryenter(&zfsvfs->z_online_recv_lock)) { - VFS_RELE(zfsvfs->z_vfs); - zfsvfs = NULL; - error = EBUSY; - goto out; - } + if (props && dmu_objset_open(tofs, DMU_OST_ANY, + DS_MODE_USER | DS_MODE_READONLY, &os) == 0) { /* * If new properties are supplied, they are to completely * replace the existing ones, so stash away the existing ones. */ - if (props) - (void) dsl_prop_get_all(zfsvfs->z_os, &origprops, TRUE); - } else if (props && dmu_objset_open(tofs, DMU_OST_ANY, - DS_MODE_USER | DS_MODE_READONLY, &os) == 0) { - /* - * Get the props even if there was no zfsvfs (zvol or - * unmounted zpl). - */ (void) dsl_prop_get_all(os, &origprops, TRUE); dmu_objset_close(os); @@ -2772,7 +2775,7 @@ zfs_ioc_recv(zfs_cmd_t *zc) } error = dmu_recv_begin(tofs, tosnap, &zc->zc_begin_record, - force, origin, zfsvfs != NULL, &drc); + force, origin, &drc); if (origin) dmu_objset_close(origin); if (error) @@ -2793,25 +2796,33 @@ zfs_ioc_recv(zfs_cmd_t *zc) off = fp->f_offset; error = dmu_recv_stream(&drc, fp->f_vnode, &off); - if (error == 0 && zfsvfs) { - char *osname; - int mode; + if (error == 0) { + zfsvfs_t *zfsvfs = NULL; - /* online recv */ - osname = kmem_alloc(MAXNAMELEN, KM_SLEEP); - error = zfs_suspend_fs(zfsvfs, osname, &mode); - if (error == 0) { - int resume_err; + if (getzfsvfs(tofs, &zfsvfs) == 0) { + /* online recv */ + int end_err; + char *osname; + int mode; - error = dmu_recv_end(&drc); - resume_err = zfs_resume_fs(zfsvfs, osname, mode); - error = error ? error : resume_err; + osname = kmem_alloc(MAXNAMELEN, KM_SLEEP); + error = zfs_suspend_fs(zfsvfs, osname, &mode); + /* + * If the suspend fails, then the recv_end will + * likely also fail, and clean up after itself. + */ + end_err = dmu_recv_end(&drc); + if (error == 0) { + int resume_err = + zfs_resume_fs(zfsvfs, osname, mode); + error = error ? error : resume_err; + } + error = error ? error : end_err; + VFS_RELE(zfsvfs->z_vfs); + kmem_free(osname, MAXNAMELEN); } else { - dmu_recv_abort_cleanup(&drc); + error = dmu_recv_end(&drc); } - kmem_free(osname, MAXNAMELEN); - } else if (error == 0) { - error = dmu_recv_end(&drc); } zc->zc_cookie = off - fp->f_offset; @@ -2826,10 +2837,6 @@ zfs_ioc_recv(zfs_cmd_t *zc) (void) zfs_set_prop_nvlist(tofs, origprops); } out: - if (zfsvfs) { - mutex_exit(&zfsvfs->z_online_recv_lock); - VFS_RELE(zfsvfs->z_vfs); - } nvlist_free(props); nvlist_free(origprops); releasef(fd); @@ -3432,6 +3439,69 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc) } /* + * inputs: + * zc_name name of filesystem + * zc_value short name of snap + * zc_string user-supplied tag for this reference + * zc_cookie recursive flag + * + * outputs: none + */ +static int +zfs_ioc_hold(zfs_cmd_t *zc) +{ + boolean_t recursive = zc->zc_cookie; + + if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) + return (EINVAL); + + return (dsl_dataset_user_hold(zc->zc_name, zc->zc_value, + zc->zc_string, recursive)); +} + +/* + * inputs: + * zc_name name of dataset from which we're releasing a user reference + * zc_value short name of snap + * zc_string user-supplied tag for this reference + * zc_cookie recursive flag + * + * outputs: none + */ +static int +zfs_ioc_release(zfs_cmd_t *zc) +{ + boolean_t recursive = zc->zc_cookie; + + if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) + return (EINVAL); + + return (dsl_dataset_user_release(zc->zc_name, zc->zc_value, + zc->zc_string, recursive)); +} + +/* + * inputs: + * zc_name name of filesystem + * + * outputs: + * zc_nvlist_src{_size} nvlist of snapshot holds + */ +static int +zfs_ioc_get_holds(zfs_cmd_t *zc) +{ + nvlist_t *nvp; + int error; + + if ((error = dsl_dataset_get_holds(zc->zc_name, &nvp)) == 0) { + error = put_nvlist(zc, nvp); + nvlist_free(nvp); + } + + return (error); +} + +/* * pool create, destroy, and export don't log the history as part of * zfsdev_ioctl, but rather zfs_ioc_pool_create, and zfs_ioc_pool_export * do the logging of those commands. @@ -3511,8 +3581,8 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = { B_TRUE }, { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_config, POOL_NAME, B_FALSE, B_FALSE }, - { zfs_ioc_obj_to_path, zfs_secpolicy_config, NO_NAME, B_FALSE, - B_FALSE }, + { zfs_ioc_obj_to_path, zfs_secpolicy_config, DATASET_NAME, B_FALSE, + B_TRUE }, { zfs_ioc_pool_set_props, zfs_secpolicy_config, POOL_NAME, B_TRUE, B_TRUE }, { zfs_ioc_pool_get_props, zfs_secpolicy_read, POOL_NAME, B_FALSE, @@ -3534,6 +3604,11 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = { DATASET_NAME, B_FALSE, B_FALSE }, { zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade, DATASET_NAME, B_FALSE, B_TRUE }, + { zfs_ioc_hold, zfs_secpolicy_hold, DATASET_NAME, B_TRUE, B_TRUE }, + { zfs_ioc_release, zfs_secpolicy_release, DATASET_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, B_FALSE, + B_TRUE } }; int diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c index 8a859b575..d03f92ba0 100644 --- a/module/zfs/zfs_vfsops.c +++ b/module/zfs/zfs_vfsops.c @@ -935,7 +935,6 @@ zfsvfs_create(const char *osname, int mode, zfsvfs_t **zvp) goto out; mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&zfsvfs->z_online_recv_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); @@ -1051,7 +1050,6 @@ zfsvfs_free(zfsvfs_t *zfsvfs) zfs_fuid_destroy(zfsvfs); mutex_destroy(&zfsvfs->z_znodes_lock); - mutex_destroy(&zfsvfs->z_online_recv_lock); mutex_destroy(&zfsvfs->z_lock); list_destroy(&zfsvfs->z_all_znodes); rrw_destroy(&zfsvfs->z_teardown_lock); diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 88d4e52c1..8eb4665ae 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -208,6 +208,12 @@ zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; + /* + * Clean up any locks held by this process on the vp. + */ + cleanlocks(vp, ddi_get_pid(), 0); + cleanshares(vp, ddi_get_pid()); + ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); @@ -215,12 +221,6 @@ zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, if ((flag & (FSYNC | FDSYNC)) && (count == 1)) atomic_dec_32(&zp->z_sync_cnt); - /* - * Clean up any locks held by this process on the vp. - */ - cleanlocks(vp, ddi_get_pid(), 0); - cleanshares(vp, ddi_get_pid()); - if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && ZTOV(zp)->v_type == VREG && !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) && @@ -855,6 +855,10 @@ zfs_get_done(dmu_buf_t *db, void *vzgd) kmem_free(zgd, sizeof (zgd_t)); } +#ifdef DEBUG +static int zil_fault_io = 0; +#endif + /* * Get data to generate a TX_WRITE intent log record. */ @@ -936,7 +940,21 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) zgd->zgd_rl = rl; zgd->zgd_zilog = zfsvfs->z_log; zgd->zgd_bp = &lr->lr_blkptr; - VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db)); +#ifdef DEBUG + if (zil_fault_io) { + error = EIO; + zil_fault_io = 0; + } else { + error = dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db); + } +#else + error = dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db); +#endif + if (error != 0) { + kmem_free(zgd, sizeof (zgd_t)); + goto out; + } + ASSERT(boff == db->db_offset); lr->lr_blkoff = off - boff; error = dmu_sync(zio, db, &lr->lr_blkptr, @@ -988,6 +1006,27 @@ zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr, } /* + * If vnode is for a device return a specfs vnode instead. + */ +static int +specvp_check(vnode_t **vpp, cred_t *cr) +{ + int error = 0; + + if (IS_DEVVP(*vpp)) { + struct vnode *svp; + + svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); + VN_RELE(*vpp); + if (svp == NULL) + error = ENOSYS; + *vpp = svp; + } + return (error); +} + + +/* * Lookup an entry in a directory, or an extended attribute directory. * If it exists, return a held vnode reference for it. * @@ -1017,7 +1056,46 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, { znode_t *zdp = VTOZ(dvp); zfsvfs_t *zfsvfs = zdp->z_zfsvfs; - int error; + int error = 0; + + /* fast path */ + if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) { + + if (dvp->v_type != VDIR) { + return (ENOTDIR); + } else if (zdp->z_dbuf == NULL) { + return (EIO); + } + + if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) { + error = zfs_fastaccesschk_execute(zdp, cr); + if (!error) { + *vpp = dvp; + VN_HOLD(*vpp); + return (0); + } + return (error); + } else { + vnode_t *tvp = dnlc_lookup(dvp, nm); + + if (tvp) { + error = zfs_fastaccesschk_execute(zdp, cr); + if (error) { + VN_RELE(tvp); + return (error); + } + if (tvp == DNLC_NO_VNODE) { + VN_RELE(tvp); + return (ENOENT); + } else { + *vpp = tvp; + return (specvp_check(vpp, cr)); + } + } + } + } + + DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zdp); @@ -1082,21 +1160,8 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, } error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp); - if (error == 0) { - /* - * Convert device special files - */ - if (IS_DEVVP(*vpp)) { - vnode_t *svp; - - svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); - VN_RELE(*vpp); - if (svp == NULL) - error = ENOSYS; - else - *vpp = svp; - } - } + if (error == 0) + error = specvp_check(vpp, cr); ZFS_EXIT(zfsvfs); return (error); @@ -1235,6 +1300,7 @@ top: &acl_ids)) != 0) goto out; if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { + zfs_acl_ids_free(&acl_ids); error = EDQUOT; goto out; } @@ -1332,19 +1398,7 @@ out: VN_RELE(ZTOV(zp)); } else { *vpp = ZTOV(zp); - /* - * If vnode is for a device return a specfs vnode instead. - */ - if (IS_DEVVP(*vpp)) { - struct vnode *svp; - - svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); - VN_RELE(*vpp); - if (svp == NULL) { - error = ENOSYS; - } - *vpp = svp; - } + error = specvp_check(vpp, cr); } ZFS_EXIT(zfsvfs); @@ -1653,6 +1707,7 @@ top: return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { + zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); ZFS_EXIT(zfsvfs); return (EDQUOT); @@ -2456,6 +2511,7 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, top: attrzp = NULL; + /* Can this be moved to before the top label? */ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { ZFS_EXIT(zfsvfs); return (EROFS); @@ -2765,6 +2821,8 @@ top: zp->z_phys->zp_mode = new_mode; err = zfs_aclset_common(zp, aclp, cr, tx); ASSERT3U(err, ==, 0); + zp->z_acl_cached = aclp; + aclp = NULL; mutex_exit(&zp->z_acl_lock); } @@ -2856,10 +2914,8 @@ out: if (attrzp) VN_RELE(ZTOV(attrzp)); - if (aclp) { + if (aclp) zfs_acl_free(aclp); - aclp = NULL; - } if (fuidp) { zfs_fuid_info_free(fuidp); @@ -3724,8 +3780,8 @@ top: if (err == 0) { zfs_time_stamper(zp, CONTENT_MODIFIED, tx); zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0); - dmu_tx_commit(tx); } + dmu_tx_commit(tx); out: pvn_write_done(pp, (err ? B_ERROR : 0) | flags); diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c index 8ced95174..f99e72f1d 100644 --- a/module/zfs/zfs_znode.c +++ b/module/zfs/zfs_znode.c @@ -133,6 +133,7 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) zp->z_dbuf = NULL; zp->z_dirlocks = NULL; + zp->z_acl_cached = NULL; return (0); } @@ -155,6 +156,7 @@ zfs_znode_cache_destructor(void *buf, void *arg) ASSERT(zp->z_dbuf == NULL); ASSERT(zp->z_dirlocks == NULL); + ASSERT(zp->z_acl_cached == NULL); } #ifdef ZNODE_STATS @@ -199,6 +201,18 @@ zfs_znode_move_impl(znode_t *ozp, znode_t *nzp) nzp->z_phys = ozp->z_phys; nzp->z_dbuf = ozp->z_dbuf; + /* + * Release any cached ACL, since it *may* have + * zfs_acl_node_t's that directly references an + * embedded ACL in the zp_acl of the old znode_phys_t + * + * It will be recached the next time the ACL is needed. + */ + if (ozp->z_acl_cached) { + zfs_acl_free(ozp->z_acl_cached); + ozp->z_acl_cached = NULL; + } + /* Update back pointers. */ (void) dmu_buf_update_user(nzp->z_dbuf, ozp, nzp, &nzp->z_phys, znode_evict_error); @@ -1081,6 +1095,11 @@ zfs_znode_free(znode_t *zp) list_remove(&zfsvfs->z_all_znodes, zp); mutex_exit(&zfsvfs->z_znodes_lock); + if (zp->z_acl_cached) { + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = NULL; + } + kmem_cache_free(znode_cache, zp); VFS_RELE(zfsvfs->z_vfs); diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 53d9d9bf7..db3822f5a 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -714,14 +714,15 @@ zil_lwb_write_done(zio_t *zio) lwb->lwb_buf = NULL; if (zio->io_error) zilog->zl_log_error = B_TRUE; - mutex_exit(&zilog->zl_lock); /* * Now that we've written this log block, we have a stable pointer * to the next block in the chain, so it's OK to let the txg in - * which we allocated the next block sync. + * which we allocated the next block sync. We still have the + * zl_lock to ensure zil_sync doesn't kmem free the lwb. */ txg_rele_to_sync(&lwb->lwb_txgh); + mutex_exit(&zilog->zl_lock); } /* @@ -925,6 +926,10 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) } error = zilog->zl_get_data( itx->itx_private, lr, dbuf, lwb->lwb_zio); + if (error == EIO) { + txg_wait_synced(zilog->zl_dmu_pool, txg); + return (lwb); + } if (error) { ASSERT(error == ENOENT || error == EEXIST || error == EALREADY); |