diff options
103 files changed, 7625 insertions, 4089 deletions
diff --git a/ZFS.RELEASE b/ZFS.RELEASE index 282b58543..8b7cfb53c 100644 --- a/ZFS.RELEASE +++ b/ZFS.RELEASE @@ -1 +1 @@ -http://dlc.sun.com/osol/on/downloads/b108/on-src.tar.bz2 +http://dlc.sun.com/osol/on/downloads/b117/on-src.tar.bz2 diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 0ced25865..a310fc38b 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -102,7 +102,9 @@ usage(void) (void) fprintf(stderr, " -C cached pool configuration\n"); (void) fprintf(stderr, " -i intent logs\n"); (void) fprintf(stderr, " -b block statistics\n"); - (void) fprintf(stderr, " -c checksum all data blocks\n"); + (void) fprintf(stderr, " -m metaslabs\n"); + (void) fprintf(stderr, " -c checksum all metadata (twice for " + "all data) blocks\n"); (void) fprintf(stderr, " -s report stats on zdb's I/O\n"); (void) fprintf(stderr, " -S <user|all>:<cksum_alg|all> -- " "dump blkptr signatures\n"); @@ -125,6 +127,11 @@ usage(void) exit(1); } +/* + * Called for usage errors that are discovered after a call to spa_open(), + * dmu_bonus_hold(), or pool_match(). abort() is called for other errors. + */ + static void fatal(const char *fmt, ...) { @@ -136,7 +143,7 @@ fatal(const char *fmt, ...) va_end(ap); (void) fprintf(stderr, "\n"); - abort(); + exit(1); } static void @@ -209,7 +216,7 @@ dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size) size_t nvsize = *(uint64_t *)data; char *packed = umem_alloc(nvsize, UMEM_NOFAIL); - VERIFY(0 == dmu_read(os, object, 0, nvsize, packed)); + VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH)); VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0); @@ -435,7 +442,7 @@ dump_spacemap(objset_t *os, space_map_obj_t *smo, space_map_t *sm) alloc = 0; for (offset = 0; offset < smo->smo_objsize; offset += sizeof (entry)) { VERIFY(0 == dmu_read(os, smo->smo_object, offset, - sizeof (entry), &entry)); + sizeof (entry), &entry, DMU_READ_PREFETCH)); if (SM_DEBUG_DECODE(entry)) { (void) printf("\t\t[%4llu] %s: txg %llu, pass %llu\n", (u_longlong_t)(offset / sizeof (entry)), @@ -467,6 +474,21 @@ dump_spacemap(objset_t *os, space_map_obj_t *smo, space_map_t *sm) } static void +dump_metaslab_stats(metaslab_t *msp) +{ + char maxbuf[5]; + space_map_t *sm = &msp->ms_map; + avl_tree_t *t = sm->sm_pp_root; + int free_pct = sm->sm_space * 100 / sm->sm_size; + + nicenum(space_map_maxsize(sm), maxbuf); + + (void) printf("\t %20s %10lu %7s %6s %4s %4d%%\n", + "segments", avl_numnodes(t), "maxsize", maxbuf, + "freepct", free_pct); +} + +static void dump_metaslab(metaslab_t *msp) { char freebuf[5]; @@ -476,22 +498,28 @@ dump_metaslab(metaslab_t *msp) nicenum(msp->ms_map.sm_size - smo->smo_alloc, freebuf); - if (dump_opt['d'] <= 5) { - (void) printf("\t%10llx %10llu %5s\n", - (u_longlong_t)msp->ms_map.sm_start, - (u_longlong_t)smo->smo_object, - freebuf); - return; - } - (void) printf( - "\tvdev %llu offset %08llx spacemap %4llu free %5s\n", + "\tvdev %5llu offset %12llx spacemap %6llu free %5s\n", (u_longlong_t)vd->vdev_id, (u_longlong_t)msp->ms_map.sm_start, (u_longlong_t)smo->smo_object, freebuf); - ASSERT(msp->ms_map.sm_size == (1ULL << vd->vdev_ms_shift)); + if (dump_opt['m'] > 1) { + mutex_enter(&msp->ms_lock); + VERIFY(space_map_load(&msp->ms_map, zfs_metaslab_ops, + SM_FREE, &msp->ms_smo, spa->spa_meta_objset) == 0); + dump_metaslab_stats(msp); + space_map_unload(&msp->ms_map); + mutex_exit(&msp->ms_lock); + } + + if (dump_opt['d'] > 5 || dump_opt['m'] > 2) { + ASSERT(msp->ms_map.sm_size == (1ULL << vd->vdev_ms_shift)); + + mutex_enter(&msp->ms_lock); + dump_spacemap(spa->spa_meta_objset, smo, &msp->ms_map); + mutex_exit(&msp->ms_lock); + } - dump_spacemap(spa->spa_meta_objset, smo, &msp->ms_map); } static void @@ -506,14 +534,12 @@ dump_metaslabs(spa_t *spa) for (c = 0; c < rvd->vdev_children; c++) { vd = rvd->vdev_child[c]; - (void) printf("\n vdev %llu\n\n", (u_longlong_t)vd->vdev_id); + (void) printf("\t%-10s %-19s %-15s %-10s\n", + "vdev", "offset", "spacemap", "free"); + (void) printf("\t%10s %19s %15s %10s\n", + "----------", "-------------------", + "---------------", "-------------"); - if (dump_opt['d'] <= 5) { - (void) printf("\t%10s %10s %5s\n", - "offset", "spacemap", "free"); - (void) printf("\t%10s %10s %5s\n", - "------", "--------", "----"); - } for (m = 0; m < vd->vdev_ms_count; m++) dump_metaslab(vd->vdev_ms[m]); (void) printf("\n"); @@ -917,6 +943,7 @@ dump_uidgid(objset_t *os, znode_phys_t *zp) /* first find the fuid object. It lives in the master node */ VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, &fuid_obj) == 0); + zfs_fuid_avl_tree_create(&idx_tree, &domain_tree); (void) zfs_fuid_table_load(os, fuid_obj, &idx_tree, &domain_tree); fuid_table_loaded = B_TRUE; @@ -1020,6 +1047,8 @@ static object_viewer_t *object_viewer[DMU_OT_NUMTYPES] = { dump_packed_nvlist, /* FUID nvlist size */ dump_zap, /* DSL dataset next clones */ dump_zap, /* DSL scrub queue */ + dump_zap, /* ZFS user/group used */ + dump_zap, /* ZFS user/group quota */ }; static void @@ -1083,6 +1112,14 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header) } if (verbosity >= 4) { + (void) printf("\tdnode flags: %s%s\n", + (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ? + "USED_BYTES " : "", + (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ? + "USERUSED_ACCOUNTED " : ""); + (void) printf("\tdnode maxblkid: %llu\n", + (longlong_t)dn->dn_phys->dn_maxblkid); + object_viewer[doi.doi_bonus_type](os, object, bonus, bsize); object_viewer[doi.doi_type](os, object, NULL, 0); *print_header = 1; @@ -1137,7 +1174,7 @@ dump_dir(objset_t *os) uint64_t object, object_count; uint64_t refdbytes, usedobjs, scratch; char numbuf[8]; - char blkbuf[BP_SPRINTF_LEN]; + char blkbuf[BP_SPRINTF_LEN + 20]; char osname[MAXNAMELEN]; char *type = "UNKNOWN"; int verbosity = dump_opt['d']; @@ -1163,8 +1200,8 @@ dump_dir(objset_t *os) nicenum(refdbytes, numbuf); if (verbosity >= 4) { - (void) strcpy(blkbuf, ", rootbp "); - sprintf_blkptr(blkbuf + strlen(blkbuf), + (void) sprintf(blkbuf + strlen(blkbuf), ", rootbp "); + (void) sprintf_blkptr(blkbuf + strlen(blkbuf), BP_SPRINTF_LEN - strlen(blkbuf), os->os->os_rootbp); } else { blkbuf[0] = '\0'; @@ -1199,7 +1236,12 @@ dump_dir(objset_t *os) } dump_object(os, 0, verbosity, &print_header); - object_count = 1; + object_count = 0; + if (os->os->os_userused_dnode && + os->os->os_userused_dnode->dn_type != 0) { + dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header); + dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header); + } object = 0; while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) { @@ -1211,8 +1253,10 @@ dump_dir(objset_t *os) (void) printf("\n"); - if (error != ESRCH) - fatal("dmu_object_next() = %d", error); + if (error != ESRCH) { + (void) fprintf(stderr, "dmu_object_next() = %d\n", error); + abort(); + } } static void @@ -1395,7 +1439,8 @@ static space_map_ops_t zdb_space_map_ops = { zdb_space_map_unload, NULL, /* alloc */ zdb_space_map_claim, - NULL /* free */ + NULL, /* free */ + NULL /* maxsize */ }; static void @@ -1505,13 +1550,25 @@ zdb_blkptr_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, { zdb_cb_t *zcb = arg; char blkbuf[BP_SPRINTF_LEN]; + dmu_object_type_t type; + boolean_t is_l0_metadata; if (bp == NULL) return (0); - zdb_count_block(spa, zcb, bp, BP_GET_TYPE(bp)); + type = BP_GET_TYPE(bp); + + zdb_count_block(spa, zcb, bp, type); - if (dump_opt['c'] || dump_opt['S']) { + /* + * if we do metadata-only checksumming there's no need to checksum + * indirect blocks here because it is done during traverse + */ + is_l0_metadata = (BP_GET_LEVEL(bp) == 0 && type < DMU_OT_NUMTYPES && + dmu_ot[type].ot_metadata); + + if (dump_opt['c'] > 1 || dump_opt['S'] || + (dump_opt['c'] && is_l0_metadata)) { int ioerr, size; void *data; @@ -1523,7 +1580,7 @@ zdb_blkptr_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, free(data); /* We expect io errors on intent log */ - if (ioerr && BP_GET_TYPE(bp) != DMU_OT_INTENT_LOG) { + if (ioerr && type != DMU_OT_INTENT_LOG) { zcb->zcb_haderrors = 1; zcb->zcb_errors[ioerr]++; @@ -1571,8 +1628,9 @@ dump_block_stats(spa_t *spa) int c, e; if (!dump_opt['S']) { - (void) printf("\nTraversing all blocks %s%s%s%s...\n", + (void) printf("\nTraversing all blocks %s%s%s%s%s...\n", (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "", + (dump_opt['c'] == 1) ? "metadata " : "", dump_opt['c'] ? "checksums " : "", (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "", !dump_opt['L'] ? "nothing leaked " : ""); @@ -1772,14 +1830,17 @@ dump_zpool(spa_t *spa) if (dump_opt['u']) dump_uberblock(&spa->spa_uberblock); - if (dump_opt['d'] || dump_opt['i']) { + if (dump_opt['d'] || dump_opt['i'] || dump_opt['m']) { dump_dir(dp->dp_meta_objset); if (dump_opt['d'] >= 3) { dump_bplist(dp->dp_meta_objset, spa->spa_sync_bplist_obj, "Deferred frees"); dump_dtl(spa->spa_root_vdev, 0); - dump_metaslabs(spa); } + + if (dump_opt['d'] >= 3 || dump_opt['m']) + dump_metaslabs(spa); + (void) dmu_objset_find(spa_name(spa), dump_one_dir, NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); } @@ -2255,13 +2316,14 @@ main(int argc, char **argv) dprintf_setup(&argc, argv); - while ((c = getopt(argc, argv, "udibcsvCLS:U:lRep:t:")) != -1) { + while ((c = getopt(argc, argv, "udibcmsvCLS:U:lRep:t:")) != -1) { switch (c) { case 'u': case 'd': case 'i': case 'b': case 'c': + case 'm': case 's': case 'C': case 'l': @@ -2397,7 +2459,7 @@ main(int argc, char **argv) } if (error == 0) - error = spa_import_faulted(argv[0], + error = spa_import_verbatim(argv[0], exported_conf, nvl); nvlist_free(nvl); diff --git a/cmd/zdb/zdb_il.c b/cmd/zdb/zdb_il.c index 02d35a050..cc08ef514 100644 --- a/cmd/zdb/zdb_il.c +++ b/cmd/zdb/zdb_il.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Print intent log header and statistics. */ @@ -345,8 +343,10 @@ dump_intent_log(zilog_t *zilog) if (zh->zh_log.blk_birth == 0 || verbose < 2) return; - (void) printf("\n ZIL header: claim_txg %llu, seq %llu\n", - (u_longlong_t)zh->zh_claim_txg, (u_longlong_t)zh->zh_replay_seq); + (void) printf("\n ZIL header: claim_txg %llu, claim_seq %llu", + (u_longlong_t)zh->zh_claim_txg, (u_longlong_t)zh->zh_claim_seq); + (void) printf(" replay_seq %llu, flags 0x%llx\n", + (u_longlong_t)zh->zh_replay_seq, (u_longlong_t)zh->zh_flags); if (verbose >= 4) print_log_bp(&zh->zh_log, "\n\tfirst block: "); diff --git a/cmd/zfs/zfs_iter.c b/cmd/zfs/zfs_iter.c index a22370a02..ca5c2b232 100644 --- a/cmd/zfs/zfs_iter.c +++ b/cmd/zfs/zfs_iter.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -53,11 +53,14 @@ typedef struct zfs_node { } zfs_node_t; typedef struct callback_data { - uu_avl_t *cb_avl; - int cb_flags; - zfs_type_t cb_types; - zfs_sort_column_t *cb_sortcol; - zprop_list_t **cb_proplist; + uu_avl_t *cb_avl; + int cb_flags; + zfs_type_t cb_types; + zfs_sort_column_t *cb_sortcol; + zprop_list_t **cb_proplist; + int cb_depth_limit; + int cb_depth; + uint8_t cb_props_table[ZFS_NUM_PROPS]; } callback_data_t; uu_avl_pool_t *avl_pool; @@ -98,10 +101,17 @@ zfs_callback(zfs_handle_t *zhp, void *data) uu_avl_node_init(node, &node->zn_avlnode, avl_pool); if (uu_avl_find(cb->cb_avl, node, cb->cb_sortcol, &idx) == NULL) { - if (cb->cb_proplist && - zfs_expand_proplist(zhp, cb->cb_proplist) != 0) { - free(node); - return (-1); + if (cb->cb_proplist) { + if ((*cb->cb_proplist) && + !(*cb->cb_proplist)->pl_all) + zfs_prune_proplist(zhp, + cb->cb_props_table); + + if (zfs_expand_proplist(zhp, cb->cb_proplist) + != 0) { + free(node); + return (-1); + } } uu_avl_insert(cb->cb_avl, node, idx); dontclose = 1; @@ -113,11 +123,15 @@ zfs_callback(zfs_handle_t *zhp, void *data) /* * Recurse if necessary. */ - if (cb->cb_flags & ZFS_ITER_RECURSE) { + if (cb->cb_flags & ZFS_ITER_RECURSE && + ((cb->cb_flags & ZFS_ITER_DEPTH_LIMIT) == 0 || + cb->cb_depth < cb->cb_depth_limit)) { + cb->cb_depth++; if (zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) (void) zfs_iter_filesystems(zhp, zfs_callback, data); if ((zfs_get_type(zhp) != ZFS_TYPE_SNAPSHOT) && include_snaps) (void) zfs_iter_snapshots(zhp, zfs_callback, data); + cb->cb_depth--; } if (!dontclose) @@ -325,10 +339,10 @@ zfs_sort(const void *larg, const void *rarg, void *data) int zfs_for_each(int argc, char **argv, int flags, zfs_type_t types, - zfs_sort_column_t *sortcol, zprop_list_t **proplist, + zfs_sort_column_t *sortcol, zprop_list_t **proplist, int limit, zfs_iter_f callback, void *data) { - callback_data_t cb; + callback_data_t cb = {0}; int ret = 0; zfs_node_t *node; uu_avl_walk_t *walk; @@ -346,6 +360,45 @@ zfs_for_each(int argc, char **argv, int flags, zfs_type_t types, cb.cb_flags = flags; cb.cb_proplist = proplist; cb.cb_types = types; + cb.cb_depth_limit = limit; + /* + * If cb_proplist is provided then in the zfs_handles created we + * retain only those properties listed in cb_proplist and sortcol. + * The rest are pruned. So, the caller should make sure that no other + * properties other than those listed in cb_proplist/sortcol are + * accessed. + * + * If cb_proplist is NULL then we retain all the properties. We + * always retain the zoned property, which some other properties + * need (userquota & friends), and the createtxg property, which + * we need to sort snapshots. + */ + if (cb.cb_proplist && *cb.cb_proplist) { + zprop_list_t *p = *cb.cb_proplist; + + while (p) { + if (p->pl_prop >= ZFS_PROP_TYPE && + p->pl_prop < ZFS_NUM_PROPS) { + cb.cb_props_table[p->pl_prop] = B_TRUE; + } + p = p->pl_next; + } + + while (sortcol) { + if (sortcol->sc_prop >= ZFS_PROP_TYPE && + sortcol->sc_prop < ZFS_NUM_PROPS) { + cb.cb_props_table[sortcol->sc_prop] = B_TRUE; + } + sortcol = sortcol->sc_next; + } + + cb.cb_props_table[ZFS_PROP_ZONED] = B_TRUE; + cb.cb_props_table[ZFS_PROP_CREATETXG] = B_TRUE; + } else { + (void) memset(cb.cb_props_table, B_TRUE, + sizeof (cb.cb_props_table)); + } + if ((cb.cb_avl = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL) { (void) fprintf(stderr, gettext("internal error: out of memory\n")); diff --git a/cmd/zfs/zfs_iter.h b/cmd/zfs/zfs_iter.h index 76a11085a..a0290775b 100644 --- a/cmd/zfs/zfs_iter.h +++ b/cmd/zfs/zfs_iter.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -41,9 +41,10 @@ typedef struct zfs_sort_column { #define ZFS_ITER_RECURSE (1 << 0) #define ZFS_ITER_ARGS_CAN_BE_PATHS (1 << 1) #define ZFS_ITER_PROP_LISTSNAPS (1 << 2) +#define ZFS_ITER_DEPTH_LIMIT (1 << 3) int zfs_for_each(int, char **, int options, zfs_type_t, - zfs_sort_column_t *, zprop_list_t **, zfs_iter_f, void *); + zfs_sort_column_t *, zprop_list_t **, int, zfs_iter_f, void *); int zfs_add_sort_column(zfs_sort_column_t **, const char *, boolean_t); void zfs_free_sort_columns(zfs_sort_column_t *); diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 1f7f47d08..0752a4772 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -39,12 +39,14 @@ #include <unistd.h> #include <fcntl.h> #include <zone.h> +#include <grp.h> +#include <pwd.h> #include <sys/mkdev.h> #include <sys/mntent.h> #include <sys/mnttab.h> #include <sys/mount.h> #include <sys/stat.h> -#include <sys/avl.h> +#include <sys/fs/zfs.h> #include <libzfs.h> #include <libuutil.h> @@ -56,6 +58,7 @@ libzfs_handle_t *g_zfs; static FILE *mnttab_file; static char history_str[HIS_MAX_RECORD_LEN]; +const char *pypath = "/usr/lib/zfs/pyzfs.py"; static int zfs_do_clone(int argc, char **argv); static int zfs_do_create(int argc, char **argv); @@ -75,8 +78,8 @@ static int zfs_do_unshare(int argc, char **argv); static int zfs_do_send(int argc, char **argv); static int zfs_do_receive(int argc, char **argv); static int zfs_do_promote(int argc, char **argv); -static int zfs_do_allow(int argc, char **argv); -static int zfs_do_unallow(int argc, char **argv); +static int zfs_do_userspace(int argc, char **argv); +static int zfs_do_python(int argc, char **argv); /* * Enable a reasonable set of defaults for libumem debugging on DEBUG builds. @@ -116,7 +119,9 @@ typedef enum { HELP_UNMOUNT, HELP_UNSHARE, HELP_ALLOW, - HELP_UNALLOW + HELP_UNALLOW, + HELP_USERSPACE, + HELP_GROUPSPACE } zfs_help_t; typedef struct zfs_command { @@ -150,6 +155,8 @@ static zfs_command_t command_table[] = { { "get", zfs_do_get, HELP_GET }, { "inherit", zfs_do_inherit, HELP_INHERIT }, { "upgrade", zfs_do_upgrade, HELP_UPGRADE }, + { "userspace", zfs_do_userspace, HELP_USERSPACE }, + { "groupspace", zfs_do_userspace, HELP_GROUPSPACE }, { NULL }, { "mount", zfs_do_mount, HELP_MOUNT }, { "unmount", zfs_do_unmount, HELP_UNMOUNT }, @@ -159,9 +166,9 @@ static zfs_command_t command_table[] = { { "send", zfs_do_send, HELP_SEND }, { "receive", zfs_do_receive, HELP_RECEIVE }, { NULL }, - { "allow", zfs_do_allow, HELP_ALLOW }, + { "allow", zfs_do_python, HELP_ALLOW }, { NULL }, - { "unallow", zfs_do_unallow, HELP_UNALLOW }, + { "unallow", zfs_do_python, HELP_UNALLOW }, }; #define NCOMMAND (sizeof (command_table) / sizeof (command_table[0])) @@ -184,8 +191,8 @@ get_usage(zfs_help_t idx) return (gettext("\tdestroy [-rRf] " "<filesystem|volume|snapshot>\n")); case HELP_GET: - return (gettext("\tget [-rHp] [-o field[,...]] " - "[-s source[,...]]\n" + return (gettext("\tget [-rHp] [-d max] " + "[-o field[,...]] [-s source[,...]]\n" "\t <\"all\" | property[,...]> " "[filesystem|volume|snapshot] ...\n")); case HELP_INHERIT: @@ -195,8 +202,8 @@ get_usage(zfs_help_t idx) return (gettext("\tupgrade [-v]\n" "\tupgrade [-r] [-V version] <-a | filesystem ...>\n")); case HELP_LIST: - return (gettext("\tlist [-rH] [-o property[,...]] " - "[-t type[,...]] [-s property] ...\n" + return (gettext("\tlist [-rH][-d max] " + "[-o property[,...]] [-t type[,...]] [-s property] ...\n" "\t [-S property] ... " "[filesystem|volume|snapshot] ...\n")); case HELP_MOUNT: @@ -232,7 +239,8 @@ get_usage(zfs_help_t idx) return (gettext("\tunshare [-f] " "<-a | filesystem|mountpoint>\n")); case HELP_ALLOW: - return (gettext("\tallow [-ldug] " + return (gettext("\tallow <filesystem|volume>\n" + "\tallow [-ldug] " "<\"everyone\"|user|group>[,...] <perm|@setname>[,...]\n" "\t <filesystem|volume>\n" "\tallow [-ld] -e <perm|@setname>[,...] " @@ -250,6 +258,14 @@ get_usage(zfs_help_t idx) "<filesystem|volume>\n" "\tunallow [-r] -s @setname [<perm|@setname>[,...]] " "<filesystem|volume>\n")); + case HELP_USERSPACE: + return (gettext("\tuserspace [-hniHp] [-o field[,...]] " + "[-sS field] ... [-t type[,...]]\n" + "\t <filesystem|snapshot>\n")); + case HELP_GROUPSPACE: + return (gettext("\tgroupspace [-hniHpU] [-o field[,...]] " + "[-sS field] ... [-t type[,...]]\n" + "\t <filesystem|snapshot>\n")); } abort(); @@ -311,7 +327,6 @@ usage(boolean_t requested) { int i; boolean_t show_properties = B_FALSE; - boolean_t show_permissions = B_FALSE; FILE *fp = requested ? stdout : stderr; if (current_command == NULL) { @@ -342,13 +357,7 @@ usage(boolean_t requested) strcmp(current_command->name, "list") == 0)) show_properties = B_TRUE; - if (current_command != NULL && - (strcmp(current_command->name, "allow") == 0 || - strcmp(current_command->name, "unallow") == 0)) - show_permissions = B_TRUE; - if (show_properties) { - (void) fprintf(fp, gettext("\nThe following properties are supported:\n")); @@ -359,16 +368,26 @@ usage(boolean_t requested) (void) zprop_iter(usage_prop_cb, fp, B_FALSE, B_TRUE, ZFS_TYPE_DATASET); + (void) fprintf(fp, "\t%-15s ", "userused@..."); + (void) fprintf(fp, " NO NO <size>\n"); + (void) fprintf(fp, "\t%-15s ", "groupused@..."); + (void) fprintf(fp, " NO NO <size>\n"); + (void) fprintf(fp, "\t%-15s ", "userquota@..."); + (void) fprintf(fp, "YES NO <size> | none\n"); + (void) fprintf(fp, "\t%-15s ", "groupquota@..."); + (void) fprintf(fp, "YES NO <size> | none\n"); + (void) fprintf(fp, gettext("\nSizes are specified in bytes " "with standard units such as K, M, G, etc.\n")); (void) fprintf(fp, gettext("\nUser-defined properties can " "be specified by using a name containing a colon (:).\n")); - - } else if (show_permissions) { - (void) fprintf(fp, - gettext("\nThe following permissions are supported:\n")); - - zfs_deleg_permissions(); + (void) fprintf(fp, gettext("\nThe {user|group}{used|quota}@ " + "properties must be appended with\n" + "a user or group specifier of one of these forms:\n" + " POSIX name (eg: \"matt\")\n" + " POSIX id (eg: \"126829\")\n" + " SMB name@domain (eg: \"matt@sun\")\n" + " SMB SID (eg: \"S-1-234-567-89\")\n")); } else { (void) fprintf(fp, gettext("\nFor the property list, run: %s\n"), @@ -415,6 +434,27 @@ parseprop(nvlist_t *props) return (0); } +static int +parse_depth(char *opt, int *flags) +{ + char *tmp; + int depth; + + depth = (int)strtol(opt, &tmp, 0); + if (*tmp) { + (void) fprintf(stderr, + gettext("%s is not an integer\n"), optarg); + usage(B_FALSE); + } + if (depth < 0) { + (void) fprintf(stderr, + gettext("Depth can not be negative.\n")); + usage(B_FALSE); + } + *flags |= (ZFS_ITER_DEPTH_LIMIT|ZFS_ITER_RECURSE); + return (depth); +} + /* * zfs clone [-p] [-o prop=value] ... <snap> <fs | vol> * @@ -1063,6 +1103,17 @@ get_callback(zfs_handle_t *zhp, void *data) zprop_print_one_property(zfs_get_name(zhp), cbp, zfs_prop_to_name(pl->pl_prop), buf, sourcetype, source); + } else if (zfs_prop_userquota(pl->pl_user_prop)) { + sourcetype = ZPROP_SRC_LOCAL; + + if (zfs_prop_get_userquota(zhp, pl->pl_user_prop, + buf, sizeof (buf), cbp->cb_literal) != 0) { + sourcetype = ZPROP_SRC_NONE; + (void) strlcpy(buf, "-", sizeof (buf)); + } + + zprop_print_one_property(zfs_get_name(zhp), cbp, + pl->pl_user_prop, buf, sourcetype, source); } else { if (nvlist_lookup_nvlist(userprop, pl->pl_user_prop, &propval) != 0) { @@ -1102,6 +1153,7 @@ zfs_do_get(int argc, char **argv) int i, c, flags = 0; char *value, *fields; int ret; + int limit = 0; zprop_list_t fake_name = { 0 }; /* @@ -1115,11 +1167,14 @@ zfs_do_get(int argc, char **argv) cb.cb_type = ZFS_TYPE_DATASET; /* check options */ - while ((c = getopt(argc, argv, ":o:s:rHp")) != -1) { + while ((c = getopt(argc, argv, ":d:o:s:rHp")) != -1) { switch (c) { case 'p': cb.cb_literal = B_TRUE; break; + case 'd': + limit = parse_depth(optarg, &flags); + break; case 'r': flags |= ZFS_ITER_RECURSE; break; @@ -1250,7 +1305,7 @@ zfs_do_get(int argc, char **argv) /* run for each object */ ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET, NULL, - &cb.cb_proplist, get_callback, &cb); + &cb.cb_proplist, limit, get_callback, &cb); if (cb.cb_proplist == &fake_name) zprop_free_list(fake_name.pl_next); @@ -1363,10 +1418,10 @@ zfs_do_inherit(int argc, char **argv) if (flags & ZFS_ITER_RECURSE) { ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET, - NULL, NULL, inherit_recurse_cb, propname); + NULL, NULL, 0, inherit_recurse_cb, propname); } else { ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET, - NULL, NULL, inherit_cb, propname); + NULL, NULL, 0, inherit_cb, propname); } return (ret); @@ -1435,21 +1490,30 @@ upgrade_set_callback(zfs_handle_t *zhp, void *data) { upgrade_cbdata_t *cb = data; int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); - - if (cb->cb_version >= ZPL_VERSION_FUID) { - int spa_version; - - if (zfs_spa_version(zhp, &spa_version) < 0) - return (-1); - - if (spa_version < SPA_VERSION_FUID) { - /* can't upgrade */ - (void) printf(gettext("%s: can not be upgraded; " - "the pool version needs to first be upgraded\nto " - "version %d\n\n"), - zfs_get_name(zhp), SPA_VERSION_FUID); - cb->cb_numfailed++; - return (0); + int i; + static struct { int zplver; int spaver; } table[] = { + {ZPL_VERSION_FUID, SPA_VERSION_FUID}, + {ZPL_VERSION_USERSPACE, SPA_VERSION_USERSPACE}, + {0, 0} + }; + + + for (i = 0; table[i].zplver; i++) { + if (cb->cb_version >= table[i].zplver) { + int spa_version; + + if (zfs_spa_version(zhp, &spa_version) < 0) + return (-1); + + if (spa_version < table[i].spaver) { + /* can't upgrade */ + (void) printf(gettext("%s: can not be " + "upgraded; the pool version needs to first " + "be upgraded\nto version %d\n\n"), + zfs_get_name(zhp), table[i].spaver); + cb->cb_numfailed++; + return (0); + } } } @@ -1550,6 +1614,8 @@ zfs_do_upgrade(int argc, char **argv) (void) printf(gettext(" 2 Enhanced directory entries\n")); (void) printf(gettext(" 3 Case insensitive and File system " "unique identifer (FUID)\n")); + (void) printf(gettext(" 4 userquota, groupquota " + "properties\n")); (void) printf(gettext("\nFor more information on a particular " "version, including supported releases, see:\n\n")); (void) printf("http://www.opensolaris.org/os/community/zfs/" @@ -1561,7 +1627,7 @@ zfs_do_upgrade(int argc, char **argv) if (cb.cb_version == 0) cb.cb_version = ZPL_VERSION; ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_FILESYSTEM, - NULL, NULL, upgrade_set_callback, &cb); + NULL, NULL, 0, upgrade_set_callback, &cb); (void) printf(gettext("%llu filesystems upgraded\n"), cb.cb_numupgraded); if (cb.cb_numsamegraded) { @@ -1579,14 +1645,14 @@ zfs_do_upgrade(int argc, char **argv) flags |= ZFS_ITER_RECURSE; ret = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM, - NULL, NULL, upgrade_list_callback, &cb); + NULL, NULL, 0, upgrade_list_callback, &cb); found = cb.cb_foundone; cb.cb_foundone = B_FALSE; cb.cb_newer = B_TRUE; ret = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM, - NULL, NULL, upgrade_list_callback, &cb); + NULL, NULL, 0, upgrade_list_callback, &cb); if (!cb.cb_foundone && !found) { (void) printf(gettext("All filesystems are " @@ -1598,11 +1664,90 @@ zfs_do_upgrade(int argc, char **argv) } /* - * list [-rH] [-o property[,property]...] [-t type[,type]...] + * zfs userspace + */ +static int +userspace_cb(void *arg, const char *domain, uid_t rid, uint64_t space) +{ + zfs_userquota_prop_t *typep = arg; + zfs_userquota_prop_t p = *typep; + char *name = NULL; + char *ug, *propname; + char namebuf[32]; + char sizebuf[32]; + + if (domain == NULL || domain[0] == '\0') { + if (p == ZFS_PROP_GROUPUSED || p == ZFS_PROP_GROUPQUOTA) { + struct group *g = getgrgid(rid); + if (g) + name = g->gr_name; + } else { + struct passwd *p = getpwuid(rid); + if (p) + name = p->pw_name; + } + } + + if (p == ZFS_PROP_GROUPUSED || p == ZFS_PROP_GROUPQUOTA) + ug = "group"; + else + ug = "user"; + + if (p == ZFS_PROP_USERUSED || p == ZFS_PROP_GROUPUSED) + propname = "used"; + else + propname = "quota"; + + if (name == NULL) { + (void) snprintf(namebuf, sizeof (namebuf), + "%llu", (longlong_t)rid); + name = namebuf; + } + zfs_nicenum(space, sizebuf, sizeof (sizebuf)); + + (void) printf("%s %s %s%c%s %s\n", propname, ug, domain, + domain[0] ? '-' : ' ', name, sizebuf); + + return (0); +} + +static int +zfs_do_userspace(int argc, char **argv) +{ + zfs_handle_t *zhp; + zfs_userquota_prop_t p; + int error; + + /* + * Try the python version. If the execv fails, we'll continue + * and do a simplistic implementation. + */ + (void) execv(pypath, argv-1); + + (void) printf("internal error: %s not found\n" + "falling back on built-in implementation, " + "some features will not work\n", pypath); + + if ((zhp = zfs_open(g_zfs, argv[argc-1], ZFS_TYPE_DATASET)) == NULL) + return (1); + + (void) printf("PROP TYPE NAME VALUE\n"); + + for (p = 0; p < ZFS_NUM_USERQUOTA_PROPS; p++) { + error = zfs_userspace(zhp, p, userspace_cb, &p); + if (error) + break; + } + return (error); +} + +/* + * list [-r][-d max] [-H] [-o property[,property]...] [-t type[,type]...] * [-s property [-s property]...] [-S property [-S property]...] * <dataset> ... * * -r Recurse over all children + * -d Limit recursion by depth. * -H Scripted mode; elide headers and separate columns by tabs * -o Control which fields to display. * -t Control which object types to display. @@ -1685,7 +1830,6 @@ print_dataset(zfs_handle_t *zhp, zprop_list_t *pl, boolean_t scripted) first = B_FALSE; } - right_justify = B_FALSE; if (pl->pl_prop != ZPROP_INVAL) { if (zfs_prop_get(zhp, pl->pl_prop, property, sizeof (property), NULL, NULL, 0, B_FALSE) != 0) @@ -1694,6 +1838,13 @@ print_dataset(zfs_handle_t *zhp, zprop_list_t *pl, boolean_t scripted) propstr = property; right_justify = zfs_prop_align_right(pl->pl_prop); + } else if (zfs_prop_userquota(pl->pl_user_prop)) { + if (zfs_prop_get_userquota(zhp, pl->pl_user_prop, + property, sizeof (property), B_FALSE) != 0) + propstr = "-"; + else + propstr = property; + right_justify = B_TRUE; } else { if (nvlist_lookup_nvlist(userprops, pl->pl_user_prop, &propval) != 0) @@ -1701,6 +1852,7 @@ print_dataset(zfs_handle_t *zhp, zprop_list_t *pl, boolean_t scripted) else verify(nvlist_lookup_string(propval, ZPROP_VALUE, &propstr) == 0); + right_justify = B_FALSE; } width = pl->pl_width; @@ -1752,16 +1904,20 @@ zfs_do_list(int argc, char **argv) char *fields = NULL; list_cbdata_t cb = { 0 }; char *value; + int limit = 0; int ret; zfs_sort_column_t *sortcol = NULL; int flags = ZFS_ITER_PROP_LISTSNAPS | ZFS_ITER_ARGS_CAN_BE_PATHS; /* check options */ - while ((c = getopt(argc, argv, ":o:rt:Hs:S:")) != -1) { + while ((c = getopt(argc, argv, ":d:o:rt:Hs:S:")) != -1) { switch (c) { case 'o': fields = optarg; break; + case 'd': + limit = parse_depth(optarg, &flags); + break; case 'r': flags |= ZFS_ITER_RECURSE; break; @@ -1852,7 +2008,7 @@ zfs_do_list(int argc, char **argv) cb.cb_first = B_TRUE; ret = zfs_for_each(argc, argv, flags, types, sortcol, &cb.cb_proplist, - list_callback, &cb); + limit, list_callback, &cb); zprop_free_list(cb.cb_proplist); zfs_free_sort_columns(sortcol); @@ -2235,7 +2391,7 @@ zfs_do_set(int argc, char **argv) } ret = zfs_for_each(argc - 2, argv + 2, NULL, - ZFS_TYPE_DATASET, NULL, NULL, set_callback, &cb); + ZFS_TYPE_DATASET, NULL, NULL, 0, set_callback, &cb); return (ret); } @@ -2495,390 +2651,6 @@ zfs_do_receive(int argc, char **argv) return (err != 0); } -typedef struct allow_cb { - int a_permcnt; - size_t a_treeoffset; -} allow_cb_t; - -static void -zfs_print_perms(avl_tree_t *tree) -{ - zfs_perm_node_t *permnode; - - permnode = avl_first(tree); - while (permnode != NULL) { - (void) printf("%s", permnode->z_pname); - permnode = AVL_NEXT(tree, permnode); - if (permnode) - (void) printf(","); - else - (void) printf("\n"); - } -} - -/* - * Iterate over user/groups/everyone/... and the call perm_iter - * function to print actual permission when tree has >0 nodes. - */ -static void -zfs_iter_perms(avl_tree_t *tree, const char *banner, allow_cb_t *cb) -{ - zfs_allow_node_t *item; - avl_tree_t *ptree; - - item = avl_first(tree); - while (item) { - ptree = (void *)((char *)item + cb->a_treeoffset); - if (avl_numnodes(ptree)) { - if (cb->a_permcnt++ == 0) - (void) printf("%s\n", banner); - (void) printf("\t%s", item->z_key); - /* - * Avoid an extra space being printed - * for "everyone" which is keyed with a null - * string - */ - if (item->z_key[0] != '\0') - (void) printf(" "); - zfs_print_perms(ptree); - } - item = AVL_NEXT(tree, item); - } -} - -#define LINES "-------------------------------------------------------------\n" -static int -zfs_print_allows(char *ds) -{ - zfs_allow_t *curperms, *perms; - zfs_handle_t *zhp; - allow_cb_t allowcb = { 0 }; - char banner[MAXPATHLEN]; - - if (ds[0] == '-') - usage(B_FALSE); - - if (strrchr(ds, '@')) { - (void) fprintf(stderr, gettext("Snapshots don't have 'allow'" - " permissions\n")); - return (1); - } - if ((zhp = zfs_open(g_zfs, ds, ZFS_TYPE_DATASET)) == NULL) - return (1); - - if (zfs_perm_get(zhp, &perms)) { - (void) fprintf(stderr, - gettext("Failed to retrieve 'allows' on %s\n"), ds); - zfs_close(zhp); - return (1); - } - - zfs_close(zhp); - - if (perms != NULL) - (void) printf("%s", LINES); - for (curperms = perms; curperms; curperms = curperms->z_next) { - - (void) snprintf(banner, sizeof (banner), - gettext("Permission sets on (%s)"), curperms->z_setpoint); - allowcb.a_treeoffset = - offsetof(zfs_allow_node_t, z_localdescend); - allowcb.a_permcnt = 0; - zfs_iter_perms(&curperms->z_sets, banner, &allowcb); - - (void) snprintf(banner, sizeof (banner), - gettext("Create time permissions on (%s)"), - curperms->z_setpoint); - allowcb.a_treeoffset = - offsetof(zfs_allow_node_t, z_localdescend); - allowcb.a_permcnt = 0; - zfs_iter_perms(&curperms->z_crperms, banner, &allowcb); - - - (void) snprintf(banner, sizeof (banner), - gettext("Local permissions on (%s)"), curperms->z_setpoint); - allowcb.a_treeoffset = offsetof(zfs_allow_node_t, z_local); - allowcb.a_permcnt = 0; - zfs_iter_perms(&curperms->z_user, banner, &allowcb); - zfs_iter_perms(&curperms->z_group, banner, &allowcb); - zfs_iter_perms(&curperms->z_everyone, banner, &allowcb); - - (void) snprintf(banner, sizeof (banner), - gettext("Descendent permissions on (%s)"), - curperms->z_setpoint); - allowcb.a_treeoffset = offsetof(zfs_allow_node_t, z_descend); - allowcb.a_permcnt = 0; - zfs_iter_perms(&curperms->z_user, banner, &allowcb); - zfs_iter_perms(&curperms->z_group, banner, &allowcb); - zfs_iter_perms(&curperms->z_everyone, banner, &allowcb); - - (void) snprintf(banner, sizeof (banner), - gettext("Local+Descendent permissions on (%s)"), - curperms->z_setpoint); - allowcb.a_treeoffset = - offsetof(zfs_allow_node_t, z_localdescend); - allowcb.a_permcnt = 0; - zfs_iter_perms(&curperms->z_user, banner, &allowcb); - zfs_iter_perms(&curperms->z_group, banner, &allowcb); - zfs_iter_perms(&curperms->z_everyone, banner, &allowcb); - - (void) printf("%s", LINES); - } - zfs_free_allows(perms); - return (0); -} - -#define ALLOWOPTIONS "ldcsu:g:e" -#define UNALLOWOPTIONS "ldcsu:g:er" - -/* - * Validate options, and build necessary datastructure to display/remove/add - * permissions. - * Returns 0 - If permissions should be added/removed - * Returns 1 - If permissions should be displayed. - * Returns -1 - on failure - */ -int -parse_allow_args(int *argc, char **argv[], boolean_t unallow, - char **ds, int *recurse, nvlist_t **zperms) -{ - int c; - char *options = unallow ? UNALLOWOPTIONS : ALLOWOPTIONS; - zfs_deleg_inherit_t deleg_type = ZFS_DELEG_NONE; - zfs_deleg_who_type_t who_type = ZFS_DELEG_WHO_UNKNOWN; - char *who = NULL; - char *perms = NULL; - zfs_handle_t *zhp; - - while ((c = getopt(*argc, *argv, options)) != -1) { - switch (c) { - case 'l': - if (who_type == ZFS_DELEG_CREATE || - who_type == ZFS_DELEG_NAMED_SET) - usage(B_FALSE); - - deleg_type |= ZFS_DELEG_PERM_LOCAL; - break; - case 'd': - if (who_type == ZFS_DELEG_CREATE || - who_type == ZFS_DELEG_NAMED_SET) - usage(B_FALSE); - - deleg_type |= ZFS_DELEG_PERM_DESCENDENT; - break; - case 'r': - *recurse = B_TRUE; - break; - case 'c': - if (who_type != ZFS_DELEG_WHO_UNKNOWN) - usage(B_FALSE); - if (deleg_type) - usage(B_FALSE); - who_type = ZFS_DELEG_CREATE; - break; - case 's': - if (who_type != ZFS_DELEG_WHO_UNKNOWN) - usage(B_FALSE); - if (deleg_type) - usage(B_FALSE); - who_type = ZFS_DELEG_NAMED_SET; - break; - case 'u': - if (who_type != ZFS_DELEG_WHO_UNKNOWN) - usage(B_FALSE); - who_type = ZFS_DELEG_USER; - who = optarg; - break; - case 'g': - if (who_type != ZFS_DELEG_WHO_UNKNOWN) - usage(B_FALSE); - who_type = ZFS_DELEG_GROUP; - who = optarg; - break; - case 'e': - if (who_type != ZFS_DELEG_WHO_UNKNOWN) - usage(B_FALSE); - who_type = ZFS_DELEG_EVERYONE; - break; - default: - usage(B_FALSE); - break; - } - } - - if (deleg_type == 0) - deleg_type = ZFS_DELEG_PERM_LOCALDESCENDENT; - - *argc -= optind; - *argv += optind; - - if (unallow == B_FALSE && *argc == 1) { - /* - * Only print permissions if no options were processed - */ - if (optind == 1) - return (1); - else - usage(B_FALSE); - } - - /* - * initialize variables for zfs_build_perms based on number - * of arguments. - * 3 arguments ==> zfs [un]allow joe perm,perm,perm <dataset> or - * zfs [un]allow -s @set1 perm,perm <dataset> - * 2 arguments ==> zfs [un]allow -c perm,perm <dataset> or - * zfs [un]allow -u|-g <name> perm <dataset> or - * zfs [un]allow -e perm,perm <dataset> - * zfs unallow joe <dataset> - * zfs unallow -s @set1 <dataset> - * 1 argument ==> zfs [un]allow -e <dataset> or - * zfs [un]allow -c <dataset> - */ - - switch (*argc) { - case 3: - perms = (*argv)[1]; - who = (*argv)[0]; - *ds = (*argv)[2]; - - /* - * advance argc/argv for do_allow cases. - * for do_allow case make sure who have a know who type - * and its not a permission set. - */ - if (unallow == B_TRUE) { - *argc -= 2; - *argv += 2; - } else if (who_type != ZFS_DELEG_WHO_UNKNOWN && - who_type != ZFS_DELEG_NAMED_SET) - usage(B_FALSE); - break; - - case 2: - if (unallow == B_TRUE && (who_type == ZFS_DELEG_EVERYONE || - who_type == ZFS_DELEG_CREATE || who != NULL)) { - perms = (*argv)[0]; - *ds = (*argv)[1]; - } else { - if (unallow == B_FALSE && - (who_type == ZFS_DELEG_WHO_UNKNOWN || - who_type == ZFS_DELEG_NAMED_SET)) - usage(B_FALSE); - else if (who_type == ZFS_DELEG_WHO_UNKNOWN || - who_type == ZFS_DELEG_NAMED_SET) - who = (*argv)[0]; - else if (who_type != ZFS_DELEG_NAMED_SET) - perms = (*argv)[0]; - *ds = (*argv)[1]; - } - if (unallow == B_TRUE) { - (*argc)--; - (*argv)++; - } - break; - - case 1: - if (unallow == B_FALSE) - usage(B_FALSE); - if (who == NULL && who_type != ZFS_DELEG_CREATE && - who_type != ZFS_DELEG_EVERYONE) - usage(B_FALSE); - *ds = (*argv)[0]; - break; - - default: - usage(B_FALSE); - } - - if (strrchr(*ds, '@')) { - (void) fprintf(stderr, - gettext("Can't set or remove 'allow' permissions " - "on snapshots.\n")); - return (-1); - } - - if ((zhp = zfs_open(g_zfs, *ds, ZFS_TYPE_DATASET)) == NULL) - return (-1); - - if ((zfs_build_perms(zhp, who, perms, - who_type, deleg_type, zperms)) != 0) { - zfs_close(zhp); - return (-1); - } - zfs_close(zhp); - return (0); -} - -static int -zfs_do_allow(int argc, char **argv) -{ - char *ds; - nvlist_t *zperms = NULL; - zfs_handle_t *zhp; - int unused; - int ret; - - if ((ret = parse_allow_args(&argc, &argv, B_FALSE, &ds, - &unused, &zperms)) == -1) - return (1); - - if (ret == 1) - return (zfs_print_allows(argv[0])); - - if ((zhp = zfs_open(g_zfs, ds, ZFS_TYPE_DATASET)) == NULL) - return (1); - - if (zfs_perm_set(zhp, zperms)) { - zfs_close(zhp); - nvlist_free(zperms); - return (1); - } - nvlist_free(zperms); - zfs_close(zhp); - - return (0); -} - -static int -unallow_callback(zfs_handle_t *zhp, void *data) -{ - nvlist_t *nvp = (nvlist_t *)data; - int error; - - error = zfs_perm_remove(zhp, nvp); - if (error) { - (void) fprintf(stderr, gettext("Failed to remove permissions " - "on %s\n"), zfs_get_name(zhp)); - } - return (error); -} - -static int -zfs_do_unallow(int argc, char **argv) -{ - int recurse = B_FALSE; - char *ds; - int error; - nvlist_t *zperms = NULL; - int flags = 0; - - if (parse_allow_args(&argc, &argv, B_TRUE, - &ds, &recurse, &zperms) == -1) - return (1); - - if (recurse) - flags |= ZFS_ITER_RECURSE; - error = zfs_for_each(argc, argv, flags, - ZFS_TYPE_FILESYSTEM|ZFS_TYPE_VOLUME, NULL, - NULL, unallow_callback, (void *)zperms); - - if (zperms) - nvlist_free(zperms); - - return (error); -} - typedef struct get_all_cbdata { zfs_handle_t **cb_handles; size_t cb_alloc; @@ -3944,6 +3716,15 @@ zfs_do_unshare(int argc, char **argv) return (unshare_unmount(OP_SHARE, argc, argv)); } +/* ARGSUSED */ +static int +zfs_do_python(int argc, char **argv) +{ + (void) execv(pypath, argv-1); + (void) printf("internal error: %s not found\n", pypath); + return (-1); +} + /* * Called when invoked as /etc/fs/zfs/mount. Do the mount if the mountpoint is * 'legacy'. Otherwise, complain that use should be using 'zfs mount'. @@ -4197,6 +3978,7 @@ main(int argc, char **argv) /* * Run the appropriate command. */ + libzfs_mnttab_cache(g_zfs, B_TRUE); if (find_command_idx(cmdname, &i) == 0) { current_command = &command_table[i]; ret = command_table[i].func(argc - 1, argv + 1); @@ -4209,6 +3991,7 @@ main(int argc, char **argv) "command '%s'\n"), cmdname); usage(B_FALSE); } + libzfs_mnttab_cache(g_zfs, B_FALSE); } (void) fclose(mnttab_file); diff --git a/cmd/zinject/zinject.c b/cmd/zinject/zinject.c index 02dc18b9c..09c377ef8 100644 --- a/cmd/zinject/zinject.c +++ b/cmd/zinject/zinject.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * ZFS Fault Injector * @@ -224,7 +222,7 @@ usage(void) "\t\tClear the particular record (if given a numeric ID), or\n" "\t\tall records if 'all' is specificed.\n" "\n" - "\tzinject -d device [-e errno] [-L <nvlist|uber>] pool\n" + "\tzinject -d device [-e errno] [-L <nvlist|uber>] [-F] pool\n" "\t\tInject a fault into a particular device or the device's\n" "\t\tlabel. Label injection can either be 'nvlist' or 'uber'.\n" "\t\t'errno' can either be 'nxio' (the default) or 'io'.\n" @@ -516,7 +514,7 @@ main(int argc, char **argv) return (0); } - while ((c = getopt(argc, argv, ":ab:d:f:qhc:t:l:mr:e:uL:")) != -1) { + while ((c = getopt(argc, argv, ":ab:d:f:Fqhc:t:l:mr:e:uL:")) != -1) { switch (c) { case 'a': flags |= ZINJECT_FLUSH_ARC; @@ -553,6 +551,9 @@ main(int argc, char **argv) return (1); } break; + case 'F': + record.zi_failfast = B_TRUE; + break; case 'h': usage(); return (0); diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 42c6aabb6..c8a33df4d 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -376,12 +376,11 @@ add_prop_list(const char *propname, char *propval, nvlist_t **props, } normnm = zpool_prop_to_name(prop); } else { - if ((fprop = zfs_name_to_prop(propname)) == ZPROP_INVAL) { - (void) fprintf(stderr, gettext("property '%s' is " - "not a valid file system property\n"), propname); - return (2); + if ((fprop = zfs_name_to_prop(propname)) != ZPROP_INVAL) { + normnm = zfs_prop_to_name(fprop); + } else { + normnm = propname; } - normnm = zfs_prop_to_name(fprop); } if (nvlist_lookup_string(proplist, normnm, &strval) == 0 && @@ -979,14 +978,189 @@ max_width(zpool_handle_t *zhp, nvlist_t *nv, int depth, int max) return (max); } +typedef struct spare_cbdata { + uint64_t cb_guid; + zpool_handle_t *cb_zhp; +} spare_cbdata_t; + +static boolean_t +find_vdev(nvlist_t *nv, uint64_t search) +{ + uint64_t guid; + nvlist_t **child; + uint_t c, children; + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 && + search == guid) + return (B_TRUE); + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0) { + for (c = 0; c < children; c++) + if (find_vdev(child[c], search)) + return (B_TRUE); + } + + return (B_FALSE); +} + +static int +find_spare(zpool_handle_t *zhp, void *data) +{ + spare_cbdata_t *cbp = data; + nvlist_t *config, *nvroot; + + config = zpool_get_config(zhp, NULL); + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + + if (find_vdev(nvroot, cbp->cb_guid)) { + cbp->cb_zhp = zhp; + return (1); + } + + zpool_close(zhp); + return (0); +} + +/* + * Print out configuration state as requested by status_callback. + */ +void +print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv, + int namewidth, int depth, boolean_t isspare) +{ + nvlist_t **child; + uint_t c, children; + vdev_stat_t *vs; + char rbuf[6], wbuf[6], cbuf[6], repaired[7]; + char *vname; + uint64_t notpresent; + spare_cbdata_t cb; + char *state; + + verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS, + (uint64_t **)&vs, &c) == 0); + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + children = 0; + + state = zpool_state_to_name(vs->vs_state, vs->vs_aux); + if (isspare) { + /* + * For hot spares, we use the terms 'INUSE' and 'AVAILABLE' for + * online drives. + */ + if (vs->vs_aux == VDEV_AUX_SPARED) + state = "INUSE"; + else if (vs->vs_state == VDEV_STATE_HEALTHY) + state = "AVAIL"; + } + + (void) printf("\t%*s%-*s %-8s", depth, "", namewidth - depth, + name, state); + + if (!isspare) { + zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf)); + zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf)); + zfs_nicenum(vs->vs_checksum_errors, cbuf, sizeof (cbuf)); + (void) printf(" %5s %5s %5s", rbuf, wbuf, cbuf); + } + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, + ¬present) == 0) { + char *path; + verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); + (void) printf(" was %s", path); + } else if (vs->vs_aux != 0) { + (void) printf(" "); + + switch (vs->vs_aux) { + case VDEV_AUX_OPEN_FAILED: + (void) printf(gettext("cannot open")); + break; + + case VDEV_AUX_BAD_GUID_SUM: + (void) printf(gettext("missing device")); + break; + + case VDEV_AUX_NO_REPLICAS: + (void) printf(gettext("insufficient replicas")); + break; + + case VDEV_AUX_VERSION_NEWER: + (void) printf(gettext("newer version")); + break; + + case VDEV_AUX_SPARED: + verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, + &cb.cb_guid) == 0); + if (zpool_iter(g_zfs, find_spare, &cb) == 1) { + if (strcmp(zpool_get_name(cb.cb_zhp), + zpool_get_name(zhp)) == 0) + (void) printf(gettext("currently in " + "use")); + else + (void) printf(gettext("in use by " + "pool '%s'"), + zpool_get_name(cb.cb_zhp)); + zpool_close(cb.cb_zhp); + } else { + (void) printf(gettext("currently in use")); + } + break; + + case VDEV_AUX_ERR_EXCEEDED: + (void) printf(gettext("too many errors")); + break; + + case VDEV_AUX_IO_FAILURE: + (void) printf(gettext("experienced I/O failures")); + break; + + case VDEV_AUX_BAD_LOG: + (void) printf(gettext("bad intent log")); + break; + + default: + (void) printf(gettext("corrupted data")); + break; + } + } else if (vs->vs_scrub_repaired != 0 && children == 0) { + /* + * Report bytes resilvered/repaired on leaf devices. + */ + zfs_nicenum(vs->vs_scrub_repaired, repaired, sizeof (repaired)); + (void) printf(gettext(" %s %s"), repaired, + (vs->vs_scrub_type == POOL_SCRUB_RESILVER) ? + "resilvered" : "repaired"); + } + + (void) printf("\n"); + + for (c = 0; c < children; c++) { + uint64_t is_log = B_FALSE; + + /* Don't print logs here */ + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, + &is_log); + if (is_log) + continue; + vname = zpool_vdev_name(g_zfs, zhp, child[c]); + print_status_config(zhp, vname, child[c], + namewidth, depth + 2, isspare); + free(vname); + } +} + /* * Print the configuration of an exported pool. Iterate over all vdevs in the * pool, printing out the name and status for each one. */ void -print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth, - boolean_t print_logs) +print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth) { nvlist_t **child; uint_t c, children; @@ -1043,12 +1217,11 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth, (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); - if ((is_log && !print_logs) || (!is_log && print_logs)) + if (is_log) continue; vname = zpool_vdev_name(g_zfs, NULL, child[c]); - print_import_config(vname, child[c], - namewidth, depth + 2, B_FALSE); + print_import_config(vname, child[c], namewidth, depth + 2); free(vname); } @@ -1074,6 +1247,43 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth, } /* + * Print log vdevs. + * Logs are recorded as top level vdevs in the main pool child array + * but with "is_log" set to 1. We use either print_status_config() or + * print_import_config() to print the top level logs then any log + * children (eg mirrored slogs) are printed recursively - which + * works because only the top level vdev is marked "is_log" + */ +static void +print_logs(zpool_handle_t *zhp, nvlist_t *nv, int namewidth, boolean_t verbose) +{ + uint_t c, children; + nvlist_t **child; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, + &children) != 0) + return; + + (void) printf(gettext("\tlogs\n")); + + for (c = 0; c < children; c++) { + uint64_t is_log = B_FALSE; + char *name; + + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, + &is_log); + if (!is_log) + continue; + name = zpool_vdev_name(g_zfs, zhp, child[c]); + if (verbose) + print_status_config(zhp, name, child[c], namewidth, + 2, B_FALSE); + else + print_import_config(name, child[c], namewidth, 2); + free(name); + } +} +/* * Display the status for the given pool. */ static void @@ -1241,11 +1451,9 @@ show_import(nvlist_t *config) if (namewidth < 10) namewidth = 10; - print_import_config(name, nvroot, namewidth, 0, B_FALSE); - if (num_logs(nvroot) > 0) { - (void) printf(gettext("\tlogs\n")); - print_import_config(name, nvroot, namewidth, 0, B_TRUE); - } + print_import_config(name, nvroot, namewidth, 0); + if (num_logs(nvroot) > 0) + print_logs(NULL, nvroot, namewidth, B_FALSE); if (reason == ZPOOL_STATUS_BAD_GUID_SUM) { (void) printf(gettext("\n\tAdditional devices are known to " @@ -2427,10 +2635,14 @@ zpool_do_online(int argc, char **argv) zpool_handle_t *zhp; int ret = 0; vdev_state_t newstate; + int flags = 0; /* check options */ - while ((c = getopt(argc, argv, "t")) != -1) { + while ((c = getopt(argc, argv, "et")) != -1) { switch (c) { + case 'e': + flags |= ZFS_ONLINE_EXPAND; + break; case 't': case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), @@ -2458,7 +2670,7 @@ zpool_do_online(int argc, char **argv) return (1); for (i = 1; i < argc; i++) { - if (zpool_vdev_online(zhp, argv[i], 0, &newstate) == 0) { + if (zpool_vdev_online(zhp, argv[i], flags, &newstate) == 0) { if (newstate != VDEV_STATE_HEALTHY) { (void) printf(gettext("warning: device '%s' " "onlined, but remains in faulted state\n"), @@ -2715,181 +2927,6 @@ print_scrub_status(nvlist_t *nvroot) (u_longlong_t)(minutes_left / 60), (uint_t)(minutes_left % 60)); } -typedef struct spare_cbdata { - uint64_t cb_guid; - zpool_handle_t *cb_zhp; -} spare_cbdata_t; - -static boolean_t -find_vdev(nvlist_t *nv, uint64_t search) -{ - uint64_t guid; - nvlist_t **child; - uint_t c, children; - - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 && - search == guid) - return (B_TRUE); - - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - &child, &children) == 0) { - for (c = 0; c < children; c++) - if (find_vdev(child[c], search)) - return (B_TRUE); - } - - return (B_FALSE); -} - -static int -find_spare(zpool_handle_t *zhp, void *data) -{ - spare_cbdata_t *cbp = data; - nvlist_t *config, *nvroot; - - config = zpool_get_config(zhp, NULL); - verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0); - - if (find_vdev(nvroot, cbp->cb_guid)) { - cbp->cb_zhp = zhp; - return (1); - } - - zpool_close(zhp); - return (0); -} - -/* - * Print out configuration state as requested by status_callback. - */ -void -print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv, - int namewidth, int depth, boolean_t isspare, boolean_t print_logs) -{ - nvlist_t **child; - uint_t c, children; - vdev_stat_t *vs; - char rbuf[6], wbuf[6], cbuf[6], repaired[7]; - char *vname; - uint64_t notpresent; - spare_cbdata_t cb; - char *state; - - verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS, - (uint64_t **)&vs, &c) == 0); - - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - &child, &children) != 0) - children = 0; - - state = zpool_state_to_name(vs->vs_state, vs->vs_aux); - if (isspare) { - /* - * For hot spares, we use the terms 'INUSE' and 'AVAILABLE' for - * online drives. - */ - if (vs->vs_aux == VDEV_AUX_SPARED) - state = "INUSE"; - else if (vs->vs_state == VDEV_STATE_HEALTHY) - state = "AVAIL"; - } - - (void) printf("\t%*s%-*s %-8s", depth, "", namewidth - depth, - name, state); - - if (!isspare) { - zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf)); - zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf)); - zfs_nicenum(vs->vs_checksum_errors, cbuf, sizeof (cbuf)); - (void) printf(" %5s %5s %5s", rbuf, wbuf, cbuf); - } - - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, - ¬present) == 0) { - char *path; - verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); - (void) printf(" was %s", path); - } else if (vs->vs_aux != 0) { - (void) printf(" "); - - switch (vs->vs_aux) { - case VDEV_AUX_OPEN_FAILED: - (void) printf(gettext("cannot open")); - break; - - case VDEV_AUX_BAD_GUID_SUM: - (void) printf(gettext("missing device")); - break; - - case VDEV_AUX_NO_REPLICAS: - (void) printf(gettext("insufficient replicas")); - break; - - case VDEV_AUX_VERSION_NEWER: - (void) printf(gettext("newer version")); - break; - - case VDEV_AUX_SPARED: - verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, - &cb.cb_guid) == 0); - if (zpool_iter(g_zfs, find_spare, &cb) == 1) { - if (strcmp(zpool_get_name(cb.cb_zhp), - zpool_get_name(zhp)) == 0) - (void) printf(gettext("currently in " - "use")); - else - (void) printf(gettext("in use by " - "pool '%s'"), - zpool_get_name(cb.cb_zhp)); - zpool_close(cb.cb_zhp); - } else { - (void) printf(gettext("currently in use")); - } - break; - - case VDEV_AUX_ERR_EXCEEDED: - (void) printf(gettext("too many errors")); - break; - - case VDEV_AUX_IO_FAILURE: - (void) printf(gettext("experienced I/O failures")); - break; - - case VDEV_AUX_BAD_LOG: - (void) printf(gettext("bad intent log")); - break; - - default: - (void) printf(gettext("corrupted data")); - break; - } - } else if (vs->vs_scrub_repaired != 0 && children == 0) { - /* - * Report bytes resilvered/repaired on leaf devices. - */ - zfs_nicenum(vs->vs_scrub_repaired, repaired, sizeof (repaired)); - (void) printf(gettext(" %s %s"), repaired, - (vs->vs_scrub_type == POOL_SCRUB_RESILVER) ? - "resilvered" : "repaired"); - } - - (void) printf("\n"); - - for (c = 0; c < children; c++) { - uint64_t is_log = B_FALSE; - - (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, - &is_log); - if ((is_log && !print_logs) || (!is_log && print_logs)) - continue; - vname = zpool_vdev_name(g_zfs, zhp, child[c]); - print_status_config(zhp, vname, child[c], - namewidth, depth + 2, isspare, B_FALSE); - free(vname); - } -} - static void print_error_log(zpool_handle_t *zhp) { @@ -2940,7 +2977,7 @@ print_spares(zpool_handle_t *zhp, nvlist_t **spares, uint_t nspares, for (i = 0; i < nspares; i++) { name = zpool_vdev_name(g_zfs, zhp, spares[i]); print_status_config(zhp, name, spares[i], - namewidth, 2, B_TRUE, B_FALSE); + namewidth, 2, B_TRUE); free(name); } } @@ -2960,7 +2997,7 @@ print_l2cache(zpool_handle_t *zhp, nvlist_t **l2cache, uint_t nl2cache, for (i = 0; i < nl2cache; i++) { name = zpool_vdev_name(g_zfs, zhp, l2cache[i]); print_status_config(zhp, name, l2cache[i], - namewidth, 2, B_FALSE, B_FALSE); + namewidth, 2, B_FALSE); free(name); } } @@ -3190,11 +3227,10 @@ status_callback(zpool_handle_t *zhp, void *data) (void) printf(gettext("\t%-*s %-8s %5s %5s %5s\n"), namewidth, "NAME", "STATE", "READ", "WRITE", "CKSUM"); print_status_config(zhp, zpool_get_name(zhp), nvroot, - namewidth, 0, B_FALSE, B_FALSE); - if (num_logs(nvroot) > 0) - print_status_config(zhp, "logs", nvroot, namewidth, 0, - B_FALSE, B_TRUE); + namewidth, 0, B_FALSE); + if (num_logs(nvroot) > 0) + print_logs(zhp, nvroot, namewidth, B_TRUE); if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) print_l2cache(zhp, l2cache, nl2cache, namewidth); @@ -3418,7 +3454,7 @@ zpool_do_upgrade(int argc, char **argv) /* check options */ - while ((c = getopt(argc, argv, "avV:")) != -1) { + while ((c = getopt(argc, argv, ":avV:")) != -1) { switch (c) { case 'a': cb.cb_all = B_TRUE; @@ -3435,6 +3471,11 @@ zpool_do_upgrade(int argc, char **argv) usage(B_FALSE); } break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + usage(B_FALSE); + break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -3495,8 +3536,9 @@ zpool_do_upgrade(int argc, char **argv) (void) printf(gettext(" 11 Improved scrub performance\n")); (void) printf(gettext(" 12 Snapshot properties\n")); (void) printf(gettext(" 13 snapused property\n")); - (void) printf(gettext(" 14 passthrough-x aclinherit " - "support\n")); + (void) printf(gettext(" 14 passthrough-x aclinherit\n")); + (void) printf(gettext(" 15 user/group space accounting\n")); + (void) printf(gettext(" 16 stmf property support\n")); (void) printf(gettext("For more information on a particular " "version, including supported releases, see:\n\n")); (void) printf("http://www.opensolaris.org/os/community/zfs/" diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 4503a3d02..746db0c07 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -76,6 +76,7 @@ #include <sys/spa.h> #include <sys/dmu.h> #include <sys/txg.h> +#include <sys/dbuf.h> #include <sys/zap.h> #include <sys/dmu_objset.h> #include <sys/poll.h> @@ -92,6 +93,7 @@ #include <sys/vdev_file.h> #include <sys/spa_impl.h> #include <sys/dsl_prop.h> +#include <sys/dsl_dataset.h> #include <sys/refcount.h> #include <stdio.h> #include <stdio_ext.h> @@ -162,6 +164,7 @@ typedef void ztest_func_t(ztest_args_t *); * Note: these aren't static because we want dladdr() to work. */ ztest_func_t ztest_dmu_read_write; +ztest_func_t ztest_dmu_read_write_zcopy; ztest_func_t ztest_dmu_write_parallel; ztest_func_t ztest_dmu_object_alloc_free; ztest_func_t ztest_zap; @@ -170,6 +173,7 @@ ztest_func_t ztest_traverse; ztest_func_t ztest_dsl_prop_get_set; ztest_func_t ztest_dmu_objset_create_destroy; ztest_func_t ztest_dmu_snapshot_create_destroy; +ztest_func_t ztest_dsl_dataset_promote_busy; ztest_func_t ztest_spa_create_destroy; ztest_func_t ztest_fault_inject; ztest_func_t ztest_spa_rename; @@ -196,6 +200,7 @@ uint64_t zopt_rarely = 60; /* every 60 seconds */ ztest_info_t ztest_info[] = { { ztest_dmu_read_write, 1, &zopt_always }, + { ztest_dmu_read_write_zcopy, 1, &zopt_always }, { ztest_dmu_write_parallel, 30, &zopt_always }, { ztest_dmu_object_alloc_free, 1, &zopt_always }, { ztest_zap, 30, &zopt_always }, @@ -208,6 +213,7 @@ ztest_info_t ztest_info[] = { { ztest_spa_rename, 1, &zopt_rarely }, { ztest_vdev_attach_detach, 1, &zopt_rarely }, { ztest_vdev_LUN_growth, 1, &zopt_rarely }, + { ztest_dsl_dataset_promote_busy, 1, &zopt_rarely }, { ztest_vdev_add_remove, 1, &zopt_vdevtime }, { ztest_vdev_aux_add_remove, 1, &zopt_vdevtime }, { ztest_scrub, 1, &zopt_vdevtime }, @@ -242,9 +248,11 @@ static ztest_shared_t *ztest_shared; static int ztest_random_fd; static int ztest_dump_core = 1; +static uint64_t metaslab_sz; static boolean_t ztest_exiting; extern uint64_t metaslab_gang_bang; +extern uint64_t metaslab_df_alloc_threshold; #define ZTEST_DIROBJ 1 #define ZTEST_MICROZAP_OBJ 2 @@ -946,7 +954,7 @@ ztest_vdev_aux_add_remove(ztest_args_t *za) * of devices that have pending state changes. */ if (ztest_random(2) == 0) - (void) vdev_online(spa, guid, B_FALSE, NULL); + (void) vdev_online(spa, guid, 0, NULL); error = spa_vdev_remove(spa, guid, B_FALSE); if (error != 0 && error != EBUSY) @@ -1024,7 +1032,7 @@ ztest_vdev_attach_detach(ztest_args_t *za) } oldguid = oldvd->vdev_guid; - oldsize = vdev_get_rsize(oldvd); + oldsize = vdev_get_min_asize(oldvd); oldvd_is_log = oldvd->vdev_top->vdev_islog; (void) strcpy(oldpath, oldvd->vdev_path); pvd = oldvd->vdev_parent; @@ -1060,7 +1068,7 @@ ztest_vdev_attach_detach(ztest_args_t *za) } if (newvd) { - newsize = vdev_get_rsize(newvd); + newsize = vdev_get_min_asize(newvd); } else { /* * Make newsize a little bigger or smaller than oldsize. @@ -1136,49 +1144,202 @@ ztest_vdev_attach_detach(ztest_args_t *za) } /* + * Callback function which expands the physical size of the vdev. + */ +vdev_t * +grow_vdev(vdev_t *vd, void *arg) +{ + spa_t *spa = vd->vdev_spa; + size_t *newsize = arg; + size_t fsize; + int fd; + + ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE); + ASSERT(vd->vdev_ops->vdev_op_leaf); + + if ((fd = open(vd->vdev_path, O_RDWR)) == -1) + return (vd); + + fsize = lseek(fd, 0, SEEK_END); + (void) ftruncate(fd, *newsize); + + if (zopt_verbose >= 6) { + (void) printf("%s grew from %lu to %lu bytes\n", + vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize); + } + (void) close(fd); + return (NULL); +} + +/* + * Callback function which expands a given vdev by calling vdev_online(). + */ +/* ARGSUSED */ +vdev_t * +online_vdev(vdev_t *vd, void *arg) +{ + spa_t *spa = vd->vdev_spa; + vdev_t *tvd = vd->vdev_top; + vdev_t *pvd = vd->vdev_parent; + uint64_t guid = vd->vdev_guid; + + ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE); + ASSERT(vd->vdev_ops->vdev_op_leaf); + + /* Calling vdev_online will initialize the new metaslabs */ + spa_config_exit(spa, SCL_STATE, spa); + (void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL); + spa_config_enter(spa, SCL_STATE, spa, RW_READER); + + /* + * Since we dropped the lock we need to ensure that we're + * still talking to the original vdev. It's possible this + * vdev may have been detached/replaced while we were + * trying to online it. + */ + if (vd != vdev_lookup_by_guid(tvd, guid) || vd->vdev_parent != pvd) { + if (zopt_verbose >= 6) { + (void) printf("vdev %p has disappeared, was " + "guid %llu\n", (void *)vd, (u_longlong_t)guid); + } + return (vd); + } + return (NULL); +} + +/* + * Traverse the vdev tree calling the supplied function. + * We continue to walk the tree until we either have walked all + * children or we receive a non-NULL return from the callback. + * If a NULL callback is passed, then we just return back the first + * leaf vdev we encounter. + */ +vdev_t * +vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg) +{ + if (vd->vdev_ops->vdev_op_leaf) { + if (func == NULL) + return (vd); + else + return (func(vd, arg)); + } + + for (uint_t c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL) + return (cvd); + } + return (NULL); +} + +/* * Verify that dynamic LUN growth works as expected. */ void ztest_vdev_LUN_growth(ztest_args_t *za) { spa_t *spa = za->za_spa; - char dev_name[MAXPATHLEN]; - uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz; - uint64_t vdev; - size_t fsize; - int fd; + vdev_t *vd, *tvd = NULL; + size_t psize, newsize; + uint64_t spa_newsize, spa_cursize, ms_count; (void) mutex_lock(&ztest_shared->zs_vdev_lock); + mutex_enter(&spa_namespace_lock); + spa_config_enter(spa, SCL_STATE, spa, RW_READER); + + while (tvd == NULL || tvd->vdev_islog) { + uint64_t vdev; + + vdev = ztest_random(spa->spa_root_vdev->vdev_children); + tvd = spa->spa_root_vdev->vdev_child[vdev]; + } /* - * Pick a random leaf vdev. + * Determine the size of the first leaf vdev associated with + * our top-level device. */ - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - vdev = ztest_random(spa->spa_root_vdev->vdev_children * leaves); - spa_config_exit(spa, SCL_VDEV, FTAG); + vd = vdev_walk_tree(tvd, NULL, NULL); + ASSERT3P(vd, !=, NULL); + ASSERT(vd->vdev_ops->vdev_op_leaf); - (void) sprintf(dev_name, ztest_dev_template, zopt_dir, zopt_pool, vdev); + psize = vd->vdev_psize; - if ((fd = open(dev_name, O_RDWR)) != -1) { - /* - * Determine the size. - */ - fsize = lseek(fd, 0, SEEK_END); + /* + * We only try to expand the vdev if it's less than 4x its + * original size and it has a valid psize. + */ + if (psize == 0 || psize >= 4 * zopt_vdev_size) { + spa_config_exit(spa, SCL_STATE, spa); + mutex_exit(&spa_namespace_lock); + (void) mutex_unlock(&ztest_shared->zs_vdev_lock); + return; + } + ASSERT(psize > 0); + newsize = psize + psize / 8; + ASSERT3U(newsize, >, psize); - /* - * If it's less than 2x the original size, grow by around 3%. - */ - if (fsize < 2 * zopt_vdev_size) { - size_t newsize = fsize + ztest_random(fsize / 32); - (void) ftruncate(fd, newsize); - if (zopt_verbose >= 6) { - (void) printf("%s grew from %lu to %lu bytes\n", - dev_name, (ulong_t)fsize, (ulong_t)newsize); - } + if (zopt_verbose >= 6) { + (void) printf("Expanding vdev %s from %lu to %lu\n", + vd->vdev_path, (ulong_t)psize, (ulong_t)newsize); + } + + spa_cursize = spa_get_space(spa); + ms_count = tvd->vdev_ms_count; + + /* + * Growing the vdev is a two step process: + * 1). expand the physical size (i.e. relabel) + * 2). online the vdev to create the new metaslabs + */ + if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL || + vdev_walk_tree(tvd, online_vdev, NULL) != NULL || + tvd->vdev_state != VDEV_STATE_HEALTHY) { + if (zopt_verbose >= 5) { + (void) printf("Could not expand LUN because " + "some vdevs were not healthy\n"); } - (void) close(fd); + (void) spa_config_exit(spa, SCL_STATE, spa); + mutex_exit(&spa_namespace_lock); + (void) mutex_unlock(&ztest_shared->zs_vdev_lock); + return; } + (void) spa_config_exit(spa, SCL_STATE, spa); + mutex_exit(&spa_namespace_lock); + + /* + * Expanding the LUN will update the config asynchronously, + * thus we must wait for the async thread to complete any + * pending tasks before proceeding. + */ + mutex_enter(&spa->spa_async_lock); + while (spa->spa_async_thread != NULL || spa->spa_async_tasks) + cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); + mutex_exit(&spa->spa_async_lock); + + spa_config_enter(spa, SCL_STATE, spa, RW_READER); + spa_newsize = spa_get_space(spa); + + /* + * Make sure we were able to grow the pool. + */ + if (ms_count >= tvd->vdev_ms_count || + spa_cursize >= spa_newsize) { + (void) printf("Top-level vdev metaslab count: " + "before %llu, after %llu\n", + (u_longlong_t)ms_count, + (u_longlong_t)tvd->vdev_ms_count); + fatal(0, "LUN expansion failed: before %llu, " + "after %llu\n", spa_cursize, spa_newsize); + } else if (zopt_verbose >= 5) { + char oldnumbuf[6], newnumbuf[6]; + + nicenum(spa_cursize, oldnumbuf); + nicenum(spa_newsize, newnumbuf); + (void) printf("%s grew from %s to %s\n", + spa->spa_name, oldnumbuf, newnumbuf); + } + spa_config_exit(spa, SCL_STATE, spa); (void) mutex_unlock(&ztest_shared->zs_vdev_lock); } @@ -1425,7 +1586,8 @@ ztest_dmu_snapshot_create_destroy(ztest_args_t *za) error = dmu_objset_destroy(snapname); if (error != 0 && error != ENOENT) fatal(0, "dmu_objset_destroy() = %d", error); - error = dmu_objset_snapshot(osname, strchr(snapname, '@')+1, FALSE); + error = dmu_objset_snapshot(osname, strchr(snapname, '@')+1, + NULL, FALSE); if (error == ENOSPC) ztest_record_enospc("dmu_take_snapshot"); else if (error != 0 && error != EEXIST) @@ -1434,6 +1596,148 @@ ztest_dmu_snapshot_create_destroy(ztest_args_t *za) } /* + * Cleanup non-standard snapshots and clones. + */ +void +ztest_dsl_dataset_cleanup(char *osname, uint64_t curval) +{ + char snap1name[100]; + char clone1name[100]; + char snap2name[100]; + char clone2name[100]; + char snap3name[100]; + int error; + + (void) snprintf(snap1name, 100, "%s@s1_%llu", osname, curval); + (void) snprintf(clone1name, 100, "%s/c1_%llu", osname, curval); + (void) snprintf(snap2name, 100, "%s@s2_%llu", clone1name, curval); + (void) snprintf(clone2name, 100, "%s/c2_%llu", osname, curval); + (void) snprintf(snap3name, 100, "%s@s3_%llu", clone1name, curval); + + error = dmu_objset_destroy(clone2name); + if (error && error != ENOENT) + fatal(0, "dmu_objset_destroy(%s) = %d", clone2name, error); + error = dmu_objset_destroy(snap3name); + if (error && error != ENOENT) + fatal(0, "dmu_objset_destroy(%s) = %d", snap3name, error); + error = dmu_objset_destroy(snap2name); + if (error && error != ENOENT) + fatal(0, "dmu_objset_destroy(%s) = %d", snap2name, error); + error = dmu_objset_destroy(clone1name); + if (error && error != ENOENT) + fatal(0, "dmu_objset_destroy(%s) = %d", clone1name, error); + error = dmu_objset_destroy(snap1name); + if (error && error != ENOENT) + fatal(0, "dmu_objset_destroy(%s) = %d", snap1name, error); +} + +/* + * Verify dsl_dataset_promote handles EBUSY + */ +void +ztest_dsl_dataset_promote_busy(ztest_args_t *za) +{ + int error; + objset_t *os = za->za_os; + objset_t *clone; + dsl_dataset_t *ds; + char snap1name[100]; + char clone1name[100]; + char snap2name[100]; + char clone2name[100]; + char snap3name[100]; + char osname[MAXNAMELEN]; + uint64_t curval = za->za_instance; + + (void) rw_rdlock(&ztest_shared->zs_name_lock); + + dmu_objset_name(os, osname); + ztest_dsl_dataset_cleanup(osname, curval); + + (void) snprintf(snap1name, 100, "%s@s1_%llu", osname, curval); + (void) snprintf(clone1name, 100, "%s/c1_%llu", osname, curval); + (void) snprintf(snap2name, 100, "%s@s2_%llu", clone1name, curval); + (void) snprintf(clone2name, 100, "%s/c2_%llu", osname, curval); + (void) snprintf(snap3name, 100, "%s@s3_%llu", clone1name, curval); + + error = dmu_objset_snapshot(osname, strchr(snap1name, '@')+1, + NULL, FALSE); + if (error && error != EEXIST) { + if (error == ENOSPC) { + ztest_record_enospc("dmu_take_snapshot"); + goto out; + } + fatal(0, "dmu_take_snapshot(%s) = %d", snap1name, error); + } + + error = dmu_objset_open(snap1name, DMU_OST_OTHER, + DS_MODE_USER | DS_MODE_READONLY, &clone); + if (error) + fatal(0, "dmu_open_snapshot(%s) = %d", snap1name, error); + + error = dmu_objset_create(clone1name, DMU_OST_OTHER, clone, 0, + NULL, NULL); + dmu_objset_close(clone); + if (error) { + if (error == ENOSPC) { + ztest_record_enospc("dmu_objset_create"); + goto out; + } + fatal(0, "dmu_objset_create(%s) = %d", clone1name, error); + } + + error = dmu_objset_snapshot(clone1name, strchr(snap2name, '@')+1, + NULL, FALSE); + if (error && error != EEXIST) { + if (error == ENOSPC) { + ztest_record_enospc("dmu_take_snapshot"); + goto out; + } + fatal(0, "dmu_open_snapshot(%s) = %d", snap2name, error); + } + + error = dmu_objset_snapshot(clone1name, strchr(snap3name, '@')+1, + NULL, FALSE); + if (error && error != EEXIST) { + if (error == ENOSPC) { + ztest_record_enospc("dmu_take_snapshot"); + goto out; + } + fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error); + } + + error = dmu_objset_open(snap3name, DMU_OST_OTHER, + DS_MODE_USER | DS_MODE_READONLY, &clone); + if (error) + fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error); + + error = dmu_objset_create(clone2name, DMU_OST_OTHER, clone, 0, + NULL, NULL); + dmu_objset_close(clone); + if (error) { + if (error == ENOSPC) { + ztest_record_enospc("dmu_objset_create"); + goto out; + } + fatal(0, "dmu_objset_create(%s) = %d", clone2name, error); + } + + error = dsl_dataset_own(snap1name, DS_MODE_READONLY, FTAG, &ds); + if (error) + fatal(0, "dsl_dataset_own(%s) = %d", snap1name, error); + error = dsl_dataset_promote(clone2name); + if (error != EBUSY) + fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name, + error); + dsl_dataset_disown(ds, FTAG); + +out: + ztest_dsl_dataset_cleanup(osname, curval); + + (void) rw_unlock(&ztest_shared->zs_name_lock); +} + +/* * Verify that dmu_object_{alloc,free} work as expected. */ void @@ -1456,7 +1760,7 @@ ztest_dmu_object_alloc_free(ztest_args_t *za) * Create a batch object if necessary, and record it in the directory. */ VERIFY3U(0, ==, dmu_read(os, ZTEST_DIROBJ, za->za_diroff, - sizeof (uint64_t), &batchobj)); + sizeof (uint64_t), &batchobj, DMU_READ_PREFETCH)); if (batchobj == 0) { tx = dmu_tx_create(os); dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, @@ -1481,7 +1785,7 @@ ztest_dmu_object_alloc_free(ztest_args_t *za) */ for (b = 0; b < batchsize; b++) { VERIFY3U(0, ==, dmu_read(os, batchobj, b * sizeof (uint64_t), - sizeof (uint64_t), &object)); + sizeof (uint64_t), &object, DMU_READ_PREFETCH)); if (object == 0) continue; /* @@ -1516,7 +1820,7 @@ ztest_dmu_object_alloc_free(ztest_args_t *za) * We expect the word at endoff to be our object number. */ VERIFY(0 == dmu_read(os, object, endoff, - sizeof (uint64_t), &temp)); + sizeof (uint64_t), &temp, DMU_READ_PREFETCH)); if (temp != object) { fatal(0, "bad data in %s, got %llu, expected %llu", @@ -1701,7 +2005,7 @@ ztest_dmu_read_write(ztest_args_t *za) * Read the directory info. If it's the first time, set things up. */ VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff, - sizeof (dd), &dd)); + sizeof (dd), &dd, DMU_READ_PREFETCH)); if (dd.dd_chunk == 0) { ASSERT(dd.dd_packobj == 0); ASSERT(dd.dd_bigobj == 0); @@ -1763,9 +2067,11 @@ ztest_dmu_read_write(ztest_args_t *za) /* * Read the current contents of our objects. */ - error = dmu_read(os, dd.dd_packobj, packoff, packsize, packbuf); + error = dmu_read(os, dd.dd_packobj, packoff, packsize, packbuf, + DMU_READ_PREFETCH); ASSERT3U(error, ==, 0); - error = dmu_read(os, dd.dd_bigobj, bigoff, bigsize, bigbuf); + error = dmu_read(os, dd.dd_bigobj, bigoff, bigsize, bigbuf, + DMU_READ_PREFETCH); ASSERT3U(error, ==, 0); /* @@ -1871,9 +2177,9 @@ ztest_dmu_read_write(ztest_args_t *za) void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); VERIFY(0 == dmu_read(os, dd.dd_packobj, packoff, - packsize, packcheck)); + packsize, packcheck, DMU_READ_PREFETCH)); VERIFY(0 == dmu_read(os, dd.dd_bigobj, bigoff, - bigsize, bigcheck)); + bigsize, bigcheck, DMU_READ_PREFETCH)); ASSERT(bcmp(packbuf, packcheck, packsize) == 0); ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0); @@ -1887,6 +2193,314 @@ ztest_dmu_read_write(ztest_args_t *za) } void +compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf, + uint64_t bigsize, uint64_t n, dmu_read_write_dir_t dd, uint64_t txg) +{ + uint64_t i; + bufwad_t *pack; + bufwad_t *bigH; + bufwad_t *bigT; + + /* + * For each index from n to n + s, verify that the existing bufwad + * in packobj matches the bufwads at the head and tail of the + * corresponding chunk in bigobj. Then update all three bufwads + * with the new values we want to write out. + */ + for (i = 0; i < s; i++) { + /* LINTED */ + pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); + /* LINTED */ + bigH = (bufwad_t *)((char *)bigbuf + i * dd.dd_chunk); + /* LINTED */ + bigT = (bufwad_t *)((char *)bigH + dd.dd_chunk) - 1; + + ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize); + ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize); + + if (pack->bw_txg > txg) + fatal(0, "future leak: got %llx, open txg is %llx", + pack->bw_txg, txg); + + if (pack->bw_data != 0 && pack->bw_index != n + i) + fatal(0, "wrong index: got %llx, wanted %llx+%llx", + pack->bw_index, n, i); + + if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0) + fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH); + + if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0) + fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT); + + pack->bw_index = n + i; + pack->bw_txg = txg; + pack->bw_data = 1 + ztest_random(-2ULL); + + *bigH = *pack; + *bigT = *pack; + } +} + +void +ztest_dmu_read_write_zcopy(ztest_args_t *za) +{ + objset_t *os = za->za_os; + dmu_read_write_dir_t dd; + dmu_tx_t *tx; + uint64_t i; + int error; + uint64_t n, s, txg; + bufwad_t *packbuf, *bigbuf; + uint64_t packoff, packsize, bigoff, bigsize; + uint64_t regions = 997; + uint64_t stride = 123456789ULL; + uint64_t width = 9; + dmu_buf_t *bonus_db; + arc_buf_t **bigbuf_arcbufs; + dmu_object_info_t *doi = &za->za_doi; + + /* + * This test uses two objects, packobj and bigobj, that are always + * updated together (i.e. in the same tx) so that their contents are + * in sync and can be compared. Their contents relate to each other + * in a simple way: packobj is a dense array of 'bufwad' structures, + * while bigobj is a sparse array of the same bufwads. Specifically, + * for any index n, there are three bufwads that should be identical: + * + * packobj, at offset n * sizeof (bufwad_t) + * bigobj, at the head of the nth chunk + * bigobj, at the tail of the nth chunk + * + * The chunk size is set equal to bigobj block size so that + * dmu_assign_arcbuf() can be tested for object updates. + */ + + /* + * Read the directory info. If it's the first time, set things up. + */ + VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff, + sizeof (dd), &dd, DMU_READ_PREFETCH)); + if (dd.dd_chunk == 0) { + ASSERT(dd.dd_packobj == 0); + ASSERT(dd.dd_bigobj == 0); + tx = dmu_tx_create(os); + dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, sizeof (dd)); + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + ztest_record_enospc("create r/w directory"); + dmu_tx_abort(tx); + return; + } + + dd.dd_packobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0, + DMU_OT_NONE, 0, tx); + dd.dd_bigobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0, + DMU_OT_NONE, 0, tx); + ztest_set_random_blocksize(os, dd.dd_packobj, tx); + ztest_set_random_blocksize(os, dd.dd_bigobj, tx); + + VERIFY(dmu_object_info(os, dd.dd_bigobj, doi) == 0); + ASSERT(doi->doi_data_block_size >= 2 * sizeof (bufwad_t)); + ASSERT(ISP2(doi->doi_data_block_size)); + dd.dd_chunk = doi->doi_data_block_size; + + dmu_write(os, ZTEST_DIROBJ, za->za_diroff, sizeof (dd), &dd, + tx); + dmu_tx_commit(tx); + } else { + VERIFY(dmu_object_info(os, dd.dd_bigobj, doi) == 0); + VERIFY(ISP2(doi->doi_data_block_size)); + VERIFY(dd.dd_chunk == doi->doi_data_block_size); + VERIFY(dd.dd_chunk >= 2 * sizeof (bufwad_t)); + } + + /* + * Pick a random index and compute the offsets into packobj and bigobj. + */ + n = ztest_random(regions) * stride + ztest_random(width); + s = 1 + ztest_random(width - 1); + + packoff = n * sizeof (bufwad_t); + packsize = s * sizeof (bufwad_t); + + bigoff = n * dd.dd_chunk; + bigsize = s * dd.dd_chunk; + + packbuf = umem_zalloc(packsize, UMEM_NOFAIL); + bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL); + + VERIFY(dmu_bonus_hold(os, dd.dd_bigobj, FTAG, &bonus_db) == 0); + + bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL); + + /* + * Iteration 0 test zcopy for DB_UNCACHED dbufs. + * Iteration 1 test zcopy to already referenced dbufs. + * Iteration 2 test zcopy to dirty dbuf in the same txg. + * Iteration 3 test zcopy to dbuf dirty in previous txg. + * Iteration 4 test zcopy when dbuf is no longer dirty. + * Iteration 5 test zcopy when it can't be done. + * Iteration 6 one more zcopy write. + */ + for (i = 0; i < 7; i++) { + uint64_t j; + uint64_t off; + + /* + * In iteration 5 (i == 5) use arcbufs + * that don't match bigobj blksz to test + * dmu_assign_arcbuf() when it can't directly + * assign an arcbuf to a dbuf. + */ + for (j = 0; j < s; j++) { + if (i != 5) { + bigbuf_arcbufs[j] = + dmu_request_arcbuf(bonus_db, + dd.dd_chunk); + } else { + bigbuf_arcbufs[2 * j] = + dmu_request_arcbuf(bonus_db, + dd.dd_chunk / 2); + bigbuf_arcbufs[2 * j + 1] = + dmu_request_arcbuf(bonus_db, + dd.dd_chunk / 2); + } + } + + /* + * Get a tx for the mods to both packobj and bigobj. + */ + tx = dmu_tx_create(os); + + dmu_tx_hold_write(tx, dd.dd_packobj, packoff, packsize); + dmu_tx_hold_write(tx, dd.dd_bigobj, bigoff, bigsize); + + if (ztest_random(100) == 0) { + error = -1; + } else { + error = dmu_tx_assign(tx, TXG_WAIT); + } + + if (error) { + if (error != -1) { + ztest_record_enospc("dmu r/w range"); + } + dmu_tx_abort(tx); + umem_free(packbuf, packsize); + umem_free(bigbuf, bigsize); + for (j = 0; j < s; j++) { + if (i != 5) { + dmu_return_arcbuf(bigbuf_arcbufs[j]); + } else { + dmu_return_arcbuf( + bigbuf_arcbufs[2 * j]); + dmu_return_arcbuf( + bigbuf_arcbufs[2 * j + 1]); + } + } + umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); + dmu_buf_rele(bonus_db, FTAG); + return; + } + + txg = dmu_tx_get_txg(tx); + + /* + * 50% of the time don't read objects in the 1st iteration to + * test dmu_assign_arcbuf() for the case when there're no + * existing dbufs for the specified offsets. + */ + if (i != 0 || ztest_random(2) != 0) { + error = dmu_read(os, dd.dd_packobj, packoff, + packsize, packbuf, DMU_READ_PREFETCH); + ASSERT3U(error, ==, 0); + error = dmu_read(os, dd.dd_bigobj, bigoff, bigsize, + bigbuf, DMU_READ_PREFETCH); + ASSERT3U(error, ==, 0); + } + compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize, + n, dd, txg); + + /* + * We've verified all the old bufwads, and made new ones. + * Now write them out. + */ + dmu_write(os, dd.dd_packobj, packoff, packsize, packbuf, tx); + if (zopt_verbose >= 6) { + (void) printf("writing offset %llx size %llx" + " txg %llx\n", + (u_longlong_t)bigoff, + (u_longlong_t)bigsize, + (u_longlong_t)txg); + } + for (off = bigoff, j = 0; j < s; j++, off += dd.dd_chunk) { + dmu_buf_t *dbt; + if (i != 5) { + bcopy((caddr_t)bigbuf + (off - bigoff), + bigbuf_arcbufs[j]->b_data, dd.dd_chunk); + } else { + bcopy((caddr_t)bigbuf + (off - bigoff), + bigbuf_arcbufs[2 * j]->b_data, + dd.dd_chunk / 2); + bcopy((caddr_t)bigbuf + (off - bigoff) + + dd.dd_chunk / 2, + bigbuf_arcbufs[2 * j + 1]->b_data, + dd.dd_chunk / 2); + } + + if (i == 1) { + VERIFY(dmu_buf_hold(os, dd.dd_bigobj, off, + FTAG, &dbt) == 0); + } + if (i != 5) { + dmu_assign_arcbuf(bonus_db, off, + bigbuf_arcbufs[j], tx); + } else { + dmu_assign_arcbuf(bonus_db, off, + bigbuf_arcbufs[2 * j], tx); + dmu_assign_arcbuf(bonus_db, + off + dd.dd_chunk / 2, + bigbuf_arcbufs[2 * j + 1], tx); + } + if (i == 1) { + dmu_buf_rele(dbt, FTAG); + } + } + dmu_tx_commit(tx); + + /* + * Sanity check the stuff we just wrote. + */ + { + void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); + void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); + + VERIFY(0 == dmu_read(os, dd.dd_packobj, packoff, + packsize, packcheck, DMU_READ_PREFETCH)); + VERIFY(0 == dmu_read(os, dd.dd_bigobj, bigoff, + bigsize, bigcheck, DMU_READ_PREFETCH)); + + ASSERT(bcmp(packbuf, packcheck, packsize) == 0); + ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0); + + umem_free(packcheck, packsize); + umem_free(bigcheck, bigsize); + } + if (i == 2) { + txg_wait_open(dmu_objset_pool(os), 0); + } else if (i == 3) { + txg_wait_synced(dmu_objset_pool(os), 0); + } + } + + dmu_buf_rele(bonus_db, FTAG); + umem_free(packbuf, packsize); + umem_free(bigbuf, bigsize); + umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); +} + +void ztest_dmu_check_future_leak(ztest_args_t *za) { objset_t *os = za->za_os; @@ -1935,6 +2549,8 @@ ztest_dmu_write_parallel(ztest_args_t *za) uint64_t blkoff; zbookmark_t zb; dmu_tx_t *tx = dmu_tx_create(os); + dmu_buf_t *bonus_db; + arc_buf_t *abuf = NULL; dmu_objset_name(os, osname); @@ -1963,6 +2579,12 @@ ztest_dmu_write_parallel(ztest_args_t *za) } } + if (off != -1ULL && P2PHASE(off, bs) == 0 && !do_free && + ztest_random(8) == 0) { + VERIFY(dmu_bonus_hold(os, ZTEST_DIROBJ, FTAG, &bonus_db) == 0); + abuf = dmu_request_arcbuf(bonus_db, bs); + } + txg_how = ztest_random(2) == 0 ? TXG_WAIT : TXG_NOWAIT; error = dmu_tx_assign(tx, txg_how); if (error) { @@ -1973,6 +2595,10 @@ ztest_dmu_write_parallel(ztest_args_t *za) ztest_record_enospc("dmu write parallel"); } dmu_tx_abort(tx); + if (abuf != NULL) { + dmu_return_arcbuf(abuf); + dmu_buf_rele(bonus_db, FTAG); + } return; } txg = dmu_tx_get_txg(tx); @@ -2027,8 +2653,12 @@ ztest_dmu_write_parallel(ztest_args_t *za) za->za_dbuf = NULL; } else if (do_free) { VERIFY(dmu_free_range(os, ZTEST_DIROBJ, off, bs, tx) == 0); - } else { + } else if (abuf == NULL) { dmu_write(os, ZTEST_DIROBJ, off, btsize, wbt, tx); + } else { + bcopy(wbt, abuf->b_data, btsize); + dmu_assign_arcbuf(bonus_db, off, abuf, tx); + dmu_buf_rele(bonus_db, FTAG); } (void) mutex_unlock(lp); @@ -2064,16 +2694,20 @@ ztest_dmu_write_parallel(ztest_args_t *za) dmu_buf_rele(db, FTAG); za->za_dbuf = NULL; - (void) mutex_unlock(lp); - - if (error) + if (error) { + (void) mutex_unlock(lp); return; + } - if (blk.blk_birth == 0) /* concurrent free */ + if (blk.blk_birth == 0) { /* concurrent free */ + (void) mutex_unlock(lp); return; + } txg_suspend(dmu_objset_pool(os)); + (void) mutex_unlock(lp); + ASSERT(blk.blk_fill == 1); ASSERT3U(BP_GET_TYPE(&blk), ==, DMU_OT_UINT64_OTHER); ASSERT3U(BP_GET_LEVEL(&blk), ==, 0); @@ -2146,7 +2780,7 @@ ztest_zap(ztest_args_t *za) * Create a new object if necessary, and record it in the directory. */ VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff, - sizeof (uint64_t), &object)); + sizeof (uint64_t), &object, DMU_READ_PREFETCH)); if (object == 0) { tx = dmu_tx_create(os); @@ -2799,7 +3433,7 @@ ztest_verify_blocks(char *pool) isa = strdup(isa); /* LINTED */ (void) sprintf(bin, - "/usr/sbin%.*s/zdb -bc%s%s -U /tmp/zpool.cache %s", + "/usr/sbin%.*s/zdb -bcc%s%s -U /tmp/zpool.cache %s", isalen, isa, zopt_verbose >= 3 ? "s" : "", @@ -2944,7 +3578,7 @@ ztest_resume(spa_t *spa) spa_vdev_state_enter(spa); vdev_clear(spa, NULL); (void) spa_vdev_state_exit(spa, NULL, 0); - zio_resume(spa); + (void) zio_resume(spa); } } @@ -3216,6 +3850,10 @@ ztest_run(char *pool) (void) snprintf(name, 100, "%s/%s_%d", pool, pool, d); if (zopt_verbose >= 3) (void) printf("Destroying %s to free up space\n", name); + + /* Cleanup any non-standard clones and snapshots */ + ztest_dsl_dataset_cleanup(name, za[d].za_instance); + (void) dmu_objset_find(name, ztest_destroy_cb, &za[d], DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); (void) rw_unlock(&ztest_shared->zs_name_lock); @@ -3296,6 +3934,8 @@ ztest_init(char *pool) if (error) fatal(0, "spa_open() = %d", error); + metaslab_sz = 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; + if (zopt_verbose >= 3) show_pool_stats(spa); @@ -3387,6 +4027,9 @@ main(int argc, char **argv) zi->zi_call_time = 0; } + /* Set the allocation switch size */ + metaslab_df_alloc_threshold = ztest_random(metaslab_sz / 4) + 1; + pid = fork(); if (pid == -1) diff --git a/lib/libzfs/include/libzfs.h b/lib/libzfs/include/libzfs.h index a259fb469..8d00a3151 100644 --- a/lib/libzfs/include/libzfs.h +++ b/lib/libzfs/include/libzfs.h @@ -116,6 +116,7 @@ enum { EZFS_VDEVNOTSUP, /* unsupported vdev type */ EZFS_NOTSUP, /* ops not supported on this dataset */ EZFS_ACTIVE_SPARE, /* pool has active shared spare devices */ + EZFS_UNPLAYED_LOGS, /* log device has unplayed logs */ EZFS_UNKNOWN }; @@ -178,6 +179,7 @@ extern const char *libzfs_error_action(libzfs_handle_t *); extern const char *libzfs_error_description(libzfs_handle_t *); extern void libzfs_mnttab_init(libzfs_handle_t *); extern void libzfs_mnttab_fini(libzfs_handle_t *); +extern void libzfs_mnttab_cache(libzfs_handle_t *, boolean_t); extern int libzfs_mnttab_find(libzfs_handle_t *, const char *, struct mnttab *); extern void libzfs_mnttab_add(libzfs_handle_t *, const char *, @@ -229,6 +231,8 @@ extern int zpool_vdev_clear(zpool_handle_t *, uint64_t); extern nvlist_t *zpool_find_vdev(zpool_handle_t *, const char *, boolean_t *, boolean_t *, boolean_t *); +extern nvlist_t *zpool_find_vdev_by_physpath(zpool_handle_t *, const char *, + boolean_t *, boolean_t *, boolean_t *); extern int zpool_label_disk(libzfs_handle_t *, zpool_handle_t *, char *); /* @@ -335,7 +339,8 @@ extern int zpool_stage_history(libzfs_handle_t *, const char *); extern void zpool_obj_to_path(zpool_handle_t *, uint64_t, uint64_t, char *, size_t len); extern int zfs_ioctl(libzfs_handle_t *, int, struct zfs_cmd *); -extern int zpool_get_physpath(zpool_handle_t *, char *); +extern int zpool_get_physpath(zpool_handle_t *, char *, size_t); + /* * Basic handle manipulations. These functions do not create or destroy the * underlying datasets, only the references to them. @@ -368,6 +373,10 @@ extern int zfs_prop_get(zfs_handle_t *, zfs_prop_t, char *, size_t, zprop_source_t *, char *, size_t, boolean_t); extern int zfs_prop_get_numeric(zfs_handle_t *, zfs_prop_t, uint64_t *, zprop_source_t *, char *, size_t); +extern int zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname, + uint64_t *propvalue); +extern int zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname, + char *propbuf, int proplen, boolean_t literal); extern uint64_t zfs_prop_get_int(zfs_handle_t *, zfs_prop_t); extern int zfs_prop_inherit(zfs_handle_t *, const char *); extern const char *zfs_prop_values(zfs_prop_t); @@ -384,6 +393,7 @@ typedef struct zprop_list { } zprop_list_t; extern int zfs_expand_proplist(zfs_handle_t *, zprop_list_t **); +extern void zfs_prune_proplist(zfs_handle_t *, uint8_t *); #define ZFS_MOUNTPOINT_NONE "none" #define ZFS_MOUNTPOINT_LEGACY "legacy" @@ -454,6 +464,12 @@ extern int zfs_send(zfs_handle_t *, const char *, const char *, boolean_t, boolean_t, boolean_t, boolean_t, int); extern int zfs_promote(zfs_handle_t *); +typedef int (*zfs_userspace_cb_t)(void *arg, const char *domain, + uid_t rid, uint64_t space); + +extern int zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type, + zfs_userspace_cb_t func, void *arg); + typedef struct recvflags { /* print informational messages (ie, -v was specified) */ int verbose : 1; @@ -492,17 +508,6 @@ extern boolean_t zfs_dataset_exists(libzfs_handle_t *, const char *, extern int zfs_spa_version(zfs_handle_t *, int *); /* - * dataset permission functions. - */ -extern int zfs_perm_set(zfs_handle_t *, nvlist_t *); -extern int zfs_perm_remove(zfs_handle_t *, nvlist_t *); -extern int zfs_build_perms(zfs_handle_t *, char *, char *, - zfs_deleg_who_type_t, zfs_deleg_inherit_t, nvlist_t **nvlist_t); -extern int zfs_perm_get(zfs_handle_t *, zfs_allow_t **); -extern void zfs_free_allows(zfs_allow_t *); -extern void zfs_deleg_permissions(void); - -/* * Mount support functions. */ extern boolean_t is_mounted(libzfs_handle_t *, const char *special, char **); @@ -536,7 +541,7 @@ extern boolean_t zfs_is_shared_iscsi(zfs_handle_t *); extern int zfs_share_iscsi(zfs_handle_t *); extern int zfs_unshare_iscsi(zfs_handle_t *); extern int zfs_iscsi_perm_check(libzfs_handle_t *, char *, ucred_t *); -extern int zfs_deleg_share_nfs(libzfs_handle_t *, char *, char *, +extern int zfs_deleg_share_nfs(libzfs_handle_t *, char *, char *, char *, void *, void *, int, zfs_share_op_t); /* @@ -575,6 +580,15 @@ extern int zpool_remove_zvol_links(zpool_handle_t *); extern int zvol_check_dump_config(char *); /* + * Management interfaces for SMB ACL files + */ + +int zfs_smb_acl_add(libzfs_handle_t *, char *, char *, char *); +int zfs_smb_acl_remove(libzfs_handle_t *, char *, char *, char *); +int zfs_smb_acl_purge(libzfs_handle_t *, char *, char *); +int zfs_smb_acl_rename(libzfs_handle_t *, char *, char *, char *, char *); + +/* * Enable and disable datasets within a pool by mounting/unmounting and * sharing/unsharing them. */ diff --git a/lib/libzfs/include/libzfs_impl.h b/lib/libzfs/include/libzfs_impl.h index 073499b26..70a1d1cbf 100644 --- a/lib/libzfs/include/libzfs_impl.h +++ b/lib/libzfs/include/libzfs_impl.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -63,6 +63,7 @@ struct libzfs_handle { int libzfs_printerr; void *libzfs_sharehdl; /* libshare handle */ uint_t libzfs_shareflags; + boolean_t libzfs_mnttab_enable; avl_tree_t libzfs_mnttab_cache; }; #define ZFSSHARE_MISS 0x01 /* Didn't find entry in cache */ @@ -78,6 +79,7 @@ struct zfs_handle { nvlist_t *zfs_user_props; boolean_t zfs_mntcheck; char *zfs_mntopts; + uint8_t *zfs_props_table; }; /* @@ -185,7 +187,7 @@ extern int zfs_init_libshare(libzfs_handle_t *, int); extern void zfs_uninit_libshare(libzfs_handle_t *); extern int zfs_parse_options(char *, zfs_share_proto_t); -extern int zfs_unshare_proto(zfs_handle_t *zhp, +extern int zfs_unshare_proto(zfs_handle_t *, const char *, zfs_share_proto_t *); #ifdef __cplusplus } diff --git a/lib/libzfs/libzfs_changelist.c b/lib/libzfs/libzfs_changelist.c index b905bc6cb..6fa196710 100644 --- a/lib/libzfs/libzfs_changelist.c +++ b/lib/libzfs/libzfs_changelist.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * * Portions Copyright 2007 Ramprakash Jelari @@ -218,6 +218,7 @@ changelist_postfix(prop_changelist_t *clp) boolean_t sharenfs; boolean_t sharesmb; + boolean_t mounted; /* * If we are in the global zone, but this dataset is exported @@ -272,20 +273,29 @@ changelist_postfix(prop_changelist_t *clp) shareopts, sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0) && (strcmp(shareopts, "off") != 0)); - if ((cn->cn_mounted || clp->cl_waslegacy || sharenfs || - sharesmb) && !zfs_is_mounted(cn->cn_handle, NULL) && - zfs_mount(cn->cn_handle, NULL, 0) != 0) - errors++; + mounted = zfs_is_mounted(cn->cn_handle, NULL); + + if (!mounted && (cn->cn_mounted || + ((sharenfs || sharesmb || clp->cl_waslegacy) && + (zfs_prop_get_int(cn->cn_handle, + ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_ON)))) { + + if (zfs_mount(cn->cn_handle, NULL, 0) != 0) + errors++; + else + mounted = TRUE; + } /* - * We always re-share even if the filesystem is currently - * shared, so that we can adopt any new options. + * If the file system is mounted we always re-share even + * if the filesystem is currently shared, so that we can + * adopt any new options. */ - if (sharenfs) + if (sharenfs && mounted) errors += zfs_share_nfs(cn->cn_handle); else if (cn->cn_shared || clp->cl_waslegacy) errors += zfs_unshare_nfs(cn->cn_handle, NULL); - if (sharesmb) + if (sharesmb && mounted) errors += zfs_share_smb(cn->cn_handle); else if (cn->cn_shared || clp->cl_waslegacy) errors += zfs_unshare_smb(cn->cn_handle, NULL); @@ -621,8 +631,6 @@ changelist_gather(zfs_handle_t *zhp, zfs_prop_t prop, int gather_flags, clp->cl_prop = ZFS_PROP_MOUNTPOINT; } else if (prop == ZFS_PROP_VOLSIZE) { clp->cl_prop = ZFS_PROP_MOUNTPOINT; - } else if (prop == ZFS_PROP_VERSION) { - clp->cl_prop = ZFS_PROP_MOUNTPOINT; } else { clp->cl_prop = prop; } diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c index a381a0e63..ac9122605 100644 --- a/lib/libzfs/libzfs_dataset.c +++ b/lib/libzfs/libzfs_dataset.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -45,6 +45,8 @@ #include <grp.h> #include <stddef.h> #include <ucred.h> +#include <idmap.h> +#include <aclutils.h> #include <sys/spa.h> #include <sys/zap.h> @@ -56,6 +58,8 @@ #include "zfs_deleg.h" static int zvol_create_link_common(libzfs_handle_t *, const char *, int); +static int userquota_propname_decode(const char *propname, boolean_t zoned, + zfs_userquota_prop_t *typep, char *domain, int domainlen, uint64_t *ridp); /* * Given a single type (not a mask of types), return the type in a human @@ -121,8 +125,8 @@ path_to_str(const char *path, int types) /* * Validate a ZFS path. This is used even before trying to open the dataset, to - * provide a more meaningful error message. We place a more useful message in - * 'buf' detailing exactly why the name was not valid. + * provide a more meaningful error message. We call zfs_error_aux() to + * explain exactly why the name was not valid. */ static int zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type, @@ -346,6 +350,10 @@ put_stats_zhdl(zfs_handle_t *zhp, zfs_cmd_t *zc) return (-1); } + /* + * XXX Why do we store the user props separately, in addition to + * storing them in zfs_props? + */ if ((userprops = process_user_props(zhp, allprops)) == NULL) { nvlist_free(allprops); return (-1); @@ -605,11 +613,15 @@ libzfs_mnttab_cache_compare(const void *arg1, const void *arg2) void libzfs_mnttab_init(libzfs_handle_t *hdl) { - struct mnttab entry; - assert(avl_numnodes(&hdl->libzfs_mnttab_cache) == 0); avl_create(&hdl->libzfs_mnttab_cache, libzfs_mnttab_cache_compare, sizeof (mnttab_node_t), offsetof(mnttab_node_t, mtn_node)); +} + +void +libzfs_mnttab_update(libzfs_handle_t *hdl) +{ + struct mnttab entry; rewind(hdl->libzfs_mnttab); while (getmntent(hdl->libzfs_mnttab, &entry) == 0) { @@ -642,6 +654,12 @@ libzfs_mnttab_fini(libzfs_handle_t *hdl) avl_destroy(&hdl->libzfs_mnttab_cache); } +void +libzfs_mnttab_cache(libzfs_handle_t *hdl, boolean_t enable) +{ + hdl->libzfs_mnttab_enable = enable; +} + int libzfs_mnttab_find(libzfs_handle_t *hdl, const char *fsname, struct mnttab *entry) @@ -649,8 +667,22 @@ libzfs_mnttab_find(libzfs_handle_t *hdl, const char *fsname, mnttab_node_t find; mnttab_node_t *mtn; + if (!hdl->libzfs_mnttab_enable) { + struct mnttab srch = { 0 }; + + if (avl_numnodes(&hdl->libzfs_mnttab_cache)) + libzfs_mnttab_fini(hdl); + rewind(hdl->libzfs_mnttab); + srch.mnt_special = (char *)fsname; + srch.mnt_fstype = MNTTYPE_ZFS; + if (getmntany(hdl->libzfs_mnttab, entry, &srch) == 0) + return (0); + else + return (ENOENT); + } + if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0) - libzfs_mnttab_init(hdl); + libzfs_mnttab_update(hdl); find.mtn_mt.mnt_special = (char *)fsname; mtn = avl_find(&hdl->libzfs_mnttab_cache, &find, NULL); @@ -748,23 +780,18 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl, return (NULL); } + /* + * Make sure this property is valid and applies to this type. + */ + elem = NULL; while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { const char *propname = nvpair_name(elem); - /* - * Make sure this property is valid and applies to this type. - */ - if ((prop = zfs_name_to_prop(propname)) == ZPROP_INVAL) { - if (!zfs_prop_user(propname)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "invalid property '%s'"), propname); - (void) zfs_error(hdl, EZFS_BADPROP, errbuf); - goto error; - } - + prop = zfs_name_to_prop(propname); + if (prop == ZPROP_INVAL && zfs_prop_user(propname)) { /* - * If this is a user property, make sure it's a + * This is a user property: make sure it's a * string, and that it's less than ZAP_MAXNAMELEN. */ if (nvpair_type(elem) != DATA_TYPE_STRING) { @@ -790,6 +817,10 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl, continue; } + /* + * Currently, only user properties can be modified on + * snapshots. + */ if (type == ZFS_TYPE_SNAPSHOT) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "this property can not be modified for snapshots")); @@ -797,6 +828,80 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl, goto error; } + if (prop == ZPROP_INVAL && zfs_prop_userquota(propname)) { + zfs_userquota_prop_t uqtype; + char newpropname[128]; + char domain[128]; + uint64_t rid; + uint64_t valary[3]; + + if (userquota_propname_decode(propname, zoned, + &uqtype, domain, sizeof (domain), &rid) != 0) { + zfs_error_aux(hdl, + dgettext(TEXT_DOMAIN, + "'%s' has an invalid user/group name"), + propname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + + if (uqtype != ZFS_PROP_USERQUOTA && + uqtype != ZFS_PROP_GROUPQUOTA) { + zfs_error_aux(hdl, + dgettext(TEXT_DOMAIN, "'%s' is readonly"), + propname); + (void) zfs_error(hdl, EZFS_PROPREADONLY, + errbuf); + goto error; + } + + if (nvpair_type(elem) == DATA_TYPE_STRING) { + (void) nvpair_value_string(elem, &strval); + if (strcmp(strval, "none") == 0) { + intval = 0; + } else if (zfs_nicestrtonum(hdl, + strval, &intval) != 0) { + (void) zfs_error(hdl, + EZFS_BADPROP, errbuf); + goto error; + } + } else if (nvpair_type(elem) == + DATA_TYPE_UINT64) { + (void) nvpair_value_uint64(elem, &intval); + if (intval == 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "use 'none' to disable " + "userquota/groupquota")); + goto error; + } + } else { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' must be a number"), propname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + + (void) snprintf(newpropname, sizeof (newpropname), + "%s%s", zfs_userquota_prop_prefixes[uqtype], + domain); + valary[0] = uqtype; + valary[1] = rid; + valary[2] = intval; + if (nvlist_add_uint64_array(ret, newpropname, + valary, 3) != 0) { + (void) no_memory(hdl); + goto error; + } + continue; + } + + if (prop == ZPROP_INVAL) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid property '%s'"), propname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + if (!zfs_prop_valid_for_type(prop, type)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' does not " @@ -936,7 +1041,7 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl, } else if (getzoneid() != GLOBAL_ZONEID) { /* * If zoned property is 'off', this must be in - * a globle zone. If not, something is wrong. + * a global zone. If not, something is wrong. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' cannot be set while dataset " @@ -1120,808 +1225,6 @@ error: return (NULL); } -static int -zfs_get_perm_who(const char *who, zfs_deleg_who_type_t *who_type, - uint64_t *ret_who) -{ - struct passwd *pwd; - struct group *grp; - uid_t id; - - if (*who_type == ZFS_DELEG_EVERYONE || *who_type == ZFS_DELEG_CREATE || - *who_type == ZFS_DELEG_NAMED_SET) { - *ret_who = -1; - return (0); - } - if (who == NULL && !(*who_type == ZFS_DELEG_EVERYONE)) - return (EZFS_BADWHO); - - if (*who_type == ZFS_DELEG_WHO_UNKNOWN && - strcmp(who, "everyone") == 0) { - *ret_who = -1; - *who_type = ZFS_DELEG_EVERYONE; - return (0); - } - - pwd = getpwnam(who); - grp = getgrnam(who); - - if ((*who_type == ZFS_DELEG_USER) && pwd) { - *ret_who = pwd->pw_uid; - } else if ((*who_type == ZFS_DELEG_GROUP) && grp) { - *ret_who = grp->gr_gid; - } else if (pwd) { - *ret_who = pwd->pw_uid; - *who_type = ZFS_DELEG_USER; - } else if (grp) { - *ret_who = grp->gr_gid; - *who_type = ZFS_DELEG_GROUP; - } else { - char *end; - - id = strtol(who, &end, 10); - if (errno != 0 || *end != '\0') { - return (EZFS_BADWHO); - } else { - *ret_who = id; - if (*who_type == ZFS_DELEG_WHO_UNKNOWN) - *who_type = ZFS_DELEG_USER; - } - } - - return (0); -} - -static void -zfs_perms_add_to_nvlist(nvlist_t *who_nvp, char *name, nvlist_t *perms_nvp) -{ - if (perms_nvp != NULL) { - verify(nvlist_add_nvlist(who_nvp, - name, perms_nvp) == 0); - } else { - verify(nvlist_add_boolean(who_nvp, name) == 0); - } -} - -static void -helper(zfs_deleg_who_type_t who_type, uint64_t whoid, char *whostr, - zfs_deleg_inherit_t inherit, nvlist_t *who_nvp, nvlist_t *perms_nvp, - nvlist_t *sets_nvp) -{ - boolean_t do_perms, do_sets; - char name[ZFS_MAX_DELEG_NAME]; - - do_perms = (nvlist_next_nvpair(perms_nvp, NULL) != NULL); - do_sets = (nvlist_next_nvpair(sets_nvp, NULL) != NULL); - - if (!do_perms && !do_sets) - do_perms = do_sets = B_TRUE; - - if (do_perms) { - zfs_deleg_whokey(name, who_type, inherit, - (who_type == ZFS_DELEG_NAMED_SET) ? - whostr : (void *)&whoid); - zfs_perms_add_to_nvlist(who_nvp, name, perms_nvp); - } - if (do_sets) { - zfs_deleg_whokey(name, toupper(who_type), inherit, - (who_type == ZFS_DELEG_NAMED_SET) ? - whostr : (void *)&whoid); - zfs_perms_add_to_nvlist(who_nvp, name, sets_nvp); - } -} - -static void -zfs_perms_add_who_nvlist(nvlist_t *who_nvp, uint64_t whoid, void *whostr, - nvlist_t *perms_nvp, nvlist_t *sets_nvp, - zfs_deleg_who_type_t who_type, zfs_deleg_inherit_t inherit) -{ - if (who_type == ZFS_DELEG_NAMED_SET || who_type == ZFS_DELEG_CREATE) { - helper(who_type, whoid, whostr, 0, - who_nvp, perms_nvp, sets_nvp); - } else { - if (inherit & ZFS_DELEG_PERM_LOCAL) { - helper(who_type, whoid, whostr, ZFS_DELEG_LOCAL, - who_nvp, perms_nvp, sets_nvp); - } - if (inherit & ZFS_DELEG_PERM_DESCENDENT) { - helper(who_type, whoid, whostr, ZFS_DELEG_DESCENDENT, - who_nvp, perms_nvp, sets_nvp); - } - } -} - -/* - * Construct nvlist to pass down to kernel for setting/removing permissions. - * - * The nvlist is constructed as a series of nvpairs with an optional embedded - * nvlist of permissions to remove or set. The topmost nvpairs are the actual - * base attribute named stored in the dsl. - * Arguments: - * - * whostr: is a comma separated list of users, groups, or a single set name. - * whostr may be null for everyone or create perms. - * who_type: is the type of entry in whostr. Typically this will be - * ZFS_DELEG_WHO_UNKNOWN. - * perms: common separated list of permissions. May be null if user - * is requested to remove permissions by who. - * inherit: Specifies the inheritance of the permissions. Will be either - * ZFS_DELEG_PERM_LOCAL and/or ZFS_DELEG_PERM_DESCENDENT. - * nvp The constructed nvlist to pass to zfs_perm_set(). - * The output nvp will look something like this. - * ul$1234 -> {create ; destroy } - * Ul$1234 -> { @myset } - * s-$@myset - { snapshot; checksum; compression } - */ -int -zfs_build_perms(zfs_handle_t *zhp, char *whostr, char *perms, - zfs_deleg_who_type_t who_type, zfs_deleg_inherit_t inherit, nvlist_t **nvp) -{ - nvlist_t *who_nvp; - nvlist_t *perms_nvp = NULL; - nvlist_t *sets_nvp = NULL; - char errbuf[1024]; - char *who_tok, *perm; - int error; - - *nvp = NULL; - - if (perms) { - if ((error = nvlist_alloc(&perms_nvp, - NV_UNIQUE_NAME, 0)) != 0) { - return (1); - } - if ((error = nvlist_alloc(&sets_nvp, - NV_UNIQUE_NAME, 0)) != 0) { - nvlist_free(perms_nvp); - return (1); - } - } - - if ((error = nvlist_alloc(&who_nvp, NV_UNIQUE_NAME, 0)) != 0) { - if (perms_nvp) - nvlist_free(perms_nvp); - if (sets_nvp) - nvlist_free(sets_nvp); - return (1); - } - - if (who_type == ZFS_DELEG_NAMED_SET) { - namecheck_err_t why; - char what; - - if ((error = permset_namecheck(whostr, &why, &what)) != 0) { - nvlist_free(who_nvp); - if (perms_nvp) - nvlist_free(perms_nvp); - if (sets_nvp) - nvlist_free(sets_nvp); - - switch (why) { - case NAME_ERR_NO_AT: - zfs_error_aux(zhp->zfs_hdl, - dgettext(TEXT_DOMAIN, - "set definition must begin with an '@' " - "character")); - } - return (zfs_error(zhp->zfs_hdl, - EZFS_BADPERMSET, whostr)); - } - } - - /* - * Build up nvlist(s) of permissions. Two nvlists are maintained. - * The first nvlist perms_nvp will have normal permissions and the - * other sets_nvp will have only permssion set names in it. - */ - for (perm = strtok(perms, ","); perm; perm = strtok(NULL, ",")) { - const char *perm_canonical = zfs_deleg_canonicalize_perm(perm); - - if (perm_canonical) { - verify(nvlist_add_boolean(perms_nvp, - perm_canonical) == 0); - } else if (perm[0] == '@') { - verify(nvlist_add_boolean(sets_nvp, perm) == 0); - } else { - nvlist_free(who_nvp); - nvlist_free(perms_nvp); - nvlist_free(sets_nvp); - return (zfs_error(zhp->zfs_hdl, EZFS_BADPERM, perm)); - } - } - - if (whostr && who_type != ZFS_DELEG_CREATE) { - who_tok = strtok(whostr, ","); - if (who_tok == NULL) { - nvlist_free(who_nvp); - if (perms_nvp) - nvlist_free(perms_nvp); - if (sets_nvp) - nvlist_free(sets_nvp); - (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, "Who string is NULL"), - whostr); - return (zfs_error(zhp->zfs_hdl, EZFS_BADWHO, errbuf)); - } - } - - /* - * Now create the nvlist(s) - */ - do { - uint64_t who_id; - - error = zfs_get_perm_who(who_tok, &who_type, - &who_id); - if (error) { - nvlist_free(who_nvp); - if (perms_nvp) - nvlist_free(perms_nvp); - if (sets_nvp) - nvlist_free(sets_nvp); - (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, - "Unable to determine uid/gid for " - "%s "), who_tok); - return (zfs_error(zhp->zfs_hdl, EZFS_BADWHO, errbuf)); - } - - /* - * add entries for both local and descendent when required - */ - zfs_perms_add_who_nvlist(who_nvp, who_id, who_tok, - perms_nvp, sets_nvp, who_type, inherit); - - } while (who_tok = strtok(NULL, ",")); - *nvp = who_nvp; - return (0); -} - -static int -zfs_perm_set_common(zfs_handle_t *zhp, nvlist_t *nvp, boolean_t unset) -{ - zfs_cmd_t zc = { 0 }; - int error; - char errbuf[1024]; - - (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, "Cannot update 'allows' for '%s'"), - zhp->zfs_name); - - if (zcmd_write_src_nvlist(zhp->zfs_hdl, &zc, nvp)) - return (-1); - - (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - zc.zc_perm_action = unset; - - error = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SET_FSACL, &zc); - if (error && errno == ENOTSUP) { - (void) snprintf(errbuf, sizeof (errbuf), - gettext("Pool must be upgraded to use 'allow/unallow'")); - zcmd_free_nvlists(&zc); - return (zfs_error(zhp->zfs_hdl, EZFS_BADVERSION, errbuf)); - } else if (error) { - return (zfs_standard_error(zhp->zfs_hdl, errno, errbuf)); - } - zcmd_free_nvlists(&zc); - - return (error); -} - -int -zfs_perm_set(zfs_handle_t *zhp, nvlist_t *nvp) -{ - return (zfs_perm_set_common(zhp, nvp, B_FALSE)); -} - -int -zfs_perm_remove(zfs_handle_t *zhp, nvlist_t *perms) -{ - return (zfs_perm_set_common(zhp, perms, B_TRUE)); -} - -static int -perm_compare(const void *arg1, const void *arg2) -{ - const zfs_perm_node_t *node1 = arg1; - const zfs_perm_node_t *node2 = arg2; - int ret; - - ret = strcmp(node1->z_pname, node2->z_pname); - - if (ret > 0) - return (1); - if (ret < 0) - return (-1); - else - return (0); -} - -static void -zfs_destroy_perm_tree(avl_tree_t *tree) -{ - zfs_perm_node_t *permnode; - void *cookie = NULL; - - while ((permnode = avl_destroy_nodes(tree, &cookie)) != NULL) - free(permnode); - avl_destroy(tree); -} - -static void -zfs_destroy_tree(avl_tree_t *tree) -{ - zfs_allow_node_t *allownode; - void *cookie = NULL; - - while ((allownode = avl_destroy_nodes(tree, &cookie)) != NULL) { - zfs_destroy_perm_tree(&allownode->z_localdescend); - zfs_destroy_perm_tree(&allownode->z_local); - zfs_destroy_perm_tree(&allownode->z_descend); - free(allownode); - } - avl_destroy(tree); -} - -void -zfs_free_allows(zfs_allow_t *allow) -{ - zfs_allow_t *allownext; - zfs_allow_t *freeallow; - - allownext = allow; - while (allownext) { - zfs_destroy_tree(&allownext->z_sets); - zfs_destroy_tree(&allownext->z_crperms); - zfs_destroy_tree(&allownext->z_user); - zfs_destroy_tree(&allownext->z_group); - zfs_destroy_tree(&allownext->z_everyone); - freeallow = allownext; - allownext = allownext->z_next; - free(freeallow); - } -} - -static zfs_allow_t * -zfs_alloc_perm_tree(zfs_handle_t *zhp, zfs_allow_t *prev, char *setpoint) -{ - zfs_allow_t *ptree; - - if ((ptree = zfs_alloc(zhp->zfs_hdl, - sizeof (zfs_allow_t))) == NULL) { - return (NULL); - } - - (void) strlcpy(ptree->z_setpoint, setpoint, sizeof (ptree->z_setpoint)); - avl_create(&ptree->z_sets, - perm_compare, sizeof (zfs_allow_node_t), - offsetof(zfs_allow_node_t, z_node)); - avl_create(&ptree->z_crperms, - perm_compare, sizeof (zfs_allow_node_t), - offsetof(zfs_allow_node_t, z_node)); - avl_create(&ptree->z_user, - perm_compare, sizeof (zfs_allow_node_t), - offsetof(zfs_allow_node_t, z_node)); - avl_create(&ptree->z_group, - perm_compare, sizeof (zfs_allow_node_t), - offsetof(zfs_allow_node_t, z_node)); - avl_create(&ptree->z_everyone, - perm_compare, sizeof (zfs_allow_node_t), - offsetof(zfs_allow_node_t, z_node)); - - if (prev) - prev->z_next = ptree; - ptree->z_next = NULL; - return (ptree); -} - -/* - * Add permissions to the appropriate AVL permission tree. - * The appropriate tree may not be the requested tree. - * For example if ld indicates a local permission, but - * same permission also exists as a descendent permission - * then the permission will be removed from the descendent - * tree and add the the local+descendent tree. - */ -static int -zfs_coalesce_perm(zfs_handle_t *zhp, zfs_allow_node_t *allownode, - char *perm, char ld) -{ - zfs_perm_node_t pnode, *permnode, *permnode2; - zfs_perm_node_t *newnode; - avl_index_t where, where2; - avl_tree_t *tree, *altree; - - (void) strlcpy(pnode.z_pname, perm, sizeof (pnode.z_pname)); - - if (ld == ZFS_DELEG_NA) { - tree = &allownode->z_localdescend; - altree = &allownode->z_descend; - } else if (ld == ZFS_DELEG_LOCAL) { - tree = &allownode->z_local; - altree = &allownode->z_descend; - } else { - tree = &allownode->z_descend; - altree = &allownode->z_local; - } - permnode = avl_find(tree, &pnode, &where); - permnode2 = avl_find(altree, &pnode, &where2); - - if (permnode2) { - avl_remove(altree, permnode2); - free(permnode2); - if (permnode == NULL) { - tree = &allownode->z_localdescend; - } - } - - /* - * Now insert new permission in either requested location - * local/descendent or into ld when perm will exist in both. - */ - if (permnode == NULL) { - if ((newnode = zfs_alloc(zhp->zfs_hdl, - sizeof (zfs_perm_node_t))) == NULL) { - return (-1); - } - *newnode = pnode; - avl_add(tree, newnode); - } - return (0); -} - -/* - * Uggh, this is going to be a bit complicated. - * we have an nvlist coming out of the kernel that - * will indicate where the permission is set and then - * it will contain allow of the various "who's", and what - * their permissions are. To further complicate this - * we will then have to coalesce the local,descendent - * and local+descendent permissions where appropriate. - * The kernel only knows about a permission as being local - * or descendent, but not both. - * - * In order to make this easier for zfs_main to deal with - * a series of AVL trees will be used to maintain - * all of this, primarily for sorting purposes as well - * as the ability to quickly locate a specific entry. - * - * What we end up with are tree's for sets, create perms, - * user, groups and everyone. With each of those trees - * we have subtrees for local, descendent and local+descendent - * permissions. - */ -int -zfs_perm_get(zfs_handle_t *zhp, zfs_allow_t **zfs_perms) -{ - zfs_cmd_t zc = { 0 }; - int error; - nvlist_t *nvlist; - nvlist_t *permnv, *sourcenv; - nvpair_t *who_pair, *source_pair; - nvpair_t *perm_pair; - char errbuf[1024]; - zfs_allow_t *zallowp, *newallowp; - char ld; - char *nvpname; - uid_t uid; - gid_t gid; - avl_tree_t *tree; - avl_index_t where; - - (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - - if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0) - return (-1); - - while (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_GET_FSACL, &zc) != 0) { - if (errno == ENOMEM) { - if (zcmd_expand_dst_nvlist(zhp->zfs_hdl, &zc) != 0) { - zcmd_free_nvlists(&zc); - return (-1); - } - } else if (errno == ENOTSUP) { - zcmd_free_nvlists(&zc); - (void) snprintf(errbuf, sizeof (errbuf), - gettext("Pool must be upgraded to use 'allow'")); - return (zfs_error(zhp->zfs_hdl, - EZFS_BADVERSION, errbuf)); - } else { - zcmd_free_nvlists(&zc); - return (-1); - } - } - - if (zcmd_read_dst_nvlist(zhp->zfs_hdl, &zc, &nvlist) != 0) { - zcmd_free_nvlists(&zc); - return (-1); - } - - zcmd_free_nvlists(&zc); - - source_pair = nvlist_next_nvpair(nvlist, NULL); - - if (source_pair == NULL) { - *zfs_perms = NULL; - return (0); - } - - *zfs_perms = zfs_alloc_perm_tree(zhp, NULL, nvpair_name(source_pair)); - if (*zfs_perms == NULL) { - return (0); - } - - zallowp = *zfs_perms; - - for (;;) { - struct passwd *pwd; - struct group *grp; - zfs_allow_node_t *allownode; - zfs_allow_node_t findallownode; - zfs_allow_node_t *newallownode; - - (void) strlcpy(zallowp->z_setpoint, - nvpair_name(source_pair), - sizeof (zallowp->z_setpoint)); - - if ((error = nvpair_value_nvlist(source_pair, &sourcenv)) != 0) - goto abort; - - /* - * Make sure nvlist is composed correctly - */ - if (zfs_deleg_verify_nvlist(sourcenv)) { - goto abort; - } - - who_pair = nvlist_next_nvpair(sourcenv, NULL); - if (who_pair == NULL) { - goto abort; - } - - do { - error = nvpair_value_nvlist(who_pair, &permnv); - if (error) { - goto abort; - } - - /* - * First build up the key to use - * for looking up in the various - * who trees. - */ - ld = nvpair_name(who_pair)[1]; - nvpname = nvpair_name(who_pair); - switch (nvpair_name(who_pair)[0]) { - case ZFS_DELEG_USER: - case ZFS_DELEG_USER_SETS: - tree = &zallowp->z_user; - uid = atol(&nvpname[3]); - pwd = getpwuid(uid); - (void) snprintf(findallownode.z_key, - sizeof (findallownode.z_key), "user %s", - (pwd) ? pwd->pw_name : - &nvpair_name(who_pair)[3]); - break; - case ZFS_DELEG_GROUP: - case ZFS_DELEG_GROUP_SETS: - tree = &zallowp->z_group; - gid = atol(&nvpname[3]); - grp = getgrgid(gid); - (void) snprintf(findallownode.z_key, - sizeof (findallownode.z_key), "group %s", - (grp) ? grp->gr_name : - &nvpair_name(who_pair)[3]); - break; - case ZFS_DELEG_CREATE: - case ZFS_DELEG_CREATE_SETS: - tree = &zallowp->z_crperms; - (void) strlcpy(findallownode.z_key, "", - sizeof (findallownode.z_key)); - break; - case ZFS_DELEG_EVERYONE: - case ZFS_DELEG_EVERYONE_SETS: - (void) snprintf(findallownode.z_key, - sizeof (findallownode.z_key), "everyone"); - tree = &zallowp->z_everyone; - break; - case ZFS_DELEG_NAMED_SET: - case ZFS_DELEG_NAMED_SET_SETS: - (void) snprintf(findallownode.z_key, - sizeof (findallownode.z_key), "%s", - &nvpair_name(who_pair)[3]); - tree = &zallowp->z_sets; - break; - } - - /* - * Place who in tree - */ - allownode = avl_find(tree, &findallownode, &where); - if (allownode == NULL) { - if ((newallownode = zfs_alloc(zhp->zfs_hdl, - sizeof (zfs_allow_node_t))) == NULL) { - goto abort; - } - avl_create(&newallownode->z_localdescend, - perm_compare, - sizeof (zfs_perm_node_t), - offsetof(zfs_perm_node_t, z_node)); - avl_create(&newallownode->z_local, - perm_compare, - sizeof (zfs_perm_node_t), - offsetof(zfs_perm_node_t, z_node)); - avl_create(&newallownode->z_descend, - perm_compare, - sizeof (zfs_perm_node_t), - offsetof(zfs_perm_node_t, z_node)); - (void) strlcpy(newallownode->z_key, - findallownode.z_key, - sizeof (findallownode.z_key)); - avl_insert(tree, newallownode, where); - allownode = newallownode; - } - - /* - * Now iterate over the permissions and - * place them in the appropriate local, - * descendent or local+descendent tree. - * - * The permissions are added to the tree - * via zfs_coalesce_perm(). - */ - perm_pair = nvlist_next_nvpair(permnv, NULL); - if (perm_pair == NULL) - goto abort; - do { - if (zfs_coalesce_perm(zhp, allownode, - nvpair_name(perm_pair), ld) != 0) - goto abort; - } while (perm_pair = nvlist_next_nvpair(permnv, - perm_pair)); - } while (who_pair = nvlist_next_nvpair(sourcenv, who_pair)); - - source_pair = nvlist_next_nvpair(nvlist, source_pair); - if (source_pair == NULL) - break; - - /* - * allocate another node from the link list of - * zfs_allow_t structures - */ - newallowp = zfs_alloc_perm_tree(zhp, zallowp, - nvpair_name(source_pair)); - if (newallowp == NULL) { - goto abort; - } - zallowp = newallowp; - } - nvlist_free(nvlist); - return (0); -abort: - zfs_free_allows(*zfs_perms); - nvlist_free(nvlist); - return (-1); -} - -static char * -zfs_deleg_perm_note(zfs_deleg_note_t note) -{ - /* - * Don't put newlines on end of lines - */ - switch (note) { - case ZFS_DELEG_NOTE_CREATE: - return (dgettext(TEXT_DOMAIN, - "Must also have the 'mount' ability")); - case ZFS_DELEG_NOTE_DESTROY: - return (dgettext(TEXT_DOMAIN, - "Must also have the 'mount' ability")); - case ZFS_DELEG_NOTE_SNAPSHOT: - return (dgettext(TEXT_DOMAIN, - "Must also have the 'mount' ability")); - case ZFS_DELEG_NOTE_ROLLBACK: - return (dgettext(TEXT_DOMAIN, - "Must also have the 'mount' ability")); - case ZFS_DELEG_NOTE_CLONE: - return (dgettext(TEXT_DOMAIN, "Must also have the 'create' " - "ability and 'mount'\n" - "\t\t\t\tability in the origin file system")); - case ZFS_DELEG_NOTE_PROMOTE: - return (dgettext(TEXT_DOMAIN, "Must also have the 'mount'\n" - "\t\t\t\tand 'promote' ability in the origin file system")); - case ZFS_DELEG_NOTE_RENAME: - return (dgettext(TEXT_DOMAIN, "Must also have the 'mount' " - "and 'create' \n\t\t\t\tability in the new parent")); - case ZFS_DELEG_NOTE_RECEIVE: - return (dgettext(TEXT_DOMAIN, "Must also have the 'mount'" - " and 'create' ability")); - case ZFS_DELEG_NOTE_USERPROP: - return (dgettext(TEXT_DOMAIN, - "Allows changing any user property")); - case ZFS_DELEG_NOTE_ALLOW: - return (dgettext(TEXT_DOMAIN, - "Must also have the permission that is being\n" - "\t\t\t\tallowed")); - case ZFS_DELEG_NOTE_MOUNT: - return (dgettext(TEXT_DOMAIN, - "Allows mount/umount of ZFS datasets")); - case ZFS_DELEG_NOTE_SHARE: - return (dgettext(TEXT_DOMAIN, - "Allows sharing file systems over NFS or SMB\n" - "\t\t\t\tprotocols")); - case ZFS_DELEG_NOTE_NONE: - default: - return (dgettext(TEXT_DOMAIN, "")); - } -} - -typedef enum { - ZFS_DELEG_SUBCOMMAND, - ZFS_DELEG_PROP, - ZFS_DELEG_OTHER -} zfs_deleg_perm_type_t; - -/* - * is the permission a subcommand or other? - */ -zfs_deleg_perm_type_t -zfs_deleg_perm_type(const char *perm) -{ - if (strcmp(perm, "userprop") == 0) - return (ZFS_DELEG_OTHER); - else - return (ZFS_DELEG_SUBCOMMAND); -} - -static char * -zfs_deleg_perm_type_str(zfs_deleg_perm_type_t type) -{ - switch (type) { - case ZFS_DELEG_SUBCOMMAND: - return (dgettext(TEXT_DOMAIN, "subcommand")); - case ZFS_DELEG_PROP: - return (dgettext(TEXT_DOMAIN, "property")); - case ZFS_DELEG_OTHER: - return (dgettext(TEXT_DOMAIN, "other")); - } - return (""); -} - -/*ARGSUSED*/ -static int -zfs_deleg_prop_cb(int prop, void *cb) -{ - if (zfs_prop_delegatable(prop)) - (void) fprintf(stderr, "%-15s %-15s\n", zfs_prop_to_name(prop), - zfs_deleg_perm_type_str(ZFS_DELEG_PROP)); - - return (ZPROP_CONT); -} - -void -zfs_deleg_permissions(void) -{ - int i; - - (void) fprintf(stderr, "\n%-15s %-15s\t%s\n\n", "NAME", - "TYPE", "NOTES"); - - /* - * First print out the subcommands - */ - for (i = 0; zfs_deleg_perm_tab[i].z_perm != NULL; i++) { - (void) fprintf(stderr, "%-15s %-15s\t%s\n", - zfs_deleg_perm_tab[i].z_perm, - zfs_deleg_perm_type_str( - zfs_deleg_perm_type(zfs_deleg_perm_tab[i].z_perm)), - zfs_deleg_perm_note(zfs_deleg_perm_tab[i].z_note)); - } - - (void) zprop_iter(zfs_deleg_prop_cb, NULL, B_FALSE, B_TRUE, - ZFS_TYPE_DATASET|ZFS_TYPE_VOLUME); -} - /* * Given a property name and value, set the property for the given dataset. */ @@ -1988,6 +1291,7 @@ zfs_prop_set(zfs_handle_t *zhp, const char *propname, const char *propval) goto error; ret = zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc); + if (ret != 0) { switch (errno) { @@ -2198,6 +1502,8 @@ getprop_uint64(zfs_handle_t *zhp, zfs_prop_t prop, char **source) verify(nvlist_lookup_uint64(nv, ZPROP_VALUE, &value) == 0); (void) nvlist_lookup_string(nv, ZPROP_SOURCE, source); } else { + verify(!zhp->zfs_props_table || + zhp->zfs_props_table[prop] == B_TRUE); value = zfs_prop_default_numeric(prop); *source = ""; } @@ -2217,6 +1523,8 @@ getprop_string(zfs_handle_t *zhp, zfs_prop_t prop, char **source) verify(nvlist_lookup_string(nv, ZPROP_VALUE, &value) == 0); (void) nvlist_lookup_string(nv, ZPROP_SOURCE, source); } else { + verify(!zhp->zfs_props_table || + zhp->zfs_props_table[prop] == B_TRUE); if ((value = (char *)zfs_prop_default_string(prop)) == NULL) value = ""; *source = ""; @@ -2393,7 +1701,7 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src, case PROP_TYPE_INDEX: *val = getprop_uint64(zhp, prop, source); /* - * If we tried to use a defalut value for a + * If we tried to use a default value for a * readonly property, it means that it was not * present; return an error. */ @@ -2687,7 +1995,7 @@ zfs_prop_set_int(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t val) { char buf[64]; - zfs_nicenum(val, buf, sizeof (buf)); + (void) snprintf(buf, sizeof (buf), "%llu", (longlong_t)val); return (zfs_prop_set(zhp, zfs_prop_to_name(prop), buf)); } @@ -2720,6 +2028,205 @@ zfs_prop_get_numeric(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t *value, return (0); } +static int +idmap_id_to_numeric_domain_rid(uid_t id, boolean_t isuser, + char **domainp, idmap_rid_t *ridp) +{ + idmap_handle_t *idmap_hdl = NULL; + idmap_get_handle_t *get_hdl = NULL; + idmap_stat status; + int err = EINVAL; + + if (idmap_init(&idmap_hdl) != IDMAP_SUCCESS) + goto out; + if (idmap_get_create(idmap_hdl, &get_hdl) != IDMAP_SUCCESS) + goto out; + + if (isuser) { + err = idmap_get_sidbyuid(get_hdl, id, + IDMAP_REQ_FLG_USE_CACHE, domainp, ridp, &status); + } else { + err = idmap_get_sidbygid(get_hdl, id, + IDMAP_REQ_FLG_USE_CACHE, domainp, ridp, &status); + } + if (err == IDMAP_SUCCESS && + idmap_get_mappings(get_hdl) == IDMAP_SUCCESS && + status == IDMAP_SUCCESS) + err = 0; + else + err = EINVAL; +out: + if (get_hdl) + idmap_get_destroy(get_hdl); + if (idmap_hdl) + (void) idmap_fini(idmap_hdl); + return (err); +} + +/* + * convert the propname into parameters needed by kernel + * Eg: userquota@ahrens -> ZFS_PROP_USERQUOTA, "", 126829 + * Eg: userused@matt@domain -> ZFS_PROP_USERUSED, "S-1-123-456", 789 + */ +static int +userquota_propname_decode(const char *propname, boolean_t zoned, + zfs_userquota_prop_t *typep, char *domain, int domainlen, uint64_t *ridp) +{ + zfs_userquota_prop_t type; + char *cp, *end; + boolean_t isuser; + + domain[0] = '\0'; + + /* Figure out the property type ({user|group}{quota|space}) */ + for (type = 0; type < ZFS_NUM_USERQUOTA_PROPS; type++) { + if (strncmp(propname, zfs_userquota_prop_prefixes[type], + strlen(zfs_userquota_prop_prefixes[type])) == 0) + break; + } + if (type == ZFS_NUM_USERQUOTA_PROPS) + return (EINVAL); + *typep = type; + + isuser = (type == ZFS_PROP_USERQUOTA || + type == ZFS_PROP_USERUSED); + + cp = strchr(propname, '@') + 1; + + if (strchr(cp, '@')) { + /* + * It's a SID name (eg "user@domain") that needs to be + * turned into S-1-domainID-RID. There should be a + * better way to do this, but for now just translate it + * to the (possibly ephemeral) uid and then back to the + * SID. This is like getsidname(noresolve=TRUE). + */ + uid_t id; + idmap_rid_t rid; + char *mapdomain; + + if (zoned && getzoneid() == GLOBAL_ZONEID) + return (ENOENT); + if (sid_to_id(cp, isuser, &id) != 0) + return (ENOENT); + if (idmap_id_to_numeric_domain_rid(id, isuser, + &mapdomain, &rid) != 0) + return (ENOENT); + (void) strlcpy(domain, mapdomain, domainlen); + *ridp = rid; + } else if (strncmp(cp, "S-1-", 4) == 0) { + /* It's a numeric SID (eg "S-1-234-567-89") */ + (void) strcpy(domain, cp); + cp = strrchr(domain, '-'); + *cp = '\0'; + cp++; + + errno = 0; + *ridp = strtoull(cp, &end, 10); + if (errno != 0 || *end != '\0') + return (EINVAL); + } else if (!isdigit(*cp)) { + /* + * It's a user/group name (eg "user") that needs to be + * turned into a uid/gid + */ + if (zoned && getzoneid() == GLOBAL_ZONEID) + return (ENOENT); + if (isuser) { + struct passwd *pw; + pw = getpwnam(cp); + if (pw == NULL) + return (ENOENT); + *ridp = pw->pw_uid; + } else { + struct group *gr; + gr = getgrnam(cp); + if (gr == NULL) + return (ENOENT); + *ridp = gr->gr_gid; + } + } else { + /* It's a user/group ID (eg "12345"). */ + uid_t id = strtoul(cp, &end, 10); + idmap_rid_t rid; + char *mapdomain; + + if (*end != '\0') + return (EINVAL); + if (id > MAXUID) { + /* It's an ephemeral ID. */ + if (idmap_id_to_numeric_domain_rid(id, isuser, + &mapdomain, &rid) != 0) + return (ENOENT); + (void) strcpy(domain, mapdomain); + *ridp = rid; + } else { + *ridp = id; + } + } + + return (0); +} + +static int +zfs_prop_get_userquota_common(zfs_handle_t *zhp, const char *propname, + uint64_t *propvalue, zfs_userquota_prop_t *typep) +{ + int err; + zfs_cmd_t zc = { 0 }; + + (void) strncpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + + err = userquota_propname_decode(propname, + zfs_prop_get_int(zhp, ZFS_PROP_ZONED), + typep, zc.zc_value, sizeof (zc.zc_value), &zc.zc_guid); + zc.zc_objset_type = *typep; + if (err) + return (err); + + err = ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_USERSPACE_ONE, &zc); + if (err) + return (err); + + *propvalue = zc.zc_cookie; + return (0); +} + +int +zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname, + uint64_t *propvalue) +{ + zfs_userquota_prop_t type; + + return (zfs_prop_get_userquota_common(zhp, propname, propvalue, + &type)); +} + +int +zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname, + char *propbuf, int proplen, boolean_t literal) +{ + int err; + uint64_t propvalue; + zfs_userquota_prop_t type; + + err = zfs_prop_get_userquota_common(zhp, propname, &propvalue, + &type); + + if (err) + return (err); + + if (literal) { + (void) snprintf(propbuf, proplen, "%llu", propvalue); + } else if (propvalue == 0 && + (type == ZFS_PROP_USERQUOTA || type == ZFS_PROP_GROUPQUOTA)) { + (void) strlcpy(propbuf, "none", proplen); + } else { + zfs_nicenum(propvalue, propbuf, proplen); + } + return (0); +} + /* * Returns the name of the given zfs handle. */ @@ -2797,12 +2304,6 @@ zfs_iter_filesystems(zfs_handle_t *zhp, zfs_iter_f func, void *data) while ((ret = zfs_do_list_ioctl(zhp, ZFS_IOC_DATASET_LIST_NEXT, &zc)) == 0) { /* - * Ignore private dataset names. - */ - if (dataset_name_hidden(zc.zc_name)) - continue; - - /* * Silently ignore errors, as the only plausible explanation is * that the pool has since been removed. */ @@ -4424,18 +3925,167 @@ zfs_iscsi_perm_check(libzfs_handle_t *hdl, char *dataset, ucred_t *cred) int zfs_deleg_share_nfs(libzfs_handle_t *hdl, char *dataset, char *path, - void *export, void *sharetab, int sharemax, zfs_share_op_t operation) + char *resource, void *export, void *sharetab, + int sharemax, zfs_share_op_t operation) { zfs_cmd_t zc = { 0 }; int error; (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_value, path, sizeof (zc.zc_value)); + if (resource) + (void) strlcpy(zc.zc_string, resource, sizeof (zc.zc_string)); zc.zc_share.z_sharedata = (uint64_t)(uintptr_t)sharetab; zc.zc_share.z_exportdata = (uint64_t)(uintptr_t)export; zc.zc_share.z_sharetype = operation; zc.zc_share.z_sharemax = sharemax; - error = ioctl(hdl->libzfs_fd, ZFS_IOC_SHARE, &zc); return (error); } + +void +zfs_prune_proplist(zfs_handle_t *zhp, uint8_t *props) +{ + nvpair_t *curr; + + /* + * Keep a reference to the props-table against which we prune the + * properties. + */ + zhp->zfs_props_table = props; + + curr = nvlist_next_nvpair(zhp->zfs_props, NULL); + + while (curr) { + zfs_prop_t zfs_prop = zfs_name_to_prop(nvpair_name(curr)); + nvpair_t *next = nvlist_next_nvpair(zhp->zfs_props, curr); + + /* + * We leave user:props in the nvlist, so there will be + * some ZPROP_INVAL. To be extra safe, don't prune + * those. + */ + if (zfs_prop != ZPROP_INVAL && props[zfs_prop] == B_FALSE) + (void) nvlist_remove(zhp->zfs_props, + nvpair_name(curr), nvpair_type(curr)); + curr = next; + } +} + +static int +zfs_smb_acl_mgmt(libzfs_handle_t *hdl, char *dataset, char *path, + zfs_smb_acl_op_t cmd, char *resource1, char *resource2) +{ + zfs_cmd_t zc = { 0 }; + nvlist_t *nvlist = NULL; + int error; + + (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); + (void) strlcpy(zc.zc_value, path, sizeof (zc.zc_value)); + zc.zc_cookie = (uint64_t)cmd; + + if (cmd == ZFS_SMB_ACL_RENAME) { + if (nvlist_alloc(&nvlist, NV_UNIQUE_NAME, 0) != 0) { + (void) no_memory(hdl); + return (NULL); + } + } + + switch (cmd) { + case ZFS_SMB_ACL_ADD: + case ZFS_SMB_ACL_REMOVE: + (void) strlcpy(zc.zc_string, resource1, sizeof (zc.zc_string)); + break; + case ZFS_SMB_ACL_RENAME: + if (nvlist_add_string(nvlist, ZFS_SMB_ACL_SRC, + resource1) != 0) { + (void) no_memory(hdl); + return (-1); + } + if (nvlist_add_string(nvlist, ZFS_SMB_ACL_TARGET, + resource2) != 0) { + (void) no_memory(hdl); + return (-1); + } + if (zcmd_write_src_nvlist(hdl, &zc, nvlist) != 0) { + nvlist_free(nvlist); + return (-1); + } + break; + case ZFS_SMB_ACL_PURGE: + break; + default: + return (-1); + } + error = ioctl(hdl->libzfs_fd, ZFS_IOC_SMB_ACL, &zc); + if (nvlist) + nvlist_free(nvlist); + return (error); +} + +int +zfs_smb_acl_add(libzfs_handle_t *hdl, char *dataset, + char *path, char *resource) +{ + return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_ADD, + resource, NULL)); +} + +int +zfs_smb_acl_remove(libzfs_handle_t *hdl, char *dataset, + char *path, char *resource) +{ + return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_REMOVE, + resource, NULL)); +} + +int +zfs_smb_acl_purge(libzfs_handle_t *hdl, char *dataset, char *path) +{ + return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_PURGE, + NULL, NULL)); +} + +int +zfs_smb_acl_rename(libzfs_handle_t *hdl, char *dataset, char *path, + char *oldname, char *newname) +{ + return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_RENAME, + oldname, newname)); +} + +int +zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type, + zfs_userspace_cb_t func, void *arg) +{ + zfs_cmd_t zc = { 0 }; + int error; + zfs_useracct_t buf[100]; + + (void) strncpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + + zc.zc_objset_type = type; + zc.zc_nvlist_dst = (uintptr_t)buf; + + /* CONSTCOND */ + while (1) { + zfs_useracct_t *zua = buf; + + zc.zc_nvlist_dst_size = sizeof (buf); + error = ioctl(zhp->zfs_hdl->libzfs_fd, + ZFS_IOC_USERSPACE_MANY, &zc); + if (error || zc.zc_nvlist_dst_size == 0) + break; + + while (zc.zc_nvlist_dst_size > 0) { + error = func(arg, zua->zu_domain, zua->zu_rid, + zua->zu_space); + if (error != 0) + return (error); + zua++; + zc.zc_nvlist_dst_size -= sizeof (zfs_useracct_t); + } + } + + return (error); +} diff --git a/lib/libzfs/libzfs_graph.c b/lib/libzfs/libzfs_graph.c index e7cbf2386..bc21c51ae 100644 --- a/lib/libzfs/libzfs_graph.c +++ b/lib/libzfs/libzfs_graph.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Iterate over all children of the current object. This includes the normal * dataset hierarchy, but also arbitrary hierarchies due to clones. We want to @@ -399,13 +397,6 @@ iterate_children(libzfs_handle_t *hdl, zfs_graph_t *zgp, const char *dataset) for ((void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); ioctl(hdl->libzfs_fd, ZFS_IOC_DATASET_LIST_NEXT, &zc) == 0; (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name))) { - - /* - * Ignore private dataset names. - */ - if (dataset_name_hidden(zc.zc_name)) - continue; - /* * Get statistics for this dataset, to determine the type of the * dataset and clone statistics. If this fails, the dataset has diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 75ecc54e2..fd734d8b4 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -42,6 +42,7 @@ #include <sys/zfs_ioctl.h> #include <sys/zio.h> #include <strings.h> +#include <dlfcn.h> #include "zfs_namecheck.h" #include "zfs_prop.h" @@ -55,6 +56,10 @@ static int read_efi_label(nvlist_t *config, diskaddr_t *sb); #define BOOTCMD "installboot(1M)" #endif +#define DISK_ROOT "/dev/dsk" +#define RDISK_ROOT "/dev/rdsk" +#define BACKUP_SLICE "s2" + /* * ==================================================================== * zpool property functions @@ -628,6 +633,12 @@ zpool_expand_proplist(zpool_handle_t *zhp, zprop_list_t **plp) /* + * Don't start the slice at the default block of 34; many storage + * devices will use a stripe width of 128k, so start there instead. + */ +#define NEW_START_BLOCK 256 + +/* * Validate the given pool name, optionally putting an extended error message in * 'buf'. */ @@ -1369,46 +1380,90 @@ zpool_scrub(zpool_handle_t *zhp, pool_scrub_type_t type) } /* + * Find a vdev that matches the search criteria specified. We use the + * the nvpair name to determine how we should look for the device. * 'avail_spare' is set to TRUE if the provided guid refers to an AVAIL * spare; but FALSE if its an INUSE spare. */ static nvlist_t * -vdev_to_nvlist_iter(nvlist_t *nv, const char *search, uint64_t guid, - boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log) +vdev_to_nvlist_iter(nvlist_t *nv, nvlist_t *search, boolean_t *avail_spare, + boolean_t *l2cache, boolean_t *log) { uint_t c, children; nvlist_t **child; - uint64_t theguid, present; - char *path; - uint64_t wholedisk = 0; nvlist_t *ret; uint64_t is_log; + char *srchkey; + nvpair_t *pair = nvlist_next_nvpair(search, NULL); - verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &theguid) == 0); + /* Nothing to look for */ + if (search == NULL || pair == NULL) + return (NULL); + + /* Obtain the key we will use to search */ + srchkey = nvpair_name(pair); + + switch (nvpair_type(pair)) { + case DATA_TYPE_UINT64: { + uint64_t srchval, theguid, present; + + verify(nvpair_value_uint64(pair, &srchval) == 0); + if (strcmp(srchkey, ZPOOL_CONFIG_GUID) == 0) { + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, + &present) == 0) { + /* + * If the device has never been present since + * import, the only reliable way to match the + * vdev is by GUID. + */ + verify(nvlist_lookup_uint64(nv, + ZPOOL_CONFIG_GUID, &theguid) == 0); + if (theguid == srchval) + return (nv); + } + } + break; + } + + case DATA_TYPE_STRING: { + char *srchval, *val; + + verify(nvpair_value_string(pair, &srchval) == 0); + if (nvlist_lookup_string(nv, srchkey, &val) != 0) + break; - if (search == NULL && - nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &present) == 0) { /* - * If the device has never been present since import, the only - * reliable way to match the vdev is by GUID. + * Search for the requested value. We special case the search + * for ZPOOL_CONFIG_PATH when it's a wholedisk. Otherwise, + * all other searches are simple string compares. */ - if (theguid == guid) - return (nv); - } else if (search != NULL && - nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) { - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, - &wholedisk); - if (wholedisk) { - /* - * For whole disks, the internal path has 's0', but the - * path passed in by the user doesn't. - */ - if (strlen(search) == strlen(path) - 2 && - strncmp(search, path, strlen(search)) == 0) - return (nv); - } else if (strcmp(search, path) == 0) { - return (nv); + if (strcmp(srchkey, ZPOOL_CONFIG_PATH) == 0 && val) { + uint64_t wholedisk = 0; + + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, + &wholedisk); + if (wholedisk) { + /* + * For whole disks, the internal path has 's0', + * but the path passed in by the user doesn't. + */ + if (strlen(srchval) == strlen(val) - 2 && + strncmp(srchval, val, strlen(srchval)) == 0) + return (nv); + break; + } } + + /* + * Common case + */ + if (strcmp(srchval, val) == 0) + return (nv); + break; + } + + default: + break; } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, @@ -1416,7 +1471,7 @@ vdev_to_nvlist_iter(nvlist_t *nv, const char *search, uint64_t guid, return (NULL); for (c = 0; c < children; c++) { - if ((ret = vdev_to_nvlist_iter(child[c], search, guid, + if ((ret = vdev_to_nvlist_iter(child[c], search, avail_spare, l2cache, NULL)) != NULL) { /* * The 'is_log' value is only set for the toplevel @@ -1437,7 +1492,7 @@ vdev_to_nvlist_iter(nvlist_t *nv, const char *search, uint64_t guid, if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0) { for (c = 0; c < children; c++) { - if ((ret = vdev_to_nvlist_iter(child[c], search, guid, + if ((ret = vdev_to_nvlist_iter(child[c], search, avail_spare, l2cache, NULL)) != NULL) { *avail_spare = B_TRUE; return (ret); @@ -1448,7 +1503,7 @@ vdev_to_nvlist_iter(nvlist_t *nv, const char *search, uint64_t guid, if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) { for (c = 0; c < children; c++) { - if ((ret = vdev_to_nvlist_iter(child[c], search, guid, + if ((ret = vdev_to_nvlist_iter(child[c], search, avail_spare, l2cache, NULL)) != NULL) { *l2cache = B_TRUE; return (ret); @@ -1459,24 +1514,48 @@ vdev_to_nvlist_iter(nvlist_t *nv, const char *search, uint64_t guid, return (NULL); } +/* + * Given a physical path (minus the "/devices" prefix), find the + * associated vdev. + */ +nvlist_t * +zpool_find_vdev_by_physpath(zpool_handle_t *zhp, const char *ppath, + boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log) +{ + nvlist_t *search, *nvroot, *ret; + + verify(nvlist_alloc(&search, NV_UNIQUE_NAME, KM_SLEEP) == 0); + verify(nvlist_add_string(search, ZPOOL_CONFIG_PHYS_PATH, ppath) == 0); + + verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + + *avail_spare = B_FALSE; + ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log); + nvlist_free(search); + + return (ret); +} + nvlist_t * zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log) { char buf[MAXPATHLEN]; - const char *search; char *end; - nvlist_t *nvroot; + nvlist_t *nvroot, *search, *ret; uint64_t guid; + verify(nvlist_alloc(&search, NV_UNIQUE_NAME, KM_SLEEP) == 0); + guid = strtoull(path, &end, 10); if (guid != 0 && *end == '\0') { - search = NULL; + verify(nvlist_add_uint64(search, ZPOOL_CONFIG_GUID, guid) == 0); } else if (path[0] != '/') { (void) snprintf(buf, sizeof (buf), "%s%s", "/dev/dsk/", path); - search = buf; + verify(nvlist_add_string(search, ZPOOL_CONFIG_PATH, buf) == 0); } else { - search = path; + verify(nvlist_add_string(search, ZPOOL_CONFIG_PATH, path) == 0); } verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE, @@ -1486,8 +1565,10 @@ zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare, *l2cache = B_FALSE; if (log != NULL) *log = B_FALSE; - return (vdev_to_nvlist_iter(nvroot, search, guid, avail_spare, - l2cache, log)); + ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log); + nvlist_free(search); + + return (ret); } static int @@ -1504,81 +1585,142 @@ vdev_online(nvlist_t *nv) } /* - * Get phys_path for a root pool - * Return 0 on success; non-zeron on failure. + * Helper function for zpool_get_physpaths(). */ -int -zpool_get_physpath(zpool_handle_t *zhp, char *physpath) +static int +vdev_get_one_physpath(nvlist_t *config, char *physpath, size_t physpath_size, + size_t *bytes_written) { + size_t bytes_left, pos, rsz; + char *tmppath; + const char *format; + + if (nvlist_lookup_string(config, ZPOOL_CONFIG_PHYS_PATH, + &tmppath) != 0) + return (EZFS_NODEVICE); + + pos = *bytes_written; + bytes_left = physpath_size - pos; + format = (pos == 0) ? "%s" : " %s"; + + rsz = snprintf(physpath + pos, bytes_left, format, tmppath); + *bytes_written += rsz; + + if (rsz >= bytes_left) { + /* if physpath was not copied properly, clear it */ + if (bytes_left != 0) { + physpath[pos] = 0; + } + return (EZFS_NOSPC); + } + return (0); +} + +static int +vdev_get_physpaths(nvlist_t *nv, char *physpath, size_t phypath_size, + size_t *rsz, boolean_t is_spare) +{ + char *type; + int ret; + + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) + return (EZFS_INVALCONFIG); + + if (strcmp(type, VDEV_TYPE_DISK) == 0) { + /* + * An active spare device has ZPOOL_CONFIG_IS_SPARE set. + * For a spare vdev, we only want to boot from the active + * spare device. + */ + if (is_spare) { + uint64_t spare = 0; + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, + &spare); + if (!spare) + return (EZFS_INVALCONFIG); + } + + if (vdev_online(nv)) { + if ((ret = vdev_get_one_physpath(nv, physpath, + phypath_size, rsz)) != 0) + return (ret); + } + } else if (strcmp(type, VDEV_TYPE_MIRROR) == 0 || + strcmp(type, VDEV_TYPE_REPLACING) == 0 || + (is_spare = (strcmp(type, VDEV_TYPE_SPARE) == 0))) { + nvlist_t **child; + uint_t count; + int i, ret; + + if (nvlist_lookup_nvlist_array(nv, + ZPOOL_CONFIG_CHILDREN, &child, &count) != 0) + return (EZFS_INVALCONFIG); + + for (i = 0; i < count; i++) { + ret = vdev_get_physpaths(child[i], physpath, + phypath_size, rsz, is_spare); + if (ret == EZFS_NOSPC) + return (ret); + } + } + + return (EZFS_POOL_INVALARG); +} + +/* + * Get phys_path for a root pool config. + * Return 0 on success; non-zero on failure. + */ +static int +zpool_get_config_physpath(nvlist_t *config, char *physpath, size_t phypath_size) +{ + size_t rsz; nvlist_t *vdev_root; nvlist_t **child; uint_t count; - int i; + char *type; - /* - * Make sure this is a root pool, as phys_path doesn't mean - * anything to a non-root pool. - */ - if (!pool_is_bootable(zhp)) - return (-1); + rsz = 0; - verify(nvlist_lookup_nvlist(zhp->zpool_config, - ZPOOL_CONFIG_VDEV_TREE, &vdev_root) == 0); + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &vdev_root) != 0) + return (EZFS_INVALCONFIG); - if (nvlist_lookup_nvlist_array(vdev_root, ZPOOL_CONFIG_CHILDREN, + if (nvlist_lookup_string(vdev_root, ZPOOL_CONFIG_TYPE, &type) != 0 || + nvlist_lookup_nvlist_array(vdev_root, ZPOOL_CONFIG_CHILDREN, &child, &count) != 0) - return (-2); + return (EZFS_INVALCONFIG); - for (i = 0; i < count; i++) { - nvlist_t **child2; - uint_t count2; - char *type; - char *tmppath; - int j; + /* + * root pool can not have EFI labeled disks and can only have + * a single top-level vdev. + */ + if (strcmp(type, VDEV_TYPE_ROOT) != 0 || count != 1 || + pool_uses_efi(vdev_root)) + return (EZFS_POOL_INVALARG); - if (nvlist_lookup_string(child[i], ZPOOL_CONFIG_TYPE, &type) - != 0) - return (-3); - - if (strcmp(type, VDEV_TYPE_DISK) == 0) { - if (!vdev_online(child[i])) - return (-8); - verify(nvlist_lookup_string(child[i], - ZPOOL_CONFIG_PHYS_PATH, &tmppath) == 0); - (void) strncpy(physpath, tmppath, strlen(tmppath)); - } else if (strcmp(type, VDEV_TYPE_MIRROR) == 0) { - if (nvlist_lookup_nvlist_array(child[i], - ZPOOL_CONFIG_CHILDREN, &child2, &count2) != 0) - return (-4); - - for (j = 0; j < count2; j++) { - if (!vdev_online(child2[j])) - return (-8); - if (nvlist_lookup_string(child2[j], - ZPOOL_CONFIG_PHYS_PATH, &tmppath) != 0) - return (-5); - - if ((strlen(physpath) + strlen(tmppath)) > - MAXNAMELEN) - return (-6); - - if (strlen(physpath) == 0) { - (void) strncpy(physpath, tmppath, - strlen(tmppath)); - } else { - (void) strcat(physpath, " "); - (void) strcat(physpath, tmppath); - } - } - } else { - return (-7); - } - } + (void) vdev_get_physpaths(child[0], physpath, phypath_size, &rsz, + B_FALSE); + + /* No online devices */ + if (rsz == 0) + return (EZFS_NODEVICE); return (0); } /* + * Get phys_path for a root pool + * Return 0 on success; non-zero on failure. + */ +int +zpool_get_physpath(zpool_handle_t *zhp, char *physpath, size_t phypath_size) +{ + return (zpool_get_config_physpath(zhp->zpool_config, physpath, + phypath_size)); +} + +/* * Returns TRUE if the given guid corresponds to the given type. * This is used to check for hot spares (INUSE or not), and level 2 cache * devices. @@ -1607,6 +1749,45 @@ is_guid_type(zpool_handle_t *zhp, uint64_t guid, const char *type) } /* + * If the device has being dynamically expanded then we need to relabel + * the disk to use the new unallocated space. + */ +static int +zpool_relabel_disk(libzfs_handle_t *hdl, const char *name) +{ + char path[MAXPATHLEN]; + char errbuf[1024]; + int fd, error; + int (*_efi_use_whole_disk)(int); + + if ((_efi_use_whole_disk = (int (*)(int))dlsym(RTLD_DEFAULT, + "efi_use_whole_disk")) == NULL) + return (-1); + + (void) snprintf(path, sizeof (path), "%s/%s", RDISK_ROOT, name); + + if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " + "relabel '%s': unable to open device"), name); + return (zfs_error(hdl, EZFS_OPENFAILED, errbuf)); + } + + /* + * It's possible that we might encounter an error if the device + * does not have any unallocated space left. If so, we simply + * ignore that error and continue on. + */ + error = _efi_use_whole_disk(fd); + (void) close(fd); + if (error && error != VT_ENOSPC) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " + "relabel '%s': unable to read disk capacity"), name); + return (zfs_error(hdl, EZFS_NOCAP, errbuf)); + } + return (0); +} + +/* * Bring the specified vdev online. The 'flags' parameter is a set of the * ZFS_ONLINE_* flags. */ @@ -1617,15 +1798,20 @@ zpool_vdev_online(zpool_handle_t *zhp, const char *path, int flags, zfs_cmd_t zc = { 0 }; char msg[1024]; nvlist_t *tgt; - boolean_t avail_spare, l2cache; + boolean_t avail_spare, l2cache, islog; libzfs_handle_t *hdl = zhp->zpool_hdl; - (void) snprintf(msg, sizeof (msg), - dgettext(TEXT_DOMAIN, "cannot online %s"), path); + if (flags & ZFS_ONLINE_EXPAND) { + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "cannot expand %s"), path); + } else { + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "cannot online %s"), path); + } (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, - NULL)) == NULL) + &islog)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, msg)); verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0); @@ -1634,6 +1820,31 @@ zpool_vdev_online(zpool_handle_t *zhp, const char *path, int flags, is_guid_type(zhp, zc.zc_guid, ZPOOL_CONFIG_SPARES) == B_TRUE) return (zfs_error(hdl, EZFS_ISSPARE, msg)); + if (flags & ZFS_ONLINE_EXPAND || + zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) { + char *pathname = NULL; + uint64_t wholedisk = 0; + + (void) nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK, + &wholedisk); + verify(nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH, + &pathname) == 0); + + /* + * XXX - L2ARC 1.0 devices can't support expansion. + */ + if (l2cache) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "cannot expand cache devices")); + return (zfs_error(hdl, EZFS_VDEVNOTSUP, msg)); + } + + if (wholedisk) { + pathname += strlen(DISK_ROOT) + 1; + (void) zpool_relabel_disk(zhp->zpool_hdl, pathname); + } + } + zc.zc_cookie = VDEV_STATE_ONLINE; zc.zc_obj = flags; @@ -1684,6 +1895,12 @@ zpool_vdev_offline(zpool_handle_t *zhp, const char *path, boolean_t istmp) */ return (zfs_error(hdl, EZFS_NOREPLICAS, msg)); + case EEXIST: + /* + * The log device has unplayed logs + */ + return (zfs_error(hdl, EZFS_UNPLAYED_LOGS, msg)); + default: return (zpool_standard_error(hdl, errno, msg)); } @@ -1888,6 +2105,14 @@ zpool_vdev_attach(zpool_handle_t *zhp, (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Please " "be sure to invoke %s to make '%s' bootable.\n"), BOOTCMD, new_disk); + + /* + * XXX need a better way to prevent user from + * booting up a half-baked vdev. + */ + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Make " + "sure to wait until resilver is done " + "before rebooting.\n")); } return (0); } @@ -2803,14 +3028,6 @@ zpool_obj_to_path(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj, free(mntpnt); } -#define RDISK_ROOT "/dev/rdsk" -#define BACKUP_SLICE "s2" -/* - * Don't start the slice at the default block of 34; many storage - * devices will use a stripe width of 128k, so start there instead. - */ -#define NEW_START_BLOCK 256 - /* * Read the EFI label from the config, if a label does not exist then * pass back the error to the caller. If the caller has passed a non-NULL diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index 5a2e2aeb6..612a09914 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -237,6 +237,8 @@ send_iterate_prop(zfs_handle_t *zhp, nvlist_t *nv) zfs_prop_t prop = zfs_name_to_prop(propname); nvlist_t *propnv; + assert(zfs_prop_user(propname) || prop != ZPROP_INVAL); + if (!zfs_prop_user(propname) && zfs_prop_readonly(prop)) continue; @@ -594,12 +596,18 @@ dump_filesystem(zfs_handle_t *zhp, void *arg) zhp->zfs_name, sdd->fromsnap); sdd->err = B_TRUE; } else if (!sdd->seento) { - (void) fprintf(stderr, - "WARNING: could not send %s@%s:\n" - "incremental source (%s@%s) " - "is not earlier than it\n", - zhp->zfs_name, sdd->tosnap, - zhp->zfs_name, sdd->fromsnap); + if (sdd->fromsnap) { + (void) fprintf(stderr, + "WARNING: could not send %s@%s:\n" + "incremental source (%s@%s) " + "is not earlier than it\n", + zhp->zfs_name, sdd->tosnap, + zhp->zfs_name, sdd->fromsnap); + } else { + (void) fprintf(stderr, "WARNING: " + "could not send %s@%s: does not exist\n", + zhp->zfs_name, sdd->tosnap); + } sdd->err = B_TRUE; } } else { diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index 259f5fd49..30829d50d 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -210,6 +210,9 @@ libzfs_error_description(libzfs_handle_t *hdl) case EZFS_ACTIVE_SPARE: return (dgettext(TEXT_DOMAIN, "pool has active shared spare " "device")); + case EZFS_UNPLAYED_LOGS: + return (dgettext(TEXT_DOMAIN, "log device has unplayed intent " + "logs")); case EZFS_UNKNOWN: return (dgettext(TEXT_DOMAIN, "unknown error")); default: @@ -364,6 +367,11 @@ zfs_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) case ENOTSUP: zfs_verror(hdl, EZFS_BADVERSION, fmt, ap); break; + case EAGAIN: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool I/O is currently suspended")); + zfs_verror(hdl, EZFS_POOLUNAVAIL, fmt, ap); + break; default: zfs_error_aux(hdl, strerror(errno)); zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap); @@ -437,6 +445,11 @@ zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) case EDQUOT: zfs_verror(hdl, EZFS_NOSPC, fmt, ap); return (-1); + case EAGAIN: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool I/O is currently suspended")); + zfs_verror(hdl, EZFS_POOLUNAVAIL, fmt, ap); + break; default: zfs_error_aux(hdl, strerror(error)); @@ -575,6 +588,7 @@ libzfs_init(void) zfs_prop_init(); zpool_prop_init(); + libzfs_mnttab_init(hdl); return (hdl); } @@ -592,6 +606,7 @@ libzfs_fini(libzfs_handle_t *hdl) (void) free(hdl->libzfs_log_str); zpool_free_handles(hdl); namespace_clear(hdl); + libzfs_mnttab_fini(hdl); free(hdl); } @@ -1209,7 +1224,7 @@ addlist(libzfs_handle_t *hdl, char *propname, zprop_list_t **listp, * dataset property, */ if (prop == ZPROP_INVAL && (type == ZFS_TYPE_POOL || - !zfs_prop_user(propname))) { + (!zfs_prop_user(propname) && !zfs_prop_userquota(propname)))) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid property '%s'"), propname); return (zfs_error(hdl, EZFS_BADPROP, diff --git a/lib/libzpool/include/sys/zfs_context.h b/lib/libzpool/include/sys/zfs_context.h index a50e4b0ac..230c233a2 100644 --- a/lib/libzpool/include/sys/zfs_context.h +++ b/lib/libzpool/include/sys/zfs_context.h @@ -59,6 +59,7 @@ extern "C" { #include <atomic.h> #include <dirent.h> #include <time.h> +#include <libsysevent.h> #include <sys/note.h> #include <sys/types.h> #include <sys/cred.h> @@ -73,6 +74,7 @@ extern "C" { #include <sys/kstat.h> #include <sys/u8_textprep.h> #include <sys/sysevent/eventdefs.h> +#include <sys/sysevent/dev.h> /* * Debugging @@ -316,6 +318,7 @@ typedef void (task_func_t)(void *); #define TASKQ_PREPOPULATE 0x0001 #define TASKQ_CPR_SAFE 0x0002 /* Use CPR safe protocol */ #define TASKQ_DYNAMIC 0x0004 /* Use dynamic thread scheduling */ +#define TASKQ_THREADS_CPU_PCT 0x0008 /* Use dynamic thread scheduling */ #define TQ_SLEEP KM_SLEEP /* Can block for memory */ #define TQ_NOSLEEP KM_NOSLEEP /* cannot block for memory; may fail */ @@ -540,6 +543,10 @@ typedef struct ksiddomain { ksiddomain_t *ksid_lookupdomain(const char *); void ksiddomain_rele(ksiddomain_t *); +#define DDI_SLEEP KM_SLEEP +#define ddi_log_sysevent(_a, _b, _c, _d, _e, _f, _g) \ + sysevent_post_event(_c, _d, _b, "libzpool", _e, _f) + #ifdef __cplusplus } #endif diff --git a/lib/libzpool/taskq.c b/lib/libzpool/taskq.c index 93acdcf8e..1a73fe83c 100644 --- a/lib/libzpool/taskq.c +++ b/lib/libzpool/taskq.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -174,6 +174,19 @@ taskq_create(const char *name, int nthreads, pri_t pri, taskq_t *tq = kmem_zalloc(sizeof (taskq_t), KM_SLEEP); int t; + if (flags & TASKQ_THREADS_CPU_PCT) { + int pct; + ASSERT3S(nthreads, >=, 0); + ASSERT3S(nthreads, <=, 100); + pct = MIN(nthreads, 100); + pct = MAX(pct, 0); + + nthreads = (sysconf(_SC_NPROCESSORS_ONLN) * pct) / 100; + nthreads = MAX(nthreads, 1); /* need at least 1 thread */ + } else { + ASSERT3S(nthreads, >=, 1); + } + rw_init(&tq->tq_threadlock, NULL, RW_DEFAULT, NULL); mutex_init(&tq->tq_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&tq->tq_dispatch_cv, NULL, CV_DEFAULT, NULL); diff --git a/module/zcommon/include/sys/fm/fs/zfs.h b/module/zcommon/include/sys/fm/fs/zfs.h index 66ca9c5d7..21b7dbe52 100644 --- a/module/zcommon/include/sys/fm/fs/zfs.h +++ b/module/zcommon/include/sys/fm/fs/zfs.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_FM_FS_ZFS_H #define _SYS_FM_FS_ZFS_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -57,6 +55,7 @@ extern "C" { #define FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE "vdev_type" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH "vdev_path" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID "vdev_devid" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU "vdev_fru" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH "parent_path" diff --git a/module/zcommon/include/sys/fs/zfs.h b/module/zcommon/include/sys/fs/zfs.h index 95f04d842..6651b140c 100644 --- a/module/zcommon/include/sys/fs/zfs.h +++ b/module/zcommon/include/sys/fs/zfs.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -48,6 +48,10 @@ typedef enum { #define ZFS_TYPE_DATASET \ (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME | ZFS_TYPE_SNAPSHOT) +#define ZAP_MAXNAMELEN 256 +#define ZAP_MAXVALUELEN (1024 * 8) +#define ZAP_OLDMAXVALUELEN 1024 + /* * Dataset properties are identified by these constants and must be added to * the end of this list to ensure that external consumers are not affected @@ -105,9 +109,21 @@ typedef enum { ZFS_PROP_USEDDS, ZFS_PROP_USEDCHILD, ZFS_PROP_USEDREFRESERV, + ZFS_PROP_USERACCOUNTING, /* not exposed to the user */ + ZFS_PROP_STMF_SHAREINFO, /* not exposed to the user */ ZFS_NUM_PROPS } zfs_prop_t; +typedef enum { + ZFS_PROP_USERUSED, + ZFS_PROP_USERQUOTA, + ZFS_PROP_GROUPUSED, + ZFS_PROP_GROUPQUOTA, + ZFS_NUM_USERQUOTA_PROPS +} zfs_userquota_prop_t; + +extern const char *zfs_userquota_prop_prefixes[ZFS_NUM_USERQUOTA_PROPS]; + /* * Pool properties are identified by these constants and must be added to the * end of this list to ensure that external consumers are not affected @@ -130,6 +146,7 @@ typedef enum { ZPOOL_PROP_CACHEFILE, ZPOOL_PROP_FAILUREMODE, ZPOOL_PROP_LISTSNAPS, + ZPOOL_PROP_AUTOEXPAND, ZPOOL_NUM_PROPS } zpool_prop_t; @@ -169,6 +186,7 @@ boolean_t zfs_prop_setonce(zfs_prop_t); const char *zfs_prop_to_name(zfs_prop_t); zfs_prop_t zfs_name_to_prop(const char *); boolean_t zfs_prop_user(const char *); +boolean_t zfs_prop_userquota(const char *name); int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **); int zfs_prop_string_to_index(zfs_prop_t, const char *, uint64_t *); boolean_t zfs_prop_valid_for_type(int, zfs_type_t); @@ -213,6 +231,9 @@ typedef enum { #define ZFS_DELEG_PERM_GID "gid" #define ZFS_DELEG_PERM_GROUPS "groups" +#define ZFS_SMB_ACL_SRC "src" +#define ZFS_SMB_ACL_TARGET "target" + typedef enum { ZFS_CANMOUNT_OFF = 0, ZFS_CANMOUNT_ON = 1, @@ -226,6 +247,13 @@ typedef enum zfs_share_op { ZFS_UNSHARE_SMB = 3 } zfs_share_op_t; +typedef enum zfs_smb_acl_op { + ZFS_SMB_ACL_ADD, + ZFS_SMB_ACL_REMOVE, + ZFS_SMB_ACL_RENAME, + ZFS_SMB_ACL_PURGE +} zfs_smb_acl_op_t; + typedef enum zfs_cache_type { ZFS_CACHE_NONE = 0, ZFS_CACHE_METADATA = 1, @@ -250,13 +278,16 @@ typedef enum zfs_cache_type { #define SPA_VERSION_12 12ULL #define SPA_VERSION_13 13ULL #define SPA_VERSION_14 14ULL +#define SPA_VERSION_15 15ULL +#define SPA_VERSION_16 16ULL /* * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk - * format change. Go to usr/src/grub/grub-0.95/stage2/{zfs-include/, fsys_zfs*}, - * and do the appropriate changes. + * format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*}, + * and do the appropriate changes. Also bump the version number in + * usr/src/grub/capability. */ -#define SPA_VERSION SPA_VERSION_14 -#define SPA_VERSION_STRING "14" +#define SPA_VERSION SPA_VERSION_16 +#define SPA_VERSION_STRING "16" /* * Symbolic names for the changes that caused a SPA_VERSION switch. @@ -292,6 +323,8 @@ typedef enum zfs_cache_type { #define SPA_VERSION_SNAP_PROPS SPA_VERSION_12 #define SPA_VERSION_USED_BREAKDOWN SPA_VERSION_13 #define SPA_VERSION_PASSTHROUGH_X SPA_VERSION_14 +#define SPA_VERSION_USERSPACE SPA_VERSION_15 +#define SPA_VERSION_STMF_PROP SPA_VERSION_16 /* * ZPL version - rev'd whenever an incompatible on-disk format change @@ -299,19 +332,21 @@ typedef enum zfs_cache_type { * also update the version_table[] and help message in zfs_prop.c. * * When changing, be sure to teach GRUB how to read the new format! - * See usr/src/grub/grub-0.95/stage2/{zfs-include/,fsys_zfs*} + * See usr/src/grub/grub-0.97/stage2/{zfs-include/,fsys_zfs*} */ #define ZPL_VERSION_1 1ULL #define ZPL_VERSION_2 2ULL #define ZPL_VERSION_3 3ULL -#define ZPL_VERSION ZPL_VERSION_3 -#define ZPL_VERSION_STRING "3" +#define ZPL_VERSION_4 4ULL +#define ZPL_VERSION ZPL_VERSION_4 +#define ZPL_VERSION_STRING "4" #define ZPL_VERSION_INITIAL ZPL_VERSION_1 #define ZPL_VERSION_DIRENT_TYPE ZPL_VERSION_2 #define ZPL_VERSION_FUID ZPL_VERSION_3 #define ZPL_VERSION_NORMALIZATION ZPL_VERSION_3 #define ZPL_VERSION_SYSATTR ZPL_VERSION_3 +#define ZPL_VERSION_USERSPACE ZPL_VERSION_4 /* * The following are configuration names used in the nvlist describing a pool's @@ -361,6 +396,7 @@ typedef enum zfs_cache_type { #define ZPOOL_CONFIG_FAULTED "faulted" #define ZPOOL_CONFIG_DEGRADED "degraded" #define ZPOOL_CONFIG_REMOVED "removed" +#define ZPOOL_CONFIG_FRU "fru" #define VDEV_TYPE_ROOT "root" #define VDEV_TYPE_MIRROR "mirror" @@ -503,7 +539,7 @@ typedef struct vdev_stat { /* * And here are the things we need with /dev, etc. in front of them. */ -#define ZVOL_PSEUDO_DEV "/devices/pseudo/zvol@0:" +#define ZVOL_PSEUDO_DEV "/devices/pseudo/zfs@0:" #define ZVOL_FULL_DEV_DIR "/dev/" ZVOL_DEV_DIR "/" #define ZVOL_PROP_NAME "name" @@ -531,6 +567,7 @@ typedef enum zfs_ioc { ZFS_IOC_VDEV_ATTACH, ZFS_IOC_VDEV_DETACH, ZFS_IOC_VDEV_SETPATH, + ZFS_IOC_VDEV_SETFRU, ZFS_IOC_OBJSET_STATS, ZFS_IOC_OBJSET_ZPLPROPS, ZFS_IOC_DATASET_LIST_NEXT, @@ -560,7 +597,11 @@ typedef enum zfs_ioc { ZFS_IOC_GET_FSACL, ZFS_IOC_ISCSI_PERM_CHECK, ZFS_IOC_SHARE, - ZFS_IOC_INHERIT_PROP + ZFS_IOC_INHERIT_PROP, + ZFS_IOC_SMB_ACL, + ZFS_IOC_USERSPACE_ONE, + ZFS_IOC_USERSPACE_MANY, + ZFS_IOC_USERSPACE_UPGRADE } zfs_ioc_t; /* @@ -602,6 +643,7 @@ typedef enum { #define ZFS_ONLINE_CHECKREMOVE 0x1 #define ZFS_ONLINE_UNSPARE 0x2 #define ZFS_ONLINE_FORCEFAULT 0x4 +#define ZFS_ONLINE_EXPAND 0x8 #define ZFS_OFFLINE_TEMPORARY 0x1 /* diff --git a/module/zcommon/include/zfs_deleg.h b/module/zcommon/include/zfs_deleg.h index 561b73e63..cdbbd83de 100644 --- a/module/zcommon/include/zfs_deleg.h +++ b/module/zcommon/include/zfs_deleg.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _ZFS_DELEG_H #define _ZFS_DELEG_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/fs/zfs.h> #ifdef __cplusplus @@ -59,6 +57,10 @@ typedef enum { ZFS_DELEG_NOTE_USERPROP, ZFS_DELEG_NOTE_MOUNT, ZFS_DELEG_NOTE_SHARE, + ZFS_DELEG_NOTE_USERQUOTA, + ZFS_DELEG_NOTE_GROUPQUOTA, + ZFS_DELEG_NOTE_USERUSED, + ZFS_DELEG_NOTE_GROUPUSED, ZFS_DELEG_NOTE_NONE } zfs_deleg_note_t; diff --git a/module/zcommon/include/zfs_namecheck.h b/module/zcommon/include/zfs_namecheck.h index ec85e62f7..7711da099 100644 --- a/module/zcommon/include/zfs_namecheck.h +++ b/module/zcommon/include/zfs_namecheck.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _ZFS_NAMECHECK_H #define _ZFS_NAMECHECK_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -50,7 +48,6 @@ typedef enum { int pool_namecheck(const char *, namecheck_err_t *, char *); int dataset_namecheck(const char *, namecheck_err_t *, char *); int mountpoint_namecheck(const char *, namecheck_err_t *); -int dataset_name_hidden(const char *); int snapshot_namecheck(const char *, namecheck_err_t *, char *); int permset_namecheck(const char *, namecheck_err_t *, char *); diff --git a/module/zcommon/zfs_deleg.c b/module/zcommon/zfs_deleg.c index 0fd5800a8..2964cae5d 100644 --- a/module/zcommon/zfs_deleg.c +++ b/module/zcommon/zfs_deleg.c @@ -19,13 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ - -#pragma ident "%Z%%M% %I% %E% SMI" - #if defined(_KERNEL) #include <sys/systm.h> #include <sys/sunddi.h> @@ -66,6 +63,10 @@ zfs_deleg_perm_tab_t zfs_deleg_perm_tab[] = { {ZFS_DELEG_PERM_SHARE, ZFS_DELEG_NOTE_SHARE }, {ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_NONE }, {ZFS_DELEG_PERM_USERPROP, ZFS_DELEG_NOTE_USERPROP }, + {ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_NOTE_USERQUOTA }, + {ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_NOTE_GROUPQUOTA }, + {ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_NOTE_USERUSED }, + {ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_NOTE_GROUPUSED }, {NULL, ZFS_DELEG_NOTE_NONE } }; diff --git a/module/zcommon/zfs_namecheck.c b/module/zcommon/zfs_namecheck.c index a9d109be2..45730c6fc 100644 --- a/module/zcommon/zfs_namecheck.c +++ b/module/zcommon/zfs_namecheck.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Common name validation routines for ZFS. These routines are shared by the * userland code as well as the ioctl() layer to ensure that we don't @@ -345,19 +343,3 @@ pool_namecheck(const char *pool, namecheck_err_t *why, char *what) return (0); } - -/* - * Check if the dataset name is private for internal usage. - * '$' is reserved for internal dataset names. e.g. "$MOS" - * - * Return 1 if the given name is used internally. - * Return 0 if it is not. - */ -int -dataset_name_hidden(const char *name) -{ - if (strchr(name, '$') != NULL) - return (1); - - return (0); -} diff --git a/module/zcommon/zfs_prop.c b/module/zcommon/zfs_prop.c index effd2dba7..05d830633 100644 --- a/module/zcommon/zfs_prop.c +++ b/module/zcommon/zfs_prop.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -43,6 +43,14 @@ static zprop_desc_t zfs_prop_table[ZFS_NUM_PROPS]; +/* Note this is indexed by zfs_userquota_prop_t, keep the order the same */ +const char *zfs_userquota_prop_prefixes[] = { + "userused@", + "userquota@", + "groupused@", + "groupquota@" +}; + zprop_desc_t * zfs_prop_get_table(void) { @@ -133,6 +141,7 @@ zfs_prop_init(void) { "1", 1 }, { "2", 2 }, { "3", 3 }, + { "4", 4 }, { "current", ZPL_VERSION }, { NULL } }; @@ -218,7 +227,7 @@ zfs_prop_init(void) /* default index properties */ register_index(ZFS_PROP_VERSION, "version", 0, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, - "1 | 2 | 3 | current", "VERSION", version_table); + "1 | 2 | 3 | 4 | current", "VERSION", version_table); register_index(ZFS_PROP_CANMOUNT, "canmount", ZFS_CANMOUNT_ON, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, "on | off | noauto", "CANMOUNT", canmount_table); @@ -305,8 +314,13 @@ zfs_prop_init(void) PROP_READONLY, ZFS_TYPE_DATASET, "NAME"); register_hidden(ZFS_PROP_ISCSIOPTIONS, "iscsioptions", PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME, "ISCSIOPTIONS"); + register_hidden(ZFS_PROP_STMF_SHAREINFO, "stmf_sbd_lu", + PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME, + "STMF_SBD_LU"); register_hidden(ZFS_PROP_GUID, "guid", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "GUID"); + register_hidden(ZFS_PROP_USERACCOUNTING, "useraccounting", + PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, NULL); /* oddball properties */ register_impl(ZFS_PROP_CREATION, "creation", PROP_TYPE_NUMBER, 0, NULL, @@ -330,7 +344,6 @@ zfs_name_to_prop(const char *propname) return (zprop_name_to_prop(propname, ZFS_TYPE_DATASET)); } - /* * For user property names, we allow all lowercase alphanumeric characters, plus * a few useful punctuation characters. @@ -368,6 +381,26 @@ zfs_prop_user(const char *name) } /* + * Returns true if this is a valid userspace-type property (one with a '@'). + * Note that after the @, any character is valid (eg, another @, for SID + * user@domain). + */ +boolean_t +zfs_prop_userquota(const char *name) +{ + zfs_userquota_prop_t prop; + + for (prop = 0; prop < ZFS_NUM_USERQUOTA_PROPS; prop++) { + if (strncmp(name, zfs_userquota_prop_prefixes[prop], + strlen(zfs_userquota_prop_prefixes[prop])) == 0) { + return (B_TRUE); + } + } + + return (B_FALSE); +} + +/* * Tables of index types, plus functions to convert between the user view * (strings) and internal representation (uint64_t). */ diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c index f5efe18d2..d57dcfb63 100644 --- a/module/zcommon/zpool_prop.c +++ b/module/zcommon/zpool_prop.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -96,6 +96,8 @@ zpool_prop_init(void) ZFS_TYPE_POOL, "on | off", "REPLACE", boolean_table); register_index(ZPOOL_PROP_LISTSNAPS, "listsnapshots", 0, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "LISTSNAPS", boolean_table); + register_index(ZPOOL_PROP_AUTOEXPAND, "autoexpand", 0, PROP_DEFAULT, + ZFS_TYPE_POOL, "on | off", "EXPAND", boolean_table); /* default index properties */ register_index(ZPOOL_PROP_FAILUREMODE, "failmode", diff --git a/module/zcommon/zprop_common.c b/module/zcommon/zprop_common.c index bd267e2e6..85f55c276 100644 --- a/module/zcommon/zprop_common.c +++ b/module/zcommon/zprop_common.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Common routines used by zfs and zpool property management. */ @@ -205,9 +203,6 @@ propname_match(const char *p, size_t len, zprop_desc_t *prop_entry) #ifndef _KERNEL const char *colname = prop_entry->pd_colname; int c; - - if (colname == NULL) - return (B_FALSE); #endif if (len == strlen(propname) && @@ -215,7 +210,7 @@ propname_match(const char *p, size_t len, zprop_desc_t *prop_entry) return (B_TRUE); #ifndef _KERNEL - if (len != strlen(colname)) + if (colname == NULL || len != strlen(colname)) return (B_FALSE); for (c = 0; c < len; c++) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 3a9598a92..d5e5aa544 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -124,6 +124,7 @@ #include <sys/arc.h> #include <sys/refcount.h> #include <sys/vdev.h> +#include <sys/vdev_impl.h> #ifdef _KERNEL #include <sys/vmsystm.h> #include <vm/anon.h> @@ -397,6 +398,7 @@ static arc_state_t *arc_l2c_only; static int arc_no_grow; /* Don't try to grow cache size */ static uint64_t arc_tempreserve; +static uint64_t arc_loaned_bytes; static uint64_t arc_meta_used; static uint64_t arc_meta_limit; static uint64_t arc_meta_max = 0; @@ -610,7 +612,7 @@ typedef struct l2arc_write_callback { struct l2arc_buf_hdr { /* protected by arc_buf_hdr mutex */ l2arc_dev_t *b_dev; /* L2ARC device */ - daddr_t b_daddr; /* disk address, offset byte */ + uint64_t b_daddr; /* disk address, offset byte */ }; typedef struct l2arc_data_free { @@ -1203,6 +1205,41 @@ arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) return (buf); } +static char *arc_onloan_tag = "onloan"; + +/* + * Loan out an anonymous arc buffer. Loaned buffers are not counted as in + * flight data by arc_tempreserve_space() until they are "returned". Loaned + * buffers must be returned to the arc before they can be used by the DMU or + * freed. + */ +arc_buf_t * +arc_loan_buf(spa_t *spa, int size) +{ + arc_buf_t *buf; + + buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); + + atomic_add_64(&arc_loaned_bytes, size); + return (buf); +} + +/* + * Return a loaned arc buffer to the arc. + */ +void +arc_return_buf(arc_buf_t *buf, void *tag) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + + ASSERT(hdr->b_state == arc_anon); + ASSERT(buf->b_data != NULL); + VERIFY(refcount_remove(&hdr->b_refcnt, arc_onloan_tag) == 0); + VERIFY(refcount_add(&hdr->b_refcnt, tag) == 1); + + atomic_add_64(&arc_loaned_bytes, -hdr->b_size); +} + static arc_buf_t * arc_buf_clone(arc_buf_t *from) { @@ -2504,7 +2541,6 @@ arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf, uint32_t *arc_flags, const zbookmark_t *zb) { int err; - arc_buf_hdr_t *hdr = pbuf->b_hdr; ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt)); ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size); @@ -2512,9 +2548,8 @@ arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf, err = arc_read_nolock(pio, spa, bp, done, private, priority, zio_flags, arc_flags, zb); - - ASSERT3P(hdr, ==, pbuf->b_hdr); rw_exit(&pbuf->b_lock); + return (err); } @@ -2604,7 +2639,7 @@ top: uint64_t size = BP_GET_LSIZE(bp); arc_callback_t *acb; vdev_t *vd = NULL; - daddr_t addr; + uint64_t addr; boolean_t devw = B_FALSE; if (hdr == NULL) { @@ -2923,6 +2958,7 @@ arc_release(arc_buf_t *buf, void *tag) kmutex_t *hash_lock; l2arc_buf_hdr_t *l2hdr; uint64_t buf_size; + boolean_t released = B_FALSE; rw_enter(&buf->b_lock, RW_WRITER); hdr = buf->b_hdr; @@ -2938,12 +2974,12 @@ arc_release(arc_buf_t *buf, void *tag) ASSERT(buf->b_efunc == NULL); arc_buf_thaw(buf); rw_exit(&buf->b_lock); - return; + released = B_TRUE; + } else { + hash_lock = HDR_LOCK(hdr); + mutex_enter(hash_lock); } - hash_lock = HDR_LOCK(hdr); - mutex_enter(hash_lock); - l2hdr = hdr->b_l2hdr; if (l2hdr) { mutex_enter(&l2arc_buflist_mtx); @@ -2951,6 +2987,9 @@ arc_release(arc_buf_t *buf, void *tag) buf_size = hdr->b_size; } + if (released) + goto out; + /* * Do we have more than one buf? */ @@ -3018,6 +3057,7 @@ arc_release(arc_buf_t *buf, void *tag) buf->b_efunc = NULL; buf->b_private = NULL; +out: if (l2hdr) { list_remove(l2hdr->b_dev->l2ad_buflist, hdr); kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); @@ -3311,10 +3351,9 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, } static int -arc_memory_throttle(uint64_t reserve, uint64_t txg) +arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg) { #ifdef _KERNEL - uint64_t inflight_data = arc_anon->arcs_size; uint64_t available_memory = ptob(freemem); static uint64_t page_load = 0; static uint64_t last_txg = 0; @@ -3376,6 +3415,7 @@ int arc_tempreserve_space(uint64_t reserve, uint64_t txg) { int error; + uint64_t anon_size; #ifdef ZFS_DEBUG /* @@ -3392,11 +3432,18 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg) return (ENOMEM); /* + * Don't count loaned bufs as in flight dirty data to prevent long + * network delays from blocking transactions that are ready to be + * assigned to a txg. + */ + anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0); + + /* * Writes will, almost always, require additional memory allocations * in order to compress/encrypt/etc the data. We therefor need to * make sure that there is sufficient available memory for this. */ - if (error = arc_memory_throttle(reserve, txg)) + if (error = arc_memory_throttle(reserve, anon_size, txg)) return (error); /* @@ -3406,8 +3453,9 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg) * Note: if two requests come in concurrently, we might let them * both succeed, when one of them should fail. Not a huge deal. */ - if (reserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 && - arc_anon->arcs_size > arc_c / 4) { + + if (reserve + arc_tempreserve + anon_size > arc_c / 2 && + anon_size > arc_c / 4) { dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", arc_tempreserve>>10, @@ -3592,6 +3640,8 @@ arc_fini(void) mutex_destroy(&zfs_write_limit_lock); buf_fini(); + + ASSERT(arc_loaned_bytes == 0); } /* @@ -4486,7 +4536,7 @@ l2arc_vdev_present(vdev_t *vd) * validated the vdev and opened it. */ void -l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end) +l2arc_add_vdev(spa_t *spa, vdev_t *vd) { l2arc_dev_t *adddev; @@ -4500,8 +4550,8 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end) adddev->l2ad_vdev = vd; adddev->l2ad_write = l2arc_write_max; adddev->l2ad_boost = l2arc_write_boost; - adddev->l2ad_start = start; - adddev->l2ad_end = end; + adddev->l2ad_start = VDEV_LABEL_START_SIZE; + adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); adddev->l2ad_hand = adddev->l2ad_start; adddev->l2ad_evict = adddev->l2ad_start; adddev->l2ad_first = B_TRUE; diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 113fa1f03..1b6f242dd 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -329,7 +329,7 @@ dbuf_verify(dmu_buf_impl_t *db) if (db->db_parent == dn->dn_dbuf) { /* db is pointed to by the dnode */ /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ - if (db->db.db_object == DMU_META_DNODE_OBJECT) + if (DMU_OBJECT_IS_SPECIAL(db->db.db_object)) ASSERT(db->db_parent == NULL); else ASSERT(db->db_parent != NULL); @@ -465,15 +465,15 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) ASSERT(db->db_buf == NULL); if (db->db_blkid == DB_BONUS_BLKID) { - int bonuslen = dn->dn_bonuslen; + int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); ASSERT3U(bonuslen, <=, db->db.db_size); db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); if (bonuslen < DN_MAX_BONUSLEN) bzero(db->db.db_data, DN_MAX_BONUSLEN); - bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, - bonuslen); + if (bonuslen) + bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); dbuf_update_data(db); db->db_state = DB_CACHED; mutex_exit(&db->db_mtx); @@ -908,15 +908,11 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) * Shouldn't dirty a regular buffer in syncing context. Private * objects may be dirtied in syncing context, but only if they * were already pre-dirtied in open context. - * XXX We may want to prohibit dirtying in syncing context even - * if they did pre-dirty. */ ASSERT(!dmu_tx_is_syncing(tx) || BP_IS_HOLE(dn->dn_objset->os_rootbp) || - dn->dn_object == DMU_META_DNODE_OBJECT || - dn->dn_objset->os_dsl_dataset == NULL || - dsl_dir_is_private(dn->dn_objset->os_dsl_dataset->ds_dir)); - + DMU_OBJECT_IS_SPECIAL(dn->dn_object) || + dn->dn_objset->os_dsl_dataset == NULL); /* * We make this assert for private objects as well, but after we * check if we're already dirty. They are allowed to re-dirty @@ -975,7 +971,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) /* * Only valid if not already dirty. */ - ASSERT(dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == + ASSERT(dn->dn_object == 0 || + dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); ASSERT3U(dn->dn_nlevels, >, db->db_level); @@ -987,15 +984,13 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) /* * We should only be dirtying in syncing context if it's the - * mos, a spa os, or we're initializing the os. However, we are - * allowed to dirty in syncing context provided we already - * dirtied it in open context. Hence we must make this - * assertion only if we're not already dirty. + * mos or we're initializing the os or it's a special object. + * However, we are allowed to dirty in syncing context provided + * we already dirtied it in open context. Hence we must make + * this assertion only if we're not already dirty. */ - ASSERT(!dmu_tx_is_syncing(tx) || - os->os_dsl_dataset == NULL || - !dsl_dir_is_private(os->os_dsl_dataset->ds_dir) || - !BP_IS_HOLE(os->os_rootbp)); + ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || + os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); ASSERT(db->db.db_size != 0); dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); @@ -1312,6 +1307,68 @@ dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) } /* + * Directly assign a provided arc buf to a given dbuf if it's not referenced + * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf. + */ +void +dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) +{ + ASSERT(!refcount_is_zero(&db->db_holds)); + ASSERT(db->db_dnode->dn_object != DMU_META_DNODE_OBJECT); + ASSERT(db->db_blkid != DB_BONUS_BLKID); + ASSERT(db->db_level == 0); + ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA); + ASSERT(buf != NULL); + ASSERT(arc_buf_size(buf) == db->db.db_size); + ASSERT(tx->tx_txg != 0); + + arc_return_buf(buf, db); + ASSERT(arc_released(buf)); + + mutex_enter(&db->db_mtx); + + while (db->db_state == DB_READ || db->db_state == DB_FILL) + cv_wait(&db->db_changed, &db->db_mtx); + + ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED); + + if (db->db_state == DB_CACHED && + refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { + mutex_exit(&db->db_mtx); + (void) dbuf_dirty(db, tx); + bcopy(buf->b_data, db->db.db_data, db->db.db_size); + VERIFY(arc_buf_remove_ref(buf, db) == 1); + return; + } + + if (db->db_state == DB_CACHED) { + dbuf_dirty_record_t *dr = db->db_last_dirty; + + ASSERT(db->db_buf != NULL); + if (dr != NULL && dr->dr_txg == tx->tx_txg) { + ASSERT(dr->dt.dl.dr_data == db->db_buf); + if (!arc_released(db->db_buf)) { + ASSERT(dr->dt.dl.dr_override_state == + DR_OVERRIDDEN); + arc_release(db->db_buf, db); + } + dr->dt.dl.dr_data = buf; + VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1); + } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { + arc_release(db->db_buf, db); + VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1); + } + db->db_buf = NULL; + } + ASSERT(db->db_buf == NULL); + dbuf_set_data(db, buf); + db->db_state = DB_FILL; + mutex_exit(&db->db_mtx); + (void) dbuf_dirty(db, tx); + dbuf_fill_done(db, tx); +} + +/* * "Clear" the contents of this dbuf. This will mark the dbuf * EVICTING and clear *most* of its references. Unfortunetely, * when we are not holding the dn_dbufs_mtx, we can't clear the @@ -1855,6 +1912,19 @@ dmu_buf_get_user(dmu_buf_t *db_fake) return (db->db_user_ptr); } +boolean_t +dmu_buf_freeable(dmu_buf_t *dbuf) +{ + boolean_t res = B_FALSE; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; + + if (db->db_blkptr) + res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset, + db->db_blkptr->blk_birth); + + return (res); +} + static void dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) { @@ -1943,7 +2013,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) dnode_t *dn = db->db_dnode; objset_impl_t *os = dn->dn_objset; uint64_t txg = tx->tx_txg; - int blksz; ASSERT(dmu_tx_is_syncing(tx)); @@ -2049,32 +2118,25 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) return; } - if (db->db_state != DB_NOFILL) { - blksz = arc_buf_size(*datap); - - if (dn->dn_object != DMU_META_DNODE_OBJECT) { - /* - * If this buffer is currently "in use" (i.e., there - * are active holds and db_data still references it), - * then make a copy before we start the write so that - * any modifications from the open txg will not leak - * into this write. - * - * NOTE: this copy does not need to be made for - * objects only modified in the syncing context (e.g. - * DNONE_DNODE blocks). - */ - if (refcount_count(&db->db_holds) > 1 && - *datap == db->db_buf) { - arc_buf_contents_t type = - DBUF_GET_BUFC_TYPE(db); - *datap = - arc_buf_alloc(os->os_spa, blksz, db, type); - bcopy(db->db.db_data, (*datap)->b_data, blksz); - } - } - - ASSERT(*datap != NULL); + if (db->db_state != DB_NOFILL && + dn->dn_object != DMU_META_DNODE_OBJECT && + refcount_count(&db->db_holds) > 1 && + *datap == db->db_buf) { + /* + * If this buffer is currently "in use" (i.e., there + * are active holds and db_data still references it), + * then make a copy before we start the write so that + * any modifications from the open txg will not leak + * into this write. + * + * NOTE: this copy does not need to be made for + * objects only modified in the syncing context (e.g. + * DNONE_DNODE blocks). + */ + int blksz = arc_buf_size(*datap); + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + *datap = arc_buf_alloc(os->os_spa, blksz, db, type); + bcopy(db->db.db_data, (*datap)->b_data, blksz); } db->db_data_pending = dr; diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index b6205bd50..785c7c621 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -85,6 +85,8 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { { byteswap_uint64_array, TRUE, "FUID table size" }, { zap_byteswap, TRUE, "DSL dataset next clones"}, { zap_byteswap, TRUE, "scrub work queue" }, + { zap_byteswap, TRUE, "ZFS user/group used" }, + { zap_byteswap, TRUE, "ZFS user/group quota" }, }; int @@ -180,22 +182,22 @@ dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) * whose dnodes are in the same block. */ static int -dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, - uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) +dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, + int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) { dsl_pool_t *dp = NULL; dmu_buf_t **dbp; uint64_t blkid, nblks, i; - uint32_t flags; + uint32_t dbuf_flags; int err; zio_t *zio; hrtime_t start; ASSERT(length <= DMU_MAX_ACCESS); - flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT; - if (length > zfetch_array_rd_sz) - flags |= DB_RF_NOPREFETCH; + dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT; + if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz) + dbuf_flags |= DB_RF_NOPREFETCH; rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_datablkshift) { @@ -233,7 +235,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, /* initiate async i/o */ if (read) { rw_exit(&dn->dn_struct_rwlock); - (void) dbuf_read(db, zio, flags); + (void) dbuf_read(db, zio, dbuf_flags); rw_enter(&dn->dn_struct_rwlock, RW_READER); } dbp[i] = &db->db; @@ -285,7 +287,7 @@ dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, return (err); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, - numbufsp, dbpp); + numbufsp, dbpp, DMU_READ_PREFETCH); dnode_rele(dn, FTAG); @@ -300,7 +302,7 @@ dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, int err; err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, - numbufsp, dbpp); + numbufsp, dbpp, DMU_READ_PREFETCH); return (err); } @@ -442,7 +444,8 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, object_size = align == 1 ? dn->dn_datablksz : (dn->dn_maxblkid + 1) << dn->dn_datablkshift; - if (trunc || (end = offset + length) > object_size) + end = offset + length; + if (trunc || end > object_size) end = object_size; if (end <= offset) return (0); @@ -450,6 +453,7 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, while (length) { start = end; + /* assert(offset <= start) */ err = get_next_chunk(dn, &start, offset); if (err) return (err); @@ -540,7 +544,7 @@ dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - void *buf) + void *buf, uint32_t flags) { dnode_t *dn; dmu_buf_t **dbp; @@ -570,7 +574,7 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, * to be reading in parallel. */ err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, - TRUE, FTAG, &numbufs, &dbp); + TRUE, FTAG, &numbufs, &dbp, flags); if (err) break; @@ -810,6 +814,58 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, } #endif +/* + * Allocate a loaned anonymous arc buffer. + */ +arc_buf_t * +dmu_request_arcbuf(dmu_buf_t *handle, int size) +{ + dnode_t *dn = ((dmu_buf_impl_t *)handle)->db_dnode; + + return (arc_loan_buf(dn->dn_objset->os_spa, size)); +} + +/* + * Free a loaned arc buffer. + */ +void +dmu_return_arcbuf(arc_buf_t *buf) +{ + arc_return_buf(buf, FTAG); + VERIFY(arc_buf_remove_ref(buf, FTAG) == 1); +} + +/* + * When possible directly assign passed loaned arc buffer to a dbuf. + * If this is not possible copy the contents of passed arc buf via + * dmu_write(). + */ +void +dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, + dmu_tx_t *tx) +{ + dnode_t *dn = ((dmu_buf_impl_t *)handle)->db_dnode; + dmu_buf_impl_t *db; + uint32_t blksz = (uint32_t)arc_buf_size(buf); + uint64_t blkid; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + blkid = dbuf_whichblock(dn, offset); + VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL); + rw_exit(&dn->dn_struct_rwlock); + + if (offset == db->db.db_offset && blksz == db->db.db_size) { + dbuf_assign_arcbuf(db, buf, tx); + dbuf_rele(db, FTAG); + } else { + dbuf_rele(db, FTAG); + ASSERT(dn->dn_objset->os.os == dn->dn_objset); + dmu_write(&dn->dn_objset->os, dn->dn_object, offset, blksz, + buf->b_data, tx); + dmu_return_arcbuf(buf); + } +} + typedef struct { dbuf_dirty_record_t *dr; dmu_sync_cb_t *done; diff --git a/module/zfs/dmu_object.c b/module/zfs/dmu_object.c index 1b9247d66..1f91fc1ad 100644 --- a/module/zfs/dmu_object.c +++ b/module/zfs/dmu_object.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/dmu.h> #include <sys/dmu_objset.h> #include <sys/dmu_tx.h> @@ -108,22 +106,56 @@ dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, - int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) + int blocksize, dmu_object_type_t bonustype, int bonuslen) { dnode_t *dn; + dmu_tx_t *tx; + int nblkptr; int err; - if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx)) + if (object == DMU_META_DNODE_OBJECT) return (EBADF); err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, FTAG, &dn); if (err) return (err); + + if (dn->dn_type == ot && dn->dn_datablksz == blocksize && + dn->dn_bonustype == bonustype && dn->dn_bonuslen == bonuslen) { + /* nothing is changing, this is a noop */ + dnode_rele(dn, FTAG); + return (0); + } + + nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); + + /* + * If we are losing blkptrs or changing the block size this must + * be a new file instance. We must clear out the previous file + * contents before we can change this type of metadata in the dnode. + */ + if (dn->dn_nblkptr > nblkptr || dn->dn_datablksz != blocksize) { + err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END); + if (err) + goto out; + } + + tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, object); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + goto out; + } + dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, tx); + + dmu_tx_commit(tx); +out: dnode_rele(dn, FTAG); - return (0); + return (err); } int diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index c9e00d511..e962c4b88 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -164,10 +164,15 @@ dmu_objset_byteswap(void *buf, size_t size) { objset_phys_t *osp = buf; - ASSERT(size == sizeof (objset_phys_t)); + ASSERT(size == OBJSET_OLD_PHYS_SIZE || size == sizeof (objset_phys_t)); dnode_byteswap(&osp->os_meta_dnode); byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t)); osp->os_type = BSWAP_64(osp->os_type); + osp->os_flags = BSWAP_64(osp->os_flags); + if (size == sizeof (objset_phys_t)) { + dnode_byteswap(&osp->os_userused_dnode); + dnode_byteswap(&osp->os_groupused_dnode); + } } int @@ -210,12 +215,30 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, err = EIO; return (err); } + + /* Increase the blocksize if we are permitted. */ + if (spa_version(spa) >= SPA_VERSION_USERSPACE && + arc_buf_size(osi->os_phys_buf) < sizeof (objset_phys_t)) { + arc_buf_t *buf = arc_buf_alloc(spa, + sizeof (objset_phys_t), &osi->os_phys_buf, + ARC_BUFC_METADATA); + bzero(buf->b_data, sizeof (objset_phys_t)); + bcopy(osi->os_phys_buf->b_data, buf->b_data, + arc_buf_size(osi->os_phys_buf)); + (void) arc_buf_remove_ref(osi->os_phys_buf, + &osi->os_phys_buf); + osi->os_phys_buf = buf; + } + osi->os_phys = osi->os_phys_buf->b_data; + osi->os_flags = osi->os_phys->os_flags; } else { - osi->os_phys_buf = arc_buf_alloc(spa, sizeof (objset_phys_t), + int size = spa_version(spa) >= SPA_VERSION_USERSPACE ? + sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE; + osi->os_phys_buf = arc_buf_alloc(spa, size, &osi->os_phys_buf, ARC_BUFC_METADATA); osi->os_phys = osi->os_phys_buf->b_data; - bzero(osi->os_phys, sizeof (objset_phys_t)); + bzero(osi->os_phys, size); } /* @@ -276,6 +299,12 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, osi->os_meta_dnode = dnode_special_open(osi, &osi->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT); + if (arc_buf_size(osi->os_phys_buf) >= sizeof (objset_phys_t)) { + osi->os_userused_dnode = dnode_special_open(osi, + &osi->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT); + osi->os_groupused_dnode = dnode_special_open(osi, + &osi->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT); + } /* * We should be the only thread trying to do this because we @@ -456,13 +485,15 @@ dmu_objset_evict(dsl_dataset_t *ds, void *arg) os.os = osi; (void) dmu_objset_evict_dbufs(&os); - ASSERT3P(list_head(&osi->os_dnodes), ==, osi->os_meta_dnode); - ASSERT3P(list_tail(&osi->os_dnodes), ==, osi->os_meta_dnode); - ASSERT3P(list_head(&osi->os_meta_dnode->dn_dbufs), ==, NULL); - dnode_special_close(osi->os_meta_dnode); + if (osi->os_userused_dnode) { + dnode_special_close(osi->os_userused_dnode); + dnode_special_close(osi->os_groupused_dnode); + } zil_free(osi->os_zil); + ASSERT3P(list_head(&osi->os_dnodes), ==, NULL); + VERIFY(arc_buf_remove_ref(osi->os_phys_buf, &osi->os_phys_buf) == 1); mutex_destroy(&osi->os_lock); mutex_destroy(&osi->os_obj_lock); @@ -520,6 +551,10 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, ASSERT(type != DMU_OST_ANY); ASSERT(type < DMU_OST_NUMTYPES); osi->os_phys->os_type = type; + if (dmu_objset_userused_enabled(osi)) { + osi->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; + osi->os_flags = osi->os_phys->os_flags; + } dsl_dataset_dirty(ds, tx); @@ -704,13 +739,33 @@ struct snaparg { char *snapname; char failed[MAXPATHLEN]; boolean_t checkperms; - list_t objsets; + nvlist_t *props; }; -struct osnode { - list_node_t node; - objset_t *os; -}; +static int +snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + objset_t *os = arg1; + struct snaparg *sn = arg2; + + /* The props have already been checked by zfs_check_userprops(). */ + + return (dsl_dataset_snapshot_check(os->os->os_dsl_dataset, + sn->snapname, tx)); +} + +static void +snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + objset_t *os = arg1; + dsl_dataset_t *ds = os->os->os_dsl_dataset; + struct snaparg *sn = arg2; + + dsl_dataset_snapshot_sync(ds, sn->snapname, cr, tx); + + if (sn->props) + dsl_props_set_sync(ds->ds_prev, sn->props, cr, tx); +} static int dmu_objset_snapshot_one(char *name, void *arg) @@ -747,13 +802,8 @@ dmu_objset_snapshot_one(char *name, void *arg) */ err = zil_suspend(dmu_objset_zil(os)); if (err == 0) { - struct osnode *osn; - dsl_sync_task_create(sn->dstg, dsl_dataset_snapshot_check, - dsl_dataset_snapshot_sync, os->os->os_dsl_dataset, - sn->snapname, 3); - osn = kmem_alloc(sizeof (struct osnode), KM_SLEEP); - osn->os = os; - list_insert_tail(&sn->objsets, osn); + dsl_sync_task_create(sn->dstg, snapshot_check, + snapshot_sync, os, sn, 3); } else { dmu_objset_close(os); } @@ -762,11 +812,11 @@ dmu_objset_snapshot_one(char *name, void *arg) } int -dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive) +dmu_objset_snapshot(char *fsname, char *snapname, + nvlist_t *props, boolean_t recursive) { dsl_sync_task_t *dst; - struct osnode *osn; - struct snaparg sn = { 0 }; + struct snaparg sn; spa_t *spa; int err; @@ -778,8 +828,7 @@ dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive) sn.dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); sn.snapname = snapname; - list_create(&sn.objsets, sizeof (struct osnode), - offsetof(struct osnode, node)); + sn.props = props; if (recursive) { sn.checkperms = B_TRUE; @@ -790,27 +839,19 @@ dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive) err = dmu_objset_snapshot_one(fsname, &sn); } - if (err) - goto out; - - err = dsl_sync_task_group_wait(sn.dstg); + if (err == 0) + err = dsl_sync_task_group_wait(sn.dstg); for (dst = list_head(&sn.dstg->dstg_tasks); dst; dst = list_next(&sn.dstg->dstg_tasks, dst)) { - dsl_dataset_t *ds = dst->dst_arg1; + objset_t *os = dst->dst_arg1; + dsl_dataset_t *ds = os->os->os_dsl_dataset; if (dst->dst_err) dsl_dataset_name(ds, sn.failed); + zil_resume(dmu_objset_zil(os)); + dmu_objset_close(os); } -out: - while (osn = list_head(&sn.objsets)) { - list_remove(&sn.objsets, osn); - zil_resume(dmu_objset_zil(osn->os)); - dmu_objset_close(osn->os); - kmem_free(osn, sizeof (struct osnode)); - } - list_destroy(&sn.objsets); - if (err) (void) strcpy(fsname, sn.failed); dsl_sync_task_group_destroy(sn.dstg); @@ -819,7 +860,7 @@ out: } static void -dmu_objset_sync_dnodes(list_t *list, dmu_tx_t *tx) +dmu_objset_sync_dnodes(list_t *list, list_t *newlist, dmu_tx_t *tx) { dnode_t *dn; @@ -827,14 +868,20 @@ dmu_objset_sync_dnodes(list_t *list, dmu_tx_t *tx) ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); ASSERT(dn->dn_dbuf->db_data_pending); /* - * Initialize dn_zio outside dnode_sync() - * to accomodate meta-dnode + * Initialize dn_zio outside dnode_sync() because the + * meta-dnode needs to set it ouside dnode_sync(). */ dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio; ASSERT(dn->dn_zio); ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS); list_remove(list, dn); + + if (newlist) { + (void) dnode_add_ref(dn, newlist); + list_insert_tail(newlist, dn); + } + dnode_sync(dn, tx); } } @@ -853,9 +900,12 @@ ready(zio_t *zio, arc_buf_t *abuf, void *arg) ASSERT(BP_GET_LEVEL(bp) == 0); /* - * Update rootbp fill count. + * Update rootbp fill count: it should be the number of objects + * allocated in the object set (not counting the "special" + * objects that are stored in the objset_phys_t -- the meta + * dnode and user/group accounting objects). */ - bp->blk_fill = 1; /* count the meta-dnode */ + bp->blk_fill = 0; for (int i = 0; i < dnp->dn_nblkptr; i++) bp->blk_fill += dnp->dn_blkptr[i].blk_fill; @@ -878,6 +928,7 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx) writeprops_t wp = { 0 }; zio_t *zio; list_t *list; + list_t *newlist = NULL; dbuf_dirty_record_t *dr; dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg); @@ -915,20 +966,41 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx) } arc_release(os->os_phys_buf, &os->os_phys_buf); + zio = arc_write(pio, os->os_spa, &wp, DMU_OS_IS_L2CACHEABLE(os), tx->tx_txg, os->os_rootbp, os->os_phys_buf, ready, NULL, os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); /* - * Sync meta-dnode - the parent IO for the sync is the root block + * Sync special dnodes - the parent IO for the sync is the root block */ os->os_meta_dnode->dn_zio = zio; dnode_sync(os->os_meta_dnode, tx); + os->os_phys->os_flags = os->os_flags; + + if (os->os_userused_dnode && + os->os_userused_dnode->dn_type != DMU_OT_NONE) { + os->os_userused_dnode->dn_zio = zio; + dnode_sync(os->os_userused_dnode, tx); + os->os_groupused_dnode->dn_zio = zio; + dnode_sync(os->os_groupused_dnode, tx); + } + txgoff = tx->tx_txg & TXG_MASK; - dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], tx); - dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], tx); + if (dmu_objset_userused_enabled(os)) { + newlist = &os->os_synced_dnodes; + /* + * We must create the list here because it uses the + * dn_dirty_link[] of this txg. + */ + list_create(newlist, sizeof (dnode_t), + offsetof(dnode_t, dn_dirty_link[txgoff])); + } + + dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx); + dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx); list = &os->os_meta_dnode->dn_dirty_records[txgoff]; while (dr = list_head(list)) { @@ -945,6 +1017,145 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx) zio_nowait(zio); } +static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES]; + +void +dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb) +{ + used_cbs[ost] = cb; +} + +boolean_t +dmu_objset_userused_enabled(objset_impl_t *os) +{ + return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE && + used_cbs[os->os_phys->os_type] && + os->os_userused_dnode); +} + +void +dmu_objset_do_userquota_callbacks(objset_impl_t *os, dmu_tx_t *tx) +{ + dnode_t *dn; + list_t *list = &os->os_synced_dnodes; + static const char zerobuf[DN_MAX_BONUSLEN] = {0}; + + ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os)); + + while (dn = list_head(list)) { + dmu_object_type_t bonustype; + + ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object)); + ASSERT(dn->dn_oldphys); + ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE || + dn->dn_phys->dn_flags & + DNODE_FLAG_USERUSED_ACCOUNTED); + + /* Allocate the user/groupused objects if necessary. */ + if (os->os_userused_dnode->dn_type == DMU_OT_NONE) { + VERIFY(0 == zap_create_claim(&os->os, + DMU_USERUSED_OBJECT, + DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); + VERIFY(0 == zap_create_claim(&os->os, + DMU_GROUPUSED_OBJECT, + DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); + } + + /* + * If the object was not previously + * accounted, pretend that it was free. + */ + if (!(dn->dn_oldphys->dn_flags & + DNODE_FLAG_USERUSED_ACCOUNTED)) { + bzero(dn->dn_oldphys, sizeof (dnode_phys_t)); + } + + /* + * If the object was freed, use the previous bonustype. + */ + bonustype = dn->dn_phys->dn_bonustype ? + dn->dn_phys->dn_bonustype : dn->dn_oldphys->dn_bonustype; + ASSERT(dn->dn_phys->dn_type != 0 || + (bcmp(DN_BONUS(dn->dn_phys), zerobuf, + DN_MAX_BONUSLEN) == 0 && + DN_USED_BYTES(dn->dn_phys) == 0)); + ASSERT(dn->dn_oldphys->dn_type != 0 || + (bcmp(DN_BONUS(dn->dn_oldphys), zerobuf, + DN_MAX_BONUSLEN) == 0 && + DN_USED_BYTES(dn->dn_oldphys) == 0)); + used_cbs[os->os_phys->os_type](&os->os, bonustype, + DN_BONUS(dn->dn_oldphys), DN_BONUS(dn->dn_phys), + DN_USED_BYTES(dn->dn_oldphys), + DN_USED_BYTES(dn->dn_phys), tx); + + /* + * The mutex is needed here for interlock with dnode_allocate. + */ + mutex_enter(&dn->dn_mtx); + zio_buf_free(dn->dn_oldphys, sizeof (dnode_phys_t)); + dn->dn_oldphys = NULL; + mutex_exit(&dn->dn_mtx); + + list_remove(list, dn); + dnode_rele(dn, list); + } +} + +boolean_t +dmu_objset_userspace_present(objset_t *os) +{ + return (os->os->os_phys->os_flags & + OBJSET_FLAG_USERACCOUNTING_COMPLETE); +} + +int +dmu_objset_userspace_upgrade(objset_t *os) +{ + uint64_t obj; + int err = 0; + + if (dmu_objset_userspace_present(os)) + return (0); + if (!dmu_objset_userused_enabled(os->os)) + return (ENOTSUP); + if (dmu_objset_is_snapshot(os)) + return (EINVAL); + + /* + * We simply need to mark every object dirty, so that it will be + * synced out and now accounted. If this is called + * concurrently, or if we already did some work before crashing, + * that's fine, since we track each object's accounted state + * independently. + */ + + for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { + dmu_tx_t *tx = dmu_tx_create(os); + dmu_buf_t *db; + int objerr; + + if (issig(JUSTLOOKING) && issig(FORREAL)) + return (EINTR); + + objerr = dmu_bonus_hold(os, obj, FTAG, &db); + if (objerr) + continue; + dmu_tx_hold_bonus(tx, obj); + objerr = dmu_tx_assign(tx, TXG_WAIT); + if (objerr) { + dmu_tx_abort(tx); + continue; + } + dmu_buf_will_dirty(db, tx); + dmu_buf_rele(db, FTAG); + dmu_tx_commit(tx); + } + + os->os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; + txg_wait_synced(dmu_objset_pool(os), 0); + return (0); +} + void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp) @@ -978,6 +1189,8 @@ dmu_objset_stats(objset_t *os, nvlist_t *nv) dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE, os->os->os_phys->os_type); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING, + dmu_objset_userspace_present(os)); } int diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 6043df0aa..9ca3999dd 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -161,7 +161,9 @@ backup_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, if (issig(JUSTLOOKING) && issig(FORREAL)) return (EINTR); - if (bp == NULL && zb->zb_object == 0) { + if (zb->zb_object != 0 && DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { + return (0); + } else if (bp == NULL && zb->zb_object == 0) { uint64_t span = BP_SPAN(dnp, zb->zb_level); uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT); @@ -775,11 +777,6 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) dmu_tx_t *tx; void *data = NULL; - err = dmu_object_info(os, drro->drr_object, NULL); - - if (err != 0 && err != ENOENT) - return (EINVAL); - if (drro->drr_type == DMU_OT_NONE || drro->drr_type >= DMU_OT_NUMTYPES || drro->drr_bonustype >= DMU_OT_NUMTYPES || @@ -792,18 +789,21 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) return (EINVAL); } + err = dmu_object_info(os, drro->drr_object, NULL); + + if (err != 0 && err != ENOENT) + return (EINVAL); + if (drro->drr_bonuslen) { data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8)); if (ra->err) return (ra->err); } - tx = dmu_tx_create(os); - if (err == ENOENT) { /* currently free, want to be allocated */ + tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1); err = dmu_tx_assign(tx, TXG_WAIT); if (err) { dmu_tx_abort(tx); @@ -812,28 +812,22 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) err = dmu_object_claim(os, drro->drr_object, drro->drr_type, drro->drr_blksz, drro->drr_bonustype, drro->drr_bonuslen, tx); + dmu_tx_commit(tx); } else { /* currently allocated, want to be allocated */ - dmu_tx_hold_bonus(tx, drro->drr_object); - /* - * We may change blocksize and delete old content, - * so need to hold_write and hold_free. - */ - dmu_tx_hold_write(tx, drro->drr_object, 0, 1); - dmu_tx_hold_free(tx, drro->drr_object, 0, DMU_OBJECT_END); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { - dmu_tx_abort(tx); - return (err); - } - err = dmu_object_reclaim(os, drro->drr_object, drro->drr_type, drro->drr_blksz, - drro->drr_bonustype, drro->drr_bonuslen, tx); + drro->drr_bonustype, drro->drr_bonuslen); } - if (err) { - dmu_tx_commit(tx); + if (err) return (EINVAL); + + tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, drro->drr_object); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); } dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksum, tx); diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index 197284e1e..89cbfad29 100644 --- a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -64,6 +64,9 @@ struct traverse_data { void *td_arg; }; +static int traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp, + arc_buf_t *buf, uint64_t objset, uint64_t object); + /* ARGSUSED */ static void traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) @@ -189,7 +192,7 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, } } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { uint32_t flags = ARC_WAIT; - int i, j; + int i; int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; err = arc_read(NULL, td->td_spa, bp, pbuf, @@ -201,20 +204,15 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, /* recursively visitbp() blocks below this */ dnp = buf->b_data; for (i = 0; i < epb && err == 0; i++, dnp++) { - for (j = 0; j < dnp->dn_nblkptr; j++) { - SET_BOOKMARK(&czb, zb->zb_objset, - zb->zb_blkid * epb + i, - dnp->dn_nlevels - 1, j); - err = traverse_visitbp(td, dnp, buf, - (blkptr_t *)&dnp->dn_blkptr[j], &czb); - if (err) - break; - } + err = traverse_dnode(td, dnp, buf, zb->zb_objset, + zb->zb_blkid * epb + i); + if (err) + break; } } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { uint32_t flags = ARC_WAIT; objset_phys_t *osp; - int j; + dnode_phys_t *dnp; err = arc_read_nolock(NULL, td->td_spa, bp, arc_getbuf_func, &buf, @@ -223,20 +221,19 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, return (err); osp = buf->b_data; - /* - * traverse_zil is just here for zdb's leak checking. - * For other consumers, there will be no ZIL blocks. - */ traverse_zil(td, &osp->os_zil_header); - for (j = 0; j < osp->os_meta_dnode.dn_nblkptr; j++) { - SET_BOOKMARK(&czb, zb->zb_objset, 0, - osp->os_meta_dnode.dn_nlevels - 1, j); - err = traverse_visitbp(td, &osp->os_meta_dnode, buf, - (blkptr_t *)&osp->os_meta_dnode.dn_blkptr[j], - &czb); - if (err) - break; + dnp = &osp->os_meta_dnode; + err = traverse_dnode(td, dnp, buf, zb->zb_objset, 0); + if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { + dnp = &osp->os_userused_dnode; + err = traverse_dnode(td, dnp, buf, zb->zb_objset, + DMU_USERUSED_OBJECT); + } + if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { + dnp = &osp->os_groupused_dnode; + err = traverse_dnode(td, dnp, buf, zb->zb_objset, + DMU_GROUPUSED_OBJECT); } } @@ -249,6 +246,23 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, return (err); } +static int +traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp, + arc_buf_t *buf, uint64_t objset, uint64_t object) +{ + int j, err = 0; + zbookmark_t czb; + + for (j = 0; j < dnp->dn_nblkptr; j++) { + SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); + err = traverse_visitbp(td, dnp, buf, + (blkptr_t *)&dnp->dn_blkptr[j], &czb); + if (err) + break; + } + return (err); +} + /* ARGSUSED */ static int traverse_prefetcher(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index bf560e565..af2b049a7 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -160,6 +160,41 @@ dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) return (err); } +static void +dmu_tx_count_indirects(dmu_tx_hold_t *txh, dmu_buf_impl_t *db, + boolean_t freeable, dmu_buf_impl_t **history) +{ + int i = db->db_level + 1; + dnode_t *dn = db->db_dnode; + + if (i >= dn->dn_nlevels) + return; + + db = db->db_parent; + if (db == NULL) { + uint64_t lvls = dn->dn_nlevels - i; + + txh->txh_space_towrite += lvls << dn->dn_indblkshift; + return; + } + + if (db != history[i]) { + dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; + uint64_t space = 1ULL << dn->dn_indblkshift; + + freeable = (db->db_blkptr && (freeable || + dsl_dataset_block_freeable(ds, db->db_blkptr->blk_birth))); + if (freeable) + txh->txh_space_tooverwrite += space; + else + txh->txh_space_towrite += space; + if (db->db_blkptr) + txh->txh_space_tounref += space; + history[i] = db; + dmu_tx_count_indirects(txh, db, freeable, history); + } +} + /* ARGSUSED */ static void dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) @@ -177,18 +212,24 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) min_ibs = DN_MIN_INDBLKSHIFT; max_ibs = DN_MAX_INDBLKSHIFT; - /* - * For i/o error checking, read the first and last level-0 - * blocks (if they are not aligned), and all the level-1 blocks. - */ - if (dn) { + dmu_buf_impl_t *last[DN_MAX_LEVELS]; + int nlvls = dn->dn_nlevels; + int delta; + + /* + * For i/o error checking, read the first and last level-0 + * blocks (if they are not aligned), and all the level-1 blocks. + */ if (dn->dn_maxblkid == 0) { - if ((off > 0 || len < dn->dn_datablksz) && - off < dn->dn_datablksz) { + delta = dn->dn_datablksz; + start = (off < dn->dn_datablksz) ? 0 : 1; + end = (off+len <= dn->dn_datablksz) ? 0 : 1; + if (start == 0 && (off > 0 || len < dn->dn_datablksz)) { err = dmu_tx_check_ioerr(NULL, dn, 0, 0); if (err) goto out; + delta -= off; } } else { zio_t *zio = zio_root(dn->dn_objset->os_spa, @@ -213,10 +254,9 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) } /* level-1 blocks */ - if (dn->dn_nlevels > 1) { - start >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT; - end >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT; - for (i = start+1; i < end; i++) { + if (nlvls > 1) { + int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + for (i = (start>>shft)+1; i < end>>shft; i++) { err = dmu_tx_check_ioerr(zio, dn, 1, i); if (err) goto out; @@ -226,20 +266,70 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) err = zio_wait(zio); if (err) goto out; + delta = P2NPHASE(off, dn->dn_datablksz); } - } - /* - * If there's more than one block, the blocksize can't change, - * so we can make a more precise estimate. Alternatively, - * if the dnode's ibs is larger than max_ibs, always use that. - * This ensures that if we reduce DN_MAX_INDBLKSHIFT, - * the code will still work correctly on existing pools. - */ - if (dn && (dn->dn_maxblkid != 0 || dn->dn_indblkshift > max_ibs)) { - min_ibs = max_ibs = dn->dn_indblkshift; - if (dn->dn_datablkshift != 0) + if (dn->dn_maxblkid > 0) { + /* + * The blocksize can't change, + * so we can make a more precise estimate. + */ + ASSERT(dn->dn_datablkshift != 0); min_bs = max_bs = dn->dn_datablkshift; + min_ibs = max_ibs = dn->dn_indblkshift; + } else if (dn->dn_indblkshift > max_ibs) { + /* + * This ensures that if we reduce DN_MAX_INDBLKSHIFT, + * the code will still work correctly on older pools. + */ + min_ibs = max_ibs = dn->dn_indblkshift; + } + + /* + * If this write is not off the end of the file + * we need to account for overwrites/unref. + */ + if (start <= dn->dn_maxblkid) + bzero(last, sizeof (dmu_buf_impl_t *) * DN_MAX_LEVELS); + while (start <= dn->dn_maxblkid) { + spa_t *spa = txh->txh_tx->tx_pool->dp_spa; + dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; + dmu_buf_impl_t *db; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + db = dbuf_hold_level(dn, 0, start, FTAG); + rw_exit(&dn->dn_struct_rwlock); + if (db->db_blkptr && dsl_dataset_block_freeable(ds, + db->db_blkptr->blk_birth)) { + dprintf_bp(db->db_blkptr, "can free old%s", ""); + txh->txh_space_tooverwrite += dn->dn_datablksz; + txh->txh_space_tounref += dn->dn_datablksz; + dmu_tx_count_indirects(txh, db, TRUE, last); + } else { + txh->txh_space_towrite += dn->dn_datablksz; + if (db->db_blkptr) + txh->txh_space_tounref += + bp_get_dasize(spa, db->db_blkptr); + dmu_tx_count_indirects(txh, db, FALSE, last); + } + dbuf_rele(db, FTAG); + if (++start > end) { + /* + * Account for new indirects appearing + * before this IO gets assigned into a txg. + */ + bits = 64 - min_bs; + epbs = min_ibs - SPA_BLKPTRSHIFT; + for (bits -= epbs * (nlvls - 1); + bits >= 0; bits -= epbs) + txh->txh_fudge += 1ULL << max_ibs; + goto out; + } + off += delta; + if (len >= delta) + len -= delta; + delta = dn->dn_datablksz; + } } /* @@ -262,20 +352,22 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) for (bits = 64 - min_bs; bits >= 0; bits -= epbs) { start >>= epbs; end >>= epbs; - /* - * If we increase the number of levels of indirection, - * we'll need new blkid=0 indirect blocks. If start == 0, - * we're already accounting for that blocks; and if end == 0, - * we can't increase the number of levels beyond that. - */ - if (start != 0 && end != 0) - txh->txh_space_towrite += 1ULL << max_ibs; + ASSERT3U(end, >=, start); txh->txh_space_towrite += (end - start + 1) << max_ibs; + if (start != 0) { + /* + * We also need a new blkid=0 indirect block + * to reference any existing file data. + */ + txh->txh_space_towrite += 1ULL << max_ibs; + } } - ASSERT(txh->txh_space_towrite < 2 * DMU_MAX_ACCESS); - out: + if (txh->txh_space_towrite + txh->txh_space_tooverwrite > + 2 * DMU_MAX_ACCESS) + err = EFBIG; + if (err) txh->txh_tx->tx_err = err; } @@ -292,6 +384,7 @@ dmu_tx_count_dnode(dmu_tx_hold_t *txh) dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, dn->dn_dbuf->db_blkptr->blk_birth)) { txh->txh_space_tooverwrite += space; + txh->txh_space_tounref += space; } else { txh->txh_space_towrite += space; if (dn && dn->dn_dbuf->db_blkptr) @@ -535,7 +628,7 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) } void -dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name) +dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) { dmu_tx_hold_t *txh; dnode_t *dn; @@ -584,9 +677,9 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name) txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE; } else { txh->txh_space_towrite += SPA_MAXBLOCKSIZE; - txh->txh_space_tounref += - BP_GET_ASIZE(dn->dn_phys->dn_blkptr); } + if (dn->dn_phys->dn_blkptr[0].blk_birth) + txh->txh_space_tounref += SPA_MAXBLOCKSIZE; return; } @@ -603,12 +696,9 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name) } } - /* - * 3 blocks overwritten: target leaf, ptrtbl block, header block - * 3 new blocks written if adding: new split leaf, 2 grown ptrtbl blocks - */ - dmu_tx_count_write(txh, dn->dn_maxblkid * dn->dn_datablksz, - (3 + (add ? 3 : 0)) << dn->dn_datablkshift); + err = zap_count_write(&dn->dn_objset->os, dn->dn_object, name, add, + &txh->txh_space_towrite, &txh->txh_space_tooverwrite, + txh->txh_dnode->dn_datablkshift); /* * If the modified blocks are scattered to the four winds, @@ -616,7 +706,10 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name) */ epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs) - txh->txh_space_towrite += 3 << dn->dn_indblkshift; + if (dn->dn_objset->os_dsl_dataset->ds_phys->ds_prev_snap_obj) + txh->txh_space_towrite += 3 << dn->dn_indblkshift; + else + txh->txh_space_tooverwrite += 3 << dn->dn_indblkshift; } void diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 538a141b0..cf49b97f1 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -156,7 +156,7 @@ dnode_verify(dnode_t *dn) } if (dn->dn_phys->dn_type != DMU_OT_NONE) ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels); - ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || dn->dn_dbuf != NULL); + ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL); if (dn->dn_dbuf != NULL) { ASSERT3P(dn->dn_phys, ==, (dnode_phys_t *)dn->dn_dbuf->db.db_data + @@ -321,6 +321,7 @@ dnode_destroy(dnode_t *dn) } ASSERT(NULL == list_head(&dn->dn_dbufs)); #endif + ASSERT(dn->dn_oldphys == NULL); mutex_enter(&os->os_lock); list_remove(&os->os_dnodes, dn); @@ -417,8 +418,7 @@ void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { - int i, nblkptr; - dmu_buf_impl_t *db = NULL; + int nblkptr; ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE); ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE); @@ -430,42 +430,25 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, ASSERT3U(bonustype, <, DMU_OT_NUMTYPES); ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN); - for (i = 0; i < TXG_SIZE; i++) - ASSERT(!list_link_active(&dn->dn_dirty_link[i])); - /* clean up any unreferenced dbufs */ dnode_evict_dbufs(dn); - ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); - - /* - * XXX I should really have a generation number to tell if we - * need to do this... - */ - if (blocksize != dn->dn_datablksz || - dn->dn_bonustype != bonustype || dn->dn_bonuslen != bonuslen) { - /* free all old data */ - dnode_free_range(dn, 0, -1ULL, tx); - } - - nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); - /* change blocksize */ rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - if (blocksize != dn->dn_datablksz && - (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) || - list_head(&dn->dn_dbufs) != NULL)) { - db = dbuf_hold(dn, 0, FTAG); - dbuf_new_size(db, blocksize, tx); - } - dnode_setdblksz(dn, blocksize); dnode_setdirty(dn, tx); - dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen; - dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize; + if (dn->dn_datablksz != blocksize) { + /* change blocksize */ + ASSERT(dn->dn_maxblkid == 0 && + (BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) || + dnode_block_freed(dn, 0))); + dnode_setdblksz(dn, blocksize); + dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize; + } + if (dn->dn_bonuslen != bonuslen) + dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen; + nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); if (dn->dn_nblkptr != nblkptr) dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr; rw_exit(&dn->dn_struct_rwlock); - if (db) - dbuf_rele(db, FTAG); /* change type */ dn->dn_type = ot; @@ -569,6 +552,22 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, */ ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0); + if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) { + dn = (object == DMU_USERUSED_OBJECT) ? + os->os_userused_dnode : os->os_groupused_dnode; + if (dn == NULL) + return (ENOENT); + type = dn->dn_type; + if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) + return (ENOENT); + if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE) + return (EEXIST); + DNODE_VERIFY(dn); + (void) refcount_add(&dn->dn_holds, tag); + *dnp = dn; + return (0); + } + if (object == 0 || object >= DN_MAX_OBJECT) return (EINVAL); @@ -627,7 +626,8 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, type = dn->dn_type; if (dn->dn_free_txg || ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) || - ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)) { + ((flag & DNODE_MUST_BE_FREE) && + (type != DMU_OT_NONE || dn->dn_oldphys))) { mutex_exit(&dn->dn_mtx); dbuf_rele(db, FTAG); return (type == DMU_OT_NONE ? ENOENT : EEXIST); @@ -692,8 +692,10 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) objset_impl_t *os = dn->dn_objset; uint64_t txg = tx->tx_txg; - if (dn->dn_object == DMU_META_DNODE_OBJECT) + if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { + dsl_dataset_dirty(os->os_dsl_dataset, tx); return; + } DNODE_VERIFY(dn); @@ -1189,11 +1191,6 @@ dnode_block_freed(dnode_t *dn, uint64_t blkid) if (dn->dn_free_txg) return (TRUE); - /* - * If dn_datablkshift is not set, then there's only a single - * block, in which case there will never be a free range so it - * won't matter. - */ range_tofind.fr_blkid = blkid; mutex_enter(&dn->dn_mtx); for (i = 0; i < TXG_SIZE; i++) { @@ -1278,7 +1275,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, dprintf("probing object %llu offset %llx level %d of %u\n", dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels); - hole = flags & DNODE_FIND_HOLE; + hole = ((flags & DNODE_FIND_HOLE) != 0); inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1; ASSERT(txg == 0 || !hole); @@ -1325,16 +1322,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, for (i = (*offset >> span) & (blkfill - 1); i >= 0 && i < blkfill; i += inc) { - boolean_t newcontents = B_TRUE; - if (txg) { - int j; - newcontents = B_FALSE; - for (j = 0; j < dnp[i].dn_nblkptr; j++) { - if (dnp[i].dn_blkptr[j].blk_birth > txg) - newcontents = B_TRUE; - } - } - if (!dnp[i].dn_type == hole && newcontents) + if ((dnp[i].dn_type == DMU_OT_NONE) == hole) break; *offset += (1ULL << span) * inc; } diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c index 23dcb4c7b..184fe292b 100644 --- a/module/zfs/dnode_sync.c +++ b/module/zfs/dnode_sync.c @@ -504,9 +504,6 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) /* * Write out the dnode's dirty buffers. - * - * NOTE: The dnode is kept in memory by being dirty. Once the - * dirty bit is cleared, it may be evicted. Beware of this! */ void dnode_sync(dnode_t *dn, dmu_tx_t *tx) @@ -515,20 +512,33 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) dnode_phys_t *dnp = dn->dn_phys; int txgoff = tx->tx_txg & TXG_MASK; list_t *list = &dn->dn_dirty_records[txgoff]; + static const dnode_phys_t zerodn = { 0 }; ASSERT(dmu_tx_is_syncing(tx)); ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg); + ASSERT(dnp->dn_type != DMU_OT_NONE || + bcmp(dnp, &zerodn, DNODE_SIZE) == 0); DNODE_VERIFY(dn); ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf)); + if (dmu_objset_userused_enabled(dn->dn_objset) && + !DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { + ASSERT(dn->dn_oldphys == NULL); + dn->dn_oldphys = zio_buf_alloc(sizeof (dnode_phys_t)); + *dn->dn_oldphys = *dn->dn_phys; /* struct assignment */ + dn->dn_phys->dn_flags |= DNODE_FLAG_USERUSED_ACCOUNTED; + } else { + /* Once we account for it, we should always account for it. */ + ASSERT(!(dn->dn_phys->dn_flags & + DNODE_FLAG_USERUSED_ACCOUNTED)); + } + mutex_enter(&dn->dn_mtx); if (dn->dn_allocated_txg == tx->tx_txg) { /* The dnode is newly allocated or reallocated */ if (dnp->dn_type == DMU_OT_NONE) { /* this is a first alloc, not a realloc */ - /* XXX shouldn't the phys already be zeroed? */ - bzero(dnp, DNODE_CORE_SIZE); dnp->dn_nlevels = 1; dnp->dn_nblkptr = dn->dn_nblkptr; } @@ -626,7 +636,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) dbuf_sync_list(list, tx); - if (dn->dn_object != DMU_META_DNODE_OBJECT) { + if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { ASSERT3P(list_head(list), ==, NULL); dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg); } diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index a68b12d33..0fe7eb583 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -229,7 +229,7 @@ dsl_dataset_prev_snap_txg(dsl_dataset_t *ds) return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap)); } -int +boolean_t dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth) { return (blk_birth > dsl_dataset_prev_snap_txg(ds)); @@ -548,6 +548,7 @@ dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, int flags, void *owner, return (err); if (!dsl_dataset_tryown(*dsp, DS_MODE_IS_INCONSISTENT(flags), owner)) { dsl_dataset_rele(*dsp, owner); + *dsp = NULL; return (EBUSY); } return (0); @@ -974,6 +975,27 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag) (void) dmu_free_object(os, obj); } + /* + * We need to sync out all in-flight IO before we try to evict + * (the dataset evict func is trying to clear the cached entries + * for this dataset in the ARC). + */ + txg_wait_synced(dd->dd_pool, 0); + + /* + * If we managed to free all the objects in open + * context, the user space accounting should be zero. + */ + if (ds->ds_phys->ds_bp.blk_fill == 0 && + dmu_objset_userused_enabled(os->os)) { + uint64_t count; + + ASSERT(zap_count(os, DMU_USERUSED_OBJECT, &count) != 0 || + count == 0); + ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT, &count) != 0 || + count == 0); + } + dmu_objset_close(os); if (err != ESRCH) goto out; @@ -1058,7 +1080,6 @@ dsl_dataset_get_user_ptr(dsl_dataset_t *ds) return (ds->ds_user_ptr); } - blkptr_t * dsl_dataset_get_blkptr(dsl_dataset_t *ds) { @@ -1164,8 +1185,18 @@ kill_blkptr(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, if (bp == NULL) return (0); - ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg); - (void) dsl_dataset_block_kill(ka->ds, bp, ka->zio, ka->tx); + if ((zb->zb_level == -1ULL && zb->zb_blkid != 0) || + (zb->zb_object != 0 && dnp == NULL)) { + /* + * It's a block in the intent log. It has no + * accounting, so just free it. + */ + VERIFY3U(0, ==, dsl_free(ka->zio, ka->tx->tx_pool, + ka->tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT)); + } else { + ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg); + (void) dsl_dataset_block_kill(ka->ds, bp, ka->zio, ka->tx); + } return (0); } @@ -1209,13 +1240,7 @@ dsl_dataset_rollback_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dmu_buf_will_dirty(ds->ds_dbuf, tx); - /* - * Before the roll back destroy the zil. - */ if (ds->ds_user_ptr != NULL) { - zil_rollback_destroy( - ((objset_impl_t *)ds->ds_user_ptr)->os_zil, tx); - /* * We need to make sure that the objset_impl_t is reopened after * we do the rollback, otherwise it will have the wrong @@ -1248,7 +1273,10 @@ dsl_dataset_rollback_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) ds->ds_phys->ds_deadlist_obj)); { - /* Free blkptrs that we gave birth to */ + /* + * Free blkptrs that we gave birth to - this covers + * claimed but not played log blocks too. + */ zio_t *zio; struct killarg ka; @@ -1262,8 +1290,7 @@ dsl_dataset_rollback_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) (void) zio_wait(zio); } - ASSERT(!(ds->ds_phys->ds_flags & DS_FLAG_UNIQUE_ACCURATE) || - ds->ds_phys->ds_unique_bytes == 0); + ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0); if (ds->ds_prev && ds->ds_prev != ds->ds_dir->dd_pool->dp_origin_snap) { /* Change our contents to that of the prev snapshot */ @@ -1481,7 +1508,7 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) dmu_buf_will_dirty(ds_prev->ds_dbuf, tx); if (after_branch_point && ds_prev->ds_phys->ds_next_clones_obj != 0) { - VERIFY(0 == zap_remove_int(mos, + VERIFY3U(0, ==, zap_remove_int(mos, ds_prev->ds_phys->ds_next_clones_obj, obj, tx)); if (ds->ds_phys->ds_next_snap_obj != 0) { VERIFY(0 == zap_add_int(mos, @@ -1654,7 +1681,7 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) err = traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST, kill_blkptr, &ka); ASSERT3U(err, ==, 0); - ASSERT(spa_version(dp->dp_spa) < SPA_VERSION_UNIQUE_ACCURATE || + ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0); } @@ -2583,7 +2610,7 @@ snaplist_destroy(list_t *l, boolean_t own) { struct promotenode *snap; - if (!list_link_active(&l->list_head)) + if (!l || !list_link_active(&l->list_head)) return; while ((snap = list_tail(l)) != NULL) { diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c index e5e18f428..f19653d92 100644 --- a/module/zfs/dsl_dir.c +++ b/module/zfs/dsl_dir.c @@ -227,24 +227,11 @@ dsl_dir_namelen(dsl_dir_t *dd) return (result); } -int -dsl_dir_is_private(dsl_dir_t *dd) -{ - int rv = FALSE; - - if (dd->dd_parent && dsl_dir_is_private(dd->dd_parent)) - rv = TRUE; - if (dataset_name_hidden(dd->dd_myname)) - rv = TRUE; - return (rv); -} - - static int getcomponent(const char *path, char *component, const char **nextp) { char *p; - if (path == NULL) + if ((path == NULL) || (path[0] == '\0')) return (ENOENT); /* This would be a good place to reserve some namespace... */ p = strpbrk(path, "/@"); diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index dacc57c81..2c5dfca53 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -90,6 +90,9 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&dp->dp_scrub_cancel_lock, NULL, MUTEX_DEFAULT, NULL); + dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri, + 1, 4, 0); + return (dp); } @@ -129,14 +132,15 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) goto out; err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj, FTAG, &ds); + if (err == 0) { + err = dsl_dataset_hold_obj(dp, + ds->ds_phys->ds_prev_snap_obj, dp, + &dp->dp_origin_snap); + dsl_dataset_rele(ds, FTAG); + } + dsl_dir_close(dd, dp); if (err) goto out; - err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, - dp, &dp->dp_origin_snap); - if (err) - goto out; - dsl_dataset_rele(ds, FTAG); - dsl_dir_close(dd, dp); } /* get scrub status */ @@ -226,6 +230,7 @@ dsl_pool_close(dsl_pool_t *dp) rw_destroy(&dp->dp_config_rwlock); mutex_destroy(&dp->dp_lock); mutex_destroy(&dp->dp_scrub_cancel_lock); + taskq_destroy(dp->dp_vnrele_taskq); if (dp->dp_blkstats) kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); kmem_free(dp, sizeof (dsl_pool_t)); @@ -296,24 +301,52 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) tx = dmu_tx_create_assigned(dp, txg); dp->dp_read_overhead = 0; + start = gethrtime(); + zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) { - if (!list_link_active(&ds->ds_synced_link)) - list_insert_tail(&dp->dp_synced_datasets, ds); - else - dmu_buf_rele(ds->ds_dbuf, ds); + /* + * We must not sync any non-MOS datasets twice, because + * we may have taken a snapshot of them. However, we + * may sync newly-created datasets on pass 2. + */ + ASSERT(!list_link_active(&ds->ds_synced_link)); + list_insert_tail(&dp->dp_synced_datasets, ds); dsl_dataset_sync(ds, zio, tx); } DTRACE_PROBE(pool_sync__1setup); - - start = gethrtime(); err = zio_wait(zio); + write_time = gethrtime() - start; ASSERT(err == 0); DTRACE_PROBE(pool_sync__2rootzio); - while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg)) + for (ds = list_head(&dp->dp_synced_datasets); ds; + ds = list_next(&dp->dp_synced_datasets, ds)) + dmu_objset_do_userquota_callbacks(ds->ds_user_ptr, tx); + + /* + * Sync the datasets again to push out the changes due to + * userquota updates. This must be done before we process the + * sync tasks, because that could cause a snapshot of a dataset + * whose ds_bp will be rewritten when we do this 2nd sync. + */ + zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) { + ASSERT(list_link_active(&ds->ds_synced_link)); + dmu_buf_rele(ds->ds_dbuf, ds); + dsl_dataset_sync(ds, zio, tx); + } + err = zio_wait(zio); + + while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg)) { + /* + * No more sync tasks should have been added while we + * were syncing. + */ + ASSERT(spa_sync_pass(dp->dp_spa) == 1); dsl_sync_task_group_sync(dstg, tx); + } DTRACE_PROBE(pool_sync__3task); start = gethrtime(); @@ -611,3 +644,9 @@ dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx) dsl_dataset_rele(ds, FTAG); rw_exit(&dp->dp_config_rwlock); } + +taskq_t * +dsl_pool_vnrele_taskq(dsl_pool_t *dp) +{ + return (dp->dp_vnrele_taskq); +} diff --git a/module/zfs/dsl_prop.c b/module/zfs/dsl_prop.c index 212acbbc5..664ccff45 100644 --- a/module/zfs/dsl_prop.c +++ b/module/zfs/dsl_prop.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/dmu.h> #include <sys/dmu_objset.h> #include <sys/dmu_tx.h> @@ -416,6 +414,34 @@ dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) } void +dsl_props_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + nvlist_t *nvl = arg2; + nvpair_t *elem = NULL; + + while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { + struct prop_set_arg psa; + + psa.name = nvpair_name(elem); + + if (nvpair_type(elem) == DATA_TYPE_STRING) { + VERIFY(nvpair_value_string(elem, + (char **)&psa.buf) == 0); + psa.intsz = 1; + psa.numints = strlen(psa.buf) + 1; + } else { + uint64_t intval; + VERIFY(nvpair_value_uint64(elem, &intval) == 0); + psa.intsz = sizeof (intval); + psa.numints = 1; + psa.buf = &intval; + } + dsl_prop_set_sync(ds, &psa, cr, tx); + } +} + +void dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, cred_t *cr, dmu_tx_t *tx) { @@ -438,6 +464,7 @@ dsl_prop_set(const char *dsname, const char *propname, int intsz, int numints, const void *buf) { dsl_dataset_t *ds; + uint64_t version; int err; struct prop_set_arg psa; @@ -447,15 +474,19 @@ dsl_prop_set(const char *dsname, const char *propname, */ if (strlen(propname) >= ZAP_MAXNAMELEN) return (ENAMETOOLONG); - if (intsz * numints >= ZAP_MAXVALUELEN) - return (E2BIG); err = dsl_dataset_hold(dsname, FTAG, &ds); if (err) return (err); + version = spa_version(ds->ds_dir->dd_pool->dp_spa); + if (intsz * numints >= (version < SPA_VERSION_STMF_PROP ? + ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) { + dsl_dataset_rele(ds, FTAG); + return (E2BIG); + } if (dsl_dataset_is_snapshot(ds) && - spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_SNAP_PROPS) { + version < SPA_VERSION_SNAP_PROPS) { dsl_dataset_rele(ds, FTAG); return (ENOTSUP); } @@ -471,6 +502,50 @@ dsl_prop_set(const char *dsname, const char *propname, return (err); } +int +dsl_props_set(const char *dsname, nvlist_t *nvl) +{ + dsl_dataset_t *ds; + uint64_t version; + nvpair_t *elem = NULL; + int err; + + if (err = dsl_dataset_hold(dsname, FTAG, &ds)) + return (err); + /* + * Do these checks before the syncfunc, since it can't fail. + */ + version = spa_version(ds->ds_dir->dd_pool->dp_spa); + while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { + if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) { + dsl_dataset_rele(ds, FTAG); + return (ENAMETOOLONG); + } + if (nvpair_type(elem) == DATA_TYPE_STRING) { + char *valstr; + VERIFY(nvpair_value_string(elem, &valstr) == 0); + if (strlen(valstr) >= (version < + SPA_VERSION_STMF_PROP ? + ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) { + dsl_dataset_rele(ds, FTAG); + return (E2BIG); + } + } + } + + if (dsl_dataset_is_snapshot(ds) && + version < SPA_VERSION_SNAP_PROPS) { + dsl_dataset_rele(ds, FTAG); + return (ENOTSUP); + } + + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + NULL, dsl_props_set_sync, ds, nvl, 2); + + dsl_dataset_rele(ds, FTAG); + return (err); +} + /* * Iterate over all properties for this dataset and return them in an nvlist. */ diff --git a/module/zfs/dsl_scrub.c b/module/zfs/dsl_scrub.c index 783604101..8a802b53a 100644 --- a/module/zfs/dsl_scrub.c +++ b/module/zfs/dsl_scrub.c @@ -45,6 +45,8 @@ typedef int (scrub_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *); static scrub_cb_t dsl_pool_scrub_clean_cb; static dsl_syncfunc_t dsl_pool_scrub_cancel_sync; +static void scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf, + uint64_t objset, uint64_t object); int zfs_scrub_min_time = 1; /* scrub for at least 1 sec each txg */ int zfs_resilver_min_time = 3; /* resilver for at least 3 sec each txg */ @@ -348,6 +350,12 @@ traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) if (bp->blk_birth <= dp->dp_scrub_min_txg) return; + /* + * One block ("stubby") can be allocated a long time ago; we + * want to visit that one because it has been allocated + * (on-disk) even if it hasn't been claimed (even though for + * plain scrub there's nothing to do to it). + */ if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa)) return; @@ -373,6 +381,11 @@ traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) if (bp->blk_birth <= dp->dp_scrub_min_txg) return; + /* + * birth can be < claim_txg if this record's txg is + * already txg sync'ed (but this log block contains + * other records that are not synced) + */ if (claim_txg == 0 || bp->blk_birth < claim_txg) return; @@ -472,7 +485,7 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp, } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { uint32_t flags = ARC_WAIT; dnode_phys_t *child_dnp; - int i, j; + int i; int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; err = arc_read(NULL, dp->dp_spa, bp, pbuf, @@ -487,20 +500,12 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp, child_dnp = buf->b_data; for (i = 0; i < epb; i++, child_dnp++) { - for (j = 0; j < child_dnp->dn_nblkptr; j++) { - zbookmark_t czb; - - SET_BOOKMARK(&czb, zb->zb_objset, - zb->zb_blkid * epb + i, - child_dnp->dn_nlevels - 1, j); - scrub_visitbp(dp, child_dnp, buf, - &child_dnp->dn_blkptr[j], &czb); - } + scrub_visitdnode(dp, child_dnp, buf, zb->zb_objset, + zb->zb_blkid * epb + i); } } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { uint32_t flags = ARC_WAIT; objset_phys_t *osp; - int j; err = arc_read_nolock(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, @@ -516,13 +521,13 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp, traverse_zil(dp, &osp->os_zil_header); - for (j = 0; j < osp->os_meta_dnode.dn_nblkptr; j++) { - zbookmark_t czb; - - SET_BOOKMARK(&czb, zb->zb_objset, 0, - osp->os_meta_dnode.dn_nlevels - 1, j); - scrub_visitbp(dp, &osp->os_meta_dnode, buf, - &osp->os_meta_dnode.dn_blkptr[j], &czb); + scrub_visitdnode(dp, &osp->os_meta_dnode, + buf, zb->zb_objset, 0); + if (arc_buf_size(buf) >= sizeof (objset_phys_t)) { + scrub_visitdnode(dp, &osp->os_userused_dnode, + buf, zb->zb_objset, 0); + scrub_visitdnode(dp, &osp->os_groupused_dnode, + buf, zb->zb_objset, 0); } } @@ -532,6 +537,21 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp, } static void +scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf, + uint64_t objset, uint64_t object) +{ + int j; + + for (j = 0; j < dnp->dn_nblkptr; j++) { + zbookmark_t czb; + + SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); + scrub_visitbp(dp, dnp, buf, &dnp->dn_blkptr[j], &czb); + } + +} + +static void scrub_visit_rootbp(dsl_pool_t *dp, dsl_dataset_t *ds, blkptr_t *bp) { zbookmark_t zb; diff --git a/module/zfs/fletcher.c b/module/zfs/fletcher.c index edda3c9a9..54247d724 100644 --- a/module/zfs/fletcher.c +++ b/module/zfs/fletcher.c @@ -19,11 +19,111 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Fletcher Checksums + * ------------------ + * + * ZFS's 2nd and 4th order Fletcher checksums are defined by the following + * recurrence relations: + * + * a = a + f + * i i-1 i-1 + * + * b = b + a + * i i-1 i + * + * c = c + b (fletcher-4 only) + * i i-1 i + * + * d = d + c (fletcher-4 only) + * i i-1 i + * + * Where + * a_0 = b_0 = c_0 = d_0 = 0 + * and + * f_0 .. f_(n-1) are the input data. + * + * Using standard techniques, these translate into the following series: + * + * __n_ __n_ + * \ | \ | + * a = > f b = > i * f + * n /___| n - i n /___| n - i + * i = 1 i = 1 + * + * + * __n_ __n_ + * \ | i*(i+1) \ | i*(i+1)*(i+2) + * c = > ------- f d = > ------------- f + * n /___| 2 n - i n /___| 6 n - i + * i = 1 i = 1 + * + * For fletcher-2, the f_is are 64-bit, and [ab]_i are 64-bit accumulators. + * Since the additions are done mod (2^64), errors in the high bits may not + * be noticed. For this reason, fletcher-2 is deprecated. + * + * For fletcher-4, the f_is are 32-bit, and [abcd]_i are 64-bit accumulators. + * A conservative estimate of how big the buffer can get before we overflow + * can be estimated using f_i = 0xffffffff for all i: + * + * % bc + * f=2^32-1;d=0; for (i = 1; d<2^64; i++) { d += f*i*(i+1)*(i+2)/6 }; (i-1)*4 + * 2264 + * quit + * % + * + * So blocks of up to 2k will not overflow. Our largest block size is + * 128k, which has 32k 4-byte words, so we can compute the largest possible + * accumulators, then divide by 2^64 to figure the max amount of overflow: + * + * % bc + * a=b=c=d=0; f=2^32-1; for (i=1; i<=32*1024; i++) { a+=f; b+=a; c+=b; d+=c } + * a/2^64;b/2^64;c/2^64;d/2^64 + * 0 + * 0 + * 1365 + * 11186858 + * quit + * % + * + * So a and b cannot overflow. To make sure each bit of input has some + * effect on the contents of c and d, we can look at what the factors of + * the coefficients in the equations for c_n and d_n are. The number of 2s + * in the factors determines the lowest set bit in the multiplier. Running + * through the cases for n*(n+1)/2 reveals that the highest power of 2 is + * 2^14, and for n*(n+1)*(n+2)/6 it is 2^15. So while some data may overflow + * the 64-bit accumulators, every bit of every f_i effects every accumulator, + * even for 128k blocks. + * + * If we wanted to make a stronger version of fletcher4 (fletcher4c?), + * we could do our calculations mod (2^32 - 1) by adding in the carries + * periodically, and store the number of carries in the top 32-bits. + * + * -------------------- + * Checksum Performance + * -------------------- + * + * There are two interesting components to checksum performance: cached and + * uncached performance. With cached data, fletcher-2 is about four times + * faster than fletcher-4. With uncached data, the performance difference is + * negligible, since the cost of a cache fill dominates the processing time. + * Even though fletcher-4 is slower than fletcher-2, it is still a pretty + * efficient pass over the data. + * + * In normal operation, the data which is being checksummed is in a buffer + * which has been filled either by: + * + * 1. a compression step, which will be mostly cached, or + * 2. a bcopy() or copyin(), which will be uncached (because the + * copy is cache-bypassing). + * + * For both cached and uncached data, both fletcher checksums are much faster + * than sha-256, and slower than 'off', which doesn't touch the data at all. + */ #include <sys/types.h> #include <sys/sysmacros.h> diff --git a/module/zfs/include/sys/arc.h b/module/zfs/include/sys/arc.h index c402d3d58..6e5955b7c 100644 --- a/module/zfs/include/sys/arc.h +++ b/module/zfs/include/sys/arc.h @@ -85,6 +85,8 @@ void *arc_data_buf_alloc(uint64_t space); void arc_data_buf_free(void *buf, uint64_t space); arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type); +arc_buf_t *arc_loan_buf(spa_t *spa, int size); +void arc_return_buf(arc_buf_t *buf, void *tag); void arc_buf_add_ref(arc_buf_t *buf, void *tag); int arc_buf_remove_ref(arc_buf_t *buf, void *tag); int arc_buf_size(arc_buf_t *buf); @@ -134,7 +136,7 @@ void arc_fini(void); * Level 2 ARC */ -void l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end); +void l2arc_add_vdev(spa_t *spa, vdev_t *vd); void l2arc_remove_vdev(vdev_t *vd); boolean_t l2arc_vdev_present(vdev_t *vd); void l2arc_init(void); diff --git a/module/zfs/include/sys/dbuf.h b/module/zfs/include/sys/dbuf.h index 75ce27264..267852519 100644 --- a/module/zfs/include/sys/dbuf.h +++ b/module/zfs/include/sys/dbuf.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -264,6 +264,7 @@ void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx); void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx); +void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx); dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); void dbuf_clear(dmu_buf_impl_t *db); diff --git a/module/zfs/include/sys/dmu.h b/module/zfs/include/sys/dmu.h index 3b1e5c8fb..a363d7032 100644 --- a/module/zfs/include/sys/dmu.h +++ b/module/zfs/include/sys/dmu.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -60,6 +60,7 @@ struct zbookmark; struct spa; struct nvlist; struct objset_impl; +struct arc_buf; typedef struct objset objset_t; typedef struct dmu_tx dmu_tx_t; @@ -114,6 +115,8 @@ typedef enum dmu_object_type { DMU_OT_FUID_SIZE, /* FUID table size UINT64 */ DMU_OT_NEXT_CLONES, /* ZAP */ DMU_OT_SCRUB_QUEUE, /* ZAP */ + DMU_OT_USERGROUP_USED, /* ZAP */ + DMU_OT_USERGROUP_QUOTA, /* ZAP */ DMU_OT_NUMTYPES } dmu_object_type_t; @@ -156,6 +159,9 @@ void zfs_znode_byteswap(void *buf, size_t size); #define DMU_MAX_ACCESS (10<<20) /* 10MB */ #define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */ +#define DMU_USERUSED_OBJECT (-1ULL) +#define DMU_GROUPUSED_OBJECT (-2ULL) + /* * Public routines to create, destroy, open, and close objsets. */ @@ -171,7 +177,8 @@ int dmu_objset_create(const char *name, dmu_objset_type_t type, int dmu_objset_destroy(const char *name); int dmu_snapshots_destroy(char *fsname, char *snapname); int dmu_objset_rollback(objset_t *os); -int dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive); +int dmu_objset_snapshot(char *fsname, char *snapname, struct nvlist *props, + boolean_t recursive); int dmu_objset_rename(const char *name, const char *newname, boolean_t recursive); int dmu_objset_find(char *name, int func(char *, void *), void *arg, @@ -235,7 +242,7 @@ uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, - int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); + int blocksize, dmu_object_type_t bonustype, int bonuslen); /* * Free an object from this objset. @@ -398,6 +405,11 @@ void *dmu_buf_get_user(dmu_buf_t *db); void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx); /* + * Tells if the given dbuf is freeable. + */ +boolean_t dmu_buf_freeable(dmu_buf_t *); + +/* * You must create a transaction, then hold the objects which you will * (or might) modify as part of this transaction. Then you must assign * the transaction to a transaction group. Once the transaction has @@ -422,7 +434,7 @@ dmu_tx_t *dmu_tx_create(objset_t *os); void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len); -void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name); +void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name); void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object); void dmu_tx_abort(dmu_tx_t *tx); int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); @@ -445,8 +457,10 @@ int dmu_free_object(objset_t *os, uint64_t object); * Canfail routines will return 0 on success, or an errno if there is a * nonrecoverable I/O error. */ +#define DMU_READ_PREFETCH 0 /* prefetch */ +#define DMU_READ_NO_PREFETCH 1 /* don't prefetch */ int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - void *buf); + void *buf, uint32_t flags); void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx); void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, @@ -456,6 +470,10 @@ int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size, dmu_tx_t *tx); int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, struct page *pp, dmu_tx_t *tx); +struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size); +void dmu_return_arcbuf(struct arc_buf *buf); +void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf, + dmu_tx_t *tx); extern int zfs_prefetch_disable; @@ -562,6 +580,12 @@ extern int dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen, boolean_t *conflict); extern int dmu_dir_list_next(objset_t *os, int namelen, char *name, uint64_t *idp, uint64_t *offp); + +typedef void objset_used_cb_t(objset_t *os, dmu_object_type_t bonustype, + void *oldbonus, void *newbonus, uint64_t oldused, uint64_t newused, + dmu_tx_t *tx); +extern void dmu_objset_register_type(dmu_objset_type_t ost, + objset_used_cb_t *cb); extern void dmu_objset_set_user(objset_t *os, void *user_ptr); extern void *dmu_objset_get_user(objset_t *os); diff --git a/module/zfs/include/sys/dmu_objset.h b/module/zfs/include/sys/dmu_objset.h index 1d6572780..82cb6ad7d 100644 --- a/module/zfs/include/sys/dmu_objset.h +++ b/module/zfs/include/sys/dmu_objset.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -42,12 +42,20 @@ struct dsl_dataset; struct dmu_tx; struct objset_impl; +#define OBJSET_PHYS_SIZE 2048 +#define OBJSET_OLD_PHYS_SIZE 1024 + +#define OBJSET_FLAG_USERACCOUNTING_COMPLETE (1ULL<<0) + typedef struct objset_phys { dnode_phys_t os_meta_dnode; zil_header_t os_zil_header; uint64_t os_type; - char os_pad[1024 - sizeof (dnode_phys_t) - sizeof (zil_header_t) - - sizeof (uint64_t)]; + uint64_t os_flags; + char os_pad[OBJSET_PHYS_SIZE - sizeof (dnode_phys_t)*3 - + sizeof (zil_header_t) - sizeof (uint64_t)*2]; + dnode_phys_t os_userused_dnode; + dnode_phys_t os_groupused_dnode; } objset_phys_t; struct objset { @@ -62,6 +70,8 @@ typedef struct objset_impl { arc_buf_t *os_phys_buf; objset_phys_t *os_phys; dnode_t *os_meta_dnode; + dnode_t *os_userused_dnode; + dnode_t *os_groupused_dnode; zilog_t *os_zil; objset_t os; uint8_t os_checksum; /* can change, under dsl_dir's locks */ @@ -74,6 +84,8 @@ typedef struct objset_impl { struct dmu_tx *os_synctx; /* XXX sketchy */ blkptr_t *os_rootbp; zil_header_t os_zil_header; + list_t os_synced_dnodes; + uint64_t os_flags; /* Protected by os_obj_lock */ kmutex_t os_obj_lock; @@ -92,6 +104,7 @@ typedef struct objset_impl { } objset_impl_t; #define DMU_META_DNODE_OBJECT 0 +#define DMU_OBJECT_IS_SPECIAL(obj) ((int64_t)(obj) <= 0) #define DMU_OS_IS_L2CACHEABLE(os) \ ((os)->os_secondary_cache == ZFS_CACHE_ALL || \ @@ -106,7 +119,8 @@ int dmu_objset_create(const char *name, dmu_objset_type_t type, void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg); int dmu_objset_destroy(const char *name); int dmu_objset_rollback(objset_t *os); -int dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive); +int dmu_objset_snapshot(char *fsname, char *snapname, nvlist_t *props, + boolean_t recursive); void dmu_objset_stats(objset_t *os, nvlist_t *nv); void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, @@ -127,6 +141,10 @@ objset_impl_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds, int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp, objset_impl_t **osip); void dmu_objset_evict(struct dsl_dataset *ds, void *arg); +void dmu_objset_do_userquota_callbacks(objset_impl_t *os, dmu_tx_t *tx); +boolean_t dmu_objset_userused_enabled(objset_impl_t *os); +int dmu_objset_userspace_upgrade(objset_t *os); +boolean_t dmu_objset_userspace_present(objset_t *os); #ifdef __cplusplus } diff --git a/module/zfs/include/sys/dnode.h b/module/zfs/include/sys/dnode.h index be9e56908..48e4da8cd 100644 --- a/module/zfs/include/sys/dnode.h +++ b/module/zfs/include/sys/dnode.h @@ -98,7 +98,8 @@ enum dnode_dirtycontext { }; /* Is dn_used in bytes? if not, it's in multiples of SPA_MINBLOCKSIZE */ -#define DNODE_FLAG_USED_BYTES (1<<0) +#define DNODE_FLAG_USED_BYTES (1<<0) +#define DNODE_FLAG_USERUSED_ACCOUNTED (1<<1) typedef struct dnode_phys { uint8_t dn_type; /* dmu_object_type_t */ @@ -131,10 +132,7 @@ typedef struct dnode { */ krwlock_t dn_struct_rwlock; - /* - * Our link on dataset's dd_dnodes list. - * Protected by dd_accounting_mtx. - */ + /* Our link on dn_objset->os_dnodes list; protected by os_lock. */ list_node_t dn_link; /* immutable: */ @@ -191,6 +189,9 @@ typedef struct dnode { /* parent IO for current sync write */ zio_t *dn_zio; + /* used in syncing context */ + dnode_phys_t *dn_oldphys; + /* holds prefetch structure */ struct zfetch dn_zfetch; } dnode_t; diff --git a/module/zfs/include/sys/dsl_dataset.h b/module/zfs/include/sys/dsl_dataset.h index 8665aec2d..a1c2896e3 100644 --- a/module/zfs/include/sys/dsl_dataset.h +++ b/module/zfs/include/sys/dsl_dataset.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -195,7 +195,7 @@ void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx); void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx); int dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, dmu_tx_t *tx); -int dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth); +boolean_t dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth); uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds); void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx); diff --git a/module/zfs/include/sys/dsl_deleg.h b/module/zfs/include/sys/dsl_deleg.h index a29e44e67..b064c9228 100644 --- a/module/zfs/include/sys/dsl_deleg.h +++ b/module/zfs/include/sys/dsl_deleg.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_DSL_DELEG_H #define _SYS_DSL_DELEG_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/dmu.h> #include <sys/dsl_pool.h> #include <sys/zfs_context.h> @@ -51,6 +49,10 @@ extern "C" { #define ZFS_DELEG_PERM_ALLOW "allow" #define ZFS_DELEG_PERM_USERPROP "userprop" #define ZFS_DELEG_PERM_VSCAN "vscan" +#define ZFS_DELEG_PERM_USERQUOTA "userquota" +#define ZFS_DELEG_PERM_GROUPQUOTA "groupquota" +#define ZFS_DELEG_PERM_USERUSED "userused" +#define ZFS_DELEG_PERM_GROUPUSED "groupused" /* * Note: the names of properties that are marked delegatable are also diff --git a/module/zfs/include/sys/dsl_dir.h b/module/zfs/include/sys/dsl_dir.h index 86b9636ce..56d06388c 100644 --- a/module/zfs/include/sys/dsl_dir.h +++ b/module/zfs/include/sys/dsl_dir.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -107,7 +107,6 @@ int dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, const char *tail, void *tag, dsl_dir_t **); void dsl_dir_name(dsl_dir_t *dd, char *buf); int dsl_dir_namelen(dsl_dir_t *dd); -int dsl_dir_is_private(dsl_dir_t *dd); uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, dmu_tx_t *tx); dsl_checkfunc_t dsl_dir_destroy_check; diff --git a/module/zfs/include/sys/dsl_pool.h b/module/zfs/include/sys/dsl_pool.h index 3bb4ad4ef..d8da295f3 100644 --- a/module/zfs/include/sys/dsl_pool.h +++ b/module/zfs/include/sys/dsl_pool.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -77,6 +77,7 @@ typedef struct dsl_pool { struct dsl_dir *dp_mos_dir; struct dsl_dataset *dp_origin_snap; uint64_t dp_root_dir_obj; + struct taskq *dp_vnrele_taskq; /* No lock needed - sync context only */ blkptr_t dp_meta_rootbp; @@ -143,6 +144,8 @@ int dsl_pool_scrub_clean(dsl_pool_t *dp); void dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx); void dsl_pool_scrub_restart(dsl_pool_t *dp); +taskq_t *dsl_pool_vnrele_taskq(dsl_pool_t *dp); + #ifdef __cplusplus } #endif diff --git a/module/zfs/include/sys/dsl_prop.h b/module/zfs/include/sys/dsl_prop.h index d66caa86c..26018a46d 100644 --- a/module/zfs/include/sys/dsl_prop.h +++ b/module/zfs/include/sys/dsl_prop.h @@ -19,18 +19,17 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_DSL_PROP_H #define _SYS_DSL_PROP_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/dmu.h> #include <sys/dsl_pool.h> #include <sys/zfs_context.h> +#include <sys/dsl_synctask.h> #ifdef __cplusplus extern "C" { @@ -66,8 +65,10 @@ int dsl_prop_get_ds(struct dsl_dataset *ds, const char *propname, int dsl_prop_get_dd(struct dsl_dir *dd, const char *propname, int intsz, int numints, void *buf, char *setpoint); +dsl_syncfunc_t dsl_props_set_sync; int dsl_prop_set(const char *ddname, const char *propname, int intsz, int numints, const void *buf); +int dsl_props_set(const char *dsname, nvlist_t *nvl); void dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, cred_t *cr, dmu_tx_t *tx); diff --git a/module/zfs/include/sys/metaslab.h b/module/zfs/include/sys/metaslab.h index 1c9d89e8f..5d3e11c97 100644 --- a/module/zfs/include/sys/metaslab.h +++ b/module/zfs/include/sys/metaslab.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -39,6 +39,8 @@ extern "C" { typedef struct metaslab_class metaslab_class_t; typedef struct metaslab_group metaslab_group_t; +extern space_map_ops_t *zfs_metaslab_ops; + extern metaslab_t *metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, uint64_t start, uint64_t size, uint64_t txg); extern void metaslab_fini(metaslab_t *msp); @@ -55,7 +57,7 @@ extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now); extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg); -extern metaslab_class_t *metaslab_class_create(void); +extern metaslab_class_t *metaslab_class_create(space_map_ops_t *ops); extern void metaslab_class_destroy(metaslab_class_t *mc); extern void metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg); extern void metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg); diff --git a/module/zfs/include/sys/metaslab_impl.h b/module/zfs/include/sys/metaslab_impl.h index 5980cbc84..d67dea7e9 100644 --- a/module/zfs/include/sys/metaslab_impl.h +++ b/module/zfs/include/sys/metaslab_impl.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_METASLAB_IMPL_H #define _SYS_METASLAB_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/metaslab.h> #include <sys/space_map.h> #include <sys/vdev.h> @@ -41,6 +39,7 @@ extern "C" { struct metaslab_class { metaslab_group_t *mc_rotor; uint64_t mc_allocated; + space_map_ops_t *mc_ops; }; struct metaslab_group { diff --git a/module/zfs/include/sys/spa.h b/module/zfs/include/sys/spa.h index 029123dfe..c7ae4022e 100644 --- a/module/zfs/include/sys/spa.h +++ b/module/zfs/include/sys/spa.h @@ -324,12 +324,9 @@ extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot, size_t buflen); extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props, const char *history_str, nvlist_t *zplprops); -extern int spa_check_rootconf(char *devpath, char *devid, - nvlist_t **bestconf, uint64_t *besttxg); -extern boolean_t spa_rootdev_validate(nvlist_t *nv); extern int spa_import_rootpool(char *devpath, char *devid); extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props); -extern int spa_import_faulted(const char *, nvlist_t *, nvlist_t *); +extern int spa_import_verbatim(const char *, nvlist_t *, nvlist_t *); extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); extern int spa_destroy(char *pool); extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, @@ -347,6 +344,7 @@ extern void spa_inject_delref(spa_t *spa); #define SPA_ASYNC_PROBE 0x04 #define SPA_ASYNC_RESILVER_DONE 0x08 #define SPA_ASYNC_RESILVER 0x10 +#define SPA_ASYNC_AUTOEXPAND 0x20 /* device manipulation */ extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); @@ -356,6 +354,7 @@ extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done); extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare); extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath); +extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru); /* spare state (which is global across all pools) */ extern void spa_spare_add(vdev_t *vd); diff --git a/module/zfs/include/sys/spa_boot.h b/module/zfs/include/sys/spa_boot.h index b56073b97..1d3622f5a 100644 --- a/module/zfs/include/sys/spa_boot.h +++ b/module/zfs/include/sys/spa_boot.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_SPA_BOOT_H #define _SYS_SPA_BOOT_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/nvpair.h> #ifdef __cplusplus @@ -36,7 +34,6 @@ extern "C" { extern char *spa_get_bootprop(char *prop); extern void spa_free_bootprop(char *prop); -extern int spa_get_rootconf(char *devpath, char *devid, nvlist_t **bestconf_p); #ifdef __cplusplus } diff --git a/module/zfs/include/sys/spa_impl.h b/module/zfs/include/sys/spa_impl.h index 588b4f5a9..12999ee9e 100644 --- a/module/zfs/include/sys/spa_impl.h +++ b/module/zfs/include/sys/spa_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -141,9 +141,6 @@ struct spa { int spa_async_suspended; /* async tasks suspended */ kcondvar_t spa_async_cv; /* wait for thread_exit() */ uint16_t spa_async_tasks; /* async task mask */ - kmutex_t spa_async_root_lock; /* protects async root count */ - uint64_t spa_async_root_count; /* number of async root zios */ - kcondvar_t spa_async_root_cv; /* notify when count == 0 */ char *spa_root; /* alternate root directory */ uint64_t spa_ena; /* spa-wide ereport ENA */ boolean_t spa_last_open_failed; /* true if last open faled */ @@ -163,15 +160,16 @@ struct spa { uint64_t spa_failmode; /* failure mode for the pool */ uint64_t spa_delegation; /* delegation on/off */ list_t spa_config_list; /* previous cache file(s) */ + zio_t *spa_async_zio_root; /* root of all async I/O */ zio_t *spa_suspend_zio_root; /* root of all suspended I/O */ kmutex_t spa_suspend_lock; /* protects suspend_zio_root */ kcondvar_t spa_suspend_cv; /* notification of resume */ uint8_t spa_suspended; /* pool is suspended */ - boolean_t spa_import_faulted; /* allow faulted vdevs */ boolean_t spa_is_root; /* pool is root */ int spa_minref; /* num refs when first opened */ int spa_mode; /* FREAD | FWRITE */ spa_log_state_t spa_log_state; /* log state */ + uint64_t spa_autoexpand; /* lun expansion on/off */ /* * spa_refcnt & spa_config_lock must be the last elements * because refcount_t changes size based on compilation options. diff --git a/module/zfs/include/sys/space_map.h b/module/zfs/include/sys/space_map.h index 8d7860660..a682bbd40 100644 --- a/module/zfs/include/sys/space_map.h +++ b/module/zfs/include/sys/space_map.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -46,12 +46,14 @@ typedef struct space_map { uint8_t sm_loading; /* map loading? */ kcondvar_t sm_load_cv; /* map load completion */ space_map_ops_t *sm_ops; /* space map block picker ops vector */ + avl_tree_t *sm_pp_root; /* picker-private AVL tree */ void *sm_ppd; /* picker-private data */ kmutex_t *sm_lock; /* pointer to lock that protects map */ } space_map_t; typedef struct space_seg { avl_node_t ss_node; /* AVL node */ + avl_node_t ss_pp_node; /* AVL picker-private node */ uint64_t ss_start; /* starting offset of this segment */ uint64_t ss_end; /* ending offset (non-inclusive) */ } space_seg_t; @@ -74,6 +76,7 @@ struct space_map_ops { uint64_t (*smop_alloc)(space_map_t *sm, uint64_t size); void (*smop_claim)(space_map_t *sm, uint64_t start, uint64_t size); void (*smop_free)(space_map_t *sm, uint64_t start, uint64_t size); + uint64_t (*smop_max)(space_map_t *sm); }; /* @@ -152,6 +155,7 @@ extern void space_map_unload(space_map_t *sm); extern uint64_t space_map_alloc(space_map_t *sm, uint64_t size); extern void space_map_claim(space_map_t *sm, uint64_t start, uint64_t size); extern void space_map_free(space_map_t *sm, uint64_t start, uint64_t size); +extern uint64_t space_map_maxsize(space_map_t *sm); extern void space_map_sync(space_map_t *sm, uint8_t maptype, space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx); diff --git a/module/zfs/include/sys/vdev.h b/module/zfs/include/sys/vdev.h index b8313a920..71b9b12d6 100644 --- a/module/zfs/include/sys/vdev.h +++ b/module/zfs/include/sys/vdev.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -50,7 +50,6 @@ extern int vdev_open(vdev_t *); extern int vdev_validate(vdev_t *); extern void vdev_close(vdev_t *); extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace); -extern void vdev_init(vdev_t *, uint64_t txg); extern void vdev_reopen(vdev_t *); extern int vdev_validate_aux(vdev_t *vd); extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio); @@ -71,6 +70,8 @@ extern boolean_t vdev_resilver_needed(vdev_t *vd, extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg); extern void vdev_metaslab_fini(vdev_t *vd); +extern void vdev_metaslab_set_size(vdev_t *); +extern void vdev_expand(vdev_t *vd, uint64_t txg); extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); extern void vdev_clear_stats(vdev_t *vd); @@ -113,7 +114,8 @@ extern void vdev_queue_io_done(zio_t *zio); extern void vdev_config_dirty(vdev_t *vd); extern void vdev_config_clean(vdev_t *vd); -extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg); +extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg, + boolean_t); extern void vdev_state_dirty(vdev_t *vd); extern void vdev_state_clean(vdev_t *vd); diff --git a/module/zfs/include/sys/vdev_impl.h b/module/zfs/include/sys/vdev_impl.h index 652349341..8240b66ac 100644 --- a/module/zfs/include/sys/vdev_impl.h +++ b/module/zfs/include/sys/vdev_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -113,6 +113,7 @@ struct vdev { uint64_t vdev_guid; /* unique ID for this vdev */ uint64_t vdev_guid_sum; /* self guid + all child guids */ uint64_t vdev_asize; /* allocatable device capacity */ + uint64_t vdev_min_asize; /* min acceptable asize */ uint64_t vdev_ashift; /* block alignment shift */ uint64_t vdev_state; /* see VDEV_STATE_* #defines */ uint64_t vdev_prevstate; /* used when reopening a vdev */ @@ -125,6 +126,7 @@ struct vdev { uint64_t vdev_children; /* number of children */ space_map_t vdev_dtl[DTL_TYPES]; /* in-core dirty time logs */ vdev_stat_t vdev_stat; /* virtual device statistics */ + boolean_t vdev_expanding; /* expand the vdev? */ /* * Top-level vdev state. @@ -159,6 +161,7 @@ struct vdev { char *vdev_path; /* vdev path (if any) */ char *vdev_devid; /* vdev devid (if any) */ char *vdev_physpath; /* vdev device path (if any) */ + char *vdev_fru; /* physical FRU location */ uint64_t vdev_not_present; /* not present during import */ uint64_t vdev_unspare; /* unspare when resilvering done */ hrtime_t vdev_last_try; /* last reopen time */ @@ -188,8 +191,9 @@ struct vdev { kmutex_t vdev_probe_lock; /* protects vdev_probe_zio */ }; -#define VDEV_SKIP_SIZE (8 << 10) -#define VDEV_BOOT_HEADER_SIZE (8 << 10) +#define VDEV_PAD_SIZE (8 << 10) +/* 2 padding areas (vl_pad1 and vl_pad2) to skip */ +#define VDEV_SKIP_SIZE VDEV_PAD_SIZE * 2 #define VDEV_PHYS_SIZE (112 << 10) #define VDEV_UBERBLOCK_RING (128 << 10) @@ -201,26 +205,14 @@ struct vdev { offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)]) #define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd)) -/* ZFS boot block */ -#define VDEV_BOOT_MAGIC 0x2f5b007b10cULL -#define VDEV_BOOT_VERSION 1 /* version number */ - -typedef struct vdev_boot_header { - uint64_t vb_magic; /* VDEV_BOOT_MAGIC */ - uint64_t vb_version; /* VDEV_BOOT_VERSION */ - uint64_t vb_offset; /* start offset (bytes) */ - uint64_t vb_size; /* size (bytes) */ - char vb_pad[VDEV_BOOT_HEADER_SIZE - 4 * sizeof (uint64_t)]; -} vdev_boot_header_t; - typedef struct vdev_phys { char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_block_tail_t)]; zio_block_tail_t vp_zbt; } vdev_phys_t; typedef struct vdev_label { - char vl_pad[VDEV_SKIP_SIZE]; /* 8K */ - vdev_boot_header_t vl_boot_header; /* 8K */ + char vl_pad1[VDEV_PAD_SIZE]; /* 8K */ + char vl_pad2[VDEV_PAD_SIZE]; /* 8K */ vdev_phys_t vl_vdev_phys; /* 112K */ char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */ } vdev_label_t; /* 256K total */ @@ -249,6 +241,7 @@ typedef struct vdev_label { #define VDEV_ALLOC_ADD 1 #define VDEV_ALLOC_SPARE 2 #define VDEV_ALLOC_L2CACHE 3 +#define VDEV_ALLOC_ROOTPOOL 4 /* * Allocate or free a vdev @@ -269,6 +262,7 @@ extern void vdev_remove_parent(vdev_t *cvd); /* * vdev sync load and sync */ +extern void vdev_load_log_state(vdev_t *vd, nvlist_t *nv); extern void vdev_load(vdev_t *vd); extern void vdev_sync(vdev_t *vd, uint64_t txg); extern void vdev_sync_done(vdev_t *vd, uint64_t txg); @@ -290,7 +284,8 @@ extern vdev_ops_t vdev_spare_ops; * Common size functions */ extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize); -extern uint64_t vdev_get_rsize(vdev_t *vd); +extern uint64_t vdev_get_min_asize(vdev_t *vd); +extern void vdev_set_min_asize(vdev_t *vd); /* * zdb uses this tunable, so it must be declared here to make lint happy. diff --git a/module/zfs/include/sys/zap.h b/module/zfs/include/sys/zap.h index f88cc068b..de2053812 100644 --- a/module/zfs/include/sys/zap.h +++ b/module/zfs/include/sys/zap.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_ZAP_H #define _SYS_ZAP_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * ZAP - ZFS Attribute Processor * @@ -87,9 +85,6 @@ extern "C" { #endif -#define ZAP_MAXNAMELEN 256 -#define ZAP_MAXVALUELEN 1024 - /* * The matchtype specifies which entry will be accessed. * MT_EXACT: only find an exact match (non-normalized) @@ -186,6 +181,10 @@ int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name, matchtype_t mt, char *realname, int rn_len, boolean_t *normalization_conflictp); +int zap_count_write(objset_t *os, uint64_t zapobj, const char *name, + int add, uint64_t *towrite, uint64_t *tooverwrite, + uint64_t dn_datablkshift); + /* * Create an attribute with the given name and value. * diff --git a/module/zfs/include/sys/zap_impl.h b/module/zfs/include/sys/zap_impl.h index 0dc02ab6b..c86bb16de 100644 --- a/module/zfs/include/sys/zap_impl.h +++ b/module/zfs/include/sys/zap_impl.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_ZAP_IMPL_H #define _SYS_ZAP_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zap.h> #include <sys/zfs_context.h> #include <sys/avl.h> @@ -195,6 +193,8 @@ int fzap_count(zap_t *zap, uint64_t *count); int fzap_lookup(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, void *buf, char *realname, int rn_len, boolean_t *normalization_conflictp); +int fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite, + uint64_t *tooverwrite); int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); int fzap_update(zap_name_t *zn, diff --git a/module/zfs/include/sys/zfs_acl.h b/module/zfs/include/sys/zfs_acl.h index bd91b33d1..f5e5aa7f4 100644 --- a/module/zfs/include/sys/zfs_acl.h +++ b/module/zfs/include/sys/zfs_acl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -114,8 +114,6 @@ typedef struct zfs_acl_phys { uint8_t z_ace_data[ZFS_ACE_SPACE]; /* space for embedded ACEs */ } zfs_acl_phys_t; - - typedef struct acl_ops { uint32_t (*ace_mask_get) (void *acep); /* get access mask */ void (*ace_mask_set) (void *acep, @@ -161,12 +159,21 @@ typedef struct zfs_acl { zfs_acl_node_t *z_curr_node; /* current node iterator is handling */ list_t z_acl; /* chunks of ACE data */ acl_ops_t z_ops; /* ACL operations */ - boolean_t z_has_fuids; /* FUIDs present in ACL? */ } zfs_acl_t; #define ACL_DATA_ALLOCED 0x1 #define ZFS_ACL_SIZE(aclcnt) (sizeof (ace_t) * (aclcnt)) +struct zfs_fuid_info; + +typedef struct zfs_acl_ids { + uint64_t z_fuid; /* file owner fuid */ + uint64_t z_fgid; /* file group owner fuid */ + uint64_t z_mode; /* mode to set on create */ + zfs_acl_t *z_aclp; /* ACL to create with file */ + struct zfs_fuid_info *z_fuidp; /* for tracking fuids for log */ +} zfs_acl_ids_t; + /* * Property values for acl_mode and acl_inherit. * @@ -183,16 +190,18 @@ typedef struct zfs_acl { struct znode; struct zfsvfs; -struct zfs_fuid_info; #ifdef _KERNEL -void zfs_perm_init(struct znode *, struct znode *, int, vattr_t *, - dmu_tx_t *, cred_t *, zfs_acl_t *, zfs_fuid_info_t **); +int zfs_acl_ids_create(struct znode *, int, vattr_t *, + cred_t *, vsecattr_t *, zfs_acl_ids_t *); +void zfs_acl_ids_free(zfs_acl_ids_t *); +boolean_t zfs_acl_ids_overquota(struct zfsvfs *, zfs_acl_ids_t *); int zfs_getacl(struct znode *, vsecattr_t *, boolean_t, cred_t *); int zfs_setacl(struct znode *, vsecattr_t *, boolean_t, cred_t *); void zfs_acl_rele(void *); void zfs_oldace_byteswap(ace_t *, int); void zfs_ace_byteswap(void *, size_t, boolean_t); +extern boolean_t zfs_has_access(struct znode *zp, cred_t *cr); extern int zfs_zaccess(struct znode *, int, int, boolean_t, cred_t *); extern int zfs_zaccess_rwx(struct znode *, mode_t, int, cred_t *); extern int zfs_zaccess_unix(struct znode *, mode_t, cred_t *); @@ -202,9 +211,9 @@ int zfs_zaccess_delete(struct znode *, struct znode *, cred_t *); int zfs_zaccess_rename(struct znode *, struct znode *, struct znode *, struct znode *, cred_t *cr); void zfs_acl_free(zfs_acl_t *); -int zfs_vsec_2_aclp(struct zfsvfs *, vtype_t, vsecattr_t *, zfs_acl_t **); -int zfs_aclset_common(struct znode *, zfs_acl_t *, cred_t *, - struct zfs_fuid_info **, dmu_tx_t *); +int zfs_vsec_2_aclp(struct zfsvfs *, vtype_t, vsecattr_t *, cred_t *, + struct zfs_fuid_info **, zfs_acl_t **); +int zfs_aclset_common(struct znode *, zfs_acl_t *, cred_t *, dmu_tx_t *); #endif diff --git a/module/zfs/include/sys/zfs_context.h b/module/zfs/include/sys/zfs_context.h index a5be3e130..40de32084 100644 --- a/module/zfs/include/sys/zfs_context.h +++ b/module/zfs/include/sys/zfs_context.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_ZFS_CONTEXT_H #define _SYS_ZFS_CONTEXT_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -62,6 +60,7 @@ extern "C" { #include <sys/zfs_debug.h> #include <sys/sysevent.h> #include <sys/sysevent/eventdefs.h> +#include <sys/sysevent/dev.h> #include <sys/fm/util.h> #define CPU_SEQID (CPU->cpu_seqid) diff --git a/module/zfs/include/sys/zfs_ctldir.h b/module/zfs/include/sys/zfs_ctldir.h index ce29625d1..c15c946d5 100644 --- a/module/zfs/include/sys/zfs_ctldir.h +++ b/module/zfs/include/sys/zfs_ctldir.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _ZFS_CTLDIR_H #define _ZFS_CTLDIR_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/pathname.h> #include <sys/vnode.h> #include <sys/zfs_vfsops.h> @@ -66,6 +64,7 @@ int zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp); #define ZFSCTL_INO_ROOT 0x1 #define ZFSCTL_INO_SNAPDIR 0x2 +#define ZFSCTL_INO_SHARES 0x3 #ifdef __cplusplus } diff --git a/module/zfs/include/sys/zfs_dir.h b/module/zfs/include/sys/zfs_dir.h index ebb66e8ae..650315be2 100644 --- a/module/zfs/include/sys/zfs_dir.h +++ b/module/zfs/include/sys/zfs_dir.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_FS_ZFS_DIR_H #define _SYS_FS_ZFS_DIR_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/pathname.h> #include <sys/dmu.h> #include <sys/zfs_znode.h> @@ -59,7 +57,7 @@ extern int zfs_link_destroy(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int, extern int zfs_dirlook(znode_t *, char *, vnode_t **, int, int *, pathname_t *); extern void zfs_mknode(znode_t *, vattr_t *, dmu_tx_t *, cred_t *, - uint_t, znode_t **, int, zfs_acl_t *, zfs_fuid_info_t **); + uint_t, znode_t **, int, zfs_acl_ids_t *); extern void zfs_rmnode(znode_t *); extern void zfs_dl_name_switch(zfs_dirlock_t *dl, char *new, char **old); extern boolean_t zfs_dirempty(znode_t *); diff --git a/module/zfs/include/sys/zfs_fuid.h b/module/zfs/include/sys/zfs_fuid.h index 810ffc81a..f81ddf4a5 100644 --- a/module/zfs/include/sys/zfs_fuid.h +++ b/module/zfs/include/sys/zfs_fuid.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_FS_ZFS_FUID_H #define _SYS_FS_ZFS_FUID_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef _KERNEL #include <sys/kidmap.h> #include <sys/sid.h> @@ -51,11 +49,11 @@ typedef enum { * Estimate space needed for one more fuid table entry. * for now assume its current size + 1K */ -#define FUID_SIZE_ESTIMATE(z) (z->z_fuid_size + (SPA_MINBLOCKSIZE << 1)) +#define FUID_SIZE_ESTIMATE(z) ((z)->z_fuid_size + (SPA_MINBLOCKSIZE << 1)) -#define FUID_INDEX(x) (x >> 32) -#define FUID_RID(x) (x & 0xffffffff) -#define FUID_ENCODE(idx, rid) ((idx << 32) | rid) +#define FUID_INDEX(x) ((x) >> 32) +#define FUID_RID(x) ((x) & 0xffffffff) +#define FUID_ENCODE(idx, rid) (((uint64_t)(idx) << 32) | (rid)) /* * FUIDs cause problems for the intent log * we need to replay the creation of the FUID, @@ -104,17 +102,23 @@ struct znode; extern uid_t zfs_fuid_map_id(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t); extern void zfs_fuid_destroy(zfsvfs_t *); extern uint64_t zfs_fuid_create_cred(zfsvfs_t *, zfs_fuid_type_t, - dmu_tx_t *, cred_t *, zfs_fuid_info_t **); + cred_t *, zfs_fuid_info_t **); extern uint64_t zfs_fuid_create(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t, - dmu_tx_t *, zfs_fuid_info_t **); -extern void zfs_fuid_map_ids(struct znode *zp, cred_t *cr, uid_t *uid, - uid_t *gid); + zfs_fuid_info_t **); +extern void zfs_fuid_map_ids(struct znode *zp, cred_t *cr, + uid_t *uid, uid_t *gid); extern zfs_fuid_info_t *zfs_fuid_info_alloc(void); -extern void zfs_fuid_info_free(); +extern void zfs_fuid_info_free(zfs_fuid_info_t *); extern boolean_t zfs_groupmember(zfsvfs_t *, uint64_t, cred_t *); +void zfs_fuid_sync(zfsvfs_t *, dmu_tx_t *); +extern int zfs_fuid_find_by_domain(zfsvfs_t *, const char *domain, + char **retdomain, boolean_t addok); +extern const char *zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx); +extern void zfs_fuid_txhold(zfsvfs_t *zfsvfs, dmu_tx_t *tx); #endif char *zfs_fuid_idx_domain(avl_tree_t *, uint32_t); +void zfs_fuid_avl_tree_create(avl_tree_t *, avl_tree_t *); uint64_t zfs_fuid_table_load(objset_t *, uint64_t, avl_tree_t *, avl_tree_t *); void zfs_fuid_table_destroy(avl_tree_t *, avl_tree_t *); diff --git a/module/zfs/include/sys/zfs_ioctl.h b/module/zfs/include/sys/zfs_ioctl.h index 1692608bb..1e9f35155 100644 --- a/module/zfs/include/sys/zfs_ioctl.h +++ b/module/zfs/include/sys/zfs_ioctl.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_ZFS_IOCTL_H #define _SYS_ZFS_IOCTL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/cred.h> #include <sys/dmu.h> #include <sys/zio.h> @@ -118,7 +116,7 @@ typedef struct zinject_record { uint32_t zi_error; uint64_t zi_type; uint32_t zi_freq; - uint32_t zi_pad; /* pad out to 64 bit alignment */ + uint32_t zi_failfast; } zinject_record_t; #define ZINJECT_NULL 0x1 @@ -162,12 +160,20 @@ typedef struct zfs_cmd { uint64_t zc_history_len; uint64_t zc_history_offset; uint64_t zc_obj; + uint64_t zc_iflags; /* internal to zfs(7fs) */ zfs_share_t zc_share; dmu_objset_stats_t zc_objset_stats; struct drr_begin zc_begin_record; zinject_record_t zc_inject_record; } zfs_cmd_t; +typedef struct zfs_useracct { + char zu_domain[256]; + uid_t zu_rid; + uint32_t zu_pad; + uint64_t zu_space; +} zfs_useracct_t; + #define ZVOL_MAX_MINOR (1 << 16) #define ZFS_MIN_MINOR (ZVOL_MAX_MINOR + 1) diff --git a/module/zfs/include/sys/zfs_vfsops.h b/module/zfs/include/sys/zfs_vfsops.h index 7e0440be4..b8ed7b27f 100644 --- a/module/zfs/include/sys/zfs_vfsops.h +++ b/module/zfs/include/sys/zfs_vfsops.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -53,6 +53,7 @@ struct zfsvfs { avl_tree_t z_fuid_domain; /* fuid tree keyed by domain */ krwlock_t z_fuid_lock; /* fuid lock */ boolean_t z_fuid_loaded; /* fuid tables are loaded */ + boolean_t z_fuid_dirty; /* need to sync fuid table ? */ struct zfs_fuid_info *z_fuid_replay; /* fuid info for replay */ zilog_t *z_log; /* intent log pointer */ uint_t z_acl_mode; /* acl chmod/mode behavior */ @@ -72,8 +73,12 @@ struct zfsvfs { boolean_t z_vscan; /* virus scan on/off */ boolean_t z_use_fuids; /* version allows fuids */ boolean_t z_replay; /* set during ZIL replay */ - kmutex_t z_online_recv_lock; /* recv in prog grabs as WRITER */ + kmutex_t z_online_recv_lock; /* held while recv in progress */ uint64_t z_version; /* ZPL version */ + uint64_t z_shares_dir; /* hidden shares dir */ + kmutex_t z_lock; + uint64_t z_userquota_obj; + uint64_t z_groupquota_obj; #define ZFS_OBJ_MTX_SZ 64 kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */ }; @@ -130,6 +135,17 @@ extern uint_t zfs_fsyncer_key; extern int zfs_suspend_fs(zfsvfs_t *zfsvfs, char *osname, int *mode); extern int zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode); +extern int zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + const char *domain, uint64_t rid, uint64_t *valuep); +extern int zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + uint64_t *cookiep, void *vbuf, uint64_t *bufsizep); +extern int zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + const char *domain, uint64_t rid, uint64_t quota); +extern boolean_t zfs_usergroup_overquota(zfsvfs_t *zfsvfs, + boolean_t isgroup, uint64_t fuid); +extern int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers); +extern int zfsvfs_create(const char *name, int mode, zfsvfs_t **zvp); +extern void zfsvfs_free(zfsvfs_t *zfsvfs); #ifdef __cplusplus } diff --git a/module/zfs/include/sys/zfs_znode.h b/module/zfs/include/sys/zfs_znode.h index 9192abcd7..69f4b50f5 100644 --- a/module/zfs/include/sys/zfs_znode.h +++ b/module/zfs/include/sys/zfs_znode.h @@ -93,12 +93,15 @@ extern "C" { /* * Special attributes for master node. + * "userquota@" and "groupquota@" are also valid (from + * zfs_userquota_prop_prefixes[]). */ #define ZFS_FSID "FSID" #define ZFS_UNLINKED_SET "DELETE_QUEUE" #define ZFS_ROOT_OBJ "ROOT" #define ZPL_VERSION_STR "VERSION" #define ZFS_FUID_TABLES "FUID" +#define ZFS_SHARES_DIR "SHARES" #define ZFS_MAX_BLOCKSIZE (SPA_MAXBLOCKSIZE) @@ -309,7 +312,6 @@ extern int zfs_create_op_tables(); extern int zfs_sync(vfs_t *vfsp, short flag, cred_t *cr); extern dev_t zfs_cmpldev(uint64_t); extern int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value); -extern int zfs_set_version(const char *name, uint64_t newvers); extern int zfs_get_stats(objset_t *os, nvlist_t *nv); extern void zfs_znode_dmu_fini(znode_t *); @@ -336,6 +338,7 @@ extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, vsecattr_t *vsecp, zfs_fuid_info_t *fuidp); extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap); extern void zfs_upgrade(zfsvfs_t *zfsvfs, dmu_tx_t *tx); +extern int zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx); extern caddr_t zfs_map_page(page_t *, enum seg_rw); extern void zfs_unmap_page(page_t *, caddr_t); diff --git a/module/zfs/include/sys/zil.h b/module/zfs/include/sys/zil.h index b69323cfa..2aff8cd68 100644 --- a/module/zfs/include/sys/zil.h +++ b/module/zfs/include/sys/zil.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -56,10 +56,16 @@ typedef struct zil_header { uint64_t zh_replay_seq; /* highest replayed sequence number */ blkptr_t zh_log; /* log chain */ uint64_t zh_claim_seq; /* highest claimed sequence number */ - uint64_t zh_pad[5]; + uint64_t zh_flags; /* header flags */ + uint64_t zh_pad[4]; } zil_header_t; /* + * zh_flags bit settings + */ +#define ZIL_REPLAY_NEEDED 0x1 /* replay needed - internal only */ + +/* * Log block trailer - structure at the end of the header and each log block * * The zit_bt contains a zbt_cksum which for the intent log is @@ -299,7 +305,27 @@ typedef struct { */ /* - * ZFS intent log transaction structure + * Writes are handled in three different ways: + * + * WR_INDIRECT: + * In this mode, if we need to commit the write later, then the block + * is immediately written into the file system (using dmu_sync), + * and a pointer to the block is put into the log record. + * When the txg commits the block is linked in. + * This saves additionally writing the data into the log record. + * There are a few requirements for this to occur: + * - write is greater than zfs/zvol_immediate_write_sz + * - not using slogs (as slogs are assumed to always be faster + * than writing into the main pool) + * - the write occupies only one block + * WR_COPIED: + * If we know we'll immediately be committing the + * transaction (FSYNC or FDSYNC), the we allocate a larger + * log record here for the data and copy the data in. + * WR_NEED_COPY: + * Otherwise we don't allocate a buffer, and *if* we need to + * flush the write later then a buffer is allocated and + * we retrieve the data using the dmu. */ typedef enum { WR_INDIRECT, /* indirect - a large write (dmu_sync() data */ @@ -359,9 +385,9 @@ extern uint64_t zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx); extern void zil_commit(zilog_t *zilog, uint64_t seq, uint64_t oid); +extern int zil_vdev_offline(char *osname, void *txarg); extern int zil_claim(char *osname, void *txarg); extern int zil_check_log_chain(char *osname, void *txarg); -extern int zil_clear_log_chain(char *osname, void *txarg); extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx); extern void zil_clean(zilog_t *zilog); extern int zil_is_committed(zilog_t *zilog); diff --git a/module/zfs/include/sys/zil_impl.h b/module/zfs/include/sys/zil_impl.h index 3f2582931..685305fb5 100644 --- a/module/zfs/include/sys/zil_impl.h +++ b/module/zfs/include/sys/zil_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -101,6 +101,9 @@ typedef struct zil_dva_node { avl_node_t zn_node; } zil_dva_node_t; +#define ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_trailer_t) - \ + sizeof (lr_write_t)) + #ifdef __cplusplus } #endif diff --git a/module/zfs/include/sys/zio.h b/module/zfs/include/sys/zio.h index 67adc3b4c..5c51717c1 100644 --- a/module/zfs/include/sys/zio.h +++ b/module/zfs/include/sys/zio.h @@ -76,7 +76,7 @@ enum zio_checksum { ZIO_CHECKSUM_FUNCTIONS }; -#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_2 +#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_4 #define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON enum zio_compress { @@ -116,30 +116,33 @@ enum zio_compress { #define ZIO_PRIORITY_SCRUB (zio_priority_table[9]) #define ZIO_PRIORITY_TABLE_SIZE 10 -#define ZIO_FLAG_MUSTSUCCEED 0x00000 -#define ZIO_FLAG_CANFAIL 0x00001 -#define ZIO_FLAG_SPECULATIVE 0x00002 -#define ZIO_FLAG_CONFIG_WRITER 0x00004 -#define ZIO_FLAG_DONT_RETRY 0x00008 +#define ZIO_FLAG_MUSTSUCCEED 0x000000 +#define ZIO_FLAG_CANFAIL 0x000001 +#define ZIO_FLAG_SPECULATIVE 0x000002 +#define ZIO_FLAG_CONFIG_WRITER 0x000004 +#define ZIO_FLAG_DONT_RETRY 0x000008 -#define ZIO_FLAG_DONT_CACHE 0x00010 -#define ZIO_FLAG_DONT_QUEUE 0x00020 -#define ZIO_FLAG_DONT_AGGREGATE 0x00040 -#define ZIO_FLAG_DONT_PROPAGATE 0x00080 +#define ZIO_FLAG_DONT_CACHE 0x000010 +#define ZIO_FLAG_DONT_QUEUE 0x000020 +#define ZIO_FLAG_DONT_AGGREGATE 0x000040 +#define ZIO_FLAG_DONT_PROPAGATE 0x000080 -#define ZIO_FLAG_IO_BYPASS 0x00100 -#define ZIO_FLAG_IO_REPAIR 0x00200 -#define ZIO_FLAG_IO_RETRY 0x00400 -#define ZIO_FLAG_IO_REWRITE 0x00800 +#define ZIO_FLAG_IO_BYPASS 0x000100 +#define ZIO_FLAG_IO_REPAIR 0x000200 +#define ZIO_FLAG_IO_RETRY 0x000400 +#define ZIO_FLAG_IO_REWRITE 0x000800 -#define ZIO_FLAG_SELF_HEAL 0x01000 -#define ZIO_FLAG_RESILVER 0x02000 -#define ZIO_FLAG_SCRUB 0x04000 -#define ZIO_FLAG_SCRUB_THREAD 0x08000 +#define ZIO_FLAG_SELF_HEAL 0x001000 +#define ZIO_FLAG_RESILVER 0x002000 +#define ZIO_FLAG_SCRUB 0x004000 +#define ZIO_FLAG_SCRUB_THREAD 0x008000 -#define ZIO_FLAG_PROBE 0x10000 -#define ZIO_FLAG_GANG_CHILD 0x20000 -#define ZIO_FLAG_RAW 0x40000 +#define ZIO_FLAG_PROBE 0x010000 +#define ZIO_FLAG_GANG_CHILD 0x020000 +#define ZIO_FLAG_RAW 0x040000 +#define ZIO_FLAG_GODFATHER 0x080000 + +#define ZIO_FLAG_TRYHARD 0x100000 #define ZIO_FLAG_GANG_INHERIT \ (ZIO_FLAG_CANFAIL | \ @@ -157,7 +160,8 @@ enum zio_compress { (ZIO_FLAG_GANG_INHERIT | \ ZIO_FLAG_IO_REPAIR | \ ZIO_FLAG_IO_RETRY | \ - ZIO_FLAG_PROBE) + ZIO_FLAG_PROBE | \ + ZIO_FLAG_TRYHARD) #define ZIO_FLAG_AGG_INHERIT \ (ZIO_FLAG_DONT_AGGREGATE | \ @@ -281,7 +285,6 @@ struct zio { int io_cmd; uint8_t io_priority; uint8_t io_reexecute; - uint8_t io_async_root; uint8_t io_state[ZIO_WAIT_TYPES]; uint64_t io_txg; spa_t *io_spa; @@ -324,6 +327,7 @@ struct zio { int io_child_error[ZIO_CHILD_TYPES]; uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES]; uint64_t *io_stall; + zio_t *io_gang_leader; zio_gang_node_t *io_gang_tree; void *io_executor; void *io_waiter; @@ -415,7 +419,7 @@ extern uint8_t zio_checksum_select(uint8_t child, uint8_t parent); extern uint8_t zio_compress_select(uint8_t child, uint8_t parent); extern void zio_suspend(spa_t *spa, zio_t *zio); -extern void zio_resume(spa_t *spa); +extern int zio_resume(spa_t *spa); extern void zio_resume_wait(spa_t *spa); /* @@ -435,7 +439,7 @@ extern int zio_inject_list_next(int *id, char *name, size_t buflen, struct zinject_record *record); extern int zio_clear_fault(int id); extern int zio_handle_fault_injection(zio_t *zio, int error); -extern int zio_handle_device_injection(vdev_t *vd, int error); +extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error); extern int zio_handle_label_injection(zio_t *zio, int error); #ifdef __cplusplus diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 412832968..77556ac5d 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -36,18 +36,35 @@ uint64_t metaslab_aliquot = 512ULL << 10; uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ /* + * Minimum size which forces the dynamic allocator to change + * it's allocation strategy. Once the space map cannot satisfy + * an allocation of this size then it switches to using more + * aggressive strategy (i.e search by size rather than offset). + */ +uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE; + +/* + * The minimum free space, in percent, which must be available + * in a space map to continue allocations in a first-fit fashion. + * Once the space_map's free space drops below this level we dynamically + * switch to using best-fit allocations. + */ +int metaslab_df_free_pct = 30; + +/* * ========================================================================== * Metaslab classes * ========================================================================== */ metaslab_class_t * -metaslab_class_create(void) +metaslab_class_create(space_map_ops_t *ops) { metaslab_class_t *mc; mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); mc->mc_rotor = NULL; + mc->mc_ops = ops; return (mc); } @@ -202,30 +219,14 @@ metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) } /* - * ========================================================================== - * The first-fit block allocator - * ========================================================================== + * This is a helper function that can be used by the allocator to find + * a suitable block to allocate. This will search the specified AVL + * tree looking for a block that matches the specified criteria. */ -static void -metaslab_ff_load(space_map_t *sm) -{ - ASSERT(sm->sm_ppd == NULL); - sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP); -} - -static void -metaslab_ff_unload(space_map_t *sm) -{ - kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t)); - sm->sm_ppd = NULL; -} - static uint64_t -metaslab_ff_alloc(space_map_t *sm, uint64_t size) +metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, + uint64_t align) { - avl_tree_t *t = &sm->sm_root; - uint64_t align = size & -size; - uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; space_seg_t *ss, ssearch; avl_index_t where; @@ -254,7 +255,37 @@ metaslab_ff_alloc(space_map_t *sm, uint64_t size) return (-1ULL); *cursor = 0; - return (metaslab_ff_alloc(sm, size)); + return (metaslab_block_picker(t, cursor, size, align)); +} + +/* + * ========================================================================== + * The first-fit block allocator + * ========================================================================== + */ +static void +metaslab_ff_load(space_map_t *sm) +{ + ASSERT(sm->sm_ppd == NULL); + sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP); + sm->sm_pp_root = NULL; +} + +static void +metaslab_ff_unload(space_map_t *sm) +{ + kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t)); + sm->sm_ppd = NULL; +} + +static uint64_t +metaslab_ff_alloc(space_map_t *sm, uint64_t size) +{ + avl_tree_t *t = &sm->sm_root; + uint64_t align = size & -size; + uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; + + return (metaslab_block_picker(t, cursor, size, align)); } /* ARGSUSED */ @@ -276,9 +307,136 @@ static space_map_ops_t metaslab_ff_ops = { metaslab_ff_unload, metaslab_ff_alloc, metaslab_ff_claim, - metaslab_ff_free + metaslab_ff_free, + NULL /* maxsize */ +}; + +/* + * Dynamic block allocator - + * Uses the first fit allocation scheme until space get low and then + * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold + * and metaslab_df_free_pct to determine when to switch the allocation scheme. + */ + +uint64_t +metaslab_df_maxsize(space_map_t *sm) +{ + avl_tree_t *t = sm->sm_pp_root; + space_seg_t *ss; + + if (t == NULL || (ss = avl_last(t)) == NULL) + return (0ULL); + + return (ss->ss_end - ss->ss_start); +} + +static int +metaslab_df_seg_compare(const void *x1, const void *x2) +{ + const space_seg_t *s1 = x1; + const space_seg_t *s2 = x2; + uint64_t ss_size1 = s1->ss_end - s1->ss_start; + uint64_t ss_size2 = s2->ss_end - s2->ss_start; + + if (ss_size1 < ss_size2) + return (-1); + if (ss_size1 > ss_size2) + return (1); + + if (s1->ss_start < s2->ss_start) + return (-1); + if (s1->ss_start > s2->ss_start) + return (1); + + return (0); +} + +static void +metaslab_df_load(space_map_t *sm) +{ + space_seg_t *ss; + + ASSERT(sm->sm_ppd == NULL); + sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP); + + sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); + avl_create(sm->sm_pp_root, metaslab_df_seg_compare, + sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node)); + + for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) + avl_add(sm->sm_pp_root, ss); +} + +static void +metaslab_df_unload(space_map_t *sm) +{ + void *cookie = NULL; + + kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t)); + sm->sm_ppd = NULL; + + while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) { + /* tear down the tree */ + } + + avl_destroy(sm->sm_pp_root); + kmem_free(sm->sm_pp_root, sizeof (avl_tree_t)); + sm->sm_pp_root = NULL; +} + +static uint64_t +metaslab_df_alloc(space_map_t *sm, uint64_t size) +{ + avl_tree_t *t = &sm->sm_root; + uint64_t align = size & -size; + uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; + uint64_t max_size = metaslab_df_maxsize(sm); + int free_pct = sm->sm_space * 100 / sm->sm_size; + + ASSERT(MUTEX_HELD(sm->sm_lock)); + ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); + + if (max_size < size) + return (-1ULL); + + /* + * If we're running low on space switch to using the size + * sorted AVL tree (best-fit). + */ + if (max_size < metaslab_df_alloc_threshold || + free_pct < metaslab_df_free_pct) { + t = sm->sm_pp_root; + *cursor = 0; + } + + return (metaslab_block_picker(t, cursor, size, 1ULL)); +} + +/* ARGSUSED */ +static void +metaslab_df_claim(space_map_t *sm, uint64_t start, uint64_t size) +{ + /* No need to update cursor */ +} + +/* ARGSUSED */ +static void +metaslab_df_free(space_map_t *sm, uint64_t start, uint64_t size) +{ + /* No need to update cursor */ +} + +static space_map_ops_t metaslab_df_ops = { + metaslab_df_load, + metaslab_df_unload, + metaslab_df_alloc, + metaslab_df_claim, + metaslab_df_free, + metaslab_df_maxsize }; +space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops; + /* * ========================================================================== * Metaslabs @@ -414,20 +572,28 @@ metaslab_weight(metaslab_t *msp) } static int -metaslab_activate(metaslab_t *msp, uint64_t activation_weight) +metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size) { space_map_t *sm = &msp->ms_map; + space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops; ASSERT(MUTEX_HELD(&msp->ms_lock)); if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { - int error = space_map_load(sm, &metaslab_ff_ops, - SM_FREE, &msp->ms_smo, + int error = space_map_load(sm, sm_ops, SM_FREE, &msp->ms_smo, msp->ms_group->mg_vd->vdev_spa->spa_meta_objset); if (error) { metaslab_group_sort(msp->ms_group, msp, 0); return (error); } + + /* + * If we were able to load the map then make sure + * that this map is still able to satisfy our request. + */ + if (msp->ms_weight < size) + return (ENOSPC); + metaslab_group_sort(msp->ms_group, msp, msp->ms_weight | activation_weight); } @@ -636,11 +802,16 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, int i; activation_weight = METASLAB_WEIGHT_PRIMARY; - for (i = 0; i < d; i++) - if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) + for (i = 0; i < d; i++) { + if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { activation_weight = METASLAB_WEIGHT_SECONDARY; + break; + } + } for (;;) { + boolean_t was_active; + mutex_enter(&mg->mg_lock); for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { if (msp->ms_weight < size) { @@ -648,6 +819,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, return (-1ULL); } + was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; if (activation_weight == METASLAB_WEIGHT_PRIMARY) break; @@ -673,7 +845,9 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, * another thread may have changed the weight while we * were blocked on the metaslab lock. */ - if (msp->ms_weight < size) { + if (msp->ms_weight < size || (was_active && + !(msp->ms_weight & METASLAB_ACTIVE_MASK) && + activation_weight == METASLAB_WEIGHT_PRIMARY)) { mutex_exit(&msp->ms_lock); continue; } @@ -686,7 +860,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, continue; } - if (metaslab_activate(msp, activation_weight) != 0) { + if (metaslab_activate(msp, activation_weight, size) != 0) { mutex_exit(&msp->ms_lock); continue; } @@ -869,7 +1043,7 @@ next: goto top; } - if (!zio_lock) { + if (!allocatable && !zio_lock) { dshift = 3; zio_lock = B_TRUE; goto top; @@ -955,7 +1129,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) mutex_enter(&msp->ms_lock); - error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); + error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0); if (error || txg == 0) { /* txg == 0 indicates dry run */ mutex_exit(&msp->ms_lock); return (error); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index df1b7e125..6a95b399b 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -59,6 +59,7 @@ #include <sys/systeminfo.h> #include <sys/sunddi.h> #include <sys/spa_boot.h> +#include <sys/zfs_ioctl.h> #ifdef _KERNEL #include <sys/zone.h> @@ -67,16 +68,44 @@ #include "zfs_prop.h" #include "zfs_comutil.h" -int zio_taskq_threads[ZIO_TYPES][ZIO_TASKQ_TYPES] = { - /* ISSUE INTR */ - { 1, 1 }, /* ZIO_TYPE_NULL */ - { 1, 8 }, /* ZIO_TYPE_READ */ - { 8, 1 }, /* ZIO_TYPE_WRITE */ - { 1, 1 }, /* ZIO_TYPE_FREE */ - { 1, 1 }, /* ZIO_TYPE_CLAIM */ - { 1, 1 }, /* ZIO_TYPE_IOCTL */ +enum zti_modes { + zti_mode_fixed, /* value is # of threads (min 1) */ + zti_mode_online_percent, /* value is % of online CPUs */ + zti_mode_tune, /* fill from zio_taskq_tune_* */ + zti_nmodes }; +#define ZTI_THREAD_FIX(n) { zti_mode_fixed, (n) } +#define ZTI_THREAD_PCT(n) { zti_mode_online_percent, (n) } +#define ZTI_THREAD_TUNE { zti_mode_tune, 0 } + +#define ZTI_THREAD_ONE ZTI_THREAD_FIX(1) + +typedef struct zio_taskq_info { + const char *zti_name; + struct { + enum zti_modes zti_mode; + uint_t zti_value; + } zti_nthreads[ZIO_TASKQ_TYPES]; +} zio_taskq_info_t; + +static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { + "issue", "intr" +}; + +const zio_taskq_info_t zio_taskqs[ZIO_TYPES] = { + /* ISSUE INTR */ + { "spa_zio_null", { ZTI_THREAD_ONE, ZTI_THREAD_ONE } }, + { "spa_zio_read", { ZTI_THREAD_FIX(8), ZTI_THREAD_TUNE } }, + { "spa_zio_write", { ZTI_THREAD_TUNE, ZTI_THREAD_FIX(8) } }, + { "spa_zio_free", { ZTI_THREAD_ONE, ZTI_THREAD_ONE } }, + { "spa_zio_claim", { ZTI_THREAD_ONE, ZTI_THREAD_ONE } }, + { "spa_zio_ioctl", { ZTI_THREAD_ONE, ZTI_THREAD_ONE } }, +}; + +enum zti_modes zio_taskq_tune_mode = zti_mode_online_percent; +uint_t zio_taskq_tune_value = 80; /* #threads = 80% of # online CPUs */ + static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx); static boolean_t spa_has_active_shared_spare(spa_t *spa); @@ -304,12 +333,18 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) case ZPOOL_PROP_DELEGATION: case ZPOOL_PROP_AUTOREPLACE: case ZPOOL_PROP_LISTSNAPS: + case ZPOOL_PROP_AUTOEXPAND: error = nvpair_value_uint64(elem, &intval); if (!error && intval > 1) error = EINVAL; break; case ZPOOL_PROP_BOOTFS: + /* + * If the pool version is less than SPA_VERSION_BOOTFS, + * or the pool is still being created (version == 0), + * the bootfs property cannot be set. + */ if (spa_version(spa) < SPA_VERSION_BOOTFS) { error = ENOTSUP; break; @@ -541,14 +576,50 @@ spa_activate(spa_t *spa, int mode) spa->spa_state = POOL_STATE_ACTIVE; spa->spa_mode = mode; - spa->spa_normal_class = metaslab_class_create(); - spa->spa_log_class = metaslab_class_create(); + spa->spa_normal_class = metaslab_class_create(zfs_metaslab_ops); + spa->spa_log_class = metaslab_class_create(zfs_metaslab_ops); for (int t = 0; t < ZIO_TYPES; t++) { + const zio_taskq_info_t *ztip = &zio_taskqs[t]; for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { - spa->spa_zio_taskq[t][q] = taskq_create("spa_zio", - zio_taskq_threads[t][q], maxclsyspri, 50, - INT_MAX, TASKQ_PREPOPULATE); + enum zti_modes mode = ztip->zti_nthreads[q].zti_mode; + uint_t value = ztip->zti_nthreads[q].zti_value; + char name[32]; + + (void) snprintf(name, sizeof (name), + "%s_%s", ztip->zti_name, zio_taskq_types[q]); + + if (mode == zti_mode_tune) { + mode = zio_taskq_tune_mode; + value = zio_taskq_tune_value; + if (mode == zti_mode_tune) + mode = zti_mode_online_percent; + } + + switch (mode) { + case zti_mode_fixed: + ASSERT3U(value, >=, 1); + value = MAX(value, 1); + + spa->spa_zio_taskq[t][q] = taskq_create(name, + value, maxclsyspri, 50, INT_MAX, + TASKQ_PREPOPULATE); + break; + + case zti_mode_online_percent: + spa->spa_zio_taskq[t][q] = taskq_create(name, + value, maxclsyspri, 50, INT_MAX, + TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT); + break; + + case zti_mode_tune: + default: + panic("unrecognized mode for " + "zio_taskqs[%u]->zti_nthreads[%u] (%u:%u) " + "in spa_activate()", + t, q, mode, value); + break; + } } } @@ -577,7 +648,7 @@ spa_deactivate(spa_t *spa) ASSERT(spa->spa_sync_on == B_FALSE); ASSERT(spa->spa_dsl_pool == NULL); ASSERT(spa->spa_root_vdev == NULL); - + ASSERT(spa->spa_async_zio_root == NULL); ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); txg_list_destroy(&spa->spa_vdev_txg_list); @@ -621,7 +692,7 @@ spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int atype) { nvlist_t **child; - uint_t c, children; + uint_t children; int error; if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) @@ -642,7 +713,7 @@ spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, return (EINVAL); } - for (c = 0; c < children; c++) { + for (int c = 0; c < children; c++) { vdev_t *vd; if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, atype)) != 0) { @@ -683,10 +754,10 @@ spa_unload(spa_t *spa) /* * Wait for any outstanding async I/O to complete. */ - mutex_enter(&spa->spa_async_root_lock); - while (spa->spa_async_root_count != 0) - cv_wait(&spa->spa_async_root_cv, &spa->spa_async_root_lock); - mutex_exit(&spa->spa_async_root_lock); + if (spa->spa_async_zio_root != NULL) { + (void) zio_wait(spa->spa_async_zio_root); + spa->spa_async_zio_root = NULL; + } /* * Close the dsl pool. @@ -828,6 +899,7 @@ spa_load_spares(spa_t *spa) } vd->vdev_top = vd; + vd->vdev_aux = &spa->spa_spares; if (vdev_open(vd) != 0) continue; @@ -869,7 +941,7 @@ spa_load_l2cache(spa_t *spa) nvlist_t **l2cache; uint_t nl2cache; int i, j, oldnvdevs; - uint64_t guid, size; + uint64_t guid; vdev_t *vd, **oldvdevs, **newvdevs; spa_aux_vdev_t *sav = &spa->spa_l2cache; @@ -933,12 +1005,8 @@ spa_load_l2cache(spa_t *spa) (void) vdev_validate_aux(vd); - if (!vdev_is_dead(vd)) { - size = vdev_get_rsize(vd); - l2arc_add_vdev(spa, vd, - VDEV_LABEL_START_SIZE, - size - VDEV_LABEL_START_SIZE); - } + if (!vdev_is_dead(vd)) + l2arc_add_vdev(spa, vd); } } @@ -1001,7 +1069,8 @@ load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) dmu_buf_rele(db, FTAG); packed = kmem_alloc(nvsize, KM_SLEEP); - error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed); + error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, + DMU_READ_PREFETCH); if (error == 0) error = nvlist_unpack(packed, nvsize, value, 0); kmem_free(packed, nvsize); @@ -1016,9 +1085,7 @@ load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) static void spa_check_removed(vdev_t *vd) { - int c; - - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) spa_check_removed(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) { @@ -1028,6 +1095,33 @@ spa_check_removed(vdev_t *vd) } /* + * Load the slog device state from the config object since it's possible + * that the label does not contain the most up-to-date information. + */ +void +spa_load_log_state(spa_t *spa) +{ + nvlist_t *nv, *nvroot, **child; + uint64_t is_log; + uint_t children; + vdev_t *rvd = spa->spa_root_vdev; + + VERIFY(load_nvlist(spa, spa->spa_config_object, &nv) == 0); + VERIFY(nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); + VERIFY(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0); + + for (int c = 0; c < children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + + if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, + &is_log) == 0 && is_log) + vdev_load_log_state(tvd, child[c]); + } + nvlist_free(nv); +} + +/* * Check for missing log devices */ int @@ -1043,13 +1137,7 @@ spa_check_logs(spa_t *spa) return (1); } break; - - case SPA_LOG_CLEAR: - (void) dmu_objset_find(spa->spa_name, zil_clear_log_chain, NULL, - DS_FIND_CHILDREN); - break; } - spa->spa_log_state = SPA_LOG_GOOD; return (0); } @@ -1107,6 +1195,12 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) spa->spa_load_guid = pool_guid; /* + * Create "The Godfather" zio to hold all async IOs + */ + spa->spa_async_zio_root = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); + + /* * Parse the configuration into a vdev tree. We explicitly set the * value that will be returned by spa_version() since parsing the * configuration requires knowing the version number. @@ -1132,16 +1226,19 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) goto out; /* - * Validate the labels for all leaf vdevs. We need to grab the config - * lock because all label I/O is done with ZIO_FLAG_CONFIG_WRITER. + * We need to validate the vdev labels against the configuration that + * we have in hand, which is dependent on the setting of mosconfig. If + * mosconfig is true then we're validating the vdev labels based on + * that config. Otherwise, we're validating against the cached config + * (zpool.cache) that was read when we loaded the zfs module, and then + * later we will recursively call spa_load() and validate against + * the vdev config. */ - if (mosconfig) { - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - error = vdev_validate(rvd); - spa_config_exit(spa, SCL_ALL, FTAG); - if (error != 0) - goto out; - } + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + error = vdev_validate(rvd); + spa_config_exit(spa, SCL_ALL, FTAG); + if (error != 0) + goto out; if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { error = ENXIO; @@ -1372,6 +1469,8 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) spa_config_exit(spa, SCL_ALL, FTAG); } + spa_load_log_state(spa); + if (spa_check_logs(spa)) { vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LOG); @@ -1410,6 +1509,10 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) spa->spa_pool_props_object, zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), sizeof (uint64_t), 1, &spa->spa_failmode); + (void) zap_lookup(spa->spa_meta_objset, + spa->spa_pool_props_object, + zpool_prop_to_name(ZPOOL_PROP_AUTOEXPAND), + sizeof (uint64_t), 1, &spa->spa_autoexpand); } /* @@ -1459,6 +1562,7 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) zil_claim, tx, DS_FIND_CHILDREN); dmu_tx_commit(tx); + spa->spa_log_state = SPA_LOG_GOOD; spa->spa_sync_on = B_TRUE; txg_sync_start(spa->spa_dsl_pool); @@ -1646,6 +1750,8 @@ spa_add_spares(spa_t *spa, nvlist_t *config) uint_t vsc; uint64_t pool; + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); + if (spa->spa_spares.sav_count == 0) return; @@ -1693,11 +1799,11 @@ spa_add_l2cache(spa_t *spa, nvlist_t *config) vdev_stat_t *vs; uint_t vsc; + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); + if (spa->spa_l2cache.sav_count == 0) return; - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, @@ -1731,8 +1837,6 @@ spa_add_l2cache(spa_t *spa, nvlist_t *config) vdev_get_stats(vd, vs); } } - - spa_config_exit(spa, SCL_CONFIG, FTAG); } int @@ -1744,16 +1848,27 @@ spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) *config = NULL; error = spa_open_common(name, &spa, FTAG, config); - if (spa && *config != NULL) { - VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, - spa_get_errlog_size(spa)) == 0); + if (spa != NULL) { + /* + * This still leaves a window of inconsistency where the spares + * or l2cache devices could change and the config would be + * self-inconsistent. + */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - if (spa_suspended(spa)) + if (*config != NULL) { VERIFY(nvlist_add_uint64(*config, - ZPOOL_CONFIG_SUSPENDED, spa->spa_failmode) == 0); + ZPOOL_CONFIG_ERRCOUNT, + spa_get_errlog_size(spa)) == 0); + + if (spa_suspended(spa)) + VERIFY(nvlist_add_uint64(*config, + ZPOOL_CONFIG_SUSPENDED, + spa->spa_failmode) == 0); - spa_add_spares(spa, *config); - spa_add_l2cache(spa, *config); + spa_add_spares(spa, *config); + spa_add_l2cache(spa, *config); + } } /* @@ -1775,8 +1890,10 @@ spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) } } - if (spa != NULL) + if (spa != NULL) { + spa_config_exit(spa, SCL_CONFIG, FTAG); spa_close(spa, FTAG); + } return (error); } @@ -1969,7 +2086,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, vdev_t *rvd; dsl_pool_t *dp; dmu_tx_t *tx; - int c, error = 0; + int error = 0; uint64_t txg = TXG_INITIAL; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; @@ -1995,7 +2112,6 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa->spa_uberblock.ub_txg = txg - 1; if (props && (error = spa_prop_validate(spa, props))) { - spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); @@ -2010,6 +2126,12 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa->spa_ubsync = spa->spa_uberblock; /* + * Create "The Godfather" zio to hold all async IOs + */ + spa->spa_async_zio_root = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); + + /* * Create the root vdev. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); @@ -2026,9 +2148,10 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, (error = vdev_create(rvd, txg, B_FALSE)) == 0 && (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { - for (c = 0; c < rvd->vdev_children; c++) - vdev_init(rvd->vdev_child[c], txg); - vdev_config_dirty(rvd); + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_metaslab_set_size(rvd->vdev_child[c]); + vdev_expand(rvd->vdev_child[c], txg); + } } spa_config_exit(spa, SCL_ALL, FTAG); @@ -2127,6 +2250,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); + spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); if (props != NULL) { spa_configfile_set(spa, props, B_FALSE); spa_sync_props(spa, props, CRED(), tx); @@ -2155,17 +2279,239 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, return (0); } +#ifdef _KERNEL /* - * Import the given pool into the system. We set up the necessary spa_t and - * then call spa_load() to do the dirty work. + * Get the root pool information from the root disk, then import the root pool + * during the system boot up time. */ -static int -spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props, - boolean_t isroot, boolean_t allowfaulted) +extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); + +static nvlist_t * +spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) +{ + nvlist_t *config; + nvlist_t *nvtop, *nvroot; + uint64_t pgid; + + if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) + return (NULL); + + /* + * Add this top-level vdev to the child array. + */ + VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvtop) == 0); + VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &pgid) == 0); + VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); + + /* + * Put this pool's top-level vdevs into a root vdev. + */ + VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_ROOT) == 0); + VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); + VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); + VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &nvtop, 1) == 0); + + /* + * Replace the existing vdev_tree with the new root vdev in + * this pool's configuration (remove the old, add the new). + */ + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); + nvlist_free(nvroot); + return (config); +} + +/* + * Walk the vdev tree and see if we can find a device with "better" + * configuration. A configuration is "better" if the label on that + * device has a more recent txg. + */ +static void +spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) +{ + for (int c = 0; c < vd->vdev_children; c++) + spa_alt_rootvdev(vd->vdev_child[c], avd, txg); + + if (vd->vdev_ops->vdev_op_leaf) { + nvlist_t *label; + uint64_t label_txg; + + if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, + &label) != 0) + return; + + VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, + &label_txg) == 0); + + /* + * Do we have a better boot device? + */ + if (label_txg > *txg) { + *txg = label_txg; + *avd = vd; + } + nvlist_free(label); + } +} + +/* + * Import a root pool. + * + * For x86. devpath_list will consist of devid and/or physpath name of + * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). + * The GRUB "findroot" command will return the vdev we should boot. + * + * For Sparc, devpath_list consists the physpath name of the booting device + * no matter the rootpool is a single device pool or a mirrored pool. + * e.g. + * "/pci@1f,0/ide@d/disk@0,0:a" + */ +int +spa_import_rootpool(char *devpath, char *devid) +{ + spa_t *spa; + vdev_t *rvd, *bvd, *avd = NULL; + nvlist_t *config, *nvtop; + uint64_t guid, txg; + char *pname; + int error; + + /* + * Read the label from the boot device and generate a configuration. + */ + if ((config = spa_generate_rootconf(devpath, devid, &guid)) == NULL) { + cmn_err(CE_NOTE, "Can not read the pool label from '%s'", + devpath); + return (EIO); + } + + VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, + &pname) == 0); + VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); + + mutex_enter(&spa_namespace_lock); + if ((spa = spa_lookup(pname)) != NULL) { + /* + * Remove the existing root pool from the namespace so that we + * can replace it with the correct config we just read in. + */ + spa_remove(spa); + } + + spa = spa_add(pname, NULL); + spa->spa_is_root = B_TRUE; + + /* + * Build up a vdev tree based on the boot device's label config. + */ + VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvtop) == 0); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, + VDEV_ALLOC_ROOTPOOL); + spa_config_exit(spa, SCL_ALL, FTAG); + if (error) { + mutex_exit(&spa_namespace_lock); + nvlist_free(config); + cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", + pname); + return (error); + } + + /* + * Get the boot vdev. + */ + if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { + cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", + (u_longlong_t)guid); + error = ENOENT; + goto out; + } + + /* + * Determine if there is a better boot device. + */ + avd = bvd; + spa_alt_rootvdev(rvd, &avd, &txg); + if (avd != bvd) { + cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " + "try booting from '%s'", avd->vdev_path); + error = EINVAL; + goto out; + } + + /* + * If the boot device is part of a spare vdev then ensure that + * we're booting off the active spare. + */ + if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && + !bvd->vdev_isspare) { + cmn_err(CE_NOTE, "The boot device is currently spared. Please " + "try booting from '%s'", + bvd->vdev_parent->vdev_child[1]->vdev_path); + error = EINVAL; + goto out; + } + + VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); + error = 0; +out: + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + vdev_free(rvd); + spa_config_exit(spa, SCL_ALL, FTAG); + mutex_exit(&spa_namespace_lock); + + nvlist_free(config); + return (error); +} + +#endif + +/* + * Take a pool and insert it into the namespace as if it had been loaded at + * boot. + */ +int +spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props) { spa_t *spa; char *altroot = NULL; - int error, loaderr; + + mutex_enter(&spa_namespace_lock); + if (spa_lookup(pool) != NULL) { + mutex_exit(&spa_namespace_lock); + return (EEXIST); + } + + (void) nvlist_lookup_string(props, + zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); + spa = spa_add(pool, altroot); + + VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); + + if (props != NULL) + spa_configfile_set(spa, props, B_FALSE); + + spa_config_sync(spa, B_FALSE, B_TRUE); + + mutex_exit(&spa_namespace_lock); + + return (0); +} + +/* + * Import a non-root pool into the system. + */ +int +spa_import(const char *pool, nvlist_t *config, nvlist_t *props) +{ + spa_t *spa; + char *altroot = NULL; + int error; nvlist_t *nvroot; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; @@ -2175,18 +2521,8 @@ spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props, */ mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(pool)) != NULL) { - if (isroot) { - /* - * Remove the existing root pool from the - * namespace so that we can replace it with - * the correct config we just read in. - */ - ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); - spa_remove(spa); - } else { - mutex_exit(&spa_namespace_lock); - return (EEXIST); - } + mutex_exit(&spa_namespace_lock); + return (EEXIST); } /* @@ -2197,29 +2533,29 @@ spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props, spa = spa_add(pool, altroot); spa_activate(spa, spa_mode_global); - if (allowfaulted) - spa->spa_import_faulted = B_TRUE; - spa->spa_is_root = isroot; + /* + * Don't start async tasks until we know everything is healthy. + */ + spa_async_suspend(spa); /* - * Pass off the heavy lifting to spa_load(). - * Pass TRUE for mosconfig (unless this is a root pool) because - * the user-supplied config is actually the one to trust when + * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig + * because the user-supplied config is actually the one to trust when * doing an import. */ - loaderr = error = spa_load(spa, config, SPA_LOAD_IMPORT, !isroot); + error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* - * Toss any existing sparelist, as it doesn't have any validity anymore, - * and conflicts with spa_has_spare(). + * Toss any existing sparelist, as it doesn't have any validity + * anymore, and conflicts with spa_has_spare(). */ - if (!isroot && spa->spa_spares.sav_config) { + if (spa->spa_spares.sav_config) { nvlist_free(spa->spa_spares.sav_config); spa->spa_spares.sav_config = NULL; spa_load_spares(spa); } - if (!isroot && spa->spa_l2cache.sav_config) { + if (spa->spa_l2cache.sav_config) { nvlist_free(spa->spa_l2cache.sav_config); spa->spa_l2cache.sav_config = NULL; spa_load_l2cache(spa); @@ -2228,7 +2564,8 @@ spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props, VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); if (error == 0) - error = spa_validate_aux(spa, nvroot, -1ULL, VDEV_ALLOC_SPARE); + error = spa_validate_aux(spa, nvroot, -1ULL, + VDEV_ALLOC_SPARE); if (error == 0) error = spa_validate_aux(spa, nvroot, -1ULL, VDEV_ALLOC_L2CACHE); @@ -2239,29 +2576,15 @@ spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props, if (error != 0 || (props && spa_writeable(spa) && (error = spa_prop_set(spa, props)))) { - if (loaderr != 0 && loaderr != EINVAL && allowfaulted) { - /* - * If we failed to load the pool, but 'allowfaulted' is - * set, then manually set the config as if the config - * passed in was specified in the cache file. - */ - error = 0; - spa->spa_import_faulted = B_FALSE; - if (spa->spa_config == NULL) - spa->spa_config = spa_config_generate(spa, - NULL, -1ULL, B_TRUE); - spa_unload(spa); - spa_deactivate(spa); - spa_config_sync(spa, B_FALSE, B_TRUE); - } else { - spa_unload(spa); - spa_deactivate(spa); - spa_remove(spa); - } + spa_unload(spa); + spa_deactivate(spa); + spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } + spa_async_resume(spa); + /* * Override any spares and level 2 cache devices as specified by * the user, as these may have correct device names/devids, etc. @@ -2301,246 +2624,20 @@ spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props, /* * Update the config cache to include the newly-imported pool. */ - spa_config_update_common(spa, SPA_CONFIG_UPDATE_POOL, isroot); + spa_config_update_common(spa, SPA_CONFIG_UPDATE_POOL, B_FALSE); } - spa->spa_import_faulted = B_FALSE; - mutex_exit(&spa_namespace_lock); - - return (0); -} - -#ifdef _KERNEL -/* - * Build a "root" vdev for a top level vdev read in from a rootpool - * device label. - */ -static void -spa_build_rootpool_config(nvlist_t *config) -{ - nvlist_t *nvtop, *nvroot; - uint64_t pgid; - - /* - * Add this top-level vdev to the child array. - */ - VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtop) - == 0); - VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pgid) - == 0); - - /* - * Put this pool's top-level vdevs into a root vdev. - */ - VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) - == 0); - VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); - VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); - VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, - &nvtop, 1) == 0); - /* - * Replace the existing vdev_tree with the new root vdev in - * this pool's configuration (remove the old, add the new). + * It's possible that the pool was expanded while it was exported. + * We kick off an async task to handle this for us. */ - VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); - nvlist_free(nvroot); -} + spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); -/* - * Get the root pool information from the root disk, then import the root pool - * during the system boot up time. - */ -extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); - -int -spa_check_rootconf(char *devpath, char *devid, nvlist_t **bestconf, - uint64_t *besttxg) -{ - nvlist_t *config; - uint64_t txg; - int error; - - if (error = vdev_disk_read_rootlabel(devpath, devid, &config)) - return (error); - - VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); - - if (bestconf != NULL) - *bestconf = config; - else - nvlist_free(config); - *besttxg = txg; - return (0); -} - -boolean_t -spa_rootdev_validate(nvlist_t *nv) -{ - uint64_t ival; - - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &ival) == 0 || - nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &ival) == 0 || - nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &ival) == 0) - return (B_FALSE); - - return (B_TRUE); -} - - -/* - * Given the boot device's physical path or devid, check if the device - * is in a valid state. If so, return the configuration from the vdev - * label. - */ -int -spa_get_rootconf(char *devpath, char *devid, nvlist_t **bestconf) -{ - nvlist_t *conf = NULL; - uint64_t txg = 0; - nvlist_t *nvtop, **child; - char *type; - char *bootpath = NULL; - uint_t children, c; - char *tmp; - int error; - - if (devpath && ((tmp = strchr(devpath, ' ')) != NULL)) - *tmp = '\0'; - if (error = spa_check_rootconf(devpath, devid, &conf, &txg)) { - cmn_err(CE_NOTE, "error reading device label"); - return (error); - } - if (txg == 0) { - cmn_err(CE_NOTE, "this device is detached"); - nvlist_free(conf); - return (EINVAL); - } - - VERIFY(nvlist_lookup_nvlist(conf, ZPOOL_CONFIG_VDEV_TREE, - &nvtop) == 0); - VERIFY(nvlist_lookup_string(nvtop, ZPOOL_CONFIG_TYPE, &type) == 0); - - if (strcmp(type, VDEV_TYPE_DISK) == 0) { - if (spa_rootdev_validate(nvtop)) { - goto out; - } else { - nvlist_free(conf); - return (EINVAL); - } - } - - ASSERT(strcmp(type, VDEV_TYPE_MIRROR) == 0); - - VERIFY(nvlist_lookup_nvlist_array(nvtop, ZPOOL_CONFIG_CHILDREN, - &child, &children) == 0); - - /* - * Go thru vdevs in the mirror to see if the given device - * has the most recent txg. Only the device with the most - * recent txg has valid information and should be booted. - */ - for (c = 0; c < children; c++) { - char *cdevid, *cpath; - uint64_t tmptxg; - - cpath = NULL; - cdevid = NULL; - if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_PHYS_PATH, - &cpath) != 0 && nvlist_lookup_string(child[c], - ZPOOL_CONFIG_DEVID, &cdevid) != 0) - return (EINVAL); - if ((spa_check_rootconf(cpath, cdevid, NULL, - &tmptxg) == 0) && (tmptxg > txg)) { - txg = tmptxg; - VERIFY(nvlist_lookup_string(child[c], - ZPOOL_CONFIG_PATH, &bootpath) == 0); - } - } + mutex_exit(&spa_namespace_lock); - /* Does the best device match the one we've booted from? */ - if (bootpath) { - cmn_err(CE_NOTE, "try booting from '%s'", bootpath); - return (EINVAL); - } -out: - *bestconf = conf; return (0); } -/* - * Import a root pool. - * - * For x86. devpath_list will consist of devid and/or physpath name of - * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). - * The GRUB "findroot" command will return the vdev we should boot. - * - * For Sparc, devpath_list consists the physpath name of the booting device - * no matter the rootpool is a single device pool or a mirrored pool. - * e.g. - * "/pci@1f,0/ide@d/disk@0,0:a" - */ -int -spa_import_rootpool(char *devpath, char *devid) -{ - nvlist_t *conf = NULL; - char *pname; - int error; - - /* - * Get the vdev pathname and configuation from the most - * recently updated vdev (highest txg). - */ - if (error = spa_get_rootconf(devpath, devid, &conf)) - goto msg_out; - - /* - * Add type "root" vdev to the config. - */ - spa_build_rootpool_config(conf); - - VERIFY(nvlist_lookup_string(conf, ZPOOL_CONFIG_POOL_NAME, &pname) == 0); - - /* - * We specify 'allowfaulted' for this to be treated like spa_open() - * instead of spa_import(). This prevents us from marking vdevs as - * persistently unavailable, and generates FMA ereports as if it were a - * pool open, not import. - */ - error = spa_import_common(pname, conf, NULL, B_TRUE, B_TRUE); - ASSERT(error != EEXIST); - - nvlist_free(conf); - return (error); - -msg_out: - cmn_err(CE_NOTE, "\n" - " *************************************************** \n" - " * This device is not bootable! * \n" - " * It is either offlined or detached or faulted. * \n" - " * Please try to boot from a different device. * \n" - " *************************************************** "); - - return (error); -} -#endif - -/* - * Import a non-root pool into the system. - */ -int -spa_import(const char *pool, nvlist_t *config, nvlist_t *props) -{ - return (spa_import_common(pool, config, props, B_FALSE, B_FALSE)); -} - -int -spa_import_faulted(const char *pool, nvlist_t *config, nvlist_t *props) -{ - return (spa_import_common(pool, config, props, B_FALSE, B_TRUE)); -} - /* * This (illegal) pool name is used when temporarily importing a spa_t in order @@ -2624,8 +2721,10 @@ spa_tryimport(nvlist_t *tryconfig) /* * Add the list of hot spares and level 2 cache devices. */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa_add_spares(spa, config); spa_add_l2cache(spa, config); + spa_config_exit(spa, SCL_CONFIG, FTAG); } spa_unload(spa); @@ -2971,10 +3070,9 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) } /* - * Compare the new device size with the replaceable/attachable - * device size. + * Make sure the new device is big enough. */ - if (newvd->vdev_psize < vdev_get_rsize(oldvd)) + if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); /* @@ -3018,12 +3116,6 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) newvd->vdev_id = pvd->vdev_children; vdev_add_child(pvd, newvd); - /* - * If newvd is smaller than oldvd, but larger than its rsize, - * the addition of newvd may have decreased our parent's asize. - */ - pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); - tvd = newvd->vdev_top; ASSERT(pvd->vdev_top == tvd); ASSERT(tvd->vdev_parent == rvd); @@ -3039,8 +3131,11 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, open_txg - TXG_INITIAL + 1); - if (newvd->vdev_isspare) + if (newvd->vdev_isspare) { spa_spare_activate(newvd); + spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); + } + oldvdpath = spa_strdup(oldvd->vdev_path); newvdpath = spa_strdup(newvd->vdev_path); newvd_isspare = newvd->vdev_isspare; @@ -3237,12 +3332,16 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) vdev_propagate_state(cvd); /* - * If the device we just detached was smaller than the others, it may be - * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init() - * can't fail because the existing metaslabs are already in core, so - * there's nothing to read from disk. + * If the 'autoexpand' property is set on the pool then automatically + * try to expand the size of the pool. For example if the device we + * just detached was smaller than the others, it may be possible to + * add metaslabs (i.e. grow the pool). We need to reopen the vdev + * first so that we can obtain the updated sizes of the leaf vdevs. */ - VERIFY(vdev_metaslab_init(tvd, txg) == 0); + if (spa->spa_autoexpand) { + vdev_reopen(tvd); + vdev_expand(tvd, txg); + } vdev_config_dirty(tvd); @@ -3400,9 +3499,8 @@ static vdev_t * spa_vdev_resilver_done_hunt(vdev_t *vd) { vdev_t *newvd, *oldvd; - int c; - for (c = 0; c < vd->vdev_children; c++) { + for (int c = 0; c < vd->vdev_children; c++) { oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); if (oldvd != NULL) return (oldvd); @@ -3475,59 +3573,50 @@ spa_vdev_resilver_done(spa_t *spa) } /* - * Update the stored path for this vdev. Dirty the vdev configuration, relying - * on spa_vdev_enter/exit() to synchronize the labels and cache. + * Update the stored path or FRU for this vdev. Dirty the vdev configuration, + * relying on spa_vdev_enter/exit() to synchronize the labels and cache. */ int -spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) +spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, + boolean_t ispath) { vdev_t *vd; uint64_t txg; txg = spa_vdev_enter(spa); - if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) { - /* - * Determine if this is a reference to a hot spare device. If - * it is, update the path manually as there is no associated - * vdev_t that can be synced to disk. - */ - nvlist_t **spares; - uint_t i, nspares; - - if (spa->spa_spares.sav_config != NULL) { - VERIFY(nvlist_lookup_nvlist_array( - spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, - &spares, &nspares) == 0); - for (i = 0; i < nspares; i++) { - uint64_t theguid; - VERIFY(nvlist_lookup_uint64(spares[i], - ZPOOL_CONFIG_GUID, &theguid) == 0); - if (theguid == guid) { - VERIFY(nvlist_add_string(spares[i], - ZPOOL_CONFIG_PATH, newpath) == 0); - spa_load_spares(spa); - spa->spa_spares.sav_sync = B_TRUE; - return (spa_vdev_exit(spa, NULL, txg, - 0)); - } - } - } - + if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_exit(spa, NULL, txg, ENOENT)); - } if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - spa_strfree(vd->vdev_path); - vd->vdev_path = spa_strdup(newpath); + if (ispath) { + spa_strfree(vd->vdev_path); + vd->vdev_path = spa_strdup(value); + } else { + if (vd->vdev_fru != NULL) + spa_strfree(vd->vdev_fru); + vd->vdev_fru = spa_strdup(value); + } vdev_config_dirty(vd->vdev_top); return (spa_vdev_exit(spa, NULL, txg, 0)); } +int +spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) +{ + return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); +} + +int +spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) +{ + return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); +} + /* * ========================================================================== * SPA Scrubbing @@ -3599,6 +3688,37 @@ spa_async_probe(spa_t *spa, vdev_t *vd) } static void +spa_async_autoexpand(spa_t *spa, vdev_t *vd) +{ + sysevent_id_t eid; + nvlist_t *attr; + char *physpath; + + if (!spa->spa_autoexpand) + return; + + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + spa_async_autoexpand(spa, cvd); + } + + if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) + return; + + physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); + (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); + + VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); + + (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, + ESC_DEV_DLE, attr, &eid, DDI_SLEEP); + + nvlist_free(attr); + kmem_free(physpath, MAXPATHLEN); +} + +static void spa_async_thread(spa_t *spa) { int tasks; @@ -3614,9 +3734,33 @@ spa_async_thread(spa_t *spa) * See if the config needs to be updated. */ if (tasks & SPA_ASYNC_CONFIG_UPDATE) { + uint64_t oldsz, space_update; + mutex_enter(&spa_namespace_lock); + oldsz = spa_get_space(spa); spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); + space_update = spa_get_space(spa) - oldsz; mutex_exit(&spa_namespace_lock); + + /* + * If the pool grew as a result of the config update, + * then log an internal history event. + */ + if (space_update) { + dmu_tx_t *tx; + + tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + if (dmu_tx_assign(tx, TXG_WAIT) == 0) { + spa_history_internal_log(LOG_POOL_VDEV_ONLINE, + spa, tx, CRED(), + "pool '%s' size: %llu(+%llu)", + spa_name(spa), spa_get_space(spa), + space_update); + dmu_tx_commit(tx); + } else { + dmu_tx_abort(tx); + } + } } /* @@ -3632,6 +3776,12 @@ spa_async_thread(spa_t *spa) (void) spa_vdev_state_exit(spa, NULL, 0); } + if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + spa_async_autoexpand(spa, spa->spa_root_vdev); + spa_config_exit(spa, SCL_CONFIG, FTAG); + } + /* * See if any devices need to be probed. */ @@ -3944,6 +4094,10 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) case ZPOOL_PROP_FAILUREMODE: spa->spa_failmode = intval; break; + case ZPOOL_PROP_AUTOEXPAND: + spa->spa_autoexpand = intval; + spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); + break; default: break; } @@ -4105,9 +4259,8 @@ spa_sync(spa_t *spa, uint64_t txg) int svdcount = 0; int children = rvd->vdev_children; int c0 = spa_get_random(children); - int c; - for (c = 0; c < children; c++) { + for (int c = 0; c < children; c++) { vd = rvd->vdev_child[(c0 + c) % children]; if (vd->vdev_ms_array == 0 || vd->vdev_islog) continue; @@ -4115,10 +4268,16 @@ spa_sync(spa_t *spa, uint64_t txg) if (svdcount == SPA_DVAS_PER_BP) break; } - error = vdev_config_sync(svd, svdcount, txg); + error = vdev_config_sync(svd, svdcount, txg, B_FALSE); + if (error != 0) + error = vdev_config_sync(svd, svdcount, txg, + B_TRUE); } else { error = vdev_config_sync(rvd->vdev_child, - rvd->vdev_children, txg); + rvd->vdev_children, txg, B_FALSE); + if (error != 0) + error = vdev_config_sync(rvd->vdev_child, + rvd->vdev_children, txg, B_TRUE); } spa_config_exit(spa, SCL_STATE, FTAG); @@ -4239,7 +4398,7 @@ spa_evict_all(void) } vdev_t * -spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t l2cache) +spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) { vdev_t *vd; int i; @@ -4247,12 +4406,18 @@ spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t l2cache) if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) return (vd); - if (l2cache) { + if (aux) { for (i = 0; i < spa->spa_l2cache.sav_count; i++) { vd = spa->spa_l2cache.sav_vdevs[i]; if (vd->vdev_guid == guid) return (vd); } + + for (i = 0; i < spa->spa_spares.sav_count; i++) { + vd = spa->spa_spares.sav_vdevs[i]; + if (vd->vdev_guid == guid) + return (vd); + } } return (NULL); diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c index 1c47efd0c..7103e179b 100644 --- a/module/zfs/spa_config.c +++ b/module/zfs/spa_config.c @@ -432,10 +432,9 @@ spa_config_update_common(spa_t *spa, int what, boolean_t isroot) */ for (c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; - if (tvd->vdev_ms_array == 0) { - vdev_init(tvd, txg); - vdev_config_dirty(tvd); - } + if (tvd->vdev_ms_array == 0) + vdev_metaslab_set_size(tvd); + vdev_expand(tvd, txg); } } spa_config_exit(spa, SCL_ALL, FTAG); diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c index c642bd768..ac0a20aaf 100644 --- a/module/zfs/spa_errlog.c +++ b/module/zfs/spa_errlog.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Routines to manage the on-disk persistent error log. * @@ -61,8 +59,8 @@ * lowercase hexidecimal numbers that don't overflow. */ #ifdef _KERNEL -static uint64_t -strtonum(char *str, char **nptr) +uint64_t +strtonum(const char *str, char **nptr) { uint64_t val = 0; char c; @@ -82,7 +80,8 @@ strtonum(char *str, char **nptr) str++; } - *nptr = str; + if (nptr) + *nptr = (char *)str; return (val); } diff --git a/module/zfs/spa_history.c b/module/zfs/spa_history.c index c997240c1..97d97d847 100644 --- a/module/zfs/spa_history.c +++ b/module/zfs/spa_history.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/spa.h> #include <sys/spa_impl.h> #include <sys/zap.h> @@ -127,12 +125,12 @@ spa_history_advance_bof(spa_t *spa, spa_history_phys_t *shpp) firstread = MIN(sizeof (reclen), shpp->sh_phys_max_off - phys_bof); if ((err = dmu_read(mos, spa->spa_history, phys_bof, firstread, - buf)) != 0) + buf, DMU_READ_PREFETCH)) != 0) return (err); if (firstread != sizeof (reclen)) { if ((err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len, sizeof (reclen) - firstread, - buf + firstread)) != 0) + buf + firstread, DMU_READ_PREFETCH)) != 0) return (err); } @@ -380,10 +378,11 @@ spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf) return (0); } - err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf); + err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf, + DMU_READ_PREFETCH); if (leftover && err == 0) { err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len, - leftover, buf + read_len); + leftover, buf + read_len, DMU_READ_PREFETCH); } mutex_exit(&spa->spa_history_lock); diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 485e83fce..aea3f5625 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -425,7 +425,6 @@ spa_add(const char *name, const char *altroot) spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP); mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_async_root_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); @@ -434,7 +433,6 @@ spa_add(const char *name, const char *altroot) mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); - cv_init(&spa->spa_async_root_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL); @@ -508,12 +506,10 @@ spa_remove(spa_t *spa) spa_config_lock_destroy(spa); cv_destroy(&spa->spa_async_cv); - cv_destroy(&spa->spa_async_root_cv); cv_destroy(&spa->spa_scrub_io_cv); cv_destroy(&spa->spa_suspend_cv); mutex_destroy(&spa->spa_async_lock); - mutex_destroy(&spa->spa_async_root_lock); mutex_destroy(&spa->spa_scrub_lock); mutex_destroy(&spa->spa_errlog_lock); mutex_destroy(&spa->spa_errlist_lock); diff --git a/module/zfs/space_map.c b/module/zfs/space_map.c index 1cdacc81d..75b55d5c1 100644 --- a/module/zfs/space_map.c +++ b/module/zfs/space_map.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -116,12 +116,23 @@ space_map_add(space_map_t *sm, uint64_t start, uint64_t size) if (merge_before && merge_after) { avl_remove(&sm->sm_root, ss_before); + if (sm->sm_pp_root) { + avl_remove(sm->sm_pp_root, ss_before); + avl_remove(sm->sm_pp_root, ss_after); + } ss_after->ss_start = ss_before->ss_start; kmem_free(ss_before, sizeof (*ss_before)); + ss = ss_after; } else if (merge_before) { ss_before->ss_end = end; + if (sm->sm_pp_root) + avl_remove(sm->sm_pp_root, ss_before); + ss = ss_before; } else if (merge_after) { ss_after->ss_start = start; + if (sm->sm_pp_root) + avl_remove(sm->sm_pp_root, ss_after); + ss = ss_after; } else { ss = kmem_alloc(sizeof (*ss), KM_SLEEP); ss->ss_start = start; @@ -129,6 +140,9 @@ space_map_add(space_map_t *sm, uint64_t start, uint64_t size) avl_insert(&sm->sm_root, ss, where); } + if (sm->sm_pp_root) + avl_add(sm->sm_pp_root, ss); + sm->sm_space += size; } @@ -163,12 +177,17 @@ space_map_remove(space_map_t *sm, uint64_t start, uint64_t size) left_over = (ss->ss_start != start); right_over = (ss->ss_end != end); + if (sm->sm_pp_root) + avl_remove(sm->sm_pp_root, ss); + if (left_over && right_over) { newseg = kmem_alloc(sizeof (*newseg), KM_SLEEP); newseg->ss_start = end; newseg->ss_end = ss->ss_end; ss->ss_end = start; avl_insert_here(&sm->sm_root, newseg, ss, AVL_AFTER); + if (sm->sm_pp_root) + avl_add(sm->sm_pp_root, newseg); } else if (left_over) { ss->ss_end = start; } else if (right_over) { @@ -176,8 +195,12 @@ space_map_remove(space_map_t *sm, uint64_t start, uint64_t size) } else { avl_remove(&sm->sm_root, ss); kmem_free(ss, sizeof (*ss)); + ss = NULL; } + if (sm->sm_pp_root && ss != NULL) + avl_add(sm->sm_pp_root, ss); + sm->sm_space -= size; } @@ -288,7 +311,8 @@ space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype, smo->smo_object, offset, size); mutex_exit(sm->sm_lock); - error = dmu_read(os, smo->smo_object, offset, size, entry_map); + error = dmu_read(os, smo->smo_object, offset, size, entry_map, + DMU_READ_PREFETCH); mutex_enter(sm->sm_lock); if (error != 0) break; @@ -342,6 +366,15 @@ space_map_unload(space_map_t *sm) } uint64_t +space_map_maxsize(space_map_t *sm) +{ + if (sm->sm_loaded && sm->sm_ops != NULL) + return (sm->sm_ops->smop_max(sm)); + else + return (-1ULL); +} + +uint64_t space_map_alloc(space_map_t *sm, uint64_t size) { uint64_t start; diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 2554f96a9..3fa677e05 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -39,6 +39,7 @@ #include <sys/zap.h> #include <sys/fs/zfs.h> #include <sys/arc.h> +#include <sys/zil.h> /* * Virtual device management. @@ -83,9 +84,8 @@ vdev_default_asize(vdev_t *vd, uint64_t psize) { uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); uint64_t csize; - uint64_t c; - for (c = 0; c < vd->vdev_children; c++) { + for (int c = 0; c < vd->vdev_children; c++) { csize = vdev_psize_to_asize(vd->vdev_child[c], psize); asize = MAX(asize, csize); } @@ -94,40 +94,47 @@ vdev_default_asize(vdev_t *vd, uint64_t psize) } /* - * Get the replaceable or attachable device size. - * If the parent is a mirror or raidz, the replaceable size is the minimum - * psize of all its children. For the rest, just return our own psize. - * - * e.g. - * psize rsize - * root - - - * mirror/raidz - - - * disk1 20g 20g - * disk2 40g 20g - * disk3 80g 80g + * Get the minimum allocatable size. We define the allocatable size as + * the vdev's asize rounded to the nearest metaslab. This allows us to + * replace or attach devices which don't have the same physical size but + * can still satisfy the same number of allocations. */ uint64_t -vdev_get_rsize(vdev_t *vd) +vdev_get_min_asize(vdev_t *vd) { - vdev_t *pvd, *cvd; - uint64_t c, rsize; + vdev_t *pvd = vd->vdev_parent; - pvd = vd->vdev_parent; + /* + * The our parent is NULL (inactive spare or cache) or is the root, + * just return our own asize. + */ + if (pvd == NULL) + return (vd->vdev_asize); /* - * If our parent is NULL or the root, just return our own psize. + * The top-level vdev just returns the allocatable size rounded + * to the nearest metaslab. */ - if (pvd == NULL || pvd->vdev_parent == NULL) - return (vd->vdev_psize); + if (vd == vd->vdev_top) + return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); - rsize = 0; + /* + * The allocatable space for a raidz vdev is N * sizeof(smallest child), + * so each child must provide at least 1/Nth of its asize. + */ + if (pvd->vdev_ops == &vdev_raidz_ops) + return (pvd->vdev_min_asize / pvd->vdev_children); - for (c = 0; c < pvd->vdev_children; c++) { - cvd = pvd->vdev_child[c]; - rsize = MIN(rsize - 1, cvd->vdev_psize - 1) + 1; - } + return (pvd->vdev_min_asize); +} + +void +vdev_set_min_asize(vdev_t *vd) +{ + vd->vdev_min_asize = vdev_get_min_asize(vd); - return (rsize); + for (int c = 0; c < vd->vdev_children; c++) + vdev_set_min_asize(vd->vdev_child[c]); } vdev_t * @@ -148,13 +155,12 @@ vdev_lookup_top(spa_t *spa, uint64_t vdev) vdev_t * vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) { - int c; vdev_t *mvd; if (vd->vdev_guid == guid) return (vd); - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != NULL) return (mvd); @@ -250,17 +256,17 @@ vdev_compact_children(vdev_t *pvd) { vdev_t **newchild, *cvd; int oldc = pvd->vdev_children; - int newc, c; + int newc; ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); - for (c = newc = 0; c < oldc; c++) + for (int c = newc = 0; c < oldc; c++) if (pvd->vdev_child[c]) newc++; newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP); - for (c = newc = 0; c < oldc; c++) { + for (int c = newc = 0; c < oldc; c++) { if ((cvd = pvd->vdev_child[c]) != NULL) { newchild[newc] = cvd; cvd->vdev_id = newc++; @@ -372,6 +378,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, } else if (alloctype == VDEV_ALLOC_L2CACHE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (EINVAL); + } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) + return (EINVAL); } /* @@ -435,6 +444,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, &vd->vdev_physpath) == 0) vd->vdev_physpath = spa_strdup(vd->vdev_physpath); + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0) + vd->vdev_fru = spa_strdup(vd->vdev_fru); /* * Set the whole_disk property. If it's not specified, leave the value @@ -448,9 +459,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, * Look for the 'not present' flag. This will only be set if the device * was not present at the time of import. */ - if (!spa->spa_import_faulted) - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, - &vd->vdev_not_present); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, + &vd->vdev_not_present); /* * Get the alignment requirement. @@ -473,13 +483,23 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, * If we're a leaf vdev, try to load the DTL object and other state. */ if (vd->vdev_ops->vdev_op_leaf && - (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE)) { + (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE || + alloctype == VDEV_ALLOC_ROOTPOOL)) { if (alloctype == VDEV_ALLOC_LOAD) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, &vd->vdev_dtl_smo.smo_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, &vd->vdev_unspare); } + + if (alloctype == VDEV_ALLOC_ROOTPOOL) { + uint64_t spare = 0; + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, + &spare) == 0 && spare) + spa_spare_add(vd); + } + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &vd->vdev_offline); @@ -511,7 +531,6 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, void vdev_free(vdev_t *vd) { - int c; spa_t *spa = vd->vdev_spa; /* @@ -525,7 +544,7 @@ vdev_free(vdev_t *vd) /* * Free all children. */ - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) vdev_free(vd->vdev_child[c]); ASSERT(vd->vdev_child == NULL); @@ -560,6 +579,8 @@ vdev_free(vdev_t *vd) spa_strfree(vd->vdev_devid); if (vd->vdev_physpath) spa_strfree(vd->vdev_physpath); + if (vd->vdev_fru) + spa_strfree(vd->vdev_fru); if (vd->vdev_isspare) spa_spare_remove(vd); @@ -653,14 +674,12 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) static void vdev_top_update(vdev_t *tvd, vdev_t *vd) { - int c; - if (vd == NULL) return; vd->vdev_top = tvd; - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) vdev_top_update(tvd, vd->vdev_child[c]); } @@ -679,6 +698,7 @@ vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); mvd->vdev_asize = cvd->vdev_asize; + mvd->vdev_min_asize = cvd->vdev_min_asize; mvd->vdev_ashift = cvd->vdev_ashift; mvd->vdev_state = cvd->vdev_state; @@ -751,6 +771,15 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) if (vd->vdev_ms_shift == 0) /* not being allocated from yet */ return (0); + /* + * Compute the raidz-deflation ratio. Note, we hard-code + * in 128k (1 << 17) because it is the current "typical" blocksize. + * Even if SPA_MAXBLOCKSIZE changes, this algorithm must never change, + * or we will inconsistently account for existing bp's. + */ + vd->vdev_deflate_ratio = (1 << 17) / + (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); + ASSERT(oldc <= newc); if (vd->vdev_islog) @@ -776,7 +805,8 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) if (txg == 0) { uint64_t object = 0; error = dmu_read(mos, vd->vdev_ms_array, - m * sizeof (uint64_t), sizeof (uint64_t), &object); + m * sizeof (uint64_t), sizeof (uint64_t), &object, + DMU_READ_PREFETCH); if (error) return (error); if (object != 0) { @@ -903,7 +933,7 @@ vdev_probe(vdev_t *vd, zio_t *zio) vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | - ZIO_FLAG_DONT_RETRY; + ZIO_FLAG_TRYHARD; if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { /* @@ -950,8 +980,8 @@ vdev_probe(vdev_t *vd, zio_t *zio) for (int l = 1; l < VDEV_LABELS; l++) { zio_nowait(zio_read_phys(pio, vd, vdev_label_offset(vd->vdev_psize, l, - offsetof(vdev_label_t, vl_pad)), - VDEV_SKIP_SIZE, zio_buf_alloc(VDEV_SKIP_SIZE), + offsetof(vdev_label_t, vl_pad2)), + VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE), ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); } @@ -971,7 +1001,6 @@ vdev_open(vdev_t *vd) { spa_t *spa = vd->vdev_spa; int error; - int c; uint64_t osize = 0; uint64_t asize, psize; uint64_t ashift = 0; @@ -983,6 +1012,9 @@ vdev_open(vdev_t *vd) vd->vdev_state == VDEV_STATE_OFFLINE); vd->vdev_stat.vs_aux = VDEV_AUX_NONE; + vd->vdev_cant_read = B_FALSE; + vd->vdev_cant_write = B_FALSE; + vd->vdev_min_asize = vdev_get_min_asize(vd); if (!vd->vdev_removed && vd->vdev_faulted) { ASSERT(vd->vdev_children == 0); @@ -998,7 +1030,7 @@ vdev_open(vdev_t *vd) error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift); if (zio_injection_enabled && error == 0) - error = zio_handle_device_injection(vd, ENXIO); + error = zio_handle_device_injection(vd, NULL, ENXIO); if (error) { if (vd->vdev_removed && @@ -1020,12 +1052,13 @@ vdev_open(vdev_t *vd) vd->vdev_state = VDEV_STATE_HEALTHY; } - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) { if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); break; } + } osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); @@ -1050,6 +1083,15 @@ vdev_open(vdev_t *vd) vd->vdev_psize = psize; + /* + * Make sure the allocatable size hasn't shrunk. + */ + if (asize < vd->vdev_min_asize) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_BAD_LABEL); + return (EINVAL); + } + if (vd->vdev_asize == 0) { /* * This is the first-ever open, so use the computed values. @@ -1066,25 +1108,18 @@ vdev_open(vdev_t *vd) VDEV_AUX_BAD_LABEL); return (EINVAL); } + } - /* - * Make sure the device hasn't shrunk. - */ - if (asize < vd->vdev_asize) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_BAD_LABEL); - return (EINVAL); - } + /* + * If all children are healthy and the asize has increased, + * then we've experienced dynamic LUN growth. If automatic + * expansion is enabled then use the additional space. + */ + if (vd->vdev_state == VDEV_STATE_HEALTHY && asize > vd->vdev_asize && + (vd->vdev_expanding || spa->spa_autoexpand)) + vd->vdev_asize = asize; - /* - * If all children are healthy and the asize has increased, - * then we've experienced dynamic LUN growth. - */ - if (vd->vdev_state == VDEV_STATE_HEALTHY && - asize > vd->vdev_asize) { - vd->vdev_asize = asize; - } - } + vdev_set_min_asize(vd); /* * Ensure we can issue some IO before declaring the @@ -1098,18 +1133,6 @@ vdev_open(vdev_t *vd) } /* - * If this is a top-level vdev, compute the raidz-deflation - * ratio. Note, we hard-code in 128k (1<<17) because it is the - * current "typical" blocksize. Even if SPA_MAXBLOCKSIZE - * changes, this algorithm must never change, or we will - * inconsistently account for existing bp's. - */ - if (vd->vdev_top == vd) { - vd->vdev_deflate_ratio = (1<<17) / - (vdev_psize_to_asize(vd, 1<<17) >> SPA_MINBLOCKSHIFT); - } - - /* * If a leaf vdev has a DTL, and seems healthy, then kick off a * resilver. But don't do this if we are doing a reopen for a scrub, * since this would just restart the scrub we are already doing. @@ -1135,12 +1158,11 @@ int vdev_validate(vdev_t *vd) { spa_t *spa = vd->vdev_spa; - int c; nvlist_t *label; uint64_t guid, top_guid; uint64_t state; - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) if (vdev_validate(vd->vdev_child[c]) != 0) return (EBADF); @@ -1226,7 +1248,7 @@ vdev_close(vdev_t *vd) vdev_cache_purge(vd); /* - * We record the previous state before we close it, so that if we are + * We record the previous state before we close it, so that if we are * doing a reopen(), we don't generate FMA ereports if we notice that * it's still faulted. */ @@ -1257,12 +1279,9 @@ vdev_reopen(vdev_t *vd) if (vd->vdev_aux) { (void) vdev_validate_aux(vd); if (vdev_readable(vd) && vdev_writeable(vd) && - !l2arc_vdev_present(vd)) { - uint64_t size = vdev_get_rsize(vd); - l2arc_add_vdev(spa, vd, - VDEV_LABEL_START_SIZE, - size - VDEV_LABEL_START_SIZE); - } + vd->vdev_aux == &spa->spa_l2cache && + !l2arc_vdev_present(vd)) + l2arc_add_vdev(spa, vd); } else { (void) vdev_validate(vd); } @@ -1302,26 +1321,14 @@ vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) return (0); } -/* - * The is the latter half of vdev_create(). It is distinct because it - * involves initiating transactions in order to do metaslab creation. - * For creation, we want to try to create all vdevs at once and then undo it - * if anything fails; this is much harder if we have pending transactions. - */ void -vdev_init(vdev_t *vd, uint64_t txg) +vdev_metaslab_set_size(vdev_t *vd) { /* * Aim for roughly 200 metaslabs per vdev. */ vd->vdev_ms_shift = highbit(vd->vdev_asize / 200); vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT); - - /* - * Initialize the vdev's metaslabs. This can't fail because - * there's nothing to read when creating all new metaslabs. - */ - VERIFY(vdev_metaslab_init(vd, txg) == 0); } void @@ -1879,7 +1886,7 @@ vdev_degrade(spa_t *spa, uint64_t guid) int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) { - vdev_t *vd; + vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; spa_vdev_state_enter(spa); @@ -1889,13 +1896,26 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + tvd = vd->vdev_top; vd->vdev_offline = B_FALSE; vd->vdev_tmpoffline = B_FALSE; vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); - vdev_reopen(vd->vdev_top); + + /* XXX - L2ARC 1.0 does not support expansion */ + if (!vd->vdev_aux) { + for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) + pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND); + } + + vdev_reopen(tvd); vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; + if (!vd->vdev_aux) { + for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) + pvd->vdev_expanding = B_FALSE; + } + if (newstate) *newstate = vd->vdev_state; if ((flags & ZFS_ONLINE_UNSPARE) && @@ -1904,13 +1924,21 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; + if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) { + + /* XXX - L2ARC 1.0 does not support expansion */ + if (vd->vdev_aux) + return (spa_vdev_state_exit(spa, vd, ENOTSUP)); + spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); + } return (spa_vdev_state_exit(spa, vd, 0)); } int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) { - vdev_t *vd; + vdev_t *vd, *tvd; + int error; spa_vdev_state_enter(spa); @@ -1920,34 +1948,58 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + tvd = vd->vdev_top; + /* * If the device isn't already offline, try to offline it. */ if (!vd->vdev_offline) { /* * If this device has the only valid copy of some data, - * don't allow it to be offlined. + * don't allow it to be offlined. Log devices are always + * expendable. */ - if (vd->vdev_aux == NULL && vdev_dtl_required(vd)) + if (!tvd->vdev_islog && vd->vdev_aux == NULL && + vdev_dtl_required(vd)) return (spa_vdev_state_exit(spa, NULL, EBUSY)); /* * Offline this device and reopen its top-level vdev. - * If this action results in the top-level vdev becoming - * unusable, undo it and fail the request. + * If the top-level vdev is a log device then just offline + * it. Otherwise, if this action results in the top-level + * vdev becoming unusable, undo it and fail the request. */ vd->vdev_offline = B_TRUE; - vdev_reopen(vd->vdev_top); - if (vd->vdev_aux == NULL && vdev_is_dead(vd->vdev_top)) { + vdev_reopen(tvd); + + if (!tvd->vdev_islog && vd->vdev_aux == NULL && + vdev_is_dead(tvd)) { vd->vdev_offline = B_FALSE; - vdev_reopen(vd->vdev_top); + vdev_reopen(tvd); return (spa_vdev_state_exit(spa, NULL, EBUSY)); } } vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); - return (spa_vdev_state_exit(spa, vd, 0)); + if (!tvd->vdev_islog || !vdev_is_dead(tvd)) + return (spa_vdev_state_exit(spa, vd, 0)); + + (void) spa_vdev_state_exit(spa, vd, 0); + + error = dmu_objset_find(spa_name(spa), zil_vdev_offline, + NULL, DS_FIND_CHILDREN); + if (error) { + (void) vdev_online(spa, guid, 0, NULL); + return (error); + } + /* + * If we successfully offlined the log device then we need to + * sync out the current txg so that the "stubby" block can be + * removed by zil_sync(). + */ + txg_wait_synced(spa->spa_dsl_pool, 0); + return (0); } /* @@ -2062,7 +2114,9 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) vs->vs_scrub_errors = vd->vdev_spa->spa_scrub_errors; vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; - vs->vs_rsize = vdev_get_rsize(vd); + vs->vs_rsize = vdev_get_min_asize(vd); + if (vd->vdev_ops->vdev_op_leaf) + vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; mutex_exit(&vd->vdev_stat_lock); /* @@ -2155,14 +2209,24 @@ vdev_stat_update(zio_t *zio, uint64_t psize) if (flags & ZIO_FLAG_SPECULATIVE) return; + /* + * If this is an I/O error that is going to be retried, then ignore the + * error. Otherwise, the user may interpret B_FAILFAST I/O errors as + * hard errors, when in reality they can happen for any number of + * innocuous reasons (bus resets, MPxIO link failure, etc). + */ + if (zio->io_error == EIO && + !(zio->io_flags & ZIO_FLAG_IO_RETRY)) + return; + mutex_enter(&vd->vdev_stat_lock); - if (type == ZIO_TYPE_READ) { + if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) { if (zio->io_error == ECKSUM) vs->vs_checksum_errors++; else vs->vs_read_errors++; } - if (type == ZIO_TYPE_WRITE) + if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd)) vs->vs_write_errors++; mutex_exit(&vd->vdev_stat_lock); @@ -2205,10 +2269,9 @@ vdev_stat_update(zio_t *zio, uint64_t psize) void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete) { - int c; vdev_stat_t *vs = &vd->vdev_stat; - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) vdev_scrub_stat_update(vd->vdev_child[c], type, complete); mutex_enter(&vd->vdev_stat_lock); @@ -2252,6 +2315,7 @@ vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta, * childrens', thus not accurate enough for us. */ ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0); + ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio; @@ -2293,8 +2357,8 @@ vdev_config_dirty(vdev_t *vd) int c; /* - * If this is an aux vdev (as with l2cache devices), then we update the - * vdev config manually and set the sync flag. + * If this is an aux vdev (as with l2cache and spare devices), then we + * update the vdev config manually and set the sync flag. */ if (vd->vdev_aux != NULL) { spa_aux_vdev_t *sav = vd->vdev_aux; @@ -2316,8 +2380,11 @@ vdev_config_dirty(vdev_t *vd) sav->sav_sync = B_TRUE; - VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, - ZPOOL_CONFIG_L2CACHE, &aux, &naux) == 0); + if (nvlist_lookup_nvlist_array(sav->sav_config, + ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) { + VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, + ZPOOL_CONFIG_SPARES, &aux, &naux) == 0); + } ASSERT(c < naux); @@ -2415,11 +2482,10 @@ vdev_propagate_state(vdev_t *vd) vdev_t *rvd = spa->spa_root_vdev; int degraded = 0, faulted = 0; int corrupted = 0; - int c; vdev_t *child; if (vd->vdev_children > 0) { - for (c = 0; c < vd->vdev_children; c++) { + for (int c = 0; c < vd->vdev_children; c++) { child = vd->vdev_child[c]; if (!vdev_readable(child) || @@ -2523,7 +2589,6 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) * an error. */ if (spa->spa_load_state == SPA_LOAD_IMPORT && - !spa->spa_import_faulted && vd->vdev_ops->vdev_op_leaf) vd->vdev_not_present = 1; @@ -2582,8 +2647,8 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) vd->vdev_removed = B_FALSE; } - if (!isopen) - vdev_propagate_state(vd); + if (!isopen && vd->vdev_parent) + vdev_propagate_state(vd->vdev_parent); } /* @@ -2595,8 +2660,6 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) boolean_t vdev_is_bootable(vdev_t *vd) { - int c; - if (!vd->vdev_ops->vdev_op_leaf) { char *vdev_type = vd->vdev_ops->vdev_op_type; @@ -2611,9 +2674,53 @@ vdev_is_bootable(vdev_t *vd) return (B_FALSE); } - for (c = 0; c < vd->vdev_children; c++) { + for (int c = 0; c < vd->vdev_children; c++) { if (!vdev_is_bootable(vd->vdev_child[c])) return (B_FALSE); } return (B_TRUE); } + +void +vdev_load_log_state(vdev_t *vd, nvlist_t *nv) +{ + uint_t children; + nvlist_t **child; + uint64_t val; + spa_t *spa = vd->vdev_spa; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0) { + for (int c = 0; c < children; c++) + vdev_load_log_state(vd->vdev_child[c], child[c]); + } + + if (vd->vdev_ops->vdev_op_leaf && nvlist_lookup_uint64(nv, + ZPOOL_CONFIG_OFFLINE, &val) == 0 && val) { + + /* + * It would be nice to call vdev_offline() + * directly but the pool isn't fully loaded and + * the txg threads have not been started yet. + */ + spa_config_enter(spa, SCL_STATE_ALL, FTAG, RW_WRITER); + vd->vdev_offline = val; + vdev_reopen(vd->vdev_top); + spa_config_exit(spa, SCL_STATE_ALL, FTAG); + } +} + +/* + * Expand a vdev if possible. + */ +void +vdev_expand(vdev_t *vd, uint64_t txg) +{ + ASSERT(vd->vdev_top == vd); + ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) { + VERIFY(vdev_metaslab_init(vd, txg) == 0); + vdev_config_dirty(vd); + } +} diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index f61de3c4e..48d5fc232 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -233,6 +233,10 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, vd->vdev_physpath) == 0); + if (vd->vdev_fru != NULL) + VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_FRU, + vd->vdev_fru) == 0); + if (vd->vdev_nparity != 0) { ASSERT(strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_RAIDZ) == 0); @@ -335,8 +339,8 @@ vdev_label_read_config(vdev_t *vd) nvlist_t *config = NULL; vdev_phys_t *vp; zio_t *zio; - int flags = - ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | + ZIO_FLAG_SPECULATIVE; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); @@ -345,6 +349,7 @@ vdev_label_read_config(vdev_t *vd) vp = zio_buf_alloc(sizeof (vdev_phys_t)); +retry: for (int l = 0; l < VDEV_LABELS; l++) { zio = zio_root(spa, NULL, NULL, flags); @@ -364,6 +369,11 @@ vdev_label_read_config(vdev_t *vd) } } + if (config == NULL && !(flags & ZIO_FLAG_TRYHARD)) { + flags |= ZIO_FLAG_TRYHARD; + goto retry; + } + zio_buf_free(vp, sizeof (vdev_phys_t)); return (config); @@ -488,7 +498,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) spa_t *spa = vd->vdev_spa; nvlist_t *label; vdev_phys_t *vp; - vdev_boot_header_t *vb; + char *pad2; uberblock_t *ub; zio_t *zio; char *buf; @@ -630,16 +640,6 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) } /* - * Initialize boot block header. - */ - vb = zio_buf_alloc(sizeof (vdev_boot_header_t)); - bzero(vb, sizeof (vdev_boot_header_t)); - vb->vb_magic = VDEV_BOOT_MAGIC; - vb->vb_version = VDEV_BOOT_VERSION; - vb->vb_offset = VDEV_BOOT_OFFSET; - vb->vb_size = VDEV_BOOT_SIZE; - - /* * Initialize uberblock template. */ ub = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)); @@ -647,9 +647,14 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) *ub = spa->spa_uberblock; ub->ub_txg = 0; + /* Initialize the 2nd padding area. */ + pad2 = zio_buf_alloc(VDEV_PAD_SIZE); + bzero(pad2, VDEV_PAD_SIZE); + /* * Write everything in parallel. */ +retry: zio = zio_root(spa, NULL, NULL, flags); for (int l = 0; l < VDEV_LABELS; l++) { @@ -658,9 +663,14 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), NULL, NULL, flags); - vdev_label_write(zio, vd, l, vb, - offsetof(vdev_label_t, vl_boot_header), - sizeof (vdev_boot_header_t), NULL, NULL, flags); + /* + * Skip the 1st padding area. + * Zero out the 2nd padding area where it might have + * left over data from previous filesystem format. + */ + vdev_label_write(zio, vd, l, pad2, + offsetof(vdev_label_t, vl_pad2), + VDEV_PAD_SIZE, NULL, NULL, flags); for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { vdev_label_write(zio, vd, l, ub, @@ -671,9 +681,14 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) error = zio_wait(zio); + if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) { + flags |= ZIO_FLAG_TRYHARD; + goto retry; + } + nvlist_free(label); + zio_buf_free(pad2, VDEV_PAD_SIZE); zio_buf_free(ub, VDEV_UBERBLOCK_SIZE(vd)); - zio_buf_free(vb, sizeof (vdev_boot_header_t)); zio_buf_free(vp, sizeof (vdev_phys_t)); /* @@ -757,8 +772,8 @@ vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; - int flags = - ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | + ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD; if (vd == rvd) { ASSERT(zio == NULL); @@ -996,7 +1011,7 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags) * at any time, you can just call it again, and it will resume its work. */ int -vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) +vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg, boolean_t tryhard) { spa_t *spa = svd[0]->vdev_spa; uberblock_t *ub = &spa->spa_uberblock; @@ -1005,6 +1020,16 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) int error; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; + /* + * Normally, we don't want to try too hard to write every label and + * uberblock. If there is a flaky disk, we don't want the rest of the + * sync process to block while we retry. But if we can't write a + * single label out, we should retry with ZIO_FLAG_TRYHARD before + * bailing out and declaring the pool faulted. + */ + if (tryhard) + flags |= ZIO_FLAG_TRYHARD; + ASSERT(ub->ub_txg <= txg); /* diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index 4e3c20acd..5e57a1513 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -48,10 +48,11 @@ int zfs_vdev_time_shift = 6; int zfs_vdev_ramp_rate = 2; /* - * i/os will be aggregated into a single large i/o up to - * zfs_vdev_aggregation_limit bytes long. + * To reduce IOPs, we aggregate small adjacent i/os into one large i/o. + * For read i/os, we also aggregate across small adjacency gaps. */ int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE; +int zfs_vdev_read_gap_limit = 32 << 10; /* * Virtual device vector for disk I/O scheduling. @@ -159,16 +160,23 @@ vdev_queue_agg_io_done(zio_t *aio) zio_buf_free(aio->io_data, aio->io_size); } -#define IS_ADJACENT(io, nio) \ - ((io)->io_offset + (io)->io_size == (nio)->io_offset) +/* + * Compute the range spanned by two i/os, which is the endpoint of the last + * (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset). + * Conveniently, the gap between fio and lio is given by -IO_SPAN(lio, fio); + * thus fio and lio are adjacent if and only if IO_SPAN(lio, fio) == 0. + */ +#define IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset) +#define IO_GAP(fio, lio) (-IO_SPAN(lio, fio)) static zio_t * vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) { zio_t *fio, *lio, *aio, *dio, *nio; avl_tree_t *t; - uint64_t size; int flags; + uint64_t maxspan = zfs_vdev_aggregation_limit; + uint64_t maxgap; ASSERT(MUTEX_HELD(&vq->vq_lock)); @@ -179,8 +187,8 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) fio = lio = avl_first(&vq->vq_deadline_tree); t = fio->io_vdev_tree; - size = fio->io_size; flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT; + maxgap = (t == &vq->vq_read_tree) ? zfs_vdev_read_gap_limit : 0; if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) { /* @@ -191,22 +199,18 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) * scrub/resilver, can be preserved in the aggregate. */ while ((dio = AVL_PREV(t, fio)) != NULL && - IS_ADJACENT(dio, fio) && (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && - size + dio->io_size <= zfs_vdev_aggregation_limit) { + IO_SPAN(dio, lio) <= maxspan && IO_GAP(dio, fio) <= maxgap) fio = dio; - size += dio->io_size; - } + while ((dio = AVL_NEXT(t, lio)) != NULL && - IS_ADJACENT(lio, dio) && (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && - size + dio->io_size <= zfs_vdev_aggregation_limit) { + IO_SPAN(fio, dio) <= maxspan && IO_GAP(lio, dio) <= maxgap) lio = dio; - size += dio->io_size; - } } if (fio != lio) { + uint64_t size = IO_SPAN(fio, lio); ASSERT(size <= zfs_vdev_aggregation_limit); aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset, @@ -214,9 +218,10 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL); - /* We want to process lio, then stop */ - lio = AVL_NEXT(t, lio); - for (dio = fio; dio != lio; dio = nio) { + nio = fio; + do { + dio = nio; + nio = AVL_NEXT(t, dio); ASSERT(dio->io_type == aio->io_type); ASSERT(dio->io_vdev_tree == t); @@ -224,13 +229,12 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) bcopy(dio->io_data, (char *)aio->io_data + (dio->io_offset - aio->io_offset), dio->io_size); - nio = AVL_NEXT(t, dio); zio_add_child(dio, aio); vdev_queue_io_remove(vq, dio); zio_vdev_io_bypass(dio); zio_execute(dio); - } + } while (dio != lio); avl_add(&vq->vq_pending_tree, aio); diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index ad997f528..92753d871 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -697,7 +697,7 @@ vdev_raidz_io_start(zio_t *zio) continue; } if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || - (zio->io_flags & ZIO_FLAG_SCRUB)) { + (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_data, rc->rc_size, zio->io_type, zio->io_priority, 0, diff --git a/module/zfs/zap.c b/module/zfs/zap.c index ca859ec35..2dc2705b9 100644 --- a/module/zfs/zap.c +++ b/module/zfs/zap.c @@ -19,13 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - - /* * This file contains the top half of the zfs directory structure * implementation. The bottom half is in zap_leaf.c. @@ -45,6 +42,7 @@ #include <sys/dmu.h> #include <sys/zfs_context.h> #include <sys/zfs_znode.h> +#include <sys/fs/zfs.h> #include <sys/zap.h> #include <sys/refcount.h> #include <sys/zap_impl.h> @@ -1134,3 +1132,58 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs) } } } + +int +fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite, + uint64_t *tooverwrite) +{ + zap_t *zap = zn->zn_zap; + zap_leaf_t *l; + int err; + + /* + * Account for the header block of the fatzap. + */ + if (!add && dmu_buf_freeable(zap->zap_dbuf)) { + tooverwrite += zap->zap_dbuf->db_size; + } else { + towrite += zap->zap_dbuf->db_size; + } + + /* + * Account for the pointer table blocks. + * If we are adding we need to account for the following cases : + * - If the pointer table is embedded, this operation could force an + * external pointer table. + * - If this already has an external pointer table this operation + * could extend the table. + */ + if (add) { + if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0) + towrite += zap->zap_dbuf->db_size; + else + towrite += (zap->zap_dbuf->db_size * 3); + } + + /* + * Now, check if the block containing leaf is freeable + * and account accordingly. + */ + err = zap_deref_leaf(zap, zn->zn_hash, NULL, RW_READER, &l); + if (err != 0) { + return (err); + } + + if (!add && dmu_buf_freeable(l->l_dbuf)) { + tooverwrite += l->l_dbuf->db_size; + } else { + /* + * If this an add operation, the leaf block could split. + * Hence, we need to account for an additional leaf block. + */ + towrite += (add ? 2 : 1) * l->l_dbuf->db_size; + } + + zap_put_leaf(l); + return (0); +} diff --git a/module/zfs/zap_leaf.c b/module/zfs/zap_leaf.c index da498b6bc..9d8354e73 100644 --- a/module/zfs/zap_leaf.c +++ b/module/zfs/zap_leaf.c @@ -19,24 +19,23 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * The 512-byte leaf is broken into 32 16-byte chunks. * chunk number n means l_chunk[n], even though the header precedes it. * the names are stored null-terminated. */ +#include <sys/spa.h> +#include <sys/dmu.h> #include <sys/zfs_context.h> +#include <sys/fs/zfs.h> #include <sys/zap.h> #include <sys/zap_impl.h> #include <sys/zap_leaf.h> -#include <sys/spa.h> -#include <sys/dmu.h> static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry); diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index abba42775..fbc93b423 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/spa.h> #include <sys/dmu.h> #include <sys/zfs_context.h> @@ -78,8 +76,8 @@ zap_normalize(zap_t *zap, const char *name, char *namenorm) err = 0; (void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen, - zap->zap_normflags | U8_TEXTPREP_IGNORE_NULL, U8_UNICODE_LATEST, - &err); + zap->zap_normflags | U8_TEXTPREP_IGNORE_NULL | + U8_TEXTPREP_IGNORE_INVALID, U8_UNICODE_LATEST, &err); return (err); } @@ -1067,3 +1065,79 @@ zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) zap_unlockdir(zap); return (0); } + +int +zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add, + uint64_t *towrite, uint64_t *tooverwrite, uint64_t dn_datablkshift) +{ + zap_t *zap; + int err = 0; + + + /* + * Since, we don't have a name, we cannot figure out which blocks will + * be affected in this operation. So, account for the worst case : + * - 3 blocks overwritten: target leaf, ptrtbl block, header block + * - 4 new blocks written if adding: + * - 2 blocks for possibly split leaves, + * - 2 grown ptrtbl blocks + * + * This also accomodates the case where an add operation to a fairly + * large microzap results in a promotion to fatzap. + */ + if (name == NULL) { + *towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE; + return (err); + } + + /* + * We lock the zap with adding == FALSE. Because, if we pass + * the actual value of add, it could trigger a mzap_upgrade(). + * At present we are just evaluating the possibility of this operation + * and hence we donot want to trigger an upgrade. + */ + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); + if (err) + return (err); + + if (!zap->zap_ismicro) { + zap_name_t *zn = zap_name_alloc(zap, name, MT_EXACT); + if (zn) { + err = fzap_count_write(zn, add, towrite, + tooverwrite); + zap_name_free(zn); + } else { + /* + * We treat this case as similar to (name == NULL) + */ + *towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE; + } + } else { + if (!add) { + if (dmu_buf_freeable(zap->zap_dbuf)) + *tooverwrite += SPA_MAXBLOCKSIZE; + else + *towrite += SPA_MAXBLOCKSIZE; + } else { + /* + * We are here if we are adding and (name != NULL). + * It is hard to find out if this add will promote this + * microzap to fatzap. Hence, we assume the worst case + * and account for the blocks assuming this microzap + * would be promoted to a fatzap. + * + * 1 block overwritten : header block + * 4 new blocks written : 2 new split leaf, 2 grown + * ptrtbl blocks + */ + if (dmu_buf_freeable(zap->zap_dbuf)) + *tooverwrite += 1 << dn_datablkshift; + else + *towrite += 1 << dn_datablkshift; + *towrite += 4 << dn_datablkshift; + } + } + + zap_unlockdir(zap); + return (err); +} diff --git a/module/zfs/zfs_acl.c b/module/zfs/zfs_acl.c index fdf92a156..734bd8395 100644 --- a/module/zfs/zfs_acl.c +++ b/module/zfs/zfs_acl.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -65,15 +65,16 @@ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) #define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) -#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS) #define ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \ ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \ ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \ ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE) -#define WRITE_MASK (WRITE_MASK_DATA|ACE_WRITE_ATTRIBUTES|ACE_WRITE_ACL|\ - ACE_WRITE_OWNER|ACE_DELETE|ACE_DELETE_CHILD) +#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS) +#define WRITE_MASK_ATTRS (ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| \ + ACE_DELETE|ACE_DELETE_CHILD) +#define WRITE_MASK (WRITE_MASK_DATA|WRITE_MASK_ATTRS) #define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) @@ -538,8 +539,9 @@ zfs_acl_curr_node(zfs_acl_t *aclp) * ACE FUIDs will be created later. */ int -zfs_copy_ace_2_fuid(vtype_t obj_type, zfs_acl_t *aclp, void *datap, - zfs_ace_t *z_acl, int aclcnt, size_t *size) +zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, vtype_t obj_type, zfs_acl_t *aclp, + void *datap, zfs_ace_t *z_acl, int aclcnt, size_t *size, + zfs_fuid_info_t **fuidp, cred_t *cr) { int i; uint16_t entry_type; @@ -555,9 +557,9 @@ zfs_copy_ace_2_fuid(vtype_t obj_type, zfs_acl_t *aclp, void *datap, entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS; if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP && entry_type != ACE_EVERYONE) { - if (!aclp->z_has_fuids) - aclp->z_has_fuids = IS_EPHEMERAL(acep->a_who); - aceptr->z_fuid = (uint64_t)acep->a_who; + aceptr->z_fuid = zfs_fuid_create(zfsvfs, acep->a_who, + cr, (entry_type == 0) ? + ZFS_ACE_USER : ZFS_ACE_GROUP, fuidp); } /* @@ -682,7 +684,7 @@ zfs_copy_ace_2_oldace(vtype_t obj_type, zfs_acl_t *aclp, ace_t *acep, * convert old ACL format to new */ void -zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp) +zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp, cred_t *cr) { zfs_oldace_t *oldaclp; int i; @@ -714,9 +716,9 @@ zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp) newaclnode = zfs_acl_node_alloc(aclp->z_acl_count * sizeof (zfs_object_ace_t)); aclp->z_ops = zfs_acl_fuid_ops; - VERIFY(zfs_copy_ace_2_fuid(ZTOV(zp)->v_type, aclp, oldaclp, - newaclnode->z_acldata, aclp->z_acl_count, - &newaclnode->z_size) == 0); + VERIFY(zfs_copy_ace_2_fuid(zp->z_zfsvfs, ZTOV(zp)->v_type, aclp, + oldaclp, newaclnode->z_acldata, aclp->z_acl_count, + &newaclnode->z_size, NULL, cr) == 0); newaclnode->z_ace_count = aclp->z_acl_count; aclp->z_version = ZFS_ACL_VERSION; kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t)); @@ -770,8 +772,7 @@ zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask, * Also, create FUIDs for any User/Group ACEs */ static uint64_t -zfs_mode_fuid_compute(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, - zfs_fuid_info_t **fuidp, dmu_tx_t *tx) +zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp) { int entry_type; mode_t mode; @@ -905,15 +906,6 @@ zfs_mode_fuid_compute(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, } } } - /* - * Now handle FUID create for user/group ACEs - */ - if (entry_type == 0 || entry_type == ACE_IDENTIFIER_GROUP) { - aclp->z_ops.ace_who_set(acep, - zfs_fuid_create(zp->z_zfsvfs, who, cr, - (entry_type == 0) ? ZFS_ACE_USER : ZFS_ACE_GROUP, - tx, fuidp)); - } } return (mode); } @@ -989,7 +981,7 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify) aclnode = zfs_acl_node_alloc(aclsize); list_insert_head(&aclp->z_acl, aclnode); error = dmu_read(zp->z_zfsvfs->z_os, extacl, 0, - aclsize, aclnode->z_acldata); + aclsize, aclnode->z_acldata, DMU_READ_PREFETCH); aclnode->z_ace_count = acl_count; aclp->z_acl_count = acl_count; aclp->z_acl_bytes = aclsize; @@ -1014,8 +1006,7 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify) * already checked the acl and knows whether to inherit. */ int -zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, - zfs_fuid_info_t **fuidp, dmu_tx_t *tx) +zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx) { int error; znode_phys_t *zphys = zp->z_phys; @@ -1026,12 +1017,9 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_object_type_t otype; zfs_acl_node_t *aclnode; - ASSERT(MUTEX_HELD(&zp->z_lock)); - ASSERT(MUTEX_HELD(&zp->z_acl_lock)); - dmu_buf_will_dirty(zp->z_dbuf, tx); - zphys->zp_mode = zfs_mode_fuid_compute(zp, aclp, cr, fuidp, tx); + zphys->zp_mode = zfs_mode_compute(zp, aclp); /* * Decide which opbject type to use. If we are forced to @@ -1043,7 +1031,7 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, } else { if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) && (zfsvfs->z_version >= ZPL_VERSION_FUID)) - zfs_acl_xform(zp, aclp); + zfs_acl_xform(zp, aclp, cr); ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID); otype = DMU_OT_ACL; } @@ -1125,7 +1113,6 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0) zp->z_phys->zp_flags |= ZFS_ACL_TRIVIAL; - zfs_time_stamper_locked(zp, STATE_CHANGED, tx); return (0); } @@ -1336,7 +1323,7 @@ zfs_acl_ace_insert(zfs_acl_t *aclp, void *acep) * Prepend deny ACE */ static void * -zfs_acl_prepend_deny(znode_t *zp, zfs_acl_t *aclp, void *acep, +zfs_acl_prepend_deny(uint64_t uid, zfs_acl_t *aclp, void *acep, mode_t mode) { zfs_acl_node_t *aclnode; @@ -1349,7 +1336,7 @@ zfs_acl_prepend_deny(znode_t *zp, zfs_acl_t *aclp, void *acep, fuid = aclp->z_ops.ace_who_get(acep); flags = aclp->z_ops.ace_flags_get(acep); zfs_set_ace(aclp, newacep, 0, DENY, fuid, (flags & ACE_TYPE_FLAGS)); - zfs_acl_prepend_fixup(aclp, newacep, acep, mode, zp->z_phys->zp_uid); + zfs_acl_prepend_fixup(aclp, newacep, acep, mode, uid); return (newacep); } @@ -1473,9 +1460,9 @@ zfs_fixup_group_entries(zfs_acl_t *aclp, void *acep, void *prevacep, * in PSARC/2002/240 */ static void -zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp) +zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t uid, + uint64_t mode, zfs_acl_t *aclp) { - zfsvfs_t *zfsvfs = zp->z_zfsvfs; void *acep = NULL, *prevacep = NULL; uint64_t who; int i; @@ -1485,11 +1472,6 @@ zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp) uint16_t iflags, type; uint32_t access_mask; - ASSERT(MUTEX_HELD(&zp->z_acl_lock)); - ASSERT(MUTEX_HELD(&zp->z_lock)); - - aclp->z_hints = (zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS); - /* * If discard then just discard all ACL nodes which * represent the ACEs. @@ -1554,17 +1536,15 @@ zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp) if (!reuse_deny) { prevacep = - zfs_acl_prepend_deny(zp, + zfs_acl_prepend_deny(uid, aclp, acep, mode); } else { zfs_acl_prepend_fixup( aclp, prevacep, - acep, mode, - zp->z_phys->zp_uid); + acep, mode, uid); } zfs_fixup_group_entries(aclp, acep, prevacep, mode); - } } } @@ -1623,8 +1603,10 @@ zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode) mutex_enter(&zp->z_acl_lock); *aclp = NULL; error = zfs_acl_node_read(zp, aclp, B_TRUE); - if (error == 0) - zfs_acl_chmod(zp, mode, *aclp); + if (error == 0) { + (*aclp)->z_hints = zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS; + zfs_acl_chmod(zp->z_zfsvfs, zp->z_phys->zp_uid, mode, *aclp); + } mutex_exit(&zp->z_acl_lock); mutex_exit(&zp->z_lock); return (error); @@ -1649,9 +1631,8 @@ zfs_restricted_update(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, void *acep) * Should ACE be inherited? */ static int -zfs_ace_can_use(znode_t *zp, uint16_t acep_flags) +zfs_ace_can_use(vtype_t vtype, uint16_t acep_flags) { - int vtype = ZTOV(zp)->v_type; int iflags = (acep_flags & 0xf); if ((vtype == VDIR) && (iflags & ACE_DIRECTORY_INHERIT_ACE)) @@ -1666,10 +1647,9 @@ zfs_ace_can_use(znode_t *zp, uint16_t acep_flags) * inherit inheritable ACEs from parent */ static zfs_acl_t * -zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp, uint64_t mode, - boolean_t *need_chmod) +zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp, + uint64_t mode, boolean_t *need_chmod) { - zfsvfs_t *zfsvfs = zp->z_zfsvfs; void *pacep; void *acep, *acep2; zfs_acl_node_t *aclnode, *aclnode2; @@ -1680,8 +1660,8 @@ zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp, uint64_t mode, size_t ace_size; void *data1, *data2; size_t data1sz, data2sz; - boolean_t vdir = ZTOV(zp)->v_type == VDIR; - boolean_t vreg = ZTOV(zp)->v_type == VREG; + boolean_t vdir = vtype == VDIR; + boolean_t vreg = vtype == VREG; boolean_t passthrough, passthrough_x, noallow; passthrough_x = @@ -1710,7 +1690,7 @@ zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp, uint64_t mode, ace_size = aclp->z_ops.ace_size(pacep); - if (!zfs_ace_can_use(zp, iflags)) + if (!zfs_ace_can_use(vtype, iflags)) continue; /* @@ -1806,55 +1786,58 @@ zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp, uint64_t mode, * Create file system object initial permissions * including inheritable ACEs. */ -void -zfs_perm_init(znode_t *zp, znode_t *parent, int flag, - vattr_t *vap, dmu_tx_t *tx, cred_t *cr, - zfs_acl_t *setaclp, zfs_fuid_info_t **fuidp) +int +zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, + vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids) { - uint64_t mode, fuid, fgid; int error; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - zfs_acl_t *aclp = NULL; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zfs_acl_t *paclp; - xvattr_t *xvap = (xvattr_t *)vap; gid_t gid; boolean_t need_chmod = B_TRUE; - if (setaclp) - aclp = setaclp; + bzero(acl_ids, sizeof (zfs_acl_ids_t)); + acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode); - mode = MAKEIMODE(vap->va_type, vap->va_mode); + if (vsecp) + if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, cr, + &acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0) + return (error); /* * Determine uid and gid. */ if ((flag & (IS_ROOT_NODE | IS_REPLAY)) || ((flag & IS_XATTR) && (vap->va_type == VDIR))) { - fuid = zfs_fuid_create(zfsvfs, vap->va_uid, cr, - ZFS_OWNER, tx, fuidp); - fgid = zfs_fuid_create(zfsvfs, vap->va_gid, cr, - ZFS_GROUP, tx, fuidp); + acl_ids->z_fuid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_uid, cr, + ZFS_OWNER, &acl_ids->z_fuidp); + acl_ids->z_fgid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_gid, cr, + ZFS_GROUP, &acl_ids->z_fuidp); gid = vap->va_gid; } else { - fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER, tx, cr, fuidp); - fgid = 0; + acl_ids->z_fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER, + cr, &acl_ids->z_fuidp); + acl_ids->z_fgid = 0; if (vap->va_mask & AT_GID) { - fgid = zfs_fuid_create(zfsvfs, vap->va_gid, cr, - ZFS_GROUP, tx, fuidp); + acl_ids->z_fgid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_gid, + cr, ZFS_GROUP, &acl_ids->z_fuidp); gid = vap->va_gid; - if (fgid != parent->z_phys->zp_gid && + if (acl_ids->z_fgid != dzp->z_phys->zp_gid && !groupmember(vap->va_gid, cr) && secpolicy_vnode_create_gid(cr) != 0) - fgid = 0; + acl_ids->z_fgid = 0; } - if (fgid == 0) { - if (parent->z_phys->zp_mode & S_ISGID) { - fgid = parent->z_phys->zp_gid; - gid = zfs_fuid_map_id(zfsvfs, fgid, + if (acl_ids->z_fgid == 0) { + if (dzp->z_phys->zp_mode & S_ISGID) { + acl_ids->z_fgid = dzp->z_phys->zp_gid; + gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid, cr, ZFS_GROUP); } else { - fgid = zfs_fuid_create_cred(zfsvfs, - ZFS_GROUP, tx, cr, fuidp); + acl_ids->z_fgid = zfs_fuid_create_cred(zfsvfs, + ZFS_GROUP, cr, &acl_ids->z_fuidp); gid = crgetgid(cr); } } @@ -1867,57 +1850,61 @@ zfs_perm_init(znode_t *zp, znode_t *parent, int flag, * file's new group, clear the file's set-GID bit. */ - if ((parent->z_phys->zp_mode & S_ISGID) && (vap->va_type == VDIR)) { - mode |= S_ISGID; + if (!(flag & IS_ROOT_NODE) && (dzp->z_phys->zp_mode & S_ISGID) && + (vap->va_type == VDIR)) { + acl_ids->z_mode |= S_ISGID; } else { - if ((mode & S_ISGID) && + if ((acl_ids->z_mode & S_ISGID) && secpolicy_vnode_setids_setgids(cr, gid) != 0) - mode &= ~S_ISGID; - } - - zp->z_phys->zp_uid = fuid; - zp->z_phys->zp_gid = fgid; - zp->z_phys->zp_mode = mode; - - if (aclp == NULL) { - mutex_enter(&parent->z_lock); - if ((ZTOV(parent)->v_type == VDIR && - (parent->z_phys->zp_flags & ZFS_INHERIT_ACE)) && - !(zp->z_phys->zp_flags & ZFS_XATTR)) { - mutex_enter(&parent->z_acl_lock); - VERIFY(0 == zfs_acl_node_read(parent, &paclp, B_FALSE)); - mutex_exit(&parent->z_acl_lock); - aclp = zfs_acl_inherit(zp, paclp, mode, &need_chmod); + acl_ids->z_mode &= ~S_ISGID; + } + + if (acl_ids->z_aclp == NULL) { + mutex_enter(&dzp->z_lock); + if (!(flag & IS_ROOT_NODE) && (ZTOV(dzp)->v_type == VDIR && + (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)) && + !(dzp->z_phys->zp_flags & ZFS_XATTR)) { + mutex_enter(&dzp->z_acl_lock); + VERIFY(0 == zfs_acl_node_read(dzp, &paclp, B_FALSE)); + mutex_exit(&dzp->z_acl_lock); + acl_ids->z_aclp = zfs_acl_inherit(zfsvfs, + vap->va_type, paclp, acl_ids->z_mode, &need_chmod); zfs_acl_free(paclp); } else { - aclp = zfs_acl_alloc(zfs_acl_version_zp(zp)); + acl_ids->z_aclp = + zfs_acl_alloc(zfs_acl_version_zp(dzp)); + } + mutex_exit(&dzp->z_lock); + if (need_chmod) { + acl_ids->z_aclp->z_hints = (vap->va_type == VDIR) ? + ZFS_ACL_AUTO_INHERIT : 0; + zfs_acl_chmod(zfsvfs, acl_ids->z_fuid, + acl_ids->z_mode, acl_ids->z_aclp); } - mutex_exit(&parent->z_lock); - mutex_enter(&zp->z_lock); - mutex_enter(&zp->z_acl_lock); - if (need_chmod) - zfs_acl_chmod(zp, mode, aclp); - } else { - mutex_enter(&zp->z_lock); - mutex_enter(&zp->z_acl_lock); } - /* Force auto_inherit on all new directory objects */ - if (vap->va_type == VDIR) - aclp->z_hints |= ZFS_ACL_AUTO_INHERIT; - - error = zfs_aclset_common(zp, aclp, cr, fuidp, tx); - - /* Set optional attributes if any */ - if (vap->va_mask & AT_XVATTR) - zfs_xvattr_set(zp, xvap); + return (0); +} - mutex_exit(&zp->z_lock); - mutex_exit(&zp->z_acl_lock); - ASSERT3U(error, ==, 0); +/* + * Free ACL and fuid_infop, but not the acl_ids structure + */ +void +zfs_acl_ids_free(zfs_acl_ids_t *acl_ids) +{ + if (acl_ids->z_aclp) + zfs_acl_free(acl_ids->z_aclp); + if (acl_ids->z_fuidp) + zfs_fuid_info_free(acl_ids->z_fuidp); + acl_ids->z_aclp = NULL; + acl_ids->z_fuidp = NULL; +} - if (aclp != setaclp) - zfs_acl_free(aclp); +boolean_t +zfs_acl_ids_overquota(zfsvfs_t *zfsvfs, zfs_acl_ids_t *acl_ids) +{ + return (zfs_usergroup_overquota(zfsvfs, B_FALSE, acl_ids->z_fuid) || + zfs_usergroup_overquota(zfsvfs, B_TRUE, acl_ids->z_fgid)); } /* @@ -2018,7 +2005,7 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) int zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, vtype_t obj_type, - vsecattr_t *vsecp, zfs_acl_t **zaclp) + vsecattr_t *vsecp, cred_t *cr, zfs_fuid_info_t **fuidp, zfs_acl_t **zaclp) { zfs_acl_t *aclp; zfs_acl_node_t *aclnode; @@ -2041,9 +2028,9 @@ zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, vtype_t obj_type, return (error); } } else { - if ((error = zfs_copy_ace_2_fuid(obj_type, aclp, + if ((error = zfs_copy_ace_2_fuid(zfsvfs, obj_type, aclp, vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt, - &aclnode->z_size)) != 0) { + &aclnode->z_size, fuidp, cr)) != 0) { zfs_acl_free(aclp); zfs_acl_node_free(aclnode); return (error); @@ -2084,6 +2071,7 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) int error; zfs_acl_t *aclp; zfs_fuid_info_t *fuidp = NULL; + boolean_t fuid_dirtied; if (mask == 0) return (ENOSYS); @@ -2094,7 +2082,8 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) if (error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)) return (error); - error = zfs_vsec_2_aclp(zfsvfs, ZTOV(zp)->v_type, vsecp, &aclp); + error = zfs_vsec_2_aclp(zfsvfs, ZTOV(zp)->v_type, vsecp, cr, &fuidp, + &aclp); if (error) return (error); @@ -2135,18 +2124,9 @@ top: } else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } - if (aclp->z_has_fuids) { - if (zfsvfs->z_fuid_obj == 0) { - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); - } else { - dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); - dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - } - } + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { @@ -2163,9 +2143,13 @@ top: return (error); } - error = zfs_aclset_common(zp, aclp, cr, &fuidp, tx); + error = zfs_aclset_common(zp, aclp, cr, tx); ASSERT(error == 0); + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + zfs_time_stamper_locked(zp, STATE_CHANGED, tx); zfs_log_acl(zilog, tx, zp, vsecp, fuidp); if (fuidp) @@ -2180,45 +2164,17 @@ done: } /* - * working_mode returns the permissions that were not granted + * Check accesses of interest (AoI) against attributes of the dataset + * such as read-only. Returns zero if no AoI conflict with dataset + * attributes, otherwise an appropriate errno is returned. */ static int -zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, - boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr) +zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode) { - zfs_acl_t *aclp; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int error; - uid_t uid = crgetuid(cr); - uint64_t who; - uint16_t type, iflags; - uint16_t entry_type; - uint32_t access_mask; - uint32_t deny_mask = 0; - zfs_ace_hdr_t *acep = NULL; - boolean_t checkit; - uid_t fowner; - uid_t gowner; - - /* - * Short circuit empty requests - */ - if (v4_mode == 0) - return (0); - - *check_privs = B_TRUE; - - if (zfsvfs->z_replay) { - *working_mode = 0; - return (0); - } - - *working_mode = v4_mode; - if ((v4_mode & WRITE_MASK) && (zp->z_zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) && - (!IS_DEVVP(ZTOV(zp)))) { - *check_privs = B_FALSE; + (!IS_DEVVP(ZTOV(zp)) || + (IS_DEVVP(ZTOV(zp)) && (v4_mode & WRITE_MASK_ATTRS)))) { return (EROFS); } @@ -2230,31 +2186,64 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, (zp->z_phys->zp_flags & (ZFS_READONLY | ZFS_IMMUTABLE))) || (ZTOV(zp)->v_type == VDIR && (zp->z_phys->zp_flags & ZFS_IMMUTABLE)))) { - *check_privs = B_FALSE; return (EPERM); } if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) && (zp->z_phys->zp_flags & ZFS_NOUNLINK)) { - *check_privs = B_FALSE; return (EPERM); } if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) && (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED))) { - *check_privs = B_FALSE; return (EACCES); } - /* - * The caller requested that the ACL check be skipped. This - * would only happen if the caller checked VOP_ACCESS() with a - * 32 bit ACE mask and already had the appropriate permissions. - */ - if (skipaclchk) { - *working_mode = 0; - return (0); - } + return (0); +} + +/* + * The primary usage of this function is to loop through all of the + * ACEs in the znode, determining what accesses of interest (AoI) to + * the caller are allowed or denied. The AoI are expressed as bits in + * the working_mode parameter. As each ACE is processed, bits covered + * by that ACE are removed from the working_mode. This removal + * facilitates two things. The first is that when the working mode is + * empty (= 0), we know we've looked at all the AoI. The second is + * that the ACE interpretation rules don't allow a later ACE to undo + * something granted or denied by an earlier ACE. Removing the + * discovered access or denial enforces this rule. At the end of + * processing the ACEs, all AoI that were found to be denied are + * placed into the working_mode, giving the caller a mask of denied + * accesses. Returns: + * 0 if all AoI granted + * EACCESS if the denied mask is non-zero + * other error if abnormal failure (e.g., IO error) + * + * A secondary usage of the function is to determine if any of the + * AoI are granted. If an ACE grants any access in + * the working_mode, we immediately short circuit out of the function. + * This mode is chosen by setting anyaccess to B_TRUE. The + * working_mode is not a denied access mask upon exit if the function + * is used in this manner. + */ +static int +zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, + boolean_t anyaccess, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zfs_acl_t *aclp; + int error; + uid_t uid = crgetuid(cr); + uint64_t who; + uint16_t type, iflags; + uint16_t entry_type; + uint32_t access_mask; + uint32_t deny_mask = 0; + zfs_ace_hdr_t *acep = NULL; + boolean_t checkit; + uid_t fowner; + uid_t gowner; zfs_fuid_map_ids(zp, cr, &fowner, &gowner); @@ -2268,6 +2257,7 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, &iflags, &type)) { + uint32_t mask_matched; if (!zfs_acl_valid_ace_type(type, iflags)) continue; @@ -2275,6 +2265,11 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, if (ZTOV(zp)->v_type == VDIR && (iflags & ACE_INHERIT_ONLY_ACE)) continue; + /* Skip ACE if it does not affect any AoI */ + mask_matched = (access_mask & *working_mode); + if (!mask_matched) + continue; + entry_type = (iflags & ACE_TYPE_FLAGS); checkit = B_FALSE; @@ -2313,14 +2308,24 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, } if (checkit) { - uint32_t mask_matched = (access_mask & *working_mode); - - if (mask_matched) { - if (type == DENY) - deny_mask |= mask_matched; - - *working_mode &= ~mask_matched; + if (type == DENY) { + DTRACE_PROBE3(zfs__ace__denies, + znode_t *, zp, + zfs_ace_hdr_t *, acep, + uint32_t, mask_matched); + deny_mask |= mask_matched; + } else { + DTRACE_PROBE3(zfs__ace__allows, + znode_t *, zp, + zfs_ace_hdr_t *, acep, + uint32_t, mask_matched); + if (anyaccess) { + mutex_exit(&zp->z_acl_lock); + zfs_acl_free(aclp); + return (0); + } } + *working_mode &= ~mask_matched; } /* Are we done? */ @@ -2342,6 +2347,69 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, return (0); } +/* + * Return true if any access whatsoever granted, we don't actually + * care what access is granted. + */ +boolean_t +zfs_has_access(znode_t *zp, cred_t *cr) +{ + uint32_t have = ACE_ALL_PERMS; + + if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) { + uid_t owner; + + owner = zfs_fuid_map_id(zp->z_zfsvfs, + zp->z_phys->zp_uid, cr, ZFS_OWNER); + + return ( + secpolicy_vnode_access(cr, ZTOV(zp), owner, VREAD) == 0 || + secpolicy_vnode_access(cr, ZTOV(zp), owner, VWRITE) == 0 || + secpolicy_vnode_access(cr, ZTOV(zp), owner, VEXEC) == 0 || + secpolicy_vnode_chown(cr, B_TRUE) == 0 || + secpolicy_vnode_chown(cr, B_FALSE) == 0 || + secpolicy_vnode_setdac(cr, owner) == 0 || + secpolicy_vnode_remove(cr) == 0); + } + return (B_TRUE); +} + +static int +zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, + boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int err; + + *working_mode = v4_mode; + *check_privs = B_TRUE; + + /* + * Short circuit empty requests + */ + if (v4_mode == 0 || zfsvfs->z_replay) { + *working_mode = 0; + return (0); + } + + if ((err = zfs_zaccess_dataset_check(zp, v4_mode)) != 0) { + *check_privs = B_FALSE; + return (err); + } + + /* + * The caller requested that the ACL check be skipped. This + * would only happen if the caller checked VOP_ACCESS() with a + * 32 bit ACE mask and already had the appropriate permissions. + */ + if (skipaclchk) { + *working_mode = 0; + return (0); + } + + return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr)); +} + static int zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs, cred_t *cr) diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c index 0da026100..27c2c51a3 100644 --- a/module/zfs/zfs_ctldir.c +++ b/module/zfs/zfs_ctldir.c @@ -114,12 +114,16 @@ snapentry_compare(const void *a, const void *b) vnodeops_t *zfsctl_ops_root; vnodeops_t *zfsctl_ops_snapdir; vnodeops_t *zfsctl_ops_snapshot; +vnodeops_t *zfsctl_ops_shares; +vnodeops_t *zfsctl_ops_shares_dir; static const fs_operation_def_t zfsctl_tops_root[]; static const fs_operation_def_t zfsctl_tops_snapdir[]; static const fs_operation_def_t zfsctl_tops_snapshot[]; +static const fs_operation_def_t zfsctl_tops_shares[]; static vnode_t *zfsctl_mknode_snapdir(vnode_t *); +static vnode_t *zfsctl_mknode_shares(vnode_t *); static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset); static int zfsctl_unmount_snap(zfs_snapentry_t *, int, cred_t *); @@ -127,14 +131,18 @@ static gfs_opsvec_t zfsctl_opsvec[] = { { ".zfs", zfsctl_tops_root, &zfsctl_ops_root }, { ".zfs/snapshot", zfsctl_tops_snapdir, &zfsctl_ops_snapdir }, { ".zfs/snapshot/vnode", zfsctl_tops_snapshot, &zfsctl_ops_snapshot }, + { ".zfs/shares", zfsctl_tops_shares, &zfsctl_ops_shares_dir }, + { ".zfs/shares/vnode", zfsctl_tops_shares, &zfsctl_ops_shares }, { NULL } }; /* - * Root directory elements. We have only a single static entry, 'snapshot'. + * Root directory elements. We only have two entries + * snapshot and shares. */ static gfs_dirent_t zfsctl_root_entries[] = { { "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE }, + { "shares", zfsctl_mknode_shares, GFS_CACHE_VNODE }, { NULL } }; @@ -166,21 +174,34 @@ zfsctl_fini(void) vn_freevnodeops(zfsctl_ops_snapdir); if (zfsctl_ops_snapshot) vn_freevnodeops(zfsctl_ops_snapshot); + if (zfsctl_ops_shares) + vn_freevnodeops(zfsctl_ops_shares); + if (zfsctl_ops_shares_dir) + vn_freevnodeops(zfsctl_ops_shares_dir); zfsctl_ops_root = NULL; zfsctl_ops_snapdir = NULL; zfsctl_ops_snapshot = NULL; + zfsctl_ops_shares = NULL; + zfsctl_ops_shares_dir = NULL; } /* - * Return the inode number associated with the 'snapshot' directory. + * Return the inode number associated with the 'snapshot' or + * 'shares' directory. */ /* ARGSUSED */ static ino64_t zfsctl_root_inode_cb(vnode_t *vp, int index) { - ASSERT(index == 0); - return (ZFSCTL_INO_SNAPDIR); + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + + ASSERT(index <= 2); + + if (index == 0) + return (ZFSCTL_INO_SNAPDIR); + + return (zfsvfs->z_shares_dir); } /* @@ -348,6 +369,30 @@ zfsctl_common_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) return (0); } + +/*ARGSUSED*/ +static int +zfsctl_shares_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) +{ + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + znode_t *dzp; + int error; + + ZFS_ENTER(zfsvfs); + + if (zfsvfs->z_shares_dir == 0) { + ZFS_EXIT(zfsvfs); + return (ENOTSUP); + } + + if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) { + error = VOP_FID(ZTOV(dzp), fidp, ct); + VN_RELE(ZTOV(dzp)); + } + + ZFS_EXIT(zfsvfs); + return (error); +} /* * .zfs inode namespace * @@ -478,7 +523,7 @@ zfsctl_unmount_snap(zfs_snapentry_t *sep, int fflags, cred_t *cr) VN_RELE(svp); return (error); } - VFS_RELE(svp->v_vfsp); + /* * We can't use VN_RELE(), as that will try to invoke * zfsctl_snapdir_inactive(), which would cause us to destroy @@ -691,7 +736,7 @@ zfsctl_snapdir_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, return (err); if (err == 0) { - err = dmu_objset_snapshot(name, dirname, B_FALSE); + err = dmu_objset_snapshot(name, dirname, NULL, B_FALSE); if (err) return (err); err = lookupnameat(dirname, seg, follow, NULL, vpp, dvp); @@ -732,9 +777,6 @@ zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, ASSERT(dvp->v_type == VDIR); - if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) - return (0); - /* * If we get a recursive call, that means we got called * from the domount() code while it was trying to look up the @@ -746,6 +788,11 @@ zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, ZFS_ENTER(zfsvfs); + if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) { + ZFS_EXIT(zfsvfs); + return (0); + } + if (flags & FIGNORECASE) { boolean_t conflict = B_FALSE; @@ -844,7 +891,7 @@ domount: * Return the mounted root rather than the covered mount point. * Takes the GFS vnode at .zfs/snapshot/<snapname> and returns * the ZFS vnode mounted on top of the GFS node. This ZFS - * vnode is the root the newly created vfsp. + * vnode is the root of the newly created vfsp. */ VFS_RELE(vfsp); err = traverse(vpp); @@ -879,6 +926,37 @@ domount: /* ARGSUSED */ static int +zfsctl_shares_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; + znode_t *dzp; + int error; + + ZFS_ENTER(zfsvfs); + + if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) { + ZFS_EXIT(zfsvfs); + return (0); + } + + if (zfsvfs->z_shares_dir == 0) { + ZFS_EXIT(zfsvfs); + return (ENOTSUP); + } + if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) + error = VOP_LOOKUP(ZTOV(dzp), nm, vpp, pnp, + flags, rdir, cr, ct, direntflags, realpnp); + + VN_RELE(ZTOV(dzp)); + ZFS_EXIT(zfsvfs); + + return (error); +} + +/* ARGSUSED */ +static int zfsctl_snapdir_readdir_cb(vnode_t *vp, void *dp, int *eofp, offset_t *offp, offset_t *nextp, void *data, int flags) { @@ -921,6 +999,33 @@ zfsctl_snapdir_readdir_cb(vnode_t *vp, void *dp, int *eofp, return (0); } +/* ARGSUSED */ +static int +zfsctl_shares_readdir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp, + caller_context_t *ct, int flags) +{ + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + znode_t *dzp; + int error; + + ZFS_ENTER(zfsvfs); + + if (zfsvfs->z_shares_dir == 0) { + ZFS_EXIT(zfsvfs); + return (ENOTSUP); + } + if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) { + error = VOP_READDIR(ZTOV(dzp), uiop, cr, eofp, ct, flags); + VN_RELE(ZTOV(dzp)); + } else { + *eofp = 1; + error = ENOENT; + } + + ZFS_EXIT(zfsvfs); + return (error); +} + /* * pvp is the '.zfs' directory (zfsctl_node_t). * Creates vp, which is '.zfs/snapshot' (zfsctl_snapdir_t). @@ -946,6 +1051,45 @@ zfsctl_mknode_snapdir(vnode_t *pvp) return (vp); } +vnode_t * +zfsctl_mknode_shares(vnode_t *pvp) +{ + vnode_t *vp; + zfsctl_node_t *sdp; + + vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp, + zfsctl_ops_shares, NULL, NULL, MAXNAMELEN, + NULL, NULL); + sdp = vp->v_data; + sdp->zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime; + return (vp); + +} + +/* ARGSUSED */ +static int +zfsctl_shares_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + znode_t *dzp; + int error; + + ZFS_ENTER(zfsvfs); + if (zfsvfs->z_shares_dir == 0) { + ZFS_EXIT(zfsvfs); + return (ENOTSUP); + } + if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) { + error = VOP_GETATTR(ZTOV(dzp), vap, flags, cr, ct); + VN_RELE(ZTOV(dzp)); + } + ZFS_EXIT(zfsvfs); + return (error); + + +} + /* ARGSUSED */ static int zfsctl_snapdir_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, @@ -996,6 +1140,20 @@ static const fs_operation_def_t zfsctl_tops_snapdir[] = { { NULL } }; +static const fs_operation_def_t zfsctl_tops_shares[] = { + { VOPNAME_OPEN, { .vop_open = zfsctl_common_open } }, + { VOPNAME_CLOSE, { .vop_close = zfsctl_common_close } }, + { VOPNAME_IOCTL, { .error = fs_inval } }, + { VOPNAME_GETATTR, { .vop_getattr = zfsctl_shares_getattr } }, + { VOPNAME_ACCESS, { .vop_access = zfsctl_common_access } }, + { VOPNAME_READDIR, { .vop_readdir = zfsctl_shares_readdir } }, + { VOPNAME_LOOKUP, { .vop_lookup = zfsctl_shares_lookup } }, + { VOPNAME_SEEK, { .vop_seek = fs_seek } }, + { VOPNAME_INACTIVE, { .vop_inactive = gfs_vop_inactive } }, + { VOPNAME_FID, { .vop_fid = zfsctl_shares_fid } }, + { NULL } +}; + /* * pvp is the GFS vnode '.zfs/snapshot'. * @@ -1013,7 +1171,6 @@ zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset) zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL); zcp = vp->v_data; zcp->zc_id = objset; - VFS_HOLD(vp->v_vfsp); return (vp); } @@ -1052,7 +1209,6 @@ zfsctl_snapshot_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) mutex_exit(&sdp->sd_lock); VN_RELE(dvp); - VFS_RELE(vp->v_vfsp); /* * Dispose of the vnode for the snapshot mount point. diff --git a/module/zfs/zfs_dir.c b/module/zfs/zfs_dir.c index 9353d0199..b3f768328 100644 --- a/module/zfs/zfs_dir.c +++ b/module/zfs/zfs_dir.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -805,44 +805,49 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr) znode_t *xzp; dmu_tx_t *tx; int error; - zfs_fuid_info_t *fuidp = NULL; + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; *xvpp = NULL; if (error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr)) return (error); + if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL, + &acl_ids)) != 0) + return (error); + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { + zfs_acl_ids_free(&acl_ids); + return (EDQUOT); + } + tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, zp->z_id); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); - if (IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))) { - if (zfsvfs->z_fuid_obj == 0) { - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); - } else { - dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); - dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - } - } + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { + zfs_acl_ids_free(&acl_ids); if (error == ERESTART) dmu_tx_wait(tx); dmu_tx_abort(tx); return (error); } - zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, 0, NULL, &fuidp); + zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, 0, &acl_ids); + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + ASSERT(xzp->z_phys->zp_parent == zp->z_id); dmu_buf_will_dirty(zp->z_dbuf, tx); zp->z_phys->zp_xattr = xzp->z_id; (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, - xzp, "", NULL, fuidp, vap); - if (fuidp) - zfs_fuid_info_free(fuidp); + xzp, "", NULL, acl_ids.z_fuidp, vap); + + zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); *xvpp = ZTOV(xzp); diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c index 236d69e7e..8b7785fa8 100644 --- a/module/zfs/zfs_fm.c +++ b/module/zfs/zfs_fm.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -96,7 +96,6 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, nvlist_t *ereport, *detector; uint64_t ena; char class[64]; - int state; /* * If we are doing a spa_tryimport(), ignore errors. @@ -130,15 +129,39 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, return; /* - * If the vdev has already been marked as failing due to a - * failed probe, then ignore any subsequent I/O errors, as the - * DE will automatically fault the vdev on the first such - * failure. + * If this I/O is not a retry I/O, don't post an ereport. + * Otherwise, we risk making bad diagnoses based on B_FAILFAST + * I/Os. */ - if (vd != NULL && - (!vdev_readable(vd) || !vdev_writeable(vd)) && - strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) != 0) + if (zio->io_error == EIO && + !(zio->io_flags & ZIO_FLAG_IO_RETRY)) return; + + if (vd != NULL) { + /* + * If the vdev has already been marked as failing due + * to a failed probe, then ignore any subsequent I/O + * errors, as the DE will automatically fault the vdev + * on the first such failure. This also catches cases + * where vdev_remove_wanted is set and the device has + * not yet been asynchronously placed into the REMOVED + * state. + */ + if (zio->io_vd == vd && + !vdev_accessible(vd, zio) && + strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) != 0) + return; + + /* + * Ignore checksum errors for reads from DTL regions of + * leaf vdevs. + */ + if (zio->io_type == ZIO_TYPE_READ && + zio->io_error == ECKSUM && + vd->vdev_ops->vdev_op_leaf && + vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1)) + return; + } } if ((ereport = fm_nvlist_create(NULL)) == NULL) @@ -189,21 +212,13 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, */ /* - * If we are importing a faulted pool, then we treat it like an open, - * not an import. Otherwise, the DE will ignore all faults during - * import, since the default behavior is to mark the devices as - * persistently unavailable, not leave them in the faulted state. - */ - state = spa->spa_import_faulted ? SPA_LOAD_OPEN : spa->spa_load_state; - - /* * Generic payload members common to all ereports. */ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL, DATA_TYPE_STRING, spa_name(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, DATA_TYPE_UINT64, spa_guid(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32, - state, NULL); + spa->spa_load_state, NULL); if (spa != NULL) { fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, @@ -222,14 +237,18 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, DATA_TYPE_UINT64, vd->vdev_guid, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL); - if (vd->vdev_path) + if (vd->vdev_path != NULL) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, DATA_TYPE_STRING, vd->vdev_path, NULL); - if (vd->vdev_devid) + if (vd->vdev_devid != NULL) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, DATA_TYPE_STRING, vd->vdev_devid, NULL); + if (vd->vdev_fru != NULL) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, + DATA_TYPE_STRING, vd->vdev_fru, NULL); if (pvd != NULL) { fm_payload_set(ereport, diff --git a/module/zfs/zfs_fuid.c b/module/zfs/zfs_fuid.c index 286dafba8..8e481dffb 100644 --- a/module/zfs/zfs_fuid.c +++ b/module/zfs/zfs_fuid.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -47,8 +47,10 @@ * During file system initialization the nvlist(s) are read and * two AVL trees are created. One tree is keyed by the index number * and the other by the domain string. Nodes are never removed from - * trees, but new entries may be added. If a new entry is added then the - * on-disk packed nvlist will also be updated. + * trees, but new entries may be added. If a new entry is added then + * the zfsvfs->z_fuid_dirty flag is set to true and the caller will then + * be responsible for calling zfs_fuid_sync() to sync the changes to disk. + * */ #define FUID_IDX "fuid_idx" @@ -97,6 +99,15 @@ domain_compare(const void *arg1, const void *arg2) return (val > 0 ? 1 : -1); } +void +zfs_fuid_avl_tree_create(avl_tree_t *idx_tree, avl_tree_t *domain_tree) +{ + avl_create(idx_tree, idx_compare, + sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_idxnode)); + avl_create(domain_tree, domain_compare, + sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_domnode)); +} + /* * load initial fuid domain and idx trees. This function is used by * both the kernel and zdb. @@ -108,12 +119,9 @@ zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree, dmu_buf_t *db; uint64_t fuid_size; - avl_create(idx_tree, idx_compare, - sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_idxnode)); - avl_create(domain_tree, domain_compare, - sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_domnode)); - - VERIFY(0 == dmu_bonus_hold(os, fuid_obj, FTAG, &db)); + ASSERT(fuid_obj != 0); + VERIFY(0 == dmu_bonus_hold(os, fuid_obj, + FTAG, &db)); fuid_size = *(uint64_t *)db->db_data; dmu_buf_rele(db, FTAG); @@ -125,7 +133,8 @@ zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree, int i; packed = kmem_alloc(fuid_size, KM_SLEEP); - VERIFY(dmu_read(os, fuid_obj, 0, fuid_size, packed) == 0); + VERIFY(dmu_read(os, fuid_obj, 0, + fuid_size, packed, DMU_READ_PREFETCH) == 0); VERIFY(nvlist_unpack(packed, fuid_size, &nvp, 0) == 0); VERIFY(nvlist_lookup_nvlist_array(nvp, FUID_NVP_ARRAY, @@ -189,10 +198,8 @@ zfs_fuid_idx_domain(avl_tree_t *idx_tree, uint32_t idx) * Load the fuid table(s) into memory. */ static void -zfs_fuid_init(zfsvfs_t *zfsvfs, dmu_tx_t *tx) +zfs_fuid_init(zfsvfs_t *zfsvfs) { - int error = 0; - rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER); if (zfsvfs->z_fuid_loaded) { @@ -200,41 +207,101 @@ zfs_fuid_init(zfsvfs_t *zfsvfs, dmu_tx_t *tx) return; } - if (zfsvfs->z_fuid_obj == 0) { - - /* first make sure we need to allocate object */ - - error = zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ, - ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj); - if (error == ENOENT && tx != NULL) { - zfsvfs->z_fuid_obj = dmu_object_alloc(zfsvfs->z_os, - DMU_OT_FUID, 1 << 14, DMU_OT_FUID_SIZE, - sizeof (uint64_t), tx); - VERIFY(zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, - ZFS_FUID_TABLES, sizeof (uint64_t), 1, - &zfsvfs->z_fuid_obj, tx) == 0); - } - } + zfs_fuid_avl_tree_create(&zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain); + (void) zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ, + ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj); if (zfsvfs->z_fuid_obj != 0) { zfsvfs->z_fuid_size = zfs_fuid_table_load(zfsvfs->z_os, zfsvfs->z_fuid_obj, &zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain); - zfsvfs->z_fuid_loaded = B_TRUE; } + zfsvfs->z_fuid_loaded = B_TRUE; + rw_exit(&zfsvfs->z_fuid_lock); +} + +/* + * sync out AVL trees to persistent storage. + */ +void +zfs_fuid_sync(zfsvfs_t *zfsvfs, dmu_tx_t *tx) +{ + nvlist_t *nvp; + nvlist_t **fuids; + size_t nvsize = 0; + char *packed; + dmu_buf_t *db; + fuid_domain_t *domnode; + int numnodes; + int i; + + if (!zfsvfs->z_fuid_dirty) { + return; + } + + rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER); + + /* + * First see if table needs to be created? + */ + if (zfsvfs->z_fuid_obj == 0) { + zfsvfs->z_fuid_obj = dmu_object_alloc(zfsvfs->z_os, + DMU_OT_FUID, 1 << 14, DMU_OT_FUID_SIZE, + sizeof (uint64_t), tx); + VERIFY(zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, + ZFS_FUID_TABLES, sizeof (uint64_t), 1, + &zfsvfs->z_fuid_obj, tx) == 0); + } + + VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + numnodes = avl_numnodes(&zfsvfs->z_fuid_idx); + fuids = kmem_alloc(numnodes * sizeof (void *), KM_SLEEP); + for (i = 0, domnode = avl_first(&zfsvfs->z_fuid_domain); domnode; i++, + domnode = AVL_NEXT(&zfsvfs->z_fuid_domain, domnode)) { + VERIFY(nvlist_alloc(&fuids[i], NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64(fuids[i], FUID_IDX, + domnode->f_idx) == 0); + VERIFY(nvlist_add_uint64(fuids[i], FUID_OFFSET, 0) == 0); + VERIFY(nvlist_add_string(fuids[i], FUID_DOMAIN, + domnode->f_ksid->kd_name) == 0); + } + VERIFY(nvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY, + fuids, numnodes) == 0); + for (i = 0; i != numnodes; i++) + nvlist_free(fuids[i]); + kmem_free(fuids, numnodes * sizeof (void *)); + VERIFY(nvlist_size(nvp, &nvsize, NV_ENCODE_XDR) == 0); + packed = kmem_alloc(nvsize, KM_SLEEP); + VERIFY(nvlist_pack(nvp, &packed, &nvsize, + NV_ENCODE_XDR, KM_SLEEP) == 0); + nvlist_free(nvp); + zfsvfs->z_fuid_size = nvsize; + dmu_write(zfsvfs->z_os, zfsvfs->z_fuid_obj, 0, + zfsvfs->z_fuid_size, packed, tx); + kmem_free(packed, zfsvfs->z_fuid_size); + VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, zfsvfs->z_fuid_obj, + FTAG, &db)); + dmu_buf_will_dirty(db, tx); + *(uint64_t *)db->db_data = zfsvfs->z_fuid_size; + dmu_buf_rele(db, FTAG); + + zfsvfs->z_fuid_dirty = B_FALSE; rw_exit(&zfsvfs->z_fuid_lock); } /* * Query domain table for a given domain. * - * If domain isn't found it is added to AVL trees and - * the results are pushed out to disk. + * If domain isn't found and addok is set, it is added to AVL trees and + * the zfsvfs->z_fuid_dirty flag will be set to TRUE. It will then be + * necessary for the caller or another thread to detect the dirty table + * and sync out the changes. */ int -zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain, char **retdomain, - dmu_tx_t *tx) +zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain, + char **retdomain, boolean_t addok) { fuid_domain_t searchnode, *findnode; avl_index_t loc; @@ -246,16 +313,16 @@ zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain, char **retdomain, * for the user nobody. */ if (domain[0] == '\0') { - *retdomain = nulldomain; + if (retdomain) + *retdomain = nulldomain; return (0); } searchnode.f_ksid = ksid_lookupdomain(domain); - if (retdomain) { + if (retdomain) *retdomain = searchnode.f_ksid->kd_name; - } if (!zfsvfs->z_fuid_loaded) - zfs_fuid_init(zfsvfs, tx); + zfs_fuid_init(zfsvfs); retry: rw_enter(&zfsvfs->z_fuid_lock, rw); @@ -265,15 +332,9 @@ retry: rw_exit(&zfsvfs->z_fuid_lock); ksiddomain_rele(searchnode.f_ksid); return (findnode->f_idx); - } else { + } else if (addok) { fuid_domain_t *domnode; - nvlist_t *nvp; - nvlist_t **fuids; uint64_t retidx; - size_t nvsize = 0; - char *packed; - dmu_buf_t *db; - int i = 0; if (rw == RW_READER && !rw_tryupgrade(&zfsvfs->z_fuid_lock)) { rw_exit(&zfsvfs->z_fuid_lock); @@ -288,46 +349,11 @@ retry: avl_add(&zfsvfs->z_fuid_domain, domnode); avl_add(&zfsvfs->z_fuid_idx, domnode); - /* - * Now resync the on-disk nvlist. - */ - VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); - - domnode = avl_first(&zfsvfs->z_fuid_domain); - fuids = kmem_alloc(retidx * sizeof (void *), KM_SLEEP); - while (domnode) { - VERIFY(nvlist_alloc(&fuids[i], - NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_uint64(fuids[i], FUID_IDX, - domnode->f_idx) == 0); - VERIFY(nvlist_add_uint64(fuids[i], - FUID_OFFSET, 0) == 0); - VERIFY(nvlist_add_string(fuids[i++], FUID_DOMAIN, - domnode->f_ksid->kd_name) == 0); - domnode = AVL_NEXT(&zfsvfs->z_fuid_domain, domnode); - } - VERIFY(nvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY, - fuids, retidx) == 0); - for (i = 0; i != retidx; i++) - nvlist_free(fuids[i]); - kmem_free(fuids, retidx * sizeof (void *)); - VERIFY(nvlist_size(nvp, &nvsize, NV_ENCODE_XDR) == 0); - packed = kmem_alloc(nvsize, KM_SLEEP); - VERIFY(nvlist_pack(nvp, &packed, &nvsize, - NV_ENCODE_XDR, KM_SLEEP) == 0); - nvlist_free(nvp); - zfsvfs->z_fuid_size = nvsize; - dmu_write(zfsvfs->z_os, zfsvfs->z_fuid_obj, 0, - zfsvfs->z_fuid_size, packed, tx); - kmem_free(packed, zfsvfs->z_fuid_size); - VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, zfsvfs->z_fuid_obj, - FTAG, &db)); - dmu_buf_will_dirty(db, tx); - *(uint64_t *)db->db_data = zfsvfs->z_fuid_size; - dmu_buf_rele(db, FTAG); - + zfsvfs->z_fuid_dirty = B_TRUE; rw_exit(&zfsvfs->z_fuid_lock); return (retidx); + } else { + return (-1); } } @@ -337,7 +363,7 @@ retry: * Returns a pointer from an avl node of the domain string. * */ -static char * +const char * zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx) { char *domain; @@ -346,7 +372,7 @@ zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx) return (NULL); if (!zfsvfs->z_fuid_loaded) - zfs_fuid_init(zfsvfs, NULL); + zfs_fuid_init(zfsvfs); rw_enter(&zfsvfs->z_fuid_lock, RW_READER); @@ -374,7 +400,7 @@ zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid, cred_t *cr, zfs_fuid_type_t type) { uint32_t index = FUID_INDEX(fuid); - char *domain; + const char *domain; uid_t id; if (index == 0) @@ -439,6 +465,7 @@ zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid, } if (type == ZFS_ACE_USER || type == ZFS_ACE_GROUP) { + /* * Now allocate fuid entry and add it on the end of the list */ @@ -463,7 +490,7 @@ zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid, */ uint64_t zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type, - dmu_tx_t *tx, cred_t *cr, zfs_fuid_info_t **fuidp) + cred_t *cr, zfs_fuid_info_t **fuidp) { uint64_t idx; ksid_t *ksid; @@ -490,7 +517,7 @@ zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type, rid = ksid_getrid(ksid); domain = ksid_getdomain(ksid); - idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, tx); + idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE); zfs_fuid_node_add(fuidp, kdomain, rid, idx, id, type); @@ -511,7 +538,7 @@ zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type, */ uint64_t zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr, - zfs_fuid_type_t type, dmu_tx_t *tx, zfs_fuid_info_t **fuidpp) + zfs_fuid_type_t type, zfs_fuid_info_t **fuidpp) { const char *domain; char *kdomain; @@ -581,10 +608,11 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr, } } - idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, tx); + idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE); if (!zfsvfs->z_replay) - zfs_fuid_node_add(fuidpp, kdomain, rid, idx, id, type); + zfs_fuid_node_add(fuidpp, kdomain, + rid, idx, id, type); else if (zfuid != NULL) { list_remove(&fuidp->z_fuids, zfuid); kmem_free(zfuid, sizeof (zfs_fuid_t)); @@ -658,16 +686,15 @@ boolean_t zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr) { ksid_t *ksid = crgetsid(cr, KSID_GROUP); + ksidlist_t *ksidlist = crgetsidlist(cr); uid_t gid; - if (ksid) { + if (ksid && ksidlist) { int i; ksid_t *ksid_groups; - ksidlist_t *ksidlist = crgetsidlist(cr); uint32_t idx = FUID_INDEX(id); uint32_t rid = FUID_RID(id); - ASSERT(ksidlist); ksid_groups = ksidlist->ksl_sids; for (i = 0; i != ksidlist->ksl_nsid; i++) { @@ -677,7 +704,7 @@ zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr) return (B_TRUE); } } else { - char *domain; + const char *domain; domain = zfs_fuid_find_by_idx(zfsvfs, idx); ASSERT(domain != NULL); @@ -700,4 +727,19 @@ zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr) gid = zfs_fuid_map_id(zfsvfs, id, cr, ZFS_GROUP); return (groupmember(gid, cr)); } + +void +zfs_fuid_txhold(zfsvfs_t *zfsvfs, dmu_tx_t *tx) +{ + if (zfsvfs->z_fuid_obj == 0) { + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); + } else { + dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); + dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + } +} #endif diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index d032648b5..07dd03c35 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -79,17 +79,29 @@ dev_info_t *zfs_dip; typedef int zfs_ioc_func_t(zfs_cmd_t *); typedef int zfs_secpolicy_func_t(zfs_cmd_t *, cred_t *); +typedef enum { + NO_NAME, + POOL_NAME, + DATASET_NAME +} zfs_ioc_namecheck_t; + typedef struct zfs_ioc_vec { zfs_ioc_func_t *zvec_func; zfs_secpolicy_func_t *zvec_secpolicy; - enum { - NO_NAME, - POOL_NAME, - DATASET_NAME - } zvec_namecheck; + zfs_ioc_namecheck_t zvec_namecheck; boolean_t zvec_his_log; + boolean_t zvec_pool_check; } zfs_ioc_vec_t; +/* This array is indexed by zfs_userquota_prop_t */ +static const char *userquota_perms[] = { + ZFS_DELEG_PERM_USERUSED, + ZFS_DELEG_PERM_USERQUOTA, + ZFS_DELEG_PERM_GROUPUSED, + ZFS_DELEG_PERM_GROUPQUOTA, +}; + +static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc); static void clear_props(char *dataset, nvlist_t *props, nvlist_t *newprops); static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *, boolean_t *); @@ -389,6 +401,30 @@ zfs_secpolicy_send(zfs_cmd_t *zc, cred_t *cr) ZFS_DELEG_PERM_SEND, cr)); } +static int +zfs_secpolicy_deleg_share(zfs_cmd_t *zc, cred_t *cr) +{ + vnode_t *vp; + int error; + + if ((error = lookupname(zc->zc_value, UIO_SYSSPACE, + NO_FOLLOW, NULL, &vp)) != 0) + return (error); + + /* Now make sure mntpnt and dataset are ZFS */ + + if (vp->v_vfsp->vfs_fstype != zfsfstype || + (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource), + zc->zc_name) != 0)) { + VN_RELE(vp); + return (EPERM); + } + + VN_RELE(vp); + return (dsl_deleg_access(zc->zc_name, + ZFS_DELEG_PERM_SHARE, cr)); +} + int zfs_secpolicy_share(zfs_cmd_t *zc, cred_t *cr) { @@ -398,25 +434,20 @@ zfs_secpolicy_share(zfs_cmd_t *zc, cred_t *cr) if (secpolicy_nfs(cr) == 0) { return (0); } else { - vnode_t *vp; - int error; - - if ((error = lookupname(zc->zc_value, UIO_SYSSPACE, - NO_FOLLOW, NULL, &vp)) != 0) - return (error); - - /* Now make sure mntpnt and dataset are ZFS */ + return (zfs_secpolicy_deleg_share(zc, cr)); + } +} - if (vp->v_vfsp->vfs_fstype != zfsfstype || - (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource), - zc->zc_name) != 0)) { - VN_RELE(vp); - return (EPERM); - } +int +zfs_secpolicy_smb_acl(zfs_cmd_t *zc, cred_t *cr) +{ + if (!INGLOBALZONE(curproc)) + return (EPERM); - VN_RELE(vp); - return (dsl_deleg_access(zc->zc_name, - ZFS_DELEG_PERM_SHARE, cr)); + if (secpolicy_smb(cr) == 0) { + return (0); + } else { + return (zfs_secpolicy_deleg_share(zc, cr)); } } @@ -681,11 +712,60 @@ zfs_secpolicy_inherit(zfs_cmd_t *zc, cred_t *cr) } } +static int +zfs_secpolicy_userspace_one(zfs_cmd_t *zc, cred_t *cr) +{ + int err = zfs_secpolicy_read(zc, cr); + if (err) + return (err); + + if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) + return (EINVAL); + + if (zc->zc_value[0] == 0) { + /* + * They are asking about a posix uid/gid. If it's + * themself, allow it. + */ + if (zc->zc_objset_type == ZFS_PROP_USERUSED || + zc->zc_objset_type == ZFS_PROP_USERQUOTA) { + if (zc->zc_guid == crgetuid(cr)) + return (0); + } else { + if (groupmember(zc->zc_guid, cr)) + return (0); + } + } + + return (zfs_secpolicy_write_perms(zc->zc_name, + userquota_perms[zc->zc_objset_type], cr)); +} + +static int +zfs_secpolicy_userspace_many(zfs_cmd_t *zc, cred_t *cr) +{ + int err = zfs_secpolicy_read(zc, cr); + if (err) + return (err); + + if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) + return (EINVAL); + + return (zfs_secpolicy_write_perms(zc->zc_name, + userquota_perms[zc->zc_objset_type], cr)); +} + +static int +zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, cred_t *cr) +{ + return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION, cr)); +} + /* * Returns the nvlist as specified by the user in the zfs_cmd_t. */ static int -get_nvlist(uint64_t nvl, uint64_t size, nvlist_t **nvp) +get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp) { char *packed; int error; @@ -699,7 +779,8 @@ get_nvlist(uint64_t nvl, uint64_t size, nvlist_t **nvp) packed = kmem_alloc(size, KM_SLEEP); - if ((error = xcopyin((void *)(uintptr_t)nvl, packed, size)) != 0) { + if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size, + iflag)) != 0) { kmem_free(packed, size); return (error); } @@ -730,8 +811,8 @@ put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) packed = kmem_alloc(size, KM_SLEEP); VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE, KM_SLEEP) == 0); - error = xcopyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst, - size); + error = ddi_copyout(packed, + (void *)(uintptr_t)zc->zc_nvlist_dst, size, zc->zc_iflags); kmem_free(packed, size); } @@ -740,6 +821,69 @@ put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) } static int +getzfsvfs(const char *dsname, zfsvfs_t **zvp) +{ + objset_t *os; + int error; + + error = dmu_objset_open(dsname, DMU_OST_ZFS, + DS_MODE_USER | DS_MODE_READONLY, &os); + if (error) + return (error); + + mutex_enter(&os->os->os_user_ptr_lock); + *zvp = dmu_objset_get_user(os); + if (*zvp) { + VFS_HOLD((*zvp)->z_vfs); + } else { + error = ESRCH; + } + mutex_exit(&os->os->os_user_ptr_lock); + dmu_objset_close(os); + return (error); +} + +/* + * Find a zfsvfs_t for a mounted filesystem, or create our own, in which + * case its z_vfs will be NULL, and it will be opened as the owner. + */ +static int +zfsvfs_hold(const char *name, boolean_t readonly, void *tag, zfsvfs_t **zvp) +{ + int error = 0; + int mode = DS_MODE_OWNER | (readonly ? DS_MODE_READONLY : 0); + + if (getzfsvfs(name, zvp) != 0) + error = zfsvfs_create(name, mode, zvp); + if (error == 0) { + rrw_enter(&(*zvp)->z_teardown_lock, RW_READER, tag); + if ((*zvp)->z_unmounted) { + /* + * XXX we could probably try again, since the unmounting + * thread should be just about to disassociate the + * objset from the zfsvfs. + */ + rrw_exit(&(*zvp)->z_teardown_lock, tag); + return (EBUSY); + } + } + return (error); +} + +static void +zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag) +{ + rrw_exit(&zfsvfs->z_teardown_lock, tag); + + if (zfsvfs->z_vfs) { + VFS_RELE(zfsvfs->z_vfs); + } else { + dmu_objset_close(zfsvfs->z_os); + zfsvfs_free(zfsvfs); + } +} + +static int zfs_ioc_pool_create(zfs_cmd_t *zc) { int error; @@ -749,11 +893,12 @@ zfs_ioc_pool_create(zfs_cmd_t *zc) char *buf; if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, - &config)) + zc->zc_iflags, &config)) return (error); if (zc->zc_nvlist_src_size != 0 && (error = - get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, &props))) { + get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &props))) { nvlist_free(config); return (error); } @@ -825,11 +970,12 @@ zfs_ioc_pool_import(zfs_cmd_t *zc) uint64_t guid; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, - &config)) != 0) + zc->zc_iflags, &config)) != 0) return (error); if (zc->zc_nvlist_src_size != 0 && (error = - get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, &props))) { + get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &props))) { nvlist_free(config); return (error); } @@ -838,7 +984,7 @@ zfs_ioc_pool_import(zfs_cmd_t *zc) guid != zc->zc_guid) error = EINVAL; else if (zc->zc_cookie) - error = spa_import_faulted(zc->zc_name, config, + error = spa_import_verbatim(zc->zc_name, config, props); else error = spa_import(zc->zc_name, config, props); @@ -917,7 +1063,7 @@ zfs_ioc_pool_tryimport(zfs_cmd_t *zc) int error; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, - &tryconfig)) != 0) + zc->zc_iflags, &tryconfig)) != 0) return (error); config = spa_tryimport(tryconfig); @@ -1005,9 +1151,9 @@ zfs_ioc_pool_get_history(zfs_cmd_t *zc) hist_buf = kmem_alloc(size, KM_SLEEP); if ((error = spa_history_get(spa, &zc->zc_history_offset, &zc->zc_history_len, hist_buf)) == 0) { - error = xcopyout(hist_buf, - (char *)(uintptr_t)zc->zc_history, - zc->zc_history_len); + error = ddi_copyout(hist_buf, + (void *)(uintptr_t)zc->zc_history, + zc->zc_history_len, zc->zc_iflags); } spa_close(spa, FTAG); @@ -1055,7 +1201,7 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc) return (error); error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, - &config); + zc->zc_iflags, &config); (void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache); @@ -1145,7 +1291,7 @@ zfs_ioc_vdev_attach(zfs_cmd_t *zc) return (error); if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, - &config)) == 0) { + zc->zc_iflags, &config)) == 0) { error = spa_vdev_attach(spa, zc->zc_guid, config, replacing); nvlist_free(config); } @@ -1186,6 +1332,23 @@ zfs_ioc_vdev_setpath(zfs_cmd_t *zc) return (error); } +static int +zfs_ioc_vdev_setfru(zfs_cmd_t *zc) +{ + spa_t *spa; + char *fru = zc->zc_value; + uint64_t guid = zc->zc_guid; + int error; + + error = spa_open(zc->zc_name, &spa, FTAG); + if (error != 0) + return (error); + + error = spa_vdev_setfru(spa, guid, fru); + spa_close(spa, FTAG); + return (error); +} + /* * inputs: * zc_name name of filesystem @@ -1291,6 +1454,23 @@ zfs_ioc_objset_zplprops(zfs_cmd_t *zc) return (err); } +static boolean_t +dataset_name_hidden(const char *name) +{ + /* + * Skip over datasets that are not visible in this zone, + * internal datasets (which have a $ in their name), and + * temporary datasets (which have a % in their name). + */ + if (strchr(name, '$') != NULL) + return (B_TRUE); + if (strchr(name, '%') != NULL) + return (B_TRUE); + if (!INGLOBALZONE(curproc) && !zone_dataset_visible(name, NULL)) + return (B_TRUE); + return (B_FALSE); +} + /* * inputs: * zc_name name of filesystem @@ -1299,6 +1479,7 @@ zfs_ioc_objset_zplprops(zfs_cmd_t *zc) * * outputs: * zc_name name of next filesystem + * zc_cookie zap cursor * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist @@ -1322,12 +1503,16 @@ zfs_ioc_dataset_list_next(zfs_cmd_t *zc) (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name)); p = zc->zc_name + strlen(zc->zc_name); + /* + * Pre-fetch the datasets. dmu_objset_prefetch() always returns 0 + * but is not declared void because its called by dmu_objset_find(). + */ if (zc->zc_cookie == 0) { uint64_t cookie = 0; int len = sizeof (zc->zc_name) - (p - zc->zc_name); while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0) - dmu_objset_prefetch(p, NULL); + (void) dmu_objset_prefetch(p, NULL); } do { @@ -1336,15 +1521,10 @@ zfs_ioc_dataset_list_next(zfs_cmd_t *zc) NULL, &zc->zc_cookie); if (error == ENOENT) error = ESRCH; - } while (error == 0 && !INGLOBALZONE(curproc) && - !zone_dataset_visible(zc->zc_name, NULL)); + } while (error == 0 && dataset_name_hidden(zc->zc_name)); dmu_objset_close(os); - /* - * If it's a hidden dataset (ie. with a '$' in its name), don't - * try to get stats for it. Userland will skip over it. - */ - if (error == 0 && strchr(zc->zc_name, '$') == NULL) + if (error == 0) error = zfs_ioc_objset_stats(zc); /* fill in the stats */ return (error); @@ -1373,9 +1553,10 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) if (error) return (error == ENOENT ? ESRCH : error); - if (zc->zc_cookie == 0) - dmu_objset_find(zc->zc_name, dmu_objset_prefetch, + if (zc->zc_cookie == 0) { + (void) dmu_objset_find(zc->zc_name, dmu_objset_prefetch, NULL, DS_FIND_SNAPSHOTS); + } /* * A dataset name of maximum length cannot have any snapshots, * so exit immediately. @@ -1404,9 +1585,11 @@ int zfs_set_prop_nvlist(const char *name, nvlist_t *nvl) { nvpair_t *elem; - int error; + int error = 0; uint64_t intval; char *strval; + nvlist_t *genericnvl; + boolean_t issnap = (strchr(name, '@') != NULL); /* * First validate permission to set all of the properties @@ -1421,16 +1604,35 @@ zfs_set_prop_nvlist(const char *name, nvlist_t *nvl) * If this is a user-defined property, it must be a * string, and there is no further validation to do. */ - if (!zfs_prop_user(propname) || - nvpair_type(elem) != DATA_TYPE_STRING) - return (EINVAL); + if (zfs_prop_user(propname) && + nvpair_type(elem) == DATA_TYPE_STRING) { + if (error = zfs_secpolicy_write_perms(name, + ZFS_DELEG_PERM_USERPROP, CRED())) + return (error); + continue; + } - if (error = zfs_secpolicy_write_perms(name, - ZFS_DELEG_PERM_USERPROP, CRED())) - return (error); - continue; + if (!issnap && zfs_prop_userquota(propname) && + nvpair_type(elem) == DATA_TYPE_UINT64_ARRAY) { + const char *perm; + const char *up = zfs_userquota_prop_prefixes + [ZFS_PROP_USERQUOTA]; + if (strncmp(propname, up, strlen(up)) == 0) + perm = ZFS_DELEG_PERM_USERQUOTA; + else + perm = ZFS_DELEG_PERM_GROUPQUOTA; + if (error = zfs_secpolicy_write_perms(name, + perm, CRED())) + return (error); + continue; + } + + return (EINVAL); } + if (issnap) + return (EINVAL); + if ((error = zfs_secpolicy_setprop(name, prop, CRED())) != 0) return (error); @@ -1466,8 +1668,7 @@ zfs_set_prop_nvlist(const char *name, nvlist_t *nvl) break; case ZFS_PROP_COPIES: - if (zfs_earlier_version(name, - SPA_VERSION_DITTO_BLOCKS)) + if (zfs_earlier_version(name, SPA_VERSION_DITTO_BLOCKS)) return (ENOTSUP); break; @@ -1486,77 +1687,122 @@ zfs_set_prop_nvlist(const char *name, nvlist_t *nvl) } } + VERIFY(nvlist_alloc(&genericnvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); elem = NULL; while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { const char *propname = nvpair_name(elem); zfs_prop_t prop = zfs_name_to_prop(propname); if (prop == ZPROP_INVAL) { - VERIFY(nvpair_value_string(elem, &strval) == 0); - error = dsl_prop_set(name, propname, 1, - strlen(strval) + 1, strval); - if (error == 0) - continue; - else - return (error); + if (zfs_prop_userquota(propname)) { + uint64_t *valary; + unsigned int vallen; + const char *domain; + zfs_userquota_prop_t type; + uint64_t rid; + uint64_t quota; + zfsvfs_t *zfsvfs; + + VERIFY(nvpair_value_uint64_array(elem, + &valary, &vallen) == 0); + VERIFY(vallen == 3); + type = valary[0]; + rid = valary[1]; + quota = valary[2]; + domain = propname + + strlen(zfs_userquota_prop_prefixes[type]); + + error = zfsvfs_hold(name, B_FALSE, FTAG, + &zfsvfs); + if (error == 0) { + error = zfs_set_userquota(zfsvfs, + type, domain, rid, quota); + zfsvfs_rele(zfsvfs, FTAG); + } + if (error == 0) + continue; + else + goto out; + } else if (zfs_prop_user(propname)) { + VERIFY(nvpair_value_string(elem, &strval) == 0); + error = dsl_prop_set(name, propname, 1, + strlen(strval) + 1, strval); + if (error == 0) + continue; + else + goto out; + } } switch (prop) { case ZFS_PROP_QUOTA: if ((error = nvpair_value_uint64(elem, &intval)) != 0 || (error = dsl_dir_set_quota(name, intval)) != 0) - return (error); + goto out; break; case ZFS_PROP_REFQUOTA: if ((error = nvpair_value_uint64(elem, &intval)) != 0 || (error = dsl_dataset_set_quota(name, intval)) != 0) - return (error); + goto out; break; case ZFS_PROP_RESERVATION: if ((error = nvpair_value_uint64(elem, &intval)) != 0 || (error = dsl_dir_set_reservation(name, intval)) != 0) - return (error); + goto out; break; case ZFS_PROP_REFRESERVATION: if ((error = nvpair_value_uint64(elem, &intval)) != 0 || (error = dsl_dataset_set_reservation(name, intval)) != 0) - return (error); + goto out; break; case ZFS_PROP_VOLSIZE: if ((error = nvpair_value_uint64(elem, &intval)) != 0 || (error = zvol_set_volsize(name, ddi_driver_major(zfs_dip), intval)) != 0) - return (error); + goto out; break; case ZFS_PROP_VOLBLOCKSIZE: if ((error = nvpair_value_uint64(elem, &intval)) != 0 || (error = zvol_set_volblocksize(name, intval)) != 0) - return (error); + goto out; break; case ZFS_PROP_VERSION: - if ((error = nvpair_value_uint64(elem, &intval)) != 0 || - (error = zfs_set_version(name, intval)) != 0) - return (error); + { + zfsvfs_t *zfsvfs; + + if ((error = nvpair_value_uint64(elem, &intval)) != 0) + goto out; + if ((error = zfsvfs_hold(name, B_FALSE, FTAG, + &zfsvfs)) != 0) + goto out; + error = zfs_set_version(zfsvfs, intval); + zfsvfs_rele(zfsvfs, FTAG); + + if (error == 0 && intval >= ZPL_VERSION_USERSPACE) { + zfs_cmd_t zc = { 0 }; + (void) strcpy(zc.zc_name, name); + (void) zfs_ioc_userspace_upgrade(&zc); + } + if (error) + goto out; break; + } default: if (nvpair_type(elem) == DATA_TYPE_STRING) { if (zfs_prop_get_type(prop) != - PROP_TYPE_STRING) - return (EINVAL); - VERIFY(nvpair_value_string(elem, &strval) == 0); - if ((error = dsl_prop_set(name, - nvpair_name(elem), 1, strlen(strval) + 1, - strval)) != 0) - return (error); + PROP_TYPE_STRING) { + error = EINVAL; + goto out; + } } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { const char *unused; @@ -1566,35 +1812,72 @@ zfs_set_prop_nvlist(const char *name, nvlist_t *nvl) case PROP_TYPE_NUMBER: break; case PROP_TYPE_STRING: - return (EINVAL); + error = EINVAL; + goto out; case PROP_TYPE_INDEX: if (zfs_prop_index_to_string(prop, - intval, &unused) != 0) - return (EINVAL); + intval, &unused) != 0) { + error = EINVAL; + goto out; + } break; default: cmn_err(CE_PANIC, "unknown property type"); break; } - - if ((error = dsl_prop_set(name, propname, - 8, 1, &intval)) != 0) - return (error); } else { - return (EINVAL); + error = EINVAL; + goto out; } - break; + if ((error = nvlist_add_nvpair(genericnvl, elem)) != 0) + goto out; } } + if (nvlist_next_nvpair(genericnvl, NULL) != NULL) { + error = dsl_props_set(name, genericnvl); + } +out: + nvlist_free(genericnvl); + return (error); +} + +/* + * Check that all the properties are valid user properties. + */ +static int +zfs_check_userprops(char *fsname, nvlist_t *nvl) +{ + nvpair_t *elem = NULL; + int error = 0; + + while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { + const char *propname = nvpair_name(elem); + char *valstr; + + if (!zfs_prop_user(propname) || + nvpair_type(elem) != DATA_TYPE_STRING) + return (EINVAL); + + if (error = zfs_secpolicy_write_perms(fsname, + ZFS_DELEG_PERM_USERPROP, CRED())) + return (error); + + if (strlen(propname) >= ZAP_MAXNAMELEN) + return (ENAMETOOLONG); + + VERIFY(nvpair_value_string(elem, &valstr) == 0); + if (strlen(valstr) >= ZAP_MAXVALUELEN) + return (E2BIG); + } return (0); } /* * inputs: * zc_name name of filesystem - * zc_value name of property to inherit + * zc_value name of property to set * zc_nvlist_src{_size} nvlist of properties to apply * zc_cookie clear existing local props? * @@ -1607,7 +1890,7 @@ zfs_ioc_set_prop(zfs_cmd_t *zc) int error; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - &nvl)) != 0) + zc->zc_iflags, &nvl)) != 0) return (error); if (zc->zc_cookie) { @@ -1654,7 +1937,7 @@ zfs_ioc_pool_set_props(zfs_cmd_t *zc) nvpair_t *elem; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - &props))) + zc->zc_iflags, &props))) return (error); /* @@ -1731,7 +2014,7 @@ zfs_ioc_iscsi_perm_check(zfs_cmd_t *zc) cred_t *usercred; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - &nvp)) != 0) { + zc->zc_iflags, &nvp)) != 0) { return (error); } @@ -1781,7 +2064,7 @@ zfs_ioc_set_fsacl(zfs_cmd_t *zc) nvlist_t *fsaclnv = NULL; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - &fsaclnv)) != 0) + zc->zc_iflags, &fsaclnv)) != 0) return (error); /* @@ -1918,11 +2201,10 @@ zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) * processing. */ static int -zfs_fill_zplprops_impl(objset_t *os, uint64_t default_zplver, +zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, boolean_t fuids_ok, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { - uint64_t zplver = default_zplver; uint64_t sense = ZFS_PROP_UNDEFINED; uint64_t norm = ZFS_PROP_UNDEFINED; uint64_t u8 = ZFS_PROP_UNDEFINED; @@ -2010,6 +2292,8 @@ zfs_fill_zplprops(const char *dataset, nvlist_t *createprops, ASSERT(cp != NULL); cp[0] = '\0'; + if (zfs_earlier_version(dataset, SPA_VERSION_USERSPACE)) + zplver = ZPL_VERSION_USERSPACE - 1; if (zfs_earlier_version(dataset, SPA_VERSION_FUID)) { zplver = ZPL_VERSION_FUID - 1; fuids_ok = B_FALSE; @@ -2085,7 +2369,7 @@ zfs_ioc_create(zfs_cmd_t *zc) if (zc->zc_nvlist_src != NULL && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - &nvprops)) != 0) + zc->zc_iflags, &nvprops)) != 0) return (error); zct.zct_zplprops = NULL; @@ -2188,32 +2472,12 @@ zfs_ioc_create(zfs_cmd_t *zc) return (error); } -struct snap_prop_arg { - nvlist_t *nvprops; - const char *snapname; -}; - -static int -set_snap_props(char *name, void *arg) -{ - struct snap_prop_arg *snpa = arg; - int len = strlen(name) + strlen(snpa->snapname) + 2; - char *buf = kmem_alloc(len, KM_SLEEP); - int err; - - (void) snprintf(buf, len, "%s@%s", name, snpa->snapname); - err = zfs_set_prop_nvlist(buf, snpa->nvprops); - if (err) - (void) dmu_objset_destroy(buf); - kmem_free(buf, len); - return (err); -} - /* * inputs: * zc_name name of filesystem * zc_value short name of snapshot * zc_cookie recursive flag + * zc_nvlist_src[_size] property list * * outputs: none */ @@ -2229,29 +2493,23 @@ zfs_ioc_snapshot(zfs_cmd_t *zc) if (zc->zc_nvlist_src != NULL && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - &nvprops)) != 0) + zc->zc_iflags, &nvprops)) != 0) return (error); - error = dmu_objset_snapshot(zc->zc_name, zc->zc_value, recursive); + error = zfs_check_userprops(zc->zc_name, nvprops); + if (error) + goto out; - /* - * It would be nice to do this atomically. - */ - if (error == 0) { - struct snap_prop_arg snpa; - snpa.nvprops = nvprops; - snpa.snapname = zc->zc_value; - if (recursive) { - error = dmu_objset_find(zc->zc_name, - set_snap_props, &snpa, DS_FIND_CHILDREN); - if (error) { - (void) dmu_snapshots_destroy(zc->zc_name, - zc->zc_value); - } - } else { - error = set_snap_props(zc->zc_name, &snpa); - } + if (nvprops != NULL && nvlist_next_nvpair(nvprops, NULL) != NULL && + zfs_earlier_version(zc->zc_name, SPA_VERSION_SNAP_PROPS)) { + error = ENOTSUP; + goto out; } + + error = dmu_objset_snapshot(zc->zc_name, zc->zc_value, + nvprops, recursive); + +out: nvlist_free(nvprops); return (error); } @@ -2355,31 +2613,19 @@ zfs_ioc_rollback(zfs_cmd_t *zc) if (error) return (error); - if (dmu_objset_type(os) == DMU_OST_ZFS) { - mutex_enter(&os->os->os_user_ptr_lock); - zfsvfs = dmu_objset_get_user(os); - if (zfsvfs != NULL) - VFS_HOLD(zfsvfs->z_vfs); - mutex_exit(&os->os->os_user_ptr_lock); - } - - if (zfsvfs != NULL) { - char *osname; + if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) { int mode; - osname = kmem_alloc(MAXNAMELEN, KM_SLEEP); - error = zfs_suspend_fs(zfsvfs, osname, &mode); + error = zfs_suspend_fs(zfsvfs, NULL, &mode); if (error == 0) { int resume_err; - ASSERT(strcmp(osname, zc->zc_name) == 0); error = dmu_objset_rollback(os); - resume_err = zfs_resume_fs(zfsvfs, osname, mode); + resume_err = zfs_resume_fs(zfsvfs, zc->zc_name, mode); error = error ? error : resume_err; } else { dmu_objset_close(os); } - kmem_free(osname, MAXNAMELEN); VFS_RELE(zfsvfs->z_vfs); } else { error = dmu_objset_rollback(os); @@ -2484,7 +2730,7 @@ zfs_ioc_recv(zfs_cmd_t *zc) if (zc->zc_nvlist_src != NULL && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - &props)) != 0) + zc->zc_iflags, &props)) != 0) return (error); fd = zc->zc_cookie; @@ -2494,32 +2740,26 @@ zfs_ioc_recv(zfs_cmd_t *zc) return (EBADF); } - if (dmu_objset_open(tofs, DMU_OST_ANY, - DS_MODE_USER | DS_MODE_READONLY, &os) == 0) { - /* - * Try to get the zfsvfs for the receiving objset. - * There won't be one if we're operating on a zvol, - * if the objset doesn't exist yet, or is not mounted. - */ - mutex_enter(&os->os->os_user_ptr_lock); - if (zfsvfs = dmu_objset_get_user(os)) { - if (!mutex_tryenter(&zfsvfs->z_online_recv_lock)) { - mutex_exit(&os->os->os_user_ptr_lock); - dmu_objset_close(os); - zfsvfs = NULL; - error = EBUSY; - goto out; - } - VFS_HOLD(zfsvfs->z_vfs); + if (getzfsvfs(tofs, &zfsvfs) == 0) { + if (!mutex_tryenter(&zfsvfs->z_online_recv_lock)) { + VFS_RELE(zfsvfs->z_vfs); + zfsvfs = NULL; + error = EBUSY; + goto out; } - mutex_exit(&os->os->os_user_ptr_lock); - /* * If new properties are supplied, they are to completely * replace the existing ones, so stash away the existing ones. */ if (props) - (void) dsl_prop_get_all(os, &origprops, TRUE); + (void) dsl_prop_get_all(zfsvfs->z_os, &origprops, TRUE); + } else if (props && dmu_objset_open(tofs, DMU_OST_ANY, + DS_MODE_USER | DS_MODE_READONLY, &os) == 0) { + /* + * Get the props even if there was no zfsvfs (zvol or + * unmounted zpl). + */ + (void) dsl_prop_get_all(os, &origprops, TRUE); dmu_objset_close(os); } @@ -2759,11 +2999,12 @@ zfs_ioc_clear(zfs_cmd_t *zc) /* * Resume any suspended I/Os. */ - zio_resume(spa); + if (zio_resume(spa) != 0) + error = EIO; spa_close(spa, FTAG); - return (0); + return (error); } /* @@ -2791,6 +3032,120 @@ zfs_ioc_promote(zfs_cmd_t *zc) } /* + * Retrieve a single {user|group}{used|quota}@... property. + * + * inputs: + * zc_name name of filesystem + * zc_objset_type zfs_userquota_prop_t + * zc_value domain name (eg. "S-1-234-567-89") + * zc_guid RID/UID/GID + * + * outputs: + * zc_cookie property value + */ +static int +zfs_ioc_userspace_one(zfs_cmd_t *zc) +{ + zfsvfs_t *zfsvfs; + int error; + + if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) + return (EINVAL); + + error = zfsvfs_hold(zc->zc_name, B_TRUE, FTAG, &zfsvfs); + if (error) + return (error); + + error = zfs_userspace_one(zfsvfs, + zc->zc_objset_type, zc->zc_value, zc->zc_guid, &zc->zc_cookie); + zfsvfs_rele(zfsvfs, FTAG); + + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_cookie zap cursor + * zc_objset_type zfs_userquota_prop_t + * zc_nvlist_dst[_size] buffer to fill (not really an nvlist) + * + * outputs: + * zc_nvlist_dst[_size] data buffer (array of zfs_useracct_t) + * zc_cookie zap cursor + */ +static int +zfs_ioc_userspace_many(zfs_cmd_t *zc) +{ + zfsvfs_t *zfsvfs; + int error; + + error = zfsvfs_hold(zc->zc_name, B_TRUE, FTAG, &zfsvfs); + if (error) + return (error); + + int bufsize = zc->zc_nvlist_dst_size; + void *buf = kmem_alloc(bufsize, KM_SLEEP); + + error = zfs_userspace_many(zfsvfs, zc->zc_objset_type, &zc->zc_cookie, + buf, &zc->zc_nvlist_dst_size); + + if (error == 0) { + error = xcopyout(buf, + (void *)(uintptr_t)zc->zc_nvlist_dst, + zc->zc_nvlist_dst_size); + } + kmem_free(buf, bufsize); + zfsvfs_rele(zfsvfs, FTAG); + + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * + * outputs: + * none + */ +static int +zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) +{ + objset_t *os; + int error; + zfsvfs_t *zfsvfs; + + if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) { + if (!dmu_objset_userused_enabled(zfsvfs->z_os->os)) { + /* + * If userused is not enabled, it may be because the + * objset needs to be closed & reopened (to grow the + * objset_phys_t). Suspend/resume the fs will do that. + */ + int mode; + error = zfs_suspend_fs(zfsvfs, NULL, &mode); + if (error == 0) { + error = zfs_resume_fs(zfsvfs, + zc->zc_name, mode); + } + } + if (error == 0) + error = dmu_objset_userspace_upgrade(zfsvfs->z_os); + VFS_RELE(zfsvfs->z_vfs); + } else { + error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, + DS_MODE_USER, &os); + if (error) + return (error); + + error = dmu_objset_userspace_upgrade(os); + dmu_objset_close(os); + } + + return (error); +} + +/* * We don't want to have a hard dependency * against some special symbols in sharefs * nfs, and smbsrv. Determine them if needed when @@ -2903,7 +3258,7 @@ zfs_ioc_share(zfs_cmd_t *zc) if (error = zsmbexport_fs((void *) (uintptr_t)zc->zc_share.z_exportdata, zc->zc_share.z_sharetype == ZFS_SHARE_SMB ? - B_TRUE : B_FALSE)) { + B_TRUE: B_FALSE)) { return (error); } break; @@ -2924,64 +3279,280 @@ zfs_ioc_share(zfs_cmd_t *zc) } +ace_t full_access[] = { + {(uid_t)-1, ACE_ALL_PERMS, ACE_EVERYONE, 0} +}; + +/* + * Remove all ACL files in shares dir + */ +static int +zfs_smb_acl_purge(znode_t *dzp) +{ + zap_cursor_t zc; + zap_attribute_t zap; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + int error; + + for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id); + (error = zap_cursor_retrieve(&zc, &zap)) == 0; + zap_cursor_advance(&zc)) { + if ((error = VOP_REMOVE(ZTOV(dzp), zap.za_name, kcred, + NULL, 0)) != 0) + break; + } + zap_cursor_fini(&zc); + return (error); +} + +static int +zfs_ioc_smb_acl(zfs_cmd_t *zc) +{ + vnode_t *vp; + znode_t *dzp; + vnode_t *resourcevp = NULL; + znode_t *sharedir; + zfsvfs_t *zfsvfs; + nvlist_t *nvlist; + char *src, *target; + vattr_t vattr; + vsecattr_t vsec; + int error = 0; + + if ((error = lookupname(zc->zc_value, UIO_SYSSPACE, + NO_FOLLOW, NULL, &vp)) != 0) + return (error); + + /* Now make sure mntpnt and dataset are ZFS */ + + if (vp->v_vfsp->vfs_fstype != zfsfstype || + (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource), + zc->zc_name) != 0)) { + VN_RELE(vp); + return (EINVAL); + } + + dzp = VTOZ(vp); + zfsvfs = dzp->z_zfsvfs; + ZFS_ENTER(zfsvfs); + + /* + * Create share dir if its missing. + */ + mutex_enter(&zfsvfs->z_lock); + if (zfsvfs->z_shares_dir == 0) { + dmu_tx_t *tx; + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, TRUE, + ZFS_SHARES_DIR); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + error = zfs_create_share_dir(zfsvfs, tx); + dmu_tx_commit(tx); + } + if (error) { + mutex_exit(&zfsvfs->z_lock); + VN_RELE(vp); + ZFS_EXIT(zfsvfs); + return (error); + } + } + mutex_exit(&zfsvfs->z_lock); + + ASSERT(zfsvfs->z_shares_dir); + if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &sharedir)) != 0) { + VN_RELE(vp); + ZFS_EXIT(zfsvfs); + return (error); + } + + switch (zc->zc_cookie) { + case ZFS_SMB_ACL_ADD: + vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; + vattr.va_type = VREG; + vattr.va_mode = S_IFREG|0777; + vattr.va_uid = 0; + vattr.va_gid = 0; + + vsec.vsa_mask = VSA_ACE; + vsec.vsa_aclentp = &full_access; + vsec.vsa_aclentsz = sizeof (full_access); + vsec.vsa_aclcnt = 1; + + error = VOP_CREATE(ZTOV(sharedir), zc->zc_string, + &vattr, EXCL, 0, &resourcevp, kcred, 0, NULL, &vsec); + if (resourcevp) + VN_RELE(resourcevp); + break; + + case ZFS_SMB_ACL_REMOVE: + error = VOP_REMOVE(ZTOV(sharedir), zc->zc_string, kcred, + NULL, 0); + break; + + case ZFS_SMB_ACL_RENAME: + if ((error = get_nvlist(zc->zc_nvlist_src, + zc->zc_nvlist_src_size, zc->zc_iflags, &nvlist)) != 0) { + VN_RELE(vp); + ZFS_EXIT(zfsvfs); + return (error); + } + if (nvlist_lookup_string(nvlist, ZFS_SMB_ACL_SRC, &src) || + nvlist_lookup_string(nvlist, ZFS_SMB_ACL_TARGET, + &target)) { + VN_RELE(vp); + VN_RELE(ZTOV(sharedir)); + ZFS_EXIT(zfsvfs); + return (error); + } + error = VOP_RENAME(ZTOV(sharedir), src, ZTOV(sharedir), target, + kcred, NULL, 0); + nvlist_free(nvlist); + break; + + case ZFS_SMB_ACL_PURGE: + error = zfs_smb_acl_purge(sharedir); + break; + + default: + error = EINVAL; + break; + } + + VN_RELE(vp); + VN_RELE(ZTOV(sharedir)); + + ZFS_EXIT(zfsvfs); + + return (error); +} + /* * pool create, destroy, and export don't log the history as part of * zfsdev_ioctl, but rather zfs_ioc_pool_create, and zfs_ioc_pool_export * do the logging of those commands. */ static zfs_ioc_vec_t zfs_ioc_vec[] = { - { zfs_ioc_pool_create, zfs_secpolicy_config, POOL_NAME, B_FALSE }, - { zfs_ioc_pool_destroy, zfs_secpolicy_config, POOL_NAME, B_FALSE }, - { zfs_ioc_pool_import, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_pool_export, zfs_secpolicy_config, POOL_NAME, B_FALSE }, - { zfs_ioc_pool_configs, zfs_secpolicy_none, NO_NAME, B_FALSE }, - { zfs_ioc_pool_stats, zfs_secpolicy_read, POOL_NAME, B_FALSE }, - { zfs_ioc_pool_tryimport, zfs_secpolicy_config, NO_NAME, B_FALSE }, - { zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE }, - { zfs_ioc_pool_upgrade, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_pool_get_history, zfs_secpolicy_config, POOL_NAME, B_FALSE }, - { zfs_ioc_vdev_add, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_vdev_remove, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_vdev_set_state, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_vdev_attach, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_vdev_detach, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_vdev_setpath, zfs_secpolicy_config, POOL_NAME, B_FALSE }, - { zfs_ioc_objset_stats, zfs_secpolicy_read, DATASET_NAME, B_FALSE }, - { zfs_ioc_objset_zplprops, zfs_secpolicy_read, DATASET_NAME, B_FALSE }, - { zfs_ioc_dataset_list_next, zfs_secpolicy_read, - DATASET_NAME, B_FALSE }, - { zfs_ioc_snapshot_list_next, zfs_secpolicy_read, - DATASET_NAME, B_FALSE }, - { zfs_ioc_set_prop, zfs_secpolicy_none, DATASET_NAME, B_TRUE }, - { zfs_ioc_create_minor, zfs_secpolicy_minor, DATASET_NAME, B_FALSE }, - { zfs_ioc_remove_minor, zfs_secpolicy_minor, DATASET_NAME, B_FALSE }, - { zfs_ioc_create, zfs_secpolicy_create, DATASET_NAME, B_TRUE }, - { zfs_ioc_destroy, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE }, - { zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, B_TRUE }, - { zfs_ioc_rename, zfs_secpolicy_rename, DATASET_NAME, B_TRUE }, - { zfs_ioc_recv, zfs_secpolicy_receive, DATASET_NAME, B_TRUE }, - { zfs_ioc_send, zfs_secpolicy_send, DATASET_NAME, B_TRUE }, - { zfs_ioc_inject_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE }, - { zfs_ioc_clear_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE }, - { zfs_ioc_inject_list_next, zfs_secpolicy_inject, NO_NAME, B_FALSE }, - { zfs_ioc_error_log, zfs_secpolicy_inject, POOL_NAME, B_FALSE }, - { zfs_ioc_clear, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_promote, zfs_secpolicy_promote, DATASET_NAME, B_TRUE }, - { zfs_ioc_destroy_snaps, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE }, - { zfs_ioc_snapshot, zfs_secpolicy_snapshot, DATASET_NAME, B_TRUE }, - { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_config, POOL_NAME, B_FALSE }, - { zfs_ioc_obj_to_path, zfs_secpolicy_config, NO_NAME, B_FALSE }, - { zfs_ioc_pool_set_props, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_pool_get_props, zfs_secpolicy_read, POOL_NAME, B_FALSE }, - { zfs_ioc_set_fsacl, zfs_secpolicy_fsacl, DATASET_NAME, B_TRUE }, - { zfs_ioc_get_fsacl, zfs_secpolicy_read, DATASET_NAME, B_FALSE }, - { zfs_ioc_iscsi_perm_check, zfs_secpolicy_iscsi, - DATASET_NAME, B_FALSE }, - { zfs_ioc_share, zfs_secpolicy_share, DATASET_NAME, B_FALSE }, - { zfs_ioc_inherit_prop, zfs_secpolicy_inherit, DATASET_NAME, B_TRUE }, + { zfs_ioc_pool_create, zfs_secpolicy_config, POOL_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_pool_destroy, zfs_secpolicy_config, POOL_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_pool_import, zfs_secpolicy_config, POOL_NAME, B_TRUE, + B_FALSE }, + { zfs_ioc_pool_export, zfs_secpolicy_config, POOL_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_pool_configs, zfs_secpolicy_none, NO_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_pool_stats, zfs_secpolicy_read, POOL_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_pool_tryimport, zfs_secpolicy_config, NO_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_pool_upgrade, zfs_secpolicy_config, POOL_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_pool_get_history, zfs_secpolicy_config, POOL_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_vdev_add, zfs_secpolicy_config, POOL_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_vdev_remove, zfs_secpolicy_config, POOL_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_vdev_set_state, zfs_secpolicy_config, POOL_NAME, B_TRUE, + B_FALSE }, + { zfs_ioc_vdev_attach, zfs_secpolicy_config, POOL_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_vdev_detach, zfs_secpolicy_config, POOL_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_vdev_setpath, zfs_secpolicy_config, POOL_NAME, B_FALSE, + B_TRUE }, + { zfs_ioc_vdev_setfru, zfs_secpolicy_config, POOL_NAME, B_FALSE, + B_TRUE }, + { zfs_ioc_objset_stats, zfs_secpolicy_read, DATASET_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_objset_zplprops, zfs_secpolicy_read, DATASET_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_dataset_list_next, zfs_secpolicy_read, DATASET_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_snapshot_list_next, zfs_secpolicy_read, DATASET_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_set_prop, zfs_secpolicy_none, DATASET_NAME, B_TRUE, B_TRUE }, + { zfs_ioc_create_minor, zfs_secpolicy_minor, DATASET_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_remove_minor, zfs_secpolicy_minor, DATASET_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_create, zfs_secpolicy_create, DATASET_NAME, B_TRUE, B_TRUE }, + { zfs_ioc_destroy, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE, + B_TRUE}, + { zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_rename, zfs_secpolicy_rename, DATASET_NAME, B_TRUE, B_TRUE }, + { zfs_ioc_recv, zfs_secpolicy_receive, DATASET_NAME, B_TRUE, B_TRUE }, + { zfs_ioc_send, zfs_secpolicy_send, DATASET_NAME, B_TRUE, B_FALSE }, + { zfs_ioc_inject_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_clear_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_inject_list_next, zfs_secpolicy_inject, NO_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_error_log, zfs_secpolicy_inject, POOL_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_clear, zfs_secpolicy_config, POOL_NAME, B_TRUE, B_FALSE }, + { zfs_ioc_promote, zfs_secpolicy_promote, DATASET_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_destroy_snaps, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_snapshot, zfs_secpolicy_snapshot, DATASET_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_config, POOL_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_obj_to_path, zfs_secpolicy_config, NO_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_pool_set_props, zfs_secpolicy_config, POOL_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_pool_get_props, zfs_secpolicy_read, POOL_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_set_fsacl, zfs_secpolicy_fsacl, DATASET_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_get_fsacl, zfs_secpolicy_read, DATASET_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_iscsi_perm_check, zfs_secpolicy_iscsi, DATASET_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_share, zfs_secpolicy_share, DATASET_NAME, B_FALSE, B_FALSE }, + { zfs_ioc_inherit_prop, zfs_secpolicy_inherit, DATASET_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_smb_acl, zfs_secpolicy_smb_acl, DATASET_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_userspace_one, zfs_secpolicy_userspace_one, + DATASET_NAME, B_FALSE, B_FALSE }, + { zfs_ioc_userspace_many, zfs_secpolicy_userspace_many, + DATASET_NAME, B_FALSE, B_FALSE }, + { zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade, + DATASET_NAME, B_FALSE, B_TRUE }, }; +int +pool_status_check(const char *name, zfs_ioc_namecheck_t type) +{ + spa_t *spa; + int error; + + ASSERT(type == POOL_NAME || type == DATASET_NAME); + + error = spa_open(name, &spa, FTAG); + if (error == 0) { + if (spa_suspended(spa)) + error = EAGAIN; + spa_close(spa, FTAG); + } + return (error); +} + static int zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) { @@ -3000,7 +3571,7 @@ zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); - error = xcopyin((void *)arg, zc, sizeof (zfs_cmd_t)); + error = ddi_copyin((void *)arg, zc, sizeof (zfs_cmd_t), flag); if (error == 0) error = zfs_ioc_vec[vec].zvec_secpolicy(zc, cr); @@ -3011,15 +3582,22 @@ zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) */ if (error == 0) { zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; + zc->zc_iflags = flag & FKIOCTL; switch (zfs_ioc_vec[vec].zvec_namecheck) { case POOL_NAME: if (pool_namecheck(zc->zc_name, NULL, NULL) != 0) error = EINVAL; + if (zfs_ioc_vec[vec].zvec_pool_check) + error = pool_status_check(zc->zc_name, + zfs_ioc_vec[vec].zvec_namecheck); break; case DATASET_NAME: if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0) error = EINVAL; + if (zfs_ioc_vec[vec].zvec_pool_check) + error = pool_status_check(zc->zc_name, + zfs_ioc_vec[vec].zvec_namecheck); break; case NO_NAME: @@ -3030,10 +3608,10 @@ zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) if (error == 0) error = zfs_ioc_vec[vec].zvec_func(zc); - rc = xcopyout(zc, (void *)arg, sizeof (zfs_cmd_t)); + rc = ddi_copyout(zc, (void *)arg, sizeof (zfs_cmd_t), flag); if (error == 0) { error = rc; - if (zfs_ioc_vec[vec].zvec_his_log == B_TRUE) + if (zfs_ioc_vec[vec].zvec_his_log) zfs_log_history(zc); } diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c index 84d64b4df..3f0b6b0ed 100644 --- a/module/zfs/zfs_log.c +++ b/module/zfs/zfs_log.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -467,9 +467,6 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, */ ssize_t zfs_immediate_write_sz = 32768; -#define ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_trailer_t) - \ - sizeof (lr_write_t)) - void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, offset_t off, ssize_t resid, int ioflag) @@ -483,29 +480,6 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */ - /* - * Writes are handled in three different ways: - * - * WR_INDIRECT: - * In this mode, if we need to commit the write later, then the block - * is immediately written into the file system (using dmu_sync), - * and a pointer to the block is put into the log record. - * When the txg commits the block is linked in. - * This saves additionally writing the data into the log record. - * There are a few requirements for this to occur: - * - write is greater than zfs_immediate_write_sz - * - not using slogs (as slogs are assumed to always be faster - * than writing into the main pool) - * - the write occupies only one block - * WR_COPIED: - * If we know we'll immediately be committing the - * transaction (FSYNC or FDSYNC), the we allocate a larger - * log record here for the data and copy the data in. - * WR_NEED_COPY: - * Otherwise we don't allocate a buffer, and *if* we need to - * flush the write later then a buffer is allocated and - * we retrieve the data using the dmu. - */ slogging = spa_has_slogs(zilog->zl_spa); if (resid > zfs_immediate_write_sz && !slogging && resid <= zp->z_blksz) write_state = WR_INDIRECT; @@ -535,7 +509,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, (write_state == WR_COPIED ? len : 0)); lr = (lr_write_t *)&itx->itx_lr; if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os, - zp->z_id, off, len, lr + 1) != 0) { + zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) { kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen); itx = zil_itx_create(txtype, sizeof (*lr)); diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c index 1bf1bc527..8a859b575 100644 --- a/module/zfs/zfs_vfsops.c +++ b/module/zfs/zfs_vfsops.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -67,6 +67,8 @@ static major_t zfs_major; static minor_t zfs_minor; static kmutex_t zfs_dev_mtx; +extern int sys_shutdown; + static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr); static int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr); static int zfs_mountroot(vfs_t *vfsp, enum whymountroot); @@ -145,12 +147,24 @@ zfs_sync(vfs_t *vfsp, short flag, cred_t *cr) * Sync a specific filesystem. */ zfsvfs_t *zfsvfs = vfsp->vfs_data; + dsl_pool_t *dp; ZFS_ENTER(zfsvfs); + dp = dmu_objset_pool(zfsvfs->z_os); + + /* + * If the system is shutting down, then skip any + * filesystems which may exist on a suspended pool. + */ + if (sys_shutdown && spa_suspended(dp->dp_spa)) { + ZFS_EXIT(zfsvfs); + return (0); + } + if (zfsvfs->z_log != NULL) zil_commit(zfsvfs->z_log, UINT64_MAX, 0); else - txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); + txg_wait_synced(dp, 0); ZFS_EXIT(zfsvfs); } else { /* @@ -554,6 +568,393 @@ unregister: } +static void +uidacct(objset_t *os, boolean_t isgroup, uint64_t fuid, + int64_t delta, dmu_tx_t *tx) +{ + uint64_t used = 0; + char buf[32]; + int err; + uint64_t obj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; + + if (delta == 0) + return; + + (void) snprintf(buf, sizeof (buf), "%llx", (longlong_t)fuid); + err = zap_lookup(os, obj, buf, 8, 1, &used); + ASSERT(err == 0 || err == ENOENT); + /* no underflow/overflow */ + ASSERT(delta > 0 || used >= -delta); + ASSERT(delta < 0 || used + delta > used); + used += delta; + if (used == 0) + err = zap_remove(os, obj, buf, tx); + else + err = zap_update(os, obj, buf, 8, 1, &used, tx); + ASSERT(err == 0); +} + +static void +zfs_space_delta_cb(objset_t *os, dmu_object_type_t bonustype, + void *oldbonus, void *newbonus, + uint64_t oldused, uint64_t newused, dmu_tx_t *tx) +{ + znode_phys_t *oldznp = oldbonus; + znode_phys_t *newznp = newbonus; + + if (bonustype != DMU_OT_ZNODE) + return; + + /* We charge 512 for the dnode (if it's allocated). */ + if (oldznp->zp_gen != 0) + oldused += DNODE_SIZE; + if (newznp->zp_gen != 0) + newused += DNODE_SIZE; + + if (oldznp->zp_uid == newznp->zp_uid) { + uidacct(os, B_FALSE, oldznp->zp_uid, newused-oldused, tx); + } else { + uidacct(os, B_FALSE, oldznp->zp_uid, -oldused, tx); + uidacct(os, B_FALSE, newznp->zp_uid, newused, tx); + } + + if (oldznp->zp_gid == newznp->zp_gid) { + uidacct(os, B_TRUE, oldznp->zp_gid, newused-oldused, tx); + } else { + uidacct(os, B_TRUE, oldznp->zp_gid, -oldused, tx); + uidacct(os, B_TRUE, newznp->zp_gid, newused, tx); + } +} + +static void +fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr, + char *domainbuf, int buflen, uid_t *ridp) +{ + extern uint64_t strtonum(const char *str, char **nptr); + uint64_t fuid; + const char *domain; + + fuid = strtonum(fuidstr, NULL); + + domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid)); + if (domain) + (void) strlcpy(domainbuf, domain, buflen); + else + domainbuf[0] = '\0'; + *ridp = FUID_RID(fuid); +} + +static uint64_t +zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type) +{ + switch (type) { + case ZFS_PROP_USERUSED: + return (DMU_USERUSED_OBJECT); + case ZFS_PROP_GROUPUSED: + return (DMU_GROUPUSED_OBJECT); + case ZFS_PROP_USERQUOTA: + return (zfsvfs->z_userquota_obj); + case ZFS_PROP_GROUPQUOTA: + return (zfsvfs->z_groupquota_obj); + } + return (0); +} + +int +zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + uint64_t *cookiep, void *vbuf, uint64_t *bufsizep) +{ + int error; + zap_cursor_t zc; + zap_attribute_t za; + zfs_useracct_t *buf = vbuf; + uint64_t obj; + + if (!dmu_objset_userspace_present(zfsvfs->z_os)) + return (ENOTSUP); + + obj = zfs_userquota_prop_to_obj(zfsvfs, type); + if (obj == 0) { + *bufsizep = 0; + return (0); + } + + for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep); + (error = zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) > + *bufsizep) + break; + + fuidstr_to_sid(zfsvfs, za.za_name, + buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid); + + buf->zu_space = za.za_first_integer; + buf++; + } + if (error == ENOENT) + error = 0; + + ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep); + *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf; + *cookiep = zap_cursor_serialize(&zc); + zap_cursor_fini(&zc); + return (error); +} + +/* + * buf must be big enough (eg, 32 bytes) + */ +static int +id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid, + char *buf, boolean_t addok) +{ + uint64_t fuid; + int domainid = 0; + + if (domain && domain[0]) { + domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok); + if (domainid == -1) + return (ENOENT); + } + fuid = FUID_ENCODE(domainid, rid); + (void) sprintf(buf, "%llx", (longlong_t)fuid); + return (0); +} + +int +zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + const char *domain, uint64_t rid, uint64_t *valp) +{ + char buf[32]; + int err; + uint64_t obj; + + *valp = 0; + + if (!dmu_objset_userspace_present(zfsvfs->z_os)) + return (ENOTSUP); + + obj = zfs_userquota_prop_to_obj(zfsvfs, type); + if (obj == 0) + return (0); + + err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE); + if (err) + return (err); + + err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp); + if (err == ENOENT) + err = 0; + return (err); +} + +int +zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + const char *domain, uint64_t rid, uint64_t quota) +{ + char buf[32]; + int err; + dmu_tx_t *tx; + uint64_t *objp; + boolean_t fuid_dirtied; + + if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA) + return (EINVAL); + + if (zfsvfs->z_version < ZPL_VERSION_USERSPACE) + return (ENOTSUP); + + objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj : + &zfsvfs->z_groupquota_obj; + + err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE); + if (err) + return (err); + fuid_dirtied = zfsvfs->z_fuid_dirty; + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL); + if (*objp == 0) { + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, + zfs_userquota_prop_prefixes[type]); + } + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + + mutex_enter(&zfsvfs->z_lock); + if (*objp == 0) { + *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA, + DMU_OT_NONE, 0, tx); + VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[type], 8, 1, objp, tx)); + } + mutex_exit(&zfsvfs->z_lock); + + if (quota == 0) { + err = zap_remove(zfsvfs->z_os, *objp, buf, tx); + if (err == ENOENT) + err = 0; + } else { + err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, "a, tx); + } + ASSERT(err == 0); + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + dmu_tx_commit(tx); + return (err); +} + +boolean_t +zfs_usergroup_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid) +{ + char buf[32]; + uint64_t used, quota, usedobj, quotaobj; + int err; + + usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; + quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; + + if (quotaobj == 0 || zfsvfs->z_replay) + return (B_FALSE); + + (void) sprintf(buf, "%llx", (longlong_t)fuid); + err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, "a); + if (err != 0) + return (B_FALSE); + + err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used); + if (err != 0) + return (B_FALSE); + return (used >= quota); +} + +int +zfsvfs_create(const char *osname, int mode, zfsvfs_t **zvp) +{ + objset_t *os; + zfsvfs_t *zfsvfs; + uint64_t zval; + int i, error; + + if (error = dsl_prop_get_integer(osname, "readonly", &zval, NULL)) + return (error); + if (zval) + mode |= DS_MODE_READONLY; + + error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &os); + if (error == EROFS) { + mode |= DS_MODE_READONLY; + error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &os); + } + if (error) + return (error); + + /* + * Initialize the zfs-specific filesystem structure. + * Should probably make this a kmem cache, shuffle fields, + * and just bzero up to z_hold_mtx[]. + */ + zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); + zfsvfs->z_vfs = NULL; + zfsvfs->z_parent = zfsvfs; + zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE; + zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; + zfsvfs->z_os = os; + + error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); + if (error) { + goto out; + } else if (zfsvfs->z_version > ZPL_VERSION) { + (void) printf("Mismatched versions: File system " + "is version %llu on-disk format, which is " + "incompatible with this software version %lld!", + (u_longlong_t)zfsvfs->z_version, ZPL_VERSION); + error = ENOTSUP; + goto out; + } + + if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0) + goto out; + zfsvfs->z_norm = (int)zval; + + if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0) + goto out; + zfsvfs->z_utf8 = (zval != 0); + + if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0) + goto out; + zfsvfs->z_case = (uint_t)zval; + + /* + * Fold case on file systems that are always or sometimes case + * insensitive. + */ + if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || + zfsvfs->z_case == ZFS_CASE_MIXED) + zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; + + zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, + &zfsvfs->z_root); + if (error) + goto out; + ASSERT(zfsvfs->z_root != 0); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, + &zfsvfs->z_unlinkedobj); + if (error) + goto out; + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], + 8, 1, &zfsvfs->z_userquota_obj); + if (error && error != ENOENT) + goto out; + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], + 8, 1, &zfsvfs->z_groupquota_obj); + if (error && error != ENOENT) + goto out; + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, + &zfsvfs->z_fuid_obj); + if (error && error != ENOENT) + goto out; + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, + &zfsvfs->z_shares_dir); + if (error && error != ENOENT) + goto out; + + mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&zfsvfs->z_online_recv_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), + offsetof(znode_t, z_link_node)); + rrw_init(&zfsvfs->z_teardown_lock); + rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); + rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); + for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); + + *zvp = zfsvfs; + return (0); + +out: + dmu_objset_close(os); + *zvp = NULL; + kmem_free(zfsvfs, sizeof (zfsvfs_t)); + return (error); +} + static int zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) { @@ -570,6 +971,12 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) dmu_objset_set_user(zfsvfs->z_os, zfsvfs); mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock); + zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); + if (zil_disable) { + zil_destroy(zfsvfs->z_log, 0); + zfsvfs->z_log = NULL; + } + /* * If we are not mounting (ie: online recv), then we don't * have to worry about replaying the log as we blocked all @@ -588,11 +995,7 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) else zfs_unlinked_drain(zfsvfs); - zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); - if (zil_disable) { - zil_destroy(zfsvfs->z_log, 0); - zfsvfs->z_log = NULL; - } else { + if (zfsvfs->z_log) { /* * Parse and replay the intent log. * @@ -630,49 +1033,63 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) return (0); } -static void -zfs_freezfsvfs(zfsvfs_t *zfsvfs) +void +zfsvfs_free(zfsvfs_t *zfsvfs) { + int i; + extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */ + + /* + * This is a barrier to prevent the filesystem from going away in + * zfs_znode_move() until we can safely ensure that the filesystem is + * not unmounted. We consider the filesystem valid before the barrier + * and invalid after the barrier. + */ + rw_enter(&zfsvfs_lock, RW_READER); + rw_exit(&zfsvfs_lock); + + zfs_fuid_destroy(zfsvfs); + mutex_destroy(&zfsvfs->z_znodes_lock); mutex_destroy(&zfsvfs->z_online_recv_lock); + mutex_destroy(&zfsvfs->z_lock); list_destroy(&zfsvfs->z_all_znodes); rrw_destroy(&zfsvfs->z_teardown_lock); rw_destroy(&zfsvfs->z_teardown_inactive_lock); rw_destroy(&zfsvfs->z_fuid_lock); + for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_destroy(&zfsvfs->z_hold_mtx[i]); kmem_free(zfsvfs, sizeof (zfsvfs_t)); } +static void +zfs_set_fuid_feature(zfsvfs_t *zfsvfs) +{ + zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); + if (zfsvfs->z_use_fuids && zfsvfs->z_vfs) { + vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); + } +} + static int zfs_domount(vfs_t *vfsp, char *osname) { dev_t mount_dev; - uint64_t recordsize, readonly; + uint64_t recordsize, fsid_guid; int error = 0; - int mode; zfsvfs_t *zfsvfs; - znode_t *zp = NULL; ASSERT(vfsp); ASSERT(osname); - /* - * Initialize the zfs-specific filesystem structure. - * Should probably make this a kmem cache, shuffle fields, - * and just bzero up to z_hold_mtx[]. - */ - zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); + error = zfsvfs_create(osname, DS_MODE_OWNER, &zfsvfs); + if (error) + return (error); zfsvfs->z_vfs = vfsp; - zfsvfs->z_parent = zfsvfs; - zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE; - zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; - - mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&zfsvfs->z_online_recv_lock, NULL, MUTEX_DEFAULT, NULL); - list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), - offsetof(znode_t, z_link_node)); - rrw_init(&zfsvfs->z_teardown_lock); - rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); - rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); /* Initialize the generic filesystem structure. */ vfsp->vfs_bcount = 0; @@ -694,39 +1111,24 @@ zfs_domount(vfs_t *vfsp, char *osname) vfsp->vfs_flag |= VFS_NOTRUNC; vfsp->vfs_data = zfsvfs; - if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL)) - goto out; - - mode = DS_MODE_OWNER; - if (readonly) - mode |= DS_MODE_READONLY; - - error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os); - if (error == EROFS) { - mode = DS_MODE_OWNER | DS_MODE_READONLY; - error = dmu_objset_open(osname, DMU_OST_ZFS, mode, - &zfsvfs->z_os); - } - - if (error) - goto out; - - if (error = zfs_init_fs(zfsvfs, &zp)) - goto out; - - /* The call to zfs_init_fs leaves the vnode held, release it here. */ - VN_RELE(ZTOV(zp)); + /* + * The fsid is 64 bits, composed of an 8-bit fs type, which + * separates our fsid from any other filesystem types, and a + * 56-bit objset unique ID. The objset unique ID is unique to + * all objsets open on this system, provided by unique_create(). + * The 8-bit fs type must be put in the low bits of fsid[1] + * because that's where other Solaris filesystems put it. + */ + fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os); + ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); + vfsp->vfs_fsid.val[0] = fsid_guid; + vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) | + zfsfstype & 0xFF; /* * Set features for file system. */ - zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); - if (zfsvfs->z_use_fuids) { - vfs_set_feature(vfsp, VFSFT_XVATTR); - vfs_set_feature(vfsp, VFSFT_SYSATTR_VIEWS); - vfs_set_feature(vfsp, VFSFT_ACEMASKONACCESS); - vfs_set_feature(vfsp, VFSFT_ACLONCREATE); - } + zfs_set_fuid_feature(zfsvfs); if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); @@ -739,13 +1141,16 @@ zfs_domount(vfs_t *vfsp, char *osname) if (dmu_objset_is_snapshot(zfsvfs->z_os)) { uint64_t pval; - ASSERT(mode & DS_MODE_READONLY); atime_changed_cb(zfsvfs, B_FALSE); readonly_changed_cb(zfsvfs, B_TRUE); if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL)) goto out; xattr_changed_cb(zfsvfs, pval); zfsvfs->z_issnap = B_TRUE; + + mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock); + dmu_objset_set_user(zfsvfs->z_os, zfsvfs); + mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock); } else { error = zfsvfs_setup(zfsvfs, B_TRUE); } @@ -754,9 +1159,8 @@ zfs_domount(vfs_t *vfsp, char *osname) zfsctl_create(zfsvfs); out: if (error) { - if (zfsvfs->z_os) - dmu_objset_close(zfsvfs->z_os); - zfs_freezfsvfs(zfsvfs); + dmu_objset_close(zfsvfs->z_os); + zfsvfs_free(zfsvfs); } else { atomic_add_32(&zfs_active_fs_count, 1); } @@ -1067,6 +1471,13 @@ zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) error = zfs_domount(vfsp, osname); + /* + * Add an extra VFS_HOLD on our parent vfs so that it can't + * disappear due to a forced unmount. + */ + if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap) + VFS_HOLD(mvp->v_vfsp); + out: pn_free(&spn); return (error); @@ -1426,15 +1837,16 @@ zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp) * 'z_teardown_inactive_lock' write held. */ int -zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *mode) +zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *modep) { int error; if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) return (error); - *mode = zfsvfs->z_os->os_mode; - dmu_objset_name(zfsvfs->z_os, name); + *modep = zfsvfs->z_os->os_mode; + if (name) + dmu_objset_name(zfsvfs->z_os, name); dmu_objset_close(zfsvfs->z_os); return (0); @@ -1493,13 +1905,15 @@ static void zfs_freevfs(vfs_t *vfsp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; - int i; - for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) - mutex_destroy(&zfsvfs->z_hold_mtx[i]); + /* + * If this is a snapshot, we have an extra VFS_HOLD on our parent + * from zfs_mount(). Release it here. + */ + if (zfsvfs->z_issnap) + VFS_RELE(zfsvfs->z_parent->z_vfs); - zfs_fuid_destroy(zfsvfs); - zfs_freezfsvfs(zfsvfs); + zfsvfs_free(zfsvfs); atomic_add_32(&zfs_active_fs_count, -1); } @@ -1558,6 +1972,8 @@ zfs_init(void) * Initialize znode cache, vnode ops, etc... */ zfs_znode_init(); + + dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb); } void @@ -1574,54 +1990,46 @@ zfs_busy(void) } int -zfs_set_version(const char *name, uint64_t newvers) +zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) { int error; - objset_t *os; + objset_t *os = zfsvfs->z_os; dmu_tx_t *tx; - uint64_t curvers; - - /* - * XXX for now, require that the filesystem be unmounted. Would - * be nice to find the zfsvfs_t and just update that if - * possible. - */ if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) return (EINVAL); - error = dmu_objset_open(name, DMU_OST_ZFS, DS_MODE_OWNER, &os); - if (error) - return (error); - - error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, - 8, 1, &curvers); - if (error) - goto out; - if (newvers < curvers) { - error = EINVAL; - goto out; - } + if (newvers < zfsvfs->z_version) + return (EINVAL); tx = dmu_tx_create(os); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, 0, ZPL_VERSION_STR); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); - goto out; + return (error); + } + error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, + 8, 1, &newvers, tx); + + if (error) { + dmu_tx_commit(tx); + return (error); } - error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1, - &newvers, tx); spa_history_internal_log(LOG_DS_UPGRADE, dmu_objset_spa(os), tx, CRED(), - "oldver=%llu newver=%llu dataset = %llu", curvers, newvers, - dmu_objset_id(os)); + "oldver=%llu newver=%llu dataset = %llu", + zfsvfs->z_version, newvers, dmu_objset_id(os)); + dmu_tx_commit(tx); -out: - dmu_objset_close(os); - return (error); + zfsvfs->z_version = newvers; + + if (zfsvfs->z_version >= ZPL_VERSION_FUID) + zfs_set_fuid_feature(zfsvfs); + + return (0); } /* diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index e38abb28c..88d4e52c1 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -101,6 +101,7 @@ * pushing cached pages (which acquires range locks) and syncing out * cached atime changes. Third, zfs_zinactive() may require a new tx, * which could deadlock the system if you were already holding one. + * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC(). * * (3) All range locks must be grabbed before calling dmu_tx_assign(), * as they can span dmu_tx_assign() calls. @@ -363,7 +364,8 @@ update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid) caddr_t va; va = zfs_map_page(pp, S_WRITE); - (void) dmu_read(os, oid, start+off, nbytes, va+off); + (void) dmu_read(os, oid, start+off, nbytes, va+off, + DMU_READ_PREFETCH); zfs_unmap_page(pp, va); page_unlock(pp); } @@ -567,6 +569,7 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) int max_blksz = zfsvfs->z_max_blksz; uint64_t pflags; int error; + arc_buf_t *abuf; /* * Fasttrack empty write @@ -663,10 +666,46 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) * and allows us to do more fine-grained space accounting. */ while (n > 0) { + abuf = NULL; + woff = uio->uio_loffset; + +again: + if (zfs_usergroup_overquota(zfsvfs, + B_FALSE, zp->z_phys->zp_uid) || + zfs_usergroup_overquota(zfsvfs, + B_TRUE, zp->z_phys->zp_gid)) { + if (abuf != NULL) + dmu_return_arcbuf(abuf); + error = EDQUOT; + break; + } + + /* + * If dmu_assign_arcbuf() is expected to execute with minimum + * overhead loan an arc buffer and copy user data to it before + * we enter a txg. This avoids holding a txg forever while we + * pagefault on a hanging NFS server mapping. + */ + if (abuf == NULL && n >= max_blksz && + woff >= zp->z_phys->zp_size && + P2PHASE(woff, max_blksz) == 0 && + zp->z_blksz == max_blksz) { + size_t cbytes; + + abuf = dmu_request_arcbuf(zp->z_dbuf, max_blksz); + ASSERT(abuf != NULL); + ASSERT(arc_buf_size(abuf) == max_blksz); + if (error = uiocopy(abuf->b_data, max_blksz, + UIO_WRITE, uio, &cbytes)) { + dmu_return_arcbuf(abuf); + break; + } + ASSERT(cbytes == max_blksz); + } + /* * Start a transaction. */ - woff = uio->uio_loffset; tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, zp->z_id); dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); @@ -675,9 +714,11 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); - continue; + goto again; } dmu_tx_abort(tx); + if (abuf != NULL) + dmu_return_arcbuf(abuf); break; } @@ -706,12 +747,22 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) */ nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); - tx_bytes = uio->uio_resid; - error = dmu_write_uio(zfsvfs->z_os, zp->z_id, uio, nbytes, tx); - tx_bytes -= uio->uio_resid; - if (tx_bytes && vn_has_cached_data(vp)) + if (abuf == NULL) { + tx_bytes = uio->uio_resid; + error = dmu_write_uio(zfsvfs->z_os, zp->z_id, uio, + nbytes, tx); + tx_bytes -= uio->uio_resid; + } else { + tx_bytes = nbytes; + ASSERT(tx_bytes == max_blksz); + dmu_assign_arcbuf(zp->z_dbuf, woff, abuf, tx); + ASSERT(tx_bytes <= uio->uio_resid); + uioskip(uio, tx_bytes); + } + if (tx_bytes && vn_has_cached_data(vp)) { update_pages(vp, woff, tx_bytes, zfsvfs->z_os, zp->z_id); + } /* * If we made no progress, we're done. If we made even @@ -791,10 +842,15 @@ zfs_get_done(dmu_buf_t *db, void *vzgd) zgd_t *zgd = (zgd_t *)vzgd; rl_t *rl = zgd->zgd_rl; vnode_t *vp = ZTOV(rl->r_zp); + objset_t *os = rl->r_zp->z_zfsvfs->z_os; dmu_buf_rele(db, vzgd); zfs_range_unlock(rl); - VN_RELE(vp); + /* + * Release the vnode asynchronously as we currently have the + * txg stopped from syncing. + */ + VN_RELE_ASYNC(vp, dsl_pool_vnrele_taskq(dmu_objset_pool(os))); zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); kmem_free(zgd, sizeof (zgd_t)); } @@ -824,7 +880,12 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0) return (ENOENT); if (zp->z_unlinked) { - VN_RELE(ZTOV(zp)); + /* + * Release the vnode asynchronously as we currently have the + * txg stopped from syncing. + */ + VN_RELE_ASYNC(ZTOV(zp), + dsl_pool_vnrele_taskq(dmu_objset_pool(os))); return (ENOENT); } @@ -842,7 +903,8 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) error = ENOENT; goto out; } - VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf)); + VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf, + DMU_READ_NO_PREFETCH)); } else { /* indirect write */ uint64_t boff; /* block starting offset */ @@ -896,7 +958,11 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) } out: zfs_range_unlock(rl); - VN_RELE(ZTOV(zp)); + /* + * Release the vnode asynchronously as we currently have the + * txg stopped from syncing. + */ + VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os))); return (error); } @@ -1074,11 +1140,11 @@ zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl, zfs_dirlock_t *dl; dmu_tx_t *tx; int error; - zfs_acl_t *aclp = NULL; - zfs_fuid_info_t *fuidp = NULL; ksid_t *ksid; uid_t uid; gid_t gid = crgetgid(cr); + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; /* * If we have an ephemeral id, ACL, or XVATTR then @@ -1141,21 +1207,9 @@ top: if (strcmp(name, "..") == 0) error = EISDIR; ZFS_EXIT(zfsvfs); - if (aclp) - zfs_acl_free(aclp); return (error); } } - if (vsecp && aclp == NULL) { - error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp); - if (error) { - ZFS_EXIT(zfsvfs); - if (dl) - zfs_dirent_unlock(dl); - return (error); - } - } - if (zp == NULL) { uint64_t txtype; @@ -1177,30 +1231,28 @@ top: goto out; } + if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, + &acl_ids)) != 0) + goto out; + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { + error = EDQUOT; + goto out; + } + tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(uid) || - IS_EPHEMERAL(gid)) { - if (zfsvfs->z_fuid_obj == 0) { - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, - FALSE, NULL); - } else { - dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); - dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - } - } + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); dmu_tx_hold_bonus(tx, dzp->z_id); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp) { + if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); } error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { + zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); if (error == ERESTART) { dmu_tx_wait(tx); @@ -1209,19 +1261,21 @@ top: } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); - if (aclp) - zfs_acl_free(aclp); return (error); } - zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp); + zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids); + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + (void) zfs_link_create(dl, zp, tx, ZNEW); + txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); if (flag & FIGNORECASE) txtype |= TX_CI; zfs_log_create(zilog, tx, txtype, dzp, zp, name, - vsecp, fuidp, vap); - if (fuidp) - zfs_fuid_info_free(fuidp); + vsecp, acl_ids.z_fuidp, vap); + zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); } else { int aflags = (flag & FAPPEND) ? V_APPEND : 0; @@ -1292,8 +1346,6 @@ out: *vpp = svp; } } - if (aclp) - zfs_acl_free(aclp); ZFS_EXIT(zfsvfs); return (error); @@ -1528,12 +1580,12 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr, uint64_t txtype; dmu_tx_t *tx; int error; - zfs_acl_t *aclp = NULL; - zfs_fuid_info_t *fuidp = NULL; int zf = ZNEW; ksid_t *ksid; uid_t uid; gid_t gid = crgetgid(cr); + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; ASSERT(vap->va_type == VDIR); @@ -1594,38 +1646,33 @@ top: return (error); } - if (vsecp && aclp == NULL) { - error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp); - if (error) { - zfs_dirent_unlock(dl); - ZFS_EXIT(zfsvfs); - return (error); - } + if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, + &acl_ids)) != 0) { + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (error); } + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (EDQUOT); + } + /* * Add a new entry to the directory. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); - if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(uid) || - IS_EPHEMERAL(gid)) { - if (zfsvfs->z_fuid_obj == 0) { - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); - } else { - dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); - dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - } - } - if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp) + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { + zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); if (error == ERESTART) { dmu_tx_wait(tx); @@ -1634,19 +1681,16 @@ top: } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); - if (aclp) - zfs_acl_free(aclp); return (error); } /* * Create new node. */ - zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp); - - if (aclp) - zfs_acl_free(aclp); + zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids); + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); /* * Now put new name in parent dir. */ @@ -1657,10 +1701,10 @@ top: txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); if (flags & FIGNORECASE) txtype |= TX_CI; - zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, fuidp, vap); + zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, + acl_ids.z_fuidp, vap); - if (fuidp) - zfs_fuid_info_free(fuidp); + zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); zfs_dirent_unlock(dl); @@ -1969,6 +2013,21 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, } } + if (flags & V_RDDIR_ACCFILTER) { + /* + * If we have no access at all, don't include + * this entry in the returned information + */ + znode_t *ezp; + if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0) + goto skip_entry; + if (!zfs_has_access(ezp, cr)) { + VN_RELE(ZTOV(ezp)); + goto skip_entry; + } + VN_RELE(ZTOV(ezp)); + } + if (flags & V_RDDIR_ENTFLAGS) reclen = EDIRENT_RECLEN(strlen(zap.za_name)); else @@ -2020,6 +2079,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, if (prefetch) dmu_prefetch(os, objnum, 0, 0); + skip_entry: /* * Move to the next entry, fill in the previous offset. */ @@ -2120,8 +2180,6 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, ZFS_VERIFY_ZP(zp); pzp = zp->z_phys; - mutex_enter(&zp->z_lock); - /* * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. * Also, if we are the owner don't bother, since owner should @@ -2131,7 +2189,6 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, (pzp->zp_uid != crgetuid(cr))) { if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, skipaclchk, cr)) { - mutex_exit(&zp->z_lock); ZFS_EXIT(zfsvfs); return (error); } @@ -2142,6 +2199,7 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, * than to determine whether we were asked the question. */ + mutex_enter(&zp->z_lock); vap->va_type = vp->v_type; vap->va_mode = pzp->zp_mode & MODEMASK; zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); @@ -2312,6 +2370,7 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, uint_t saved_mask; int trim_mask = 0; uint64_t new_mode; + uint64_t new_uid, new_gid; znode_t *attrzp; int need_policy = FALSE; int err; @@ -2320,6 +2379,7 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, xoptattr_t *xoap; zfs_acl_t *aclp = NULL; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; + boolean_t fuid_dirtied = B_FALSE; if (mask == 0) return (0); @@ -2610,30 +2670,14 @@ top: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, zp->z_id); - if (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || - ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid))) { - if (zfsvfs->z_fuid_obj == 0) { - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); - } else { - dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); - dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - } - } if (mask & AT_MODE) { uint64_t pmode = pzp->zp_mode; new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); - if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) { - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (err); - } + if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) + goto out; if (pzp->zp_acl.z_acl_extern_obj) { /* Are we upgrading ACL from old V0 format to new V1 */ if (zfsvfs->z_version <= ZPL_VERSION_FUID && @@ -2655,36 +2699,53 @@ top: } } - if ((mask & (AT_UID | AT_GID)) && pzp->zp_xattr != 0) { - err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp); - if (err) { - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - if (aclp) - zfs_acl_free(aclp); - return (err); + if (mask & (AT_UID | AT_GID)) { + if (pzp->zp_xattr) { + err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp); + if (err) + goto out; + dmu_tx_hold_bonus(tx, attrzp->z_id); + } + if (mask & AT_UID) { + new_uid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); + if (new_uid != pzp->zp_uid && + zfs_usergroup_overquota(zfsvfs, B_FALSE, new_uid)) { + err = EDQUOT; + goto out; + } + } + + if (mask & AT_GID) { + new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, + cr, ZFS_GROUP, &fuidp); + if (new_gid != pzp->zp_gid && + zfs_usergroup_overquota(zfsvfs, B_TRUE, new_gid)) { + err = EDQUOT; + goto out; + } + } + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) { + if (zfsvfs->z_fuid_obj == 0) { + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, + FALSE, NULL); + } else { + dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); + dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + } } - dmu_tx_hold_bonus(tx, attrzp->z_id); } err = dmu_tx_assign(tx, TXG_NOWAIT); if (err) { - if (attrzp) - VN_RELE(ZTOV(attrzp)); - - if (aclp) { - zfs_acl_free(aclp); - aclp = NULL; - } - - if (err == ERESTART) { + if (err == ERESTART) dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (err); + goto out; } dmu_buf_will_dirty(zp->z_dbuf, tx); @@ -2702,7 +2763,7 @@ top: if (mask & AT_MODE) { mutex_enter(&zp->z_acl_lock); zp->z_phys->zp_mode = new_mode; - err = zfs_aclset_common(zp, aclp, cr, &fuidp, tx); + err = zfs_aclset_common(zp, aclp, cr, tx); ASSERT3U(err, ==, 0); mutex_exit(&zp->z_acl_lock); } @@ -2711,25 +2772,17 @@ top: mutex_enter(&attrzp->z_lock); if (mask & AT_UID) { - pzp->zp_uid = zfs_fuid_create(zfsvfs, - vap->va_uid, cr, ZFS_OWNER, tx, &fuidp); - if (attrzp) { - attrzp->z_phys->zp_uid = zfs_fuid_create(zfsvfs, - vap->va_uid, cr, ZFS_OWNER, tx, &fuidp); - } + pzp->zp_uid = new_uid; + if (attrzp) + attrzp->z_phys->zp_uid = new_uid; } if (mask & AT_GID) { - pzp->zp_gid = zfs_fuid_create(zfsvfs, vap->va_gid, - cr, ZFS_GROUP, tx, &fuidp); + pzp->zp_gid = new_gid; if (attrzp) - attrzp->z_phys->zp_gid = zfs_fuid_create(zfsvfs, - vap->va_gid, cr, ZFS_GROUP, tx, &fuidp); + attrzp->z_phys->zp_gid = new_gid; } - if (aclp) - zfs_acl_free(aclp); - if (attrzp) mutex_exit(&attrzp->z_lock); @@ -2791,17 +2844,35 @@ top: zfs_xvattr_set(zp, xvap); } + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + if (mask != 0) zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); - if (fuidp) - zfs_fuid_info_free(fuidp); mutex_exit(&zp->z_lock); +out: if (attrzp) VN_RELE(ZTOV(attrzp)); - dmu_tx_commit(tx); + if (aclp) { + zfs_acl_free(aclp); + aclp = NULL; + } + + if (fuidp) { + zfs_fuid_info_free(fuidp); + fuidp = NULL; + } + + if (err) + dmu_tx_abort(tx); + else + dmu_tx_commit(tx); + + if (err == ERESTART) + goto top; ZFS_EXIT(zfsvfs); return (err); @@ -3232,7 +3303,8 @@ zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr, int len = strlen(link); int error; int zflg = ZNEW; - zfs_fuid_info_t *fuidp = NULL; + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; ASSERT(vap->va_type == VLNK); @@ -3267,26 +3339,25 @@ top: return (error); } + VERIFY(0 == zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids)); + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { + zfs_acl_ids_free(&acl_ids); + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (EDQUOT); + } tx = dmu_tx_create(zfsvfs->z_os); + fuid_dirtied = zfsvfs->z_fuid_dirty; dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); dmu_tx_hold_bonus(tx, dzp->z_id); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) + if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); - if (IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))) { - if (zfsvfs->z_fuid_obj == 0) { - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); - } else { - dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); - dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - } - } + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { + zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); if (error == ERESTART) { dmu_tx_wait(tx); @@ -3306,13 +3377,16 @@ top: * otherwise, store it just like any other file data. */ if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) { - zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, NULL, &fuidp); + zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, &acl_ids); if (len != 0) bcopy(link, zp->z_phys + 1, len); } else { dmu_buf_t *dbp; - zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, NULL, &fuidp); + zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids); + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); /* * Nothing can access the znode yet so no locking needed * for growing the znode's blocksize. @@ -3333,15 +3407,14 @@ top: * Insert the new object into the directory. */ (void) zfs_link_create(dl, zp, tx, ZNEW); -out: if (error == 0) { uint64_t txtype = TX_SYMLINK; if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); } - if (fuidp) - zfs_fuid_info_free(fuidp); + + zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); @@ -3618,6 +3691,12 @@ zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, pvn_write_done(trunc, flags); len = filesz - off; } + + if (zfs_usergroup_overquota(zfsvfs, B_FALSE, zp->z_phys->zp_uid) || + zfs_usergroup_overquota(zfsvfs, B_TRUE, zp->z_phys->zp_gid)) { + err = EDQUOT; + goto out; + } top: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_write(tx, zp->z_id, off, len); @@ -3705,7 +3784,7 @@ zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr, else io_off = 0; if (len > 0 && ISP2(blksz)) - io_len = P2ROUNDUP_TYPED(len + (io_off - off), blksz, size_t); + io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t); else io_len = 0; @@ -3869,7 +3948,8 @@ zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset, * If we can't find a page in the cache, we will create a new page * and fill it with file data. For efficiency, we may try to fill * multiple pages at once (klustering) to fill up the supplied page - * list. + * list. Note that the pages to be filled are held with an exclusive + * lock to prevent access by other threads while they are being filled. */ static int zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, @@ -3888,7 +3968,8 @@ zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, */ io_off = off; io_len = PAGESIZE; - pp = page_create_va(vp, io_off, io_len, PG_WAIT, seg, addr); + pp = page_create_va(vp, io_off, io_len, + PG_EXCL | PG_WAIT, seg, addr); } else { /* * Try to find enough pages to fill the page list @@ -3913,7 +3994,8 @@ zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, ASSERT3U(io_off, ==, cur_pp->p_offset); va = zfs_map_page(cur_pp, S_WRITE); - err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va); + err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va, + DMU_READ_PREFETCH); zfs_unmap_page(cur_pp, va); if (err) { /* On error, toss the entire kluster */ @@ -3991,7 +4073,7 @@ zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, *protp = PROT_ALL; /* - * Loop through the requested range [off, off + len] looking + * Loop through the requested range [off, off + len) looking * for pages. If we don't find a page, we will need to create * a new page and fill it with data from the file. */ @@ -4337,6 +4419,11 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, (vp->v_type == VREG || vp->v_type == VDIR); return (0); + case _PC_ACCESS_FILTERING: + *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) && + vp->v_type == VDIR; + return (0); + case _PC_ACL_ENABLED: *valp = _ACL_ACE_ENABLED; return (0); @@ -4489,6 +4576,22 @@ const fs_operation_def_t zfs_symvnodeops_template[] = { }; /* + * special share hidden files vnode operations template + */ +vnodeops_t *zfs_sharevnodeops; +const fs_operation_def_t zfs_sharevnodeops_template[] = { + VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, + VOPNAME_ACCESS, { .vop_access = zfs_access }, + VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, + VOPNAME_FID, { .vop_fid = zfs_fid }, + VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, + VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, + VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, + VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, + NULL, NULL +}; + +/* * Extended attribute directory vnode operations template * This template is identical to the directory vnodes * operation template except for restricted operations: diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c index 74983cdc5..8ced95174 100644 --- a/module/zfs/zfs_znode.c +++ b/module/zfs/zfs_znode.c @@ -87,6 +87,12 @@ * (such as VFS logic) that will not compile easily in userland. */ #ifdef _KERNEL +/* + * Needed to close a small window in zfs_znode_move() that allows the zfsvfs to + * be freed before it can be safely accessed. + */ +krwlock_t zfsvfs_lock; + static kmem_cache_t *znode_cache = NULL; /*ARGSUSED*/ @@ -154,8 +160,9 @@ zfs_znode_cache_destructor(void *buf, void *arg) #ifdef ZNODE_STATS static struct { uint64_t zms_zfsvfs_invalid; + uint64_t zms_zfsvfs_recheck1; uint64_t zms_zfsvfs_unmounted; - uint64_t zms_zfsvfs_recheck_invalid; + uint64_t zms_zfsvfs_recheck2; uint64_t zms_obj_held; uint64_t zms_vnode_locked; uint64_t zms_not_only_dnlc; @@ -206,17 +213,6 @@ zfs_znode_move_impl(znode_t *ozp, znode_t *nzp) POINTER_INVALIDATE(&ozp->z_zfsvfs); } -/* - * Wrapper function for ZFS_ENTER that returns 0 if successful and otherwise - * returns a non-zero error code. - */ -static int -zfs_enter(zfsvfs_t *zfsvfs) -{ - ZFS_ENTER(zfsvfs); - return (0); -} - /*ARGSUSED*/ static kmem_cbrc_t zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg) @@ -240,12 +236,32 @@ zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg) } /* - * Ensure that the filesystem is not unmounted during the move. + * Close a small window in which it's possible that the filesystem could + * be unmounted and freed, and zfsvfs, though valid in the previous + * statement, could point to unrelated memory by the time we try to + * prevent the filesystem from being unmounted. + */ + rw_enter(&zfsvfs_lock, RW_WRITER); + if (zfsvfs != ozp->z_zfsvfs) { + rw_exit(&zfsvfs_lock); + ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck1); + return (KMEM_CBRC_DONT_KNOW); + } + + /* + * If the znode is still valid, then so is the file system. We know that + * no valid file system can be freed while we hold zfsvfs_lock, so we + * can safely ensure that the filesystem is not and will not be + * unmounted. The next statement is equivalent to ZFS_ENTER(). */ - if (zfs_enter(zfsvfs) != 0) { /* ZFS_ENTER */ + rrw_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG); + if (zfsvfs->z_unmounted) { + ZFS_EXIT(zfsvfs); + rw_exit(&zfsvfs_lock); ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted); return (KMEM_CBRC_DONT_KNOW); } + rw_exit(&zfsvfs_lock); mutex_enter(&zfsvfs->z_znodes_lock); /* @@ -255,7 +271,7 @@ zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg) if (zfsvfs != ozp->z_zfsvfs) { mutex_exit(&zfsvfs->z_znodes_lock); ZFS_EXIT(zfsvfs); - ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck_invalid); + ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck2); return (KMEM_CBRC_DONT_KNOW); } @@ -311,6 +327,7 @@ zfs_znode_init(void) /* * Initialize zcache */ + rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL); ASSERT(znode_cache == NULL); znode_cache = kmem_cache_create("zfs_znode_cache", sizeof (znode_t), 0, zfs_znode_cache_constructor, @@ -332,6 +349,7 @@ zfs_znode_fini(void) if (znode_cache) kmem_cache_destroy(znode_cache); znode_cache = NULL; + rw_destroy(&zfsvfs_lock); } struct vnodeops *zfs_dvnodeops; @@ -339,6 +357,7 @@ struct vnodeops *zfs_fvnodeops; struct vnodeops *zfs_symvnodeops; struct vnodeops *zfs_xdvnodeops; struct vnodeops *zfs_evnodeops; +struct vnodeops *zfs_sharevnodeops; void zfs_remove_op_tables() @@ -363,12 +382,15 @@ zfs_remove_op_tables() vn_freevnodeops(zfs_xdvnodeops); if (zfs_evnodeops) vn_freevnodeops(zfs_evnodeops); + if (zfs_sharevnodeops) + vn_freevnodeops(zfs_sharevnodeops); zfs_dvnodeops = NULL; zfs_fvnodeops = NULL; zfs_symvnodeops = NULL; zfs_xdvnodeops = NULL; zfs_evnodeops = NULL; + zfs_sharevnodeops = NULL; } extern const fs_operation_def_t zfs_dvnodeops_template[]; @@ -376,6 +398,7 @@ extern const fs_operation_def_t zfs_fvnodeops_template[]; extern const fs_operation_def_t zfs_xdvnodeops_template[]; extern const fs_operation_def_t zfs_symvnodeops_template[]; extern const fs_operation_def_t zfs_evnodeops_template[]; +extern const fs_operation_def_t zfs_sharevnodeops_template[]; int zfs_create_op_tables() @@ -412,103 +435,58 @@ zfs_create_op_tables() error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template, &zfs_evnodeops); + if (error) + return (error); + + error = vn_make_ops(MNTTYPE_ZFS, zfs_sharevnodeops_template, + &zfs_sharevnodeops); return (error); } -/* - * zfs_init_fs - Initialize the zfsvfs struct and the file system - * incore "master" object. Verify version compatibility. - */ int -zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp) +zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx) { - extern int zfsfstype; - - objset_t *os = zfsvfs->z_os; - int i, error; - uint64_t fsid_guid; - uint64_t zval; - - *zpp = NULL; - - error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); - if (error) { - return (error); - } else if (zfsvfs->z_version > ZPL_VERSION) { - (void) printf("Mismatched versions: File system " - "is version %llu on-disk format, which is " - "incompatible with this software version %lld!", - (u_longlong_t)zfsvfs->z_version, ZPL_VERSION); - return (ENOTSUP); - } - - if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0) - return (error); - zfsvfs->z_norm = (int)zval; - if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0) - return (error); - zfsvfs->z_utf8 = (zval != 0); - if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0) - return (error); - zfsvfs->z_case = (uint_t)zval; - /* - * Fold case on file systems that are always or sometimes case - * insensitive. - */ - if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || - zfsvfs->z_case == ZFS_CASE_MIXED) - zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; + zfs_acl_ids_t acl_ids; + vattr_t vattr; + znode_t *sharezp; + vnode_t *vp; + znode_t *zp; + int error; - /* - * The fsid is 64 bits, composed of an 8-bit fs type, which - * separates our fsid from any other filesystem types, and a - * 56-bit objset unique ID. The objset unique ID is unique to - * all objsets open on this system, provided by unique_create(). - * The 8-bit fs type must be put in the low bits of fsid[1] - * because that's where other Solaris filesystems put it. - */ - fsid_guid = dmu_objset_fsid_guid(os); - ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); - zfsvfs->z_vfs->vfs_fsid.val[0] = fsid_guid; - zfsvfs->z_vfs->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) | - zfsfstype & 0xFF; - - error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, - &zfsvfs->z_root); - if (error) - return (error); - ASSERT(zfsvfs->z_root != 0); + vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; + vattr.va_type = VDIR; + vattr.va_mode = S_IFDIR|0555; + vattr.va_uid = crgetuid(kcred); + vattr.va_gid = crgetgid(kcred); - error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, - &zfsvfs->z_unlinkedobj); - if (error) - return (error); + sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP); + sharezp->z_unlinked = 0; + sharezp->z_atime_dirty = 0; + sharezp->z_zfsvfs = zfsvfs; - /* - * Initialize zget mutex's - */ - for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) - mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); + vp = ZTOV(sharezp); + vn_reinit(vp); + vp->v_type = VDIR; - error = zfs_zget(zfsvfs, zfsvfs->z_root, zpp); - if (error) { - /* - * On error, we destroy the mutexes here since it's not - * possible for the caller to determine if the mutexes were - * initialized properly. - */ - for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) - mutex_destroy(&zfsvfs->z_hold_mtx[i]); - return (error); - } - ASSERT3U((*zpp)->z_id, ==, zfsvfs->z_root); - error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, - &zfsvfs->z_fuid_obj); - if (error == ENOENT) - error = 0; + VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr, + kcred, NULL, &acl_ids)); + zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, + &zp, 0, &acl_ids); + ASSERT3P(zp, ==, sharezp); + ASSERT(!vn_in_dnlc(ZTOV(sharezp))); /* not valid to move */ + POINTER_INVALIDATE(&sharezp->z_zfsvfs); + error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, + ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx); + zfsvfs->z_shares_dir = sharezp->z_id; + + zfs_acl_ids_free(&acl_ids); + ZTOV(sharezp)->v_count = 0; + dmu_buf_rele(sharezp->z_dbuf, NULL); + sharezp->z_dbuf = NULL; + kmem_cache_free(znode_cache, sharezp); - return (0); + return (error); } /* @@ -676,7 +654,10 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz) break; case VREG: vp->v_flag |= VMODSORT; - vn_setops(vp, zfs_fvnodeops); + if (zp->z_phys->zp_parent == zfsvfs->z_shares_dir) + vn_setops(vp, zfs_sharevnodeops); + else + vn_setops(vp, zfs_fvnodeops); break; case VLNK: vn_setops(vp, zfs_symvnodeops); @@ -720,8 +701,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz) */ void zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, - uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_t *setaclp, - zfs_fuid_info_t **fuidp) + uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_ids_t *acl_ids) { dmu_buf_t *db; znode_phys_t *pzp; @@ -846,7 +826,12 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, */ *zpp = dzp; } - zfs_perm_init(*zpp, dzp, flag, vap, tx, cr, setaclp, fuidp); + pzp->zp_uid = acl_ids->z_fuid; + pzp->zp_gid = acl_ids->z_fgid; + pzp->zp_mode = acl_ids->z_mode; + VERIFY(0 == zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx)); + if (vap->va_mask & AT_XVATTR) + zfs_xvattr_set(*zpp, (xvattr_t *)vap); } void @@ -1474,7 +1459,7 @@ void zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) { zfsvfs_t zfsvfs; - uint64_t moid, doid, version; + uint64_t moid, obj, version; uint64_t sense = ZFS_CASE_SENSITIVE; uint64_t norm = 0; nvpair_t *elem; @@ -1483,6 +1468,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) vnode_t *vp; vattr_t vattr; znode_t *zp; + zfs_acl_ids_t acl_ids; /* * First attempt to create master node. @@ -1499,12 +1485,12 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) /* * Set starting attributes. */ - if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID) + if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_USERSPACE) version = ZPL_VERSION; + else if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID) + version = ZPL_VERSION_USERSPACE - 1; else version = ZPL_VERSION_FUID - 1; - error = zap_update(os, moid, ZPL_VERSION_STR, - 8, 1, &version, tx); elem = NULL; while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { /* For the moment we expect all zpl props to be uint64_ts */ @@ -1515,9 +1501,8 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) VERIFY(nvpair_value_uint64(elem, &val) == 0); name = nvpair_name(elem); if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { - version = val; - error = zap_update(os, moid, ZPL_VERSION_STR, - 8, 1, &version, tx); + if (val < version) + version = val; } else { error = zap_update(os, moid, name, 8, 1, &val, tx); } @@ -1528,13 +1513,14 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) sense = val; } ASSERT(version != 0); + error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx); /* * Create a delete queue. */ - doid = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); + obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); - error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &doid, tx); + error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx); ASSERT(error == 0); /* @@ -1575,17 +1561,28 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs)); rootzp->z_zfsvfs = &zfsvfs; - zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, NULL, NULL); + VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, + cr, NULL, &acl_ids)); + zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, &acl_ids); ASSERT3P(zp, ==, rootzp); ASSERT(!vn_in_dnlc(ZTOV(rootzp))); /* not valid to move */ error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); ASSERT(error == 0); + zfs_acl_ids_free(&acl_ids); POINTER_INVALIDATE(&rootzp->z_zfsvfs); ZTOV(rootzp)->v_count = 0; dmu_buf_rele(rootzp->z_dbuf, NULL); rootzp->z_dbuf = NULL; kmem_cache_free(znode_cache, rootzp); + + /* + * Create shares directory + */ + + error = zfs_create_share_dir(&zfsvfs, tx); + + ASSERT(error == 0); } #endif /* _KERNEL */ diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 83fef0d87..53d9d9bf7 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -19,12 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #include <sys/zfs_context.h> #include <sys/spa.h> +#include <sys/spa_impl.h> #include <sys/dmu.h> #include <sys/zap.h> #include <sys/arc.h> @@ -471,34 +472,22 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first) } /* - * zil_rollback_destroy() is only called by the rollback code. - * We already have a syncing tx. Rollback has exclusive access to the - * dataset, so we don't have to worry about concurrent zil access. - * The actual freeing of any log blocks occurs in zil_sync() later in - * this txg syncing phase. + * return true if the initial log block is not valid */ -void -zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx) +static boolean_t +zil_empty(zilog_t *zilog) { const zil_header_t *zh = zilog->zl_header; - uint64_t txg; + arc_buf_t *abuf = NULL; if (BP_IS_HOLE(&zh->zh_log)) - return; + return (B_TRUE); - txg = dmu_tx_get_txg(tx); - ASSERT3U(zilog->zl_destroy_txg, <, txg); - zilog->zl_destroy_txg = txg; - zilog->zl_keep_first = B_FALSE; + if (zil_read_log_block(zilog, &zh->zh_log, &abuf) != 0) + return (B_TRUE); - /* - * Ensure there's no outstanding ZIL IO. No lwbs or just the - * unused one that allocated in advance is ok. - */ - ASSERT(zilog->zl_lwb_list.list_head.list_next == - zilog->zl_lwb_list.list_head.list_prev); - (void) zil_parse(zilog, zil_free_log_block, zil_free_log_record, - tx, zh->zh_claim_txg); + VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); + return (B_FALSE); } int @@ -520,6 +509,30 @@ zil_claim(char *osname, void *txarg) zilog = dmu_objset_zil(os); zh = zil_header_in_syncing_context(zilog); + if (zilog->zl_spa->spa_log_state == SPA_LOG_CLEAR) { + if (!BP_IS_HOLE(&zh->zh_log)) + zio_free_blk(zilog->zl_spa, &zh->zh_log, first_txg); + BP_ZERO(&zh->zh_log); + dsl_dataset_dirty(dmu_objset_ds(os), tx); + } + + /* + * Record here whether the zil has any records to replay. + * If the header block pointer is null or the block points + * to the stubby then we know there are no valid log records. + * We use the header to store this state as the the zilog gets + * freed later in dmu_objset_close(). + * The flags (and the rest of the header fields) are cleared in + * zil_sync() as a result of a zil_destroy(), after replaying the log. + * + * Note, the intent log can be empty but still need the + * stubby to be claimed. + */ + if (!zil_empty(zilog)) { + zh->zh_flags |= ZIL_REPLAY_NEEDED; + dsl_dataset_dirty(dmu_objset_ds(os), tx); + } + /* * Claim all log blocks if we haven't already done so, and remember * the highest claimed sequence number. This ensures that if we can @@ -587,36 +600,6 @@ zil_check_log_chain(char *osname, void *txarg) return (error); } -/* - * Clear a log chain - */ -/* ARGSUSED */ -int -zil_clear_log_chain(char *osname, void *txarg) -{ - zilog_t *zilog; - zil_header_t *zh; - objset_t *os; - dmu_tx_t *tx; - int error; - - error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os); - if (error) { - cmn_err(CE_WARN, "can't open objset for %s", osname); - return (0); - } - - zilog = dmu_objset_zil(os); - tx = dmu_tx_create(zilog->zl_os); - (void) dmu_tx_assign(tx, TXG_WAIT); - zh = zil_header_in_syncing_context(zilog); - BP_ZERO(&zh->zh_log); - dsl_dataset_dirty(dmu_objset_ds(os), tx); - dmu_tx_commit(tx); - dmu_objset_close(os); - return (0); -} - static int zil_vdev_compare(const void *x1, const void *x2) { @@ -719,18 +702,26 @@ zil_lwb_write_done(zio_t *zio) ASSERT(zio->io_bp->blk_fill == 0); /* - * Now that we've written this log block, we have a stable pointer - * to the next block in the chain, so it's OK to let the txg in - * which we allocated the next block sync. + * Ensure the lwb buffer pointer is cleared before releasing + * the txg. If we have had an allocation failure and + * the txg is waiting to sync then we want want zil_sync() + * to remove the lwb so that it's not picked up as the next new + * one in zil_commit_writer(). zil_sync() will only remove + * the lwb if lwb_buf is null. */ - txg_rele_to_sync(&lwb->lwb_txgh); - zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); mutex_enter(&zilog->zl_lock); lwb->lwb_buf = NULL; if (zio->io_error) zilog->zl_log_error = B_TRUE; mutex_exit(&zilog->zl_lock); + + /* + * Now that we've written this log block, we have a stable pointer + * to the next block in the chain, so it's OK to let the txg in + * which we allocated the next block sync. + */ + txg_rele_to_sync(&lwb->lwb_txgh); } /* @@ -752,9 +743,9 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb) } if (lwb->lwb_zio == NULL) { lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa, - 0, &lwb->lwb_blk, lwb->lwb_buf, - lwb->lwb_sz, zil_lwb_write_done, lwb, - ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_CANFAIL, &zb); + 0, &lwb->lwb_blk, lwb->lwb_buf, lwb->lwb_sz, + zil_lwb_write_done, lwb, ZIO_PRIORITY_LOG_WRITE, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb); } } @@ -1040,7 +1031,7 @@ zil_clean(zilog_t *zilog) if ((itx != NULL) && (itx->itx_lr.lrc_txg <= spa_last_synced_txg(zilog->zl_spa))) { (void) taskq_dispatch(zilog->zl_clean_taskq, - (void (*)(void *))zil_itx_clean, zilog, TQ_NOSLEEP); + (task_func_t *)zil_itx_clean, zilog, TQ_SLEEP); } mutex_exit(&zilog->zl_lock); } @@ -1216,6 +1207,13 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx) spa_t *spa = zilog->zl_spa; lwb_t *lwb; + /* + * We don't zero out zl_destroy_txg, so make sure we don't try + * to destroy it twice. + */ + if (spa_sync_pass(spa) != 1) + return; + mutex_enter(&zilog->zl_lock); ASSERT(zilog->zl_stop_sync == 0); @@ -1226,7 +1224,6 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx) blkptr_t blk = zh->zh_log; ASSERT(list_head(&zilog->zl_lwb_list) == NULL); - ASSERT(spa_sync_pass(spa) == 1); bzero(zh, sizeof (zil_header_t)); bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq)); @@ -1245,12 +1242,7 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx) } } - for (;;) { - lwb = list_head(&zilog->zl_lwb_list); - if (lwb == NULL) { - mutex_exit(&zilog->zl_lock); - return; - } + while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { zh->zh_log = lwb->lwb_blk; if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg) break; @@ -1344,25 +1336,6 @@ zil_free(zilog_t *zilog) } /* - * return true if the initial log block is not valid - */ -static boolean_t -zil_empty(zilog_t *zilog) -{ - const zil_header_t *zh = zilog->zl_header; - arc_buf_t *abuf = NULL; - - if (BP_IS_HOLE(&zh->zh_log)) - return (B_TRUE); - - if (zil_read_log_block(zilog, &zh->zh_log, &abuf) != 0) - return (B_TRUE); - - VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); - return (B_FALSE); -} - -/* * Open an intent log. */ zilog_t * @@ -1417,7 +1390,7 @@ zil_suspend(zilog_t *zilog) const zil_header_t *zh = zilog->zl_header; mutex_enter(&zilog->zl_lock); - if (zh->zh_claim_txg != 0) { /* unplayed log */ + if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */ mutex_exit(&zilog->zl_lock); return (EBUSY); } @@ -1601,7 +1574,7 @@ zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE]) const zil_header_t *zh = zilog->zl_header; zil_replay_arg_t zr; - if (zil_empty(zilog)) { + if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) { zil_destroy(zilog, B_TRUE); return; } @@ -1671,3 +1644,24 @@ out: mutex_exit(&zilog->zl_lock); return (ret); } + +/* ARGSUSED */ +int +zil_vdev_offline(char *osname, void *arg) +{ + objset_t *os; + zilog_t *zilog; + int error; + + error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os); + if (error) + return (error); + + zilog = dmu_objset_zil(os); + if (zil_suspend(zilog) != 0) + error = EEXIST; + else + zil_resume(zilog); + dmu_objset_close(os); + return (error); +} diff --git a/module/zfs/zio.c b/module/zfs/zio.c index a669ad64a..a2bdab9a7 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -490,11 +490,10 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zio->io_bp_orig = *bp; if (type != ZIO_TYPE_WRITE) zio->io_bp = &zio->io_bp_copy; /* so caller can free */ - if (zio->io_child_type == ZIO_CHILD_LOGICAL) { - if (BP_IS_GANG(bp)) - pipeline |= ZIO_GANG_STAGES; + if (zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_logical = zio; - } + if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) + pipeline |= ZIO_GANG_STAGES; } zio->io_spa = spa; @@ -520,6 +519,8 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, if (pio != NULL) { if (zio->io_logical == NULL) zio->io_logical = pio->io_logical; + if (zio->io_child_type == ZIO_CHILD_GANG) + zio->io_gang_leader = pio->io_gang_leader; zio_add_child(pio, zio); } @@ -529,21 +530,11 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, static void zio_destroy(zio_t *zio) { - spa_t *spa = zio->io_spa; - uint8_t async_root = zio->io_async_root; - list_destroy(&zio->io_parent_list); list_destroy(&zio->io_child_list); mutex_destroy(&zio->io_lock); cv_destroy(&zio->io_cv); kmem_cache_free(zio_cache, zio); - - if (async_root) { - mutex_enter(&spa->spa_async_root_lock); - if (--spa->spa_async_root_count == 0) - cv_broadcast(&spa->spa_async_root_cv); - mutex_exit(&spa->spa_async_root_lock); - } } zio_t * @@ -836,7 +827,8 @@ zio_read_bp_init(zio_t *zio) blkptr_t *bp = zio->io_bp; if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && - zio->io_logical == zio && !(zio->io_flags & ZIO_FLAG_RAW)) { + zio->io_child_type == ZIO_CHILD_LOGICAL && + !(zio->io_flags & ZIO_FLAG_RAW)) { uint64_t csize = BP_GET_PSIZE(bp); void *cbuf = zio_buf_alloc(csize); @@ -885,16 +877,10 @@ zio_write_bp_init(zio_t *zio) * few passes, stop compressing to ensure convergence. */ pass = spa_sync_pass(zio->io_spa); - ASSERT(pass > 1); if (pass > SYNC_PASS_DONT_COMPRESS) compress = ZIO_COMPRESS_OFF; - /* - * Only MOS (objset 0) data should need to be rewritten. - */ - ASSERT(zio->io_logical->io_bookmark.zb_objset == 0); - /* Make sure someone doesn't change their mind on overwrites */ ASSERT(MIN(zp->zp_ndvas + BP_IS_GANG(bp), spa_max_replication(zio->io_spa)) == BP_GET_NDVAS(bp)); @@ -956,11 +942,11 @@ zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q) zio_type_t t = zio->io_type; /* - * If we're a config writer, the normal issue and interrupt threads - * may all be blocked waiting for the config lock. In this case, - * select the otherwise-unused taskq for ZIO_TYPE_NULL. + * If we're a config writer or a probe, the normal issue and + * interrupt threads may all be blocked waiting for the config lock. + * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. */ - if (zio->io_flags & ZIO_FLAG_CONFIG_WRITER) + if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) t = ZIO_TYPE_NULL; /* @@ -1092,14 +1078,12 @@ zio_nowait(zio_t *zio) zio_unique_parent(zio) == NULL) { /* * This is a logical async I/O with no parent to wait for it. - * Track how many outstanding I/Os of this type exist so - * that spa_unload() knows when they are all done. + * We add it to the spa_async_root_zio "Godfather" I/O which + * will ensure they complete prior to unloading the pool. */ spa_t *spa = zio->io_spa; - zio->io_async_root = B_TRUE; - mutex_enter(&spa->spa_async_root_lock); - spa->spa_async_root_count++; - mutex_exit(&spa->spa_async_root_lock); + + zio_add_child(spa->spa_async_zio_root, zio); } zio_execute(zio); @@ -1118,6 +1102,8 @@ zio_reexecute(zio_t *pio) ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); + ASSERT(pio->io_gang_leader == NULL); + ASSERT(pio->io_gang_tree == NULL); pio->io_flags = pio->io_orig_flags; pio->io_stage = pio->io_orig_stage; @@ -1161,8 +1147,11 @@ zio_reexecute(zio_t *pio) /* * Now that all children have been reexecuted, execute the parent. + * We don't reexecute "The Godfather" I/O here as it's the + * responsibility of the caller to wait on him. */ - zio_execute(pio); + if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) + zio_execute(pio); } void @@ -1178,11 +1167,14 @@ zio_suspend(spa_t *spa, zio_t *zio) mutex_enter(&spa->spa_suspend_lock); if (spa->spa_suspend_zio_root == NULL) - spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, 0); + spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | + ZIO_FLAG_GODFATHER); spa->spa_suspended = B_TRUE; if (zio != NULL) { + ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); ASSERT(zio != spa->spa_suspend_zio_root); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(zio_unique_parent(zio) == NULL); @@ -1193,10 +1185,10 @@ zio_suspend(spa_t *spa, zio_t *zio) mutex_exit(&spa->spa_suspend_lock); } -void +int zio_resume(spa_t *spa) { - zio_t *pio, *cio, *cio_next; + zio_t *pio; /* * Reexecute all previously suspended i/o. @@ -1209,18 +1201,10 @@ zio_resume(spa_t *spa) mutex_exit(&spa->spa_suspend_lock); if (pio == NULL) - return; - - for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) { - zio_link_t *zl = pio->io_walk_link; - cio_next = zio_walk_children(pio); - zio_remove_child(pio, cio, zl); - zio_reexecute(cio); - } + return (0); - ASSERT(pio->io_children[ZIO_CHILD_LOGICAL][ZIO_WAIT_DONE] == 0); - - (void) zio_wait(pio); + zio_reexecute(pio); + return (zio_wait(pio)); } void @@ -1327,7 +1311,7 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) * (Presently, nothing actually uses interior data checksums; * this is just good hygiene.) */ - if (gn != pio->io_logical->io_gang_tree) { + if (gn != pio->io_gang_leader->io_gang_tree) { zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), data, BP_GET_PSIZE(bp)); } @@ -1409,27 +1393,26 @@ zio_gang_tree_free(zio_gang_node_t **gnpp) } static void -zio_gang_tree_assemble(zio_t *lio, blkptr_t *bp, zio_gang_node_t **gnpp) +zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) { zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); - ASSERT(lio->io_logical == lio); + ASSERT(gio->io_gang_leader == gio); ASSERT(BP_IS_GANG(bp)); - zio_nowait(zio_read(lio, lio->io_spa, bp, gn->gn_gbh, + zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh, SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, - lio->io_priority, ZIO_GANG_CHILD_FLAGS(lio), &lio->io_bookmark)); + gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); } static void zio_gang_tree_assemble_done(zio_t *zio) { - zio_t *lio = zio->io_logical; + zio_t *gio = zio->io_gang_leader; zio_gang_node_t *gn = zio->io_private; blkptr_t *bp = zio->io_bp; - zio_t *pio = zio_unique_parent(zio); - ASSERT(pio == lio); + ASSERT(gio == zio_unique_parent(zio)); ASSERT(zio_walk_children(zio) == NULL); if (zio->io_error) @@ -1446,25 +1429,25 @@ zio_gang_tree_assemble_done(zio_t *zio) blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (!BP_IS_GANG(gbp)) continue; - zio_gang_tree_assemble(lio, gbp, &gn->gn_child[g]); + zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); } } static void zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) { - zio_t *lio = pio->io_logical; + zio_t *gio = pio->io_gang_leader; zio_t *zio; ASSERT(BP_IS_GANG(bp) == !!gn); - ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(lio->io_bp)); - ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == lio->io_gang_tree); + ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); + ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); /* * If you're a gang header, your data is in gn->gn_gbh. * If you're a gang member, your data is in 'data' and gn == NULL. */ - zio = zio_gang_issue_func[lio->io_type](pio, bp, gn, data); + zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data); if (gn != NULL) { ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC); @@ -1478,8 +1461,8 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) } } - if (gn == lio->io_gang_tree) - ASSERT3P((char *)lio->io_data + lio->io_size, ==, data); + if (gn == gio->io_gang_tree) + ASSERT3P((char *)gio->io_data + gio->io_size, ==, data); if (zio != pio) zio_nowait(zio); @@ -1490,7 +1473,10 @@ zio_gang_assemble(zio_t *zio) { blkptr_t *bp = zio->io_bp; - ASSERT(BP_IS_GANG(bp) && zio == zio->io_logical); + ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); + ASSERT(zio->io_child_type > ZIO_CHILD_GANG); + + zio->io_gang_leader = zio; zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); @@ -1500,18 +1486,18 @@ zio_gang_assemble(zio_t *zio) static int zio_gang_issue(zio_t *zio) { - zio_t *lio = zio->io_logical; blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)) return (ZIO_PIPELINE_STOP); - ASSERT(BP_IS_GANG(bp) && zio == lio); + ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); + ASSERT(zio->io_child_type > ZIO_CHILD_GANG); if (zio->io_child_error[ZIO_CHILD_GANG] == 0) - zio_gang_tree_issue(lio, lio->io_gang_tree, bp, lio->io_data); + zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data); else - zio_gang_tree_free(&lio->io_gang_tree); + zio_gang_tree_free(&zio->io_gang_tree); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; @@ -1522,7 +1508,7 @@ static void zio_write_gang_member_ready(zio_t *zio) { zio_t *pio = zio_unique_parent(zio); - zio_t *lio = zio->io_logical; + zio_t *gio = zio->io_gang_leader; dva_t *cdva = zio->io_bp->blk_dva; dva_t *pdva = pio->io_bp->blk_dva; uint64_t asize; @@ -1533,7 +1519,7 @@ zio_write_gang_member_ready(zio_t *zio) ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); ASSERT(zio->io_child_type == ZIO_CHILD_GANG); - ASSERT3U(zio->io_prop.zp_ndvas, ==, lio->io_prop.zp_ndvas); + ASSERT3U(zio->io_prop.zp_ndvas, ==, gio->io_prop.zp_ndvas); ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(zio->io_bp)); ASSERT3U(pio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(pio->io_bp)); ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); @@ -1553,28 +1539,28 @@ zio_write_gang_block(zio_t *pio) { spa_t *spa = pio->io_spa; blkptr_t *bp = pio->io_bp; - zio_t *lio = pio->io_logical; + zio_t *gio = pio->io_gang_leader; zio_t *zio; zio_gang_node_t *gn, **gnpp; zio_gbh_phys_t *gbh; uint64_t txg = pio->io_txg; uint64_t resid = pio->io_size; uint64_t lsize; - int ndvas = lio->io_prop.zp_ndvas; + int ndvas = gio->io_prop.zp_ndvas; int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa)); zio_prop_t zp; int error; error = metaslab_alloc(spa, spa->spa_normal_class, SPA_GANGBLOCKSIZE, - bp, gbh_ndvas, txg, pio == lio ? NULL : lio->io_bp, + bp, gbh_ndvas, txg, pio == gio ? NULL : gio->io_bp, METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER); if (error) { pio->io_error = error; return (ZIO_PIPELINE_CONTINUE); } - if (pio == lio) { - gnpp = &lio->io_gang_tree; + if (pio == gio) { + gnpp = &gio->io_gang_tree; } else { gnpp = pio->io_private; ASSERT(pio->io_ready == zio_write_gang_member_ready); @@ -1598,11 +1584,11 @@ zio_write_gang_block(zio_t *pio) SPA_MINBLOCKSIZE); ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); - zp.zp_checksum = lio->io_prop.zp_checksum; + zp.zp_checksum = gio->io_prop.zp_checksum; zp.zp_compress = ZIO_COMPRESS_OFF; zp.zp_type = DMU_OT_NONE; zp.zp_level = 0; - zp.zp_ndvas = lio->io_prop.zp_ndvas; + zp.zp_ndvas = gio->io_prop.zp_ndvas; zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, @@ -1635,6 +1621,11 @@ zio_dva_allocate(zio_t *zio) blkptr_t *bp = zio->io_bp; int error; + if (zio->io_gang_leader == NULL) { + ASSERT(zio->io_child_type > ZIO_CHILD_GANG); + zio->io_gang_leader = zio; + } + ASSERT(BP_IS_HOLE(bp)); ASSERT3U(BP_GET_NDVAS(bp), ==, 0); ASSERT3U(zio->io_prop.zp_ndvas, >, 0); @@ -1864,7 +1855,8 @@ zio_vdev_io_done(zio_t *zio) vdev_cache_write(zio); if (zio_injection_enabled && zio->io_error == 0) - zio->io_error = zio_handle_device_injection(vd, EIO); + zio->io_error = zio_handle_device_injection(vd, + zio, EIO); if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_label_injection(zio, EIO); @@ -2079,11 +2071,10 @@ zio_ready(zio_t *zio) blkptr_t *bp = zio->io_bp; zio_t *pio, *pio_next; - if (zio->io_ready) { - if (BP_IS_GANG(bp) && - zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY)) - return (ZIO_PIPELINE_STOP); + if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY)) + return (ZIO_PIPELINE_STOP); + if (zio->io_ready) { ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); @@ -2128,7 +2119,7 @@ zio_done(zio_t *zio) zio_t *pio, *pio_next; /* - * If our of children haven't all completed, + * If our children haven't all completed, * wait for them and then repeat this pipeline stage. */ if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) || @@ -2219,6 +2210,21 @@ zio_done(zio_t *zio) */ zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); + if ((zio->io_error || zio->io_reexecute) && IO_IS_ALLOCATING(zio) && + zio->io_child_type == ZIO_CHILD_LOGICAL) { + ASSERT(zio->io_child_type != ZIO_CHILD_GANG); + zio_dva_unallocate(zio, zio->io_gang_tree, bp); + } + + zio_gang_tree_free(&zio->io_gang_tree); + + /* + * Godfather I/Os should never suspend. + */ + if ((zio->io_flags & ZIO_FLAG_GODFATHER) && + (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) + zio->io_reexecute = 0; + if (zio->io_reexecute) { /* * This is a logical I/O that wants to reexecute. @@ -2235,21 +2241,37 @@ zio_done(zio_t *zio) */ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); - if (IO_IS_ALLOCATING(zio)) - zio_dva_unallocate(zio, zio->io_gang_tree, bp); - - zio_gang_tree_free(&zio->io_gang_tree); + zio->io_gang_leader = NULL; mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_DONE] = 1; mutex_exit(&zio->io_lock); + /* + * "The Godfather" I/O monitors its children but is + * not a true parent to them. It will track them through + * the pipeline but severs its ties whenever they get into + * trouble (e.g. suspended). This allows "The Godfather" + * I/O to return status without blocking. + */ + for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { + zio_link_t *zl = zio->io_walk_link; + pio_next = zio_walk_parents(zio); + + if ((pio->io_flags & ZIO_FLAG_GODFATHER) && + (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { + zio_remove_child(pio, zio, zl); + zio_notify_parent(pio, zio, ZIO_WAIT_DONE); + } + } + if ((pio = zio_unique_parent(zio)) != NULL) { /* * We're not a root i/o, so there's nothing to do * but notify our parent. Don't propagate errors * upward since we haven't permanently failed yet. */ + ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; zio_notify_parent(pio, zio, ZIO_WAIT_DONE); } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { @@ -2282,8 +2304,6 @@ zio_done(zio_t *zio) if (zio->io_done) zio->io_done(zio); - zio_gang_tree_free(&zio->io_gang_tree); - mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_DONE] = 1; mutex_exit(&zio->io_lock); diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c index b3469fdd5..f8e6880c9 100644 --- a/module/zfs/zio_inject.c +++ b/module/zfs/zio_inject.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -195,7 +195,7 @@ zio_handle_label_injection(zio_t *zio, int error) int -zio_handle_device_injection(vdev_t *vd, int error) +zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error) { inject_handler_t *handler; int ret = 0; @@ -210,6 +210,12 @@ zio_handle_device_injection(vdev_t *vd, int error) continue; if (vd->vdev_guid == handler->zi_record.zi_guid) { + if (handler->zi_record.zi_failfast && + (zio == NULL || (zio->io_flags & + (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))) { + continue; + } + if (handler->zi_record.zi_error == error) { /* * For a failed open, pretend like the device diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index d3be8fb50..cfd3b3dbd 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -107,7 +107,7 @@ typedef struct zvol_state { uint64_t zv_volblocksize; /* volume block size */ minor_t zv_minor; /* minor number */ uint8_t zv_min_bs; /* minimum addressable block shift */ - uint8_t zv_flags; /* readonly; dumpified */ + uint8_t zv_flags; /* readonly, dumpified, etc. */ objset_t *zv_objset; /* objset handle */ uint32_t zv_mode; /* DS_MODE_* flags at open time */ uint32_t zv_open_count[OTYPCNT]; /* open counts */ @@ -123,6 +123,7 @@ typedef struct zvol_state { #define ZVOL_RDONLY 0x1 #define ZVOL_DUMPIFIED 0x2 #define ZVOL_EXCL 0x4 +#define ZVOL_WCE 0x8 /* * zvol maximum transfer in one DMU tx. @@ -741,6 +742,27 @@ zvol_set_volsize(const char *name, major_t maj, uint64_t volsize) } } + /* + * Generate a LUN expansion event. + */ + if (error == 0) { + sysevent_id_t eid; + nvlist_t *attr; + char *physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); + + (void) snprintf(physpath, MAXPATHLEN, "%s%uc", ZVOL_PSEUDO_DEV, + zv->zv_minor); + + VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); + + (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, + ESC_DEV_DLE, attr, &eid, DDI_SLEEP); + + nvlist_free(attr); + kmem_free(physpath, MAXPATHLEN); + } + out: if (state.zv_objset) dmu_objset_close(state.zv_objset); @@ -924,7 +946,8 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) * we don't have to write the data twice. */ if (buf != NULL) /* immediate write */ - return (dmu_read(os, ZVOL_OBJ, lr->lr_offset, dlen, buf)); + return (dmu_read(os, ZVOL_OBJ, lr->lr_offset, dlen, buf, + DMU_READ_NO_PREFETCH)); zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_zilog = zv->zv_zilog; @@ -968,11 +991,15 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) ssize_t zvol_immediate_write_sz = 32768; static void -zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len) +zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid, + boolean_t sync) { uint32_t blocksize = zv->zv_volblocksize; zilog_t *zilog = zv->zv_zilog; - lr_write_t *lr; + boolean_t slogging; + + if (zil_disable) + return; if (zilog->zl_replay) { dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); @@ -981,23 +1008,58 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len) return; } - while (len) { - ssize_t nbytes = MIN(len, blocksize - P2PHASE(off, blocksize)); - itx_t *itx = zil_itx_create(TX_WRITE, sizeof (*lr)); + slogging = spa_has_slogs(zilog->zl_spa); - itx->itx_wr_state = - len > zvol_immediate_write_sz ? WR_INDIRECT : WR_NEED_COPY; - itx->itx_private = zv; + while (resid) { + itx_t *itx; + lr_write_t *lr; + ssize_t len; + itx_wr_state_t write_state; + + /* + * Unlike zfs_log_write() we can be called with + * upto DMU_MAX_ACCESS/2 (5MB) writes. + */ + if (blocksize > zvol_immediate_write_sz && !slogging && + resid >= blocksize && off % blocksize == 0) { + write_state = WR_INDIRECT; /* uses dmu_sync */ + len = blocksize; + } else if (sync) { + write_state = WR_COPIED; + len = MIN(ZIL_MAX_LOG_DATA, resid); + } else { + write_state = WR_NEED_COPY; + len = MIN(ZIL_MAX_LOG_DATA, resid); + } + + itx = zil_itx_create(TX_WRITE, sizeof (*lr) + + (write_state == WR_COPIED ? len : 0)); lr = (lr_write_t *)&itx->itx_lr; + if (write_state == WR_COPIED && dmu_read(zv->zv_objset, + ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) { + kmem_free(itx, offsetof(itx_t, itx_lr) + + itx->itx_lr.lrc_reclen); + itx = zil_itx_create(TX_WRITE, sizeof (*lr)); + lr = (lr_write_t *)&itx->itx_lr; + write_state = WR_NEED_COPY; + } + + itx->itx_wr_state = write_state; + if (write_state == WR_NEED_COPY) + itx->itx_sod += len; lr->lr_foid = ZVOL_OBJ; lr->lr_offset = off; - lr->lr_length = nbytes; + lr->lr_length = len; lr->lr_blkoff = off - P2ALIGN_TYPED(off, blocksize, uint64_t); BP_ZERO(&lr->lr_blkptr); + itx->itx_private = zv; + itx->itx_sync = sync; + (void) zil_itx_assign(zilog, itx, tx); - len -= nbytes; - off += nbytes; + + off += len; + resid -= len; } } @@ -1010,7 +1072,9 @@ zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t size, int numerrors = 0; for (c = 0; c < vd->vdev_children; c++) { - ASSERT(vd->vdev_ops == &vdev_mirror_ops); + ASSERT(vd->vdev_ops == &vdev_mirror_ops || + vd->vdev_ops == &vdev_replacing_ops || + vd->vdev_ops == &vdev_spare_ops); int err = zvol_dumpio_vdev(vd->vdev_child[c], addr, offset, size, doread, isdump); if (err != 0) { @@ -1086,6 +1150,7 @@ zvol_strategy(buf_t *bp) int error = 0; boolean_t doread = bp->b_flags & B_READ; boolean_t is_dump = zv->zv_flags & ZVOL_DUMPIFIED; + boolean_t sync; if (zv == NULL) { bioerror(bp, ENXIO); @@ -1123,6 +1188,9 @@ zvol_strategy(buf_t *bp) return (0); } + sync = !(bp->b_flags & B_ASYNC) && !doread && !is_dump && + !(zv->zv_flags & ZVOL_WCE) && !zil_disable; + /* * There must be no buffer changes when doing a dmu_sync() because * we can't change the data whilst calculating the checksum. @@ -1137,7 +1205,8 @@ zvol_strategy(buf_t *bp) error = zvol_dumpio(zv, addr, off, size, doread, B_FALSE); } else if (doread) { - error = dmu_read(os, ZVOL_OBJ, off, size, addr); + error = dmu_read(os, ZVOL_OBJ, off, size, addr, + DMU_READ_PREFETCH); } else { dmu_tx_t *tx = dmu_tx_create(os); dmu_tx_hold_write(tx, ZVOL_OBJ, off, size); @@ -1146,7 +1215,7 @@ zvol_strategy(buf_t *bp) dmu_tx_abort(tx); } else { dmu_write(os, ZVOL_OBJ, off, size, addr, tx); - zvol_log_write(zv, tx, off, size); + zvol_log_write(zv, tx, off, size, sync); dmu_tx_commit(tx); } } @@ -1165,7 +1234,7 @@ zvol_strategy(buf_t *bp) if ((bp->b_resid = resid) == bp->b_bcount) bioerror(bp, off > volsize ? EINVAL : error); - if (!(bp->b_flags & B_ASYNC) && !doread && !zil_disable && !is_dump) + if (sync) zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ); biodone(bp); @@ -1280,6 +1349,7 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) uint64_t volsize; rl_t *rl; int error = 0; + boolean_t sync; if (minor == 0) /* This is the control device */ return (ENXIO); @@ -1299,6 +1369,8 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) return (error); } + sync = !(zv->zv_flags & ZVOL_WCE) && !zil_disable; + rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid, RL_WRITER); while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { @@ -1317,14 +1389,14 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) } error = dmu_write_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes, tx); if (error == 0) - zvol_log_write(zv, tx, off, bytes); + zvol_log_write(zv, tx, off, bytes, sync); dmu_tx_commit(tx); if (error) break; } zfs_range_unlock(rl); - if (!zil_disable) + if (sync) zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ); return (error); } @@ -1408,6 +1480,7 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) mutex_exit(&zvol_state_lock); return (ENXIO); } + ASSERT(zv->zv_total_opens > 0); switch (cmd) { @@ -1444,12 +1517,40 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) case DKIOCFLUSHWRITECACHE: dkc = (struct dk_callback *)arg; + mutex_exit(&zvol_state_lock); zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ); if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) { (*dkc->dkc_callback)(dkc->dkc_cookie, error); error = 0; } - break; + return (error); + + case DKIOCGETWCE: + { + int wce = (zv->zv_flags & ZVOL_WCE) ? 1 : 0; + if (ddi_copyout(&wce, (void *)arg, sizeof (int), + flag)) + error = EFAULT; + break; + } + case DKIOCSETWCE: + { + int wce; + if (ddi_copyin((void *)arg, &wce, sizeof (int), + flag)) { + error = EFAULT; + break; + } + if (wce) { + zv->zv_flags |= ZVOL_WCE; + mutex_exit(&zvol_state_lock); + } else { + zv->zv_flags &= ~ZVOL_WCE; + mutex_exit(&zvol_state_lock); + zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ); + } + return (0); + } case DKIOCGGEOM: case DKIOCGVTOC: @@ -1468,6 +1569,8 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) break; case DKIOCDUMPFINI: + if (!(zv->zv_flags & ZVOL_DUMPIFIED)) + break; rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize, RL_WRITER); error = zvol_dump_fini(zv); |