diff options
author | Matthew Ahrens <[email protected]> | 2011-11-17 10:14:36 -0800 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2012-07-31 09:25:30 -0700 |
commit | 330d06f90d143b41b276796526a66a1c1fff046d (patch) | |
tree | e263b2fb2b9846fba94bb1a17ba0b03fc6ebf18b /module | |
parent | 7eebaff409d969394dccb9b12e30aef466cc04b2 (diff) |
Illumos #1644, #1645, #1646, #1647, #1708
1644 add ZFS "clones" property
1645 add ZFS "written" and "written@..." properties
1646 "zfs send" should estimate size of stream
1647 "zfs destroy" should determine space reclaimed by
destroying multiple snapshots
1708 adjust size of zpool history data
References:
https://www.illumos.org/issues/1644
https://www.illumos.org/issues/1645
https://www.illumos.org/issues/1646
https://www.illumos.org/issues/1647
https://www.illumos.org/issues/1708
This commit modifies the user to kernel space ioctl ABI. Extra
care should be taken when updating to ensure both the kernel
modules and utilities are updated. This change has reordered
all of the new ioctl()s to the end of the list. This should
help minimize this issue in the future.
Reviewed by: Richard Lowe <[email protected]>
Reviewed by: George Wilson <[email protected]>
Reviewed by: Albert Lee <[email protected]>
Approved by: Garrett D'Amore <[email protected]>
Ported by: Martin Matuska <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Closes #826
Closes #664
Diffstat (limited to 'module')
-rw-r--r-- | module/zcommon/zfs_prop.c | 18 | ||||
-rw-r--r-- | module/zfs/bpobj.c | 7 | ||||
-rw-r--r-- | module/zfs/dmu_send.c | 107 | ||||
-rw-r--r-- | module/zfs/dsl_dataset.c | 320 | ||||
-rw-r--r-- | module/zfs/dsl_deadlist.c | 30 | ||||
-rw-r--r-- | module/zfs/dsl_deleg.c | 11 | ||||
-rw-r--r-- | module/zfs/dsl_pool.c | 4 | ||||
-rw-r--r-- | module/zfs/spa_history.c | 7 | ||||
-rw-r--r-- | module/zfs/zap_micro.c | 3 | ||||
-rw-r--r-- | module/zfs/zfs_ioctl.c | 214 |
10 files changed, 594 insertions, 127 deletions
diff --git a/module/zcommon/zfs_prop.c b/module/zcommon/zfs_prop.c index 9afe3d900..0a0c25bd4 100644 --- a/module/zcommon/zfs_prop.c +++ b/module/zcommon/zfs_prop.c @@ -264,7 +264,7 @@ zfs_prop_init(void) /* default index properties */ zprop_register_index(ZFS_PROP_VERSION, "version", 0, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, - "1 | 2 | 3 | 4 | current", "VERSION", version_table); + "1 | 2 | 3 | 4 | 5 | current", "VERSION", version_table); zprop_register_index(ZFS_PROP_CANMOUNT, "canmount", ZFS_CANMOUNT_ON, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, "on | off | noauto", "CANMOUNT", canmount_table); @@ -294,6 +294,8 @@ zfs_prop_init(void) /* string properties */ zprop_register_string(ZFS_PROP_ORIGIN, "origin", NULL, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<snapshot>", "ORIGIN"); + zprop_register_string(ZFS_PROP_CLONES, "clones", NULL, PROP_READONLY, + ZFS_TYPE_SNAPSHOT, "<dataset>[,...]", "CLONES"); zprop_register_string(ZFS_PROP_MOUNTPOINT, "mountpoint", "/", PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "<path> | legacy | none", "MOUNTPOINT"); @@ -339,6 +341,8 @@ zfs_prop_init(void) ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "USEDREFRESERV"); zprop_register_number(ZFS_PROP_USERREFS, "userrefs", 0, PROP_READONLY, ZFS_TYPE_SNAPSHOT, "<count>", "USERREFS"); + zprop_register_number(ZFS_PROP_WRITTEN, "written", 0, PROP_READONLY, + ZFS_TYPE_DATASET, "<size>", "WRITTEN"); /* default number properties */ zprop_register_number(ZFS_PROP_QUOTA, "quota", 0, PROP_DEFAULT, @@ -472,6 +476,18 @@ zfs_prop_userquota(const char *name) } /* + * Returns true if this is a valid written@ property. + * Note that after the @, any character is valid (eg, another @, for + * written@pool/fs@origin). + */ +boolean_t +zfs_prop_written(const char *name) +{ + static const char *prefix = "written@"; + return (strncmp(name, prefix, strlen(prefix)) == 0); +} + +/* * Tables of index types, plus functions to convert between the user view * (strings) and internal representation (uint64_t). */ diff --git a/module/zfs/bpobj.c b/module/zfs/bpobj.c index 72be31235..022921c66 100644 --- a/module/zfs/bpobj.c +++ b/module/zfs/bpobj.c @@ -20,11 +20,13 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. */ #include <sys/bpobj.h> #include <sys/zfs_context.h> #include <sys/refcount.h> +#include <sys/dsl_pool.h> uint64_t bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx) @@ -440,7 +442,10 @@ space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) struct space_range_arg *sra = arg; if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) { - sra->used += bp_get_dsize_sync(sra->spa, bp); + if (dsl_pool_sync_context(spa_get_dsl(sra->spa))) + sra->used += bp_get_dsize_sync(sra->spa, bp); + else + sra->used += bp_get_dsize(sra->spa, bp); sra->comp += BP_GET_PSIZE(bp); sra->uncomp += BP_GET_UCSIZE(bp); } diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index fad770e27..2f0613211 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -20,9 +20,9 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - */ -/* + * Copyright (c) 2011 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. */ #include <sys/dmu.h> @@ -47,6 +47,9 @@ #include <sys/ddt.h> #include <sys/zfs_onexit.h> +/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */ +int zfs_send_corrupt_data = B_FALSE; + static char *dmu_recv_tag = "dmu_recv_tag"; /* @@ -368,8 +371,20 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, if (dsl_read(NULL, spa, bp, pbuf, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, - ZIO_FLAG_CANFAIL, &aflags, zb) != 0) - return (EIO); + ZIO_FLAG_CANFAIL, &aflags, zb) != 0) { + if (zfs_send_corrupt_data) { + uint64_t *ptr; + /* Send a block filled with 0x"zfs badd bloc" */ + abuf = arc_buf_alloc(spa, blksz, &abuf, + ARC_BUFC_DATA); + for (ptr = abuf->b_data; + (char *)ptr < (char *)abuf->b_data + blksz; + ptr++) + *ptr = 0x2f5baddb10c; + } else { + return (EIO); + } + } err = dump_data(ba, type, zb->zb_object, zb->zb_blkid * blksz, blksz, bp, abuf->b_data); @@ -498,6 +513,85 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, return (0); } +int +dmu_send_estimate(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, + uint64_t *sizep) +{ + dsl_dataset_t *ds = tosnap->os_dsl_dataset; + dsl_dataset_t *fromds = fromsnap ? fromsnap->os_dsl_dataset : NULL; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + int err; + uint64_t size, recordsize; + + /* tosnap must be a snapshot */ + if (ds->ds_phys->ds_next_snap_obj == 0) + return (EINVAL); + + /* fromsnap must be an earlier snapshot from the same fs as tosnap */ + if (fromds && (ds->ds_dir != fromds->ds_dir || + fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg)) + return (EXDEV); + + if (fromorigin) { + if (fromsnap) + return (EINVAL); + + if (dsl_dir_is_clone(ds->ds_dir)) { + rw_enter(&dp->dp_config_rwlock, RW_READER); + err = dsl_dataset_hold_obj(dp, + ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &fromds); + rw_exit(&dp->dp_config_rwlock); + if (err) + return (err); + } else { + fromorigin = B_FALSE; + } + } + + /* Get uncompressed size estimate of changed data. */ + if (fromds == NULL) { + size = ds->ds_phys->ds_uncompressed_bytes; + } else { + uint64_t used, comp; + err = dsl_dataset_space_written(fromds, ds, + &used, &comp, &size); + if (fromorigin) + dsl_dataset_rele(fromds, FTAG); + if (err) + return (err); + } + + /* + * Assume that space (both on-disk and in-stream) is dominated by + * data. We will adjust for indirect blocks and the copies property, + * but ignore per-object space used (eg, dnodes and DRR_OBJECT records). + */ + + /* + * Subtract out approximate space used by indirect blocks. + * Assume most space is used by data blocks (non-indirect, non-dnode). + * Assume all blocks are recordsize. Assume ditto blocks and + * internal fragmentation counter out compression. + * + * Therefore, space used by indirect blocks is sizeof(blkptr_t) per + * block, which we observe in practice. + */ + rw_enter(&dp->dp_config_rwlock, RW_READER); + err = dsl_prop_get_ds(ds, "recordsize", + sizeof (recordsize), 1, &recordsize, NULL); + rw_exit(&dp->dp_config_rwlock); + if (err) + return (err); + size -= size / recordsize * sizeof (blkptr_t); + + /* Add in the space for the record associated with each block. */ + size += size / recordsize * sizeof (dmu_replay_record_t); + + *sizep = size; + + return (0); +} + struct recvbeginsyncarg { const char *tofs; const char *tosnap; @@ -1500,7 +1594,7 @@ dmu_recv_existing_end(dmu_recv_cookie_t *drc) { struct recvendsyncarg resa; dsl_dataset_t *ds = drc->drc_logical_ds; - int err; + int err, myerr; /* * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean() @@ -1538,7 +1632,8 @@ out: if (err == 0 && drc->drc_guid_to_ds_map != NULL) (void) add_ds_to_guidmap(drc->drc_guid_to_ds_map, ds); dsl_dataset_disown(ds, dmu_recv_tag); - (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, B_FALSE); + myerr = dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, B_FALSE); + ASSERT3U(myerr, ==, 0); return (err); } diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 2deec8cf1..25c8ac6b1 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -907,69 +907,56 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, return (dsobj); } -struct destroyarg { - dsl_sync_task_group_t *dstg; - char *snapname; - char *failed; - boolean_t defer; -}; - -static int -dsl_snapshot_destroy_one(const char *name, void *arg) -{ - struct destroyarg *da = arg; - dsl_dataset_t *ds; - int err; - char *dsname; - - dsname = kmem_asprintf("%s@%s", name, da->snapname); - err = dsl_dataset_own(dsname, B_TRUE, da->dstg, &ds); - strfree(dsname); - if (err == 0) { - struct dsl_ds_destroyarg *dsda; - - dsl_dataset_make_exclusive(ds, da->dstg); - dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg), KM_SLEEP); - dsda->ds = ds; - dsda->defer = da->defer; - dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check, - dsl_dataset_destroy_sync, dsda, da->dstg, 0); - } else if (err == ENOENT) { - err = 0; - } else { - (void) strcpy(da->failed, name); - } - return (err); -} - /* - * Destroy 'snapname' in all descendants of 'fsname'. + * The snapshots must all be in the same pool. */ -#pragma weak dmu_snapshots_destroy = dsl_snapshots_destroy int -dsl_snapshots_destroy(char *fsname, char *snapname, boolean_t defer) +dmu_snapshots_destroy_nvl(nvlist_t *snaps, boolean_t defer, char *failed) { int err; - struct destroyarg da; dsl_sync_task_t *dst; spa_t *spa; + nvpair_t *pair; + dsl_sync_task_group_t *dstg; - err = spa_open(fsname, &spa, FTAG); + pair = nvlist_next_nvpair(snaps, NULL); + if (pair == NULL) + return (0); + + err = spa_open(nvpair_name(pair), &spa, FTAG); if (err) return (err); - da.dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); - da.snapname = snapname; - da.failed = fsname; - da.defer = defer; + dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); - err = dmu_objset_find(fsname, - dsl_snapshot_destroy_one, &da, DS_FIND_CHILDREN); + for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; + pair = nvlist_next_nvpair(snaps, pair)) { + dsl_dataset_t *ds; + int err; + + err = dsl_dataset_own(nvpair_name(pair), B_TRUE, dstg, &ds); + if (err == 0) { + struct dsl_ds_destroyarg *dsda; + + dsl_dataset_make_exclusive(ds, dstg); + dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg), + KM_SLEEP); + dsda->ds = ds; + dsda->defer = defer; + dsl_sync_task_create(dstg, dsl_dataset_destroy_check, + dsl_dataset_destroy_sync, dsda, dstg, 0); + } else if (err == ENOENT) { + err = 0; + } else { + (void) strcpy(failed, nvpair_name(pair)); + break; + } + } if (err == 0) - err = dsl_sync_task_group_wait(da.dstg); + err = dsl_sync_task_group_wait(dstg); - for (dst = list_head(&da.dstg->dstg_tasks); dst; - dst = list_next(&da.dstg->dstg_tasks, dst)) { + for (dst = list_head(&dstg->dstg_tasks); dst; + dst = list_next(&dstg->dstg_tasks, dst)) { struct dsl_ds_destroyarg *dsda = dst->dst_arg1; dsl_dataset_t *ds = dsda->ds; @@ -977,17 +964,17 @@ dsl_snapshots_destroy(char *fsname, char *snapname, boolean_t defer) * Return the file system name that triggered the error */ if (dst->dst_err) { - dsl_dataset_name(ds, fsname); - *strchr(fsname, '@') = '\0'; + dsl_dataset_name(ds, failed); } ASSERT3P(dsda->rm_origin, ==, NULL); - dsl_dataset_disown(ds, da.dstg); + dsl_dataset_disown(ds, dstg); kmem_free(dsda, sizeof (struct dsl_ds_destroyarg)); } - dsl_sync_task_group_destroy(da.dstg); + dsl_sync_task_group_destroy(dstg); spa_close(spa, FTAG); return (err); + } static boolean_t @@ -2151,6 +2138,55 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) dmu_objset_sync(ds->ds_objset, zio, tx); } +static void +get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv) +{ + uint64_t count = 0; + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + zap_cursor_t zc; + zap_attribute_t za; + nvlist_t *propval; + nvlist_t *val; + + rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); + VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_alloc(&val, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + /* + * There may me missing entries in ds_next_clones_obj + * due to a bug in a previous version of the code. + * Only trust it if it has the right number of entries. + */ + if (ds->ds_phys->ds_next_clones_obj != 0) { + ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj, + &count)); + } + if (count != ds->ds_phys->ds_num_children - 1) { + goto fail; + } + for (zap_cursor_init(&zc, mos, ds->ds_phys->ds_next_clones_obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + dsl_dataset_t *clone; + char buf[ZFS_MAXNAMELEN]; + if (dsl_dataset_hold_obj(ds->ds_dir->dd_pool, + za.za_first_integer, FTAG, &clone) != 0) { + goto fail; + } + dsl_dir_name(clone->ds_dir, buf); + VERIFY(nvlist_add_boolean(val, buf) == 0); + dsl_dataset_rele(clone, FTAG); + } + zap_cursor_fini(&zc); + VERIFY(nvlist_add_nvlist(propval, ZPROP_VALUE, val) == 0); + VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES), + propval) == 0); +fail: + nvlist_free(val); + nvlist_free(propval); + rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); +} + void dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) { @@ -2181,6 +2217,27 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY, DS_IS_DEFER_DESTROY(ds) ? 1 : 0); + if (ds->ds_phys->ds_prev_snap_obj != 0) { + uint64_t written, comp, uncomp; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + dsl_dataset_t *prev; + int err; + + rw_enter(&dp->dp_config_rwlock, RW_READER); + err = dsl_dataset_hold_obj(dp, + ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); + rw_exit(&dp->dp_config_rwlock); + if (err == 0) { + err = dsl_dataset_space_written(prev, ds, &written, + &comp, &uncomp); + dsl_dataset_rele(prev, FTAG); + if (err == 0) { + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN, + written); + } + } + } + ratio = ds->ds_phys->ds_compressed_bytes == 0 ? 100 : (ds->ds_phys->ds_uncompressed_bytes * 100 / ds->ds_phys->ds_compressed_bytes); @@ -2194,6 +2251,8 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, ds->ds_phys->ds_unique_bytes); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio); + + get_clones_stat(ds, nv); } } @@ -4022,7 +4081,7 @@ dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp) } /* - * Note, this fuction is used as the callback for dmu_objset_find(). We + * Note, this function is used as the callback for dmu_objset_find(). We * always return 0 so that we will continue to find and process * inconsistent datasets, even if we encounter an error trying to * process one of them. @@ -4042,7 +4101,157 @@ dsl_destroy_inconsistent(const char *dsname, void *arg) return (0); } + +/* + * Return (in *usedp) the amount of space written in new that is not + * present in oldsnap. New may be a snapshot or the head. Old must be + * a snapshot before new, in new's filesystem (or its origin). If not then + * fail and return EINVAL. + * + * The written space is calculated by considering two components: First, we + * ignore any freed space, and calculate the written as new's used space + * minus old's used space. Next, we add in the amount of space that was freed + * between the two snapshots, thus reducing new's used space relative to old's. + * Specifically, this is the space that was born before old->ds_creation_txg, + * and freed before new (ie. on new's deadlist or a previous deadlist). + * + * space freed [---------------------] + * snapshots ---O-------O--------O-------O------ + * oldsnap new + */ +int +dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + int err = 0; + uint64_t snapobj; + dsl_pool_t *dp = new->ds_dir->dd_pool; + + *usedp = 0; + *usedp += new->ds_phys->ds_used_bytes; + *usedp -= oldsnap->ds_phys->ds_used_bytes; + + *compp = 0; + *compp += new->ds_phys->ds_compressed_bytes; + *compp -= oldsnap->ds_phys->ds_compressed_bytes; + + *uncompp = 0; + *uncompp += new->ds_phys->ds_uncompressed_bytes; + *uncompp -= oldsnap->ds_phys->ds_uncompressed_bytes; + + rw_enter(&dp->dp_config_rwlock, RW_READER); + snapobj = new->ds_object; + while (snapobj != oldsnap->ds_object) { + dsl_dataset_t *snap; + uint64_t used, comp, uncomp; + + err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap); + if (err != 0) + break; + + if (snap->ds_phys->ds_prev_snap_txg == + oldsnap->ds_phys->ds_creation_txg) { + /* + * The blocks in the deadlist can not be born after + * ds_prev_snap_txg, so get the whole deadlist space, + * which is more efficient (especially for old-format + * deadlists). Unfortunately the deadlist code + * doesn't have enough information to make this + * optimization itself. + */ + dsl_deadlist_space(&snap->ds_deadlist, + &used, &comp, &uncomp); + } else { + dsl_deadlist_space_range(&snap->ds_deadlist, + 0, oldsnap->ds_phys->ds_creation_txg, + &used, &comp, &uncomp); + } + *usedp += used; + *compp += comp; + *uncompp += uncomp; + + /* + * If we get to the beginning of the chain of snapshots + * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap + * was not a snapshot of/before new. + */ + snapobj = snap->ds_phys->ds_prev_snap_obj; + dsl_dataset_rele(snap, FTAG); + if (snapobj == 0) { + err = EINVAL; + break; + } + + } + rw_exit(&dp->dp_config_rwlock); + return (err); +} + +/* + * Return (in *usedp) the amount of space that will be reclaimed if firstsnap, + * lastsnap, and all snapshots in between are deleted. + * + * blocks that would be freed [---------------------------] + * snapshots ---O-------O--------O-------O--------O + * firstsnap lastsnap + * + * This is the set of blocks that were born after the snap before firstsnap, + * (birth > firstsnap->prev_snap_txg) and died before the snap after the + * last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist). + * We calculate this by iterating over the relevant deadlists (from the snap + * after lastsnap, backward to the snap after firstsnap), summing up the + * space on the deadlist that was born after the snap before firstsnap. + */ +int +dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, + dsl_dataset_t *lastsnap, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + int err = 0; + uint64_t snapobj; + dsl_pool_t *dp = firstsnap->ds_dir->dd_pool; + + ASSERT(dsl_dataset_is_snapshot(firstsnap)); + ASSERT(dsl_dataset_is_snapshot(lastsnap)); + + /* + * Check that the snapshots are in the same dsl_dir, and firstsnap + * is before lastsnap. + */ + if (firstsnap->ds_dir != lastsnap->ds_dir || + firstsnap->ds_phys->ds_creation_txg > + lastsnap->ds_phys->ds_creation_txg) + return (EINVAL); + + *usedp = *compp = *uncompp = 0; + + rw_enter(&dp->dp_config_rwlock, RW_READER); + snapobj = lastsnap->ds_phys->ds_next_snap_obj; + while (snapobj != firstsnap->ds_object) { + dsl_dataset_t *ds; + uint64_t used, comp, uncomp; + + err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds); + if (err != 0) + break; + + dsl_deadlist_space_range(&ds->ds_deadlist, + firstsnap->ds_phys->ds_prev_snap_txg, UINT64_MAX, + &used, &comp, &uncomp); + *usedp += used; + *compp += comp; + *uncompp += uncomp; + + snapobj = ds->ds_phys->ds_prev_snap_obj; + ASSERT3U(snapobj, !=, 0); + dsl_dataset_rele(ds, FTAG); + } + rw_exit(&dp->dp_config_rwlock); + return (err); +} + #if defined(_KERNEL) && defined(HAVE_SPL) +EXPORT_SYMBOL(dmu_snapshots_destroy_nvl); EXPORT_SYMBOL(dsl_dataset_hold); EXPORT_SYMBOL(dsl_dataset_hold_obj); EXPORT_SYMBOL(dsl_dataset_own); @@ -4056,7 +4265,6 @@ EXPORT_SYMBOL(dsl_dataset_make_exclusive); EXPORT_SYMBOL(dsl_dataset_create_sync); EXPORT_SYMBOL(dsl_dataset_create_sync_dd); EXPORT_SYMBOL(dsl_dataset_destroy); -EXPORT_SYMBOL(dsl_snapshots_destroy); EXPORT_SYMBOL(dsl_dataset_destroy_check); EXPORT_SYMBOL(dsl_dataset_destroy_sync); EXPORT_SYMBOL(dsl_dataset_snapshot_check); @@ -4072,6 +4280,8 @@ EXPORT_SYMBOL(dsl_dataset_get_blkptr); EXPORT_SYMBOL(dsl_dataset_set_blkptr); EXPORT_SYMBOL(dsl_dataset_get_spa); EXPORT_SYMBOL(dsl_dataset_modified_since_lastsnap); +EXPORT_SYMBOL(dsl_dataset_space_written); +EXPORT_SYMBOL(dsl_dataset_space_wouldfree); EXPORT_SYMBOL(dsl_dataset_sync); EXPORT_SYMBOL(dsl_dataset_block_born); EXPORT_SYMBOL(dsl_dataset_block_kill); diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c index 064f8aceb..dd6db2120 100644 --- a/module/zfs/dsl_deadlist.c +++ b/module/zfs/dsl_deadlist.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. */ #include <sys/dsl_dataset.h> @@ -29,6 +30,26 @@ #include <sys/zfs_context.h> #include <sys/dsl_pool.h> +/* + * Deadlist concurrency: + * + * Deadlists can only be modified from the syncing thread. + * + * Except for dsl_deadlist_insert(), it can only be modified with the + * dp_config_rwlock held with RW_WRITER. + * + * The accessors (dsl_deadlist_space() and dsl_deadlist_space_range()) can + * be called concurrently, from open context, with the dl_config_rwlock held + * with RW_READER. + * + * Therefore, we only need to provide locking between dsl_deadlist_insert() and + * the accessors, protecting: + * dl_phys->dl_used,comp,uncomp + * and protecting the dl_tree from being loaded. + * The locking is provided by dl_lock. Note that locking on the bpobj_t + * provides its own locking, and dl_oldfmt is immutable. + */ + static int dsl_deadlist_compare(const void *arg1, const void *arg2) { @@ -309,14 +330,14 @@ dsl_deadlist_space(dsl_deadlist_t *dl, * return space used in the range (mintxg, maxtxg]. * Includes maxtxg, does not include mintxg. * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is - * UINT64_MAX). + * larger than any bp in the deadlist (eg. UINT64_MAX)). */ void dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) { - dsl_deadlist_entry_t dle_tofind; dsl_deadlist_entry_t *dle; + dsl_deadlist_entry_t dle_tofind; avl_index_t where; if (dl->dl_oldfmt) { @@ -325,9 +346,10 @@ dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg, return; } - dsl_deadlist_load_tree(dl); *usedp = *compp = *uncompp = 0; + mutex_enter(&dl->dl_lock); + dsl_deadlist_load_tree(dl); dle_tofind.dle_mintxg = mintxg; dle = avl_find(&dl->dl_tree, &dle_tofind, &where); /* @@ -336,6 +358,7 @@ dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg, */ ASSERT(dle != NULL || avl_nearest(&dl->dl_tree, where, AVL_AFTER) == NULL); + for (; dle && dle->dle_mintxg < maxtxg; dle = AVL_NEXT(&dl->dl_tree, dle)) { uint64_t used, comp, uncomp; @@ -347,6 +370,7 @@ dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg, *compp += comp; *uncompp += uncomp; } + mutex_exit(&dl->dl_lock); } static void diff --git a/module/zfs/dsl_deleg.c b/module/zfs/dsl_deleg.c index 6b5c8424a..a4d4e42da 100644 --- a/module/zfs/dsl_deleg.c +++ b/module/zfs/dsl_deleg.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. */ /* @@ -534,10 +535,12 @@ dsl_load_user_sets(objset_t *mos, uint64_t zapobj, avl_tree_t *avl, } /* - * Check if user has requested permission. + * Check if user has requested permission. If descendent is set, must have + * descendent perms. */ int -dsl_deleg_access_impl(dsl_dataset_t *ds, const char *perm, cred_t *cr) +dsl_deleg_access_impl(dsl_dataset_t *ds, boolean_t descendent, const char *perm, + cred_t *cr) { dsl_dir_t *dd; dsl_pool_t *dp; @@ -558,7 +561,7 @@ dsl_deleg_access_impl(dsl_dataset_t *ds, const char *perm, cred_t *cr) SPA_VERSION_DELEGATED_PERMS) return (EPERM); - if (dsl_dataset_is_snapshot(ds)) { + if (dsl_dataset_is_snapshot(ds) || descendent) { /* * Snapshots are treated as descendents only, * local permissions do not apply. @@ -651,7 +654,7 @@ dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr) if (error) return (error); - error = dsl_deleg_access_impl(ds, perm, cr); + error = dsl_deleg_access_impl(ds, B_FALSE, perm, cr); dsl_dataset_rele(ds, FTAG); return (error); diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index d428b7ad7..3b285df65 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. */ #include <sys/dsl_pool.h> @@ -291,7 +292,10 @@ static int deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { dsl_deadlist_t *dl = arg; + dsl_pool_t *dp = dmu_objset_pool(dl->dl_os); + rw_enter(&dp->dp_config_rwlock, RW_READER); dsl_deadlist_insert(dl, bp, tx); + rw_exit(&dp->dp_config_rwlock); return (0); } diff --git a/module/zfs/spa_history.c b/module/zfs/spa_history.c index 243f2b4ab..7a2537875 100644 --- a/module/zfs/spa_history.c +++ b/module/zfs/spa_history.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. */ #include <sys/spa.h> @@ -101,11 +102,11 @@ spa_history_create_obj(spa_t *spa, dmu_tx_t *tx) /* * Figure out maximum size of history log. We set it at - * 1% of pool size, with a max of 32MB and min of 128KB. + * 0.1% of pool size, with a max of 1G and min of 128KB. */ shpp->sh_phys_max_off = - metaslab_class_get_dspace(spa_normal_class(spa)) / 100; - shpp->sh_phys_max_off = MIN(shpp->sh_phys_max_off, 32<<20); + metaslab_class_get_dspace(spa_normal_class(spa)) / 1000; + shpp->sh_phys_max_off = MIN(shpp->sh_phys_max_off, 1<<30); shpp->sh_phys_max_off = MAX(shpp->sh_phys_max_off, 128<<10); dmu_buf_rele(dbp, FTAG); diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index 49aad2a3b..bd3d4a8d8 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. */ #include <sys/zio.h> @@ -1404,7 +1405,7 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add, } /* - * We lock the zap with adding == FALSE. Because, if we pass + * We lock the zap with adding == FALSE. Because, if we pass * the actual value of add, it could trigger a mzap_upgrade(). * At present we are just evaluating the possibility of this operation * and hence we donot want to trigger an upgrade. diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 65b0a1975..94c91e876 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -23,8 +23,6 @@ * Portions Copyright 2011 Martin Matuska * Portions Copyright 2012 Pawel Jakub Dawidek <[email protected]> * Copyright (c) 2012, Joyent, Inc. All rights reserved. - */ -/* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011 by Delphix. All rights reserved. */ @@ -319,17 +317,37 @@ zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr) return (zfs_dozonecheck_impl(dataset, zoned, cr)); } +/* + * If name ends in a '@', then require recursive permissions. + */ int zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) { int error; + boolean_t descendent = B_FALSE; + dsl_dataset_t *ds; + char *at; + + at = strchr(name, '@'); + if (at != NULL && at[1] == '\0') { + *at = '\0'; + descendent = B_TRUE; + } + + error = dsl_dataset_hold(name, FTAG, &ds); + if (at != NULL) + *at = '@'; + if (error != 0) + return (error); - error = zfs_dozonecheck(name, cr); + error = zfs_dozonecheck_ds(name, ds, cr); if (error == 0) { error = secpolicy_zfs(cr); if (error) - error = dsl_deleg_access(name, perm, cr); + error = dsl_deleg_access_impl(ds, descendent, perm, cr); } + + dsl_dataset_rele(ds, FTAG); return (error); } @@ -343,7 +361,7 @@ zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds, if (error == 0) { error = secpolicy_zfs(cr); if (error) - error = dsl_deleg_access_impl(ds, perm, cr); + error = dsl_deleg_access_impl(ds, B_FALSE, perm, cr); } return (error); } @@ -666,24 +684,14 @@ zfs_secpolicy_destroy(zfs_cmd_t *zc, cred_t *cr) /* * Destroying snapshots with delegated permissions requires * descendent mount and destroy permissions. - * Reassemble the full filesystem@snap name so dsl_deleg_access() - * can do the correct permission check. - * - * Since this routine is used when doing a recursive destroy of snapshots - * and destroying snapshots requires descendent permissions, a successfull - * check of the top level snapshot applies to snapshots of all descendent - * datasets as well. - * - * The target snapshot may not exist when doing a recursive destroy. - * In this case fallback to permissions of the parent dataset. */ static int -zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, cred_t *cr) +zfs_secpolicy_destroy_recursive(zfs_cmd_t *zc, cred_t *cr) { int error; char *dsname; - dsname = kmem_asprintf("%s@%s", zc->zc_name, zc->zc_value); + dsname = kmem_asprintf("%s@", zc->zc_name); error = zfs_secpolicy_destroy_perms(dsname, cr); if (error == ENOENT) @@ -1742,9 +1750,12 @@ zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) * inconsistent. So this is a bit of a workaround... * XXX reading with out owning */ - if (!zc->zc_objset_stats.dds_inconsistent) { - if (dmu_objset_type(os) == DMU_OST_ZVOL) - error = zvol_get_stats(os, nv); + if (!zc->zc_objset_stats.dds_inconsistent && + dmu_objset_type(os) == DMU_OST_ZVOL) { + error = zvol_get_stats(os, nv); + if (error == EIO) + return (error); + VERIFY3S(error, ==, 0); } if (error == 0) error = put_nvlist(zc, nv); @@ -1954,8 +1965,7 @@ top: NULL, &zc->zc_cookie); if (error == ENOENT) error = ESRCH; - } while (error == 0 && dataset_name_hidden(zc->zc_name) && - !(zc->zc_iflags & FKIOCTL)); + } while (error == 0 && dataset_name_hidden(zc->zc_name)); dmu_objset_rele(os, FTAG); /* @@ -2233,6 +2243,8 @@ retry: if (nvpair_type(propval) != DATA_TYPE_UINT64_ARRAY) err = EINVAL; + } else { + err = EINVAL; } } else if (err == 0) { if (nvpair_type(propval) == DATA_TYPE_STRING) { @@ -3118,25 +3130,45 @@ zfs_unmount_snap(const char *name, void *arg) /* * inputs: - * zc_name name of filesystem - * zc_value short name of snapshot + * zc_name name of filesystem, snaps must be under it + * zc_nvlist_src[_size] full names of snapshots to destroy * zc_defer_destroy mark for deferred destroy * - * outputs: none + * outputs: + * zc_name on failure, name of failed snapshot */ static int -zfs_ioc_destroy_snaps(zfs_cmd_t *zc) +zfs_ioc_destroy_snaps_nvl(zfs_cmd_t *zc) { - int err; + int err, len; + nvlist_t *nvl; + nvpair_t *pair; - if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) - return (EINVAL); - err = dmu_objset_find(zc->zc_name, - zfs_unmount_snap, zc->zc_value, DS_FIND_CHILDREN); - if (err) + if ((err = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &nvl)) != 0) return (err); - return (dmu_snapshots_destroy(zc->zc_name, zc->zc_value, - zc->zc_defer_destroy)); + + len = strlen(zc->zc_name); + for (pair = nvlist_next_nvpair(nvl, NULL); pair != NULL; + pair = nvlist_next_nvpair(nvl, pair)) { + const char *name = nvpair_name(pair); + /* + * The snap name must be underneath the zc_name. This ensures + * that our permission checks were legitimate. + */ + if (strncmp(zc->zc_name, name, len) != 0 || + (name[len] != '@' && name[len] != '/')) { + nvlist_free(nvl); + return (EINVAL); + } + + (void) zfs_unmount_snap(name, NULL); + } + + err = dmu_snapshots_destroy_nvl(nvl, zc->zc_defer_destroy, + zc->zc_name); + nvlist_free(nvl); + return (err); } /* @@ -3787,6 +3819,8 @@ out: * zc_obj fromorigin flag (mutually exclusive with zc_fromobj) * zc_sendobj objsetid of snapshot to send * zc_fromobj objsetid of incremental fromsnap (may be zero) + * zc_guid if set, estimate size of stream only. zc_cookie is ignored. + * output size in zc_objset_type. * * outputs: none */ @@ -3795,13 +3829,13 @@ zfs_ioc_send(zfs_cmd_t *zc) { objset_t *fromsnap = NULL; objset_t *tosnap; - file_t *fp; int error; offset_t off; dsl_dataset_t *ds; dsl_dataset_t *dsfrom = NULL; spa_t *spa; dsl_pool_t *dp; + boolean_t estimate = (zc->zc_guid != 0); error = spa_open(zc->zc_name, &spa, FTAG); if (error) @@ -3842,20 +3876,26 @@ zfs_ioc_send(zfs_cmd_t *zc) spa_close(spa, FTAG); } - fp = getf(zc->zc_cookie); - if (fp == NULL) { - dsl_dataset_rele(ds, FTAG); - if (dsfrom) - dsl_dataset_rele(dsfrom, FTAG); - return (EBADF); - } + if (estimate) { + error = dmu_send_estimate(tosnap, fromsnap, zc->zc_obj, + &zc->zc_objset_type); + } else { + file_t *fp = getf(zc->zc_cookie); + if (fp == NULL) { + dsl_dataset_rele(ds, FTAG); + if (dsfrom) + dsl_dataset_rele(dsfrom, FTAG); + return (EBADF); + } - off = fp->f_offset; - error = dmu_sendbackup(tosnap, fromsnap, zc->zc_obj, fp->f_vnode, &off); + off = fp->f_offset; + error = dmu_sendbackup(tosnap, fromsnap, zc->zc_obj, + fp->f_vnode, &off); - if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) - fp->f_offset = off; - releasef(zc->zc_cookie); + if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) + fp->f_offset = off; + releasef(zc->zc_cookie); + } if (dsfrom) dsl_dataset_rele(dsfrom, FTAG); dsl_dataset_rele(ds, FTAG); @@ -4591,6 +4631,70 @@ zfs_ioc_events_clear(zfs_cmd_t *zc) } /* + * inputs: + * zc_name name of new filesystem or snapshot + * zc_value full name of old snapshot + * + * outputs: + * zc_cookie space in bytes + * zc_objset_type compressed space in bytes + * zc_perm_action uncompressed space in bytes + */ +static int +zfs_ioc_space_written(zfs_cmd_t *zc) +{ + int error; + dsl_dataset_t *new, *old; + + error = dsl_dataset_hold(zc->zc_name, FTAG, &new); + if (error != 0) + return (error); + error = dsl_dataset_hold(zc->zc_value, FTAG, &old); + if (error != 0) { + dsl_dataset_rele(new, FTAG); + return (error); + } + + error = dsl_dataset_space_written(old, new, &zc->zc_cookie, + &zc->zc_objset_type, &zc->zc_perm_action); + dsl_dataset_rele(old, FTAG); + dsl_dataset_rele(new, FTAG); + return (error); +} + +/* + * inputs: + * zc_name full name of last snapshot + * zc_value full name of first snapshot + * + * outputs: + * zc_cookie space in bytes + * zc_objset_type compressed space in bytes + * zc_perm_action uncompressed space in bytes + */ +static int +zfs_ioc_space_snaps(zfs_cmd_t *zc) +{ + int error; + dsl_dataset_t *new, *old; + + error = dsl_dataset_hold(zc->zc_name, FTAG, &new); + if (error != 0) + return (error); + error = dsl_dataset_hold(zc->zc_value, FTAG, &old); + if (error != 0) { + dsl_dataset_rele(new, FTAG); + return (error); + } + + error = dsl_dataset_space_wouldfree(old, new, &zc->zc_cookie, + &zc->zc_objset_type, &zc->zc_perm_action); + dsl_dataset_rele(old, FTAG); + dsl_dataset_rele(new, FTAG); + return (error); +} + +/* * pool create, destroy, and export don't log the history as part of * zfsdev_ioctl, but rather zfs_ioc_pool_create, and zfs_ioc_pool_export * do the logging of those commands. @@ -4656,7 +4760,7 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = { POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, { zfs_ioc_recv, zfs_secpolicy_receive, DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, - { zfs_ioc_send, zfs_secpolicy_send, DATASET_NAME, B_TRUE, + { zfs_ioc_send, zfs_secpolicy_send, DATASET_NAME, B_FALSE, POOL_CHECK_NONE }, { zfs_ioc_inject_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE, POOL_CHECK_NONE }, @@ -4670,8 +4774,8 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = { POOL_CHECK_NONE }, { zfs_ioc_promote, zfs_secpolicy_promote, DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, - { zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, DATASET_NAME, - B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, + { zfs_ioc_destroy_snaps_nvl, zfs_secpolicy_destroy_recursive, + DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, { zfs_ioc_snapshot, zfs_secpolicy_snapshot, DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_diff, POOL_NAME, B_FALSE, @@ -4716,12 +4820,16 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = { B_FALSE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, { zfs_ioc_obj_to_stats, zfs_secpolicy_diff, DATASET_NAME, B_FALSE, POOL_CHECK_SUSPENDED }, - { zfs_ioc_pool_reguid, zfs_secpolicy_config, POOL_NAME, B_TRUE, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, { zfs_ioc_events_next, zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE }, { zfs_ioc_events_clear, zfs_secpolicy_config, NO_NAME, B_FALSE, - POOL_CHECK_NONE } + POOL_CHECK_NONE }, + { zfs_ioc_pool_reguid, zfs_secpolicy_config, POOL_NAME, B_TRUE, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, + { zfs_ioc_space_written, zfs_secpolicy_read, DATASET_NAME, B_FALSE, + POOL_CHECK_SUSPENDED }, + { zfs_ioc_space_snaps, zfs_secpolicy_read, DATASET_NAME, B_FALSE, + POOL_CHECK_SUSPENDED }, }; int |