author | Brian Behlendorf <[email protected]> | 2016-06-29 11:26:30 -0700
committer | Brian Behlendorf <[email protected]> | 2016-06-29 13:42:23 -0700
commit | 5c27b296055301f13103ca0aa98c2ded01dcd9a0 (patch)
tree | 4813f9cb944c53140de7a12710f0ca7b12e3ec27 /module
parent | 669cf0ab298dd66e512b37f6c4a42aee2d767b80 (diff)
parent | 0dab2e84fcecff2806287efacb7c6205f346f69d (diff)
Merge branch 'illumos-2605'
Adds support for resuming interrupted zfs send streams and includes
all related send/recv bug fixes from upstream OpenZFS.
Unlike the upstream implementation, this branch does not change
the existing ioctl interface. Instead, a new ZFS_IOC_RECV_NEW ioctl
was added to support resuming zfs send streams. This was done by
applying the original upstream patch and then reverting the ioctl
changes in a follow-up patch. For this reason there are a handful
of commits between the relevant patches on this branch which are
not interoperable. This was done to make it easier to extract
the new ZFS_IOC_RECV_NEW ioctl and submit it upstream.
Signed-off-by: Brian Behlendorf <[email protected]>
Closes #4742
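The resume point travels inside the stream itself: when dmu_send_impl() (below) is asked to resume, it packs the object/offset pair into an nvlist and attaches it as the payload of the stream's BEGIN record; on the receive side, dmu_recv_stream() unpacks that payload and resume_check() compares it against the DS_FIELD_RESUME_* values the interrupted receive left in the dataset's ZAP. A minimal userland-flavored sketch of the sender half, using the same libnvpair fnvlist wrappers the kernel code in this diff uses (pack_resume_payload is a hypothetical helper, not part of this patch):

#include <libnvpair.h>

/*
 * Build the BEGIN-record payload carrying the resume point, mirroring
 * the resumeobj/resumeoff handling in dmu_send_impl() below.
 */
static void *
pack_resume_payload(uint64_t resumeobj, uint64_t resumeoff, size_t *lenp)
{
	nvlist_t *nvl = fnvlist_alloc();
	void *payload;

	/* Keys must match what resume_check() looks up on receive. */
	fnvlist_add_uint64(nvl, "resume_object", resumeobj);
	fnvlist_add_uint64(nvl, "resume_offset", resumeoff);
	payload = fnvlist_pack(nvl, lenp);	/* becomes drr_payloadlen bytes */
	fnvlist_free(nvl);
	return (payload);
}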
Diffstat (limited to 'module')
-rw-r--r-- | module/zcommon/zfs_fletcher.c | 14
-rw-r--r-- | module/zcommon/zfs_namecheck.c | 46
-rw-r--r-- | module/zcommon/zfs_prop.c | 4
-rw-r--r-- | module/zfs/dmu_objset.c | 34
-rw-r--r-- | module/zfs/dmu_send.c | 739
-rw-r--r-- | module/zfs/dmu_traverse.c | 81
-rw-r--r-- | module/zfs/dsl_bookmark.c | 4
-rw-r--r-- | module/zfs/dsl_dataset.c | 220
-rw-r--r-- | module/zfs/dsl_deleg.c | 6
-rw-r--r-- | module/zfs/dsl_destroy.c | 12
-rw-r--r-- | module/zfs/dsl_dir.c | 27
-rw-r--r-- | module/zfs/dsl_prop.c | 4
-rw-r--r-- | module/zfs/dsl_scan.c | 6
-rw-r--r-- | module/zfs/dsl_userhold.c | 6
-rw-r--r-- | module/zfs/spa.c | 24
-rw-r--r-- | module/zfs/spa_history.c | 6
-rw-r--r-- | module/zfs/zfs_ctldir.c | 56
-rw-r--r-- | module/zfs/zfs_ioctl.c | 390
-rw-r--r-- | module/zfs/zfs_vfsops.c | 2
-rw-r--r-- | module/zfs/zil.c | 2
-rw-r--r-- | module/zfs/zpl_inode.c | 2
-rw-r--r-- | module/zfs/zvol.c | 2
22 files changed, 1301 insertions, 386 deletions
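For context, the kernel half shown in this module-only diffstat is driven from userland roughly as follows: dmu_send() gains resumeobj/resumeoff parameters (see the dmu_send.c hunks below), and in the matching libzfs_core change, which lies outside this diffstat, those values are decoded from the target's receive_resume_token property and passed down. A hedged sketch — lzc_send_resume() and its signature are taken from the upstream illumos-2605 work, not from this diff:

#include <libzfs_core.h>

/*
 * Resume an interrupted send: the object/offset decoded from the
 * receive_resume_token end up as dmu_send()'s resumeobj/resumeoff.
 * LZC_SEND_FLAG_* options omitted (0) for brevity.
 */
static int
resume_send(const char *tosnap, const char *fromsnap, int outfd,
    uint64_t resumeobj, uint64_t resumeoff)
{
	return (lzc_send_resume(tosnap, fromsnap, outfd, 0,
	    resumeobj, resumeoff));
}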
diff --git a/module/zcommon/zfs_fletcher.c b/module/zcommon/zfs_fletcher.c index 2c2d01d5c..e76c5b8a5 100644 --- a/module/zcommon/zfs_fletcher.c +++ b/module/zcommon/zfs_fletcher.c @@ -334,7 +334,12 @@ fletcher_4_impl_get(void) void fletcher_4_native(const void *buf, uint64_t size, zio_cksum_t *zcp) { - const fletcher_4_ops_t *ops = fletcher_4_impl_get(); + const fletcher_4_ops_t *ops; + + if (IS_P2ALIGNED(size, 4 * sizeof (uint32_t))) + ops = fletcher_4_impl_get(); + else + ops = &fletcher_4_scalar_ops; ops->init(zcp); ops->compute(buf, size, zcp); @@ -345,7 +350,12 @@ fletcher_4_native(const void *buf, uint64_t size, zio_cksum_t *zcp) void fletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) { - const fletcher_4_ops_t *ops = fletcher_4_impl_get(); + const fletcher_4_ops_t *ops; + + if (IS_P2ALIGNED(size, 4 * sizeof (uint32_t))) + ops = fletcher_4_impl_get(); + else + ops = &fletcher_4_scalar_ops; ops->init(zcp); ops->compute_byteswap(buf, size, zcp); diff --git a/module/zcommon/zfs_namecheck.c b/module/zcommon/zfs_namecheck.c index ff724be58..b58071bed 100644 --- a/module/zcommon/zfs_namecheck.c +++ b/module/zcommon/zfs_namecheck.c @@ -69,7 +69,7 @@ zfs_component_namecheck(const char *path, namecheck_err_t *why, char *what) { const char *loc; - if (strlen(path) >= MAXNAMELEN) { + if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN) { if (why) *why = NAME_ERR_TOOLONG; return (-1); @@ -140,27 +140,8 @@ dataset_namecheck(const char *path, namecheck_err_t *why, char *what) /* * Make sure the name is not too long. - * - * ZFS_MAXNAMELEN is the maximum dataset length used in the userland - * which is the same as MAXNAMELEN used in the kernel. - * If ZFS_MAXNAMELEN value is changed, make sure to cleanup all - * places using MAXNAMELEN. - * - * When HAVE_KOBJ_NAME_LEN is defined the maximum safe kobject name - * length is 20 bytes. This 20 bytes is broken down as follows to - * provide a maximum safe <pool>/<dataset>[@snapshot] length of only - * 18 bytes. To ensure bytes are left for <dataset>[@snapshot] the - * <pool> portition is futher limited to 9 bytes. For 2.6.27 and - * newer kernels this limit is set to MAXNAMELEN. - * - * <pool>/<dataset> + <partition> + <newline> - * (18) + (1) + (1) */ -#ifdef HAVE_KOBJ_NAME_LEN - if (strlen(path) > 18) { -#else - if (strlen(path) >= MAXNAMELEN) { -#endif /* HAVE_KOBJ_NAME_LEN */ + if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN) { if (why) *why = NAME_ERR_TOOLONG; return (-1); @@ -289,7 +270,7 @@ mountpoint_namecheck(const char *path, namecheck_err_t *why) while (*end != '/' && *end != '\0') end++; - if (end - start >= MAXNAMELEN) { + if (end - start >= ZFS_MAX_DATASET_NAME_LEN) { if (why) *why = NAME_ERR_TOOLONG; return (-1); @@ -314,27 +295,8 @@ pool_namecheck(const char *pool, namecheck_err_t *why, char *what) /* * Make sure the name is not too long. - * - * ZPOOL_MAXNAMELEN is the maximum pool length used in the userland - * which is the same as MAXNAMELEN used in the kernel. - * If ZPOOL_MAXNAMELEN value is changed, make sure to cleanup all - * places using MAXNAMELEN. - * - * When HAVE_KOBJ_NAME_LEN is defined the maximum safe kobject name - * length is 20 bytes. This 20 bytes is broken down as follows to - * provide a maximum safe <pool>/<dataset>[@snapshot] length of only - * 18 bytes. To ensure bytes are left for <dataset>[@snapshot] the - * <pool> portition is futher limited to 8 bytes. For 2.6.27 and - * newer kernels this limit is set to MAXNAMELEN. 
- * - * <pool>/<dataset> + <partition> + <newline> - * (18) + (1) + (1) */ -#ifdef HAVE_KOBJ_NAME_LEN - if (strlen(pool) > 8) { -#else - if (strlen(pool) >= MAXNAMELEN) { -#endif /* HAVE_KOBJ_NAME_LEN */ + if (strlen(pool) >= ZFS_MAX_DATASET_NAME_LEN) { if (why) *why = NAME_ERR_TOOLONG; return (-1); diff --git a/module/zcommon/zfs_prop.c b/module/zcommon/zfs_prop.c index 1dbeab084..1d68ca29e 100644 --- a/module/zcommon/zfs_prop.c +++ b/module/zcommon/zfs_prop.c @@ -375,6 +375,10 @@ zfs_prop_init(void) zprop_register_string(ZFS_PROP_SELINUX_ROOTCONTEXT, "rootcontext", "none", PROP_DEFAULT, ZFS_TYPE_DATASET, "<selinux rootcontext>", "ROOTCONTEXT"); + zprop_register_string(ZFS_PROP_RECEIVE_RESUME_TOKEN, + "receive_resume_token", + NULL, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + "<string token>", "RESUMETOK"); /* readonly number properties */ zprop_register_number(ZFS_PROP_USED, "used", 0, PROP_READONLY, diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index cdc897726..22ca84d96 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -405,6 +405,17 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, * checksum/compression/copies. */ if (ds != NULL) { + boolean_t needlock = B_FALSE; + + /* + * Note: it's valid to open the objset if the dataset is + * long-held, in which case the pool_config lock will not + * be held. + */ + if (!dsl_pool_config_held(dmu_objset_pool(os))) { + needlock = B_TRUE; + dsl_pool_config_enter(dmu_objset_pool(os), FTAG); + } err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE), primary_cache_changed_cb, os); @@ -461,6 +472,8 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, dnodesize_changed_cb, os); } } + if (needlock) + dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (err != 0) { VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf)); @@ -520,6 +533,13 @@ dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp) { int err = 0; + /* + * We shouldn't be doing anything with dsl_dataset_t's unless the + * pool_config lock is held, or the dataset is long-held. 
+ */ + ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool) || + dsl_dataset_long_held(ds)); + mutex_enter(&ds->ds_opening_lock); if (ds->ds_objset == NULL) { objset_t *os; @@ -651,7 +671,7 @@ dmu_objset_refresh_ownership(objset_t *os, void *tag) { dsl_pool_t *dp; dsl_dataset_t *ds, *newds; - char name[MAXNAMELEN]; + char name[ZFS_MAX_DATASET_NAME_LEN]; ds = os->os_dsl_dataset; VERIFY3P(ds, !=, NULL); @@ -875,6 +895,9 @@ dmu_objset_create_check(void *arg, dmu_tx_t *tx) if (strchr(doca->doca_name, '@') != NULL) return (SET_ERROR(EINVAL)); + if (strlen(doca->doca_name) >= ZFS_MAX_DATASET_NAME_LEN) + return (SET_ERROR(ENAMETOOLONG)); + error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail); if (error != 0) return (error); @@ -961,6 +984,9 @@ dmu_objset_clone_check(void *arg, dmu_tx_t *tx) if (strchr(doca->doca_clone, '@') != NULL) return (SET_ERROR(EINVAL)); + if (strlen(doca->doca_clone) >= ZFS_MAX_DATASET_NAME_LEN) + return (SET_ERROR(ENAMETOOLONG)); + error = dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail); if (error != 0) return (error); @@ -1000,7 +1026,7 @@ dmu_objset_clone_sync(void *arg, dmu_tx_t *tx) const char *tail; dsl_dataset_t *origin, *ds; uint64_t obj; - char namebuf[MAXNAMELEN]; + char namebuf[ZFS_MAX_DATASET_NAME_LEN]; VERIFY0(dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail)); VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin)); @@ -2027,7 +2053,7 @@ dmu_objset_get_user(objset_t *os) /* * Determine name of filesystem, given name of snapshot. - * buf must be at least MAXNAMELEN bytes + * buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes */ int dmu_fsname(const char *snapname, char *buf) @@ -2035,7 +2061,7 @@ dmu_fsname(const char *snapname, char *buf) char *atp = strchr(snapname, '@'); if (atp == NULL) return (SET_ERROR(EINVAL)); - if (atp - snapname >= MAXNAMELEN) + if (atp - snapname >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); (void) strlcpy(buf, snapname, atp - snapname + 1); return (0); diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 901386a5a..80f7dc1aa 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -20,10 +20,11 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. - * Copyright (c) 2011, 2014 by Delphix. All rights reserved. + * Copyright 2014 HybridCluster. All rights reserved. + * Copyright 2016 RackTop Systems. * Copyright (c) 2016 Actifio, Inc. All rights reserved. 
*/ @@ -62,14 +63,18 @@ int zfs_send_corrupt_data = B_FALSE; int zfs_send_queue_length = 16 * 1024 * 1024; int zfs_recv_queue_length = 16 * 1024 * 1024; +/* Set this tunable to FALSE to disable setting of DRR_FLAG_FREERECORDS */ +int zfs_send_set_freerecords_bit = B_TRUE; static char *dmu_recv_tag = "dmu_recv_tag"; -static const char *recv_clone_name = "%recv"; +const char *recv_clone_name = "%recv"; #define BP_SPAN(datablkszsec, indblkshift, level) \ (((uint64_t)datablkszsec) << (SPA_MINBLOCKSHIFT + \ (level) * (indblkshift - SPA_BLKPTRSHIFT))) +static void byteswap_record(dmu_replay_record_t *drr); + struct send_thread_arg { bqueue_t q; dsl_dataset_t *ds; /* Dataset to traverse */ @@ -77,6 +82,7 @@ struct send_thread_arg { int flags; /* flags to pass to traverse_dataset */ int error_code; boolean_t cancel; + zbookmark_phys_t resume; }; struct send_block_record { @@ -99,8 +105,21 @@ dump_bytes_cb(void *arg) { dump_bytes_io_t *dbi = (dump_bytes_io_t *)arg; dmu_sendarg_t *dsp = dbi->dbi_dsp; - dsl_dataset_t *ds = dsp->dsa_os->os_dsl_dataset; + dsl_dataset_t *ds = dmu_objset_ds(dsp->dsa_os); ssize_t resid; /* have to get resid to get detailed errno */ + + /* + * The code does not rely on this (len being a multiple of 8). We keep + * this assertion because of the corresponding assertion in + * receive_read(). Keeping this assertion ensures that we do not + * inadvertently break backwards compatibility (causing the assertion + * in receive_read() to trigger on old software). + * + * Removing the assertions could be rolled into a new feature that uses + * data that isn't 8-byte aligned; if the assertions were removed, a + * feature flag would have to be added. + */ + ASSERT0(dbi->dbi_len % 8); dsp->dsa_err = vn_rdwr(UIO_WRITE, dsp->dsa_vp, @@ -169,6 +188,14 @@ dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len) return (0); } +/* + * Fill in the drr_free struct, or perform aggregation if the previous record is + * also a free record, and the two are adjacent. + * + * Note that we send free records even for a full send, because we want to be + * able to receive a full send as a clone, which requires a list of all the free + * and freeobject records that were generated on the source. + */ static int dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, uint64_t length) @@ -180,7 +207,7 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, * that the receiving system doesn't have any dbufs in the range * being freed. This is always true because there is a one-record * constraint: we only send one WRITE record for any given - * object+offset. We know that the one-record constraint is + * object,offset. We know that the one-record constraint is * true because we always send data in increasing order by * object,offset. * @@ -192,15 +219,6 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, (object == dsp->dsa_last_data_object && offset > dsp->dsa_last_data_offset)); - /* - * If we are doing a non-incremental send, then there can't - * be any data in the dataset we're receiving into. Therefore - * a free record would simply be a no-op. Save space by not - * sending it to begin with. - */ - if (!dsp->dsa_incremental) - return (0); - if (length != -1ULL && offset + length < offset) length = -1ULL; @@ -378,10 +396,6 @@ dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs) { struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects); - /* See comment in dump_free(). 
*/ - if (!dsp->dsa_incremental) - return (0); - /* * If there is a pending op, but it's not PENDING_FREEOBJECTS, * push it out, since free block aggregation can only be done for @@ -428,6 +442,19 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) { struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object); + if (object < dsp->dsa_resume_object) { + /* + * Note: when resuming, we will visit all the dnodes in + * the block of dnodes that we are resuming from. In + * this case it's unnecessary to send the dnodes prior to + * the one we are resuming from. We should be at most one + * block's worth of dnodes behind the resume point. + */ + ASSERT3U(dsp->dsa_resume_object - object, <, + 1 << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)); + return (0); + } + if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) return (dump_freeobjects(dsp, object, 1)); @@ -509,6 +536,9 @@ send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, uint64_t record_size; int err = 0; + ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT || + zb->zb_object >= sta->resume.zb_object); + if (sta->cancel) return (SET_ERROR(EINTR)); @@ -545,8 +575,10 @@ send_traverse_thread(void *arg) struct send_block_record *data; if (st_arg->ds != NULL) { - err = traverse_dataset(st_arg->ds, st_arg->fromtxg, - st_arg->flags, send_cb, arg); + err = traverse_dataset_resume(st_arg->ds, + st_arg->fromtxg, &st_arg->resume, + st_arg->flags, send_cb, st_arg); + if (err != EINTR) st_arg->error_code = err; } @@ -575,6 +607,9 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) ASSERT3U(zb->zb_level, >=, 0); + ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT || + zb->zb_object >= dsa->dsa_resume_object); + if (zb->zb_object != DMU_META_DNODE_OBJECT && DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { return (0); @@ -637,6 +672,10 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) uint64_t offset; ASSERT0(zb->zb_level); + ASSERT(zb->zb_object > dsa->dsa_resume_object || + (zb->zb_object == dsa->dsa_resume_object && + zb->zb_blkid * blksz >= dsa->dsa_resume_offset)); + if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb) != 0) { @@ -697,8 +736,10 @@ get_next_record(bqueue_t *bq, struct send_block_record *data) */ static int dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, - zfs_bookmark_phys_t *ancestor_zb, boolean_t is_clone, boolean_t embedok, - boolean_t large_block_ok, int outfd, vnode_t *vp, offset_t *off) + zfs_bookmark_phys_t *ancestor_zb, + boolean_t is_clone, boolean_t embedok, boolean_t large_block_ok, int outfd, + uint64_t resumeobj, uint64_t resumeoff, + vnode_t *vp, offset_t *off) { objset_t *os; dmu_replay_record_t *drr; @@ -707,6 +748,8 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, uint64_t fromtxg = 0; uint64_t featureflags = 0; struct send_thread_arg to_arg; + void *payload = NULL; + size_t payload_len = 0; struct send_block_record *to_data; err = dmu_objset_from_ds(to_ds, &os); @@ -721,6 +764,8 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo, DMU_SUBSTREAM); + bzero(&to_arg, sizeof (to_arg)); + #ifdef _KERNEL if (dmu_objset_type(os) == DMU_OST_ZFS) { uint64_t version; @@ -746,6 +791,10 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4; } + if (resumeobj != 0 || resumeoff != 0) { + featureflags |= DMU_BACKUP_FEATURE_RESUMING; + } + DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo, 
featureflags); @@ -757,6 +806,8 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid; if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET) drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA; + if (zfs_send_set_freerecords_bit) + drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_FREERECORDS; if (ancestor_zb != NULL) { drr->drr_u.drr_begin.drr_fromguid = @@ -779,8 +830,9 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, dsp->dsa_off = off; dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid; dsp->dsa_pending_op = PENDING_NONE; - dsp->dsa_incremental = (ancestor_zb != NULL); dsp->dsa_featureflags = featureflags; + dsp->dsa_resume_object = resumeobj; + dsp->dsa_resume_offset = resumeoff; mutex_enter(&to_ds->ds_sendstream_lock); list_insert_head(&to_ds->ds_sendstreams, dsp); @@ -789,7 +841,26 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, dsl_dataset_long_hold(to_ds, FTAG); dsl_pool_rele(dp, tag); - if (dump_record(dsp, NULL, 0) != 0) { + if (resumeobj != 0 || resumeoff != 0) { + dmu_object_info_t to_doi; + nvlist_t *nvl; + err = dmu_object_info(os, resumeobj, &to_doi); + if (err != 0) + goto out; + SET_BOOKMARK(&to_arg.resume, to_ds->ds_object, resumeobj, 0, + resumeoff / to_doi.doi_data_block_size); + + nvl = fnvlist_alloc(); + fnvlist_add_uint64(nvl, "resume_object", resumeobj); + fnvlist_add_uint64(nvl, "resume_offset", resumeoff); + payload = fnvlist_pack(nvl, &payload_len); + drr->drr_payloadlen = payload_len; + fnvlist_free(nvl); + } + + err = dump_record(dsp, payload, payload_len); + fnvlist_pack_free(payload, payload_len); + if (err != 0) { err = dsp->dsa_err; goto out; } @@ -899,19 +970,19 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, is_clone = (fromds->ds_dir != ds->ds_dir); dsl_dataset_rele(fromds, FTAG); err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, - embedok, large_block_ok, outfd, vp, off); + embedok, large_block_ok, outfd, 0, 0, vp, off); } else { err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, - embedok, large_block_ok, outfd, vp, off); + embedok, large_block_ok, outfd, 0, 0, vp, off); } dsl_dataset_rele(ds, FTAG); return (err); } int -dmu_send(const char *tosnap, const char *fromsnap, - boolean_t embedok, boolean_t large_block_ok, - int outfd, vnode_t *vp, offset_t *off) +dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, + boolean_t large_block_ok, int outfd, uint64_t resumeobj, uint64_t resumeoff, + vnode_t *vp, offset_t *off) { dsl_pool_t *dp; dsl_dataset_t *ds; @@ -978,10 +1049,12 @@ dmu_send(const char *tosnap, const char *fromsnap, return (err); } err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, - embedok, large_block_ok, outfd, vp, off); + embedok, large_block_ok, + outfd, resumeobj, resumeoff, vp, off); } else { err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, - embedok, large_block_ok, outfd, vp, off); + embedok, large_block_ok, + outfd, resumeobj, resumeoff, vp, off); } if (owned) dsl_dataset_disown(ds, FTAG); @@ -1221,6 +1294,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) /* already checked */ ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); + ASSERT(!(featureflags & DMU_BACKUP_FEATURE_RESUMING)); if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_COMPOUNDSTREAM || @@ -1233,6 +1307,10 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) spa_version(dp->dp_spa) < SPA_VERSION_SA) return (SET_ERROR(ENOTSUP)); + if (drba->drba_cookie->drc_resumable && + !spa_feature_is_enabled(dp->dp_spa, 
SPA_FEATURE_EXTENSIBLE_DATASET)) + return (SET_ERROR(ENOTSUP)); + /* * The receiving code doesn't know how to translate a WRITE_EMBEDDED * record to a plan WRITE record, so the pool must have the @@ -1269,7 +1347,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) /* target fs already exists; recv into temp clone */ /* Can't recv a clone into an existing fs */ - if (flags & DRR_FLAG_CLONE) { + if (flags & DRR_FLAG_CLONE || drba->drba_origin) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } @@ -1278,7 +1356,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) dsl_dataset_rele(ds, FTAG); } else if (error == ENOENT) { /* target fs does not exist; must be a full backup or clone */ - char buf[MAXNAMELEN]; + char buf[ZFS_MAX_DATASET_NAME_LEN]; /* * If it's a non-clone incremental, we are missing the @@ -1288,8 +1366,17 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) drba->drba_origin)) return (SET_ERROR(ENOENT)); + /* + * If we're receiving a full send as a clone, and it doesn't + * contain all the necessary free records and freeobject + * records, reject it. + */ + if (fromguid == 0 && drba->drba_origin && + !(flags & DRR_FLAG_FREERECORDS)) + return (SET_ERROR(EINVAL)); + /* Open the parent of tofs */ - ASSERT3U(strlen(tofs), <, MAXNAMELEN); + ASSERT3U(strlen(tofs), <, sizeof (buf)); (void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1); error = dsl_dataset_hold(dp, buf, FTAG, &ds); if (error != 0) @@ -1327,7 +1414,8 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } - if (dsl_dataset_phys(origin)->ds_guid != fromguid) { + if (dsl_dataset_phys(origin)->ds_guid != fromguid && + fromguid != 0) { dsl_dataset_rele(origin, FTAG); dsl_dataset_rele(ds, FTAG); return (SET_ERROR(ENODEV)); @@ -1345,15 +1433,16 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) { dmu_recv_begin_arg_t *drba = arg; dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; struct drr_begin *drrb = drba->drba_cookie->drc_drrb; const char *tofs = drba->drba_cookie->drc_tofs; dsl_dataset_t *ds, *newds; uint64_t dsobj; int error; - uint64_t crflags; + uint64_t crflags = 0; - crflags = (drrb->drr_flags & DRR_FLAG_CI_DATA) ? 
- DS_FLAG_CI_DATASET : 0; + if (drrb->drr_flags & DRR_FLAG_CI_DATA) + crflags |= DS_FLAG_CI_DATASET; error = dsl_dataset_hold(dp, tofs, FTAG, &ds); if (error == 0) { @@ -1391,6 +1480,32 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) } VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds)); + if (drba->drba_cookie->drc_resumable) { + uint64_t one = 1; + uint64_t zero = 0; + + dsl_dataset_zapify(newds, tx); + if (drrb->drr_fromguid != 0) { + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_FROMGUID, + 8, 1, &drrb->drr_fromguid, tx)); + } + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TOGUID, + 8, 1, &drrb->drr_toguid, tx)); + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TONAME, + 1, strlen(drrb->drr_toname) + 1, drrb->drr_toname, tx)); + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OBJECT, + 8, 1, &one, tx)); + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OFFSET, + 8, 1, &zero, tx)); + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_BYTES, + 8, 1, &zero, tx)); + if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & + DMU_BACKUP_FEATURE_EMBED_DATA) { + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_EMBEDOK, + 8, 1, &one, tx)); + } + } + dmu_buf_will_dirty(newds->ds_dbuf, tx); dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT; @@ -1408,56 +1523,194 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) spa_history_log_internal_ds(newds, "receive", tx, ""); } +static int +dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx) +{ + dmu_recv_begin_arg_t *drba = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + struct drr_begin *drrb = drba->drba_cookie->drc_drrb; + int error; + uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); + dsl_dataset_t *ds; + const char *tofs = drba->drba_cookie->drc_tofs; + uint64_t val; + + /* 6 extra bytes for /%recv */ + char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; + + /* already checked */ + ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); + ASSERT(featureflags & DMU_BACKUP_FEATURE_RESUMING); + + if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == + DMU_COMPOUNDSTREAM || + drrb->drr_type >= DMU_OST_NUMTYPES) + return (SET_ERROR(EINVAL)); + + /* Verify pool version supports SA if SA_SPILL feature set */ + if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && + spa_version(dp->dp_spa) < SPA_VERSION_SA) + return (SET_ERROR(ENOTSUP)); + + /* + * The receiving code doesn't know how to translate a WRITE_EMBEDDED + * record to a plain WRITE record, so the pool must have the + * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED + * records. Same with WRITE_EMBEDDED records that use LZ4 compression. 
+ */ + if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) && + !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) + return (SET_ERROR(ENOTSUP)); + if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) && + !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) + return (SET_ERROR(ENOTSUP)); + + (void) snprintf(recvname, sizeof (recvname), "%s/%s", + tofs, recv_clone_name); + + if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) { + /* %recv does not exist; continue in tofs */ + error = dsl_dataset_hold(dp, tofs, FTAG, &ds); + if (error != 0) + return (error); + } + + /* check that ds is marked inconsistent */ + if (!DS_IS_INCONSISTENT(ds)) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } + + /* check that there is resuming data, and that the toguid matches */ + if (!dsl_dataset_is_zapified(ds)) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } + error = zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val); + if (error != 0 || drrb->drr_toguid != val) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } + + /* + * Check if the receive is still running. If so, it will be owned. + * Note that nothing else can own the dataset (e.g. after the receive + * fails) because it will be marked inconsistent. + */ + if (dsl_dataset_has_owner(ds)) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EBUSY)); + } + + /* There should not be any snapshots of this fs yet. */ + if (ds->ds_prev != NULL && ds->ds_prev->ds_dir == ds->ds_dir) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } + + /* + * Note: resume point will be checked when we process the first WRITE + * record. + */ + + /* check that the origin matches */ + val = 0; + (void) zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val); + if (drrb->drr_fromguid != val) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } + + dsl_dataset_rele(ds, FTAG); + return (0); +} + +static void +dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx) +{ + dmu_recv_begin_arg_t *drba = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + const char *tofs = drba->drba_cookie->drc_tofs; + dsl_dataset_t *ds; + uint64_t dsobj; + /* 6 extra bytes for /%recv */ + char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; + + (void) snprintf(recvname, sizeof (recvname), "%s/%s", + tofs, recv_clone_name); + + if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) { + /* %recv does not exist; continue in tofs */ + VERIFY0(dsl_dataset_hold(dp, tofs, FTAG, &ds)); + drba->drba_cookie->drc_newfs = B_TRUE; + } + + /* clear the inconsistent flag so that we can own it */ + ASSERT(DS_IS_INCONSISTENT(ds)); + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT; + dsobj = ds->ds_object; + dsl_dataset_rele(ds, FTAG); + + VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &ds)); + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT; + + ASSERT(!BP_IS_HOLE(dsl_dataset_get_blkptr(ds))); + + drba->drba_cookie->drc_ds = ds; + + spa_history_log_internal_ds(ds, "resume receive", tx, ""); +} + /* * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin() * succeeds; otherwise we will leak the holds on the datasets. 
*/ int -dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, - boolean_t force, char *origin, dmu_recv_cookie_t *drc) +dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, + boolean_t force, boolean_t resumable, char *origin, dmu_recv_cookie_t *drc) { dmu_recv_begin_arg_t drba = { 0 }; - dmu_replay_record_t *drr; bzero(drc, sizeof (dmu_recv_cookie_t)); - drc->drc_drrb = drrb; + drc->drc_drr_begin = drr_begin; + drc->drc_drrb = &drr_begin->drr_u.drr_begin; drc->drc_tosnap = tosnap; drc->drc_tofs = tofs; drc->drc_force = force; + drc->drc_resumable = resumable; drc->drc_cred = CRED(); - if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) + if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { drc->drc_byteswap = B_TRUE; - else if (drrb->drr_magic != DMU_BACKUP_MAGIC) - return (SET_ERROR(EINVAL)); - - drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); - drr->drr_type = DRR_BEGIN; - drr->drr_u.drr_begin = *drc->drc_drrb; - if (drc->drc_byteswap) { - fletcher_4_incremental_byteswap(drr, + fletcher_4_incremental_byteswap(drr_begin, sizeof (dmu_replay_record_t), &drc->drc_cksum); - } else { - fletcher_4_incremental_native(drr, + byteswap_record(drr_begin); + } else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) { + fletcher_4_incremental_native(drr_begin, sizeof (dmu_replay_record_t), &drc->drc_cksum); - } - kmem_free(drr, sizeof (dmu_replay_record_t)); - - if (drc->drc_byteswap) { - drrb->drr_magic = BSWAP_64(drrb->drr_magic); - drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); - drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); - drrb->drr_type = BSWAP_32(drrb->drr_type); - drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); - drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); + } else { + return (SET_ERROR(EINVAL)); } drba.drba_origin = origin; drba.drba_cookie = drc; drba.drba_cred = CRED(); - return (dsl_sync_task(tofs, dmu_recv_begin_check, dmu_recv_begin_sync, - &drba, 5, ZFS_SPACE_CHECK_NORMAL)); + if (DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo) & + DMU_BACKUP_FEATURE_RESUMING) { + return (dsl_sync_task(tofs, + dmu_recv_resume_begin_check, dmu_recv_resume_begin_sync, + &drba, 5, ZFS_SPACE_CHECK_NORMAL)); + } else { + return (dsl_sync_task(tofs, + dmu_recv_begin_check, dmu_recv_begin_sync, + &drba, 5, ZFS_SPACE_CHECK_NORMAL)); + } } struct receive_record_arg { @@ -1469,6 +1722,7 @@ struct receive_record_arg { */ arc_buf_t *write_buf; int payload_size; + uint64_t bytes_read; /* bytes read from stream when record created */ boolean_t eos_marker; /* Marks the end of the stream */ bqueue_node_t node; }; @@ -1477,6 +1731,7 @@ struct receive_writer_arg { objset_t *os; boolean_t byteswap; bqueue_t q; + /* * These three args are used to signal to the main thread that we're * done. @@ -1484,15 +1739,34 @@ struct receive_writer_arg { kmutex_t mutex; kcondvar_t cv; boolean_t done; + int err; /* A map from guid to dataset to help handle dedup'd streams. */ avl_tree_t *guid_to_ds_map; + boolean_t resumable; + uint64_t last_object, last_offset; + uint64_t bytes_read; /* bytes read when current record created */ +}; + +struct objlist { + list_t list; /* List of struct receive_objnode. */ + /* + * Last object looked up. Used to assert that objects are being looked + * up in ascending order. 
+ */ + uint64_t last_lookup; +}; + +struct receive_objnode { + list_node_t node; + uint64_t object; }; struct receive_arg { objset_t *os; vnode_t *vp; /* The vnode to read the stream from */ uint64_t voff; /* The current offset in the stream */ + uint64_t bytes_read; /* * A record that has had its payload read in, but hasn't yet been handed * off to the worker thread. @@ -1505,12 +1779,7 @@ struct receive_arg { int err; boolean_t byteswap; /* Sorted list of objects not to issue prefetches for. */ - list_t ignore_obj_list; -}; - -struct receive_ign_obj_node { - list_node_t node; - uint64_t object; + struct objlist ignore_objlist; }; typedef struct guid_map_entry { @@ -1553,7 +1822,10 @@ receive_read(struct receive_arg *ra, int len, void *buf) { int done = 0; - /* some things will require 8-byte alignment, so everything must */ + /* + * The code doesn't rely on this (lengths being multiples of 8). See + * comment in dump_bytes. + */ ASSERT0(len % 8); while (done < len) { @@ -1564,14 +1836,21 @@ receive_read(struct receive_arg *ra, int len, void *buf) ra->voff, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid); - if (resid == len - done) - ra->err = SET_ERROR(EINVAL); + if (resid == len - done) { + /* + * Note: ECKSUM indicates that the receive + * was interrupted and can potentially be resumed. + */ + ra->err = SET_ERROR(ECKSUM); + } ra->voff += len - done - resid; done = len - resid; if (ra->err != 0) return (ra->err); } + ra->bytes_read += len; + ASSERT3U(done, ==, len); return (0); } @@ -1675,6 +1954,43 @@ deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size) } } +static void +save_resume_state(struct receive_writer_arg *rwa, + uint64_t object, uint64_t offset, dmu_tx_t *tx) +{ + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + + if (!rwa->resumable) + return; + + /* + * We use ds_resume_bytes[] != 0 to indicate that we need to + * update this on disk, so it must not be 0. + */ + ASSERT(rwa->bytes_read != 0); + + /* + * We only resume from write records, which have a valid + * (non-meta-dnode) object number. + */ + ASSERT(object != 0); + + /* + * For resuming to work correctly, we must receive records in order, + * sorted by object,offset. This is checked by the callers, but + * assert it here for good measure. + */ + ASSERT3U(object, >=, rwa->os->os_dsl_dataset->ds_resume_object[txgoff]); + ASSERT(object != rwa->os->os_dsl_dataset->ds_resume_object[txgoff] || + offset >= rwa->os->os_dsl_dataset->ds_resume_offset[txgoff]); + ASSERT3U(rwa->bytes_read, >=, + rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff]); + + rwa->os->os_dsl_dataset->ds_resume_object[txgoff] = object; + rwa->os->os_dsl_dataset->ds_resume_offset[txgoff] = offset; + rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff] = rwa->bytes_read; +} + noinline static int receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, void *data) @@ -1773,6 +2089,7 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, dmu_buf_rele(db, FTAG); } dmu_tx_commit(tx); + return (0); } @@ -1782,13 +2099,14 @@ receive_freeobjects(struct receive_writer_arg *rwa, struct drr_freeobjects *drrfo) { uint64_t obj; + int next_err = 0; if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) return (SET_ERROR(EINVAL)); for (obj = drrfo->drr_firstobj == 0 ? 
1 : drrfo->drr_firstobj; - obj < drrfo->drr_firstobj + drrfo->drr_numobjs; - (void) dmu_object_next(rwa->os, &obj, FALSE, 0)) { + obj < drrfo->drr_firstobj + drrfo->drr_numobjs && next_err == 0; + next_err = dmu_object_next(rwa->os, &obj, FALSE, 0)) { dmu_object_info_t doi; int err; @@ -1804,7 +2122,8 @@ receive_freeobjects(struct receive_writer_arg *rwa, if (err != 0) return (err); } - + if (next_err != ESRCH) + return (next_err); return (0); } @@ -1820,6 +2139,18 @@ receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw, !DMU_OT_IS_VALID(drrw->drr_type)) return (SET_ERROR(EINVAL)); + /* + * For resuming to work, records must be in increasing order + * by (object, offset). + */ + if (drrw->drr_object < rwa->last_object || + (drrw->drr_object == rwa->last_object && + drrw->drr_offset < rwa->last_offset)) { + return (SET_ERROR(EINVAL)); + } + rwa->last_object = drrw->drr_object; + rwa->last_offset = drrw->drr_offset; + if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); @@ -1842,8 +2173,17 @@ receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw, if (dmu_bonus_hold(rwa->os, drrw->drr_object, FTAG, &bonus) != 0) return (SET_ERROR(EINVAL)); dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx); + + /* + * Note: If the receive fails, we want the resume stream to start + * with the same record that we last successfully received (as opposed + * to the next record), so that we can verify that we are + * resuming from the correct location. + */ + save_resume_state(rwa, drrw->drr_object, drrw->drr_offset, tx); dmu_tx_commit(tx); dmu_buf_rele(bonus, FTAG); + return (0); } @@ -1902,43 +2242,48 @@ receive_write_byref(struct receive_writer_arg *rwa, dmu_write(rwa->os, drrwbr->drr_object, drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); dmu_buf_rele(dbp, FTAG); + + /* See comment in restore_write. */ + save_resume_state(rwa, drrwbr->drr_object, drrwbr->drr_offset, tx); dmu_tx_commit(tx); return (0); } static int receive_write_embedded(struct receive_writer_arg *rwa, - struct drr_write_embedded *drrwnp, void *data) + struct drr_write_embedded *drrwe, void *data) { dmu_tx_t *tx; int err; - if (drrwnp->drr_offset + drrwnp->drr_length < drrwnp->drr_offset) + if (drrwe->drr_offset + drrwe->drr_length < drrwe->drr_offset) return (EINVAL); - if (drrwnp->drr_psize > BPE_PAYLOAD_SIZE) + if (drrwe->drr_psize > BPE_PAYLOAD_SIZE) return (EINVAL); - if (drrwnp->drr_etype >= NUM_BP_EMBEDDED_TYPES) + if (drrwe->drr_etype >= NUM_BP_EMBEDDED_TYPES) return (EINVAL); - if (drrwnp->drr_compression >= ZIO_COMPRESS_FUNCTIONS) + if (drrwe->drr_compression >= ZIO_COMPRESS_FUNCTIONS) return (EINVAL); tx = dmu_tx_create(rwa->os); - dmu_tx_hold_write(tx, drrwnp->drr_object, - drrwnp->drr_offset, drrwnp->drr_length); + dmu_tx_hold_write(tx, drrwe->drr_object, + drrwe->drr_offset, drrwe->drr_length); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); return (err); } - dmu_write_embedded(rwa->os, drrwnp->drr_object, - drrwnp->drr_offset, data, drrwnp->drr_etype, - drrwnp->drr_compression, drrwnp->drr_lsize, drrwnp->drr_psize, + dmu_write_embedded(rwa->os, drrwe->drr_object, + drrwe->drr_offset, data, drrwe->drr_etype, + drrwe->drr_compression, drrwe->drr_lsize, drrwe->drr_psize, rwa->byteswap ^ ZFS_HOST_BYTEORDER, tx); + /* See comment in restore_write. 
*/ + save_resume_state(rwa, drrwe->drr_object, drrwe->drr_offset, tx); dmu_tx_commit(tx); return (0); } @@ -2012,10 +2357,16 @@ receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf) static void dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) { - char name[MAXNAMELEN]; - dsl_dataset_name(drc->drc_ds, name); - dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); - (void) dsl_destroy_head(name); + if (drc->drc_resumable) { + /* wait for our resume state to be written to disk */ + txg_wait_synced(drc->drc_ds->ds_dir->dd_pool, 0); + dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); + } else { + char name[ZFS_MAX_DATASET_NAME_LEN]; + dsl_dataset_name(drc->drc_ds, name); + dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); + (void) dsl_destroy_head(name); + } } static void @@ -2044,12 +2395,17 @@ receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf) if (len != 0) { ASSERT3U(len, <=, SPA_MAXBLOCKSIZE); - ra->rrd->payload = buf; - ra->rrd->payload_size = len; - err = receive_read(ra, len, ra->rrd->payload); + err = receive_read(ra, len, buf); if (err != 0) return (err); - receive_cksum(ra, len, ra->rrd->payload); + receive_cksum(ra, len, buf); + + /* note: rrd is NULL when reading the begin record's payload */ + if (ra->rrd != NULL) { + ra->rrd->payload = buf; + ra->rrd->payload_size = len; + ra->rrd->bytes_read = ra->bytes_read; + } } ra->prev_cksum = ra->cksum; @@ -2057,6 +2413,7 @@ receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf) ra->next_rrd = kmem_zalloc(sizeof (*ra->next_rrd), KM_SLEEP); err = receive_read(ra, sizeof (ra->next_rrd->header), &ra->next_rrd->header); + ra->next_rrd->bytes_read = ra->bytes_read; if (err != 0) { kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); ra->next_rrd = NULL; @@ -2096,6 +2453,70 @@ receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf) return (0); } +static void +objlist_create(struct objlist *list) +{ + list_create(&list->list, sizeof (struct receive_objnode), + offsetof(struct receive_objnode, node)); + list->last_lookup = 0; +} + +static void +objlist_destroy(struct objlist *list) +{ + struct receive_objnode *n; + + for (n = list_remove_head(&list->list); + n != NULL; n = list_remove_head(&list->list)) { + kmem_free(n, sizeof (*n)); + } + list_destroy(&list->list); +} + +/* + * This function looks through the objlist to see if the specified object number + * is contained in the objlist. In the process, it will remove all object + * numbers in the list that are smaller than the specified object number. Thus, + * any lookup of an object number smaller than a previously looked up object + * number will always return false; therefore, all lookups should be done in + * ascending order. + */ +static boolean_t +objlist_exists(struct objlist *list, uint64_t object) +{ + struct receive_objnode *node = list_head(&list->list); + ASSERT3U(object, >=, list->last_lookup); + list->last_lookup = object; + while (node != NULL && node->object < object) { + VERIFY3P(node, ==, list_remove_head(&list->list)); + kmem_free(node, sizeof (*node)); + node = list_head(&list->list); + } + return (node != NULL && node->object == object); +} + +/* + * The objlist is a list of object numbers stored in ascending order. However, + * the insertion of new object numbers does not seek out the correct location to + * store a new object number; instead, it appends it to the list for simplicity. + * Thus, any users must take care to only insert new object numbers in ascending + * order. 
+ */ +static void +objlist_insert(struct objlist *list, uint64_t object) +{ + struct receive_objnode *node = kmem_zalloc(sizeof (*node), KM_SLEEP); + node->object = object; +#ifdef ZFS_DEBUG + { + struct receive_objnode *last_object = list_tail(&list->list); + uint64_t last_objnum = (last_object != NULL ? last_object->object : 0); + ASSERT3U(node->object, >, last_objnum); + } +#endif + list_insert_tail(&list->list, node); +} + /* * Issue the prefetch reads for any necessary indirect blocks. * @@ -2118,13 +2539,7 @@ static void receive_read_prefetch(struct receive_arg *ra, uint64_t object, uint64_t offset, uint64_t length) { - struct receive_ign_obj_node *node = list_head(&ra->ignore_obj_list); - while (node != NULL && node->object < object) { - VERIFY3P(node, ==, list_remove_head(&ra->ignore_obj_list)); - kmem_free(node, sizeof (*node)); - node = list_head(&ra->ignore_obj_list); - } - if (node == NULL || node->object > object) { + if (!objlist_exists(&ra->ignore_objlist, object)) { dmu_prefetch(ra->os, object, 1, offset, length, ZIO_PRIORITY_SYNC_READ); } @@ -2157,20 +2572,7 @@ receive_read_record(struct receive_arg *ra) */ if (err == ENOENT || (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) { - struct receive_ign_obj_node *node = - kmem_zalloc(sizeof (*node), - KM_SLEEP); - node->object = drro->drr_object; -#ifdef ZFS_DEBUG - { - struct receive_ign_obj_node *last_object = - list_tail(&ra->ignore_obj_list); - uint64_t last_objnum = (last_object != NULL ? - last_object->object : 0); - ASSERT3U(node->object, >, last_objnum); - } -#endif - list_insert_tail(&ra->ignore_obj_list, node); + objlist_insert(&ra->ignore_objlist, drro->drr_object); err = 0; } return (err); @@ -2236,7 +2638,7 @@ receive_read_record(struct receive_arg *ra) { struct drr_end *drre = &ra->rrd->header.drr_u.drr_end; if (!ZIO_CHECKSUM_EQUAL(ra->prev_cksum, drre->drr_checksum)) - return (SET_ERROR(EINVAL)); + return (SET_ERROR(ECKSUM)); return (0); } case DRR_SPILL: @@ -2263,6 +2665,10 @@ receive_process_record(struct receive_writer_arg *rwa, { int err; + /* Processing in order, therefore bytes_read should be increasing. */ + ASSERT3U(rrd->bytes_read, >=, rwa->bytes_read); + rwa->bytes_read = rrd->bytes_read; + switch (rrd->header.drr_type) { case DRR_OBJECT: { @@ -2357,6 +2763,32 @@ receive_writer_thread(void *arg) mutex_exit(&rwa->mutex); } +static int +resume_check(struct receive_arg *ra, nvlist_t *begin_nvl) +{ + uint64_t val; + objset_t *mos = dmu_objset_pool(ra->os)->dp_meta_objset; + uint64_t dsobj = dmu_objset_id(ra->os); + uint64_t resume_obj, resume_off; + + if (nvlist_lookup_uint64(begin_nvl, + "resume_object", &resume_obj) != 0 || + nvlist_lookup_uint64(begin_nvl, + "resume_offset", &resume_off) != 0) { + return (SET_ERROR(EINVAL)); + } + VERIFY0(zap_lookup(mos, dsobj, + DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val)); + if (resume_obj != val) + return (SET_ERROR(EINVAL)); + VERIFY0(zap_lookup(mos, dsobj, + DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val)); + if (resume_off != val) + return (SET_ERROR(EINVAL)); + + return (0); +} + /* * Read in the stream's records, one by one, and apply them to the pool. 
There * are two threads involved; the thread that calls this function will spin up a @@ -2377,7 +2809,9 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, struct receive_arg *ra; struct receive_writer_arg *rwa; int featureflags; - struct receive_ign_obj_node *n; + uint32_t payloadlen; + void *payload; + nvlist_t *begin_nvl = NULL; ra = kmem_zalloc(sizeof (*ra), KM_SLEEP); rwa = kmem_zalloc(sizeof (*rwa), KM_SLEEP); @@ -2386,8 +2820,14 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, ra->cksum = drc->drc_cksum; ra->vp = vp; ra->voff = *voffp; - list_create(&ra->ignore_obj_list, sizeof (struct receive_ign_obj_node), - offsetof(struct receive_ign_obj_node, node)); + + if (dsl_dataset_is_zapified(drc->drc_ds)) { + (void) zap_lookup(drc->drc_ds->ds_dir->dd_pool->dp_meta_objset, + drc->drc_ds->ds_object, DS_FIELD_RESUME_BYTES, + sizeof (ra->bytes_read), 1, &ra->bytes_read); + } + + objlist_create(&ra->ignore_objlist); /* these were verified in dmu_recv_begin */ ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==, @@ -2438,9 +2878,29 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, drc->drc_guid_to_ds_map = rwa->guid_to_ds_map; } - err = receive_read_payload_and_next_header(ra, 0, NULL); - if (err) + payloadlen = drc->drc_drr_begin->drr_payloadlen; + payload = NULL; + if (payloadlen != 0) + payload = kmem_alloc(payloadlen, KM_SLEEP); + + err = receive_read_payload_and_next_header(ra, payloadlen, payload); + if (err != 0) { + if (payloadlen != 0) + kmem_free(payload, payloadlen); goto out; + } + if (payloadlen != 0) { + err = nvlist_unpack(payload, payloadlen, &begin_nvl, KM_SLEEP); + kmem_free(payload, payloadlen); + if (err != 0) + goto out; + } + + if (featureflags & DMU_BACKUP_FEATURE_RESUMING) { + err = resume_check(ra, begin_nvl); + if (err != 0) + goto out; + } (void) bqueue_init(&rwa->q, zfs_recv_queue_length, offsetof(struct receive_record_arg, node)); @@ -2448,6 +2908,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, mutex_init(&rwa->mutex, NULL, MUTEX_DEFAULT, NULL); rwa->os = ra->os; rwa->byteswap = drc->drc_byteswap; + rwa->resumable = drc->drc_resumable; (void) thread_create(NULL, 0, receive_writer_thread, rwa, 0, curproc, TS_RUN, minclsyspri); @@ -2461,7 +2922,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, * We can leave this loop in 3 ways: First, if rwa->err is * non-zero. In that case, the writer thread will free the rrd we just * pushed. Second, if we're interrupted; in that case, either it's the - * first loop and ra->rrd was never allocated, or it's later, and ra.rrd + * first loop and ra->rrd was never allocated, or it's later and ra->rrd * has been handed off to the writer thread who will free it. Finally, * if receive_read_record fails or we're at the end of the stream, then * we free ra->rrd and exit. @@ -2506,24 +2967,21 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, err = rwa->err; out: + nvlist_free(begin_nvl); if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1)) zfs_onexit_fd_rele(cleanup_fd); if (err != 0) { /* - * destroy what we created, so we don't leave it in the - * inconsistent restoring state. + * Clean up references. If receive is not resumable, + * destroy what we created, so we don't leave it in + * the inconsistent state. 
*/ dmu_recv_cleanup_ds(drc); } *voffp = ra->voff; - - for (n = list_remove_head(&ra->ignore_obj_list); n != NULL; - n = list_remove_head(&ra->ignore_obj_list)) { - kmem_free(n, sizeof (*n)); - } - list_destroy(&ra->ignore_obj_list); + objlist_destroy(&ra->ignore_objlist); kmem_free(ra, sizeof (*ra)); kmem_free(rwa, sizeof (*rwa)); return (err); @@ -2674,6 +3132,20 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx) dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT; + if (dsl_dataset_has_resume_receive_state(ds)) { + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_FROMGUID, tx); + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_OBJECT, tx); + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_OFFSET, tx); + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_BYTES, tx); + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_TOGUID, tx); + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_TONAME, tx); + } } drc->drc_newsnapobj = dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj; zvol_create_minors(dp->dp_spa, drc->drc_tofs, B_TRUE); @@ -2722,16 +3194,13 @@ dmu_recv_existing_end(dmu_recv_cookie_t *drc) int error; #ifdef _KERNEL - char *name; - /* * We will be destroying the ds; make sure its origin is unmounted if * necessary. */ - name = kmem_alloc(MAXNAMELEN, KM_SLEEP); + char name[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_name(drc->drc_ds, name); zfs_destroy_unmount_origin(name); - kmem_free(name, MAXNAMELEN); #endif error = dsl_sync_task(drc->drc_tofs, diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index 44ba74181..0df12fac8 100644 --- a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -47,6 +47,7 @@ typedef struct prefetch_data { int pd_flags; boolean_t pd_cancel; boolean_t pd_exited; + zbookmark_phys_t pd_resume; } prefetch_data_t; typedef struct traverse_data { @@ -323,30 +324,29 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, uint32_t flags = ARC_FLAG_WAIT; int32_t i; int32_t epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; - dnode_phys_t *cdnp; + dnode_phys_t *child_dnp; err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err != 0) goto post; - cdnp = buf->b_data; + child_dnp = buf->b_data; - for (i = 0; i < epb; i += cdnp[i].dn_extra_slots + 1) { - prefetch_dnode_metadata(td, &cdnp[i], zb->zb_objset, - zb->zb_blkid * epb + i); + for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) { + prefetch_dnode_metadata(td, &child_dnp[i], + zb->zb_objset, zb->zb_blkid * epb + i); } /* recursively visitbp() blocks below this */ - for (i = 0; i < epb; i += cdnp[i].dn_extra_slots + 1) { - err = traverse_dnode(td, &cdnp[i], zb->zb_objset, - zb->zb_blkid * epb + i); + for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) { + err = traverse_dnode(td, &child_dnp[i], + zb->zb_objset, zb->zb_blkid * epb + i); if (err != 0) break; } } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { arc_flags_t flags = ARC_FLAG_WAIT; objset_phys_t *osp; - dnode_phys_t *mdnp, *gdnp, *udnp; err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); @@ -354,11 +354,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, goto post; osp = buf->b_data; - mdnp = &osp->os_meta_dnode; - gdnp = &osp->os_groupused_dnode; - udnp = &osp->os_userused_dnode; - - prefetch_dnode_metadata(td, mdnp, 
zb->zb_objset, + prefetch_dnode_metadata(td, &osp->os_meta_dnode, zb->zb_objset, DMU_META_DNODE_OBJECT); /* * See the block comment above for the goal of this variable. @@ -370,21 +366,21 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, td->td_realloc_possible = B_FALSE; if (arc_buf_size(buf) >= sizeof (objset_phys_t)) { - prefetch_dnode_metadata(td, gdnp, zb->zb_objset, - DMU_GROUPUSED_OBJECT); - prefetch_dnode_metadata(td, udnp, zb->zb_objset, - DMU_USERUSED_OBJECT); + prefetch_dnode_metadata(td, &osp->os_groupused_dnode, + zb->zb_objset, DMU_GROUPUSED_OBJECT); + prefetch_dnode_metadata(td, &osp->os_userused_dnode, + zb->zb_objset, DMU_USERUSED_OBJECT); } - err = traverse_dnode(td, mdnp, zb->zb_objset, + err = traverse_dnode(td, &osp->os_meta_dnode, zb->zb_objset, DMU_META_DNODE_OBJECT); if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { - err = traverse_dnode(td, gdnp, zb->zb_objset, - DMU_GROUPUSED_OBJECT); + err = traverse_dnode(td, &osp->os_groupused_dnode, + zb->zb_objset, DMU_GROUPUSED_OBJECT); } if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { - err = traverse_dnode(td, udnp, zb->zb_objset, - DMU_USERUSED_OBJECT); + err = traverse_dnode(td, &osp->os_userused_dnode, + zb->zb_objset, DMU_USERUSED_OBJECT); } } @@ -416,9 +412,15 @@ post: * Set the bookmark to the first level-0 block that we need * to visit. This way, the resuming code does not need to * deal with resuming from indirect blocks. + * + * Note, if zb_level <= 0, dnp may be NULL, so we don't want + * to dereference it. */ - td->td_resume->zb_blkid = zb->zb_blkid << - (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); + td->td_resume->zb_blkid = zb->zb_blkid; + if (zb->zb_level > 0) { + td->td_resume->zb_blkid <<= zb->zb_level * + (dnp->dn_indblkshift - SPA_BLKPTRSHIFT); + } td->td_paused = B_TRUE; } @@ -450,6 +452,10 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, int j, err = 0; zbookmark_phys_t czb; + if (object != DMU_META_DNODE_OBJECT && td->td_resume != NULL && + object < td->td_resume->zb_object) + return (0); + if (td->td_flags & TRAVERSE_PRE) { SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL, ZB_DNODE_BLKID); @@ -527,6 +533,7 @@ traverse_prefetch_thread(void *arg) td.td_func = traverse_prefetcher; td.td_arg = td_main->td_pfd; td.td_pfd = NULL; + td.td_resume = &td_main->td_pfd->pd_resume; SET_BOOKMARK(&czb, td.td_objset, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); @@ -556,12 +563,6 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, ASSERT(ds == NULL || objset == ds->ds_object); ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST)); - /* - * The data prefetching mechanism (the prefetch thread) is incompatible - * with resuming from a bookmark. - */ - ASSERT(resume == NULL || !(flags & TRAVERSE_PREFETCH_DATA)); - td = kmem_alloc(sizeof (traverse_data_t), KM_SLEEP); pd = kmem_zalloc(sizeof (prefetch_data_t), KM_SLEEP); czb = kmem_alloc(sizeof (zbookmark_phys_t), KM_SLEEP); @@ -586,6 +587,8 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, } pd->pd_flags = flags; + if (resume != NULL) + pd->pd_resume = *resume; mutex_init(&pd->pd_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&pd->pd_cv, NULL, CV_DEFAULT, NULL); @@ -638,11 +641,19 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, * in syncing context). 
*/ int -traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags, - blkptr_cb_t func, void *arg) +traverse_dataset_resume(dsl_dataset_t *ds, uint64_t txg_start, + zbookmark_phys_t *resume, + int flags, blkptr_cb_t func, void *arg) { return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object, - &dsl_dataset_phys(ds)->ds_bp, txg_start, NULL, flags, func, arg)); + &dsl_dataset_phys(ds)->ds_bp, txg_start, resume, flags, func, arg)); +} + +int +traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, + int flags, blkptr_cb_t func, void *arg) +{ + return (traverse_dataset_resume(ds, txg_start, NULL, flags, func, arg)); } int @@ -675,7 +686,7 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags, /* visit each dataset */ for (obj = 1; err == 0; - err = dmu_object_next(mos, &obj, FALSE, txg_start)) { + err = dmu_object_next(mos, &obj, B_FALSE, txg_start)) { dmu_object_info_t doi; err = dmu_object_info(mos, obj, &doi); diff --git a/module/zfs/dsl_bookmark.c b/module/zfs/dsl_bookmark.c index 447a3a2dc..5a7f034ce 100644 --- a/module/zfs/dsl_bookmark.c +++ b/module/zfs/dsl_bookmark.c @@ -34,10 +34,10 @@ static int dsl_bookmark_hold_ds(dsl_pool_t *dp, const char *fullname, dsl_dataset_t **dsp, void *tag, char **shortnamep) { - char buf[MAXNAMELEN]; + char buf[ZFS_MAX_DATASET_NAME_LEN]; char *hashp; - if (strlen(fullname) >= MAXNAMELEN) + if (strlen(fullname) >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); hashp = strchr(fullname, '#'); if (hashp == NULL) diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 9c275b234..5b7de74dc 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -25,6 +25,7 @@ * Copyright (c) 2014 RackTop Systems. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. + * Copyright 2016, OmniTI Computer Consulting, Inc. All rights reserved. */ #include <sys/dmu_objset.h> @@ -52,6 +53,9 @@ #include <sys/dsl_userhold.h> #include <sys/dsl_bookmark.h> #include <sys/policy.h> +#include <sys/dmu_send.h> +#include <sys/zio_compress.h> +#include <zfs_fletcher.h> /* * The SPA supports block sizes up to 16MB. However, very large blocks @@ -75,6 +79,8 @@ int zfs_max_recordsize = 1 * 1024 * 1024; extern inline dsl_dataset_phys_t *dsl_dataset_phys(dsl_dataset_t *ds); +extern int spa_asize_inflation; + /* * Figure out how much of this delta should be propogated to the dsl_dir * layer. If there's a refreservation, that space has already been @@ -664,22 +670,38 @@ dsl_dataset_name(dsl_dataset_t *ds, char *name) dsl_dir_name(ds->ds_dir, name); VERIFY0(dsl_dataset_get_snapname(ds)); if (ds->ds_snapname[0]) { - (void) strcat(name, "@"); + VERIFY3U(strlcat(name, "@", ZFS_MAX_DATASET_NAME_LEN), + <, ZFS_MAX_DATASET_NAME_LEN); /* * We use a "recursive" mutex so that we * can call dprintf_ds() with ds_lock held. 
*/ if (!MUTEX_HELD(&ds->ds_lock)) { mutex_enter(&ds->ds_lock); - (void) strcat(name, ds->ds_snapname); + VERIFY3U(strlcat(name, ds->ds_snapname, + ZFS_MAX_DATASET_NAME_LEN), <, + ZFS_MAX_DATASET_NAME_LEN); mutex_exit(&ds->ds_lock); } else { - (void) strcat(name, ds->ds_snapname); + VERIFY3U(strlcat(name, ds->ds_snapname, + ZFS_MAX_DATASET_NAME_LEN), <, + ZFS_MAX_DATASET_NAME_LEN); } } } } +int +dsl_dataset_namelen(dsl_dataset_t *ds) +{ + int len; + VERIFY0(dsl_dataset_get_snapname(ds)); + mutex_enter(&ds->ds_lock); + len = dsl_dir_namelen(ds->ds_dir) + 1 + strlen(ds->ds_snapname); + mutex_exit(&ds->ds_lock); + return (len); +} + void dsl_dataset_rele(dsl_dataset_t *ds, void *tag) { @@ -704,6 +726,7 @@ dsl_dataset_tryown(dsl_dataset_t *ds, void *tag) { boolean_t gotit = FALSE; + ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); mutex_enter(&ds->ds_lock); if (ds->ds_owner == NULL && !DS_IS_INCONSISTENT(ds)) { ds->ds_owner = tag; @@ -714,6 +737,16 @@ dsl_dataset_tryown(dsl_dataset_t *ds, void *tag) return (gotit); } +boolean_t +dsl_dataset_has_owner(dsl_dataset_t *ds) +{ + boolean_t rv; + mutex_enter(&ds->ds_lock); + rv = (ds->ds_owner != NULL); + mutex_exit(&ds->ds_lock); + return (rv); +} + static void dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx) { @@ -1238,10 +1271,10 @@ dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx) int error = 0; dsl_dataset_t *ds; char *name, *atp; - char dsname[MAXNAMELEN]; + char dsname[ZFS_MAX_DATASET_NAME_LEN]; name = nvpair_name(pair); - if (strlen(name) >= MAXNAMELEN) + if (strlen(name) >= ZFS_MAX_DATASET_NAME_LEN) error = SET_ERROR(ENAMETOOLONG); if (error == 0) { atp = strchr(name, '@'); @@ -1414,7 +1447,7 @@ dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx) pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) { dsl_dataset_t *ds; char *name, *atp; - char dsname[MAXNAMELEN]; + char dsname[ZFS_MAX_DATASET_NAME_LEN]; name = nvpair_name(pair); atp = strchr(name, '@'); @@ -1461,7 +1494,7 @@ dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors) suspended = fnvlist_alloc(); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { - char fsname[MAXNAMELEN]; + char fsname[ZFS_MAX_DATASET_NAME_LEN]; char *snapname = nvpair_name(pair); char *atp; void *cookie; @@ -1615,6 +1648,21 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid; + if (ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] != 0) { + VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, + ds->ds_object, DS_FIELD_RESUME_OBJECT, 8, 1, + &ds->ds_resume_object[tx->tx_txg & TXG_MASK], tx)); + VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, + ds->ds_object, DS_FIELD_RESUME_OFFSET, 8, 1, + &ds->ds_resume_offset[tx->tx_txg & TXG_MASK], tx)); + VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, + ds->ds_object, DS_FIELD_RESUME_BYTES, 8, 1, + &ds->ds_resume_bytes[tx->tx_txg & TXG_MASK], tx)); + ds->ds_resume_object[tx->tx_txg & TXG_MASK] = 0; + ds->ds_resume_offset[tx->tx_txg & TXG_MASK] = 0; + ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] = 0; + } + dmu_objset_sync(ds->ds_objset, zio, tx); for (f = 0; f < SPA_FEATURES; f++) { @@ -1655,7 +1703,7 @@ get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv) zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { dsl_dataset_t *clone; - char buf[ZFS_MAXNAMELEN]; + char buf[ZFS_MAX_DATASET_NAME_LEN]; VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool, za.za_first_integer, FTAG, 
&clone)); dsl_dir_name(clone->ds_dir, buf); @@ -1670,6 +1718,78 @@ fail: nvlist_free(propval); } +static void +get_receive_resume_stats(dsl_dataset_t *ds, nvlist_t *nv) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + if (dsl_dataset_has_resume_receive_state(ds)) { + char *str; + void *packed; + uint8_t *compressed; + uint64_t val; + nvlist_t *token_nv = fnvlist_alloc(); + size_t packed_size, compressed_size; + zio_cksum_t cksum; + char *propval; + char buf[MAXNAMELEN]; + int i; + + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val) == 0) { + fnvlist_add_uint64(token_nv, "fromguid", val); + } + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val) == 0) { + fnvlist_add_uint64(token_nv, "object", val); + } + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val) == 0) { + fnvlist_add_uint64(token_nv, "offset", val); + } + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_BYTES, sizeof (val), 1, &val) == 0) { + fnvlist_add_uint64(token_nv, "bytes", val); + } + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val) == 0) { + fnvlist_add_uint64(token_nv, "toguid", val); + } + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_TONAME, 1, sizeof (buf), buf) == 0) { + fnvlist_add_string(token_nv, "toname", buf); + } + if (zap_contains(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_EMBEDOK) == 0) { + fnvlist_add_boolean(token_nv, "embedok"); + } + packed = fnvlist_pack(token_nv, &packed_size); + fnvlist_free(token_nv); + compressed = kmem_alloc(packed_size, KM_SLEEP); + + compressed_size = gzip_compress(packed, compressed, + packed_size, packed_size, 6); + + fletcher_4_native(compressed, compressed_size, &cksum); + + str = kmem_alloc(compressed_size * 2 + 1, KM_SLEEP); + for (i = 0; i < compressed_size; i++) { + (void) sprintf(str + i * 2, "%02x", compressed[i]); + } + str[compressed_size * 2] = '\0'; + propval = kmem_asprintf("%u-%llx-%llx-%s", + ZFS_SEND_RESUME_TOKEN_VERSION, + (longlong_t)cksum.zc_word[0], + (longlong_t)packed_size, str); + dsl_prop_nvlist_add_string(nv, + ZFS_PROP_RECEIVE_RESUME_TOKEN, propval); + kmem_free(packed, packed_size); + kmem_free(str, compressed_size * 2 + 1); + kmem_free(compressed, packed_size); + strfree(propval); + } +} + void dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) { @@ -1693,7 +1813,7 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) get_clones_stat(ds, nv); } else { if (ds->ds_prev != NULL && ds->ds_prev != dp->dp_origin_snap) { - char buf[MAXNAMELEN]; + char buf[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_name(ds->ds_prev, buf); dsl_prop_nvlist_add_string(nv, ZFS_PROP_PREV_SNAP, buf); } @@ -1743,6 +1863,32 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) } } + if (!dsl_dataset_is_snapshot(ds)) { + /* 6 extra bytes for /%recv */ + char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; + dsl_dataset_t *recv_ds; + + /* + * A failed "newfs" (e.g. full) resumable receive leaves + * the stats set on this dataset. Check here for the prop. + */ + get_receive_resume_stats(ds, nv); + + /* + * A failed incremental resumable receive leaves the + * stats set on our child named "%recv". Check the child + * for the prop. 
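get_receive_resume_stats() above emits the token as <version>-<fletcher-4 word 0>-<packed nvlist size>-<hex of gzip'd payload>, per the kmem_asprintf("%u-%llx-%llx-%s", ...) call. A userland-flavored sketch of reading just that envelope; resume_token_envelope() is a hypothetical name, and the real decoding (hex, gzip, nvlist unpack) lives in libzfs:

#include <stdio.h>

/*
 * Read the three header fields of a receive_resume_token; the
 * hex-encoded payload follows the third '-'. Sketch only.
 */
static int
resume_token_envelope(const char *token, unsigned *version,
    unsigned long long *cksum_word0, unsigned long long *packed_size)
{
	if (sscanf(token, "%u-%llx-%llx", version, cksum_word0,
	    packed_size) != 3)
		return (-1);
	return (0);
}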
+ */ + dsl_dataset_name(ds, recvname); + if (strlcat(recvname, "/", sizeof (recvname)) < + sizeof (recvname) && + strlcat(recvname, recv_clone_name, sizeof (recvname)) < + sizeof (recvname) && + dsl_dataset_hold(dp, recvname, FTAG, &recv_ds) == 0) { + get_receive_resume_stats(recv_ds, nv); + dsl_dataset_rele(recv_ds, FTAG); + } + } } void @@ -1863,7 +2009,7 @@ dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp, /* dataset name + 1 for the "@" + the new snapshot name must fit */ if (dsl_dir_namelen(hds->ds_dir) + 1 + - strlen(ddrsa->ddrsa_newsnapname) >= MAXNAMELEN) + strlen(ddrsa->ddrsa_newsnapname) >= ZFS_MAX_DATASET_NAME_LEN) error = SET_ERROR(ENAMETOOLONG); return (error); @@ -1970,7 +2116,8 @@ dsl_dataset_rename_snapshot(const char *fsname, * only one long hold on the dataset. We're not allowed to change anything here * so we don't permanently release the long hold or regular hold here. We want * to do this only when syncing to avoid the dataset unexpectedly going away - * when we release the long hold. + * when we release the long hold. Allow a long hold to exist for volumes; this + * may occur when asynchronously registering the minor with the kernel. */ static int dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx) @@ -1985,7 +2132,7 @@ dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx) dsl_dataset_long_rele(ds, owner); } - held = dsl_dataset_long_held(ds); + held = (dsl_dataset_long_held(ds) && (ds->ds_owner != zvol_tag)); if (owner != NULL) dsl_dataset_long_hold(ds, owner); @@ -2095,7 +2242,7 @@ dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx) dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds, *clone; uint64_t cloneobj; - char namebuf[ZFS_MAXNAMELEN]; + char namebuf[ZFS_MAX_DATASET_NAME_LEN]; VERIFY0(dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds)); @@ -2648,7 +2795,7 @@ promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag) * Promote a clone. * * If it fails due to a conflicting snapshot name, "conflsnap" will be filled - * in with the name. (It must be at least MAXNAMELEN bytes long.) + * in with the name. (It must be at least ZFS_MAX_DATASET_NAME_LEN bytes long.) */ int dsl_dataset_promote(const char *name, char *conflsnap) @@ -2685,6 +2832,11 @@ int dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone, dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx) { + /* + * "slack" factor for received datasets with refquota set on them. + * See the bottom of this function for details on its use. + */ + uint64_t refquota_slack = DMU_MAX_ACCESS * spa_asize_inflation; int64_t unused_refres_delta; /* they should both be heads */ @@ -2727,10 +2879,22 @@ dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone, dsl_dir_space_available(origin_head->ds_dir, NULL, 0, TRUE)) return (SET_ERROR(ENOSPC)); - /* clone can't be over the head's refquota */ + /* + * The clone can't be too much over the head's refquota. + * + * To ensure that the entire refquota can be used, we allow one + * transaction to exceed the refquota. Therefore, this check + * needs to also allow for the space referenced to be more than the + * refquota. The maximum amount of space that one transaction can use + * on disk is DMU_MAX_ACCESS * spa_asize_inflation. Allowing this + * overage ensures that we are able to receive a filesystem that + * exceeds the refquota on the source system. + * + * So that overage is the refquota_slack we use below.
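The check that follows can be restated as a standalone predicate; here max_tx_space stands in for DMU_MAX_ACCESS * spa_asize_inflation, whose values come from the running kernel (a sketch, not the patch's code):

#include <stdint.h>

/* nonzero when the clone swap must be rejected with EDQUOT */
static int
clone_over_refquota(uint64_t referenced, uint64_t quota,
    uint64_t max_tx_space)
{
	if (quota == 0)
		return (0);	/* no refquota configured */
	return (referenced > quota + max_tx_space);
}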
+ */ if (origin_head->ds_quota != 0 && dsl_dataset_phys(clone)->ds_referenced_bytes > - origin_head->ds_quota) + origin_head->ds_quota + refquota_slack) return (SET_ERROR(EDQUOT)); return (0); @@ -2745,8 +2909,13 @@ dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, int64_t unused_refres_delta; ASSERT(clone->ds_reserved == 0); + /* + * NOTE: On DEBUG kernels there could be a race between this and + * the check function if spa_asize_inflation is adjusted... + */ ASSERT(origin_head->ds_quota == 0 || - dsl_dataset_phys(clone)->ds_unique_bytes <= origin_head->ds_quota); + dsl_dataset_phys(clone)->ds_unique_bytes <= origin_head->ds_quota + + DMU_MAX_ACCESS * spa_asize_inflation); ASSERT3P(clone->ds_prev, ==, origin_head->ds_prev); /* @@ -3391,6 +3560,23 @@ dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx) dmu_object_zapify(mos, ds->ds_object, DMU_OT_DSL_DATASET, tx); } +boolean_t +dsl_dataset_is_zapified(dsl_dataset_t *ds) +{ + dmu_object_info_t doi; + + dmu_object_info_from_db(ds->ds_dbuf, &doi); + return (doi.doi_type == DMU_OTN_ZAP_METADATA); +} + +boolean_t +dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds) +{ + return (dsl_dataset_is_zapified(ds) && + zap_contains(ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_object, DS_FIELD_RESUME_TOGUID) == 0); +} + #if defined(_KERNEL) && defined(HAVE_SPL) #if defined(_LP64) module_param(zfs_max_recordsize, int, 0644); diff --git a/module/zfs/dsl_deleg.c b/module/zfs/dsl_deleg.c index 952422be2..eb39cff57 100644 --- a/module/zfs/dsl_deleg.c +++ b/module/zfs/dsl_deleg.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2014 by Delphix. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. */ /* @@ -330,7 +330,7 @@ dsl_deleg_get(const char *ddname, nvlist_t **nvp) za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); basezc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); baseza = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); - source = kmem_alloc(MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, KM_SLEEP); + source = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); for (dd = startdd; dd != NULL; dd = dd->dd_parent) { @@ -370,7 +370,7 @@ dsl_deleg_get(const char *ddname, nvlist_t **nvp) nvlist_free(sp_nvp); } - kmem_free(source, MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); + kmem_free(source, ZFS_MAX_DATASET_NAME_LEN); kmem_free(baseza, sizeof (zap_attribute_t)); kmem_free(basezc, sizeof (zap_cursor_t)); kmem_free(za, sizeof (zap_attribute_t)); diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c index d7c34c9a4..716081ba3 100644 --- a/module/zfs/dsl_destroy.c +++ b/module/zfs/dsl_destroy.c @@ -978,9 +978,17 @@ dsl_destroy_inconsistent(const char *dsname, void *arg) objset_t *os; if (dmu_objset_hold(dsname, FTAG, &os) == 0) { - boolean_t inconsistent = DS_IS_INCONSISTENT(dmu_objset_ds(os)); + boolean_t need_destroy = DS_IS_INCONSISTENT(dmu_objset_ds(os)); + + /* + * If the dataset is inconsistent because a resumable receive + * has failed, then do not destroy it. 
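The effect of the check that follows, reduced to a one-line predicate (a sketch of the same logic, not the patch's code):

#include <stdbool.h>

/* auto-destroy only when inconsistent and not a paused resumable receive */
static bool
should_auto_destroy(bool inconsistent, bool has_resume_state)
{
	return (inconsistent && !has_resume_state);
}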
+ */ + if (dsl_dataset_has_resume_receive_state(dmu_objset_ds(os))) + need_destroy = B_FALSE; + dmu_objset_rele(os, FTAG); - if (inconsistent) + if (need_destroy) (void) dsl_destroy_head(dsname); } return (0); diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c index 8983e0793..ae67b362e 100644 --- a/module/zfs/dsl_dir.c +++ b/module/zfs/dsl_dir.c @@ -299,13 +299,14 @@ dsl_dir_async_rele(dsl_dir_t *dd, void *tag) dmu_buf_rele(dd->dd_dbuf, tag); } -/* buf must be long enough (MAXNAMELEN + strlen(MOS_DIR_NAME) + 1 should do) */ +/* buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes */ void dsl_dir_name(dsl_dir_t *dd, char *buf) { if (dd->dd_parent) { dsl_dir_name(dd->dd_parent, buf); - (void) strcat(buf, "/"); + VERIFY3U(strlcat(buf, "/", ZFS_MAX_DATASET_NAME_LEN), <, + ZFS_MAX_DATASET_NAME_LEN); } else { buf[0] = '\0'; } @@ -315,10 +316,12 @@ dsl_dir_name(dsl_dir_t *dd, char *buf) * dprintf_dd() with dd_lock held */ mutex_enter(&dd->dd_lock); - (void) strcat(buf, dd->dd_myname); + VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN), + <, ZFS_MAX_DATASET_NAME_LEN); mutex_exit(&dd->dd_lock); } else { - (void) strcat(buf, dd->dd_myname); + VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN), + <, ZFS_MAX_DATASET_NAME_LEN); } } @@ -367,12 +370,12 @@ getcomponent(const char *path, char *component, const char **nextp) if (p != NULL && (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0')) return (SET_ERROR(EINVAL)); - if (strlen(path) >= MAXNAMELEN) + if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); (void) strcpy(component, path); p = NULL; } else if (p[0] == '/') { - if (p - path >= MAXNAMELEN) + if (p - path >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); (void) strncpy(component, path, p - path); component[p - path] = '\0'; @@ -384,7 +387,7 @@ getcomponent(const char *path, char *component, const char **nextp) */ if (strchr(path, '/')) return (SET_ERROR(EINVAL)); - if (p - path >= MAXNAMELEN) + if (p - path >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); (void) strncpy(component, path, p - path); component[p - path] = '\0'; @@ -412,7 +415,7 @@ dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, dsl_dir_t *dd; uint64_t ddobj; - buf = kmem_alloc(MAXNAMELEN, KM_SLEEP); + buf = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); err = getcomponent(name, buf, &next); if (err != 0) goto error; @@ -479,7 +482,7 @@ dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, *tailp = next; *ddp = dd; error: - kmem_free(buf, MAXNAMELEN); + kmem_free(buf, ZFS_MAX_DATASET_NAME_LEN); return (err); } @@ -974,7 +977,7 @@ dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) if (dsl_dir_is_clone(dd)) { dsl_dataset_t *ds; - char buf[MAXNAMELEN]; + char buf[ZFS_MAX_DATASET_NAME_LEN]; VERIFY0(dsl_dataset_hold_obj(dd->dd_pool, dsl_dir_phys(dd)->dd_origin_obj, FTAG, &ds)); @@ -1691,11 +1694,11 @@ static int dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { int *deltap = arg; - char namebuf[MAXNAMELEN]; + char namebuf[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_name(ds, namebuf); - if (strlen(namebuf) + *deltap >= MAXNAMELEN) + if (strlen(namebuf) + *deltap >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); return (0); } diff --git a/module/zfs/dsl_prop.c b/module/zfs/dsl_prop.c index 361473275..66e899a57 100644 --- a/module/zfs/dsl_prop.c +++ b/module/zfs/dsl_prop.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. 
All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2013 Martin Matuska. All rights reserved. * Copyright 2015, Joyent, Inc. */ @@ -1095,7 +1095,7 @@ dsl_prop_get_all_ds(dsl_dataset_t *ds, nvlist_t **nvp, dsl_pool_t *dp = dd->dd_pool; objset_t *mos = dp->dp_meta_objset; int err = 0; - char setpoint[MAXNAMELEN]; + char setpoint[ZFS_MAX_DATASET_NAME_LEN]; VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 72163521e..6c5f1f0b5 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -1115,7 +1115,7 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) * rootbp's birth time is < cur_min_txg. Then we will * add the next snapshots/clones to the work queue. */ - char *dsname = kmem_alloc(MAXNAMELEN, KM_SLEEP); + char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); dsl_dataset_name(ds, dsname); zfs_dbgmsg("scanning dataset %llu (%s) is unnecessary because " "cur_min_txg (%llu) >= max_txg (%llu)", @@ -1146,7 +1146,7 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_scan_visit_rootbp(scn, ds, &dsl_dataset_phys(ds)->ds_bp, tx); - dsname = kmem_alloc(ZFS_MAXNAMELEN, KM_SLEEP); + dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); dsl_dataset_name(ds, dsname); zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; " "pausing=%u", @@ -1154,7 +1154,7 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) (longlong_t)scn->scn_phys.scn_cur_min_txg, (longlong_t)scn->scn_phys.scn_cur_max_txg, (int)scn->scn_pausing); - kmem_free(dsname, ZFS_MAXNAMELEN); + kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN); if (scn->scn_pausing) goto out; diff --git a/module/zfs/dsl_userhold.c b/module/zfs/dsl_userhold.c index 1b234ed48..a6d1aa937 100644 --- a/module/zfs/dsl_userhold.c +++ b/module/zfs/dsl_userhold.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. 
*/ @@ -181,7 +181,7 @@ dsl_dataset_user_hold_sync_one_impl(nvlist_t *tmpholds, dsl_dataset_t *ds, } typedef struct zfs_hold_cleanup_arg { - char zhca_spaname[MAXNAMELEN]; + char zhca_spaname[ZFS_MAX_DATASET_NAME_LEN]; uint64_t zhca_spa_load_guid; nvlist_t *zhca_holds; } zfs_hold_cleanup_arg_t; @@ -580,7 +580,7 @@ dsl_dataset_user_release_impl(nvlist_t *holds, nvlist_t *errlist, error = dsl_dataset_hold_obj_string(tmpdp, nvpair_name(pair), FTAG, &ds); if (error == 0) { - char name[MAXNAMELEN]; + char name[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_name(ds, name); dsl_pool_config_exit(tmpdp, FTAG); dsl_dataset_rele(ds, FTAG); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index d1aefe585..26181af84 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -361,8 +361,7 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) break; } - strval = kmem_alloc( - MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, + strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); dsl_dataset_name(ds, strval); dsl_dataset_rele(ds, FTAG); @@ -375,8 +374,7 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) spa_prop_add_list(*nvp, prop, strval, intval, src); if (strval != NULL) - kmem_free(strval, - MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); + kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN); break; @@ -2018,6 +2016,16 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, return (0); } +/* ARGSUSED */ +int +verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) +{ + if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN) + return (SET_ERROR(ENAMETOOLONG)); + + return (0); +} + static int spa_load_verify(spa_t *spa) { @@ -2032,6 +2040,14 @@ spa_load_verify(spa_t *spa) if (policy.zrp_request & ZPOOL_NEVER_REWIND) return (0); + dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); + error = dmu_objset_find_dp(spa->spa_dsl_pool, + spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL, + DS_FIND_CHILDREN); + dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); + if (error != 0) + return (error); + rio = zio_root(spa, NULL, &sle, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); diff --git a/module/zfs/spa_history.c b/module/zfs/spa_history.c index 01aa4641e..cf6fc224a 100644 --- a/module/zfs/spa_history.c +++ b/module/zfs/spa_history.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2014 by Delphix. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. */ #include <sys/spa.h> @@ -493,7 +493,7 @@ spa_history_log_internal_ds(dsl_dataset_t *ds, const char *operation, dmu_tx_t *tx, const char *fmt, ...) { va_list adx; - char namebuf[MAXNAMELEN]; + char namebuf[ZFS_MAX_DATASET_NAME_LEN]; nvlist_t *nvl = fnvlist_alloc(); ASSERT(tx != NULL); @@ -512,7 +512,7 @@ spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation, dmu_tx_t *tx, const char *fmt, ...) 
{ va_list adx; - char namebuf[MAXNAMELEN]; + char namebuf[ZFS_MAX_DATASET_NAME_LEN]; nvlist_t *nvl = fnvlist_alloc(); ASSERT(tx != NULL); diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c index e47cfc878..d279d1828 100644 --- a/module/zfs/zfs_ctldir.c +++ b/module/zfs/zfs_ctldir.c @@ -749,12 +749,13 @@ zfsctl_snapshot_path_objset(zfs_sb_t *zsb, uint64_t objsetid, return (ENOENT); cookie = spl_fstrans_mark(); - snapname = kmem_alloc(MAXNAMELEN, KM_SLEEP); + snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); while (error == 0) { dsl_pool_config_enter(dmu_objset_pool(os), FTAG); - error = dmu_snapshot_list_next(zsb->z_os, MAXNAMELEN, - snapname, &id, &pos, &case_conflict); + error = dmu_snapshot_list_next(zsb->z_os, + ZFS_MAX_DATASET_NAME_LEN, snapname, &id, &pos, + &case_conflict); dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (error) goto out; @@ -767,7 +768,7 @@ zfsctl_snapshot_path_objset(zfs_sb_t *zsb, uint64_t objsetid, snprintf(full_path, path_len - 1, "%s/.zfs/snapshot/%s", zsb->z_mntopts->z_mntpoint, snapname); out: - kmem_free(snapname, MAXNAMELEN); + kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN); spl_fstrans_unmark(cookie); return (error); @@ -854,14 +855,14 @@ zfsctl_snapdir_rename(struct inode *sdip, char *snm, ZFS_ENTER(zsb); - to = kmem_alloc(MAXNAMELEN, KM_SLEEP); - from = kmem_alloc(MAXNAMELEN, KM_SLEEP); - real = kmem_alloc(MAXNAMELEN, KM_SLEEP); - fsname = kmem_alloc(MAXNAMELEN, KM_SLEEP); + to = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); + from = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); + real = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); + fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); if (zsb->z_case == ZFS_CASE_INSENSITIVE) { error = dmu_snapshot_realname(zsb->z_os, snm, real, - MAXNAMELEN, NULL); + ZFS_MAX_DATASET_NAME_LEN, NULL); if (error == 0) { snm = real; } else if (error != ENOTSUP) { @@ -871,9 +872,11 @@ zfsctl_snapdir_rename(struct inode *sdip, char *snm, dmu_objset_name(zsb->z_os, fsname); - error = zfsctl_snapshot_name(ITOZSB(sdip), snm, MAXNAMELEN, from); + error = zfsctl_snapshot_name(ITOZSB(sdip), snm, + ZFS_MAX_DATASET_NAME_LEN, from); if (error == 0) - error = zfsctl_snapshot_name(ITOZSB(tdip), tnm, MAXNAMELEN, to); + error = zfsctl_snapshot_name(ITOZSB(tdip), tnm, + ZFS_MAX_DATASET_NAME_LEN, to); if (error == 0) error = zfs_secpolicy_rename_perms(from, to, cr); if (error != 0) @@ -903,10 +906,10 @@ zfsctl_snapdir_rename(struct inode *sdip, char *snm, rw_exit(&zfs_snapshot_lock); out: - kmem_free(from, MAXNAMELEN); - kmem_free(to, MAXNAMELEN); - kmem_free(real, MAXNAMELEN); - kmem_free(fsname, MAXNAMELEN); + kmem_free(from, ZFS_MAX_DATASET_NAME_LEN); + kmem_free(to, ZFS_MAX_DATASET_NAME_LEN); + kmem_free(real, ZFS_MAX_DATASET_NAME_LEN); + kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN); ZFS_EXIT(zsb); @@ -929,12 +932,12 @@ zfsctl_snapdir_remove(struct inode *dip, char *name, cred_t *cr, int flags) ZFS_ENTER(zsb); - snapname = kmem_alloc(MAXNAMELEN, KM_SLEEP); - real = kmem_alloc(MAXNAMELEN, KM_SLEEP); + snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); + real = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); if (zsb->z_case == ZFS_CASE_INSENSITIVE) { error = dmu_snapshot_realname(zsb->z_os, name, real, - MAXNAMELEN, NULL); + ZFS_MAX_DATASET_NAME_LEN, NULL); if (error == 0) { name = real; } else if (error != ENOTSUP) { @@ -942,7 +945,8 @@ zfsctl_snapdir_remove(struct inode *dip, char *name, cred_t *cr, int flags) } } - error = zfsctl_snapshot_name(ITOZSB(dip), name, MAXNAMELEN, snapname); + error = 
zfsctl_snapshot_name(ITOZSB(dip), name, + ZFS_MAX_DATASET_NAME_LEN, snapname); if (error == 0) error = zfs_secpolicy_destroy_perms(snapname, cr); if (error != 0) @@ -952,8 +956,8 @@ zfsctl_snapdir_remove(struct inode *dip, char *name, cred_t *cr, int flags) if ((error == 0) || (error == ENOENT)) error = dsl_destroy_snapshot(snapname, B_FALSE); out: - kmem_free(snapname, MAXNAMELEN); - kmem_free(real, MAXNAMELEN); + kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN); + kmem_free(real, ZFS_MAX_DATASET_NAME_LEN); ZFS_EXIT(zsb); @@ -975,7 +979,7 @@ zfsctl_snapdir_mkdir(struct inode *dip, char *dirname, vattr_t *vap, if (!zfs_admin_snapshot) return (EACCES); - dsname = kmem_alloc(MAXNAMELEN, KM_SLEEP); + dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); if (zfs_component_namecheck(dirname, NULL, NULL) != 0) { error = SET_ERROR(EILSEQ); @@ -997,7 +1001,7 @@ zfsctl_snapdir_mkdir(struct inode *dip, char *dirname, vattr_t *vap, 0, cr, NULL, NULL); } out: - kmem_free(dsname, MAXNAMELEN); + kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN); return (error); } @@ -1075,11 +1079,11 @@ zfsctl_snapshot_mount(struct path *path, int flags) zsb = ITOZSB(ip); ZFS_ENTER(zsb); - full_name = kmem_zalloc(MAXNAMELEN, KM_SLEEP); + full_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); full_path = kmem_zalloc(MAXPATHLEN, KM_SLEEP); error = zfsctl_snapshot_name(zsb, dname(dentry), - MAXNAMELEN, full_name); + ZFS_MAX_DATASET_NAME_LEN, full_name); if (error) goto error; @@ -1153,7 +1157,7 @@ zfsctl_snapshot_mount(struct path *path, int flags) } path_put(&spath); error: - kmem_free(full_name, MAXNAMELEN); + kmem_free(full_name, ZFS_MAX_DATASET_NAME_LEN); kmem_free(full_path, MAXPATHLEN); ZFS_EXIT(zsb); diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 30338ac14..09f83a5cf 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Portions Copyright 2011 Martin Matuska + * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. * Portions Copyright 2012 Pawel Jakub Dawidek <[email protected]> * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. @@ -603,7 +604,7 @@ zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, case ZFS_PROP_SNAPSHOT_LIMIT: if (!INGLOBALZONE(curproc)) { uint64_t zoned; - char setpoint[MAXNAMELEN]; + char setpoint[ZFS_MAX_DATASET_NAME_LEN]; /* * Unprivileged users are allowed to modify the * limit on things *under* (ie. 
contained by) @@ -845,7 +846,7 @@ zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) int zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) { - char parentname[MAXNAMELEN]; + char parentname[ZFS_MAX_DATASET_NAME_LEN]; int error; if ((error = zfs_secpolicy_write_perms(from, @@ -898,7 +899,7 @@ zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &clone); if (error == 0) { - char parentname[MAXNAMELEN]; + char parentname[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_t *origin = NULL; dsl_dir_t *dd; dd = clone->ds_dir; @@ -944,6 +945,13 @@ zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) ZFS_DELEG_PERM_CREATE, cr)); } +/* ARGSUSED */ +static int +zfs_secpolicy_recv_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + return (zfs_secpolicy_recv(zc, innvl, cr)); +} + int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) { @@ -1068,7 +1076,7 @@ zfs_secpolicy_log_history(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) static int zfs_secpolicy_create_clone(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { - char parentname[MAXNAMELEN]; + char parentname[ZFS_MAX_DATASET_NAME_LEN]; int error; char *origin; @@ -1211,7 +1219,7 @@ zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; pair = nvlist_next_nvpair(holds, pair)) { - char fsname[MAXNAMELEN]; + char fsname[ZFS_MAX_DATASET_NAME_LEN]; error = dmu_fsname(nvpair_name(pair), fsname); if (error != 0) return (error); @@ -1232,7 +1240,7 @@ zfs_secpolicy_release(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { - char fsname[MAXNAMELEN]; + char fsname[ZFS_MAX_DATASET_NAME_LEN]; error = dmu_fsname(nvpair_name(pair), fsname); if (error != 0) return (error); @@ -2252,7 +2260,8 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) * A dataset name of maximum length cannot have any snapshots, * so exit immediately. */ - if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= MAXNAMELEN) { + if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= + ZFS_MAX_DATASET_NAME_LEN) { dmu_objset_rele(os, FTAG); return (SET_ERROR(ESRCH)); } @@ -3040,7 +3049,7 @@ zfs_fill_zplprops(const char *dataset, nvlist_t *createprops, boolean_t fuids_ok, sa_ok; uint64_t zplver = ZPL_VERSION; objset_t *os = NULL; - char parentname[MAXNAMELEN]; + char parentname[ZFS_MAX_DATASET_NAME_LEN]; char *cp; spa_t *spa; uint64_t spa_vers; @@ -3406,7 +3415,7 @@ zfs_destroy_unmount_origin(const char *fsname) return; ds = dmu_objset_ds(os); if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev)) { - char originname[MAXNAMELEN]; + char originname[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_name(ds->ds_prev, originname); dmu_objset_rele(os, FTAG); (void) zfs_unmount_snap(originname); @@ -3990,77 +3999,93 @@ next: } } +/* + * Extract properties that cannot be set PRIOR to the receipt of a dataset. + * For example, refquota cannot be set until after the receipt of a dataset, + * because in replication streams, an older/earlier snapshot may exceed the + * refquota. We want to receive the older/earlier snapshot, but setting + * refquota pre-receipt will set the dsl's ACTUAL quota, which will prevent + * the older/earlier snapshot from being received (with EDQUOT). + * + * The ZFS test "zfs_receive_011_pos" demonstrates such a scenario. 
+ * + * libzfs will need to be judicious in handling errors encountered by props + * extracted by this function. + */ +static nvlist_t * +extract_delay_props(nvlist_t *props) +{ + nvlist_t *delayprops; + nvpair_t *nvp, *tmp; + static const zfs_prop_t delayable[] = { ZFS_PROP_REFQUOTA, 0 }; + int i; + + VERIFY(nvlist_alloc(&delayprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + for (nvp = nvlist_next_nvpair(props, NULL); nvp != NULL; + nvp = nvlist_next_nvpair(props, nvp)) { + /* + * strcmp() is safe because zfs_prop_to_name() always returns + * a bounded string. + */ + for (i = 0; delayable[i] != 0; i++) { + if (strcmp(zfs_prop_to_name(delayable[i]), + nvpair_name(nvp)) == 0) { + break; + } + } + if (delayable[i] != 0) { + tmp = nvlist_prev_nvpair(props, nvp); + VERIFY(nvlist_add_nvpair(delayprops, nvp) == 0); + VERIFY(nvlist_remove_nvpair(props, nvp) == 0); + nvp = tmp; + } + } + + if (nvlist_empty(delayprops)) { + nvlist_free(delayprops); + delayprops = NULL; + } + return (delayprops); +} + #ifdef DEBUG static boolean_t zfs_ioc_recv_inject_err; #endif /* - * inputs: - * zc_name name of containing filesystem - * zc_nvlist_src{_size} nvlist of properties to apply - * zc_value name of snapshot to create - * zc_string name of clone origin (if DRR_FLAG_CLONE) - * zc_cookie file descriptor to recv from - * zc_begin_record the BEGIN record of the stream (not byteswapped) - * zc_guid force flag - * zc_cleanup_fd cleanup-on-exit file descriptor - * zc_action_handle handle for this guid/ds mapping (or zero on first call) - * - * outputs: - * zc_cookie number of bytes read - * zc_nvlist_dst{_size} error for each unapplied received property - * zc_obj zprop_errflags_t - * zc_action_handle handle for this guid/ds mapping + * On failure the 'errors' nvlist may be allocated and will contain + * descriptions of the failures. It is the caller's responsibility to free it.
*/ static int -zfs_ioc_recv(zfs_cmd_t *zc) +zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, + nvlist_t *props, boolean_t force, boolean_t resumable, int input_fd, + dmu_replay_record_t *begin_record, int cleanup_fd, uint64_t *read_bytes, + uint64_t *errflags, uint64_t *action_handle, nvlist_t **errors) { - file_t *fp; dmu_recv_cookie_t drc; - boolean_t force = (boolean_t)zc->zc_guid; - int fd; int error = 0; int props_error = 0; - nvlist_t *errors; offset_t off; - nvlist_t *props = NULL; /* sent properties */ + nvlist_t *delayprops = NULL; /* sent properties applied post-receive */ nvlist_t *origprops = NULL; /* existing properties */ - char *origin = NULL; - char *tosnap; - char tofs[ZFS_MAXNAMELEN]; boolean_t first_recvd_props = B_FALSE; + file_t *input_fp; - if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || - strchr(zc->zc_value, '@') == NULL || - strchr(zc->zc_value, '%')) - return (SET_ERROR(EINVAL)); - - (void) strcpy(tofs, zc->zc_value); - tosnap = strchr(tofs, '@'); - *tosnap++ = '\0'; - - if (zc->zc_nvlist_src != 0 && - (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - zc->zc_iflags, &props)) != 0) - return (error); - - fd = zc->zc_cookie; - fp = getf(fd); - if (fp == NULL) { - nvlist_free(props); + *errors = NULL; + input_fp = getf(input_fd); + if (input_fp == NULL) return (SET_ERROR(EBADF)); - } - - VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0); - - if (zc->zc_string[0]) - origin = zc->zc_string; error = dmu_recv_begin(tofs, tosnap, - &zc->zc_begin_record, force, origin, &drc); + begin_record, force, resumable, origin, &drc); if (error != 0) goto out; + *read_bytes = 0; + *errflags = 0; + *errors = fnvlist_alloc(); + /* * Set properties before we receive the stream so that they are applied * to the new data. Note that we must call dmu_recv_stream() if @@ -4090,14 +4115,14 @@ zfs_ioc_recv(zfs_cmd_t *zc) if (!first_recvd_props) props_reduce(props, origprops); if (zfs_check_clearable(tofs, origprops, &errlist) != 0) - (void) nvlist_merge(errors, errlist, 0); + (void) nvlist_merge(*errors, errlist, 0); nvlist_free(errlist); if (clear_received_props(tofs, origprops, first_recvd_props ? NULL : props) != 0) - zc->zc_obj |= ZPROP_ERR_NOCLEAR; + *errflags |= ZPROP_ERR_NOCLEAR; } else { - zc->zc_obj |= ZPROP_ERR_NOCLEAR; + *errflags |= ZPROP_ERR_NOCLEAR; } } @@ -4105,24 +4130,15 @@ zfs_ioc_recv(zfs_cmd_t *zc) props_error = dsl_prop_set_hasrecvd(tofs); if (props_error == 0) { + delayprops = extract_delay_props(props); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, - props, errors); + props, *errors); } } - if (zc->zc_nvlist_dst_size != 0 && - (nvlist_smush(errors, zc->zc_nvlist_dst_size) != 0 || - put_nvlist(zc, errors) != 0)) { - /* - * Caller made zc->zc_nvlist_dst less than the minimum expected - * size or supplied an invalid address. - */ - props_error = SET_ERROR(EINVAL); - } - - off = fp->f_offset; - error = dmu_recv_stream(&drc, fp->f_vnode, &off, zc->zc_cleanup_fd, - &zc->zc_action_handle); + off = input_fp->f_offset; + error = dmu_recv_stream(&drc, input_fp->f_vnode, &off, cleanup_fd, + action_handle); if (error == 0) { zfs_sb_t *zsb = NULL; @@ -4144,11 +4160,32 @@ zfs_ioc_recv(zfs_cmd_t *zc) } else { error = dmu_recv_end(&drc, NULL); } + + /* Set delayed properties now, after we're done receiving. 
*/ + if (delayprops != NULL && error == 0) { + (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, + delayprops, *errors); + } } - zc->zc_cookie = off - fp->f_offset; - if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) - fp->f_offset = off; + if (delayprops != NULL) { + /* + * Merge delayed props back in with initial props, in case + * we're DEBUG and zfs_ioc_recv_inject_err is set (which means + * we have to make sure clear_received_props() includes + * the delayed properties). + * + * Since zfs_ioc_recv_inject_err is only in DEBUG kernels, + * using ASSERT() will be just like a VERIFY. + */ + ASSERT(nvlist_merge(props, delayprops, 0) == 0); + nvlist_free(delayprops); + } + + + *read_bytes = off - input_fp->f_offset; + if (VOP_SEEK(input_fp->f_vnode, input_fp->f_offset, &off, NULL) == 0) + input_fp->f_offset = off; #ifdef DEBUG if (zfs_ioc_recv_inject_err) { @@ -4167,14 +4204,14 @@ zfs_ioc_recv(zfs_cmd_t *zc) * Since we may have left a $recvd value on the * system, we can't clear the $hasrecvd flag. */ - zc->zc_obj |= ZPROP_ERR_NORESTORE; + *errflags |= ZPROP_ERR_NORESTORE; } else if (first_recvd_props) { dsl_prop_unset_hasrecvd(tofs); } if (origprops == NULL && !drc.drc_newfs) { /* We failed to stash the original properties. */ - zc->zc_obj |= ZPROP_ERR_NORESTORE; + *errflags |= ZPROP_ERR_NORESTORE; } /* @@ -4191,14 +4228,12 @@ zfs_ioc_recv(zfs_cmd_t *zc) * We stashed the original properties but failed to * restore them. */ - zc->zc_obj |= ZPROP_ERR_NORESTORE; + *errflags |= ZPROP_ERR_NORESTORE; } } out: - nvlist_free(props); + releasef(input_fd); nvlist_free(origprops); - nvlist_free(errors); - releasef(fd); if (error == 0) error = props_error; @@ -4208,6 +4243,176 @@ out: /* * inputs: + * zc_name name of containing filesystem (unused) + * zc_nvlist_src{_size} nvlist of properties to apply + * zc_value name of snapshot to create + * zc_string name of clone origin (if DRR_FLAG_CLONE) + * zc_cookie file descriptor to recv from + * zc_begin_record the BEGIN record of the stream (not byteswapped) + * zc_guid force flag + * zc_cleanup_fd cleanup-on-exit file descriptor + * zc_action_handle handle for this guid/ds mapping (or zero on first call) + * + * outputs: + * zc_cookie number of bytes read + * zc_obj zprop_errflags_t + * zc_action_handle handle for this guid/ds mapping + * zc_nvlist_dst{_size} error for each unapplied received property + */ +static int +zfs_ioc_recv(zfs_cmd_t *zc) +{ + dmu_replay_record_t begin_record; + nvlist_t *errors = NULL; + nvlist_t *props = NULL; + char *origin = NULL; + char *tosnap; + char tofs[ZFS_MAX_DATASET_NAME_LEN]; + int error = 0; + + if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || + strchr(zc->zc_value, '@') == NULL || + strchr(zc->zc_value, '%')) + return (SET_ERROR(EINVAL)); + + (void) strcpy(tofs, zc->zc_value); + tosnap = strchr(tofs, '@'); + *tosnap++ = '\0'; + + if (zc->zc_nvlist_src != 0 && + (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &props)) != 0) + return (error); + + if (zc->zc_string[0]) + origin = zc->zc_string; + + begin_record.drr_type = DRR_BEGIN; + begin_record.drr_payloadlen = 0; + begin_record.drr_u.drr_begin = zc->zc_begin_record; + + error = zfs_ioc_recv_impl(tofs, tosnap, origin, props, zc->zc_guid, + B_FALSE, zc->zc_cookie, &begin_record, zc->zc_cleanup_fd, + &zc->zc_cookie, &zc->zc_obj, &zc->zc_action_handle, &errors); + nvlist_free(props); + + /* + * Now that all props, initial and delayed, are set, report the prop + * errors to the caller. 
+ */ + if (zc->zc_nvlist_dst_size != 0 && errors != NULL && + (nvlist_smush(errors, zc->zc_nvlist_dst_size) != 0 || + put_nvlist(zc, errors) != 0)) { + /* + * Caller made zc->zc_nvlist_dst less than the minimum expected + * size or supplied an invalid address. + */ + error = SET_ERROR(EINVAL); + } + + nvlist_free(errors); + + return (error); +} + +/* + * innvl: { + * "snapname" -> full name of the snapshot to create + * (optional) "props" -> properties to set (nvlist) + * (optional) "origin" -> name of clone origin (DRR_FLAG_CLONE) + * "begin_record" -> non-byteswapped dmu_replay_record_t + * "input_fd" -> file descriptor to read stream from (int32) + * (optional) "force" -> force flag (value ignored) + * (optional) "resumable" -> resumable flag (value ignored) + * (optional) "cleanup_fd" -> cleanup-on-exit file descriptor + * (optional) "action_handle" -> handle for this guid/ds mapping + * } + * + * outnvl: { + * "read_bytes" -> number of bytes read + * "error_flags" -> zprop_errflags_t + * "action_handle" -> handle for this guid/ds mapping + * "errors" -> error for each unapplied received property (nvlist) + * } + */ +static int +zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) +{ + dmu_replay_record_t *begin_record; + uint_t begin_record_size; + nvlist_t *errors = NULL; + nvlist_t *props = NULL; + char *snapname = NULL; + char *origin = NULL; + char *tosnap; + char tofs[ZFS_MAX_DATASET_NAME_LEN]; + boolean_t force; + boolean_t resumable; + uint64_t action_handle = 0; + uint64_t read_bytes = 0; + uint64_t errflags = 0; + int input_fd = -1; + int cleanup_fd = -1; + int error; + + error = nvlist_lookup_string(innvl, "snapname", &snapname); + if (error != 0) + return (SET_ERROR(EINVAL)); + + if (dataset_namecheck(snapname, NULL, NULL) != 0 || + strchr(snapname, '@') == NULL || + strchr(snapname, '%')) + return (SET_ERROR(EINVAL)); + + (void) strcpy(tofs, snapname); + tosnap = strchr(tofs, '@'); + *tosnap++ = '\0'; + + error = nvlist_lookup_string(innvl, "origin", &origin); + if (error && error != ENOENT) + return (error); + + error = nvlist_lookup_byte_array(innvl, "begin_record", + (uchar_t **) &begin_record, &begin_record_size); + if (error != 0 || begin_record_size != sizeof (*begin_record)) + return (SET_ERROR(EINVAL)); + + error = nvlist_lookup_int32(innvl, "input_fd", &input_fd); + if (error != 0) + return (SET_ERROR(EINVAL)); + + force = nvlist_exists(innvl, "force"); + resumable = nvlist_exists(innvl, "resumable"); + + error = nvlist_lookup_int32(innvl, "cleanup_fd", &cleanup_fd); + if (error && error != ENOENT) + return (error); + + error = nvlist_lookup_uint64(innvl, "action_handle", &action_handle); + if (error && error != ENOENT) + return (error); + + error = nvlist_lookup_nvlist(innvl, "props", &props); + if (error && error != ENOENT) + return (error); + + error = zfs_ioc_recv_impl(tofs, tosnap, origin, props, force, + resumable, input_fd, begin_record, cleanup_fd, &read_bytes, + &errflags, &action_handle, &errors); + + fnvlist_add_uint64(outnvl, "read_bytes", read_bytes); + fnvlist_add_uint64(outnvl, "error_flags", errflags); + fnvlist_add_uint64(outnvl, "action_handle", action_handle); + fnvlist_add_nvlist(outnvl, "errors", errors); + + nvlist_free(errors); + nvlist_free(props); + + return (error); +} + +/* + * inputs: * zc_name name of snapshot to send * zc_cookie file descriptor to send stream to * zc_obj fromorigin flag (mutually exclusive with zc_fromobj) @@ -5182,6 +5387,8 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t 
*outnvl) * indicates that blocks > 128KB are permitted * (optional) "embedok" -> (value ignored) * presence indicates DRR_WRITE_EMBEDDED records are permitted + * (optional) "resume_object" and "resume_offset" -> (uint64) + * if present, resume send stream from specified object and offset. * } * * outnvl is unused @@ -5197,6 +5404,8 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) file_t *fp; boolean_t largeblockok; boolean_t embedok; + uint64_t resumeobj = 0; + uint64_t resumeoff = 0; error = nvlist_lookup_int32(innvl, "fd", &fd); if (error != 0) @@ -5207,12 +5416,15 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) largeblockok = nvlist_exists(innvl, "largeblockok"); embedok = nvlist_exists(innvl, "embedok"); + (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj); + (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff); + if ((fp = getf(fd)) == NULL) return (SET_ERROR(EBADF)); off = fp->f_offset; - error = dmu_send(snapname, fromname, embedok, largeblockok, - fd, fp->f_vnode, &off); + error = dmu_send(snapname, fromname, embedok, largeblockok, fd, + resumeobj, resumeoff, fp->f_vnode, &off); if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) fp->f_offset = off; @@ -5470,6 +5682,10 @@ zfs_ioctl_init(void) POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + zfs_ioctl_register("receive", ZFS_IOC_RECV_NEW, + zfs_ioc_recv_new, zfs_secpolicy_recv_new, DATASET_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + /* IOCTLS that use the legacy function signature */ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c index a72841c15..ef04b203d 100644 --- a/module/zfs/zfs_vfsops.c +++ b/module/zfs/zfs_vfsops.c @@ -1020,7 +1020,7 @@ zfs_statvfs(struct dentry *dentry, struct kstatfs *statp) statp->f_fsid.val[0] = (uint32_t)fsid; statp->f_fsid.val[1] = (uint32_t)(fsid >> 32); statp->f_type = ZFS_SUPER_MAGIC; - statp->f_namelen = ZFS_MAXNAMELEN; + statp->f_namelen = MAXNAMELEN - 1; /* * We have all of 40 characters to stuff a string here. diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 988ffec29..863ccb930 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -2080,7 +2080,7 @@ typedef struct zil_replay_arg { static int zil_replay_error(zilog_t *zilog, lr_t *lr, int error) { - char name[MAXNAMELEN]; + char name[ZFS_MAX_DATASET_NAME_LEN]; zilog->zl_replaying_seq--; /* didn't actually replay this one */ diff --git a/module/zfs/zpl_inode.c b/module/zfs/zpl_inode.c index 089e3a1bc..8c75698e5 100644 --- a/module/zfs/zpl_inode.c +++ b/module/zfs/zpl_inode.c @@ -50,7 +50,7 @@ zpl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) int zfs_flags = 0; zfs_sb_t *zsb = dentry->d_sb->s_fs_info; - if (dlen(dentry) > ZFS_MAXNAMELEN) + if (dlen(dentry) > ZFS_MAX_DATASET_NAME_LEN) return (ERR_PTR(-ENAMETOOLONG)); crhold(cr); diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 9c89493ed..73277901f 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -61,7 +61,7 @@ unsigned long zvol_max_discard_blocks = 16384; static kmutex_t zvol_state_lock; static list_t zvol_state_list; -static char *zvol_tag = "zvol_tag"; +void *zvol_tag = "zvol_tag"; /* * The in-core state of each volume. |
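For reference, the input nvlist documented for zfs_ioc_recv_new() above could be assembled as below. This is a sketch using the libnvpair fnvlist helpers with only the required keys plus the resumable flag; recv_new_innvl() is a hypothetical name, and the includes assume a ZFS build environment:

#include <libnvpair.h>
#include <sys/zfs_ioctl.h>	/* dmu_replay_record_t */

static nvlist_t *
recv_new_innvl(const char *snapname, dmu_replay_record_t *drr,
    int input_fd, boolean_t resumable)
{
	nvlist_t *innvl = fnvlist_alloc();

	fnvlist_add_string(innvl, "snapname", snapname);
	fnvlist_add_byte_array(innvl, "begin_record",
	    (uchar_t *)drr, sizeof (*drr));
	fnvlist_add_int32(innvl, "input_fd", input_fd);
	if (resumable)
		fnvlist_add_boolean(innvl, "resumable");
	return (innvl);
}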