path: root/module/zfs
author    Tom Caputi <[email protected]>    2018-01-19 04:19:47 -0500
committer Brian Behlendorf <[email protected]>    2018-02-02 11:43:11 -0800
commit    047116ac76526d869e3f347afb5d81cc2b156fdf (patch)
tree      8307d677ee02803b9341d3a7536e27cc71910194 /module/zfs
parent    d53bd7f5244a1cd0009d2f90d3ec9df22352fbb3 (diff)
Raw sends must be able to decrease nlevels
Currently, when a raw zfs send file includes a DRR_OBJECT record that would
decrease the number of levels of an existing object, the object is reallocated
with dmu_object_reclaim() which creates the new dnode using the old object's
nlevels. For non-raw sends this doesn't really matter, but raw sends require
that nlevels on the receive side match that of the send side so that the
checksum-of-MAC tree can be properly maintained. This patch corrects the issue
by freeing the object completely before allocating it again in this case.

This patch also corrects several issues with dnode_hold_impl() and related
functions that prevented dnodes (particularly multi-slot dnodes) from being
reallocated properly due to the fact that existing dnodes were not being fully
cleaned up when they were freed.

This patch adds a test to make sure that zfs recv functions properly with
incremental streams containing dnodes of different sizes.

Reviewed by: Matthew Ahrens <[email protected]>
Reviewed-by: Jorgen Lundman <[email protected]>
Signed-off-by: Tom Caputi <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Closes #6821
Closes #6864
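Below is a minimal standalone sketch (not part of the patch) of the receive-side
decision described above, applied when the incoming object already exists: if a
raw stream would lower nlevels, or the dnode slot count changes, the object must
be freed completely and claimed anew rather than reclaimed in place. The structs
and the helper name are simplified stand-ins for the real DRR_OBJECT record and
dmu_object_info_t, and DNODE_SHIFT is assumed to be 9 (512-byte dnode slots) as
in ZFS.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DNODE_SHIFT 9   /* assumed 512-byte dnode slots, as in ZFS */

struct drr_object_rec {         /* stand-in for the DRR_OBJECT record */
        uint64_t drr_nlevels;
        uint64_t drr_dn_slots;
};

struct object_info {            /* stand-in for dmu_object_info_t */
        uint64_t doi_indirection;
        uint64_t doi_dnodesize;
};

/*
 * Returns true when the receiver must free the existing object and claim
 * a new one (DMU_NEW_OBJECT) instead of reclaiming it in place.
 */
static bool
must_free_and_reclaim(bool raw, const struct drr_object_rec *drro,
    const struct object_info *doi)
{
        return ((raw && drro->drr_nlevels < doi->doi_indirection) ||
            drro->drr_dn_slots != (doi->doi_dnodesize >> DNODE_SHIFT));
}

int
main(void)
{
        struct drr_object_rec drro = { .drr_nlevels = 1, .drr_dn_slots = 1 };
        struct object_info doi = { .doi_indirection = 2, .doi_dnodesize = 512 };

        /* Raw send decreasing nlevels from 2 to 1: must free and re-claim. */
        printf("free and reclaim: %d\n",
            must_free_and_reclaim(true, &drro, &doi));
        return (0);
}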
Diffstat (limited to 'module/zfs')
-rw-r--r--  module/zfs/dmu_object.c    1
-rw-r--r--  module/zfs/dmu_send.c     84
-rw-r--r--  module/zfs/dnode.c        84
-rw-r--r--  module/zfs/dnode_sync.c    2
4 files changed, 158 insertions, 13 deletions
diff --git a/module/zfs/dmu_object.c b/module/zfs/dmu_object.c
index e7412b750..f53da407f 100644
--- a/module/zfs/dmu_object.c
+++ b/module/zfs/dmu_object.c
@@ -275,7 +275,6 @@ dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
return (err);
}
-
int
dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c
index 63a4f98bf..2c2ed8fb3 100644
--- a/module/zfs/dmu_send.c
+++ b/module/zfs/dmu_send.c
@@ -2455,10 +2455,8 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
}
err = dmu_object_info(rwa->os, drro->drr_object, &doi);
-
- if (err != 0 && err != ENOENT)
+ if (err != 0 && err != ENOENT && err != EEXIST)
return (SET_ERROR(EINVAL));
- object = err == 0 ? drro->drr_object : DMU_NEW_OBJECT;
if (drro->drr_object > rwa->max_object)
rwa->max_object = drro->drr_object;
@@ -2476,6 +2474,8 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
int nblkptr = deduce_nblkptr(drro->drr_bonustype,
drro->drr_bonuslen);
+ object = drro->drr_object;
+
/* nblkptr will be bounded by the bonus size and type */
if (rwa->raw && nblkptr != drro->drr_nblkptr)
return (SET_ERROR(EINVAL));
@@ -2484,18 +2484,89 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
(drro->drr_blksz != doi.doi_data_block_size ||
nblkptr < doi.doi_nblkptr ||
indblksz != doi.doi_metadata_block_size ||
- drro->drr_nlevels < doi.doi_indirection)) {
+ drro->drr_nlevels < doi.doi_indirection ||
+ drro->drr_dn_slots != doi.doi_dnodesize >> DNODE_SHIFT)) {
err = dmu_free_long_range_raw(rwa->os,
drro->drr_object, 0, DMU_OBJECT_END);
if (err != 0)
return (SET_ERROR(EINVAL));
} else if (drro->drr_blksz != doi.doi_data_block_size ||
- nblkptr < doi.doi_nblkptr) {
+ nblkptr < doi.doi_nblkptr ||
+ drro->drr_dn_slots != doi.doi_dnodesize >> DNODE_SHIFT) {
err = dmu_free_long_range(rwa->os, drro->drr_object,
0, DMU_OBJECT_END);
if (err != 0)
return (SET_ERROR(EINVAL));
}
+
+ /*
+ * The dmu does not currently support decreasing nlevels
+ * on an object. For non-raw sends, this does not matter
+ * and the new object can just use the previous one's nlevels.
+ * For raw sends, however, the structure of the received dnode
+ * (including nlevels) must match that of the send side.
+ * Therefore, instead of using dmu_object_reclaim(), we must
+ * free the object completely and call dmu_object_claim_dnsize()
+ * instead.
+ */
+ if ((rwa->raw && drro->drr_nlevels < doi.doi_indirection) ||
+ drro->drr_dn_slots != doi.doi_dnodesize >> DNODE_SHIFT) {
+ if (rwa->raw) {
+ err = dmu_free_long_object_raw(rwa->os,
+ drro->drr_object);
+ } else {
+ err = dmu_free_long_object(rwa->os,
+ drro->drr_object);
+ }
+ if (err != 0)
+ return (SET_ERROR(EINVAL));
+
+ txg_wait_synced(dmu_objset_pool(rwa->os), 0);
+ object = DMU_NEW_OBJECT;
+ }
+ } else if (err == EEXIST) {
+ /*
+ * The object requested is currently an interior slot of a
+ * multi-slot dnode. This will be resolved when the next txg
+ * is synced out, since the send stream will have told us
+ * to free this slot when we freed the associated dnode
+ * earlier in the stream.
+ */
+ txg_wait_synced(dmu_objset_pool(rwa->os), 0);
+ object = drro->drr_object;
+ } else {
+ /* object is free and we are about to allocate a new one */
+ object = DMU_NEW_OBJECT;
+ }
+
+ /*
+ * If this is a multi-slot dnode there is a chance that this
+ * object will expand into a slot that is already used by
+ * another object from the previous snapshot. We must free
+ * these objects before we attempt to allocate the new dnode.
+ */
+ if (drro->drr_dn_slots > 1) {
+ for (uint64_t slot = drro->drr_object + 1;
+ slot < drro->drr_object + drro->drr_dn_slots;
+ slot++) {
+ dmu_object_info_t slot_doi;
+
+ err = dmu_object_info(rwa->os, slot, &slot_doi);
+ if (err == ENOENT || err == EEXIST)
+ continue;
+ else if (err != 0)
+ return (err);
+
+ if (rwa->raw)
+ err = dmu_free_long_object_raw(rwa->os, slot);
+ else
+ err = dmu_free_long_object(rwa->os, slot);
+
+ if (err != 0)
+ return (err);
+ }
+
+ txg_wait_synced(dmu_objset_pool(rwa->os), 0);
}
tx = dmu_tx_create(rwa->os);
@@ -2849,6 +2920,7 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
dmu_tx_abort(tx);
return (err);
}
+
if (rwa->raw) {
VERIFY0(dmu_object_dirty_raw(rwa->os, drrs->drr_object, tx));
dmu_buf_will_change_crypt_params(db_spill, tx);
@@ -3199,7 +3271,7 @@ receive_read_record(struct receive_arg *ra)
* See receive_read_prefetch for an explanation why we're
* storing this object in the ignore_obj_list.
*/
- if (err == ENOENT ||
+ if (err == ENOENT || err == EEXIST ||
(err == 0 && doi.doi_data_block_size != drro->drr_blksz)) {
objlist_insert(&ra->ignore_objlist, drro->drr_object);
err = 0;
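The receive_object() hunks above also sweep the interior slots of an incoming
multi-slot dnode and free any live object still occupying them from the previous
snapshot. A hypothetical standalone model of that sweep (a plain array replaces
the real dmu_object_info()/dmu_free_long_object() calls used by the patch):

#include <stdint.h>
#include <stdio.h>

#define NSLOTS 32

/* Simplified slot states standing in for what dmu_object_info() reports. */
enum slot_state { SLOT_FREE, SLOT_INTERIOR, SLOT_ALLOCATED };

/*
 * A dnode claimed at object number 'obj' with 'dn_slots' slots also owns
 * object numbers obj+1 .. obj+dn_slots-1.  Before claiming it, any slot in
 * that range still holding a live object must be freed; free and interior
 * slots are skipped, mirroring the ENOENT/EEXIST cases in the patch.
 */
static void
sweep_interior_slots(enum slot_state slots[NSLOTS], uint64_t obj,
    uint64_t dn_slots)
{
        for (uint64_t s = obj + 1; s < obj + dn_slots && s < NSLOTS; s++) {
                if (slots[s] == SLOT_ALLOCATED) {
                        printf("freeing object %llu\n",
                            (unsigned long long)s);
                        slots[s] = SLOT_FREE;
                }
        }
}

int
main(void)
{
        enum slot_state slots[NSLOTS] = { 0 };

        slots[5] = SLOT_ALLOCATED;              /* leftover from old snapshot */
        sweep_interior_slots(slots, 4, 4);      /* incoming 4-slot dnode at 4 */
        return (0);
}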
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index 544e736d8..b4c131e98 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -55,6 +55,7 @@ dnode_stats_t dnode_stats = {
{ "dnode_hold_free_overflow", KSTAT_DATA_UINT64 },
{ "dnode_hold_free_refcount", KSTAT_DATA_UINT64 },
{ "dnode_hold_free_txg", KSTAT_DATA_UINT64 },
+ { "dnode_free_interior_lock_retry", KSTAT_DATA_UINT64 },
{ "dnode_allocate", KSTAT_DATA_UINT64 },
{ "dnode_reallocate", KSTAT_DATA_UINT64 },
{ "dnode_buf_evict", KSTAT_DATA_UINT64 },
@@ -518,7 +519,8 @@ dnode_destroy(dnode_t *dn)
mutex_exit(&os->os_lock);
/* the dnode can no longer move, so we can release the handle */
- zrl_remove(&dn->dn_handle->dnh_zrlock);
+ if (!zrl_is_locked(&dn->dn_handle->dnh_zrlock))
+ zrl_remove(&dn->dn_handle->dnh_zrlock);
dn->dn_allocated_txg = 0;
dn->dn_free_txg = 0;
@@ -665,6 +667,8 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))));
dn_slots = dn_slots > 0 ? dn_slots : DNODE_MIN_SLOTS;
+
+ dnode_free_interior_slots(dn);
DNODE_STAT_BUMP(dnode_reallocate);
/* clean up any unreferenced dbufs */
@@ -1067,19 +1071,73 @@ dnode_set_slots(dnode_children_t *children, int idx, int slots, void *ptr)
}
static boolean_t
-dnode_check_slots(dnode_children_t *children, int idx, int slots, void *ptr)
+dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
{
ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
for (int i = idx; i < idx + slots; i++) {
dnode_handle_t *dnh = &children->dnc_children[i];
- if (dnh->dnh_dnode != ptr)
+ dnode_t *dn = dnh->dnh_dnode;
+
+ if (dn == DN_SLOT_FREE) {
+ continue;
+ } else if (DN_SLOT_IS_PTR(dn)) {
+ mutex_enter(&dn->dn_mtx);
+ dmu_object_type_t type = dn->dn_type;
+ mutex_exit(&dn->dn_mtx);
+
+ if (type != DMU_OT_NONE)
+ return (B_FALSE);
+
+ continue;
+ } else {
return (B_FALSE);
+ }
+
+ return (B_FALSE);
}
return (B_TRUE);
}
+static void
+dnode_reclaim_slots(dnode_children_t *children, int idx, int slots)
+{
+ ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+ for (int i = idx; i < idx + slots; i++) {
+ dnode_handle_t *dnh = &children->dnc_children[i];
+
+ ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
+
+ if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+ ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE);
+ dnode_destroy(dnh->dnh_dnode);
+ dnh->dnh_dnode = DN_SLOT_FREE;
+ }
+ }
+}
+
+void
+dnode_free_interior_slots(dnode_t *dn)
+{
+ dnode_children_t *children = dmu_buf_get_user(&dn->dn_dbuf->db);
+ int epb = dn->dn_dbuf->db.db_size >> DNODE_SHIFT;
+ int idx = (dn->dn_object & (epb - 1)) + 1;
+ int slots = dn->dn_num_slots - 1;
+
+ if (slots == 0)
+ return;
+
+ ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+ while (!dnode_slots_tryenter(children, idx, slots))
+ DNODE_STAT_BUMP(dnode_free_interior_lock_retry);
+
+ dnode_set_slots(children, idx, slots, DN_SLOT_FREE);
+ dnode_slots_rele(children, idx, slots);
+}
+
void
dnode_special_close(dnode_handle_t *dnh)
{
@@ -1377,7 +1435,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
while (dn == DN_SLOT_UNINIT) {
dnode_slots_hold(dnc, idx, slots);
- if (!dnode_check_slots(dnc, idx, slots, DN_SLOT_FREE)) {
+ if (!dnode_check_slots_free(dnc, idx, slots)) {
DNODE_STAT_BUMP(dnode_hold_free_misses);
dnode_slots_rele(dnc, idx, slots);
dbuf_rele(db, FTAG);
@@ -1390,15 +1448,29 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
continue;
}
- if (!dnode_check_slots(dnc, idx, slots, DN_SLOT_FREE)) {
+ if (!dnode_check_slots_free(dnc, idx, slots)) {
DNODE_STAT_BUMP(dnode_hold_free_lock_misses);
dnode_slots_rele(dnc, idx, slots);
dbuf_rele(db, FTAG);
return (SET_ERROR(ENOSPC));
}
+ /*
+ * Allocated but otherwise free dnodes which would
+ * be in the interior of a multi-slot dnodes need
+ * to be freed. Single slot dnodes can be safely
+ * re-purposed as a performance optimization.
+ */
+ if (slots > 1)
+ dnode_reclaim_slots(dnc, idx + 1, slots - 1);
+
dnh = &dnc->dnc_children[idx];
- dn = dnode_create(os, dn_block + idx, db, object, dnh);
+ if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+ dn = dnh->dnh_dnode;
+ } else {
+ dn = dnode_create(os, dn_block + idx, db,
+ object, dnh);
+ }
}
mutex_enter(&dn->dn_mtx);
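The dnode_check_slots_free() change above treats a slot as reusable either when
it is explicitly marked free or when it still holds an in-memory dnode whose
type is DMU_OT_NONE; anything else (such as an interior marker) makes the range
unusable. A hedged standalone model of that check, with a simplified slot
struct in place of dnode_handle_t and the DN_SLOT_* sentinels:

#include <stdbool.h>
#include <stddef.h>

/* Stand-in for DMU_OT_NONE versus an allocated object type. */
enum obj_type { OT_NONE, OT_PLAIN_FILE };

/* Simplified view of a dnode slot. */
struct slot {
        bool has_dnode;         /* models DN_SLOT_IS_PTR() */
        bool marked_free;       /* models DN_SLOT_FREE */
        enum obj_type type;     /* models dn->dn_type */
};

/*
 * Returns true only if every slot in [idx, idx + slots) can be handed out:
 * it is marked free, or it holds a dnode that exists in memory but whose
 * type is still "none" (allocated but otherwise free).
 */
static bool
check_slots_free(const struct slot *children, size_t idx, size_t slots)
{
        for (size_t i = idx; i < idx + slots; i++) {
                const struct slot *s = &children[i];

                if (s->marked_free)
                        continue;
                else if (s->has_dnode && s->type == OT_NONE)
                        continue;
                else
                        return (false);
        }
        return (true);
}

int
main(void)
{
        struct slot children[4] = {
                { .marked_free = true },
                { .has_dnode = true, .type = OT_NONE },
                { .has_dnode = true, .type = OT_PLAIN_FILE },
                { .marked_free = true },
        };

        /* Slots 0-1 are reusable; slot 2 still holds a live object. */
        return (check_slots_free(children, 0, 2) &&
            !check_slots_free(children, 0, 3) ? 0 : 1);
}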
diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c
index 09437993a..7d3850a5f 100644
--- a/module/zfs/dnode_sync.c
+++ b/module/zfs/dnode_sync.c
@@ -529,6 +529,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
if (dn->dn_allocated_txg != dn->dn_free_txg)
dmu_buf_will_dirty(&dn->dn_dbuf->db, tx);
bzero(dn->dn_phys, sizeof (dnode_phys_t) * dn->dn_num_slots);
+ dnode_free_interior_slots(dn);
mutex_enter(&dn->dn_mtx);
dn->dn_type = DMU_OT_NONE;
@@ -536,6 +537,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
dn->dn_allocated_txg = 0;
dn->dn_free_txg = 0;
dn->dn_have_spill = B_FALSE;
+ dn->dn_num_slots = 1;
mutex_exit(&dn->dn_mtx);
ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);