aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--module/zfs/bpobj.c216
1 files changed, 176 insertions, 40 deletions
diff --git a/module/zfs/bpobj.c b/module/zfs/bpobj.c
index 66ab84df5..94bd1fd8f 100644
--- a/module/zfs/bpobj.c
+++ b/module/zfs/bpobj.c
@@ -377,11 +377,98 @@ bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE));
}
+/*
+ * Logically add subobj's contents to the parent bpobj.
+ *
+ * In the most general case, this is accomplished in constant time by adding
+ * a reference to subobj. This case is used when enqueuing a large subobj:
+ * +--------------+ +--------------+
+ * | bpobj |----------------------->| subobj list |
+ * +----+----+----+----+----+ +-----+-----+--+--+
+ * | bp | bp | bp | bp | bp | | obj | obj | obj |
+ * +----+----+----+----+----+ +-----+-----+-----+
+ *
+ * +--------------+ +--------------+
+ * | sub-bpobj |----------------------> | subsubobj |
+ * +----+----+----+----+---------+----+ +-----+-----+--+--------+-----+
+ * | bp | bp | bp | bp | ... | bp | | obj | obj | ... | obj |
+ * +----+----+----+----+---------+----+ +-----+-----+-----------+-----+
+ *
+ * Result: sub-bpobj added to parent's subobj list.
+ * +--------------+ +--------------+
+ * | bpobj |----------------------->| subobj list |
+ * +----+----+----+----+----+ +-----+-----+--+--+-----+
+ * | bp | bp | bp | bp | bp | | obj | obj | obj | OBJ |
+ * +----+----+----+----+----+ +-----+-----+-----+--|--+
+ * |
+ * /-----------------------------------------------------/
+ * v
+ * +--------------+ +--------------+
+ * | sub-bpobj |----------------------> | subsubobj |
+ * +----+----+----+----+---------+----+ +-----+-----+--+--------+-----+
+ * | bp | bp | bp | bp | ... | bp | | obj | obj | ... | obj |
+ * +----+----+----+----+---------+----+ +-----+-----+-----------+-----+
+ *
+ *
+ * In a common case, the subobj is small: its bp's and its list of subobj's
+ * are each stored in a single block. In this case we copy the subobj's
+ * contents to the parent:
+ * +--------------+ +--------------+
+ * | bpobj |----------------------->| subobj list |
+ * +----+----+----+----+----+ +-----+-----+--+--+
+ * | bp | bp | bp | bp | bp | | obj | obj | obj |
+ * +----+----+----+----+----+ +-----+-----+-----+
+ * ^ ^
+ * +--------------+ | +--------------+ |
+ * | sub-bpobj |---------^------------> | subsubobj | ^
+ * +----+----+----+ | +-----+-----+--+ |
+ * | BP | BP |-->-->-->-->-/ | OBJ | OBJ |-->-/
+ * +----+----+ +-----+-----+
+ *
+ * Result: subobj destroyed, contents copied to parent:
+ * +--------------+ +--------------+
+ * | bpobj |----------------------->| subobj list |
+ * +----+----+----+----+----+----+----+ +-----+-----+--+--+-----+-----+
+ * | bp | bp | bp | bp | bp | BP | BP | | obj | obj | obj | OBJ | OBJ |
+ * +----+----+----+----+----+----+----+ +-----+-----+-----+-----+-----+
+ *
+ *
+ * If the subobj has many BP's but few subobj's, we can copy the sub-subobj's
+ * but retain the sub-bpobj:
+ * +--------------+ +--------------+
+ * | bpobj |----------------------->| subobj list |
+ * +----+----+----+----+----+ +-----+-----+--+--+
+ * | bp | bp | bp | bp | bp | | obj | obj | obj |
+ * +----+----+----+----+----+ +-----+-----+-----+
+ * ^
+ * +--------------+ +--------------+ |
+ * | sub-bpobj |----------------------> | subsubobj | ^
+ * +----+----+----+----+---------+----+ +-----+-----+--+ |
+ * | bp | bp | bp | bp | ... | bp | | OBJ | OBJ |-->-/
+ * +----+----+----+----+---------+----+ +-----+-----+
+ *
+ * Result: sub-sub-bpobjs and subobj added to parent's subobj list.
+ * +--------------+ +--------------+
+ * | bpobj |-------------------->| subobj list |
+ * +----+----+----+----+----+ +-----+-----+--+--+-----+-----+------+
+ * | bp | bp | bp | bp | bp | | obj | obj | obj | OBJ | OBJ | OBJ* |
+ * +----+----+----+----+----+ +-----+-----+-----+-----+-----+--|---+
+ * |
+ * /--------------------------------------------------------------/
+ * v
+ * +--------------+
+ * | sub-bpobj |
+ * +----+----+----+----+---------+----+
+ * | bp | bp | bp | bp | ... | bp |
+ * +----+----+----+----+---------+----+
+ */
void
bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
{
bpobj_t subbpo;
uint64_t used, comp, uncomp, subsubobjs;
+ boolean_t copy_subsub = B_TRUE;
+ boolean_t copy_bps = B_TRUE;
ASSERT(bpobj_is_open(bpo));
ASSERT(subobj != 0);
@@ -406,61 +493,110 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
mutex_enter(&bpo->bpo_lock);
dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
- if (bpo->bpo_phys->bpo_subobjs == 0) {
- bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os,
- DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
- DMU_OT_NONE, 0, tx);
- }
- ASSERTV(dmu_object_info_t doi);
- ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi));
- ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
+ dmu_object_info_t doi;
- dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
- bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
- sizeof (subobj), &subobj, tx);
- bpo->bpo_phys->bpo_num_subobjs++;
+ if (bpo->bpo_phys->bpo_subobjs != 0) {
+ ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
+ &doi));
+ ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
+ }
/*
* If subobj has only one block of subobjs, then move subobj's
- * subobjs to bpo's subobj list directly. This reduces
- * recursion in bpobj_iterate due to nested subobjs.
+ * subobjs to bpo's subobj list directly. This reduces recursion in
+ * bpobj_iterate due to nested subobjs.
*/
subsubobjs = subbpo.bpo_phys->bpo_subobjs;
if (subsubobjs != 0) {
- dmu_object_info_t doi;
-
- VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subsubobjs, &doi));
- if (doi.doi_max_offset == doi.doi_data_block_size) {
- dmu_buf_t *subdb;
- uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs;
-
- VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, subsubobjs,
- 0, FTAG, &subdb, 0));
- /*
- * Make sure that we are not asking dmu_write()
- * to write more data than we have in our buffer.
- */
- VERIFY3U(subdb->db_size, >=,
- numsubsub * sizeof (subobj));
- dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
- bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
- numsubsub * sizeof (subobj), subdb->db_data, tx);
- dmu_buf_rele(subdb, FTAG);
- bpo->bpo_phys->bpo_num_subobjs += numsubsub;
-
- dmu_buf_will_dirty(subbpo.bpo_dbuf, tx);
- subbpo.bpo_phys->bpo_subobjs = 0;
- VERIFY3U(0, ==, dmu_object_free(bpo->bpo_os,
- subsubobjs, tx));
+ VERIFY0(dmu_object_info(bpo->bpo_os, subsubobjs, &doi));
+ if (doi.doi_max_offset > doi.doi_data_block_size) {
+ copy_subsub = B_FALSE;
+ }
+ }
+
+ /*
+ * If, in addition to having only one block of subobj's, subobj has
+ * only one block of bp's, then move subobj's bp's to bpo's bp list
+ * directly. This reduces recursion in bpobj_iterate due to nested
+ * subobjs.
+ */
+ VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subobj, &doi));
+ if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub) {
+ copy_bps = B_FALSE;
+ }
+
+ if (copy_subsub && subsubobjs != 0) {
+ dmu_buf_t *subdb;
+ uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs;
+
+ VERIFY0(dmu_buf_hold(bpo->bpo_os, subsubobjs,
+ 0, FTAG, &subdb, 0));
+ /*
+ * Make sure that we are not asking dmu_write()
+ * to write more data than we have in our buffer.
+ */
+ VERIFY3U(subdb->db_size, >=,
+ numsubsub * sizeof (subobj));
+ if (bpo->bpo_phys->bpo_subobjs == 0) {
+ bpo->bpo_phys->bpo_subobjs =
+ dmu_object_alloc(bpo->bpo_os,
+ DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
+ DMU_OT_NONE, 0, tx);
}
+ dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
+ bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
+ numsubsub * sizeof (subobj), subdb->db_data, tx);
+ dmu_buf_rele(subdb, FTAG);
+ bpo->bpo_phys->bpo_num_subobjs += numsubsub;
+
+ dmu_buf_will_dirty(subbpo.bpo_dbuf, tx);
+ subbpo.bpo_phys->bpo_subobjs = 0;
+ VERIFY0(dmu_object_free(bpo->bpo_os, subsubobjs, tx));
}
+
+ if (copy_bps) {
+ dmu_buf_t *bps;
+ uint64_t numbps = subbpo.bpo_phys->bpo_num_blkptrs;
+
+ ASSERT(copy_subsub);
+ VERIFY0(dmu_buf_hold(bpo->bpo_os, subobj,
+ 0, FTAG, &bps, 0));
+
+ /*
+ * Make sure that we are not asking dmu_write()
+ * to write more data than we have in our buffer.
+ */
+ VERIFY3U(bps->db_size, >=, numbps * sizeof (blkptr_t));
+ dmu_write(bpo->bpo_os, bpo->bpo_object,
+ bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t),
+ numbps * sizeof (blkptr_t),
+ bps->db_data, tx);
+ dmu_buf_rele(bps, FTAG);
+ bpo->bpo_phys->bpo_num_blkptrs += numbps;
+
+ bpobj_close(&subbpo);
+ VERIFY0(dmu_object_free(bpo->bpo_os, subobj, tx));
+ } else {
+ bpobj_close(&subbpo);
+ if (bpo->bpo_phys->bpo_subobjs == 0) {
+ bpo->bpo_phys->bpo_subobjs =
+ dmu_object_alloc(bpo->bpo_os,
+ DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
+ DMU_OT_NONE, 0, tx);
+ }
+
+ dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
+ bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
+ sizeof (subobj), &subobj, tx);
+ bpo->bpo_phys->bpo_num_subobjs++;
+ }
+
bpo->bpo_phys->bpo_bytes += used;
bpo->bpo_phys->bpo_comp += comp;
bpo->bpo_phys->bpo_uncomp += uncomp;
mutex_exit(&bpo->bpo_lock);
- bpobj_close(&subbpo);
}
void