config/zfs-build.m4       |  25
configure.ac              |   1
etc/init.d/zfs.in         |   4
include/sys/dmu.h         |   5
include/sys/dmu_impl.h    |   2
include/sys/dmu_objset.h  |   3
include/sys/dmu_tx.h      |  23
include/sys/dnode.h       |   1
include/sys/dsl_dataset.h |   3
include/sys/dsl_dir.h     |   5
include/sys/spa.h         |   2
include/sys/trace_dmu.h   |  25
include/sys/zap_impl.h    |   2
module/zfs/dbuf.c         |  84
module/zfs/dmu_objset.c   |  17
module/zfs/dmu_tx.c       | 845
module/zfs/dnode.c        |  19
module/zfs/dsl_dataset.c  |  38
module/zfs/dsl_dir.c      |  40
module/zfs/spa_misc.c     |   3
module/zfs/zap.c          |  61
module/zfs/zap_micro.c    |  83
22 files changed, 232 insertions(+), 1059 deletions(-)
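In short, this diff drops the speculative free/overwrite/unref bookkeeping (dsl_dataset_block_freeable(), dmu_tx_count_free(), fzap_count_write(), zap_count_write_by_dnode(), and the DEBUG_DMU_TX build machinery) and keeps a single conservative write estimate per hold: each hold accumulates txh_space_towrite, dmu_tx_try_assign() inflates the total once via the renamed spa_get_worst_case_asize(), and a tx marked netfree skips quota enforcement. The standalone C sketch below models that flow; the constants and helper names are illustrative simplifications, not the kernel code itself.

#include <stdint.h>
#include <errno.h>

#define DMU_MAX_ACCESS	(64ULL << 20)		/* assumed 64MB cap */

static uint64_t spa_asize_inflation = 24;	/* worst-case multiplier */

/* models spa_get_worst_case_asize(): logical size -> worst-case asize */
static uint64_t
worst_case_asize(uint64_t lsize)
{
	return (lsize * spa_asize_inflation);
}

/*
 * models the new dmu_tx_count_write(): a hold simply accumulates the
 * bytes it may write; an absurdly large tx is rejected with EFBIG.
 */
static int
count_write(uint64_t *towrite, uint64_t len)
{
	*towrite += len;
	return (*towrite > 2 * DMU_MAX_ACCESS ? EFBIG : 0);
}

/*
 * models dmu_tx_try_assign()'s space check: the inflated estimate plus
 * in-flight reservations must fit within the allowed usage; a netfree
 * tx is assumed to fit (only insufficient pool slop space blocks it).
 */
static int
try_assign(uint64_t towrite, uint64_t inflight, uint64_t allowed,
    int netfree)
{
	if (netfree)
		return (0);
	if (worst_case_asize(towrite) + inflight > allowed)
		return (ERESTART);	/* caller waits for txg sync, retries */
	return (0);
}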
diff --git a/config/zfs-build.m4 b/config/zfs-build.m4 index 8b969da36..6c5f13240 100644 --- a/config/zfs-build.m4 +++ b/config/zfs-build.m4 @@ -37,29 +37,6 @@ AC_DEFUN([ZFS_AC_DEBUG], [ AC_MSG_RESULT([$enable_debug]) ]) -AC_DEFUN([ZFS_AC_DEBUG_DMU_TX], [ - AC_ARG_ENABLE([debug-dmu-tx], - [AS_HELP_STRING([--enable-debug-dmu-tx], - [Enable dmu tx validation @<:@default=no@:>@])], - [], - [enable_debug_dmu_tx=no]) - - AS_IF([test "x$enable_debug_dmu_tx" = xyes], - [ - KERNELCPPFLAGS="${KERNELCPPFLAGS} -DDEBUG_DMU_TX" - DEBUG_DMU_TX="_with_debug_dmu_tx" - AC_DEFINE([DEBUG_DMU_TX], [1], - [Define to 1 to enabled dmu tx validation]) - ], - [ - DEBUG_DMU_TX="_without_debug_dmu_tx" - ]) - - AC_SUBST(DEBUG_DMU_TX) - AC_MSG_CHECKING([whether dmu tx validation is enabled]) - AC_MSG_RESULT([$enable_debug_dmu_tx]) -]) - AC_DEFUN([ZFS_AC_CONFIG_ALWAYS], [ ZFS_AC_CONFIG_ALWAYS_NO_UNUSED_BUT_SET_VARIABLE ZFS_AC_CONFIG_ALWAYS_NO_BOOL_COMPARE @@ -140,7 +117,7 @@ AC_DEFUN([ZFS_AC_RPM], [ AC_MSG_RESULT([$HAVE_RPMBUILD]) ]) - RPM_DEFINE_COMMON='--define "$(DEBUG_ZFS) 1" --define "$(DEBUG_DMU_TX) 1"' + RPM_DEFINE_COMMON='--define "$(DEBUG_ZFS) 1"' RPM_DEFINE_UTIL='--define "_dracutdir $(dracutdir)" --define "_udevdir $(udevdir)" --define "_udevruledir $(udevruledir)" --define "_initconfdir $(DEFAULT_INITCONF_DIR)" $(DEFINE_INITRAMFS)' RPM_DEFINE_KMOD='--define "kernels $(LINUX_VERSION)" --define "require_spldir $(SPL)" --define "require_splobj $(SPL_OBJ)" --define "ksrc $(LINUX)" --define "kobj $(LINUX_OBJ)"' RPM_DEFINE_DKMS= diff --git a/configure.ac b/configure.ac index f603eb1cd..c86a2f63d 100644 --- a/configure.ac +++ b/configure.ac @@ -55,7 +55,6 @@ ZFS_AC_LICENSE ZFS_AC_PACKAGE ZFS_AC_CONFIG ZFS_AC_DEBUG -ZFS_AC_DEBUG_DMU_TX AC_CONFIG_FILES([ Makefile diff --git a/etc/init.d/zfs.in b/etc/init.d/zfs.in index d81ef22c8..7998569b2 100644 --- a/etc/init.d/zfs.in +++ b/etc/init.d/zfs.in @@ -91,10 +91,6 @@ MOUNT_EXTRA_OPTIONS="" # Only applicable for Debian GNU/Linux {dkms,initramfs}. ZFS_DKMS_ENABLE_DEBUG='no' -# Build kernel modules with the --enable-debug-dmu-tx switch? -# Only applicable for Debian GNU/Linux {dkms,initramfs}. -ZFS_DKMS_ENABLE_DEBUG_DMU_TX='no' - # Keep debugging symbols in kernel modules? # Only applicable for Debian GNU/Linux {dkms,initramfs}. ZFS_DKMS_DISABLE_STRIP='no' diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 9c8ca7c36..6459047e8 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -657,11 +657,6 @@ struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db); void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx); /* - * Tells if the given dbuf is freeable. - */ -boolean_t dmu_buf_freeable(dmu_buf_t *); - -/* * You must create a transaction, then hold the objects which you will * (or might) modify as part of this transaction. Then you must assign * the transaction to a transaction group. 
Once the transaction has diff --git a/include/sys/dmu_impl.h b/include/sys/dmu_impl.h index ae129b7cf..65e417e3f 100644 --- a/include/sys/dmu_impl.h +++ b/include/sys/dmu_impl.h @@ -86,7 +86,6 @@ extern "C" { * held from: * callers of dbuf_read_impl, dbuf_hold[_impl], dbuf_prefetch * dmu_object_info_from_dnode: dn_dirty_mtx (dn_datablksz) - * dmu_tx_count_free: * dbuf_read_impl: db_mtx, dmu_zfetch() * dmu_zfetch: zf_rwlock/r, zst_lock, dbuf_prefetch() * dbuf_new_size: db_mtx @@ -197,7 +196,6 @@ extern "C" { * dsl_prop_changed_notify: none (dd_prop_cbs) * dsl_prop_register: none (dd_prop_cbs) * dsl_prop_unregister: none (dd_prop_cbs) - * dsl_dataset_block_freeable: none (dd_sync_*) * * os_lock (leaf) * protects: diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h index 940785a53..f03f0779d 100644 --- a/include/sys/dmu_objset.h +++ b/include/sys/dmu_objset.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ @@ -196,6 +196,7 @@ boolean_t dmu_objset_userobjspace_present(objset_t *os); int dmu_fsname(const char *snapname, char *buf); void dmu_objset_evict_done(objset_t *os); +void dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx); void dmu_objset_init(void); void dmu_objset_fini(void); diff --git a/include/sys/dmu_tx.h b/include/sys/dmu_tx.h index 1ee513fdc..f16e1e858 100644 --- a/include/sys/dmu_tx.h +++ b/include/sys/dmu_tx.h @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. */ #ifndef _SYS_DMU_TX_H @@ -70,6 +70,9 @@ struct dmu_tx { /* has this transaction already been delayed? 
*/ boolean_t tx_waited; + /* transaction is marked as being a "net free" of space */ + boolean_t tx_netfree; + /* time this transaction was created */ hrtime_t tx_start; @@ -77,14 +80,6 @@ struct dmu_tx { boolean_t tx_wait_dirty; int tx_err; -#ifdef DEBUG_DMU_TX - uint64_t tx_space_towrite; - uint64_t tx_space_tofree; - uint64_t tx_space_tooverwrite; - uint64_t tx_space_tounref; - refcount_t tx_space_written; - refcount_t tx_space_freed; -#endif }; enum dmu_tx_hold_type { @@ -103,16 +98,10 @@ typedef struct dmu_tx_hold { list_node_t txh_node; struct dnode *txh_dnode; refcount_t txh_space_towrite; - refcount_t txh_space_tofree; - refcount_t txh_space_tooverwrite; - refcount_t txh_space_tounref; refcount_t txh_memory_tohold; - refcount_t txh_fudge; -#ifdef DEBUG_DMU_TX enum dmu_tx_hold_type txh_type; uint64_t txh_arg1; uint64_t txh_arg2; -#endif } dmu_tx_hold_t; typedef struct dmu_tx_callback { @@ -172,12 +161,10 @@ dmu_tx_t *dmu_tx_create_dd(dsl_dir_t *dd); int dmu_tx_is_syncing(dmu_tx_t *tx); int dmu_tx_private_ok(dmu_tx_t *tx); void dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn); -void dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta); void dmu_tx_dirty_buf(dmu_tx_t *tx, struct dmu_buf_impl *db); -int dmu_tx_holds(dmu_tx_t *tx, uint64_t object); void dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space); -#ifdef DEBUG_DMU_TX +#ifdef ZFS_DEBUG #define DMU_TX_DIRTY_BUF(tx, db) dmu_tx_dirty_buf(tx, db) #else #define DMU_TX_DIRTY_BUF(tx, db) diff --git a/include/sys/dnode.h b/include/sys/dnode.h index ebede2d06..a6a9ef822 100644 --- a/include/sys/dnode.h +++ b/include/sys/dnode.h @@ -344,7 +344,6 @@ void dnode_verify(dnode_t *dn); int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx); void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx); void dnode_diduse_space(dnode_t *dn, int64_t space); -void dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx); void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t); uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid); void dnode_init(void); diff --git a/include/sys/dsl_dataset.h b/include/sys/dsl_dataset.h index 9ca89dafa..f6499a760 100644 --- a/include/sys/dsl_dataset.h +++ b/include/sys/dsl_dataset.h @@ -286,9 +286,6 @@ void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx); int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, boolean_t async); -boolean_t dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp, - uint64_t blk_birth); -uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds); int dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value); diff --git a/include/sys/dsl_dir.h b/include/sys/dsl_dir.h index fb299684c..69b0b6a53 100644 --- a/include/sys/dsl_dir.h +++ b/include/sys/dsl_dir.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
*/ @@ -137,8 +137,7 @@ uint64_t dsl_dir_space_available(dsl_dir_t *dd, void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx); void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx); int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t mem, - uint64_t asize, uint64_t fsize, uint64_t usize, void **tr_cookiep, - dmu_tx_t *tx); + uint64_t asize, boolean_t netfree, void **tr_cookiep, dmu_tx_t *tx); void dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx); void dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx); void dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, diff --git a/include/sys/spa.h b/include/sys/spa.h index 58520118e..0f05d04ad 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -795,7 +795,7 @@ extern uint64_t spa_version(spa_t *spa); extern pool_state_t spa_state(spa_t *spa); extern spa_load_state_t spa_load_state(spa_t *spa); extern uint64_t spa_freeze_txg(spa_t *spa); -extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize); +extern uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize); extern uint64_t spa_get_dspace(spa_t *spa); extern uint64_t spa_get_slop_space(spa_t *spa); extern void spa_update_dspace(spa_t *spa); diff --git a/include/sys/trace_dmu.h b/include/sys/trace_dmu.h index b2f37a6be..5ae59e563 100644 --- a/include/sys/trace_dmu.h +++ b/include/sys/trace_dmu.h @@ -54,14 +54,6 @@ DECLARE_EVENT_CLASS(zfs_delay_mintime_class, __field(hrtime_t, tx_start) __field(boolean_t, tx_wait_dirty) __field(int, tx_err) -#ifdef DEBUG_DMU_TX - __field(uint64_t, tx_space_towrite) - __field(uint64_t, tx_space_tofree) - __field(uint64_t, tx_space_tooverwrite) - __field(uint64_t, tx_space_tounref) - __field(int64_t, tx_space_written) - __field(int64_t, tx_space_freed) -#endif __field(uint64_t, min_tx_time) __field(uint64_t, dirty) ), @@ -74,32 +66,15 @@ DECLARE_EVENT_CLASS(zfs_delay_mintime_class, __entry->tx_start = tx->tx_start; __entry->tx_wait_dirty = tx->tx_wait_dirty; __entry->tx_err = tx->tx_err; -#ifdef DEBUG_DMU_TX - __entry->tx_space_towrite = tx->tx_space_towrite; - __entry->tx_space_tofree = tx->tx_space_tofree; - __entry->tx_space_tooverwrite = tx->tx_space_tooverwrite; - __entry->tx_space_tounref = tx->tx_space_tounref; - __entry->tx_space_written = tx->tx_space_written.rc_count; - __entry->tx_space_freed = tx->tx_space_freed.rc_count; -#endif __entry->dirty = dirty; __entry->min_tx_time = min_tx_time; ), TP_printk("tx { txg %llu lastsnap_txg %llu tx_lasttried_txg %llu " "anyobj %d waited %d start %llu wait_dirty %d err %i " -#ifdef DEBUG_DMU_TX - "space_towrite %llu space_tofree %llu space_tooverwrite %llu " - "space_tounref %llu space_written %lli space_freed %lli " -#endif "} dirty %llu min_tx_time %llu", __entry->tx_txg, __entry->tx_lastsnap_txg, __entry->tx_lasttried_txg, __entry->tx_anyobj, __entry->tx_waited, __entry->tx_start, __entry->tx_wait_dirty, __entry->tx_err, -#ifdef DEBUG_DMU_TX - __entry->tx_space_towrite, __entry->tx_space_tofree, - __entry->tx_space_tooverwrite, __entry->tx_space_tounref, - __entry->tx_space_written, __entry->tx_space_freed, -#endif __entry->dirty, __entry->min_tx_time) ); /* END CSTYLED */ diff --git a/include/sys/zap_impl.h b/include/sys/zap_impl.h index fb0f1a012..250dde3ce 100644 --- a/include/sys/zap_impl.h +++ b/include/sys/zap_impl.h @@ -216,8 +216,6 @@ int fzap_lookup(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, void *buf, char *realname, int rn_len, boolean_t *normalization_conflictp); void fzap_prefetch(zap_name_t *zn); -int fzap_count_write(zap_name_t *zn, int add, refcount_t 
*towrite, - refcount_t *tooverwrite); int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, const void *val, void *tag, dmu_tx_t *tx); int fzap_update(zap_name_t *zn, diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 096f74a00..f11a11ff1 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -1432,41 +1432,6 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, mutex_exit(&dn->dn_dbufs_mtx); } -static int -dbuf_block_freeable(dmu_buf_impl_t *db) -{ - dsl_dataset_t *ds = db->db_objset->os_dsl_dataset; - uint64_t birth_txg = 0; - - /* - * We don't need any locking to protect db_blkptr: - * If it's syncing, then db_last_dirty will be set - * so we'll ignore db_blkptr. - * - * This logic ensures that only block births for - * filled blocks are considered. - */ - ASSERT(MUTEX_HELD(&db->db_mtx)); - if (db->db_last_dirty && (db->db_blkptr == NULL || - !BP_IS_HOLE(db->db_blkptr))) { - birth_txg = db->db_last_dirty->dr_txg; - } else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) { - birth_txg = db->db_blkptr->blk_birth; - } - - /* - * If this block don't exist or is in a snapshot, it can't be freed. - * Don't pass the bp to dsl_dataset_block_freeable() since we - * are holding the db_mtx lock and might deadlock if we are - * prefetching a dedup-ed block. - */ - if (birth_txg != 0) - return (ds == NULL || - dsl_dataset_block_freeable(ds, NULL, birth_txg)); - else - return (B_FALSE); -} - void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) { @@ -1516,7 +1481,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) } mutex_exit(&db->db_mtx); - dnode_willuse_space(dn, size-osize, tx); + dmu_objset_willuse_space(dn->dn_objset, size - osize, tx); DB_DNODE_EXIT(db); } @@ -1566,7 +1531,6 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) objset_t *os; dbuf_dirty_record_t **drp, *dr; int drop_struct_lock = FALSE; - boolean_t do_free_accounting = B_FALSE; int txgoff = tx->tx_txg & TXG_MASK; ASSERT(tx->tx_txg != 0); @@ -1688,15 +1652,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); if (db->db_blkid != DMU_BONUS_BLKID) { - /* - * Update the accounting. - * Note: we delay "free accounting" until after we drop - * the db_mtx. This keeps us from grabbing other locks - * (and possibly deadlocking) in bp_get_dsize() while - * also holding the db_mtx. - */ - dnode_willuse_space(dn, db->db.db_size, tx); - do_free_accounting = dbuf_block_freeable(db); + dmu_objset_willuse_space(os, db->db.db_size, tx); } /* @@ -1790,21 +1746,13 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) drop_struct_lock = TRUE; } - if (do_free_accounting) { - blkptr_t *bp = db->db_blkptr; - int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? - bp_get_dsize(os->os_spa, bp) : db->db.db_size; - /* - * This is only a guess -- if the dbuf is dirty - * in a previous txg, we don't know how much - * space it will use on disk yet. We should - * really have the struct_rwlock to access - * db_blkptr, but since this is just a guess, - * it's OK if we get an odd answer. - */ - ddt_prefetch(os->os_spa, bp); - dnode_willuse_space(dn, -willfree, tx); - } + /* + * If we are overwriting a dedup BP, then unless it is snapshotted, + * when we get to syncing context we will need to decrement its + * refcount in the DDT. Prefetch the relevant DDT block so that + * syncing context won't have to wait for the i/o. 
+ */ + ddt_prefetch(os->os_spa, db->db_blkptr); if (db->db_level == 0) { dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); @@ -3092,19 +3040,6 @@ dmu_buf_user_evict_wait() taskq_wait(dbu_evict_taskq); } -boolean_t -dmu_buf_freeable(dmu_buf_t *dbuf) -{ - boolean_t res = B_FALSE; - dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; - - if (db->db_blkptr) - res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset, - db->db_blkptr, db->db_blkptr->blk_birth); - - return (res); -} - blkptr_t * dmu_buf_get_blkptr(dmu_buf_t *db) { @@ -3891,7 +3826,6 @@ EXPORT_SYMBOL(dbuf_sync_list); EXPORT_SYMBOL(dmu_buf_set_user); EXPORT_SYMBOL(dmu_buf_set_user_ie); EXPORT_SYMBOL(dmu_buf_get_user); -EXPORT_SYMBOL(dmu_buf_freeable); EXPORT_SYMBOL(dmu_buf_get_blkptr); /* BEGIN CSTYLED */ diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index c83ca1b1a..21b68f9f7 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -2344,6 +2344,23 @@ dmu_fsname(const char *snapname, char *buf) return (0); } +/* + * Call when we think we're going to write/free space in open context to track + * the amount of dirty data in the open txg, which is also the amount + * of memory that can not be evicted until this txg syncs. + */ +void +dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = os->os_dsl_dataset; + int64_t aspace = spa_get_worst_case_asize(os->os_spa, space); + + if (ds != NULL) { + dsl_dir_willuse_space(ds->ds_dir, aspace, tx); + dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx); + } +} + #if defined(_KERNEL) && defined(HAVE_SPL) EXPORT_SYMBOL(dmu_objset_zil); EXPORT_SYMBOL(dmu_objset_pool); diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 4d4c74f51..ebab5a341 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -29,10 +29,10 @@ #include <sys/dbuf.h> #include <sys/dmu_tx.h> #include <sys/dmu_objset.h> -#include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */ -#include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */ +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> #include <sys/dsl_pool.h> -#include <sys/zap_impl.h> /* for fzap_default_block_shift */ +#include <sys/zap_impl.h> #include <sys/spa.h> #include <sys/sa.h> #include <sys/sa_impl.h> @@ -71,10 +71,6 @@ dmu_tx_create_dd(dsl_dir_t *dd) list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t), offsetof(dmu_tx_callback_t, dcb_node)); tx->tx_start = gethrtime(); -#ifdef DEBUG_DMU_TX - refcount_create(&tx->tx_space_written); - refcount_create(&tx->tx_space_freed); -#endif return (tx); } @@ -83,7 +79,6 @@ dmu_tx_create(objset_t *os) { dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir); tx->tx_objset = os; - tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os_dsl_dataset); return (tx); } @@ -138,16 +133,10 @@ dmu_tx_hold_dnode_impl(dmu_tx_t *tx, dnode_t *dn, enum dmu_tx_hold_type type, txh->txh_tx = tx; txh->txh_dnode = dn; refcount_create(&txh->txh_space_towrite); - refcount_create(&txh->txh_space_tofree); - refcount_create(&txh->txh_space_tooverwrite); - refcount_create(&txh->txh_space_tounref); refcount_create(&txh->txh_memory_tohold); - refcount_create(&txh->txh_fudge); -#ifdef DEBUG_DMU_TX txh->txh_type = type; txh->txh_arg1 = arg1; txh->txh_arg2 = arg2; -#endif list_insert_tail(&tx->tx_holds, txh); return (txh); @@ -185,6 +174,34 @@ dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn) (void) dmu_tx_hold_dnode_impl(tx, dn, THT_NEWOBJECT, 0, 0); } +/* + * This function reads specified data from disk. 
The specified data will + * be needed to perform the transaction -- i.e, it will be read after + * we do dmu_tx_assign(). There are two reasons that we read the data now + * (before dmu_tx_assign()): + * + * 1. Reading it now has potentially better performance. The transaction + * has not yet been assigned, so the TXG is not held open, and also the + * caller typically has less locks held when calling dmu_tx_hold_*() than + * after the transaction has been assigned. This reduces the lock (and txg) + * hold times, thus reducing lock contention. + * + * 2. It is easier for callers (primarily the ZPL) to handle i/o errors + * that are detected before they start making changes to the DMU state + * (i.e. now). Once the transaction has been assigned, and some DMU + * state has been changed, it can be difficult to recover from an i/o + * error (e.g. to undo the changes already made in memory at the DMU + * layer). Typically code to do so does not exist in the caller -- it + * assumes that the data has already been cached and thus i/o errors are + * not possible. + * + * It has been observed that the i/o initiated here can be a performance + * problem, and it appears to be optional, because we don't look at the + * data which is read. However, removing this read would only serve to + * move the work elsewhere (after the dmu_tx_assign()), where it may + * have a greater impact on performance (in addition to the impact on + * fault tolerance noted above). + */ static int dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) { @@ -201,260 +218,84 @@ dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) return (err); } -static void -dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db, - int level, uint64_t blkid, boolean_t freeable, uint64_t *history) -{ - objset_t *os = dn->dn_objset; - dsl_dataset_t *ds = os->os_dsl_dataset; - int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - dmu_buf_impl_t *parent = NULL; - blkptr_t *bp = NULL; - uint64_t space; - - if (level >= dn->dn_nlevels || history[level] == blkid) - return; - - history[level] = blkid; - - space = (level == 0) ? 
dn->dn_datablksz : (1ULL << dn->dn_indblkshift); - - if (db == NULL || db == dn->dn_dbuf) { - ASSERT(level != 0); - db = NULL; - } else { - ASSERT(DB_DNODE(db) == dn); - ASSERT(db->db_level == level); - ASSERT(db->db.db_size == space); - ASSERT(db->db_blkid == blkid); - bp = db->db_blkptr; - parent = db->db_parent; - } - - freeable = (bp && (freeable || - dsl_dataset_block_freeable(ds, bp, bp->blk_birth))); - - if (freeable) { - (void) refcount_add_many(&txh->txh_space_tooverwrite, - space, FTAG); - } else { - (void) refcount_add_many(&txh->txh_space_towrite, - space, FTAG); - } - - if (bp) { - (void) refcount_add_many(&txh->txh_space_tounref, - bp_get_dsize(os->os_spa, bp), FTAG); - } - - dmu_tx_count_twig(txh, dn, parent, level + 1, - blkid >> epbs, freeable, history); -} - /* ARGSUSED */ static void dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) { dnode_t *dn = txh->txh_dnode; - uint64_t start, end, i; - int min_bs, max_bs, min_ibs, max_ibs, epbs, bits; int err = 0; - int l; if (len == 0) return; - min_bs = SPA_MINBLOCKSHIFT; - max_bs = highbit64(txh->txh_tx->tx_objset->os_recordsize) - 1; - min_ibs = DN_MIN_INDBLKSHIFT; - max_ibs = DN_MAX_INDBLKSHIFT; + (void) refcount_add_many(&txh->txh_space_towrite, len, FTAG); - if (dn) { - uint64_t history[DN_MAX_LEVELS]; - int nlvls = dn->dn_nlevels; - int delta; - - /* - * For i/o error checking, read the first and last level-0 - * blocks (if they are not aligned), and all the level-1 blocks. - */ - if (dn->dn_maxblkid == 0) { - delta = dn->dn_datablksz; - start = (off < dn->dn_datablksz) ? 0 : 1; - end = (off+len <= dn->dn_datablksz) ? 0 : 1; - if (start == 0 && (off > 0 || len < dn->dn_datablksz)) { - err = dmu_tx_check_ioerr(NULL, dn, 0, 0); - if (err) - goto out; - delta -= off; - } - } else { - zio_t *zio = zio_root(dn->dn_objset->os_spa, - NULL, NULL, ZIO_FLAG_CANFAIL); - - /* first level-0 block */ - start = off >> dn->dn_datablkshift; - if (P2PHASE(off, dn->dn_datablksz) || - len < dn->dn_datablksz) { - err = dmu_tx_check_ioerr(zio, dn, 0, start); - if (err) - goto out; - } + if (refcount_count(&txh->txh_space_towrite) > 2 * DMU_MAX_ACCESS) + err = SET_ERROR(EFBIG); - /* last level-0 block */ - end = (off+len-1) >> dn->dn_datablkshift; - if (end != start && end <= dn->dn_maxblkid && - P2PHASE(off+len, dn->dn_datablksz)) { - err = dmu_tx_check_ioerr(zio, dn, 0, end); - if (err) - goto out; - } + if (dn == NULL) + return; - /* level-1 blocks */ - if (nlvls > 1) { - int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - for (i = (start>>shft)+1; i < end>>shft; i++) { - err = dmu_tx_check_ioerr(zio, dn, 1, i); - if (err) - goto out; - } + /* + * For i/o error checking, read the blocks that will be needed + * to perform the write: the first and last level-0 blocks (if + * they are not aligned, i.e. if they are partial-block writes), + * and all the level-1 blocks. + */ + if (dn->dn_maxblkid == 0) { + if (off < dn->dn_datablksz && + (off > 0 || len < dn->dn_datablksz)) { + err = dmu_tx_check_ioerr(NULL, dn, 0, 0); + if (err != 0) { + txh->txh_tx->tx_err = err; } - - err = zio_wait(zio); - if (err) - goto out; - delta = P2NPHASE(off, dn->dn_datablksz); - } - - min_ibs = max_ibs = dn->dn_indblkshift; - if (dn->dn_maxblkid > 0) { - /* - * The blocksize can't change, - * so we can make a more precise estimate. 
- */ - ASSERT(dn->dn_datablkshift != 0); - min_bs = max_bs = dn->dn_datablkshift; - } else { - /* - * The blocksize can increase up to the recordsize, - * or if it is already more than the recordsize, - * up to the next power of 2. - */ - min_bs = highbit64(dn->dn_datablksz - 1); - max_bs = MAX(max_bs, highbit64(dn->dn_datablksz - 1)); } + } else { + zio_t *zio = zio_root(dn->dn_objset->os_spa, + NULL, NULL, ZIO_FLAG_CANFAIL); - /* - * If this write is not off the end of the file - * we need to account for overwrites/unref. - */ - if (start <= dn->dn_maxblkid) { - for (l = 0; l < DN_MAX_LEVELS; l++) - history[l] = -1ULL; + /* first level-0 block */ + uint64_t start = off >> dn->dn_datablkshift; + if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) { + err = dmu_tx_check_ioerr(zio, dn, 0, start); + if (err != 0) { + txh->txh_tx->tx_err = err; + } } - while (start <= dn->dn_maxblkid) { - dmu_buf_impl_t *db; - - rw_enter(&dn->dn_struct_rwlock, RW_READER); - err = dbuf_hold_impl(dn, 0, start, - FALSE, FALSE, FTAG, &db); - rw_exit(&dn->dn_struct_rwlock); - if (err) { + /* last level-0 block */ + uint64_t end = (off + len - 1) >> dn->dn_datablkshift; + if (end != start && end <= dn->dn_maxblkid && + P2PHASE(off + len, dn->dn_datablksz)) { + err = dmu_tx_check_ioerr(zio, dn, 0, end); + if (err != 0) { txh->txh_tx->tx_err = err; - return; } + } - dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE, - history); - dbuf_rele(db, FTAG); - if (++start > end) { - /* - * Account for new indirects appearing - * before this IO gets assigned into a txg. - */ - bits = 64 - min_bs; - epbs = min_ibs - SPA_BLKPTRSHIFT; - for (bits -= epbs * (nlvls - 1); - bits >= 0; bits -= epbs) { - (void) refcount_add_many( - &txh->txh_fudge, - 1ULL << max_ibs, FTAG); - } - goto out; + /* level-1 blocks */ + if (dn->dn_nlevels > 1) { + int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + for (uint64_t i = (start >> shft) + 1; + i < end >> shft; i++) { + err = dmu_tx_check_ioerr(zio, dn, 1, i); + if (err != 0) { + txh->txh_tx->tx_err = err; + } } - off += delta; - if (len >= delta) - len -= delta; - delta = dn->dn_datablksz; } - } - - /* - * 'end' is the last thing we will access, not one past. - * This way we won't overflow when accessing the last byte. - */ - start = P2ALIGN(off, 1ULL << max_bs); - end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1; - (void) refcount_add_many(&txh->txh_space_towrite, - end - start + 1, FTAG); - - start >>= min_bs; - end >>= min_bs; - epbs = min_ibs - SPA_BLKPTRSHIFT; - - /* - * The object contains at most 2^(64 - min_bs) blocks, - * and each indirect level maps 2^epbs. - */ - for (bits = 64 - min_bs; bits >= 0; bits -= epbs) { - start >>= epbs; - end >>= epbs; - ASSERT3U(end, >=, start); - (void) refcount_add_many(&txh->txh_space_towrite, - (end - start + 1) << max_ibs, FTAG); - if (start != 0) { - /* - * We also need a new blkid=0 indirect block - * to reference any existing file data. 
- */ - (void) refcount_add_many(&txh->txh_space_towrite, - 1ULL << max_ibs, FTAG); + err = zio_wait(zio); + if (err != 0) { + txh->txh_tx->tx_err = err; } } - -out: - if (refcount_count(&txh->txh_space_towrite) + - refcount_count(&txh->txh_space_tooverwrite) > - 2 * DMU_MAX_ACCESS) - err = SET_ERROR(EFBIG); - - if (err) - txh->txh_tx->tx_err = err; } static void dmu_tx_count_dnode(dmu_tx_hold_t *txh) { - dnode_t *dn = txh->txh_dnode; - dnode_t *mdn = DMU_META_DNODE(txh->txh_tx->tx_objset); - uint64_t space = mdn->dn_datablksz + - ((uint64_t)(mdn->dn_nlevels-1) << mdn->dn_indblkshift); - - if (dn && dn->dn_dbuf->db_blkptr && - dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, - dn->dn_dbuf->db_blkptr, dn->dn_dbuf->db_blkptr->blk_birth)) { - (void) refcount_add_many(&txh->txh_space_tooverwrite, - space, FTAG); - (void) refcount_add_many(&txh->txh_space_tounref, space, FTAG); - } else { - (void) refcount_add_many(&txh->txh_space_towrite, space, FTAG); - if (dn && dn->dn_dbuf->db_blkptr) { - (void) refcount_add_many(&txh->txh_space_tounref, - space, FTAG); - } - } + (void) refcount_add_many(&txh->txh_space_towrite, DNODE_MIN_SIZE, FTAG); } void @@ -490,179 +331,6 @@ dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) } } -static void -dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) -{ - uint64_t blkid, nblks, lastblk; - uint64_t space = 0, unref = 0, skipped = 0; - dnode_t *dn = txh->txh_dnode; - dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; - spa_t *spa = txh->txh_tx->tx_pool->dp_spa; - int epbs; - uint64_t l0span = 0, nl1blks = 0; - - if (dn->dn_nlevels == 0) - return; - - /* - * The struct_rwlock protects us against dn_nlevels - * changing, in case (against all odds) we manage to dirty & - * sync out the changes after we check for being dirty. - * Also, dbuf_hold_impl() wants us to have the struct_rwlock. 
- */ - rw_enter(&dn->dn_struct_rwlock, RW_READER); - epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - if (dn->dn_maxblkid == 0) { - if (off == 0 && len >= dn->dn_datablksz) { - blkid = 0; - nblks = 1; - } else { - rw_exit(&dn->dn_struct_rwlock); - return; - } - } else { - blkid = off >> dn->dn_datablkshift; - nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift; - - if (blkid > dn->dn_maxblkid) { - rw_exit(&dn->dn_struct_rwlock); - return; - } - if (blkid + nblks > dn->dn_maxblkid) - nblks = dn->dn_maxblkid - blkid + 1; - - } - l0span = nblks; /* save for later use to calc level > 1 overhead */ - if (dn->dn_nlevels == 1) { - int i; - for (i = 0; i < nblks; i++) { - blkptr_t *bp = dn->dn_phys->dn_blkptr; - ASSERT3U(blkid + i, <, dn->dn_nblkptr); - bp += blkid + i; - if (dsl_dataset_block_freeable(ds, bp, bp->blk_birth)) { - dprintf_bp(bp, "can free old%s", ""); - space += bp_get_dsize(spa, bp); - } - unref += BP_GET_ASIZE(bp); - } - nl1blks = 1; - nblks = 0; - } - - lastblk = blkid + nblks - 1; - while (nblks) { - dmu_buf_impl_t *dbuf; - uint64_t ibyte, new_blkid; - int epb = 1 << epbs; - int err, i, blkoff, tochk; - blkptr_t *bp; - - ibyte = blkid << dn->dn_datablkshift; - err = dnode_next_offset(dn, - DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0); - new_blkid = ibyte >> dn->dn_datablkshift; - if (err == ESRCH) { - skipped += (lastblk >> epbs) - (blkid >> epbs) + 1; - break; - } - if (err) { - txh->txh_tx->tx_err = err; - break; - } - if (new_blkid > lastblk) { - skipped += (lastblk >> epbs) - (blkid >> epbs) + 1; - break; - } - - if (new_blkid > blkid) { - ASSERT((new_blkid >> epbs) > (blkid >> epbs)); - skipped += (new_blkid >> epbs) - (blkid >> epbs) - 1; - nblks -= new_blkid - blkid; - blkid = new_blkid; - } - blkoff = P2PHASE(blkid, epb); - tochk = MIN(epb - blkoff, nblks); - - err = dbuf_hold_impl(dn, 1, blkid >> epbs, - FALSE, FALSE, FTAG, &dbuf); - if (err) { - txh->txh_tx->tx_err = err; - break; - } - - (void) refcount_add_many(&txh->txh_memory_tohold, - dbuf->db.db_size, FTAG); - - /* - * We don't check memory_tohold against DMU_MAX_ACCESS because - * memory_tohold is an over-estimation (especially the >L1 - * indirect blocks), so it could fail. Callers should have - * already verified that they will not be holding too much - * memory. - */ - - err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL); - if (err != 0) { - txh->txh_tx->tx_err = err; - dbuf_rele(dbuf, FTAG); - break; - } - - bp = dbuf->db.db_data; - bp += blkoff; - - for (i = 0; i < tochk; i++) { - if (dsl_dataset_block_freeable(ds, &bp[i], - bp[i].blk_birth)) { - dprintf_bp(&bp[i], "can free old%s", ""); - space += bp_get_dsize(spa, &bp[i]); - } - unref += BP_GET_ASIZE(bp); - } - dbuf_rele(dbuf, FTAG); - - ++nl1blks; - blkid += tochk; - nblks -= tochk; - } - rw_exit(&dn->dn_struct_rwlock); - - /* - * Add in memory requirements of higher-level indirects. - * This assumes a worst-possible scenario for dn_nlevels and a - * worst-possible distribution of l1-blocks over the region to free. - */ - { - uint64_t blkcnt = 1 + ((l0span >> epbs) >> epbs); - int level = 2; - /* - * Here we don't use DN_MAX_LEVEL, but calculate it with the - * given datablkshift and indblkshift. This makes the - * difference between 19 and 8 on large files. 
- */ - int maxlevel = 2 + (DN_MAX_OFFSET_SHIFT - dn->dn_datablkshift) / - (dn->dn_indblkshift - SPA_BLKPTRSHIFT); - - while (level++ < maxlevel) { - (void) refcount_add_many(&txh->txh_memory_tohold, - MAX(MIN(blkcnt, nl1blks), 1) << dn->dn_indblkshift, - FTAG); - blkcnt = 1 + (blkcnt >> epbs); - } - } - - /* account for new level 1 indirect blocks that might show up */ - if (skipped > 0) { - (void) refcount_add_many(&txh->txh_fudge, - skipped << dn->dn_indblkshift, FTAG); - skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs); - (void) refcount_add_many(&txh->txh_memory_tohold, - skipped << dn->dn_indblkshift, FTAG); - } - (void) refcount_add_many(&txh->txh_space_tofree, space, FTAG); - (void) refcount_add_many(&txh->txh_space_tounref, unref, FTAG); -} - /* * This function marks the transaction as being a "net free". The end * result is that refquotas will be disabled for this transaction, and @@ -674,42 +342,24 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) void dmu_tx_mark_netfree(dmu_tx_t *tx) { - dmu_tx_hold_t *txh; - - txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, - DMU_NEW_OBJECT, THT_FREE, 0, 0); - - /* - * Pretend that this operation will free 1GB of space. This - * should be large enough to cancel out the largest write. - * We don't want to use something like UINT64_MAX, because that would - * cause overflows when doing math with these values (e.g. in - * dmu_tx_try_assign()). - */ - (void) refcount_add_many(&txh->txh_space_tofree, - 1024 * 1024 * 1024, FTAG); - (void) refcount_add_many(&txh->txh_space_tounref, - 1024 * 1024 * 1024, FTAG); + tx->tx_netfree = B_TRUE; } static void dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) { - dmu_tx_t *tx; - dnode_t *dn; + dmu_tx_t *tx = txh->txh_tx; + dnode_t *dn = txh->txh_dnode; int err; - zio_t *zio; - tx = txh->txh_tx; ASSERT(tx->tx_txg == 0); - dn = txh->txh_dnode; dmu_tx_count_dnode(txh); - if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz) + if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz) return; if (len == DMU_OBJECT_END) - len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off; + len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off; dmu_tx_count_dnode(txh); @@ -731,7 +381,7 @@ dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) dmu_tx_count_write(txh, off, 1); /* last block will be modified if it is not aligned */ if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift)) - dmu_tx_count_write(txh, off+len, 1); + dmu_tx_count_write(txh, off + len, 1); } /* @@ -754,7 +404,7 @@ dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) if (dn->dn_datablkshift == 0) start = end = 0; - zio = zio_root(tx->tx_pool->dp_spa, + zio_t *zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL, ZIO_FLAG_CANFAIL); for (i = start; i <= end; i++) { uint64_t ibyte = i << shift; @@ -762,25 +412,28 @@ dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) i = ibyte >> shift; if (err == ESRCH || i > end) break; - if (err) { + if (err != 0) { tx->tx_err = err; + (void) zio_wait(zio); return; } + (void) refcount_add_many(&txh->txh_memory_tohold, + 1 << dn->dn_indblkshift, FTAG); + err = dmu_tx_check_ioerr(zio, dn, 1, i); - if (err) { + if (err != 0) { tx->tx_err = err; + (void) zio_wait(zio); return; } } err = zio_wait(zio); - if (err) { + if (err != 0) { tx->tx_err = err; return; } } - - dmu_tx_count_free(txh, off, len); } void @@ -808,101 +461,48 @@ static void dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, int add, const char *name) { dmu_tx_t *tx = txh->txh_tx; - dnode_t *dn; + 
dnode_t *dn = txh->txh_dnode; int err; - int epbs; - dsl_dataset_phys_t *ds_phys; - int lvl; ASSERT(tx->tx_txg == 0); - dn = txh->txh_dnode; - dmu_tx_count_dnode(txh); - if (dn == NULL) { - /* - * We will be able to fit a new object's entries into one leaf - * block. So there will be at most 2 blocks total, - * including the header block. - */ - dmu_tx_count_write(txh, 0, 2ULL << fzap_default_block_shift); + /* + * Modifying a almost-full microzap is around the worst case (128KB) + * + * If it is a fat zap, the worst case would be 7*16KB=112KB: + * - 3 blocks overwritten: target leaf, ptrtbl block, header block + * - 4 new blocks written if adding: + * - 2 blocks for possibly split leaves, + * - 2 grown ptrtbl blocks + */ + (void) refcount_add_many(&txh->txh_space_towrite, + MZAP_MAX_BLKSZ, FTAG); + + if (dn == NULL) return; - } ASSERT3U(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP); - if (dn->dn_maxblkid == 0 && !add) { - blkptr_t *bp; - + if (dn->dn_maxblkid == 0 || name == NULL) { /* - * If there is only one block (i.e. this is a micro-zap) - * and we are not adding anything, the accounting is simple. + * This is a microzap (only one block), or we don't know + * the name. Check the first block for i/o errors. */ err = dmu_tx_check_ioerr(NULL, dn, 0, 0); - if (err) { + if (err != 0) { tx->tx_err = err; - return; - } - - /* - * Use max block size here, since we don't know how much - * the size will change between now and the dbuf dirty call. - */ - bp = &dn->dn_phys->dn_blkptr[0]; - if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, - bp, bp->blk_birth)) { - (void) refcount_add_many(&txh->txh_space_tooverwrite, - MZAP_MAX_BLKSZ, FTAG); - } else { - (void) refcount_add_many(&txh->txh_space_towrite, - MZAP_MAX_BLKSZ, FTAG); } - if (!BP_IS_HOLE(bp)) { - (void) refcount_add_many(&txh->txh_space_tounref, - MZAP_MAX_BLKSZ, FTAG); - } - return; - } - - if (dn->dn_maxblkid > 0 && name) { + } else { /* - * access the name in this fat-zap so that we'll check - * for i/o errors to the leaf blocks, etc. + * Access the name so that we'll check for i/o errors to + * the leaf blocks, etc. We ignore ENOENT, as this name + * may not yet exist. */ err = zap_lookup_by_dnode(dn, name, 8, 0, NULL); - if (err == EIO) { + if (err == EIO || err == ECKSUM || err == ENXIO) { tx->tx_err = err; - return; - } - } - - err = zap_count_write_by_dnode(dn, name, add, - &txh->txh_space_towrite, &txh->txh_space_tooverwrite); - - /* - * If the modified blocks are scattered to the four winds, - * we'll have to modify an indirect twig for each. 
We can make - * modifications at up to 3 locations: - * - header block at the beginning of the object - * - target leaf block - * - end of the object, where we might need to write: - * - a new leaf block if the target block needs to be split - * - the new pointer table, if it is growing - * - the new cookie table, if it is growing - */ - epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - ds_phys = - dsl_dataset_phys(dn->dn_objset->os_dsl_dataset); - for (lvl = 1; lvl < dn->dn_nlevels; lvl++) { - uint64_t num_indirects = 1 + (dn->dn_maxblkid >> (epbs * lvl)); - uint64_t spc = MIN(3, num_indirects) << dn->dn_indblkshift; - if (ds_phys->ds_prev_snap_obj != 0) { - (void) refcount_add_many(&txh->txh_space_towrite, - spc, FTAG); - } else { - (void) refcount_add_many(&txh->txh_space_tooverwrite, - spc, FTAG); } } } @@ -971,43 +571,15 @@ dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space) (void) refcount_add_many(&txh->txh_space_towrite, space, FTAG); } -int -dmu_tx_holds(dmu_tx_t *tx, uint64_t object) -{ - dmu_tx_hold_t *txh; - int holds = 0; - - /* - * By asserting that the tx is assigned, we're counting the - * number of dn_tx_holds, which is the same as the number of - * dn_holds. Otherwise, we'd be counting dn_holds, but - * dn_tx_holds could be 0. - */ - ASSERT(tx->tx_txg != 0); - - /* if (tx->tx_anyobj == TRUE) */ - /* return (0); */ - - for (txh = list_head(&tx->tx_holds); txh; - txh = list_next(&tx->tx_holds, txh)) { - if (txh->txh_dnode && txh->txh_dnode->dn_object == object) - holds++; - } - - return (holds); -} - -#ifdef DEBUG_DMU_TX +#ifdef ZFS_DEBUG void dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) { - dmu_tx_hold_t *txh; - int match_object = FALSE, match_offset = FALSE; - dnode_t *dn; + boolean_t match_object = B_FALSE; + boolean_t match_offset = B_FALSE; DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - ASSERT(dn != NULL); + dnode_t *dn = DB_DNODE(db); ASSERT(tx->tx_txg != 0); ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset); ASSERT3U(dn->dn_object, ==, db->db.db_object); @@ -1023,7 +595,7 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) return; } - for (txh = list_head(&tx->tx_holds); txh; + for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL; txh = list_next(&tx->tx_holds, txh)) { ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT) @@ -1242,13 +814,49 @@ dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) zfs_sleep_until(wakeup); } +/* + * This routine attempts to assign the transaction to a transaction group. + * To do so, we must determine if there is sufficient free space on disk. + * + * If this is a "netfree" transaction (i.e. we called dmu_tx_mark_netfree() + * on it), then it is assumed that there is sufficient free space, + * unless there's insufficient slop space in the pool (see the comment + * above spa_slop_shift in spa_misc.c). + * + * If it is not a "netfree" transaction, then if the data already on disk + * is over the allowed usage (e.g. quota), this will fail with EDQUOT or + * ENOSPC. Otherwise, if the current rough estimate of pending changes, + * plus the rough estimate of this transaction's changes, may exceed the + * allowed usage, then this will fail with ERESTART, which will cause the + * caller to wait for the pending changes to be written to disk (by waiting + * for the next TXG to open), and then check the space usage again. 
+ * + * The rough estimate of pending changes is comprised of the sum of: + * + * - this transaction's holds' txh_space_towrite + * + * - dd_tempreserved[], which is the sum of in-flight transactions' + * holds' txh_space_towrite (i.e. those transactions that have called + * dmu_tx_assign() but not yet called dmu_tx_commit()). + * + * - dd_space_towrite[], which is the amount of dirtied dbufs. + * + * Note that all of these values are inflated by spa_get_worst_case_asize(), + * which means that we may get ERESTART well before we are actually in danger + * of running out of space, but this also mitigates any small inaccuracies + * in the rough estimate (e.g. txh_space_towrite doesn't take into account + * indirect blocks, and dd_space_towrite[] doesn't take into account changes + * to the MOS). + * + * Note that due to this algorithm, it is possible to exceed the allowed + * usage by one transaction. Also, as we approach the allowed usage, + * we will allow a very limited amount of changes into each TXG, thus + * decreasing performance. + */ static int dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how) { - dmu_tx_hold_t *txh; spa_t *spa = tx->tx_pool->dp_spa; - uint64_t memory, asize, fsize, usize; - uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge; ASSERT0(tx->tx_txg); @@ -1292,8 +900,9 @@ dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how) * dmu_tx_unassign() logic. */ - towrite = tofree = tooverwrite = tounref = tohold = fudge = 0; - for (txh = list_head(&tx->tx_holds); txh; + uint64_t towrite = 0; + uint64_t tohold = 0; + for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL; txh = list_next(&tx->tx_holds, txh)) { dnode_t *dn = txh->txh_dnode; if (dn != NULL) { @@ -1311,50 +920,18 @@ dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how) mutex_exit(&dn->dn_mtx); } towrite += refcount_count(&txh->txh_space_towrite); - tofree += refcount_count(&txh->txh_space_tofree); - tooverwrite += refcount_count(&txh->txh_space_tooverwrite); - tounref += refcount_count(&txh->txh_space_tounref); tohold += refcount_count(&txh->txh_memory_tohold); - fudge += refcount_count(&txh->txh_fudge); - } - - /* - * If a snapshot has been taken since we made our estimates, - * assume that we won't be able to free or overwrite anything. - */ - if (tx->tx_objset && - dsl_dataset_prev_snap_txg(tx->tx_objset->os_dsl_dataset) > - tx->tx_lastsnap_txg) { - towrite += tooverwrite; - tooverwrite = tofree = 0; } /* needed allocation: worst-case estimate of write space */ - asize = spa_get_asize(tx->tx_pool->dp_spa, towrite + tooverwrite); - /* freed space estimate: worst-case overwrite + free estimate */ - fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree; - /* convert unrefd space to worst-case estimate */ - usize = spa_get_asize(tx->tx_pool->dp_spa, tounref); + uint64_t asize = spa_get_worst_case_asize(tx->tx_pool->dp_spa, towrite); /* calculate memory footprint estimate */ - memory = towrite + tooverwrite + tohold; + uint64_t memory = towrite + tohold; -#ifdef DEBUG_DMU_TX - /* - * Add in 'tohold' to account for our dirty holds on this memory - * XXX - the "fudge" factor is to account for skipped blocks that - * we missed because dnode_next_offset() misses in-core-only blocks. 
- */ - tx->tx_space_towrite = asize + - spa_get_asize(tx->tx_pool->dp_spa, tohold + fudge); - tx->tx_space_tofree = tofree; - tx->tx_space_tooverwrite = tooverwrite; - tx->tx_space_tounref = tounref; -#endif - - if (tx->tx_dir && asize != 0) { + if (tx->tx_dir != NULL && asize != 0) { int err = dsl_dir_tempreserve_space(tx->tx_dir, memory, - asize, fsize, usize, &tx->tx_tempreserve_cookie, tx); - if (err) + asize, tx->tx_netfree, &tx->tx_tempreserve_cookie, tx); + if (err != 0) return (err); } @@ -1366,8 +943,6 @@ dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how) static void dmu_tx_unassign(dmu_tx_t *tx) { - dmu_tx_hold_t *txh; - if (tx->tx_txg == 0) return; @@ -1377,7 +952,7 @@ dmu_tx_unassign(dmu_tx_t *tx) * Walk the transaction's hold list, removing the hold on the * associated dnode, and notifying waiters if the refcount drops to 0. */ - for (txh = list_head(&tx->tx_holds); + for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh && txh != tx->tx_needassign_txh; txh = list_next(&tx->tx_holds, txh)) { dnode_t *dn = txh->txh_dnode; @@ -1513,23 +1088,6 @@ dmu_tx_wait(dmu_tx_t *tx) spa_tx_assign_add_nsecs(spa, gethrtime() - before); } -void -dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta) -{ -#ifdef DEBUG_DMU_TX - if (tx->tx_dir == NULL || delta == 0) - return; - - if (delta > 0) { - ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=, - tx->tx_space_towrite); - (void) refcount_add_many(&tx->tx_space_written, delta, NULL); - } else { - (void) refcount_add_many(&tx->tx_space_freed, -delta, NULL); - } -#endif -} - static void dmu_tx_destroy(dmu_tx_t *tx) { @@ -1541,16 +1099,8 @@ dmu_tx_destroy(dmu_tx_t *tx) list_remove(&tx->tx_holds, txh); refcount_destroy_many(&txh->txh_space_towrite, refcount_count(&txh->txh_space_towrite)); - refcount_destroy_many(&txh->txh_space_tofree, - refcount_count(&txh->txh_space_tofree)); - refcount_destroy_many(&txh->txh_space_tooverwrite, - refcount_count(&txh->txh_space_tooverwrite)); - refcount_destroy_many(&txh->txh_space_tounref, - refcount_count(&txh->txh_space_tounref)); refcount_destroy_many(&txh->txh_memory_tohold, refcount_count(&txh->txh_memory_tohold)); - refcount_destroy_many(&txh->txh_fudge, - refcount_count(&txh->txh_fudge)); kmem_free(txh, sizeof (dmu_tx_hold_t)); if (dn != NULL) dnode_rele(dn, tx); @@ -1558,12 +1108,6 @@ dmu_tx_destroy(dmu_tx_t *tx) list_destroy(&tx->tx_callbacks); list_destroy(&tx->tx_holds); -#ifdef DEBUG_DMU_TX - refcount_destroy_many(&tx->tx_space_written, - refcount_count(&tx->tx_space_written)); - refcount_destroy_many(&tx->tx_space_freed, - refcount_count(&tx->tx_space_freed)); -#endif kmem_free(tx, sizeof (dmu_tx_t)); } @@ -1604,11 +1148,6 @@ dmu_tx_commit(dmu_tx_t *tx) if (tx->tx_anyobj == FALSE) txg_rele_to_sync(&tx->tx_txgh); -#ifdef DEBUG_DMU_TX - dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n", - tx->tx_space_towrite, refcount_count(&tx->tx_space_written), - tx->tx_space_tofree, refcount_count(&tx->tx_space_freed)); -#endif dmu_tx_destroy(tx); } @@ -1685,12 +1224,10 @@ dmu_tx_do_callbacks(list_t *cb_list, int error) static void dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx) { - int i; - if (!sa->sa_need_attr_registration) return; - for (i = 0; i != sa->sa_num_attrs; i++) { + for (int i = 0; i != sa->sa_num_attrs; i++) { if (!sa->sa_attr_table[i].sa_registered) { if (sa->sa_reg_attr_obj) dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj, @@ -1702,44 +1239,14 @@ dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx) } } - void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object) { - dnode_t *dn; - 
dmu_tx_hold_t *txh; - - txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, - THT_SPILL, 0, 0); - if (txh == NULL) - return; - - dn = txh->txh_dnode; + dmu_tx_hold_t *txh = dmu_tx_hold_object_impl(tx, + tx->tx_objset, object, THT_SPILL, 0, 0); - if (dn == NULL) - return; - - /* If blkptr doesn't exist then add space to towrite */ - if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) { - (void) refcount_add_many(&txh->txh_space_towrite, - SPA_OLD_MAXBLOCKSIZE, FTAG); - } else { - blkptr_t *bp; - - bp = DN_SPILL_BLKPTR(dn->dn_phys); - if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, - bp, bp->blk_birth)) { - (void) refcount_add_many(&txh->txh_space_tooverwrite, - SPA_OLD_MAXBLOCKSIZE, FTAG); - } else { - (void) refcount_add_many(&txh->txh_space_towrite, - SPA_OLD_MAXBLOCKSIZE, FTAG); - } - if (!BP_IS_HOLE(bp)) { - (void) refcount_add_many(&txh->txh_space_tounref, - SPA_OLD_MAXBLOCKSIZE, FTAG); - } - } + (void) refcount_add_many(&txh->txh_space_towrite, + SPA_OLD_MAXBLOCKSIZE, FTAG); } void @@ -1752,9 +1259,9 @@ dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize) if (tx->tx_objset->os_sa->sa_master_obj == 0) return; - if (tx->tx_objset->os_sa->sa_layout_attr_obj) + if (tx->tx_objset->os_sa->sa_layout_attr_obj) { dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); - else { + } else { dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index be12ac0fe..28ed7de04 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -1949,25 +1949,6 @@ dnode_diduse_space(dnode_t *dn, int64_t delta) } /* - * Call when we think we're going to write/free space in open context to track - * the amount of memory in use by the currently open txg. - */ -void -dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx) -{ - objset_t *os = dn->dn_objset; - dsl_dataset_t *ds = os->os_dsl_dataset; - int64_t aspace = spa_get_asize(os->os_spa, space); - - if (ds != NULL) { - dsl_dir_willuse_space(ds->ds_dir, aspace, tx); - dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx); - } - - dmu_tx_willuse_space(tx, aspace); -} - -/* * Scans a block at the indicated "level" looking for a hole or data, * depending on 'flags'. * diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index c91c2e991..f83da16e5 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -242,42 +242,6 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, return (used); } -uint64_t -dsl_dataset_prev_snap_txg(dsl_dataset_t *ds) -{ - uint64_t trysnap = 0; - - if (ds == NULL) - return (0); - /* - * The snapshot creation could fail, but that would cause an - * incorrect FALSE return, which would only result in an - * overestimation of the amount of space that an operation would - * consume, which is OK. - * - * There's also a small window where we could miss a pending - * snapshot, because we could set the sync task in the quiescing - * phase. So this should only be used as a guess. 
- */ - if (ds->ds_trysnap_txg > - spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa)) - trysnap = ds->ds_trysnap_txg; - return (MAX(dsl_dataset_phys(ds)->ds_prev_snap_txg, trysnap)); -} - -boolean_t -dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp, - uint64_t blk_birth) -{ - if (blk_birth <= dsl_dataset_prev_snap_txg(ds) || - (bp != NULL && BP_IS_HOLE(bp))) - return (B_FALSE); - - ddt_prefetch(dsl_dataset_get_spa(ds), bp); - - return (B_TRUE); -} - /* * We have to release the fsid syncronously or we risk that a subsequent * mount of the same dataset will fail to unique_insert the fsid. This @@ -3731,8 +3695,6 @@ EXPORT_SYMBOL(dsl_dataset_space_wouldfree); EXPORT_SYMBOL(dsl_dataset_sync); EXPORT_SYMBOL(dsl_dataset_block_born); EXPORT_SYMBOL(dsl_dataset_block_kill); -EXPORT_SYMBOL(dsl_dataset_block_freeable); -EXPORT_SYMBOL(dsl_dataset_prev_snap_txg); EXPORT_SYMBOL(dsl_dataset_dirty); EXPORT_SYMBOL(dsl_dataset_stats); EXPORT_SYMBOL(dsl_dataset_fast_stat); diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c index 305a87ed9..98aeff5dc 100644 --- a/module/zfs/dsl_dir.c +++ b/module/zfs/dsl_dir.c @@ -1031,13 +1031,12 @@ static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd) { uint64_t space = 0; - int i; ASSERT(MUTEX_HELD(&dd->dd_lock)); - for (i = 0; i < TXG_SIZE; i++) { - space += dd->dd_space_towrite[i&TXG_MASK]; - ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0); + for (int i = 0; i < TXG_SIZE; i++) { + space += dd->dd_space_towrite[i & TXG_MASK]; + ASSERT3U(dd->dd_space_towrite[i & TXG_MASK], >=, 0); } return (space); } @@ -1117,16 +1116,13 @@ struct tempreserve { static int dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, - boolean_t ignorequota, boolean_t checkrefquota, list_t *tr_list, + boolean_t ignorequota, list_t *tr_list, dmu_tx_t *tx, boolean_t first) { uint64_t txg = tx->tx_txg; - uint64_t est_inflight, used_on_disk, quota, parent_rsrv; - uint64_t deferred = 0; + uint64_t quota; struct tempreserve *tr; int retval = EDQUOT; - int txgidx = txg & TXG_MASK; - int i; uint64_t ref_rsrv = 0; ASSERT3U(txg, !=, 0); @@ -1138,10 +1134,10 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, * Check against the dsl_dir's quota. We don't add in the delta * when checking for over-quota because they get one free hit. */ - est_inflight = dsl_dir_space_towrite(dd); - for (i = 0; i < TXG_SIZE; i++) + uint64_t est_inflight = dsl_dir_space_towrite(dd); + for (int i = 0; i < TXG_SIZE; i++) est_inflight += dd->dd_tempreserved[i]; - used_on_disk = dsl_dir_phys(dd)->dd_used_bytes; + uint64_t used_on_disk = dsl_dir_phys(dd)->dd_used_bytes; /* * On the first iteration, fetch the dataset's used-on-disk and @@ -1152,9 +1148,9 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, int error; dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset; - error = dsl_dataset_check_quota(ds, checkrefquota, + error = dsl_dataset_check_quota(ds, !netfree, asize, est_inflight, &used_on_disk, &ref_rsrv); - if (error) { + if (error != 0) { mutex_exit(&dd->dd_lock); DMU_TX_STAT_BUMP(dmu_tx_quota); return (error); @@ -1180,6 +1176,7 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, * we're very close to full, this will allow a steady trickle of * removes to get through. 
*/ + uint64_t deferred = 0; if (dd->dd_parent == NULL) { spa_t *spa = dd->dd_pool->dp_spa; uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree); @@ -1210,9 +1207,9 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, } /* We need to up our estimated delta before dropping dd_lock */ - dd->dd_tempreserved[txgidx] += asize; + dd->dd_tempreserved[txg & TXG_MASK] += asize; - parent_rsrv = parent_delta(dd, used_on_disk + est_inflight, + uint64_t parent_rsrv = parent_delta(dd, used_on_disk + est_inflight, asize - ref_rsrv); mutex_exit(&dd->dd_lock); @@ -1222,11 +1219,11 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, list_insert_tail(tr_list, tr); /* see if it's OK with our parent */ - if (dd->dd_parent && parent_rsrv) { + if (dd->dd_parent != NULL && parent_rsrv != 0) { boolean_t ismos = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0); return (dsl_dir_tempreserve_impl(dd->dd_parent, - parent_rsrv, netfree, ismos, TRUE, tr_list, tx, FALSE)); + parent_rsrv, netfree, ismos, tr_list, tx, B_FALSE)); } else { return (0); } @@ -1240,7 +1237,7 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, */ int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, - uint64_t fsize, uint64_t usize, void **tr_cookiep, dmu_tx_t *tx) + boolean_t netfree, void **tr_cookiep, dmu_tx_t *tx) { int err; list_t *tr_list; @@ -1254,7 +1251,6 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, list_create(tr_list, sizeof (struct tempreserve), offsetof(struct tempreserve, tr_node)); ASSERT3S(asize, >, 0); - ASSERT3S(fsize, >=, 0); err = arc_tempreserve_space(lsize, tx->tx_txg); if (err == 0) { @@ -1281,8 +1277,8 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, } if (err == 0) { - err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize, - FALSE, asize > usize, tr_list, tx, TRUE); + err = dsl_dir_tempreserve_impl(dd, asize, netfree, + B_FALSE, tr_list, tx, B_TRUE); } if (err != 0) diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 59d9fe576..bc5273e2e 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -1615,7 +1615,7 @@ spa_freeze_txg(spa_t *spa) /* ARGSUSED */ uint64_t -spa_get_asize(spa_t *spa, uint64_t lsize) +spa_get_worst_case_asize(spa_t *spa, uint64_t lsize) { return (lsize * spa_asize_inflation); } @@ -2078,7 +2078,6 @@ EXPORT_SYMBOL(spa_version); EXPORT_SYMBOL(spa_state); EXPORT_SYMBOL(spa_load_state); EXPORT_SYMBOL(spa_freeze_txg); -EXPORT_SYMBOL(spa_get_asize); EXPORT_SYMBOL(spa_get_dspace); EXPORT_SYMBOL(spa_update_dspace); EXPORT_SYMBOL(spa_deflate); diff --git a/module/zfs/zap.c b/module/zfs/zap.c index 907ab2aa5..435278c6e 100644 --- a/module/zfs/zap.c +++ b/module/zfs/zap.c @@ -1357,64 +1357,3 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs) } } } - -int -fzap_count_write(zap_name_t *zn, int add, refcount_t *towrite, - refcount_t *tooverwrite) -{ - zap_t *zap = zn->zn_zap; - zap_leaf_t *l; - int err; - - /* - * Account for the header block of the fatzap. - */ - if (!add && dmu_buf_freeable(zap->zap_dbuf)) { - (void) refcount_add_many(tooverwrite, - zap->zap_dbuf->db_size, FTAG); - } else { - (void) refcount_add_many(towrite, - zap->zap_dbuf->db_size, FTAG); - } - - /* - * Account for the pointer table blocks. - * If we are adding we need to account for the following cases : - * - If the pointer table is embedded, this operation could force an - * external pointer table. 
- * - If this already has an external pointer table this operation - * could extend the table. - */ - if (add) { - if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) { - (void) refcount_add_many(towrite, - zap->zap_dbuf->db_size, FTAG); - } else { - (void) refcount_add_many(towrite, - zap->zap_dbuf->db_size * 3, FTAG); - } - } - - /* - * Now, check if the block containing leaf is freeable - * and account accordingly. - */ - err = zap_deref_leaf(zap, zn->zn_hash, NULL, RW_READER, &l); - if (err != 0) { - return (err); - } - - if (!add && dmu_buf_freeable(l->l_dbuf)) { - (void) refcount_add_many(tooverwrite, l->l_dbuf->db_size, FTAG); - } else { - /* - * If this an add operation, the leaf block could split. - * Hence, we need to account for an additional leaf block. - */ - (void) refcount_add_many(towrite, - (add ? 2 : 1) * l->l_dbuf->db_size, FTAG); - } - - zap_put_leaf(l); - return (0); -} diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index 53f8d2313..28c80e549 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -1594,88 +1594,6 @@ zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) return (0); } -int -zap_count_write_by_dnode(dnode_t *dn, const char *name, int add, - refcount_t *towrite, refcount_t *tooverwrite) -{ - zap_t *zap; - int err = 0; - - /* - * Since, we don't have a name, we cannot figure out which blocks will - * be affected in this operation. So, account for the worst case : - * - 3 blocks overwritten: target leaf, ptrtbl block, header block - * - 4 new blocks written if adding: - * - 2 blocks for possibly split leaves, - * - 2 grown ptrtbl blocks - * - * This also accommodates the case where an add operation to a fairly - * large microzap results in a promotion to fatzap. - */ - if (name == NULL) { - (void) refcount_add_many(towrite, - (3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE, FTAG); - return (err); - } - - /* - * We lock the zap with adding == FALSE. Because, if we pass - * the actual value of add, it could trigger a mzap_upgrade(). - * At present we are just evaluating the possibility of this operation - * and hence we do not want to trigger an upgrade. - */ - err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, - FTAG, &zap); - if (err != 0) - return (err); - - if (!zap->zap_ismicro) { - zap_name_t *zn = zap_name_alloc(zap, name, 0); - if (zn) { - err = fzap_count_write(zn, add, towrite, - tooverwrite); - zap_name_free(zn); - } else { - /* - * We treat this case as similar to (name == NULL) - */ - (void) refcount_add_many(towrite, - (3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE, FTAG); - } - } else { - /* - * We are here if (name != NULL) and this is a micro-zap. - * We account for the header block depending on whether it - * is freeable. - * - * Incase of an add-operation it is hard to find out - * if this add will promote this microzap to fatzap. - * Hence, we consider the worst case and account for the - * blocks assuming this microzap would be promoted to a - * fatzap. 
- * - * 1 block overwritten : header block - * 4 new blocks written : 2 new split leaf, 2 grown - * ptrtbl blocks - */ - if (dmu_buf_freeable(zap->zap_dbuf)) { - (void) refcount_add_many(tooverwrite, - MZAP_MAX_BLKSZ, FTAG); - } else { - (void) refcount_add_many(towrite, - MZAP_MAX_BLKSZ, FTAG); - } - - if (add) { - (void) refcount_add_many(towrite, - 4 * MZAP_MAX_BLKSZ, FTAG); - } - } - - zap_unlockdir(zap, FTAG); - return (err); -} - #if defined(_KERNEL) && defined(HAVE_SPL) EXPORT_SYMBOL(zap_create); EXPORT_SYMBOL(zap_create_dnsize); @@ -1694,7 +1612,6 @@ EXPORT_SYMBOL(zap_lookup_uint64); EXPORT_SYMBOL(zap_contains); EXPORT_SYMBOL(zap_prefetch); EXPORT_SYMBOL(zap_prefetch_uint64); -EXPORT_SYMBOL(zap_count_write_by_dnode); EXPORT_SYMBOL(zap_add); EXPORT_SYMBOL(zap_add_by_dnode); EXPORT_SYMBOL(zap_add_uint64); |
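One behavioral note worth calling out: dmu_tx_mark_netfree() no longer fakes a 1 GB txh_space_tofree/tounref credit; it just sets tx->tx_netfree, which dmu_tx_try_assign() passes straight into dsl_dir_tempreserve_space(). Below is a hedged sketch of the caller pattern, loosely modeled on existing callers such as dmu_free_long_object(); the wrapper name and error handling are illustrative, not code from this diff.

/*
 * Sketch: free an entire object as a "net free" transaction, so
 * (ref)quota enforcement is bypassed while the pool still reserves
 * slop space. Assumes the usual DMU headers are in scope.
 */
static int
free_whole_object(objset_t *os, uint64_t object)
{
	dmu_tx_t *tx = dmu_tx_create(os);
	int error;

	/* declare the free; no per-block "tofree" accounting remains */
	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);

	/* net free of space: skip quota checks instead of faking 1GB freed */
	dmu_tx_mark_netfree(tx);

	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error != 0) {
		dmu_tx_abort(tx);
		return (error);
	}
	error = dmu_object_free(os, object, tx);
	dmu_tx_commit(tx);
	return (error);
}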