Diffstat (limited to 'module/zfs')
-rw-r--r--  module/zfs/arc.c         188
-rw-r--r--  module/zfs/zfs_vfsops.c   21
-rw-r--r--  module/zfs/zfs_znode.c     2
-rw-r--r--  module/zfs/zpl_super.c   126
4 files changed, 287 insertions(+), 50 deletions(-)
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index f71c24844..95d14a9e7 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -104,6 +104,14 @@
* protected from simultaneous callbacks from arc_buf_evict()
* and arc_do_user_evicts().
*
+ * It is also possible to register a callback which is run when the
+ * arc_meta_limit is reached and no buffers can be safely evicted. In
+ * this case the arc user should drop a reference on some arc buffers so
+ * they can be reclaimed and the arc_meta_limit honored. For example,
+ * when using the ZPL each dentry holds a reference on a znode. These
+ * dentries must be pruned before the arc buffer holding the znode can
+ * be safely evicted.
+ *
* Note that the majority of the performance stats are manipulated
* with atomic operations.
*
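Taken on its own, the mechanism described in the comment above amounts to: a consumer registers a callback, and the ARC invokes it when references must be dropped. Below is a minimal sketch of a hypothetical consumer. Only the arc_prune_t handle and the arc_add_prune_callback()/arc_remove_prune_callback() calls come from this patch; my_cache_t, my_prune_cb, and my_cache_release are illustrative names, not part of the change.

#include <sys/arc.h>

typedef struct my_cache {
	arc_prune_t	*mc_prune;	/* handle returned at registration */
} my_cache_t;

/* Invoked from the ARC reclaim thread when arc_meta_limit is exceeded. */
static void
my_prune_cb(int64_t bytes_to_free, void *private)
{
	my_cache_t *mc = private;

	/*
	 * Drop enough references that roughly bytes_to_free bytes of
	 * ARC buffers pinned by this cache become evictable.
	 */
	my_cache_release(mc, bytes_to_free);	/* illustrative helper */
}

static void
my_cache_init(my_cache_t *mc)
{
	mc->mc_prune = arc_add_prune_callback(my_prune_cb, mc);
}

static void
my_cache_fini(my_cache_t *mc)
{
	arc_remove_prune_callback(mc->mc_prune);
}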
@@ -120,14 +128,13 @@
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
-#include <sys/refcount.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#ifdef _KERNEL
#include <sys/vmsystm.h>
#include <vm/anon.h>
#include <sys/fs/swapnode.h>
-#include <sys/dnlc.h>
+#include <sys/zpl.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
@@ -141,8 +148,8 @@ extern int zfs_write_limit_shift;
extern uint64_t zfs_write_limit_max;
extern kmutex_t zfs_write_limit_lock;
-#define ARC_REDUCE_DNLC_PERCENT 3
-uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
+/* number of bytes to prune from caches when the arc_meta_limit is reached */
+uint_t arc_meta_prune = 1048576;
typedef enum arc_reclaim_strategy {
ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
@@ -180,7 +187,7 @@ unsigned long zfs_arc_meta_limit = 0;
int zfs_arc_grow_retry = 0;
int zfs_arc_shrink_shift = 0;
int zfs_arc_p_min_shift = 0;
-int zfs_arc_reduce_dnlc_percent = 0;
+int zfs_arc_meta_prune = 0;
/*
* Note that buffers can be in one of 6 states:
@@ -288,6 +295,7 @@ typedef struct arc_stats {
kstat_named_t arcstat_no_grow;
kstat_named_t arcstat_tempreserve;
kstat_named_t arcstat_loaned_bytes;
+ kstat_named_t arcstat_prune;
kstat_named_t arcstat_meta_used;
kstat_named_t arcstat_meta_limit;
kstat_named_t arcstat_meta_max;
@@ -352,6 +360,7 @@ static arc_stats_t arc_stats = {
{ "arc_no_grow", KSTAT_DATA_UINT64 },
{ "arc_tempreserve", KSTAT_DATA_UINT64 },
{ "arc_loaned_bytes", KSTAT_DATA_UINT64 },
+ { "arc_prune", KSTAT_DATA_UINT64 },
{ "arc_meta_used", KSTAT_DATA_UINT64 },
{ "arc_meta_limit", KSTAT_DATA_UINT64 },
{ "arc_meta_max", KSTAT_DATA_UINT64 },
@@ -481,6 +490,8 @@ struct arc_buf_hdr {
list_node_t b_l2node;
};
+static list_t arc_prune_list;
+static kmutex_t arc_prune_mtx;
static arc_buf_t *arc_eviction_list;
static kmutex_t arc_eviction_mtx;
static arc_buf_hdr_t arc_eviction_hdr;
@@ -1925,6 +1936,48 @@ arc_adjust(void)
}
}
+/*
+ * Request that the arc user drop references so that N bytes can be
+ * released from the cache. This provides a mechanism to ensure the
+ * arc can honor the arc_meta_limit and reclaim buffers which are
+ * pinned in the cache by higher layers (i.e. the zpl).
+ */
+static void
+arc_do_user_prune(int64_t adjustment)
+{
+ arc_prune_func_t *func;
+ void *private;
+ arc_prune_t *cp, *np;
+
+ mutex_enter(&arc_prune_mtx);
+
+ cp = list_head(&arc_prune_list);
+ while (cp != NULL) {
+ func = cp->p_pfunc;
+ private = cp->p_private;
+ np = list_next(&arc_prune_list, cp);
+ refcount_add(&cp->p_refcnt, func);
+ mutex_exit(&arc_prune_mtx);
+
+ if (func != NULL)
+ func(adjustment, private);
+
+ mutex_enter(&arc_prune_mtx);
+
+ /* User removed prune callback concurrently with execution */
+ if (refcount_remove(&cp->p_refcnt, func) == 0) {
+ ASSERT(!list_link_active(&cp->p_node));
+ refcount_destroy(&cp->p_refcnt);
+ kmem_free(cp, sizeof (*cp));
+ }
+
+ cp = np;
+ }
+
+ ARCSTAT_BUMP(arcstat_prune);
+ mutex_exit(&arc_prune_mtx);
+}
+
static void
arc_do_user_evicts(void)
{
@@ -1949,6 +2002,32 @@ arc_do_user_evicts(void)
}
/*
+ * Evict only meta data objects from the cache, leaving the data
+ * objects untouched. This is only used to enforce the tunable
+ * arc_meta_limit; if we are unable to evict enough buffers we
+ * notify the user via the prune callback.
+ */
+void
+arc_adjust_meta(int64_t adjustment, boolean_t may_prune)
+{
+ int64_t delta;
+
+ if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+ delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
+ arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_METADATA);
+ adjustment -= delta;
+ }
+
+ if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
+ delta = MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], adjustment);
+ arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_METADATA);
+ adjustment -= delta;
+ }
+
+ if (may_prune && (adjustment > 0) && (arc_meta_used > arc_meta_limit))
+ arc_do_user_prune(arc_meta_prune);
+}
+
+/*
* Flush all *evictable* data from the cache for the given spa.
* NOTE: this will not touch "active" (i.e. referenced) data.
*/
@@ -2085,24 +2164,6 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat)
kmem_cache_t *prev_data_cache = NULL;
extern kmem_cache_t *zio_buf_cache[];
extern kmem_cache_t *zio_data_buf_cache[];
-#ifdef _KERNEL
- int retry = 0;
-
- while ((arc_meta_used >= arc_meta_limit) && (retry < 10)) {
- /*
- * We are exceeding our meta-data cache limit.
- * Purge some DNLC entries to release holds on meta-data.
- */
- dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
- retry++;
- }
-#if defined(__i386)
- /*
- * Reclaim unused memory from all kmem caches.
- */
- kmem_reap();
-#endif
-#endif
/*
* An aggressive reclamation will shrink the cache size as well as
@@ -2121,6 +2182,7 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat)
kmem_cache_reap_now(zio_data_buf_cache[i]);
}
}
+
kmem_cache_reap_now(buf_cache);
kmem_cache_reap_now(hdr_cache);
}
@@ -2131,6 +2193,7 @@ arc_reclaim_thread(void)
clock_t growtime = 0;
arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
callb_cpr_t cpr;
+ int64_t prune;
CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
@@ -2160,9 +2223,14 @@ arc_reclaim_thread(void)
arc_no_grow = FALSE;
}
- /* Keep meta data usage within limits */
- if (arc_meta_used >= arc_meta_limit)
- arc_kmem_reap_now(ARC_RECLAIM_CONS);
+ /*
+ * Keep meta data usage within limits; arc_shrink() is not
+ * used here because it would also collapse the arc_c value
+ * when only the arc_meta_limit is being exceeded.
+ */
+ prune = (int64_t)arc_meta_used - (int64_t)arc_meta_limit;
+ if (prune > 0)
+ arc_adjust_meta(prune, B_TRUE);
arc_adjust();
@@ -2399,16 +2467,27 @@ arc_get_data_buf(arc_buf_t *buf)
state = (arc_mru->arcs_lsize[type] >= size &&
mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
}
+
if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
if (type == ARC_BUFC_METADATA) {
buf->b_data = zio_buf_alloc(size);
arc_space_consume(size, ARC_SPACE_DATA);
+
+ /*
+ * If we are unable to recycle an existing meta buffer,
+ * signal the reclaim thread. It will notify users
+ * via the prune callback to drop references. The
+ * prune callback is run in the context of the reclaim
+ * thread to avoid deadlocking on the hash_lock.
+ */
+ cv_signal(&arc_reclaim_thr_cv);
} else {
ASSERT(type == ARC_BUFC_DATA);
buf->b_data = zio_data_buf_alloc(size);
ARCSTAT_INCR(arcstat_data_size, size);
atomic_add_64(&arc_size, size);
}
+
ARCSTAT_BUMP(arcstat_recycle_miss);
}
ASSERT(buf->b_data != NULL);
@@ -3021,6 +3100,37 @@ top:
return (0);
}
+arc_prune_t *
+arc_add_prune_callback(arc_prune_func_t *func, void *private)
+{
+ arc_prune_t *p;
+
+ p = kmem_alloc(sizeof(*p), KM_SLEEP);
+ p->p_pfunc = func;
+ p->p_private = private;
+ list_link_init(&p->p_node);
+ refcount_create(&p->p_refcnt);
+
+ mutex_enter(&arc_prune_mtx);
+ refcount_add(&p->p_refcnt, &arc_prune_list);
+ list_insert_head(&arc_prune_list, p);
+ mutex_exit(&arc_prune_mtx);
+
+ return (p);
+}
+
+void
+arc_remove_prune_callback(arc_prune_t *p)
+{
+ mutex_enter(&arc_prune_mtx);
+ list_remove(&arc_prune_list, p);
+ if (refcount_remove(&p->p_refcnt, &arc_prune_list) == 0) {
+ refcount_destroy(&p->p_refcnt);
+ kmem_free(p, sizeof (*p));
+ }
+ mutex_exit(&arc_prune_mtx);
+}
+
void
arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
{
@@ -3598,8 +3708,8 @@ arc_init(void)
if (zfs_arc_p_min_shift > 0)
arc_p_min_shift = zfs_arc_p_min_shift;
- if (zfs_arc_reduce_dnlc_percent > 0)
- arc_reduce_dnlc_percent = zfs_arc_reduce_dnlc_percent;
+ if (zfs_arc_meta_prune > 0)
+ arc_meta_prune = zfs_arc_meta_prune;
/* if kmem_flags are set, lets try to use less memory */
if (kmem_debugging())
@@ -3646,7 +3756,10 @@ arc_init(void)
buf_init();
arc_thread_exit = 0;
+ list_create(&arc_prune_list, sizeof (arc_prune_t),
+ offsetof(arc_prune_t, p_node));
arc_eviction_list = NULL;
+ mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
@@ -3674,6 +3787,8 @@ arc_init(void)
void
arc_fini(void)
{
+ arc_prune_t *p;
+
mutex_enter(&arc_reclaim_thr_lock);
#ifdef _KERNEL
spl_unregister_shrinker(&arc_shrinker);
@@ -3693,6 +3808,17 @@ arc_fini(void)
arc_ksp = NULL;
}
+ mutex_enter(&arc_prune_mtx);
+ while ((p = list_head(&arc_prune_list)) != NULL) {
+ list_remove(&arc_prune_list, p);
+ refcount_remove(&p->p_refcnt, &arc_prune_list);
+ refcount_destroy(&p->p_refcnt);
+ kmem_free(p, sizeof (*p));
+ }
+ mutex_exit(&arc_prune_mtx);
+
+ list_destroy(&arc_prune_list);
+ mutex_destroy(&arc_prune_mtx);
mutex_destroy(&arc_eviction_mtx);
mutex_destroy(&arc_reclaim_thr_lock);
cv_destroy(&arc_reclaim_thr_cv);
@@ -4774,6 +4900,8 @@ l2arc_stop(void)
EXPORT_SYMBOL(arc_read);
EXPORT_SYMBOL(arc_buf_remove_ref);
EXPORT_SYMBOL(arc_getbuf_func);
+EXPORT_SYMBOL(arc_add_prune_callback);
+EXPORT_SYMBOL(arc_remove_prune_callback);
module_param(zfs_arc_min, ulong, 0444);
MODULE_PARM_DESC(zfs_arc_min, "Min arc size");
@@ -4784,8 +4912,8 @@ MODULE_PARM_DESC(zfs_arc_max, "Max arc size");
module_param(zfs_arc_meta_limit, ulong, 0444);
MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size");
-module_param(zfs_arc_reduce_dnlc_percent, int, 0444);
-MODULE_PARM_DESC(zfs_arc_reduce_dnlc_percent, "Meta reclaim percentage");
+module_param(zfs_arc_meta_prune, int, 0444);
+MODULE_PARM_DESC(zfs_arc_meta_prune, "Bytes of meta data to prune");
module_param(zfs_arc_grow_retry, int, 0444);
MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");
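Both new knobs are observable from user space once the module is loaded: under the standard Linux module_param convention the tunable should appear as /sys/module/zfs/parameters/zfs_arc_meta_prune (read-only at runtime per the 0444 mode above, so it must be set at module load time), and the new arc_prune counter is exported alongside the other ARC statistics in the arcstats kstat (/proc/spl/kstat/zfs/arcstats under the SPL).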
diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c
index a0726e117..fb319a547 100644
--- a/module/zfs/zfs_vfsops.c
+++ b/module/zfs/zfs_vfsops.c
@@ -986,6 +986,26 @@ zfs_root(zfs_sb_t *zsb, struct inode **ipp)
}
EXPORT_SYMBOL(zfs_root);
+#ifdef HAVE_SHRINK
+int
+zfs_sb_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects)
+{
+ zfs_sb_t *zsb = sb->s_fs_info;
+ struct shrinker *shrinker = &sb->s_shrink;
+ struct shrink_control sc = {
+ .nr_to_scan = nr_to_scan,
+ .gfp_mask = GFP_KERNEL,
+ };
+
+ ZFS_ENTER(zsb);
+ *objects = (*shrinker->shrink)(shrinker, &sc);
+ ZFS_EXIT(zsb);
+
+ return (0);
+}
+EXPORT_SYMBOL(zfs_sb_prune);
+#endif /* HAVE_SHRINK */
+
/*
* Teardown the zfs_sb_t::z_os.
*
@@ -1533,6 +1553,7 @@ zfs_init(void)
zfs_znode_init();
dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb);
register_filesystem(&zpl_fs_type);
+ (void) arc_add_prune_callback(zpl_prune_sbs, NULL);
}
void
diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c
index a35e3b5f2..709ae74f8 100644
--- a/module/zfs/zfs_znode.c
+++ b/module/zfs/zfs_znode.c
@@ -269,6 +269,7 @@ zfs_inode_destroy(struct inode *ip)
mutex_enter(&zsb->z_znodes_lock);
list_remove(&zsb->z_all_znodes, zp);
+ zsb->z_nr_znodes--;
mutex_exit(&zsb->z_znodes_lock);
if (zp->z_acl_cached) {
@@ -401,6 +402,7 @@ zfs_znode_alloc(zfs_sb_t *zsb, dmu_buf_t *db, int blksz,
mutex_enter(&zsb->z_znodes_lock);
list_insert_tail(&zsb->z_all_znodes, zp);
+ zsb->z_nr_znodes++;
membar_producer();
mutex_exit(&zsb->z_znodes_lock);
diff --git a/module/zfs/zpl_super.c b/module/zfs/zpl_super.c
index 650e9c0d3..3abb26a9e 100644
--- a/module/zfs/zpl_super.c
+++ b/module/zfs/zpl_super.c
@@ -199,34 +199,120 @@ zpl_kill_sb(struct super_block *sb)
kill_anon_super(sb);
}
+#ifdef HAVE_SHRINK
+/*
+ * Linux 3.1 - 3.x API
+ *
+ * The Linux 3.1 API introduced per-sb cache shrinkers to replace the
+ * global ones. This allows us a mechanism to cleanly target a specific
+ * zfs file system when the dnode and inode caches grow too large.
+ *
+ * In addition, the 3.0 kernel added the iterate_supers_type() helper
+ * function which is used to safely walk all of the zfs file systems.
+ */
+static void
+zpl_prune_sb(struct super_block *sb, void *arg)
+{
+ int objects = 0;
+ int error;
+
+ error = -zfs_sb_prune(sb, *(unsigned long *)arg, &objects);
+ ASSERT3S(error, <=, 0);
+}
+
+void
+zpl_prune_sbs(int64_t bytes_to_scan, void *private)
+{
+ unsigned long nr_to_scan = (bytes_to_scan / sizeof(znode_t));
+
+ iterate_supers_type(&zpl_fs_type, zpl_prune_sb, &nr_to_scan);
+ kmem_reap();
+}
+#else
+/*
+ * Linux 2.6.x - 3.0 API
+ *
+ * These best effort interfaces are provided by the SPL to induce
+ * the Linux VM subsystem to reclaim a fraction of both the dnode and
+ * inode caches. Ideally we want to target just the zfs file systems;
+ * however, our only option is to reclaim from them all.
+ */
+void
+zpl_prune_sbs(int64_t bytes_to_scan, void *private)
+{
+ unsigned long nr_to_scan = (bytes_to_scan / sizeof(znode_t));
+
+ shrink_dcache_memory(nr_to_scan, GFP_KERNEL);
+ shrink_icache_memory(nr_to_scan, GFP_KERNEL);
+ kmem_reap();
+}
+#endif /* HAVE_SHRINK */
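In both variants above, the byte count handed down by the ARC is converted into an object count for the VFS by dividing by sizeof(znode_t). As a rough worked example, assuming a znode_t of about 1 KiB (the actual size varies by platform and build options):

	bytes_to_scan = arc_meta_prune = 1048576 bytes (the 1 MiB default)
	nr_to_scan    = 1048576 / ~1024 bytes per znode = ~1024 znodes per file system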
+
+#ifdef HAVE_NR_CACHED_OBJECTS
+static int
+zpl_nr_cached_objects(struct super_block *sb)
+{
+ zfs_sb_t *zsb = sb->s_fs_info;
+ int nr;
+
+ mutex_enter(&zsb->z_znodes_lock);
+ nr = zsb->z_nr_znodes;
+ mutex_exit(&zsb->z_znodes_lock);
+
+ return (nr);
+}
+#endif /* HAVE_NR_CACHED_OBJECTS */
+
+#ifdef HAVE_FREE_CACHED_OBJECTS
+/*
+ * Attempt to evict some meta data from the cache. The ARC operates in
+ * terms of bytes while the Linux VFS uses objects. Because this is
+ * just a best effort eviction and the exact values aren't critical,
+ * we extrapolate from an object count to a byte size using the
+ * znode_t size.
+ */
+static void
+zpl_free_cached_objects(struct super_block *sb, int nr_to_scan)
+{
+ arc_adjust_meta(nr_to_scan * sizeof(znode_t), B_FALSE);
+}
+#endif /* HAVE_FREE_CACHED_OBJECTS */
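Here the conversion runs in the opposite direction: a VFS request to scan nr_to_scan objects becomes a request to evict roughly nr_to_scan * sizeof(znode_t) bytes of ARC metadata. Passing B_FALSE for may_prune keeps arc_adjust_meta() from firing the prune callbacks, presumably because this call already originates from the VFS shrinker and pruning would recurse back into it.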
+
const struct super_operations zpl_super_operations = {
- .alloc_inode = zpl_inode_alloc,
- .destroy_inode = zpl_inode_destroy,
- .dirty_inode = NULL,
- .write_inode = NULL,
- .drop_inode = NULL,
+ .alloc_inode = zpl_inode_alloc,
+ .destroy_inode = zpl_inode_destroy,
+ .dirty_inode = NULL,
+ .write_inode = NULL,
+ .drop_inode = NULL,
#ifdef HAVE_EVICT_INODE
- .evict_inode = zpl_evict_inode,
+ .evict_inode = zpl_evict_inode,
#else
- .clear_inode = zpl_clear_inode,
- .delete_inode = zpl_inode_delete,
+ .clear_inode = zpl_clear_inode,
+ .delete_inode = zpl_inode_delete,
#endif /* HAVE_EVICT_INODE */
- .put_super = zpl_put_super,
- .write_super = NULL,
- .sync_fs = zpl_sync_fs,
- .statfs = zpl_statfs,
- .remount_fs = zpl_remount_fs,
- .show_options = zpl_show_options,
- .show_stats = NULL,
+ .put_super = zpl_put_super,
+ .write_super = NULL,
+ .sync_fs = zpl_sync_fs,
+ .statfs = zpl_statfs,
+ .remount_fs = zpl_remount_fs,
+ .show_options = zpl_show_options,
+ .show_stats = NULL,
+#ifdef HAVE_NR_CACHED_OBJECTS
+ .nr_cached_objects = zpl_nr_cached_objects,
+#endif /* HAVE_NR_CACHED_OBJECTS */
+#ifdef HAVE_FREE_CACHED_OBJECTS
+ .free_cached_objects = zpl_free_cached_objects,
+#endif /* HAVE_FREE_CACHED_OBJECTS */
};
struct file_system_type zpl_fs_type = {
- .owner = THIS_MODULE,
- .name = ZFS_DRIVER,
+ .owner = THIS_MODULE,
+ .name = ZFS_DRIVER,
#ifdef HAVE_MOUNT_NODEV
- .mount = zpl_mount,
+ .mount = zpl_mount,
#else
- .get_sb = zpl_get_sb,
+ .get_sb = zpl_get_sb,
#endif /* HAVE_MOUNT_NODEV */
- .kill_sb = zpl_kill_sb,
+ .kill_sb = zpl_kill_sb,
};