Diffstat (limited to 'include/sys')
 -rw-r--r--   include/sys/fs/zfs.h          9
 -rw-r--r--   include/sys/metaslab.h       71
 -rw-r--r--   include/sys/metaslab_impl.h  36
 -rw-r--r--   include/sys/space_map.h       8
 -rw-r--r--   include/sys/zfs_debug.h      17
 5 files changed, 92 insertions(+), 49 deletions(-)
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index d54cd5679..24ad768c0 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -194,6 +194,7 @@ typedef enum {
ZPOOL_PROP_COMMENT,
ZPOOL_PROP_EXPANDSZ,
ZPOOL_PROP_FREEING,
+ ZPOOL_PROP_FRAGMENTATION,
ZPOOL_PROP_LEAKED,
ZPOOL_NUM_PROPS
} zpool_prop_t;
@@ -600,6 +601,13 @@ typedef struct zpool_rewind_policy {
#define SPA_MINDEVSIZE (64ULL << 20)
/*
+ * Set if the fragmentation has not yet been calculated. This can happen
+ * because the space maps have not been upgraded or the histogram feature
+ * is not enabled.
+ */
+#define ZFS_FRAG_INVALID UINT64_MAX
+
+/*
* The location of the pool configuration repository, shared between kernel and
* userland.
*/
@@ -747,6 +755,7 @@ typedef struct vdev_stat {
uint64_t vs_self_healed; /* self-healed bytes */
uint64_t vs_scan_removing; /* removing? */
uint64_t vs_scan_processed; /* scan processed bytes */
+ uint64_t vs_fragmentation; /* device fragmentation */
} vdev_stat_t;
/*
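
The ZFS_FRAG_INVALID sentinel and the new vs_fragmentation vdev stat added above are meant to be checked before the value is shown to a user. A minimal userland-style sketch, assuming a vdev_stat_t has already been fetched; print_fragmentation() and its output format are illustrative, not part of this change:

#include <stdio.h>
#include <stdint.h>

#define	ZFS_FRAG_INVALID	UINT64_MAX

/* Illustrative consumer of the vs_fragmentation field. */
static void
print_fragmentation(uint64_t vs_fragmentation)
{
	if (vs_fragmentation == ZFS_FRAG_INVALID) {
		/* Space maps not upgraded or histogram feature disabled. */
		(void) printf("  FRAG  -\n");
	} else {
		(void) printf("  FRAG  %llu%%\n",
		    (unsigned long long)vs_fragmentation);
	}
}

int
main(void)
{
	print_fragmentation(ZFS_FRAG_INVALID);	/* prints "-" */
	print_fragmentation(17);		/* prints "17%" */
	return (0);
}
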
diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h
index a3bbc25f7..962dafca4 100644
--- a/include/sys/metaslab.h
+++ b/include/sys/metaslab.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
*/
#ifndef _SYS_METASLAB_H
@@ -38,23 +38,22 @@ extern "C" {
typedef struct metaslab_ops {
uint64_t (*msop_alloc)(metaslab_t *msp, uint64_t size);
- boolean_t (*msop_fragmented)(metaslab_t *msp);
} metaslab_ops_t;
extern metaslab_ops_t *zfs_metaslab_ops;
-metaslab_t *metaslab_init(metaslab_group_t *mg, uint64_t id,
- uint64_t object, uint64_t txg);
-void metaslab_fini(metaslab_t *msp);
+metaslab_t *metaslab_init(metaslab_group_t *, uint64_t,
+ uint64_t, uint64_t);
+void metaslab_fini(metaslab_t *);
-void metaslab_load_wait(metaslab_t *msp);
-int metaslab_load(metaslab_t *msp);
-void metaslab_unload(metaslab_t *msp);
+void metaslab_load_wait(metaslab_t *);
+int metaslab_load(metaslab_t *);
+void metaslab_unload(metaslab_t *);
-void metaslab_sync(metaslab_t *msp, uint64_t txg);
-void metaslab_sync_done(metaslab_t *msp, uint64_t txg);
-void metaslab_sync_reassess(metaslab_group_t *mg);
-uint64_t metaslab_block_maxsize(metaslab_t *msp);
+void metaslab_sync(metaslab_t *, uint64_t);
+void metaslab_sync_done(metaslab_t *, uint64_t);
+void metaslab_sync_reassess(metaslab_group_t *);
+uint64_t metaslab_block_maxsize(metaslab_t *);
#define METASLAB_HINTBP_FAVOR 0x0
#define METASLAB_HINTBP_AVOID 0x1
@@ -63,30 +62,36 @@ uint64_t metaslab_block_maxsize(metaslab_t *msp);
#define METASLAB_GANG_AVOID 0x8
#define METASLAB_FASTWRITE 0x10
-int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
- blkptr_t *bp, int ncopies, uint64_t txg, blkptr_t *hintbp, int flags);
-void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now);
-int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg);
-void metaslab_check_free(spa_t *spa, const blkptr_t *bp);
-void metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp);
-void metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp);
+int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
+ blkptr_t *, int, uint64_t, blkptr_t *, int);
+void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t);
+int metaslab_claim(spa_t *, const blkptr_t *, uint64_t);
+void metaslab_check_free(spa_t *, const blkptr_t *);
+void metaslab_fastwrite_mark(spa_t *, const blkptr_t *);
+void metaslab_fastwrite_unmark(spa_t *, const blkptr_t *);
-metaslab_class_t *metaslab_class_create(spa_t *spa, metaslab_ops_t *ops);
-void metaslab_class_destroy(metaslab_class_t *mc);
-int metaslab_class_validate(metaslab_class_t *mc);
+metaslab_class_t *metaslab_class_create(spa_t *, metaslab_ops_t *);
+void metaslab_class_destroy(metaslab_class_t *);
+int metaslab_class_validate(metaslab_class_t *);
+void metaslab_class_histogram_verify(metaslab_class_t *);
+uint64_t metaslab_class_fragmentation(metaslab_class_t *);
+uint64_t metaslab_class_expandable_space(metaslab_class_t *);
-void metaslab_class_space_update(metaslab_class_t *mc,
- int64_t alloc_delta, int64_t defer_delta,
- int64_t space_delta, int64_t dspace_delta);
-uint64_t metaslab_class_get_alloc(metaslab_class_t *mc);
-uint64_t metaslab_class_get_space(metaslab_class_t *mc);
-uint64_t metaslab_class_get_dspace(metaslab_class_t *mc);
-uint64_t metaslab_class_get_deferred(metaslab_class_t *mc);
+void metaslab_class_space_update(metaslab_class_t *, int64_t, int64_t,
+ int64_t, int64_t);
+uint64_t metaslab_class_get_alloc(metaslab_class_t *);
+uint64_t metaslab_class_get_space(metaslab_class_t *);
+uint64_t metaslab_class_get_dspace(metaslab_class_t *);
+uint64_t metaslab_class_get_deferred(metaslab_class_t *);
-metaslab_group_t *metaslab_group_create(metaslab_class_t *mc, vdev_t *vd);
-void metaslab_group_destroy(metaslab_group_t *mg);
-void metaslab_group_activate(metaslab_group_t *mg);
-void metaslab_group_passivate(metaslab_group_t *mg);
+metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *);
+void metaslab_group_destroy(metaslab_group_t *);
+void metaslab_group_activate(metaslab_group_t *);
+void metaslab_group_passivate(metaslab_group_t *);
+uint64_t metaslab_group_get_space(metaslab_group_t *);
+void metaslab_group_histogram_verify(metaslab_group_t *);
+uint64_t metaslab_group_fragmentation(metaslab_group_t *);
+void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *);
#ifdef __cplusplus
}
diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h
index 3cd27d75e..88bda071f 100644
--- a/include/sys/metaslab_impl.h
+++ b/include/sys/metaslab_impl.h
@@ -41,6 +41,23 @@
extern "C" {
#endif
+/*
+ * A metaslab class encompasses a category of allocatable top-level vdevs.
+ * Each top-level vdev is associated with a metaslab group which defines
+ * the allocatable region for that vdev. Examples of these categories include
+ * "normal" for data block allocations (i.e. main pool allocations) or "log"
+ * for allocations designated for intent log devices (i.e. slog devices).
+ * When a block allocation is requested from the SPA it is associated with a
+ * metaslab_class_t, and only top-level vdevs (i.e. metaslab groups) belonging
+ * to the class can be used to satisfy that request. Allocations are done
+ * by traversing the metaslab groups that are linked off of the mc_rotor field.
+ * This rotor points to the next metaslab group where allocations will be
+ * attempted. Allocating a block is a 3 step process -- select the metaslab
+ * group, select the metaslab, and then allocate the block. The metaslab
+ * class defines the low-level block allocator that will be used as the
+ * final step in allocation. These allocators are pluggable allowing each class
+ * to use a block allocator that best suits that class.
+ */
struct metaslab_class {
spa_t *mc_spa;
metaslab_group_t *mc_rotor;
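
The block comment added above describes allocation as a three-step walk: select a metaslab group starting at mc_rotor, select a metaslab within it, and finally call the class's pluggable block allocator. The self-contained sketch below only illustrates that control flow; the toy_* types and callback are stand-ins invented for the example, not the kernel's metaslab_class_t/metaslab_group_t, and the real selection policy (weights, rotor biasing, etc.) is omitted:

#include <stdint.h>

/* Toy stand-ins for metaslab_group_t / metaslab_class_t (example only). */
typedef struct toy_group {
	struct toy_group *mg_next;	/* circular list of groups */
	int mg_allocatable;		/* 0 if low on space, fragmented, offline */
	uint64_t mg_free;		/* free bytes in this group */
} toy_group_t;

typedef struct toy_class {
	toy_group_t *mc_rotor;		/* next group to try */
	/* steps 2 and 3: pick a metaslab in 'mg' and allocate 'size' from it */
	uint64_t (*mc_alloc)(toy_group_t *mg, uint64_t size);
} toy_class_t;

/*
 * Step 1: walk the circular group list starting at the rotor, skipping
 * groups that are currently ineligible, then let mc_alloc() do the rest.
 */
uint64_t
toy_metaslab_alloc(toy_class_t *mc, uint64_t size)
{
	toy_group_t *start = mc->mc_rotor;
	toy_group_t *mg = start;

	do {
		if (mg->mg_allocatable && mg->mg_free >= size) {
			uint64_t offset = mc->mc_alloc(mg, size);

			mc->mc_rotor = mg->mg_next;	/* advance the rotor */
			return (offset);
		}
		mg = mg->mg_next;
	} while (mg != start);

	return (UINT64_MAX);	/* no group in this class could satisfy it */
}
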
@@ -51,9 +68,19 @@ struct metaslab_class {
uint64_t mc_deferred; /* total deferred frees */
uint64_t mc_space; /* total space (alloc + free) */
uint64_t mc_dspace; /* total deflated space */
+ uint64_t mc_histogram[RANGE_TREE_HISTOGRAM_SIZE];
kmutex_t mc_fastwrite_lock;
};
+/*
+ * Metaslab groups encapsulate all the allocatable regions (i.e. metaslabs)
+ * of a top-level vdev. They are linked togther to form a circular linked
+ * list and can belong to only one metaslab class. Metaslab groups may become
+ * ineligible for allocations for a number of reasons such as limited free
+ * space, fragmentation, or going offline. When this happens the allocator will
+ * simply find the next metaslab group in the linked list and attempt
+ * to allocate from that group instead.
+ */
struct metaslab_group {
kmutex_t mg_lock;
avl_tree_t mg_metaslab_tree;
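
The mg_fragmentation and mg_histogram fields added below in this struct, like the metaslab_group_fragmentation() prototype added in metaslab.h, imply that a group condenses its free-space histogram into a single percentage. One plausible shape for such a metric is a segment-weighted average of per-bucket weights, sketched here; the weight table and helper are invented for illustration and are not the algorithm defined by these headers:

#include <stdint.h>

#define	RANGE_TREE_HISTOGRAM_SIZE	64	/* size assumed for the sketch */

/*
 * Hypothetical per-bucket weights (percent): many small free segments
 * read as heavy fragmentation, large contiguous ones as none.
 */
static const int frag_weight[RANGE_TREE_HISTOGRAM_SIZE] = {
	100, 100, 98, 95, 90, 80, 70, 60, 50, 40, 30, 20, 10, 5, 0
	/* remaining buckets stay 0 */
};

uint64_t
toy_fragmentation(const uint64_t histogram[RANGE_TREE_HISTOGRAM_SIZE])
{
	uint64_t segments = 0, weighted = 0;

	for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
		segments += histogram[i];
		weighted += histogram[i] * frag_weight[i];
	}
	return (segments == 0 ? 0 : weighted / segments);
}
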
@@ -67,12 +94,14 @@ struct metaslab_group {
taskq_t *mg_taskq;
metaslab_group_t *mg_prev;
metaslab_group_t *mg_next;
+ uint64_t mg_fragmentation;
+ uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE];
};
/*
* This value defines the number of elements in the ms_lbas array. The value
- * of 64 was chosen as it covers to cover all power of 2 buckets up to
- * UINT64_MAX. This is the equivalent of highbit(UINT64_MAX).
+ * of 64 was chosen as it covers all power of 2 buckets up to UINT64_MAX.
+ * This is the equivalent of highbit(UINT64_MAX).
*/
#define MAX_LBAS 64
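
The comment above equates MAX_LBAS with highbit(UINT64_MAX): UINT64_MAX has bit 63 set, so a 1-indexed highbit returns 64, giving one ms_lbas slot per power-of-2 bucket. A small stand-alone check; the local highbit64() is a plain reimplementation for the example, not the kernel's:

#include <assert.h>
#include <stdint.h>

#define	MAX_LBAS	64

/* 1-indexed position of the highest set bit; 0 for an input of 0. */
static int
highbit64(uint64_t v)
{
	int h = 0;

	while (v != 0) {
		v >>= 1;
		h++;
	}
	return (h);
}

int
main(void)
{
	assert(highbit64(UINT64_MAX) == MAX_LBAS);	/* all 64 buckets used */
	assert(highbit64(1ULL << 63) == 64);
	assert(highbit64(1) == 1);
	return (0);
}
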
@@ -135,6 +164,7 @@ struct metaslab {
uint64_t ms_id;
uint64_t ms_start;
uint64_t ms_size;
+ uint64_t ms_fragmentation;
range_tree_t *ms_alloctree[TXG_SIZE];
range_tree_t *ms_freetree[TXG_SIZE];
@@ -142,12 +172,12 @@ struct metaslab {
range_tree_t *ms_tree;
boolean_t ms_condensing; /* condensing? */
+ boolean_t ms_condense_wanted;
boolean_t ms_loaded;
boolean_t ms_loading;
int64_t ms_deferspace; /* sum of ms_defermap[] space */
uint64_t ms_weight; /* weight vs. others in group */
- uint64_t ms_factor;
uint64_t ms_access_txg;
/*
diff --git a/include/sys/space_map.h b/include/sys/space_map.h
index 369180330..67fa2767b 100644
--- a/include/sys/space_map.h
+++ b/include/sys/space_map.h
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/
#ifndef _SYS_SPACE_MAP_H
@@ -44,9 +44,7 @@ extern "C" {
* maintain backward compatibility.
*/
#define SPACE_MAP_SIZE_V0 (3 * sizeof (uint64_t))
-#define SPACE_MAP_HISTOGRAM_SIZE(sm) \
- (sizeof ((sm)->sm_phys->smp_histogram) / \
- sizeof ((sm)->sm_phys->smp_histogram[0]))
+#define SPACE_MAP_HISTOGRAM_SIZE 32
/*
* The space_map_phys is the on-disk representation of the space map.
@@ -68,7 +66,7 @@ typedef struct space_map_phys {
* whose size is:
* 2^(i+sm_shift) <= size of free region in bytes < 2^(i+sm_shift+1)
*/
- uint64_t smp_histogram[32]; /* histogram of free space */
+ uint64_t smp_histogram[SPACE_MAP_HISTOGRAM_SIZE];
} space_map_phys_t;
/*
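
The bucket rule quoted in the hunk above, 2^(i+sm_shift) <= size < 2^(i+sm_shift+1), is easy to sanity-check: with sm_shift = 9 (512-byte units) a 4 KiB free segment (2^12 bytes) satisfies 2^12 <= size < 2^13 and lands in bucket i = 3. The helper below is hypothetical, written only to demonstrate the arithmetic, and is not an existing space map function:

#include <assert.h>
#include <stdint.h>

#define	SPACE_MAP_HISTOGRAM_SIZE	32

/*
 * Hypothetical: histogram index for a free segment of 'size' bytes on a
 * space map whose minimum allocation unit is 2^sm_shift bytes.  Oversized
 * segments fall into the last bucket.
 */
static int
smp_histogram_idx(uint64_t size, int sm_shift)
{
	int idx = 0;

	/* Advance until 2^(idx+sm_shift) <= size < 2^(idx+sm_shift+1). */
	while (idx < SPACE_MAP_HISTOGRAM_SIZE - 1 &&
	    (size >> (sm_shift + idx + 1)) != 0)
		idx++;
	return (idx);
}

int
main(void)
{
	assert(smp_histogram_idx(4096, 9) == 3);	/* 2^12 <= 4096 < 2^13 */
	assert(smp_histogram_idx(8191, 9) == 3);
	assert(smp_histogram_idx(8192, 9) == 4);
	return (0);
}
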
diff --git a/include/sys/zfs_debug.h b/include/sys/zfs_debug.h
index 829b37a46..35ffa0187 100644
--- a/include/sys/zfs_debug.h
+++ b/include/sys/zfs_debug.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/
#ifndef _SYS_ZFS_DEBUG_H
@@ -50,13 +50,14 @@ extern int zfs_flags;
extern int zfs_recover;
extern int zfs_free_leak_on_eio;
-#define ZFS_DEBUG_DPRINTF (1<<0)
-#define ZFS_DEBUG_DBUF_VERIFY (1<<1)
-#define ZFS_DEBUG_DNODE_VERIFY (1<<2)
-#define ZFS_DEBUG_SNAPNAMES (1<<3)
-#define ZFS_DEBUG_MODIFY (1<<4)
-#define ZFS_DEBUG_SPA (1<<5)
-#define ZFS_DEBUG_ZIO_FREE (1<<6)
+#define ZFS_DEBUG_DPRINTF (1<<0)
+#define ZFS_DEBUG_DBUF_VERIFY (1<<1)
+#define ZFS_DEBUG_DNODE_VERIFY (1<<2)
+#define ZFS_DEBUG_SNAPNAMES (1<<3)
+#define ZFS_DEBUG_MODIFY (1<<4)
+#define ZFS_DEBUG_SPA (1<<5)
+#define ZFS_DEBUG_ZIO_FREE (1<<6)
+#define ZFS_DEBUG_HISTOGRAM_VERIFY (1<<7)
/*
* Always log zfs debug messages to the spl debug subsystem as SS_USER1.
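
Since zfs_flags is a bitmask, the new ZFS_DEBUG_HISTOGRAM_VERIFY bit (1<<7) is enabled by OR-ing it with whatever other bits are wanted. A trivial sketch that just computes a combined value; how that value is then applied to the zfs_flags tunable (module parameter, debugger, etc.) is outside the scope of this header:

#include <stdio.h>

#define	ZFS_DEBUG_DPRINTF		(1<<0)
#define	ZFS_DEBUG_MODIFY		(1<<4)
#define	ZFS_DEBUG_HISTOGRAM_VERIFY	(1<<7)

int
main(void)
{
	int flags = 0;

	flags |= ZFS_DEBUG_DPRINTF;		/* 0x01 */
	flags |= ZFS_DEBUG_MODIFY;		/* 0x10 */
	flags |= ZFS_DEBUG_HISTOGRAM_VERIFY;	/* 0x80 */

	/* Prints 0x91 (145), a value that could be assigned to zfs_flags. */
	(void) printf("zfs_flags = 0x%x (%d)\n", flags, flags);
	return (0);
}
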