Merge branch 'zil-performance'

This branch brings some ZIL performance optimizations, with significant increases in synchronous write performance for some workloads and pool configurations. See the individual commit messages for details. Signed-off-by: Brian Behlendorf <[email protected]> Closes #1013
author: Brian Behlendorf <[email protected]> 2012-10-17 08:57:14 -0700
committer: Brian Behlendorf <[email protected]> 2012-10-17 08:57:49 -0700
commit: 658a0140f3d27f04fb789f5a0fe8ea00773a210a (patch)
tree: 860efd0241842d2b3749c0d2666d055ef152c7cf
parent: 82f46731fd5a9eef4f87530e94922664b58a6138 (diff)
parent: 5d7a86d114c2706a8d14d94b71f81ad5cdf066c5 (diff)
14 files changed, 434 insertions, 27 deletions
diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h
index 2cf4d2b48..99912424b 100644
--- a/include/sys/metaslab.h
+++ b/include/sys/metaslab.h
@@ -50,12 +50,15 @@ extern void metaslab_sync_reassess(metaslab_group_t *mg);
 #define	METASLAB_GANG_HEADER	0x2
 #define	METASLAB_GANG_CHILD	0x4
 #define	METASLAB_GANG_AVOID	0x8
+#define	METASLAB_FASTWRITE	0x10
 
 extern int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
     blkptr_t *bp, int ncopies, uint64_t txg, blkptr_t *hintbp, int flags);
 extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg,
     boolean_t now);
 extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg);
+extern void metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp);
+extern void metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp);
 
 extern metaslab_class_t *metaslab_class_create(spa_t *spa,
     space_map_ops_t *ops);
diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h
index 6c670a162..658359478 100644
--- a/include/sys/metaslab_impl.h
+++ b/include/sys/metaslab_impl.h
@@ -46,6 +46,7 @@ struct metaslab_class {
 	uint64_t		mc_deferred;	/* total deferred frees */
 	uint64_t		mc_space;	/* total space (alloc + free) */
 	uint64_t		mc_dspace;	/* total deflated space */
+	kmutex_t		mc_fastwrite_lock;
 };
 
 struct metaslab_group {
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index 5bd432beb..0b532dcdd 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -156,6 +156,7 @@ struct vdev {
 	uint64_t	vdev_ms_count;	/* number of metaslabs		*/
 	metaslab_group_t *vdev_mg;	/* metaslab group		*/
 	metaslab_t	**vdev_ms;	/* metaslab array		*/
+	uint64_t	vdev_pending_fastwrite; /* allocated fastwrites */
 	txg_list_t	vdev_ms_list;	/* per-txg dirty metaslab lists	*/
 	txg_list_t	vdev_dtl_list;	/* per-txg dirty DTL lists	*/
 	txg_node_t	vdev_txg_node;	/* per-txg dirty vdev linkage	*/
diff --git a/include/sys/zil_impl.h b/include/sys/zil_impl.h
index 1d4c0cc6c..6c37c1ac2 100644
--- a/include/sys/zil_impl.h
+++ b/include/sys/zil_impl.h
@@ -40,6 +40,7 @@ extern "C" {
 typedef struct lwb {
 	zilog_t		*lwb_zilog;	/* back pointer to log struct */
 	blkptr_t	lwb_blk;	/* on disk address of this log blk */
+	boolean_t       lwb_fastwrite;  /* is blk marked for fastwrite? */
 	int		lwb_nused;	/* # used bytes in buffer */
 	int		lwb_sz;		/* size of block and buffer */
 	char		*lwb_buf;	/* log write buffer */
diff --git a/include/sys/zio.h b/include/sys/zio.h
index 4f20cab65..289238c36 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -193,7 +193,8 @@ enum zio_flag {
 	ZIO_FLAG_RAW		= 1 << 21,
 	ZIO_FLAG_GANG_CHILD	= 1 << 22,
 	ZIO_FLAG_DDT_CHILD	= 1 << 23,
-	ZIO_FLAG_GODFATHER	= 1 << 24
+	ZIO_FLAG_GODFATHER	= 1 << 24,
+	ZIO_FLAG_FASTWRITE      = 1 << 25
 };
 
 #define	ZIO_FLAG_MUSTSUCCEED		0
@@ -475,7 +476,7 @@ extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
     const blkptr_t *bp, enum zio_flag flags);
 
 extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp,
-    blkptr_t *old_bp, uint64_t size, boolean_t use_slog);
+    uint64_t size, boolean_t use_slog);
 extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp);
 extern void zio_flush(zio_t *zio, vdev_t *vd);
 extern void zio_shrink(zio_t *zio, uint64_t size);
diff --git a/lib/libspl/asm-generic/atomic.c b/lib/libspl/asm-generic/atomic.c
index de4430f9f..a3223eadc 100644
--- a/lib/libspl/asm-generic/atomic.c
+++ b/lib/libspl/asm-generic/atomic.c
@@ -103,6 +103,31 @@ void atomic_add_ptr(volatile void *target, ssize_t bits)
 }
 
 
+#define ATOMIC_SUB(name, type1, type2) /* emits atomic_sub_<name>() */ \
+	void atomic_sub_##name(volatile type1 *target, type2 bits)	\
+	{								\
+		VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0);	\
+		*target -= bits;	/* done while atomic_lock is held */ \
+		VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0);	\
+	}
+
+ATOMIC_SUB(8, uint8_t, int8_t)	/* one instantiation per width/type name */
+ATOMIC_SUB(char, uchar_t, signed char)
+ATOMIC_SUB(16, uint16_t, int16_t)
+ATOMIC_SUB(short, ushort_t, short)
+ATOMIC_SUB(32, uint32_t, int32_t)
+ATOMIC_SUB(int, uint_t, int)
+ATOMIC_SUB(long, ulong_t, long)
+ATOMIC_SUB(64, uint64_t, int64_t)
+
+void atomic_sub_ptr(volatile void *target, ssize_t bits)	/* pointer-sized subtract under atomic_lock */
+{
+	VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0);
+	*(caddr_t *)target -= bits;	/* *target is read as a caddr_t and stepped back by bits elements */
+	VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0);
+}
+
+
 #define ATOMIC_OR(name, type) \
 	void atomic_or_##name(volatile type *target, type bits)		\
 	{								\
@@ -216,6 +241,37 @@ void *atomic_add_ptr_nv(volatile void *target, ssize_t bits)
 }
 
 
+#define ATOMIC_SUB_NV(name, type1, type2) /* emits atomic_sub_<name>_nv() */ \
+	type1 atomic_sub_##name##_nv(volatile type1 *target, type2 bits)\
+	{								\
+		type1 rc;						\
+		VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0);	\
+		rc = (*target -= bits);	/* rc = value after the subtract */ \
+		VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0);	\
+		return rc;						\
+	}
+
+ATOMIC_SUB_NV(8, uint8_t, int8_t)	/* one instantiation per width/type name */
+ATOMIC_SUB_NV(char, uchar_t, signed char)
+ATOMIC_SUB_NV(16, uint16_t, int16_t)
+ATOMIC_SUB_NV(short, ushort_t, short)
+ATOMIC_SUB_NV(32, uint32_t, int32_t)
+ATOMIC_SUB_NV(int, uint_t, int)
+ATOMIC_SUB_NV(long, ulong_t, long)
+ATOMIC_SUB_NV(64, uint64_t, int64_t)
+
+void *atomic_sub_ptr_nv(volatile void *target, ssize_t bits)	/* as atomic_sub_ptr, but returns the new pointer */
+{
+	void *ptr;
+
+	VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0);
+	ptr = (*(caddr_t *)target -= bits);	/* ptr = pointer value after the subtract */
+	VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0);
+
+	return ptr;
+}
+
+
 #define ATOMIC_OR_NV(name, type) \
 	type atomic_or_##name##_nv(volatile type *target, type bits)	\
 	{								\
diff --git a/lib/libspl/asm-i386/atomic.S b/lib/libspl/asm-i386/atomic.S
index 93c04bfb8..d3d425090 100644
--- a/lib/libspl/asm-i386/atomic.S
+++ b/lib/libspl/asm-i386/atomic.S
@@ -271,6 +271,40 @@
 	SET_SIZE(atomic_add_int)
 	SET_SIZE(atomic_add_32)
 
+	ENTRY(atomic_sub_8)		/* stack args: 4(%esp) = target, 8(%esp) = delta */
+	ALTENTRY(atomic_sub_char)
+	movl	4(%esp), %eax		/* %eax = target address */
+	movl	8(%esp), %ecx		/* %ecx = delta; only %cl is used */
+	lock				/* prefix for the subb below */
+	subb	%cl, (%eax)		/* *target -= delta (8-bit) */
+	ret
+	SET_SIZE(atomic_sub_char)
+	SET_SIZE(atomic_sub_8)
+
+	ENTRY(atomic_sub_16)		/* stack args: 4(%esp) = target, 8(%esp) = delta */
+	ALTENTRY(atomic_sub_short)
+	movl	4(%esp), %eax		/* %eax = target address */
+	movl	8(%esp), %ecx		/* %ecx = delta; only %cx is used */
+	lock				/* prefix for the subw below */
+	subw	%cx, (%eax)		/* *target -= delta (16-bit) */
+	ret
+	SET_SIZE(atomic_sub_short)
+	SET_SIZE(atomic_sub_16)
+
+	ENTRY(atomic_sub_32)		/* stack args: 4(%esp) = target, 8(%esp) = delta */
+	ALTENTRY(atomic_sub_int)
+	ALTENTRY(atomic_sub_ptr)	/* ptr and long share this 32-bit body */
+	ALTENTRY(atomic_sub_long)
+	movl	4(%esp), %eax		/* %eax = target address */
+	movl	8(%esp), %ecx		/* %ecx = delta */
+	lock				/* prefix for the subl below */
+	subl	%ecx, (%eax)		/* *target -= delta (32-bit) */
+	ret
+	SET_SIZE(atomic_sub_long)
+	SET_SIZE(atomic_sub_ptr)
+	SET_SIZE(atomic_sub_int)
+	SET_SIZE(atomic_sub_32)
+
 	ENTRY(atomic_or_8)
 	ALTENTRY(atomic_or_uchar)
 	movl	4(%esp), %eax
@@ -384,6 +418,55 @@
 	SET_SIZE(atomic_add_int_nv)
 	SET_SIZE(atomic_add_32_nv)
 
+	ENTRY(atomic_sub_8_nv)		/* *target -= delta; returns the new value */
+	ALTENTRY(atomic_sub_char_nv)
+	movl	4(%esp), %edx		/* %edx = target address */
+	movb	(%edx), %al		/* %al = current *target */
+1:					/* retry point; %al holds the latest *target */
+	movb	%al, %cl		/* %cl = old value ... */
+	subb	8(%esp), %cl		/* ... minus delta (old - delta, not delta - old) */
+	lock				/* prefix for the cmpxchgb below */
+	cmpxchgb %cl, (%edx)		/* store %cl if *target still equals %al */
+	jne	1b			/* lost a race: %al was reloaded, try again */
+	movzbl	%cl, %eax		/* return the new value, zero-extended */
+	ret
+	SET_SIZE(atomic_sub_char_nv)
+	SET_SIZE(atomic_sub_8_nv)
+
+	ENTRY(atomic_sub_16_nv)		/* *target -= delta; returns the new value */
+	ALTENTRY(atomic_sub_short_nv)
+	movl	4(%esp), %edx		/* %edx = target address */
+	movw	(%edx), %ax		/* %ax = current *target */
+1:					/* retry point; %ax holds the latest *target */
+	movw	%ax, %cx		/* %cx = old value ... */
+	subw	8(%esp), %cx		/* ... minus delta (old - delta, not delta - old) */
+	lock				/* prefix for the cmpxchgw below */
+	cmpxchgw %cx, (%edx)		/* store %cx if *target still equals %ax */
+	jne	1b			/* lost a race: %ax was reloaded, try again */
+	movzwl	%cx, %eax		/* return the new value, zero-extended */
+	ret
+	SET_SIZE(atomic_sub_short_nv)
+	SET_SIZE(atomic_sub_16_nv)
+
+	ENTRY(atomic_sub_32_nv)		/* *target -= delta; returns the new value */
+	ALTENTRY(atomic_sub_int_nv)
+	ALTENTRY(atomic_sub_ptr_nv)	/* ptr and long share this 32-bit body */
+	ALTENTRY(atomic_sub_long_nv)
+	movl	4(%esp), %edx		/* %edx = target address */
+	movl	(%edx), %eax		/* %eax = current *target */
+1:					/* retry point; %eax holds the latest *target */
+	movl	%eax, %ecx		/* %ecx = old value ... */
+	subl	8(%esp), %ecx		/* ... minus delta (old - delta, not delta - old) */
+	lock				/* prefix for the cmpxchgl below */
+	cmpxchgl %ecx, (%edx)		/* store %ecx if *target still equals %eax */
+	jne	1b			/* lost a race: %eax was reloaded, try again */
+	movl	%ecx, %eax		/* return the new value */
+	ret
+	SET_SIZE(atomic_sub_long_nv)
+	SET_SIZE(atomic_sub_ptr_nv)
+	SET_SIZE(atomic_sub_int_nv)
+	SET_SIZE(atomic_sub_32_nv)
+
 	/*
 	 * NOTE: If atomic_add_64 and atomic_add_64_nv are ever
 	 * separated, it is important to edit the libc i386 platform
@@ -413,6 +496,29 @@
 	SET_SIZE(atomic_add_64_nv)
 	SET_SIZE(atomic_add_64)
 
+	ENTRY(atomic_sub_64)		/* 64-bit *target -= delta; new value returned in %edx:%eax */
+	ALTENTRY(atomic_sub_64_nv)
+	pushl	%edi
+	pushl	%ebx			/* after both pushes: 12(%esp) = target, 16/20(%esp) = delta lo/hi */
+	movl	12(%esp), %edi		/* %edi = target address */
+	movl	(%edi), %eax		/* %edx:%eax = current *target */
+	movl	4(%edi), %edx
+1:					/* retry point; %edx:%eax holds the latest *target */
+	movl	%eax, %ebx		/* %ecx:%ebx = old value ... */
+	movl	%edx, %ecx
+	subl	16(%esp), %ebx		/* ... minus delta: low half sets the borrow ... */
+	sbbl	20(%esp), %ecx		/* ... high half consumes it (sbb, not adc) */
+	lock				/* prefix for the cmpxchg8b below */
+	cmpxchg8b (%edi)		/* store %ecx:%ebx if *target still equals %edx:%eax */
+	jne	1b			/* lost a race: %edx:%eax was reloaded, try again */
+	movl	%ebx, %eax		/* return the new value in %edx:%eax */
+	movl	%ecx, %edx
+	popl	%ebx
+	popl	%edi
+	ret
+	SET_SIZE(atomic_sub_64_nv)
+	SET_SIZE(atomic_sub_64)
+
 	ENTRY(atomic_or_8_nv)
 	ALTENTRY(atomic_or_uchar_nv)
 	movl	4(%esp), %edx
diff --git a/lib/libspl/asm-x86_64/atomic.S b/lib/libspl/asm-x86_64/atomic.S
index e321bf732..49c9b2ad1 100644
--- a/lib/libspl/asm-x86_64/atomic.S
+++ b/lib/libspl/asm-x86_64/atomic.S
@@ -232,6 +232,40 @@
 	SET_SIZE(atomic_add_ptr)
 	SET_SIZE(atomic_add_64)
 
+	ENTRY(atomic_sub_8)		/* %rdi = target, %sil = delta */
+	ALTENTRY(atomic_sub_char)
+	lock				/* prefix for the subb below */
+	subb	%sil, (%rdi)		/* *target -= delta (8-bit) */
+	ret
+	SET_SIZE(atomic_sub_char)
+	SET_SIZE(atomic_sub_8)
+
+	ENTRY(atomic_sub_16)		/* %rdi = target, %si = delta */
+	ALTENTRY(atomic_sub_short)
+	lock				/* prefix for the subw below */
+	subw	%si, (%rdi)		/* *target -= delta (16-bit) */
+	ret
+	SET_SIZE(atomic_sub_short)
+	SET_SIZE(atomic_sub_16)
+
+	ENTRY(atomic_sub_32)		/* %rdi = target, %esi = delta */
+	ALTENTRY(atomic_sub_int)
+	lock				/* prefix for the subl below */
+	subl	%esi, (%rdi)		/* *target -= delta (32-bit) */
+	ret
+	SET_SIZE(atomic_sub_int)
+	SET_SIZE(atomic_sub_32)
+
+	ENTRY(atomic_sub_64)		/* %rdi = target, %rsi = delta */
+	ALTENTRY(atomic_sub_ptr)	/* ptr and long share this 64-bit body */
+	ALTENTRY(atomic_sub_long)
+	lock				/* prefix for the subq below */
+	subq	%rsi, (%rdi)		/* *target -= delta (64-bit) */
+	ret
+	SET_SIZE(atomic_sub_long)
+	SET_SIZE(atomic_sub_ptr)
+	SET_SIZE(atomic_sub_64)
+
 	ENTRY(atomic_or_8)
 	ALTENTRY(atomic_or_uchar)
 	lock
@@ -354,6 +388,64 @@
 	SET_SIZE(atomic_add_ptr_nv)
 	SET_SIZE(atomic_add_64_nv)
 
+	ENTRY(atomic_sub_8_nv)		/* %rdi = target, %sil = delta; returns new value */
+	ALTENTRY(atomic_sub_char_nv)
+	movb	(%rdi), %al		/* %al = current *target */
+1:					/* retry point; %al holds the latest *target */
+	movb	%al, %cl		/* %cl = old value ... */
+	subb	%sil, %cl		/* ... minus delta (old - delta, not delta - old) */
+	lock				/* prefix for the cmpxchgb below */
+	cmpxchgb %cl, (%rdi)		/* store %cl if *target still equals %al */
+	jne	1b			/* lost a race: %al was reloaded, try again */
+	movzbl	%cl, %eax		/* return the new value, zero-extended */
+	ret
+	SET_SIZE(atomic_sub_char_nv)
+	SET_SIZE(atomic_sub_8_nv)
+
+	ENTRY(atomic_sub_16_nv)		/* %rdi = target, %si = delta; returns new value */
+	ALTENTRY(atomic_sub_short_nv)
+	movw	(%rdi), %ax		/* %ax = current *target */
+1:					/* retry point; %ax holds the latest *target */
+	movw	%ax, %cx		/* %cx = old value ... */
+	subw	%si, %cx		/* ... minus delta (old - delta, not delta - old) */
+	lock				/* prefix for the cmpxchgw below */
+	cmpxchgw %cx, (%rdi)		/* store %cx if *target still equals %ax */
+	jne	1b			/* lost a race: %ax was reloaded, try again */
+	movzwl	%cx, %eax		/* return the new value, zero-extended */
+	ret
+	SET_SIZE(atomic_sub_short_nv)
+	SET_SIZE(atomic_sub_16_nv)
+
+	ENTRY(atomic_sub_32_nv)		/* %rdi = target, %esi = delta; returns new value */
+	ALTENTRY(atomic_sub_int_nv)
+	movl	(%rdi), %eax		/* %eax = current *target */
+1:					/* retry point; %eax holds the latest *target */
+	movl	%eax, %ecx		/* %ecx = old value ... */
+	subl	%esi, %ecx		/* ... minus delta (old - delta, not delta - old) */
+	lock				/* prefix for the cmpxchgl below */
+	cmpxchgl %ecx, (%rdi)		/* store %ecx if *target still equals %eax */
+	jne	1b			/* lost a race: %eax was reloaded, try again */
+	movl	%ecx, %eax		/* return the new value */
+	ret
+	SET_SIZE(atomic_sub_int_nv)
+	SET_SIZE(atomic_sub_32_nv)
+
+	ENTRY(atomic_sub_64_nv)		/* %rdi = target, %rsi = delta; returns new value */
+	ALTENTRY(atomic_sub_ptr_nv)	/* ptr and long share this 64-bit body */
+	ALTENTRY(atomic_sub_long_nv)
+	movq	(%rdi), %rax		/* %rax = current *target */
+1:					/* retry point; %rax holds the latest *target */
+	movq	%rax, %rcx		/* %rcx = old value ... */
+	subq	%rsi, %rcx		/* ... minus delta (old - delta, not delta - old) */
+	lock				/* prefix for the cmpxchgq below */
+	cmpxchgq %rcx, (%rdi)		/* store %rcx if *target still equals %rax */
+	jne	1b			/* lost a race: %rax was reloaded, try again */
+	movq	%rcx, %rax		/* return the new value */
+	ret
+	SET_SIZE(atomic_sub_long_nv)
+	SET_SIZE(atomic_sub_ptr_nv)
+	SET_SIZE(atomic_sub_64_nv)
+
 	ENTRY(atomic_and_8_nv)
 	ALTENTRY(atomic_and_uchar_nv)
 	movb	(%rdi), %al
diff --git a/lib/libspl/include/atomic.h b/lib/libspl/include/atomic.h
index 508000152..9b0775bb9 100644
--- a/lib/libspl/include/atomic.h
+++ b/lib/libspl/include/atomic.h
@@ -79,6 +79,21 @@ extern void atomic_add_64(volatile uint64_t *, int64_t);
 #endif
 
 /*
+ * Subtract delta from target
+ */
+extern void atomic_sub_8(volatile uint8_t *, int8_t);
+extern void atomic_sub_char(volatile uchar_t *, signed char);
+extern void atomic_sub_16(volatile uint16_t *, int16_t);
+extern void atomic_sub_short(volatile ushort_t *, short);
+extern void atomic_sub_32(volatile uint32_t *, int32_t);
+extern void atomic_sub_int(volatile uint_t *, int);
+extern void atomic_sub_ptr(volatile void *, ssize_t);
+extern void atomic_sub_long(volatile ulong_t *, long);
+#if defined(_INT64_TYPE)
+extern void atomic_sub_64(volatile uint64_t *, int64_t);
+#endif
+
+/*
  * logical OR bits with target
  */
 extern void atomic_or_8(volatile uint8_t *, uint8_t);
@@ -158,6 +173,21 @@ extern uint64_t atomic_add_64_nv(volatile uint64_t *, int64_t);
 #endif
 
 /*
+ * Subtract delta from target and return new value.
+ */
+extern uint8_t atomic_sub_8_nv(volatile uint8_t *, int8_t);
+extern uchar_t atomic_sub_char_nv(volatile uchar_t *, signed char);
+extern uint16_t atomic_sub_16_nv(volatile uint16_t *, int16_t);
+extern ushort_t atomic_sub_short_nv(volatile ushort_t *, short);
+extern uint32_t atomic_sub_32_nv(volatile uint32_t *, int32_t);
+extern uint_t atomic_sub_int_nv(volatile uint_t *, int);
+extern void *atomic_sub_ptr_nv(volatile void *, ssize_t);
+extern ulong_t atomic_sub_long_nv(volatile ulong_t *, long);
+#if defined(_INT64_TYPE)
+extern uint64_t atomic_sub_64_nv(volatile uint64_t *, int64_t);
+#endif
+
+/*
  * logical OR bits with target and return new value.
  */
 extern uint8_t atomic_or_8_nv(volatile uint8_t *, uint8_t);
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index 1d4d1257d..e2abf8cf2 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -1440,7 +1440,7 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
 	zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
 	    zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp,
 	    dmu_sync_late_arrival_ready, dmu_sync_late_arrival_done, dsa,
-	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
+	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, zb));
 
 	return (0);
 }
@@ -1564,7 +1564,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
 	zio_nowait(arc_write(pio, os->os_spa, txg,
 	    bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), &zp,
 	    dmu_sync_ready, dmu_sync_done, dsa,
-	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
+	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, &zb));
 
 	return (0);
 }
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index d06012ffb..d199921b7 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -107,6 +107,7 @@ metaslab_class_create(spa_t *spa, space_map_ops_t *ops)
 	mc->mc_spa = spa;
 	mc->mc_rotor = NULL;
 	mc->mc_ops = ops;
+	mutex_init(&mc->mc_fastwrite_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	return (mc);
 }
@@ -120,6 +121,7 @@ metaslab_class_destroy(metaslab_class_t *mc)
 	ASSERT(mc->mc_space == 0);
 	ASSERT(mc->mc_dspace == 0);
 
+	mutex_destroy(&mc->mc_fastwrite_lock);
 	kmem_free(mc, sizeof (metaslab_class_t));
 }
 
@@ -1307,7 +1309,7 @@ static int
 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
     dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
 {
-	metaslab_group_t *mg, *rotor;
+	metaslab_group_t *mg, *fast_mg, *rotor;
 	vdev_t *vd;
 	int dshift = 3;
 	int all_zero;
@@ -1325,6 +1327,9 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
 	if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
 		return (ENOSPC);
 
+	if (flags & METASLAB_FASTWRITE)
+		mutex_enter(&mc->mc_fastwrite_lock);
+
 	/*
 	 * Start at the rotor and loop through all mgs until we find something.
 	 * Note that there's no locking on mc_rotor or mc_aliquot because
@@ -1367,6 +1372,15 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
 	} else if (d != 0) {
 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
 		mg = vd->vdev_mg->mg_next;
+	} else if (flags & METASLAB_FASTWRITE) {
+		mg = fast_mg = mc->mc_rotor;
+
+		do {
+			if (fast_mg->mg_vd->vdev_pending_fastwrite <
+			    mg->mg_vd->vdev_pending_fastwrite)
+				mg = fast_mg;
+		} while ((fast_mg = fast_mg->mg_next) != mc->mc_rotor);
+
 	} else {
 		mg = mc->mc_rotor;
 	}
@@ -1453,7 +1467,8 @@ top:
 				    (int64_t)mg->mg_aliquot) / 100;
 			}
 
-			if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
+			if ((flags & METASLAB_FASTWRITE) ||
+			    atomic_add_64_nv(&mc->mc_aliquot, asize) >=
 			    mg->mg_aliquot + mg->mg_bias) {
 				mc->mc_rotor = mg->mg_next;
 				mc->mc_aliquot = 0;
@@ -1464,6 +1479,12 @@ top:
 			DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
 			DVA_SET_ASIZE(&dva[d], asize);
 
+			if (flags & METASLAB_FASTWRITE) {
+				atomic_add_64(&vd->vdev_pending_fastwrite,
+				    psize);
+				mutex_exit(&mc->mc_fastwrite_lock);
+			}
+
 			return (0);
 		}
 next:
@@ -1485,6 +1506,8 @@ next:
 
 	bzero(&dva[d], sizeof (dva_t));
 
+	if (flags & METASLAB_FASTWRITE)
+		mutex_exit(&mc->mc_fastwrite_lock);
 	return (ENOSPC);
 }
 
@@ -1678,3 +1701,48 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
 
 	return (error);
 }
+
+void metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp)	/* add bp's psize to vdev_pending_fastwrite of each vdev it maps to */
+{
+	const dva_t *dva = bp->blk_dva;
+	int ndvas = BP_GET_NDVAS(bp);
+	uint64_t psize = BP_GET_PSIZE(bp);
+	int d;
+	vdev_t *vd;
+
+	ASSERT(!BP_IS_HOLE(bp));	/* a hole bp is a caller error */
+	ASSERT(psize > 0);
+
+	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);	/* SCL_VDEV held as reader across the lookups */
+
+	for (d = 0; d < ndvas; d++) {
+		if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL)
+			continue;	/* lookup found no top-level vdev: skip this DVA */
+		atomic_add_64(&vd->vdev_pending_fastwrite, psize);	/* full psize charged once per DVA */
+	}
+
+	spa_config_exit(spa, SCL_VDEV, FTAG);
+}
+
+void metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp)	/* subtract bp's psize from vdev_pending_fastwrite of each vdev it maps to */
+{
+	const dva_t *dva = bp->blk_dva;
+	int ndvas = BP_GET_NDVAS(bp);
+	uint64_t psize = BP_GET_PSIZE(bp);
+	int d;
+	vdev_t *vd;
+
+	ASSERT(!BP_IS_HOLE(bp));	/* a hole bp is a caller error */
+	ASSERT(psize > 0);
+
+	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);	/* SCL_VDEV held as reader across the lookups */
+
+	for (d = 0; d < ndvas; d++) {
+		if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL)
+			continue;	/* lookup found no top-level vdev: skip this DVA */
+		ASSERT3U(vd->vdev_pending_fastwrite, >=, psize);	/* counter must not go below zero */
+		atomic_sub_64(&vd->vdev_pending_fastwrite, psize);
+	}
+
+	spa_config_exit(spa, SCL_VDEV, FTAG);
+}
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 205a1d1aa..7d6d5278a 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -904,6 +904,8 @@ vdev_metaslab_fini(vdev_t *vd)
 		kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
 		vd->vdev_ms = NULL;
 	}
+
+	ASSERT3U(vd->vdev_pending_fastwrite, ==, 0);
 }
 
 typedef struct vdev_probe_stats {
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index e76e5ecf1..220f2d79e 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -38,6 +38,7 @@
 #include <sys/vdev_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/dsl_pool.h>
+#include <sys/metaslab.h>
 
 /*
  * The zfs intent log (ZIL) saves transaction records of system calls
@@ -451,13 +452,14 @@ zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
 }
 
 static lwb_t *
-zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg)
+zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg, boolean_t fastwrite)
 {
 	lwb_t *lwb;
 
 	lwb = kmem_cache_alloc(zil_lwb_cache, KM_PUSHPAGE);
 	lwb->lwb_zilog = zilog;
 	lwb->lwb_blk = *bp;
+	lwb->lwb_fastwrite = fastwrite;
 	lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
 	lwb->lwb_max_txg = txg;
 	lwb->lwb_zio = NULL;
@@ -489,6 +491,7 @@ zil_create(zilog_t *zilog)
 	dmu_tx_t *tx = NULL;
 	blkptr_t blk;
 	int error = 0;
+	boolean_t fastwrite = FALSE;
 
 	/*
 	 * Wait for any previous destroy to complete.
@@ -516,8 +519,9 @@ zil_create(zilog_t *zilog)
 			BP_ZERO(&blk);
 		}
 
-		error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL,
-		    ZIL_MIN_BLKSZ, zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
+		error = zio_alloc_zil(zilog->zl_spa, txg, &blk,
+		    ZIL_MIN_BLKSZ, B_TRUE);
+		fastwrite = TRUE;
 
 		if (error == 0)
 			zil_init_log_chain(zilog, &blk);
@@ -527,7 +531,7 @@ zil_create(zilog_t *zilog)
 	 * Allocate a log write buffer (lwb) for the first log block.
 	 */
 	if (error == 0)
-		lwb = zil_alloc_lwb(zilog, &blk, txg);
+		lwb = zil_alloc_lwb(zilog, &blk, txg, fastwrite);
 
 	/*
 	 * If we just allocated the first log block, commit our transaction
@@ -586,6 +590,10 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first)
 		ASSERT(zh->zh_claim_txg == 0);
 		VERIFY(!keep_first);
 		while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
+			ASSERT(lwb->lwb_zio == NULL);
+			if (lwb->lwb_fastwrite)
+				metaslab_fastwrite_unmark(zilog->zl_spa,
+				    &lwb->lwb_blk);
 			list_remove(&zilog->zl_lwb_list, lwb);
 			if (lwb->lwb_buf != NULL)
 				zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
@@ -826,6 +834,8 @@ zil_lwb_write_done(zio_t *zio)
 	 */
 	zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
 	mutex_enter(&zilog->zl_lock);
+	lwb->lwb_zio = NULL;
+	lwb->lwb_fastwrite = FALSE;
 	lwb->lwb_buf = NULL;
 	lwb->lwb_tx = NULL;
 	mutex_exit(&zilog->zl_lock);
@@ -854,12 +864,21 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
 		zilog->zl_root_zio = zio_root(zilog->zl_spa, NULL, NULL,
 		    ZIO_FLAG_CANFAIL);
 	}
+
+	/* Lock so zil_sync() doesn't fastwrite_unmark after zio is created */
+	mutex_enter(&zilog->zl_lock);
 	if (lwb->lwb_zio == NULL) {
+		if (!lwb->lwb_fastwrite) {
+			metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk);
+			lwb->lwb_fastwrite = 1;
+		}
 		lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
 		    0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk),
 		    zil_lwb_write_done, lwb, ZIO_PRIORITY_LOG_WRITE,
-		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb);
+		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
+		    ZIO_FLAG_FASTWRITE, &zb);
 	}
+	mutex_exit(&zilog->zl_lock);
 }
 
 /*
@@ -876,14 +895,13 @@ uint64_t zil_block_buckets[] = {
 };
 
 /*
- * Use the slog as long as the logbias is 'latency' and the current commit size
- * is less than the limit or the total list size is less than 2X the limit.
- * Limit checking is disabled by setting zil_slog_limit to UINT64_MAX.
+ * Use the slog as long as the current commit size is less than the
+ * limit or the total list size is less than 2X the limit.  Limit
+ * checking is disabled by setting zil_slog_limit to UINT64_MAX.
  */
 unsigned long zil_slog_limit = 1024 * 1024;
-#define	USE_SLOG(zilog) (((zilog)->zl_logbias == ZFS_LOGBIAS_LATENCY) && \
-	(((zilog)->zl_cur_used < zil_slog_limit) || \
-	((zilog)->zl_itx_list_sz < (zil_slog_limit << 1))))
+#define	USE_SLOG(zilog) (((zilog)->zl_cur_used < zil_slog_limit) || \
+	((zilog)->zl_itx_list_sz < (zil_slog_limit << 1)))
 
 /*
  * Start a log block write and advance to the next log block.
@@ -956,10 +974,8 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
 	zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);
 
 	BP_ZERO(bp);
-	/* pass the old blkptr in order to spread log blocks across devs */
 	use_slog = USE_SLOG(zilog);
-	error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz,
-	    use_slog);
+	error = zio_alloc_zil(spa, txg, bp, zil_blksz, USE_SLOG(zilog));
 	if (use_slog)
 	{
 		ZIL_STAT_BUMP(zil_itx_metaslab_slog_count);
@@ -978,7 +994,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
 		/*
 		 * Allocate a new log write buffer (lwb).
 		 */
-		nlwb = zil_alloc_lwb(zilog, bp, txg);
+		nlwb = zil_alloc_lwb(zilog, bp, txg, TRUE);
 
 		/* Record the block for later vdev flushing */
 		zil_add_block(zilog, &lwb->lwb_blk);
@@ -1625,6 +1641,9 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)
 		zh->zh_log = lwb->lwb_blk;
 		if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg)
 			break;
+
+		ASSERT(lwb->lwb_zio == NULL);
+
 		list_remove(&zilog->zl_lwb_list, lwb);
 		zio_free_zil(spa, txg, &lwb->lwb_blk);
 		kmem_cache_free(zil_lwb_cache, lwb);
@@ -1638,6 +1657,19 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)
 		if (list_head(&zilog->zl_lwb_list) == NULL)
 			BP_ZERO(&zh->zh_log);
 	}
+
+	/*
+	 * Remove fastwrite on any blocks that have been pre-allocated for
+	 * the next commit. This prevents fastwrite counter pollution by
+	 * unused, long-lived LWBs.
+	 */
+	for (; lwb != NULL; lwb = list_next(&zilog->zl_lwb_list, lwb)) {
+		if (lwb->lwb_fastwrite && !lwb->lwb_zio) {
+			metaslab_fastwrite_unmark(zilog->zl_spa, &lwb->lwb_blk);
+			lwb->lwb_fastwrite = 0;
+		}
+	}
+
 	mutex_exit(&zilog->zl_lock);
 }
 
@@ -1817,6 +1849,9 @@ zil_close(zilog_t *zilog)
 	lwb = list_head(&zilog->zl_lwb_list);
 	if (lwb != NULL) {
 		ASSERT(lwb == list_tail(&zilog->zl_lwb_list));
+		ASSERT(lwb->lwb_zio == NULL);
+		if (lwb->lwb_fastwrite)
+			metaslab_fastwrite_unmark(zilog->zl_spa, &lwb->lwb_blk);
 		list_remove(&zilog->zl_lwb_list, lwb);
 		zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
 		kmem_cache_free(zil_lwb_cache, lwb);
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index ace72a087..ce76e010c 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -1861,6 +1861,11 @@ zio_write_gang_block(zio_t *pio)
 	 */
 	pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
+	/*
+	 * We didn't allocate this bp, so make sure it doesn't get unmarked.
+	 */
+	pio->io_flags &= ~ZIO_FLAG_FASTWRITE;
+
 	zio_nowait(zio);
 
 	return (ZIO_PIPELINE_CONTINUE);
@@ -2270,6 +2275,7 @@ zio_dva_allocate(zio_t *zio)
 	flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
 	flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
 	    METASLAB_GANG_CHILD : 0;
+	flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? METASLAB_FASTWRITE : 0;
 	error = metaslab_alloc(spa, mc, zio->io_size, bp,
 	    zio->io_prop.zp_copies, zio->io_txg, NULL, flags);
 
@@ -2333,8 +2339,8 @@ zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
  * Try to allocate an intent log block.  Return 0 on success, errno on failure.
  */
 int
-zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
-    uint64_t size, boolean_t use_slog)
+zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, uint64_t size,
+    boolean_t use_slog)
 {
 	int error = 1;
 
@@ -2347,14 +2353,14 @@ zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
 	 */
 	if (use_slog) {
 		error = metaslab_alloc(spa, spa_log_class(spa), size,
-		    new_bp, 1, txg, old_bp,
-		    METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
+		    new_bp, 1, txg, NULL,
+		    METASLAB_FASTWRITE | METASLAB_GANG_AVOID);
 	}
 
 	if (error) {
 		error = metaslab_alloc(spa, spa_normal_class(spa), size,
-		    new_bp, 1, txg, old_bp,
-		    METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
+		    new_bp, 1, txg, NULL,
+		    METASLAB_FASTWRITE | METASLAB_GANG_AVOID);
 	}
 
 	if (error == 0) {
@@ -3066,6 +3072,11 @@ zio_done(zio_t *zio)
 		zfs_ereport_free_checksum(zcr);
 	}
 
+	if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp &&
+	    !BP_IS_HOLE(zio->io_bp)) {
+		metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp);
+	}
+
 	/*
 	 * It is the responsibility of the done callback to ensure that this
 	 * particular zio is no longer discoverable for adoption, and as
author	Brian Behlendorf <[email protected]>	2012-10-17 08:57:14 -0700
committer	Brian Behlendorf <[email protected]>	2012-10-17 08:57:49 -0700
commit	658a0140f3d27f04fb789f5a0fe8ea00773a210a (patch)
tree	860efd0241842d2b3749c0d2666d055ef152c7cf
parent	82f46731fd5a9eef4f87530e94922664b58a6138 (diff)
parent	5d7a86d114c2706a8d14d94b71f81ad5cdf066c5 (diff)