diff options
author | Brian Behlendorf <[email protected]> | 2012-10-17 08:57:14 -0700 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2012-10-17 08:57:49 -0700 |
commit | 658a0140f3d27f04fb789f5a0fe8ea00773a210a (patch) | |
tree | 860efd0241842d2b3749c0d2666d055ef152c7cf | |
parent | 82f46731fd5a9eef4f87530e94922664b58a6138 (diff) | |
parent | 5d7a86d114c2706a8d14d94b71f81ad5cdf066c5 (diff) |
Merge branch 'zil-performance'
This branch brings some ZIL performance optimizations, with
significant increases in synchronous write performance for
some workloads and pool configurations.
See the individual commit messages for details.
Signed-off-by: Brian Behlendorf <[email protected]>
Closes #1013
-rw-r--r-- | include/sys/metaslab.h | 3 | ||||
-rw-r--r-- | include/sys/metaslab_impl.h | 1 | ||||
-rw-r--r-- | include/sys/vdev_impl.h | 1 | ||||
-rw-r--r-- | include/sys/zil_impl.h | 1 | ||||
-rw-r--r-- | include/sys/zio.h | 5 | ||||
-rw-r--r-- | lib/libspl/asm-generic/atomic.c | 56 | ||||
-rw-r--r-- | lib/libspl/asm-i386/atomic.S | 106 | ||||
-rw-r--r-- | lib/libspl/asm-x86_64/atomic.S | 92 | ||||
-rw-r--r-- | lib/libspl/include/atomic.h | 30 | ||||
-rw-r--r-- | module/zfs/dmu.c | 4 | ||||
-rw-r--r-- | module/zfs/metaslab.c | 72 | ||||
-rw-r--r-- | module/zfs/vdev.c | 2 | ||||
-rw-r--r-- | module/zfs/zil.c | 65 | ||||
-rw-r--r-- | module/zfs/zio.c | 23 |
14 files changed, 434 insertions, 27 deletions
diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h index 2cf4d2b48..99912424b 100644 --- a/include/sys/metaslab.h +++ b/include/sys/metaslab.h @@ -50,12 +50,15 @@ extern void metaslab_sync_reassess(metaslab_group_t *mg); #define METASLAB_GANG_HEADER 0x2 #define METASLAB_GANG_CHILD 0x4 #define METASLAB_GANG_AVOID 0x8 +#define METASLAB_FASTWRITE 0x10 extern int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, int ncopies, uint64_t txg, blkptr_t *hintbp, int flags); extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now); extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg); +extern void metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp); +extern void metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp); extern metaslab_class_t *metaslab_class_create(spa_t *spa, space_map_ops_t *ops); diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h index 6c670a162..658359478 100644 --- a/include/sys/metaslab_impl.h +++ b/include/sys/metaslab_impl.h @@ -46,6 +46,7 @@ struct metaslab_class { uint64_t mc_deferred; /* total deferred frees */ uint64_t mc_space; /* total space (alloc + free) */ uint64_t mc_dspace; /* total deflated space */ + kmutex_t mc_fastwrite_lock; }; struct metaslab_group { diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 5bd432beb..0b532dcdd 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -156,6 +156,7 @@ struct vdev { uint64_t vdev_ms_count; /* number of metaslabs */ metaslab_group_t *vdev_mg; /* metaslab group */ metaslab_t **vdev_ms; /* metaslab array */ + uint64_t vdev_pending_fastwrite; /* allocated fastwrites */ txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */ txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */ txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ diff --git a/include/sys/zil_impl.h b/include/sys/zil_impl.h index 1d4c0cc6c..6c37c1ac2 100644 --- 
a/include/sys/zil_impl.h +++ b/include/sys/zil_impl.h @@ -40,6 +40,7 @@ extern "C" { typedef struct lwb { zilog_t *lwb_zilog; /* back pointer to log struct */ blkptr_t lwb_blk; /* on disk address of this log blk */ + boolean_t lwb_fastwrite; /* is blk marked for fastwrite? */ int lwb_nused; /* # used bytes in buffer */ int lwb_sz; /* size of block and buffer */ char *lwb_buf; /* log write buffer */ diff --git a/include/sys/zio.h b/include/sys/zio.h index 4f20cab65..289238c36 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -193,7 +193,8 @@ enum zio_flag { ZIO_FLAG_RAW = 1 << 21, ZIO_FLAG_GANG_CHILD = 1 << 22, ZIO_FLAG_DDT_CHILD = 1 << 23, - ZIO_FLAG_GODFATHER = 1 << 24 + ZIO_FLAG_GODFATHER = 1 << 24, + ZIO_FLAG_FASTWRITE = 1 << 25 }; #define ZIO_FLAG_MUSTSUCCEED 0 @@ -475,7 +476,7 @@ extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, enum zio_flag flags); extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, - blkptr_t *old_bp, uint64_t size, boolean_t use_slog); + uint64_t size, boolean_t use_slog); extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp); extern void zio_flush(zio_t *zio, vdev_t *vd); extern void zio_shrink(zio_t *zio, uint64_t size); diff --git a/lib/libspl/asm-generic/atomic.c b/lib/libspl/asm-generic/atomic.c index de4430f9f..a3223eadc 100644 --- a/lib/libspl/asm-generic/atomic.c +++ b/lib/libspl/asm-generic/atomic.c @@ -103,6 +103,31 @@ void atomic_add_ptr(volatile void *target, ssize_t bits) } +#define ATOMIC_SUB(name, type1, type2) \ + void atomic_sub_##name(volatile type1 *target, type2 bits) \ + { \ + VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \ + *target -= bits; \ + VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ + } + +ATOMIC_SUB(8, uint8_t, int8_t) +ATOMIC_SUB(char, uchar_t, signed char) +ATOMIC_SUB(16, uint16_t, int16_t) +ATOMIC_SUB(short, ushort_t, short) +ATOMIC_SUB(32, uint32_t, int32_t) +ATOMIC_SUB(int, uint_t, int) +ATOMIC_SUB(long, ulong_t, long) 
+ATOMIC_SUB(64, uint64_t, int64_t) + +void atomic_sub_ptr(volatile void *target, ssize_t bits) +{ + VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); + *(caddr_t *)target -= bits; + VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); +} + + #define ATOMIC_OR(name, type) \ void atomic_or_##name(volatile type *target, type bits) \ { \ @@ -216,6 +241,37 @@ void *atomic_add_ptr_nv(volatile void *target, ssize_t bits) } +#define ATOMIC_SUB_NV(name, type1, type2) \ + type1 atomic_sub_##name##_nv(volatile type1 *target, type2 bits)\ + { \ + type1 rc; \ + VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \ + rc = (*target -= bits); \ + VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ + return rc; \ + } + +ATOMIC_SUB_NV(8, uint8_t, int8_t) +ATOMIC_SUB_NV(char, uchar_t, signed char) +ATOMIC_SUB_NV(16, uint16_t, int16_t) +ATOMIC_SUB_NV(short, ushort_t, short) +ATOMIC_SUB_NV(32, uint32_t, int32_t) +ATOMIC_SUB_NV(int, uint_t, int) +ATOMIC_SUB_NV(long, ulong_t, long) +ATOMIC_SUB_NV(64, uint64_t, int64_t) + +void *atomic_sub_ptr_nv(volatile void *target, ssize_t bits) +{ + void *ptr; + + VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); + ptr = (*(caddr_t *)target -= bits); + VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); + + return ptr; +} + + #define ATOMIC_OR_NV(name, type) \ type atomic_or_##name##_nv(volatile type *target, type bits) \ { \ diff --git a/lib/libspl/asm-i386/atomic.S b/lib/libspl/asm-i386/atomic.S index 93c04bfb8..d3d425090 100644 --- a/lib/libspl/asm-i386/atomic.S +++ b/lib/libspl/asm-i386/atomic.S @@ -271,6 +271,40 @@ SET_SIZE(atomic_add_int) SET_SIZE(atomic_add_32) + ENTRY(atomic_sub_8) + ALTENTRY(atomic_sub_char) + movl 4(%esp), %eax + movl 8(%esp), %ecx + lock + subb %cl, (%eax) + ret + SET_SIZE(atomic_sub_char) + SET_SIZE(atomic_sub_8) + + ENTRY(atomic_sub_16) + ALTENTRY(atomic_sub_short) + movl 4(%esp), %eax + movl 8(%esp), %ecx + lock + subw %cx, (%eax) + ret + SET_SIZE(atomic_sub_short) + SET_SIZE(atomic_sub_16) + + ENTRY(atomic_sub_32) 
+ ALTENTRY(atomic_sub_int) + ALTENTRY(atomic_sub_ptr) + ALTENTRY(atomic_sub_long) + movl 4(%esp), %eax + movl 8(%esp), %ecx + lock + subl %ecx, (%eax) + ret + SET_SIZE(atomic_sub_long) + SET_SIZE(atomic_sub_ptr) + SET_SIZE(atomic_sub_int) + SET_SIZE(atomic_sub_32) + ENTRY(atomic_or_8) ALTENTRY(atomic_or_uchar) movl 4(%esp), %eax @@ -384,6 +418,55 @@ SET_SIZE(atomic_add_int_nv) SET_SIZE(atomic_add_32_nv) + ENTRY(atomic_sub_8_nv) + ALTENTRY(atomic_sub_char_nv) + movl 4(%esp), %edx + movb (%edx), %al +1: + movl 8(%esp), %ecx + subb %al, %cl + lock + cmpxchgb %cl, (%edx) + jne 1b + movzbl %cl, %eax + ret + SET_SIZE(atomic_sub_char_nv) + SET_SIZE(atomic_sub_8_nv) + + ENTRY(atomic_sub_16_nv) + ALTENTRY(atomic_sub_short_nv) + movl 4(%esp), %edx + movw (%edx), %ax +1: + movl 8(%esp), %ecx + subw %ax, %cx + lock + cmpxchgw %cx, (%edx) + jne 1b + movzwl %cx, %eax + ret + SET_SIZE(atomic_sub_short_nv) + SET_SIZE(atomic_sub_16_nv) + + ENTRY(atomic_sub_32_nv) + ALTENTRY(atomic_sub_int_nv) + ALTENTRY(atomic_sub_ptr_nv) + ALTENTRY(atomic_sub_long_nv) + movl 4(%esp), %edx + movl (%edx), %eax +1: + movl 8(%esp), %ecx + subl %eax, %ecx + lock + cmpxchgl %ecx, (%edx) + jne 1b + movl %ecx, %eax + ret + SET_SIZE(atomic_sub_long_nv) + SET_SIZE(atomic_sub_ptr_nv) + SET_SIZE(atomic_sub_int_nv) + SET_SIZE(atomic_sub_32_nv) + /* * NOTE: If atomic_add_64 and atomic_add_64_nv are ever * separated, it is important to edit the libc i386 platform @@ -413,6 +496,29 @@ SET_SIZE(atomic_add_64_nv) SET_SIZE(atomic_add_64) + ENTRY(atomic_sub_64) + ALTENTRY(atomic_sub_64_nv) + pushl %edi + pushl %ebx + movl 12(%esp), %edi + movl (%edi), %eax + movl 4(%edi), %edx +1: + movl 16(%esp), %ebx + movl 20(%esp), %ecx + subl %eax, %ebx + adcl %edx, %ecx + lock + cmpxchg8b (%edi) + jne 1b + movl %ebx, %eax + movl %ecx, %edx + popl %ebx + popl %edi + ret + SET_SIZE(atomic_sub_64_nv) + SET_SIZE(atomic_sub_64) + ENTRY(atomic_or_8_nv) ALTENTRY(atomic_or_uchar_nv) movl 4(%esp), %edx diff --git 
a/lib/libspl/asm-x86_64/atomic.S b/lib/libspl/asm-x86_64/atomic.S index e321bf732..49c9b2ad1 100644 --- a/lib/libspl/asm-x86_64/atomic.S +++ b/lib/libspl/asm-x86_64/atomic.S @@ -232,6 +232,40 @@ SET_SIZE(atomic_add_ptr) SET_SIZE(atomic_add_64) + ENTRY(atomic_sub_8) + ALTENTRY(atomic_sub_char) + lock + subb %sil, (%rdi) + ret + SET_SIZE(atomic_sub_char) + SET_SIZE(atomic_sub_8) + + ENTRY(atomic_sub_16) + ALTENTRY(atomic_sub_short) + lock + subw %si, (%rdi) + ret + SET_SIZE(atomic_sub_short) + SET_SIZE(atomic_sub_16) + + ENTRY(atomic_sub_32) + ALTENTRY(atomic_sub_int) + lock + subl %esi, (%rdi) + ret + SET_SIZE(atomic_sub_int) + SET_SIZE(atomic_sub_32) + + ENTRY(atomic_sub_64) + ALTENTRY(atomic_sub_ptr) + ALTENTRY(atomic_sub_long) + lock + subq %rsi, (%rdi) + ret + SET_SIZE(atomic_sub_long) + SET_SIZE(atomic_sub_ptr) + SET_SIZE(atomic_sub_64) + ENTRY(atomic_or_8) ALTENTRY(atomic_or_uchar) lock @@ -354,6 +388,64 @@ SET_SIZE(atomic_add_ptr_nv) SET_SIZE(atomic_add_64_nv) + ENTRY(atomic_sub_8_nv) + ALTENTRY(atomic_sub_char_nv) + movb (%rdi), %al +1: + movb %sil, %cl + subb %al, %cl + lock + cmpxchgb %cl, (%rdi) + jne 1b + movzbl %cl, %eax + ret + SET_SIZE(atomic_sub_char_nv) + SET_SIZE(atomic_sub_8_nv) + + ENTRY(atomic_sub_16_nv) + ALTENTRY(atomic_sub_short_nv) + movw (%rdi), %ax +1: + movw %si, %cx + subw %ax, %cx + lock + cmpxchgw %cx, (%rdi) + jne 1b + movzwl %cx, %eax + ret + SET_SIZE(atomic_sub_short_nv) + SET_SIZE(atomic_sub_16_nv) + + ENTRY(atomic_sub_32_nv) + ALTENTRY(atomic_sub_int_nv) + movl (%rdi), %eax +1: + movl %esi, %ecx + subl %eax, %ecx + lock + cmpxchgl %ecx, (%rdi) + jne 1b + movl %ecx, %eax + ret + SET_SIZE(atomic_sub_int_nv) + SET_SIZE(atomic_sub_32_nv) + + ENTRY(atomic_sub_64_nv) + ALTENTRY(atomic_sub_ptr_nv) + ALTENTRY(atomic_sub_long_nv) + movq (%rdi), %rax +1: + movq %rsi, %rcx + subq %rax, %rcx + lock + cmpxchgq %rcx, (%rdi) + jne 1b + movq %rcx, %rax + ret + SET_SIZE(atomic_sub_long_nv) + SET_SIZE(atomic_sub_ptr_nv) + SET_SIZE(atomic_sub_64_nv) 
+ ENTRY(atomic_and_8_nv) ALTENTRY(atomic_and_uchar_nv) movb (%rdi), %al diff --git a/lib/libspl/include/atomic.h b/lib/libspl/include/atomic.h index 508000152..9b0775bb9 100644 --- a/lib/libspl/include/atomic.h +++ b/lib/libspl/include/atomic.h @@ -79,6 +79,21 @@ extern void atomic_add_64(volatile uint64_t *, int64_t); #endif /* + * Substract delta from target + */ +extern void atomic_sub_8(volatile uint8_t *, int8_t); +extern void atomic_sub_char(volatile uchar_t *, signed char); +extern void atomic_sub_16(volatile uint16_t *, int16_t); +extern void atomic_sub_short(volatile ushort_t *, short); +extern void atomic_sub_32(volatile uint32_t *, int32_t); +extern void atomic_sub_int(volatile uint_t *, int); +extern void atomic_sub_ptr(volatile void *, ssize_t); +extern void atomic_sub_long(volatile ulong_t *, long); +#if defined(_INT64_TYPE) +extern void atomic_sub_64(volatile uint64_t *, int64_t); +#endif + +/* * logical OR bits with target */ extern void atomic_or_8(volatile uint8_t *, uint8_t); @@ -158,6 +173,21 @@ extern uint64_t atomic_add_64_nv(volatile uint64_t *, int64_t); #endif /* + * Substract delta from target + */ +extern uint8_t atomic_sub_8_nv(volatile uint8_t *, int8_t); +extern uchar_t atomic_sub_char_nv(volatile uchar_t *, signed char); +extern uint16_t atomic_sub_16_nv(volatile uint16_t *, int16_t); +extern ushort_t atomic_sub_short_nv(volatile ushort_t *, short); +extern uint32_t atomic_sub_32_nv(volatile uint32_t *, int32_t); +extern uint_t atomic_sub_int_nv(volatile uint_t *, int); +extern void *atomic_sub_ptr_nv(volatile void *, ssize_t); +extern ulong_t atomic_sub_long_nv(volatile ulong_t *, long); +#if defined(_INT64_TYPE) +extern uint64_t atomic_sub_64_nv(volatile uint64_t *, int64_t); +#endif + +/* * logical OR bits with target and return new value. 
*/ extern uint8_t atomic_or_8_nv(volatile uint8_t *, uint8_t); diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 1d4d1257d..e2abf8cf2 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1440,7 +1440,7 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp, zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp, dmu_sync_late_arrival_ready, dmu_sync_late_arrival_done, dsa, - ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); + ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, zb)); return (0); } @@ -1564,7 +1564,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) zio_nowait(arc_write(pio, os->os_spa, txg, bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), &zp, dmu_sync_ready, dmu_sync_done, dsa, - ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); + ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, &zb)); return (0); } diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index d06012ffb..d199921b7 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -107,6 +107,7 @@ metaslab_class_create(spa_t *spa, space_map_ops_t *ops) mc->mc_spa = spa; mc->mc_rotor = NULL; mc->mc_ops = ops; + mutex_init(&mc->mc_fastwrite_lock, NULL, MUTEX_DEFAULT, NULL); return (mc); } @@ -120,6 +121,7 @@ metaslab_class_destroy(metaslab_class_t *mc) ASSERT(mc->mc_space == 0); ASSERT(mc->mc_dspace == 0); + mutex_destroy(&mc->mc_fastwrite_lock); kmem_free(mc, sizeof (metaslab_class_t)); } @@ -1307,7 +1309,7 @@ static int metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags) { - metaslab_group_t *mg, *rotor; + metaslab_group_t *mg, *fast_mg, *rotor; vdev_t *vd; int dshift = 3; int all_zero; @@ -1325,6 +1327,9 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) return (ENOSPC); + if 
(flags & METASLAB_FASTWRITE) + mutex_enter(&mc->mc_fastwrite_lock); + /* * Start at the rotor and loop through all mgs until we find something. * Note that there's no locking on mc_rotor or mc_aliquot because @@ -1367,6 +1372,15 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, } else if (d != 0) { vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); mg = vd->vdev_mg->mg_next; + } else if (flags & METASLAB_FASTWRITE) { + mg = fast_mg = mc->mc_rotor; + + do { + if (fast_mg->mg_vd->vdev_pending_fastwrite < + mg->mg_vd->vdev_pending_fastwrite) + mg = fast_mg; + } while ((fast_mg = fast_mg->mg_next) != mc->mc_rotor); + } else { mg = mc->mc_rotor; } @@ -1453,7 +1467,8 @@ top: (int64_t)mg->mg_aliquot) / 100; } - if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= + if ((flags & METASLAB_FASTWRITE) || + atomic_add_64_nv(&mc->mc_aliquot, asize) >= mg->mg_aliquot + mg->mg_bias) { mc->mc_rotor = mg->mg_next; mc->mc_aliquot = 0; @@ -1464,6 +1479,12 @@ top: DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); DVA_SET_ASIZE(&dva[d], asize); + if (flags & METASLAB_FASTWRITE) { + atomic_add_64(&vd->vdev_pending_fastwrite, + psize); + mutex_exit(&mc->mc_fastwrite_lock); + } + return (0); } next: @@ -1485,6 +1506,8 @@ next: bzero(&dva[d], sizeof (dva_t)); + if (flags & METASLAB_FASTWRITE) + mutex_exit(&mc->mc_fastwrite_lock); return (ENOSPC); } @@ -1678,3 +1701,48 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) return (error); } + +void metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp) +{ + const dva_t *dva = bp->blk_dva; + int ndvas = BP_GET_NDVAS(bp); + uint64_t psize = BP_GET_PSIZE(bp); + int d; + vdev_t *vd; + + ASSERT(!BP_IS_HOLE(bp)); + ASSERT(psize > 0); + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + + for (d = 0; d < ndvas; d++) { + if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL) + continue; + atomic_add_64(&vd->vdev_pending_fastwrite, psize); + } + + spa_config_exit(spa, SCL_VDEV, FTAG); +} + +void 
metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp) +{ + const dva_t *dva = bp->blk_dva; + int ndvas = BP_GET_NDVAS(bp); + uint64_t psize = BP_GET_PSIZE(bp); + int d; + vdev_t *vd; + + ASSERT(!BP_IS_HOLE(bp)); + ASSERT(psize > 0); + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + + for (d = 0; d < ndvas; d++) { + if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL) + continue; + ASSERT3U(vd->vdev_pending_fastwrite, >=, psize); + atomic_sub_64(&vd->vdev_pending_fastwrite, psize); + } + + spa_config_exit(spa, SCL_VDEV, FTAG); +} diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 205a1d1aa..7d6d5278a 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -904,6 +904,8 @@ vdev_metaslab_fini(vdev_t *vd) kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); vd->vdev_ms = NULL; } + + ASSERT3U(vd->vdev_pending_fastwrite, ==, 0); } typedef struct vdev_probe_stats { diff --git a/module/zfs/zil.c b/module/zfs/zil.c index e76e5ecf1..220f2d79e 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -38,6 +38,7 @@ #include <sys/vdev_impl.h> #include <sys/dmu_tx.h> #include <sys/dsl_pool.h> +#include <sys/metaslab.h> /* * The zfs intent log (ZIL) saves transaction records of system calls @@ -451,13 +452,14 @@ zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg) } static lwb_t * -zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg) +zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg, boolean_t fastwrite) { lwb_t *lwb; lwb = kmem_cache_alloc(zil_lwb_cache, KM_PUSHPAGE); lwb->lwb_zilog = zilog; lwb->lwb_blk = *bp; + lwb->lwb_fastwrite = fastwrite; lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp)); lwb->lwb_max_txg = txg; lwb->lwb_zio = NULL; @@ -489,6 +491,7 @@ zil_create(zilog_t *zilog) dmu_tx_t *tx = NULL; blkptr_t blk; int error = 0; + boolean_t fastwrite = FALSE; /* * Wait for any previous destroy to complete. 
@@ -516,8 +519,9 @@ zil_create(zilog_t *zilog) BP_ZERO(&blk); } - error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL, - ZIL_MIN_BLKSZ, zilog->zl_logbias == ZFS_LOGBIAS_LATENCY); + error = zio_alloc_zil(zilog->zl_spa, txg, &blk, + ZIL_MIN_BLKSZ, B_TRUE); + fastwrite = TRUE; if (error == 0) zil_init_log_chain(zilog, &blk); @@ -527,7 +531,7 @@ zil_create(zilog_t *zilog) * Allocate a log write buffer (lwb) for the first log block. */ if (error == 0) - lwb = zil_alloc_lwb(zilog, &blk, txg); + lwb = zil_alloc_lwb(zilog, &blk, txg, fastwrite); /* * If we just allocated the first log block, commit our transaction @@ -586,6 +590,10 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first) ASSERT(zh->zh_claim_txg == 0); VERIFY(!keep_first); while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { + ASSERT(lwb->lwb_zio == NULL); + if (lwb->lwb_fastwrite) + metaslab_fastwrite_unmark(zilog->zl_spa, + &lwb->lwb_blk); list_remove(&zilog->zl_lwb_list, lwb); if (lwb->lwb_buf != NULL) zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); @@ -826,6 +834,8 @@ zil_lwb_write_done(zio_t *zio) */ zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); mutex_enter(&zilog->zl_lock); + lwb->lwb_zio = NULL; + lwb->lwb_fastwrite = FALSE; lwb->lwb_buf = NULL; lwb->lwb_tx = NULL; mutex_exit(&zilog->zl_lock); @@ -854,12 +864,21 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb) zilog->zl_root_zio = zio_root(zilog->zl_spa, NULL, NULL, ZIO_FLAG_CANFAIL); } + + /* Lock so zil_sync() doesn't fastwrite_unmark after zio is created */ + mutex_enter(&zilog->zl_lock); if (lwb->lwb_zio == NULL) { + if (!lwb->lwb_fastwrite) { + metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk); + lwb->lwb_fastwrite = 1; + } lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa, 0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb, ZIO_PRIORITY_LOG_WRITE, - ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb); + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_FASTWRITE, &zb); } + mutex_exit(&zilog->zl_lock); } 
/* @@ -876,14 +895,13 @@ uint64_t zil_block_buckets[] = { }; /* - * Use the slog as long as the logbias is 'latency' and the current commit size - * is less than the limit or the total list size is less than 2X the limit. - * Limit checking is disabled by setting zil_slog_limit to UINT64_MAX. + * Use the slog as long as the current commit size is less than the + * limit or the total list size is less than 2X the limit. Limit + * checking is disabled by setting zil_slog_limit to UINT64_MAX. */ unsigned long zil_slog_limit = 1024 * 1024; -#define USE_SLOG(zilog) (((zilog)->zl_logbias == ZFS_LOGBIAS_LATENCY) && \ - (((zilog)->zl_cur_used < zil_slog_limit) || \ - ((zilog)->zl_itx_list_sz < (zil_slog_limit << 1)))) +#define USE_SLOG(zilog) (((zilog)->zl_cur_used < zil_slog_limit) || \ + ((zilog)->zl_itx_list_sz < (zil_slog_limit << 1))) /* * Start a log block write and advance to the next log block. @@ -956,10 +974,8 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb) zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1); BP_ZERO(bp); - /* pass the old blkptr in order to spread log blocks across devs */ use_slog = USE_SLOG(zilog); - error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz, - use_slog); + error = zio_alloc_zil(spa, txg, bp, zil_blksz, USE_SLOG(zilog)); if (use_slog) { ZIL_STAT_BUMP(zil_itx_metaslab_slog_count); @@ -978,7 +994,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb) /* * Allocate a new log write buffer (lwb). 
*/ - nlwb = zil_alloc_lwb(zilog, bp, txg); + nlwb = zil_alloc_lwb(zilog, bp, txg, TRUE); /* Record the block for later vdev flushing */ zil_add_block(zilog, &lwb->lwb_blk); @@ -1625,6 +1641,9 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx) zh->zh_log = lwb->lwb_blk; if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg) break; + + ASSERT(lwb->lwb_zio == NULL); + list_remove(&zilog->zl_lwb_list, lwb); zio_free_zil(spa, txg, &lwb->lwb_blk); kmem_cache_free(zil_lwb_cache, lwb); @@ -1638,6 +1657,19 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx) if (list_head(&zilog->zl_lwb_list) == NULL) BP_ZERO(&zh->zh_log); } + + /* + * Remove fastwrite on any blocks that have been pre-allocated for + * the next commit. This prevents fastwrite counter pollution by + * unused, long-lived LWBs. + */ + for (; lwb != NULL; lwb = list_next(&zilog->zl_lwb_list, lwb)) { + if (lwb->lwb_fastwrite && !lwb->lwb_zio) { + metaslab_fastwrite_unmark(zilog->zl_spa, &lwb->lwb_blk); + lwb->lwb_fastwrite = 0; + } + } + mutex_exit(&zilog->zl_lock); } @@ -1817,6 +1849,9 @@ zil_close(zilog_t *zilog) lwb = list_head(&zilog->zl_lwb_list); if (lwb != NULL) { ASSERT(lwb == list_tail(&zilog->zl_lwb_list)); + ASSERT(lwb->lwb_zio == NULL); + if (lwb->lwb_fastwrite) + metaslab_fastwrite_unmark(zilog->zl_spa, &lwb->lwb_blk); list_remove(&zilog->zl_lwb_list, lwb); zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); kmem_cache_free(zil_lwb_cache, lwb); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index ace72a087..ce76e010c 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -1861,6 +1861,11 @@ zio_write_gang_block(zio_t *pio) */ pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + /* + * We didn't allocate this bp, so make sure it doesn't get unmarked. + */ + pio->io_flags &= ~ZIO_FLAG_FASTWRITE; + zio_nowait(zio); return (ZIO_PIPELINE_CONTINUE); @@ -2270,6 +2275,7 @@ zio_dva_allocate(zio_t *zio) flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0; flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ? 
METASLAB_GANG_CHILD : 0; + flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? METASLAB_FASTWRITE : 0; error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags); @@ -2333,8 +2339,8 @@ zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) * Try to allocate an intent log block. Return 0 on success, errno on failure. */ int -zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, - uint64_t size, boolean_t use_slog) +zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, uint64_t size, + boolean_t use_slog) { int error = 1; @@ -2347,14 +2353,14 @@ zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, */ if (use_slog) { error = metaslab_alloc(spa, spa_log_class(spa), size, - new_bp, 1, txg, old_bp, - METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID); + new_bp, 1, txg, NULL, + METASLAB_FASTWRITE | METASLAB_GANG_AVOID); } if (error) { error = metaslab_alloc(spa, spa_normal_class(spa), size, - new_bp, 1, txg, old_bp, - METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID); + new_bp, 1, txg, NULL, + METASLAB_FASTWRITE | METASLAB_GANG_AVOID); } if (error == 0) { @@ -3066,6 +3072,11 @@ zio_done(zio_t *zio) zfs_ereport_free_checksum(zcr); } + if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp && + !BP_IS_HOLE(zio->io_bp)) { + metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp); + } + /* * It is the responsibility of the done callback to ensure that this * particular zio is no longer discoverable for adoption, and as |