diff options
author | Alexander Motin <[email protected]> | 2023-05-25 12:48:43 -0400 |
---|---|---|
committer | GitHub <[email protected]> | 2023-05-25 09:48:43 -0700 |
commit | f63811f07213361d49878648bd597af88d06859c (patch) | |
tree | fc7d387ebf8934ef69b2a99c512bfe416e0bd255 /module/zfs | |
parent | 79b20949b25c8db4d379f6486b0835a6613b480c (diff) |
ZIL: Reduce scope of per-dataset zl_issuer_lock.
Before this change ZIL copied all log data while holding the lock.
It caused huge lock contention on workloads with many big parallel
writes. This change splits the process into two parts: first,
zil_lwb_assign() estimates the log space needed for all transactions,
and zil_lwb_write_close() allocates blocks and zios while holding the
lock, then, after the lock in dropped, zil_lwb_commit() copies the
data, and zil_lwb_write_issue() issues the I/Os.
Also while there slightly reduce scope of zl_lock.
Reviewed-by: Paul Dagnelie <[email protected]>
Reviewed-by: Prakash Surya <[email protected]>
Reviewed-by: Richard Yao <[email protected]>
Signed-off-by: Alexander Motin <[email protected]>
Sponsored by: iXsystems, Inc.
Closes #14841
Diffstat (limited to 'module/zfs')
-rw-r--r-- | module/zfs/zil.c | 428 |
1 files changed, 280 insertions, 148 deletions
diff --git a/module/zfs/zil.c b/module/zfs/zil.c index d887e4900..f2798270a 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -146,6 +146,9 @@ static uint64_t zil_slog_bulk = 768 * 1024; static kmem_cache_t *zil_lwb_cache; static kmem_cache_t *zil_zcw_cache; +static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx); +static itx_t *zil_itx_clone(itx_t *oitx); + static int zil_bp_compare(const void *x1, const void *x2) { @@ -747,20 +750,21 @@ zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg, lwb->lwb_blk = *bp; lwb->lwb_fastwrite = fastwrite; lwb->lwb_slog = slog; + lwb->lwb_indirect = B_FALSE; + if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { + lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t); + lwb->lwb_sz = BP_GET_LSIZE(bp); + } else { + lwb->lwb_nused = lwb->lwb_nfilled = 0; + lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t); + } lwb->lwb_state = LWB_STATE_CLOSED; lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp)); - lwb->lwb_max_txg = txg; lwb->lwb_write_zio = NULL; lwb->lwb_root_zio = NULL; lwb->lwb_issued_timestamp = 0; lwb->lwb_issued_txg = 0; - if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { - lwb->lwb_nused = sizeof (zil_chain_t); - lwb->lwb_sz = BP_GET_LSIZE(bp); - } else { - lwb->lwb_nused = 0; - lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t); - } + lwb->lwb_max_txg = txg; mutex_enter(&zilog->zl_lock); list_insert_tail(&zilog->zl_lwb_list, lwb); @@ -1397,6 +1401,8 @@ zil_lwb_flush_vdevs_done(zio_t *zio) zilog->zl_commit_lr_seq = zilog->zl_lr_seq; } + mutex_exit(&zilog->zl_lock); + while ((itx = list_remove_head(&lwb->lwb_itxs)) != NULL) zil_itx_destroy(itx); @@ -1429,8 +1435,6 @@ zil_lwb_flush_vdevs_done(zio_t *zio) mutex_exit(&zcw->zcw_lock); } - mutex_exit(&zilog->zl_lock); - mutex_enter(&zilog->zl_lwb_io_lock); txg = lwb->lwb_issued_txg; ASSERT3U(zilog->zl_lwb_inflight[txg & TXG_MASK], >, 0); @@ -1666,46 +1670,41 @@ zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb) EQUIV(lwb->lwb_root_zio == NULL, lwb->lwb_state == LWB_STATE_CLOSED); EQUIV(lwb->lwb_root_zio != NULL, lwb->lwb_state == LWB_STATE_OPENED); + if (lwb->lwb_root_zio != NULL) + return; + + lwb->lwb_root_zio = zio_root(zilog->zl_spa, + zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL); + + abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, + BP_GET_LSIZE(&lwb->lwb_blk)); + + if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk) + prio = ZIO_PRIORITY_SYNC_WRITE; + else + prio = ZIO_PRIORITY_ASYNC_WRITE; + SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET], ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); /* Lock so zil_sync() doesn't fastwrite_unmark after zio is created */ mutex_enter(&zilog->zl_lock); - if (lwb->lwb_root_zio == NULL) { - abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, - BP_GET_LSIZE(&lwb->lwb_blk)); - - if (!lwb->lwb_fastwrite) { - metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk); - lwb->lwb_fastwrite = 1; - } - - if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk) - prio = ZIO_PRIORITY_SYNC_WRITE; - else - prio = ZIO_PRIORITY_ASYNC_WRITE; - - lwb->lwb_root_zio = zio_root(zilog->zl_spa, - zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL); - ASSERT3P(lwb->lwb_root_zio, !=, NULL); + if (!lwb->lwb_fastwrite) { + metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk); + lwb->lwb_fastwrite = 1; + } - lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, - zilog->zl_spa, 0, &lwb->lwb_blk, lwb_abd, - BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb, - prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, &zb); - ASSERT3P(lwb->lwb_write_zio, !=, NULL); + lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, zilog->zl_spa, 0, + &lwb->lwb_blk, lwb_abd, BP_GET_LSIZE(&lwb->lwb_blk), + zil_lwb_write_done, lwb, prio, + ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, &zb); - lwb->lwb_state = LWB_STATE_OPENED; + lwb->lwb_state = LWB_STATE_OPENED; - zil_lwb_set_zio_dependency(zilog, lwb); - zilog->zl_last_lwb_opened = lwb; - } + zil_lwb_set_zio_dependency(zilog, lwb); + zilog->zl_last_lwb_opened = lwb; mutex_exit(&zilog->zl_lock); - - ASSERT3P(lwb->lwb_root_zio, !=, NULL); - ASSERT3P(lwb->lwb_write_zio, !=, NULL); - ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); } /* @@ -1736,11 +1735,11 @@ static const struct { static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE; /* - * Start a log block write and advance to the next log block. - * Calls are serialized. + * Close the log block for being issued and allocate the next one. + * Has to be called under zl_issuer_lock to chain more lwbs. */ static lwb_t * -zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) +zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb) { lwb_t *nlwb = NULL; zil_chain_t *zilc; @@ -1748,7 +1747,7 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) blkptr_t *bp; dmu_tx_t *tx; uint64_t txg; - uint64_t zil_blksz, wsz; + uint64_t zil_blksz; int i, error; boolean_t slog; @@ -1757,16 +1756,17 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) ASSERT3P(lwb->lwb_write_zio, !=, NULL); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); - if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { - zilc = (zil_chain_t *)lwb->lwb_buf; - bp = &zilc->zc_next_blk; - } else { - zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz); - bp = &zilc->zc_next_blk; + /* + * If this lwb includes indirect writes, we have to commit before + * creating the transaction, otherwise we may end up in dead lock. + */ + if (lwb->lwb_indirect) { + for (itx_t *itx = list_head(&lwb->lwb_itxs); itx; + itx = list_next(&lwb->lwb_itxs, itx)) + zil_lwb_commit(zilog, lwb, itx); + lwb->lwb_nused = lwb->lwb_nfilled; } - ASSERT(lwb->lwb_nused <= lwb->lwb_sz); - /* * Allocate the next block and save its address in this block * before writing it in order to establish the log chain. @@ -1816,17 +1816,13 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]); zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1); + if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) + zilc = (zil_chain_t *)lwb->lwb_buf; + else + zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz); + bp = &zilc->zc_next_blk; BP_ZERO(bp); error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, zil_blksz, &slog); - if (slog) { - ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count); - ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_bytes, - lwb->lwb_nused); - } else { - ZIL_STAT_BUMP(zilog, zil_itx_metaslab_normal_count); - ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_bytes, - lwb->lwb_nused); - } if (error == 0) { ASSERT3U(bp->blk_birth, ==, txg); bp->blk_cksum = lwb->lwb_blk.blk_cksum; @@ -1838,17 +1834,47 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) nlwb = zil_alloc_lwb(zilog, bp, slog, txg, TRUE); } + lwb->lwb_state = LWB_STATE_ISSUED; + + dmu_tx_commit(tx); + + /* + * If there was an allocation failure then nlwb will be null which + * forces a txg_wait_synced(). + */ + return (nlwb); +} + +/* + * Finalize previously closed block and issue the write zio. + * Does not require locking. + */ +static void +zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) +{ + zil_chain_t *zilc; + int wsz; + + /* Actually fill the lwb with the data if not yet. */ + if (!lwb->lwb_indirect) { + for (itx_t *itx = list_head(&lwb->lwb_itxs); itx; + itx = list_next(&lwb->lwb_itxs, itx)) + zil_lwb_commit(zilog, lwb, itx); + lwb->lwb_nused = lwb->lwb_nfilled; + } + if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { /* For Slim ZIL only write what is used. */ - wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t); - ASSERT3U(wsz, <=, lwb->lwb_sz); + wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, int); + ASSERT3S(wsz, <=, lwb->lwb_sz); zio_shrink(lwb->lwb_write_zio, wsz); wsz = lwb->lwb_write_zio->io_size; + zilc = (zil_chain_t *)lwb->lwb_buf; } else { wsz = lwb->lwb_sz; + zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz); } - zilc->zc_pad = 0; zilc->zc_nused = lwb->lwb_nused; zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum; @@ -1858,22 +1884,20 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) */ memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused); + if (lwb->lwb_slog) { + ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count); + ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_bytes, + lwb->lwb_nused); + } else { + ZIL_STAT_BUMP(zilog, zil_itx_metaslab_normal_count); + ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_bytes, + lwb->lwb_nused); + } spa_config_enter(zilog->zl_spa, SCL_STATE, lwb, RW_READER); - zil_lwb_add_block(lwb, &lwb->lwb_blk); lwb->lwb_issued_timestamp = gethrtime(); - lwb->lwb_state = LWB_STATE_ISSUED; - zio_nowait(lwb->lwb_root_zio); zio_nowait(lwb->lwb_write_zio); - - dmu_tx_commit(tx); - - /* - * If there was an allocation failure then nlwb will be null which - * forces a txg_wait_synced(). - */ - return (nlwb); } /* @@ -1909,13 +1933,19 @@ zil_max_copied_data(zilog_t *zilog) sizeof (lr_write_t)); } +/* + * Estimate space needed in the lwb for the itx. Allocate more lwbs or + * split the itx as needed, but don't touch the actual transaction data. + * Has to be called under zl_issuer_lock to call zil_lwb_write_close() + * to chain more lwbs. + */ static lwb_t * -zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) +zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs) { - lr_t *lrcb, *lrc; - lr_write_t *lrwb, *lrw; - char *lr_buf; - uint64_t dlen, dnow, dpad, lwb_sp, reclen, txg, max_log_data; + itx_t *citx; + lr_t *lr, *clr; + lr_write_t *lrw; + uint64_t dlen, dnow, lwb_sp, reclen, max_log_data; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT3P(lwb, !=, NULL); @@ -1923,8 +1953,8 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) zil_lwb_write_open(zilog, lwb); - lrc = &itx->itx_lr; - lrw = (lr_write_t *)lrc; + lr = &itx->itx_lr; + lrw = (lr_write_t *)lr; /* * A commit itx doesn't represent any on-disk state; instead @@ -1938,24 +1968,23 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) * * For more details, see the comment above zil_commit(). */ - if (lrc->lrc_txtype == TX_COMMIT) { + if (lr->lrc_txtype == TX_COMMIT) { mutex_enter(&zilog->zl_lock); zil_commit_waiter_link_lwb(itx->itx_private, lwb); itx->itx_private = NULL; mutex_exit(&zilog->zl_lock); + list_insert_tail(&lwb->lwb_itxs, itx); return (lwb); } - if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { + if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { dlen = P2ROUNDUP_TYPED( lrw->lr_length, sizeof (uint64_t), uint64_t); - dpad = dlen - lrw->lr_length; } else { - dlen = dpad = 0; + dlen = 0; } - reclen = lrc->lrc_reclen; + reclen = lr->lrc_reclen; zilog->zl_cur_used += (reclen + dlen); - txg = lrc->lrc_txg; cont: /* @@ -1968,7 +1997,8 @@ cont: lwb_sp < zil_max_waste_space(zilog) && (dlen % max_log_data == 0 || lwb_sp < reclen + dlen % max_log_data))) { - lwb = zil_lwb_write_issue(zilog, lwb); + list_insert_tail(ilwbs, lwb); + lwb = zil_lwb_write_close(zilog, lwb); if (lwb == NULL) return (NULL); zil_lwb_write_open(zilog, lwb); @@ -1987,19 +2017,99 @@ cont: } dnow = MIN(dlen, lwb_sp - reclen); - lr_buf = lwb->lwb_buf + lwb->lwb_nused; - memcpy(lr_buf, lrc, reclen); - lrcb = (lr_t *)lr_buf; /* Like lrc, but inside lwb. */ - lrwb = (lr_write_t *)lrcb; /* Like lrw, but inside lwb. */ + if (dlen > dnow) { + ASSERT3U(lr->lrc_txtype, ==, TX_WRITE); + ASSERT3U(itx->itx_wr_state, ==, WR_NEED_COPY); + citx = zil_itx_clone(itx); + clr = &citx->itx_lr; + lr_write_t *clrw = (lr_write_t *)clr; + clrw->lr_length = dnow; + lrw->lr_offset += dnow; + lrw->lr_length -= dnow; + } else { + citx = itx; + clr = lr; + } + + /* + * We're actually making an entry, so update lrc_seq to be the + * log record sequence number. Note that this is generally not + * equal to the itx sequence number because not all transactions + * are synchronous, and sometimes spa_sync() gets there first. + */ + clr->lrc_seq = ++zilog->zl_lr_seq; + + lwb->lwb_nused += reclen + dnow; + ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz); + ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t))); + + zil_lwb_add_txg(lwb, lr->lrc_txg); + list_insert_tail(&lwb->lwb_itxs, citx); + + dlen -= dnow; + if (dlen > 0) { + zilog->zl_cur_used += reclen; + goto cont; + } + + /* + * We have to really issue all queued LWBs before we may have to + * wait for a txg sync. Otherwise we may end up in a dead lock. + */ + if (lr->lrc_txtype == TX_WRITE) { + boolean_t frozen = lr->lrc_txg > spa_freeze_txg(zilog->zl_spa); + if (frozen || itx->itx_wr_state == WR_INDIRECT) { + lwb_t *tlwb; + while ((tlwb = list_remove_head(ilwbs)) != NULL) + zil_lwb_write_issue(zilog, tlwb); + } + if (itx->itx_wr_state == WR_INDIRECT) + lwb->lwb_indirect = B_TRUE; + if (frozen) + txg_wait_synced(zilog->zl_dmu_pool, lr->lrc_txg); + } + + return (lwb); +} + +/* + * Fill the actual transaction data into the lwb, following zil_lwb_assign(). + * Does not require locking. + */ +static void +zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx) +{ + lr_t *lr, *lrb; + lr_write_t *lrw, *lrwb; + char *lr_buf; + uint64_t dlen, reclen; + + lr = &itx->itx_lr; + lrw = (lr_write_t *)lr; + + if (lr->lrc_txtype == TX_COMMIT) + return; + + if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { + dlen = P2ROUNDUP_TYPED( + lrw->lr_length, sizeof (uint64_t), uint64_t); + } else { + dlen = 0; + } + reclen = lr->lrc_reclen; + ASSERT3U(reclen + dlen, <=, lwb->lwb_nused - lwb->lwb_nfilled); + + lr_buf = lwb->lwb_buf + lwb->lwb_nfilled; + memcpy(lr_buf, lr, reclen); + lrb = (lr_t *)lr_buf; /* Like lr, but inside lwb. */ + lrwb = (lr_write_t *)lrb; /* Like lrw, but inside lwb. */ ZIL_STAT_BUMP(zilog, zil_itx_count); /* * If it's a write, fetch the data or get its blkptr as appropriate. */ - if (lrc->lrc_txtype == TX_WRITE) { - if (txg > spa_freeze_txg(zilog->zl_spa)) - txg_wait_synced(zilog->zl_dmu_pool, txg); + if (lr->lrc_txtype == TX_WRITE) { if (itx->itx_wr_state == WR_COPIED) { ZIL_STAT_BUMP(zilog, zil_itx_copied_count); ZIL_STAT_INCR(zilog, zil_itx_copied_bytes, @@ -2010,14 +2120,10 @@ cont: if (itx->itx_wr_state == WR_NEED_COPY) { dbuf = lr_buf + reclen; - lrcb->lrc_reclen += dnow; - if (lrwb->lr_length > dnow) - lrwb->lr_length = dnow; - lrw->lr_offset += dnow; - lrw->lr_length -= dnow; + lrb->lrc_reclen += dlen; ZIL_STAT_BUMP(zilog, zil_itx_needcopy_count); ZIL_STAT_INCR(zilog, zil_itx_needcopy_bytes, - dnow); + dlen); } else { ASSERT3S(itx->itx_wr_state, ==, WR_INDIRECT); dbuf = NULL; @@ -2044,9 +2150,11 @@ cont: error = zilog->zl_get_data(itx->itx_private, itx->itx_gen, lrwb, dbuf, lwb, lwb->lwb_write_zio); - if (dbuf != NULL && error == 0 && dnow == dlen) + if (dbuf != NULL && error == 0) { /* Zero any padding bytes in the last block. */ - memset((char *)dbuf + lrwb->lr_length, 0, dpad); + memset((char *)dbuf + lrwb->lr_length, 0, + dlen - lrwb->lr_length); + } /* * Typically, the only return values we should see from @@ -2074,39 +2182,26 @@ cont: error); zfs_fallthrough; case EIO: - txg_wait_synced(zilog->zl_dmu_pool, txg); + if (lwb->lwb_indirect) { + txg_wait_synced(zilog->zl_dmu_pool, + lr->lrc_txg); + } else { + lwb->lwb_write_zio->io_error = error; + } zfs_fallthrough; case ENOENT: zfs_fallthrough; case EEXIST: zfs_fallthrough; case EALREADY: - return (lwb); + return; } } } - /* - * We're actually making an entry, so update lrc_seq to be the - * log record sequence number. Note that this is generally not - * equal to the itx sequence number because not all transactions - * are synchronous, and sometimes spa_sync() gets there first. - */ - lrcb->lrc_seq = ++zilog->zl_lr_seq; - lwb->lwb_nused += reclen + dnow; - - zil_lwb_add_txg(lwb, txg); - - ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz); - ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t))); - - dlen -= dnow; - if (dlen > 0) { - zilog->zl_cur_used += reclen; - goto cont; - } - - return (lwb); + lwb->lwb_nfilled += reclen + dlen; + ASSERT3S(lwb->lwb_nfilled, <=, lwb->lwb_nused); + ASSERT0(P2PHASE(lwb->lwb_nfilled, sizeof (uint64_t))); } itx_t * @@ -2131,6 +2226,16 @@ zil_itx_create(uint64_t txtype, size_t olrsize) return (itx); } +static itx_t * +zil_itx_clone(itx_t *oitx) +{ + itx_t *itx = zio_data_buf_alloc(oitx->itx_size); + memcpy(itx, oitx, oitx->itx_size); + itx->itx_callback = NULL; + itx->itx_callback_data = NULL; + return (itx); +} + void zil_itx_destroy(itx_t *itx) { @@ -2162,7 +2267,7 @@ zil_itxg_clean(void *arg) /* * In the general case, commit itxs will not be found * here, as they'll be committed to an lwb via - * zil_lwb_commit(), and free'd in that function. Having + * zil_lwb_assign(), and free'd in that function. Having * said that, it is still possible for commit itxs to be * found here, due to the following race: * @@ -2561,7 +2666,7 @@ zil_commit_writer_stall(zilog_t *zilog) * lwb will be issued to the zio layer to be written to disk. */ static void -zil_process_commit_list(zilog_t *zilog) +zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) { spa_t *spa = zilog->zl_spa; list_t nolwb_itxs; @@ -2663,18 +2768,23 @@ zil_process_commit_list(zilog_t *zilog) */ if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT) { if (lwb != NULL) { - lwb = zil_lwb_commit(zilog, itx, lwb); - - if (lwb == NULL) + lwb = zil_lwb_assign(zilog, lwb, itx, ilwbs); + if (lwb == NULL) { list_insert_tail(&nolwb_itxs, itx); - else - list_insert_tail(&lwb->lwb_itxs, itx); + } else if ((zcw->zcw_lwb != NULL && + zcw->zcw_lwb != lwb) || zcw->zcw_done) { + /* + * Our lwb is done, leave the rest of + * itx list to somebody else who care. + */ + first = B_FALSE; + break; + } } else { if (lrc->lrc_txtype == TX_COMMIT) { zil_commit_waiter_link_nolwb( itx->itx_private, &nolwb_waiters); } - list_insert_tail(&nolwb_itxs, itx); } } else { @@ -2690,6 +2800,8 @@ zil_process_commit_list(zilog_t *zilog) * the ZIL write pipeline; see the comment within * zil_commit_writer_stall() for more details. */ + while ((lwb = list_remove_head(ilwbs)) != NULL) + zil_lwb_write_issue(zilog, lwb); zil_commit_writer_stall(zilog); /* @@ -2735,13 +2847,13 @@ zil_process_commit_list(zilog_t *zilog) * on the system, such that this function will be * immediately called again (not necessarily by the same * thread) and this lwb's zio will be issued via - * zil_lwb_commit(). This way, the lwb is guaranteed to + * zil_lwb_assign(). This way, the lwb is guaranteed to * be "full" when it is issued to disk, and we'll make * use of the lwb's size the best we can. * * 2. If there isn't sufficient ZIL activity occurring on * the system, such that this lwb's zio isn't issued via - * zil_lwb_commit(), zil_commit_waiter() will issue the + * zil_lwb_assign(), zil_commit_waiter() will issue the * lwb's zio. If this occurs, the lwb is not guaranteed * to be "full" by the time its zio is issued, and means * the size of the lwb was "too large" given the amount @@ -2773,10 +2885,15 @@ zil_process_commit_list(zilog_t *zilog) zfs_commit_timeout_pct / 100; if (sleep < zil_min_commit_timeout || lwb->lwb_sz - lwb->lwb_nused < lwb->lwb_sz / 8) { - lwb = zil_lwb_write_issue(zilog, lwb); + list_insert_tail(ilwbs, lwb); + lwb = zil_lwb_write_close(zilog, lwb); zilog->zl_cur_used = 0; - if (lwb == NULL) + if (lwb == NULL) { + while ((lwb = list_remove_head(ilwbs)) + != NULL) + zil_lwb_write_issue(zilog, lwb); zil_commit_writer_stall(zilog); + } } } } @@ -2799,9 +2916,13 @@ zil_process_commit_list(zilog_t *zilog) static void zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw) { + list_t ilwbs; + lwb_t *lwb; + ASSERT(!MUTEX_HELD(&zilog->zl_lock)); ASSERT(spa_writeable(zilog->zl_spa)); + list_create(&ilwbs, sizeof (lwb_t), offsetof(lwb_t, lwb_issue_node)); mutex_enter(&zilog->zl_issuer_lock); if (zcw->zcw_lwb != NULL || zcw->zcw_done) { @@ -2828,10 +2949,13 @@ zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw) zil_get_commit_list(zilog); zil_prune_commit_list(zilog); - zil_process_commit_list(zilog); + zil_process_commit_list(zilog, zcw, &ilwbs); out: mutex_exit(&zilog->zl_issuer_lock); + while ((lwb = list_remove_head(&ilwbs)) != NULL) + zil_lwb_write_issue(zilog, lwb); + list_destroy(&ilwbs); } static void @@ -2858,7 +2982,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) return; /* - * In order to call zil_lwb_write_issue() we must hold the + * In order to call zil_lwb_write_close() we must hold the * zilog's "zl_issuer_lock". We can't simply acquire that lock, * since we're already holding the commit waiter's "zcw_lock", * and those two locks are acquired in the opposite order @@ -2876,8 +3000,10 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * the waiter is marked "done"), so without this check we could * wind up with a use-after-free error below. */ - if (zcw->zcw_done) + if (zcw->zcw_done) { + lwb = NULL; goto out; + } ASSERT3P(lwb, ==, zcw->zcw_lwb); @@ -2896,15 +3022,17 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * if it's ISSUED or OPENED, and block any other threads that might * attempt to issue this lwb. For that reason we hold the * zl_issuer_lock when checking the lwb_state; we must not call - * zil_lwb_write_issue() if the lwb had already been issued. + * zil_lwb_write_close() if the lwb had already been issued. * * See the comment above the lwb_state_t structure definition for * more details on the lwb states, and locking requirements. */ if (lwb->lwb_state == LWB_STATE_ISSUED || lwb->lwb_state == LWB_STATE_WRITE_DONE || - lwb->lwb_state == LWB_STATE_FLUSH_DONE) + lwb->lwb_state == LWB_STATE_FLUSH_DONE) { + lwb = NULL; goto out; + } ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); @@ -2914,7 +3042,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * since we've reached the commit waiter's timeout and it still * hasn't been issued. */ - lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb); + lwb_t *nlwb = zil_lwb_write_close(zilog, lwb); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED); @@ -2934,7 +3062,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) if (nlwb == NULL) { /* - * When zil_lwb_write_issue() returns NULL, this + * When zil_lwb_write_close() returns NULL, this * indicates zio_alloc_zil() failed to allocate the * "next" lwb on-disk. When this occurs, the ZIL write * pipeline must be stalled; see the comment within the @@ -2956,12 +3084,16 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * lock, which occurs prior to calling dmu_tx_commit() */ mutex_exit(&zcw->zcw_lock); + zil_lwb_write_issue(zilog, lwb); + lwb = NULL; zil_commit_writer_stall(zilog); mutex_enter(&zcw->zcw_lock); } out: mutex_exit(&zilog->zl_issuer_lock); + if (lwb) + zil_lwb_write_issue(zilog, lwb); ASSERT(MUTEX_HELD(&zcw->zcw_lock)); } @@ -2976,7 +3108,7 @@ out: * waited "long enough" and the lwb is still in the "open" state. * * Given a sufficient amount of itxs being generated and written using - * the ZIL, the lwb's zio will be issued via the zil_lwb_commit() + * the ZIL, the lwb's zio will be issued via the zil_lwb_assign() * function. If this does not occur, this secondary responsibility will * ensure the lwb is issued even if there is not other synchronous * activity on the system. @@ -3656,7 +3788,7 @@ zil_close(zilog_t *zilog) /* * zl_lwb_max_issued_txg may be larger than lwb_max_txg. It depends * on the time when the dmu_tx transaction is assigned in - * zil_lwb_write_issue(). + * zil_lwb_write_close(). */ mutex_enter(&zilog->zl_lwb_io_lock); txg = MAX(zilog->zl_lwb_max_issued_txg, txg); |