diff options
Diffstat (limited to 'module/zfs/zil.c')
-rw-r--r-- | module/zfs/zil.c | 118 |
1 files changed, 64 insertions, 54 deletions
diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 12a034d5b..6a1f190f5 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -102,6 +102,13 @@ int zil_replay_disable = 0; */ int zfs_nocacheflush = 0; +/* + * Limit SLOG write size per commit executed with synchronous priority. + * Any writes above that will be executed with lower (asynchronous) priority + * to limit potential SLOG device abuse by single active ZIL writer. + */ +unsigned long zil_slog_bulk = 768 * 1024; + static kmem_cache_t *zil_lwb_cache; static void zil_async_to_sync(zilog_t *zilog, uint64_t foid); @@ -449,7 +456,8 @@ zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg) } static lwb_t * -zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg, boolean_t fastwrite) +zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg, + boolean_t fastwrite) { lwb_t *lwb; @@ -457,6 +465,7 @@ zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg, boolean_t fastwrite) lwb->lwb_zilog = zilog; lwb->lwb_blk = *bp; lwb->lwb_fastwrite = fastwrite; + lwb->lwb_slog = slog; lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp)); lwb->lwb_max_txg = txg; lwb->lwb_zio = NULL; @@ -542,6 +551,7 @@ zil_create(zilog_t *zilog) blkptr_t blk; int error = 0; boolean_t fastwrite = FALSE; + boolean_t slog = FALSE; /* * Wait for any previous destroy to complete. @@ -570,7 +580,7 @@ zil_create(zilog_t *zilog) } error = zio_alloc_zil(zilog->zl_spa, txg, &blk, - ZIL_MIN_BLKSZ, B_TRUE); + ZIL_MIN_BLKSZ, &slog); fastwrite = TRUE; if (error == 0) @@ -581,7 +591,7 @@ zil_create(zilog_t *zilog) * Allocate a log write buffer (lwb) for the first log block. */ if (error == 0) - lwb = zil_alloc_lwb(zilog, &blk, txg, fastwrite); + lwb = zil_alloc_lwb(zilog, &blk, slog, txg, fastwrite); /* * If we just allocated the first log block, commit our transaction @@ -915,6 +925,7 @@ static void zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb) { zbookmark_phys_t zb; + zio_priority_t prio; SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET], ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, @@ -934,9 +945,13 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb) metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk); lwb->lwb_fastwrite = 1; } + if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk) + prio = ZIO_PRIORITY_SYNC_WRITE; + else + prio = ZIO_PRIORITY_ASYNC_WRITE; lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa, 0, &lwb->lwb_blk, lwb_abd, BP_GET_LSIZE(&lwb->lwb_blk), - zil_lwb_write_done, lwb, ZIO_PRIORITY_SYNC_WRITE, + zil_lwb_write_done, lwb, prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_FASTWRITE, &zb); } @@ -958,15 +973,6 @@ uint64_t zil_block_buckets[] = { }; /* - * Use the slog as long as the current commit size is less than the - * limit or the total list size is less than 2X the limit. Limit - * checking is disabled by setting zil_slog_limit to UINT64_MAX. - */ -unsigned long zil_slog_limit = 1024 * 1024; -#define USE_SLOG(zilog) (((zilog)->zl_cur_used < zil_slog_limit) || \ - ((zilog)->zl_itx_list_sz < (zil_slog_limit << 1))) - -/* * Start a log block write and advance to the next log block. * Calls are serialized. */ @@ -981,7 +987,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb) uint64_t txg; uint64_t zil_blksz, wsz; int i, error; - boolean_t use_slog; + boolean_t slog; if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { zilc = (zil_chain_t *)lwb->lwb_buf; @@ -1037,10 +1043,8 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb) zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1); BP_ZERO(bp); - use_slog = USE_SLOG(zilog); - error = zio_alloc_zil(spa, txg, bp, zil_blksz, - USE_SLOG(zilog)); - if (use_slog) { + error = zio_alloc_zil(spa, txg, bp, zil_blksz, &slog); + if (slog) { ZIL_STAT_BUMP(zil_itx_metaslab_slog_count); ZIL_STAT_INCR(zil_itx_metaslab_slog_bytes, lwb->lwb_nused); } else { @@ -1055,7 +1059,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb) /* * Allocate a new log write buffer (lwb). */ - nlwb = zil_alloc_lwb(zilog, bp, txg, TRUE); + nlwb = zil_alloc_lwb(zilog, bp, slog, txg, TRUE); /* Record the block for later vdev flushing */ zil_add_block(zilog, &lwb->lwb_blk); @@ -1092,45 +1096,53 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb) static lwb_t * zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) { - lr_t *lrc = &itx->itx_lr; /* common log record */ - lr_write_t *lrw = (lr_write_t *)lrc; + lr_t *lrcb, *lrc; + lr_write_t *lrwb, *lrw; char *lr_buf; - uint64_t txg = lrc->lrc_txg; - uint64_t reclen = lrc->lrc_reclen; - uint64_t dlen = 0; + uint64_t dlen, dnow, lwb_sp, reclen, txg; if (lwb == NULL) return (NULL); ASSERT(lwb->lwb_buf != NULL); - if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) + lrc = &itx->itx_lr; /* Common log record inside itx. */ + lrw = (lr_write_t *)lrc; /* Write log record inside itx. */ + if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { dlen = P2ROUNDUP_TYPED( lrw->lr_length, sizeof (uint64_t), uint64_t); - + } else { + dlen = 0; + } + reclen = lrc->lrc_reclen; zilog->zl_cur_used += (reclen + dlen); + txg = lrc->lrc_txg; zil_lwb_write_init(zilog, lwb); +cont: /* * If this record won't fit in the current log block, start a new one. + * For WR_NEED_COPY optimize layout for minimal number of chunks. */ - if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) { + lwb_sp = lwb->lwb_sz - lwb->lwb_nused; + if (reclen > lwb_sp || (reclen + dlen > lwb_sp && + lwb_sp < ZIL_MAX_WASTE_SPACE && (dlen % ZIL_MAX_LOG_DATA == 0 || + lwb_sp < reclen + dlen % ZIL_MAX_LOG_DATA))) { lwb = zil_lwb_write_start(zilog, lwb); if (lwb == NULL) return (NULL); zil_lwb_write_init(zilog, lwb); ASSERT(LWB_EMPTY(lwb)); - if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) { - txg_wait_synced(zilog->zl_dmu_pool, txg); - return (lwb); - } + lwb_sp = lwb->lwb_sz - lwb->lwb_nused; + ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp); } + dnow = MIN(dlen, lwb_sp - reclen); lr_buf = lwb->lwb_buf + lwb->lwb_nused; bcopy(lrc, lr_buf, reclen); - lrc = (lr_t *)lr_buf; - lrw = (lr_write_t *)lrc; + lrcb = (lr_t *)lr_buf; /* Like lrc, but inside lwb. */ + lrwb = (lr_write_t *)lrcb; /* Like lrw, but inside lwb. */ ZIL_STAT_BUMP(zil_itx_count); @@ -1147,10 +1159,13 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) char *dbuf; int error; - if (dlen) { - ASSERT(itx->itx_wr_state == WR_NEED_COPY); + if (itx->itx_wr_state == WR_NEED_COPY) { dbuf = lr_buf + reclen; - lrw->lr_common.lrc_reclen += dlen; + lrcb->lrc_reclen += dnow; + if (lrwb->lr_length > dnow) + lrwb->lr_length = dnow; + lrw->lr_offset += dnow; + lrw->lr_length -= dnow; ZIL_STAT_BUMP(zil_itx_needcopy_count); ZIL_STAT_INCR(zil_itx_needcopy_bytes, lrw->lr_length); @@ -1162,7 +1177,7 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) lrw->lr_length); } error = zilog->zl_get_data( - itx->itx_private, lrw, dbuf, lwb->lwb_zio); + itx->itx_private, lrwb, dbuf, lwb->lwb_zio); if (error == EIO) { txg_wait_synced(zilog->zl_dmu_pool, txg); return (lwb); @@ -1181,12 +1196,18 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) * equal to the itx sequence number because not all transactions * are synchronous, and sometimes spa_sync() gets there first. */ - lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */ - lwb->lwb_nused += reclen + dlen; + lrcb->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */ + lwb->lwb_nused += reclen + dnow; lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg); ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz); ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t))); + dlen -= dnow; + if (dlen > 0) { + zilog->zl_cur_used += reclen; + goto cont; + } + return (lwb); } @@ -1200,7 +1221,6 @@ zil_itx_create(uint64_t txtype, size_t lrsize) itx = zio_data_buf_alloc(offsetof(itx_t, itx_lr) + lrsize); itx->itx_lr.lrc_txtype = txtype; itx->itx_lr.lrc_reclen = lrsize; - itx->itx_sod = lrsize; /* if write & WR_NEED_COPY will be increased */ itx->itx_lr.lrc_seq = 0; /* defensive */ itx->itx_sync = B_TRUE; /* default is synchronous */ itx->itx_callback = NULL; @@ -1351,11 +1371,8 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) */ zfs_dbgmsg("zil_itx_assign: missed itx cleanup for " "txg %llu", itxg->itxg_txg); - atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod); - itxg->itxg_sod = 0; clean = itxg->itxg_itxs; } - ASSERT(itxg->itxg_sod == 0); itxg->itxg_txg = txg; itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP); @@ -1368,8 +1385,6 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) } if (itx->itx_sync) { list_insert_tail(&itxs->i_sync_list, itx); - atomic_add_64(&zilog->zl_itx_list_sz, itx->itx_sod); - itxg->itxg_sod += itx->itx_sod; } else { avl_tree_t *t = &itxs->i_async_tree; uint64_t foid = @@ -1419,8 +1434,6 @@ zil_clean(zilog_t *zilog, uint64_t synced_txg) ASSERT3U(itxg->itxg_txg, <=, synced_txg); ASSERT(itxg->itxg_txg != 0); ASSERT(zilog->zl_clean_taskq != NULL); - atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod); - itxg->itxg_sod = 0; clean_me = itxg->itxg_itxs; itxg->itxg_itxs = NULL; itxg->itxg_txg = 0; @@ -1444,7 +1457,6 @@ zil_get_commit_list(zilog_t *zilog) { uint64_t otxg, txg; list_t *commit_list = &zilog->zl_itx_commit_list; - uint64_t push_sod = 0; if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ otxg = ZILTEST_TXG; @@ -1476,12 +1488,9 @@ zil_get_commit_list(zilog_t *zilog) ASSERT(zilog_is_dirty_in_txg(zilog, txg) || spa_freeze_txg(zilog->zl_spa) != UINT64_MAX); list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list); - push_sod += itxg->itxg_sod; - itxg->itxg_sod = 0; mutex_exit(&itxg->itxg_lock); } - atomic_add_64(&zilog->zl_itx_list_sz, -push_sod); } /* @@ -2304,13 +2313,14 @@ EXPORT_SYMBOL(zil_bp_tree_add); EXPORT_SYMBOL(zil_set_sync); EXPORT_SYMBOL(zil_set_logbias); +/* BEGIN CSTYLED */ module_param(zil_replay_disable, int, 0644); MODULE_PARM_DESC(zil_replay_disable, "Disable intent logging replay"); module_param(zfs_nocacheflush, int, 0644); MODULE_PARM_DESC(zfs_nocacheflush, "Disable cache flushes"); -/* CSTYLED */ -module_param(zil_slog_limit, ulong, 0644); -MODULE_PARM_DESC(zil_slog_limit, "Max commit bytes to separate log device"); +module_param(zil_slog_bulk, ulong, 0644); +MODULE_PARM_DESC(zil_slog_bulk, "Limit in bytes slog sync writes per commit"); +/* END CSTYLED */ #endif |