aboutsummaryrefslogtreecommitdiffstats
path: root/module
diff options
context:
space:
mode:
authorMatthew Ahrens <[email protected]>2019-06-10 11:48:42 -0700
committerBrian Behlendorf <[email protected]>2019-06-10 11:48:42 -0700
commitb8738257c2607c73c731ce8e0fd73282b266d6ef (patch)
treed73e90809b9f413b8894d8ee7fc6ad8a11bff7fe /module
parent5a902f5aaa1fbf6f7e459ec29f6d1d988ec78b0a (diff)
make zil max block size tunable
We've observed that on some highly fragmented pools, most metaslab allocations are small (~2-8KB), but there are some large, 128K allocations. The large allocations are for ZIL blocks. If there is a lot of fragmentation, the large allocations can be hard to satisfy. The most common impact of this is that we need to check (and thus load) lots of metaslabs from the ZIL allocation code path, causing sync writes to wait for metaslabs to load, which can take a second or more. In the worst case, we may not be able to satisfy the allocation, in which case the ZIL will resort to txg_wait_synced() to ensure the change is on disk. To provide a workaround for this, this change adds a tunable that can reduce the size of ZIL blocks. External-issue: DLPX-61719 Reviewed-by: George Wilson <[email protected]> Reviewed-by: Paul Dagnelie <[email protected]> Reviewed-by: Brian Behlendorf <[email protected]> Signed-off-by: Matthew Ahrens <[email protected]> Closes #8865
Diffstat (limited to 'module')
-rw-r--r--module/zfs/zfs_log.c11
-rw-r--r--module/zfs/zil.c67
-rw-r--r--module/zfs/zvol.c2
3 files changed, 71 insertions, 9 deletions
diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c
index 15c396ce0..ad5b5cf30 100644
--- a/module/zfs/zfs_log.c
+++ b/module/zfs/zfs_log.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2015, 2018 by Delphix. All rights reserved.
*/
@@ -528,7 +528,14 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
itx_wr_state_t wr_state = write_state;
ssize_t len = resid;
- if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA)
+ /*
+ * A WR_COPIED record must fit entirely in one log block.
+ * Large writes can use WR_NEED_COPY, which the ZIL will
+ * split into multiple records across several log blocks
+ * if necessary.
+ */
+ if (wr_state == WR_COPIED &&
+ resid > zil_max_copied_data(zilog))
wr_state = WR_NEED_COPY;
else if (wr_state == WR_INDIRECT)
len = MIN(blocksize - P2PHASE(off, blocksize), resid);
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index ff14a98b6..71079c4a3 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -1424,6 +1424,13 @@ uint64_t zil_block_buckets[] = {
};
/*
+ * Maximum block size used by the ZIL. This is picked up when the ZIL is
+ * initialized. Otherwise this should not be used directly; see
+ * zl_max_block_size instead.
+ */
+int zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE;
+
+/*
* Start a log block write and advance to the next log block.
* Calls are serialized.
*/
@@ -1499,9 +1506,7 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t);
for (i = 0; zil_blksz > zil_block_buckets[i]; i++)
continue;
- zil_blksz = zil_block_buckets[i];
- if (zil_blksz == UINT64_MAX)
- zil_blksz = SPA_OLD_MAXBLOCKSIZE;
+ zil_blksz = MIN(zil_block_buckets[i], zilog->zl_max_block_size);
zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
for (i = 0; i < ZIL_PREV_BLKS; i++)
zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
@@ -1562,13 +1567,47 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
return (nlwb);
}
+/*
+ * Maximum amount of write data that can be put into single log block.
+ */
+uint64_t
+zil_max_log_data(zilog_t *zilog)
+{
+ return (zilog->zl_max_block_size -
+ sizeof (zil_chain_t) - sizeof (lr_write_t));
+}
+
+/*
+ * Maximum amount of log space we agree to waste to reduce number of
+ * WR_NEED_COPY chunks to reduce zl_get_data() overhead (~12%).
+ */
+static inline uint64_t
+zil_max_waste_space(zilog_t *zilog)
+{
+ return (zil_max_log_data(zilog) / 8);
+}
+
+/*
+ * Maximum amount of write data for WR_COPIED. For correctness, consumers
+ * must fall back to WR_NEED_COPY if we can't fit the entire record into one
+ * maximum sized log block, because each WR_COPIED record must fit in a
+ * single log block. For space efficiency, we want to fit two records into a
+ * max-sized log block.
+ */
+uint64_t
+zil_max_copied_data(zilog_t *zilog)
+{
+ return ((zilog->zl_max_block_size - sizeof (zil_chain_t)) / 2 -
+ sizeof (lr_write_t));
+}
+
static lwb_t *
zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
{
lr_t *lrcb, *lrc;
lr_write_t *lrwb, *lrw;
char *lr_buf;
- uint64_t dlen, dnow, lwb_sp, reclen, txg;
+ uint64_t dlen, dnow, lwb_sp, reclen, txg, max_log_data;
ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
ASSERT3P(lwb, !=, NULL);
@@ -1617,15 +1656,27 @@ cont:
* For WR_NEED_COPY optimize layout for minimal number of chunks.
*/
lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
+ max_log_data = zil_max_log_data(zilog);
if (reclen > lwb_sp || (reclen + dlen > lwb_sp &&
- lwb_sp < ZIL_MAX_WASTE_SPACE && (dlen % ZIL_MAX_LOG_DATA == 0 ||
- lwb_sp < reclen + dlen % ZIL_MAX_LOG_DATA))) {
+ lwb_sp < zil_max_waste_space(zilog) &&
+ (dlen % max_log_data == 0 ||
+ lwb_sp < reclen + dlen % max_log_data))) {
lwb = zil_lwb_write_issue(zilog, lwb);
if (lwb == NULL)
return (NULL);
zil_lwb_write_open(zilog, lwb);
ASSERT(LWB_EMPTY(lwb));
lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
+
+ /*
+ * There must be enough space in the new, empty log block to
+ * hold reclen. For WR_COPIED, we need to fit the whole
+ * record in one block, and reclen is the header size + the
+ * data size. For WR_NEED_COPY, we can create multiple
+ * records, splitting the data into multiple blocks, so we
+ * only need to fit one word of data per block; in this case
+ * reclen is just the header size (no data).
+ */
ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp);
}
@@ -3124,6 +3175,7 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys)
zilog->zl_dirty_max_txg = 0;
zilog->zl_last_lwb_opened = NULL;
zilog->zl_last_lwb_latency = 0;
+ zilog->zl_max_block_size = zil_maxblocksize;
mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&zilog->zl_issuer_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -3637,5 +3689,8 @@ MODULE_PARM_DESC(zil_nocacheflush, "Disable ZIL cache flushes");
module_param(zil_slog_bulk, ulong, 0644);
MODULE_PARM_DESC(zil_slog_bulk, "Limit in bytes slog sync writes per commit");
+
+module_param(zil_maxblocksize, int, 0644);
+MODULE_PARM_DESC(zil_maxblocksize, "Limit in bytes of ZIL log block size");
/* END CSTYLED */
#endif
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index c29f65f67..9aeadb22b 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -684,7 +684,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
itx_wr_state_t wr_state = write_state;
ssize_t len = size;
- if (wr_state == WR_COPIED && size > ZIL_MAX_COPIED_DATA)
+ if (wr_state == WR_COPIED && size > zil_max_copied_data(zilog))
wr_state = WR_NEED_COPY;
else if (wr_state == WR_INDIRECT)
len = MIN(blocksize - P2PHASE(offset, blocksize), size);