aboutsummaryrefslogtreecommitdiffstats
path: root/module/zfs/zil.c
diff options
context:
space:
mode:
Diffstat (limited to 'module/zfs/zil.c')
-rw-r--r--module/zfs/zil.c110
1 files changed, 96 insertions, 14 deletions
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index e8adc6d99..d0b1c1d14 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -29,6 +29,7 @@
#include <sys/zfs_context.h>
#include <sys/spa.h>
+#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/zap.h>
#include <sys/arc.h>
@@ -430,6 +431,35 @@ done:
return (error);
}
+/* ARGSUSED */
+static int
+zil_clear_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
+{
+ ASSERT(!BP_IS_HOLE(bp));
+
+ /*
+ * As we call this function from the context of a rewind to a
+ * checkpoint, each ZIL block whose txg is later than the txg
+ * that we rewind to is invalid. Thus, we return -1 so
+ * zil_parse() doesn't attempt to read it.
+ */
+ if (bp->blk_birth >= first_txg)
+ return (-1);
+
+ if (zil_bp_tree_add(zilog, bp) != 0)
+ return (0);
+
+ zio_free(zilog->zl_spa, first_txg, bp);
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+zil_noop_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
+{
+ return (0);
+}
+
static int
zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
{
@@ -476,7 +506,7 @@ zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
static int
zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
{
- zio_free_zil(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
+ zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
return (0);
}
@@ -662,7 +692,7 @@ zil_create(zilog_t *zilog)
txg = dmu_tx_get_txg(tx);
if (!BP_IS_HOLE(&blk)) {
- zio_free_zil(zilog->zl_spa, txg, &blk);
+ zio_free(zilog->zl_spa, txg, &blk);
BP_ZERO(&blk);
}
@@ -767,8 +797,8 @@ int
zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
{
dmu_tx_t *tx = txarg;
- uint64_t first_txg = dmu_tx_get_txg(tx);
zilog_t *zilog;
+ uint64_t first_txg;
zil_header_t *zh;
objset_t *os;
int error;
@@ -790,10 +820,43 @@ zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
zilog = dmu_objset_zil(os);
zh = zil_header_in_syncing_context(zilog);
+ ASSERT3U(tx->tx_txg, ==, spa_first_txg(zilog->zl_spa));
+ first_txg = spa_min_claim_txg(zilog->zl_spa);
- if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR) {
- if (!BP_IS_HOLE(&zh->zh_log))
- zio_free_zil(zilog->zl_spa, first_txg, &zh->zh_log);
+ /*
+ * If the spa_log_state is not set to be cleared, check whether
+ * the current uberblock is a checkpoint one and if the current
+ * header has been claimed before moving on.
+ *
+ * If the current uberblock is a checkpointed uberblock then
+ * one of the following scenarios took place:
+ *
+ * 1] We are currently rewinding to the checkpoint of the pool.
+ * 2] We crashed in the middle of a checkpoint rewind but we
+ * did manage to write the checkpointed uberblock to the
+ * vdev labels, so when we tried to import the pool again
+ * the checkpointed uberblock was selected from the import
+ * procedure.
+ *
+ * In both cases we want to zero out all the ZIL blocks, except
+ * the ones that have been claimed at the time of the checkpoint
+ * (their zh_claim_txg != 0). The reason is that these blocks
+ * may be corrupted since we may have reused their locations on
+ * disk after we took the checkpoint.
+ *
+ * We could try to set spa_log_state to SPA_LOG_CLEAR earlier
+ * when we first figure out whether the current uberblock is
+ * checkpointed or not. Unfortunately, that would discard all
+ * the logs, including the ones that are claimed, and we would
+ * leak space.
+ */
+ if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR ||
+ (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 &&
+ zh->zh_claim_txg == 0)) {
+ if (!BP_IS_HOLE(&zh->zh_log)) {
+ (void) zil_parse(zilog, zil_clear_log_block,
+ zil_noop_log_record, tx, first_txg, B_FALSE);
+ }
BP_ZERO(&zh->zh_log);
if (os->os_encrypted)
os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE;
@@ -803,6 +866,12 @@ zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
}
/*
+ * If we are not rewinding and opening the pool normally, then
+ * the min_claim_txg should be equal to the first txg of the pool.
+ */
+ ASSERT3U(first_txg, ==, spa_first_txg(zilog->zl_spa));
+
+ /*
* Claim all log blocks if we haven't already done so, and remember
* the highest claimed sequence number. This ensures that if we can
* read only part of the log now (e.g. due to a missing device),
@@ -855,16 +924,17 @@ zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
zilog = dmu_objset_zil(os);
bp = (blkptr_t *)&zilog->zl_header->zh_log;
- /*
- * Check the first block and determine if it's on a log device
- * which may have been removed or faulted prior to loading this
- * pool. If so, there's no point in checking the rest of the log
- * as its content should have already been synced to the pool.
- */
if (!BP_IS_HOLE(bp)) {
vdev_t *vd;
boolean_t valid = B_TRUE;
+ /*
+ * Check the first block and determine if it's on a log device
+ * which may have been removed or faulted prior to loading this
+ * pool. If so, there's no point in checking the rest of the
+ * log as its content should have already been synced to the
+ * pool.
+ */
spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER);
vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0]));
if (vd->vdev_islog && vdev_is_dead(vd))
@@ -873,6 +943,18 @@ zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
if (!valid)
return (0);
+
+ /*
+ * Check whether the current uberblock is checkpointed (e.g.
+ * we are rewinding) and whether the current header has been
+ * claimed or not. If it hasn't then skip verifying it. We
+ * do this because its ZIL blocks may be part of the pool's
+ * state before the rewind, which is no longer valid.
+ */
+ zil_header_t *zh = zil_header_in_syncing_context(zilog);
+ if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 &&
+ zh->zh_claim_txg == 0)
+ return (0);
}
/*
@@ -883,8 +965,8 @@ zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
* which will update spa_max_claim_txg. See spa_load() for details.
*/
error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
- zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa),
- B_FALSE);
+ zilog->zl_header->zh_claim_txg ? -1ULL :
+ spa_min_claim_txg(os->os_spa), B_FALSE);
return ((error == ECKSUM || error == ENOENT) ? 0 : error);
}