diff options
Diffstat (limited to 'zfs/lib/libzpool/zil.c')
-rw-r--r-- | zfs/lib/libzpool/zil.c | 173 |
1 files changed, 145 insertions, 28 deletions
diff --git a/zfs/lib/libzpool/zil.c b/zfs/lib/libzpool/zil.c index 4f9325dbb..95101882b 100644 --- a/zfs/lib/libzpool/zil.c +++ b/zfs/lib/libzpool/zil.c @@ -23,8 +23,6 @@ * Use is subject to license terms. */ -#pragma ident "@(#)zil.c 1.34 08/02/22 SMI" - #include <sys/zfs_context.h> #include <sys/spa.h> #include <sys/dmu.h> @@ -167,7 +165,11 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, arc_buf_t **abufpp) *abufpp = NULL; - error = arc_read(NULL, zilog->zl_spa, &blk, byteswap_uint64_array, + /* + * We shouldn't be doing any scrubbing while we're doing log + * replay, it's OK to not lock. + */ + error = arc_read_nolock(NULL, zilog->zl_spa, &blk, arc_getbuf_func, abufpp, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB, &aflags, &zb); @@ -178,17 +180,20 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, arc_buf_t **abufpp) zio_cksum_t cksum = bp->blk_cksum; /* + * Validate the checksummed log block. + * * Sequence numbers should be... sequential. The checksum * verifier for the next block should be bp's checksum plus 1. + * + * Also check the log chain linkage and size used. */ cksum.zc_word[ZIL_ZC_SEQ]++; - if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum, sizeof (cksum))) - error = ESTALE; - else if (BP_IS_HOLE(&ztp->zit_next_blk)) - error = ENOENT; - else if (ztp->zit_nused > (blksz - sizeof (zil_trailer_t))) - error = EOVERFLOW; + if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum, + sizeof (cksum)) || BP_IS_HOLE(&ztp->zit_next_blk) || + (ztp->zit_nused > (blksz - sizeof (zil_trailer_t)))) { + error = ECKSUM; + } if (error) { VERIFY(arc_buf_remove_ref(*abufpp, abufpp) == 1); @@ -283,7 +288,8 @@ zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg) */ if (bp->blk_birth >= first_txg && zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp)) == 0) { - err = zio_wait(zio_claim(NULL, spa, first_txg, bp, NULL, NULL)); + err = zio_wait(zio_claim(NULL, spa, first_txg, bp, NULL, NULL, + ZIO_FLAG_MUSTSUCCEED)); ASSERT(err == 0); } } @@ -499,9 +505,9 @@ zil_claim(char *osname, void *txarg) objset_t *os; int error; - error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_STANDARD, &os); + error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os); if (error) { - cmn_err(CE_WARN, "can't process intent log for %s", osname); + cmn_err(CE_WARN, "can't open objset for %s", osname); return (0); } @@ -528,6 +534,83 @@ zil_claim(char *osname, void *txarg) return (0); } +/* + * Check the log by walking the log chain. + * Checksum errors are ok as they indicate the end of the chain. + * Any other error (no device or read failure) returns an error. + */ +/* ARGSUSED */ +int +zil_check_log_chain(char *osname, void *txarg) +{ + zilog_t *zilog; + zil_header_t *zh; + blkptr_t blk; + arc_buf_t *abuf; + objset_t *os; + char *lrbuf; + zil_trailer_t *ztp; + int error; + + error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os); + if (error) { + cmn_err(CE_WARN, "can't open objset for %s", osname); + return (0); + } + + zilog = dmu_objset_zil(os); + zh = zil_header_in_syncing_context(zilog); + blk = zh->zh_log; + if (BP_IS_HOLE(&blk)) { + dmu_objset_close(os); + return (0); /* no chain */ + } + + for (;;) { + error = zil_read_log_block(zilog, &blk, &abuf); + if (error) + break; + lrbuf = abuf->b_data; + ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1; + blk = ztp->zit_next_blk; + VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); + } + dmu_objset_close(os); + if (error == ECKSUM) + return (0); /* normal end of chain */ + return (error); +} + +/* + * Clear a log chain + */ +/* ARGSUSED */ +int +zil_clear_log_chain(char *osname, void *txarg) +{ + zilog_t *zilog; + zil_header_t *zh; + objset_t *os; + dmu_tx_t *tx; + int error; + + error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os); + if (error) { + cmn_err(CE_WARN, "can't open objset for %s", osname); + return (0); + } + + zilog = dmu_objset_zil(os); + tx = dmu_tx_create(zilog->zl_os); + (void) dmu_tx_assign(tx, TXG_WAIT); + zh = zil_header_in_syncing_context(zilog); + BP_ZERO(&zh->zh_log); + dsl_dataset_dirty(dmu_objset_ds(os), tx); + dmu_tx_commit(tx); + dmu_objset_close(os); + return (0); +} + static int zil_vdev_compare(const void *x1, const void *x2) { @@ -591,10 +674,9 @@ zil_flush_vdevs(zilog_t *zilog) if (avl_numnodes(t) == 0) return; - spa_config_enter(spa, RW_READER, FTAG); + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); - zio = zio_root(spa, NULL, NULL, - ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL); + zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) { vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev); @@ -609,7 +691,7 @@ zil_flush_vdevs(zilog_t *zilog) */ (void) zio_wait(zio); - spa_config_exit(spa, FTAG); + spa_config_exit(spa, SCL_STATE, FTAG); } /* @@ -621,6 +703,15 @@ zil_lwb_write_done(zio_t *zio) lwb_t *lwb = zio->io_private; zilog_t *zilog = lwb->lwb_zilog; + ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); + ASSERT(BP_GET_CHECKSUM(zio->io_bp) == ZIO_CHECKSUM_ZILOG); + ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG); + ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); + ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER); + ASSERT(!BP_IS_GANG(zio->io_bp)); + ASSERT(!BP_IS_HOLE(zio->io_bp)); + ASSERT(zio->io_bp->blk_fill == 0); + /* * Now that we've written this log block, we have a stable pointer * to the next block in the chain, so it's OK to let the txg in @@ -638,9 +729,6 @@ zil_lwb_write_done(zio_t *zio) /* * Initialize the io for a log block. - * - * Note, we should not initialize the IO until we are about - * to use it, since zio_rewrite() does a spa_config_enter(). */ static void zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb) @@ -658,7 +746,7 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb) } if (lwb->lwb_zio == NULL) { lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa, - ZIO_CHECKSUM_ZILOG, 0, &lwb->lwb_blk, lwb->lwb_buf, + 0, &lwb->lwb_blk, lwb->lwb_buf, lwb->lwb_sz, zil_lwb_write_done, lwb, ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_CANFAIL, &zb); } @@ -951,7 +1039,7 @@ zil_clean(zilog_t *zilog) mutex_exit(&zilog->zl_lock); } -void +static void zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid) { uint64_t txg; @@ -961,7 +1049,7 @@ zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid) spa_t *spa; zilog->zl_writer = B_TRUE; - zilog->zl_root_zio = NULL; + ASSERT(zilog->zl_root_zio == NULL); spa = zilog->zl_spa; if (zilog->zl_suspend) { @@ -1066,6 +1154,7 @@ zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid) if (zilog->zl_root_zio) { DTRACE_PROBE1(zil__cw3, zilog_t *, zilog); (void) zio_wait(zilog->zl_root_zio); + zilog->zl_root_zio = NULL; DTRACE_PROBE1(zil__cw4, zilog_t *, zilog); zil_flush_vdevs(zilog); } @@ -1251,20 +1340,20 @@ zil_free(zilog_t *zilog) /* * return true if the initial log block is not valid */ -static int +static boolean_t zil_empty(zilog_t *zilog) { const zil_header_t *zh = zilog->zl_header; arc_buf_t *abuf = NULL; if (BP_IS_HOLE(&zh->zh_log)) - return (1); + return (B_TRUE); if (zil_read_log_block(zilog, &zh->zh_log, &abuf) != 0) - return (1); + return (B_TRUE); VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); - return (0); + return (B_FALSE); } /* @@ -1333,7 +1422,6 @@ zil_suspend(zilog_t *zilog) */ while (zilog->zl_suspending) cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock); - ASSERT(BP_IS_HOLE(&zh->zh_log)); mutex_exit(&zilog->zl_lock); return (0); } @@ -1372,6 +1460,7 @@ zil_resume(zilog_t *zilog) typedef struct zil_replay_arg { objset_t *zr_os; zil_replay_func_t **zr_replay; + zil_replay_cleaner_t *zr_replay_cleaner; void *zr_arg; uint64_t *zr_txgp; boolean_t zr_byteswap; @@ -1450,6 +1539,29 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) } /* + * Replay of large truncates can end up needing additional txs + * and a different txg. If they are nested within the replay tx + * as below then a hang is possible. So we do the truncate here + * and redo the truncate later (a no-op) and update the sequence + * number whilst in the replay tx. Fortunately, it's safe to repeat + * a truncate if we crash and the truncate commits. A create over + * an existing file will also come in as a TX_TRUNCATE record. + * + * Note, remove of large files and renames over large files is + * handled by putting the deleted object on a stable list + * and if necessary force deleting the object outside of the replay + * transaction using the zr_replay_cleaner. + */ + if (txtype == TX_TRUNCATE) { + *zr->zr_txgp = TXG_NOWAIT; + error = zr->zr_replay[TX_TRUNCATE](zr->zr_arg, zr->zr_lrbuf, + zr->zr_byteswap); + if (error) + goto bad; + zr->zr_byteswap = 0; /* only byteswap once */ + } + + /* * We must now do two things atomically: replay this log record, * and update the log header to reflect the fact that we did so. * We use the DMU's ability to assign into a specific txg to do this. @@ -1502,6 +1614,8 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) * transaction. */ if (error != ERESTART && !sunk) { + if (zr->zr_replay_cleaner) + zr->zr_replay_cleaner(zr->zr_arg); txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0); sunk = B_TRUE; continue; /* retry */ @@ -1517,6 +1631,7 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) dprintf("pass %d, retrying\n", pass); } +bad: ASSERT(error && error != ERESTART); name = kmem_alloc(MAXNAMELEN, KM_SLEEP); dmu_objset_name(zr->zr_os, name); @@ -1540,7 +1655,8 @@ zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) */ void zil_replay(objset_t *os, void *arg, uint64_t *txgp, - zil_replay_func_t *replay_func[TX_MAX_TYPE]) + zil_replay_func_t *replay_func[TX_MAX_TYPE], + zil_replay_cleaner_t *replay_cleaner) { zilog_t *zilog = dmu_objset_zil(os); const zil_header_t *zh = zilog->zl_header; @@ -1553,6 +1669,7 @@ zil_replay(objset_t *os, void *arg, uint64_t *txgp, zr.zr_os = os; zr.zr_replay = replay_func; + zr.zr_replay_cleaner = replay_cleaner; zr.zr_arg = arg; zr.zr_txgp = txgp; zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log); |