author    | Will Andrews <[email protected]>        | 2013-06-11 09:12:34 -0800
committer | Brian Behlendorf <[email protected]> | 2013-11-04 10:55:25 -0800
commit    | e49f1e20a09181d03382d64afdc4b7a12a5dfdf1 (patch)
tree      | 301457b83154f25be39a6135f797ca5cb83daf1e /module
parent    | b1118acbb16ec347f6a3eb091d9b7097d12b8d54 (diff)
Illumos #3741
3741 zfs needs better comments
Reviewed by: Matthew Ahrens <[email protected]>
Reviewed by: Eric Schrock <[email protected]>
Approved by: Christopher Siden <[email protected]>
References:
https://www.illumos.org/issues/3741
illumos/illumos-gate@3e30c24aeefdee1631958ecf17f18da671781956
Ported-by: Richard Yao <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Issue #1775
Diffstat (limited to 'module')
-rw-r--r--  module/zfs/arc.c        | 19
-rw-r--r--  module/zfs/dbuf.c       | 24
-rw-r--r--  module/zfs/dmu.c        |  2
-rw-r--r--  module/zfs/dmu_tx.c     |  8
-rw-r--r--  module/zfs/dmu_zfetch.c | 12
-rw-r--r--  module/zfs/spa.c        |  2
-rw-r--r--  module/zfs/txg.c        | 14
-rw-r--r--  module/zfs/vdev_label.c |  1
-rw-r--r--  module/zfs/vdev_raidz.c | 65
-rw-r--r--  module/zfs/zfs_ctldir.c |  5
10 files changed, 141 insertions, 11 deletions
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index a521501bd..2ae4c37a3 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -260,7 +260,18 @@ typedef struct arc_stats {
 	kstat_named_t arcstat_mfu_ghost_hits;
 	kstat_named_t arcstat_deleted;
 	kstat_named_t arcstat_recycle_miss;
+	/*
+	 * Number of buffers that could not be evicted because the hash lock
+	 * was held by another thread.  The lock may not necessarily be held
+	 * by something using the same buffer, since hash locks are shared
+	 * by multiple buffers.
+	 */
 	kstat_named_t arcstat_mutex_miss;
+	/*
+	 * Number of buffers skipped because they have I/O in progress, are
+	 * indirect prefetch buffers that have not lived long enough, or are
+	 * not from the spa we're trying to evict from.
+	 */
 	kstat_named_t arcstat_evict_skip;
 	kstat_named_t arcstat_evict_l2_cached;
 	kstat_named_t arcstat_evict_l2_eligible;
@@ -3174,6 +3185,10 @@ top:
 		mutex_exit(hash_lock);
 
+		/*
+		 * At this point, we have a level 1 cache miss.  Try again in
+		 * L2ARC if possible.
+		 */
 		ASSERT3U(hdr->b_size, ==, size);
 		DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
 		    uint64_t, size, zbookmark_t *, zb);
@@ -3445,8 +3460,8 @@ arc_buf_evict(arc_buf_t *buf)
 }
 
 /*
- * Release this buffer from the cache.  This must be done
- * after a read and prior to modifying the buffer contents.
+ * Release this buffer from the cache, making it an anonymous buffer.  This
+ * must be done after a read and prior to modifying the buffer contents.
  * If the buffer has more than one reference, we must make
  * a new hdr for the buffer.
  */
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 9c60ec55a..95c7b3297 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -691,6 +691,14 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 		if (!havepzio)
 			err = zio_wait(zio);
 	} else {
+		/*
+		 * Another reader came in while the dbuf was in flight
+		 * between UNCACHED and CACHED.  Either a writer will finish
+		 * writing the buffer (sending the dbuf to CACHED) or the
+		 * first reader's request will reach the read_done callback
+		 * and send the dbuf to CACHED.  Otherwise, a failure
+		 * occurred and the dbuf went to UNCACHED.
+		 */
 		mutex_exit(&db->db_mtx);
 		if (prefetch)
 			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
@@ -699,6 +707,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 			rw_exit(&dn->dn_struct_rwlock);
 		DB_DNODE_EXIT(db);
 
+		/* Skip the wait per the caller's request. */
 		mutex_enter(&db->db_mtx);
 		if ((flags & DB_RF_NEVERWAIT) == 0) {
 			while (db->db_state == DB_READ ||
@@ -1313,7 +1322,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 }
 
 /*
- * Return TRUE if this evicted the dbuf.
+ * Undirty a buffer in the transaction group referenced by the given
+ * transaction.  Return whether this evicted the dbuf.
  */
 static boolean_t
 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
@@ -2324,6 +2334,7 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 	ASSERT(db->db_level > 0);
 	DBUF_VERIFY(db);
 
+	/* Read the block if it hasn't been read yet. */
 	if (db->db_buf == NULL) {
 		mutex_exit(&db->db_mtx);
 		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
@@ -2334,10 +2345,12 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
+	/* Indirect block size must match what the dnode thinks it is. */
 	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
 	dbuf_check_blkptr(dn, db);
 	DB_DNODE_EXIT(db);
 
+	/* Provide the pending dirty record to child dbufs */
 	db->db_data_pending = dr;
 
 	mutex_exit(&db->db_mtx);
@@ -2728,6 +2741,7 @@ dbuf_write_override_done(zio_t *zio)
 	dbuf_write_done(zio, NULL, db);
 }
 
+/* Issue I/O to commit a dirty buffer to disk. */
 static void
 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 {
@@ -2762,11 +2776,19 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 	}
 
 	if (parent != dn->dn_dbuf) {
+		/* Our parent is an indirect block. */
+		/* We have a dirty parent that has been scheduled for write. */
 		ASSERT(parent && parent->db_data_pending);
+		/* Our parent's buffer is one level closer to the dnode. */
 		ASSERT(db->db_level == parent->db_level-1);
+		/*
+		 * We're about to modify our parent's db_data by modifying
+		 * our block pointer, so the parent must be released.
+		 */
 		ASSERT(arc_released(parent->db_buf));
 		zio = parent->db_data_pending->dr_zio;
 	} else {
+		/* Our parent is the dnode itself. */
 		ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
 		    db->db_blkid != DMU_SPILL_BLKID) ||
 		    (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index 9223b907b..34f3eeef9 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -1965,7 +1965,7 @@ dmu_init(void)
 void
 dmu_fini(void)
 {
-	arc_fini();
+	arc_fini(); /* arc depends on l2arc, so arc must go first */
 	l2arc_fini();
 	dmu_tx_fini();
 	zfetch_fini();
diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c
index 1cad8d20e..caac60193 100644
--- a/module/zfs/dmu_tx.c
+++ b/module/zfs/dmu_tx.c
@@ -1040,6 +1040,10 @@ dmu_tx_unassign(dmu_tx_t *tx)
 
 	txg_rele_to_quiesce(&tx->tx_txgh);
 
+	/*
+	 * Walk the transaction's hold list, removing the hold on the
+	 * associated dnode, and notifying waiters if the refcount drops to 0.
+	 */
 	for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh;
 	    txh = list_next(&tx->tx_holds, txh)) {
 		dnode_t *dn = txh->txh_dnode;
@@ -1157,6 +1161,10 @@ dmu_tx_commit(dmu_tx_t *tx)
 
 	ASSERT(tx->tx_txg != 0);
 
+	/*
+	 * Go through the transaction's hold list and remove holds on
+	 * associated dnodes, notifying waiters if no holds remain.
+	 */
 	while ((txh = list_head(&tx->tx_holds))) {
 		dnode_t *dn = txh->txh_dnode;
 
diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c
index 1763bae51..705478c82 100644
--- a/module/zfs/dmu_zfetch.c
+++ b/module/zfs/dmu_zfetch.c
@@ -48,11 +48,11 @@ unsigned int zfetch_block_cap = 256;
 unsigned long zfetch_array_rd_sz = 1024 * 1024;
 
 /* forward decls for static routines */
-static int dmu_zfetch_colinear(zfetch_t *, zstream_t *);
+static boolean_t dmu_zfetch_colinear(zfetch_t *, zstream_t *);
 static void dmu_zfetch_dofetch(zfetch_t *, zstream_t *);
 static uint64_t dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t);
 static uint64_t dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t);
-static int dmu_zfetch_find(zfetch_t *, zstream_t *, int);
+static boolean_t dmu_zfetch_find(zfetch_t *, zstream_t *, int);
 static int dmu_zfetch_stream_insert(zfetch_t *, zstream_t *);
 static zstream_t *dmu_zfetch_stream_reclaim(zfetch_t *);
 static void dmu_zfetch_stream_remove(zfetch_t *, zstream_t *);
@@ -104,9 +104,9 @@ kstat_t *zfetch_ksp;
  * last stream, then we are probably in a strided access pattern.  So
  * combine the two sequential streams into a single strided stream.
  *
- * If no co-linear streams are found, return NULL.
+ * Returns whether co-linear streams were found.
  */
-static int
+static boolean_t
 dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh)
 {
 	zstream_t *z_walk;
@@ -326,7 +326,7 @@ dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks)
  * for this block read.  If so, it starts a prefetch for the stream it
  * located and returns true, otherwise it returns false
  */
-static int
+static boolean_t
 dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched)
 {
 	zstream_t *zs;
@@ -639,7 +639,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
 {
 	zstream_t zst;
 	zstream_t *newstream;
-	int fetched;
+	boolean_t fetched;
 	int inserted;
 	unsigned int blkshft;
 	uint64_t blksz;
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index bc9bf2cc3..c30107771 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -26,6 +26,8 @@
  */
 
 /*
+ * SPA: Storage Pool Allocator
+ *
  * This file contains all the routines used when modifying on-disk SPA state.
  * This includes opening, importing, destroying, exporting a pool, and syncing a
  * pool.
diff --git a/module/zfs/txg.c b/module/zfs/txg.c
index c8a29e14f..697aa0905 100644
--- a/module/zfs/txg.c
+++ b/module/zfs/txg.c
@@ -354,6 +354,12 @@ txg_rele_to_sync(txg_handle_t *th)
 	th->th_cpu = NULL;	/* defensive */
 }
 
+/*
+ * Blocks until all transactions in the group are committed.
+ *
+ * On return, the transaction group has reached a stable state in which it can
+ * then be passed off to the syncing context.
+ */
 static void
 txg_quiesce(dsl_pool_t *dp, uint64_t txg)
 {
@@ -409,6 +415,9 @@ txg_do_callbacks(list_t *cb_list)
 
 /*
  * Dispatch the commit callbacks registered on this txg to worker threads.
+ *
+ * If no callbacks are registered for a given TXG, nothing happens.
+ * This function creates a taskq for the associated pool, if needed.
  */
 static void
 txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg)
@@ -419,7 +428,10 @@ txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg)
 
 	for (c = 0; c < max_ncpus; c++) {
 		tx_cpu_t *tc = &tx->tx_cpu[c];
-		/* No need to lock tx_cpu_t at this point */
+		/*
+		 * No need to lock tx_cpu_t at this point, since this can
+		 * only be called once a txg has been synced.
+		 */
 
 		int g = txg & TXG_MASK;
 
diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c
index 07e66ebdc..0405608c2 100644
--- a/module/zfs/vdev_label.c
+++ b/module/zfs/vdev_label.c
@@ -1035,6 +1035,7 @@ vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags)
 	zio_buf_free(ubbuf, VDEV_UBERBLOCK_SIZE(vd));
 }
 
+/* Sync the uberblocks to all vdevs in svd[] */
 int
 vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags)
 {
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index 130ec575e..d2dfd5b43 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -431,23 +431,50 @@ static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
 	vdev_raidz_cksum_report
 };
 
+/*
+ * Divides the IO evenly across all child vdevs; usually, dcols is
+ * the number of children in the target vdev.
+ */
 static raidz_map_t *
 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
     uint64_t nparity)
 {
 	raidz_map_t *rm;
+	/* The starting RAIDZ (parent) vdev sector of the block. */
 	uint64_t b = zio->io_offset >> unit_shift;
+	/* The zio's size in units of the vdev's minimum sector size. */
 	uint64_t s = zio->io_size >> unit_shift;
+	/* The first column for this stripe. */
 	uint64_t f = b % dcols;
+	/* The starting byte offset on each child vdev. */
 	uint64_t o = (b / dcols) << unit_shift;
 	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
 
+	/*
+	 * "Quotient": The number of data sectors for this stripe on all but
+	 * the "big column" child vdevs that also contain "remainder" data.
+	 */
 	q = s / (dcols - nparity);
+
+	/*
+	 * "Remainder": The number of partial stripe data sectors in this I/O.
+	 * This will add a sector to some, but not all, child vdevs.
+	 */
 	r = s - q * (dcols - nparity);
+
+	/* The number of "big columns" - those which contain remainder data. */
 	bc = (r == 0 ? 0 : r + nparity);
+
+	/*
+	 * The total number of data and parity sectors associated with
+	 * this I/O.
+	 */
 	tot = s + nparity * (q + (r == 0 ? 0 : 1));
 
+	/* acols: The columns that will be accessed. */
+	/* scols: The columns that will be accessed or skipped. */
 	if (q == 0) {
+		/* Our I/O request doesn't span all child vdevs. */
 		acols = bc;
 		scols = MIN(dcols, roundup(bc, nparity + 1));
 	} else {
@@ -1521,6 +1548,23 @@ vdev_raidz_child_done(zio_t *zio)
 	rc->rc_skipped = 0;
 }
 
+/*
+ * Start an IO operation on a RAIDZ VDev
+ *
+ * Outline:
+ * - For write operations:
+ *   1. Generate the parity data
+ *   2. Create child zio write operations to each column's vdev, for both
+ *      data and parity.
+ *   3. If the column skips any sectors for padding, create optional dummy
+ *      write zio children for those areas to improve aggregation continuity.
+ * - For read operations:
+ *   1. Create child zio read operations to each data column's vdev to read
+ *      the range of data required for zio.
+ *   2. If this is a scrub or resilver operation, or if any of the data
+ *      vdevs have had errors, then create zio read operations to the parity
+ *      columns' VDevs as well.
+ */
 static int
 vdev_raidz_io_start(zio_t *zio)
 {
@@ -1864,6 +1908,27 @@ done:
 	return (ret);
 }
 
+/*
+ * Complete an IO operation on a RAIDZ VDev
+ *
+ * Outline:
+ * - For write operations:
+ *   1. Check for errors on the child IOs.
+ *   2. Return, setting an error code if too few child VDevs were written
+ *      to reconstruct the data later.  Note that partial writes are
+ *      considered successful if they can be reconstructed at all.
+ * - For read operations:
+ *   1. Check for errors on the child IOs.
+ *   2. If data errors occurred:
+ *      a. Try to reassemble the data from the parity available.
+ *      b. If we haven't yet read the parity drives, read them now.
+ *      c. If all parity drives have been read but the data still doesn't
+ *         reassemble with a correct checksum, then try combinatorial
+ *         reconstruction.
+ *      d. If that doesn't work, return an error.
+ *   3. If there were unexpected errors or this is a resilver operation,
+ *      rewrite the vdevs that had errors.
+ */
 static void
 vdev_raidz_io_done(zio_t *zio)
 {
diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c
index ce084fff1..c08e9dd9b 100644
--- a/module/zfs/zfs_ctldir.c
+++ b/module/zfs/zfs_ctldir.c
@@ -368,6 +368,11 @@ zfsctl_snapshot_zname(struct inode *ip, const char *name, int len, char *zname)
 	return (0);
 }
 
+/*
+ * Gets the full dataset name that corresponds to the given snapshot name
+ * Example:
+ * zfsctl_snapshot_zname("snap1") -> "mypool/myfs@snap1"
+ */
 static int
 zfsctl_snapshot_zpath(struct path *path, int len, char *zpath)
 {
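The geometry arithmetic documented above in vdev_raidz_map_alloc() is easy to check in isolation. Below is a minimal, standalone C sketch that reproduces the q/r/bc/tot/acols/scols calculations described by the new comments; the names raidz_geom and raidz_map_geometry() are illustrative only and are not ZFS APIs.

/*
 * Standalone sketch of the RAIDZ map geometry math from
 * vdev_raidz_map_alloc().  Assumes dcols > nparity.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define	MIN(a, b)		((a) < (b) ? (a) : (b))
#define	ROUNDUP(x, align)	((((x) + (align) - 1) / (align)) * (align))

struct raidz_geom {
	uint64_t q;	/* full-stripe data sectors per data column */
	uint64_t r;	/* leftover (partial stripe) data sectors */
	uint64_t bc;	/* "big columns": those holding remainder data */
	uint64_t tot;	/* total data + parity sectors for the I/O */
	uint64_t acols;	/* columns actually accessed */
	uint64_t scols;	/* columns accessed or skipped */
};

static void
raidz_map_geometry(uint64_t io_size, uint64_t unit_shift, uint64_t dcols,
    uint64_t nparity, struct raidz_geom *g)
{
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = io_size >> unit_shift;

	assert(dcols > nparity);

	g->q = s / (dcols - nparity);
	g->r = s - g->q * (dcols - nparity);
	g->bc = (g->r == 0 ? 0 : g->r + nparity);
	g->tot = s + nparity * (g->q + (g->r == 0 ? 0 : 1));

	if (g->q == 0) {
		/* The I/O doesn't span all child vdevs. */
		g->acols = g->bc;
		g->scols = MIN(dcols, ROUNDUP(g->bc, nparity + 1));
	} else {
		g->acols = dcols;
		g->scols = dcols;
	}
}

int
main(void)
{
	struct raidz_geom g;

	/* A 128K write to a 6-disk RAIDZ2 with 512-byte sectors. */
	raidz_map_geometry(128 << 10, 9, 6, 2, &g);
	printf("q=%llu r=%llu bc=%llu tot=%llu acols=%llu scols=%llu\n",
	    (unsigned long long)g.q, (unsigned long long)g.r,
	    (unsigned long long)g.bc, (unsigned long long)g.tot,
	    (unsigned long long)g.acols, (unsigned long long)g.scols);
	return (0);
}

For the example I/O this prints q=64 r=0 bc=0 tot=384 acols=6 scols=6: 256 data sectors divide evenly across the four data columns, so there are no big columns, and the two parity columns add 128 sectors to the total.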