diff options
Diffstat (limited to 'module/zfs/zfs_vnops.c')
-rw-r--r-- | module/zfs/zfs_vnops.c | 285 |
1 file changed, 93 insertions, 192 deletions
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index f62d3bfa0..e38abb28c 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -348,56 +348,28 @@ zfs_unmap_page(page_t *pp, caddr_t addr) * * On Write: If we find a memory mapped page, we write to *both* * the page and the dmu buffer. - * - * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when - * the file is memory mapped. */ -static int -mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx) +static void +update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid) { - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int64_t start, off; - int len = nbytes; - int error = 0; + int64_t off; - start = uio->uio_loffset; off = start & PAGEOFFSET; for (start &= PAGEMASK; len > 0; start += PAGESIZE) { page_t *pp; - uint64_t bytes = MIN(PAGESIZE - off, len); - uint64_t woff = uio->uio_loffset; + uint64_t nbytes = MIN(PAGESIZE - off, len); - /* - * We don't want a new page to "appear" in the middle of - * the file update (because it may not get the write - * update data), so we grab a lock to block - * zfs_getpage(). 
- */ - rw_enter(&zp->z_map_lock, RW_WRITER); if (pp = page_lookup(vp, start, SE_SHARED)) { caddr_t va; - rw_exit(&zp->z_map_lock); va = zfs_map_page(pp, S_WRITE); - error = uiomove(va+off, bytes, UIO_WRITE, uio); - if (error == 0) { - dmu_write(zfsvfs->z_os, zp->z_id, - woff, bytes, va+off, tx); - } + (void) dmu_read(os, oid, start+off, nbytes, va+off); zfs_unmap_page(pp, va); page_unlock(pp); - } else { - error = dmu_write_uio(zfsvfs->z_os, zp->z_id, - uio, bytes, tx); - rw_exit(&zp->z_map_lock); } - len -= bytes; + len -= nbytes; off = 0; - if (error) - break; } - return (error); } /* @@ -733,18 +705,13 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) * Perhaps we should use SPA_MAXBLOCKSIZE chunks? */ nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); - rw_enter(&zp->z_map_lock, RW_READER); tx_bytes = uio->uio_resid; - if (vn_has_cached_data(vp)) { - rw_exit(&zp->z_map_lock); - error = mappedwrite(vp, nbytes, uio, tx); - } else { - error = dmu_write_uio(zfsvfs->z_os, zp->z_id, - uio, nbytes, tx); - rw_exit(&zp->z_map_lock); - } + error = dmu_write_uio(zfsvfs->z_os, zp->z_id, uio, nbytes, tx); tx_bytes -= uio->uio_resid; + if (tx_bytes && vn_has_cached_data(vp)) + update_pages(vp, woff, + tx_bytes, zfsvfs->z_os, zp->z_id); /* * If we made no progress, we're done. If we made even @@ -3610,9 +3577,7 @@ zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; dmu_tx_t *tx; - rl_t *rl; u_offset_t off, koff; size_t len, klen; uint64_t filesz; @@ -3627,26 +3592,18 @@ zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, * a read-modify-write). */ if (off < filesz && zp->z_blksz > PAGESIZE) { - if (!ISP2(zp->z_blksz)) { - /* Only one block in the file. 
*/ - klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE); - koff = 0; - } else { - klen = zp->z_blksz; - koff = P2ALIGN(off, (u_offset_t)klen); - } + klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE); + koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0; ASSERT(koff <= filesz); if (koff + klen > filesz) klen = P2ROUNDUP(filesz - koff, (uint64_t)PAGESIZE); pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags); } ASSERT3U(btop(len), ==, btopr(len)); -top: - rl = zfs_range_lock(zp, off, len, RL_WRITER); + /* * Can't push pages past end-of-file. */ - filesz = zp->z_phys->zp_size; if (off >= filesz) { /* ignore all pages */ err = 0; @@ -3661,17 +3618,15 @@ top: pvn_write_done(trunc, flags); len = filesz - off; } - +top: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_write(tx, zp->z_id, off, len); dmu_tx_hold_bonus(tx, zp->z_id); err = dmu_tx_assign(tx, TXG_NOWAIT); if (err != 0) { if (err == ERESTART) { - zfs_range_unlock(rl); dmu_tx_wait(tx); dmu_tx_abort(tx); - err = 0; goto top; } dmu_tx_abort(tx); @@ -3689,12 +3644,11 @@ top: if (err == 0) { zfs_time_stamper(zp, CONTENT_MODIFIED, tx); - zfs_log_write(zilog, tx, TX_WRITE, zp, off, len, 0); + zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0); dmu_tx_commit(tx); } out: - zfs_range_unlock(rl); pvn_write_done(pp, (err ? B_ERROR : 0) | flags); if (offp) *offp = off; @@ -3731,31 +3685,50 @@ zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr, page_t *pp; size_t io_len; u_offset_t io_off; - uint64_t filesz; + uint_t blksz; + rl_t *rl; int error = 0; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); - if (len == 0) { + /* + * Align this request to the file block size in case we kluster. + * XXX - this can result in pretty aggressive locking, which can + * impact simultaneous read/write access. One option might be + * to break up long requests (len == 0) into block-by-block + * operations to get narrower locking. 
+ */ + blksz = zp->z_blksz; + if (ISP2(blksz)) + io_off = P2ALIGN_TYPED(off, blksz, u_offset_t); + else + io_off = 0; + if (len > 0 && ISP2(blksz)) + io_len = P2ROUNDUP_TYPED(len + (io_off - off), blksz, size_t); + else + io_len = 0; + + if (io_len == 0) { /* - * Search the entire vp list for pages >= off. + * Search the entire vp list for pages >= io_off. */ - error = pvn_vplist_dirty(vp, (u_offset_t)off, zfs_putapage, - flags, cr); + rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER); + error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr); goto out; } + rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER); - filesz = zp->z_phys->zp_size; /* get consistent copy of zp_size */ - if (off > filesz) { + if (off > zp->z_phys->zp_size) { /* past end of file */ + zfs_range_unlock(rl); ZFS_EXIT(zfsvfs); return (0); } - len = MIN(len, filesz - off); + len = MIN(io_len, P2ROUNDUP(zp->z_phys->zp_size, PAGESIZE) - io_off); - for (io_off = off; io_off < off + len; io_off += io_len) { + for (off = io_off; io_off < off + len; io_off += io_len) { if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) { pp = page_lookup(vp, io_off, (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED); @@ -3778,6 +3751,7 @@ zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr, } } out: + zfs_range_unlock(rl); if ((flags & B_ASYNC) == 0) zil_commit(zfsvfs->z_log, UINT64_MAX, zp->z_id); ZFS_EXIT(zfsvfs); @@ -3894,7 +3868,8 @@ zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset, /* * If we can't find a page in the cache, we will create a new page * and fill it with file data. For efficiency, we may try to fill - * multiple pages at once (klustering). + * multiple pages at once (klustering) to fill up the supplied page + * list. 
*/ static int zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, @@ -3903,57 +3878,27 @@ zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, znode_t *zp = VTOZ(vp); page_t *pp, *cur_pp; objset_t *os = zp->z_zfsvfs->z_os; - caddr_t va; u_offset_t io_off, total; - uint64_t oid = zp->z_id; size_t io_len; - uint64_t filesz; int err; - /* - * If we are only asking for a single page don't bother klustering. - */ - filesz = zp->z_phys->zp_size; /* get consistent copy of zp_size */ - if (off >= filesz) - return (EFAULT); if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) { + /* + * We only have a single page, don't bother klustering + */ io_off = off; io_len = PAGESIZE; pp = page_create_va(vp, io_off, io_len, PG_WAIT, seg, addr); } else { /* - * Try to fill a kluster of pages (a blocks worth). + * Try to find enough pages to fill the page list */ - size_t klen; - u_offset_t koff; - - if (!ISP2(zp->z_blksz)) { - /* Only one block in the file. */ - klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE); - koff = 0; - } else { - /* - * It would be ideal to align our offset to the - * blocksize but doing so has resulted in some - * strange application crashes. For now, we - * leave the offset as is and only adjust the - * length if we are off the end of the file. - */ - koff = off; - klen = plsz; - } - ASSERT(koff <= filesz); - if (koff + klen > filesz) - klen = P2ROUNDUP(filesz, (uint64_t)PAGESIZE) - koff; - ASSERT3U(off, >=, koff); - ASSERT3U(off, <, koff + klen); pp = pvn_read_kluster(vp, off, seg, addr, &io_off, - &io_len, koff, klen, 0); + &io_len, off, plsz, 0); } if (pp == NULL) { /* - * Some other thread entered the page before us. - * Return to zfs_getpage to retry the lookup. + * The page already exists, nothing to do here. 
*/ *pl = NULL; return (0); @@ -3964,9 +3909,11 @@ zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, */ cur_pp = pp; for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { + caddr_t va; + ASSERT3U(io_off, ==, cur_pp->p_offset); va = zfs_map_page(cur_pp, S_WRITE); - err = dmu_read(os, oid, io_off, PAGESIZE, va); + err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va); zfs_unmap_page(cur_pp, va); if (err) { /* On error, toss the entire kluster */ @@ -3978,15 +3925,14 @@ zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, } cur_pp = cur_pp->p_next; } -out: + /* - * Fill in the page list array from the kluster. If - * there are too many pages in the kluster, return - * as many pages as possible starting from the desired - * offset `off'. + * Fill in the page list array from the kluster starting + * from the desired offset `off'. * NOTE: the page list will always be null terminated. */ pvn_plist_init(pp, pl, plsz, off, io_len, rw); + ASSERT(pl == NULL || (*pl)->p_offset == off); return (0); } @@ -3994,10 +3940,10 @@ out: /* * Return pointers to the pages for the file region [off, off + len] * in the pl array. If plsz is greater than len, this function may - * also return page pointers from before or after the specified - * region (i.e. some region [off', off' + plsz]). These additional - * pages are only returned if they are already in the cache, or were - * created as part of a klustered read. + * also return page pointers from after the specified region + * (i.e. the region [off, off + plsz]). These additional pages are + * only returned if they are already in the cache, or were created as + * part of a klustered read. * * IN: vp - vnode of file to get data from. * off - position in file to get data from. 
@@ -4026,9 +3972,17 @@ zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; - page_t *pp, **pl0 = pl; - int need_unlock = 0, err = 0; - offset_t orig_off; + page_t **pl0 = pl; + int err = 0; + + /* we do our own caching, faultahead is unnecessary */ + if (pl == NULL) + return (0); + else if (len > plsz) + len = plsz; + else + len = P2ROUNDUP(len, PAGESIZE); + ASSERT(plsz >= len); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); @@ -4036,104 +3990,51 @@ zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, if (protp) *protp = PROT_ALL; - /* no faultahead (for now) */ - if (pl == NULL) { - ZFS_EXIT(zfsvfs); - return (0); - } - - /* can't fault past EOF */ - if (off >= zp->z_phys->zp_size) { - ZFS_EXIT(zfsvfs); - return (EFAULT); - } - orig_off = off; - - /* - * If we already own the lock, then we must be page faulting - * in the middle of a write to this file (i.e., we are writing - * to this file using data from a mapped region of the file). - */ - if (rw_owner(&zp->z_map_lock) != curthread) { - rw_enter(&zp->z_map_lock, RW_WRITER); - need_unlock = TRUE; - } - /* * Loop through the requested range [off, off + len] looking * for pages. If we don't find a page, we will need to create * a new page and fill it with data from the file. */ while (len > 0) { - if (plsz < PAGESIZE) - break; - if (pp = page_lookup(vp, off, SE_SHARED)) { - *pl++ = pp; + if (*pl = page_lookup(vp, off, SE_SHARED)) + *(pl+1) = NULL; + else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw)) + goto out; + while (*pl) { + ASSERT3U((*pl)->p_offset, ==, off); off += PAGESIZE; addr += PAGESIZE; - len -= PAGESIZE; - plsz -= PAGESIZE; - } else { - err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw); - if (err) - goto out; - /* - * klustering may have changed our region - * to be block aligned. 
- */ - if (((pp = *pl) != 0) && (off != pp->p_offset)) { - int delta = off - pp->p_offset; - len += delta; - off -= delta; - addr -= delta; - } - while (*pl) { - pl++; - off += PAGESIZE; - addr += PAGESIZE; - plsz -= PAGESIZE; - if (len > PAGESIZE) - len -= PAGESIZE; - else - len = 0; + if (len > 0) { + ASSERT3U(len, >=, PAGESIZE); + len -= PAGESIZE; } + ASSERT3U(plsz, >=, PAGESIZE); + plsz -= PAGESIZE; + pl++; } } /* * Fill out the page array with any pages already in the cache. */ - while (plsz > 0) { - pp = page_lookup_nowait(vp, off, SE_SHARED); - if (pp == NULL) - break; - *pl++ = pp; - off += PAGESIZE; - plsz -= PAGESIZE; + while (plsz > 0 && + (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) { + off += PAGESIZE; + plsz -= PAGESIZE; } - - ZFS_ACCESSTIME_STAMP(zfsvfs, zp); out: - /* - * We can't grab the range lock for the page as reader which would - * stop truncation as this leads to deadlock. So we need to recheck - * the file size. - */ - if (orig_off >= zp->z_phys->zp_size) - err = EFAULT; if (err) { /* * Release any pages we have previously locked. */ while (pl > pl0) page_unlock(*--pl); + } else { + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); } *pl = NULL; - if (need_unlock) - rw_exit(&zp->z_map_lock); - ZFS_EXIT(zfsvfs); return (err); } |