Diffstat (limited to 'module/zfs/zfs_vnops.c')
-rw-r--r--	module/zfs/zfs_vnops.c	285 ++++++++++---------------------
1 file changed, 93 insertions(+), 192 deletions(-)
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index f62d3bfa0..e38abb28c 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -348,56 +348,28 @@ zfs_unmap_page(page_t *pp, caddr_t addr)
*
* On Write: If we find a memory mapped page, we write to *both*
* the page and the dmu buffer.
- *
- * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
- * the file is memory mapped.
*/
-static int
-mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx)
+static void
+update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid)
{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- int64_t start, off;
- int len = nbytes;
- int error = 0;
+ int64_t off;
- start = uio->uio_loffset;
off = start & PAGEOFFSET;
for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
page_t *pp;
- uint64_t bytes = MIN(PAGESIZE - off, len);
- uint64_t woff = uio->uio_loffset;
+ uint64_t nbytes = MIN(PAGESIZE - off, len);
- /*
- * We don't want a new page to "appear" in the middle of
- * the file update (because it may not get the write
- * update data), so we grab a lock to block
- * zfs_getpage().
- */
- rw_enter(&zp->z_map_lock, RW_WRITER);
if (pp = page_lookup(vp, start, SE_SHARED)) {
caddr_t va;
- rw_exit(&zp->z_map_lock);
va = zfs_map_page(pp, S_WRITE);
- error = uiomove(va+off, bytes, UIO_WRITE, uio);
- if (error == 0) {
- dmu_write(zfsvfs->z_os, zp->z_id,
- woff, bytes, va+off, tx);
- }
+ (void) dmu_read(os, oid, start+off, nbytes, va+off);
zfs_unmap_page(pp, va);
page_unlock(pp);
- } else {
- error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
- uio, bytes, tx);
- rw_exit(&zp->z_map_lock);
}
- len -= bytes;
+ len -= nbytes;
off = 0;
- if (error)
- break;
}
- return (error);
}
/*
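A note on the new coherency scheme: zfs_write() now writes through the DMU first, and update_pages() then copies the committed data back into any resident mapped pages, so the page-at-a-time chunking above is all that survives of mappedwrite(). That chunking arithmetic can be exercised outside the kernel; a minimal userland sketch, with PAGESIZE/PAGEOFFSET/PAGEMASK defined locally to the conventional 4 KB values (in the kernel they come from <sys/param.h>):

#include <stdio.h>
#include <stdint.h>

#define PAGESIZE   4096ULL
#define PAGEOFFSET (PAGESIZE - 1)	/* mask for the in-page offset */
#define PAGEMASK   (~PAGEOFFSET)	/* mask for the page-aligned base */

#define MIN(a, b)  ((a) < (b) ? (a) : (b))

/*
 * Mirror update_pages()'s loop: for a write covering [start, start+len),
 * visit each page once and report how many bytes land in it.
 */
static void
walk_pages(int64_t start, int len)
{
	int64_t off = start & PAGEOFFSET;

	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		uint64_t nbytes = MIN(PAGESIZE - off, (uint64_t)len);

		printf("page @%8lld: copy %4llu bytes at in-page offset %lld\n",
		    (long long)start, (unsigned long long)nbytes,
		    (long long)off);
		len -= nbytes;
		off = 0;	/* only the first page can start mid-page */
	}
}

int
main(void)
{
	walk_pages(4000, 9000);	/* a 9000-byte write at offset 4000 touches four pages */
	return (0);
}

Only the first iteration copies a partial page; after that off is zeroed and whole pages are copied until the tail.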
@@ -733,18 +705,13 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
* Perhaps we should use SPA_MAXBLOCKSIZE chunks?
*/
nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
- rw_enter(&zp->z_map_lock, RW_READER);
tx_bytes = uio->uio_resid;
- if (vn_has_cached_data(vp)) {
- rw_exit(&zp->z_map_lock);
- error = mappedwrite(vp, nbytes, uio, tx);
- } else {
- error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
- uio, nbytes, tx);
- rw_exit(&zp->z_map_lock);
- }
+ error = dmu_write_uio(zfsvfs->z_os, zp->z_id, uio, nbytes, tx);
tx_bytes -= uio->uio_resid;
+ if (tx_bytes && vn_has_cached_data(vp))
+ update_pages(vp, woff,
+ tx_bytes, zfsvfs->z_os, zp->z_id);
/*
* If we made no progress, we're done. If we made even
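For the nbytes clamp above: P2PHASE() is the offset's distance into its block, so the first pass writes only up to the block boundary and every later pass starts block-aligned. A standalone illustration, assuming the stock <sys/sysmacros.h> definition of P2PHASE (valid for power-of-two alignments):

#include <stdio.h>
#include <stdint.h>

/* Power-of-2 phase, as in <sys/sysmacros.h>: distance into the block. */
#define P2PHASE(x, align)  ((x) & ((align) - 1))
#define MIN(a, b)          ((a) < (b) ? (a) : (b))

int
main(void)
{
	uint64_t max_blksz = 131072;	/* 128K recordsize */
	uint64_t woff = 130000;		/* write offset, 1072 short of a block */
	uint64_t n = 200000;		/* bytes left to write */

	/* The first chunk stops at the block boundary... */
	uint64_t nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
	printf("first chunk:  %llu bytes\n", (unsigned long long)nbytes);

	/* ...so every later chunk is block-aligned. */
	woff += nbytes;
	n -= nbytes;
	nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
	printf("second chunk: %llu bytes\n", (unsigned long long)nbytes);
	return (0);
}

This prints 1072 and then 131072: one short lead-in chunk, then full blocks.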
@@ -3610,9 +3577,7 @@ zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
{
znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- zilog_t *zilog = zfsvfs->z_log;
dmu_tx_t *tx;
- rl_t *rl;
u_offset_t off, koff;
size_t len, klen;
uint64_t filesz;
@@ -3627,26 +3592,18 @@ zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
* a read-modify-write).
*/
if (off < filesz && zp->z_blksz > PAGESIZE) {
- if (!ISP2(zp->z_blksz)) {
- /* Only one block in the file. */
- klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
- koff = 0;
- } else {
- klen = zp->z_blksz;
- koff = P2ALIGN(off, (u_offset_t)klen);
- }
+ klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
+ koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0;
ASSERT(koff <= filesz);
if (koff + klen > filesz)
klen = P2ROUNDUP(filesz - koff, (uint64_t)PAGESIZE);
pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
}
ASSERT3U(btop(len), ==, btopr(len));
-top:
- rl = zfs_range_lock(zp, off, len, RL_WRITER);
+
/*
* Can't push pages past end-of-file.
*/
- filesz = zp->z_phys->zp_size;
if (off >= filesz) {
/* ignore all pages */
err = 0;
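The rewritten kluster sizing above folds the old two-way branch into two lines: klen is always the block size rounded up to a whole number of pages, and koff is aligned down only when that rounded size is a power of two (a file smaller than its first full block has a single, possibly odd-sized block at offset 0, so koff must be 0). A runnable check of both cases, using the standard <sys/sysmacros.h> macro definitions:

#include <stdio.h>
#include <stdint.h>

/* As in <sys/sysmacros.h>; alignments must be powers of two. */
#define P2ALIGN(x, align)   ((x) & -(align))
#define P2ROUNDUP(x, align) (-(-(x) & -(align)))
#define ISP2(x)             (((x) & ((x) - 1)) == 0)

static void
kluster(uint64_t off, uint64_t blksz)
{
	uint64_t klen = P2ROUNDUP(blksz, 4096ULL);
	uint64_t koff = ISP2(klen) ? P2ALIGN(off, klen) : 0;

	printf("blksz %7llu, off %7llu -> koff %7llu, klen %7llu\n",
	    (unsigned long long)blksz, (unsigned long long)off,
	    (unsigned long long)koff, (unsigned long long)klen);
}

int
main(void)
{
	kluster(200000, 131072);	/* power-of-2 block: align off down */
	kluster(20000, 37376);		/* single odd-sized block: koff = 0 */
	return (0);
}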
@@ -3661,17 +3618,15 @@ top:
pvn_write_done(trunc, flags);
len = filesz - off;
}
-
+top:
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_write(tx, zp->z_id, off, len);
dmu_tx_hold_bonus(tx, zp->z_id);
err = dmu_tx_assign(tx, TXG_NOWAIT);
if (err != 0) {
if (err == ERESTART) {
- zfs_range_unlock(rl);
dmu_tx_wait(tx);
dmu_tx_abort(tx);
- err = 0;
goto top;
}
dmu_tx_abort(tx);
@@ -3689,12 +3644,11 @@ top:
if (err == 0) {
zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
- zfs_log_write(zilog, tx, TX_WRITE, zp, off, len, 0);
+ zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
dmu_tx_commit(tx);
}
out:
- zfs_range_unlock(rl);
pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
if (offp)
*offp = off;
@@ -3731,31 +3685,50 @@ zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
page_t *pp;
size_t io_len;
u_offset_t io_off;
- uint64_t filesz;
+ uint_t blksz;
+ rl_t *rl;
int error = 0;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
- if (len == 0) {
+ /*
+ * Align this request to the file block size in case we kluster.
+ * XXX - this can result in pretty aggressive locking, which can
+ * impact simultaneous read/write access. One option might be
+ * to break up long requests (len == 0) into block-by-block
+ * operations to get narrower locking.
+ */
+ blksz = zp->z_blksz;
+ if (ISP2(blksz))
+ io_off = P2ALIGN_TYPED(off, blksz, u_offset_t);
+ else
+ io_off = 0;
+ if (len > 0 && ISP2(blksz))
+ io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);
+ else
+ io_len = 0;
+
+ if (io_len == 0) {
/*
- * Search the entire vp list for pages >= off.
+ * Search the entire vp list for pages >= io_off.
*/
- error = pvn_vplist_dirty(vp, (u_offset_t)off, zfs_putapage,
- flags, cr);
+ rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER);
+ error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr);
goto out;
}
+ rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER);
- filesz = zp->z_phys->zp_size; /* get consistent copy of zp_size */
- if (off > filesz) {
+ if (off > zp->z_phys->zp_size) {
/* past end of file */
+ zfs_range_unlock(rl);
ZFS_EXIT(zfsvfs);
return (0);
}
- len = MIN(len, filesz - off);
+ len = MIN(io_len, P2ROUNDUP(zp->z_phys->zp_size, PAGESIZE) - io_off);
- for (io_off = off; io_off < off + len; io_off += io_len) {
+ for (off = io_off; io_off < off + len; io_off += io_len) {
if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
pp = page_lookup(vp, io_off,
(flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
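The prologue added to zfs_putpage() widens the caller's [off, off + len) range to file-block boundaries so that the range lock covers everything zfs_putapage() might kluster in. A worked example with hypothetical values (the typed macros carry the same arithmetic as their untyped forms, applied in the named type):

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

typedef uint64_t u_offset_t;

/* Typed variants, as in <sys/sysmacros.h>. */
#define P2ALIGN_TYPED(x, align, type) \
	((type)(x) & -(type)(align))
#define P2ROUNDUP_TYPED(x, align, type) \
	(-(-(type)(x) & -(type)(align)))
#define ISP2(x)  (((x) & ((x) - 1)) == 0)

int
main(void)
{
	uint64_t blksz = 131072;	/* power-of-2 file block size */
	u_offset_t off = 140000;	/* caller's offset */
	size_t len = 8192;		/* caller's length */

	u_offset_t io_off = ISP2(blksz) ?
	    P2ALIGN_TYPED(off, blksz, u_offset_t) : 0;
	size_t io_len = (len > 0 && ISP2(blksz)) ?
	    P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t) : 0;

	/* io_off = 131072, io_len = 131072: one whole block is locked. */
	printf("lock [%llu, %llu)\n", (unsigned long long)io_off,
	    (unsigned long long)(io_off + io_len));
	return (0);
}

The written range [140000, 148192) falls entirely inside the locked block [131072, 262144), which is the invariant the alignment exists to guarantee.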
@@ -3778,6 +3751,7 @@ zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
}
}
out:
+ zfs_range_unlock(rl);
if ((flags & B_ASYNC) == 0)
zil_commit(zfsvfs->z_log, UINT64_MAX, zp->z_id);
ZFS_EXIT(zfsvfs);
@@ -3894,7 +3868,8 @@ zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
/*
* If we can't find a page in the cache, we will create a new page
* and fill it with file data. For efficiency, we may try to fill
- * multiple pages at once (klustering).
+ * multiple pages at once (klustering) to fill up the supplied page
+ * list.
*/
static int
zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
@@ -3903,57 +3878,27 @@ zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
znode_t *zp = VTOZ(vp);
page_t *pp, *cur_pp;
objset_t *os = zp->z_zfsvfs->z_os;
- caddr_t va;
u_offset_t io_off, total;
- uint64_t oid = zp->z_id;
size_t io_len;
- uint64_t filesz;
int err;
- /*
- * If we are only asking for a single page don't bother klustering.
- */
- filesz = zp->z_phys->zp_size; /* get consistent copy of zp_size */
- if (off >= filesz)
- return (EFAULT);
if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) {
+ /*
+ * We only have a single page; don't bother klustering.
+ */
io_off = off;
io_len = PAGESIZE;
pp = page_create_va(vp, io_off, io_len, PG_WAIT, seg, addr);
} else {
/*
- * Try to fill a kluster of pages (a blocks worth).
+ * Try to find enough pages to fill the page list
*/
- size_t klen;
- u_offset_t koff;
-
- if (!ISP2(zp->z_blksz)) {
- /* Only one block in the file. */
- klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
- koff = 0;
- } else {
- /*
- * It would be ideal to align our offset to the
- * blocksize but doing so has resulted in some
- * strange application crashes. For now, we
- * leave the offset as is and only adjust the
- * length if we are off the end of the file.
- */
- koff = off;
- klen = plsz;
- }
- ASSERT(koff <= filesz);
- if (koff + klen > filesz)
- klen = P2ROUNDUP(filesz, (uint64_t)PAGESIZE) - koff;
- ASSERT3U(off, >=, koff);
- ASSERT3U(off, <, koff + klen);
pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
- &io_len, koff, klen, 0);
+ &io_len, off, plsz, 0);
}
if (pp == NULL) {
/*
- * Some other thread entered the page before us.
- * Return to zfs_getpage to retry the lookup.
+ * The page already exists; nothing to do here.
*/
*pl = NULL;
return (0);
@@ -3964,9 +3909,11 @@ zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
*/
cur_pp = pp;
for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
+ caddr_t va;
+
ASSERT3U(io_off, ==, cur_pp->p_offset);
va = zfs_map_page(cur_pp, S_WRITE);
- err = dmu_read(os, oid, io_off, PAGESIZE, va);
+ err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va);
zfs_unmap_page(cur_pp, va);
if (err) {
/* On error, toss the entire kluster */
@@ -3978,15 +3925,14 @@ zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
}
cur_pp = cur_pp->p_next;
}
-out:
+
/*
- * Fill in the page list array from the kluster. If
- * there are too many pages in the kluster, return
- * as many pages as possible starting from the desired
- * offset `off'.
+ * Fill in the page list array from the kluster starting
+ * from the desired offset `off'.
* NOTE: the page list will always be null terminated.
*/
pvn_plist_init(pp, pl, plsz, off, io_len, rw);
+ ASSERT(pl == NULL || (*pl)->p_offset == off);
return (0);
}
@@ -3994,10 +3940,10 @@ out:
/*
* Return pointers to the pages for the file region [off, off + len]
* in the pl array. If plsz is greater than len, this function may
- * also return page pointers from before or after the specified
- * region (i.e. some region [off', off' + plsz]). These additional
- * pages are only returned if they are already in the cache, or were
- * created as part of a klustered read.
+ * also return page pointers from after the specified region
+ * (i.e. the region [off, off + plsz]). These additional pages are
+ * only returned if they are already in the cache, or were created as
+ * part of a klustered read.
*
* IN: vp - vnode of file to get data from.
* off - position in file to get data from.
@@ -4026,9 +3972,17 @@ zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
{
znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- page_t *pp, **pl0 = pl;
- int need_unlock = 0, err = 0;
- offset_t orig_off;
+ page_t **pl0 = pl;
+ int err = 0;
+
+ /* we do our own caching; faultahead is unnecessary */
+ if (pl == NULL)
+ return (0);
+ else if (len > plsz)
+ len = plsz;
+ else
+ len = P2ROUNDUP(len, PAGESIZE);
+ ASSERT(plsz >= len);
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
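The new prologue trims the request before any locking: fault-ahead calls (pl == NULL) return immediately, an over-long len is cut down to the supplied page-list size, and a short len is padded to whole pages. A standalone sketch of the clamping, assuming 4 KB pages:

#include <stdio.h>
#include <stddef.h>

#define PAGESIZE 4096UL
#define P2ROUNDUP(x, align) (-(-(x) & -(align)))

/* Mirror zfs_getpage()'s prologue: fit len to the supplied page list. */
static size_t
clamp_len(size_t len, size_t plsz)
{
	if (len > plsz)
		len = plsz;
	else
		len = P2ROUNDUP(len, PAGESIZE);
	return (len);
}

int
main(void)
{
	printf("%zu\n", clamp_len(100, 32768));		/* 4096: padded to a page */
	printf("%zu\n", clamp_len(65536, 32768));	/* 32768: cut to plsz */
	return (0);
}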
@@ -4036,104 +3990,51 @@ zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
if (protp)
*protp = PROT_ALL;
- /* no faultahead (for now) */
- if (pl == NULL) {
- ZFS_EXIT(zfsvfs);
- return (0);
- }
-
- /* can't fault past EOF */
- if (off >= zp->z_phys->zp_size) {
- ZFS_EXIT(zfsvfs);
- return (EFAULT);
- }
- orig_off = off;
-
- /*
- * If we already own the lock, then we must be page faulting
- * in the middle of a write to this file (i.e., we are writing
- * to this file using data from a mapped region of the file).
- */
- if (rw_owner(&zp->z_map_lock) != curthread) {
- rw_enter(&zp->z_map_lock, RW_WRITER);
- need_unlock = TRUE;
- }
-
/*
* Loop through the requested range [off, off + len] looking
* for pages. If we don't find a page, we will need to create
* a new page and fill it with data from the file.
*/
while (len > 0) {
- if (plsz < PAGESIZE)
- break;
- if (pp = page_lookup(vp, off, SE_SHARED)) {
- *pl++ = pp;
+ if (*pl = page_lookup(vp, off, SE_SHARED))
+ *(pl+1) = NULL;
+ else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw))
+ goto out;
+ while (*pl) {
+ ASSERT3U((*pl)->p_offset, ==, off);
off += PAGESIZE;
addr += PAGESIZE;
- len -= PAGESIZE;
- plsz -= PAGESIZE;
- } else {
- err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw);
- if (err)
- goto out;
- /*
- * klustering may have changed our region
- * to be block aligned.
- */
- if (((pp = *pl) != 0) && (off != pp->p_offset)) {
- int delta = off - pp->p_offset;
- len += delta;
- off -= delta;
- addr -= delta;
- }
- while (*pl) {
- pl++;
- off += PAGESIZE;
- addr += PAGESIZE;
- plsz -= PAGESIZE;
- if (len > PAGESIZE)
- len -= PAGESIZE;
- else
- len = 0;
+ if (len > 0) {
+ ASSERT3U(len, >=, PAGESIZE);
+ len -= PAGESIZE;
}
+ ASSERT3U(plsz, >=, PAGESIZE);
+ plsz -= PAGESIZE;
+ pl++;
}
}
/*
* Fill out the page array with any pages already in the cache.
*/
- while (plsz > 0) {
- pp = page_lookup_nowait(vp, off, SE_SHARED);
- if (pp == NULL)
- break;
- *pl++ = pp;
- off += PAGESIZE;
- plsz -= PAGESIZE;
+ while (plsz > 0 &&
+ (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) {
+ off += PAGESIZE;
+ plsz -= PAGESIZE;
}
-
- ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
out:
- /*
- * We can't grab the range lock for the page as reader which would
- * stop truncation as this leads to deadlock. So we need to recheck
- * the file size.
- */
- if (orig_off >= zp->z_phys->zp_size)
- err = EFAULT;
if (err) {
/*
* Release any pages we have previously locked.
*/
while (pl > pl0)
page_unlock(*--pl);
+ } else {
+ ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
}
*pl = NULL;
- if (need_unlock)
- rw_exit(&zp->z_map_lock);
-
ZFS_EXIT(zfsvfs);
return (err);
}
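One pattern worth noting in the error path above: pl0 remembers the base of the caller's array, and every page stored so far is still locked, so cleanup is a simple pointer unwind back to the base. The same idiom in self-contained form, with page_unlock() stubbed out for illustration:

#include <stdio.h>

/* Stand-in for a locked kernel page. */
typedef struct page { int id; } page_t;

static void
page_unlock(page_t *pp)	/* stub; the kernel drops the page's shared lock */
{
	printf("unlocked page %d\n", pp->id);
}

int
main(void)
{
	page_t a = {0}, b = {1}, c = {2};
	page_t *pages[4];
	page_t **pl0 = pages, **pl = pages;

	/* Pages are appended as they are looked up and locked... */
	*pl++ = &a;
	*pl++ = &b;
	*pl++ = &c;

	/* ...and on error, released in reverse order back to the base. */
	while (pl > pl0)
		page_unlock(*--pl);
	*pl = NULL;	/* callers always see a null-terminated list */
	return (0);
}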