aboutsummaryrefslogtreecommitdiffstats
path: root/module
diff options
context:
space:
mode:
author: Prasad Joshi <[email protected]> 2011-05-28 02:53:07 +0100
committer: Brian Behlendorf <[email protected]> 2011-07-01 12:22:52 -0700
commit: dde471ef5a07bd569deeadd3e9a88655db3e10ab (patch)
tree: a6fc4f05594420aa7b4a1337c7db1da7bae61113 /module
parent: 2a005961a48e748632e96272915192dab6ce9401 (diff)
MMAP Optimization
Enable the zfs_getpage, zfs_fillpage, zfs_putpage, and zfs_putapage functions. The functions have been modified to make them Linux friendly. ZFS uses these functions to read/write mmapped pages. Using them from readpage/writepage results in cleaner code. The patch also adds readpages and writepages interface functions to read/write a list of pages in one function call. The code change handles the first mmap optimization mentioned in https://github.com/behlendorf/zfs/issues/225 Signed-off-by: Prasad Joshi <[email protected]> Signed-off-by: Brian Behlendorf <[email protected]> Issue #255
Diffstat (limited to 'module')
-rw-r--r-- module/zfs/zfs_vnops.c | 360
-rw-r--r-- module/zfs/zpl_file.c | 141
2 files changed, 174 insertions, 327 deletions
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index 86bef25fb..ff761854c 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -3725,53 +3725,31 @@ zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR);
return (0);
}
+#endif /* HAVE_MMAP */
/*
- * Push a page out to disk, klustering if possible.
+ * Push a page out to disk
*
* IN: vp - file to push page to.
* pp - page to push.
- * flags - additional flags.
- * cr - credentials of caller.
+ * off - start of range pushed.
+ * len - len of range pushed.
*
- * OUT: offp - start of range pushed.
- * lenp - len of range pushed.
*
* RETURN: 0 if success
* error code if failure
*
- * NOTE: callers must have locked the page to be pushed. On
- * exit, the page (and all other pages in the kluster) must be
- * unlocked.
+ * NOTE: callers must have locked the page to be pushed.
*/
/* ARGSUSED */
static int
-zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
- size_t *lenp, int flags, cred_t *cr)
+zfs_putapage(struct inode *ip, struct page *pp, u_offset_t off, size_t len)
{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- dmu_tx_t *tx;
- u_offset_t off, koff;
- size_t len, klen;
- int err;
-
- off = pp->p_offset;
- len = PAGESIZE;
- /*
- * If our blocksize is bigger than the page size, try to kluster
- * multiple pages so that we write a full block (thus avoiding
- * a read-modify-write).
- */
- if (off < zp->z_size && zp->z_blksz > PAGESIZE) {
- klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
- koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0;
- ASSERT(koff <= zp->z_size);
- if (koff + klen > zp->z_size)
- klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE);
- pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
- }
- ASSERT3U(btop(len), ==, btopr(len));
+ znode_t *zp = ITOZ(ip);
+ zfs_sb_t *zsb = ITOZSB(ip);
+ dmu_tx_t *tx;
+ caddr_t va;
+ int err;
/*
* Can't push pages past end-of-file.
@@ -3780,24 +3758,16 @@ zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
/* ignore all pages */
err = 0;
goto out;
- } else if (off + len > zp->z_size) {
- int npages = btopr(zp->z_size - off);
- page_t *trunc;
-
- page_list_break(&pp, &trunc, npages);
- /* ignore pages past end of file */
- if (trunc)
- pvn_write_done(trunc, flags);
+ } else if (off + len > zp->z_size)
len = zp->z_size - off;
- }
- if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
- zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
+ if (zfs_owner_overquota(zsb, zp, B_FALSE) ||
+ zfs_owner_overquota(zsb, zp, B_TRUE)) {
err = EDQUOT;
goto out;
}
top:
- tx = dmu_tx_create(zfsvfs->z_os);
+ tx = dmu_tx_create(zsb->z_os);
dmu_tx_hold_write(tx, zp->z_id, off, len);
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
@@ -3813,52 +3783,38 @@ top:
goto out;
}
- if (zp->z_blksz <= PAGESIZE) {
- caddr_t va = zfs_map_page(pp, S_READ);
- ASSERT3U(len, <=, PAGESIZE);
- dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
- zfs_unmap_page(pp, va);
- } else {
- err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
- }
+ va = kmap(pp);
+ ASSERT3U(len, <=, PAGESIZE);
+ dmu_write(zsb->z_os, zp->z_id, off, len, va, tx);
+ kunmap(pp);
if (err == 0) {
uint64_t mtime[2], ctime[2];
sa_bulk_attr_t bulk[3];
int count = 0;
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zsb), NULL,
&mtime, 16);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zsb), NULL,
&ctime, 16);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zsb), NULL,
&zp->z_pflags, 8);
zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
B_TRUE);
- zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
+ zfs_log_write(zsb->z_log, tx, TX_WRITE, zp, off, len, 0);
}
dmu_tx_commit(tx);
out:
- pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
- if (offp)
- *offp = off;
- if (lenp)
- *lenp = len;
-
return (err);
}
/*
- * Copy the portion of the file indicated from pages into the file.
- * The pages are stored in a page list attached to the files vnode.
+ * Copy the portion of the file indicated from page into the file.
*
- * IN: vp - vnode of file to push page data to.
- * off - position in file to put data.
- * len - amount of data to write.
- * flags - flags to control the operation.
- * cr - credentials of caller.
- * ct - caller context.
+ * IN: page - page whose contents are pushed to the file.
+ * wbc - writeback control; unused by this function.
+ * data - pointer to the address_space whose host inode is written.
*
* RETURN: 0 if success
* error code if failure
@@ -3867,87 +3823,45 @@ out:
* vp - ctime|mtime updated
*/
/*ARGSUSED*/
-static int
-zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr)
+int
+zfs_putpage(struct page *page, struct writeback_control *wbc, void *data)
{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- page_t *pp;
- size_t io_len;
- u_offset_t io_off;
- uint_t blksz;
- rl_t *rl;
- int error = 0;
+ struct address_space *mapping = data;
+ struct inode *ip = mapping->host;
+ znode_t *zp = ITOZ(ip);
+ zfs_sb_t *zsb = ITOZSB(ip);
+ rl_t *rl;
+ u_offset_t io_off;
+ size_t io_len;
+ size_t len;
+ int error;
+
+ io_off = page_offset(page);
+ io_len = PAGESIZE;
- ZFS_ENTER(zfsvfs);
+ ZFS_ENTER(zsb);
ZFS_VERIFY_ZP(zp);
- /*
- * Align this request to the file block size in case we kluster.
- * XXX - this can result in pretty aggresive locking, which can
- * impact simultanious read/write access. One option might be
- * to break up long requests (len == 0) into block-by-block
- * operations to get narrower locking.
- */
- blksz = zp->z_blksz;
- if (ISP2(blksz))
- io_off = P2ALIGN_TYPED(off, blksz, u_offset_t);
- else
- io_off = 0;
- if (len > 0 && ISP2(blksz))
- io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);
- else
- io_len = 0;
-
- if (io_len == 0) {
- /*
- * Search the entire vp list for pages >= io_off.
- */
- rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER);
- error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr);
- goto out;
- }
rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER);
- if (off > zp->z_size) {
+ if (io_off > zp->z_size) {
/* past end of file */
zfs_range_unlock(rl);
- ZFS_EXIT(zfsvfs);
+ ZFS_EXIT(zsb);
return (0);
}
len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off);
- for (off = io_off; io_off < off + len; io_off += io_len) {
- if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
- pp = page_lookup(vp, io_off,
- (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
- } else {
- pp = page_lookup_nowait(vp, io_off,
- (flags & B_FREE) ? SE_EXCL : SE_SHARED);
- }
-
- if (pp != NULL && pvn_getdirty(pp, flags)) {
- int err;
-
- /*
- * Found a dirty page to push
- */
- err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr);
- if (err)
- error = err;
- } else {
- io_len = PAGESIZE;
- }
- }
-out:
+ error = zfs_putapage(ip, page, io_off, len);
zfs_range_unlock(rl);
- if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zfsvfs->z_log, zp->z_id);
- ZFS_EXIT(zfsvfs);
+
+ if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zsb->z_log, zp->z_id);
+ ZFS_EXIT(zsb);
return (error);
}
-#endif /* HAVE_MMAP */
+EXPORT_SYMBOL(zfs_putpage);
/*ARGSUSED*/
void
@@ -4039,102 +3953,62 @@ zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
ZFS_EXIT(zfsvfs);
return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
}
+#endif /* HAVE_MMAP */
/*
- * If we can't find a page in the cache, we will create a new page
- * and fill it with file data. For efficiency, we may try to fill
- * multiple pages at once (klustering) to fill up the supplied page
- * list. Note that the pages to be filled are held with an exclusive
- * lock to prevent access by other threads while they are being filled.
+ * Fill pages with data from the disk.
*/
static int
-zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
- caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
+zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages)
{
- znode_t *zp = VTOZ(vp);
- page_t *pp, *cur_pp;
- objset_t *os = zp->z_zfsvfs->z_os;
- u_offset_t io_off, total;
- size_t io_len;
- int err;
-
- if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) {
- /*
- * We only have a single page, don't bother klustering
- */
- io_off = off;
- io_len = PAGESIZE;
- pp = page_create_va(vp, io_off, io_len,
- PG_EXCL | PG_WAIT, seg, addr);
- } else {
- /*
- * Try to find enough pages to fill the page list
- */
- pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
- &io_len, off, plsz, 0);
- }
- if (pp == NULL) {
- /*
- * The page already exists, nothing to do here.
- */
- *pl = NULL;
- return (0);
- }
+ znode_t *zp = ITOZ(ip);
+ zfs_sb_t *zsb = ITOZSB(ip);
+ objset_t *os;
+ struct page *cur_pp;
+ u_offset_t io_off, total;
+ size_t io_len;
+ loff_t i_size;
+ unsigned page_idx;
+ int err;
+
+ os = zsb->z_os;
+ io_len = nr_pages << PAGE_CACHE_SHIFT;
+ i_size = i_size_read(ip);
+ io_off = page_offset(pl[0]);
+
+ if (io_off + io_len > i_size)
+ io_len = i_size - io_off;
/*
- * Fill the pages in the kluster.
+ * Iterate over list of pages and read each page individually.
*/
- cur_pp = pp;
+ page_idx = 0;
+ cur_pp = pl[0];
for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
caddr_t va;
- ASSERT3U(io_off, ==, cur_pp->p_offset);
- va = zfs_map_page(cur_pp, S_WRITE);
+ va = kmap(cur_pp);
err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
DMU_READ_PREFETCH);
- zfs_unmap_page(cur_pp, va);
+ kunmap(cur_pp);
if (err) {
- /* On error, toss the entire kluster */
- pvn_read_done(pp, B_ERROR);
/* convert checksum errors into IO errors */
if (err == ECKSUM)
err = EIO;
return (err);
}
- cur_pp = cur_pp->p_next;
+ cur_pp = pl[++page_idx];
}
- /*
- * Fill in the page list array from the kluster starting
- * from the desired offset `off'.
- * NOTE: the page list will always be null terminated.
- */
- pvn_plist_init(pp, pl, plsz, off, io_len, rw);
- ASSERT(pl == NULL || (*pl)->p_offset == off);
-
return (0);
}
/*
- * Return pointers to the pages for the file region [off, off + len]
- * in the pl array. If plsz is greater than len, this function may
- * also return page pointers from after the specified region
- * (i.e. the region [off, off + plsz]). These additional pages are
- * only returned if they are already in the cache, or were created as
- * part of a klustered read.
- *
- * IN: vp - vnode of file to get data from.
- * off - position in file to get data from.
- * len - amount of data to retrieve.
- * plsz - length of provided page list.
- * seg - segment to obtain pages for.
- * addr - virtual address of fault.
- * rw - mode of created pages.
- * cr - credentials of caller.
- * ct - caller context.
+ * Uses zfs_fillpage to read data from the file and fill the pages.
*
- * OUT: protp - protection mode of created pages.
- * pl - list of pages created.
+ * IN: ip - inode of file to get data from.
+ * pl - list of pages to read
+ * nr_pages - number of pages to read
*
* RETURN: 0 if success
* error code if failure
@@ -4143,80 +4017,30 @@ zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
* vp - atime updated
*/
/* ARGSUSED */
-static int
-zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
- page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
- enum seg_rw rw, cred_t *cr)
+int
+zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages)
{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- page_t **pl0 = pl;
- int err = 0;
+ znode_t *zp = ITOZ(ip);
+ zfs_sb_t *zsb = ITOZSB(ip);
+ int err;
- /* we do our own caching, faultahead is unnecessary */
if (pl == NULL)
return (0);
- else if (len > plsz)
- len = plsz;
- else
- len = P2ROUNDUP(len, PAGESIZE);
- ASSERT(plsz >= len);
- ZFS_ENTER(zfsvfs);
+ ZFS_ENTER(zsb);
ZFS_VERIFY_ZP(zp);
- if (protp)
- *protp = PROT_ALL;
+ err = zfs_fillpage(ip, pl, nr_pages);
- /*
- * Loop through the requested range [off, off + len) looking
- * for pages. If we don't find a page, we will need to create
- * a new page and fill it with data from the file.
- */
- while (len > 0) {
- if (*pl = page_lookup(vp, off, SE_SHARED))
- *(pl+1) = NULL;
- else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw))
- goto out;
- while (*pl) {
- ASSERT3U((*pl)->p_offset, ==, off);
- off += PAGESIZE;
- addr += PAGESIZE;
- if (len > 0) {
- ASSERT3U(len, >=, PAGESIZE);
- len -= PAGESIZE;
- }
- ASSERT3U(plsz, >=, PAGESIZE);
- plsz -= PAGESIZE;
- pl++;
- }
- }
+ if (!err)
+ ZFS_ACCESSTIME_STAMP(zsb, zp);
- /*
- * Fill out the page array with any pages already in the cache.
- */
- while (plsz > 0 &&
- (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) {
- off += PAGESIZE;
- plsz -= PAGESIZE;
- }
-out:
- if (err) {
- /*
- * Release any pages we have previously locked.
- */
- while (pl > pl0)
- page_unlock(*--pl);
- } else {
- ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
- }
-
- *pl = NULL;
-
- ZFS_EXIT(zfsvfs);
+ ZFS_EXIT(zsb);
return (err);
}
+EXPORT_SYMBOL(zfs_getpage);
+#ifdef HAVE_MMAP
/*
* Request a memory map for a section of a file. This code interacts
* with common code and the VM system as follows:
diff --git a/module/zfs/zpl_file.c b/module/zfs/zpl_file.c
index de66ff4b4..0e90b7803 100644
--- a/module/zfs/zpl_file.c
+++ b/module/zfs/zpl_file.c
@@ -254,6 +254,60 @@ zpl_mmap(struct file *filp, struct vm_area_struct *vma)
return (error);
}
+static struct page **
+pages_vector_from_list(struct list_head *pages, unsigned nr_pages)
+{
+ struct page **pl;
+ struct page *t;
+ unsigned page_idx;
+
+ pl = kmalloc(sizeof(*pl) * nr_pages, GFP_NOFS);
+ if (!pl)
+ return ERR_PTR(-ENOMEM);
+
+ page_idx = 0;
+ list_for_each_entry_reverse(t, pages, lru) {
+ pl[page_idx] = t;
+ page_idx++;
+ }
+
+ return pl;
+}
+
+static int
+zpl_readpages(struct file *file, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages)
+{
+ struct inode *ip;
+ struct page **pl;
+ struct page *p, *n;
+ int error;
+
+ ip = mapping->host;
+
+ pl = pages_vector_from_list(pages, nr_pages);
+ if (IS_ERR(pl))
+ return PTR_ERR(pl);
+
+ error = -zfs_getpage(ip, pl, nr_pages);
+ if (error)
+ goto error;
+
+ list_for_each_entry_safe_reverse(p, n, pages, lru) {
+
+ list_del(&p->lru);
+
+ flush_dcache_page(p);
+ SetPageUptodate(p);
+ unlock_page(p);
+ page_cache_release(p);
+ }
+
+error:
+ kfree(pl);
+ return error;
+}
+
/*
* Populate a page with data for the Linux page cache. This function is
* only used to support mmap(2). There will be an identical copy of the
@@ -267,33 +321,34 @@ static int
zpl_readpage(struct file *filp, struct page *pp)
{
struct inode *ip;
- loff_t off, i_size;
- size_t len, wrote;
- cred_t *cr = CRED();
- void *pb;
+ struct page *pl[1];
int error = 0;
ASSERT(PageLocked(pp));
ip = pp->mapping->host;
- off = page_offset(pp);
- i_size = i_size_read(ip);
- ASSERT3S(off, <, i_size);
+ pl[0] = pp;
- crhold(cr);
- len = MIN(PAGE_CACHE_SIZE, i_size - off);
+ error = -zfs_getpage(ip, pl, 1);
- pb = kmap(pp);
+ if (error) {
+ SetPageError(pp);
+ ClearPageUptodate(pp);
+ } else {
+ ClearPageError(pp);
+ SetPageUptodate(pp);
+ flush_dcache_page(pp);
+ }
- /* O_DIRECT is passed to bypass the page cache and avoid deadlock. */
- wrote = zpl_read_common(ip, pb, len, off, UIO_SYSSPACE, O_DIRECT, cr);
- if (wrote != len)
- error = -EIO;
+ unlock_page(pp);
+ return error;
+}
- if (!error && (len < PAGE_CACHE_SIZE))
- memset(pb + len, 0, PAGE_CACHE_SIZE - len);
+int
+zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
+{
+ int error;
- kunmap(pp);
- crfree(cr);
+ error = -zfs_putpage(pp, wbc, data);
if (error) {
SetPageError(pp);
@@ -305,8 +360,13 @@ zpl_readpage(struct file *filp, struct page *pp)
}
unlock_page(pp);
+ return error;
+}
- return (error);
+static int
+zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
+{
+ return write_cache_pages(mapping, wbc, zpl_putpage, mapping);
}
/*
@@ -314,55 +374,18 @@ zpl_readpage(struct file *filp, struct page *pp)
* support mmap(2). Mapped pages may be dirtied by memory operations
* which never call .write(). These dirty pages are kept in sync with
* the ARC buffers via this hook.
- *
- * Currently this function relies on zpl_write_common() and the O_DIRECT
- * flag to push out the page. This works but the more correct way is
- * to update zfs_putapage() to be Linux friendly and use that interface.
*/
static int
zpl_writepage(struct page *pp, struct writeback_control *wbc)
{
- struct inode *ip;
- loff_t off, i_size;
- size_t len, read;
- cred_t *cr = CRED();
- void *pb;
- int error = 0;
-
- ASSERT(PageLocked(pp));
- ip = pp->mapping->host;
- off = page_offset(pp);
- i_size = i_size_read(ip);
-
- crhold(cr);
- len = MIN(PAGE_CACHE_SIZE, i_size - off);
-
- pb = kmap(pp);
-
- /* O_DIRECT is passed to bypass the page cache and avoid deadlock. */
- read = zpl_write_common(ip, pb, len, off, UIO_SYSSPACE, O_DIRECT, cr);
- if (read != len)
- error = -EIO;
-
- kunmap(pp);
- crfree(cr);
-
- if (error) {
- SetPageError(pp);
- ClearPageUptodate(pp);
- } else {
- ClearPageError(pp);
- SetPageUptodate(pp);
- }
-
- unlock_page(pp);
-
- return (error);
+ return zpl_putpage(pp, wbc, pp->mapping);
}
const struct address_space_operations zpl_address_space_operations = {
+ .readpages = zpl_readpages,
.readpage = zpl_readpage,
.writepage = zpl_writepage,
+ .writepages = zpl_writepages,
};
const struct file_operations zpl_file_operations = {