summaryrefslogtreecommitdiffstats
path: root/module/zfs/abd.c
diff options
context:
space:
mode:
Diffstat (limited to 'module/zfs/abd.c')
-rw-r--r--module/zfs/abd.c120
1 files changed, 79 insertions, 41 deletions
diff --git a/module/zfs/abd.c b/module/zfs/abd.c
index 9041bd8b1..9f688d9bc 100644
--- a/module/zfs/abd.c
+++ b/module/zfs/abd.c
@@ -72,17 +72,19 @@
* (2) Fragmentation is less of an issue since when we are at the limit of
* allocatable space, we won't have to search around for a long free
* hole in the VA space for large ARC allocations. Each chunk is mapped in
- * individually, so even if we weren't using segkpm (see next point) we
+ * individually, so even if we are using HIGHMEM (see next point) we
* wouldn't need to worry about finding a contiguous address range.
*
- * (3) Use of segkpm will avoid the need for map / unmap / TLB shootdown costs
- * on each ABD access. (If segkpm isn't available then we use all linear
- * ABDs to avoid this penalty.) See seg_kpm.c for more details.
+ * (3) If we are not using HIGHMEM, then all physical memory is always
+ * mapped into the kernel's address space, so we also avoid the map /
+ * unmap costs on each ABD access.
+ *
+ * If we are not using HIGHMEM, scattered buffers which have only one chunk
+ * can be treated as linear buffers, because they are contiguous in the
+ * kernel's virtual address space. See abd_alloc_pages() for details.
*
* It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to
- * B_FALSE. However, it is not possible to use scattered ABDs if segkpm is not
- * available, which is the case on all 32-bit systems and any 64-bit systems
- * where kpm_enable is turned off.
+ * B_FALSE.
*
* In addition to directly allocating a linear or scattered ABD, it is also
* possible to create an ABD by requesting the "sub-ABD" starting at an offset
@@ -249,18 +251,6 @@ abd_chunkcnt_for_bytes(size_t size)
#define __GFP_RECLAIM __GFP_WAIT
#endif
-static unsigned long
-abd_alloc_chunk(int nid, gfp_t gfp, unsigned int order)
-{
- struct page *page;
-
- page = alloc_pages_node(nid, gfp, order);
- if (!page)
- return (0);
-
- return ((unsigned long) page_address(page));
-}
-
/*
* The goal is to minimize fragmentation by preferentially populating ABDs
* with higher order compound pages from a single zone. Allocation size is
@@ -283,19 +273,18 @@ abd_alloc_pages(abd_t *abd, size_t size)
size_t remaining_size;
int nid = NUMA_NO_NODE;
int alloc_pages = 0;
- int order;
INIT_LIST_HEAD(&pages);
while (alloc_pages < nr_pages) {
- unsigned long paddr;
unsigned chunk_pages;
+ int order;
order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order);
chunk_pages = (1U << order);
- paddr = abd_alloc_chunk(nid, order ? gfp_comp : gfp, order);
- if (paddr == 0) {
+ page = alloc_pages_node(nid, order ? gfp_comp : gfp, order);
+ if (page == NULL) {
if (order == 0) {
ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
schedule_timeout_interruptible(1);
@@ -305,7 +294,6 @@ abd_alloc_pages(abd_t *abd, size_t size)
continue;
}
- page = virt_to_page(paddr);
list_add_tail(&page->lru, &pages);
if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid))
@@ -336,7 +324,41 @@ abd_alloc_pages(abd_t *abd, size_t size)
list_del(&page->lru);
}
- if (chunks > 1) {
+ /*
+ * These conditions ensure that a possible transformation to a linear
+ * ABD would be valid.
+ */
+ ASSERT(!PageHighMem(sg_page(table.sgl)));
+ ASSERT0(ABD_SCATTER(abd).abd_offset);
+
+ if (table.nents == 1) {
+ /*
+ * Since there is only one entry, this ABD can be represented
+ * as a linear buffer. All single-page (4K) ABD's can be
+ * represented this way. Some multi-page ABD's can also be
+ * represented this way, if we were able to allocate a single
+ * "chunk" (higher-order "page" which represents a power-of-2
+ * series of physically-contiguous pages). This is often the
+ * case for 2-page (8K) ABD's.
+ *
+ * Representing a single-entry scatter ABD as a linear ABD
+ * has the performance advantage of avoiding the copy (and
+ * allocation) in abd_borrow_buf_copy / abd_return_buf_copy.
+ * A performance increase of around 5% has been observed for
+ * ARC-cached reads (of small blocks which can take advantage
+ * of this).
+ *
+ * Note that this optimization is only possible because the
+ * pages are always mapped into the kernel's address space.
+ * This is not the case for highmem pages, so the
+ * optimization can not be made there.
+ */
+ abd->abd_flags |= ABD_FLAG_LINEAR;
+ abd->abd_flags |= ABD_FLAG_LINEAR_PAGE;
+ abd->abd_u.abd_linear.abd_sgl = table.sgl;
+ abd->abd_u.abd_linear.abd_buf =
+ page_address(sg_page(table.sgl));
+ } else if (table.nents > 1) {
ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
@@ -344,10 +366,10 @@ abd_alloc_pages(abd_t *abd, size_t size)
ABDSTAT_BUMP(abdstat_scatter_page_multi_zone);
abd->abd_flags |= ABD_FLAG_MULTI_ZONE;
}
- }
- ABD_SCATTER(abd).abd_sgl = table.sgl;
- ABD_SCATTER(abd).abd_nents = table.nents;
+ ABD_SCATTER(abd).abd_sgl = table.sgl;
+ ABD_SCATTER(abd).abd_nents = table.nents;
+ }
}
#else
/*
@@ -427,10 +449,6 @@ abd_free_pages(abd_t *abd)
struct page;
-#define kpm_enable 1
-#define abd_alloc_chunk(o) \
- ((struct page *)umem_alloc_aligned(PAGESIZE << (o), 64, KM_SLEEP))
-#define abd_free_chunk(chunk, o) umem_free(chunk, PAGESIZE << (o))
#define zfs_kmap_atomic(chunk, km) ((void *)chunk)
#define zfs_kunmap_atomic(addr, km) do { (void)(addr); } while (0)
#define local_irq_save(flags) do { (void)(flags); } while (0)
@@ -491,7 +509,7 @@ abd_alloc_pages(abd_t *abd, size_t size)
sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages);
abd_for_each_sg(abd, sg, nr_pages, i) {
- struct page *p = abd_alloc_chunk(0);
+ struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
sg_set_page(sg, p, PAGESIZE, 0);
}
ABD_SCATTER(abd).abd_nents = nr_pages;
@@ -502,12 +520,11 @@ abd_free_pages(abd_t *abd)
{
int i, n = ABD_SCATTER(abd).abd_nents;
struct scatterlist *sg;
- int j;
abd_for_each_sg(abd, sg, n, i) {
- for (j = 0; j < sg->length; j += PAGESIZE) {
- struct page *p = nth_page(sg_page(sg), j>>PAGE_SHIFT);
- abd_free_chunk(p, 0);
+ for (int j = 0; j < sg->length; j += PAGESIZE) {
+ struct page *p = nth_page(sg_page(sg), j >> PAGE_SHIFT);
+ umem_free(p, PAGESIZE);
}
}
@@ -560,7 +577,7 @@ abd_verify(abd_t *abd)
ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE |
- ABD_FLAG_MULTI_CHUNK));
+ ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE));
IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
if (abd_is_linear(abd)) {
@@ -613,6 +630,7 @@ abd_alloc(size_t size, boolean_t is_metadata)
abd_t *abd = abd_alloc_struct();
abd->abd_flags = ABD_FLAG_OWNER;
+ abd->abd_u.abd_scatter.abd_offset = 0;
abd_alloc_pages(abd, size);
if (is_metadata) {
@@ -622,8 +640,6 @@ abd_alloc(size_t size, boolean_t is_metadata)
abd->abd_parent = NULL;
zfs_refcount_create(&abd->abd_children);
- abd->abd_u.abd_scatter.abd_offset = 0;
-
ABDSTAT_BUMP(abdstat_scatter_cnt);
ABDSTAT_INCR(abdstat_scatter_data_size, size);
ABDSTAT_INCR(abdstat_scatter_chunk_waste,
@@ -681,6 +697,17 @@ abd_alloc_linear(size_t size, boolean_t is_metadata)
static void
abd_free_linear(abd_t *abd)
{
+ if (abd_is_linear_page(abd)) {
+ /* Transform it back into a scatter ABD for freeing */
+ struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl;
+ abd->abd_flags &= ~ABD_FLAG_LINEAR;
+ abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE;
+ ABD_SCATTER(abd).abd_nents = 1;
+ ABD_SCATTER(abd).abd_offset = 0;
+ ABD_SCATTER(abd).abd_sgl = sg;
+ abd_free_scatter(abd);
+ return;
+ }
if (abd->abd_flags & ABD_FLAG_META) {
zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
} else {
@@ -718,7 +745,8 @@ abd_t *
abd_alloc_sametype(abd_t *sabd, size_t size)
{
boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0;
- if (abd_is_linear(sabd)) {
+ if (abd_is_linear(sabd) &&
+ !abd_is_linear_page(sabd)) {
return (abd_alloc_linear(size, is_metadata));
} else {
return (abd_alloc(size, is_metadata));
@@ -966,6 +994,16 @@ abd_release_ownership_of_buf(abd_t *abd)
{
ASSERT(abd_is_linear(abd));
ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
+
+ /*
+ * abd_free() needs to handle LINEAR_PAGE ABD's specially.
+ * Since that flag does not survive the
+ * abd_release_ownership_of_buf() -> abd_get_from_buf() ->
+ * abd_take_ownership_of_buf() sequence, we don't allow releasing
+ * these "linear but not zio_[data_]buf_alloc()'ed" ABD's.
+ */
+ ASSERT(!abd_is_linear_page(abd));
+
abd_verify(abd);
abd->abd_flags &= ~ABD_FLAG_OWNER;