From 5662fd57941d020e23160b271dc27f254fb5a3c6 Mon Sep 17 00:00:00 2001
From: Matthew Ahrens <mahrens@delphix.com>
Date: Tue, 11 Jun 2019 09:02:31 -0700
Subject: single-chunk scatter ABDs can be treated as linear

Scatter ABDs are allocated from a number of pages.  Unlike the buffer
of a linear ABD, these pages are disjoint in the kernel's virtual address
space, so they can't be accessed as a contiguous buffer.  Therefore
routines that need a linear buffer (e.g. abd_borrow_buf() and friends)
must allocate a separate linear buffer (with zio_buf_alloc()), and copy
the contents of the pages to/from the linear buffer.  This can have a
measurable performance overhead on some workloads.

https://github.com/zfsonlinux/zfs/commit/87c25d567fb7969b44c7d8af63990e
("abd_alloc should use scatter for >1K allocations") increased the use
of scatter ABDs, specifically switching 1.5K through 4K (inclusive)
buffers from linear to scatter.  For workloads that access blocks whose
compressed sizes are in this range, that commit introduced an additional
copy into the read code path.  For example, the performance of the
sequential_reads_arc_cached tests in the test suite dropped by around
5% (these tests read 8K-logical blocks, compressed to 3K, which are
cached in the ARC).

This commit treats single-chunk scattered buffers as linear buffers,
because they are contiguous in the kernel's virtual address space.

All single-page (4K) ABDs can be represented this way.  Some multi-page
ABDs can also be represented this way, if we were able to allocate a
single "chunk" (higher-order "page" which represents a power-of-2 series
of physically-contiguous pages).  This is often the case for 2-page (8K)
ABDs.

Representing a single-entry scatter ABD as a linear ABD has the
performance advantage of avoiding the copy (and allocation) in
abd_borrow_buf_copy / abd_return_buf_copy.  A performance increase of
around 5% has been observed for ARC-cached reads (of small blocks which
can take advantage of this), fixing the regression introduced by
87c25d567.

Note that this optimization is only possible when all physical memory
is always mapped into the kernel's address space.  This is not the case
for HIGHMEM pages, so the optimization cannot be made on 32-bit
systems.

Reviewed-by: Chunwei Chen <tuxoko@gmail.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Closes #8580
---
 module/zfs/abd.c | 120 ++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 79 insertions(+), 41 deletions(-)

(limited to 'module/zfs/abd.c')

diff --git a/module/zfs/abd.c b/module/zfs/abd.c
index 9041bd8b1..9f688d9bc 100644
--- a/module/zfs/abd.c
+++ b/module/zfs/abd.c
@@ -72,17 +72,19 @@
  *  (2) Fragmentation is less of an issue since when we are at the limit of
  *      allocatable space, we won't have to search around for a long free
  *      hole in the VA space for large ARC allocations. Each chunk is mapped in
- *      individually, so even if we weren't using segkpm (see next point) we
+ *      individually, so even if we are using HIGHMEM (see next point) we
  *      wouldn't need to worry about finding a contiguous address range.
  *
- *  (3) Use of segkpm will avoid the need for map / unmap / TLB shootdown costs
- *      on each ABD access. (If segkpm isn't available then we use all linear
- *      ABDs to avoid this penalty.) See seg_kpm.c for more details.
+ *  (3) If we are not using HIGHMEM, then all physical memory is always
+ *      mapped into the kernel's address space, so we also avoid the map /
+ *      unmap costs on each ABD access.
+ *
+ * If we are not using HIGHMEM, scattered buffers which have only one chunk
+ * can be treated as linear buffers, because they are contiguous in the
+ * kernel's virtual address space.  See abd_alloc_pages() for details.
  *
  * It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to
- * B_FALSE. However, it is not possible to use scattered ABDs if segkpm is not
- * available, which is the case on all 32-bit systems and any 64-bit systems
- * where kpm_enable is turned off.
+ * B_FALSE.
  *
  * In addition to directly allocating a linear or scattered ABD, it is also
  * possible to create an ABD by requesting the "sub-ABD" starting at an offset
@@ -249,18 +251,6 @@ abd_chunkcnt_for_bytes(size_t size)
 #define	__GFP_RECLAIM		__GFP_WAIT
 #endif
 
-static unsigned long
-abd_alloc_chunk(int nid, gfp_t gfp, unsigned int order)
-{
-	struct page *page;
-
-	page = alloc_pages_node(nid, gfp, order);
-	if (!page)
-		return (0);
-
-	return ((unsigned long) page_address(page));
-}
-
 /*
  * The goal is to minimize fragmentation by preferentially populating ABDs
  * with higher order compound pages from a single zone.  Allocation size is
@@ -283,19 +273,18 @@ abd_alloc_pages(abd_t *abd, size_t size)
 	size_t remaining_size;
 	int nid = NUMA_NO_NODE;
 	int alloc_pages = 0;
-	int order;
 
 	INIT_LIST_HEAD(&pages);
 
 	while (alloc_pages < nr_pages) {
-		unsigned long paddr;
 		unsigned chunk_pages;
+		int order;
 
 		order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order);
 		chunk_pages = (1U << order);
 
-		paddr = abd_alloc_chunk(nid, order ? gfp_comp : gfp, order);
-		if (paddr == 0) {
+		page = alloc_pages_node(nid, order ? gfp_comp : gfp, order);
+		if (page == NULL) {
 			if (order == 0) {
 				ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
 				schedule_timeout_interruptible(1);
@@ -305,7 +294,6 @@ abd_alloc_pages(abd_t *abd, size_t size)
 			continue;
 		}
 
-		page = virt_to_page(paddr);
 		list_add_tail(&page->lru, &pages);
 
 		if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid))
@@ -336,7 +324,41 @@ abd_alloc_pages(abd_t *abd, size_t size)
 		list_del(&page->lru);
 	}
 
-	if (chunks > 1) {
+	/*
+	 * These conditions ensure that a possible transformation to a linear
+	 * ABD would be valid.
+	 */
+	ASSERT(!PageHighMem(sg_page(table.sgl)));
+	ASSERT0(ABD_SCATTER(abd).abd_offset);
+
+	if (table.nents == 1) {
+		/*
+		 * Since there is only one entry, this ABD can be represented
+		 * as a linear buffer.  All single-page (4K) ABDs can be
+		 * represented this way.  Some multi-page ABDs can also be
+		 * represented this way, if we were able to allocate a single
+		 * "chunk" (higher-order "page" which represents a power-of-2
+		 * series of physically-contiguous pages).  This is often the
+		 * case for 2-page (8K) ABDs.
+		 *
+		 * Representing a single-entry scatter ABD as a linear ABD
+		 * has the performance advantage of avoiding the copy (and
+		 * allocation) in abd_borrow_buf_copy / abd_return_buf_copy.
+		 * A performance increase of around 5% has been observed for
+		 * ARC-cached reads (of small blocks which can take advantage
+		 * of this).
+		 *
+		 * Note that this optimization is only possible because the
+		 * pages are always mapped into the kernel's address space.
+		 * This is not the case for highmem pages, so the
+		 * optimization cannot be made there.
+		 */
+		abd->abd_flags |= ABD_FLAG_LINEAR;
+		abd->abd_flags |= ABD_FLAG_LINEAR_PAGE;
+		abd->abd_u.abd_linear.abd_sgl = table.sgl;
+		abd->abd_u.abd_linear.abd_buf =
+		    page_address(sg_page(table.sgl));
+	} else if (table.nents > 1) {
 		ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
 		abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
 
@@ -344,10 +366,10 @@ abd_alloc_pages(abd_t *abd, size_t size)
 			ABDSTAT_BUMP(abdstat_scatter_page_multi_zone);
 			abd->abd_flags |= ABD_FLAG_MULTI_ZONE;
 		}
-	}
 
-	ABD_SCATTER(abd).abd_sgl = table.sgl;
-	ABD_SCATTER(abd).abd_nents = table.nents;
+		ABD_SCATTER(abd).abd_sgl = table.sgl;
+		ABD_SCATTER(abd).abd_nents = table.nents;
+	}
 }
 #else
 /*
@@ -427,10 +449,6 @@ abd_free_pages(abd_t *abd)
 
 struct page;
 
-#define	kpm_enable			1
-#define	abd_alloc_chunk(o) \
-	((struct page *)umem_alloc_aligned(PAGESIZE << (o), 64, KM_SLEEP))
-#define	abd_free_chunk(chunk, o)	umem_free(chunk, PAGESIZE << (o))
 #define	zfs_kmap_atomic(chunk, km)	((void *)chunk)
 #define	zfs_kunmap_atomic(addr, km)	do { (void)(addr); } while (0)
 #define	local_irq_save(flags)		do { (void)(flags); } while (0)
@@ -491,7 +509,7 @@ abd_alloc_pages(abd_t *abd, size_t size)
 	sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages);
 
 	abd_for_each_sg(abd, sg, nr_pages, i) {
-		struct page *p = abd_alloc_chunk(0);
+		struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
 		sg_set_page(sg, p, PAGESIZE, 0);
 	}
 	ABD_SCATTER(abd).abd_nents = nr_pages;
@@ -502,12 +520,11 @@ abd_free_pages(abd_t *abd)
 {
 	int i, n = ABD_SCATTER(abd).abd_nents;
 	struct scatterlist *sg;
-	int j;
 
 	abd_for_each_sg(abd, sg, n, i) {
-		for (j = 0; j < sg->length; j += PAGESIZE) {
-			struct page *p = nth_page(sg_page(sg), j>>PAGE_SHIFT);
-			abd_free_chunk(p, 0);
+		for (int j = 0; j < sg->length; j += PAGESIZE) {
+			struct page *p = nth_page(sg_page(sg), j >> PAGE_SHIFT);
+			umem_free(p, PAGESIZE);
 		}
 	}
 
@@ -560,7 +577,7 @@ abd_verify(abd_t *abd)
 	ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
 	ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
 	    ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE |
-	    ABD_FLAG_MULTI_CHUNK));
+	    ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE));
 	IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
 	IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
 	if (abd_is_linear(abd)) {
@@ -613,6 +630,7 @@ abd_alloc(size_t size, boolean_t is_metadata)
 
 	abd_t *abd = abd_alloc_struct();
 	abd->abd_flags = ABD_FLAG_OWNER;
+	abd->abd_u.abd_scatter.abd_offset = 0;
 	abd_alloc_pages(abd, size);
 
 	if (is_metadata) {
@@ -622,8 +640,6 @@ abd_alloc(size_t size, boolean_t is_metadata)
 	abd->abd_parent = NULL;
 	zfs_refcount_create(&abd->abd_children);
 
-	abd->abd_u.abd_scatter.abd_offset = 0;
-
 	ABDSTAT_BUMP(abdstat_scatter_cnt);
 	ABDSTAT_INCR(abdstat_scatter_data_size, size);
 	ABDSTAT_INCR(abdstat_scatter_chunk_waste,
@@ -681,6 +697,17 @@ abd_alloc_linear(size_t size, boolean_t is_metadata)
 static void
 abd_free_linear(abd_t *abd)
 {
+	if (abd_is_linear_page(abd)) {
+		/* Transform it back into a scatter ABD for freeing */
+		struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl;
+		abd->abd_flags &= ~ABD_FLAG_LINEAR;
+		abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE;
+		ABD_SCATTER(abd).abd_nents = 1;
+		ABD_SCATTER(abd).abd_offset = 0;
+		ABD_SCATTER(abd).abd_sgl = sg;
+		abd_free_scatter(abd);
+		return;
+	}
 	if (abd->abd_flags & ABD_FLAG_META) {
 		zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
 	} else {
@@ -718,7 +745,8 @@ abd_t *
 abd_alloc_sametype(abd_t *sabd, size_t size)
 {
 	boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0;
-	if (abd_is_linear(sabd)) {
+	if (abd_is_linear(sabd) &&
+	    !abd_is_linear_page(sabd)) {
 		return (abd_alloc_linear(size, is_metadata));
 	} else {
 		return (abd_alloc(size, is_metadata));
@@ -966,6 +994,16 @@ abd_release_ownership_of_buf(abd_t *abd)
 {
 	ASSERT(abd_is_linear(abd));
 	ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
+
+	/*
+	 * abd_free() needs to handle LINEAR_PAGE ABDs specially.
+	 * Since that flag does not survive the
+	 * abd_release_ownership_of_buf() -> abd_get_from_buf() ->
+	 * abd_take_ownership_of_buf() sequence, we don't allow releasing
+	 * these "linear but not zio_[data_]buf_alloc()'ed" ABDs.
+	 */
+	ASSERT(!abd_is_linear_page(abd));
+
 	abd_verify(abd);
 
 	abd->abd_flags &= ~ABD_FLAG_OWNER;
-- 
cgit v1.2.3