1 files changed, 79 insertions, 41 deletions
diff --git a/module/zfs/abd.c b/module/zfs/abd.c
index 9041bd8b1..9f688d9bc 100644
--- a/module/zfs/abd.c
+++ b/module/zfs/abd.c
@@ -72,17 +72,19 @@
  *  (2) Fragmentation is less of an issue since when we are at the limit of
  *      allocatable space, we won't have to search around for a long free
  *      hole in the VA space for large ARC allocations. Each chunk is mapped in
- *      individually, so even if we weren't using segkpm (see next point) we
+ *      individually, so even if we were using HIGHMEM (see next point) we
  *      wouldn't need to worry about finding a contiguous address range.
  *
- *  (3) Use of segkpm will avoid the need for map / unmap / TLB shootdown costs
- *      on each ABD access. (If segkpm isn't available then we use all linear
- *      ABDs to avoid this penalty.) See seg_kpm.c for more details.
+ *  (3) If we are not using HIGHMEM, then all physical memory is always
+ *      mapped into the kernel's address space, so we also avoid the map /
+ *      unmap costs on each ABD access.
+ *
+ * If we are not using HIGHMEM, scattered buffers which have only one chunk
+ * can be treated as linear buffers, because they are contiguous in the
+ * kernel's virtual address space.  See abd_alloc_pages() for details.
  *
  * It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to
- * B_FALSE. However, it is not possible to use scattered ABDs if segkpm is not
- * available, which is the case on all 32-bit systems and any 64-bit systems
- * where kpm_enable is turned off.
+ * B_FALSE.
  *
  * In addition to directly allocating a linear or scattered ABD, it is also
  * possible to create an ABD by requesting the "sub-ABD" starting at an offset
@@ -249,18 +251,6 @@ abd_chunkcnt_for_bytes(size_t size)
 #define	__GFP_RECLAIM		__GFP_WAIT
 #endif
 
-static unsigned long
-abd_alloc_chunk(int nid, gfp_t gfp, unsigned int order)
-{
-	struct page *page;
-
-	page = alloc_pages_node(nid, gfp, order);
-	if (!page)
-		return (0);
-
-	return ((unsigned long) page_address(page));
-}
-
 /*
  * The goal is to minimize fragmentation by preferentially populating ABDs
  * with higher order compound pages from a single zone.  Allocation size is
@@ -283,19 +273,18 @@ abd_alloc_pages(abd_t *abd, size_t size)
 	size_t remaining_size;
 	int nid = NUMA_NO_NODE;
 	int alloc_pages = 0;
-	int order;
 
 	INIT_LIST_HEAD(&pages);
 
 	while (alloc_pages < nr_pages) {
-		unsigned long paddr;
 		unsigned chunk_pages;
+		int order;
 
 		order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order);
 		chunk_pages = (1U << order);
 
-		paddr = abd_alloc_chunk(nid, order ? gfp_comp : gfp, order);
-		if (paddr == 0) {
+		page = alloc_pages_node(nid, order ? gfp_comp : gfp, order);
+		if (page == NULL) {
 			if (order == 0) {
 				ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
 				schedule_timeout_interruptible(1);
@@ -305,7 +294,6 @@ abd_alloc_pages(abd_t *abd, size_t size)
 			continue;
 		}
 
-		page = virt_to_page(paddr);
 		list_add_tail(&page->lru, &pages);
 
 		if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid))
@@ -336,7 +324,41 @@ abd_alloc_pages(abd_t *abd, size_t size)
 		list_del(&page->lru);
 	}
 
-	if (chunks > 1) {
+	/*
+	 * These conditions ensure that a possible transformation to a linear
+	 * ABD would be valid.
+	 */
+	ASSERT(!PageHighMem(sg_page(table.sgl)));
+	ASSERT0(ABD_SCATTER(abd).abd_offset);
+
+	if (table.nents == 1) {
+		/*
+		 * Since there is only one entry, this ABD can be represented
+		 * as a linear buffer.  All single-page (4K) ABDs can be
+		 * represented this way.  Some multi-page ABDs can also be
+		 * represented this way, if we were able to allocate a single
+		 * "chunk" (higher-order "page" which represents a power-of-2
+		 * series of physically-contiguous pages).  This is often the
+		 * case for 2-page (8K) ABDs.
+		 *
+		 * Representing a single-entry scatter ABD as a linear ABD
+		 * has the performance advantage of avoiding the copy (and
+		 * allocation) in abd_borrow_buf_copy / abd_return_buf_copy.
+		 * A performance increase of around 5% has been observed for
+		 * ARC-cached reads (of small blocks which can take advantage
+		 * of this).
+		 *
+		 * Note that this optimization is only possible because the
+		 * pages are always mapped into the kernel's address space.
+		 * This is not the case for highmem pages, so the
+		 * optimization cannot be made there.
+		 */
+		abd->abd_flags |= ABD_FLAG_LINEAR;
+		abd->abd_flags |= ABD_FLAG_LINEAR_PAGE;
+		abd->abd_u.abd_linear.abd_sgl = table.sgl;
+		abd->abd_u.abd_linear.abd_buf =
+		    page_address(sg_page(table.sgl));
+	} else if (table.nents > 1) {
 		ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
 		abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
 
@@ -344,10 +366,10 @@ abd_alloc_pages(abd_t *abd, size_t size)
 			ABDSTAT_BUMP(abdstat_scatter_page_multi_zone);
 			abd->abd_flags |= ABD_FLAG_MULTI_ZONE;
 		}
-	}
 
-	ABD_SCATTER(abd).abd_sgl = table.sgl;
-	ABD_SCATTER(abd).abd_nents = table.nents;
+		ABD_SCATTER(abd).abd_sgl = table.sgl;
+		ABD_SCATTER(abd).abd_nents = table.nents;
+	}
 }
 #else
 /*
@@ -427,10 +449,6 @@ abd_free_pages(abd_t *abd)
 
 struct page;
 
-#define	kpm_enable			1
-#define	abd_alloc_chunk(o) \
-	((struct page *)umem_alloc_aligned(PAGESIZE << (o), 64, KM_SLEEP))
-#define	abd_free_chunk(chunk, o)	umem_free(chunk, PAGESIZE << (o))
 #define	zfs_kmap_atomic(chunk, km)	((void *)chunk)
 #define	zfs_kunmap_atomic(addr, km)	do { (void)(addr); } while (0)
 #define	local_irq_save(flags)		do { (void)(flags); } while (0)
@@ -491,7 +509,7 @@ abd_alloc_pages(abd_t *abd, size_t size)
 	sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages);
 
 	abd_for_each_sg(abd, sg, nr_pages, i) {
-		struct page *p = abd_alloc_chunk(0);
+		struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
 		sg_set_page(sg, p, PAGESIZE, 0);
 	}
 	ABD_SCATTER(abd).abd_nents = nr_pages;
@@ -502,12 +520,11 @@ abd_free_pages(abd_t *abd)
 {
 	int i, n = ABD_SCATTER(abd).abd_nents;
 	struct scatterlist *sg;
-	int j;
 
 	abd_for_each_sg(abd, sg, n, i) {
-		for (j = 0; j < sg->length; j += PAGESIZE) {
-			struct page *p = nth_page(sg_page(sg), j>>PAGE_SHIFT);
-			abd_free_chunk(p, 0);
+		for (int j = 0; j < sg->length; j += PAGESIZE) {
+			struct page *p = nth_page(sg_page(sg), j >> PAGE_SHIFT);
+			umem_free(p, PAGESIZE);
 		}
 	}
 
@@ -560,7 +577,7 @@ abd_verify(abd_t *abd)
 	ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
 	ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
 	    ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE |
-	    ABD_FLAG_MULTI_CHUNK));
+	    ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE));
 	IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
 	IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
 	if (abd_is_linear(abd)) {
@@ -613,6 +630,7 @@ abd_alloc(size_t size, boolean_t is_metadata)
 
 	abd_t *abd = abd_alloc_struct();
 	abd->abd_flags = ABD_FLAG_OWNER;
+	abd->abd_u.abd_scatter.abd_offset = 0;
 	abd_alloc_pages(abd, size);
 
 	if (is_metadata) {
@@ -622,8 +640,6 @@ abd_alloc(size_t size, boolean_t is_metadata)
 	abd->abd_parent = NULL;
 	zfs_refcount_create(&abd->abd_children);
 
-	abd->abd_u.abd_scatter.abd_offset = 0;
-
 	ABDSTAT_BUMP(abdstat_scatter_cnt);
 	ABDSTAT_INCR(abdstat_scatter_data_size, size);
 	ABDSTAT_INCR(abdstat_scatter_chunk_waste,
@@ -681,6 +697,17 @@ abd_alloc_linear(size_t size, boolean_t is_metadata)
 static void
 abd_free_linear(abd_t *abd)
 {
+	if (abd_is_linear_page(abd)) {
+		/* Transform it back into a scatter ABD for freeing */
+		struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl;
+		abd->abd_flags &= ~ABD_FLAG_LINEAR;
+		abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE;
+		ABD_SCATTER(abd).abd_nents = 1;
+		ABD_SCATTER(abd).abd_offset = 0;
+		ABD_SCATTER(abd).abd_sgl = sg;
+		abd_free_scatter(abd);
+		return;
+	}
 	if (abd->abd_flags & ABD_FLAG_META) {
 		zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
 	} else {
@@ -718,7 +745,8 @@ abd_t *
 abd_alloc_sametype(abd_t *sabd, size_t size)
 {
 	boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0;
-	if (abd_is_linear(sabd)) {
+	if (abd_is_linear(sabd) &&
+	    !abd_is_linear_page(sabd)) {
 		return (abd_alloc_linear(size, is_metadata));
 	} else {
 		return (abd_alloc(size, is_metadata));
@@ -966,6 +994,16 @@ abd_release_ownership_of_buf(abd_t *abd)
 {
 	ASSERT(abd_is_linear(abd));
 	ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
+
+	/*
+	 * abd_free() needs to handle LINEAR_PAGE ABDs specially.
+	 * Since that flag does not survive the
+	 * abd_release_ownership_of_buf() -> abd_get_from_buf() ->
+	 * abd_take_ownership_of_buf() sequence, we don't allow releasing
+	 * these "linear but not zio_[data_]buf_alloc()'ed" ABDs.
+	 */
+	ASSERT(!abd_is_linear_page(abd));
+
 	abd_verify(abd);
 
 	abd->abd_flags &= ~ABD_FLAG_OWNER;