4 files changed, 103 insertions, 55 deletions
diff --git a/include/sys/abd.h b/include/sys/abd.h
index 3d9fdbf10..b781be4da 100644
--- a/include/sys/abd.h
+++ b/include/sys/abd.h
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
- * Copyright (c) 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2016, 2019 by Delphix. All rights reserved.
  */
 
 #ifndef _ABD_H
@@ -44,7 +44,8 @@ typedef enum abd_flags {
 	ABD_FLAG_OWNER	= 1 << 1,	/* does it own its data buffers? */
 	ABD_FLAG_META	= 1 << 2,	/* does this represent FS metadata? */
 	ABD_FLAG_MULTI_ZONE  = 1 << 3,	/* pages split over memory zones */
-	ABD_FLAG_MULTI_CHUNK = 1 << 4	/* pages split over multiple chunks */
+	ABD_FLAG_MULTI_CHUNK = 1 << 4,	/* pages split over multiple chunks */
+	ABD_FLAG_LINEAR_PAGE = 1 << 5,	/* linear but allocd from page */
 } abd_flags_t;
 
 typedef struct abd {
@@ -60,6 +61,7 @@ typedef struct abd {
 		} abd_scatter;
 		struct abd_linear {
 			void		*abd_buf;
+			struct scatterlist *abd_sgl; /* for LINEAR_PAGE */
 		} abd_linear;
 	} abd_u;
 } abd_t;
@@ -75,6 +77,13 @@ abd_is_linear(abd_t *abd)
 	return ((abd->abd_flags & ABD_FLAG_LINEAR) != 0 ? B_TRUE : B_FALSE);
 }
 
+static inline boolean_t
+abd_is_linear_page(abd_t *abd)
+{
+	return ((abd->abd_flags & ABD_FLAG_LINEAR_PAGE) != 0 ?
+	    B_TRUE : B_FALSE);
+}
+
 /*
  * Allocations and deallocations
  */
diff --git a/module/zfs/abd.c b/module/zfs/abd.c
index 9041bd8b1..9f688d9bc 100644
--- a/module/zfs/abd.c
+++ b/module/zfs/abd.c
@@ -72,17 +72,19 @@
  *  (2) Fragmentation is less of an issue since when we are at the limit of
  *      allocatable space, we won't have to search around for a long free
  *      hole in the VA space for large ARC allocations. Each chunk is mapped in
- *      individually, so even if we weren't using segkpm (see next point) we
+ *      individually, so even if we were using HIGHMEM (see next point) we
  *      wouldn't need to worry about finding a contiguous address range.
  *
- *  (3) Use of segkpm will avoid the need for map / unmap / TLB shootdown costs
- *      on each ABD access. (If segkpm isn't available then we use all linear
- *      ABDs to avoid this penalty.) See seg_kpm.c for more details.
+ *  (3) If we are not using HIGHMEM, then all physical memory is always
+ *      mapped into the kernel's address space, so we also avoid the map /
+ *      unmap costs on each ABD access.
+ *
+ * If we are not using HIGHMEM, scattered buffers which have only one chunk
+ * can be treated as linear buffers, because they are contiguous in the
+ * kernel's virtual address space.  See abd_alloc_pages() for details.
  *
  * It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to
- * B_FALSE. However, it is not possible to use scattered ABDs if segkpm is not
- * available, which is the case on all 32-bit systems and any 64-bit systems
- * where kpm_enable is turned off.
+ * B_FALSE.
  *
  * In addition to directly allocating a linear or scattered ABD, it is also
  * possible to create an ABD by requesting the "sub-ABD" starting at an offset
@@ -249,18 +251,6 @@ abd_chunkcnt_for_bytes(size_t size)
 #define	__GFP_RECLAIM		__GFP_WAIT
 #endif
 
-static unsigned long
-abd_alloc_chunk(int nid, gfp_t gfp, unsigned int order)
-{
-	struct page *page;
-
-	page = alloc_pages_node(nid, gfp, order);
-	if (!page)
-		return (0);
-
-	return ((unsigned long) page_address(page));
-}
-
 /*
  * The goal is to minimize fragmentation by preferentially populating ABDs
  * with higher order compound pages from a single zone.  Allocation size is
@@ -283,19 +273,18 @@ abd_alloc_pages(abd_t *abd, size_t size)
 	size_t remaining_size;
 	int nid = NUMA_NO_NODE;
 	int alloc_pages = 0;
-	int order;
 
 	INIT_LIST_HEAD(&pages);
 
 	while (alloc_pages < nr_pages) {
-		unsigned long paddr;
 		unsigned chunk_pages;
+		int order;
 
 		order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order);
 		chunk_pages = (1U << order);
 
-		paddr = abd_alloc_chunk(nid, order ? gfp_comp : gfp, order);
-		if (paddr == 0) {
+		page = alloc_pages_node(nid, order ? gfp_comp : gfp, order);
+		if (page == NULL) {
 			if (order == 0) {
 				ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
 				schedule_timeout_interruptible(1);
@@ -305,7 +294,6 @@ abd_alloc_pages(abd_t *abd, size_t size)
 			continue;
 		}
 
-		page = virt_to_page(paddr);
 		list_add_tail(&page->lru, &pages);
 
 		if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid))
@@ -336,7 +324,41 @@ abd_alloc_pages(abd_t *abd, size_t size)
 		list_del(&page->lru);
 	}
 
-	if (chunks > 1) {
+	/*
+	 * These conditions ensure that a possible transformation to a linear
+	 * ABD would be valid.
+	 */
+	ASSERT(!PageHighMem(sg_page(table.sgl)));
+	ASSERT0(ABD_SCATTER(abd).abd_offset);
+
+	if (table.nents == 1) {
+		/*
+		 * Since there is only one entry, this ABD can be represented
+		 * as a linear buffer.  All single-page (4K) ABDs can be
+		 * represented this way.  Some multi-page ABDs can also be
+		 * represented this way, if we were able to allocate a single
+		 * "chunk" (higher-order "page" which represents a power-of-2
+		 * series of physically-contiguous pages).  This is often the
+		 * case for 2-page (8K) ABDs.
+		 *
+		 * Representing a single-entry scatter ABD as a linear ABD
+		 * has the performance advantage of avoiding the copy (and
+		 * allocation) in abd_borrow_buf_copy / abd_return_buf_copy.
+		 * A performance increase of around 5% has been observed for
+		 * ARC-cached reads (of small blocks which can take advantage
+		 * of this).
+		 *
+		 * Note that this optimization is only possible because the
+		 * pages are always mapped into the kernel's address space.
+		 * This is not the case for highmem pages, so the
+		 * optimization cannot be made there.
+		 */
+		abd->abd_flags |= ABD_FLAG_LINEAR;
+		abd->abd_flags |= ABD_FLAG_LINEAR_PAGE;
+		abd->abd_u.abd_linear.abd_sgl = table.sgl;
+		abd->abd_u.abd_linear.abd_buf =
+		    page_address(sg_page(table.sgl));
+	} else if (table.nents > 1) {
 		ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
 		abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
 
@@ -344,10 +366,10 @@ abd_alloc_pages(abd_t *abd, size_t size)
 			ABDSTAT_BUMP(abdstat_scatter_page_multi_zone);
 			abd->abd_flags |= ABD_FLAG_MULTI_ZONE;
 		}
-	}
 
-	ABD_SCATTER(abd).abd_sgl = table.sgl;
-	ABD_SCATTER(abd).abd_nents = table.nents;
+		ABD_SCATTER(abd).abd_sgl = table.sgl;
+		ABD_SCATTER(abd).abd_nents = table.nents;
+	}
 }
 #else
 /*
@@ -427,10 +449,6 @@ abd_free_pages(abd_t *abd)
 
 struct page;
 
-#define	kpm_enable			1
-#define	abd_alloc_chunk(o) \
-	((struct page *)umem_alloc_aligned(PAGESIZE << (o), 64, KM_SLEEP))
-#define	abd_free_chunk(chunk, o)	umem_free(chunk, PAGESIZE << (o))
 #define	zfs_kmap_atomic(chunk, km)	((void *)chunk)
 #define	zfs_kunmap_atomic(addr, km)	do { (void)(addr); } while (0)
 #define	local_irq_save(flags)		do { (void)(flags); } while (0)
@@ -491,7 +509,7 @@ abd_alloc_pages(abd_t *abd, size_t size)
 	sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages);
 
 	abd_for_each_sg(abd, sg, nr_pages, i) {
-		struct page *p = abd_alloc_chunk(0);
+		struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
 		sg_set_page(sg, p, PAGESIZE, 0);
 	}
 	ABD_SCATTER(abd).abd_nents = nr_pages;
@@ -502,12 +520,11 @@ abd_free_pages(abd_t *abd)
 {
 	int i, n = ABD_SCATTER(abd).abd_nents;
 	struct scatterlist *sg;
-	int j;
 
 	abd_for_each_sg(abd, sg, n, i) {
-		for (j = 0; j < sg->length; j += PAGESIZE) {
-			struct page *p = nth_page(sg_page(sg), j>>PAGE_SHIFT);
-			abd_free_chunk(p, 0);
+		for (int j = 0; j < sg->length; j += PAGESIZE) {
+			struct page *p = nth_page(sg_page(sg), j >> PAGE_SHIFT);
+			umem_free(p, PAGESIZE);
 		}
 	}
 
@@ -560,7 +577,7 @@ abd_verify(abd_t *abd)
 	ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
 	ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
 	    ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE |
-	    ABD_FLAG_MULTI_CHUNK));
+	    ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE));
 	IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
 	IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
 	if (abd_is_linear(abd)) {
@@ -613,6 +630,7 @@ abd_alloc(size_t size, boolean_t is_metadata)
 
 	abd_t *abd = abd_alloc_struct();
 	abd->abd_flags = ABD_FLAG_OWNER;
+	abd->abd_u.abd_scatter.abd_offset = 0;
 	abd_alloc_pages(abd, size);
 
 	if (is_metadata) {
@@ -622,8 +640,6 @@ abd_alloc(size_t size, boolean_t is_metadata)
 	abd->abd_parent = NULL;
 	zfs_refcount_create(&abd->abd_children);
 
-	abd->abd_u.abd_scatter.abd_offset = 0;
-
 	ABDSTAT_BUMP(abdstat_scatter_cnt);
 	ABDSTAT_INCR(abdstat_scatter_data_size, size);
 	ABDSTAT_INCR(abdstat_scatter_chunk_waste,
@@ -681,6 +697,17 @@ abd_alloc_linear(size_t size, boolean_t is_metadata)
 static void
 abd_free_linear(abd_t *abd)
 {
+	if (abd_is_linear_page(abd)) {
+		/* Transform it back into a scatter ABD for freeing */
+		struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl;
+		abd->abd_flags &= ~ABD_FLAG_LINEAR;
+		abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE;
+		ABD_SCATTER(abd).abd_nents = 1;
+		ABD_SCATTER(abd).abd_offset = 0;
+		ABD_SCATTER(abd).abd_sgl = sg;
+		abd_free_scatter(abd);
+		return;
+	}
 	if (abd->abd_flags & ABD_FLAG_META) {
 		zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
 	} else {
@@ -718,7 +745,8 @@ abd_t *
 abd_alloc_sametype(abd_t *sabd, size_t size)
 {
 	boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0;
-	if (abd_is_linear(sabd)) {
+	if (abd_is_linear(sabd) &&
+	    !abd_is_linear_page(sabd)) {
 		return (abd_alloc_linear(size, is_metadata));
 	} else {
 		return (abd_alloc(size, is_metadata));
@@ -966,6 +994,16 @@ abd_release_ownership_of_buf(abd_t *abd)
 {
 	ASSERT(abd_is_linear(abd));
 	ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
+
+	/*
+	 * abd_free() needs to handle LINEAR_PAGE ABDs specially.
+	 * Since that flag does not survive the
+	 * abd_release_ownership_of_buf() -> abd_get_from_buf() ->
+	 * abd_take_ownership_of_buf() sequence, we don't allow releasing
+	 * these "linear but not zio_[data_]buf_alloc()'ed" ABDs.
+	 */
+	ASSERT(!abd_is_linear_page(abd));
+
 	abd_verify(abd);
 
 	abd->abd_flags &= ~ABD_FLAG_OWNER;
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 946ea3415..c1ffe1f24 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -2917,7 +2917,8 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb,
 	/*
 	 * If the hdr's data can be shared then we share the data buffer and
 	 * set the appropriate bit in the hdr's b_flags to indicate the hdr is
-	 * allocate a new buffer to store the buf's data.
+	 * sharing its b_pabd with the arc_buf_t. Otherwise, we allocate a new
+	 * buffer to store the buf's data.
 	 *
 	 * There are two additional restrictions here because we're sharing
 	 * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be
@@ -2925,10 +2926,17 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb,
 	 * an arc_write() then the hdr's data buffer will be released when the
 	 * write completes, even though the L2ARC write might still be using it.
 	 * Second, the hdr's ABD must be linear so that the buf's user doesn't
-	 * need to be ABD-aware.
-	 */
-	boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) &&
-	    hdr->b_l1hdr.b_pabd != NULL && abd_is_linear(hdr->b_l1hdr.b_pabd);
+	 * need to be ABD-aware.  It must be allocated via
+	 * zio_[data_]buf_alloc(), not as a page, because we need to be able
+	 * to abd_release_ownership_of_buf(), which isn't allowed on "linear
+	 * page" buffers because the ABD code needs to handle freeing them
+	 * specially.
+	 */
+	boolean_t can_share = arc_can_share(hdr, buf) &&
+	    !HDR_L2_WRITING(hdr) &&
+	    hdr->b_l1hdr.b_pabd != NULL &&
+	    abd_is_linear(hdr->b_l1hdr.b_pabd) &&
+	    !abd_is_linear_page(hdr->b_l1hdr.b_pabd);
 
 	/* Set up b_data and sharing */
 	if (can_share) {
@@ -3731,7 +3739,6 @@ arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize,
 		 * disk, it's easiest if we just set up sharing between the
 		 * buf and the hdr.
 		 */
-		ASSERT(!abd_is_linear(hdr->b_l1hdr.b_pabd));
 		arc_hdr_free_abd(hdr, B_FALSE);
 		arc_share_buf(hdr, buf);
 	}
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 016ac07ea..80a2dbc82 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -331,12 +331,6 @@ zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize,
 {
 	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
 
-	/*
-	 * Ensure that anyone expecting this zio to contain a linear ABD isn't
-	 * going to get a nasty surprise when they try to access the data.
-	 */
-	IMPLY(abd_is_linear(zio->io_abd), abd_is_linear(data));
-
 	zt->zt_orig_abd = zio->io_abd;
 	zt->zt_orig_size = zio->io_size;
 	zt->zt_bufsize = bufsize;