aboutsummaryrefslogtreecommitdiffstats
path: root/module/zfs
diff options
context:
space:
mode:
Diffstat (limited to 'module/zfs')
-rw-r--r--module/zfs/Makefile.in43
-rw-r--r--module/zfs/abd.c1638
-rw-r--r--module/zfs/gzip.c2
-rw-r--r--module/zfs/policy.c355
-rw-r--r--module/zfs/qat.c105
-rw-r--r--module/zfs/qat.h204
-rw-r--r--module/zfs/qat_compress.c574
-rw-r--r--module/zfs/qat_crypt.c631
-rw-r--r--module/zfs/sha256.c2
-rw-r--r--module/zfs/spa_misc.c2
-rw-r--r--module/zfs/spa_stats.c1034
-rw-r--r--module/zfs/vdev_disk.c954
-rw-r--r--module/zfs/vdev_file.c331
-rw-r--r--module/zfs/zfs_acl.c2816
-rw-r--r--module/zfs/zfs_ctldir.c1240
-rw-r--r--module/zfs/zfs_debug.c253
-rw-r--r--module/zfs/zfs_dir.c1205
-rw-r--r--module/zfs/zfs_sysfs.c661
-rw-r--r--module/zfs/zfs_vfsops.c2562
-rw-r--r--module/zfs/zfs_vnops.c5275
-rw-r--r--module/zfs/zfs_znode.c2234
-rw-r--r--module/zfs/zio_crypt.c2036
-rw-r--r--module/zfs/zpl_ctldir.c572
-rw-r--r--module/zfs/zpl_export.c177
-rw-r--r--module/zfs/zpl_file.c1075
-rw-r--r--module/zfs/zpl_inode.c826
-rw-r--r--module/zfs/zpl_super.c426
-rw-r--r--module/zfs/zpl_xattr.c1548
28 files changed, 14 insertions, 28767 deletions
diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in
index 5adea9fb5..7c560fad7 100644
--- a/module/zfs/Makefile.in
+++ b/module/zfs/Makefile.in
@@ -16,18 +16,17 @@ endif
# Suppress unused but set variable warnings often due to ASSERTs
ccflags-y += $(NO_UNUSED_BUT_SET_VARIABLE)
-$(MODULE)-objs += abd.o
$(MODULE)-objs += aggsum.o
$(MODULE)-objs += arc.o
$(MODULE)-objs += blkptr.o
$(MODULE)-objs += bplist.o
$(MODULE)-objs += bpobj.o
-$(MODULE)-objs += cityhash.o
-$(MODULE)-objs += dbuf.o
-$(MODULE)-objs += dbuf_stats.o
$(MODULE)-objs += bptree.o
$(MODULE)-objs += bqueue.o
+$(MODULE)-objs += cityhash.o
$(MODULE)-objs += dataset_kstats.o
+$(MODULE)-objs += dbuf.o
+$(MODULE)-objs += dbuf_stats.o
$(MODULE)-objs += ddt.o
$(MODULE)-objs += ddt_zap.o
$(MODULE)-objs += dmu.o
@@ -42,28 +41,29 @@ $(MODULE)-objs += dmu_tx.o
$(MODULE)-objs += dmu_zfetch.o
$(MODULE)-objs += dnode.o
$(MODULE)-objs += dnode_sync.o
+$(MODULE)-objs += dsl_bookmark.o
+$(MODULE)-objs += dsl_crypt.o
$(MODULE)-objs += dsl_dataset.o
$(MODULE)-objs += dsl_deadlist.o
$(MODULE)-objs += dsl_deleg.o
-$(MODULE)-objs += dsl_bookmark.o
+$(MODULE)-objs += dsl_destroy.o
$(MODULE)-objs += dsl_dir.o
-$(MODULE)-objs += dsl_crypt.o
$(MODULE)-objs += dsl_pool.o
$(MODULE)-objs += dsl_prop.o
$(MODULE)-objs += dsl_scan.o
$(MODULE)-objs += dsl_synctask.o
+$(MODULE)-objs += dsl_userhold.o
$(MODULE)-objs += edonr_zfs.o
$(MODULE)-objs += fm.o
$(MODULE)-objs += gzip.o
$(MODULE)-objs += hkdf.o
-$(MODULE)-objs += lzjb.o
$(MODULE)-objs += lz4.o
+$(MODULE)-objs += lzjb.o
$(MODULE)-objs += metaslab.o
$(MODULE)-objs += mmp.o
$(MODULE)-objs += multilist.o
$(MODULE)-objs += objlist.o
$(MODULE)-objs += pathname.o
-$(MODULE)-objs += policy.o
$(MODULE)-objs += range_tree.o
$(MODULE)-objs += refcount.o
$(MODULE)-objs += rrwlock.o
@@ -78,17 +78,14 @@ $(MODULE)-objs += spa_errlog.o
$(MODULE)-objs += spa_history.o
$(MODULE)-objs += spa_log_spacemap.o
$(MODULE)-objs += spa_misc.o
-$(MODULE)-objs += spa_stats.o
$(MODULE)-objs += space_map.o
$(MODULE)-objs += space_reftree.o
-$(MODULE)-objs += txg.o
$(MODULE)-objs += trace.o
+$(MODULE)-objs += txg.o
$(MODULE)-objs += uberblock.o
$(MODULE)-objs += unique.o
$(MODULE)-objs += vdev.o
$(MODULE)-objs += vdev_cache.o
-$(MODULE)-objs += vdev_disk.o
-$(MODULE)-objs += vdev_file.o
$(MODULE)-objs += vdev_indirect.o
$(MODULE)-objs += vdev_indirect_births.o
$(MODULE)-objs += vdev_indirect_mapping.o
@@ -112,11 +109,7 @@ $(MODULE)-objs += zcp_global.o
$(MODULE)-objs += zcp_iter.o
$(MODULE)-objs += zcp_synctask.o
$(MODULE)-objs += zfeature.o
-$(MODULE)-objs += zfs_acl.o
$(MODULE)-objs += zfs_byteswap.o
-$(MODULE)-objs += zfs_ctldir.o
-$(MODULE)-objs += zfs_debug.o
-$(MODULE)-objs += zfs_dir.o
$(MODULE)-objs += zfs_fm.o
$(MODULE)-objs += zfs_fuid.o
$(MODULE)-objs += zfs_ioctl.o
@@ -126,31 +119,15 @@ $(MODULE)-objs += zfs_ratelimit.o
$(MODULE)-objs += zfs_replay.o
$(MODULE)-objs += zfs_rlock.o
$(MODULE)-objs += zfs_sa.o
-$(MODULE)-objs += zfs_sysfs.o
-$(MODULE)-objs += zfs_vfsops.o
-$(MODULE)-objs += zfs_vnops.o
-$(MODULE)-objs += zfs_znode.o
$(MODULE)-objs += zil.o
$(MODULE)-objs += zio.o
$(MODULE)-objs += zio_checksum.o
$(MODULE)-objs += zio_compress.o
-$(MODULE)-objs += zio_crypt.o
$(MODULE)-objs += zio_inject.o
$(MODULE)-objs += zle.o
-$(MODULE)-objs += zpl_ctldir.o
-$(MODULE)-objs += zpl_export.o
-$(MODULE)-objs += zpl_file.o
-$(MODULE)-objs += zpl_inode.o
-$(MODULE)-objs += zpl_super.o
-$(MODULE)-objs += zpl_xattr.o
$(MODULE)-objs += zrlock.o
$(MODULE)-objs += zthr.o
$(MODULE)-objs += zvol.o
-$(MODULE)-objs += dsl_destroy.o
-$(MODULE)-objs += dsl_userhold.o
-$(MODULE)-objs += qat.o
-$(MODULE)-objs += qat_compress.o
-$(MODULE)-objs += qat_crypt.o
# Suppress incorrect warnings from versions of objtool which are not
# aware of x86 EVEX prefix instructions used for AVX512.
@@ -165,3 +142,5 @@ $(MODULE)-$(CONFIG_X86) += vdev_raidz_math_avx512bw.o
$(MODULE)-$(CONFIG_ARM64) += vdev_raidz_math_aarch64_neon.o
$(MODULE)-$(CONFIG_ARM64) += vdev_raidz_math_aarch64_neonx2.o
+
+-include @abs_top_builddir@/module/os/linux/zfs/Makefile
diff --git a/module/zfs/abd.c b/module/zfs/abd.c
deleted file mode 100644
index ac6b0b742..000000000
--- a/module/zfs/abd.c
+++ /dev/null
@@ -1,1638 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
- * Copyright (c) 2019 by Delphix. All rights reserved.
- */
-
-/*
- * ARC buffer data (ABD).
- *
- * ABDs are an abstract data structure for the ARC which can use two
- * different ways of storing the underlying data:
- *
- * (a) Linear buffer. In this case, all the data in the ABD is stored in one
- * contiguous buffer in memory (from a zio_[data_]buf_* kmem cache).
- *
- * +-------------------+
- * | ABD (linear) |
- * | abd_flags = ... |
- * | abd_size = ... | +--------------------------------+
- * | abd_buf ------------->| raw buffer of size abd_size |
- * +-------------------+ +--------------------------------+
- * no abd_chunks
- *
- * (b) Scattered buffer. In this case, the data in the ABD is split into
- * equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers
- * to the chunks recorded in an array at the end of the ABD structure.
- *
- * +-------------------+
- * | ABD (scattered) |
- * | abd_flags = ... |
- * | abd_size = ... |
- * | abd_offset = 0 | +-----------+
- * | abd_chunks[0] ----------------------------->| chunk 0 |
- * | abd_chunks[1] ---------------------+ +-----------+
- * | ... | | +-----------+
- * | abd_chunks[N-1] ---------+ +------->| chunk 1 |
- * +-------------------+ | +-----------+
- * | ...
- * | +-----------+
- * +----------------->| chunk N-1 |
- * +-----------+
- *
- * Linear buffers act exactly like normal buffers and are always mapped into the
- * kernel's virtual memory space, while scattered ABD data chunks are allocated
- * as physical pages and then mapped in only while they are actually being
- * accessed through one of the abd_* library functions. Using scattered ABDs
- * provides several benefits:
- *
- * (1) They avoid use of kmem_*, preventing performance problems where running
- * kmem_reap on very large memory systems never finishes and causes
- * constant TLB shootdowns.
- *
- * (2) Fragmentation is less of an issue since when we are at the limit of
- * allocatable space, we won't have to search around for a long free
- * hole in the VA space for large ARC allocations. Each chunk is mapped in
- * individually, so even if we are using HIGHMEM (see next point) we
- * wouldn't need to worry about finding a contiguous address range.
- *
- * (3) If we are not using HIGHMEM, then all physical memory is always
- * mapped into the kernel's address space, so we also avoid the map /
- * unmap costs on each ABD access.
- *
- * If we are not using HIGHMEM, scattered buffers which have only one chunk
- * can be treated as linear buffers, because they are contiguous in the
- * kernel's virtual address space. See abd_alloc_pages() for details.
- *
- * It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to
- * B_FALSE.
- *
- * In addition to directly allocating a linear or scattered ABD, it is also
- * possible to create an ABD by requesting the "sub-ABD" starting at an offset
- * within an existing ABD. In linear buffers this is simple (set abd_buf of
- * the new ABD to the starting point within the original raw buffer), but
- * scattered ABDs are a little more complex. The new ABD makes a copy of the
- * relevant abd_chunks pointers (but not the underlying data). However, to
- * provide arbitrary rather than only chunk-aligned starting offsets, it also
- * tracks an abd_offset field which represents the starting point of the data
- * within the first chunk in abd_chunks. For both linear and scattered ABDs,
- * creating an offset ABD marks the original ABD as the offset's parent, and the
- * original ABD's abd_children refcount is incremented. This data allows us to
- * ensure the root ABD isn't deleted before its children.
- *
- * Most consumers should never need to know what type of ABD they're using --
- * the ABD public API ensures that it's possible to transparently switch from
- * using a linear ABD to a scattered one when doing so would be beneficial.
- *
- * If you need to use the data within an ABD directly, if you know it's linear
- * (because you allocated it) you can use abd_to_buf() to access the underlying
- * raw buffer. Otherwise, you should use one of the abd_borrow_buf* functions
- * which will allocate a raw buffer if necessary. Use the abd_return_buf*
- * functions to return any raw buffers that are no longer necessary when you're
- * done using them.
- *
- * There are a variety of ABD APIs that implement basic buffer operations:
- * compare, copy, read, write, and fill with zeroes. If you need a custom
- * function which progressively accesses the whole ABD, use the abd_iterate_*
- * functions.
- */
-
-#include <sys/abd.h>
-#include <sys/param.h>
-#include <sys/zio.h>
-#include <sys/zfs_context.h>
-#include <sys/zfs_znode.h>
-#ifdef _KERNEL
-#include <linux/scatterlist.h>
-#include <linux/kmap_compat.h>
-#else
-#define MAX_ORDER 1
-#endif
-
-typedef struct abd_stats {
- kstat_named_t abdstat_struct_size;
- kstat_named_t abdstat_linear_cnt;
- kstat_named_t abdstat_linear_data_size;
- kstat_named_t abdstat_scatter_cnt;
- kstat_named_t abdstat_scatter_data_size;
- kstat_named_t abdstat_scatter_chunk_waste;
- kstat_named_t abdstat_scatter_orders[MAX_ORDER];
- kstat_named_t abdstat_scatter_page_multi_chunk;
- kstat_named_t abdstat_scatter_page_multi_zone;
- kstat_named_t abdstat_scatter_page_alloc_retry;
- kstat_named_t abdstat_scatter_sg_table_retry;
-} abd_stats_t;
-
-static abd_stats_t abd_stats = {
- /* Amount of memory occupied by all of the abd_t struct allocations */
- { "struct_size", KSTAT_DATA_UINT64 },
- /*
- * The number of linear ABDs which are currently allocated, excluding
- * ABDs which don't own their data (for instance the ones which were
- * allocated through abd_get_offset() and abd_get_from_buf()). If an
- * ABD takes ownership of its buf then it will become tracked.
- */
- { "linear_cnt", KSTAT_DATA_UINT64 },
- /* Amount of data stored in all linear ABDs tracked by linear_cnt */
- { "linear_data_size", KSTAT_DATA_UINT64 },
- /*
- * The number of scatter ABDs which are currently allocated, excluding
- * ABDs which don't own their data (for instance the ones which were
- * allocated through abd_get_offset()).
- */
- { "scatter_cnt", KSTAT_DATA_UINT64 },
- /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
- { "scatter_data_size", KSTAT_DATA_UINT64 },
- /*
- * The amount of space wasted at the end of the last chunk across all
- * scatter ABDs tracked by scatter_cnt.
- */
- { "scatter_chunk_waste", KSTAT_DATA_UINT64 },
- /*
- * The number of compound allocations of a given order. These
- * allocations are spread over all currently allocated ABDs, and
- * act as a measure of memory fragmentation.
- */
- { { "scatter_order_N", KSTAT_DATA_UINT64 } },
- /*
- * The number of scatter ABDs which contain multiple chunks.
- * ABDs are preferentially allocated from the minimum number of
- * contiguous multi-page chunks, a single chunk is optimal.
- */
- { "scatter_page_multi_chunk", KSTAT_DATA_UINT64 },
- /*
- * The number of scatter ABDs which are split across memory zones.
- * ABDs are preferentially allocated using pages from a single zone.
- */
- { "scatter_page_multi_zone", KSTAT_DATA_UINT64 },
- /*
- * The total number of retries encountered when attempting to
- * allocate the pages to populate the scatter ABD.
- */
- { "scatter_page_alloc_retry", KSTAT_DATA_UINT64 },
- /*
- * The total number of retries encountered when attempting to
- * allocate the sg table for an ABD.
- */
- { "scatter_sg_table_retry", KSTAT_DATA_UINT64 },
-};
-
-#define ABDSTAT(stat) (abd_stats.stat.value.ui64)
-#define ABDSTAT_INCR(stat, val) \
- atomic_add_64(&abd_stats.stat.value.ui64, (val))
-#define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1)
-#define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1)
-
-#define ABD_SCATTER(abd) (abd->abd_u.abd_scatter)
-#define ABD_BUF(abd) (abd->abd_u.abd_linear.abd_buf)
-#define abd_for_each_sg(abd, sg, n, i) \
- for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i)
-
-/* see block comment above for description */
-int zfs_abd_scatter_enabled = B_TRUE;
-unsigned zfs_abd_scatter_max_order = MAX_ORDER - 1;
-
-/*
- * zfs_abd_scatter_min_size is the minimum allocation size to use scatter
- * ABD's. Smaller allocations will use linear ABD's which uses
- * zio_[data_]buf_alloc().
- *
- * Scatter ABD's use at least one page each, so sub-page allocations waste
- * some space when allocated as scatter (e.g. 2KB scatter allocation wastes
- * half of each page). Using linear ABD's for small allocations means that
- * they will be put on slabs which contain many allocations. This can
- * improve memory efficiency, but it also makes it much harder for ARC
- * evictions to actually free pages, because all the buffers on one slab need
- * to be freed in order for the slab (and underlying pages) to be freed.
- * Typically, 512B and 1KB kmem caches have 16 buffers per slab, so it's
- * possible for them to actually waste more memory than scatter (one page per
- * buf = wasting 3/4 or 7/8th; one buf per slab = wasting 15/16th).
- *
- * Spill blocks are typically 512B and are heavily used on systems running
- * selinux with the default dnode size and the `xattr=sa` property set.
- *
- * By default we use linear allocations for 512B and 1KB, and scatter
- * allocations for larger (1.5KB and up).
- */
-int zfs_abd_scatter_min_size = 512 * 3;
-
-static kmem_cache_t *abd_cache = NULL;
-static kstat_t *abd_ksp;
-
-static inline size_t
-abd_chunkcnt_for_bytes(size_t size)
-{
- return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE);
-}
-
-#ifdef _KERNEL
-/*
- * Mark zfs data pages so they can be excluded from kernel crash dumps
- */
-#ifdef _LP64
-#define ABD_FILE_CACHE_PAGE 0x2F5ABDF11ECAC4E
-
-static inline void
-abd_mark_zfs_page(struct page *page)
-{
- get_page(page);
- SetPagePrivate(page);
- set_page_private(page, ABD_FILE_CACHE_PAGE);
-}
-
-static inline void
-abd_unmark_zfs_page(struct page *page)
-{
- set_page_private(page, 0UL);
- ClearPagePrivate(page);
- put_page(page);
-}
-#else
-#define abd_mark_zfs_page(page)
-#define abd_unmark_zfs_page(page)
-#endif /* _LP64 */
-
-#ifndef CONFIG_HIGHMEM
-
-#ifndef __GFP_RECLAIM
-#define __GFP_RECLAIM __GFP_WAIT
-#endif
-
-/*
- * The goal is to minimize fragmentation by preferentially populating ABDs
- * with higher order compound pages from a single zone. Allocation size is
- * progressively decreased until it can be satisfied without performing
- * reclaim or compaction. When necessary this function will degenerate to
- * allocating individual pages and allowing reclaim to satisfy allocations.
- */
-static void
-abd_alloc_pages(abd_t *abd, size_t size)
-{
- struct list_head pages;
- struct sg_table table;
- struct scatterlist *sg;
- struct page *page, *tmp_page = NULL;
- gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
- gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM;
- int max_order = MIN(zfs_abd_scatter_max_order, MAX_ORDER - 1);
- int nr_pages = abd_chunkcnt_for_bytes(size);
- int chunks = 0, zones = 0;
- size_t remaining_size;
- int nid = NUMA_NO_NODE;
- int alloc_pages = 0;
-
- INIT_LIST_HEAD(&pages);
-
- while (alloc_pages < nr_pages) {
- unsigned chunk_pages;
- int order;
-
- order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order);
- chunk_pages = (1U << order);
-
- page = alloc_pages_node(nid, order ? gfp_comp : gfp, order);
- if (page == NULL) {
- if (order == 0) {
- ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
- schedule_timeout_interruptible(1);
- } else {
- max_order = MAX(0, order - 1);
- }
- continue;
- }
-
- list_add_tail(&page->lru, &pages);
-
- if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid))
- zones++;
-
- nid = page_to_nid(page);
- ABDSTAT_BUMP(abdstat_scatter_orders[order]);
- chunks++;
- alloc_pages += chunk_pages;
- }
-
- ASSERT3S(alloc_pages, ==, nr_pages);
-
- while (sg_alloc_table(&table, chunks, gfp)) {
- ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
- schedule_timeout_interruptible(1);
- }
-
- sg = table.sgl;
- remaining_size = size;
- list_for_each_entry_safe(page, tmp_page, &pages, lru) {
- size_t sg_size = MIN(PAGESIZE << compound_order(page),
- remaining_size);
- sg_set_page(sg, page, sg_size, 0);
- abd_mark_zfs_page(page);
- remaining_size -= sg_size;
-
- sg = sg_next(sg);
- list_del(&page->lru);
- }
-
- /*
- * These conditions ensure that a possible transformation to a linear
- * ABD would be valid.
- */
- ASSERT(!PageHighMem(sg_page(table.sgl)));
- ASSERT0(ABD_SCATTER(abd).abd_offset);
-
- if (table.nents == 1) {
- /*
- * Since there is only one entry, this ABD can be represented
- * as a linear buffer. All single-page (4K) ABD's can be
- * represented this way. Some multi-page ABD's can also be
- * represented this way, if we were able to allocate a single
- * "chunk" (higher-order "page" which represents a power-of-2
- * series of physically-contiguous pages). This is often the
- * case for 2-page (8K) ABD's.
- *
- * Representing a single-entry scatter ABD as a linear ABD
- * has the performance advantage of avoiding the copy (and
- * allocation) in abd_borrow_buf_copy / abd_return_buf_copy.
- * A performance increase of around 5% has been observed for
- * ARC-cached reads (of small blocks which can take advantage
- * of this).
- *
- * Note that this optimization is only possible because the
- * pages are always mapped into the kernel's address space.
- * This is not the case for highmem pages, so the
- * optimization can not be made there.
- */
- abd->abd_flags |= ABD_FLAG_LINEAR;
- abd->abd_flags |= ABD_FLAG_LINEAR_PAGE;
- abd->abd_u.abd_linear.abd_sgl = table.sgl;
- abd->abd_u.abd_linear.abd_buf =
- page_address(sg_page(table.sgl));
- } else if (table.nents > 1) {
- ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
- abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
-
- if (zones) {
- ABDSTAT_BUMP(abdstat_scatter_page_multi_zone);
- abd->abd_flags |= ABD_FLAG_MULTI_ZONE;
- }
-
- ABD_SCATTER(abd).abd_sgl = table.sgl;
- ABD_SCATTER(abd).abd_nents = table.nents;
- }
-}
-#else
-/*
- * Allocate N individual pages to construct a scatter ABD. This function
- * makes no attempt to request contiguous pages and requires the minimal
- * number of kernel interfaces. It's designed for maximum compatibility.
- */
-static void
-abd_alloc_pages(abd_t *abd, size_t size)
-{
- struct scatterlist *sg = NULL;
- struct sg_table table;
- struct page *page;
- gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
- int nr_pages = abd_chunkcnt_for_bytes(size);
- int i = 0;
-
- while (sg_alloc_table(&table, nr_pages, gfp)) {
- ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
- schedule_timeout_interruptible(1);
- }
-
- ASSERT3U(table.nents, ==, nr_pages);
- ABD_SCATTER(abd).abd_sgl = table.sgl;
- ABD_SCATTER(abd).abd_nents = nr_pages;
-
- abd_for_each_sg(abd, sg, nr_pages, i) {
- while ((page = __page_cache_alloc(gfp)) == NULL) {
- ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
- schedule_timeout_interruptible(1);
- }
-
- ABDSTAT_BUMP(abdstat_scatter_orders[0]);
- sg_set_page(sg, page, PAGESIZE, 0);
- abd_mark_zfs_page(page);
- }
-
- if (nr_pages > 1) {
- ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
- abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
- }
-}
-#endif /* !CONFIG_HIGHMEM */
-
-static void
-abd_free_pages(abd_t *abd)
-{
- struct scatterlist *sg = NULL;
- struct sg_table table;
- struct page *page;
- int nr_pages = ABD_SCATTER(abd).abd_nents;
- int order, i = 0;
-
- if (abd->abd_flags & ABD_FLAG_MULTI_ZONE)
- ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_zone);
-
- if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK)
- ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk);
-
- abd_for_each_sg(abd, sg, nr_pages, i) {
- page = sg_page(sg);
- abd_unmark_zfs_page(page);
- order = compound_order(page);
- __free_pages(page, order);
- ASSERT3U(sg->length, <=, PAGE_SIZE << order);
- ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]);
- }
-
- table.sgl = ABD_SCATTER(abd).abd_sgl;
- table.nents = table.orig_nents = nr_pages;
- sg_free_table(&table);
-}
-
-#else /* _KERNEL */
-
-#ifndef PAGE_SHIFT
-#define PAGE_SHIFT (highbit64(PAGESIZE)-1)
-#endif
-
-struct page;
-
-#define zfs_kmap_atomic(chunk, km) ((void *)chunk)
-#define zfs_kunmap_atomic(addr, km) do { (void)(addr); } while (0)
-#define local_irq_save(flags) do { (void)(flags); } while (0)
-#define local_irq_restore(flags) do { (void)(flags); } while (0)
-#define nth_page(pg, i) \
- ((struct page *)((void *)(pg) + (i) * PAGESIZE))
-
-struct scatterlist {
- struct page *page;
- int length;
- int end;
-};
-
-static void
-sg_init_table(struct scatterlist *sg, int nr)
-{
- memset(sg, 0, nr * sizeof (struct scatterlist));
- sg[nr - 1].end = 1;
-}
-
-#define for_each_sg(sgl, sg, nr, i) \
- for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg))
-
-static inline void
-sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len,
- unsigned int offset)
-{
- /* currently we don't use offset */
- ASSERT(offset == 0);
- sg->page = page;
- sg->length = len;
-}
-
-static inline struct page *
-sg_page(struct scatterlist *sg)
-{
- return (sg->page);
-}
-
-static inline struct scatterlist *
-sg_next(struct scatterlist *sg)
-{
- if (sg->end)
- return (NULL);
-
- return (sg + 1);
-}
-
-static void
-abd_alloc_pages(abd_t *abd, size_t size)
-{
- unsigned nr_pages = abd_chunkcnt_for_bytes(size);
- struct scatterlist *sg;
- int i;
-
- ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages *
- sizeof (struct scatterlist), KM_SLEEP);
- sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages);
-
- abd_for_each_sg(abd, sg, nr_pages, i) {
- struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
- sg_set_page(sg, p, PAGESIZE, 0);
- }
- ABD_SCATTER(abd).abd_nents = nr_pages;
-}
-
-static void
-abd_free_pages(abd_t *abd)
-{
- int i, n = ABD_SCATTER(abd).abd_nents;
- struct scatterlist *sg;
-
- abd_for_each_sg(abd, sg, n, i) {
- for (int j = 0; j < sg->length; j += PAGESIZE) {
- struct page *p = nth_page(sg_page(sg), j >> PAGE_SHIFT);
- umem_free(p, PAGESIZE);
- }
- }
-
- vmem_free(ABD_SCATTER(abd).abd_sgl, n * sizeof (struct scatterlist));
-}
-
-#endif /* _KERNEL */
-
-void
-abd_init(void)
-{
- int i;
-
- abd_cache = kmem_cache_create("abd_t", sizeof (abd_t),
- 0, NULL, NULL, NULL, NULL, NULL, 0);
-
- abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
- sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
- if (abd_ksp != NULL) {
- abd_ksp->ks_data = &abd_stats;
- kstat_install(abd_ksp);
-
- for (i = 0; i < MAX_ORDER; i++) {
- snprintf(abd_stats.abdstat_scatter_orders[i].name,
- KSTAT_STRLEN, "scatter_order_%d", i);
- abd_stats.abdstat_scatter_orders[i].data_type =
- KSTAT_DATA_UINT64;
- }
- }
-}
-
-void
-abd_fini(void)
-{
- if (abd_ksp != NULL) {
- kstat_delete(abd_ksp);
- abd_ksp = NULL;
- }
-
- if (abd_cache) {
- kmem_cache_destroy(abd_cache);
- abd_cache = NULL;
- }
-}
-
-static inline void
-abd_verify(abd_t *abd)
-{
- ASSERT3U(abd->abd_size, >, 0);
- ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
- ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
- ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE |
- ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE));
- IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
- IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
- if (abd_is_linear(abd)) {
- ASSERT3P(abd->abd_u.abd_linear.abd_buf, !=, NULL);
- } else {
- size_t n;
- int i = 0;
- struct scatterlist *sg = NULL;
-
- ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0);
- ASSERT3U(ABD_SCATTER(abd).abd_offset, <,
- ABD_SCATTER(abd).abd_sgl->length);
- n = ABD_SCATTER(abd).abd_nents;
- abd_for_each_sg(abd, sg, n, i) {
- ASSERT3P(sg_page(sg), !=, NULL);
- }
- }
-}
-
-static inline abd_t *
-abd_alloc_struct(void)
-{
- abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE);
-
- ASSERT3P(abd, !=, NULL);
- ABDSTAT_INCR(abdstat_struct_size, sizeof (abd_t));
-
- return (abd);
-}
-
-static inline void
-abd_free_struct(abd_t *abd)
-{
- kmem_cache_free(abd_cache, abd);
- ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t));
-}
-
-/*
- * Allocate an ABD, along with its own underlying data buffers. Use this if you
- * don't care whether the ABD is linear or not.
- */
-abd_t *
-abd_alloc(size_t size, boolean_t is_metadata)
-{
- /* see the comment above zfs_abd_scatter_min_size */
- if (!zfs_abd_scatter_enabled || size < zfs_abd_scatter_min_size)
- return (abd_alloc_linear(size, is_metadata));
-
- VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
-
- abd_t *abd = abd_alloc_struct();
- abd->abd_flags = ABD_FLAG_OWNER;
- abd->abd_u.abd_scatter.abd_offset = 0;
- abd_alloc_pages(abd, size);
-
- if (is_metadata) {
- abd->abd_flags |= ABD_FLAG_META;
- }
- abd->abd_size = size;
- abd->abd_parent = NULL;
- zfs_refcount_create(&abd->abd_children);
-
- ABDSTAT_BUMP(abdstat_scatter_cnt);
- ABDSTAT_INCR(abdstat_scatter_data_size, size);
- ABDSTAT_INCR(abdstat_scatter_chunk_waste,
- P2ROUNDUP(size, PAGESIZE) - size);
-
- return (abd);
-}
-
-static void
-abd_free_scatter(abd_t *abd)
-{
- abd_free_pages(abd);
-
- zfs_refcount_destroy(&abd->abd_children);
- ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
- ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
- ABDSTAT_INCR(abdstat_scatter_chunk_waste,
- (int)abd->abd_size - (int)P2ROUNDUP(abd->abd_size, PAGESIZE));
-
- abd_free_struct(abd);
-}
-
-/*
- * Allocate an ABD that must be linear, along with its own underlying data
- * buffer. Only use this when it would be very annoying to write your ABD
- * consumer with a scattered ABD.
- */
-abd_t *
-abd_alloc_linear(size_t size, boolean_t is_metadata)
-{
- abd_t *abd = abd_alloc_struct();
-
- VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
-
- abd->abd_flags = ABD_FLAG_LINEAR | ABD_FLAG_OWNER;
- if (is_metadata) {
- abd->abd_flags |= ABD_FLAG_META;
- }
- abd->abd_size = size;
- abd->abd_parent = NULL;
- zfs_refcount_create(&abd->abd_children);
-
- if (is_metadata) {
- abd->abd_u.abd_linear.abd_buf = zio_buf_alloc(size);
- } else {
- abd->abd_u.abd_linear.abd_buf = zio_data_buf_alloc(size);
- }
-
- ABDSTAT_BUMP(abdstat_linear_cnt);
- ABDSTAT_INCR(abdstat_linear_data_size, size);
-
- return (abd);
-}
-
-static void
-abd_free_linear(abd_t *abd)
-{
- if (abd_is_linear_page(abd)) {
- /* Transform it back into a scatter ABD for freeing */
- struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl;
- abd->abd_flags &= ~ABD_FLAG_LINEAR;
- abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE;
- ABD_SCATTER(abd).abd_nents = 1;
- ABD_SCATTER(abd).abd_offset = 0;
- ABD_SCATTER(abd).abd_sgl = sg;
- abd_free_scatter(abd);
- return;
- }
- if (abd->abd_flags & ABD_FLAG_META) {
- zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
- } else {
- zio_data_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
- }
-
- zfs_refcount_destroy(&abd->abd_children);
- ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
- ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
-
- abd_free_struct(abd);
-}
-
-/*
- * Free an ABD. Only use this on ABDs allocated with abd_alloc() or
- * abd_alloc_linear().
- */
-void
-abd_free(abd_t *abd)
-{
- abd_verify(abd);
- ASSERT3P(abd->abd_parent, ==, NULL);
- ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
- if (abd_is_linear(abd))
- abd_free_linear(abd);
- else
- abd_free_scatter(abd);
-}
-
-/*
- * Allocate an ABD of the same format (same metadata flag, same scatterize
- * setting) as another ABD.
- */
-abd_t *
-abd_alloc_sametype(abd_t *sabd, size_t size)
-{
- boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0;
- if (abd_is_linear(sabd) &&
- !abd_is_linear_page(sabd)) {
- return (abd_alloc_linear(size, is_metadata));
- } else {
- return (abd_alloc(size, is_metadata));
- }
-}
-
-/*
- * If we're going to use this ABD for doing I/O using the block layer, the
- * consumer of the ABD data doesn't care if it's scattered or not, and we don't
- * plan to store this ABD in memory for a long period of time, we should
- * allocate the ABD type that requires the least data copying to do the I/O.
- *
- * On Illumos this is linear ABDs, however if ldi_strategy() can ever issue I/Os
- * using a scatter/gather list we should switch to that and replace this call
- * with vanilla abd_alloc().
- *
- * On Linux the optimal thing to do would be to use abd_get_offset() and
- * construct a new ABD which shares the original pages thereby eliminating
- * the copy. But for the moment a new linear ABD is allocated until this
- * performance optimization can be implemented.
- */
-abd_t *
-abd_alloc_for_io(size_t size, boolean_t is_metadata)
-{
- return (abd_alloc(size, is_metadata));
-}
-
-/*
- * Allocate a new ABD to point to offset off of sabd. It shares the underlying
- * buffer data with sabd. Use abd_put() to free. sabd must not be freed while
- * any derived ABDs exist.
- */
-static inline abd_t *
-abd_get_offset_impl(abd_t *sabd, size_t off, size_t size)
-{
- abd_t *abd;
-
- abd_verify(sabd);
- ASSERT3U(off, <=, sabd->abd_size);
-
- if (abd_is_linear(sabd)) {
- abd = abd_alloc_struct();
-
- /*
- * Even if this buf is filesystem metadata, we only track that
- * if we own the underlying data buffer, which is not true in
- * this case. Therefore, we don't ever use ABD_FLAG_META here.
- */
- abd->abd_flags = ABD_FLAG_LINEAR;
-
- abd->abd_u.abd_linear.abd_buf =
- (char *)sabd->abd_u.abd_linear.abd_buf + off;
- } else {
- int i = 0;
- struct scatterlist *sg = NULL;
- size_t new_offset = sabd->abd_u.abd_scatter.abd_offset + off;
-
- abd = abd_alloc_struct();
-
- /*
- * Even if this buf is filesystem metadata, we only track that
- * if we own the underlying data buffer, which is not true in
- * this case. Therefore, we don't ever use ABD_FLAG_META here.
- */
- abd->abd_flags = 0;
-
- abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) {
- if (new_offset < sg->length)
- break;
- new_offset -= sg->length;
- }
-
- ABD_SCATTER(abd).abd_sgl = sg;
- ABD_SCATTER(abd).abd_offset = new_offset;
- ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i;
- }
-
- abd->abd_size = size;
- abd->abd_parent = sabd;
- zfs_refcount_create(&abd->abd_children);
- (void) zfs_refcount_add_many(&sabd->abd_children, abd->abd_size, abd);
-
- return (abd);
-}
-
-abd_t *
-abd_get_offset(abd_t *sabd, size_t off)
-{
- size_t size = sabd->abd_size > off ? sabd->abd_size - off : 0;
-
- VERIFY3U(size, >, 0);
-
- return (abd_get_offset_impl(sabd, off, size));
-}
-
-abd_t *
-abd_get_offset_size(abd_t *sabd, size_t off, size_t size)
-{
- ASSERT3U(off + size, <=, sabd->abd_size);
-
- return (abd_get_offset_impl(sabd, off, size));
-}
-
-/*
- * Allocate a linear ABD structure for buf. You must free this with abd_put()
- * since the resulting ABD doesn't own its own buffer.
- */
-abd_t *
-abd_get_from_buf(void *buf, size_t size)
-{
- abd_t *abd = abd_alloc_struct();
-
- VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
-
- /*
- * Even if this buf is filesystem metadata, we only track that if we
- * own the underlying data buffer, which is not true in this case.
- * Therefore, we don't ever use ABD_FLAG_META here.
- */
- abd->abd_flags = ABD_FLAG_LINEAR;
- abd->abd_size = size;
- abd->abd_parent = NULL;
- zfs_refcount_create(&abd->abd_children);
-
- abd->abd_u.abd_linear.abd_buf = buf;
-
- return (abd);
-}
-
-/*
- * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not
- * free the underlying scatterlist or buffer.
- */
-void
-abd_put(abd_t *abd)
-{
- abd_verify(abd);
- ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
-
- if (abd->abd_parent != NULL) {
- (void) zfs_refcount_remove_many(&abd->abd_parent->abd_children,
- abd->abd_size, abd);
- }
-
- zfs_refcount_destroy(&abd->abd_children);
- abd_free_struct(abd);
-}
-
-/*
- * Get the raw buffer associated with a linear ABD.
- */
-void *
-abd_to_buf(abd_t *abd)
-{
- ASSERT(abd_is_linear(abd));
- abd_verify(abd);
- return (abd->abd_u.abd_linear.abd_buf);
-}
-
-/*
- * Borrow a raw buffer from an ABD without copying the contents of the ABD
- * into the buffer. If the ABD is scattered, this will allocate a raw buffer
- * whose contents are undefined. To copy over the existing data in the ABD, use
- * abd_borrow_buf_copy() instead.
- */
-void *
-abd_borrow_buf(abd_t *abd, size_t n)
-{
- void *buf;
- abd_verify(abd);
- ASSERT3U(abd->abd_size, >=, n);
- if (abd_is_linear(abd)) {
- buf = abd_to_buf(abd);
- } else {
- buf = zio_buf_alloc(n);
- }
- (void) zfs_refcount_add_many(&abd->abd_children, n, buf);
-
- return (buf);
-}
-
-void *
-abd_borrow_buf_copy(abd_t *abd, size_t n)
-{
- void *buf = abd_borrow_buf(abd, n);
- if (!abd_is_linear(abd)) {
- abd_copy_to_buf(buf, abd, n);
- }
- return (buf);
-}
-
-/*
- * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will
- * not change the contents of the ABD and will ASSERT that you didn't modify
- * the buffer since it was borrowed. If you want any changes you made to buf to
- * be copied back to abd, use abd_return_buf_copy() instead.
- */
-void
-abd_return_buf(abd_t *abd, void *buf, size_t n)
-{
- abd_verify(abd);
- ASSERT3U(abd->abd_size, >=, n);
- if (abd_is_linear(abd)) {
- ASSERT3P(buf, ==, abd_to_buf(abd));
- } else {
- ASSERT0(abd_cmp_buf(abd, buf, n));
- zio_buf_free(buf, n);
- }
- (void) zfs_refcount_remove_many(&abd->abd_children, n, buf);
-}
-
-void
-abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
-{
- if (!abd_is_linear(abd)) {
- abd_copy_from_buf(abd, buf, n);
- }
- abd_return_buf(abd, buf, n);
-}
-
-/*
- * Give this ABD ownership of the buffer that it's storing. Can only be used on
- * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated
- * with abd_alloc_linear() which subsequently released ownership of their buf
- * with abd_release_ownership_of_buf().
- */
-void
-abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata)
-{
- ASSERT(abd_is_linear(abd));
- ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
- abd_verify(abd);
-
- abd->abd_flags |= ABD_FLAG_OWNER;
- if (is_metadata) {
- abd->abd_flags |= ABD_FLAG_META;
- }
-
- ABDSTAT_BUMP(abdstat_linear_cnt);
- ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
-}
-
-void
-abd_release_ownership_of_buf(abd_t *abd)
-{
- ASSERT(abd_is_linear(abd));
- ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
-
- /*
- * abd_free() needs to handle LINEAR_PAGE ABD's specially.
- * Since that flag does not survive the
- * abd_release_ownership_of_buf() -> abd_get_from_buf() ->
- * abd_take_ownership_of_buf() sequence, we don't allow releasing
- * these "linear but not zio_[data_]buf_alloc()'ed" ABD's.
- */
- ASSERT(!abd_is_linear_page(abd));
-
- abd_verify(abd);
-
- abd->abd_flags &= ~ABD_FLAG_OWNER;
- /* Disable this flag since we no longer own the data buffer */
- abd->abd_flags &= ~ABD_FLAG_META;
-
- ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
- ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
-}
-
-#ifndef HAVE_1ARG_KMAP_ATOMIC
-#define NR_KM_TYPE (6)
-#ifdef _KERNEL
-int km_table[NR_KM_TYPE] = {
- KM_USER0,
- KM_USER1,
- KM_BIO_SRC_IRQ,
- KM_BIO_DST_IRQ,
- KM_PTE0,
- KM_PTE1,
-};
-#endif
-#endif
-
-struct abd_iter {
- /* public interface */
- void *iter_mapaddr; /* addr corresponding to iter_pos */
- size_t iter_mapsize; /* length of data valid at mapaddr */
-
- /* private */
- abd_t *iter_abd; /* ABD being iterated through */
- size_t iter_pos;
- size_t iter_offset; /* offset in current sg/abd_buf, */
- /* abd_offset included */
- struct scatterlist *iter_sg; /* current sg */
-#ifndef HAVE_1ARG_KMAP_ATOMIC
- int iter_km; /* KM_* for kmap_atomic */
-#endif
-};
-
-/*
- * Initialize the abd_iter.
- */
-static void
-abd_iter_init(struct abd_iter *aiter, abd_t *abd, int km_type)
-{
- abd_verify(abd);
- aiter->iter_abd = abd;
- aiter->iter_mapaddr = NULL;
- aiter->iter_mapsize = 0;
- aiter->iter_pos = 0;
- if (abd_is_linear(abd)) {
- aiter->iter_offset = 0;
- aiter->iter_sg = NULL;
- } else {
- aiter->iter_offset = ABD_SCATTER(abd).abd_offset;
- aiter->iter_sg = ABD_SCATTER(abd).abd_sgl;
- }
-#ifndef HAVE_1ARG_KMAP_ATOMIC
- ASSERT3U(km_type, <, NR_KM_TYPE);
- aiter->iter_km = km_type;
-#endif
-}
-
-/*
- * Advance the iterator by a certain amount. Cannot be called when a chunk is
- * in use. This can be safely called when the aiter has already exhausted, in
- * which case this does nothing.
- */
-static void
-abd_iter_advance(struct abd_iter *aiter, size_t amount)
-{
- ASSERT3P(aiter->iter_mapaddr, ==, NULL);
- ASSERT0(aiter->iter_mapsize);
-
- /* There's nothing left to advance to, so do nothing */
- if (aiter->iter_pos == aiter->iter_abd->abd_size)
- return;
-
- aiter->iter_pos += amount;
- aiter->iter_offset += amount;
- if (!abd_is_linear(aiter->iter_abd)) {
- while (aiter->iter_offset >= aiter->iter_sg->length) {
- aiter->iter_offset -= aiter->iter_sg->length;
- aiter->iter_sg = sg_next(aiter->iter_sg);
- if (aiter->iter_sg == NULL) {
- ASSERT0(aiter->iter_offset);
- break;
- }
- }
- }
-}
-
-/*
- * Map the current chunk into aiter. This can be safely called when the aiter
- * has already exhausted, in which case this does nothing.
- */
-static void
-abd_iter_map(struct abd_iter *aiter)
-{
- void *paddr;
- size_t offset = 0;
-
- ASSERT3P(aiter->iter_mapaddr, ==, NULL);
- ASSERT0(aiter->iter_mapsize);
-
- /* There's nothing left to iterate over, so do nothing */
- if (aiter->iter_pos == aiter->iter_abd->abd_size)
- return;
-
- if (abd_is_linear(aiter->iter_abd)) {
- ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
- offset = aiter->iter_offset;
- aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
- paddr = aiter->iter_abd->abd_u.abd_linear.abd_buf;
- } else {
- offset = aiter->iter_offset;
- aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset,
- aiter->iter_abd->abd_size - aiter->iter_pos);
-
- paddr = zfs_kmap_atomic(sg_page(aiter->iter_sg),
- km_table[aiter->iter_km]);
- }
-
- aiter->iter_mapaddr = (char *)paddr + offset;
-}
-
-/*
- * Unmap the current chunk from aiter. This can be safely called when the aiter
- * has already exhausted, in which case this does nothing.
- */
-static void
-abd_iter_unmap(struct abd_iter *aiter)
-{
- /* There's nothing left to unmap, so do nothing */
- if (aiter->iter_pos == aiter->iter_abd->abd_size)
- return;
-
- if (!abd_is_linear(aiter->iter_abd)) {
- /* LINTED E_FUNC_SET_NOT_USED */
- zfs_kunmap_atomic(aiter->iter_mapaddr - aiter->iter_offset,
- km_table[aiter->iter_km]);
- }
-
- ASSERT3P(aiter->iter_mapaddr, !=, NULL);
- ASSERT3U(aiter->iter_mapsize, >, 0);
-
- aiter->iter_mapaddr = NULL;
- aiter->iter_mapsize = 0;
-}
-
-int
-abd_iterate_func(abd_t *abd, size_t off, size_t size,
- abd_iter_func_t *func, void *private)
-{
- int ret = 0;
- struct abd_iter aiter;
-
- abd_verify(abd);
- ASSERT3U(off + size, <=, abd->abd_size);
-
- abd_iter_init(&aiter, abd, 0);
- abd_iter_advance(&aiter, off);
-
- while (size > 0) {
- abd_iter_map(&aiter);
-
- size_t len = MIN(aiter.iter_mapsize, size);
- ASSERT3U(len, >, 0);
-
- ret = func(aiter.iter_mapaddr, len, private);
-
- abd_iter_unmap(&aiter);
-
- if (ret != 0)
- break;
-
- size -= len;
- abd_iter_advance(&aiter, len);
- }
-
- return (ret);
-}
-
-struct buf_arg {
- void *arg_buf;
-};
-
-static int
-abd_copy_to_buf_off_cb(void *buf, size_t size, void *private)
-{
- struct buf_arg *ba_ptr = private;
-
- (void) memcpy(ba_ptr->arg_buf, buf, size);
- ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
-
- return (0);
-}
-
-/*
- * Copy abd to buf. (off is the offset in abd.)
- */
-void
-abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size)
-{
- struct buf_arg ba_ptr = { buf };
-
- (void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb,
- &ba_ptr);
-}
-
-static int
-abd_cmp_buf_off_cb(void *buf, size_t size, void *private)
-{
- int ret;
- struct buf_arg *ba_ptr = private;
-
- ret = memcmp(buf, ba_ptr->arg_buf, size);
- ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
-
- return (ret);
-}
-
-/*
- * Compare the contents of abd to buf. (off is the offset in abd.)
- */
-int
-abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
-{
- struct buf_arg ba_ptr = { (void *) buf };
-
- return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr));
-}
-
-static int
-abd_copy_from_buf_off_cb(void *buf, size_t size, void *private)
-{
- struct buf_arg *ba_ptr = private;
-
- (void) memcpy(buf, ba_ptr->arg_buf, size);
- ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
-
- return (0);
-}
-
-/*
- * Copy from buf to abd. (off is the offset in abd.)
- */
-void
-abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
-{
- struct buf_arg ba_ptr = { (void *) buf };
-
- (void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb,
- &ba_ptr);
-}
-
-/*ARGSUSED*/
-static int
-abd_zero_off_cb(void *buf, size_t size, void *private)
-{
- (void) memset(buf, 0, size);
- return (0);
-}
-
-/*
- * Zero out the abd from a particular offset to the end.
- */
-void
-abd_zero_off(abd_t *abd, size_t off, size_t size)
-{
- (void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL);
-}
-
-/*
- * Iterate over two ABDs and call func incrementally on the two ABDs' data in
- * equal-sized chunks (passed to func as raw buffers). func could be called many
- * times during this iteration.
- */
-int
-abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff,
- size_t size, abd_iter_func2_t *func, void *private)
-{
- int ret = 0;
- struct abd_iter daiter, saiter;
-
- abd_verify(dabd);
- abd_verify(sabd);
-
- ASSERT3U(doff + size, <=, dabd->abd_size);
- ASSERT3U(soff + size, <=, sabd->abd_size);
-
- abd_iter_init(&daiter, dabd, 0);
- abd_iter_init(&saiter, sabd, 1);
- abd_iter_advance(&daiter, doff);
- abd_iter_advance(&saiter, soff);
-
- while (size > 0) {
- abd_iter_map(&daiter);
- abd_iter_map(&saiter);
-
- size_t dlen = MIN(daiter.iter_mapsize, size);
- size_t slen = MIN(saiter.iter_mapsize, size);
- size_t len = MIN(dlen, slen);
- ASSERT(dlen > 0 || slen > 0);
-
- ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len,
- private);
-
- abd_iter_unmap(&saiter);
- abd_iter_unmap(&daiter);
-
- if (ret != 0)
- break;
-
- size -= len;
- abd_iter_advance(&daiter, len);
- abd_iter_advance(&saiter, len);
- }
-
- return (ret);
-}
-
-/*ARGSUSED*/
-static int
-abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private)
-{
- (void) memcpy(dbuf, sbuf, size);
- return (0);
-}
-
-/*
- * Copy from sabd to dabd starting from soff and doff.
- */
-void
-abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size)
-{
- (void) abd_iterate_func2(dabd, sabd, doff, soff, size,
- abd_copy_off_cb, NULL);
-}
-
-/*ARGSUSED*/
-static int
-abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private)
-{
- return (memcmp(bufa, bufb, size));
-}
-
-/*
- * Compares the contents of two ABDs.
- */
-int
-abd_cmp(abd_t *dabd, abd_t *sabd)
-{
- ASSERT3U(dabd->abd_size, ==, sabd->abd_size);
- return (abd_iterate_func2(dabd, sabd, 0, 0, dabd->abd_size,
- abd_cmp_cb, NULL));
-}
-
-/*
- * Iterate over code ABDs and a data ABD and call @func_raidz_gen.
- *
- * @cabds parity ABDs, must have equal size
- * @dabd data ABD. Can be NULL (in this case @dsize = 0)
- * @func_raidz_gen should be implemented so that its behaviour
- * is the same when taking linear and when taking scatter
- */
-void
-abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd,
- ssize_t csize, ssize_t dsize, const unsigned parity,
- void (*func_raidz_gen)(void **, const void *, size_t, size_t))
-{
- int i;
- ssize_t len, dlen;
- struct abd_iter caiters[3];
- struct abd_iter daiter = {0};
- void *caddrs[3];
- unsigned long flags;
-
- ASSERT3U(parity, <=, 3);
-
- for (i = 0; i < parity; i++)
- abd_iter_init(&caiters[i], cabds[i], i);
-
- if (dabd)
- abd_iter_init(&daiter, dabd, i);
-
- ASSERT3S(dsize, >=, 0);
-
- local_irq_save(flags);
- while (csize > 0) {
- len = csize;
-
- if (dabd && dsize > 0)
- abd_iter_map(&daiter);
-
- for (i = 0; i < parity; i++) {
- abd_iter_map(&caiters[i]);
- caddrs[i] = caiters[i].iter_mapaddr;
- }
-
- switch (parity) {
- case 3:
- len = MIN(caiters[2].iter_mapsize, len);
- /* falls through */
- case 2:
- len = MIN(caiters[1].iter_mapsize, len);
- /* falls through */
- case 1:
- len = MIN(caiters[0].iter_mapsize, len);
- }
-
- /* must be progressive */
- ASSERT3S(len, >, 0);
-
- if (dabd && dsize > 0) {
- /* this needs precise iter.length */
- len = MIN(daiter.iter_mapsize, len);
- dlen = len;
- } else
- dlen = 0;
-
- /* must be progressive */
- ASSERT3S(len, >, 0);
- /*
- * The iterated function likely will not do well if each
- * segment except the last one is not multiple of 512 (raidz).
- */
- ASSERT3U(((uint64_t)len & 511ULL), ==, 0);
-
- func_raidz_gen(caddrs, daiter.iter_mapaddr, len, dlen);
-
- for (i = parity-1; i >= 0; i--) {
- abd_iter_unmap(&caiters[i]);
- abd_iter_advance(&caiters[i], len);
- }
-
- if (dabd && dsize > 0) {
- abd_iter_unmap(&daiter);
- abd_iter_advance(&daiter, dlen);
- dsize -= dlen;
- }
-
- csize -= len;
-
- ASSERT3S(dsize, >=, 0);
- ASSERT3S(csize, >=, 0);
- }
- local_irq_restore(flags);
-}
-
-/*
- * Iterate over code ABDs and data reconstruction target ABDs and call
- * @func_raidz_rec. Function maps at most 6 pages atomically.
- *
- * @cabds parity ABDs, must have equal size
- * @tabds rec target ABDs, at most 3
- * @tsize size of data target columns
- * @func_raidz_rec expects syndrome data in target columns. Function
- * reconstructs data and overwrites target columns.
- */
-void
-abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds,
- ssize_t tsize, const unsigned parity,
- void (*func_raidz_rec)(void **t, const size_t tsize, void **c,
- const unsigned *mul),
- const unsigned *mul)
-{
- int i;
- ssize_t len;
- struct abd_iter citers[3];
- struct abd_iter xiters[3];
- void *caddrs[3], *xaddrs[3];
- unsigned long flags;
-
- ASSERT3U(parity, <=, 3);
-
- for (i = 0; i < parity; i++) {
- abd_iter_init(&citers[i], cabds[i], 2*i);
- abd_iter_init(&xiters[i], tabds[i], 2*i+1);
- }
-
- local_irq_save(flags);
- while (tsize > 0) {
-
- for (i = 0; i < parity; i++) {
- abd_iter_map(&citers[i]);
- abd_iter_map(&xiters[i]);
- caddrs[i] = citers[i].iter_mapaddr;
- xaddrs[i] = xiters[i].iter_mapaddr;
- }
-
- len = tsize;
- switch (parity) {
- case 3:
- len = MIN(xiters[2].iter_mapsize, len);
- len = MIN(citers[2].iter_mapsize, len);
- /* falls through */
- case 2:
- len = MIN(xiters[1].iter_mapsize, len);
- len = MIN(citers[1].iter_mapsize, len);
- /* falls through */
- case 1:
- len = MIN(xiters[0].iter_mapsize, len);
- len = MIN(citers[0].iter_mapsize, len);
- }
- /* must be progressive */
- ASSERT3S(len, >, 0);
- /*
- * The iterated function likely will not do well if each
- * segment except the last one is not multiple of 512 (raidz).
- */
- ASSERT3U(((uint64_t)len & 511ULL), ==, 0);
-
- func_raidz_rec(xaddrs, len, caddrs, mul);
-
- for (i = parity-1; i >= 0; i--) {
- abd_iter_unmap(&xiters[i]);
- abd_iter_unmap(&citers[i]);
- abd_iter_advance(&xiters[i], len);
- abd_iter_advance(&citers[i], len);
- }
-
- tsize -= len;
- ASSERT3S(tsize, >=, 0);
- }
- local_irq_restore(flags);
-}
-
-#if defined(_KERNEL)
-/*
- * bio_nr_pages for ABD.
- * @off is the offset in @abd
- */
-unsigned long
-abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off)
-{
- unsigned long pos;
-
- if (abd_is_linear(abd))
- pos = (unsigned long)abd_to_buf(abd) + off;
- else
- pos = abd->abd_u.abd_scatter.abd_offset + off;
-
- return ((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) -
- (pos >> PAGE_SHIFT);
-}
-
-/*
- * bio_map for scatter ABD.
- * @off is the offset in @abd
- * Remaining IO size is returned
- */
-unsigned int
-abd_scatter_bio_map_off(struct bio *bio, abd_t *abd,
- unsigned int io_size, size_t off)
-{
- int i;
- struct abd_iter aiter;
-
- ASSERT(!abd_is_linear(abd));
- ASSERT3U(io_size, <=, abd->abd_size - off);
-
- abd_iter_init(&aiter, abd, 0);
- abd_iter_advance(&aiter, off);
-
- for (i = 0; i < bio->bi_max_vecs; i++) {
- struct page *pg;
- size_t len, sgoff, pgoff;
- struct scatterlist *sg;
-
- if (io_size <= 0)
- break;
-
- sg = aiter.iter_sg;
- sgoff = aiter.iter_offset;
- pgoff = sgoff & (PAGESIZE - 1);
- len = MIN(io_size, PAGESIZE - pgoff);
- ASSERT(len > 0);
-
- pg = nth_page(sg_page(sg), sgoff >> PAGE_SHIFT);
- if (bio_add_page(bio, pg, len, pgoff) != len)
- break;
-
- io_size -= len;
- abd_iter_advance(&aiter, len);
- }
-
- return (io_size);
-}
-
-/* Tunable Parameters */
-module_param(zfs_abd_scatter_enabled, int, 0644);
-MODULE_PARM_DESC(zfs_abd_scatter_enabled,
- "Toggle whether ABD allocations must be linear.");
-module_param(zfs_abd_scatter_min_size, int, 0644);
-MODULE_PARM_DESC(zfs_abd_scatter_min_size,
- "Minimum size of scatter allocations.");
-/* CSTYLED */
-module_param(zfs_abd_scatter_max_order, uint, 0644);
-MODULE_PARM_DESC(zfs_abd_scatter_max_order,
- "Maximum order allocation used for a scatter ABD.");
-#endif
diff --git a/module/zfs/gzip.c b/module/zfs/gzip.c
index 5cac2a7de..9d8af3228 100644
--- a/module/zfs/gzip.c
+++ b/module/zfs/gzip.c
@@ -29,7 +29,7 @@
#include <sys/debug.h>
#include <sys/types.h>
#include <sys/strings.h>
-#include "qat.h"
+#include <sys/qat.h>
#ifdef _KERNEL
diff --git a/module/zfs/policy.c b/module/zfs/policy.c
deleted file mode 100644
index 7f9456a67..000000000
--- a/module/zfs/policy.c
+++ /dev/null
@@ -1,355 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2013, Joyent, Inc. All rights reserved.
- * Copyright (C) 2016 Lawrence Livermore National Security, LLC.
- *
- * For Linux the vast majority of this enforcement is already handled via
- * the standard Linux VFS permission checks. However certain administrative
- * commands which bypass the standard mechanisms may need to make use of
- * this functionality.
- */
-
-#include <sys/policy.h>
-#include <linux/security.h>
-#include <linux/vfs_compat.h>
-
-/*
- * The passed credentials cannot be directly verified because Linux only
- * provides and interface to check the *current* process credentials. In
- * order to handle this the capable() test is only run when the passed
- * credentials match the current process credentials or the kcred. In
- * all other cases this function must fail and return the passed err.
- */
-static int
-priv_policy_ns(const cred_t *cr, int capability, boolean_t all, int err,
- struct user_namespace *ns)
-{
- ASSERT3S(all, ==, B_FALSE);
-
- if (cr != CRED() && (cr != kcred))
- return (err);
-
-#if defined(CONFIG_USER_NS) && defined(HAVE_NS_CAPABLE)
- if (!(ns ? ns_capable(ns, capability) : capable(capability)))
-#else
- if (!capable(capability))
-#endif
- return (err);
-
- return (0);
-}
-
-static int
-priv_policy(const cred_t *cr, int capability, boolean_t all, int err)
-{
- return (priv_policy_ns(cr, capability, all, err, NULL));
-}
-
-static int
-priv_policy_user(const cred_t *cr, int capability, boolean_t all, int err)
-{
- /*
- * All priv_policy_user checks are preceded by kuid/kgid_has_mapping()
- * checks. If we cannot do them, we shouldn't be using ns_capable()
- * since we don't know whether the affected files are valid in our
- * namespace. Note that kuid_has_mapping() came after cred->user_ns, so
- * we shouldn't need to re-check for HAVE_CRED_USER_NS
- */
-#if defined(CONFIG_USER_NS) && defined(HAVE_KUID_HAS_MAPPING)
- return (priv_policy_ns(cr, capability, all, err, cr->user_ns));
-#else
- return (priv_policy_ns(cr, capability, all, err, NULL));
-#endif
-}
-
-/*
- * Checks for operations that are either client-only or are used by
- * both clients and servers.
- */
-int
-secpolicy_nfs(const cred_t *cr)
-{
- return (priv_policy(cr, CAP_SYS_ADMIN, B_FALSE, EPERM));
-}
-
-/*
- * Catch all system configuration.
- */
-int
-secpolicy_sys_config(const cred_t *cr, boolean_t checkonly)
-{
- return (priv_policy(cr, CAP_SYS_ADMIN, B_FALSE, EPERM));
-}
-
-/*
- * Like secpolicy_vnode_access() but we get the actual wanted mode and the
- * current mode of the file, not the missing bits.
- *
- * Enforced in the Linux VFS.
- */
-int
-secpolicy_vnode_access2(const cred_t *cr, struct inode *ip, uid_t owner,
- mode_t curmode, mode_t wantmode)
-{
- return (0);
-}
-
-/*
- * This is a special routine for ZFS; it is used to determine whether
- * any of the privileges in effect allow any form of access to the
- * file. There's no reason to audit this or any reason to record
- * this. More work is needed to do the "KPLD" stuff.
- */
-int
-secpolicy_vnode_any_access(const cred_t *cr, struct inode *ip, uid_t owner)
-{
- if (crgetfsuid(cr) == owner)
- return (0);
-
- if (zpl_inode_owner_or_capable(ip))
- return (0);
-
-#if defined(CONFIG_USER_NS) && defined(HAVE_KUID_HAS_MAPPING)
- if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner)))
- return (EPERM);
-#endif
-
- if (priv_policy_user(cr, CAP_DAC_OVERRIDE, B_FALSE, EPERM) == 0)
- return (0);
-
- if (priv_policy_user(cr, CAP_DAC_READ_SEARCH, B_FALSE, EPERM) == 0)
- return (0);
-
- return (EPERM);
-}
-
-/*
- * Determine if subject can chown owner of a file.
- */
-int
-secpolicy_vnode_chown(const cred_t *cr, uid_t owner)
-{
- if (crgetfsuid(cr) == owner)
- return (0);
-
-#if defined(CONFIG_USER_NS) && defined(HAVE_KUID_HAS_MAPPING)
- if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner)))
- return (EPERM);
-#endif
-
- return (priv_policy_user(cr, CAP_FOWNER, B_FALSE, EPERM));
-}
-
-/*
- * Determine if subject can change group ownership of a file.
- */
-int
-secpolicy_vnode_create_gid(const cred_t *cr)
-{
- return (priv_policy(cr, CAP_SETGID, B_FALSE, EPERM));
-}
-
-/*
- * Policy determines whether we can remove an entry from a directory,
- * regardless of permission bits.
- */
-int
-secpolicy_vnode_remove(const cred_t *cr)
-{
- return (priv_policy(cr, CAP_FOWNER, B_FALSE, EPERM));
-}
-
-/*
- * Determine that subject can modify the mode of a file. allzone privilege
- * needed when modifying root owned object.
- */
-int
-secpolicy_vnode_setdac(const cred_t *cr, uid_t owner)
-{
- if (crgetfsuid(cr) == owner)
- return (0);
-
-#if defined(CONFIG_USER_NS) && defined(HAVE_KUID_HAS_MAPPING)
- if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner)))
- return (EPERM);
-#endif
-
- return (priv_policy_user(cr, CAP_FOWNER, B_FALSE, EPERM));
-}
-
-/*
- * Are we allowed to retain the set-uid/set-gid bits when
- * changing ownership or when writing to a file?
- * "issuid" should be true when set-uid; only in that case
- * root ownership is checked (setgid is assumed).
- *
- * Enforced in the Linux VFS.
- */
-int
-secpolicy_vnode_setid_retain(const cred_t *cr, boolean_t issuidroot)
-{
- return (priv_policy_user(cr, CAP_FSETID, B_FALSE, EPERM));
-}
-
-/*
- * Determine that subject can set the file setgid flag.
- */
-int
-secpolicy_vnode_setids_setgids(const cred_t *cr, gid_t gid)
-{
-#if defined(CONFIG_USER_NS) && defined(HAVE_KUID_HAS_MAPPING)
- if (!kgid_has_mapping(cr->user_ns, SGID_TO_KGID(gid)))
- return (EPERM);
-#endif
- if (crgetfsgid(cr) != gid && !groupmember(gid, cr))
- return (priv_policy_user(cr, CAP_FSETID, B_FALSE, EPERM));
-
- return (0);
-}
-
-/*
- * Determine if the subject can inject faults in the ZFS fault injection
- * framework. Requires all privileges.
- */
-int
-secpolicy_zinject(const cred_t *cr)
-{
- return (priv_policy(cr, CAP_SYS_ADMIN, B_FALSE, EACCES));
-}
-
-/*
- * Determine if the subject has permission to manipulate ZFS datasets
- * (not pools). Equivalent to the SYS_MOUNT privilege.
- */
-int
-secpolicy_zfs(const cred_t *cr)
-{
- return (priv_policy(cr, CAP_SYS_ADMIN, B_FALSE, EACCES));
-}
-
-void
-secpolicy_setid_clear(vattr_t *vap, cred_t *cr)
-{
- if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0 &&
- secpolicy_vnode_setid_retain(cr,
- (vap->va_mode & S_ISUID) != 0 &&
- (vap->va_mask & AT_UID) != 0 && vap->va_uid == 0) != 0) {
- vap->va_mask |= AT_MODE;
- vap->va_mode &= ~(S_ISUID|S_ISGID);
- }
-}
-
-/*
- * Determine that subject can set the file setid flags.
- */
-static int
-secpolicy_vnode_setid_modify(const cred_t *cr, uid_t owner)
-{
- if (crgetfsuid(cr) == owner)
- return (0);
-
-#if defined(CONFIG_USER_NS) && defined(HAVE_KUID_HAS_MAPPING)
- if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner)))
- return (EPERM);
-#endif
-
- return (priv_policy_user(cr, CAP_FSETID, B_FALSE, EPERM));
-}
-
-/*
- * Determine that subject can make a file a "sticky".
- *
- * Enforced in the Linux VFS.
- */
-static int
-secpolicy_vnode_stky_modify(const cred_t *cr)
-{
- return (0);
-}
-
-int
-secpolicy_setid_setsticky_clear(struct inode *ip, vattr_t *vap,
- const vattr_t *ovap, cred_t *cr)
-{
- int error;
-
- if ((vap->va_mode & S_ISUID) != 0 &&
- (error = secpolicy_vnode_setid_modify(cr,
- ovap->va_uid)) != 0) {
- return (error);
- }
-
- /*
- * Check privilege if attempting to set the
- * sticky bit on a non-directory.
- */
- if (!S_ISDIR(ip->i_mode) && (vap->va_mode & S_ISVTX) != 0 &&
- secpolicy_vnode_stky_modify(cr) != 0) {
- vap->va_mode &= ~S_ISVTX;
- }
-
- /*
- * Check for privilege if attempting to set the
- * group-id bit.
- */
- if ((vap->va_mode & S_ISGID) != 0 &&
- secpolicy_vnode_setids_setgids(cr, ovap->va_gid) != 0) {
- vap->va_mode &= ~S_ISGID;
- }
-
- return (0);
-}
-
-/*
- * Check privileges for setting xvattr attributes
- */
-int
-secpolicy_xvattr(xvattr_t *xvap, uid_t owner, cred_t *cr, vtype_t vtype)
-{
- return (secpolicy_vnode_chown(cr, owner));
-}
-
-/*
- * Check privileges for setattr attributes.
- *
- * Enforced in the Linux VFS.
- */
-int
-secpolicy_vnode_setattr(cred_t *cr, struct inode *ip, struct vattr *vap,
- const struct vattr *ovap, int flags,
- int unlocked_access(void *, int, cred_t *), void *node)
-{
- return (0);
-}
-
-/*
- * Check privileges for links.
- *
- * Enforced in the Linux VFS.
- */
-int
-secpolicy_basic_link(const cred_t *cr)
-{
- return (0);
-}
diff --git a/module/zfs/qat.c b/module/zfs/qat.c
deleted file mode 100644
index a6f024cb4..000000000
--- a/module/zfs/qat.c
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-#if defined(_KERNEL) && defined(HAVE_QAT)
-#include <sys/zfs_context.h>
-#include "qat.h"
-
-qat_stats_t qat_stats = {
- { "comp_requests", KSTAT_DATA_UINT64 },
- { "comp_total_in_bytes", KSTAT_DATA_UINT64 },
- { "comp_total_out_bytes", KSTAT_DATA_UINT64 },
- { "decomp_requests", KSTAT_DATA_UINT64 },
- { "decomp_total_in_bytes", KSTAT_DATA_UINT64 },
- { "decomp_total_out_bytes", KSTAT_DATA_UINT64 },
- { "dc_fails", KSTAT_DATA_UINT64 },
- { "encrypt_requests", KSTAT_DATA_UINT64 },
- { "encrypt_total_in_bytes", KSTAT_DATA_UINT64 },
- { "encrypt_total_out_bytes", KSTAT_DATA_UINT64 },
- { "decrypt_requests", KSTAT_DATA_UINT64 },
- { "decrypt_total_in_bytes", KSTAT_DATA_UINT64 },
- { "decrypt_total_out_bytes", KSTAT_DATA_UINT64 },
- { "crypt_fails", KSTAT_DATA_UINT64 },
- { "cksum_requests", KSTAT_DATA_UINT64 },
- { "cksum_total_in_bytes", KSTAT_DATA_UINT64 },
- { "cksum_fails", KSTAT_DATA_UINT64 },
-};
-
-static kstat_t *qat_ksp = NULL;
-
-CpaStatus
-qat_mem_alloc_contig(void **pp_mem_addr, Cpa32U size_bytes)
-{
- *pp_mem_addr = kmalloc(size_bytes, GFP_KERNEL);
- if (*pp_mem_addr == NULL)
- return (CPA_STATUS_RESOURCE);
- return (CPA_STATUS_SUCCESS);
-}
-
-void
-qat_mem_free_contig(void **pp_mem_addr)
-{
- if (*pp_mem_addr != NULL) {
- kfree(*pp_mem_addr);
- *pp_mem_addr = NULL;
- }
-}
-
-int
-qat_init(void)
-{
- qat_ksp = kstat_create("zfs", 0, "qat", "misc",
- KSTAT_TYPE_NAMED, sizeof (qat_stats) / sizeof (kstat_named_t),
- KSTAT_FLAG_VIRTUAL);
- if (qat_ksp != NULL) {
- qat_ksp->ks_data = &qat_stats;
- kstat_install(qat_ksp);
- }
-
- /*
- * Just set the disable flag when qat init failed, qat can be
- * turned on again in post-process after zfs module is loaded, e.g.:
- * echo 0 > /sys/module/zfs/parameters/zfs_qat_compress_disable
- */
- if (qat_dc_init() != 0)
- zfs_qat_compress_disable = 1;
-
- if (qat_cy_init() != 0) {
- zfs_qat_checksum_disable = 1;
- zfs_qat_encrypt_disable = 1;
- }
-
- return (0);
-}
-
-void
-qat_fini(void)
-{
- if (qat_ksp != NULL) {
- kstat_delete(qat_ksp);
- qat_ksp = NULL;
- }
-
- qat_cy_fini();
- qat_dc_fini();
-}
-
-#endif
diff --git a/module/zfs/qat.h b/module/zfs/qat.h
deleted file mode 100644
index fdd608139..000000000
--- a/module/zfs/qat.h
+++ /dev/null
@@ -1,204 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-#ifndef _SYS_QAT_H
-#define _SYS_QAT_H
-
-typedef enum qat_compress_dir {
- QAT_DECOMPRESS = 0,
- QAT_COMPRESS = 1,
-} qat_compress_dir_t;
-
-typedef enum qat_encrypt_dir {
- QAT_DECRYPT = 0,
- QAT_ENCRYPT = 1,
-} qat_encrypt_dir_t;
-
-
-#if defined(_KERNEL) && defined(HAVE_QAT)
-#include <sys/zio.h>
-#include <sys/crypto/api.h>
-#include "cpa.h"
-#include "dc/cpa_dc.h"
-#include "lac/cpa_cy_sym.h"
-
-/*
- * Timeout - no response from hardware after 0.5 seconds
- */
-#define QAT_TIMEOUT_MS 500
-
-/*
- * The minimal and maximal buffer size which are not restricted
- * in the QAT hardware, but with the input buffer size between 4KB
- * and 128KB the hardware can provide the optimal performance.
- */
-#define QAT_MIN_BUF_SIZE (4*1024)
-#define QAT_MAX_BUF_SIZE (128*1024)
-
-/*
- * Used for QAT kstat.
- */
-typedef struct qat_stats {
- /*
- * Number of jobs submitted to QAT compression engine.
- */
- kstat_named_t comp_requests;
- /*
- * Total bytes sent to QAT compression engine.
- */
- kstat_named_t comp_total_in_bytes;
- /*
- * Total bytes output from QAT compression engine.
- */
- kstat_named_t comp_total_out_bytes;
- /*
- * Number of jobs submitted to QAT de-compression engine.
- */
- kstat_named_t decomp_requests;
- /*
- * Total bytes sent to QAT de-compression engine.
- */
- kstat_named_t decomp_total_in_bytes;
- /*
- * Total bytes output from QAT de-compression engine.
- */
- kstat_named_t decomp_total_out_bytes;
- /*
- * Number of fails in the QAT compression / decompression engine.
- * Note: when a QAT error happens, it doesn't necessarily indicate a
- * critical hardware issue. Sometimes it is because the output buffer
- * is not big enough. The compression job will be transferred to the
- * gzip software implementation so the functionality of ZFS is not
- * impacted.
- */
- kstat_named_t dc_fails;
-
- /*
- * Number of jobs submitted to QAT encryption engine.
- */
- kstat_named_t encrypt_requests;
- /*
- * Total bytes sent to QAT encryption engine.
- */
- kstat_named_t encrypt_total_in_bytes;
- /*
- * Total bytes output from QAT encryption engine.
- */
- kstat_named_t encrypt_total_out_bytes;
- /*
- * Number of jobs submitted to QAT decryption engine.
- */
- kstat_named_t decrypt_requests;
- /*
- * Total bytes sent to QAT decryption engine.
- */
- kstat_named_t decrypt_total_in_bytes;
- /*
- * Total bytes output from QAT decryption engine.
- */
- kstat_named_t decrypt_total_out_bytes;
- /*
- * Number of fails in the QAT encryption / decryption engine.
- * Note: when a QAT error happens, it doesn't necessarily indicate a
- * critical hardware issue. The encryption job will be transferred
- * to the software implementation so the functionality of ZFS is
- * not impacted.
- */
- kstat_named_t crypt_fails;
-
- /*
- * Number of jobs submitted to QAT checksum engine.
- */
- kstat_named_t cksum_requests;
- /*
- * Total bytes sent to QAT checksum engine.
- */
- kstat_named_t cksum_total_in_bytes;
- /*
- * Number of fails in the QAT checksum engine.
- * Note: when a QAT error happens, it doesn't necessarily indicate a
- * critical hardware issue. The checksum job will be transferred to the
- * software implementation so the functionality of ZFS is not impacted.
- */
- kstat_named_t cksum_fails;
-} qat_stats_t;
-
-#define QAT_STAT_INCR(stat, val) \
- atomic_add_64(&qat_stats.stat.value.ui64, (val))
-#define QAT_STAT_BUMP(stat) \
- QAT_STAT_INCR(stat, 1)
-
-extern qat_stats_t qat_stats;
-extern int zfs_qat_compress_disable;
-extern int zfs_qat_checksum_disable;
-extern int zfs_qat_encrypt_disable;
-
-/* inlined for performance */
-static inline struct page *
-qat_mem_to_page(void *addr)
-{
- if (!is_vmalloc_addr(addr))
- return (virt_to_page(addr));
-
- return (vmalloc_to_page(addr));
-}
-
-CpaStatus qat_mem_alloc_contig(void **pp_mem_addr, Cpa32U size_bytes);
-void qat_mem_free_contig(void **pp_mem_addr);
-#define QAT_PHYS_CONTIG_ALLOC(pp_mem_addr, size_bytes) \
- qat_mem_alloc_contig((void *)(pp_mem_addr), (size_bytes))
-#define QAT_PHYS_CONTIG_FREE(p_mem_addr) \
- qat_mem_free_contig((void *)&(p_mem_addr))
-
-extern int qat_dc_init(void);
-extern void qat_dc_fini(void);
-extern int qat_cy_init(void);
-extern void qat_cy_fini(void);
-extern int qat_init(void);
-extern void qat_fini(void);
-
-/* fake CpaStatus used to indicate data was not compressible */
-#define CPA_STATUS_INCOMPRESSIBLE (-127)
-
-extern boolean_t qat_dc_use_accel(size_t s_len);
-extern boolean_t qat_crypt_use_accel(size_t s_len);
-extern boolean_t qat_checksum_use_accel(size_t s_len);
-extern int qat_compress(qat_compress_dir_t dir, char *src, int src_len,
- char *dst, int dst_len, size_t *c_len);
-extern int qat_crypt(qat_encrypt_dir_t dir, uint8_t *src_buf, uint8_t *dst_buf,
- uint8_t *aad_buf, uint32_t aad_len, uint8_t *iv_buf, uint8_t *digest_buf,
- crypto_key_t *key, uint64_t crypt, uint32_t enc_len);
-extern int qat_checksum(uint64_t cksum, uint8_t *buf, uint64_t size,
- zio_cksum_t *zcp);
-#else
-#define CPA_STATUS_SUCCESS 0
-#define CPA_STATUS_INCOMPRESSIBLE (-127)
-#define qat_init()
-#define qat_fini()
-#define qat_dc_use_accel(s_len) 0
-#define qat_crypt_use_accel(s_len) 0
-#define qat_checksum_use_accel(s_len) 0
-#define qat_compress(dir, s, sl, d, dl, cl) 0
-#define qat_crypt(dir, s, d, a, al, i, db, k, c, el) 0
-#define qat_checksum(c, buf, s, z) 0
-#endif
-
-#endif /* _SYS_QAT_H */
diff --git a/module/zfs/qat_compress.c b/module/zfs/qat_compress.c
deleted file mode 100644
index 4136b6555..000000000
--- a/module/zfs/qat_compress.c
+++ /dev/null
@@ -1,574 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-#if defined(_KERNEL) && defined(HAVE_QAT)
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/pagemap.h>
-#include <linux/completion.h>
-#include <sys/zfs_context.h>
-#include <sys/byteorder.h>
-#include <sys/zio.h>
-#include "qat.h"
-
-/*
- * Max instances in a QAT device, each instance is a channel to submit
- * jobs to QAT hardware, this is only for pre-allocating instance and
- * session arrays; the actual number of instances are defined in the
- * QAT driver's configuration file.
- */
-#define QAT_DC_MAX_INSTANCES 48
-
-/*
- * ZLIB head and foot size
- */
-#define ZLIB_HEAD_SZ 2
-#define ZLIB_FOOT_SZ 4
-
-static CpaInstanceHandle dc_inst_handles[QAT_DC_MAX_INSTANCES];
-static CpaDcSessionHandle session_handles[QAT_DC_MAX_INSTANCES];
-static CpaBufferList **buffer_array[QAT_DC_MAX_INSTANCES];
-static Cpa16U num_inst = 0;
-static Cpa32U inst_num = 0;
-static boolean_t qat_dc_init_done = B_FALSE;
-int zfs_qat_compress_disable = 0;
-
-boolean_t
-qat_dc_use_accel(size_t s_len)
-{
- return (!zfs_qat_compress_disable &&
- qat_dc_init_done &&
- s_len >= QAT_MIN_BUF_SIZE &&
- s_len <= QAT_MAX_BUF_SIZE);
-}
-
-static void
-qat_dc_callback(void *p_callback, CpaStatus status)
-{
- if (p_callback != NULL)
- complete((struct completion *)p_callback);
-}
-
-static void
-qat_dc_clean(void)
-{
- Cpa16U buff_num = 0;
- Cpa16U num_inter_buff_lists = 0;
-
- for (Cpa16U i = 0; i < num_inst; i++) {
- cpaDcStopInstance(dc_inst_handles[i]);
- QAT_PHYS_CONTIG_FREE(session_handles[i]);
- /* free intermediate buffers */
- if (buffer_array[i] != NULL) {
- cpaDcGetNumIntermediateBuffers(
- dc_inst_handles[i], &num_inter_buff_lists);
- for (buff_num = 0; buff_num < num_inter_buff_lists;
- buff_num++) {
- CpaBufferList *buffer_inter =
- buffer_array[i][buff_num];
- if (buffer_inter->pBuffers) {
- QAT_PHYS_CONTIG_FREE(
- buffer_inter->pBuffers->pData);
- QAT_PHYS_CONTIG_FREE(
- buffer_inter->pBuffers);
- }
- QAT_PHYS_CONTIG_FREE(
- buffer_inter->pPrivateMetaData);
- QAT_PHYS_CONTIG_FREE(buffer_inter);
- }
- }
- }
-
- num_inst = 0;
- qat_dc_init_done = B_FALSE;
-}
-
-int
-qat_dc_init(void)
-{
- CpaStatus status = CPA_STATUS_SUCCESS;
- Cpa32U sess_size = 0;
- Cpa32U ctx_size = 0;
- Cpa16U num_inter_buff_lists = 0;
- Cpa16U buff_num = 0;
- Cpa32U buff_meta_size = 0;
- CpaDcSessionSetupData sd = {0};
-
- if (qat_dc_init_done)
- return (0);
-
- status = cpaDcGetNumInstances(&num_inst);
- if (status != CPA_STATUS_SUCCESS)
- return (-1);
-
- /* if the user has configured no QAT compression units just return */
- if (num_inst == 0)
- return (0);
-
- if (num_inst > QAT_DC_MAX_INSTANCES)
- num_inst = QAT_DC_MAX_INSTANCES;
-
- status = cpaDcGetInstances(num_inst, &dc_inst_handles[0]);
- if (status != CPA_STATUS_SUCCESS)
- return (-1);
-
- for (Cpa16U i = 0; i < num_inst; i++) {
- cpaDcSetAddressTranslation(dc_inst_handles[i],
- (void*)virt_to_phys);
-
- status = cpaDcBufferListGetMetaSize(dc_inst_handles[i],
- 1, &buff_meta_size);
-
- if (status == CPA_STATUS_SUCCESS)
- status = cpaDcGetNumIntermediateBuffers(
- dc_inst_handles[i], &num_inter_buff_lists);
-
- if (status == CPA_STATUS_SUCCESS && num_inter_buff_lists != 0)
- status = QAT_PHYS_CONTIG_ALLOC(&buffer_array[i],
- num_inter_buff_lists *
- sizeof (CpaBufferList *));
-
- for (buff_num = 0; buff_num < num_inter_buff_lists;
- buff_num++) {
- if (status == CPA_STATUS_SUCCESS)
- status = QAT_PHYS_CONTIG_ALLOC(
- &buffer_array[i][buff_num],
- sizeof (CpaBufferList));
-
- if (status == CPA_STATUS_SUCCESS)
- status = QAT_PHYS_CONTIG_ALLOC(
- &buffer_array[i][buff_num]->
- pPrivateMetaData,
- buff_meta_size);
-
- if (status == CPA_STATUS_SUCCESS)
- status = QAT_PHYS_CONTIG_ALLOC(
- &buffer_array[i][buff_num]->pBuffers,
- sizeof (CpaFlatBuffer));
-
- if (status == CPA_STATUS_SUCCESS) {
- /*
- * implementation requires an intermediate
- * buffer approximately twice the size of
- * output buffer, which is 2x max buffer
- * size here.
- */
- status = QAT_PHYS_CONTIG_ALLOC(
- &buffer_array[i][buff_num]->pBuffers->
- pData, 2 * QAT_MAX_BUF_SIZE);
- if (status != CPA_STATUS_SUCCESS)
- goto fail;
-
- buffer_array[i][buff_num]->numBuffers = 1;
- buffer_array[i][buff_num]->pBuffers->
- dataLenInBytes = 2 * QAT_MAX_BUF_SIZE;
- }
- }
-
- status = cpaDcStartInstance(dc_inst_handles[i],
- num_inter_buff_lists, buffer_array[i]);
- if (status != CPA_STATUS_SUCCESS)
- goto fail;
-
- sd.compLevel = CPA_DC_L1;
- sd.compType = CPA_DC_DEFLATE;
- sd.huffType = CPA_DC_HT_FULL_DYNAMIC;
- sd.sessDirection = CPA_DC_DIR_COMBINED;
- sd.sessState = CPA_DC_STATELESS;
- sd.deflateWindowSize = 7;
- sd.checksum = CPA_DC_ADLER32;
- status = cpaDcGetSessionSize(dc_inst_handles[i],
- &sd, &sess_size, &ctx_size);
- if (status != CPA_STATUS_SUCCESS)
- goto fail;
-
- QAT_PHYS_CONTIG_ALLOC(&session_handles[i], sess_size);
- if (session_handles[i] == NULL)
- goto fail;
-
- status = cpaDcInitSession(dc_inst_handles[i],
- session_handles[i],
- &sd, NULL, qat_dc_callback);
- if (status != CPA_STATUS_SUCCESS)
- goto fail;
- }
-
- qat_dc_init_done = B_TRUE;
- return (0);
-fail:
- qat_dc_clean();
- return (-1);
-}
-
-void
-qat_dc_fini(void)
-{
- if (!qat_dc_init_done)
- return;
-
- qat_dc_clean();
-}
-
-/*
- * The "add" parameter is an additional buffer which is passed
- * to QAT as a scratch buffer alongside the destination buffer
- * in case the "compressed" data ends up being larger than the
- * original source data. This is necessary to prevent QAT from
- * generating buffer overflow warnings for incompressible data.
- */
-static int
-qat_compress_impl(qat_compress_dir_t dir, char *src, int src_len,
- char *dst, int dst_len, char *add, int add_len, size_t *c_len)
-{
- CpaInstanceHandle dc_inst_handle;
- CpaDcSessionHandle session_handle;
- CpaBufferList *buf_list_src = NULL;
- CpaBufferList *buf_list_dst = NULL;
- CpaFlatBuffer *flat_buf_src = NULL;
- CpaFlatBuffer *flat_buf_dst = NULL;
- Cpa8U *buffer_meta_src = NULL;
- Cpa8U *buffer_meta_dst = NULL;
- Cpa32U buffer_meta_size = 0;
- CpaDcRqResults dc_results;
- CpaStatus status = CPA_STATUS_SUCCESS;
- Cpa32U hdr_sz = 0;
- Cpa32U compressed_sz;
- Cpa32U num_src_buf = (src_len >> PAGE_SHIFT) + 2;
- Cpa32U num_dst_buf = (dst_len >> PAGE_SHIFT) + 2;
- Cpa32U num_add_buf = (add_len >> PAGE_SHIFT) + 2;
- Cpa32U bytes_left;
- Cpa32U dst_pages = 0;
- Cpa32U adler32 = 0;
- char *data;
- struct page *page;
- struct page **in_pages = NULL;
- struct page **out_pages = NULL;
- struct page **add_pages = NULL;
- Cpa32U page_off = 0;
- struct completion complete;
- Cpa32U page_num = 0;
- Cpa16U i;
-
- /*
- * We increment num_src_buf and num_dst_buf by 2 to allow
- * us to handle non page-aligned buffer addresses and buffers
- * whose sizes are not divisible by PAGE_SIZE.
- */
- Cpa32U src_buffer_list_mem_size = sizeof (CpaBufferList) +
- (num_src_buf * sizeof (CpaFlatBuffer));
- Cpa32U dst_buffer_list_mem_size = sizeof (CpaBufferList) +
- ((num_dst_buf + num_add_buf) * sizeof (CpaFlatBuffer));
-
- if (QAT_PHYS_CONTIG_ALLOC(&in_pages,
- num_src_buf * sizeof (struct page *)) != CPA_STATUS_SUCCESS)
- goto fail;
-
- if (QAT_PHYS_CONTIG_ALLOC(&out_pages,
- num_dst_buf * sizeof (struct page *)) != CPA_STATUS_SUCCESS)
- goto fail;
-
- if (QAT_PHYS_CONTIG_ALLOC(&add_pages,
- num_add_buf * sizeof (struct page *)) != CPA_STATUS_SUCCESS)
- goto fail;
-
- i = (Cpa32U)atomic_inc_32_nv(&inst_num) % num_inst;
- dc_inst_handle = dc_inst_handles[i];
- session_handle = session_handles[i];
-
- cpaDcBufferListGetMetaSize(dc_inst_handle, num_src_buf,
- &buffer_meta_size);
- if (QAT_PHYS_CONTIG_ALLOC(&buffer_meta_src, buffer_meta_size) !=
- CPA_STATUS_SUCCESS)
- goto fail;
-
- cpaDcBufferListGetMetaSize(dc_inst_handle, num_dst_buf + num_add_buf,
- &buffer_meta_size);
- if (QAT_PHYS_CONTIG_ALLOC(&buffer_meta_dst, buffer_meta_size) !=
- CPA_STATUS_SUCCESS)
- goto fail;
-
- /* build source buffer list */
- if (QAT_PHYS_CONTIG_ALLOC(&buf_list_src, src_buffer_list_mem_size) !=
- CPA_STATUS_SUCCESS)
- goto fail;
-
- flat_buf_src = (CpaFlatBuffer *)(buf_list_src + 1);
-
- buf_list_src->pBuffers = flat_buf_src; /* always point to first one */
-
- /* build destination buffer list */
- if (QAT_PHYS_CONTIG_ALLOC(&buf_list_dst, dst_buffer_list_mem_size) !=
- CPA_STATUS_SUCCESS)
- goto fail;
-
- flat_buf_dst = (CpaFlatBuffer *)(buf_list_dst + 1);
-
- buf_list_dst->pBuffers = flat_buf_dst; /* always point to first one */
-
- buf_list_src->numBuffers = 0;
- buf_list_src->pPrivateMetaData = buffer_meta_src;
- bytes_left = src_len;
- data = src;
- page_num = 0;
- while (bytes_left > 0) {
- page_off = ((long)data & ~PAGE_MASK);
- page = qat_mem_to_page(data);
- in_pages[page_num] = page;
- flat_buf_src->pData = kmap(page) + page_off;
- flat_buf_src->dataLenInBytes =
- min((long)PAGE_SIZE - page_off, (long)bytes_left);
-
- bytes_left -= flat_buf_src->dataLenInBytes;
- data += flat_buf_src->dataLenInBytes;
- flat_buf_src++;
- buf_list_src->numBuffers++;
- page_num++;
- }
-
- buf_list_dst->numBuffers = 0;
- buf_list_dst->pPrivateMetaData = buffer_meta_dst;
- bytes_left = dst_len;
- data = dst;
- page_num = 0;
- while (bytes_left > 0) {
- page_off = ((long)data & ~PAGE_MASK);
- page = qat_mem_to_page(data);
- flat_buf_dst->pData = kmap(page) + page_off;
- out_pages[page_num] = page;
- flat_buf_dst->dataLenInBytes =
- min((long)PAGE_SIZE - page_off, (long)bytes_left);
-
- bytes_left -= flat_buf_dst->dataLenInBytes;
- data += flat_buf_dst->dataLenInBytes;
- flat_buf_dst++;
- buf_list_dst->numBuffers++;
- page_num++;
- dst_pages++;
- }
-
- /* map additional scratch pages into the destination buffer list */
- bytes_left = add_len;
- data = add;
- page_num = 0;
- while (bytes_left > 0) {
- page_off = ((long)data & ~PAGE_MASK);
- page = qat_mem_to_page(data);
- flat_buf_dst->pData = kmap(page) + page_off;
- add_pages[page_num] = page;
- flat_buf_dst->dataLenInBytes =
- min((long)PAGE_SIZE - page_off, (long)bytes_left);
-
- bytes_left -= flat_buf_dst->dataLenInBytes;
- data += flat_buf_dst->dataLenInBytes;
- flat_buf_dst++;
- buf_list_dst->numBuffers++;
- page_num++;
- }
-
- init_completion(&complete);
-
- if (dir == QAT_COMPRESS) {
- QAT_STAT_BUMP(comp_requests);
- QAT_STAT_INCR(comp_total_in_bytes, src_len);
-
- cpaDcGenerateHeader(session_handle,
- buf_list_dst->pBuffers, &hdr_sz);
- buf_list_dst->pBuffers->pData += hdr_sz;
- buf_list_dst->pBuffers->dataLenInBytes -= hdr_sz;
- status = cpaDcCompressData(
- dc_inst_handle, session_handle,
- buf_list_src, buf_list_dst,
- &dc_results, CPA_DC_FLUSH_FINAL,
- &complete);
- if (status != CPA_STATUS_SUCCESS) {
- goto fail;
- }
-
- /* we now wait until the completion of the operation. */
- if (!wait_for_completion_interruptible_timeout(&complete,
- QAT_TIMEOUT_MS)) {
- status = CPA_STATUS_FAIL;
- goto fail;
- }
-
- if (dc_results.status != CPA_STATUS_SUCCESS) {
- status = CPA_STATUS_FAIL;
- goto fail;
- }
-
- compressed_sz = dc_results.produced;
- if (compressed_sz + hdr_sz + ZLIB_FOOT_SZ > dst_len) {
- status = CPA_STATUS_INCOMPRESSIBLE;
- goto fail;
- }
-
- flat_buf_dst = (CpaFlatBuffer *)(buf_list_dst + 1);
- /* move to the last page */
- flat_buf_dst += (compressed_sz + hdr_sz) >> PAGE_SHIFT;
-
- /* no space for gzip footer in the last page */
- if (((compressed_sz + hdr_sz) % PAGE_SIZE)
- + ZLIB_FOOT_SZ > PAGE_SIZE) {
- status = CPA_STATUS_INCOMPRESSIBLE;
- goto fail;
- }
-
- /* jump to the end of the buffer and append footer */
- flat_buf_dst->pData =
- (char *)((unsigned long)flat_buf_dst->pData & PAGE_MASK)
- + ((compressed_sz + hdr_sz) % PAGE_SIZE);
- flat_buf_dst->dataLenInBytes = ZLIB_FOOT_SZ;
-
- dc_results.produced = 0;
- status = cpaDcGenerateFooter(session_handle,
- flat_buf_dst, &dc_results);
- if (status != CPA_STATUS_SUCCESS)
- goto fail;
-
- *c_len = compressed_sz + dc_results.produced + hdr_sz;
- QAT_STAT_INCR(comp_total_out_bytes, *c_len);
- } else {
- ASSERT3U(dir, ==, QAT_DECOMPRESS);
- QAT_STAT_BUMP(decomp_requests);
- QAT_STAT_INCR(decomp_total_in_bytes, src_len);
-
- buf_list_src->pBuffers->pData += ZLIB_HEAD_SZ;
- buf_list_src->pBuffers->dataLenInBytes -= ZLIB_HEAD_SZ;
- status = cpaDcDecompressData(dc_inst_handle, session_handle,
- buf_list_src, buf_list_dst, &dc_results, CPA_DC_FLUSH_FINAL,
- &complete);
-
- if (CPA_STATUS_SUCCESS != status) {
- status = CPA_STATUS_FAIL;
- goto fail;
- }
-
- /* we now wait until the completion of the operation. */
- if (!wait_for_completion_interruptible_timeout(&complete,
- QAT_TIMEOUT_MS)) {
- status = CPA_STATUS_FAIL;
- goto fail;
- }
-
- if (dc_results.status != CPA_STATUS_SUCCESS) {
- status = CPA_STATUS_FAIL;
- goto fail;
- }
-
- /* verify adler checksum */
- adler32 = *(Cpa32U *)(src + dc_results.consumed + ZLIB_HEAD_SZ);
- if (adler32 != BSWAP_32(dc_results.checksum)) {
- status = CPA_STATUS_FAIL;
- goto fail;
- }
- *c_len = dc_results.produced;
- QAT_STAT_INCR(decomp_total_out_bytes, *c_len);
- }
-
-fail:
- if (status != CPA_STATUS_SUCCESS && status != CPA_STATUS_INCOMPRESSIBLE)
- QAT_STAT_BUMP(dc_fails);
-
- if (in_pages) {
- for (page_num = 0;
- page_num < buf_list_src->numBuffers;
- page_num++) {
- kunmap(in_pages[page_num]);
- }
- QAT_PHYS_CONTIG_FREE(in_pages);
- }
-
- if (out_pages) {
- for (page_num = 0; page_num < dst_pages; page_num++) {
- kunmap(out_pages[page_num]);
- }
- QAT_PHYS_CONTIG_FREE(out_pages);
- }
-
- if (add_pages) {
- for (page_num = 0;
- page_num < buf_list_dst->numBuffers - dst_pages;
- page_num++) {
- kunmap(add_pages[page_num]);
- }
- QAT_PHYS_CONTIG_FREE(add_pages);
- }
-
- QAT_PHYS_CONTIG_FREE(buffer_meta_src);
- QAT_PHYS_CONTIG_FREE(buffer_meta_dst);
- QAT_PHYS_CONTIG_FREE(buf_list_src);
- QAT_PHYS_CONTIG_FREE(buf_list_dst);
-
- return (status);
-}
-
-/*
- * Entry point for QAT accelerated compression / decompression.
- */
-int
-qat_compress(qat_compress_dir_t dir, char *src, int src_len,
- char *dst, int dst_len, size_t *c_len)
-{
- int ret;
- size_t add_len = 0;
- void *add = NULL;
-
- if (dir == QAT_COMPRESS) {
- add_len = dst_len;
- add = zio_data_buf_alloc(add_len);
- }
-
- ret = qat_compress_impl(dir, src, src_len, dst,
- dst_len, add, add_len, c_len);
-
- if (dir == QAT_COMPRESS)
- zio_data_buf_free(add, add_len);
-
- return (ret);
-}
-
-static int
-param_set_qat_compress(const char *val, zfs_kernel_param_t *kp)
-{
- int ret;
- int *pvalue = kp->arg;
- ret = param_set_int(val, kp);
- if (ret)
- return (ret);
- /*
- * zfs_qat_compress_disable = 0: enable qat compress
- * try to initialize qat instance if it has not been done
- */
- if (*pvalue == 0 && !qat_dc_init_done) {
- ret = qat_dc_init();
- if (ret != 0) {
- zfs_qat_compress_disable = 1;
- return (ret);
- }
- }
- return (ret);
-}
-
-module_param_call(zfs_qat_compress_disable, param_set_qat_compress,
- param_get_int, &zfs_qat_compress_disable, 0644);
-MODULE_PARM_DESC(zfs_qat_compress_disable, "Enable/Disable QAT compression");
-
-#endif
diff --git a/module/zfs/qat_crypt.c b/module/zfs/qat_crypt.c
deleted file mode 100644
index 02e19d21d..000000000
--- a/module/zfs/qat_crypt.c
+++ /dev/null
@@ -1,631 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * This file represents the QAT implementation of checksums and encryption.
- * Internally, QAT shares the same cryptographic instances for both of these
- * operations, so the code has been combined here. QAT data compression uses
- * compression instances, so that code is separated into qat_compress.c
- */
-
-#if defined(_KERNEL) && defined(HAVE_QAT)
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/pagemap.h>
-#include <linux/completion.h>
-#include <sys/zfs_context.h>
-#include <sys/zio_crypt.h>
-#include "lac/cpa_cy_im.h"
-#include "lac/cpa_cy_common.h"
-#include "qat.h"
-
-/*
- * Max instances in a QAT device, each instance is a channel to submit
- * jobs to QAT hardware, this is only for pre-allocating instances
- * and session arrays; the actual number of instances are defined in
- * the QAT driver's configure file.
- */
-#define QAT_CRYPT_MAX_INSTANCES 48
-
-#define MAX_PAGE_NUM 1024
-
-static Cpa32U inst_num = 0;
-static Cpa16U num_inst = 0;
-static CpaInstanceHandle cy_inst_handles[QAT_CRYPT_MAX_INSTANCES];
-static boolean_t qat_cy_init_done = B_FALSE;
-int zfs_qat_encrypt_disable = 0;
-int zfs_qat_checksum_disable = 0;
-
-typedef struct cy_callback {
- CpaBoolean verify_result;
- struct completion complete;
-} cy_callback_t;
-
-static void
-symcallback(void *p_callback, CpaStatus status, const CpaCySymOp operation,
- void *op_data, CpaBufferList *buf_list_dst, CpaBoolean verify)
-{
- cy_callback_t *cb = p_callback;
-
- if (cb != NULL) {
- /* indicate that the function has been called */
- cb->verify_result = verify;
- complete(&cb->complete);
- }
-}
-
-boolean_t
-qat_crypt_use_accel(size_t s_len)
-{
- return (!zfs_qat_encrypt_disable &&
- qat_cy_init_done &&
- s_len >= QAT_MIN_BUF_SIZE &&
- s_len <= QAT_MAX_BUF_SIZE);
-}
-
-boolean_t
-qat_checksum_use_accel(size_t s_len)
-{
- return (!zfs_qat_checksum_disable &&
- qat_cy_init_done &&
- s_len >= QAT_MIN_BUF_SIZE &&
- s_len <= QAT_MAX_BUF_SIZE);
-}
-
-void
-qat_cy_clean(void)
-{
- for (Cpa16U i = 0; i < num_inst; i++)
- cpaCyStopInstance(cy_inst_handles[i]);
-
- num_inst = 0;
- qat_cy_init_done = B_FALSE;
-}
-
-int
-qat_cy_init(void)
-{
- CpaStatus status = CPA_STATUS_FAIL;
-
- if (qat_cy_init_done)
- return (0);
-
- status = cpaCyGetNumInstances(&num_inst);
- if (status != CPA_STATUS_SUCCESS)
- return (-1);
-
- /* if the user has configured no QAT encryption units just return */
- if (num_inst == 0)
- return (0);
-
- if (num_inst > QAT_CRYPT_MAX_INSTANCES)
- num_inst = QAT_CRYPT_MAX_INSTANCES;
-
- status = cpaCyGetInstances(num_inst, &cy_inst_handles[0]);
- if (status != CPA_STATUS_SUCCESS)
- return (-1);
-
- for (Cpa16U i = 0; i < num_inst; i++) {
- status = cpaCySetAddressTranslation(cy_inst_handles[i],
- (void *)virt_to_phys);
- if (status != CPA_STATUS_SUCCESS)
- goto error;
-
- status = cpaCyStartInstance(cy_inst_handles[i]);
- if (status != CPA_STATUS_SUCCESS)
- goto error;
- }
-
- qat_cy_init_done = B_TRUE;
- return (0);
-
-error:
- qat_cy_clean();
- return (-1);
-}
-
-void
-qat_cy_fini(void)
-{
- if (!qat_cy_init_done)
- return;
-
- qat_cy_clean();
-}
-
-static CpaStatus
-qat_init_crypt_session_ctx(qat_encrypt_dir_t dir, CpaInstanceHandle inst_handle,
- CpaCySymSessionCtx **cy_session_ctx, crypto_key_t *key,
- Cpa64U crypt, Cpa32U aad_len)
-{
- CpaStatus status = CPA_STATUS_SUCCESS;
- Cpa32U ctx_size;
- Cpa32U ciper_algorithm;
- Cpa32U hash_algorithm;
- CpaCySymSessionSetupData sd = { 0 };
-
- if (zio_crypt_table[crypt].ci_crypt_type == ZC_TYPE_CCM) {
- return (CPA_STATUS_FAIL);
- } else {
- ciper_algorithm = CPA_CY_SYM_CIPHER_AES_GCM;
- hash_algorithm = CPA_CY_SYM_HASH_AES_GCM;
- }
-
- sd.cipherSetupData.cipherAlgorithm = ciper_algorithm;
- sd.cipherSetupData.pCipherKey = key->ck_data;
- sd.cipherSetupData.cipherKeyLenInBytes = key->ck_length / 8;
- sd.hashSetupData.hashAlgorithm = hash_algorithm;
- sd.hashSetupData.hashMode = CPA_CY_SYM_HASH_MODE_AUTH;
- sd.hashSetupData.digestResultLenInBytes = ZIO_DATA_MAC_LEN;
- sd.hashSetupData.authModeSetupData.aadLenInBytes = aad_len;
- sd.sessionPriority = CPA_CY_PRIORITY_NORMAL;
- sd.symOperation = CPA_CY_SYM_OP_ALGORITHM_CHAINING;
- sd.digestIsAppended = CPA_FALSE;
- sd.verifyDigest = CPA_FALSE;
-
- if (dir == QAT_ENCRYPT) {
- sd.cipherSetupData.cipherDirection =
- CPA_CY_SYM_CIPHER_DIRECTION_ENCRYPT;
- sd.algChainOrder =
- CPA_CY_SYM_ALG_CHAIN_ORDER_HASH_THEN_CIPHER;
- } else {
- ASSERT3U(dir, ==, QAT_DECRYPT);
- sd.cipherSetupData.cipherDirection =
- CPA_CY_SYM_CIPHER_DIRECTION_DECRYPT;
- sd.algChainOrder =
- CPA_CY_SYM_ALG_CHAIN_ORDER_CIPHER_THEN_HASH;
- }
-
- status = cpaCySymSessionCtxGetSize(inst_handle, &sd, &ctx_size);
- if (status != CPA_STATUS_SUCCESS)
- return (status);
-
- status = QAT_PHYS_CONTIG_ALLOC(cy_session_ctx, ctx_size);
- if (status != CPA_STATUS_SUCCESS)
- return (status);
-
- status = cpaCySymInitSession(inst_handle, symcallback, &sd,
- *cy_session_ctx);
- if (status != CPA_STATUS_SUCCESS) {
- QAT_PHYS_CONTIG_FREE(*cy_session_ctx);
- return (status);
- }
-
- return (CPA_STATUS_SUCCESS);
-}
-
-static CpaStatus
-qat_init_checksum_session_ctx(CpaInstanceHandle inst_handle,
- CpaCySymSessionCtx **cy_session_ctx, Cpa64U cksum)
-{
- CpaStatus status = CPA_STATUS_SUCCESS;
- Cpa32U ctx_size;
- Cpa32U hash_algorithm;
- CpaCySymSessionSetupData sd = { 0 };
-
- /*
- * ZFS's SHA512 checksum is actually SHA512/256, which uses
- * a different IV from standard SHA512. QAT does not support
- * SHA512/256, so we can only support SHA256.
- */
- if (cksum == ZIO_CHECKSUM_SHA256)
- hash_algorithm = CPA_CY_SYM_HASH_SHA256;
- else
- return (CPA_STATUS_FAIL);
-
- sd.sessionPriority = CPA_CY_PRIORITY_NORMAL;
- sd.symOperation = CPA_CY_SYM_OP_HASH;
- sd.hashSetupData.hashAlgorithm = hash_algorithm;
- sd.hashSetupData.hashMode = CPA_CY_SYM_HASH_MODE_PLAIN;
- sd.hashSetupData.digestResultLenInBytes = sizeof (zio_cksum_t);
- sd.digestIsAppended = CPA_FALSE;
- sd.verifyDigest = CPA_FALSE;
-
- status = cpaCySymSessionCtxGetSize(inst_handle, &sd, &ctx_size);
- if (status != CPA_STATUS_SUCCESS)
- return (status);
-
- status = QAT_PHYS_CONTIG_ALLOC(cy_session_ctx, ctx_size);
- if (status != CPA_STATUS_SUCCESS)
- return (status);
-
- status = cpaCySymInitSession(inst_handle, symcallback, &sd,
- *cy_session_ctx);
- if (status != CPA_STATUS_SUCCESS) {
- QAT_PHYS_CONTIG_FREE(*cy_session_ctx);
- return (status);
- }
-
- return (CPA_STATUS_SUCCESS);
-}
-
-static CpaStatus
-qat_init_cy_buffer_lists(CpaInstanceHandle inst_handle, uint32_t nr_bufs,
- CpaBufferList *src, CpaBufferList *dst)
-{
- CpaStatus status = CPA_STATUS_SUCCESS;
- Cpa32U meta_size = 0;
-
- status = cpaCyBufferListGetMetaSize(inst_handle, nr_bufs, &meta_size);
- if (status != CPA_STATUS_SUCCESS)
- return (status);
-
- status = QAT_PHYS_CONTIG_ALLOC(&src->pPrivateMetaData, meta_size);
- if (status != CPA_STATUS_SUCCESS)
- goto error;
-
- if (src != dst) {
- status = QAT_PHYS_CONTIG_ALLOC(&dst->pPrivateMetaData,
- meta_size);
- if (status != CPA_STATUS_SUCCESS)
- goto error;
- }
-
- return (CPA_STATUS_SUCCESS);
-
-error:
- QAT_PHYS_CONTIG_FREE(src->pPrivateMetaData);
- if (src != dst)
- QAT_PHYS_CONTIG_FREE(dst->pPrivateMetaData);
-
- return (status);
-}
-
-int
-qat_crypt(qat_encrypt_dir_t dir, uint8_t *src_buf, uint8_t *dst_buf,
- uint8_t *aad_buf, uint32_t aad_len, uint8_t *iv_buf, uint8_t *digest_buf,
- crypto_key_t *key, uint64_t crypt, uint32_t enc_len)
-{
- CpaStatus status = CPA_STATUS_SUCCESS;
- Cpa16U i;
- CpaInstanceHandle cy_inst_handle;
- Cpa16U nr_bufs = (enc_len >> PAGE_SHIFT) + 2;
- Cpa32U bytes_left = 0;
- Cpa8S *data = NULL;
- CpaCySymSessionCtx *cy_session_ctx = NULL;
- cy_callback_t cb;
- CpaCySymOpData op_data = { 0 };
- CpaBufferList src_buffer_list = { 0 };
- CpaBufferList dst_buffer_list = { 0 };
- CpaFlatBuffer *flat_src_buf_array = NULL;
- CpaFlatBuffer *flat_src_buf = NULL;
- CpaFlatBuffer *flat_dst_buf_array = NULL;
- CpaFlatBuffer *flat_dst_buf = NULL;
- struct page *in_pages[MAX_PAGE_NUM];
- struct page *out_pages[MAX_PAGE_NUM];
- Cpa32U in_page_num = 0;
- Cpa32U out_page_num = 0;
- Cpa32U in_page_off = 0;
- Cpa32U out_page_off = 0;
-
- if (dir == QAT_ENCRYPT) {
- QAT_STAT_BUMP(encrypt_requests);
- QAT_STAT_INCR(encrypt_total_in_bytes, enc_len);
- } else {
- QAT_STAT_BUMP(decrypt_requests);
- QAT_STAT_INCR(decrypt_total_in_bytes, enc_len);
- }
-
- i = (Cpa32U)atomic_inc_32_nv(&inst_num) % num_inst;
- cy_inst_handle = cy_inst_handles[i];
-
- status = qat_init_crypt_session_ctx(dir, cy_inst_handle,
- &cy_session_ctx, key, crypt, aad_len);
- if (status != CPA_STATUS_SUCCESS) {
- /* don't count CCM as a failure since it's not supported */
- if (zio_crypt_table[crypt].ci_crypt_type == ZC_TYPE_GCM)
- QAT_STAT_BUMP(crypt_fails);
- return (status);
- }
-
- /*
- * We increment nr_bufs by 2 to allow us to handle non
- * page-aligned buffer addresses and buffers whose sizes
- * are not divisible by PAGE_SIZE.
- */
- status = qat_init_cy_buffer_lists(cy_inst_handle, nr_bufs,
- &src_buffer_list, &dst_buffer_list);
- if (status != CPA_STATUS_SUCCESS)
- goto fail;
-
- status = QAT_PHYS_CONTIG_ALLOC(&flat_src_buf_array,
- nr_bufs * sizeof (CpaFlatBuffer));
- if (status != CPA_STATUS_SUCCESS)
- goto fail;
- status = QAT_PHYS_CONTIG_ALLOC(&flat_dst_buf_array,
- nr_bufs * sizeof (CpaFlatBuffer));
- if (status != CPA_STATUS_SUCCESS)
- goto fail;
- status = QAT_PHYS_CONTIG_ALLOC(&op_data.pDigestResult,
- ZIO_DATA_MAC_LEN);
- if (status != CPA_STATUS_SUCCESS)
- goto fail;
- status = QAT_PHYS_CONTIG_ALLOC(&op_data.pIv,
- ZIO_DATA_IV_LEN);
- if (status != CPA_STATUS_SUCCESS)
- goto fail;
- if (aad_len > 0) {
- status = QAT_PHYS_CONTIG_ALLOC(&op_data.pAdditionalAuthData,
- aad_len);
- if (status != CPA_STATUS_SUCCESS)
- goto fail;
- bcopy(aad_buf, op_data.pAdditionalAuthData, aad_len);
- }
-
- bytes_left = enc_len;
- data = src_buf;
- flat_src_buf = flat_src_buf_array;
- while (bytes_left > 0) {
- in_page_off = ((long)data & ~PAGE_MASK);
- in_pages[in_page_num] = qat_mem_to_page(data);
- flat_src_buf->pData = kmap(in_pages[in_page_num]) + in_page_off;
- flat_src_buf->dataLenInBytes =
- min((long)PAGE_SIZE - in_page_off, (long)bytes_left);
- data += flat_src_buf->dataLenInBytes;
- bytes_left -= flat_src_buf->dataLenInBytes;
- flat_src_buf++;
- in_page_num++;
- }
- src_buffer_list.pBuffers = flat_src_buf_array;
- src_buffer_list.numBuffers = in_page_num;
-
- bytes_left = enc_len;
- data = dst_buf;
- flat_dst_buf = flat_dst_buf_array;
- while (bytes_left > 0) {
- out_page_off = ((long)data & ~PAGE_MASK);
- out_pages[out_page_num] = qat_mem_to_page(data);
- flat_dst_buf->pData = kmap(out_pages[out_page_num]) +
- out_page_off;
- flat_dst_buf->dataLenInBytes =
- min((long)PAGE_SIZE - out_page_off, (long)bytes_left);
- data += flat_dst_buf->dataLenInBytes;
- bytes_left -= flat_dst_buf->dataLenInBytes;
- flat_dst_buf++;
- out_page_num++;
- }
- dst_buffer_list.pBuffers = flat_dst_buf_array;
- dst_buffer_list.numBuffers = out_page_num;
-
- op_data.sessionCtx = cy_session_ctx;
- op_data.packetType = CPA_CY_SYM_PACKET_TYPE_FULL;
- op_data.cryptoStartSrcOffsetInBytes = 0;
- op_data.messageLenToCipherInBytes = 0;
- op_data.hashStartSrcOffsetInBytes = 0;
- op_data.messageLenToHashInBytes = 0;
- op_data.messageLenToCipherInBytes = enc_len;
- op_data.ivLenInBytes = ZIO_DATA_IV_LEN;
- bcopy(iv_buf, op_data.pIv, ZIO_DATA_IV_LEN);
-
- cb.verify_result = CPA_FALSE;
- init_completion(&cb.complete);
- status = cpaCySymPerformOp(cy_inst_handle, &cb, &op_data,
- &src_buffer_list, &dst_buffer_list, NULL);
- if (status != CPA_STATUS_SUCCESS)
- goto fail;
-
- if (!wait_for_completion_interruptible_timeout(&cb.complete,
- QAT_TIMEOUT_MS)) {
- status = CPA_STATUS_FAIL;
- goto fail;
- }
-
- if (cb.verify_result == CPA_FALSE) {
- status = CPA_STATUS_FAIL;
- goto fail;
- }
-
- /* save digest result to digest_buf */
- bcopy(op_data.pDigestResult, digest_buf, ZIO_DATA_MAC_LEN);
- if (dir == QAT_ENCRYPT)
- QAT_STAT_INCR(encrypt_total_out_bytes, enc_len);
- else
- QAT_STAT_INCR(decrypt_total_out_bytes, enc_len);
-
-fail:
- if (status != CPA_STATUS_SUCCESS)
- QAT_STAT_BUMP(crypt_fails);
-
- for (i = 0; i < in_page_num; i++)
- kunmap(in_pages[i]);
- for (i = 0; i < out_page_num; i++)
- kunmap(out_pages[i]);
-
- cpaCySymRemoveSession(cy_inst_handle, cy_session_ctx);
- if (aad_len > 0)
- QAT_PHYS_CONTIG_FREE(op_data.pAdditionalAuthData);
- QAT_PHYS_CONTIG_FREE(op_data.pIv);
- QAT_PHYS_CONTIG_FREE(op_data.pDigestResult);
- QAT_PHYS_CONTIG_FREE(src_buffer_list.pPrivateMetaData);
- QAT_PHYS_CONTIG_FREE(dst_buffer_list.pPrivateMetaData);
- QAT_PHYS_CONTIG_FREE(cy_session_ctx);
- QAT_PHYS_CONTIG_FREE(flat_src_buf_array);
- QAT_PHYS_CONTIG_FREE(flat_dst_buf_array);
-
- return (status);
-}
-
-int
-qat_checksum(uint64_t cksum, uint8_t *buf, uint64_t size, zio_cksum_t *zcp)
-{
- CpaStatus status;
- Cpa16U i;
- CpaInstanceHandle cy_inst_handle;
- Cpa16U nr_bufs = (size >> PAGE_SHIFT) + 2;
- Cpa32U bytes_left = 0;
- Cpa8S *data = NULL;
- CpaCySymSessionCtx *cy_session_ctx = NULL;
- cy_callback_t cb;
- Cpa8U *digest_buffer = NULL;
- CpaCySymOpData op_data = { 0 };
- CpaBufferList src_buffer_list = { 0 };
- CpaFlatBuffer *flat_src_buf_array = NULL;
- CpaFlatBuffer *flat_src_buf = NULL;
- struct page *in_pages[MAX_PAGE_NUM];
- Cpa32U page_num = 0;
- Cpa32U page_off = 0;
-
- QAT_STAT_BUMP(cksum_requests);
- QAT_STAT_INCR(cksum_total_in_bytes, size);
-
- i = (Cpa32U)atomic_inc_32_nv(&inst_num) % num_inst;
- cy_inst_handle = cy_inst_handles[i];
-
- status = qat_init_checksum_session_ctx(cy_inst_handle,
- &cy_session_ctx, cksum);
- if (status != CPA_STATUS_SUCCESS) {
- /* don't count unsupported checksums as a failure */
- if (cksum == ZIO_CHECKSUM_SHA256 ||
- cksum == ZIO_CHECKSUM_SHA512)
- QAT_STAT_BUMP(cksum_fails);
- return (status);
- }
-
- /*
- * We increment nr_bufs by 2 to allow us to handle non
- * page-aligned buffer addresses and buffers whose sizes
- * are not divisible by PAGE_SIZE.
- */
- status = qat_init_cy_buffer_lists(cy_inst_handle, nr_bufs,
- &src_buffer_list, &src_buffer_list);
- if (status != CPA_STATUS_SUCCESS)
- goto fail;
-
- status = QAT_PHYS_CONTIG_ALLOC(&flat_src_buf_array,
- nr_bufs * sizeof (CpaFlatBuffer));
- if (status != CPA_STATUS_SUCCESS)
- goto fail;
- status = QAT_PHYS_CONTIG_ALLOC(&digest_buffer,
- sizeof (zio_cksum_t));
- if (status != CPA_STATUS_SUCCESS)
- goto fail;
-
- bytes_left = size;
- data = buf;
- flat_src_buf = flat_src_buf_array;
- while (bytes_left > 0) {
- page_off = ((long)data & ~PAGE_MASK);
- in_pages[page_num] = qat_mem_to_page(data);
- flat_src_buf->pData = kmap(in_pages[page_num]) + page_off;
- flat_src_buf->dataLenInBytes =
- min((long)PAGE_SIZE - page_off, (long)bytes_left);
- data += flat_src_buf->dataLenInBytes;
- bytes_left -= flat_src_buf->dataLenInBytes;
- flat_src_buf++;
- page_num++;
- }
- src_buffer_list.pBuffers = flat_src_buf_array;
- src_buffer_list.numBuffers = page_num;
-
- op_data.sessionCtx = cy_session_ctx;
- op_data.packetType = CPA_CY_SYM_PACKET_TYPE_FULL;
- op_data.hashStartSrcOffsetInBytes = 0;
- op_data.messageLenToHashInBytes = size;
- op_data.pDigestResult = digest_buffer;
-
- cb.verify_result = CPA_FALSE;
- init_completion(&cb.complete);
- status = cpaCySymPerformOp(cy_inst_handle, &cb, &op_data,
- &src_buffer_list, &src_buffer_list, NULL);
- if (status != CPA_STATUS_SUCCESS)
- goto fail;
-
- if (!wait_for_completion_interruptible_timeout(&cb.complete,
- QAT_TIMEOUT_MS)) {
- status = CPA_STATUS_FAIL;
- goto fail;
- }
- if (cb.verify_result == CPA_FALSE) {
- status = CPA_STATUS_FAIL;
- goto fail;
- }
-
- bcopy(digest_buffer, zcp, sizeof (zio_cksum_t));
-
-fail:
- if (status != CPA_STATUS_SUCCESS)
- QAT_STAT_BUMP(cksum_fails);
-
- for (i = 0; i < page_num; i++)
- kunmap(in_pages[i]);
-
- cpaCySymRemoveSession(cy_inst_handle, cy_session_ctx);
- QAT_PHYS_CONTIG_FREE(digest_buffer);
- QAT_PHYS_CONTIG_FREE(src_buffer_list.pPrivateMetaData);
- QAT_PHYS_CONTIG_FREE(cy_session_ctx);
- QAT_PHYS_CONTIG_FREE(flat_src_buf_array);
-
- return (status);
-}
-
-static int
-param_set_qat_encrypt(const char *val, zfs_kernel_param_t *kp)
-{
- int ret;
- int *pvalue = kp->arg;
- ret = param_set_int(val, kp);
- if (ret)
- return (ret);
- /*
- * zfs_qat_encrypt_disable = 0: enable qat encrypt
- * try to initialize qat instance if it has not been done
- */
- if (*pvalue == 0 && !qat_cy_init_done) {
- ret = qat_cy_init();
- if (ret != 0) {
- zfs_qat_encrypt_disable = 1;
- return (ret);
- }
- }
- return (ret);
-}
-
-static int
-param_set_qat_checksum(const char *val, zfs_kernel_param_t *kp)
-{
- int ret;
- int *pvalue = kp->arg;
- ret = param_set_int(val, kp);
- if (ret)
- return (ret);
- /*
- * set_checksum_param_ops = 0: enable qat checksum
- * try to initialize qat instance if it has not been done
- */
- if (*pvalue == 0 && !qat_cy_init_done) {
- ret = qat_cy_init();
- if (ret != 0) {
- zfs_qat_checksum_disable = 1;
- return (ret);
- }
- }
- return (ret);
-}
-
-module_param_call(zfs_qat_encrypt_disable, param_set_qat_encrypt,
- param_get_int, &zfs_qat_encrypt_disable, 0644);
-MODULE_PARM_DESC(zfs_qat_encrypt_disable, "Enable/Disable QAT encryption");
-
-module_param_call(zfs_qat_checksum_disable, param_set_qat_checksum,
- param_get_int, &zfs_qat_checksum_disable, 0644);
-MODULE_PARM_DESC(zfs_qat_checksum_disable, "Enable/Disable QAT checksumming");
-
-#endif
diff --git a/module/zfs/sha256.c b/module/zfs/sha256.c
index 2adadf56f..406c926a0 100644
--- a/module/zfs/sha256.c
+++ b/module/zfs/sha256.c
@@ -30,7 +30,7 @@
#include <sys/zio.h>
#include <sys/sha2.h>
#include <sys/abd.h>
-#include "qat.h"
+#include <sys/qat.h>
static int
sha_incremental(void *buf, size_t size, void *arg)
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index b0c1ae1e6..a18f9604a 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -59,7 +59,7 @@
#include <sys/kstat.h>
#include "zfs_prop.h"
#include <sys/zfeature.h>
-#include "qat.h"
+#include <sys/qat.h>
/*
* SPA locking
diff --git a/module/zfs/spa_stats.c b/module/zfs/spa_stats.c
deleted file mode 100644
index 6895428f4..000000000
--- a/module/zfs/spa_stats.c
+++ /dev/null
@@ -1,1034 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-#include <sys/zfs_context.h>
-#include <sys/spa_impl.h>
-#include <sys/vdev_impl.h>
-#include <sys/spa.h>
-#include <zfs_comutil.h>
-
-/*
- * Keeps stats on last N reads per spa_t, disabled by default.
- */
-int zfs_read_history = 0;
-
-/*
- * Include cache hits in history, disabled by default.
- */
-int zfs_read_history_hits = 0;
-
-/*
- * Keeps stats on the last 100 txgs by default.
- */
-int zfs_txg_history = 100;
-
-/*
- * Keeps stats on the last N MMP updates, disabled by default.
- */
-int zfs_multihost_history = 0;
-
-/*
- * ==========================================================================
- * SPA Read History Routines
- * ==========================================================================
- */
-
-/*
- * Read statistics - Information exported regarding each arc_read call
- */
-typedef struct spa_read_history {
- hrtime_t start; /* time read completed */
- uint64_t objset; /* read from this objset */
- uint64_t object; /* read of this object number */
- uint64_t level; /* block's indirection level */
- uint64_t blkid; /* read of this block id */
- char origin[24]; /* read originated from here */
- uint32_t aflags; /* ARC flags (cached, prefetch, etc.) */
- pid_t pid; /* PID of task doing read */
- char comm[16]; /* process name of task doing read */
- procfs_list_node_t srh_node;
-} spa_read_history_t;
-
-static int
-spa_read_history_show_header(struct seq_file *f)
-{
- seq_printf(f, "%-8s %-16s %-8s %-8s %-8s %-8s %-8s "
- "%-24s %-8s %-16s\n", "UID", "start", "objset", "object",
- "level", "blkid", "aflags", "origin", "pid", "process");
-
- return (0);
-}
-
-static int
-spa_read_history_show(struct seq_file *f, void *data)
-{
- spa_read_history_t *srh = (spa_read_history_t *)data;
-
- seq_printf(f, "%-8llu %-16llu 0x%-6llx "
- "%-8lli %-8lli %-8lli 0x%-6x %-24s %-8i %-16s\n",
- (u_longlong_t)srh->srh_node.pln_id, srh->start,
- (longlong_t)srh->objset, (longlong_t)srh->object,
- (longlong_t)srh->level, (longlong_t)srh->blkid,
- srh->aflags, srh->origin, srh->pid, srh->comm);
-
- return (0);
-}
-
-/* Remove oldest elements from list until there are no more than 'size' left */
-static void
-spa_read_history_truncate(spa_history_list_t *shl, unsigned int size)
-{
- spa_read_history_t *srh;
- while (shl->size > size) {
- srh = list_remove_head(&shl->procfs_list.pl_list);
- ASSERT3P(srh, !=, NULL);
- kmem_free(srh, sizeof (spa_read_history_t));
- shl->size--;
- }
-
- if (size == 0)
- ASSERT(list_is_empty(&shl->procfs_list.pl_list));
-}
-
-static int
-spa_read_history_clear(procfs_list_t *procfs_list)
-{
- spa_history_list_t *shl = procfs_list->pl_private;
- mutex_enter(&procfs_list->pl_lock);
- spa_read_history_truncate(shl, 0);
- mutex_exit(&procfs_list->pl_lock);
- return (0);
-}
-
-static void
-spa_read_history_init(spa_t *spa)
-{
- spa_history_list_t *shl = &spa->spa_stats.read_history;
- char *module;
-
- shl->size = 0;
-
- module = kmem_asprintf("zfs/%s", spa_name(spa));
-
- shl->procfs_list.pl_private = shl;
- procfs_list_install(module,
- "reads",
- 0600,
- &shl->procfs_list,
- spa_read_history_show,
- spa_read_history_show_header,
- spa_read_history_clear,
- offsetof(spa_read_history_t, srh_node));
-
- strfree(module);
-}
-
-static void
-spa_read_history_destroy(spa_t *spa)
-{
- spa_history_list_t *shl = &spa->spa_stats.read_history;
- procfs_list_uninstall(&shl->procfs_list);
- spa_read_history_truncate(shl, 0);
- procfs_list_destroy(&shl->procfs_list);
-}
-
-void
-spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags)
-{
- spa_history_list_t *shl = &spa->spa_stats.read_history;
- spa_read_history_t *srh;
-
- ASSERT3P(spa, !=, NULL);
- ASSERT3P(zb, !=, NULL);
-
- if (zfs_read_history == 0 && shl->size == 0)
- return;
-
- if (zfs_read_history_hits == 0 && (aflags & ARC_FLAG_CACHED))
- return;
-
- srh = kmem_zalloc(sizeof (spa_read_history_t), KM_SLEEP);
- strlcpy(srh->comm, getcomm(), sizeof (srh->comm));
- srh->start = gethrtime();
- srh->objset = zb->zb_objset;
- srh->object = zb->zb_object;
- srh->level = zb->zb_level;
- srh->blkid = zb->zb_blkid;
- srh->aflags = aflags;
- srh->pid = getpid();
-
- mutex_enter(&shl->procfs_list.pl_lock);
-
- procfs_list_add(&shl->procfs_list, srh);
- shl->size++;
-
- spa_read_history_truncate(shl, zfs_read_history);
-
- mutex_exit(&shl->procfs_list.pl_lock);
-}
-
-/*
- * ==========================================================================
- * SPA TXG History Routines
- * ==========================================================================
- */
-
-/*
- * Txg statistics - Information exported regarding each txg sync
- */
-
-typedef struct spa_txg_history {
- uint64_t txg; /* txg id */
- txg_state_t state; /* active txg state */
- uint64_t nread; /* number of bytes read */
- uint64_t nwritten; /* number of bytes written */
- uint64_t reads; /* number of read operations */
- uint64_t writes; /* number of write operations */
- uint64_t ndirty; /* number of dirty bytes */
- hrtime_t times[TXG_STATE_COMMITTED]; /* completion times */
- procfs_list_node_t sth_node;
-} spa_txg_history_t;
-
-static int
-spa_txg_history_show_header(struct seq_file *f)
-{
- seq_printf(f, "%-8s %-16s %-5s %-12s %-12s %-12s "
- "%-8s %-8s %-12s %-12s %-12s %-12s\n", "txg", "birth", "state",
- "ndirty", "nread", "nwritten", "reads", "writes",
- "otime", "qtime", "wtime", "stime");
- return (0);
-}
-
-static int
-spa_txg_history_show(struct seq_file *f, void *data)
-{
- spa_txg_history_t *sth = (spa_txg_history_t *)data;
- uint64_t open = 0, quiesce = 0, wait = 0, sync = 0;
- char state;
-
- switch (sth->state) {
- case TXG_STATE_BIRTH: state = 'B'; break;
- case TXG_STATE_OPEN: state = 'O'; break;
- case TXG_STATE_QUIESCED: state = 'Q'; break;
- case TXG_STATE_WAIT_FOR_SYNC: state = 'W'; break;
- case TXG_STATE_SYNCED: state = 'S'; break;
- case TXG_STATE_COMMITTED: state = 'C'; break;
- default: state = '?'; break;
- }
-
- if (sth->times[TXG_STATE_OPEN])
- open = sth->times[TXG_STATE_OPEN] -
- sth->times[TXG_STATE_BIRTH];
-
- if (sth->times[TXG_STATE_QUIESCED])
- quiesce = sth->times[TXG_STATE_QUIESCED] -
- sth->times[TXG_STATE_OPEN];
-
- if (sth->times[TXG_STATE_WAIT_FOR_SYNC])
- wait = sth->times[TXG_STATE_WAIT_FOR_SYNC] -
- sth->times[TXG_STATE_QUIESCED];
-
- if (sth->times[TXG_STATE_SYNCED])
- sync = sth->times[TXG_STATE_SYNCED] -
- sth->times[TXG_STATE_WAIT_FOR_SYNC];
-
- seq_printf(f, "%-8llu %-16llu %-5c %-12llu "
- "%-12llu %-12llu %-8llu %-8llu %-12llu %-12llu %-12llu %-12llu\n",
- (longlong_t)sth->txg, sth->times[TXG_STATE_BIRTH], state,
- (u_longlong_t)sth->ndirty,
- (u_longlong_t)sth->nread, (u_longlong_t)sth->nwritten,
- (u_longlong_t)sth->reads, (u_longlong_t)sth->writes,
- (u_longlong_t)open, (u_longlong_t)quiesce, (u_longlong_t)wait,
- (u_longlong_t)sync);
-
- return (0);
-}
-
-/* Remove oldest elements from list until there are no more than 'size' left */
-static void
-spa_txg_history_truncate(spa_history_list_t *shl, unsigned int size)
-{
- spa_txg_history_t *sth;
- while (shl->size > size) {
- sth = list_remove_head(&shl->procfs_list.pl_list);
- ASSERT3P(sth, !=, NULL);
- kmem_free(sth, sizeof (spa_txg_history_t));
- shl->size--;
- }
-
- if (size == 0)
- ASSERT(list_is_empty(&shl->procfs_list.pl_list));
-
-}
-
-static int
-spa_txg_history_clear(procfs_list_t *procfs_list)
-{
- spa_history_list_t *shl = procfs_list->pl_private;
- mutex_enter(&procfs_list->pl_lock);
- spa_txg_history_truncate(shl, 0);
- mutex_exit(&procfs_list->pl_lock);
- return (0);
-}
-
-static void
-spa_txg_history_init(spa_t *spa)
-{
- spa_history_list_t *shl = &spa->spa_stats.txg_history;
- char *module;
-
- shl->size = 0;
-
- module = kmem_asprintf("zfs/%s", spa_name(spa));
-
- shl->procfs_list.pl_private = shl;
- procfs_list_install(module,
- "txgs",
- 0644,
- &shl->procfs_list,
- spa_txg_history_show,
- spa_txg_history_show_header,
- spa_txg_history_clear,
- offsetof(spa_txg_history_t, sth_node));
-
- strfree(module);
-}
-
-static void
-spa_txg_history_destroy(spa_t *spa)
-{
- spa_history_list_t *shl = &spa->spa_stats.txg_history;
- procfs_list_uninstall(&shl->procfs_list);
- spa_txg_history_truncate(shl, 0);
- procfs_list_destroy(&shl->procfs_list);
-}
-
-/*
- * Add a new txg to historical record.
- */
-void
-spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time)
-{
- spa_history_list_t *shl = &spa->spa_stats.txg_history;
- spa_txg_history_t *sth;
-
- if (zfs_txg_history == 0 && shl->size == 0)
- return;
-
- sth = kmem_zalloc(sizeof (spa_txg_history_t), KM_SLEEP);
- sth->txg = txg;
- sth->state = TXG_STATE_OPEN;
- sth->times[TXG_STATE_BIRTH] = birth_time;
-
- mutex_enter(&shl->procfs_list.pl_lock);
- procfs_list_add(&shl->procfs_list, sth);
- shl->size++;
- spa_txg_history_truncate(shl, zfs_txg_history);
- mutex_exit(&shl->procfs_list.pl_lock);
-}
-
-/*
- * Set txg state completion time and increment current state.
- */
-int
-spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state,
- hrtime_t completed_time)
-{
- spa_history_list_t *shl = &spa->spa_stats.txg_history;
- spa_txg_history_t *sth;
- int error = ENOENT;
-
- if (zfs_txg_history == 0)
- return (0);
-
- mutex_enter(&shl->procfs_list.pl_lock);
- for (sth = list_tail(&shl->procfs_list.pl_list); sth != NULL;
- sth = list_prev(&shl->procfs_list.pl_list, sth)) {
- if (sth->txg == txg) {
- sth->times[completed_state] = completed_time;
- sth->state++;
- error = 0;
- break;
- }
- }
- mutex_exit(&shl->procfs_list.pl_lock);
-
- return (error);
-}
-
-/*
- * Set txg IO stats.
- */
-static int
-spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread,
- uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t ndirty)
-{
- spa_history_list_t *shl = &spa->spa_stats.txg_history;
- spa_txg_history_t *sth;
- int error = ENOENT;
-
- if (zfs_txg_history == 0)
- return (0);
-
- mutex_enter(&shl->procfs_list.pl_lock);
- for (sth = list_tail(&shl->procfs_list.pl_list); sth != NULL;
- sth = list_prev(&shl->procfs_list.pl_list, sth)) {
- if (sth->txg == txg) {
- sth->nread = nread;
- sth->nwritten = nwritten;
- sth->reads = reads;
- sth->writes = writes;
- sth->ndirty = ndirty;
- error = 0;
- break;
- }
- }
- mutex_exit(&shl->procfs_list.pl_lock);
-
- return (error);
-}
-
-txg_stat_t *
-spa_txg_history_init_io(spa_t *spa, uint64_t txg, dsl_pool_t *dp)
-{
- txg_stat_t *ts;
-
- if (zfs_txg_history == 0)
- return (NULL);
-
- ts = kmem_alloc(sizeof (txg_stat_t), KM_SLEEP);
-
- spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
- vdev_get_stats(spa->spa_root_vdev, &ts->vs1);
- spa_config_exit(spa, SCL_CONFIG, FTAG);
-
- ts->txg = txg;
- ts->ndirty = dp->dp_dirty_pertxg[txg & TXG_MASK];
-
- spa_txg_history_set(spa, txg, TXG_STATE_WAIT_FOR_SYNC, gethrtime());
-
- return (ts);
-}
-
-void
-spa_txg_history_fini_io(spa_t *spa, txg_stat_t *ts)
-{
- if (ts == NULL)
- return;
-
- if (zfs_txg_history == 0) {
- kmem_free(ts, sizeof (txg_stat_t));
- return;
- }
-
- spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
- vdev_get_stats(spa->spa_root_vdev, &ts->vs2);
- spa_config_exit(spa, SCL_CONFIG, FTAG);
-
- spa_txg_history_set(spa, ts->txg, TXG_STATE_SYNCED, gethrtime());
- spa_txg_history_set_io(spa, ts->txg,
- ts->vs2.vs_bytes[ZIO_TYPE_READ] - ts->vs1.vs_bytes[ZIO_TYPE_READ],
- ts->vs2.vs_bytes[ZIO_TYPE_WRITE] - ts->vs1.vs_bytes[ZIO_TYPE_WRITE],
- ts->vs2.vs_ops[ZIO_TYPE_READ] - ts->vs1.vs_ops[ZIO_TYPE_READ],
- ts->vs2.vs_ops[ZIO_TYPE_WRITE] - ts->vs1.vs_ops[ZIO_TYPE_WRITE],
- ts->ndirty);
-
- kmem_free(ts, sizeof (txg_stat_t));
-}
-
-/*
- * ==========================================================================
- * SPA TX Assign Histogram Routines
- * ==========================================================================
- */
-
-/*
- * Tx statistics - Information exported regarding dmu_tx_assign time.
- */
-
-/*
- * When the kstat is written zero all buckets. When the kstat is read
- * count the number of trailing buckets set to zero and update ks_ndata
- * such that they are not output.
- */
-static int
-spa_tx_assign_update(kstat_t *ksp, int rw)
-{
- spa_t *spa = ksp->ks_private;
- spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
- int i;
-
- if (rw == KSTAT_WRITE) {
- for (i = 0; i < shk->count; i++)
- ((kstat_named_t *)shk->private)[i].value.ui64 = 0;
- }
-
- for (i = shk->count; i > 0; i--)
- if (((kstat_named_t *)shk->private)[i-1].value.ui64 != 0)
- break;
-
- ksp->ks_ndata = i;
- ksp->ks_data_size = i * sizeof (kstat_named_t);
-
- return (0);
-}
-
-static void
-spa_tx_assign_init(spa_t *spa)
-{
- spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
- char *name;
- kstat_named_t *ks;
- kstat_t *ksp;
- int i;
-
- mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);
-
- shk->count = 42; /* power of two buckets for 1ns to 2,199s */
- shk->size = shk->count * sizeof (kstat_named_t);
- shk->private = kmem_alloc(shk->size, KM_SLEEP);
-
- name = kmem_asprintf("zfs/%s", spa_name(spa));
-
- for (i = 0; i < shk->count; i++) {
- ks = &((kstat_named_t *)shk->private)[i];
- ks->data_type = KSTAT_DATA_UINT64;
- ks->value.ui64 = 0;
- (void) snprintf(ks->name, KSTAT_STRLEN, "%llu ns",
- (u_longlong_t)1 << i);
- }
-
- ksp = kstat_create(name, 0, "dmu_tx_assign", "misc",
- KSTAT_TYPE_NAMED, 0, KSTAT_FLAG_VIRTUAL);
- shk->kstat = ksp;
-
- if (ksp) {
- ksp->ks_lock = &shk->lock;
- ksp->ks_data = shk->private;
- ksp->ks_ndata = shk->count;
- ksp->ks_data_size = shk->size;
- ksp->ks_private = spa;
- ksp->ks_update = spa_tx_assign_update;
- kstat_install(ksp);
- }
- strfree(name);
-}
-
-static void
-spa_tx_assign_destroy(spa_t *spa)
-{
- spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
- kstat_t *ksp;
-
- ksp = shk->kstat;
- if (ksp)
- kstat_delete(ksp);
-
- kmem_free(shk->private, shk->size);
- mutex_destroy(&shk->lock);
-}
-
-void
-spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs)
-{
- spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
- uint64_t idx = 0;
-
- while (((1ULL << idx) < nsecs) && (idx < shk->size - 1))
- idx++;
-
- atomic_inc_64(&((kstat_named_t *)shk->private)[idx].value.ui64);
-}
-
-/*
- * ==========================================================================
- * SPA IO History Routines
- * ==========================================================================
- */
-static int
-spa_io_history_update(kstat_t *ksp, int rw)
-{
- if (rw == KSTAT_WRITE)
- memset(ksp->ks_data, 0, ksp->ks_data_size);
-
- return (0);
-}
-
-static void
-spa_io_history_init(spa_t *spa)
-{
- spa_history_kstat_t *shk = &spa->spa_stats.io_history;
- char *name;
- kstat_t *ksp;
-
- mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);
-
- name = kmem_asprintf("zfs/%s", spa_name(spa));
-
- ksp = kstat_create(name, 0, "io", "disk", KSTAT_TYPE_IO, 1, 0);
- shk->kstat = ksp;
-
- if (ksp) {
- ksp->ks_lock = &shk->lock;
- ksp->ks_private = spa;
- ksp->ks_update = spa_io_history_update;
- kstat_install(ksp);
- }
- strfree(name);
-}
-
-static void
-spa_io_history_destroy(spa_t *spa)
-{
- spa_history_kstat_t *shk = &spa->spa_stats.io_history;
-
- if (shk->kstat)
- kstat_delete(shk->kstat);
-
- mutex_destroy(&shk->lock);
-}
-
-/*
- * ==========================================================================
- * SPA MMP History Routines
- * ==========================================================================
- */
-
-/*
- * MMP statistics - Information exported regarding attempted MMP writes
- * For MMP writes issued, fields used as per comments below.
- * For MMP writes skipped, an entry represents a span of time when
- * writes were skipped for same reason (error from mmp_random_leaf).
- * Differences are:
- * timestamp time first write skipped, if >1 skipped in a row
- * mmp_delay delay value at timestamp
- * vdev_guid number of writes skipped
- * io_error one of enum mmp_error
- * duration time span (ns) of skipped writes
- */
-
-typedef struct spa_mmp_history {
- uint64_t mmp_node_id; /* unique # for updates */
- uint64_t txg; /* txg of last sync */
- uint64_t timestamp; /* UTC time MMP write issued */
- uint64_t mmp_delay; /* mmp_thread.mmp_delay at timestamp */
- uint64_t vdev_guid; /* unique ID of leaf vdev */
- char *vdev_path;
- int vdev_label; /* vdev label */
- int io_error; /* error status of MMP write */
- hrtime_t error_start; /* hrtime of start of error period */
- hrtime_t duration; /* time from submission to completion */
- procfs_list_node_t smh_node;
-} spa_mmp_history_t;
-
-static int
-spa_mmp_history_show_header(struct seq_file *f)
-{
- seq_printf(f, "%-10s %-10s %-10s %-6s %-10s %-12s %-24s "
- "%-10s %s\n", "id", "txg", "timestamp", "error", "duration",
- "mmp_delay", "vdev_guid", "vdev_label", "vdev_path");
- return (0);
-}
-
-static int
-spa_mmp_history_show(struct seq_file *f, void *data)
-{
- spa_mmp_history_t *smh = (spa_mmp_history_t *)data;
- char skip_fmt[] = "%-10llu %-10llu %10llu %#6llx %10lld %12llu %-24llu "
- "%-10lld %s\n";
- char write_fmt[] = "%-10llu %-10llu %10llu %6lld %10lld %12llu %-24llu "
- "%-10lld %s\n";
-
- seq_printf(f, (smh->error_start ? skip_fmt : write_fmt),
- (u_longlong_t)smh->mmp_node_id, (u_longlong_t)smh->txg,
- (u_longlong_t)smh->timestamp, (longlong_t)smh->io_error,
- (longlong_t)smh->duration, (u_longlong_t)smh->mmp_delay,
- (u_longlong_t)smh->vdev_guid, (u_longlong_t)smh->vdev_label,
- (smh->vdev_path ? smh->vdev_path : "-"));
-
- return (0);
-}
-
-/* Remove oldest elements from list until there are no more than 'size' left */
-static void
-spa_mmp_history_truncate(spa_history_list_t *shl, unsigned int size)
-{
- spa_mmp_history_t *smh;
- while (shl->size > size) {
- smh = list_remove_head(&shl->procfs_list.pl_list);
- if (smh->vdev_path)
- strfree(smh->vdev_path);
- kmem_free(smh, sizeof (spa_mmp_history_t));
- shl->size--;
- }
-
- if (size == 0)
- ASSERT(list_is_empty(&shl->procfs_list.pl_list));
-
-}
-
-static int
-spa_mmp_history_clear(procfs_list_t *procfs_list)
-{
- spa_history_list_t *shl = procfs_list->pl_private;
- mutex_enter(&procfs_list->pl_lock);
- spa_mmp_history_truncate(shl, 0);
- mutex_exit(&procfs_list->pl_lock);
- return (0);
-}
-
-static void
-spa_mmp_history_init(spa_t *spa)
-{
- spa_history_list_t *shl = &spa->spa_stats.mmp_history;
- char *module;
-
- shl->size = 0;
-
- module = kmem_asprintf("zfs/%s", spa_name(spa));
-
- shl->procfs_list.pl_private = shl;
- procfs_list_install(module,
- "multihost",
- 0644,
- &shl->procfs_list,
- spa_mmp_history_show,
- spa_mmp_history_show_header,
- spa_mmp_history_clear,
- offsetof(spa_mmp_history_t, smh_node));
-
- strfree(module);
-}
-
-static void
-spa_mmp_history_destroy(spa_t *spa)
-{
- spa_history_list_t *shl = &spa->spa_stats.mmp_history;
- procfs_list_uninstall(&shl->procfs_list);
- spa_mmp_history_truncate(shl, 0);
- procfs_list_destroy(&shl->procfs_list);
-}
-
-/*
- * Set duration in existing "skip" record to how long we have waited for a leaf
- * vdev to become available.
- *
- * Important that we start search at the tail of the list where new
- * records are inserted, so this is normally an O(1) operation.
- */
-int
-spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_node_id)
-{
- spa_history_list_t *shl = &spa->spa_stats.mmp_history;
- spa_mmp_history_t *smh;
- int error = ENOENT;
-
- if (zfs_multihost_history == 0 && shl->size == 0)
- return (0);
-
- mutex_enter(&shl->procfs_list.pl_lock);
- for (smh = list_tail(&shl->procfs_list.pl_list); smh != NULL;
- smh = list_prev(&shl->procfs_list.pl_list, smh)) {
- if (smh->mmp_node_id == mmp_node_id) {
- ASSERT3U(smh->io_error, !=, 0);
- smh->duration = gethrtime() - smh->error_start;
- smh->vdev_guid++;
- error = 0;
- break;
- }
- }
- mutex_exit(&shl->procfs_list.pl_lock);
-
- return (error);
-}
-
-/*
- * Set MMP write duration and error status in existing record.
- * See comment re: search order above spa_mmp_history_set_skip().
- */
-int
-spa_mmp_history_set(spa_t *spa, uint64_t mmp_node_id, int io_error,
- hrtime_t duration)
-{
- spa_history_list_t *shl = &spa->spa_stats.mmp_history;
- spa_mmp_history_t *smh;
- int error = ENOENT;
-
- if (zfs_multihost_history == 0 && shl->size == 0)
- return (0);
-
- mutex_enter(&shl->procfs_list.pl_lock);
- for (smh = list_tail(&shl->procfs_list.pl_list); smh != NULL;
- smh = list_prev(&shl->procfs_list.pl_list, smh)) {
- if (smh->mmp_node_id == mmp_node_id) {
- ASSERT(smh->io_error == 0);
- smh->io_error = io_error;
- smh->duration = duration;
- error = 0;
- break;
- }
- }
- mutex_exit(&shl->procfs_list.pl_lock);
-
- return (error);
-}
-
-/*
- * Add a new MMP historical record.
- * error == 0 : a write was issued.
- * error != 0 : a write was not issued because no leaves were found.
- */
-void
-spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp,
- uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_node_id,
- int error)
-{
- spa_history_list_t *shl = &spa->spa_stats.mmp_history;
- spa_mmp_history_t *smh;
-
- if (zfs_multihost_history == 0 && shl->size == 0)
- return;
-
- smh = kmem_zalloc(sizeof (spa_mmp_history_t), KM_SLEEP);
- smh->txg = txg;
- smh->timestamp = timestamp;
- smh->mmp_delay = mmp_delay;
- if (vd) {
- smh->vdev_guid = vd->vdev_guid;
- if (vd->vdev_path)
- smh->vdev_path = strdup(vd->vdev_path);
- }
- smh->vdev_label = label;
- smh->mmp_node_id = mmp_node_id;
-
- if (error) {
- smh->io_error = error;
- smh->error_start = gethrtime();
- smh->vdev_guid = 1;
- }
-
- mutex_enter(&shl->procfs_list.pl_lock);
- procfs_list_add(&shl->procfs_list, smh);
- shl->size++;
- spa_mmp_history_truncate(shl, zfs_multihost_history);
- mutex_exit(&shl->procfs_list.pl_lock);
-}
-
-static void *
-spa_state_addr(kstat_t *ksp, loff_t n)
-{
- return (ksp->ks_private); /* return the spa_t */
-}
-
-static int
-spa_state_data(char *buf, size_t size, void *data)
-{
- spa_t *spa = (spa_t *)data;
- (void) snprintf(buf, size, "%s\n", spa_state_to_name(spa));
- return (0);
-}
-
-/*
- * Return the state of the pool in /proc/spl/kstat/zfs/<pool>/state.
- *
- * This is a lock-less read of the pool's state (unlike using 'zpool', which
- * can potentially block for seconds). Because it doesn't block, it can useful
- * as a pool heartbeat value.
- */
-static void
-spa_state_init(spa_t *spa)
-{
- spa_history_kstat_t *shk = &spa->spa_stats.state;
- char *name;
- kstat_t *ksp;
-
- mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);
-
- name = kmem_asprintf("zfs/%s", spa_name(spa));
- ksp = kstat_create(name, 0, "state", "misc",
- KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
-
- shk->kstat = ksp;
- if (ksp) {
- ksp->ks_lock = &shk->lock;
- ksp->ks_data = NULL;
- ksp->ks_private = spa;
- ksp->ks_flags |= KSTAT_FLAG_NO_HEADERS;
- kstat_set_raw_ops(ksp, NULL, spa_state_data, spa_state_addr);
- kstat_install(ksp);
- }
-
- strfree(name);
-}
-
-static void
-spa_health_destroy(spa_t *spa)
-{
- spa_history_kstat_t *shk = &spa->spa_stats.state;
- kstat_t *ksp = shk->kstat;
- if (ksp)
- kstat_delete(ksp);
-
- mutex_destroy(&shk->lock);
-}
-
-static spa_iostats_t spa_iostats_template = {
- { "trim_extents_written", KSTAT_DATA_UINT64 },
- { "trim_bytes_written", KSTAT_DATA_UINT64 },
- { "trim_extents_skipped", KSTAT_DATA_UINT64 },
- { "trim_bytes_skipped", KSTAT_DATA_UINT64 },
- { "trim_extents_failed", KSTAT_DATA_UINT64 },
- { "trim_bytes_failed", KSTAT_DATA_UINT64 },
- { "autotrim_extents_written", KSTAT_DATA_UINT64 },
- { "autotrim_bytes_written", KSTAT_DATA_UINT64 },
- { "autotrim_extents_skipped", KSTAT_DATA_UINT64 },
- { "autotrim_bytes_skipped", KSTAT_DATA_UINT64 },
- { "autotrim_extents_failed", KSTAT_DATA_UINT64 },
- { "autotrim_bytes_failed", KSTAT_DATA_UINT64 },
-};
-
-#define SPA_IOSTATS_ADD(stat, val) \
- atomic_add_64(&iostats->stat.value.ui64, (val));
-
-void
-spa_iostats_trim_add(spa_t *spa, trim_type_t type,
- uint64_t extents_written, uint64_t bytes_written,
- uint64_t extents_skipped, uint64_t bytes_skipped,
- uint64_t extents_failed, uint64_t bytes_failed)
-{
- spa_history_kstat_t *shk = &spa->spa_stats.iostats;
- kstat_t *ksp = shk->kstat;
- spa_iostats_t *iostats;
-
- if (ksp == NULL)
- return;
-
- iostats = ksp->ks_data;
- if (type == TRIM_TYPE_MANUAL) {
- SPA_IOSTATS_ADD(trim_extents_written, extents_written);
- SPA_IOSTATS_ADD(trim_bytes_written, bytes_written);
- SPA_IOSTATS_ADD(trim_extents_skipped, extents_skipped);
- SPA_IOSTATS_ADD(trim_bytes_skipped, bytes_skipped);
- SPA_IOSTATS_ADD(trim_extents_failed, extents_failed);
- SPA_IOSTATS_ADD(trim_bytes_failed, bytes_failed);
- } else {
- SPA_IOSTATS_ADD(autotrim_extents_written, extents_written);
- SPA_IOSTATS_ADD(autotrim_bytes_written, bytes_written);
- SPA_IOSTATS_ADD(autotrim_extents_skipped, extents_skipped);
- SPA_IOSTATS_ADD(autotrim_bytes_skipped, bytes_skipped);
- SPA_IOSTATS_ADD(autotrim_extents_failed, extents_failed);
- SPA_IOSTATS_ADD(autotrim_bytes_failed, bytes_failed);
- }
-}
-
-int
-spa_iostats_update(kstat_t *ksp, int rw)
-{
- if (rw == KSTAT_WRITE) {
- memcpy(ksp->ks_data, &spa_iostats_template,
- sizeof (spa_iostats_t));
- }
-
- return (0);
-}
-
-static void
-spa_iostats_init(spa_t *spa)
-{
- spa_history_kstat_t *shk = &spa->spa_stats.iostats;
-
- mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);
-
- char *name = kmem_asprintf("zfs/%s", spa_name(spa));
- kstat_t *ksp = kstat_create(name, 0, "iostats", "misc",
- KSTAT_TYPE_NAMED, sizeof (spa_iostats_t) / sizeof (kstat_named_t),
- KSTAT_FLAG_VIRTUAL);
-
- shk->kstat = ksp;
- if (ksp) {
- int size = sizeof (spa_iostats_t);
- ksp->ks_lock = &shk->lock;
- ksp->ks_private = spa;
- ksp->ks_update = spa_iostats_update;
- ksp->ks_data = kmem_alloc(size, KM_SLEEP);
- memcpy(ksp->ks_data, &spa_iostats_template, size);
- kstat_install(ksp);
- }
-
- strfree(name);
-}
-
-static void
-spa_iostats_destroy(spa_t *spa)
-{
- spa_history_kstat_t *shk = &spa->spa_stats.iostats;
- kstat_t *ksp = shk->kstat;
- if (ksp) {
- kmem_free(ksp->ks_data, sizeof (spa_iostats_t));
- kstat_delete(ksp);
- }
-
- mutex_destroy(&shk->lock);
-}
-
-void
-spa_stats_init(spa_t *spa)
-{
- spa_read_history_init(spa);
- spa_txg_history_init(spa);
- spa_tx_assign_init(spa);
- spa_io_history_init(spa);
- spa_mmp_history_init(spa);
- spa_state_init(spa);
- spa_iostats_init(spa);
-}
-
-void
-spa_stats_destroy(spa_t *spa)
-{
- spa_iostats_destroy(spa);
- spa_health_destroy(spa);
- spa_tx_assign_destroy(spa);
- spa_txg_history_destroy(spa);
- spa_read_history_destroy(spa);
- spa_io_history_destroy(spa);
- spa_mmp_history_destroy(spa);
-}
-
-#if defined(_KERNEL)
-/* CSTYLED */
-module_param(zfs_read_history, int, 0644);
-MODULE_PARM_DESC(zfs_read_history,
- "Historical statistics for the last N reads");
-
-module_param(zfs_read_history_hits, int, 0644);
-MODULE_PARM_DESC(zfs_read_history_hits,
- "Include cache hits in read history");
-
-module_param(zfs_txg_history, int, 0644);
-MODULE_PARM_DESC(zfs_txg_history,
- "Historical statistics for the last N txgs");
-
-module_param(zfs_multihost_history, int, 0644);
-MODULE_PARM_DESC(zfs_multihost_history,
- "Historical statistics for last N multihost writes");
-/* END CSTYLED */
-#endif
diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c
deleted file mode 100644
index 21f9ae454..000000000
--- a/module/zfs/vdev_disk.c
+++ /dev/null
@@ -1,954 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
- * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
- * Rewritten for Linux by Brian Behlendorf <[email protected]>.
- * LLNL-CODE-403049.
- * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/spa_impl.h>
-#include <sys/vdev_disk.h>
-#include <sys/vdev_impl.h>
-#include <sys/vdev_trim.h>
-#include <sys/abd.h>
-#include <sys/fs/zfs.h>
-#include <sys/zio.h>
-#include <linux/msdos_fs.h>
-#include <linux/vfs_compat.h>
-
-char *zfs_vdev_scheduler = VDEV_SCHEDULER;
-static void *zfs_vdev_holder = VDEV_HOLDER;
-
-/* size of the "reserved" partition, in blocks */
-#define EFI_MIN_RESV_SIZE (16 * 1024)
-
-/*
- * Virtual device vector for disks.
- */
-typedef struct dio_request {
- zio_t *dr_zio; /* Parent ZIO */
- atomic_t dr_ref; /* References */
- int dr_error; /* Bio error */
- int dr_bio_count; /* Count of bio's */
- struct bio *dr_bio[0]; /* Attached bio's */
-} dio_request_t;
-
-
-#if defined(HAVE_OPEN_BDEV_EXCLUSIVE) || defined(HAVE_BLKDEV_GET_BY_PATH)
-static fmode_t
-vdev_bdev_mode(int smode)
-{
- fmode_t mode = 0;
-
- ASSERT3S(smode & (FREAD | FWRITE), !=, 0);
-
- if (smode & FREAD)
- mode |= FMODE_READ;
-
- if (smode & FWRITE)
- mode |= FMODE_WRITE;
-
- return (mode);
-}
-#else
-static int
-vdev_bdev_mode(int smode)
-{
- int mode = 0;
-
- ASSERT3S(smode & (FREAD | FWRITE), !=, 0);
-
- if ((smode & FREAD) && !(smode & FWRITE))
- mode = SB_RDONLY;
-
- return (mode);
-}
-#endif /* HAVE_OPEN_BDEV_EXCLUSIVE */
-
-/*
- * Returns the usable capacity (in bytes) for the partition or disk.
- */
-static uint64_t
-bdev_capacity(struct block_device *bdev)
-{
- return (i_size_read(bdev->bd_inode));
-}
-
-/*
- * Returns the maximum expansion capacity of the block device (in bytes).
- *
- * It is possible to expand a vdev when it has been created as a wholedisk
- * and the containing block device has increased in capacity. Or when the
- * partition containing the pool has been manually increased in size.
- *
- * This function is only responsible for calculating the potential expansion
- * size so it can be reported by 'zpool list'. The efi_use_whole_disk() is
- * responsible for verifying the expected partition layout in the wholedisk
- * case, and updating the partition table if appropriate. Once the partition
- * size has been increased the additional capacity will be visible using
- * bdev_capacity().
- *
- * The returned maximum expansion capacity is always expected to be larger, or
- * at the very least equal, to its usable capacity to prevent overestimating
- * the pool expandsize.
- */
-static uint64_t
-bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
-{
- uint64_t psize;
- int64_t available;
-
- if (wholedisk && bdev->bd_part != NULL && bdev != bdev->bd_contains) {
- /*
- * When reporting maximum expansion capacity for a wholedisk
- * deduct any capacity which is expected to be lost due to
- * alignment restrictions. Over reporting this value isn't
- * harmful and would only result in slightly less capacity
- * than expected post expansion.
- * The estimated available space may be slightly smaller than
- * bdev_capacity() for devices where the number of sectors is
- * not a multiple of the alignment size and the partition layout
- * is keeping less than PARTITION_END_ALIGNMENT bytes after the
- * "reserved" EFI partition: in such cases return the device
- * usable capacity.
- */
- available = i_size_read(bdev->bd_contains->bd_inode) -
- ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
- PARTITION_END_ALIGNMENT) << SECTOR_BITS);
- psize = MAX(available, bdev_capacity(bdev));
- } else {
- psize = bdev_capacity(bdev);
- }
-
- return (psize);
-}
-
-static void
-vdev_disk_error(zio_t *zio)
-{
- /*
- * This function can be called in interrupt context, for instance while
- * handling IRQs coming from a misbehaving disk device; use printk()
- * which is safe from any context.
- */
- printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d "
- "offset=%llu size=%llu flags=%x\n", spa_name(zio->io_spa),
- zio->io_vd->vdev_path, zio->io_error, zio->io_type,
- (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
- zio->io_flags);
-}
-
-/*
- * Use the Linux 'noop' elevator for zfs managed block devices. This
- * strikes the ideal balance by allowing the zfs elevator to do all
- * request ordering and prioritization. While allowing the Linux
- * elevator to do the maximum front/back merging allowed by the
- * physical device. This yields the largest possible requests for
- * the device with the lowest total overhead.
- */
-static void
-vdev_elevator_switch(vdev_t *v, char *elevator)
-{
- vdev_disk_t *vd = v->vdev_tsd;
- struct request_queue *q;
- char *device;
- int error;
-
- for (int c = 0; c < v->vdev_children; c++)
- vdev_elevator_switch(v->vdev_child[c], elevator);
-
- if (!v->vdev_ops->vdev_op_leaf || vd->vd_bdev == NULL)
- return;
-
- q = bdev_get_queue(vd->vd_bdev);
- device = vd->vd_bdev->bd_disk->disk_name;
-
- /*
- * Skip devices which are not whole disks (partitions).
- * Device-mapper devices are excepted since they may be whole
- * disks despite the vdev_wholedisk flag, in which case we can
- * and should switch the elevator. If the device-mapper device
- * does not have an elevator (i.e. dm-raid, dm-crypt, etc.) the
- * "Skip devices without schedulers" check below will fail.
- */
- if (!v->vdev_wholedisk && strncmp(device, "dm-", 3) != 0)
- return;
-
- /* Leave existing scheduler when set to "none" */
- if ((strncmp(elevator, "none", 4) == 0) && (strlen(elevator) == 4))
- return;
-
- /*
- * The elevator_change() function was available in kernels from
- * 2.6.36 to 4.11. When not available fall back to using the user
- * mode helper functionality to set the elevator via sysfs. This
- * requires /bin/echo and sysfs to be mounted which may not be true
- * early in the boot process.
- */
-#ifdef HAVE_ELEVATOR_CHANGE
- error = elevator_change(q, elevator);
-#else
-#define SET_SCHEDULER_CMD \
- "exec 0</dev/null " \
- " 1>/sys/block/%s/queue/scheduler " \
- " 2>/dev/null; " \
- "echo %s"
-
- char *argv[] = { "/bin/sh", "-c", NULL, NULL };
- char *envp[] = { NULL };
-
- argv[2] = kmem_asprintf(SET_SCHEDULER_CMD, device, elevator);
- error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
- strfree(argv[2]);
-#endif /* HAVE_ELEVATOR_CHANGE */
- if (error) {
- zfs_dbgmsg("Unable to set \"%s\" scheduler for %s (%s): %d",
- elevator, v->vdev_path, device, error);
- }
-}
-
-static int
-vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
- uint64_t *ashift)
-{
- struct block_device *bdev;
- fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa));
- int count = 0, block_size;
- int bdev_retry_count = 50;
- vdev_disk_t *vd;
-
- /* Must have a pathname and it must be absolute. */
- if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
- v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
- vdev_dbgmsg(v, "invalid vdev_path");
- return (SET_ERROR(EINVAL));
- }
-
- /*
- * Reopen the device if it is currently open. When expanding a
- * partition force re-scanning the partition table while closed
- * in order to get an accurate updated block device size. Then
- * since udev may need to recreate the device links increase the
- * open retry count before reporting the device as unavailable.
- */
- vd = v->vdev_tsd;
- if (vd) {
- char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
- boolean_t reread_part = B_FALSE;
-
- rw_enter(&vd->vd_lock, RW_WRITER);
- bdev = vd->vd_bdev;
- vd->vd_bdev = NULL;
-
- if (bdev) {
- if (v->vdev_expanding && bdev != bdev->bd_contains) {
- bdevname(bdev->bd_contains, disk_name + 5);
- reread_part = B_TRUE;
- }
-
- vdev_bdev_close(bdev, mode);
- }
-
- if (reread_part) {
- bdev = vdev_bdev_open(disk_name, mode, zfs_vdev_holder);
- if (!IS_ERR(bdev)) {
- int error = vdev_bdev_reread_part(bdev);
- vdev_bdev_close(bdev, mode);
- if (error == 0)
- bdev_retry_count = 100;
- }
- }
- } else {
- vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
-
- rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
- rw_enter(&vd->vd_lock, RW_WRITER);
- }
-
- /*
- * Devices are always opened by the path provided at configuration
- * time. This means that if the provided path is a udev by-id path
- * then drives may be re-cabled without an issue. If the provided
- * path is a udev by-path path, then the physical location information
- * will be preserved. This can be critical for more complicated
- * configurations where drives are located in specific physical
- * locations to maximize the systems tolerance to component failure.
- *
- * Alternatively, you can provide your own udev rule to flexibly map
- * the drives as you see fit. It is not advised that you use the
- * /dev/[hd]d devices which may be reordered due to probing order.
- * Devices in the wrong locations will be detected by the higher
- * level vdev validation.
- *
- * The specified paths may be briefly removed and recreated in
- * response to udev events. This should be exceptionally unlikely
- * because the zpool command makes every effort to verify these paths
- * have already settled prior to reaching this point. Therefore,
- * a ENOENT failure at this point is highly likely to be transient
- * and it is reasonable to sleep and retry before giving up. In
- * practice delays have been observed to be on the order of 100ms.
- */
- bdev = ERR_PTR(-ENXIO);
- while (IS_ERR(bdev) && count < bdev_retry_count) {
- bdev = vdev_bdev_open(v->vdev_path, mode, zfs_vdev_holder);
- if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
- schedule_timeout(MSEC_TO_TICK(10));
- count++;
- } else if (IS_ERR(bdev)) {
- break;
- }
- }
-
- if (IS_ERR(bdev)) {
- int error = -PTR_ERR(bdev);
- vdev_dbgmsg(v, "open error=%d count=%d", error, count);
- vd->vd_bdev = NULL;
- v->vdev_tsd = vd;
- rw_exit(&vd->vd_lock);
- return (SET_ERROR(error));
- } else {
- vd->vd_bdev = bdev;
- v->vdev_tsd = vd;
- rw_exit(&vd->vd_lock);
- }
-
- struct request_queue *q = bdev_get_queue(vd->vd_bdev);
-
- /* Determine the physical block size */
- block_size = vdev_bdev_block_size(vd->vd_bdev);
-
- /* Clear the nowritecache bit, causes vdev_reopen() to try again. */
- v->vdev_nowritecache = B_FALSE;
-
- /* Set when device reports it supports TRIM. */
- v->vdev_has_trim = !!blk_queue_discard(q);
-
- /* Set when device reports it supports secure TRIM. */
- v->vdev_has_securetrim = !!blk_queue_discard_secure(q);
-
- /* Inform the ZIO pipeline that we are non-rotational */
- v->vdev_nonrot = blk_queue_nonrot(q);
-
- /* Physical volume size in bytes for the partition */
- *psize = bdev_capacity(vd->vd_bdev);
-
- /* Physical volume size in bytes including possible expansion space */
- *max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk);
-
- /* Based on the minimum sector size set the block size */
- *ashift = highbit64(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;
-
- /* Try to set the io scheduler elevator algorithm */
- (void) vdev_elevator_switch(v, zfs_vdev_scheduler);
-
- return (0);
-}
-
-static void
-vdev_disk_close(vdev_t *v)
-{
- vdev_disk_t *vd = v->vdev_tsd;
-
- if (v->vdev_reopening || vd == NULL)
- return;
-
- if (vd->vd_bdev != NULL) {
- vdev_bdev_close(vd->vd_bdev,
- vdev_bdev_mode(spa_mode(v->vdev_spa)));
- }
-
- rw_destroy(&vd->vd_lock);
- kmem_free(vd, sizeof (vdev_disk_t));
- v->vdev_tsd = NULL;
-}
-
-static dio_request_t *
-vdev_disk_dio_alloc(int bio_count)
-{
- dio_request_t *dr;
- int i;
-
- dr = kmem_zalloc(sizeof (dio_request_t) +
- sizeof (struct bio *) * bio_count, KM_SLEEP);
- if (dr) {
- atomic_set(&dr->dr_ref, 0);
- dr->dr_bio_count = bio_count;
- dr->dr_error = 0;
-
- for (i = 0; i < dr->dr_bio_count; i++)
- dr->dr_bio[i] = NULL;
- }
-
- return (dr);
-}
-
-static void
-vdev_disk_dio_free(dio_request_t *dr)
-{
- int i;
-
- for (i = 0; i < dr->dr_bio_count; i++)
- if (dr->dr_bio[i])
- bio_put(dr->dr_bio[i]);
-
- kmem_free(dr, sizeof (dio_request_t) +
- sizeof (struct bio *) * dr->dr_bio_count);
-}
-
-static void
-vdev_disk_dio_get(dio_request_t *dr)
-{
- atomic_inc(&dr->dr_ref);
-}
-
-static int
-vdev_disk_dio_put(dio_request_t *dr)
-{
- int rc = atomic_dec_return(&dr->dr_ref);
-
- /*
- * Free the dio_request when the last reference is dropped and
- * ensure zio_interpret is called only once with the correct zio
- */
- if (rc == 0) {
- zio_t *zio = dr->dr_zio;
- int error = dr->dr_error;
-
- vdev_disk_dio_free(dr);
-
- if (zio) {
- zio->io_error = error;
- ASSERT3S(zio->io_error, >=, 0);
- if (zio->io_error)
- vdev_disk_error(zio);
-
- zio_delay_interrupt(zio);
- }
- }
-
- return (rc);
-}
-
-BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
-{
- dio_request_t *dr = bio->bi_private;
- int rc;
-
- if (dr->dr_error == 0) {
-#ifdef HAVE_1ARG_BIO_END_IO_T
- dr->dr_error = BIO_END_IO_ERROR(bio);
-#else
- if (error)
- dr->dr_error = -(error);
- else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
- dr->dr_error = EIO;
-#endif
- }
-
- /* Drop reference acquired by __vdev_disk_physio */
- rc = vdev_disk_dio_put(dr);
-}
-
-static unsigned int
-bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size)
-{
- unsigned int offset, size, i;
- struct page *page;
-
- offset = offset_in_page(bio_ptr);
- for (i = 0; i < bio->bi_max_vecs; i++) {
- size = PAGE_SIZE - offset;
-
- if (bio_size <= 0)
- break;
-
- if (size > bio_size)
- size = bio_size;
-
- if (is_vmalloc_addr(bio_ptr))
- page = vmalloc_to_page(bio_ptr);
- else
- page = virt_to_page(bio_ptr);
-
- /*
- * Some network related block device uses tcp_sendpage, which
- * doesn't behave well when using 0-count page, this is a
- * safety net to catch them.
- */
- ASSERT3S(page_count(page), >, 0);
-
- if (bio_add_page(bio, page, size, offset) != size)
- break;
-
- bio_ptr += size;
- bio_size -= size;
- offset = 0;
- }
-
- return (bio_size);
-}
-
-static unsigned int
-bio_map_abd_off(struct bio *bio, abd_t *abd, unsigned int size, size_t off)
-{
- if (abd_is_linear(abd))
- return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, size));
-
- return (abd_scatter_bio_map_off(bio, abd, size, off));
-}
-
-static inline void
-vdev_submit_bio_impl(struct bio *bio)
-{
-#ifdef HAVE_1ARG_SUBMIT_BIO
- submit_bio(bio);
-#else
- submit_bio(0, bio);
-#endif
-}
-
-#ifdef HAVE_BIO_SET_DEV
-#if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY)
-/*
- * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the
- * GPL-only bio_associate_blkg() symbol thus inadvertently converting
- * the entire macro. Provide a minimal version which always assigns the
- * request queue's root_blkg to the bio.
- */
-static inline void
-vdev_bio_associate_blkg(struct bio *bio)
-{
- struct request_queue *q = bio->bi_disk->queue;
-
- ASSERT3P(q, !=, NULL);
- ASSERT3P(bio->bi_blkg, ==, NULL);
-
- if (blkg_tryget(q->root_blkg))
- bio->bi_blkg = q->root_blkg;
-}
-#define bio_associate_blkg vdev_bio_associate_blkg
-#endif
-#else
-/*
- * Provide a bio_set_dev() helper macro for pre-Linux 4.14 kernels.
- */
-static inline void
-bio_set_dev(struct bio *bio, struct block_device *bdev)
-{
- bio->bi_bdev = bdev;
-}
-#endif /* HAVE_BIO_SET_DEV */
-
-static inline void
-vdev_submit_bio(struct bio *bio)
-{
-#ifdef HAVE_CURRENT_BIO_TAIL
- struct bio **bio_tail = current->bio_tail;
- current->bio_tail = NULL;
- vdev_submit_bio_impl(bio);
- current->bio_tail = bio_tail;
-#else
- struct bio_list *bio_list = current->bio_list;
- current->bio_list = NULL;
- vdev_submit_bio_impl(bio);
- current->bio_list = bio_list;
-#endif
-}
-
-static int
-__vdev_disk_physio(struct block_device *bdev, zio_t *zio,
- size_t io_size, uint64_t io_offset, int rw, int flags)
-{
- dio_request_t *dr;
- uint64_t abd_offset;
- uint64_t bio_offset;
- int bio_size, bio_count = 16;
- int i = 0, error = 0;
-#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
- struct blk_plug plug;
-#endif
- /*
- * Accessing outside the block device is never allowed.
- */
- if (io_offset + io_size > bdev->bd_inode->i_size) {
- vdev_dbgmsg(zio->io_vd,
- "Illegal access %llu size %llu, device size %llu",
- io_offset, io_size, i_size_read(bdev->bd_inode));
- return (SET_ERROR(EIO));
- }
-
-retry:
- dr = vdev_disk_dio_alloc(bio_count);
- if (dr == NULL)
- return (SET_ERROR(ENOMEM));
-
- if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
- bio_set_flags_failfast(bdev, &flags);
-
- dr->dr_zio = zio;
-
- /*
- * When the IO size exceeds the maximum bio size for the request
- * queue we are forced to break the IO in multiple bio's and wait
- * for them all to complete. Ideally, all pool users will set
- * their volume block size to match the maximum request size and
- * the common case will be one bio per vdev IO request.
- */
-
- abd_offset = 0;
- bio_offset = io_offset;
- bio_size = io_size;
- for (i = 0; i <= dr->dr_bio_count; i++) {
-
- /* Finished constructing bio's for given buffer */
- if (bio_size <= 0)
- break;
-
- /*
- * By default only 'bio_count' bio's per dio are allowed.
- * However, if we find ourselves in a situation where more
- * are needed we allocate a larger dio and warn the user.
- */
- if (dr->dr_bio_count == i) {
- vdev_disk_dio_free(dr);
- bio_count *= 2;
- goto retry;
- }
-
- /* bio_alloc() with __GFP_WAIT never returns NULL */
- dr->dr_bio[i] = bio_alloc(GFP_NOIO,
- MIN(abd_nr_pages_off(zio->io_abd, bio_size, abd_offset),
- BIO_MAX_PAGES));
- if (unlikely(dr->dr_bio[i] == NULL)) {
- vdev_disk_dio_free(dr);
- return (SET_ERROR(ENOMEM));
- }
-
- /* Matching put called by vdev_disk_physio_completion */
- vdev_disk_dio_get(dr);
-
- bio_set_dev(dr->dr_bio[i], bdev);
- BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
- dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
- dr->dr_bio[i]->bi_private = dr;
- bio_set_op_attrs(dr->dr_bio[i], rw, flags);
-
- /* Remaining size is returned to become the new size */
- bio_size = bio_map_abd_off(dr->dr_bio[i], zio->io_abd,
- bio_size, abd_offset);
-
- /* Advance in buffer and construct another bio if needed */
- abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
- bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
- }
-
- /* Extra reference to protect dio_request during vdev_submit_bio */
- vdev_disk_dio_get(dr);
-
-#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
- if (dr->dr_bio_count > 1)
- blk_start_plug(&plug);
-#endif
-
- /* Submit all bio's associated with this dio */
- for (i = 0; i < dr->dr_bio_count; i++)
- if (dr->dr_bio[i])
- vdev_submit_bio(dr->dr_bio[i]);
-
-#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
- if (dr->dr_bio_count > 1)
- blk_finish_plug(&plug);
-#endif
-
- (void) vdev_disk_dio_put(dr);
-
- return (error);
-}
-
-BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
-{
- zio_t *zio = bio->bi_private;
-#ifdef HAVE_1ARG_BIO_END_IO_T
- zio->io_error = BIO_END_IO_ERROR(bio);
-#else
- zio->io_error = -error;
-#endif
-
- if (zio->io_error && (zio->io_error == EOPNOTSUPP))
- zio->io_vd->vdev_nowritecache = B_TRUE;
-
- bio_put(bio);
- ASSERT3S(zio->io_error, >=, 0);
- if (zio->io_error)
- vdev_disk_error(zio);
- zio_interrupt(zio);
-}
-
-static int
-vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
-{
- struct request_queue *q;
- struct bio *bio;
-
- q = bdev_get_queue(bdev);
- if (!q)
- return (SET_ERROR(ENXIO));
-
- bio = bio_alloc(GFP_NOIO, 0);
- /* bio_alloc() with __GFP_WAIT never returns NULL */
- if (unlikely(bio == NULL))
- return (SET_ERROR(ENOMEM));
-
- bio->bi_end_io = vdev_disk_io_flush_completion;
- bio->bi_private = zio;
- bio_set_dev(bio, bdev);
- bio_set_flush(bio);
- vdev_submit_bio(bio);
- invalidate_bdev(bdev);
-
- return (0);
-}
-
-static void
-vdev_disk_io_start(zio_t *zio)
-{
- vdev_t *v = zio->io_vd;
- vdev_disk_t *vd = v->vdev_tsd;
- unsigned long trim_flags = 0;
- int rw, flags, error;
-
- /*
- * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
- * Nothing to be done here but return failure.
- */
- if (vd == NULL) {
- zio->io_error = ENXIO;
- zio_interrupt(zio);
- return;
- }
-
- rw_enter(&vd->vd_lock, RW_READER);
-
- /*
- * If the vdev is closed, it's likely due to a failed reopen and is
- * in the UNAVAIL state. Nothing to be done here but return failure.
- */
- if (vd->vd_bdev == NULL) {
- rw_exit(&vd->vd_lock);
- zio->io_error = ENXIO;
- zio_interrupt(zio);
- return;
- }
-
- switch (zio->io_type) {
- case ZIO_TYPE_IOCTL:
-
- if (!vdev_readable(v)) {
- rw_exit(&vd->vd_lock);
- zio->io_error = SET_ERROR(ENXIO);
- zio_interrupt(zio);
- return;
- }
-
- switch (zio->io_cmd) {
- case DKIOCFLUSHWRITECACHE:
-
- if (zfs_nocacheflush)
- break;
-
- if (v->vdev_nowritecache) {
- zio->io_error = SET_ERROR(ENOTSUP);
- break;
- }
-
- error = vdev_disk_io_flush(vd->vd_bdev, zio);
- if (error == 0) {
- rw_exit(&vd->vd_lock);
- return;
- }
-
- zio->io_error = error;
-
- break;
-
- default:
- zio->io_error = SET_ERROR(ENOTSUP);
- }
-
- rw_exit(&vd->vd_lock);
- zio_execute(zio);
- return;
- case ZIO_TYPE_WRITE:
- rw = WRITE;
-#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
- flags = (1 << BIO_RW_UNPLUG);
-#elif defined(REQ_UNPLUG)
- flags = REQ_UNPLUG;
-#else
- flags = 0;
-#endif
- break;
-
- case ZIO_TYPE_READ:
- rw = READ;
-#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
- flags = (1 << BIO_RW_UNPLUG);
-#elif defined(REQ_UNPLUG)
- flags = REQ_UNPLUG;
-#else
- flags = 0;
-#endif
- break;
-
- case ZIO_TYPE_TRIM:
-#if defined(BLKDEV_DISCARD_SECURE)
- if (zio->io_trim_flags & ZIO_TRIM_SECURE)
- trim_flags |= BLKDEV_DISCARD_SECURE;
-#endif
- zio->io_error = -blkdev_issue_discard(vd->vd_bdev,
- zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS,
- trim_flags);
-
- rw_exit(&vd->vd_lock);
- zio_interrupt(zio);
- return;
-
- default:
- rw_exit(&vd->vd_lock);
- zio->io_error = SET_ERROR(ENOTSUP);
- zio_interrupt(zio);
- return;
- }
-
- zio->io_target_timestamp = zio_handle_io_delay(zio);
- error = __vdev_disk_physio(vd->vd_bdev, zio,
- zio->io_size, zio->io_offset, rw, flags);
- rw_exit(&vd->vd_lock);
-
- if (error) {
- zio->io_error = error;
- zio_interrupt(zio);
- return;
- }
-}
-
-static void
-vdev_disk_io_done(zio_t *zio)
-{
- /*
- * If the device returned EIO, we revalidate the media. If it is
- * determined the media has changed this triggers the asynchronous
- * removal of the device from the configuration.
- */
- if (zio->io_error == EIO) {
- vdev_t *v = zio->io_vd;
- vdev_disk_t *vd = v->vdev_tsd;
-
- if (check_disk_change(vd->vd_bdev)) {
- vdev_bdev_invalidate(vd->vd_bdev);
- v->vdev_remove_wanted = B_TRUE;
- spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
- }
- }
-}
-
-static void
-vdev_disk_hold(vdev_t *vd)
-{
- ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
-
- /* We must have a pathname, and it must be absolute. */
- if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
- return;
-
- /*
- * Only prefetch path and devid info if the device has
- * never been opened.
- */
- if (vd->vdev_tsd != NULL)
- return;
-
- /* XXX: Implement me as a vnode lookup for the device */
- vd->vdev_name_vp = NULL;
- vd->vdev_devid_vp = NULL;
-}
-
-static void
-vdev_disk_rele(vdev_t *vd)
-{
- ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
-
- /* XXX: Implement me as a vnode rele for the device */
-}
-
-static int
-param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
-{
- spa_t *spa = NULL;
- char *p;
-
- if (val == NULL)
- return (SET_ERROR(-EINVAL));
-
- if ((p = strchr(val, '\n')) != NULL)
- *p = '\0';
-
- if (spa_mode_global != 0) {
- mutex_enter(&spa_namespace_lock);
- while ((spa = spa_next(spa)) != NULL) {
- if (spa_state(spa) != POOL_STATE_ACTIVE ||
- !spa_writeable(spa) || spa_suspended(spa))
- continue;
-
- spa_open_ref(spa, FTAG);
- mutex_exit(&spa_namespace_lock);
- vdev_elevator_switch(spa->spa_root_vdev, (char *)val);
- mutex_enter(&spa_namespace_lock);
- spa_close(spa, FTAG);
- }
- mutex_exit(&spa_namespace_lock);
- }
-
- return (param_set_charp(val, kp));
-}
-
-vdev_ops_t vdev_disk_ops = {
- .vdev_op_open = vdev_disk_open,
- .vdev_op_close = vdev_disk_close,
- .vdev_op_asize = vdev_default_asize,
- .vdev_op_io_start = vdev_disk_io_start,
- .vdev_op_io_done = vdev_disk_io_done,
- .vdev_op_state_change = NULL,
- .vdev_op_need_resilver = NULL,
- .vdev_op_hold = vdev_disk_hold,
- .vdev_op_rele = vdev_disk_rele,
- .vdev_op_remap = NULL,
- .vdev_op_xlate = vdev_default_xlate,
- .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */
- .vdev_op_leaf = B_TRUE /* leaf vdev */
-};
-
-module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
- param_get_charp, &zfs_vdev_scheduler, 0644);
-MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");
diff --git a/module/zfs/vdev_file.c b/module/zfs/vdev_file.c
deleted file mode 100644
index b79017f3a..000000000
--- a/module/zfs/vdev_file.c
+++ /dev/null
@@ -1,331 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/vdev_file.h>
-#include <sys/vdev_impl.h>
-#include <sys/vdev_trim.h>
-#include <sys/zio.h>
-#include <sys/fs/zfs.h>
-#include <sys/fm/fs/zfs.h>
-#include <sys/abd.h>
-#include <sys/fcntl.h>
-#include <sys/vnode.h>
-
-/*
- * Virtual device vector for files.
- */
-
-static taskq_t *vdev_file_taskq;
-
-static void
-vdev_file_hold(vdev_t *vd)
-{
- ASSERT(vd->vdev_path != NULL);
-}
-
-static void
-vdev_file_rele(vdev_t *vd)
-{
- ASSERT(vd->vdev_path != NULL);
-}
-
-static int
-vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
- uint64_t *ashift)
-{
- vdev_file_t *vf;
- vnode_t *vp;
- vattr_t vattr;
- int error;
-
- /*
- * Rotational optimizations only make sense on block devices.
- */
- vd->vdev_nonrot = B_TRUE;
-
- /*
- * Allow TRIM on file based vdevs. This may not always be supported,
- * since it depends on your kernel version and underlying filesystem
- * type but it is always safe to attempt.
- */
- vd->vdev_has_trim = B_TRUE;
-
- /*
- * Disable secure TRIM on file based vdevs. There is no way to
- * request this behavior from the underlying filesystem.
- */
- vd->vdev_has_securetrim = B_FALSE;
-
- /*
- * We must have a pathname, and it must be absolute.
- */
- if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
- vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
- return (SET_ERROR(EINVAL));
- }
-
- /*
- * Reopen the device if it's not currently open. Otherwise,
- * just update the physical size of the device.
- */
- if (vd->vdev_tsd != NULL) {
- ASSERT(vd->vdev_reopening);
- vf = vd->vdev_tsd;
- goto skip_open;
- }
-
- vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP);
-
- /*
- * We always open the files from the root of the global zone, even if
- * we're in a local zone. If the user has gotten to this point, the
- * administrator has already decided that the pool should be available
- * to local zone users, so the underlying devices should be as well.
- */
- ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/');
- error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE,
- spa_mode(vd->vdev_spa) | FOFFMAX, 0, &vp, 0, 0, rootdir, -1);
-
- if (error) {
- vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
- return (error);
- }
-
- vf->vf_vnode = vp;
-
-#ifdef _KERNEL
- /*
- * Make sure it's a regular file.
- */
- if (vp->v_type != VREG) {
- vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
- return (SET_ERROR(ENODEV));
- }
-#endif
-
-skip_open:
- /*
- * Determine the physical size of the file.
- */
- vattr.va_mask = AT_SIZE;
- error = VOP_GETATTR(vf->vf_vnode, &vattr, 0, kcred, NULL);
- if (error) {
- vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
- return (error);
- }
-
- *max_psize = *psize = vattr.va_size;
- *ashift = SPA_MINBLOCKSHIFT;
-
- return (0);
-}
-
-static void
-vdev_file_close(vdev_t *vd)
-{
- vdev_file_t *vf = vd->vdev_tsd;
-
- if (vd->vdev_reopening || vf == NULL)
- return;
-
- if (vf->vf_vnode != NULL) {
- (void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred, NULL);
- (void) VOP_CLOSE(vf->vf_vnode, spa_mode(vd->vdev_spa), 1, 0,
- kcred, NULL);
- }
-
- vd->vdev_delayed_close = B_FALSE;
- kmem_free(vf, sizeof (vdev_file_t));
- vd->vdev_tsd = NULL;
-}
-
-static void
-vdev_file_io_strategy(void *arg)
-{
- zio_t *zio = (zio_t *)arg;
- vdev_t *vd = zio->io_vd;
- vdev_file_t *vf = vd->vdev_tsd;
- ssize_t resid;
- void *buf;
-
- if (zio->io_type == ZIO_TYPE_READ)
- buf = abd_borrow_buf(zio->io_abd, zio->io_size);
- else
- buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
-
- zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ?
- UIO_READ : UIO_WRITE, vf->vf_vnode, buf, zio->io_size,
- zio->io_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
-
- if (zio->io_type == ZIO_TYPE_READ)
- abd_return_buf_copy(zio->io_abd, buf, zio->io_size);
- else
- abd_return_buf(zio->io_abd, buf, zio->io_size);
-
- if (resid != 0 && zio->io_error == 0)
- zio->io_error = SET_ERROR(ENOSPC);
-
- zio_delay_interrupt(zio);
-}
-
-static void
-vdev_file_io_fsync(void *arg)
-{
- zio_t *zio = (zio_t *)arg;
- vdev_file_t *vf = zio->io_vd->vdev_tsd;
-
- zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC, kcred, NULL);
-
- zio_interrupt(zio);
-}
-
-static void
-vdev_file_io_start(zio_t *zio)
-{
- vdev_t *vd = zio->io_vd;
- vdev_file_t *vf = vd->vdev_tsd;
-
- if (zio->io_type == ZIO_TYPE_IOCTL) {
- /* XXPOLICY */
- if (!vdev_readable(vd)) {
- zio->io_error = SET_ERROR(ENXIO);
- zio_interrupt(zio);
- return;
- }
-
- switch (zio->io_cmd) {
- case DKIOCFLUSHWRITECACHE:
-
- if (zfs_nocacheflush)
- break;
-
- /*
- * We cannot safely call vfs_fsync() when PF_FSTRANS
- * is set in the current context. Filesystems like
- * XFS include sanity checks to verify it is not
- * already set, see xfs_vm_writepage(). Therefore
- * the sync must be dispatched to a different context.
- */
- if (__spl_pf_fstrans_check()) {
- VERIFY3U(taskq_dispatch(vdev_file_taskq,
- vdev_file_io_fsync, zio, TQ_SLEEP), !=,
- TASKQID_INVALID);
- return;
- }
-
- zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC,
- kcred, NULL);
- break;
- default:
- zio->io_error = SET_ERROR(ENOTSUP);
- }
-
- zio_execute(zio);
- return;
- } else if (zio->io_type == ZIO_TYPE_TRIM) {
- struct flock flck;
-
- ASSERT3U(zio->io_size, !=, 0);
- bzero(&flck, sizeof (flck));
- flck.l_type = F_FREESP;
- flck.l_start = zio->io_offset;
- flck.l_len = zio->io_size;
- flck.l_whence = SEEK_SET;
-
- zio->io_error = VOP_SPACE(vf->vf_vnode, F_FREESP, &flck,
- 0, 0, kcred, NULL);
-
- zio_execute(zio);
- return;
- }
-
- zio->io_target_timestamp = zio_handle_io_delay(zio);
-
- VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio,
- TQ_SLEEP), !=, TASKQID_INVALID);
-}
-
-/* ARGSUSED */
-static void
-vdev_file_io_done(zio_t *zio)
-{
-}
-
-vdev_ops_t vdev_file_ops = {
- .vdev_op_open = vdev_file_open,
- .vdev_op_close = vdev_file_close,
- .vdev_op_asize = vdev_default_asize,
- .vdev_op_io_start = vdev_file_io_start,
- .vdev_op_io_done = vdev_file_io_done,
- .vdev_op_state_change = NULL,
- .vdev_op_need_resilver = NULL,
- .vdev_op_hold = vdev_file_hold,
- .vdev_op_rele = vdev_file_rele,
- .vdev_op_remap = NULL,
- .vdev_op_xlate = vdev_default_xlate,
- .vdev_op_type = VDEV_TYPE_FILE, /* name of this vdev type */
- .vdev_op_leaf = B_TRUE /* leaf vdev */
-};
-
-void
-vdev_file_init(void)
-{
- vdev_file_taskq = taskq_create("z_vdev_file", MAX(boot_ncpus, 16),
- minclsyspri, boot_ncpus, INT_MAX, TASKQ_DYNAMIC);
-
- VERIFY(vdev_file_taskq);
-}
-
-void
-vdev_file_fini(void)
-{
- taskq_destroy(vdev_file_taskq);
-}
-
-/*
- * From userland we access disks just like files.
- */
-#ifndef _KERNEL
-
-vdev_ops_t vdev_disk_ops = {
- .vdev_op_open = vdev_file_open,
- .vdev_op_close = vdev_file_close,
- .vdev_op_asize = vdev_default_asize,
- .vdev_op_io_start = vdev_file_io_start,
- .vdev_op_io_done = vdev_file_io_done,
- .vdev_op_state_change = NULL,
- .vdev_op_need_resilver = NULL,
- .vdev_op_hold = vdev_file_hold,
- .vdev_op_rele = vdev_file_rele,
- .vdev_op_remap = NULL,
- .vdev_op_xlate = vdev_default_xlate,
- .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */
- .vdev_op_leaf = B_TRUE /* leaf vdev */
-};
-
-#endif
diff --git a/module/zfs/zfs_acl.c b/module/zfs/zfs_acl.c
deleted file mode 100644
index 26af91e27..000000000
--- a/module/zfs/zfs_acl.c
+++ /dev/null
@@ -1,2816 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
- */
-
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/time.h>
-#include <sys/sysmacros.h>
-#include <sys/vfs.h>
-#include <sys/vnode.h>
-#include <sys/sid.h>
-#include <sys/file.h>
-#include <sys/stat.h>
-#include <sys/kmem.h>
-#include <sys/cmn_err.h>
-#include <sys/errno.h>
-#include <sys/sdt.h>
-#include <sys/fs/zfs.h>
-#include <sys/mode.h>
-#include <sys/policy.h>
-#include <sys/zfs_znode.h>
-#include <sys/zfs_fuid.h>
-#include <sys/zfs_acl.h>
-#include <sys/zfs_dir.h>
-#include <sys/zfs_vfsops.h>
-#include <sys/dmu.h>
-#include <sys/dnode.h>
-#include <sys/zap.h>
-#include <sys/sa.h>
-#include <sys/trace_acl.h>
-#include <sys/zpl.h>
-
-#define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE
-#define DENY ACE_ACCESS_DENIED_ACE_TYPE
-#define MAX_ACE_TYPE ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE
-#define MIN_ACE_TYPE ALLOW
-
-#define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP)
-#define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \
- ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE)
-#define EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \
- ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
-#define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \
- ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
-
-#define ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \
- ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \
- ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \
- ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE)
-
-#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS)
-#define WRITE_MASK_ATTRS (ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| \
- ACE_DELETE|ACE_DELETE_CHILD)
-#define WRITE_MASK (WRITE_MASK_DATA|WRITE_MASK_ATTRS)
-
-#define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
- ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
-
-#define OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
- ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
-
-#define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \
- ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE|ACE_INHERITED_ACE)
-
-#define RESTRICTED_CLEAR (ACE_WRITE_ACL|ACE_WRITE_OWNER)
-
-#define V4_ACL_WIDE_FLAGS (ZFS_ACL_AUTO_INHERIT|ZFS_ACL_DEFAULTED|\
- ZFS_ACL_PROTECTED)
-
-#define ZFS_ACL_WIDE_FLAGS (V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\
- ZFS_ACL_OBJ_ACE)
-
-#define ALL_MODE_EXECS (S_IXUSR | S_IXGRP | S_IXOTH)
-
-#define IDMAP_WK_CREATOR_OWNER_UID 2147483648U
-
-static uint16_t
-zfs_ace_v0_get_type(void *acep)
-{
- return (((zfs_oldace_t *)acep)->z_type);
-}
-
-static uint16_t
-zfs_ace_v0_get_flags(void *acep)
-{
- return (((zfs_oldace_t *)acep)->z_flags);
-}
-
-static uint32_t
-zfs_ace_v0_get_mask(void *acep)
-{
- return (((zfs_oldace_t *)acep)->z_access_mask);
-}
-
-static uint64_t
-zfs_ace_v0_get_who(void *acep)
-{
- return (((zfs_oldace_t *)acep)->z_fuid);
-}
-
-static void
-zfs_ace_v0_set_type(void *acep, uint16_t type)
-{
- ((zfs_oldace_t *)acep)->z_type = type;
-}
-
-static void
-zfs_ace_v0_set_flags(void *acep, uint16_t flags)
-{
- ((zfs_oldace_t *)acep)->z_flags = flags;
-}
-
-static void
-zfs_ace_v0_set_mask(void *acep, uint32_t mask)
-{
- ((zfs_oldace_t *)acep)->z_access_mask = mask;
-}
-
-static void
-zfs_ace_v0_set_who(void *acep, uint64_t who)
-{
- ((zfs_oldace_t *)acep)->z_fuid = who;
-}
-
-/*ARGSUSED*/
-static size_t
-zfs_ace_v0_size(void *acep)
-{
- return (sizeof (zfs_oldace_t));
-}
-
-static size_t
-zfs_ace_v0_abstract_size(void)
-{
- return (sizeof (zfs_oldace_t));
-}
-
-static int
-zfs_ace_v0_mask_off(void)
-{
- return (offsetof(zfs_oldace_t, z_access_mask));
-}
-
-/*ARGSUSED*/
-static int
-zfs_ace_v0_data(void *acep, void **datap)
-{
- *datap = NULL;
- return (0);
-}
-
-static acl_ops_t zfs_acl_v0_ops = {
- .ace_mask_get = zfs_ace_v0_get_mask,
- .ace_mask_set = zfs_ace_v0_set_mask,
- .ace_flags_get = zfs_ace_v0_get_flags,
- .ace_flags_set = zfs_ace_v0_set_flags,
- .ace_type_get = zfs_ace_v0_get_type,
- .ace_type_set = zfs_ace_v0_set_type,
- .ace_who_get = zfs_ace_v0_get_who,
- .ace_who_set = zfs_ace_v0_set_who,
- .ace_size = zfs_ace_v0_size,
- .ace_abstract_size = zfs_ace_v0_abstract_size,
- .ace_mask_off = zfs_ace_v0_mask_off,
- .ace_data = zfs_ace_v0_data
-};
-
-static uint16_t
-zfs_ace_fuid_get_type(void *acep)
-{
- return (((zfs_ace_hdr_t *)acep)->z_type);
-}
-
-static uint16_t
-zfs_ace_fuid_get_flags(void *acep)
-{
- return (((zfs_ace_hdr_t *)acep)->z_flags);
-}
-
-static uint32_t
-zfs_ace_fuid_get_mask(void *acep)
-{
- return (((zfs_ace_hdr_t *)acep)->z_access_mask);
-}
-
-static uint64_t
-zfs_ace_fuid_get_who(void *args)
-{
- uint16_t entry_type;
- zfs_ace_t *acep = args;
-
- entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS;
-
- if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP ||
- entry_type == ACE_EVERYONE)
- return (-1);
- return (((zfs_ace_t *)acep)->z_fuid);
-}
-
-static void
-zfs_ace_fuid_set_type(void *acep, uint16_t type)
-{
- ((zfs_ace_hdr_t *)acep)->z_type = type;
-}
-
-static void
-zfs_ace_fuid_set_flags(void *acep, uint16_t flags)
-{
- ((zfs_ace_hdr_t *)acep)->z_flags = flags;
-}
-
-static void
-zfs_ace_fuid_set_mask(void *acep, uint32_t mask)
-{
- ((zfs_ace_hdr_t *)acep)->z_access_mask = mask;
-}
-
-static void
-zfs_ace_fuid_set_who(void *arg, uint64_t who)
-{
- zfs_ace_t *acep = arg;
-
- uint16_t entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS;
-
- if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP ||
- entry_type == ACE_EVERYONE)
- return;
- acep->z_fuid = who;
-}
-
-static size_t
-zfs_ace_fuid_size(void *acep)
-{
- zfs_ace_hdr_t *zacep = acep;
- uint16_t entry_type;
-
- switch (zacep->z_type) {
- case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
- case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
- return (sizeof (zfs_object_ace_t));
- case ALLOW:
- case DENY:
- entry_type =
- (((zfs_ace_hdr_t *)acep)->z_flags & ACE_TYPE_FLAGS);
- if (entry_type == ACE_OWNER ||
- entry_type == OWNING_GROUP ||
- entry_type == ACE_EVERYONE)
- return (sizeof (zfs_ace_hdr_t));
- /*FALLTHROUGH*/
- default:
- return (sizeof (zfs_ace_t));
- }
-}
-
-static size_t
-zfs_ace_fuid_abstract_size(void)
-{
- return (sizeof (zfs_ace_hdr_t));
-}
-
-static int
-zfs_ace_fuid_mask_off(void)
-{
- return (offsetof(zfs_ace_hdr_t, z_access_mask));
-}
-
-static int
-zfs_ace_fuid_data(void *acep, void **datap)
-{
- zfs_ace_t *zacep = acep;
- zfs_object_ace_t *zobjp;
-
- switch (zacep->z_hdr.z_type) {
- case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
- case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
- zobjp = acep;
- *datap = (caddr_t)zobjp + sizeof (zfs_ace_t);
- return (sizeof (zfs_object_ace_t) - sizeof (zfs_ace_t));
- default:
- *datap = NULL;
- return (0);
- }
-}
-
-static acl_ops_t zfs_acl_fuid_ops = {
- .ace_mask_get = zfs_ace_fuid_get_mask,
- .ace_mask_set = zfs_ace_fuid_set_mask,
- .ace_flags_get = zfs_ace_fuid_get_flags,
- .ace_flags_set = zfs_ace_fuid_set_flags,
- .ace_type_get = zfs_ace_fuid_get_type,
- .ace_type_set = zfs_ace_fuid_set_type,
- .ace_who_get = zfs_ace_fuid_get_who,
- .ace_who_set = zfs_ace_fuid_set_who,
- .ace_size = zfs_ace_fuid_size,
- .ace_abstract_size = zfs_ace_fuid_abstract_size,
- .ace_mask_off = zfs_ace_fuid_mask_off,
- .ace_data = zfs_ace_fuid_data
-};
-
-/*
- * The following three functions are provided for compatibility with
- * older ZPL version in order to determine if the file use to have
- * an external ACL and what version of ACL previously existed on the
- * file. Would really be nice to not need this, sigh.
- */
-uint64_t
-zfs_external_acl(znode_t *zp)
-{
- zfs_acl_phys_t acl_phys;
- int error;
-
- if (zp->z_is_sa)
- return (0);
-
- /*
- * Need to deal with a potential
- * race where zfs_sa_upgrade could cause
- * z_isa_sa to change.
- *
- * If the lookup fails then the state of z_is_sa should have
- * changed.
- */
-
- if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(ZTOZSB(zp)),
- &acl_phys, sizeof (acl_phys))) == 0)
- return (acl_phys.z_acl_extern_obj);
- else {
- /*
- * after upgrade the SA_ZPL_ZNODE_ACL should have been
- * removed
- */
- VERIFY(zp->z_is_sa && error == ENOENT);
- return (0);
- }
-}
-
-/*
- * Determine size of ACL in bytes
- *
- * This is more complicated than it should be since we have to deal
- * with old external ACLs.
- */
-static int
-zfs_acl_znode_info(znode_t *zp, int *aclsize, int *aclcount,
- zfs_acl_phys_t *aclphys)
-{
- zfsvfs_t *zfsvfs = ZTOZSB(zp);
- uint64_t acl_count;
- int size;
- int error;
-
- ASSERT(MUTEX_HELD(&zp->z_acl_lock));
- if (zp->z_is_sa) {
- if ((error = sa_size(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zfsvfs),
- &size)) != 0)
- return (error);
- *aclsize = size;
- if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_COUNT(zfsvfs),
- &acl_count, sizeof (acl_count))) != 0)
- return (error);
- *aclcount = acl_count;
- } else {
- if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs),
- aclphys, sizeof (*aclphys))) != 0)
- return (error);
-
- if (aclphys->z_acl_version == ZFS_ACL_VERSION_INITIAL) {
- *aclsize = ZFS_ACL_SIZE(aclphys->z_acl_size);
- *aclcount = aclphys->z_acl_size;
- } else {
- *aclsize = aclphys->z_acl_size;
- *aclcount = aclphys->z_acl_count;
- }
- }
- return (0);
-}
-
-int
-zfs_znode_acl_version(znode_t *zp)
-{
- zfs_acl_phys_t acl_phys;
-
- if (zp->z_is_sa)
- return (ZFS_ACL_VERSION_FUID);
- else {
- int error;
-
- /*
- * Need to deal with a potential
- * race where zfs_sa_upgrade could cause
- * z_isa_sa to change.
- *
- * If the lookup fails then the state of z_is_sa should have
- * changed.
- */
- if ((error = sa_lookup(zp->z_sa_hdl,
- SA_ZPL_ZNODE_ACL(ZTOZSB(zp)),
- &acl_phys, sizeof (acl_phys))) == 0)
- return (acl_phys.z_acl_version);
- else {
- /*
- * After upgrade SA_ZPL_ZNODE_ACL should have
- * been removed.
- */
- VERIFY(zp->z_is_sa && error == ENOENT);
- return (ZFS_ACL_VERSION_FUID);
- }
- }
-}
-
-static int
-zfs_acl_version(int version)
-{
- if (version < ZPL_VERSION_FUID)
- return (ZFS_ACL_VERSION_INITIAL);
- else
- return (ZFS_ACL_VERSION_FUID);
-}
-
-static int
-zfs_acl_version_zp(znode_t *zp)
-{
- return (zfs_acl_version(ZTOZSB(zp)->z_version));
-}
-
-zfs_acl_t *
-zfs_acl_alloc(int vers)
-{
- zfs_acl_t *aclp;
-
- aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP);
- list_create(&aclp->z_acl, sizeof (zfs_acl_node_t),
- offsetof(zfs_acl_node_t, z_next));
- aclp->z_version = vers;
- if (vers == ZFS_ACL_VERSION_FUID)
- aclp->z_ops = &zfs_acl_fuid_ops;
- else
- aclp->z_ops = &zfs_acl_v0_ops;
- return (aclp);
-}
-
-zfs_acl_node_t *
-zfs_acl_node_alloc(size_t bytes)
-{
- zfs_acl_node_t *aclnode;
-
- aclnode = kmem_zalloc(sizeof (zfs_acl_node_t), KM_SLEEP);
- if (bytes) {
- aclnode->z_acldata = kmem_alloc(bytes, KM_SLEEP);
- aclnode->z_allocdata = aclnode->z_acldata;
- aclnode->z_allocsize = bytes;
- aclnode->z_size = bytes;
- }
-
- return (aclnode);
-}
-
-static void
-zfs_acl_node_free(zfs_acl_node_t *aclnode)
-{
- if (aclnode->z_allocsize)
- kmem_free(aclnode->z_allocdata, aclnode->z_allocsize);
- kmem_free(aclnode, sizeof (zfs_acl_node_t));
-}
-
-static void
-zfs_acl_release_nodes(zfs_acl_t *aclp)
-{
- zfs_acl_node_t *aclnode;
-
- while ((aclnode = list_head(&aclp->z_acl))) {
- list_remove(&aclp->z_acl, aclnode);
- zfs_acl_node_free(aclnode);
- }
- aclp->z_acl_count = 0;
- aclp->z_acl_bytes = 0;
-}
-
-void
-zfs_acl_free(zfs_acl_t *aclp)
-{
- zfs_acl_release_nodes(aclp);
- list_destroy(&aclp->z_acl);
- kmem_free(aclp, sizeof (zfs_acl_t));
-}
-
-static boolean_t
-zfs_acl_valid_ace_type(uint_t type, uint_t flags)
-{
- uint16_t entry_type;
-
- switch (type) {
- case ALLOW:
- case DENY:
- case ACE_SYSTEM_AUDIT_ACE_TYPE:
- case ACE_SYSTEM_ALARM_ACE_TYPE:
- entry_type = flags & ACE_TYPE_FLAGS;
- return (entry_type == ACE_OWNER ||
- entry_type == OWNING_GROUP ||
- entry_type == ACE_EVERYONE || entry_type == 0 ||
- entry_type == ACE_IDENTIFIER_GROUP);
- default:
- if (type >= MIN_ACE_TYPE && type <= MAX_ACE_TYPE)
- return (B_TRUE);
- }
- return (B_FALSE);
-}
-
-static boolean_t
-zfs_ace_valid(umode_t obj_mode, zfs_acl_t *aclp, uint16_t type, uint16_t iflags)
-{
- /*
- * first check type of entry
- */
-
- if (!zfs_acl_valid_ace_type(type, iflags))
- return (B_FALSE);
-
- switch (type) {
- case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
- case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
- if (aclp->z_version < ZFS_ACL_VERSION_FUID)
- return (B_FALSE);
- aclp->z_hints |= ZFS_ACL_OBJ_ACE;
- }
-
- /*
- * next check inheritance level flags
- */
-
- if (S_ISDIR(obj_mode) &&
- (iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)))
- aclp->z_hints |= ZFS_INHERIT_ACE;
-
- if (iflags & (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) {
- if ((iflags & (ACE_FILE_INHERIT_ACE|
- ACE_DIRECTORY_INHERIT_ACE)) == 0) {
- return (B_FALSE);
- }
- }
-
- return (B_TRUE);
-}
-
-static void *
-zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who,
- uint32_t *access_mask, uint16_t *iflags, uint16_t *type)
-{
- zfs_acl_node_t *aclnode;
-
- ASSERT(aclp);
-
- if (start == NULL) {
- aclnode = list_head(&aclp->z_acl);
- if (aclnode == NULL)
- return (NULL);
-
- aclp->z_next_ace = aclnode->z_acldata;
- aclp->z_curr_node = aclnode;
- aclnode->z_ace_idx = 0;
- }
-
- aclnode = aclp->z_curr_node;
-
- if (aclnode == NULL)
- return (NULL);
-
- if (aclnode->z_ace_idx >= aclnode->z_ace_count) {
- aclnode = list_next(&aclp->z_acl, aclnode);
- if (aclnode == NULL)
- return (NULL);
- else {
- aclp->z_curr_node = aclnode;
- aclnode->z_ace_idx = 0;
- aclp->z_next_ace = aclnode->z_acldata;
- }
- }
-
- if (aclnode->z_ace_idx < aclnode->z_ace_count) {
- void *acep = aclp->z_next_ace;
- size_t ace_size;
-
- /*
- * Make sure we don't overstep our bounds
- */
- ace_size = aclp->z_ops->ace_size(acep);
-
- if (((caddr_t)acep + ace_size) >
- ((caddr_t)aclnode->z_acldata + aclnode->z_size)) {
- return (NULL);
- }
-
- *iflags = aclp->z_ops->ace_flags_get(acep);
- *type = aclp->z_ops->ace_type_get(acep);
- *access_mask = aclp->z_ops->ace_mask_get(acep);
- *who = aclp->z_ops->ace_who_get(acep);
- aclp->z_next_ace = (caddr_t)aclp->z_next_ace + ace_size;
- aclnode->z_ace_idx++;
-
- return ((void *)acep);
- }
- return (NULL);
-}
-
-/*ARGSUSED*/
-static uint64_t
-zfs_ace_walk(void *datap, uint64_t cookie, int aclcnt,
- uint16_t *flags, uint16_t *type, uint32_t *mask)
-{
- zfs_acl_t *aclp = datap;
- zfs_ace_hdr_t *acep = (zfs_ace_hdr_t *)(uintptr_t)cookie;
- uint64_t who;
-
- acep = zfs_acl_next_ace(aclp, acep, &who, mask,
- flags, type);
- return ((uint64_t)(uintptr_t)acep);
-}
-
-/*
- * Copy ACE to internal ZFS format.
- * While processing the ACL each ACE will be validated for correctness.
- * ACE FUIDs will be created later.
- */
-int
-zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, umode_t obj_mode, zfs_acl_t *aclp,
- void *datap, zfs_ace_t *z_acl, uint64_t aclcnt, size_t *size,
- zfs_fuid_info_t **fuidp, cred_t *cr)
-{
- int i;
- uint16_t entry_type;
- zfs_ace_t *aceptr = z_acl;
- ace_t *acep = datap;
- zfs_object_ace_t *zobjacep;
- ace_object_t *aceobjp;
-
- for (i = 0; i != aclcnt; i++) {
- aceptr->z_hdr.z_access_mask = acep->a_access_mask;
- aceptr->z_hdr.z_flags = acep->a_flags;
- aceptr->z_hdr.z_type = acep->a_type;
- entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS;
- if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP &&
- entry_type != ACE_EVERYONE) {
- aceptr->z_fuid = zfs_fuid_create(zfsvfs, acep->a_who,
- cr, (entry_type == 0) ?
- ZFS_ACE_USER : ZFS_ACE_GROUP, fuidp);
- }
-
- /*
- * Make sure ACE is valid
- */
- if (zfs_ace_valid(obj_mode, aclp, aceptr->z_hdr.z_type,
- aceptr->z_hdr.z_flags) != B_TRUE)
- return (SET_ERROR(EINVAL));
-
- switch (acep->a_type) {
- case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
- case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
- zobjacep = (zfs_object_ace_t *)aceptr;
- aceobjp = (ace_object_t *)acep;
-
- bcopy(aceobjp->a_obj_type, zobjacep->z_object_type,
- sizeof (aceobjp->a_obj_type));
- bcopy(aceobjp->a_inherit_obj_type,
- zobjacep->z_inherit_type,
- sizeof (aceobjp->a_inherit_obj_type));
- acep = (ace_t *)((caddr_t)acep + sizeof (ace_object_t));
- break;
- default:
- acep = (ace_t *)((caddr_t)acep + sizeof (ace_t));
- }
-
- aceptr = (zfs_ace_t *)((caddr_t)aceptr +
- aclp->z_ops->ace_size(aceptr));
- }
-
- *size = (caddr_t)aceptr - (caddr_t)z_acl;
-
- return (0);
-}
-
-/*
- * Copy ZFS ACEs to fixed size ace_t layout
- */
-static void
-zfs_copy_fuid_2_ace(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, cred_t *cr,
- void *datap, int filter)
-{
- uint64_t who;
- uint32_t access_mask;
- uint16_t iflags, type;
- zfs_ace_hdr_t *zacep = NULL;
- ace_t *acep = datap;
- ace_object_t *objacep;
- zfs_object_ace_t *zobjacep;
- size_t ace_size;
- uint16_t entry_type;
-
- while ((zacep = zfs_acl_next_ace(aclp, zacep,
- &who, &access_mask, &iflags, &type))) {
-
- switch (type) {
- case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
- case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
- if (filter) {
- continue;
- }
- zobjacep = (zfs_object_ace_t *)zacep;
- objacep = (ace_object_t *)acep;
- bcopy(zobjacep->z_object_type,
- objacep->a_obj_type,
- sizeof (zobjacep->z_object_type));
- bcopy(zobjacep->z_inherit_type,
- objacep->a_inherit_obj_type,
- sizeof (zobjacep->z_inherit_type));
- ace_size = sizeof (ace_object_t);
- break;
- default:
- ace_size = sizeof (ace_t);
- break;
- }
-
- entry_type = (iflags & ACE_TYPE_FLAGS);
- if ((entry_type != ACE_OWNER &&
- entry_type != OWNING_GROUP &&
- entry_type != ACE_EVERYONE)) {
- acep->a_who = zfs_fuid_map_id(zfsvfs, who,
- cr, (entry_type & ACE_IDENTIFIER_GROUP) ?
- ZFS_ACE_GROUP : ZFS_ACE_USER);
- } else {
- acep->a_who = (uid_t)(int64_t)who;
- }
- acep->a_access_mask = access_mask;
- acep->a_flags = iflags;
- acep->a_type = type;
- acep = (ace_t *)((caddr_t)acep + ace_size);
- }
-}
-
-static int
-zfs_copy_ace_2_oldace(umode_t obj_mode, zfs_acl_t *aclp, ace_t *acep,
- zfs_oldace_t *z_acl, int aclcnt, size_t *size)
-{
- int i;
- zfs_oldace_t *aceptr = z_acl;
-
- for (i = 0; i != aclcnt; i++, aceptr++) {
- aceptr->z_access_mask = acep[i].a_access_mask;
- aceptr->z_type = acep[i].a_type;
- aceptr->z_flags = acep[i].a_flags;
- aceptr->z_fuid = acep[i].a_who;
- /*
- * Make sure ACE is valid
- */
- if (zfs_ace_valid(obj_mode, aclp, aceptr->z_type,
- aceptr->z_flags) != B_TRUE)
- return (SET_ERROR(EINVAL));
- }
- *size = (caddr_t)aceptr - (caddr_t)z_acl;
- return (0);
-}
-
-/*
- * convert old ACL format to new
- */
-void
-zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp, cred_t *cr)
-{
- zfs_oldace_t *oldaclp;
- int i;
- uint16_t type, iflags;
- uint32_t access_mask;
- uint64_t who;
- void *cookie = NULL;
- zfs_acl_node_t *newaclnode;
-
- ASSERT(aclp->z_version == ZFS_ACL_VERSION_INITIAL);
- /*
- * First create the ACE in a contiguous piece of memory
- * for zfs_copy_ace_2_fuid().
- *
- * We only convert an ACL once, so this won't happen
- * every time.
- */
- oldaclp = kmem_alloc(sizeof (zfs_oldace_t) * aclp->z_acl_count,
- KM_SLEEP);
- i = 0;
- while ((cookie = zfs_acl_next_ace(aclp, cookie, &who,
- &access_mask, &iflags, &type))) {
- oldaclp[i].z_flags = iflags;
- oldaclp[i].z_type = type;
- oldaclp[i].z_fuid = who;
- oldaclp[i++].z_access_mask = access_mask;
- }
-
- newaclnode = zfs_acl_node_alloc(aclp->z_acl_count *
- sizeof (zfs_object_ace_t));
- aclp->z_ops = &zfs_acl_fuid_ops;
- VERIFY(zfs_copy_ace_2_fuid(ZTOZSB(zp), ZTOI(zp)->i_mode,
- aclp, oldaclp, newaclnode->z_acldata, aclp->z_acl_count,
- &newaclnode->z_size, NULL, cr) == 0);
- newaclnode->z_ace_count = aclp->z_acl_count;
- aclp->z_version = ZFS_ACL_VERSION;
- kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t));
-
- /*
- * Release all previous ACL nodes
- */
-
- zfs_acl_release_nodes(aclp);
-
- list_insert_head(&aclp->z_acl, newaclnode);
-
- aclp->z_acl_bytes = newaclnode->z_size;
- aclp->z_acl_count = newaclnode->z_ace_count;
-
-}
-
-/*
- * Convert unix access mask to v4 access mask
- */
-static uint32_t
-zfs_unix_to_v4(uint32_t access_mask)
-{
- uint32_t new_mask = 0;
-
- if (access_mask & S_IXOTH)
- new_mask |= ACE_EXECUTE;
- if (access_mask & S_IWOTH)
- new_mask |= ACE_WRITE_DATA;
- if (access_mask & S_IROTH)
- new_mask |= ACE_READ_DATA;
- return (new_mask);
-}
-
-static void
-zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask,
- uint16_t access_type, uint64_t fuid, uint16_t entry_type)
-{
- uint16_t type = entry_type & ACE_TYPE_FLAGS;
-
- aclp->z_ops->ace_mask_set(acep, access_mask);
- aclp->z_ops->ace_type_set(acep, access_type);
- aclp->z_ops->ace_flags_set(acep, entry_type);
- if ((type != ACE_OWNER && type != OWNING_GROUP &&
- type != ACE_EVERYONE))
- aclp->z_ops->ace_who_set(acep, fuid);
-}
-
-/*
- * Determine mode of file based on ACL.
- * Also, create FUIDs for any User/Group ACEs
- */
-uint64_t
-zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp,
- uint64_t *pflags, uint64_t fuid, uint64_t fgid)
-{
- int entry_type;
- mode_t mode;
- mode_t seen = 0;
- zfs_ace_hdr_t *acep = NULL;
- uint64_t who;
- uint16_t iflags, type;
- uint32_t access_mask;
- boolean_t an_exec_denied = B_FALSE;
-
- mode = (fmode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX));
-
- while ((acep = zfs_acl_next_ace(aclp, acep, &who,
- &access_mask, &iflags, &type))) {
-
- if (!zfs_acl_valid_ace_type(type, iflags))
- continue;
-
- entry_type = (iflags & ACE_TYPE_FLAGS);
-
- /*
- * Skip over owner@, group@ or everyone@ inherit only ACEs
- */
- if ((iflags & ACE_INHERIT_ONLY_ACE) &&
- (entry_type == ACE_OWNER || entry_type == ACE_EVERYONE ||
- entry_type == OWNING_GROUP))
- continue;
-
- if (entry_type == ACE_OWNER || (entry_type == 0 &&
- who == fuid)) {
- if ((access_mask & ACE_READ_DATA) &&
- (!(seen & S_IRUSR))) {
- seen |= S_IRUSR;
- if (type == ALLOW) {
- mode |= S_IRUSR;
- }
- }
- if ((access_mask & ACE_WRITE_DATA) &&
- (!(seen & S_IWUSR))) {
- seen |= S_IWUSR;
- if (type == ALLOW) {
- mode |= S_IWUSR;
- }
- }
- if ((access_mask & ACE_EXECUTE) &&
- (!(seen & S_IXUSR))) {
- seen |= S_IXUSR;
- if (type == ALLOW) {
- mode |= S_IXUSR;
- }
- }
- } else if (entry_type == OWNING_GROUP ||
- (entry_type == ACE_IDENTIFIER_GROUP && who == fgid)) {
- if ((access_mask & ACE_READ_DATA) &&
- (!(seen & S_IRGRP))) {
- seen |= S_IRGRP;
- if (type == ALLOW) {
- mode |= S_IRGRP;
- }
- }
- if ((access_mask & ACE_WRITE_DATA) &&
- (!(seen & S_IWGRP))) {
- seen |= S_IWGRP;
- if (type == ALLOW) {
- mode |= S_IWGRP;
- }
- }
- if ((access_mask & ACE_EXECUTE) &&
- (!(seen & S_IXGRP))) {
- seen |= S_IXGRP;
- if (type == ALLOW) {
- mode |= S_IXGRP;
- }
- }
- } else if (entry_type == ACE_EVERYONE) {
- if ((access_mask & ACE_READ_DATA)) {
- if (!(seen & S_IRUSR)) {
- seen |= S_IRUSR;
- if (type == ALLOW) {
- mode |= S_IRUSR;
- }
- }
- if (!(seen & S_IRGRP)) {
- seen |= S_IRGRP;
- if (type == ALLOW) {
- mode |= S_IRGRP;
- }
- }
- if (!(seen & S_IROTH)) {
- seen |= S_IROTH;
- if (type == ALLOW) {
- mode |= S_IROTH;
- }
- }
- }
- if ((access_mask & ACE_WRITE_DATA)) {
- if (!(seen & S_IWUSR)) {
- seen |= S_IWUSR;
- if (type == ALLOW) {
- mode |= S_IWUSR;
- }
- }
- if (!(seen & S_IWGRP)) {
- seen |= S_IWGRP;
- if (type == ALLOW) {
- mode |= S_IWGRP;
- }
- }
- if (!(seen & S_IWOTH)) {
- seen |= S_IWOTH;
- if (type == ALLOW) {
- mode |= S_IWOTH;
- }
- }
- }
- if ((access_mask & ACE_EXECUTE)) {
- if (!(seen & S_IXUSR)) {
- seen |= S_IXUSR;
- if (type == ALLOW) {
- mode |= S_IXUSR;
- }
- }
- if (!(seen & S_IXGRP)) {
- seen |= S_IXGRP;
- if (type == ALLOW) {
- mode |= S_IXGRP;
- }
- }
- if (!(seen & S_IXOTH)) {
- seen |= S_IXOTH;
- if (type == ALLOW) {
- mode |= S_IXOTH;
- }
- }
- }
- } else {
- /*
- * Only care if this IDENTIFIER_GROUP or
- * USER ACE denies execute access to someone,
- * mode is not affected
- */
- if ((access_mask & ACE_EXECUTE) && type == DENY)
- an_exec_denied = B_TRUE;
- }
- }
-
- /*
- * Failure to allow is effectively a deny, so execute permission
- * is denied if it was never mentioned or if we explicitly
- * weren't allowed it.
- */
- if (!an_exec_denied &&
- ((seen & ALL_MODE_EXECS) != ALL_MODE_EXECS ||
- (mode & ALL_MODE_EXECS) != ALL_MODE_EXECS))
- an_exec_denied = B_TRUE;
-
- if (an_exec_denied)
- *pflags &= ~ZFS_NO_EXECS_DENIED;
- else
- *pflags |= ZFS_NO_EXECS_DENIED;
-
- return (mode);
-}
-
-/*
- * Read an external acl object. If the intent is to modify, always
- * create a new acl and leave any cached acl in place.
- */
-int
-zfs_acl_node_read(struct znode *zp, boolean_t have_lock, zfs_acl_t **aclpp,
- boolean_t will_modify)
-{
- zfs_acl_t *aclp;
- int aclsize = 0;
- int acl_count = 0;
- zfs_acl_node_t *aclnode;
- zfs_acl_phys_t znode_acl;
- int version;
- int error;
- boolean_t drop_lock = B_FALSE;
-
- ASSERT(MUTEX_HELD(&zp->z_acl_lock));
-
- if (zp->z_acl_cached && !will_modify) {
- *aclpp = zp->z_acl_cached;
- return (0);
- }
-
- /*
- * close race where znode could be upgrade while trying to
- * read the znode attributes.
- *
- * But this could only happen if the file isn't already an SA
- * znode
- */
- if (!zp->z_is_sa && !have_lock) {
- mutex_enter(&zp->z_lock);
- drop_lock = B_TRUE;
- }
- version = zfs_znode_acl_version(zp);
-
- if ((error = zfs_acl_znode_info(zp, &aclsize,
- &acl_count, &znode_acl)) != 0) {
- goto done;
- }
-
- aclp = zfs_acl_alloc(version);
-
- aclp->z_acl_count = acl_count;
- aclp->z_acl_bytes = aclsize;
-
- aclnode = zfs_acl_node_alloc(aclsize);
- aclnode->z_ace_count = aclp->z_acl_count;
- aclnode->z_size = aclsize;
-
- if (!zp->z_is_sa) {
- if (znode_acl.z_acl_extern_obj) {
- error = dmu_read(ZTOZSB(zp)->z_os,
- znode_acl.z_acl_extern_obj, 0, aclnode->z_size,
- aclnode->z_acldata, DMU_READ_PREFETCH);
- } else {
- bcopy(znode_acl.z_ace_data, aclnode->z_acldata,
- aclnode->z_size);
- }
- } else {
- error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_ACES(ZTOZSB(zp)),
- aclnode->z_acldata, aclnode->z_size);
- }
-
- if (error != 0) {
- zfs_acl_free(aclp);
- zfs_acl_node_free(aclnode);
- /* convert checksum errors into IO errors */
- if (error == ECKSUM)
- error = SET_ERROR(EIO);
- goto done;
- }
-
- list_insert_head(&aclp->z_acl, aclnode);
-
- *aclpp = aclp;
- if (!will_modify)
- zp->z_acl_cached = aclp;
-done:
- if (drop_lock)
- mutex_exit(&zp->z_lock);
- return (error);
-}
-
-/*ARGSUSED*/
-void
-zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen,
- boolean_t start, void *userdata)
-{
- zfs_acl_locator_cb_t *cb = (zfs_acl_locator_cb_t *)userdata;
-
- if (start) {
- cb->cb_acl_node = list_head(&cb->cb_aclp->z_acl);
- } else {
- cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl,
- cb->cb_acl_node);
- }
- *dataptr = cb->cb_acl_node->z_acldata;
- *length = cb->cb_acl_node->z_size;
-}
-
-int
-zfs_acl_chown_setattr(znode_t *zp)
-{
- int error;
- zfs_acl_t *aclp;
-
- if (ZTOZSB(zp)->z_acl_type == ZFS_ACLTYPE_POSIXACL)
- return (0);
-
- ASSERT(MUTEX_HELD(&zp->z_lock));
- ASSERT(MUTEX_HELD(&zp->z_acl_lock));
-
- error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE);
- if (error == 0 && aclp->z_acl_count > 0)
- zp->z_mode = ZTOI(zp)->i_mode =
- zfs_mode_compute(zp->z_mode, aclp,
- &zp->z_pflags, KUID_TO_SUID(ZTOI(zp)->i_uid),
- KGID_TO_SGID(ZTOI(zp)->i_gid));
-
- /*
- * Some ZFS implementations (ZEVO) create neither a ZNODE_ACL
- * nor a DACL_ACES SA in which case ENOENT is returned from
- * zfs_acl_node_read() when the SA can't be located.
- * Allow chown/chgrp to succeed in these cases rather than
- * returning an error that makes no sense in the context of
- * the caller.
- */
- if (error == ENOENT)
- return (0);
-
- return (error);
-}
-
-static void
-acl_trivial_access_masks(mode_t mode, uint32_t *allow0, uint32_t *deny1,
- uint32_t *deny2, uint32_t *owner, uint32_t *group, uint32_t *everyone)
-{
- *deny1 = *deny2 = *allow0 = *group = 0;
-
- if (!(mode & S_IRUSR) && (mode & (S_IRGRP|S_IROTH)))
- *deny1 |= ACE_READ_DATA;
- if (!(mode & S_IWUSR) && (mode & (S_IWGRP|S_IWOTH)))
- *deny1 |= ACE_WRITE_DATA;
- if (!(mode & S_IXUSR) && (mode & (S_IXGRP|S_IXOTH)))
- *deny1 |= ACE_EXECUTE;
-
- if (!(mode & S_IRGRP) && (mode & S_IROTH))
- *deny2 = ACE_READ_DATA;
- if (!(mode & S_IWGRP) && (mode & S_IWOTH))
- *deny2 |= ACE_WRITE_DATA;
- if (!(mode & S_IXGRP) && (mode & S_IXOTH))
- *deny2 |= ACE_EXECUTE;
-
- if ((mode & S_IRUSR) && (!(mode & S_IRGRP) && (mode & S_IROTH)))
- *allow0 |= ACE_READ_DATA;
- if ((mode & S_IWUSR) && (!(mode & S_IWGRP) && (mode & S_IWOTH)))
- *allow0 |= ACE_WRITE_DATA;
- if ((mode & S_IXUSR) && (!(mode & S_IXGRP) && (mode & S_IXOTH)))
- *allow0 |= ACE_EXECUTE;
-
- *owner = ACE_WRITE_ATTRIBUTES|ACE_WRITE_OWNER|ACE_WRITE_ACL|
- ACE_WRITE_NAMED_ATTRS|ACE_READ_ACL|ACE_READ_ATTRIBUTES|
- ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE;
- if (mode & S_IRUSR)
- *owner |= ACE_READ_DATA;
- if (mode & S_IWUSR)
- *owner |= ACE_WRITE_DATA|ACE_APPEND_DATA;
- if (mode & S_IXUSR)
- *owner |= ACE_EXECUTE;
-
- *group = ACE_READ_ACL|ACE_READ_ATTRIBUTES| ACE_READ_NAMED_ATTRS|
- ACE_SYNCHRONIZE;
- if (mode & S_IRGRP)
- *group |= ACE_READ_DATA;
- if (mode & S_IWGRP)
- *group |= ACE_WRITE_DATA|ACE_APPEND_DATA;
- if (mode & S_IXGRP)
- *group |= ACE_EXECUTE;
-
- *everyone = ACE_READ_ACL|ACE_READ_ATTRIBUTES| ACE_READ_NAMED_ATTRS|
- ACE_SYNCHRONIZE;
- if (mode & S_IROTH)
- *everyone |= ACE_READ_DATA;
- if (mode & S_IWOTH)
- *everyone |= ACE_WRITE_DATA|ACE_APPEND_DATA;
- if (mode & S_IXOTH)
- *everyone |= ACE_EXECUTE;
-}
-
-/*
- * ace_trivial:
- * determine whether an ace_t acl is trivial
- *
- * Trivialness implies that the acl is composed of only
- * owner, group, everyone entries. ACL can't
- * have read_acl denied, and write_owner/write_acl/write_attributes
- * can only be owner@ entry.
- */
-static int
-ace_trivial_common(void *acep, int aclcnt,
- uint64_t (*walk)(void *, uint64_t, int aclcnt,
- uint16_t *, uint16_t *, uint32_t *))
-{
- uint16_t flags;
- uint32_t mask;
- uint16_t type;
- uint64_t cookie = 0;
-
- while ((cookie = walk(acep, cookie, aclcnt, &flags, &type, &mask))) {
- switch (flags & ACE_TYPE_FLAGS) {
- case ACE_OWNER:
- case ACE_GROUP|ACE_IDENTIFIER_GROUP:
- case ACE_EVERYONE:
- break;
- default:
- return (1);
- }
-
- if (flags & (ACE_FILE_INHERIT_ACE|
- ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE|
- ACE_INHERIT_ONLY_ACE))
- return (1);
-
- /*
- * Special check for some special bits
- *
- * Don't allow anybody to deny reading basic
- * attributes or a files ACL.
- */
- if ((mask & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) &&
- (type == ACE_ACCESS_DENIED_ACE_TYPE))
- return (1);
-
- /*
- * Delete permissions are never set by default
- */
- if (mask & (ACE_DELETE|ACE_DELETE_CHILD))
- return (1);
- /*
- * only allow owner@ to have
- * write_acl/write_owner/write_attributes/write_xattr/
- */
- if (type == ACE_ACCESS_ALLOWED_ACE_TYPE &&
- (!(flags & ACE_OWNER) && (mask &
- (ACE_WRITE_OWNER|ACE_WRITE_ACL| ACE_WRITE_ATTRIBUTES|
- ACE_WRITE_NAMED_ATTRS))))
- return (1);
-
- }
-
- return (0);
-}
-
-/*
- * common code for setting ACLs.
- *
- * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl.
- * zfs_setacl passes a non-NULL inherit pointer (ihp) to indicate that it's
- * already checked the acl and knows whether to inherit.
- */
-int
-zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx)
-{
- int error;
- zfsvfs_t *zfsvfs = ZTOZSB(zp);
- dmu_object_type_t otype;
- zfs_acl_locator_cb_t locate = { 0 };
- uint64_t mode;
- sa_bulk_attr_t bulk[5];
- uint64_t ctime[2];
- int count = 0;
- zfs_acl_phys_t acl_phys;
-
- mode = zp->z_mode;
-
- mode = zfs_mode_compute(mode, aclp, &zp->z_pflags,
- KUID_TO_SUID(ZTOI(zp)->i_uid), KGID_TO_SGID(ZTOI(zp)->i_gid));
-
- zp->z_mode = ZTOI(zp)->i_mode = mode;
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
- &mode, sizeof (mode));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
- &zp->z_pflags, sizeof (zp->z_pflags));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
- &ctime, sizeof (ctime));
-
- if (zp->z_acl_cached) {
- zfs_acl_free(zp->z_acl_cached);
- zp->z_acl_cached = NULL;
- }
-
- /*
- * Upgrade needed?
- */
- if (!zfsvfs->z_use_fuids) {
- otype = DMU_OT_OLDACL;
- } else {
- if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) &&
- (zfsvfs->z_version >= ZPL_VERSION_FUID))
- zfs_acl_xform(zp, aclp, cr);
- ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID);
- otype = DMU_OT_ACL;
- }
-
- /*
- * Arrgh, we have to handle old on disk format
- * as well as newer (preferred) SA format.
- */
-
- if (zp->z_is_sa) { /* the easy case, just update the ACL attribute */
- locate.cb_aclp = aclp;
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_ACES(zfsvfs),
- zfs_acl_data_locator, &locate, aclp->z_acl_bytes);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_COUNT(zfsvfs),
- NULL, &aclp->z_acl_count, sizeof (uint64_t));
- } else { /* Painful legacy way */
- zfs_acl_node_t *aclnode;
- uint64_t off = 0;
- uint64_t aoid;
-
- if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs),
- &acl_phys, sizeof (acl_phys))) != 0)
- return (error);
-
- aoid = acl_phys.z_acl_extern_obj;
-
- if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
- /*
- * If ACL was previously external and we are now
- * converting to new ACL format then release old
- * ACL object and create a new one.
- */
- if (aoid &&
- aclp->z_version != acl_phys.z_acl_version) {
- error = dmu_object_free(zfsvfs->z_os, aoid, tx);
- if (error)
- return (error);
- aoid = 0;
- }
- if (aoid == 0) {
- aoid = dmu_object_alloc(zfsvfs->z_os,
- otype, aclp->z_acl_bytes,
- otype == DMU_OT_ACL ?
- DMU_OT_SYSACL : DMU_OT_NONE,
- otype == DMU_OT_ACL ?
- DN_OLD_MAX_BONUSLEN : 0, tx);
- } else {
- (void) dmu_object_set_blocksize(zfsvfs->z_os,
- aoid, aclp->z_acl_bytes, 0, tx);
- }
- acl_phys.z_acl_extern_obj = aoid;
- for (aclnode = list_head(&aclp->z_acl); aclnode;
- aclnode = list_next(&aclp->z_acl, aclnode)) {
- if (aclnode->z_ace_count == 0)
- continue;
- dmu_write(zfsvfs->z_os, aoid, off,
- aclnode->z_size, aclnode->z_acldata, tx);
- off += aclnode->z_size;
- }
- } else {
- void *start = acl_phys.z_ace_data;
- /*
- * Migrating back embedded?
- */
- if (acl_phys.z_acl_extern_obj) {
- error = dmu_object_free(zfsvfs->z_os,
- acl_phys.z_acl_extern_obj, tx);
- if (error)
- return (error);
- acl_phys.z_acl_extern_obj = 0;
- }
-
- for (aclnode = list_head(&aclp->z_acl); aclnode;
- aclnode = list_next(&aclp->z_acl, aclnode)) {
- if (aclnode->z_ace_count == 0)
- continue;
- bcopy(aclnode->z_acldata, start,
- aclnode->z_size);
- start = (caddr_t)start + aclnode->z_size;
- }
- }
- /*
- * If Old version then swap count/bytes to match old
- * layout of znode_acl_phys_t.
- */
- if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) {
- acl_phys.z_acl_size = aclp->z_acl_count;
- acl_phys.z_acl_count = aclp->z_acl_bytes;
- } else {
- acl_phys.z_acl_size = aclp->z_acl_bytes;
- acl_phys.z_acl_count = aclp->z_acl_count;
- }
- acl_phys.z_acl_version = aclp->z_version;
-
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
- &acl_phys, sizeof (acl_phys));
- }
-
- /*
- * Replace ACL wide bits, but first clear them.
- */
- zp->z_pflags &= ~ZFS_ACL_WIDE_FLAGS;
-
- zp->z_pflags |= aclp->z_hints;
-
- if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0)
- zp->z_pflags |= ZFS_ACL_TRIVIAL;
-
- zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime);
- return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx));
-}
-
-static void
-zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t mode, zfs_acl_t *aclp)
-{
- void *acep = NULL;
- uint64_t who;
- int new_count, new_bytes;
- int ace_size;
- int entry_type;
- uint16_t iflags, type;
- uint32_t access_mask;
- zfs_acl_node_t *newnode;
- size_t abstract_size = aclp->z_ops->ace_abstract_size();
- void *zacep;
- uint32_t owner, group, everyone;
- uint32_t deny1, deny2, allow0;
-
- new_count = new_bytes = 0;
-
- acl_trivial_access_masks((mode_t)mode, &allow0, &deny1, &deny2,
- &owner, &group, &everyone);
-
- newnode = zfs_acl_node_alloc((abstract_size * 6) + aclp->z_acl_bytes);
-
- zacep = newnode->z_acldata;
- if (allow0) {
- zfs_set_ace(aclp, zacep, allow0, ALLOW, -1, ACE_OWNER);
- zacep = (void *)((uintptr_t)zacep + abstract_size);
- new_count++;
- new_bytes += abstract_size;
- }
- if (deny1) {
- zfs_set_ace(aclp, zacep, deny1, DENY, -1, ACE_OWNER);
- zacep = (void *)((uintptr_t)zacep + abstract_size);
- new_count++;
- new_bytes += abstract_size;
- }
- if (deny2) {
- zfs_set_ace(aclp, zacep, deny2, DENY, -1, OWNING_GROUP);
- zacep = (void *)((uintptr_t)zacep + abstract_size);
- new_count++;
- new_bytes += abstract_size;
- }
-
- while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask,
- &iflags, &type))) {
- uint16_t inherit_flags;
-
- entry_type = (iflags & ACE_TYPE_FLAGS);
- inherit_flags = (iflags & ALL_INHERIT);
-
- if ((entry_type == ACE_OWNER || entry_type == ACE_EVERYONE ||
- (entry_type == OWNING_GROUP)) &&
- ((inherit_flags & ACE_INHERIT_ONLY_ACE) == 0)) {
- continue;
- }
-
- if ((type != ALLOW && type != DENY) ||
- (inherit_flags & ACE_INHERIT_ONLY_ACE)) {
- if (inherit_flags)
- aclp->z_hints |= ZFS_INHERIT_ACE;
- switch (type) {
- case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
- case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
- aclp->z_hints |= ZFS_ACL_OBJ_ACE;
- break;
- }
- } else {
-
- /*
- * Limit permissions to be no greater than
- * group permissions
- */
- if (zfsvfs->z_acl_inherit == ZFS_ACL_RESTRICTED) {
- if (!(mode & S_IRGRP))
- access_mask &= ~ACE_READ_DATA;
- if (!(mode & S_IWGRP))
- access_mask &=
- ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
- if (!(mode & S_IXGRP))
- access_mask &= ~ACE_EXECUTE;
- access_mask &=
- ~(ACE_WRITE_OWNER|ACE_WRITE_ACL|
- ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS);
- }
- }
- zfs_set_ace(aclp, zacep, access_mask, type, who, iflags);
- ace_size = aclp->z_ops->ace_size(acep);
- zacep = (void *)((uintptr_t)zacep + ace_size);
- new_count++;
- new_bytes += ace_size;
- }
- zfs_set_ace(aclp, zacep, owner, 0, -1, ACE_OWNER);
- zacep = (void *)((uintptr_t)zacep + abstract_size);
- zfs_set_ace(aclp, zacep, group, 0, -1, OWNING_GROUP);
- zacep = (void *)((uintptr_t)zacep + abstract_size);
- zfs_set_ace(aclp, zacep, everyone, 0, -1, ACE_EVERYONE);
-
- new_count += 3;
- new_bytes += abstract_size * 3;
- zfs_acl_release_nodes(aclp);
- aclp->z_acl_count = new_count;
- aclp->z_acl_bytes = new_bytes;
- newnode->z_ace_count = new_count;
- newnode->z_size = new_bytes;
- list_insert_tail(&aclp->z_acl, newnode);
-}
-
-void
-zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode)
-{
- mutex_enter(&zp->z_acl_lock);
- mutex_enter(&zp->z_lock);
- *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp));
- (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS;
- zfs_acl_chmod(ZTOZSB(zp), mode, *aclp);
- mutex_exit(&zp->z_lock);
- mutex_exit(&zp->z_acl_lock);
- ASSERT(*aclp);
-}
-
-/*
- * strip off write_owner and write_acl
- */
-static void
-zfs_restricted_update(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, void *acep)
-{
- uint32_t mask = aclp->z_ops->ace_mask_get(acep);
-
- if ((zfsvfs->z_acl_inherit == ZFS_ACL_RESTRICTED) &&
- (aclp->z_ops->ace_type_get(acep) == ALLOW)) {
- mask &= ~RESTRICTED_CLEAR;
- aclp->z_ops->ace_mask_set(acep, mask);
- }
-}
-
-/*
- * Should ACE be inherited?
- */
-static int
-zfs_ace_can_use(umode_t obj_mode, uint16_t acep_flags)
-{
- int iflags = (acep_flags & 0xf);
-
- if (S_ISDIR(obj_mode) && (iflags & ACE_DIRECTORY_INHERIT_ACE))
- return (1);
- else if (iflags & ACE_FILE_INHERIT_ACE)
- return (!(S_ISDIR(obj_mode) &&
- (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)));
- return (0);
-}
-
-/*
- * inherit inheritable ACEs from parent
- */
-static zfs_acl_t *
-zfs_acl_inherit(zfsvfs_t *zfsvfs, umode_t obj_mode, zfs_acl_t *paclp,
- uint64_t mode, boolean_t *need_chmod)
-{
- void *pacep;
- void *acep;
- zfs_acl_node_t *aclnode;
- zfs_acl_t *aclp = NULL;
- uint64_t who;
- uint32_t access_mask;
- uint16_t iflags, newflags, type;
- size_t ace_size;
- void *data1, *data2;
- size_t data1sz, data2sz;
- boolean_t vdir = S_ISDIR(obj_mode);
- boolean_t vreg = S_ISREG(obj_mode);
- boolean_t passthrough, passthrough_x, noallow;
-
- passthrough_x =
- zfsvfs->z_acl_inherit == ZFS_ACL_PASSTHROUGH_X;
- passthrough = passthrough_x ||
- zfsvfs->z_acl_inherit == ZFS_ACL_PASSTHROUGH;
- noallow =
- zfsvfs->z_acl_inherit == ZFS_ACL_NOALLOW;
-
- *need_chmod = B_TRUE;
- pacep = NULL;
- aclp = zfs_acl_alloc(paclp->z_version);
- if (zfsvfs->z_acl_inherit == ZFS_ACL_DISCARD || S_ISLNK(obj_mode))
- return (aclp);
- while ((pacep = zfs_acl_next_ace(paclp, pacep, &who,
- &access_mask, &iflags, &type))) {
-
- /*
- * don't inherit bogus ACEs
- */
- if (!zfs_acl_valid_ace_type(type, iflags))
- continue;
-
- if (noallow && type == ALLOW)
- continue;
-
- ace_size = aclp->z_ops->ace_size(pacep);
-
- if (!zfs_ace_can_use(obj_mode, iflags))
- continue;
-
- /*
- * If owner@, group@, or everyone@ inheritable
- * then zfs_acl_chmod() isn't needed.
- */
- if (passthrough &&
- ((iflags & (ACE_OWNER|ACE_EVERYONE)) ||
- ((iflags & OWNING_GROUP) ==
- OWNING_GROUP)) && (vreg || (vdir && (iflags &
- ACE_DIRECTORY_INHERIT_ACE)))) {
- *need_chmod = B_FALSE;
- }
-
- if (!vdir && passthrough_x &&
- ((mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0)) {
- access_mask &= ~ACE_EXECUTE;
- }
-
- aclnode = zfs_acl_node_alloc(ace_size);
- list_insert_tail(&aclp->z_acl, aclnode);
- acep = aclnode->z_acldata;
-
- zfs_set_ace(aclp, acep, access_mask, type,
- who, iflags|ACE_INHERITED_ACE);
-
- /*
- * Copy special opaque data if any
- */
- if ((data1sz = paclp->z_ops->ace_data(pacep, &data1)) != 0) {
- VERIFY((data2sz = aclp->z_ops->ace_data(acep,
- &data2)) == data1sz);
- bcopy(data1, data2, data2sz);
- }
-
- aclp->z_acl_count++;
- aclnode->z_ace_count++;
- aclp->z_acl_bytes += aclnode->z_size;
- newflags = aclp->z_ops->ace_flags_get(acep);
-
- if (vdir)
- aclp->z_hints |= ZFS_INHERIT_ACE;
-
- if ((iflags & ACE_NO_PROPAGATE_INHERIT_ACE) || !vdir) {
- newflags &= ~ALL_INHERIT;
- aclp->z_ops->ace_flags_set(acep,
- newflags|ACE_INHERITED_ACE);
- zfs_restricted_update(zfsvfs, aclp, acep);
- continue;
- }
-
- ASSERT(vdir);
-
- /*
- * If only FILE_INHERIT is set then turn on
- * inherit_only
- */
- if ((iflags & (ACE_FILE_INHERIT_ACE |
- ACE_DIRECTORY_INHERIT_ACE)) == ACE_FILE_INHERIT_ACE) {
- newflags |= ACE_INHERIT_ONLY_ACE;
- aclp->z_ops->ace_flags_set(acep,
- newflags|ACE_INHERITED_ACE);
- } else {
- newflags &= ~ACE_INHERIT_ONLY_ACE;
- aclp->z_ops->ace_flags_set(acep,
- newflags|ACE_INHERITED_ACE);
- }
- }
- return (aclp);
-}
-
-/*
- * Create file system object initial permissions
- * including inheritable ACEs.
- */
-int
-zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
- vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids)
-{
- int error;
- zfsvfs_t *zfsvfs = ZTOZSB(dzp);
- zfs_acl_t *paclp;
- gid_t gid = vap->va_gid;
- boolean_t need_chmod = B_TRUE;
- boolean_t inherited = B_FALSE;
-
- bzero(acl_ids, sizeof (zfs_acl_ids_t));
- acl_ids->z_mode = vap->va_mode;
-
- if (vsecp)
- if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_mode, vsecp,
- cr, &acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0)
- return (error);
-
- acl_ids->z_fuid = vap->va_uid;
- acl_ids->z_fgid = vap->va_gid;
-#ifdef HAVE_KSID
- /*
- * Determine uid and gid.
- */
- if ((flag & IS_ROOT_NODE) || zfsvfs->z_replay ||
- ((flag & IS_XATTR) && (S_ISDIR(vap->va_mode)))) {
- acl_ids->z_fuid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_uid,
- cr, ZFS_OWNER, &acl_ids->z_fuidp);
- acl_ids->z_fgid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
- cr, ZFS_GROUP, &acl_ids->z_fuidp);
- gid = vap->va_gid;
- } else {
- acl_ids->z_fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER,
- cr, &acl_ids->z_fuidp);
- acl_ids->z_fgid = 0;
- if (vap->va_mask & AT_GID) {
- acl_ids->z_fgid = zfs_fuid_create(zfsvfs,
- (uint64_t)vap->va_gid,
- cr, ZFS_GROUP, &acl_ids->z_fuidp);
- gid = vap->va_gid;
- if (acl_ids->z_fgid != KGID_TO_SGID(ZTOI(dzp)->i_gid) &&
- !groupmember(vap->va_gid, cr) &&
- secpolicy_vnode_create_gid(cr) != 0)
- acl_ids->z_fgid = 0;
- }
- if (acl_ids->z_fgid == 0) {
- if (dzp->z_mode & S_ISGID) {
- char *domain;
- uint32_t rid;
-
- acl_ids->z_fgid = KGID_TO_SGID(
- ZTOI(dzp)->i_gid);
- gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid,
- cr, ZFS_GROUP);
-
- if (zfsvfs->z_use_fuids &&
- IS_EPHEMERAL(acl_ids->z_fgid)) {
- domain = zfs_fuid_idx_domain(
- &zfsvfs->z_fuid_idx,
- FUID_INDEX(acl_ids->z_fgid));
- rid = FUID_RID(acl_ids->z_fgid);
- zfs_fuid_node_add(&acl_ids->z_fuidp,
- domain, rid,
- FUID_INDEX(acl_ids->z_fgid),
- acl_ids->z_fgid, ZFS_GROUP);
- }
- } else {
- acl_ids->z_fgid = zfs_fuid_create_cred(zfsvfs,
- ZFS_GROUP, cr, &acl_ids->z_fuidp);
- gid = crgetgid(cr);
- }
- }
- }
-#endif /* HAVE_KSID */
-
- /*
- * If we're creating a directory, and the parent directory has the
- * set-GID bit set, set in on the new directory.
- * Otherwise, if the user is neither privileged nor a member of the
- * file's new group, clear the file's set-GID bit.
- */
-
- if (!(flag & IS_ROOT_NODE) && (dzp->z_mode & S_ISGID) &&
- (S_ISDIR(vap->va_mode))) {
- acl_ids->z_mode |= S_ISGID;
- } else {
- if ((acl_ids->z_mode & S_ISGID) &&
- secpolicy_vnode_setids_setgids(cr, gid) != 0)
- acl_ids->z_mode &= ~S_ISGID;
- }
-
- if (acl_ids->z_aclp == NULL) {
- mutex_enter(&dzp->z_acl_lock);
- mutex_enter(&dzp->z_lock);
- if (!(flag & IS_ROOT_NODE) && (S_ISDIR(ZTOI(dzp)->i_mode) &&
- (dzp->z_pflags & ZFS_INHERIT_ACE)) &&
- !(dzp->z_pflags & ZFS_XATTR)) {
- VERIFY(0 == zfs_acl_node_read(dzp, B_TRUE,
- &paclp, B_FALSE));
- acl_ids->z_aclp = zfs_acl_inherit(zfsvfs,
- vap->va_mode, paclp, acl_ids->z_mode, &need_chmod);
- inherited = B_TRUE;
- } else {
- acl_ids->z_aclp =
- zfs_acl_alloc(zfs_acl_version_zp(dzp));
- acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL;
- }
- mutex_exit(&dzp->z_lock);
- mutex_exit(&dzp->z_acl_lock);
- if (need_chmod) {
- acl_ids->z_aclp->z_hints |= S_ISDIR(vap->va_mode) ?
- ZFS_ACL_AUTO_INHERIT : 0;
- zfs_acl_chmod(zfsvfs, acl_ids->z_mode, acl_ids->z_aclp);
- }
- }
-
- if (inherited || vsecp) {
- acl_ids->z_mode = zfs_mode_compute(acl_ids->z_mode,
- acl_ids->z_aclp, &acl_ids->z_aclp->z_hints,
- acl_ids->z_fuid, acl_ids->z_fgid);
- if (ace_trivial_common(acl_ids->z_aclp, 0, zfs_ace_walk) == 0)
- acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL;
- }
-
- return (0);
-}
-
-/*
- * Free ACL and fuid_infop, but not the acl_ids structure
- */
-void
-zfs_acl_ids_free(zfs_acl_ids_t *acl_ids)
-{
- if (acl_ids->z_aclp)
- zfs_acl_free(acl_ids->z_aclp);
- if (acl_ids->z_fuidp)
- zfs_fuid_info_free(acl_ids->z_fuidp);
- acl_ids->z_aclp = NULL;
- acl_ids->z_fuidp = NULL;
-}
-
-boolean_t
-zfs_acl_ids_overquota(zfsvfs_t *zv, zfs_acl_ids_t *acl_ids, uint64_t projid)
-{
- return (zfs_id_overquota(zv, DMU_USERUSED_OBJECT, acl_ids->z_fuid) ||
- zfs_id_overquota(zv, DMU_GROUPUSED_OBJECT, acl_ids->z_fgid) ||
- (projid != ZFS_DEFAULT_PROJID && projid != ZFS_INVALID_PROJID &&
- zfs_id_overquota(zv, DMU_PROJECTUSED_OBJECT, projid)));
-}
-
-/*
- * Retrieve a file's ACL
- */
-int
-zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
-{
- zfs_acl_t *aclp;
- ulong_t mask;
- int error;
- int count = 0;
- int largeace = 0;
-
- mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT |
- VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES);
-
- if (mask == 0)
- return (SET_ERROR(ENOSYS));
-
- if ((error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr)))
- return (error);
-
- mutex_enter(&zp->z_acl_lock);
-
- error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE);
- if (error != 0) {
- mutex_exit(&zp->z_acl_lock);
- return (error);
- }
-
- /*
- * Scan ACL to determine number of ACEs
- */
- if ((zp->z_pflags & ZFS_ACL_OBJ_ACE) && !(mask & VSA_ACE_ALLTYPES)) {
- void *zacep = NULL;
- uint64_t who;
- uint32_t access_mask;
- uint16_t type, iflags;
-
- while ((zacep = zfs_acl_next_ace(aclp, zacep,
- &who, &access_mask, &iflags, &type))) {
- switch (type) {
- case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
- case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
- case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
- largeace++;
- continue;
- default:
- count++;
- }
- }
- vsecp->vsa_aclcnt = count;
- } else
- count = (int)aclp->z_acl_count;
-
- if (mask & VSA_ACECNT) {
- vsecp->vsa_aclcnt = count;
- }
-
- if (mask & VSA_ACE) {
- size_t aclsz;
-
- aclsz = count * sizeof (ace_t) +
- sizeof (ace_object_t) * largeace;
-
- vsecp->vsa_aclentp = kmem_alloc(aclsz, KM_SLEEP);
- vsecp->vsa_aclentsz = aclsz;
-
- if (aclp->z_version == ZFS_ACL_VERSION_FUID)
- zfs_copy_fuid_2_ace(ZTOZSB(zp), aclp, cr,
- vsecp->vsa_aclentp, !(mask & VSA_ACE_ALLTYPES));
- else {
- zfs_acl_node_t *aclnode;
- void *start = vsecp->vsa_aclentp;
-
- for (aclnode = list_head(&aclp->z_acl); aclnode;
- aclnode = list_next(&aclp->z_acl, aclnode)) {
- bcopy(aclnode->z_acldata, start,
- aclnode->z_size);
- start = (caddr_t)start + aclnode->z_size;
- }
- ASSERT((caddr_t)start - (caddr_t)vsecp->vsa_aclentp ==
- aclp->z_acl_bytes);
- }
- }
- if (mask & VSA_ACE_ACLFLAGS) {
- vsecp->vsa_aclflags = 0;
- if (zp->z_pflags & ZFS_ACL_DEFAULTED)
- vsecp->vsa_aclflags |= ACL_DEFAULTED;
- if (zp->z_pflags & ZFS_ACL_PROTECTED)
- vsecp->vsa_aclflags |= ACL_PROTECTED;
- if (zp->z_pflags & ZFS_ACL_AUTO_INHERIT)
- vsecp->vsa_aclflags |= ACL_AUTO_INHERIT;
- }
-
- mutex_exit(&zp->z_acl_lock);
-
- return (0);
-}
-
-int
-zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, umode_t obj_mode,
- vsecattr_t *vsecp, cred_t *cr, zfs_fuid_info_t **fuidp, zfs_acl_t **zaclp)
-{
- zfs_acl_t *aclp;
- zfs_acl_node_t *aclnode;
- int aclcnt = vsecp->vsa_aclcnt;
- int error;
-
- if (vsecp->vsa_aclcnt > MAX_ACL_ENTRIES || vsecp->vsa_aclcnt <= 0)
- return (SET_ERROR(EINVAL));
-
- aclp = zfs_acl_alloc(zfs_acl_version(zfsvfs->z_version));
-
- aclp->z_hints = 0;
- aclnode = zfs_acl_node_alloc(aclcnt * sizeof (zfs_object_ace_t));
- if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) {
- if ((error = zfs_copy_ace_2_oldace(obj_mode, aclp,
- (ace_t *)vsecp->vsa_aclentp, aclnode->z_acldata,
- aclcnt, &aclnode->z_size)) != 0) {
- zfs_acl_free(aclp);
- zfs_acl_node_free(aclnode);
- return (error);
- }
- } else {
- if ((error = zfs_copy_ace_2_fuid(zfsvfs, obj_mode, aclp,
- vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt,
- &aclnode->z_size, fuidp, cr)) != 0) {
- zfs_acl_free(aclp);
- zfs_acl_node_free(aclnode);
- return (error);
- }
- }
- aclp->z_acl_bytes = aclnode->z_size;
- aclnode->z_ace_count = aclcnt;
- aclp->z_acl_count = aclcnt;
- list_insert_head(&aclp->z_acl, aclnode);
-
- /*
- * If flags are being set then add them to z_hints
- */
- if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) {
- if (vsecp->vsa_aclflags & ACL_PROTECTED)
- aclp->z_hints |= ZFS_ACL_PROTECTED;
- if (vsecp->vsa_aclflags & ACL_DEFAULTED)
- aclp->z_hints |= ZFS_ACL_DEFAULTED;
- if (vsecp->vsa_aclflags & ACL_AUTO_INHERIT)
- aclp->z_hints |= ZFS_ACL_AUTO_INHERIT;
- }
-
- *zaclp = aclp;
-
- return (0);
-}
-
-/*
- * Set a file's ACL
- */
-int
-zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
-{
- zfsvfs_t *zfsvfs = ZTOZSB(zp);
- zilog_t *zilog = zfsvfs->z_log;
- ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT);
- dmu_tx_t *tx;
- int error;
- zfs_acl_t *aclp;
- zfs_fuid_info_t *fuidp = NULL;
- boolean_t fuid_dirtied;
- uint64_t acl_obj;
-
- if (mask == 0)
- return (SET_ERROR(ENOSYS));
-
- if (zp->z_pflags & ZFS_IMMUTABLE)
- return (SET_ERROR(EPERM));
-
- if ((error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)))
- return (error);
-
- error = zfs_vsec_2_aclp(zfsvfs, ZTOI(zp)->i_mode, vsecp, cr, &fuidp,
- &aclp);
- if (error)
- return (error);
-
- /*
- * If ACL wide flags aren't being set then preserve any
- * existing flags.
- */
- if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) {
- aclp->z_hints |=
- (zp->z_pflags & V4_ACL_WIDE_FLAGS);
- }
-top:
- mutex_enter(&zp->z_acl_lock);
- mutex_enter(&zp->z_lock);
-
- tx = dmu_tx_create(zfsvfs->z_os);
-
- dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
-
- fuid_dirtied = zfsvfs->z_fuid_dirty;
- if (fuid_dirtied)
- zfs_fuid_txhold(zfsvfs, tx);
-
- /*
- * If old version and ACL won't fit in bonus and we aren't
- * upgrading then take out necessary DMU holds
- */
-
- if ((acl_obj = zfs_external_acl(zp)) != 0) {
- if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
- zfs_znode_acl_version(zp) <= ZFS_ACL_VERSION_INITIAL) {
- dmu_tx_hold_free(tx, acl_obj, 0,
- DMU_OBJECT_END);
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
- aclp->z_acl_bytes);
- } else {
- dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes);
- }
- } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes);
- }
-
- zfs_sa_upgrade_txholds(tx, zp);
- error = dmu_tx_assign(tx, TXG_NOWAIT);
- if (error) {
- mutex_exit(&zp->z_acl_lock);
- mutex_exit(&zp->z_lock);
-
- if (error == ERESTART) {
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
- dmu_tx_abort(tx);
- zfs_acl_free(aclp);
- return (error);
- }
-
- error = zfs_aclset_common(zp, aclp, cr, tx);
- ASSERT(error == 0);
- ASSERT(zp->z_acl_cached == NULL);
- zp->z_acl_cached = aclp;
-
- if (fuid_dirtied)
- zfs_fuid_sync(zfsvfs, tx);
-
- zfs_log_acl(zilog, tx, zp, vsecp, fuidp);
-
- if (fuidp)
- zfs_fuid_info_free(fuidp);
- dmu_tx_commit(tx);
-
- mutex_exit(&zp->z_lock);
- mutex_exit(&zp->z_acl_lock);
-
- return (error);
-}
-
-/*
- * Check accesses of interest (AoI) against attributes of the dataset
- * such as read-only. Returns zero if no AoI conflict with dataset
- * attributes, otherwise an appropriate errno is returned.
- */
-static int
-zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode)
-{
- if ((v4_mode & WRITE_MASK) && (zfs_is_readonly(ZTOZSB(zp))) &&
- (!S_ISDEV(ZTOI(zp)->i_mode) ||
- (S_ISDEV(ZTOI(zp)->i_mode) && (v4_mode & WRITE_MASK_ATTRS)))) {
- return (SET_ERROR(EROFS));
- }
-
- /*
- * Only check for READONLY on non-directories.
- */
- if ((v4_mode & WRITE_MASK_DATA) &&
- ((!S_ISDIR(ZTOI(zp)->i_mode) &&
- (zp->z_pflags & (ZFS_READONLY | ZFS_IMMUTABLE))) ||
- (S_ISDIR(ZTOI(zp)->i_mode) &&
- (zp->z_pflags & ZFS_IMMUTABLE)))) {
- return (SET_ERROR(EPERM));
- }
-
- if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) &&
- (zp->z_pflags & ZFS_NOUNLINK)) {
- return (SET_ERROR(EPERM));
- }
-
- if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) &&
- (zp->z_pflags & ZFS_AV_QUARANTINED))) {
- return (SET_ERROR(EACCES));
- }
-
- return (0);
-}
-
-/*
- * The primary usage of this function is to loop through all of the
- * ACEs in the znode, determining what accesses of interest (AoI) to
- * the caller are allowed or denied. The AoI are expressed as bits in
- * the working_mode parameter. As each ACE is processed, bits covered
- * by that ACE are removed from the working_mode. This removal
- * facilitates two things. The first is that when the working mode is
- * empty (= 0), we know we've looked at all the AoI. The second is
- * that the ACE interpretation rules don't allow a later ACE to undo
- * something granted or denied by an earlier ACE. Removing the
- * discovered access or denial enforces this rule. At the end of
- * processing the ACEs, all AoI that were found to be denied are
- * placed into the working_mode, giving the caller a mask of denied
- * accesses. Returns:
- * 0 if all AoI granted
- * EACCES if the denied mask is non-zero
- * other error if abnormal failure (e.g., IO error)
- *
- * A secondary usage of the function is to determine if any of the
- * AoI are granted. If an ACE grants any access in
- * the working_mode, we immediately short circuit out of the function.
- * This mode is chosen by setting anyaccess to B_TRUE. The
- * working_mode is not a denied access mask upon exit if the function
- * is used in this manner.
- */
-static int
-zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode,
- boolean_t anyaccess, cred_t *cr)
-{
- zfsvfs_t *zfsvfs = ZTOZSB(zp);
- zfs_acl_t *aclp;
- int error;
- uid_t uid = crgetuid(cr);
- uint64_t who;
- uint16_t type, iflags;
- uint16_t entry_type;
- uint32_t access_mask;
- uint32_t deny_mask = 0;
- zfs_ace_hdr_t *acep = NULL;
- boolean_t checkit;
- uid_t gowner;
- uid_t fowner;
-
- zfs_fuid_map_ids(zp, cr, &fowner, &gowner);
-
- mutex_enter(&zp->z_acl_lock);
-
- error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE);
- if (error != 0) {
- mutex_exit(&zp->z_acl_lock);
- return (error);
- }
-
- ASSERT(zp->z_acl_cached);
-
- while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask,
- &iflags, &type))) {
- uint32_t mask_matched;
-
- if (!zfs_acl_valid_ace_type(type, iflags))
- continue;
-
- if (S_ISDIR(ZTOI(zp)->i_mode) &&
- (iflags & ACE_INHERIT_ONLY_ACE))
- continue;
-
- /* Skip ACE if it does not affect any AoI */
- mask_matched = (access_mask & *working_mode);
- if (!mask_matched)
- continue;
-
- entry_type = (iflags & ACE_TYPE_FLAGS);
-
- checkit = B_FALSE;
-
- switch (entry_type) {
- case ACE_OWNER:
- if (uid == fowner)
- checkit = B_TRUE;
- break;
- case OWNING_GROUP:
- who = gowner;
- /*FALLTHROUGH*/
- case ACE_IDENTIFIER_GROUP:
- checkit = zfs_groupmember(zfsvfs, who, cr);
- break;
- case ACE_EVERYONE:
- checkit = B_TRUE;
- break;
-
- /* USER Entry */
- default:
- if (entry_type == 0) {
- uid_t newid;
-
- newid = zfs_fuid_map_id(zfsvfs, who, cr,
- ZFS_ACE_USER);
- if (newid != IDMAP_WK_CREATOR_OWNER_UID &&
- uid == newid)
- checkit = B_TRUE;
- break;
- } else {
- mutex_exit(&zp->z_acl_lock);
- return (SET_ERROR(EIO));
- }
- }
-
- if (checkit) {
- if (type == DENY) {
- DTRACE_PROBE3(zfs__ace__denies,
- znode_t *, zp,
- zfs_ace_hdr_t *, acep,
- uint32_t, mask_matched);
- deny_mask |= mask_matched;
- } else {
- DTRACE_PROBE3(zfs__ace__allows,
- znode_t *, zp,
- zfs_ace_hdr_t *, acep,
- uint32_t, mask_matched);
- if (anyaccess) {
- mutex_exit(&zp->z_acl_lock);
- return (0);
- }
- }
- *working_mode &= ~mask_matched;
- }
-
- /* Are we done? */
- if (*working_mode == 0)
- break;
- }
-
- mutex_exit(&zp->z_acl_lock);
-
- /* Put the found 'denies' back on the working mode */
- if (deny_mask) {
- *working_mode |= deny_mask;
- return (SET_ERROR(EACCES));
- } else if (*working_mode) {
- return (-1);
- }
-
- return (0);
-}
-
-/*
- * Return true if any access whatsoever granted, we don't actually
- * care what access is granted.
- */
-boolean_t
-zfs_has_access(znode_t *zp, cred_t *cr)
-{
- uint32_t have = ACE_ALL_PERMS;
-
- if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) {
- uid_t owner;
-
- owner = zfs_fuid_map_id(ZTOZSB(zp),
- KUID_TO_SUID(ZTOI(zp)->i_uid), cr, ZFS_OWNER);
- return (secpolicy_vnode_any_access(cr, ZTOI(zp), owner) == 0);
- }
- return (B_TRUE);
-}
-
-static int
-zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode,
- boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr)
-{
- zfsvfs_t *zfsvfs = ZTOZSB(zp);
- int err;
-
- *working_mode = v4_mode;
- *check_privs = B_TRUE;
-
- /*
- * Short circuit empty requests
- */
- if (v4_mode == 0 || zfsvfs->z_replay) {
- *working_mode = 0;
- return (0);
- }
-
- if ((err = zfs_zaccess_dataset_check(zp, v4_mode)) != 0) {
- *check_privs = B_FALSE;
- return (err);
- }
-
- /*
- * The caller requested that the ACL check be skipped. This
- * would only happen if the caller checked VOP_ACCESS() with a
- * 32 bit ACE mask and already had the appropriate permissions.
- */
- if (skipaclchk) {
- *working_mode = 0;
- return (0);
- }
-
- return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr));
-}
-
-static int
-zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs,
- cred_t *cr)
-{
- if (*working_mode != ACE_WRITE_DATA)
- return (SET_ERROR(EACCES));
-
- return (zfs_zaccess_common(zp, ACE_APPEND_DATA, working_mode,
- check_privs, B_FALSE, cr));
-}
-
-int
-zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr)
-{
- boolean_t owner = B_FALSE;
- boolean_t groupmbr = B_FALSE;
- boolean_t is_attr;
- uid_t uid = crgetuid(cr);
- int error;
-
- if (zdp->z_pflags & ZFS_AV_QUARANTINED)
- return (SET_ERROR(EACCES));
-
- is_attr = ((zdp->z_pflags & ZFS_XATTR) &&
- (S_ISDIR(ZTOI(zdp)->i_mode)));
- if (is_attr)
- goto slow;
-
-
- mutex_enter(&zdp->z_acl_lock);
-
- if (zdp->z_pflags & ZFS_NO_EXECS_DENIED) {
- mutex_exit(&zdp->z_acl_lock);
- return (0);
- }
-
- if (KUID_TO_SUID(ZTOI(zdp)->i_uid) != 0 ||
- KGID_TO_SGID(ZTOI(zdp)->i_gid) != 0) {
- mutex_exit(&zdp->z_acl_lock);
- goto slow;
- }
-
- if (uid == KUID_TO_SUID(ZTOI(zdp)->i_uid)) {
- owner = B_TRUE;
- if (zdp->z_mode & S_IXUSR) {
- mutex_exit(&zdp->z_acl_lock);
- return (0);
- } else {
- mutex_exit(&zdp->z_acl_lock);
- goto slow;
- }
- }
- if (groupmember(KGID_TO_SGID(ZTOI(zdp)->i_gid), cr)) {
- groupmbr = B_TRUE;
- if (zdp->z_mode & S_IXGRP) {
- mutex_exit(&zdp->z_acl_lock);
- return (0);
- } else {
- mutex_exit(&zdp->z_acl_lock);
- goto slow;
- }
- }
- if (!owner && !groupmbr) {
- if (zdp->z_mode & S_IXOTH) {
- mutex_exit(&zdp->z_acl_lock);
- return (0);
- }
- }
-
- mutex_exit(&zdp->z_acl_lock);
-
-slow:
- DTRACE_PROBE(zfs__fastpath__execute__access__miss);
- ZFS_ENTER(ZTOZSB(zdp));
- error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr);
- ZFS_EXIT(ZTOZSB(zdp));
- return (error);
-}
-
-/*
- * Determine whether Access should be granted/denied.
- *
- * The least priv subsystem is always consulted as a basic privilege
- * can define any form of access.
- */
-int
-zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
-{
- uint32_t working_mode;
- int error;
- int is_attr;
- boolean_t check_privs;
- znode_t *xzp;
- znode_t *check_zp = zp;
- mode_t needed_bits;
- uid_t owner;
-
- is_attr = ((zp->z_pflags & ZFS_XATTR) && S_ISDIR(ZTOI(zp)->i_mode));
-
- /*
- * If attribute then validate against base file
- */
- if (is_attr) {
- if ((error = zfs_zget(ZTOZSB(zp),
- zp->z_xattr_parent, &xzp)) != 0) {
- return (error);
- }
-
- check_zp = xzp;
-
- /*
- * fixup mode to map to xattr perms
- */
-
- if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) {
- mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
- mode |= ACE_WRITE_NAMED_ATTRS;
- }
-
- if (mode & (ACE_READ_DATA|ACE_EXECUTE)) {
- mode &= ~(ACE_READ_DATA|ACE_EXECUTE);
- mode |= ACE_READ_NAMED_ATTRS;
- }
- }
-
- owner = zfs_fuid_map_id(ZTOZSB(zp), KUID_TO_SUID(ZTOI(zp)->i_uid),
- cr, ZFS_OWNER);
- /*
- * Map the bits required to the standard inode flags
- * S_IRUSR|S_IWUSR|S_IXUSR in the needed_bits. Map the bits
- * mapped by working_mode (currently missing) in missing_bits.
- * Call secpolicy_vnode_access2() with (needed_bits & ~checkmode),
- * needed_bits.
- */
- needed_bits = 0;
-
- working_mode = mode;
- if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) &&
- owner == crgetuid(cr))
- working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES);
-
- if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS|
- ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE))
- needed_bits |= S_IRUSR;
- if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS|
- ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE))
- needed_bits |= S_IWUSR;
- if (working_mode & ACE_EXECUTE)
- needed_bits |= S_IXUSR;
-
- if ((error = zfs_zaccess_common(check_zp, mode, &working_mode,
- &check_privs, skipaclchk, cr)) == 0) {
- if (is_attr)
- iput(ZTOI(xzp));
- return (secpolicy_vnode_access2(cr, ZTOI(zp), owner,
- needed_bits, needed_bits));
- }
-
- if (error && !check_privs) {
- if (is_attr)
- iput(ZTOI(xzp));
- return (error);
- }
-
- if (error && (flags & V_APPEND)) {
- error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr);
- }
-
- if (error && check_privs) {
- mode_t checkmode = 0;
-
- /*
- * First check for implicit owner permission on
- * read_acl/read_attributes
- */
-
- error = 0;
- ASSERT(working_mode != 0);
-
- if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) &&
- owner == crgetuid(cr)))
- working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES);
-
- if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS|
- ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE))
- checkmode |= S_IRUSR;
- if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS|
- ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE))
- checkmode |= S_IWUSR;
- if (working_mode & ACE_EXECUTE)
- checkmode |= S_IXUSR;
-
- error = secpolicy_vnode_access2(cr, ZTOI(check_zp), owner,
- needed_bits & ~checkmode, needed_bits);
-
- if (error == 0 && (working_mode & ACE_WRITE_OWNER))
- error = secpolicy_vnode_chown(cr, owner);
- if (error == 0 && (working_mode & ACE_WRITE_ACL))
- error = secpolicy_vnode_setdac(cr, owner);
-
- if (error == 0 && (working_mode &
- (ACE_DELETE|ACE_DELETE_CHILD)))
- error = secpolicy_vnode_remove(cr);
-
- if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) {
- error = secpolicy_vnode_chown(cr, owner);
- }
- if (error == 0) {
- /*
- * See if any bits other than those already checked
- * for are still present. If so then return EACCES
- */
- if (working_mode & ~(ZFS_CHECKED_MASKS)) {
- error = SET_ERROR(EACCES);
- }
- }
- } else if (error == 0) {
- error = secpolicy_vnode_access2(cr, ZTOI(zp), owner,
- needed_bits, needed_bits);
- }
-
- if (is_attr)
- iput(ZTOI(xzp));
-
- return (error);
-}
-
-/*
- * Translate traditional unix S_IRUSR/S_IWUSR/S_IXUSR mode into
- * native ACL format and call zfs_zaccess()
- */
-int
-zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr)
-{
- return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr));
-}
-
-/*
- * Access function for secpolicy_vnode_setattr
- */
-int
-zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr)
-{
- int v4_mode = zfs_unix_to_v4(mode >> 6);
-
- return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr));
-}
-
-static int
-zfs_delete_final_check(znode_t *zp, znode_t *dzp,
- mode_t available_perms, cred_t *cr)
-{
- int error;
- uid_t downer;
-
- downer = zfs_fuid_map_id(ZTOZSB(dzp), KUID_TO_SUID(ZTOI(dzp)->i_uid),
- cr, ZFS_OWNER);
-
- error = secpolicy_vnode_access2(cr, ZTOI(dzp),
- downer, available_perms, S_IWUSR|S_IXUSR);
-
- if (error == 0)
- error = zfs_sticky_remove_access(dzp, zp, cr);
-
- return (error);
-}
-
-/*
- * Determine whether Access should be granted/deny, without
- * consulting least priv subsystem.
- *
- * The following chart is the recommended NFSv4 enforcement for
- * ability to delete an object.
- *
- * -------------------------------------------------------
- * | Parent Dir | Target Object Permissions |
- * | permissions | |
- * -------------------------------------------------------
- * | | ACL Allows | ACL Denies| Delete |
- * | | Delete | Delete | unspecified|
- * -------------------------------------------------------
- * | ACL Allows | Permit | Permit | Permit |
- * | DELETE_CHILD | |
- * -------------------------------------------------------
- * | ACL Denies | Permit | Deny | Deny |
- * | DELETE_CHILD | | | |
- * -------------------------------------------------------
- * | ACL specifies | | | |
- * | only allow | Permit | Permit | Permit |
- * | write and | | | |
- * | execute | | | |
- * -------------------------------------------------------
- * | ACL denies | | | |
- * | write and | Permit | Deny | Deny |
- * | execute | | | |
- * -------------------------------------------------------
- * ^
- * |
- * No search privilege, can't even look up file?
- *
- */
-int
-zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr)
-{
- uint32_t dzp_working_mode = 0;
- uint32_t zp_working_mode = 0;
- int dzp_error, zp_error;
- mode_t available_perms;
- boolean_t dzpcheck_privs = B_TRUE;
- boolean_t zpcheck_privs = B_TRUE;
-
- /*
- * We want specific DELETE permissions to
- * take precedence over WRITE/EXECUTE. We don't
- * want an ACL such as this to mess us up.
- * user:joe:write_data:deny,user:joe:delete:allow
- *
- * However, deny permissions may ultimately be overridden
- * by secpolicy_vnode_access().
- *
- * We will ask for all of the necessary permissions and then
- * look at the working modes from the directory and target object
- * to determine what was found.
- */
-
- if (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_NOUNLINK))
- return (SET_ERROR(EPERM));
-
- /*
- * First row
- * If the directory permissions allow the delete, we are done.
- */
- if ((dzp_error = zfs_zaccess_common(dzp, ACE_DELETE_CHILD,
- &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr)) == 0)
- return (0);
-
- /*
- * If target object has delete permission then we are done
- */
- if ((zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode,
- &zpcheck_privs, B_FALSE, cr)) == 0)
- return (0);
-
- ASSERT(dzp_error && zp_error);
-
- if (!dzpcheck_privs)
- return (dzp_error);
- if (!zpcheck_privs)
- return (zp_error);
-
- /*
- * Second row
- *
- * If directory returns EACCES then delete_child was denied
- * due to deny delete_child. In this case send the request through
- * secpolicy_vnode_remove(). We don't use zfs_delete_final_check()
- * since that *could* allow the delete based on write/execute permission
- * and we want delete permissions to override write/execute.
- */
-
- if (dzp_error == EACCES)
- return (secpolicy_vnode_remove(cr));
-
- /*
- * Third Row
- * only need to see if we have write/execute on directory.
- */
-
- dzp_error = zfs_zaccess_common(dzp, ACE_EXECUTE|ACE_WRITE_DATA,
- &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr);
-
- if (dzp_error != 0 && !dzpcheck_privs)
- return (dzp_error);
-
- /*
- * Fourth row
- */
-
- available_perms = (dzp_working_mode & ACE_WRITE_DATA) ? 0 : S_IWUSR;
- available_perms |= (dzp_working_mode & ACE_EXECUTE) ? 0 : S_IXUSR;
-
- return (zfs_delete_final_check(zp, dzp, available_perms, cr));
-
-}
-
-int
-zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp,
- znode_t *tzp, cred_t *cr)
-{
- int add_perm;
- int error;
-
- if (szp->z_pflags & ZFS_AV_QUARANTINED)
- return (SET_ERROR(EACCES));
-
- add_perm = S_ISDIR(ZTOI(szp)->i_mode) ?
- ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE;
-
- /*
- * Rename permissions are combination of delete permission +
- * add file/subdir permission.
- */
-
- /*
- * first make sure we do the delete portion.
- *
- * If that succeeds then check for add_file/add_subdir permissions
- */
-
- if ((error = zfs_zaccess_delete(sdzp, szp, cr)))
- return (error);
-
- /*
- * If we have a tzp, see if we can delete it?
- */
- if (tzp) {
- if ((error = zfs_zaccess_delete(tdzp, tzp, cr)))
- return (error);
- }
-
- /*
- * Now check for add permissions
- */
- error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr);
-
- return (error);
-}
diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c
deleted file mode 100644
index 1e61ef06d..000000000
--- a/module/zfs/zfs_ctldir.c
+++ /dev/null
@@ -1,1240 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- *
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (C) 2011 Lawrence Livermore National Security, LLC.
- * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
- * LLNL-CODE-403049.
- * Rewritten for Linux by:
- * Rohan Puri <[email protected]>
- * Brian Behlendorf <[email protected]>
- * Copyright (c) 2013 by Delphix. All rights reserved.
- * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
- * Copyright (c) 2018 George Melikov. All Rights Reserved.
- * Copyright (c) 2019 Datto, Inc. All rights reserved.
- */
-
-/*
- * ZFS control directory (a.k.a. ".zfs")
- *
- * This directory provides a common location for all ZFS meta-objects.
- * Currently, this is only the 'snapshot' and 'shares' directory, but this may
- * expand in the future. The elements are built dynamically, as the hierarchy
- * does not actually exist on disk.
- *
- * For 'snapshot', we don't want to have all snapshots always mounted, because
- * this would take up a huge amount of space in /etc/mnttab. We have three
- * types of objects:
- *
- * ctldir ------> snapshotdir -------> snapshot
- * |
- * |
- * V
- * mounted fs
- *
- * The 'snapshot' node contains just enough information to lookup '..' and act
- * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we
- * perform an automount of the underlying filesystem and return the
- * corresponding inode.
- *
- * All mounts are handled automatically by an user mode helper which invokes
- * the mount procedure. Unmounts are handled by allowing the mount
- * point to expire so the kernel may automatically unmount it.
- *
- * The '.zfs', '.zfs/snapshot', and all directories created under
- * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') all share the same
- * zfsvfs_t as the head filesystem (what '.zfs' lives under).
- *
- * File systems mounted on top of the '.zfs/snapshot/<snapname>' paths
- * (ie: snapshots) are complete ZFS filesystems and have their own unique
- * zfsvfs_t. However, the fsid reported by these mounts will be the same
- * as that used by the parent zfsvfs_t to make NFS happy.
- */
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/time.h>
-#include <sys/sysmacros.h>
-#include <sys/pathname.h>
-#include <sys/vfs.h>
-#include <sys/zfs_ctldir.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/zfs_vfsops.h>
-#include <sys/zfs_vnops.h>
-#include <sys/stat.h>
-#include <sys/dmu.h>
-#include <sys/dmu_objset.h>
-#include <sys/dsl_destroy.h>
-#include <sys/dsl_deleg.h>
-#include <sys/zpl.h>
-#include <sys/mntent.h>
-#include "zfs_namecheck.h"
-
-/*
- * Two AVL trees are maintained which contain all currently automounted
- * snapshots. Every automounted snapshots maps to a single zfs_snapentry_t
- * entry which MUST:
- *
- * - be attached to both trees, and
- * - be unique, no duplicate entries are allowed.
- *
- * The zfs_snapshots_by_name tree is indexed by the full dataset name
- * while the zfs_snapshots_by_objsetid tree is indexed by the unique
- * objsetid. This allows for fast lookups either by name or objsetid.
- */
-static avl_tree_t zfs_snapshots_by_name;
-static avl_tree_t zfs_snapshots_by_objsetid;
-static krwlock_t zfs_snapshot_lock;
-
-/*
- * Control Directory Tunables (.zfs)
- */
-int zfs_expire_snapshot = ZFSCTL_EXPIRE_SNAPSHOT;
-int zfs_admin_snapshot = 0;
-
-typedef struct {
- char *se_name; /* full snapshot name */
- char *se_path; /* full mount path */
- spa_t *se_spa; /* pool spa */
- uint64_t se_objsetid; /* snapshot objset id */
- struct dentry *se_root_dentry; /* snapshot root dentry */
- taskqid_t se_taskqid; /* scheduled unmount taskqid */
- avl_node_t se_node_name; /* zfs_snapshots_by_name link */
- avl_node_t se_node_objsetid; /* zfs_snapshots_by_objsetid link */
- zfs_refcount_t se_refcount; /* reference count */
-} zfs_snapentry_t;
-
-static void zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay);
-
-/*
- * Allocate a new zfs_snapentry_t being careful to make a copy of the
- * the snapshot name and provided mount point. No reference is taken.
- */
-static zfs_snapentry_t *
-zfsctl_snapshot_alloc(char *full_name, char *full_path, spa_t *spa,
- uint64_t objsetid, struct dentry *root_dentry)
-{
- zfs_snapentry_t *se;
-
- se = kmem_zalloc(sizeof (zfs_snapentry_t), KM_SLEEP);
-
- se->se_name = strdup(full_name);
- se->se_path = strdup(full_path);
- se->se_spa = spa;
- se->se_objsetid = objsetid;
- se->se_root_dentry = root_dentry;
- se->se_taskqid = TASKQID_INVALID;
-
- zfs_refcount_create(&se->se_refcount);
-
- return (se);
-}
-
-/*
- * Free a zfs_snapentry_t the caller must ensure there are no active
- * references.
- */
-static void
-zfsctl_snapshot_free(zfs_snapentry_t *se)
-{
- zfs_refcount_destroy(&se->se_refcount);
- strfree(se->se_name);
- strfree(se->se_path);
-
- kmem_free(se, sizeof (zfs_snapentry_t));
-}
-
-/*
- * Hold a reference on the zfs_snapentry_t.
- */
-static void
-zfsctl_snapshot_hold(zfs_snapentry_t *se)
-{
- zfs_refcount_add(&se->se_refcount, NULL);
-}
-
-/*
- * Release a reference on the zfs_snapentry_t. When the number of
- * references drops to zero the structure will be freed.
- */
-static void
-zfsctl_snapshot_rele(zfs_snapentry_t *se)
-{
- if (zfs_refcount_remove(&se->se_refcount, NULL) == 0)
- zfsctl_snapshot_free(se);
-}
-
-/*
- * Add a zfs_snapentry_t to both the zfs_snapshots_by_name and
- * zfs_snapshots_by_objsetid trees. While the zfs_snapentry_t is part
- * of the trees a reference is held.
- */
-static void
-zfsctl_snapshot_add(zfs_snapentry_t *se)
-{
- ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
- zfsctl_snapshot_hold(se);
- avl_add(&zfs_snapshots_by_name, se);
- avl_add(&zfs_snapshots_by_objsetid, se);
-}
-
-/*
- * Remove a zfs_snapentry_t from both the zfs_snapshots_by_name and
- * zfs_snapshots_by_objsetid trees. Upon removal a reference is dropped,
- * this can result in the structure being freed if that was the last
- * remaining reference.
- */
-static void
-zfsctl_snapshot_remove(zfs_snapentry_t *se)
-{
- ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
- avl_remove(&zfs_snapshots_by_name, se);
- avl_remove(&zfs_snapshots_by_objsetid, se);
- zfsctl_snapshot_rele(se);
-}
-
-/*
- * Snapshot name comparison function for the zfs_snapshots_by_name.
- */
-static int
-snapentry_compare_by_name(const void *a, const void *b)
-{
- const zfs_snapentry_t *se_a = a;
- const zfs_snapentry_t *se_b = b;
- int ret;
-
- ret = strcmp(se_a->se_name, se_b->se_name);
-
- if (ret < 0)
- return (-1);
- else if (ret > 0)
- return (1);
- else
- return (0);
-}
-
-/*
- * Snapshot name comparison function for the zfs_snapshots_by_objsetid.
- */
-static int
-snapentry_compare_by_objsetid(const void *a, const void *b)
-{
- const zfs_snapentry_t *se_a = a;
- const zfs_snapentry_t *se_b = b;
-
- if (se_a->se_spa != se_b->se_spa)
- return ((ulong_t)se_a->se_spa < (ulong_t)se_b->se_spa ? -1 : 1);
-
- if (se_a->se_objsetid < se_b->se_objsetid)
- return (-1);
- else if (se_a->se_objsetid > se_b->se_objsetid)
- return (1);
- else
- return (0);
-}
-
-/*
- * Find a zfs_snapentry_t in zfs_snapshots_by_name. If the snapname
- * is found a pointer to the zfs_snapentry_t is returned and a reference
- * taken on the structure. The caller is responsible for dropping the
- * reference with zfsctl_snapshot_rele(). If the snapname is not found
- * NULL will be returned.
- */
-static zfs_snapentry_t *
-zfsctl_snapshot_find_by_name(char *snapname)
-{
- zfs_snapentry_t *se, search;
-
- ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock));
-
- search.se_name = snapname;
- se = avl_find(&zfs_snapshots_by_name, &search, NULL);
- if (se)
- zfsctl_snapshot_hold(se);
-
- return (se);
-}
-
-/*
- * Find a zfs_snapentry_t in zfs_snapshots_by_objsetid given the objset id
- * rather than the snapname. In all other respects it behaves the same
- * as zfsctl_snapshot_find_by_name().
- */
-static zfs_snapentry_t *
-zfsctl_snapshot_find_by_objsetid(spa_t *spa, uint64_t objsetid)
-{
- zfs_snapentry_t *se, search;
-
- ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock));
-
- search.se_spa = spa;
- search.se_objsetid = objsetid;
- se = avl_find(&zfs_snapshots_by_objsetid, &search, NULL);
- if (se)
- zfsctl_snapshot_hold(se);
-
- return (se);
-}
-
-/*
- * Rename a zfs_snapentry_t in the zfs_snapshots_by_name. The structure is
- * removed, renamed, and added back to the new correct location in the tree.
- */
-static int
-zfsctl_snapshot_rename(char *old_snapname, char *new_snapname)
-{
- zfs_snapentry_t *se;
-
- ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
-
- se = zfsctl_snapshot_find_by_name(old_snapname);
- if (se == NULL)
- return (SET_ERROR(ENOENT));
-
- zfsctl_snapshot_remove(se);
- strfree(se->se_name);
- se->se_name = strdup(new_snapname);
- zfsctl_snapshot_add(se);
- zfsctl_snapshot_rele(se);
-
- return (0);
-}
-
-/*
- * Delayed task responsible for unmounting an expired automounted snapshot.
- */
-static void
-snapentry_expire(void *data)
-{
- zfs_snapentry_t *se = (zfs_snapentry_t *)data;
- spa_t *spa = se->se_spa;
- uint64_t objsetid = se->se_objsetid;
-
- if (zfs_expire_snapshot <= 0) {
- zfsctl_snapshot_rele(se);
- return;
- }
-
- se->se_taskqid = TASKQID_INVALID;
- (void) zfsctl_snapshot_unmount(se->se_name, MNT_EXPIRE);
- zfsctl_snapshot_rele(se);
-
- /*
- * Reschedule the unmount if the zfs_snapentry_t wasn't removed.
- * This can occur when the snapshot is busy.
- */
- rw_enter(&zfs_snapshot_lock, RW_READER);
- if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) {
- zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot);
- zfsctl_snapshot_rele(se);
- }
- rw_exit(&zfs_snapshot_lock);
-}
-
-/*
- * Cancel an automatic unmount of a snapname. This callback is responsible
- * for dropping the reference on the zfs_snapentry_t which was taken when
- * during dispatch.
- */
-static void
-zfsctl_snapshot_unmount_cancel(zfs_snapentry_t *se)
-{
- if (taskq_cancel_id(system_delay_taskq, se->se_taskqid) == 0) {
- se->se_taskqid = TASKQID_INVALID;
- zfsctl_snapshot_rele(se);
- }
-}
-
-/*
- * Dispatch the unmount task for delayed handling with a hold protecting it.
- */
-static void
-zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay)
-{
- ASSERT3S(se->se_taskqid, ==, TASKQID_INVALID);
-
- if (delay <= 0)
- return;
-
- zfsctl_snapshot_hold(se);
- se->se_taskqid = taskq_dispatch_delay(system_delay_taskq,
- snapentry_expire, se, TQ_SLEEP, ddi_get_lbolt() + delay * HZ);
-}
-
-/*
- * Schedule an automatic unmount of objset id to occur in delay seconds from
- * now. Any previous delayed unmount will be cancelled in favor of the
- * updated deadline. A reference is taken by zfsctl_snapshot_find_by_name()
- * and held until the outstanding task is handled or cancelled.
- */
-int
-zfsctl_snapshot_unmount_delay(spa_t *spa, uint64_t objsetid, int delay)
-{
- zfs_snapentry_t *se;
- int error = ENOENT;
-
- rw_enter(&zfs_snapshot_lock, RW_READER);
- if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) {
- zfsctl_snapshot_unmount_cancel(se);
- zfsctl_snapshot_unmount_delay_impl(se, delay);
- zfsctl_snapshot_rele(se);
- error = 0;
- }
- rw_exit(&zfs_snapshot_lock);
-
- return (error);
-}
-
-/*
- * Check if snapname is currently mounted. Returned non-zero when mounted
- * and zero when unmounted.
- */
-static boolean_t
-zfsctl_snapshot_ismounted(char *snapname)
-{
- zfs_snapentry_t *se;
- boolean_t ismounted = B_FALSE;
-
- rw_enter(&zfs_snapshot_lock, RW_READER);
- if ((se = zfsctl_snapshot_find_by_name(snapname)) != NULL) {
- zfsctl_snapshot_rele(se);
- ismounted = B_TRUE;
- }
- rw_exit(&zfs_snapshot_lock);
-
- return (ismounted);
-}
-
-/*
- * Check if the given inode is a part of the virtual .zfs directory.
- */
-boolean_t
-zfsctl_is_node(struct inode *ip)
-{
- return (ITOZ(ip)->z_is_ctldir);
-}
-
-/*
- * Check if the given inode is a .zfs/snapshots/snapname directory.
- */
-boolean_t
-zfsctl_is_snapdir(struct inode *ip)
-{
- return (zfsctl_is_node(ip) && (ip->i_ino <= ZFSCTL_INO_SNAPDIRS));
-}
-
-/*
- * Allocate a new inode with the passed id and ops.
- */
-static struct inode *
-zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id,
- const struct file_operations *fops, const struct inode_operations *ops)
-{
- inode_timespec_t now;
- struct inode *ip;
- znode_t *zp;
-
- ip = new_inode(zfsvfs->z_sb);
- if (ip == NULL)
- return (NULL);
-
- now = current_time(ip);
- zp = ITOZ(ip);
- ASSERT3P(zp->z_dirlocks, ==, NULL);
- ASSERT3P(zp->z_acl_cached, ==, NULL);
- ASSERT3P(zp->z_xattr_cached, ==, NULL);
- zp->z_id = id;
- zp->z_unlinked = B_FALSE;
- zp->z_atime_dirty = B_FALSE;
- zp->z_zn_prefetch = B_FALSE;
- zp->z_moved = B_FALSE;
- zp->z_is_sa = B_FALSE;
- zp->z_is_mapped = B_FALSE;
- zp->z_is_ctldir = B_TRUE;
- zp->z_is_stale = B_FALSE;
- zp->z_sa_hdl = NULL;
- zp->z_blksz = 0;
- zp->z_seq = 0;
- zp->z_mapcnt = 0;
- zp->z_size = 0;
- zp->z_pflags = 0;
- zp->z_mode = 0;
- zp->z_sync_cnt = 0;
- ip->i_generation = 0;
- ip->i_ino = id;
- ip->i_mode = (S_IFDIR | S_IRWXUGO);
- ip->i_uid = SUID_TO_KUID(0);
- ip->i_gid = SGID_TO_KGID(0);
- ip->i_blkbits = SPA_MINBLOCKSHIFT;
- ip->i_atime = now;
- ip->i_mtime = now;
- ip->i_ctime = now;
- ip->i_fop = fops;
- ip->i_op = ops;
-#if defined(IOP_XATTR)
- ip->i_opflags &= ~IOP_XATTR;
-#endif
-
- if (insert_inode_locked(ip)) {
- unlock_new_inode(ip);
- iput(ip);
- return (NULL);
- }
-
- mutex_enter(&zfsvfs->z_znodes_lock);
- list_insert_tail(&zfsvfs->z_all_znodes, zp);
- zfsvfs->z_nr_znodes++;
- membar_producer();
- mutex_exit(&zfsvfs->z_znodes_lock);
-
- unlock_new_inode(ip);
-
- return (ip);
-}
-
-/*
- * Lookup the inode with given id, it will be allocated if needed.
- */
-static struct inode *
-zfsctl_inode_lookup(zfsvfs_t *zfsvfs, uint64_t id,
- const struct file_operations *fops, const struct inode_operations *ops)
-{
- struct inode *ip = NULL;
-
- while (ip == NULL) {
- ip = ilookup(zfsvfs->z_sb, (unsigned long)id);
- if (ip)
- break;
-
- /* May fail due to concurrent zfsctl_inode_alloc() */
- ip = zfsctl_inode_alloc(zfsvfs, id, fops, ops);
- }
-
- return (ip);
-}
-
-/*
- * Create the '.zfs' directory. This directory is cached as part of the VFS
- * structure. This results in a hold on the zfsvfs_t. The code in zfs_umount()
- * therefore checks against a vfs_count of 2 instead of 1. This reference
- * is removed when the ctldir is destroyed in the unmount. All other entities
- * under the '.zfs' directory are created dynamically as needed.
- *
- * Because the dynamically created '.zfs' directory entries assume the use
- * of 64-bit inode numbers this support must be disabled on 32-bit systems.
- */
-int
-zfsctl_create(zfsvfs_t *zfsvfs)
-{
- ASSERT(zfsvfs->z_ctldir == NULL);
-
- zfsvfs->z_ctldir = zfsctl_inode_alloc(zfsvfs, ZFSCTL_INO_ROOT,
- &zpl_fops_root, &zpl_ops_root);
- if (zfsvfs->z_ctldir == NULL)
- return (SET_ERROR(ENOENT));
-
- return (0);
-}
-
-/*
- * Destroy the '.zfs' directory or remove a snapshot from zfs_snapshots_by_name.
- * Only called when the filesystem is unmounted.
- */
-void
-zfsctl_destroy(zfsvfs_t *zfsvfs)
-{
- if (zfsvfs->z_issnap) {
- zfs_snapentry_t *se;
- spa_t *spa = zfsvfs->z_os->os_spa;
- uint64_t objsetid = dmu_objset_id(zfsvfs->z_os);
-
- rw_enter(&zfs_snapshot_lock, RW_WRITER);
- se = zfsctl_snapshot_find_by_objsetid(spa, objsetid);
- if (se != NULL)
- zfsctl_snapshot_remove(se);
- rw_exit(&zfs_snapshot_lock);
- if (se != NULL) {
- zfsctl_snapshot_unmount_cancel(se);
- zfsctl_snapshot_rele(se);
- }
- } else if (zfsvfs->z_ctldir) {
- iput(zfsvfs->z_ctldir);
- zfsvfs->z_ctldir = NULL;
- }
-}
-
-/*
- * Given a root znode, retrieve the associated .zfs directory.
- * Add a hold to the vnode and return it.
- */
-struct inode *
-zfsctl_root(znode_t *zp)
-{
- ASSERT(zfs_has_ctldir(zp));
- igrab(ZTOZSB(zp)->z_ctldir);
- return (ZTOZSB(zp)->z_ctldir);
-}
-
-/*
- * Generate a long fid to indicate a snapdir. We encode whether snapdir is
- * already mounted in gen field. We do this because nfsd lookup will not
- * trigger automount. Next time the nfsd does fh_to_dentry, we will notice
- * this and do automount and return ESTALE to force nfsd revalidate and follow
- * mount.
- */
-static int
-zfsctl_snapdir_fid(struct inode *ip, fid_t *fidp)
-{
- zfid_short_t *zfid = (zfid_short_t *)fidp;
- zfid_long_t *zlfid = (zfid_long_t *)fidp;
- uint32_t gen = 0;
- uint64_t object;
- uint64_t objsetid;
- int i;
- struct dentry *dentry;
-
- if (fidp->fid_len < LONG_FID_LEN) {
- fidp->fid_len = LONG_FID_LEN;
- return (SET_ERROR(ENOSPC));
- }
-
- object = ip->i_ino;
- objsetid = ZFSCTL_INO_SNAPDIRS - ip->i_ino;
- zfid->zf_len = LONG_FID_LEN;
-
- dentry = d_obtain_alias(igrab(ip));
- if (!IS_ERR(dentry)) {
- gen = !!d_mountpoint(dentry);
- dput(dentry);
- }
-
- for (i = 0; i < sizeof (zfid->zf_object); i++)
- zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
-
- for (i = 0; i < sizeof (zfid->zf_gen); i++)
- zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
-
- for (i = 0; i < sizeof (zlfid->zf_setid); i++)
- zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
-
- for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
- zlfid->zf_setgen[i] = 0;
-
- return (0);
-}
-
-/*
- * Generate an appropriate fid for an entry in the .zfs directory.
- */
-int
-zfsctl_fid(struct inode *ip, fid_t *fidp)
-{
- znode_t *zp = ITOZ(ip);
- zfsvfs_t *zfsvfs = ITOZSB(ip);
- uint64_t object = zp->z_id;
- zfid_short_t *zfid;
- int i;
-
- ZFS_ENTER(zfsvfs);
-
- if (zfsctl_is_snapdir(ip)) {
- ZFS_EXIT(zfsvfs);
- return (zfsctl_snapdir_fid(ip, fidp));
- }
-
- if (fidp->fid_len < SHORT_FID_LEN) {
- fidp->fid_len = SHORT_FID_LEN;
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(ENOSPC));
- }
-
- zfid = (zfid_short_t *)fidp;
-
- zfid->zf_len = SHORT_FID_LEN;
-
- for (i = 0; i < sizeof (zfid->zf_object); i++)
- zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
-
- /* .zfs znodes always have a generation number of 0 */
- for (i = 0; i < sizeof (zfid->zf_gen); i++)
- zfid->zf_gen[i] = 0;
-
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
-/*
- * Construct a full dataset name in full_name: "pool/dataset@snap_name"
- */
-static int
-zfsctl_snapshot_name(zfsvfs_t *zfsvfs, const char *snap_name, int len,
- char *full_name)
-{
- objset_t *os = zfsvfs->z_os;
-
- if (zfs_component_namecheck(snap_name, NULL, NULL) != 0)
- return (SET_ERROR(EILSEQ));
-
- dmu_objset_name(os, full_name);
- if ((strlen(full_name) + 1 + strlen(snap_name)) >= len)
- return (SET_ERROR(ENAMETOOLONG));
-
- (void) strcat(full_name, "@");
- (void) strcat(full_name, snap_name);
-
- return (0);
-}
-
-/*
- * Returns full path in full_path: "/pool/dataset/.zfs/snapshot/snap_name/"
- */
-static int
-zfsctl_snapshot_path_objset(zfsvfs_t *zfsvfs, uint64_t objsetid,
- int path_len, char *full_path)
-{
- objset_t *os = zfsvfs->z_os;
- fstrans_cookie_t cookie;
- char *snapname;
- boolean_t case_conflict;
- uint64_t id, pos = 0;
- int error = 0;
-
- if (zfsvfs->z_vfs->vfs_mntpoint == NULL)
- return (SET_ERROR(ENOENT));
-
- cookie = spl_fstrans_mark();
- snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
-
- while (error == 0) {
- dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
- error = dmu_snapshot_list_next(zfsvfs->z_os,
- ZFS_MAX_DATASET_NAME_LEN, snapname, &id, &pos,
- &case_conflict);
- dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
- if (error)
- goto out;
-
- if (id == objsetid)
- break;
- }
-
- snprintf(full_path, path_len, "%s/.zfs/snapshot/%s",
- zfsvfs->z_vfs->vfs_mntpoint, snapname);
-out:
- kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN);
- spl_fstrans_unmark(cookie);
-
- return (error);
-}
-
-/*
- * Special case the handling of "..".
- */
-int
-zfsctl_root_lookup(struct inode *dip, char *name, struct inode **ipp,
- int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
-{
- zfsvfs_t *zfsvfs = ITOZSB(dip);
- int error = 0;
-
- ZFS_ENTER(zfsvfs);
-
- if (strcmp(name, "..") == 0) {
- *ipp = dip->i_sb->s_root->d_inode;
- } else if (strcmp(name, ZFS_SNAPDIR_NAME) == 0) {
- *ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SNAPDIR,
- &zpl_fops_snapdir, &zpl_ops_snapdir);
- } else if (strcmp(name, ZFS_SHAREDIR_NAME) == 0) {
- *ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SHARES,
- &zpl_fops_shares, &zpl_ops_shares);
- } else {
- *ipp = NULL;
- }
-
- if (*ipp == NULL)
- error = SET_ERROR(ENOENT);
-
- ZFS_EXIT(zfsvfs);
-
- return (error);
-}
-
-/*
- * Lookup entry point for the 'snapshot' directory. Try to open the
- * snapshot if it exist, creating the pseudo filesystem inode as necessary.
- */
-int
-zfsctl_snapdir_lookup(struct inode *dip, char *name, struct inode **ipp,
- int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
-{
- zfsvfs_t *zfsvfs = ITOZSB(dip);
- uint64_t id;
- int error;
-
- ZFS_ENTER(zfsvfs);
-
- error = dmu_snapshot_lookup(zfsvfs->z_os, name, &id);
- if (error) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- *ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SNAPDIRS - id,
- &simple_dir_operations, &simple_dir_inode_operations);
- if (*ipp == NULL)
- error = SET_ERROR(ENOENT);
-
- ZFS_EXIT(zfsvfs);
-
- return (error);
-}
-
-/*
- * Renaming a directory under '.zfs/snapshot' will automatically trigger
- * a rename of the snapshot to the new given name. The rename is confined
- * to the '.zfs/snapshot' directory snapshots cannot be moved elsewhere.
- */
-int
-zfsctl_snapdir_rename(struct inode *sdip, char *snm,
- struct inode *tdip, char *tnm, cred_t *cr, int flags)
-{
- zfsvfs_t *zfsvfs = ITOZSB(sdip);
- char *to, *from, *real, *fsname;
- int error;
-
- if (!zfs_admin_snapshot)
- return (SET_ERROR(EACCES));
-
- ZFS_ENTER(zfsvfs);
-
- to = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
- from = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
- real = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
- fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
-
- if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
- error = dmu_snapshot_realname(zfsvfs->z_os, snm, real,
- ZFS_MAX_DATASET_NAME_LEN, NULL);
- if (error == 0) {
- snm = real;
- } else if (error != ENOTSUP) {
- goto out;
- }
- }
-
- dmu_objset_name(zfsvfs->z_os, fsname);
-
- error = zfsctl_snapshot_name(ITOZSB(sdip), snm,
- ZFS_MAX_DATASET_NAME_LEN, from);
- if (error == 0)
- error = zfsctl_snapshot_name(ITOZSB(tdip), tnm,
- ZFS_MAX_DATASET_NAME_LEN, to);
- if (error == 0)
- error = zfs_secpolicy_rename_perms(from, to, cr);
- if (error != 0)
- goto out;
-
- /*
- * Cannot move snapshots out of the snapdir.
- */
- if (sdip != tdip) {
- error = SET_ERROR(EINVAL);
- goto out;
- }
-
- /*
- * No-op when names are identical.
- */
- if (strcmp(snm, tnm) == 0) {
- error = 0;
- goto out;
- }
-
- rw_enter(&zfs_snapshot_lock, RW_WRITER);
-
- error = dsl_dataset_rename_snapshot(fsname, snm, tnm, B_FALSE);
- if (error == 0)
- (void) zfsctl_snapshot_rename(snm, tnm);
-
- rw_exit(&zfs_snapshot_lock);
-out:
- kmem_free(from, ZFS_MAX_DATASET_NAME_LEN);
- kmem_free(to, ZFS_MAX_DATASET_NAME_LEN);
- kmem_free(real, ZFS_MAX_DATASET_NAME_LEN);
- kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN);
-
- ZFS_EXIT(zfsvfs);
-
- return (error);
-}
-
-/*
- * Removing a directory under '.zfs/snapshot' will automatically trigger
- * the removal of the snapshot with the given name.
- */
-int
-zfsctl_snapdir_remove(struct inode *dip, char *name, cred_t *cr, int flags)
-{
- zfsvfs_t *zfsvfs = ITOZSB(dip);
- char *snapname, *real;
- int error;
-
- if (!zfs_admin_snapshot)
- return (SET_ERROR(EACCES));
-
- ZFS_ENTER(zfsvfs);
-
- snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
- real = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
-
- if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
- error = dmu_snapshot_realname(zfsvfs->z_os, name, real,
- ZFS_MAX_DATASET_NAME_LEN, NULL);
- if (error == 0) {
- name = real;
- } else if (error != ENOTSUP) {
- goto out;
- }
- }
-
- error = zfsctl_snapshot_name(ITOZSB(dip), name,
- ZFS_MAX_DATASET_NAME_LEN, snapname);
- if (error == 0)
- error = zfs_secpolicy_destroy_perms(snapname, cr);
- if (error != 0)
- goto out;
-
- error = zfsctl_snapshot_unmount(snapname, MNT_FORCE);
- if ((error == 0) || (error == ENOENT))
- error = dsl_destroy_snapshot(snapname, B_FALSE);
-out:
- kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN);
- kmem_free(real, ZFS_MAX_DATASET_NAME_LEN);
-
- ZFS_EXIT(zfsvfs);
-
- return (error);
-}
-
-/*
- * Creating a directory under '.zfs/snapshot' will automatically trigger
- * the creation of a new snapshot with the given name.
- */
-int
-zfsctl_snapdir_mkdir(struct inode *dip, char *dirname, vattr_t *vap,
- struct inode **ipp, cred_t *cr, int flags)
-{
- zfsvfs_t *zfsvfs = ITOZSB(dip);
- char *dsname;
- int error;
-
- if (!zfs_admin_snapshot)
- return (SET_ERROR(EACCES));
-
- dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
-
- if (zfs_component_namecheck(dirname, NULL, NULL) != 0) {
- error = SET_ERROR(EILSEQ);
- goto out;
- }
-
- dmu_objset_name(zfsvfs->z_os, dsname);
-
- error = zfs_secpolicy_snapshot_perms(dsname, cr);
- if (error != 0)
- goto out;
-
- if (error == 0) {
- error = dmu_objset_snapshot_one(dsname, dirname);
- if (error != 0)
- goto out;
-
- error = zfsctl_snapdir_lookup(dip, dirname, ipp,
- 0, cr, NULL, NULL);
- }
-out:
- kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN);
-
- return (error);
-}
-
-/*
- * Attempt to unmount a snapshot by making a call to user space.
- * There is no assurance that this can or will succeed, is just a
- * best effort. In the case where it does fail, perhaps because
- * it's in use, the unmount will fail harmlessly.
- */
-int
-zfsctl_snapshot_unmount(char *snapname, int flags)
-{
- char *argv[] = { "/usr/bin/env", "umount", "-t", "zfs", "-n", NULL,
- NULL };
- char *envp[] = { NULL };
- zfs_snapentry_t *se;
- int error;
-
- rw_enter(&zfs_snapshot_lock, RW_READER);
- if ((se = zfsctl_snapshot_find_by_name(snapname)) == NULL) {
- rw_exit(&zfs_snapshot_lock);
- return (SET_ERROR(ENOENT));
- }
- rw_exit(&zfs_snapshot_lock);
-
- if (flags & MNT_FORCE)
- argv[4] = "-fn";
- argv[5] = se->se_path;
- dprintf("unmount; path=%s\n", se->se_path);
- error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
- zfsctl_snapshot_rele(se);
-
-
- /*
- * The umount system utility will return 256 on error. We must
- * assume this error is because the file system is busy so it is
- * converted to the more sensible EBUSY.
- */
- if (error)
- error = SET_ERROR(EBUSY);
-
- return (error);
-}
-
-int
-zfsctl_snapshot_mount(struct path *path, int flags)
-{
- struct dentry *dentry = path->dentry;
- struct inode *ip = dentry->d_inode;
- zfsvfs_t *zfsvfs;
- zfsvfs_t *snap_zfsvfs;
- zfs_snapentry_t *se;
- char *full_name, *full_path;
- char *argv[] = { "/usr/bin/env", "mount", "-t", "zfs", "-n", NULL, NULL,
- NULL };
- char *envp[] = { NULL };
- int error;
- struct path spath;
-
- if (ip == NULL)
- return (SET_ERROR(EISDIR));
-
- zfsvfs = ITOZSB(ip);
- ZFS_ENTER(zfsvfs);
-
- full_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
- full_path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
-
- error = zfsctl_snapshot_name(zfsvfs, dname(dentry),
- ZFS_MAX_DATASET_NAME_LEN, full_name);
- if (error)
- goto error;
-
- /*
- * Construct a mount point path from sb of the ctldir inode and dirent
- * name, instead of from d_path(), so that chroot'd process doesn't fail
- * on mount.zfs(8).
- */
- snprintf(full_path, MAXPATHLEN, "%s/.zfs/snapshot/%s",
- zfsvfs->z_vfs->vfs_mntpoint, dname(dentry));
-
- /*
- * Multiple concurrent automounts of a snapshot are never allowed.
- * The snapshot may be manually mounted as many times as desired.
- */
- if (zfsctl_snapshot_ismounted(full_name)) {
- error = 0;
- goto error;
- }
-
- /*
- * Attempt to mount the snapshot from user space. Normally this
- * would be done using the vfs_kern_mount() function, however that
- * function is marked GPL-only and cannot be used. On error we
- * careful to log the real error to the console and return EISDIR
- * to safely abort the automount. This should be very rare.
- *
- * If the user mode helper happens to return EBUSY, a concurrent
- * mount is already in progress in which case the error is ignored.
- * Take note that if the program was executed successfully the return
- * value from call_usermodehelper() will be (exitcode << 8 + signal).
- */
- dprintf("mount; name=%s path=%s\n", full_name, full_path);
- argv[5] = full_name;
- argv[6] = full_path;
- error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
- if (error) {
- if (!(error & MOUNT_BUSY << 8)) {
- zfs_dbgmsg("Unable to automount %s error=%d",
- full_path, error);
- error = SET_ERROR(EISDIR);
- } else {
- /*
- * EBUSY, this could mean a concurrent mount, or the
- * snapshot has already been mounted at completely
- * different place. We return 0 so VFS will retry. For
- * the latter case the VFS will retry several times
- * and return ELOOP, which is probably not a very good
- * behavior.
- */
- error = 0;
- }
- goto error;
- }
-
- /*
- * Follow down in to the mounted snapshot and set MNT_SHRINKABLE
- * to identify this as an automounted filesystem.
- */
- spath = *path;
- path_get(&spath);
- if (zpl_follow_down_one(&spath)) {
- snap_zfsvfs = ITOZSB(spath.dentry->d_inode);
- snap_zfsvfs->z_parent = zfsvfs;
- dentry = spath.dentry;
- spath.mnt->mnt_flags |= MNT_SHRINKABLE;
-
- rw_enter(&zfs_snapshot_lock, RW_WRITER);
- se = zfsctl_snapshot_alloc(full_name, full_path,
- snap_zfsvfs->z_os->os_spa, dmu_objset_id(snap_zfsvfs->z_os),
- dentry);
- zfsctl_snapshot_add(se);
- zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot);
- rw_exit(&zfs_snapshot_lock);
- }
- path_put(&spath);
-error:
- kmem_free(full_name, ZFS_MAX_DATASET_NAME_LEN);
- kmem_free(full_path, MAXPATHLEN);
-
- ZFS_EXIT(zfsvfs);
-
- return (error);
-}
-
-/*
- * Get the snapdir inode from fid
- */
-int
-zfsctl_snapdir_vget(struct super_block *sb, uint64_t objsetid, int gen,
- struct inode **ipp)
-{
- int error;
- struct path path;
- char *mnt;
- struct dentry *dentry;
-
- mnt = kmem_alloc(MAXPATHLEN, KM_SLEEP);
-
- error = zfsctl_snapshot_path_objset(sb->s_fs_info, objsetid,
- MAXPATHLEN, mnt);
- if (error)
- goto out;
-
- /* Trigger automount */
- error = -kern_path(mnt, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &path);
- if (error)
- goto out;
-
- path_put(&path);
- /*
- * Get the snapdir inode. Note, we don't want to use the above
- * path because it contains the root of the snapshot rather
- * than the snapdir.
- */
- *ipp = ilookup(sb, ZFSCTL_INO_SNAPDIRS - objsetid);
- if (*ipp == NULL) {
- error = SET_ERROR(ENOENT);
- goto out;
- }
-
- /* check gen, see zfsctl_snapdir_fid */
- dentry = d_obtain_alias(igrab(*ipp));
- if (gen != (!IS_ERR(dentry) && d_mountpoint(dentry))) {
- iput(*ipp);
- *ipp = NULL;
- error = SET_ERROR(ENOENT);
- }
- if (!IS_ERR(dentry))
- dput(dentry);
-out:
- kmem_free(mnt, MAXPATHLEN);
- return (error);
-}
-
-int
-zfsctl_shares_lookup(struct inode *dip, char *name, struct inode **ipp,
- int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
-{
- zfsvfs_t *zfsvfs = ITOZSB(dip);
- struct inode *ip;
- znode_t *dzp;
- int error;
-
- ZFS_ENTER(zfsvfs);
-
- if (zfsvfs->z_shares_dir == 0) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(ENOTSUP));
- }
-
- if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
- error = zfs_lookup(ZTOI(dzp), name, &ip, 0, cr, NULL, NULL);
- iput(ZTOI(dzp));
- }
-
- ZFS_EXIT(zfsvfs);
-
- return (error);
-}
-
-/*
- * Initialize the various pieces we'll need to create and manipulate .zfs
- * directories. Currently this is unused but available.
- */
-void
-zfsctl_init(void)
-{
- avl_create(&zfs_snapshots_by_name, snapentry_compare_by_name,
- sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t,
- se_node_name));
- avl_create(&zfs_snapshots_by_objsetid, snapentry_compare_by_objsetid,
- sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t,
- se_node_objsetid));
- rw_init(&zfs_snapshot_lock, NULL, RW_DEFAULT, NULL);
-}
-
-/*
- * Cleanup the various pieces we needed for .zfs directories. In particular
- * ensure the expiry timer is canceled safely.
- */
-void
-zfsctl_fini(void)
-{
- avl_destroy(&zfs_snapshots_by_name);
- avl_destroy(&zfs_snapshots_by_objsetid);
- rw_destroy(&zfs_snapshot_lock);
-}
-
-module_param(zfs_admin_snapshot, int, 0644);
-MODULE_PARM_DESC(zfs_admin_snapshot, "Enable mkdir/rmdir/mv in .zfs/snapshot");
-
-module_param(zfs_expire_snapshot, int, 0644);
-MODULE_PARM_DESC(zfs_expire_snapshot, "Seconds to expire .zfs/snapshot");
diff --git a/module/zfs/zfs_debug.c b/module/zfs/zfs_debug.c
deleted file mode 100644
index 538533d27..000000000
--- a/module/zfs/zfs_debug.c
+++ /dev/null
@@ -1,253 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
- */
-
-#include <sys/zfs_context.h>
-
-typedef struct zfs_dbgmsg {
- procfs_list_node_t zdm_node;
- time_t zdm_timestamp;
- int zdm_size;
- char zdm_msg[1]; /* variable length allocation */
-} zfs_dbgmsg_t;
-
-procfs_list_t zfs_dbgmsgs;
-int zfs_dbgmsg_size = 0;
-int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */
-
-/*
- * Internal ZFS debug messages are enabled by default.
- *
- * # Print debug messages
- * cat /proc/spl/kstat/zfs/dbgmsg
- *
- * # Disable the kernel debug message log.
- * echo 0 > /sys/module/zfs/parameters/zfs_dbgmsg_enable
- *
- * # Clear the kernel debug message log.
- * echo 0 >/proc/spl/kstat/zfs/dbgmsg
- */
-int zfs_dbgmsg_enable = 1;
-
-static int
-zfs_dbgmsg_show_header(struct seq_file *f)
-{
- seq_printf(f, "%-12s %-8s\n", "timestamp", "message");
- return (0);
-}
-
-static int
-zfs_dbgmsg_show(struct seq_file *f, void *p)
-{
- zfs_dbgmsg_t *zdm = (zfs_dbgmsg_t *)p;
- seq_printf(f, "%-12llu %-s\n",
- (u_longlong_t)zdm->zdm_timestamp, zdm->zdm_msg);
- return (0);
-}
-
-static void
-zfs_dbgmsg_purge(int max_size)
-{
- while (zfs_dbgmsg_size > max_size) {
- zfs_dbgmsg_t *zdm = list_remove_head(&zfs_dbgmsgs.pl_list);
- if (zdm == NULL)
- return;
-
- int size = zdm->zdm_size;
- kmem_free(zdm, size);
- zfs_dbgmsg_size -= size;
- }
-}
-
-static int
-zfs_dbgmsg_clear(procfs_list_t *procfs_list)
-{
- mutex_enter(&zfs_dbgmsgs.pl_lock);
- zfs_dbgmsg_purge(0);
- mutex_exit(&zfs_dbgmsgs.pl_lock);
- return (0);
-}
-
-void
-zfs_dbgmsg_init(void)
-{
- procfs_list_install("zfs",
- "dbgmsg",
- 0600,
- &zfs_dbgmsgs,
- zfs_dbgmsg_show,
- zfs_dbgmsg_show_header,
- zfs_dbgmsg_clear,
- offsetof(zfs_dbgmsg_t, zdm_node));
-}
-
-void
-zfs_dbgmsg_fini(void)
-{
- procfs_list_uninstall(&zfs_dbgmsgs);
- zfs_dbgmsg_purge(0);
-
- /*
- * TODO - decide how to make this permanent
- */
-#ifdef _KERNEL
- procfs_list_destroy(&zfs_dbgmsgs);
-#endif
-}
-
-void
-__set_error(const char *file, const char *func, int line, int err)
-{
- /*
- * To enable this:
- *
- * $ echo 512 >/sys/module/zfs/parameters/zfs_flags
- */
- if (zfs_flags & ZFS_DEBUG_SET_ERROR)
- __dprintf(B_FALSE, file, func, line, "error %lu", err);
-}
-
-void
-__zfs_dbgmsg(char *buf)
-{
- int size = sizeof (zfs_dbgmsg_t) + strlen(buf);
- zfs_dbgmsg_t *zdm = kmem_zalloc(size, KM_SLEEP);
- zdm->zdm_size = size;
- zdm->zdm_timestamp = gethrestime_sec();
- strcpy(zdm->zdm_msg, buf);
-
- mutex_enter(&zfs_dbgmsgs.pl_lock);
- procfs_list_add(&zfs_dbgmsgs, zdm);
- zfs_dbgmsg_size += size;
- zfs_dbgmsg_purge(MAX(zfs_dbgmsg_maxsize, 0));
- mutex_exit(&zfs_dbgmsgs.pl_lock);
-}
-
-#ifdef _KERNEL
-
-void
-__dprintf(boolean_t dprint, const char *file, const char *func,
- int line, const char *fmt, ...)
-{
- const char *newfile;
- va_list adx;
- size_t size;
- char *buf;
- char *nl;
- int i;
- char *prefix = (dprint) ? "dprintf: " : "";
-
- size = 1024;
- buf = kmem_alloc(size, KM_SLEEP);
-
- /*
- * Get rid of annoying prefix to filename.
- */
- newfile = strrchr(file, '/');
- if (newfile != NULL) {
- newfile = newfile + 1; /* Get rid of leading / */
- } else {
- newfile = file;
- }
-
- i = snprintf(buf, size, "%s%s:%d:%s(): ", prefix, newfile, line, func);
-
- if (i < size) {
- va_start(adx, fmt);
- (void) vsnprintf(buf + i, size - i, fmt, adx);
- va_end(adx);
- }
-
- /*
- * Get rid of trailing newline for dprintf logs.
- */
- if (dprint && buf[0] != '\0') {
- nl = &buf[strlen(buf) - 1];
- if (*nl == '\n')
- *nl = '\0';
- }
-
- /*
- * To get this data enable the zfs__dprintf trace point as shown:
- *
- * # Enable zfs__dprintf tracepoint, clear the tracepoint ring buffer
- * $ echo 1 > /sys/kernel/debug/tracing/events/zfs/enable
- * $ echo 0 > /sys/kernel/debug/tracing/trace
- *
- * # Dump the ring buffer.
- * $ cat /sys/kernel/debug/tracing/trace
- */
- DTRACE_PROBE1(zfs__dprintf, char *, buf);
-
- /*
- * To get this data:
- *
- * $ cat /proc/spl/kstat/zfs/dbgmsg
- *
- * To clear the buffer:
- * $ echo 0 > /proc/spl/kstat/zfs/dbgmsg
- */
- __zfs_dbgmsg(buf);
-
- kmem_free(buf, size);
-}
-
-#else
-
-void
-zfs_dbgmsg_print(const char *tag)
-{
- ssize_t ret __attribute__((unused));
-
- /*
- * We use write() in this function instead of printf()
- * so it is safe to call from a signal handler.
- */
- ret = write(STDOUT_FILENO, "ZFS_DBGMSG(", 11);
- ret = write(STDOUT_FILENO, tag, strlen(tag));
- ret = write(STDOUT_FILENO, ") START:\n", 9);
-
- mutex_enter(&zfs_dbgmsgs.pl_lock);
- for (zfs_dbgmsg_t *zdm = list_head(&zfs_dbgmsgs.pl_list); zdm != NULL;
- zdm = list_next(&zfs_dbgmsgs.pl_list, zdm)) {
- ret = write(STDOUT_FILENO, zdm->zdm_msg,
- strlen(zdm->zdm_msg));
- ret = write(STDOUT_FILENO, "\n", 1);
- }
-
- ret = write(STDOUT_FILENO, "ZFS_DBGMSG(", 11);
- ret = write(STDOUT_FILENO, tag, strlen(tag));
- ret = write(STDOUT_FILENO, ") END\n", 6);
-
- mutex_exit(&zfs_dbgmsgs.pl_lock);
-}
-#endif /* _KERNEL */
-
-#ifdef _KERNEL
-module_param(zfs_dbgmsg_enable, int, 0644);
-MODULE_PARM_DESC(zfs_dbgmsg_enable, "Enable ZFS debug message log");
-
-module_param(zfs_dbgmsg_maxsize, int, 0644);
-MODULE_PARM_DESC(zfs_dbgmsg_maxsize, "Maximum ZFS debug log size");
-#endif
diff --git a/module/zfs/zfs_dir.c b/module/zfs/zfs_dir.c
deleted file mode 100644
index 6bdad737c..000000000
--- a/module/zfs/zfs_dir.c
+++ /dev/null
@@ -1,1205 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
- * Copyright 2017 Nexenta Systems, Inc.
- */
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/time.h>
-#include <sys/sysmacros.h>
-#include <sys/vfs.h>
-#include <sys/vnode.h>
-#include <sys/file.h>
-#include <sys/mode.h>
-#include <sys/kmem.h>
-#include <sys/uio.h>
-#include <sys/pathname.h>
-#include <sys/cmn_err.h>
-#include <sys/errno.h>
-#include <sys/stat.h>
-#include <sys/sunddi.h>
-#include <sys/random.h>
-#include <sys/policy.h>
-#include <sys/zfs_dir.h>
-#include <sys/zfs_acl.h>
-#include <sys/zfs_vnops.h>
-#include <sys/fs/zfs.h>
-#include <sys/zap.h>
-#include <sys/dmu.h>
-#include <sys/atomic.h>
-#include <sys/zfs_ctldir.h>
-#include <sys/zfs_fuid.h>
-#include <sys/sa.h>
-#include <sys/zfs_sa.h>
-
-/*
- * zfs_match_find() is used by zfs_dirent_lock() to perform zap lookups
- * of names after deciding which is the appropriate lookup interface.
- */
-static int
-zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, char *name, matchtype_t mt,
- boolean_t update, int *deflags, pathname_t *rpnp, uint64_t *zoid)
-{
- boolean_t conflict = B_FALSE;
- int error;
-
- if (zfsvfs->z_norm) {
- size_t bufsz = 0;
- char *buf = NULL;
-
- if (rpnp) {
- buf = rpnp->pn_buf;
- bufsz = rpnp->pn_bufsize;
- }
-
- /*
- * In the non-mixed case we only expect there would ever
- * be one match, but we need to use the normalizing lookup.
- */
- error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1,
- zoid, mt, buf, bufsz, &conflict);
- } else {
- error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid);
- }
-
- /*
- * Allow multiple entries provided the first entry is
- * the object id. Non-zpl consumers may safely make
- * use of the additional space.
- *
- * XXX: This should be a feature flag for compatibility
- */
- if (error == EOVERFLOW)
- error = 0;
-
- if (zfsvfs->z_norm && !error && deflags)
- *deflags = conflict ? ED_CASE_CONFLICT : 0;
-
- *zoid = ZFS_DIRENT_OBJ(*zoid);
-
- return (error);
-}
-
-/*
- * Lock a directory entry. A dirlock on <dzp, name> protects that name
- * in dzp's directory zap object. As long as you hold a dirlock, you can
- * assume two things: (1) dzp cannot be reaped, and (2) no other thread
- * can change the zap entry for (i.e. link or unlink) this name.
- *
- * Input arguments:
- * dzp - znode for directory
- * name - name of entry to lock
- * flag - ZNEW: if the entry already exists, fail with EEXIST.
- * ZEXISTS: if the entry does not exist, fail with ENOENT.
- * ZSHARED: allow concurrent access with other ZSHARED callers.
- * ZXATTR: we want dzp's xattr directory
- * ZCILOOK: On a mixed sensitivity file system,
- * this lookup should be case-insensitive.
- * ZCIEXACT: On a purely case-insensitive file system,
- * this lookup should be case-sensitive.
- * ZRENAMING: we are locking for renaming, force narrow locks
- * ZHAVELOCK: Don't grab the z_name_lock for this call. The
- * current thread already holds it.
- *
- * Output arguments:
- * zpp - pointer to the znode for the entry (NULL if there isn't one)
- * dlpp - pointer to the dirlock for this entry (NULL on error)
- * direntflags - (case-insensitive lookup only)
- * flags if multiple case-sensitive matches exist in directory
- * realpnp - (case-insensitive lookup only)
- * actual name matched within the directory
- *
- * Return value: 0 on success or errno on failure.
- *
- * NOTE: Always checks for, and rejects, '.' and '..'.
- * NOTE: For case-insensitive file systems we take wide locks (see below),
- * but return znode pointers to a single match.
- */
-int
-zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp,
- int flag, int *direntflags, pathname_t *realpnp)
-{
- zfsvfs_t *zfsvfs = ZTOZSB(dzp);
- zfs_dirlock_t *dl;
- boolean_t update;
- matchtype_t mt = 0;
- uint64_t zoid;
- int error = 0;
- int cmpflags;
-
- *zpp = NULL;
- *dlpp = NULL;
-
- /*
- * Verify that we are not trying to lock '.', '..', or '.zfs'
- */
- if ((name[0] == '.' &&
- (name[1] == '\0' || (name[1] == '.' && name[2] == '\0'))) ||
- (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0))
- return (SET_ERROR(EEXIST));
-
- /*
- * Case sensitivity and normalization preferences are set when
- * the file system is created. These are stored in the
- * zfsvfs->z_case and zfsvfs->z_norm fields. These choices
- * affect what vnodes can be cached in the DNLC, how we
- * perform zap lookups, and the "width" of our dirlocks.
- *
- * A normal dirlock locks a single name. Note that with
- * normalization a name can be composed multiple ways, but
- * when normalized, these names all compare equal. A wide
- * dirlock locks multiple names. We need these when the file
- * system is supporting mixed-mode access. It is sometimes
- * necessary to lock all case permutations of file name at
- * once so that simultaneous case-insensitive/case-sensitive
- * behaves as rationally as possible.
- */
-
- /*
- * When matching we may need to normalize & change case according to
- * FS settings.
- *
- * Note that a normalized match is necessary for a case insensitive
- * filesystem when the lookup request is not exact because normalization
- * can fold case independent of normalizing code point sequences.
- *
- * See the table above zfs_dropname().
- */
- if (zfsvfs->z_norm != 0) {
- mt = MT_NORMALIZE;
-
- /*
- * Determine if the match needs to honor the case specified in
- * lookup, and if so keep track of that so that during
- * normalization we don't fold case.
- */
- if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE &&
- (flag & ZCIEXACT)) ||
- (zfsvfs->z_case == ZFS_CASE_MIXED && !(flag & ZCILOOK))) {
- mt |= MT_MATCH_CASE;
- }
- }
-
- /*
- * Only look in or update the DNLC if we are looking for the
- * name on a file system that does not require normalization
- * or case folding. We can also look there if we happen to be
- * on a non-normalizing, mixed sensitivity file system IF we
- * are looking for the exact name.
- *
- * Maybe can add TO-UPPERed version of name to dnlc in ci-only
- * case for performance improvement?
- */
- update = !zfsvfs->z_norm ||
- (zfsvfs->z_case == ZFS_CASE_MIXED &&
- !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK));
-
- /*
- * ZRENAMING indicates we are in a situation where we should
- * take narrow locks regardless of the file system's
- * preferences for normalizing and case folding. This will
- * prevent us deadlocking trying to grab the same wide lock
- * twice if the two names happen to be case-insensitive
- * matches.
- */
- if (flag & ZRENAMING)
- cmpflags = 0;
- else
- cmpflags = zfsvfs->z_norm;
-
- /*
- * Wait until there are no locks on this name.
- *
- * Don't grab the lock if it is already held. However, cannot
- * have both ZSHARED and ZHAVELOCK together.
- */
- ASSERT(!(flag & ZSHARED) || !(flag & ZHAVELOCK));
- if (!(flag & ZHAVELOCK))
- rw_enter(&dzp->z_name_lock, RW_READER);
-
- mutex_enter(&dzp->z_lock);
- for (;;) {
- if (dzp->z_unlinked && !(flag & ZXATTR)) {
- mutex_exit(&dzp->z_lock);
- if (!(flag & ZHAVELOCK))
- rw_exit(&dzp->z_name_lock);
- return (SET_ERROR(ENOENT));
- }
- for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) {
- if ((u8_strcmp(name, dl->dl_name, 0, cmpflags,
- U8_UNICODE_LATEST, &error) == 0) || error != 0)
- break;
- }
- if (error != 0) {
- mutex_exit(&dzp->z_lock);
- if (!(flag & ZHAVELOCK))
- rw_exit(&dzp->z_name_lock);
- return (SET_ERROR(ENOENT));
- }
- if (dl == NULL) {
- /*
- * Allocate a new dirlock and add it to the list.
- */
- dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP);
- cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL);
- dl->dl_name = name;
- dl->dl_sharecnt = 0;
- dl->dl_namelock = 0;
- dl->dl_namesize = 0;
- dl->dl_dzp = dzp;
- dl->dl_next = dzp->z_dirlocks;
- dzp->z_dirlocks = dl;
- break;
- }
- if ((flag & ZSHARED) && dl->dl_sharecnt != 0)
- break;
- cv_wait(&dl->dl_cv, &dzp->z_lock);
- }
-
- /*
- * If the z_name_lock was NOT held for this dirlock record it.
- */
- if (flag & ZHAVELOCK)
- dl->dl_namelock = 1;
-
- if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) {
- /*
- * We're the second shared reference to dl. Make a copy of
- * dl_name in case the first thread goes away before we do.
- * Note that we initialize the new name before storing its
- * pointer into dl_name, because the first thread may load
- * dl->dl_name at any time. It'll either see the old value,
- * which belongs to it, or the new shared copy; either is OK.
- */
- dl->dl_namesize = strlen(dl->dl_name) + 1;
- name = kmem_alloc(dl->dl_namesize, KM_SLEEP);
- bcopy(dl->dl_name, name, dl->dl_namesize);
- dl->dl_name = name;
- }
-
- mutex_exit(&dzp->z_lock);
-
- /*
- * We have a dirlock on the name. (Note that it is the dirlock,
- * not the dzp's z_lock, that protects the name in the zap object.)
- * See if there's an object by this name; if so, put a hold on it.
- */
- if (flag & ZXATTR) {
- error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid,
- sizeof (zoid));
- if (error == 0)
- error = (zoid == 0 ? SET_ERROR(ENOENT) : 0);
- } else {
- error = zfs_match_find(zfsvfs, dzp, name, mt,
- update, direntflags, realpnp, &zoid);
- }
- if (error) {
- if (error != ENOENT || (flag & ZEXISTS)) {
- zfs_dirent_unlock(dl);
- return (error);
- }
- } else {
- if (flag & ZNEW) {
- zfs_dirent_unlock(dl);
- return (SET_ERROR(EEXIST));
- }
- error = zfs_zget(zfsvfs, zoid, zpp);
- if (error) {
- zfs_dirent_unlock(dl);
- return (error);
- }
- }
-
- *dlpp = dl;
-
- return (0);
-}
-
-/*
- * Unlock this directory entry and wake anyone who was waiting for it.
- */
-void
-zfs_dirent_unlock(zfs_dirlock_t *dl)
-{
- znode_t *dzp = dl->dl_dzp;
- zfs_dirlock_t **prev_dl, *cur_dl;
-
- mutex_enter(&dzp->z_lock);
-
- if (!dl->dl_namelock)
- rw_exit(&dzp->z_name_lock);
-
- if (dl->dl_sharecnt > 1) {
- dl->dl_sharecnt--;
- mutex_exit(&dzp->z_lock);
- return;
- }
- prev_dl = &dzp->z_dirlocks;
- while ((cur_dl = *prev_dl) != dl)
- prev_dl = &cur_dl->dl_next;
- *prev_dl = dl->dl_next;
- cv_broadcast(&dl->dl_cv);
- mutex_exit(&dzp->z_lock);
-
- if (dl->dl_namesize != 0)
- kmem_free(dl->dl_name, dl->dl_namesize);
- cv_destroy(&dl->dl_cv);
- kmem_free(dl, sizeof (*dl));
-}
-
-/*
- * Look up an entry in a directory.
- *
- * NOTE: '.' and '..' are handled as special cases because
- * no directory entries are actually stored for them. If this is
- * the root of a filesystem, then '.zfs' is also treated as a
- * special pseudo-directory.
- */
-int
-zfs_dirlook(znode_t *dzp, char *name, struct inode **ipp, int flags,
- int *deflg, pathname_t *rpnp)
-{
- zfs_dirlock_t *dl;
- znode_t *zp;
- int error = 0;
- uint64_t parent;
-
- if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
- *ipp = ZTOI(dzp);
- igrab(*ipp);
- } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
- zfsvfs_t *zfsvfs = ZTOZSB(dzp);
-
- /*
- * If we are a snapshot mounted under .zfs, return
- * the inode pointer for the snapshot directory.
- */
- if ((error = sa_lookup(dzp->z_sa_hdl,
- SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
- return (error);
-
- if (parent == dzp->z_id && zfsvfs->z_parent != zfsvfs) {
- error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,
- "snapshot", ipp, 0, kcred, NULL, NULL);
- return (error);
- }
- rw_enter(&dzp->z_parent_lock, RW_READER);
- error = zfs_zget(zfsvfs, parent, &zp);
- if (error == 0)
- *ipp = ZTOI(zp);
- rw_exit(&dzp->z_parent_lock);
- } else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) {
- *ipp = zfsctl_root(dzp);
- } else {
- int zf;
-
- zf = ZEXISTS | ZSHARED;
- if (flags & FIGNORECASE)
- zf |= ZCILOOK;
-
- error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp);
- if (error == 0) {
- *ipp = ZTOI(zp);
- zfs_dirent_unlock(dl);
- dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */
- }
- rpnp = NULL;
- }
-
- if ((flags & FIGNORECASE) && rpnp && !error)
- (void) strlcpy(rpnp->pn_buf, name, rpnp->pn_bufsize);
-
- return (error);
-}
-
-/*
- * unlinked Set (formerly known as the "delete queue") Error Handling
- *
- * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we
- * don't specify the name of the entry that we will be manipulating. We
- * also fib and say that we won't be adding any new entries to the
- * unlinked set, even though we might (this is to lower the minimum file
- * size that can be deleted in a full filesystem). So on the small
- * chance that the nlink list is using a fat zap (ie. has more than
- * 2000 entries), we *may* not pre-read a block that's needed.
- * Therefore it is remotely possible for some of the assertions
- * regarding the unlinked set below to fail due to i/o error. On a
- * nondebug system, this will result in the space being leaked.
- */
-void
-zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
-{
- zfsvfs_t *zfsvfs = ZTOZSB(zp);
-
- ASSERT(zp->z_unlinked);
- ASSERT(ZTOI(zp)->i_nlink == 0);
-
- VERIFY3U(0, ==,
- zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
-
- dataset_kstats_update_nunlinks_kstat(&zfsvfs->z_kstat, 1);
-}
-
-/*
- * Clean up any znodes that had no links when we either crashed or
- * (force) umounted the file system.
- */
-static void
-zfs_unlinked_drain_task(void *arg)
-{
- zfsvfs_t *zfsvfs = arg;
- zap_cursor_t zc;
- zap_attribute_t zap;
- dmu_object_info_t doi;
- znode_t *zp;
- int error;
-
- ASSERT3B(zfsvfs->z_draining, ==, B_TRUE);
-
- /*
- * Iterate over the contents of the unlinked set.
- */
- for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj);
- zap_cursor_retrieve(&zc, &zap) == 0 && !zfsvfs->z_drain_cancel;
- zap_cursor_advance(&zc)) {
-
- /*
- * See what kind of object we have in list
- */
-
- error = dmu_object_info(zfsvfs->z_os,
- zap.za_first_integer, &doi);
- if (error != 0)
- continue;
-
- ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
- (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));
- /*
- * We need to re-mark these list entries for deletion,
- * so we pull them back into core and set zp->z_unlinked.
- */
- error = zfs_zget(zfsvfs, zap.za_first_integer, &zp);
-
- /*
- * We may pick up znodes that are already marked for deletion.
- * This could happen during the purge of an extended attribute
- * directory. All we need to do is skip over them, since they
- * are already in the system marked z_unlinked.
- */
- if (error != 0)
- continue;
-
- zp->z_unlinked = B_TRUE;
-
- /*
- * iput() is Linux's equivalent to illumos' VN_RELE(). It will
- * decrement the inode's ref count and may cause the inode to be
- * synchronously freed. We interrupt freeing of this inode, by
- * checking the return value of dmu_objset_zfs_unmounting() in
- * dmu_free_long_range(), when an unmount is requested.
- */
- iput(ZTOI(zp));
- ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
- }
- zap_cursor_fini(&zc);
-
- zfsvfs->z_draining = B_FALSE;
- zfsvfs->z_drain_task = TASKQID_INVALID;
-}
-
-/*
- * Sets z_draining then tries to dispatch async unlinked drain.
- * If that fails executes synchronous unlinked drain.
- */
-void
-zfs_unlinked_drain(zfsvfs_t *zfsvfs)
-{
- ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
- ASSERT3B(zfsvfs->z_draining, ==, B_FALSE);
-
- zfsvfs->z_draining = B_TRUE;
- zfsvfs->z_drain_cancel = B_FALSE;
-
- zfsvfs->z_drain_task = taskq_dispatch(
- dsl_pool_unlinked_drain_taskq(dmu_objset_pool(zfsvfs->z_os)),
- zfs_unlinked_drain_task, zfsvfs, TQ_SLEEP);
- if (zfsvfs->z_drain_task == TASKQID_INVALID) {
- zfs_dbgmsg("async zfs_unlinked_drain dispatch failed");
- zfs_unlinked_drain_task(zfsvfs);
- }
-}
-
-/*
- * Wait for the unlinked drain taskq task to stop. This will interrupt the
- * unlinked set processing if it is in progress.
- */
-void
-zfs_unlinked_drain_stop_wait(zfsvfs_t *zfsvfs)
-{
- ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
-
- if (zfsvfs->z_draining) {
- zfsvfs->z_drain_cancel = B_TRUE;
- taskq_cancel_id(dsl_pool_unlinked_drain_taskq(
- dmu_objset_pool(zfsvfs->z_os)), zfsvfs->z_drain_task);
- zfsvfs->z_drain_task = TASKQID_INVALID;
- zfsvfs->z_draining = B_FALSE;
- }
-}
-
-/*
- * Delete the entire contents of a directory. Return a count
- * of the number of entries that could not be deleted. If we encounter
- * an error, return a count of at least one so that the directory stays
- * in the unlinked set.
- *
- * NOTE: this function assumes that the directory is inactive,
- * so there is no need to lock its entries before deletion.
- * Also, it assumes the directory contents is *only* regular
- * files.
- */
-static int
-zfs_purgedir(znode_t *dzp)
-{
- zap_cursor_t zc;
- zap_attribute_t zap;
- znode_t *xzp;
- dmu_tx_t *tx;
- zfsvfs_t *zfsvfs = ZTOZSB(dzp);
- zfs_dirlock_t dl;
- int skipped = 0;
- int error;
-
- for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);
- (error = zap_cursor_retrieve(&zc, &zap)) == 0;
- zap_cursor_advance(&zc)) {
- error = zfs_zget(zfsvfs,
- ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp);
- if (error) {
- skipped += 1;
- continue;
- }
-
- ASSERT(S_ISREG(ZTOI(xzp)->i_mode) ||
- S_ISLNK(ZTOI(xzp)->i_mode));
-
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
- dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name);
- dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
- dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
- /* Is this really needed ? */
- zfs_sa_upgrade_txholds(tx, xzp);
- dmu_tx_mark_netfree(tx);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- zfs_iput_async(ZTOI(xzp));
- skipped += 1;
- continue;
- }
- bzero(&dl, sizeof (dl));
- dl.dl_dzp = dzp;
- dl.dl_name = zap.za_name;
-
- error = zfs_link_destroy(&dl, xzp, tx, 0, NULL);
- if (error)
- skipped += 1;
- dmu_tx_commit(tx);
-
- zfs_iput_async(ZTOI(xzp));
- }
- zap_cursor_fini(&zc);
- if (error != ENOENT)
- skipped += 1;
- return (skipped);
-}
-
-void
-zfs_rmnode(znode_t *zp)
-{
- zfsvfs_t *zfsvfs = ZTOZSB(zp);
- objset_t *os = zfsvfs->z_os;
- znode_t *xzp = NULL;
- dmu_tx_t *tx;
- uint64_t acl_obj;
- uint64_t xattr_obj;
- uint64_t links;
- int error;
-
- ASSERT(ZTOI(zp)->i_nlink == 0);
- ASSERT(atomic_read(&ZTOI(zp)->i_count) == 0);
-
- /*
- * If this is an attribute directory, purge its contents.
- */
- if (S_ISDIR(ZTOI(zp)->i_mode) && (zp->z_pflags & ZFS_XATTR)) {
- if (zfs_purgedir(zp) != 0) {
- /*
- * Not enough space to delete some xattrs.
- * Leave it in the unlinked set.
- */
- zfs_znode_dmu_fini(zp);
-
- return;
- }
- }
-
- /*
- * Free up all the data in the file. We don't do this for directories
- * because we need truncate and remove to be in the same tx, like in
- * zfs_znode_delete(). Otherwise, if we crash here we'll end up with
- * an inconsistent truncated zap object in the delete queue. Note a
- * truncated file is harmless since it only contains user data.
- */
- if (S_ISREG(ZTOI(zp)->i_mode)) {
- error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END);
- if (error) {
- /*
- * Not enough space or we were interrupted by unmount.
- * Leave the file in the unlinked set.
- */
- zfs_znode_dmu_fini(zp);
- return;
- }
- }
-
- /*
- * If the file has extended attributes, we're going to unlink
- * the xattr dir.
- */
- error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
- &xattr_obj, sizeof (xattr_obj));
- if (error == 0 && xattr_obj) {
- error = zfs_zget(zfsvfs, xattr_obj, &xzp);
- ASSERT(error == 0);
- }
-
- acl_obj = zfs_external_acl(zp);
-
- /*
- * Set up the final transaction.
- */
- tx = dmu_tx_create(os);
- dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
- dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
- if (xzp) {
- dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL);
- dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
- }
- if (acl_obj)
- dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
-
- zfs_sa_upgrade_txholds(tx, zp);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- /*
- * Not enough space to delete the file. Leave it in the
- * unlinked set, leaking it until the fs is remounted (at
- * which point we'll call zfs_unlinked_drain() to process it).
- */
- dmu_tx_abort(tx);
- zfs_znode_dmu_fini(zp);
- goto out;
- }
-
- if (xzp) {
- ASSERT(error == 0);
- mutex_enter(&xzp->z_lock);
- xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */
- clear_nlink(ZTOI(xzp)); /* no more links to it */
- links = 0;
- VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
- &links, sizeof (links), tx));
- mutex_exit(&xzp->z_lock);
- zfs_unlinked_add(xzp, tx);
- }
-
- /* Remove this znode from the unlinked set */
- VERIFY3U(0, ==,
- zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
-
- dataset_kstats_update_nunlinked_kstat(&zfsvfs->z_kstat, 1);
-
- zfs_znode_delete(zp, tx);
-
- dmu_tx_commit(tx);
-out:
- if (xzp)
- zfs_iput_async(ZTOI(xzp));
-}
-
-static uint64_t
-zfs_dirent(znode_t *zp, uint64_t mode)
-{
- uint64_t de = zp->z_id;
-
- if (ZTOZSB(zp)->z_version >= ZPL_VERSION_DIRENT_TYPE)
- de |= IFTODT(mode) << 60;
- return (de);
-}
-
-/*
- * Link zp into dl. Can fail in the following cases :
- * - if zp has been unlinked.
- * - if the number of entries with the same hash (aka. colliding entries)
- * exceed the capacity of a leaf-block of fatzap and splitting of the
- * leaf-block does not help.
- */
-int
-zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
-{
- znode_t *dzp = dl->dl_dzp;
- zfsvfs_t *zfsvfs = ZTOZSB(zp);
- uint64_t value;
- int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode);
- sa_bulk_attr_t bulk[5];
- uint64_t mtime[2], ctime[2];
- uint64_t links;
- int count = 0;
- int error;
-
- mutex_enter(&zp->z_lock);
-
- if (!(flag & ZRENAMING)) {
- if (zp->z_unlinked) { /* no new links to unlinked zp */
- ASSERT(!(flag & (ZNEW | ZEXISTS)));
- mutex_exit(&zp->z_lock);
- return (SET_ERROR(ENOENT));
- }
- if (!(flag & ZNEW)) {
- /*
- * ZNEW nodes come from zfs_mknode() where the link
- * count has already been initialised
- */
- inc_nlink(ZTOI(zp));
- links = ZTOI(zp)->i_nlink;
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
- NULL, &links, sizeof (links));
- }
- }
-
- value = zfs_dirent(zp, zp->z_mode);
- error = zap_add(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name, 8, 1,
- &value, tx);
-
- /*
- * zap_add could fail to add the entry if it exceeds the capacity of the
- * leaf-block and zap_leaf_split() failed to help.
- * The caller of this routine is responsible for failing the transaction
- * which will rollback the SA updates done above.
- */
- if (error != 0) {
- if (!(flag & ZRENAMING) && !(flag & ZNEW))
- drop_nlink(ZTOI(zp));
- mutex_exit(&zp->z_lock);
- return (error);
- }
-
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
- &dzp->z_id, sizeof (dzp->z_id));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
- &zp->z_pflags, sizeof (zp->z_pflags));
-
- if (!(flag & ZNEW)) {
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
- ctime, sizeof (ctime));
- zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
- ctime);
- }
- error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
- ASSERT(error == 0);
-
- mutex_exit(&zp->z_lock);
-
- mutex_enter(&dzp->z_lock);
- dzp->z_size++;
- if (zp_is_dir)
- inc_nlink(ZTOI(dzp));
- links = ZTOI(dzp)->i_nlink;
- count = 0;
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
- &dzp->z_size, sizeof (dzp->z_size));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
- &links, sizeof (links));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
- mtime, sizeof (mtime));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
- ctime, sizeof (ctime));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
- &dzp->z_pflags, sizeof (dzp->z_pflags));
- zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
- error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
- ASSERT(error == 0);
- mutex_exit(&dzp->z_lock);
-
- return (0);
-}
-
-/*
- * The match type in the code for this function should conform to:
- *
- * ------------------------------------------------------------------------
- * fs type | z_norm | lookup type | match type
- * ---------|-------------|-------------|----------------------------------
- * CS !norm | 0 | 0 | 0 (exact)
- * CS norm | formX | 0 | MT_NORMALIZE
- * CI !norm | upper | !ZCIEXACT | MT_NORMALIZE
- * CI !norm | upper | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE
- * CI norm | upper|formX | !ZCIEXACT | MT_NORMALIZE
- * CI norm | upper|formX | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE
- * CM !norm | upper | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE
- * CM !norm | upper | ZCILOOK | MT_NORMALIZE
- * CM norm | upper|formX | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE
- * CM norm | upper|formX | ZCILOOK | MT_NORMALIZE
- *
- * Abbreviations:
- * CS = Case Sensitive, CI = Case Insensitive, CM = Case Mixed
- * upper = case folding set by fs type on creation (U8_TEXTPREP_TOUPPER)
- * formX = unicode normalization form set on fs creation
- */
-static int
-zfs_dropname(zfs_dirlock_t *dl, znode_t *zp, znode_t *dzp, dmu_tx_t *tx,
- int flag)
-{
- int error;
-
- if (ZTOZSB(zp)->z_norm) {
- matchtype_t mt = MT_NORMALIZE;
-
- if ((ZTOZSB(zp)->z_case == ZFS_CASE_INSENSITIVE &&
- (flag & ZCIEXACT)) ||
- (ZTOZSB(zp)->z_case == ZFS_CASE_MIXED &&
- !(flag & ZCILOOK))) {
- mt |= MT_MATCH_CASE;
- }
-
- error = zap_remove_norm(ZTOZSB(zp)->z_os, dzp->z_id,
- dl->dl_name, mt, tx);
- } else {
- error = zap_remove(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name,
- tx);
- }
-
- return (error);
-}
-
-/*
- * Unlink zp from dl, and mark zp for deletion if this was the last link. Can
- * fail if zp is a mount point (EBUSY) or a non-empty directory (ENOTEMPTY).
- * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list.
- * If it's non-NULL, we use it to indicate whether the znode needs deletion,
- * and it's the caller's job to do it.
- */
-int
-zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
- boolean_t *unlinkedp)
-{
- znode_t *dzp = dl->dl_dzp;
- zfsvfs_t *zfsvfs = ZTOZSB(dzp);
- int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode);
- boolean_t unlinked = B_FALSE;
- sa_bulk_attr_t bulk[5];
- uint64_t mtime[2], ctime[2];
- uint64_t links;
- int count = 0;
- int error;
-
- if (!(flag & ZRENAMING)) {
- mutex_enter(&zp->z_lock);
-
- if (zp_is_dir && !zfs_dirempty(zp)) {
- mutex_exit(&zp->z_lock);
- return (SET_ERROR(ENOTEMPTY));
- }
-
- /*
- * If we get here, we are going to try to remove the object.
- * First try removing the name from the directory; if that
- * fails, return the error.
- */
- error = zfs_dropname(dl, zp, dzp, tx, flag);
- if (error != 0) {
- mutex_exit(&zp->z_lock);
- return (error);
- }
-
- if (ZTOI(zp)->i_nlink <= zp_is_dir) {
- zfs_panic_recover("zfs: link count on %lu is %u, "
- "should be at least %u", zp->z_id,
- (int)ZTOI(zp)->i_nlink, zp_is_dir + 1);
- set_nlink(ZTOI(zp), zp_is_dir + 1);
- }
- drop_nlink(ZTOI(zp));
- if (ZTOI(zp)->i_nlink == zp_is_dir) {
- zp->z_unlinked = B_TRUE;
- clear_nlink(ZTOI(zp));
- unlinked = B_TRUE;
- } else {
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
- NULL, &ctime, sizeof (ctime));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
- NULL, &zp->z_pflags, sizeof (zp->z_pflags));
- zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
- ctime);
- }
- links = ZTOI(zp)->i_nlink;
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
- NULL, &links, sizeof (links));
- error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
- count = 0;
- ASSERT(error == 0);
- mutex_exit(&zp->z_lock);
- } else {
- error = zfs_dropname(dl, zp, dzp, tx, flag);
- if (error != 0)
- return (error);
- }
-
- mutex_enter(&dzp->z_lock);
- dzp->z_size--; /* one dirent removed */
- if (zp_is_dir)
- drop_nlink(ZTOI(dzp)); /* ".." link from zp */
- links = ZTOI(dzp)->i_nlink;
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
- NULL, &links, sizeof (links));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
- NULL, &dzp->z_size, sizeof (dzp->z_size));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
- NULL, ctime, sizeof (ctime));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
- NULL, mtime, sizeof (mtime));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
- NULL, &dzp->z_pflags, sizeof (dzp->z_pflags));
- zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
- error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
- ASSERT(error == 0);
- mutex_exit(&dzp->z_lock);
-
- if (unlinkedp != NULL)
- *unlinkedp = unlinked;
- else if (unlinked)
- zfs_unlinked_add(zp, tx);
-
- return (0);
-}
-
-/*
- * Indicate whether the directory is empty. Works with or without z_lock
- * held, but can only be consider a hint in the latter case. Returns true
- * if only "." and ".." remain and there's no work in progress.
- *
- * The internal ZAP size, rather than zp->z_size, needs to be checked since
- * some consumers (Lustre) do not strictly maintain an accurate SA_ZPL_SIZE.
- */
-boolean_t
-zfs_dirempty(znode_t *dzp)
-{
- zfsvfs_t *zfsvfs = ZTOZSB(dzp);
- uint64_t count;
- int error;
-
- if (dzp->z_dirlocks != NULL)
- return (B_FALSE);
-
- error = zap_count(zfsvfs->z_os, dzp->z_id, &count);
- if (error != 0 || count != 0)
- return (B_FALSE);
-
- return (B_TRUE);
-}
-
-int
-zfs_make_xattrdir(znode_t *zp, vattr_t *vap, struct inode **xipp, cred_t *cr)
-{
- zfsvfs_t *zfsvfs = ZTOZSB(zp);
- znode_t *xzp;
- dmu_tx_t *tx;
- int error;
- zfs_acl_ids_t acl_ids;
- boolean_t fuid_dirtied;
-#ifdef DEBUG
- uint64_t parent;
-#endif
-
- *xipp = NULL;
-
- if ((error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr)))
- return (error);
-
- if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL,
- &acl_ids)) != 0)
- return (error);
- if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zp->z_projid)) {
- zfs_acl_ids_free(&acl_ids);
- return (SET_ERROR(EDQUOT));
- }
-
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
- ZFS_SA_BASE_ATTR_SIZE);
- dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
- dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
- fuid_dirtied = zfsvfs->z_fuid_dirty;
- if (fuid_dirtied)
- zfs_fuid_txhold(zfsvfs, tx);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- zfs_acl_ids_free(&acl_ids);
- dmu_tx_abort(tx);
- return (error);
- }
- zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids);
-
- if (fuid_dirtied)
- zfs_fuid_sync(zfsvfs, tx);
-
-#ifdef DEBUG
- error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
- &parent, sizeof (parent));
- ASSERT(error == 0 && parent == zp->z_id);
-#endif
-
- VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id,
- sizeof (xzp->z_id), tx));
-
- if (!zp->z_unlinked)
- (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp,
- xzp, "", NULL, acl_ids.z_fuidp, vap);
-
- zfs_acl_ids_free(&acl_ids);
- dmu_tx_commit(tx);
-
- *xipp = ZTOI(xzp);
-
- return (0);
-}
-
-/*
- * Return a znode for the extended attribute directory for zp.
- * ** If the directory does not already exist, it is created **
- *
- * IN: zp - znode to obtain attribute directory from
- * cr - credentials of caller
- * flags - flags from the VOP_LOOKUP call
- *
- * OUT: xipp - pointer to extended attribute znode
- *
- * RETURN: 0 on success
- * error number on failure
- */
-int
-zfs_get_xattrdir(znode_t *zp, struct inode **xipp, cred_t *cr, int flags)
-{
- zfsvfs_t *zfsvfs = ZTOZSB(zp);
- znode_t *xzp;
- zfs_dirlock_t *dl;
- vattr_t va;
- int error;
-top:
- error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR, NULL, NULL);
- if (error)
- return (error);
-
- if (xzp != NULL) {
- *xipp = ZTOI(xzp);
- zfs_dirent_unlock(dl);
- return (0);
- }
-
- if (!(flags & CREATE_XATTR_DIR)) {
- zfs_dirent_unlock(dl);
- return (SET_ERROR(ENOENT));
- }
-
- if (zfs_is_readonly(zfsvfs)) {
- zfs_dirent_unlock(dl);
- return (SET_ERROR(EROFS));
- }
-
- /*
- * The ability to 'create' files in an attribute
- * directory comes from the write_xattr permission on the base file.
- *
- * The ability to 'search' an attribute directory requires
- * read_xattr permission on the base file.
- *
- * Once in a directory the ability to read/write attributes
- * is controlled by the permissions on the attribute file.
- */
- va.va_mask = ATTR_MODE | ATTR_UID | ATTR_GID;
- va.va_mode = S_IFDIR | S_ISVTX | 0777;
- zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid);
-
- va.va_dentry = NULL;
- error = zfs_make_xattrdir(zp, &va, xipp, cr);
- zfs_dirent_unlock(dl);
-
- if (error == ERESTART) {
- /* NB: we already did dmu_tx_wait() if necessary */
- goto top;
- }
-
- return (error);
-}
-
-/*
- * Decide whether it is okay to remove within a sticky directory.
- *
- * In sticky directories, write access is not sufficient;
- * you can remove entries from a directory only if:
- *
- * you own the directory,
- * you own the entry,
- * you have write access to the entry,
- * or you are privileged (checked in secpolicy...).
- *
- * The function returns 0 if remove access is granted.
- */
-int
-zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
-{
- uid_t uid;
- uid_t downer;
- uid_t fowner;
- zfsvfs_t *zfsvfs = ZTOZSB(zdp);
-
- if (zfsvfs->z_replay)
- return (0);
-
- if ((zdp->z_mode & S_ISVTX) == 0)
- return (0);
-
- downer = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(ZTOI(zdp)->i_uid),
- cr, ZFS_OWNER);
- fowner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(ZTOI(zp)->i_uid),
- cr, ZFS_OWNER);
-
- if ((uid = crgetuid(cr)) == downer || uid == fowner ||
- zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0)
- return (0);
- else
- return (secpolicy_vnode_remove(cr));
-}
diff --git a/module/zfs/zfs_sysfs.c b/module/zfs/zfs_sysfs.c
deleted file mode 100644
index bb7f3b69a..000000000
--- a/module/zfs/zfs_sysfs.c
+++ /dev/null
@@ -1,661 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2018, 2019 by Delphix. All rights reserved.
- */
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/zfeature.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/zfs_sysfs.h>
-#include <sys/kmem.h>
-#include <sys/fs/zfs.h>
-#include <linux/kobject.h>
-
-#include "zfs_prop.h"
-
-#if !defined(_KERNEL)
-#error kernel builds only
-#endif
-
-/*
- * ZFS Module sysfs support
- *
- * This extends our sysfs '/sys/module/zfs' entry to include feature
- * and property attributes. The primary consumer of this information
- * is user processes, like the zfs CLI, that need to know what the
- * current loaded ZFS module supports. The libzfs binary will consult
- * this information when instantiating the zfs|zpool property tables
- * and the pool features table.
- *
- * The added top-level directories are:
- * /sys/module/zfs
- * ├── features.kernel
- * ├── features.pool
- * ├── properties.dataset
- * └── properties.pool
- *
- * The local interface for the zfs kobjects includes:
- * zfs_kobj_init()
- * zfs_kobj_add()
- * zfs_kobj_release()
- * zfs_kobj_add_attr()
- * zfs_kobj_fini()
- */
-
-/*
- * A zfs_mod_kobj_t represents a zfs kobject under '/sys/module/zfs'
- */
-struct zfs_mod_kobj;
-typedef struct zfs_mod_kobj zfs_mod_kobj_t;
-
-struct zfs_mod_kobj {
- struct kobject zko_kobj;
- struct kobj_type zko_kobj_type;
- struct sysfs_ops zko_sysfs_ops;
- size_t zko_attr_count;
- struct attribute *zko_attr_list; /* allocated */
- struct attribute **zko_default_attrs; /* allocated */
- size_t zko_child_count;
- zfs_mod_kobj_t *zko_children; /* allocated */
-};
-
-#define ATTR_TABLE_SIZE(cnt) (sizeof (struct attribute) * (cnt))
-/* Note +1 for NULL terminator slot */
-#define DEFAULT_ATTR_SIZE(cnt) (sizeof (struct attribute *) * (cnt + 1))
-#define CHILD_TABLE_SIZE(cnt) (sizeof (zfs_mod_kobj_t) * (cnt))
-
-/*
- * These are the top-level kobjects under '/sys/module/zfs/'
- */
-static zfs_mod_kobj_t kernel_features_kobj;
-static zfs_mod_kobj_t pool_features_kobj;
-static zfs_mod_kobj_t dataset_props_kobj;
-static zfs_mod_kobj_t pool_props_kobj;
-
-/*
- * The show function is used to provide the content
- * of an attribute into a PAGE_SIZE buffer.
- */
-typedef ssize_t (*sysfs_show_func)(struct kobject *, struct attribute *,
- char *);
-
-static void
-zfs_kobj_fini(zfs_mod_kobj_t *zkobj)
-{
- /* finalize any child kobjects */
- if (zkobj->zko_child_count != 0) {
- ASSERT(zkobj->zko_children);
- for (int i = 0; i < zkobj->zko_child_count; i++)
- zfs_kobj_fini(&zkobj->zko_children[i]);
- }
-
- /* kobject_put() will call zfs_kobj_release() to release memory */
- kobject_del(&zkobj->zko_kobj);
- kobject_put(&zkobj->zko_kobj);
-}
-
-static void
-zfs_kobj_release(struct kobject *kobj)
-{
- zfs_mod_kobj_t *zkobj = container_of(kobj, zfs_mod_kobj_t, zko_kobj);
-
- if (zkobj->zko_attr_list != NULL) {
- ASSERT3S(zkobj->zko_attr_count, !=, 0);
- kmem_free(zkobj->zko_attr_list,
- ATTR_TABLE_SIZE(zkobj->zko_attr_count));
- zkobj->zko_attr_list = NULL;
- }
-
- if (zkobj->zko_default_attrs != NULL) {
- kmem_free(zkobj->zko_default_attrs,
- DEFAULT_ATTR_SIZE(zkobj->zko_attr_count));
- zkobj->zko_default_attrs = NULL;
- }
-
- if (zkobj->zko_child_count != 0) {
- ASSERT(zkobj->zko_children);
-
- kmem_free(zkobj->zko_children,
- CHILD_TABLE_SIZE(zkobj->zko_child_count));
- zkobj->zko_child_count = 0;
- zkobj->zko_children = NULL;
- }
-
- zkobj->zko_attr_count = 0;
-}
-
-#ifndef sysfs_attr_init
-#define sysfs_attr_init(attr) do {} while (0)
-#endif
-
-static void
-zfs_kobj_add_attr(zfs_mod_kobj_t *zkobj, int attr_num, const char *attr_name)
-{
- VERIFY3U(attr_num, <, zkobj->zko_attr_count);
- ASSERT(zkobj->zko_attr_list);
- ASSERT(zkobj->zko_default_attrs);
-
- zkobj->zko_attr_list[attr_num].name = attr_name;
- zkobj->zko_attr_list[attr_num].mode = 0444;
- zkobj->zko_default_attrs[attr_num] = &zkobj->zko_attr_list[attr_num];
- sysfs_attr_init(&zkobj->zko_attr_list[attr_num]);
-}
-
-static int
-zfs_kobj_init(zfs_mod_kobj_t *zkobj, int attr_cnt, int child_cnt,
- sysfs_show_func show_func)
-{
- /*
- * Initialize object's attributes. Count can be zero.
- */
- if (attr_cnt > 0) {
- zkobj->zko_attr_list = kmem_zalloc(ATTR_TABLE_SIZE(attr_cnt),
- KM_SLEEP);
- if (zkobj->zko_attr_list == NULL)
- return (ENOMEM);
- }
- /* this will always have at least one slot for NULL termination */
- zkobj->zko_default_attrs = kmem_zalloc(DEFAULT_ATTR_SIZE(attr_cnt),
- KM_SLEEP);
- if (zkobj->zko_default_attrs == NULL) {
- if (zkobj->zko_attr_list != NULL) {
- kmem_free(zkobj->zko_attr_list,
- ATTR_TABLE_SIZE(attr_cnt));
- }
- return (ENOMEM);
- }
- zkobj->zko_attr_count = attr_cnt;
- zkobj->zko_kobj_type.default_attrs = zkobj->zko_default_attrs;
-
- if (child_cnt > 0) {
- zkobj->zko_children = kmem_zalloc(CHILD_TABLE_SIZE(child_cnt),
- KM_SLEEP);
- if (zkobj->zko_children == NULL) {
- if (zkobj->zko_default_attrs != NULL) {
- kmem_free(zkobj->zko_default_attrs,
- DEFAULT_ATTR_SIZE(attr_cnt));
- }
- if (zkobj->zko_attr_list != NULL) {
- kmem_free(zkobj->zko_attr_list,
- ATTR_TABLE_SIZE(attr_cnt));
- }
- return (ENOMEM);
- }
- zkobj->zko_child_count = child_cnt;
- }
-
- zkobj->zko_sysfs_ops.show = show_func;
- zkobj->zko_kobj_type.sysfs_ops = &zkobj->zko_sysfs_ops;
- zkobj->zko_kobj_type.release = zfs_kobj_release;
-
- return (0);
-}
-
-static int
-zfs_kobj_add(zfs_mod_kobj_t *zkobj, struct kobject *parent, const char *name)
-{
- /* zko_default_attrs must be NULL terminated */
- ASSERT(zkobj->zko_default_attrs != NULL);
- ASSERT(zkobj->zko_default_attrs[zkobj->zko_attr_count] == NULL);
-
- kobject_init(&zkobj->zko_kobj, &zkobj->zko_kobj_type);
- return (kobject_add(&zkobj->zko_kobj, parent, name));
-}
-
-/*
- * Each zfs property has these common attributes
- */
-static const char *zprop_attrs[] = {
- "type",
- "readonly",
- "setonce",
- "visible",
- "values",
- "default",
- "datasets" /* zfs properties only */
-};
-
-#define ZFS_PROP_ATTR_COUNT ARRAY_SIZE(zprop_attrs)
-#define ZPOOL_PROP_ATTR_COUNT (ZFS_PROP_ATTR_COUNT - 1)
-
-static const char *zprop_types[] = {
- "number",
- "string",
- "index",
-};
-
-typedef struct zfs_type_map {
- zfs_type_t ztm_type;
- const char *ztm_name;
-} zfs_type_map_t;
-
-static zfs_type_map_t type_map[] = {
- {ZFS_TYPE_FILESYSTEM, "filesystem"},
- {ZFS_TYPE_SNAPSHOT, "snapshot"},
- {ZFS_TYPE_VOLUME, "volume"},
- {ZFS_TYPE_BOOKMARK, "bookmark"}
-};
-
-/*
- * Show the content for a zfs property attribute
- */
-static ssize_t
-zprop_sysfs_show(const char *attr_name, const zprop_desc_t *property,
- char *buf, size_t buflen)
-{
- const char *show_str;
- char number[32];
-
- /* For dataset properties list the dataset types that apply */
- if (strcmp(attr_name, "datasets") == 0 &&
- property->pd_types != ZFS_TYPE_POOL) {
- int len = 0;
-
- for (int i = 0; i < ARRAY_SIZE(type_map); i++) {
- if (type_map[i].ztm_type & property->pd_types) {
- len += snprintf(buf + len, buflen - len, "%s ",
- type_map[i].ztm_name);
- }
- }
- len += snprintf(buf + len, buflen - len, "\n");
- return (len);
- }
-
- if (strcmp(attr_name, "type") == 0) {
- show_str = zprop_types[property->pd_proptype];
- } else if (strcmp(attr_name, "readonly") == 0) {
- show_str = property->pd_attr == PROP_READONLY ? "1" : "0";
- } else if (strcmp(attr_name, "setonce") == 0) {
- show_str = property->pd_attr == PROP_ONETIME ? "1" : "0";
- } else if (strcmp(attr_name, "visible") == 0) {
- show_str = property->pd_visible ? "1" : "0";
- } else if (strcmp(attr_name, "values") == 0) {
- show_str = property->pd_values ? property->pd_values : "";
- } else if (strcmp(attr_name, "default") == 0) {
- switch (property->pd_proptype) {
- case PROP_TYPE_NUMBER:
- (void) snprintf(number, sizeof (number), "%llu",
- (u_longlong_t)property->pd_numdefault);
- show_str = number;
- break;
- case PROP_TYPE_STRING:
- show_str = property->pd_strdefault ?
- property->pd_strdefault : "";
- break;
- case PROP_TYPE_INDEX:
- if (zprop_index_to_string(property->pd_propnum,
- property->pd_numdefault, &show_str,
- property->pd_types) != 0) {
- show_str = "";
- }
- break;
- default:
- return (0);
- }
- } else {
- return (0);
- }
-
- return (snprintf(buf, buflen, "%s\n", show_str));
-}
-
-static ssize_t
-dataset_property_show(struct kobject *kobj, struct attribute *attr, char *buf)
-{
- zfs_prop_t prop = zfs_name_to_prop(kobject_name(kobj));
- zprop_desc_t *prop_tbl = zfs_prop_get_table();
- ssize_t len;
-
- ASSERT3U(prop, <, ZFS_NUM_PROPS);
-
- len = zprop_sysfs_show(attr->name, &prop_tbl[prop], buf, PAGE_SIZE);
-
- return (len);
-}
-
-static ssize_t
-pool_property_show(struct kobject *kobj, struct attribute *attr, char *buf)
-{
- zpool_prop_t prop = zpool_name_to_prop(kobject_name(kobj));
- zprop_desc_t *prop_tbl = zpool_prop_get_table();
- ssize_t len;
-
- ASSERT3U(prop, <, ZPOOL_NUM_PROPS);
-
- len = zprop_sysfs_show(attr->name, &prop_tbl[prop], buf, PAGE_SIZE);
-
- return (len);
-}
-
-/*
- * ZFS kernel feature attributes for '/sys/module/zfs/features.kernel'
- *
- * This list is intended for kernel features that don't have a pool feature
- * association or that extend existing user kernel interfaces.
- *
- * A user processes can easily check if the running zfs kernel module
- * supports the new feature.
- */
-static const char *zfs_kernel_features[] = {
- /* --> Add new kernel features here */
- "com.delphix:vdev_initialize",
- "org.zfsonlinux:vdev_trim",
-};
-
-#define KERNEL_FEATURE_COUNT ARRAY_SIZE(zfs_kernel_features)
-
-static ssize_t
-kernel_feature_show(struct kobject *kobj, struct attribute *attr, char *buf)
-{
- if (strcmp(attr->name, "supported") == 0)
- return (snprintf(buf, PAGE_SIZE, "yes\n"));
- return (0);
-}
-
-static void
-kernel_feature_to_kobj(zfs_mod_kobj_t *parent, int slot, const char *name)
-{
- zfs_mod_kobj_t *zfs_kobj = &parent->zko_children[slot];
-
- ASSERT3U(slot, <, KERNEL_FEATURE_COUNT);
- ASSERT(name);
-
- int err = zfs_kobj_init(zfs_kobj, 1, 0, kernel_feature_show);
- if (err)
- return;
-
- zfs_kobj_add_attr(zfs_kobj, 0, "supported");
-
- err = zfs_kobj_add(zfs_kobj, &parent->zko_kobj, name);
- if (err)
- zfs_kobj_release(&zfs_kobj->zko_kobj);
-}
-
-static int
-zfs_kernel_features_init(zfs_mod_kobj_t *zfs_kobj, struct kobject *parent)
-{
- /*
- * Create a parent kobject to host kernel features.
- *
- * '/sys/module/zfs/features.kernel'
- */
- int err = zfs_kobj_init(zfs_kobj, 0, KERNEL_FEATURE_COUNT,
- kernel_feature_show);
- if (err)
- return (err);
- err = zfs_kobj_add(zfs_kobj, parent, ZFS_SYSFS_KERNEL_FEATURES);
- if (err) {
- zfs_kobj_release(&zfs_kobj->zko_kobj);
- return (err);
- }
-
- /*
- * Now create a kobject for each feature.
- *
- * '/sys/module/zfs/features.kernel/<feature>'
- */
- for (int f = 0; f < KERNEL_FEATURE_COUNT; f++)
- kernel_feature_to_kobj(zfs_kobj, f, zfs_kernel_features[f]);
-
- return (0);
-}
-
-/*
- * Each pool feature has these common attributes
- */
-static const char *pool_feature_attrs[] = {
- "description",
- "guid",
- "uname",
- "readonly_compatible",
- "required_for_mos",
- "activate_on_enable",
- "per_dataset"
-};
-
-#define ZPOOL_FEATURE_ATTR_COUNT ARRAY_SIZE(pool_feature_attrs)
-
-/*
- * Show the content for the given zfs pool feature attribute
- */
-static ssize_t
-pool_feature_show(struct kobject *kobj, struct attribute *attr, char *buf)
-{
- spa_feature_t fid;
-
- if (zfeature_lookup_guid(kobject_name(kobj), &fid) != 0)
- return (0);
-
- ASSERT3U(fid, <, SPA_FEATURES);
-
- zfeature_flags_t flags = spa_feature_table[fid].fi_flags;
- const char *show_str = NULL;
-
- if (strcmp(attr->name, "description") == 0) {
- show_str = spa_feature_table[fid].fi_desc;
- } else if (strcmp(attr->name, "guid") == 0) {
- show_str = spa_feature_table[fid].fi_guid;
- } else if (strcmp(attr->name, "uname") == 0) {
- show_str = spa_feature_table[fid].fi_uname;
- } else if (strcmp(attr->name, "readonly_compatible") == 0) {
- show_str = flags & ZFEATURE_FLAG_READONLY_COMPAT ? "1" : "0";
- } else if (strcmp(attr->name, "required_for_mos") == 0) {
- show_str = flags & ZFEATURE_FLAG_MOS ? "1" : "0";
- } else if (strcmp(attr->name, "activate_on_enable") == 0) {
- show_str = flags & ZFEATURE_FLAG_ACTIVATE_ON_ENABLE ? "1" : "0";
- } else if (strcmp(attr->name, "per_dataset") == 0) {
- show_str = flags & ZFEATURE_FLAG_PER_DATASET ? "1" : "0";
- }
- if (show_str == NULL)
- return (0);
-
- return (snprintf(buf, PAGE_SIZE, "%s\n", show_str));
-}
-
-static void
-pool_feature_to_kobj(zfs_mod_kobj_t *parent, spa_feature_t fid,
- const char *name)
-{
- zfs_mod_kobj_t *zfs_kobj = &parent->zko_children[fid];
-
- ASSERT3U(fid, <, SPA_FEATURES);
- ASSERT(name);
-
- int err = zfs_kobj_init(zfs_kobj, ZPOOL_FEATURE_ATTR_COUNT, 0,
- pool_feature_show);
- if (err)
- return;
-
- for (int i = 0; i < ZPOOL_FEATURE_ATTR_COUNT; i++)
- zfs_kobj_add_attr(zfs_kobj, i, pool_feature_attrs[i]);
-
- err = zfs_kobj_add(zfs_kobj, &parent->zko_kobj, name);
- if (err)
- zfs_kobj_release(&zfs_kobj->zko_kobj);
-}
-
-static int
-zfs_pool_features_init(zfs_mod_kobj_t *zfs_kobj, struct kobject *parent)
-{
- /*
- * Create a parent kobject to host pool features.
- *
- * '/sys/module/zfs/features.pool'
- */
- int err = zfs_kobj_init(zfs_kobj, 0, SPA_FEATURES, pool_feature_show);
- if (err)
- return (err);
- err = zfs_kobj_add(zfs_kobj, parent, ZFS_SYSFS_POOL_FEATURES);
- if (err) {
- zfs_kobj_release(&zfs_kobj->zko_kobj);
- return (err);
- }
-
- /*
- * Now create a kobject for each feature.
- *
- * '/sys/module/zfs/features.pool/<feature>'
- */
- for (spa_feature_t i = 0; i < SPA_FEATURES; i++)
- pool_feature_to_kobj(zfs_kobj, i, spa_feature_table[i].fi_guid);
-
- return (0);
-}
-
-typedef struct prop_to_kobj_arg {
- zprop_desc_t *p2k_table;
- zfs_mod_kobj_t *p2k_parent;
- sysfs_show_func p2k_show_func;
- int p2k_attr_count;
-} prop_to_kobj_arg_t;
-
-static int
-zprop_to_kobj(int prop, void *args)
-{
- prop_to_kobj_arg_t *data = args;
- zfs_mod_kobj_t *parent = data->p2k_parent;
- zfs_mod_kobj_t *zfs_kobj = &parent->zko_children[prop];
- const char *name = data->p2k_table[prop].pd_name;
- int err;
-
- ASSERT(name);
-
- err = zfs_kobj_init(zfs_kobj, data->p2k_attr_count, 0,
- data->p2k_show_func);
- if (err)
- return (ZPROP_CONT);
-
- for (int i = 0; i < data->p2k_attr_count; i++)
- zfs_kobj_add_attr(zfs_kobj, i, zprop_attrs[i]);
-
- err = zfs_kobj_add(zfs_kobj, &parent->zko_kobj, name);
- if (err)
- zfs_kobj_release(&zfs_kobj->zko_kobj);
-
- return (ZPROP_CONT);
-}
-
-static int
-zfs_sysfs_properties_init(zfs_mod_kobj_t *zfs_kobj, struct kobject *parent,
- zfs_type_t type)
-{
- prop_to_kobj_arg_t context;
- const char *name;
- int err;
-
- /*
- * Create a parent kobject to host properties.
- *
- * '/sys/module/zfs/properties.<type>'
- */
- if (type == ZFS_TYPE_POOL) {
- name = ZFS_SYSFS_POOL_PROPERTIES;
- context.p2k_table = zpool_prop_get_table();
- context.p2k_attr_count = ZPOOL_PROP_ATTR_COUNT;
- context.p2k_parent = zfs_kobj;
- context.p2k_show_func = pool_property_show;
- err = zfs_kobj_init(zfs_kobj, 0, ZPOOL_NUM_PROPS,
- pool_property_show);
- } else {
- name = ZFS_SYSFS_DATASET_PROPERTIES;
- context.p2k_table = zfs_prop_get_table();
- context.p2k_attr_count = ZFS_PROP_ATTR_COUNT;
- context.p2k_parent = zfs_kobj;
- context.p2k_show_func = dataset_property_show;
- err = zfs_kobj_init(zfs_kobj, 0, ZFS_NUM_PROPS,
- dataset_property_show);
- }
-
- if (err)
- return (err);
-
- err = zfs_kobj_add(zfs_kobj, parent, name);
- if (err) {
- zfs_kobj_release(&zfs_kobj->zko_kobj);
- return (err);
- }
-
- /*
- * Create a kobject for each property.
- *
- * '/sys/module/zfs/properties.<type>/<property>'
- */
- (void) zprop_iter_common(zprop_to_kobj, &context, B_TRUE,
- B_FALSE, type);
-
- return (err);
-}
-
-void
-zfs_sysfs_init(void)
-{
- struct kobject *parent;
-#if defined(CONFIG_ZFS) && !defined(CONFIG_ZFS_MODULE)
- parent = kobject_create_and_add("zfs", fs_kobj);
-#else
- parent = &(((struct module *)(THIS_MODULE))->mkobj).kobj;
-#endif
- int err;
-
- if (parent == NULL)
- return;
-
- err = zfs_kernel_features_init(&kernel_features_kobj, parent);
- if (err)
- return;
-
- err = zfs_pool_features_init(&pool_features_kobj, parent);
- if (err) {
- zfs_kobj_fini(&kernel_features_kobj);
- return;
- }
-
- err = zfs_sysfs_properties_init(&pool_props_kobj, parent,
- ZFS_TYPE_POOL);
- if (err) {
- zfs_kobj_fini(&kernel_features_kobj);
- zfs_kobj_fini(&pool_features_kobj);
- return;
- }
-
- err = zfs_sysfs_properties_init(&dataset_props_kobj, parent,
- ZFS_TYPE_FILESYSTEM);
- if (err) {
- zfs_kobj_fini(&kernel_features_kobj);
- zfs_kobj_fini(&pool_features_kobj);
- zfs_kobj_fini(&pool_props_kobj);
- return;
- }
-}
-
-void
-zfs_sysfs_fini(void)
-{
- /*
- * Remove top-level kobjects; each will remove any children kobjects
- */
- zfs_kobj_fini(&kernel_features_kobj);
- zfs_kobj_fini(&pool_features_kobj);
- zfs_kobj_fini(&dataset_props_kobj);
- zfs_kobj_fini(&pool_props_kobj);
-}
diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c
deleted file mode 100644
index 0914e4b7d..000000000
--- a/module/zfs/zfs_vfsops.c
+++ /dev/null
@@ -1,2562 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
- */
-
-/* Portions Copyright 2010 Robert Milkowski */
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/sysmacros.h>
-#include <sys/kmem.h>
-#include <sys/pathname.h>
-#include <sys/vnode.h>
-#include <sys/vfs.h>
-#include <sys/mntent.h>
-#include <sys/cmn_err.h>
-#include <sys/zfs_znode.h>
-#include <sys/zfs_vnops.h>
-#include <sys/zfs_dir.h>
-#include <sys/zil.h>
-#include <sys/fs/zfs.h>
-#include <sys/dmu.h>
-#include <sys/dsl_prop.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_deleg.h>
-#include <sys/spa.h>
-#include <sys/zap.h>
-#include <sys/sa.h>
-#include <sys/sa_impl.h>
-#include <sys/policy.h>
-#include <sys/atomic.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/zfs_ctldir.h>
-#include <sys/zfs_fuid.h>
-#include <sys/sunddi.h>
-#include <sys/dmu_objset.h>
-#include <sys/spa_boot.h>
-#include <sys/objlist.h>
-#include <sys/zpl.h>
-#include <linux/vfs_compat.h>
-#include "zfs_comutil.h"
-
-enum {
- TOKEN_RO,
- TOKEN_RW,
- TOKEN_SETUID,
- TOKEN_NOSETUID,
- TOKEN_EXEC,
- TOKEN_NOEXEC,
- TOKEN_DEVICES,
- TOKEN_NODEVICES,
- TOKEN_DIRXATTR,
- TOKEN_SAXATTR,
- TOKEN_XATTR,
- TOKEN_NOXATTR,
- TOKEN_ATIME,
- TOKEN_NOATIME,
- TOKEN_RELATIME,
- TOKEN_NORELATIME,
- TOKEN_NBMAND,
- TOKEN_NONBMAND,
- TOKEN_MNTPOINT,
- TOKEN_LAST,
-};
-
-static const match_table_t zpl_tokens = {
- { TOKEN_RO, MNTOPT_RO },
- { TOKEN_RW, MNTOPT_RW },
- { TOKEN_SETUID, MNTOPT_SETUID },
- { TOKEN_NOSETUID, MNTOPT_NOSETUID },
- { TOKEN_EXEC, MNTOPT_EXEC },
- { TOKEN_NOEXEC, MNTOPT_NOEXEC },
- { TOKEN_DEVICES, MNTOPT_DEVICES },
- { TOKEN_NODEVICES, MNTOPT_NODEVICES },
- { TOKEN_DIRXATTR, MNTOPT_DIRXATTR },
- { TOKEN_SAXATTR, MNTOPT_SAXATTR },
- { TOKEN_XATTR, MNTOPT_XATTR },
- { TOKEN_NOXATTR, MNTOPT_NOXATTR },
- { TOKEN_ATIME, MNTOPT_ATIME },
- { TOKEN_NOATIME, MNTOPT_NOATIME },
- { TOKEN_RELATIME, MNTOPT_RELATIME },
- { TOKEN_NORELATIME, MNTOPT_NORELATIME },
- { TOKEN_NBMAND, MNTOPT_NBMAND },
- { TOKEN_NONBMAND, MNTOPT_NONBMAND },
- { TOKEN_MNTPOINT, MNTOPT_MNTPOINT "=%s" },
- { TOKEN_LAST, NULL },
-};
-
-static void
-zfsvfs_vfs_free(vfs_t *vfsp)
-{
- if (vfsp != NULL) {
- if (vfsp->vfs_mntpoint != NULL)
- strfree(vfsp->vfs_mntpoint);
-
- kmem_free(vfsp, sizeof (vfs_t));
- }
-}
-
-static int
-zfsvfs_parse_option(char *option, int token, substring_t *args, vfs_t *vfsp)
-{
- switch (token) {
- case TOKEN_RO:
- vfsp->vfs_readonly = B_TRUE;
- vfsp->vfs_do_readonly = B_TRUE;
- break;
- case TOKEN_RW:
- vfsp->vfs_readonly = B_FALSE;
- vfsp->vfs_do_readonly = B_TRUE;
- break;
- case TOKEN_SETUID:
- vfsp->vfs_setuid = B_TRUE;
- vfsp->vfs_do_setuid = B_TRUE;
- break;
- case TOKEN_NOSETUID:
- vfsp->vfs_setuid = B_FALSE;
- vfsp->vfs_do_setuid = B_TRUE;
- break;
- case TOKEN_EXEC:
- vfsp->vfs_exec = B_TRUE;
- vfsp->vfs_do_exec = B_TRUE;
- break;
- case TOKEN_NOEXEC:
- vfsp->vfs_exec = B_FALSE;
- vfsp->vfs_do_exec = B_TRUE;
- break;
- case TOKEN_DEVICES:
- vfsp->vfs_devices = B_TRUE;
- vfsp->vfs_do_devices = B_TRUE;
- break;
- case TOKEN_NODEVICES:
- vfsp->vfs_devices = B_FALSE;
- vfsp->vfs_do_devices = B_TRUE;
- break;
- case TOKEN_DIRXATTR:
- vfsp->vfs_xattr = ZFS_XATTR_DIR;
- vfsp->vfs_do_xattr = B_TRUE;
- break;
- case TOKEN_SAXATTR:
- vfsp->vfs_xattr = ZFS_XATTR_SA;
- vfsp->vfs_do_xattr = B_TRUE;
- break;
- case TOKEN_XATTR:
- vfsp->vfs_xattr = ZFS_XATTR_DIR;
- vfsp->vfs_do_xattr = B_TRUE;
- break;
- case TOKEN_NOXATTR:
- vfsp->vfs_xattr = ZFS_XATTR_OFF;
- vfsp->vfs_do_xattr = B_TRUE;
- break;
- case TOKEN_ATIME:
- vfsp->vfs_atime = B_TRUE;
- vfsp->vfs_do_atime = B_TRUE;
- break;
- case TOKEN_NOATIME:
- vfsp->vfs_atime = B_FALSE;
- vfsp->vfs_do_atime = B_TRUE;
- break;
- case TOKEN_RELATIME:
- vfsp->vfs_relatime = B_TRUE;
- vfsp->vfs_do_relatime = B_TRUE;
- break;
- case TOKEN_NORELATIME:
- vfsp->vfs_relatime = B_FALSE;
- vfsp->vfs_do_relatime = B_TRUE;
- break;
- case TOKEN_NBMAND:
- vfsp->vfs_nbmand = B_TRUE;
- vfsp->vfs_do_nbmand = B_TRUE;
- break;
- case TOKEN_NONBMAND:
- vfsp->vfs_nbmand = B_FALSE;
- vfsp->vfs_do_nbmand = B_TRUE;
- break;
- case TOKEN_MNTPOINT:
- vfsp->vfs_mntpoint = match_strdup(&args[0]);
- if (vfsp->vfs_mntpoint == NULL)
- return (SET_ERROR(ENOMEM));
-
- break;
- default:
- break;
- }
-
- return (0);
-}
-
-/*
- * Parse the raw mntopts and return a vfs_t describing the options.
- */
-static int
-zfsvfs_parse_options(char *mntopts, vfs_t **vfsp)
-{
- vfs_t *tmp_vfsp;
- int error;
-
- tmp_vfsp = kmem_zalloc(sizeof (vfs_t), KM_SLEEP);
-
- if (mntopts != NULL) {
- substring_t args[MAX_OPT_ARGS];
- char *tmp_mntopts, *p, *t;
- int token;
-
- tmp_mntopts = t = strdup(mntopts);
- if (tmp_mntopts == NULL)
- return (SET_ERROR(ENOMEM));
-
- while ((p = strsep(&t, ",")) != NULL) {
- if (!*p)
- continue;
-
- args[0].to = args[0].from = NULL;
- token = match_token(p, zpl_tokens, args);
- error = zfsvfs_parse_option(p, token, args, tmp_vfsp);
- if (error) {
- strfree(tmp_mntopts);
- zfsvfs_vfs_free(tmp_vfsp);
- return (error);
- }
- }
-
- strfree(tmp_mntopts);
- }
-
- *vfsp = tmp_vfsp;
-
- return (0);
-}
-
-boolean_t
-zfs_is_readonly(zfsvfs_t *zfsvfs)
-{
- return (!!(zfsvfs->z_sb->s_flags & SB_RDONLY));
-}
-
-/*ARGSUSED*/
-int
-zfs_sync(struct super_block *sb, int wait, cred_t *cr)
-{
- zfsvfs_t *zfsvfs = sb->s_fs_info;
-
- /*
- * Semantically, the only requirement is that the sync be initiated.
- * The DMU syncs out txgs frequently, so there's nothing to do.
- */
- if (!wait)
- return (0);
-
- if (zfsvfs != NULL) {
- /*
- * Sync a specific filesystem.
- */
- dsl_pool_t *dp;
-
- ZFS_ENTER(zfsvfs);
- dp = dmu_objset_pool(zfsvfs->z_os);
-
- /*
- * If the system is shutting down, then skip any
- * filesystems which may exist on a suspended pool.
- */
- if (spa_suspended(dp->dp_spa)) {
- ZFS_EXIT(zfsvfs);
- return (0);
- }
-
- if (zfsvfs->z_log != NULL)
- zil_commit(zfsvfs->z_log, 0);
-
- ZFS_EXIT(zfsvfs);
- } else {
- /*
- * Sync all ZFS filesystems. This is what happens when you
- * run sync(1M). Unlike other filesystems, ZFS honors the
- * request by waiting for all pools to commit all dirty data.
- */
- spa_sync_allpools();
- }
-
- return (0);
-}
-
-static void
-atime_changed_cb(void *arg, uint64_t newval)
-{
- zfsvfs_t *zfsvfs = arg;
- struct super_block *sb = zfsvfs->z_sb;
-
- if (sb == NULL)
- return;
- /*
- * Update SB_NOATIME bit in VFS super block. Since atime update is
- * determined by atime_needs_update(), atime_needs_update() needs to
- * return false if atime is turned off, and not unconditionally return
- * false if atime is turned on.
- */
- if (newval)
- sb->s_flags &= ~SB_NOATIME;
- else
- sb->s_flags |= SB_NOATIME;
-}
-
-static void
-relatime_changed_cb(void *arg, uint64_t newval)
-{
- ((zfsvfs_t *)arg)->z_relatime = newval;
-}
-
-static void
-xattr_changed_cb(void *arg, uint64_t newval)
-{
- zfsvfs_t *zfsvfs = arg;
-
- if (newval == ZFS_XATTR_OFF) {
- zfsvfs->z_flags &= ~ZSB_XATTR;
- } else {
- zfsvfs->z_flags |= ZSB_XATTR;
-
- if (newval == ZFS_XATTR_SA)
- zfsvfs->z_xattr_sa = B_TRUE;
- else
- zfsvfs->z_xattr_sa = B_FALSE;
- }
-}
-
-static void
-acltype_changed_cb(void *arg, uint64_t newval)
-{
- zfsvfs_t *zfsvfs = arg;
-
- switch (newval) {
- case ZFS_ACLTYPE_OFF:
- zfsvfs->z_acl_type = ZFS_ACLTYPE_OFF;
- zfsvfs->z_sb->s_flags &= ~SB_POSIXACL;
- break;
- case ZFS_ACLTYPE_POSIXACL:
-#ifdef CONFIG_FS_POSIX_ACL
- zfsvfs->z_acl_type = ZFS_ACLTYPE_POSIXACL;
- zfsvfs->z_sb->s_flags |= SB_POSIXACL;
-#else
- zfsvfs->z_acl_type = ZFS_ACLTYPE_OFF;
- zfsvfs->z_sb->s_flags &= ~SB_POSIXACL;
-#endif /* CONFIG_FS_POSIX_ACL */
- break;
- default:
- break;
- }
-}
-
-static void
-blksz_changed_cb(void *arg, uint64_t newval)
-{
- zfsvfs_t *zfsvfs = arg;
- ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
- ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
- ASSERT(ISP2(newval));
-
- zfsvfs->z_max_blksz = newval;
-}
-
-static void
-readonly_changed_cb(void *arg, uint64_t newval)
-{
- zfsvfs_t *zfsvfs = arg;
- struct super_block *sb = zfsvfs->z_sb;
-
- if (sb == NULL)
- return;
-
- if (newval)
- sb->s_flags |= SB_RDONLY;
- else
- sb->s_flags &= ~SB_RDONLY;
-}
-
-static void
-devices_changed_cb(void *arg, uint64_t newval)
-{
-}
-
-static void
-setuid_changed_cb(void *arg, uint64_t newval)
-{
-}
-
-static void
-exec_changed_cb(void *arg, uint64_t newval)
-{
-}
-
-static void
-nbmand_changed_cb(void *arg, uint64_t newval)
-{
- zfsvfs_t *zfsvfs = arg;
- struct super_block *sb = zfsvfs->z_sb;
-
- if (sb == NULL)
- return;
-
- if (newval == TRUE)
- sb->s_flags |= SB_MANDLOCK;
- else
- sb->s_flags &= ~SB_MANDLOCK;
-}
-
-static void
-snapdir_changed_cb(void *arg, uint64_t newval)
-{
- ((zfsvfs_t *)arg)->z_show_ctldir = newval;
-}
-
-static void
-vscan_changed_cb(void *arg, uint64_t newval)
-{
- ((zfsvfs_t *)arg)->z_vscan = newval;
-}
-
-static void
-acl_inherit_changed_cb(void *arg, uint64_t newval)
-{
- ((zfsvfs_t *)arg)->z_acl_inherit = newval;
-}
-
-static int
-zfs_register_callbacks(vfs_t *vfsp)
-{
- struct dsl_dataset *ds = NULL;
- objset_t *os = NULL;
- zfsvfs_t *zfsvfs = NULL;
- int error = 0;
-
- ASSERT(vfsp);
- zfsvfs = vfsp->vfs_data;
- ASSERT(zfsvfs);
- os = zfsvfs->z_os;
-
- /*
- * The act of registering our callbacks will destroy any mount
- * options we may have. In order to enable temporary overrides
- * of mount options, we stash away the current values and
- * restore them after we register the callbacks.
- */
- if (zfs_is_readonly(zfsvfs) || !spa_writeable(dmu_objset_spa(os))) {
- vfsp->vfs_do_readonly = B_TRUE;
- vfsp->vfs_readonly = B_TRUE;
- }
-
- /*
- * Register property callbacks.
- *
- * It would probably be fine to just check for i/o error from
- * the first prop_register(), but I guess I like to go
- * overboard...
- */
- ds = dmu_objset_ds(os);
- dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
- error = dsl_prop_register(ds,
- zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- zfs_prop_to_name(ZFS_PROP_RELATIME), relatime_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- zfs_prop_to_name(ZFS_PROP_ACLTYPE), acltype_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
- zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- zfs_prop_to_name(ZFS_PROP_NBMAND), nbmand_changed_cb, zfsvfs);
- dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
- if (error)
- goto unregister;
-
- /*
- * Invoke our callbacks to restore temporary mount options.
- */
- if (vfsp->vfs_do_readonly)
- readonly_changed_cb(zfsvfs, vfsp->vfs_readonly);
- if (vfsp->vfs_do_setuid)
- setuid_changed_cb(zfsvfs, vfsp->vfs_setuid);
- if (vfsp->vfs_do_exec)
- exec_changed_cb(zfsvfs, vfsp->vfs_exec);
- if (vfsp->vfs_do_devices)
- devices_changed_cb(zfsvfs, vfsp->vfs_devices);
- if (vfsp->vfs_do_xattr)
- xattr_changed_cb(zfsvfs, vfsp->vfs_xattr);
- if (vfsp->vfs_do_atime)
- atime_changed_cb(zfsvfs, vfsp->vfs_atime);
- if (vfsp->vfs_do_relatime)
- relatime_changed_cb(zfsvfs, vfsp->vfs_relatime);
- if (vfsp->vfs_do_nbmand)
- nbmand_changed_cb(zfsvfs, vfsp->vfs_nbmand);
-
- return (0);
-
-unregister:
- dsl_prop_unregister_all(ds, zfsvfs);
- return (error);
-}
-
-static int
-zfs_space_delta_cb(dmu_object_type_t bonustype, void *data,
- uint64_t *userp, uint64_t *groupp, uint64_t *projectp)
-{
- sa_hdr_phys_t sa;
- sa_hdr_phys_t *sap = data;
- uint64_t flags;
- int hdrsize;
- boolean_t swap = B_FALSE;
-
- /*
- * Is it a valid type of object to track?
- */
- if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA)
- return (SET_ERROR(ENOENT));
-
- /*
- * If we have a NULL data pointer
- * then assume the id's aren't changing and
- * return EEXIST to the dmu to let it know to
- * use the same ids
- */
- if (data == NULL)
- return (SET_ERROR(EEXIST));
-
- if (bonustype == DMU_OT_ZNODE) {
- znode_phys_t *znp = data;
- *userp = znp->zp_uid;
- *groupp = znp->zp_gid;
- *projectp = ZFS_DEFAULT_PROJID;
- return (0);
- }
-
- if (sap->sa_magic == 0) {
- /*
- * This should only happen for newly created files
- * that haven't had the znode data filled in yet.
- */
- *userp = 0;
- *groupp = 0;
- *projectp = ZFS_DEFAULT_PROJID;
- return (0);
- }
-
- sa = *sap;
- if (sa.sa_magic == BSWAP_32(SA_MAGIC)) {
- sa.sa_magic = SA_MAGIC;
- sa.sa_layout_info = BSWAP_16(sa.sa_layout_info);
- swap = B_TRUE;
- } else {
- VERIFY3U(sa.sa_magic, ==, SA_MAGIC);
- }
-
- hdrsize = sa_hdrsize(&sa);
- VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t));
-
- *userp = *((uint64_t *)((uintptr_t)data + hdrsize + SA_UID_OFFSET));
- *groupp = *((uint64_t *)((uintptr_t)data + hdrsize + SA_GID_OFFSET));
- flags = *((uint64_t *)((uintptr_t)data + hdrsize + SA_FLAGS_OFFSET));
- if (swap)
- flags = BSWAP_64(flags);
-
- if (flags & ZFS_PROJID)
- *projectp = *((uint64_t *)((uintptr_t)data + hdrsize +
- SA_PROJID_OFFSET));
- else
- *projectp = ZFS_DEFAULT_PROJID;
-
- if (swap) {
- *userp = BSWAP_64(*userp);
- *groupp = BSWAP_64(*groupp);
- *projectp = BSWAP_64(*projectp);
- }
- return (0);
-}
-
-static void
-fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr,
- char *domainbuf, int buflen, uid_t *ridp)
-{
- uint64_t fuid;
- const char *domain;
-
- fuid = zfs_strtonum(fuidstr, NULL);
-
- domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid));
- if (domain)
- (void) strlcpy(domainbuf, domain, buflen);
- else
- domainbuf[0] = '\0';
- *ridp = FUID_RID(fuid);
-}
-
-static uint64_t
-zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type)
-{
- switch (type) {
- case ZFS_PROP_USERUSED:
- case ZFS_PROP_USEROBJUSED:
- return (DMU_USERUSED_OBJECT);
- case ZFS_PROP_GROUPUSED:
- case ZFS_PROP_GROUPOBJUSED:
- return (DMU_GROUPUSED_OBJECT);
- case ZFS_PROP_PROJECTUSED:
- case ZFS_PROP_PROJECTOBJUSED:
- return (DMU_PROJECTUSED_OBJECT);
- case ZFS_PROP_USERQUOTA:
- return (zfsvfs->z_userquota_obj);
- case ZFS_PROP_GROUPQUOTA:
- return (zfsvfs->z_groupquota_obj);
- case ZFS_PROP_USEROBJQUOTA:
- return (zfsvfs->z_userobjquota_obj);
- case ZFS_PROP_GROUPOBJQUOTA:
- return (zfsvfs->z_groupobjquota_obj);
- case ZFS_PROP_PROJECTQUOTA:
- return (zfsvfs->z_projectquota_obj);
- case ZFS_PROP_PROJECTOBJQUOTA:
- return (zfsvfs->z_projectobjquota_obj);
- default:
- return (ZFS_NO_OBJECT);
- }
-}
-
-int
-zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
- uint64_t *cookiep, void *vbuf, uint64_t *bufsizep)
-{
- int error;
- zap_cursor_t zc;
- zap_attribute_t za;
- zfs_useracct_t *buf = vbuf;
- uint64_t obj;
- int offset = 0;
-
- if (!dmu_objset_userspace_present(zfsvfs->z_os))
- return (SET_ERROR(ENOTSUP));
-
- if ((type == ZFS_PROP_PROJECTQUOTA || type == ZFS_PROP_PROJECTUSED ||
- type == ZFS_PROP_PROJECTOBJQUOTA ||
- type == ZFS_PROP_PROJECTOBJUSED) &&
- !dmu_objset_projectquota_present(zfsvfs->z_os))
- return (SET_ERROR(ENOTSUP));
-
- if ((type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED ||
- type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_GROUPOBJQUOTA ||
- type == ZFS_PROP_PROJECTOBJUSED ||
- type == ZFS_PROP_PROJECTOBJQUOTA) &&
- !dmu_objset_userobjspace_present(zfsvfs->z_os))
- return (SET_ERROR(ENOTSUP));
-
- obj = zfs_userquota_prop_to_obj(zfsvfs, type);
- if (obj == ZFS_NO_OBJECT) {
- *bufsizep = 0;
- return (0);
- }
-
- if (type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED ||
- type == ZFS_PROP_PROJECTOBJUSED)
- offset = DMU_OBJACCT_PREFIX_LEN;
-
- for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep);
- (error = zap_cursor_retrieve(&zc, &za)) == 0;
- zap_cursor_advance(&zc)) {
- if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) >
- *bufsizep)
- break;
-
- /*
- * skip object quota (with zap name prefix DMU_OBJACCT_PREFIX)
- * when dealing with block quota and vice versa.
- */
- if ((offset > 0) != (strncmp(za.za_name, DMU_OBJACCT_PREFIX,
- DMU_OBJACCT_PREFIX_LEN) == 0))
- continue;
-
- fuidstr_to_sid(zfsvfs, za.za_name + offset,
- buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid);
-
- buf->zu_space = za.za_first_integer;
- buf++;
- }
- if (error == ENOENT)
- error = 0;
-
- ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep);
- *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf;
- *cookiep = zap_cursor_serialize(&zc);
- zap_cursor_fini(&zc);
- return (error);
-}
-
-/*
- * buf must be big enough (eg, 32 bytes)
- */
-static int
-id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid,
- char *buf, boolean_t addok)
-{
- uint64_t fuid;
- int domainid = 0;
-
- if (domain && domain[0]) {
- domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok);
- if (domainid == -1)
- return (SET_ERROR(ENOENT));
- }
- fuid = FUID_ENCODE(domainid, rid);
- (void) sprintf(buf, "%llx", (longlong_t)fuid);
- return (0);
-}
-
-int
-zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
- const char *domain, uint64_t rid, uint64_t *valp)
-{
- char buf[20 + DMU_OBJACCT_PREFIX_LEN];
- int offset = 0;
- int err;
- uint64_t obj;
-
- *valp = 0;
-
- if (!dmu_objset_userspace_present(zfsvfs->z_os))
- return (SET_ERROR(ENOTSUP));
-
- if ((type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED ||
- type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_GROUPOBJQUOTA ||
- type == ZFS_PROP_PROJECTOBJUSED ||
- type == ZFS_PROP_PROJECTOBJQUOTA) &&
- !dmu_objset_userobjspace_present(zfsvfs->z_os))
- return (SET_ERROR(ENOTSUP));
-
- if (type == ZFS_PROP_PROJECTQUOTA || type == ZFS_PROP_PROJECTUSED ||
- type == ZFS_PROP_PROJECTOBJQUOTA ||
- type == ZFS_PROP_PROJECTOBJUSED) {
- if (!dmu_objset_projectquota_present(zfsvfs->z_os))
- return (SET_ERROR(ENOTSUP));
- if (!zpl_is_valid_projid(rid))
- return (SET_ERROR(EINVAL));
- }
-
- obj = zfs_userquota_prop_to_obj(zfsvfs, type);
- if (obj == ZFS_NO_OBJECT)
- return (0);
-
- if (type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED ||
- type == ZFS_PROP_PROJECTOBJUSED) {
- strlcpy(buf, DMU_OBJACCT_PREFIX, DMU_OBJACCT_PREFIX_LEN + 1);
- offset = DMU_OBJACCT_PREFIX_LEN;
- }
-
- err = id_to_fuidstr(zfsvfs, domain, rid, buf + offset, B_FALSE);
- if (err)
- return (err);
-
- err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp);
- if (err == ENOENT)
- err = 0;
- return (err);
-}
-
-int
-zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
- const char *domain, uint64_t rid, uint64_t quota)
-{
- char buf[32];
- int err;
- dmu_tx_t *tx;
- uint64_t *objp;
- boolean_t fuid_dirtied;
-
- if (zfsvfs->z_version < ZPL_VERSION_USERSPACE)
- return (SET_ERROR(ENOTSUP));
-
- switch (type) {
- case ZFS_PROP_USERQUOTA:
- objp = &zfsvfs->z_userquota_obj;
- break;
- case ZFS_PROP_GROUPQUOTA:
- objp = &zfsvfs->z_groupquota_obj;
- break;
- case ZFS_PROP_USEROBJQUOTA:
- objp = &zfsvfs->z_userobjquota_obj;
- break;
- case ZFS_PROP_GROUPOBJQUOTA:
- objp = &zfsvfs->z_groupobjquota_obj;
- break;
- case ZFS_PROP_PROJECTQUOTA:
- if (!dmu_objset_projectquota_enabled(zfsvfs->z_os))
- return (SET_ERROR(ENOTSUP));
- if (!zpl_is_valid_projid(rid))
- return (SET_ERROR(EINVAL));
-
- objp = &zfsvfs->z_projectquota_obj;
- break;
- case ZFS_PROP_PROJECTOBJQUOTA:
- if (!dmu_objset_projectquota_enabled(zfsvfs->z_os))
- return (SET_ERROR(ENOTSUP));
- if (!zpl_is_valid_projid(rid))
- return (SET_ERROR(EINVAL));
-
- objp = &zfsvfs->z_projectobjquota_obj;
- break;
- default:
- return (SET_ERROR(EINVAL));
- }
-
- err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE);
- if (err)
- return (err);
- fuid_dirtied = zfsvfs->z_fuid_dirty;
-
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL);
- if (*objp == 0) {
- dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
- zfs_userquota_prop_prefixes[type]);
- }
- if (fuid_dirtied)
- zfs_fuid_txhold(zfsvfs, tx);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err) {
- dmu_tx_abort(tx);
- return (err);
- }
-
- mutex_enter(&zfsvfs->z_lock);
- if (*objp == 0) {
- *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA,
- DMU_OT_NONE, 0, tx);
- VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
- zfs_userquota_prop_prefixes[type], 8, 1, objp, tx));
- }
- mutex_exit(&zfsvfs->z_lock);
-
- if (quota == 0) {
- err = zap_remove(zfsvfs->z_os, *objp, buf, tx);
- if (err == ENOENT)
- err = 0;
- } else {
- err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx);
- }
- ASSERT(err == 0);
- if (fuid_dirtied)
- zfs_fuid_sync(zfsvfs, tx);
- dmu_tx_commit(tx);
- return (err);
-}
-
-boolean_t
-zfs_id_overobjquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id)
-{
- char buf[20 + DMU_OBJACCT_PREFIX_LEN];
- uint64_t used, quota, quotaobj;
- int err;
-
- if (!dmu_objset_userobjspace_present(zfsvfs->z_os)) {
- if (dmu_objset_userobjspace_upgradable(zfsvfs->z_os)) {
- dsl_pool_config_enter(
- dmu_objset_pool(zfsvfs->z_os), FTAG);
- dmu_objset_id_quota_upgrade(zfsvfs->z_os);
- dsl_pool_config_exit(
- dmu_objset_pool(zfsvfs->z_os), FTAG);
- }
- return (B_FALSE);
- }
-
- if (usedobj == DMU_PROJECTUSED_OBJECT) {
- if (!dmu_objset_projectquota_present(zfsvfs->z_os)) {
- if (dmu_objset_projectquota_upgradable(zfsvfs->z_os)) {
- dsl_pool_config_enter(
- dmu_objset_pool(zfsvfs->z_os), FTAG);
- dmu_objset_id_quota_upgrade(zfsvfs->z_os);
- dsl_pool_config_exit(
- dmu_objset_pool(zfsvfs->z_os), FTAG);
- }
- return (B_FALSE);
- }
- quotaobj = zfsvfs->z_projectobjquota_obj;
- } else if (usedobj == DMU_USERUSED_OBJECT) {
- quotaobj = zfsvfs->z_userobjquota_obj;
- } else if (usedobj == DMU_GROUPUSED_OBJECT) {
- quotaobj = zfsvfs->z_groupobjquota_obj;
- } else {
- return (B_FALSE);
- }
- if (quotaobj == 0 || zfsvfs->z_replay)
- return (B_FALSE);
-
- (void) sprintf(buf, "%llx", (longlong_t)id);
- err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota);
- if (err != 0)
- return (B_FALSE);
-
- (void) sprintf(buf, DMU_OBJACCT_PREFIX "%llx", (longlong_t)id);
- err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used);
- if (err != 0)
- return (B_FALSE);
- return (used >= quota);
-}
-
-boolean_t
-zfs_id_overblockquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id)
-{
- char buf[20];
- uint64_t used, quota, quotaobj;
- int err;
-
- if (usedobj == DMU_PROJECTUSED_OBJECT) {
- if (!dmu_objset_projectquota_present(zfsvfs->z_os)) {
- if (dmu_objset_projectquota_upgradable(zfsvfs->z_os)) {
- dsl_pool_config_enter(
- dmu_objset_pool(zfsvfs->z_os), FTAG);
- dmu_objset_id_quota_upgrade(zfsvfs->z_os);
- dsl_pool_config_exit(
- dmu_objset_pool(zfsvfs->z_os), FTAG);
- }
- return (B_FALSE);
- }
- quotaobj = zfsvfs->z_projectquota_obj;
- } else if (usedobj == DMU_USERUSED_OBJECT) {
- quotaobj = zfsvfs->z_userquota_obj;
- } else if (usedobj == DMU_GROUPUSED_OBJECT) {
- quotaobj = zfsvfs->z_groupquota_obj;
- } else {
- return (B_FALSE);
- }
- if (quotaobj == 0 || zfsvfs->z_replay)
- return (B_FALSE);
-
- (void) sprintf(buf, "%llx", (longlong_t)id);
- err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota);
- if (err != 0)
- return (B_FALSE);
-
- err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used);
- if (err != 0)
- return (B_FALSE);
- return (used >= quota);
-}
-
-boolean_t
-zfs_id_overquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id)
-{
- return (zfs_id_overblockquota(zfsvfs, usedobj, id) ||
- zfs_id_overobjquota(zfsvfs, usedobj, id));
-}
-
-/*
- * Associate this zfsvfs with the given objset, which must be owned.
- * This will cache a bunch of on-disk state from the objset in the
- * zfsvfs.
- */
-static int
-zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
-{
- int error;
- uint64_t val;
-
- zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
- zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
- zfsvfs->z_os = os;
-
- error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
- if (error != 0)
- return (error);
- if (zfsvfs->z_version >
- zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
- (void) printk("Can't mount a version %lld file system "
- "on a version %lld pool\n. Pool must be upgraded to mount "
- "this file system.\n", (u_longlong_t)zfsvfs->z_version,
- (u_longlong_t)spa_version(dmu_objset_spa(os)));
- return (SET_ERROR(ENOTSUP));
- }
- error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
- if (error != 0)
- return (error);
- zfsvfs->z_norm = (int)val;
-
- error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
- if (error != 0)
- return (error);
- zfsvfs->z_utf8 = (val != 0);
-
- error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
- if (error != 0)
- return (error);
- zfsvfs->z_case = (uint_t)val;
-
- if ((error = zfs_get_zplprop(os, ZFS_PROP_ACLTYPE, &val)) != 0)
- return (error);
- zfsvfs->z_acl_type = (uint_t)val;
-
- /*
- * Fold case on file systems that are always or sometimes case
- * insensitive.
- */
- if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
- zfsvfs->z_case == ZFS_CASE_MIXED)
- zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
-
- zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
- zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
-
- uint64_t sa_obj = 0;
- if (zfsvfs->z_use_sa) {
- /* should either have both of these objects or none */
- error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
- &sa_obj);
- if (error != 0)
- return (error);
-
- error = zfs_get_zplprop(os, ZFS_PROP_XATTR, &val);
- if ((error == 0) && (val == ZFS_XATTR_SA))
- zfsvfs->z_xattr_sa = B_TRUE;
- }
-
- error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
- &zfsvfs->z_root);
- if (error != 0)
- return (error);
- ASSERT(zfsvfs->z_root != 0);
-
- error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
- &zfsvfs->z_unlinkedobj);
- if (error != 0)
- return (error);
-
- error = zap_lookup(os, MASTER_NODE_OBJ,
- zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
- 8, 1, &zfsvfs->z_userquota_obj);
- if (error == ENOENT)
- zfsvfs->z_userquota_obj = 0;
- else if (error != 0)
- return (error);
-
- error = zap_lookup(os, MASTER_NODE_OBJ,
- zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
- 8, 1, &zfsvfs->z_groupquota_obj);
- if (error == ENOENT)
- zfsvfs->z_groupquota_obj = 0;
- else if (error != 0)
- return (error);
-
- error = zap_lookup(os, MASTER_NODE_OBJ,
- zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA],
- 8, 1, &zfsvfs->z_projectquota_obj);
- if (error == ENOENT)
- zfsvfs->z_projectquota_obj = 0;
- else if (error != 0)
- return (error);
-
- error = zap_lookup(os, MASTER_NODE_OBJ,
- zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA],
- 8, 1, &zfsvfs->z_userobjquota_obj);
- if (error == ENOENT)
- zfsvfs->z_userobjquota_obj = 0;
- else if (error != 0)
- return (error);
-
- error = zap_lookup(os, MASTER_NODE_OBJ,
- zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA],
- 8, 1, &zfsvfs->z_groupobjquota_obj);
- if (error == ENOENT)
- zfsvfs->z_groupobjquota_obj = 0;
- else if (error != 0)
- return (error);
-
- error = zap_lookup(os, MASTER_NODE_OBJ,
- zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA],
- 8, 1, &zfsvfs->z_projectobjquota_obj);
- if (error == ENOENT)
- zfsvfs->z_projectobjquota_obj = 0;
- else if (error != 0)
- return (error);
-
- error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
- &zfsvfs->z_fuid_obj);
- if (error == ENOENT)
- zfsvfs->z_fuid_obj = 0;
- else if (error != 0)
- return (error);
-
- error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
- &zfsvfs->z_shares_dir);
- if (error == ENOENT)
- zfsvfs->z_shares_dir = 0;
- else if (error != 0)
- return (error);
-
- error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
- &zfsvfs->z_attr_table);
- if (error != 0)
- return (error);
-
- if (zfsvfs->z_version >= ZPL_VERSION_SA)
- sa_register_update_callback(os, zfs_sa_upgrade);
-
- return (0);
-}
-
-int
-zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp)
-{
- objset_t *os;
- zfsvfs_t *zfsvfs;
- int error;
- boolean_t ro = (readonly || (strchr(osname, '@') != NULL));
-
- zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
-
- error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs, &os);
- if (error != 0) {
- kmem_free(zfsvfs, sizeof (zfsvfs_t));
- return (error);
- }
-
- error = zfsvfs_create_impl(zfvp, zfsvfs, os);
- if (error != 0) {
- dmu_objset_disown(os, B_TRUE, zfsvfs);
- }
- return (error);
-}
-
-
-/*
- * Note: zfsvfs is assumed to be malloc'd, and will be freed by this function
- * on a failure. Do not pass in a statically allocated zfsvfs.
- */
-int
-zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
-{
- int error;
-
- zfsvfs->z_vfs = NULL;
- zfsvfs->z_sb = NULL;
- zfsvfs->z_parent = zfsvfs;
-
- mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
- list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
- offsetof(znode_t, z_link_node));
- rrm_init(&zfsvfs->z_teardown_lock, B_FALSE);
- rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
- rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
-
- int size = MIN(1 << (highbit64(zfs_object_mutex_size) - 1),
- ZFS_OBJ_MTX_MAX);
- zfsvfs->z_hold_size = size;
- zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size,
- KM_SLEEP);
- zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP);
- for (int i = 0; i != size; i++) {
- avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare,
- sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node));
- mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL);
- }
-
- error = zfsvfs_init(zfsvfs, os);
- if (error != 0) {
- *zfvp = NULL;
- zfsvfs_free(zfsvfs);
- return (error);
- }
-
- zfsvfs->z_drain_task = TASKQID_INVALID;
- zfsvfs->z_draining = B_FALSE;
- zfsvfs->z_drain_cancel = B_TRUE;
-
- *zfvp = zfsvfs;
- return (0);
-}
-
-static int
-zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
-{
- int error;
- boolean_t readonly = zfs_is_readonly(zfsvfs);
-
- error = zfs_register_callbacks(zfsvfs->z_vfs);
- if (error)
- return (error);
-
- zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
-
- /*
- * If we are not mounting (ie: online recv), then we don't
- * have to worry about replaying the log as we blocked all
- * operations out since we closed the ZIL.
- */
- if (mounting) {
- ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL);
- dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);
-
- /*
- * During replay we remove the read only flag to
- * allow replays to succeed.
- */
- if (readonly != 0) {
- readonly_changed_cb(zfsvfs, B_FALSE);
- } else {
- zap_stats_t zs;
- if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
- &zs) == 0) {
- dataset_kstats_update_nunlinks_kstat(
- &zfsvfs->z_kstat, zs.zs_num_entries);
- }
- dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
- "num_entries in unlinked set: %llu",
- zs.zs_num_entries);
- zfs_unlinked_drain(zfsvfs);
- }
-
- /*
- * Parse and replay the intent log.
- *
- * Because of ziltest, this must be done after
- * zfs_unlinked_drain(). (Further note: ziltest
- * doesn't use readonly mounts, where
- * zfs_unlinked_drain() isn't called.) This is because
- * ziltest causes spa_sync() to think it's committed,
- * but actually it is not, so the intent log contains
- * many txg's worth of changes.
- *
- * In particular, if object N is in the unlinked set in
- * the last txg to actually sync, then it could be
- * actually freed in a later txg and then reallocated
- * in a yet later txg. This would write a "create
- * object N" record to the intent log. Normally, this
- * would be fine because the spa_sync() would have
- * written out the fact that object N is free, before
- * we could write the "create object N" intent log
- * record.
- *
- * But when we are in ziltest mode, we advance the "open
- * txg" without actually spa_sync()-ing the changes to
- * disk. So we would see that object N is still
- * allocated and in the unlinked set, and there is an
- * intent log record saying to allocate it.
- */
- if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
- if (zil_replay_disable) {
- zil_destroy(zfsvfs->z_log, B_FALSE);
- } else {
- zfsvfs->z_replay = B_TRUE;
- zil_replay(zfsvfs->z_os, zfsvfs,
- zfs_replay_vector);
- zfsvfs->z_replay = B_FALSE;
- }
- }
-
- /* restore readonly bit */
- if (readonly != 0)
- readonly_changed_cb(zfsvfs, B_TRUE);
- }
-
- /*
- * Set the objset user_ptr to track its zfsvfs.
- */
- mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
- dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
- mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
-
- return (0);
-}
-
-void
-zfsvfs_free(zfsvfs_t *zfsvfs)
-{
- int i, size = zfsvfs->z_hold_size;
-
- zfs_fuid_destroy(zfsvfs);
-
- mutex_destroy(&zfsvfs->z_znodes_lock);
- mutex_destroy(&zfsvfs->z_lock);
- list_destroy(&zfsvfs->z_all_znodes);
- rrm_destroy(&zfsvfs->z_teardown_lock);
- rw_destroy(&zfsvfs->z_teardown_inactive_lock);
- rw_destroy(&zfsvfs->z_fuid_lock);
- for (i = 0; i != size; i++) {
- avl_destroy(&zfsvfs->z_hold_trees[i]);
- mutex_destroy(&zfsvfs->z_hold_locks[i]);
- }
- vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size);
- vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size);
- zfsvfs_vfs_free(zfsvfs->z_vfs);
- dataset_kstats_destroy(&zfsvfs->z_kstat);
- kmem_free(zfsvfs, sizeof (zfsvfs_t));
-}
-
-static void
-zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
-{
- zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
- zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
-}
-
-void
-zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
-{
- objset_t *os = zfsvfs->z_os;
-
- if (!dmu_objset_is_snapshot(os))
- dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
-}
-
-#ifdef HAVE_MLSLABEL
-/*
- * Check that the hex label string is appropriate for the dataset being
- * mounted into the global_zone proper.
- *
- * Return an error if the hex label string is not default or
- * admin_low/admin_high. For admin_low labels, the corresponding
- * dataset must be readonly.
- */
-int
-zfs_check_global_label(const char *dsname, const char *hexsl)
-{
- if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
- return (0);
- if (strcasecmp(hexsl, ADMIN_HIGH) == 0)
- return (0);
- if (strcasecmp(hexsl, ADMIN_LOW) == 0) {
- /* must be readonly */
- uint64_t rdonly;
-
- if (dsl_prop_get_integer(dsname,
- zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL))
- return (SET_ERROR(EACCES));
- return (rdonly ? 0 : EACCES);
- }
- return (SET_ERROR(EACCES));
-}
-#endif /* HAVE_MLSLABEL */
-
-static int
-zfs_statfs_project(zfsvfs_t *zfsvfs, znode_t *zp, struct kstatfs *statp,
- uint32_t bshift)
-{
- char buf[20 + DMU_OBJACCT_PREFIX_LEN];
- uint64_t offset = DMU_OBJACCT_PREFIX_LEN;
- uint64_t quota;
- uint64_t used;
- int err;
-
- strlcpy(buf, DMU_OBJACCT_PREFIX, DMU_OBJACCT_PREFIX_LEN + 1);
- err = id_to_fuidstr(zfsvfs, NULL, zp->z_projid, buf + offset, B_FALSE);
- if (err)
- return (err);
-
- if (zfsvfs->z_projectquota_obj == 0)
- goto objs;
-
- err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectquota_obj,
- buf + offset, 8, 1, &quota);
- if (err == ENOENT)
- goto objs;
- else if (err)
- return (err);
-
- err = zap_lookup(zfsvfs->z_os, DMU_PROJECTUSED_OBJECT,
- buf + offset, 8, 1, &used);
- if (unlikely(err == ENOENT)) {
- uint32_t blksize;
- u_longlong_t nblocks;
-
- /*
- * Quota accounting is async, so it is possible race case.
- * There is at least one object with the given project ID.
- */
- sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
- if (unlikely(zp->z_blksz == 0))
- blksize = zfsvfs->z_max_blksz;
-
- used = blksize * nblocks;
- } else if (err) {
- return (err);
- }
-
- statp->f_blocks = quota >> bshift;
- statp->f_bfree = (quota > used) ? ((quota - used) >> bshift) : 0;
- statp->f_bavail = statp->f_bfree;
-
-objs:
- if (zfsvfs->z_projectobjquota_obj == 0)
- return (0);
-
- err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectobjquota_obj,
- buf + offset, 8, 1, &quota);
- if (err == ENOENT)
- return (0);
- else if (err)
- return (err);
-
- err = zap_lookup(zfsvfs->z_os, DMU_PROJECTUSED_OBJECT,
- buf, 8, 1, &used);
- if (unlikely(err == ENOENT)) {
- /*
- * Quota accounting is async, so it is possible race case.
- * There is at least one object with the given project ID.
- */
- used = 1;
- } else if (err) {
- return (err);
- }
-
- statp->f_files = quota;
- statp->f_ffree = (quota > used) ? (quota - used) : 0;
-
- return (0);
-}
-
-int
-zfs_statvfs(struct dentry *dentry, struct kstatfs *statp)
-{
- zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info;
- uint64_t refdbytes, availbytes, usedobjs, availobjs;
- int err = 0;
-
- ZFS_ENTER(zfsvfs);
-
- dmu_objset_space(zfsvfs->z_os,
- &refdbytes, &availbytes, &usedobjs, &availobjs);
-
- uint64_t fsid = dmu_objset_fsid_guid(zfsvfs->z_os);
- /*
- * The underlying storage pool actually uses multiple block
- * size. Under Solaris frsize (fragment size) is reported as
- * the smallest block size we support, and bsize (block size)
- * as the filesystem's maximum block size. Unfortunately,
- * under Linux the fragment size and block size are often used
- * interchangeably. Thus we are forced to report both of them
- * as the filesystem's maximum block size.
- */
- statp->f_frsize = zfsvfs->z_max_blksz;
- statp->f_bsize = zfsvfs->z_max_blksz;
- uint32_t bshift = fls(statp->f_bsize) - 1;
-
- /*
- * The following report "total" blocks of various kinds in
- * the file system, but reported in terms of f_bsize - the
- * "preferred" size.
- */
-
- /* Round up so we never have a filesystem using 0 blocks. */
- refdbytes = P2ROUNDUP(refdbytes, statp->f_bsize);
- statp->f_blocks = (refdbytes + availbytes) >> bshift;
- statp->f_bfree = availbytes >> bshift;
- statp->f_bavail = statp->f_bfree; /* no root reservation */
-
- /*
- * statvfs() should really be called statufs(), because it assumes
- * static metadata. ZFS doesn't preallocate files, so the best
- * we can do is report the max that could possibly fit in f_files,
- * and that minus the number actually used in f_ffree.
- * For f_ffree, report the smaller of the number of objects available
- * and the number of blocks (each object will take at least a block).
- */
- statp->f_ffree = MIN(availobjs, availbytes >> DNODE_SHIFT);
- statp->f_files = statp->f_ffree + usedobjs;
- statp->f_fsid.val[0] = (uint32_t)fsid;
- statp->f_fsid.val[1] = (uint32_t)(fsid >> 32);
- statp->f_type = ZFS_SUPER_MAGIC;
- statp->f_namelen = MAXNAMELEN - 1;
-
- /*
- * We have all of 40 characters to stuff a string here.
- * Is there anything useful we could/should provide?
- */
- bzero(statp->f_spare, sizeof (statp->f_spare));
-
- if (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
- dmu_objset_projectquota_present(zfsvfs->z_os)) {
- znode_t *zp = ITOZ(dentry->d_inode);
-
- if (zp->z_pflags & ZFS_PROJINHERIT && zp->z_projid &&
- zpl_is_valid_projid(zp->z_projid))
- err = zfs_statfs_project(zfsvfs, zp, statp, bshift);
- }
-
- ZFS_EXIT(zfsvfs);
- return (err);
-}
-
-int
-zfs_root(zfsvfs_t *zfsvfs, struct inode **ipp)
-{
- znode_t *rootzp;
- int error;
-
- ZFS_ENTER(zfsvfs);
-
- error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
- if (error == 0)
- *ipp = ZTOI(rootzp);
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-#ifdef HAVE_D_PRUNE_ALIASES
-/*
- * Linux kernels older than 3.1 do not support a per-filesystem shrinker.
- * To accommodate this we must improvise and manually walk the list of znodes
- * attempting to prune dentries in order to be able to drop the inodes.
- *
- * To avoid scanning the same znodes multiple times they are always rotated
- * to the end of the z_all_znodes list. New znodes are inserted at the
- * end of the list so we're always scanning the oldest znodes first.
- */
-static int
-zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan)
-{
- znode_t **zp_array, *zp;
- int max_array = MIN(nr_to_scan, PAGE_SIZE * 8 / sizeof (znode_t *));
- int objects = 0;
- int i = 0, j = 0;
-
- zp_array = kmem_zalloc(max_array * sizeof (znode_t *), KM_SLEEP);
-
- mutex_enter(&zfsvfs->z_znodes_lock);
- while ((zp = list_head(&zfsvfs->z_all_znodes)) != NULL) {
-
- if ((i++ > nr_to_scan) || (j >= max_array))
- break;
-
- ASSERT(list_link_active(&zp->z_link_node));
- list_remove(&zfsvfs->z_all_znodes, zp);
- list_insert_tail(&zfsvfs->z_all_znodes, zp);
-
- /* Skip active znodes and .zfs entries */
- if (MUTEX_HELD(&zp->z_lock) || zp->z_is_ctldir)
- continue;
-
- if (igrab(ZTOI(zp)) == NULL)
- continue;
-
- zp_array[j] = zp;
- j++;
- }
- mutex_exit(&zfsvfs->z_znodes_lock);
-
- for (i = 0; i < j; i++) {
- zp = zp_array[i];
-
- ASSERT3P(zp, !=, NULL);
- d_prune_aliases(ZTOI(zp));
-
- if (atomic_read(&ZTOI(zp)->i_count) == 1)
- objects++;
-
- iput(ZTOI(zp));
- }
-
- kmem_free(zp_array, max_array * sizeof (znode_t *));
-
- return (objects);
-}
-#endif /* HAVE_D_PRUNE_ALIASES */
-
-/*
- * The ARC has requested that the filesystem drop entries from the dentry
- * and inode caches. This can occur when the ARC needs to free meta data
- * blocks but can't because they are all pinned by entries in these caches.
- */
-int
-zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects)
-{
- zfsvfs_t *zfsvfs = sb->s_fs_info;
- int error = 0;
-#if defined(HAVE_SHRINK) || defined(HAVE_SPLIT_SHRINKER_CALLBACK)
- struct shrinker *shrinker = &sb->s_shrink;
- struct shrink_control sc = {
- .nr_to_scan = nr_to_scan,
- .gfp_mask = GFP_KERNEL,
- };
-#endif
-
- ZFS_ENTER(zfsvfs);
-
-#if defined(HAVE_SPLIT_SHRINKER_CALLBACK) && \
- defined(SHRINK_CONTROL_HAS_NID) && \
- defined(SHRINKER_NUMA_AWARE)
- if (sb->s_shrink.flags & SHRINKER_NUMA_AWARE) {
- *objects = 0;
- for_each_online_node(sc.nid) {
- *objects += (*shrinker->scan_objects)(shrinker, &sc);
- }
- } else {
- *objects = (*shrinker->scan_objects)(shrinker, &sc);
- }
-
-#elif defined(HAVE_SPLIT_SHRINKER_CALLBACK)
- *objects = (*shrinker->scan_objects)(shrinker, &sc);
-#elif defined(HAVE_SHRINK)
- *objects = (*shrinker->shrink)(shrinker, &sc);
-#elif defined(HAVE_D_PRUNE_ALIASES)
-#define D_PRUNE_ALIASES_IS_DEFAULT
- *objects = zfs_prune_aliases(zfsvfs, nr_to_scan);
-#else
-#error "No available dentry and inode cache pruning mechanism."
-#endif
-
-#if defined(HAVE_D_PRUNE_ALIASES) && !defined(D_PRUNE_ALIASES_IS_DEFAULT)
-#undef D_PRUNE_ALIASES_IS_DEFAULT
- /*
- * Fall back to zfs_prune_aliases if the kernel's per-superblock
- * shrinker couldn't free anything, possibly due to the inodes being
- * allocated in a different memcg.
- */
- if (*objects == 0)
- *objects = zfs_prune_aliases(zfsvfs, nr_to_scan);
-#endif
-
- ZFS_EXIT(zfsvfs);
-
- dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
- "pruning, nr_to_scan=%lu objects=%d error=%d\n",
- nr_to_scan, *objects, error);
-
- return (error);
-}
-
-/*
- * Teardown the zfsvfs_t.
- *
- * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
- * and 'z_teardown_inactive_lock' held.
- */
-static int
-zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
-{
- znode_t *zp;
-
- zfs_unlinked_drain_stop_wait(zfsvfs);
-
- /*
- * If someone has not already unmounted this file system,
- * drain the iput_taskq to ensure all active references to the
- * zfsvfs_t have been handled only then can it be safely destroyed.
- */
- if (zfsvfs->z_os) {
- /*
- * If we're unmounting we have to wait for the list to
- * drain completely.
- *
- * If we're not unmounting there's no guarantee the list
- * will drain completely, but iputs run from the taskq
- * may add the parents of dir-based xattrs to the taskq
- * so we want to wait for these.
- *
- * We can safely read z_nr_znodes without locking because the
- * VFS has already blocked operations which add to the
- * z_all_znodes list and thus increment z_nr_znodes.
- */
- int round = 0;
- while (zfsvfs->z_nr_znodes > 0) {
- taskq_wait_outstanding(dsl_pool_iput_taskq(
- dmu_objset_pool(zfsvfs->z_os)), 0);
- if (++round > 1 && !unmounting)
- break;
- }
- }
-
- rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
-
- if (!unmounting) {
- /*
- * We purge the parent filesystem's super block as the
- * parent filesystem and all of its snapshots have their
- * inode's super block set to the parent's filesystem's
- * super block. Note, 'z_parent' is self referential
- * for non-snapshots.
- */
- shrink_dcache_sb(zfsvfs->z_parent->z_sb);
- }
-
- /*
- * Close the zil. NB: Can't close the zil while zfs_inactive
- * threads are blocked as zil_close can call zfs_inactive.
- */
- if (zfsvfs->z_log) {
- zil_close(zfsvfs->z_log);
- zfsvfs->z_log = NULL;
- }
-
- rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);
-
- /*
- * If we are not unmounting (ie: online recv) and someone already
- * unmounted this file system while we were doing the switcheroo,
- * or a reopen of z_os failed then just bail out now.
- */
- if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
- rw_exit(&zfsvfs->z_teardown_inactive_lock);
- rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
- return (SET_ERROR(EIO));
- }
-
- /*
- * At this point there are no VFS ops active, and any new VFS ops
- * will fail with EIO since we have z_teardown_lock for writer (only
- * relevant for forced unmount).
- *
- * Release all holds on dbufs. We also grab an extra reference to all
- * the remaining inodes so that the kernel does not attempt to free
- * any inodes of a suspended fs. This can cause deadlocks since the
- * zfs_resume_fs() process may involve starting threads, which might
- * attempt to free unreferenced inodes to free up memory for the new
- * thread.
- */
- if (!unmounting) {
- mutex_enter(&zfsvfs->z_znodes_lock);
- for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
- zp = list_next(&zfsvfs->z_all_znodes, zp)) {
- if (zp->z_sa_hdl)
- zfs_znode_dmu_fini(zp);
- if (igrab(ZTOI(zp)) != NULL)
- zp->z_suspended = B_TRUE;
-
- }
- mutex_exit(&zfsvfs->z_znodes_lock);
- }
-
- /*
- * If we are unmounting, set the unmounted flag and let new VFS ops
- * unblock. zfs_inactive will have the unmounted behavior, and all
- * other VFS ops will fail with EIO.
- */
- if (unmounting) {
- zfsvfs->z_unmounted = B_TRUE;
- rw_exit(&zfsvfs->z_teardown_inactive_lock);
- rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
- }
-
- /*
- * z_os will be NULL if there was an error in attempting to reopen
- * zfsvfs, so just return as the properties had already been
- *
- * unregistered and cached data had been evicted before.
- */
- if (zfsvfs->z_os == NULL)
- return (0);
-
- /*
- * Unregister properties.
- */
- zfs_unregister_callbacks(zfsvfs);
-
- /*
- * Evict cached data. We must write out any dirty data before
- * disowning the dataset.
- */
- objset_t *os = zfsvfs->z_os;
- boolean_t os_dirty = B_FALSE;
- for (int t = 0; t < TXG_SIZE; t++) {
- if (dmu_objset_is_dirty(os, t)) {
- os_dirty = B_TRUE;
- break;
- }
- }
- if (!zfs_is_readonly(zfsvfs) && os_dirty) {
- txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
- }
- dmu_objset_evict_dbufs(zfsvfs->z_os);
-
- return (0);
-}
-
-#if !defined(HAVE_2ARGS_BDI_SETUP_AND_REGISTER) && \
- !defined(HAVE_3ARGS_BDI_SETUP_AND_REGISTER)
-atomic_long_t zfs_bdi_seq = ATOMIC_LONG_INIT(0);
-#endif
-
-int
-zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent)
-{
- const char *osname = zm->mnt_osname;
- struct inode *root_inode;
- uint64_t recordsize;
- int error = 0;
- zfsvfs_t *zfsvfs = NULL;
- vfs_t *vfs = NULL;
-
- ASSERT(zm);
- ASSERT(osname);
-
- error = zfsvfs_parse_options(zm->mnt_data, &vfs);
- if (error)
- return (error);
-
- error = zfsvfs_create(osname, vfs->vfs_readonly, &zfsvfs);
- if (error) {
- zfsvfs_vfs_free(vfs);
- goto out;
- }
-
- if ((error = dsl_prop_get_integer(osname, "recordsize",
- &recordsize, NULL))) {
- zfsvfs_vfs_free(vfs);
- goto out;
- }
-
- vfs->vfs_data = zfsvfs;
- zfsvfs->z_vfs = vfs;
- zfsvfs->z_sb = sb;
- sb->s_fs_info = zfsvfs;
- sb->s_magic = ZFS_SUPER_MAGIC;
- sb->s_maxbytes = MAX_LFS_FILESIZE;
- sb->s_time_gran = 1;
- sb->s_blocksize = recordsize;
- sb->s_blocksize_bits = ilog2(recordsize);
-
- error = -zpl_bdi_setup(sb, "zfs");
- if (error)
- goto out;
-
- sb->s_bdi->ra_pages = 0;
-
- /* Set callback operations for the file system. */
- sb->s_op = &zpl_super_operations;
- sb->s_xattr = zpl_xattr_handlers;
- sb->s_export_op = &zpl_export_operations;
-#ifdef HAVE_S_D_OP
- sb->s_d_op = &zpl_dentry_operations;
-#endif /* HAVE_S_D_OP */
-
- /* Set features for file system. */
- zfs_set_fuid_feature(zfsvfs);
-
- if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
- uint64_t pval;
-
- atime_changed_cb(zfsvfs, B_FALSE);
- readonly_changed_cb(zfsvfs, B_TRUE);
- if ((error = dsl_prop_get_integer(osname,
- "xattr", &pval, NULL)))
- goto out;
- xattr_changed_cb(zfsvfs, pval);
- if ((error = dsl_prop_get_integer(osname,
- "acltype", &pval, NULL)))
- goto out;
- acltype_changed_cb(zfsvfs, pval);
- zfsvfs->z_issnap = B_TRUE;
- zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;
- zfsvfs->z_snap_defer_time = jiffies;
-
- mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
- dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
- mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
- } else {
- if ((error = zfsvfs_setup(zfsvfs, B_TRUE)))
- goto out;
- }
-
- /* Allocate a root inode for the filesystem. */
- error = zfs_root(zfsvfs, &root_inode);
- if (error) {
- (void) zfs_umount(sb);
- goto out;
- }
-
- /* Allocate a root dentry for the filesystem */
- sb->s_root = d_make_root(root_inode);
- if (sb->s_root == NULL) {
- (void) zfs_umount(sb);
- error = SET_ERROR(ENOMEM);
- goto out;
- }
-
- if (!zfsvfs->z_issnap)
- zfsctl_create(zfsvfs);
-
- zfsvfs->z_arc_prune = arc_add_prune_callback(zpl_prune_sb, sb);
-out:
- if (error) {
- if (zfsvfs != NULL) {
- dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs);
- zfsvfs_free(zfsvfs);
- }
- /*
- * make sure we don't have dangling sb->s_fs_info which
- * zfs_preumount will use.
- */
- sb->s_fs_info = NULL;
- }
-
- return (error);
-}
-
-/*
- * Called when an unmount is requested and certain sanity checks have
- * already passed. At this point no dentries or inodes have been reclaimed
- * from their respective caches. We drop the extra reference on the .zfs
- * control directory to allow everything to be reclaimed. All snapshots
- * must already have been unmounted to reach this point.
- */
-void
-zfs_preumount(struct super_block *sb)
-{
- zfsvfs_t *zfsvfs = sb->s_fs_info;
-
- /* zfsvfs is NULL when zfs_domount fails during mount */
- if (zfsvfs) {
- zfs_unlinked_drain_stop_wait(zfsvfs);
- zfsctl_destroy(sb->s_fs_info);
- /*
- * Wait for iput_async before entering evict_inodes in
- * generic_shutdown_super. The reason we must finish before
- * evict_inodes is when lazytime is on, or when zfs_purgedir
- * calls zfs_zget, iput would bump i_count from 0 to 1. This
- * would race with the i_count check in evict_inodes. This means
- * it could destroy the inode while we are still using it.
- *
- * We wait for two passes. xattr directories in the first pass
- * may add xattr entries in zfs_purgedir, so in the second pass
- * we wait for them. We don't use taskq_wait here because it is
- * a pool wide taskq. Other mounted filesystems can constantly
- * do iput_async and there's no guarantee when taskq will be
- * empty.
- */
- taskq_wait_outstanding(dsl_pool_iput_taskq(
- dmu_objset_pool(zfsvfs->z_os)), 0);
- taskq_wait_outstanding(dsl_pool_iput_taskq(
- dmu_objset_pool(zfsvfs->z_os)), 0);
- }
-}
-
-/*
- * Called once all other unmount released tear down has occurred.
- * It is our responsibility to release any remaining infrastructure.
- */
-/*ARGSUSED*/
-int
-zfs_umount(struct super_block *sb)
-{
- zfsvfs_t *zfsvfs = sb->s_fs_info;
- objset_t *os;
-
- if (zfsvfs->z_arc_prune != NULL)
- arc_remove_prune_callback(zfsvfs->z_arc_prune);
- VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
- os = zfsvfs->z_os;
- zpl_bdi_destroy(sb);
-
- /*
- * z_os will be NULL if there was an error in
- * attempting to reopen zfsvfs.
- */
- if (os != NULL) {
- /*
- * Unset the objset user_ptr.
- */
- mutex_enter(&os->os_user_ptr_lock);
- dmu_objset_set_user(os, NULL);
- mutex_exit(&os->os_user_ptr_lock);
-
- /*
- * Finally release the objset
- */
- dmu_objset_disown(os, B_TRUE, zfsvfs);
- }
-
- zfsvfs_free(zfsvfs);
- return (0);
-}
-
-int
-zfs_remount(struct super_block *sb, int *flags, zfs_mnt_t *zm)
-{
- zfsvfs_t *zfsvfs = sb->s_fs_info;
- vfs_t *vfsp;
- boolean_t issnap = dmu_objset_is_snapshot(zfsvfs->z_os);
- int error;
-
- if ((issnap || !spa_writeable(dmu_objset_spa(zfsvfs->z_os))) &&
- !(*flags & SB_RDONLY)) {
- *flags |= SB_RDONLY;
- return (EROFS);
- }
-
- error = zfsvfs_parse_options(zm->mnt_data, &vfsp);
- if (error)
- return (error);
-
- if (!zfs_is_readonly(zfsvfs) && (*flags & SB_RDONLY))
- txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
-
- zfs_unregister_callbacks(zfsvfs);
- zfsvfs_vfs_free(zfsvfs->z_vfs);
-
- vfsp->vfs_data = zfsvfs;
- zfsvfs->z_vfs = vfsp;
- if (!issnap)
- (void) zfs_register_callbacks(vfsp);
-
- return (error);
-}
-
-int
-zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp)
-{
- zfsvfs_t *zfsvfs = sb->s_fs_info;
- znode_t *zp;
- uint64_t object = 0;
- uint64_t fid_gen = 0;
- uint64_t gen_mask;
- uint64_t zp_gen;
- int i, err;
-
- *ipp = NULL;
-
- if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
- zfid_short_t *zfid = (zfid_short_t *)fidp;
-
- for (i = 0; i < sizeof (zfid->zf_object); i++)
- object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
-
- for (i = 0; i < sizeof (zfid->zf_gen); i++)
- fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
- } else {
- return (SET_ERROR(EINVAL));
- }
-
- /* LONG_FID_LEN means snapdirs */
- if (fidp->fid_len == LONG_FID_LEN) {
- zfid_long_t *zlfid = (zfid_long_t *)fidp;
- uint64_t objsetid = 0;
- uint64_t setgen = 0;
-
- for (i = 0; i < sizeof (zlfid->zf_setid); i++)
- objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
-
- for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
- setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
-
- if (objsetid != ZFSCTL_INO_SNAPDIRS - object) {
- dprintf("snapdir fid: objsetid (%llu) != "
- "ZFSCTL_INO_SNAPDIRS (%llu) - object (%llu)\n",
- objsetid, ZFSCTL_INO_SNAPDIRS, object);
-
- return (SET_ERROR(EINVAL));
- }
-
- if (fid_gen > 1 || setgen != 0) {
- dprintf("snapdir fid: fid_gen (%llu) and setgen "
- "(%llu)\n", fid_gen, setgen);
- return (SET_ERROR(EINVAL));
- }
-
- return (zfsctl_snapdir_vget(sb, objsetid, fid_gen, ipp));
- }
-
- ZFS_ENTER(zfsvfs);
- /* A zero fid_gen means we are in the .zfs control directories */
- if (fid_gen == 0 &&
- (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
- *ipp = zfsvfs->z_ctldir;
- ASSERT(*ipp != NULL);
- if (object == ZFSCTL_INO_SNAPDIR) {
- VERIFY(zfsctl_root_lookup(*ipp, "snapshot", ipp,
- 0, kcred, NULL, NULL) == 0);
- } else {
- igrab(*ipp);
- }
- ZFS_EXIT(zfsvfs);
- return (0);
- }
-
- gen_mask = -1ULL >> (64 - 8 * i);
-
- dprintf("getting %llu [%llu mask %llx]\n", object, fid_gen, gen_mask);
- if ((err = zfs_zget(zfsvfs, object, &zp))) {
- ZFS_EXIT(zfsvfs);
- return (err);
- }
-
- /* Don't export xattr stuff */
- if (zp->z_pflags & ZFS_XATTR) {
- iput(ZTOI(zp));
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(ENOENT));
- }
-
- (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
- sizeof (uint64_t));
- zp_gen = zp_gen & gen_mask;
- if (zp_gen == 0)
- zp_gen = 1;
- if ((fid_gen == 0) && (zfsvfs->z_root == object))
- fid_gen = zp_gen;
- if (zp->z_unlinked || zp_gen != fid_gen) {
- dprintf("znode gen (%llu) != fid gen (%llu)\n", zp_gen,
- fid_gen);
- iput(ZTOI(zp));
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(ENOENT));
- }
-
- *ipp = ZTOI(zp);
- if (*ipp)
- zfs_inode_update(ITOZ(*ipp));
-
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
-/*
- * Block out VFS ops and close zfsvfs_t
- *
- * Note, if successful, then we return with the 'z_teardown_lock' and
- * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying
- * dataset and objset intact so that they can be atomically handed off during
- * a subsequent rollback or recv operation and the resume thereafter.
- */
-int
-zfs_suspend_fs(zfsvfs_t *zfsvfs)
-{
- int error;
-
- if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
- return (error);
-
- return (0);
-}
-
-/*
- * Rebuild SA and release VOPs. Note that ownership of the underlying dataset
- * is an invariant across any of the operations that can be performed while the
- * filesystem was suspended. Whether it succeeded or failed, the preconditions
- * are the same: the relevant objset and associated dataset are owned by
- * zfsvfs, held, and long held on entry.
- */
-int
-zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
-{
- int err, err2;
- znode_t *zp;
-
- ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock));
- ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
-
- /*
- * We already own this, so just update the objset_t, as the one we
- * had before may have been evicted.
- */
- objset_t *os;
- VERIFY3P(ds->ds_owner, ==, zfsvfs);
- VERIFY(dsl_dataset_long_held(ds));
- VERIFY0(dmu_objset_from_ds(ds, &os));
-
- err = zfsvfs_init(zfsvfs, os);
- if (err != 0)
- goto bail;
-
- VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
-
- zfs_set_fuid_feature(zfsvfs);
- zfsvfs->z_rollback_time = jiffies;
-
- /*
- * Attempt to re-establish all the active inodes with their
- * dbufs. If a zfs_rezget() fails, then we unhash the inode
- * and mark it stale. This prevents a collision if a new
- * inode/object is created which must use the same inode
- * number. The stale inode will be be released when the
- * VFS prunes the dentry holding the remaining references
- * on the stale inode.
- */
- mutex_enter(&zfsvfs->z_znodes_lock);
- for (zp = list_head(&zfsvfs->z_all_znodes); zp;
- zp = list_next(&zfsvfs->z_all_znodes, zp)) {
- err2 = zfs_rezget(zp);
- if (err2) {
- remove_inode_hash(ZTOI(zp));
- zp->z_is_stale = B_TRUE;
- }
-
- /* see comment in zfs_suspend_fs() */
- if (zp->z_suspended) {
- zfs_iput_async(ZTOI(zp));
- zp->z_suspended = B_FALSE;
- }
- }
- mutex_exit(&zfsvfs->z_znodes_lock);
-
- if (!zfs_is_readonly(zfsvfs) && !zfsvfs->z_unmounted) {
- /*
- * zfs_suspend_fs() could have interrupted freeing
- * of dnodes. We need to restart this freeing so
- * that we don't "leak" the space.
- */
- zfs_unlinked_drain(zfsvfs);
- }
-
-bail:
- if (err != 0)
- zfsvfs->z_unmounted = B_TRUE;
-
- /* release the VFS ops */
- rw_exit(&zfsvfs->z_teardown_inactive_lock);
- rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
-
- if (err != 0) {
- /*
- * Since we couldn't setup the sa framework, try to force
- * unmount this file system.
- */
- if (zfsvfs->z_os)
- (void) zfs_umount(zfsvfs->z_sb);
- }
- return (err);
-}
-
-/*
- * Release VOPs and unmount a suspended filesystem.
- */
-int
-zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
-{
- ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock));
- ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
-
- /*
- * We already own this, so just hold and rele it to update the
- * objset_t, as the one we had before may have been evicted.
- */
- objset_t *os;
- VERIFY3P(ds->ds_owner, ==, zfsvfs);
- VERIFY(dsl_dataset_long_held(ds));
- VERIFY0(dmu_objset_from_ds(ds, &os));
- zfsvfs->z_os = os;
-
- /* release the VOPs */
- rw_exit(&zfsvfs->z_teardown_inactive_lock);
- rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
-
- /*
- * Try to force unmount this file system.
- */
- (void) zfs_umount(zfsvfs->z_sb);
- zfsvfs->z_unmounted = B_TRUE;
- return (0);
-}
-
-int
-zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
-{
- int error;
- objset_t *os = zfsvfs->z_os;
- dmu_tx_t *tx;
-
- if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
- return (SET_ERROR(EINVAL));
-
- if (newvers < zfsvfs->z_version)
- return (SET_ERROR(EINVAL));
-
- if (zfs_spa_version_map(newvers) >
- spa_version(dmu_objset_spa(zfsvfs->z_os)))
- return (SET_ERROR(ENOTSUP));
-
- tx = dmu_tx_create(os);
- dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
- if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
- dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
- ZFS_SA_ATTRS);
- dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
- }
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- return (error);
- }
-
- error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
- 8, 1, &newvers, tx);
-
- if (error) {
- dmu_tx_commit(tx);
- return (error);
- }
-
- if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
- uint64_t sa_obj;
-
- ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
- SPA_VERSION_SA);
- sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
- DMU_OT_NONE, 0, tx);
-
- error = zap_add(os, MASTER_NODE_OBJ,
- ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
- ASSERT0(error);
-
- VERIFY(0 == sa_set_sa_object(os, sa_obj));
- sa_register_update_callback(os, zfs_sa_upgrade);
- }
-
- spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
- "from %llu to %llu", zfsvfs->z_version, newvers);
-
- dmu_tx_commit(tx);
-
- zfsvfs->z_version = newvers;
- os->os_version = newvers;
-
- zfs_set_fuid_feature(zfsvfs);
-
- return (0);
-}
-
-/*
- * Read a property stored within the master node.
- */
-int
-zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
-{
- uint64_t *cached_copy = NULL;
-
- /*
- * Figure out where in the objset_t the cached copy would live, if it
- * is available for the requested property.
- */
- if (os != NULL) {
- switch (prop) {
- case ZFS_PROP_VERSION:
- cached_copy = &os->os_version;
- break;
- case ZFS_PROP_NORMALIZE:
- cached_copy = &os->os_normalization;
- break;
- case ZFS_PROP_UTF8ONLY:
- cached_copy = &os->os_utf8only;
- break;
- case ZFS_PROP_CASE:
- cached_copy = &os->os_casesensitivity;
- break;
- default:
- break;
- }
- }
- if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
- *value = *cached_copy;
- return (0);
- }
-
- /*
- * If the property wasn't cached, look up the file system's value for
- * the property. For the version property, we look up a slightly
- * different string.
- */
- const char *pname;
- int error = ENOENT;
- if (prop == ZFS_PROP_VERSION)
- pname = ZPL_VERSION_STR;
- else
- pname = zfs_prop_to_name(prop);
-
- if (os != NULL) {
- ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
- error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
- }
-
- if (error == ENOENT) {
- /* No value set, use the default value */
- switch (prop) {
- case ZFS_PROP_VERSION:
- *value = ZPL_VERSION;
- break;
- case ZFS_PROP_NORMALIZE:
- case ZFS_PROP_UTF8ONLY:
- *value = 0;
- break;
- case ZFS_PROP_CASE:
- *value = ZFS_CASE_SENSITIVE;
- break;
- case ZFS_PROP_ACLTYPE:
- *value = ZFS_ACLTYPE_OFF;
- break;
- default:
- return (error);
- }
- error = 0;
- }
-
- /*
- * If one of the methods for getting the property value above worked,
- * copy it into the objset_t's cache.
- */
- if (error == 0 && cached_copy != NULL) {
- *cached_copy = *value;
- }
-
- return (error);
-}
-
-/*
- * Return true if the corresponding vfs's unmounted flag is set.
- * Otherwise return false.
- * If this function returns true we know VFS unmount has been initiated.
- */
-boolean_t
-zfs_get_vfs_flag_unmounted(objset_t *os)
-{
- zfsvfs_t *zfvp;
- boolean_t unmounted = B_FALSE;
-
- ASSERT(dmu_objset_type(os) == DMU_OST_ZFS);
-
- mutex_enter(&os->os_user_ptr_lock);
- zfvp = dmu_objset_get_user(os);
- if (zfvp != NULL && zfvp->z_unmounted)
- unmounted = B_TRUE;
- mutex_exit(&os->os_user_ptr_lock);
-
- return (unmounted);
-}
-
-struct objnode {
- avl_node_t node;
- uint64_t obj;
-};
-
-static int
-objnode_compare(const void *o1, const void *o2)
-{
- const struct objnode *obj1 = o1;
- const struct objnode *obj2 = o2;
- if (obj1->obj < obj2->obj)
- return (-1);
- if (obj1->obj > obj2->obj)
- return (1);
- return (0);
-}
-
-objlist_t *
-zfs_get_deleteq(objset_t *os)
-{
- objlist_t *deleteq_objlist = objlist_create();
- uint64_t deleteq_obj;
- zap_cursor_t zc;
- zap_attribute_t za;
- dmu_object_info_t doi;
-
- ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
- VERIFY0(dmu_object_info(os, MASTER_NODE_OBJ, &doi));
- ASSERT3U(doi.doi_type, ==, DMU_OT_MASTER_NODE);
-
- VERIFY0(zap_lookup(os, MASTER_NODE_OBJ,
- ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj));
-
- /*
- * In order to insert objects into the objlist, they must be in sorted
- * order. We don't know what order we'll get them out of the ZAP in, so
- * we insert them into and remove them from an avl_tree_t to sort them.
- */
- avl_tree_t at;
- avl_create(&at, objnode_compare, sizeof (struct objnode),
- offsetof(struct objnode, node));
-
- for (zap_cursor_init(&zc, os, deleteq_obj);
- zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) {
- struct objnode *obj = kmem_zalloc(sizeof (*obj), KM_SLEEP);
- obj->obj = za.za_first_integer;
- avl_add(&at, obj);
- }
- zap_cursor_fini(&zc);
-
- struct objnode *next, *found = avl_first(&at);
- while (found != NULL) {
- next = AVL_NEXT(&at, found);
- objlist_insert(deleteq_objlist, found->obj);
- found = next;
- }
-
- void *cookie = NULL;
- while ((found = avl_destroy_nodes(&at, &cookie)) != NULL)
- kmem_free(found, sizeof (*found));
- avl_destroy(&at);
- return (deleteq_objlist);
-}
-
-
-void
-zfs_init(void)
-{
- zfsctl_init();
- zfs_znode_init();
- dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb);
- register_filesystem(&zpl_fs_type);
-}
-
-void
-zfs_fini(void)
-{
- /*
- * we don't use outstanding because zpl_posix_acl_free might add more.
- */
- taskq_wait(system_delay_taskq);
- taskq_wait(system_taskq);
- unregister_filesystem(&zpl_fs_type);
- zfs_znode_fini();
- zfsctl_fini();
-}
-
-#if defined(_KERNEL)
-EXPORT_SYMBOL(zfs_suspend_fs);
-EXPORT_SYMBOL(zfs_resume_fs);
-EXPORT_SYMBOL(zfs_userspace_one);
-EXPORT_SYMBOL(zfs_userspace_many);
-EXPORT_SYMBOL(zfs_set_userquota);
-EXPORT_SYMBOL(zfs_id_overblockquota);
-EXPORT_SYMBOL(zfs_id_overobjquota);
-EXPORT_SYMBOL(zfs_id_overquota);
-EXPORT_SYMBOL(zfs_set_version);
-EXPORT_SYMBOL(zfsvfs_create);
-EXPORT_SYMBOL(zfsvfs_free);
-EXPORT_SYMBOL(zfs_is_readonly);
-EXPORT_SYMBOL(zfs_domount);
-EXPORT_SYMBOL(zfs_preumount);
-EXPORT_SYMBOL(zfs_umount);
-EXPORT_SYMBOL(zfs_remount);
-EXPORT_SYMBOL(zfs_statvfs);
-EXPORT_SYMBOL(zfs_vget);
-EXPORT_SYMBOL(zfs_prune);
-#endif
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
deleted file mode 100644
index de7b59935..000000000
--- a/module/zfs/zfs_vnops.c
+++ /dev/null
@@ -1,5275 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
- * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
- * Copyright 2017 Nexenta Systems, Inc.
- */
-
-/* Portions Copyright 2007 Jeremy Teo */
-/* Portions Copyright 2010 Robert Milkowski */
-
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/time.h>
-#include <sys/sysmacros.h>
-#include <sys/vfs.h>
-#include <sys/file.h>
-#include <sys/stat.h>
-#include <sys/kmem.h>
-#include <sys/taskq.h>
-#include <sys/uio.h>
-#include <sys/vmsystm.h>
-#include <sys/atomic.h>
-#include <sys/pathname.h>
-#include <sys/cmn_err.h>
-#include <sys/errno.h>
-#include <sys/zfs_dir.h>
-#include <sys/zfs_acl.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/fs/zfs.h>
-#include <sys/dmu.h>
-#include <sys/dmu_objset.h>
-#include <sys/spa.h>
-#include <sys/txg.h>
-#include <sys/dbuf.h>
-#include <sys/zap.h>
-#include <sys/sa.h>
-#include <sys/policy.h>
-#include <sys/sunddi.h>
-#include <sys/sid.h>
-#include <sys/mode.h>
-#include <sys/zfs_ctldir.h>
-#include <sys/zfs_fuid.h>
-#include <sys/zfs_sa.h>
-#include <sys/zfs_vnops.h>
-#include <sys/zfs_rlock.h>
-#include <sys/cred.h>
-#include <sys/zpl.h>
-#include <sys/zil.h>
-#include <sys/sa_impl.h>
-
-/*
- * Programming rules.
- *
- * Each vnode op performs some logical unit of work. To do this, the ZPL must
- * properly lock its in-core state, create a DMU transaction, do the work,
- * record this work in the intent log (ZIL), commit the DMU transaction,
- * and wait for the intent log to commit if it is a synchronous operation.
- * Moreover, the vnode ops must work in both normal and log replay context.
- * The ordering of events is important to avoid deadlocks and references
- * to freed memory. The example below illustrates the following Big Rules:
- *
- * (1) A check must be made in each zfs thread for a mounted file system.
- * This is done avoiding races using ZFS_ENTER(zfsvfs).
- * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes
- * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros
- * can return EIO from the calling function.
- *
- * (2) iput() should always be the last thing except for zil_commit()
- * (if necessary) and ZFS_EXIT(). This is for 3 reasons:
- * First, if it's the last reference, the vnode/znode
- * can be freed, so the zp may point to freed memory. Second, the last
- * reference will call zfs_zinactive(), which may induce a lot of work --
- * pushing cached pages (which acquires range locks) and syncing out
- * cached atime changes. Third, zfs_zinactive() may require a new tx,
- * which could deadlock the system if you were already holding one.
- * If you must call iput() within a tx then use zfs_iput_async().
- *
- * (3) All range locks must be grabbed before calling dmu_tx_assign(),
- * as they can span dmu_tx_assign() calls.
- *
- * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
- * dmu_tx_assign(). This is critical because we don't want to block
- * while holding locks.
- *
- * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This
- * reduces lock contention and CPU usage when we must wait (note that if
- * throughput is constrained by the storage, nearly every transaction
- * must wait).
- *
- * Note, in particular, that if a lock is sometimes acquired before
- * the tx assigns, and sometimes after (e.g. z_lock), then failing
- * to use a non-blocking assign can deadlock the system. The scenario:
- *
- * Thread A has grabbed a lock before calling dmu_tx_assign().
- * Thread B is in an already-assigned tx, and blocks for this lock.
- * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
- * forever, because the previous txg can't quiesce until B's tx commits.
- *
- * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
- * then drop all locks, call dmu_tx_wait(), and try again. On subsequent
- * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
- * to indicate that this operation has already called dmu_tx_wait().
- * This will ensure that we don't retry forever, waiting a short bit
- * each time.
- *
- * (5) If the operation succeeded, generate the intent log entry for it
- * before dropping locks. This ensures that the ordering of events
- * in the intent log matches the order in which they actually occurred.
- * During ZIL replay the zfs_log_* functions will update the sequence
- * number to indicate the zil transaction has replayed.
- *
- * (6) At the end of each vnode op, the DMU tx must always commit,
- * regardless of whether there were any errors.
- *
- * (7) After dropping all locks, invoke zil_commit(zilog, foid)
- * to ensure that synchronous semantics are provided when necessary.
- *
- * In general, this is how things should be ordered in each vnode op:
- *
- * ZFS_ENTER(zfsvfs); // exit if unmounted
- * top:
- * zfs_dirent_lock(&dl, ...) // lock directory entry (may igrab())
- * rw_enter(...); // grab any other locks you need
- * tx = dmu_tx_create(...); // get DMU tx
- * dmu_tx_hold_*(); // hold each object you might modify
- * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
- * if (error) {
- * rw_exit(...); // drop locks
- * zfs_dirent_unlock(dl); // unlock directory entry
- * iput(...); // release held vnodes
- * if (error == ERESTART) {
- * waited = B_TRUE;
- * dmu_tx_wait(tx);
- * dmu_tx_abort(tx);
- * goto top;
- * }
- * dmu_tx_abort(tx); // abort DMU tx
- * ZFS_EXIT(zfsvfs); // finished in zfs
- * return (error); // really out of space
- * }
- * error = do_real_work(); // do whatever this VOP does
- * if (error == 0)
- * zfs_log_*(...); // on success, make ZIL entry
- * dmu_tx_commit(tx); // commit DMU tx -- error or not
- * rw_exit(...); // drop locks
- * zfs_dirent_unlock(dl); // unlock directory entry
- * iput(...); // release held vnodes
- * zil_commit(zilog, foid); // synchronous when necessary
- * ZFS_EXIT(zfsvfs); // finished in zfs
- * return (error); // done, report error
- */
-
-/*
- * Virus scanning is unsupported. It would be possible to add a hook
- * here to performance the required virus scan. This could be done
- * entirely in the kernel or potentially as an update to invoke a
- * scanning utility.
- */
-static int
-zfs_vscan(struct inode *ip, cred_t *cr, int async)
-{
- return (0);
-}
-
-/* ARGSUSED */
-int
-zfs_open(struct inode *ip, int mode, int flag, cred_t *cr)
-{
- znode_t *zp = ITOZ(ip);
- zfsvfs_t *zfsvfs = ITOZSB(ip);
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- /* Honor ZFS_APPENDONLY file attribute */
- if ((mode & FMODE_WRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
- ((flag & O_APPEND) == 0)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EPERM));
- }
-
- /* Virus scan eligible files on open */
- if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(ip->i_mode) &&
- !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
- if (zfs_vscan(ip, cr, 0) != 0) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EACCES));
- }
- }
-
- /* Keep a count of the synchronous opens in the znode */
- if (flag & O_SYNC)
- atomic_inc_32(&zp->z_sync_cnt);
-
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
-/* ARGSUSED */
-int
-zfs_close(struct inode *ip, int flag, cred_t *cr)
-{
- znode_t *zp = ITOZ(ip);
- zfsvfs_t *zfsvfs = ITOZSB(ip);
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- /* Decrement the synchronous opens in the znode */
- if (flag & O_SYNC)
- atomic_dec_32(&zp->z_sync_cnt);
-
- if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(ip->i_mode) &&
- !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
- VERIFY(zfs_vscan(ip, cr, 1) == 0);
-
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
-#if defined(SEEK_HOLE) && defined(SEEK_DATA)
-/*
- * Lseek support for finding holes (cmd == SEEK_HOLE) and
- * data (cmd == SEEK_DATA). "off" is an in/out parameter.
- */
-static int
-zfs_holey_common(struct inode *ip, int cmd, loff_t *off)
-{
- znode_t *zp = ITOZ(ip);
- uint64_t noff = (uint64_t)*off; /* new offset */
- uint64_t file_sz;
- int error;
- boolean_t hole;
-
- file_sz = zp->z_size;
- if (noff >= file_sz) {
- return (SET_ERROR(ENXIO));
- }
-
- if (cmd == SEEK_HOLE)
- hole = B_TRUE;
- else
- hole = B_FALSE;
-
- error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff);
-
- if (error == ESRCH)
- return (SET_ERROR(ENXIO));
-
- /* file was dirty, so fall back to using generic logic */
- if (error == EBUSY) {
- if (hole)
- *off = file_sz;
-
- return (0);
- }
-
- /*
- * We could find a hole that begins after the logical end-of-file,
- * because dmu_offset_next() only works on whole blocks. If the
- * EOF falls mid-block, then indicate that the "virtual hole"
- * at the end of the file begins at the logical EOF, rather than
- * at the end of the last block.
- */
- if (noff > file_sz) {
- ASSERT(hole);
- noff = file_sz;
- }
-
- if (noff < *off)
- return (error);
- *off = noff;
- return (error);
-}
-
-int
-zfs_holey(struct inode *ip, int cmd, loff_t *off)
-{
- znode_t *zp = ITOZ(ip);
- zfsvfs_t *zfsvfs = ITOZSB(ip);
- int error;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- error = zfs_holey_common(ip, cmd, off);
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-#endif /* SEEK_HOLE && SEEK_DATA */
-
-#if defined(_KERNEL)
-/*
- * When a file is memory mapped, we must keep the IO data synchronized
- * between the DMU cache and the memory mapped pages. What this means:
- *
- * On Write: If we find a memory mapped page, we write to *both*
- * the page and the dmu buffer.
- */
-static void
-update_pages(struct inode *ip, int64_t start, int len,
- objset_t *os, uint64_t oid)
-{
- struct address_space *mp = ip->i_mapping;
- struct page *pp;
- uint64_t nbytes;
- int64_t off;
- void *pb;
-
- off = start & (PAGE_SIZE-1);
- for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
- nbytes = MIN(PAGE_SIZE - off, len);
-
- pp = find_lock_page(mp, start >> PAGE_SHIFT);
- if (pp) {
- if (mapping_writably_mapped(mp))
- flush_dcache_page(pp);
-
- pb = kmap(pp);
- (void) dmu_read(os, oid, start+off, nbytes, pb+off,
- DMU_READ_PREFETCH);
- kunmap(pp);
-
- if (mapping_writably_mapped(mp))
- flush_dcache_page(pp);
-
- mark_page_accessed(pp);
- SetPageUptodate(pp);
- ClearPageError(pp);
- unlock_page(pp);
- put_page(pp);
- }
-
- len -= nbytes;
- off = 0;
- }
-}
-
-/*
- * When a file is memory mapped, we must keep the IO data synchronized
- * between the DMU cache and the memory mapped pages. What this means:
- *
- * On Read: We "read" preferentially from memory mapped pages,
- * else we default from the dmu buffer.
- *
- * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
- * the file is memory mapped.
- */
-static int
-mappedread(struct inode *ip, int nbytes, uio_t *uio)
-{
- struct address_space *mp = ip->i_mapping;
- struct page *pp;
- znode_t *zp = ITOZ(ip);
- int64_t start, off;
- uint64_t bytes;
- int len = nbytes;
- int error = 0;
- void *pb;
-
- start = uio->uio_loffset;
- off = start & (PAGE_SIZE-1);
- for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
- bytes = MIN(PAGE_SIZE - off, len);
-
- pp = find_lock_page(mp, start >> PAGE_SHIFT);
- if (pp) {
- ASSERT(PageUptodate(pp));
- unlock_page(pp);
-
- pb = kmap(pp);
- error = uiomove(pb + off, bytes, UIO_READ, uio);
- kunmap(pp);
-
- if (mapping_writably_mapped(mp))
- flush_dcache_page(pp);
-
- mark_page_accessed(pp);
- put_page(pp);
- } else {
- error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
- uio, bytes);
- }
-
- len -= bytes;
- off = 0;
- if (error)
- break;
- }
- return (error);
-}
-#endif /* _KERNEL */
-
-unsigned long zfs_read_chunk_size = 1024 * 1024; /* Tunable */
-unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT;
-
-/*
- * Read bytes from specified file into supplied buffer.
- *
- * IN: ip - inode of file to be read from.
- * uio - structure supplying read location, range info,
- * and return buffer.
- * ioflag - FSYNC flags; used to provide FRSYNC semantics.
- * O_DIRECT flag; used to bypass page cache.
- * cr - credentials of caller.
- *
- * OUT: uio - updated offset and range, buffer filled.
- *
- * RETURN: 0 on success, error code on failure.
- *
- * Side Effects:
- * inode - atime updated if byte count > 0
- */
-/* ARGSUSED */
-int
-zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
-{
- int error = 0;
- boolean_t frsync = B_FALSE;
-
- znode_t *zp = ITOZ(ip);
- zfsvfs_t *zfsvfs = ITOZSB(ip);
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- if (zp->z_pflags & ZFS_AV_QUARANTINED) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EACCES));
- }
-
- /*
- * Validate file offset
- */
- if (uio->uio_loffset < (offset_t)0) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-
- /*
- * Fasttrack empty reads
- */
- if (uio->uio_resid == 0) {
- ZFS_EXIT(zfsvfs);
- return (0);
- }
-
-#ifdef FRSYNC
- /*
- * If we're in FRSYNC mode, sync out this znode before reading it.
- * Only do this for non-snapshots.
- *
- * Some platforms do not support FRSYNC and instead map it
- * to FSYNC, which results in unnecessary calls to zil_commit. We
- * only honor FRSYNC requests on platforms which support it.
- */
- frsync = !!(ioflag & FRSYNC);
-#endif
- if (zfsvfs->z_log &&
- (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
- zil_commit(zfsvfs->z_log, zp->z_id);
-
- /*
- * Lock the range against changes.
- */
- locked_range_t *lr = rangelock_enter(&zp->z_rangelock,
- uio->uio_loffset, uio->uio_resid, RL_READER);
-
- /*
- * If we are reading past end-of-file we can skip
- * to the end; but we might still need to set atime.
- */
- if (uio->uio_loffset >= zp->z_size) {
- error = 0;
- goto out;
- }
-
- ASSERT(uio->uio_loffset < zp->z_size);
- ssize_t n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
- ssize_t start_resid = n;
-
-#ifdef HAVE_UIO_ZEROCOPY
- xuio_t *xuio = NULL;
- if ((uio->uio_extflg == UIO_XUIO) &&
- (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
- int nblk;
- int blksz = zp->z_blksz;
- uint64_t offset = uio->uio_loffset;
-
- xuio = (xuio_t *)uio;
- if ((ISP2(blksz))) {
- nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
- blksz)) / blksz;
- } else {
- ASSERT(offset + n <= blksz);
- nblk = 1;
- }
- (void) dmu_xuio_init(xuio, nblk);
-
- if (vn_has_cached_data(ip)) {
- /*
- * For simplicity, we always allocate a full buffer
- * even if we only expect to read a portion of a block.
- */
- while (--nblk >= 0) {
- (void) dmu_xuio_add(xuio,
- dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
- blksz), 0, blksz);
- }
- }
- }
-#endif /* HAVE_UIO_ZEROCOPY */
-
- while (n > 0) {
- ssize_t nbytes = MIN(n, zfs_read_chunk_size -
- P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
-
- if (zp->z_is_mapped && !(ioflag & O_DIRECT)) {
- error = mappedread(ip, nbytes, uio);
- } else {
- error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
- uio, nbytes);
- }
-
- if (error) {
- /* convert checksum errors into IO errors */
- if (error == ECKSUM)
- error = SET_ERROR(EIO);
- break;
- }
-
- n -= nbytes;
- }
-
- int64_t nread = start_resid - n;
- dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread);
- task_io_account_read(nread);
-out:
- rangelock_exit(lr);
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/*
- * Write the bytes to a file.
- *
- * IN: ip - inode of file to be written to.
- * uio - structure supplying write location, range info,
- * and data buffer.
- * ioflag - FAPPEND flag set if in append mode.
- * O_DIRECT flag; used to bypass page cache.
- * cr - credentials of caller.
- *
- * OUT: uio - updated offset and range.
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Timestamps:
- * ip - ctime|mtime updated if byte count > 0
- */
-
-/* ARGSUSED */
-int
-zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
-{
- int error = 0;
- ssize_t start_resid = uio->uio_resid;
-
- /*
- * Fasttrack empty write
- */
- ssize_t n = start_resid;
- if (n == 0)
- return (0);
-
- rlim64_t limit = uio->uio_limit;
- if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
- limit = MAXOFFSET_T;
-
- znode_t *zp = ITOZ(ip);
- zfsvfs_t *zfsvfs = ZTOZSB(zp);
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- sa_bulk_attr_t bulk[4];
- int count = 0;
- uint64_t mtime[2], ctime[2];
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
- &zp->z_size, 8);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
- &zp->z_pflags, 8);
-
- /*
- * Callers might not be able to detect properly that we are read-only,
- * so check it explicitly here.
- */
- if (zfs_is_readonly(zfsvfs)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EROFS));
- }
-
- /*
- * If immutable or not appending then return EPERM
- */
- if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
- ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
- (uio->uio_loffset < zp->z_size))) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EPERM));
- }
-
- /*
- * Validate file offset
- */
- offset_t woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
- if (woff < 0) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-
- int max_blksz = zfsvfs->z_max_blksz;
- xuio_t *xuio = NULL;
-
- /*
- * Pre-fault the pages to ensure slow (eg NFS) pages
- * don't hold up txg.
- * Skip this if uio contains loaned arc_buf.
- */
-#ifdef HAVE_UIO_ZEROCOPY
- if ((uio->uio_extflg == UIO_XUIO) &&
- (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
- xuio = (xuio_t *)uio;
- else
-#endif
- if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EFAULT));
- }
-
- /*
- * If in append mode, set the io offset pointer to eof.
- */
- locked_range_t *lr;
- if (ioflag & FAPPEND) {
- /*
- * Obtain an appending range lock to guarantee file append
- * semantics. We reset the write offset once we have the lock.
- */
- lr = rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND);
- woff = lr->lr_offset;
- if (lr->lr_length == UINT64_MAX) {
- /*
- * We overlocked the file because this write will cause
- * the file block size to increase.
- * Note that zp_size cannot change with this lock held.
- */
- woff = zp->z_size;
- }
- uio->uio_loffset = woff;
- } else {
- /*
- * Note that if the file block size will change as a result of
- * this write, then this range lock will lock the entire file
- * so that we can re-write the block safely.
- */
- lr = rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
- }
-
- if (woff >= limit) {
- rangelock_exit(lr);
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EFBIG));
- }
-
- if ((woff + n) > limit || woff > (limit - n))
- n = limit - woff;
-
- /* Will this write extend the file length? */
- int write_eof = (woff + n > zp->z_size);
-
- uint64_t end_size = MAX(zp->z_size, woff + n);
- zilog_t *zilog = zfsvfs->z_log;
-#ifdef HAVE_UIO_ZEROCOPY
- int i_iov = 0;
- const iovec_t *iovp = uio->uio_iov;
- ASSERTV(int iovcnt = uio->uio_iovcnt);
-#endif
-
-
- /*
- * Write the file in reasonable size chunks. Each chunk is written
- * in a separate transaction; this keeps the intent log records small
- * and allows us to do more fine-grained space accounting.
- */
- while (n > 0) {
- woff = uio->uio_loffset;
-
- if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT,
- KUID_TO_SUID(ip->i_uid)) ||
- zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT,
- KGID_TO_SGID(ip->i_gid)) ||
- (zp->z_projid != ZFS_DEFAULT_PROJID &&
- zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
- zp->z_projid))) {
- error = SET_ERROR(EDQUOT);
- break;
- }
-
- arc_buf_t *abuf = NULL;
- const iovec_t *aiov = NULL;
- if (xuio) {
-#ifdef HAVE_UIO_ZEROCOPY
- ASSERT(i_iov < iovcnt);
- ASSERT3U(uio->uio_segflg, !=, UIO_BVEC);
- aiov = &iovp[i_iov];
- abuf = dmu_xuio_arcbuf(xuio, i_iov);
- dmu_xuio_clear(xuio, i_iov);
- ASSERT((aiov->iov_base == abuf->b_data) ||
- ((char *)aiov->iov_base - (char *)abuf->b_data +
- aiov->iov_len == arc_buf_size(abuf)));
- i_iov++;
-#endif
- } else if (n >= max_blksz && woff >= zp->z_size &&
- P2PHASE(woff, max_blksz) == 0 &&
- zp->z_blksz == max_blksz) {
- /*
- * This write covers a full block. "Borrow" a buffer
- * from the dmu so that we can fill it before we enter
- * a transaction. This avoids the possibility of
- * holding up the transaction if the data copy hangs
- * up on a pagefault (e.g., from an NFS server mapping).
- */
- size_t cbytes;
-
- abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
- max_blksz);
- ASSERT(abuf != NULL);
- ASSERT(arc_buf_size(abuf) == max_blksz);
- if ((error = uiocopy(abuf->b_data, max_blksz,
- UIO_WRITE, uio, &cbytes))) {
- dmu_return_arcbuf(abuf);
- break;
- }
- ASSERT(cbytes == max_blksz);
- }
-
- /*
- * Start a transaction.
- */
- dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
- DB_DNODE_ENTER(db);
- dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff,
- MIN(n, max_blksz));
- DB_DNODE_EXIT(db);
- zfs_sa_upgrade_txholds(tx, zp);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- if (abuf != NULL)
- dmu_return_arcbuf(abuf);
- break;
- }
-
- /*
- * If rangelock_enter() over-locked we grow the blocksize
- * and then reduce the lock range. This will only happen
- * on the first iteration since rangelock_reduce() will
- * shrink down lr_length to the appropriate size.
- */
- if (lr->lr_length == UINT64_MAX) {
- uint64_t new_blksz;
-
- if (zp->z_blksz > max_blksz) {
- /*
- * File's blocksize is already larger than the
- * "recordsize" property. Only let it grow to
- * the next power of 2.
- */
- ASSERT(!ISP2(zp->z_blksz));
- new_blksz = MIN(end_size,
- 1 << highbit64(zp->z_blksz));
- } else {
- new_blksz = MIN(end_size, max_blksz);
- }
- zfs_grow_blocksize(zp, new_blksz, tx);
- rangelock_reduce(lr, woff, n);
- }
-
- /*
- * XXX - should we really limit each write to z_max_blksz?
- * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
- */
- ssize_t nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
-
- ssize_t tx_bytes;
- if (abuf == NULL) {
- tx_bytes = uio->uio_resid;
- uio->uio_fault_disable = B_TRUE;
- error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
- uio, nbytes, tx);
- uio->uio_fault_disable = B_FALSE;
- if (error == EFAULT) {
- dmu_tx_commit(tx);
- if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
- break;
- }
- continue;
- } else if (error != 0) {
- dmu_tx_commit(tx);
- break;
- }
- tx_bytes -= uio->uio_resid;
- } else {
- tx_bytes = nbytes;
- ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
- /*
- * If this is not a full block write, but we are
- * extending the file past EOF and this data starts
- * block-aligned, use assign_arcbuf(). Otherwise,
- * write via dmu_write().
- */
- if (tx_bytes < max_blksz && (!write_eof ||
- aiov->iov_base != abuf->b_data)) {
- ASSERT(xuio);
- dmu_write(zfsvfs->z_os, zp->z_id, woff,
- /* cppcheck-suppress nullPointer */
- aiov->iov_len, aiov->iov_base, tx);
- dmu_return_arcbuf(abuf);
- xuio_stat_wbuf_copied();
- } else {
- ASSERT(xuio || tx_bytes == max_blksz);
- error = dmu_assign_arcbuf_by_dbuf(
- sa_get_db(zp->z_sa_hdl), woff, abuf, tx);
- if (error != 0) {
- dmu_return_arcbuf(abuf);
- dmu_tx_commit(tx);
- break;
- }
- }
- ASSERT(tx_bytes <= uio->uio_resid);
- uioskip(uio, tx_bytes);
- }
- if (tx_bytes && zp->z_is_mapped && !(ioflag & O_DIRECT)) {
- update_pages(ip, woff,
- tx_bytes, zfsvfs->z_os, zp->z_id);
- }
-
- /*
- * If we made no progress, we're done. If we made even
- * partial progress, update the znode and ZIL accordingly.
- */
- if (tx_bytes == 0) {
- (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
- (void *)&zp->z_size, sizeof (uint64_t), tx);
- dmu_tx_commit(tx);
- ASSERT(error != 0);
- break;
- }
-
- /*
- * Clear Set-UID/Set-GID bits on successful write if not
- * privileged and at least one of the execute bits is set.
- *
- * It would be nice to do this after all writes have
- * been done, but that would still expose the ISUID/ISGID
- * to another app after the partial write is committed.
- *
- * Note: we don't call zfs_fuid_map_id() here because
- * user 0 is not an ephemeral uid.
- */
- mutex_enter(&zp->z_acl_lock);
- uint32_t uid = KUID_TO_SUID(ip->i_uid);
- if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
- (S_IXUSR >> 6))) != 0 &&
- (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
- secpolicy_vnode_setid_retain(cr,
- ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) {
- uint64_t newmode;
- zp->z_mode &= ~(S_ISUID | S_ISGID);
- ip->i_mode = newmode = zp->z_mode;
- (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
- (void *)&newmode, sizeof (uint64_t), tx);
- }
- mutex_exit(&zp->z_acl_lock);
-
- zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
-
- /*
- * Update the file size (zp_size) if it has changed;
- * account for possible concurrent updates.
- */
- while ((end_size = zp->z_size) < uio->uio_loffset) {
- (void) atomic_cas_64(&zp->z_size, end_size,
- uio->uio_loffset);
- ASSERT(error == 0);
- }
- /*
- * If we are replaying and eof is non zero then force
- * the file size to the specified eof. Note, there's no
- * concurrency during replay.
- */
- if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
- zp->z_size = zfsvfs->z_replay_eof;
-
- error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
-
- zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag,
- NULL, NULL);
- dmu_tx_commit(tx);
-
- if (error != 0)
- break;
- ASSERT(tx_bytes == nbytes);
- n -= nbytes;
-
- if (!xuio && n > 0) {
- if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
- error = EFAULT;
- break;
- }
- }
- }
-
- zfs_inode_update(zp);
- rangelock_exit(lr);
-
- /*
- * If we're in replay mode, or we made no progress, return error.
- * Otherwise, it's at least a partial write, so it's successful.
- */
- if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- if (ioflag & (FSYNC | FDSYNC) ||
- zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, zp->z_id);
-
- int64_t nwritten = start_resid - uio->uio_resid;
- dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten);
- task_io_account_write(nwritten);
-
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
-/*
- * Drop a reference on the passed inode asynchronously. This ensures
- * that the caller will never drop the last reference on an inode in
- * the current context. Doing so while holding open a tx could result
- * in a deadlock if iput_final() re-enters the filesystem code.
- */
-void
-zfs_iput_async(struct inode *ip)
-{
- objset_t *os = ITOZSB(ip)->z_os;
-
- ASSERT(atomic_read(&ip->i_count) > 0);
- ASSERT(os != NULL);
-
- if (atomic_read(&ip->i_count) == 1)
- VERIFY(taskq_dispatch(dsl_pool_iput_taskq(dmu_objset_pool(os)),
- (task_func_t *)iput, ip, TQ_SLEEP) != TASKQID_INVALID);
- else
- iput(ip);
-}
-
-/* ARGSUSED */
-void
-zfs_get_done(zgd_t *zgd, int error)
-{
- znode_t *zp = zgd->zgd_private;
-
- if (zgd->zgd_db)
- dmu_buf_rele(zgd->zgd_db, zgd);
-
- rangelock_exit(zgd->zgd_lr);
-
- /*
- * Release the vnode asynchronously as we currently have the
- * txg stopped from syncing.
- */
- zfs_iput_async(ZTOI(zp));
-
- kmem_free(zgd, sizeof (zgd_t));
-}
-
-#ifdef DEBUG
-static int zil_fault_io = 0;
-#endif
-
-/*
- * Get data to generate a TX_WRITE intent log record.
- */
-int
-zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
-{
- zfsvfs_t *zfsvfs = arg;
- objset_t *os = zfsvfs->z_os;
- znode_t *zp;
- uint64_t object = lr->lr_foid;
- uint64_t offset = lr->lr_offset;
- uint64_t size = lr->lr_length;
- dmu_buf_t *db;
- zgd_t *zgd;
- int error = 0;
-
- ASSERT3P(lwb, !=, NULL);
- ASSERT3P(zio, !=, NULL);
- ASSERT3U(size, !=, 0);
-
- /*
- * Nothing to do if the file has been removed
- */
- if (zfs_zget(zfsvfs, object, &zp) != 0)
- return (SET_ERROR(ENOENT));
- if (zp->z_unlinked) {
- /*
- * Release the vnode asynchronously as we currently have the
- * txg stopped from syncing.
- */
- zfs_iput_async(ZTOI(zp));
- return (SET_ERROR(ENOENT));
- }
-
- zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
- zgd->zgd_lwb = lwb;
- zgd->zgd_private = zp;
-
- /*
- * Write records come in two flavors: immediate and indirect.
- * For small writes it's cheaper to store the data with the
- * log record (immediate); for large writes it's cheaper to
- * sync the data and get a pointer to it (indirect) so that
- * we don't have to write the data twice.
- */
- if (buf != NULL) { /* immediate write */
- zgd->zgd_lr = rangelock_enter(&zp->z_rangelock,
- offset, size, RL_READER);
- /* test for truncation needs to be done while range locked */
- if (offset >= zp->z_size) {
- error = SET_ERROR(ENOENT);
- } else {
- error = dmu_read(os, object, offset, size, buf,
- DMU_READ_NO_PREFETCH);
- }
- ASSERT(error == 0 || error == ENOENT);
- } else { /* indirect write */
- /*
- * Have to lock the whole block to ensure when it's
- * written out and its checksum is being calculated
- * that no one can change the data. We need to re-check
- * blocksize after we get the lock in case it's changed!
- */
- for (;;) {
- uint64_t blkoff;
- size = zp->z_blksz;
- blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
- offset -= blkoff;
- zgd->zgd_lr = rangelock_enter(&zp->z_rangelock,
- offset, size, RL_READER);
- if (zp->z_blksz == size)
- break;
- offset += blkoff;
- rangelock_exit(zgd->zgd_lr);
- }
- /* test for truncation needs to be done while range locked */
- if (lr->lr_offset >= zp->z_size)
- error = SET_ERROR(ENOENT);
-#ifdef DEBUG
- if (zil_fault_io) {
- error = SET_ERROR(EIO);
- zil_fault_io = 0;
- }
-#endif
- if (error == 0)
- error = dmu_buf_hold(os, object, offset, zgd, &db,
- DMU_READ_NO_PREFETCH);
-
- if (error == 0) {
- blkptr_t *bp = &lr->lr_blkptr;
-
- zgd->zgd_db = db;
- zgd->zgd_bp = bp;
-
- ASSERT(db->db_offset == offset);
- ASSERT(db->db_size == size);
-
- error = dmu_sync(zio, lr->lr_common.lrc_txg,
- zfs_get_done, zgd);
- ASSERT(error || lr->lr_length <= size);
-
- /*
- * On success, we need to wait for the write I/O
- * initiated by dmu_sync() to complete before we can
- * release this dbuf. We will finish everything up
- * in the zfs_get_done() callback.
- */
- if (error == 0)
- return (0);
-
- if (error == EALREADY) {
- lr->lr_common.lrc_txtype = TX_WRITE2;
- /*
- * TX_WRITE2 relies on the data previously
- * written by the TX_WRITE that caused
- * EALREADY. We zero out the BP because
- * it is the old, currently-on-disk BP.
- */
- zgd->zgd_bp = NULL;
- BP_ZERO(bp);
- error = 0;
- }
- }
- }
-
- zfs_get_done(zgd, error);
-
- return (error);
-}
-
-/*ARGSUSED*/
-int
-zfs_access(struct inode *ip, int mode, int flag, cred_t *cr)
-{
- znode_t *zp = ITOZ(ip);
- zfsvfs_t *zfsvfs = ITOZSB(ip);
- int error;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- if (flag & V_ACE_MASK)
- error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
- else
- error = zfs_zaccess_rwx(zp, mode, flag, cr);
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/*
- * Lookup an entry in a directory, or an extended attribute directory.
- * If it exists, return a held inode reference for it.
- *
- * IN: dip - inode of directory to search.
- * nm - name of entry to lookup.
- * flags - LOOKUP_XATTR set if looking for an attribute.
- * cr - credentials of caller.
- * direntflags - directory lookup flags
- * realpnp - returned pathname.
- *
- * OUT: ipp - inode of located entry, NULL if not found.
- *
- * RETURN: 0 on success, error code on failure.
- *
- * Timestamps:
- * NA
- */
-/* ARGSUSED */
-int
-zfs_lookup(struct inode *dip, char *nm, struct inode **ipp, int flags,
- cred_t *cr, int *direntflags, pathname_t *realpnp)
-{
- znode_t *zdp = ITOZ(dip);
- zfsvfs_t *zfsvfs = ITOZSB(dip);
- int error = 0;
-
- /*
- * Fast path lookup, however we must skip DNLC lookup
- * for case folding or normalizing lookups because the
- * DNLC code only stores the passed in name. This means
- * creating 'a' and removing 'A' on a case insensitive
- * file system would work, but DNLC still thinks 'a'
- * exists and won't let you create it again on the next
- * pass through fast path.
- */
- if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
-
- if (!S_ISDIR(dip->i_mode)) {
- return (SET_ERROR(ENOTDIR));
- } else if (zdp->z_sa_hdl == NULL) {
- return (SET_ERROR(EIO));
- }
-
- if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
- error = zfs_fastaccesschk_execute(zdp, cr);
- if (!error) {
- *ipp = dip;
- igrab(*ipp);
- return (0);
- }
- return (error);
- }
- }
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zdp);
-
- *ipp = NULL;
-
- if (flags & LOOKUP_XATTR) {
- /*
- * We don't allow recursive attributes..
- * Maybe someday we will.
- */
- if (zdp->z_pflags & ZFS_XATTR) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-
- if ((error = zfs_get_xattrdir(zdp, ipp, cr, flags))) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- /*
- * Do we have permission to get into attribute directory?
- */
-
- if ((error = zfs_zaccess(ITOZ(*ipp), ACE_EXECUTE, 0,
- B_FALSE, cr))) {
- iput(*ipp);
- *ipp = NULL;
- }
-
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- if (!S_ISDIR(dip->i_mode)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(ENOTDIR));
- }
-
- /*
- * Check accessibility of directory.
- */
-
- if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
- NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EILSEQ));
- }
-
- error = zfs_dirlook(zdp, nm, ipp, flags, direntflags, realpnp);
- if ((error == 0) && (*ipp))
- zfs_inode_update(ITOZ(*ipp));
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/*
- * Attempt to create a new entry in a directory. If the entry
- * already exists, truncate the file if permissible, else return
- * an error. Return the ip of the created or trunc'd file.
- *
- * IN: dip - inode of directory to put new file entry in.
- * name - name of new file entry.
- * vap - attributes of new file.
- * excl - flag indicating exclusive or non-exclusive mode.
- * mode - mode to open file with.
- * cr - credentials of caller.
- * flag - file flag.
- * vsecp - ACL to be set
- *
- * OUT: ipp - inode of created or trunc'd entry.
- *
- * RETURN: 0 on success, error code on failure.
- *
- * Timestamps:
- * dip - ctime|mtime updated if new entry created
- * ip - ctime|mtime always, atime if new
- */
-
-/* ARGSUSED */
-int
-zfs_create(struct inode *dip, char *name, vattr_t *vap, int excl,
- int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp)
-{
- znode_t *zp, *dzp = ITOZ(dip);
- zfsvfs_t *zfsvfs = ITOZSB(dip);
- zilog_t *zilog;
- objset_t *os;
- zfs_dirlock_t *dl;
- dmu_tx_t *tx;
- int error;
- uid_t uid;
- gid_t gid;
- zfs_acl_ids_t acl_ids;
- boolean_t fuid_dirtied;
- boolean_t have_acl = B_FALSE;
- boolean_t waited = B_FALSE;
-
- /*
- * If we have an ephemeral id, ACL, or XVATTR then
- * make sure file system is at proper version
- */
-
- gid = crgetgid(cr);
- uid = crgetuid(cr);
-
- if (zfsvfs->z_use_fuids == B_FALSE &&
- (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
- return (SET_ERROR(EINVAL));
-
- if (name == NULL)
- return (SET_ERROR(EINVAL));
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(dzp);
- os = zfsvfs->z_os;
- zilog = zfsvfs->z_log;
-
- if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
- NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EILSEQ));
- }
-
- if (vap->va_mask & ATTR_XVATTR) {
- if ((error = secpolicy_xvattr((xvattr_t *)vap,
- crgetuid(cr), cr, vap->va_mode)) != 0) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- }
-
-top:
- *ipp = NULL;
- if (*name == '\0') {
- /*
- * Null component name refers to the directory itself.
- */
- igrab(dip);
- zp = dzp;
- dl = NULL;
- error = 0;
- } else {
- /* possible igrab(zp) */
- int zflg = 0;
-
- if (flag & FIGNORECASE)
- zflg |= ZCILOOK;
-
- error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
- NULL, NULL);
- if (error) {
- if (have_acl)
- zfs_acl_ids_free(&acl_ids);
- if (strcmp(name, "..") == 0)
- error = SET_ERROR(EISDIR);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- }
-
- if (zp == NULL) {
- uint64_t txtype;
- uint64_t projid = ZFS_DEFAULT_PROJID;
-
- /*
- * Create a new file object and update the directory
- * to reference it.
- */
- if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
- if (have_acl)
- zfs_acl_ids_free(&acl_ids);
- goto out;
- }
-
- /*
- * We only support the creation of regular files in
- * extended attribute directories.
- */
-
- if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) {
- if (have_acl)
- zfs_acl_ids_free(&acl_ids);
- error = SET_ERROR(EINVAL);
- goto out;
- }
-
- if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
- cr, vsecp, &acl_ids)) != 0)
- goto out;
- have_acl = B_TRUE;
-
- if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
- projid = zfs_inherit_projid(dzp);
- if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
- zfs_acl_ids_free(&acl_ids);
- error = SET_ERROR(EDQUOT);
- goto out;
- }
-
- tx = dmu_tx_create(os);
-
- dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
- ZFS_SA_BASE_ATTR_SIZE);
-
- fuid_dirtied = zfsvfs->z_fuid_dirty;
- if (fuid_dirtied)
- zfs_fuid_txhold(zfsvfs, tx);
- dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
- dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
- if (!zfsvfs->z_use_sa &&
- acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
- 0, acl_ids.z_aclp->z_acl_bytes);
- }
-
- error = dmu_tx_assign(tx,
- (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
- if (error) {
- zfs_dirent_unlock(dl);
- if (error == ERESTART) {
- waited = B_TRUE;
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
- zfs_acl_ids_free(&acl_ids);
- dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
-
- error = zfs_link_create(dl, zp, tx, ZNEW);
- if (error != 0) {
- /*
- * Since, we failed to add the directory entry for it,
- * delete the newly created dnode.
- */
- zfs_znode_delete(zp, tx);
- remove_inode_hash(ZTOI(zp));
- zfs_acl_ids_free(&acl_ids);
- dmu_tx_commit(tx);
- goto out;
- }
-
- if (fuid_dirtied)
- zfs_fuid_sync(zfsvfs, tx);
-
- txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
- if (flag & FIGNORECASE)
- txtype |= TX_CI;
- zfs_log_create(zilog, tx, txtype, dzp, zp, name,
- vsecp, acl_ids.z_fuidp, vap);
- zfs_acl_ids_free(&acl_ids);
- dmu_tx_commit(tx);
- } else {
- int aflags = (flag & FAPPEND) ? V_APPEND : 0;
-
- if (have_acl)
- zfs_acl_ids_free(&acl_ids);
- have_acl = B_FALSE;
-
- /*
- * A directory entry already exists for this name.
- */
- /*
- * Can't truncate an existing file if in exclusive mode.
- */
- if (excl) {
- error = SET_ERROR(EEXIST);
- goto out;
- }
- /*
- * Can't open a directory for writing.
- */
- if (S_ISDIR(ZTOI(zp)->i_mode)) {
- error = SET_ERROR(EISDIR);
- goto out;
- }
- /*
- * Verify requested access to file.
- */
- if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
- goto out;
- }
-
- mutex_enter(&dzp->z_lock);
- dzp->z_seq++;
- mutex_exit(&dzp->z_lock);
-
- /*
- * Truncate regular files if requested.
- */
- if (S_ISREG(ZTOI(zp)->i_mode) &&
- (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) {
- /* we can't hold any locks when calling zfs_freesp() */
- if (dl) {
- zfs_dirent_unlock(dl);
- dl = NULL;
- }
- error = zfs_freesp(zp, 0, 0, mode, TRUE);
- }
- }
-out:
-
- if (dl)
- zfs_dirent_unlock(dl);
-
- if (error) {
- if (zp)
- iput(ZTOI(zp));
- } else {
- zfs_inode_update(dzp);
- zfs_inode_update(zp);
- *ipp = ZTOI(zp);
- }
-
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/* ARGSUSED */
-int
-zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl,
- int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp)
-{
- znode_t *zp = NULL, *dzp = ITOZ(dip);
- zfsvfs_t *zfsvfs = ITOZSB(dip);
- objset_t *os;
- dmu_tx_t *tx;
- int error;
- uid_t uid;
- gid_t gid;
- zfs_acl_ids_t acl_ids;
- uint64_t projid = ZFS_DEFAULT_PROJID;
- boolean_t fuid_dirtied;
- boolean_t have_acl = B_FALSE;
- boolean_t waited = B_FALSE;
-
- /*
- * If we have an ephemeral id, ACL, or XVATTR then
- * make sure file system is at proper version
- */
-
- gid = crgetgid(cr);
- uid = crgetuid(cr);
-
- if (zfsvfs->z_use_fuids == B_FALSE &&
- (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
- return (SET_ERROR(EINVAL));
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(dzp);
- os = zfsvfs->z_os;
-
- if (vap->va_mask & ATTR_XVATTR) {
- if ((error = secpolicy_xvattr((xvattr_t *)vap,
- crgetuid(cr), cr, vap->va_mode)) != 0) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- }
-
-top:
- *ipp = NULL;
-
- /*
- * Create a new file object and update the directory
- * to reference it.
- */
- if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
- if (have_acl)
- zfs_acl_ids_free(&acl_ids);
- goto out;
- }
-
- if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
- cr, vsecp, &acl_ids)) != 0)
- goto out;
- have_acl = B_TRUE;
-
- if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
- projid = zfs_inherit_projid(dzp);
- if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
- zfs_acl_ids_free(&acl_ids);
- error = SET_ERROR(EDQUOT);
- goto out;
- }
-
- tx = dmu_tx_create(os);
-
- dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
- ZFS_SA_BASE_ATTR_SIZE);
- dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
-
- fuid_dirtied = zfsvfs->z_fuid_dirty;
- if (fuid_dirtied)
- zfs_fuid_txhold(zfsvfs, tx);
- if (!zfsvfs->z_use_sa &&
- acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
- 0, acl_ids.z_aclp->z_acl_bytes);
- }
- error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
- if (error) {
- if (error == ERESTART) {
- waited = B_TRUE;
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
- zfs_acl_ids_free(&acl_ids);
- dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids);
-
- if (fuid_dirtied)
- zfs_fuid_sync(zfsvfs, tx);
-
- /* Add to unlinked set */
- zp->z_unlinked = B_TRUE;
- zfs_unlinked_add(zp, tx);
- zfs_acl_ids_free(&acl_ids);
- dmu_tx_commit(tx);
-out:
-
- if (error) {
- if (zp)
- iput(ZTOI(zp));
- } else {
- zfs_inode_update(dzp);
- zfs_inode_update(zp);
- *ipp = ZTOI(zp);
- }
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/*
- * Remove an entry from a directory.
- *
- * IN: dip - inode of directory to remove entry from.
- * name - name of entry to remove.
- * cr - credentials of caller.
- * flags - case flags.
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Timestamps:
- * dip - ctime|mtime
- * ip - ctime (if nlink > 0)
- */
-
-uint64_t null_xattr = 0;
-
-/*ARGSUSED*/
-int
-zfs_remove(struct inode *dip, char *name, cred_t *cr, int flags)
-{
- znode_t *zp, *dzp = ITOZ(dip);
- znode_t *xzp;
- struct inode *ip;
- zfsvfs_t *zfsvfs = ITOZSB(dip);
- zilog_t *zilog;
- uint64_t acl_obj, xattr_obj;
- uint64_t xattr_obj_unlinked = 0;
- uint64_t obj = 0;
- uint64_t links;
- zfs_dirlock_t *dl;
- dmu_tx_t *tx;
- boolean_t may_delete_now, delete_now = FALSE;
- boolean_t unlinked, toobig = FALSE;
- uint64_t txtype;
- pathname_t *realnmp = NULL;
- pathname_t realnm;
- int error;
- int zflg = ZEXISTS;
- boolean_t waited = B_FALSE;
-
- if (name == NULL)
- return (SET_ERROR(EINVAL));
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(dzp);
- zilog = zfsvfs->z_log;
-
- if (flags & FIGNORECASE) {
- zflg |= ZCILOOK;
- pn_alloc(&realnm);
- realnmp = &realnm;
- }
-
-top:
- xattr_obj = 0;
- xzp = NULL;
- /*
- * Attempt to lock directory; fail if entry doesn't exist.
- */
- if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
- NULL, realnmp))) {
- if (realnmp)
- pn_free(realnmp);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- ip = ZTOI(zp);
-
- if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
- goto out;
- }
-
- /*
- * Need to use rmdir for removing directories.
- */
- if (S_ISDIR(ip->i_mode)) {
- error = SET_ERROR(EPERM);
- goto out;
- }
-
- mutex_enter(&zp->z_lock);
- may_delete_now = atomic_read(&ip->i_count) == 1 && !(zp->z_is_mapped);
- mutex_exit(&zp->z_lock);
-
- /*
- * We may delete the znode now, or we may put it in the unlinked set;
- * it depends on whether we're the last link, and on whether there are
- * other holds on the inode. So we dmu_tx_hold() the right things to
- * allow for either case.
- */
- obj = zp->z_id;
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
- dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
- zfs_sa_upgrade_txholds(tx, zp);
- zfs_sa_upgrade_txholds(tx, dzp);
- if (may_delete_now) {
- toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks;
- /* if the file is too big, only hold_free a token amount */
- dmu_tx_hold_free(tx, zp->z_id, 0,
- (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
- }
-
- /* are there any extended attributes? */
- error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
- &xattr_obj, sizeof (xattr_obj));
- if (error == 0 && xattr_obj) {
- error = zfs_zget(zfsvfs, xattr_obj, &xzp);
- ASSERT0(error);
- dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
- dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
- }
-
- mutex_enter(&zp->z_lock);
- if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
- dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
- mutex_exit(&zp->z_lock);
-
- /* charge as an update -- would be nice not to charge at all */
- dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
-
- /*
- * Mark this transaction as typically resulting in a net free of space
- */
- dmu_tx_mark_netfree(tx);
-
- error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
- if (error) {
- zfs_dirent_unlock(dl);
- if (error == ERESTART) {
- waited = B_TRUE;
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- iput(ip);
- if (xzp)
- iput(ZTOI(xzp));
- goto top;
- }
- if (realnmp)
- pn_free(realnmp);
- dmu_tx_abort(tx);
- iput(ip);
- if (xzp)
- iput(ZTOI(xzp));
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- /*
- * Remove the directory entry.
- */
- error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
-
- if (error) {
- dmu_tx_commit(tx);
- goto out;
- }
-
- if (unlinked) {
- /*
- * Hold z_lock so that we can make sure that the ACL obj
- * hasn't changed. Could have been deleted due to
- * zfs_sa_upgrade().
- */
- mutex_enter(&zp->z_lock);
- (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
- &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
- delete_now = may_delete_now && !toobig &&
- atomic_read(&ip->i_count) == 1 && !(zp->z_is_mapped) &&
- xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) ==
- acl_obj;
- }
-
- if (delete_now) {
- if (xattr_obj_unlinked) {
- ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2);
- mutex_enter(&xzp->z_lock);
- xzp->z_unlinked = B_TRUE;
- clear_nlink(ZTOI(xzp));
- links = 0;
- error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
- &links, sizeof (links), tx);
- ASSERT3U(error, ==, 0);
- mutex_exit(&xzp->z_lock);
- zfs_unlinked_add(xzp, tx);
-
- if (zp->z_is_sa)
- error = sa_remove(zp->z_sa_hdl,
- SA_ZPL_XATTR(zfsvfs), tx);
- else
- error = sa_update(zp->z_sa_hdl,
- SA_ZPL_XATTR(zfsvfs), &null_xattr,
- sizeof (uint64_t), tx);
- ASSERT0(error);
- }
- /*
- * Add to the unlinked set because a new reference could be
- * taken concurrently resulting in a deferred destruction.
- */
- zfs_unlinked_add(zp, tx);
- mutex_exit(&zp->z_lock);
- } else if (unlinked) {
- mutex_exit(&zp->z_lock);
- zfs_unlinked_add(zp, tx);
- }
-
- txtype = TX_REMOVE;
- if (flags & FIGNORECASE)
- txtype |= TX_CI;
- zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked);
-
- dmu_tx_commit(tx);
-out:
- if (realnmp)
- pn_free(realnmp);
-
- zfs_dirent_unlock(dl);
- zfs_inode_update(dzp);
- zfs_inode_update(zp);
-
- if (delete_now)
- iput(ip);
- else
- zfs_iput_async(ip);
-
- if (xzp) {
- zfs_inode_update(xzp);
- zfs_iput_async(ZTOI(xzp));
- }
-
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/*
- * Create a new directory and insert it into dip using the name
- * provided. Return a pointer to the inserted directory.
- *
- * IN: dip - inode of directory to add subdir to.
- * dirname - name of new directory.
- * vap - attributes of new directory.
- * cr - credentials of caller.
- * flags - case flags.
- * vsecp - ACL to be set
- *
- * OUT: ipp - inode of created directory.
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Timestamps:
- * dip - ctime|mtime updated
- * ipp - ctime|mtime|atime updated
- */
-/*ARGSUSED*/
-int
-zfs_mkdir(struct inode *dip, char *dirname, vattr_t *vap, struct inode **ipp,
- cred_t *cr, int flags, vsecattr_t *vsecp)
-{
- znode_t *zp, *dzp = ITOZ(dip);
- zfsvfs_t *zfsvfs = ITOZSB(dip);
- zilog_t *zilog;
- zfs_dirlock_t *dl;
- uint64_t txtype;
- dmu_tx_t *tx;
- int error;
- int zf = ZNEW;
- uid_t uid;
- gid_t gid = crgetgid(cr);
- zfs_acl_ids_t acl_ids;
- boolean_t fuid_dirtied;
- boolean_t waited = B_FALSE;
-
- ASSERT(S_ISDIR(vap->va_mode));
-
- /*
- * If we have an ephemeral id, ACL, or XVATTR then
- * make sure file system is at proper version
- */
-
- uid = crgetuid(cr);
- if (zfsvfs->z_use_fuids == B_FALSE &&
- (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
- return (SET_ERROR(EINVAL));
-
- if (dirname == NULL)
- return (SET_ERROR(EINVAL));
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(dzp);
- zilog = zfsvfs->z_log;
-
- if (dzp->z_pflags & ZFS_XATTR) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-
- if (zfsvfs->z_utf8 && u8_validate(dirname,
- strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EILSEQ));
- }
- if (flags & FIGNORECASE)
- zf |= ZCILOOK;
-
- if (vap->va_mask & ATTR_XVATTR) {
- if ((error = secpolicy_xvattr((xvattr_t *)vap,
- crgetuid(cr), cr, vap->va_mode)) != 0) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- }
-
- if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
- vsecp, &acl_ids)) != 0) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- /*
- * First make sure the new directory doesn't exist.
- *
- * Existence is checked first to make sure we don't return
- * EACCES instead of EEXIST which can cause some applications
- * to fail.
- */
-top:
- *ipp = NULL;
-
- if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
- NULL, NULL))) {
- zfs_acl_ids_free(&acl_ids);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) {
- zfs_acl_ids_free(&acl_ids);
- zfs_dirent_unlock(dl);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) {
- zfs_acl_ids_free(&acl_ids);
- zfs_dirent_unlock(dl);
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EDQUOT));
- }
-
- /*
- * Add a new entry to the directory.
- */
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
- dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
- fuid_dirtied = zfsvfs->z_fuid_dirty;
- if (fuid_dirtied)
- zfs_fuid_txhold(zfsvfs, tx);
- if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
- acl_ids.z_aclp->z_acl_bytes);
- }
-
- dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
- ZFS_SA_BASE_ATTR_SIZE);
-
- error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
- if (error) {
- zfs_dirent_unlock(dl);
- if (error == ERESTART) {
- waited = B_TRUE;
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
- zfs_acl_ids_free(&acl_ids);
- dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- /*
- * Create new node.
- */
- zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
-
- /*
- * Now put new name in parent dir.
- */
- error = zfs_link_create(dl, zp, tx, ZNEW);
- if (error != 0) {
- zfs_znode_delete(zp, tx);
- remove_inode_hash(ZTOI(zp));
- goto out;
- }
-
- if (fuid_dirtied)
- zfs_fuid_sync(zfsvfs, tx);
-
- *ipp = ZTOI(zp);
-
- txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
- if (flags & FIGNORECASE)
- txtype |= TX_CI;
- zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
- acl_ids.z_fuidp, vap);
-
-out:
- zfs_acl_ids_free(&acl_ids);
-
- dmu_tx_commit(tx);
-
- zfs_dirent_unlock(dl);
-
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
-
- if (error != 0) {
- iput(ZTOI(zp));
- } else {
- zfs_inode_update(dzp);
- zfs_inode_update(zp);
- }
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/*
- * Remove a directory subdir entry. If the current working
- * directory is the same as the subdir to be removed, the
- * remove will fail.
- *
- * IN: dip - inode of directory to remove from.
- * name - name of directory to be removed.
- * cwd - inode of current working directory.
- * cr - credentials of caller.
- * flags - case flags
- *
- * RETURN: 0 on success, error code on failure.
- *
- * Timestamps:
- * dip - ctime|mtime updated
- */
-/*ARGSUSED*/
-int
-zfs_rmdir(struct inode *dip, char *name, struct inode *cwd, cred_t *cr,
- int flags)
-{
- znode_t *dzp = ITOZ(dip);
- znode_t *zp;
- struct inode *ip;
- zfsvfs_t *zfsvfs = ITOZSB(dip);
- zilog_t *zilog;
- zfs_dirlock_t *dl;
- dmu_tx_t *tx;
- int error;
- int zflg = ZEXISTS;
- boolean_t waited = B_FALSE;
-
- if (name == NULL)
- return (SET_ERROR(EINVAL));
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(dzp);
- zilog = zfsvfs->z_log;
-
- if (flags & FIGNORECASE)
- zflg |= ZCILOOK;
-top:
- zp = NULL;
-
- /*
- * Attempt to lock directory; fail if entry doesn't exist.
- */
- if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
- NULL, NULL))) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- ip = ZTOI(zp);
-
- if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
- goto out;
- }
-
- if (!S_ISDIR(ip->i_mode)) {
- error = SET_ERROR(ENOTDIR);
- goto out;
- }
-
- if (ip == cwd) {
- error = SET_ERROR(EINVAL);
- goto out;
- }
-
- /*
- * Grab a lock on the directory to make sure that no one is
- * trying to add (or lookup) entries while we are removing it.
- */
- rw_enter(&zp->z_name_lock, RW_WRITER);
-
- /*
- * Grab a lock on the parent pointer to make sure we play well
- * with the treewalk and directory rename code.
- */
- rw_enter(&zp->z_parent_lock, RW_WRITER);
-
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
- dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
- dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
- zfs_sa_upgrade_txholds(tx, zp);
- zfs_sa_upgrade_txholds(tx, dzp);
- dmu_tx_mark_netfree(tx);
- error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
- if (error) {
- rw_exit(&zp->z_parent_lock);
- rw_exit(&zp->z_name_lock);
- zfs_dirent_unlock(dl);
- if (error == ERESTART) {
- waited = B_TRUE;
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- iput(ip);
- goto top;
- }
- dmu_tx_abort(tx);
- iput(ip);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
-
- if (error == 0) {
- uint64_t txtype = TX_RMDIR;
- if (flags & FIGNORECASE)
- txtype |= TX_CI;
- zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT,
- B_FALSE);
- }
-
- dmu_tx_commit(tx);
-
- rw_exit(&zp->z_parent_lock);
- rw_exit(&zp->z_name_lock);
-out:
- zfs_dirent_unlock(dl);
-
- zfs_inode_update(dzp);
- zfs_inode_update(zp);
- iput(ip);
-
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/*
- * Read directory entries from the given directory cursor position and emit
- * name and position for each entry.
- *
- * IN: ip - inode of directory to read.
- * ctx - directory entry context.
- * cr - credentials of caller.
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Timestamps:
- * ip - atime updated
- *
- * Note that the low 4 bits of the cookie returned by zap is always zero.
- * This allows us to use the low range for "special" directory entries:
- * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem,
- * we use the offset 2 for the '.zfs' directory.
- */
-/* ARGSUSED */
-int
-zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr)
-{
- znode_t *zp = ITOZ(ip);
- zfsvfs_t *zfsvfs = ITOZSB(ip);
- objset_t *os;
- zap_cursor_t zc;
- zap_attribute_t zap;
- int error;
- uint8_t prefetch;
- uint8_t type;
- int done = 0;
- uint64_t parent;
- uint64_t offset; /* must be unsigned; checks for < 1 */
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
- &parent, sizeof (parent))) != 0)
- goto out;
-
- /*
- * Quit if directory has been removed (posix)
- */
- if (zp->z_unlinked)
- goto out;
-
- error = 0;
- os = zfsvfs->z_os;
- offset = ctx->pos;
- prefetch = zp->z_zn_prefetch;
-
- /*
- * Initialize the iterator cursor.
- */
- if (offset <= 3) {
- /*
- * Start iteration from the beginning of the directory.
- */
- zap_cursor_init(&zc, os, zp->z_id);
- } else {
- /*
- * The offset is a serialized cursor.
- */
- zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
- }
-
- /*
- * Transform to file-system independent format
- */
- while (!done) {
- uint64_t objnum;
- /*
- * Special case `.', `..', and `.zfs'.
- */
- if (offset == 0) {
- (void) strcpy(zap.za_name, ".");
- zap.za_normalization_conflict = 0;
- objnum = zp->z_id;
- type = DT_DIR;
- } else if (offset == 1) {
- (void) strcpy(zap.za_name, "..");
- zap.za_normalization_conflict = 0;
- objnum = parent;
- type = DT_DIR;
- } else if (offset == 2 && zfs_show_ctldir(zp)) {
- (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
- zap.za_normalization_conflict = 0;
- objnum = ZFSCTL_INO_ROOT;
- type = DT_DIR;
- } else {
- /*
- * Grab next entry.
- */
- if ((error = zap_cursor_retrieve(&zc, &zap))) {
- if (error == ENOENT)
- break;
- else
- goto update;
- }
-
- /*
- * Allow multiple entries provided the first entry is
- * the object id. Non-zpl consumers may safely make
- * use of the additional space.
- *
- * XXX: This should be a feature flag for compatibility
- */
- if (zap.za_integer_length != 8 ||
- zap.za_num_integers == 0) {
- cmn_err(CE_WARN, "zap_readdir: bad directory "
- "entry, obj = %lld, offset = %lld, "
- "length = %d, num = %lld\n",
- (u_longlong_t)zp->z_id,
- (u_longlong_t)offset,
- zap.za_integer_length,
- (u_longlong_t)zap.za_num_integers);
- error = SET_ERROR(ENXIO);
- goto update;
- }
-
- objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
- type = ZFS_DIRENT_TYPE(zap.za_first_integer);
- }
-
- done = !zpl_dir_emit(ctx, zap.za_name, strlen(zap.za_name),
- objnum, type);
- if (done)
- break;
-
- /* Prefetch znode */
- if (prefetch) {
- dmu_prefetch(os, objnum, 0, 0, 0,
- ZIO_PRIORITY_SYNC_READ);
- }
-
- /*
- * Move to the next entry, fill in the previous offset.
- */
- if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
- zap_cursor_advance(&zc);
- offset = zap_cursor_serialize(&zc);
- } else {
- offset += 1;
- }
- ctx->pos = offset;
- }
- zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
-
-update:
- zap_cursor_fini(&zc);
- if (error == ENOENT)
- error = 0;
-out:
- ZFS_EXIT(zfsvfs);
-
- return (error);
-}
-
-ulong_t zfs_fsync_sync_cnt = 4;
-
-int
-zfs_fsync(struct inode *ip, int syncflag, cred_t *cr)
-{
- znode_t *zp = ITOZ(ip);
- zfsvfs_t *zfsvfs = ITOZSB(ip);
-
- (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
-
- if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
- zil_commit(zfsvfs->z_log, zp->z_id);
- ZFS_EXIT(zfsvfs);
- }
- tsd_set(zfs_fsyncer_key, NULL);
-
- return (0);
-}
-
-
-/*
- * Get the requested file attributes and place them in the provided
- * vattr structure.
- *
- * IN: ip - inode of file.
- * vap - va_mask identifies requested attributes.
- * If ATTR_XVATTR set, then optional attrs are requested
- * flags - ATTR_NOACLCHECK (CIFS server context)
- * cr - credentials of caller.
- *
- * OUT: vap - attribute values.
- *
- * RETURN: 0 (always succeeds)
- */
-/* ARGSUSED */
-int
-zfs_getattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr)
-{
- znode_t *zp = ITOZ(ip);
- zfsvfs_t *zfsvfs = ITOZSB(ip);
- int error = 0;
- uint64_t links;
- uint64_t atime[2], mtime[2], ctime[2];
- xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
- xoptattr_t *xoap = NULL;
- boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
- sa_bulk_attr_t bulk[3];
- int count = 0;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
-
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
-
- if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- /*
- * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
- * Also, if we are the owner don't bother, since owner should
- * always be allowed to read basic attributes of file.
- */
- if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
- (vap->va_uid != crgetuid(cr))) {
- if ((error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
- skipaclchk, cr))) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- }
-
- /*
- * Return all attributes. It's cheaper to provide the answer
- * than to determine whether we were asked the question.
- */
-
- mutex_enter(&zp->z_lock);
- vap->va_type = vn_mode_to_vtype(zp->z_mode);
- vap->va_mode = zp->z_mode;
- vap->va_fsid = ZTOI(zp)->i_sb->s_dev;
- vap->va_nodeid = zp->z_id;
- if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp))
- links = ZTOI(zp)->i_nlink + 1;
- else
- links = ZTOI(zp)->i_nlink;
- vap->va_nlink = MIN(links, ZFS_LINK_MAX);
- vap->va_size = i_size_read(ip);
- vap->va_rdev = ip->i_rdev;
- vap->va_seq = ip->i_generation;
-
- /*
- * Add in any requested optional attributes and the create time.
- * Also set the corresponding bits in the returned attribute bitmap.
- */
- if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
- if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
- xoap->xoa_archive =
- ((zp->z_pflags & ZFS_ARCHIVE) != 0);
- XVA_SET_RTN(xvap, XAT_ARCHIVE);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
- xoap->xoa_readonly =
- ((zp->z_pflags & ZFS_READONLY) != 0);
- XVA_SET_RTN(xvap, XAT_READONLY);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
- xoap->xoa_system =
- ((zp->z_pflags & ZFS_SYSTEM) != 0);
- XVA_SET_RTN(xvap, XAT_SYSTEM);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
- xoap->xoa_hidden =
- ((zp->z_pflags & ZFS_HIDDEN) != 0);
- XVA_SET_RTN(xvap, XAT_HIDDEN);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
- xoap->xoa_nounlink =
- ((zp->z_pflags & ZFS_NOUNLINK) != 0);
- XVA_SET_RTN(xvap, XAT_NOUNLINK);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
- xoap->xoa_immutable =
- ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
- XVA_SET_RTN(xvap, XAT_IMMUTABLE);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
- xoap->xoa_appendonly =
- ((zp->z_pflags & ZFS_APPENDONLY) != 0);
- XVA_SET_RTN(xvap, XAT_APPENDONLY);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
- xoap->xoa_nodump =
- ((zp->z_pflags & ZFS_NODUMP) != 0);
- XVA_SET_RTN(xvap, XAT_NODUMP);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
- xoap->xoa_opaque =
- ((zp->z_pflags & ZFS_OPAQUE) != 0);
- XVA_SET_RTN(xvap, XAT_OPAQUE);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
- xoap->xoa_av_quarantined =
- ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
- XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
- xoap->xoa_av_modified =
- ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
- XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
- S_ISREG(ip->i_mode)) {
- zfs_sa_get_scanstamp(zp, xvap);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
- uint64_t times[2];
-
- (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
- times, sizeof (times));
- ZFS_TIME_DECODE(&xoap->xoa_createtime, times);
- XVA_SET_RTN(xvap, XAT_CREATETIME);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
- xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
- XVA_SET_RTN(xvap, XAT_REPARSE);
- }
- if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
- xoap->xoa_generation = ip->i_generation;
- XVA_SET_RTN(xvap, XAT_GEN);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
- xoap->xoa_offline =
- ((zp->z_pflags & ZFS_OFFLINE) != 0);
- XVA_SET_RTN(xvap, XAT_OFFLINE);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
- xoap->xoa_sparse =
- ((zp->z_pflags & ZFS_SPARSE) != 0);
- XVA_SET_RTN(xvap, XAT_SPARSE);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
- xoap->xoa_projinherit =
- ((zp->z_pflags & ZFS_PROJINHERIT) != 0);
- XVA_SET_RTN(xvap, XAT_PROJINHERIT);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
- xoap->xoa_projid = zp->z_projid;
- XVA_SET_RTN(xvap, XAT_PROJID);
- }
- }
-
- ZFS_TIME_DECODE(&vap->va_atime, atime);
- ZFS_TIME_DECODE(&vap->va_mtime, mtime);
- ZFS_TIME_DECODE(&vap->va_ctime, ctime);
-
- mutex_exit(&zp->z_lock);
-
- sa_object_size(zp->z_sa_hdl, &vap->va_blksize, &vap->va_nblocks);
-
- if (zp->z_blksz == 0) {
- /*
- * Block size hasn't been set; suggest maximal I/O transfers.
- */
- vap->va_blksize = zfsvfs->z_max_blksz;
- }
-
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
-/*
- * Get the basic file attributes and place them in the provided kstat
- * structure. The inode is assumed to be the authoritative source
- * for most of the attributes. However, the znode currently has the
- * authoritative atime, blksize, and block count.
- *
- * IN: ip - inode of file.
- *
- * OUT: sp - kstat values.
- *
- * RETURN: 0 (always succeeds)
- */
-/* ARGSUSED */
-int
-zfs_getattr_fast(struct inode *ip, struct kstat *sp)
-{
- znode_t *zp = ITOZ(ip);
- zfsvfs_t *zfsvfs = ITOZSB(ip);
- uint32_t blksize;
- u_longlong_t nblocks;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- mutex_enter(&zp->z_lock);
-
- generic_fillattr(ip, sp);
- /*
- * +1 link count for root inode with visible '.zfs' directory.
- */
- if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp))
- if (sp->nlink < ZFS_LINK_MAX)
- sp->nlink++;
-
- sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
- sp->blksize = blksize;
- sp->blocks = nblocks;
-
- if (unlikely(zp->z_blksz == 0)) {
- /*
- * Block size hasn't been set; suggest maximal I/O transfers.
- */
- sp->blksize = zfsvfs->z_max_blksz;
- }
-
- mutex_exit(&zp->z_lock);
-
- /*
- * Required to prevent NFS client from detecting different inode
- * numbers of snapshot root dentry before and after snapshot mount.
- */
- if (zfsvfs->z_issnap) {
- if (ip->i_sb->s_root->d_inode == ip)
- sp->ino = ZFSCTL_INO_SNAPDIRS -
- dmu_objset_id(zfsvfs->z_os);
- }
-
- ZFS_EXIT(zfsvfs);
-
- return (0);
-}
-
-/*
- * For the operation of changing file's user/group/project, we need to
- * handle not only the main object that is assigned to the file directly,
- * but also the ones that are used by the file via hidden xattr directory.
- *
- * Because the xattr directory may contains many EA entries, as to it may
- * be impossible to change all of them via the transaction of changing the
- * main object's user/group/project attributes. Then we have to change them
- * via other multiple independent transactions one by one. It may be not good
- * solution, but we have no better idea yet.
- */
-static int
-zfs_setattr_dir(znode_t *dzp)
-{
- struct inode *dxip = ZTOI(dzp);
- struct inode *xip = NULL;
- zfsvfs_t *zfsvfs = ITOZSB(dxip);
- objset_t *os = zfsvfs->z_os;
- zap_cursor_t zc;
- zap_attribute_t zap;
- zfs_dirlock_t *dl;
- znode_t *zp;
- dmu_tx_t *tx = NULL;
- uint64_t uid, gid;
- sa_bulk_attr_t bulk[4];
- int count;
- int err;
-
- zap_cursor_init(&zc, os, dzp->z_id);
- while ((err = zap_cursor_retrieve(&zc, &zap)) == 0) {
- count = 0;
- if (zap.za_integer_length != 8 || zap.za_num_integers != 1) {
- err = ENXIO;
- break;
- }
-
- err = zfs_dirent_lock(&dl, dzp, (char *)zap.za_name, &zp,
- ZEXISTS, NULL, NULL);
- if (err == ENOENT)
- goto next;
- if (err)
- break;
-
- xip = ZTOI(zp);
- if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) &&
- KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) &&
- zp->z_projid == dzp->z_projid)
- goto next;
-
- tx = dmu_tx_create(os);
- if (!(zp->z_pflags & ZFS_PROJID))
- dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
- else
- dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
-
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err)
- break;
-
- mutex_enter(&dzp->z_lock);
-
- if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) {
- xip->i_uid = dxip->i_uid;
- uid = zfs_uid_read(dxip);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
- &uid, sizeof (uid));
- }
-
- if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) {
- xip->i_gid = dxip->i_gid;
- gid = zfs_gid_read(dxip);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
- &gid, sizeof (gid));
- }
-
- if (zp->z_projid != dzp->z_projid) {
- if (!(zp->z_pflags & ZFS_PROJID)) {
- zp->z_pflags |= ZFS_PROJID;
- SA_ADD_BULK_ATTR(bulk, count,
- SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags,
- sizeof (zp->z_pflags));
- }
-
- zp->z_projid = dzp->z_projid;
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs),
- NULL, &zp->z_projid, sizeof (zp->z_projid));
- }
-
- mutex_exit(&dzp->z_lock);
-
- if (likely(count > 0)) {
- err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
- dmu_tx_commit(tx);
- } else {
- dmu_tx_abort(tx);
- }
- tx = NULL;
- if (err != 0 && err != ENOENT)
- break;
-
-next:
- if (xip) {
- iput(xip);
- xip = NULL;
- zfs_dirent_unlock(dl);
- }
- zap_cursor_advance(&zc);
- }
-
- if (tx)
- dmu_tx_abort(tx);
- if (xip) {
- iput(xip);
- zfs_dirent_unlock(dl);
- }
- zap_cursor_fini(&zc);
-
- return (err == ENOENT ? 0 : err);
-}
-
-/*
- * Set the file attributes to the values contained in the
- * vattr structure.
- *
- * IN: ip - inode of file to be modified.
- * vap - new attribute values.
- * If ATTR_XVATTR set, then optional attrs are being set
- * flags - ATTR_UTIME set if non-default time values provided.
- * - ATTR_NOACLCHECK (CIFS context only).
- * cr - credentials of caller.
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Timestamps:
- * ip - ctime updated, mtime updated if size changed.
- */
-/* ARGSUSED */
-int
-zfs_setattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr)
-{
- znode_t *zp = ITOZ(ip);
- zfsvfs_t *zfsvfs = ITOZSB(ip);
- objset_t *os = zfsvfs->z_os;
- zilog_t *zilog;
- dmu_tx_t *tx;
- vattr_t oldva;
- xvattr_t *tmpxvattr;
- uint_t mask = vap->va_mask;
- uint_t saved_mask = 0;
- int trim_mask = 0;
- uint64_t new_mode;
- uint64_t new_kuid = 0, new_kgid = 0, new_uid, new_gid;
- uint64_t xattr_obj;
- uint64_t mtime[2], ctime[2], atime[2];
- uint64_t projid = ZFS_INVALID_PROJID;
- znode_t *attrzp;
- int need_policy = FALSE;
- int err, err2 = 0;
- zfs_fuid_info_t *fuidp = NULL;
- xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
- xoptattr_t *xoap;
- zfs_acl_t *aclp;
- boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
- boolean_t fuid_dirtied = B_FALSE;
- boolean_t handle_eadir = B_FALSE;
- sa_bulk_attr_t *bulk, *xattr_bulk;
- int count = 0, xattr_count = 0, bulks = 8;
-
- if (mask == 0)
- return (0);
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- /*
- * If this is a xvattr_t, then get a pointer to the structure of
- * optional attributes. If this is NULL, then we have a vattr_t.
- */
- xoap = xva_getxoptattr(xvap);
- if (xoap != NULL && (mask & ATTR_XVATTR)) {
- if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
- if (!dmu_objset_projectquota_enabled(os) ||
- (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(ENOTSUP));
- }
-
- projid = xoap->xoa_projid;
- if (unlikely(projid == ZFS_INVALID_PROJID)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-
- if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID)
- projid = ZFS_INVALID_PROJID;
- else
- need_policy = TRUE;
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) &&
- (xoap->xoa_projinherit !=
- ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) &&
- (!dmu_objset_projectquota_enabled(os) ||
- (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(ENOTSUP));
- }
- }
-
- zilog = zfsvfs->z_log;
-
- /*
- * Make sure that if we have ephemeral uid/gid or xvattr specified
- * that file system is at proper version level
- */
-
- if (zfsvfs->z_use_fuids == B_FALSE &&
- (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) ||
- ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) ||
- (mask & ATTR_XVATTR))) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-
- if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EISDIR));
- }
-
- if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-
- tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP);
- xva_init(tmpxvattr);
-
- bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
- xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
-
- /*
- * Immutable files can only alter immutable bit and atime
- */
- if ((zp->z_pflags & ZFS_IMMUTABLE) &&
- ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) ||
- ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
- err = SET_ERROR(EPERM);
- goto out3;
- }
-
- if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
- err = SET_ERROR(EPERM);
- goto out3;
- }
-
- /*
- * Verify timestamps doesn't overflow 32 bits.
- * ZFS can handle large timestamps, but 32bit syscalls can't
- * handle times greater than 2039. This check should be removed
- * once large timestamps are fully supported.
- */
- if (mask & (ATTR_ATIME | ATTR_MTIME)) {
- if (((mask & ATTR_ATIME) &&
- TIMESPEC_OVERFLOW(&vap->va_atime)) ||
- ((mask & ATTR_MTIME) &&
- TIMESPEC_OVERFLOW(&vap->va_mtime))) {
- err = SET_ERROR(EOVERFLOW);
- goto out3;
- }
- }
-
-top:
- attrzp = NULL;
- aclp = NULL;
-
- /* Can this be moved to before the top label? */
- if (zfs_is_readonly(zfsvfs)) {
- err = SET_ERROR(EROFS);
- goto out3;
- }
-
- /*
- * First validate permissions
- */
-
- if (mask & ATTR_SIZE) {
- err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
- if (err)
- goto out3;
-
- /*
- * XXX - Note, we are not providing any open
- * mode flags here (like FNDELAY), so we may
- * block if there are locks present... this
- * should be addressed in openat().
- */
- /* XXX - would it be OK to generate a log record here? */
- err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
- if (err)
- goto out3;
- }
-
- if (mask & (ATTR_ATIME|ATTR_MTIME) ||
- ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
- XVA_ISSET_REQ(xvap, XAT_READONLY) ||
- XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
- XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
- XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
- XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
- XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
- need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
- skipaclchk, cr);
- }
-
- if (mask & (ATTR_UID|ATTR_GID)) {
- int idmask = (mask & (ATTR_UID|ATTR_GID));
- int take_owner;
- int take_group;
-
- /*
- * NOTE: even if a new mode is being set,
- * we may clear S_ISUID/S_ISGID bits.
- */
-
- if (!(mask & ATTR_MODE))
- vap->va_mode = zp->z_mode;
-
- /*
- * Take ownership or chgrp to group we are a member of
- */
-
- take_owner = (mask & ATTR_UID) && (vap->va_uid == crgetuid(cr));
- take_group = (mask & ATTR_GID) &&
- zfs_groupmember(zfsvfs, vap->va_gid, cr);
-
- /*
- * If both ATTR_UID and ATTR_GID are set then take_owner and
- * take_group must both be set in order to allow taking
- * ownership.
- *
- * Otherwise, send the check through secpolicy_vnode_setattr()
- *
- */
-
- if (((idmask == (ATTR_UID|ATTR_GID)) &&
- take_owner && take_group) ||
- ((idmask == ATTR_UID) && take_owner) ||
- ((idmask == ATTR_GID) && take_group)) {
- if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
- skipaclchk, cr) == 0) {
- /*
- * Remove setuid/setgid for non-privileged users
- */
- (void) secpolicy_setid_clear(vap, cr);
- trim_mask = (mask & (ATTR_UID|ATTR_GID));
- } else {
- need_policy = TRUE;
- }
- } else {
- need_policy = TRUE;
- }
- }
-
- mutex_enter(&zp->z_lock);
- oldva.va_mode = zp->z_mode;
- zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
- if (mask & ATTR_XVATTR) {
- /*
- * Update xvattr mask to include only those attributes
- * that are actually changing.
- *
- * the bits will be restored prior to actually setting
- * the attributes so the caller thinks they were set.
- */
- if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
- if (xoap->xoa_appendonly !=
- ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
- need_policy = TRUE;
- } else {
- XVA_CLR_REQ(xvap, XAT_APPENDONLY);
- XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY);
- }
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
- if (xoap->xoa_projinherit !=
- ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) {
- need_policy = TRUE;
- } else {
- XVA_CLR_REQ(xvap, XAT_PROJINHERIT);
- XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT);
- }
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
- if (xoap->xoa_nounlink !=
- ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
- need_policy = TRUE;
- } else {
- XVA_CLR_REQ(xvap, XAT_NOUNLINK);
- XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK);
- }
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
- if (xoap->xoa_immutable !=
- ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
- need_policy = TRUE;
- } else {
- XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
- XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE);
- }
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
- if (xoap->xoa_nodump !=
- ((zp->z_pflags & ZFS_NODUMP) != 0)) {
- need_policy = TRUE;
- } else {
- XVA_CLR_REQ(xvap, XAT_NODUMP);
- XVA_SET_REQ(tmpxvattr, XAT_NODUMP);
- }
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
- if (xoap->xoa_av_modified !=
- ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
- need_policy = TRUE;
- } else {
- XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
- XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED);
- }
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
- if ((!S_ISREG(ip->i_mode) &&
- xoap->xoa_av_quarantined) ||
- xoap->xoa_av_quarantined !=
- ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
- need_policy = TRUE;
- } else {
- XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
- XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED);
- }
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
- mutex_exit(&zp->z_lock);
- err = SET_ERROR(EPERM);
- goto out3;
- }
-
- if (need_policy == FALSE &&
- (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
- XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
- need_policy = TRUE;
- }
- }
-
- mutex_exit(&zp->z_lock);
-
- if (mask & ATTR_MODE) {
- if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
- err = secpolicy_setid_setsticky_clear(ip, vap,
- &oldva, cr);
- if (err)
- goto out3;
-
- trim_mask |= ATTR_MODE;
- } else {
- need_policy = TRUE;
- }
- }
-
- if (need_policy) {
- /*
- * If trim_mask is set then take ownership
- * has been granted or write_acl is present and user
- * has the ability to modify mode. In that case remove
- * UID|GID and or MODE from mask so that
- * secpolicy_vnode_setattr() doesn't revoke it.
- */
-
- if (trim_mask) {
- saved_mask = vap->va_mask;
- vap->va_mask &= ~trim_mask;
- }
- err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags,
- (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
- if (err)
- goto out3;
-
- if (trim_mask)
- vap->va_mask |= saved_mask;
- }
-
- /*
- * secpolicy_vnode_setattr, or take ownership may have
- * changed va_mask
- */
- mask = vap->va_mask;
-
- if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) {
- handle_eadir = B_TRUE;
- err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
- &xattr_obj, sizeof (xattr_obj));
-
- if (err == 0 && xattr_obj) {
- err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp);
- if (err)
- goto out2;
- }
- if (mask & ATTR_UID) {
- new_kuid = zfs_fuid_create(zfsvfs,
- (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
- if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) &&
- zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT,
- new_kuid)) {
- if (attrzp)
- iput(ZTOI(attrzp));
- err = SET_ERROR(EDQUOT);
- goto out2;
- }
- }
-
- if (mask & ATTR_GID) {
- new_kgid = zfs_fuid_create(zfsvfs,
- (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp);
- if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) &&
- zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT,
- new_kgid)) {
- if (attrzp)
- iput(ZTOI(attrzp));
- err = SET_ERROR(EDQUOT);
- goto out2;
- }
- }
-
- if (projid != ZFS_INVALID_PROJID &&
- zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) {
- if (attrzp)
- iput(ZTOI(attrzp));
- err = EDQUOT;
- goto out2;
- }
- }
- tx = dmu_tx_create(os);
-
- if (mask & ATTR_MODE) {
- uint64_t pmode = zp->z_mode;
- uint64_t acl_obj;
- new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
-
- zfs_acl_chmod_setattr(zp, &aclp, new_mode);
-
- mutex_enter(&zp->z_lock);
- if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
- /*
- * Are we upgrading ACL from old V0 format
- * to V1 format?
- */
- if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
- zfs_znode_acl_version(zp) ==
- ZFS_ACL_VERSION_INITIAL) {
- dmu_tx_hold_free(tx, acl_obj, 0,
- DMU_OBJECT_END);
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
- 0, aclp->z_acl_bytes);
- } else {
- dmu_tx_hold_write(tx, acl_obj, 0,
- aclp->z_acl_bytes);
- }
- } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
- 0, aclp->z_acl_bytes);
- }
- mutex_exit(&zp->z_lock);
- dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
- } else {
- if (((mask & ATTR_XVATTR) &&
- XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
- (projid != ZFS_INVALID_PROJID &&
- !(zp->z_pflags & ZFS_PROJID)))
- dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
- else
- dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
- }
-
- if (attrzp) {
- dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
- }
-
- fuid_dirtied = zfsvfs->z_fuid_dirty;
- if (fuid_dirtied)
- zfs_fuid_txhold(zfsvfs, tx);
-
- zfs_sa_upgrade_txholds(tx, zp);
-
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err)
- goto out;
-
- count = 0;
- /*
- * Set each attribute requested.
- * We group settings according to the locks they need to acquire.
- *
- * Note: you cannot set ctime directly, although it will be
- * updated as a side-effect of calling this function.
- */
-
- if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) {
- /*
- * For the existed object that is upgraded from old system,
- * its on-disk layout has no slot for the project ID attribute.
- * But quota accounting logic needs to access related slots by
- * offset directly. So we need to adjust old objects' layout
- * to make the project ID to some unified and fixed offset.
- */
- if (attrzp)
- err = sa_add_projid(attrzp->z_sa_hdl, tx, projid);
- if (err == 0)
- err = sa_add_projid(zp->z_sa_hdl, tx, projid);
-
- if (unlikely(err == EEXIST))
- err = 0;
- else if (err != 0)
- goto out;
- else
- projid = ZFS_INVALID_PROJID;
- }
-
- if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
- mutex_enter(&zp->z_acl_lock);
- mutex_enter(&zp->z_lock);
-
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
- &zp->z_pflags, sizeof (zp->z_pflags));
-
- if (attrzp) {
- if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
- mutex_enter(&attrzp->z_acl_lock);
- mutex_enter(&attrzp->z_lock);
- SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
- SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
- sizeof (attrzp->z_pflags));
- if (projid != ZFS_INVALID_PROJID) {
- attrzp->z_projid = projid;
- SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
- SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid,
- sizeof (attrzp->z_projid));
- }
- }
-
- if (mask & (ATTR_UID|ATTR_GID)) {
-
- if (mask & ATTR_UID) {
- ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid);
- new_uid = zfs_uid_read(ZTOI(zp));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
- &new_uid, sizeof (new_uid));
- if (attrzp) {
- SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
- SA_ZPL_UID(zfsvfs), NULL, &new_uid,
- sizeof (new_uid));
- ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid);
- }
- }
-
- if (mask & ATTR_GID) {
- ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid);
- new_gid = zfs_gid_read(ZTOI(zp));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
- NULL, &new_gid, sizeof (new_gid));
- if (attrzp) {
- SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
- SA_ZPL_GID(zfsvfs), NULL, &new_gid,
- sizeof (new_gid));
- ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid);
- }
- }
- if (!(mask & ATTR_MODE)) {
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
- NULL, &new_mode, sizeof (new_mode));
- new_mode = zp->z_mode;
- }
- err = zfs_acl_chown_setattr(zp);
- ASSERT(err == 0);
- if (attrzp) {
- err = zfs_acl_chown_setattr(attrzp);
- ASSERT(err == 0);
- }
- }
-
- if (mask & ATTR_MODE) {
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
- &new_mode, sizeof (new_mode));
- zp->z_mode = ZTOI(zp)->i_mode = new_mode;
- ASSERT3P(aclp, !=, NULL);
- err = zfs_aclset_common(zp, aclp, cr, tx);
- ASSERT0(err);
- if (zp->z_acl_cached)
- zfs_acl_free(zp->z_acl_cached);
- zp->z_acl_cached = aclp;
- aclp = NULL;
- }
-
- if ((mask & ATTR_ATIME) || zp->z_atime_dirty) {
- zp->z_atime_dirty = B_FALSE;
- ZFS_TIME_ENCODE(&ip->i_atime, atime);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
- &atime, sizeof (atime));
- }
-
- if (mask & (ATTR_MTIME | ATTR_SIZE)) {
- ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
- ZTOI(zp)->i_mtime = zpl_inode_timespec_trunc(vap->va_mtime,
- ZTOI(zp)->i_sb->s_time_gran);
-
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
- mtime, sizeof (mtime));
- }
-
- if (mask & (ATTR_CTIME | ATTR_SIZE)) {
- ZFS_TIME_ENCODE(&vap->va_ctime, ctime);
- ZTOI(zp)->i_ctime = zpl_inode_timespec_trunc(vap->va_ctime,
- ZTOI(zp)->i_sb->s_time_gran);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
- ctime, sizeof (ctime));
- }
-
- if (projid != ZFS_INVALID_PROJID) {
- zp->z_projid = projid;
- SA_ADD_BULK_ATTR(bulk, count,
- SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
- sizeof (zp->z_projid));
- }
-
- if (attrzp && mask) {
- SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
- SA_ZPL_CTIME(zfsvfs), NULL, &ctime,
- sizeof (ctime));
- }
-
- /*
- * Do this after setting timestamps to prevent timestamp
- * update from toggling bit
- */
-
- if (xoap && (mask & ATTR_XVATTR)) {
-
- /*
- * restore trimmed off masks
- * so that return masks can be set for caller.
- */
-
- if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) {
- XVA_SET_REQ(xvap, XAT_APPENDONLY);
- }
- if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) {
- XVA_SET_REQ(xvap, XAT_NOUNLINK);
- }
- if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) {
- XVA_SET_REQ(xvap, XAT_IMMUTABLE);
- }
- if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) {
- XVA_SET_REQ(xvap, XAT_NODUMP);
- }
- if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) {
- XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
- }
- if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) {
- XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
- }
- if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) {
- XVA_SET_REQ(xvap, XAT_PROJINHERIT);
- }
-
- if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
- ASSERT(S_ISREG(ip->i_mode));
-
- zfs_xvattr_set(zp, xvap, tx);
- }
-
- if (fuid_dirtied)
- zfs_fuid_sync(zfsvfs, tx);
-
- if (mask != 0)
- zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
-
- mutex_exit(&zp->z_lock);
- if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
- mutex_exit(&zp->z_acl_lock);
-
- if (attrzp) {
- if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
- mutex_exit(&attrzp->z_acl_lock);
- mutex_exit(&attrzp->z_lock);
- }
-out:
- if (err == 0 && xattr_count > 0) {
- err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
- xattr_count, tx);
- ASSERT(err2 == 0);
- }
-
- if (aclp)
- zfs_acl_free(aclp);
-
- if (fuidp) {
- zfs_fuid_info_free(fuidp);
- fuidp = NULL;
- }
-
- if (err) {
- dmu_tx_abort(tx);
- if (attrzp)
- iput(ZTOI(attrzp));
- if (err == ERESTART)
- goto top;
- } else {
- if (count > 0)
- err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
- dmu_tx_commit(tx);
- if (attrzp) {
- if (err2 == 0 && handle_eadir)
- err2 = zfs_setattr_dir(attrzp);
- iput(ZTOI(attrzp));
- }
- zfs_inode_update(zp);
- }
-
-out2:
- if (os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
-
-out3:
- kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks);
- kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks);
- kmem_free(tmpxvattr, sizeof (xvattr_t));
- ZFS_EXIT(zfsvfs);
- return (err);
-}
-
-typedef struct zfs_zlock {
- krwlock_t *zl_rwlock; /* lock we acquired */
- znode_t *zl_znode; /* znode we held */
- struct zfs_zlock *zl_next; /* next in list */
-} zfs_zlock_t;
-
-/*
- * Drop locks and release vnodes that were held by zfs_rename_lock().
- */
-static void
-zfs_rename_unlock(zfs_zlock_t **zlpp)
-{
- zfs_zlock_t *zl;
-
- while ((zl = *zlpp) != NULL) {
- if (zl->zl_znode != NULL)
- zfs_iput_async(ZTOI(zl->zl_znode));
- rw_exit(zl->zl_rwlock);
- *zlpp = zl->zl_next;
- kmem_free(zl, sizeof (*zl));
- }
-}
-
-/*
- * Search back through the directory tree, using the ".." entries.
- * Lock each directory in the chain to prevent concurrent renames.
- * Fail any attempt to move a directory into one of its own descendants.
- * XXX - z_parent_lock can overlap with map or grow locks
- */
-static int
-zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
-{
- zfs_zlock_t *zl;
- znode_t *zp = tdzp;
- uint64_t rootid = ZTOZSB(zp)->z_root;
- uint64_t oidp = zp->z_id;
- krwlock_t *rwlp = &szp->z_parent_lock;
- krw_t rw = RW_WRITER;
-
- /*
- * First pass write-locks szp and compares to zp->z_id.
- * Later passes read-lock zp and compare to zp->z_parent.
- */
- do {
- if (!rw_tryenter(rwlp, rw)) {
- /*
- * Another thread is renaming in this path.
- * Note that if we are a WRITER, we don't have any
- * parent_locks held yet.
- */
- if (rw == RW_READER && zp->z_id > szp->z_id) {
- /*
- * Drop our locks and restart
- */
- zfs_rename_unlock(&zl);
- *zlpp = NULL;
- zp = tdzp;
- oidp = zp->z_id;
- rwlp = &szp->z_parent_lock;
- rw = RW_WRITER;
- continue;
- } else {
- /*
- * Wait for other thread to drop its locks
- */
- rw_enter(rwlp, rw);
- }
- }
-
- zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
- zl->zl_rwlock = rwlp;
- zl->zl_znode = NULL;
- zl->zl_next = *zlpp;
- *zlpp = zl;
-
- if (oidp == szp->z_id) /* We're a descendant of szp */
- return (SET_ERROR(EINVAL));
-
- if (oidp == rootid) /* We've hit the top */
- return (0);
-
- if (rw == RW_READER) { /* i.e. not the first pass */
- int error = zfs_zget(ZTOZSB(zp), oidp, &zp);
- if (error)
- return (error);
- zl->zl_znode = zp;
- }
- (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)),
- &oidp, sizeof (oidp));
- rwlp = &zp->z_parent_lock;
- rw = RW_READER;
-
- } while (zp->z_id != sdzp->z_id);
-
- return (0);
-}
-
-/*
- * Move an entry from the provided source directory to the target
- * directory. Change the entry name as indicated.
- *
- * IN: sdip - Source directory containing the "old entry".
- * snm - Old entry name.
- * tdip - Target directory to contain the "new entry".
- * tnm - New entry name.
- * cr - credentials of caller.
- * flags - case flags
- *
- * RETURN: 0 on success, error code on failure.
- *
- * Timestamps:
- * sdip,tdip - ctime|mtime updated
- */
-/*ARGSUSED*/
-int
-zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm,
- cred_t *cr, int flags)
-{
- znode_t *tdzp, *szp, *tzp;
- znode_t *sdzp = ITOZ(sdip);
- zfsvfs_t *zfsvfs = ITOZSB(sdip);
- zilog_t *zilog;
- zfs_dirlock_t *sdl, *tdl;
- dmu_tx_t *tx;
- zfs_zlock_t *zl;
- int cmp, serr, terr;
- int error = 0;
- int zflg = 0;
- boolean_t waited = B_FALSE;
-
- if (snm == NULL || tnm == NULL)
- return (SET_ERROR(EINVAL));
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(sdzp);
- zilog = zfsvfs->z_log;
-
- tdzp = ITOZ(tdip);
- ZFS_VERIFY_ZP(tdzp);
-
- /*
- * We check i_sb because snapshots and the ctldir must have different
- * super blocks.
- */
- if (tdip->i_sb != sdip->i_sb || zfsctl_is_node(tdip)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EXDEV));
- }
-
- if (zfsvfs->z_utf8 && u8_validate(tnm,
- strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EILSEQ));
- }
-
- if (flags & FIGNORECASE)
- zflg |= ZCILOOK;
-
-top:
- szp = NULL;
- tzp = NULL;
- zl = NULL;
-
- /*
- * This is to prevent the creation of links into attribute space
- * by renaming a linked file into/outof an attribute directory.
- * See the comment in zfs_link() for why this is considered bad.
- */
- if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-
- /*
- * Lock source and target directory entries. To prevent deadlock,
- * a lock ordering must be defined. We lock the directory with
- * the smallest object id first, or if it's a tie, the one with
- * the lexically first name.
- */
- if (sdzp->z_id < tdzp->z_id) {
- cmp = -1;
- } else if (sdzp->z_id > tdzp->z_id) {
- cmp = 1;
- } else {
- /*
- * First compare the two name arguments without
- * considering any case folding.
- */
- int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
-
- cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
- ASSERT(error == 0 || !zfsvfs->z_utf8);
- if (cmp == 0) {
- /*
- * POSIX: "If the old argument and the new argument
- * both refer to links to the same existing file,
- * the rename() function shall return successfully
- * and perform no other action."
- */
- ZFS_EXIT(zfsvfs);
- return (0);
- }
- /*
- * If the file system is case-folding, then we may
- * have some more checking to do. A case-folding file
- * system is either supporting mixed case sensitivity
- * access or is completely case-insensitive. Note
- * that the file system is always case preserving.
- *
- * In mixed sensitivity mode case sensitive behavior
- * is the default. FIGNORECASE must be used to
- * explicitly request case insensitive behavior.
- *
- * If the source and target names provided differ only
- * by case (e.g., a request to rename 'tim' to 'Tim'),
- * we will treat this as a special case in the
- * case-insensitive mode: as long as the source name
- * is an exact match, we will allow this to proceed as
- * a name-change request.
- */
- if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
- (zfsvfs->z_case == ZFS_CASE_MIXED &&
- flags & FIGNORECASE)) &&
- u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
- &error) == 0) {
- /*
- * case preserving rename request, require exact
- * name matches
- */
- zflg |= ZCIEXACT;
- zflg &= ~ZCILOOK;
- }
- }
-
- /*
- * If the source and destination directories are the same, we should
- * grab the z_name_lock of that directory only once.
- */
- if (sdzp == tdzp) {
- zflg |= ZHAVELOCK;
- rw_enter(&sdzp->z_name_lock, RW_READER);
- }
-
- if (cmp < 0) {
- serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
- ZEXISTS | zflg, NULL, NULL);
- terr = zfs_dirent_lock(&tdl,
- tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
- } else {
- terr = zfs_dirent_lock(&tdl,
- tdzp, tnm, &tzp, zflg, NULL, NULL);
- serr = zfs_dirent_lock(&sdl,
- sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
- NULL, NULL);
- }
-
- if (serr) {
- /*
- * Source entry invalid or not there.
- */
- if (!terr) {
- zfs_dirent_unlock(tdl);
- if (tzp)
- iput(ZTOI(tzp));
- }
-
- if (sdzp == tdzp)
- rw_exit(&sdzp->z_name_lock);
-
- if (strcmp(snm, "..") == 0)
- serr = EINVAL;
- ZFS_EXIT(zfsvfs);
- return (serr);
- }
- if (terr) {
- zfs_dirent_unlock(sdl);
- iput(ZTOI(szp));
-
- if (sdzp == tdzp)
- rw_exit(&sdzp->z_name_lock);
-
- if (strcmp(tnm, "..") == 0)
- terr = EINVAL;
- ZFS_EXIT(zfsvfs);
- return (terr);
- }
-
- /*
- * If we are using project inheritance, means if the directory has
- * ZFS_PROJINHERIT set, then its descendant directories will inherit
- * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
- * such case, we only allow renames into our tree when the project
- * IDs are the same.
- */
- if (tdzp->z_pflags & ZFS_PROJINHERIT &&
- tdzp->z_projid != szp->z_projid) {
- error = SET_ERROR(EXDEV);
- goto out;
- }
-
- /*
- * Must have write access at the source to remove the old entry
- * and write access at the target to create the new entry.
- * Note that if target and source are the same, this can be
- * done in a single check.
- */
-
- if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)))
- goto out;
-
- if (S_ISDIR(ZTOI(szp)->i_mode)) {
- /*
- * Check to make sure rename is valid.
- * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
- */
- if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl)))
- goto out;
- }
-
- /*
- * Does target exist?
- */
- if (tzp) {
- /*
- * Source and target must be the same type.
- */
- if (S_ISDIR(ZTOI(szp)->i_mode)) {
- if (!S_ISDIR(ZTOI(tzp)->i_mode)) {
- error = SET_ERROR(ENOTDIR);
- goto out;
- }
- } else {
- if (S_ISDIR(ZTOI(tzp)->i_mode)) {
- error = SET_ERROR(EISDIR);
- goto out;
- }
- }
- /*
- * POSIX dictates that when the source and target
- * entries refer to the same file object, rename
- * must do nothing and exit without error.
- */
- if (szp->z_id == tzp->z_id) {
- error = 0;
- goto out;
- }
- }
-
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
- dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
- dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
- dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
- if (sdzp != tdzp) {
- dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
- zfs_sa_upgrade_txholds(tx, tdzp);
- }
- if (tzp) {
- dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
- zfs_sa_upgrade_txholds(tx, tzp);
- }
-
- zfs_sa_upgrade_txholds(tx, szp);
- dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
- error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
- if (error) {
- if (zl != NULL)
- zfs_rename_unlock(&zl);
- zfs_dirent_unlock(sdl);
- zfs_dirent_unlock(tdl);
-
- if (sdzp == tdzp)
- rw_exit(&sdzp->z_name_lock);
-
- if (error == ERESTART) {
- waited = B_TRUE;
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- iput(ZTOI(szp));
- if (tzp)
- iput(ZTOI(tzp));
- goto top;
- }
- dmu_tx_abort(tx);
- iput(ZTOI(szp));
- if (tzp)
- iput(ZTOI(tzp));
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- if (tzp) /* Attempt to remove the existing target */
- error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
-
- if (error == 0) {
- error = zfs_link_create(tdl, szp, tx, ZRENAMING);
- if (error == 0) {
- szp->z_pflags |= ZFS_AV_MODIFIED;
- if (tdzp->z_pflags & ZFS_PROJINHERIT)
- szp->z_pflags |= ZFS_PROJINHERIT;
-
- error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
- (void *)&szp->z_pflags, sizeof (uint64_t), tx);
- ASSERT0(error);
-
- error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
- if (error == 0) {
- zfs_log_rename(zilog, tx, TX_RENAME |
- (flags & FIGNORECASE ? TX_CI : 0), sdzp,
- sdl->dl_name, tdzp, tdl->dl_name, szp);
- } else {
- /*
- * At this point, we have successfully created
- * the target name, but have failed to remove
- * the source name. Since the create was done
- * with the ZRENAMING flag, there are
- * complications; for one, the link count is
- * wrong. The easiest way to deal with this
- * is to remove the newly created target, and
- * return the original error. This must
- * succeed; fortunately, it is very unlikely to
- * fail, since we just created it.
- */
- VERIFY3U(zfs_link_destroy(tdl, szp, tx,
- ZRENAMING, NULL), ==, 0);
- }
- } else {
- /*
- * If we had removed the existing target, subsequent
- * call to zfs_link_create() to add back the same entry
- * but, the new dnode (szp) should not fail.
- */
- ASSERT(tzp == NULL);
- }
- }
-
- dmu_tx_commit(tx);
-out:
- if (zl != NULL)
- zfs_rename_unlock(&zl);
-
- zfs_dirent_unlock(sdl);
- zfs_dirent_unlock(tdl);
-
- zfs_inode_update(sdzp);
- if (sdzp == tdzp)
- rw_exit(&sdzp->z_name_lock);
-
- if (sdzp != tdzp)
- zfs_inode_update(tdzp);
-
- zfs_inode_update(szp);
- iput(ZTOI(szp));
- if (tzp) {
- zfs_inode_update(tzp);
- iput(ZTOI(tzp));
- }
-
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/*
- * Insert the indicated symbolic reference entry into the directory.
- *
- * IN: dip - Directory to contain new symbolic link.
- * name - Name of directory entry in dip.
- * vap - Attributes of new entry.
- * link - Name for new symlink entry.
- * cr - credentials of caller.
- * flags - case flags
- *
- * OUT: ipp - Inode for new symbolic link.
- *
- * RETURN: 0 on success, error code on failure.
- *
- * Timestamps:
- * dip - ctime|mtime updated
- */
-/*ARGSUSED*/
-int
-zfs_symlink(struct inode *dip, char *name, vattr_t *vap, char *link,
- struct inode **ipp, cred_t *cr, int flags)
-{
- znode_t *zp, *dzp = ITOZ(dip);
- zfs_dirlock_t *dl;
- dmu_tx_t *tx;
- zfsvfs_t *zfsvfs = ITOZSB(dip);
- zilog_t *zilog;
- uint64_t len = strlen(link);
- int error;
- int zflg = ZNEW;
- zfs_acl_ids_t acl_ids;
- boolean_t fuid_dirtied;
- uint64_t txtype = TX_SYMLINK;
- boolean_t waited = B_FALSE;
-
- ASSERT(S_ISLNK(vap->va_mode));
-
- if (name == NULL)
- return (SET_ERROR(EINVAL));
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(dzp);
- zilog = zfsvfs->z_log;
-
- if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
- NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EILSEQ));
- }
- if (flags & FIGNORECASE)
- zflg |= ZCILOOK;
-
- if (len > MAXPATHLEN) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(ENAMETOOLONG));
- }
-
- if ((error = zfs_acl_ids_create(dzp, 0,
- vap, cr, NULL, &acl_ids)) != 0) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-top:
- *ipp = NULL;
-
- /*
- * Attempt to lock directory; fail if entry already exists.
- */
- error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
- if (error) {
- zfs_acl_ids_free(&acl_ids);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
- zfs_acl_ids_free(&acl_ids);
- zfs_dirent_unlock(dl);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) {
- zfs_acl_ids_free(&acl_ids);
- zfs_dirent_unlock(dl);
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EDQUOT));
- }
- tx = dmu_tx_create(zfsvfs->z_os);
- fuid_dirtied = zfsvfs->z_fuid_dirty;
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
- dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
- dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
- ZFS_SA_BASE_ATTR_SIZE + len);
- dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
- if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
- acl_ids.z_aclp->z_acl_bytes);
- }
- if (fuid_dirtied)
- zfs_fuid_txhold(zfsvfs, tx);
- error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
- if (error) {
- zfs_dirent_unlock(dl);
- if (error == ERESTART) {
- waited = B_TRUE;
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
- zfs_acl_ids_free(&acl_ids);
- dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- /*
- * Create a new object for the symlink.
- * for version 4 ZPL datsets the symlink will be an SA attribute
- */
- zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
-
- if (fuid_dirtied)
- zfs_fuid_sync(zfsvfs, tx);
-
- mutex_enter(&zp->z_lock);
- if (zp->z_is_sa)
- error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
- link, len, tx);
- else
- zfs_sa_symlink(zp, link, len, tx);
- mutex_exit(&zp->z_lock);
-
- zp->z_size = len;
- (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
- &zp->z_size, sizeof (zp->z_size), tx);
- /*
- * Insert the new object into the directory.
- */
- error = zfs_link_create(dl, zp, tx, ZNEW);
- if (error != 0) {
- zfs_znode_delete(zp, tx);
- remove_inode_hash(ZTOI(zp));
- } else {
- if (flags & FIGNORECASE)
- txtype |= TX_CI;
- zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
-
- zfs_inode_update(dzp);
- zfs_inode_update(zp);
- }
-
- zfs_acl_ids_free(&acl_ids);
-
- dmu_tx_commit(tx);
-
- zfs_dirent_unlock(dl);
-
- if (error == 0) {
- *ipp = ZTOI(zp);
-
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
- } else {
- iput(ZTOI(zp));
- }
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/*
- * Return, in the buffer contained in the provided uio structure,
- * the symbolic path referred to by ip.
- *
- * IN: ip - inode of symbolic link
- * uio - structure to contain the link path.
- * cr - credentials of caller.
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Timestamps:
- * ip - atime updated
- */
-/* ARGSUSED */
-int
-zfs_readlink(struct inode *ip, uio_t *uio, cred_t *cr)
-{
- znode_t *zp = ITOZ(ip);
- zfsvfs_t *zfsvfs = ITOZSB(ip);
- int error;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- mutex_enter(&zp->z_lock);
- if (zp->z_is_sa)
- error = sa_lookup_uio(zp->z_sa_hdl,
- SA_ZPL_SYMLINK(zfsvfs), uio);
- else
- error = zfs_sa_readlink(zp, uio);
- mutex_exit(&zp->z_lock);
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/*
- * Insert a new entry into directory tdip referencing sip.
- *
- * IN: tdip - Directory to contain new entry.
- * sip - inode of new entry.
- * name - name of new entry.
- * cr - credentials of caller.
- * flags - case flags.
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Timestamps:
- * tdip - ctime|mtime updated
- * sip - ctime updated
- */
-/* ARGSUSED */
-int
-zfs_link(struct inode *tdip, struct inode *sip, char *name, cred_t *cr,
- int flags)
-{
- znode_t *dzp = ITOZ(tdip);
- znode_t *tzp, *szp;
- zfsvfs_t *zfsvfs = ITOZSB(tdip);
- zilog_t *zilog;
- zfs_dirlock_t *dl;
- dmu_tx_t *tx;
- int error;
- int zf = ZNEW;
- uint64_t parent;
- uid_t owner;
- boolean_t waited = B_FALSE;
- boolean_t is_tmpfile = 0;
- uint64_t txg;
-#ifdef HAVE_TMPFILE
- is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE));
-#endif
- ASSERT(S_ISDIR(tdip->i_mode));
-
- if (name == NULL)
- return (SET_ERROR(EINVAL));
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(dzp);
- zilog = zfsvfs->z_log;
-
- /*
- * POSIX dictates that we return EPERM here.
- * Better choices include ENOTSUP or EISDIR.
- */
- if (S_ISDIR(sip->i_mode)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EPERM));
- }
-
- szp = ITOZ(sip);
- ZFS_VERIFY_ZP(szp);
-
- /*
- * If we are using project inheritance, means if the directory has
- * ZFS_PROJINHERIT set, then its descendant directories will inherit
- * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
- * such case, we only allow hard link creation in our tree when the
- * project IDs are the same.
- */
- if (dzp->z_pflags & ZFS_PROJINHERIT && dzp->z_projid != szp->z_projid) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EXDEV));
- }
-
- /*
- * We check i_sb because snapshots and the ctldir must have different
- * super blocks.
- */
- if (sip->i_sb != tdip->i_sb || zfsctl_is_node(sip)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EXDEV));
- }
-
- /* Prevent links to .zfs/shares files */
-
- if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
- &parent, sizeof (uint64_t))) != 0) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- if (parent == zfsvfs->z_shares_dir) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EPERM));
- }
-
- if (zfsvfs->z_utf8 && u8_validate(name,
- strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EILSEQ));
- }
- if (flags & FIGNORECASE)
- zf |= ZCILOOK;
-
- /*
- * We do not support links between attributes and non-attributes
- * because of the potential security risk of creating links
- * into "normal" file space in order to circumvent restrictions
- * imposed in attribute space.
- */
- if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-
- owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid),
- cr, ZFS_OWNER);
- if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EPERM));
- }
-
- if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
-top:
- /*
- * Attempt to lock directory; fail if entry already exists.
- */
- error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
- if (error) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
- dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
- if (is_tmpfile)
- dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
-
- zfs_sa_upgrade_txholds(tx, szp);
- zfs_sa_upgrade_txholds(tx, dzp);
- error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
- if (error) {
- zfs_dirent_unlock(dl);
- if (error == ERESTART) {
- waited = B_TRUE;
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
- dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- /* unmark z_unlinked so zfs_link_create will not reject */
- if (is_tmpfile)
- szp->z_unlinked = B_FALSE;
- error = zfs_link_create(dl, szp, tx, 0);
-
- if (error == 0) {
- uint64_t txtype = TX_LINK;
- /*
- * tmpfile is created to be in z_unlinkedobj, so remove it.
- * Also, we don't log in ZIL, because all previous file
- * operation on the tmpfile are ignored by ZIL. Instead we
- * always wait for txg to sync to make sure all previous
- * operation are sync safe.
- */
- if (is_tmpfile) {
- VERIFY(zap_remove_int(zfsvfs->z_os,
- zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0);
- } else {
- if (flags & FIGNORECASE)
- txtype |= TX_CI;
- zfs_log_link(zilog, tx, txtype, dzp, szp, name);
- }
- } else if (is_tmpfile) {
- /* restore z_unlinked since when linking failed */
- szp->z_unlinked = B_TRUE;
- }
- txg = dmu_tx_get_txg(tx);
- dmu_tx_commit(tx);
-
- zfs_dirent_unlock(dl);
-
- if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
-
- if (is_tmpfile)
- txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg);
-
- zfs_inode_update(dzp);
- zfs_inode_update(szp);
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-static void
-zfs_putpage_commit_cb(void *arg)
-{
- struct page *pp = arg;
-
- ClearPageError(pp);
- end_page_writeback(pp);
-}
-
-/*
- * Push a page out to disk, once the page is on stable storage the
- * registered commit callback will be run as notification of completion.
- *
- * IN: ip - page mapped for inode.
- * pp - page to push (page is locked)
- * wbc - writeback control data
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Timestamps:
- * ip - ctime|mtime updated
- */
-/* ARGSUSED */
-int
-zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
-{
- znode_t *zp = ITOZ(ip);
- zfsvfs_t *zfsvfs = ITOZSB(ip);
- loff_t offset;
- loff_t pgoff;
- unsigned int pglen;
- dmu_tx_t *tx;
- caddr_t va;
- int err = 0;
- uint64_t mtime[2], ctime[2];
- sa_bulk_attr_t bulk[3];
- int cnt = 0;
- struct address_space *mapping;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- ASSERT(PageLocked(pp));
-
- pgoff = page_offset(pp); /* Page byte-offset in file */
- offset = i_size_read(ip); /* File length in bytes */
- pglen = MIN(PAGE_SIZE, /* Page length in bytes */
- P2ROUNDUP(offset, PAGE_SIZE)-pgoff);
-
- /* Page is beyond end of file */
- if (pgoff >= offset) {
- unlock_page(pp);
- ZFS_EXIT(zfsvfs);
- return (0);
- }
-
- /* Truncate page length to end of file */
- if (pgoff + pglen > offset)
- pglen = offset - pgoff;
-
-#if 0
- /*
- * FIXME: Allow mmap writes past its quota. The correct fix
- * is to register a page_mkwrite() handler to count the page
- * against its quota when it is about to be dirtied.
- */
- if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT,
- KUID_TO_SUID(ip->i_uid)) ||
- zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT,
- KGID_TO_SGID(ip->i_gid)) ||
- (zp->z_projid != ZFS_DEFAULT_PROJID &&
- zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
- zp->z_projid))) {
- err = EDQUOT;
- }
-#endif
-
- /*
- * The ordering here is critical and must adhere to the following
- * rules in order to avoid deadlocking in either zfs_read() or
- * zfs_free_range() due to a lock inversion.
- *
- * 1) The page must be unlocked prior to acquiring the range lock.
- * This is critical because zfs_read() calls find_lock_page()
- * which may block on the page lock while holding the range lock.
- *
- * 2) Before setting or clearing write back on a page the range lock
- * must be held in order to prevent a lock inversion with the
- * zfs_free_range() function.
- *
- * This presents a problem because upon entering this function the
- * page lock is already held. To safely acquire the range lock the
- * page lock must be dropped. This creates a window where another
- * process could truncate, invalidate, dirty, or write out the page.
- *
- * Therefore, after successfully reacquiring the range and page locks
- * the current page state is checked. In the common case everything
- * will be as is expected and it can be written out. However, if
- * the page state has changed it must be handled accordingly.
- */
- mapping = pp->mapping;
- redirty_page_for_writepage(wbc, pp);
- unlock_page(pp);
-
- locked_range_t *lr = rangelock_enter(&zp->z_rangelock,
- pgoff, pglen, RL_WRITER);
- lock_page(pp);
-
- /* Page mapping changed or it was no longer dirty, we're done */
- if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) {
- unlock_page(pp);
- rangelock_exit(lr);
- ZFS_EXIT(zfsvfs);
- return (0);
- }
-
- /* Another process started write block if required */
- if (PageWriteback(pp)) {
- unlock_page(pp);
- rangelock_exit(lr);
-
- if (wbc->sync_mode != WB_SYNC_NONE) {
- if (PageWriteback(pp))
- wait_on_page_bit(pp, PG_writeback);
- }
-
- ZFS_EXIT(zfsvfs);
- return (0);
- }
-
- /* Clear the dirty flag the required locks are held */
- if (!clear_page_dirty_for_io(pp)) {
- unlock_page(pp);
- rangelock_exit(lr);
- ZFS_EXIT(zfsvfs);
- return (0);
- }
-
- /*
- * Counterpart for redirty_page_for_writepage() above. This page
- * was in fact not skipped and should not be counted as if it were.
- */
- wbc->pages_skipped--;
- set_page_writeback(pp);
- unlock_page(pp);
-
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen);
- dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
- zfs_sa_upgrade_txholds(tx, zp);
-
- err = dmu_tx_assign(tx, TXG_NOWAIT);
- if (err != 0) {
- if (err == ERESTART)
- dmu_tx_wait(tx);
-
- dmu_tx_abort(tx);
- __set_page_dirty_nobuffers(pp);
- ClearPageError(pp);
- end_page_writeback(pp);
- rangelock_exit(lr);
- ZFS_EXIT(zfsvfs);
- return (err);
- }
-
- va = kmap(pp);
- ASSERT3U(pglen, <=, PAGE_SIZE);
- dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx);
- kunmap(pp);
-
- SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
- SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
- SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL,
- &zp->z_pflags, 8);
-
- /* Preserve the mtime and ctime provided by the inode */
- ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
- ZFS_TIME_ENCODE(&ip->i_ctime, ctime);
- zp->z_atime_dirty = B_FALSE;
- zp->z_seq++;
-
- err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
-
- zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0,
- zfs_putpage_commit_cb, pp);
- dmu_tx_commit(tx);
-
- rangelock_exit(lr);
-
- if (wbc->sync_mode != WB_SYNC_NONE) {
- /*
- * Note that this is rarely called under writepages(), because
- * writepages() normally handles the entire commit for
- * performance reasons.
- */
- zil_commit(zfsvfs->z_log, zp->z_id);
- }
-
- ZFS_EXIT(zfsvfs);
- return (err);
-}
-
-/*
- * Update the system attributes when the inode has been dirtied. For the
- * moment we only update the mode, atime, mtime, and ctime.
- */
-int
-zfs_dirty_inode(struct inode *ip, int flags)
-{
- znode_t *zp = ITOZ(ip);
- zfsvfs_t *zfsvfs = ITOZSB(ip);
- dmu_tx_t *tx;
- uint64_t mode, atime[2], mtime[2], ctime[2];
- sa_bulk_attr_t bulk[4];
- int error = 0;
- int cnt = 0;
-
- if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
- return (0);
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
-#ifdef I_DIRTY_TIME
- /*
- * This is the lazytime semantic introduced in Linux 4.0
- * This flag will only be called from update_time when lazytime is set.
- * (Note, I_DIRTY_SYNC will also set if not lazytime)
- * Fortunately mtime and ctime are managed within ZFS itself, so we
- * only need to dirty atime.
- */
- if (flags == I_DIRTY_TIME) {
- zp->z_atime_dirty = B_TRUE;
- goto out;
- }
-#endif
-
- tx = dmu_tx_create(zfsvfs->z_os);
-
- dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
- zfs_sa_upgrade_txholds(tx, zp);
-
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- goto out;
- }
-
- mutex_enter(&zp->z_lock);
- zp->z_atime_dirty = B_FALSE;
-
- SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
- SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
- SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
- SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
-
- /* Preserve the mode, mtime and ctime provided by the inode */
- ZFS_TIME_ENCODE(&ip->i_atime, atime);
- ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
- ZFS_TIME_ENCODE(&ip->i_ctime, ctime);
- mode = ip->i_mode;
-
- zp->z_mode = mode;
-
- error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
- mutex_exit(&zp->z_lock);
-
- dmu_tx_commit(tx);
-out:
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/*ARGSUSED*/
-void
-zfs_inactive(struct inode *ip)
-{
- znode_t *zp = ITOZ(ip);
- zfsvfs_t *zfsvfs = ITOZSB(ip);
- uint64_t atime[2];
- int error;
- int need_unlock = 0;
-
- /* Only read lock if we haven't already write locked, e.g. rollback */
- if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) {
- need_unlock = 1;
- rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
- }
- if (zp->z_sa_hdl == NULL) {
- if (need_unlock)
- rw_exit(&zfsvfs->z_teardown_inactive_lock);
- return;
- }
-
- if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) {
- dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
-
- dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
- zfs_sa_upgrade_txholds(tx, zp);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- } else {
- ZFS_TIME_ENCODE(&ip->i_atime, atime);
- mutex_enter(&zp->z_lock);
- (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
- (void *)&atime, sizeof (atime), tx);
- zp->z_atime_dirty = B_FALSE;
- mutex_exit(&zp->z_lock);
- dmu_tx_commit(tx);
- }
- }
-
- zfs_zinactive(zp);
- if (need_unlock)
- rw_exit(&zfsvfs->z_teardown_inactive_lock);
-}
-
-/*
- * Bounds-check the seek operation.
- *
- * IN: ip - inode seeking within
- * ooff - old file offset
- * noffp - pointer to new file offset
- *
- * RETURN: 0 if success
- * EINVAL if new offset invalid
- */
-/* ARGSUSED */
-int
-zfs_seek(struct inode *ip, offset_t ooff, offset_t *noffp)
-{
- if (S_ISDIR(ip->i_mode))
- return (0);
- return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
-}
-
-/*
- * Fill pages with data from the disk.
- */
-static int
-zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages)
-{
- znode_t *zp = ITOZ(ip);
- zfsvfs_t *zfsvfs = ITOZSB(ip);
- objset_t *os;
- struct page *cur_pp;
- u_offset_t io_off, total;
- size_t io_len;
- loff_t i_size;
- unsigned page_idx;
- int err;
-
- os = zfsvfs->z_os;
- io_len = nr_pages << PAGE_SHIFT;
- i_size = i_size_read(ip);
- io_off = page_offset(pl[0]);
-
- if (io_off + io_len > i_size)
- io_len = i_size - io_off;
-
- /*
- * Iterate over list of pages and read each page individually.
- */
- page_idx = 0;
- for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
- caddr_t va;
-
- cur_pp = pl[page_idx++];
- va = kmap(cur_pp);
- err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
- DMU_READ_PREFETCH);
- kunmap(cur_pp);
- if (err) {
- /* convert checksum errors into IO errors */
- if (err == ECKSUM)
- err = SET_ERROR(EIO);
- return (err);
- }
- }
-
- return (0);
-}
-
-/*
- * Uses zfs_fillpage to read data from the file and fill the pages.
- *
- * IN: ip - inode of file to get data from.
- * pl - list of pages to read
- * nr_pages - number of pages to read
- *
- * RETURN: 0 on success, error code on failure.
- *
- * Timestamps:
- * vp - atime updated
- */
-/* ARGSUSED */
-int
-zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages)
-{
- znode_t *zp = ITOZ(ip);
- zfsvfs_t *zfsvfs = ITOZSB(ip);
- int err;
-
- if (pl == NULL)
- return (0);
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- err = zfs_fillpage(ip, pl, nr_pages);
-
- ZFS_EXIT(zfsvfs);
- return (err);
-}
-
-/*
- * Check ZFS specific permissions to memory map a section of a file.
- *
- * IN: ip - inode of the file to mmap
- * off - file offset
- * addrp - start address in memory region
- * len - length of memory region
- * vm_flags- address flags
- *
- * RETURN: 0 if success
- * error code if failure
- */
-/*ARGSUSED*/
-int
-zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len,
- unsigned long vm_flags)
-{
- znode_t *zp = ITOZ(ip);
- zfsvfs_t *zfsvfs = ITOZSB(ip);
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- if ((vm_flags & VM_WRITE) && (zp->z_pflags &
- (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EPERM));
- }
-
- if ((vm_flags & (VM_READ | VM_EXEC)) &&
- (zp->z_pflags & ZFS_AV_QUARANTINED)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EACCES));
- }
-
- if (off < 0 || len > MAXOFFSET_T - off) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(ENXIO));
- }
-
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
-/*
- * convoff - converts the given data (start, whence) to the
- * given whence.
- */
-int
-convoff(struct inode *ip, flock64_t *lckdat, int whence, offset_t offset)
-{
- vattr_t vap;
- int error;
-
- if ((lckdat->l_whence == SEEK_END) || (whence == SEEK_END)) {
- if ((error = zfs_getattr(ip, &vap, 0, CRED())))
- return (error);
- }
-
- switch (lckdat->l_whence) {
- case SEEK_CUR:
- lckdat->l_start += offset;
- break;
- case SEEK_END:
- lckdat->l_start += vap.va_size;
- /* FALLTHRU */
- case SEEK_SET:
- break;
- default:
- return (SET_ERROR(EINVAL));
- }
-
- if (lckdat->l_start < 0)
- return (SET_ERROR(EINVAL));
-
- switch (whence) {
- case SEEK_CUR:
- lckdat->l_start -= offset;
- break;
- case SEEK_END:
- lckdat->l_start -= vap.va_size;
- /* FALLTHRU */
- case SEEK_SET:
- break;
- default:
- return (SET_ERROR(EINVAL));
- }
-
- lckdat->l_whence = (short)whence;
- return (0);
-}
-
-/*
- * Free or allocate space in a file. Currently, this function only
- * supports the `F_FREESP' command. However, this command is somewhat
- * misnamed, as its functionality includes the ability to allocate as
- * well as free space.
- *
- * IN: ip - inode of file to free data in.
- * cmd - action to take (only F_FREESP supported).
- * bfp - section of file to free/alloc.
- * flag - current file open mode flags.
- * offset - current file offset.
- * cr - credentials of caller.
- *
- * RETURN: 0 on success, error code on failure.
- *
- * Timestamps:
- * ip - ctime|mtime updated
- */
-/* ARGSUSED */
-int
-zfs_space(struct inode *ip, int cmd, flock64_t *bfp, int flag,
- offset_t offset, cred_t *cr)
-{
- znode_t *zp = ITOZ(ip);
- zfsvfs_t *zfsvfs = ITOZSB(ip);
- uint64_t off, len;
- int error;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- if (cmd != F_FREESP) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-
- /*
- * Callers might not be able to detect properly that we are read-only,
- * so check it explicitly here.
- */
- if (zfs_is_readonly(zfsvfs)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EROFS));
- }
-
- if ((error = convoff(ip, bfp, SEEK_SET, offset))) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- if (bfp->l_len < 0) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-
- /*
- * Permissions aren't checked on Solaris because on this OS
- * zfs_space() can only be called with an opened file handle.
- * On Linux we can get here through truncate_range() which
- * operates directly on inodes, so we need to check access rights.
- */
- if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- off = bfp->l_start;
- len = bfp->l_len; /* 0 means from off to end of file */
-
- error = zfs_freesp(zp, off, len, flag, TRUE);
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/*ARGSUSED*/
-int
-zfs_fid(struct inode *ip, fid_t *fidp)
-{
- znode_t *zp = ITOZ(ip);
- zfsvfs_t *zfsvfs = ITOZSB(ip);
- uint32_t gen;
- uint64_t gen64;
- uint64_t object = zp->z_id;
- zfid_short_t *zfid;
- int size, i, error;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
- &gen64, sizeof (uint64_t))) != 0) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- gen = (uint32_t)gen64;
-
- size = SHORT_FID_LEN;
-
- zfid = (zfid_short_t *)fidp;
-
- zfid->zf_len = size;
-
- for (i = 0; i < sizeof (zfid->zf_object); i++)
- zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
-
- /* Must have a non-zero generation number to distinguish from .zfs */
- if (gen == 0)
- gen = 1;
- for (i = 0; i < sizeof (zfid->zf_gen); i++)
- zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
-
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
-/*ARGSUSED*/
-int
-zfs_getsecattr(struct inode *ip, vsecattr_t *vsecp, int flag, cred_t *cr)
-{
- znode_t *zp = ITOZ(ip);
- zfsvfs_t *zfsvfs = ITOZSB(ip);
- int error;
- boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
- error = zfs_getacl(zp, vsecp, skipaclchk, cr);
- ZFS_EXIT(zfsvfs);
-
- return (error);
-}
-
-/*ARGSUSED*/
-int
-zfs_setsecattr(struct inode *ip, vsecattr_t *vsecp, int flag, cred_t *cr)
-{
- znode_t *zp = ITOZ(ip);
- zfsvfs_t *zfsvfs = ITOZSB(ip);
- int error;
- boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
- zilog_t *zilog = zfsvfs->z_log;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- error = zfs_setacl(zp, vsecp, skipaclchk, cr);
-
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-#ifdef HAVE_UIO_ZEROCOPY
-/*
- * The smallest read we may consider to loan out an arcbuf.
- * This must be a power of 2.
- */
-int zcr_blksz_min = (1 << 10); /* 1K */
-/*
- * If set to less than the file block size, allow loaning out of an
- * arcbuf for a partial block read. This must be a power of 2.
- */
-int zcr_blksz_max = (1 << 17); /* 128K */
-
-/*ARGSUSED*/
-static int
-zfs_reqzcbuf(struct inode *ip, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr)
-{
- znode_t *zp = ITOZ(ip);
- zfsvfs_t *zfsvfs = ITOZSB(ip);
- int max_blksz = zfsvfs->z_max_blksz;
- uio_t *uio = &xuio->xu_uio;
- ssize_t size = uio->uio_resid;
- offset_t offset = uio->uio_loffset;
- int blksz;
- int fullblk, i;
- arc_buf_t *abuf;
- ssize_t maxsize;
- int preamble, postamble;
-
- if (xuio->xu_type != UIOTYPE_ZEROCOPY)
- return (SET_ERROR(EINVAL));
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
- switch (ioflag) {
- case UIO_WRITE:
- /*
- * Loan out an arc_buf for write if write size is bigger than
- * max_blksz, and the file's block size is also max_blksz.
- */
- blksz = max_blksz;
- if (size < blksz || zp->z_blksz != blksz) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
- /*
- * Caller requests buffers for write before knowing where the
- * write offset might be (e.g. NFS TCP write).
- */
- if (offset == -1) {
- preamble = 0;
- } else {
- preamble = P2PHASE(offset, blksz);
- if (preamble) {
- preamble = blksz - preamble;
- size -= preamble;
- }
- }
-
- postamble = P2PHASE(size, blksz);
- size -= postamble;
-
- fullblk = size / blksz;
- (void) dmu_xuio_init(xuio,
- (preamble != 0) + fullblk + (postamble != 0));
-
- /*
- * Have to fix iov base/len for partial buffers. They
- * currently represent full arc_buf's.
- */
- if (preamble) {
- /* data begins in the middle of the arc_buf */
- abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
- blksz);
- ASSERT(abuf);
- (void) dmu_xuio_add(xuio, abuf,
- blksz - preamble, preamble);
- }
-
- for (i = 0; i < fullblk; i++) {
- abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
- blksz);
- ASSERT(abuf);
- (void) dmu_xuio_add(xuio, abuf, 0, blksz);
- }
-
- if (postamble) {
- /* data ends in the middle of the arc_buf */
- abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
- blksz);
- ASSERT(abuf);
- (void) dmu_xuio_add(xuio, abuf, 0, postamble);
- }
- break;
- case UIO_READ:
- /*
- * Loan out an arc_buf for read if the read size is larger than
- * the current file block size. Block alignment is not
- * considered. Partial arc_buf will be loaned out for read.
- */
- blksz = zp->z_blksz;
- if (blksz < zcr_blksz_min)
- blksz = zcr_blksz_min;
- if (blksz > zcr_blksz_max)
- blksz = zcr_blksz_max;
- /* avoid potential complexity of dealing with it */
- if (blksz > max_blksz) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-
- maxsize = zp->z_size - uio->uio_loffset;
- if (size > maxsize)
- size = maxsize;
-
- if (size < blksz) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
- break;
- default:
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-
- uio->uio_extflg = UIO_XUIO;
- XUIO_XUZC_RW(xuio) = ioflag;
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
-/*ARGSUSED*/
-static int
-zfs_retzcbuf(struct inode *ip, xuio_t *xuio, cred_t *cr)
-{
- int i;
- arc_buf_t *abuf;
- int ioflag = XUIO_XUZC_RW(xuio);
-
- ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
-
- i = dmu_xuio_cnt(xuio);
- while (i-- > 0) {
- abuf = dmu_xuio_arcbuf(xuio, i);
- /*
- * if abuf == NULL, it must be a write buffer
- * that has been returned in zfs_write().
- */
- if (abuf)
- dmu_return_arcbuf(abuf);
- ASSERT(abuf || ioflag == UIO_WRITE);
- }
-
- dmu_xuio_fini(xuio);
- return (0);
-}
-#endif /* HAVE_UIO_ZEROCOPY */
-
-#if defined(_KERNEL)
-EXPORT_SYMBOL(zfs_open);
-EXPORT_SYMBOL(zfs_close);
-EXPORT_SYMBOL(zfs_read);
-EXPORT_SYMBOL(zfs_write);
-EXPORT_SYMBOL(zfs_access);
-EXPORT_SYMBOL(zfs_lookup);
-EXPORT_SYMBOL(zfs_create);
-EXPORT_SYMBOL(zfs_tmpfile);
-EXPORT_SYMBOL(zfs_remove);
-EXPORT_SYMBOL(zfs_mkdir);
-EXPORT_SYMBOL(zfs_rmdir);
-EXPORT_SYMBOL(zfs_readdir);
-EXPORT_SYMBOL(zfs_fsync);
-EXPORT_SYMBOL(zfs_getattr);
-EXPORT_SYMBOL(zfs_getattr_fast);
-EXPORT_SYMBOL(zfs_setattr);
-EXPORT_SYMBOL(zfs_rename);
-EXPORT_SYMBOL(zfs_symlink);
-EXPORT_SYMBOL(zfs_readlink);
-EXPORT_SYMBOL(zfs_link);
-EXPORT_SYMBOL(zfs_inactive);
-EXPORT_SYMBOL(zfs_space);
-EXPORT_SYMBOL(zfs_fid);
-EXPORT_SYMBOL(zfs_getsecattr);
-EXPORT_SYMBOL(zfs_setsecattr);
-EXPORT_SYMBOL(zfs_getpage);
-EXPORT_SYMBOL(zfs_putpage);
-EXPORT_SYMBOL(zfs_dirty_inode);
-EXPORT_SYMBOL(zfs_map);
-
-/* BEGIN CSTYLED */
-module_param(zfs_delete_blocks, ulong, 0644);
-MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async");
-module_param(zfs_read_chunk_size, ulong, 0644);
-MODULE_PARM_DESC(zfs_read_chunk_size, "Bytes to read per chunk");
-/* END CSTYLED */
-
-#endif
diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c
deleted file mode 100644
index 549c701a0..000000000
--- a/module/zfs/zfs_znode.c
+++ /dev/null
@@ -1,2234 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
- */
-
-/* Portions Copyright 2007 Jeremy Teo */
-
-#ifdef _KERNEL
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/time.h>
-#include <sys/sysmacros.h>
-#include <sys/mntent.h>
-#include <sys/u8_textprep.h>
-#include <sys/dsl_dataset.h>
-#include <sys/vfs.h>
-#include <sys/vnode.h>
-#include <sys/file.h>
-#include <sys/kmem.h>
-#include <sys/errno.h>
-#include <sys/mode.h>
-#include <sys/atomic.h>
-#include <sys/zfs_dir.h>
-#include <sys/zfs_acl.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/zfs_rlock.h>
-#include <sys/zfs_fuid.h>
-#include <sys/zfs_vnops.h>
-#include <sys/zfs_ctldir.h>
-#include <sys/dnode.h>
-#include <sys/fs/zfs.h>
-#include <sys/zpl.h>
-#endif /* _KERNEL */
-
-#include <sys/dmu.h>
-#include <sys/dmu_objset.h>
-#include <sys/dmu_tx.h>
-#include <sys/refcount.h>
-#include <sys/stat.h>
-#include <sys/zap.h>
-#include <sys/zfs_znode.h>
-#include <sys/sa.h>
-#include <sys/zfs_sa.h>
-#include <sys/zfs_stat.h>
-
-#include "zfs_prop.h"
-#include "zfs_comutil.h"
-
-/*
- * Functions needed for userland (ie: libzpool) are not put under
- * #ifdef_KERNEL; the rest of the functions have dependencies
- * (such as VFS logic) that will not compile easily in userland.
- */
-#ifdef _KERNEL
-
-static kmem_cache_t *znode_cache = NULL;
-static kmem_cache_t *znode_hold_cache = NULL;
-unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ;
-
-/*
- * This is used by the test suite so that it can delay znodes from being
- * freed in order to inspect the unlinked set.
- */
-int zfs_unlink_suspend_progress = 0;
-
-/*
- * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on
- * z_rangelock. It will modify the offset and length of the lock to reflect
- * znode-specific information, and convert RL_APPEND to RL_WRITER. This is
- * called with the rangelock_t's rl_lock held, which avoids races.
- */
-static void
-zfs_rangelock_cb(locked_range_t *new, void *arg)
-{
- znode_t *zp = arg;
-
- /*
- * If in append mode, convert to writer and lock starting at the
- * current end of file.
- */
- if (new->lr_type == RL_APPEND) {
- new->lr_offset = zp->z_size;
- new->lr_type = RL_WRITER;
- }
-
- /*
- * If we need to grow the block size then lock the whole file range.
- */
- uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length);
- if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
- zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) {
- new->lr_offset = 0;
- new->lr_length = UINT64_MAX;
- }
-}
-
-/*ARGSUSED*/
-static int
-zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
-{
- znode_t *zp = buf;
-
- inode_init_once(ZTOI(zp));
- list_link_init(&zp->z_link_node);
-
- mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
- rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
- rw_init(&zp->z_name_lock, NULL, RW_NOLOCKDEP, NULL);
- mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
- rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL);
-
- rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp);
-
- zp->z_dirlocks = NULL;
- zp->z_acl_cached = NULL;
- zp->z_xattr_cached = NULL;
- zp->z_xattr_parent = 0;
- zp->z_moved = B_FALSE;
- return (0);
-}
-
-/*ARGSUSED*/
-static void
-zfs_znode_cache_destructor(void *buf, void *arg)
-{
- znode_t *zp = buf;
-
- ASSERT(!list_link_active(&zp->z_link_node));
- mutex_destroy(&zp->z_lock);
- rw_destroy(&zp->z_parent_lock);
- rw_destroy(&zp->z_name_lock);
- mutex_destroy(&zp->z_acl_lock);
- rw_destroy(&zp->z_xattr_lock);
- rangelock_fini(&zp->z_rangelock);
-
- ASSERT(zp->z_dirlocks == NULL);
- ASSERT(zp->z_acl_cached == NULL);
- ASSERT(zp->z_xattr_cached == NULL);
-}
-
-static int
-zfs_znode_hold_cache_constructor(void *buf, void *arg, int kmflags)
-{
- znode_hold_t *zh = buf;
-
- mutex_init(&zh->zh_lock, NULL, MUTEX_DEFAULT, NULL);
- zfs_refcount_create(&zh->zh_refcount);
- zh->zh_obj = ZFS_NO_OBJECT;
-
- return (0);
-}
-
-static void
-zfs_znode_hold_cache_destructor(void *buf, void *arg)
-{
- znode_hold_t *zh = buf;
-
- mutex_destroy(&zh->zh_lock);
- zfs_refcount_destroy(&zh->zh_refcount);
-}
-
-void
-zfs_znode_init(void)
-{
- /*
- * Initialize zcache. The KMC_SLAB hint is used in order that it be
- * backed by kmalloc() when on the Linux slab in order that any
- * wait_on_bit() operations on the related inode operate properly.
- */
- ASSERT(znode_cache == NULL);
- znode_cache = kmem_cache_create("zfs_znode_cache",
- sizeof (znode_t), 0, zfs_znode_cache_constructor,
- zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_SLAB);
-
- ASSERT(znode_hold_cache == NULL);
- znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache",
- sizeof (znode_hold_t), 0, zfs_znode_hold_cache_constructor,
- zfs_znode_hold_cache_destructor, NULL, NULL, NULL, 0);
-}
-
-void
-zfs_znode_fini(void)
-{
- /*
- * Cleanup zcache
- */
- if (znode_cache)
- kmem_cache_destroy(znode_cache);
- znode_cache = NULL;
-
- if (znode_hold_cache)
- kmem_cache_destroy(znode_hold_cache);
- znode_hold_cache = NULL;
-}
-
-/*
- * The zfs_znode_hold_enter() / zfs_znode_hold_exit() functions are used to
- * serialize access to a znode and its SA buffer while the object is being
- * created or destroyed. This kind of locking would normally reside in the
- * znode itself but in this case that's impossible because the znode and SA
- * buffer may not yet exist. Therefore the locking is handled externally
- * with an array of mutexs and AVLs trees which contain per-object locks.
- *
- * In zfs_znode_hold_enter() a per-object lock is created as needed, inserted
- * in to the correct AVL tree and finally the per-object lock is held. In
- * zfs_znode_hold_exit() the process is reversed. The per-object lock is
- * released, removed from the AVL tree and destroyed if there are no waiters.
- *
- * This scheme has two important properties:
- *
- * 1) No memory allocations are performed while holding one of the z_hold_locks.
- * This ensures evict(), which can be called from direct memory reclaim, will
- * never block waiting on a z_hold_locks which just happens to have hashed
- * to the same index.
- *
- * 2) All locks used to serialize access to an object are per-object and never
- * shared. This minimizes lock contention without creating a large number
- * of dedicated locks.
- *
- * On the downside it does require znode_lock_t structures to be frequently
- * allocated and freed. However, because these are backed by a kmem cache
- * and very short lived this cost is minimal.
- */
-int
-zfs_znode_hold_compare(const void *a, const void *b)
-{
- const znode_hold_t *zh_a = (const znode_hold_t *)a;
- const znode_hold_t *zh_b = (const znode_hold_t *)b;
-
- return (AVL_CMP(zh_a->zh_obj, zh_b->zh_obj));
-}
-
-boolean_t
-zfs_znode_held(zfsvfs_t *zfsvfs, uint64_t obj)
-{
- znode_hold_t *zh, search;
- int i = ZFS_OBJ_HASH(zfsvfs, obj);
- boolean_t held;
-
- search.zh_obj = obj;
-
- mutex_enter(&zfsvfs->z_hold_locks[i]);
- zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
- held = (zh && MUTEX_HELD(&zh->zh_lock)) ? B_TRUE : B_FALSE;
- mutex_exit(&zfsvfs->z_hold_locks[i]);
-
- return (held);
-}
-
-static znode_hold_t *
-zfs_znode_hold_enter(zfsvfs_t *zfsvfs, uint64_t obj)
-{
- znode_hold_t *zh, *zh_new, search;
- int i = ZFS_OBJ_HASH(zfsvfs, obj);
- boolean_t found = B_FALSE;
-
- zh_new = kmem_cache_alloc(znode_hold_cache, KM_SLEEP);
- zh_new->zh_obj = obj;
- search.zh_obj = obj;
-
- mutex_enter(&zfsvfs->z_hold_locks[i]);
- zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
- if (likely(zh == NULL)) {
- zh = zh_new;
- avl_add(&zfsvfs->z_hold_trees[i], zh);
- } else {
- ASSERT3U(zh->zh_obj, ==, obj);
- found = B_TRUE;
- }
- zfs_refcount_add(&zh->zh_refcount, NULL);
- mutex_exit(&zfsvfs->z_hold_locks[i]);
-
- if (found == B_TRUE)
- kmem_cache_free(znode_hold_cache, zh_new);
-
- ASSERT(MUTEX_NOT_HELD(&zh->zh_lock));
- ASSERT3S(zfs_refcount_count(&zh->zh_refcount), >, 0);
- mutex_enter(&zh->zh_lock);
-
- return (zh);
-}
-
-static void
-zfs_znode_hold_exit(zfsvfs_t *zfsvfs, znode_hold_t *zh)
-{
- int i = ZFS_OBJ_HASH(zfsvfs, zh->zh_obj);
- boolean_t remove = B_FALSE;
-
- ASSERT(zfs_znode_held(zfsvfs, zh->zh_obj));
- ASSERT3S(zfs_refcount_count(&zh->zh_refcount), >, 0);
- mutex_exit(&zh->zh_lock);
-
- mutex_enter(&zfsvfs->z_hold_locks[i]);
- if (zfs_refcount_remove(&zh->zh_refcount, NULL) == 0) {
- avl_remove(&zfsvfs->z_hold_trees[i], zh);
- remove = B_TRUE;
- }
- mutex_exit(&zfsvfs->z_hold_locks[i]);
-
- if (remove == B_TRUE)
- kmem_cache_free(znode_hold_cache, zh);
-}
-
-static void
-zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
- dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
-{
- ASSERT(zfs_znode_held(zfsvfs, zp->z_id));
-
- mutex_enter(&zp->z_lock);
-
- ASSERT(zp->z_sa_hdl == NULL);
- ASSERT(zp->z_acl_cached == NULL);
- if (sa_hdl == NULL) {
- VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
- SA_HDL_SHARED, &zp->z_sa_hdl));
- } else {
- zp->z_sa_hdl = sa_hdl;
- sa_set_userp(sa_hdl, zp);
- }
-
- zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;
-
- mutex_exit(&zp->z_lock);
-}
-
-void
-zfs_znode_dmu_fini(znode_t *zp)
-{
- ASSERT(zfs_znode_held(ZTOZSB(zp), zp->z_id) || zp->z_unlinked ||
- RW_WRITE_HELD(&ZTOZSB(zp)->z_teardown_inactive_lock));
-
- sa_handle_destroy(zp->z_sa_hdl);
- zp->z_sa_hdl = NULL;
-}
-
-/*
- * Called by new_inode() to allocate a new inode.
- */
-int
-zfs_inode_alloc(struct super_block *sb, struct inode **ip)
-{
- znode_t *zp;
-
- zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
- *ip = ZTOI(zp);
-
- return (0);
-}
-
-/*
- * Called in multiple places when an inode should be destroyed.
- */
-void
-zfs_inode_destroy(struct inode *ip)
-{
- znode_t *zp = ITOZ(ip);
- zfsvfs_t *zfsvfs = ZTOZSB(zp);
-
- mutex_enter(&zfsvfs->z_znodes_lock);
- if (list_link_active(&zp->z_link_node)) {
- list_remove(&zfsvfs->z_all_znodes, zp);
- zfsvfs->z_nr_znodes--;
- }
- mutex_exit(&zfsvfs->z_znodes_lock);
-
- if (zp->z_acl_cached) {
- zfs_acl_free(zp->z_acl_cached);
- zp->z_acl_cached = NULL;
- }
-
- if (zp->z_xattr_cached) {
- nvlist_free(zp->z_xattr_cached);
- zp->z_xattr_cached = NULL;
- }
-
- kmem_cache_free(znode_cache, zp);
-}
-
-static void
-zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip)
-{
- uint64_t rdev = 0;
-
- switch (ip->i_mode & S_IFMT) {
- case S_IFREG:
- ip->i_op = &zpl_inode_operations;
- ip->i_fop = &zpl_file_operations;
- ip->i_mapping->a_ops = &zpl_address_space_operations;
- break;
-
- case S_IFDIR:
- ip->i_op = &zpl_dir_inode_operations;
- ip->i_fop = &zpl_dir_file_operations;
- ITOZ(ip)->z_zn_prefetch = B_TRUE;
- break;
-
- case S_IFLNK:
- ip->i_op = &zpl_symlink_inode_operations;
- break;
-
- /*
- * rdev is only stored in a SA only for device files.
- */
- case S_IFCHR:
- case S_IFBLK:
- (void) sa_lookup(ITOZ(ip)->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), &rdev,
- sizeof (rdev));
- /*FALLTHROUGH*/
- case S_IFIFO:
- case S_IFSOCK:
- init_special_inode(ip, ip->i_mode, rdev);
- ip->i_op = &zpl_special_inode_operations;
- break;
-
- default:
- zfs_panic_recover("inode %llu has invalid mode: 0x%x\n",
- (u_longlong_t)ip->i_ino, ip->i_mode);
-
- /* Assume the inode is a file and attempt to continue */
- ip->i_mode = S_IFREG | 0644;
- ip->i_op = &zpl_inode_operations;
- ip->i_fop = &zpl_file_operations;
- ip->i_mapping->a_ops = &zpl_address_space_operations;
- break;
- }
-}
-
-void
-zfs_set_inode_flags(znode_t *zp, struct inode *ip)
-{
- /*
- * Linux and Solaris have different sets of file attributes, so we
- * restrict this conversion to the intersection of the two.
- */
-#ifdef HAVE_INODE_SET_FLAGS
- unsigned int flags = 0;
- if (zp->z_pflags & ZFS_IMMUTABLE)
- flags |= S_IMMUTABLE;
- if (zp->z_pflags & ZFS_APPENDONLY)
- flags |= S_APPEND;
-
- inode_set_flags(ip, flags, S_IMMUTABLE|S_APPEND);
-#else
- if (zp->z_pflags & ZFS_IMMUTABLE)
- ip->i_flags |= S_IMMUTABLE;
- else
- ip->i_flags &= ~S_IMMUTABLE;
-
- if (zp->z_pflags & ZFS_APPENDONLY)
- ip->i_flags |= S_APPEND;
- else
- ip->i_flags &= ~S_APPEND;
-#endif
-}
-
-/*
- * Update the embedded inode given the znode. We should work toward
- * eliminating this function as soon as possible by removing values
- * which are duplicated between the znode and inode. If the generic
- * inode has the correct field it should be used, and the ZFS code
- * updated to access the inode. This can be done incrementally.
- */
-void
-zfs_inode_update(znode_t *zp)
-{
- zfsvfs_t *zfsvfs;
- struct inode *ip;
- uint32_t blksize;
- u_longlong_t i_blocks;
-
- ASSERT(zp != NULL);
- zfsvfs = ZTOZSB(zp);
- ip = ZTOI(zp);
-
- /* Skip .zfs control nodes which do not exist on disk. */
- if (zfsctl_is_node(ip))
- return;
-
- dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &blksize, &i_blocks);
-
- spin_lock(&ip->i_lock);
- ip->i_blocks = i_blocks;
- i_size_write(ip, zp->z_size);
- spin_unlock(&ip->i_lock);
-}
-
-
-/*
- * Construct a znode+inode and initialize.
- *
- * This does not do a call to dmu_set_user() that is
- * up to the caller to do, in case you don't want to
- * return the znode
- */
-static znode_t *
-zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
- dmu_object_type_t obj_type, sa_handle_t *hdl)
-{
- znode_t *zp;
- struct inode *ip;
- uint64_t mode;
- uint64_t parent;
- uint64_t tmp_gen;
- uint64_t links;
- uint64_t z_uid, z_gid;
- uint64_t atime[2], mtime[2], ctime[2];
- uint64_t projid = ZFS_DEFAULT_PROJID;
- sa_bulk_attr_t bulk[11];
- int count = 0;
-
- ASSERT(zfsvfs != NULL);
-
- ip = new_inode(zfsvfs->z_sb);
- if (ip == NULL)
- return (NULL);
-
- zp = ITOZ(ip);
- ASSERT(zp->z_dirlocks == NULL);
- ASSERT3P(zp->z_acl_cached, ==, NULL);
- ASSERT3P(zp->z_xattr_cached, ==, NULL);
- zp->z_unlinked = B_FALSE;
- zp->z_atime_dirty = B_FALSE;
- zp->z_moved = B_FALSE;
- zp->z_is_mapped = B_FALSE;
- zp->z_is_ctldir = B_FALSE;
- zp->z_is_stale = B_FALSE;
- zp->z_suspended = B_FALSE;
- zp->z_sa_hdl = NULL;
- zp->z_mapcnt = 0;
- zp->z_id = db->db_object;
- zp->z_blksz = blksz;
- zp->z_seq = 0x7A4653;
- zp->z_sync_cnt = 0;
-
- zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
-
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &tmp_gen, 8);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
- &zp->z_size, 8);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
- &zp->z_pflags, 8);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
- &parent, 8);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &z_uid, 8);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &z_gid, 8);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
-
- if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || tmp_gen == 0 ||
- (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
- (zp->z_pflags & ZFS_PROJID) &&
- sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) {
- if (hdl == NULL)
- sa_handle_destroy(zp->z_sa_hdl);
- zp->z_sa_hdl = NULL;
- goto error;
- }
-
- zp->z_projid = projid;
- zp->z_mode = ip->i_mode = mode;
- ip->i_generation = (uint32_t)tmp_gen;
- ip->i_blkbits = SPA_MINBLOCKSHIFT;
- set_nlink(ip, (uint32_t)links);
- zfs_uid_write(ip, z_uid);
- zfs_gid_write(ip, z_gid);
- zfs_set_inode_flags(zp, ip);
-
- /* Cache the xattr parent id */
- if (zp->z_pflags & ZFS_XATTR)
- zp->z_xattr_parent = parent;
-
- ZFS_TIME_DECODE(&ip->i_atime, atime);
- ZFS_TIME_DECODE(&ip->i_mtime, mtime);
- ZFS_TIME_DECODE(&ip->i_ctime, ctime);
-
- ip->i_ino = zp->z_id;
- zfs_inode_update(zp);
- zfs_inode_set_ops(zfsvfs, ip);
-
- /*
- * The only way insert_inode_locked() can fail is if the ip->i_ino
- * number is already hashed for this super block. This can never
- * happen because the inode numbers map 1:1 with the object numbers.
- *
- * The one exception is rolling back a mounted file system, but in
- * this case all the active inode are unhashed during the rollback.
- */
- VERIFY3S(insert_inode_locked(ip), ==, 0);
-
- mutex_enter(&zfsvfs->z_znodes_lock);
- list_insert_tail(&zfsvfs->z_all_znodes, zp);
- zfsvfs->z_nr_znodes++;
- membar_producer();
- mutex_exit(&zfsvfs->z_znodes_lock);
-
- unlock_new_inode(ip);
- return (zp);
-
-error:
- iput(ip);
- return (NULL);
-}
-
-/*
- * Safely mark an inode dirty. Inodes which are part of a read-only
- * file system or snapshot may not be dirtied.
- */
-void
-zfs_mark_inode_dirty(struct inode *ip)
-{
- zfsvfs_t *zfsvfs = ITOZSB(ip);
-
- if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
- return;
-
- mark_inode_dirty(ip);
-}
-
-static uint64_t empty_xattr;
-static uint64_t pad[4];
-static zfs_acl_phys_t acl_phys;
-/*
- * Create a new DMU object to hold a zfs znode.
- *
- * IN: dzp - parent directory for new znode
- * vap - file attributes for new znode
- * tx - dmu transaction id for zap operations
- * cr - credentials of caller
- * flag - flags:
- * IS_ROOT_NODE - new object will be root
- * IS_TMPFILE - new object is of O_TMPFILE
- * IS_XATTR - new object is an attribute
- * acl_ids - ACL related attributes
- *
- * OUT: zpp - allocated znode (set to dzp if IS_ROOT_NODE)
- *
- */
-void
-zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
- uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
-{
- uint64_t crtime[2], atime[2], mtime[2], ctime[2];
- uint64_t mode, size, links, parent, pflags;
- uint64_t projid = ZFS_DEFAULT_PROJID;
- uint64_t rdev = 0;
- zfsvfs_t *zfsvfs = ZTOZSB(dzp);
- dmu_buf_t *db;
- inode_timespec_t now;
- uint64_t gen, obj;
- int bonuslen;
- int dnodesize;
- sa_handle_t *sa_hdl;
- dmu_object_type_t obj_type;
- sa_bulk_attr_t *sa_attrs;
- int cnt = 0;
- zfs_acl_locator_cb_t locate = { 0 };
- znode_hold_t *zh;
-
- if (zfsvfs->z_replay) {
- obj = vap->va_nodeid;
- now = vap->va_ctime; /* see zfs_replay_create() */
- gen = vap->va_nblocks; /* ditto */
- dnodesize = vap->va_fsid; /* ditto */
- } else {
- obj = 0;
- gethrestime(&now);
- gen = dmu_tx_get_txg(tx);
- dnodesize = dmu_objset_dnodesize(zfsvfs->z_os);
- }
-
- if (dnodesize == 0)
- dnodesize = DNODE_MIN_SIZE;
-
- obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;
-
- bonuslen = (obj_type == DMU_OT_SA) ?
- DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE;
-
- /*
- * Create a new DMU object.
- */
- /*
- * There's currently no mechanism for pre-reading the blocks that will
- * be needed to allocate a new object, so we accept the small chance
- * that there will be an i/o error and we will fail one of the
- * assertions below.
- */
- if (S_ISDIR(vap->va_mode)) {
- if (zfsvfs->z_replay) {
- VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj,
- zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
- obj_type, bonuslen, dnodesize, tx));
- } else {
- obj = zap_create_norm_dnsize(zfsvfs->z_os,
- zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
- obj_type, bonuslen, dnodesize, tx);
- }
- } else {
- if (zfsvfs->z_replay) {
- VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj,
- DMU_OT_PLAIN_FILE_CONTENTS, 0,
- obj_type, bonuslen, dnodesize, tx));
- } else {
- obj = dmu_object_alloc_dnsize(zfsvfs->z_os,
- DMU_OT_PLAIN_FILE_CONTENTS, 0,
- obj_type, bonuslen, dnodesize, tx);
- }
- }
-
- zh = zfs_znode_hold_enter(zfsvfs, obj);
- VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
-
- /*
- * If this is the root, fix up the half-initialized parent pointer
- * to reference the just-allocated physical data area.
- */
- if (flag & IS_ROOT_NODE) {
- dzp->z_id = obj;
- }
-
- /*
- * If parent is an xattr, so am I.
- */
- if (dzp->z_pflags & ZFS_XATTR) {
- flag |= IS_XATTR;
- }
-
- if (zfsvfs->z_use_fuids)
- pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
- else
- pflags = 0;
-
- if (S_ISDIR(vap->va_mode)) {
- size = 2; /* contents ("." and "..") */
- links = 2;
- } else {
- size = 0;
- links = (flag & IS_TMPFILE) ? 0 : 1;
- }
-
- if (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))
- rdev = vap->va_rdev;
-
- parent = dzp->z_id;
- mode = acl_ids->z_mode;
- if (flag & IS_XATTR)
- pflags |= ZFS_XATTR;
-
- if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) {
- /*
- * With ZFS_PROJID flag, we can easily know whether there is
- * project ID stored on disk or not. See zfs_space_delta_cb().
- */
- if (obj_type != DMU_OT_ZNODE &&
- dmu_objset_projectquota_enabled(zfsvfs->z_os))
- pflags |= ZFS_PROJID;
-
- /*
- * Inherit project ID from parent if required.
- */
- projid = zfs_inherit_projid(dzp);
- if (dzp->z_pflags & ZFS_PROJINHERIT)
- pflags |= ZFS_PROJINHERIT;
- }
-
- /*
- * No execs denied will be determined when zfs_mode_compute() is called.
- */
- pflags |= acl_ids->z_aclp->z_hints &
- (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
- ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);
-
- ZFS_TIME_ENCODE(&now, crtime);
- ZFS_TIME_ENCODE(&now, ctime);
-
- if (vap->va_mask & ATTR_ATIME) {
- ZFS_TIME_ENCODE(&vap->va_atime, atime);
- } else {
- ZFS_TIME_ENCODE(&now, atime);
- }
-
- if (vap->va_mask & ATTR_MTIME) {
- ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
- } else {
- ZFS_TIME_ENCODE(&now, mtime);
- }
-
- /* Now add in all of the "SA" attributes */
- VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
- &sa_hdl));
-
- /*
- * Setup the array of attributes to be replaced/set on the new file
- *
- * order for DMU_OT_ZNODE is critical since it needs to be constructed
- * in the old znode_phys_t format. Don't change this ordering
- */
- sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);
-
- if (obj_type == DMU_OT_ZNODE) {
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
- NULL, &atime, 16);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
- NULL, &mtime, 16);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
- NULL, &ctime, 16);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
- NULL, &crtime, 16);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
- NULL, &gen, 8);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
- NULL, &mode, 8);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
- NULL, &size, 8);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
- NULL, &parent, 8);
- } else {
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
- NULL, &mode, 8);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
- NULL, &size, 8);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
- NULL, &gen, 8);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs),
- NULL, &acl_ids->z_fuid, 8);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs),
- NULL, &acl_ids->z_fgid, 8);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
- NULL, &parent, 8);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
- NULL, &pflags, 8);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
- NULL, &atime, 16);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
- NULL, &mtime, 16);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
- NULL, &ctime, 16);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
- NULL, &crtime, 16);
- }
-
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
-
- if (obj_type == DMU_OT_ZNODE) {
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL,
- &empty_xattr, 8);
- } else if (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
- pflags & ZFS_PROJID) {
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PROJID(zfsvfs),
- NULL, &projid, 8);
- }
- if (obj_type == DMU_OT_ZNODE ||
- (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))) {
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs),
- NULL, &rdev, 8);
- }
- if (obj_type == DMU_OT_ZNODE) {
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
- NULL, &pflags, 8);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
- &acl_ids->z_fuid, 8);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
- &acl_ids->z_fgid, 8);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad,
- sizeof (uint64_t) * 4);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
- &acl_phys, sizeof (zfs_acl_phys_t));
- } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) {
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
- &acl_ids->z_aclp->z_acl_count, 8);
- locate.cb_aclp = acl_ids->z_aclp;
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs),
- zfs_acl_data_locator, &locate,
- acl_ids->z_aclp->z_acl_bytes);
- mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags,
- acl_ids->z_fuid, acl_ids->z_fgid);
- }
-
- VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);
-
- if (!(flag & IS_ROOT_NODE)) {
- /*
- * The call to zfs_znode_alloc() may fail if memory is low
- * via the call path: alloc_inode() -> inode_init_always() ->
- * security_inode_alloc() -> inode_alloc_security(). Since
- * the existing code is written such that zfs_mknode() can
- * not fail retry until sufficient memory has been reclaimed.
- */
- do {
- *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl);
- } while (*zpp == NULL);
-
- VERIFY(*zpp != NULL);
- VERIFY(dzp != NULL);
- } else {
- /*
- * If we are creating the root node, the "parent" we
- * passed in is the znode for the root.
- */
- *zpp = dzp;
-
- (*zpp)->z_sa_hdl = sa_hdl;
- }
-
- (*zpp)->z_pflags = pflags;
- (*zpp)->z_mode = ZTOI(*zpp)->i_mode = mode;
- (*zpp)->z_dnodesize = dnodesize;
- (*zpp)->z_projid = projid;
-
- if (obj_type == DMU_OT_ZNODE ||
- acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
- VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
- }
- kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
- zfs_znode_hold_exit(zfsvfs, zh);
-}
-
-/*
- * Update in-core attributes. It is assumed the caller will be doing an
- * sa_bulk_update to push the changes out.
- */
-void
-zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
-{
- xoptattr_t *xoap;
- boolean_t update_inode = B_FALSE;
-
- xoap = xva_getxoptattr(xvap);
- ASSERT(xoap);
-
- if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
- uint64_t times[2];
- ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
- (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)),
- &times, sizeof (times), tx);
- XVA_SET_RTN(xvap, XAT_CREATETIME);
- }
- if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
- ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly,
- zp->z_pflags, tx);
- XVA_SET_RTN(xvap, XAT_READONLY);
- }
- if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
- ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden,
- zp->z_pflags, tx);
- XVA_SET_RTN(xvap, XAT_HIDDEN);
- }
- if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
- ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system,
- zp->z_pflags, tx);
- XVA_SET_RTN(xvap, XAT_SYSTEM);
- }
- if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
- ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive,
- zp->z_pflags, tx);
- XVA_SET_RTN(xvap, XAT_ARCHIVE);
- }
- if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
- ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
- zp->z_pflags, tx);
- XVA_SET_RTN(xvap, XAT_IMMUTABLE);
-
- update_inode = B_TRUE;
- }
- if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
- ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
- zp->z_pflags, tx);
- XVA_SET_RTN(xvap, XAT_NOUNLINK);
- }
- if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
- ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
- zp->z_pflags, tx);
- XVA_SET_RTN(xvap, XAT_APPENDONLY);
-
- update_inode = B_TRUE;
- }
- if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
- ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
- zp->z_pflags, tx);
- XVA_SET_RTN(xvap, XAT_NODUMP);
- }
- if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
- ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
- zp->z_pflags, tx);
- XVA_SET_RTN(xvap, XAT_OPAQUE);
- }
- if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
- ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
- xoap->xoa_av_quarantined, zp->z_pflags, tx);
- XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
- }
- if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
- ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
- zp->z_pflags, tx);
- XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
- }
- if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
- zfs_sa_set_scanstamp(zp, xvap, tx);
- XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
- }
- if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
- ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
- zp->z_pflags, tx);
- XVA_SET_RTN(xvap, XAT_REPARSE);
- }
- if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
- ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
- zp->z_pflags, tx);
- XVA_SET_RTN(xvap, XAT_OFFLINE);
- }
- if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
- ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
- zp->z_pflags, tx);
- XVA_SET_RTN(xvap, XAT_SPARSE);
- }
- if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
- ZFS_ATTR_SET(zp, ZFS_PROJINHERIT, xoap->xoa_projinherit,
- zp->z_pflags, tx);
- XVA_SET_RTN(xvap, XAT_PROJINHERIT);
- }
-
- if (update_inode)
- zfs_set_inode_flags(zp, ZTOI(zp));
-}
-
-int
-zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
-{
- dmu_object_info_t doi;
- dmu_buf_t *db;
- znode_t *zp;
- znode_hold_t *zh;
- int err;
- sa_handle_t *hdl;
-
- *zpp = NULL;
-
-again:
- zh = zfs_znode_hold_enter(zfsvfs, obj_num);
-
- err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
- if (err) {
- zfs_znode_hold_exit(zfsvfs, zh);
- return (err);
- }
-
- dmu_object_info_from_db(db, &doi);
- if (doi.doi_bonus_type != DMU_OT_SA &&
- (doi.doi_bonus_type != DMU_OT_ZNODE ||
- (doi.doi_bonus_type == DMU_OT_ZNODE &&
- doi.doi_bonus_size < sizeof (znode_phys_t)))) {
- sa_buf_rele(db, NULL);
- zfs_znode_hold_exit(zfsvfs, zh);
- return (SET_ERROR(EINVAL));
- }
-
- hdl = dmu_buf_get_user(db);
- if (hdl != NULL) {
- zp = sa_get_userdata(hdl);
-
-
- /*
- * Since "SA" does immediate eviction we
- * should never find a sa handle that doesn't
- * know about the znode.
- */
-
- ASSERT3P(zp, !=, NULL);
-
- mutex_enter(&zp->z_lock);
- ASSERT3U(zp->z_id, ==, obj_num);
- /*
- * If igrab() returns NULL the VFS has independently
- * determined the inode should be evicted and has
- * called iput_final() to start the eviction process.
- * The SA handle is still valid but because the VFS
- * requires that the eviction succeed we must drop
- * our locks and references to allow the eviction to
- * complete. The zfs_zget() may then be retried.
- *
- * This unlikely case could be optimized by registering
- * a sops->drop_inode() callback. The callback would
- * need to detect the active SA hold thereby informing
- * the VFS that this inode should not be evicted.
- */
- if (igrab(ZTOI(zp)) == NULL) {
- mutex_exit(&zp->z_lock);
- sa_buf_rele(db, NULL);
- zfs_znode_hold_exit(zfsvfs, zh);
- /* inode might need this to finish evict */
- cond_resched();
- goto again;
- }
- *zpp = zp;
- err = 0;
- mutex_exit(&zp->z_lock);
- sa_buf_rele(db, NULL);
- zfs_znode_hold_exit(zfsvfs, zh);
- return (err);
- }
-
- /*
- * Not found create new znode/vnode but only if file exists.
- *
- * There is a small window where zfs_vget() could
- * find this object while a file create is still in
- * progress. This is checked for in zfs_znode_alloc()
- *
- * if zfs_znode_alloc() fails it will drop the hold on the
- * bonus buffer.
- */
- zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
- doi.doi_bonus_type, NULL);
- if (zp == NULL) {
- err = SET_ERROR(ENOENT);
- } else {
- *zpp = zp;
- }
- zfs_znode_hold_exit(zfsvfs, zh);
- return (err);
-}
-
-int
-zfs_rezget(znode_t *zp)
-{
- zfsvfs_t *zfsvfs = ZTOZSB(zp);
- dmu_object_info_t doi;
- dmu_buf_t *db;
- uint64_t obj_num = zp->z_id;
- uint64_t mode;
- uint64_t links;
- sa_bulk_attr_t bulk[10];
- int err;
- int count = 0;
- uint64_t gen;
- uint64_t z_uid, z_gid;
- uint64_t atime[2], mtime[2], ctime[2];
- uint64_t projid = ZFS_DEFAULT_PROJID;
- znode_hold_t *zh;
-
- /*
- * skip ctldir, otherwise they will always get invalidated. This will
- * cause funny behaviour for the mounted snapdirs. Especially for
- * Linux >= 3.18, d_invalidate will detach the mountpoint and prevent
- * anyone automount it again as long as someone is still using the
- * detached mount.
- */
- if (zp->z_is_ctldir)
- return (0);
-
- zh = zfs_znode_hold_enter(zfsvfs, obj_num);
-
- mutex_enter(&zp->z_acl_lock);
- if (zp->z_acl_cached) {
- zfs_acl_free(zp->z_acl_cached);
- zp->z_acl_cached = NULL;
- }
- mutex_exit(&zp->z_acl_lock);
-
- rw_enter(&zp->z_xattr_lock, RW_WRITER);
- if (zp->z_xattr_cached) {
- nvlist_free(zp->z_xattr_cached);
- zp->z_xattr_cached = NULL;
- }
- rw_exit(&zp->z_xattr_lock);
-
- ASSERT(zp->z_sa_hdl == NULL);
- err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
- if (err) {
- zfs_znode_hold_exit(zfsvfs, zh);
- return (err);
- }
-
- dmu_object_info_from_db(db, &doi);
- if (doi.doi_bonus_type != DMU_OT_SA &&
- (doi.doi_bonus_type != DMU_OT_ZNODE ||
- (doi.doi_bonus_type == DMU_OT_ZNODE &&
- doi.doi_bonus_size < sizeof (znode_phys_t)))) {
- sa_buf_rele(db, NULL);
- zfs_znode_hold_exit(zfsvfs, zh);
- return (SET_ERROR(EINVAL));
- }
-
- zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL);
-
- /* reload cached values */
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
- &gen, sizeof (gen));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
- &zp->z_size, sizeof (zp->z_size));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
- &links, sizeof (links));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
- &zp->z_pflags, sizeof (zp->z_pflags));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
- &z_uid, sizeof (z_uid));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
- &z_gid, sizeof (z_gid));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
- &mode, sizeof (mode));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
- &atime, 16);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
- &mtime, 16);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
- &ctime, 16);
-
- if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
- zfs_znode_dmu_fini(zp);
- zfs_znode_hold_exit(zfsvfs, zh);
- return (SET_ERROR(EIO));
- }
-
- if (dmu_objset_projectquota_enabled(zfsvfs->z_os)) {
- err = sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs),
- &projid, 8);
- if (err != 0 && err != ENOENT) {
- zfs_znode_dmu_fini(zp);
- zfs_znode_hold_exit(zfsvfs, zh);
- return (SET_ERROR(err));
- }
- }
-
- zp->z_projid = projid;
- zp->z_mode = ZTOI(zp)->i_mode = mode;
- zfs_uid_write(ZTOI(zp), z_uid);
- zfs_gid_write(ZTOI(zp), z_gid);
-
- ZFS_TIME_DECODE(&ZTOI(zp)->i_atime, atime);
- ZFS_TIME_DECODE(&ZTOI(zp)->i_mtime, mtime);
- ZFS_TIME_DECODE(&ZTOI(zp)->i_ctime, ctime);
-
- if ((uint32_t)gen != ZTOI(zp)->i_generation) {
- zfs_znode_dmu_fini(zp);
- zfs_znode_hold_exit(zfsvfs, zh);
- return (SET_ERROR(EIO));
- }
-
- set_nlink(ZTOI(zp), (uint32_t)links);
- zfs_set_inode_flags(zp, ZTOI(zp));
-
- zp->z_blksz = doi.doi_data_block_size;
- zp->z_atime_dirty = B_FALSE;
- zfs_inode_update(zp);
-
- /*
- * If the file has zero links, then it has been unlinked on the send
- * side and it must be in the received unlinked set.
- * We call zfs_znode_dmu_fini() now to prevent any accesses to the
- * stale data and to prevent automatic removal of the file in
- * zfs_zinactive(). The file will be removed either when it is removed
- * on the send side and the next incremental stream is received or
- * when the unlinked set gets processed.
- */
- zp->z_unlinked = (ZTOI(zp)->i_nlink == 0);
- if (zp->z_unlinked)
- zfs_znode_dmu_fini(zp);
-
- zfs_znode_hold_exit(zfsvfs, zh);
-
- return (0);
-}
-
-void
-zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
-{
- zfsvfs_t *zfsvfs = ZTOZSB(zp);
- objset_t *os = zfsvfs->z_os;
- uint64_t obj = zp->z_id;
- uint64_t acl_obj = zfs_external_acl(zp);
- znode_hold_t *zh;
-
- zh = zfs_znode_hold_enter(zfsvfs, obj);
- if (acl_obj) {
- VERIFY(!zp->z_is_sa);
- VERIFY(0 == dmu_object_free(os, acl_obj, tx));
- }
- VERIFY(0 == dmu_object_free(os, obj, tx));
- zfs_znode_dmu_fini(zp);
- zfs_znode_hold_exit(zfsvfs, zh);
-}
-
-void
-zfs_zinactive(znode_t *zp)
-{
- zfsvfs_t *zfsvfs = ZTOZSB(zp);
- uint64_t z_id = zp->z_id;
- znode_hold_t *zh;
-
- ASSERT(zp->z_sa_hdl);
-
- /*
- * Don't allow a zfs_zget() while were trying to release this znode.
- */
- zh = zfs_znode_hold_enter(zfsvfs, z_id);
-
- mutex_enter(&zp->z_lock);
-
- /*
- * If this was the last reference to a file with no links, remove
- * the file from the file system unless the file system is mounted
- * read-only. That can happen, for example, if the file system was
- * originally read-write, the file was opened, then unlinked and
- * the file system was made read-only before the file was finally
- * closed. The file will remain in the unlinked set.
- */
- if (zp->z_unlinked) {
- ASSERT(!zfsvfs->z_issnap);
- if (!zfs_is_readonly(zfsvfs) && !zfs_unlink_suspend_progress) {
- mutex_exit(&zp->z_lock);
- zfs_znode_hold_exit(zfsvfs, zh);
- zfs_rmnode(zp);
- return;
- }
- }
-
- mutex_exit(&zp->z_lock);
- zfs_znode_dmu_fini(zp);
-
- zfs_znode_hold_exit(zfsvfs, zh);
-}
-
-#if defined(HAVE_INODE_TIMESPEC64_TIMES)
-#define zfs_compare_timespec timespec64_compare
-#else
-#define zfs_compare_timespec timespec_compare
-#endif
-
-/*
- * Determine whether the znode's atime must be updated. The logic mostly
- * duplicates the Linux kernel's relatime_need_update() functionality.
- * This function is only called if the underlying filesystem actually has
- * atime updates enabled.
- */
-boolean_t
-zfs_relatime_need_update(const struct inode *ip)
-{
- inode_timespec_t now;
-
- gethrestime(&now);
- /*
- * In relatime mode, only update the atime if the previous atime
- * is earlier than either the ctime or mtime or if at least a day
- * has passed since the last update of atime.
- */
- if (zfs_compare_timespec(&ip->i_mtime, &ip->i_atime) >= 0)
- return (B_TRUE);
-
- if (zfs_compare_timespec(&ip->i_ctime, &ip->i_atime) >= 0)
- return (B_TRUE);
-
- if ((hrtime_t)now.tv_sec - (hrtime_t)ip->i_atime.tv_sec >= 24*60*60)
- return (B_TRUE);
-
- return (B_FALSE);
-}
-
-/*
- * Prepare to update znode time stamps.
- *
- * IN: zp - znode requiring timestamp update
- * flag - ATTR_MTIME, ATTR_CTIME flags
- *
- * OUT: zp - z_seq
- * mtime - new mtime
- * ctime - new ctime
- *
- * Note: We don't update atime here, because we rely on Linux VFS to do
- * atime updating.
- */
-void
-zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
- uint64_t ctime[2])
-{
- inode_timespec_t now;
-
- gethrestime(&now);
-
- zp->z_seq++;
-
- if (flag & ATTR_MTIME) {
- ZFS_TIME_ENCODE(&now, mtime);
- ZFS_TIME_DECODE(&(ZTOI(zp)->i_mtime), mtime);
- if (ZTOZSB(zp)->z_use_fuids) {
- zp->z_pflags |= (ZFS_ARCHIVE |
- ZFS_AV_MODIFIED);
- }
- }
-
- if (flag & ATTR_CTIME) {
- ZFS_TIME_ENCODE(&now, ctime);
- ZFS_TIME_DECODE(&(ZTOI(zp)->i_ctime), ctime);
- if (ZTOZSB(zp)->z_use_fuids)
- zp->z_pflags |= ZFS_ARCHIVE;
- }
-}
-
-/*
- * Grow the block size for a file.
- *
- * IN: zp - znode of file to free data in.
- * size - requested block size
- * tx - open transaction.
- *
- * NOTE: this function assumes that the znode is write locked.
- */
-void
-zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
-{
- int error;
- u_longlong_t dummy;
-
- if (size <= zp->z_blksz)
- return;
- /*
- * If the file size is already greater than the current blocksize,
- * we will not grow. If there is more than one block in a file,
- * the blocksize cannot change.
- */
- if (zp->z_blksz && zp->z_size > zp->z_blksz)
- return;
-
- error = dmu_object_set_blocksize(ZTOZSB(zp)->z_os, zp->z_id,
- size, 0, tx);
-
- if (error == ENOTSUP)
- return;
- ASSERT0(error);
-
- /* What blocksize did we actually get? */
- dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy);
-}
-
-/*
- * Increase the file length
- *
- * IN: zp - znode of file to free data in.
- * end - new end-of-file
- *
- * RETURN: 0 on success, error code on failure
- */
static int
zfs_extend(znode_t *zp, uint64_t end)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	dmu_tx_t *tx;
	locked_range_t *lr;
	uint64_t newblksz;
	int error;

	/*
	 * We will change zp_size, lock the whole file.
	 */
	lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);

	/*
	 * Nothing to do if file already at desired length.
	 */
	if (end <= zp->z_size) {
		rangelock_exit(lr);
		return (0);
	}
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	if (end > zp->z_blksz &&
	    (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
		/*
		 * We are growing the file past the current block size.
		 */
		if (zp->z_blksz > ZTOZSB(zp)->z_max_blksz) {
			/*
			 * File's blocksize is already larger than the
			 * "recordsize" property. Only let it grow to
			 * the next power of 2.
			 */
			ASSERT(!ISP2(zp->z_blksz));
			newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
		} else {
			newblksz = MIN(end, ZTOZSB(zp)->z_max_blksz);
		}
		/* Reserve write space for the block-size change. */
		dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
	} else {
		newblksz = 0;
	}

	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		rangelock_exit(lr);
		return (error);
	}

	if (newblksz)
		zfs_grow_blocksize(zp, newblksz, tx);

	/* Publish the new logical size, then persist it via the SA. */
	zp->z_size = end;

	VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)),
	    &zp->z_size, sizeof (zp->z_size), tx));

	rangelock_exit(lr);

	dmu_tx_commit(tx);

	return (0);
}
-
-/*
- * zfs_zero_partial_page - Modeled after update_pages() but
- * with different arguments and semantics for use by zfs_freesp().
- *
- * Zeroes a piece of a single page cache entry for zp at offset
- * start and length len.
- *
- * Caller must acquire a range lock on the file for the region
- * being zeroed in order that the ARC and page cache stay in sync.
- */
static void
zfs_zero_partial_page(znode_t *zp, uint64_t start, uint64_t len)
{
	struct address_space *mp = ZTOI(zp)->i_mapping;
	struct page *pp;
	int64_t off;
	void *pb;

	/* The range must not cross a page boundary. */
	ASSERT((start & PAGE_MASK) == ((start + len - 1) & PAGE_MASK));

	off = start & (PAGE_SIZE - 1);
	start &= PAGE_MASK;

	/* Only touch pages already present in the page cache. */
	pp = find_lock_page(mp, start >> PAGE_SHIFT);
	if (pp) {
		/* Flush before and after in case of writable user mappings. */
		if (mapping_writably_mapped(mp))
			flush_dcache_page(pp);

		pb = kmap(pp);
		bzero(pb + off, len);
		kunmap(pp);

		if (mapping_writably_mapped(mp))
			flush_dcache_page(pp);

		mark_page_accessed(pp);
		SetPageUptodate(pp);
		ClearPageError(pp);
		unlock_page(pp);
		put_page(pp);
	}
}
-
-/*
- * Free space in a file.
- *
- * IN: zp - znode of file to free data in.
- * off - start of section to free.
- * len - length of section to free.
- *
- * RETURN: 0 on success, error code on failure
- */
static int
zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	locked_range_t *lr;
	int error;

	/*
	 * Lock the range being freed.
	 */
	lr = rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER);

	/*
	 * Nothing to do if file already at desired length.
	 */
	if (off >= zp->z_size) {
		rangelock_exit(lr);
		return (0);
	}

	/* Clamp the range to EOF. */
	if (off + len > zp->z_size)
		len = zp->z_size - off;

	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);

	/*
	 * Zero partial page cache entries. This must be done under a
	 * range lock in order to keep the ARC and page cache in sync.
	 */
	if (zp->z_is_mapped) {
		loff_t first_page, last_page, page_len;
		loff_t first_page_offset, last_page_offset;

		/* first possible full page in hole */
		first_page = (off + PAGE_SIZE - 1) >> PAGE_SHIFT;
		/* last page of hole */
		last_page = (off + len) >> PAGE_SHIFT;

		/* offset of first_page */
		first_page_offset = first_page << PAGE_SHIFT;
		/* offset of last_page */
		last_page_offset = last_page << PAGE_SHIFT;

		/* truncate whole pages */
		if (last_page_offset > first_page_offset) {
			truncate_inode_pages_range(ZTOI(zp)->i_mapping,
			    first_page_offset, last_page_offset - 1);
		}

		/* truncate sub-page ranges */
		if (first_page > last_page) {
			/* entire punched area within a single page */
			zfs_zero_partial_page(zp, off, len);
		} else {
			/* beginning of punched area at the end of a page */
			page_len = first_page_offset - off;
			if (page_len > 0)
				zfs_zero_partial_page(zp, off, page_len);

			/* end of punched area at the beginning of a page */
			page_len = off + len - last_page_offset;
			if (page_len > 0)
				zfs_zero_partial_page(zp, last_page_offset,
				    page_len);
		}
	}
	rangelock_exit(lr);

	return (error);
}
-
-/*
- * Truncate a file
- *
- * IN: zp - znode of file to free data in.
- * end - new end-of-file.
- *
- * RETURN: 0 on success, error code on failure
- */
static int
zfs_trunc(znode_t *zp, uint64_t end)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	dmu_tx_t *tx;
	locked_range_t *lr;
	int error;
	sa_bulk_attr_t bulk[2];
	int count = 0;

	/*
	 * We will change zp_size, lock the whole file.
	 */
	lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);

	/*
	 * Nothing to do if file already at desired length.
	 */
	if (end >= zp->z_size) {
		rangelock_exit(lr);
		return (0);
	}

	/* Free everything beyond the new EOF before updating metadata. */
	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,
	    DMU_OBJECT_END);
	if (error) {
		rangelock_exit(lr);
		return (error);
	}
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	/* Mark the tx as freeing space so it can always be assigned. */
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		rangelock_exit(lr);
		return (error);
	}

	zp->z_size = end;
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
	    NULL, &zp->z_size, sizeof (zp->z_size));

	if (end == 0) {
		/* An empty file can no longer be sparse. */
		zp->z_pflags &= ~ZFS_SPARSE;
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
		    NULL, &zp->z_pflags, 8);
	}
	VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);

	dmu_tx_commit(tx);
	rangelock_exit(lr);

	return (0);
}
-
-/*
- * Free space in a file
- *
- * IN: zp - znode of file to free data in.
- * off - start of range
- * len - end of range (0 => EOF)
- * flag - current file open mode flags.
- * log - TRUE if this action should be logged
- *
- * RETURN: 0 on success, error code on failure
- */
int
zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
{
	dmu_tx_t *tx;
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	zilog_t *zilog = zfsvfs->z_log;
	uint64_t mode;
	uint64_t mtime[2], ctime[2];
	sa_bulk_attr_t bulk[3];
	int count = 0;
	int error;

	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode,
	    sizeof (mode))) != 0)
		return (error);

	/* A range entirely past EOF can only extend the file. */
	if (off > zp->z_size) {
		error = zfs_extend(zp, off+len);
		if (error == 0 && log)
			goto log;
		goto out;
	}

	if (len == 0) {
		/* len == 0 means truncate-to-off. */
		error = zfs_trunc(zp, off);
	} else {
		/* Punch a hole; extend if the hole reaches past EOF. */
		if ((error = zfs_free_range(zp, off, len)) == 0 &&
		    off + len > zp->z_size)
			error = zfs_extend(zp, off+len);
	}
	if (error || !log)
		goto out;
log:
	/* Record updated timestamps/flags and the ZIL entry in a new tx. */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		goto out;
	}

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
	    NULL, &zp->z_pflags, 8);
	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
	ASSERT(error == 0);

	zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);

	dmu_tx_commit(tx);

	zfs_inode_update(zp);
	error = 0;

out:
	/*
	 * Truncate the page cache - for file truncate operations, use
	 * the purpose-built API for truncations. For punching operations,
	 * the truncation is handled under a range lock in zfs_free_range.
	 */
	if (len == 0)
		truncate_setsize(ZTOI(zp), off);
	return (error);
}
-
/*
 * Create the on-disk skeleton of a new filesystem: master node, SA
 * registration, delete queue, and root directory znode.  Runs inside the
 * caller's open transaction against a brand-new (empty) objset.
 */
void
zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
{
	struct super_block *sb;
	zfsvfs_t *zfsvfs;
	uint64_t moid, obj, sa_obj, version;
	uint64_t sense = ZFS_CASE_SENSITIVE;
	uint64_t norm = 0;
	nvpair_t *elem;
	int size;
	int error;
	int i;
	znode_t *rootzp = NULL;
	vattr_t vattr;
	znode_t *zp;
	zfs_acl_ids_t acl_ids;

	/*
	 * First attempt to create master node.
	 */
	/*
	 * In an empty objset, there are no blocks to read and thus
	 * there can be no i/o errors (which we assert below).
	 */
	moid = MASTER_NODE_OBJ;
	error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
	    DMU_OT_NONE, 0, tx);
	ASSERT(error == 0);

	/*
	 * Set starting attributes.
	 */
	version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
	elem = NULL;
	while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
		/* For the moment we expect all zpl props to be uint64_ts */
		uint64_t val;
		char *name;

		ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
		VERIFY(nvpair_value_uint64(elem, &val) == 0);
		name = nvpair_name(elem);
		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
			/* version is clamped, not stored via zap_update */
			if (val < version)
				version = val;
		} else {
			error = zap_update(os, moid, name, 8, 1, &val, tx);
		}
		ASSERT(error == 0);
		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
			norm = val;
		else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
			sense = val;
	}
	ASSERT(version != 0);
	error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);

	/*
	 * Create zap object used for SA attribute registration
	 */

	if (version >= ZPL_VERSION_SA) {
		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
		    DMU_OT_NONE, 0, tx);
		error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
		ASSERT(error == 0);
	} else {
		sa_obj = 0;
	}
	/*
	 * Create a delete queue.
	 */
	obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);

	error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
	ASSERT(error == 0);

	/*
	 * Create root znode. Create minimal znode/inode/zfsvfs/sb
	 * to allow zfs_mknode to work.
	 */
	vattr.va_mask = ATTR_MODE|ATTR_UID|ATTR_GID;
	vattr.va_mode = S_IFDIR|0755;
	vattr.va_uid = crgetuid(cr);
	vattr.va_gid = crgetgid(cr);

	rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
	rootzp->z_unlinked = B_FALSE;
	rootzp->z_atime_dirty = B_FALSE;
	rootzp->z_moved = B_FALSE;
	rootzp->z_is_sa = USE_SA(version, os);
	rootzp->z_pflags = 0;

	/* Temporary zfsvfs/super_block so zfs_mknode() has a context. */
	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
	zfsvfs->z_os = os;
	zfsvfs->z_parent = zfsvfs;
	zfsvfs->z_version = version;
	zfsvfs->z_use_fuids = USE_FUIDS(version, os);
	zfsvfs->z_use_sa = USE_SA(version, os);
	zfsvfs->z_norm = norm;

	sb = kmem_zalloc(sizeof (struct super_block), KM_SLEEP);
	sb->s_fs_info = zfsvfs;

	ZTOI(rootzp)->i_sb = sb;

	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
	    &zfsvfs->z_attr_table);

	ASSERT(error == 0);

	/*
	 * Fold case on file systems that are always or sometimes case
	 * insensitive.
	 */
	if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;

	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
	    offsetof(znode_t, z_link_node));

	size = MIN(1 << (highbit64(zfs_object_mutex_size)-1), ZFS_OBJ_MTX_MAX);
	zfsvfs->z_hold_size = size;
	zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size,
	    KM_SLEEP);
	zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP);
	for (i = 0; i != size; i++) {
		avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare,
		    sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node));
		mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL);
	}

	VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
	    cr, NULL, &acl_ids));
	zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
	ASSERT3P(zp, ==, rootzp);
	error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
	ASSERT(error == 0);
	zfs_acl_ids_free(&acl_ids);

	/* Tear down the temporary root znode and zfsvfs scaffolding. */
	atomic_set(&ZTOI(rootzp)->i_count, 0);
	sa_handle_destroy(rootzp->z_sa_hdl);
	kmem_cache_free(znode_cache, rootzp);

	for (i = 0; i != size; i++) {
		avl_destroy(&zfsvfs->z_hold_trees[i]);
		mutex_destroy(&zfsvfs->z_hold_locks[i]);
	}

	mutex_destroy(&zfsvfs->z_znodes_lock);

	vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size);
	vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size);
	kmem_free(sb, sizeof (struct super_block));
	kmem_free(zfsvfs, sizeof (zfsvfs_t));
}
-#endif /* _KERNEL */
-
-static int
-zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table)
-{
- uint64_t sa_obj = 0;
- int error;
-
- error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj);
- if (error != 0 && error != ENOENT)
- return (error);
-
- error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table);
- return (error);
-}
-
-static int
-zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp,
- dmu_buf_t **db, void *tag)
-{
- dmu_object_info_t doi;
- int error;
-
- if ((error = sa_buf_hold(osp, obj, tag, db)) != 0)
- return (error);
-
- dmu_object_info_from_db(*db, &doi);
- if ((doi.doi_bonus_type != DMU_OT_SA &&
- doi.doi_bonus_type != DMU_OT_ZNODE) ||
- (doi.doi_bonus_type == DMU_OT_ZNODE &&
- doi.doi_bonus_size < sizeof (znode_phys_t))) {
- sa_buf_rele(*db, tag);
- return (SET_ERROR(ENOTSUP));
- }
-
- error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp);
- if (error != 0) {
- sa_buf_rele(*db, tag);
- return (error);
- }
-
- return (0);
-}
-
/*
 * Release a handle/buffer pair obtained from zfs_grab_sa_handle().
 * The handle must be destroyed before its backing buffer is released.
 */
void
zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag)
{
	sa_handle_destroy(hdl);
	sa_buf_rele(db, tag);
}
-
-/*
- * Given an object number, return its parent object number and whether
- * or not the object is an extended attribute directory.
- */
static int
zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table,
    uint64_t *pobjp, int *is_xattrdir)
{
	uint64_t parent;
	uint64_t pflags;
	uint64_t mode;
	uint64_t parent_mode;
	sa_bulk_attr_t bulk[3];
	sa_handle_t *sa_hdl;
	dmu_buf_t *sa_db;
	int count = 0;
	int error;

	/* Fetch parent pointer, flags, and mode in one bulk lookup. */
	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL,
	    &parent, sizeof (parent));
	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL,
	    &pflags, sizeof (pflags));
	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
	    &mode, sizeof (mode));

	if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0)
		return (error);

	/*
	 * When a link is removed its parent pointer is not changed and will
	 * be invalid. There are two cases where a link is removed but the
	 * file stays around, when it goes to the delete queue and when there
	 * are additional links.
	 */
	error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG);
	if (error != 0)
		return (error);

	/*
	 * NOTE(review): this passes the raw ZPL_MODE enum rather than
	 * sa_table[ZPL_MODE] as done above; it appears to rely on the
	 * attribute registration order matching the enum -- confirm.
	 */
	error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode));
	zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
	if (error != 0)
		return (error);

	*is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode);

	/*
	 * Extended attributes can be applied to files, directories, etc.
	 * Otherwise the parent must be a directory.
	 */
	if (!*is_xattrdir && !S_ISDIR(parent_mode))
		return (SET_ERROR(EINVAL));

	*pobjp = parent;

	return (0);
}
-
-/*
- * Given an object number, return some zpl level statistics
- */
/*
 * Fill in mode, generation, link count, and ctime for the object behind
 * `hdl` via a single SA bulk lookup.
 */
static int
zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table,
    zfs_stat_t *sb)
{
	sa_bulk_attr_t bulk[4];
	int count = 0;

	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
	    &sb->zs_mode, sizeof (sb->zs_mode));
	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL,
	    &sb->zs_gen, sizeof (sb->zs_gen));
	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL,
	    &sb->zs_links, sizeof (sb->zs_links));
	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL,
	    &sb->zs_ctime, sizeof (sb->zs_ctime));

	return (sa_bulk_lookup(hdl, bulk, count));
}
-
-static int
-zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
- sa_attr_type_t *sa_table, char *buf, int len)
-{
- sa_handle_t *sa_hdl;
- sa_handle_t *prevhdl = NULL;
- dmu_buf_t *prevdb = NULL;
- dmu_buf_t *sa_db = NULL;
- char *path = buf + len - 1;
- int error;
-
- *path = '\0';
- sa_hdl = hdl;
-
- uint64_t deleteq_obj;
- VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ,
- ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj));
- error = zap_lookup_int(osp, deleteq_obj, obj);
- if (error == 0) {
- return (ESTALE);
- } else if (error != ENOENT) {
- return (error);
- }
- error = 0;
-
- for (;;) {
- uint64_t pobj = 0;
- char component[MAXNAMELEN + 2];
- size_t complen;
- int is_xattrdir = 0;
-
- if (prevdb)
- zfs_release_sa_handle(prevhdl, prevdb, FTAG);
-
- if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj,
- &is_xattrdir)) != 0)
- break;
-
- if (pobj == obj) {
- if (path[0] != '/')
- *--path = '/';
- break;
- }
-
- component[0] = '/';
- if (is_xattrdir) {
- (void) sprintf(component + 1, "<xattrdir>");
- } else {
- error = zap_value_search(osp, pobj, obj,
- ZFS_DIRENT_OBJ(-1ULL), component + 1);
- if (error != 0)
- break;
- }
-
- complen = strlen(component);
- path -= complen;
- ASSERT(path >= buf);
- bcopy(component, path, complen);
- obj = pobj;
-
- if (sa_hdl != hdl) {
- prevhdl = sa_hdl;
- prevdb = sa_db;
- }
- error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG);
- if (error != 0) {
- sa_hdl = prevhdl;
- sa_db = prevdb;
- break;
- }
- }
-
- if (sa_hdl != NULL && sa_hdl != hdl) {
- ASSERT(sa_db != NULL);
- zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
- }
-
- if (error == 0)
- (void) memmove(buf, path, buf + len - path);
-
- return (error);
-}
-
-int
-zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
-{
- sa_attr_type_t *sa_table;
- sa_handle_t *hdl;
- dmu_buf_t *db;
- int error;
-
- error = zfs_sa_setup(osp, &sa_table);
- if (error != 0)
- return (error);
-
- error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
- if (error != 0)
- return (error);
-
- error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
-
- zfs_release_sa_handle(hdl, db, FTAG);
- return (error);
-}
-
-int
-zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
- char *buf, int len)
-{
- char *path = buf + len - 1;
- sa_attr_type_t *sa_table;
- sa_handle_t *hdl;
- dmu_buf_t *db;
- int error;
-
- *path = '\0';
-
- error = zfs_sa_setup(osp, &sa_table);
- if (error != 0)
- return (error);
-
- error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
- if (error != 0)
- return (error);
-
- error = zfs_obj_to_stats_impl(hdl, sa_table, sb);
- if (error != 0) {
- zfs_release_sa_handle(hdl, db, FTAG);
- return (error);
- }
-
- error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
-
- zfs_release_sa_handle(hdl, db, FTAG);
- return (error);
-}
-
#if defined(_KERNEL)
/* Kernel-only: exported interfaces and module tunables. */
EXPORT_SYMBOL(zfs_create_fs);
EXPORT_SYMBOL(zfs_obj_to_path);

/* CSTYLED */
module_param(zfs_object_mutex_size, uint, 0644);
MODULE_PARM_DESC(zfs_object_mutex_size, "Size of znode hold array");
module_param(zfs_unlink_suspend_progress, int, 0644);
MODULE_PARM_DESC(zfs_unlink_suspend_progress, "Set to prevent async unlinks "
"(debug - leaks space into the unlinked set)");
#endif
diff --git a/module/zfs/zio_crypt.c b/module/zfs/zio_crypt.c
deleted file mode 100644
index 7cf20f413..000000000
--- a/module/zfs/zio_crypt.c
+++ /dev/null
@@ -1,2036 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright (c) 2017, Datto, Inc. All rights reserved.
- */
-
-#include <sys/zio_crypt.h>
-#include <sys/dmu.h>
-#include <sys/dmu_objset.h>
-#include <sys/dnode.h>
-#include <sys/fs/zfs.h>
-#include <sys/zio.h>
-#include <sys/zil.h>
-#include <sys/sha2.h>
-#include <sys/hkdf.h>
-#include "qat.h"
-
-/*
- * This file is responsible for handling all of the details of generating
- * encryption parameters and performing encryption and authentication.
- *
- * BLOCK ENCRYPTION PARAMETERS:
- * Encryption /Authentication Algorithm Suite (crypt):
- * The encryption algorithm, mode, and key length we are going to use. We
- * currently support AES in either GCM or CCM modes with 128, 192, and 256 bit
- * keys. All authentication is currently done with SHA512-HMAC.
- *
- * Plaintext:
- * The unencrypted data that we want to encrypt.
- *
- * Initialization Vector (IV):
- * An initialization vector for the encryption algorithms. This is used to
- * "tweak" the encryption algorithms so that two blocks of the same data are
- * encrypted into different ciphertext outputs, thus obfuscating block patterns.
- * The supported encryption modes (AES-GCM and AES-CCM) require that an IV is
- * never reused with the same encryption key. This value is stored unencrypted
- * and must simply be provided to the decryption function. We use a 96 bit IV
- * (as recommended by NIST) for all block encryption. For non-dedup blocks we
- * derive the IV randomly. The first 64 bits of the IV are stored in the second
- * word of DVA[2] and the remaining 32 bits are stored in the upper 32 bits of
- * blk_fill. This is safe because encrypted blocks can't use the upper 32 bits
- * of blk_fill. We only encrypt level 0 blocks, which normally have a fill count
- * of 1. The only exception is for DMU_OT_DNODE objects, where the fill count of
- * level 0 blocks is the number of allocated dnodes in that block. The on-disk
- * format supports at most 2^15 slots per L0 dnode block, because the maximum
- * block size is 16MB (2^24). In either case, for level 0 blocks this number
- * will still be smaller than UINT32_MAX so it is safe to store the IV in the
- * top 32 bits of blk_fill, while leaving the bottom 32 bits of the fill count
- * for the dnode code.
- *
- * Master key:
- * This is the most important secret data of an encrypted dataset. It is used
- * along with the salt to generate the actual encryption keys via HKDF. We
- * do not use the master key to directly encrypt any data because there are
- * theoretical limits on how much data can actually be safely encrypted with
- * any encryption mode. The master key is stored encrypted on disk with the
- * user's wrapping key. Its length is determined by the encryption algorithm.
- * For details on how this is stored see the block comment in dsl_crypt.c
- *
- * Salt:
- * Used as an input to the HKDF function, along with the master key. We use a
- * 64 bit salt, stored unencrypted in the first word of DVA[2]. Any given salt
- * can be used for encrypting many blocks, so we cache the current salt and the
- * associated derived key in zio_crypt_t so we do not need to derive it again
- * needlessly.
- *
- * Encryption Key:
- * A secret binary key, generated from an HKDF function used to encrypt and
- * decrypt data.
- *
- * Message Authentication Code (MAC)
- * The MAC is an output of authenticated encryption modes such as AES-GCM and
- * AES-CCM. Its purpose is to ensure that an attacker cannot modify encrypted
- * data on disk and return garbage to the application. Effectively, it is a
- * checksum that can not be reproduced by an attacker. We store the MAC in the
- * second 128 bits of blk_cksum, leaving the first 128 bits for a truncated
- * regular checksum of the ciphertext which can be used for scrubbing.
- *
- * OBJECT AUTHENTICATION:
- * Some object types, such as DMU_OT_MASTER_NODE cannot be encrypted because
- * they contain some info that always needs to be readable. To prevent this
- * data from being altered, we authenticate this data using SHA512-HMAC. This
- * will produce a MAC (similar to the one produced via encryption) which can
- * be used to verify the object was not modified. HMACs do not require key
- * rotation or IVs, so we can keep up to the full 3 copies of authenticated
- * data.
- *
- * ZIL ENCRYPTION:
- * ZIL blocks have their bp written to disk ahead of the associated data, so we
- * cannot store the MAC there as we normally do. For these blocks the MAC is
- * stored in the embedded checksum within the zil_chain_t header. The salt and
- * IV are generated for the block on bp allocation instead of at encryption
- * time. In addition, ZIL blocks have some pieces that must be left in plaintext
- * for claiming even though all of the sensitive user data still needs to be
- * encrypted. The function zio_crypt_init_uios_zil() handles parsing which
- * pieces of the block need to be encrypted. All data that is not encrypted is
- * authenticated using the AAD mechanisms that the supported encryption modes
- * provide for. In order to preserve the semantics of the ZIL for encrypted
- * datasets, the ZIL is not protected at the objset level as described below.
- *
- * DNODE ENCRYPTION:
- * Similarly to ZIL blocks, the core part of each dnode_phys_t needs to be left
- * in plaintext for scrubbing and claiming, but the bonus buffers might contain
- * sensitive user data. The function zio_crypt_init_uios_dnode() handles parsing
- * which pieces of the block need to be encrypted. For more details about
- * dnode authentication and encryption, see zio_crypt_init_uios_dnode().
- *
- * OBJECT SET AUTHENTICATION:
- * Up to this point, everything we have encrypted and authenticated has been
- * at level 0 (or -2 for the ZIL). If we did not do any further work the
- * on-disk format would be susceptible to attacks that deleted or rearranged
- * the order of level 0 blocks. Ideally, the cleanest solution would be to
- * maintain a tree of authentication MACs going up the bp tree. However, this
- * presents a problem for raw sends. Send files do not send information about
- * indirect blocks so there would be no convenient way to transfer the MACs and
- * they cannot be recalculated on the receive side without the master key which
- * would defeat one of the purposes of raw sends in the first place. Instead,
- * for the indirect levels of the bp tree, we use a regular SHA512 of the MACs
- * from the level below. We also include some portable fields from blk_prop such
- * as the lsize and compression algorithm to prevent the data from being
- * misinterpreted.
- *
- * At the objset level, we maintain 2 separate 256 bit MACs in the
- * objset_phys_t. The first one is "portable" and is the logical root of the
- * MAC tree maintained in the metadnode's bps. The second, is "local" and is
- * used as the root MAC for the user accounting objects, which are also not
- * transferred via "zfs send". The portable MAC is sent in the DRR_BEGIN payload
- * of the send file. The useraccounting code ensures that the useraccounting
- * info is not present upon a receive, so the local MAC can simply be cleared
- * out at that time. For more info about objset_phys_t authentication, see
- * zio_crypt_do_objset_hmacs().
- *
- * CONSIDERATIONS FOR DEDUP:
- * In order for dedup to work, blocks that we want to dedup with one another
- * need to use the same IV and encryption key, so that they will have the same
- * ciphertext. Normally, one should never reuse an IV with the same encryption
- * key or else AES-GCM and AES-CCM can both actually leak the plaintext of both
- * blocks. In this case, however, since we are using the same plaintext as
- * well all that we end up with is a duplicate of the original ciphertext we
- * already had. As a result, an attacker with read access to the raw disk will
- * be able to tell which blocks are the same but this information is given away
- * by dedup anyway. In order to get the same IVs and encryption keys for
- * equivalent blocks of data we use an HMAC of the plaintext. We use an HMAC
- * here so that a reproducible checksum of the plaintext is never available to
- * the attacker. The HMAC key is kept alongside the master key, encrypted on
- * disk. The first 64 bits of the HMAC are used in place of the random salt, and
- * the next 96 bits are used as the IV. As a result of this mechanism, dedup
- * will only work within a clone family since encrypted dedup requires use of
- * the same master and HMAC keys.
- */
-
-/*
- * After encrypting many blocks with the same key we may start to run up
- * against the theoretical limits of how much data can securely be encrypted
- * with a single key using the supported encryption modes. The most obvious
- * limitation is that our risk of generating 2 equivalent 96 bit IVs increases
- * the more IVs we generate (which both GCM and CCM modes strictly forbid).
- * This risk actually grows surprisingly quickly over time according to the
- * Birthday Problem. With a total IV space of 2^(96 bits), and assuming we have
- * generated n IVs with a cryptographically secure RNG, the approximate
- * probability p(n) of a collision is given as:
- *
- * p(n) ~= e^(-n*(n-1)/(2*(2^96)))
- *
- * [http://www.math.cornell.edu/~mec/2008-2009/TianyiZheng/Birthday.html]
- *
- * Assuming that we want to ensure that p(n) never goes over 1 / 1 trillion
- * we must not write more than 398,065,730 blocks with the same encryption key.
- * Therefore, we rotate our keys after 400,000,000 blocks have been written by
- * generating a new random 64 bit salt for our HKDF encryption key generation
- * function.
- */
#define	ZFS_KEY_MAX_SALT_USES_DEFAULT	400000000
#define	ZFS_CURRENT_MAX_SALT_USES \
	(MIN(zfs_key_max_salt_uses, ZFS_KEY_MAX_SALT_USES_DEFAULT))
/* Tunable: blocks written per salt before rotation; clamped to the default. */
unsigned long zfs_key_max_salt_uses = ZFS_KEY_MAX_SALT_USES_DEFAULT;
-
typedef struct blkptr_auth_buf {
	uint64_t bab_prop;			/* blk_prop - portable mask */
	uint8_t bab_mac[ZIO_DATA_MAC_LEN];	/* MAC from blk_cksum */
	uint64_t bab_pad;			/* reserved for future use */
} blkptr_auth_buf_t;

/* Supported crypt suites, indexed by the ZIO_CRYPT_* property value. */
zio_crypt_info_t zio_crypt_table[ZIO_CRYPT_FUNCTIONS] = {
	{"", ZC_TYPE_NONE, 0, "inherit"},
	{"", ZC_TYPE_NONE, 0, "on"},
	{"", ZC_TYPE_NONE, 0, "off"},
	{SUN_CKM_AES_CCM, ZC_TYPE_CCM, 16, "aes-128-ccm"},
	{SUN_CKM_AES_CCM, ZC_TYPE_CCM, 24, "aes-192-ccm"},
	{SUN_CKM_AES_CCM, ZC_TYPE_CCM, 32, "aes-256-ccm"},
	{SUN_CKM_AES_GCM, ZC_TYPE_GCM, 16, "aes-128-gcm"},
	{SUN_CKM_AES_GCM, ZC_TYPE_GCM, 24, "aes-192-gcm"},
	{SUN_CKM_AES_GCM, ZC_TYPE_GCM, 32, "aes-256-gcm"}
};
-
/*
 * Tear down a zio_crypt_key_t: destroy its lock and ICP templates, then
 * zero the entire structure so no key material lingers in memory.
 */
void
zio_crypt_key_destroy(zio_crypt_key_t *key)
{
	rw_destroy(&key->zk_salt_lock);

	/* free crypto templates */
	crypto_destroy_ctx_template(key->zk_current_tmpl);
	crypto_destroy_ctx_template(key->zk_hmac_tmpl);

	/* zero out sensitive data */
	bzero(key, sizeof (zio_crypt_key_t));
}
-
-int
-zio_crypt_key_init(uint64_t crypt, zio_crypt_key_t *key)
-{
- int ret;
- crypto_mechanism_t mech;
- uint_t keydata_len;
-
- ASSERT(key != NULL);
- ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
-
- keydata_len = zio_crypt_table[crypt].ci_keylen;
- bzero(key, sizeof (zio_crypt_key_t));
-
- /* fill keydata buffers and salt with random data */
- ret = random_get_bytes((uint8_t *)&key->zk_guid, sizeof (uint64_t));
- if (ret != 0)
- goto error;
-
- ret = random_get_bytes(key->zk_master_keydata, keydata_len);
- if (ret != 0)
- goto error;
-
- ret = random_get_bytes(key->zk_hmac_keydata, SHA512_HMAC_KEYLEN);
- if (ret != 0)
- goto error;
-
- ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN);
- if (ret != 0)
- goto error;
-
- /* derive the current key from the master key */
- ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
- key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata,
- keydata_len);
- if (ret != 0)
- goto error;
-
- /* initialize keys for the ICP */
- key->zk_current_key.ck_format = CRYPTO_KEY_RAW;
- key->zk_current_key.ck_data = key->zk_current_keydata;
- key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len);
-
- key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW;
- key->zk_hmac_key.ck_data = &key->zk_hmac_key;
- key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN);
-
- /*
- * Initialize the crypto templates. It's ok if this fails because
- * this is just an optimization.
- */
- mech.cm_type = crypto_mech2id(zio_crypt_table[crypt].ci_mechname);
- ret = crypto_create_ctx_template(&mech, &key->zk_current_key,
- &key->zk_current_tmpl, KM_SLEEP);
- if (ret != CRYPTO_SUCCESS)
- key->zk_current_tmpl = NULL;
-
- mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
- ret = crypto_create_ctx_template(&mech, &key->zk_hmac_key,
- &key->zk_hmac_tmpl, KM_SLEEP);
- if (ret != CRYPTO_SUCCESS)
- key->zk_hmac_tmpl = NULL;
-
- key->zk_crypt = crypt;
- key->zk_version = ZIO_CRYPT_KEY_CURRENT_VERSION;
- key->zk_salt_count = 0;
- rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL);
-
- return (0);
-
-error:
- zio_crypt_key_destroy(key);
- return (ret);
-}
-
-static int
-zio_crypt_key_change_salt(zio_crypt_key_t *key)
-{
- int ret = 0;
- uint8_t salt[ZIO_DATA_SALT_LEN];
- crypto_mechanism_t mech;
- uint_t keydata_len = zio_crypt_table[key->zk_crypt].ci_keylen;
-
- /* generate a new salt */
- ret = random_get_bytes(salt, ZIO_DATA_SALT_LEN);
- if (ret != 0)
- goto error;
-
- rw_enter(&key->zk_salt_lock, RW_WRITER);
-
- /* someone beat us to the salt rotation, just unlock and return */
- if (key->zk_salt_count < ZFS_CURRENT_MAX_SALT_USES)
- goto out_unlock;
-
- /* derive the current key from the master key and the new salt */
- ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
- salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len);
- if (ret != 0)
- goto out_unlock;
-
- /* assign the salt and reset the usage count */
- bcopy(salt, key->zk_salt, ZIO_DATA_SALT_LEN);
- key->zk_salt_count = 0;
-
- /* destroy the old context template and create the new one */
- crypto_destroy_ctx_template(key->zk_current_tmpl);
- ret = crypto_create_ctx_template(&mech, &key->zk_current_key,
- &key->zk_current_tmpl, KM_SLEEP);
- if (ret != CRYPTO_SUCCESS)
- key->zk_current_tmpl = NULL;
-
- rw_exit(&key->zk_salt_lock);
-
- return (0);
-
-out_unlock:
- rw_exit(&key->zk_salt_lock);
-error:
- return (ret);
-}
-
-/* See comment above zfs_key_max_salt_uses definition for details */
-int
-zio_crypt_key_get_salt(zio_crypt_key_t *key, uint8_t *salt)
-{
- int ret;
- boolean_t salt_change;
-
- rw_enter(&key->zk_salt_lock, RW_READER);
-
- bcopy(key->zk_salt, salt, ZIO_DATA_SALT_LEN);
- salt_change = (atomic_inc_64_nv(&key->zk_salt_count) >=
- ZFS_CURRENT_MAX_SALT_USES);
-
- rw_exit(&key->zk_salt_lock);
-
- if (salt_change) {
- ret = zio_crypt_key_change_salt(key);
- if (ret != 0)
- goto error;
- }
-
- return (0);
-
-error:
- return (ret);
-}
-
-/*
- * This function handles all encryption and decryption in zfs. When
- * encrypting it expects puio to reference the plaintext and cuio to
- * reference the ciphertext. cuio must have enough space for the
- * ciphertext + room for a MAC. datalen should be the length of the
- * plaintext / ciphertext alone.
- */
-static int
-zio_do_crypt_uio(boolean_t encrypt, uint64_t crypt, crypto_key_t *key,
- crypto_ctx_template_t tmpl, uint8_t *ivbuf, uint_t datalen,
- uio_t *puio, uio_t *cuio, uint8_t *authbuf, uint_t auth_len)
-{
- int ret;
- crypto_data_t plaindata, cipherdata;
- CK_AES_CCM_PARAMS ccmp;
- CK_AES_GCM_PARAMS gcmp;
- crypto_mechanism_t mech;
- zio_crypt_info_t crypt_info;
- uint_t plain_full_len, maclen;
-
- ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
- ASSERT3U(key->ck_format, ==, CRYPTO_KEY_RAW);
-
- /* lookup the encryption info */
- crypt_info = zio_crypt_table[crypt];
-
- /* the mac will always be the last iovec_t in the cipher uio */
- maclen = cuio->uio_iov[cuio->uio_iovcnt - 1].iov_len;
-
- ASSERT(maclen <= ZIO_DATA_MAC_LEN);
-
- /* setup encryption mechanism (same as crypt) */
- mech.cm_type = crypto_mech2id(crypt_info.ci_mechname);
-
- /*
- * Strangely, the ICP requires that plain_full_len must include
- * the MAC length when decrypting, even though the UIO does not
- * need to have the extra space allocated.
- */
- if (encrypt) {
- plain_full_len = datalen;
- } else {
- plain_full_len = datalen + maclen;
- }
-
- /*
- * setup encryption params (currently only AES CCM and AES GCM
- * are supported)
- */
- if (crypt_info.ci_crypt_type == ZC_TYPE_CCM) {
- ccmp.ulNonceSize = ZIO_DATA_IV_LEN;
- ccmp.ulAuthDataSize = auth_len;
- ccmp.authData = authbuf;
- ccmp.ulMACSize = maclen;
- ccmp.nonce = ivbuf;
- ccmp.ulDataSize = plain_full_len;
-
- mech.cm_param = (char *)(&ccmp);
- mech.cm_param_len = sizeof (CK_AES_CCM_PARAMS);
- } else {
- gcmp.ulIvLen = ZIO_DATA_IV_LEN;
- gcmp.ulIvBits = CRYPTO_BYTES2BITS(ZIO_DATA_IV_LEN);
- gcmp.ulAADLen = auth_len;
- gcmp.pAAD = authbuf;
- gcmp.ulTagBits = CRYPTO_BYTES2BITS(maclen);
- gcmp.pIv = ivbuf;
-
- mech.cm_param = (char *)(&gcmp);
- mech.cm_param_len = sizeof (CK_AES_GCM_PARAMS);
- }
-
- /* populate the cipher and plain data structs. */
- plaindata.cd_format = CRYPTO_DATA_UIO;
- plaindata.cd_offset = 0;
- plaindata.cd_uio = puio;
- plaindata.cd_miscdata = NULL;
- plaindata.cd_length = plain_full_len;
-
- cipherdata.cd_format = CRYPTO_DATA_UIO;
- cipherdata.cd_offset = 0;
- cipherdata.cd_uio = cuio;
- cipherdata.cd_miscdata = NULL;
- cipherdata.cd_length = datalen + maclen;
-
- /* perform the actual encryption */
- if (encrypt) {
- ret = crypto_encrypt(&mech, &plaindata, key, tmpl, &cipherdata,
- NULL);
- if (ret != CRYPTO_SUCCESS) {
- ret = SET_ERROR(EIO);
- goto error;
- }
- } else {
- ret = crypto_decrypt(&mech, &cipherdata, key, tmpl, &plaindata,
- NULL);
- if (ret != CRYPTO_SUCCESS) {
- ASSERT3U(ret, ==, CRYPTO_INVALID_MAC);
- ret = SET_ERROR(ECKSUM);
- goto error;
- }
- }
-
- return (0);
-
-error:
- return (ret);
-}
-
-int
-zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv,
- uint8_t *mac, uint8_t *keydata_out, uint8_t *hmac_keydata_out)
-{
- int ret;
- uio_t puio, cuio;
- uint64_t aad[3];
- iovec_t plain_iovecs[2], cipher_iovecs[3];
- uint64_t crypt = key->zk_crypt;
- uint_t enc_len, keydata_len, aad_len;
-
- ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
- ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW);
-
- keydata_len = zio_crypt_table[crypt].ci_keylen;
-
- /* generate iv for wrapping the master and hmac key */
- ret = random_get_pseudo_bytes(iv, WRAPPING_IV_LEN);
- if (ret != 0)
- goto error;
-
- /* initialize uio_ts */
- plain_iovecs[0].iov_base = key->zk_master_keydata;
- plain_iovecs[0].iov_len = keydata_len;
- plain_iovecs[1].iov_base = key->zk_hmac_keydata;
- plain_iovecs[1].iov_len = SHA512_HMAC_KEYLEN;
-
- cipher_iovecs[0].iov_base = keydata_out;
- cipher_iovecs[0].iov_len = keydata_len;
- cipher_iovecs[1].iov_base = hmac_keydata_out;
- cipher_iovecs[1].iov_len = SHA512_HMAC_KEYLEN;
- cipher_iovecs[2].iov_base = mac;
- cipher_iovecs[2].iov_len = WRAPPING_MAC_LEN;
-
- /*
- * Although we don't support writing to the old format, we do
- * support rewrapping the key so that the user can move and
- * quarantine datasets on the old format.
- */
- if (key->zk_version == 0) {
- aad_len = sizeof (uint64_t);
- aad[0] = LE_64(key->zk_guid);
- } else {
- ASSERT3U(key->zk_version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION);
- aad_len = sizeof (uint64_t) * 3;
- aad[0] = LE_64(key->zk_guid);
- aad[1] = LE_64(crypt);
- aad[2] = LE_64(key->zk_version);
- }
-
- enc_len = zio_crypt_table[crypt].ci_keylen + SHA512_HMAC_KEYLEN;
- puio.uio_iov = plain_iovecs;
- puio.uio_iovcnt = 2;
- puio.uio_segflg = UIO_SYSSPACE;
- cuio.uio_iov = cipher_iovecs;
- cuio.uio_iovcnt = 3;
- cuio.uio_segflg = UIO_SYSSPACE;
-
- /* encrypt the keys and store the resulting ciphertext and mac */
- ret = zio_do_crypt_uio(B_TRUE, crypt, cwkey, NULL, iv, enc_len,
- &puio, &cuio, (uint8_t *)aad, aad_len);
- if (ret != 0)
- goto error;
-
- return (0);
-
-error:
- return (ret);
-}
-
-int
-zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version,
- uint64_t guid, uint8_t *keydata, uint8_t *hmac_keydata, uint8_t *iv,
- uint8_t *mac, zio_crypt_key_t *key)
-{
- int ret;
- crypto_mechanism_t mech;
- uio_t puio, cuio;
- uint64_t aad[3];
- iovec_t plain_iovecs[2], cipher_iovecs[3];
- uint_t enc_len, keydata_len, aad_len;
-
- ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
- ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW);
-
- rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL);
-
- keydata_len = zio_crypt_table[crypt].ci_keylen;
-
- /* initialize uio_ts */
- plain_iovecs[0].iov_base = key->zk_master_keydata;
- plain_iovecs[0].iov_len = keydata_len;
- plain_iovecs[1].iov_base = key->zk_hmac_keydata;
- plain_iovecs[1].iov_len = SHA512_HMAC_KEYLEN;
-
- cipher_iovecs[0].iov_base = keydata;
- cipher_iovecs[0].iov_len = keydata_len;
- cipher_iovecs[1].iov_base = hmac_keydata;
- cipher_iovecs[1].iov_len = SHA512_HMAC_KEYLEN;
- cipher_iovecs[2].iov_base = mac;
- cipher_iovecs[2].iov_len = WRAPPING_MAC_LEN;
-
- if (version == 0) {
- aad_len = sizeof (uint64_t);
- aad[0] = LE_64(guid);
- } else {
- ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION);
- aad_len = sizeof (uint64_t) * 3;
- aad[0] = LE_64(guid);
- aad[1] = LE_64(crypt);
- aad[2] = LE_64(version);
- }
-
- enc_len = keydata_len + SHA512_HMAC_KEYLEN;
- puio.uio_iov = plain_iovecs;
- puio.uio_segflg = UIO_SYSSPACE;
- puio.uio_iovcnt = 2;
- cuio.uio_iov = cipher_iovecs;
- cuio.uio_iovcnt = 3;
- cuio.uio_segflg = UIO_SYSSPACE;
-
- /* decrypt the keys and store the result in the output buffers */
- ret = zio_do_crypt_uio(B_FALSE, crypt, cwkey, NULL, iv, enc_len,
- &puio, &cuio, (uint8_t *)aad, aad_len);
- if (ret != 0)
- goto error;
-
- /* generate a fresh salt */
- ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN);
- if (ret != 0)
- goto error;
-
- /* derive the current key from the master key */
- ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
- key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata,
- keydata_len);
- if (ret != 0)
- goto error;
-
- /* initialize keys for ICP */
- key->zk_current_key.ck_format = CRYPTO_KEY_RAW;
- key->zk_current_key.ck_data = key->zk_current_keydata;
- key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len);
-
- key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW;
- key->zk_hmac_key.ck_data = key->zk_hmac_keydata;
- key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN);
-
- /*
- * Initialize the crypto templates. It's ok if this fails because
- * this is just an optimization.
- */
- mech.cm_type = crypto_mech2id(zio_crypt_table[crypt].ci_mechname);
- ret = crypto_create_ctx_template(&mech, &key->zk_current_key,
- &key->zk_current_tmpl, KM_SLEEP);
- if (ret != CRYPTO_SUCCESS)
- key->zk_current_tmpl = NULL;
-
- mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
- ret = crypto_create_ctx_template(&mech, &key->zk_hmac_key,
- &key->zk_hmac_tmpl, KM_SLEEP);
- if (ret != CRYPTO_SUCCESS)
- key->zk_hmac_tmpl = NULL;
-
- key->zk_crypt = crypt;
- key->zk_version = version;
- key->zk_guid = guid;
- key->zk_salt_count = 0;
-
- return (0);
-
-error:
- zio_crypt_key_destroy(key);
- return (ret);
-}
-
-int
-zio_crypt_generate_iv(uint8_t *ivbuf)
-{
- int ret;
-
- /* randomly generate the IV */
- ret = random_get_pseudo_bytes(ivbuf, ZIO_DATA_IV_LEN);
- if (ret != 0)
- goto error;
-
- return (0);
-
-error:
- bzero(ivbuf, ZIO_DATA_IV_LEN);
- return (ret);
-}
-
-int
-zio_crypt_do_hmac(zio_crypt_key_t *key, uint8_t *data, uint_t datalen,
- uint8_t *digestbuf, uint_t digestlen)
-{
- int ret;
- crypto_mechanism_t mech;
- crypto_data_t in_data, digest_data;
- uint8_t raw_digestbuf[SHA512_DIGEST_LENGTH];
-
- ASSERT3U(digestlen, <=, SHA512_DIGEST_LENGTH);
-
- /* initialize sha512-hmac mechanism and crypto data */
- mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
- mech.cm_param = NULL;
- mech.cm_param_len = 0;
-
- /* initialize the crypto data */
- in_data.cd_format = CRYPTO_DATA_RAW;
- in_data.cd_offset = 0;
- in_data.cd_length = datalen;
- in_data.cd_raw.iov_base = (char *)data;
- in_data.cd_raw.iov_len = in_data.cd_length;
-
- digest_data.cd_format = CRYPTO_DATA_RAW;
- digest_data.cd_offset = 0;
- digest_data.cd_length = SHA512_DIGEST_LENGTH;
- digest_data.cd_raw.iov_base = (char *)raw_digestbuf;
- digest_data.cd_raw.iov_len = digest_data.cd_length;
-
- /* generate the hmac */
- ret = crypto_mac(&mech, &in_data, &key->zk_hmac_key, key->zk_hmac_tmpl,
- &digest_data, NULL);
- if (ret != CRYPTO_SUCCESS) {
- ret = SET_ERROR(EIO);
- goto error;
- }
-
- bcopy(raw_digestbuf, digestbuf, digestlen);
-
- return (0);
-
-error:
- bzero(digestbuf, digestlen);
- return (ret);
-}
-
-int
-zio_crypt_generate_iv_salt_dedup(zio_crypt_key_t *key, uint8_t *data,
- uint_t datalen, uint8_t *ivbuf, uint8_t *salt)
-{
- int ret;
- uint8_t digestbuf[SHA512_DIGEST_LENGTH];
-
- ret = zio_crypt_do_hmac(key, data, datalen,
- digestbuf, SHA512_DIGEST_LENGTH);
- if (ret != 0)
- return (ret);
-
- bcopy(digestbuf, salt, ZIO_DATA_SALT_LEN);
- bcopy(digestbuf + ZIO_DATA_SALT_LEN, ivbuf, ZIO_DATA_IV_LEN);
-
- return (0);
-}
-
-/*
- * The following functions are used to encode and decode encryption parameters
- * into blkptr_t and zil_header_t. The ICP wants to use these parameters as
- * byte strings, which normally means that these strings would not need to deal
- * with byteswapping at all. However, both blkptr_t and zil_header_t may be
- * byteswapped by lower layers and so we must "undo" that byteswap here upon
- * decoding and encoding in a non-native byteorder. These functions require
- * that the byteorder bit is correct before being called.
- */
-void
-zio_crypt_encode_params_bp(blkptr_t *bp, uint8_t *salt, uint8_t *iv)
-{
- uint64_t val64;
- uint32_t val32;
-
- ASSERT(BP_IS_ENCRYPTED(bp));
-
- if (!BP_SHOULD_BYTESWAP(bp)) {
- bcopy(salt, &bp->blk_dva[2].dva_word[0], sizeof (uint64_t));
- bcopy(iv, &bp->blk_dva[2].dva_word[1], sizeof (uint64_t));
- bcopy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t));
- BP_SET_IV2(bp, val32);
- } else {
- bcopy(salt, &val64, sizeof (uint64_t));
- bp->blk_dva[2].dva_word[0] = BSWAP_64(val64);
-
- bcopy(iv, &val64, sizeof (uint64_t));
- bp->blk_dva[2].dva_word[1] = BSWAP_64(val64);
-
- bcopy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t));
- BP_SET_IV2(bp, BSWAP_32(val32));
- }
-}
-
-void
-zio_crypt_decode_params_bp(const blkptr_t *bp, uint8_t *salt, uint8_t *iv)
-{
- uint64_t val64;
- uint32_t val32;
-
- ASSERT(BP_IS_PROTECTED(bp));
-
- /* for convenience, so callers don't need to check */
- if (BP_IS_AUTHENTICATED(bp)) {
- bzero(salt, ZIO_DATA_SALT_LEN);
- bzero(iv, ZIO_DATA_IV_LEN);
- return;
- }
-
- if (!BP_SHOULD_BYTESWAP(bp)) {
- bcopy(&bp->blk_dva[2].dva_word[0], salt, sizeof (uint64_t));
- bcopy(&bp->blk_dva[2].dva_word[1], iv, sizeof (uint64_t));
-
- val32 = (uint32_t)BP_GET_IV2(bp);
- bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t));
- } else {
- val64 = BSWAP_64(bp->blk_dva[2].dva_word[0]);
- bcopy(&val64, salt, sizeof (uint64_t));
-
- val64 = BSWAP_64(bp->blk_dva[2].dva_word[1]);
- bcopy(&val64, iv, sizeof (uint64_t));
-
- val32 = BSWAP_32((uint32_t)BP_GET_IV2(bp));
- bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t));
- }
-}
-
-void
-zio_crypt_encode_mac_bp(blkptr_t *bp, uint8_t *mac)
-{
- uint64_t val64;
-
- ASSERT(BP_USES_CRYPT(bp));
- ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_OBJSET);
-
- if (!BP_SHOULD_BYTESWAP(bp)) {
- bcopy(mac, &bp->blk_cksum.zc_word[2], sizeof (uint64_t));
- bcopy(mac + sizeof (uint64_t), &bp->blk_cksum.zc_word[3],
- sizeof (uint64_t));
- } else {
- bcopy(mac, &val64, sizeof (uint64_t));
- bp->blk_cksum.zc_word[2] = BSWAP_64(val64);
-
- bcopy(mac + sizeof (uint64_t), &val64, sizeof (uint64_t));
- bp->blk_cksum.zc_word[3] = BSWAP_64(val64);
- }
-}
-
-void
-zio_crypt_decode_mac_bp(const blkptr_t *bp, uint8_t *mac)
-{
- uint64_t val64;
-
- ASSERT(BP_USES_CRYPT(bp) || BP_IS_HOLE(bp));
-
- /* for convenience, so callers don't need to check */
- if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
- bzero(mac, ZIO_DATA_MAC_LEN);
- return;
- }
-
- if (!BP_SHOULD_BYTESWAP(bp)) {
- bcopy(&bp->blk_cksum.zc_word[2], mac, sizeof (uint64_t));
- bcopy(&bp->blk_cksum.zc_word[3], mac + sizeof (uint64_t),
- sizeof (uint64_t));
- } else {
- val64 = BSWAP_64(bp->blk_cksum.zc_word[2]);
- bcopy(&val64, mac, sizeof (uint64_t));
-
- val64 = BSWAP_64(bp->blk_cksum.zc_word[3]);
- bcopy(&val64, mac + sizeof (uint64_t), sizeof (uint64_t));
- }
-}
-
-void
-zio_crypt_encode_mac_zil(void *data, uint8_t *mac)
-{
- zil_chain_t *zilc = data;
-
- bcopy(mac, &zilc->zc_eck.zec_cksum.zc_word[2], sizeof (uint64_t));
- bcopy(mac + sizeof (uint64_t), &zilc->zc_eck.zec_cksum.zc_word[3],
- sizeof (uint64_t));
-}
-
-void
-zio_crypt_decode_mac_zil(const void *data, uint8_t *mac)
-{
- /*
- * The ZIL MAC is embedded in the block it protects, which will
- * not have been byteswapped by the time this function has been called.
- * As a result, we don't need to worry about byteswapping the MAC.
- */
- const zil_chain_t *zilc = data;
-
- bcopy(&zilc->zc_eck.zec_cksum.zc_word[2], mac, sizeof (uint64_t));
- bcopy(&zilc->zc_eck.zec_cksum.zc_word[3], mac + sizeof (uint64_t),
- sizeof (uint64_t));
-}
-
-/*
- * This routine takes a block of dnodes (src_abd) and copies only the bonus
- * buffers to the same offsets in the dst buffer. datalen should be the size
- * of both the src_abd and the dst buffer (not just the length of the bonus
- * buffers).
- */
-void
-zio_crypt_copy_dnode_bonus(abd_t *src_abd, uint8_t *dst, uint_t datalen)
-{
- uint_t i, max_dnp = datalen >> DNODE_SHIFT;
- uint8_t *src;
- dnode_phys_t *dnp, *sdnp, *ddnp;
-
- src = abd_borrow_buf_copy(src_abd, datalen);
-
- sdnp = (dnode_phys_t *)src;
- ddnp = (dnode_phys_t *)dst;
-
- for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
- dnp = &sdnp[i];
- if (dnp->dn_type != DMU_OT_NONE &&
- DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) &&
- dnp->dn_bonuslen != 0) {
- bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]),
- DN_MAX_BONUS_LEN(dnp));
- }
- }
-
- abd_return_buf(src_abd, src, datalen);
-}
-
-/*
- * This function decides what fields from blk_prop are included in
- * the on-disk various MAC algorithms.
- */
-static void
-zio_crypt_bp_zero_nonportable_blkprop(blkptr_t *bp, uint64_t version)
-{
- /*
- * Version 0 did not properly zero out all non-portable fields
- * as it should have done. We maintain this code so that we can
- * do read-only imports of pools on this version.
- */
- if (version == 0) {
- BP_SET_DEDUP(bp, 0);
- BP_SET_CHECKSUM(bp, 0);
- BP_SET_PSIZE(bp, SPA_MINBLOCKSIZE);
- return;
- }
-
- ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION);
-
- /*
- * The hole_birth feature might set these fields even if this bp
- * is a hole. We zero them out here to guarantee that raw sends
- * will function with or without the feature.
- */
- if (BP_IS_HOLE(bp)) {
- bp->blk_prop = 0ULL;
- return;
- }
-
- /*
- * At L0 we want to verify these fields to ensure that data blocks
- * can not be reinterpreted. For instance, we do not want an attacker
- * to trick us into returning raw lz4 compressed data to the user
- * by modifying the compression bits. At higher levels, we cannot
- * enforce this policy since raw sends do not convey any information
- * about indirect blocks, so these values might be different on the
- * receive side. Fortunately, this does not open any new attack
- * vectors, since any alterations that can be made to a higher level
- * bp must still verify the correct order of the layer below it.
- */
- if (BP_GET_LEVEL(bp) != 0) {
- BP_SET_BYTEORDER(bp, 0);
- BP_SET_COMPRESS(bp, 0);
-
- /*
- * psize cannot be set to zero or it will trigger
- * asserts, but the value doesn't really matter as
- * long as it is constant.
- */
- BP_SET_PSIZE(bp, SPA_MINBLOCKSIZE);
- }
-
- BP_SET_DEDUP(bp, 0);
- BP_SET_CHECKSUM(bp, 0);
-}
-
-static void
-zio_crypt_bp_auth_init(uint64_t version, boolean_t should_bswap, blkptr_t *bp,
- blkptr_auth_buf_t *bab, uint_t *bab_len)
-{
- blkptr_t tmpbp = *bp;
-
- if (should_bswap)
- byteswap_uint64_array(&tmpbp, sizeof (blkptr_t));
-
- ASSERT(BP_USES_CRYPT(&tmpbp) || BP_IS_HOLE(&tmpbp));
- ASSERT0(BP_IS_EMBEDDED(&tmpbp));
-
- zio_crypt_decode_mac_bp(&tmpbp, bab->bab_mac);
-
- /*
- * We always MAC blk_prop in LE to ensure portability. This
- * must be done after decoding the mac, since the endianness
- * will get zero'd out here.
- */
- zio_crypt_bp_zero_nonportable_blkprop(&tmpbp, version);
- bab->bab_prop = LE_64(tmpbp.blk_prop);
- bab->bab_pad = 0ULL;
-
- /* version 0 did not include the padding */
- *bab_len = sizeof (blkptr_auth_buf_t);
- if (version == 0)
- *bab_len -= sizeof (uint64_t);
-}
-
-static int
-zio_crypt_bp_do_hmac_updates(crypto_context_t ctx, uint64_t version,
- boolean_t should_bswap, blkptr_t *bp)
-{
- int ret;
- uint_t bab_len;
- blkptr_auth_buf_t bab;
- crypto_data_t cd;
-
- zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len);
- cd.cd_format = CRYPTO_DATA_RAW;
- cd.cd_offset = 0;
- cd.cd_length = bab_len;
- cd.cd_raw.iov_base = (char *)&bab;
- cd.cd_raw.iov_len = cd.cd_length;
-
- ret = crypto_mac_update(ctx, &cd, NULL);
- if (ret != CRYPTO_SUCCESS) {
- ret = SET_ERROR(EIO);
- goto error;
- }
-
- return (0);
-
-error:
- return (ret);
-}
-
-static void
-zio_crypt_bp_do_indrect_checksum_updates(SHA2_CTX *ctx, uint64_t version,
- boolean_t should_bswap, blkptr_t *bp)
-{
- uint_t bab_len;
- blkptr_auth_buf_t bab;
-
- zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len);
- SHA2Update(ctx, &bab, bab_len);
-}
-
-static void
-zio_crypt_bp_do_aad_updates(uint8_t **aadp, uint_t *aad_len, uint64_t version,
- boolean_t should_bswap, blkptr_t *bp)
-{
- uint_t bab_len;
- blkptr_auth_buf_t bab;
-
- zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len);
- bcopy(&bab, *aadp, bab_len);
- *aadp += bab_len;
- *aad_len += bab_len;
-}
-
-static int
-zio_crypt_do_dnode_hmac_updates(crypto_context_t ctx, uint64_t version,
- boolean_t should_bswap, dnode_phys_t *dnp)
-{
- int ret, i;
- dnode_phys_t *adnp;
- boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER);
- crypto_data_t cd;
- uint8_t tmp_dncore[offsetof(dnode_phys_t, dn_blkptr)];
-
- cd.cd_format = CRYPTO_DATA_RAW;
- cd.cd_offset = 0;
-
- /* authenticate the core dnode (masking out non-portable bits) */
- bcopy(dnp, tmp_dncore, sizeof (tmp_dncore));
- adnp = (dnode_phys_t *)tmp_dncore;
- if (le_bswap) {
- adnp->dn_datablkszsec = BSWAP_16(adnp->dn_datablkszsec);
- adnp->dn_bonuslen = BSWAP_16(adnp->dn_bonuslen);
- adnp->dn_maxblkid = BSWAP_64(adnp->dn_maxblkid);
- adnp->dn_used = BSWAP_64(adnp->dn_used);
- }
- adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK;
- adnp->dn_used = 0;
-
- cd.cd_length = sizeof (tmp_dncore);
- cd.cd_raw.iov_base = (char *)adnp;
- cd.cd_raw.iov_len = cd.cd_length;
-
- ret = crypto_mac_update(ctx, &cd, NULL);
- if (ret != CRYPTO_SUCCESS) {
- ret = SET_ERROR(EIO);
- goto error;
- }
-
- for (i = 0; i < dnp->dn_nblkptr; i++) {
- ret = zio_crypt_bp_do_hmac_updates(ctx, version,
- should_bswap, &dnp->dn_blkptr[i]);
- if (ret != 0)
- goto error;
- }
-
- if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
- ret = zio_crypt_bp_do_hmac_updates(ctx, version,
- should_bswap, DN_SPILL_BLKPTR(dnp));
- if (ret != 0)
- goto error;
- }
-
- return (0);
-
-error:
- return (ret);
-}
-
-/*
- * objset_phys_t blocks introduce a number of exceptions to the normal
- * authentication process. objset_phys_t's contain 2 separate HMACS for
- * protecting the integrity of their data. The portable_mac protects the
- * metadnode. This MAC can be sent with a raw send and protects against
- * reordering of data within the metadnode. The local_mac protects the user
- * accounting objects which are not sent from one system to another.
- *
- * In addition, objset blocks are the only blocks that can be modified and
- * written to disk without the key loaded under certain circumstances. During
- * zil_claim() we need to be able to update the zil_header_t to complete
- * claiming log blocks and during raw receives we need to write out the
- * portable_mac from the send file. Both of these actions are possible
- * because these fields are not protected by either MAC so neither one will
- * need to modify the MACs without the key. However, when the modified blocks
- * are written out they will be byteswapped into the host machine's native
- * endianness which will modify fields protected by the MAC. As a result, MAC
- * calculation for objset blocks works slightly differently from other block
- * types. Where other block types MAC the data in whatever endianness is
- * written to disk, objset blocks always MAC little endian version of their
- * values. In the code, should_bswap is the value from BP_SHOULD_BYTESWAP()
- * and le_bswap indicates whether a byteswap is needed to get this block
- * into little endian format.
- */
-int
-zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen,
- boolean_t should_bswap, uint8_t *portable_mac, uint8_t *local_mac)
-{
- int ret;
- crypto_mechanism_t mech;
- crypto_context_t ctx;
- crypto_data_t cd;
- objset_phys_t *osp = data;
- uint64_t intval;
- boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER);
- uint8_t raw_portable_mac[SHA512_DIGEST_LENGTH];
- uint8_t raw_local_mac[SHA512_DIGEST_LENGTH];
-
- /* initialize HMAC mechanism */
- mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
- mech.cm_param = NULL;
- mech.cm_param_len = 0;
-
- cd.cd_format = CRYPTO_DATA_RAW;
- cd.cd_offset = 0;
-
- /* calculate the portable MAC from the portable fields and metadnode */
- ret = crypto_mac_init(&mech, &key->zk_hmac_key, NULL, &ctx, NULL);
- if (ret != CRYPTO_SUCCESS) {
- ret = SET_ERROR(EIO);
- goto error;
- }
-
- /* add in the os_type */
- intval = (le_bswap) ? osp->os_type : BSWAP_64(osp->os_type);
- cd.cd_length = sizeof (uint64_t);
- cd.cd_raw.iov_base = (char *)&intval;
- cd.cd_raw.iov_len = cd.cd_length;
-
- ret = crypto_mac_update(ctx, &cd, NULL);
- if (ret != CRYPTO_SUCCESS) {
- ret = SET_ERROR(EIO);
- goto error;
- }
-
- /* add in the portable os_flags */
- intval = osp->os_flags;
- if (should_bswap)
- intval = BSWAP_64(intval);
- intval &= OBJSET_CRYPT_PORTABLE_FLAGS_MASK;
- if (!ZFS_HOST_BYTEORDER)
- intval = BSWAP_64(intval);
-
- cd.cd_length = sizeof (uint64_t);
- cd.cd_raw.iov_base = (char *)&intval;
- cd.cd_raw.iov_len = cd.cd_length;
-
- ret = crypto_mac_update(ctx, &cd, NULL);
- if (ret != CRYPTO_SUCCESS) {
- ret = SET_ERROR(EIO);
- goto error;
- }
-
- /* add in fields from the metadnode */
- ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
- should_bswap, &osp->os_meta_dnode);
- if (ret)
- goto error;
-
- /* store the final digest in a temporary buffer and copy what we need */
- cd.cd_length = SHA512_DIGEST_LENGTH;
- cd.cd_raw.iov_base = (char *)raw_portable_mac;
- cd.cd_raw.iov_len = cd.cd_length;
-
- ret = crypto_mac_final(ctx, &cd, NULL);
- if (ret != CRYPTO_SUCCESS) {
- ret = SET_ERROR(EIO);
- goto error;
- }
-
- bcopy(raw_portable_mac, portable_mac, ZIO_OBJSET_MAC_LEN);
-
- /*
- * The local MAC protects the user, group and project accounting.
- * If these objects are not present, the local MAC is zeroed out.
- */
- if ((datalen >= OBJSET_PHYS_SIZE_V3 &&
- osp->os_userused_dnode.dn_type == DMU_OT_NONE &&
- osp->os_groupused_dnode.dn_type == DMU_OT_NONE &&
- osp->os_projectused_dnode.dn_type == DMU_OT_NONE) ||
- (datalen >= OBJSET_PHYS_SIZE_V2 &&
- osp->os_userused_dnode.dn_type == DMU_OT_NONE &&
- osp->os_groupused_dnode.dn_type == DMU_OT_NONE) ||
- (datalen <= OBJSET_PHYS_SIZE_V1)) {
- bzero(local_mac, ZIO_OBJSET_MAC_LEN);
- return (0);
- }
-
- /* calculate the local MAC from the userused and groupused dnodes */
- ret = crypto_mac_init(&mech, &key->zk_hmac_key, NULL, &ctx, NULL);
- if (ret != CRYPTO_SUCCESS) {
- ret = SET_ERROR(EIO);
- goto error;
- }
-
- /* add in the non-portable os_flags */
- intval = osp->os_flags;
- if (should_bswap)
- intval = BSWAP_64(intval);
- intval &= ~OBJSET_CRYPT_PORTABLE_FLAGS_MASK;
- if (!ZFS_HOST_BYTEORDER)
- intval = BSWAP_64(intval);
-
- cd.cd_length = sizeof (uint64_t);
- cd.cd_raw.iov_base = (char *)&intval;
- cd.cd_raw.iov_len = cd.cd_length;
-
- ret = crypto_mac_update(ctx, &cd, NULL);
- if (ret != CRYPTO_SUCCESS) {
- ret = SET_ERROR(EIO);
- goto error;
- }
-
- /* add in fields from the user accounting dnodes */
- if (osp->os_userused_dnode.dn_type != DMU_OT_NONE) {
- ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
- should_bswap, &osp->os_userused_dnode);
- if (ret)
- goto error;
- }
-
- if (osp->os_groupused_dnode.dn_type != DMU_OT_NONE) {
- ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
- should_bswap, &osp->os_groupused_dnode);
- if (ret)
- goto error;
- }
-
- if (osp->os_projectused_dnode.dn_type != DMU_OT_NONE &&
- datalen >= OBJSET_PHYS_SIZE_V3) {
- ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
- should_bswap, &osp->os_projectused_dnode);
- if (ret)
- goto error;
- }
-
- /* store the final digest in a temporary buffer and copy what we need */
- cd.cd_length = SHA512_DIGEST_LENGTH;
- cd.cd_raw.iov_base = (char *)raw_local_mac;
- cd.cd_raw.iov_len = cd.cd_length;
-
- ret = crypto_mac_final(ctx, &cd, NULL);
- if (ret != CRYPTO_SUCCESS) {
- ret = SET_ERROR(EIO);
- goto error;
- }
-
- bcopy(raw_local_mac, local_mac, ZIO_OBJSET_MAC_LEN);
-
- return (0);
-
-error:
- bzero(portable_mac, ZIO_OBJSET_MAC_LEN);
- bzero(local_mac, ZIO_OBJSET_MAC_LEN);
- return (ret);
-}
-
-static void
-zio_crypt_destroy_uio(uio_t *uio)
-{
- if (uio->uio_iov)
- kmem_free(uio->uio_iov, uio->uio_iovcnt * sizeof (iovec_t));
-}
-
-/*
- * This function parses an uncompressed indirect block and returns a checksum
- * of all the portable fields from all of the contained bps. The portable
- * fields are the MAC and all of the fields from blk_prop except for the dedup,
- * checksum, and psize bits. For an explanation of the purpose of this, see
- * the comment block on object set authentication.
- */
-static int
-zio_crypt_do_indirect_mac_checksum_impl(boolean_t generate, void *buf,
- uint_t datalen, uint64_t version, boolean_t byteswap, uint8_t *cksum)
-{
- blkptr_t *bp;
- int i, epb = datalen >> SPA_BLKPTRSHIFT;
- SHA2_CTX ctx;
- uint8_t digestbuf[SHA512_DIGEST_LENGTH];
-
- /* checksum all of the MACs from the layer below */
- SHA2Init(SHA512, &ctx);
- for (i = 0, bp = buf; i < epb; i++, bp++) {
- zio_crypt_bp_do_indrect_checksum_updates(&ctx, version,
- byteswap, bp);
- }
- SHA2Final(digestbuf, &ctx);
-
- if (generate) {
- bcopy(digestbuf, cksum, ZIO_DATA_MAC_LEN);
- return (0);
- }
-
- if (bcmp(digestbuf, cksum, ZIO_DATA_MAC_LEN) != 0)
- return (SET_ERROR(ECKSUM));
-
- return (0);
-}
-
-int
-zio_crypt_do_indirect_mac_checksum(boolean_t generate, void *buf,
- uint_t datalen, boolean_t byteswap, uint8_t *cksum)
-{
- int ret;
-
- /*
- * Unfortunately, callers of this function will not always have
- * easy access to the on-disk format version. This info is
- * normally found in the DSL Crypto Key, but the checksum-of-MACs
- * is expected to be verifiable even when the key isn't loaded.
- * Here, instead of doing a ZAP lookup for the version for each
- * zio, we simply try both existing formats.
- */
- ret = zio_crypt_do_indirect_mac_checksum_impl(generate, buf,
- datalen, ZIO_CRYPT_KEY_CURRENT_VERSION, byteswap, cksum);
- if (ret == ECKSUM) {
- ASSERT(!generate);
- ret = zio_crypt_do_indirect_mac_checksum_impl(generate,
- buf, datalen, 0, byteswap, cksum);
- }
-
- return (ret);
-}
-
-int
-zio_crypt_do_indirect_mac_checksum_abd(boolean_t generate, abd_t *abd,
- uint_t datalen, boolean_t byteswap, uint8_t *cksum)
-{
- int ret;
- void *buf;
-
- buf = abd_borrow_buf_copy(abd, datalen);
- ret = zio_crypt_do_indirect_mac_checksum(generate, buf, datalen,
- byteswap, cksum);
- abd_return_buf(abd, buf, datalen);
-
- return (ret);
-}
-
-/*
- * Special case handling routine for encrypting / decrypting ZIL blocks.
- * We do not check for the older ZIL chain because the encryption feature
- * was not available before the newer ZIL chain was introduced. The goal
- * here is to encrypt everything except the blkptr_t of a lr_write_t and
- * the zil_chain_t header. Everything that is not encrypted is authenticated.
- */
-static int
-zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf,
- uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, uio_t *puio,
- uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len,
- boolean_t *no_crypt)
-{
- int ret;
- uint64_t txtype, lr_len;
- uint_t nr_src, nr_dst, crypt_len;
- uint_t aad_len = 0, nr_iovecs = 0, total_len = 0;
- iovec_t *src_iovecs = NULL, *dst_iovecs = NULL;
- uint8_t *src, *dst, *slrp, *dlrp, *blkend, *aadp;
- zil_chain_t *zilc;
- lr_t *lr;
- uint8_t *aadbuf = zio_buf_alloc(datalen);
-
- /* cipherbuf always needs an extra iovec for the MAC */
- if (encrypt) {
- src = plainbuf;
- dst = cipherbuf;
- nr_src = 0;
- nr_dst = 1;
- } else {
- src = cipherbuf;
- dst = plainbuf;
- nr_src = 1;
- nr_dst = 0;
- }
-
- /* find the start and end record of the log block */
- zilc = (zil_chain_t *)src;
- slrp = src + sizeof (zil_chain_t);
- aadp = aadbuf;
- blkend = src + ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused);
-
- /* calculate the number of encrypted iovecs we will need */
- for (; slrp < blkend; slrp += lr_len) {
- lr = (lr_t *)slrp;
-
- if (!byteswap) {
- txtype = lr->lrc_txtype;
- lr_len = lr->lrc_reclen;
- } else {
- txtype = BSWAP_64(lr->lrc_txtype);
- lr_len = BSWAP_64(lr->lrc_reclen);
- }
-
- nr_iovecs++;
- if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t))
- nr_iovecs++;
- }
-
- nr_src += nr_iovecs;
- nr_dst += nr_iovecs;
-
- /* allocate the iovec arrays */
- if (nr_src != 0) {
- src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP);
- if (src_iovecs == NULL) {
- ret = SET_ERROR(ENOMEM);
- goto error;
- }
- }
-
- if (nr_dst != 0) {
- dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP);
- if (dst_iovecs == NULL) {
- ret = SET_ERROR(ENOMEM);
- goto error;
- }
- }
-
- /*
- * Copy the plain zil header over and authenticate everything except
- * the checksum that will store our MAC. If we are writing the data
- * the embedded checksum will not have been calculated yet, so we don't
- * authenticate that.
- */
- bcopy(src, dst, sizeof (zil_chain_t));
- bcopy(src, aadp, sizeof (zil_chain_t) - sizeof (zio_eck_t));
- aadp += sizeof (zil_chain_t) - sizeof (zio_eck_t);
- aad_len += sizeof (zil_chain_t) - sizeof (zio_eck_t);
-
- /* loop over records again, filling in iovecs */
- nr_iovecs = 0;
- slrp = src + sizeof (zil_chain_t);
- dlrp = dst + sizeof (zil_chain_t);
-
- for (; slrp < blkend; slrp += lr_len, dlrp += lr_len) {
- lr = (lr_t *)slrp;
-
- if (!byteswap) {
- txtype = lr->lrc_txtype;
- lr_len = lr->lrc_reclen;
- } else {
- txtype = BSWAP_64(lr->lrc_txtype);
- lr_len = BSWAP_64(lr->lrc_reclen);
- }
-
- /* copy the common lr_t */
- bcopy(slrp, dlrp, sizeof (lr_t));
- bcopy(slrp, aadp, sizeof (lr_t));
- aadp += sizeof (lr_t);
- aad_len += sizeof (lr_t);
-
- ASSERT3P(src_iovecs, !=, NULL);
- ASSERT3P(dst_iovecs, !=, NULL);
-
- /*
- * If this is a TX_WRITE record we want to encrypt everything
- * except the bp if exists. If the bp does exist we want to
- * authenticate it.
- */
- if (txtype == TX_WRITE) {
- crypt_len = sizeof (lr_write_t) -
- sizeof (lr_t) - sizeof (blkptr_t);
- src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t);
- src_iovecs[nr_iovecs].iov_len = crypt_len;
- dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t);
- dst_iovecs[nr_iovecs].iov_len = crypt_len;
-
- /* copy the bp now since it will not be encrypted */
- bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t),
- dlrp + sizeof (lr_write_t) - sizeof (blkptr_t),
- sizeof (blkptr_t));
- bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t),
- aadp, sizeof (blkptr_t));
- aadp += sizeof (blkptr_t);
- aad_len += sizeof (blkptr_t);
- nr_iovecs++;
- total_len += crypt_len;
-
- if (lr_len != sizeof (lr_write_t)) {
- crypt_len = lr_len - sizeof (lr_write_t);
- src_iovecs[nr_iovecs].iov_base =
- slrp + sizeof (lr_write_t);
- src_iovecs[nr_iovecs].iov_len = crypt_len;
- dst_iovecs[nr_iovecs].iov_base =
- dlrp + sizeof (lr_write_t);
- dst_iovecs[nr_iovecs].iov_len = crypt_len;
- nr_iovecs++;
- total_len += crypt_len;
- }
- } else {
- crypt_len = lr_len - sizeof (lr_t);
- src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t);
- src_iovecs[nr_iovecs].iov_len = crypt_len;
- dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t);
- dst_iovecs[nr_iovecs].iov_len = crypt_len;
- nr_iovecs++;
- total_len += crypt_len;
- }
- }
-
- *no_crypt = (nr_iovecs == 0);
- *enc_len = total_len;
- *authbuf = aadbuf;
- *auth_len = aad_len;
-
- if (encrypt) {
- puio->uio_iov = src_iovecs;
- puio->uio_iovcnt = nr_src;
- cuio->uio_iov = dst_iovecs;
- cuio->uio_iovcnt = nr_dst;
- } else {
- puio->uio_iov = dst_iovecs;
- puio->uio_iovcnt = nr_dst;
- cuio->uio_iov = src_iovecs;
- cuio->uio_iovcnt = nr_src;
- }
-
- return (0);
-
-error:
- zio_buf_free(aadbuf, datalen);
- if (src_iovecs != NULL)
- kmem_free(src_iovecs, nr_src * sizeof (iovec_t));
- if (dst_iovecs != NULL)
- kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t));
-
- *enc_len = 0;
- *authbuf = NULL;
- *auth_len = 0;
- *no_crypt = B_FALSE;
- puio->uio_iov = NULL;
- puio->uio_iovcnt = 0;
- cuio->uio_iov = NULL;
- cuio->uio_iovcnt = 0;
- return (ret);
-}
-
-/*
- * Special case handling routine for encrypting / decrypting dnode blocks.
- */
-static int
-zio_crypt_init_uios_dnode(boolean_t encrypt, uint64_t version,
- uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap,
- uio_t *puio, uio_t *cuio, uint_t *enc_len, uint8_t **authbuf,
- uint_t *auth_len, boolean_t *no_crypt)
-{
- int ret;
- uint_t nr_src, nr_dst, crypt_len;
- uint_t aad_len = 0, nr_iovecs = 0, total_len = 0;
- uint_t i, j, max_dnp = datalen >> DNODE_SHIFT;
- iovec_t *src_iovecs = NULL, *dst_iovecs = NULL;
- uint8_t *src, *dst, *aadp;
- dnode_phys_t *dnp, *adnp, *sdnp, *ddnp;
- uint8_t *aadbuf = zio_buf_alloc(datalen);
-
- if (encrypt) {
- src = plainbuf;
- dst = cipherbuf;
- nr_src = 0;
- nr_dst = 1;
- } else {
- src = cipherbuf;
- dst = plainbuf;
- nr_src = 1;
- nr_dst = 0;
- }
-
- sdnp = (dnode_phys_t *)src;
- ddnp = (dnode_phys_t *)dst;
- aadp = aadbuf;
-
- /*
- * Count the number of iovecs we will need to do the encryption by
- * counting the number of bonus buffers that need to be encrypted.
- */
- for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
- /*
- * This block may still be byteswapped. However, all of the
- * values we use are either uint8_t's (for which byteswapping
- * is a noop) or a * != 0 check, which will work regardless
- * of whether or not we byteswap.
- */
- if (sdnp[i].dn_type != DMU_OT_NONE &&
- DMU_OT_IS_ENCRYPTED(sdnp[i].dn_bonustype) &&
- sdnp[i].dn_bonuslen != 0) {
- nr_iovecs++;
- }
- }
-
- nr_src += nr_iovecs;
- nr_dst += nr_iovecs;
-
- if (nr_src != 0) {
- src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP);
- if (src_iovecs == NULL) {
- ret = SET_ERROR(ENOMEM);
- goto error;
- }
- }
-
- if (nr_dst != 0) {
- dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP);
- if (dst_iovecs == NULL) {
- ret = SET_ERROR(ENOMEM);
- goto error;
- }
- }
-
- nr_iovecs = 0;
-
- /*
- * Iterate through the dnodes again, this time filling in the uios
- * we allocated earlier. We also concatenate any data we want to
- * authenticate onto aadbuf.
- */
- for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
- dnp = &sdnp[i];
-
- /* copy over the core fields and blkptrs (kept as plaintext) */
- bcopy(dnp, &ddnp[i], (uint8_t *)DN_BONUS(dnp) - (uint8_t *)dnp);
-
- if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
- bcopy(DN_SPILL_BLKPTR(dnp), DN_SPILL_BLKPTR(&ddnp[i]),
- sizeof (blkptr_t));
- }
-
- /*
- * Handle authenticated data. We authenticate everything in
- * the dnode that can be brought over when we do a raw send.
- * This includes all of the core fields as well as the MACs
- * stored in the bp checksums and all of the portable bits
- * from blk_prop. We include the dnode padding here in case it
- * ever gets used in the future. Some dn_flags and dn_used are
- * not portable so we mask those out values out of the
- * authenticated data.
- */
- crypt_len = offsetof(dnode_phys_t, dn_blkptr);
- bcopy(dnp, aadp, crypt_len);
- adnp = (dnode_phys_t *)aadp;
- adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK;
- adnp->dn_used = 0;
- aadp += crypt_len;
- aad_len += crypt_len;
-
- for (j = 0; j < dnp->dn_nblkptr; j++) {
- zio_crypt_bp_do_aad_updates(&aadp, &aad_len,
- version, byteswap, &dnp->dn_blkptr[j]);
- }
-
- if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
- zio_crypt_bp_do_aad_updates(&aadp, &aad_len,
- version, byteswap, DN_SPILL_BLKPTR(dnp));
- }
-
- /*
- * If this bonus buffer needs to be encrypted, we prepare an
- * iovec_t. The encryption / decryption functions will fill
- * this in for us with the encrypted or decrypted data.
- * Otherwise we add the bonus buffer to the authenticated
- * data buffer and copy it over to the destination. The
- * encrypted iovec extends to DN_MAX_BONUS_LEN(dnp) so that
- * we can guarantee alignment with the AES block size
- * (128 bits).
- */
- crypt_len = DN_MAX_BONUS_LEN(dnp);
- if (dnp->dn_type != DMU_OT_NONE &&
- DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) &&
- dnp->dn_bonuslen != 0) {
- ASSERT3U(nr_iovecs, <, nr_src);
- ASSERT3U(nr_iovecs, <, nr_dst);
- ASSERT3P(src_iovecs, !=, NULL);
- ASSERT3P(dst_iovecs, !=, NULL);
- src_iovecs[nr_iovecs].iov_base = DN_BONUS(dnp);
- src_iovecs[nr_iovecs].iov_len = crypt_len;
- dst_iovecs[nr_iovecs].iov_base = DN_BONUS(&ddnp[i]);
- dst_iovecs[nr_iovecs].iov_len = crypt_len;
-
- nr_iovecs++;
- total_len += crypt_len;
- } else {
- bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]), crypt_len);
- bcopy(DN_BONUS(dnp), aadp, crypt_len);
- aadp += crypt_len;
- aad_len += crypt_len;
- }
- }
-
- *no_crypt = (nr_iovecs == 0);
- *enc_len = total_len;
- *authbuf = aadbuf;
- *auth_len = aad_len;
-
- if (encrypt) {
- puio->uio_iov = src_iovecs;
- puio->uio_iovcnt = nr_src;
- cuio->uio_iov = dst_iovecs;
- cuio->uio_iovcnt = nr_dst;
- } else {
- puio->uio_iov = dst_iovecs;
- puio->uio_iovcnt = nr_dst;
- cuio->uio_iov = src_iovecs;
- cuio->uio_iovcnt = nr_src;
- }
-
- return (0);
-
-error:
- zio_buf_free(aadbuf, datalen);
- if (src_iovecs != NULL)
- kmem_free(src_iovecs, nr_src * sizeof (iovec_t));
- if (dst_iovecs != NULL)
- kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t));
-
- *enc_len = 0;
- *authbuf = NULL;
- *auth_len = 0;
- *no_crypt = B_FALSE;
- puio->uio_iov = NULL;
- puio->uio_iovcnt = 0;
- cuio->uio_iov = NULL;
- cuio->uio_iovcnt = 0;
- return (ret);
-}
-
-static int
-zio_crypt_init_uios_normal(boolean_t encrypt, uint8_t *plainbuf,
- uint8_t *cipherbuf, uint_t datalen, uio_t *puio, uio_t *cuio,
- uint_t *enc_len)
-{
- int ret;
- uint_t nr_plain = 1, nr_cipher = 2;
- iovec_t *plain_iovecs = NULL, *cipher_iovecs = NULL;
-
- /* allocate the iovecs for the plain and cipher data */
- plain_iovecs = kmem_alloc(nr_plain * sizeof (iovec_t),
- KM_SLEEP);
- if (!plain_iovecs) {
- ret = SET_ERROR(ENOMEM);
- goto error;
- }
-
- cipher_iovecs = kmem_alloc(nr_cipher * sizeof (iovec_t),
- KM_SLEEP);
- if (!cipher_iovecs) {
- ret = SET_ERROR(ENOMEM);
- goto error;
- }
-
- plain_iovecs[0].iov_base = plainbuf;
- plain_iovecs[0].iov_len = datalen;
- cipher_iovecs[0].iov_base = cipherbuf;
- cipher_iovecs[0].iov_len = datalen;
-
- *enc_len = datalen;
- puio->uio_iov = plain_iovecs;
- puio->uio_iovcnt = nr_plain;
- cuio->uio_iov = cipher_iovecs;
- cuio->uio_iovcnt = nr_cipher;
-
- return (0);
-
-error:
- if (plain_iovecs != NULL)
- kmem_free(plain_iovecs, nr_plain * sizeof (iovec_t));
- if (cipher_iovecs != NULL)
- kmem_free(cipher_iovecs, nr_cipher * sizeof (iovec_t));
-
- *enc_len = 0;
- puio->uio_iov = NULL;
- puio->uio_iovcnt = 0;
- cuio->uio_iov = NULL;
- cuio->uio_iovcnt = 0;
- return (ret);
-}
-
-/*
- * This function builds up the plaintext (puio) and ciphertext (cuio) uios so
- * that they can be used for encryption and decryption by zio_do_crypt_uio().
- * Most blocks will use zio_crypt_init_uios_normal(), with ZIL and dnode blocks
- * requiring special handling to parse out pieces that are to be encrypted. The
- * authbuf is used by these special cases to store additional authenticated
- * data (AAD) for the encryption modes.
- */
-static int
-zio_crypt_init_uios(boolean_t encrypt, uint64_t version, dmu_object_type_t ot,
- uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap,
- uint8_t *mac, uio_t *puio, uio_t *cuio, uint_t *enc_len, uint8_t **authbuf,
- uint_t *auth_len, boolean_t *no_crypt)
-{
- int ret;
- iovec_t *mac_iov;
-
- ASSERT(DMU_OT_IS_ENCRYPTED(ot) || ot == DMU_OT_NONE);
-
- /* route to handler */
- switch (ot) {
- case DMU_OT_INTENT_LOG:
- ret = zio_crypt_init_uios_zil(encrypt, plainbuf, cipherbuf,
- datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len,
- no_crypt);
- break;
- case DMU_OT_DNODE:
- ret = zio_crypt_init_uios_dnode(encrypt, version, plainbuf,
- cipherbuf, datalen, byteswap, puio, cuio, enc_len, authbuf,
- auth_len, no_crypt);
- break;
- default:
- ret = zio_crypt_init_uios_normal(encrypt, plainbuf, cipherbuf,
- datalen, puio, cuio, enc_len);
- *authbuf = NULL;
- *auth_len = 0;
- *no_crypt = B_FALSE;
- break;
- }
-
- if (ret != 0)
- goto error;
-
- /* populate the uios */
- puio->uio_segflg = UIO_SYSSPACE;
- cuio->uio_segflg = UIO_SYSSPACE;
-
- mac_iov = ((iovec_t *)&cuio->uio_iov[cuio->uio_iovcnt - 1]);
- mac_iov->iov_base = mac;
- mac_iov->iov_len = ZIO_DATA_MAC_LEN;
-
- return (0);
-
-error:
- return (ret);
-}
-
-/*
- * Primary encryption / decryption entrypoint for zio data.
- */
-int
-zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key,
- dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv,
- uint8_t *mac, uint_t datalen, uint8_t *plainbuf, uint8_t *cipherbuf,
- boolean_t *no_crypt)
-{
- int ret;
- boolean_t locked = B_FALSE;
- uint64_t crypt = key->zk_crypt;
- uint_t keydata_len = zio_crypt_table[crypt].ci_keylen;
- uint_t enc_len, auth_len;
- uio_t puio, cuio;
- uint8_t enc_keydata[MASTER_KEY_MAX_LEN];
- crypto_key_t tmp_ckey, *ckey = NULL;
- crypto_ctx_template_t tmpl;
- uint8_t *authbuf = NULL;
-
- /*
- * If the needed key is the current one, just use it. Otherwise we
- * need to generate a temporary one from the given salt + master key.
- * If we are encrypting, we must return a copy of the current salt
- * so that it can be stored in the blkptr_t.
- */
- rw_enter(&key->zk_salt_lock, RW_READER);
- locked = B_TRUE;
-
- if (bcmp(salt, key->zk_salt, ZIO_DATA_SALT_LEN) == 0) {
- ckey = &key->zk_current_key;
- tmpl = key->zk_current_tmpl;
- } else {
- rw_exit(&key->zk_salt_lock);
- locked = B_FALSE;
-
- ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
- salt, ZIO_DATA_SALT_LEN, enc_keydata, keydata_len);
- if (ret != 0)
- goto error;
-
- tmp_ckey.ck_format = CRYPTO_KEY_RAW;
- tmp_ckey.ck_data = enc_keydata;
- tmp_ckey.ck_length = CRYPTO_BYTES2BITS(keydata_len);
-
- ckey = &tmp_ckey;
- tmpl = NULL;
- }
-
- /*
- * Attempt to use QAT acceleration if we can. We currently don't
- * do this for metadnode and ZIL blocks, since they have a much
- * more involved buffer layout and the qat_crypt() function only
- * works in-place.
- */
- if (qat_crypt_use_accel(datalen) &&
- ot != DMU_OT_INTENT_LOG && ot != DMU_OT_DNODE) {
- uint8_t *srcbuf, *dstbuf;
-
- if (encrypt) {
- srcbuf = plainbuf;
- dstbuf = cipherbuf;
- } else {
- srcbuf = cipherbuf;
- dstbuf = plainbuf;
- }
-
- ret = qat_crypt((encrypt) ? QAT_ENCRYPT : QAT_DECRYPT, srcbuf,
- dstbuf, NULL, 0, iv, mac, ckey, key->zk_crypt, datalen);
- if (ret == CPA_STATUS_SUCCESS) {
- if (locked) {
- rw_exit(&key->zk_salt_lock);
- locked = B_FALSE;
- }
-
- return (0);
- }
- /* If the hardware implementation fails fall back to software */
- }
-
- bzero(&puio, sizeof (uio_t));
- bzero(&cuio, sizeof (uio_t));
-
- /* create uios for encryption */
- ret = zio_crypt_init_uios(encrypt, key->zk_version, ot, plainbuf,
- cipherbuf, datalen, byteswap, mac, &puio, &cuio, &enc_len,
- &authbuf, &auth_len, no_crypt);
- if (ret != 0)
- goto error;
-
- /* perform the encryption / decryption in software */
- ret = zio_do_crypt_uio(encrypt, key->zk_crypt, ckey, tmpl, iv, enc_len,
- &puio, &cuio, authbuf, auth_len);
- if (ret != 0)
- goto error;
-
- if (locked) {
- rw_exit(&key->zk_salt_lock);
- locked = B_FALSE;
- }
-
- if (authbuf != NULL)
- zio_buf_free(authbuf, datalen);
- if (ckey == &tmp_ckey)
- bzero(enc_keydata, keydata_len);
- zio_crypt_destroy_uio(&puio);
- zio_crypt_destroy_uio(&cuio);
-
- return (0);
-
-error:
- if (locked)
- rw_exit(&key->zk_salt_lock);
- if (authbuf != NULL)
- zio_buf_free(authbuf, datalen);
- if (ckey == &tmp_ckey)
- bzero(enc_keydata, keydata_len);
- zio_crypt_destroy_uio(&puio);
- zio_crypt_destroy_uio(&cuio);
-
- return (ret);
-}
-
-/*
- * Simple wrapper around zio_do_crypt_data() to work with abd's instead of
- * linear buffers.
- */
-int
-zio_do_crypt_abd(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot,
- boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac,
- uint_t datalen, abd_t *pabd, abd_t *cabd, boolean_t *no_crypt)
-{
- int ret;
- void *ptmp, *ctmp;
-
- if (encrypt) {
- ptmp = abd_borrow_buf_copy(pabd, datalen);
- ctmp = abd_borrow_buf(cabd, datalen);
- } else {
- ptmp = abd_borrow_buf(pabd, datalen);
- ctmp = abd_borrow_buf_copy(cabd, datalen);
- }
-
- ret = zio_do_crypt_data(encrypt, key, ot, byteswap, salt, iv, mac,
- datalen, ptmp, ctmp, no_crypt);
- if (ret != 0)
- goto error;
-
- if (encrypt) {
- abd_return_buf(pabd, ptmp, datalen);
- abd_return_buf_copy(cabd, ctmp, datalen);
- } else {
- abd_return_buf_copy(pabd, ptmp, datalen);
- abd_return_buf(cabd, ctmp, datalen);
- }
-
- return (0);
-
-error:
- if (encrypt) {
- abd_return_buf(pabd, ptmp, datalen);
- abd_return_buf_copy(cabd, ctmp, datalen);
- } else {
- abd_return_buf_copy(pabd, ptmp, datalen);
- abd_return_buf(cabd, ctmp, datalen);
- }
-
- return (ret);
-}
-
-#if defined(_KERNEL)
-/* BEGIN CSTYLED */
-module_param(zfs_key_max_salt_uses, ulong, 0644);
-MODULE_PARM_DESC(zfs_key_max_salt_uses, "Max number of times a salt value "
- "can be used for generating encryption keys before it is rotated");
-/* END CSTYLED */
-#endif
diff --git a/module/zfs/zpl_ctldir.c b/module/zfs/zpl_ctldir.c
deleted file mode 100644
index 6df367b81..000000000
--- a/module/zfs/zpl_ctldir.c
+++ /dev/null
@@ -1,572 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (C) 2011 Lawrence Livermore National Security, LLC.
- * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
- * LLNL-CODE-403049.
- * Rewritten for Linux by:
- * Rohan Puri <[email protected]>
- * Brian Behlendorf <[email protected]>
- */
-
-#include <sys/zfs_vfsops.h>
-#include <sys/zfs_vnops.h>
-#include <sys/zfs_znode.h>
-#include <sys/zfs_ctldir.h>
-#include <sys/zpl.h>
-
-/*
- * Common open routine. Disallow any write access.
- */
-/* ARGSUSED */
-static int
-zpl_common_open(struct inode *ip, struct file *filp)
-{
- if (filp->f_mode & FMODE_WRITE)
- return (-EACCES);
-
- return (generic_file_open(ip, filp));
-}
-
-/*
- * Get root directory contents.
- */
-static int
-zpl_root_iterate(struct file *filp, zpl_dir_context_t *ctx)
-{
- zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp));
- int error = 0;
-
- ZFS_ENTER(zfsvfs);
-
- if (!zpl_dir_emit_dots(filp, ctx))
- goto out;
-
- if (ctx->pos == 2) {
- if (!zpl_dir_emit(ctx, ZFS_SNAPDIR_NAME,
- strlen(ZFS_SNAPDIR_NAME), ZFSCTL_INO_SNAPDIR, DT_DIR))
- goto out;
-
- ctx->pos++;
- }
-
- if (ctx->pos == 3) {
- if (!zpl_dir_emit(ctx, ZFS_SHAREDIR_NAME,
- strlen(ZFS_SHAREDIR_NAME), ZFSCTL_INO_SHARES, DT_DIR))
- goto out;
-
- ctx->pos++;
- }
-out:
- ZFS_EXIT(zfsvfs);
-
- return (error);
-}
-
-#if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED)
-static int
-zpl_root_readdir(struct file *filp, void *dirent, filldir_t filldir)
-{
- zpl_dir_context_t ctx =
- ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos);
- int error;
-
- error = zpl_root_iterate(filp, &ctx);
- filp->f_pos = ctx.pos;
-
- return (error);
-}
-#endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */
-
-/*
- * Get root directory attributes.
- */
-/* ARGSUSED */
-static int
-zpl_root_getattr_impl(const struct path *path, struct kstat *stat,
- u32 request_mask, unsigned int query_flags)
-{
- struct inode *ip = path->dentry->d_inode;
-
- generic_fillattr(ip, stat);
- stat->atime = current_time(ip);
-
- return (0);
-}
-ZPL_GETATTR_WRAPPER(zpl_root_getattr);
-
-static struct dentry *
-#ifdef HAVE_LOOKUP_NAMEIDATA
-zpl_root_lookup(struct inode *dip, struct dentry *dentry, struct nameidata *nd)
-#else
-zpl_root_lookup(struct inode *dip, struct dentry *dentry, unsigned int flags)
-#endif
-{
- cred_t *cr = CRED();
- struct inode *ip;
- int error;
-
- crhold(cr);
- error = -zfsctl_root_lookup(dip, dname(dentry), &ip, 0, cr, NULL, NULL);
- ASSERT3S(error, <=, 0);
- crfree(cr);
-
- if (error) {
- if (error == -ENOENT)
- return (d_splice_alias(NULL, dentry));
- else
- return (ERR_PTR(error));
- }
-
- return (d_splice_alias(ip, dentry));
-}
-
-/*
- * The '.zfs' control directory file and inode operations.
- */
-const struct file_operations zpl_fops_root = {
- .open = zpl_common_open,
- .llseek = generic_file_llseek,
- .read = generic_read_dir,
-#ifdef HAVE_VFS_ITERATE_SHARED
- .iterate_shared = zpl_root_iterate,
-#elif defined(HAVE_VFS_ITERATE)
- .iterate = zpl_root_iterate,
-#else
- .readdir = zpl_root_readdir,
-#endif
-};
-
-const struct inode_operations zpl_ops_root = {
- .lookup = zpl_root_lookup,
- .getattr = zpl_root_getattr,
-};
-
-#ifdef HAVE_AUTOMOUNT
-static struct vfsmount *
-zpl_snapdir_automount(struct path *path)
-{
- int error;
-
- error = -zfsctl_snapshot_mount(path, 0);
- if (error)
- return (ERR_PTR(error));
-
- /*
- * Rather than returning the new vfsmount for the snapshot we must
- * return NULL to indicate a mount collision. This is done because
- * the user space mount calls do_add_mount() which adds the vfsmount
- * to the name space. If we returned the new mount here it would be
- * added again to the vfsmount list resulting in list corruption.
- */
- return (NULL);
-}
-#endif /* HAVE_AUTOMOUNT */
-
-/*
- * Negative dentries must always be revalidated so newly created snapshots
- * can be detected and automounted. Normal dentries should be kept because
- * as of the 3.18 kernel revaliding the mountpoint dentry will result in
- * the snapshot being immediately unmounted.
- */
-static int
-#ifdef HAVE_D_REVALIDATE_NAMEIDATA
-zpl_snapdir_revalidate(struct dentry *dentry, struct nameidata *i)
-#else
-zpl_snapdir_revalidate(struct dentry *dentry, unsigned int flags)
-#endif
-{
- return (!!dentry->d_inode);
-}
-
-dentry_operations_t zpl_dops_snapdirs = {
-/*
- * Auto mounting of snapshots is only supported for 2.6.37 and
- * newer kernels. Prior to this kernel the ops->follow_link()
- * callback was used as a hack to trigger the mount. The
- * resulting vfsmount was then explicitly grafted in to the
- * name space. While it might be possible to add compatibility
- * code to accomplish this it would require considerable care.
- */
-#ifdef HAVE_AUTOMOUNT
- .d_automount = zpl_snapdir_automount,
-#endif /* HAVE_AUTOMOUNT */
- .d_revalidate = zpl_snapdir_revalidate,
-};
-
-static struct dentry *
-#ifdef HAVE_LOOKUP_NAMEIDATA
-zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry,
- struct nameidata *nd)
-#else
-zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry,
- unsigned int flags)
-#endif
-
-{
- fstrans_cookie_t cookie;
- cred_t *cr = CRED();
- struct inode *ip = NULL;
- int error;
-
- crhold(cr);
- cookie = spl_fstrans_mark();
- error = -zfsctl_snapdir_lookup(dip, dname(dentry), &ip,
- 0, cr, NULL, NULL);
- ASSERT3S(error, <=, 0);
- spl_fstrans_unmark(cookie);
- crfree(cr);
-
- if (error && error != -ENOENT)
- return (ERR_PTR(error));
-
- ASSERT(error == 0 || ip == NULL);
- d_clear_d_op(dentry);
- d_set_d_op(dentry, &zpl_dops_snapdirs);
-#ifdef HAVE_AUTOMOUNT
- dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
-#endif
-
- return (d_splice_alias(ip, dentry));
-}
-
-static int
-zpl_snapdir_iterate(struct file *filp, zpl_dir_context_t *ctx)
-{
- zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp));
- fstrans_cookie_t cookie;
- char snapname[MAXNAMELEN];
- boolean_t case_conflict;
- uint64_t id, pos;
- int error = 0;
-
- ZFS_ENTER(zfsvfs);
- cookie = spl_fstrans_mark();
-
- if (!zpl_dir_emit_dots(filp, ctx))
- goto out;
-
- pos = ctx->pos;
- while (error == 0) {
- dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG);
- error = -dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN,
- snapname, &id, &pos, &case_conflict);
- dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG);
- if (error)
- goto out;
-
- if (!zpl_dir_emit(ctx, snapname, strlen(snapname),
- ZFSCTL_INO_SHARES - id, DT_DIR))
- goto out;
-
- ctx->pos = pos;
- }
-out:
- spl_fstrans_unmark(cookie);
- ZFS_EXIT(zfsvfs);
-
- if (error == -ENOENT)
- return (0);
-
- return (error);
-}
-
-#if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED)
-static int
-zpl_snapdir_readdir(struct file *filp, void *dirent, filldir_t filldir)
-{
- zpl_dir_context_t ctx =
- ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos);
- int error;
-
- error = zpl_snapdir_iterate(filp, &ctx);
- filp->f_pos = ctx.pos;
-
- return (error);
-}
-#endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */
-
-static int
-zpl_snapdir_rename2(struct inode *sdip, struct dentry *sdentry,
- struct inode *tdip, struct dentry *tdentry, unsigned int flags)
-{
- cred_t *cr = CRED();
- int error;
-
- /* We probably don't want to support renameat2(2) in ctldir */
- if (flags)
- return (-EINVAL);
-
- crhold(cr);
- error = -zfsctl_snapdir_rename(sdip, dname(sdentry),
- tdip, dname(tdentry), cr, 0);
- ASSERT3S(error, <=, 0);
- crfree(cr);
-
- return (error);
-}
-
-#ifndef HAVE_RENAME_WANTS_FLAGS
-static int
-zpl_snapdir_rename(struct inode *sdip, struct dentry *sdentry,
- struct inode *tdip, struct dentry *tdentry)
-{
- return (zpl_snapdir_rename2(sdip, sdentry, tdip, tdentry, 0));
-}
-#endif
-
-static int
-zpl_snapdir_rmdir(struct inode *dip, struct dentry *dentry)
-{
- cred_t *cr = CRED();
- int error;
-
- crhold(cr);
- error = -zfsctl_snapdir_remove(dip, dname(dentry), cr, 0);
- ASSERT3S(error, <=, 0);
- crfree(cr);
-
- return (error);
-}
-
-static int
-zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, zpl_umode_t mode)
-{
- cred_t *cr = CRED();
- vattr_t *vap;
- struct inode *ip;
- int error;
-
- crhold(cr);
- vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
- zpl_vap_init(vap, dip, mode | S_IFDIR, cr);
-
- error = -zfsctl_snapdir_mkdir(dip, dname(dentry), vap, &ip, cr, 0);
- if (error == 0) {
- d_clear_d_op(dentry);
- d_set_d_op(dentry, &zpl_dops_snapdirs);
- d_instantiate(dentry, ip);
- }
-
- kmem_free(vap, sizeof (vattr_t));
- ASSERT3S(error, <=, 0);
- crfree(cr);
-
- return (error);
-}
-
-/*
- * Get snapshot directory attributes.
- */
-/* ARGSUSED */
-static int
-zpl_snapdir_getattr_impl(const struct path *path, struct kstat *stat,
- u32 request_mask, unsigned int query_flags)
-{
- struct inode *ip = path->dentry->d_inode;
- zfsvfs_t *zfsvfs = ITOZSB(ip);
-
- ZFS_ENTER(zfsvfs);
- generic_fillattr(ip, stat);
-
- stat->nlink = stat->size = 2;
- stat->ctime = stat->mtime = dmu_objset_snap_cmtime(zfsvfs->z_os);
- stat->atime = current_time(ip);
- ZFS_EXIT(zfsvfs);
-
- return (0);
-}
-ZPL_GETATTR_WRAPPER(zpl_snapdir_getattr);
-
-/*
- * The '.zfs/snapshot' directory file operations. These mainly control
- * generating the list of available snapshots when doing an 'ls' in the
- * directory. See zpl_snapdir_readdir().
- */
-const struct file_operations zpl_fops_snapdir = {
- .open = zpl_common_open,
- .llseek = generic_file_llseek,
- .read = generic_read_dir,
-#ifdef HAVE_VFS_ITERATE_SHARED
- .iterate_shared = zpl_snapdir_iterate,
-#elif defined(HAVE_VFS_ITERATE)
- .iterate = zpl_snapdir_iterate,
-#else
- .readdir = zpl_snapdir_readdir,
-#endif
-
-};
-
-/*
- * The '.zfs/snapshot' directory inode operations. These mainly control
- * creating an inode for a snapshot directory and initializing the needed
- * infrastructure to automount the snapshot. See zpl_snapdir_lookup().
- */
-const struct inode_operations zpl_ops_snapdir = {
- .lookup = zpl_snapdir_lookup,
- .getattr = zpl_snapdir_getattr,
-#ifdef HAVE_RENAME_WANTS_FLAGS
- .rename = zpl_snapdir_rename2,
-#else
- .rename = zpl_snapdir_rename,
-#endif
- .rmdir = zpl_snapdir_rmdir,
- .mkdir = zpl_snapdir_mkdir,
-};
-
-static struct dentry *
-#ifdef HAVE_LOOKUP_NAMEIDATA
-zpl_shares_lookup(struct inode *dip, struct dentry *dentry,
- struct nameidata *nd)
-#else
-zpl_shares_lookup(struct inode *dip, struct dentry *dentry,
- unsigned int flags)
-#endif
-{
- fstrans_cookie_t cookie;
- cred_t *cr = CRED();
- struct inode *ip = NULL;
- int error;
-
- crhold(cr);
- cookie = spl_fstrans_mark();
- error = -zfsctl_shares_lookup(dip, dname(dentry), &ip,
- 0, cr, NULL, NULL);
- ASSERT3S(error, <=, 0);
- spl_fstrans_unmark(cookie);
- crfree(cr);
-
- if (error) {
- if (error == -ENOENT)
- return (d_splice_alias(NULL, dentry));
- else
- return (ERR_PTR(error));
- }
-
- return (d_splice_alias(ip, dentry));
-}
-
-static int
-zpl_shares_iterate(struct file *filp, zpl_dir_context_t *ctx)
-{
- fstrans_cookie_t cookie;
- cred_t *cr = CRED();
- zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp));
- znode_t *dzp;
- int error = 0;
-
- ZFS_ENTER(zfsvfs);
- cookie = spl_fstrans_mark();
-
- if (zfsvfs->z_shares_dir == 0) {
- zpl_dir_emit_dots(filp, ctx);
- goto out;
- }
-
- error = -zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp);
- if (error)
- goto out;
-
- crhold(cr);
- error = -zfs_readdir(ZTOI(dzp), ctx, cr);
- crfree(cr);
-
- iput(ZTOI(dzp));
-out:
- spl_fstrans_unmark(cookie);
- ZFS_EXIT(zfsvfs);
- ASSERT3S(error, <=, 0);
-
- return (error);
-}
-
-#if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED)
-static int
-zpl_shares_readdir(struct file *filp, void *dirent, filldir_t filldir)
-{
- zpl_dir_context_t ctx =
- ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos);
- int error;
-
- error = zpl_shares_iterate(filp, &ctx);
- filp->f_pos = ctx.pos;
-
- return (error);
-}
-#endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */
-
-/* ARGSUSED */
-static int
-zpl_shares_getattr_impl(const struct path *path, struct kstat *stat,
- u32 request_mask, unsigned int query_flags)
-{
- struct inode *ip = path->dentry->d_inode;
- zfsvfs_t *zfsvfs = ITOZSB(ip);
- znode_t *dzp;
- int error;
-
- ZFS_ENTER(zfsvfs);
-
- if (zfsvfs->z_shares_dir == 0) {
- generic_fillattr(path->dentry->d_inode, stat);
- stat->nlink = stat->size = 2;
- stat->atime = current_time(ip);
- ZFS_EXIT(zfsvfs);
- return (0);
- }
-
- error = -zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp);
- if (error == 0) {
- error = -zfs_getattr_fast(ZTOI(dzp), stat);
- iput(ZTOI(dzp));
- }
-
- ZFS_EXIT(zfsvfs);
- ASSERT3S(error, <=, 0);
-
- return (error);
-}
-ZPL_GETATTR_WRAPPER(zpl_shares_getattr);
-
-/*
- * The '.zfs/shares' directory file operations.
- */
-const struct file_operations zpl_fops_shares = {
- .open = zpl_common_open,
- .llseek = generic_file_llseek,
- .read = generic_read_dir,
-#ifdef HAVE_VFS_ITERATE_SHARED
- .iterate_shared = zpl_shares_iterate,
-#elif defined(HAVE_VFS_ITERATE)
- .iterate = zpl_shares_iterate,
-#else
- .readdir = zpl_shares_readdir,
-#endif
-
-};
-
-/*
- * The '.zfs/shares' directory inode operations.
- */
-const struct inode_operations zpl_ops_shares = {
- .lookup = zpl_shares_lookup,
- .getattr = zpl_shares_getattr,
-};
diff --git a/module/zfs/zpl_export.c b/module/zfs/zpl_export.c
deleted file mode 100644
index a264d664c..000000000
--- a/module/zfs/zpl_export.c
+++ /dev/null
@@ -1,177 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2011 Gunnar Beutner
- * Copyright (c) 2012 Cyril Plisko. All rights reserved.
- */
-
-
-#include <sys/zfs_vnops.h>
-#include <sys/zfs_znode.h>
-#include <sys/zfs_ctldir.h>
-#include <sys/zpl.h>
-
-
-static int
-#ifdef HAVE_ENCODE_FH_WITH_INODE
-zpl_encode_fh(struct inode *ip, __u32 *fh, int *max_len, struct inode *parent)
-{
-#else
-zpl_encode_fh(struct dentry *dentry, __u32 *fh, int *max_len, int connectable)
-{
- /* CSTYLED */
- struct inode *ip = dentry->d_inode;
-#endif /* HAVE_ENCODE_FH_WITH_INODE */
- fstrans_cookie_t cookie;
- fid_t *fid = (fid_t *)fh;
- int len_bytes, rc;
-
- len_bytes = *max_len * sizeof (__u32);
-
- if (len_bytes < offsetof(fid_t, fid_data))
- return (255);
-
- fid->fid_len = len_bytes - offsetof(fid_t, fid_data);
- cookie = spl_fstrans_mark();
-
- if (zfsctl_is_node(ip))
- rc = zfsctl_fid(ip, fid);
- else
- rc = zfs_fid(ip, fid);
-
- spl_fstrans_unmark(cookie);
- len_bytes = offsetof(fid_t, fid_data) + fid->fid_len;
- *max_len = roundup(len_bytes, sizeof (__u32)) / sizeof (__u32);
-
- return (rc == 0 ? FILEID_INO32_GEN : 255);
-}
-
-static struct dentry *
-zpl_dentry_obtain_alias(struct inode *ip)
-{
- struct dentry *result;
-
-#ifdef HAVE_D_OBTAIN_ALIAS
- result = d_obtain_alias(ip);
-#else
- result = d_alloc_anon(ip);
-
- if (result == NULL) {
- iput(ip);
- result = ERR_PTR(-ENOMEM);
- }
-#endif /* HAVE_D_OBTAIN_ALIAS */
-
- return (result);
-}
-
-static struct dentry *
-zpl_fh_to_dentry(struct super_block *sb, struct fid *fh,
- int fh_len, int fh_type)
-{
- fid_t *fid = (fid_t *)fh;
- fstrans_cookie_t cookie;
- struct inode *ip;
- int len_bytes, rc;
-
- len_bytes = fh_len * sizeof (__u32);
-
- if (fh_type != FILEID_INO32_GEN ||
- len_bytes < offsetof(fid_t, fid_data) ||
- len_bytes < offsetof(fid_t, fid_data) + fid->fid_len)
- return (ERR_PTR(-EINVAL));
-
- cookie = spl_fstrans_mark();
- rc = zfs_vget(sb, &ip, fid);
- spl_fstrans_unmark(cookie);
-
- if (rc) {
- /*
- * If we see ENOENT it might mean that an NFSv4 * client
- * is using a cached inode value in a file handle and
- * that the sought after file has had its inode changed
- * by a third party. So change the error to ESTALE
- * which will trigger a full lookup by the client and
- * will find the new filename/inode pair if it still
- * exists.
- */
- if (rc == ENOENT)
- rc = ESTALE;
-
- return (ERR_PTR(-rc));
- }
-
- ASSERT((ip != NULL) && !IS_ERR(ip));
-
- return (zpl_dentry_obtain_alias(ip));
-}
-
-static struct dentry *
-zpl_get_parent(struct dentry *child)
-{
- cred_t *cr = CRED();
- fstrans_cookie_t cookie;
- struct inode *ip;
- int error;
-
- crhold(cr);
- cookie = spl_fstrans_mark();
- error = -zfs_lookup(child->d_inode, "..", &ip, 0, cr, NULL, NULL);
- spl_fstrans_unmark(cookie);
- crfree(cr);
- ASSERT3S(error, <=, 0);
-
- if (error)
- return (ERR_PTR(error));
-
- return (zpl_dentry_obtain_alias(ip));
-}
-
-#ifdef HAVE_COMMIT_METADATA
-static int
-zpl_commit_metadata(struct inode *inode)
-{
- cred_t *cr = CRED();
- fstrans_cookie_t cookie;
- int error;
-
- if (zfsctl_is_node(inode))
- return (0);
-
- crhold(cr);
- cookie = spl_fstrans_mark();
- error = -zfs_fsync(inode, 0, cr);
- spl_fstrans_unmark(cookie);
- crfree(cr);
- ASSERT3S(error, <=, 0);
-
- return (error);
-}
-#endif /* HAVE_COMMIT_METADATA */
-
-const struct export_operations zpl_export_operations = {
- .encode_fh = zpl_encode_fh,
- .fh_to_dentry = zpl_fh_to_dentry,
- .get_parent = zpl_get_parent,
-#ifdef HAVE_COMMIT_METADATA
- .commit_metadata = zpl_commit_metadata,
-#endif /* HAVE_COMMIT_METADATA */
-};
diff --git a/module/zfs/zpl_file.c b/module/zfs/zpl_file.c
deleted file mode 100644
index acad4670d..000000000
--- a/module/zfs/zpl_file.c
+++ /dev/null
@@ -1,1075 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
- * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
- */
-
-
-#ifdef CONFIG_COMPAT
-#include <linux/compat.h>
-#endif
-#include <sys/file.h>
-#include <sys/dmu_objset.h>
-#include <sys/zfs_vfsops.h>
-#include <sys/zfs_vnops.h>
-#include <sys/zfs_znode.h>
-#include <sys/zfs_project.h>
-
-
-static int
-zpl_open(struct inode *ip, struct file *filp)
-{
- cred_t *cr = CRED();
- int error;
- fstrans_cookie_t cookie;
-
- error = generic_file_open(ip, filp);
- if (error)
- return (error);
-
- crhold(cr);
- cookie = spl_fstrans_mark();
- error = -zfs_open(ip, filp->f_mode, filp->f_flags, cr);
- spl_fstrans_unmark(cookie);
- crfree(cr);
- ASSERT3S(error, <=, 0);
-
- return (error);
-}
-
-static int
-zpl_release(struct inode *ip, struct file *filp)
-{
- cred_t *cr = CRED();
- int error;
- fstrans_cookie_t cookie;
-
- cookie = spl_fstrans_mark();
- if (ITOZ(ip)->z_atime_dirty)
- zfs_mark_inode_dirty(ip);
-
- crhold(cr);
- error = -zfs_close(ip, filp->f_flags, cr);
- spl_fstrans_unmark(cookie);
- crfree(cr);
- ASSERT3S(error, <=, 0);
-
- return (error);
-}
-
-static int
-zpl_iterate(struct file *filp, zpl_dir_context_t *ctx)
-{
- cred_t *cr = CRED();
- int error;
- fstrans_cookie_t cookie;
-
- crhold(cr);
- cookie = spl_fstrans_mark();
- error = -zfs_readdir(file_inode(filp), ctx, cr);
- spl_fstrans_unmark(cookie);
- crfree(cr);
- ASSERT3S(error, <=, 0);
-
- return (error);
-}
-
-#if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED)
-static int
-zpl_readdir(struct file *filp, void *dirent, filldir_t filldir)
-{
- zpl_dir_context_t ctx =
- ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos);
- int error;
-
- error = zpl_iterate(filp, &ctx);
- filp->f_pos = ctx.pos;
-
- return (error);
-}
-#endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */
-
-#if defined(HAVE_FSYNC_WITH_DENTRY)
-/*
- * Linux 2.6.x - 2.6.34 API,
- * Through 2.6.34 the nfsd kernel server would pass a NULL 'file struct *'
- * to the fops->fsync() hook. For this reason, we must be careful not to
- * use filp unconditionally.
- */
-static int
-zpl_fsync(struct file *filp, struct dentry *dentry, int datasync)
-{
- cred_t *cr = CRED();
- int error;
- fstrans_cookie_t cookie;
-
- crhold(cr);
- cookie = spl_fstrans_mark();
- error = -zfs_fsync(dentry->d_inode, datasync, cr);
- spl_fstrans_unmark(cookie);
- crfree(cr);
- ASSERT3S(error, <=, 0);
-
- return (error);
-}
-
-#ifdef HAVE_FILE_AIO_FSYNC
-static int
-zpl_aio_fsync(struct kiocb *kiocb, int datasync)
-{
- struct file *filp = kiocb->ki_filp;
- return (zpl_fsync(filp, file_dentry(filp), datasync));
-}
-#endif
-
-#elif defined(HAVE_FSYNC_WITHOUT_DENTRY)
-/*
- * Linux 2.6.35 - 3.0 API,
- * As of 2.6.35 the dentry argument to the fops->fsync() hook was deemed
- * redundant. The dentry is still accessible via filp->f_path.dentry,
- * and we are guaranteed that filp will never be NULL.
- */
-static int
-zpl_fsync(struct file *filp, int datasync)
-{
- struct inode *inode = filp->f_mapping->host;
- cred_t *cr = CRED();
- int error;
- fstrans_cookie_t cookie;
-
- crhold(cr);
- cookie = spl_fstrans_mark();
- error = -zfs_fsync(inode, datasync, cr);
- spl_fstrans_unmark(cookie);
- crfree(cr);
- ASSERT3S(error, <=, 0);
-
- return (error);
-}
-
-#ifdef HAVE_FILE_AIO_FSYNC
-static int
-zpl_aio_fsync(struct kiocb *kiocb, int datasync)
-{
- return (zpl_fsync(kiocb->ki_filp, datasync));
-}
-#endif
-
-#elif defined(HAVE_FSYNC_RANGE)
-/*
- * Linux 3.1 - 3.x API,
- * As of 3.1 the responsibility to call filemap_write_and_wait_range() has
- * been pushed down in to the .fsync() vfs hook. Additionally, the i_mutex
- * lock is no longer held by the caller, for zfs we don't require the lock
- * to be held so we don't acquire it.
- */
-static int
-zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
-{
- struct inode *inode = filp->f_mapping->host;
- cred_t *cr = CRED();
- int error;
- fstrans_cookie_t cookie;
-
- error = filemap_write_and_wait_range(inode->i_mapping, start, end);
- if (error)
- return (error);
-
- crhold(cr);
- cookie = spl_fstrans_mark();
- error = -zfs_fsync(inode, datasync, cr);
- spl_fstrans_unmark(cookie);
- crfree(cr);
- ASSERT3S(error, <=, 0);
-
- return (error);
-}
-
-#ifdef HAVE_FILE_AIO_FSYNC
-static int
-zpl_aio_fsync(struct kiocb *kiocb, int datasync)
-{
- return (zpl_fsync(kiocb->ki_filp, kiocb->ki_pos, -1, datasync));
-}
-#endif
-
-#else
-#error "Unsupported fops->fsync() implementation"
-#endif
-
-static inline int
-zfs_io_flags(struct kiocb *kiocb)
-{
- int flags = 0;
-
-#if defined(IOCB_DSYNC)
- if (kiocb->ki_flags & IOCB_DSYNC)
- flags |= FDSYNC;
-#endif
-#if defined(IOCB_SYNC)
- if (kiocb->ki_flags & IOCB_SYNC)
- flags |= FSYNC;
-#endif
-#if defined(IOCB_APPEND)
- if (kiocb->ki_flags & IOCB_APPEND)
- flags |= FAPPEND;
-#endif
-#if defined(IOCB_DIRECT)
- if (kiocb->ki_flags & IOCB_DIRECT)
- flags |= FDIRECT;
-#endif
- return (flags);
-}
-
-static ssize_t
-zpl_read_common_iovec(struct inode *ip, const struct iovec *iovp, size_t count,
- unsigned long nr_segs, loff_t *ppos, uio_seg_t segment, int flags,
- cred_t *cr, size_t skip)
-{
- ssize_t read;
- uio_t uio = { { 0 }, 0 };
- int error;
- fstrans_cookie_t cookie;
-
- uio.uio_iov = iovp;
- uio.uio_iovcnt = nr_segs;
- uio.uio_loffset = *ppos;
- uio.uio_segflg = segment;
- uio.uio_limit = MAXOFFSET_T;
- uio.uio_resid = count;
- uio.uio_skip = skip;
-
- cookie = spl_fstrans_mark();
- error = -zfs_read(ip, &uio, flags, cr);
- spl_fstrans_unmark(cookie);
- if (error < 0)
- return (error);
-
- read = count - uio.uio_resid;
- *ppos += read;
-
- return (read);
-}
-
-inline ssize_t
-zpl_read_common(struct inode *ip, const char *buf, size_t len, loff_t *ppos,
- uio_seg_t segment, int flags, cred_t *cr)
-{
- struct iovec iov;
-
- iov.iov_base = (void *)buf;
- iov.iov_len = len;
-
- return (zpl_read_common_iovec(ip, &iov, len, 1, ppos, segment,
- flags, cr, 0));
-}
-
-static ssize_t
-zpl_iter_read_common(struct kiocb *kiocb, const struct iovec *iovp,
- unsigned long nr_segs, size_t count, uio_seg_t seg, size_t skip)
-{
- cred_t *cr = CRED();
- struct file *filp = kiocb->ki_filp;
- struct inode *ip = filp->f_mapping->host;
- zfsvfs_t *zfsvfs = ZTOZSB(ITOZ(ip));
- ssize_t read;
- unsigned int f_flags = filp->f_flags;
-
- f_flags |= zfs_io_flags(kiocb);
- crhold(cr);
- read = zpl_read_common_iovec(filp->f_mapping->host, iovp, count,
- nr_segs, &kiocb->ki_pos, seg, f_flags, cr, skip);
- crfree(cr);
-
- /*
- * If relatime is enabled, call file_accessed() only if
- * zfs_relatime_need_update() is true. This is needed since datasets
- * with inherited "relatime" property aren't necessarily mounted with
- * MNT_RELATIME flag (e.g. after `zfs set relatime=...`), which is what
- * relatime test in VFS by relatime_need_update() is based on.
- */
- if (!IS_NOATIME(ip) && zfsvfs->z_relatime) {
- if (zfs_relatime_need_update(ip))
- file_accessed(filp);
- } else {
- file_accessed(filp);
- }
-
- return (read);
-}
-
-#if defined(HAVE_VFS_RW_ITERATE)
-static ssize_t
-zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to)
-{
- ssize_t ret;
- uio_seg_t seg = UIO_USERSPACE;
- if (to->type & ITER_KVEC)
- seg = UIO_SYSSPACE;
- if (to->type & ITER_BVEC)
- seg = UIO_BVEC;
- ret = zpl_iter_read_common(kiocb, to->iov, to->nr_segs,
- iov_iter_count(to), seg, to->iov_offset);
- if (ret > 0)
- iov_iter_advance(to, ret);
- return (ret);
-}
-#else
-static ssize_t
-zpl_aio_read(struct kiocb *kiocb, const struct iovec *iovp,
- unsigned long nr_segs, loff_t pos)
-{
- ssize_t ret;
- size_t count;
-
- ret = generic_segment_checks(iovp, &nr_segs, &count, VERIFY_WRITE);
- if (ret)
- return (ret);
-
- return (zpl_iter_read_common(kiocb, iovp, nr_segs, count,
- UIO_USERSPACE, 0));
-}
-#endif /* HAVE_VFS_RW_ITERATE */
-
-static ssize_t
-zpl_write_common_iovec(struct inode *ip, const struct iovec *iovp, size_t count,
- unsigned long nr_segs, loff_t *ppos, uio_seg_t segment, int flags,
- cred_t *cr, size_t skip)
-{
- ssize_t wrote;
- uio_t uio = { { 0 }, 0 };
- int error;
- fstrans_cookie_t cookie;
-
- if (flags & O_APPEND)
- *ppos = i_size_read(ip);
-
- uio.uio_iov = iovp;
- uio.uio_iovcnt = nr_segs;
- uio.uio_loffset = *ppos;
- uio.uio_segflg = segment;
- uio.uio_limit = MAXOFFSET_T;
- uio.uio_resid = count;
- uio.uio_skip = skip;
-
- cookie = spl_fstrans_mark();
- error = -zfs_write(ip, &uio, flags, cr);
- spl_fstrans_unmark(cookie);
- if (error < 0)
- return (error);
-
- wrote = count - uio.uio_resid;
- *ppos += wrote;
-
- return (wrote);
-}
-
-inline ssize_t
-zpl_write_common(struct inode *ip, const char *buf, size_t len, loff_t *ppos,
- uio_seg_t segment, int flags, cred_t *cr)
-{
- struct iovec iov;
-
- iov.iov_base = (void *)buf;
- iov.iov_len = len;
-
- return (zpl_write_common_iovec(ip, &iov, len, 1, ppos, segment,
- flags, cr, 0));
-}
-
-static ssize_t
-zpl_iter_write_common(struct kiocb *kiocb, const struct iovec *iovp,
- unsigned long nr_segs, size_t count, uio_seg_t seg, size_t skip)
-{
- cred_t *cr = CRED();
- struct file *filp = kiocb->ki_filp;
- ssize_t wrote;
- unsigned int f_flags = filp->f_flags;
-
- f_flags |= zfs_io_flags(kiocb);
- crhold(cr);
- wrote = zpl_write_common_iovec(filp->f_mapping->host, iovp, count,
- nr_segs, &kiocb->ki_pos, seg, f_flags, cr, skip);
- crfree(cr);
-
- return (wrote);
-}
-
-#if defined(HAVE_VFS_RW_ITERATE)
-static ssize_t
-zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from)
-{
- size_t count;
- ssize_t ret;
- uio_seg_t seg = UIO_USERSPACE;
-
-#ifndef HAVE_GENERIC_WRITE_CHECKS_KIOCB
- struct file *file = kiocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *ip = mapping->host;
- int isblk = S_ISBLK(ip->i_mode);
-
- count = iov_iter_count(from);
- ret = generic_write_checks(file, &kiocb->ki_pos, &count, isblk);
- if (ret)
- return (ret);
-#else
- /*
- * XXX - ideally this check should be in the same lock region with
- * write operations, so that there's no TOCTTOU race when doing
- * append and someone else grow the file.
- */
- ret = generic_write_checks(kiocb, from);
- if (ret <= 0)
- return (ret);
- count = ret;
-#endif
-
- if (from->type & ITER_KVEC)
- seg = UIO_SYSSPACE;
- if (from->type & ITER_BVEC)
- seg = UIO_BVEC;
-
- ret = zpl_iter_write_common(kiocb, from->iov, from->nr_segs,
- count, seg, from->iov_offset);
- if (ret > 0)
- iov_iter_advance(from, ret);
-
- return (ret);
-}
-#else
-static ssize_t
-zpl_aio_write(struct kiocb *kiocb, const struct iovec *iovp,
- unsigned long nr_segs, loff_t pos)
-{
- struct file *file = kiocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *ip = mapping->host;
- int isblk = S_ISBLK(ip->i_mode);
- size_t count;
- ssize_t ret;
-
- ret = generic_segment_checks(iovp, &nr_segs, &count, VERIFY_READ);
- if (ret)
- return (ret);
-
- ret = generic_write_checks(file, &pos, &count, isblk);
- if (ret)
- return (ret);
-
- return (zpl_iter_write_common(kiocb, iovp, nr_segs, count,
- UIO_USERSPACE, 0));
-}
-#endif /* HAVE_VFS_RW_ITERATE */
-
-#if defined(HAVE_VFS_RW_ITERATE)
-static ssize_t
-zpl_direct_IO_impl(int rw, struct kiocb *kiocb, struct iov_iter *iter)
-{
- if (rw == WRITE)
- return (zpl_iter_write(kiocb, iter));
- else
- return (zpl_iter_read(kiocb, iter));
-}
-#if defined(HAVE_VFS_DIRECT_IO_ITER)
-static ssize_t
-zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter)
-{
- return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter));
-}
-#elif defined(HAVE_VFS_DIRECT_IO_ITER_OFFSET)
-static ssize_t
-zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
-{
- ASSERT3S(pos, ==, kiocb->ki_pos);
- return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter));
-}
-#elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET)
-static ssize_t
-zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
-{
- ASSERT3S(pos, ==, kiocb->ki_pos);
- return (zpl_direct_IO_impl(rw, kiocb, iter));
-}
-#else
-#error "Unknown direct IO interface"
-#endif
-
-#else
-
-#if defined(HAVE_VFS_DIRECT_IO_IOVEC)
-static ssize_t
-zpl_direct_IO(int rw, struct kiocb *kiocb, const struct iovec *iovp,
- loff_t pos, unsigned long nr_segs)
-{
- if (rw == WRITE)
- return (zpl_aio_write(kiocb, iovp, nr_segs, pos));
- else
- return (zpl_aio_read(kiocb, iovp, nr_segs, pos));
-}
-#else
-#error "Unknown direct IO interface"
-#endif
-
-#endif /* HAVE_VFS_RW_ITERATE */
-
-static loff_t
-zpl_llseek(struct file *filp, loff_t offset, int whence)
-{
-#if defined(SEEK_HOLE) && defined(SEEK_DATA)
- fstrans_cookie_t cookie;
-
- if (whence == SEEK_DATA || whence == SEEK_HOLE) {
- struct inode *ip = filp->f_mapping->host;
- loff_t maxbytes = ip->i_sb->s_maxbytes;
- loff_t error;
-
- spl_inode_lock_shared(ip);
- cookie = spl_fstrans_mark();
- error = -zfs_holey(ip, whence, &offset);
- spl_fstrans_unmark(cookie);
- if (error == 0)
- error = lseek_execute(filp, ip, offset, maxbytes);
- spl_inode_unlock_shared(ip);
-
- return (error);
- }
-#endif /* SEEK_HOLE && SEEK_DATA */
-
- return (generic_file_llseek(filp, offset, whence));
-}
-
-/*
- * It's worth taking a moment to describe how mmap is implemented
- * for zfs because it differs considerably from other Linux filesystems.
- * However, this issue is handled the same way under OpenSolaris.
- *
- * The issue is that by design zfs bypasses the Linux page cache and
- * leaves all caching up to the ARC. This has been shown to work
- * well for the common read(2)/write(2) case. However, mmap(2)
- * is problem because it relies on being tightly integrated with the
- * page cache. To handle this we cache mmap'ed files twice, once in
- * the ARC and a second time in the page cache. The code is careful
- * to keep both copies synchronized.
- *
- * When a file with an mmap'ed region is written to using write(2)
- * both the data in the ARC and existing pages in the page cache
- * are updated. For a read(2) data will be read first from the page
- * cache then the ARC if needed. Neither a write(2) or read(2) will
- * will ever result in new pages being added to the page cache.
- *
- * New pages are added to the page cache only via .readpage() which
- * is called when the vfs needs to read a page off disk to back the
- * virtual memory region. These pages may be modified without
- * notifying the ARC and will be written out periodically via
- * .writepage(). This will occur due to either a sync or the usual
- * page aging behavior. Note because a read(2) of a mmap'ed file
- * will always check the page cache first even when the ARC is out
- * of date correct data will still be returned.
- *
- * While this implementation ensures correct behavior it does have
- * have some drawbacks. The most obvious of which is that it
- * increases the required memory footprint when access mmap'ed
- * files. It also adds additional complexity to the code keeping
- * both caches synchronized.
- *
- * Longer term it may be possible to cleanly resolve this wart by
- * mapping page cache pages directly on to the ARC buffers. The
- * Linux address space operations are flexible enough to allow
- * selection of which pages back a particular index. The trick
- * would be working out the details of which subsystem is in
- * charge, the ARC, the page cache, or both. It may also prove
- * helpful to move the ARC buffers to a scatter-gather lists
- * rather than a vmalloc'ed region.
- */
-static int
-zpl_mmap(struct file *filp, struct vm_area_struct *vma)
-{
- struct inode *ip = filp->f_mapping->host;
- znode_t *zp = ITOZ(ip);
- int error;
- fstrans_cookie_t cookie;
-
- cookie = spl_fstrans_mark();
- error = -zfs_map(ip, vma->vm_pgoff, (caddr_t *)vma->vm_start,
- (size_t)(vma->vm_end - vma->vm_start), vma->vm_flags);
- spl_fstrans_unmark(cookie);
- if (error)
- return (error);
-
- error = generic_file_mmap(filp, vma);
- if (error)
- return (error);
-
- mutex_enter(&zp->z_lock);
- zp->z_is_mapped = B_TRUE;
- mutex_exit(&zp->z_lock);
-
- return (error);
-}
-
-/*
- * Populate a page with data for the Linux page cache. This function is
- * only used to support mmap(2). There will be an identical copy of the
- * data in the ARC which is kept up to date via .write() and .writepage().
- *
- * Current this function relies on zpl_read_common() and the O_DIRECT
- * flag to read in a page. This works but the more correct way is to
- * update zfs_fillpage() to be Linux friendly and use that interface.
- */
-static int
-zpl_readpage(struct file *filp, struct page *pp)
-{
- struct inode *ip;
- struct page *pl[1];
- int error = 0;
- fstrans_cookie_t cookie;
-
- ASSERT(PageLocked(pp));
- ip = pp->mapping->host;
- pl[0] = pp;
-
- cookie = spl_fstrans_mark();
- error = -zfs_getpage(ip, pl, 1);
- spl_fstrans_unmark(cookie);
-
- if (error) {
- SetPageError(pp);
- ClearPageUptodate(pp);
- } else {
- ClearPageError(pp);
- SetPageUptodate(pp);
- flush_dcache_page(pp);
- }
-
- unlock_page(pp);
- return (error);
-}
-
-/*
- * Populate a set of pages with data for the Linux page cache. This
- * function will only be called for read ahead and never for demand
- * paging. For simplicity, the code relies on read_cache_pages() to
- * correctly lock each page for IO and call zpl_readpage().
- */
-static int
-zpl_readpages(struct file *filp, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
-{
- return (read_cache_pages(mapping, pages,
- (filler_t *)zpl_readpage, filp));
-}
-
-int
-zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
-{
- struct address_space *mapping = data;
- fstrans_cookie_t cookie;
-
- ASSERT(PageLocked(pp));
- ASSERT(!PageWriteback(pp));
-
- cookie = spl_fstrans_mark();
- (void) zfs_putpage(mapping->host, pp, wbc);
- spl_fstrans_unmark(cookie);
-
- return (0);
-}
-
-static int
-zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
-{
- znode_t *zp = ITOZ(mapping->host);
- zfsvfs_t *zfsvfs = ITOZSB(mapping->host);
- enum writeback_sync_modes sync_mode;
- int result;
-
- ZFS_ENTER(zfsvfs);
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- wbc->sync_mode = WB_SYNC_ALL;
- ZFS_EXIT(zfsvfs);
- sync_mode = wbc->sync_mode;
-
- /*
- * We don't want to run write_cache_pages() in SYNC mode here, because
- * that would make putpage() wait for a single page to be committed to
- * disk every single time, resulting in atrocious performance. Instead
- * we run it once in non-SYNC mode so that the ZIL gets all the data,
- * and then we commit it all in one go.
- */
- wbc->sync_mode = WB_SYNC_NONE;
- result = write_cache_pages(mapping, wbc, zpl_putpage, mapping);
- if (sync_mode != wbc->sync_mode) {
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
- if (zfsvfs->z_log != NULL)
- zil_commit(zfsvfs->z_log, zp->z_id);
- ZFS_EXIT(zfsvfs);
-
- /*
- * We need to call write_cache_pages() again (we can't just
- * return after the commit) because the previous call in
- * non-SYNC mode does not guarantee that we got all the dirty
- * pages (see the implementation of write_cache_pages() for
- * details). That being said, this is a no-op in most cases.
- */
- wbc->sync_mode = sync_mode;
- result = write_cache_pages(mapping, wbc, zpl_putpage, mapping);
- }
- return (result);
-}
-
-/*
- * Write out dirty pages to the ARC, this function is only required to
- * support mmap(2). Mapped pages may be dirtied by memory operations
- * which never call .write(). These dirty pages are kept in sync with
- * the ARC buffers via this hook.
- */
-static int
-zpl_writepage(struct page *pp, struct writeback_control *wbc)
-{
- if (ITOZSB(pp->mapping->host)->z_os->os_sync == ZFS_SYNC_ALWAYS)
- wbc->sync_mode = WB_SYNC_ALL;
-
- return (zpl_putpage(pp, wbc, pp->mapping));
-}
-
-/*
- * The only flag combination which matches the behavior of zfs_space()
- * is FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE. The FALLOC_FL_PUNCH_HOLE
- * flag was introduced in the 2.6.38 kernel.
- */
-#if defined(HAVE_FILE_FALLOCATE) || defined(HAVE_INODE_FALLOCATE)
-long
-zpl_fallocate_common(struct inode *ip, int mode, loff_t offset, loff_t len)
-{
- int error = -EOPNOTSUPP;
-
-#if defined(FALLOC_FL_PUNCH_HOLE) && defined(FALLOC_FL_KEEP_SIZE)
- cred_t *cr = CRED();
- flock64_t bf;
- loff_t olen;
- fstrans_cookie_t cookie;
-
- if (mode != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
- return (error);
-
- if (offset < 0 || len <= 0)
- return (-EINVAL);
-
- spl_inode_lock(ip);
- olen = i_size_read(ip);
-
- if (offset > olen) {
- spl_inode_unlock(ip);
- return (0);
- }
- if (offset + len > olen)
- len = olen - offset;
- bf.l_type = F_WRLCK;
- bf.l_whence = SEEK_SET;
- bf.l_start = offset;
- bf.l_len = len;
- bf.l_pid = 0;
-
- crhold(cr);
- cookie = spl_fstrans_mark();
- error = -zfs_space(ip, F_FREESP, &bf, FWRITE, offset, cr);
- spl_fstrans_unmark(cookie);
- spl_inode_unlock(ip);
-
- crfree(cr);
-#endif /* defined(FALLOC_FL_PUNCH_HOLE) && defined(FALLOC_FL_KEEP_SIZE) */
-
- ASSERT3S(error, <=, 0);
- return (error);
-}
-#endif /* defined(HAVE_FILE_FALLOCATE) || defined(HAVE_INODE_FALLOCATE) */
-
-#ifdef HAVE_FILE_FALLOCATE
-static long
-zpl_fallocate(struct file *filp, int mode, loff_t offset, loff_t len)
-{
- return zpl_fallocate_common(file_inode(filp),
- mode, offset, len);
-}
-#endif /* HAVE_FILE_FALLOCATE */
-
-#define ZFS_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | ZFS_PROJINHERIT_FL)
-#define ZFS_FL_USER_MODIFIABLE (FS_FL_USER_MODIFIABLE | ZFS_PROJINHERIT_FL)
-
-static uint32_t
-__zpl_ioctl_getflags(struct inode *ip)
-{
- uint64_t zfs_flags = ITOZ(ip)->z_pflags;
- uint32_t ioctl_flags = 0;
-
- if (zfs_flags & ZFS_IMMUTABLE)
- ioctl_flags |= FS_IMMUTABLE_FL;
-
- if (zfs_flags & ZFS_APPENDONLY)
- ioctl_flags |= FS_APPEND_FL;
-
- if (zfs_flags & ZFS_NODUMP)
- ioctl_flags |= FS_NODUMP_FL;
-
- if (zfs_flags & ZFS_PROJINHERIT)
- ioctl_flags |= ZFS_PROJINHERIT_FL;
-
- return (ioctl_flags & ZFS_FL_USER_VISIBLE);
-}
-
-/*
- * Map zfs file z_pflags (xvattr_t) to linux file attributes. Only file
- * attributes common to both Linux and Solaris are mapped.
- */
-static int
-zpl_ioctl_getflags(struct file *filp, void __user *arg)
-{
- uint32_t flags;
- int err;
-
- flags = __zpl_ioctl_getflags(file_inode(filp));
- err = copy_to_user(arg, &flags, sizeof (flags));
-
- return (err);
-}
-
-/*
- * fchange() is a helper macro to detect if we have been asked to change a
- * flag. This is ugly, but the requirement that we do this is a consequence of
- * how the Linux file attribute interface was designed. Another consequence is
- * that concurrent modification of files suffers from a TOCTOU race. Neither
- * are things we can fix without modifying the kernel-userland interface, which
- * is outside of our jurisdiction.
- */
-
-#define fchange(f0, f1, b0, b1) (!((f0) & (b0)) != !((f1) & (b1)))
-
-static int
-__zpl_ioctl_setflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva)
-{
- uint64_t zfs_flags = ITOZ(ip)->z_pflags;
- xoptattr_t *xoap;
-
- if (ioctl_flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL |
- ZFS_PROJINHERIT_FL))
- return (-EOPNOTSUPP);
-
- if (ioctl_flags & ~ZFS_FL_USER_MODIFIABLE)
- return (-EACCES);
-
- if ((fchange(ioctl_flags, zfs_flags, FS_IMMUTABLE_FL, ZFS_IMMUTABLE) ||
- fchange(ioctl_flags, zfs_flags, FS_APPEND_FL, ZFS_APPENDONLY)) &&
- !capable(CAP_LINUX_IMMUTABLE))
- return (-EACCES);
-
- if (!zpl_inode_owner_or_capable(ip))
- return (-EACCES);
-
- xva_init(xva);
- xoap = xva_getxoptattr(xva);
-
- XVA_SET_REQ(xva, XAT_IMMUTABLE);
- if (ioctl_flags & FS_IMMUTABLE_FL)
- xoap->xoa_immutable = B_TRUE;
-
- XVA_SET_REQ(xva, XAT_APPENDONLY);
- if (ioctl_flags & FS_APPEND_FL)
- xoap->xoa_appendonly = B_TRUE;
-
- XVA_SET_REQ(xva, XAT_NODUMP);
- if (ioctl_flags & FS_NODUMP_FL)
- xoap->xoa_nodump = B_TRUE;
-
- XVA_SET_REQ(xva, XAT_PROJINHERIT);
- if (ioctl_flags & ZFS_PROJINHERIT_FL)
- xoap->xoa_projinherit = B_TRUE;
-
- return (0);
-}
-
-static int
-zpl_ioctl_setflags(struct file *filp, void __user *arg)
-{
- struct inode *ip = file_inode(filp);
- uint32_t flags;
- cred_t *cr = CRED();
- xvattr_t xva;
- int err;
- fstrans_cookie_t cookie;
-
- if (copy_from_user(&flags, arg, sizeof (flags)))
- return (-EFAULT);
-
- err = __zpl_ioctl_setflags(ip, flags, &xva);
- if (err)
- return (err);
-
- crhold(cr);
- cookie = spl_fstrans_mark();
- err = -zfs_setattr(ip, (vattr_t *)&xva, 0, cr);
- spl_fstrans_unmark(cookie);
- crfree(cr);
-
- return (err);
-}
-
-static int
-zpl_ioctl_getxattr(struct file *filp, void __user *arg)
-{
- zfsxattr_t fsx = { 0 };
- struct inode *ip = file_inode(filp);
- int err;
-
- fsx.fsx_xflags = __zpl_ioctl_getflags(ip);
- fsx.fsx_projid = ITOZ(ip)->z_projid;
- err = copy_to_user(arg, &fsx, sizeof (fsx));
-
- return (err);
-}
-
-static int
-zpl_ioctl_setxattr(struct file *filp, void __user *arg)
-{
- struct inode *ip = file_inode(filp);
- zfsxattr_t fsx;
- cred_t *cr = CRED();
- xvattr_t xva;
- xoptattr_t *xoap;
- int err;
- fstrans_cookie_t cookie;
-
- if (copy_from_user(&fsx, arg, sizeof (fsx)))
- return (-EFAULT);
-
- if (!zpl_is_valid_projid(fsx.fsx_projid))
- return (-EINVAL);
-
- err = __zpl_ioctl_setflags(ip, fsx.fsx_xflags, &xva);
- if (err)
- return (err);
-
- xoap = xva_getxoptattr(&xva);
- XVA_SET_REQ(&xva, XAT_PROJID);
- xoap->xoa_projid = fsx.fsx_projid;
-
- crhold(cr);
- cookie = spl_fstrans_mark();
- err = -zfs_setattr(ip, (vattr_t *)&xva, 0, cr);
- spl_fstrans_unmark(cookie);
- crfree(cr);
-
- return (err);
-}
-
-static long
-zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
-{
- switch (cmd) {
- case FS_IOC_GETFLAGS:
- return (zpl_ioctl_getflags(filp, (void *)arg));
- case FS_IOC_SETFLAGS:
- return (zpl_ioctl_setflags(filp, (void *)arg));
- case ZFS_IOC_FSGETXATTR:
- return (zpl_ioctl_getxattr(filp, (void *)arg));
- case ZFS_IOC_FSSETXATTR:
- return (zpl_ioctl_setxattr(filp, (void *)arg));
- default:
- return (-ENOTTY);
- }
-}
-
-#ifdef CONFIG_COMPAT
-static long
-zpl_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
-{
- switch (cmd) {
- case FS_IOC32_GETFLAGS:
- cmd = FS_IOC_GETFLAGS;
- break;
- case FS_IOC32_SETFLAGS:
- cmd = FS_IOC_SETFLAGS;
- break;
- default:
- return (-ENOTTY);
- }
- return (zpl_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)));
-}
-#endif /* CONFIG_COMPAT */
-
-
-const struct address_space_operations zpl_address_space_operations = {
- .readpages = zpl_readpages,
- .readpage = zpl_readpage,
- .writepage = zpl_writepage,
- .writepages = zpl_writepages,
- .direct_IO = zpl_direct_IO,
-};
-
-const struct file_operations zpl_file_operations = {
- .open = zpl_open,
- .release = zpl_release,
- .llseek = zpl_llseek,
-#ifdef HAVE_VFS_RW_ITERATE
-#ifdef HAVE_NEW_SYNC_READ
- .read = new_sync_read,
- .write = new_sync_write,
-#endif
- .read_iter = zpl_iter_read,
- .write_iter = zpl_iter_write,
-#else
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = zpl_aio_read,
- .aio_write = zpl_aio_write,
-#endif
- .mmap = zpl_mmap,
- .fsync = zpl_fsync,
-#ifdef HAVE_FILE_AIO_FSYNC
- .aio_fsync = zpl_aio_fsync,
-#endif
-#ifdef HAVE_FILE_FALLOCATE
- .fallocate = zpl_fallocate,
-#endif /* HAVE_FILE_FALLOCATE */
- .unlocked_ioctl = zpl_ioctl,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = zpl_compat_ioctl,
-#endif
-};
-
-const struct file_operations zpl_dir_file_operations = {
- .llseek = generic_file_llseek,
- .read = generic_read_dir,
-#if defined(HAVE_VFS_ITERATE_SHARED)
- .iterate_shared = zpl_iterate,
-#elif defined(HAVE_VFS_ITERATE)
- .iterate = zpl_iterate,
-#else
- .readdir = zpl_readdir,
-#endif
- .fsync = zpl_fsync,
- .unlocked_ioctl = zpl_ioctl,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = zpl_compat_ioctl,
-#endif
-};
diff --git a/module/zfs/zpl_inode.c b/module/zfs/zpl_inode.c
deleted file mode 100644
index 3f3b2e2dc..000000000
--- a/module/zfs/zpl_inode.c
+++ /dev/null
@@ -1,826 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
- * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
- */
-
-
-#include <sys/zfs_ctldir.h>
-#include <sys/zfs_vfsops.h>
-#include <sys/zfs_vnops.h>
-#include <sys/zfs_znode.h>
-#include <sys/dmu_objset.h>
-#include <sys/vfs.h>
-#include <sys/zpl.h>
-#include <sys/file.h>
-
-
-static struct dentry *
-#ifdef HAVE_LOOKUP_NAMEIDATA
-zpl_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
-#else
-zpl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
-#endif
-{
- cred_t *cr = CRED();
- struct inode *ip;
- int error;
- fstrans_cookie_t cookie;
- pathname_t *ppn = NULL;
- pathname_t pn;
- int zfs_flags = 0;
- zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info;
-
- if (dlen(dentry) >= ZAP_MAXNAMELEN)
- return (ERR_PTR(-ENAMETOOLONG));
-
- crhold(cr);
- cookie = spl_fstrans_mark();
-
- /* If we are a case insensitive fs, we need the real name */
- if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
- zfs_flags = FIGNORECASE;
- pn_alloc(&pn);
- ppn = &pn;
- }
-
- error = -zfs_lookup(dir, dname(dentry), &ip, zfs_flags, cr, NULL, ppn);
- spl_fstrans_unmark(cookie);
- ASSERT3S(error, <=, 0);
- crfree(cr);
-
- spin_lock(&dentry->d_lock);
- dentry->d_time = jiffies;
-#ifndef HAVE_S_D_OP
- d_set_d_op(dentry, &zpl_dentry_operations);
-#endif /* HAVE_S_D_OP */
- spin_unlock(&dentry->d_lock);
-
- if (error) {
- /*
- * If we have a case sensitive fs, we do not want to
- * insert negative entries, so return NULL for ENOENT.
- * Fall through if the error is not ENOENT. Also free memory.
- */
- if (ppn) {
- pn_free(ppn);
- if (error == -ENOENT)
- return (NULL);
- }
-
- if (error == -ENOENT)
- return (d_splice_alias(NULL, dentry));
- else
- return (ERR_PTR(error));
- }
-
- /*
- * If we are case insensitive, call the correct function
- * to install the name.
- */
- if (ppn) {
- struct dentry *new_dentry;
- struct qstr ci_name;
-
- if (strcmp(dname(dentry), pn.pn_buf) == 0) {
- new_dentry = d_splice_alias(ip, dentry);
- } else {
- ci_name.name = pn.pn_buf;
- ci_name.len = strlen(pn.pn_buf);
- new_dentry = d_add_ci(dentry, ip, &ci_name);
- }
- pn_free(ppn);
- return (new_dentry);
- } else {
- return (d_splice_alias(ip, dentry));
- }
-}
-
-void
-zpl_vap_init(vattr_t *vap, struct inode *dir, zpl_umode_t mode, cred_t *cr)
-{
- vap->va_mask = ATTR_MODE;
- vap->va_mode = mode;
- vap->va_uid = crgetfsuid(cr);
-
- if (dir && dir->i_mode & S_ISGID) {
- vap->va_gid = KGID_TO_SGID(dir->i_gid);
- if (S_ISDIR(mode))
- vap->va_mode |= S_ISGID;
- } else {
- vap->va_gid = crgetfsgid(cr);
- }
-}
-
-static int
-#ifdef HAVE_CREATE_NAMEIDATA
-zpl_create(struct inode *dir, struct dentry *dentry, zpl_umode_t mode,
- struct nameidata *nd)
-#else
-zpl_create(struct inode *dir, struct dentry *dentry, zpl_umode_t mode,
- bool flag)
-#endif
-{
- cred_t *cr = CRED();
- struct inode *ip;
- vattr_t *vap;
- int error;
- fstrans_cookie_t cookie;
-
- crhold(cr);
- vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
- zpl_vap_init(vap, dir, mode, cr);
-
- cookie = spl_fstrans_mark();
- error = -zfs_create(dir, dname(dentry), vap, 0, mode, &ip, cr, 0, NULL);
- if (error == 0) {
- d_instantiate(dentry, ip);
-
- error = zpl_xattr_security_init(ip, dir, &dentry->d_name);
- if (error == 0)
- error = zpl_init_acl(ip, dir);
-
- if (error)
- (void) zfs_remove(dir, dname(dentry), cr, 0);
- }
-
- spl_fstrans_unmark(cookie);
- kmem_free(vap, sizeof (vattr_t));
- crfree(cr);
- ASSERT3S(error, <=, 0);
-
- return (error);
-}
-
-static int
-zpl_mknod(struct inode *dir, struct dentry *dentry, zpl_umode_t mode,
- dev_t rdev)
-{
- cred_t *cr = CRED();
- struct inode *ip;
- vattr_t *vap;
- int error;
- fstrans_cookie_t cookie;
-
- /*
- * We currently expect Linux to supply rdev=0 for all sockets
- * and fifos, but we want to know if this behavior ever changes.
- */
- if (S_ISSOCK(mode) || S_ISFIFO(mode))
- ASSERT(rdev == 0);
-
- crhold(cr);
- vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
- zpl_vap_init(vap, dir, mode, cr);
- vap->va_rdev = rdev;
-
- cookie = spl_fstrans_mark();
- error = -zfs_create(dir, dname(dentry), vap, 0, mode, &ip, cr, 0, NULL);
- if (error == 0) {
- d_instantiate(dentry, ip);
-
- error = zpl_xattr_security_init(ip, dir, &dentry->d_name);
- if (error == 0)
- error = zpl_init_acl(ip, dir);
-
- if (error)
- (void) zfs_remove(dir, dname(dentry), cr, 0);
- }
-
- spl_fstrans_unmark(cookie);
- kmem_free(vap, sizeof (vattr_t));
- crfree(cr);
- ASSERT3S(error, <=, 0);
-
- return (error);
-}
-
-#ifdef HAVE_TMPFILE
-static int
-zpl_tmpfile(struct inode *dir, struct dentry *dentry, zpl_umode_t mode)
-{
- cred_t *cr = CRED();
- struct inode *ip;
- vattr_t *vap;
- int error;
- fstrans_cookie_t cookie;
-
- crhold(cr);
- vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
- zpl_vap_init(vap, dir, mode, cr);
-
- cookie = spl_fstrans_mark();
- error = -zfs_tmpfile(dir, vap, 0, mode, &ip, cr, 0, NULL);
- if (error == 0) {
- /* d_tmpfile will do drop_nlink, so we should set it first */
- set_nlink(ip, 1);
- d_tmpfile(dentry, ip);
-
- error = zpl_xattr_security_init(ip, dir, &dentry->d_name);
- if (error == 0)
- error = zpl_init_acl(ip, dir);
- /*
- * don't need to handle error here, file is already in
- * unlinked set.
- */
- }
-
- spl_fstrans_unmark(cookie);
- kmem_free(vap, sizeof (vattr_t));
- crfree(cr);
- ASSERT3S(error, <=, 0);
-
- return (error);
-}
-#endif
-
-static int
-zpl_unlink(struct inode *dir, struct dentry *dentry)
-{
- cred_t *cr = CRED();
- int error;
- fstrans_cookie_t cookie;
- zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info;
-
- crhold(cr);
- cookie = spl_fstrans_mark();
- error = -zfs_remove(dir, dname(dentry), cr, 0);
-
- /*
- * For a CI FS we must invalidate the dentry to prevent the
- * creation of negative entries.
- */
- if (error == 0 && zfsvfs->z_case == ZFS_CASE_INSENSITIVE)
- d_invalidate(dentry);
-
- spl_fstrans_unmark(cookie);
- crfree(cr);
- ASSERT3S(error, <=, 0);
-
- return (error);
-}
-
-static int
-zpl_mkdir(struct inode *dir, struct dentry *dentry, zpl_umode_t mode)
-{
- cred_t *cr = CRED();
- vattr_t *vap;
- struct inode *ip;
- int error;
- fstrans_cookie_t cookie;
-
- crhold(cr);
- vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
- zpl_vap_init(vap, dir, mode | S_IFDIR, cr);
-
- cookie = spl_fstrans_mark();
- error = -zfs_mkdir(dir, dname(dentry), vap, &ip, cr, 0, NULL);
- if (error == 0) {
- d_instantiate(dentry, ip);
-
- error = zpl_xattr_security_init(ip, dir, &dentry->d_name);
- if (error == 0)
- error = zpl_init_acl(ip, dir);
-
- if (error)
- (void) zfs_rmdir(dir, dname(dentry), NULL, cr, 0);
- }
-
- spl_fstrans_unmark(cookie);
- kmem_free(vap, sizeof (vattr_t));
- crfree(cr);
- ASSERT3S(error, <=, 0);
-
- return (error);
-}
-
-static int
-zpl_rmdir(struct inode *dir, struct dentry *dentry)
-{
- cred_t *cr = CRED();
- int error;
- fstrans_cookie_t cookie;
- zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info;
-
- crhold(cr);
- cookie = spl_fstrans_mark();
- error = -zfs_rmdir(dir, dname(dentry), NULL, cr, 0);
-
- /*
- * For a CI FS we must invalidate the dentry to prevent the
- * creation of negative entries.
- */
- if (error == 0 && zfsvfs->z_case == ZFS_CASE_INSENSITIVE)
- d_invalidate(dentry);
-
- spl_fstrans_unmark(cookie);
- crfree(cr);
- ASSERT3S(error, <=, 0);
-
- return (error);
-}
-
-static int
-zpl_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask,
- unsigned int query_flags)
-{
- int error;
- fstrans_cookie_t cookie;
-
- cookie = spl_fstrans_mark();
-
- /*
- * XXX request_mask and query_flags currently ignored.
- */
-
- error = -zfs_getattr_fast(path->dentry->d_inode, stat);
- spl_fstrans_unmark(cookie);
- ASSERT3S(error, <=, 0);
-
- return (error);
-}
-ZPL_GETATTR_WRAPPER(zpl_getattr);
-
-static int
-zpl_setattr(struct dentry *dentry, struct iattr *ia)
-{
- struct inode *ip = dentry->d_inode;
- cred_t *cr = CRED();
- vattr_t *vap;
- int error;
- fstrans_cookie_t cookie;
-
- error = setattr_prepare(dentry, ia);
- if (error)
- return (error);
-
- crhold(cr);
- vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
- vap->va_mask = ia->ia_valid & ATTR_IATTR_MASK;
- vap->va_mode = ia->ia_mode;
- vap->va_uid = KUID_TO_SUID(ia->ia_uid);
- vap->va_gid = KGID_TO_SGID(ia->ia_gid);
- vap->va_size = ia->ia_size;
- vap->va_atime = ia->ia_atime;
- vap->va_mtime = ia->ia_mtime;
- vap->va_ctime = ia->ia_ctime;
-
- if (vap->va_mask & ATTR_ATIME) {
- ip->i_atime = zpl_inode_timespec_trunc(ia->ia_atime,
- ip->i_sb->s_time_gran);
- }
-
- cookie = spl_fstrans_mark();
- error = -zfs_setattr(ip, vap, 0, cr);
- if (!error && (ia->ia_valid & ATTR_MODE))
- error = zpl_chmod_acl(ip);
-
- spl_fstrans_unmark(cookie);
- kmem_free(vap, sizeof (vattr_t));
- crfree(cr);
- ASSERT3S(error, <=, 0);
-
- return (error);
-}
-
-static int
-zpl_rename2(struct inode *sdip, struct dentry *sdentry,
- struct inode *tdip, struct dentry *tdentry, unsigned int flags)
-{
- cred_t *cr = CRED();
- int error;
- fstrans_cookie_t cookie;
-
- /* We don't have renameat2(2) support */
- if (flags)
- return (-EINVAL);
-
- crhold(cr);
- cookie = spl_fstrans_mark();
- error = -zfs_rename(sdip, dname(sdentry), tdip, dname(tdentry), cr, 0);
- spl_fstrans_unmark(cookie);
- crfree(cr);
- ASSERT3S(error, <=, 0);
-
- return (error);
-}
-
-#ifndef HAVE_RENAME_WANTS_FLAGS
-static int
-zpl_rename(struct inode *sdip, struct dentry *sdentry,
- struct inode *tdip, struct dentry *tdentry)
-{
- return (zpl_rename2(sdip, sdentry, tdip, tdentry, 0));
-}
-#endif
-
-static int
-zpl_symlink(struct inode *dir, struct dentry *dentry, const char *name)
-{
- cred_t *cr = CRED();
- vattr_t *vap;
- struct inode *ip;
- int error;
- fstrans_cookie_t cookie;
-
- crhold(cr);
- vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
- zpl_vap_init(vap, dir, S_IFLNK | S_IRWXUGO, cr);
-
- cookie = spl_fstrans_mark();
- error = -zfs_symlink(dir, dname(dentry), vap, (char *)name, &ip, cr, 0);
- if (error == 0) {
- d_instantiate(dentry, ip);
-
- error = zpl_xattr_security_init(ip, dir, &dentry->d_name);
- if (error)
- (void) zfs_remove(dir, dname(dentry), cr, 0);
- }
-
- spl_fstrans_unmark(cookie);
- kmem_free(vap, sizeof (vattr_t));
- crfree(cr);
- ASSERT3S(error, <=, 0);
-
- return (error);
-}
-
-#if defined(HAVE_PUT_LINK_COOKIE)
-static void
-zpl_put_link(struct inode *unused, void *cookie)
-{
- kmem_free(cookie, MAXPATHLEN);
-}
-#elif defined(HAVE_PUT_LINK_NAMEIDATA)
-static void
-zpl_put_link(struct dentry *dentry, struct nameidata *nd, void *ptr)
-{
- const char *link = nd_get_link(nd);
-
- if (!IS_ERR(link))
- kmem_free(link, MAXPATHLEN);
-}
-#elif defined(HAVE_PUT_LINK_DELAYED)
-static void
-zpl_put_link(void *ptr)
-{
- kmem_free(ptr, MAXPATHLEN);
-}
-#endif
-
-static int
-zpl_get_link_common(struct dentry *dentry, struct inode *ip, char **link)
-{
- fstrans_cookie_t cookie;
- cred_t *cr = CRED();
- struct iovec iov;
- uio_t uio = { { 0 }, 0 };
- int error;
-
- crhold(cr);
- *link = NULL;
- iov.iov_len = MAXPATHLEN;
- iov.iov_base = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
-
- uio.uio_iov = &iov;
- uio.uio_iovcnt = 1;
- uio.uio_segflg = UIO_SYSSPACE;
- uio.uio_resid = (MAXPATHLEN - 1);
-
- cookie = spl_fstrans_mark();
- error = -zfs_readlink(ip, &uio, cr);
- spl_fstrans_unmark(cookie);
- crfree(cr);
-
- if (error)
- kmem_free(iov.iov_base, MAXPATHLEN);
- else
- *link = iov.iov_base;
-
- return (error);
-}
-
-#if defined(HAVE_GET_LINK_DELAYED)
-const char *
-zpl_get_link(struct dentry *dentry, struct inode *inode,
- struct delayed_call *done)
-{
- char *link = NULL;
- int error;
-
- if (!dentry)
- return (ERR_PTR(-ECHILD));
-
- error = zpl_get_link_common(dentry, inode, &link);
- if (error)
- return (ERR_PTR(error));
-
- set_delayed_call(done, zpl_put_link, link);
-
- return (link);
-}
-#elif defined(HAVE_GET_LINK_COOKIE)
-const char *
-zpl_get_link(struct dentry *dentry, struct inode *inode, void **cookie)
-{
- char *link = NULL;
- int error;
-
- if (!dentry)
- return (ERR_PTR(-ECHILD));
-
- error = zpl_get_link_common(dentry, inode, &link);
- if (error)
- return (ERR_PTR(error));
-
- return (*cookie = link);
-}
-#elif defined(HAVE_FOLLOW_LINK_COOKIE)
-const char *
-zpl_follow_link(struct dentry *dentry, void **cookie)
-{
- char *link = NULL;
- int error;
-
- error = zpl_get_link_common(dentry, dentry->d_inode, &link);
- if (error)
- return (ERR_PTR(error));
-
- return (*cookie = link);
-}
-#elif defined(HAVE_FOLLOW_LINK_NAMEIDATA)
-static void *
-zpl_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
- char *link = NULL;
- int error;
-
- error = zpl_get_link_common(dentry, dentry->d_inode, &link);
- if (error)
- nd_set_link(nd, ERR_PTR(error));
- else
- nd_set_link(nd, link);
-
- return (NULL);
-}
-#endif
-
-static int
-zpl_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
-{
- cred_t *cr = CRED();
- struct inode *ip = old_dentry->d_inode;
- int error;
- fstrans_cookie_t cookie;
-
- if (ip->i_nlink >= ZFS_LINK_MAX)
- return (-EMLINK);
-
- crhold(cr);
- ip->i_ctime = current_time(ip);
- igrab(ip); /* Use ihold() if available */
-
- cookie = spl_fstrans_mark();
- error = -zfs_link(dir, ip, dname(dentry), cr, 0);
- if (error) {
- iput(ip);
- goto out;
- }
-
- d_instantiate(dentry, ip);
-out:
- spl_fstrans_unmark(cookie);
- crfree(cr);
- ASSERT3S(error, <=, 0);
-
- return (error);
-}
-
-#ifdef HAVE_INODE_TRUNCATE_RANGE
-static void
-zpl_truncate_range(struct inode *ip, loff_t start, loff_t end)
-{
- cred_t *cr = CRED();
- flock64_t bf;
- fstrans_cookie_t cookie;
-
- ASSERT3S(start, <=, end);
-
- /*
- * zfs_freesp() will interpret (len == 0) as meaning "truncate until
- * the end of the file". We don't want that.
- */
- if (start == end)
- return;
-
- crhold(cr);
-
- bf.l_type = F_WRLCK;
- bf.l_whence = SEEK_SET;
- bf.l_start = start;
- bf.l_len = end - start;
- bf.l_pid = 0;
- cookie = spl_fstrans_mark();
- zfs_space(ip, F_FREESP, &bf, FWRITE, start, cr);
- spl_fstrans_unmark(cookie);
-
- crfree(cr);
-}
-#endif /* HAVE_INODE_TRUNCATE_RANGE */
-
-#ifdef HAVE_INODE_FALLOCATE
-static long
-zpl_fallocate(struct inode *ip, int mode, loff_t offset, loff_t len)
-{
- return (zpl_fallocate_common(ip, mode, offset, len));
-}
-#endif /* HAVE_INODE_FALLOCATE */
-
-static int
-#ifdef HAVE_D_REVALIDATE_NAMEIDATA
-zpl_revalidate(struct dentry *dentry, struct nameidata *nd)
-{
- unsigned int flags = (nd ? nd->flags : 0);
-#else
-zpl_revalidate(struct dentry *dentry, unsigned int flags)
-{
-#endif /* HAVE_D_REVALIDATE_NAMEIDATA */
- /* CSTYLED */
- zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info;
- int error;
-
- if (flags & LOOKUP_RCU)
- return (-ECHILD);
-
- /*
- * Automounted snapshots rely on periodic dentry revalidation
- * to defer snapshots from being automatically unmounted.
- */
- if (zfsvfs->z_issnap) {
- if (time_after(jiffies, zfsvfs->z_snap_defer_time +
- MAX(zfs_expire_snapshot * HZ / 2, HZ))) {
- zfsvfs->z_snap_defer_time = jiffies;
- zfsctl_snapshot_unmount_delay(zfsvfs->z_os->os_spa,
- dmu_objset_id(zfsvfs->z_os), zfs_expire_snapshot);
- }
- }
-
- /*
- * After a rollback negative dentries created before the rollback
- * time must be invalidated. Otherwise they can obscure files which
- * are only present in the rolled back dataset.
- */
- if (dentry->d_inode == NULL) {
- spin_lock(&dentry->d_lock);
- error = time_before(dentry->d_time, zfsvfs->z_rollback_time);
- spin_unlock(&dentry->d_lock);
-
- if (error)
- return (0);
- }
-
- /*
- * The dentry may reference a stale inode if a mounted file system
- * was rolled back to a point in time where the object didn't exist.
- */
- if (dentry->d_inode && ITOZ(dentry->d_inode)->z_is_stale)
- return (0);
-
- return (1);
-}
-
-const struct inode_operations zpl_inode_operations = {
- .setattr = zpl_setattr,
- .getattr = zpl_getattr,
-#ifdef HAVE_GENERIC_SETXATTR
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
- .removexattr = generic_removexattr,
-#endif
- .listxattr = zpl_xattr_list,
-#ifdef HAVE_INODE_TRUNCATE_RANGE
- .truncate_range = zpl_truncate_range,
-#endif /* HAVE_INODE_TRUNCATE_RANGE */
-#ifdef HAVE_INODE_FALLOCATE
- .fallocate = zpl_fallocate,
-#endif /* HAVE_INODE_FALLOCATE */
-#if defined(CONFIG_FS_POSIX_ACL)
-#if defined(HAVE_SET_ACL)
- .set_acl = zpl_set_acl,
-#endif
-#if defined(HAVE_GET_ACL)
- .get_acl = zpl_get_acl,
-#elif defined(HAVE_CHECK_ACL)
- .check_acl = zpl_check_acl,
-#elif defined(HAVE_PERMISSION)
- .permission = zpl_permission,
-#endif /* HAVE_GET_ACL | HAVE_CHECK_ACL | HAVE_PERMISSION */
-#endif /* CONFIG_FS_POSIX_ACL */
-};
-
-const struct inode_operations zpl_dir_inode_operations = {
- .create = zpl_create,
- .lookup = zpl_lookup,
- .link = zpl_link,
- .unlink = zpl_unlink,
- .symlink = zpl_symlink,
- .mkdir = zpl_mkdir,
- .rmdir = zpl_rmdir,
- .mknod = zpl_mknod,
-#ifdef HAVE_RENAME_WANTS_FLAGS
- .rename = zpl_rename2,
-#else
- .rename = zpl_rename,
-#endif
-#ifdef HAVE_TMPFILE
- .tmpfile = zpl_tmpfile,
-#endif
- .setattr = zpl_setattr,
- .getattr = zpl_getattr,
-#ifdef HAVE_GENERIC_SETXATTR
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
- .removexattr = generic_removexattr,
-#endif
- .listxattr = zpl_xattr_list,
-#if defined(CONFIG_FS_POSIX_ACL)
-#if defined(HAVE_SET_ACL)
- .set_acl = zpl_set_acl,
-#endif
-#if defined(HAVE_GET_ACL)
- .get_acl = zpl_get_acl,
-#elif defined(HAVE_CHECK_ACL)
- .check_acl = zpl_check_acl,
-#elif defined(HAVE_PERMISSION)
- .permission = zpl_permission,
-#endif /* HAVE_GET_ACL | HAVE_CHECK_ACL | HAVE_PERMISSION */
-#endif /* CONFIG_FS_POSIX_ACL */
-};
-
-const struct inode_operations zpl_symlink_inode_operations = {
-#ifdef HAVE_GENERIC_READLINK
- .readlink = generic_readlink,
-#endif
-#if defined(HAVE_GET_LINK_DELAYED) || defined(HAVE_GET_LINK_COOKIE)
- .get_link = zpl_get_link,
-#elif defined(HAVE_FOLLOW_LINK_COOKIE) || defined(HAVE_FOLLOW_LINK_NAMEIDATA)
- .follow_link = zpl_follow_link,
-#endif
-#if defined(HAVE_PUT_LINK_COOKIE) || defined(HAVE_PUT_LINK_NAMEIDATA)
- .put_link = zpl_put_link,
-#endif
- .setattr = zpl_setattr,
- .getattr = zpl_getattr,
-#ifdef HAVE_GENERIC_SETXATTR
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
- .removexattr = generic_removexattr,
-#endif
- .listxattr = zpl_xattr_list,
-};
-
-const struct inode_operations zpl_special_inode_operations = {
- .setattr = zpl_setattr,
- .getattr = zpl_getattr,
-#ifdef HAVE_GENERIC_SETXATTR
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
- .removexattr = generic_removexattr,
-#endif
- .listxattr = zpl_xattr_list,
-#if defined(CONFIG_FS_POSIX_ACL)
-#if defined(HAVE_SET_ACL)
- .set_acl = zpl_set_acl,
-#endif
-#if defined(HAVE_GET_ACL)
- .get_acl = zpl_get_acl,
-#elif defined(HAVE_CHECK_ACL)
- .check_acl = zpl_check_acl,
-#elif defined(HAVE_PERMISSION)
- .permission = zpl_permission,
-#endif /* HAVE_GET_ACL | HAVE_CHECK_ACL | HAVE_PERMISSION */
-#endif /* CONFIG_FS_POSIX_ACL */
-};
-
-dentry_operations_t zpl_dentry_operations = {
- .d_revalidate = zpl_revalidate,
-};
diff --git a/module/zfs/zpl_super.c b/module/zfs/zpl_super.c
deleted file mode 100644
index 810ab2898..000000000
--- a/module/zfs/zpl_super.c
+++ /dev/null
@@ -1,426 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
- */
-
-
-#include <sys/zfs_vfsops.h>
-#include <sys/zfs_vnops.h>
-#include <sys/zfs_znode.h>
-#include <sys/zfs_ctldir.h>
-#include <sys/zpl.h>
-
-
-static struct inode *
-zpl_inode_alloc(struct super_block *sb)
-{
- struct inode *ip;
-
- VERIFY3S(zfs_inode_alloc(sb, &ip), ==, 0);
- inode_set_iversion(ip, 1);
-
- return (ip);
-}
-
-static void
-zpl_inode_destroy(struct inode *ip)
-{
- ASSERT(atomic_read(&ip->i_count) == 0);
- zfs_inode_destroy(ip);
-}
-
-/*
- * Called from __mark_inode_dirty() to reflect that something in the
- * inode has changed. We use it to ensure the znode system attributes
- * are always strictly update to date with respect to the inode.
- */
-#ifdef HAVE_DIRTY_INODE_WITH_FLAGS
-static void
-zpl_dirty_inode(struct inode *ip, int flags)
-{
- fstrans_cookie_t cookie;
-
- cookie = spl_fstrans_mark();
- zfs_dirty_inode(ip, flags);
- spl_fstrans_unmark(cookie);
-}
-#else
-static void
-zpl_dirty_inode(struct inode *ip)
-{
- fstrans_cookie_t cookie;
-
- cookie = spl_fstrans_mark();
- zfs_dirty_inode(ip, 0);
- spl_fstrans_unmark(cookie);
-}
-#endif /* HAVE_DIRTY_INODE_WITH_FLAGS */
-
-/*
- * When ->drop_inode() is called its return value indicates if the
- * inode should be evicted from the inode cache. If the inode is
- * unhashed and has no links the default policy is to evict it
- * immediately.
- *
- * Prior to 2.6.36 this eviction was accomplished by the vfs calling
- * ->delete_inode(). It was ->delete_inode()'s responsibility to
- * truncate the inode pages and call clear_inode(). The call to
- * clear_inode() synchronously invalidates all the buffers and
- * calls ->clear_inode(). It was ->clear_inode()'s responsibility
- * to cleanup and filesystem specific data before freeing the inode.
- *
- * This elaborate mechanism was replaced by ->evict_inode() which
- * does the job of both ->delete_inode() and ->clear_inode(). It
- * will be called exactly once, and when it returns the inode must
- * be in a state where it can simply be freed.i
- *
- * The ->evict_inode() callback must minimally truncate the inode pages,
- * and call clear_inode(). For 2.6.35 and later kernels this will
- * simply update the inode state, with the sync occurring before the
- * truncate in evict(). For earlier kernels clear_inode() maps to
- * end_writeback() which is responsible for completing all outstanding
- * write back. In either case, once this is done it is safe to cleanup
- * any remaining inode specific data via zfs_inactive().
- * remaining filesystem specific data.
- */
-#ifdef HAVE_EVICT_INODE
-static void
-zpl_evict_inode(struct inode *ip)
-{
- fstrans_cookie_t cookie;
-
- cookie = spl_fstrans_mark();
- truncate_setsize(ip, 0);
- clear_inode(ip);
- zfs_inactive(ip);
- spl_fstrans_unmark(cookie);
-}
-
-#else
-
-static void
-zpl_drop_inode(struct inode *ip)
-{
- generic_delete_inode(ip);
-}
-
-static void
-zpl_clear_inode(struct inode *ip)
-{
- fstrans_cookie_t cookie;
-
- cookie = spl_fstrans_mark();
- zfs_inactive(ip);
- spl_fstrans_unmark(cookie);
-}
-
-static void
-zpl_inode_delete(struct inode *ip)
-{
- truncate_setsize(ip, 0);
- clear_inode(ip);
-}
-#endif /* HAVE_EVICT_INODE */
-
-static void
-zpl_put_super(struct super_block *sb)
-{
- fstrans_cookie_t cookie;
- int error;
-
- cookie = spl_fstrans_mark();
- error = -zfs_umount(sb);
- spl_fstrans_unmark(cookie);
- ASSERT3S(error, <=, 0);
-}
-
-static int
-zpl_sync_fs(struct super_block *sb, int wait)
-{
- fstrans_cookie_t cookie;
- cred_t *cr = CRED();
- int error;
-
- crhold(cr);
- cookie = spl_fstrans_mark();
- error = -zfs_sync(sb, wait, cr);
- spl_fstrans_unmark(cookie);
- crfree(cr);
- ASSERT3S(error, <=, 0);
-
- return (error);
-}
-
-static int
-zpl_statfs(struct dentry *dentry, struct kstatfs *statp)
-{
- fstrans_cookie_t cookie;
- int error;
-
- cookie = spl_fstrans_mark();
- error = -zfs_statvfs(dentry, statp);
- spl_fstrans_unmark(cookie);
- ASSERT3S(error, <=, 0);
-
- /*
- * If required by a 32-bit system call, dynamically scale the
- * block size up to 16MiB and decrease the block counts. This
- * allows for a maximum size of 64EiB to be reported. The file
- * counts must be artificially capped at 2^32-1.
- */
- if (unlikely(zpl_is_32bit_api())) {
- while (statp->f_blocks > UINT32_MAX &&
- statp->f_bsize < SPA_MAXBLOCKSIZE) {
- statp->f_frsize <<= 1;
- statp->f_bsize <<= 1;
-
- statp->f_blocks >>= 1;
- statp->f_bfree >>= 1;
- statp->f_bavail >>= 1;
- }
-
- uint64_t usedobjs = statp->f_files - statp->f_ffree;
- statp->f_ffree = MIN(statp->f_ffree, UINT32_MAX - usedobjs);
- statp->f_files = statp->f_ffree + usedobjs;
- }
-
- return (error);
-}
-
-static int
-zpl_remount_fs(struct super_block *sb, int *flags, char *data)
-{
- zfs_mnt_t zm = { .mnt_osname = NULL, .mnt_data = data };
- fstrans_cookie_t cookie;
- int error;
-
- cookie = spl_fstrans_mark();
- error = -zfs_remount(sb, flags, &zm);
- spl_fstrans_unmark(cookie);
- ASSERT3S(error, <=, 0);
-
- return (error);
-}
-
-static int
-__zpl_show_options(struct seq_file *seq, zfsvfs_t *zfsvfs)
-{
- seq_printf(seq, ",%s",
- zfsvfs->z_flags & ZSB_XATTR ? "xattr" : "noxattr");
-
-#ifdef CONFIG_FS_POSIX_ACL
- switch (zfsvfs->z_acl_type) {
- case ZFS_ACLTYPE_POSIXACL:
- seq_puts(seq, ",posixacl");
- break;
- default:
- seq_puts(seq, ",noacl");
- break;
- }
-#endif /* CONFIG_FS_POSIX_ACL */
-
- return (0);
-}
-
-#ifdef HAVE_SHOW_OPTIONS_WITH_DENTRY
-static int
-zpl_show_options(struct seq_file *seq, struct dentry *root)
-{
- return (__zpl_show_options(seq, root->d_sb->s_fs_info));
-}
-#else
-static int
-zpl_show_options(struct seq_file *seq, struct vfsmount *vfsp)
-{
- return (__zpl_show_options(seq, vfsp->mnt_sb->s_fs_info));
-}
-#endif /* HAVE_SHOW_OPTIONS_WITH_DENTRY */
-
-static int
-zpl_fill_super(struct super_block *sb, void *data, int silent)
-{
- zfs_mnt_t *zm = (zfs_mnt_t *)data;
- fstrans_cookie_t cookie;
- int error;
-
- cookie = spl_fstrans_mark();
- error = -zfs_domount(sb, zm, silent);
- spl_fstrans_unmark(cookie);
- ASSERT3S(error, <=, 0);
-
- return (error);
-}
-
-static int
-zpl_test_super(struct super_block *s, void *data)
-{
- zfsvfs_t *zfsvfs = s->s_fs_info;
- objset_t *os = data;
-
- if (zfsvfs == NULL)
- return (0);
-
- return (os == zfsvfs->z_os);
-}
-
-static struct super_block *
-zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
-{
- struct super_block *s;
- objset_t *os;
- int err;
-
- err = dmu_objset_hold(zm->mnt_osname, FTAG, &os);
- if (err)
- return (ERR_PTR(-err));
-
- /*
- * The dsl pool lock must be released prior to calling sget().
- * It is possible sget() may block on the lock in grab_super()
- * while deactivate_super() holds that same lock and waits for
- * a txg sync. If the dsl_pool lock is held over sget()
- * this can prevent the pool sync and cause a deadlock.
- */
- dsl_pool_rele(dmu_objset_pool(os), FTAG);
- s = zpl_sget(fs_type, zpl_test_super, set_anon_super, flags, os);
- dsl_dataset_rele(dmu_objset_ds(os), FTAG);
-
- if (IS_ERR(s))
- return (ERR_CAST(s));
-
- if (s->s_root == NULL) {
- err = zpl_fill_super(s, zm, flags & SB_SILENT ? 1 : 0);
- if (err) {
- deactivate_locked_super(s);
- return (ERR_PTR(err));
- }
- s->s_flags |= SB_ACTIVE;
- } else if ((flags ^ s->s_flags) & SB_RDONLY) {
- deactivate_locked_super(s);
- return (ERR_PTR(-EBUSY));
- }
-
- return (s);
-}
-
-#ifdef HAVE_FST_MOUNT
-static struct dentry *
-zpl_mount(struct file_system_type *fs_type, int flags,
- const char *osname, void *data)
-{
- zfs_mnt_t zm = { .mnt_osname = osname, .mnt_data = data };
-
- struct super_block *sb = zpl_mount_impl(fs_type, flags, &zm);
- if (IS_ERR(sb))
- return (ERR_CAST(sb));
-
- return (dget(sb->s_root));
-}
-#else
-static int
-zpl_get_sb(struct file_system_type *fs_type, int flags,
- const char *osname, void *data, struct vfsmount *mnt)
-{
- zfs_mnt_t zm = { .mnt_osname = osname, .mnt_data = data };
-
- struct super_block *sb = zpl_mount_impl(fs_type, flags, &zm);
- if (IS_ERR(sb))
- return (PTR_ERR(sb));
-
- (void) simple_set_mnt(mnt, sb);
-
- return (0);
-}
-#endif /* HAVE_FST_MOUNT */
-
-static void
-zpl_kill_sb(struct super_block *sb)
-{
- zfs_preumount(sb);
- kill_anon_super(sb);
-
-#ifdef HAVE_S_INSTANCES_LIST_HEAD
- sb->s_instances.next = &(zpl_fs_type.fs_supers);
-#endif /* HAVE_S_INSTANCES_LIST_HEAD */
-}
-
-void
-zpl_prune_sb(int64_t nr_to_scan, void *arg)
-{
- struct super_block *sb = (struct super_block *)arg;
- int objects = 0;
-
- (void) -zfs_prune(sb, nr_to_scan, &objects);
-}
-
-#ifdef HAVE_NR_CACHED_OBJECTS
-static int
-zpl_nr_cached_objects(struct super_block *sb)
-{
- return (0);
-}
-#endif /* HAVE_NR_CACHED_OBJECTS */
-
-#ifdef HAVE_FREE_CACHED_OBJECTS
-static void
-zpl_free_cached_objects(struct super_block *sb, int nr_to_scan)
-{
- /* noop */
-}
-#endif /* HAVE_FREE_CACHED_OBJECTS */
-
-const struct super_operations zpl_super_operations = {
- .alloc_inode = zpl_inode_alloc,
- .destroy_inode = zpl_inode_destroy,
- .dirty_inode = zpl_dirty_inode,
- .write_inode = NULL,
-#ifdef HAVE_EVICT_INODE
- .evict_inode = zpl_evict_inode,
-#else
- .drop_inode = zpl_drop_inode,
- .clear_inode = zpl_clear_inode,
- .delete_inode = zpl_inode_delete,
-#endif /* HAVE_EVICT_INODE */
- .put_super = zpl_put_super,
- .sync_fs = zpl_sync_fs,
- .statfs = zpl_statfs,
- .remount_fs = zpl_remount_fs,
- .show_options = zpl_show_options,
- .show_stats = NULL,
-#ifdef HAVE_NR_CACHED_OBJECTS
- .nr_cached_objects = zpl_nr_cached_objects,
-#endif /* HAVE_NR_CACHED_OBJECTS */
-#ifdef HAVE_FREE_CACHED_OBJECTS
- .free_cached_objects = zpl_free_cached_objects,
-#endif /* HAVE_FREE_CACHED_OBJECTS */
-};
-
-struct file_system_type zpl_fs_type = {
- .owner = THIS_MODULE,
- .name = ZFS_DRIVER,
-#ifdef HAVE_FST_MOUNT
- .mount = zpl_mount,
-#else
- .get_sb = zpl_get_sb,
-#endif /* HAVE_FST_MOUNT */
- .kill_sb = zpl_kill_sb,
-};
diff --git a/module/zfs/zpl_xattr.c b/module/zfs/zpl_xattr.c
deleted file mode 100644
index 95523f28e..000000000
--- a/module/zfs/zpl_xattr.c
+++ /dev/null
@@ -1,1548 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
- *
- * Extended attributes (xattr) on Solaris are implemented as files
- * which exist in a hidden xattr directory. These extended attributes
- * can be accessed using the attropen() system call which opens
- * the extended attribute. It can then be manipulated just like
- * a standard file descriptor. This has a couple advantages such
- * as practically no size limit on the file, and the extended
- * attributes permissions may differ from those of the parent file.
- * This interface is really quite clever, but it's also completely
- * different than what is supported on Linux. It also comes with a
- * steep performance penalty when accessing small xattrs because they
- * are not stored with the parent file.
- *
- * Under Linux extended attributes are manipulated by the system
- * calls getxattr(2), setxattr(2), and listxattr(2). They consider
- * extended attributes to be name/value pairs where the name is a
- * NULL terminated string. The name must also include one of the
- * following namespace prefixes:
- *
- * user - No restrictions and is available to user applications.
- * trusted - Restricted to kernel and root (CAP_SYS_ADMIN) use.
- * system - Used for access control lists (system.nfs4_acl, etc).
- * security - Used by SELinux to store a files security context.
- *
- * The value under Linux to limited to 65536 bytes of binary data.
- * In practice, individual xattrs tend to be much smaller than this
- * and are typically less than 100 bytes. A good example of this
- * are the security.selinux xattrs which are less than 100 bytes and
- * exist for every file when xattr labeling is enabled.
- *
- * The Linux xattr implementation has been written to take advantage of
- * this typical usage. When the dataset property 'xattr=sa' is set,
- * then xattrs will be preferentially stored as System Attributes (SA).
- * This allows tiny xattrs (~100 bytes) to be stored with the dnode and
- * up to 64k of xattrs to be stored in the spill block. If additional
- * xattr space is required, which is unlikely under Linux, they will
- * be stored using the traditional directory approach.
- *
- * This optimization results in roughly a 3x performance improvement
- * when accessing xattrs because it avoids the need to perform a seek
- * for every xattr value. When multiple xattrs are stored per-file
- * the performance improvements are even greater because all of the
- * xattrs stored in the spill block will be cached.
- *
- * However, by default SA based xattrs are disabled in the Linux port
- * to maximize compatibility with other implementations. If you do
- * enable SA based xattrs then they will not be visible on platforms
- * which do not support this feature.
- *
- * NOTE: One additional consequence of the xattr directory implementation
- * is that when an extended attribute is manipulated an inode is created.
- * This inode will exist in the Linux inode cache but there will be no
- * associated entry in the dentry cache which references it. This is
- * safe but it may result in some confusion. Enabling SA based xattrs
- * largely avoids the issue except in the overflow case.
- */
-
-#include <sys/zfs_vfsops.h>
-#include <sys/zfs_vnops.h>
-#include <sys/zfs_znode.h>
-#include <sys/zap.h>
-#include <sys/vfs.h>
-#include <sys/zpl.h>
-
-typedef struct xattr_filldir {
- size_t size;
- size_t offset;
- char *buf;
- struct dentry *dentry;
-} xattr_filldir_t;
-
-static const struct xattr_handler *zpl_xattr_handler(const char *);
-
-static int
-zpl_xattr_permission(xattr_filldir_t *xf, const char *name, int name_len)
-{
- static const struct xattr_handler *handler;
- struct dentry *d = xf->dentry;
-
- handler = zpl_xattr_handler(name);
- if (!handler)
- return (0);
-
- if (handler->list) {
-#if defined(HAVE_XATTR_LIST_SIMPLE)
- if (!handler->list(d))
- return (0);
-#elif defined(HAVE_XATTR_LIST_DENTRY)
- if (!handler->list(d, NULL, 0, name, name_len, 0))
- return (0);
-#elif defined(HAVE_XATTR_LIST_HANDLER)
- if (!handler->list(handler, d, NULL, 0, name, name_len))
- return (0);
-#elif defined(HAVE_XATTR_LIST_INODE)
- if (!handler->list(d->d_inode, NULL, 0, name, name_len))
- return (0);
-#endif
- }
-
- return (1);
-}
-
-/*
- * Determine is a given xattr name should be visible and if so copy it
- * in to the provided buffer (xf->buf).
- */
-static int
-zpl_xattr_filldir(xattr_filldir_t *xf, const char *name, int name_len)
-{
- /* Check permissions using the per-namespace list xattr handler. */
- if (!zpl_xattr_permission(xf, name, name_len))
- return (0);
-
- /* When xf->buf is NULL only calculate the required size. */
- if (xf->buf) {
- if (xf->offset + name_len + 1 > xf->size)
- return (-ERANGE);
-
- memcpy(xf->buf + xf->offset, name, name_len);
- xf->buf[xf->offset + name_len] = '\0';
- }
-
- xf->offset += (name_len + 1);
-
- return (0);
-}
-
-/*
- * Read as many directory entry names as will fit in to the provided buffer,
- * or when no buffer is provided calculate the required buffer size.
- */
-int
-zpl_xattr_readdir(struct inode *dxip, xattr_filldir_t *xf)
-{
- zap_cursor_t zc;
- zap_attribute_t zap;
- int error;
-
- zap_cursor_init(&zc, ITOZSB(dxip)->z_os, ITOZ(dxip)->z_id);
-
- while ((error = -zap_cursor_retrieve(&zc, &zap)) == 0) {
-
- if (zap.za_integer_length != 8 || zap.za_num_integers != 1) {
- error = -ENXIO;
- break;
- }
-
- error = zpl_xattr_filldir(xf, zap.za_name, strlen(zap.za_name));
- if (error)
- break;
-
- zap_cursor_advance(&zc);
- }
-
- zap_cursor_fini(&zc);
-
- if (error == -ENOENT)
- error = 0;
-
- return (error);
-}
-
-static ssize_t
-zpl_xattr_list_dir(xattr_filldir_t *xf, cred_t *cr)
-{
- struct inode *ip = xf->dentry->d_inode;
- struct inode *dxip = NULL;
- int error;
-
- /* Lookup the xattr directory */
- error = -zfs_lookup(ip, NULL, &dxip, LOOKUP_XATTR, cr, NULL, NULL);
- if (error) {
- if (error == -ENOENT)
- error = 0;
-
- return (error);
- }
-
- error = zpl_xattr_readdir(dxip, xf);
- iput(dxip);
-
- return (error);
-}
-
-static ssize_t
-zpl_xattr_list_sa(xattr_filldir_t *xf)
-{
- znode_t *zp = ITOZ(xf->dentry->d_inode);
- nvpair_t *nvp = NULL;
- int error = 0;
-
- mutex_enter(&zp->z_lock);
- if (zp->z_xattr_cached == NULL)
- error = -zfs_sa_get_xattr(zp);
- mutex_exit(&zp->z_lock);
-
- if (error)
- return (error);
-
- ASSERT(zp->z_xattr_cached);
-
- while ((nvp = nvlist_next_nvpair(zp->z_xattr_cached, nvp)) != NULL) {
- ASSERT3U(nvpair_type(nvp), ==, DATA_TYPE_BYTE_ARRAY);
-
- error = zpl_xattr_filldir(xf, nvpair_name(nvp),
- strlen(nvpair_name(nvp)));
- if (error)
- return (error);
- }
-
- return (0);
-}
-
-ssize_t
-zpl_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
-{
- znode_t *zp = ITOZ(dentry->d_inode);
- zfsvfs_t *zfsvfs = ZTOZSB(zp);
- xattr_filldir_t xf = { buffer_size, 0, buffer, dentry };
- cred_t *cr = CRED();
- fstrans_cookie_t cookie;
- int error = 0;
-
- crhold(cr);
- cookie = spl_fstrans_mark();
- ZPL_ENTER(zfsvfs);
- ZPL_VERIFY_ZP(zp);
- rw_enter(&zp->z_xattr_lock, RW_READER);
-
- if (zfsvfs->z_use_sa && zp->z_is_sa) {
- error = zpl_xattr_list_sa(&xf);
- if (error)
- goto out;
- }
-
- error = zpl_xattr_list_dir(&xf, cr);
- if (error)
- goto out;
-
- error = xf.offset;
-out:
-
- rw_exit(&zp->z_xattr_lock);
- ZPL_EXIT(zfsvfs);
- spl_fstrans_unmark(cookie);
- crfree(cr);
-
- return (error);
-}
-
-static int
-zpl_xattr_get_dir(struct inode *ip, const char *name, void *value,
- size_t size, cred_t *cr)
-{
- struct inode *dxip = NULL;
- struct inode *xip = NULL;
- loff_t pos = 0;
- int error;
-
- /* Lookup the xattr directory */
- error = -zfs_lookup(ip, NULL, &dxip, LOOKUP_XATTR, cr, NULL, NULL);
- if (error)
- goto out;
-
- /* Lookup a specific xattr name in the directory */
- error = -zfs_lookup(dxip, (char *)name, &xip, 0, cr, NULL, NULL);
- if (error)
- goto out;
-
- if (!size) {
- error = i_size_read(xip);
- goto out;
- }
-
- if (size < i_size_read(xip)) {
- error = -ERANGE;
- goto out;
- }
-
- error = zpl_read_common(xip, value, size, &pos, UIO_SYSSPACE, 0, cr);
-out:
- if (xip)
- iput(xip);
-
- if (dxip)
- iput(dxip);
-
- return (error);
-}
-
-static int
-zpl_xattr_get_sa(struct inode *ip, const char *name, void *value, size_t size)
-{
- znode_t *zp = ITOZ(ip);
- uchar_t *nv_value;
- uint_t nv_size;
- int error = 0;
-
- ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));
-
- mutex_enter(&zp->z_lock);
- if (zp->z_xattr_cached == NULL)
- error = -zfs_sa_get_xattr(zp);
- mutex_exit(&zp->z_lock);
-
- if (error)
- return (error);
-
- ASSERT(zp->z_xattr_cached);
- error = -nvlist_lookup_byte_array(zp->z_xattr_cached, name,
- &nv_value, &nv_size);
- if (error)
- return (error);
-
- if (size == 0 || value == NULL)
- return (nv_size);
-
- if (size < nv_size)
- return (-ERANGE);
-
- memcpy(value, nv_value, nv_size);
-
- return (nv_size);
-}
-
-static int
-__zpl_xattr_get(struct inode *ip, const char *name, void *value, size_t size,
- cred_t *cr)
-{
- znode_t *zp = ITOZ(ip);
- zfsvfs_t *zfsvfs = ZTOZSB(zp);
- int error;
-
- ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));
-
- if (zfsvfs->z_use_sa && zp->z_is_sa) {
- error = zpl_xattr_get_sa(ip, name, value, size);
- if (error != -ENOENT)
- goto out;
- }
-
- error = zpl_xattr_get_dir(ip, name, value, size, cr);
-out:
- if (error == -ENOENT)
- error = -ENODATA;
-
- return (error);
-}
-
-#define XATTR_NOENT 0x0
-#define XATTR_IN_SA 0x1
-#define XATTR_IN_DIR 0x2
-/* check where the xattr resides */
-static int
-__zpl_xattr_where(struct inode *ip, const char *name, int *where, cred_t *cr)
-{
- znode_t *zp = ITOZ(ip);
- zfsvfs_t *zfsvfs = ZTOZSB(zp);
- int error;
-
- ASSERT(where);
- ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));
-
- *where = XATTR_NOENT;
- if (zfsvfs->z_use_sa && zp->z_is_sa) {
- error = zpl_xattr_get_sa(ip, name, NULL, 0);
- if (error >= 0)
- *where |= XATTR_IN_SA;
- else if (error != -ENOENT)
- return (error);
- }
-
- error = zpl_xattr_get_dir(ip, name, NULL, 0, cr);
- if (error >= 0)
- *where |= XATTR_IN_DIR;
- else if (error != -ENOENT)
- return (error);
-
- if (*where == (XATTR_IN_SA|XATTR_IN_DIR))
- cmn_err(CE_WARN, "ZFS: inode %p has xattr \"%s\""
- " in both SA and dir", ip, name);
- if (*where == XATTR_NOENT)
- error = -ENODATA;
- else
- error = 0;
- return (error);
-}
-
-static int
-zpl_xattr_get(struct inode *ip, const char *name, void *value, size_t size)
-{
- znode_t *zp = ITOZ(ip);
- zfsvfs_t *zfsvfs = ZTOZSB(zp);
- cred_t *cr = CRED();
- fstrans_cookie_t cookie;
- int error;
-
- crhold(cr);
- cookie = spl_fstrans_mark();
- ZPL_ENTER(zfsvfs);
- ZPL_VERIFY_ZP(zp);
- rw_enter(&zp->z_xattr_lock, RW_READER);
- error = __zpl_xattr_get(ip, name, value, size, cr);
- rw_exit(&zp->z_xattr_lock);
- ZPL_EXIT(zfsvfs);
- spl_fstrans_unmark(cookie);
- crfree(cr);
-
- return (error);
-}
-
-static int
-zpl_xattr_set_dir(struct inode *ip, const char *name, const void *value,
- size_t size, int flags, cred_t *cr)
-{
- struct inode *dxip = NULL;
- struct inode *xip = NULL;
- vattr_t *vap = NULL;
- ssize_t wrote;
- int lookup_flags, error;
- const int xattr_mode = S_IFREG | 0644;
- loff_t pos = 0;
-
- /*
- * Lookup the xattr directory. When we're adding an entry pass
- * CREATE_XATTR_DIR to ensure the xattr directory is created.
- * When removing an entry this flag is not passed to avoid
- * unnecessarily creating a new xattr directory.
- */
- lookup_flags = LOOKUP_XATTR;
- if (value != NULL)
- lookup_flags |= CREATE_XATTR_DIR;
-
- error = -zfs_lookup(ip, NULL, &dxip, lookup_flags, cr, NULL, NULL);
- if (error)
- goto out;
-
- /* Lookup a specific xattr name in the directory */
- error = -zfs_lookup(dxip, (char *)name, &xip, 0, cr, NULL, NULL);
- if (error && (error != -ENOENT))
- goto out;
-
- error = 0;
-
- /* Remove a specific name xattr when value is set to NULL. */
- if (value == NULL) {
- if (xip)
- error = -zfs_remove(dxip, (char *)name, cr, 0);
-
- goto out;
- }
-
- /* Lookup failed create a new xattr. */
- if (xip == NULL) {
- vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
- vap->va_mode = xattr_mode;
- vap->va_mask = ATTR_MODE;
- vap->va_uid = crgetfsuid(cr);
- vap->va_gid = crgetfsgid(cr);
-
- error = -zfs_create(dxip, (char *)name, vap, 0, 0644, &xip,
- cr, 0, NULL);
- if (error)
- goto out;
- }
-
- ASSERT(xip != NULL);
-
- error = -zfs_freesp(ITOZ(xip), 0, 0, xattr_mode, TRUE);
- if (error)
- goto out;
-
- wrote = zpl_write_common(xip, value, size, &pos, UIO_SYSSPACE, 0, cr);
- if (wrote < 0)
- error = wrote;
-
-out:
-
- if (error == 0) {
- ip->i_ctime = current_time(ip);
- zfs_mark_inode_dirty(ip);
- }
-
- if (vap)
- kmem_free(vap, sizeof (vattr_t));
-
- if (xip)
- iput(xip);
-
- if (dxip)
- iput(dxip);
-
- if (error == -ENOENT)
- error = -ENODATA;
-
- ASSERT3S(error, <=, 0);
-
- return (error);
-}
-
-static int
-zpl_xattr_set_sa(struct inode *ip, const char *name, const void *value,
- size_t size, int flags, cred_t *cr)
-{
- znode_t *zp = ITOZ(ip);
- nvlist_t *nvl;
- size_t sa_size;
- int error = 0;
-
- mutex_enter(&zp->z_lock);
- if (zp->z_xattr_cached == NULL)
- error = -zfs_sa_get_xattr(zp);
- mutex_exit(&zp->z_lock);
-
- if (error)
- return (error);
-
- ASSERT(zp->z_xattr_cached);
- nvl = zp->z_xattr_cached;
-
- if (value == NULL) {
- error = -nvlist_remove(nvl, name, DATA_TYPE_BYTE_ARRAY);
- if (error == -ENOENT)
- error = zpl_xattr_set_dir(ip, name, NULL, 0, flags, cr);
- } else {
- /* Limited to 32k to keep nvpair memory allocations small */
- if (size > DXATTR_MAX_ENTRY_SIZE)
- return (-EFBIG);
-
- /* Prevent the DXATTR SA from consuming the entire SA region */
- error = -nvlist_size(nvl, &sa_size, NV_ENCODE_XDR);
- if (error)
- return (error);
-
- if (sa_size > DXATTR_MAX_SA_SIZE)
- return (-EFBIG);
-
- error = -nvlist_add_byte_array(nvl, name,
- (uchar_t *)value, size);
- }
-
- /*
- * Update the SA for additions, modifications, and removals. On
- * error drop the inconsistent cached version of the nvlist, it
- * will be reconstructed from the ARC when next accessed.
- */
- if (error == 0)
- error = -zfs_sa_set_xattr(zp);
-
- if (error) {
- nvlist_free(nvl);
- zp->z_xattr_cached = NULL;
- }
-
- ASSERT3S(error, <=, 0);
-
- return (error);
-}
-
-static int
-zpl_xattr_set(struct inode *ip, const char *name, const void *value,
- size_t size, int flags)
-{
- znode_t *zp = ITOZ(ip);
- zfsvfs_t *zfsvfs = ZTOZSB(zp);
- cred_t *cr = CRED();
- fstrans_cookie_t cookie;
- int where;
- int error;
-
- crhold(cr);
- cookie = spl_fstrans_mark();
- ZPL_ENTER(zfsvfs);
- ZPL_VERIFY_ZP(zp);
- rw_enter(&ITOZ(ip)->z_xattr_lock, RW_WRITER);
-
- /*
- * Before setting the xattr check to see if it already exists.
- * This is done to ensure the following optional flags are honored.
- *
- * XATTR_CREATE: fail if xattr already exists
- * XATTR_REPLACE: fail if xattr does not exist
- *
- * We also want to know if it resides in sa or dir, so we can make
- * sure we don't end up with duplicate in both places.
- */
- error = __zpl_xattr_where(ip, name, &where, cr);
- if (error < 0) {
- if (error != -ENODATA)
- goto out;
- if (flags & XATTR_REPLACE)
- goto out;
-
- /* The xattr to be removed already doesn't exist */
- error = 0;
- if (value == NULL)
- goto out;
- } else {
- error = -EEXIST;
- if (flags & XATTR_CREATE)
- goto out;
- }
-
- /* Preferentially store the xattr as a SA for better performance */
- if (zfsvfs->z_use_sa && zp->z_is_sa &&
- (zfsvfs->z_xattr_sa || (value == NULL && where & XATTR_IN_SA))) {
- error = zpl_xattr_set_sa(ip, name, value, size, flags, cr);
- if (error == 0) {
- /*
- * Successfully put into SA, we need to clear the one
- * in dir.
- */
- if (where & XATTR_IN_DIR)
- zpl_xattr_set_dir(ip, name, NULL, 0, 0, cr);
- goto out;
- }
- }
-
- error = zpl_xattr_set_dir(ip, name, value, size, flags, cr);
- /*
- * Successfully put into dir, we need to clear the one in SA.
- */
- if (error == 0 && (where & XATTR_IN_SA))
- zpl_xattr_set_sa(ip, name, NULL, 0, 0, cr);
-out:
- rw_exit(&ITOZ(ip)->z_xattr_lock);
- ZPL_EXIT(zfsvfs);
- spl_fstrans_unmark(cookie);
- crfree(cr);
- ASSERT3S(error, <=, 0);
-
- return (error);
-}
-
-/*
- * Extended user attributes
- *
- * "Extended user attributes may be assigned to files and directories for
- * storing arbitrary additional information such as the mime type,
- * character set or encoding of a file. The access permissions for user
- * attributes are defined by the file permission bits: read permission
- * is required to retrieve the attribute value, and writer permission is
- * required to change it.
- *
- * The file permission bits of regular files and directories are
- * interpreted differently from the file permission bits of special
- * files and symbolic links. For regular files and directories the file
- * permission bits define access to the file's contents, while for
- * device special files they define access to the device described by
- * the special file. The file permissions of symbolic links are not
- * used in access checks. These differences would allow users to
- * consume filesystem resources in a way not controllable by disk quotas
- * for group or world writable special files and directories.
- *
- * For this reason, extended user attributes are allowed only for
- * regular files and directories, and access to extended user attributes
- * is restricted to the owner and to users with appropriate capabilities
- * for directories with the sticky bit set (see the chmod(1) manual page
- * for an explanation of the sticky bit)." - xattr(7)
- *
- * ZFS allows extended user attributes to be disabled administratively
- * by setting the 'xattr=off' property on the dataset.
- */
-static int
-__zpl_xattr_user_list(struct inode *ip, char *list, size_t list_size,
- const char *name, size_t name_len)
-{
- return (ITOZSB(ip)->z_flags & ZSB_XATTR);
-}
-ZPL_XATTR_LIST_WRAPPER(zpl_xattr_user_list);
-
-static int
-__zpl_xattr_user_get(struct inode *ip, const char *name,
- void *value, size_t size)
-{
- char *xattr_name;
- int error;
- /* xattr_resolve_name will do this for us if this is defined */
-#ifndef HAVE_XATTR_HANDLER_NAME
- if (strcmp(name, "") == 0)
- return (-EINVAL);
-#endif
- if (!(ITOZSB(ip)->z_flags & ZSB_XATTR))
- return (-EOPNOTSUPP);
-
- xattr_name = kmem_asprintf("%s%s", XATTR_USER_PREFIX, name);
- error = zpl_xattr_get(ip, xattr_name, value, size);
- strfree(xattr_name);
-
- return (error);
-}
-ZPL_XATTR_GET_WRAPPER(zpl_xattr_user_get);
-
-static int
-__zpl_xattr_user_set(struct inode *ip, const char *name,
- const void *value, size_t size, int flags)
-{
- char *xattr_name;
- int error;
- /* xattr_resolve_name will do this for us if this is defined */
-#ifndef HAVE_XATTR_HANDLER_NAME
- if (strcmp(name, "") == 0)
- return (-EINVAL);
-#endif
- if (!(ITOZSB(ip)->z_flags & ZSB_XATTR))
- return (-EOPNOTSUPP);
-
- xattr_name = kmem_asprintf("%s%s", XATTR_USER_PREFIX, name);
- error = zpl_xattr_set(ip, xattr_name, value, size, flags);
- strfree(xattr_name);
-
- return (error);
-}
-ZPL_XATTR_SET_WRAPPER(zpl_xattr_user_set);
-
-xattr_handler_t zpl_xattr_user_handler =
-{
- .prefix = XATTR_USER_PREFIX,
- .list = zpl_xattr_user_list,
- .get = zpl_xattr_user_get,
- .set = zpl_xattr_user_set,
-};
-
-/*
- * Trusted extended attributes
- *
- * "Trusted extended attributes are visible and accessible only to
- * processes that have the CAP_SYS_ADMIN capability. Attributes in this
- * class are used to implement mechanisms in user space (i.e., outside
- * the kernel) which keep information in extended attributes to which
- * ordinary processes should not have access." - xattr(7)
- */
-static int
-__zpl_xattr_trusted_list(struct inode *ip, char *list, size_t list_size,
- const char *name, size_t name_len)
-{
- return (capable(CAP_SYS_ADMIN));
-}
-ZPL_XATTR_LIST_WRAPPER(zpl_xattr_trusted_list);
-
-static int
-__zpl_xattr_trusted_get(struct inode *ip, const char *name,
- void *value, size_t size)
-{
- char *xattr_name;
- int error;
-
- if (!capable(CAP_SYS_ADMIN))
- return (-EACCES);
- /* xattr_resolve_name will do this for us if this is defined */
-#ifndef HAVE_XATTR_HANDLER_NAME
- if (strcmp(name, "") == 0)
- return (-EINVAL);
-#endif
- xattr_name = kmem_asprintf("%s%s", XATTR_TRUSTED_PREFIX, name);
- error = zpl_xattr_get(ip, xattr_name, value, size);
- strfree(xattr_name);
-
- return (error);
-}
-ZPL_XATTR_GET_WRAPPER(zpl_xattr_trusted_get);
-
-static int
-__zpl_xattr_trusted_set(struct inode *ip, const char *name,
- const void *value, size_t size, int flags)
-{
- char *xattr_name;
- int error;
-
- if (!capable(CAP_SYS_ADMIN))
- return (-EACCES);
- /* xattr_resolve_name will do this for us if this is defined */
-#ifndef HAVE_XATTR_HANDLER_NAME
- if (strcmp(name, "") == 0)
- return (-EINVAL);
-#endif
- xattr_name = kmem_asprintf("%s%s", XATTR_TRUSTED_PREFIX, name);
- error = zpl_xattr_set(ip, xattr_name, value, size, flags);
- strfree(xattr_name);
-
- return (error);
-}
-ZPL_XATTR_SET_WRAPPER(zpl_xattr_trusted_set);
-
-xattr_handler_t zpl_xattr_trusted_handler =
-{
- .prefix = XATTR_TRUSTED_PREFIX,
- .list = zpl_xattr_trusted_list,
- .get = zpl_xattr_trusted_get,
- .set = zpl_xattr_trusted_set,
-};
-
-/*
- * Extended security attributes
- *
- * "The security attribute namespace is used by kernel security modules,
- * such as Security Enhanced Linux, and also to implement file
- * capabilities (see capabilities(7)). Read and write access
- * permissions to security attributes depend on the policy implemented
- * for each security attribute by the security module. When no security
- * module is loaded, all processes have read access to extended security
- * attributes, and write access is limited to processes that have the
- * CAP_SYS_ADMIN capability." - xattr(7)
- */
-static int
-__zpl_xattr_security_list(struct inode *ip, char *list, size_t list_size,
- const char *name, size_t name_len)
-{
- return (1);
-}
-ZPL_XATTR_LIST_WRAPPER(zpl_xattr_security_list);
-
-static int
-__zpl_xattr_security_get(struct inode *ip, const char *name,
- void *value, size_t size)
-{
- char *xattr_name;
- int error;
- /* xattr_resolve_name will do this for us if this is defined */
-#ifndef HAVE_XATTR_HANDLER_NAME
- if (strcmp(name, "") == 0)
- return (-EINVAL);
-#endif
- xattr_name = kmem_asprintf("%s%s", XATTR_SECURITY_PREFIX, name);
- error = zpl_xattr_get(ip, xattr_name, value, size);
- strfree(xattr_name);
-
- return (error);
-}
-ZPL_XATTR_GET_WRAPPER(zpl_xattr_security_get);
-
-static int
-__zpl_xattr_security_set(struct inode *ip, const char *name,
- const void *value, size_t size, int flags)
-{
- char *xattr_name;
- int error;
- /* xattr_resolve_name will do this for us if this is defined */
-#ifndef HAVE_XATTR_HANDLER_NAME
- if (strcmp(name, "") == 0)
- return (-EINVAL);
-#endif
- xattr_name = kmem_asprintf("%s%s", XATTR_SECURITY_PREFIX, name);
- error = zpl_xattr_set(ip, xattr_name, value, size, flags);
- strfree(xattr_name);
-
- return (error);
-}
-ZPL_XATTR_SET_WRAPPER(zpl_xattr_security_set);
-
-#ifdef HAVE_CALLBACK_SECURITY_INODE_INIT_SECURITY
-static int
-__zpl_xattr_security_init(struct inode *ip, const struct xattr *xattrs,
- void *fs_info)
-{
- const struct xattr *xattr;
- int error = 0;
-
- for (xattr = xattrs; xattr->name != NULL; xattr++) {
- error = __zpl_xattr_security_set(ip,
- xattr->name, xattr->value, xattr->value_len, 0);
-
- if (error < 0)
- break;
- }
-
- return (error);
-}
-
-int
-zpl_xattr_security_init(struct inode *ip, struct inode *dip,
- const struct qstr *qstr)
-{
- return security_inode_init_security(ip, dip, qstr,
- &__zpl_xattr_security_init, NULL);
-}
-
-#else
-int
-zpl_xattr_security_init(struct inode *ip, struct inode *dip,
- const struct qstr *qstr)
-{
- int error;
- size_t len;
- void *value;
- char *name;
-
- error = zpl_security_inode_init_security(ip, dip, qstr,
- &name, &value, &len);
- if (error) {
- if (error == -EOPNOTSUPP)
- return (0);
-
- return (error);
- }
-
- error = __zpl_xattr_security_set(ip, name, value, len, 0);
-
- kfree(name);
- kfree(value);
-
- return (error);
-}
-#endif /* HAVE_CALLBACK_SECURITY_INODE_INIT_SECURITY */
-
-/*
- * Security xattr namespace handlers.
- */
-xattr_handler_t zpl_xattr_security_handler = {
- .prefix = XATTR_SECURITY_PREFIX,
- .list = zpl_xattr_security_list,
- .get = zpl_xattr_security_get,
- .set = zpl_xattr_security_set,
-};
-
-/*
- * Extended system attributes
- *
- * "Extended system attributes are used by the kernel to store system
- * objects such as Access Control Lists. Read and write access permissions
- * to system attributes depend on the policy implemented for each system
- * attribute implemented by filesystems in the kernel." - xattr(7)
- */
-#ifdef CONFIG_FS_POSIX_ACL
-int
-zpl_set_acl(struct inode *ip, struct posix_acl *acl, int type)
-{
- char *name, *value = NULL;
- int error = 0;
- size_t size = 0;
-
- if (S_ISLNK(ip->i_mode))
- return (-EOPNOTSUPP);
-
- switch (type) {
- case ACL_TYPE_ACCESS:
- name = XATTR_NAME_POSIX_ACL_ACCESS;
- if (acl) {
- zpl_equivmode_t mode = ip->i_mode;
- error = posix_acl_equiv_mode(acl, &mode);
- if (error < 0) {
- return (error);
- } else {
- /*
- * The mode bits will have been set by
- * ->zfs_setattr()->zfs_acl_chmod_setattr()
- * using the ZFS ACL conversion. If they
- * differ from the Posix ACL conversion dirty
- * the inode to write the Posix mode bits.
- */
- if (ip->i_mode != mode) {
- ip->i_mode = mode;
- ip->i_ctime = current_time(ip);
- zfs_mark_inode_dirty(ip);
- }
-
- if (error == 0)
- acl = NULL;
- }
- }
- break;
-
- case ACL_TYPE_DEFAULT:
- name = XATTR_NAME_POSIX_ACL_DEFAULT;
- if (!S_ISDIR(ip->i_mode))
- return (acl ? -EACCES : 0);
- break;
-
- default:
- return (-EINVAL);
- }
-
- if (acl) {
- size = posix_acl_xattr_size(acl->a_count);
- value = kmem_alloc(size, KM_SLEEP);
-
- error = zpl_acl_to_xattr(acl, value, size);
- if (error < 0) {
- kmem_free(value, size);
- return (error);
- }
- }
-
- error = zpl_xattr_set(ip, name, value, size, 0);
- if (value)
- kmem_free(value, size);
-
- if (!error) {
- if (acl)
- zpl_set_cached_acl(ip, type, acl);
- else
- zpl_forget_cached_acl(ip, type);
- }
-
- return (error);
-}
-
-struct posix_acl *
-zpl_get_acl(struct inode *ip, int type)
-{
- struct posix_acl *acl;
- void *value = NULL;
- char *name;
- int size;
-
- /*
- * As of Linux 3.14, the kernel get_acl will check this for us.
- * Also as of Linux 4.7, comparing against ACL_NOT_CACHED is wrong
- * as the kernel get_acl will set it to temporary sentinel value.
- */
-#ifndef HAVE_KERNEL_GET_ACL_HANDLE_CACHE
- acl = get_cached_acl(ip, type);
- if (acl != ACL_NOT_CACHED)
- return (acl);
-#endif
-
- switch (type) {
- case ACL_TYPE_ACCESS:
- name = XATTR_NAME_POSIX_ACL_ACCESS;
- break;
- case ACL_TYPE_DEFAULT:
- name = XATTR_NAME_POSIX_ACL_DEFAULT;
- break;
- default:
- return (ERR_PTR(-EINVAL));
- }
-
- size = zpl_xattr_get(ip, name, NULL, 0);
- if (size > 0) {
- value = kmem_alloc(size, KM_SLEEP);
- size = zpl_xattr_get(ip, name, value, size);
- }
-
- if (size > 0) {
- acl = zpl_acl_from_xattr(value, size);
- } else if (size == -ENODATA || size == -ENOSYS) {
- acl = NULL;
- } else {
- acl = ERR_PTR(-EIO);
- }
-
- if (size > 0)
- kmem_free(value, size);
-
- /* As of Linux 4.7, the kernel get_acl will set this for us */
-#ifndef HAVE_KERNEL_GET_ACL_HANDLE_CACHE
- if (!IS_ERR(acl))
- zpl_set_cached_acl(ip, type, acl);
-#endif
-
- return (acl);
-}
-
-#if !defined(HAVE_GET_ACL)
-static int
-__zpl_check_acl(struct inode *ip, int mask)
-{
- struct posix_acl *acl;
- int error;
-
- acl = zpl_get_acl(ip, ACL_TYPE_ACCESS);
- if (IS_ERR(acl))
- return (PTR_ERR(acl));
-
- if (acl) {
- error = posix_acl_permission(ip, acl, mask);
- zpl_posix_acl_release(acl);
- return (error);
- }
-
- return (-EAGAIN);
-}
-
-#if defined(HAVE_CHECK_ACL_WITH_FLAGS)
-int
-zpl_check_acl(struct inode *ip, int mask, unsigned int flags)
-{
- return (__zpl_check_acl(ip, mask));
-}
-#elif defined(HAVE_CHECK_ACL)
-int
-zpl_check_acl(struct inode *ip, int mask)
-{
- return (__zpl_check_acl(ip, mask));
-}
-#elif defined(HAVE_PERMISSION_WITH_NAMEIDATA)
-int
-zpl_permission(struct inode *ip, int mask, struct nameidata *nd)
-{
- return (generic_permission(ip, mask, __zpl_check_acl));
-}
-#elif defined(HAVE_PERMISSION)
-int
-zpl_permission(struct inode *ip, int mask)
-{
- return (generic_permission(ip, mask, __zpl_check_acl));
-}
-#endif /* HAVE_CHECK_ACL | HAVE_PERMISSION */
-#endif /* !HAVE_GET_ACL */
-
-int
-zpl_init_acl(struct inode *ip, struct inode *dir)
-{
- struct posix_acl *acl = NULL;
- int error = 0;
-
- if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL)
- return (0);
-
- if (!S_ISLNK(ip->i_mode)) {
- acl = zpl_get_acl(dir, ACL_TYPE_DEFAULT);
- if (IS_ERR(acl))
- return (PTR_ERR(acl));
- if (!acl) {
- ip->i_mode &= ~current_umask();
- ip->i_ctime = current_time(ip);
- zfs_mark_inode_dirty(ip);
- return (0);
- }
- }
-
- if (acl) {
- umode_t mode;
-
- if (S_ISDIR(ip->i_mode)) {
- error = zpl_set_acl(ip, acl, ACL_TYPE_DEFAULT);
- if (error)
- goto out;
- }
-
- mode = ip->i_mode;
- error = __posix_acl_create(&acl, GFP_KERNEL, &mode);
- if (error >= 0) {
- ip->i_mode = mode;
- zfs_mark_inode_dirty(ip);
- if (error > 0)
- error = zpl_set_acl(ip, acl, ACL_TYPE_ACCESS);
- }
- }
-out:
- zpl_posix_acl_release(acl);
-
- return (error);
-}
-
-int
-zpl_chmod_acl(struct inode *ip)
-{
- struct posix_acl *acl;
- int error;
-
- if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL)
- return (0);
-
- if (S_ISLNK(ip->i_mode))
- return (-EOPNOTSUPP);
-
- acl = zpl_get_acl(ip, ACL_TYPE_ACCESS);
- if (IS_ERR(acl) || !acl)
- return (PTR_ERR(acl));
-
- error = __posix_acl_chmod(&acl, GFP_KERNEL, ip->i_mode);
- if (!error)
- error = zpl_set_acl(ip, acl, ACL_TYPE_ACCESS);
-
- zpl_posix_acl_release(acl);
-
- return (error);
-}
-
-static int
-__zpl_xattr_acl_list_access(struct inode *ip, char *list, size_t list_size,
- const char *name, size_t name_len)
-{
- char *xattr_name = XATTR_NAME_POSIX_ACL_ACCESS;
- size_t xattr_size = sizeof (XATTR_NAME_POSIX_ACL_ACCESS);
-
- if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL)
- return (0);
-
- if (list && xattr_size <= list_size)
- memcpy(list, xattr_name, xattr_size);
-
- return (xattr_size);
-}
-ZPL_XATTR_LIST_WRAPPER(zpl_xattr_acl_list_access);
-
-static int
-__zpl_xattr_acl_list_default(struct inode *ip, char *list, size_t list_size,
- const char *name, size_t name_len)
-{
- char *xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT;
- size_t xattr_size = sizeof (XATTR_NAME_POSIX_ACL_DEFAULT);
-
- if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL)
- return (0);
-
- if (list && xattr_size <= list_size)
- memcpy(list, xattr_name, xattr_size);
-
- return (xattr_size);
-}
-ZPL_XATTR_LIST_WRAPPER(zpl_xattr_acl_list_default);
-
-static int
-__zpl_xattr_acl_get_access(struct inode *ip, const char *name,
- void *buffer, size_t size)
-{
- struct posix_acl *acl;
- int type = ACL_TYPE_ACCESS;
- int error;
- /* xattr_resolve_name will do this for us if this is defined */
-#ifndef HAVE_XATTR_HANDLER_NAME
- if (strcmp(name, "") != 0)
- return (-EINVAL);
-#endif
- if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL)
- return (-EOPNOTSUPP);
-
- acl = zpl_get_acl(ip, type);
- if (IS_ERR(acl))
- return (PTR_ERR(acl));
- if (acl == NULL)
- return (-ENODATA);
-
- error = zpl_acl_to_xattr(acl, buffer, size);
- zpl_posix_acl_release(acl);
-
- return (error);
-}
-ZPL_XATTR_GET_WRAPPER(zpl_xattr_acl_get_access);
-
-static int
-__zpl_xattr_acl_get_default(struct inode *ip, const char *name,
- void *buffer, size_t size)
-{
- struct posix_acl *acl;
- int type = ACL_TYPE_DEFAULT;
- int error;
- /* xattr_resolve_name will do this for us if this is defined */
-#ifndef HAVE_XATTR_HANDLER_NAME
- if (strcmp(name, "") != 0)
- return (-EINVAL);
-#endif
- if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL)
- return (-EOPNOTSUPP);
-
- acl = zpl_get_acl(ip, type);
- if (IS_ERR(acl))
- return (PTR_ERR(acl));
- if (acl == NULL)
- return (-ENODATA);
-
- error = zpl_acl_to_xattr(acl, buffer, size);
- zpl_posix_acl_release(acl);
-
- return (error);
-}
-ZPL_XATTR_GET_WRAPPER(zpl_xattr_acl_get_default);
-
-static int
-__zpl_xattr_acl_set_access(struct inode *ip, const char *name,
- const void *value, size_t size, int flags)
-{
- struct posix_acl *acl;
- int type = ACL_TYPE_ACCESS;
- int error = 0;
- /* xattr_resolve_name will do this for us if this is defined */
-#ifndef HAVE_XATTR_HANDLER_NAME
- if (strcmp(name, "") != 0)
- return (-EINVAL);
-#endif
- if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL)
- return (-EOPNOTSUPP);
-
- if (!zpl_inode_owner_or_capable(ip))
- return (-EPERM);
-
- if (value) {
- acl = zpl_acl_from_xattr(value, size);
- if (IS_ERR(acl))
- return (PTR_ERR(acl));
- else if (acl) {
- error = zpl_posix_acl_valid(ip, acl);
- if (error) {
- zpl_posix_acl_release(acl);
- return (error);
- }
- }
- } else {
- acl = NULL;
- }
-
- error = zpl_set_acl(ip, acl, type);
- zpl_posix_acl_release(acl);
-
- return (error);
-}
-ZPL_XATTR_SET_WRAPPER(zpl_xattr_acl_set_access);
-
-static int
-__zpl_xattr_acl_set_default(struct inode *ip, const char *name,
- const void *value, size_t size, int flags)
-{
- struct posix_acl *acl;
- int type = ACL_TYPE_DEFAULT;
- int error = 0;
- /* xattr_resolve_name will do this for us if this is defined */
-#ifndef HAVE_XATTR_HANDLER_NAME
- if (strcmp(name, "") != 0)
- return (-EINVAL);
-#endif
- if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL)
- return (-EOPNOTSUPP);
-
- if (!zpl_inode_owner_or_capable(ip))
- return (-EPERM);
-
- if (value) {
- acl = zpl_acl_from_xattr(value, size);
- if (IS_ERR(acl))
- return (PTR_ERR(acl));
- else if (acl) {
- error = zpl_posix_acl_valid(ip, acl);
- if (error) {
- zpl_posix_acl_release(acl);
- return (error);
- }
- }
- } else {
- acl = NULL;
- }
-
- error = zpl_set_acl(ip, acl, type);
- zpl_posix_acl_release(acl);
-
- return (error);
-}
-ZPL_XATTR_SET_WRAPPER(zpl_xattr_acl_set_default);
-
-/*
- * ACL access xattr namespace handlers.
- *
- * Use .name instead of .prefix when available. xattr_resolve_name will match
- * whole name and reject anything that has .name only as prefix.
- */
-xattr_handler_t zpl_xattr_acl_access_handler =
-{
-#ifdef HAVE_XATTR_HANDLER_NAME
- .name = XATTR_NAME_POSIX_ACL_ACCESS,
-#else
- .prefix = XATTR_NAME_POSIX_ACL_ACCESS,
-#endif
- .list = zpl_xattr_acl_list_access,
- .get = zpl_xattr_acl_get_access,
- .set = zpl_xattr_acl_set_access,
-#if defined(HAVE_XATTR_LIST_SIMPLE) || \
- defined(HAVE_XATTR_LIST_DENTRY) || \
- defined(HAVE_XATTR_LIST_HANDLER)
- .flags = ACL_TYPE_ACCESS,
-#endif
-};
-
-/*
- * ACL default xattr namespace handlers.
- *
- * Use .name instead of .prefix when available. xattr_resolve_name will match
- * whole name and reject anything that has .name only as prefix.
- */
-xattr_handler_t zpl_xattr_acl_default_handler =
-{
-#ifdef HAVE_XATTR_HANDLER_NAME
- .name = XATTR_NAME_POSIX_ACL_DEFAULT,
-#else
- .prefix = XATTR_NAME_POSIX_ACL_DEFAULT,
-#endif
- .list = zpl_xattr_acl_list_default,
- .get = zpl_xattr_acl_get_default,
- .set = zpl_xattr_acl_set_default,
-#if defined(HAVE_XATTR_LIST_SIMPLE) || \
- defined(HAVE_XATTR_LIST_DENTRY) || \
- defined(HAVE_XATTR_LIST_HANDLER)
- .flags = ACL_TYPE_DEFAULT,
-#endif
-};
-
-#endif /* CONFIG_FS_POSIX_ACL */
-
-xattr_handler_t *zpl_xattr_handlers[] = {
- &zpl_xattr_security_handler,
- &zpl_xattr_trusted_handler,
- &zpl_xattr_user_handler,
-#ifdef CONFIG_FS_POSIX_ACL
- &zpl_xattr_acl_access_handler,
- &zpl_xattr_acl_default_handler,
-#endif /* CONFIG_FS_POSIX_ACL */
- NULL
-};
-
-static const struct xattr_handler *
-zpl_xattr_handler(const char *name)
-{
- if (strncmp(name, XATTR_USER_PREFIX,
- XATTR_USER_PREFIX_LEN) == 0)
- return (&zpl_xattr_user_handler);
-
- if (strncmp(name, XATTR_TRUSTED_PREFIX,
- XATTR_TRUSTED_PREFIX_LEN) == 0)
- return (&zpl_xattr_trusted_handler);
-
- if (strncmp(name, XATTR_SECURITY_PREFIX,
- XATTR_SECURITY_PREFIX_LEN) == 0)
- return (&zpl_xattr_security_handler);
-
-#ifdef CONFIG_FS_POSIX_ACL
- if (strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS,
- sizeof (XATTR_NAME_POSIX_ACL_ACCESS)) == 0)
- return (&zpl_xattr_acl_access_handler);
-
- if (strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT,
- sizeof (XATTR_NAME_POSIX_ACL_DEFAULT)) == 0)
- return (&zpl_xattr_acl_default_handler);
-#endif /* CONFIG_FS_POSIX_ACL */
-
- return (NULL);
-}
-
-#if !defined(HAVE_POSIX_ACL_RELEASE) || defined(HAVE_POSIX_ACL_RELEASE_GPL_ONLY)
-struct acl_rel_struct {
- struct acl_rel_struct *next;
- struct posix_acl *acl;
- clock_t time;
-};
-
-#define ACL_REL_GRACE (60*HZ)
-#define ACL_REL_WINDOW (1*HZ)
-#define ACL_REL_SCHED (ACL_REL_GRACE+ACL_REL_WINDOW)
-
-/*
- * Lockless multi-producer single-consumer fifo list.
- * Nodes are added to tail and removed from head. Tail pointer is our
- * synchronization point. It always points to the next pointer of the last
- * node, or head if list is empty.
- */
-static struct acl_rel_struct *acl_rel_head = NULL;
-static struct acl_rel_struct **acl_rel_tail = &acl_rel_head;
-
-static void
-zpl_posix_acl_free(void *arg)
-{
- struct acl_rel_struct *freelist = NULL;
- struct acl_rel_struct *a;
- clock_t new_time;
- boolean_t refire = B_FALSE;
-
- ASSERT3P(acl_rel_head, !=, NULL);
- while (acl_rel_head) {
- a = acl_rel_head;
- if (ddi_get_lbolt() - a->time >= ACL_REL_GRACE) {
- /*
- * If a is the last node we need to reset tail, but we
- * need to use cmpxchg to make sure it is still the
- * last node.
- */
- if (acl_rel_tail == &a->next) {
- acl_rel_head = NULL;
- if (cmpxchg(&acl_rel_tail, &a->next,
- &acl_rel_head) == &a->next) {
- ASSERT3P(a->next, ==, NULL);
- a->next = freelist;
- freelist = a;
- break;
- }
- }
- /*
- * a is not last node, make sure next pointer is set
- * by the adder and advance the head.
- */
- while (READ_ONCE(a->next) == NULL)
- cpu_relax();
- acl_rel_head = a->next;
- a->next = freelist;
- freelist = a;
- } else {
- /*
- * a is still in grace period. We are responsible to
- * reschedule the free task, since adder will only do
- * so if list is empty.
- */
- new_time = a->time + ACL_REL_SCHED;
- refire = B_TRUE;
- break;
- }
- }
-
- if (refire)
- taskq_dispatch_delay(system_delay_taskq, zpl_posix_acl_free,
- NULL, TQ_SLEEP, new_time);
-
- while (freelist) {
- a = freelist;
- freelist = a->next;
- kfree(a->acl);
- kmem_free(a, sizeof (struct acl_rel_struct));
- }
-}
-
-void
-zpl_posix_acl_release_impl(struct posix_acl *acl)
-{
- struct acl_rel_struct *a, **prev;
-
- a = kmem_alloc(sizeof (struct acl_rel_struct), KM_SLEEP);
- a->next = NULL;
- a->acl = acl;
- a->time = ddi_get_lbolt();
- /* atomically points tail to us and get the previous tail */
- prev = xchg(&acl_rel_tail, &a->next);
- ASSERT3P(*prev, ==, NULL);
- *prev = a;
- /* if it was empty before, schedule the free task */
- if (prev == &acl_rel_head)
- taskq_dispatch_delay(system_delay_taskq, zpl_posix_acl_free,
- NULL, TQ_SLEEP, ddi_get_lbolt() + ACL_REL_SCHED);
-}
-#endif