Diffstat (limited to 'module/zfs')
-rw-r--r-- | module/zfs/Makefile.in    |    43
-rw-r--r-- | module/zfs/abd.c          |  1638
-rw-r--r-- | module/zfs/gzip.c         |     2
-rw-r--r-- | module/zfs/policy.c       |   355
-rw-r--r-- | module/zfs/qat.c          |   105
-rw-r--r-- | module/zfs/qat.h          |   204
-rw-r--r-- | module/zfs/qat_compress.c |   574
-rw-r--r-- | module/zfs/qat_crypt.c    |   631
-rw-r--r-- | module/zfs/sha256.c       |     2
-rw-r--r-- | module/zfs/spa_misc.c     |     2
-rw-r--r-- | module/zfs/spa_stats.c    |  1034
-rw-r--r-- | module/zfs/vdev_disk.c    |   954
-rw-r--r-- | module/zfs/vdev_file.c    |   331
-rw-r--r-- | module/zfs/zfs_acl.c      |  2816
-rw-r--r-- | module/zfs/zfs_ctldir.c   |  1240
-rw-r--r-- | module/zfs/zfs_debug.c    |   253
-rw-r--r-- | module/zfs/zfs_dir.c      |  1205
-rw-r--r-- | module/zfs/zfs_sysfs.c    |   661
-rw-r--r-- | module/zfs/zfs_vfsops.c   |  2562
-rw-r--r-- | module/zfs/zfs_vnops.c    |  5275
-rw-r--r-- | module/zfs/zfs_znode.c    |  2234
-rw-r--r-- | module/zfs/zio_crypt.c    |  2036
-rw-r--r-- | module/zfs/zpl_ctldir.c   |   572
-rw-r--r-- | module/zfs/zpl_export.c   |   177
-rw-r--r-- | module/zfs/zpl_file.c     |  1075
-rw-r--r-- | module/zfs/zpl_inode.c    |   826
-rw-r--r-- | module/zfs/zpl_super.c    |   426
-rw-r--r-- | module/zfs/zpl_xattr.c    |  1548
28 files changed, 14 insertions(+), 28767 deletions(-)
diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index 5adea9fb5..7c560fad7 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -16,18 +16,17 @@ endif # Suppress unused but set variable warnings often due to ASSERTs ccflags-y += $(NO_UNUSED_BUT_SET_VARIABLE) -$(MODULE)-objs += abd.o $(MODULE)-objs += aggsum.o $(MODULE)-objs += arc.o $(MODULE)-objs += blkptr.o $(MODULE)-objs += bplist.o $(MODULE)-objs += bpobj.o -$(MODULE)-objs += cityhash.o -$(MODULE)-objs += dbuf.o -$(MODULE)-objs += dbuf_stats.o $(MODULE)-objs += bptree.o $(MODULE)-objs += bqueue.o +$(MODULE)-objs += cityhash.o $(MODULE)-objs += dataset_kstats.o +$(MODULE)-objs += dbuf.o +$(MODULE)-objs += dbuf_stats.o $(MODULE)-objs += ddt.o $(MODULE)-objs += ddt_zap.o $(MODULE)-objs += dmu.o @@ -42,28 +41,29 @@ $(MODULE)-objs += dmu_tx.o $(MODULE)-objs += dmu_zfetch.o $(MODULE)-objs += dnode.o $(MODULE)-objs += dnode_sync.o +$(MODULE)-objs += dsl_bookmark.o +$(MODULE)-objs += dsl_crypt.o $(MODULE)-objs += dsl_dataset.o $(MODULE)-objs += dsl_deadlist.o $(MODULE)-objs += dsl_deleg.o -$(MODULE)-objs += dsl_bookmark.o +$(MODULE)-objs += dsl_destroy.o $(MODULE)-objs += dsl_dir.o -$(MODULE)-objs += dsl_crypt.o $(MODULE)-objs += dsl_pool.o $(MODULE)-objs += dsl_prop.o $(MODULE)-objs += dsl_scan.o $(MODULE)-objs += dsl_synctask.o +$(MODULE)-objs += dsl_userhold.o $(MODULE)-objs += edonr_zfs.o $(MODULE)-objs += fm.o $(MODULE)-objs += gzip.o $(MODULE)-objs += hkdf.o -$(MODULE)-objs += lzjb.o $(MODULE)-objs += lz4.o +$(MODULE)-objs += lzjb.o $(MODULE)-objs += metaslab.o $(MODULE)-objs += mmp.o $(MODULE)-objs += multilist.o $(MODULE)-objs += objlist.o $(MODULE)-objs += pathname.o -$(MODULE)-objs += policy.o $(MODULE)-objs += range_tree.o $(MODULE)-objs += refcount.o $(MODULE)-objs += rrwlock.o @@ -78,17 +78,14 @@ $(MODULE)-objs += spa_errlog.o $(MODULE)-objs += spa_history.o $(MODULE)-objs += spa_log_spacemap.o $(MODULE)-objs += spa_misc.o -$(MODULE)-objs += spa_stats.o $(MODULE)-objs += space_map.o $(MODULE)-objs += space_reftree.o -$(MODULE)-objs += txg.o $(MODULE)-objs += trace.o +$(MODULE)-objs += txg.o $(MODULE)-objs += uberblock.o $(MODULE)-objs += unique.o $(MODULE)-objs += vdev.o $(MODULE)-objs += vdev_cache.o -$(MODULE)-objs += vdev_disk.o -$(MODULE)-objs += vdev_file.o $(MODULE)-objs += vdev_indirect.o $(MODULE)-objs += vdev_indirect_births.o $(MODULE)-objs += vdev_indirect_mapping.o @@ -112,11 +109,7 @@ $(MODULE)-objs += zcp_global.o $(MODULE)-objs += zcp_iter.o $(MODULE)-objs += zcp_synctask.o $(MODULE)-objs += zfeature.o -$(MODULE)-objs += zfs_acl.o $(MODULE)-objs += zfs_byteswap.o -$(MODULE)-objs += zfs_ctldir.o -$(MODULE)-objs += zfs_debug.o -$(MODULE)-objs += zfs_dir.o $(MODULE)-objs += zfs_fm.o $(MODULE)-objs += zfs_fuid.o $(MODULE)-objs += zfs_ioctl.o @@ -126,31 +119,15 @@ $(MODULE)-objs += zfs_ratelimit.o $(MODULE)-objs += zfs_replay.o $(MODULE)-objs += zfs_rlock.o $(MODULE)-objs += zfs_sa.o -$(MODULE)-objs += zfs_sysfs.o -$(MODULE)-objs += zfs_vfsops.o -$(MODULE)-objs += zfs_vnops.o -$(MODULE)-objs += zfs_znode.o $(MODULE)-objs += zil.o $(MODULE)-objs += zio.o $(MODULE)-objs += zio_checksum.o $(MODULE)-objs += zio_compress.o -$(MODULE)-objs += zio_crypt.o $(MODULE)-objs += zio_inject.o $(MODULE)-objs += zle.o -$(MODULE)-objs += zpl_ctldir.o -$(MODULE)-objs += zpl_export.o -$(MODULE)-objs += zpl_file.o -$(MODULE)-objs += zpl_inode.o -$(MODULE)-objs += zpl_super.o -$(MODULE)-objs += zpl_xattr.o $(MODULE)-objs += zrlock.o $(MODULE)-objs += zthr.o $(MODULE)-objs += zvol.o -$(MODULE)-objs += dsl_destroy.o 
-$(MODULE)-objs += dsl_userhold.o -$(MODULE)-objs += qat.o -$(MODULE)-objs += qat_compress.o -$(MODULE)-objs += qat_crypt.o # Suppress incorrect warnings from versions of objtool which are not # aware of x86 EVEX prefix instructions used for AVX512. @@ -165,3 +142,5 @@ $(MODULE)-$(CONFIG_X86) += vdev_raidz_math_avx512bw.o $(MODULE)-$(CONFIG_ARM64) += vdev_raidz_math_aarch64_neon.o $(MODULE)-$(CONFIG_ARM64) += vdev_raidz_math_aarch64_neonx2.o + +-include @abs_top_builddir@/module/os/linux/zfs/Makefile diff --git a/module/zfs/abd.c b/module/zfs/abd.c deleted file mode 100644 index ac6b0b742..000000000 --- a/module/zfs/abd.c +++ /dev/null @@ -1,1638 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2014 by Chunwei Chen. All rights reserved. - * Copyright (c) 2019 by Delphix. All rights reserved. - */ - -/* - * ARC buffer data (ABD). - * - * ABDs are an abstract data structure for the ARC which can use two - * different ways of storing the underlying data: - * - * (a) Linear buffer. In this case, all the data in the ABD is stored in one - * contiguous buffer in memory (from a zio_[data_]buf_* kmem cache). - * - * +-------------------+ - * | ABD (linear) | - * | abd_flags = ... | - * | abd_size = ... | +--------------------------------+ - * | abd_buf ------------->| raw buffer of size abd_size | - * +-------------------+ +--------------------------------+ - * no abd_chunks - * - * (b) Scattered buffer. In this case, the data in the ABD is split into - * equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers - * to the chunks recorded in an array at the end of the ABD structure. - * - * +-------------------+ - * | ABD (scattered) | - * | abd_flags = ... | - * | abd_size = ... | - * | abd_offset = 0 | +-----------+ - * | abd_chunks[0] ----------------------------->| chunk 0 | - * | abd_chunks[1] ---------------------+ +-----------+ - * | ... | | +-----------+ - * | abd_chunks[N-1] ---------+ +------->| chunk 1 | - * +-------------------+ | +-----------+ - * | ... - * | +-----------+ - * +----------------->| chunk N-1 | - * +-----------+ - * - * Linear buffers act exactly like normal buffers and are always mapped into the - * kernel's virtual memory space, while scattered ABD data chunks are allocated - * as physical pages and then mapped in only while they are actually being - * accessed through one of the abd_* library functions. Using scattered ABDs - * provides several benefits: - * - * (1) They avoid use of kmem_*, preventing performance problems where running - * kmem_reap on very large memory systems never finishes and causes - * constant TLB shootdowns. 
- * - * (2) Fragmentation is less of an issue since when we are at the limit of - * allocatable space, we won't have to search around for a long free - * hole in the VA space for large ARC allocations. Each chunk is mapped in - * individually, so even if we are using HIGHMEM (see next point) we - * wouldn't need to worry about finding a contiguous address range. - * - * (3) If we are not using HIGHMEM, then all physical memory is always - * mapped into the kernel's address space, so we also avoid the map / - * unmap costs on each ABD access. - * - * If we are not using HIGHMEM, scattered buffers which have only one chunk - * can be treated as linear buffers, because they are contiguous in the - * kernel's virtual address space. See abd_alloc_pages() for details. - * - * It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to - * B_FALSE. - * - * In addition to directly allocating a linear or scattered ABD, it is also - * possible to create an ABD by requesting the "sub-ABD" starting at an offset - * within an existing ABD. In linear buffers this is simple (set abd_buf of - * the new ABD to the starting point within the original raw buffer), but - * scattered ABDs are a little more complex. The new ABD makes a copy of the - * relevant abd_chunks pointers (but not the underlying data). However, to - * provide arbitrary rather than only chunk-aligned starting offsets, it also - * tracks an abd_offset field which represents the starting point of the data - * within the first chunk in abd_chunks. For both linear and scattered ABDs, - * creating an offset ABD marks the original ABD as the offset's parent, and the - * original ABD's abd_children refcount is incremented. This data allows us to - * ensure the root ABD isn't deleted before its children. - * - * Most consumers should never need to know what type of ABD they're using -- - * the ABD public API ensures that it's possible to transparently switch from - * using a linear ABD to a scattered one when doing so would be beneficial. - * - * If you need to use the data within an ABD directly, if you know it's linear - * (because you allocated it) you can use abd_to_buf() to access the underlying - * raw buffer. Otherwise, you should use one of the abd_borrow_buf* functions - * which will allocate a raw buffer if necessary. Use the abd_return_buf* - * functions to return any raw buffers that are no longer necessary when you're - * done using them. - * - * There are a variety of ABD APIs that implement basic buffer operations: - * compare, copy, read, write, and fill with zeroes. If you need a custom - * function which progressively accesses the whole ABD, use the abd_iterate_* - * functions. 
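[Editor's note: the consumer-facing API described above is small. A minimal sketch of the common pattern, assuming a kernel build context where sys/abd.h is available; this example is not part of the commit:]

#include <sys/abd.h>

/*
 * Sketch: allocate an ABD (possibly scattered), borrow a flat view of
 * it, and hand everything back. abd_alloc(), abd_borrow_buf_copy(),
 * abd_return_buf_copy() and abd_free() are all defined in this file.
 */
static void
abd_usage_sketch(void)
{
	abd_t *abd = abd_alloc(8192, B_FALSE);		/* data, not metadata */
	void *buf = abd_borrow_buf_copy(abd, 8192);	/* contiguous view */

	/* ... use buf like any raw buffer ... */

	abd_return_buf_copy(abd, buf, 8192);	/* copies changes back if scattered */
	abd_free(abd);
}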
- */ - -#include <sys/abd.h> -#include <sys/param.h> -#include <sys/zio.h> -#include <sys/zfs_context.h> -#include <sys/zfs_znode.h> -#ifdef _KERNEL -#include <linux/scatterlist.h> -#include <linux/kmap_compat.h> -#else -#define MAX_ORDER 1 -#endif - -typedef struct abd_stats { - kstat_named_t abdstat_struct_size; - kstat_named_t abdstat_linear_cnt; - kstat_named_t abdstat_linear_data_size; - kstat_named_t abdstat_scatter_cnt; - kstat_named_t abdstat_scatter_data_size; - kstat_named_t abdstat_scatter_chunk_waste; - kstat_named_t abdstat_scatter_orders[MAX_ORDER]; - kstat_named_t abdstat_scatter_page_multi_chunk; - kstat_named_t abdstat_scatter_page_multi_zone; - kstat_named_t abdstat_scatter_page_alloc_retry; - kstat_named_t abdstat_scatter_sg_table_retry; -} abd_stats_t; - -static abd_stats_t abd_stats = { - /* Amount of memory occupied by all of the abd_t struct allocations */ - { "struct_size", KSTAT_DATA_UINT64 }, - /* - * The number of linear ABDs which are currently allocated, excluding - * ABDs which don't own their data (for instance the ones which were - * allocated through abd_get_offset() and abd_get_from_buf()). If an - * ABD takes ownership of its buf then it will become tracked. - */ - { "linear_cnt", KSTAT_DATA_UINT64 }, - /* Amount of data stored in all linear ABDs tracked by linear_cnt */ - { "linear_data_size", KSTAT_DATA_UINT64 }, - /* - * The number of scatter ABDs which are currently allocated, excluding - * ABDs which don't own their data (for instance the ones which were - * allocated through abd_get_offset()). - */ - { "scatter_cnt", KSTAT_DATA_UINT64 }, - /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */ - { "scatter_data_size", KSTAT_DATA_UINT64 }, - /* - * The amount of space wasted at the end of the last chunk across all - * scatter ABDs tracked by scatter_cnt. - */ - { "scatter_chunk_waste", KSTAT_DATA_UINT64 }, - /* - * The number of compound allocations of a given order. These - * allocations are spread over all currently allocated ABDs, and - * act as a measure of memory fragmentation. - */ - { { "scatter_order_N", KSTAT_DATA_UINT64 } }, - /* - * The number of scatter ABDs which contain multiple chunks. - * ABDs are preferentially allocated from the minimum number of - * contiguous multi-page chunks, a single chunk is optimal. - */ - { "scatter_page_multi_chunk", KSTAT_DATA_UINT64 }, - /* - * The number of scatter ABDs which are split across memory zones. - * ABDs are preferentially allocated using pages from a single zone. - */ - { "scatter_page_multi_zone", KSTAT_DATA_UINT64 }, - /* - * The total number of retries encountered when attempting to - * allocate the pages to populate the scatter ABD. - */ - { "scatter_page_alloc_retry", KSTAT_DATA_UINT64 }, - /* - * The total number of retries encountered when attempting to - * allocate the sg table for an ABD. 
- */ - { "scatter_sg_table_retry", KSTAT_DATA_UINT64 }, -}; - -#define ABDSTAT(stat) (abd_stats.stat.value.ui64) -#define ABDSTAT_INCR(stat, val) \ - atomic_add_64(&abd_stats.stat.value.ui64, (val)) -#define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1) -#define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1) - -#define ABD_SCATTER(abd) (abd->abd_u.abd_scatter) -#define ABD_BUF(abd) (abd->abd_u.abd_linear.abd_buf) -#define abd_for_each_sg(abd, sg, n, i) \ - for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i) - -/* see block comment above for description */ -int zfs_abd_scatter_enabled = B_TRUE; -unsigned zfs_abd_scatter_max_order = MAX_ORDER - 1; - -/* - * zfs_abd_scatter_min_size is the minimum allocation size to use scatter - * ABD's. Smaller allocations will use linear ABD's which uses - * zio_[data_]buf_alloc(). - * - * Scatter ABD's use at least one page each, so sub-page allocations waste - * some space when allocated as scatter (e.g. 2KB scatter allocation wastes - * half of each page). Using linear ABD's for small allocations means that - * they will be put on slabs which contain many allocations. This can - * improve memory efficiency, but it also makes it much harder for ARC - * evictions to actually free pages, because all the buffers on one slab need - * to be freed in order for the slab (and underlying pages) to be freed. - * Typically, 512B and 1KB kmem caches have 16 buffers per slab, so it's - * possible for them to actually waste more memory than scatter (one page per - * buf = wasting 3/4 or 7/8th; one buf per slab = wasting 15/16th). - * - * Spill blocks are typically 512B and are heavily used on systems running - * selinux with the default dnode size and the `xattr=sa` property set. - * - * By default we use linear allocations for 512B and 1KB, and scatter - * allocations for larger (1.5KB and up). - */ -int zfs_abd_scatter_min_size = 512 * 3; - -static kmem_cache_t *abd_cache = NULL; -static kstat_t *abd_ksp; - -static inline size_t -abd_chunkcnt_for_bytes(size_t size) -{ - return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE); -} - -#ifdef _KERNEL -/* - * Mark zfs data pages so they can be excluded from kernel crash dumps - */ -#ifdef _LP64 -#define ABD_FILE_CACHE_PAGE 0x2F5ABDF11ECAC4E - -static inline void -abd_mark_zfs_page(struct page *page) -{ - get_page(page); - SetPagePrivate(page); - set_page_private(page, ABD_FILE_CACHE_PAGE); -} - -static inline void -abd_unmark_zfs_page(struct page *page) -{ - set_page_private(page, 0UL); - ClearPagePrivate(page); - put_page(page); -} -#else -#define abd_mark_zfs_page(page) -#define abd_unmark_zfs_page(page) -#endif /* _LP64 */ - -#ifndef CONFIG_HIGHMEM - -#ifndef __GFP_RECLAIM -#define __GFP_RECLAIM __GFP_WAIT -#endif - -/* - * The goal is to minimize fragmentation by preferentially populating ABDs - * with higher order compound pages from a single zone. Allocation size is - * progressively decreased until it can be satisfied without performing - * reclaim or compaction. When necessary this function will degenerate to - * allocating individual pages and allowing reclaim to satisfy allocations. 
- */ -static void -abd_alloc_pages(abd_t *abd, size_t size) -{ - struct list_head pages; - struct sg_table table; - struct scatterlist *sg; - struct page *page, *tmp_page = NULL; - gfp_t gfp = __GFP_NOWARN | GFP_NOIO; - gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM; - int max_order = MIN(zfs_abd_scatter_max_order, MAX_ORDER - 1); - int nr_pages = abd_chunkcnt_for_bytes(size); - int chunks = 0, zones = 0; - size_t remaining_size; - int nid = NUMA_NO_NODE; - int alloc_pages = 0; - - INIT_LIST_HEAD(&pages); - - while (alloc_pages < nr_pages) { - unsigned chunk_pages; - int order; - - order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order); - chunk_pages = (1U << order); - - page = alloc_pages_node(nid, order ? gfp_comp : gfp, order); - if (page == NULL) { - if (order == 0) { - ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry); - schedule_timeout_interruptible(1); - } else { - max_order = MAX(0, order - 1); - } - continue; - } - - list_add_tail(&page->lru, &pages); - - if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid)) - zones++; - - nid = page_to_nid(page); - ABDSTAT_BUMP(abdstat_scatter_orders[order]); - chunks++; - alloc_pages += chunk_pages; - } - - ASSERT3S(alloc_pages, ==, nr_pages); - - while (sg_alloc_table(&table, chunks, gfp)) { - ABDSTAT_BUMP(abdstat_scatter_sg_table_retry); - schedule_timeout_interruptible(1); - } - - sg = table.sgl; - remaining_size = size; - list_for_each_entry_safe(page, tmp_page, &pages, lru) { - size_t sg_size = MIN(PAGESIZE << compound_order(page), - remaining_size); - sg_set_page(sg, page, sg_size, 0); - abd_mark_zfs_page(page); - remaining_size -= sg_size; - - sg = sg_next(sg); - list_del(&page->lru); - } - - /* - * These conditions ensure that a possible transformation to a linear - * ABD would be valid. - */ - ASSERT(!PageHighMem(sg_page(table.sgl))); - ASSERT0(ABD_SCATTER(abd).abd_offset); - - if (table.nents == 1) { - /* - * Since there is only one entry, this ABD can be represented - * as a linear buffer. All single-page (4K) ABD's can be - * represented this way. Some multi-page ABD's can also be - * represented this way, if we were able to allocate a single - * "chunk" (higher-order "page" which represents a power-of-2 - * series of physically-contiguous pages). This is often the - * case for 2-page (8K) ABD's. - * - * Representing a single-entry scatter ABD as a linear ABD - * has the performance advantage of avoiding the copy (and - * allocation) in abd_borrow_buf_copy / abd_return_buf_copy. - * A performance increase of around 5% has been observed for - * ARC-cached reads (of small blocks which can take advantage - * of this). - * - * Note that this optimization is only possible because the - * pages are always mapped into the kernel's address space. - * This is not the case for highmem pages, so the - * optimization can not be made there. - */ - abd->abd_flags |= ABD_FLAG_LINEAR; - abd->abd_flags |= ABD_FLAG_LINEAR_PAGE; - abd->abd_u.abd_linear.abd_sgl = table.sgl; - abd->abd_u.abd_linear.abd_buf = - page_address(sg_page(table.sgl)); - } else if (table.nents > 1) { - ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); - abd->abd_flags |= ABD_FLAG_MULTI_CHUNK; - - if (zones) { - ABDSTAT_BUMP(abdstat_scatter_page_multi_zone); - abd->abd_flags |= ABD_FLAG_MULTI_ZONE; - } - - ABD_SCATTER(abd).abd_sgl = table.sgl; - ABD_SCATTER(abd).abd_nents = table.nents; - } -} -#else -/* - * Allocate N individual pages to construct a scatter ABD. 
This function - * makes no attempt to request contiguous pages and requires the minimal - * number of kernel interfaces. It's designed for maximum compatibility. - */ -static void -abd_alloc_pages(abd_t *abd, size_t size) -{ - struct scatterlist *sg = NULL; - struct sg_table table; - struct page *page; - gfp_t gfp = __GFP_NOWARN | GFP_NOIO; - int nr_pages = abd_chunkcnt_for_bytes(size); - int i = 0; - - while (sg_alloc_table(&table, nr_pages, gfp)) { - ABDSTAT_BUMP(abdstat_scatter_sg_table_retry); - schedule_timeout_interruptible(1); - } - - ASSERT3U(table.nents, ==, nr_pages); - ABD_SCATTER(abd).abd_sgl = table.sgl; - ABD_SCATTER(abd).abd_nents = nr_pages; - - abd_for_each_sg(abd, sg, nr_pages, i) { - while ((page = __page_cache_alloc(gfp)) == NULL) { - ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry); - schedule_timeout_interruptible(1); - } - - ABDSTAT_BUMP(abdstat_scatter_orders[0]); - sg_set_page(sg, page, PAGESIZE, 0); - abd_mark_zfs_page(page); - } - - if (nr_pages > 1) { - ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); - abd->abd_flags |= ABD_FLAG_MULTI_CHUNK; - } -} -#endif /* !CONFIG_HIGHMEM */ - -static void -abd_free_pages(abd_t *abd) -{ - struct scatterlist *sg = NULL; - struct sg_table table; - struct page *page; - int nr_pages = ABD_SCATTER(abd).abd_nents; - int order, i = 0; - - if (abd->abd_flags & ABD_FLAG_MULTI_ZONE) - ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_zone); - - if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK) - ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk); - - abd_for_each_sg(abd, sg, nr_pages, i) { - page = sg_page(sg); - abd_unmark_zfs_page(page); - order = compound_order(page); - __free_pages(page, order); - ASSERT3U(sg->length, <=, PAGE_SIZE << order); - ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]); - } - - table.sgl = ABD_SCATTER(abd).abd_sgl; - table.nents = table.orig_nents = nr_pages; - sg_free_table(&table); -} - -#else /* _KERNEL */ - -#ifndef PAGE_SHIFT -#define PAGE_SHIFT (highbit64(PAGESIZE)-1) -#endif - -struct page; - -#define zfs_kmap_atomic(chunk, km) ((void *)chunk) -#define zfs_kunmap_atomic(addr, km) do { (void)(addr); } while (0) -#define local_irq_save(flags) do { (void)(flags); } while (0) -#define local_irq_restore(flags) do { (void)(flags); } while (0) -#define nth_page(pg, i) \ - ((struct page *)((void *)(pg) + (i) * PAGESIZE)) - -struct scatterlist { - struct page *page; - int length; - int end; -}; - -static void -sg_init_table(struct scatterlist *sg, int nr) -{ - memset(sg, 0, nr * sizeof (struct scatterlist)); - sg[nr - 1].end = 1; -} - -#define for_each_sg(sgl, sg, nr, i) \ - for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg)) - -static inline void -sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len, - unsigned int offset) -{ - /* currently we don't use offset */ - ASSERT(offset == 0); - sg->page = page; - sg->length = len; -} - -static inline struct page * -sg_page(struct scatterlist *sg) -{ - return (sg->page); -} - -static inline struct scatterlist * -sg_next(struct scatterlist *sg) -{ - if (sg->end) - return (NULL); - - return (sg + 1); -} - -static void -abd_alloc_pages(abd_t *abd, size_t size) -{ - unsigned nr_pages = abd_chunkcnt_for_bytes(size); - struct scatterlist *sg; - int i; - - ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages * - sizeof (struct scatterlist), KM_SLEEP); - sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages); - - abd_for_each_sg(abd, sg, nr_pages, i) { - struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP); - sg_set_page(sg, p, PAGESIZE, 0); - } - 
ABD_SCATTER(abd).abd_nents = nr_pages; -} - -static void -abd_free_pages(abd_t *abd) -{ - int i, n = ABD_SCATTER(abd).abd_nents; - struct scatterlist *sg; - - abd_for_each_sg(abd, sg, n, i) { - for (int j = 0; j < sg->length; j += PAGESIZE) { - struct page *p = nth_page(sg_page(sg), j >> PAGE_SHIFT); - umem_free(p, PAGESIZE); - } - } - - vmem_free(ABD_SCATTER(abd).abd_sgl, n * sizeof (struct scatterlist)); -} - -#endif /* _KERNEL */ - -void -abd_init(void) -{ - int i; - - abd_cache = kmem_cache_create("abd_t", sizeof (abd_t), - 0, NULL, NULL, NULL, NULL, NULL, 0); - - abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED, - sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); - if (abd_ksp != NULL) { - abd_ksp->ks_data = &abd_stats; - kstat_install(abd_ksp); - - for (i = 0; i < MAX_ORDER; i++) { - snprintf(abd_stats.abdstat_scatter_orders[i].name, - KSTAT_STRLEN, "scatter_order_%d", i); - abd_stats.abdstat_scatter_orders[i].data_type = - KSTAT_DATA_UINT64; - } - } -} - -void -abd_fini(void) -{ - if (abd_ksp != NULL) { - kstat_delete(abd_ksp); - abd_ksp = NULL; - } - - if (abd_cache) { - kmem_cache_destroy(abd_cache); - abd_cache = NULL; - } -} - -static inline void -abd_verify(abd_t *abd) -{ - ASSERT3U(abd->abd_size, >, 0); - ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE); - ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR | - ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE | - ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE)); - IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER)); - IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER); - if (abd_is_linear(abd)) { - ASSERT3P(abd->abd_u.abd_linear.abd_buf, !=, NULL); - } else { - size_t n; - int i = 0; - struct scatterlist *sg = NULL; - - ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0); - ASSERT3U(ABD_SCATTER(abd).abd_offset, <, - ABD_SCATTER(abd).abd_sgl->length); - n = ABD_SCATTER(abd).abd_nents; - abd_for_each_sg(abd, sg, n, i) { - ASSERT3P(sg_page(sg), !=, NULL); - } - } -} - -static inline abd_t * -abd_alloc_struct(void) -{ - abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE); - - ASSERT3P(abd, !=, NULL); - ABDSTAT_INCR(abdstat_struct_size, sizeof (abd_t)); - - return (abd); -} - -static inline void -abd_free_struct(abd_t *abd) -{ - kmem_cache_free(abd_cache, abd); - ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t)); -} - -/* - * Allocate an ABD, along with its own underlying data buffers. Use this if you - * don't care whether the ABD is linear or not. 
- */ -abd_t * -abd_alloc(size_t size, boolean_t is_metadata) -{ - /* see the comment above zfs_abd_scatter_min_size */ - if (!zfs_abd_scatter_enabled || size < zfs_abd_scatter_min_size) - return (abd_alloc_linear(size, is_metadata)); - - VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); - - abd_t *abd = abd_alloc_struct(); - abd->abd_flags = ABD_FLAG_OWNER; - abd->abd_u.abd_scatter.abd_offset = 0; - abd_alloc_pages(abd, size); - - if (is_metadata) { - abd->abd_flags |= ABD_FLAG_META; - } - abd->abd_size = size; - abd->abd_parent = NULL; - zfs_refcount_create(&abd->abd_children); - - ABDSTAT_BUMP(abdstat_scatter_cnt); - ABDSTAT_INCR(abdstat_scatter_data_size, size); - ABDSTAT_INCR(abdstat_scatter_chunk_waste, - P2ROUNDUP(size, PAGESIZE) - size); - - return (abd); -} - -static void -abd_free_scatter(abd_t *abd) -{ - abd_free_pages(abd); - - zfs_refcount_destroy(&abd->abd_children); - ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); - ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size); - ABDSTAT_INCR(abdstat_scatter_chunk_waste, - (int)abd->abd_size - (int)P2ROUNDUP(abd->abd_size, PAGESIZE)); - - abd_free_struct(abd); -} - -/* - * Allocate an ABD that must be linear, along with its own underlying data - * buffer. Only use this when it would be very annoying to write your ABD - * consumer with a scattered ABD. - */ -abd_t * -abd_alloc_linear(size_t size, boolean_t is_metadata) -{ - abd_t *abd = abd_alloc_struct(); - - VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); - - abd->abd_flags = ABD_FLAG_LINEAR | ABD_FLAG_OWNER; - if (is_metadata) { - abd->abd_flags |= ABD_FLAG_META; - } - abd->abd_size = size; - abd->abd_parent = NULL; - zfs_refcount_create(&abd->abd_children); - - if (is_metadata) { - abd->abd_u.abd_linear.abd_buf = zio_buf_alloc(size); - } else { - abd->abd_u.abd_linear.abd_buf = zio_data_buf_alloc(size); - } - - ABDSTAT_BUMP(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, size); - - return (abd); -} - -static void -abd_free_linear(abd_t *abd) -{ - if (abd_is_linear_page(abd)) { - /* Transform it back into a scatter ABD for freeing */ - struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl; - abd->abd_flags &= ~ABD_FLAG_LINEAR; - abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE; - ABD_SCATTER(abd).abd_nents = 1; - ABD_SCATTER(abd).abd_offset = 0; - ABD_SCATTER(abd).abd_sgl = sg; - abd_free_scatter(abd); - return; - } - if (abd->abd_flags & ABD_FLAG_META) { - zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size); - } else { - zio_data_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size); - } - - zfs_refcount_destroy(&abd->abd_children); - ABDSTAT_BUMPDOWN(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); - - abd_free_struct(abd); -} - -/* - * Free an ABD. Only use this on ABDs allocated with abd_alloc() or - * abd_alloc_linear(). - */ -void -abd_free(abd_t *abd) -{ - abd_verify(abd); - ASSERT3P(abd->abd_parent, ==, NULL); - ASSERT(abd->abd_flags & ABD_FLAG_OWNER); - if (abd_is_linear(abd)) - abd_free_linear(abd); - else - abd_free_scatter(abd); -} - -/* - * Allocate an ABD of the same format (same metadata flag, same scatterize - * setting) as another ABD. 
- */ -abd_t * -abd_alloc_sametype(abd_t *sabd, size_t size) -{ - boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0; - if (abd_is_linear(sabd) && - !abd_is_linear_page(sabd)) { - return (abd_alloc_linear(size, is_metadata)); - } else { - return (abd_alloc(size, is_metadata)); - } -} - -/* - * If we're going to use this ABD for doing I/O using the block layer, the - * consumer of the ABD data doesn't care if it's scattered or not, and we don't - * plan to store this ABD in memory for a long period of time, we should - * allocate the ABD type that requires the least data copying to do the I/O. - * - * On Illumos this is linear ABDs, however if ldi_strategy() can ever issue I/Os - * using a scatter/gather list we should switch to that and replace this call - * with vanilla abd_alloc(). - * - * On Linux the optimal thing to do would be to use abd_get_offset() and - * construct a new ABD which shares the original pages thereby eliminating - * the copy. But for the moment a new linear ABD is allocated until this - * performance optimization can be implemented. - */ -abd_t * -abd_alloc_for_io(size_t size, boolean_t is_metadata) -{ - return (abd_alloc(size, is_metadata)); -} - -/* - * Allocate a new ABD to point to offset off of sabd. It shares the underlying - * buffer data with sabd. Use abd_put() to free. sabd must not be freed while - * any derived ABDs exist. - */ -static inline abd_t * -abd_get_offset_impl(abd_t *sabd, size_t off, size_t size) -{ - abd_t *abd; - - abd_verify(sabd); - ASSERT3U(off, <=, sabd->abd_size); - - if (abd_is_linear(sabd)) { - abd = abd_alloc_struct(); - - /* - * Even if this buf is filesystem metadata, we only track that - * if we own the underlying data buffer, which is not true in - * this case. Therefore, we don't ever use ABD_FLAG_META here. - */ - abd->abd_flags = ABD_FLAG_LINEAR; - - abd->abd_u.abd_linear.abd_buf = - (char *)sabd->abd_u.abd_linear.abd_buf + off; - } else { - int i = 0; - struct scatterlist *sg = NULL; - size_t new_offset = sabd->abd_u.abd_scatter.abd_offset + off; - - abd = abd_alloc_struct(); - - /* - * Even if this buf is filesystem metadata, we only track that - * if we own the underlying data buffer, which is not true in - * this case. Therefore, we don't ever use ABD_FLAG_META here. - */ - abd->abd_flags = 0; - - abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) { - if (new_offset < sg->length) - break; - new_offset -= sg->length; - } - - ABD_SCATTER(abd).abd_sgl = sg; - ABD_SCATTER(abd).abd_offset = new_offset; - ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i; - } - - abd->abd_size = size; - abd->abd_parent = sabd; - zfs_refcount_create(&abd->abd_children); - (void) zfs_refcount_add_many(&sabd->abd_children, abd->abd_size, abd); - - return (abd); -} - -abd_t * -abd_get_offset(abd_t *sabd, size_t off) -{ - size_t size = sabd->abd_size > off ? sabd->abd_size - off : 0; - - VERIFY3U(size, >, 0); - - return (abd_get_offset_impl(sabd, off, size)); -} - -abd_t * -abd_get_offset_size(abd_t *sabd, size_t off, size_t size) -{ - ASSERT3U(off + size, <=, sabd->abd_size); - - return (abd_get_offset_impl(sabd, off, size)); -} - -/* - * Allocate a linear ABD structure for buf. You must free this with abd_put() - * since the resulting ABD doesn't own its own buffer. 
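[Editor's note: the offset-ABD pattern above in sketch form, illustrative only and not part of the commit:]

#include <sys/abd.h>

/*
 * Carve a 4 KB window at offset 64 KB out of a larger ABD. The window
 * shares src's underlying pages; abd_put() must be called on it before
 * src itself is freed.
 */
static void
abd_window_sketch(abd_t *src)	/* assumes src->abd_size >= 68 KB */
{
	abd_t *win = abd_get_offset_size(src, 64 * 1024, 4 * 1024);

	/* ... pass win to code expecting a 4 KB ABD ... */

	abd_put(win);
}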
- */ -abd_t * -abd_get_from_buf(void *buf, size_t size) -{ - abd_t *abd = abd_alloc_struct(); - - VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); - - /* - * Even if this buf is filesystem metadata, we only track that if we - * own the underlying data buffer, which is not true in this case. - * Therefore, we don't ever use ABD_FLAG_META here. - */ - abd->abd_flags = ABD_FLAG_LINEAR; - abd->abd_size = size; - abd->abd_parent = NULL; - zfs_refcount_create(&abd->abd_children); - - abd->abd_u.abd_linear.abd_buf = buf; - - return (abd); -} - -/* - * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not - * free the underlying scatterlist or buffer. - */ -void -abd_put(abd_t *abd) -{ - abd_verify(abd); - ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); - - if (abd->abd_parent != NULL) { - (void) zfs_refcount_remove_many(&abd->abd_parent->abd_children, - abd->abd_size, abd); - } - - zfs_refcount_destroy(&abd->abd_children); - abd_free_struct(abd); -} - -/* - * Get the raw buffer associated with a linear ABD. - */ -void * -abd_to_buf(abd_t *abd) -{ - ASSERT(abd_is_linear(abd)); - abd_verify(abd); - return (abd->abd_u.abd_linear.abd_buf); -} - -/* - * Borrow a raw buffer from an ABD without copying the contents of the ABD - * into the buffer. If the ABD is scattered, this will allocate a raw buffer - * whose contents are undefined. To copy over the existing data in the ABD, use - * abd_borrow_buf_copy() instead. - */ -void * -abd_borrow_buf(abd_t *abd, size_t n) -{ - void *buf; - abd_verify(abd); - ASSERT3U(abd->abd_size, >=, n); - if (abd_is_linear(abd)) { - buf = abd_to_buf(abd); - } else { - buf = zio_buf_alloc(n); - } - (void) zfs_refcount_add_many(&abd->abd_children, n, buf); - - return (buf); -} - -void * -abd_borrow_buf_copy(abd_t *abd, size_t n) -{ - void *buf = abd_borrow_buf(abd, n); - if (!abd_is_linear(abd)) { - abd_copy_to_buf(buf, abd, n); - } - return (buf); -} - -/* - * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will - * not change the contents of the ABD and will ASSERT that you didn't modify - * the buffer since it was borrowed. If you want any changes you made to buf to - * be copied back to abd, use abd_return_buf_copy() instead. - */ -void -abd_return_buf(abd_t *abd, void *buf, size_t n) -{ - abd_verify(abd); - ASSERT3U(abd->abd_size, >=, n); - if (abd_is_linear(abd)) { - ASSERT3P(buf, ==, abd_to_buf(abd)); - } else { - ASSERT0(abd_cmp_buf(abd, buf, n)); - zio_buf_free(buf, n); - } - (void) zfs_refcount_remove_many(&abd->abd_children, n, buf); -} - -void -abd_return_buf_copy(abd_t *abd, void *buf, size_t n) -{ - if (!abd_is_linear(abd)) { - abd_copy_from_buf(abd, buf, n); - } - abd_return_buf(abd, buf, n); -} - -/* - * Give this ABD ownership of the buffer that it's storing. Can only be used on - * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated - * with abd_alloc_linear() which subsequently released ownership of their buf - * with abd_release_ownership_of_buf(). - */ -void -abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata) -{ - ASSERT(abd_is_linear(abd)); - ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); - abd_verify(abd); - - abd->abd_flags |= ABD_FLAG_OWNER; - if (is_metadata) { - abd->abd_flags |= ABD_FLAG_META; - } - - ABDSTAT_BUMP(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size); -} - -void -abd_release_ownership_of_buf(abd_t *abd) -{ - ASSERT(abd_is_linear(abd)); - ASSERT(abd->abd_flags & ABD_FLAG_OWNER); - - /* - * abd_free() needs to handle LINEAR_PAGE ABD's specially. 
- * Since that flag does not survive the - * abd_release_ownership_of_buf() -> abd_get_from_buf() -> - * abd_take_ownership_of_buf() sequence, we don't allow releasing - * these "linear but not zio_[data_]buf_alloc()'ed" ABD's. - */ - ASSERT(!abd_is_linear_page(abd)); - - abd_verify(abd); - - abd->abd_flags &= ~ABD_FLAG_OWNER; - /* Disable this flag since we no longer own the data buffer */ - abd->abd_flags &= ~ABD_FLAG_META; - - ABDSTAT_BUMPDOWN(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); -} - -#ifndef HAVE_1ARG_KMAP_ATOMIC -#define NR_KM_TYPE (6) -#ifdef _KERNEL -int km_table[NR_KM_TYPE] = { - KM_USER0, - KM_USER1, - KM_BIO_SRC_IRQ, - KM_BIO_DST_IRQ, - KM_PTE0, - KM_PTE1, -}; -#endif -#endif - -struct abd_iter { - /* public interface */ - void *iter_mapaddr; /* addr corresponding to iter_pos */ - size_t iter_mapsize; /* length of data valid at mapaddr */ - - /* private */ - abd_t *iter_abd; /* ABD being iterated through */ - size_t iter_pos; - size_t iter_offset; /* offset in current sg/abd_buf, */ - /* abd_offset included */ - struct scatterlist *iter_sg; /* current sg */ -#ifndef HAVE_1ARG_KMAP_ATOMIC - int iter_km; /* KM_* for kmap_atomic */ -#endif -}; - -/* - * Initialize the abd_iter. - */ -static void -abd_iter_init(struct abd_iter *aiter, abd_t *abd, int km_type) -{ - abd_verify(abd); - aiter->iter_abd = abd; - aiter->iter_mapaddr = NULL; - aiter->iter_mapsize = 0; - aiter->iter_pos = 0; - if (abd_is_linear(abd)) { - aiter->iter_offset = 0; - aiter->iter_sg = NULL; - } else { - aiter->iter_offset = ABD_SCATTER(abd).abd_offset; - aiter->iter_sg = ABD_SCATTER(abd).abd_sgl; - } -#ifndef HAVE_1ARG_KMAP_ATOMIC - ASSERT3U(km_type, <, NR_KM_TYPE); - aiter->iter_km = km_type; -#endif -} - -/* - * Advance the iterator by a certain amount. Cannot be called when a chunk is - * in use. This can be safely called when the aiter has already exhausted, in - * which case this does nothing. - */ -static void -abd_iter_advance(struct abd_iter *aiter, size_t amount) -{ - ASSERT3P(aiter->iter_mapaddr, ==, NULL); - ASSERT0(aiter->iter_mapsize); - - /* There's nothing left to advance to, so do nothing */ - if (aiter->iter_pos == aiter->iter_abd->abd_size) - return; - - aiter->iter_pos += amount; - aiter->iter_offset += amount; - if (!abd_is_linear(aiter->iter_abd)) { - while (aiter->iter_offset >= aiter->iter_sg->length) { - aiter->iter_offset -= aiter->iter_sg->length; - aiter->iter_sg = sg_next(aiter->iter_sg); - if (aiter->iter_sg == NULL) { - ASSERT0(aiter->iter_offset); - break; - } - } - } -} - -/* - * Map the current chunk into aiter. This can be safely called when the aiter - * has already exhausted, in which case this does nothing. 
- */ -static void -abd_iter_map(struct abd_iter *aiter) -{ - void *paddr; - size_t offset = 0; - - ASSERT3P(aiter->iter_mapaddr, ==, NULL); - ASSERT0(aiter->iter_mapsize); - - /* There's nothing left to iterate over, so do nothing */ - if (aiter->iter_pos == aiter->iter_abd->abd_size) - return; - - if (abd_is_linear(aiter->iter_abd)) { - ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset); - offset = aiter->iter_offset; - aiter->iter_mapsize = aiter->iter_abd->abd_size - offset; - paddr = aiter->iter_abd->abd_u.abd_linear.abd_buf; - } else { - offset = aiter->iter_offset; - aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset, - aiter->iter_abd->abd_size - aiter->iter_pos); - - paddr = zfs_kmap_atomic(sg_page(aiter->iter_sg), - km_table[aiter->iter_km]); - } - - aiter->iter_mapaddr = (char *)paddr + offset; -} - -/* - * Unmap the current chunk from aiter. This can be safely called when the aiter - * has already exhausted, in which case this does nothing. - */ -static void -abd_iter_unmap(struct abd_iter *aiter) -{ - /* There's nothing left to unmap, so do nothing */ - if (aiter->iter_pos == aiter->iter_abd->abd_size) - return; - - if (!abd_is_linear(aiter->iter_abd)) { - /* LINTED E_FUNC_SET_NOT_USED */ - zfs_kunmap_atomic(aiter->iter_mapaddr - aiter->iter_offset, - km_table[aiter->iter_km]); - } - - ASSERT3P(aiter->iter_mapaddr, !=, NULL); - ASSERT3U(aiter->iter_mapsize, >, 0); - - aiter->iter_mapaddr = NULL; - aiter->iter_mapsize = 0; -} - -int -abd_iterate_func(abd_t *abd, size_t off, size_t size, - abd_iter_func_t *func, void *private) -{ - int ret = 0; - struct abd_iter aiter; - - abd_verify(abd); - ASSERT3U(off + size, <=, abd->abd_size); - - abd_iter_init(&aiter, abd, 0); - abd_iter_advance(&aiter, off); - - while (size > 0) { - abd_iter_map(&aiter); - - size_t len = MIN(aiter.iter_mapsize, size); - ASSERT3U(len, >, 0); - - ret = func(aiter.iter_mapaddr, len, private); - - abd_iter_unmap(&aiter); - - if (ret != 0) - break; - - size -= len; - abd_iter_advance(&aiter, len); - } - - return (ret); -} - -struct buf_arg { - void *arg_buf; -}; - -static int -abd_copy_to_buf_off_cb(void *buf, size_t size, void *private) -{ - struct buf_arg *ba_ptr = private; - - (void) memcpy(ba_ptr->arg_buf, buf, size); - ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; - - return (0); -} - -/* - * Copy abd to buf. (off is the offset in abd.) - */ -void -abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size) -{ - struct buf_arg ba_ptr = { buf }; - - (void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb, - &ba_ptr); -} - -static int -abd_cmp_buf_off_cb(void *buf, size_t size, void *private) -{ - int ret; - struct buf_arg *ba_ptr = private; - - ret = memcmp(buf, ba_ptr->arg_buf, size); - ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; - - return (ret); -} - -/* - * Compare the contents of abd to buf. (off is the offset in abd.) - */ -int -abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size) -{ - struct buf_arg ba_ptr = { (void *) buf }; - - return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr)); -} - -static int -abd_copy_from_buf_off_cb(void *buf, size_t size, void *private) -{ - struct buf_arg *ba_ptr = private; - - (void) memcpy(buf, ba_ptr->arg_buf, size); - ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; - - return (0); -} - -/* - * Copy from buf to abd. (off is the offset in abd.) 
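[Editor's note: the copy/compare/zero helpers in this file are all thin wrappers over abd_iterate_func(); a custom consumer follows the same shape. A hypothetical callback, not in this file, that counts zero bytes:]

#include <sys/abd.h>

/* Called once per mapped chunk; returning non-zero aborts the walk. */
static int
count_zeroes_cb(void *buf, size_t size, void *private)
{
	uint64_t *zeroes = private;
	unsigned char *p = buf;

	for (size_t i = 0; i < size; i++) {
		if (p[i] == 0)
			(*zeroes)++;
	}
	return (0);
}

/* Usage: abd_iterate_func(abd, 0, abd->abd_size, count_zeroes_cb, &n); */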
- */ -void -abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size) -{ - struct buf_arg ba_ptr = { (void *) buf }; - - (void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb, - &ba_ptr); -} - -/*ARGSUSED*/ -static int -abd_zero_off_cb(void *buf, size_t size, void *private) -{ - (void) memset(buf, 0, size); - return (0); -} - -/* - * Zero out the abd from a particular offset to the end. - */ -void -abd_zero_off(abd_t *abd, size_t off, size_t size) -{ - (void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL); -} - -/* - * Iterate over two ABDs and call func incrementally on the two ABDs' data in - * equal-sized chunks (passed to func as raw buffers). func could be called many - * times during this iteration. - */ -int -abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, - size_t size, abd_iter_func2_t *func, void *private) -{ - int ret = 0; - struct abd_iter daiter, saiter; - - abd_verify(dabd); - abd_verify(sabd); - - ASSERT3U(doff + size, <=, dabd->abd_size); - ASSERT3U(soff + size, <=, sabd->abd_size); - - abd_iter_init(&daiter, dabd, 0); - abd_iter_init(&saiter, sabd, 1); - abd_iter_advance(&daiter, doff); - abd_iter_advance(&saiter, soff); - - while (size > 0) { - abd_iter_map(&daiter); - abd_iter_map(&saiter); - - size_t dlen = MIN(daiter.iter_mapsize, size); - size_t slen = MIN(saiter.iter_mapsize, size); - size_t len = MIN(dlen, slen); - ASSERT(dlen > 0 || slen > 0); - - ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len, - private); - - abd_iter_unmap(&saiter); - abd_iter_unmap(&daiter); - - if (ret != 0) - break; - - size -= len; - abd_iter_advance(&daiter, len); - abd_iter_advance(&saiter, len); - } - - return (ret); -} - -/*ARGSUSED*/ -static int -abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private) -{ - (void) memcpy(dbuf, sbuf, size); - return (0); -} - -/* - * Copy from sabd to dabd starting from soff and doff. - */ -void -abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size) -{ - (void) abd_iterate_func2(dabd, sabd, doff, soff, size, - abd_copy_off_cb, NULL); -} - -/*ARGSUSED*/ -static int -abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private) -{ - return (memcmp(bufa, bufb, size)); -} - -/* - * Compares the contents of two ABDs. - */ -int -abd_cmp(abd_t *dabd, abd_t *sabd) -{ - ASSERT3U(dabd->abd_size, ==, sabd->abd_size); - return (abd_iterate_func2(dabd, sabd, 0, 0, dabd->abd_size, - abd_cmp_cb, NULL)); -} - -/* - * Iterate over code ABDs and a data ABD and call @func_raidz_gen. - * - * @cabds parity ABDs, must have equal size - * @dabd data ABD. 
Can be NULL (in this case @dsize = 0) - * @func_raidz_gen should be implemented so that its behaviour - * is the same when taking linear and when taking scatter - */ -void -abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, - ssize_t csize, ssize_t dsize, const unsigned parity, - void (*func_raidz_gen)(void **, const void *, size_t, size_t)) -{ - int i; - ssize_t len, dlen; - struct abd_iter caiters[3]; - struct abd_iter daiter = {0}; - void *caddrs[3]; - unsigned long flags; - - ASSERT3U(parity, <=, 3); - - for (i = 0; i < parity; i++) - abd_iter_init(&caiters[i], cabds[i], i); - - if (dabd) - abd_iter_init(&daiter, dabd, i); - - ASSERT3S(dsize, >=, 0); - - local_irq_save(flags); - while (csize > 0) { - len = csize; - - if (dabd && dsize > 0) - abd_iter_map(&daiter); - - for (i = 0; i < parity; i++) { - abd_iter_map(&caiters[i]); - caddrs[i] = caiters[i].iter_mapaddr; - } - - switch (parity) { - case 3: - len = MIN(caiters[2].iter_mapsize, len); - /* falls through */ - case 2: - len = MIN(caiters[1].iter_mapsize, len); - /* falls through */ - case 1: - len = MIN(caiters[0].iter_mapsize, len); - } - - /* must be progressive */ - ASSERT3S(len, >, 0); - - if (dabd && dsize > 0) { - /* this needs precise iter.length */ - len = MIN(daiter.iter_mapsize, len); - dlen = len; - } else - dlen = 0; - - /* must be progressive */ - ASSERT3S(len, >, 0); - /* - * The iterated function likely will not do well if each - * segment except the last one is not multiple of 512 (raidz). - */ - ASSERT3U(((uint64_t)len & 511ULL), ==, 0); - - func_raidz_gen(caddrs, daiter.iter_mapaddr, len, dlen); - - for (i = parity-1; i >= 0; i--) { - abd_iter_unmap(&caiters[i]); - abd_iter_advance(&caiters[i], len); - } - - if (dabd && dsize > 0) { - abd_iter_unmap(&daiter); - abd_iter_advance(&daiter, dlen); - dsize -= dlen; - } - - csize -= len; - - ASSERT3S(dsize, >=, 0); - ASSERT3S(csize, >=, 0); - } - local_irq_restore(flags); -} - -/* - * Iterate over code ABDs and data reconstruction target ABDs and call - * @func_raidz_rec. Function maps at most 6 pages atomically. - * - * @cabds parity ABDs, must have equal size - * @tabds rec target ABDs, at most 3 - * @tsize size of data target columns - * @func_raidz_rec expects syndrome data in target columns. Function - * reconstructs data and overwrites target columns. 
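[Editor's note: for reference, a single-parity (XOR) generator matching the func_raidz_gen signature taken by abd_raidz_gen_iterate() above might look like the sketch below; the real implementations live in the vdev_raidz_math_* objects and this is illustrative only:]

#include <sys/types.h>

/*
 * Sketch: XOR dlen bytes of one data segment into the first parity
 * column. caddrs[0] is the P column; per the 512-byte segment
 * assertions above, whole-word strides are safe.
 */
static void
raidz_gen_xor_sketch(void **caddrs, const void *data, size_t len,
    size_t dlen)
{
	uint64_t *p = caddrs[0];
	const uint64_t *d = data;

	(void) len;	/* bytes in [dlen, len) belong to a short data column */
	for (size_t i = 0; i < dlen / sizeof (uint64_t); i++)
		p[i] ^= d[i];
}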
- */ -void -abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, - ssize_t tsize, const unsigned parity, - void (*func_raidz_rec)(void **t, const size_t tsize, void **c, - const unsigned *mul), - const unsigned *mul) -{ - int i; - ssize_t len; - struct abd_iter citers[3]; - struct abd_iter xiters[3]; - void *caddrs[3], *xaddrs[3]; - unsigned long flags; - - ASSERT3U(parity, <=, 3); - - for (i = 0; i < parity; i++) { - abd_iter_init(&citers[i], cabds[i], 2*i); - abd_iter_init(&xiters[i], tabds[i], 2*i+1); - } - - local_irq_save(flags); - while (tsize > 0) { - - for (i = 0; i < parity; i++) { - abd_iter_map(&citers[i]); - abd_iter_map(&xiters[i]); - caddrs[i] = citers[i].iter_mapaddr; - xaddrs[i] = xiters[i].iter_mapaddr; - } - - len = tsize; - switch (parity) { - case 3: - len = MIN(xiters[2].iter_mapsize, len); - len = MIN(citers[2].iter_mapsize, len); - /* falls through */ - case 2: - len = MIN(xiters[1].iter_mapsize, len); - len = MIN(citers[1].iter_mapsize, len); - /* falls through */ - case 1: - len = MIN(xiters[0].iter_mapsize, len); - len = MIN(citers[0].iter_mapsize, len); - } - /* must be progressive */ - ASSERT3S(len, >, 0); - /* - * The iterated function likely will not do well if each - * segment except the last one is not multiple of 512 (raidz). - */ - ASSERT3U(((uint64_t)len & 511ULL), ==, 0); - - func_raidz_rec(xaddrs, len, caddrs, mul); - - for (i = parity-1; i >= 0; i--) { - abd_iter_unmap(&xiters[i]); - abd_iter_unmap(&citers[i]); - abd_iter_advance(&xiters[i], len); - abd_iter_advance(&citers[i], len); - } - - tsize -= len; - ASSERT3S(tsize, >=, 0); - } - local_irq_restore(flags); -} - -#if defined(_KERNEL) -/* - * bio_nr_pages for ABD. - * @off is the offset in @abd - */ -unsigned long -abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off) -{ - unsigned long pos; - - if (abd_is_linear(abd)) - pos = (unsigned long)abd_to_buf(abd) + off; - else - pos = abd->abd_u.abd_scatter.abd_offset + off; - - return ((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) - - (pos >> PAGE_SHIFT); -} - -/* - * bio_map for scatter ABD. 
- * @off is the offset in @abd - * Remaining IO size is returned - */ -unsigned int -abd_scatter_bio_map_off(struct bio *bio, abd_t *abd, - unsigned int io_size, size_t off) -{ - int i; - struct abd_iter aiter; - - ASSERT(!abd_is_linear(abd)); - ASSERT3U(io_size, <=, abd->abd_size - off); - - abd_iter_init(&aiter, abd, 0); - abd_iter_advance(&aiter, off); - - for (i = 0; i < bio->bi_max_vecs; i++) { - struct page *pg; - size_t len, sgoff, pgoff; - struct scatterlist *sg; - - if (io_size <= 0) - break; - - sg = aiter.iter_sg; - sgoff = aiter.iter_offset; - pgoff = sgoff & (PAGESIZE - 1); - len = MIN(io_size, PAGESIZE - pgoff); - ASSERT(len > 0); - - pg = nth_page(sg_page(sg), sgoff >> PAGE_SHIFT); - if (bio_add_page(bio, pg, len, pgoff) != len) - break; - - io_size -= len; - abd_iter_advance(&aiter, len); - } - - return (io_size); -} - -/* Tunable Parameters */ -module_param(zfs_abd_scatter_enabled, int, 0644); -MODULE_PARM_DESC(zfs_abd_scatter_enabled, - "Toggle whether ABD allocations must be linear."); -module_param(zfs_abd_scatter_min_size, int, 0644); -MODULE_PARM_DESC(zfs_abd_scatter_min_size, - "Minimum size of scatter allocations."); -/* CSTYLED */ -module_param(zfs_abd_scatter_max_order, uint, 0644); -MODULE_PARM_DESC(zfs_abd_scatter_max_order, - "Maximum order allocation used for a scatter ABD."); -#endif diff --git a/module/zfs/gzip.c b/module/zfs/gzip.c index 5cac2a7de..9d8af3228 100644 --- a/module/zfs/gzip.c +++ b/module/zfs/gzip.c @@ -29,7 +29,7 @@ #include <sys/debug.h> #include <sys/types.h> #include <sys/strings.h> -#include "qat.h" +#include <sys/qat.h> #ifdef _KERNEL diff --git a/module/zfs/policy.c b/module/zfs/policy.c deleted file mode 100644 index 7f9456a67..000000000 --- a/module/zfs/policy.c +++ /dev/null @@ -1,355 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2013, Joyent, Inc. All rights reserved. - * Copyright (C) 2016 Lawrence Livermore National Security, LLC. - * - * For Linux the vast majority of this enforcement is already handled via - * the standard Linux VFS permission checks. However certain administrative - * commands which bypass the standard mechanisms may need to make use of - * this functionality. - */ - -#include <sys/policy.h> -#include <linux/security.h> -#include <linux/vfs_compat.h> - -/* - * The passed credentials cannot be directly verified because Linux only - * provides and interface to check the *current* process credentials. In - * order to handle this the capable() test is only run when the passed - * credentials match the current process credentials or the kcred. In - * all other cases this function must fail and return the passed err. 
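[Editor's note: these checks typically gate administrative entry points. An illustrative call-site pattern, not from this commit, using secpolicy_zfs() which is defined later in this file:]

#include <sys/policy.h>

/* Sketch: an administrative operation gated on CAP_SYS_ADMIN. */
static int
zfs_admin_op_sketch(cred_t *cr)
{
	int error;

	if ((error = secpolicy_zfs(cr)) != 0)
		return (error);	/* EACCES unless the caller is capable */

	/* ... privileged work ... */
	return (0);
}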
- */ -static int -priv_policy_ns(const cred_t *cr, int capability, boolean_t all, int err, - struct user_namespace *ns) -{ - ASSERT3S(all, ==, B_FALSE); - - if (cr != CRED() && (cr != kcred)) - return (err); - -#if defined(CONFIG_USER_NS) && defined(HAVE_NS_CAPABLE) - if (!(ns ? ns_capable(ns, capability) : capable(capability))) -#else - if (!capable(capability)) -#endif - return (err); - - return (0); -} - -static int -priv_policy(const cred_t *cr, int capability, boolean_t all, int err) -{ - return (priv_policy_ns(cr, capability, all, err, NULL)); -} - -static int -priv_policy_user(const cred_t *cr, int capability, boolean_t all, int err) -{ - /* - * All priv_policy_user checks are preceded by kuid/kgid_has_mapping() - * checks. If we cannot do them, we shouldn't be using ns_capable() - * since we don't know whether the affected files are valid in our - * namespace. Note that kuid_has_mapping() came after cred->user_ns, so - * we shouldn't need to re-check for HAVE_CRED_USER_NS - */ -#if defined(CONFIG_USER_NS) && defined(HAVE_KUID_HAS_MAPPING) - return (priv_policy_ns(cr, capability, all, err, cr->user_ns)); -#else - return (priv_policy_ns(cr, capability, all, err, NULL)); -#endif -} - -/* - * Checks for operations that are either client-only or are used by - * both clients and servers. - */ -int -secpolicy_nfs(const cred_t *cr) -{ - return (priv_policy(cr, CAP_SYS_ADMIN, B_FALSE, EPERM)); -} - -/* - * Catch all system configuration. - */ -int -secpolicy_sys_config(const cred_t *cr, boolean_t checkonly) -{ - return (priv_policy(cr, CAP_SYS_ADMIN, B_FALSE, EPERM)); -} - -/* - * Like secpolicy_vnode_access() but we get the actual wanted mode and the - * current mode of the file, not the missing bits. - * - * Enforced in the Linux VFS. - */ -int -secpolicy_vnode_access2(const cred_t *cr, struct inode *ip, uid_t owner, - mode_t curmode, mode_t wantmode) -{ - return (0); -} - -/* - * This is a special routine for ZFS; it is used to determine whether - * any of the privileges in effect allow any form of access to the - * file. There's no reason to audit this or any reason to record - * this. More work is needed to do the "KPLD" stuff. - */ -int -secpolicy_vnode_any_access(const cred_t *cr, struct inode *ip, uid_t owner) -{ - if (crgetfsuid(cr) == owner) - return (0); - - if (zpl_inode_owner_or_capable(ip)) - return (0); - -#if defined(CONFIG_USER_NS) && defined(HAVE_KUID_HAS_MAPPING) - if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner))) - return (EPERM); -#endif - - if (priv_policy_user(cr, CAP_DAC_OVERRIDE, B_FALSE, EPERM) == 0) - return (0); - - if (priv_policy_user(cr, CAP_DAC_READ_SEARCH, B_FALSE, EPERM) == 0) - return (0); - - return (EPERM); -} - -/* - * Determine if subject can chown owner of a file. - */ -int -secpolicy_vnode_chown(const cred_t *cr, uid_t owner) -{ - if (crgetfsuid(cr) == owner) - return (0); - -#if defined(CONFIG_USER_NS) && defined(HAVE_KUID_HAS_MAPPING) - if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner))) - return (EPERM); -#endif - - return (priv_policy_user(cr, CAP_FOWNER, B_FALSE, EPERM)); -} - -/* - * Determine if subject can change group ownership of a file. - */ -int -secpolicy_vnode_create_gid(const cred_t *cr) -{ - return (priv_policy(cr, CAP_SETGID, B_FALSE, EPERM)); -} - -/* - * Policy determines whether we can remove an entry from a directory, - * regardless of permission bits. 
- */ -int -secpolicy_vnode_remove(const cred_t *cr) -{ - return (priv_policy(cr, CAP_FOWNER, B_FALSE, EPERM)); -} - -/* - * Determine that subject can modify the mode of a file. allzone privilege - * needed when modifying root owned object. - */ -int -secpolicy_vnode_setdac(const cred_t *cr, uid_t owner) -{ - if (crgetfsuid(cr) == owner) - return (0); - -#if defined(CONFIG_USER_NS) && defined(HAVE_KUID_HAS_MAPPING) - if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner))) - return (EPERM); -#endif - - return (priv_policy_user(cr, CAP_FOWNER, B_FALSE, EPERM)); -} - -/* - * Are we allowed to retain the set-uid/set-gid bits when - * changing ownership or when writing to a file? - * "issuid" should be true when set-uid; only in that case - * root ownership is checked (setgid is assumed). - * - * Enforced in the Linux VFS. - */ -int -secpolicy_vnode_setid_retain(const cred_t *cr, boolean_t issuidroot) -{ - return (priv_policy_user(cr, CAP_FSETID, B_FALSE, EPERM)); -} - -/* - * Determine that subject can set the file setgid flag. - */ -int -secpolicy_vnode_setids_setgids(const cred_t *cr, gid_t gid) -{ -#if defined(CONFIG_USER_NS) && defined(HAVE_KUID_HAS_MAPPING) - if (!kgid_has_mapping(cr->user_ns, SGID_TO_KGID(gid))) - return (EPERM); -#endif - if (crgetfsgid(cr) != gid && !groupmember(gid, cr)) - return (priv_policy_user(cr, CAP_FSETID, B_FALSE, EPERM)); - - return (0); -} - -/* - * Determine if the subject can inject faults in the ZFS fault injection - * framework. Requires all privileges. - */ -int -secpolicy_zinject(const cred_t *cr) -{ - return (priv_policy(cr, CAP_SYS_ADMIN, B_FALSE, EACCES)); -} - -/* - * Determine if the subject has permission to manipulate ZFS datasets - * (not pools). Equivalent to the SYS_MOUNT privilege. - */ -int -secpolicy_zfs(const cred_t *cr) -{ - return (priv_policy(cr, CAP_SYS_ADMIN, B_FALSE, EACCES)); -} - -void -secpolicy_setid_clear(vattr_t *vap, cred_t *cr) -{ - if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0 && - secpolicy_vnode_setid_retain(cr, - (vap->va_mode & S_ISUID) != 0 && - (vap->va_mask & AT_UID) != 0 && vap->va_uid == 0) != 0) { - vap->va_mask |= AT_MODE; - vap->va_mode &= ~(S_ISUID|S_ISGID); - } -} - -/* - * Determine that subject can set the file setid flags. - */ -static int -secpolicy_vnode_setid_modify(const cred_t *cr, uid_t owner) -{ - if (crgetfsuid(cr) == owner) - return (0); - -#if defined(CONFIG_USER_NS) && defined(HAVE_KUID_HAS_MAPPING) - if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner))) - return (EPERM); -#endif - - return (priv_policy_user(cr, CAP_FSETID, B_FALSE, EPERM)); -} - -/* - * Determine that subject can make a file a "sticky". - * - * Enforced in the Linux VFS. - */ -static int -secpolicy_vnode_stky_modify(const cred_t *cr) -{ - return (0); -} - -int -secpolicy_setid_setsticky_clear(struct inode *ip, vattr_t *vap, - const vattr_t *ovap, cred_t *cr) -{ - int error; - - if ((vap->va_mode & S_ISUID) != 0 && - (error = secpolicy_vnode_setid_modify(cr, - ovap->va_uid)) != 0) { - return (error); - } - - /* - * Check privilege if attempting to set the - * sticky bit on a non-directory. - */ - if (!S_ISDIR(ip->i_mode) && (vap->va_mode & S_ISVTX) != 0 && - secpolicy_vnode_stky_modify(cr) != 0) { - vap->va_mode &= ~S_ISVTX; - } - - /* - * Check for privilege if attempting to set the - * group-id bit. 
- */ - if ((vap->va_mode & S_ISGID) != 0 && - secpolicy_vnode_setids_setgids(cr, ovap->va_gid) != 0) { - vap->va_mode &= ~S_ISGID; - } - - return (0); -} - -/* - * Check privileges for setting xvattr attributes - */ -int -secpolicy_xvattr(xvattr_t *xvap, uid_t owner, cred_t *cr, vtype_t vtype) -{ - return (secpolicy_vnode_chown(cr, owner)); -} - -/* - * Check privileges for setattr attributes. - * - * Enforced in the Linux VFS. - */ -int -secpolicy_vnode_setattr(cred_t *cr, struct inode *ip, struct vattr *vap, - const struct vattr *ovap, int flags, - int unlocked_access(void *, int, cred_t *), void *node) -{ - return (0); -} - -/* - * Check privileges for links. - * - * Enforced in the Linux VFS. - */ -int -secpolicy_basic_link(const cred_t *cr) -{ - return (0); -} diff --git a/module/zfs/qat.c b/module/zfs/qat.c deleted file mode 100644 index a6f024cb4..000000000 --- a/module/zfs/qat.c +++ /dev/null @@ -1,105 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -#if defined(_KERNEL) && defined(HAVE_QAT) -#include <sys/zfs_context.h> -#include "qat.h" - -qat_stats_t qat_stats = { - { "comp_requests", KSTAT_DATA_UINT64 }, - { "comp_total_in_bytes", KSTAT_DATA_UINT64 }, - { "comp_total_out_bytes", KSTAT_DATA_UINT64 }, - { "decomp_requests", KSTAT_DATA_UINT64 }, - { "decomp_total_in_bytes", KSTAT_DATA_UINT64 }, - { "decomp_total_out_bytes", KSTAT_DATA_UINT64 }, - { "dc_fails", KSTAT_DATA_UINT64 }, - { "encrypt_requests", KSTAT_DATA_UINT64 }, - { "encrypt_total_in_bytes", KSTAT_DATA_UINT64 }, - { "encrypt_total_out_bytes", KSTAT_DATA_UINT64 }, - { "decrypt_requests", KSTAT_DATA_UINT64 }, - { "decrypt_total_in_bytes", KSTAT_DATA_UINT64 }, - { "decrypt_total_out_bytes", KSTAT_DATA_UINT64 }, - { "crypt_fails", KSTAT_DATA_UINT64 }, - { "cksum_requests", KSTAT_DATA_UINT64 }, - { "cksum_total_in_bytes", KSTAT_DATA_UINT64 }, - { "cksum_fails", KSTAT_DATA_UINT64 }, -}; - -static kstat_t *qat_ksp = NULL; - -CpaStatus -qat_mem_alloc_contig(void **pp_mem_addr, Cpa32U size_bytes) -{ - *pp_mem_addr = kmalloc(size_bytes, GFP_KERNEL); - if (*pp_mem_addr == NULL) - return (CPA_STATUS_RESOURCE); - return (CPA_STATUS_SUCCESS); -} - -void -qat_mem_free_contig(void **pp_mem_addr) -{ - if (*pp_mem_addr != NULL) { - kfree(*pp_mem_addr); - *pp_mem_addr = NULL; - } -} - -int -qat_init(void) -{ - qat_ksp = kstat_create("zfs", 0, "qat", "misc", - KSTAT_TYPE_NAMED, sizeof (qat_stats) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL); - if (qat_ksp != NULL) { - qat_ksp->ks_data = &qat_stats; - kstat_install(qat_ksp); - } - - /* - * Just set the disable flag when qat init failed, qat can be - * turned on again in post-process after zfs module is loaded, e.g.: - * echo 0 > /sys/module/zfs/parameters/zfs_qat_compress_disable - */ - if 
(qat_dc_init() != 0) - zfs_qat_compress_disable = 1; - - if (qat_cy_init() != 0) { - zfs_qat_checksum_disable = 1; - zfs_qat_encrypt_disable = 1; - } - - return (0); -} - -void -qat_fini(void) -{ - if (qat_ksp != NULL) { - kstat_delete(qat_ksp); - qat_ksp = NULL; - } - - qat_cy_fini(); - qat_dc_fini(); -} - -#endif diff --git a/module/zfs/qat.h b/module/zfs/qat.h deleted file mode 100644 index fdd608139..000000000 --- a/module/zfs/qat.h +++ /dev/null @@ -1,204 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -#ifndef _SYS_QAT_H -#define _SYS_QAT_H - -typedef enum qat_compress_dir { - QAT_DECOMPRESS = 0, - QAT_COMPRESS = 1, -} qat_compress_dir_t; - -typedef enum qat_encrypt_dir { - QAT_DECRYPT = 0, - QAT_ENCRYPT = 1, -} qat_encrypt_dir_t; - - -#if defined(_KERNEL) && defined(HAVE_QAT) -#include <sys/zio.h> -#include <sys/crypto/api.h> -#include "cpa.h" -#include "dc/cpa_dc.h" -#include "lac/cpa_cy_sym.h" - -/* - * Timeout - no response from hardware after 0.5 seconds - */ -#define QAT_TIMEOUT_MS 500 - -/* - * The minimal and maximal buffer size which are not restricted - * in the QAT hardware, but with the input buffer size between 4KB - * and 128KB the hardware can provide the optimal performance. - */ -#define QAT_MIN_BUF_SIZE (4*1024) -#define QAT_MAX_BUF_SIZE (128*1024) - -/* - * Used for QAT kstat. - */ -typedef struct qat_stats { - /* - * Number of jobs submitted to QAT compression engine. - */ - kstat_named_t comp_requests; - /* - * Total bytes sent to QAT compression engine. - */ - kstat_named_t comp_total_in_bytes; - /* - * Total bytes output from QAT compression engine. - */ - kstat_named_t comp_total_out_bytes; - /* - * Number of jobs submitted to QAT de-compression engine. - */ - kstat_named_t decomp_requests; - /* - * Total bytes sent to QAT de-compression engine. - */ - kstat_named_t decomp_total_in_bytes; - /* - * Total bytes output from QAT de-compression engine. - */ - kstat_named_t decomp_total_out_bytes; - /* - * Number of fails in the QAT compression / decompression engine. - * Note: when a QAT error happens, it doesn't necessarily indicate a - * critical hardware issue. Sometimes it is because the output buffer - * is not big enough. The compression job will be transferred to the - * gzip software implementation so the functionality of ZFS is not - * impacted. - */ - kstat_named_t dc_fails; - - /* - * Number of jobs submitted to QAT encryption engine. - */ - kstat_named_t encrypt_requests; - /* - * Total bytes sent to QAT encryption engine. - */ - kstat_named_t encrypt_total_in_bytes; - /* - * Total bytes output from QAT encryption engine. - */ - kstat_named_t encrypt_total_out_bytes; - /* - * Number of jobs submitted to QAT decryption engine. 
- */ - kstat_named_t decrypt_requests; - /* - * Total bytes sent to QAT decryption engine. - */ - kstat_named_t decrypt_total_in_bytes; - /* - * Total bytes output from QAT decryption engine. - */ - kstat_named_t decrypt_total_out_bytes; - /* - * Number of fails in the QAT encryption / decryption engine. - * Note: when a QAT error happens, it doesn't necessarily indicate a - * critical hardware issue. The encryption job will be transferred - * to the software implementation so the functionality of ZFS is - * not impacted. - */ - kstat_named_t crypt_fails; - - /* - * Number of jobs submitted to QAT checksum engine. - */ - kstat_named_t cksum_requests; - /* - * Total bytes sent to QAT checksum engine. - */ - kstat_named_t cksum_total_in_bytes; - /* - * Number of fails in the QAT checksum engine. - * Note: when a QAT error happens, it doesn't necessarily indicate a - * critical hardware issue. The checksum job will be transferred to the - * software implementation so the functionality of ZFS is not impacted. - */ - kstat_named_t cksum_fails; -} qat_stats_t; - -#define QAT_STAT_INCR(stat, val) \ - atomic_add_64(&qat_stats.stat.value.ui64, (val)) -#define QAT_STAT_BUMP(stat) \ - QAT_STAT_INCR(stat, 1) - -extern qat_stats_t qat_stats; -extern int zfs_qat_compress_disable; -extern int zfs_qat_checksum_disable; -extern int zfs_qat_encrypt_disable; - -/* inlined for performance */ -static inline struct page * -qat_mem_to_page(void *addr) -{ - if (!is_vmalloc_addr(addr)) - return (virt_to_page(addr)); - - return (vmalloc_to_page(addr)); -} - -CpaStatus qat_mem_alloc_contig(void **pp_mem_addr, Cpa32U size_bytes); -void qat_mem_free_contig(void **pp_mem_addr); -#define QAT_PHYS_CONTIG_ALLOC(pp_mem_addr, size_bytes) \ - qat_mem_alloc_contig((void *)(pp_mem_addr), (size_bytes)) -#define QAT_PHYS_CONTIG_FREE(p_mem_addr) \ - qat_mem_free_contig((void *)&(p_mem_addr)) - -extern int qat_dc_init(void); -extern void qat_dc_fini(void); -extern int qat_cy_init(void); -extern void qat_cy_fini(void); -extern int qat_init(void); -extern void qat_fini(void); - -/* fake CpaStatus used to indicate data was not compressible */ -#define CPA_STATUS_INCOMPRESSIBLE (-127) - -extern boolean_t qat_dc_use_accel(size_t s_len); -extern boolean_t qat_crypt_use_accel(size_t s_len); -extern boolean_t qat_checksum_use_accel(size_t s_len); -extern int qat_compress(qat_compress_dir_t dir, char *src, int src_len, - char *dst, int dst_len, size_t *c_len); -extern int qat_crypt(qat_encrypt_dir_t dir, uint8_t *src_buf, uint8_t *dst_buf, - uint8_t *aad_buf, uint32_t aad_len, uint8_t *iv_buf, uint8_t *digest_buf, - crypto_key_t *key, uint64_t crypt, uint32_t enc_len); -extern int qat_checksum(uint64_t cksum, uint8_t *buf, uint64_t size, - zio_cksum_t *zcp); -#else -#define CPA_STATUS_SUCCESS 0 -#define CPA_STATUS_INCOMPRESSIBLE (-127) -#define qat_init() -#define qat_fini() -#define qat_dc_use_accel(s_len) 0 -#define qat_crypt_use_accel(s_len) 0 -#define qat_checksum_use_accel(s_len) 0 -#define qat_compress(dir, s, sl, d, dl, cl) 0 -#define qat_crypt(dir, s, d, a, al, i, db, k, c, el) 0 -#define qat_checksum(c, buf, s, z) 0 -#endif - -#endif /* _SYS_QAT_H */ diff --git a/module/zfs/qat_compress.c b/module/zfs/qat_compress.c deleted file mode 100644 index 4136b6555..000000000 --- a/module/zfs/qat_compress.c +++ /dev/null @@ -1,574 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). 
- * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -#if defined(_KERNEL) && defined(HAVE_QAT) -#include <linux/slab.h> -#include <linux/vmalloc.h> -#include <linux/pagemap.h> -#include <linux/completion.h> -#include <sys/zfs_context.h> -#include <sys/byteorder.h> -#include <sys/zio.h> -#include "qat.h" - -/* - * Max instances in a QAT device, each instance is a channel to submit - * jobs to QAT hardware, this is only for pre-allocating instance and - * session arrays; the actual number of instances are defined in the - * QAT driver's configuration file. - */ -#define QAT_DC_MAX_INSTANCES 48 - -/* - * ZLIB head and foot size - */ -#define ZLIB_HEAD_SZ 2 -#define ZLIB_FOOT_SZ 4 - -static CpaInstanceHandle dc_inst_handles[QAT_DC_MAX_INSTANCES]; -static CpaDcSessionHandle session_handles[QAT_DC_MAX_INSTANCES]; -static CpaBufferList **buffer_array[QAT_DC_MAX_INSTANCES]; -static Cpa16U num_inst = 0; -static Cpa32U inst_num = 0; -static boolean_t qat_dc_init_done = B_FALSE; -int zfs_qat_compress_disable = 0; - -boolean_t -qat_dc_use_accel(size_t s_len) -{ - return (!zfs_qat_compress_disable && - qat_dc_init_done && - s_len >= QAT_MIN_BUF_SIZE && - s_len <= QAT_MAX_BUF_SIZE); -} - -static void -qat_dc_callback(void *p_callback, CpaStatus status) -{ - if (p_callback != NULL) - complete((struct completion *)p_callback); -} - -static void -qat_dc_clean(void) -{ - Cpa16U buff_num = 0; - Cpa16U num_inter_buff_lists = 0; - - for (Cpa16U i = 0; i < num_inst; i++) { - cpaDcStopInstance(dc_inst_handles[i]); - QAT_PHYS_CONTIG_FREE(session_handles[i]); - /* free intermediate buffers */ - if (buffer_array[i] != NULL) { - cpaDcGetNumIntermediateBuffers( - dc_inst_handles[i], &num_inter_buff_lists); - for (buff_num = 0; buff_num < num_inter_buff_lists; - buff_num++) { - CpaBufferList *buffer_inter = - buffer_array[i][buff_num]; - if (buffer_inter->pBuffers) { - QAT_PHYS_CONTIG_FREE( - buffer_inter->pBuffers->pData); - QAT_PHYS_CONTIG_FREE( - buffer_inter->pBuffers); - } - QAT_PHYS_CONTIG_FREE( - buffer_inter->pPrivateMetaData); - QAT_PHYS_CONTIG_FREE(buffer_inter); - } - } - } - - num_inst = 0; - qat_dc_init_done = B_FALSE; -} - -int -qat_dc_init(void) -{ - CpaStatus status = CPA_STATUS_SUCCESS; - Cpa32U sess_size = 0; - Cpa32U ctx_size = 0; - Cpa16U num_inter_buff_lists = 0; - Cpa16U buff_num = 0; - Cpa32U buff_meta_size = 0; - CpaDcSessionSetupData sd = {0}; - - if (qat_dc_init_done) - return (0); - - status = cpaDcGetNumInstances(&num_inst); - if (status != CPA_STATUS_SUCCESS) - return (-1); - - /* if the user has configured no QAT compression units just return */ - if (num_inst == 0) - return (0); - - if (num_inst > QAT_DC_MAX_INSTANCES) - num_inst = QAT_DC_MAX_INSTANCES; - - status = cpaDcGetInstances(num_inst, &dc_inst_handles[0]); - if (status != CPA_STATUS_SUCCESS) - return (-1); - - for (Cpa16U i = 0; i < num_inst; i++) { - cpaDcSetAddressTranslation(dc_inst_handles[i], - 
(void*)virt_to_phys); - - status = cpaDcBufferListGetMetaSize(dc_inst_handles[i], - 1, &buff_meta_size); - - if (status == CPA_STATUS_SUCCESS) - status = cpaDcGetNumIntermediateBuffers( - dc_inst_handles[i], &num_inter_buff_lists); - - if (status == CPA_STATUS_SUCCESS && num_inter_buff_lists != 0) - status = QAT_PHYS_CONTIG_ALLOC(&buffer_array[i], - num_inter_buff_lists * - sizeof (CpaBufferList *)); - - for (buff_num = 0; buff_num < num_inter_buff_lists; - buff_num++) { - if (status == CPA_STATUS_SUCCESS) - status = QAT_PHYS_CONTIG_ALLOC( - &buffer_array[i][buff_num], - sizeof (CpaBufferList)); - - if (status == CPA_STATUS_SUCCESS) - status = QAT_PHYS_CONTIG_ALLOC( - &buffer_array[i][buff_num]-> - pPrivateMetaData, - buff_meta_size); - - if (status == CPA_STATUS_SUCCESS) - status = QAT_PHYS_CONTIG_ALLOC( - &buffer_array[i][buff_num]->pBuffers, - sizeof (CpaFlatBuffer)); - - if (status == CPA_STATUS_SUCCESS) { - /* - * implementation requires an intermediate - * buffer approximately twice the size of - * output buffer, which is 2x max buffer - * size here. - */ - status = QAT_PHYS_CONTIG_ALLOC( - &buffer_array[i][buff_num]->pBuffers-> - pData, 2 * QAT_MAX_BUF_SIZE); - if (status != CPA_STATUS_SUCCESS) - goto fail; - - buffer_array[i][buff_num]->numBuffers = 1; - buffer_array[i][buff_num]->pBuffers-> - dataLenInBytes = 2 * QAT_MAX_BUF_SIZE; - } - } - - status = cpaDcStartInstance(dc_inst_handles[i], - num_inter_buff_lists, buffer_array[i]); - if (status != CPA_STATUS_SUCCESS) - goto fail; - - sd.compLevel = CPA_DC_L1; - sd.compType = CPA_DC_DEFLATE; - sd.huffType = CPA_DC_HT_FULL_DYNAMIC; - sd.sessDirection = CPA_DC_DIR_COMBINED; - sd.sessState = CPA_DC_STATELESS; - sd.deflateWindowSize = 7; - sd.checksum = CPA_DC_ADLER32; - status = cpaDcGetSessionSize(dc_inst_handles[i], - &sd, &sess_size, &ctx_size); - if (status != CPA_STATUS_SUCCESS) - goto fail; - - QAT_PHYS_CONTIG_ALLOC(&session_handles[i], sess_size); - if (session_handles[i] == NULL) - goto fail; - - status = cpaDcInitSession(dc_inst_handles[i], - session_handles[i], - &sd, NULL, qat_dc_callback); - if (status != CPA_STATUS_SUCCESS) - goto fail; - } - - qat_dc_init_done = B_TRUE; - return (0); -fail: - qat_dc_clean(); - return (-1); -} - -void -qat_dc_fini(void) -{ - if (!qat_dc_init_done) - return; - - qat_dc_clean(); -} - -/* - * The "add" parameter is an additional buffer which is passed - * to QAT as a scratch buffer alongside the destination buffer - * in case the "compressed" data ends up being larger than the - * original source data. This is necessary to prevent QAT from - * generating buffer overflow warnings for incompressible data. 
- */ -static int -qat_compress_impl(qat_compress_dir_t dir, char *src, int src_len, - char *dst, int dst_len, char *add, int add_len, size_t *c_len) -{ - CpaInstanceHandle dc_inst_handle; - CpaDcSessionHandle session_handle; - CpaBufferList *buf_list_src = NULL; - CpaBufferList *buf_list_dst = NULL; - CpaFlatBuffer *flat_buf_src = NULL; - CpaFlatBuffer *flat_buf_dst = NULL; - Cpa8U *buffer_meta_src = NULL; - Cpa8U *buffer_meta_dst = NULL; - Cpa32U buffer_meta_size = 0; - CpaDcRqResults dc_results; - CpaStatus status = CPA_STATUS_SUCCESS; - Cpa32U hdr_sz = 0; - Cpa32U compressed_sz; - Cpa32U num_src_buf = (src_len >> PAGE_SHIFT) + 2; - Cpa32U num_dst_buf = (dst_len >> PAGE_SHIFT) + 2; - Cpa32U num_add_buf = (add_len >> PAGE_SHIFT) + 2; - Cpa32U bytes_left; - Cpa32U dst_pages = 0; - Cpa32U adler32 = 0; - char *data; - struct page *page; - struct page **in_pages = NULL; - struct page **out_pages = NULL; - struct page **add_pages = NULL; - Cpa32U page_off = 0; - struct completion complete; - Cpa32U page_num = 0; - Cpa16U i; - - /* - * We increment num_src_buf and num_dst_buf by 2 to allow - * us to handle non page-aligned buffer addresses and buffers - * whose sizes are not divisible by PAGE_SIZE. - */ - Cpa32U src_buffer_list_mem_size = sizeof (CpaBufferList) + - (num_src_buf * sizeof (CpaFlatBuffer)); - Cpa32U dst_buffer_list_mem_size = sizeof (CpaBufferList) + - ((num_dst_buf + num_add_buf) * sizeof (CpaFlatBuffer)); - - if (QAT_PHYS_CONTIG_ALLOC(&in_pages, - num_src_buf * sizeof (struct page *)) != CPA_STATUS_SUCCESS) - goto fail; - - if (QAT_PHYS_CONTIG_ALLOC(&out_pages, - num_dst_buf * sizeof (struct page *)) != CPA_STATUS_SUCCESS) - goto fail; - - if (QAT_PHYS_CONTIG_ALLOC(&add_pages, - num_add_buf * sizeof (struct page *)) != CPA_STATUS_SUCCESS) - goto fail; - - i = (Cpa32U)atomic_inc_32_nv(&inst_num) % num_inst; - dc_inst_handle = dc_inst_handles[i]; - session_handle = session_handles[i]; - - cpaDcBufferListGetMetaSize(dc_inst_handle, num_src_buf, - &buffer_meta_size); - if (QAT_PHYS_CONTIG_ALLOC(&buffer_meta_src, buffer_meta_size) != - CPA_STATUS_SUCCESS) - goto fail; - - cpaDcBufferListGetMetaSize(dc_inst_handle, num_dst_buf + num_add_buf, - &buffer_meta_size); - if (QAT_PHYS_CONTIG_ALLOC(&buffer_meta_dst, buffer_meta_size) != - CPA_STATUS_SUCCESS) - goto fail; - - /* build source buffer list */ - if (QAT_PHYS_CONTIG_ALLOC(&buf_list_src, src_buffer_list_mem_size) != - CPA_STATUS_SUCCESS) - goto fail; - - flat_buf_src = (CpaFlatBuffer *)(buf_list_src + 1); - - buf_list_src->pBuffers = flat_buf_src; /* always point to first one */ - - /* build destination buffer list */ - if (QAT_PHYS_CONTIG_ALLOC(&buf_list_dst, dst_buffer_list_mem_size) != - CPA_STATUS_SUCCESS) - goto fail; - - flat_buf_dst = (CpaFlatBuffer *)(buf_list_dst + 1); - - buf_list_dst->pBuffers = flat_buf_dst; /* always point to first one */ - - buf_list_src->numBuffers = 0; - buf_list_src->pPrivateMetaData = buffer_meta_src; - bytes_left = src_len; - data = src; - page_num = 0; - while (bytes_left > 0) { - page_off = ((long)data & ~PAGE_MASK); - page = qat_mem_to_page(data); - in_pages[page_num] = page; - flat_buf_src->pData = kmap(page) + page_off; - flat_buf_src->dataLenInBytes = - min((long)PAGE_SIZE - page_off, (long)bytes_left); - - bytes_left -= flat_buf_src->dataLenInBytes; - data += flat_buf_src->dataLenInBytes; - flat_buf_src++; - buf_list_src->numBuffers++; - page_num++; - } - - buf_list_dst->numBuffers = 0; - buf_list_dst->pPrivateMetaData = buffer_meta_dst; - bytes_left = dst_len; - data = dst; - page_num = 0; 
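	/*
	 * The destination list is now built with the same page walk used
	 * for the source list above.  As a worked example of the "+ 2"
	 * slack in num_src_buf/num_dst_buf: with 4 KiB pages, a 128 KiB
	 * buffer that starts 256 bytes into a page spans 33 pages (a
	 * partial page at each end plus 31 full pages), while
	 * (131072 >> PAGE_SHIFT) is only 32; the two spare entries cover
	 * the worst case of a partial page at both ends.
	 */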
- while (bytes_left > 0) { - page_off = ((long)data & ~PAGE_MASK); - page = qat_mem_to_page(data); - flat_buf_dst->pData = kmap(page) + page_off; - out_pages[page_num] = page; - flat_buf_dst->dataLenInBytes = - min((long)PAGE_SIZE - page_off, (long)bytes_left); - - bytes_left -= flat_buf_dst->dataLenInBytes; - data += flat_buf_dst->dataLenInBytes; - flat_buf_dst++; - buf_list_dst->numBuffers++; - page_num++; - dst_pages++; - } - - /* map additional scratch pages into the destination buffer list */ - bytes_left = add_len; - data = add; - page_num = 0; - while (bytes_left > 0) { - page_off = ((long)data & ~PAGE_MASK); - page = qat_mem_to_page(data); - flat_buf_dst->pData = kmap(page) + page_off; - add_pages[page_num] = page; - flat_buf_dst->dataLenInBytes = - min((long)PAGE_SIZE - page_off, (long)bytes_left); - - bytes_left -= flat_buf_dst->dataLenInBytes; - data += flat_buf_dst->dataLenInBytes; - flat_buf_dst++; - buf_list_dst->numBuffers++; - page_num++; - } - - init_completion(&complete); - - if (dir == QAT_COMPRESS) { - QAT_STAT_BUMP(comp_requests); - QAT_STAT_INCR(comp_total_in_bytes, src_len); - - cpaDcGenerateHeader(session_handle, - buf_list_dst->pBuffers, &hdr_sz); - buf_list_dst->pBuffers->pData += hdr_sz; - buf_list_dst->pBuffers->dataLenInBytes -= hdr_sz; - status = cpaDcCompressData( - dc_inst_handle, session_handle, - buf_list_src, buf_list_dst, - &dc_results, CPA_DC_FLUSH_FINAL, - &complete); - if (status != CPA_STATUS_SUCCESS) { - goto fail; - } - - /* we now wait until the completion of the operation. */ - if (!wait_for_completion_interruptible_timeout(&complete, - QAT_TIMEOUT_MS)) { - status = CPA_STATUS_FAIL; - goto fail; - } - - if (dc_results.status != CPA_STATUS_SUCCESS) { - status = CPA_STATUS_FAIL; - goto fail; - } - - compressed_sz = dc_results.produced; - if (compressed_sz + hdr_sz + ZLIB_FOOT_SZ > dst_len) { - status = CPA_STATUS_INCOMPRESSIBLE; - goto fail; - } - - flat_buf_dst = (CpaFlatBuffer *)(buf_list_dst + 1); - /* move to the last page */ - flat_buf_dst += (compressed_sz + hdr_sz) >> PAGE_SHIFT; - - /* no space for gzip footer in the last page */ - if (((compressed_sz + hdr_sz) % PAGE_SIZE) - + ZLIB_FOOT_SZ > PAGE_SIZE) { - status = CPA_STATUS_INCOMPRESSIBLE; - goto fail; - } - - /* jump to the end of the buffer and append footer */ - flat_buf_dst->pData = - (char *)((unsigned long)flat_buf_dst->pData & PAGE_MASK) - + ((compressed_sz + hdr_sz) % PAGE_SIZE); - flat_buf_dst->dataLenInBytes = ZLIB_FOOT_SZ; - - dc_results.produced = 0; - status = cpaDcGenerateFooter(session_handle, - flat_buf_dst, &dc_results); - if (status != CPA_STATUS_SUCCESS) - goto fail; - - *c_len = compressed_sz + dc_results.produced + hdr_sz; - QAT_STAT_INCR(comp_total_out_bytes, *c_len); - } else { - ASSERT3U(dir, ==, QAT_DECOMPRESS); - QAT_STAT_BUMP(decomp_requests); - QAT_STAT_INCR(decomp_total_in_bytes, src_len); - - buf_list_src->pBuffers->pData += ZLIB_HEAD_SZ; - buf_list_src->pBuffers->dataLenInBytes -= ZLIB_HEAD_SZ; - status = cpaDcDecompressData(dc_inst_handle, session_handle, - buf_list_src, buf_list_dst, &dc_results, CPA_DC_FLUSH_FINAL, - &complete); - - if (CPA_STATUS_SUCCESS != status) { - status = CPA_STATUS_FAIL; - goto fail; - } - - /* we now wait until the completion of the operation. 
*/ - if (!wait_for_completion_interruptible_timeout(&complete, - QAT_TIMEOUT_MS)) { - status = CPA_STATUS_FAIL; - goto fail; - } - - if (dc_results.status != CPA_STATUS_SUCCESS) { - status = CPA_STATUS_FAIL; - goto fail; - } - - /* verify adler checksum */ - adler32 = *(Cpa32U *)(src + dc_results.consumed + ZLIB_HEAD_SZ); - if (adler32 != BSWAP_32(dc_results.checksum)) { - status = CPA_STATUS_FAIL; - goto fail; - } - *c_len = dc_results.produced; - QAT_STAT_INCR(decomp_total_out_bytes, *c_len); - } - -fail: - if (status != CPA_STATUS_SUCCESS && status != CPA_STATUS_INCOMPRESSIBLE) - QAT_STAT_BUMP(dc_fails); - - if (in_pages) { - for (page_num = 0; - page_num < buf_list_src->numBuffers; - page_num++) { - kunmap(in_pages[page_num]); - } - QAT_PHYS_CONTIG_FREE(in_pages); - } - - if (out_pages) { - for (page_num = 0; page_num < dst_pages; page_num++) { - kunmap(out_pages[page_num]); - } - QAT_PHYS_CONTIG_FREE(out_pages); - } - - if (add_pages) { - for (page_num = 0; - page_num < buf_list_dst->numBuffers - dst_pages; - page_num++) { - kunmap(add_pages[page_num]); - } - QAT_PHYS_CONTIG_FREE(add_pages); - } - - QAT_PHYS_CONTIG_FREE(buffer_meta_src); - QAT_PHYS_CONTIG_FREE(buffer_meta_dst); - QAT_PHYS_CONTIG_FREE(buf_list_src); - QAT_PHYS_CONTIG_FREE(buf_list_dst); - - return (status); -} - -/* - * Entry point for QAT accelerated compression / decompression. - */ -int -qat_compress(qat_compress_dir_t dir, char *src, int src_len, - char *dst, int dst_len, size_t *c_len) -{ - int ret; - size_t add_len = 0; - void *add = NULL; - - if (dir == QAT_COMPRESS) { - add_len = dst_len; - add = zio_data_buf_alloc(add_len); - } - - ret = qat_compress_impl(dir, src, src_len, dst, - dst_len, add, add_len, c_len); - - if (dir == QAT_COMPRESS) - zio_data_buf_free(add, add_len); - - return (ret); -} - -static int -param_set_qat_compress(const char *val, zfs_kernel_param_t *kp) -{ - int ret; - int *pvalue = kp->arg; - ret = param_set_int(val, kp); - if (ret) - return (ret); - /* - * zfs_qat_compress_disable = 0: enable qat compress - * try to initialize qat instance if it has not been done - */ - if (*pvalue == 0 && !qat_dc_init_done) { - ret = qat_dc_init(); - if (ret != 0) { - zfs_qat_compress_disable = 1; - return (ret); - } - } - return (ret); -} - -module_param_call(zfs_qat_compress_disable, param_set_qat_compress, - param_get_int, &zfs_qat_compress_disable, 0644); -MODULE_PARM_DESC(zfs_qat_compress_disable, "Enable/Disable QAT compression"); - -#endif diff --git a/module/zfs/qat_crypt.c b/module/zfs/qat_crypt.c deleted file mode 100644 index 02e19d21d..000000000 --- a/module/zfs/qat_crypt.c +++ /dev/null @@ -1,631 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * This file represents the QAT implementation of checksums and encryption. - * Internally, QAT shares the same cryptographic instances for both of these - * operations, so the code has been combined here. QAT data compression uses - * compression instances, so that code is separated into qat_compress.c - */ - -#if defined(_KERNEL) && defined(HAVE_QAT) -#include <linux/slab.h> -#include <linux/vmalloc.h> -#include <linux/pagemap.h> -#include <linux/completion.h> -#include <sys/zfs_context.h> -#include <sys/zio_crypt.h> -#include "lac/cpa_cy_im.h" -#include "lac/cpa_cy_common.h" -#include "qat.h" - -/* - * Max instances in a QAT device, each instance is a channel to submit - * jobs to QAT hardware, this is only for pre-allocating instances - * and session arrays; the actual number of instances are defined in - * the QAT driver's configure file. - */ -#define QAT_CRYPT_MAX_INSTANCES 48 - -#define MAX_PAGE_NUM 1024 - -static Cpa32U inst_num = 0; -static Cpa16U num_inst = 0; -static CpaInstanceHandle cy_inst_handles[QAT_CRYPT_MAX_INSTANCES]; -static boolean_t qat_cy_init_done = B_FALSE; -int zfs_qat_encrypt_disable = 0; -int zfs_qat_checksum_disable = 0; - -typedef struct cy_callback { - CpaBoolean verify_result; - struct completion complete; -} cy_callback_t; - -static void -symcallback(void *p_callback, CpaStatus status, const CpaCySymOp operation, - void *op_data, CpaBufferList *buf_list_dst, CpaBoolean verify) -{ - cy_callback_t *cb = p_callback; - - if (cb != NULL) { - /* indicate that the function has been called */ - cb->verify_result = verify; - complete(&cb->complete); - } -} - -boolean_t -qat_crypt_use_accel(size_t s_len) -{ - return (!zfs_qat_encrypt_disable && - qat_cy_init_done && - s_len >= QAT_MIN_BUF_SIZE && - s_len <= QAT_MAX_BUF_SIZE); -} - -boolean_t -qat_checksum_use_accel(size_t s_len) -{ - return (!zfs_qat_checksum_disable && - qat_cy_init_done && - s_len >= QAT_MIN_BUF_SIZE && - s_len <= QAT_MAX_BUF_SIZE); -} - -void -qat_cy_clean(void) -{ - for (Cpa16U i = 0; i < num_inst; i++) - cpaCyStopInstance(cy_inst_handles[i]); - - num_inst = 0; - qat_cy_init_done = B_FALSE; -} - -int -qat_cy_init(void) -{ - CpaStatus status = CPA_STATUS_FAIL; - - if (qat_cy_init_done) - return (0); - - status = cpaCyGetNumInstances(&num_inst); - if (status != CPA_STATUS_SUCCESS) - return (-1); - - /* if the user has configured no QAT encryption units just return */ - if (num_inst == 0) - return (0); - - if (num_inst > QAT_CRYPT_MAX_INSTANCES) - num_inst = QAT_CRYPT_MAX_INSTANCES; - - status = cpaCyGetInstances(num_inst, &cy_inst_handles[0]); - if (status != CPA_STATUS_SUCCESS) - return (-1); - - for (Cpa16U i = 0; i < num_inst; i++) { - status = cpaCySetAddressTranslation(cy_inst_handles[i], - (void *)virt_to_phys); - if (status != CPA_STATUS_SUCCESS) - goto error; - - status = cpaCyStartInstance(cy_inst_handles[i]); - if (status != CPA_STATUS_SUCCESS) - goto error; - } - - qat_cy_init_done = B_TRUE; - return (0); - -error: - qat_cy_clean(); - return (-1); -} - -void -qat_cy_fini(void) -{ - if (!qat_cy_init_done) - return; - - qat_cy_clean(); -} - -static CpaStatus -qat_init_crypt_session_ctx(qat_encrypt_dir_t dir, CpaInstanceHandle inst_handle, - CpaCySymSessionCtx **cy_session_ctx, crypto_key_t *key, - Cpa64U crypt, Cpa32U aad_len) -{ - CpaStatus 
status = CPA_STATUS_SUCCESS; - Cpa32U ctx_size; - Cpa32U ciper_algorithm; - Cpa32U hash_algorithm; - CpaCySymSessionSetupData sd = { 0 }; - - if (zio_crypt_table[crypt].ci_crypt_type == ZC_TYPE_CCM) { - return (CPA_STATUS_FAIL); - } else { - ciper_algorithm = CPA_CY_SYM_CIPHER_AES_GCM; - hash_algorithm = CPA_CY_SYM_HASH_AES_GCM; - } - - sd.cipherSetupData.cipherAlgorithm = ciper_algorithm; - sd.cipherSetupData.pCipherKey = key->ck_data; - sd.cipherSetupData.cipherKeyLenInBytes = key->ck_length / 8; - sd.hashSetupData.hashAlgorithm = hash_algorithm; - sd.hashSetupData.hashMode = CPA_CY_SYM_HASH_MODE_AUTH; - sd.hashSetupData.digestResultLenInBytes = ZIO_DATA_MAC_LEN; - sd.hashSetupData.authModeSetupData.aadLenInBytes = aad_len; - sd.sessionPriority = CPA_CY_PRIORITY_NORMAL; - sd.symOperation = CPA_CY_SYM_OP_ALGORITHM_CHAINING; - sd.digestIsAppended = CPA_FALSE; - sd.verifyDigest = CPA_FALSE; - - if (dir == QAT_ENCRYPT) { - sd.cipherSetupData.cipherDirection = - CPA_CY_SYM_CIPHER_DIRECTION_ENCRYPT; - sd.algChainOrder = - CPA_CY_SYM_ALG_CHAIN_ORDER_HASH_THEN_CIPHER; - } else { - ASSERT3U(dir, ==, QAT_DECRYPT); - sd.cipherSetupData.cipherDirection = - CPA_CY_SYM_CIPHER_DIRECTION_DECRYPT; - sd.algChainOrder = - CPA_CY_SYM_ALG_CHAIN_ORDER_CIPHER_THEN_HASH; - } - - status = cpaCySymSessionCtxGetSize(inst_handle, &sd, &ctx_size); - if (status != CPA_STATUS_SUCCESS) - return (status); - - status = QAT_PHYS_CONTIG_ALLOC(cy_session_ctx, ctx_size); - if (status != CPA_STATUS_SUCCESS) - return (status); - - status = cpaCySymInitSession(inst_handle, symcallback, &sd, - *cy_session_ctx); - if (status != CPA_STATUS_SUCCESS) { - QAT_PHYS_CONTIG_FREE(*cy_session_ctx); - return (status); - } - - return (CPA_STATUS_SUCCESS); -} - -static CpaStatus -qat_init_checksum_session_ctx(CpaInstanceHandle inst_handle, - CpaCySymSessionCtx **cy_session_ctx, Cpa64U cksum) -{ - CpaStatus status = CPA_STATUS_SUCCESS; - Cpa32U ctx_size; - Cpa32U hash_algorithm; - CpaCySymSessionSetupData sd = { 0 }; - - /* - * ZFS's SHA512 checksum is actually SHA512/256, which uses - * a different IV from standard SHA512. QAT does not support - * SHA512/256, so we can only support SHA256. 
- */ - if (cksum == ZIO_CHECKSUM_SHA256) - hash_algorithm = CPA_CY_SYM_HASH_SHA256; - else - return (CPA_STATUS_FAIL); - - sd.sessionPriority = CPA_CY_PRIORITY_NORMAL; - sd.symOperation = CPA_CY_SYM_OP_HASH; - sd.hashSetupData.hashAlgorithm = hash_algorithm; - sd.hashSetupData.hashMode = CPA_CY_SYM_HASH_MODE_PLAIN; - sd.hashSetupData.digestResultLenInBytes = sizeof (zio_cksum_t); - sd.digestIsAppended = CPA_FALSE; - sd.verifyDigest = CPA_FALSE; - - status = cpaCySymSessionCtxGetSize(inst_handle, &sd, &ctx_size); - if (status != CPA_STATUS_SUCCESS) - return (status); - - status = QAT_PHYS_CONTIG_ALLOC(cy_session_ctx, ctx_size); - if (status != CPA_STATUS_SUCCESS) - return (status); - - status = cpaCySymInitSession(inst_handle, symcallback, &sd, - *cy_session_ctx); - if (status != CPA_STATUS_SUCCESS) { - QAT_PHYS_CONTIG_FREE(*cy_session_ctx); - return (status); - } - - return (CPA_STATUS_SUCCESS); -} - -static CpaStatus -qat_init_cy_buffer_lists(CpaInstanceHandle inst_handle, uint32_t nr_bufs, - CpaBufferList *src, CpaBufferList *dst) -{ - CpaStatus status = CPA_STATUS_SUCCESS; - Cpa32U meta_size = 0; - - status = cpaCyBufferListGetMetaSize(inst_handle, nr_bufs, &meta_size); - if (status != CPA_STATUS_SUCCESS) - return (status); - - status = QAT_PHYS_CONTIG_ALLOC(&src->pPrivateMetaData, meta_size); - if (status != CPA_STATUS_SUCCESS) - goto error; - - if (src != dst) { - status = QAT_PHYS_CONTIG_ALLOC(&dst->pPrivateMetaData, - meta_size); - if (status != CPA_STATUS_SUCCESS) - goto error; - } - - return (CPA_STATUS_SUCCESS); - -error: - QAT_PHYS_CONTIG_FREE(src->pPrivateMetaData); - if (src != dst) - QAT_PHYS_CONTIG_FREE(dst->pPrivateMetaData); - - return (status); -} - -int -qat_crypt(qat_encrypt_dir_t dir, uint8_t *src_buf, uint8_t *dst_buf, - uint8_t *aad_buf, uint32_t aad_len, uint8_t *iv_buf, uint8_t *digest_buf, - crypto_key_t *key, uint64_t crypt, uint32_t enc_len) -{ - CpaStatus status = CPA_STATUS_SUCCESS; - Cpa16U i; - CpaInstanceHandle cy_inst_handle; - Cpa16U nr_bufs = (enc_len >> PAGE_SHIFT) + 2; - Cpa32U bytes_left = 0; - Cpa8S *data = NULL; - CpaCySymSessionCtx *cy_session_ctx = NULL; - cy_callback_t cb; - CpaCySymOpData op_data = { 0 }; - CpaBufferList src_buffer_list = { 0 }; - CpaBufferList dst_buffer_list = { 0 }; - CpaFlatBuffer *flat_src_buf_array = NULL; - CpaFlatBuffer *flat_src_buf = NULL; - CpaFlatBuffer *flat_dst_buf_array = NULL; - CpaFlatBuffer *flat_dst_buf = NULL; - struct page *in_pages[MAX_PAGE_NUM]; - struct page *out_pages[MAX_PAGE_NUM]; - Cpa32U in_page_num = 0; - Cpa32U out_page_num = 0; - Cpa32U in_page_off = 0; - Cpa32U out_page_off = 0; - - if (dir == QAT_ENCRYPT) { - QAT_STAT_BUMP(encrypt_requests); - QAT_STAT_INCR(encrypt_total_in_bytes, enc_len); - } else { - QAT_STAT_BUMP(decrypt_requests); - QAT_STAT_INCR(decrypt_total_in_bytes, enc_len); - } - - i = (Cpa32U)atomic_inc_32_nv(&inst_num) % num_inst; - cy_inst_handle = cy_inst_handles[i]; - - status = qat_init_crypt_session_ctx(dir, cy_inst_handle, - &cy_session_ctx, key, crypt, aad_len); - if (status != CPA_STATUS_SUCCESS) { - /* don't count CCM as a failure since it's not supported */ - if (zio_crypt_table[crypt].ci_crypt_type == ZC_TYPE_GCM) - QAT_STAT_BUMP(crypt_fails); - return (status); - } - - /* - * We increment nr_bufs by 2 to allow us to handle non - * page-aligned buffer addresses and buffers whose sizes - * are not divisible by PAGE_SIZE. 
- */ - status = qat_init_cy_buffer_lists(cy_inst_handle, nr_bufs, - &src_buffer_list, &dst_buffer_list); - if (status != CPA_STATUS_SUCCESS) - goto fail; - - status = QAT_PHYS_CONTIG_ALLOC(&flat_src_buf_array, - nr_bufs * sizeof (CpaFlatBuffer)); - if (status != CPA_STATUS_SUCCESS) - goto fail; - status = QAT_PHYS_CONTIG_ALLOC(&flat_dst_buf_array, - nr_bufs * sizeof (CpaFlatBuffer)); - if (status != CPA_STATUS_SUCCESS) - goto fail; - status = QAT_PHYS_CONTIG_ALLOC(&op_data.pDigestResult, - ZIO_DATA_MAC_LEN); - if (status != CPA_STATUS_SUCCESS) - goto fail; - status = QAT_PHYS_CONTIG_ALLOC(&op_data.pIv, - ZIO_DATA_IV_LEN); - if (status != CPA_STATUS_SUCCESS) - goto fail; - if (aad_len > 0) { - status = QAT_PHYS_CONTIG_ALLOC(&op_data.pAdditionalAuthData, - aad_len); - if (status != CPA_STATUS_SUCCESS) - goto fail; - bcopy(aad_buf, op_data.pAdditionalAuthData, aad_len); - } - - bytes_left = enc_len; - data = src_buf; - flat_src_buf = flat_src_buf_array; - while (bytes_left > 0) { - in_page_off = ((long)data & ~PAGE_MASK); - in_pages[in_page_num] = qat_mem_to_page(data); - flat_src_buf->pData = kmap(in_pages[in_page_num]) + in_page_off; - flat_src_buf->dataLenInBytes = - min((long)PAGE_SIZE - in_page_off, (long)bytes_left); - data += flat_src_buf->dataLenInBytes; - bytes_left -= flat_src_buf->dataLenInBytes; - flat_src_buf++; - in_page_num++; - } - src_buffer_list.pBuffers = flat_src_buf_array; - src_buffer_list.numBuffers = in_page_num; - - bytes_left = enc_len; - data = dst_buf; - flat_dst_buf = flat_dst_buf_array; - while (bytes_left > 0) { - out_page_off = ((long)data & ~PAGE_MASK); - out_pages[out_page_num] = qat_mem_to_page(data); - flat_dst_buf->pData = kmap(out_pages[out_page_num]) + - out_page_off; - flat_dst_buf->dataLenInBytes = - min((long)PAGE_SIZE - out_page_off, (long)bytes_left); - data += flat_dst_buf->dataLenInBytes; - bytes_left -= flat_dst_buf->dataLenInBytes; - flat_dst_buf++; - out_page_num++; - } - dst_buffer_list.pBuffers = flat_dst_buf_array; - dst_buffer_list.numBuffers = out_page_num; - - op_data.sessionCtx = cy_session_ctx; - op_data.packetType = CPA_CY_SYM_PACKET_TYPE_FULL; - op_data.cryptoStartSrcOffsetInBytes = 0; - op_data.messageLenToCipherInBytes = 0; - op_data.hashStartSrcOffsetInBytes = 0; - op_data.messageLenToHashInBytes = 0; - op_data.messageLenToCipherInBytes = enc_len; - op_data.ivLenInBytes = ZIO_DATA_IV_LEN; - bcopy(iv_buf, op_data.pIv, ZIO_DATA_IV_LEN); - - cb.verify_result = CPA_FALSE; - init_completion(&cb.complete); - status = cpaCySymPerformOp(cy_inst_handle, &cb, &op_data, - &src_buffer_list, &dst_buffer_list, NULL); - if (status != CPA_STATUS_SUCCESS) - goto fail; - - if (!wait_for_completion_interruptible_timeout(&cb.complete, - QAT_TIMEOUT_MS)) { - status = CPA_STATUS_FAIL; - goto fail; - } - - if (cb.verify_result == CPA_FALSE) { - status = CPA_STATUS_FAIL; - goto fail; - } - - /* save digest result to digest_buf */ - bcopy(op_data.pDigestResult, digest_buf, ZIO_DATA_MAC_LEN); - if (dir == QAT_ENCRYPT) - QAT_STAT_INCR(encrypt_total_out_bytes, enc_len); - else - QAT_STAT_INCR(decrypt_total_out_bytes, enc_len); - -fail: - if (status != CPA_STATUS_SUCCESS) - QAT_STAT_BUMP(crypt_fails); - - for (i = 0; i < in_page_num; i++) - kunmap(in_pages[i]); - for (i = 0; i < out_page_num; i++) - kunmap(out_pages[i]); - - cpaCySymRemoveSession(cy_inst_handle, cy_session_ctx); - if (aad_len > 0) - QAT_PHYS_CONTIG_FREE(op_data.pAdditionalAuthData); - QAT_PHYS_CONTIG_FREE(op_data.pIv); - QAT_PHYS_CONTIG_FREE(op_data.pDigestResult); - 
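	/*
	 * The remaining frees release the buffer-list metadata, the
	 * session context, and the flat-buffer arrays.  Any of these may
	 * still be NULL if an earlier allocation failed; qat_mem_free_contig()
	 * checks for that, so the fail path needs no per-allocation
	 * bookkeeping.
	 */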
QAT_PHYS_CONTIG_FREE(src_buffer_list.pPrivateMetaData); - QAT_PHYS_CONTIG_FREE(dst_buffer_list.pPrivateMetaData); - QAT_PHYS_CONTIG_FREE(cy_session_ctx); - QAT_PHYS_CONTIG_FREE(flat_src_buf_array); - QAT_PHYS_CONTIG_FREE(flat_dst_buf_array); - - return (status); -} - -int -qat_checksum(uint64_t cksum, uint8_t *buf, uint64_t size, zio_cksum_t *zcp) -{ - CpaStatus status; - Cpa16U i; - CpaInstanceHandle cy_inst_handle; - Cpa16U nr_bufs = (size >> PAGE_SHIFT) + 2; - Cpa32U bytes_left = 0; - Cpa8S *data = NULL; - CpaCySymSessionCtx *cy_session_ctx = NULL; - cy_callback_t cb; - Cpa8U *digest_buffer = NULL; - CpaCySymOpData op_data = { 0 }; - CpaBufferList src_buffer_list = { 0 }; - CpaFlatBuffer *flat_src_buf_array = NULL; - CpaFlatBuffer *flat_src_buf = NULL; - struct page *in_pages[MAX_PAGE_NUM]; - Cpa32U page_num = 0; - Cpa32U page_off = 0; - - QAT_STAT_BUMP(cksum_requests); - QAT_STAT_INCR(cksum_total_in_bytes, size); - - i = (Cpa32U)atomic_inc_32_nv(&inst_num) % num_inst; - cy_inst_handle = cy_inst_handles[i]; - - status = qat_init_checksum_session_ctx(cy_inst_handle, - &cy_session_ctx, cksum); - if (status != CPA_STATUS_SUCCESS) { - /* don't count unsupported checksums as a failure */ - if (cksum == ZIO_CHECKSUM_SHA256 || - cksum == ZIO_CHECKSUM_SHA512) - QAT_STAT_BUMP(cksum_fails); - return (status); - } - - /* - * We increment nr_bufs by 2 to allow us to handle non - * page-aligned buffer addresses and buffers whose sizes - * are not divisible by PAGE_SIZE. - */ - status = qat_init_cy_buffer_lists(cy_inst_handle, nr_bufs, - &src_buffer_list, &src_buffer_list); - if (status != CPA_STATUS_SUCCESS) - goto fail; - - status = QAT_PHYS_CONTIG_ALLOC(&flat_src_buf_array, - nr_bufs * sizeof (CpaFlatBuffer)); - if (status != CPA_STATUS_SUCCESS) - goto fail; - status = QAT_PHYS_CONTIG_ALLOC(&digest_buffer, - sizeof (zio_cksum_t)); - if (status != CPA_STATUS_SUCCESS) - goto fail; - - bytes_left = size; - data = buf; - flat_src_buf = flat_src_buf_array; - while (bytes_left > 0) { - page_off = ((long)data & ~PAGE_MASK); - in_pages[page_num] = qat_mem_to_page(data); - flat_src_buf->pData = kmap(in_pages[page_num]) + page_off; - flat_src_buf->dataLenInBytes = - min((long)PAGE_SIZE - page_off, (long)bytes_left); - data += flat_src_buf->dataLenInBytes; - bytes_left -= flat_src_buf->dataLenInBytes; - flat_src_buf++; - page_num++; - } - src_buffer_list.pBuffers = flat_src_buf_array; - src_buffer_list.numBuffers = page_num; - - op_data.sessionCtx = cy_session_ctx; - op_data.packetType = CPA_CY_SYM_PACKET_TYPE_FULL; - op_data.hashStartSrcOffsetInBytes = 0; - op_data.messageLenToHashInBytes = size; - op_data.pDigestResult = digest_buffer; - - cb.verify_result = CPA_FALSE; - init_completion(&cb.complete); - status = cpaCySymPerformOp(cy_inst_handle, &cb, &op_data, - &src_buffer_list, &src_buffer_list, NULL); - if (status != CPA_STATUS_SUCCESS) - goto fail; - - if (!wait_for_completion_interruptible_timeout(&cb.complete, - QAT_TIMEOUT_MS)) { - status = CPA_STATUS_FAIL; - goto fail; - } - if (cb.verify_result == CPA_FALSE) { - status = CPA_STATUS_FAIL; - goto fail; - } - - bcopy(digest_buffer, zcp, sizeof (zio_cksum_t)); - -fail: - if (status != CPA_STATUS_SUCCESS) - QAT_STAT_BUMP(cksum_fails); - - for (i = 0; i < page_num; i++) - kunmap(in_pages[i]); - - cpaCySymRemoveSession(cy_inst_handle, cy_session_ctx); - QAT_PHYS_CONTIG_FREE(digest_buffer); - QAT_PHYS_CONTIG_FREE(src_buffer_list.pPrivateMetaData); - QAT_PHYS_CONTIG_FREE(cy_session_ctx); - QAT_PHYS_CONTIG_FREE(flat_src_buf_array); - - return (status); -} - 
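
For context on how these entry points are meant to be consumed: callers first gate on qat_checksum_use_accel() and fall back to the software implementation whenever the hardware path returns a failure, which is why the kstat comments above stress that a QAT error is not fatal. A minimal caller-side sketch, assuming a hypothetical software_checksum() fallback helper (not part of this file):

/*
 * Illustrative only -- software_checksum() stands in for whatever
 * software implementation the caller already has.
 */
static void
checksum_with_optional_qat(uint64_t cksum, uint8_t *buf, uint64_t size,
    zio_cksum_t *zcp)
{
	if (qat_checksum_use_accel(size) &&
	    qat_checksum(cksum, buf, size, zcp) == CPA_STATUS_SUCCESS)
		return;

	/* QAT disabled, unsupported checksum, or hardware error. */
	software_checksum(cksum, buf, size, zcp);
}

The module parameter setters that follow implement the re-enable path described earlier: writing 0 to the parameter retries QAT initialization if it has not yet succeeded.
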
-static int -param_set_qat_encrypt(const char *val, zfs_kernel_param_t *kp) -{ - int ret; - int *pvalue = kp->arg; - ret = param_set_int(val, kp); - if (ret) - return (ret); - /* - * zfs_qat_encrypt_disable = 0: enable qat encrypt - * try to initialize qat instance if it has not been done - */ - if (*pvalue == 0 && !qat_cy_init_done) { - ret = qat_cy_init(); - if (ret != 0) { - zfs_qat_encrypt_disable = 1; - return (ret); - } - } - return (ret); -} - -static int -param_set_qat_checksum(const char *val, zfs_kernel_param_t *kp) -{ - int ret; - int *pvalue = kp->arg; - ret = param_set_int(val, kp); - if (ret) - return (ret); - /* - * set_checksum_param_ops = 0: enable qat checksum - * try to initialize qat instance if it has not been done - */ - if (*pvalue == 0 && !qat_cy_init_done) { - ret = qat_cy_init(); - if (ret != 0) { - zfs_qat_checksum_disable = 1; - return (ret); - } - } - return (ret); -} - -module_param_call(zfs_qat_encrypt_disable, param_set_qat_encrypt, - param_get_int, &zfs_qat_encrypt_disable, 0644); -MODULE_PARM_DESC(zfs_qat_encrypt_disable, "Enable/Disable QAT encryption"); - -module_param_call(zfs_qat_checksum_disable, param_set_qat_checksum, - param_get_int, &zfs_qat_checksum_disable, 0644); -MODULE_PARM_DESC(zfs_qat_checksum_disable, "Enable/Disable QAT checksumming"); - -#endif diff --git a/module/zfs/sha256.c b/module/zfs/sha256.c index 2adadf56f..406c926a0 100644 --- a/module/zfs/sha256.c +++ b/module/zfs/sha256.c @@ -30,7 +30,7 @@ #include <sys/zio.h> #include <sys/sha2.h> #include <sys/abd.h> -#include "qat.h" +#include <sys/qat.h> static int sha_incremental(void *buf, size_t size, void *arg) diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index b0c1ae1e6..a18f9604a 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -59,7 +59,7 @@ #include <sys/kstat.h> #include "zfs_prop.h" #include <sys/zfeature.h> -#include "qat.h" +#include <sys/qat.h> /* * SPA locking diff --git a/module/zfs/spa_stats.c b/module/zfs/spa_stats.c deleted file mode 100644 index 6895428f4..000000000 --- a/module/zfs/spa_stats.c +++ /dev/null @@ -1,1034 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -#include <sys/zfs_context.h> -#include <sys/spa_impl.h> -#include <sys/vdev_impl.h> -#include <sys/spa.h> -#include <zfs_comutil.h> - -/* - * Keeps stats on last N reads per spa_t, disabled by default. - */ -int zfs_read_history = 0; - -/* - * Include cache hits in history, disabled by default. - */ -int zfs_read_history_hits = 0; - -/* - * Keeps stats on the last 100 txgs by default. - */ -int zfs_txg_history = 100; - -/* - * Keeps stats on the last N MMP updates, disabled by default. 
- */ -int zfs_multihost_history = 0; - -/* - * ========================================================================== - * SPA Read History Routines - * ========================================================================== - */ - -/* - * Read statistics - Information exported regarding each arc_read call - */ -typedef struct spa_read_history { - hrtime_t start; /* time read completed */ - uint64_t objset; /* read from this objset */ - uint64_t object; /* read of this object number */ - uint64_t level; /* block's indirection level */ - uint64_t blkid; /* read of this block id */ - char origin[24]; /* read originated from here */ - uint32_t aflags; /* ARC flags (cached, prefetch, etc.) */ - pid_t pid; /* PID of task doing read */ - char comm[16]; /* process name of task doing read */ - procfs_list_node_t srh_node; -} spa_read_history_t; - -static int -spa_read_history_show_header(struct seq_file *f) -{ - seq_printf(f, "%-8s %-16s %-8s %-8s %-8s %-8s %-8s " - "%-24s %-8s %-16s\n", "UID", "start", "objset", "object", - "level", "blkid", "aflags", "origin", "pid", "process"); - - return (0); -} - -static int -spa_read_history_show(struct seq_file *f, void *data) -{ - spa_read_history_t *srh = (spa_read_history_t *)data; - - seq_printf(f, "%-8llu %-16llu 0x%-6llx " - "%-8lli %-8lli %-8lli 0x%-6x %-24s %-8i %-16s\n", - (u_longlong_t)srh->srh_node.pln_id, srh->start, - (longlong_t)srh->objset, (longlong_t)srh->object, - (longlong_t)srh->level, (longlong_t)srh->blkid, - srh->aflags, srh->origin, srh->pid, srh->comm); - - return (0); -} - -/* Remove oldest elements from list until there are no more than 'size' left */ -static void -spa_read_history_truncate(spa_history_list_t *shl, unsigned int size) -{ - spa_read_history_t *srh; - while (shl->size > size) { - srh = list_remove_head(&shl->procfs_list.pl_list); - ASSERT3P(srh, !=, NULL); - kmem_free(srh, sizeof (spa_read_history_t)); - shl->size--; - } - - if (size == 0) - ASSERT(list_is_empty(&shl->procfs_list.pl_list)); -} - -static int -spa_read_history_clear(procfs_list_t *procfs_list) -{ - spa_history_list_t *shl = procfs_list->pl_private; - mutex_enter(&procfs_list->pl_lock); - spa_read_history_truncate(shl, 0); - mutex_exit(&procfs_list->pl_lock); - return (0); -} - -static void -spa_read_history_init(spa_t *spa) -{ - spa_history_list_t *shl = &spa->spa_stats.read_history; - char *module; - - shl->size = 0; - - module = kmem_asprintf("zfs/%s", spa_name(spa)); - - shl->procfs_list.pl_private = shl; - procfs_list_install(module, - "reads", - 0600, - &shl->procfs_list, - spa_read_history_show, - spa_read_history_show_header, - spa_read_history_clear, - offsetof(spa_read_history_t, srh_node)); - - strfree(module); -} - -static void -spa_read_history_destroy(spa_t *spa) -{ - spa_history_list_t *shl = &spa->spa_stats.read_history; - procfs_list_uninstall(&shl->procfs_list); - spa_read_history_truncate(shl, 0); - procfs_list_destroy(&shl->procfs_list); -} - -void -spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags) -{ - spa_history_list_t *shl = &spa->spa_stats.read_history; - spa_read_history_t *srh; - - ASSERT3P(spa, !=, NULL); - ASSERT3P(zb, !=, NULL); - - if (zfs_read_history == 0 && shl->size == 0) - return; - - if (zfs_read_history_hits == 0 && (aflags & ARC_FLAG_CACHED)) - return; - - srh = kmem_zalloc(sizeof (spa_read_history_t), KM_SLEEP); - strlcpy(srh->comm, getcomm(), sizeof (srh->comm)); - srh->start = gethrtime(); - srh->objset = zb->zb_objset; - srh->object = zb->zb_object; - srh->level = zb->zb_level; 
- srh->blkid = zb->zb_blkid; - srh->aflags = aflags; - srh->pid = getpid(); - - mutex_enter(&shl->procfs_list.pl_lock); - - procfs_list_add(&shl->procfs_list, srh); - shl->size++; - - spa_read_history_truncate(shl, zfs_read_history); - - mutex_exit(&shl->procfs_list.pl_lock); -} - -/* - * ========================================================================== - * SPA TXG History Routines - * ========================================================================== - */ - -/* - * Txg statistics - Information exported regarding each txg sync - */ - -typedef struct spa_txg_history { - uint64_t txg; /* txg id */ - txg_state_t state; /* active txg state */ - uint64_t nread; /* number of bytes read */ - uint64_t nwritten; /* number of bytes written */ - uint64_t reads; /* number of read operations */ - uint64_t writes; /* number of write operations */ - uint64_t ndirty; /* number of dirty bytes */ - hrtime_t times[TXG_STATE_COMMITTED]; /* completion times */ - procfs_list_node_t sth_node; -} spa_txg_history_t; - -static int -spa_txg_history_show_header(struct seq_file *f) -{ - seq_printf(f, "%-8s %-16s %-5s %-12s %-12s %-12s " - "%-8s %-8s %-12s %-12s %-12s %-12s\n", "txg", "birth", "state", - "ndirty", "nread", "nwritten", "reads", "writes", - "otime", "qtime", "wtime", "stime"); - return (0); -} - -static int -spa_txg_history_show(struct seq_file *f, void *data) -{ - spa_txg_history_t *sth = (spa_txg_history_t *)data; - uint64_t open = 0, quiesce = 0, wait = 0, sync = 0; - char state; - - switch (sth->state) { - case TXG_STATE_BIRTH: state = 'B'; break; - case TXG_STATE_OPEN: state = 'O'; break; - case TXG_STATE_QUIESCED: state = 'Q'; break; - case TXG_STATE_WAIT_FOR_SYNC: state = 'W'; break; - case TXG_STATE_SYNCED: state = 'S'; break; - case TXG_STATE_COMMITTED: state = 'C'; break; - default: state = '?'; break; - } - - if (sth->times[TXG_STATE_OPEN]) - open = sth->times[TXG_STATE_OPEN] - - sth->times[TXG_STATE_BIRTH]; - - if (sth->times[TXG_STATE_QUIESCED]) - quiesce = sth->times[TXG_STATE_QUIESCED] - - sth->times[TXG_STATE_OPEN]; - - if (sth->times[TXG_STATE_WAIT_FOR_SYNC]) - wait = sth->times[TXG_STATE_WAIT_FOR_SYNC] - - sth->times[TXG_STATE_QUIESCED]; - - if (sth->times[TXG_STATE_SYNCED]) - sync = sth->times[TXG_STATE_SYNCED] - - sth->times[TXG_STATE_WAIT_FOR_SYNC]; - - seq_printf(f, "%-8llu %-16llu %-5c %-12llu " - "%-12llu %-12llu %-8llu %-8llu %-12llu %-12llu %-12llu %-12llu\n", - (longlong_t)sth->txg, sth->times[TXG_STATE_BIRTH], state, - (u_longlong_t)sth->ndirty, - (u_longlong_t)sth->nread, (u_longlong_t)sth->nwritten, - (u_longlong_t)sth->reads, (u_longlong_t)sth->writes, - (u_longlong_t)open, (u_longlong_t)quiesce, (u_longlong_t)wait, - (u_longlong_t)sync); - - return (0); -} - -/* Remove oldest elements from list until there are no more than 'size' left */ -static void -spa_txg_history_truncate(spa_history_list_t *shl, unsigned int size) -{ - spa_txg_history_t *sth; - while (shl->size > size) { - sth = list_remove_head(&shl->procfs_list.pl_list); - ASSERT3P(sth, !=, NULL); - kmem_free(sth, sizeof (spa_txg_history_t)); - shl->size--; - } - - if (size == 0) - ASSERT(list_is_empty(&shl->procfs_list.pl_list)); - -} - -static int -spa_txg_history_clear(procfs_list_t *procfs_list) -{ - spa_history_list_t *shl = procfs_list->pl_private; - mutex_enter(&procfs_list->pl_lock); - spa_txg_history_truncate(shl, 0); - mutex_exit(&procfs_list->pl_lock); - return (0); -} - -static void -spa_txg_history_init(spa_t *spa) -{ - spa_history_list_t *shl = &spa->spa_stats.txg_history; - char 
*module; - - shl->size = 0; - - module = kmem_asprintf("zfs/%s", spa_name(spa)); - - shl->procfs_list.pl_private = shl; - procfs_list_install(module, - "txgs", - 0644, - &shl->procfs_list, - spa_txg_history_show, - spa_txg_history_show_header, - spa_txg_history_clear, - offsetof(spa_txg_history_t, sth_node)); - - strfree(module); -} - -static void -spa_txg_history_destroy(spa_t *spa) -{ - spa_history_list_t *shl = &spa->spa_stats.txg_history; - procfs_list_uninstall(&shl->procfs_list); - spa_txg_history_truncate(shl, 0); - procfs_list_destroy(&shl->procfs_list); -} - -/* - * Add a new txg to historical record. - */ -void -spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time) -{ - spa_history_list_t *shl = &spa->spa_stats.txg_history; - spa_txg_history_t *sth; - - if (zfs_txg_history == 0 && shl->size == 0) - return; - - sth = kmem_zalloc(sizeof (spa_txg_history_t), KM_SLEEP); - sth->txg = txg; - sth->state = TXG_STATE_OPEN; - sth->times[TXG_STATE_BIRTH] = birth_time; - - mutex_enter(&shl->procfs_list.pl_lock); - procfs_list_add(&shl->procfs_list, sth); - shl->size++; - spa_txg_history_truncate(shl, zfs_txg_history); - mutex_exit(&shl->procfs_list.pl_lock); -} - -/* - * Set txg state completion time and increment current state. - */ -int -spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state, - hrtime_t completed_time) -{ - spa_history_list_t *shl = &spa->spa_stats.txg_history; - spa_txg_history_t *sth; - int error = ENOENT; - - if (zfs_txg_history == 0) - return (0); - - mutex_enter(&shl->procfs_list.pl_lock); - for (sth = list_tail(&shl->procfs_list.pl_list); sth != NULL; - sth = list_prev(&shl->procfs_list.pl_list, sth)) { - if (sth->txg == txg) { - sth->times[completed_state] = completed_time; - sth->state++; - error = 0; - break; - } - } - mutex_exit(&shl->procfs_list.pl_lock); - - return (error); -} - -/* - * Set txg IO stats. 
- */ -static int -spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread, - uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t ndirty) -{ - spa_history_list_t *shl = &spa->spa_stats.txg_history; - spa_txg_history_t *sth; - int error = ENOENT; - - if (zfs_txg_history == 0) - return (0); - - mutex_enter(&shl->procfs_list.pl_lock); - for (sth = list_tail(&shl->procfs_list.pl_list); sth != NULL; - sth = list_prev(&shl->procfs_list.pl_list, sth)) { - if (sth->txg == txg) { - sth->nread = nread; - sth->nwritten = nwritten; - sth->reads = reads; - sth->writes = writes; - sth->ndirty = ndirty; - error = 0; - break; - } - } - mutex_exit(&shl->procfs_list.pl_lock); - - return (error); -} - -txg_stat_t * -spa_txg_history_init_io(spa_t *spa, uint64_t txg, dsl_pool_t *dp) -{ - txg_stat_t *ts; - - if (zfs_txg_history == 0) - return (NULL); - - ts = kmem_alloc(sizeof (txg_stat_t), KM_SLEEP); - - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - vdev_get_stats(spa->spa_root_vdev, &ts->vs1); - spa_config_exit(spa, SCL_CONFIG, FTAG); - - ts->txg = txg; - ts->ndirty = dp->dp_dirty_pertxg[txg & TXG_MASK]; - - spa_txg_history_set(spa, txg, TXG_STATE_WAIT_FOR_SYNC, gethrtime()); - - return (ts); -} - -void -spa_txg_history_fini_io(spa_t *spa, txg_stat_t *ts) -{ - if (ts == NULL) - return; - - if (zfs_txg_history == 0) { - kmem_free(ts, sizeof (txg_stat_t)); - return; - } - - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - vdev_get_stats(spa->spa_root_vdev, &ts->vs2); - spa_config_exit(spa, SCL_CONFIG, FTAG); - - spa_txg_history_set(spa, ts->txg, TXG_STATE_SYNCED, gethrtime()); - spa_txg_history_set_io(spa, ts->txg, - ts->vs2.vs_bytes[ZIO_TYPE_READ] - ts->vs1.vs_bytes[ZIO_TYPE_READ], - ts->vs2.vs_bytes[ZIO_TYPE_WRITE] - ts->vs1.vs_bytes[ZIO_TYPE_WRITE], - ts->vs2.vs_ops[ZIO_TYPE_READ] - ts->vs1.vs_ops[ZIO_TYPE_READ], - ts->vs2.vs_ops[ZIO_TYPE_WRITE] - ts->vs1.vs_ops[ZIO_TYPE_WRITE], - ts->ndirty); - - kmem_free(ts, sizeof (txg_stat_t)); -} - -/* - * ========================================================================== - * SPA TX Assign Histogram Routines - * ========================================================================== - */ - -/* - * Tx statistics - Information exported regarding dmu_tx_assign time. - */ - -/* - * When the kstat is written zero all buckets. When the kstat is read - * count the number of trailing buckets set to zero and update ks_ndata - * such that they are not output. 
- */
-static int
-spa_tx_assign_update(kstat_t *ksp, int rw)
-{
-	spa_t *spa = ksp->ks_private;
-	spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
-	int i;
-
-	if (rw == KSTAT_WRITE) {
-		for (i = 0; i < shk->count; i++)
-			((kstat_named_t *)shk->private)[i].value.ui64 = 0;
-	}
-
-	for (i = shk->count; i > 0; i--)
-		if (((kstat_named_t *)shk->private)[i-1].value.ui64 != 0)
-			break;
-
-	ksp->ks_ndata = i;
-	ksp->ks_data_size = i * sizeof (kstat_named_t);
-
-	return (0);
-}
-
-static void
-spa_tx_assign_init(spa_t *spa)
-{
-	spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
-	char *name;
-	kstat_named_t *ks;
-	kstat_t *ksp;
-	int i;
-
-	mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);
-
-	shk->count = 42; /* power of two buckets for 1ns to 2,199s */
-	shk->size = shk->count * sizeof (kstat_named_t);
-	shk->private = kmem_alloc(shk->size, KM_SLEEP);
-
-	name = kmem_asprintf("zfs/%s", spa_name(spa));
-
-	for (i = 0; i < shk->count; i++) {
-		ks = &((kstat_named_t *)shk->private)[i];
-		ks->data_type = KSTAT_DATA_UINT64;
-		ks->value.ui64 = 0;
-		(void) snprintf(ks->name, KSTAT_STRLEN, "%llu ns",
-		    (u_longlong_t)1 << i);
-	}
-
-	ksp = kstat_create(name, 0, "dmu_tx_assign", "misc",
-	    KSTAT_TYPE_NAMED, 0, KSTAT_FLAG_VIRTUAL);
-	shk->kstat = ksp;
-
-	if (ksp) {
-		ksp->ks_lock = &shk->lock;
-		ksp->ks_data = shk->private;
-		ksp->ks_ndata = shk->count;
-		ksp->ks_data_size = shk->size;
-		ksp->ks_private = spa;
-		ksp->ks_update = spa_tx_assign_update;
-		kstat_install(ksp);
-	}
-	strfree(name);
-}
-
-static void
-spa_tx_assign_destroy(spa_t *spa)
-{
-	spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
-	kstat_t *ksp;
-
-	ksp = shk->kstat;
-	if (ksp)
-		kstat_delete(ksp);
-
-	kmem_free(shk->private, shk->size);
-	mutex_destroy(&shk->lock);
-}
-
-void
-spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs)
-{
-	spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
-	uint64_t idx = 0;
-
-	/*
-	 * The histogram has shk->count buckets; bound the index by the
-	 * bucket count, not shk->size, which is a size in bytes.
-	 */
-	while (((1ULL << idx) < nsecs) && (idx < shk->count - 1))
-		idx++;
-
-	atomic_inc_64(&((kstat_named_t *)shk->private)[idx].value.ui64);
-}
-
-/*
- * ==========================================================================
- * SPA IO History Routines
- * ==========================================================================
- */
-static int
-spa_io_history_update(kstat_t *ksp, int rw)
-{
-	if (rw == KSTAT_WRITE)
-		memset(ksp->ks_data, 0, ksp->ks_data_size);
-
-	return (0);
-}
-
-static void
-spa_io_history_init(spa_t *spa)
-{
-	spa_history_kstat_t *shk = &spa->spa_stats.io_history;
-	char *name;
-	kstat_t *ksp;
-
-	mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);
-
-	name = kmem_asprintf("zfs/%s", spa_name(spa));
-
-	ksp = kstat_create(name, 0, "io", "disk", KSTAT_TYPE_IO, 1, 0);
-	shk->kstat = ksp;
-
-	if (ksp) {
-		ksp->ks_lock = &shk->lock;
-		ksp->ks_private = spa;
-		ksp->ks_update = spa_io_history_update;
-		kstat_install(ksp);
-	}
-	strfree(name);
-}
-
-static void
-spa_io_history_destroy(spa_t *spa)
-{
-	spa_history_kstat_t *shk = &spa->spa_stats.io_history;
-
-	if (shk->kstat)
-		kstat_delete(shk->kstat);
-
-	mutex_destroy(&shk->lock);
-}
-
-/*
- * ==========================================================================
- * SPA MMP History Routines
- * ==========================================================================
- */
-
-/*
- * MMP statistics - Information exported regarding attempted MMP writes.
- * For MMP writes issued, fields used as per comments below.
- * For MMP writes skipped, an entry represents a span of time when - * writes were skipped for same reason (error from mmp_random_leaf). - * Differences are: - * timestamp time first write skipped, if >1 skipped in a row - * mmp_delay delay value at timestamp - * vdev_guid number of writes skipped - * io_error one of enum mmp_error - * duration time span (ns) of skipped writes - */ - -typedef struct spa_mmp_history { - uint64_t mmp_node_id; /* unique # for updates */ - uint64_t txg; /* txg of last sync */ - uint64_t timestamp; /* UTC time MMP write issued */ - uint64_t mmp_delay; /* mmp_thread.mmp_delay at timestamp */ - uint64_t vdev_guid; /* unique ID of leaf vdev */ - char *vdev_path; - int vdev_label; /* vdev label */ - int io_error; /* error status of MMP write */ - hrtime_t error_start; /* hrtime of start of error period */ - hrtime_t duration; /* time from submission to completion */ - procfs_list_node_t smh_node; -} spa_mmp_history_t; - -static int -spa_mmp_history_show_header(struct seq_file *f) -{ - seq_printf(f, "%-10s %-10s %-10s %-6s %-10s %-12s %-24s " - "%-10s %s\n", "id", "txg", "timestamp", "error", "duration", - "mmp_delay", "vdev_guid", "vdev_label", "vdev_path"); - return (0); -} - -static int -spa_mmp_history_show(struct seq_file *f, void *data) -{ - spa_mmp_history_t *smh = (spa_mmp_history_t *)data; - char skip_fmt[] = "%-10llu %-10llu %10llu %#6llx %10lld %12llu %-24llu " - "%-10lld %s\n"; - char write_fmt[] = "%-10llu %-10llu %10llu %6lld %10lld %12llu %-24llu " - "%-10lld %s\n"; - - seq_printf(f, (smh->error_start ? skip_fmt : write_fmt), - (u_longlong_t)smh->mmp_node_id, (u_longlong_t)smh->txg, - (u_longlong_t)smh->timestamp, (longlong_t)smh->io_error, - (longlong_t)smh->duration, (u_longlong_t)smh->mmp_delay, - (u_longlong_t)smh->vdev_guid, (u_longlong_t)smh->vdev_label, - (smh->vdev_path ? smh->vdev_path : "-")); - - return (0); -} - -/* Remove oldest elements from list until there are no more than 'size' left */ -static void -spa_mmp_history_truncate(spa_history_list_t *shl, unsigned int size) -{ - spa_mmp_history_t *smh; - while (shl->size > size) { - smh = list_remove_head(&shl->procfs_list.pl_list); - if (smh->vdev_path) - strfree(smh->vdev_path); - kmem_free(smh, sizeof (spa_mmp_history_t)); - shl->size--; - } - - if (size == 0) - ASSERT(list_is_empty(&shl->procfs_list.pl_list)); - -} - -static int -spa_mmp_history_clear(procfs_list_t *procfs_list) -{ - spa_history_list_t *shl = procfs_list->pl_private; - mutex_enter(&procfs_list->pl_lock); - spa_mmp_history_truncate(shl, 0); - mutex_exit(&procfs_list->pl_lock); - return (0); -} - -static void -spa_mmp_history_init(spa_t *spa) -{ - spa_history_list_t *shl = &spa->spa_stats.mmp_history; - char *module; - - shl->size = 0; - - module = kmem_asprintf("zfs/%s", spa_name(spa)); - - shl->procfs_list.pl_private = shl; - procfs_list_install(module, - "multihost", - 0644, - &shl->procfs_list, - spa_mmp_history_show, - spa_mmp_history_show_header, - spa_mmp_history_clear, - offsetof(spa_mmp_history_t, smh_node)); - - strfree(module); -} - -static void -spa_mmp_history_destroy(spa_t *spa) -{ - spa_history_list_t *shl = &spa->spa_stats.mmp_history; - procfs_list_uninstall(&shl->procfs_list); - spa_mmp_history_truncate(shl, 0); - procfs_list_destroy(&shl->procfs_list); -} - -/* - * Set duration in existing "skip" record to how long we have waited for a leaf - * vdev to become available. 
- *
- * It is important that we start the search at the tail of the list,
- * where new records are inserted, so this is normally an O(1)
- * operation.
- */
-int
-spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_node_id)
-{
-	spa_history_list_t *shl = &spa->spa_stats.mmp_history;
-	spa_mmp_history_t *smh;
-	int error = ENOENT;
-
-	if (zfs_multihost_history == 0 && shl->size == 0)
-		return (0);
-
-	mutex_enter(&shl->procfs_list.pl_lock);
-	for (smh = list_tail(&shl->procfs_list.pl_list); smh != NULL;
-	    smh = list_prev(&shl->procfs_list.pl_list, smh)) {
-		if (smh->mmp_node_id == mmp_node_id) {
-			ASSERT3U(smh->io_error, !=, 0);
-			smh->duration = gethrtime() - smh->error_start;
-			smh->vdev_guid++;
-			error = 0;
-			break;
-		}
-	}
-	mutex_exit(&shl->procfs_list.pl_lock);
-
-	return (error);
-}
-
-/*
- * Set MMP write duration and error status in an existing record.
- * See comment re: search order above spa_mmp_history_set_skip().
- */
-int
-spa_mmp_history_set(spa_t *spa, uint64_t mmp_node_id, int io_error,
-    hrtime_t duration)
-{
-	spa_history_list_t *shl = &spa->spa_stats.mmp_history;
-	spa_mmp_history_t *smh;
-	int error = ENOENT;
-
-	if (zfs_multihost_history == 0 && shl->size == 0)
-		return (0);
-
-	mutex_enter(&shl->procfs_list.pl_lock);
-	for (smh = list_tail(&shl->procfs_list.pl_list); smh != NULL;
-	    smh = list_prev(&shl->procfs_list.pl_list, smh)) {
-		if (smh->mmp_node_id == mmp_node_id) {
-			ASSERT(smh->io_error == 0);
-			smh->io_error = io_error;
-			smh->duration = duration;
-			error = 0;
-			break;
-		}
-	}
-	mutex_exit(&shl->procfs_list.pl_lock);
-
-	return (error);
-}
-
-/*
- * Add a new MMP historical record.
- * error == 0 : a write was issued.
- * error != 0 : a write was not issued because no leaves were found.
- */
-void
-spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp,
-    uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_node_id,
-    int error)
-{
-	spa_history_list_t *shl = &spa->spa_stats.mmp_history;
-	spa_mmp_history_t *smh;
-
-	if (zfs_multihost_history == 0 && shl->size == 0)
-		return;
-
-	smh = kmem_zalloc(sizeof (spa_mmp_history_t), KM_SLEEP);
-	smh->txg = txg;
-	smh->timestamp = timestamp;
-	smh->mmp_delay = mmp_delay;
-	if (vd) {
-		smh->vdev_guid = vd->vdev_guid;
-		if (vd->vdev_path)
-			smh->vdev_path = strdup(vd->vdev_path);
-	}
-	smh->vdev_label = label;
-	smh->mmp_node_id = mmp_node_id;
-
-	if (error) {
-		smh->io_error = error;
-		smh->error_start = gethrtime();
-		smh->vdev_guid = 1;
-	}
-
-	mutex_enter(&shl->procfs_list.pl_lock);
-	procfs_list_add(&shl->procfs_list, smh);
-	shl->size++;
-	spa_mmp_history_truncate(shl, zfs_multihost_history);
-	mutex_exit(&shl->procfs_list.pl_lock);
-}
-
-static void *
-spa_state_addr(kstat_t *ksp, loff_t n)
-{
-	return (ksp->ks_private);	/* return the spa_t */
-}
-
-static int
-spa_state_data(char *buf, size_t size, void *data)
-{
-	spa_t *spa = (spa_t *)data;
-	(void) snprintf(buf, size, "%s\n", spa_state_to_name(spa));
-	return (0);
-}
-
-/*
- * Return the state of the pool in /proc/spl/kstat/zfs/<pool>/state.
- *
- * This is a lock-less read of the pool's state (unlike using 'zpool', which
- * can potentially block for seconds). Because it doesn't block, it can be
- * useful as a pool heartbeat value.
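- *
- * A minimal user-space polling sketch (illustrative only; assumes
- * <stdio.h> and a hypothetical pool named "tank"):
- *
- *	char state[32];
- *	FILE *fp = fopen("/proc/spl/kstat/zfs/tank/state", "r");
- *	if (fp != NULL && fgets(state, sizeof (state), fp) != NULL)
- *		(void) printf("pool state: %s", state);
- *	if (fp != NULL)
- *		(void) fclose(fp);
- *
- * which prints a single line such as "ONLINE" or "DEGRADED".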
- */ -static void -spa_state_init(spa_t *spa) -{ - spa_history_kstat_t *shk = &spa->spa_stats.state; - char *name; - kstat_t *ksp; - - mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); - - name = kmem_asprintf("zfs/%s", spa_name(spa)); - ksp = kstat_create(name, 0, "state", "misc", - KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); - - shk->kstat = ksp; - if (ksp) { - ksp->ks_lock = &shk->lock; - ksp->ks_data = NULL; - ksp->ks_private = spa; - ksp->ks_flags |= KSTAT_FLAG_NO_HEADERS; - kstat_set_raw_ops(ksp, NULL, spa_state_data, spa_state_addr); - kstat_install(ksp); - } - - strfree(name); -} - -static void -spa_health_destroy(spa_t *spa) -{ - spa_history_kstat_t *shk = &spa->spa_stats.state; - kstat_t *ksp = shk->kstat; - if (ksp) - kstat_delete(ksp); - - mutex_destroy(&shk->lock); -} - -static spa_iostats_t spa_iostats_template = { - { "trim_extents_written", KSTAT_DATA_UINT64 }, - { "trim_bytes_written", KSTAT_DATA_UINT64 }, - { "trim_extents_skipped", KSTAT_DATA_UINT64 }, - { "trim_bytes_skipped", KSTAT_DATA_UINT64 }, - { "trim_extents_failed", KSTAT_DATA_UINT64 }, - { "trim_bytes_failed", KSTAT_DATA_UINT64 }, - { "autotrim_extents_written", KSTAT_DATA_UINT64 }, - { "autotrim_bytes_written", KSTAT_DATA_UINT64 }, - { "autotrim_extents_skipped", KSTAT_DATA_UINT64 }, - { "autotrim_bytes_skipped", KSTAT_DATA_UINT64 }, - { "autotrim_extents_failed", KSTAT_DATA_UINT64 }, - { "autotrim_bytes_failed", KSTAT_DATA_UINT64 }, -}; - -#define SPA_IOSTATS_ADD(stat, val) \ - atomic_add_64(&iostats->stat.value.ui64, (val)); - -void -spa_iostats_trim_add(spa_t *spa, trim_type_t type, - uint64_t extents_written, uint64_t bytes_written, - uint64_t extents_skipped, uint64_t bytes_skipped, - uint64_t extents_failed, uint64_t bytes_failed) -{ - spa_history_kstat_t *shk = &spa->spa_stats.iostats; - kstat_t *ksp = shk->kstat; - spa_iostats_t *iostats; - - if (ksp == NULL) - return; - - iostats = ksp->ks_data; - if (type == TRIM_TYPE_MANUAL) { - SPA_IOSTATS_ADD(trim_extents_written, extents_written); - SPA_IOSTATS_ADD(trim_bytes_written, bytes_written); - SPA_IOSTATS_ADD(trim_extents_skipped, extents_skipped); - SPA_IOSTATS_ADD(trim_bytes_skipped, bytes_skipped); - SPA_IOSTATS_ADD(trim_extents_failed, extents_failed); - SPA_IOSTATS_ADD(trim_bytes_failed, bytes_failed); - } else { - SPA_IOSTATS_ADD(autotrim_extents_written, extents_written); - SPA_IOSTATS_ADD(autotrim_bytes_written, bytes_written); - SPA_IOSTATS_ADD(autotrim_extents_skipped, extents_skipped); - SPA_IOSTATS_ADD(autotrim_bytes_skipped, bytes_skipped); - SPA_IOSTATS_ADD(autotrim_extents_failed, extents_failed); - SPA_IOSTATS_ADD(autotrim_bytes_failed, bytes_failed); - } -} - -int -spa_iostats_update(kstat_t *ksp, int rw) -{ - if (rw == KSTAT_WRITE) { - memcpy(ksp->ks_data, &spa_iostats_template, - sizeof (spa_iostats_t)); - } - - return (0); -} - -static void -spa_iostats_init(spa_t *spa) -{ - spa_history_kstat_t *shk = &spa->spa_stats.iostats; - - mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); - - char *name = kmem_asprintf("zfs/%s", spa_name(spa)); - kstat_t *ksp = kstat_create(name, 0, "iostats", "misc", - KSTAT_TYPE_NAMED, sizeof (spa_iostats_t) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL); - - shk->kstat = ksp; - if (ksp) { - int size = sizeof (spa_iostats_t); - ksp->ks_lock = &shk->lock; - ksp->ks_private = spa; - ksp->ks_update = spa_iostats_update; - ksp->ks_data = kmem_alloc(size, KM_SLEEP); - memcpy(ksp->ks_data, &spa_iostats_template, size); - kstat_install(ksp); - } - - strfree(name); -} - -static void -spa_iostats_destroy(spa_t *spa) -{ 
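-	/*
-	 * Unlike the other per-pool kstats above, iostats allocates its
-	 * own ks_data buffer in spa_iostats_init(), so free it before
-	 * deleting the kstat.
-	 */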
- spa_history_kstat_t *shk = &spa->spa_stats.iostats; - kstat_t *ksp = shk->kstat; - if (ksp) { - kmem_free(ksp->ks_data, sizeof (spa_iostats_t)); - kstat_delete(ksp); - } - - mutex_destroy(&shk->lock); -} - -void -spa_stats_init(spa_t *spa) -{ - spa_read_history_init(spa); - spa_txg_history_init(spa); - spa_tx_assign_init(spa); - spa_io_history_init(spa); - spa_mmp_history_init(spa); - spa_state_init(spa); - spa_iostats_init(spa); -} - -void -spa_stats_destroy(spa_t *spa) -{ - spa_iostats_destroy(spa); - spa_health_destroy(spa); - spa_tx_assign_destroy(spa); - spa_txg_history_destroy(spa); - spa_read_history_destroy(spa); - spa_io_history_destroy(spa); - spa_mmp_history_destroy(spa); -} - -#if defined(_KERNEL) -/* CSTYLED */ -module_param(zfs_read_history, int, 0644); -MODULE_PARM_DESC(zfs_read_history, - "Historical statistics for the last N reads"); - -module_param(zfs_read_history_hits, int, 0644); -MODULE_PARM_DESC(zfs_read_history_hits, - "Include cache hits in read history"); - -module_param(zfs_txg_history, int, 0644); -MODULE_PARM_DESC(zfs_txg_history, - "Historical statistics for the last N txgs"); - -module_param(zfs_multihost_history, int, 0644); -MODULE_PARM_DESC(zfs_multihost_history, - "Historical statistics for last N multihost writes"); -/* END CSTYLED */ -#endif diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c deleted file mode 100644 index 21f9ae454..000000000 --- a/module/zfs/vdev_disk.c +++ /dev/null @@ -1,954 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Rewritten for Linux by Brian Behlendorf <[email protected]>. - * LLNL-CODE-403049. - * Copyright (c) 2012, 2019 by Delphix. All rights reserved. - */ - -#include <sys/zfs_context.h> -#include <sys/spa_impl.h> -#include <sys/vdev_disk.h> -#include <sys/vdev_impl.h> -#include <sys/vdev_trim.h> -#include <sys/abd.h> -#include <sys/fs/zfs.h> -#include <sys/zio.h> -#include <linux/msdos_fs.h> -#include <linux/vfs_compat.h> - -char *zfs_vdev_scheduler = VDEV_SCHEDULER; -static void *zfs_vdev_holder = VDEV_HOLDER; - -/* size of the "reserved" partition, in blocks */ -#define EFI_MIN_RESV_SIZE (16 * 1024) - -/* - * Virtual device vector for disks. 
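- *
- * Each ZIO issued to a disk vdev is tracked by a dio_request_t (below),
- * which fans the request out into one or more Linux bio structures and
- * reference counts their completions via vdev_disk_dio_get()/_put().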
- */ -typedef struct dio_request { - zio_t *dr_zio; /* Parent ZIO */ - atomic_t dr_ref; /* References */ - int dr_error; /* Bio error */ - int dr_bio_count; /* Count of bio's */ - struct bio *dr_bio[0]; /* Attached bio's */ -} dio_request_t; - - -#if defined(HAVE_OPEN_BDEV_EXCLUSIVE) || defined(HAVE_BLKDEV_GET_BY_PATH) -static fmode_t -vdev_bdev_mode(int smode) -{ - fmode_t mode = 0; - - ASSERT3S(smode & (FREAD | FWRITE), !=, 0); - - if (smode & FREAD) - mode |= FMODE_READ; - - if (smode & FWRITE) - mode |= FMODE_WRITE; - - return (mode); -} -#else -static int -vdev_bdev_mode(int smode) -{ - int mode = 0; - - ASSERT3S(smode & (FREAD | FWRITE), !=, 0); - - if ((smode & FREAD) && !(smode & FWRITE)) - mode = SB_RDONLY; - - return (mode); -} -#endif /* HAVE_OPEN_BDEV_EXCLUSIVE */ - -/* - * Returns the usable capacity (in bytes) for the partition or disk. - */ -static uint64_t -bdev_capacity(struct block_device *bdev) -{ - return (i_size_read(bdev->bd_inode)); -} - -/* - * Returns the maximum expansion capacity of the block device (in bytes). - * - * It is possible to expand a vdev when it has been created as a wholedisk - * and the containing block device has increased in capacity. Or when the - * partition containing the pool has been manually increased in size. - * - * This function is only responsible for calculating the potential expansion - * size so it can be reported by 'zpool list'. The efi_use_whole_disk() is - * responsible for verifying the expected partition layout in the wholedisk - * case, and updating the partition table if appropriate. Once the partition - * size has been increased the additional capacity will be visible using - * bdev_capacity(). - * - * The returned maximum expansion capacity is always expected to be larger, or - * at the very least equal, to its usable capacity to prevent overestimating - * the pool expandsize. - */ -static uint64_t -bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk) -{ - uint64_t psize; - int64_t available; - - if (wholedisk && bdev->bd_part != NULL && bdev != bdev->bd_contains) { - /* - * When reporting maximum expansion capacity for a wholedisk - * deduct any capacity which is expected to be lost due to - * alignment restrictions. Over reporting this value isn't - * harmful and would only result in slightly less capacity - * than expected post expansion. - * The estimated available space may be slightly smaller than - * bdev_capacity() for devices where the number of sectors is - * not a multiple of the alignment size and the partition layout - * is keeping less than PARTITION_END_ALIGNMENT bytes after the - * "reserved" EFI partition: in such cases return the device - * usable capacity. - */ - available = i_size_read(bdev->bd_contains->bd_inode) - - ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK + - PARTITION_END_ALIGNMENT) << SECTOR_BITS); - psize = MAX(available, bdev_capacity(bdev)); - } else { - psize = bdev_capacity(bdev); - } - - return (psize); -} - -static void -vdev_disk_error(zio_t *zio) -{ - /* - * This function can be called in interrupt context, for instance while - * handling IRQs coming from a misbehaving disk device; use printk() - * which is safe from any context. - */ - printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d " - "offset=%llu size=%llu flags=%x\n", spa_name(zio->io_spa), - zio->io_vd->vdev_path, zio->io_error, zio->io_type, - (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size, - zio->io_flags); -} - -/* - * Use the Linux 'noop' elevator for zfs managed block devices. 
This
- * strikes the ideal balance by allowing the zfs elevator to do all
- * request ordering and prioritization, while allowing the Linux
- * elevator to do the maximum front/back merging allowed by the
- * physical device.  This yields the largest possible requests for
- * the device with the lowest total overhead.
- */
-static void
-vdev_elevator_switch(vdev_t *v, char *elevator)
-{
-	vdev_disk_t *vd = v->vdev_tsd;
-	struct request_queue *q;
-	char *device;
-	int error;
-
-	for (int c = 0; c < v->vdev_children; c++)
-		vdev_elevator_switch(v->vdev_child[c], elevator);
-
-	if (!v->vdev_ops->vdev_op_leaf || vd->vd_bdev == NULL)
-		return;
-
-	q = bdev_get_queue(vd->vd_bdev);
-	device = vd->vd_bdev->bd_disk->disk_name;
-
-	/*
-	 * Skip devices which are not whole disks (partitions).
-	 * Device-mapper devices are excepted since they may be whole
-	 * disks despite the vdev_wholedisk flag, in which case we can
-	 * and should switch the elevator. If the device-mapper device
-	 * does not have an elevator (i.e. dm-raid, dm-crypt, etc.) the
-	 * "Skip devices without schedulers" check below will fail.
-	 */
-	if (!v->vdev_wholedisk && strncmp(device, "dm-", 3) != 0)
-		return;
-
-	/* Leave existing scheduler when set to "none" */
-	if ((strncmp(elevator, "none", 4) == 0) && (strlen(elevator) == 4))
-		return;
-
-	/*
-	 * The elevator_change() function was available in kernels from
-	 * 2.6.36 to 4.11. When not available, fall back to using the user
-	 * mode helper functionality to set the elevator via sysfs. This
-	 * requires /bin/echo and sysfs to be mounted, which may not be true
-	 * early in the boot process.
-	 */
-#ifdef HAVE_ELEVATOR_CHANGE
-	error = elevator_change(q, elevator);
-#else
-#define	SET_SCHEDULER_CMD \
-	"exec 0</dev/null " \
-	" 1>/sys/block/%s/queue/scheduler " \
-	" 2>/dev/null; " \
-	"echo %s"
-
-	char *argv[] = { "/bin/sh", "-c", NULL, NULL };
-	char *envp[] = { NULL };
-
-	argv[2] = kmem_asprintf(SET_SCHEDULER_CMD, device, elevator);
-	error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
-	strfree(argv[2]);
-#endif /* HAVE_ELEVATOR_CHANGE */
-	if (error) {
-		zfs_dbgmsg("Unable to set \"%s\" scheduler for %s (%s): %d",
-		    elevator, v->vdev_path, device, error);
-	}
-}
-
-static int
-vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
-    uint64_t *ashift)
-{
-	struct block_device *bdev;
-	fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa));
-	int count = 0, block_size;
-	int bdev_retry_count = 50;
-	vdev_disk_t *vd;
-
-	/* Must have a pathname and it must be absolute. */
-	if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
-		v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
-		vdev_dbgmsg(v, "invalid vdev_path");
-		return (SET_ERROR(EINVAL));
-	}
-
-	/*
-	 * Reopen the device if it is currently open.  When expanding a
-	 * partition, force re-scanning of the partition table while closed
-	 * in order to get an accurate updated block device size.  Then,
-	 * since udev may need to recreate the device links, increase the
-	 * open retry count before reporting the device as unavailable.
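-	 *
-	 * (Each ENOENT retry below is intended to wait about 10ms, so the
-	 * default retry count of 50 gives udev roughly half a second to
-	 * settle; 100 retries are allowed after a partition table re-scan.)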
- */ - vd = v->vdev_tsd; - if (vd) { - char disk_name[BDEVNAME_SIZE + 6] = "/dev/"; - boolean_t reread_part = B_FALSE; - - rw_enter(&vd->vd_lock, RW_WRITER); - bdev = vd->vd_bdev; - vd->vd_bdev = NULL; - - if (bdev) { - if (v->vdev_expanding && bdev != bdev->bd_contains) { - bdevname(bdev->bd_contains, disk_name + 5); - reread_part = B_TRUE; - } - - vdev_bdev_close(bdev, mode); - } - - if (reread_part) { - bdev = vdev_bdev_open(disk_name, mode, zfs_vdev_holder); - if (!IS_ERR(bdev)) { - int error = vdev_bdev_reread_part(bdev); - vdev_bdev_close(bdev, mode); - if (error == 0) - bdev_retry_count = 100; - } - } - } else { - vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); - - rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL); - rw_enter(&vd->vd_lock, RW_WRITER); - } - - /* - * Devices are always opened by the path provided at configuration - * time. This means that if the provided path is a udev by-id path - * then drives may be re-cabled without an issue. If the provided - * path is a udev by-path path, then the physical location information - * will be preserved. This can be critical for more complicated - * configurations where drives are located in specific physical - * locations to maximize the systems tolerance to component failure. - * - * Alternatively, you can provide your own udev rule to flexibly map - * the drives as you see fit. It is not advised that you use the - * /dev/[hd]d devices which may be reordered due to probing order. - * Devices in the wrong locations will be detected by the higher - * level vdev validation. - * - * The specified paths may be briefly removed and recreated in - * response to udev events. This should be exceptionally unlikely - * because the zpool command makes every effort to verify these paths - * have already settled prior to reaching this point. Therefore, - * a ENOENT failure at this point is highly likely to be transient - * and it is reasonable to sleep and retry before giving up. In - * practice delays have been observed to be on the order of 100ms. - */ - bdev = ERR_PTR(-ENXIO); - while (IS_ERR(bdev) && count < bdev_retry_count) { - bdev = vdev_bdev_open(v->vdev_path, mode, zfs_vdev_holder); - if (unlikely(PTR_ERR(bdev) == -ENOENT)) { - schedule_timeout(MSEC_TO_TICK(10)); - count++; - } else if (IS_ERR(bdev)) { - break; - } - } - - if (IS_ERR(bdev)) { - int error = -PTR_ERR(bdev); - vdev_dbgmsg(v, "open error=%d count=%d", error, count); - vd->vd_bdev = NULL; - v->vdev_tsd = vd; - rw_exit(&vd->vd_lock); - return (SET_ERROR(error)); - } else { - vd->vd_bdev = bdev; - v->vdev_tsd = vd; - rw_exit(&vd->vd_lock); - } - - struct request_queue *q = bdev_get_queue(vd->vd_bdev); - - /* Determine the physical block size */ - block_size = vdev_bdev_block_size(vd->vd_bdev); - - /* Clear the nowritecache bit, causes vdev_reopen() to try again. */ - v->vdev_nowritecache = B_FALSE; - - /* Set when device reports it supports TRIM. */ - v->vdev_has_trim = !!blk_queue_discard(q); - - /* Set when device reports it supports secure TRIM. 
*/ - v->vdev_has_securetrim = !!blk_queue_discard_secure(q); - - /* Inform the ZIO pipeline that we are non-rotational */ - v->vdev_nonrot = blk_queue_nonrot(q); - - /* Physical volume size in bytes for the partition */ - *psize = bdev_capacity(vd->vd_bdev); - - /* Physical volume size in bytes including possible expansion space */ - *max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk); - - /* Based on the minimum sector size set the block size */ - *ashift = highbit64(MAX(block_size, SPA_MINBLOCKSIZE)) - 1; - - /* Try to set the io scheduler elevator algorithm */ - (void) vdev_elevator_switch(v, zfs_vdev_scheduler); - - return (0); -} - -static void -vdev_disk_close(vdev_t *v) -{ - vdev_disk_t *vd = v->vdev_tsd; - - if (v->vdev_reopening || vd == NULL) - return; - - if (vd->vd_bdev != NULL) { - vdev_bdev_close(vd->vd_bdev, - vdev_bdev_mode(spa_mode(v->vdev_spa))); - } - - rw_destroy(&vd->vd_lock); - kmem_free(vd, sizeof (vdev_disk_t)); - v->vdev_tsd = NULL; -} - -static dio_request_t * -vdev_disk_dio_alloc(int bio_count) -{ - dio_request_t *dr; - int i; - - dr = kmem_zalloc(sizeof (dio_request_t) + - sizeof (struct bio *) * bio_count, KM_SLEEP); - if (dr) { - atomic_set(&dr->dr_ref, 0); - dr->dr_bio_count = bio_count; - dr->dr_error = 0; - - for (i = 0; i < dr->dr_bio_count; i++) - dr->dr_bio[i] = NULL; - } - - return (dr); -} - -static void -vdev_disk_dio_free(dio_request_t *dr) -{ - int i; - - for (i = 0; i < dr->dr_bio_count; i++) - if (dr->dr_bio[i]) - bio_put(dr->dr_bio[i]); - - kmem_free(dr, sizeof (dio_request_t) + - sizeof (struct bio *) * dr->dr_bio_count); -} - -static void -vdev_disk_dio_get(dio_request_t *dr) -{ - atomic_inc(&dr->dr_ref); -} - -static int -vdev_disk_dio_put(dio_request_t *dr) -{ - int rc = atomic_dec_return(&dr->dr_ref); - - /* - * Free the dio_request when the last reference is dropped and - * ensure zio_interpret is called only once with the correct zio - */ - if (rc == 0) { - zio_t *zio = dr->dr_zio; - int error = dr->dr_error; - - vdev_disk_dio_free(dr); - - if (zio) { - zio->io_error = error; - ASSERT3S(zio->io_error, >=, 0); - if (zio->io_error) - vdev_disk_error(zio); - - zio_delay_interrupt(zio); - } - } - - return (rc); -} - -BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error) -{ - dio_request_t *dr = bio->bi_private; - int rc; - - if (dr->dr_error == 0) { -#ifdef HAVE_1ARG_BIO_END_IO_T - dr->dr_error = BIO_END_IO_ERROR(bio); -#else - if (error) - dr->dr_error = -(error); - else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - dr->dr_error = EIO; -#endif - } - - /* Drop reference acquired by __vdev_disk_physio */ - rc = vdev_disk_dio_put(dr); -} - -static unsigned int -bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size) -{ - unsigned int offset, size, i; - struct page *page; - - offset = offset_in_page(bio_ptr); - for (i = 0; i < bio->bi_max_vecs; i++) { - size = PAGE_SIZE - offset; - - if (bio_size <= 0) - break; - - if (size > bio_size) - size = bio_size; - - if (is_vmalloc_addr(bio_ptr)) - page = vmalloc_to_page(bio_ptr); - else - page = virt_to_page(bio_ptr); - - /* - * Some network related block device uses tcp_sendpage, which - * doesn't behave well when using 0-count page, this is a - * safety net to catch them. 
- */ - ASSERT3S(page_count(page), >, 0); - - if (bio_add_page(bio, page, size, offset) != size) - break; - - bio_ptr += size; - bio_size -= size; - offset = 0; - } - - return (bio_size); -} - -static unsigned int -bio_map_abd_off(struct bio *bio, abd_t *abd, unsigned int size, size_t off) -{ - if (abd_is_linear(abd)) - return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, size)); - - return (abd_scatter_bio_map_off(bio, abd, size, off)); -} - -static inline void -vdev_submit_bio_impl(struct bio *bio) -{ -#ifdef HAVE_1ARG_SUBMIT_BIO - submit_bio(bio); -#else - submit_bio(0, bio); -#endif -} - -#ifdef HAVE_BIO_SET_DEV -#if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY) -/* - * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the - * GPL-only bio_associate_blkg() symbol thus inadvertently converting - * the entire macro. Provide a minimal version which always assigns the - * request queue's root_blkg to the bio. - */ -static inline void -vdev_bio_associate_blkg(struct bio *bio) -{ - struct request_queue *q = bio->bi_disk->queue; - - ASSERT3P(q, !=, NULL); - ASSERT3P(bio->bi_blkg, ==, NULL); - - if (blkg_tryget(q->root_blkg)) - bio->bi_blkg = q->root_blkg; -} -#define bio_associate_blkg vdev_bio_associate_blkg -#endif -#else -/* - * Provide a bio_set_dev() helper macro for pre-Linux 4.14 kernels. - */ -static inline void -bio_set_dev(struct bio *bio, struct block_device *bdev) -{ - bio->bi_bdev = bdev; -} -#endif /* HAVE_BIO_SET_DEV */ - -static inline void -vdev_submit_bio(struct bio *bio) -{ -#ifdef HAVE_CURRENT_BIO_TAIL - struct bio **bio_tail = current->bio_tail; - current->bio_tail = NULL; - vdev_submit_bio_impl(bio); - current->bio_tail = bio_tail; -#else - struct bio_list *bio_list = current->bio_list; - current->bio_list = NULL; - vdev_submit_bio_impl(bio); - current->bio_list = bio_list; -#endif -} - -static int -__vdev_disk_physio(struct block_device *bdev, zio_t *zio, - size_t io_size, uint64_t io_offset, int rw, int flags) -{ - dio_request_t *dr; - uint64_t abd_offset; - uint64_t bio_offset; - int bio_size, bio_count = 16; - int i = 0, error = 0; -#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG) - struct blk_plug plug; -#endif - /* - * Accessing outside the block device is never allowed. - */ - if (io_offset + io_size > bdev->bd_inode->i_size) { - vdev_dbgmsg(zio->io_vd, - "Illegal access %llu size %llu, device size %llu", - io_offset, io_size, i_size_read(bdev->bd_inode)); - return (SET_ERROR(EIO)); - } - -retry: - dr = vdev_disk_dio_alloc(bio_count); - if (dr == NULL) - return (SET_ERROR(ENOMEM)); - - if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD))) - bio_set_flags_failfast(bdev, &flags); - - dr->dr_zio = zio; - - /* - * When the IO size exceeds the maximum bio size for the request - * queue we are forced to break the IO in multiple bio's and wait - * for them all to complete. Ideally, all pool users will set - * their volume block size to match the maximum request size and - * the common case will be one bio per vdev IO request. - */ - - abd_offset = 0; - bio_offset = io_offset; - bio_size = io_size; - for (i = 0; i <= dr->dr_bio_count; i++) { - - /* Finished constructing bio's for given buffer */ - if (bio_size <= 0) - break; - - /* - * By default only 'bio_count' bio's per dio are allowed. - * However, if we find ourselves in a situation where more - * are needed we allocate a larger dio and warn the user. 
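-		 * In that case bio_count doubles on each retry pass (16, 32,
-		 * 64, ...) until the buffer maps into the available bio slots.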
- */ - if (dr->dr_bio_count == i) { - vdev_disk_dio_free(dr); - bio_count *= 2; - goto retry; - } - - /* bio_alloc() with __GFP_WAIT never returns NULL */ - dr->dr_bio[i] = bio_alloc(GFP_NOIO, - MIN(abd_nr_pages_off(zio->io_abd, bio_size, abd_offset), - BIO_MAX_PAGES)); - if (unlikely(dr->dr_bio[i] == NULL)) { - vdev_disk_dio_free(dr); - return (SET_ERROR(ENOMEM)); - } - - /* Matching put called by vdev_disk_physio_completion */ - vdev_disk_dio_get(dr); - - bio_set_dev(dr->dr_bio[i], bdev); - BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9; - dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion; - dr->dr_bio[i]->bi_private = dr; - bio_set_op_attrs(dr->dr_bio[i], rw, flags); - - /* Remaining size is returned to become the new size */ - bio_size = bio_map_abd_off(dr->dr_bio[i], zio->io_abd, - bio_size, abd_offset); - - /* Advance in buffer and construct another bio if needed */ - abd_offset += BIO_BI_SIZE(dr->dr_bio[i]); - bio_offset += BIO_BI_SIZE(dr->dr_bio[i]); - } - - /* Extra reference to protect dio_request during vdev_submit_bio */ - vdev_disk_dio_get(dr); - -#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG) - if (dr->dr_bio_count > 1) - blk_start_plug(&plug); -#endif - - /* Submit all bio's associated with this dio */ - for (i = 0; i < dr->dr_bio_count; i++) - if (dr->dr_bio[i]) - vdev_submit_bio(dr->dr_bio[i]); - -#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG) - if (dr->dr_bio_count > 1) - blk_finish_plug(&plug); -#endif - - (void) vdev_disk_dio_put(dr); - - return (error); -} - -BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error) -{ - zio_t *zio = bio->bi_private; -#ifdef HAVE_1ARG_BIO_END_IO_T - zio->io_error = BIO_END_IO_ERROR(bio); -#else - zio->io_error = -error; -#endif - - if (zio->io_error && (zio->io_error == EOPNOTSUPP)) - zio->io_vd->vdev_nowritecache = B_TRUE; - - bio_put(bio); - ASSERT3S(zio->io_error, >=, 0); - if (zio->io_error) - vdev_disk_error(zio); - zio_interrupt(zio); -} - -static int -vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) -{ - struct request_queue *q; - struct bio *bio; - - q = bdev_get_queue(bdev); - if (!q) - return (SET_ERROR(ENXIO)); - - bio = bio_alloc(GFP_NOIO, 0); - /* bio_alloc() with __GFP_WAIT never returns NULL */ - if (unlikely(bio == NULL)) - return (SET_ERROR(ENOMEM)); - - bio->bi_end_io = vdev_disk_io_flush_completion; - bio->bi_private = zio; - bio_set_dev(bio, bdev); - bio_set_flush(bio); - vdev_submit_bio(bio); - invalidate_bdev(bdev); - - return (0); -} - -static void -vdev_disk_io_start(zio_t *zio) -{ - vdev_t *v = zio->io_vd; - vdev_disk_t *vd = v->vdev_tsd; - unsigned long trim_flags = 0; - int rw, flags, error; - - /* - * If the vdev is closed, it's likely in the REMOVED or FAULTED state. - * Nothing to be done here but return failure. - */ - if (vd == NULL) { - zio->io_error = ENXIO; - zio_interrupt(zio); - return; - } - - rw_enter(&vd->vd_lock, RW_READER); - - /* - * If the vdev is closed, it's likely due to a failed reopen and is - * in the UNAVAIL state. Nothing to be done here but return failure. 
- */ - if (vd->vd_bdev == NULL) { - rw_exit(&vd->vd_lock); - zio->io_error = ENXIO; - zio_interrupt(zio); - return; - } - - switch (zio->io_type) { - case ZIO_TYPE_IOCTL: - - if (!vdev_readable(v)) { - rw_exit(&vd->vd_lock); - zio->io_error = SET_ERROR(ENXIO); - zio_interrupt(zio); - return; - } - - switch (zio->io_cmd) { - case DKIOCFLUSHWRITECACHE: - - if (zfs_nocacheflush) - break; - - if (v->vdev_nowritecache) { - zio->io_error = SET_ERROR(ENOTSUP); - break; - } - - error = vdev_disk_io_flush(vd->vd_bdev, zio); - if (error == 0) { - rw_exit(&vd->vd_lock); - return; - } - - zio->io_error = error; - - break; - - default: - zio->io_error = SET_ERROR(ENOTSUP); - } - - rw_exit(&vd->vd_lock); - zio_execute(zio); - return; - case ZIO_TYPE_WRITE: - rw = WRITE; -#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG) - flags = (1 << BIO_RW_UNPLUG); -#elif defined(REQ_UNPLUG) - flags = REQ_UNPLUG; -#else - flags = 0; -#endif - break; - - case ZIO_TYPE_READ: - rw = READ; -#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG) - flags = (1 << BIO_RW_UNPLUG); -#elif defined(REQ_UNPLUG) - flags = REQ_UNPLUG; -#else - flags = 0; -#endif - break; - - case ZIO_TYPE_TRIM: -#if defined(BLKDEV_DISCARD_SECURE) - if (zio->io_trim_flags & ZIO_TRIM_SECURE) - trim_flags |= BLKDEV_DISCARD_SECURE; -#endif - zio->io_error = -blkdev_issue_discard(vd->vd_bdev, - zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, - trim_flags); - - rw_exit(&vd->vd_lock); - zio_interrupt(zio); - return; - - default: - rw_exit(&vd->vd_lock); - zio->io_error = SET_ERROR(ENOTSUP); - zio_interrupt(zio); - return; - } - - zio->io_target_timestamp = zio_handle_io_delay(zio); - error = __vdev_disk_physio(vd->vd_bdev, zio, - zio->io_size, zio->io_offset, rw, flags); - rw_exit(&vd->vd_lock); - - if (error) { - zio->io_error = error; - zio_interrupt(zio); - return; - } -} - -static void -vdev_disk_io_done(zio_t *zio) -{ - /* - * If the device returned EIO, we revalidate the media. If it is - * determined the media has changed this triggers the asynchronous - * removal of the device from the configuration. - */ - if (zio->io_error == EIO) { - vdev_t *v = zio->io_vd; - vdev_disk_t *vd = v->vdev_tsd; - - if (check_disk_change(vd->vd_bdev)) { - vdev_bdev_invalidate(vd->vd_bdev); - v->vdev_remove_wanted = B_TRUE; - spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); - } - } -} - -static void -vdev_disk_hold(vdev_t *vd) -{ - ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); - - /* We must have a pathname, and it must be absolute. */ - if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') - return; - - /* - * Only prefetch path and devid info if the device has - * never been opened. 
- */ - if (vd->vdev_tsd != NULL) - return; - - /* XXX: Implement me as a vnode lookup for the device */ - vd->vdev_name_vp = NULL; - vd->vdev_devid_vp = NULL; -} - -static void -vdev_disk_rele(vdev_t *vd) -{ - ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); - - /* XXX: Implement me as a vnode rele for the device */ -} - -static int -param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp) -{ - spa_t *spa = NULL; - char *p; - - if (val == NULL) - return (SET_ERROR(-EINVAL)); - - if ((p = strchr(val, '\n')) != NULL) - *p = '\0'; - - if (spa_mode_global != 0) { - mutex_enter(&spa_namespace_lock); - while ((spa = spa_next(spa)) != NULL) { - if (spa_state(spa) != POOL_STATE_ACTIVE || - !spa_writeable(spa) || spa_suspended(spa)) - continue; - - spa_open_ref(spa, FTAG); - mutex_exit(&spa_namespace_lock); - vdev_elevator_switch(spa->spa_root_vdev, (char *)val); - mutex_enter(&spa_namespace_lock); - spa_close(spa, FTAG); - } - mutex_exit(&spa_namespace_lock); - } - - return (param_set_charp(val, kp)); -} - -vdev_ops_t vdev_disk_ops = { - .vdev_op_open = vdev_disk_open, - .vdev_op_close = vdev_disk_close, - .vdev_op_asize = vdev_default_asize, - .vdev_op_io_start = vdev_disk_io_start, - .vdev_op_io_done = vdev_disk_io_done, - .vdev_op_state_change = NULL, - .vdev_op_need_resilver = NULL, - .vdev_op_hold = vdev_disk_hold, - .vdev_op_rele = vdev_disk_rele, - .vdev_op_remap = NULL, - .vdev_op_xlate = vdev_default_xlate, - .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ - .vdev_op_leaf = B_TRUE /* leaf vdev */ -}; - -module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler, - param_get_charp, &zfs_vdev_scheduler, 0644); -MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler"); diff --git a/module/zfs/vdev_file.c b/module/zfs/vdev_file.c deleted file mode 100644 index b79017f3a..000000000 --- a/module/zfs/vdev_file.c +++ /dev/null @@ -1,331 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2016 by Delphix. All rights reserved. - */ - -#include <sys/zfs_context.h> -#include <sys/spa.h> -#include <sys/spa_impl.h> -#include <sys/vdev_file.h> -#include <sys/vdev_impl.h> -#include <sys/vdev_trim.h> -#include <sys/zio.h> -#include <sys/fs/zfs.h> -#include <sys/fm/fs/zfs.h> -#include <sys/abd.h> -#include <sys/fcntl.h> -#include <sys/vnode.h> - -/* - * Virtual device vector for files. 
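- *
- * Reads and writes are serviced through the VFS (vn_rdwr()) from the
- * z_vdev_file taskq, and cache flushes are pushed to that taskq as well
- * whenever VOP_FSYNC() cannot safely be called from the current context.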
- */ - -static taskq_t *vdev_file_taskq; - -static void -vdev_file_hold(vdev_t *vd) -{ - ASSERT(vd->vdev_path != NULL); -} - -static void -vdev_file_rele(vdev_t *vd) -{ - ASSERT(vd->vdev_path != NULL); -} - -static int -vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, - uint64_t *ashift) -{ - vdev_file_t *vf; - vnode_t *vp; - vattr_t vattr; - int error; - - /* - * Rotational optimizations only make sense on block devices. - */ - vd->vdev_nonrot = B_TRUE; - - /* - * Allow TRIM on file based vdevs. This may not always be supported, - * since it depends on your kernel version and underlying filesystem - * type but it is always safe to attempt. - */ - vd->vdev_has_trim = B_TRUE; - - /* - * Disable secure TRIM on file based vdevs. There is no way to - * request this behavior from the underlying filesystem. - */ - vd->vdev_has_securetrim = B_FALSE; - - /* - * We must have a pathname, and it must be absolute. - */ - if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { - vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; - return (SET_ERROR(EINVAL)); - } - - /* - * Reopen the device if it's not currently open. Otherwise, - * just update the physical size of the device. - */ - if (vd->vdev_tsd != NULL) { - ASSERT(vd->vdev_reopening); - vf = vd->vdev_tsd; - goto skip_open; - } - - vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP); - - /* - * We always open the files from the root of the global zone, even if - * we're in a local zone. If the user has gotten to this point, the - * administrator has already decided that the pool should be available - * to local zone users, so the underlying devices should be as well. - */ - ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/'); - error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, - spa_mode(vd->vdev_spa) | FOFFMAX, 0, &vp, 0, 0, rootdir, -1); - - if (error) { - vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; - return (error); - } - - vf->vf_vnode = vp; - -#ifdef _KERNEL - /* - * Make sure it's a regular file. - */ - if (vp->v_type != VREG) { - vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; - return (SET_ERROR(ENODEV)); - } -#endif - -skip_open: - /* - * Determine the physical size of the file. - */ - vattr.va_mask = AT_SIZE; - error = VOP_GETATTR(vf->vf_vnode, &vattr, 0, kcred, NULL); - if (error) { - vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; - return (error); - } - - *max_psize = *psize = vattr.va_size; - *ashift = SPA_MINBLOCKSHIFT; - - return (0); -} - -static void -vdev_file_close(vdev_t *vd) -{ - vdev_file_t *vf = vd->vdev_tsd; - - if (vd->vdev_reopening || vf == NULL) - return; - - if (vf->vf_vnode != NULL) { - (void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred, NULL); - (void) VOP_CLOSE(vf->vf_vnode, spa_mode(vd->vdev_spa), 1, 0, - kcred, NULL); - } - - vd->vdev_delayed_close = B_FALSE; - kmem_free(vf, sizeof (vdev_file_t)); - vd->vdev_tsd = NULL; -} - -static void -vdev_file_io_strategy(void *arg) -{ - zio_t *zio = (zio_t *)arg; - vdev_t *vd = zio->io_vd; - vdev_file_t *vf = vd->vdev_tsd; - ssize_t resid; - void *buf; - - if (zio->io_type == ZIO_TYPE_READ) - buf = abd_borrow_buf(zio->io_abd, zio->io_size); - else - buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size); - - zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ? 
- UIO_READ : UIO_WRITE, vf->vf_vnode, buf, zio->io_size, - zio->io_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); - - if (zio->io_type == ZIO_TYPE_READ) - abd_return_buf_copy(zio->io_abd, buf, zio->io_size); - else - abd_return_buf(zio->io_abd, buf, zio->io_size); - - if (resid != 0 && zio->io_error == 0) - zio->io_error = SET_ERROR(ENOSPC); - - zio_delay_interrupt(zio); -} - -static void -vdev_file_io_fsync(void *arg) -{ - zio_t *zio = (zio_t *)arg; - vdev_file_t *vf = zio->io_vd->vdev_tsd; - - zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC, kcred, NULL); - - zio_interrupt(zio); -} - -static void -vdev_file_io_start(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - vdev_file_t *vf = vd->vdev_tsd; - - if (zio->io_type == ZIO_TYPE_IOCTL) { - /* XXPOLICY */ - if (!vdev_readable(vd)) { - zio->io_error = SET_ERROR(ENXIO); - zio_interrupt(zio); - return; - } - - switch (zio->io_cmd) { - case DKIOCFLUSHWRITECACHE: - - if (zfs_nocacheflush) - break; - - /* - * We cannot safely call vfs_fsync() when PF_FSTRANS - * is set in the current context. Filesystems like - * XFS include sanity checks to verify it is not - * already set, see xfs_vm_writepage(). Therefore - * the sync must be dispatched to a different context. - */ - if (__spl_pf_fstrans_check()) { - VERIFY3U(taskq_dispatch(vdev_file_taskq, - vdev_file_io_fsync, zio, TQ_SLEEP), !=, - TASKQID_INVALID); - return; - } - - zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC, - kcred, NULL); - break; - default: - zio->io_error = SET_ERROR(ENOTSUP); - } - - zio_execute(zio); - return; - } else if (zio->io_type == ZIO_TYPE_TRIM) { - struct flock flck; - - ASSERT3U(zio->io_size, !=, 0); - bzero(&flck, sizeof (flck)); - flck.l_type = F_FREESP; - flck.l_start = zio->io_offset; - flck.l_len = zio->io_size; - flck.l_whence = SEEK_SET; - - zio->io_error = VOP_SPACE(vf->vf_vnode, F_FREESP, &flck, - 0, 0, kcred, NULL); - - zio_execute(zio); - return; - } - - zio->io_target_timestamp = zio_handle_io_delay(zio); - - VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio, - TQ_SLEEP), !=, TASKQID_INVALID); -} - -/* ARGSUSED */ -static void -vdev_file_io_done(zio_t *zio) -{ -} - -vdev_ops_t vdev_file_ops = { - .vdev_op_open = vdev_file_open, - .vdev_op_close = vdev_file_close, - .vdev_op_asize = vdev_default_asize, - .vdev_op_io_start = vdev_file_io_start, - .vdev_op_io_done = vdev_file_io_done, - .vdev_op_state_change = NULL, - .vdev_op_need_resilver = NULL, - .vdev_op_hold = vdev_file_hold, - .vdev_op_rele = vdev_file_rele, - .vdev_op_remap = NULL, - .vdev_op_xlate = vdev_default_xlate, - .vdev_op_type = VDEV_TYPE_FILE, /* name of this vdev type */ - .vdev_op_leaf = B_TRUE /* leaf vdev */ -}; - -void -vdev_file_init(void) -{ - vdev_file_taskq = taskq_create("z_vdev_file", MAX(boot_ncpus, 16), - minclsyspri, boot_ncpus, INT_MAX, TASKQ_DYNAMIC); - - VERIFY(vdev_file_taskq); -} - -void -vdev_file_fini(void) -{ - taskq_destroy(vdev_file_taskq); -} - -/* - * From userland we access disks just like files. 
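- *
- * In non-kernel builds the vdev_disk_ops table below simply reuses the
- * vdev_file callbacks, so block devices take the file vdev code path.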
- */ -#ifndef _KERNEL - -vdev_ops_t vdev_disk_ops = { - .vdev_op_open = vdev_file_open, - .vdev_op_close = vdev_file_close, - .vdev_op_asize = vdev_default_asize, - .vdev_op_io_start = vdev_file_io_start, - .vdev_op_io_done = vdev_file_io_done, - .vdev_op_state_change = NULL, - .vdev_op_need_resilver = NULL, - .vdev_op_hold = vdev_file_hold, - .vdev_op_rele = vdev_file_rele, - .vdev_op_remap = NULL, - .vdev_op_xlate = vdev_default_xlate, - .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ - .vdev_op_leaf = B_TRUE /* leaf vdev */ -}; - -#endif diff --git a/module/zfs/zfs_acl.c b/module/zfs/zfs_acl.c deleted file mode 100644 index 26af91e27..000000000 --- a/module/zfs/zfs_acl.c +++ /dev/null @@ -1,2816 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. - */ - - -#include <sys/types.h> -#include <sys/param.h> -#include <sys/time.h> -#include <sys/sysmacros.h> -#include <sys/vfs.h> -#include <sys/vnode.h> -#include <sys/sid.h> -#include <sys/file.h> -#include <sys/stat.h> -#include <sys/kmem.h> -#include <sys/cmn_err.h> -#include <sys/errno.h> -#include <sys/sdt.h> -#include <sys/fs/zfs.h> -#include <sys/mode.h> -#include <sys/policy.h> -#include <sys/zfs_znode.h> -#include <sys/zfs_fuid.h> -#include <sys/zfs_acl.h> -#include <sys/zfs_dir.h> -#include <sys/zfs_vfsops.h> -#include <sys/dmu.h> -#include <sys/dnode.h> -#include <sys/zap.h> -#include <sys/sa.h> -#include <sys/trace_acl.h> -#include <sys/zpl.h> - -#define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE -#define DENY ACE_ACCESS_DENIED_ACE_TYPE -#define MAX_ACE_TYPE ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE -#define MIN_ACE_TYPE ALLOW - -#define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP) -#define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \ - ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE) -#define EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \ - ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) -#define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \ - ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) - -#define ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \ - ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \ - ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \ - ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE) - -#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS) -#define WRITE_MASK_ATTRS (ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| \ - ACE_DELETE|ACE_DELETE_CHILD) -#define WRITE_MASK (WRITE_MASK_DATA|WRITE_MASK_ATTRS) - -#define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ - 
ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) - -#define OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ - ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) - -#define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \ - ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE|ACE_INHERITED_ACE) - -#define RESTRICTED_CLEAR (ACE_WRITE_ACL|ACE_WRITE_OWNER) - -#define V4_ACL_WIDE_FLAGS (ZFS_ACL_AUTO_INHERIT|ZFS_ACL_DEFAULTED|\ - ZFS_ACL_PROTECTED) - -#define ZFS_ACL_WIDE_FLAGS (V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\ - ZFS_ACL_OBJ_ACE) - -#define ALL_MODE_EXECS (S_IXUSR | S_IXGRP | S_IXOTH) - -#define IDMAP_WK_CREATOR_OWNER_UID 2147483648U - -static uint16_t -zfs_ace_v0_get_type(void *acep) -{ - return (((zfs_oldace_t *)acep)->z_type); -} - -static uint16_t -zfs_ace_v0_get_flags(void *acep) -{ - return (((zfs_oldace_t *)acep)->z_flags); -} - -static uint32_t -zfs_ace_v0_get_mask(void *acep) -{ - return (((zfs_oldace_t *)acep)->z_access_mask); -} - -static uint64_t -zfs_ace_v0_get_who(void *acep) -{ - return (((zfs_oldace_t *)acep)->z_fuid); -} - -static void -zfs_ace_v0_set_type(void *acep, uint16_t type) -{ - ((zfs_oldace_t *)acep)->z_type = type; -} - -static void -zfs_ace_v0_set_flags(void *acep, uint16_t flags) -{ - ((zfs_oldace_t *)acep)->z_flags = flags; -} - -static void -zfs_ace_v0_set_mask(void *acep, uint32_t mask) -{ - ((zfs_oldace_t *)acep)->z_access_mask = mask; -} - -static void -zfs_ace_v0_set_who(void *acep, uint64_t who) -{ - ((zfs_oldace_t *)acep)->z_fuid = who; -} - -/*ARGSUSED*/ -static size_t -zfs_ace_v0_size(void *acep) -{ - return (sizeof (zfs_oldace_t)); -} - -static size_t -zfs_ace_v0_abstract_size(void) -{ - return (sizeof (zfs_oldace_t)); -} - -static int -zfs_ace_v0_mask_off(void) -{ - return (offsetof(zfs_oldace_t, z_access_mask)); -} - -/*ARGSUSED*/ -static int -zfs_ace_v0_data(void *acep, void **datap) -{ - *datap = NULL; - return (0); -} - -static acl_ops_t zfs_acl_v0_ops = { - .ace_mask_get = zfs_ace_v0_get_mask, - .ace_mask_set = zfs_ace_v0_set_mask, - .ace_flags_get = zfs_ace_v0_get_flags, - .ace_flags_set = zfs_ace_v0_set_flags, - .ace_type_get = zfs_ace_v0_get_type, - .ace_type_set = zfs_ace_v0_set_type, - .ace_who_get = zfs_ace_v0_get_who, - .ace_who_set = zfs_ace_v0_set_who, - .ace_size = zfs_ace_v0_size, - .ace_abstract_size = zfs_ace_v0_abstract_size, - .ace_mask_off = zfs_ace_v0_mask_off, - .ace_data = zfs_ace_v0_data -}; - -static uint16_t -zfs_ace_fuid_get_type(void *acep) -{ - return (((zfs_ace_hdr_t *)acep)->z_type); -} - -static uint16_t -zfs_ace_fuid_get_flags(void *acep) -{ - return (((zfs_ace_hdr_t *)acep)->z_flags); -} - -static uint32_t -zfs_ace_fuid_get_mask(void *acep) -{ - return (((zfs_ace_hdr_t *)acep)->z_access_mask); -} - -static uint64_t -zfs_ace_fuid_get_who(void *args) -{ - uint16_t entry_type; - zfs_ace_t *acep = args; - - entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; - - if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || - entry_type == ACE_EVERYONE) - return (-1); - return (((zfs_ace_t *)acep)->z_fuid); -} - -static void -zfs_ace_fuid_set_type(void *acep, uint16_t type) -{ - ((zfs_ace_hdr_t *)acep)->z_type = type; -} - -static void -zfs_ace_fuid_set_flags(void *acep, uint16_t flags) -{ - ((zfs_ace_hdr_t *)acep)->z_flags = flags; -} - -static void -zfs_ace_fuid_set_mask(void *acep, uint32_t mask) -{ - ((zfs_ace_hdr_t *)acep)->z_access_mask = mask; -} - -static void -zfs_ace_fuid_set_who(void *arg, uint64_t who) -{ - zfs_ace_t *acep = arg; - - 
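-	/*
-	 * Abstract entries (owner@, group@, everyone@) carry no explicit
-	 * FUID, so there is nothing to store; this mirrors the check in
-	 * zfs_ace_fuid_get_who() above.
-	 */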
uint16_t entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS;
-
-	if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP ||
-	    entry_type == ACE_EVERYONE)
-		return;
-	acep->z_fuid = who;
-}
-
-static size_t
-zfs_ace_fuid_size(void *acep)
-{
-	zfs_ace_hdr_t *zacep = acep;
-	uint16_t entry_type;
-
-	switch (zacep->z_type) {
-	case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
-	case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
-	case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
-	case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
-		return (sizeof (zfs_object_ace_t));
-	case ALLOW:
-	case DENY:
-		entry_type =
-		    (((zfs_ace_hdr_t *)acep)->z_flags & ACE_TYPE_FLAGS);
-		if (entry_type == ACE_OWNER ||
-		    entry_type == OWNING_GROUP ||
-		    entry_type == ACE_EVERYONE)
-			return (sizeof (zfs_ace_hdr_t));
-		/*FALLTHROUGH*/
-	default:
-		return (sizeof (zfs_ace_t));
-	}
-}
-
-static size_t
-zfs_ace_fuid_abstract_size(void)
-{
-	return (sizeof (zfs_ace_hdr_t));
-}
-
-static int
-zfs_ace_fuid_mask_off(void)
-{
-	return (offsetof(zfs_ace_hdr_t, z_access_mask));
-}
-
-static int
-zfs_ace_fuid_data(void *acep, void **datap)
-{
-	zfs_ace_t *zacep = acep;
-	zfs_object_ace_t *zobjp;
-
-	switch (zacep->z_hdr.z_type) {
-	case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
-	case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
-	case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
-	case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
-		zobjp = acep;
-		*datap = (caddr_t)zobjp + sizeof (zfs_ace_t);
-		return (sizeof (zfs_object_ace_t) - sizeof (zfs_ace_t));
-	default:
-		*datap = NULL;
-		return (0);
-	}
-}
-
-static acl_ops_t zfs_acl_fuid_ops = {
-	.ace_mask_get	= zfs_ace_fuid_get_mask,
-	.ace_mask_set	= zfs_ace_fuid_set_mask,
-	.ace_flags_get	= zfs_ace_fuid_get_flags,
-	.ace_flags_set	= zfs_ace_fuid_set_flags,
-	.ace_type_get	= zfs_ace_fuid_get_type,
-	.ace_type_set	= zfs_ace_fuid_set_type,
-	.ace_who_get	= zfs_ace_fuid_get_who,
-	.ace_who_set	= zfs_ace_fuid_set_who,
-	.ace_size	= zfs_ace_fuid_size,
-	.ace_abstract_size	= zfs_ace_fuid_abstract_size,
-	.ace_mask_off	= zfs_ace_fuid_mask_off,
-	.ace_data	= zfs_ace_fuid_data
-};
-
-/*
- * The following three functions are provided for compatibility with
- * older ZPL versions in order to determine if the file used to have
- * an external ACL and what version of ACL previously existed on the
- * file.  Would really be nice to not need this, sigh.
- */
-uint64_t
-zfs_external_acl(znode_t *zp)
-{
-	zfs_acl_phys_t acl_phys;
-	int error;
-
-	if (zp->z_is_sa)
-		return (0);
-
-	/*
-	 * Need to deal with a potential race where zfs_sa_upgrade could
-	 * cause z_is_sa to change.
-	 *
-	 * If the lookup fails then the state of z_is_sa should have
-	 * changed.
-	 */
-
-	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(ZTOZSB(zp)),
-	    &acl_phys, sizeof (acl_phys))) == 0)
-		return (acl_phys.z_acl_extern_obj);
-	else {
-		/*
-		 * After upgrade, the SA_ZPL_ZNODE_ACL should have been
-		 * removed.
-		 */
-		VERIFY(zp->z_is_sa && error == ENOENT);
-		return (0);
-	}
-}
-
-/*
- * Determine size of ACL in bytes
- *
- * This is more complicated than it should be since we have to deal
- * with old external ACLs.
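- *
- * (For ZFS_ACL_VERSION_INITIAL the on-disk z_acl_size field actually
- * holds an ACE count, so the byte size must be derived with
- * ZFS_ACL_SIZE(); later versions store the byte size directly.)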
- */ -static int -zfs_acl_znode_info(znode_t *zp, int *aclsize, int *aclcount, - zfs_acl_phys_t *aclphys) -{ - zfsvfs_t *zfsvfs = ZTOZSB(zp); - uint64_t acl_count; - int size; - int error; - - ASSERT(MUTEX_HELD(&zp->z_acl_lock)); - if (zp->z_is_sa) { - if ((error = sa_size(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zfsvfs), - &size)) != 0) - return (error); - *aclsize = size; - if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_COUNT(zfsvfs), - &acl_count, sizeof (acl_count))) != 0) - return (error); - *aclcount = acl_count; - } else { - if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs), - aclphys, sizeof (*aclphys))) != 0) - return (error); - - if (aclphys->z_acl_version == ZFS_ACL_VERSION_INITIAL) { - *aclsize = ZFS_ACL_SIZE(aclphys->z_acl_size); - *aclcount = aclphys->z_acl_size; - } else { - *aclsize = aclphys->z_acl_size; - *aclcount = aclphys->z_acl_count; - } - } - return (0); -} - -int -zfs_znode_acl_version(znode_t *zp) -{ - zfs_acl_phys_t acl_phys; - - if (zp->z_is_sa) - return (ZFS_ACL_VERSION_FUID); - else { - int error; - - /* - * Need to deal with a potential - * race where zfs_sa_upgrade could cause - * z_isa_sa to change. - * - * If the lookup fails then the state of z_is_sa should have - * changed. - */ - if ((error = sa_lookup(zp->z_sa_hdl, - SA_ZPL_ZNODE_ACL(ZTOZSB(zp)), - &acl_phys, sizeof (acl_phys))) == 0) - return (acl_phys.z_acl_version); - else { - /* - * After upgrade SA_ZPL_ZNODE_ACL should have - * been removed. - */ - VERIFY(zp->z_is_sa && error == ENOENT); - return (ZFS_ACL_VERSION_FUID); - } - } -} - -static int -zfs_acl_version(int version) -{ - if (version < ZPL_VERSION_FUID) - return (ZFS_ACL_VERSION_INITIAL); - else - return (ZFS_ACL_VERSION_FUID); -} - -static int -zfs_acl_version_zp(znode_t *zp) -{ - return (zfs_acl_version(ZTOZSB(zp)->z_version)); -} - -zfs_acl_t * -zfs_acl_alloc(int vers) -{ - zfs_acl_t *aclp; - - aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP); - list_create(&aclp->z_acl, sizeof (zfs_acl_node_t), - offsetof(zfs_acl_node_t, z_next)); - aclp->z_version = vers; - if (vers == ZFS_ACL_VERSION_FUID) - aclp->z_ops = &zfs_acl_fuid_ops; - else - aclp->z_ops = &zfs_acl_v0_ops; - return (aclp); -} - -zfs_acl_node_t * -zfs_acl_node_alloc(size_t bytes) -{ - zfs_acl_node_t *aclnode; - - aclnode = kmem_zalloc(sizeof (zfs_acl_node_t), KM_SLEEP); - if (bytes) { - aclnode->z_acldata = kmem_alloc(bytes, KM_SLEEP); - aclnode->z_allocdata = aclnode->z_acldata; - aclnode->z_allocsize = bytes; - aclnode->z_size = bytes; - } - - return (aclnode); -} - -static void -zfs_acl_node_free(zfs_acl_node_t *aclnode) -{ - if (aclnode->z_allocsize) - kmem_free(aclnode->z_allocdata, aclnode->z_allocsize); - kmem_free(aclnode, sizeof (zfs_acl_node_t)); -} - -static void -zfs_acl_release_nodes(zfs_acl_t *aclp) -{ - zfs_acl_node_t *aclnode; - - while ((aclnode = list_head(&aclp->z_acl))) { - list_remove(&aclp->z_acl, aclnode); - zfs_acl_node_free(aclnode); - } - aclp->z_acl_count = 0; - aclp->z_acl_bytes = 0; -} - -void -zfs_acl_free(zfs_acl_t *aclp) -{ - zfs_acl_release_nodes(aclp); - list_destroy(&aclp->z_acl); - kmem_free(aclp, sizeof (zfs_acl_t)); -} - -static boolean_t -zfs_acl_valid_ace_type(uint_t type, uint_t flags) -{ - uint16_t entry_type; - - switch (type) { - case ALLOW: - case DENY: - case ACE_SYSTEM_AUDIT_ACE_TYPE: - case ACE_SYSTEM_ALARM_ACE_TYPE: - entry_type = flags & ACE_TYPE_FLAGS; - return (entry_type == ACE_OWNER || - entry_type == OWNING_GROUP || - entry_type == ACE_EVERYONE || entry_type == 0 || - entry_type == ACE_IDENTIFIER_GROUP); - default: - if 
(type >= MIN_ACE_TYPE && type <= MAX_ACE_TYPE) - return (B_TRUE); - } - return (B_FALSE); -} - -static boolean_t -zfs_ace_valid(umode_t obj_mode, zfs_acl_t *aclp, uint16_t type, uint16_t iflags) -{ - /* - * first check type of entry - */ - - if (!zfs_acl_valid_ace_type(type, iflags)) - return (B_FALSE); - - switch (type) { - case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: - case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: - case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: - case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: - if (aclp->z_version < ZFS_ACL_VERSION_FUID) - return (B_FALSE); - aclp->z_hints |= ZFS_ACL_OBJ_ACE; - } - - /* - * next check inheritance level flags - */ - - if (S_ISDIR(obj_mode) && - (iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) - aclp->z_hints |= ZFS_INHERIT_ACE; - - if (iflags & (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) { - if ((iflags & (ACE_FILE_INHERIT_ACE| - ACE_DIRECTORY_INHERIT_ACE)) == 0) { - return (B_FALSE); - } - } - - return (B_TRUE); -} - -static void * -zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who, - uint32_t *access_mask, uint16_t *iflags, uint16_t *type) -{ - zfs_acl_node_t *aclnode; - - ASSERT(aclp); - - if (start == NULL) { - aclnode = list_head(&aclp->z_acl); - if (aclnode == NULL) - return (NULL); - - aclp->z_next_ace = aclnode->z_acldata; - aclp->z_curr_node = aclnode; - aclnode->z_ace_idx = 0; - } - - aclnode = aclp->z_curr_node; - - if (aclnode == NULL) - return (NULL); - - if (aclnode->z_ace_idx >= aclnode->z_ace_count) { - aclnode = list_next(&aclp->z_acl, aclnode); - if (aclnode == NULL) - return (NULL); - else { - aclp->z_curr_node = aclnode; - aclnode->z_ace_idx = 0; - aclp->z_next_ace = aclnode->z_acldata; - } - } - - if (aclnode->z_ace_idx < aclnode->z_ace_count) { - void *acep = aclp->z_next_ace; - size_t ace_size; - - /* - * Make sure we don't overstep our bounds - */ - ace_size = aclp->z_ops->ace_size(acep); - - if (((caddr_t)acep + ace_size) > - ((caddr_t)aclnode->z_acldata + aclnode->z_size)) { - return (NULL); - } - - *iflags = aclp->z_ops->ace_flags_get(acep); - *type = aclp->z_ops->ace_type_get(acep); - *access_mask = aclp->z_ops->ace_mask_get(acep); - *who = aclp->z_ops->ace_who_get(acep); - aclp->z_next_ace = (caddr_t)aclp->z_next_ace + ace_size; - aclnode->z_ace_idx++; - - return ((void *)acep); - } - return (NULL); -} - -/*ARGSUSED*/ -static uint64_t -zfs_ace_walk(void *datap, uint64_t cookie, int aclcnt, - uint16_t *flags, uint16_t *type, uint32_t *mask) -{ - zfs_acl_t *aclp = datap; - zfs_ace_hdr_t *acep = (zfs_ace_hdr_t *)(uintptr_t)cookie; - uint64_t who; - - acep = zfs_acl_next_ace(aclp, acep, &who, mask, - flags, type); - return ((uint64_t)(uintptr_t)acep); -} - -/* - * Copy ACE to internal ZFS format. - * While processing the ACL each ACE will be validated for correctness. - * ACE FUIDs will be created later. 
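zfs_acl_next_ace() above is a cursor over buffers of variable-size entries: it asks the ops table for each entry's size, advances by that many bytes, and returns NULL rather than stepping past the node's buffer. A standalone sketch of that traversal over a single flat buffer, with an illustrative record layout (rec_t is not a ZFS type):

/*
 * Sketch: cursor over variable-size records with a bounds check,
 * as zfs_acl_next_ace() performs per ACL node. Layout is illustrative.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

typedef struct { uint16_t size; uint16_t type; } rec_t;	/* header + payload */

static const rec_t *
next_rec(const uint8_t *buf, size_t buflen, const rec_t *cur)
{
	const uint8_t *p = (cur == NULL) ? buf :
	    (const uint8_t *)cur + cur->size;

	/* A header must be readable, and the full record must fit. */
	if (p + sizeof (rec_t) > buf + buflen)
		return (NULL);
	const rec_t *r = (const rec_t *)p;
	if (r->size < sizeof (rec_t) || p + r->size > buf + buflen)
		return (NULL);	/* corrupt size: stop, don't overstep */
	return (r);
}

int
main(void)
{
	_Alignas(rec_t) uint8_t buf[64];
	rec_t a = { sizeof (rec_t) + 4, 1 }, b = { sizeof (rec_t), 2 };
	size_t len = a.size + b.size;

	memcpy(buf, &a, sizeof (a));
	memcpy(buf + a.size, &b, sizeof (b));

	for (const rec_t *r = next_rec(buf, len, NULL); r != NULL;
	    r = next_rec(buf, len, r))
		printf("type %u size %u\n", r->type, r->size);
	return (0);
}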
- */ -int -zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, umode_t obj_mode, zfs_acl_t *aclp, - void *datap, zfs_ace_t *z_acl, uint64_t aclcnt, size_t *size, - zfs_fuid_info_t **fuidp, cred_t *cr) -{ - int i; - uint16_t entry_type; - zfs_ace_t *aceptr = z_acl; - ace_t *acep = datap; - zfs_object_ace_t *zobjacep; - ace_object_t *aceobjp; - - for (i = 0; i != aclcnt; i++) { - aceptr->z_hdr.z_access_mask = acep->a_access_mask; - aceptr->z_hdr.z_flags = acep->a_flags; - aceptr->z_hdr.z_type = acep->a_type; - entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS; - if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP && - entry_type != ACE_EVERYONE) { - aceptr->z_fuid = zfs_fuid_create(zfsvfs, acep->a_who, - cr, (entry_type == 0) ? - ZFS_ACE_USER : ZFS_ACE_GROUP, fuidp); - } - - /* - * Make sure ACE is valid - */ - if (zfs_ace_valid(obj_mode, aclp, aceptr->z_hdr.z_type, - aceptr->z_hdr.z_flags) != B_TRUE) - return (SET_ERROR(EINVAL)); - - switch (acep->a_type) { - case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: - case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: - case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: - case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: - zobjacep = (zfs_object_ace_t *)aceptr; - aceobjp = (ace_object_t *)acep; - - bcopy(aceobjp->a_obj_type, zobjacep->z_object_type, - sizeof (aceobjp->a_obj_type)); - bcopy(aceobjp->a_inherit_obj_type, - zobjacep->z_inherit_type, - sizeof (aceobjp->a_inherit_obj_type)); - acep = (ace_t *)((caddr_t)acep + sizeof (ace_object_t)); - break; - default: - acep = (ace_t *)((caddr_t)acep + sizeof (ace_t)); - } - - aceptr = (zfs_ace_t *)((caddr_t)aceptr + - aclp->z_ops->ace_size(aceptr)); - } - - *size = (caddr_t)aceptr - (caddr_t)z_acl; - - return (0); -} - -/* - * Copy ZFS ACEs to fixed size ace_t layout - */ -static void -zfs_copy_fuid_2_ace(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, cred_t *cr, - void *datap, int filter) -{ - uint64_t who; - uint32_t access_mask; - uint16_t iflags, type; - zfs_ace_hdr_t *zacep = NULL; - ace_t *acep = datap; - ace_object_t *objacep; - zfs_object_ace_t *zobjacep; - size_t ace_size; - uint16_t entry_type; - - while ((zacep = zfs_acl_next_ace(aclp, zacep, - &who, &access_mask, &iflags, &type))) { - - switch (type) { - case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: - case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: - case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: - case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: - if (filter) { - continue; - } - zobjacep = (zfs_object_ace_t *)zacep; - objacep = (ace_object_t *)acep; - bcopy(zobjacep->z_object_type, - objacep->a_obj_type, - sizeof (zobjacep->z_object_type)); - bcopy(zobjacep->z_inherit_type, - objacep->a_inherit_obj_type, - sizeof (zobjacep->z_inherit_type)); - ace_size = sizeof (ace_object_t); - break; - default: - ace_size = sizeof (ace_t); - break; - } - - entry_type = (iflags & ACE_TYPE_FLAGS); - if ((entry_type != ACE_OWNER && - entry_type != OWNING_GROUP && - entry_type != ACE_EVERYONE)) { - acep->a_who = zfs_fuid_map_id(zfsvfs, who, - cr, (entry_type & ACE_IDENTIFIER_GROUP) ? 
- ZFS_ACE_GROUP : ZFS_ACE_USER); - } else { - acep->a_who = (uid_t)(int64_t)who; - } - acep->a_access_mask = access_mask; - acep->a_flags = iflags; - acep->a_type = type; - acep = (ace_t *)((caddr_t)acep + ace_size); - } -} - -static int -zfs_copy_ace_2_oldace(umode_t obj_mode, zfs_acl_t *aclp, ace_t *acep, - zfs_oldace_t *z_acl, int aclcnt, size_t *size) -{ - int i; - zfs_oldace_t *aceptr = z_acl; - - for (i = 0; i != aclcnt; i++, aceptr++) { - aceptr->z_access_mask = acep[i].a_access_mask; - aceptr->z_type = acep[i].a_type; - aceptr->z_flags = acep[i].a_flags; - aceptr->z_fuid = acep[i].a_who; - /* - * Make sure ACE is valid - */ - if (zfs_ace_valid(obj_mode, aclp, aceptr->z_type, - aceptr->z_flags) != B_TRUE) - return (SET_ERROR(EINVAL)); - } - *size = (caddr_t)aceptr - (caddr_t)z_acl; - return (0); -} - -/* - * convert old ACL format to new - */ -void -zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp, cred_t *cr) -{ - zfs_oldace_t *oldaclp; - int i; - uint16_t type, iflags; - uint32_t access_mask; - uint64_t who; - void *cookie = NULL; - zfs_acl_node_t *newaclnode; - - ASSERT(aclp->z_version == ZFS_ACL_VERSION_INITIAL); - /* - * First create the ACE in a contiguous piece of memory - * for zfs_copy_ace_2_fuid(). - * - * We only convert an ACL once, so this won't happen - * every time. - */ - oldaclp = kmem_alloc(sizeof (zfs_oldace_t) * aclp->z_acl_count, - KM_SLEEP); - i = 0; - while ((cookie = zfs_acl_next_ace(aclp, cookie, &who, - &access_mask, &iflags, &type))) { - oldaclp[i].z_flags = iflags; - oldaclp[i].z_type = type; - oldaclp[i].z_fuid = who; - oldaclp[i++].z_access_mask = access_mask; - } - - newaclnode = zfs_acl_node_alloc(aclp->z_acl_count * - sizeof (zfs_object_ace_t)); - aclp->z_ops = &zfs_acl_fuid_ops; - VERIFY(zfs_copy_ace_2_fuid(ZTOZSB(zp), ZTOI(zp)->i_mode, - aclp, oldaclp, newaclnode->z_acldata, aclp->z_acl_count, - &newaclnode->z_size, NULL, cr) == 0); - newaclnode->z_ace_count = aclp->z_acl_count; - aclp->z_version = ZFS_ACL_VERSION; - kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t)); - - /* - * Release all previous ACL nodes - */ - - zfs_acl_release_nodes(aclp); - - list_insert_head(&aclp->z_acl, newaclnode); - - aclp->z_acl_bytes = newaclnode->z_size; - aclp->z_acl_count = newaclnode->z_ace_count; - -} - -/* - * Convert unix access mask to v4 access mask - */ -static uint32_t -zfs_unix_to_v4(uint32_t access_mask) -{ - uint32_t new_mask = 0; - - if (access_mask & S_IXOTH) - new_mask |= ACE_EXECUTE; - if (access_mask & S_IWOTH) - new_mask |= ACE_WRITE_DATA; - if (access_mask & S_IROTH) - new_mask |= ACE_READ_DATA; - return (new_mask); -} - -static void -zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask, - uint16_t access_type, uint64_t fuid, uint16_t entry_type) -{ - uint16_t type = entry_type & ACE_TYPE_FLAGS; - - aclp->z_ops->ace_mask_set(acep, access_mask); - aclp->z_ops->ace_type_set(acep, access_type); - aclp->z_ops->ace_flags_set(acep, entry_type); - if ((type != ACE_OWNER && type != OWNING_GROUP && - type != ACE_EVERYONE)) - aclp->z_ops->ace_who_set(acep, fuid); -} - -/* - * Determine mode of file based on ACL. 
- * Also, create FUIDs for any User/Group ACEs - */ -uint64_t -zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp, - uint64_t *pflags, uint64_t fuid, uint64_t fgid) -{ - int entry_type; - mode_t mode; - mode_t seen = 0; - zfs_ace_hdr_t *acep = NULL; - uint64_t who; - uint16_t iflags, type; - uint32_t access_mask; - boolean_t an_exec_denied = B_FALSE; - - mode = (fmode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX)); - - while ((acep = zfs_acl_next_ace(aclp, acep, &who, - &access_mask, &iflags, &type))) { - - if (!zfs_acl_valid_ace_type(type, iflags)) - continue; - - entry_type = (iflags & ACE_TYPE_FLAGS); - - /* - * Skip over owner@, group@ or everyone@ inherit only ACEs - */ - if ((iflags & ACE_INHERIT_ONLY_ACE) && - (entry_type == ACE_OWNER || entry_type == ACE_EVERYONE || - entry_type == OWNING_GROUP)) - continue; - - if (entry_type == ACE_OWNER || (entry_type == 0 && - who == fuid)) { - if ((access_mask & ACE_READ_DATA) && - (!(seen & S_IRUSR))) { - seen |= S_IRUSR; - if (type == ALLOW) { - mode |= S_IRUSR; - } - } - if ((access_mask & ACE_WRITE_DATA) && - (!(seen & S_IWUSR))) { - seen |= S_IWUSR; - if (type == ALLOW) { - mode |= S_IWUSR; - } - } - if ((access_mask & ACE_EXECUTE) && - (!(seen & S_IXUSR))) { - seen |= S_IXUSR; - if (type == ALLOW) { - mode |= S_IXUSR; - } - } - } else if (entry_type == OWNING_GROUP || - (entry_type == ACE_IDENTIFIER_GROUP && who == fgid)) { - if ((access_mask & ACE_READ_DATA) && - (!(seen & S_IRGRP))) { - seen |= S_IRGRP; - if (type == ALLOW) { - mode |= S_IRGRP; - } - } - if ((access_mask & ACE_WRITE_DATA) && - (!(seen & S_IWGRP))) { - seen |= S_IWGRP; - if (type == ALLOW) { - mode |= S_IWGRP; - } - } - if ((access_mask & ACE_EXECUTE) && - (!(seen & S_IXGRP))) { - seen |= S_IXGRP; - if (type == ALLOW) { - mode |= S_IXGRP; - } - } - } else if (entry_type == ACE_EVERYONE) { - if ((access_mask & ACE_READ_DATA)) { - if (!(seen & S_IRUSR)) { - seen |= S_IRUSR; - if (type == ALLOW) { - mode |= S_IRUSR; - } - } - if (!(seen & S_IRGRP)) { - seen |= S_IRGRP; - if (type == ALLOW) { - mode |= S_IRGRP; - } - } - if (!(seen & S_IROTH)) { - seen |= S_IROTH; - if (type == ALLOW) { - mode |= S_IROTH; - } - } - } - if ((access_mask & ACE_WRITE_DATA)) { - if (!(seen & S_IWUSR)) { - seen |= S_IWUSR; - if (type == ALLOW) { - mode |= S_IWUSR; - } - } - if (!(seen & S_IWGRP)) { - seen |= S_IWGRP; - if (type == ALLOW) { - mode |= S_IWGRP; - } - } - if (!(seen & S_IWOTH)) { - seen |= S_IWOTH; - if (type == ALLOW) { - mode |= S_IWOTH; - } - } - } - if ((access_mask & ACE_EXECUTE)) { - if (!(seen & S_IXUSR)) { - seen |= S_IXUSR; - if (type == ALLOW) { - mode |= S_IXUSR; - } - } - if (!(seen & S_IXGRP)) { - seen |= S_IXGRP; - if (type == ALLOW) { - mode |= S_IXGRP; - } - } - if (!(seen & S_IXOTH)) { - seen |= S_IXOTH; - if (type == ALLOW) { - mode |= S_IXOTH; - } - } - } - } else { - /* - * Only care if this IDENTIFIER_GROUP or - * USER ACE denies execute access to someone, - * mode is not affected - */ - if ((access_mask & ACE_EXECUTE) && type == DENY) - an_exec_denied = B_TRUE; - } - } - - /* - * Failure to allow is effectively a deny, so execute permission - * is denied if it was never mentioned or if we explicitly - * weren't allowed it. - */ - if (!an_exec_denied && - ((seen & ALL_MODE_EXECS) != ALL_MODE_EXECS || - (mode & ALL_MODE_EXECS) != ALL_MODE_EXECS)) - an_exec_denied = B_TRUE; - - if (an_exec_denied) - *pflags &= ~ZFS_NO_EXECS_DENIED; - else - *pflags |= ZFS_NO_EXECS_DENIED; - - return (mode); -} - -/* - * Read an external acl object. 
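zfs_mode_compute() above hands each rwx bit to the first ACE that mentions it; the seen mask records bits already decided, so a later ALLOW cannot undo an earlier DENY. A compact sketch of that first-match-wins rule for the owner bits alone (the types and bit values are simplified stand-ins, not the ZFS constants):

/*
 * Sketch: first ACE to mention a permission decides it, as in
 * zfs_mode_compute(). ALLOW/DENY and bit values are simplified.
 */
#include <stdio.h>

enum { ALLOW, DENY };
#define	R	04
#define	W	02
#define	X	01

struct ace { int type; int mask; };

static int
owner_bits(const struct ace *aces, int n)
{
	int mode = 0, seen = 0;

	for (int i = 0; i < n; i++) {
		for (int bit = R; bit >= X; bit >>= 1) {
			if ((aces[i].mask & bit) && !(seen & bit)) {
				seen |= bit;	/* decided; later ACEs can't undo */
				if (aces[i].type == ALLOW)
					mode |= bit;
			}
		}
	}
	return (mode);
}

int
main(void)
{
	struct ace aces[] = {
		{ DENY, W },		/* write decided first: denied */
		{ ALLOW, R | W | X },	/* W already seen, only R and X land */
	};
	printf("0%o\n", owner_bits(aces, 2));	/* prints 05 */
	return (0);
}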
If the intent is to modify, always - * create a new acl and leave any cached acl in place. - */ -int -zfs_acl_node_read(struct znode *zp, boolean_t have_lock, zfs_acl_t **aclpp, - boolean_t will_modify) -{ - zfs_acl_t *aclp; - int aclsize = 0; - int acl_count = 0; - zfs_acl_node_t *aclnode; - zfs_acl_phys_t znode_acl; - int version; - int error; - boolean_t drop_lock = B_FALSE; - - ASSERT(MUTEX_HELD(&zp->z_acl_lock)); - - if (zp->z_acl_cached && !will_modify) { - *aclpp = zp->z_acl_cached; - return (0); - } - - /* - * close race where znode could be upgrade while trying to - * read the znode attributes. - * - * But this could only happen if the file isn't already an SA - * znode - */ - if (!zp->z_is_sa && !have_lock) { - mutex_enter(&zp->z_lock); - drop_lock = B_TRUE; - } - version = zfs_znode_acl_version(zp); - - if ((error = zfs_acl_znode_info(zp, &aclsize, - &acl_count, &znode_acl)) != 0) { - goto done; - } - - aclp = zfs_acl_alloc(version); - - aclp->z_acl_count = acl_count; - aclp->z_acl_bytes = aclsize; - - aclnode = zfs_acl_node_alloc(aclsize); - aclnode->z_ace_count = aclp->z_acl_count; - aclnode->z_size = aclsize; - - if (!zp->z_is_sa) { - if (znode_acl.z_acl_extern_obj) { - error = dmu_read(ZTOZSB(zp)->z_os, - znode_acl.z_acl_extern_obj, 0, aclnode->z_size, - aclnode->z_acldata, DMU_READ_PREFETCH); - } else { - bcopy(znode_acl.z_ace_data, aclnode->z_acldata, - aclnode->z_size); - } - } else { - error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_ACES(ZTOZSB(zp)), - aclnode->z_acldata, aclnode->z_size); - } - - if (error != 0) { - zfs_acl_free(aclp); - zfs_acl_node_free(aclnode); - /* convert checksum errors into IO errors */ - if (error == ECKSUM) - error = SET_ERROR(EIO); - goto done; - } - - list_insert_head(&aclp->z_acl, aclnode); - - *aclpp = aclp; - if (!will_modify) - zp->z_acl_cached = aclp; -done: - if (drop_lock) - mutex_exit(&zp->z_lock); - return (error); -} - -/*ARGSUSED*/ -void -zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen, - boolean_t start, void *userdata) -{ - zfs_acl_locator_cb_t *cb = (zfs_acl_locator_cb_t *)userdata; - - if (start) { - cb->cb_acl_node = list_head(&cb->cb_aclp->z_acl); - } else { - cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl, - cb->cb_acl_node); - } - *dataptr = cb->cb_acl_node->z_acldata; - *length = cb->cb_acl_node->z_size; -} - -int -zfs_acl_chown_setattr(znode_t *zp) -{ - int error; - zfs_acl_t *aclp; - - if (ZTOZSB(zp)->z_acl_type == ZFS_ACLTYPE_POSIXACL) - return (0); - - ASSERT(MUTEX_HELD(&zp->z_lock)); - ASSERT(MUTEX_HELD(&zp->z_acl_lock)); - - error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE); - if (error == 0 && aclp->z_acl_count > 0) - zp->z_mode = ZTOI(zp)->i_mode = - zfs_mode_compute(zp->z_mode, aclp, - &zp->z_pflags, KUID_TO_SUID(ZTOI(zp)->i_uid), - KGID_TO_SGID(ZTOI(zp)->i_gid)); - - /* - * Some ZFS implementations (ZEVO) create neither a ZNODE_ACL - * nor a DACL_ACES SA in which case ENOENT is returned from - * zfs_acl_node_read() when the SA can't be located. - * Allow chown/chgrp to succeed in these cases rather than - * returning an error that makes no sense in the context of - * the caller. 
- */ - if (error == ENOENT) - return (0); - - return (error); -} - -static void -acl_trivial_access_masks(mode_t mode, uint32_t *allow0, uint32_t *deny1, - uint32_t *deny2, uint32_t *owner, uint32_t *group, uint32_t *everyone) -{ - *deny1 = *deny2 = *allow0 = *group = 0; - - if (!(mode & S_IRUSR) && (mode & (S_IRGRP|S_IROTH))) - *deny1 |= ACE_READ_DATA; - if (!(mode & S_IWUSR) && (mode & (S_IWGRP|S_IWOTH))) - *deny1 |= ACE_WRITE_DATA; - if (!(mode & S_IXUSR) && (mode & (S_IXGRP|S_IXOTH))) - *deny1 |= ACE_EXECUTE; - - if (!(mode & S_IRGRP) && (mode & S_IROTH)) - *deny2 = ACE_READ_DATA; - if (!(mode & S_IWGRP) && (mode & S_IWOTH)) - *deny2 |= ACE_WRITE_DATA; - if (!(mode & S_IXGRP) && (mode & S_IXOTH)) - *deny2 |= ACE_EXECUTE; - - if ((mode & S_IRUSR) && (!(mode & S_IRGRP) && (mode & S_IROTH))) - *allow0 |= ACE_READ_DATA; - if ((mode & S_IWUSR) && (!(mode & S_IWGRP) && (mode & S_IWOTH))) - *allow0 |= ACE_WRITE_DATA; - if ((mode & S_IXUSR) && (!(mode & S_IXGRP) && (mode & S_IXOTH))) - *allow0 |= ACE_EXECUTE; - - *owner = ACE_WRITE_ATTRIBUTES|ACE_WRITE_OWNER|ACE_WRITE_ACL| - ACE_WRITE_NAMED_ATTRS|ACE_READ_ACL|ACE_READ_ATTRIBUTES| - ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE; - if (mode & S_IRUSR) - *owner |= ACE_READ_DATA; - if (mode & S_IWUSR) - *owner |= ACE_WRITE_DATA|ACE_APPEND_DATA; - if (mode & S_IXUSR) - *owner |= ACE_EXECUTE; - - *group = ACE_READ_ACL|ACE_READ_ATTRIBUTES| ACE_READ_NAMED_ATTRS| - ACE_SYNCHRONIZE; - if (mode & S_IRGRP) - *group |= ACE_READ_DATA; - if (mode & S_IWGRP) - *group |= ACE_WRITE_DATA|ACE_APPEND_DATA; - if (mode & S_IXGRP) - *group |= ACE_EXECUTE; - - *everyone = ACE_READ_ACL|ACE_READ_ATTRIBUTES| ACE_READ_NAMED_ATTRS| - ACE_SYNCHRONIZE; - if (mode & S_IROTH) - *everyone |= ACE_READ_DATA; - if (mode & S_IWOTH) - *everyone |= ACE_WRITE_DATA|ACE_APPEND_DATA; - if (mode & S_IXOTH) - *everyone |= ACE_EXECUTE; -} - -/* - * ace_trivial: - * determine whether an ace_t acl is trivial - * - * Trivialness implies that the acl is composed of only - * owner, group, everyone entries. ACL can't - * have read_acl denied, and write_owner/write_acl/write_attributes - * can only be owner@ entry. - */ -static int -ace_trivial_common(void *acep, int aclcnt, - uint64_t (*walk)(void *, uint64_t, int aclcnt, - uint16_t *, uint16_t *, uint32_t *)) -{ - uint16_t flags; - uint32_t mask; - uint16_t type; - uint64_t cookie = 0; - - while ((cookie = walk(acep, cookie, aclcnt, &flags, &type, &mask))) { - switch (flags & ACE_TYPE_FLAGS) { - case ACE_OWNER: - case ACE_GROUP|ACE_IDENTIFIER_GROUP: - case ACE_EVERYONE: - break; - default: - return (1); - } - - if (flags & (ACE_FILE_INHERIT_ACE| - ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE| - ACE_INHERIT_ONLY_ACE)) - return (1); - - /* - * Special check for some special bits - * - * Don't allow anybody to deny reading basic - * attributes or a files ACL. - */ - if ((mask & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) && - (type == ACE_ACCESS_DENIED_ACE_TYPE)) - return (1); - - /* - * Delete permissions are never set by default - */ - if (mask & (ACE_DELETE|ACE_DELETE_CHILD)) - return (1); - /* - * only allow owner@ to have - * write_acl/write_owner/write_attributes/write_xattr/ - */ - if (type == ACE_ACCESS_ALLOWED_ACE_TYPE && - (!(flags & ACE_OWNER) && (mask & - (ACE_WRITE_OWNER|ACE_WRITE_ACL| ACE_WRITE_ATTRIBUTES| - ACE_WRITE_NAMED_ATTRS)))) - return (1); - - } - - return (0); -} - -/* - * common code for setting ACLs. - * - * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl. 
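ace_trivial_common() above reduces "can this ACL be expressed as a plain mode_t?" to a short list of rejections. A sketch of the same predicate over a simplified ACE array, with stand-in flag and mask constants rather than the real ACE_* values:

/*
 * Sketch of the triviality test in ace_trivial_common(); all constants
 * are illustrative stand-ins.
 */
#include <stdio.h>
#include <stdbool.h>

#define	F_OWNER		0x01	/* owner@ */
#define	F_GROUP		0x02	/* group@ */
#define	F_EVERYONE	0x04	/* everyone@ */
#define	F_INHERIT	0x08	/* any inheritance flag */

#define	M_READ_ACL	0x10
#define	M_DELETE	0x20
#define	M_WRITE_ACL	0x40

enum { T_ALLOW, T_DENY };

struct ace { int flags; int mask; int type; };

static bool
acl_is_trivial(const struct ace *a, int n)
{
	for (int i = 0; i < n; i++) {
		int who = a[i].flags & (F_OWNER | F_GROUP | F_EVERYONE);

		if (who == 0)			/* explicit user/group entry */
			return (false);
		if (a[i].flags & F_INHERIT)	/* inheritable entry */
			return (false);
		if (a[i].type == T_DENY && (a[i].mask & M_READ_ACL))
			return (false);		/* nobody may deny read_acl */
		if (a[i].mask & M_DELETE)	/* delete is never default */
			return (false);
		if (a[i].type == T_ALLOW && who != F_OWNER &&
		    (a[i].mask & M_WRITE_ACL))
			return (false);		/* only owner@ gets write_acl */
	}
	return (true);
}

int
main(void)
{
	struct ace trivial[] = { { F_OWNER, M_WRITE_ACL, T_ALLOW } };
	struct ace fancy[] = { { 0, M_READ_ACL, T_DENY } };

	printf("%d %d\n", acl_is_trivial(trivial, 1),
	    acl_is_trivial(fancy, 1));	/* prints 1 0 */
	return (0);
}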
- * zfs_setacl passes a non-NULL inherit pointer (ihp) to indicate that it's - * already checked the acl and knows whether to inherit. - */ -int -zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx) -{ - int error; - zfsvfs_t *zfsvfs = ZTOZSB(zp); - dmu_object_type_t otype; - zfs_acl_locator_cb_t locate = { 0 }; - uint64_t mode; - sa_bulk_attr_t bulk[5]; - uint64_t ctime[2]; - int count = 0; - zfs_acl_phys_t acl_phys; - - mode = zp->z_mode; - - mode = zfs_mode_compute(mode, aclp, &zp->z_pflags, - KUID_TO_SUID(ZTOI(zp)->i_uid), KGID_TO_SGID(ZTOI(zp)->i_gid)); - - zp->z_mode = ZTOI(zp)->i_mode = mode; - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, - &mode, sizeof (mode)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, - &zp->z_pflags, sizeof (zp->z_pflags)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, - &ctime, sizeof (ctime)); - - if (zp->z_acl_cached) { - zfs_acl_free(zp->z_acl_cached); - zp->z_acl_cached = NULL; - } - - /* - * Upgrade needed? - */ - if (!zfsvfs->z_use_fuids) { - otype = DMU_OT_OLDACL; - } else { - if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) && - (zfsvfs->z_version >= ZPL_VERSION_FUID)) - zfs_acl_xform(zp, aclp, cr); - ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID); - otype = DMU_OT_ACL; - } - - /* - * Arrgh, we have to handle old on disk format - * as well as newer (preferred) SA format. - */ - - if (zp->z_is_sa) { /* the easy case, just update the ACL attribute */ - locate.cb_aclp = aclp; - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_ACES(zfsvfs), - zfs_acl_data_locator, &locate, aclp->z_acl_bytes); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_COUNT(zfsvfs), - NULL, &aclp->z_acl_count, sizeof (uint64_t)); - } else { /* Painful legacy way */ - zfs_acl_node_t *aclnode; - uint64_t off = 0; - uint64_t aoid; - - if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs), - &acl_phys, sizeof (acl_phys))) != 0) - return (error); - - aoid = acl_phys.z_acl_extern_obj; - - if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { - /* - * If ACL was previously external and we are now - * converting to new ACL format then release old - * ACL object and create a new one. - */ - if (aoid && - aclp->z_version != acl_phys.z_acl_version) { - error = dmu_object_free(zfsvfs->z_os, aoid, tx); - if (error) - return (error); - aoid = 0; - } - if (aoid == 0) { - aoid = dmu_object_alloc(zfsvfs->z_os, - otype, aclp->z_acl_bytes, - otype == DMU_OT_ACL ? - DMU_OT_SYSACL : DMU_OT_NONE, - otype == DMU_OT_ACL ? - DN_OLD_MAX_BONUSLEN : 0, tx); - } else { - (void) dmu_object_set_blocksize(zfsvfs->z_os, - aoid, aclp->z_acl_bytes, 0, tx); - } - acl_phys.z_acl_extern_obj = aoid; - for (aclnode = list_head(&aclp->z_acl); aclnode; - aclnode = list_next(&aclp->z_acl, aclnode)) { - if (aclnode->z_ace_count == 0) - continue; - dmu_write(zfsvfs->z_os, aoid, off, - aclnode->z_size, aclnode->z_acldata, tx); - off += aclnode->z_size; - } - } else { - void *start = acl_phys.z_ace_data; - /* - * Migrating back embedded? - */ - if (acl_phys.z_acl_extern_obj) { - error = dmu_object_free(zfsvfs->z_os, - acl_phys.z_acl_extern_obj, tx); - if (error) - return (error); - acl_phys.z_acl_extern_obj = 0; - } - - for (aclnode = list_head(&aclp->z_acl); aclnode; - aclnode = list_next(&aclp->z_acl, aclnode)) { - if (aclnode->z_ace_count == 0) - continue; - bcopy(aclnode->z_acldata, start, - aclnode->z_size); - start = (caddr_t)start + aclnode->z_size; - } - } - /* - * If Old version then swap count/bytes to match old - * layout of znode_acl_phys_t. 
- */ - if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { - acl_phys.z_acl_size = aclp->z_acl_count; - acl_phys.z_acl_count = aclp->z_acl_bytes; - } else { - acl_phys.z_acl_size = aclp->z_acl_bytes; - acl_phys.z_acl_count = aclp->z_acl_count; - } - acl_phys.z_acl_version = aclp->z_version; - - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, - &acl_phys, sizeof (acl_phys)); - } - - /* - * Replace ACL wide bits, but first clear them. - */ - zp->z_pflags &= ~ZFS_ACL_WIDE_FLAGS; - - zp->z_pflags |= aclp->z_hints; - - if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0) - zp->z_pflags |= ZFS_ACL_TRIVIAL; - - zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime); - return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx)); -} - -static void -zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t mode, zfs_acl_t *aclp) -{ - void *acep = NULL; - uint64_t who; - int new_count, new_bytes; - int ace_size; - int entry_type; - uint16_t iflags, type; - uint32_t access_mask; - zfs_acl_node_t *newnode; - size_t abstract_size = aclp->z_ops->ace_abstract_size(); - void *zacep; - uint32_t owner, group, everyone; - uint32_t deny1, deny2, allow0; - - new_count = new_bytes = 0; - - acl_trivial_access_masks((mode_t)mode, &allow0, &deny1, &deny2, - &owner, &group, &everyone); - - newnode = zfs_acl_node_alloc((abstract_size * 6) + aclp->z_acl_bytes); - - zacep = newnode->z_acldata; - if (allow0) { - zfs_set_ace(aclp, zacep, allow0, ALLOW, -1, ACE_OWNER); - zacep = (void *)((uintptr_t)zacep + abstract_size); - new_count++; - new_bytes += abstract_size; - } - if (deny1) { - zfs_set_ace(aclp, zacep, deny1, DENY, -1, ACE_OWNER); - zacep = (void *)((uintptr_t)zacep + abstract_size); - new_count++; - new_bytes += abstract_size; - } - if (deny2) { - zfs_set_ace(aclp, zacep, deny2, DENY, -1, OWNING_GROUP); - zacep = (void *)((uintptr_t)zacep + abstract_size); - new_count++; - new_bytes += abstract_size; - } - - while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, - &iflags, &type))) { - uint16_t inherit_flags; - - entry_type = (iflags & ACE_TYPE_FLAGS); - inherit_flags = (iflags & ALL_INHERIT); - - if ((entry_type == ACE_OWNER || entry_type == ACE_EVERYONE || - (entry_type == OWNING_GROUP)) && - ((inherit_flags & ACE_INHERIT_ONLY_ACE) == 0)) { - continue; - } - - if ((type != ALLOW && type != DENY) || - (inherit_flags & ACE_INHERIT_ONLY_ACE)) { - if (inherit_flags) - aclp->z_hints |= ZFS_INHERIT_ACE; - switch (type) { - case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: - case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: - case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: - case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: - aclp->z_hints |= ZFS_ACL_OBJ_ACE; - break; - } - } else { - - /* - * Limit permissions to be no greater than - * group permissions - */ - if (zfsvfs->z_acl_inherit == ZFS_ACL_RESTRICTED) { - if (!(mode & S_IRGRP)) - access_mask &= ~ACE_READ_DATA; - if (!(mode & S_IWGRP)) - access_mask &= - ~(ACE_WRITE_DATA|ACE_APPEND_DATA); - if (!(mode & S_IXGRP)) - access_mask &= ~ACE_EXECUTE; - access_mask &= - ~(ACE_WRITE_OWNER|ACE_WRITE_ACL| - ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS); - } - } - zfs_set_ace(aclp, zacep, access_mask, type, who, iflags); - ace_size = aclp->z_ops->ace_size(acep); - zacep = (void *)((uintptr_t)zacep + ace_size); - new_count++; - new_bytes += ace_size; - } - zfs_set_ace(aclp, zacep, owner, 0, -1, ACE_OWNER); - zacep = (void *)((uintptr_t)zacep + abstract_size); - zfs_set_ace(aclp, zacep, group, 0, -1, OWNING_GROUP); - zacep = (void *)((uintptr_t)zacep + abstract_size); - zfs_set_ace(aclp, zacep, everyone, 0, -1, 
ACE_EVERYONE); - - new_count += 3; - new_bytes += abstract_size * 3; - zfs_acl_release_nodes(aclp); - aclp->z_acl_count = new_count; - aclp->z_acl_bytes = new_bytes; - newnode->z_ace_count = new_count; - newnode->z_size = new_bytes; - list_insert_tail(&aclp->z_acl, newnode); -} - -void -zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode) -{ - mutex_enter(&zp->z_acl_lock); - mutex_enter(&zp->z_lock); - *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp)); - (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS; - zfs_acl_chmod(ZTOZSB(zp), mode, *aclp); - mutex_exit(&zp->z_lock); - mutex_exit(&zp->z_acl_lock); - ASSERT(*aclp); -} - -/* - * strip off write_owner and write_acl - */ -static void -zfs_restricted_update(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, void *acep) -{ - uint32_t mask = aclp->z_ops->ace_mask_get(acep); - - if ((zfsvfs->z_acl_inherit == ZFS_ACL_RESTRICTED) && - (aclp->z_ops->ace_type_get(acep) == ALLOW)) { - mask &= ~RESTRICTED_CLEAR; - aclp->z_ops->ace_mask_set(acep, mask); - } -} - -/* - * Should ACE be inherited? - */ -static int -zfs_ace_can_use(umode_t obj_mode, uint16_t acep_flags) -{ - int iflags = (acep_flags & 0xf); - - if (S_ISDIR(obj_mode) && (iflags & ACE_DIRECTORY_INHERIT_ACE)) - return (1); - else if (iflags & ACE_FILE_INHERIT_ACE) - return (!(S_ISDIR(obj_mode) && - (iflags & ACE_NO_PROPAGATE_INHERIT_ACE))); - return (0); -} - -/* - * inherit inheritable ACEs from parent - */ -static zfs_acl_t * -zfs_acl_inherit(zfsvfs_t *zfsvfs, umode_t obj_mode, zfs_acl_t *paclp, - uint64_t mode, boolean_t *need_chmod) -{ - void *pacep; - void *acep; - zfs_acl_node_t *aclnode; - zfs_acl_t *aclp = NULL; - uint64_t who; - uint32_t access_mask; - uint16_t iflags, newflags, type; - size_t ace_size; - void *data1, *data2; - size_t data1sz, data2sz; - boolean_t vdir = S_ISDIR(obj_mode); - boolean_t vreg = S_ISREG(obj_mode); - boolean_t passthrough, passthrough_x, noallow; - - passthrough_x = - zfsvfs->z_acl_inherit == ZFS_ACL_PASSTHROUGH_X; - passthrough = passthrough_x || - zfsvfs->z_acl_inherit == ZFS_ACL_PASSTHROUGH; - noallow = - zfsvfs->z_acl_inherit == ZFS_ACL_NOALLOW; - - *need_chmod = B_TRUE; - pacep = NULL; - aclp = zfs_acl_alloc(paclp->z_version); - if (zfsvfs->z_acl_inherit == ZFS_ACL_DISCARD || S_ISLNK(obj_mode)) - return (aclp); - while ((pacep = zfs_acl_next_ace(paclp, pacep, &who, - &access_mask, &iflags, &type))) { - - /* - * don't inherit bogus ACEs - */ - if (!zfs_acl_valid_ace_type(type, iflags)) - continue; - - if (noallow && type == ALLOW) - continue; - - ace_size = aclp->z_ops->ace_size(pacep); - - if (!zfs_ace_can_use(obj_mode, iflags)) - continue; - - /* - * If owner@, group@, or everyone@ inheritable - * then zfs_acl_chmod() isn't needed. 
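zfs_ace_can_use() above answers one question per parent ACE: given the new child's type and the entry's inherit flags, does the entry apply at all? The same decision in isolation, with placeholder flag bits:

/*
 * Sketch of the zfs_ace_can_use() decision: which parent ACEs a new
 * child picks up. Flag bits are placeholders, not the ACE_* values.
 */
#include <stdio.h>
#include <stdbool.h>

#define	FILE_INHERIT	0x1	/* applies to new files */
#define	DIR_INHERIT	0x2	/* applies to new directories */
#define	NO_PROPAGATE	0x4	/* applies one level down, then stops */

static bool
ace_inherited_by(bool child_is_dir, int iflags)
{
	if (child_is_dir && (iflags & DIR_INHERIT))
		return (true);
	if (iflags & FILE_INHERIT) {
		/*
		 * A file always takes it; a directory only carries a
		 * file-inherit entry onward unless NO_PROPAGATE stops it.
		 */
		return (!(child_is_dir && (iflags & NO_PROPAGATE)));
	}
	return (false);
}

int
main(void)
{
	printf("file/FILE_INHERIT: %d\n", ace_inherited_by(false, FILE_INHERIT));
	printf("dir/FILE_INHERIT:  %d\n", ace_inherited_by(true, FILE_INHERIT));
	printf("dir/FILE+NOPROP:   %d\n",
	    ace_inherited_by(true, FILE_INHERIT | NO_PROPAGATE));
	return (0);
}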
- */ - if (passthrough && - ((iflags & (ACE_OWNER|ACE_EVERYONE)) || - ((iflags & OWNING_GROUP) == - OWNING_GROUP)) && (vreg || (vdir && (iflags & - ACE_DIRECTORY_INHERIT_ACE)))) { - *need_chmod = B_FALSE; - } - - if (!vdir && passthrough_x && - ((mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0)) { - access_mask &= ~ACE_EXECUTE; - } - - aclnode = zfs_acl_node_alloc(ace_size); - list_insert_tail(&aclp->z_acl, aclnode); - acep = aclnode->z_acldata; - - zfs_set_ace(aclp, acep, access_mask, type, - who, iflags|ACE_INHERITED_ACE); - - /* - * Copy special opaque data if any - */ - if ((data1sz = paclp->z_ops->ace_data(pacep, &data1)) != 0) { - VERIFY((data2sz = aclp->z_ops->ace_data(acep, - &data2)) == data1sz); - bcopy(data1, data2, data2sz); - } - - aclp->z_acl_count++; - aclnode->z_ace_count++; - aclp->z_acl_bytes += aclnode->z_size; - newflags = aclp->z_ops->ace_flags_get(acep); - - if (vdir) - aclp->z_hints |= ZFS_INHERIT_ACE; - - if ((iflags & ACE_NO_PROPAGATE_INHERIT_ACE) || !vdir) { - newflags &= ~ALL_INHERIT; - aclp->z_ops->ace_flags_set(acep, - newflags|ACE_INHERITED_ACE); - zfs_restricted_update(zfsvfs, aclp, acep); - continue; - } - - ASSERT(vdir); - - /* - * If only FILE_INHERIT is set then turn on - * inherit_only - */ - if ((iflags & (ACE_FILE_INHERIT_ACE | - ACE_DIRECTORY_INHERIT_ACE)) == ACE_FILE_INHERIT_ACE) { - newflags |= ACE_INHERIT_ONLY_ACE; - aclp->z_ops->ace_flags_set(acep, - newflags|ACE_INHERITED_ACE); - } else { - newflags &= ~ACE_INHERIT_ONLY_ACE; - aclp->z_ops->ace_flags_set(acep, - newflags|ACE_INHERITED_ACE); - } - } - return (aclp); -} - -/* - * Create file system object initial permissions - * including inheritable ACEs. - */ -int -zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, - vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids) -{ - int error; - zfsvfs_t *zfsvfs = ZTOZSB(dzp); - zfs_acl_t *paclp; - gid_t gid = vap->va_gid; - boolean_t need_chmod = B_TRUE; - boolean_t inherited = B_FALSE; - - bzero(acl_ids, sizeof (zfs_acl_ids_t)); - acl_ids->z_mode = vap->va_mode; - - if (vsecp) - if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_mode, vsecp, - cr, &acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0) - return (error); - - acl_ids->z_fuid = vap->va_uid; - acl_ids->z_fgid = vap->va_gid; -#ifdef HAVE_KSID - /* - * Determine uid and gid. 
- */ - if ((flag & IS_ROOT_NODE) || zfsvfs->z_replay || - ((flag & IS_XATTR) && (S_ISDIR(vap->va_mode)))) { - acl_ids->z_fuid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_uid, - cr, ZFS_OWNER, &acl_ids->z_fuidp); - acl_ids->z_fgid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, - cr, ZFS_GROUP, &acl_ids->z_fuidp); - gid = vap->va_gid; - } else { - acl_ids->z_fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER, - cr, &acl_ids->z_fuidp); - acl_ids->z_fgid = 0; - if (vap->va_mask & AT_GID) { - acl_ids->z_fgid = zfs_fuid_create(zfsvfs, - (uint64_t)vap->va_gid, - cr, ZFS_GROUP, &acl_ids->z_fuidp); - gid = vap->va_gid; - if (acl_ids->z_fgid != KGID_TO_SGID(ZTOI(dzp)->i_gid) && - !groupmember(vap->va_gid, cr) && - secpolicy_vnode_create_gid(cr) != 0) - acl_ids->z_fgid = 0; - } - if (acl_ids->z_fgid == 0) { - if (dzp->z_mode & S_ISGID) { - char *domain; - uint32_t rid; - - acl_ids->z_fgid = KGID_TO_SGID( - ZTOI(dzp)->i_gid); - gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid, - cr, ZFS_GROUP); - - if (zfsvfs->z_use_fuids && - IS_EPHEMERAL(acl_ids->z_fgid)) { - domain = zfs_fuid_idx_domain( - &zfsvfs->z_fuid_idx, - FUID_INDEX(acl_ids->z_fgid)); - rid = FUID_RID(acl_ids->z_fgid); - zfs_fuid_node_add(&acl_ids->z_fuidp, - domain, rid, - FUID_INDEX(acl_ids->z_fgid), - acl_ids->z_fgid, ZFS_GROUP); - } - } else { - acl_ids->z_fgid = zfs_fuid_create_cred(zfsvfs, - ZFS_GROUP, cr, &acl_ids->z_fuidp); - gid = crgetgid(cr); - } - } - } -#endif /* HAVE_KSID */ - - /* - * If we're creating a directory, and the parent directory has the - * set-GID bit set, set in on the new directory. - * Otherwise, if the user is neither privileged nor a member of the - * file's new group, clear the file's set-GID bit. - */ - - if (!(flag & IS_ROOT_NODE) && (dzp->z_mode & S_ISGID) && - (S_ISDIR(vap->va_mode))) { - acl_ids->z_mode |= S_ISGID; - } else { - if ((acl_ids->z_mode & S_ISGID) && - secpolicy_vnode_setids_setgids(cr, gid) != 0) - acl_ids->z_mode &= ~S_ISGID; - } - - if (acl_ids->z_aclp == NULL) { - mutex_enter(&dzp->z_acl_lock); - mutex_enter(&dzp->z_lock); - if (!(flag & IS_ROOT_NODE) && (S_ISDIR(ZTOI(dzp)->i_mode) && - (dzp->z_pflags & ZFS_INHERIT_ACE)) && - !(dzp->z_pflags & ZFS_XATTR)) { - VERIFY(0 == zfs_acl_node_read(dzp, B_TRUE, - &paclp, B_FALSE)); - acl_ids->z_aclp = zfs_acl_inherit(zfsvfs, - vap->va_mode, paclp, acl_ids->z_mode, &need_chmod); - inherited = B_TRUE; - } else { - acl_ids->z_aclp = - zfs_acl_alloc(zfs_acl_version_zp(dzp)); - acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; - } - mutex_exit(&dzp->z_lock); - mutex_exit(&dzp->z_acl_lock); - if (need_chmod) { - acl_ids->z_aclp->z_hints |= S_ISDIR(vap->va_mode) ? 
- ZFS_ACL_AUTO_INHERIT : 0; - zfs_acl_chmod(zfsvfs, acl_ids->z_mode, acl_ids->z_aclp); - } - } - - if (inherited || vsecp) { - acl_ids->z_mode = zfs_mode_compute(acl_ids->z_mode, - acl_ids->z_aclp, &acl_ids->z_aclp->z_hints, - acl_ids->z_fuid, acl_ids->z_fgid); - if (ace_trivial_common(acl_ids->z_aclp, 0, zfs_ace_walk) == 0) - acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; - } - - return (0); -} - -/* - * Free ACL and fuid_infop, but not the acl_ids structure - */ -void -zfs_acl_ids_free(zfs_acl_ids_t *acl_ids) -{ - if (acl_ids->z_aclp) - zfs_acl_free(acl_ids->z_aclp); - if (acl_ids->z_fuidp) - zfs_fuid_info_free(acl_ids->z_fuidp); - acl_ids->z_aclp = NULL; - acl_ids->z_fuidp = NULL; -} - -boolean_t -zfs_acl_ids_overquota(zfsvfs_t *zv, zfs_acl_ids_t *acl_ids, uint64_t projid) -{ - return (zfs_id_overquota(zv, DMU_USERUSED_OBJECT, acl_ids->z_fuid) || - zfs_id_overquota(zv, DMU_GROUPUSED_OBJECT, acl_ids->z_fgid) || - (projid != ZFS_DEFAULT_PROJID && projid != ZFS_INVALID_PROJID && - zfs_id_overquota(zv, DMU_PROJECTUSED_OBJECT, projid))); -} - -/* - * Retrieve a file's ACL - */ -int -zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) -{ - zfs_acl_t *aclp; - ulong_t mask; - int error; - int count = 0; - int largeace = 0; - - mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT | - VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES); - - if (mask == 0) - return (SET_ERROR(ENOSYS)); - - if ((error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr))) - return (error); - - mutex_enter(&zp->z_acl_lock); - - error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE); - if (error != 0) { - mutex_exit(&zp->z_acl_lock); - return (error); - } - - /* - * Scan ACL to determine number of ACEs - */ - if ((zp->z_pflags & ZFS_ACL_OBJ_ACE) && !(mask & VSA_ACE_ALLTYPES)) { - void *zacep = NULL; - uint64_t who; - uint32_t access_mask; - uint16_t type, iflags; - - while ((zacep = zfs_acl_next_ace(aclp, zacep, - &who, &access_mask, &iflags, &type))) { - switch (type) { - case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: - case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: - case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: - case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: - largeace++; - continue; - default: - count++; - } - } - vsecp->vsa_aclcnt = count; - } else - count = (int)aclp->z_acl_count; - - if (mask & VSA_ACECNT) { - vsecp->vsa_aclcnt = count; - } - - if (mask & VSA_ACE) { - size_t aclsz; - - aclsz = count * sizeof (ace_t) + - sizeof (ace_object_t) * largeace; - - vsecp->vsa_aclentp = kmem_alloc(aclsz, KM_SLEEP); - vsecp->vsa_aclentsz = aclsz; - - if (aclp->z_version == ZFS_ACL_VERSION_FUID) - zfs_copy_fuid_2_ace(ZTOZSB(zp), aclp, cr, - vsecp->vsa_aclentp, !(mask & VSA_ACE_ALLTYPES)); - else { - zfs_acl_node_t *aclnode; - void *start = vsecp->vsa_aclentp; - - for (aclnode = list_head(&aclp->z_acl); aclnode; - aclnode = list_next(&aclp->z_acl, aclnode)) { - bcopy(aclnode->z_acldata, start, - aclnode->z_size); - start = (caddr_t)start + aclnode->z_size; - } - ASSERT((caddr_t)start - (caddr_t)vsecp->vsa_aclentp == - aclp->z_acl_bytes); - } - } - if (mask & VSA_ACE_ACLFLAGS) { - vsecp->vsa_aclflags = 0; - if (zp->z_pflags & ZFS_ACL_DEFAULTED) - vsecp->vsa_aclflags |= ACL_DEFAULTED; - if (zp->z_pflags & ZFS_ACL_PROTECTED) - vsecp->vsa_aclflags |= ACL_PROTECTED; - if (zp->z_pflags & ZFS_ACL_AUTO_INHERIT) - vsecp->vsa_aclflags |= ACL_AUTO_INHERIT; - } - - mutex_exit(&zp->z_acl_lock); - - return (0); -} - -int -zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, umode_t obj_mode, - vsecattr_t *vsecp, cred_t *cr, zfs_fuid_info_t **fuidp, zfs_acl_t **zaclp) -{ - 
zfs_acl_t *aclp; - zfs_acl_node_t *aclnode; - int aclcnt = vsecp->vsa_aclcnt; - int error; - - if (vsecp->vsa_aclcnt > MAX_ACL_ENTRIES || vsecp->vsa_aclcnt <= 0) - return (SET_ERROR(EINVAL)); - - aclp = zfs_acl_alloc(zfs_acl_version(zfsvfs->z_version)); - - aclp->z_hints = 0; - aclnode = zfs_acl_node_alloc(aclcnt * sizeof (zfs_object_ace_t)); - if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { - if ((error = zfs_copy_ace_2_oldace(obj_mode, aclp, - (ace_t *)vsecp->vsa_aclentp, aclnode->z_acldata, - aclcnt, &aclnode->z_size)) != 0) { - zfs_acl_free(aclp); - zfs_acl_node_free(aclnode); - return (error); - } - } else { - if ((error = zfs_copy_ace_2_fuid(zfsvfs, obj_mode, aclp, - vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt, - &aclnode->z_size, fuidp, cr)) != 0) { - zfs_acl_free(aclp); - zfs_acl_node_free(aclnode); - return (error); - } - } - aclp->z_acl_bytes = aclnode->z_size; - aclnode->z_ace_count = aclcnt; - aclp->z_acl_count = aclcnt; - list_insert_head(&aclp->z_acl, aclnode); - - /* - * If flags are being set then add them to z_hints - */ - if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) { - if (vsecp->vsa_aclflags & ACL_PROTECTED) - aclp->z_hints |= ZFS_ACL_PROTECTED; - if (vsecp->vsa_aclflags & ACL_DEFAULTED) - aclp->z_hints |= ZFS_ACL_DEFAULTED; - if (vsecp->vsa_aclflags & ACL_AUTO_INHERIT) - aclp->z_hints |= ZFS_ACL_AUTO_INHERIT; - } - - *zaclp = aclp; - - return (0); -} - -/* - * Set a file's ACL - */ -int -zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) -{ - zfsvfs_t *zfsvfs = ZTOZSB(zp); - zilog_t *zilog = zfsvfs->z_log; - ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT); - dmu_tx_t *tx; - int error; - zfs_acl_t *aclp; - zfs_fuid_info_t *fuidp = NULL; - boolean_t fuid_dirtied; - uint64_t acl_obj; - - if (mask == 0) - return (SET_ERROR(ENOSYS)); - - if (zp->z_pflags & ZFS_IMMUTABLE) - return (SET_ERROR(EPERM)); - - if ((error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr))) - return (error); - - error = zfs_vsec_2_aclp(zfsvfs, ZTOI(zp)->i_mode, vsecp, cr, &fuidp, - &aclp); - if (error) - return (error); - - /* - * If ACL wide flags aren't being set then preserve any - * existing flags. 
- */ - if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) { - aclp->z_hints |= - (zp->z_pflags & V4_ACL_WIDE_FLAGS); - } -top: - mutex_enter(&zp->z_acl_lock); - mutex_enter(&zp->z_lock); - - tx = dmu_tx_create(zfsvfs->z_os); - - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); - - fuid_dirtied = zfsvfs->z_fuid_dirty; - if (fuid_dirtied) - zfs_fuid_txhold(zfsvfs, tx); - - /* - * If old version and ACL won't fit in bonus and we aren't - * upgrading then take out necessary DMU holds - */ - - if ((acl_obj = zfs_external_acl(zp)) != 0) { - if (zfsvfs->z_version >= ZPL_VERSION_FUID && - zfs_znode_acl_version(zp) <= ZFS_ACL_VERSION_INITIAL) { - dmu_tx_hold_free(tx, acl_obj, 0, - DMU_OBJECT_END); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - aclp->z_acl_bytes); - } else { - dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); - } - } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); - } - - zfs_sa_upgrade_txholds(tx, zp); - error = dmu_tx_assign(tx, TXG_NOWAIT); - if (error) { - mutex_exit(&zp->z_acl_lock); - mutex_exit(&zp->z_lock); - - if (error == ERESTART) { - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - dmu_tx_abort(tx); - zfs_acl_free(aclp); - return (error); - } - - error = zfs_aclset_common(zp, aclp, cr, tx); - ASSERT(error == 0); - ASSERT(zp->z_acl_cached == NULL); - zp->z_acl_cached = aclp; - - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); - - zfs_log_acl(zilog, tx, zp, vsecp, fuidp); - - if (fuidp) - zfs_fuid_info_free(fuidp); - dmu_tx_commit(tx); - - mutex_exit(&zp->z_lock); - mutex_exit(&zp->z_acl_lock); - - return (error); -} - -/* - * Check accesses of interest (AoI) against attributes of the dataset - * such as read-only. Returns zero if no AoI conflict with dataset - * attributes, otherwise an appropriate errno is returned. - */ -static int -zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode) -{ - if ((v4_mode & WRITE_MASK) && (zfs_is_readonly(ZTOZSB(zp))) && - (!S_ISDEV(ZTOI(zp)->i_mode) || - (S_ISDEV(ZTOI(zp)->i_mode) && (v4_mode & WRITE_MASK_ATTRS)))) { - return (SET_ERROR(EROFS)); - } - - /* - * Only check for READONLY on non-directories. - */ - if ((v4_mode & WRITE_MASK_DATA) && - ((!S_ISDIR(ZTOI(zp)->i_mode) && - (zp->z_pflags & (ZFS_READONLY | ZFS_IMMUTABLE))) || - (S_ISDIR(ZTOI(zp)->i_mode) && - (zp->z_pflags & ZFS_IMMUTABLE)))) { - return (SET_ERROR(EPERM)); - } - - if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) && - (zp->z_pflags & ZFS_NOUNLINK)) { - return (SET_ERROR(EPERM)); - } - - if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) && - (zp->z_pflags & ZFS_AV_QUARANTINED))) { - return (SET_ERROR(EACCES)); - } - - return (0); -} - -/* - * The primary usage of this function is to loop through all of the - * ACEs in the znode, determining what accesses of interest (AoI) to - * the caller are allowed or denied. The AoI are expressed as bits in - * the working_mode parameter. As each ACE is processed, bits covered - * by that ACE are removed from the working_mode. This removal - * facilitates two things. The first is that when the working mode is - * empty (= 0), we know we've looked at all the AoI. The second is - * that the ACE interpretation rules don't allow a later ACE to undo - * something granted or denied by an earlier ACE. Removing the - * discovered access or denial enforces this rule. At the end of - * processing the ACEs, all AoI that were found to be denied are - * placed into the working_mode, giving the caller a mask of denied - * accesses. 
Returns: - * 0 if all AoI granted - * EACCES if the denied mask is non-zero - * other error if abnormal failure (e.g., IO error) - * - * A secondary usage of the function is to determine if any of the - * AoI are granted. If an ACE grants any access in - * the working_mode, we immediately short circuit out of the function. - * This mode is chosen by setting anyaccess to B_TRUE. The - * working_mode is not a denied access mask upon exit if the function - * is used in this manner. - */ -static int -zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, - boolean_t anyaccess, cred_t *cr) -{ - zfsvfs_t *zfsvfs = ZTOZSB(zp); - zfs_acl_t *aclp; - int error; - uid_t uid = crgetuid(cr); - uint64_t who; - uint16_t type, iflags; - uint16_t entry_type; - uint32_t access_mask; - uint32_t deny_mask = 0; - zfs_ace_hdr_t *acep = NULL; - boolean_t checkit; - uid_t gowner; - uid_t fowner; - - zfs_fuid_map_ids(zp, cr, &fowner, &gowner); - - mutex_enter(&zp->z_acl_lock); - - error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE); - if (error != 0) { - mutex_exit(&zp->z_acl_lock); - return (error); - } - - ASSERT(zp->z_acl_cached); - - while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, - &iflags, &type))) { - uint32_t mask_matched; - - if (!zfs_acl_valid_ace_type(type, iflags)) - continue; - - if (S_ISDIR(ZTOI(zp)->i_mode) && - (iflags & ACE_INHERIT_ONLY_ACE)) - continue; - - /* Skip ACE if it does not affect any AoI */ - mask_matched = (access_mask & *working_mode); - if (!mask_matched) - continue; - - entry_type = (iflags & ACE_TYPE_FLAGS); - - checkit = B_FALSE; - - switch (entry_type) { - case ACE_OWNER: - if (uid == fowner) - checkit = B_TRUE; - break; - case OWNING_GROUP: - who = gowner; - /*FALLTHROUGH*/ - case ACE_IDENTIFIER_GROUP: - checkit = zfs_groupmember(zfsvfs, who, cr); - break; - case ACE_EVERYONE: - checkit = B_TRUE; - break; - - /* USER Entry */ - default: - if (entry_type == 0) { - uid_t newid; - - newid = zfs_fuid_map_id(zfsvfs, who, cr, - ZFS_ACE_USER); - if (newid != IDMAP_WK_CREATOR_OWNER_UID && - uid == newid) - checkit = B_TRUE; - break; - } else { - mutex_exit(&zp->z_acl_lock); - return (SET_ERROR(EIO)); - } - } - - if (checkit) { - if (type == DENY) { - DTRACE_PROBE3(zfs__ace__denies, - znode_t *, zp, - zfs_ace_hdr_t *, acep, - uint32_t, mask_matched); - deny_mask |= mask_matched; - } else { - DTRACE_PROBE3(zfs__ace__allows, - znode_t *, zp, - zfs_ace_hdr_t *, acep, - uint32_t, mask_matched); - if (anyaccess) { - mutex_exit(&zp->z_acl_lock); - return (0); - } - } - *working_mode &= ~mask_matched; - } - - /* Are we done? */ - if (*working_mode == 0) - break; - } - - mutex_exit(&zp->z_acl_lock); - - /* Put the found 'denies' back on the working mode */ - if (deny_mask) { - *working_mode |= deny_mask; - return (SET_ERROR(EACCES)); - } else if (*working_mode) { - return (-1); - } - - return (0); -} - -/* - * Return true if any access whatsoever granted, we don't actually - * care what access is granted. 
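zfs_zaccess_aces_check() above treats *working_mode as a to-do list of access bits: each matching ACE removes the bits it settles, denials are remembered, and the walk stops as soon as nothing is left to decide. A standalone sketch of that bookkeeping, with the ACE reduced to a type and a mask:

/*
 * Sketch of the working_mode bookkeeping in zfs_zaccess_aces_check():
 * the first ACE to cover a bit decides it; denied bits are handed back.
 */
#include <stdio.h>
#include <errno.h>
#include <stdint.h>

enum { A_ALLOW, A_DENY };
struct ace { int type; uint32_t mask; };

static int
check_aces(const struct ace *a, int n, uint32_t *working_mode)
{
	uint32_t deny = 0;

	for (int i = 0; i < n && *working_mode != 0; i++) {
		uint32_t matched = a[i].mask & *working_mode;

		if (matched == 0)
			continue;	/* ACE says nothing about our bits */
		if (a[i].type == A_DENY)
			deny |= matched;
		*working_mode &= ~matched;	/* decided either way */
	}
	if (deny != 0) {
		*working_mode |= deny;	/* report the denied bits back */
		return (EACCES);
	}
	return (*working_mode ? -1 : 0);	/* -1: no ACE covered them */
}

int
main(void)
{
	struct ace aces[] = { { A_DENY, 0x2 }, { A_ALLOW, 0x3 } };
	uint32_t wm = 0x3;	/* ask about bits 0x1 and 0x2 */
	int err = check_aces(aces, 2, &wm);

	printf("err=%d denied=0x%x\n", err, wm);	/* EACCES, denied=0x2 */
	return (0);
}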
- */ -boolean_t -zfs_has_access(znode_t *zp, cred_t *cr) -{ - uint32_t have = ACE_ALL_PERMS; - - if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) { - uid_t owner; - - owner = zfs_fuid_map_id(ZTOZSB(zp), - KUID_TO_SUID(ZTOI(zp)->i_uid), cr, ZFS_OWNER); - return (secpolicy_vnode_any_access(cr, ZTOI(zp), owner) == 0); - } - return (B_TRUE); -} - -static int -zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, - boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr) -{ - zfsvfs_t *zfsvfs = ZTOZSB(zp); - int err; - - *working_mode = v4_mode; - *check_privs = B_TRUE; - - /* - * Short circuit empty requests - */ - if (v4_mode == 0 || zfsvfs->z_replay) { - *working_mode = 0; - return (0); - } - - if ((err = zfs_zaccess_dataset_check(zp, v4_mode)) != 0) { - *check_privs = B_FALSE; - return (err); - } - - /* - * The caller requested that the ACL check be skipped. This - * would only happen if the caller checked VOP_ACCESS() with a - * 32 bit ACE mask and already had the appropriate permissions. - */ - if (skipaclchk) { - *working_mode = 0; - return (0); - } - - return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr)); -} - -static int -zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs, - cred_t *cr) -{ - if (*working_mode != ACE_WRITE_DATA) - return (SET_ERROR(EACCES)); - - return (zfs_zaccess_common(zp, ACE_APPEND_DATA, working_mode, - check_privs, B_FALSE, cr)); -} - -int -zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr) -{ - boolean_t owner = B_FALSE; - boolean_t groupmbr = B_FALSE; - boolean_t is_attr; - uid_t uid = crgetuid(cr); - int error; - - if (zdp->z_pflags & ZFS_AV_QUARANTINED) - return (SET_ERROR(EACCES)); - - is_attr = ((zdp->z_pflags & ZFS_XATTR) && - (S_ISDIR(ZTOI(zdp)->i_mode))); - if (is_attr) - goto slow; - - - mutex_enter(&zdp->z_acl_lock); - - if (zdp->z_pflags & ZFS_NO_EXECS_DENIED) { - mutex_exit(&zdp->z_acl_lock); - return (0); - } - - if (KUID_TO_SUID(ZTOI(zdp)->i_uid) != 0 || - KGID_TO_SGID(ZTOI(zdp)->i_gid) != 0) { - mutex_exit(&zdp->z_acl_lock); - goto slow; - } - - if (uid == KUID_TO_SUID(ZTOI(zdp)->i_uid)) { - owner = B_TRUE; - if (zdp->z_mode & S_IXUSR) { - mutex_exit(&zdp->z_acl_lock); - return (0); - } else { - mutex_exit(&zdp->z_acl_lock); - goto slow; - } - } - if (groupmember(KGID_TO_SGID(ZTOI(zdp)->i_gid), cr)) { - groupmbr = B_TRUE; - if (zdp->z_mode & S_IXGRP) { - mutex_exit(&zdp->z_acl_lock); - return (0); - } else { - mutex_exit(&zdp->z_acl_lock); - goto slow; - } - } - if (!owner && !groupmbr) { - if (zdp->z_mode & S_IXOTH) { - mutex_exit(&zdp->z_acl_lock); - return (0); - } - } - - mutex_exit(&zdp->z_acl_lock); - -slow: - DTRACE_PROBE(zfs__fastpath__execute__access__miss); - ZFS_ENTER(ZTOZSB(zdp)); - error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr); - ZFS_EXIT(ZTOZSB(zdp)); - return (error); -} - -/* - * Determine whether Access should be granted/denied. - * - * The least priv subsystem is always consulted as a basic privilege - * can define any form of access. 
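zfs_fastaccesschk_execute() above is a fast path for the common "may I traverse this directory?" question: answer from cached znode state when the case is provably simple, and fall back to the full zfs_zaccess() walk otherwise. The control-flow shape of that split, with all types and the slow-path body hypothetical:

/*
 * Sketch of the fast-path/slow-path split in zfs_fastaccesschk_execute().
 * struct node and slow_check() are stand-ins, not ZFS interfaces.
 */
#include <stdio.h>
#include <errno.h>
#include <stdbool.h>

struct node { bool no_execs_denied; unsigned mode; unsigned uid; };

static int
slow_check(const struct node *n, unsigned uid)
{
	(void) n; (void) uid;
	return (EACCES);	/* stand-in for the full ACL evaluation */
}

static int
exec_check(const struct node *n, unsigned uid)
{
	if (n->no_execs_denied)		/* no ACE anywhere denies execute */
		return (0);
	if (uid == n->uid)		/* owner: trust the cached mode bit */
		return ((n->mode & 0100) ? 0 : slow_check(n, uid));
	return (slow_check(n, uid));	/* anything subtle: full walk */
}

int
main(void)
{
	struct node dir = { false, 0755, 1000 };

	printf("owner: %d, other: %d\n",
	    exec_check(&dir, 1000), exec_check(&dir, 0));
	return (0);
}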
- */ -int -zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) -{ - uint32_t working_mode; - int error; - int is_attr; - boolean_t check_privs; - znode_t *xzp; - znode_t *check_zp = zp; - mode_t needed_bits; - uid_t owner; - - is_attr = ((zp->z_pflags & ZFS_XATTR) && S_ISDIR(ZTOI(zp)->i_mode)); - - /* - * If attribute then validate against base file - */ - if (is_attr) { - if ((error = zfs_zget(ZTOZSB(zp), - zp->z_xattr_parent, &xzp)) != 0) { - return (error); - } - - check_zp = xzp; - - /* - * fixup mode to map to xattr perms - */ - - if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) { - mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA); - mode |= ACE_WRITE_NAMED_ATTRS; - } - - if (mode & (ACE_READ_DATA|ACE_EXECUTE)) { - mode &= ~(ACE_READ_DATA|ACE_EXECUTE); - mode |= ACE_READ_NAMED_ATTRS; - } - } - - owner = zfs_fuid_map_id(ZTOZSB(zp), KUID_TO_SUID(ZTOI(zp)->i_uid), - cr, ZFS_OWNER); - /* - * Map the bits required to the standard inode flags - * S_IRUSR|S_IWUSR|S_IXUSR in the needed_bits. Map the bits - * mapped by working_mode (currently missing) in missing_bits. - * Call secpolicy_vnode_access2() with (needed_bits & ~checkmode), - * needed_bits. - */ - needed_bits = 0; - - working_mode = mode; - if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) && - owner == crgetuid(cr)) - working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); - - if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| - ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE)) - needed_bits |= S_IRUSR; - if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| - ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE)) - needed_bits |= S_IWUSR; - if (working_mode & ACE_EXECUTE) - needed_bits |= S_IXUSR; - - if ((error = zfs_zaccess_common(check_zp, mode, &working_mode, - &check_privs, skipaclchk, cr)) == 0) { - if (is_attr) - iput(ZTOI(xzp)); - return (secpolicy_vnode_access2(cr, ZTOI(zp), owner, - needed_bits, needed_bits)); - } - - if (error && !check_privs) { - if (is_attr) - iput(ZTOI(xzp)); - return (error); - } - - if (error && (flags & V_APPEND)) { - error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr); - } - - if (error && check_privs) { - mode_t checkmode = 0; - - /* - * First check for implicit owner permission on - * read_acl/read_attributes - */ - - error = 0; - ASSERT(working_mode != 0); - - if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) && - owner == crgetuid(cr))) - working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); - - if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| - ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE)) - checkmode |= S_IRUSR; - if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| - ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE)) - checkmode |= S_IWUSR; - if (working_mode & ACE_EXECUTE) - checkmode |= S_IXUSR; - - error = secpolicy_vnode_access2(cr, ZTOI(check_zp), owner, - needed_bits & ~checkmode, needed_bits); - - if (error == 0 && (working_mode & ACE_WRITE_OWNER)) - error = secpolicy_vnode_chown(cr, owner); - if (error == 0 && (working_mode & ACE_WRITE_ACL)) - error = secpolicy_vnode_setdac(cr, owner); - - if (error == 0 && (working_mode & - (ACE_DELETE|ACE_DELETE_CHILD))) - error = secpolicy_vnode_remove(cr); - - if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) { - error = secpolicy_vnode_chown(cr, owner); - } - if (error == 0) { - /* - * See if any bits other than those already checked - * for are still present. 
If so then return EACCES
-			 */
-			if (working_mode & ~(ZFS_CHECKED_MASKS)) {
-				error = SET_ERROR(EACCES);
-			}
-		}
-	} else if (error == 0) {
-		error = secpolicy_vnode_access2(cr, ZTOI(zp), owner,
-		    needed_bits, needed_bits);
-	}
-
-	if (is_attr)
-		iput(ZTOI(xzp));
-
-	return (error);
-}
-
-/*
- * Translate traditional unix S_IRUSR/S_IWUSR/S_IXUSR mode into
- * native ACL format and call zfs_zaccess()
- */
-int
-zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr)
-{
-	return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr));
-}
-
-/*
- * Access function for secpolicy_vnode_setattr
- */
-int
-zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr)
-{
-	int v4_mode = zfs_unix_to_v4(mode >> 6);
-
-	return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr));
-}
-
-static int
-zfs_delete_final_check(znode_t *zp, znode_t *dzp,
-    mode_t available_perms, cred_t *cr)
-{
-	int error;
-	uid_t downer;
-
-	downer = zfs_fuid_map_id(ZTOZSB(dzp), KUID_TO_SUID(ZTOI(dzp)->i_uid),
-	    cr, ZFS_OWNER);
-
-	error = secpolicy_vnode_access2(cr, ZTOI(dzp),
-	    downer, available_perms, S_IWUSR|S_IXUSR);
-
-	if (error == 0)
-		error = zfs_sticky_remove_access(dzp, zp, cr);
-
-	return (error);
-}
-
-/*
- * Determine whether Access should be granted or denied, without
- * consulting the least priv subsystem.
- *
- * The following chart is the recommended NFSv4 enforcement for
- * ability to delete an object.
- *
- * -------------------------------------------------------
- * |  Parent Dir   |      Target Object Permissions       |
- * |  permissions  |                                      |
- * -------------------------------------------------------
- * |               | ACL Allows | ACL Denies | Delete     |
- * |               |  Delete    |  Delete    | unspecified|
- * -------------------------------------------------------
- * | ACL Allows    | Permit     | Permit     | Permit     |
- * | DELETE_CHILD  |                                      |
- * -------------------------------------------------------
- * | ACL Denies    | Permit     | Deny       | Deny       |
- * | DELETE_CHILD  |            |            |            |
- * -------------------------------------------------------
- * | ACL specifies |            |            |            |
- * | only allow    | Permit     | Permit     | Permit     |
- * | write and     |            |            |            |
- * | execute       |            |            |            |
- * -------------------------------------------------------
- * | ACL denies    |            |            |            |
- * | write and     | Permit     | Deny       | Deny       |
- * | execute       |            |            |            |
- * -------------------------------------------------------
- *    ^
- *    |
- *    No search privilege, can't even look up file?
- *
- */
-int
-zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr)
-{
-	uint32_t dzp_working_mode = 0;
-	uint32_t zp_working_mode = 0;
-	int dzp_error, zp_error;
-	mode_t available_perms;
-	boolean_t dzpcheck_privs = B_TRUE;
-	boolean_t zpcheck_privs = B_TRUE;
-
-	/*
-	 * We want specific DELETE permissions to
-	 * take precedence over WRITE/EXECUTE. We don't
-	 * want an ACL such as this to mess us up.
-	 * user:joe:write_data:deny,user:joe:delete:allow
-	 *
-	 * However, deny permissions may ultimately be overridden
-	 * by secpolicy_vnode_access().
-	 *
-	 * We will ask for all of the necessary permissions and then
-	 * look at the working modes from the directory and target object
-	 * to determine what was found.
-	 */
-
-	if (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_NOUNLINK))
-		return (SET_ERROR(EPERM));
-
-	/*
-	 * First row
-	 * If the directory permissions allow the delete, we are done.
- */ - if ((dzp_error = zfs_zaccess_common(dzp, ACE_DELETE_CHILD, - &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr)) == 0) - return (0); - - /* - * If target object has delete permission then we are done - */ - if ((zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode, - &zpcheck_privs, B_FALSE, cr)) == 0) - return (0); - - ASSERT(dzp_error && zp_error); - - if (!dzpcheck_privs) - return (dzp_error); - if (!zpcheck_privs) - return (zp_error); - - /* - * Second row - * - * If directory returns EACCES then delete_child was denied - * due to deny delete_child. In this case send the request through - * secpolicy_vnode_remove(). We don't use zfs_delete_final_check() - * since that *could* allow the delete based on write/execute permission - * and we want delete permissions to override write/execute. - */ - - if (dzp_error == EACCES) - return (secpolicy_vnode_remove(cr)); - - /* - * Third Row - * only need to see if we have write/execute on directory. - */ - - dzp_error = zfs_zaccess_common(dzp, ACE_EXECUTE|ACE_WRITE_DATA, - &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr); - - if (dzp_error != 0 && !dzpcheck_privs) - return (dzp_error); - - /* - * Fourth row - */ - - available_perms = (dzp_working_mode & ACE_WRITE_DATA) ? 0 : S_IWUSR; - available_perms |= (dzp_working_mode & ACE_EXECUTE) ? 0 : S_IXUSR; - - return (zfs_delete_final_check(zp, dzp, available_perms, cr)); - -} - -int -zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp, - znode_t *tzp, cred_t *cr) -{ - int add_perm; - int error; - - if (szp->z_pflags & ZFS_AV_QUARANTINED) - return (SET_ERROR(EACCES)); - - add_perm = S_ISDIR(ZTOI(szp)->i_mode) ? - ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE; - - /* - * Rename permissions are combination of delete permission + - * add file/subdir permission. - */ - - /* - * first make sure we do the delete portion. - * - * If that succeeds then check for add_file/add_subdir permissions - */ - - if ((error = zfs_zaccess_delete(sdzp, szp, cr))) - return (error); - - /* - * If we have a tzp, see if we can delete it? - */ - if (tzp) { - if ((error = zfs_zaccess_delete(tdzp, tzp, cr))) - return (error); - } - - /* - * Now check for add permissions - */ - error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr); - - return (error); -} diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c deleted file mode 100644 index 1e61ef06d..000000000 --- a/module/zfs/zfs_ctldir.c +++ /dev/null @@ -1,1240 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (C) 2011 Lawrence Livermore National Security, LLC. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * LLNL-CODE-403049. 
- * Rewritten for Linux by:
- * Rohan Puri <[email protected]>
- * Brian Behlendorf <[email protected]>
- * Copyright (c) 2013 by Delphix. All rights reserved.
- * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
- * Copyright (c) 2018 George Melikov. All Rights Reserved.
- * Copyright (c) 2019 Datto, Inc. All rights reserved.
- */
-
-/*
- * ZFS control directory (a.k.a. ".zfs")
- *
- * This directory provides a common location for all ZFS meta-objects.
- * Currently, this is only the 'snapshot' and 'shares' directory, but this may
- * expand in the future. The elements are built dynamically, as the hierarchy
- * does not actually exist on disk.
- *
- * For 'snapshot', we don't want to have all snapshots always mounted, because
- * this would take up a huge amount of space in /etc/mnttab. We have three
- * types of objects:
- *
- *	ctldir ------> snapshotdir -------> snapshot
- *	                                        |
- *	                                        |
- *	                                        V
- *	                                    mounted fs
- *
- * The 'snapshot' node contains just enough information to look up '..' and
- * act as a mountpoint for the snapshot. Whenever we look up a specific
- * snapshot, we perform an automount of the underlying filesystem and return
- * the corresponding inode.
- *
- * All mounts are handled automatically by a user mode helper which invokes
- * the mount procedure. Unmounts are handled by allowing the mount
- * point to expire so the kernel may automatically unmount it.
- *
- * The '.zfs', '.zfs/snapshot', and all directories created under
- * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') all share the same
- * zfsvfs_t as the head filesystem (what '.zfs' lives under).
- *
- * File systems mounted on top of the '.zfs/snapshot/<snapname>' paths
- * (ie: snapshots) are complete ZFS filesystems and have their own unique
- * zfsvfs_t. However, the fsid reported by these mounts will be the same
- * as that used by the parent zfsvfs_t to make NFS happy.
- */
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/time.h>
-#include <sys/sysmacros.h>
-#include <sys/pathname.h>
-#include <sys/vfs.h>
-#include <sys/zfs_ctldir.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/zfs_vfsops.h>
-#include <sys/zfs_vnops.h>
-#include <sys/stat.h>
-#include <sys/dmu.h>
-#include <sys/dmu_objset.h>
-#include <sys/dsl_destroy.h>
-#include <sys/dsl_deleg.h>
-#include <sys/zpl.h>
-#include <sys/mntent.h>
-#include "zfs_namecheck.h"
-
-/*
- * Two AVL trees are maintained which contain all currently automounted
- * snapshots. Every automounted snapshot maps to a single zfs_snapentry_t
- * entry which MUST:
- *
- *    - be attached to both trees, and
- *    - be unique, no duplicate entries are allowed.
- *
- * The zfs_snapshots_by_name tree is indexed by the full dataset name
- * while the zfs_snapshots_by_objsetid tree is indexed by the unique
- * objsetid. This allows for fast lookups either by name or objsetid.
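Both trees index the same zfs_snapentry_t, so a snapshot reached by path and one reached over NFS (which only knows the objsetid) resolve to the identical entry. A rough illustration using the lookup helpers defined below; the snapshot name and objsetid values are hypothetical:

/* one automounted snapshot, reachable through either key */
se = zfsctl_snapshot_find_by_name("rpool/home@monday");
se = zfsctl_snapshot_find_by_objsetid(spa, 0x1234);
/* both calls return the same held zfs_snapentry_t */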
- */ -static avl_tree_t zfs_snapshots_by_name; -static avl_tree_t zfs_snapshots_by_objsetid; -static krwlock_t zfs_snapshot_lock; - -/* - * Control Directory Tunables (.zfs) - */ -int zfs_expire_snapshot = ZFSCTL_EXPIRE_SNAPSHOT; -int zfs_admin_snapshot = 0; - -typedef struct { - char *se_name; /* full snapshot name */ - char *se_path; /* full mount path */ - spa_t *se_spa; /* pool spa */ - uint64_t se_objsetid; /* snapshot objset id */ - struct dentry *se_root_dentry; /* snapshot root dentry */ - taskqid_t se_taskqid; /* scheduled unmount taskqid */ - avl_node_t se_node_name; /* zfs_snapshots_by_name link */ - avl_node_t se_node_objsetid; /* zfs_snapshots_by_objsetid link */ - zfs_refcount_t se_refcount; /* reference count */ -} zfs_snapentry_t; - -static void zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay); - -/* - * Allocate a new zfs_snapentry_t being careful to make a copy of the - * the snapshot name and provided mount point. No reference is taken. - */ -static zfs_snapentry_t * -zfsctl_snapshot_alloc(char *full_name, char *full_path, spa_t *spa, - uint64_t objsetid, struct dentry *root_dentry) -{ - zfs_snapentry_t *se; - - se = kmem_zalloc(sizeof (zfs_snapentry_t), KM_SLEEP); - - se->se_name = strdup(full_name); - se->se_path = strdup(full_path); - se->se_spa = spa; - se->se_objsetid = objsetid; - se->se_root_dentry = root_dentry; - se->se_taskqid = TASKQID_INVALID; - - zfs_refcount_create(&se->se_refcount); - - return (se); -} - -/* - * Free a zfs_snapentry_t the caller must ensure there are no active - * references. - */ -static void -zfsctl_snapshot_free(zfs_snapentry_t *se) -{ - zfs_refcount_destroy(&se->se_refcount); - strfree(se->se_name); - strfree(se->se_path); - - kmem_free(se, sizeof (zfs_snapentry_t)); -} - -/* - * Hold a reference on the zfs_snapentry_t. - */ -static void -zfsctl_snapshot_hold(zfs_snapentry_t *se) -{ - zfs_refcount_add(&se->se_refcount, NULL); -} - -/* - * Release a reference on the zfs_snapentry_t. When the number of - * references drops to zero the structure will be freed. - */ -static void -zfsctl_snapshot_rele(zfs_snapentry_t *se) -{ - if (zfs_refcount_remove(&se->se_refcount, NULL) == 0) - zfsctl_snapshot_free(se); -} - -/* - * Add a zfs_snapentry_t to both the zfs_snapshots_by_name and - * zfs_snapshots_by_objsetid trees. While the zfs_snapentry_t is part - * of the trees a reference is held. - */ -static void -zfsctl_snapshot_add(zfs_snapentry_t *se) -{ - ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock)); - zfsctl_snapshot_hold(se); - avl_add(&zfs_snapshots_by_name, se); - avl_add(&zfs_snapshots_by_objsetid, se); -} - -/* - * Remove a zfs_snapentry_t from both the zfs_snapshots_by_name and - * zfs_snapshots_by_objsetid trees. Upon removal a reference is dropped, - * this can result in the structure being freed if that was the last - * remaining reference. - */ -static void -zfsctl_snapshot_remove(zfs_snapentry_t *se) -{ - ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock)); - avl_remove(&zfs_snapshots_by_name, se); - avl_remove(&zfs_snapshots_by_objsetid, se); - zfsctl_snapshot_rele(se); -} - -/* - * Snapshot name comparison function for the zfs_snapshots_by_name. - */ -static int -snapentry_compare_by_name(const void *a, const void *b) -{ - const zfs_snapentry_t *se_a = a; - const zfs_snapentry_t *se_b = b; - int ret; - - ret = strcmp(se_a->se_name, se_b->se_name); - - if (ret < 0) - return (-1); - else if (ret > 0) - return (1); - else - return (0); -} - -/* - * Snapshot name comparison function for the zfs_snapshots_by_objsetid. 
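The AVL code expects a comparator that returns exactly -1, 0, or 1 and defines a total order, which is why the raw strcmp() result above is clamped rather than returned directly. The clamp is equivalent to a sign computation; CMP_SIGN is a hypothetical helper, not part of this file:

/* Map any ordered comparison to exactly -1, 0, or 1. */
#define	CMP_SIGN(a, b)	(((a) > (b)) - ((a) < (b)))

/* equivalent to the clamped strcmp() in snapentry_compare_by_name() */
return (CMP_SIGN(strcmp(se_a->se_name, se_b->se_name), 0));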
- */ -static int -snapentry_compare_by_objsetid(const void *a, const void *b) -{ - const zfs_snapentry_t *se_a = a; - const zfs_snapentry_t *se_b = b; - - if (se_a->se_spa != se_b->se_spa) - return ((ulong_t)se_a->se_spa < (ulong_t)se_b->se_spa ? -1 : 1); - - if (se_a->se_objsetid < se_b->se_objsetid) - return (-1); - else if (se_a->se_objsetid > se_b->se_objsetid) - return (1); - else - return (0); -} - -/* - * Find a zfs_snapentry_t in zfs_snapshots_by_name. If the snapname - * is found a pointer to the zfs_snapentry_t is returned and a reference - * taken on the structure. The caller is responsible for dropping the - * reference with zfsctl_snapshot_rele(). If the snapname is not found - * NULL will be returned. - */ -static zfs_snapentry_t * -zfsctl_snapshot_find_by_name(char *snapname) -{ - zfs_snapentry_t *se, search; - - ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock)); - - search.se_name = snapname; - se = avl_find(&zfs_snapshots_by_name, &search, NULL); - if (se) - zfsctl_snapshot_hold(se); - - return (se); -} - -/* - * Find a zfs_snapentry_t in zfs_snapshots_by_objsetid given the objset id - * rather than the snapname. In all other respects it behaves the same - * as zfsctl_snapshot_find_by_name(). - */ -static zfs_snapentry_t * -zfsctl_snapshot_find_by_objsetid(spa_t *spa, uint64_t objsetid) -{ - zfs_snapentry_t *se, search; - - ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock)); - - search.se_spa = spa; - search.se_objsetid = objsetid; - se = avl_find(&zfs_snapshots_by_objsetid, &search, NULL); - if (se) - zfsctl_snapshot_hold(se); - - return (se); -} - -/* - * Rename a zfs_snapentry_t in the zfs_snapshots_by_name. The structure is - * removed, renamed, and added back to the new correct location in the tree. - */ -static int -zfsctl_snapshot_rename(char *old_snapname, char *new_snapname) -{ - zfs_snapentry_t *se; - - ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock)); - - se = zfsctl_snapshot_find_by_name(old_snapname); - if (se == NULL) - return (SET_ERROR(ENOENT)); - - zfsctl_snapshot_remove(se); - strfree(se->se_name); - se->se_name = strdup(new_snapname); - zfsctl_snapshot_add(se); - zfsctl_snapshot_rele(se); - - return (0); -} - -/* - * Delayed task responsible for unmounting an expired automounted snapshot. - */ -static void -snapentry_expire(void *data) -{ - zfs_snapentry_t *se = (zfs_snapentry_t *)data; - spa_t *spa = se->se_spa; - uint64_t objsetid = se->se_objsetid; - - if (zfs_expire_snapshot <= 0) { - zfsctl_snapshot_rele(se); - return; - } - - se->se_taskqid = TASKQID_INVALID; - (void) zfsctl_snapshot_unmount(se->se_name, MNT_EXPIRE); - zfsctl_snapshot_rele(se); - - /* - * Reschedule the unmount if the zfs_snapentry_t wasn't removed. - * This can occur when the snapshot is busy. - */ - rw_enter(&zfs_snapshot_lock, RW_READER); - if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) { - zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot); - zfsctl_snapshot_rele(se); - } - rw_exit(&zfs_snapshot_lock); -} - -/* - * Cancel an automatic unmount of a snapname. This callback is responsible - * for dropping the reference on the zfs_snapentry_t which was taken when - * during dispatch. - */ -static void -zfsctl_snapshot_unmount_cancel(zfs_snapentry_t *se) -{ - if (taskq_cancel_id(system_delay_taskq, se->se_taskqid) == 0) { - se->se_taskqid = TASKQID_INVALID; - zfsctl_snapshot_rele(se); - } -} - -/* - * Dispatch the unmount task for delayed handling with a hold protecting it. 
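The dispatch-time hold on the zfs_snapentry_t follows a strict hand-off. The summary below restates the protocol implemented by the functions above and below; it is not new logic:

/*
 * zfsctl_snapshot_unmount_delay_impl()   hold(se), dispatch delayed task
 * snapentry_expire()                     task ran:   rele(se)
 * zfsctl_snapshot_unmount_cancel()       cancel won: rele(se)
 *
 * Whichever side wins resets se_taskqid to TASKQID_INVALID, so the
 * dispatch-time hold is dropped exactly once.
 */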
- */
-static void
-zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay)
-{
-	ASSERT3S(se->se_taskqid, ==, TASKQID_INVALID);
-
-	if (delay <= 0)
-		return;
-
-	zfsctl_snapshot_hold(se);
-	se->se_taskqid = taskq_dispatch_delay(system_delay_taskq,
-	    snapentry_expire, se, TQ_SLEEP, ddi_get_lbolt() + delay * HZ);
-}
-
-/*
- * Schedule an automatic unmount of objset id to occur in delay seconds from
- * now. Any previous delayed unmount will be cancelled in favor of the
- * updated deadline. A reference is taken by
- * zfsctl_snapshot_find_by_objsetid() and held until the outstanding task
- * is handled or cancelled.
- */
-int
-zfsctl_snapshot_unmount_delay(spa_t *spa, uint64_t objsetid, int delay)
-{
-	zfs_snapentry_t *se;
-	int error = ENOENT;
-
-	rw_enter(&zfs_snapshot_lock, RW_READER);
-	if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) {
-		zfsctl_snapshot_unmount_cancel(se);
-		zfsctl_snapshot_unmount_delay_impl(se, delay);
-		zfsctl_snapshot_rele(se);
-		error = 0;
-	}
-	rw_exit(&zfs_snapshot_lock);
-
-	return (error);
-}
-
-/*
- * Check if snapname is currently mounted. Returns B_TRUE when mounted
- * and B_FALSE otherwise.
- */
-static boolean_t
-zfsctl_snapshot_ismounted(char *snapname)
-{
-	zfs_snapentry_t *se;
-	boolean_t ismounted = B_FALSE;
-
-	rw_enter(&zfs_snapshot_lock, RW_READER);
-	if ((se = zfsctl_snapshot_find_by_name(snapname)) != NULL) {
-		zfsctl_snapshot_rele(se);
-		ismounted = B_TRUE;
-	}
-	rw_exit(&zfs_snapshot_lock);
-
-	return (ismounted);
-}
-
-/*
- * Check if the given inode is a part of the virtual .zfs directory.
- */
-boolean_t
-zfsctl_is_node(struct inode *ip)
-{
-	return (ITOZ(ip)->z_is_ctldir);
-}
-
-/*
- * Check if the given inode is a '.zfs/snapshot/<snapname>' directory.
- */
-boolean_t
-zfsctl_is_snapdir(struct inode *ip)
-{
-	return (zfsctl_is_node(ip) && (ip->i_ino <= ZFSCTL_INO_SNAPDIRS));
-}
-
-/*
- * Allocate a new inode with the passed id and ops.
- */
-static struct inode *
-zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id,
-    const struct file_operations *fops, const struct inode_operations *ops)
-{
-	inode_timespec_t now;
-	struct inode *ip;
-	znode_t *zp;
-
-	ip = new_inode(zfsvfs->z_sb);
-	if (ip == NULL)
-		return (NULL);
-
-	now = current_time(ip);
-	zp = ITOZ(ip);
-	ASSERT3P(zp->z_dirlocks, ==, NULL);
-	ASSERT3P(zp->z_acl_cached, ==, NULL);
-	ASSERT3P(zp->z_xattr_cached, ==, NULL);
-	zp->z_id = id;
-	zp->z_unlinked = B_FALSE;
-	zp->z_atime_dirty = B_FALSE;
-	zp->z_zn_prefetch = B_FALSE;
-	zp->z_moved = B_FALSE;
-	zp->z_is_sa = B_FALSE;
-	zp->z_is_mapped = B_FALSE;
-	zp->z_is_ctldir = B_TRUE;
-	zp->z_is_stale = B_FALSE;
-	zp->z_sa_hdl = NULL;
-	zp->z_blksz = 0;
-	zp->z_seq = 0;
-	zp->z_mapcnt = 0;
-	zp->z_size = 0;
-	zp->z_pflags = 0;
-	zp->z_mode = 0;
-	zp->z_sync_cnt = 0;
-	ip->i_generation = 0;
-	ip->i_ino = id;
-	ip->i_mode = (S_IFDIR | S_IRWXUGO);
-	ip->i_uid = SUID_TO_KUID(0);
-	ip->i_gid = SGID_TO_KGID(0);
-	ip->i_blkbits = SPA_MINBLOCKSHIFT;
-	ip->i_atime = now;
-	ip->i_mtime = now;
-	ip->i_ctime = now;
-	ip->i_fop = fops;
-	ip->i_op = ops;
-#if defined(IOP_XATTR)
-	ip->i_opflags &= ~IOP_XATTR;
-#endif
-
-	if (insert_inode_locked(ip)) {
-		unlock_new_inode(ip);
-		iput(ip);
-		return (NULL);
-	}
-
-	mutex_enter(&zfsvfs->z_znodes_lock);
-	list_insert_tail(&zfsvfs->z_all_znodes, zp);
-	zfsvfs->z_nr_znodes++;
-	membar_producer();
-	mutex_exit(&zfsvfs->z_znodes_lock);
-
-	unlock_new_inode(ip);
-
-	return (ip);
-}
-
-/*
- * Look up the inode with the given id, allocating it if needed.
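zfsctl_is_snapdir() above works because control directory inode numbers come from a reserved range: the static nodes use the fixed ids ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, and ZFSCTL_INO_SHARES, while each snapshot directory counts down from ZFSCTL_INO_SNAPDIRS by its objset id, making the objsetid recoverable by subtraction:

/* allocation, as in zfsctl_snapdir_lookup() below */
ino = ZFSCTL_INO_SNAPDIRS - id;

/* recovery, as in zfsctl_snapdir_fid() and zfsctl_snapdir_vget() below */
objsetid = ZFSCTL_INO_SNAPDIRS - ip->i_ino;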
- */ -static struct inode * -zfsctl_inode_lookup(zfsvfs_t *zfsvfs, uint64_t id, - const struct file_operations *fops, const struct inode_operations *ops) -{ - struct inode *ip = NULL; - - while (ip == NULL) { - ip = ilookup(zfsvfs->z_sb, (unsigned long)id); - if (ip) - break; - - /* May fail due to concurrent zfsctl_inode_alloc() */ - ip = zfsctl_inode_alloc(zfsvfs, id, fops, ops); - } - - return (ip); -} - -/* - * Create the '.zfs' directory. This directory is cached as part of the VFS - * structure. This results in a hold on the zfsvfs_t. The code in zfs_umount() - * therefore checks against a vfs_count of 2 instead of 1. This reference - * is removed when the ctldir is destroyed in the unmount. All other entities - * under the '.zfs' directory are created dynamically as needed. - * - * Because the dynamically created '.zfs' directory entries assume the use - * of 64-bit inode numbers this support must be disabled on 32-bit systems. - */ -int -zfsctl_create(zfsvfs_t *zfsvfs) -{ - ASSERT(zfsvfs->z_ctldir == NULL); - - zfsvfs->z_ctldir = zfsctl_inode_alloc(zfsvfs, ZFSCTL_INO_ROOT, - &zpl_fops_root, &zpl_ops_root); - if (zfsvfs->z_ctldir == NULL) - return (SET_ERROR(ENOENT)); - - return (0); -} - -/* - * Destroy the '.zfs' directory or remove a snapshot from zfs_snapshots_by_name. - * Only called when the filesystem is unmounted. - */ -void -zfsctl_destroy(zfsvfs_t *zfsvfs) -{ - if (zfsvfs->z_issnap) { - zfs_snapentry_t *se; - spa_t *spa = zfsvfs->z_os->os_spa; - uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); - - rw_enter(&zfs_snapshot_lock, RW_WRITER); - se = zfsctl_snapshot_find_by_objsetid(spa, objsetid); - if (se != NULL) - zfsctl_snapshot_remove(se); - rw_exit(&zfs_snapshot_lock); - if (se != NULL) { - zfsctl_snapshot_unmount_cancel(se); - zfsctl_snapshot_rele(se); - } - } else if (zfsvfs->z_ctldir) { - iput(zfsvfs->z_ctldir); - zfsvfs->z_ctldir = NULL; - } -} - -/* - * Given a root znode, retrieve the associated .zfs directory. - * Add a hold to the vnode and return it. - */ -struct inode * -zfsctl_root(znode_t *zp) -{ - ASSERT(zfs_has_ctldir(zp)); - igrab(ZTOZSB(zp)->z_ctldir); - return (ZTOZSB(zp)->z_ctldir); -} - -/* - * Generate a long fid to indicate a snapdir. We encode whether snapdir is - * already mounted in gen field. We do this because nfsd lookup will not - * trigger automount. Next time the nfsd does fh_to_dentry, we will notice - * this and do automount and return ESTALE to force nfsd revalidate and follow - * mount. - */ -static int -zfsctl_snapdir_fid(struct inode *ip, fid_t *fidp) -{ - zfid_short_t *zfid = (zfid_short_t *)fidp; - zfid_long_t *zlfid = (zfid_long_t *)fidp; - uint32_t gen = 0; - uint64_t object; - uint64_t objsetid; - int i; - struct dentry *dentry; - - if (fidp->fid_len < LONG_FID_LEN) { - fidp->fid_len = LONG_FID_LEN; - return (SET_ERROR(ENOSPC)); - } - - object = ip->i_ino; - objsetid = ZFSCTL_INO_SNAPDIRS - ip->i_ino; - zfid->zf_len = LONG_FID_LEN; - - dentry = d_obtain_alias(igrab(ip)); - if (!IS_ERR(dentry)) { - gen = !!d_mountpoint(dentry); - dput(dentry); - } - - for (i = 0; i < sizeof (zfid->zf_object); i++) - zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); - - for (i = 0; i < sizeof (zfid->zf_gen); i++) - zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); - - for (i = 0; i < sizeof (zlfid->zf_setid); i++) - zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); - - for (i = 0; i < sizeof (zlfid->zf_setgen); i++) - zlfid->zf_setgen[i] = 0; - - return (0); -} - -/* - * Generate an appropriate fid for an entry in the .zfs directory. 
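The packing loops in zfsctl_snapdir_fid() above store each 64-bit value one byte per array slot, least significant byte first, so a consumer can reassemble it independent of host endianness. The inverse operation, as a sketch of what the fh-to-inode path does elsewhere:

uint64_t object = 0;
int i;

for (i = 0; i < sizeof (zfid->zf_object); i++)
	object |= (uint64_t)zfid->zf_object[i] << (8 * i);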
- */ -int -zfsctl_fid(struct inode *ip, fid_t *fidp) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - uint64_t object = zp->z_id; - zfid_short_t *zfid; - int i; - - ZFS_ENTER(zfsvfs); - - if (zfsctl_is_snapdir(ip)) { - ZFS_EXIT(zfsvfs); - return (zfsctl_snapdir_fid(ip, fidp)); - } - - if (fidp->fid_len < SHORT_FID_LEN) { - fidp->fid_len = SHORT_FID_LEN; - ZFS_EXIT(zfsvfs); - return (SET_ERROR(ENOSPC)); - } - - zfid = (zfid_short_t *)fidp; - - zfid->zf_len = SHORT_FID_LEN; - - for (i = 0; i < sizeof (zfid->zf_object); i++) - zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); - - /* .zfs znodes always have a generation number of 0 */ - for (i = 0; i < sizeof (zfid->zf_gen); i++) - zfid->zf_gen[i] = 0; - - ZFS_EXIT(zfsvfs); - return (0); -} - -/* - * Construct a full dataset name in full_name: "pool/dataset@snap_name" - */ -static int -zfsctl_snapshot_name(zfsvfs_t *zfsvfs, const char *snap_name, int len, - char *full_name) -{ - objset_t *os = zfsvfs->z_os; - - if (zfs_component_namecheck(snap_name, NULL, NULL) != 0) - return (SET_ERROR(EILSEQ)); - - dmu_objset_name(os, full_name); - if ((strlen(full_name) + 1 + strlen(snap_name)) >= len) - return (SET_ERROR(ENAMETOOLONG)); - - (void) strcat(full_name, "@"); - (void) strcat(full_name, snap_name); - - return (0); -} - -/* - * Returns full path in full_path: "/pool/dataset/.zfs/snapshot/snap_name/" - */ -static int -zfsctl_snapshot_path_objset(zfsvfs_t *zfsvfs, uint64_t objsetid, - int path_len, char *full_path) -{ - objset_t *os = zfsvfs->z_os; - fstrans_cookie_t cookie; - char *snapname; - boolean_t case_conflict; - uint64_t id, pos = 0; - int error = 0; - - if (zfsvfs->z_vfs->vfs_mntpoint == NULL) - return (SET_ERROR(ENOENT)); - - cookie = spl_fstrans_mark(); - snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); - - while (error == 0) { - dsl_pool_config_enter(dmu_objset_pool(os), FTAG); - error = dmu_snapshot_list_next(zfsvfs->z_os, - ZFS_MAX_DATASET_NAME_LEN, snapname, &id, &pos, - &case_conflict); - dsl_pool_config_exit(dmu_objset_pool(os), FTAG); - if (error) - goto out; - - if (id == objsetid) - break; - } - - snprintf(full_path, path_len, "%s/.zfs/snapshot/%s", - zfsvfs->z_vfs->vfs_mntpoint, snapname); -out: - kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN); - spl_fstrans_unmark(cookie); - - return (error); -} - -/* - * Special case the handling of "..". - */ -int -zfsctl_root_lookup(struct inode *dip, char *name, struct inode **ipp, - int flags, cred_t *cr, int *direntflags, pathname_t *realpnp) -{ - zfsvfs_t *zfsvfs = ITOZSB(dip); - int error = 0; - - ZFS_ENTER(zfsvfs); - - if (strcmp(name, "..") == 0) { - *ipp = dip->i_sb->s_root->d_inode; - } else if (strcmp(name, ZFS_SNAPDIR_NAME) == 0) { - *ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SNAPDIR, - &zpl_fops_snapdir, &zpl_ops_snapdir); - } else if (strcmp(name, ZFS_SHAREDIR_NAME) == 0) { - *ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SHARES, - &zpl_fops_shares, &zpl_ops_shares); - } else { - *ipp = NULL; - } - - if (*ipp == NULL) - error = SET_ERROR(ENOENT); - - ZFS_EXIT(zfsvfs); - - return (error); -} - -/* - * Lookup entry point for the 'snapshot' directory. Try to open the - * snapshot if it exist, creating the pseudo filesystem inode as necessary. 
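zfsctl_snapshot_name() above rejects any combination where dataset + '@' + snapshot would overflow the caller's buffer. A usage sketch with a hypothetical snapshot name:

char full_name[ZFS_MAX_DATASET_NAME_LEN];

/* yields e.g. "rpool/home@monday", or ENAMETOOLONG if it cannot fit */
error = zfsctl_snapshot_name(zfsvfs, "monday",
    sizeof (full_name), full_name);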
- */ -int -zfsctl_snapdir_lookup(struct inode *dip, char *name, struct inode **ipp, - int flags, cred_t *cr, int *direntflags, pathname_t *realpnp) -{ - zfsvfs_t *zfsvfs = ITOZSB(dip); - uint64_t id; - int error; - - ZFS_ENTER(zfsvfs); - - error = dmu_snapshot_lookup(zfsvfs->z_os, name, &id); - if (error) { - ZFS_EXIT(zfsvfs); - return (error); - } - - *ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SNAPDIRS - id, - &simple_dir_operations, &simple_dir_inode_operations); - if (*ipp == NULL) - error = SET_ERROR(ENOENT); - - ZFS_EXIT(zfsvfs); - - return (error); -} - -/* - * Renaming a directory under '.zfs/snapshot' will automatically trigger - * a rename of the snapshot to the new given name. The rename is confined - * to the '.zfs/snapshot' directory snapshots cannot be moved elsewhere. - */ -int -zfsctl_snapdir_rename(struct inode *sdip, char *snm, - struct inode *tdip, char *tnm, cred_t *cr, int flags) -{ - zfsvfs_t *zfsvfs = ITOZSB(sdip); - char *to, *from, *real, *fsname; - int error; - - if (!zfs_admin_snapshot) - return (SET_ERROR(EACCES)); - - ZFS_ENTER(zfsvfs); - - to = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); - from = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); - real = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); - fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); - - if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { - error = dmu_snapshot_realname(zfsvfs->z_os, snm, real, - ZFS_MAX_DATASET_NAME_LEN, NULL); - if (error == 0) { - snm = real; - } else if (error != ENOTSUP) { - goto out; - } - } - - dmu_objset_name(zfsvfs->z_os, fsname); - - error = zfsctl_snapshot_name(ITOZSB(sdip), snm, - ZFS_MAX_DATASET_NAME_LEN, from); - if (error == 0) - error = zfsctl_snapshot_name(ITOZSB(tdip), tnm, - ZFS_MAX_DATASET_NAME_LEN, to); - if (error == 0) - error = zfs_secpolicy_rename_perms(from, to, cr); - if (error != 0) - goto out; - - /* - * Cannot move snapshots out of the snapdir. - */ - if (sdip != tdip) { - error = SET_ERROR(EINVAL); - goto out; - } - - /* - * No-op when names are identical. - */ - if (strcmp(snm, tnm) == 0) { - error = 0; - goto out; - } - - rw_enter(&zfs_snapshot_lock, RW_WRITER); - - error = dsl_dataset_rename_snapshot(fsname, snm, tnm, B_FALSE); - if (error == 0) - (void) zfsctl_snapshot_rename(snm, tnm); - - rw_exit(&zfs_snapshot_lock); -out: - kmem_free(from, ZFS_MAX_DATASET_NAME_LEN); - kmem_free(to, ZFS_MAX_DATASET_NAME_LEN); - kmem_free(real, ZFS_MAX_DATASET_NAME_LEN); - kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN); - - ZFS_EXIT(zfsvfs); - - return (error); -} - -/* - * Removing a directory under '.zfs/snapshot' will automatically trigger - * the removal of the snapshot with the given name. 
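Together with zfsctl_snapdir_remove() and zfsctl_snapdir_mkdir() below, this lets ordinary directory system calls administer snapshots once the zfs_admin_snapshot module parameter is enabled. An illustrative session (the paths are hypothetical):

/*
 * $ echo 1 > /sys/module/zfs/parameters/zfs_admin_snapshot
 * $ mkdir /rpool/home/.zfs/snapshot/backup
 * $ mv /rpool/home/.zfs/snapshot/backup /rpool/home/.zfs/snapshot/old
 * $ rmdir /rpool/home/.zfs/snapshot/old
 */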
- */ -int -zfsctl_snapdir_remove(struct inode *dip, char *name, cred_t *cr, int flags) -{ - zfsvfs_t *zfsvfs = ITOZSB(dip); - char *snapname, *real; - int error; - - if (!zfs_admin_snapshot) - return (SET_ERROR(EACCES)); - - ZFS_ENTER(zfsvfs); - - snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); - real = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); - - if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { - error = dmu_snapshot_realname(zfsvfs->z_os, name, real, - ZFS_MAX_DATASET_NAME_LEN, NULL); - if (error == 0) { - name = real; - } else if (error != ENOTSUP) { - goto out; - } - } - - error = zfsctl_snapshot_name(ITOZSB(dip), name, - ZFS_MAX_DATASET_NAME_LEN, snapname); - if (error == 0) - error = zfs_secpolicy_destroy_perms(snapname, cr); - if (error != 0) - goto out; - - error = zfsctl_snapshot_unmount(snapname, MNT_FORCE); - if ((error == 0) || (error == ENOENT)) - error = dsl_destroy_snapshot(snapname, B_FALSE); -out: - kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN); - kmem_free(real, ZFS_MAX_DATASET_NAME_LEN); - - ZFS_EXIT(zfsvfs); - - return (error); -} - -/* - * Creating a directory under '.zfs/snapshot' will automatically trigger - * the creation of a new snapshot with the given name. - */ -int -zfsctl_snapdir_mkdir(struct inode *dip, char *dirname, vattr_t *vap, - struct inode **ipp, cred_t *cr, int flags) -{ - zfsvfs_t *zfsvfs = ITOZSB(dip); - char *dsname; - int error; - - if (!zfs_admin_snapshot) - return (SET_ERROR(EACCES)); - - dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); - - if (zfs_component_namecheck(dirname, NULL, NULL) != 0) { - error = SET_ERROR(EILSEQ); - goto out; - } - - dmu_objset_name(zfsvfs->z_os, dsname); - - error = zfs_secpolicy_snapshot_perms(dsname, cr); - if (error != 0) - goto out; - - if (error == 0) { - error = dmu_objset_snapshot_one(dsname, dirname); - if (error != 0) - goto out; - - error = zfsctl_snapdir_lookup(dip, dirname, ipp, - 0, cr, NULL, NULL); - } -out: - kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN); - - return (error); -} - -/* - * Attempt to unmount a snapshot by making a call to user space. - * There is no assurance that this can or will succeed, is just a - * best effort. In the case where it does fail, perhaps because - * it's in use, the unmount will fail harmlessly. - */ -int -zfsctl_snapshot_unmount(char *snapname, int flags) -{ - char *argv[] = { "/usr/bin/env", "umount", "-t", "zfs", "-n", NULL, - NULL }; - char *envp[] = { NULL }; - zfs_snapentry_t *se; - int error; - - rw_enter(&zfs_snapshot_lock, RW_READER); - if ((se = zfsctl_snapshot_find_by_name(snapname)) == NULL) { - rw_exit(&zfs_snapshot_lock); - return (SET_ERROR(ENOENT)); - } - rw_exit(&zfs_snapshot_lock); - - if (flags & MNT_FORCE) - argv[4] = "-fn"; - argv[5] = se->se_path; - dprintf("unmount; path=%s\n", se->se_path); - error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); - zfsctl_snapshot_rele(se); - - - /* - * The umount system utility will return 256 on error. We must - * assume this error is because the file system is busy so it is - * converted to the more sensible EBUSY. 
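The 256 mentioned above is simply umount(8) exiting with status 1: with UMH_WAIT_PROC the helper's exit status is packed wait(2)-style into the upper byte of the return value. Decoding it, as a sketch:

/* helper exited with status 1  ->  error == (1 << 8) == 256 */
int exitcode = (error >> 8) & 0xff;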
- */ - if (error) - error = SET_ERROR(EBUSY); - - return (error); -} - -int -zfsctl_snapshot_mount(struct path *path, int flags) -{ - struct dentry *dentry = path->dentry; - struct inode *ip = dentry->d_inode; - zfsvfs_t *zfsvfs; - zfsvfs_t *snap_zfsvfs; - zfs_snapentry_t *se; - char *full_name, *full_path; - char *argv[] = { "/usr/bin/env", "mount", "-t", "zfs", "-n", NULL, NULL, - NULL }; - char *envp[] = { NULL }; - int error; - struct path spath; - - if (ip == NULL) - return (SET_ERROR(EISDIR)); - - zfsvfs = ITOZSB(ip); - ZFS_ENTER(zfsvfs); - - full_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); - full_path = kmem_zalloc(MAXPATHLEN, KM_SLEEP); - - error = zfsctl_snapshot_name(zfsvfs, dname(dentry), - ZFS_MAX_DATASET_NAME_LEN, full_name); - if (error) - goto error; - - /* - * Construct a mount point path from sb of the ctldir inode and dirent - * name, instead of from d_path(), so that chroot'd process doesn't fail - * on mount.zfs(8). - */ - snprintf(full_path, MAXPATHLEN, "%s/.zfs/snapshot/%s", - zfsvfs->z_vfs->vfs_mntpoint, dname(dentry)); - - /* - * Multiple concurrent automounts of a snapshot are never allowed. - * The snapshot may be manually mounted as many times as desired. - */ - if (zfsctl_snapshot_ismounted(full_name)) { - error = 0; - goto error; - } - - /* - * Attempt to mount the snapshot from user space. Normally this - * would be done using the vfs_kern_mount() function, however that - * function is marked GPL-only and cannot be used. On error we - * careful to log the real error to the console and return EISDIR - * to safely abort the automount. This should be very rare. - * - * If the user mode helper happens to return EBUSY, a concurrent - * mount is already in progress in which case the error is ignored. - * Take note that if the program was executed successfully the return - * value from call_usermodehelper() will be (exitcode << 8 + signal). - */ - dprintf("mount; name=%s path=%s\n", full_name, full_path); - argv[5] = full_name; - argv[6] = full_path; - error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); - if (error) { - if (!(error & MOUNT_BUSY << 8)) { - zfs_dbgmsg("Unable to automount %s error=%d", - full_path, error); - error = SET_ERROR(EISDIR); - } else { - /* - * EBUSY, this could mean a concurrent mount, or the - * snapshot has already been mounted at completely - * different place. We return 0 so VFS will retry. For - * the latter case the VFS will retry several times - * and return ELOOP, which is probably not a very good - * behavior. - */ - error = 0; - } - goto error; - } - - /* - * Follow down in to the mounted snapshot and set MNT_SHRINKABLE - * to identify this as an automounted filesystem. 
- */ - spath = *path; - path_get(&spath); - if (zpl_follow_down_one(&spath)) { - snap_zfsvfs = ITOZSB(spath.dentry->d_inode); - snap_zfsvfs->z_parent = zfsvfs; - dentry = spath.dentry; - spath.mnt->mnt_flags |= MNT_SHRINKABLE; - - rw_enter(&zfs_snapshot_lock, RW_WRITER); - se = zfsctl_snapshot_alloc(full_name, full_path, - snap_zfsvfs->z_os->os_spa, dmu_objset_id(snap_zfsvfs->z_os), - dentry); - zfsctl_snapshot_add(se); - zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot); - rw_exit(&zfs_snapshot_lock); - } - path_put(&spath); -error: - kmem_free(full_name, ZFS_MAX_DATASET_NAME_LEN); - kmem_free(full_path, MAXPATHLEN); - - ZFS_EXIT(zfsvfs); - - return (error); -} - -/* - * Get the snapdir inode from fid - */ -int -zfsctl_snapdir_vget(struct super_block *sb, uint64_t objsetid, int gen, - struct inode **ipp) -{ - int error; - struct path path; - char *mnt; - struct dentry *dentry; - - mnt = kmem_alloc(MAXPATHLEN, KM_SLEEP); - - error = zfsctl_snapshot_path_objset(sb->s_fs_info, objsetid, - MAXPATHLEN, mnt); - if (error) - goto out; - - /* Trigger automount */ - error = -kern_path(mnt, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &path); - if (error) - goto out; - - path_put(&path); - /* - * Get the snapdir inode. Note, we don't want to use the above - * path because it contains the root of the snapshot rather - * than the snapdir. - */ - *ipp = ilookup(sb, ZFSCTL_INO_SNAPDIRS - objsetid); - if (*ipp == NULL) { - error = SET_ERROR(ENOENT); - goto out; - } - - /* check gen, see zfsctl_snapdir_fid */ - dentry = d_obtain_alias(igrab(*ipp)); - if (gen != (!IS_ERR(dentry) && d_mountpoint(dentry))) { - iput(*ipp); - *ipp = NULL; - error = SET_ERROR(ENOENT); - } - if (!IS_ERR(dentry)) - dput(dentry); -out: - kmem_free(mnt, MAXPATHLEN); - return (error); -} - -int -zfsctl_shares_lookup(struct inode *dip, char *name, struct inode **ipp, - int flags, cred_t *cr, int *direntflags, pathname_t *realpnp) -{ - zfsvfs_t *zfsvfs = ITOZSB(dip); - struct inode *ip; - znode_t *dzp; - int error; - - ZFS_ENTER(zfsvfs); - - if (zfsvfs->z_shares_dir == 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(ENOTSUP)); - } - - if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) { - error = zfs_lookup(ZTOI(dzp), name, &ip, 0, cr, NULL, NULL); - iput(ZTOI(dzp)); - } - - ZFS_EXIT(zfsvfs); - - return (error); -} - -/* - * Initialize the various pieces we'll need to create and manipulate .zfs - * directories. Currently this is unused but available. - */ -void -zfsctl_init(void) -{ - avl_create(&zfs_snapshots_by_name, snapentry_compare_by_name, - sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, - se_node_name)); - avl_create(&zfs_snapshots_by_objsetid, snapentry_compare_by_objsetid, - sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, - se_node_objsetid)); - rw_init(&zfs_snapshot_lock, NULL, RW_DEFAULT, NULL); -} - -/* - * Cleanup the various pieces we needed for .zfs directories. In particular - * ensure the expiry timer is canceled safely. 
- */ -void -zfsctl_fini(void) -{ - avl_destroy(&zfs_snapshots_by_name); - avl_destroy(&zfs_snapshots_by_objsetid); - rw_destroy(&zfs_snapshot_lock); -} - -module_param(zfs_admin_snapshot, int, 0644); -MODULE_PARM_DESC(zfs_admin_snapshot, "Enable mkdir/rmdir/mv in .zfs/snapshot"); - -module_param(zfs_expire_snapshot, int, 0644); -MODULE_PARM_DESC(zfs_expire_snapshot, "Seconds to expire .zfs/snapshot"); diff --git a/module/zfs/zfs_debug.c b/module/zfs/zfs_debug.c deleted file mode 100644 index 538533d27..000000000 --- a/module/zfs/zfs_debug.c +++ /dev/null @@ -1,253 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. - */ - -#include <sys/zfs_context.h> - -typedef struct zfs_dbgmsg { - procfs_list_node_t zdm_node; - time_t zdm_timestamp; - int zdm_size; - char zdm_msg[1]; /* variable length allocation */ -} zfs_dbgmsg_t; - -procfs_list_t zfs_dbgmsgs; -int zfs_dbgmsg_size = 0; -int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */ - -/* - * Internal ZFS debug messages are enabled by default. - * - * # Print debug messages - * cat /proc/spl/kstat/zfs/dbgmsg - * - * # Disable the kernel debug message log. - * echo 0 > /sys/module/zfs/parameters/zfs_dbgmsg_enable - * - * # Clear the kernel debug message log. 
- * echo 0 >/proc/spl/kstat/zfs/dbgmsg - */ -int zfs_dbgmsg_enable = 1; - -static int -zfs_dbgmsg_show_header(struct seq_file *f) -{ - seq_printf(f, "%-12s %-8s\n", "timestamp", "message"); - return (0); -} - -static int -zfs_dbgmsg_show(struct seq_file *f, void *p) -{ - zfs_dbgmsg_t *zdm = (zfs_dbgmsg_t *)p; - seq_printf(f, "%-12llu %-s\n", - (u_longlong_t)zdm->zdm_timestamp, zdm->zdm_msg); - return (0); -} - -static void -zfs_dbgmsg_purge(int max_size) -{ - while (zfs_dbgmsg_size > max_size) { - zfs_dbgmsg_t *zdm = list_remove_head(&zfs_dbgmsgs.pl_list); - if (zdm == NULL) - return; - - int size = zdm->zdm_size; - kmem_free(zdm, size); - zfs_dbgmsg_size -= size; - } -} - -static int -zfs_dbgmsg_clear(procfs_list_t *procfs_list) -{ - mutex_enter(&zfs_dbgmsgs.pl_lock); - zfs_dbgmsg_purge(0); - mutex_exit(&zfs_dbgmsgs.pl_lock); - return (0); -} - -void -zfs_dbgmsg_init(void) -{ - procfs_list_install("zfs", - "dbgmsg", - 0600, - &zfs_dbgmsgs, - zfs_dbgmsg_show, - zfs_dbgmsg_show_header, - zfs_dbgmsg_clear, - offsetof(zfs_dbgmsg_t, zdm_node)); -} - -void -zfs_dbgmsg_fini(void) -{ - procfs_list_uninstall(&zfs_dbgmsgs); - zfs_dbgmsg_purge(0); - - /* - * TODO - decide how to make this permanent - */ -#ifdef _KERNEL - procfs_list_destroy(&zfs_dbgmsgs); -#endif -} - -void -__set_error(const char *file, const char *func, int line, int err) -{ - /* - * To enable this: - * - * $ echo 512 >/sys/module/zfs/parameters/zfs_flags - */ - if (zfs_flags & ZFS_DEBUG_SET_ERROR) - __dprintf(B_FALSE, file, func, line, "error %lu", err); -} - -void -__zfs_dbgmsg(char *buf) -{ - int size = sizeof (zfs_dbgmsg_t) + strlen(buf); - zfs_dbgmsg_t *zdm = kmem_zalloc(size, KM_SLEEP); - zdm->zdm_size = size; - zdm->zdm_timestamp = gethrestime_sec(); - strcpy(zdm->zdm_msg, buf); - - mutex_enter(&zfs_dbgmsgs.pl_lock); - procfs_list_add(&zfs_dbgmsgs, zdm); - zfs_dbgmsg_size += size; - zfs_dbgmsg_purge(MAX(zfs_dbgmsg_maxsize, 0)); - mutex_exit(&zfs_dbgmsgs.pl_lock); -} - -#ifdef _KERNEL - -void -__dprintf(boolean_t dprint, const char *file, const char *func, - int line, const char *fmt, ...) -{ - const char *newfile; - va_list adx; - size_t size; - char *buf; - char *nl; - int i; - char *prefix = (dprint) ? "dprintf: " : ""; - - size = 1024; - buf = kmem_alloc(size, KM_SLEEP); - - /* - * Get rid of annoying prefix to filename. - */ - newfile = strrchr(file, '/'); - if (newfile != NULL) { - newfile = newfile + 1; /* Get rid of leading / */ - } else { - newfile = file; - } - - i = snprintf(buf, size, "%s%s:%d:%s(): ", prefix, newfile, line, func); - - if (i < size) { - va_start(adx, fmt); - (void) vsnprintf(buf + i, size - i, fmt, adx); - va_end(adx); - } - - /* - * Get rid of trailing newline for dprintf logs. - */ - if (dprint && buf[0] != '\0') { - nl = &buf[strlen(buf) - 1]; - if (*nl == '\n') - *nl = '\0'; - } - - /* - * To get this data enable the zfs__dprintf trace point as shown: - * - * # Enable zfs__dprintf tracepoint, clear the tracepoint ring buffer - * $ echo 1 > /sys/kernel/debug/tracing/events/zfs/enable - * $ echo 0 > /sys/kernel/debug/tracing/trace - * - * # Dump the ring buffer. 
- * $ cat /sys/kernel/debug/tracing/trace - */ - DTRACE_PROBE1(zfs__dprintf, char *, buf); - - /* - * To get this data: - * - * $ cat /proc/spl/kstat/zfs/dbgmsg - * - * To clear the buffer: - * $ echo 0 > /proc/spl/kstat/zfs/dbgmsg - */ - __zfs_dbgmsg(buf); - - kmem_free(buf, size); -} - -#else - -void -zfs_dbgmsg_print(const char *tag) -{ - ssize_t ret __attribute__((unused)); - - /* - * We use write() in this function instead of printf() - * so it is safe to call from a signal handler. - */ - ret = write(STDOUT_FILENO, "ZFS_DBGMSG(", 11); - ret = write(STDOUT_FILENO, tag, strlen(tag)); - ret = write(STDOUT_FILENO, ") START:\n", 9); - - mutex_enter(&zfs_dbgmsgs.pl_lock); - for (zfs_dbgmsg_t *zdm = list_head(&zfs_dbgmsgs.pl_list); zdm != NULL; - zdm = list_next(&zfs_dbgmsgs.pl_list, zdm)) { - ret = write(STDOUT_FILENO, zdm->zdm_msg, - strlen(zdm->zdm_msg)); - ret = write(STDOUT_FILENO, "\n", 1); - } - - ret = write(STDOUT_FILENO, "ZFS_DBGMSG(", 11); - ret = write(STDOUT_FILENO, tag, strlen(tag)); - ret = write(STDOUT_FILENO, ") END\n", 6); - - mutex_exit(&zfs_dbgmsgs.pl_lock); -} -#endif /* _KERNEL */ - -#ifdef _KERNEL -module_param(zfs_dbgmsg_enable, int, 0644); -MODULE_PARM_DESC(zfs_dbgmsg_enable, "Enable ZFS debug message log"); - -module_param(zfs_dbgmsg_maxsize, int, 0644); -MODULE_PARM_DESC(zfs_dbgmsg_maxsize, "Maximum ZFS debug log size"); -#endif diff --git a/module/zfs/zfs_dir.c b/module/zfs/zfs_dir.c deleted file mode 100644 index 6bdad737c..000000000 --- a/module/zfs/zfs_dir.c +++ /dev/null @@ -1,1205 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2016 by Delphix. All rights reserved. - * Copyright 2017 Nexenta Systems, Inc. - */ - -#include <sys/types.h> -#include <sys/param.h> -#include <sys/time.h> -#include <sys/sysmacros.h> -#include <sys/vfs.h> -#include <sys/vnode.h> -#include <sys/file.h> -#include <sys/mode.h> -#include <sys/kmem.h> -#include <sys/uio.h> -#include <sys/pathname.h> -#include <sys/cmn_err.h> -#include <sys/errno.h> -#include <sys/stat.h> -#include <sys/sunddi.h> -#include <sys/random.h> -#include <sys/policy.h> -#include <sys/zfs_dir.h> -#include <sys/zfs_acl.h> -#include <sys/zfs_vnops.h> -#include <sys/fs/zfs.h> -#include <sys/zap.h> -#include <sys/dmu.h> -#include <sys/atomic.h> -#include <sys/zfs_ctldir.h> -#include <sys/zfs_fuid.h> -#include <sys/sa.h> -#include <sys/zfs_sa.h> - -/* - * zfs_match_find() is used by zfs_dirent_lock() to perform zap lookups - * of names after deciding which is the appropriate lookup interface. 
- */ -static int -zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, char *name, matchtype_t mt, - boolean_t update, int *deflags, pathname_t *rpnp, uint64_t *zoid) -{ - boolean_t conflict = B_FALSE; - int error; - - if (zfsvfs->z_norm) { - size_t bufsz = 0; - char *buf = NULL; - - if (rpnp) { - buf = rpnp->pn_buf; - bufsz = rpnp->pn_bufsize; - } - - /* - * In the non-mixed case we only expect there would ever - * be one match, but we need to use the normalizing lookup. - */ - error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1, - zoid, mt, buf, bufsz, &conflict); - } else { - error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid); - } - - /* - * Allow multiple entries provided the first entry is - * the object id. Non-zpl consumers may safely make - * use of the additional space. - * - * XXX: This should be a feature flag for compatibility - */ - if (error == EOVERFLOW) - error = 0; - - if (zfsvfs->z_norm && !error && deflags) - *deflags = conflict ? ED_CASE_CONFLICT : 0; - - *zoid = ZFS_DIRENT_OBJ(*zoid); - - return (error); -} - -/* - * Lock a directory entry. A dirlock on <dzp, name> protects that name - * in dzp's directory zap object. As long as you hold a dirlock, you can - * assume two things: (1) dzp cannot be reaped, and (2) no other thread - * can change the zap entry for (i.e. link or unlink) this name. - * - * Input arguments: - * dzp - znode for directory - * name - name of entry to lock - * flag - ZNEW: if the entry already exists, fail with EEXIST. - * ZEXISTS: if the entry does not exist, fail with ENOENT. - * ZSHARED: allow concurrent access with other ZSHARED callers. - * ZXATTR: we want dzp's xattr directory - * ZCILOOK: On a mixed sensitivity file system, - * this lookup should be case-insensitive. - * ZCIEXACT: On a purely case-insensitive file system, - * this lookup should be case-sensitive. - * ZRENAMING: we are locking for renaming, force narrow locks - * ZHAVELOCK: Don't grab the z_name_lock for this call. The - * current thread already holds it. - * - * Output arguments: - * zpp - pointer to the znode for the entry (NULL if there isn't one) - * dlpp - pointer to the dirlock for this entry (NULL on error) - * direntflags - (case-insensitive lookup only) - * flags if multiple case-sensitive matches exist in directory - * realpnp - (case-insensitive lookup only) - * actual name matched within the directory - * - * Return value: 0 on success or errno on failure. - * - * NOTE: Always checks for, and rejects, '.' and '..'. - * NOTE: For case-insensitive file systems we take wide locks (see below), - * but return znode pointers to a single match. - */ -int -zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, - int flag, int *direntflags, pathname_t *realpnp) -{ - zfsvfs_t *zfsvfs = ZTOZSB(dzp); - zfs_dirlock_t *dl; - boolean_t update; - matchtype_t mt = 0; - uint64_t zoid; - int error = 0; - int cmpflags; - - *zpp = NULL; - *dlpp = NULL; - - /* - * Verify that we are not trying to lock '.', '..', or '.zfs' - */ - if ((name[0] == '.' && - (name[1] == '\0' || (name[1] == '.' && name[2] == '\0'))) || - (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0)) - return (SET_ERROR(EEXIST)); - - /* - * Case sensitivity and normalization preferences are set when - * the file system is created. These are stored in the - * zfsvfs->z_case and zfsvfs->z_norm fields. These choices - * affect what vnodes can be cached in the DNLC, how we - * perform zap lookups, and the "width" of our dirlocks. - * - * A normal dirlock locks a single name. 
Note that with - * normalization a name can be composed multiple ways, but - * when normalized, these names all compare equal. A wide - * dirlock locks multiple names. We need these when the file - * system is supporting mixed-mode access. It is sometimes - * necessary to lock all case permutations of file name at - * once so that simultaneous case-insensitive/case-sensitive - * behaves as rationally as possible. - */ - - /* - * When matching we may need to normalize & change case according to - * FS settings. - * - * Note that a normalized match is necessary for a case insensitive - * filesystem when the lookup request is not exact because normalization - * can fold case independent of normalizing code point sequences. - * - * See the table above zfs_dropname(). - */ - if (zfsvfs->z_norm != 0) { - mt = MT_NORMALIZE; - - /* - * Determine if the match needs to honor the case specified in - * lookup, and if so keep track of that so that during - * normalization we don't fold case. - */ - if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE && - (flag & ZCIEXACT)) || - (zfsvfs->z_case == ZFS_CASE_MIXED && !(flag & ZCILOOK))) { - mt |= MT_MATCH_CASE; - } - } - - /* - * Only look in or update the DNLC if we are looking for the - * name on a file system that does not require normalization - * or case folding. We can also look there if we happen to be - * on a non-normalizing, mixed sensitivity file system IF we - * are looking for the exact name. - * - * Maybe can add TO-UPPERed version of name to dnlc in ci-only - * case for performance improvement? - */ - update = !zfsvfs->z_norm || - (zfsvfs->z_case == ZFS_CASE_MIXED && - !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK)); - - /* - * ZRENAMING indicates we are in a situation where we should - * take narrow locks regardless of the file system's - * preferences for normalizing and case folding. This will - * prevent us deadlocking trying to grab the same wide lock - * twice if the two names happen to be case-insensitive - * matches. - */ - if (flag & ZRENAMING) - cmpflags = 0; - else - cmpflags = zfsvfs->z_norm; - - /* - * Wait until there are no locks on this name. - * - * Don't grab the lock if it is already held. However, cannot - * have both ZSHARED and ZHAVELOCK together. - */ - ASSERT(!(flag & ZSHARED) || !(flag & ZHAVELOCK)); - if (!(flag & ZHAVELOCK)) - rw_enter(&dzp->z_name_lock, RW_READER); - - mutex_enter(&dzp->z_lock); - for (;;) { - if (dzp->z_unlinked && !(flag & ZXATTR)) { - mutex_exit(&dzp->z_lock); - if (!(flag & ZHAVELOCK)) - rw_exit(&dzp->z_name_lock); - return (SET_ERROR(ENOENT)); - } - for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) { - if ((u8_strcmp(name, dl->dl_name, 0, cmpflags, - U8_UNICODE_LATEST, &error) == 0) || error != 0) - break; - } - if (error != 0) { - mutex_exit(&dzp->z_lock); - if (!(flag & ZHAVELOCK)) - rw_exit(&dzp->z_name_lock); - return (SET_ERROR(ENOENT)); - } - if (dl == NULL) { - /* - * Allocate a new dirlock and add it to the list. - */ - dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP); - cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL); - dl->dl_name = name; - dl->dl_sharecnt = 0; - dl->dl_namelock = 0; - dl->dl_namesize = 0; - dl->dl_dzp = dzp; - dl->dl_next = dzp->z_dirlocks; - dzp->z_dirlocks = dl; - break; - } - if ((flag & ZSHARED) && dl->dl_sharecnt != 0) - break; - cv_wait(&dl->dl_cv, &dzp->z_lock); - } - - /* - * If the z_name_lock was NOT held for this dirlock record it. 
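The wide-lock machinery above matters because, with normalization enabled, byte-distinct names can compare equal. A sketch using the same u8_strcmp() call as the dirlock scan, with hypothetical byte strings for the two Unicode encodings of "é":

const char *nfd = "e\xcc\x81";	/* 'e' + U+0301 combining acute */
const char *nfc = "\xc3\xa9";	/* precomposed U+00E9 */
int error = 0;

/* compares equal when cmpflags carries the filesystem's z_norm flags */
if (u8_strcmp(nfd, nfc, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, &error) == 0) {
	/* both spellings must fall under a single wide dirlock */
}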
- */ - if (flag & ZHAVELOCK) - dl->dl_namelock = 1; - - if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) { - /* - * We're the second shared reference to dl. Make a copy of - * dl_name in case the first thread goes away before we do. - * Note that we initialize the new name before storing its - * pointer into dl_name, because the first thread may load - * dl->dl_name at any time. It'll either see the old value, - * which belongs to it, or the new shared copy; either is OK. - */ - dl->dl_namesize = strlen(dl->dl_name) + 1; - name = kmem_alloc(dl->dl_namesize, KM_SLEEP); - bcopy(dl->dl_name, name, dl->dl_namesize); - dl->dl_name = name; - } - - mutex_exit(&dzp->z_lock); - - /* - * We have a dirlock on the name. (Note that it is the dirlock, - * not the dzp's z_lock, that protects the name in the zap object.) - * See if there's an object by this name; if so, put a hold on it. - */ - if (flag & ZXATTR) { - error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid, - sizeof (zoid)); - if (error == 0) - error = (zoid == 0 ? SET_ERROR(ENOENT) : 0); - } else { - error = zfs_match_find(zfsvfs, dzp, name, mt, - update, direntflags, realpnp, &zoid); - } - if (error) { - if (error != ENOENT || (flag & ZEXISTS)) { - zfs_dirent_unlock(dl); - return (error); - } - } else { - if (flag & ZNEW) { - zfs_dirent_unlock(dl); - return (SET_ERROR(EEXIST)); - } - error = zfs_zget(zfsvfs, zoid, zpp); - if (error) { - zfs_dirent_unlock(dl); - return (error); - } - } - - *dlpp = dl; - - return (0); -} - -/* - * Unlock this directory entry and wake anyone who was waiting for it. - */ -void -zfs_dirent_unlock(zfs_dirlock_t *dl) -{ - znode_t *dzp = dl->dl_dzp; - zfs_dirlock_t **prev_dl, *cur_dl; - - mutex_enter(&dzp->z_lock); - - if (!dl->dl_namelock) - rw_exit(&dzp->z_name_lock); - - if (dl->dl_sharecnt > 1) { - dl->dl_sharecnt--; - mutex_exit(&dzp->z_lock); - return; - } - prev_dl = &dzp->z_dirlocks; - while ((cur_dl = *prev_dl) != dl) - prev_dl = &cur_dl->dl_next; - *prev_dl = dl->dl_next; - cv_broadcast(&dl->dl_cv); - mutex_exit(&dzp->z_lock); - - if (dl->dl_namesize != 0) - kmem_free(dl->dl_name, dl->dl_namesize); - cv_destroy(&dl->dl_cv); - kmem_free(dl, sizeof (*dl)); -} - -/* - * Look up an entry in a directory. - * - * NOTE: '.' and '..' are handled as special cases because - * no directory entries are actually stored for them. If this is - * the root of a filesystem, then '.zfs' is also treated as a - * special pseudo-directory. - */ -int -zfs_dirlook(znode_t *dzp, char *name, struct inode **ipp, int flags, - int *deflg, pathname_t *rpnp) -{ - zfs_dirlock_t *dl; - znode_t *zp; - int error = 0; - uint64_t parent; - - if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { - *ipp = ZTOI(dzp); - igrab(*ipp); - } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { - zfsvfs_t *zfsvfs = ZTOZSB(dzp); - - /* - * If we are a snapshot mounted under .zfs, return - * the inode pointer for the snapshot directory. 
- */ - if ((error = sa_lookup(dzp->z_sa_hdl, - SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) - return (error); - - if (parent == dzp->z_id && zfsvfs->z_parent != zfsvfs) { - error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir, - "snapshot", ipp, 0, kcred, NULL, NULL); - return (error); - } - rw_enter(&dzp->z_parent_lock, RW_READER); - error = zfs_zget(zfsvfs, parent, &zp); - if (error == 0) - *ipp = ZTOI(zp); - rw_exit(&dzp->z_parent_lock); - } else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) { - *ipp = zfsctl_root(dzp); - } else { - int zf; - - zf = ZEXISTS | ZSHARED; - if (flags & FIGNORECASE) - zf |= ZCILOOK; - - error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp); - if (error == 0) { - *ipp = ZTOI(zp); - zfs_dirent_unlock(dl); - dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */ - } - rpnp = NULL; - } - - if ((flags & FIGNORECASE) && rpnp && !error) - (void) strlcpy(rpnp->pn_buf, name, rpnp->pn_bufsize); - - return (error); -} - -/* - * unlinked Set (formerly known as the "delete queue") Error Handling - * - * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we - * don't specify the name of the entry that we will be manipulating. We - * also fib and say that we won't be adding any new entries to the - * unlinked set, even though we might (this is to lower the minimum file - * size that can be deleted in a full filesystem). So on the small - * chance that the nlink list is using a fat zap (ie. has more than - * 2000 entries), we *may* not pre-read a block that's needed. - * Therefore it is remotely possible for some of the assertions - * regarding the unlinked set below to fail due to i/o error. On a - * nondebug system, this will result in the space being leaked. - */ -void -zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx) -{ - zfsvfs_t *zfsvfs = ZTOZSB(zp); - - ASSERT(zp->z_unlinked); - ASSERT(ZTOI(zp)->i_nlink == 0); - - VERIFY3U(0, ==, - zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); - - dataset_kstats_update_nunlinks_kstat(&zfsvfs->z_kstat, 1); -} - -/* - * Clean up any znodes that had no links when we either crashed or - * (force) umounted the file system. - */ -static void -zfs_unlinked_drain_task(void *arg) -{ - zfsvfs_t *zfsvfs = arg; - zap_cursor_t zc; - zap_attribute_t zap; - dmu_object_info_t doi; - znode_t *zp; - int error; - - ASSERT3B(zfsvfs->z_draining, ==, B_TRUE); - - /* - * Iterate over the contents of the unlinked set. - */ - for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj); - zap_cursor_retrieve(&zc, &zap) == 0 && !zfsvfs->z_drain_cancel; - zap_cursor_advance(&zc)) { - - /* - * See what kind of object we have in list - */ - - error = dmu_object_info(zfsvfs->z_os, - zap.za_first_integer, &doi); - if (error != 0) - continue; - - ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) || - (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS)); - /* - * We need to re-mark these list entries for deletion, - * so we pull them back into core and set zp->z_unlinked. - */ - error = zfs_zget(zfsvfs, zap.za_first_integer, &zp); - - /* - * We may pick up znodes that are already marked for deletion. - * This could happen during the purge of an extended attribute - * directory. All we need to do is skip over them, since they - * are already in the system marked z_unlinked. - */ - if (error != 0) - continue; - - zp->z_unlinked = B_TRUE; - - /* - * iput() is Linux's equivalent to illumos' VN_RELE(). It will - * decrement the inode's ref count and may cause the inode to be - * synchronously freed. 
We interrupt freeing of this inode by
- * checking the return value of dmu_objset_zfs_unmounting() in
- * dmu_free_long_range(), when an unmount is requested.
- */
-        iput(ZTOI(zp));
-        ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
-    }
-    zap_cursor_fini(&zc);
-
-    zfsvfs->z_draining = B_FALSE;
-    zfsvfs->z_drain_task = TASKQID_INVALID;
-}
-
-/*
- * Sets z_draining, then tries to dispatch the async unlinked drain.
- * If that fails, it executes the unlinked drain synchronously.
- */
-void
-zfs_unlinked_drain(zfsvfs_t *zfsvfs)
-{
-    ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
-    ASSERT3B(zfsvfs->z_draining, ==, B_FALSE);
-
-    zfsvfs->z_draining = B_TRUE;
-    zfsvfs->z_drain_cancel = B_FALSE;
-
-    zfsvfs->z_drain_task = taskq_dispatch(
-        dsl_pool_unlinked_drain_taskq(dmu_objset_pool(zfsvfs->z_os)),
-        zfs_unlinked_drain_task, zfsvfs, TQ_SLEEP);
-    if (zfsvfs->z_drain_task == TASKQID_INVALID) {
-        zfs_dbgmsg("async zfs_unlinked_drain dispatch failed");
-        zfs_unlinked_drain_task(zfsvfs);
-    }
-}
-
-/*
- * Wait for the unlinked drain taskq task to stop. This will interrupt the
- * unlinked set processing if it is in progress.
- */
-void
-zfs_unlinked_drain_stop_wait(zfsvfs_t *zfsvfs)
-{
-    ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
-
-    if (zfsvfs->z_draining) {
-        zfsvfs->z_drain_cancel = B_TRUE;
-        taskq_cancel_id(dsl_pool_unlinked_drain_taskq(
-            dmu_objset_pool(zfsvfs->z_os)), zfsvfs->z_drain_task);
-        zfsvfs->z_drain_task = TASKQID_INVALID;
-        zfsvfs->z_draining = B_FALSE;
-    }
-}
-
-/*
- * Delete the entire contents of a directory. Return a count
- * of the number of entries that could not be deleted. If we encounter
- * an error, return a count of at least one so that the directory stays
- * in the unlinked set.
- *
- * NOTE: this function assumes that the directory is inactive,
- * so there is no need to lock its entries before deletion.
- * Also, it assumes the directory's contents are *only* regular
- * files.
- */
-static int
-zfs_purgedir(znode_t *dzp)
-{
-    zap_cursor_t zc;
-    zap_attribute_t zap;
-    znode_t *xzp;
-    dmu_tx_t *tx;
-    zfsvfs_t *zfsvfs = ZTOZSB(dzp);
-    zfs_dirlock_t dl;
-    int skipped = 0;
-    int error;
-
-    for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);
-        (error = zap_cursor_retrieve(&zc, &zap)) == 0;
-        zap_cursor_advance(&zc)) {
-        error = zfs_zget(zfsvfs,
-            ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp);
-        if (error) {
-            skipped += 1;
-            continue;
-        }
-
-        ASSERT(S_ISREG(ZTOI(xzp)->i_mode) ||
-            S_ISLNK(ZTOI(xzp)->i_mode));
-
-        tx = dmu_tx_create(zfsvfs->z_os);
-        dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
-        dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name);
-        dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
-        dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
-        /* Is this really needed?
*/ - zfs_sa_upgrade_txholds(tx, xzp); - dmu_tx_mark_netfree(tx); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - zfs_iput_async(ZTOI(xzp)); - skipped += 1; - continue; - } - bzero(&dl, sizeof (dl)); - dl.dl_dzp = dzp; - dl.dl_name = zap.za_name; - - error = zfs_link_destroy(&dl, xzp, tx, 0, NULL); - if (error) - skipped += 1; - dmu_tx_commit(tx); - - zfs_iput_async(ZTOI(xzp)); - } - zap_cursor_fini(&zc); - if (error != ENOENT) - skipped += 1; - return (skipped); -} - -void -zfs_rmnode(znode_t *zp) -{ - zfsvfs_t *zfsvfs = ZTOZSB(zp); - objset_t *os = zfsvfs->z_os; - znode_t *xzp = NULL; - dmu_tx_t *tx; - uint64_t acl_obj; - uint64_t xattr_obj; - uint64_t links; - int error; - - ASSERT(ZTOI(zp)->i_nlink == 0); - ASSERT(atomic_read(&ZTOI(zp)->i_count) == 0); - - /* - * If this is an attribute directory, purge its contents. - */ - if (S_ISDIR(ZTOI(zp)->i_mode) && (zp->z_pflags & ZFS_XATTR)) { - if (zfs_purgedir(zp) != 0) { - /* - * Not enough space to delete some xattrs. - * Leave it in the unlinked set. - */ - zfs_znode_dmu_fini(zp); - - return; - } - } - - /* - * Free up all the data in the file. We don't do this for directories - * because we need truncate and remove to be in the same tx, like in - * zfs_znode_delete(). Otherwise, if we crash here we'll end up with - * an inconsistent truncated zap object in the delete queue. Note a - * truncated file is harmless since it only contains user data. - */ - if (S_ISREG(ZTOI(zp)->i_mode)) { - error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END); - if (error) { - /* - * Not enough space or we were interrupted by unmount. - * Leave the file in the unlinked set. - */ - zfs_znode_dmu_fini(zp); - return; - } - } - - /* - * If the file has extended attributes, we're going to unlink - * the xattr dir. - */ - error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), - &xattr_obj, sizeof (xattr_obj)); - if (error == 0 && xattr_obj) { - error = zfs_zget(zfsvfs, xattr_obj, &xzp); - ASSERT(error == 0); - } - - acl_obj = zfs_external_acl(zp); - - /* - * Set up the final transaction. - */ - tx = dmu_tx_create(os); - dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); - dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - if (xzp) { - dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL); - dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); - } - if (acl_obj) - dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); - - zfs_sa_upgrade_txholds(tx, zp); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - /* - * Not enough space to delete the file. Leave it in the - * unlinked set, leaking it until the fs is remounted (at - * which point we'll call zfs_unlinked_drain() to process it). 
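- *
- * Roughly: the znode keeps its entry in z_unlinkedobj, we drop our DMU
- * state below, and a later zfs_unlinked_drain() ->
- * zfs_unlinked_drain_task() pass re-reads the set; its iput() of each
- * entry drives the znode back through zfs_rmnode() to retry.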
- */ - dmu_tx_abort(tx); - zfs_znode_dmu_fini(zp); - goto out; - } - - if (xzp) { - ASSERT(error == 0); - mutex_enter(&xzp->z_lock); - xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */ - clear_nlink(ZTOI(xzp)); /* no more links to it */ - links = 0; - VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), - &links, sizeof (links), tx)); - mutex_exit(&xzp->z_lock); - zfs_unlinked_add(xzp, tx); - } - - /* Remove this znode from the unlinked set */ - VERIFY3U(0, ==, - zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); - - dataset_kstats_update_nunlinked_kstat(&zfsvfs->z_kstat, 1); - - zfs_znode_delete(zp, tx); - - dmu_tx_commit(tx); -out: - if (xzp) - zfs_iput_async(ZTOI(xzp)); -} - -static uint64_t -zfs_dirent(znode_t *zp, uint64_t mode) -{ - uint64_t de = zp->z_id; - - if (ZTOZSB(zp)->z_version >= ZPL_VERSION_DIRENT_TYPE) - de |= IFTODT(mode) << 60; - return (de); -} - -/* - * Link zp into dl. Can fail in the following cases : - * - if zp has been unlinked. - * - if the number of entries with the same hash (aka. colliding entries) - * exceed the capacity of a leaf-block of fatzap and splitting of the - * leaf-block does not help. - */ -int -zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag) -{ - znode_t *dzp = dl->dl_dzp; - zfsvfs_t *zfsvfs = ZTOZSB(zp); - uint64_t value; - int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode); - sa_bulk_attr_t bulk[5]; - uint64_t mtime[2], ctime[2]; - uint64_t links; - int count = 0; - int error; - - mutex_enter(&zp->z_lock); - - if (!(flag & ZRENAMING)) { - if (zp->z_unlinked) { /* no new links to unlinked zp */ - ASSERT(!(flag & (ZNEW | ZEXISTS))); - mutex_exit(&zp->z_lock); - return (SET_ERROR(ENOENT)); - } - if (!(flag & ZNEW)) { - /* - * ZNEW nodes come from zfs_mknode() where the link - * count has already been initialised - */ - inc_nlink(ZTOI(zp)); - links = ZTOI(zp)->i_nlink; - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), - NULL, &links, sizeof (links)); - } - } - - value = zfs_dirent(zp, zp->z_mode); - error = zap_add(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name, 8, 1, - &value, tx); - - /* - * zap_add could fail to add the entry if it exceeds the capacity of the - * leaf-block and zap_leaf_split() failed to help. - * The caller of this routine is responsible for failing the transaction - * which will rollback the SA updates done above. 
- */ - if (error != 0) { - if (!(flag & ZRENAMING) && !(flag & ZNEW)) - drop_nlink(ZTOI(zp)); - mutex_exit(&zp->z_lock); - return (error); - } - - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, - &dzp->z_id, sizeof (dzp->z_id)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, - &zp->z_pflags, sizeof (zp->z_pflags)); - - if (!(flag & ZNEW)) { - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, - ctime, sizeof (ctime)); - zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, - ctime); - } - error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); - ASSERT(error == 0); - - mutex_exit(&zp->z_lock); - - mutex_enter(&dzp->z_lock); - dzp->z_size++; - if (zp_is_dir) - inc_nlink(ZTOI(dzp)); - links = ZTOI(dzp)->i_nlink; - count = 0; - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, - &dzp->z_size, sizeof (dzp->z_size)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, - &links, sizeof (links)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, - mtime, sizeof (mtime)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, - ctime, sizeof (ctime)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, - &dzp->z_pflags, sizeof (dzp->z_pflags)); - zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime); - error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); - ASSERT(error == 0); - mutex_exit(&dzp->z_lock); - - return (0); -} - -/* - * The match type in the code for this function should conform to: - * - * ------------------------------------------------------------------------ - * fs type | z_norm | lookup type | match type - * ---------|-------------|-------------|---------------------------------- - * CS !norm | 0 | 0 | 0 (exact) - * CS norm | formX | 0 | MT_NORMALIZE - * CI !norm | upper | !ZCIEXACT | MT_NORMALIZE - * CI !norm | upper | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE - * CI norm | upper|formX | !ZCIEXACT | MT_NORMALIZE - * CI norm | upper|formX | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE - * CM !norm | upper | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE - * CM !norm | upper | ZCILOOK | MT_NORMALIZE - * CM norm | upper|formX | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE - * CM norm | upper|formX | ZCILOOK | MT_NORMALIZE - * - * Abbreviations: - * CS = Case Sensitive, CI = Case Insensitive, CM = Case Mixed - * upper = case folding set by fs type on creation (U8_TEXTPREP_TOUPPER) - * formX = unicode normalization form set on fs creation - */ -static int -zfs_dropname(zfs_dirlock_t *dl, znode_t *zp, znode_t *dzp, dmu_tx_t *tx, - int flag) -{ - int error; - - if (ZTOZSB(zp)->z_norm) { - matchtype_t mt = MT_NORMALIZE; - - if ((ZTOZSB(zp)->z_case == ZFS_CASE_INSENSITIVE && - (flag & ZCIEXACT)) || - (ZTOZSB(zp)->z_case == ZFS_CASE_MIXED && - !(flag & ZCILOOK))) { - mt |= MT_MATCH_CASE; - } - - error = zap_remove_norm(ZTOZSB(zp)->z_os, dzp->z_id, - dl->dl_name, mt, tx); - } else { - error = zap_remove(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name, - tx); - } - - return (error); -} - -/* - * Unlink zp from dl, and mark zp for deletion if this was the last link. Can - * fail if zp is a mount point (EBUSY) or a non-empty directory (ENOTEMPTY). - * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list. - * If it's non-NULL, we use it to indicate whether the znode needs deletion, - * and it's the caller's job to do it. 
- */
-int
-zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
-    boolean_t *unlinkedp)
-{
-    znode_t *dzp = dl->dl_dzp;
-    zfsvfs_t *zfsvfs = ZTOZSB(dzp);
-    int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode);
-    boolean_t unlinked = B_FALSE;
-    sa_bulk_attr_t bulk[5];
-    uint64_t mtime[2], ctime[2];
-    uint64_t links;
-    int count = 0;
-    int error;
-
-    if (!(flag & ZRENAMING)) {
-        mutex_enter(&zp->z_lock);
-
-        if (zp_is_dir && !zfs_dirempty(zp)) {
-            mutex_exit(&zp->z_lock);
-            return (SET_ERROR(ENOTEMPTY));
-        }
-
-        /*
-         * If we get here, we are going to try to remove the object.
-         * First try removing the name from the directory; if that
-         * fails, return the error.
-         */
-        error = zfs_dropname(dl, zp, dzp, tx, flag);
-        if (error != 0) {
-            mutex_exit(&zp->z_lock);
-            return (error);
-        }
-
-        if (ZTOI(zp)->i_nlink <= zp_is_dir) {
-            zfs_panic_recover("zfs: link count on %lu is %u, "
-                "should be at least %u", zp->z_id,
-                (int)ZTOI(zp)->i_nlink, zp_is_dir + 1);
-            set_nlink(ZTOI(zp), zp_is_dir + 1);
-        }
-        drop_nlink(ZTOI(zp));
-        if (ZTOI(zp)->i_nlink == zp_is_dir) {
-            zp->z_unlinked = B_TRUE;
-            clear_nlink(ZTOI(zp));
-            unlinked = B_TRUE;
-        } else {
-            SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
-                NULL, &ctime, sizeof (ctime));
-            SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
-                NULL, &zp->z_pflags, sizeof (zp->z_pflags));
-            zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
-                ctime);
-        }
-        links = ZTOI(zp)->i_nlink;
-        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
-            NULL, &links, sizeof (links));
-        error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
-        count = 0;
-        ASSERT(error == 0);
-        mutex_exit(&zp->z_lock);
-    } else {
-        error = zfs_dropname(dl, zp, dzp, tx, flag);
-        if (error != 0)
-            return (error);
-    }
-
-    mutex_enter(&dzp->z_lock);
-    dzp->z_size--;        /* one dirent removed */
-    if (zp_is_dir)
-        drop_nlink(ZTOI(dzp));    /* ".." link from zp */
-    links = ZTOI(dzp)->i_nlink;
-    SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
-        NULL, &links, sizeof (links));
-    SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
-        NULL, &dzp->z_size, sizeof (dzp->z_size));
-    SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
-        NULL, ctime, sizeof (ctime));
-    SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
-        NULL, mtime, sizeof (mtime));
-    SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
-        NULL, &dzp->z_pflags, sizeof (dzp->z_pflags));
-    zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
-    error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
-    ASSERT(error == 0);
-    mutex_exit(&dzp->z_lock);
-
-    if (unlinkedp != NULL)
-        *unlinkedp = unlinked;
-    else if (unlinked)
-        zfs_unlinked_add(zp, tx);
-
-    return (0);
-}
-
-/*
- * Indicate whether the directory is empty. Works with or without z_lock
- * held, but can only be considered a hint in the latter case. Returns true
- * if only "." and ".." remain and there's no work in progress.
- *
- * The internal ZAP size, rather than zp->z_size, needs to be checked since
- * some consumers (Lustre) do not strictly maintain an accurate SA_ZPL_SIZE.
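- *
- * (A directory holding only "." and ".." has a ZAP entry count of zero,
- * since those two names are synthesized rather than stored as entries;
- * see the NOTE above zfs_dirlook().)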
- */ -boolean_t -zfs_dirempty(znode_t *dzp) -{ - zfsvfs_t *zfsvfs = ZTOZSB(dzp); - uint64_t count; - int error; - - if (dzp->z_dirlocks != NULL) - return (B_FALSE); - - error = zap_count(zfsvfs->z_os, dzp->z_id, &count); - if (error != 0 || count != 0) - return (B_FALSE); - - return (B_TRUE); -} - -int -zfs_make_xattrdir(znode_t *zp, vattr_t *vap, struct inode **xipp, cred_t *cr) -{ - zfsvfs_t *zfsvfs = ZTOZSB(zp); - znode_t *xzp; - dmu_tx_t *tx; - int error; - zfs_acl_ids_t acl_ids; - boolean_t fuid_dirtied; -#ifdef DEBUG - uint64_t parent; -#endif - - *xipp = NULL; - - if ((error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr))) - return (error); - - if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL, - &acl_ids)) != 0) - return (error); - if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zp->z_projid)) { - zfs_acl_ids_free(&acl_ids); - return (SET_ERROR(EDQUOT)); - } - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + - ZFS_SA_BASE_ATTR_SIZE); - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); - fuid_dirtied = zfsvfs->z_fuid_dirty; - if (fuid_dirtied) - zfs_fuid_txhold(zfsvfs, tx); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - zfs_acl_ids_free(&acl_ids); - dmu_tx_abort(tx); - return (error); - } - zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids); - - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); - -#ifdef DEBUG - error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), - &parent, sizeof (parent)); - ASSERT(error == 0 && parent == zp->z_id); -#endif - - VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id, - sizeof (xzp->z_id), tx)); - - if (!zp->z_unlinked) - (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, - xzp, "", NULL, acl_ids.z_fuidp, vap); - - zfs_acl_ids_free(&acl_ids); - dmu_tx_commit(tx); - - *xipp = ZTOI(xzp); - - return (0); -} - -/* - * Return a znode for the extended attribute directory for zp. - * ** If the directory does not already exist, it is created ** - * - * IN: zp - znode to obtain attribute directory from - * cr - credentials of caller - * flags - flags from the VOP_LOOKUP call - * - * OUT: xipp - pointer to extended attribute znode - * - * RETURN: 0 on success - * error number on failure - */ -int -zfs_get_xattrdir(znode_t *zp, struct inode **xipp, cred_t *cr, int flags) -{ - zfsvfs_t *zfsvfs = ZTOZSB(zp); - znode_t *xzp; - zfs_dirlock_t *dl; - vattr_t va; - int error; -top: - error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR, NULL, NULL); - if (error) - return (error); - - if (xzp != NULL) { - *xipp = ZTOI(xzp); - zfs_dirent_unlock(dl); - return (0); - } - - if (!(flags & CREATE_XATTR_DIR)) { - zfs_dirent_unlock(dl); - return (SET_ERROR(ENOENT)); - } - - if (zfs_is_readonly(zfsvfs)) { - zfs_dirent_unlock(dl); - return (SET_ERROR(EROFS)); - } - - /* - * The ability to 'create' files in an attribute - * directory comes from the write_xattr permission on the base file. - * - * The ability to 'search' an attribute directory requires - * read_xattr permission on the base file. - * - * Once in a directory the ability to read/write attributes - * is controlled by the permissions on the attribute file. 
- */ - va.va_mask = ATTR_MODE | ATTR_UID | ATTR_GID; - va.va_mode = S_IFDIR | S_ISVTX | 0777; - zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid); - - va.va_dentry = NULL; - error = zfs_make_xattrdir(zp, &va, xipp, cr); - zfs_dirent_unlock(dl); - - if (error == ERESTART) { - /* NB: we already did dmu_tx_wait() if necessary */ - goto top; - } - - return (error); -} - -/* - * Decide whether it is okay to remove within a sticky directory. - * - * In sticky directories, write access is not sufficient; - * you can remove entries from a directory only if: - * - * you own the directory, - * you own the entry, - * you have write access to the entry, - * or you are privileged (checked in secpolicy...). - * - * The function returns 0 if remove access is granted. - */ -int -zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) -{ - uid_t uid; - uid_t downer; - uid_t fowner; - zfsvfs_t *zfsvfs = ZTOZSB(zdp); - - if (zfsvfs->z_replay) - return (0); - - if ((zdp->z_mode & S_ISVTX) == 0) - return (0); - - downer = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(ZTOI(zdp)->i_uid), - cr, ZFS_OWNER); - fowner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(ZTOI(zp)->i_uid), - cr, ZFS_OWNER); - - if ((uid = crgetuid(cr)) == downer || uid == fowner || - zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0) - return (0); - else - return (secpolicy_vnode_remove(cr)); -} diff --git a/module/zfs/zfs_sysfs.c b/module/zfs/zfs_sysfs.c deleted file mode 100644 index bb7f3b69a..000000000 --- a/module/zfs/zfs_sysfs.c +++ /dev/null @@ -1,661 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2018, 2019 by Delphix. All rights reserved. - */ - -#include <sys/types.h> -#include <sys/param.h> -#include <sys/zfeature.h> -#include <sys/zfs_ioctl.h> -#include <sys/zfs_sysfs.h> -#include <sys/kmem.h> -#include <sys/fs/zfs.h> -#include <linux/kobject.h> - -#include "zfs_prop.h" - -#if !defined(_KERNEL) -#error kernel builds only -#endif - -/* - * ZFS Module sysfs support - * - * This extends our sysfs '/sys/module/zfs' entry to include feature - * and property attributes. The primary consumer of this information - * is user processes, like the zfs CLI, that need to know what the - * current loaded ZFS module supports. The libzfs binary will consult - * this information when instantiating the zfs|zpool property tables - * and the pool features table. 
- * - * The added top-level directories are: - * /sys/module/zfs - * ├── features.kernel - * ├── features.pool - * ├── properties.dataset - * └── properties.pool - * - * The local interface for the zfs kobjects includes: - * zfs_kobj_init() - * zfs_kobj_add() - * zfs_kobj_release() - * zfs_kobj_add_attr() - * zfs_kobj_fini() - */ - -/* - * A zfs_mod_kobj_t represents a zfs kobject under '/sys/module/zfs' - */ -struct zfs_mod_kobj; -typedef struct zfs_mod_kobj zfs_mod_kobj_t; - -struct zfs_mod_kobj { - struct kobject zko_kobj; - struct kobj_type zko_kobj_type; - struct sysfs_ops zko_sysfs_ops; - size_t zko_attr_count; - struct attribute *zko_attr_list; /* allocated */ - struct attribute **zko_default_attrs; /* allocated */ - size_t zko_child_count; - zfs_mod_kobj_t *zko_children; /* allocated */ -}; - -#define ATTR_TABLE_SIZE(cnt) (sizeof (struct attribute) * (cnt)) -/* Note +1 for NULL terminator slot */ -#define DEFAULT_ATTR_SIZE(cnt) (sizeof (struct attribute *) * (cnt + 1)) -#define CHILD_TABLE_SIZE(cnt) (sizeof (zfs_mod_kobj_t) * (cnt)) - -/* - * These are the top-level kobjects under '/sys/module/zfs/' - */ -static zfs_mod_kobj_t kernel_features_kobj; -static zfs_mod_kobj_t pool_features_kobj; -static zfs_mod_kobj_t dataset_props_kobj; -static zfs_mod_kobj_t pool_props_kobj; - -/* - * The show function is used to provide the content - * of an attribute into a PAGE_SIZE buffer. - */ -typedef ssize_t (*sysfs_show_func)(struct kobject *, struct attribute *, - char *); - -static void -zfs_kobj_fini(zfs_mod_kobj_t *zkobj) -{ - /* finalize any child kobjects */ - if (zkobj->zko_child_count != 0) { - ASSERT(zkobj->zko_children); - for (int i = 0; i < zkobj->zko_child_count; i++) - zfs_kobj_fini(&zkobj->zko_children[i]); - } - - /* kobject_put() will call zfs_kobj_release() to release memory */ - kobject_del(&zkobj->zko_kobj); - kobject_put(&zkobj->zko_kobj); -} - -static void -zfs_kobj_release(struct kobject *kobj) -{ - zfs_mod_kobj_t *zkobj = container_of(kobj, zfs_mod_kobj_t, zko_kobj); - - if (zkobj->zko_attr_list != NULL) { - ASSERT3S(zkobj->zko_attr_count, !=, 0); - kmem_free(zkobj->zko_attr_list, - ATTR_TABLE_SIZE(zkobj->zko_attr_count)); - zkobj->zko_attr_list = NULL; - } - - if (zkobj->zko_default_attrs != NULL) { - kmem_free(zkobj->zko_default_attrs, - DEFAULT_ATTR_SIZE(zkobj->zko_attr_count)); - zkobj->zko_default_attrs = NULL; - } - - if (zkobj->zko_child_count != 0) { - ASSERT(zkobj->zko_children); - - kmem_free(zkobj->zko_children, - CHILD_TABLE_SIZE(zkobj->zko_child_count)); - zkobj->zko_child_count = 0; - zkobj->zko_children = NULL; - } - - zkobj->zko_attr_count = 0; -} - -#ifndef sysfs_attr_init -#define sysfs_attr_init(attr) do {} while (0) -#endif - -static void -zfs_kobj_add_attr(zfs_mod_kobj_t *zkobj, int attr_num, const char *attr_name) -{ - VERIFY3U(attr_num, <, zkobj->zko_attr_count); - ASSERT(zkobj->zko_attr_list); - ASSERT(zkobj->zko_default_attrs); - - zkobj->zko_attr_list[attr_num].name = attr_name; - zkobj->zko_attr_list[attr_num].mode = 0444; - zkobj->zko_default_attrs[attr_num] = &zkobj->zko_attr_list[attr_num]; - sysfs_attr_init(&zkobj->zko_attr_list[attr_num]); -} - -static int -zfs_kobj_init(zfs_mod_kobj_t *zkobj, int attr_cnt, int child_cnt, - sysfs_show_func show_func) -{ - /* - * Initialize object's attributes. Count can be zero. 
- */ - if (attr_cnt > 0) { - zkobj->zko_attr_list = kmem_zalloc(ATTR_TABLE_SIZE(attr_cnt), - KM_SLEEP); - if (zkobj->zko_attr_list == NULL) - return (ENOMEM); - } - /* this will always have at least one slot for NULL termination */ - zkobj->zko_default_attrs = kmem_zalloc(DEFAULT_ATTR_SIZE(attr_cnt), - KM_SLEEP); - if (zkobj->zko_default_attrs == NULL) { - if (zkobj->zko_attr_list != NULL) { - kmem_free(zkobj->zko_attr_list, - ATTR_TABLE_SIZE(attr_cnt)); - } - return (ENOMEM); - } - zkobj->zko_attr_count = attr_cnt; - zkobj->zko_kobj_type.default_attrs = zkobj->zko_default_attrs; - - if (child_cnt > 0) { - zkobj->zko_children = kmem_zalloc(CHILD_TABLE_SIZE(child_cnt), - KM_SLEEP); - if (zkobj->zko_children == NULL) { - if (zkobj->zko_default_attrs != NULL) { - kmem_free(zkobj->zko_default_attrs, - DEFAULT_ATTR_SIZE(attr_cnt)); - } - if (zkobj->zko_attr_list != NULL) { - kmem_free(zkobj->zko_attr_list, - ATTR_TABLE_SIZE(attr_cnt)); - } - return (ENOMEM); - } - zkobj->zko_child_count = child_cnt; - } - - zkobj->zko_sysfs_ops.show = show_func; - zkobj->zko_kobj_type.sysfs_ops = &zkobj->zko_sysfs_ops; - zkobj->zko_kobj_type.release = zfs_kobj_release; - - return (0); -} - -static int -zfs_kobj_add(zfs_mod_kobj_t *zkobj, struct kobject *parent, const char *name) -{ - /* zko_default_attrs must be NULL terminated */ - ASSERT(zkobj->zko_default_attrs != NULL); - ASSERT(zkobj->zko_default_attrs[zkobj->zko_attr_count] == NULL); - - kobject_init(&zkobj->zko_kobj, &zkobj->zko_kobj_type); - return (kobject_add(&zkobj->zko_kobj, parent, name)); -} - -/* - * Each zfs property has these common attributes - */ -static const char *zprop_attrs[] = { - "type", - "readonly", - "setonce", - "visible", - "values", - "default", - "datasets" /* zfs properties only */ -}; - -#define ZFS_PROP_ATTR_COUNT ARRAY_SIZE(zprop_attrs) -#define ZPOOL_PROP_ATTR_COUNT (ZFS_PROP_ATTR_COUNT - 1) - -static const char *zprop_types[] = { - "number", - "string", - "index", -}; - -typedef struct zfs_type_map { - zfs_type_t ztm_type; - const char *ztm_name; -} zfs_type_map_t; - -static zfs_type_map_t type_map[] = { - {ZFS_TYPE_FILESYSTEM, "filesystem"}, - {ZFS_TYPE_SNAPSHOT, "snapshot"}, - {ZFS_TYPE_VOLUME, "volume"}, - {ZFS_TYPE_BOOKMARK, "bookmark"} -}; - -/* - * Show the content for a zfs property attribute - */ -static ssize_t -zprop_sysfs_show(const char *attr_name, const zprop_desc_t *property, - char *buf, size_t buflen) -{ - const char *show_str; - char number[32]; - - /* For dataset properties list the dataset types that apply */ - if (strcmp(attr_name, "datasets") == 0 && - property->pd_types != ZFS_TYPE_POOL) { - int len = 0; - - for (int i = 0; i < ARRAY_SIZE(type_map); i++) { - if (type_map[i].ztm_type & property->pd_types) { - len += snprintf(buf + len, buflen - len, "%s ", - type_map[i].ztm_name); - } - } - len += snprintf(buf + len, buflen - len, "\n"); - return (len); - } - - if (strcmp(attr_name, "type") == 0) { - show_str = zprop_types[property->pd_proptype]; - } else if (strcmp(attr_name, "readonly") == 0) { - show_str = property->pd_attr == PROP_READONLY ? "1" : "0"; - } else if (strcmp(attr_name, "setonce") == 0) { - show_str = property->pd_attr == PROP_ONETIME ? "1" : "0"; - } else if (strcmp(attr_name, "visible") == 0) { - show_str = property->pd_visible ? "1" : "0"; - } else if (strcmp(attr_name, "values") == 0) { - show_str = property->pd_values ? 
property->pd_values : ""; - } else if (strcmp(attr_name, "default") == 0) { - switch (property->pd_proptype) { - case PROP_TYPE_NUMBER: - (void) snprintf(number, sizeof (number), "%llu", - (u_longlong_t)property->pd_numdefault); - show_str = number; - break; - case PROP_TYPE_STRING: - show_str = property->pd_strdefault ? - property->pd_strdefault : ""; - break; - case PROP_TYPE_INDEX: - if (zprop_index_to_string(property->pd_propnum, - property->pd_numdefault, &show_str, - property->pd_types) != 0) { - show_str = ""; - } - break; - default: - return (0); - } - } else { - return (0); - } - - return (snprintf(buf, buflen, "%s\n", show_str)); -} - -static ssize_t -dataset_property_show(struct kobject *kobj, struct attribute *attr, char *buf) -{ - zfs_prop_t prop = zfs_name_to_prop(kobject_name(kobj)); - zprop_desc_t *prop_tbl = zfs_prop_get_table(); - ssize_t len; - - ASSERT3U(prop, <, ZFS_NUM_PROPS); - - len = zprop_sysfs_show(attr->name, &prop_tbl[prop], buf, PAGE_SIZE); - - return (len); -} - -static ssize_t -pool_property_show(struct kobject *kobj, struct attribute *attr, char *buf) -{ - zpool_prop_t prop = zpool_name_to_prop(kobject_name(kobj)); - zprop_desc_t *prop_tbl = zpool_prop_get_table(); - ssize_t len; - - ASSERT3U(prop, <, ZPOOL_NUM_PROPS); - - len = zprop_sysfs_show(attr->name, &prop_tbl[prop], buf, PAGE_SIZE); - - return (len); -} - -/* - * ZFS kernel feature attributes for '/sys/module/zfs/features.kernel' - * - * This list is intended for kernel features that don't have a pool feature - * association or that extend existing user kernel interfaces. - * - * A user processes can easily check if the running zfs kernel module - * supports the new feature. - */ -static const char *zfs_kernel_features[] = { - /* --> Add new kernel features here */ - "com.delphix:vdev_initialize", - "org.zfsonlinux:vdev_trim", -}; - -#define KERNEL_FEATURE_COUNT ARRAY_SIZE(zfs_kernel_features) - -static ssize_t -kernel_feature_show(struct kobject *kobj, struct attribute *attr, char *buf) -{ - if (strcmp(attr->name, "supported") == 0) - return (snprintf(buf, PAGE_SIZE, "yes\n")); - return (0); -} - -static void -kernel_feature_to_kobj(zfs_mod_kobj_t *parent, int slot, const char *name) -{ - zfs_mod_kobj_t *zfs_kobj = &parent->zko_children[slot]; - - ASSERT3U(slot, <, KERNEL_FEATURE_COUNT); - ASSERT(name); - - int err = zfs_kobj_init(zfs_kobj, 1, 0, kernel_feature_show); - if (err) - return; - - zfs_kobj_add_attr(zfs_kobj, 0, "supported"); - - err = zfs_kobj_add(zfs_kobj, &parent->zko_kobj, name); - if (err) - zfs_kobj_release(&zfs_kobj->zko_kobj); -} - -static int -zfs_kernel_features_init(zfs_mod_kobj_t *zfs_kobj, struct kobject *parent) -{ - /* - * Create a parent kobject to host kernel features. - * - * '/sys/module/zfs/features.kernel' - */ - int err = zfs_kobj_init(zfs_kobj, 0, KERNEL_FEATURE_COUNT, - kernel_feature_show); - if (err) - return (err); - err = zfs_kobj_add(zfs_kobj, parent, ZFS_SYSFS_KERNEL_FEATURES); - if (err) { - zfs_kobj_release(&zfs_kobj->zko_kobj); - return (err); - } - - /* - * Now create a kobject for each feature. 
- * - * '/sys/module/zfs/features.kernel/<feature>' - */ - for (int f = 0; f < KERNEL_FEATURE_COUNT; f++) - kernel_feature_to_kobj(zfs_kobj, f, zfs_kernel_features[f]); - - return (0); -} - -/* - * Each pool feature has these common attributes - */ -static const char *pool_feature_attrs[] = { - "description", - "guid", - "uname", - "readonly_compatible", - "required_for_mos", - "activate_on_enable", - "per_dataset" -}; - -#define ZPOOL_FEATURE_ATTR_COUNT ARRAY_SIZE(pool_feature_attrs) - -/* - * Show the content for the given zfs pool feature attribute - */ -static ssize_t -pool_feature_show(struct kobject *kobj, struct attribute *attr, char *buf) -{ - spa_feature_t fid; - - if (zfeature_lookup_guid(kobject_name(kobj), &fid) != 0) - return (0); - - ASSERT3U(fid, <, SPA_FEATURES); - - zfeature_flags_t flags = spa_feature_table[fid].fi_flags; - const char *show_str = NULL; - - if (strcmp(attr->name, "description") == 0) { - show_str = spa_feature_table[fid].fi_desc; - } else if (strcmp(attr->name, "guid") == 0) { - show_str = spa_feature_table[fid].fi_guid; - } else if (strcmp(attr->name, "uname") == 0) { - show_str = spa_feature_table[fid].fi_uname; - } else if (strcmp(attr->name, "readonly_compatible") == 0) { - show_str = flags & ZFEATURE_FLAG_READONLY_COMPAT ? "1" : "0"; - } else if (strcmp(attr->name, "required_for_mos") == 0) { - show_str = flags & ZFEATURE_FLAG_MOS ? "1" : "0"; - } else if (strcmp(attr->name, "activate_on_enable") == 0) { - show_str = flags & ZFEATURE_FLAG_ACTIVATE_ON_ENABLE ? "1" : "0"; - } else if (strcmp(attr->name, "per_dataset") == 0) { - show_str = flags & ZFEATURE_FLAG_PER_DATASET ? "1" : "0"; - } - if (show_str == NULL) - return (0); - - return (snprintf(buf, PAGE_SIZE, "%s\n", show_str)); -} - -static void -pool_feature_to_kobj(zfs_mod_kobj_t *parent, spa_feature_t fid, - const char *name) -{ - zfs_mod_kobj_t *zfs_kobj = &parent->zko_children[fid]; - - ASSERT3U(fid, <, SPA_FEATURES); - ASSERT(name); - - int err = zfs_kobj_init(zfs_kobj, ZPOOL_FEATURE_ATTR_COUNT, 0, - pool_feature_show); - if (err) - return; - - for (int i = 0; i < ZPOOL_FEATURE_ATTR_COUNT; i++) - zfs_kobj_add_attr(zfs_kobj, i, pool_feature_attrs[i]); - - err = zfs_kobj_add(zfs_kobj, &parent->zko_kobj, name); - if (err) - zfs_kobj_release(&zfs_kobj->zko_kobj); -} - -static int -zfs_pool_features_init(zfs_mod_kobj_t *zfs_kobj, struct kobject *parent) -{ - /* - * Create a parent kobject to host pool features. - * - * '/sys/module/zfs/features.pool' - */ - int err = zfs_kobj_init(zfs_kobj, 0, SPA_FEATURES, pool_feature_show); - if (err) - return (err); - err = zfs_kobj_add(zfs_kobj, parent, ZFS_SYSFS_POOL_FEATURES); - if (err) { - zfs_kobj_release(&zfs_kobj->zko_kobj); - return (err); - } - - /* - * Now create a kobject for each feature. 
- * - * '/sys/module/zfs/features.pool/<feature>' - */ - for (spa_feature_t i = 0; i < SPA_FEATURES; i++) - pool_feature_to_kobj(zfs_kobj, i, spa_feature_table[i].fi_guid); - - return (0); -} - -typedef struct prop_to_kobj_arg { - zprop_desc_t *p2k_table; - zfs_mod_kobj_t *p2k_parent; - sysfs_show_func p2k_show_func; - int p2k_attr_count; -} prop_to_kobj_arg_t; - -static int -zprop_to_kobj(int prop, void *args) -{ - prop_to_kobj_arg_t *data = args; - zfs_mod_kobj_t *parent = data->p2k_parent; - zfs_mod_kobj_t *zfs_kobj = &parent->zko_children[prop]; - const char *name = data->p2k_table[prop].pd_name; - int err; - - ASSERT(name); - - err = zfs_kobj_init(zfs_kobj, data->p2k_attr_count, 0, - data->p2k_show_func); - if (err) - return (ZPROP_CONT); - - for (int i = 0; i < data->p2k_attr_count; i++) - zfs_kobj_add_attr(zfs_kobj, i, zprop_attrs[i]); - - err = zfs_kobj_add(zfs_kobj, &parent->zko_kobj, name); - if (err) - zfs_kobj_release(&zfs_kobj->zko_kobj); - - return (ZPROP_CONT); -} - -static int -zfs_sysfs_properties_init(zfs_mod_kobj_t *zfs_kobj, struct kobject *parent, - zfs_type_t type) -{ - prop_to_kobj_arg_t context; - const char *name; - int err; - - /* - * Create a parent kobject to host properties. - * - * '/sys/module/zfs/properties.<type>' - */ - if (type == ZFS_TYPE_POOL) { - name = ZFS_SYSFS_POOL_PROPERTIES; - context.p2k_table = zpool_prop_get_table(); - context.p2k_attr_count = ZPOOL_PROP_ATTR_COUNT; - context.p2k_parent = zfs_kobj; - context.p2k_show_func = pool_property_show; - err = zfs_kobj_init(zfs_kobj, 0, ZPOOL_NUM_PROPS, - pool_property_show); - } else { - name = ZFS_SYSFS_DATASET_PROPERTIES; - context.p2k_table = zfs_prop_get_table(); - context.p2k_attr_count = ZFS_PROP_ATTR_COUNT; - context.p2k_parent = zfs_kobj; - context.p2k_show_func = dataset_property_show; - err = zfs_kobj_init(zfs_kobj, 0, ZFS_NUM_PROPS, - dataset_property_show); - } - - if (err) - return (err); - - err = zfs_kobj_add(zfs_kobj, parent, name); - if (err) { - zfs_kobj_release(&zfs_kobj->zko_kobj); - return (err); - } - - /* - * Create a kobject for each property. 
- * - * '/sys/module/zfs/properties.<type>/<property>' - */ - (void) zprop_iter_common(zprop_to_kobj, &context, B_TRUE, - B_FALSE, type); - - return (err); -} - -void -zfs_sysfs_init(void) -{ - struct kobject *parent; -#if defined(CONFIG_ZFS) && !defined(CONFIG_ZFS_MODULE) - parent = kobject_create_and_add("zfs", fs_kobj); -#else - parent = &(((struct module *)(THIS_MODULE))->mkobj).kobj; -#endif - int err; - - if (parent == NULL) - return; - - err = zfs_kernel_features_init(&kernel_features_kobj, parent); - if (err) - return; - - err = zfs_pool_features_init(&pool_features_kobj, parent); - if (err) { - zfs_kobj_fini(&kernel_features_kobj); - return; - } - - err = zfs_sysfs_properties_init(&pool_props_kobj, parent, - ZFS_TYPE_POOL); - if (err) { - zfs_kobj_fini(&kernel_features_kobj); - zfs_kobj_fini(&pool_features_kobj); - return; - } - - err = zfs_sysfs_properties_init(&dataset_props_kobj, parent, - ZFS_TYPE_FILESYSTEM); - if (err) { - zfs_kobj_fini(&kernel_features_kobj); - zfs_kobj_fini(&pool_features_kobj); - zfs_kobj_fini(&pool_props_kobj); - return; - } -} - -void -zfs_sysfs_fini(void) -{ - /* - * Remove top-level kobjects; each will remove any children kobjects - */ - zfs_kobj_fini(&kernel_features_kobj); - zfs_kobj_fini(&pool_features_kobj); - zfs_kobj_fini(&dataset_props_kobj); - zfs_kobj_fini(&pool_props_kobj); -} diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c deleted file mode 100644 index 0914e4b7d..000000000 --- a/module/zfs/zfs_vfsops.c +++ /dev/null @@ -1,2562 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. 
- */ - -/* Portions Copyright 2010 Robert Milkowski */ - -#include <sys/types.h> -#include <sys/param.h> -#include <sys/sysmacros.h> -#include <sys/kmem.h> -#include <sys/pathname.h> -#include <sys/vnode.h> -#include <sys/vfs.h> -#include <sys/mntent.h> -#include <sys/cmn_err.h> -#include <sys/zfs_znode.h> -#include <sys/zfs_vnops.h> -#include <sys/zfs_dir.h> -#include <sys/zil.h> -#include <sys/fs/zfs.h> -#include <sys/dmu.h> -#include <sys/dsl_prop.h> -#include <sys/dsl_dataset.h> -#include <sys/dsl_deleg.h> -#include <sys/spa.h> -#include <sys/zap.h> -#include <sys/sa.h> -#include <sys/sa_impl.h> -#include <sys/policy.h> -#include <sys/atomic.h> -#include <sys/zfs_ioctl.h> -#include <sys/zfs_ctldir.h> -#include <sys/zfs_fuid.h> -#include <sys/sunddi.h> -#include <sys/dmu_objset.h> -#include <sys/spa_boot.h> -#include <sys/objlist.h> -#include <sys/zpl.h> -#include <linux/vfs_compat.h> -#include "zfs_comutil.h" - -enum { - TOKEN_RO, - TOKEN_RW, - TOKEN_SETUID, - TOKEN_NOSETUID, - TOKEN_EXEC, - TOKEN_NOEXEC, - TOKEN_DEVICES, - TOKEN_NODEVICES, - TOKEN_DIRXATTR, - TOKEN_SAXATTR, - TOKEN_XATTR, - TOKEN_NOXATTR, - TOKEN_ATIME, - TOKEN_NOATIME, - TOKEN_RELATIME, - TOKEN_NORELATIME, - TOKEN_NBMAND, - TOKEN_NONBMAND, - TOKEN_MNTPOINT, - TOKEN_LAST, -}; - -static const match_table_t zpl_tokens = { - { TOKEN_RO, MNTOPT_RO }, - { TOKEN_RW, MNTOPT_RW }, - { TOKEN_SETUID, MNTOPT_SETUID }, - { TOKEN_NOSETUID, MNTOPT_NOSETUID }, - { TOKEN_EXEC, MNTOPT_EXEC }, - { TOKEN_NOEXEC, MNTOPT_NOEXEC }, - { TOKEN_DEVICES, MNTOPT_DEVICES }, - { TOKEN_NODEVICES, MNTOPT_NODEVICES }, - { TOKEN_DIRXATTR, MNTOPT_DIRXATTR }, - { TOKEN_SAXATTR, MNTOPT_SAXATTR }, - { TOKEN_XATTR, MNTOPT_XATTR }, - { TOKEN_NOXATTR, MNTOPT_NOXATTR }, - { TOKEN_ATIME, MNTOPT_ATIME }, - { TOKEN_NOATIME, MNTOPT_NOATIME }, - { TOKEN_RELATIME, MNTOPT_RELATIME }, - { TOKEN_NORELATIME, MNTOPT_NORELATIME }, - { TOKEN_NBMAND, MNTOPT_NBMAND }, - { TOKEN_NONBMAND, MNTOPT_NONBMAND }, - { TOKEN_MNTPOINT, MNTOPT_MNTPOINT "=%s" }, - { TOKEN_LAST, NULL }, -}; - -static void -zfsvfs_vfs_free(vfs_t *vfsp) -{ - if (vfsp != NULL) { - if (vfsp->vfs_mntpoint != NULL) - strfree(vfsp->vfs_mntpoint); - - kmem_free(vfsp, sizeof (vfs_t)); - } -} - -static int -zfsvfs_parse_option(char *option, int token, substring_t *args, vfs_t *vfsp) -{ - switch (token) { - case TOKEN_RO: - vfsp->vfs_readonly = B_TRUE; - vfsp->vfs_do_readonly = B_TRUE; - break; - case TOKEN_RW: - vfsp->vfs_readonly = B_FALSE; - vfsp->vfs_do_readonly = B_TRUE; - break; - case TOKEN_SETUID: - vfsp->vfs_setuid = B_TRUE; - vfsp->vfs_do_setuid = B_TRUE; - break; - case TOKEN_NOSETUID: - vfsp->vfs_setuid = B_FALSE; - vfsp->vfs_do_setuid = B_TRUE; - break; - case TOKEN_EXEC: - vfsp->vfs_exec = B_TRUE; - vfsp->vfs_do_exec = B_TRUE; - break; - case TOKEN_NOEXEC: - vfsp->vfs_exec = B_FALSE; - vfsp->vfs_do_exec = B_TRUE; - break; - case TOKEN_DEVICES: - vfsp->vfs_devices = B_TRUE; - vfsp->vfs_do_devices = B_TRUE; - break; - case TOKEN_NODEVICES: - vfsp->vfs_devices = B_FALSE; - vfsp->vfs_do_devices = B_TRUE; - break; - case TOKEN_DIRXATTR: - vfsp->vfs_xattr = ZFS_XATTR_DIR; - vfsp->vfs_do_xattr = B_TRUE; - break; - case TOKEN_SAXATTR: - vfsp->vfs_xattr = ZFS_XATTR_SA; - vfsp->vfs_do_xattr = B_TRUE; - break; - case TOKEN_XATTR: - vfsp->vfs_xattr = ZFS_XATTR_DIR; - vfsp->vfs_do_xattr = B_TRUE; - break; - case TOKEN_NOXATTR: - vfsp->vfs_xattr = ZFS_XATTR_OFF; - vfsp->vfs_do_xattr = B_TRUE; - break; - case TOKEN_ATIME: - vfsp->vfs_atime = B_TRUE; - vfsp->vfs_do_atime = B_TRUE; - break; - case TOKEN_NOATIME: - 
vfsp->vfs_atime = B_FALSE; - vfsp->vfs_do_atime = B_TRUE; - break; - case TOKEN_RELATIME: - vfsp->vfs_relatime = B_TRUE; - vfsp->vfs_do_relatime = B_TRUE; - break; - case TOKEN_NORELATIME: - vfsp->vfs_relatime = B_FALSE; - vfsp->vfs_do_relatime = B_TRUE; - break; - case TOKEN_NBMAND: - vfsp->vfs_nbmand = B_TRUE; - vfsp->vfs_do_nbmand = B_TRUE; - break; - case TOKEN_NONBMAND: - vfsp->vfs_nbmand = B_FALSE; - vfsp->vfs_do_nbmand = B_TRUE; - break; - case TOKEN_MNTPOINT: - vfsp->vfs_mntpoint = match_strdup(&args[0]); - if (vfsp->vfs_mntpoint == NULL) - return (SET_ERROR(ENOMEM)); - - break; - default: - break; - } - - return (0); -} - -/* - * Parse the raw mntopts and return a vfs_t describing the options. - */ -static int -zfsvfs_parse_options(char *mntopts, vfs_t **vfsp) -{ - vfs_t *tmp_vfsp; - int error; - - tmp_vfsp = kmem_zalloc(sizeof (vfs_t), KM_SLEEP); - - if (mntopts != NULL) { - substring_t args[MAX_OPT_ARGS]; - char *tmp_mntopts, *p, *t; - int token; - - tmp_mntopts = t = strdup(mntopts); - if (tmp_mntopts == NULL) - return (SET_ERROR(ENOMEM)); - - while ((p = strsep(&t, ",")) != NULL) { - if (!*p) - continue; - - args[0].to = args[0].from = NULL; - token = match_token(p, zpl_tokens, args); - error = zfsvfs_parse_option(p, token, args, tmp_vfsp); - if (error) { - strfree(tmp_mntopts); - zfsvfs_vfs_free(tmp_vfsp); - return (error); - } - } - - strfree(tmp_mntopts); - } - - *vfsp = tmp_vfsp; - - return (0); -} - -boolean_t -zfs_is_readonly(zfsvfs_t *zfsvfs) -{ - return (!!(zfsvfs->z_sb->s_flags & SB_RDONLY)); -} - -/*ARGSUSED*/ -int -zfs_sync(struct super_block *sb, int wait, cred_t *cr) -{ - zfsvfs_t *zfsvfs = sb->s_fs_info; - - /* - * Semantically, the only requirement is that the sync be initiated. - * The DMU syncs out txgs frequently, so there's nothing to do. - */ - if (!wait) - return (0); - - if (zfsvfs != NULL) { - /* - * Sync a specific filesystem. - */ - dsl_pool_t *dp; - - ZFS_ENTER(zfsvfs); - dp = dmu_objset_pool(zfsvfs->z_os); - - /* - * If the system is shutting down, then skip any - * filesystems which may exist on a suspended pool. - */ - if (spa_suspended(dp->dp_spa)) { - ZFS_EXIT(zfsvfs); - return (0); - } - - if (zfsvfs->z_log != NULL) - zil_commit(zfsvfs->z_log, 0); - - ZFS_EXIT(zfsvfs); - } else { - /* - * Sync all ZFS filesystems. This is what happens when you - * run sync(1M). Unlike other filesystems, ZFS honors the - * request by waiting for all pools to commit all dirty data. - */ - spa_sync_allpools(); - } - - return (0); -} - -static void -atime_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - struct super_block *sb = zfsvfs->z_sb; - - if (sb == NULL) - return; - /* - * Update SB_NOATIME bit in VFS super block. Since atime update is - * determined by atime_needs_update(), atime_needs_update() needs to - * return false if atime is turned off, and not unconditionally return - * false if atime is turned on. 
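- *
- * I.e. property on  => clear SB_NOATIME and let the VFS apply its usual
- * atime policy; property off => set SB_NOATIME.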
- */ - if (newval) - sb->s_flags &= ~SB_NOATIME; - else - sb->s_flags |= SB_NOATIME; -} - -static void -relatime_changed_cb(void *arg, uint64_t newval) -{ - ((zfsvfs_t *)arg)->z_relatime = newval; -} - -static void -xattr_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - if (newval == ZFS_XATTR_OFF) { - zfsvfs->z_flags &= ~ZSB_XATTR; - } else { - zfsvfs->z_flags |= ZSB_XATTR; - - if (newval == ZFS_XATTR_SA) - zfsvfs->z_xattr_sa = B_TRUE; - else - zfsvfs->z_xattr_sa = B_FALSE; - } -} - -static void -acltype_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - switch (newval) { - case ZFS_ACLTYPE_OFF: - zfsvfs->z_acl_type = ZFS_ACLTYPE_OFF; - zfsvfs->z_sb->s_flags &= ~SB_POSIXACL; - break; - case ZFS_ACLTYPE_POSIXACL: -#ifdef CONFIG_FS_POSIX_ACL - zfsvfs->z_acl_type = ZFS_ACLTYPE_POSIXACL; - zfsvfs->z_sb->s_flags |= SB_POSIXACL; -#else - zfsvfs->z_acl_type = ZFS_ACLTYPE_OFF; - zfsvfs->z_sb->s_flags &= ~SB_POSIXACL; -#endif /* CONFIG_FS_POSIX_ACL */ - break; - default: - break; - } -} - -static void -blksz_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os))); - ASSERT3U(newval, >=, SPA_MINBLOCKSIZE); - ASSERT(ISP2(newval)); - - zfsvfs->z_max_blksz = newval; -} - -static void -readonly_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - struct super_block *sb = zfsvfs->z_sb; - - if (sb == NULL) - return; - - if (newval) - sb->s_flags |= SB_RDONLY; - else - sb->s_flags &= ~SB_RDONLY; -} - -static void -devices_changed_cb(void *arg, uint64_t newval) -{ -} - -static void -setuid_changed_cb(void *arg, uint64_t newval) -{ -} - -static void -exec_changed_cb(void *arg, uint64_t newval) -{ -} - -static void -nbmand_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - struct super_block *sb = zfsvfs->z_sb; - - if (sb == NULL) - return; - - if (newval == TRUE) - sb->s_flags |= SB_MANDLOCK; - else - sb->s_flags &= ~SB_MANDLOCK; -} - -static void -snapdir_changed_cb(void *arg, uint64_t newval) -{ - ((zfsvfs_t *)arg)->z_show_ctldir = newval; -} - -static void -vscan_changed_cb(void *arg, uint64_t newval) -{ - ((zfsvfs_t *)arg)->z_vscan = newval; -} - -static void -acl_inherit_changed_cb(void *arg, uint64_t newval) -{ - ((zfsvfs_t *)arg)->z_acl_inherit = newval; -} - -static int -zfs_register_callbacks(vfs_t *vfsp) -{ - struct dsl_dataset *ds = NULL; - objset_t *os = NULL; - zfsvfs_t *zfsvfs = NULL; - int error = 0; - - ASSERT(vfsp); - zfsvfs = vfsp->vfs_data; - ASSERT(zfsvfs); - os = zfsvfs->z_os; - - /* - * The act of registering our callbacks will destroy any mount - * options we may have. In order to enable temporary overrides - * of mount options, we stash away the current values and - * restore them after we register the callbacks. - */ - if (zfs_is_readonly(zfsvfs) || !spa_writeable(dmu_objset_spa(os))) { - vfsp->vfs_do_readonly = B_TRUE; - vfsp->vfs_readonly = B_TRUE; - } - - /* - * Register property callbacks. - * - * It would probably be fine to just check for i/o error from - * the first prop_register(), but I guess I like to go - * overboard... - */ - ds = dmu_objset_ds(os); - dsl_pool_config_enter(dmu_objset_pool(os), FTAG); - error = dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_RELATIME), relatime_changed_cb, zfsvfs); - error = error ? 
error : dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_ACLTYPE), acltype_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, - zfsvfs); - error = error ? error : dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_NBMAND), nbmand_changed_cb, zfsvfs); - dsl_pool_config_exit(dmu_objset_pool(os), FTAG); - if (error) - goto unregister; - - /* - * Invoke our callbacks to restore temporary mount options. - */ - if (vfsp->vfs_do_readonly) - readonly_changed_cb(zfsvfs, vfsp->vfs_readonly); - if (vfsp->vfs_do_setuid) - setuid_changed_cb(zfsvfs, vfsp->vfs_setuid); - if (vfsp->vfs_do_exec) - exec_changed_cb(zfsvfs, vfsp->vfs_exec); - if (vfsp->vfs_do_devices) - devices_changed_cb(zfsvfs, vfsp->vfs_devices); - if (vfsp->vfs_do_xattr) - xattr_changed_cb(zfsvfs, vfsp->vfs_xattr); - if (vfsp->vfs_do_atime) - atime_changed_cb(zfsvfs, vfsp->vfs_atime); - if (vfsp->vfs_do_relatime) - relatime_changed_cb(zfsvfs, vfsp->vfs_relatime); - if (vfsp->vfs_do_nbmand) - nbmand_changed_cb(zfsvfs, vfsp->vfs_nbmand); - - return (0); - -unregister: - dsl_prop_unregister_all(ds, zfsvfs); - return (error); -} - -static int -zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, - uint64_t *userp, uint64_t *groupp, uint64_t *projectp) -{ - sa_hdr_phys_t sa; - sa_hdr_phys_t *sap = data; - uint64_t flags; - int hdrsize; - boolean_t swap = B_FALSE; - - /* - * Is it a valid type of object to track? - */ - if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA) - return (SET_ERROR(ENOENT)); - - /* - * If we have a NULL data pointer - * then assume the id's aren't changing and - * return EEXIST to the dmu to let it know to - * use the same ids - */ - if (data == NULL) - return (SET_ERROR(EEXIST)); - - if (bonustype == DMU_OT_ZNODE) { - znode_phys_t *znp = data; - *userp = znp->zp_uid; - *groupp = znp->zp_gid; - *projectp = ZFS_DEFAULT_PROJID; - return (0); - } - - if (sap->sa_magic == 0) { - /* - * This should only happen for newly created files - * that haven't had the znode data filled in yet. 
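- * Charge them to user/group 0 and the default project for now; the
- * accounting is presumably corrected once the SA header is written.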
- */ - *userp = 0; - *groupp = 0; - *projectp = ZFS_DEFAULT_PROJID; - return (0); - } - - sa = *sap; - if (sa.sa_magic == BSWAP_32(SA_MAGIC)) { - sa.sa_magic = SA_MAGIC; - sa.sa_layout_info = BSWAP_16(sa.sa_layout_info); - swap = B_TRUE; - } else { - VERIFY3U(sa.sa_magic, ==, SA_MAGIC); - } - - hdrsize = sa_hdrsize(&sa); - VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t)); - - *userp = *((uint64_t *)((uintptr_t)data + hdrsize + SA_UID_OFFSET)); - *groupp = *((uint64_t *)((uintptr_t)data + hdrsize + SA_GID_OFFSET)); - flags = *((uint64_t *)((uintptr_t)data + hdrsize + SA_FLAGS_OFFSET)); - if (swap) - flags = BSWAP_64(flags); - - if (flags & ZFS_PROJID) - *projectp = *((uint64_t *)((uintptr_t)data + hdrsize + - SA_PROJID_OFFSET)); - else - *projectp = ZFS_DEFAULT_PROJID; - - if (swap) { - *userp = BSWAP_64(*userp); - *groupp = BSWAP_64(*groupp); - *projectp = BSWAP_64(*projectp); - } - return (0); -} - -static void -fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr, - char *domainbuf, int buflen, uid_t *ridp) -{ - uint64_t fuid; - const char *domain; - - fuid = zfs_strtonum(fuidstr, NULL); - - domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid)); - if (domain) - (void) strlcpy(domainbuf, domain, buflen); - else - domainbuf[0] = '\0'; - *ridp = FUID_RID(fuid); -} - -static uint64_t -zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type) -{ - switch (type) { - case ZFS_PROP_USERUSED: - case ZFS_PROP_USEROBJUSED: - return (DMU_USERUSED_OBJECT); - case ZFS_PROP_GROUPUSED: - case ZFS_PROP_GROUPOBJUSED: - return (DMU_GROUPUSED_OBJECT); - case ZFS_PROP_PROJECTUSED: - case ZFS_PROP_PROJECTOBJUSED: - return (DMU_PROJECTUSED_OBJECT); - case ZFS_PROP_USERQUOTA: - return (zfsvfs->z_userquota_obj); - case ZFS_PROP_GROUPQUOTA: - return (zfsvfs->z_groupquota_obj); - case ZFS_PROP_USEROBJQUOTA: - return (zfsvfs->z_userobjquota_obj); - case ZFS_PROP_GROUPOBJQUOTA: - return (zfsvfs->z_groupobjquota_obj); - case ZFS_PROP_PROJECTQUOTA: - return (zfsvfs->z_projectquota_obj); - case ZFS_PROP_PROJECTOBJQUOTA: - return (zfsvfs->z_projectobjquota_obj); - default: - return (ZFS_NO_OBJECT); - } -} - -int -zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, - uint64_t *cookiep, void *vbuf, uint64_t *bufsizep) -{ - int error; - zap_cursor_t zc; - zap_attribute_t za; - zfs_useracct_t *buf = vbuf; - uint64_t obj; - int offset = 0; - - if (!dmu_objset_userspace_present(zfsvfs->z_os)) - return (SET_ERROR(ENOTSUP)); - - if ((type == ZFS_PROP_PROJECTQUOTA || type == ZFS_PROP_PROJECTUSED || - type == ZFS_PROP_PROJECTOBJQUOTA || - type == ZFS_PROP_PROJECTOBJUSED) && - !dmu_objset_projectquota_present(zfsvfs->z_os)) - return (SET_ERROR(ENOTSUP)); - - if ((type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED || - type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_GROUPOBJQUOTA || - type == ZFS_PROP_PROJECTOBJUSED || - type == ZFS_PROP_PROJECTOBJQUOTA) && - !dmu_objset_userobjspace_present(zfsvfs->z_os)) - return (SET_ERROR(ENOTSUP)); - - obj = zfs_userquota_prop_to_obj(zfsvfs, type); - if (obj == ZFS_NO_OBJECT) { - *bufsizep = 0; - return (0); - } - - if (type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED || - type == ZFS_PROP_PROJECTOBJUSED) - offset = DMU_OBJACCT_PREFIX_LEN; - - for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep); - (error = zap_cursor_retrieve(&zc, &za)) == 0; - zap_cursor_advance(&zc)) { - if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) > - *bufsizep) - break; - - /* - * skip object quota (with zap name prefix DMU_OBJACCT_PREFIX) - 
* when dealing with block quota and vice versa. - */ - if ((offset > 0) != (strncmp(za.za_name, DMU_OBJACCT_PREFIX, - DMU_OBJACCT_PREFIX_LEN) == 0)) - continue; - - fuidstr_to_sid(zfsvfs, za.za_name + offset, - buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid); - - buf->zu_space = za.za_first_integer; - buf++; - } - if (error == ENOENT) - error = 0; - - ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep); - *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf; - *cookiep = zap_cursor_serialize(&zc); - zap_cursor_fini(&zc); - return (error); -} - -/* - * buf must be big enough (eg, 32 bytes) - */ -static int -id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid, - char *buf, boolean_t addok) -{ - uint64_t fuid; - int domainid = 0; - - if (domain && domain[0]) { - domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok); - if (domainid == -1) - return (SET_ERROR(ENOENT)); - } - fuid = FUID_ENCODE(domainid, rid); - (void) sprintf(buf, "%llx", (longlong_t)fuid); - return (0); -} - -int -zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, - const char *domain, uint64_t rid, uint64_t *valp) -{ - char buf[20 + DMU_OBJACCT_PREFIX_LEN]; - int offset = 0; - int err; - uint64_t obj; - - *valp = 0; - - if (!dmu_objset_userspace_present(zfsvfs->z_os)) - return (SET_ERROR(ENOTSUP)); - - if ((type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED || - type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_GROUPOBJQUOTA || - type == ZFS_PROP_PROJECTOBJUSED || - type == ZFS_PROP_PROJECTOBJQUOTA) && - !dmu_objset_userobjspace_present(zfsvfs->z_os)) - return (SET_ERROR(ENOTSUP)); - - if (type == ZFS_PROP_PROJECTQUOTA || type == ZFS_PROP_PROJECTUSED || - type == ZFS_PROP_PROJECTOBJQUOTA || - type == ZFS_PROP_PROJECTOBJUSED) { - if (!dmu_objset_projectquota_present(zfsvfs->z_os)) - return (SET_ERROR(ENOTSUP)); - if (!zpl_is_valid_projid(rid)) - return (SET_ERROR(EINVAL)); - } - - obj = zfs_userquota_prop_to_obj(zfsvfs, type); - if (obj == ZFS_NO_OBJECT) - return (0); - - if (type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED || - type == ZFS_PROP_PROJECTOBJUSED) { - strlcpy(buf, DMU_OBJACCT_PREFIX, DMU_OBJACCT_PREFIX_LEN + 1); - offset = DMU_OBJACCT_PREFIX_LEN; - } - - err = id_to_fuidstr(zfsvfs, domain, rid, buf + offset, B_FALSE); - if (err) - return (err); - - err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp); - if (err == ENOENT) - err = 0; - return (err); -} - -int -zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, - const char *domain, uint64_t rid, uint64_t quota) -{ - char buf[32]; - int err; - dmu_tx_t *tx; - uint64_t *objp; - boolean_t fuid_dirtied; - - if (zfsvfs->z_version < ZPL_VERSION_USERSPACE) - return (SET_ERROR(ENOTSUP)); - - switch (type) { - case ZFS_PROP_USERQUOTA: - objp = &zfsvfs->z_userquota_obj; - break; - case ZFS_PROP_GROUPQUOTA: - objp = &zfsvfs->z_groupquota_obj; - break; - case ZFS_PROP_USEROBJQUOTA: - objp = &zfsvfs->z_userobjquota_obj; - break; - case ZFS_PROP_GROUPOBJQUOTA: - objp = &zfsvfs->z_groupobjquota_obj; - break; - case ZFS_PROP_PROJECTQUOTA: - if (!dmu_objset_projectquota_enabled(zfsvfs->z_os)) - return (SET_ERROR(ENOTSUP)); - if (!zpl_is_valid_projid(rid)) - return (SET_ERROR(EINVAL)); - - objp = &zfsvfs->z_projectquota_obj; - break; - case ZFS_PROP_PROJECTOBJQUOTA: - if (!dmu_objset_projectquota_enabled(zfsvfs->z_os)) - return (SET_ERROR(ENOTSUP)); - if (!zpl_is_valid_projid(rid)) - return (SET_ERROR(EINVAL)); - - objp = &zfsvfs->z_projectobjquota_obj; - break; - default: - return (SET_ERROR(EINVAL)); - } - - 
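-    /*
-     * Encode <domain, rid> as the hex FUID string used as the quota
-     * ZAP key (see id_to_fuidstr() above); B_TRUE allows a new domain
-     * entry to be added to the FUID table if needed.
-     */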
err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE);
- if (err)
- return (err);
- fuid_dirtied = zfsvfs->z_fuid_dirty;
-
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL);
- if (*objp == 0) {
- dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
- zfs_userquota_prop_prefixes[type]);
- }
- if (fuid_dirtied)
- zfs_fuid_txhold(zfsvfs, tx);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err) {
- dmu_tx_abort(tx);
- return (err);
- }
-
- mutex_enter(&zfsvfs->z_lock);
- if (*objp == 0) {
- *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA,
- DMU_OT_NONE, 0, tx);
- VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
- zfs_userquota_prop_prefixes[type], 8, 1, objp, tx));
- }
- mutex_exit(&zfsvfs->z_lock);
-
- if (quota == 0) {
- err = zap_remove(zfsvfs->z_os, *objp, buf, tx);
- if (err == ENOENT)
- err = 0;
- } else {
- err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx);
- }
- ASSERT(err == 0);
- if (fuid_dirtied)
- zfs_fuid_sync(zfsvfs, tx);
- dmu_tx_commit(tx);
- return (err);
-}
-
-boolean_t
-zfs_id_overobjquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id)
-{
- char buf[20 + DMU_OBJACCT_PREFIX_LEN];
- uint64_t used, quota, quotaobj;
- int err;
-
- if (!dmu_objset_userobjspace_present(zfsvfs->z_os)) {
- if (dmu_objset_userobjspace_upgradable(zfsvfs->z_os)) {
- dsl_pool_config_enter(
- dmu_objset_pool(zfsvfs->z_os), FTAG);
- dmu_objset_id_quota_upgrade(zfsvfs->z_os);
- dsl_pool_config_exit(
- dmu_objset_pool(zfsvfs->z_os), FTAG);
- }
- return (B_FALSE);
- }
-
- if (usedobj == DMU_PROJECTUSED_OBJECT) {
- if (!dmu_objset_projectquota_present(zfsvfs->z_os)) {
- if (dmu_objset_projectquota_upgradable(zfsvfs->z_os)) {
- dsl_pool_config_enter(
- dmu_objset_pool(zfsvfs->z_os), FTAG);
- dmu_objset_id_quota_upgrade(zfsvfs->z_os);
- dsl_pool_config_exit(
- dmu_objset_pool(zfsvfs->z_os), FTAG);
- }
- return (B_FALSE);
- }
- quotaobj = zfsvfs->z_projectobjquota_obj;
- } else if (usedobj == DMU_USERUSED_OBJECT) {
- quotaobj = zfsvfs->z_userobjquota_obj;
- } else if (usedobj == DMU_GROUPUSED_OBJECT) {
- quotaobj = zfsvfs->z_groupobjquota_obj;
- } else {
- return (B_FALSE);
- }
- if (quotaobj == 0 || zfsvfs->z_replay)
- return (B_FALSE);
-
- (void) sprintf(buf, "%llx", (longlong_t)id);
- err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota);
- if (err != 0)
- return (B_FALSE);
-
- (void) sprintf(buf, DMU_OBJACCT_PREFIX "%llx", (longlong_t)id);
- err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used);
- if (err != 0)
- return (B_FALSE);
- return (used >= quota);
-}
-
-boolean_t
-zfs_id_overblockquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id)
-{
- char buf[20];
- uint64_t used, quota, quotaobj;
- int err;
-
- if (usedobj == DMU_PROJECTUSED_OBJECT) {
- if (!dmu_objset_projectquota_present(zfsvfs->z_os)) {
- if (dmu_objset_projectquota_upgradable(zfsvfs->z_os)) {
- dsl_pool_config_enter(
- dmu_objset_pool(zfsvfs->z_os), FTAG);
- dmu_objset_id_quota_upgrade(zfsvfs->z_os);
- dsl_pool_config_exit(
- dmu_objset_pool(zfsvfs->z_os), FTAG);
- }
- return (B_FALSE);
- }
- quotaobj = zfsvfs->z_projectquota_obj;
- } else if (usedobj == DMU_USERUSED_OBJECT) {
- quotaobj = zfsvfs->z_userquota_obj;
- } else if (usedobj == DMU_GROUPUSED_OBJECT) {
- quotaobj = zfsvfs->z_groupquota_obj;
- } else {
- return (B_FALSE);
- }
- if (quotaobj == 0 || zfsvfs->z_replay)
- return (B_FALSE);
-
- (void) sprintf(buf, "%llx", (longlong_t)id);
- err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota);
- if (err != 0)
- return (B_FALSE);
-
- err = zap_lookup(zfsvfs->z_os,
usedobj, buf, 8, 1, &used); - if (err != 0) - return (B_FALSE); - return (used >= quota); -} - -boolean_t -zfs_id_overquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id) -{ - return (zfs_id_overblockquota(zfsvfs, usedobj, id) || - zfs_id_overobjquota(zfsvfs, usedobj, id)); -} - -/* - * Associate this zfsvfs with the given objset, which must be owned. - * This will cache a bunch of on-disk state from the objset in the - * zfsvfs. - */ -static int -zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os) -{ - int error; - uint64_t val; - - zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE; - zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; - zfsvfs->z_os = os; - - error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); - if (error != 0) - return (error); - if (zfsvfs->z_version > - zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { - (void) printk("Can't mount a version %lld file system " - "on a version %lld pool\n. Pool must be upgraded to mount " - "this file system.\n", (u_longlong_t)zfsvfs->z_version, - (u_longlong_t)spa_version(dmu_objset_spa(os))); - return (SET_ERROR(ENOTSUP)); - } - error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val); - if (error != 0) - return (error); - zfsvfs->z_norm = (int)val; - - error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val); - if (error != 0) - return (error); - zfsvfs->z_utf8 = (val != 0); - - error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val); - if (error != 0) - return (error); - zfsvfs->z_case = (uint_t)val; - - if ((error = zfs_get_zplprop(os, ZFS_PROP_ACLTYPE, &val)) != 0) - return (error); - zfsvfs->z_acl_type = (uint_t)val; - - /* - * Fold case on file systems that are always or sometimes case - * insensitive. - */ - if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || - zfsvfs->z_case == ZFS_CASE_MIXED) - zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; - - zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); - zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); - - uint64_t sa_obj = 0; - if (zfsvfs->z_use_sa) { - /* should either have both of these objects or none */ - error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, - &sa_obj); - if (error != 0) - return (error); - - error = zfs_get_zplprop(os, ZFS_PROP_XATTR, &val); - if ((error == 0) && (val == ZFS_XATTR_SA)) - zfsvfs->z_xattr_sa = B_TRUE; - } - - error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, - &zfsvfs->z_root); - if (error != 0) - return (error); - ASSERT(zfsvfs->z_root != 0); - - error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, - &zfsvfs->z_unlinkedobj); - if (error != 0) - return (error); - - error = zap_lookup(os, MASTER_NODE_OBJ, - zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], - 8, 1, &zfsvfs->z_userquota_obj); - if (error == ENOENT) - zfsvfs->z_userquota_obj = 0; - else if (error != 0) - return (error); - - error = zap_lookup(os, MASTER_NODE_OBJ, - zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], - 8, 1, &zfsvfs->z_groupquota_obj); - if (error == ENOENT) - zfsvfs->z_groupquota_obj = 0; - else if (error != 0) - return (error); - - error = zap_lookup(os, MASTER_NODE_OBJ, - zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA], - 8, 1, &zfsvfs->z_projectquota_obj); - if (error == ENOENT) - zfsvfs->z_projectquota_obj = 0; - else if (error != 0) - return (error); - - error = zap_lookup(os, MASTER_NODE_OBJ, - zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA], - 8, 1, &zfsvfs->z_userobjquota_obj); - if (error == ENOENT) - zfsvfs->z_userobjquota_obj = 0; - else if (error != 0) - return (error); - - error = zap_lookup(os, MASTER_NODE_OBJ, - 
zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA], - 8, 1, &zfsvfs->z_groupobjquota_obj); - if (error == ENOENT) - zfsvfs->z_groupobjquota_obj = 0; - else if (error != 0) - return (error); - - error = zap_lookup(os, MASTER_NODE_OBJ, - zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA], - 8, 1, &zfsvfs->z_projectobjquota_obj); - if (error == ENOENT) - zfsvfs->z_projectobjquota_obj = 0; - else if (error != 0) - return (error); - - error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, - &zfsvfs->z_fuid_obj); - if (error == ENOENT) - zfsvfs->z_fuid_obj = 0; - else if (error != 0) - return (error); - - error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, - &zfsvfs->z_shares_dir); - if (error == ENOENT) - zfsvfs->z_shares_dir = 0; - else if (error != 0) - return (error); - - error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, - &zfsvfs->z_attr_table); - if (error != 0) - return (error); - - if (zfsvfs->z_version >= ZPL_VERSION_SA) - sa_register_update_callback(os, zfs_sa_upgrade); - - return (0); -} - -int -zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp) -{ - objset_t *os; - zfsvfs_t *zfsvfs; - int error; - boolean_t ro = (readonly || (strchr(osname, '@') != NULL)); - - zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); - - error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs, &os); - if (error != 0) { - kmem_free(zfsvfs, sizeof (zfsvfs_t)); - return (error); - } - - error = zfsvfs_create_impl(zfvp, zfsvfs, os); - if (error != 0) { - dmu_objset_disown(os, B_TRUE, zfsvfs); - } - return (error); -} - - -/* - * Note: zfsvfs is assumed to be malloc'd, and will be freed by this function - * on a failure. Do not pass in a statically allocated zfsvfs. - */ -int -zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os) -{ - int error; - - zfsvfs->z_vfs = NULL; - zfsvfs->z_sb = NULL; - zfsvfs->z_parent = zfsvfs; - - mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); - list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), - offsetof(znode_t, z_link_node)); - rrm_init(&zfsvfs->z_teardown_lock, B_FALSE); - rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); - rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); - - int size = MIN(1 << (highbit64(zfs_object_mutex_size) - 1), - ZFS_OBJ_MTX_MAX); - zfsvfs->z_hold_size = size; - zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size, - KM_SLEEP); - zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP); - for (int i = 0; i != size; i++) { - avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare, - sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node)); - mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL); - } - - error = zfsvfs_init(zfsvfs, os); - if (error != 0) { - *zfvp = NULL; - zfsvfs_free(zfsvfs); - return (error); - } - - zfsvfs->z_drain_task = TASKQID_INVALID; - zfsvfs->z_draining = B_FALSE; - zfsvfs->z_drain_cancel = B_TRUE; - - *zfvp = zfsvfs; - return (0); -} - -static int -zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) -{ - int error; - boolean_t readonly = zfs_is_readonly(zfsvfs); - - error = zfs_register_callbacks(zfsvfs->z_vfs); - if (error) - return (error); - - zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); - - /* - * If we are not mounting (ie: online recv), then we don't - * have to worry about replaying the log as we blocked all - * operations out since we closed the ZIL. 
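One detail of zfsvfs_create_impl() above worth calling out: the znode hold machinery sizes its AVL-tree/mutex arrays to a power of two derived from the zfs_object_mutex_size tunable, capped at ZFS_OBJ_MTX_MAX, so object ids can be hashed into buckets cheaply. A minimal sketch of that rounding, with highbit64() reimplemented locally under the assumption that it returns the 1-based index of the highest set bit (0 for 0), and a made-up cap:

#include <stdio.h>
#include <stdint.h>

static int
highbit64(uint64_t i)
{
	int h = 0;
	while (i != 0) {
		h++;
		i >>= 1;
	}
	return (h);
}

int
main(void)
{
	uint64_t tunable = 300;	/* e.g. a zfs_object_mutex_size setting */
	uint64_t cap = 1024;	/* stand-in for ZFS_OBJ_MTX_MAX */
	uint64_t size = 1ULL << (highbit64(tunable) - 1);

	if (size > cap)
		size = cap;
	/* 300 rounds down to 256: every object id hashes into one of
	 * "size" hold-tree/mutex buckets. */
	printf("hold-lock buckets: %llu\n", (unsigned long long)size);
	return (0);
}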
- */
- if (mounting) {
- ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL);
- dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);
-
- /*
- * During replay we remove the read only flag to
- * allow replays to succeed.
- */
- if (readonly != 0) {
- readonly_changed_cb(zfsvfs, B_FALSE);
- } else {
- zap_stats_t zs;
- if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
- &zs) == 0) {
- dataset_kstats_update_nunlinks_kstat(
- &zfsvfs->z_kstat, zs.zs_num_entries);
- dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
- "num_entries in unlinked set: %llu",
- zs.zs_num_entries);
- }
- zfs_unlinked_drain(zfsvfs);
- }
-
- /*
- * Parse and replay the intent log.
- *
- * Because of ziltest, this must be done after
- * zfs_unlinked_drain(). (Further note: ziltest
- * doesn't use readonly mounts, where
- * zfs_unlinked_drain() isn't called.) This is because
- * ziltest causes spa_sync() to think it's committed,
- * but actually it is not, so the intent log contains
- * many txg's worth of changes.
- *
- * In particular, if object N is in the unlinked set in
- * the last txg to actually sync, then it could be
- * actually freed in a later txg and then reallocated
- * in a yet later txg. This would write a "create
- * object N" record to the intent log. Normally, this
- * would be fine because the spa_sync() would have
- * written out the fact that object N is free, before
- * we could write the "create object N" intent log
- * record.
- *
- * But when we are in ziltest mode, we advance the "open
- * txg" without actually spa_sync()-ing the changes to
- * disk. So we would see that object N is still
- * allocated and in the unlinked set, and there is an
- * intent log record saying to allocate it.
- */
- if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
- if (zil_replay_disable) {
- zil_destroy(zfsvfs->z_log, B_FALSE);
- } else {
- zfsvfs->z_replay = B_TRUE;
- zil_replay(zfsvfs->z_os, zfsvfs,
- zfs_replay_vector);
- zfsvfs->z_replay = B_FALSE;
- }
- }
-
- /* restore readonly bit */
- if (readonly != 0)
- readonly_changed_cb(zfsvfs, B_TRUE);
- }
-
- /*
- * Set the objset user_ptr to track its zfsvfs.
- */
- mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
- dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
- mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
-
- return (0);
-}
-
-void
-zfsvfs_free(zfsvfs_t *zfsvfs)
-{
- int i, size = zfsvfs->z_hold_size;
-
- zfs_fuid_destroy(zfsvfs);
-
- mutex_destroy(&zfsvfs->z_znodes_lock);
- mutex_destroy(&zfsvfs->z_lock);
- list_destroy(&zfsvfs->z_all_znodes);
- rrm_destroy(&zfsvfs->z_teardown_lock);
- rw_destroy(&zfsvfs->z_teardown_inactive_lock);
- rw_destroy(&zfsvfs->z_fuid_lock);
- for (i = 0; i != size; i++) {
- avl_destroy(&zfsvfs->z_hold_trees[i]);
- mutex_destroy(&zfsvfs->z_hold_locks[i]);
- }
- vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size);
- vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size);
- zfsvfs_vfs_free(zfsvfs->z_vfs);
- dataset_kstats_destroy(&zfsvfs->z_kstat);
- kmem_free(zfsvfs, sizeof (zfsvfs_t));
-}
-
-static void
-zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
-{
- zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
- zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
-}
-
-void
-zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
-{
- objset_t *os = zfsvfs->z_os;
-
- if (!dmu_objset_is_snapshot(os))
- dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
-}
-
-#ifdef HAVE_MLSLABEL
-/*
- * Check that the hex label string is appropriate for the dataset being
- * mounted into the global_zone proper.
- *
- * Return an error if the hex label string is not default or
- * admin_low/admin_high. For admin_low labels, the corresponding
- * dataset must be readonly.
- */
-int
-zfs_check_global_label(const char *dsname, const char *hexsl)
-{
- if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
- return (0);
- if (strcasecmp(hexsl, ADMIN_HIGH) == 0)
- return (0);
- if (strcasecmp(hexsl, ADMIN_LOW) == 0) {
- /* must be readonly */
- uint64_t rdonly;
-
- if (dsl_prop_get_integer(dsname,
- zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL))
- return (SET_ERROR(EACCES));
- return (rdonly ? 0 : EACCES);
- }
- return (SET_ERROR(EACCES));
-}
-#endif /* HAVE_MLSLABEL */
-
-static int
-zfs_statfs_project(zfsvfs_t *zfsvfs, znode_t *zp, struct kstatfs *statp,
- uint32_t bshift)
-{
- char buf[20 + DMU_OBJACCT_PREFIX_LEN];
- uint64_t offset = DMU_OBJACCT_PREFIX_LEN;
- uint64_t quota;
- uint64_t used;
- int err;
-
- strlcpy(buf, DMU_OBJACCT_PREFIX, DMU_OBJACCT_PREFIX_LEN + 1);
- err = id_to_fuidstr(zfsvfs, NULL, zp->z_projid, buf + offset, B_FALSE);
- if (err)
- return (err);
-
- if (zfsvfs->z_projectquota_obj == 0)
- goto objs;
-
- err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectquota_obj,
- buf + offset, 8, 1, &quota);
- if (err == ENOENT)
- goto objs;
- else if (err)
- return (err);
-
- err = zap_lookup(zfsvfs->z_os, DMU_PROJECTUSED_OBJECT,
- buf + offset, 8, 1, &used);
- if (unlikely(err == ENOENT)) {
- uint32_t blksize;
- u_longlong_t nblocks;
-
- /*
- * Quota accounting is async, so a race is possible here.
- * There is at least one object with the given project ID.
- */
- sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
- if (unlikely(zp->z_blksz == 0))
- blksize = zfsvfs->z_max_blksz;
-
- used = blksize * nblocks;
- } else if (err) {
- return (err);
- }
-
- statp->f_blocks = quota >> bshift;
- statp->f_bfree = (quota > used) ? ((quota - used) >> bshift) : 0;
- statp->f_bavail = statp->f_bfree;
-
-objs:
- if (zfsvfs->z_projectobjquota_obj == 0)
- return (0);
-
- err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectobjquota_obj,
- buf + offset, 8, 1, &quota);
- if (err == ENOENT)
- return (0);
- else if (err)
- return (err);
-
- err = zap_lookup(zfsvfs->z_os, DMU_PROJECTUSED_OBJECT,
- buf, 8, 1, &used);
- if (unlikely(err == ENOENT)) {
- /*
- * Quota accounting is async, so a race is possible here.
- * There is at least one object with the given project ID.
- */
- used = 1;
- } else if (err) {
- return (err);
- }
-
- statp->f_files = quota;
- statp->f_ffree = (quota > used) ? (quota - used) : 0;
-
- return (0);
-}
-
-int
-zfs_statvfs(struct dentry *dentry, struct kstatfs *statp)
-{
- zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info;
- uint64_t refdbytes, availbytes, usedobjs, availobjs;
- int err = 0;
-
- ZFS_ENTER(zfsvfs);
-
- dmu_objset_space(zfsvfs->z_os,
- &refdbytes, &availbytes, &usedobjs, &availobjs);
-
- uint64_t fsid = dmu_objset_fsid_guid(zfsvfs->z_os);
- /*
- * The underlying storage pool actually uses multiple block
- * sizes. Under Solaris frsize (fragment size) is reported as
- * the smallest block size we support, and bsize (block size)
- * as the filesystem's maximum block size. Unfortunately,
- * under Linux the fragment size and block size are often used
- * interchangeably. Thus we are forced to report both of them
- * as the filesystem's maximum block size.
- */ - statp->f_frsize = zfsvfs->z_max_blksz; - statp->f_bsize = zfsvfs->z_max_blksz; - uint32_t bshift = fls(statp->f_bsize) - 1; - - /* - * The following report "total" blocks of various kinds in - * the file system, but reported in terms of f_bsize - the - * "preferred" size. - */ - - /* Round up so we never have a filesystem using 0 blocks. */ - refdbytes = P2ROUNDUP(refdbytes, statp->f_bsize); - statp->f_blocks = (refdbytes + availbytes) >> bshift; - statp->f_bfree = availbytes >> bshift; - statp->f_bavail = statp->f_bfree; /* no root reservation */ - - /* - * statvfs() should really be called statufs(), because it assumes - * static metadata. ZFS doesn't preallocate files, so the best - * we can do is report the max that could possibly fit in f_files, - * and that minus the number actually used in f_ffree. - * For f_ffree, report the smaller of the number of objects available - * and the number of blocks (each object will take at least a block). - */ - statp->f_ffree = MIN(availobjs, availbytes >> DNODE_SHIFT); - statp->f_files = statp->f_ffree + usedobjs; - statp->f_fsid.val[0] = (uint32_t)fsid; - statp->f_fsid.val[1] = (uint32_t)(fsid >> 32); - statp->f_type = ZFS_SUPER_MAGIC; - statp->f_namelen = MAXNAMELEN - 1; - - /* - * We have all of 40 characters to stuff a string here. - * Is there anything useful we could/should provide? - */ - bzero(statp->f_spare, sizeof (statp->f_spare)); - - if (dmu_objset_projectquota_enabled(zfsvfs->z_os) && - dmu_objset_projectquota_present(zfsvfs->z_os)) { - znode_t *zp = ITOZ(dentry->d_inode); - - if (zp->z_pflags & ZFS_PROJINHERIT && zp->z_projid && - zpl_is_valid_projid(zp->z_projid)) - err = zfs_statfs_project(zfsvfs, zp, statp, bshift); - } - - ZFS_EXIT(zfsvfs); - return (err); -} - -int -zfs_root(zfsvfs_t *zfsvfs, struct inode **ipp) -{ - znode_t *rootzp; - int error; - - ZFS_ENTER(zfsvfs); - - error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); - if (error == 0) - *ipp = ZTOI(rootzp); - - ZFS_EXIT(zfsvfs); - return (error); -} - -#ifdef HAVE_D_PRUNE_ALIASES -/* - * Linux kernels older than 3.1 do not support a per-filesystem shrinker. - * To accommodate this we must improvise and manually walk the list of znodes - * attempting to prune dentries in order to be able to drop the inodes. - * - * To avoid scanning the same znodes multiple times they are always rotated - * to the end of the z_all_znodes list. New znodes are inserted at the - * end of the list so we're always scanning the oldest znodes first. 
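Backing up to the statvfs arithmetic above: f_blocks/f_bfree are byte counts shifted down by a block-size exponent, refdbytes is rounded up so a fresh filesystem never reports zero blocks used, and f_ffree is bounded by both the available object count and the bytes available for dnodes. A standalone sketch of the same math with made-up pool numbers; the DNODE_SHIFT value of 9 (512-byte dnodes) is an assumption carried over from the DMU headers:

#include <stdio.h>
#include <stdint.h>

#define	DNODE_SHIFT	9	/* assumed: 512-byte dnodes */

/* Round x up to a multiple of align (a power of two), like P2ROUNDUP(). */
static uint64_t
p2roundup(uint64_t x, uint64_t align)
{
	return ((x + align - 1) & ~(align - 1));
}

int
main(void)
{
	uint64_t bsize = 131072;		/* z_max_blksz: 128 KiB */
	uint64_t refdbytes = 1000;		/* tiny fs: forces the round-up */
	uint64_t availbytes = 10ULL << 30;	/* 10 GiB free */
	uint64_t availobjs = 1ULL << 40;
	uint32_t bshift = 0;

	for (uint64_t b = bsize; b > 1; b >>= 1)	/* fls(bsize) - 1 */
		bshift++;

	refdbytes = p2roundup(refdbytes, bsize);	/* never report 0 used */
	printf("f_blocks = %llu\n",
	    (unsigned long long)((refdbytes + availbytes) >> bshift));
	printf("f_bfree  = %llu\n", (unsigned long long)(availbytes >> bshift));
	/* Each object needs at least one dnode's worth of space. */
	uint64_t min_ff = availbytes >> DNODE_SHIFT;
	printf("f_ffree  = %llu\n", (unsigned long long)
	    (availobjs < min_ff ? availobjs : min_ff));
	return (0);
}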
- */ -static int -zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan) -{ - znode_t **zp_array, *zp; - int max_array = MIN(nr_to_scan, PAGE_SIZE * 8 / sizeof (znode_t *)); - int objects = 0; - int i = 0, j = 0; - - zp_array = kmem_zalloc(max_array * sizeof (znode_t *), KM_SLEEP); - - mutex_enter(&zfsvfs->z_znodes_lock); - while ((zp = list_head(&zfsvfs->z_all_znodes)) != NULL) { - - if ((i++ > nr_to_scan) || (j >= max_array)) - break; - - ASSERT(list_link_active(&zp->z_link_node)); - list_remove(&zfsvfs->z_all_znodes, zp); - list_insert_tail(&zfsvfs->z_all_znodes, zp); - - /* Skip active znodes and .zfs entries */ - if (MUTEX_HELD(&zp->z_lock) || zp->z_is_ctldir) - continue; - - if (igrab(ZTOI(zp)) == NULL) - continue; - - zp_array[j] = zp; - j++; - } - mutex_exit(&zfsvfs->z_znodes_lock); - - for (i = 0; i < j; i++) { - zp = zp_array[i]; - - ASSERT3P(zp, !=, NULL); - d_prune_aliases(ZTOI(zp)); - - if (atomic_read(&ZTOI(zp)->i_count) == 1) - objects++; - - iput(ZTOI(zp)); - } - - kmem_free(zp_array, max_array * sizeof (znode_t *)); - - return (objects); -} -#endif /* HAVE_D_PRUNE_ALIASES */ - -/* - * The ARC has requested that the filesystem drop entries from the dentry - * and inode caches. This can occur when the ARC needs to free meta data - * blocks but can't because they are all pinned by entries in these caches. - */ -int -zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects) -{ - zfsvfs_t *zfsvfs = sb->s_fs_info; - int error = 0; -#if defined(HAVE_SHRINK) || defined(HAVE_SPLIT_SHRINKER_CALLBACK) - struct shrinker *shrinker = &sb->s_shrink; - struct shrink_control sc = { - .nr_to_scan = nr_to_scan, - .gfp_mask = GFP_KERNEL, - }; -#endif - - ZFS_ENTER(zfsvfs); - -#if defined(HAVE_SPLIT_SHRINKER_CALLBACK) && \ - defined(SHRINK_CONTROL_HAS_NID) && \ - defined(SHRINKER_NUMA_AWARE) - if (sb->s_shrink.flags & SHRINKER_NUMA_AWARE) { - *objects = 0; - for_each_online_node(sc.nid) { - *objects += (*shrinker->scan_objects)(shrinker, &sc); - } - } else { - *objects = (*shrinker->scan_objects)(shrinker, &sc); - } - -#elif defined(HAVE_SPLIT_SHRINKER_CALLBACK) - *objects = (*shrinker->scan_objects)(shrinker, &sc); -#elif defined(HAVE_SHRINK) - *objects = (*shrinker->shrink)(shrinker, &sc); -#elif defined(HAVE_D_PRUNE_ALIASES) -#define D_PRUNE_ALIASES_IS_DEFAULT - *objects = zfs_prune_aliases(zfsvfs, nr_to_scan); -#else -#error "No available dentry and inode cache pruning mechanism." -#endif - -#if defined(HAVE_D_PRUNE_ALIASES) && !defined(D_PRUNE_ALIASES_IS_DEFAULT) -#undef D_PRUNE_ALIASES_IS_DEFAULT - /* - * Fall back to zfs_prune_aliases if the kernel's per-superblock - * shrinker couldn't free anything, possibly due to the inodes being - * allocated in a different memcg. - */ - if (*objects == 0) - *objects = zfs_prune_aliases(zfsvfs, nr_to_scan); -#endif - - ZFS_EXIT(zfsvfs); - - dprintf_ds(zfsvfs->z_os->os_dsl_dataset, - "pruning, nr_to_scan=%lu objects=%d error=%d\n", - nr_to_scan, *objects, error); - - return (error); -} - -/* - * Teardown the zfsvfs_t. - * - * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock' - * and 'z_teardown_inactive_lock' held. - */ -static int -zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) -{ - znode_t *zp; - - zfs_unlinked_drain_stop_wait(zfsvfs); - - /* - * If someone has not already unmounted this file system, - * drain the iput_taskq to ensure all active references to the - * zfsvfs_t have been handled only then can it be safely destroyed. 
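The move-to-tail trick in zfs_prune_aliases() above deserves a tiny illustration: every visited znode is re-queued at the tail, so repeated scans walk the list in ascending age without a separate cursor. The same effect on a plain array, purely illustrative:

#include <stdio.h>

#define	N	5

int
main(void)
{
	int list[N] = { 1, 2, 3, 4, 5 };	/* oldest entry first */
	int nr_to_scan = 2;

	/* Scan from the head; rotate each scanned entry to the tail so a
	 * later scan resumes with what is now oldest (3, 4, 5, ...). */
	for (int scanned = 0; scanned < nr_to_scan; scanned++) {
		int head = list[0];
		for (int i = 0; i < N - 1; i++)
			list[i] = list[i + 1];
		list[N - 1] = head;
		printf("scanned %d\n", head);
	}
	for (int i = 0; i < N; i++)
		printf("%d ", list[i]);	/* prints: 3 4 5 1 2 */
	printf("\n");
	return (0);
}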
- */
- if (zfsvfs->z_os) {
- /*
- * If we're unmounting we have to wait for the list to
- * drain completely.
- *
- * If we're not unmounting there's no guarantee the list
- * will drain completely, but iputs run from the taskq
- * may add the parents of dir-based xattrs to the taskq
- * so we want to wait for these.
- *
- * We can safely read z_nr_znodes without locking because the
- * VFS has already blocked operations which add to the
- * z_all_znodes list and thus increment z_nr_znodes.
- */
- int round = 0;
- while (zfsvfs->z_nr_znodes > 0) {
- taskq_wait_outstanding(dsl_pool_iput_taskq(
- dmu_objset_pool(zfsvfs->z_os)), 0);
- if (++round > 1 && !unmounting)
- break;
- }
- }
-
- rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
-
- if (!unmounting) {
- /*
- * We purge the parent filesystem's super block as the
- * parent filesystem and all of its snapshots have their
- * inode's super block set to the parent's filesystem's
- * super block. Note, 'z_parent' is self referential
- * for non-snapshots.
- */
- shrink_dcache_sb(zfsvfs->z_parent->z_sb);
- }
-
- /*
- * Close the zil. NB: Can't close the zil while zfs_inactive
- * threads are blocked as zil_close can call zfs_inactive.
- */
- if (zfsvfs->z_log) {
- zil_close(zfsvfs->z_log);
- zfsvfs->z_log = NULL;
- }
-
- rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);
-
- /*
- * If we are not unmounting (ie: online recv) and someone already
- * unmounted this file system while we were doing the switcheroo,
- * or a reopen of z_os failed then just bail out now.
- */
- if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
- rw_exit(&zfsvfs->z_teardown_inactive_lock);
- rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
- return (SET_ERROR(EIO));
- }
-
- /*
- * At this point there are no VFS ops active, and any new VFS ops
- * will fail with EIO since we have z_teardown_lock for writer (only
- * relevant for forced unmount).
- *
- * Release all holds on dbufs. We also grab an extra reference to all
- * the remaining inodes so that the kernel does not attempt to free
- * any inodes of a suspended fs. This can cause deadlocks since the
- * zfs_resume_fs() process may involve starting threads, which might
- * attempt to free unreferenced inodes to free up memory for the new
- * thread.
- */
- if (!unmounting) {
- mutex_enter(&zfsvfs->z_znodes_lock);
- for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
- zp = list_next(&zfsvfs->z_all_znodes, zp)) {
- if (zp->z_sa_hdl)
- zfs_znode_dmu_fini(zp);
- if (igrab(ZTOI(zp)) != NULL)
- zp->z_suspended = B_TRUE;
- }
- mutex_exit(&zfsvfs->z_znodes_lock);
- }
-
- /*
- * If we are unmounting, set the unmounted flag and let new VFS ops
- * unblock. zfs_inactive will have the unmounted behavior, and all
- * other VFS ops will fail with EIO.
- */
- if (unmounting) {
- zfsvfs->z_unmounted = B_TRUE;
- rw_exit(&zfsvfs->z_teardown_inactive_lock);
- rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
- }
-
- /*
- * z_os will be NULL if there was an error in attempting to reopen
- * zfsvfs, so just return as the properties had already been
- * unregistered and cached data had been evicted before.
- */
- if (zfsvfs->z_os == NULL)
- return (0);
-
- /*
- * Unregister properties.
- */
- zfs_unregister_callbacks(zfsvfs);
-
- /*
- * Evict cached data. We must write out any dirty data before
- * disowning the dataset.
- */ - objset_t *os = zfsvfs->z_os; - boolean_t os_dirty = B_FALSE; - for (int t = 0; t < TXG_SIZE; t++) { - if (dmu_objset_is_dirty(os, t)) { - os_dirty = B_TRUE; - break; - } - } - if (!zfs_is_readonly(zfsvfs) && os_dirty) { - txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); - } - dmu_objset_evict_dbufs(zfsvfs->z_os); - - return (0); -} - -#if !defined(HAVE_2ARGS_BDI_SETUP_AND_REGISTER) && \ - !defined(HAVE_3ARGS_BDI_SETUP_AND_REGISTER) -atomic_long_t zfs_bdi_seq = ATOMIC_LONG_INIT(0); -#endif - -int -zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent) -{ - const char *osname = zm->mnt_osname; - struct inode *root_inode; - uint64_t recordsize; - int error = 0; - zfsvfs_t *zfsvfs = NULL; - vfs_t *vfs = NULL; - - ASSERT(zm); - ASSERT(osname); - - error = zfsvfs_parse_options(zm->mnt_data, &vfs); - if (error) - return (error); - - error = zfsvfs_create(osname, vfs->vfs_readonly, &zfsvfs); - if (error) { - zfsvfs_vfs_free(vfs); - goto out; - } - - if ((error = dsl_prop_get_integer(osname, "recordsize", - &recordsize, NULL))) { - zfsvfs_vfs_free(vfs); - goto out; - } - - vfs->vfs_data = zfsvfs; - zfsvfs->z_vfs = vfs; - zfsvfs->z_sb = sb; - sb->s_fs_info = zfsvfs; - sb->s_magic = ZFS_SUPER_MAGIC; - sb->s_maxbytes = MAX_LFS_FILESIZE; - sb->s_time_gran = 1; - sb->s_blocksize = recordsize; - sb->s_blocksize_bits = ilog2(recordsize); - - error = -zpl_bdi_setup(sb, "zfs"); - if (error) - goto out; - - sb->s_bdi->ra_pages = 0; - - /* Set callback operations for the file system. */ - sb->s_op = &zpl_super_operations; - sb->s_xattr = zpl_xattr_handlers; - sb->s_export_op = &zpl_export_operations; -#ifdef HAVE_S_D_OP - sb->s_d_op = &zpl_dentry_operations; -#endif /* HAVE_S_D_OP */ - - /* Set features for file system. */ - zfs_set_fuid_feature(zfsvfs); - - if (dmu_objset_is_snapshot(zfsvfs->z_os)) { - uint64_t pval; - - atime_changed_cb(zfsvfs, B_FALSE); - readonly_changed_cb(zfsvfs, B_TRUE); - if ((error = dsl_prop_get_integer(osname, - "xattr", &pval, NULL))) - goto out; - xattr_changed_cb(zfsvfs, pval); - if ((error = dsl_prop_get_integer(osname, - "acltype", &pval, NULL))) - goto out; - acltype_changed_cb(zfsvfs, pval); - zfsvfs->z_issnap = B_TRUE; - zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; - zfsvfs->z_snap_defer_time = jiffies; - - mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); - dmu_objset_set_user(zfsvfs->z_os, zfsvfs); - mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); - } else { - if ((error = zfsvfs_setup(zfsvfs, B_TRUE))) - goto out; - } - - /* Allocate a root inode for the filesystem. */ - error = zfs_root(zfsvfs, &root_inode); - if (error) { - (void) zfs_umount(sb); - goto out; - } - - /* Allocate a root dentry for the filesystem */ - sb->s_root = d_make_root(root_inode); - if (sb->s_root == NULL) { - (void) zfs_umount(sb); - error = SET_ERROR(ENOMEM); - goto out; - } - - if (!zfsvfs->z_issnap) - zfsctl_create(zfsvfs); - - zfsvfs->z_arc_prune = arc_add_prune_callback(zpl_prune_sb, sb); -out: - if (error) { - if (zfsvfs != NULL) { - dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs); - zfsvfs_free(zfsvfs); - } - /* - * make sure we don't have dangling sb->s_fs_info which - * zfs_preumount will use. - */ - sb->s_fs_info = NULL; - } - - return (error); -} - -/* - * Called when an unmount is requested and certain sanity checks have - * already passed. At this point no dentries or inodes have been reclaimed - * from their respective caches. We drop the extra reference on the .zfs - * control directory to allow everything to be reclaimed. 
All snapshots
- * must already have been unmounted to reach this point.
- */
-void
-zfs_preumount(struct super_block *sb)
-{
- zfsvfs_t *zfsvfs = sb->s_fs_info;
-
- /* zfsvfs is NULL when zfs_domount fails during mount */
- if (zfsvfs) {
- zfs_unlinked_drain_stop_wait(zfsvfs);
- zfsctl_destroy(sb->s_fs_info);
- /*
- * Wait for iput_async before entering evict_inodes in
- * generic_shutdown_super. The reason we must finish before
- * evict_inodes is that, when lazytime is on or when zfs_purgedir
- * calls zfs_zget, iput would bump i_count from 0 to 1. This
- * would race with the i_count check in evict_inodes. This means
- * it could destroy the inode while we are still using it.
- *
- * We wait for two passes. xattr directories in the first pass
- * may add xattr entries in zfs_purgedir, so in the second pass
- * we wait for them. We don't use taskq_wait here because it is
- * a pool wide taskq. Other mounted filesystems can constantly
- * do iput_async and there's no guarantee when taskq will be
- * empty.
- */
- taskq_wait_outstanding(dsl_pool_iput_taskq(
- dmu_objset_pool(zfsvfs->z_os)), 0);
- taskq_wait_outstanding(dsl_pool_iput_taskq(
- dmu_objset_pool(zfsvfs->z_os)), 0);
- }
-}
-
-/*
- * Called once all other unmount-related teardown has occurred.
- * It is our responsibility to release any remaining infrastructure.
- */
-/*ARGSUSED*/
-int
-zfs_umount(struct super_block *sb)
-{
- zfsvfs_t *zfsvfs = sb->s_fs_info;
- objset_t *os;
-
- if (zfsvfs->z_arc_prune != NULL)
- arc_remove_prune_callback(zfsvfs->z_arc_prune);
- VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
- os = zfsvfs->z_os;
- zpl_bdi_destroy(sb);
-
- /*
- * z_os will be NULL if there was an error in
- * attempting to reopen zfsvfs.
- */
- if (os != NULL) {
- /*
- * Unset the objset user_ptr.
- */ - mutex_enter(&os->os_user_ptr_lock); - dmu_objset_set_user(os, NULL); - mutex_exit(&os->os_user_ptr_lock); - - /* - * Finally release the objset - */ - dmu_objset_disown(os, B_TRUE, zfsvfs); - } - - zfsvfs_free(zfsvfs); - return (0); -} - -int -zfs_remount(struct super_block *sb, int *flags, zfs_mnt_t *zm) -{ - zfsvfs_t *zfsvfs = sb->s_fs_info; - vfs_t *vfsp; - boolean_t issnap = dmu_objset_is_snapshot(zfsvfs->z_os); - int error; - - if ((issnap || !spa_writeable(dmu_objset_spa(zfsvfs->z_os))) && - !(*flags & SB_RDONLY)) { - *flags |= SB_RDONLY; - return (EROFS); - } - - error = zfsvfs_parse_options(zm->mnt_data, &vfsp); - if (error) - return (error); - - if (!zfs_is_readonly(zfsvfs) && (*flags & SB_RDONLY)) - txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); - - zfs_unregister_callbacks(zfsvfs); - zfsvfs_vfs_free(zfsvfs->z_vfs); - - vfsp->vfs_data = zfsvfs; - zfsvfs->z_vfs = vfsp; - if (!issnap) - (void) zfs_register_callbacks(vfsp); - - return (error); -} - -int -zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp) -{ - zfsvfs_t *zfsvfs = sb->s_fs_info; - znode_t *zp; - uint64_t object = 0; - uint64_t fid_gen = 0; - uint64_t gen_mask; - uint64_t zp_gen; - int i, err; - - *ipp = NULL; - - if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { - zfid_short_t *zfid = (zfid_short_t *)fidp; - - for (i = 0; i < sizeof (zfid->zf_object); i++) - object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); - - for (i = 0; i < sizeof (zfid->zf_gen); i++) - fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); - } else { - return (SET_ERROR(EINVAL)); - } - - /* LONG_FID_LEN means snapdirs */ - if (fidp->fid_len == LONG_FID_LEN) { - zfid_long_t *zlfid = (zfid_long_t *)fidp; - uint64_t objsetid = 0; - uint64_t setgen = 0; - - for (i = 0; i < sizeof (zlfid->zf_setid); i++) - objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); - - for (i = 0; i < sizeof (zlfid->zf_setgen); i++) - setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); - - if (objsetid != ZFSCTL_INO_SNAPDIRS - object) { - dprintf("snapdir fid: objsetid (%llu) != " - "ZFSCTL_INO_SNAPDIRS (%llu) - object (%llu)\n", - objsetid, ZFSCTL_INO_SNAPDIRS, object); - - return (SET_ERROR(EINVAL)); - } - - if (fid_gen > 1 || setgen != 0) { - dprintf("snapdir fid: fid_gen (%llu) and setgen " - "(%llu)\n", fid_gen, setgen); - return (SET_ERROR(EINVAL)); - } - - return (zfsctl_snapdir_vget(sb, objsetid, fid_gen, ipp)); - } - - ZFS_ENTER(zfsvfs); - /* A zero fid_gen means we are in the .zfs control directories */ - if (fid_gen == 0 && - (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) { - *ipp = zfsvfs->z_ctldir; - ASSERT(*ipp != NULL); - if (object == ZFSCTL_INO_SNAPDIR) { - VERIFY(zfsctl_root_lookup(*ipp, "snapshot", ipp, - 0, kcred, NULL, NULL) == 0); - } else { - igrab(*ipp); - } - ZFS_EXIT(zfsvfs); - return (0); - } - - gen_mask = -1ULL >> (64 - 8 * i); - - dprintf("getting %llu [%llu mask %llx]\n", object, fid_gen, gen_mask); - if ((err = zfs_zget(zfsvfs, object, &zp))) { - ZFS_EXIT(zfsvfs); - return (err); - } - - /* Don't export xattr stuff */ - if (zp->z_pflags & ZFS_XATTR) { - iput(ZTOI(zp)); - ZFS_EXIT(zfsvfs); - return (SET_ERROR(ENOENT)); - } - - (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, - sizeof (uint64_t)); - zp_gen = zp_gen & gen_mask; - if (zp_gen == 0) - zp_gen = 1; - if ((fid_gen == 0) && (zfsvfs->z_root == object)) - fid_gen = zp_gen; - if (zp->z_unlinked || zp_gen != fid_gen) { - dprintf("znode gen (%llu) != fid gen (%llu)\n", zp_gen, - fid_gen); - iput(ZTOI(zp)); - 
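The byte loops in zfs_vget() above decode the fid's object number and generation from little-endian byte arrays (6 bytes for zf_object and 4 for zf_gen in the short fid, per the zfid_short_t layout). A user-space rendering of the same decoding with example bytes:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/* Example short-fid payload: object 12345 (0x3039), generation 1. */
	uint8_t zf_object[6] = { 0x39, 0x30, 0x00, 0x00, 0x00, 0x00 };
	uint8_t zf_gen[4] = { 0x01, 0x00, 0x00, 0x00 };
	uint64_t object = 0, fid_gen = 0;

	/* Same shift-and-or accumulation as the loops in zfs_vget(). */
	for (int i = 0; i < 6; i++)
		object |= (uint64_t)zf_object[i] << (8 * i);
	for (int i = 0; i < 4; i++)
		fid_gen |= (uint64_t)zf_gen[i] << (8 * i);

	printf("object %llu gen %llu\n",	/* prints: object 12345 gen 1 */
	    (unsigned long long)object, (unsigned long long)fid_gen);
	return (0);
}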
ZFS_EXIT(zfsvfs);
- return (SET_ERROR(ENOENT));
- }
-
- *ipp = ZTOI(zp);
- if (*ipp)
- zfs_inode_update(ITOZ(*ipp));
-
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
-/*
- * Block out VFS ops and close zfsvfs_t
- *
- * Note, if successful, then we return with the 'z_teardown_lock' and
- * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying
- * dataset and objset intact so that they can be atomically handed off during
- * a subsequent rollback or recv operation and the resume thereafter.
- */
-int
-zfs_suspend_fs(zfsvfs_t *zfsvfs)
-{
- int error;
-
- if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
- return (error);
-
- return (0);
-}
-
-/*
- * Rebuild SA and release VOPs. Note that ownership of the underlying dataset
- * is an invariant across any of the operations that can be performed while the
- * filesystem was suspended. Whether it succeeded or failed, the preconditions
- * are the same: the relevant objset and associated dataset are owned by
- * zfsvfs, held, and long held on entry.
- */
-int
-zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
-{
- int err, err2;
- znode_t *zp;
-
- ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock));
- ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
-
- /*
- * We already own this, so just update the objset_t, as the one we
- * had before may have been evicted.
- */
- objset_t *os;
- VERIFY3P(ds->ds_owner, ==, zfsvfs);
- VERIFY(dsl_dataset_long_held(ds));
- VERIFY0(dmu_objset_from_ds(ds, &os));
-
- err = zfsvfs_init(zfsvfs, os);
- if (err != 0)
- goto bail;
-
- VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
-
- zfs_set_fuid_feature(zfsvfs);
- zfsvfs->z_rollback_time = jiffies;
-
- /*
- * Attempt to re-establish all the active inodes with their
- * dbufs. If a zfs_rezget() fails, then we unhash the inode
- * and mark it stale. This prevents a collision if a new
- * inode/object is created which must use the same inode
- * number. The stale inode will be released when the
- * VFS prunes the dentry holding the remaining references
- * on the stale inode.
- */
- mutex_enter(&zfsvfs->z_znodes_lock);
- for (zp = list_head(&zfsvfs->z_all_znodes); zp;
- zp = list_next(&zfsvfs->z_all_znodes, zp)) {
- err2 = zfs_rezget(zp);
- if (err2) {
- remove_inode_hash(ZTOI(zp));
- zp->z_is_stale = B_TRUE;
- }
-
- /* see comment in zfs_suspend_fs() */
- if (zp->z_suspended) {
- zfs_iput_async(ZTOI(zp));
- zp->z_suspended = B_FALSE;
- }
- }
- mutex_exit(&zfsvfs->z_znodes_lock);
-
- if (!zfs_is_readonly(zfsvfs) && !zfsvfs->z_unmounted) {
- /*
- * zfs_suspend_fs() could have interrupted freeing
- * of dnodes. We need to restart this freeing so
- * that we don't "leak" the space.
- */
- zfs_unlinked_drain(zfsvfs);
- }
-
-bail:
- if (err != 0)
- zfsvfs->z_unmounted = B_TRUE;
-
- /* release the VFS ops */
- rw_exit(&zfsvfs->z_teardown_inactive_lock);
- rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
-
- if (err != 0) {
- /*
- * Since we couldn't set up the SA framework, try to force
- * unmount this file system.
- */
- if (zfsvfs->z_os)
- (void) zfs_umount(zfsvfs->z_sb);
- }
- return (err);
-}
-
-/*
- * Release VOPs and unmount a suspended filesystem.
- */
-int
-zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
-{
- ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock));
- ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
-
- /*
- * We already own this, so just hold and rele it to update the
- * objset_t, as the one we had before may have been evicted.
- */ - objset_t *os; - VERIFY3P(ds->ds_owner, ==, zfsvfs); - VERIFY(dsl_dataset_long_held(ds)); - VERIFY0(dmu_objset_from_ds(ds, &os)); - zfsvfs->z_os = os; - - /* release the VOPs */ - rw_exit(&zfsvfs->z_teardown_inactive_lock); - rrm_exit(&zfsvfs->z_teardown_lock, FTAG); - - /* - * Try to force unmount this file system. - */ - (void) zfs_umount(zfsvfs->z_sb); - zfsvfs->z_unmounted = B_TRUE; - return (0); -} - -int -zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) -{ - int error; - objset_t *os = zfsvfs->z_os; - dmu_tx_t *tx; - - if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) - return (SET_ERROR(EINVAL)); - - if (newvers < zfsvfs->z_version) - return (SET_ERROR(EINVAL)); - - if (zfs_spa_version_map(newvers) > - spa_version(dmu_objset_spa(zfsvfs->z_os))) - return (SET_ERROR(ENOTSUP)); - - tx = dmu_tx_create(os); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); - if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, - ZFS_SA_ATTRS); - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); - } - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - return (error); - } - - error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, - 8, 1, &newvers, tx); - - if (error) { - dmu_tx_commit(tx); - return (error); - } - - if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { - uint64_t sa_obj; - - ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=, - SPA_VERSION_SA); - sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, - DMU_OT_NONE, 0, tx); - - error = zap_add(os, MASTER_NODE_OBJ, - ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); - ASSERT0(error); - - VERIFY(0 == sa_set_sa_object(os, sa_obj)); - sa_register_update_callback(os, zfs_sa_upgrade); - } - - spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx, - "from %llu to %llu", zfsvfs->z_version, newvers); - - dmu_tx_commit(tx); - - zfsvfs->z_version = newvers; - os->os_version = newvers; - - zfs_set_fuid_feature(zfsvfs); - - return (0); -} - -/* - * Read a property stored within the master node. - */ -int -zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) -{ - uint64_t *cached_copy = NULL; - - /* - * Figure out where in the objset_t the cached copy would live, if it - * is available for the requested property. - */ - if (os != NULL) { - switch (prop) { - case ZFS_PROP_VERSION: - cached_copy = &os->os_version; - break; - case ZFS_PROP_NORMALIZE: - cached_copy = &os->os_normalization; - break; - case ZFS_PROP_UTF8ONLY: - cached_copy = &os->os_utf8only; - break; - case ZFS_PROP_CASE: - cached_copy = &os->os_casesensitivity; - break; - default: - break; - } - } - if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { - *value = *cached_copy; - return (0); - } - - /* - * If the property wasn't cached, look up the file system's value for - * the property. For the version property, we look up a slightly - * different string. 
- */ - const char *pname; - int error = ENOENT; - if (prop == ZFS_PROP_VERSION) - pname = ZPL_VERSION_STR; - else - pname = zfs_prop_to_name(prop); - - if (os != NULL) { - ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); - error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); - } - - if (error == ENOENT) { - /* No value set, use the default value */ - switch (prop) { - case ZFS_PROP_VERSION: - *value = ZPL_VERSION; - break; - case ZFS_PROP_NORMALIZE: - case ZFS_PROP_UTF8ONLY: - *value = 0; - break; - case ZFS_PROP_CASE: - *value = ZFS_CASE_SENSITIVE; - break; - case ZFS_PROP_ACLTYPE: - *value = ZFS_ACLTYPE_OFF; - break; - default: - return (error); - } - error = 0; - } - - /* - * If one of the methods for getting the property value above worked, - * copy it into the objset_t's cache. - */ - if (error == 0 && cached_copy != NULL) { - *cached_copy = *value; - } - - return (error); -} - -/* - * Return true if the corresponding vfs's unmounted flag is set. - * Otherwise return false. - * If this function returns true we know VFS unmount has been initiated. - */ -boolean_t -zfs_get_vfs_flag_unmounted(objset_t *os) -{ - zfsvfs_t *zfvp; - boolean_t unmounted = B_FALSE; - - ASSERT(dmu_objset_type(os) == DMU_OST_ZFS); - - mutex_enter(&os->os_user_ptr_lock); - zfvp = dmu_objset_get_user(os); - if (zfvp != NULL && zfvp->z_unmounted) - unmounted = B_TRUE; - mutex_exit(&os->os_user_ptr_lock); - - return (unmounted); -} - -struct objnode { - avl_node_t node; - uint64_t obj; -}; - -static int -objnode_compare(const void *o1, const void *o2) -{ - const struct objnode *obj1 = o1; - const struct objnode *obj2 = o2; - if (obj1->obj < obj2->obj) - return (-1); - if (obj1->obj > obj2->obj) - return (1); - return (0); -} - -objlist_t * -zfs_get_deleteq(objset_t *os) -{ - objlist_t *deleteq_objlist = objlist_create(); - uint64_t deleteq_obj; - zap_cursor_t zc; - zap_attribute_t za; - dmu_object_info_t doi; - - ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); - VERIFY0(dmu_object_info(os, MASTER_NODE_OBJ, &doi)); - ASSERT3U(doi.doi_type, ==, DMU_OT_MASTER_NODE); - - VERIFY0(zap_lookup(os, MASTER_NODE_OBJ, - ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj)); - - /* - * In order to insert objects into the objlist, they must be in sorted - * order. We don't know what order we'll get them out of the ZAP in, so - * we insert them into and remove them from an avl_tree_t to sort them. - */ - avl_tree_t at; - avl_create(&at, objnode_compare, sizeof (struct objnode), - offsetof(struct objnode, node)); - - for (zap_cursor_init(&zc, os, deleteq_obj); - zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { - struct objnode *obj = kmem_zalloc(sizeof (*obj), KM_SLEEP); - obj->obj = za.za_first_integer; - avl_add(&at, obj); - } - zap_cursor_fini(&zc); - - struct objnode *next, *found = avl_first(&at); - while (found != NULL) { - next = AVL_NEXT(&at, found); - objlist_insert(deleteq_objlist, found->obj); - found = next; - } - - void *cookie = NULL; - while ((found = avl_destroy_nodes(&at, &cookie)) != NULL) - kmem_free(found, sizeof (*found)); - avl_destroy(&at); - return (deleteq_objlist); -} - - -void -zfs_init(void) -{ - zfsctl_init(); - zfs_znode_init(); - dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb); - register_filesystem(&zpl_fs_type); -} - -void -zfs_fini(void) -{ - /* - * we don't use outstanding because zpl_posix_acl_free might add more. 
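zfs_get_deleteq() above has to feed objlist_insert() in ascending object order, and ZAP iteration order is effectively arbitrary, hence the throwaway AVL tree. The sketch below shows the same sort step with qsort(3) standing in for the AVL tree; objnode_compare()'s three-way comparison carries over unchanged:

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

/* Same three-way comparison as objnode_compare() above. */
static int
obj_compare(const void *o1, const void *o2)
{
	uint64_t a = *(const uint64_t *)o1;
	uint64_t b = *(const uint64_t *)o2;

	if (a < b)
		return (-1);
	if (a > b)
		return (1);
	return (0);
}

int
main(void)
{
	/* Object numbers in the order a ZAP cursor might yield them. */
	uint64_t objs[] = { 42, 7, 128, 9 };
	int n = sizeof (objs) / sizeof (objs[0]);

	qsort(objs, n, sizeof (objs[0]), obj_compare);
	for (int i = 0; i < n; i++)	/* 7 9 42 128: valid insertion order */
		printf("%llu\n", (unsigned long long)objs[i]);
	return (0);
}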
- */ - taskq_wait(system_delay_taskq); - taskq_wait(system_taskq); - unregister_filesystem(&zpl_fs_type); - zfs_znode_fini(); - zfsctl_fini(); -} - -#if defined(_KERNEL) -EXPORT_SYMBOL(zfs_suspend_fs); -EXPORT_SYMBOL(zfs_resume_fs); -EXPORT_SYMBOL(zfs_userspace_one); -EXPORT_SYMBOL(zfs_userspace_many); -EXPORT_SYMBOL(zfs_set_userquota); -EXPORT_SYMBOL(zfs_id_overblockquota); -EXPORT_SYMBOL(zfs_id_overobjquota); -EXPORT_SYMBOL(zfs_id_overquota); -EXPORT_SYMBOL(zfs_set_version); -EXPORT_SYMBOL(zfsvfs_create); -EXPORT_SYMBOL(zfsvfs_free); -EXPORT_SYMBOL(zfs_is_readonly); -EXPORT_SYMBOL(zfs_domount); -EXPORT_SYMBOL(zfs_preumount); -EXPORT_SYMBOL(zfs_umount); -EXPORT_SYMBOL(zfs_remount); -EXPORT_SYMBOL(zfs_statvfs); -EXPORT_SYMBOL(zfs_vget); -EXPORT_SYMBOL(zfs_prune); -#endif diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c deleted file mode 100644 index de7b59935..000000000 --- a/module/zfs/zfs_vnops.c +++ /dev/null @@ -1,5275 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. - * Copyright (c) 2015 by Chunwei Chen. All rights reserved. - * Copyright 2017 Nexenta Systems, Inc. - */ - -/* Portions Copyright 2007 Jeremy Teo */ -/* Portions Copyright 2010 Robert Milkowski */ - - -#include <sys/types.h> -#include <sys/param.h> -#include <sys/time.h> -#include <sys/sysmacros.h> -#include <sys/vfs.h> -#include <sys/file.h> -#include <sys/stat.h> -#include <sys/kmem.h> -#include <sys/taskq.h> -#include <sys/uio.h> -#include <sys/vmsystm.h> -#include <sys/atomic.h> -#include <sys/pathname.h> -#include <sys/cmn_err.h> -#include <sys/errno.h> -#include <sys/zfs_dir.h> -#include <sys/zfs_acl.h> -#include <sys/zfs_ioctl.h> -#include <sys/fs/zfs.h> -#include <sys/dmu.h> -#include <sys/dmu_objset.h> -#include <sys/spa.h> -#include <sys/txg.h> -#include <sys/dbuf.h> -#include <sys/zap.h> -#include <sys/sa.h> -#include <sys/policy.h> -#include <sys/sunddi.h> -#include <sys/sid.h> -#include <sys/mode.h> -#include <sys/zfs_ctldir.h> -#include <sys/zfs_fuid.h> -#include <sys/zfs_sa.h> -#include <sys/zfs_vnops.h> -#include <sys/zfs_rlock.h> -#include <sys/cred.h> -#include <sys/zpl.h> -#include <sys/zil.h> -#include <sys/sa_impl.h> - -/* - * Programming rules. - * - * Each vnode op performs some logical unit of work. To do this, the ZPL must - * properly lock its in-core state, create a DMU transaction, do the work, - * record this work in the intent log (ZIL), commit the DMU transaction, - * and wait for the intent log to commit if it is a synchronous operation. - * Moreover, the vnode ops must work in both normal and log replay context. 
- * The ordering of events is important to avoid deadlocks and references - * to freed memory. The example below illustrates the following Big Rules: - * - * (1) A check must be made in each zfs thread for a mounted file system. - * This is done avoiding races using ZFS_ENTER(zfsvfs). - * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes - * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros - * can return EIO from the calling function. - * - * (2) iput() should always be the last thing except for zil_commit() - * (if necessary) and ZFS_EXIT(). This is for 3 reasons: - * First, if it's the last reference, the vnode/znode - * can be freed, so the zp may point to freed memory. Second, the last - * reference will call zfs_zinactive(), which may induce a lot of work -- - * pushing cached pages (which acquires range locks) and syncing out - * cached atime changes. Third, zfs_zinactive() may require a new tx, - * which could deadlock the system if you were already holding one. - * If you must call iput() within a tx then use zfs_iput_async(). - * - * (3) All range locks must be grabbed before calling dmu_tx_assign(), - * as they can span dmu_tx_assign() calls. - * - * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to - * dmu_tx_assign(). This is critical because we don't want to block - * while holding locks. - * - * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This - * reduces lock contention and CPU usage when we must wait (note that if - * throughput is constrained by the storage, nearly every transaction - * must wait). - * - * Note, in particular, that if a lock is sometimes acquired before - * the tx assigns, and sometimes after (e.g. z_lock), then failing - * to use a non-blocking assign can deadlock the system. The scenario: - * - * Thread A has grabbed a lock before calling dmu_tx_assign(). - * Thread B is in an already-assigned tx, and blocks for this lock. - * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() - * forever, because the previous txg can't quiesce until B's tx commits. - * - * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, - * then drop all locks, call dmu_tx_wait(), and try again. On subsequent - * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, - * to indicate that this operation has already called dmu_tx_wait(). - * This will ensure that we don't retry forever, waiting a short bit - * each time. - * - * (5) If the operation succeeded, generate the intent log entry for it - * before dropping locks. This ensures that the ordering of events - * in the intent log matches the order in which they actually occurred. - * During ZIL replay the zfs_log_* functions will update the sequence - * number to indicate the zil transaction has replayed. - * - * (6) At the end of each vnode op, the DMU tx must always commit, - * regardless of whether there were any errors. - * - * (7) After dropping all locks, invoke zil_commit(zilog, foid) - * to ensure that synchronous semantics are provided when necessary. - * - * In general, this is how things should be ordered in each vnode op: - * - * ZFS_ENTER(zfsvfs); // exit if unmounted - * top: - * zfs_dirent_lock(&dl, ...) // lock directory entry (may igrab()) - * rw_enter(...); // grab any other locks you need - * tx = dmu_tx_create(...); // get DMU tx - * dmu_tx_hold_*(); // hold each object you might modify - * error = dmu_tx_assign(tx, (waited ? 
TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
- * if (error) {
- * rw_exit(...); // drop locks
- * zfs_dirent_unlock(dl); // unlock directory entry
- * iput(...); // release held vnodes
- * if (error == ERESTART) {
- * waited = B_TRUE;
- * dmu_tx_wait(tx);
- * dmu_tx_abort(tx);
- * goto top;
- * }
- * dmu_tx_abort(tx); // abort DMU tx
- * ZFS_EXIT(zfsvfs); // finished in zfs
- * return (error); // really out of space
- * }
- * error = do_real_work(); // do whatever this VOP does
- * if (error == 0)
- * zfs_log_*(...); // on success, make ZIL entry
- * dmu_tx_commit(tx); // commit DMU tx -- error or not
- * rw_exit(...); // drop locks
- * zfs_dirent_unlock(dl); // unlock directory entry
- * iput(...); // release held vnodes
- * zil_commit(zilog, foid); // synchronous when necessary
- * ZFS_EXIT(zfsvfs); // finished in zfs
- * return (error); // done, report error
- */
-
-/*
- * Virus scanning is unsupported. It would be possible to add a hook
- * here to perform the required virus scan. This could be done
- * entirely in the kernel or potentially as an update to invoke a
- * scanning utility.
- */
-static int
-zfs_vscan(struct inode *ip, cred_t *cr, int async)
-{
- return (0);
-}
-
-/* ARGSUSED */
-int
-zfs_open(struct inode *ip, int mode, int flag, cred_t *cr)
-{
- znode_t *zp = ITOZ(ip);
- zfsvfs_t *zfsvfs = ITOZSB(ip);
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- /* Honor ZFS_APPENDONLY file attribute */
- if ((mode & FMODE_WRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
- ((flag & O_APPEND) == 0)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EPERM));
- }
-
- /* Virus scan eligible files on open */
- if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(ip->i_mode) &&
- !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
- if (zfs_vscan(ip, cr, 0) != 0) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EACCES));
- }
- }
-
- /* Keep a count of the synchronous opens in the znode */
- if (flag & O_SYNC)
- atomic_inc_32(&zp->z_sync_cnt);
-
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
-/* ARGSUSED */
-int
-zfs_close(struct inode *ip, int flag, cred_t *cr)
-{
- znode_t *zp = ITOZ(ip);
- zfsvfs_t *zfsvfs = ITOZSB(ip);
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- /* Decrement the synchronous opens in the znode */
- if (flag & O_SYNC)
- atomic_dec_32(&zp->z_sync_cnt);
-
- if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(ip->i_mode) &&
- !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
- VERIFY(zfs_vscan(ip, cr, 1) == 0);
-
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
-#if defined(SEEK_HOLE) && defined(SEEK_DATA)
-/*
- * Lseek support for finding holes (cmd == SEEK_HOLE) and
- * data (cmd == SEEK_DATA). "off" is an in/out parameter.
- */
-static int
-zfs_holey_common(struct inode *ip, int cmd, loff_t *off)
-{
- znode_t *zp = ITOZ(ip);
- uint64_t noff = (uint64_t)*off; /* new offset */
- uint64_t file_sz;
- int error;
- boolean_t hole;
-
- file_sz = zp->z_size;
- if (noff >= file_sz) {
- return (SET_ERROR(ENXIO));
- }
-
- if (cmd == SEEK_HOLE)
- hole = B_TRUE;
- else
- hole = B_FALSE;
-
- error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff);
-
- if (error == ESRCH)
- return (SET_ERROR(ENXIO));
-
- /* file was dirty, so fall back to using generic logic */
- if (error == EBUSY) {
- if (hole)
- *off = file_sz;
-
- return (0);
- }
-
- /*
- * We could find a hole that begins after the logical end-of-file,
- * because dmu_offset_next() only works on whole blocks.
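The "goto top" retry idiom in the Programming rules comment above is easy to get wrong, so here is a compact, compilable rendering with the DMU calls stubbed out. The flag values and stub behavior are illustrative, not the real definitions: drop locks, wait once outside the tx, then retry with TXG_NOTHROTTLE so the operation is not throttled again after it has already waited.

#include <stdio.h>

#define	ERESTART	85
#define	TXG_NOWAIT	0x0	/* illustrative values, not the real flags */
#define	TXG_NOTHROTTLE	0x2

static int attempts;

/* Stub: fail with ERESTART once, then succeed, the way a throttled
 * dmu_tx_assign() might. */
static int
tx_assign(int flags)
{
	(void) flags;
	return (attempts++ == 0 ? ERESTART : 0);
}

static void tx_wait(void) { printf("waiting for open txg\n"); }
static void tx_abort(void) { printf("aborting tx\n"); }

int
main(void)
{
	int waited = 0;
	int error;
top:
	/* ZPL locks would be taken here, before the tx is assigned. */
	error = tx_assign(waited ? TXG_NOTHROTTLE | TXG_NOWAIT : TXG_NOWAIT);
	if (error == ERESTART) {
		waited = 1;
		tx_wait();	/* wait once, without holding ZPL locks */
		tx_abort();
		goto top;	/* re-take locks and retry as above */
	}
	printf("assigned after %d attempt(s)\n", attempts);
	return (0);
}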
If the - * EOF falls mid-block, then indicate that the "virtual hole" - * at the end of the file begins at the logical EOF, rather than - * at the end of the last block. - */ - if (noff > file_sz) { - ASSERT(hole); - noff = file_sz; - } - - if (noff < *off) - return (error); - *off = noff; - return (error); -} - -int -zfs_holey(struct inode *ip, int cmd, loff_t *off) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - int error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - error = zfs_holey_common(ip, cmd, off); - - ZFS_EXIT(zfsvfs); - return (error); -} -#endif /* SEEK_HOLE && SEEK_DATA */ - -#if defined(_KERNEL) -/* - * When a file is memory mapped, we must keep the IO data synchronized - * between the DMU cache and the memory mapped pages. What this means: - * - * On Write: If we find a memory mapped page, we write to *both* - * the page and the dmu buffer. - */ -static void -update_pages(struct inode *ip, int64_t start, int len, - objset_t *os, uint64_t oid) -{ - struct address_space *mp = ip->i_mapping; - struct page *pp; - uint64_t nbytes; - int64_t off; - void *pb; - - off = start & (PAGE_SIZE-1); - for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { - nbytes = MIN(PAGE_SIZE - off, len); - - pp = find_lock_page(mp, start >> PAGE_SHIFT); - if (pp) { - if (mapping_writably_mapped(mp)) - flush_dcache_page(pp); - - pb = kmap(pp); - (void) dmu_read(os, oid, start+off, nbytes, pb+off, - DMU_READ_PREFETCH); - kunmap(pp); - - if (mapping_writably_mapped(mp)) - flush_dcache_page(pp); - - mark_page_accessed(pp); - SetPageUptodate(pp); - ClearPageError(pp); - unlock_page(pp); - put_page(pp); - } - - len -= nbytes; - off = 0; - } -} - -/* - * When a file is memory mapped, we must keep the IO data synchronized - * between the DMU cache and the memory mapped pages. What this means: - * - * On Read: We "read" preferentially from memory mapped pages, - * else we default from the dmu buffer. - * - * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when - * the file is memory mapped. - */ -static int -mappedread(struct inode *ip, int nbytes, uio_t *uio) -{ - struct address_space *mp = ip->i_mapping; - struct page *pp; - znode_t *zp = ITOZ(ip); - int64_t start, off; - uint64_t bytes; - int len = nbytes; - int error = 0; - void *pb; - - start = uio->uio_loffset; - off = start & (PAGE_SIZE-1); - for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { - bytes = MIN(PAGE_SIZE - off, len); - - pp = find_lock_page(mp, start >> PAGE_SHIFT); - if (pp) { - ASSERT(PageUptodate(pp)); - unlock_page(pp); - - pb = kmap(pp); - error = uiomove(pb + off, bytes, UIO_READ, uio); - kunmap(pp); - - if (mapping_writably_mapped(mp)) - flush_dcache_page(pp); - - mark_page_accessed(pp); - put_page(pp); - } else { - error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), - uio, bytes); - } - - len -= bytes; - off = 0; - if (error) - break; - } - return (error); -} -#endif /* _KERNEL */ - -unsigned long zfs_read_chunk_size = 1024 * 1024; /* Tunable */ -unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT; - -/* - * Read bytes from specified file into supplied buffer. - * - * IN: ip - inode of file to be read from. - * uio - structure supplying read location, range info, - * and return buffer. - * ioflag - FSYNC flags; used to provide FRSYNC semantics. - * O_DIRECT flag; used to bypass page cache. - * cr - credentials of caller. - * - * OUT: uio - updated offset and range, buffer filled. - * - * RETURN: 0 on success, error code on failure. 
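update_pages() and mappedread() above share the same windowing arithmetic: only the first page can be entered at a nonzero intra-page offset, and every window is clipped to the remaining length. A standalone walk of those windows for a sample offset and length, with PAGE_SIZE fixed at 4 KiB for the example:

#include <stdio.h>
#include <stdint.h>

#define	PAGE_SIZE	4096
#define	PAGE_MASK	(~((int64_t)PAGE_SIZE - 1))

static uint64_t min_u64(uint64_t a, uint64_t b) { return (a < b ? a : b); }

int
main(void)
{
	int64_t start = 5000;	/* file offset of the I/O */
	int len = 6000;		/* bytes to transfer */
	int64_t off = start & (PAGE_SIZE - 1);

	/* Walk page-aligned windows; only the first iteration can have a
	 * nonzero intra-page offset, exactly as in the loops above. */
	for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
		uint64_t nbytes = min_u64(PAGE_SIZE - off, len);

		printf("page@%lld: copy %llu bytes at intra-page offset %lld\n",
		    (long long)start, (unsigned long long)nbytes,
		    (long long)off);
		len -= nbytes;
		off = 0;
	}
	return (0);
}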
- * - * Side Effects: - * inode - atime updated if byte count > 0 - */ -/* ARGSUSED */ -int -zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) -{ - int error = 0; - boolean_t frsync = B_FALSE; - - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if (zp->z_pflags & ZFS_AV_QUARANTINED) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EACCES)); - } - - /* - * Validate file offset - */ - if (uio->uio_loffset < (offset_t)0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - /* - * Fasttrack empty reads - */ - if (uio->uio_resid == 0) { - ZFS_EXIT(zfsvfs); - return (0); - } - -#ifdef FRSYNC - /* - * If we're in FRSYNC mode, sync out this znode before reading it. - * Only do this for non-snapshots. - * - * Some platforms do not support FRSYNC and instead map it - * to FSYNC, which results in unnecessary calls to zil_commit. We - * only honor FRSYNC requests on platforms which support it. - */ - frsync = !!(ioflag & FRSYNC); -#endif - if (zfsvfs->z_log && - (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) - zil_commit(zfsvfs->z_log, zp->z_id); - - /* - * Lock the range against changes. - */ - locked_range_t *lr = rangelock_enter(&zp->z_rangelock, - uio->uio_loffset, uio->uio_resid, RL_READER); - - /* - * If we are reading past end-of-file we can skip - * to the end; but we might still need to set atime. - */ - if (uio->uio_loffset >= zp->z_size) { - error = 0; - goto out; - } - - ASSERT(uio->uio_loffset < zp->z_size); - ssize_t n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset); - ssize_t start_resid = n; - -#ifdef HAVE_UIO_ZEROCOPY - xuio_t *xuio = NULL; - if ((uio->uio_extflg == UIO_XUIO) && - (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) { - int nblk; - int blksz = zp->z_blksz; - uint64_t offset = uio->uio_loffset; - - xuio = (xuio_t *)uio; - if ((ISP2(blksz))) { - nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset, - blksz)) / blksz; - } else { - ASSERT(offset + n <= blksz); - nblk = 1; - } - (void) dmu_xuio_init(xuio, nblk); - - if (vn_has_cached_data(ip)) { - /* - * For simplicity, we always allocate a full buffer - * even if we only expect to read a portion of a block. - */ - while (--nblk >= 0) { - (void) dmu_xuio_add(xuio, - dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), - blksz), 0, blksz); - } - } - } -#endif /* HAVE_UIO_ZEROCOPY */ - - while (n > 0) { - ssize_t nbytes = MIN(n, zfs_read_chunk_size - - P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); - - if (zp->z_is_mapped && !(ioflag & O_DIRECT)) { - error = mappedread(ip, nbytes, uio); - } else { - error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), - uio, nbytes); - } - - if (error) { - /* convert checksum errors into IO errors */ - if (error == ECKSUM) - error = SET_ERROR(EIO); - break; - } - - n -= nbytes; - } - - int64_t nread = start_resid - n; - dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread); - task_io_account_read(nread); -out: - rangelock_exit(lr); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Write the bytes to a file. - * - * IN: ip - inode of file to be written to. - * uio - structure supplying write location, range info, - * and data buffer. - * ioflag - FAPPEND flag set if in append mode. - * O_DIRECT flag; used to bypass page cache. - * cr - credentials of caller. - * - * OUT: uio - updated offset and range. 
- * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * ip - ctime|mtime updated if byte count > 0 - */ - -/* ARGSUSED */ -int -zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) -{ - int error = 0; - ssize_t start_resid = uio->uio_resid; - - /* - * Fasttrack empty write - */ - ssize_t n = start_resid; - if (n == 0) - return (0); - - rlim64_t limit = uio->uio_limit; - if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) - limit = MAXOFFSET_T; - - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ZTOZSB(zp); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - sa_bulk_attr_t bulk[4]; - int count = 0; - uint64_t mtime[2], ctime[2]; - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, - &zp->z_size, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, - &zp->z_pflags, 8); - - /* - * Callers might not be able to detect properly that we are read-only, - * so check it explicitly here. - */ - if (zfs_is_readonly(zfsvfs)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EROFS)); - } - - /* - * If immutable or not appending then return EPERM - */ - if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || - ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && - (uio->uio_loffset < zp->z_size))) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EPERM)); - } - - /* - * Validate file offset - */ - offset_t woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset; - if (woff < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - int max_blksz = zfsvfs->z_max_blksz; - xuio_t *xuio = NULL; - - /* - * Pre-fault the pages to ensure slow (eg NFS) pages - * don't hold up txg. - * Skip this if uio contains loaned arc_buf. - */ -#ifdef HAVE_UIO_ZEROCOPY - if ((uio->uio_extflg == UIO_XUIO) && - (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) - xuio = (xuio_t *)uio; - else -#endif - if (uio_prefaultpages(MIN(n, max_blksz), uio)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EFAULT)); - } - - /* - * If in append mode, set the io offset pointer to eof. - */ - locked_range_t *lr; - if (ioflag & FAPPEND) { - /* - * Obtain an appending range lock to guarantee file append - * semantics. We reset the write offset once we have the lock. - */ - lr = rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND); - woff = lr->lr_offset; - if (lr->lr_length == UINT64_MAX) { - /* - * We overlocked the file because this write will cause - * the file block size to increase. - * Note that zp_size cannot change with this lock held. - */ - woff = zp->z_size; - } - uio->uio_loffset = woff; - } else { - /* - * Note that if the file block size will change as a result of - * this write, then this range lock will lock the entire file - * so that we can re-write the block safely. - */ - lr = rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER); - } - - if (woff >= limit) { - rangelock_exit(lr); - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EFBIG)); - } - - if ((woff + n) > limit || woff > (limit - n)) - n = limit - woff; - - /* Will this write extend the file length? */ - int write_eof = (woff + n > zp->z_size); - - uint64_t end_size = MAX(zp->z_size, woff + n); - zilog_t *zilog = zfsvfs->z_log; -#ifdef HAVE_UIO_ZEROCOPY - int i_iov = 0; - const iovec_t *iovp = uio->uio_iov; - ASSERTV(int iovcnt = uio->uio_iovcnt); -#endif - - - /* - * Write the file in reasonable size chunks. 
Each chunk is written - * in a separate transaction; this keeps the intent log records small - * and allows us to do more fine-grained space accounting. - */ - while (n > 0) { - woff = uio->uio_loffset; - - if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, - KUID_TO_SUID(ip->i_uid)) || - zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, - KGID_TO_SGID(ip->i_gid)) || - (zp->z_projid != ZFS_DEFAULT_PROJID && - zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, - zp->z_projid))) { - error = SET_ERROR(EDQUOT); - break; - } - - arc_buf_t *abuf = NULL; - const iovec_t *aiov = NULL; - if (xuio) { -#ifdef HAVE_UIO_ZEROCOPY - ASSERT(i_iov < iovcnt); - ASSERT3U(uio->uio_segflg, !=, UIO_BVEC); - aiov = &iovp[i_iov]; - abuf = dmu_xuio_arcbuf(xuio, i_iov); - dmu_xuio_clear(xuio, i_iov); - ASSERT((aiov->iov_base == abuf->b_data) || - ((char *)aiov->iov_base - (char *)abuf->b_data + - aiov->iov_len == arc_buf_size(abuf))); - i_iov++; -#endif - } else if (n >= max_blksz && woff >= zp->z_size && - P2PHASE(woff, max_blksz) == 0 && - zp->z_blksz == max_blksz) { - /* - * This write covers a full block. "Borrow" a buffer - * from the dmu so that we can fill it before we enter - * a transaction. This avoids the possibility of - * holding up the transaction if the data copy hangs - * up on a pagefault (e.g., from an NFS server mapping). - */ - size_t cbytes; - - abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), - max_blksz); - ASSERT(abuf != NULL); - ASSERT(arc_buf_size(abuf) == max_blksz); - if ((error = uiocopy(abuf->b_data, max_blksz, - UIO_WRITE, uio, &cbytes))) { - dmu_return_arcbuf(abuf); - break; - } - ASSERT(cbytes == max_blksz); - } - - /* - * Start a transaction. - */ - dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); - DB_DNODE_ENTER(db); - dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, - MIN(n, max_blksz)); - DB_DNODE_EXIT(db); - zfs_sa_upgrade_txholds(tx, zp); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - if (abuf != NULL) - dmu_return_arcbuf(abuf); - break; - } - - /* - * If rangelock_enter() over-locked we grow the blocksize - * and then reduce the lock range. This will only happen - * on the first iteration since rangelock_reduce() will - * shrink down lr_length to the appropriate size. - */ - if (lr->lr_length == UINT64_MAX) { - uint64_t new_blksz; - - if (zp->z_blksz > max_blksz) { - /* - * File's blocksize is already larger than the - * "recordsize" property. Only let it grow to - * the next power of 2. - */ - ASSERT(!ISP2(zp->z_blksz)); - new_blksz = MIN(end_size, - 1 << highbit64(zp->z_blksz)); - } else { - new_blksz = MIN(end_size, max_blksz); - } - zfs_grow_blocksize(zp, new_blksz, tx); - rangelock_reduce(lr, woff, n); - } - - /* - * XXX - should we really limit each write to z_max_blksz? - * Perhaps we should use SPA_MAXBLOCKSIZE chunks? 
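-	 *
-	 * As a worked example of the chunking below (illustrative
-	 * numbers, not taken from this code): with max_blksz = 128K,
-	 * woff = 130K and n = 1M, P2PHASE(woff, max_blksz) = 2K, so the
-	 * first chunk is trimmed to nbytes = 126K; every subsequent
-	 * chunk then starts block-aligned and covers a full 128K until
-	 * the residual tail.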
- */ - ssize_t nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); - - ssize_t tx_bytes; - if (abuf == NULL) { - tx_bytes = uio->uio_resid; - uio->uio_fault_disable = B_TRUE; - error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), - uio, nbytes, tx); - uio->uio_fault_disable = B_FALSE; - if (error == EFAULT) { - dmu_tx_commit(tx); - if (uio_prefaultpages(MIN(n, max_blksz), uio)) { - break; - } - continue; - } else if (error != 0) { - dmu_tx_commit(tx); - break; - } - tx_bytes -= uio->uio_resid; - } else { - tx_bytes = nbytes; - ASSERT(xuio == NULL || tx_bytes == aiov->iov_len); - /* - * If this is not a full block write, but we are - * extending the file past EOF and this data starts - * block-aligned, use assign_arcbuf(). Otherwise, - * write via dmu_write(). - */ - if (tx_bytes < max_blksz && (!write_eof || - aiov->iov_base != abuf->b_data)) { - ASSERT(xuio); - dmu_write(zfsvfs->z_os, zp->z_id, woff, - /* cppcheck-suppress nullPointer */ - aiov->iov_len, aiov->iov_base, tx); - dmu_return_arcbuf(abuf); - xuio_stat_wbuf_copied(); - } else { - ASSERT(xuio || tx_bytes == max_blksz); - error = dmu_assign_arcbuf_by_dbuf( - sa_get_db(zp->z_sa_hdl), woff, abuf, tx); - if (error != 0) { - dmu_return_arcbuf(abuf); - dmu_tx_commit(tx); - break; - } - } - ASSERT(tx_bytes <= uio->uio_resid); - uioskip(uio, tx_bytes); - } - if (tx_bytes && zp->z_is_mapped && !(ioflag & O_DIRECT)) { - update_pages(ip, woff, - tx_bytes, zfsvfs->z_os, zp->z_id); - } - - /* - * If we made no progress, we're done. If we made even - * partial progress, update the znode and ZIL accordingly. - */ - if (tx_bytes == 0) { - (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), - (void *)&zp->z_size, sizeof (uint64_t), tx); - dmu_tx_commit(tx); - ASSERT(error != 0); - break; - } - - /* - * Clear Set-UID/Set-GID bits on successful write if not - * privileged and at least one of the execute bits is set. - * - * It would be nice to do this after all writes have - * been done, but that would still expose the ISUID/ISGID - * to another app after the partial write is committed. - * - * Note: we don't call zfs_fuid_map_id() here because - * user 0 is not an ephemeral uid. - */ - mutex_enter(&zp->z_acl_lock); - uint32_t uid = KUID_TO_SUID(ip->i_uid); - if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | - (S_IXUSR >> 6))) != 0 && - (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && - secpolicy_vnode_setid_retain(cr, - ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) { - uint64_t newmode; - zp->z_mode &= ~(S_ISUID | S_ISGID); - ip->i_mode = newmode = zp->z_mode; - (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), - (void *)&newmode, sizeof (uint64_t), tx); - } - mutex_exit(&zp->z_acl_lock); - - zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); - - /* - * Update the file size (zp_size) if it has changed; - * account for possible concurrent updates. - */ - while ((end_size = zp->z_size) < uio->uio_loffset) { - (void) atomic_cas_64(&zp->z_size, end_size, - uio->uio_loffset); - ASSERT(error == 0); - } - /* - * If we are replaying and eof is non zero then force - * the file size to the specified eof. Note, there's no - * concurrency during replay. 
- */ - if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) - zp->z_size = zfsvfs->z_replay_eof; - - error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); - - zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag, - NULL, NULL); - dmu_tx_commit(tx); - - if (error != 0) - break; - ASSERT(tx_bytes == nbytes); - n -= nbytes; - - if (!xuio && n > 0) { - if (uio_prefaultpages(MIN(n, max_blksz), uio)) { - error = EFAULT; - break; - } - } - } - - zfs_inode_update(zp); - rangelock_exit(lr); - - /* - * If we're in replay mode, or we made no progress, return error. - * Otherwise, it's at least a partial write, so it's successful. - */ - if (zfsvfs->z_replay || uio->uio_resid == start_resid) { - ZFS_EXIT(zfsvfs); - return (error); - } - - if (ioflag & (FSYNC | FDSYNC) || - zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, zp->z_id); - - int64_t nwritten = start_resid - uio->uio_resid; - dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten); - task_io_account_write(nwritten); - - ZFS_EXIT(zfsvfs); - return (0); -} - -/* - * Drop a reference on the passed inode asynchronously. This ensures - * that the caller will never drop the last reference on an inode in - * the current context. Doing so while holding open a tx could result - * in a deadlock if iput_final() re-enters the filesystem code. - */ -void -zfs_iput_async(struct inode *ip) -{ - objset_t *os = ITOZSB(ip)->z_os; - - ASSERT(atomic_read(&ip->i_count) > 0); - ASSERT(os != NULL); - - if (atomic_read(&ip->i_count) == 1) - VERIFY(taskq_dispatch(dsl_pool_iput_taskq(dmu_objset_pool(os)), - (task_func_t *)iput, ip, TQ_SLEEP) != TASKQID_INVALID); - else - iput(ip); -} - -/* ARGSUSED */ -void -zfs_get_done(zgd_t *zgd, int error) -{ - znode_t *zp = zgd->zgd_private; - - if (zgd->zgd_db) - dmu_buf_rele(zgd->zgd_db, zgd); - - rangelock_exit(zgd->zgd_lr); - - /* - * Release the vnode asynchronously as we currently have the - * txg stopped from syncing. - */ - zfs_iput_async(ZTOI(zp)); - - kmem_free(zgd, sizeof (zgd_t)); -} - -#ifdef DEBUG -static int zil_fault_io = 0; -#endif - -/* - * Get data to generate a TX_WRITE intent log record. - */ -int -zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) -{ - zfsvfs_t *zfsvfs = arg; - objset_t *os = zfsvfs->z_os; - znode_t *zp; - uint64_t object = lr->lr_foid; - uint64_t offset = lr->lr_offset; - uint64_t size = lr->lr_length; - dmu_buf_t *db; - zgd_t *zgd; - int error = 0; - - ASSERT3P(lwb, !=, NULL); - ASSERT3P(zio, !=, NULL); - ASSERT3U(size, !=, 0); - - /* - * Nothing to do if the file has been removed - */ - if (zfs_zget(zfsvfs, object, &zp) != 0) - return (SET_ERROR(ENOENT)); - if (zp->z_unlinked) { - /* - * Release the vnode asynchronously as we currently have the - * txg stopped from syncing. - */ - zfs_iput_async(ZTOI(zp)); - return (SET_ERROR(ENOENT)); - } - - zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP); - zgd->zgd_lwb = lwb; - zgd->zgd_private = zp; - - /* - * Write records come in two flavors: immediate and indirect. - * For small writes it's cheaper to store the data with the - * log record (immediate); for large writes it's cheaper to - * sync the data and get a pointer to it (indirect) so that - * we don't have to write the data twice. 
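-	 *
-	 * In outline, the branches below reduce to (a condensed sketch
-	 * of the code that follows, not new behavior):
-	 *
-	 *	if (buf != NULL)	// immediate: copy data into buf
-	 *		error = dmu_read(os, object, offset, size, buf, ...);
-	 *	else			// indirect: sync block, point at it
-	 *		error = dmu_sync(zio, lr->lr_common.lrc_txg,
-	 *		    zfs_get_done, zgd);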
- */ - if (buf != NULL) { /* immediate write */ - zgd->zgd_lr = rangelock_enter(&zp->z_rangelock, - offset, size, RL_READER); - /* test for truncation needs to be done while range locked */ - if (offset >= zp->z_size) { - error = SET_ERROR(ENOENT); - } else { - error = dmu_read(os, object, offset, size, buf, - DMU_READ_NO_PREFETCH); - } - ASSERT(error == 0 || error == ENOENT); - } else { /* indirect write */ - /* - * Have to lock the whole block to ensure when it's - * written out and its checksum is being calculated - * that no one can change the data. We need to re-check - * blocksize after we get the lock in case it's changed! - */ - for (;;) { - uint64_t blkoff; - size = zp->z_blksz; - blkoff = ISP2(size) ? P2PHASE(offset, size) : offset; - offset -= blkoff; - zgd->zgd_lr = rangelock_enter(&zp->z_rangelock, - offset, size, RL_READER); - if (zp->z_blksz == size) - break; - offset += blkoff; - rangelock_exit(zgd->zgd_lr); - } - /* test for truncation needs to be done while range locked */ - if (lr->lr_offset >= zp->z_size) - error = SET_ERROR(ENOENT); -#ifdef DEBUG - if (zil_fault_io) { - error = SET_ERROR(EIO); - zil_fault_io = 0; - } -#endif - if (error == 0) - error = dmu_buf_hold(os, object, offset, zgd, &db, - DMU_READ_NO_PREFETCH); - - if (error == 0) { - blkptr_t *bp = &lr->lr_blkptr; - - zgd->zgd_db = db; - zgd->zgd_bp = bp; - - ASSERT(db->db_offset == offset); - ASSERT(db->db_size == size); - - error = dmu_sync(zio, lr->lr_common.lrc_txg, - zfs_get_done, zgd); - ASSERT(error || lr->lr_length <= size); - - /* - * On success, we need to wait for the write I/O - * initiated by dmu_sync() to complete before we can - * release this dbuf. We will finish everything up - * in the zfs_get_done() callback. - */ - if (error == 0) - return (0); - - if (error == EALREADY) { - lr->lr_common.lrc_txtype = TX_WRITE2; - /* - * TX_WRITE2 relies on the data previously - * written by the TX_WRITE that caused - * EALREADY. We zero out the BP because - * it is the old, currently-on-disk BP. - */ - zgd->zgd_bp = NULL; - BP_ZERO(bp); - error = 0; - } - } - } - - zfs_get_done(zgd, error); - - return (error); -} - -/*ARGSUSED*/ -int -zfs_access(struct inode *ip, int mode, int flag, cred_t *cr) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - int error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if (flag & V_ACE_MASK) - error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); - else - error = zfs_zaccess_rwx(zp, mode, flag, cr); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Lookup an entry in a directory, or an extended attribute directory. - * If it exists, return a held inode reference for it. - * - * IN: dip - inode of directory to search. - * nm - name of entry to lookup. - * flags - LOOKUP_XATTR set if looking for an attribute. - * cr - credentials of caller. - * direntflags - directory lookup flags - * realpnp - returned pathname. - * - * OUT: ipp - inode of located entry, NULL if not found. - * - * RETURN: 0 on success, error code on failure. - * - * Timestamps: - * NA - */ -/* ARGSUSED */ -int -zfs_lookup(struct inode *dip, char *nm, struct inode **ipp, int flags, - cred_t *cr, int *direntflags, pathname_t *realpnp) -{ - znode_t *zdp = ITOZ(dip); - zfsvfs_t *zfsvfs = ITOZSB(dip); - int error = 0; - - /* - * Fast path lookup, however we must skip DNLC lookup - * for case folding or normalizing lookups because the - * DNLC code only stores the passed in name. 
This means - * creating 'a' and removing 'A' on a case insensitive - * file system would work, but DNLC still thinks 'a' - * exists and won't let you create it again on the next - * pass through fast path. - */ - if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) { - - if (!S_ISDIR(dip->i_mode)) { - return (SET_ERROR(ENOTDIR)); - } else if (zdp->z_sa_hdl == NULL) { - return (SET_ERROR(EIO)); - } - - if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) { - error = zfs_fastaccesschk_execute(zdp, cr); - if (!error) { - *ipp = dip; - igrab(*ipp); - return (0); - } - return (error); - } - } - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zdp); - - *ipp = NULL; - - if (flags & LOOKUP_XATTR) { - /* - * We don't allow recursive attributes.. - * Maybe someday we will. - */ - if (zdp->z_pflags & ZFS_XATTR) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - if ((error = zfs_get_xattrdir(zdp, ipp, cr, flags))) { - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * Do we have permission to get into attribute directory? - */ - - if ((error = zfs_zaccess(ITOZ(*ipp), ACE_EXECUTE, 0, - B_FALSE, cr))) { - iput(*ipp); - *ipp = NULL; - } - - ZFS_EXIT(zfsvfs); - return (error); - } - - if (!S_ISDIR(dip->i_mode)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(ENOTDIR)); - } - - /* - * Check accessibility of directory. - */ - - if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) { - ZFS_EXIT(zfsvfs); - return (error); - } - - if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), - NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EILSEQ)); - } - - error = zfs_dirlook(zdp, nm, ipp, flags, direntflags, realpnp); - if ((error == 0) && (*ipp)) - zfs_inode_update(ITOZ(*ipp)); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Attempt to create a new entry in a directory. If the entry - * already exists, truncate the file if permissible, else return - * an error. Return the ip of the created or trunc'd file. - * - * IN: dip - inode of directory to put new file entry in. - * name - name of new file entry. - * vap - attributes of new file. - * excl - flag indicating exclusive or non-exclusive mode. - * mode - mode to open file with. - * cr - credentials of caller. - * flag - file flag. - * vsecp - ACL to be set - * - * OUT: ipp - inode of created or trunc'd entry. - * - * RETURN: 0 on success, error code on failure. 
- * - * Timestamps: - * dip - ctime|mtime updated if new entry created - * ip - ctime|mtime always, atime if new - */ - -/* ARGSUSED */ -int -zfs_create(struct inode *dip, char *name, vattr_t *vap, int excl, - int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp) -{ - znode_t *zp, *dzp = ITOZ(dip); - zfsvfs_t *zfsvfs = ITOZSB(dip); - zilog_t *zilog; - objset_t *os; - zfs_dirlock_t *dl; - dmu_tx_t *tx; - int error; - uid_t uid; - gid_t gid; - zfs_acl_ids_t acl_ids; - boolean_t fuid_dirtied; - boolean_t have_acl = B_FALSE; - boolean_t waited = B_FALSE; - - /* - * If we have an ephemeral id, ACL, or XVATTR then - * make sure file system is at proper version - */ - - gid = crgetgid(cr); - uid = crgetuid(cr); - - if (zfsvfs->z_use_fuids == B_FALSE && - (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) - return (SET_ERROR(EINVAL)); - - if (name == NULL) - return (SET_ERROR(EINVAL)); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); - os = zfsvfs->z_os; - zilog = zfsvfs->z_log; - - if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), - NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EILSEQ)); - } - - if (vap->va_mask & ATTR_XVATTR) { - if ((error = secpolicy_xvattr((xvattr_t *)vap, - crgetuid(cr), cr, vap->va_mode)) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - } - -top: - *ipp = NULL; - if (*name == '\0') { - /* - * Null component name refers to the directory itself. - */ - igrab(dip); - zp = dzp; - dl = NULL; - error = 0; - } else { - /* possible igrab(zp) */ - int zflg = 0; - - if (flag & FIGNORECASE) - zflg |= ZCILOOK; - - error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, - NULL, NULL); - if (error) { - if (have_acl) - zfs_acl_ids_free(&acl_ids); - if (strcmp(name, "..") == 0) - error = SET_ERROR(EISDIR); - ZFS_EXIT(zfsvfs); - return (error); - } - } - - if (zp == NULL) { - uint64_t txtype; - uint64_t projid = ZFS_DEFAULT_PROJID; - - /* - * Create a new file object and update the directory - * to reference it. - */ - if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { - if (have_acl) - zfs_acl_ids_free(&acl_ids); - goto out; - } - - /* - * We only support the creation of regular files in - * extended attribute directories. - */ - - if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) { - if (have_acl) - zfs_acl_ids_free(&acl_ids); - error = SET_ERROR(EINVAL); - goto out; - } - - if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, - cr, vsecp, &acl_ids)) != 0) - goto out; - have_acl = B_TRUE; - - if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) - projid = zfs_inherit_projid(dzp); - if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { - zfs_acl_ids_free(&acl_ids); - error = SET_ERROR(EDQUOT); - goto out; - } - - tx = dmu_tx_create(os); - - dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + - ZFS_SA_BASE_ATTR_SIZE); - - fuid_dirtied = zfsvfs->z_fuid_dirty; - if (fuid_dirtied) - zfs_fuid_txhold(zfsvfs, tx); - dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); - if (!zfsvfs->z_use_sa && - acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, acl_ids.z_aclp->z_acl_bytes); - } - - error = dmu_tx_assign(tx, - (waited ? 
TXG_NOTHROTTLE : 0) | TXG_NOWAIT); - if (error) { - zfs_dirent_unlock(dl); - if (error == ERESTART) { - waited = B_TRUE; - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - zfs_acl_ids_free(&acl_ids); - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (error); - } - zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); - - error = zfs_link_create(dl, zp, tx, ZNEW); - if (error != 0) { - /* - * Since, we failed to add the directory entry for it, - * delete the newly created dnode. - */ - zfs_znode_delete(zp, tx); - remove_inode_hash(ZTOI(zp)); - zfs_acl_ids_free(&acl_ids); - dmu_tx_commit(tx); - goto out; - } - - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); - - txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); - if (flag & FIGNORECASE) - txtype |= TX_CI; - zfs_log_create(zilog, tx, txtype, dzp, zp, name, - vsecp, acl_ids.z_fuidp, vap); - zfs_acl_ids_free(&acl_ids); - dmu_tx_commit(tx); - } else { - int aflags = (flag & FAPPEND) ? V_APPEND : 0; - - if (have_acl) - zfs_acl_ids_free(&acl_ids); - have_acl = B_FALSE; - - /* - * A directory entry already exists for this name. - */ - /* - * Can't truncate an existing file if in exclusive mode. - */ - if (excl) { - error = SET_ERROR(EEXIST); - goto out; - } - /* - * Can't open a directory for writing. - */ - if (S_ISDIR(ZTOI(zp)->i_mode)) { - error = SET_ERROR(EISDIR); - goto out; - } - /* - * Verify requested access to file. - */ - if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) { - goto out; - } - - mutex_enter(&dzp->z_lock); - dzp->z_seq++; - mutex_exit(&dzp->z_lock); - - /* - * Truncate regular files if requested. - */ - if (S_ISREG(ZTOI(zp)->i_mode) && - (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) { - /* we can't hold any locks when calling zfs_freesp() */ - if (dl) { - zfs_dirent_unlock(dl); - dl = NULL; - } - error = zfs_freesp(zp, 0, 0, mode, TRUE); - } - } -out: - - if (dl) - zfs_dirent_unlock(dl); - - if (error) { - if (zp) - iput(ZTOI(zp)); - } else { - zfs_inode_update(dzp); - zfs_inode_update(zp); - *ipp = ZTOI(zp); - } - - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* ARGSUSED */ -int -zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl, - int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp) -{ - znode_t *zp = NULL, *dzp = ITOZ(dip); - zfsvfs_t *zfsvfs = ITOZSB(dip); - objset_t *os; - dmu_tx_t *tx; - int error; - uid_t uid; - gid_t gid; - zfs_acl_ids_t acl_ids; - uint64_t projid = ZFS_DEFAULT_PROJID; - boolean_t fuid_dirtied; - boolean_t have_acl = B_FALSE; - boolean_t waited = B_FALSE; - - /* - * If we have an ephemeral id, ACL, or XVATTR then - * make sure file system is at proper version - */ - - gid = crgetgid(cr); - uid = crgetuid(cr); - - if (zfsvfs->z_use_fuids == B_FALSE && - (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) - return (SET_ERROR(EINVAL)); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); - os = zfsvfs->z_os; - - if (vap->va_mask & ATTR_XVATTR) { - if ((error = secpolicy_xvattr((xvattr_t *)vap, - crgetuid(cr), cr, vap->va_mode)) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - } - -top: - *ipp = NULL; - - /* - * Create a new file object and update the directory - * to reference it. 
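-	 *
-	 * Unlike zfs_create() above, no directory entry is created
-	 * here; the new znode goes straight onto the unlinked set.  A
-	 * hypothetical userspace trigger for this path (illustrative,
-	 * not part of this file):
-	 *
-	 *	int fd = open("/pool/dir", O_TMPFILE | O_RDWR, 0600);
-	 *	// anonymous file; reclaimed on last close unless
-	 *	// given a name later via linkat(2)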
- */ - if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { - if (have_acl) - zfs_acl_ids_free(&acl_ids); - goto out; - } - - if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, - cr, vsecp, &acl_ids)) != 0) - goto out; - have_acl = B_TRUE; - - if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) - projid = zfs_inherit_projid(dzp); - if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { - zfs_acl_ids_free(&acl_ids); - error = SET_ERROR(EDQUOT); - goto out; - } - - tx = dmu_tx_create(os); - - dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + - ZFS_SA_BASE_ATTR_SIZE); - dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - - fuid_dirtied = zfsvfs->z_fuid_dirty; - if (fuid_dirtied) - zfs_fuid_txhold(zfsvfs, tx); - if (!zfsvfs->z_use_sa && - acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, acl_ids.z_aclp->z_acl_bytes); - } - error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); - if (error) { - if (error == ERESTART) { - waited = B_TRUE; - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - zfs_acl_ids_free(&acl_ids); - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (error); - } - zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids); - - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); - - /* Add to unlinked set */ - zp->z_unlinked = B_TRUE; - zfs_unlinked_add(zp, tx); - zfs_acl_ids_free(&acl_ids); - dmu_tx_commit(tx); -out: - - if (error) { - if (zp) - iput(ZTOI(zp)); - } else { - zfs_inode_update(dzp); - zfs_inode_update(zp); - *ipp = ZTOI(zp); - } - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Remove an entry from a directory. - * - * IN: dip - inode of directory to remove entry from. - * name - name of entry to remove. - * cr - credentials of caller. - * flags - case flags. - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * dip - ctime|mtime - * ip - ctime (if nlink > 0) - */ - -uint64_t null_xattr = 0; - -/*ARGSUSED*/ -int -zfs_remove(struct inode *dip, char *name, cred_t *cr, int flags) -{ - znode_t *zp, *dzp = ITOZ(dip); - znode_t *xzp; - struct inode *ip; - zfsvfs_t *zfsvfs = ITOZSB(dip); - zilog_t *zilog; - uint64_t acl_obj, xattr_obj; - uint64_t xattr_obj_unlinked = 0; - uint64_t obj = 0; - uint64_t links; - zfs_dirlock_t *dl; - dmu_tx_t *tx; - boolean_t may_delete_now, delete_now = FALSE; - boolean_t unlinked, toobig = FALSE; - uint64_t txtype; - pathname_t *realnmp = NULL; - pathname_t realnm; - int error; - int zflg = ZEXISTS; - boolean_t waited = B_FALSE; - - if (name == NULL) - return (SET_ERROR(EINVAL)); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); - zilog = zfsvfs->z_log; - - if (flags & FIGNORECASE) { - zflg |= ZCILOOK; - pn_alloc(&realnm); - realnmp = &realnm; - } - -top: - xattr_obj = 0; - xzp = NULL; - /* - * Attempt to lock directory; fail if entry doesn't exist. - */ - if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, - NULL, realnmp))) { - if (realnmp) - pn_free(realnmp); - ZFS_EXIT(zfsvfs); - return (error); - } - - ip = ZTOI(zp); - - if ((error = zfs_zaccess_delete(dzp, zp, cr))) { - goto out; - } - - /* - * Need to use rmdir for removing directories. - */ - if (S_ISDIR(ip->i_mode)) { - error = SET_ERROR(EPERM); - goto out; - } - - mutex_enter(&zp->z_lock); - may_delete_now = atomic_read(&ip->i_count) == 1 && !(zp->z_is_mapped); - mutex_exit(&zp->z_lock); - - /* - * We may delete the znode now, or we may put it in the unlinked set; - * it depends on whether we're the last link, and on whether there are - * other holds on the inode. 
So we dmu_tx_hold() the right things to - * allow for either case. - */ - obj = zp->z_id; - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - zfs_sa_upgrade_txholds(tx, zp); - zfs_sa_upgrade_txholds(tx, dzp); - if (may_delete_now) { - toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks; - /* if the file is too big, only hold_free a token amount */ - dmu_tx_hold_free(tx, zp->z_id, 0, - (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END)); - } - - /* are there any extended attributes? */ - error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), - &xattr_obj, sizeof (xattr_obj)); - if (error == 0 && xattr_obj) { - error = zfs_zget(zfsvfs, xattr_obj, &xzp); - ASSERT0(error); - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); - dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); - } - - mutex_enter(&zp->z_lock); - if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now) - dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); - mutex_exit(&zp->z_lock); - - /* charge as an update -- would be nice not to charge at all */ - dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - - /* - * Mark this transaction as typically resulting in a net free of space - */ - dmu_tx_mark_netfree(tx); - - error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); - if (error) { - zfs_dirent_unlock(dl); - if (error == ERESTART) { - waited = B_TRUE; - dmu_tx_wait(tx); - dmu_tx_abort(tx); - iput(ip); - if (xzp) - iput(ZTOI(xzp)); - goto top; - } - if (realnmp) - pn_free(realnmp); - dmu_tx_abort(tx); - iput(ip); - if (xzp) - iput(ZTOI(xzp)); - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * Remove the directory entry. - */ - error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked); - - if (error) { - dmu_tx_commit(tx); - goto out; - } - - if (unlinked) { - /* - * Hold z_lock so that we can make sure that the ACL obj - * hasn't changed. Could have been deleted due to - * zfs_sa_upgrade(). - */ - mutex_enter(&zp->z_lock); - (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), - &xattr_obj_unlinked, sizeof (xattr_obj_unlinked)); - delete_now = may_delete_now && !toobig && - atomic_read(&ip->i_count) == 1 && !(zp->z_is_mapped) && - xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) == - acl_obj; - } - - if (delete_now) { - if (xattr_obj_unlinked) { - ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2); - mutex_enter(&xzp->z_lock); - xzp->z_unlinked = B_TRUE; - clear_nlink(ZTOI(xzp)); - links = 0; - error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), - &links, sizeof (links), tx); - ASSERT3U(error, ==, 0); - mutex_exit(&xzp->z_lock); - zfs_unlinked_add(xzp, tx); - - if (zp->z_is_sa) - error = sa_remove(zp->z_sa_hdl, - SA_ZPL_XATTR(zfsvfs), tx); - else - error = sa_update(zp->z_sa_hdl, - SA_ZPL_XATTR(zfsvfs), &null_xattr, - sizeof (uint64_t), tx); - ASSERT0(error); - } - /* - * Add to the unlinked set because a new reference could be - * taken concurrently resulting in a deferred destruction. 
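-		 *
-		 * For example (an illustrative interleaving, not new
-		 * logic): another thread may igrab() this inode between
-		 * the i_count check above and the final iput(); parking
-		 * the object on the unlinked set first ensures it is
-		 * still reclaimed once that reference is dropped.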
- */ - zfs_unlinked_add(zp, tx); - mutex_exit(&zp->z_lock); - } else if (unlinked) { - mutex_exit(&zp->z_lock); - zfs_unlinked_add(zp, tx); - } - - txtype = TX_REMOVE; - if (flags & FIGNORECASE) - txtype |= TX_CI; - zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked); - - dmu_tx_commit(tx); -out: - if (realnmp) - pn_free(realnmp); - - zfs_dirent_unlock(dl); - zfs_inode_update(dzp); - zfs_inode_update(zp); - - if (delete_now) - iput(ip); - else - zfs_iput_async(ip); - - if (xzp) { - zfs_inode_update(xzp); - zfs_iput_async(ZTOI(xzp)); - } - - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Create a new directory and insert it into dip using the name - * provided. Return a pointer to the inserted directory. - * - * IN: dip - inode of directory to add subdir to. - * dirname - name of new directory. - * vap - attributes of new directory. - * cr - credentials of caller. - * flags - case flags. - * vsecp - ACL to be set - * - * OUT: ipp - inode of created directory. - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * dip - ctime|mtime updated - * ipp - ctime|mtime|atime updated - */ -/*ARGSUSED*/ -int -zfs_mkdir(struct inode *dip, char *dirname, vattr_t *vap, struct inode **ipp, - cred_t *cr, int flags, vsecattr_t *vsecp) -{ - znode_t *zp, *dzp = ITOZ(dip); - zfsvfs_t *zfsvfs = ITOZSB(dip); - zilog_t *zilog; - zfs_dirlock_t *dl; - uint64_t txtype; - dmu_tx_t *tx; - int error; - int zf = ZNEW; - uid_t uid; - gid_t gid = crgetgid(cr); - zfs_acl_ids_t acl_ids; - boolean_t fuid_dirtied; - boolean_t waited = B_FALSE; - - ASSERT(S_ISDIR(vap->va_mode)); - - /* - * If we have an ephemeral id, ACL, or XVATTR then - * make sure file system is at proper version - */ - - uid = crgetuid(cr); - if (zfsvfs->z_use_fuids == B_FALSE && - (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) - return (SET_ERROR(EINVAL)); - - if (dirname == NULL) - return (SET_ERROR(EINVAL)); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); - zilog = zfsvfs->z_log; - - if (dzp->z_pflags & ZFS_XATTR) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - if (zfsvfs->z_utf8 && u8_validate(dirname, - strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EILSEQ)); - } - if (flags & FIGNORECASE) - zf |= ZCILOOK; - - if (vap->va_mask & ATTR_XVATTR) { - if ((error = secpolicy_xvattr((xvattr_t *)vap, - crgetuid(cr), cr, vap->va_mode)) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - } - - if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, - vsecp, &acl_ids)) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - /* - * First make sure the new directory doesn't exist. - * - * Existence is checked first to make sure we don't return - * EACCES instead of EEXIST which can cause some applications - * to fail. - */ -top: - *ipp = NULL; - - if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, - NULL, NULL))) { - zfs_acl_ids_free(&acl_ids); - ZFS_EXIT(zfsvfs); - return (error); - } - - if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) { - zfs_acl_ids_free(&acl_ids); - zfs_dirent_unlock(dl); - ZFS_EXIT(zfsvfs); - return (error); - } - - if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) { - zfs_acl_ids_free(&acl_ids); - zfs_dirent_unlock(dl); - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EDQUOT)); - } - - /* - * Add a new entry to the directory. 
- */ - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); - fuid_dirtied = zfsvfs->z_fuid_dirty; - if (fuid_dirtied) - zfs_fuid_txhold(zfsvfs, tx); - if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - acl_ids.z_aclp->z_acl_bytes); - } - - dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + - ZFS_SA_BASE_ATTR_SIZE); - - error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); - if (error) { - zfs_dirent_unlock(dl); - if (error == ERESTART) { - waited = B_TRUE; - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - zfs_acl_ids_free(&acl_ids); - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * Create new node. - */ - zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); - - /* - * Now put new name in parent dir. - */ - error = zfs_link_create(dl, zp, tx, ZNEW); - if (error != 0) { - zfs_znode_delete(zp, tx); - remove_inode_hash(ZTOI(zp)); - goto out; - } - - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); - - *ipp = ZTOI(zp); - - txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); - if (flags & FIGNORECASE) - txtype |= TX_CI; - zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, - acl_ids.z_fuidp, vap); - -out: - zfs_acl_ids_free(&acl_ids); - - dmu_tx_commit(tx); - - zfs_dirent_unlock(dl); - - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - if (error != 0) { - iput(ZTOI(zp)); - } else { - zfs_inode_update(dzp); - zfs_inode_update(zp); - } - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Remove a directory subdir entry. If the current working - * directory is the same as the subdir to be removed, the - * remove will fail. - * - * IN: dip - inode of directory to remove from. - * name - name of directory to be removed. - * cwd - inode of current working directory. - * cr - credentials of caller. - * flags - case flags - * - * RETURN: 0 on success, error code on failure. - * - * Timestamps: - * dip - ctime|mtime updated - */ -/*ARGSUSED*/ -int -zfs_rmdir(struct inode *dip, char *name, struct inode *cwd, cred_t *cr, - int flags) -{ - znode_t *dzp = ITOZ(dip); - znode_t *zp; - struct inode *ip; - zfsvfs_t *zfsvfs = ITOZSB(dip); - zilog_t *zilog; - zfs_dirlock_t *dl; - dmu_tx_t *tx; - int error; - int zflg = ZEXISTS; - boolean_t waited = B_FALSE; - - if (name == NULL) - return (SET_ERROR(EINVAL)); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); - zilog = zfsvfs->z_log; - - if (flags & FIGNORECASE) - zflg |= ZCILOOK; -top: - zp = NULL; - - /* - * Attempt to lock directory; fail if entry doesn't exist. - */ - if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, - NULL, NULL))) { - ZFS_EXIT(zfsvfs); - return (error); - } - - ip = ZTOI(zp); - - if ((error = zfs_zaccess_delete(dzp, zp, cr))) { - goto out; - } - - if (!S_ISDIR(ip->i_mode)) { - error = SET_ERROR(ENOTDIR); - goto out; - } - - if (ip == cwd) { - error = SET_ERROR(EINVAL); - goto out; - } - - /* - * Grab a lock on the directory to make sure that no one is - * trying to add (or lookup) entries while we are removing it. - */ - rw_enter(&zp->z_name_lock, RW_WRITER); - - /* - * Grab a lock on the parent pointer to make sure we play well - * with the treewalk and directory rename code. 
- */ - rw_enter(&zp->z_parent_lock, RW_WRITER); - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - zfs_sa_upgrade_txholds(tx, zp); - zfs_sa_upgrade_txholds(tx, dzp); - dmu_tx_mark_netfree(tx); - error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); - if (error) { - rw_exit(&zp->z_parent_lock); - rw_exit(&zp->z_name_lock); - zfs_dirent_unlock(dl); - if (error == ERESTART) { - waited = B_TRUE; - dmu_tx_wait(tx); - dmu_tx_abort(tx); - iput(ip); - goto top; - } - dmu_tx_abort(tx); - iput(ip); - ZFS_EXIT(zfsvfs); - return (error); - } - - error = zfs_link_destroy(dl, zp, tx, zflg, NULL); - - if (error == 0) { - uint64_t txtype = TX_RMDIR; - if (flags & FIGNORECASE) - txtype |= TX_CI; - zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT, - B_FALSE); - } - - dmu_tx_commit(tx); - - rw_exit(&zp->z_parent_lock); - rw_exit(&zp->z_name_lock); -out: - zfs_dirent_unlock(dl); - - zfs_inode_update(dzp); - zfs_inode_update(zp); - iput(ip); - - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Read directory entries from the given directory cursor position and emit - * name and position for each entry. - * - * IN: ip - inode of directory to read. - * ctx - directory entry context. - * cr - credentials of caller. - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * ip - atime updated - * - * Note that the low 4 bits of the cookie returned by zap is always zero. - * This allows us to use the low range for "special" directory entries: - * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, - * we use the offset 2 for the '.zfs' directory. - */ -/* ARGSUSED */ -int -zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - objset_t *os; - zap_cursor_t zc; - zap_attribute_t zap; - int error; - uint8_t prefetch; - uint8_t type; - int done = 0; - uint64_t parent; - uint64_t offset; /* must be unsigned; checks for < 1 */ - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), - &parent, sizeof (parent))) != 0) - goto out; - - /* - * Quit if directory has been removed (posix) - */ - if (zp->z_unlinked) - goto out; - - error = 0; - os = zfsvfs->z_os; - offset = ctx->pos; - prefetch = zp->z_zn_prefetch; - - /* - * Initialize the iterator cursor. - */ - if (offset <= 3) { - /* - * Start iteration from the beginning of the directory. - */ - zap_cursor_init(&zc, os, zp->z_id); - } else { - /* - * The offset is a serialized cursor. - */ - zap_cursor_init_serialized(&zc, os, zp->z_id, offset); - } - - /* - * Transform to file-system independent format - */ - while (!done) { - uint64_t objnum; - /* - * Special case `.', `..', and `.zfs'. - */ - if (offset == 0) { - (void) strcpy(zap.za_name, "."); - zap.za_normalization_conflict = 0; - objnum = zp->z_id; - type = DT_DIR; - } else if (offset == 1) { - (void) strcpy(zap.za_name, ".."); - zap.za_normalization_conflict = 0; - objnum = parent; - type = DT_DIR; - } else if (offset == 2 && zfs_show_ctldir(zp)) { - (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); - zap.za_normalization_conflict = 0; - objnum = ZFSCTL_INO_ROOT; - type = DT_DIR; - } else { - /* - * Grab next entry. 
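-			 *
-			 * (Recall the cookie scheme described above
-			 * zfs_readdir(): ctx->pos values 0, 1 and 2 are
-			 * synthesized for ".", ".." and ".zfs", while every
-			 * cookie from zap_cursor_serialize() has its low 4
-			 * bits clear, so the two ranges cannot collide.)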
- */ - if ((error = zap_cursor_retrieve(&zc, &zap))) { - if (error == ENOENT) - break; - else - goto update; - } - - /* - * Allow multiple entries provided the first entry is - * the object id. Non-zpl consumers may safely make - * use of the additional space. - * - * XXX: This should be a feature flag for compatibility - */ - if (zap.za_integer_length != 8 || - zap.za_num_integers == 0) { - cmn_err(CE_WARN, "zap_readdir: bad directory " - "entry, obj = %lld, offset = %lld, " - "length = %d, num = %lld\n", - (u_longlong_t)zp->z_id, - (u_longlong_t)offset, - zap.za_integer_length, - (u_longlong_t)zap.za_num_integers); - error = SET_ERROR(ENXIO); - goto update; - } - - objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); - type = ZFS_DIRENT_TYPE(zap.za_first_integer); - } - - done = !zpl_dir_emit(ctx, zap.za_name, strlen(zap.za_name), - objnum, type); - if (done) - break; - - /* Prefetch znode */ - if (prefetch) { - dmu_prefetch(os, objnum, 0, 0, 0, - ZIO_PRIORITY_SYNC_READ); - } - - /* - * Move to the next entry, fill in the previous offset. - */ - if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { - zap_cursor_advance(&zc); - offset = zap_cursor_serialize(&zc); - } else { - offset += 1; - } - ctx->pos = offset; - } - zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ - -update: - zap_cursor_fini(&zc); - if (error == ENOENT) - error = 0; -out: - ZFS_EXIT(zfsvfs); - - return (error); -} - -ulong_t zfs_fsync_sync_cnt = 4; - -int -zfs_fsync(struct inode *ip, int syncflag, cred_t *cr) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - - (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); - - if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - zil_commit(zfsvfs->z_log, zp->z_id); - ZFS_EXIT(zfsvfs); - } - tsd_set(zfs_fsyncer_key, NULL); - - return (0); -} - - -/* - * Get the requested file attributes and place them in the provided - * vattr structure. - * - * IN: ip - inode of file. - * vap - va_mask identifies requested attributes. - * If ATTR_XVATTR set, then optional attrs are requested - * flags - ATTR_NOACLCHECK (CIFS server context) - * cr - credentials of caller. - * - * OUT: vap - attribute values. - * - * RETURN: 0 (always succeeds) - */ -/* ARGSUSED */ -int -zfs_getattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - int error = 0; - uint64_t links; - uint64_t atime[2], mtime[2], ctime[2]; - xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ - xoptattr_t *xoap = NULL; - boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; - sa_bulk_attr_t bulk[3]; - int count = 0; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); - - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); - - if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. - * Also, if we are the owner don't bother, since owner should - * always be allowed to read basic attributes of file. 
- */ - if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && - (vap->va_uid != crgetuid(cr))) { - if ((error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, - skipaclchk, cr))) { - ZFS_EXIT(zfsvfs); - return (error); - } - } - - /* - * Return all attributes. It's cheaper to provide the answer - * than to determine whether we were asked the question. - */ - - mutex_enter(&zp->z_lock); - vap->va_type = vn_mode_to_vtype(zp->z_mode); - vap->va_mode = zp->z_mode; - vap->va_fsid = ZTOI(zp)->i_sb->s_dev; - vap->va_nodeid = zp->z_id; - if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp)) - links = ZTOI(zp)->i_nlink + 1; - else - links = ZTOI(zp)->i_nlink; - vap->va_nlink = MIN(links, ZFS_LINK_MAX); - vap->va_size = i_size_read(ip); - vap->va_rdev = ip->i_rdev; - vap->va_seq = ip->i_generation; - - /* - * Add in any requested optional attributes and the create time. - * Also set the corresponding bits in the returned attribute bitmap. - */ - if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { - if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { - xoap->xoa_archive = - ((zp->z_pflags & ZFS_ARCHIVE) != 0); - XVA_SET_RTN(xvap, XAT_ARCHIVE); - } - - if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { - xoap->xoa_readonly = - ((zp->z_pflags & ZFS_READONLY) != 0); - XVA_SET_RTN(xvap, XAT_READONLY); - } - - if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { - xoap->xoa_system = - ((zp->z_pflags & ZFS_SYSTEM) != 0); - XVA_SET_RTN(xvap, XAT_SYSTEM); - } - - if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { - xoap->xoa_hidden = - ((zp->z_pflags & ZFS_HIDDEN) != 0); - XVA_SET_RTN(xvap, XAT_HIDDEN); - } - - if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { - xoap->xoa_nounlink = - ((zp->z_pflags & ZFS_NOUNLINK) != 0); - XVA_SET_RTN(xvap, XAT_NOUNLINK); - } - - if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { - xoap->xoa_immutable = - ((zp->z_pflags & ZFS_IMMUTABLE) != 0); - XVA_SET_RTN(xvap, XAT_IMMUTABLE); - } - - if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { - xoap->xoa_appendonly = - ((zp->z_pflags & ZFS_APPENDONLY) != 0); - XVA_SET_RTN(xvap, XAT_APPENDONLY); - } - - if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { - xoap->xoa_nodump = - ((zp->z_pflags & ZFS_NODUMP) != 0); - XVA_SET_RTN(xvap, XAT_NODUMP); - } - - if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { - xoap->xoa_opaque = - ((zp->z_pflags & ZFS_OPAQUE) != 0); - XVA_SET_RTN(xvap, XAT_OPAQUE); - } - - if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { - xoap->xoa_av_quarantined = - ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0); - XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); - } - - if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { - xoap->xoa_av_modified = - ((zp->z_pflags & ZFS_AV_MODIFIED) != 0); - XVA_SET_RTN(xvap, XAT_AV_MODIFIED); - } - - if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && - S_ISREG(ip->i_mode)) { - zfs_sa_get_scanstamp(zp, xvap); - } - - if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { - uint64_t times[2]; - - (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs), - times, sizeof (times)); - ZFS_TIME_DECODE(&xoap->xoa_createtime, times); - XVA_SET_RTN(xvap, XAT_CREATETIME); - } - - if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { - xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0); - XVA_SET_RTN(xvap, XAT_REPARSE); - } - if (XVA_ISSET_REQ(xvap, XAT_GEN)) { - xoap->xoa_generation = ip->i_generation; - XVA_SET_RTN(xvap, XAT_GEN); - } - - if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { - xoap->xoa_offline = - ((zp->z_pflags & ZFS_OFFLINE) != 0); - XVA_SET_RTN(xvap, XAT_OFFLINE); - } - - if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { - xoap->xoa_sparse = - ((zp->z_pflags & ZFS_SPARSE) != 0); - XVA_SET_RTN(xvap, XAT_SPARSE); - } - - if (XVA_ISSET_REQ(xvap, 
XAT_PROJINHERIT)) {
-			xoap->xoa_projinherit =
-			    ((zp->z_pflags & ZFS_PROJINHERIT) != 0);
-			XVA_SET_RTN(xvap, XAT_PROJINHERIT);
-		}
-
-		if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
-			xoap->xoa_projid = zp->z_projid;
-			XVA_SET_RTN(xvap, XAT_PROJID);
-		}
-	}
-
-	ZFS_TIME_DECODE(&vap->va_atime, atime);
-	ZFS_TIME_DECODE(&vap->va_mtime, mtime);
-	ZFS_TIME_DECODE(&vap->va_ctime, ctime);
-
-	mutex_exit(&zp->z_lock);
-
-	sa_object_size(zp->z_sa_hdl, &vap->va_blksize, &vap->va_nblocks);
-
-	if (zp->z_blksz == 0) {
-		/*
-		 * Block size hasn't been set; suggest maximal I/O transfers.
-		 */
-		vap->va_blksize = zfsvfs->z_max_blksz;
-	}
-
-	ZFS_EXIT(zfsvfs);
-	return (0);
-}
-
-/*
- * Get the basic file attributes and place them in the provided kstat
- * structure.  The inode is assumed to be the authoritative source
- * for most of the attributes.  However, the znode currently has the
- * authoritative atime, blksize, and block count.
- *
- * IN:	ip - inode of file.
- *
- * OUT:	sp - kstat values.
- *
- * RETURN:	0 (always succeeds)
- */
-/* ARGSUSED */
-int
-zfs_getattr_fast(struct inode *ip, struct kstat *sp)
-{
-	znode_t *zp = ITOZ(ip);
-	zfsvfs_t *zfsvfs = ITOZSB(ip);
-	uint32_t blksize;
-	u_longlong_t nblocks;
-
-	ZFS_ENTER(zfsvfs);
-	ZFS_VERIFY_ZP(zp);
-
-	mutex_enter(&zp->z_lock);
-
-	generic_fillattr(ip, sp);
-	/*
-	 * +1 link count for root inode with visible '.zfs' directory.
-	 */
-	if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp))
-		if (sp->nlink < ZFS_LINK_MAX)
-			sp->nlink++;
-
-	sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
-	sp->blksize = blksize;
-	sp->blocks = nblocks;
-
-	if (unlikely(zp->z_blksz == 0)) {
-		/*
-		 * Block size hasn't been set; suggest maximal I/O transfers.
-		 */
-		sp->blksize = zfsvfs->z_max_blksz;
-	}
-
-	mutex_exit(&zp->z_lock);
-
-	/*
-	 * Required to prevent NFS client from detecting different inode
-	 * numbers of snapshot root dentry before and after snapshot mount.
-	 */
-	if (zfsvfs->z_issnap) {
-		if (ip->i_sb->s_root->d_inode == ip)
-			sp->ino = ZFSCTL_INO_SNAPDIRS -
-			    dmu_objset_id(zfsvfs->z_os);
-	}
-
-	ZFS_EXIT(zfsvfs);
-
-	return (0);
-}
-
-/*
- * When changing a file's user/group/project, we need to handle not only
- * the main object that is assigned to the file directly, but also the
- * objects that are used by the file via the hidden xattr directory.
- *
- * Because the xattr directory may contain many EA entries, it may be
- * impossible to change all of them within the single transaction that
- * updates the main object's user/group/project attributes.  Instead we
- * change them one by one via multiple independent transactions.  This
- * is not an ideal solution, but we have no better idea yet.
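- *
- * In outline (a condensed sketch of zfs_setattr_dir() below, not
- * additional behavior):
- *
- *	for each zap entry in the xattr directory:
- *		zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS, ...);
- *		tx = dmu_tx_create(os);		// one tx per child
- *		dmu_tx_assign(tx, TXG_WAIT);
- *		... mirror uid/gid/projid from the parent znode ...
- *		sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
- *		dmu_tx_commit(tx);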
- */ -static int -zfs_setattr_dir(znode_t *dzp) -{ - struct inode *dxip = ZTOI(dzp); - struct inode *xip = NULL; - zfsvfs_t *zfsvfs = ITOZSB(dxip); - objset_t *os = zfsvfs->z_os; - zap_cursor_t zc; - zap_attribute_t zap; - zfs_dirlock_t *dl; - znode_t *zp; - dmu_tx_t *tx = NULL; - uint64_t uid, gid; - sa_bulk_attr_t bulk[4]; - int count; - int err; - - zap_cursor_init(&zc, os, dzp->z_id); - while ((err = zap_cursor_retrieve(&zc, &zap)) == 0) { - count = 0; - if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { - err = ENXIO; - break; - } - - err = zfs_dirent_lock(&dl, dzp, (char *)zap.za_name, &zp, - ZEXISTS, NULL, NULL); - if (err == ENOENT) - goto next; - if (err) - break; - - xip = ZTOI(zp); - if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) && - KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) && - zp->z_projid == dzp->z_projid) - goto next; - - tx = dmu_tx_create(os); - if (!(zp->z_pflags & ZFS_PROJID)) - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); - else - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) - break; - - mutex_enter(&dzp->z_lock); - - if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) { - xip->i_uid = dxip->i_uid; - uid = zfs_uid_read(dxip); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, - &uid, sizeof (uid)); - } - - if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) { - xip->i_gid = dxip->i_gid; - gid = zfs_gid_read(dxip); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, - &gid, sizeof (gid)); - } - - if (zp->z_projid != dzp->z_projid) { - if (!(zp->z_pflags & ZFS_PROJID)) { - zp->z_pflags |= ZFS_PROJID; - SA_ADD_BULK_ATTR(bulk, count, - SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, - sizeof (zp->z_pflags)); - } - - zp->z_projid = dzp->z_projid; - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs), - NULL, &zp->z_projid, sizeof (zp->z_projid)); - } - - mutex_exit(&dzp->z_lock); - - if (likely(count > 0)) { - err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); - dmu_tx_commit(tx); - } else { - dmu_tx_abort(tx); - } - tx = NULL; - if (err != 0 && err != ENOENT) - break; - -next: - if (xip) { - iput(xip); - xip = NULL; - zfs_dirent_unlock(dl); - } - zap_cursor_advance(&zc); - } - - if (tx) - dmu_tx_abort(tx); - if (xip) { - iput(xip); - zfs_dirent_unlock(dl); - } - zap_cursor_fini(&zc); - - return (err == ENOENT ? 0 : err); -} - -/* - * Set the file attributes to the values contained in the - * vattr structure. - * - * IN: ip - inode of file to be modified. - * vap - new attribute values. - * If ATTR_XVATTR set, then optional attrs are being set - * flags - ATTR_UTIME set if non-default time values provided. - * - ATTR_NOACLCHECK (CIFS context only). - * cr - credentials of caller. - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * ip - ctime updated, mtime updated if size changed. 
- */ -/* ARGSUSED */ -int -zfs_setattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - objset_t *os = zfsvfs->z_os; - zilog_t *zilog; - dmu_tx_t *tx; - vattr_t oldva; - xvattr_t *tmpxvattr; - uint_t mask = vap->va_mask; - uint_t saved_mask = 0; - int trim_mask = 0; - uint64_t new_mode; - uint64_t new_kuid = 0, new_kgid = 0, new_uid, new_gid; - uint64_t xattr_obj; - uint64_t mtime[2], ctime[2], atime[2]; - uint64_t projid = ZFS_INVALID_PROJID; - znode_t *attrzp; - int need_policy = FALSE; - int err, err2 = 0; - zfs_fuid_info_t *fuidp = NULL; - xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ - xoptattr_t *xoap; - zfs_acl_t *aclp; - boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; - boolean_t fuid_dirtied = B_FALSE; - boolean_t handle_eadir = B_FALSE; - sa_bulk_attr_t *bulk, *xattr_bulk; - int count = 0, xattr_count = 0, bulks = 8; - - if (mask == 0) - return (0); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - /* - * If this is a xvattr_t, then get a pointer to the structure of - * optional attributes. If this is NULL, then we have a vattr_t. - */ - xoap = xva_getxoptattr(xvap); - if (xoap != NULL && (mask & ATTR_XVATTR)) { - if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { - if (!dmu_objset_projectquota_enabled(os) || - (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(ENOTSUP)); - } - - projid = xoap->xoa_projid; - if (unlikely(projid == ZFS_INVALID_PROJID)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID) - projid = ZFS_INVALID_PROJID; - else - need_policy = TRUE; - } - - if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) && - (xoap->xoa_projinherit != - ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) && - (!dmu_objset_projectquota_enabled(os) || - (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(ENOTSUP)); - } - } - - zilog = zfsvfs->z_log; - - /* - * Make sure that if we have ephemeral uid/gid or xvattr specified - * that file system is at proper version level - */ - - if (zfsvfs->z_use_fuids == B_FALSE && - (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) || - ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) || - (mask & ATTR_XVATTR))) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EISDIR)); - } - - if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP); - xva_init(tmpxvattr); - - bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); - xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); - - /* - * Immutable files can only alter immutable bit and atime - */ - if ((zp->z_pflags & ZFS_IMMUTABLE) && - ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) || - ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { - err = SET_ERROR(EPERM); - goto out3; - } - - if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) { - err = SET_ERROR(EPERM); - goto out3; - } - - /* - * Verify timestamps doesn't overflow 32 bits. - * ZFS can handle large timestamps, but 32bit syscalls can't - * handle times greater than 2039. This check should be removed - * once large timestamps are fully supported. 
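The overflow test this comment refers to is simple range arithmetic. A standalone illustration, assuming a host with a 64-bit time_t (a signed 32-bit time_t runs out in January 2038; the function name and bounds below are illustrative, not the kernel's TIMESPEC_OVERFLOW):

	#include <assert.h>
	#include <stdint.h>
	#include <time.h>

	/* A timestamp overflows a 32-bit time_t when tv_sec leaves the
	 * signed 32-bit range; a truncating syscall would misreport it. */
	static int
	timespec_overflows_32bit(const struct timespec *ts)
	{
		return (ts->tv_sec < INT32_MIN || ts->tv_sec > INT32_MAX);
	}

	int
	main(void)
	{
		struct timespec ok = { .tv_sec = 1500000000 };		/* 2017 */
		struct timespec bad = { .tv_sec = 4000000000LL };	/* ~2096 */

		assert(!timespec_overflows_32bit(&ok));
		assert(timespec_overflows_32bit(&bad));
		return (0);
	}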
- */ - if (mask & (ATTR_ATIME | ATTR_MTIME)) { - if (((mask & ATTR_ATIME) && - TIMESPEC_OVERFLOW(&vap->va_atime)) || - ((mask & ATTR_MTIME) && - TIMESPEC_OVERFLOW(&vap->va_mtime))) { - err = SET_ERROR(EOVERFLOW); - goto out3; - } - } - -top: - attrzp = NULL; - aclp = NULL; - - /* Can this be moved to before the top label? */ - if (zfs_is_readonly(zfsvfs)) { - err = SET_ERROR(EROFS); - goto out3; - } - - /* - * First validate permissions - */ - - if (mask & ATTR_SIZE) { - err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr); - if (err) - goto out3; - - /* - * XXX - Note, we are not providing any open - * mode flags here (like FNDELAY), so we may - * block if there are locks present... this - * should be addressed in openat(). - */ - /* XXX - would it be OK to generate a log record here? */ - err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); - if (err) - goto out3; - } - - if (mask & (ATTR_ATIME|ATTR_MTIME) || - ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || - XVA_ISSET_REQ(xvap, XAT_READONLY) || - XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || - XVA_ISSET_REQ(xvap, XAT_OFFLINE) || - XVA_ISSET_REQ(xvap, XAT_SPARSE) || - XVA_ISSET_REQ(xvap, XAT_CREATETIME) || - XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { - need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, - skipaclchk, cr); - } - - if (mask & (ATTR_UID|ATTR_GID)) { - int idmask = (mask & (ATTR_UID|ATTR_GID)); - int take_owner; - int take_group; - - /* - * NOTE: even if a new mode is being set, - * we may clear S_ISUID/S_ISGID bits. - */ - - if (!(mask & ATTR_MODE)) - vap->va_mode = zp->z_mode; - - /* - * Take ownership or chgrp to group we are a member of - */ - - take_owner = (mask & ATTR_UID) && (vap->va_uid == crgetuid(cr)); - take_group = (mask & ATTR_GID) && - zfs_groupmember(zfsvfs, vap->va_gid, cr); - - /* - * If both ATTR_UID and ATTR_GID are set then take_owner and - * take_group must both be set in order to allow taking - * ownership. - * - * Otherwise, send the check through secpolicy_vnode_setattr() - * - */ - - if (((idmask == (ATTR_UID|ATTR_GID)) && - take_owner && take_group) || - ((idmask == ATTR_UID) && take_owner) || - ((idmask == ATTR_GID) && take_group)) { - if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, - skipaclchk, cr) == 0) { - /* - * Remove setuid/setgid for non-privileged users - */ - (void) secpolicy_setid_clear(vap, cr); - trim_mask = (mask & (ATTR_UID|ATTR_GID)); - } else { - need_policy = TRUE; - } - } else { - need_policy = TRUE; - } - } - - mutex_enter(&zp->z_lock); - oldva.va_mode = zp->z_mode; - zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); - if (mask & ATTR_XVATTR) { - /* - * Update xvattr mask to include only those attributes - * that are actually changing. - * - * the bits will be restored prior to actually setting - * the attributes so the caller thinks they were set. 
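The save/clear/restore dance in that comment is an ordinary bitmask pattern. A toy version with invented mask values:

	#include <assert.h>
	#include <stdint.h>

	int
	main(void)
	{
		uint32_t requested = 0x0f;	/* caller asked for four attrs */
		uint32_t noop = 0x05;		/* two already hold that value */
		uint32_t saved = requested & noop;

		requested &= ~noop;		/* only real changes remain */
		assert(requested == 0x0a);
		/* ... policy checks and updates on the remainder ... */
		requested |= saved;		/* restore so the caller sees
						 * the full mask on return */
		assert(requested == 0x0f);
		return (0);
	}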
- */ - if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { - if (xoap->xoa_appendonly != - ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { - need_policy = TRUE; - } else { - XVA_CLR_REQ(xvap, XAT_APPENDONLY); - XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY); - } - } - - if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { - if (xoap->xoa_projinherit != - ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) { - need_policy = TRUE; - } else { - XVA_CLR_REQ(xvap, XAT_PROJINHERIT); - XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT); - } - } - - if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { - if (xoap->xoa_nounlink != - ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { - need_policy = TRUE; - } else { - XVA_CLR_REQ(xvap, XAT_NOUNLINK); - XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK); - } - } - - if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { - if (xoap->xoa_immutable != - ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { - need_policy = TRUE; - } else { - XVA_CLR_REQ(xvap, XAT_IMMUTABLE); - XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE); - } - } - - if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { - if (xoap->xoa_nodump != - ((zp->z_pflags & ZFS_NODUMP) != 0)) { - need_policy = TRUE; - } else { - XVA_CLR_REQ(xvap, XAT_NODUMP); - XVA_SET_REQ(tmpxvattr, XAT_NODUMP); - } - } - - if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { - if (xoap->xoa_av_modified != - ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { - need_policy = TRUE; - } else { - XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); - XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED); - } - } - - if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { - if ((!S_ISREG(ip->i_mode) && - xoap->xoa_av_quarantined) || - xoap->xoa_av_quarantined != - ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { - need_policy = TRUE; - } else { - XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); - XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED); - } - } - - if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { - mutex_exit(&zp->z_lock); - err = SET_ERROR(EPERM); - goto out3; - } - - if (need_policy == FALSE && - (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || - XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { - need_policy = TRUE; - } - } - - mutex_exit(&zp->z_lock); - - if (mask & ATTR_MODE) { - if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { - err = secpolicy_setid_setsticky_clear(ip, vap, - &oldva, cr); - if (err) - goto out3; - - trim_mask |= ATTR_MODE; - } else { - need_policy = TRUE; - } - } - - if (need_policy) { - /* - * If trim_mask is set then take ownership - * has been granted or write_acl is present and user - * has the ability to modify mode. In that case remove - * UID|GID and or MODE from mask so that - * secpolicy_vnode_setattr() doesn't revoke it. 
- */ - - if (trim_mask) { - saved_mask = vap->va_mask; - vap->va_mask &= ~trim_mask; - } - err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags, - (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); - if (err) - goto out3; - - if (trim_mask) - vap->va_mask |= saved_mask; - } - - /* - * secpolicy_vnode_setattr, or take ownership may have - * changed va_mask - */ - mask = vap->va_mask; - - if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) { - handle_eadir = B_TRUE; - err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), - &xattr_obj, sizeof (xattr_obj)); - - if (err == 0 && xattr_obj) { - err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp); - if (err) - goto out2; - } - if (mask & ATTR_UID) { - new_kuid = zfs_fuid_create(zfsvfs, - (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); - if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) && - zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT, - new_kuid)) { - if (attrzp) - iput(ZTOI(attrzp)); - err = SET_ERROR(EDQUOT); - goto out2; - } - } - - if (mask & ATTR_GID) { - new_kgid = zfs_fuid_create(zfsvfs, - (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); - if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) && - zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT, - new_kgid)) { - if (attrzp) - iput(ZTOI(attrzp)); - err = SET_ERROR(EDQUOT); - goto out2; - } - } - - if (projid != ZFS_INVALID_PROJID && - zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) { - if (attrzp) - iput(ZTOI(attrzp)); - err = EDQUOT; - goto out2; - } - } - tx = dmu_tx_create(os); - - if (mask & ATTR_MODE) { - uint64_t pmode = zp->z_mode; - uint64_t acl_obj; - new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); - - zfs_acl_chmod_setattr(zp, &aclp, new_mode); - - mutex_enter(&zp->z_lock); - if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { - /* - * Are we upgrading ACL from old V0 format - * to V1 format? - */ - if (zfsvfs->z_version >= ZPL_VERSION_FUID && - zfs_znode_acl_version(zp) == - ZFS_ACL_VERSION_INITIAL) { - dmu_tx_hold_free(tx, acl_obj, 0, - DMU_OBJECT_END); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, aclp->z_acl_bytes); - } else { - dmu_tx_hold_write(tx, acl_obj, 0, - aclp->z_acl_bytes); - } - } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, aclp->z_acl_bytes); - } - mutex_exit(&zp->z_lock); - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); - } else { - if (((mask & ATTR_XVATTR) && - XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || - (projid != ZFS_INVALID_PROJID && - !(zp->z_pflags & ZFS_PROJID))) - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); - else - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - } - - if (attrzp) { - dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); - } - - fuid_dirtied = zfsvfs->z_fuid_dirty; - if (fuid_dirtied) - zfs_fuid_txhold(zfsvfs, tx); - - zfs_sa_upgrade_txholds(tx, zp); - - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) - goto out; - - count = 0; - /* - * Set each attribute requested. - * We group settings according to the locks they need to acquire. - * - * Note: you cannot set ctime directly, although it will be - * updated as a side-effect of calling this function. - */ - - if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) { - /* - * For the existed object that is upgraded from old system, - * its on-disk layout has no slot for the project ID attribute. - * But quota accounting logic needs to access related slots by - * offset directly. So we need to adjust old objects' layout - * to make the project ID to some unified and fixed offset. 
- */ - if (attrzp) - err = sa_add_projid(attrzp->z_sa_hdl, tx, projid); - if (err == 0) - err = sa_add_projid(zp->z_sa_hdl, tx, projid); - - if (unlikely(err == EEXIST)) - err = 0; - else if (err != 0) - goto out; - else - projid = ZFS_INVALID_PROJID; - } - - if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) - mutex_enter(&zp->z_acl_lock); - mutex_enter(&zp->z_lock); - - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, - &zp->z_pflags, sizeof (zp->z_pflags)); - - if (attrzp) { - if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) - mutex_enter(&attrzp->z_acl_lock); - mutex_enter(&attrzp->z_lock); - SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, - SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, - sizeof (attrzp->z_pflags)); - if (projid != ZFS_INVALID_PROJID) { - attrzp->z_projid = projid; - SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, - SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid, - sizeof (attrzp->z_projid)); - } - } - - if (mask & (ATTR_UID|ATTR_GID)) { - - if (mask & ATTR_UID) { - ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid); - new_uid = zfs_uid_read(ZTOI(zp)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, - &new_uid, sizeof (new_uid)); - if (attrzp) { - SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, - SA_ZPL_UID(zfsvfs), NULL, &new_uid, - sizeof (new_uid)); - ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid); - } - } - - if (mask & ATTR_GID) { - ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid); - new_gid = zfs_gid_read(ZTOI(zp)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), - NULL, &new_gid, sizeof (new_gid)); - if (attrzp) { - SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, - SA_ZPL_GID(zfsvfs), NULL, &new_gid, - sizeof (new_gid)); - ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid); - } - } - if (!(mask & ATTR_MODE)) { - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), - NULL, &new_mode, sizeof (new_mode)); - new_mode = zp->z_mode; - } - err = zfs_acl_chown_setattr(zp); - ASSERT(err == 0); - if (attrzp) { - err = zfs_acl_chown_setattr(attrzp); - ASSERT(err == 0); - } - } - - if (mask & ATTR_MODE) { - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, - &new_mode, sizeof (new_mode)); - zp->z_mode = ZTOI(zp)->i_mode = new_mode; - ASSERT3P(aclp, !=, NULL); - err = zfs_aclset_common(zp, aclp, cr, tx); - ASSERT0(err); - if (zp->z_acl_cached) - zfs_acl_free(zp->z_acl_cached); - zp->z_acl_cached = aclp; - aclp = NULL; - } - - if ((mask & ATTR_ATIME) || zp->z_atime_dirty) { - zp->z_atime_dirty = B_FALSE; - ZFS_TIME_ENCODE(&ip->i_atime, atime); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, - &atime, sizeof (atime)); - } - - if (mask & (ATTR_MTIME | ATTR_SIZE)) { - ZFS_TIME_ENCODE(&vap->va_mtime, mtime); - ZTOI(zp)->i_mtime = zpl_inode_timespec_trunc(vap->va_mtime, - ZTOI(zp)->i_sb->s_time_gran); - - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, - mtime, sizeof (mtime)); - } - - if (mask & (ATTR_CTIME | ATTR_SIZE)) { - ZFS_TIME_ENCODE(&vap->va_ctime, ctime); - ZTOI(zp)->i_ctime = zpl_inode_timespec_trunc(vap->va_ctime, - ZTOI(zp)->i_sb->s_time_gran); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, - ctime, sizeof (ctime)); - } - - if (projid != ZFS_INVALID_PROJID) { - zp->z_projid = projid; - SA_ADD_BULK_ATTR(bulk, count, - SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, - sizeof (zp->z_projid)); - } - - if (attrzp && mask) { - SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, - SA_ZPL_CTIME(zfsvfs), NULL, &ctime, - sizeof (ctime)); - } - - /* - * Do this after setting timestamps to prevent timestamp - * update from toggling bit - */ - - if (xoap && (mask & ATTR_XVATTR)) { - - /* - * 
restore trimmed off masks - * so that return masks can be set for caller. - */ - - if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) { - XVA_SET_REQ(xvap, XAT_APPENDONLY); - } - if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) { - XVA_SET_REQ(xvap, XAT_NOUNLINK); - } - if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) { - XVA_SET_REQ(xvap, XAT_IMMUTABLE); - } - if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) { - XVA_SET_REQ(xvap, XAT_NODUMP); - } - if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) { - XVA_SET_REQ(xvap, XAT_AV_MODIFIED); - } - if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) { - XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); - } - if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) { - XVA_SET_REQ(xvap, XAT_PROJINHERIT); - } - - if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) - ASSERT(S_ISREG(ip->i_mode)); - - zfs_xvattr_set(zp, xvap, tx); - } - - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); - - if (mask != 0) - zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); - - mutex_exit(&zp->z_lock); - if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) - mutex_exit(&zp->z_acl_lock); - - if (attrzp) { - if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) - mutex_exit(&attrzp->z_acl_lock); - mutex_exit(&attrzp->z_lock); - } -out: - if (err == 0 && xattr_count > 0) { - err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, - xattr_count, tx); - ASSERT(err2 == 0); - } - - if (aclp) - zfs_acl_free(aclp); - - if (fuidp) { - zfs_fuid_info_free(fuidp); - fuidp = NULL; - } - - if (err) { - dmu_tx_abort(tx); - if (attrzp) - iput(ZTOI(attrzp)); - if (err == ERESTART) - goto top; - } else { - if (count > 0) - err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); - dmu_tx_commit(tx); - if (attrzp) { - if (err2 == 0 && handle_eadir) - err2 = zfs_setattr_dir(attrzp); - iput(ZTOI(attrzp)); - } - zfs_inode_update(zp); - } - -out2: - if (os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - -out3: - kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks); - kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks); - kmem_free(tmpxvattr, sizeof (xvattr_t)); - ZFS_EXIT(zfsvfs); - return (err); -} - -typedef struct zfs_zlock { - krwlock_t *zl_rwlock; /* lock we acquired */ - znode_t *zl_znode; /* znode we held */ - struct zfs_zlock *zl_next; /* next in list */ -} zfs_zlock_t; - -/* - * Drop locks and release vnodes that were held by zfs_rename_lock(). - */ -static void -zfs_rename_unlock(zfs_zlock_t **zlpp) -{ - zfs_zlock_t *zl; - - while ((zl = *zlpp) != NULL) { - if (zl->zl_znode != NULL) - zfs_iput_async(ZTOI(zl->zl_znode)); - rw_exit(zl->zl_rwlock); - *zlpp = zl->zl_next; - kmem_free(zl, sizeof (*zl)); - } -} - -/* - * Search back through the directory tree, using the ".." entries. - * Lock each directory in the chain to prevent concurrent renames. - * Fail any attempt to move a directory into one of its own descendants. - * XXX - z_parent_lock can overlap with map or grow locks - */ -static int -zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) -{ - zfs_zlock_t *zl; - znode_t *zp = tdzp; - uint64_t rootid = ZTOZSB(zp)->z_root; - uint64_t oidp = zp->z_id; - krwlock_t *rwlp = &szp->z_parent_lock; - krw_t rw = RW_WRITER; - - /* - * First pass write-locks szp and compares to zp->z_id. - * Later passes read-lock zp and compare to zp->z_parent. - */ - do { - if (!rw_tryenter(rwlp, rw)) { - /* - * Another thread is renaming in this path. - * Note that if we are a WRITER, we don't have any - * parent_locks held yet. 
- */ - if (rw == RW_READER && zp->z_id > szp->z_id) { - /* - * Drop our locks and restart - */ - zfs_rename_unlock(&zl); - *zlpp = NULL; - zp = tdzp; - oidp = zp->z_id; - rwlp = &szp->z_parent_lock; - rw = RW_WRITER; - continue; - } else { - /* - * Wait for other thread to drop its locks - */ - rw_enter(rwlp, rw); - } - } - - zl = kmem_alloc(sizeof (*zl), KM_SLEEP); - zl->zl_rwlock = rwlp; - zl->zl_znode = NULL; - zl->zl_next = *zlpp; - *zlpp = zl; - - if (oidp == szp->z_id) /* We're a descendant of szp */ - return (SET_ERROR(EINVAL)); - - if (oidp == rootid) /* We've hit the top */ - return (0); - - if (rw == RW_READER) { /* i.e. not the first pass */ - int error = zfs_zget(ZTOZSB(zp), oidp, &zp); - if (error) - return (error); - zl->zl_znode = zp; - } - (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)), - &oidp, sizeof (oidp)); - rwlp = &zp->z_parent_lock; - rw = RW_READER; - - } while (zp->z_id != sdzp->z_id); - - return (0); -} - -/* - * Move an entry from the provided source directory to the target - * directory. Change the entry name as indicated. - * - * IN: sdip - Source directory containing the "old entry". - * snm - Old entry name. - * tdip - Target directory to contain the "new entry". - * tnm - New entry name. - * cr - credentials of caller. - * flags - case flags - * - * RETURN: 0 on success, error code on failure. - * - * Timestamps: - * sdip,tdip - ctime|mtime updated - */ -/*ARGSUSED*/ -int -zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm, - cred_t *cr, int flags) -{ - znode_t *tdzp, *szp, *tzp; - znode_t *sdzp = ITOZ(sdip); - zfsvfs_t *zfsvfs = ITOZSB(sdip); - zilog_t *zilog; - zfs_dirlock_t *sdl, *tdl; - dmu_tx_t *tx; - zfs_zlock_t *zl; - int cmp, serr, terr; - int error = 0; - int zflg = 0; - boolean_t waited = B_FALSE; - - if (snm == NULL || tnm == NULL) - return (SET_ERROR(EINVAL)); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(sdzp); - zilog = zfsvfs->z_log; - - tdzp = ITOZ(tdip); - ZFS_VERIFY_ZP(tdzp); - - /* - * We check i_sb because snapshots and the ctldir must have different - * super blocks. - */ - if (tdip->i_sb != sdip->i_sb || zfsctl_is_node(tdip)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EXDEV)); - } - - if (zfsvfs->z_utf8 && u8_validate(tnm, - strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EILSEQ)); - } - - if (flags & FIGNORECASE) - zflg |= ZCILOOK; - -top: - szp = NULL; - tzp = NULL; - zl = NULL; - - /* - * This is to prevent the creation of links into attribute space - * by renaming a linked file into/outof an attribute directory. - * See the comment in zfs_link() for why this is considered bad. - */ - if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - /* - * Lock source and target directory entries. To prevent deadlock, - * a lock ordering must be defined. We lock the directory with - * the smallest object id first, or if it's a tie, the one with - * the lexically first name. - */ - if (sdzp->z_id < tdzp->z_id) { - cmp = -1; - } else if (sdzp->z_id > tdzp->z_id) { - cmp = 1; - } else { - /* - * First compare the two name arguments without - * considering any case folding. 
- */ - int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); - - cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); - ASSERT(error == 0 || !zfsvfs->z_utf8); - if (cmp == 0) { - /* - * POSIX: "If the old argument and the new argument - * both refer to links to the same existing file, - * the rename() function shall return successfully - * and perform no other action." - */ - ZFS_EXIT(zfsvfs); - return (0); - } - /* - * If the file system is case-folding, then we may - * have some more checking to do. A case-folding file - * system is either supporting mixed case sensitivity - * access or is completely case-insensitive. Note - * that the file system is always case preserving. - * - * In mixed sensitivity mode case sensitive behavior - * is the default. FIGNORECASE must be used to - * explicitly request case insensitive behavior. - * - * If the source and target names provided differ only - * by case (e.g., a request to rename 'tim' to 'Tim'), - * we will treat this as a special case in the - * case-insensitive mode: as long as the source name - * is an exact match, we will allow this to proceed as - * a name-change request. - */ - if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || - (zfsvfs->z_case == ZFS_CASE_MIXED && - flags & FIGNORECASE)) && - u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, - &error) == 0) { - /* - * case preserving rename request, require exact - * name matches - */ - zflg |= ZCIEXACT; - zflg &= ~ZCILOOK; - } - } - - /* - * If the source and destination directories are the same, we should - * grab the z_name_lock of that directory only once. - */ - if (sdzp == tdzp) { - zflg |= ZHAVELOCK; - rw_enter(&sdzp->z_name_lock, RW_READER); - } - - if (cmp < 0) { - serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, - ZEXISTS | zflg, NULL, NULL); - terr = zfs_dirent_lock(&tdl, - tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); - } else { - terr = zfs_dirent_lock(&tdl, - tdzp, tnm, &tzp, zflg, NULL, NULL); - serr = zfs_dirent_lock(&sdl, - sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, - NULL, NULL); - } - - if (serr) { - /* - * Source entry invalid or not there. - */ - if (!terr) { - zfs_dirent_unlock(tdl); - if (tzp) - iput(ZTOI(tzp)); - } - - if (sdzp == tdzp) - rw_exit(&sdzp->z_name_lock); - - if (strcmp(snm, "..") == 0) - serr = EINVAL; - ZFS_EXIT(zfsvfs); - return (serr); - } - if (terr) { - zfs_dirent_unlock(sdl); - iput(ZTOI(szp)); - - if (sdzp == tdzp) - rw_exit(&sdzp->z_name_lock); - - if (strcmp(tnm, "..") == 0) - terr = EINVAL; - ZFS_EXIT(zfsvfs); - return (terr); - } - - /* - * If we are using project inheritance, means if the directory has - * ZFS_PROJINHERIT set, then its descendant directories will inherit - * not only the project ID, but also the ZFS_PROJINHERIT flag. Under - * such case, we only allow renames into our tree when the project - * IDs are the same. - */ - if (tdzp->z_pflags & ZFS_PROJINHERIT && - tdzp->z_projid != szp->z_projid) { - error = SET_ERROR(EXDEV); - goto out; - } - - /* - * Must have write access at the source to remove the old entry - * and write access at the target to create the new entry. - * Note that if target and source are the same, this can be - * done in a single check. - */ - - if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))) - goto out; - - if (S_ISDIR(ZTOI(szp)->i_mode)) { - /* - * Check to make sure rename is valid. - * Can't do a move like this: /usr/a/b to /usr/a/b/c/d - */ - if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl))) - goto out; - } - - /* - * Does target exist? 
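Stepping back from the hunk above: the locking rule it applies (always acquire the two directory locks in one global order, smallest object id first) is what rules out an ABBA deadlock between two concurrent renames. A self-contained pthread analogy, with the lexical name tie-break of the real code omitted and all types invented:

	#include <pthread.h>
	#include <stdio.h>

	struct dir {
		unsigned long id;
		pthread_mutex_t lock;
	};

	/* Two concurrent renames always lock the pair in the same
	 * order, so neither can hold one lock while waiting on the
	 * other's first lock. */
	static void
	lock_pair(struct dir *a, struct dir *b)
	{
		struct dir *first = (a->id <= b->id) ? a : b;
		struct dir *second = (first == a) ? b : a;

		pthread_mutex_lock(&first->lock);
		if (second != first)		/* same directory: lock once */
			pthread_mutex_lock(&second->lock);
	}

	int
	main(void)
	{
		struct dir d1 = { 10, PTHREAD_MUTEX_INITIALIZER };
		struct dir d2 = { 42, PTHREAD_MUTEX_INITIALIZER };

		lock_pair(&d2, &d1);	/* argument order is irrelevant */
		printf("locked %lu before %lu\n", d1.id, d2.id);
		return (0);
	}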
- */ - if (tzp) { - /* - * Source and target must be the same type. - */ - if (S_ISDIR(ZTOI(szp)->i_mode)) { - if (!S_ISDIR(ZTOI(tzp)->i_mode)) { - error = SET_ERROR(ENOTDIR); - goto out; - } - } else { - if (S_ISDIR(ZTOI(tzp)->i_mode)) { - error = SET_ERROR(EISDIR); - goto out; - } - } - /* - * POSIX dictates that when the source and target - * entries refer to the same file object, rename - * must do nothing and exit without error. - */ - if (szp->z_id == tzp->z_id) { - error = 0; - goto out; - } - } - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); - dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); - dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); - dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); - if (sdzp != tdzp) { - dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); - zfs_sa_upgrade_txholds(tx, tdzp); - } - if (tzp) { - dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); - zfs_sa_upgrade_txholds(tx, tzp); - } - - zfs_sa_upgrade_txholds(tx, szp); - dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); - if (error) { - if (zl != NULL) - zfs_rename_unlock(&zl); - zfs_dirent_unlock(sdl); - zfs_dirent_unlock(tdl); - - if (sdzp == tdzp) - rw_exit(&sdzp->z_name_lock); - - if (error == ERESTART) { - waited = B_TRUE; - dmu_tx_wait(tx); - dmu_tx_abort(tx); - iput(ZTOI(szp)); - if (tzp) - iput(ZTOI(tzp)); - goto top; - } - dmu_tx_abort(tx); - iput(ZTOI(szp)); - if (tzp) - iput(ZTOI(tzp)); - ZFS_EXIT(zfsvfs); - return (error); - } - - if (tzp) /* Attempt to remove the existing target */ - error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL); - - if (error == 0) { - error = zfs_link_create(tdl, szp, tx, ZRENAMING); - if (error == 0) { - szp->z_pflags |= ZFS_AV_MODIFIED; - if (tdzp->z_pflags & ZFS_PROJINHERIT) - szp->z_pflags |= ZFS_PROJINHERIT; - - error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), - (void *)&szp->z_pflags, sizeof (uint64_t), tx); - ASSERT0(error); - - error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); - if (error == 0) { - zfs_log_rename(zilog, tx, TX_RENAME | - (flags & FIGNORECASE ? TX_CI : 0), sdzp, - sdl->dl_name, tdzp, tdl->dl_name, szp); - } else { - /* - * At this point, we have successfully created - * the target name, but have failed to remove - * the source name. Since the create was done - * with the ZRENAMING flag, there are - * complications; for one, the link count is - * wrong. The easiest way to deal with this - * is to remove the newly created target, and - * return the original error. This must - * succeed; fortunately, it is very unlikely to - * fail, since we just created it. - */ - VERIFY3U(zfs_link_destroy(tdl, szp, tx, - ZRENAMING, NULL), ==, 0); - } - } else { - /* - * If we had removed the existing target, the - * subsequent call to zfs_link_create() to add back - * the same entry with the new dnode (szp) should - * not fail, so reaching this error path implies - * no target existed. - */ - ASSERT(tzp == NULL); - } - } - - dmu_tx_commit(tx); -out: - if (zl != NULL) - zfs_rename_unlock(&zl); - - zfs_dirent_unlock(sdl); - zfs_dirent_unlock(tdl); - - zfs_inode_update(sdzp); - if (sdzp == tdzp) - rw_exit(&sdzp->z_name_lock); - - if (sdzp != tdzp) - zfs_inode_update(tdzp); - - zfs_inode_update(szp); - iput(ZTOI(szp)); - if (tzp) { - zfs_inode_update(tzp); - iput(ZTOI(tzp)); - } - - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Insert the indicated symbolic reference entry into the directory. - * - * IN: dip - Directory to contain new symbolic link.
- * name - Name of directory entry in dip. - * vap - Attributes of new entry. - * link - Name for new symlink entry. - * cr - credentials of caller. - * flags - case flags - * - * OUT: ipp - Inode for new symbolic link. - * - * RETURN: 0 on success, error code on failure. - * - * Timestamps: - * dip - ctime|mtime updated - */ -/*ARGSUSED*/ -int -zfs_symlink(struct inode *dip, char *name, vattr_t *vap, char *link, - struct inode **ipp, cred_t *cr, int flags) -{ - znode_t *zp, *dzp = ITOZ(dip); - zfs_dirlock_t *dl; - dmu_tx_t *tx; - zfsvfs_t *zfsvfs = ITOZSB(dip); - zilog_t *zilog; - uint64_t len = strlen(link); - int error; - int zflg = ZNEW; - zfs_acl_ids_t acl_ids; - boolean_t fuid_dirtied; - uint64_t txtype = TX_SYMLINK; - boolean_t waited = B_FALSE; - - ASSERT(S_ISLNK(vap->va_mode)); - - if (name == NULL) - return (SET_ERROR(EINVAL)); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); - zilog = zfsvfs->z_log; - - if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), - NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EILSEQ)); - } - if (flags & FIGNORECASE) - zflg |= ZCILOOK; - - if (len > MAXPATHLEN) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(ENAMETOOLONG)); - } - - if ((error = zfs_acl_ids_create(dzp, 0, - vap, cr, NULL, &acl_ids)) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } -top: - *ipp = NULL; - - /* - * Attempt to lock directory; fail if entry already exists. - */ - error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); - if (error) { - zfs_acl_ids_free(&acl_ids); - ZFS_EXIT(zfsvfs); - return (error); - } - - if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { - zfs_acl_ids_free(&acl_ids); - zfs_dirent_unlock(dl); - ZFS_EXIT(zfsvfs); - return (error); - } - - if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) { - zfs_acl_ids_free(&acl_ids); - zfs_dirent_unlock(dl); - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EDQUOT)); - } - tx = dmu_tx_create(zfsvfs->z_os); - fuid_dirtied = zfsvfs->z_fuid_dirty; - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); - dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + - ZFS_SA_BASE_ATTR_SIZE + len); - dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); - if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - acl_ids.z_aclp->z_acl_bytes); - } - if (fuid_dirtied) - zfs_fuid_txhold(zfsvfs, tx); - error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); - if (error) { - zfs_dirent_unlock(dl); - if (error == ERESTART) { - waited = B_TRUE; - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - zfs_acl_ids_free(&acl_ids); - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * Create a new object for the symlink. - * for version 4 ZPL datsets the symlink will be an SA attribute - */ - zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); - - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); - - mutex_enter(&zp->z_lock); - if (zp->z_is_sa) - error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), - link, len, tx); - else - zfs_sa_symlink(zp, link, len, tx); - mutex_exit(&zp->z_lock); - - zp->z_size = len; - (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), - &zp->z_size, sizeof (zp->z_size), tx); - /* - * Insert the new object into the directory. 
- */ - error = zfs_link_create(dl, zp, tx, ZNEW); - if (error != 0) { - zfs_znode_delete(zp, tx); - remove_inode_hash(ZTOI(zp)); - } else { - if (flags & FIGNORECASE) - txtype |= TX_CI; - zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); - - zfs_inode_update(dzp); - zfs_inode_update(zp); - } - - zfs_acl_ids_free(&acl_ids); - - dmu_tx_commit(tx); - - zfs_dirent_unlock(dl); - - if (error == 0) { - *ipp = ZTOI(zp); - - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - } else { - iput(ZTOI(zp)); - } - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Return, in the buffer contained in the provided uio structure, - * the symbolic path referred to by ip. - * - * IN: ip - inode of symbolic link - * uio - structure to contain the link path. - * cr - credentials of caller. - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * ip - atime updated - */ -/* ARGSUSED */ -int -zfs_readlink(struct inode *ip, uio_t *uio, cred_t *cr) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - int error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - mutex_enter(&zp->z_lock); - if (zp->z_is_sa) - error = sa_lookup_uio(zp->z_sa_hdl, - SA_ZPL_SYMLINK(zfsvfs), uio); - else - error = zfs_sa_readlink(zp, uio); - mutex_exit(&zp->z_lock); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Insert a new entry into directory tdip referencing sip. - * - * IN: tdip - Directory to contain new entry. - * sip - inode of new entry. - * name - name of new entry. - * cr - credentials of caller. - * flags - case flags. - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * tdip - ctime|mtime updated - * sip - ctime updated - */ -/* ARGSUSED */ -int -zfs_link(struct inode *tdip, struct inode *sip, char *name, cred_t *cr, - int flags) -{ - znode_t *dzp = ITOZ(tdip); - znode_t *tzp, *szp; - zfsvfs_t *zfsvfs = ITOZSB(tdip); - zilog_t *zilog; - zfs_dirlock_t *dl; - dmu_tx_t *tx; - int error; - int zf = ZNEW; - uint64_t parent; - uid_t owner; - boolean_t waited = B_FALSE; - boolean_t is_tmpfile = 0; - uint64_t txg; -#ifdef HAVE_TMPFILE - is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE)); -#endif - ASSERT(S_ISDIR(tdip->i_mode)); - - if (name == NULL) - return (SET_ERROR(EINVAL)); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); - zilog = zfsvfs->z_log; - - /* - * POSIX dictates that we return EPERM here. - * Better choices include ENOTSUP or EISDIR. - */ - if (S_ISDIR(sip->i_mode)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EPERM)); - } - - szp = ITOZ(sip); - ZFS_VERIFY_ZP(szp); - - /* - * If we are using project inheritance, means if the directory has - * ZFS_PROJINHERIT set, then its descendant directories will inherit - * not only the project ID, but also the ZFS_PROJINHERIT flag. Under - * such case, we only allow hard link creation in our tree when the - * project IDs are the same. - */ - if (dzp->z_pflags & ZFS_PROJINHERIT && dzp->z_projid != szp->z_projid) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EXDEV)); - } - - /* - * We check i_sb because snapshots and the ctldir must have different - * super blocks. 
- */ - if (sip->i_sb != tdip->i_sb || zfsctl_is_node(sip)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EXDEV)); - } - - /* Prevent links to .zfs/shares files */ - - if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), - &parent, sizeof (uint64_t))) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - if (parent == zfsvfs->z_shares_dir) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EPERM)); - } - - if (zfsvfs->z_utf8 && u8_validate(name, - strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EILSEQ)); - } - if (flags & FIGNORECASE) - zf |= ZCILOOK; - - /* - * We do not support links between attributes and non-attributes - * because of the potential security risk of creating links - * into "normal" file space in order to circumvent restrictions - * imposed in attribute space. - */ - if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid), - cr, ZFS_OWNER); - if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EPERM)); - } - - if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { - ZFS_EXIT(zfsvfs); - return (error); - } - -top: - /* - * Attempt to lock directory; fail if entry already exists. - */ - error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL); - if (error) { - ZFS_EXIT(zfsvfs); - return (error); - } - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); - dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - if (is_tmpfile) - dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - - zfs_sa_upgrade_txholds(tx, szp); - zfs_sa_upgrade_txholds(tx, dzp); - error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); - if (error) { - zfs_dirent_unlock(dl); - if (error == ERESTART) { - waited = B_TRUE; - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (error); - } - /* unmark z_unlinked so zfs_link_create will not reject */ - if (is_tmpfile) - szp->z_unlinked = B_FALSE; - error = zfs_link_create(dl, szp, tx, 0); - - if (error == 0) { - uint64_t txtype = TX_LINK; - /* - * tmpfile is created to be in z_unlinkedobj, so remove it. - * Also, we don't log in ZIL, because all previous file - * operation on the tmpfile are ignored by ZIL. Instead we - * always wait for txg to sync to make sure all previous - * operation are sync safe. - */ - if (is_tmpfile) { - VERIFY(zap_remove_int(zfsvfs->z_os, - zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0); - } else { - if (flags & FIGNORECASE) - txtype |= TX_CI; - zfs_log_link(zilog, tx, txtype, dzp, szp, name); - } - } else if (is_tmpfile) { - /* restore z_unlinked since when linking failed */ - szp->z_unlinked = B_TRUE; - } - txg = dmu_tx_get_txg(tx); - dmu_tx_commit(tx); - - zfs_dirent_unlock(dl); - - if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - if (is_tmpfile) - txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg); - - zfs_inode_update(dzp); - zfs_inode_update(szp); - ZFS_EXIT(zfsvfs); - return (error); -} - -static void -zfs_putpage_commit_cb(void *arg) -{ - struct page *pp = arg; - - ClearPageError(pp); - end_page_writeback(pp); -} - -/* - * Push a page out to disk, once the page is on stable storage the - * registered commit callback will be run as notification of completion. - * - * IN: ip - page mapped for inode. 
- * pp - page to push (page is locked) - * wbc - writeback control data - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * ip - ctime|mtime updated - */ -/* ARGSUSED */ -int -zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - loff_t offset; - loff_t pgoff; - unsigned int pglen; - dmu_tx_t *tx; - caddr_t va; - int err = 0; - uint64_t mtime[2], ctime[2]; - sa_bulk_attr_t bulk[3]; - int cnt = 0; - struct address_space *mapping; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - ASSERT(PageLocked(pp)); - - pgoff = page_offset(pp); /* Page byte-offset in file */ - offset = i_size_read(ip); /* File length in bytes */ - pglen = MIN(PAGE_SIZE, /* Page length in bytes */ - P2ROUNDUP(offset, PAGE_SIZE)-pgoff); - - /* Page is beyond end of file */ - if (pgoff >= offset) { - unlock_page(pp); - ZFS_EXIT(zfsvfs); - return (0); - } - - /* Truncate page length to end of file */ - if (pgoff + pglen > offset) - pglen = offset - pgoff; - -#if 0 - /* - * FIXME: Allow mmap writes past its quota. The correct fix - * is to register a page_mkwrite() handler to count the page - * against its quota when it is about to be dirtied. - */ - if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, - KUID_TO_SUID(ip->i_uid)) || - zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, - KGID_TO_SGID(ip->i_gid)) || - (zp->z_projid != ZFS_DEFAULT_PROJID && - zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, - zp->z_projid))) { - err = EDQUOT; - } -#endif - - /* - * The ordering here is critical and must adhere to the following - * rules in order to avoid deadlocking in either zfs_read() or - * zfs_free_range() due to a lock inversion. - * - * 1) The page must be unlocked prior to acquiring the range lock. - * This is critical because zfs_read() calls find_lock_page() - * which may block on the page lock while holding the range lock. - * - * 2) Before setting or clearing write back on a page the range lock - * must be held in order to prevent a lock inversion with the - * zfs_free_range() function. - * - * This presents a problem because upon entering this function the - * page lock is already held. To safely acquire the range lock the - * page lock must be dropped. This creates a window where another - * process could truncate, invalidate, dirty, or write out the page. - * - * Therefore, after successfully reacquiring the range and page locks - * the current page state is checked. In the common case everything - * will be as is expected and it can be written out. However, if - * the page state has changed it must be handled accordingly. 
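The rationale above is a classic drop-and-revalidate pattern: release the lock you hold, take both locks in the required order, then recheck the state the first lock protected before trusting it. A self-contained userspace analogy (the locks and state here are invented for illustration; the code that follows is the real thing):

	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	static pthread_mutex_t range_lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;
	static bool page_dirty = true;

	/* Called with page_lock held; the ordering rule says range_lock
	 * must come first, so drop, reacquire in order, revalidate. */
	static int
	writeback_page(void)
	{
		pthread_mutex_unlock(&page_lock);
		pthread_mutex_lock(&range_lock);
		pthread_mutex_lock(&page_lock);

		if (!page_dirty) {		/* changed while unlocked? */
			pthread_mutex_unlock(&page_lock);
			pthread_mutex_unlock(&range_lock);
			return (0);		/* nothing left to do */
		}
		page_dirty = false;		/* "write it out" */
		pthread_mutex_unlock(&page_lock);
		pthread_mutex_unlock(&range_lock);
		return (1);
	}

	int
	main(void)
	{
		pthread_mutex_lock(&page_lock);
		printf("wrote page: %d\n", writeback_page());
		return (0);
	}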
- */ - mapping = pp->mapping; - redirty_page_for_writepage(wbc, pp); - unlock_page(pp); - - locked_range_t *lr = rangelock_enter(&zp->z_rangelock, - pgoff, pglen, RL_WRITER); - lock_page(pp); - - /* Page mapping changed or it was no longer dirty, we're done */ - if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) { - unlock_page(pp); - rangelock_exit(lr); - ZFS_EXIT(zfsvfs); - return (0); - } - - /* Another process started writeback; block if required */ - if (PageWriteback(pp)) { - unlock_page(pp); - rangelock_exit(lr); - - if (wbc->sync_mode != WB_SYNC_NONE) { - if (PageWriteback(pp)) - wait_on_page_bit(pp, PG_writeback); - } - - ZFS_EXIT(zfsvfs); - return (0); - } - - /* Clear the dirty flag now that the required locks are held */ - if (!clear_page_dirty_for_io(pp)) { - unlock_page(pp); - rangelock_exit(lr); - ZFS_EXIT(zfsvfs); - return (0); - } - - /* - * Counterpart for redirty_page_for_writepage() above. This page - * was in fact not skipped and should not be counted as if it were. - */ - wbc->pages_skipped--; - set_page_writeback(pp); - unlock_page(pp); - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen); - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - zfs_sa_upgrade_txholds(tx, zp); - - err = dmu_tx_assign(tx, TXG_NOWAIT); - if (err != 0) { - if (err == ERESTART) - dmu_tx_wait(tx); - - dmu_tx_abort(tx); - __set_page_dirty_nobuffers(pp); - ClearPageError(pp); - end_page_writeback(pp); - rangelock_exit(lr); - ZFS_EXIT(zfsvfs); - return (err); - } - - va = kmap(pp); - ASSERT3U(pglen, <=, PAGE_SIZE); - dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx); - kunmap(pp); - - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, - &zp->z_pflags, 8); - - /* Preserve the mtime and ctime provided by the inode */ - ZFS_TIME_ENCODE(&ip->i_mtime, mtime); - ZFS_TIME_ENCODE(&ip->i_ctime, ctime); - zp->z_atime_dirty = B_FALSE; - zp->z_seq++; - - err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); - - zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0, - zfs_putpage_commit_cb, pp); - dmu_tx_commit(tx); - - rangelock_exit(lr); - - if (wbc->sync_mode != WB_SYNC_NONE) { - /* - * Note that this is rarely called under writepages(), because - * writepages() normally handles the entire commit for - * performance reasons. - */ - zil_commit(zfsvfs->z_log, zp->z_id); - } - - ZFS_EXIT(zfsvfs); - return (err); -} - -/* - * Update the system attributes when the inode has been dirtied. For the - * moment we only update the mode, atime, mtime, and ctime. - */ -int -zfs_dirty_inode(struct inode *ip, int flags) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - dmu_tx_t *tx; - uint64_t mode, atime[2], mtime[2], ctime[2]; - sa_bulk_attr_t bulk[4]; - int error = 0; - int cnt = 0; - - if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os)) - return (0); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - -#ifdef I_DIRTY_TIME - /* - * This is the lazytime semantic introduced in Linux 4.0. - * We are called with I_DIRTY_TIME from update_time() only when - * lazytime is set (note that I_DIRTY_SYNC will also be set if - * lazytime is not). - * Fortunately mtime and ctime are managed within ZFS itself, so we - * only need to dirty atime.
- */ - if (flags == I_DIRTY_TIME) { - zp->z_atime_dirty = B_TRUE; - goto out; - } -#endif - - tx = dmu_tx_create(zfsvfs->z_os); - - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - zfs_sa_upgrade_txholds(tx, zp); - - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - goto out; - } - - mutex_enter(&zp->z_lock); - zp->z_atime_dirty = B_FALSE; - - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); - - /* Preserve the mode, mtime and ctime provided by the inode */ - ZFS_TIME_ENCODE(&ip->i_atime, atime); - ZFS_TIME_ENCODE(&ip->i_mtime, mtime); - ZFS_TIME_ENCODE(&ip->i_ctime, ctime); - mode = ip->i_mode; - - zp->z_mode = mode; - - error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); - mutex_exit(&zp->z_lock); - - dmu_tx_commit(tx); -out: - ZFS_EXIT(zfsvfs); - return (error); -} - -/*ARGSUSED*/ -void -zfs_inactive(struct inode *ip) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - uint64_t atime[2]; - int error; - int need_unlock = 0; - - /* Only read lock if we haven't already write locked, e.g. rollback */ - if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) { - need_unlock = 1; - rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); - } - if (zp->z_sa_hdl == NULL) { - if (need_unlock) - rw_exit(&zfsvfs->z_teardown_inactive_lock); - return; - } - - if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) { - dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); - - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - zfs_sa_upgrade_txholds(tx, zp); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - } else { - ZFS_TIME_ENCODE(&ip->i_atime, atime); - mutex_enter(&zp->z_lock); - (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), - (void *)&atime, sizeof (atime), tx); - zp->z_atime_dirty = B_FALSE; - mutex_exit(&zp->z_lock); - dmu_tx_commit(tx); - } - } - - zfs_zinactive(zp); - if (need_unlock) - rw_exit(&zfsvfs->z_teardown_inactive_lock); -} - -/* - * Bounds-check the seek operation. - * - * IN: ip - inode seeking within - * ooff - old file offset - * noffp - pointer to new file offset - * - * RETURN: 0 if success - * EINVAL if new offset invalid - */ -/* ARGSUSED */ -int -zfs_seek(struct inode *ip, offset_t ooff, offset_t *noffp) -{ - if (S_ISDIR(ip->i_mode)) - return (0); - return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); -} - -/* - * Fill pages with data from the disk. - */ -static int -zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - objset_t *os; - struct page *cur_pp; - u_offset_t io_off, total; - size_t io_len; - loff_t i_size; - unsigned page_idx; - int err; - - os = zfsvfs->z_os; - io_len = nr_pages << PAGE_SHIFT; - i_size = i_size_read(ip); - io_off = page_offset(pl[0]); - - if (io_off + io_len > i_size) - io_len = i_size - io_off; - - /* - * Iterate over list of pages and read each page individually. 
- */ - page_idx = 0; - for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { - caddr_t va; - - cur_pp = pl[page_idx++]; - va = kmap(cur_pp); - err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va, - DMU_READ_PREFETCH); - kunmap(cur_pp); - if (err) { - /* convert checksum errors into IO errors */ - if (err == ECKSUM) - err = SET_ERROR(EIO); - return (err); - } - } - - return (0); -} - -/* - * Uses zfs_fillpage to read data from the file and fill the pages. - * - * IN: ip - inode of file to get data from. - * pl - list of pages to read - * nr_pages - number of pages to read - * - * RETURN: 0 on success, error code on failure. - * - * Timestamps: - * vp - atime updated - */ -/* ARGSUSED */ -int -zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - int err; - - if (pl == NULL) - return (0); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - err = zfs_fillpage(ip, pl, nr_pages); - - ZFS_EXIT(zfsvfs); - return (err); -} - -/* - * Check ZFS specific permissions to memory map a section of a file. - * - * IN: ip - inode of the file to mmap - * off - file offset - * addrp - start address in memory region - * len - length of memory region - * vm_flags- address flags - * - * RETURN: 0 if success - * error code if failure - */ -/*ARGSUSED*/ -int -zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len, - unsigned long vm_flags) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if ((vm_flags & VM_WRITE) && (zp->z_pflags & - (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EPERM)); - } - - if ((vm_flags & (VM_READ | VM_EXEC)) && - (zp->z_pflags & ZFS_AV_QUARANTINED)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EACCES)); - } - - if (off < 0 || len > MAXOFFSET_T - off) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(ENXIO)); - } - - ZFS_EXIT(zfsvfs); - return (0); -} - -/* - * convoff - converts the given data (start, whence) to the - * given whence. - */ -int -convoff(struct inode *ip, flock64_t *lckdat, int whence, offset_t offset) -{ - vattr_t vap; - int error; - - if ((lckdat->l_whence == SEEK_END) || (whence == SEEK_END)) { - if ((error = zfs_getattr(ip, &vap, 0, CRED()))) - return (error); - } - - switch (lckdat->l_whence) { - case SEEK_CUR: - lckdat->l_start += offset; - break; - case SEEK_END: - lckdat->l_start += vap.va_size; - /* FALLTHRU */ - case SEEK_SET: - break; - default: - return (SET_ERROR(EINVAL)); - } - - if (lckdat->l_start < 0) - return (SET_ERROR(EINVAL)); - - switch (whence) { - case SEEK_CUR: - lckdat->l_start -= offset; - break; - case SEEK_END: - lckdat->l_start -= vap.va_size; - /* FALLTHRU */ - case SEEK_SET: - break; - default: - return (SET_ERROR(EINVAL)); - } - - lckdat->l_whence = (short)whence; - return (0); -} - -/* - * Free or allocate space in a file. Currently, this function only - * supports the `F_FREESP' command. However, this command is somewhat - * misnamed, as its functionality includes the ability to allocate as - * well as free space. - * - * IN: ip - inode of file to free data in. - * cmd - action to take (only F_FREESP supported). - * bfp - section of file to free/alloc. - * flag - current file open mode flags. - * offset - current file offset. - * cr - credentials of caller. - * - * RETURN: 0 on success, error code on failure. 
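The arithmetic convoff() performs above is easy to check by hand: l_start is first made absolute using its own l_whence, then rebased to the requested whence. A worked example in plain C (no VFS types; values invented):

	#include <assert.h>

	int
	main(void)
	{
		long offset = 100;	/* current file offset */
		long l_start = 50;	/* relative to SEEK_CUR */

		l_start += offset;	/* SEEK_CUR -> absolute: 150 */
		assert(l_start >= 0);	/* a negative start is EINVAL */
		/* target whence is SEEK_SET, so nothing is subtracted */
		assert(l_start == 150);
		return (0);
	}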
- * - * Timestamps: - * ip - ctime|mtime updated - */ -/* ARGSUSED */ -int -zfs_space(struct inode *ip, int cmd, flock64_t *bfp, int flag, - offset_t offset, cred_t *cr) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - uint64_t off, len; - int error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if (cmd != F_FREESP) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - /* - * Callers might not be able to detect properly that we are read-only, - * so check it explicitly here. - */ - if (zfs_is_readonly(zfsvfs)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EROFS)); - } - - if ((error = convoff(ip, bfp, SEEK_SET, offset))) { - ZFS_EXIT(zfsvfs); - return (error); - } - - if (bfp->l_len < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - /* - * Permissions aren't checked on Solaris because on this OS - * zfs_space() can only be called with an opened file handle. - * On Linux we can get here through truncate_range() which - * operates directly on inodes, so we need to check access rights. - */ - if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) { - ZFS_EXIT(zfsvfs); - return (error); - } - - off = bfp->l_start; - len = bfp->l_len; /* 0 means from off to end of file */ - - error = zfs_freesp(zp, off, len, flag, TRUE); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/*ARGSUSED*/ -int -zfs_fid(struct inode *ip, fid_t *fidp) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - uint32_t gen; - uint64_t gen64; - uint64_t object = zp->z_id; - zfid_short_t *zfid; - int size, i, error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), - &gen64, sizeof (uint64_t))) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - - gen = (uint32_t)gen64; - - size = SHORT_FID_LEN; - - zfid = (zfid_short_t *)fidp; - - zfid->zf_len = size; - - for (i = 0; i < sizeof (zfid->zf_object); i++) - zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); - - /* Must have a non-zero generation number to distinguish from .zfs */ - if (gen == 0) - gen = 1; - for (i = 0; i < sizeof (zfid->zf_gen); i++) - zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); - - ZFS_EXIT(zfsvfs); - return (0); -} - -/*ARGSUSED*/ -int -zfs_getsecattr(struct inode *ip, vsecattr_t *vsecp, int flag, cred_t *cr) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - int error; - boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - error = zfs_getacl(zp, vsecp, skipaclchk, cr); - ZFS_EXIT(zfsvfs); - - return (error); -} - -/*ARGSUSED*/ -int -zfs_setsecattr(struct inode *ip, vsecattr_t *vsecp, int flag, cred_t *cr) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - int error; - boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; - zilog_t *zilog = zfsvfs->z_log; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - error = zfs_setacl(zp, vsecp, skipaclchk, cr); - - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - ZFS_EXIT(zfsvfs); - return (error); -} - -#ifdef HAVE_UIO_ZEROCOPY -/* - * The smallest read we may consider to loan out an arcbuf. - * This must be a power of 2. - */ -int zcr_blksz_min = (1 << 10); /* 1K */ -/* - * If set to less than the file block size, allow loaning out of an - * arcbuf for a partial block read. This must be a power of 2. 
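The byte-packing loops in zfs_fid() earlier in this hunk store the object number little-endian, one byte per array slot. A round-trip check, assuming the 6-byte zf_object of the short fid (the decode loop is a hypothetical inverse for illustration, not ZFS code):

	#include <assert.h>
	#include <stdint.h>

	int
	main(void)
	{
		uint8_t zf_object[6];
		uint64_t object = 0x123456789aULL;	/* sample object id */
		uint64_t decoded = 0;
		int i;

		for (i = 0; i < 6; i++)		/* encode, as in the diff */
			zf_object[i] = (uint8_t)(object >> (8 * i));
		for (i = 0; i < 6; i++)		/* hypothetical inverse */
			decoded |= (uint64_t)zf_object[i] << (8 * i);
		assert(decoded == object);
		return (0);
	}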
- */ -int zcr_blksz_max = (1 << 17); /* 128K */ - -/*ARGSUSED*/ -static int -zfs_reqzcbuf(struct inode *ip, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - int max_blksz = zfsvfs->z_max_blksz; - uio_t *uio = &xuio->xu_uio; - ssize_t size = uio->uio_resid; - offset_t offset = uio->uio_loffset; - int blksz; - int fullblk, i; - arc_buf_t *abuf; - ssize_t maxsize; - int preamble, postamble; - - if (xuio->xu_type != UIOTYPE_ZEROCOPY) - return (SET_ERROR(EINVAL)); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - switch (ioflag) { - case UIO_WRITE: - /* - * Loan out an arc_buf for write if write size is bigger than - * max_blksz, and the file's block size is also max_blksz. - */ - blksz = max_blksz; - if (size < blksz || zp->z_blksz != blksz) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - /* - * Caller requests buffers for write before knowing where the - * write offset might be (e.g. NFS TCP write). - */ - if (offset == -1) { - preamble = 0; - } else { - preamble = P2PHASE(offset, blksz); - if (preamble) { - preamble = blksz - preamble; - size -= preamble; - } - } - - postamble = P2PHASE(size, blksz); - size -= postamble; - - fullblk = size / blksz; - (void) dmu_xuio_init(xuio, - (preamble != 0) + fullblk + (postamble != 0)); - - /* - * Have to fix iov base/len for partial buffers. They - * currently represent full arc_buf's. - */ - if (preamble) { - /* data begins in the middle of the arc_buf */ - abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), - blksz); - ASSERT(abuf); - (void) dmu_xuio_add(xuio, abuf, - blksz - preamble, preamble); - } - - for (i = 0; i < fullblk; i++) { - abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), - blksz); - ASSERT(abuf); - (void) dmu_xuio_add(xuio, abuf, 0, blksz); - } - - if (postamble) { - /* data ends in the middle of the arc_buf */ - abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), - blksz); - ASSERT(abuf); - (void) dmu_xuio_add(xuio, abuf, 0, postamble); - } - break; - case UIO_READ: - /* - * Loan out an arc_buf for read if the read size is larger than - * the current file block size. Block alignment is not - * considered. Partial arc_buf will be loaned out for read. - */ - blksz = zp->z_blksz; - if (blksz < zcr_blksz_min) - blksz = zcr_blksz_min; - if (blksz > zcr_blksz_max) - blksz = zcr_blksz_max; - /* avoid potential complexity of dealing with it */ - if (blksz > max_blksz) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - maxsize = zp->z_size - uio->uio_loffset; - if (size > maxsize) - size = maxsize; - - if (size < blksz) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - break; - default: - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - uio->uio_extflg = UIO_XUIO; - XUIO_XUZC_RW(xuio) = ioflag; - ZFS_EXIT(zfsvfs); - return (0); -} - -/*ARGSUSED*/ -static int -zfs_retzcbuf(struct inode *ip, xuio_t *xuio, cred_t *cr) -{ - int i; - arc_buf_t *abuf; - int ioflag = XUIO_XUZC_RW(xuio); - - ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY); - - i = dmu_xuio_cnt(xuio); - while (i-- > 0) { - abuf = dmu_xuio_arcbuf(xuio, i); - /* - * if abuf == NULL, it must be a write buffer - * that has been returned in zfs_write(). 
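The preamble/postamble split in the UIO_WRITE case above is power-of-two arithmetic: round the offset up to the next block boundary for the preamble, then peel the unaligned tail off as the postamble. A checkable example with P2PHASE() expanded by hand and invented sizes (a 300 KiB write at offset 20 KiB with 128 KiB blocks):

	#include <assert.h>

	#define	P2PHASE(x, align)	((x) & ((align) - 1))

	int
	main(void)
	{
		long blksz = 128 * 1024;
		long offset = 20 * 1024;
		long size = 300 * 1024;
		long preamble = P2PHASE(offset, blksz);

		if (preamble != 0)
			preamble = blksz - preamble;	/* up to next block */
		size -= preamble;

		long postamble = P2PHASE(size, blksz);	/* unaligned tail */
		size -= postamble;

		assert(preamble == 108 * 1024);
		assert(size / blksz == 1);		/* one full block */
		assert(postamble == 64 * 1024);
		return (0);
	}

So this request would loan out three arc_bufs: one for the 108 KiB preamble, one full 128 KiB block, and one for the 64 KiB postamble.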
- */ - if (abuf) - dmu_return_arcbuf(abuf); - ASSERT(abuf || ioflag == UIO_WRITE); - } - - dmu_xuio_fini(xuio); - return (0); -} -#endif /* HAVE_UIO_ZEROCOPY */ - -#if defined(_KERNEL) -EXPORT_SYMBOL(zfs_open); -EXPORT_SYMBOL(zfs_close); -EXPORT_SYMBOL(zfs_read); -EXPORT_SYMBOL(zfs_write); -EXPORT_SYMBOL(zfs_access); -EXPORT_SYMBOL(zfs_lookup); -EXPORT_SYMBOL(zfs_create); -EXPORT_SYMBOL(zfs_tmpfile); -EXPORT_SYMBOL(zfs_remove); -EXPORT_SYMBOL(zfs_mkdir); -EXPORT_SYMBOL(zfs_rmdir); -EXPORT_SYMBOL(zfs_readdir); -EXPORT_SYMBOL(zfs_fsync); -EXPORT_SYMBOL(zfs_getattr); -EXPORT_SYMBOL(zfs_getattr_fast); -EXPORT_SYMBOL(zfs_setattr); -EXPORT_SYMBOL(zfs_rename); -EXPORT_SYMBOL(zfs_symlink); -EXPORT_SYMBOL(zfs_readlink); -EXPORT_SYMBOL(zfs_link); -EXPORT_SYMBOL(zfs_inactive); -EXPORT_SYMBOL(zfs_space); -EXPORT_SYMBOL(zfs_fid); -EXPORT_SYMBOL(zfs_getsecattr); -EXPORT_SYMBOL(zfs_setsecattr); -EXPORT_SYMBOL(zfs_getpage); -EXPORT_SYMBOL(zfs_putpage); -EXPORT_SYMBOL(zfs_dirty_inode); -EXPORT_SYMBOL(zfs_map); - -/* BEGIN CSTYLED */ -module_param(zfs_delete_blocks, ulong, 0644); -MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async"); -module_param(zfs_read_chunk_size, ulong, 0644); -MODULE_PARM_DESC(zfs_read_chunk_size, "Bytes to read per chunk"); -/* END CSTYLED */ - -#endif diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c deleted file mode 100644 index 549c701a0..000000000 --- a/module/zfs/zfs_znode.c +++ /dev/null @@ -1,2234 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. 
- */ - -/* Portions Copyright 2007 Jeremy Teo */ - -#ifdef _KERNEL -#include <sys/types.h> -#include <sys/param.h> -#include <sys/time.h> -#include <sys/sysmacros.h> -#include <sys/mntent.h> -#include <sys/u8_textprep.h> -#include <sys/dsl_dataset.h> -#include <sys/vfs.h> -#include <sys/vnode.h> -#include <sys/file.h> -#include <sys/kmem.h> -#include <sys/errno.h> -#include <sys/mode.h> -#include <sys/atomic.h> -#include <sys/zfs_dir.h> -#include <sys/zfs_acl.h> -#include <sys/zfs_ioctl.h> -#include <sys/zfs_rlock.h> -#include <sys/zfs_fuid.h> -#include <sys/zfs_vnops.h> -#include <sys/zfs_ctldir.h> -#include <sys/dnode.h> -#include <sys/fs/zfs.h> -#include <sys/zpl.h> -#endif /* _KERNEL */ - -#include <sys/dmu.h> -#include <sys/dmu_objset.h> -#include <sys/dmu_tx.h> -#include <sys/refcount.h> -#include <sys/stat.h> -#include <sys/zap.h> -#include <sys/zfs_znode.h> -#include <sys/sa.h> -#include <sys/zfs_sa.h> -#include <sys/zfs_stat.h> - -#include "zfs_prop.h" -#include "zfs_comutil.h" - -/* - * Functions needed for userland (ie: libzpool) are not put under - * #ifdef_KERNEL; the rest of the functions have dependencies - * (such as VFS logic) that will not compile easily in userland. - */ -#ifdef _KERNEL - -static kmem_cache_t *znode_cache = NULL; -static kmem_cache_t *znode_hold_cache = NULL; -unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ; - -/* - * This is used by the test suite so that it can delay znodes from being - * freed in order to inspect the unlinked set. - */ -int zfs_unlink_suspend_progress = 0; - -/* - * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on - * z_rangelock. It will modify the offset and length of the lock to reflect - * znode-specific information, and convert RL_APPEND to RL_WRITER. This is - * called with the rangelock_t's rl_lock held, which avoids races. - */ -static void -zfs_rangelock_cb(locked_range_t *new, void *arg) -{ - znode_t *zp = arg; - - /* - * If in append mode, convert to writer and lock starting at the - * current end of file. - */ - if (new->lr_type == RL_APPEND) { - new->lr_offset = zp->z_size; - new->lr_type = RL_WRITER; - } - - /* - * If we need to grow the block size then lock the whole file range. 
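 *
 * (Editorial example with illustrative numbers: an RL_APPEND request on
 * a file with z_size = 96K becomes an RL_WRITER lock starting at offset
 * 96K; and if the resulting end_size exceeds a z_blksz that may still
 * grow toward z_max_blksz, the lock is widened to the whole file,
 * [0, UINT64_MAX], so the block-size change cannot race with other I/O.)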
- */
- uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length);
- if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
- zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) {
- new->lr_offset = 0;
- new->lr_length = UINT64_MAX;
- }
-}
-
-/*ARGSUSED*/
-static int
-zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
-{
- znode_t *zp = buf;
-
- inode_init_once(ZTOI(zp));
- list_link_init(&zp->z_link_node);
-
- mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
- rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
- rw_init(&zp->z_name_lock, NULL, RW_NOLOCKDEP, NULL);
- mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
- rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL);
-
- rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp);
-
- zp->z_dirlocks = NULL;
- zp->z_acl_cached = NULL;
- zp->z_xattr_cached = NULL;
- zp->z_xattr_parent = 0;
- zp->z_moved = B_FALSE;
- return (0);
-}
-
-/*ARGSUSED*/
-static void
-zfs_znode_cache_destructor(void *buf, void *arg)
-{
- znode_t *zp = buf;
-
- ASSERT(!list_link_active(&zp->z_link_node));
- mutex_destroy(&zp->z_lock);
- rw_destroy(&zp->z_parent_lock);
- rw_destroy(&zp->z_name_lock);
- mutex_destroy(&zp->z_acl_lock);
- rw_destroy(&zp->z_xattr_lock);
- rangelock_fini(&zp->z_rangelock);
-
- ASSERT(zp->z_dirlocks == NULL);
- ASSERT(zp->z_acl_cached == NULL);
- ASSERT(zp->z_xattr_cached == NULL);
-}
-
-static int
-zfs_znode_hold_cache_constructor(void *buf, void *arg, int kmflags)
-{
- znode_hold_t *zh = buf;
-
- mutex_init(&zh->zh_lock, NULL, MUTEX_DEFAULT, NULL);
- zfs_refcount_create(&zh->zh_refcount);
- zh->zh_obj = ZFS_NO_OBJECT;
-
- return (0);
-}
-
-static void
-zfs_znode_hold_cache_destructor(void *buf, void *arg)
-{
- znode_hold_t *zh = buf;
-
- mutex_destroy(&zh->zh_lock);
- zfs_refcount_destroy(&zh->zh_refcount);
-}
-
-void
-zfs_znode_init(void)
-{
- /*
- * Initialize zcache. The KMC_SLAB hint is used so that the cache is
- * backed by kmalloc() when on the Linux slab, which ensures that any
- * wait_on_bit() operations on the related inode operate properly.
- */
- ASSERT(znode_cache == NULL);
- znode_cache = kmem_cache_create("zfs_znode_cache",
- sizeof (znode_t), 0, zfs_znode_cache_constructor,
- zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_SLAB);
-
- ASSERT(znode_hold_cache == NULL);
- znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache",
- sizeof (znode_hold_t), 0, zfs_znode_hold_cache_constructor,
- zfs_znode_hold_cache_destructor, NULL, NULL, NULL, 0);
-}
-
-void
-zfs_znode_fini(void)
-{
- /*
- * Clean up zcache.
- */
- if (znode_cache)
- kmem_cache_destroy(znode_cache);
- znode_cache = NULL;
-
- if (znode_hold_cache)
- kmem_cache_destroy(znode_hold_cache);
- znode_hold_cache = NULL;
-}
-
-/*
- * The zfs_znode_hold_enter() / zfs_znode_hold_exit() functions are used to
- * serialize access to a znode and its SA buffer while the object is being
- * created or destroyed. This kind of locking would normally reside in the
- * znode itself but in this case that's impossible because the znode and SA
- * buffer may not yet exist. Therefore the locking is handled externally
- * with an array of mutexes and AVL trees which contain per-object locks.
- *
- * In zfs_znode_hold_enter() a per-object lock is created as needed, inserted
- * into the correct AVL tree and finally the per-object lock is held. In
- * zfs_znode_hold_exit() the process is reversed. The per-object lock is
- * released, removed from the AVL tree and destroyed if there are no waiters.
- *
- * This scheme has two important properties:
- *
- * 1) No memory allocations are performed while holding one of the
- * z_hold_locks. This ensures evict(), which can be called from direct
- * memory reclaim, will never block waiting on a z_hold_locks entry
- * which just happens to have hashed to the same index.
- *
- * 2) All locks used to serialize access to an object are per-object and never
- * shared. This minimizes lock contention without creating a large number
- * of dedicated locks.
- *
- * On the downside it does require znode_hold_t structures to be frequently
- * allocated and freed. However, because these are backed by a kmem cache
- * and very short lived this cost is minimal.
- */
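As a usage illustration, here is an editorial sketch of the hold pattern (it mirrors the calls made by zfs_mknode() and zfs_zget() later in this file; the wrapper function itself is hypothetical):

static void
example_serialized_update(zfsvfs_t *zfsvfs, uint64_t obj)
{
	znode_hold_t *zh;

	/* Take the per-object lock; allocation happens outside z_hold_locks. */
	zh = zfs_znode_hold_enter(zfsvfs, obj);
	ASSERT(zfs_znode_held(zfsvfs, obj));

	/* ... create, re-get, or destroy the znode and its SA buffer ... */

	zfs_znode_hold_exit(zfsvfs, zh);
}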
-int
-zfs_znode_hold_compare(const void *a, const void *b)
-{
- const znode_hold_t *zh_a = (const znode_hold_t *)a;
- const znode_hold_t *zh_b = (const znode_hold_t *)b;
-
- return (AVL_CMP(zh_a->zh_obj, zh_b->zh_obj));
-}
-
-boolean_t
-zfs_znode_held(zfsvfs_t *zfsvfs, uint64_t obj)
-{
- znode_hold_t *zh, search;
- int i = ZFS_OBJ_HASH(zfsvfs, obj);
- boolean_t held;
-
- search.zh_obj = obj;
-
- mutex_enter(&zfsvfs->z_hold_locks[i]);
- zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
- held = (zh && MUTEX_HELD(&zh->zh_lock)) ? B_TRUE : B_FALSE;
- mutex_exit(&zfsvfs->z_hold_locks[i]);
-
- return (held);
-}
-
-static znode_hold_t *
-zfs_znode_hold_enter(zfsvfs_t *zfsvfs, uint64_t obj)
-{
- znode_hold_t *zh, *zh_new, search;
- int i = ZFS_OBJ_HASH(zfsvfs, obj);
- boolean_t found = B_FALSE;
-
- zh_new = kmem_cache_alloc(znode_hold_cache, KM_SLEEP);
- zh_new->zh_obj = obj;
- search.zh_obj = obj;
-
- mutex_enter(&zfsvfs->z_hold_locks[i]);
- zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
- if (likely(zh == NULL)) {
- zh = zh_new;
- avl_add(&zfsvfs->z_hold_trees[i], zh);
- } else {
- ASSERT3U(zh->zh_obj, ==, obj);
- found = B_TRUE;
- }
- zfs_refcount_add(&zh->zh_refcount, NULL);
- mutex_exit(&zfsvfs->z_hold_locks[i]);
-
- if (found == B_TRUE)
- kmem_cache_free(znode_hold_cache, zh_new);
-
- ASSERT(MUTEX_NOT_HELD(&zh->zh_lock));
- ASSERT3S(zfs_refcount_count(&zh->zh_refcount), >, 0);
- mutex_enter(&zh->zh_lock);
-
- return (zh);
-}
-
-static void
-zfs_znode_hold_exit(zfsvfs_t *zfsvfs, znode_hold_t *zh)
-{
- int i = ZFS_OBJ_HASH(zfsvfs, zh->zh_obj);
- boolean_t remove = B_FALSE;
-
- ASSERT(zfs_znode_held(zfsvfs, zh->zh_obj));
- ASSERT3S(zfs_refcount_count(&zh->zh_refcount), >, 0);
- mutex_exit(&zh->zh_lock);
-
- mutex_enter(&zfsvfs->z_hold_locks[i]);
- if (zfs_refcount_remove(&zh->zh_refcount, NULL) == 0) {
- avl_remove(&zfsvfs->z_hold_trees[i], zh);
- remove = B_TRUE;
- }
- mutex_exit(&zfsvfs->z_hold_locks[i]);
-
- if (remove == B_TRUE)
- kmem_cache_free(znode_hold_cache, zh);
-}
-
-static void
-zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
- dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
-{
- ASSERT(zfs_znode_held(zfsvfs, zp->z_id));
-
- mutex_enter(&zp->z_lock);
-
- ASSERT(zp->z_sa_hdl == NULL);
- ASSERT(zp->z_acl_cached == NULL);
- if (sa_hdl == NULL) {
- VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
- SA_HDL_SHARED, &zp->z_sa_hdl));
- } else {
- zp->z_sa_hdl = sa_hdl;
- sa_set_userp(sa_hdl, zp);
- }
-
- zp->z_is_sa = (obj_type == DMU_OT_SA) ?
B_TRUE : B_FALSE; - - mutex_exit(&zp->z_lock); -} - -void -zfs_znode_dmu_fini(znode_t *zp) -{ - ASSERT(zfs_znode_held(ZTOZSB(zp), zp->z_id) || zp->z_unlinked || - RW_WRITE_HELD(&ZTOZSB(zp)->z_teardown_inactive_lock)); - - sa_handle_destroy(zp->z_sa_hdl); - zp->z_sa_hdl = NULL; -} - -/* - * Called by new_inode() to allocate a new inode. - */ -int -zfs_inode_alloc(struct super_block *sb, struct inode **ip) -{ - znode_t *zp; - - zp = kmem_cache_alloc(znode_cache, KM_SLEEP); - *ip = ZTOI(zp); - - return (0); -} - -/* - * Called in multiple places when an inode should be destroyed. - */ -void -zfs_inode_destroy(struct inode *ip) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ZTOZSB(zp); - - mutex_enter(&zfsvfs->z_znodes_lock); - if (list_link_active(&zp->z_link_node)) { - list_remove(&zfsvfs->z_all_znodes, zp); - zfsvfs->z_nr_znodes--; - } - mutex_exit(&zfsvfs->z_znodes_lock); - - if (zp->z_acl_cached) { - zfs_acl_free(zp->z_acl_cached); - zp->z_acl_cached = NULL; - } - - if (zp->z_xattr_cached) { - nvlist_free(zp->z_xattr_cached); - zp->z_xattr_cached = NULL; - } - - kmem_cache_free(znode_cache, zp); -} - -static void -zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip) -{ - uint64_t rdev = 0; - - switch (ip->i_mode & S_IFMT) { - case S_IFREG: - ip->i_op = &zpl_inode_operations; - ip->i_fop = &zpl_file_operations; - ip->i_mapping->a_ops = &zpl_address_space_operations; - break; - - case S_IFDIR: - ip->i_op = &zpl_dir_inode_operations; - ip->i_fop = &zpl_dir_file_operations; - ITOZ(ip)->z_zn_prefetch = B_TRUE; - break; - - case S_IFLNK: - ip->i_op = &zpl_symlink_inode_operations; - break; - - /* - * rdev is only stored in a SA only for device files. - */ - case S_IFCHR: - case S_IFBLK: - (void) sa_lookup(ITOZ(ip)->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), &rdev, - sizeof (rdev)); - /*FALLTHROUGH*/ - case S_IFIFO: - case S_IFSOCK: - init_special_inode(ip, ip->i_mode, rdev); - ip->i_op = &zpl_special_inode_operations; - break; - - default: - zfs_panic_recover("inode %llu has invalid mode: 0x%x\n", - (u_longlong_t)ip->i_ino, ip->i_mode); - - /* Assume the inode is a file and attempt to continue */ - ip->i_mode = S_IFREG | 0644; - ip->i_op = &zpl_inode_operations; - ip->i_fop = &zpl_file_operations; - ip->i_mapping->a_ops = &zpl_address_space_operations; - break; - } -} - -void -zfs_set_inode_flags(znode_t *zp, struct inode *ip) -{ - /* - * Linux and Solaris have different sets of file attributes, so we - * restrict this conversion to the intersection of the two. - */ -#ifdef HAVE_INODE_SET_FLAGS - unsigned int flags = 0; - if (zp->z_pflags & ZFS_IMMUTABLE) - flags |= S_IMMUTABLE; - if (zp->z_pflags & ZFS_APPENDONLY) - flags |= S_APPEND; - - inode_set_flags(ip, flags, S_IMMUTABLE|S_APPEND); -#else - if (zp->z_pflags & ZFS_IMMUTABLE) - ip->i_flags |= S_IMMUTABLE; - else - ip->i_flags &= ~S_IMMUTABLE; - - if (zp->z_pflags & ZFS_APPENDONLY) - ip->i_flags |= S_APPEND; - else - ip->i_flags &= ~S_APPEND; -#endif -} - -/* - * Update the embedded inode given the znode. We should work toward - * eliminating this function as soon as possible by removing values - * which are duplicated between the znode and inode. If the generic - * inode has the correct field it should be used, and the ZFS code - * updated to access the inode. This can be done incrementally. - */ -void -zfs_inode_update(znode_t *zp) -{ - zfsvfs_t *zfsvfs; - struct inode *ip; - uint32_t blksize; - u_longlong_t i_blocks; - - ASSERT(zp != NULL); - zfsvfs = ZTOZSB(zp); - ip = ZTOI(zp); - - /* Skip .zfs control nodes which do not exist on disk. 
*/ - if (zfsctl_is_node(ip)) - return; - - dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &blksize, &i_blocks); - - spin_lock(&ip->i_lock); - ip->i_blocks = i_blocks; - i_size_write(ip, zp->z_size); - spin_unlock(&ip->i_lock); -} - - -/* - * Construct a znode+inode and initialize. - * - * This does not do a call to dmu_set_user() that is - * up to the caller to do, in case you don't want to - * return the znode - */ -static znode_t * -zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, - dmu_object_type_t obj_type, sa_handle_t *hdl) -{ - znode_t *zp; - struct inode *ip; - uint64_t mode; - uint64_t parent; - uint64_t tmp_gen; - uint64_t links; - uint64_t z_uid, z_gid; - uint64_t atime[2], mtime[2], ctime[2]; - uint64_t projid = ZFS_DEFAULT_PROJID; - sa_bulk_attr_t bulk[11]; - int count = 0; - - ASSERT(zfsvfs != NULL); - - ip = new_inode(zfsvfs->z_sb); - if (ip == NULL) - return (NULL); - - zp = ITOZ(ip); - ASSERT(zp->z_dirlocks == NULL); - ASSERT3P(zp->z_acl_cached, ==, NULL); - ASSERT3P(zp->z_xattr_cached, ==, NULL); - zp->z_unlinked = B_FALSE; - zp->z_atime_dirty = B_FALSE; - zp->z_moved = B_FALSE; - zp->z_is_mapped = B_FALSE; - zp->z_is_ctldir = B_FALSE; - zp->z_is_stale = B_FALSE; - zp->z_suspended = B_FALSE; - zp->z_sa_hdl = NULL; - zp->z_mapcnt = 0; - zp->z_id = db->db_object; - zp->z_blksz = blksz; - zp->z_seq = 0x7A4653; - zp->z_sync_cnt = 0; - - zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl); - - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &tmp_gen, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, - &zp->z_size, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, - &zp->z_pflags, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, - &parent, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &z_uid, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &z_gid, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); - - if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || tmp_gen == 0 || - (dmu_objset_projectquota_enabled(zfsvfs->z_os) && - (zp->z_pflags & ZFS_PROJID) && - sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) { - if (hdl == NULL) - sa_handle_destroy(zp->z_sa_hdl); - zp->z_sa_hdl = NULL; - goto error; - } - - zp->z_projid = projid; - zp->z_mode = ip->i_mode = mode; - ip->i_generation = (uint32_t)tmp_gen; - ip->i_blkbits = SPA_MINBLOCKSHIFT; - set_nlink(ip, (uint32_t)links); - zfs_uid_write(ip, z_uid); - zfs_gid_write(ip, z_gid); - zfs_set_inode_flags(zp, ip); - - /* Cache the xattr parent id */ - if (zp->z_pflags & ZFS_XATTR) - zp->z_xattr_parent = parent; - - ZFS_TIME_DECODE(&ip->i_atime, atime); - ZFS_TIME_DECODE(&ip->i_mtime, mtime); - ZFS_TIME_DECODE(&ip->i_ctime, ctime); - - ip->i_ino = zp->z_id; - zfs_inode_update(zp); - zfs_inode_set_ops(zfsvfs, ip); - - /* - * The only way insert_inode_locked() can fail is if the ip->i_ino - * number is already hashed for this super block. This can never - * happen because the inode numbers map 1:1 with the object numbers. - * - * The one exception is rolling back a mounted file system, but in - * this case all the active inode are unhashed during the rollback. 
- */ - VERIFY3S(insert_inode_locked(ip), ==, 0); - - mutex_enter(&zfsvfs->z_znodes_lock); - list_insert_tail(&zfsvfs->z_all_znodes, zp); - zfsvfs->z_nr_znodes++; - membar_producer(); - mutex_exit(&zfsvfs->z_znodes_lock); - - unlock_new_inode(ip); - return (zp); - -error: - iput(ip); - return (NULL); -} - -/* - * Safely mark an inode dirty. Inodes which are part of a read-only - * file system or snapshot may not be dirtied. - */ -void -zfs_mark_inode_dirty(struct inode *ip) -{ - zfsvfs_t *zfsvfs = ITOZSB(ip); - - if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os)) - return; - - mark_inode_dirty(ip); -} - -static uint64_t empty_xattr; -static uint64_t pad[4]; -static zfs_acl_phys_t acl_phys; -/* - * Create a new DMU object to hold a zfs znode. - * - * IN: dzp - parent directory for new znode - * vap - file attributes for new znode - * tx - dmu transaction id for zap operations - * cr - credentials of caller - * flag - flags: - * IS_ROOT_NODE - new object will be root - * IS_TMPFILE - new object is of O_TMPFILE - * IS_XATTR - new object is an attribute - * acl_ids - ACL related attributes - * - * OUT: zpp - allocated znode (set to dzp if IS_ROOT_NODE) - * - */ -void -zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, - uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids) -{ - uint64_t crtime[2], atime[2], mtime[2], ctime[2]; - uint64_t mode, size, links, parent, pflags; - uint64_t projid = ZFS_DEFAULT_PROJID; - uint64_t rdev = 0; - zfsvfs_t *zfsvfs = ZTOZSB(dzp); - dmu_buf_t *db; - inode_timespec_t now; - uint64_t gen, obj; - int bonuslen; - int dnodesize; - sa_handle_t *sa_hdl; - dmu_object_type_t obj_type; - sa_bulk_attr_t *sa_attrs; - int cnt = 0; - zfs_acl_locator_cb_t locate = { 0 }; - znode_hold_t *zh; - - if (zfsvfs->z_replay) { - obj = vap->va_nodeid; - now = vap->va_ctime; /* see zfs_replay_create() */ - gen = vap->va_nblocks; /* ditto */ - dnodesize = vap->va_fsid; /* ditto */ - } else { - obj = 0; - gethrestime(&now); - gen = dmu_tx_get_txg(tx); - dnodesize = dmu_objset_dnodesize(zfsvfs->z_os); - } - - if (dnodesize == 0) - dnodesize = DNODE_MIN_SIZE; - - obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE; - - bonuslen = (obj_type == DMU_OT_SA) ? - DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE; - - /* - * Create a new DMU object. - */ - /* - * There's currently no mechanism for pre-reading the blocks that will - * be needed to allocate a new object, so we accept the small chance - * that there will be an i/o error and we will fail one of the - * assertions below. - */ - if (S_ISDIR(vap->va_mode)) { - if (zfsvfs->z_replay) { - VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj, - zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, - obj_type, bonuslen, dnodesize, tx)); - } else { - obj = zap_create_norm_dnsize(zfsvfs->z_os, - zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, - obj_type, bonuslen, dnodesize, tx); - } - } else { - if (zfsvfs->z_replay) { - VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj, - DMU_OT_PLAIN_FILE_CONTENTS, 0, - obj_type, bonuslen, dnodesize, tx)); - } else { - obj = dmu_object_alloc_dnsize(zfsvfs->z_os, - DMU_OT_PLAIN_FILE_CONTENTS, 0, - obj_type, bonuslen, dnodesize, tx); - } - } - - zh = zfs_znode_hold_enter(zfsvfs, obj); - VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db)); - - /* - * If this is the root, fix up the half-initialized parent pointer - * to reference the just-allocated physical data area. - */ - if (flag & IS_ROOT_NODE) { - dzp->z_id = obj; - } - - /* - * If parent is an xattr, so am I. 
- */ - if (dzp->z_pflags & ZFS_XATTR) { - flag |= IS_XATTR; - } - - if (zfsvfs->z_use_fuids) - pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; - else - pflags = 0; - - if (S_ISDIR(vap->va_mode)) { - size = 2; /* contents ("." and "..") */ - links = 2; - } else { - size = 0; - links = (flag & IS_TMPFILE) ? 0 : 1; - } - - if (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode)) - rdev = vap->va_rdev; - - parent = dzp->z_id; - mode = acl_ids->z_mode; - if (flag & IS_XATTR) - pflags |= ZFS_XATTR; - - if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) { - /* - * With ZFS_PROJID flag, we can easily know whether there is - * project ID stored on disk or not. See zfs_space_delta_cb(). - */ - if (obj_type != DMU_OT_ZNODE && - dmu_objset_projectquota_enabled(zfsvfs->z_os)) - pflags |= ZFS_PROJID; - - /* - * Inherit project ID from parent if required. - */ - projid = zfs_inherit_projid(dzp); - if (dzp->z_pflags & ZFS_PROJINHERIT) - pflags |= ZFS_PROJINHERIT; - } - - /* - * No execs denied will be determined when zfs_mode_compute() is called. - */ - pflags |= acl_ids->z_aclp->z_hints & - (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT| - ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED); - - ZFS_TIME_ENCODE(&now, crtime); - ZFS_TIME_ENCODE(&now, ctime); - - if (vap->va_mask & ATTR_ATIME) { - ZFS_TIME_ENCODE(&vap->va_atime, atime); - } else { - ZFS_TIME_ENCODE(&now, atime); - } - - if (vap->va_mask & ATTR_MTIME) { - ZFS_TIME_ENCODE(&vap->va_mtime, mtime); - } else { - ZFS_TIME_ENCODE(&now, mtime); - } - - /* Now add in all of the "SA" attributes */ - VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED, - &sa_hdl)); - - /* - * Setup the array of attributes to be replaced/set on the new file - * - * order for DMU_OT_ZNODE is critical since it needs to be constructed - * in the old znode_phys_t format. 
Don't change this ordering - */ - sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP); - - if (obj_type == DMU_OT_ZNODE) { - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), - NULL, &atime, 16); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), - NULL, &mtime, 16); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), - NULL, &ctime, 16); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), - NULL, &crtime, 16); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), - NULL, &gen, 8); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), - NULL, &mode, 8); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), - NULL, &size, 8); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), - NULL, &parent, 8); - } else { - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), - NULL, &mode, 8); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), - NULL, &size, 8); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), - NULL, &gen, 8); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), - NULL, &acl_ids->z_fuid, 8); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), - NULL, &acl_ids->z_fgid, 8); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), - NULL, &parent, 8); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), - NULL, &pflags, 8); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), - NULL, &atime, 16); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), - NULL, &mtime, 16); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), - NULL, &ctime, 16); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), - NULL, &crtime, 16); - } - - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); - - if (obj_type == DMU_OT_ZNODE) { - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL, - &empty_xattr, 8); - } else if (dmu_objset_projectquota_enabled(zfsvfs->z_os) && - pflags & ZFS_PROJID) { - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PROJID(zfsvfs), - NULL, &projid, 8); - } - if (obj_type == DMU_OT_ZNODE || - (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))) { - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs), - NULL, &rdev, 8); - } - if (obj_type == DMU_OT_ZNODE) { - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), - NULL, &pflags, 8); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, - &acl_ids->z_fuid, 8); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, - &acl_ids->z_fgid, 8); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad, - sizeof (uint64_t) * 4); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, - &acl_phys, sizeof (zfs_acl_phys_t)); - } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) { - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL, - &acl_ids->z_aclp->z_acl_count, 8); - locate.cb_aclp = acl_ids->z_aclp; - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs), - zfs_acl_data_locator, &locate, - acl_ids->z_aclp->z_acl_bytes); - mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags, - acl_ids->z_fuid, acl_ids->z_fgid); - } - - VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0); - - if (!(flag & IS_ROOT_NODE)) { - /* - * The call to zfs_znode_alloc() may fail if memory is low - * via the call path: alloc_inode() -> inode_init_always() -> - * security_inode_alloc() -> inode_alloc_security(). Since - * the existing code is written such that zfs_mknode() can - * not fail retry until sufficient memory has been reclaimed. 
- */ - do { - *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl); - } while (*zpp == NULL); - - VERIFY(*zpp != NULL); - VERIFY(dzp != NULL); - } else { - /* - * If we are creating the root node, the "parent" we - * passed in is the znode for the root. - */ - *zpp = dzp; - - (*zpp)->z_sa_hdl = sa_hdl; - } - - (*zpp)->z_pflags = pflags; - (*zpp)->z_mode = ZTOI(*zpp)->i_mode = mode; - (*zpp)->z_dnodesize = dnodesize; - (*zpp)->z_projid = projid; - - if (obj_type == DMU_OT_ZNODE || - acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) { - VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx)); - } - kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END); - zfs_znode_hold_exit(zfsvfs, zh); -} - -/* - * Update in-core attributes. It is assumed the caller will be doing an - * sa_bulk_update to push the changes out. - */ -void -zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) -{ - xoptattr_t *xoap; - boolean_t update_inode = B_FALSE; - - xoap = xva_getxoptattr(xvap); - ASSERT(xoap); - - if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { - uint64_t times[2]; - ZFS_TIME_ENCODE(&xoap->xoa_createtime, times); - (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)), - ×, sizeof (times), tx); - XVA_SET_RTN(xvap, XAT_CREATETIME); - } - if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { - ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly, - zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_READONLY); - } - if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { - ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden, - zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_HIDDEN); - } - if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { - ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system, - zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_SYSTEM); - } - if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { - ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive, - zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_ARCHIVE); - } - if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { - ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable, - zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_IMMUTABLE); - - update_inode = B_TRUE; - } - if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { - ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink, - zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_NOUNLINK); - } - if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { - ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly, - zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_APPENDONLY); - - update_inode = B_TRUE; - } - if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { - ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump, - zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_NODUMP); - } - if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { - ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque, - zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_OPAQUE); - } - if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { - ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED, - xoap->xoa_av_quarantined, zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); - } - if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { - ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified, - zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_AV_MODIFIED); - } - if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { - zfs_sa_set_scanstamp(zp, xvap, tx); - XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); - } - if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { - ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse, - zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_REPARSE); - } - if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { - ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline, - zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_OFFLINE); - } - if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { - ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse, 
- zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_SPARSE); - } - if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { - ZFS_ATTR_SET(zp, ZFS_PROJINHERIT, xoap->xoa_projinherit, - zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_PROJINHERIT); - } - - if (update_inode) - zfs_set_inode_flags(zp, ZTOI(zp)); -} - -int -zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) -{ - dmu_object_info_t doi; - dmu_buf_t *db; - znode_t *zp; - znode_hold_t *zh; - int err; - sa_handle_t *hdl; - - *zpp = NULL; - -again: - zh = zfs_znode_hold_enter(zfsvfs, obj_num); - - err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); - if (err) { - zfs_znode_hold_exit(zfsvfs, zh); - return (err); - } - - dmu_object_info_from_db(db, &doi); - if (doi.doi_bonus_type != DMU_OT_SA && - (doi.doi_bonus_type != DMU_OT_ZNODE || - (doi.doi_bonus_type == DMU_OT_ZNODE && - doi.doi_bonus_size < sizeof (znode_phys_t)))) { - sa_buf_rele(db, NULL); - zfs_znode_hold_exit(zfsvfs, zh); - return (SET_ERROR(EINVAL)); - } - - hdl = dmu_buf_get_user(db); - if (hdl != NULL) { - zp = sa_get_userdata(hdl); - - - /* - * Since "SA" does immediate eviction we - * should never find a sa handle that doesn't - * know about the znode. - */ - - ASSERT3P(zp, !=, NULL); - - mutex_enter(&zp->z_lock); - ASSERT3U(zp->z_id, ==, obj_num); - /* - * If igrab() returns NULL the VFS has independently - * determined the inode should be evicted and has - * called iput_final() to start the eviction process. - * The SA handle is still valid but because the VFS - * requires that the eviction succeed we must drop - * our locks and references to allow the eviction to - * complete. The zfs_zget() may then be retried. - * - * This unlikely case could be optimized by registering - * a sops->drop_inode() callback. The callback would - * need to detect the active SA hold thereby informing - * the VFS that this inode should not be evicted. - */ - if (igrab(ZTOI(zp)) == NULL) { - mutex_exit(&zp->z_lock); - sa_buf_rele(db, NULL); - zfs_znode_hold_exit(zfsvfs, zh); - /* inode might need this to finish evict */ - cond_resched(); - goto again; - } - *zpp = zp; - err = 0; - mutex_exit(&zp->z_lock); - sa_buf_rele(db, NULL); - zfs_znode_hold_exit(zfsvfs, zh); - return (err); - } - - /* - * Not found create new znode/vnode but only if file exists. - * - * There is a small window where zfs_vget() could - * find this object while a file create is still in - * progress. This is checked for in zfs_znode_alloc() - * - * if zfs_znode_alloc() fails it will drop the hold on the - * bonus buffer. - */ - zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size, - doi.doi_bonus_type, NULL); - if (zp == NULL) { - err = SET_ERROR(ENOENT); - } else { - *zpp = zp; - } - zfs_znode_hold_exit(zfsvfs, zh); - return (err); -} - -int -zfs_rezget(znode_t *zp) -{ - zfsvfs_t *zfsvfs = ZTOZSB(zp); - dmu_object_info_t doi; - dmu_buf_t *db; - uint64_t obj_num = zp->z_id; - uint64_t mode; - uint64_t links; - sa_bulk_attr_t bulk[10]; - int err; - int count = 0; - uint64_t gen; - uint64_t z_uid, z_gid; - uint64_t atime[2], mtime[2], ctime[2]; - uint64_t projid = ZFS_DEFAULT_PROJID; - znode_hold_t *zh; - - /* - * skip ctldir, otherwise they will always get invalidated. This will - * cause funny behaviour for the mounted snapdirs. Especially for - * Linux >= 3.18, d_invalidate will detach the mountpoint and prevent - * anyone automount it again as long as someone is still using the - * detached mount. 
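The igrab() race discussed in zfs_zget() above explicitly suggests registering a sops->drop_inode() callback; a minimal hypothetical sketch of that unimplemented optimization follows (name and placement assumed, not from the source):

static int
zpl_drop_inode(struct inode *ip)
{
	/*
	 * Hypothetical: keep inodes with an active SA hold off the
	 * eviction path so zfs_zget() need not retry around igrab().
	 */
	if (ITOZ(ip)->z_sa_hdl != NULL)
		return (0);

	return (generic_drop_inode(ip));
}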
- */ - if (zp->z_is_ctldir) - return (0); - - zh = zfs_znode_hold_enter(zfsvfs, obj_num); - - mutex_enter(&zp->z_acl_lock); - if (zp->z_acl_cached) { - zfs_acl_free(zp->z_acl_cached); - zp->z_acl_cached = NULL; - } - mutex_exit(&zp->z_acl_lock); - - rw_enter(&zp->z_xattr_lock, RW_WRITER); - if (zp->z_xattr_cached) { - nvlist_free(zp->z_xattr_cached); - zp->z_xattr_cached = NULL; - } - rw_exit(&zp->z_xattr_lock); - - ASSERT(zp->z_sa_hdl == NULL); - err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); - if (err) { - zfs_znode_hold_exit(zfsvfs, zh); - return (err); - } - - dmu_object_info_from_db(db, &doi); - if (doi.doi_bonus_type != DMU_OT_SA && - (doi.doi_bonus_type != DMU_OT_ZNODE || - (doi.doi_bonus_type == DMU_OT_ZNODE && - doi.doi_bonus_size < sizeof (znode_phys_t)))) { - sa_buf_rele(db, NULL); - zfs_znode_hold_exit(zfsvfs, zh); - return (SET_ERROR(EINVAL)); - } - - zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL); - - /* reload cached values */ - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, - &gen, sizeof (gen)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, - &zp->z_size, sizeof (zp->z_size)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, - &links, sizeof (links)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, - &zp->z_pflags, sizeof (zp->z_pflags)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, - &z_uid, sizeof (z_uid)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, - &z_gid, sizeof (z_gid)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, - &mode, sizeof (mode)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, - &atime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, - &mtime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, - &ctime, 16); - - if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) { - zfs_znode_dmu_fini(zp); - zfs_znode_hold_exit(zfsvfs, zh); - return (SET_ERROR(EIO)); - } - - if (dmu_objset_projectquota_enabled(zfsvfs->z_os)) { - err = sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), - &projid, 8); - if (err != 0 && err != ENOENT) { - zfs_znode_dmu_fini(zp); - zfs_znode_hold_exit(zfsvfs, zh); - return (SET_ERROR(err)); - } - } - - zp->z_projid = projid; - zp->z_mode = ZTOI(zp)->i_mode = mode; - zfs_uid_write(ZTOI(zp), z_uid); - zfs_gid_write(ZTOI(zp), z_gid); - - ZFS_TIME_DECODE(&ZTOI(zp)->i_atime, atime); - ZFS_TIME_DECODE(&ZTOI(zp)->i_mtime, mtime); - ZFS_TIME_DECODE(&ZTOI(zp)->i_ctime, ctime); - - if ((uint32_t)gen != ZTOI(zp)->i_generation) { - zfs_znode_dmu_fini(zp); - zfs_znode_hold_exit(zfsvfs, zh); - return (SET_ERROR(EIO)); - } - - set_nlink(ZTOI(zp), (uint32_t)links); - zfs_set_inode_flags(zp, ZTOI(zp)); - - zp->z_blksz = doi.doi_data_block_size; - zp->z_atime_dirty = B_FALSE; - zfs_inode_update(zp); - - /* - * If the file has zero links, then it has been unlinked on the send - * side and it must be in the received unlinked set. - * We call zfs_znode_dmu_fini() now to prevent any accesses to the - * stale data and to prevent automatic removal of the file in - * zfs_zinactive(). The file will be removed either when it is removed - * on the send side and the next incremental stream is received or - * when the unlinked set gets processed. 
- */ - zp->z_unlinked = (ZTOI(zp)->i_nlink == 0); - if (zp->z_unlinked) - zfs_znode_dmu_fini(zp); - - zfs_znode_hold_exit(zfsvfs, zh); - - return (0); -} - -void -zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) -{ - zfsvfs_t *zfsvfs = ZTOZSB(zp); - objset_t *os = zfsvfs->z_os; - uint64_t obj = zp->z_id; - uint64_t acl_obj = zfs_external_acl(zp); - znode_hold_t *zh; - - zh = zfs_znode_hold_enter(zfsvfs, obj); - if (acl_obj) { - VERIFY(!zp->z_is_sa); - VERIFY(0 == dmu_object_free(os, acl_obj, tx)); - } - VERIFY(0 == dmu_object_free(os, obj, tx)); - zfs_znode_dmu_fini(zp); - zfs_znode_hold_exit(zfsvfs, zh); -} - -void -zfs_zinactive(znode_t *zp) -{ - zfsvfs_t *zfsvfs = ZTOZSB(zp); - uint64_t z_id = zp->z_id; - znode_hold_t *zh; - - ASSERT(zp->z_sa_hdl); - - /* - * Don't allow a zfs_zget() while were trying to release this znode. - */ - zh = zfs_znode_hold_enter(zfsvfs, z_id); - - mutex_enter(&zp->z_lock); - - /* - * If this was the last reference to a file with no links, remove - * the file from the file system unless the file system is mounted - * read-only. That can happen, for example, if the file system was - * originally read-write, the file was opened, then unlinked and - * the file system was made read-only before the file was finally - * closed. The file will remain in the unlinked set. - */ - if (zp->z_unlinked) { - ASSERT(!zfsvfs->z_issnap); - if (!zfs_is_readonly(zfsvfs) && !zfs_unlink_suspend_progress) { - mutex_exit(&zp->z_lock); - zfs_znode_hold_exit(zfsvfs, zh); - zfs_rmnode(zp); - return; - } - } - - mutex_exit(&zp->z_lock); - zfs_znode_dmu_fini(zp); - - zfs_znode_hold_exit(zfsvfs, zh); -} - -#if defined(HAVE_INODE_TIMESPEC64_TIMES) -#define zfs_compare_timespec timespec64_compare -#else -#define zfs_compare_timespec timespec_compare -#endif - -/* - * Determine whether the znode's atime must be updated. The logic mostly - * duplicates the Linux kernel's relatime_need_update() functionality. - * This function is only called if the underlying filesystem actually has - * atime updates enabled. - */ -boolean_t -zfs_relatime_need_update(const struct inode *ip) -{ - inode_timespec_t now; - - gethrestime(&now); - /* - * In relatime mode, only update the atime if the previous atime - * is earlier than either the ctime or mtime or if at least a day - * has passed since the last update of atime. - */ - if (zfs_compare_timespec(&ip->i_mtime, &ip->i_atime) >= 0) - return (B_TRUE); - - if (zfs_compare_timespec(&ip->i_ctime, &ip->i_atime) >= 0) - return (B_TRUE); - - if ((hrtime_t)now.tv_sec - (hrtime_t)ip->i_atime.tv_sec >= 24*60*60) - return (B_TRUE); - - return (B_FALSE); -} - -/* - * Prepare to update znode time stamps. - * - * IN: zp - znode requiring timestamp update - * flag - ATTR_MTIME, ATTR_CTIME flags - * - * OUT: zp - z_seq - * mtime - new mtime - * ctime - new ctime - * - * Note: We don't update atime here, because we rely on Linux VFS to do - * atime updating. - */ -void -zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], - uint64_t ctime[2]) -{ - inode_timespec_t now; - - gethrestime(&now); - - zp->z_seq++; - - if (flag & ATTR_MTIME) { - ZFS_TIME_ENCODE(&now, mtime); - ZFS_TIME_DECODE(&(ZTOI(zp)->i_mtime), mtime); - if (ZTOZSB(zp)->z_use_fuids) { - zp->z_pflags |= (ZFS_ARCHIVE | - ZFS_AV_MODIFIED); - } - } - - if (flag & ATTR_CTIME) { - ZFS_TIME_ENCODE(&now, ctime); - ZFS_TIME_DECODE(&(ZTOI(zp)->i_ctime), ctime); - if (ZTOZSB(zp)->z_use_fuids) - zp->z_pflags |= ZFS_ARCHIVE; - } -} - -/* - * Grow the block size for a file. 
- * - * IN: zp - znode of file to free data in. - * size - requested block size - * tx - open transaction. - * - * NOTE: this function assumes that the znode is write locked. - */ -void -zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) -{ - int error; - u_longlong_t dummy; - - if (size <= zp->z_blksz) - return; - /* - * If the file size is already greater than the current blocksize, - * we will not grow. If there is more than one block in a file, - * the blocksize cannot change. - */ - if (zp->z_blksz && zp->z_size > zp->z_blksz) - return; - - error = dmu_object_set_blocksize(ZTOZSB(zp)->z_os, zp->z_id, - size, 0, tx); - - if (error == ENOTSUP) - return; - ASSERT0(error); - - /* What blocksize did we actually get? */ - dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy); -} - -/* - * Increase the file length - * - * IN: zp - znode of file to free data in. - * end - new end-of-file - * - * RETURN: 0 on success, error code on failure - */ -static int -zfs_extend(znode_t *zp, uint64_t end) -{ - zfsvfs_t *zfsvfs = ZTOZSB(zp); - dmu_tx_t *tx; - locked_range_t *lr; - uint64_t newblksz; - int error; - - /* - * We will change zp_size, lock the whole file. - */ - lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); - - /* - * Nothing to do if file already at desired length. - */ - if (end <= zp->z_size) { - rangelock_exit(lr); - return (0); - } - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - zfs_sa_upgrade_txholds(tx, zp); - if (end > zp->z_blksz && - (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { - /* - * We are growing the file past the current block size. - */ - if (zp->z_blksz > ZTOZSB(zp)->z_max_blksz) { - /* - * File's blocksize is already larger than the - * "recordsize" property. Only let it grow to - * the next power of 2. - */ - ASSERT(!ISP2(zp->z_blksz)); - newblksz = MIN(end, 1 << highbit64(zp->z_blksz)); - } else { - newblksz = MIN(end, ZTOZSB(zp)->z_max_blksz); - } - dmu_tx_hold_write(tx, zp->z_id, 0, newblksz); - } else { - newblksz = 0; - } - - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - rangelock_exit(lr); - return (error); - } - - if (newblksz) - zfs_grow_blocksize(zp, newblksz, tx); - - zp->z_size = end; - - VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)), - &zp->z_size, sizeof (zp->z_size), tx)); - - rangelock_exit(lr); - - dmu_tx_commit(tx); - - return (0); -} - -/* - * zfs_zero_partial_page - Modeled after update_pages() but - * with different arguments and semantics for use by zfs_freesp(). - * - * Zeroes a piece of a single page cache entry for zp at offset - * start and length len. - * - * Caller must acquire a range lock on the file for the region - * being zeroed in order that the ARC and page cache stay in sync. - */ -static void -zfs_zero_partial_page(znode_t *zp, uint64_t start, uint64_t len) -{ - struct address_space *mp = ZTOI(zp)->i_mapping; - struct page *pp; - int64_t off; - void *pb; - - ASSERT((start & PAGE_MASK) == ((start + len - 1) & PAGE_MASK)); - - off = start & (PAGE_SIZE - 1); - start &= PAGE_MASK; - - pp = find_lock_page(mp, start >> PAGE_SHIFT); - if (pp) { - if (mapping_writably_mapped(mp)) - flush_dcache_page(pp); - - pb = kmap(pp); - bzero(pb + off, len); - kunmap(pp); - - if (mapping_writably_mapped(mp)) - flush_dcache_page(pp); - - mark_page_accessed(pp); - SetPageUptodate(pp); - ClearPageError(pp); - unlock_page(pp); - put_page(pp); - } -} - -/* - * Free space in a file. 
- * - * IN: zp - znode of file to free data in. - * off - start of section to free. - * len - length of section to free. - * - * RETURN: 0 on success, error code on failure - */ -static int -zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) -{ - zfsvfs_t *zfsvfs = ZTOZSB(zp); - locked_range_t *lr; - int error; - - /* - * Lock the range being freed. - */ - lr = rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER); - - /* - * Nothing to do if file already at desired length. - */ - if (off >= zp->z_size) { - rangelock_exit(lr); - return (0); - } - - if (off + len > zp->z_size) - len = zp->z_size - off; - - error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); - - /* - * Zero partial page cache entries. This must be done under a - * range lock in order to keep the ARC and page cache in sync. - */ - if (zp->z_is_mapped) { - loff_t first_page, last_page, page_len; - loff_t first_page_offset, last_page_offset; - - /* first possible full page in hole */ - first_page = (off + PAGE_SIZE - 1) >> PAGE_SHIFT; - /* last page of hole */ - last_page = (off + len) >> PAGE_SHIFT; - - /* offset of first_page */ - first_page_offset = first_page << PAGE_SHIFT; - /* offset of last_page */ - last_page_offset = last_page << PAGE_SHIFT; - - /* truncate whole pages */ - if (last_page_offset > first_page_offset) { - truncate_inode_pages_range(ZTOI(zp)->i_mapping, - first_page_offset, last_page_offset - 1); - } - - /* truncate sub-page ranges */ - if (first_page > last_page) { - /* entire punched area within a single page */ - zfs_zero_partial_page(zp, off, len); - } else { - /* beginning of punched area at the end of a page */ - page_len = first_page_offset - off; - if (page_len > 0) - zfs_zero_partial_page(zp, off, page_len); - - /* end of punched area at the beginning of a page */ - page_len = off + len - last_page_offset; - if (page_len > 0) - zfs_zero_partial_page(zp, last_page_offset, - page_len); - } - } - rangelock_exit(lr); - - return (error); -} - -/* - * Truncate a file - * - * IN: zp - znode of file to free data in. - * end - new end-of-file. - * - * RETURN: 0 on success, error code on failure - */ -static int -zfs_trunc(znode_t *zp, uint64_t end) -{ - zfsvfs_t *zfsvfs = ZTOZSB(zp); - dmu_tx_t *tx; - locked_range_t *lr; - int error; - sa_bulk_attr_t bulk[2]; - int count = 0; - - /* - * We will change zp_size, lock the whole file. - */ - lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); - - /* - * Nothing to do if file already at desired length. - */ - if (end >= zp->z_size) { - rangelock_exit(lr); - return (0); - } - - error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, - DMU_OBJECT_END); - if (error) { - rangelock_exit(lr); - return (error); - } - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - zfs_sa_upgrade_txholds(tx, zp); - dmu_tx_mark_netfree(tx); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - rangelock_exit(lr); - return (error); - } - - zp->z_size = end; - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), - NULL, &zp->z_size, sizeof (zp->z_size)); - - if (end == 0) { - zp->z_pflags &= ~ZFS_SPARSE; - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), - NULL, &zp->z_pflags, 8); - } - VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0); - - dmu_tx_commit(tx); - rangelock_exit(lr); - - return (0); -} - -/* - * Free space in a file - * - * IN: zp - znode of file to free data in. - * off - start of range - * len - end of range (0 => EOF) - * flag - current file open mode flags. 
- * log - TRUE if this action should be logged - * - * RETURN: 0 on success, error code on failure - */ -int -zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) -{ - dmu_tx_t *tx; - zfsvfs_t *zfsvfs = ZTOZSB(zp); - zilog_t *zilog = zfsvfs->z_log; - uint64_t mode; - uint64_t mtime[2], ctime[2]; - sa_bulk_attr_t bulk[3]; - int count = 0; - int error; - - if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode, - sizeof (mode))) != 0) - return (error); - - if (off > zp->z_size) { - error = zfs_extend(zp, off+len); - if (error == 0 && log) - goto log; - goto out; - } - - if (len == 0) { - error = zfs_trunc(zp, off); - } else { - if ((error = zfs_free_range(zp, off, len)) == 0 && - off + len > zp->z_size) - error = zfs_extend(zp, off+len); - } - if (error || !log) - goto out; -log: - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - zfs_sa_upgrade_txholds(tx, zp); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - goto out; - } - - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), - NULL, &zp->z_pflags, 8); - zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); - error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); - ASSERT(error == 0); - - zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); - - dmu_tx_commit(tx); - - zfs_inode_update(zp); - error = 0; - -out: - /* - * Truncate the page cache - for file truncate operations, use - * the purpose-built API for truncations. For punching operations, - * the truncation is handled under a range lock in zfs_free_range. - */ - if (len == 0) - truncate_setsize(ZTOI(zp), off); - return (error); -} - -void -zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) -{ - struct super_block *sb; - zfsvfs_t *zfsvfs; - uint64_t moid, obj, sa_obj, version; - uint64_t sense = ZFS_CASE_SENSITIVE; - uint64_t norm = 0; - nvpair_t *elem; - int size; - int error; - int i; - znode_t *rootzp = NULL; - vattr_t vattr; - znode_t *zp; - zfs_acl_ids_t acl_ids; - - /* - * First attempt to create master node. - */ - /* - * In an empty objset, there are no blocks to read and thus - * there can be no i/o errors (which we assert below). - */ - moid = MASTER_NODE_OBJ; - error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, - DMU_OT_NONE, 0, tx); - ASSERT(error == 0); - - /* - * Set starting attributes. 
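 *
 * (Editorial aside — summary of the zfs_freesp() dispatch above:
 *
 *   off > z_size  ->  zfs_extend(zp, off + len)      grow the file
 *   len == 0      ->  zfs_trunc(zp, off)             truncate at off
 *   otherwise     ->  zfs_free_range(zp, off, len),  punch a hole, then
 *                     zfs_extend(zp, off + len) if the hole extends
 *                     past the current EOF.)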
- */ - version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os))); - elem = NULL; - while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { - /* For the moment we expect all zpl props to be uint64_ts */ - uint64_t val; - char *name; - - ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64); - VERIFY(nvpair_value_uint64(elem, &val) == 0); - name = nvpair_name(elem); - if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { - if (val < version) - version = val; - } else { - error = zap_update(os, moid, name, 8, 1, &val, tx); - } - ASSERT(error == 0); - if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0) - norm = val; - else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0) - sense = val; - } - ASSERT(version != 0); - error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx); - - /* - * Create zap object used for SA attribute registration - */ - - if (version >= ZPL_VERSION_SA) { - sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, - DMU_OT_NONE, 0, tx); - error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); - ASSERT(error == 0); - } else { - sa_obj = 0; - } - /* - * Create a delete queue. - */ - obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); - - error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx); - ASSERT(error == 0); - - /* - * Create root znode. Create minimal znode/inode/zfsvfs/sb - * to allow zfs_mknode to work. - */ - vattr.va_mask = ATTR_MODE|ATTR_UID|ATTR_GID; - vattr.va_mode = S_IFDIR|0755; - vattr.va_uid = crgetuid(cr); - vattr.va_gid = crgetgid(cr); - - rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); - rootzp->z_unlinked = B_FALSE; - rootzp->z_atime_dirty = B_FALSE; - rootzp->z_moved = B_FALSE; - rootzp->z_is_sa = USE_SA(version, os); - rootzp->z_pflags = 0; - - zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); - zfsvfs->z_os = os; - zfsvfs->z_parent = zfsvfs; - zfsvfs->z_version = version; - zfsvfs->z_use_fuids = USE_FUIDS(version, os); - zfsvfs->z_use_sa = USE_SA(version, os); - zfsvfs->z_norm = norm; - - sb = kmem_zalloc(sizeof (struct super_block), KM_SLEEP); - sb->s_fs_info = zfsvfs; - - ZTOI(rootzp)->i_sb = sb; - - error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, - &zfsvfs->z_attr_table); - - ASSERT(error == 0); - - /* - * Fold case on file systems that are always or sometimes case - * insensitive. 
- */ - if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED) - zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; - - mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); - list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), - offsetof(znode_t, z_link_node)); - - size = MIN(1 << (highbit64(zfs_object_mutex_size)-1), ZFS_OBJ_MTX_MAX); - zfsvfs->z_hold_size = size; - zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size, - KM_SLEEP); - zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP); - for (i = 0; i != size; i++) { - avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare, - sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node)); - mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL); - } - - VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, - cr, NULL, &acl_ids)); - zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids); - ASSERT3P(zp, ==, rootzp); - error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); - ASSERT(error == 0); - zfs_acl_ids_free(&acl_ids); - - atomic_set(&ZTOI(rootzp)->i_count, 0); - sa_handle_destroy(rootzp->z_sa_hdl); - kmem_cache_free(znode_cache, rootzp); - - for (i = 0; i != size; i++) { - avl_destroy(&zfsvfs->z_hold_trees[i]); - mutex_destroy(&zfsvfs->z_hold_locks[i]); - } - - mutex_destroy(&zfsvfs->z_znodes_lock); - - vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size); - vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size); - kmem_free(sb, sizeof (struct super_block)); - kmem_free(zfsvfs, sizeof (zfsvfs_t)); -} -#endif /* _KERNEL */ - -static int -zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table) -{ - uint64_t sa_obj = 0; - int error; - - error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); - if (error != 0 && error != ENOENT) - return (error); - - error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table); - return (error); -} - -static int -zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp, - dmu_buf_t **db, void *tag) -{ - dmu_object_info_t doi; - int error; - - if ((error = sa_buf_hold(osp, obj, tag, db)) != 0) - return (error); - - dmu_object_info_from_db(*db, &doi); - if ((doi.doi_bonus_type != DMU_OT_SA && - doi.doi_bonus_type != DMU_OT_ZNODE) || - (doi.doi_bonus_type == DMU_OT_ZNODE && - doi.doi_bonus_size < sizeof (znode_phys_t))) { - sa_buf_rele(*db, tag); - return (SET_ERROR(ENOTSUP)); - } - - error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp); - if (error != 0) { - sa_buf_rele(*db, tag); - return (error); - } - - return (0); -} - -void -zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag) -{ - sa_handle_destroy(hdl); - sa_buf_rele(db, tag); -} - -/* - * Given an object number, return its parent object number and whether - * or not the object is an extended attribute directory. 
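For clarity, an editorial sketch of the acquire/release pairing used with the SA handle helpers above (the surrounding context is hypothetical; the callers below pass FTAG as the tag):

sa_handle_t *hdl;
dmu_buf_t *db;
int error;

error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
if (error == 0) {
	/* ... read attributes through hdl ... */
	zfs_release_sa_handle(hdl, db, FTAG);
}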
- */ -static int -zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table, - uint64_t *pobjp, int *is_xattrdir) -{ - uint64_t parent; - uint64_t pflags; - uint64_t mode; - uint64_t parent_mode; - sa_bulk_attr_t bulk[3]; - sa_handle_t *sa_hdl; - dmu_buf_t *sa_db; - int count = 0; - int error; - - SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL, - &parent, sizeof (parent)); - SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL, - &pflags, sizeof (pflags)); - SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, - &mode, sizeof (mode)); - - if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0) - return (error); - - /* - * When a link is removed its parent pointer is not changed and will - * be invalid. There are two cases where a link is removed but the - * file stays around, when it goes to the delete queue and when there - * are additional links. - */ - error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG); - if (error != 0) - return (error); - - error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode)); - zfs_release_sa_handle(sa_hdl, sa_db, FTAG); - if (error != 0) - return (error); - - *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode); - - /* - * Extended attributes can be applied to files, directories, etc. - * Otherwise the parent must be a directory. - */ - if (!*is_xattrdir && !S_ISDIR(parent_mode)) - return (SET_ERROR(EINVAL)); - - *pobjp = parent; - - return (0); -} - -/* - * Given an object number, return some zpl level statistics - */ -static int -zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table, - zfs_stat_t *sb) -{ - sa_bulk_attr_t bulk[4]; - int count = 0; - - SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, - &sb->zs_mode, sizeof (sb->zs_mode)); - SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL, - &sb->zs_gen, sizeof (sb->zs_gen)); - SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL, - &sb->zs_links, sizeof (sb->zs_links)); - SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL, - &sb->zs_ctime, sizeof (sb->zs_ctime)); - - return (sa_bulk_lookup(hdl, bulk, count)); -} - -static int -zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl, - sa_attr_type_t *sa_table, char *buf, int len) -{ - sa_handle_t *sa_hdl; - sa_handle_t *prevhdl = NULL; - dmu_buf_t *prevdb = NULL; - dmu_buf_t *sa_db = NULL; - char *path = buf + len - 1; - int error; - - *path = '\0'; - sa_hdl = hdl; - - uint64_t deleteq_obj; - VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ, - ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj)); - error = zap_lookup_int(osp, deleteq_obj, obj); - if (error == 0) { - return (ESTALE); - } else if (error != ENOENT) { - return (error); - } - error = 0; - - for (;;) { - uint64_t pobj = 0; - char component[MAXNAMELEN + 2]; - size_t complen; - int is_xattrdir = 0; - - if (prevdb) - zfs_release_sa_handle(prevhdl, prevdb, FTAG); - - if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj, - &is_xattrdir)) != 0) - break; - - if (pobj == obj) { - if (path[0] != '/') - *--path = '/'; - break; - } - - component[0] = '/'; - if (is_xattrdir) { - (void) sprintf(component + 1, "<xattrdir>"); - } else { - error = zap_value_search(osp, pobj, obj, - ZFS_DIRENT_OBJ(-1ULL), component + 1); - if (error != 0) - break; - } - - complen = strlen(component); - path -= complen; - ASSERT(path >= buf); - bcopy(component, path, complen); - obj = pobj; - - if (sa_hdl != hdl) { - prevhdl = sa_hdl; - prevdb = sa_db; - } - error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG); - if (error != 
0) { - sa_hdl = prevhdl; - sa_db = prevdb; - break; - } - } - - if (sa_hdl != NULL && sa_hdl != hdl) { - ASSERT(sa_db != NULL); - zfs_release_sa_handle(sa_hdl, sa_db, FTAG); - } - - if (error == 0) - (void) memmove(buf, path, buf + len - path); - - return (error); -} - -int -zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) -{ - sa_attr_type_t *sa_table; - sa_handle_t *hdl; - dmu_buf_t *db; - int error; - - error = zfs_sa_setup(osp, &sa_table); - if (error != 0) - return (error); - - error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); - if (error != 0) - return (error); - - error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); - - zfs_release_sa_handle(hdl, db, FTAG); - return (error); -} - -int -zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, - char *buf, int len) -{ - char *path = buf + len - 1; - sa_attr_type_t *sa_table; - sa_handle_t *hdl; - dmu_buf_t *db; - int error; - - *path = '\0'; - - error = zfs_sa_setup(osp, &sa_table); - if (error != 0) - return (error); - - error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); - if (error != 0) - return (error); - - error = zfs_obj_to_stats_impl(hdl, sa_table, sb); - if (error != 0) { - zfs_release_sa_handle(hdl, db, FTAG); - return (error); - } - - error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); - - zfs_release_sa_handle(hdl, db, FTAG); - return (error); -} - -#if defined(_KERNEL) -EXPORT_SYMBOL(zfs_create_fs); -EXPORT_SYMBOL(zfs_obj_to_path); - -/* CSTYLED */ -module_param(zfs_object_mutex_size, uint, 0644); -MODULE_PARM_DESC(zfs_object_mutex_size, "Size of znode hold array"); -module_param(zfs_unlink_suspend_progress, int, 0644); -MODULE_PARM_DESC(zfs_unlink_suspend_progress, "Set to prevent async unlinks " -"(debug - leaks space into the unlinked set)"); -#endif diff --git a/module/zfs/zio_crypt.c b/module/zfs/zio_crypt.c deleted file mode 100644 index 7cf20f413..000000000 --- a/module/zfs/zio_crypt.c +++ /dev/null @@ -1,2036 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2017, Datto, Inc. All rights reserved. - */ - -#include <sys/zio_crypt.h> -#include <sys/dmu.h> -#include <sys/dmu_objset.h> -#include <sys/dnode.h> -#include <sys/fs/zfs.h> -#include <sys/zio.h> -#include <sys/zil.h> -#include <sys/sha2.h> -#include <sys/hkdf.h> -#include "qat.h" - -/* - * This file is responsible for handling all of the details of generating - * encryption parameters and performing encryption and authentication. - * - * BLOCK ENCRYPTION PARAMETERS: - * Encryption /Authentication Algorithm Suite (crypt): - * The encryption algorithm, mode, and key length we are going to use. We - * currently support AES in either GCM or CCM modes with 128, 192, and 256 bit - * keys. All authentication is currently done with SHA512-HMAC. - * - * Plaintext: - * The unencrypted data that we want to encrypt. - * - * Initialization Vector (IV): - * An initialization vector for the encryption algorithms. 
This is used to - * "tweak" the encryption algorithms so that two blocks of the same data are - * encrypted into different ciphertext outputs, thus obfuscating block patterns. - * The supported encryption modes (AES-GCM and AES-CCM) require that an IV is - * never reused with the same encryption key. This value is stored unencrypted - * and must simply be provided to the decryption function. We use a 96 bit IV - * (as recommended by NIST) for all block encryption. For non-dedup blocks we - * derive the IV randomly. The first 64 bits of the IV are stored in the second - * word of DVA[2] and the remaining 32 bits are stored in the upper 32 bits of - * blk_fill. This is safe because encrypted blocks can't use the upper 32 bits - * of blk_fill. We only encrypt level 0 blocks, which normally have a fill count - * of 1. The only exception is for DMU_OT_DNODE objects, where the fill count of - * level 0 blocks is the number of allocated dnodes in that block. The on-disk - * format supports at most 2^15 slots per L0 dnode block, because the maximum - * block size is 16MB (2^24). In either case, for level 0 blocks this number - * will still be smaller than UINT32_MAX so it is safe to store the IV in the - * top 32 bits of blk_fill, while leaving the bottom 32 bits of the fill count - * for the dnode code. - * - * Master key: - * This is the most important secret data of an encrypted dataset. It is used - * along with the salt to generate the actual encryption keys via HKDF. We - * do not use the master key to directly encrypt any data because there are - * theoretical limits on how much data can actually be safely encrypted with - * any encryption mode. The master key is stored encrypted on disk with the - * user's wrapping key. Its length is determined by the encryption algorithm. - * For details on how this is stored see the block comment in dsl_crypt.c - * - * Salt: - * Used as an input to the HKDF function, along with the master key. We use a - * 64 bit salt, stored unencrypted in the first word of DVA[2]. Any given salt - * can be used for encrypting many blocks, so we cache the current salt and the - * associated derived key in zio_crypt_t so we do not need to derive it again - * needlessly. - * - * Encryption Key: - * A secret binary key, generated from an HKDF function used to encrypt and - * decrypt data. - * - * Message Authentication Code (MAC) - * The MAC is an output of authenticated encryption modes such as AES-GCM and - * AES-CCM. Its purpose is to ensure that an attacker cannot modify encrypted - * data on disk and return garbage to the application. Effectively, it is a - * checksum that cannot be reproduced by an attacker. We store the MAC in the - * second 128 bits of blk_cksum, leaving the first 128 bits for a truncated - * regular checksum of the ciphertext which can be used for scrubbing. - * - * OBJECT AUTHENTICATION: - * Some object types, such as DMU_OT_MASTER_NODE, cannot be encrypted because - * they contain some info that always needs to be readable. To prevent this - * data from being altered, we authenticate this data using SHA512-HMAC. This - * will produce a MAC (similar to the one produced via encryption) which can - * be used to verify the object was not modified. HMACs do not require key - * rotation or IVs, so we can keep up to the full 3 copies of authenticated - * data. - * - * ZIL ENCRYPTION: - * ZIL blocks have their bp written to disk ahead of the associated data, so we - * cannot store the MAC there as we normally do.
For these blocks the MAC is - * stored in the embedded checksum within the zil_chain_t header. The salt and - * IV are generated for the block on bp allocation instead of at encryption - * time. In addition, ZIL blocks have some pieces that must be left in plaintext - * for claiming even though all of the sensitive user data still needs to be - * encrypted. The function zio_crypt_init_uios_zil() handles parsing which - * pieces of the block need to be encrypted. All data that is not encrypted is - * authenticated using the AAD mechanisms that the supported encryption modes - * provide for. In order to preserve the semantics of the ZIL for encrypted - * datasets, the ZIL is not protected at the objset level as described below. - * - * DNODE ENCRYPTION: - * Similarly to ZIL blocks, the core part of each dnode_phys_t needs to be left - * in plaintext for scrubbing and claiming, but the bonus buffers might contain - * sensitive user data. The function zio_crypt_init_uios_dnode() handles parsing - * which pieces of the block need to be encrypted. For more details about - * dnode authentication and encryption, see zio_crypt_init_uios_dnode(). - * - * OBJECT SET AUTHENTICATION: - * Up to this point, everything we have encrypted and authenticated has been - * at level 0 (or -2 for the ZIL). If we did not do any further work the - * on-disk format would be susceptible to attacks that deleted or rearranged - * the order of level 0 blocks. Ideally, the cleanest solution would be to - * maintain a tree of authentication MACs going up the bp tree. However, this - * presents a problem for raw sends. Send files do not send information about - * indirect blocks so there would be no convenient way to transfer the MACs and - * they cannot be recalculated on the receive side without the master key which - * would defeat one of the purposes of raw sends in the first place. Instead, - * for the indirect levels of the bp tree, we use a regular SHA512 of the MACs - * from the level below. We also include some portable fields from blk_prop such - * as the lsize and compression algorithm to prevent the data from being - * misinterpreted. - * - * At the objset level, we maintain 2 separate 256 bit MACs in the - * objset_phys_t. The first one is "portable" and is the logical root of the - * MAC tree maintained in the metadnode's bps. The second is "local" and is - * used as the root MAC for the user accounting objects, which are also not - * transferred via "zfs send". The portable MAC is sent in the DRR_BEGIN payload - * of the send file. The useraccounting code ensures that the useraccounting - * info is not present upon a receive, so the local MAC can simply be cleared - * out at that time. For more info about objset_phys_t authentication, see - * zio_crypt_do_objset_hmacs(). - * - * CONSIDERATIONS FOR DEDUP: - * In order for dedup to work, blocks that we want to dedup with one another - * need to use the same IV and encryption key, so that they will have the same - * ciphertext. Normally, one should never reuse an IV with the same encryption - * key or else AES-GCM and AES-CCM can both actually leak the plaintext of both - * blocks. In this case, however, since we are using the same plaintext as - * well, all that we end up with is a duplicate of the original ciphertext we - * already had. As a result, an attacker with read access to the raw disk will - * be able to tell which blocks are the same but this information is given away - * by dedup anyway.
In order to get the same IVs and encryption keys for - * equivalent blocks of data we use an HMAC of the plaintext. We use an HMAC - * here so that a reproducible checksum of the plaintext is never available to - * the attacker. The HMAC key is kept alongside the master key, encrypted on - * disk. The first 64 bits of the HMAC are used in place of the random salt, and - * the next 96 bits are used as the IV. As a result of this mechanism, dedup - * will only work within a clone family since encrypted dedup requires use of - * the same master and HMAC keys. - */ - -/* - * After encrypting many blocks with the same key we may start to run up - * against the theoretical limits of how much data can securely be encrypted - * with a single key using the supported encryption modes. The most obvious - * limitation is that our risk of generating 2 equivalent 96 bit IVs (which - * both GCM and CCM modes strictly forbid) increases the more IVs we generate. - * This risk actually grows surprisingly quickly over time according to the - * Birthday Problem. With a total IV space of 2^(96 bits), and assuming we have - * generated n IVs with a cryptographically secure RNG, the approximate - * probability p(n) of a collision is given as: - * - * p(n) ~= 1 - e^(-n*(n-1)/(2*(2^96))) - * - * [http://www.math.cornell.edu/~mec/2008-2009/TianyiZheng/Birthday.html] - * - * Assuming that we want to ensure that p(n) never goes over 1 / 1 trillion - * we must not write more than 398,065,730 blocks with the same encryption key. - * Therefore, we rotate our keys after 400,000,000 blocks have been written by - * generating a new random 64 bit salt for our HKDF encryption key generation - * function. - */ -#define ZFS_KEY_MAX_SALT_USES_DEFAULT 400000000 -#define ZFS_CURRENT_MAX_SALT_USES \ - (MIN(zfs_key_max_salt_uses, ZFS_KEY_MAX_SALT_USES_DEFAULT)) -unsigned long zfs_key_max_salt_uses = ZFS_KEY_MAX_SALT_USES_DEFAULT; - -typedef struct blkptr_auth_buf { - uint64_t bab_prop; /* blk_prop - portable mask */ - uint8_t bab_mac[ZIO_DATA_MAC_LEN]; /* MAC from blk_cksum */
- uint64_t bab_pad; /* reserved for future use */ -} blkptr_auth_buf_t; - -zio_crypt_info_t zio_crypt_table[ZIO_CRYPT_FUNCTIONS] = { - {"", ZC_TYPE_NONE, 0, "inherit"}, - {"", ZC_TYPE_NONE, 0, "on"}, - {"", ZC_TYPE_NONE, 0, "off"}, - {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 16, "aes-128-ccm"}, - {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 24, "aes-192-ccm"}, - {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 32, "aes-256-ccm"}, - {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 16, "aes-128-gcm"}, - {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 24, "aes-192-gcm"}, - {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 32, "aes-256-gcm"} -}; - -void -zio_crypt_key_destroy(zio_crypt_key_t *key) -{ - rw_destroy(&key->zk_salt_lock); - - /* free crypto templates */ - crypto_destroy_ctx_template(key->zk_current_tmpl); - crypto_destroy_ctx_template(key->zk_hmac_tmpl); - - /* zero out sensitive data */ - bzero(key, sizeof (zio_crypt_key_t)); -} - -int -zio_crypt_key_init(uint64_t crypt, zio_crypt_key_t *key) -{ - int ret; - crypto_mechanism_t mech; - uint_t keydata_len; - - ASSERT(key != NULL); - ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); - - keydata_len = zio_crypt_table[crypt].ci_keylen; - bzero(key, sizeof (zio_crypt_key_t)); - - /* fill keydata buffers and salt with random data */ - ret = random_get_bytes((uint8_t *)&key->zk_guid, sizeof (uint64_t)); - if (ret != 0) - goto error; - - ret = random_get_bytes(key->zk_master_keydata, keydata_len); - if (ret != 0) - goto error; - - ret = random_get_bytes(key->zk_hmac_keydata, SHA512_HMAC_KEYLEN); - if (ret != 0) - goto error; - - ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN); - if (ret != 0) - goto error; - - /* derive the current key from the master key */ - ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, - key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, - keydata_len); - if (ret != 0) - goto error; - - /* initialize keys for the ICP */ - key->zk_current_key.ck_format = CRYPTO_KEY_RAW; - key->zk_current_key.ck_data = key->zk_current_keydata; - key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len); - - key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW; - key->zk_hmac_key.ck_data = &key->zk_hmac_key; - key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN); - - /* - * Initialize the crypto templates. It's ok if this fails because - * this is just an optimization. 
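
A quick back-of-the-envelope check of the 398,065,730 figure quoted in the salt-rotation comment above. This is a standalone userland sketch, not module code; it simply inverts the collision approximation for a budget of 1e-12:

#include <math.h>
#include <stdio.h>

int
main(void)
{
	double ivspace = ldexp(1.0, 96);	/* 2^96 possible 96 bit IVs */
	double budget = 1e-12;			/* "1 / 1 trillion" */

	/*
	 * For small p, 1 - e^(-n*(n-1)/(2*2^96)) ~= n^2 / (2*2^96),
	 * so the largest safe n is roughly sqrt(2 * 2^96 * p).
	 */
	double n = sqrt(2.0 * ivspace * budget);
	printf("max IVs per key: %.0f\n", n);	/* prints ~398065730 */
	return (0);
}
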
- */ - mech.cm_type = crypto_mech2id(zio_crypt_table[crypt].ci_mechname); - ret = crypto_create_ctx_template(&mech, &key->zk_current_key, - &key->zk_current_tmpl, KM_SLEEP); - if (ret != CRYPTO_SUCCESS) - key->zk_current_tmpl = NULL; - - mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); - ret = crypto_create_ctx_template(&mech, &key->zk_hmac_key, - &key->zk_hmac_tmpl, KM_SLEEP); - if (ret != CRYPTO_SUCCESS) - key->zk_hmac_tmpl = NULL; - - key->zk_crypt = crypt; - key->zk_version = ZIO_CRYPT_KEY_CURRENT_VERSION; - key->zk_salt_count = 0; - rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); - - return (0); - -error: - zio_crypt_key_destroy(key); - return (ret); -} - -static int -zio_crypt_key_change_salt(zio_crypt_key_t *key) -{ - int ret = 0; - uint8_t salt[ZIO_DATA_SALT_LEN]; - crypto_mechanism_t mech; - uint_t keydata_len = zio_crypt_table[key->zk_crypt].ci_keylen; - - /* generate a new salt */ - ret = random_get_bytes(salt, ZIO_DATA_SALT_LEN); - if (ret != 0) - goto error; - - rw_enter(&key->zk_salt_lock, RW_WRITER); - - /* someone beat us to the salt rotation, just unlock and return */ - if (key->zk_salt_count < ZFS_CURRENT_MAX_SALT_USES) - goto out_unlock; - - /* derive the current key from the master key and the new salt */ - ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, - salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len); - if (ret != 0) - goto out_unlock; - - /* assign the salt and reset the usage count */ - bcopy(salt, key->zk_salt, ZIO_DATA_SALT_LEN); - key->zk_salt_count = 0; - - /* destroy the old context template and create the new one */ - crypto_destroy_ctx_template(key->zk_current_tmpl); - ret = crypto_create_ctx_template(&mech, &key->zk_current_key, - &key->zk_current_tmpl, KM_SLEEP); - if (ret != CRYPTO_SUCCESS) - key->zk_current_tmpl = NULL; - - rw_exit(&key->zk_salt_lock); - - return (0); - -out_unlock: - rw_exit(&key->zk_salt_lock); -error: - return (ret); -} - -/* See comment above zfs_key_max_salt_uses definition for details */ -int -zio_crypt_key_get_salt(zio_crypt_key_t *key, uint8_t *salt) -{ - int ret; - boolean_t salt_change; - - rw_enter(&key->zk_salt_lock, RW_READER); - - bcopy(key->zk_salt, salt, ZIO_DATA_SALT_LEN); - salt_change = (atomic_inc_64_nv(&key->zk_salt_count) >= - ZFS_CURRENT_MAX_SALT_USES); - - rw_exit(&key->zk_salt_lock); - - if (salt_change) { - ret = zio_crypt_key_change_salt(key); - if (ret != 0) - goto error; - } - - return (0); - -error: - return (ret); -} - -/* - * This function handles all encryption and decryption in zfs. When - * encrypting it expects puio to reference the plaintext and cuio to - * reference the ciphertext. cuio must have enough space for the - * ciphertext + room for a MAC. datalen should be the length of the - * plaintext / ciphertext alone. 
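
The zio_do_crypt_uio() routine that follows drives AES-CCM/GCM through the kernel ICP. For readers unfamiliar with the encrypt-with-AAD-and-tag flow it requests, here is a minimal userland sketch of the same shape using OpenSSL's EVP interface (illustrative only; the all-zero key and IV are placeholders, and the module itself never uses OpenSSL):

#include <openssl/evp.h>
#include <stdio.h>

int
main(void)
{
	unsigned char key[32] = { 0 };	/* placeholder for the derived key */
	unsigned char iv[12] = { 0 };	/* placeholder 96 bit IV */
	unsigned char aad[] = "authenticated-but-not-encrypted";
	unsigned char pt[] = "sensitive payload";
	unsigned char ct[sizeof (pt)], tag[16];
	int len, ctlen;

	EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
	EVP_EncryptInit_ex(ctx, EVP_aes_256_gcm(), NULL, NULL, NULL);
	EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_IVLEN, sizeof (iv), NULL);
	EVP_EncryptInit_ex(ctx, NULL, NULL, key, iv);

	/* AAD is covered by the tag but produces no ciphertext */
	EVP_EncryptUpdate(ctx, NULL, &len, aad, sizeof (aad));
	EVP_EncryptUpdate(ctx, ct, &ctlen, pt, sizeof (pt));
	EVP_EncryptFinal_ex(ctx, ct + ctlen, &len);

	/* the 16 byte tag plays the role of the trailing MAC iovec */
	EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_GET_TAG, sizeof (tag), tag);
	EVP_CIPHER_CTX_free(ctx);

	printf("%d ciphertext bytes, tag starts %02x%02x\n", ctlen,
	    tag[0], tag[1]);
	return (0);
}
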
- */ -static int -zio_do_crypt_uio(boolean_t encrypt, uint64_t crypt, crypto_key_t *key, - crypto_ctx_template_t tmpl, uint8_t *ivbuf, uint_t datalen, - uio_t *puio, uio_t *cuio, uint8_t *authbuf, uint_t auth_len) -{ - int ret; - crypto_data_t plaindata, cipherdata; - CK_AES_CCM_PARAMS ccmp; - CK_AES_GCM_PARAMS gcmp; - crypto_mechanism_t mech; - zio_crypt_info_t crypt_info; - uint_t plain_full_len, maclen; - - ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); - ASSERT3U(key->ck_format, ==, CRYPTO_KEY_RAW); - - /* lookup the encryption info */ - crypt_info = zio_crypt_table[crypt]; - - /* the mac will always be the last iovec_t in the cipher uio */ - maclen = cuio->uio_iov[cuio->uio_iovcnt - 1].iov_len; - - ASSERT(maclen <= ZIO_DATA_MAC_LEN); - - /* setup encryption mechanism (same as crypt) */ - mech.cm_type = crypto_mech2id(crypt_info.ci_mechname); - - /* - * Strangely, the ICP requires that plain_full_len include - * the MAC length when decrypting, even though the UIO does not - * need to have the extra space allocated. - */ - if (encrypt) { - plain_full_len = datalen; - } else { - plain_full_len = datalen + maclen; - } - - /* - * setup encryption params (currently only AES CCM and AES GCM - * are supported) - */ - if (crypt_info.ci_crypt_type == ZC_TYPE_CCM) { - ccmp.ulNonceSize = ZIO_DATA_IV_LEN; - ccmp.ulAuthDataSize = auth_len; - ccmp.authData = authbuf; - ccmp.ulMACSize = maclen; - ccmp.nonce = ivbuf; - ccmp.ulDataSize = plain_full_len; - - mech.cm_param = (char *)(&ccmp); - mech.cm_param_len = sizeof (CK_AES_CCM_PARAMS); - } else { - gcmp.ulIvLen = ZIO_DATA_IV_LEN; - gcmp.ulIvBits = CRYPTO_BYTES2BITS(ZIO_DATA_IV_LEN); - gcmp.ulAADLen = auth_len; - gcmp.pAAD = authbuf; - gcmp.ulTagBits = CRYPTO_BYTES2BITS(maclen); - gcmp.pIv = ivbuf; - - mech.cm_param = (char *)(&gcmp); - mech.cm_param_len = sizeof (CK_AES_GCM_PARAMS); - } - - /* populate the cipher and plain data structs.
*/ - plaindata.cd_format = CRYPTO_DATA_UIO; - plaindata.cd_offset = 0; - plaindata.cd_uio = puio; - plaindata.cd_miscdata = NULL; - plaindata.cd_length = plain_full_len; - - cipherdata.cd_format = CRYPTO_DATA_UIO; - cipherdata.cd_offset = 0; - cipherdata.cd_uio = cuio; - cipherdata.cd_miscdata = NULL; - cipherdata.cd_length = datalen + maclen; - - /* perform the actual encryption */ - if (encrypt) { - ret = crypto_encrypt(&mech, &plaindata, key, tmpl, &cipherdata, - NULL); - if (ret != CRYPTO_SUCCESS) { - ret = SET_ERROR(EIO); - goto error; - } - } else { - ret = crypto_decrypt(&mech, &cipherdata, key, tmpl, &plaindata, - NULL); - if (ret != CRYPTO_SUCCESS) { - ASSERT3U(ret, ==, CRYPTO_INVALID_MAC); - ret = SET_ERROR(ECKSUM); - goto error; - } - } - - return (0); - -error: - return (ret); -} - -int -zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv, - uint8_t *mac, uint8_t *keydata_out, uint8_t *hmac_keydata_out) -{ - int ret; - uio_t puio, cuio; - uint64_t aad[3]; - iovec_t plain_iovecs[2], cipher_iovecs[3]; - uint64_t crypt = key->zk_crypt; - uint_t enc_len, keydata_len, aad_len; - - ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); - ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW); - - keydata_len = zio_crypt_table[crypt].ci_keylen; - - /* generate iv for wrapping the master and hmac key */ - ret = random_get_pseudo_bytes(iv, WRAPPING_IV_LEN); - if (ret != 0) - goto error; - - /* initialize uio_ts */ - plain_iovecs[0].iov_base = key->zk_master_keydata; - plain_iovecs[0].iov_len = keydata_len; - plain_iovecs[1].iov_base = key->zk_hmac_keydata; - plain_iovecs[1].iov_len = SHA512_HMAC_KEYLEN; - - cipher_iovecs[0].iov_base = keydata_out; - cipher_iovecs[0].iov_len = keydata_len; - cipher_iovecs[1].iov_base = hmac_keydata_out; - cipher_iovecs[1].iov_len = SHA512_HMAC_KEYLEN; - cipher_iovecs[2].iov_base = mac; - cipher_iovecs[2].iov_len = WRAPPING_MAC_LEN; - - /* - * Although we don't support writing to the old format, we do - * support rewrapping the key so that the user can move and - * quarantine datasets on the old format. 
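
The wrapping-key AAD built below is nothing more than two or three little-endian words. A hedged sketch of the same construction, with LE_64() replaced by a portable helper (h2le64 and build_wrap_aad are invented names, not module functions):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* portable host-to-little-endian, standing in for LE_64() */
static uint64_t
h2le64(uint64_t v)
{
	uint8_t b[8];
	int i;

	for (i = 0; i < 8; i++)
		b[i] = (uint8_t)(v >> (8 * i));
	memcpy(&v, b, sizeof (b));
	return (v);
}

/* version 0 authenticated only the guid; later keys add crypt + version */
static size_t
build_wrap_aad(uint64_t guid, uint64_t crypt, uint64_t version,
    uint64_t aad[3])
{
	aad[0] = h2le64(guid);
	if (version == 0)
		return (sizeof (uint64_t));
	aad[1] = h2le64(crypt);
	aad[2] = h2le64(version);
	return (3 * sizeof (uint64_t));
}
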
- */ - if (key->zk_version == 0) { - aad_len = sizeof (uint64_t); - aad[0] = LE_64(key->zk_guid); - } else { - ASSERT3U(key->zk_version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); - aad_len = sizeof (uint64_t) * 3; - aad[0] = LE_64(key->zk_guid); - aad[1] = LE_64(crypt); - aad[2] = LE_64(key->zk_version); - } - - enc_len = zio_crypt_table[crypt].ci_keylen + SHA512_HMAC_KEYLEN; - puio.uio_iov = plain_iovecs; - puio.uio_iovcnt = 2; - puio.uio_segflg = UIO_SYSSPACE; - cuio.uio_iov = cipher_iovecs; - cuio.uio_iovcnt = 3; - cuio.uio_segflg = UIO_SYSSPACE; - - /* encrypt the keys and store the resulting ciphertext and mac */ - ret = zio_do_crypt_uio(B_TRUE, crypt, cwkey, NULL, iv, enc_len, - &puio, &cuio, (uint8_t *)aad, aad_len); - if (ret != 0) - goto error; - - return (0); - -error: - return (ret); -} - -int -zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version, - uint64_t guid, uint8_t *keydata, uint8_t *hmac_keydata, uint8_t *iv, - uint8_t *mac, zio_crypt_key_t *key) -{ - int ret; - crypto_mechanism_t mech; - uio_t puio, cuio; - uint64_t aad[3]; - iovec_t plain_iovecs[2], cipher_iovecs[3]; - uint_t enc_len, keydata_len, aad_len; - - ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); - ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW); - - rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); - - keydata_len = zio_crypt_table[crypt].ci_keylen; - - /* initialize uio_ts */ - plain_iovecs[0].iov_base = key->zk_master_keydata; - plain_iovecs[0].iov_len = keydata_len; - plain_iovecs[1].iov_base = key->zk_hmac_keydata; - plain_iovecs[1].iov_len = SHA512_HMAC_KEYLEN; - - cipher_iovecs[0].iov_base = keydata; - cipher_iovecs[0].iov_len = keydata_len; - cipher_iovecs[1].iov_base = hmac_keydata; - cipher_iovecs[1].iov_len = SHA512_HMAC_KEYLEN; - cipher_iovecs[2].iov_base = mac; - cipher_iovecs[2].iov_len = WRAPPING_MAC_LEN; - - if (version == 0) { - aad_len = sizeof (uint64_t); - aad[0] = LE_64(guid); - } else { - ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); - aad_len = sizeof (uint64_t) * 3; - aad[0] = LE_64(guid); - aad[1] = LE_64(crypt); - aad[2] = LE_64(version); - } - - enc_len = keydata_len + SHA512_HMAC_KEYLEN; - puio.uio_iov = plain_iovecs; - puio.uio_segflg = UIO_SYSSPACE; - puio.uio_iovcnt = 2; - cuio.uio_iov = cipher_iovecs; - cuio.uio_iovcnt = 3; - cuio.uio_segflg = UIO_SYSSPACE; - - /* decrypt the keys and store the result in the output buffers */ - ret = zio_do_crypt_uio(B_FALSE, crypt, cwkey, NULL, iv, enc_len, - &puio, &cuio, (uint8_t *)aad, aad_len); - if (ret != 0) - goto error; - - /* generate a fresh salt */ - ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN); - if (ret != 0) - goto error; - - /* derive the current key from the master key */ - ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, - key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, - keydata_len); - if (ret != 0) - goto error; - - /* initialize keys for ICP */ - key->zk_current_key.ck_format = CRYPTO_KEY_RAW; - key->zk_current_key.ck_data = key->zk_current_keydata; - key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len); - - key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW; - key->zk_hmac_key.ck_data = key->zk_hmac_keydata; - key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN); - - /* - * Initialize the crypto templates. It's ok if this fails because - * this is just an optimization. 
- */ - mech.cm_type = crypto_mech2id(zio_crypt_table[crypt].ci_mechname); - ret = crypto_create_ctx_template(&mech, &key->zk_current_key, - &key->zk_current_tmpl, KM_SLEEP); - if (ret != CRYPTO_SUCCESS) - key->zk_current_tmpl = NULL; - - mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); - ret = crypto_create_ctx_template(&mech, &key->zk_hmac_key, - &key->zk_hmac_tmpl, KM_SLEEP); - if (ret != CRYPTO_SUCCESS) - key->zk_hmac_tmpl = NULL; - - key->zk_crypt = crypt; - key->zk_version = version; - key->zk_guid = guid; - key->zk_salt_count = 0; - - return (0); - -error: - zio_crypt_key_destroy(key); - return (ret); -} - -int -zio_crypt_generate_iv(uint8_t *ivbuf) -{ - int ret; - - /* randomly generate the IV */ - ret = random_get_pseudo_bytes(ivbuf, ZIO_DATA_IV_LEN); - if (ret != 0) - goto error; - - return (0); - -error: - bzero(ivbuf, ZIO_DATA_IV_LEN); - return (ret); -} - -int -zio_crypt_do_hmac(zio_crypt_key_t *key, uint8_t *data, uint_t datalen, - uint8_t *digestbuf, uint_t digestlen) -{ - int ret; - crypto_mechanism_t mech; - crypto_data_t in_data, digest_data; - uint8_t raw_digestbuf[SHA512_DIGEST_LENGTH]; - - ASSERT3U(digestlen, <=, SHA512_DIGEST_LENGTH); - - /* initialize sha512-hmac mechanism and crypto data */ - mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); - mech.cm_param = NULL; - mech.cm_param_len = 0; - - /* initialize the crypto data */ - in_data.cd_format = CRYPTO_DATA_RAW; - in_data.cd_offset = 0; - in_data.cd_length = datalen; - in_data.cd_raw.iov_base = (char *)data; - in_data.cd_raw.iov_len = in_data.cd_length; - - digest_data.cd_format = CRYPTO_DATA_RAW; - digest_data.cd_offset = 0; - digest_data.cd_length = SHA512_DIGEST_LENGTH; - digest_data.cd_raw.iov_base = (char *)raw_digestbuf; - digest_data.cd_raw.iov_len = digest_data.cd_length; - - /* generate the hmac */ - ret = crypto_mac(&mech, &in_data, &key->zk_hmac_key, key->zk_hmac_tmpl, - &digest_data, NULL); - if (ret != CRYPTO_SUCCESS) { - ret = SET_ERROR(EIO); - goto error; - } - - bcopy(raw_digestbuf, digestbuf, digestlen); - - return (0); - -error: - bzero(digestbuf, digestlen); - return (ret); -} - -int -zio_crypt_generate_iv_salt_dedup(zio_crypt_key_t *key, uint8_t *data, - uint_t datalen, uint8_t *ivbuf, uint8_t *salt) -{ - int ret; - uint8_t digestbuf[SHA512_DIGEST_LENGTH]; - - ret = zio_crypt_do_hmac(key, data, datalen, - digestbuf, SHA512_DIGEST_LENGTH); - if (ret != 0) - return (ret); - - bcopy(digestbuf, salt, ZIO_DATA_SALT_LEN); - bcopy(digestbuf + ZIO_DATA_SALT_LEN, ivbuf, ZIO_DATA_IV_LEN); - - return (0); -} - -/* - * The following functions are used to encode and decode encryption parameters - * into blkptr_t and zil_header_t. The ICP wants to use these parameters as - * byte strings, which normally means that these strings would not need to deal - * with byteswapping at all. However, both blkptr_t and zil_header_t may be - * byteswapped by lower layers and so we must "undo" that byteswap here upon - * decoding and encoding in a non-native byteorder. These functions require - * that the byteorder bit is correct before being called. 
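
Stripped of the byteswap handling, the encode/decode pair that follows just splits an 8 byte salt and a 12 byte IV across two 64-bit words and one 32-bit field. A simplified round-trip sketch with invented field names:

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* invented stand-ins for DVA[2] and the IV2 bits of blk_fill */
typedef struct {
	uint64_t word0;		/* 64 bit salt */
	uint64_t word1;		/* first 64 bits of the 96 bit IV */
	uint32_t iv2;		/* remaining 32 bits of the IV */
} params_t;

static void
encode(params_t *p, const uint8_t *salt, const uint8_t *iv)
{
	memcpy(&p->word0, salt, 8);
	memcpy(&p->word1, iv, 8);
	memcpy(&p->iv2, iv + 8, 4);
}

static void
decode(const params_t *p, uint8_t *salt, uint8_t *iv)
{
	memcpy(salt, &p->word0, 8);
	memcpy(iv, &p->word1, 8);
	memcpy(iv + 8, &p->iv2, 4);
}

int
main(void)
{
	uint8_t salt[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
	uint8_t iv[12] = { 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20 };
	uint8_t s2[8], iv2[12];
	params_t p;

	encode(&p, salt, iv);
	decode(&p, s2, iv2);
	assert(memcmp(salt, s2, 8) == 0 && memcmp(iv, iv2, 12) == 0);
	return (0);
}
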
- */ -void -zio_crypt_encode_params_bp(blkptr_t *bp, uint8_t *salt, uint8_t *iv) -{ - uint64_t val64; - uint32_t val32; - - ASSERT(BP_IS_ENCRYPTED(bp)); - - if (!BP_SHOULD_BYTESWAP(bp)) { - bcopy(salt, &bp->blk_dva[2].dva_word[0], sizeof (uint64_t)); - bcopy(iv, &bp->blk_dva[2].dva_word[1], sizeof (uint64_t)); - bcopy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t)); - BP_SET_IV2(bp, val32); - } else { - bcopy(salt, &val64, sizeof (uint64_t)); - bp->blk_dva[2].dva_word[0] = BSWAP_64(val64); - - bcopy(iv, &val64, sizeof (uint64_t)); - bp->blk_dva[2].dva_word[1] = BSWAP_64(val64); - - bcopy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t)); - BP_SET_IV2(bp, BSWAP_32(val32)); - } -} - -void -zio_crypt_decode_params_bp(const blkptr_t *bp, uint8_t *salt, uint8_t *iv) -{ - uint64_t val64; - uint32_t val32; - - ASSERT(BP_IS_PROTECTED(bp)); - - /* for convenience, so callers don't need to check */ - if (BP_IS_AUTHENTICATED(bp)) { - bzero(salt, ZIO_DATA_SALT_LEN); - bzero(iv, ZIO_DATA_IV_LEN); - return; - } - - if (!BP_SHOULD_BYTESWAP(bp)) { - bcopy(&bp->blk_dva[2].dva_word[0], salt, sizeof (uint64_t)); - bcopy(&bp->blk_dva[2].dva_word[1], iv, sizeof (uint64_t)); - - val32 = (uint32_t)BP_GET_IV2(bp); - bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t)); - } else { - val64 = BSWAP_64(bp->blk_dva[2].dva_word[0]); - bcopy(&val64, salt, sizeof (uint64_t)); - - val64 = BSWAP_64(bp->blk_dva[2].dva_word[1]); - bcopy(&val64, iv, sizeof (uint64_t)); - - val32 = BSWAP_32((uint32_t)BP_GET_IV2(bp)); - bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t)); - } -} - -void -zio_crypt_encode_mac_bp(blkptr_t *bp, uint8_t *mac) -{ - uint64_t val64; - - ASSERT(BP_USES_CRYPT(bp)); - ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_OBJSET); - - if (!BP_SHOULD_BYTESWAP(bp)) { - bcopy(mac, &bp->blk_cksum.zc_word[2], sizeof (uint64_t)); - bcopy(mac + sizeof (uint64_t), &bp->blk_cksum.zc_word[3], - sizeof (uint64_t)); - } else { - bcopy(mac, &val64, sizeof (uint64_t)); - bp->blk_cksum.zc_word[2] = BSWAP_64(val64); - - bcopy(mac + sizeof (uint64_t), &val64, sizeof (uint64_t)); - bp->blk_cksum.zc_word[3] = BSWAP_64(val64); - } -} - -void -zio_crypt_decode_mac_bp(const blkptr_t *bp, uint8_t *mac) -{ - uint64_t val64; - - ASSERT(BP_USES_CRYPT(bp) || BP_IS_HOLE(bp)); - - /* for convenience, so callers don't need to check */ - if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { - bzero(mac, ZIO_DATA_MAC_LEN); - return; - } - - if (!BP_SHOULD_BYTESWAP(bp)) { - bcopy(&bp->blk_cksum.zc_word[2], mac, sizeof (uint64_t)); - bcopy(&bp->blk_cksum.zc_word[3], mac + sizeof (uint64_t), - sizeof (uint64_t)); - } else { - val64 = BSWAP_64(bp->blk_cksum.zc_word[2]); - bcopy(&val64, mac, sizeof (uint64_t)); - - val64 = BSWAP_64(bp->blk_cksum.zc_word[3]); - bcopy(&val64, mac + sizeof (uint64_t), sizeof (uint64_t)); - } -} - -void -zio_crypt_encode_mac_zil(void *data, uint8_t *mac) -{ - zil_chain_t *zilc = data; - - bcopy(mac, &zilc->zc_eck.zec_cksum.zc_word[2], sizeof (uint64_t)); - bcopy(mac + sizeof (uint64_t), &zilc->zc_eck.zec_cksum.zc_word[3], - sizeof (uint64_t)); -} - -void -zio_crypt_decode_mac_zil(const void *data, uint8_t *mac) -{ - /* - * The ZIL MAC is embedded in the block it protects, which will - * not have been byteswapped by the time this function has been called. - * As a result, we don't need to worry about byteswapping the MAC. 
- */ - const zil_chain_t *zilc = data; - - bcopy(&zilc->zc_eck.zec_cksum.zc_word[2], mac, sizeof (uint64_t)); - bcopy(&zilc->zc_eck.zec_cksum.zc_word[3], mac + sizeof (uint64_t), - sizeof (uint64_t)); -} - -/* - * This routine takes a block of dnodes (src_abd) and copies only the bonus - * buffers to the same offsets in the dst buffer. datalen should be the size - * of both the src_abd and the dst buffer (not just the length of the bonus - * buffers). - */ -void -zio_crypt_copy_dnode_bonus(abd_t *src_abd, uint8_t *dst, uint_t datalen) -{ - uint_t i, max_dnp = datalen >> DNODE_SHIFT; - uint8_t *src; - dnode_phys_t *dnp, *sdnp, *ddnp; - - src = abd_borrow_buf_copy(src_abd, datalen); - - sdnp = (dnode_phys_t *)src; - ddnp = (dnode_phys_t *)dst; - - for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { - dnp = &sdnp[i]; - if (dnp->dn_type != DMU_OT_NONE && - DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) && - dnp->dn_bonuslen != 0) { - bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]), - DN_MAX_BONUS_LEN(dnp)); - } - } - - abd_return_buf(src_abd, src, datalen); -} - -/* - * This function decides what fields from blk_prop are included in - * the various on-disk MAC algorithms. - */ -static void -zio_crypt_bp_zero_nonportable_blkprop(blkptr_t *bp, uint64_t version) -{ - /* - * Version 0 did not properly zero out all non-portable fields - * as it should have done. We maintain this code so that we can - * do read-only imports of pools on this version. - */ - if (version == 0) { - BP_SET_DEDUP(bp, 0); - BP_SET_CHECKSUM(bp, 0); - BP_SET_PSIZE(bp, SPA_MINBLOCKSIZE); - return; - } - - ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); - - /* - * The hole_birth feature might set these fields even if this bp - * is a hole. We zero them out here to guarantee that raw sends - * will function with or without the feature. - */ - if (BP_IS_HOLE(bp)) { - bp->blk_prop = 0ULL; - return; - } - - /* - * At L0 we want to verify these fields to ensure that data blocks - * cannot be reinterpreted. For instance, we do not want an attacker - * to trick us into returning raw lz4 compressed data to the user - * by modifying the compression bits. At higher levels, we cannot - * enforce this policy since raw sends do not convey any information - * about indirect blocks, so these values might be different on the - * receive side. Fortunately, this does not open any new attack - * vectors, since any alterations that can be made to a higher level - * bp must still verify the correct order of the layer below it. - */ - if (BP_GET_LEVEL(bp) != 0) { - BP_SET_BYTEORDER(bp, 0); - BP_SET_COMPRESS(bp, 0); - - /* - * psize cannot be set to zero or it will trigger - * asserts, but the value doesn't really matter as - * long as it is constant. - */ - BP_SET_PSIZE(bp, SPA_MINBLOCKSIZE); - } - - BP_SET_DEDUP(bp, 0); - BP_SET_CHECKSUM(bp, 0); -} - -static void -zio_crypt_bp_auth_init(uint64_t version, boolean_t should_bswap, blkptr_t *bp, - blkptr_auth_buf_t *bab, uint_t *bab_len) -{ - blkptr_t tmpbp = *bp; - - if (should_bswap) - byteswap_uint64_array(&tmpbp, sizeof (blkptr_t)); - - ASSERT(BP_USES_CRYPT(&tmpbp) || BP_IS_HOLE(&tmpbp)); - ASSERT0(BP_IS_EMBEDDED(&tmpbp)); - - zio_crypt_decode_mac_bp(&tmpbp, bab->bab_mac); - - /* - * We always MAC blk_prop in LE to ensure portability. This - * must be done after decoding the mac, since the endianness - * will get zero'd out here.
- */ - zio_crypt_bp_zero_nonportable_blkprop(&tmpbp, version); - bab->bab_prop = LE_64(tmpbp.blk_prop); - bab->bab_pad = 0ULL; - - /* version 0 did not include the padding */ - *bab_len = sizeof (blkptr_auth_buf_t); - if (version == 0) - *bab_len -= sizeof (uint64_t); -} - -static int -zio_crypt_bp_do_hmac_updates(crypto_context_t ctx, uint64_t version, - boolean_t should_bswap, blkptr_t *bp) -{ - int ret; - uint_t bab_len; - blkptr_auth_buf_t bab; - crypto_data_t cd; - - zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); - cd.cd_format = CRYPTO_DATA_RAW; - cd.cd_offset = 0; - cd.cd_length = bab_len; - cd.cd_raw.iov_base = (char *)&bab; - cd.cd_raw.iov_len = cd.cd_length; - - ret = crypto_mac_update(ctx, &cd, NULL); - if (ret != CRYPTO_SUCCESS) { - ret = SET_ERROR(EIO); - goto error; - } - - return (0); - -error: - return (ret); -} - -static void -zio_crypt_bp_do_indrect_checksum_updates(SHA2_CTX *ctx, uint64_t version, - boolean_t should_bswap, blkptr_t *bp) -{ - uint_t bab_len; - blkptr_auth_buf_t bab; - - zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); - SHA2Update(ctx, &bab, bab_len); -} - -static void -zio_crypt_bp_do_aad_updates(uint8_t **aadp, uint_t *aad_len, uint64_t version, - boolean_t should_bswap, blkptr_t *bp) -{ - uint_t bab_len; - blkptr_auth_buf_t bab; - - zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); - bcopy(&bab, *aadp, bab_len); - *aadp += bab_len; - *aad_len += bab_len; -} - -static int -zio_crypt_do_dnode_hmac_updates(crypto_context_t ctx, uint64_t version, - boolean_t should_bswap, dnode_phys_t *dnp) -{ - int ret, i; - dnode_phys_t *adnp; - boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER); - crypto_data_t cd; - uint8_t tmp_dncore[offsetof(dnode_phys_t, dn_blkptr)]; - - cd.cd_format = CRYPTO_DATA_RAW; - cd.cd_offset = 0; - - /* authenticate the core dnode (masking out non-portable bits) */ - bcopy(dnp, tmp_dncore, sizeof (tmp_dncore)); - adnp = (dnode_phys_t *)tmp_dncore; - if (le_bswap) { - adnp->dn_datablkszsec = BSWAP_16(adnp->dn_datablkszsec); - adnp->dn_bonuslen = BSWAP_16(adnp->dn_bonuslen); - adnp->dn_maxblkid = BSWAP_64(adnp->dn_maxblkid); - adnp->dn_used = BSWAP_64(adnp->dn_used); - } - adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK; - adnp->dn_used = 0; - - cd.cd_length = sizeof (tmp_dncore); - cd.cd_raw.iov_base = (char *)adnp; - cd.cd_raw.iov_len = cd.cd_length; - - ret = crypto_mac_update(ctx, &cd, NULL); - if (ret != CRYPTO_SUCCESS) { - ret = SET_ERROR(EIO); - goto error; - } - - for (i = 0; i < dnp->dn_nblkptr; i++) { - ret = zio_crypt_bp_do_hmac_updates(ctx, version, - should_bswap, &dnp->dn_blkptr[i]); - if (ret != 0) - goto error; - } - - if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { - ret = zio_crypt_bp_do_hmac_updates(ctx, version, - should_bswap, DN_SPILL_BLKPTR(dnp)); - if (ret != 0) - goto error; - } - - return (0); - -error: - return (ret); -} - -/* - * objset_phys_t blocks introduce a number of exceptions to the normal - * authentication process. objset_phys_t's contain 2 separate HMACs for - * protecting the integrity of their data. The portable_mac protects the - * metadnode. This MAC can be sent with a raw send and protects against - * reordering of data within the metadnode. The local_mac protects the user - * accounting objects which are not sent from one system to another. - * - * In addition, objset blocks are the only blocks that can be modified and - * written to disk without the key loaded under certain circumstances.
During - * zil_claim() we need to be able to update the zil_header_t to complete - * claiming log blocks and during raw receives we need to write out the - * portable_mac from the send file. Both of these actions are possible - * because these fields are not protected by either MAC so neither action - * needs to modify the MACs without the key. However, when the modified blocks - * are written out they will be byteswapped into the host machine's native - * endianness which will modify fields protected by the MAC. As a result, MAC - * calculation for objset blocks works slightly differently from other block - * types. Where other block types MAC the data in whatever endianness is - * written to disk, objset blocks always MAC the little-endian version of their - * values. In the code, should_bswap is the value from BP_SHOULD_BYTESWAP() - * and le_bswap indicates whether a byteswap is needed to get this block - * into little endian format. - */ -int -zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen, - boolean_t should_bswap, uint8_t *portable_mac, uint8_t *local_mac) -{ - int ret; - crypto_mechanism_t mech; - crypto_context_t ctx; - crypto_data_t cd; - objset_phys_t *osp = data; - uint64_t intval; - boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER); - uint8_t raw_portable_mac[SHA512_DIGEST_LENGTH]; - uint8_t raw_local_mac[SHA512_DIGEST_LENGTH]; - - /* initialize HMAC mechanism */ - mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); - mech.cm_param = NULL; - mech.cm_param_len = 0; - - cd.cd_format = CRYPTO_DATA_RAW; - cd.cd_offset = 0; - - /* calculate the portable MAC from the portable fields and metadnode */ - ret = crypto_mac_init(&mech, &key->zk_hmac_key, NULL, &ctx, NULL); - if (ret != CRYPTO_SUCCESS) { - ret = SET_ERROR(EIO); - goto error; - } - - /* add in the os_type */ - intval = (le_bswap) ? osp->os_type : BSWAP_64(osp->os_type); - cd.cd_length = sizeof (uint64_t); - cd.cd_raw.iov_base = (char *)&intval; - cd.cd_raw.iov_len = cd.cd_length; - - ret = crypto_mac_update(ctx, &cd, NULL); - if (ret != CRYPTO_SUCCESS) { - ret = SET_ERROR(EIO); - goto error; - } - - /* add in the portable os_flags */ - intval = osp->os_flags; - if (should_bswap) - intval = BSWAP_64(intval); - intval &= OBJSET_CRYPT_PORTABLE_FLAGS_MASK; - if (!ZFS_HOST_BYTEORDER) - intval = BSWAP_64(intval); - - cd.cd_length = sizeof (uint64_t); - cd.cd_raw.iov_base = (char *)&intval; - cd.cd_raw.iov_len = cd.cd_length; - - ret = crypto_mac_update(ctx, &cd, NULL); - if (ret != CRYPTO_SUCCESS) { - ret = SET_ERROR(EIO); - goto error; - } - - /* add in fields from the metadnode */ - ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, - should_bswap, &osp->os_meta_dnode); - if (ret) - goto error; - - /* store the final digest in a temporary buffer and copy what we need */ - cd.cd_length = SHA512_DIGEST_LENGTH; - cd.cd_raw.iov_base = (char *)raw_portable_mac; - cd.cd_raw.iov_len = cd.cd_length; - - ret = crypto_mac_final(ctx, &cd, NULL); - if (ret != CRYPTO_SUCCESS) { - ret = SET_ERROR(EIO); - goto error; - } - - bcopy(raw_portable_mac, portable_mac, ZIO_OBJSET_MAC_LEN); - - /* - * The local MAC protects the user, group and project accounting. - * If these objects are not present, the local MAC is zeroed out.
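
Each of the two MACs computed by this routine is a SHA512-HMAC over a series of little-endian fields, truncated to ZIO_OBJSET_MAC_LEN (32) bytes. A userland sketch of that init/update/final/truncate shape using OpenSSL (objset_style_hmac and its field arguments are invented stand-ins):

#include <openssl/evp.h>
#include <openssl/hmac.h>
#include <stdint.h>
#include <string.h>

#define	MAC_LEN	32	/* like ZIO_OBJSET_MAC_LEN: half of a SHA512 digest */

static void
objset_style_hmac(const uint8_t *key, int keylen, uint64_t os_type_le,
    uint64_t masked_flags_le, uint8_t *mac_out)
{
	uint8_t digest[64];
	unsigned int dlen;
	HMAC_CTX *ctx = HMAC_CTX_new();

	HMAC_Init_ex(ctx, key, keylen, EVP_sha512(), NULL);
	/* fields are fed to the HMAC in little-endian form */
	HMAC_Update(ctx, (uint8_t *)&os_type_le, sizeof (os_type_le));
	HMAC_Update(ctx, (uint8_t *)&masked_flags_le,
	    sizeof (masked_flags_le));
	HMAC_Final(ctx, digest, &dlen);
	HMAC_CTX_free(ctx);

	/* keep only the truncated MAC, as the real routine does */
	memcpy(mac_out, digest, MAC_LEN);
}
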
- */ - if ((datalen >= OBJSET_PHYS_SIZE_V3 && - osp->os_userused_dnode.dn_type == DMU_OT_NONE && - osp->os_groupused_dnode.dn_type == DMU_OT_NONE && - osp->os_projectused_dnode.dn_type == DMU_OT_NONE) || - (datalen >= OBJSET_PHYS_SIZE_V2 && - osp->os_userused_dnode.dn_type == DMU_OT_NONE && - osp->os_groupused_dnode.dn_type == DMU_OT_NONE) || - (datalen <= OBJSET_PHYS_SIZE_V1)) { - bzero(local_mac, ZIO_OBJSET_MAC_LEN); - return (0); - } - - /* calculate the local MAC from the userused and groupused dnodes */ - ret = crypto_mac_init(&mech, &key->zk_hmac_key, NULL, &ctx, NULL); - if (ret != CRYPTO_SUCCESS) { - ret = SET_ERROR(EIO); - goto error; - } - - /* add in the non-portable os_flags */ - intval = osp->os_flags; - if (should_bswap) - intval = BSWAP_64(intval); - intval &= ~OBJSET_CRYPT_PORTABLE_FLAGS_MASK; - if (!ZFS_HOST_BYTEORDER) - intval = BSWAP_64(intval); - - cd.cd_length = sizeof (uint64_t); - cd.cd_raw.iov_base = (char *)&intval; - cd.cd_raw.iov_len = cd.cd_length; - - ret = crypto_mac_update(ctx, &cd, NULL); - if (ret != CRYPTO_SUCCESS) { - ret = SET_ERROR(EIO); - goto error; - } - - /* add in fields from the user accounting dnodes */ - if (osp->os_userused_dnode.dn_type != DMU_OT_NONE) { - ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, - should_bswap, &osp->os_userused_dnode); - if (ret) - goto error; - } - - if (osp->os_groupused_dnode.dn_type != DMU_OT_NONE) { - ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, - should_bswap, &osp->os_groupused_dnode); - if (ret) - goto error; - } - - if (osp->os_projectused_dnode.dn_type != DMU_OT_NONE && - datalen >= OBJSET_PHYS_SIZE_V3) { - ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, - should_bswap, &osp->os_projectused_dnode); - if (ret) - goto error; - } - - /* store the final digest in a temporary buffer and copy what we need */ - cd.cd_length = SHA512_DIGEST_LENGTH; - cd.cd_raw.iov_base = (char *)raw_local_mac; - cd.cd_raw.iov_len = cd.cd_length; - - ret = crypto_mac_final(ctx, &cd, NULL); - if (ret != CRYPTO_SUCCESS) { - ret = SET_ERROR(EIO); - goto error; - } - - bcopy(raw_local_mac, local_mac, ZIO_OBJSET_MAC_LEN); - - return (0); - -error: - bzero(portable_mac, ZIO_OBJSET_MAC_LEN); - bzero(local_mac, ZIO_OBJSET_MAC_LEN); - return (ret); -} - -static void -zio_crypt_destroy_uio(uio_t *uio) -{ - if (uio->uio_iov) - kmem_free(uio->uio_iov, uio->uio_iovcnt * sizeof (iovec_t)); -} - -/* - * This function parses an uncompressed indirect block and returns a checksum - * of all the portable fields from all of the contained bps. The portable - * fields are the MAC and all of the fields from blk_prop except for the dedup, - * checksum, and psize bits. For an explanation of the purpose of this, see - * the comment block on object set authentication. 
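
A minimal sketch of the checksum-of-MACs idea implemented below: hash one fixed-size auth record per contained block pointer and keep a truncated digest. OpenSSL's SHA512 stands in for the kernel SHA2 interface, and auth_rec_t is an invented stand-in for blkptr_auth_buf_t:

#include <openssl/sha.h>
#include <stdint.h>
#include <string.h>

/* invented stand-in for blkptr_auth_buf_t */
typedef struct {
	uint64_t prop_le;	/* portable blk_prop bits, little-endian */
	uint8_t mac[16];	/* MAC lifted from blk_cksum */
	uint64_t pad;
} auth_rec_t;

/* hash one auth record per bp, keep a truncated digest */
static void
mac_of_macs(const auth_rec_t *recs, int nrecs, uint8_t *out, size_t outlen)
{
	uint8_t digest[SHA512_DIGEST_LENGTH];
	SHA512_CTX ctx;
	int i;

	SHA512_Init(&ctx);
	for (i = 0; i < nrecs; i++)
		SHA512_Update(&ctx, &recs[i], sizeof (recs[i]));
	SHA512_Final(digest, &ctx);

	memcpy(out, digest, outlen);	/* e.g. ZIO_DATA_MAC_LEN bytes */
}
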
- */ -static int -zio_crypt_do_indirect_mac_checksum_impl(boolean_t generate, void *buf, - uint_t datalen, uint64_t version, boolean_t byteswap, uint8_t *cksum) -{ - blkptr_t *bp; - int i, epb = datalen >> SPA_BLKPTRSHIFT; - SHA2_CTX ctx; - uint8_t digestbuf[SHA512_DIGEST_LENGTH]; - - /* checksum all of the MACs from the layer below */ - SHA2Init(SHA512, &ctx); - for (i = 0, bp = buf; i < epb; i++, bp++) { - zio_crypt_bp_do_indrect_checksum_updates(&ctx, version, - byteswap, bp); - } - SHA2Final(digestbuf, &ctx); - - if (generate) { - bcopy(digestbuf, cksum, ZIO_DATA_MAC_LEN); - return (0); - } - - if (bcmp(digestbuf, cksum, ZIO_DATA_MAC_LEN) != 0) - return (SET_ERROR(ECKSUM)); - - return (0); -} - -int -zio_crypt_do_indirect_mac_checksum(boolean_t generate, void *buf, - uint_t datalen, boolean_t byteswap, uint8_t *cksum) -{ - int ret; - - /* - * Unfortunately, callers of this function will not always have - * easy access to the on-disk format version. This info is - * normally found in the DSL Crypto Key, but the checksum-of-MACs - * is expected to be verifiable even when the key isn't loaded. - * Here, instead of doing a ZAP lookup for the version for each - * zio, we simply try both existing formats. - */ - ret = zio_crypt_do_indirect_mac_checksum_impl(generate, buf, - datalen, ZIO_CRYPT_KEY_CURRENT_VERSION, byteswap, cksum); - if (ret == ECKSUM) { - ASSERT(!generate); - ret = zio_crypt_do_indirect_mac_checksum_impl(generate, - buf, datalen, 0, byteswap, cksum); - } - - return (ret); -} - -int -zio_crypt_do_indirect_mac_checksum_abd(boolean_t generate, abd_t *abd, - uint_t datalen, boolean_t byteswap, uint8_t *cksum) -{ - int ret; - void *buf; - - buf = abd_borrow_buf_copy(abd, datalen); - ret = zio_crypt_do_indirect_mac_checksum(generate, buf, datalen, - byteswap, cksum); - abd_return_buf(abd, buf, datalen); - - return (ret); -} - -/* - * Special case handling routine for encrypting / decrypting ZIL blocks. - * We do not check for the older ZIL chain because the encryption feature - * was not available before the newer ZIL chain was introduced. The goal - * here is to encrypt everything except the blkptr_t of a lr_write_t and - * the zil_chain_t header. Everything that is not encrypted is authenticated. - */ -static int -zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, - uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, uio_t *puio, - uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, - boolean_t *no_crypt) -{ - int ret; - uint64_t txtype, lr_len; - uint_t nr_src, nr_dst, crypt_len; - uint_t aad_len = 0, nr_iovecs = 0, total_len = 0; - iovec_t *src_iovecs = NULL, *dst_iovecs = NULL; - uint8_t *src, *dst, *slrp, *dlrp, *blkend, *aadp; - zil_chain_t *zilc; - lr_t *lr; - uint8_t *aadbuf = zio_buf_alloc(datalen); - - /* cipherbuf always needs an extra iovec for the MAC */ - if (encrypt) { - src = plainbuf; - dst = cipherbuf; - nr_src = 0; - nr_dst = 1; - } else { - src = cipherbuf; - dst = plainbuf; - nr_src = 1; - nr_dst = 0; - } - - /* find the start and end record of the log block */ - zilc = (zil_chain_t *)src; - slrp = src + sizeof (zil_chain_t); - aadp = aadbuf; - blkend = src + ((byteswap) ? 
BSWAP_64(zilc->zc_nused) : zilc->zc_nused); - - /* calculate the number of encrypted iovecs we will need */ - for (; slrp < blkend; slrp += lr_len) { - lr = (lr_t *)slrp; - - if (!byteswap) { - txtype = lr->lrc_txtype; - lr_len = lr->lrc_reclen; - } else { - txtype = BSWAP_64(lr->lrc_txtype); - lr_len = BSWAP_64(lr->lrc_reclen); - } - - nr_iovecs++; - if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t)) - nr_iovecs++; - } - - nr_src += nr_iovecs; - nr_dst += nr_iovecs; - - /* allocate the iovec arrays */ - if (nr_src != 0) { - src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP); - if (src_iovecs == NULL) { - ret = SET_ERROR(ENOMEM); - goto error; - } - } - - if (nr_dst != 0) { - dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP); - if (dst_iovecs == NULL) { - ret = SET_ERROR(ENOMEM); - goto error; - } - } - - /* - * Copy the plain zil header over and authenticate everything except - * the checksum that will store our MAC. If we are writing the data - * the embedded checksum will not have been calculated yet, so we don't - * authenticate that. - */ - bcopy(src, dst, sizeof (zil_chain_t)); - bcopy(src, aadp, sizeof (zil_chain_t) - sizeof (zio_eck_t)); - aadp += sizeof (zil_chain_t) - sizeof (zio_eck_t); - aad_len += sizeof (zil_chain_t) - sizeof (zio_eck_t); - - /* loop over records again, filling in iovecs */ - nr_iovecs = 0; - slrp = src + sizeof (zil_chain_t); - dlrp = dst + sizeof (zil_chain_t); - - for (; slrp < blkend; slrp += lr_len, dlrp += lr_len) { - lr = (lr_t *)slrp; - - if (!byteswap) { - txtype = lr->lrc_txtype; - lr_len = lr->lrc_reclen; - } else { - txtype = BSWAP_64(lr->lrc_txtype); - lr_len = BSWAP_64(lr->lrc_reclen); - } - - /* copy the common lr_t */ - bcopy(slrp, dlrp, sizeof (lr_t)); - bcopy(slrp, aadp, sizeof (lr_t)); - aadp += sizeof (lr_t); - aad_len += sizeof (lr_t); - - ASSERT3P(src_iovecs, !=, NULL); - ASSERT3P(dst_iovecs, !=, NULL); - - /* - * If this is a TX_WRITE record we want to encrypt everything - * except the bp, if it exists. If the bp does exist we want to - * authenticate it.
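
Both passes over the log block rely on the same walk: each record begins with a header whose reclen field gives the stride to the next record. A stripped-down sketch of that walk (simplified types, no byteswapping or bounds validation):

#include <stddef.h>
#include <stdint.h>

/* simplified record header, standing in for lr_t */
typedef struct {
	uint64_t lrc_txtype;
	uint64_t lrc_reclen;	/* total record length, header included */
} lr_hdr_t;

/* count records between the chain header and the used length */
static int
count_records(const uint8_t *blk, size_t hdrlen, size_t nused)
{
	const uint8_t *p = blk + hdrlen;
	const uint8_t *end = blk + nused;
	int n = 0;

	while (p < end) {
		p += ((const lr_hdr_t *)p)->lrc_reclen;
		n++;
	}
	return (n);
}
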
- */ - if (txtype == TX_WRITE) { - crypt_len = sizeof (lr_write_t) - - sizeof (lr_t) - sizeof (blkptr_t); - src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t); - src_iovecs[nr_iovecs].iov_len = crypt_len; - dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t); - dst_iovecs[nr_iovecs].iov_len = crypt_len; - - /* copy the bp now since it will not be encrypted */ - bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t), - dlrp + sizeof (lr_write_t) - sizeof (blkptr_t), - sizeof (blkptr_t)); - bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t), - aadp, sizeof (blkptr_t)); - aadp += sizeof (blkptr_t); - aad_len += sizeof (blkptr_t); - nr_iovecs++; - total_len += crypt_len; - - if (lr_len != sizeof (lr_write_t)) { - crypt_len = lr_len - sizeof (lr_write_t); - src_iovecs[nr_iovecs].iov_base = - slrp + sizeof (lr_write_t); - src_iovecs[nr_iovecs].iov_len = crypt_len; - dst_iovecs[nr_iovecs].iov_base = - dlrp + sizeof (lr_write_t); - dst_iovecs[nr_iovecs].iov_len = crypt_len; - nr_iovecs++; - total_len += crypt_len; - } - } else { - crypt_len = lr_len - sizeof (lr_t); - src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t); - src_iovecs[nr_iovecs].iov_len = crypt_len; - dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t); - dst_iovecs[nr_iovecs].iov_len = crypt_len; - nr_iovecs++; - total_len += crypt_len; - } - } - - *no_crypt = (nr_iovecs == 0); - *enc_len = total_len; - *authbuf = aadbuf; - *auth_len = aad_len; - - if (encrypt) { - puio->uio_iov = src_iovecs; - puio->uio_iovcnt = nr_src; - cuio->uio_iov = dst_iovecs; - cuio->uio_iovcnt = nr_dst; - } else { - puio->uio_iov = dst_iovecs; - puio->uio_iovcnt = nr_dst; - cuio->uio_iov = src_iovecs; - cuio->uio_iovcnt = nr_src; - } - - return (0); - -error: - zio_buf_free(aadbuf, datalen); - if (src_iovecs != NULL) - kmem_free(src_iovecs, nr_src * sizeof (iovec_t)); - if (dst_iovecs != NULL) - kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t)); - - *enc_len = 0; - *authbuf = NULL; - *auth_len = 0; - *no_crypt = B_FALSE; - puio->uio_iov = NULL; - puio->uio_iovcnt = 0; - cuio->uio_iov = NULL; - cuio->uio_iovcnt = 0; - return (ret); -} - -/* - * Special case handling routine for encrypting / decrypting dnode blocks. - */ -static int -zio_crypt_init_uios_dnode(boolean_t encrypt, uint64_t version, - uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, - uio_t *puio, uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, - uint_t *auth_len, boolean_t *no_crypt) -{ - int ret; - uint_t nr_src, nr_dst, crypt_len; - uint_t aad_len = 0, nr_iovecs = 0, total_len = 0; - uint_t i, j, max_dnp = datalen >> DNODE_SHIFT; - iovec_t *src_iovecs = NULL, *dst_iovecs = NULL; - uint8_t *src, *dst, *aadp; - dnode_phys_t *dnp, *adnp, *sdnp, *ddnp; - uint8_t *aadbuf = zio_buf_alloc(datalen); - - if (encrypt) { - src = plainbuf; - dst = cipherbuf; - nr_src = 0; - nr_dst = 1; - } else { - src = cipherbuf; - dst = plainbuf; - nr_src = 1; - nr_dst = 0; - } - - sdnp = (dnode_phys_t *)src; - ddnp = (dnode_phys_t *)dst; - aadp = aadbuf; - - /* - * Count the number of iovecs we will need to do the encryption by - * counting the number of bonus buffers that need to be encrypted. - */ - for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { - /* - * This block may still be byteswapped. However, all of the - * values we use are either uint8_t's (for which byteswapping - * is a noop) or a * != 0 check, which will work regardless - * of whether or not we byteswap. 
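
The dnode loops in this function step through the block by dn_extra_slots + 1 so that the tail slots of large dnodes are skipped. A toy version of that stride (dn_hdr_t is an invented, heavily simplified stand-in for dnode_phys_t):

#include <stdint.h>

/* invented, heavily simplified stand-in for dnode_phys_t */
typedef struct {
	uint8_t dn_type;
	uint8_t dn_extra_slots;	/* extra 512 byte slots this dnode uses */
	uint16_t dn_bonuslen;
} dn_hdr_t;

/* visit each dnode head, skipping the slots consumed by large dnodes */
static int
count_encryptable(const dn_hdr_t *dn, int max_dnp)
{
	int i, n = 0;

	for (i = 0; i < max_dnp; i += dn[i].dn_extra_slots + 1) {
		if (dn[i].dn_type != 0 && dn[i].dn_bonuslen != 0)
			n++;
	}
	return (n);
}
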
- */ - if (sdnp[i].dn_type != DMU_OT_NONE && - DMU_OT_IS_ENCRYPTED(sdnp[i].dn_bonustype) && - sdnp[i].dn_bonuslen != 0) { - nr_iovecs++; - } - } - - nr_src += nr_iovecs; - nr_dst += nr_iovecs; - - if (nr_src != 0) { - src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP); - if (src_iovecs == NULL) { - ret = SET_ERROR(ENOMEM); - goto error; - } - } - - if (nr_dst != 0) { - dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP); - if (dst_iovecs == NULL) { - ret = SET_ERROR(ENOMEM); - goto error; - } - } - - nr_iovecs = 0; - - /* - * Iterate through the dnodes again, this time filling in the uios - * we allocated earlier. We also concatenate any data we want to - * authenticate onto aadbuf. - */ - for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { - dnp = &sdnp[i]; - - /* copy over the core fields and blkptrs (kept as plaintext) */ - bcopy(dnp, &ddnp[i], (uint8_t *)DN_BONUS(dnp) - (uint8_t *)dnp); - - if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { - bcopy(DN_SPILL_BLKPTR(dnp), DN_SPILL_BLKPTR(&ddnp[i]), - sizeof (blkptr_t)); - } - - /* - * Handle authenticated data. We authenticate everything in - * the dnode that can be brought over when we do a raw send. - * This includes all of the core fields as well as the MACs - * stored in the bp checksums and all of the portable bits - * from blk_prop. We include the dnode padding here in case it - * ever gets used in the future. Some dn_flags and dn_used are - * not portable so we mask those values out of the - * authenticated data. - */ - crypt_len = offsetof(dnode_phys_t, dn_blkptr); - bcopy(dnp, aadp, crypt_len); - adnp = (dnode_phys_t *)aadp; - adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK; - adnp->dn_used = 0; - aadp += crypt_len; - aad_len += crypt_len; - - for (j = 0; j < dnp->dn_nblkptr; j++) { - zio_crypt_bp_do_aad_updates(&aadp, &aad_len, - version, byteswap, &dnp->dn_blkptr[j]); - } - - if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { - zio_crypt_bp_do_aad_updates(&aadp, &aad_len, - version, byteswap, DN_SPILL_BLKPTR(dnp)); - } - - /* - * If this bonus buffer needs to be encrypted, we prepare an - * iovec_t. The encryption / decryption functions will fill - * this in for us with the encrypted or decrypted data. - * Otherwise we add the bonus buffer to the authenticated - * data buffer and copy it over to the destination. The - * encrypted iovec extends to DN_MAX_BONUS_LEN(dnp) so that - * we can guarantee alignment with the AES block size - * (128 bits).
- */ - crypt_len = DN_MAX_BONUS_LEN(dnp); - if (dnp->dn_type != DMU_OT_NONE && - DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) && - dnp->dn_bonuslen != 0) { - ASSERT3U(nr_iovecs, <, nr_src); - ASSERT3U(nr_iovecs, <, nr_dst); - ASSERT3P(src_iovecs, !=, NULL); - ASSERT3P(dst_iovecs, !=, NULL); - src_iovecs[nr_iovecs].iov_base = DN_BONUS(dnp); - src_iovecs[nr_iovecs].iov_len = crypt_len; - dst_iovecs[nr_iovecs].iov_base = DN_BONUS(&ddnp[i]); - dst_iovecs[nr_iovecs].iov_len = crypt_len; - - nr_iovecs++; - total_len += crypt_len; - } else { - bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]), crypt_len); - bcopy(DN_BONUS(dnp), aadp, crypt_len); - aadp += crypt_len; - aad_len += crypt_len; - } - } - - *no_crypt = (nr_iovecs == 0); - *enc_len = total_len; - *authbuf = aadbuf; - *auth_len = aad_len; - - if (encrypt) { - puio->uio_iov = src_iovecs; - puio->uio_iovcnt = nr_src; - cuio->uio_iov = dst_iovecs; - cuio->uio_iovcnt = nr_dst; - } else { - puio->uio_iov = dst_iovecs; - puio->uio_iovcnt = nr_dst; - cuio->uio_iov = src_iovecs; - cuio->uio_iovcnt = nr_src; - } - - return (0); - -error: - zio_buf_free(aadbuf, datalen); - if (src_iovecs != NULL) - kmem_free(src_iovecs, nr_src * sizeof (iovec_t)); - if (dst_iovecs != NULL) - kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t)); - - *enc_len = 0; - *authbuf = NULL; - *auth_len = 0; - *no_crypt = B_FALSE; - puio->uio_iov = NULL; - puio->uio_iovcnt = 0; - cuio->uio_iov = NULL; - cuio->uio_iovcnt = 0; - return (ret); -} - -static int -zio_crypt_init_uios_normal(boolean_t encrypt, uint8_t *plainbuf, - uint8_t *cipherbuf, uint_t datalen, uio_t *puio, uio_t *cuio, - uint_t *enc_len) -{ - int ret; - uint_t nr_plain = 1, nr_cipher = 2; - iovec_t *plain_iovecs = NULL, *cipher_iovecs = NULL; - - /* allocate the iovecs for the plain and cipher data */ - plain_iovecs = kmem_alloc(nr_plain * sizeof (iovec_t), - KM_SLEEP); - if (!plain_iovecs) { - ret = SET_ERROR(ENOMEM); - goto error; - } - - cipher_iovecs = kmem_alloc(nr_cipher * sizeof (iovec_t), - KM_SLEEP); - if (!cipher_iovecs) { - ret = SET_ERROR(ENOMEM); - goto error; - } - - plain_iovecs[0].iov_base = plainbuf; - plain_iovecs[0].iov_len = datalen; - cipher_iovecs[0].iov_base = cipherbuf; - cipher_iovecs[0].iov_len = datalen; - - *enc_len = datalen; - puio->uio_iov = plain_iovecs; - puio->uio_iovcnt = nr_plain; - cuio->uio_iov = cipher_iovecs; - cuio->uio_iovcnt = nr_cipher; - - return (0); - -error: - if (plain_iovecs != NULL) - kmem_free(plain_iovecs, nr_plain * sizeof (iovec_t)); - if (cipher_iovecs != NULL) - kmem_free(cipher_iovecs, nr_cipher * sizeof (iovec_t)); - - *enc_len = 0; - puio->uio_iov = NULL; - puio->uio_iovcnt = 0; - cuio->uio_iov = NULL; - cuio->uio_iovcnt = 0; - return (ret); -} - -/* - * This function builds up the plaintext (puio) and ciphertext (cuio) uios so - * that they can be used for encryption and decryption by zio_do_crypt_uio(). - * Most blocks will use zio_crypt_init_uios_normal(), with ZIL and dnode blocks - * requiring special handling to parse out pieces that are to be encrypted. The - * authbuf is used by these special cases to store additional authenticated - * data (AAD) for the encryption modes. 
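
All three init_uios variants reduce to the same layout: iovec arrays describing the regions to encrypt or decrypt, with the final cipher iovec reserved for the MAC. A minimal sketch of the "normal" (whole-buffer) case:

#include <stddef.h>
#include <stdint.h>
#include <sys/uio.h>

#define	MAC_LEN	16	/* like ZIO_DATA_MAC_LEN */

/* plain side: one iovec; cipher side: the data plus a trailing MAC iovec */
static void
build_uios(uint8_t *plainbuf, uint8_t *cipherbuf, size_t datalen,
    uint8_t *mac, struct iovec plain_iov[1], struct iovec cipher_iov[2])
{
	plain_iov[0].iov_base = plainbuf;
	plain_iov[0].iov_len = datalen;

	cipher_iov[0].iov_base = cipherbuf;
	cipher_iov[0].iov_len = datalen;
	cipher_iov[1].iov_base = mac;	/* the tag is written/read here */
	cipher_iov[1].iov_len = MAC_LEN;
}
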
- */ -static int -zio_crypt_init_uios(boolean_t encrypt, uint64_t version, dmu_object_type_t ot, - uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, - uint8_t *mac, uio_t *puio, uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, - uint_t *auth_len, boolean_t *no_crypt) -{ - int ret; - iovec_t *mac_iov; - - ASSERT(DMU_OT_IS_ENCRYPTED(ot) || ot == DMU_OT_NONE); - - /* route to handler */ - switch (ot) { - case DMU_OT_INTENT_LOG: - ret = zio_crypt_init_uios_zil(encrypt, plainbuf, cipherbuf, - datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len, - no_crypt); - break; - case DMU_OT_DNODE: - ret = zio_crypt_init_uios_dnode(encrypt, version, plainbuf, - cipherbuf, datalen, byteswap, puio, cuio, enc_len, authbuf, - auth_len, no_crypt); - break; - default: - ret = zio_crypt_init_uios_normal(encrypt, plainbuf, cipherbuf, - datalen, puio, cuio, enc_len); - *authbuf = NULL; - *auth_len = 0; - *no_crypt = B_FALSE; - break; - } - - if (ret != 0) - goto error; - - /* populate the uios */ - puio->uio_segflg = UIO_SYSSPACE; - cuio->uio_segflg = UIO_SYSSPACE; - - mac_iov = ((iovec_t *)&cuio->uio_iov[cuio->uio_iovcnt - 1]); - mac_iov->iov_base = mac; - mac_iov->iov_len = ZIO_DATA_MAC_LEN; - - return (0); - -error: - return (ret); -} - -/* - * Primary encryption / decryption entrypoint for zio data. - */ -int -zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key, - dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv, - uint8_t *mac, uint_t datalen, uint8_t *plainbuf, uint8_t *cipherbuf, - boolean_t *no_crypt) -{ - int ret; - boolean_t locked = B_FALSE; - uint64_t crypt = key->zk_crypt; - uint_t keydata_len = zio_crypt_table[crypt].ci_keylen; - uint_t enc_len, auth_len; - uio_t puio, cuio; - uint8_t enc_keydata[MASTER_KEY_MAX_LEN]; - crypto_key_t tmp_ckey, *ckey = NULL; - crypto_ctx_template_t tmpl; - uint8_t *authbuf = NULL; - - /* - * If the needed key is the current one, just use it. Otherwise we - * need to generate a temporary one from the given salt + master key. - * If we are encrypting, we must return a copy of the current salt - * so that it can be stored in the blkptr_t. - */ - rw_enter(&key->zk_salt_lock, RW_READER); - locked = B_TRUE; - - if (bcmp(salt, key->zk_salt, ZIO_DATA_SALT_LEN) == 0) { - ckey = &key->zk_current_key; - tmpl = key->zk_current_tmpl; - } else { - rw_exit(&key->zk_salt_lock); - locked = B_FALSE; - - ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, - salt, ZIO_DATA_SALT_LEN, enc_keydata, keydata_len); - if (ret != 0) - goto error; - - tmp_ckey.ck_format = CRYPTO_KEY_RAW; - tmp_ckey.ck_data = enc_keydata; - tmp_ckey.ck_length = CRYPTO_BYTES2BITS(keydata_len); - - ckey = &tmp_ckey; - tmpl = NULL; - } - - /* - * Attempt to use QAT acceleration if we can. We currently don't - * do this for metadnode and ZIL blocks, since they have a much - * more involved buffer layout and the qat_crypt() function only - * works in-place. - */ - if (qat_crypt_use_accel(datalen) && - ot != DMU_OT_INTENT_LOG && ot != DMU_OT_DNODE) { - uint8_t *srcbuf, *dstbuf; - - if (encrypt) { - srcbuf = plainbuf; - dstbuf = cipherbuf; - } else { - srcbuf = cipherbuf; - dstbuf = plainbuf; - } - - ret = qat_crypt((encrypt) ? 
QAT_ENCRYPT : QAT_DECRYPT, srcbuf, - dstbuf, NULL, 0, iv, mac, ckey, key->zk_crypt, datalen); - if (ret == CPA_STATUS_SUCCESS) { - if (locked) { - rw_exit(&key->zk_salt_lock); - locked = B_FALSE; - } - - return (0); - } - /* If the hardware implementation fails fall back to software */ - } - - bzero(&puio, sizeof (uio_t)); - bzero(&cuio, sizeof (uio_t)); - - /* create uios for encryption */ - ret = zio_crypt_init_uios(encrypt, key->zk_version, ot, plainbuf, - cipherbuf, datalen, byteswap, mac, &puio, &cuio, &enc_len, - &authbuf, &auth_len, no_crypt); - if (ret != 0) - goto error; - - /* perform the encryption / decryption in software */ - ret = zio_do_crypt_uio(encrypt, key->zk_crypt, ckey, tmpl, iv, enc_len, - &puio, &cuio, authbuf, auth_len); - if (ret != 0) - goto error; - - if (locked) { - rw_exit(&key->zk_salt_lock); - locked = B_FALSE; - } - - if (authbuf != NULL) - zio_buf_free(authbuf, datalen); - if (ckey == &tmp_ckey) - bzero(enc_keydata, keydata_len); - zio_crypt_destroy_uio(&puio); - zio_crypt_destroy_uio(&cuio); - - return (0); - -error: - if (locked) - rw_exit(&key->zk_salt_lock); - if (authbuf != NULL) - zio_buf_free(authbuf, datalen); - if (ckey == &tmp_ckey) - bzero(enc_keydata, keydata_len); - zio_crypt_destroy_uio(&puio); - zio_crypt_destroy_uio(&cuio); - - return (ret); -} - -/* - * Simple wrapper around zio_do_crypt_data() to work with abd's instead of - * linear buffers. - */ -int -zio_do_crypt_abd(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot, - boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac, - uint_t datalen, abd_t *pabd, abd_t *cabd, boolean_t *no_crypt) -{ - int ret; - void *ptmp, *ctmp; - - if (encrypt) { - ptmp = abd_borrow_buf_copy(pabd, datalen); - ctmp = abd_borrow_buf(cabd, datalen); - } else { - ptmp = abd_borrow_buf(pabd, datalen); - ctmp = abd_borrow_buf_copy(cabd, datalen); - } - - ret = zio_do_crypt_data(encrypt, key, ot, byteswap, salt, iv, mac, - datalen, ptmp, ctmp, no_crypt); - if (ret != 0) - goto error; - - if (encrypt) { - abd_return_buf(pabd, ptmp, datalen); - abd_return_buf_copy(cabd, ctmp, datalen); - } else { - abd_return_buf_copy(pabd, ptmp, datalen); - abd_return_buf(cabd, ctmp, datalen); - } - - return (0); - -error: - if (encrypt) { - abd_return_buf(pabd, ptmp, datalen); - abd_return_buf_copy(cabd, ctmp, datalen); - } else { - abd_return_buf_copy(pabd, ptmp, datalen); - abd_return_buf(cabd, ctmp, datalen); - } - - return (ret); -} - -#if defined(_KERNEL) -/* BEGIN CSTYLED */ -module_param(zfs_key_max_salt_uses, ulong, 0644); -MODULE_PARM_DESC(zfs_key_max_salt_uses, "Max number of times a salt value " - "can be used for generating encryption keys before it is rotated"); -/* END CSTYLED */ -#endif diff --git a/module/zfs/zpl_ctldir.c b/module/zfs/zpl_ctldir.c deleted file mode 100644 index 6df367b81..000000000 --- a/module/zfs/zpl_ctldir.c +++ /dev/null @@ -1,572 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (C) 2011 Lawrence Livermore National Security, LLC. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * LLNL-CODE-403049. - * Rewritten for Linux by: - * Rohan Puri <[email protected]> - * Brian Behlendorf <[email protected]> - */ - -#include <sys/zfs_vfsops.h> -#include <sys/zfs_vnops.h> -#include <sys/zfs_znode.h> -#include <sys/zfs_ctldir.h> -#include <sys/zpl.h> - -/* - * Common open routine. Disallow any write access. - */ -/* ARGSUSED */ -static int -zpl_common_open(struct inode *ip, struct file *filp) -{ - if (filp->f_mode & FMODE_WRITE) - return (-EACCES); - - return (generic_file_open(ip, filp)); -} - -/* - * Get root directory contents. - */ -static int -zpl_root_iterate(struct file *filp, zpl_dir_context_t *ctx) -{ - zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp)); - int error = 0; - - ZFS_ENTER(zfsvfs); - - if (!zpl_dir_emit_dots(filp, ctx)) - goto out; - - if (ctx->pos == 2) { - if (!zpl_dir_emit(ctx, ZFS_SNAPDIR_NAME, - strlen(ZFS_SNAPDIR_NAME), ZFSCTL_INO_SNAPDIR, DT_DIR)) - goto out; - - ctx->pos++; - } - - if (ctx->pos == 3) { - if (!zpl_dir_emit(ctx, ZFS_SHAREDIR_NAME, - strlen(ZFS_SHAREDIR_NAME), ZFSCTL_INO_SHARES, DT_DIR)) - goto out; - - ctx->pos++; - } -out: - ZFS_EXIT(zfsvfs); - - return (error); -} - -#if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED) -static int -zpl_root_readdir(struct file *filp, void *dirent, filldir_t filldir) -{ - zpl_dir_context_t ctx = - ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos); - int error; - - error = zpl_root_iterate(filp, &ctx); - filp->f_pos = ctx.pos; - - return (error); -} -#endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */ - -/* - * Get root directory attributes. - */ -/* ARGSUSED */ -static int -zpl_root_getattr_impl(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int query_flags) -{ - struct inode *ip = path->dentry->d_inode; - - generic_fillattr(ip, stat); - stat->atime = current_time(ip); - - return (0); -} -ZPL_GETATTR_WRAPPER(zpl_root_getattr); - -static struct dentry * -#ifdef HAVE_LOOKUP_NAMEIDATA -zpl_root_lookup(struct inode *dip, struct dentry *dentry, struct nameidata *nd) -#else -zpl_root_lookup(struct inode *dip, struct dentry *dentry, unsigned int flags) -#endif -{ - cred_t *cr = CRED(); - struct inode *ip; - int error; - - crhold(cr); - error = -zfsctl_root_lookup(dip, dname(dentry), &ip, 0, cr, NULL, NULL); - ASSERT3S(error, <=, 0); - crfree(cr); - - if (error) { - if (error == -ENOENT) - return (d_splice_alias(NULL, dentry)); - else - return (ERR_PTR(error)); - } - - return (d_splice_alias(ip, dentry)); -} - -/* - * The '.zfs' control directory file and inode operations. 
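As zpl_root_iterate() above shows, the '.zfs' directory is synthetic: positions 0 and 1 are the usual dot entries, and positions 2 and 3 are always the snapshot and shares subdirectories, backed by reserved inode numbers rather than on-disk objects. A compact sketch of that fixed table (inode values are illustrative only; the real reservations come from zfs_ctldir.h):

#include <stdint.h>

typedef struct {
	const char *name;	/* ZFS_SNAPDIR_NAME / ZFS_SHAREDIR_NAME */
	uint64_t ino;		/* ZFSCTL_INO_SNAPDIR / ZFSCTL_INO_SHARES */
	uint64_t pos;		/* directory position after "." and ".." */
} ctldir_entry_t;

static const ctldir_entry_t ctldir_entries[] = {
	{ "snapshot",	1001,	2 },	/* inode values illustrative only */
	{ "shares",	1002,	3 },
};

Because the entries are fixed, lookup and readdir for the control directory itself never have to touch the object set.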
- */
-const struct file_operations zpl_fops_root = {
- .open = zpl_common_open,
- .llseek = generic_file_llseek,
- .read = generic_read_dir,
-#ifdef HAVE_VFS_ITERATE_SHARED
- .iterate_shared = zpl_root_iterate,
-#elif defined(HAVE_VFS_ITERATE)
- .iterate = zpl_root_iterate,
-#else
- .readdir = zpl_root_readdir,
-#endif
-};
-
-const struct inode_operations zpl_ops_root = {
- .lookup = zpl_root_lookup,
- .getattr = zpl_root_getattr,
-};
-
-#ifdef HAVE_AUTOMOUNT
-static struct vfsmount *
-zpl_snapdir_automount(struct path *path)
-{
- int error;
-
- error = -zfsctl_snapshot_mount(path, 0);
- if (error)
- return (ERR_PTR(error));
-
- /*
- * Rather than returning the new vfsmount for the snapshot we must
- * return NULL to indicate a mount collision. This is done because
- * the user space mount calls do_add_mount() which adds the vfsmount
- * to the name space. If we returned the new mount here it would be
- * added again to the vfsmount list resulting in list corruption.
- */
- return (NULL);
-}
-#endif /* HAVE_AUTOMOUNT */
-
-/*
- * Negative dentries must always be revalidated so newly created snapshots
- * can be detected and automounted. Normal dentries should be kept because
- * as of the 3.18 kernel revalidating the mountpoint dentry will result in
- * the snapshot being immediately unmounted.
- */
-static int
-#ifdef HAVE_D_REVALIDATE_NAMEIDATA
-zpl_snapdir_revalidate(struct dentry *dentry, struct nameidata *i)
-#else
-zpl_snapdir_revalidate(struct dentry *dentry, unsigned int flags)
-#endif
-{
- return (!!dentry->d_inode);
-}
-
-dentry_operations_t zpl_dops_snapdirs = {
-/*
- * Auto mounting of snapshots is only supported for 2.6.37 and
- * newer kernels. Prior to this kernel the ops->follow_link()
- * callback was used as a hack to trigger the mount. The
- * resulting vfsmount was then explicitly grafted into the
- * name space. While it might be possible to add compatibility
- * code to accomplish this it would require considerable care.
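The mount-collision trick in zpl_snapdir_automount() above is worth isolating, since the return value is the whole protocol. A kernel-style pseudo-C sketch of the d_automount contract as this code relies on it (trigger_snapshot_mount() is a hypothetical stand-in for zfsctl_snapshot_mount()):

static struct vfsmount *
automount_sketch(struct path *path)
{
	int error = -trigger_snapshot_mount(path, 0);	/* hypothetical */

	if (error)
		return (ERR_PTR(error));	/* propagate the failure */

	/*
	 * NULL means "already mounted elsewhere": the userspace
	 * mount(2) path added the vfsmount itself, so handing the
	 * VFS a second reference would corrupt the mount list.
	 */
	return (NULL);
}

Returning the new vfsmount is only correct for d_automount implementations that do not also go through do_add_mount().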
- */ -#ifdef HAVE_AUTOMOUNT - .d_automount = zpl_snapdir_automount, -#endif /* HAVE_AUTOMOUNT */ - .d_revalidate = zpl_snapdir_revalidate, -}; - -static struct dentry * -#ifdef HAVE_LOOKUP_NAMEIDATA -zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry, - struct nameidata *nd) -#else -zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry, - unsigned int flags) -#endif - -{ - fstrans_cookie_t cookie; - cred_t *cr = CRED(); - struct inode *ip = NULL; - int error; - - crhold(cr); - cookie = spl_fstrans_mark(); - error = -zfsctl_snapdir_lookup(dip, dname(dentry), &ip, - 0, cr, NULL, NULL); - ASSERT3S(error, <=, 0); - spl_fstrans_unmark(cookie); - crfree(cr); - - if (error && error != -ENOENT) - return (ERR_PTR(error)); - - ASSERT(error == 0 || ip == NULL); - d_clear_d_op(dentry); - d_set_d_op(dentry, &zpl_dops_snapdirs); -#ifdef HAVE_AUTOMOUNT - dentry->d_flags |= DCACHE_NEED_AUTOMOUNT; -#endif - - return (d_splice_alias(ip, dentry)); -} - -static int -zpl_snapdir_iterate(struct file *filp, zpl_dir_context_t *ctx) -{ - zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp)); - fstrans_cookie_t cookie; - char snapname[MAXNAMELEN]; - boolean_t case_conflict; - uint64_t id, pos; - int error = 0; - - ZFS_ENTER(zfsvfs); - cookie = spl_fstrans_mark(); - - if (!zpl_dir_emit_dots(filp, ctx)) - goto out; - - pos = ctx->pos; - while (error == 0) { - dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG); - error = -dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, - snapname, &id, &pos, &case_conflict); - dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG); - if (error) - goto out; - - if (!zpl_dir_emit(ctx, snapname, strlen(snapname), - ZFSCTL_INO_SHARES - id, DT_DIR)) - goto out; - - ctx->pos = pos; - } -out: - spl_fstrans_unmark(cookie); - ZFS_EXIT(zfsvfs); - - if (error == -ENOENT) - return (0); - - return (error); -} - -#if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED) -static int -zpl_snapdir_readdir(struct file *filp, void *dirent, filldir_t filldir) -{ - zpl_dir_context_t ctx = - ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos); - int error; - - error = zpl_snapdir_iterate(filp, &ctx); - filp->f_pos = ctx.pos; - - return (error); -} -#endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */ - -static int -zpl_snapdir_rename2(struct inode *sdip, struct dentry *sdentry, - struct inode *tdip, struct dentry *tdentry, unsigned int flags) -{ - cred_t *cr = CRED(); - int error; - - /* We probably don't want to support renameat2(2) in ctldir */ - if (flags) - return (-EINVAL); - - crhold(cr); - error = -zfsctl_snapdir_rename(sdip, dname(sdentry), - tdip, dname(tdentry), cr, 0); - ASSERT3S(error, <=, 0); - crfree(cr); - - return (error); -} - -#ifndef HAVE_RENAME_WANTS_FLAGS -static int -zpl_snapdir_rename(struct inode *sdip, struct dentry *sdentry, - struct inode *tdip, struct dentry *tdentry) -{ - return (zpl_snapdir_rename2(sdip, sdentry, tdip, tdentry, 0)); -} -#endif - -static int -zpl_snapdir_rmdir(struct inode *dip, struct dentry *dentry) -{ - cred_t *cr = CRED(); - int error; - - crhold(cr); - error = -zfsctl_snapdir_remove(dip, dname(dentry), cr, 0); - ASSERT3S(error, <=, 0); - crfree(cr); - - return (error); -} - -static int -zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, zpl_umode_t mode) -{ - cred_t *cr = CRED(); - vattr_t *vap; - struct inode *ip; - int error; - - crhold(cr); - vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); - zpl_vap_init(vap, dip, mode | S_IFDIR, cr); - - error = -zfsctl_snapdir_mkdir(dip, dname(dentry), vap, &ip, cr, 0); - if 
(error == 0) { - d_clear_d_op(dentry); - d_set_d_op(dentry, &zpl_dops_snapdirs); - d_instantiate(dentry, ip); - } - - kmem_free(vap, sizeof (vattr_t)); - ASSERT3S(error, <=, 0); - crfree(cr); - - return (error); -} - -/* - * Get snapshot directory attributes. - */ -/* ARGSUSED */ -static int -zpl_snapdir_getattr_impl(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int query_flags) -{ - struct inode *ip = path->dentry->d_inode; - zfsvfs_t *zfsvfs = ITOZSB(ip); - - ZFS_ENTER(zfsvfs); - generic_fillattr(ip, stat); - - stat->nlink = stat->size = 2; - stat->ctime = stat->mtime = dmu_objset_snap_cmtime(zfsvfs->z_os); - stat->atime = current_time(ip); - ZFS_EXIT(zfsvfs); - - return (0); -} -ZPL_GETATTR_WRAPPER(zpl_snapdir_getattr); - -/* - * The '.zfs/snapshot' directory file operations. These mainly control - * generating the list of available snapshots when doing an 'ls' in the - * directory. See zpl_snapdir_readdir(). - */ -const struct file_operations zpl_fops_snapdir = { - .open = zpl_common_open, - .llseek = generic_file_llseek, - .read = generic_read_dir, -#ifdef HAVE_VFS_ITERATE_SHARED - .iterate_shared = zpl_snapdir_iterate, -#elif defined(HAVE_VFS_ITERATE) - .iterate = zpl_snapdir_iterate, -#else - .readdir = zpl_snapdir_readdir, -#endif - -}; - -/* - * The '.zfs/snapshot' directory inode operations. These mainly control - * creating an inode for a snapshot directory and initializing the needed - * infrastructure to automount the snapshot. See zpl_snapdir_lookup(). - */ -const struct inode_operations zpl_ops_snapdir = { - .lookup = zpl_snapdir_lookup, - .getattr = zpl_snapdir_getattr, -#ifdef HAVE_RENAME_WANTS_FLAGS - .rename = zpl_snapdir_rename2, -#else - .rename = zpl_snapdir_rename, -#endif - .rmdir = zpl_snapdir_rmdir, - .mkdir = zpl_snapdir_mkdir, -}; - -static struct dentry * -#ifdef HAVE_LOOKUP_NAMEIDATA -zpl_shares_lookup(struct inode *dip, struct dentry *dentry, - struct nameidata *nd) -#else -zpl_shares_lookup(struct inode *dip, struct dentry *dentry, - unsigned int flags) -#endif -{ - fstrans_cookie_t cookie; - cred_t *cr = CRED(); - struct inode *ip = NULL; - int error; - - crhold(cr); - cookie = spl_fstrans_mark(); - error = -zfsctl_shares_lookup(dip, dname(dentry), &ip, - 0, cr, NULL, NULL); - ASSERT3S(error, <=, 0); - spl_fstrans_unmark(cookie); - crfree(cr); - - if (error) { - if (error == -ENOENT) - return (d_splice_alias(NULL, dentry)); - else - return (ERR_PTR(error)); - } - - return (d_splice_alias(ip, dentry)); -} - -static int -zpl_shares_iterate(struct file *filp, zpl_dir_context_t *ctx) -{ - fstrans_cookie_t cookie; - cred_t *cr = CRED(); - zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp)); - znode_t *dzp; - int error = 0; - - ZFS_ENTER(zfsvfs); - cookie = spl_fstrans_mark(); - - if (zfsvfs->z_shares_dir == 0) { - zpl_dir_emit_dots(filp, ctx); - goto out; - } - - error = -zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp); - if (error) - goto out; - - crhold(cr); - error = -zfs_readdir(ZTOI(dzp), ctx, cr); - crfree(cr); - - iput(ZTOI(dzp)); -out: - spl_fstrans_unmark(cookie); - ZFS_EXIT(zfsvfs); - ASSERT3S(error, <=, 0); - - return (error); -} - -#if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED) -static int -zpl_shares_readdir(struct file *filp, void *dirent, filldir_t filldir) -{ - zpl_dir_context_t ctx = - ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos); - int error; - - error = zpl_shares_iterate(filp, &ctx); - filp->f_pos = ctx.pos; - - return (error); -} -#endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */ - -/* 
ARGSUSED */ -static int -zpl_shares_getattr_impl(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int query_flags) -{ - struct inode *ip = path->dentry->d_inode; - zfsvfs_t *zfsvfs = ITOZSB(ip); - znode_t *dzp; - int error; - - ZFS_ENTER(zfsvfs); - - if (zfsvfs->z_shares_dir == 0) { - generic_fillattr(path->dentry->d_inode, stat); - stat->nlink = stat->size = 2; - stat->atime = current_time(ip); - ZFS_EXIT(zfsvfs); - return (0); - } - - error = -zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp); - if (error == 0) { - error = -zfs_getattr_fast(ZTOI(dzp), stat); - iput(ZTOI(dzp)); - } - - ZFS_EXIT(zfsvfs); - ASSERT3S(error, <=, 0); - - return (error); -} -ZPL_GETATTR_WRAPPER(zpl_shares_getattr); - -/* - * The '.zfs/shares' directory file operations. - */ -const struct file_operations zpl_fops_shares = { - .open = zpl_common_open, - .llseek = generic_file_llseek, - .read = generic_read_dir, -#ifdef HAVE_VFS_ITERATE_SHARED - .iterate_shared = zpl_shares_iterate, -#elif defined(HAVE_VFS_ITERATE) - .iterate = zpl_shares_iterate, -#else - .readdir = zpl_shares_readdir, -#endif - -}; - -/* - * The '.zfs/shares' directory inode operations. - */ -const struct inode_operations zpl_ops_shares = { - .lookup = zpl_shares_lookup, - .getattr = zpl_shares_getattr, -}; diff --git a/module/zfs/zpl_export.c b/module/zfs/zpl_export.c deleted file mode 100644 index a264d664c..000000000 --- a/module/zfs/zpl_export.c +++ /dev/null @@ -1,177 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2011 Gunnar Beutner - * Copyright (c) 2012 Cyril Plisko. All rights reserved. - */ - - -#include <sys/zfs_vnops.h> -#include <sys/zfs_znode.h> -#include <sys/zfs_ctldir.h> -#include <sys/zpl.h> - - -static int -#ifdef HAVE_ENCODE_FH_WITH_INODE -zpl_encode_fh(struct inode *ip, __u32 *fh, int *max_len, struct inode *parent) -{ -#else -zpl_encode_fh(struct dentry *dentry, __u32 *fh, int *max_len, int connectable) -{ - /* CSTYLED */ - struct inode *ip = dentry->d_inode; -#endif /* HAVE_ENCODE_FH_WITH_INODE */ - fstrans_cookie_t cookie; - fid_t *fid = (fid_t *)fh; - int len_bytes, rc; - - len_bytes = *max_len * sizeof (__u32); - - if (len_bytes < offsetof(fid_t, fid_data)) - return (255); - - fid->fid_len = len_bytes - offsetof(fid_t, fid_data); - cookie = spl_fstrans_mark(); - - if (zfsctl_is_node(ip)) - rc = zfsctl_fid(ip, fid); - else - rc = zfs_fid(ip, fid); - - spl_fstrans_unmark(cookie); - len_bytes = offsetof(fid_t, fid_data) + fid->fid_len; - *max_len = roundup(len_bytes, sizeof (__u32)) / sizeof (__u32); - - return (rc == 0 ? 
FILEID_INO32_GEN : 255);
-}
-
-static struct dentry *
-zpl_dentry_obtain_alias(struct inode *ip)
-{
- struct dentry *result;
-
-#ifdef HAVE_D_OBTAIN_ALIAS
- result = d_obtain_alias(ip);
-#else
- result = d_alloc_anon(ip);
-
- if (result == NULL) {
- iput(ip);
- result = ERR_PTR(-ENOMEM);
- }
-#endif /* HAVE_D_OBTAIN_ALIAS */
-
- return (result);
-}
-
-static struct dentry *
-zpl_fh_to_dentry(struct super_block *sb, struct fid *fh,
- int fh_len, int fh_type)
-{
- fid_t *fid = (fid_t *)fh;
- fstrans_cookie_t cookie;
- struct inode *ip;
- int len_bytes, rc;
-
- len_bytes = fh_len * sizeof (__u32);
-
- if (fh_type != FILEID_INO32_GEN ||
- len_bytes < offsetof(fid_t, fid_data) ||
- len_bytes < offsetof(fid_t, fid_data) + fid->fid_len)
- return (ERR_PTR(-EINVAL));
-
- cookie = spl_fstrans_mark();
- rc = zfs_vget(sb, &ip, fid);
- spl_fstrans_unmark(cookie);
-
- if (rc) {
- /*
- * If we see ENOENT it might mean that an NFSv4 client
- * is using a cached inode value in a file handle and
- * that the sought-after file has had its inode changed
- * by a third party. So change the error to ESTALE
- * which will trigger a full lookup by the client and
- * will find the new filename/inode pair if it still
- * exists.
- */
- if (rc == ENOENT)
- rc = ESTALE;
-
- return (ERR_PTR(-rc));
- }
-
- ASSERT((ip != NULL) && !IS_ERR(ip));
-
- return (zpl_dentry_obtain_alias(ip));
-}
-
-static struct dentry *
-zpl_get_parent(struct dentry *child)
-{
- cred_t *cr = CRED();
- fstrans_cookie_t cookie;
- struct inode *ip;
- int error;
-
- crhold(cr);
- cookie = spl_fstrans_mark();
- error = -zfs_lookup(child->d_inode, "..", &ip, 0, cr, NULL, NULL);
- spl_fstrans_unmark(cookie);
- crfree(cr);
- ASSERT3S(error, <=, 0);
-
- if (error)
- return (ERR_PTR(error));
-
- return (zpl_dentry_obtain_alias(ip));
-}
-
-#ifdef HAVE_COMMIT_METADATA
-static int
-zpl_commit_metadata(struct inode *inode)
-{
- cred_t *cr = CRED();
- fstrans_cookie_t cookie;
- int error;
-
- if (zfsctl_is_node(inode))
- return (0);
-
- crhold(cr);
- cookie = spl_fstrans_mark();
- error = -zfs_fsync(inode, 0, cr);
- spl_fstrans_unmark(cookie);
- crfree(cr);
- ASSERT3S(error, <=, 0);
-
- return (error);
-}
-#endif /* HAVE_COMMIT_METADATA */
-
-const struct export_operations zpl_export_operations = {
- .encode_fh = zpl_encode_fh,
- .fh_to_dentry = zpl_fh_to_dentry,
- .get_parent = zpl_get_parent,
-#ifdef HAVE_COMMIT_METADATA
- .commit_metadata = zpl_commit_metadata,
-#endif /* HAVE_COMMIT_METADATA */
-};
diff --git a/module/zfs/zpl_file.c b/module/zfs/zpl_file.c
deleted file mode 100644
index acad4670d..000000000
--- a/module/zfs/zpl_file.c
+++ /dev/null
@@ -1,1075 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2011, Lawrence Livermore National Security, LLC. - * Copyright (c) 2015 by Chunwei Chen. All rights reserved. - */ - - -#ifdef CONFIG_COMPAT -#include <linux/compat.h> -#endif -#include <sys/file.h> -#include <sys/dmu_objset.h> -#include <sys/zfs_vfsops.h> -#include <sys/zfs_vnops.h> -#include <sys/zfs_znode.h> -#include <sys/zfs_project.h> - - -static int -zpl_open(struct inode *ip, struct file *filp) -{ - cred_t *cr = CRED(); - int error; - fstrans_cookie_t cookie; - - error = generic_file_open(ip, filp); - if (error) - return (error); - - crhold(cr); - cookie = spl_fstrans_mark(); - error = -zfs_open(ip, filp->f_mode, filp->f_flags, cr); - spl_fstrans_unmark(cookie); - crfree(cr); - ASSERT3S(error, <=, 0); - - return (error); -} - -static int -zpl_release(struct inode *ip, struct file *filp) -{ - cred_t *cr = CRED(); - int error; - fstrans_cookie_t cookie; - - cookie = spl_fstrans_mark(); - if (ITOZ(ip)->z_atime_dirty) - zfs_mark_inode_dirty(ip); - - crhold(cr); - error = -zfs_close(ip, filp->f_flags, cr); - spl_fstrans_unmark(cookie); - crfree(cr); - ASSERT3S(error, <=, 0); - - return (error); -} - -static int -zpl_iterate(struct file *filp, zpl_dir_context_t *ctx) -{ - cred_t *cr = CRED(); - int error; - fstrans_cookie_t cookie; - - crhold(cr); - cookie = spl_fstrans_mark(); - error = -zfs_readdir(file_inode(filp), ctx, cr); - spl_fstrans_unmark(cookie); - crfree(cr); - ASSERT3S(error, <=, 0); - - return (error); -} - -#if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED) -static int -zpl_readdir(struct file *filp, void *dirent, filldir_t filldir) -{ - zpl_dir_context_t ctx = - ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos); - int error; - - error = zpl_iterate(filp, &ctx); - filp->f_pos = ctx.pos; - - return (error); -} -#endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */ - -#if defined(HAVE_FSYNC_WITH_DENTRY) -/* - * Linux 2.6.x - 2.6.34 API, - * Through 2.6.34 the nfsd kernel server would pass a NULL 'file struct *' - * to the fops->fsync() hook. For this reason, we must be careful not to - * use filp unconditionally. - */ -static int -zpl_fsync(struct file *filp, struct dentry *dentry, int datasync) -{ - cred_t *cr = CRED(); - int error; - fstrans_cookie_t cookie; - - crhold(cr); - cookie = spl_fstrans_mark(); - error = -zfs_fsync(dentry->d_inode, datasync, cr); - spl_fstrans_unmark(cookie); - crfree(cr); - ASSERT3S(error, <=, 0); - - return (error); -} - -#ifdef HAVE_FILE_AIO_FSYNC -static int -zpl_aio_fsync(struct kiocb *kiocb, int datasync) -{ - struct file *filp = kiocb->ki_filp; - return (zpl_fsync(filp, file_dentry(filp), datasync)); -} -#endif - -#elif defined(HAVE_FSYNC_WITHOUT_DENTRY) -/* - * Linux 2.6.35 - 3.0 API, - * As of 2.6.35 the dentry argument to the fops->fsync() hook was deemed - * redundant. The dentry is still accessible via filp->f_path.dentry, - * and we are guaranteed that filp will never be NULL. 
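The #ifdef branches here (one shown above, two following) all funnel into the same underlying sync; a condensed sketch of the whole ladder, assuming the HAVE_* macros from the build system and a hypothetical common body:

static int fsync_impl(struct inode *inode, int datasync);	/* common body */

#if defined(HAVE_FSYNC_WITH_DENTRY)		/* Linux <= 2.6.34 */
static int
fsync_sketch(struct file *filp, struct dentry *dentry, int datasync)
{
	/* filp may be NULL here (old nfsd), so go through the dentry */
	return (fsync_impl(dentry->d_inode, datasync));
}
#elif defined(HAVE_FSYNC_WITHOUT_DENTRY)	/* 2.6.35 - 3.0 */
static int
fsync_sketch(struct file *filp, int datasync)
{
	return (fsync_impl(filp->f_mapping->host, datasync));
}
#elif defined(HAVE_FSYNC_RANGE)			/* >= 3.1 */
static int
fsync_sketch(struct file *filp, loff_t start, loff_t end, int datasync)
{
	int error;

	/* the range flush moved down into the hook as of 3.1 */
	error = filemap_write_and_wait_range(filp->f_mapping, start, end);
	return (error ? error : fsync_impl(filp->f_mapping->host, datasync));
}
#endif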
- */ -static int -zpl_fsync(struct file *filp, int datasync) -{ - struct inode *inode = filp->f_mapping->host; - cred_t *cr = CRED(); - int error; - fstrans_cookie_t cookie; - - crhold(cr); - cookie = spl_fstrans_mark(); - error = -zfs_fsync(inode, datasync, cr); - spl_fstrans_unmark(cookie); - crfree(cr); - ASSERT3S(error, <=, 0); - - return (error); -} - -#ifdef HAVE_FILE_AIO_FSYNC -static int -zpl_aio_fsync(struct kiocb *kiocb, int datasync) -{ - return (zpl_fsync(kiocb->ki_filp, datasync)); -} -#endif - -#elif defined(HAVE_FSYNC_RANGE) -/* - * Linux 3.1 - 3.x API, - * As of 3.1 the responsibility to call filemap_write_and_wait_range() has - * been pushed down in to the .fsync() vfs hook. Additionally, the i_mutex - * lock is no longer held by the caller, for zfs we don't require the lock - * to be held so we don't acquire it. - */ -static int -zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync) -{ - struct inode *inode = filp->f_mapping->host; - cred_t *cr = CRED(); - int error; - fstrans_cookie_t cookie; - - error = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (error) - return (error); - - crhold(cr); - cookie = spl_fstrans_mark(); - error = -zfs_fsync(inode, datasync, cr); - spl_fstrans_unmark(cookie); - crfree(cr); - ASSERT3S(error, <=, 0); - - return (error); -} - -#ifdef HAVE_FILE_AIO_FSYNC -static int -zpl_aio_fsync(struct kiocb *kiocb, int datasync) -{ - return (zpl_fsync(kiocb->ki_filp, kiocb->ki_pos, -1, datasync)); -} -#endif - -#else -#error "Unsupported fops->fsync() implementation" -#endif - -static inline int -zfs_io_flags(struct kiocb *kiocb) -{ - int flags = 0; - -#if defined(IOCB_DSYNC) - if (kiocb->ki_flags & IOCB_DSYNC) - flags |= FDSYNC; -#endif -#if defined(IOCB_SYNC) - if (kiocb->ki_flags & IOCB_SYNC) - flags |= FSYNC; -#endif -#if defined(IOCB_APPEND) - if (kiocb->ki_flags & IOCB_APPEND) - flags |= FAPPEND; -#endif -#if defined(IOCB_DIRECT) - if (kiocb->ki_flags & IOCB_DIRECT) - flags |= FDIRECT; -#endif - return (flags); -} - -static ssize_t -zpl_read_common_iovec(struct inode *ip, const struct iovec *iovp, size_t count, - unsigned long nr_segs, loff_t *ppos, uio_seg_t segment, int flags, - cred_t *cr, size_t skip) -{ - ssize_t read; - uio_t uio = { { 0 }, 0 }; - int error; - fstrans_cookie_t cookie; - - uio.uio_iov = iovp; - uio.uio_iovcnt = nr_segs; - uio.uio_loffset = *ppos; - uio.uio_segflg = segment; - uio.uio_limit = MAXOFFSET_T; - uio.uio_resid = count; - uio.uio_skip = skip; - - cookie = spl_fstrans_mark(); - error = -zfs_read(ip, &uio, flags, cr); - spl_fstrans_unmark(cookie); - if (error < 0) - return (error); - - read = count - uio.uio_resid; - *ppos += read; - - return (read); -} - -inline ssize_t -zpl_read_common(struct inode *ip, const char *buf, size_t len, loff_t *ppos, - uio_seg_t segment, int flags, cred_t *cr) -{ - struct iovec iov; - - iov.iov_base = (void *)buf; - iov.iov_len = len; - - return (zpl_read_common_iovec(ip, &iov, len, 1, ppos, segment, - flags, cr, 0)); -} - -static ssize_t -zpl_iter_read_common(struct kiocb *kiocb, const struct iovec *iovp, - unsigned long nr_segs, size_t count, uio_seg_t seg, size_t skip) -{ - cred_t *cr = CRED(); - struct file *filp = kiocb->ki_filp; - struct inode *ip = filp->f_mapping->host; - zfsvfs_t *zfsvfs = ZTOZSB(ITOZ(ip)); - ssize_t read; - unsigned int f_flags = filp->f_flags; - - f_flags |= zfs_io_flags(kiocb); - crhold(cr); - read = zpl_read_common_iovec(filp->f_mapping->host, iovp, count, - nr_segs, &kiocb->ki_pos, seg, f_flags, cr, skip); - crfree(cr); - - /* 
- * If relatime is enabled, call file_accessed() only if
- * zfs_relatime_need_update() is true. This is needed since datasets
- * with an inherited "relatime" property aren't necessarily mounted with
- * the MNT_RELATIME flag (e.g. after `zfs set relatime=...`), which is what
- * the relatime test in the VFS via relatime_need_update() is based on.
- */
- if (!IS_NOATIME(ip) && zfsvfs->z_relatime) {
- if (zfs_relatime_need_update(ip))
- file_accessed(filp);
- } else {
- file_accessed(filp);
- }
-
- return (read);
-}
-
-#if defined(HAVE_VFS_RW_ITERATE)
-static ssize_t
-zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to)
-{
- ssize_t ret;
- uio_seg_t seg = UIO_USERSPACE;
- if (to->type & ITER_KVEC)
- seg = UIO_SYSSPACE;
- if (to->type & ITER_BVEC)
- seg = UIO_BVEC;
- ret = zpl_iter_read_common(kiocb, to->iov, to->nr_segs,
- iov_iter_count(to), seg, to->iov_offset);
- if (ret > 0)
- iov_iter_advance(to, ret);
- return (ret);
-}
-#else
-static ssize_t
-zpl_aio_read(struct kiocb *kiocb, const struct iovec *iovp,
- unsigned long nr_segs, loff_t pos)
-{
- ssize_t ret;
- size_t count;
-
- ret = generic_segment_checks(iovp, &nr_segs, &count, VERIFY_WRITE);
- if (ret)
- return (ret);
-
- return (zpl_iter_read_common(kiocb, iovp, nr_segs, count,
- UIO_USERSPACE, 0));
-}
-#endif /* HAVE_VFS_RW_ITERATE */
-
-static ssize_t
-zpl_write_common_iovec(struct inode *ip, const struct iovec *iovp, size_t count,
- unsigned long nr_segs, loff_t *ppos, uio_seg_t segment, int flags,
- cred_t *cr, size_t skip)
-{
- ssize_t wrote;
- uio_t uio = { { 0 }, 0 };
- int error;
- fstrans_cookie_t cookie;
-
- if (flags & O_APPEND)
- *ppos = i_size_read(ip);
-
- uio.uio_iov = iovp;
- uio.uio_iovcnt = nr_segs;
- uio.uio_loffset = *ppos;
- uio.uio_segflg = segment;
- uio.uio_limit = MAXOFFSET_T;
- uio.uio_resid = count;
- uio.uio_skip = skip;
-
- cookie = spl_fstrans_mark();
- error = -zfs_write(ip, &uio, flags, cr);
- spl_fstrans_unmark(cookie);
- if (error < 0)
- return (error);
-
- wrote = count - uio.uio_resid;
- *ppos += wrote;
-
- return (wrote);
-}
-
-inline ssize_t
-zpl_write_common(struct inode *ip, const char *buf, size_t len, loff_t *ppos,
- uio_seg_t segment, int flags, cred_t *cr)
-{
- struct iovec iov;
-
- iov.iov_base = (void *)buf;
- iov.iov_len = len;
-
- return (zpl_write_common_iovec(ip, &iov, len, 1, ppos, segment,
- flags, cr, 0));
-}
-
-static ssize_t
-zpl_iter_write_common(struct kiocb *kiocb, const struct iovec *iovp,
- unsigned long nr_segs, size_t count, uio_seg_t seg, size_t skip)
-{
- cred_t *cr = CRED();
- struct file *filp = kiocb->ki_filp;
- ssize_t wrote;
- unsigned int f_flags = filp->f_flags;
-
- f_flags |= zfs_io_flags(kiocb);
- crhold(cr);
- wrote = zpl_write_common_iovec(filp->f_mapping->host, iovp, count,
- nr_segs, &kiocb->ki_pos, seg, f_flags, cr, skip);
- crfree(cr);
-
- return (wrote);
-}
-
-#if defined(HAVE_VFS_RW_ITERATE)
-static ssize_t
-zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from)
-{
- size_t count;
- ssize_t ret;
- uio_seg_t seg = UIO_USERSPACE;
-
-#ifndef HAVE_GENERIC_WRITE_CHECKS_KIOCB
- struct file *file = kiocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *ip = mapping->host;
- int isblk = S_ISBLK(ip->i_mode);
-
- count = iov_iter_count(from);
- ret = generic_write_checks(file, &kiocb->ki_pos, &count, isblk);
- if (ret)
- return (ret);
-#else
- /*
- * XXX - ideally this check should be in the same lock region as the
- * write operations, so that there's no TOCTTOU race when doing an
- * append while someone else grows the file.
- */
- ret = generic_write_checks(kiocb, from);
- if (ret <= 0)
- return (ret);
- count = ret;
-#endif
-
- if (from->type & ITER_KVEC)
- seg = UIO_SYSSPACE;
- if (from->type & ITER_BVEC)
- seg = UIO_BVEC;
-
- ret = zpl_iter_write_common(kiocb, from->iov, from->nr_segs,
- count, seg, from->iov_offset);
- if (ret > 0)
- iov_iter_advance(from, ret);
-
- return (ret);
-}
-#else
-static ssize_t
-zpl_aio_write(struct kiocb *kiocb, const struct iovec *iovp,
- unsigned long nr_segs, loff_t pos)
-{
- struct file *file = kiocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *ip = mapping->host;
- int isblk = S_ISBLK(ip->i_mode);
- size_t count;
- ssize_t ret;
-
- ret = generic_segment_checks(iovp, &nr_segs, &count, VERIFY_READ);
- if (ret)
- return (ret);
-
- ret = generic_write_checks(file, &pos, &count, isblk);
- if (ret)
- return (ret);
-
- return (zpl_iter_write_common(kiocb, iovp, nr_segs, count,
- UIO_USERSPACE, 0));
-}
-#endif /* HAVE_VFS_RW_ITERATE */
-
-#if defined(HAVE_VFS_RW_ITERATE)
-static ssize_t
-zpl_direct_IO_impl(int rw, struct kiocb *kiocb, struct iov_iter *iter)
-{
- if (rw == WRITE)
- return (zpl_iter_write(kiocb, iter));
- else
- return (zpl_iter_read(kiocb, iter));
-}
-#if defined(HAVE_VFS_DIRECT_IO_ITER)
-static ssize_t
-zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter)
-{
- return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter));
-}
-#elif defined(HAVE_VFS_DIRECT_IO_ITER_OFFSET)
-static ssize_t
-zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
-{
- ASSERT3S(pos, ==, kiocb->ki_pos);
- return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter));
-}
-#elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET)
-static ssize_t
-zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
-{
- ASSERT3S(pos, ==, kiocb->ki_pos);
- return (zpl_direct_IO_impl(rw, kiocb, iter));
-}
-#else
-#error "Unknown direct IO interface"
-#endif
-
-#else
-
-#if defined(HAVE_VFS_DIRECT_IO_IOVEC)
-static ssize_t
-zpl_direct_IO(int rw, struct kiocb *kiocb, const struct iovec *iovp,
- loff_t pos, unsigned long nr_segs)
-{
- if (rw == WRITE)
- return (zpl_aio_write(kiocb, iovp, nr_segs, pos));
- else
- return (zpl_aio_read(kiocb, iovp, nr_segs, pos));
-}
-#else
-#error "Unknown direct IO interface"
-#endif
-
-#endif /* HAVE_VFS_RW_ITERATE */
-
-static loff_t
-zpl_llseek(struct file *filp, loff_t offset, int whence)
-{
-#if defined(SEEK_HOLE) && defined(SEEK_DATA)
- fstrans_cookie_t cookie;
-
- if (whence == SEEK_DATA || whence == SEEK_HOLE) {
- struct inode *ip = filp->f_mapping->host;
- loff_t maxbytes = ip->i_sb->s_maxbytes;
- loff_t error;
-
- spl_inode_lock_shared(ip);
- cookie = spl_fstrans_mark();
- error = -zfs_holey(ip, whence, &offset);
- spl_fstrans_unmark(cookie);
- if (error == 0)
- error = lseek_execute(filp, ip, offset, maxbytes);
- spl_inode_unlock_shared(ip);
-
- return (error);
- }
-#endif /* SEEK_HOLE && SEEK_DATA */
-
- return (generic_file_llseek(filp, offset, whence));
-}
-
-/*
- * It's worth taking a moment to describe how mmap is implemented
- * for zfs because it differs considerably from other Linux filesystems.
- * However, this issue is handled the same way under OpenSolaris.
- *
- * The issue is that by design zfs bypasses the Linux page cache and
- * leaves all caching up to the ARC. This has been shown to work
- * well for the common read(2)/write(2) case. However, mmap(2)
- * is a problem because it relies on being tightly integrated with the
- * page cache. To handle this we cache mmap'ed files twice, once in
- * the ARC and a second time in the page cache. The code is careful
- * to keep both copies synchronized.
- *
- * When a file with an mmap'ed region is written to using write(2)
- * both the data in the ARC and existing pages in the page cache
- * are updated. For a read(2) data will be read first from the page
- * cache then the ARC if needed. Neither a write(2) nor a read(2) will
- * ever result in new pages being added to the page cache.
- *
- * New pages are added to the page cache only via .readpage() which
- * is called when the vfs needs to read a page off disk to back the
- * virtual memory region. These pages may be modified without
- * notifying the ARC and will be written out periodically via
- * .writepage(). This will occur due to either a sync or the usual
- * page aging behavior. Note that because a read(2) of a mmap'ed file
- * will always check the page cache first, correct data will still be
- * returned even when the ARC is out of date.
- *
- * While this implementation ensures correct behavior it does have
- * some drawbacks. The most obvious of which is that it
- * increases the required memory footprint when accessing mmap'ed
- * files. It also adds additional complexity to the code keeping
- * both caches synchronized.
- *
- * Longer term it may be possible to cleanly resolve this wart by
- * mapping page cache pages directly onto the ARC buffers. The
- * Linux address space operations are flexible enough to allow
- * selection of which pages back a particular index. The trick
- * would be working out the details of which subsystem is in
- * charge, the ARC, the page cache, or both. It may also prove
- * helpful to move the ARC buffers to scatter-gather lists
- * rather than a vmalloc'ed region.
- */
-static int
-zpl_mmap(struct file *filp, struct vm_area_struct *vma)
-{
- struct inode *ip = filp->f_mapping->host;
- znode_t *zp = ITOZ(ip);
- int error;
- fstrans_cookie_t cookie;
-
- cookie = spl_fstrans_mark();
- error = -zfs_map(ip, vma->vm_pgoff, (caddr_t *)vma->vm_start,
- (size_t)(vma->vm_end - vma->vm_start), vma->vm_flags);
- spl_fstrans_unmark(cookie);
- if (error)
- return (error);
-
- error = generic_file_mmap(filp, vma);
- if (error)
- return (error);
-
- mutex_enter(&zp->z_lock);
- zp->z_is_mapped = B_TRUE;
- mutex_exit(&zp->z_lock);
-
- return (error);
-}
-
-/*
- * Populate a page with data for the Linux page cache. This function is
- * only used to support mmap(2). There will be an identical copy of the
- * data in the ARC which is kept up to date via .write() and .writepage().
- *
- * Currently this function relies on zpl_read_common() and the O_DIRECT
- * flag to read in a page. This works but the more correct way is to
- * update zfs_fillpage() to be Linux friendly and use that interface.
- */
-static int
-zpl_readpage(struct file *filp, struct page *pp)
-{
- struct inode *ip;
- struct page *pl[1];
- int error = 0;
- fstrans_cookie_t cookie;
-
- ASSERT(PageLocked(pp));
- ip = pp->mapping->host;
- pl[0] = pp;
-
- cookie = spl_fstrans_mark();
- error = -zfs_getpage(ip, pl, 1);
- spl_fstrans_unmark(cookie);
-
- if (error) {
- SetPageError(pp);
- ClearPageUptodate(pp);
- } else {
- ClearPageError(pp);
- SetPageUptodate(pp);
- flush_dcache_page(pp);
- }
-
- unlock_page(pp);
- return (error);
-}
-
-/*
- * Populate a set of pages with data for the Linux page cache. This
- * function will only be called for read ahead and never for demand
- * paging. For simplicity, the code relies on read_cache_pages() to
- * correctly lock each page for IO and call zpl_readpage().
- */
-static int
-zpl_readpages(struct file *filp, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
-{
- return (read_cache_pages(mapping, pages,
- (filler_t *)zpl_readpage, filp));
-}
-
-int
-zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
-{
- struct address_space *mapping = data;
- fstrans_cookie_t cookie;
-
- ASSERT(PageLocked(pp));
- ASSERT(!PageWriteback(pp));
-
- cookie = spl_fstrans_mark();
- (void) zfs_putpage(mapping->host, pp, wbc);
- spl_fstrans_unmark(cookie);
-
- return (0);
-}
-
-static int
-zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
-{
- znode_t *zp = ITOZ(mapping->host);
- zfsvfs_t *zfsvfs = ITOZSB(mapping->host);
- enum writeback_sync_modes sync_mode;
- int result;
-
- ZFS_ENTER(zfsvfs);
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- wbc->sync_mode = WB_SYNC_ALL;
- ZFS_EXIT(zfsvfs);
- sync_mode = wbc->sync_mode;
-
- /*
- * We don't want to run write_cache_pages() in SYNC mode here, because
- * that would make putpage() wait for a single page to be committed to
- * disk every single time, resulting in atrocious performance. Instead
- * we run it once in non-SYNC mode so that the ZIL gets all the data,
- * and then we commit it all in one go.
- */
- wbc->sync_mode = WB_SYNC_NONE;
- result = write_cache_pages(mapping, wbc, zpl_putpage, mapping);
- if (sync_mode != wbc->sync_mode) {
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
- if (zfsvfs->z_log != NULL)
- zil_commit(zfsvfs->z_log, zp->z_id);
- ZFS_EXIT(zfsvfs);
-
- /*
- * We need to call write_cache_pages() again (we can't just
- * return after the commit) because the previous call in
- * non-SYNC mode does not guarantee that we got all the dirty
- * pages (see the implementation of write_cache_pages() for
- * details). That being said, this is a no-op in most cases.
- */
- wbc->sync_mode = sync_mode;
- result = write_cache_pages(mapping, wbc, zpl_putpage, mapping);
- }
- return (result);
-}
-
-/*
- * Write out dirty pages to the ARC; this function is only required to
- * support mmap(2). Mapped pages may be dirtied by memory operations
- * which never call .write(). These dirty pages are kept in sync with
- * the ARC buffers via this hook.
- */
-static int
-zpl_writepage(struct page *pp, struct writeback_control *wbc)
-{
- if (ITOZSB(pp->mapping->host)->z_os->os_sync == ZFS_SYNC_ALWAYS)
- wbc->sync_mode = WB_SYNC_ALL;
-
- return (zpl_putpage(pp, wbc, pp->mapping));
-}
-
-/*
- * The only flag combination which matches the behavior of zfs_space()
- * is FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE. The FALLOC_FL_PUNCH_HOLE
- * flag was introduced in the 2.6.38 kernel.
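From userspace, the restriction described above means exactly one fallocate(2) mode combination works on these kernels; a small, Linux-only example caller (error handling omitted):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>

/*
 * Punch a hole without changing the file size. Any other mode
 * combination makes zpl_fallocate_common() return EOPNOTSUPP.
 */
static int
punch_hole(int fd, off_t offset, off_t len)
{
	return (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
	    offset, len));
}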
- */ -#if defined(HAVE_FILE_FALLOCATE) || defined(HAVE_INODE_FALLOCATE) -long -zpl_fallocate_common(struct inode *ip, int mode, loff_t offset, loff_t len) -{ - int error = -EOPNOTSUPP; - -#if defined(FALLOC_FL_PUNCH_HOLE) && defined(FALLOC_FL_KEEP_SIZE) - cred_t *cr = CRED(); - flock64_t bf; - loff_t olen; - fstrans_cookie_t cookie; - - if (mode != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) - return (error); - - if (offset < 0 || len <= 0) - return (-EINVAL); - - spl_inode_lock(ip); - olen = i_size_read(ip); - - if (offset > olen) { - spl_inode_unlock(ip); - return (0); - } - if (offset + len > olen) - len = olen - offset; - bf.l_type = F_WRLCK; - bf.l_whence = SEEK_SET; - bf.l_start = offset; - bf.l_len = len; - bf.l_pid = 0; - - crhold(cr); - cookie = spl_fstrans_mark(); - error = -zfs_space(ip, F_FREESP, &bf, FWRITE, offset, cr); - spl_fstrans_unmark(cookie); - spl_inode_unlock(ip); - - crfree(cr); -#endif /* defined(FALLOC_FL_PUNCH_HOLE) && defined(FALLOC_FL_KEEP_SIZE) */ - - ASSERT3S(error, <=, 0); - return (error); -} -#endif /* defined(HAVE_FILE_FALLOCATE) || defined(HAVE_INODE_FALLOCATE) */ - -#ifdef HAVE_FILE_FALLOCATE -static long -zpl_fallocate(struct file *filp, int mode, loff_t offset, loff_t len) -{ - return zpl_fallocate_common(file_inode(filp), - mode, offset, len); -} -#endif /* HAVE_FILE_FALLOCATE */ - -#define ZFS_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | ZFS_PROJINHERIT_FL) -#define ZFS_FL_USER_MODIFIABLE (FS_FL_USER_MODIFIABLE | ZFS_PROJINHERIT_FL) - -static uint32_t -__zpl_ioctl_getflags(struct inode *ip) -{ - uint64_t zfs_flags = ITOZ(ip)->z_pflags; - uint32_t ioctl_flags = 0; - - if (zfs_flags & ZFS_IMMUTABLE) - ioctl_flags |= FS_IMMUTABLE_FL; - - if (zfs_flags & ZFS_APPENDONLY) - ioctl_flags |= FS_APPEND_FL; - - if (zfs_flags & ZFS_NODUMP) - ioctl_flags |= FS_NODUMP_FL; - - if (zfs_flags & ZFS_PROJINHERIT) - ioctl_flags |= ZFS_PROJINHERIT_FL; - - return (ioctl_flags & ZFS_FL_USER_VISIBLE); -} - -/* - * Map zfs file z_pflags (xvattr_t) to linux file attributes. Only file - * attributes common to both Linux and Solaris are mapped. - */ -static int -zpl_ioctl_getflags(struct file *filp, void __user *arg) -{ - uint32_t flags; - int err; - - flags = __zpl_ioctl_getflags(file_inode(filp)); - err = copy_to_user(arg, &flags, sizeof (flags)); - - return (err); -} - -/* - * fchange() is a helper macro to detect if we have been asked to change a - * flag. This is ugly, but the requirement that we do this is a consequence of - * how the Linux file attribute interface was designed. Another consequence is - * that concurrent modification of files suffers from a TOCTOU race. Neither - * are things we can fix without modifying the kernel-userland interface, which - * is outside of our jurisdiction. 
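The GETFLAGS handler above and the SETFLAGS path that follows (built on the fchange() macro defined just below) implement the standard Linux file-flags ioctls, so they can be exercised with ordinary userspace code; a minimal example, with error handling omitted and "file-on-zfs" standing in for any file on a ZFS dataset:

#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int
main(void)
{
	unsigned int flags;
	int fd = open("file-on-zfs", O_RDONLY);

	ioctl(fd, FS_IOC_GETFLAGS, &flags);	/* -> zpl_ioctl_getflags() */
	printf("append-only: %s\n", (flags & FS_APPEND_FL) ? "yes" : "no");

	flags |= FS_APPEND_FL;			/* needs CAP_LINUX_IMMUTABLE */
	ioctl(fd, FS_IOC_SETFLAGS, &flags);	/* -> zpl_ioctl_setflags() */
	return (0);
}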
- */ - -#define fchange(f0, f1, b0, b1) (!((f0) & (b0)) != !((f1) & (b1))) - -static int -__zpl_ioctl_setflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva) -{ - uint64_t zfs_flags = ITOZ(ip)->z_pflags; - xoptattr_t *xoap; - - if (ioctl_flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | - ZFS_PROJINHERIT_FL)) - return (-EOPNOTSUPP); - - if (ioctl_flags & ~ZFS_FL_USER_MODIFIABLE) - return (-EACCES); - - if ((fchange(ioctl_flags, zfs_flags, FS_IMMUTABLE_FL, ZFS_IMMUTABLE) || - fchange(ioctl_flags, zfs_flags, FS_APPEND_FL, ZFS_APPENDONLY)) && - !capable(CAP_LINUX_IMMUTABLE)) - return (-EACCES); - - if (!zpl_inode_owner_or_capable(ip)) - return (-EACCES); - - xva_init(xva); - xoap = xva_getxoptattr(xva); - - XVA_SET_REQ(xva, XAT_IMMUTABLE); - if (ioctl_flags & FS_IMMUTABLE_FL) - xoap->xoa_immutable = B_TRUE; - - XVA_SET_REQ(xva, XAT_APPENDONLY); - if (ioctl_flags & FS_APPEND_FL) - xoap->xoa_appendonly = B_TRUE; - - XVA_SET_REQ(xva, XAT_NODUMP); - if (ioctl_flags & FS_NODUMP_FL) - xoap->xoa_nodump = B_TRUE; - - XVA_SET_REQ(xva, XAT_PROJINHERIT); - if (ioctl_flags & ZFS_PROJINHERIT_FL) - xoap->xoa_projinherit = B_TRUE; - - return (0); -} - -static int -zpl_ioctl_setflags(struct file *filp, void __user *arg) -{ - struct inode *ip = file_inode(filp); - uint32_t flags; - cred_t *cr = CRED(); - xvattr_t xva; - int err; - fstrans_cookie_t cookie; - - if (copy_from_user(&flags, arg, sizeof (flags))) - return (-EFAULT); - - err = __zpl_ioctl_setflags(ip, flags, &xva); - if (err) - return (err); - - crhold(cr); - cookie = spl_fstrans_mark(); - err = -zfs_setattr(ip, (vattr_t *)&xva, 0, cr); - spl_fstrans_unmark(cookie); - crfree(cr); - - return (err); -} - -static int -zpl_ioctl_getxattr(struct file *filp, void __user *arg) -{ - zfsxattr_t fsx = { 0 }; - struct inode *ip = file_inode(filp); - int err; - - fsx.fsx_xflags = __zpl_ioctl_getflags(ip); - fsx.fsx_projid = ITOZ(ip)->z_projid; - err = copy_to_user(arg, &fsx, sizeof (fsx)); - - return (err); -} - -static int -zpl_ioctl_setxattr(struct file *filp, void __user *arg) -{ - struct inode *ip = file_inode(filp); - zfsxattr_t fsx; - cred_t *cr = CRED(); - xvattr_t xva; - xoptattr_t *xoap; - int err; - fstrans_cookie_t cookie; - - if (copy_from_user(&fsx, arg, sizeof (fsx))) - return (-EFAULT); - - if (!zpl_is_valid_projid(fsx.fsx_projid)) - return (-EINVAL); - - err = __zpl_ioctl_setflags(ip, fsx.fsx_xflags, &xva); - if (err) - return (err); - - xoap = xva_getxoptattr(&xva); - XVA_SET_REQ(&xva, XAT_PROJID); - xoap->xoa_projid = fsx.fsx_projid; - - crhold(cr); - cookie = spl_fstrans_mark(); - err = -zfs_setattr(ip, (vattr_t *)&xva, 0, cr); - spl_fstrans_unmark(cookie); - crfree(cr); - - return (err); -} - -static long -zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) -{ - switch (cmd) { - case FS_IOC_GETFLAGS: - return (zpl_ioctl_getflags(filp, (void *)arg)); - case FS_IOC_SETFLAGS: - return (zpl_ioctl_setflags(filp, (void *)arg)); - case ZFS_IOC_FSGETXATTR: - return (zpl_ioctl_getxattr(filp, (void *)arg)); - case ZFS_IOC_FSSETXATTR: - return (zpl_ioctl_setxattr(filp, (void *)arg)); - default: - return (-ENOTTY); - } -} - -#ifdef CONFIG_COMPAT -static long -zpl_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) -{ - switch (cmd) { - case FS_IOC32_GETFLAGS: - cmd = FS_IOC_GETFLAGS; - break; - case FS_IOC32_SETFLAGS: - cmd = FS_IOC_SETFLAGS; - break; - default: - return (-ENOTTY); - } - return (zpl_ioctl(filp, cmd, (unsigned long)compat_ptr(arg))); -} -#endif /* CONFIG_COMPAT */ - - -const struct 
address_space_operations zpl_address_space_operations = { - .readpages = zpl_readpages, - .readpage = zpl_readpage, - .writepage = zpl_writepage, - .writepages = zpl_writepages, - .direct_IO = zpl_direct_IO, -}; - -const struct file_operations zpl_file_operations = { - .open = zpl_open, - .release = zpl_release, - .llseek = zpl_llseek, -#ifdef HAVE_VFS_RW_ITERATE -#ifdef HAVE_NEW_SYNC_READ - .read = new_sync_read, - .write = new_sync_write, -#endif - .read_iter = zpl_iter_read, - .write_iter = zpl_iter_write, -#else - .read = do_sync_read, - .write = do_sync_write, - .aio_read = zpl_aio_read, - .aio_write = zpl_aio_write, -#endif - .mmap = zpl_mmap, - .fsync = zpl_fsync, -#ifdef HAVE_FILE_AIO_FSYNC - .aio_fsync = zpl_aio_fsync, -#endif -#ifdef HAVE_FILE_FALLOCATE - .fallocate = zpl_fallocate, -#endif /* HAVE_FILE_FALLOCATE */ - .unlocked_ioctl = zpl_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = zpl_compat_ioctl, -#endif -}; - -const struct file_operations zpl_dir_file_operations = { - .llseek = generic_file_llseek, - .read = generic_read_dir, -#if defined(HAVE_VFS_ITERATE_SHARED) - .iterate_shared = zpl_iterate, -#elif defined(HAVE_VFS_ITERATE) - .iterate = zpl_iterate, -#else - .readdir = zpl_readdir, -#endif - .fsync = zpl_fsync, - .unlocked_ioctl = zpl_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = zpl_compat_ioctl, -#endif -}; diff --git a/module/zfs/zpl_inode.c b/module/zfs/zpl_inode.c deleted file mode 100644 index 3f3b2e2dc..000000000 --- a/module/zfs/zpl_inode.c +++ /dev/null @@ -1,826 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2011, Lawrence Livermore National Security, LLC. - * Copyright (c) 2015 by Chunwei Chen. All rights reserved. 
- */ - - -#include <sys/zfs_ctldir.h> -#include <sys/zfs_vfsops.h> -#include <sys/zfs_vnops.h> -#include <sys/zfs_znode.h> -#include <sys/dmu_objset.h> -#include <sys/vfs.h> -#include <sys/zpl.h> -#include <sys/file.h> - - -static struct dentry * -#ifdef HAVE_LOOKUP_NAMEIDATA -zpl_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) -#else -zpl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) -#endif -{ - cred_t *cr = CRED(); - struct inode *ip; - int error; - fstrans_cookie_t cookie; - pathname_t *ppn = NULL; - pathname_t pn; - int zfs_flags = 0; - zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info; - - if (dlen(dentry) >= ZAP_MAXNAMELEN) - return (ERR_PTR(-ENAMETOOLONG)); - - crhold(cr); - cookie = spl_fstrans_mark(); - - /* If we are a case insensitive fs, we need the real name */ - if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { - zfs_flags = FIGNORECASE; - pn_alloc(&pn); - ppn = &pn; - } - - error = -zfs_lookup(dir, dname(dentry), &ip, zfs_flags, cr, NULL, ppn); - spl_fstrans_unmark(cookie); - ASSERT3S(error, <=, 0); - crfree(cr); - - spin_lock(&dentry->d_lock); - dentry->d_time = jiffies; -#ifndef HAVE_S_D_OP - d_set_d_op(dentry, &zpl_dentry_operations); -#endif /* HAVE_S_D_OP */ - spin_unlock(&dentry->d_lock); - - if (error) { - /* - * If we have a case sensitive fs, we do not want to - * insert negative entries, so return NULL for ENOENT. - * Fall through if the error is not ENOENT. Also free memory. - */ - if (ppn) { - pn_free(ppn); - if (error == -ENOENT) - return (NULL); - } - - if (error == -ENOENT) - return (d_splice_alias(NULL, dentry)); - else - return (ERR_PTR(error)); - } - - /* - * If we are case insensitive, call the correct function - * to install the name. - */ - if (ppn) { - struct dentry *new_dentry; - struct qstr ci_name; - - if (strcmp(dname(dentry), pn.pn_buf) == 0) { - new_dentry = d_splice_alias(ip, dentry); - } else { - ci_name.name = pn.pn_buf; - ci_name.len = strlen(pn.pn_buf); - new_dentry = d_add_ci(dentry, ip, &ci_name); - } - pn_free(ppn); - return (new_dentry); - } else { - return (d_splice_alias(ip, dentry)); - } -} - -void -zpl_vap_init(vattr_t *vap, struct inode *dir, zpl_umode_t mode, cred_t *cr) -{ - vap->va_mask = ATTR_MODE; - vap->va_mode = mode; - vap->va_uid = crgetfsuid(cr); - - if (dir && dir->i_mode & S_ISGID) { - vap->va_gid = KGID_TO_SGID(dir->i_gid); - if (S_ISDIR(mode)) - vap->va_mode |= S_ISGID; - } else { - vap->va_gid = crgetfsgid(cr); - } -} - -static int -#ifdef HAVE_CREATE_NAMEIDATA -zpl_create(struct inode *dir, struct dentry *dentry, zpl_umode_t mode, - struct nameidata *nd) -#else -zpl_create(struct inode *dir, struct dentry *dentry, zpl_umode_t mode, - bool flag) -#endif -{ - cred_t *cr = CRED(); - struct inode *ip; - vattr_t *vap; - int error; - fstrans_cookie_t cookie; - - crhold(cr); - vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); - zpl_vap_init(vap, dir, mode, cr); - - cookie = spl_fstrans_mark(); - error = -zfs_create(dir, dname(dentry), vap, 0, mode, &ip, cr, 0, NULL); - if (error == 0) { - d_instantiate(dentry, ip); - - error = zpl_xattr_security_init(ip, dir, &dentry->d_name); - if (error == 0) - error = zpl_init_acl(ip, dir); - - if (error) - (void) zfs_remove(dir, dname(dentry), cr, 0); - } - - spl_fstrans_unmark(cookie); - kmem_free(vap, sizeof (vattr_t)); - crfree(cr); - ASSERT3S(error, <=, 0); - - return (error); -} - -static int -zpl_mknod(struct inode *dir, struct dentry *dentry, zpl_umode_t mode, - dev_t rdev) -{ - cred_t *cr = CRED(); - struct inode *ip; - vattr_t *vap; - int error; 
- fstrans_cookie_t cookie; - - /* - * We currently expect Linux to supply rdev=0 for all sockets - * and fifos, but we want to know if this behavior ever changes. - */ - if (S_ISSOCK(mode) || S_ISFIFO(mode)) - ASSERT(rdev == 0); - - crhold(cr); - vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); - zpl_vap_init(vap, dir, mode, cr); - vap->va_rdev = rdev; - - cookie = spl_fstrans_mark(); - error = -zfs_create(dir, dname(dentry), vap, 0, mode, &ip, cr, 0, NULL); - if (error == 0) { - d_instantiate(dentry, ip); - - error = zpl_xattr_security_init(ip, dir, &dentry->d_name); - if (error == 0) - error = zpl_init_acl(ip, dir); - - if (error) - (void) zfs_remove(dir, dname(dentry), cr, 0); - } - - spl_fstrans_unmark(cookie); - kmem_free(vap, sizeof (vattr_t)); - crfree(cr); - ASSERT3S(error, <=, 0); - - return (error); -} - -#ifdef HAVE_TMPFILE -static int -zpl_tmpfile(struct inode *dir, struct dentry *dentry, zpl_umode_t mode) -{ - cred_t *cr = CRED(); - struct inode *ip; - vattr_t *vap; - int error; - fstrans_cookie_t cookie; - - crhold(cr); - vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); - zpl_vap_init(vap, dir, mode, cr); - - cookie = spl_fstrans_mark(); - error = -zfs_tmpfile(dir, vap, 0, mode, &ip, cr, 0, NULL); - if (error == 0) { - /* d_tmpfile will do drop_nlink, so we should set it first */ - set_nlink(ip, 1); - d_tmpfile(dentry, ip); - - error = zpl_xattr_security_init(ip, dir, &dentry->d_name); - if (error == 0) - error = zpl_init_acl(ip, dir); - /* - * don't need to handle error here, file is already in - * unlinked set. - */ - } - - spl_fstrans_unmark(cookie); - kmem_free(vap, sizeof (vattr_t)); - crfree(cr); - ASSERT3S(error, <=, 0); - - return (error); -} -#endif - -static int -zpl_unlink(struct inode *dir, struct dentry *dentry) -{ - cred_t *cr = CRED(); - int error; - fstrans_cookie_t cookie; - zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info; - - crhold(cr); - cookie = spl_fstrans_mark(); - error = -zfs_remove(dir, dname(dentry), cr, 0); - - /* - * For a CI FS we must invalidate the dentry to prevent the - * creation of negative entries. - */ - if (error == 0 && zfsvfs->z_case == ZFS_CASE_INSENSITIVE) - d_invalidate(dentry); - - spl_fstrans_unmark(cookie); - crfree(cr); - ASSERT3S(error, <=, 0); - - return (error); -} - -static int -zpl_mkdir(struct inode *dir, struct dentry *dentry, zpl_umode_t mode) -{ - cred_t *cr = CRED(); - vattr_t *vap; - struct inode *ip; - int error; - fstrans_cookie_t cookie; - - crhold(cr); - vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); - zpl_vap_init(vap, dir, mode | S_IFDIR, cr); - - cookie = spl_fstrans_mark(); - error = -zfs_mkdir(dir, dname(dentry), vap, &ip, cr, 0, NULL); - if (error == 0) { - d_instantiate(dentry, ip); - - error = zpl_xattr_security_init(ip, dir, &dentry->d_name); - if (error == 0) - error = zpl_init_acl(ip, dir); - - if (error) - (void) zfs_rmdir(dir, dname(dentry), NULL, cr, 0); - } - - spl_fstrans_unmark(cookie); - kmem_free(vap, sizeof (vattr_t)); - crfree(cr); - ASSERT3S(error, <=, 0); - - return (error); -} - -static int -zpl_rmdir(struct inode *dir, struct dentry *dentry) -{ - cred_t *cr = CRED(); - int error; - fstrans_cookie_t cookie; - zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info; - - crhold(cr); - cookie = spl_fstrans_mark(); - error = -zfs_rmdir(dir, dname(dentry), NULL, cr, 0); - - /* - * For a CI FS we must invalidate the dentry to prevent the - * creation of negative entries. 
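 * (A negative dentry cached under one case spelling could otherwise mask a later lookup of the same name under a different spelling, so it must not be left behind on a case insensitive filesystem.)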
- */ - if (error == 0 && zfsvfs->z_case == ZFS_CASE_INSENSITIVE) - d_invalidate(dentry); - - spl_fstrans_unmark(cookie); - crfree(cr); - ASSERT3S(error, <=, 0); - - return (error); -} - -static int -zpl_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask, - unsigned int query_flags) -{ - int error; - fstrans_cookie_t cookie; - - cookie = spl_fstrans_mark(); - - /* - * XXX request_mask and query_flags currently ignored. - */ - - error = -zfs_getattr_fast(path->dentry->d_inode, stat); - spl_fstrans_unmark(cookie); - ASSERT3S(error, <=, 0); - - return (error); -} -ZPL_GETATTR_WRAPPER(zpl_getattr); - -static int -zpl_setattr(struct dentry *dentry, struct iattr *ia) -{ - struct inode *ip = dentry->d_inode; - cred_t *cr = CRED(); - vattr_t *vap; - int error; - fstrans_cookie_t cookie; - - error = setattr_prepare(dentry, ia); - if (error) - return (error); - - crhold(cr); - vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); - vap->va_mask = ia->ia_valid & ATTR_IATTR_MASK; - vap->va_mode = ia->ia_mode; - vap->va_uid = KUID_TO_SUID(ia->ia_uid); - vap->va_gid = KGID_TO_SGID(ia->ia_gid); - vap->va_size = ia->ia_size; - vap->va_atime = ia->ia_atime; - vap->va_mtime = ia->ia_mtime; - vap->va_ctime = ia->ia_ctime; - - if (vap->va_mask & ATTR_ATIME) { - ip->i_atime = zpl_inode_timespec_trunc(ia->ia_atime, - ip->i_sb->s_time_gran); - } - - cookie = spl_fstrans_mark(); - error = -zfs_setattr(ip, vap, 0, cr); - if (!error && (ia->ia_valid & ATTR_MODE)) - error = zpl_chmod_acl(ip); - - spl_fstrans_unmark(cookie); - kmem_free(vap, sizeof (vattr_t)); - crfree(cr); - ASSERT3S(error, <=, 0); - - return (error); -} - -static int -zpl_rename2(struct inode *sdip, struct dentry *sdentry, - struct inode *tdip, struct dentry *tdentry, unsigned int flags) -{ - cred_t *cr = CRED(); - int error; - fstrans_cookie_t cookie; - - /* We don't have renameat2(2) support */ - if (flags) - return (-EINVAL); - - crhold(cr); - cookie = spl_fstrans_mark(); - error = -zfs_rename(sdip, dname(sdentry), tdip, dname(tdentry), cr, 0); - spl_fstrans_unmark(cookie); - crfree(cr); - ASSERT3S(error, <=, 0); - - return (error); -} - -#ifndef HAVE_RENAME_WANTS_FLAGS -static int -zpl_rename(struct inode *sdip, struct dentry *sdentry, - struct inode *tdip, struct dentry *tdentry) -{ - return (zpl_rename2(sdip, sdentry, tdip, tdentry, 0)); -} -#endif - -static int -zpl_symlink(struct inode *dir, struct dentry *dentry, const char *name) -{ - cred_t *cr = CRED(); - vattr_t *vap; - struct inode *ip; - int error; - fstrans_cookie_t cookie; - - crhold(cr); - vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); - zpl_vap_init(vap, dir, S_IFLNK | S_IRWXUGO, cr); - - cookie = spl_fstrans_mark(); - error = -zfs_symlink(dir, dname(dentry), vap, (char *)name, &ip, cr, 0); - if (error == 0) { - d_instantiate(dentry, ip); - - error = zpl_xattr_security_init(ip, dir, &dentry->d_name); - if (error) - (void) zfs_remove(dir, dname(dentry), cr, 0); - } - - spl_fstrans_unmark(cookie); - kmem_free(vap, sizeof (vattr_t)); - crfree(cr); - ASSERT3S(error, <=, 0); - - return (error); -} - -#if defined(HAVE_PUT_LINK_COOKIE) -static void -zpl_put_link(struct inode *unused, void *cookie) -{ - kmem_free(cookie, MAXPATHLEN); -} -#elif defined(HAVE_PUT_LINK_NAMEIDATA) -static void -zpl_put_link(struct dentry *dentry, struct nameidata *nd, void *ptr) -{ - const char *link = nd_get_link(nd); - - if (!IS_ERR(link)) - kmem_free(link, MAXPATHLEN); -} -#elif defined(HAVE_PUT_LINK_DELAYED) -static void -zpl_put_link(void *ptr) -{ - kmem_free(ptr, MAXPATHLEN); -} -#endif 
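The three zpl_put_link() variants above exist because the kernel's symlink API changed shape several times; in every case the job is the same: free the MAXPATHLEN buffer handed out by the matching get_link/follow_link hook. A minimal sketch of the modern delayed_call pairing, for orientation only (the example_ names are hypothetical and not part of this change; plain kfree/kzalloc stand in for the SPL allocators used here):

#include <linux/delayed_call.h>
#include <linux/err.h>
#include <linux/limits.h>
#include <linux/slab.h>

/* Invoked later by the VFS via do_delayed_call() when the walk ends. */
static void
example_put_link(void *ptr)
{
	kfree(ptr);
}

static const char *
example_get_link(struct delayed_call *done)
{
	char *link = kzalloc(PATH_MAX, GFP_KERNEL);

	if (link == NULL)
		return (ERR_PTR(-ENOMEM));

	/* Register the matching free before handing out the buffer. */
	set_delayed_call(done, example_put_link, link);
	return (link);
}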
- -static int -zpl_get_link_common(struct dentry *dentry, struct inode *ip, char **link) -{ - fstrans_cookie_t cookie; - cred_t *cr = CRED(); - struct iovec iov; - uio_t uio = { { 0 }, 0 }; - int error; - - crhold(cr); - *link = NULL; - iov.iov_len = MAXPATHLEN; - iov.iov_base = kmem_zalloc(MAXPATHLEN, KM_SLEEP); - - uio.uio_iov = &iov; - uio.uio_iovcnt = 1; - uio.uio_segflg = UIO_SYSSPACE; - uio.uio_resid = (MAXPATHLEN - 1); - - cookie = spl_fstrans_mark(); - error = -zfs_readlink(ip, &uio, cr); - spl_fstrans_unmark(cookie); - crfree(cr); - - if (error) - kmem_free(iov.iov_base, MAXPATHLEN); - else - *link = iov.iov_base; - - return (error); -} - -#if defined(HAVE_GET_LINK_DELAYED) -const char * -zpl_get_link(struct dentry *dentry, struct inode *inode, - struct delayed_call *done) -{ - char *link = NULL; - int error; - - if (!dentry) - return (ERR_PTR(-ECHILD)); - - error = zpl_get_link_common(dentry, inode, &link); - if (error) - return (ERR_PTR(error)); - - set_delayed_call(done, zpl_put_link, link); - - return (link); -} -#elif defined(HAVE_GET_LINK_COOKIE) -const char * -zpl_get_link(struct dentry *dentry, struct inode *inode, void **cookie) -{ - char *link = NULL; - int error; - - if (!dentry) - return (ERR_PTR(-ECHILD)); - - error = zpl_get_link_common(dentry, inode, &link); - if (error) - return (ERR_PTR(error)); - - return (*cookie = link); -} -#elif defined(HAVE_FOLLOW_LINK_COOKIE) -const char * -zpl_follow_link(struct dentry *dentry, void **cookie) -{ - char *link = NULL; - int error; - - error = zpl_get_link_common(dentry, dentry->d_inode, &link); - if (error) - return (ERR_PTR(error)); - - return (*cookie = link); -} -#elif defined(HAVE_FOLLOW_LINK_NAMEIDATA) -static void * -zpl_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - char *link = NULL; - int error; - - error = zpl_get_link_common(dentry, dentry->d_inode, &link); - if (error) - nd_set_link(nd, ERR_PTR(error)); - else - nd_set_link(nd, link); - - return (NULL); -} -#endif - -static int -zpl_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) -{ - cred_t *cr = CRED(); - struct inode *ip = old_dentry->d_inode; - int error; - fstrans_cookie_t cookie; - - if (ip->i_nlink >= ZFS_LINK_MAX) - return (-EMLINK); - - crhold(cr); - ip->i_ctime = current_time(ip); - igrab(ip); /* Use ihold() if available */ - - cookie = spl_fstrans_mark(); - error = -zfs_link(dir, ip, dname(dentry), cr, 0); - if (error) { - iput(ip); - goto out; - } - - d_instantiate(dentry, ip); -out: - spl_fstrans_unmark(cookie); - crfree(cr); - ASSERT3S(error, <=, 0); - - return (error); -} - -#ifdef HAVE_INODE_TRUNCATE_RANGE -static void -zpl_truncate_range(struct inode *ip, loff_t start, loff_t end) -{ - cred_t *cr = CRED(); - flock64_t bf; - fstrans_cookie_t cookie; - - ASSERT3S(start, <=, end); - - /* - * zfs_freesp() will interpret (len == 0) as meaning "truncate until - * the end of the file". We don't want that. 
- */ - if (start == end) - return; - - crhold(cr); - - bf.l_type = F_WRLCK; - bf.l_whence = SEEK_SET; - bf.l_start = start; - bf.l_len = end - start; - bf.l_pid = 0; - cookie = spl_fstrans_mark(); - zfs_space(ip, F_FREESP, &bf, FWRITE, start, cr); - spl_fstrans_unmark(cookie); - - crfree(cr); -} -#endif /* HAVE_INODE_TRUNCATE_RANGE */ - -#ifdef HAVE_INODE_FALLOCATE -static long -zpl_fallocate(struct inode *ip, int mode, loff_t offset, loff_t len) -{ - return (zpl_fallocate_common(ip, mode, offset, len)); -} -#endif /* HAVE_INODE_FALLOCATE */ - -static int -#ifdef HAVE_D_REVALIDATE_NAMEIDATA -zpl_revalidate(struct dentry *dentry, struct nameidata *nd) -{ - unsigned int flags = (nd ? nd->flags : 0); -#else -zpl_revalidate(struct dentry *dentry, unsigned int flags) -{ -#endif /* HAVE_D_REVALIDATE_NAMEIDATA */ - /* CSTYLED */ - zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info; - int error; - - if (flags & LOOKUP_RCU) - return (-ECHILD); - - /* - * Automounted snapshots rely on periodic dentry revalidation - * to defer snapshots from being automatically unmounted. - */ - if (zfsvfs->z_issnap) { - if (time_after(jiffies, zfsvfs->z_snap_defer_time + - MAX(zfs_expire_snapshot * HZ / 2, HZ))) { - zfsvfs->z_snap_defer_time = jiffies; - zfsctl_snapshot_unmount_delay(zfsvfs->z_os->os_spa, - dmu_objset_id(zfsvfs->z_os), zfs_expire_snapshot); - } - } - - /* - * After a rollback negative dentries created before the rollback - * time must be invalidated. Otherwise they can obscure files which - * are only present in the rolled back dataset. - */ - if (dentry->d_inode == NULL) { - spin_lock(&dentry->d_lock); - error = time_before(dentry->d_time, zfsvfs->z_rollback_time); - spin_unlock(&dentry->d_lock); - - if (error) - return (0); - } - - /* - * The dentry may reference a stale inode if a mounted file system - * was rolled back to a point in time where the object didn't exist. 
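 * (Returning 0 from ->d_revalidate() makes the VFS discard the dentry and repeat the lookup, which is how the stale entry gets replaced.)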
- */ - if (dentry->d_inode && ITOZ(dentry->d_inode)->z_is_stale) - return (0); - - return (1); -} - -const struct inode_operations zpl_inode_operations = { - .setattr = zpl_setattr, - .getattr = zpl_getattr, -#ifdef HAVE_GENERIC_SETXATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .removexattr = generic_removexattr, -#endif - .listxattr = zpl_xattr_list, -#ifdef HAVE_INODE_TRUNCATE_RANGE - .truncate_range = zpl_truncate_range, -#endif /* HAVE_INODE_TRUNCATE_RANGE */ -#ifdef HAVE_INODE_FALLOCATE - .fallocate = zpl_fallocate, -#endif /* HAVE_INODE_FALLOCATE */ -#if defined(CONFIG_FS_POSIX_ACL) -#if defined(HAVE_SET_ACL) - .set_acl = zpl_set_acl, -#endif -#if defined(HAVE_GET_ACL) - .get_acl = zpl_get_acl, -#elif defined(HAVE_CHECK_ACL) - .check_acl = zpl_check_acl, -#elif defined(HAVE_PERMISSION) - .permission = zpl_permission, -#endif /* HAVE_GET_ACL | HAVE_CHECK_ACL | HAVE_PERMISSION */ -#endif /* CONFIG_FS_POSIX_ACL */ -}; - -const struct inode_operations zpl_dir_inode_operations = { - .create = zpl_create, - .lookup = zpl_lookup, - .link = zpl_link, - .unlink = zpl_unlink, - .symlink = zpl_symlink, - .mkdir = zpl_mkdir, - .rmdir = zpl_rmdir, - .mknod = zpl_mknod, -#ifdef HAVE_RENAME_WANTS_FLAGS - .rename = zpl_rename2, -#else - .rename = zpl_rename, -#endif -#ifdef HAVE_TMPFILE - .tmpfile = zpl_tmpfile, -#endif - .setattr = zpl_setattr, - .getattr = zpl_getattr, -#ifdef HAVE_GENERIC_SETXATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .removexattr = generic_removexattr, -#endif - .listxattr = zpl_xattr_list, -#if defined(CONFIG_FS_POSIX_ACL) -#if defined(HAVE_SET_ACL) - .set_acl = zpl_set_acl, -#endif -#if defined(HAVE_GET_ACL) - .get_acl = zpl_get_acl, -#elif defined(HAVE_CHECK_ACL) - .check_acl = zpl_check_acl, -#elif defined(HAVE_PERMISSION) - .permission = zpl_permission, -#endif /* HAVE_GET_ACL | HAVE_CHECK_ACL | HAVE_PERMISSION */ -#endif /* CONFIG_FS_POSIX_ACL */ -}; - -const struct inode_operations zpl_symlink_inode_operations = { -#ifdef HAVE_GENERIC_READLINK - .readlink = generic_readlink, -#endif -#if defined(HAVE_GET_LINK_DELAYED) || defined(HAVE_GET_LINK_COOKIE) - .get_link = zpl_get_link, -#elif defined(HAVE_FOLLOW_LINK_COOKIE) || defined(HAVE_FOLLOW_LINK_NAMEIDATA) - .follow_link = zpl_follow_link, -#endif -#if defined(HAVE_PUT_LINK_COOKIE) || defined(HAVE_PUT_LINK_NAMEIDATA) - .put_link = zpl_put_link, -#endif - .setattr = zpl_setattr, - .getattr = zpl_getattr, -#ifdef HAVE_GENERIC_SETXATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .removexattr = generic_removexattr, -#endif - .listxattr = zpl_xattr_list, -}; - -const struct inode_operations zpl_special_inode_operations = { - .setattr = zpl_setattr, - .getattr = zpl_getattr, -#ifdef HAVE_GENERIC_SETXATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .removexattr = generic_removexattr, -#endif - .listxattr = zpl_xattr_list, -#if defined(CONFIG_FS_POSIX_ACL) -#if defined(HAVE_SET_ACL) - .set_acl = zpl_set_acl, -#endif -#if defined(HAVE_GET_ACL) - .get_acl = zpl_get_acl, -#elif defined(HAVE_CHECK_ACL) - .check_acl = zpl_check_acl, -#elif defined(HAVE_PERMISSION) - .permission = zpl_permission, -#endif /* HAVE_GET_ACL | HAVE_CHECK_ACL | HAVE_PERMISSION */ -#endif /* CONFIG_FS_POSIX_ACL */ -}; - -dentry_operations_t zpl_dentry_operations = { - .d_revalidate = zpl_revalidate, -}; diff --git a/module/zfs/zpl_super.c b/module/zfs/zpl_super.c deleted file mode 100644 index 810ab2898..000000000 --- a/module/zfs/zpl_super.c +++ /dev/null @@ 
-1,426 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2011, Lawrence Livermore National Security, LLC. - */ - - -#include <sys/zfs_vfsops.h> -#include <sys/zfs_vnops.h> -#include <sys/zfs_znode.h> -#include <sys/zfs_ctldir.h> -#include <sys/zpl.h> - - -static struct inode * -zpl_inode_alloc(struct super_block *sb) -{ - struct inode *ip; - - VERIFY3S(zfs_inode_alloc(sb, &ip), ==, 0); - inode_set_iversion(ip, 1); - - return (ip); -} - -static void -zpl_inode_destroy(struct inode *ip) -{ - ASSERT(atomic_read(&ip->i_count) == 0); - zfs_inode_destroy(ip); -} - -/* - * Called from __mark_inode_dirty() to reflect that something in the - * inode has changed. We use it to ensure the znode system attributes - * are always strictly up to date with respect to the inode. - */ -#ifdef HAVE_DIRTY_INODE_WITH_FLAGS -static void -zpl_dirty_inode(struct inode *ip, int flags) -{ - fstrans_cookie_t cookie; - - cookie = spl_fstrans_mark(); - zfs_dirty_inode(ip, flags); - spl_fstrans_unmark(cookie); -} -#else -static void -zpl_dirty_inode(struct inode *ip) -{ - fstrans_cookie_t cookie; - - cookie = spl_fstrans_mark(); - zfs_dirty_inode(ip, 0); - spl_fstrans_unmark(cookie); -} -#endif /* HAVE_DIRTY_INODE_WITH_FLAGS */ - -/* - * When ->drop_inode() is called its return value indicates if the - * inode should be evicted from the inode cache. If the inode is - * unhashed and has no links the default policy is to evict it - * immediately. - * - * Prior to 2.6.36 this eviction was accomplished by the vfs calling - * ->delete_inode(). It was ->delete_inode()'s responsibility to - * truncate the inode pages and call clear_inode(). The call to - * clear_inode() synchronously invalidates all the buffers and - * calls ->clear_inode(). It was ->clear_inode()'s responsibility - * to clean up any filesystem specific data before freeing the inode. - * - * This elaborate mechanism was replaced by ->evict_inode() which - * does the job of both ->delete_inode() and ->clear_inode(). It - * will be called exactly once, and when it returns the inode must - * be in a state where it can simply be freed. - * - * The ->evict_inode() callback must minimally truncate the inode pages, - * and call clear_inode(). For 2.6.35 and later kernels this will - * simply update the inode state, with the sync occurring before the - * truncate in evict(). For earlier kernels clear_inode() maps to - * end_writeback() which is responsible for completing all outstanding - * write back. In either case, once this is done it is safe to clean up - * any remaining inode specific data via zfs_inactive().
- */ -#ifdef HAVE_EVICT_INODE -static void -zpl_evict_inode(struct inode *ip) -{ - fstrans_cookie_t cookie; - - cookie = spl_fstrans_mark(); - truncate_setsize(ip, 0); - clear_inode(ip); - zfs_inactive(ip); - spl_fstrans_unmark(cookie); -} - -#else - -static void -zpl_drop_inode(struct inode *ip) -{ - generic_delete_inode(ip); -} - -static void -zpl_clear_inode(struct inode *ip) -{ - fstrans_cookie_t cookie; - - cookie = spl_fstrans_mark(); - zfs_inactive(ip); - spl_fstrans_unmark(cookie); -} - -static void -zpl_inode_delete(struct inode *ip) -{ - truncate_setsize(ip, 0); - clear_inode(ip); -} -#endif /* HAVE_EVICT_INODE */ - -static void -zpl_put_super(struct super_block *sb) -{ - fstrans_cookie_t cookie; - int error; - - cookie = spl_fstrans_mark(); - error = -zfs_umount(sb); - spl_fstrans_unmark(cookie); - ASSERT3S(error, <=, 0); -} - -static int -zpl_sync_fs(struct super_block *sb, int wait) -{ - fstrans_cookie_t cookie; - cred_t *cr = CRED(); - int error; - - crhold(cr); - cookie = spl_fstrans_mark(); - error = -zfs_sync(sb, wait, cr); - spl_fstrans_unmark(cookie); - crfree(cr); - ASSERT3S(error, <=, 0); - - return (error); -} - -static int -zpl_statfs(struct dentry *dentry, struct kstatfs *statp) -{ - fstrans_cookie_t cookie; - int error; - - cookie = spl_fstrans_mark(); - error = -zfs_statvfs(dentry, statp); - spl_fstrans_unmark(cookie); - ASSERT3S(error, <=, 0); - - /* - * If required by a 32-bit system call, dynamically scale the - * block size up to 16MiB and decrease the block counts. This - * allows for a maximum size of 64EiB to be reported. The file - * counts must be artificially capped at 2^32-1. - */ - if (unlikely(zpl_is_32bit_api())) { - while (statp->f_blocks > UINT32_MAX && - statp->f_bsize < SPA_MAXBLOCKSIZE) { - statp->f_frsize <<= 1; - statp->f_bsize <<= 1; - - statp->f_blocks >>= 1; - statp->f_bfree >>= 1; - statp->f_bavail >>= 1; - } - - uint64_t usedobjs = statp->f_files - statp->f_ffree; - statp->f_ffree = MIN(statp->f_ffree, UINT32_MAX - usedobjs); - statp->f_files = statp->f_ffree + usedobjs; - } - - return (error); -} - -static int -zpl_remount_fs(struct super_block *sb, int *flags, char *data) -{ - zfs_mnt_t zm = { .mnt_osname = NULL, .mnt_data = data }; - fstrans_cookie_t cookie; - int error; - - cookie = spl_fstrans_mark(); - error = -zfs_remount(sb, flags, &zm); - spl_fstrans_unmark(cookie); - ASSERT3S(error, <=, 0); - - return (error); -} - -static int -__zpl_show_options(struct seq_file *seq, zfsvfs_t *zfsvfs) -{ - seq_printf(seq, ",%s", - zfsvfs->z_flags & ZSB_XATTR ? 
"xattr" : "noxattr"); - -#ifdef CONFIG_FS_POSIX_ACL - switch (zfsvfs->z_acl_type) { - case ZFS_ACLTYPE_POSIXACL: - seq_puts(seq, ",posixacl"); - break; - default: - seq_puts(seq, ",noacl"); - break; - } -#endif /* CONFIG_FS_POSIX_ACL */ - - return (0); -} - -#ifdef HAVE_SHOW_OPTIONS_WITH_DENTRY -static int -zpl_show_options(struct seq_file *seq, struct dentry *root) -{ - return (__zpl_show_options(seq, root->d_sb->s_fs_info)); -} -#else -static int -zpl_show_options(struct seq_file *seq, struct vfsmount *vfsp) -{ - return (__zpl_show_options(seq, vfsp->mnt_sb->s_fs_info)); -} -#endif /* HAVE_SHOW_OPTIONS_WITH_DENTRY */ - -static int -zpl_fill_super(struct super_block *sb, void *data, int silent) -{ - zfs_mnt_t *zm = (zfs_mnt_t *)data; - fstrans_cookie_t cookie; - int error; - - cookie = spl_fstrans_mark(); - error = -zfs_domount(sb, zm, silent); - spl_fstrans_unmark(cookie); - ASSERT3S(error, <=, 0); - - return (error); -} - -static int -zpl_test_super(struct super_block *s, void *data) -{ - zfsvfs_t *zfsvfs = s->s_fs_info; - objset_t *os = data; - - if (zfsvfs == NULL) - return (0); - - return (os == zfsvfs->z_os); -} - -static struct super_block * -zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm) -{ - struct super_block *s; - objset_t *os; - int err; - - err = dmu_objset_hold(zm->mnt_osname, FTAG, &os); - if (err) - return (ERR_PTR(-err)); - - /* - * The dsl pool lock must be released prior to calling sget(). - * It is possible sget() may block on the lock in grab_super() - * while deactivate_super() holds that same lock and waits for - * a txg sync. If the dsl_pool lock is held over sget() - * this can prevent the pool sync and cause a deadlock. - */ - dsl_pool_rele(dmu_objset_pool(os), FTAG); - s = zpl_sget(fs_type, zpl_test_super, set_anon_super, flags, os); - dsl_dataset_rele(dmu_objset_ds(os), FTAG); - - if (IS_ERR(s)) - return (ERR_CAST(s)); - - if (s->s_root == NULL) { - err = zpl_fill_super(s, zm, flags & SB_SILENT ? 
1 : 0); - if (err) { - deactivate_locked_super(s); - return (ERR_PTR(err)); - } - s->s_flags |= SB_ACTIVE; - } else if ((flags ^ s->s_flags) & SB_RDONLY) { - deactivate_locked_super(s); - return (ERR_PTR(-EBUSY)); - } - - return (s); -} - -#ifdef HAVE_FST_MOUNT -static struct dentry * -zpl_mount(struct file_system_type *fs_type, int flags, - const char *osname, void *data) -{ - zfs_mnt_t zm = { .mnt_osname = osname, .mnt_data = data }; - - struct super_block *sb = zpl_mount_impl(fs_type, flags, &zm); - if (IS_ERR(sb)) - return (ERR_CAST(sb)); - - return (dget(sb->s_root)); -} -#else -static int -zpl_get_sb(struct file_system_type *fs_type, int flags, - const char *osname, void *data, struct vfsmount *mnt) -{ - zfs_mnt_t zm = { .mnt_osname = osname, .mnt_data = data }; - - struct super_block *sb = zpl_mount_impl(fs_type, flags, &zm); - if (IS_ERR(sb)) - return (PTR_ERR(sb)); - - (void) simple_set_mnt(mnt, sb); - - return (0); -} -#endif /* HAVE_FST_MOUNT */ - -static void -zpl_kill_sb(struct super_block *sb) -{ - zfs_preumount(sb); - kill_anon_super(sb); - -#ifdef HAVE_S_INSTANCES_LIST_HEAD - sb->s_instances.next = &(zpl_fs_type.fs_supers); -#endif /* HAVE_S_INSTANCES_LIST_HEAD */ -} - -void -zpl_prune_sb(int64_t nr_to_scan, void *arg) -{ - struct super_block *sb = (struct super_block *)arg; - int objects = 0; - - (void) -zfs_prune(sb, nr_to_scan, &objects); -} - -#ifdef HAVE_NR_CACHED_OBJECTS -static int -zpl_nr_cached_objects(struct super_block *sb) -{ - return (0); -} -#endif /* HAVE_NR_CACHED_OBJECTS */ - -#ifdef HAVE_FREE_CACHED_OBJECTS -static void -zpl_free_cached_objects(struct super_block *sb, int nr_to_scan) -{ - /* noop */ -} -#endif /* HAVE_FREE_CACHED_OBJECTS */ - -const struct super_operations zpl_super_operations = { - .alloc_inode = zpl_inode_alloc, - .destroy_inode = zpl_inode_destroy, - .dirty_inode = zpl_dirty_inode, - .write_inode = NULL, -#ifdef HAVE_EVICT_INODE - .evict_inode = zpl_evict_inode, -#else - .drop_inode = zpl_drop_inode, - .clear_inode = zpl_clear_inode, - .delete_inode = zpl_inode_delete, -#endif /* HAVE_EVICT_INODE */ - .put_super = zpl_put_super, - .sync_fs = zpl_sync_fs, - .statfs = zpl_statfs, - .remount_fs = zpl_remount_fs, - .show_options = zpl_show_options, - .show_stats = NULL, -#ifdef HAVE_NR_CACHED_OBJECTS - .nr_cached_objects = zpl_nr_cached_objects, -#endif /* HAVE_NR_CACHED_OBJECTS */ -#ifdef HAVE_FREE_CACHED_OBJECTS - .free_cached_objects = zpl_free_cached_objects, -#endif /* HAVE_FREE_CACHED_OBJECTS */ -}; - -struct file_system_type zpl_fs_type = { - .owner = THIS_MODULE, - .name = ZFS_DRIVER, -#ifdef HAVE_FST_MOUNT - .mount = zpl_mount, -#else - .get_sb = zpl_get_sb, -#endif /* HAVE_FST_MOUNT */ - .kill_sb = zpl_kill_sb, -}; diff --git a/module/zfs/zpl_xattr.c b/module/zfs/zpl_xattr.c deleted file mode 100644 index 95523f28e..000000000 --- a/module/zfs/zpl_xattr.c +++ /dev/null @@ -1,1548 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2011, Lawrence Livermore National Security, LLC. - * - * Extended attributes (xattr) on Solaris are implemented as files - * which exist in a hidden xattr directory. These extended attributes - * can be accessed using the attropen() system call which opens - * the extended attribute. It can then be manipulated just like - * a standard file descriptor. This has a couple of advantages such - * as practically no size limit on the file, and the extended - * attributes' permissions may differ from those of the parent file. - * This interface is really quite clever, but it's also completely - * different from what is supported on Linux. It also comes with a - * steep performance penalty when accessing small xattrs because they - * are not stored with the parent file. - * - * Under Linux extended attributes are manipulated by the system - * calls getxattr(2), setxattr(2), and listxattr(2). They consider - * extended attributes to be name/value pairs where the name is a - * NULL terminated string. The name must also include one of the - * following namespace prefixes: - * - * user - No restrictions and is available to user applications. - * trusted - Restricted to kernel and root (CAP_SYS_ADMIN) use. - * system - Used for access control lists (system.nfs4_acl, etc). - * security - Used by SELinux to store a file's security context. - * - * The value under Linux is limited to 65536 bytes of binary data. - * In practice, individual xattrs tend to be much smaller than this - * and are typically less than 100 bytes. Good examples of this - * are the security.selinux xattrs which are less than 100 bytes and - * exist for every file when xattr labeling is enabled. - * - * The Linux xattr implementation has been written to take advantage of - * this typical usage. When the dataset property 'xattr=sa' is set, - * then xattrs will be preferentially stored as System Attributes (SA). - * This allows tiny xattrs (~100 bytes) to be stored with the dnode and - * up to 64k of xattrs to be stored in the spill block. If additional - * xattr space is required, which is unlikely under Linux, they will - * be stored using the traditional directory approach. - * - * This optimization results in roughly a 3x performance improvement - * when accessing xattrs because it avoids the need to perform a seek - * for every xattr value. When multiple xattrs are stored per-file - * the performance improvements are even greater because all of the - * xattrs stored in the spill block will be cached. - * - * However, by default SA based xattrs are disabled in the Linux port - * to maximize compatibility with other implementations. If you do - * enable SA based xattrs then they will not be visible on platforms - * which do not support this feature. - * - * NOTE: One additional consequence of the xattr directory implementation - * is that when an extended attribute is manipulated an inode is created. - * This inode will exist in the Linux inode cache but there will be no - * associated entry in the dentry cache which references it. This is - * safe but it may result in some confusion. Enabling SA based xattrs - * largely avoids the issue except in the overflow case.
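 * (SA based xattrs are enabled per dataset with the 'xattr' property, e.g. 'zfs set xattr=sa tank/fs'; the default 'xattr=on' selects the directory based implementation.)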
- */ - -#include <sys/zfs_vfsops.h> -#include <sys/zfs_vnops.h> -#include <sys/zfs_znode.h> -#include <sys/zap.h> -#include <sys/vfs.h> -#include <sys/zpl.h> - -typedef struct xattr_filldir { - size_t size; - size_t offset; - char *buf; - struct dentry *dentry; -} xattr_filldir_t; - -static const struct xattr_handler *zpl_xattr_handler(const char *); - -static int -zpl_xattr_permission(xattr_filldir_t *xf, const char *name, int name_len) -{ - static const struct xattr_handler *handler; - struct dentry *d = xf->dentry; - - handler = zpl_xattr_handler(name); - if (!handler) - return (0); - - if (handler->list) { -#if defined(HAVE_XATTR_LIST_SIMPLE) - if (!handler->list(d)) - return (0); -#elif defined(HAVE_XATTR_LIST_DENTRY) - if (!handler->list(d, NULL, 0, name, name_len, 0)) - return (0); -#elif defined(HAVE_XATTR_LIST_HANDLER) - if (!handler->list(handler, d, NULL, 0, name, name_len)) - return (0); -#elif defined(HAVE_XATTR_LIST_INODE) - if (!handler->list(d->d_inode, NULL, 0, name, name_len)) - return (0); -#endif - } - - return (1); -} - -/* - * Determine if a given xattr name should be visible and if so copy it - * into the provided buffer (xf->buf). - */ -static int -zpl_xattr_filldir(xattr_filldir_t *xf, const char *name, int name_len) -{ - /* Check permissions using the per-namespace list xattr handler. */ - if (!zpl_xattr_permission(xf, name, name_len)) - return (0); - - /* When xf->buf is NULL only calculate the required size. */ - if (xf->buf) { - if (xf->offset + name_len + 1 > xf->size) - return (-ERANGE); - - memcpy(xf->buf + xf->offset, name, name_len); - xf->buf[xf->offset + name_len] = '\0'; - } - - xf->offset += (name_len + 1); - - return (0); -} - -/* - * Read as many directory entry names as will fit into the provided buffer, - * or when no buffer is provided calculate the required buffer size.
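The NULL-buffer convention above is the kernel half of the usual size-then-fill protocol; a hedged sketch of the userspace counterpart (error handling trimmed, helper name hypothetical):

#include <sys/types.h>
#include <sys/xattr.h>
#include <stdlib.h>

/* Returns a malloc'd buffer of NUL-separated xattr names, or NULL. */
static char *
example_list_names(const char *path, ssize_t *lenp)
{
	ssize_t len = listxattr(path, NULL, 0);	/* pass 1: size only */
	char *buf;

	if (len <= 0)
		return (NULL);
	buf = malloc(len);
	if (buf != NULL)
		len = listxattr(path, buf, len);	/* pass 2: fill */
	*lenp = len;
	return (buf);
}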
- */ -int -zpl_xattr_readdir(struct inode *dxip, xattr_filldir_t *xf) -{ - zap_cursor_t zc; - zap_attribute_t zap; - int error; - - zap_cursor_init(&zc, ITOZSB(dxip)->z_os, ITOZ(dxip)->z_id); - - while ((error = -zap_cursor_retrieve(&zc, &zap)) == 0) { - - if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { - error = -ENXIO; - break; - } - - error = zpl_xattr_filldir(xf, zap.za_name, strlen(zap.za_name)); - if (error) - break; - - zap_cursor_advance(&zc); - } - - zap_cursor_fini(&zc); - - if (error == -ENOENT) - error = 0; - - return (error); -} - -static ssize_t -zpl_xattr_list_dir(xattr_filldir_t *xf, cred_t *cr) -{ - struct inode *ip = xf->dentry->d_inode; - struct inode *dxip = NULL; - int error; - - /* Lookup the xattr directory */ - error = -zfs_lookup(ip, NULL, &dxip, LOOKUP_XATTR, cr, NULL, NULL); - if (error) { - if (error == -ENOENT) - error = 0; - - return (error); - } - - error = zpl_xattr_readdir(dxip, xf); - iput(dxip); - - return (error); -} - -static ssize_t -zpl_xattr_list_sa(xattr_filldir_t *xf) -{ - znode_t *zp = ITOZ(xf->dentry->d_inode); - nvpair_t *nvp = NULL; - int error = 0; - - mutex_enter(&zp->z_lock); - if (zp->z_xattr_cached == NULL) - error = -zfs_sa_get_xattr(zp); - mutex_exit(&zp->z_lock); - - if (error) - return (error); - - ASSERT(zp->z_xattr_cached); - - while ((nvp = nvlist_next_nvpair(zp->z_xattr_cached, nvp)) != NULL) { - ASSERT3U(nvpair_type(nvp), ==, DATA_TYPE_BYTE_ARRAY); - - error = zpl_xattr_filldir(xf, nvpair_name(nvp), - strlen(nvpair_name(nvp))); - if (error) - return (error); - } - - return (0); -} - -ssize_t -zpl_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) -{ - znode_t *zp = ITOZ(dentry->d_inode); - zfsvfs_t *zfsvfs = ZTOZSB(zp); - xattr_filldir_t xf = { buffer_size, 0, buffer, dentry }; - cred_t *cr = CRED(); - fstrans_cookie_t cookie; - int error = 0; - - crhold(cr); - cookie = spl_fstrans_mark(); - ZPL_ENTER(zfsvfs); - ZPL_VERIFY_ZP(zp); - rw_enter(&zp->z_xattr_lock, RW_READER); - - if (zfsvfs->z_use_sa && zp->z_is_sa) { - error = zpl_xattr_list_sa(&xf); - if (error) - goto out; - } - - error = zpl_xattr_list_dir(&xf, cr); - if (error) - goto out; - - error = xf.offset; -out: - - rw_exit(&zp->z_xattr_lock); - ZPL_EXIT(zfsvfs); - spl_fstrans_unmark(cookie); - crfree(cr); - - return (error); -} - -static int -zpl_xattr_get_dir(struct inode *ip, const char *name, void *value, - size_t size, cred_t *cr) -{ - struct inode *dxip = NULL; - struct inode *xip = NULL; - loff_t pos = 0; - int error; - - /* Lookup the xattr directory */ - error = -zfs_lookup(ip, NULL, &dxip, LOOKUP_XATTR, cr, NULL, NULL); - if (error) - goto out; - - /* Lookup a specific xattr name in the directory */ - error = -zfs_lookup(dxip, (char *)name, &xip, 0, cr, NULL, NULL); - if (error) - goto out; - - if (!size) { - error = i_size_read(xip); - goto out; - } - - if (size < i_size_read(xip)) { - error = -ERANGE; - goto out; - } - - error = zpl_read_common(xip, value, size, &pos, UIO_SYSSPACE, 0, cr); -out: - if (xip) - iput(xip); - - if (dxip) - iput(dxip); - - return (error); -} - -static int -zpl_xattr_get_sa(struct inode *ip, const char *name, void *value, size_t size) -{ - znode_t *zp = ITOZ(ip); - uchar_t *nv_value; - uint_t nv_size; - int error = 0; - - ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock)); - - mutex_enter(&zp->z_lock); - if (zp->z_xattr_cached == NULL) - error = -zfs_sa_get_xattr(zp); - mutex_exit(&zp->z_lock); - - if (error) - return (error); - - ASSERT(zp->z_xattr_cached); - error = -nvlist_lookup_byte_array(zp->z_xattr_cached, 
name, - &nv_value, &nv_size); - if (error) - return (error); - - if (size == 0 || value == NULL) - return (nv_size); - - if (size < nv_size) - return (-ERANGE); - - memcpy(value, nv_value, nv_size); - - return (nv_size); -} - -static int -__zpl_xattr_get(struct inode *ip, const char *name, void *value, size_t size, - cred_t *cr) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ZTOZSB(zp); - int error; - - ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock)); - - if (zfsvfs->z_use_sa && zp->z_is_sa) { - error = zpl_xattr_get_sa(ip, name, value, size); - if (error != -ENOENT) - goto out; - } - - error = zpl_xattr_get_dir(ip, name, value, size, cr); -out: - if (error == -ENOENT) - error = -ENODATA; - - return (error); -} - -#define XATTR_NOENT 0x0 -#define XATTR_IN_SA 0x1 -#define XATTR_IN_DIR 0x2 -/* check where the xattr resides */ -static int -__zpl_xattr_where(struct inode *ip, const char *name, int *where, cred_t *cr) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ZTOZSB(zp); - int error; - - ASSERT(where); - ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock)); - - *where = XATTR_NOENT; - if (zfsvfs->z_use_sa && zp->z_is_sa) { - error = zpl_xattr_get_sa(ip, name, NULL, 0); - if (error >= 0) - *where |= XATTR_IN_SA; - else if (error != -ENOENT) - return (error); - } - - error = zpl_xattr_get_dir(ip, name, NULL, 0, cr); - if (error >= 0) - *where |= XATTR_IN_DIR; - else if (error != -ENOENT) - return (error); - - if (*where == (XATTR_IN_SA|XATTR_IN_DIR)) - cmn_err(CE_WARN, "ZFS: inode %p has xattr \"%s\"" - " in both SA and dir", ip, name); - if (*where == XATTR_NOENT) - error = -ENODATA; - else - error = 0; - return (error); -} - -static int -zpl_xattr_get(struct inode *ip, const char *name, void *value, size_t size) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ZTOZSB(zp); - cred_t *cr = CRED(); - fstrans_cookie_t cookie; - int error; - - crhold(cr); - cookie = spl_fstrans_mark(); - ZPL_ENTER(zfsvfs); - ZPL_VERIFY_ZP(zp); - rw_enter(&zp->z_xattr_lock, RW_READER); - error = __zpl_xattr_get(ip, name, value, size, cr); - rw_exit(&zp->z_xattr_lock); - ZPL_EXIT(zfsvfs); - spl_fstrans_unmark(cookie); - crfree(cr); - - return (error); -} - -static int -zpl_xattr_set_dir(struct inode *ip, const char *name, const void *value, - size_t size, int flags, cred_t *cr) -{ - struct inode *dxip = NULL; - struct inode *xip = NULL; - vattr_t *vap = NULL; - ssize_t wrote; - int lookup_flags, error; - const int xattr_mode = S_IFREG | 0644; - loff_t pos = 0; - - /* - * Lookup the xattr directory. When we're adding an entry pass - * CREATE_XATTR_DIR to ensure the xattr directory is created. - * When removing an entry this flag is not passed to avoid - * unnecessarily creating a new xattr directory. - */ - lookup_flags = LOOKUP_XATTR; - if (value != NULL) - lookup_flags |= CREATE_XATTR_DIR; - - error = -zfs_lookup(ip, NULL, &dxip, lookup_flags, cr, NULL, NULL); - if (error) - goto out; - - /* Lookup a specific xattr name in the directory */ - error = -zfs_lookup(dxip, (char *)name, &xip, 0, cr, NULL, NULL); - if (error && (error != -ENOENT)) - goto out; - - error = 0; - - /* Remove a specific name xattr when value is set to NULL. */ - if (value == NULL) { - if (xip) - error = -zfs_remove(dxip, (char *)name, cr, 0); - - goto out; - } - - /* Lookup failed create a new xattr. 
*/ - if (xip == NULL) { - vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); - vap->va_mode = xattr_mode; - vap->va_mask = ATTR_MODE; - vap->va_uid = crgetfsuid(cr); - vap->va_gid = crgetfsgid(cr); - - error = -zfs_create(dxip, (char *)name, vap, 0, 0644, &xip, - cr, 0, NULL); - if (error) - goto out; - } - - ASSERT(xip != NULL); - - error = -zfs_freesp(ITOZ(xip), 0, 0, xattr_mode, TRUE); - if (error) - goto out; - - wrote = zpl_write_common(xip, value, size, &pos, UIO_SYSSPACE, 0, cr); - if (wrote < 0) - error = wrote; - -out: - - if (error == 0) { - ip->i_ctime = current_time(ip); - zfs_mark_inode_dirty(ip); - } - - if (vap) - kmem_free(vap, sizeof (vattr_t)); - - if (xip) - iput(xip); - - if (dxip) - iput(dxip); - - if (error == -ENOENT) - error = -ENODATA; - - ASSERT3S(error, <=, 0); - - return (error); -} - -static int -zpl_xattr_set_sa(struct inode *ip, const char *name, const void *value, - size_t size, int flags, cred_t *cr) -{ - znode_t *zp = ITOZ(ip); - nvlist_t *nvl; - size_t sa_size; - int error = 0; - - mutex_enter(&zp->z_lock); - if (zp->z_xattr_cached == NULL) - error = -zfs_sa_get_xattr(zp); - mutex_exit(&zp->z_lock); - - if (error) - return (error); - - ASSERT(zp->z_xattr_cached); - nvl = zp->z_xattr_cached; - - if (value == NULL) { - error = -nvlist_remove(nvl, name, DATA_TYPE_BYTE_ARRAY); - if (error == -ENOENT) - error = zpl_xattr_set_dir(ip, name, NULL, 0, flags, cr); - } else { - /* Limited to 32k to keep nvpair memory allocations small */ - if (size > DXATTR_MAX_ENTRY_SIZE) - return (-EFBIG); - - /* Prevent the DXATTR SA from consuming the entire SA region */ - error = -nvlist_size(nvl, &sa_size, NV_ENCODE_XDR); - if (error) - return (error); - - if (sa_size > DXATTR_MAX_SA_SIZE) - return (-EFBIG); - - error = -nvlist_add_byte_array(nvl, name, - (uchar_t *)value, size); - } - - /* - * Update the SA for additions, modifications, and removals. On - * error drop the inconsistent cached version of the nvlist, it - * will be reconstructed from the ARC when next accessed. - */ - if (error == 0) - error = -zfs_sa_set_xattr(zp); - - if (error) { - nvlist_free(nvl); - zp->z_xattr_cached = NULL; - } - - ASSERT3S(error, <=, 0); - - return (error); -} - -static int -zpl_xattr_set(struct inode *ip, const char *name, const void *value, - size_t size, int flags) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ZTOZSB(zp); - cred_t *cr = CRED(); - fstrans_cookie_t cookie; - int where; - int error; - - crhold(cr); - cookie = spl_fstrans_mark(); - ZPL_ENTER(zfsvfs); - ZPL_VERIFY_ZP(zp); - rw_enter(&ITOZ(ip)->z_xattr_lock, RW_WRITER); - - /* - * Before setting the xattr check to see if it already exists. - * This is done to ensure the following optional flags are honored. - * - * XATTR_CREATE: fail if xattr already exists - * XATTR_REPLACE: fail if xattr does not exist - * - * We also want to know if it resides in sa or dir, so we can make - * sure we don't end up with duplicate in both places. 
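 * For example, setxattr(2) called with XATTR_CREATE fails with EEXIST when the attribute already exists, and with XATTR_REPLACE it fails with ENODATA when the attribute is absent.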
- */ - error = __zpl_xattr_where(ip, name, &where, cr); - if (error < 0) { - if (error != -ENODATA) - goto out; - if (flags & XATTR_REPLACE) - goto out; - - /* The xattr to be removed already doesn't exist */ - error = 0; - if (value == NULL) - goto out; - } else { - error = -EEXIST; - if (flags & XATTR_CREATE) - goto out; - } - - /* Preferentially store the xattr as a SA for better performance */ - if (zfsvfs->z_use_sa && zp->z_is_sa && - (zfsvfs->z_xattr_sa || (value == NULL && where & XATTR_IN_SA))) { - error = zpl_xattr_set_sa(ip, name, value, size, flags, cr); - if (error == 0) { - /* - * Successfully put into SA, we need to clear the one - * in dir. - */ - if (where & XATTR_IN_DIR) - zpl_xattr_set_dir(ip, name, NULL, 0, 0, cr); - goto out; - } - } - - error = zpl_xattr_set_dir(ip, name, value, size, flags, cr); - /* - * Successfully put into dir, we need to clear the one in SA. - */ - if (error == 0 && (where & XATTR_IN_SA)) - zpl_xattr_set_sa(ip, name, NULL, 0, 0, cr); -out: - rw_exit(&ITOZ(ip)->z_xattr_lock); - ZPL_EXIT(zfsvfs); - spl_fstrans_unmark(cookie); - crfree(cr); - ASSERT3S(error, <=, 0); - - return (error); -} - -/* - * Extended user attributes - * - * "Extended user attributes may be assigned to files and directories for - * storing arbitrary additional information such as the mime type, - * character set or encoding of a file. The access permissions for user - * attributes are defined by the file permission bits: read permission - * is required to retrieve the attribute value, and writer permission is - * required to change it. - * - * The file permission bits of regular files and directories are - * interpreted differently from the file permission bits of special - * files and symbolic links. For regular files and directories the file - * permission bits define access to the file's contents, while for - * device special files they define access to the device described by - * the special file. The file permissions of symbolic links are not - * used in access checks. These differences would allow users to - * consume filesystem resources in a way not controllable by disk quotas - * for group or world writable special files and directories. - * - * For this reason, extended user attributes are allowed only for - * regular files and directories, and access to extended user attributes - * is restricted to the owner and to users with appropriate capabilities - * for directories with the sticky bit set (see the chmod(1) manual page - * for an explanation of the sticky bit)." - xattr(7) - * - * ZFS allows extended user attributes to be disabled administratively - * by setting the 'xattr=off' property on the dataset. 
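 * For example, a process with write access to a regular file may tag it with setxattr(path, "user.mime_type", "text/plain", 10, 0) and read the tag back with getxattr(2).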
- */ -static int -__zpl_xattr_user_list(struct inode *ip, char *list, size_t list_size, - const char *name, size_t name_len) -{ - return (ITOZSB(ip)->z_flags & ZSB_XATTR); -} -ZPL_XATTR_LIST_WRAPPER(zpl_xattr_user_list); - -static int -__zpl_xattr_user_get(struct inode *ip, const char *name, - void *value, size_t size) -{ - char *xattr_name; - int error; - /* xattr_resolve_name will do this for us if this is defined */ -#ifndef HAVE_XATTR_HANDLER_NAME - if (strcmp(name, "") == 0) - return (-EINVAL); -#endif - if (!(ITOZSB(ip)->z_flags & ZSB_XATTR)) - return (-EOPNOTSUPP); - - xattr_name = kmem_asprintf("%s%s", XATTR_USER_PREFIX, name); - error = zpl_xattr_get(ip, xattr_name, value, size); - strfree(xattr_name); - - return (error); -} -ZPL_XATTR_GET_WRAPPER(zpl_xattr_user_get); - -static int -__zpl_xattr_user_set(struct inode *ip, const char *name, - const void *value, size_t size, int flags) -{ - char *xattr_name; - int error; - /* xattr_resolve_name will do this for us if this is defined */ -#ifndef HAVE_XATTR_HANDLER_NAME - if (strcmp(name, "") == 0) - return (-EINVAL); -#endif - if (!(ITOZSB(ip)->z_flags & ZSB_XATTR)) - return (-EOPNOTSUPP); - - xattr_name = kmem_asprintf("%s%s", XATTR_USER_PREFIX, name); - error = zpl_xattr_set(ip, xattr_name, value, size, flags); - strfree(xattr_name); - - return (error); -} -ZPL_XATTR_SET_WRAPPER(zpl_xattr_user_set); - -xattr_handler_t zpl_xattr_user_handler = -{ - .prefix = XATTR_USER_PREFIX, - .list = zpl_xattr_user_list, - .get = zpl_xattr_user_get, - .set = zpl_xattr_user_set, -}; - -/* - * Trusted extended attributes - * - * "Trusted extended attributes are visible and accessible only to - * processes that have the CAP_SYS_ADMIN capability. Attributes in this - * class are used to implement mechanisms in user space (i.e., outside - * the kernel) which keep information in extended attributes to which - * ordinary processes should not have access." 
- xattr(7) - */ -static int -__zpl_xattr_trusted_list(struct inode *ip, char *list, size_t list_size, - const char *name, size_t name_len) -{ - return (capable(CAP_SYS_ADMIN)); -} -ZPL_XATTR_LIST_WRAPPER(zpl_xattr_trusted_list); - -static int -__zpl_xattr_trusted_get(struct inode *ip, const char *name, - void *value, size_t size) -{ - char *xattr_name; - int error; - - if (!capable(CAP_SYS_ADMIN)) - return (-EACCES); - /* xattr_resolve_name will do this for us if this is defined */ -#ifndef HAVE_XATTR_HANDLER_NAME - if (strcmp(name, "") == 0) - return (-EINVAL); -#endif - xattr_name = kmem_asprintf("%s%s", XATTR_TRUSTED_PREFIX, name); - error = zpl_xattr_get(ip, xattr_name, value, size); - strfree(xattr_name); - - return (error); -} -ZPL_XATTR_GET_WRAPPER(zpl_xattr_trusted_get); - -static int -__zpl_xattr_trusted_set(struct inode *ip, const char *name, - const void *value, size_t size, int flags) -{ - char *xattr_name; - int error; - - if (!capable(CAP_SYS_ADMIN)) - return (-EACCES); - /* xattr_resolve_name will do this for us if this is defined */ -#ifndef HAVE_XATTR_HANDLER_NAME - if (strcmp(name, "") == 0) - return (-EINVAL); -#endif - xattr_name = kmem_asprintf("%s%s", XATTR_TRUSTED_PREFIX, name); - error = zpl_xattr_set(ip, xattr_name, value, size, flags); - strfree(xattr_name); - - return (error); -} -ZPL_XATTR_SET_WRAPPER(zpl_xattr_trusted_set); - -xattr_handler_t zpl_xattr_trusted_handler = -{ - .prefix = XATTR_TRUSTED_PREFIX, - .list = zpl_xattr_trusted_list, - .get = zpl_xattr_trusted_get, - .set = zpl_xattr_trusted_set, -}; - -/* - * Extended security attributes - * - * "The security attribute namespace is used by kernel security modules, - * such as Security Enhanced Linux, and also to implement file - * capabilities (see capabilities(7)). Read and write access - * permissions to security attributes depend on the policy implemented - * for each security attribute by the security module. When no security - * module is loaded, all processes have read access to extended security - * attributes, and write access is limited to processes that have the - * CAP_SYS_ADMIN capability." 
- xattr(7) - */ -static int -__zpl_xattr_security_list(struct inode *ip, char *list, size_t list_size, - const char *name, size_t name_len) -{ - return (1); -} -ZPL_XATTR_LIST_WRAPPER(zpl_xattr_security_list); - -static int -__zpl_xattr_security_get(struct inode *ip, const char *name, - void *value, size_t size) -{ - char *xattr_name; - int error; - /* xattr_resolve_name will do this for us if this is defined */ -#ifndef HAVE_XATTR_HANDLER_NAME - if (strcmp(name, "") == 0) - return (-EINVAL); -#endif - xattr_name = kmem_asprintf("%s%s", XATTR_SECURITY_PREFIX, name); - error = zpl_xattr_get(ip, xattr_name, value, size); - strfree(xattr_name); - - return (error); -} -ZPL_XATTR_GET_WRAPPER(zpl_xattr_security_get); - -static int -__zpl_xattr_security_set(struct inode *ip, const char *name, - const void *value, size_t size, int flags) -{ - char *xattr_name; - int error; - /* xattr_resolve_name will do this for us if this is defined */ -#ifndef HAVE_XATTR_HANDLER_NAME - if (strcmp(name, "") == 0) - return (-EINVAL); -#endif - xattr_name = kmem_asprintf("%s%s", XATTR_SECURITY_PREFIX, name); - error = zpl_xattr_set(ip, xattr_name, value, size, flags); - strfree(xattr_name); - - return (error); -} -ZPL_XATTR_SET_WRAPPER(zpl_xattr_security_set); - -#ifdef HAVE_CALLBACK_SECURITY_INODE_INIT_SECURITY -static int -__zpl_xattr_security_init(struct inode *ip, const struct xattr *xattrs, - void *fs_info) -{ - const struct xattr *xattr; - int error = 0; - - for (xattr = xattrs; xattr->name != NULL; xattr++) { - error = __zpl_xattr_security_set(ip, - xattr->name, xattr->value, xattr->value_len, 0); - - if (error < 0) - break; - } - - return (error); -} - -int -zpl_xattr_security_init(struct inode *ip, struct inode *dip, - const struct qstr *qstr) -{ - return security_inode_init_security(ip, dip, qstr, - &__zpl_xattr_security_init, NULL); -} - -#else -int -zpl_xattr_security_init(struct inode *ip, struct inode *dip, - const struct qstr *qstr) -{ - int error; - size_t len; - void *value; - char *name; - - error = zpl_security_inode_init_security(ip, dip, qstr, - &name, &value, &len); - if (error) { - if (error == -EOPNOTSUPP) - return (0); - - return (error); - } - - error = __zpl_xattr_security_set(ip, name, value, len, 0); - - kfree(name); - kfree(value); - - return (error); -} -#endif /* HAVE_CALLBACK_SECURITY_INODE_INIT_SECURITY */ - -/* - * Security xattr namespace handlers. - */ -xattr_handler_t zpl_xattr_security_handler = { - .prefix = XATTR_SECURITY_PREFIX, - .list = zpl_xattr_security_list, - .get = zpl_xattr_security_get, - .set = zpl_xattr_security_set, -}; - -/* - * Extended system attributes - * - * "Extended system attributes are used by the kernel to store system - * objects such as Access Control Lists. Read and write access permissions - * to system attributes depend on the policy implemented for each system - * attribute implemented by filesystems in the kernel." - xattr(7) - */ -#ifdef CONFIG_FS_POSIX_ACL -int -zpl_set_acl(struct inode *ip, struct posix_acl *acl, int type) -{ - char *name, *value = NULL; - int error = 0; - size_t size = 0; - - if (S_ISLNK(ip->i_mode)) - return (-EOPNOTSUPP); - - switch (type) { - case ACL_TYPE_ACCESS: - name = XATTR_NAME_POSIX_ACL_ACCESS; - if (acl) { - zpl_equivmode_t mode = ip->i_mode; - error = posix_acl_equiv_mode(acl, &mode); - if (error < 0) { - return (error); - } else { - /* - * The mode bits will have been set by - * ->zfs_setattr()->zfs_acl_chmod_setattr() - * using the ZFS ACL conversion. 
If they - * differ from the Posix ACL conversion dirty - * the inode to write the Posix mode bits. - */ - if (ip->i_mode != mode) { - ip->i_mode = mode; - ip->i_ctime = current_time(ip); - zfs_mark_inode_dirty(ip); - } - - if (error == 0) - acl = NULL; - } - } - break; - - case ACL_TYPE_DEFAULT: - name = XATTR_NAME_POSIX_ACL_DEFAULT; - if (!S_ISDIR(ip->i_mode)) - return (acl ? -EACCES : 0); - break; - - default: - return (-EINVAL); - } - - if (acl) { - size = posix_acl_xattr_size(acl->a_count); - value = kmem_alloc(size, KM_SLEEP); - - error = zpl_acl_to_xattr(acl, value, size); - if (error < 0) { - kmem_free(value, size); - return (error); - } - } - - error = zpl_xattr_set(ip, name, value, size, 0); - if (value) - kmem_free(value, size); - - if (!error) { - if (acl) - zpl_set_cached_acl(ip, type, acl); - else - zpl_forget_cached_acl(ip, type); - } - - return (error); -} - -struct posix_acl * -zpl_get_acl(struct inode *ip, int type) -{ - struct posix_acl *acl; - void *value = NULL; - char *name; - int size; - - /* - * As of Linux 3.14, the kernel get_acl will check this for us. - * Also as of Linux 4.7, comparing against ACL_NOT_CACHED is wrong - * as the kernel get_acl will set it to temporary sentinel value. - */ -#ifndef HAVE_KERNEL_GET_ACL_HANDLE_CACHE - acl = get_cached_acl(ip, type); - if (acl != ACL_NOT_CACHED) - return (acl); -#endif - - switch (type) { - case ACL_TYPE_ACCESS: - name = XATTR_NAME_POSIX_ACL_ACCESS; - break; - case ACL_TYPE_DEFAULT: - name = XATTR_NAME_POSIX_ACL_DEFAULT; - break; - default: - return (ERR_PTR(-EINVAL)); - } - - size = zpl_xattr_get(ip, name, NULL, 0); - if (size > 0) { - value = kmem_alloc(size, KM_SLEEP); - size = zpl_xattr_get(ip, name, value, size); - } - - if (size > 0) { - acl = zpl_acl_from_xattr(value, size); - } else if (size == -ENODATA || size == -ENOSYS) { - acl = NULL; - } else { - acl = ERR_PTR(-EIO); - } - - if (size > 0) - kmem_free(value, size); - - /* As of Linux 4.7, the kernel get_acl will set this for us */ -#ifndef HAVE_KERNEL_GET_ACL_HANDLE_CACHE - if (!IS_ERR(acl)) - zpl_set_cached_acl(ip, type, acl); -#endif - - return (acl); -} - -#if !defined(HAVE_GET_ACL) -static int -__zpl_check_acl(struct inode *ip, int mask) -{ - struct posix_acl *acl; - int error; - - acl = zpl_get_acl(ip, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return (PTR_ERR(acl)); - - if (acl) { - error = posix_acl_permission(ip, acl, mask); - zpl_posix_acl_release(acl); - return (error); - } - - return (-EAGAIN); -} - -#if defined(HAVE_CHECK_ACL_WITH_FLAGS) -int -zpl_check_acl(struct inode *ip, int mask, unsigned int flags) -{ - return (__zpl_check_acl(ip, mask)); -} -#elif defined(HAVE_CHECK_ACL) -int -zpl_check_acl(struct inode *ip, int mask) -{ - return (__zpl_check_acl(ip, mask)); -} -#elif defined(HAVE_PERMISSION_WITH_NAMEIDATA) -int -zpl_permission(struct inode *ip, int mask, struct nameidata *nd) -{ - return (generic_permission(ip, mask, __zpl_check_acl)); -} -#elif defined(HAVE_PERMISSION) -int -zpl_permission(struct inode *ip, int mask) -{ - return (generic_permission(ip, mask, __zpl_check_acl)); -} -#endif /* HAVE_CHECK_ACL | HAVE_PERMISSION */ -#endif /* !HAVE_GET_ACL */ - -int -zpl_init_acl(struct inode *ip, struct inode *dir) -{ - struct posix_acl *acl = NULL; - int error = 0; - - if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) - return (0); - - if (!S_ISLNK(ip->i_mode)) { - acl = zpl_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return (PTR_ERR(acl)); - if (!acl) { - ip->i_mode &= ~current_umask(); - ip->i_ctime = current_time(ip); - 
zfs_mark_inode_dirty(ip); - return (0); - } - } - - if (acl) { - umode_t mode; - - if (S_ISDIR(ip->i_mode)) { - error = zpl_set_acl(ip, acl, ACL_TYPE_DEFAULT); - if (error) - goto out; - } - - mode = ip->i_mode; - error = __posix_acl_create(&acl, GFP_KERNEL, &mode); - if (error >= 0) { - ip->i_mode = mode; - zfs_mark_inode_dirty(ip); - if (error > 0) - error = zpl_set_acl(ip, acl, ACL_TYPE_ACCESS); - } - } -out: - zpl_posix_acl_release(acl); - - return (error); -} - -int -zpl_chmod_acl(struct inode *ip) -{ - struct posix_acl *acl; - int error; - - if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) - return (0); - - if (S_ISLNK(ip->i_mode)) - return (-EOPNOTSUPP); - - acl = zpl_get_acl(ip, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return (PTR_ERR(acl)); - - error = __posix_acl_chmod(&acl, GFP_KERNEL, ip->i_mode); - if (!error) - error = zpl_set_acl(ip, acl, ACL_TYPE_ACCESS); - - zpl_posix_acl_release(acl); - - return (error); -} - -static int -__zpl_xattr_acl_list_access(struct inode *ip, char *list, size_t list_size, - const char *name, size_t name_len) -{ - char *xattr_name = XATTR_NAME_POSIX_ACL_ACCESS; - size_t xattr_size = sizeof (XATTR_NAME_POSIX_ACL_ACCESS); - - if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) - return (0); - - if (list && xattr_size <= list_size) - memcpy(list, xattr_name, xattr_size); - - return (xattr_size); -} -ZPL_XATTR_LIST_WRAPPER(zpl_xattr_acl_list_access); - -static int -__zpl_xattr_acl_list_default(struct inode *ip, char *list, size_t list_size, - const char *name, size_t name_len) -{ - char *xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT; - size_t xattr_size = sizeof (XATTR_NAME_POSIX_ACL_DEFAULT); - - if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) - return (0); - - if (list && xattr_size <= list_size) - memcpy(list, xattr_name, xattr_size); - - return (xattr_size); -} -ZPL_XATTR_LIST_WRAPPER(zpl_xattr_acl_list_default); - -static int -__zpl_xattr_acl_get_access(struct inode *ip, const char *name, - void *buffer, size_t size) -{ - struct posix_acl *acl; - int type = ACL_TYPE_ACCESS; - int error; - /* xattr_resolve_name will do this for us if this is defined */ -#ifndef HAVE_XATTR_HANDLER_NAME - if (strcmp(name, "") != 0) - return (-EINVAL); -#endif - if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) - return (-EOPNOTSUPP); - - acl = zpl_get_acl(ip, type); - if (IS_ERR(acl)) - return (PTR_ERR(acl)); - if (acl == NULL) - return (-ENODATA); - - error = zpl_acl_to_xattr(acl, buffer, size); - zpl_posix_acl_release(acl); - - return (error); -} -ZPL_XATTR_GET_WRAPPER(zpl_xattr_acl_get_access); - -static int -__zpl_xattr_acl_get_default(struct inode *ip, const char *name, - void *buffer, size_t size) -{ - struct posix_acl *acl; - int type = ACL_TYPE_DEFAULT; - int error; - /* xattr_resolve_name will do this for us if this is defined */ -#ifndef HAVE_XATTR_HANDLER_NAME - if (strcmp(name, "") != 0) - return (-EINVAL); -#endif - if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) - return (-EOPNOTSUPP); - - acl = zpl_get_acl(ip, type); - if (IS_ERR(acl)) - return (PTR_ERR(acl)); - if (acl == NULL) - return (-ENODATA); - - error = zpl_acl_to_xattr(acl, buffer, size); - zpl_posix_acl_release(acl); - - return (error); -} -ZPL_XATTR_GET_WRAPPER(zpl_xattr_acl_get_default); - -static int -__zpl_xattr_acl_set_access(struct inode *ip, const char *name, - const void *value, size_t size, int flags) -{ - struct posix_acl *acl; - int type = ACL_TYPE_ACCESS; - int error = 0; - /* xattr_resolve_name will do this for us if this is defined */ -#ifndef 
-
-static int
-__zpl_xattr_acl_set_access(struct inode *ip, const char *name,
-    const void *value, size_t size, int flags)
-{
-	struct posix_acl *acl;
-	int type = ACL_TYPE_ACCESS;
-	int error = 0;
-	/* xattr_resolve_name will do this for us if this is defined */
-#ifndef HAVE_XATTR_HANDLER_NAME
-	if (strcmp(name, "") != 0)
-		return (-EINVAL);
-#endif
-	if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL)
-		return (-EOPNOTSUPP);
-
-	if (!zpl_inode_owner_or_capable(ip))
-		return (-EPERM);
-
-	if (value) {
-		acl = zpl_acl_from_xattr(value, size);
-		if (IS_ERR(acl))
-			return (PTR_ERR(acl));
-		else if (acl) {
-			error = zpl_posix_acl_valid(ip, acl);
-			if (error) {
-				zpl_posix_acl_release(acl);
-				return (error);
-			}
-		}
-	} else {
-		acl = NULL;
-	}
-
-	error = zpl_set_acl(ip, acl, type);
-	zpl_posix_acl_release(acl);
-
-	return (error);
-}
-ZPL_XATTR_SET_WRAPPER(zpl_xattr_acl_set_access);
-
-static int
-__zpl_xattr_acl_set_default(struct inode *ip, const char *name,
-    const void *value, size_t size, int flags)
-{
-	struct posix_acl *acl;
-	int type = ACL_TYPE_DEFAULT;
-	int error = 0;
-	/* xattr_resolve_name will do this for us if this is defined */
-#ifndef HAVE_XATTR_HANDLER_NAME
-	if (strcmp(name, "") != 0)
-		return (-EINVAL);
-#endif
-	if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL)
-		return (-EOPNOTSUPP);
-
-	if (!zpl_inode_owner_or_capable(ip))
-		return (-EPERM);
-
-	if (value) {
-		acl = zpl_acl_from_xattr(value, size);
-		if (IS_ERR(acl))
-			return (PTR_ERR(acl));
-		else if (acl) {
-			error = zpl_posix_acl_valid(ip, acl);
-			if (error) {
-				zpl_posix_acl_release(acl);
-				return (error);
-			}
-		}
-	} else {
-		acl = NULL;
-	}
-
-	error = zpl_set_acl(ip, acl, type);
-	zpl_posix_acl_release(acl);
-
-	return (error);
-}
-ZPL_XATTR_SET_WRAPPER(zpl_xattr_acl_set_default);
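Both set handlers above take the raw value of the system.posix_acl_access or system.posix_acl_default xattr, convert it with zpl_acl_from_xattr(), validate it, and hand it to zpl_set_acl(). The blob format is assumed here to follow the standard Linux posix_acl_xattr layout from linux/posix_acl_xattr.h: a 4-byte version-2 header followed by packed 8-byte little-endian (tag, perm, id) entries. A hypothetical userspace encoder, for illustration only:

/* Build the minimal three-entry ACL blob and store it via setxattr(2). */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

#define	ACL_USER_OBJ		0x01	/* tag values from linux/posix_acl.h */
#define	ACL_GROUP_OBJ		0x04
#define	ACL_OTHER		0x20
#define	ACL_UNDEFINED_ID	((uint32_t)-1)

struct acl_entry {			/* 8 bytes, no padding */
	uint16_t tag;
	uint16_t perm;			/* r=4, w=2, x=1 */
	uint32_t id;
};

int
main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "testfile";
	/* version 2 header + the three mandatory entries (rw-, r--, r--) */
	unsigned char buf[4 + 3 * sizeof (struct acl_entry)];
	struct acl_entry e[3] = {
		{ ACL_USER_OBJ, 6, ACL_UNDEFINED_ID },
		{ ACL_GROUP_OBJ, 4, ACL_UNDEFINED_ID },
		{ ACL_OTHER, 4, ACL_UNDEFINED_ID },
	};
	uint32_t version = 2;

	/* NB: on-wire fields are little-endian; this sketch assumes x86. */
	memcpy(buf, &version, 4);
	memcpy(buf + 4, e, sizeof (e));
	if (setxattr(path, "system.posix_acl_access", buf, sizeof (buf), 0))
		perror("setxattr");	/* effectively a chmod 644 */
	return (0);
}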
-
-/*
- * ACL access xattr namespace handlers.
- *
- * Use .name instead of .prefix when available. xattr_resolve_name will match
- * whole name and reject anything that has .name only as prefix.
- */
-xattr_handler_t zpl_xattr_acl_access_handler =
-{
-#ifdef HAVE_XATTR_HANDLER_NAME
-	.name = XATTR_NAME_POSIX_ACL_ACCESS,
-#else
-	.prefix = XATTR_NAME_POSIX_ACL_ACCESS,
-#endif
-	.list = zpl_xattr_acl_list_access,
-	.get = zpl_xattr_acl_get_access,
-	.set = zpl_xattr_acl_set_access,
-#if defined(HAVE_XATTR_LIST_SIMPLE) || \
-    defined(HAVE_XATTR_LIST_DENTRY) || \
-    defined(HAVE_XATTR_LIST_HANDLER)
-	.flags = ACL_TYPE_ACCESS,
-#endif
-};
-
-/*
- * ACL default xattr namespace handlers.
- *
- * Use .name instead of .prefix when available. xattr_resolve_name will match
- * whole name and reject anything that has .name only as prefix.
- */
-xattr_handler_t zpl_xattr_acl_default_handler =
-{
-#ifdef HAVE_XATTR_HANDLER_NAME
-	.name = XATTR_NAME_POSIX_ACL_DEFAULT,
-#else
-	.prefix = XATTR_NAME_POSIX_ACL_DEFAULT,
-#endif
-	.list = zpl_xattr_acl_list_default,
-	.get = zpl_xattr_acl_get_default,
-	.set = zpl_xattr_acl_set_default,
-#if defined(HAVE_XATTR_LIST_SIMPLE) || \
-    defined(HAVE_XATTR_LIST_DENTRY) || \
-    defined(HAVE_XATTR_LIST_HANDLER)
-	.flags = ACL_TYPE_DEFAULT,
-#endif
-};
-
-#endif /* CONFIG_FS_POSIX_ACL */
-
-xattr_handler_t *zpl_xattr_handlers[] = {
-	&zpl_xattr_security_handler,
-	&zpl_xattr_trusted_handler,
-	&zpl_xattr_user_handler,
-#ifdef CONFIG_FS_POSIX_ACL
-	&zpl_xattr_acl_access_handler,
-	&zpl_xattr_acl_default_handler,
-#endif /* CONFIG_FS_POSIX_ACL */
-	NULL
-};
-
-static const struct xattr_handler *
-zpl_xattr_handler(const char *name)
-{
-	if (strncmp(name, XATTR_USER_PREFIX,
-	    XATTR_USER_PREFIX_LEN) == 0)
-		return (&zpl_xattr_user_handler);
-
-	if (strncmp(name, XATTR_TRUSTED_PREFIX,
-	    XATTR_TRUSTED_PREFIX_LEN) == 0)
-		return (&zpl_xattr_trusted_handler);
-
-	if (strncmp(name, XATTR_SECURITY_PREFIX,
-	    XATTR_SECURITY_PREFIX_LEN) == 0)
-		return (&zpl_xattr_security_handler);
-
-#ifdef CONFIG_FS_POSIX_ACL
-	if (strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS,
-	    sizeof (XATTR_NAME_POSIX_ACL_ACCESS)) == 0)
-		return (&zpl_xattr_acl_access_handler);
-
-	if (strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT,
-	    sizeof (XATTR_NAME_POSIX_ACL_DEFAULT)) == 0)
-		return (&zpl_xattr_acl_default_handler);
-#endif /* CONFIG_FS_POSIX_ACL */
-
-	return (NULL);
-}
-
-#if !defined(HAVE_POSIX_ACL_RELEASE) || defined(HAVE_POSIX_ACL_RELEASE_GPL_ONLY)
-struct acl_rel_struct {
-	struct acl_rel_struct *next;
-	struct posix_acl *acl;
-	clock_t time;
-};
-
-#define	ACL_REL_GRACE	(60*HZ)
-#define	ACL_REL_WINDOW	(1*HZ)
-#define	ACL_REL_SCHED	(ACL_REL_GRACE+ACL_REL_WINDOW)
-
-/*
- * Lockless multi-producer single-consumer fifo list.
- * Nodes are added to tail and removed from head. Tail pointer is our
- * synchronization point. It always points to the next pointer of the last
- * node, or head if list is empty.
- */
-static struct acl_rel_struct *acl_rel_head = NULL;
-static struct acl_rel_struct **acl_rel_tail = &acl_rel_head;
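The comment above states the synchronization contract: acl_rel_tail always addresses the next pointer of the last node, or acl_rel_head itself when the list is empty, so producers enqueue with a single atomic exchange. A minimal userspace rendering of the producer side using C11 atomics (illustrative names, not the kernel code):

/* Lockless MPSC list: producers exchange the tail, then link through it. */
#include <stdatomic.h>
#include <stdio.h>

struct node {
	struct node *next;
	int val;
};

static struct node *head;
/*
 * tail always addresses the next pointer of the last node, or head
 * itself when the list is empty: the invariant from the comment above.
 */
static struct node ** _Atomic tail = &head;

static void
push(struct node *n)		/* safe from many concurrent producers */
{
	n->next = NULL;
	/* atomically claim the previous tail slot ... */
	struct node **prev = atomic_exchange(&tail, &n->next);
	/*
	 * ... then publish through it; the consumer spins until this
	 * store is visible, which is why the kernel side pairs it with
	 * READ_ONCE() and cpu_relax().
	 */
	*prev = n;
}

int
main(void)
{
	struct node a = { .val = 1 }, b = { .val = 2 };
	push(&a);
	push(&b);
	for (struct node *n = head; n != NULL; n = n->next)
		printf("%d\n", n->val);
	return (0);
}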
-
-static void
-zpl_posix_acl_free(void *arg)
-{
-	struct acl_rel_struct *freelist = NULL;
-	struct acl_rel_struct *a;
-	clock_t new_time;
-	boolean_t refire = B_FALSE;
-
-	ASSERT3P(acl_rel_head, !=, NULL);
-	while (acl_rel_head) {
-		a = acl_rel_head;
-		if (ddi_get_lbolt() - a->time >= ACL_REL_GRACE) {
-			/*
-			 * If a is the last node we need to reset tail, but we
-			 * need to use cmpxchg to make sure it is still the
-			 * last node.
-			 */
-			if (acl_rel_tail == &a->next) {
-				acl_rel_head = NULL;
-				if (cmpxchg(&acl_rel_tail, &a->next,
-				    &acl_rel_head) == &a->next) {
-					ASSERT3P(a->next, ==, NULL);
-					a->next = freelist;
-					freelist = a;
-					break;
-				}
-			}
-			/*
-			 * a is not last node, make sure next pointer is set
-			 * by the adder and advance the head.
-			 */
-			while (READ_ONCE(a->next) == NULL)
-				cpu_relax();
-			acl_rel_head = a->next;
-			a->next = freelist;
-			freelist = a;
-		} else {
-			/*
-			 * a is still in grace period. We are responsible to
-			 * reschedule the free task, since adder will only do
-			 * so if list is empty.
-			 */
-			new_time = a->time + ACL_REL_SCHED;
-			refire = B_TRUE;
-			break;
-		}
-	}
-
-	if (refire)
-		taskq_dispatch_delay(system_delay_taskq, zpl_posix_acl_free,
-		    NULL, TQ_SLEEP, new_time);
-
-	while (freelist) {
-		a = freelist;
-		freelist = a->next;
-		kfree(a->acl);
-		kmem_free(a, sizeof (struct acl_rel_struct));
-	}
-}
-
-void
-zpl_posix_acl_release_impl(struct posix_acl *acl)
-{
-	struct acl_rel_struct *a, **prev;
-
-	a = kmem_alloc(sizeof (struct acl_rel_struct), KM_SLEEP);
-	a->next = NULL;
-	a->acl = acl;
-	a->time = ddi_get_lbolt();
-	/* atomically points tail to us and get the previous tail */
-	prev = xchg(&acl_rel_tail, &a->next);
-	ASSERT3P(*prev, ==, NULL);
-	*prev = a;
-	/* if it was empty before, schedule the free task */
-	if (prev == &acl_rel_head)
-		taskq_dispatch_delay(system_delay_taskq, zpl_posix_acl_free,
-		    NULL, TQ_SLEEP, ddi_get_lbolt() + ACL_REL_SCHED);
-}
-#endif
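For completeness, a companion to the producer sketch further up: the single-consumer drain, mirroring the cmpxchg dance in the removed zpl_posix_acl_free(). It reuses the head/tail globals from that sketch; unlike the kernel version, it ignores the grace-period timestamps, the freelist batching, and the taskq rescheduling:

static struct node *
pop(void)			/* single consumer only */
{
	struct node *n = head;
	struct node **slot;

	if (n == NULL)
		return (NULL);
	slot = &n->next;
	if (atomic_load(&tail) == &n->next) {
		/*
		 * Possibly the last node: clear head first, then try to
		 * swing tail back to &head, as the kernel code does with
		 * cmpxchg(). Success means no producer ever observed
		 * &n->next, so n is fully unlinked.
		 */
		head = NULL;
		if (atomic_compare_exchange_strong(&tail, &slot, &head))
			return (n);
	}
	/*
	 * A producer owns the slot: spin until it links its node (the
	 * READ_ONCE()/cpu_relax() loop in the original), then advance.
	 */
	while (*(struct node * volatile *)&n->next == NULL)
		;
	head = n->next;
	return (n);
}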