diff options
author | Brian Behlendorf <[email protected]> | 2010-05-28 13:45:14 -0700 |
---|---|---|
committer | Brian Behlendorf <[email protected]> | 2010-05-28 13:45:14 -0700 |
commit | 428870ff734fdaccc342b33fc53cf94724409a46 (patch) | |
tree | 164e83c0ceda52a843795ed7cd9e95637d02c177 /module/zfs/include | |
parent | 6119cb885a976e175a6e827894accf657ff1984f (diff) |
Update core ZFS code from build 121 to build 141.
Diffstat (limited to 'module/zfs/include')
55 files changed, 2709 insertions, 773 deletions
diff --git a/module/zfs/include/sys/arc.h b/module/zfs/include/sys/arc.h index 6e5955b7c..8f189c62d 100644 --- a/module/zfs/include/sys/arc.h +++ b/module/zfs/include/sys/arc.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_ARC_H @@ -48,7 +47,8 @@ arc_done_func_t arc_getbuf_func; struct arc_buf { arc_buf_hdr_t *b_hdr; arc_buf_t *b_next; - krwlock_t b_lock; + kmutex_t b_evict_lock; + krwlock_t b_data_lock; void *b_data; arc_evict_func_t *b_efunc; void *b_private; @@ -87,10 +87,13 @@ arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type); arc_buf_t *arc_loan_buf(spa_t *spa, int size); void arc_return_buf(arc_buf_t *buf, void *tag); +void arc_loan_inuse_buf(arc_buf_t *buf, void *tag); void arc_buf_add_ref(arc_buf_t *buf, void *tag); int arc_buf_remove_ref(arc_buf_t *buf, void *tag); int arc_buf_size(arc_buf_t *buf); void arc_release(arc_buf_t *buf, void *tag); +int arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa, + zbookmark_t *zb); int arc_released(arc_buf_t *buf); int arc_has_callback(arc_buf_t *buf); void arc_buf_freeze(arc_buf_t *buf); @@ -99,28 +102,16 @@ void arc_buf_thaw(arc_buf_t *buf); int arc_referenced(arc_buf_t *buf); #endif -typedef struct writeprops { - dmu_object_type_t wp_type; - uint8_t wp_level; - uint8_t wp_copies; - uint8_t wp_dncompress, wp_oscompress; - uint8_t wp_dnchecksum, wp_oschecksum; -} writeprops_t; - -void write_policy(spa_t *spa, const writeprops_t *wp, zio_prop_t *zp); -int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf, +int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf, arc_done_func_t *done, void *private, int priority, int zio_flags, uint32_t *arc_flags, const zbookmark_t *zb); -int arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, +int arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, void *private, int priority, int flags, uint32_t *arc_flags, const zbookmark_t *zb); -zio_t *arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp, - boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, - arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority, - int zio_flags, const zbookmark_t *zb); -int arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - zio_done_func_t *done, void *private, uint32_t arc_flags); -int arc_tryread(spa_t *spa, blkptr_t *bp, void *data); +zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, + blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, + arc_done_func_t *ready, arc_done_func_t *done, void *private, + int priority, int zio_flags, const zbookmark_t *zb); void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private); int arc_buf_evict(arc_buf_t *buf); diff --git a/module/zfs/include/sys/bplist.h b/module/zfs/include/sys/bplist.h index cdb93a6c3..471be9047 100644 --- a/module/zfs/include/sys/bplist.h +++ b/module/zfs/include/sys/bplist.h @@ -19,68 +19,36 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_BPLIST_H #define _SYS_BPLIST_H -#include <sys/dmu.h> -#include <sys/spa.h> -#include <sys/txg.h> #include <sys/zfs_context.h> +#include <sys/spa.h> #ifdef __cplusplus extern "C" { #endif -typedef struct bplist_phys { - /* - * This is the bonus buffer for the dead lists. The object's - * contents is an array of bpl_entries blkptr_t's, representing - * a total of bpl_bytes physical space. - */ - uint64_t bpl_entries; - uint64_t bpl_bytes; - uint64_t bpl_comp; - uint64_t bpl_uncomp; -} bplist_phys_t; - -#define BPLIST_SIZE_V0 (2 * sizeof (uint64_t)) - -typedef struct bplist_q { - blkptr_t bpq_blk; - void *bpq_next; -} bplist_q_t; +typedef struct bplist_entry { + blkptr_t bpe_blk; + list_node_t bpe_node; +} bplist_entry_t; typedef struct bplist { kmutex_t bpl_lock; - objset_t *bpl_mos; - uint64_t bpl_object; - uint8_t bpl_blockshift; - uint8_t bpl_bpshift; - uint8_t bpl_havecomp; - bplist_q_t *bpl_queue; - bplist_phys_t *bpl_phys; - dmu_buf_t *bpl_dbuf; - dmu_buf_t *bpl_cached_dbuf; + list_t bpl_list; } bplist_t; -extern uint64_t bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx); -extern void bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx); -extern int bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object); -extern void bplist_close(bplist_t *bpl); -extern boolean_t bplist_empty(bplist_t *bpl); -extern int bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp); -extern int bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx); -extern void bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp); -extern void bplist_sync(bplist_t *bpl, dmu_tx_t *tx); -extern void bplist_vacate(bplist_t *bpl, dmu_tx_t *tx); -extern int bplist_space(bplist_t *bpl, - uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); -extern int bplist_space_birthrange(bplist_t *bpl, - uint64_t mintxg, uint64_t maxtxg, uint64_t *dasizep); +typedef int bplist_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx); + +void bplist_create(bplist_t *bpl); +void bplist_destroy(bplist_t *bpl); +void bplist_append(bplist_t *bpl, const blkptr_t *bp); +void bplist_iterate(bplist_t *bpl, bplist_itor_t *func, + void *arg, dmu_tx_t *tx); #ifdef __cplusplus } diff --git a/module/zfs/include/sys/bpobj.h b/module/zfs/include/sys/bpobj.h new file mode 100644 index 000000000..3771a9541 --- /dev/null +++ b/module/zfs/include/sys/bpobj.h @@ -0,0 +1,91 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _SYS_BPOBJ_H +#define _SYS_BPOBJ_H + +#include <sys/dmu.h> +#include <sys/spa.h> +#include <sys/txg.h> +#include <sys/zio.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct bpobj_phys { + /* + * This is the bonus buffer for the dead lists. The object's + * contents is an array of bpo_entries blkptr_t's, representing + * a total of bpo_bytes physical space. + */ + uint64_t bpo_num_blkptrs; + uint64_t bpo_bytes; + uint64_t bpo_comp; + uint64_t bpo_uncomp; + uint64_t bpo_subobjs; + uint64_t bpo_num_subobjs; +} bpobj_phys_t; + +#define BPOBJ_SIZE_V0 (2 * sizeof (uint64_t)) +#define BPOBJ_SIZE_V1 (4 * sizeof (uint64_t)) + +typedef struct bpobj { + kmutex_t bpo_lock; + objset_t *bpo_os; + uint64_t bpo_object; + int bpo_epb; + uint8_t bpo_havecomp; + uint8_t bpo_havesubobj; + bpobj_phys_t *bpo_phys; + dmu_buf_t *bpo_dbuf; + dmu_buf_t *bpo_cached_dbuf; +} bpobj_t; + +typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx); + +uint64_t bpobj_alloc(objset_t *mos, int blocksize, dmu_tx_t *tx); +void bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx); + +int bpobj_open(bpobj_t *bpo, objset_t *mos, uint64_t object); +void bpobj_close(bpobj_t *bpo); + +int bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx); +int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, dmu_tx_t *); +int bpobj_iterate_dbg(bpobj_t *bpo, uint64_t *itorp, blkptr_t *bp); + +void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx); +void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx); + +int bpobj_space(bpobj_t *bpo, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); +int bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BPOBJ_H */ diff --git a/module/zfs/include/sys/dbuf.h b/module/zfs/include/sys/dbuf.h index 267852519..4c05806e3 100644 --- a/module/zfs/include/sys/dbuf.h +++ b/module/zfs/include/sys/dbuf.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_DBUF_H @@ -38,7 +37,6 @@ extern "C" { #endif -#define DB_BONUS_BLKID (-1ULL) #define IN_DMU_SYNC 2 /* @@ -75,7 +73,6 @@ typedef enum dbuf_states { DB_EVICTING } dbuf_states_t; -struct objset_impl; struct dnode; struct dmu_tx; @@ -134,6 +131,7 @@ typedef struct dbuf_dirty_record { arc_buf_t *dr_data; blkptr_t dr_overridden_by; override_states_t dr_override_state; + uint8_t dr_copies; } dl; } dt; } dbuf_dirty_record_t; @@ -148,7 +146,7 @@ typedef struct dmu_buf_impl { dmu_buf_t db; /* the objset we belong to */ - struct objset_impl *db_objset; + struct objset *db_objset; /* * the dnode we belong to (NULL when evicted) @@ -242,6 +240,10 @@ uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset); dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data); void dbuf_create_bonus(struct dnode *dn); +int dbuf_spill_set_blksz(dmu_buf_t *db, uint64_t blksz, dmu_tx_t *tx); +void dbuf_spill_hold(struct dnode *dn, dmu_buf_impl_t **dbp, void *tag); + +void dbuf_rm_spill(struct dnode *dn, dmu_tx_t *tx); dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag); dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid, @@ -255,6 +257,7 @@ void dbuf_add_ref(dmu_buf_impl_t *db, void *tag); uint64_t dbuf_refcount(dmu_buf_impl_t *db); void dbuf_rele(dmu_buf_impl_t *db, void *tag); +void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag); dmu_buf_impl_t *dbuf_find(struct dnode *dn, uint8_t level, uint64_t blkid); @@ -266,6 +269,7 @@ void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx); void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx); dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); +arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db); void dbuf_clear(dmu_buf_impl_t *db); void dbuf_evict(dmu_buf_impl_t *db); @@ -273,6 +277,7 @@ void dbuf_evict(dmu_buf_impl_t *db); void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx); void dbuf_unoverride(dbuf_dirty_record_t *dr); void dbuf_sync_list(list_t *list, dmu_tx_t *tx); +void dbuf_release_bp(dmu_buf_impl_t *db); void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end, struct dmu_tx *); @@ -324,7 +329,7 @@ _NOTE(CONSTCOND) } while (0) #define dprintf_dbuf_bp(db, bp, fmt, ...) do { \ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \ - sprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, bp); \ + sprintf_blkptr(__blkbuf, bp); \ dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf); \ kmem_free(__blkbuf, BP_SPRINTF_LEN); \ } \ diff --git a/module/zfs/include/sys/ddt.h b/module/zfs/include/sys/ddt.h new file mode 100644 index 000000000..9724d6ece --- /dev/null +++ b/module/zfs/include/sys/ddt.h @@ -0,0 +1,246 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _SYS_DDT_H +#define _SYS_DDT_H + +#include <sys/sysmacros.h> +#include <sys/types.h> +#include <sys/fs/zfs.h> +#include <sys/zio.h> +#include <sys/dmu.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * On-disk DDT formats, in the desired search order (newest version first). + */ +enum ddt_type { + DDT_TYPE_ZAP = 0, + DDT_TYPES +}; + +/* + * DDT classes, in the desired search order (highest replication level first). + */ +enum ddt_class { + DDT_CLASS_DITTO = 0, + DDT_CLASS_DUPLICATE, + DDT_CLASS_UNIQUE, + DDT_CLASSES +}; + +#define DDT_TYPE_CURRENT 0 + +#define DDT_COMPRESS_BYTEORDER_MASK 0x80 +#define DDT_COMPRESS_FUNCTION_MASK 0x7f + +/* + * On-disk ddt entry: key (name) and physical storage (value). + */ +typedef struct ddt_key { + zio_cksum_t ddk_cksum; /* 256-bit block checksum */ + uint64_t ddk_prop; /* LSIZE, PSIZE, compression */ +} ddt_key_t; + +/* + * ddk_prop layout: + * + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * | 0 | 0 | 0 | comp | PSIZE | LSIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + */ +#define DDK_GET_LSIZE(ddk) \ + BF64_GET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1) +#define DDK_SET_LSIZE(ddk, x) \ + BF64_SET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x) + +#define DDK_GET_PSIZE(ddk) \ + BF64_GET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1) +#define DDK_SET_PSIZE(ddk, x) \ + BF64_SET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x) + +#define DDK_GET_COMPRESS(ddk) BF64_GET((ddk)->ddk_prop, 32, 8) +#define DDK_SET_COMPRESS(ddk, x) BF64_SET((ddk)->ddk_prop, 32, 8, x) + +#define DDT_KEY_WORDS (sizeof (ddt_key_t) / sizeof (uint64_t)) + +typedef struct ddt_phys { + dva_t ddp_dva[SPA_DVAS_PER_BP]; + uint64_t ddp_refcnt; + uint64_t ddp_phys_birth; +} ddt_phys_t; + +enum ddt_phys_type { + DDT_PHYS_DITTO = 0, + DDT_PHYS_SINGLE = 1, + DDT_PHYS_DOUBLE = 2, + DDT_PHYS_TRIPLE = 3, + DDT_PHYS_TYPES +}; + +/* + * In-core ddt entry + */ +struct ddt_entry { + ddt_key_t dde_key; + ddt_phys_t dde_phys[DDT_PHYS_TYPES]; + zio_t *dde_lead_zio[DDT_PHYS_TYPES]; + void *dde_repair_data; + enum ddt_type dde_type; + enum ddt_class dde_class; + uint8_t dde_loading; + uint8_t dde_loaded; + kcondvar_t dde_cv; + avl_node_t dde_node; +}; + +/* + * In-core ddt + */ +struct ddt { + kmutex_t ddt_lock; + avl_tree_t ddt_tree; + avl_tree_t ddt_repair_tree; + enum zio_checksum ddt_checksum; + spa_t *ddt_spa; + objset_t *ddt_os; + uint64_t ddt_stat_object; + uint64_t ddt_object[DDT_TYPES][DDT_CLASSES]; + ddt_histogram_t ddt_histogram[DDT_TYPES][DDT_CLASSES]; + ddt_histogram_t ddt_histogram_cache[DDT_TYPES][DDT_CLASSES]; + ddt_object_t ddt_object_stats[DDT_TYPES][DDT_CLASSES]; + avl_node_t ddt_node; +}; + +/* + * In-core and on-disk bookmark for DDT walks + */ +typedef struct ddt_bookmark { + uint64_t ddb_class; + uint64_t ddb_type; + uint64_t ddb_checksum; + uint64_t ddb_cursor; +} ddt_bookmark_t; + +/* + * Ops vector to access a specific DDT object type. + */ +typedef struct ddt_ops { + char ddt_op_name[32]; + int (*ddt_op_create)(objset_t *os, uint64_t *object, dmu_tx_t *tx, + boolean_t prehash); + int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx); + int (*ddt_op_lookup)(objset_t *os, uint64_t object, ddt_entry_t *dde); + void (*ddt_op_prefetch)(objset_t *os, uint64_t object, + ddt_entry_t *dde); + int (*ddt_op_update)(objset_t *os, uint64_t object, ddt_entry_t *dde, + dmu_tx_t *tx); + int (*ddt_op_remove)(objset_t *os, uint64_t object, ddt_entry_t *dde, + dmu_tx_t *tx); + int (*ddt_op_walk)(objset_t *os, uint64_t object, ddt_entry_t *dde, + uint64_t *walk); + uint64_t (*ddt_op_count)(objset_t *os, uint64_t object); +} ddt_ops_t; + +#define DDT_NAMELEN 80 + +extern void ddt_object_name(ddt_t *ddt, enum ddt_type type, + enum ddt_class class, char *name); +extern int ddt_object_walk(ddt_t *ddt, enum ddt_type type, + enum ddt_class class, uint64_t *walk, ddt_entry_t *dde); +extern uint64_t ddt_object_count(ddt_t *ddt, enum ddt_type type, + enum ddt_class class); +extern int ddt_object_info(ddt_t *ddt, enum ddt_type type, + enum ddt_class class, dmu_object_info_t *); +extern boolean_t ddt_object_exists(ddt_t *ddt, enum ddt_type type, + enum ddt_class class); + +extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, + uint64_t txg); +extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk, + const ddt_phys_t *ddp, blkptr_t *bp); + +extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp); + +extern void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp); +extern void ddt_phys_clear(ddt_phys_t *ddp); +extern void ddt_phys_addref(ddt_phys_t *ddp); +extern void ddt_phys_decref(ddt_phys_t *ddp); +extern void ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, + uint64_t txg); +extern ddt_phys_t *ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp); +extern uint64_t ddt_phys_total_refcnt(const ddt_entry_t *dde); + +extern void ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg); + +extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src); +extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh); +extern boolean_t ddt_histogram_empty(const ddt_histogram_t *ddh); +extern void ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo); +extern void ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh); +extern void ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total); + +extern uint64_t ddt_get_dedup_dspace(spa_t *spa); +extern uint64_t ddt_get_pool_dedup_ratio(spa_t *spa); + +extern int ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, + ddt_phys_t *ddp_willref); +extern int ddt_ditto_copies_present(ddt_entry_t *dde); + +extern size_t ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len); +extern void ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len); + +extern ddt_t *ddt_select(spa_t *spa, const blkptr_t *bp); +extern void ddt_enter(ddt_t *ddt); +extern void ddt_exit(ddt_t *ddt); +extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add); +extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp); +extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde); + +extern boolean_t ddt_class_contains(spa_t *spa, enum ddt_class max_class, + const blkptr_t *bp); + +extern ddt_entry_t *ddt_repair_start(ddt_t *ddt, const blkptr_t *bp); +extern void ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde); + +extern int ddt_entry_compare(const void *x1, const void *x2); + +extern void ddt_create(spa_t *spa); +extern int ddt_load(spa_t *spa); +extern void ddt_unload(spa_t *spa); +extern void ddt_sync(spa_t *spa, uint64_t txg); +extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde); +extern int ddt_object_update(ddt_t *ddt, enum ddt_type type, + enum ddt_class class, ddt_entry_t *dde, dmu_tx_t *tx); + +extern const ddt_ops_t ddt_zap_ops; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DDT_H */ diff --git a/module/zfs/include/sys/dmu.h b/module/zfs/include/sys/dmu.h index 3ff71b3b7..83932f467 100644 --- a/module/zfs/include/sys/dmu.h +++ b/module/zfs/include/sys/dmu.h @@ -19,10 +19,11 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ +/* Portions Copyright 2010 Robert Milkowski */ + #ifndef _SYS_DMU_H #define _SYS_DMU_H @@ -38,12 +39,14 @@ #include <sys/types.h> #include <sys/param.h> #include <sys/cred.h> +#include <sys/time.h> #ifdef __cplusplus extern "C" { #endif struct uio; +struct xuio; struct page; struct vnode; struct spa; @@ -59,8 +62,9 @@ struct drr_end; struct zbookmark; struct spa; struct nvlist; -struct objset_impl; struct arc_buf; +struct zio_prop; +struct sa_handle; typedef struct objset objset_t; typedef struct dmu_tx dmu_tx_t; @@ -73,8 +77,8 @@ typedef enum dmu_object_type { DMU_OT_OBJECT_ARRAY, /* UINT64 */ DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */ DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */ - DMU_OT_BPLIST, /* UINT64 */ - DMU_OT_BPLIST_HDR, /* UINT64 */ + DMU_OT_BPOBJ, /* UINT64 */ + DMU_OT_BPOBJ_HDR, /* UINT64 */ /* spa: */ DMU_OT_SPACE_MAP_HEADER, /* UINT64 */ DMU_OT_SPACE_MAP, /* UINT64 */ @@ -114,10 +118,22 @@ typedef enum dmu_object_type { DMU_OT_FUID, /* FUID table (Packed NVLIST UINT8) */ DMU_OT_FUID_SIZE, /* FUID table size UINT64 */ DMU_OT_NEXT_CLONES, /* ZAP */ - DMU_OT_SCRUB_QUEUE, /* ZAP */ + DMU_OT_SCAN_QUEUE, /* ZAP */ DMU_OT_USERGROUP_USED, /* ZAP */ DMU_OT_USERGROUP_QUOTA, /* ZAP */ DMU_OT_USERREFS, /* ZAP */ + DMU_OT_DDT_ZAP, /* ZAP */ + DMU_OT_DDT_STATS, /* ZAP */ + DMU_OT_SA, /* System attr */ + DMU_OT_SA_MASTER_NODE, /* ZAP */ + DMU_OT_SA_ATTR_REGISTRATION, /* ZAP */ + DMU_OT_SA_ATTR_LAYOUTS, /* ZAP */ + DMU_OT_SCAN_XLATE, /* ZAP */ + DMU_OT_DEDUP, /* fake dedup BP from ddt_bp_create() */ + DMU_OT_DEADLIST, /* ZAP */ + DMU_OT_DEADLIST_HDR, /* UINT64 */ + DMU_OT_DSL_CLONES, /* ZAP */ + DMU_OT_BPOBJ_SUBOBJ, /* UINT64 */ DMU_OT_NUMTYPES } dmu_object_type_t; @@ -140,16 +156,6 @@ void zfs_oldacl_byteswap(void *buf, size_t size); void zfs_acl_byteswap(void *buf, size_t size); void zfs_znode_byteswap(void *buf, size_t size); -#define DS_MODE_NOHOLD 0 /* internal use only */ -#define DS_MODE_USER 1 /* simple access, no special needs */ -#define DS_MODE_OWNER 2 /* the "main" access, e.g. a mount */ -#define DS_MODE_TYPE_MASK 0x3 -#define DS_MODE_TYPE(x) ((x) & DS_MODE_TYPE_MASK) -#define DS_MODE_READONLY 0x8 -#define DS_MODE_IS_READONLY(x) ((x) & DS_MODE_READONLY) -#define DS_MODE_INCONSISTENT 0x10 -#define DS_MODE_IS_INCONSISTENT(x) ((x) & DS_MODE_INCONSISTENT) - #define DS_FIND_SNAPSHOTS (1<<0) #define DS_FIND_CHILDREN (1<<1) @@ -162,27 +168,35 @@ void zfs_znode_byteswap(void *buf, size_t size); #define DMU_USERUSED_OBJECT (-1ULL) #define DMU_GROUPUSED_OBJECT (-2ULL) +#define DMU_DEADLIST_OBJECT (-3ULL) /* + * artificial blkids for bonus buffer and spill blocks + */ +#define DMU_BONUS_BLKID (-1ULL) +#define DMU_SPILL_BLKID (-2ULL) +/* * Public routines to create, destroy, open, and close objsets. */ -int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, - objset_t **osp); -int dmu_objset_open_ds(struct dsl_dataset *ds, dmu_objset_type_t type, - objset_t **osp); -void dmu_objset_close(objset_t *os); +int dmu_objset_hold(const char *name, void *tag, objset_t **osp); +int dmu_objset_own(const char *name, dmu_objset_type_t type, + boolean_t readonly, void *tag, objset_t **osp); +void dmu_objset_rele(objset_t *os, void *tag); +void dmu_objset_disown(objset_t *os, void *tag); +int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp); + int dmu_objset_evict_dbufs(objset_t *os); -int dmu_objset_create(const char *name, dmu_objset_type_t type, - objset_t *clone_parent, uint64_t flags, +int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg); +int dmu_objset_clone(const char *name, struct dsl_dataset *clone_origin, + uint64_t flags); int dmu_objset_destroy(const char *name, boolean_t defer); int dmu_snapshots_destroy(char *fsname, char *snapname, boolean_t defer); -int dmu_objset_rollback(objset_t *os); int dmu_objset_snapshot(char *fsname, char *snapname, struct nvlist *props, boolean_t recursive); int dmu_objset_rename(const char *name, const char *newname, boolean_t recursive); -int dmu_objset_find(char *name, int func(char *, void *), void *arg, +int dmu_objset_find(char *name, int func(const char *, void *), void *arg, int flags); void dmu_objset_byteswap(void *buf, size_t size); @@ -201,7 +215,7 @@ typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); #define DMU_POOL_DIRECTORY_OBJECT 1 #define DMU_POOL_CONFIG "config" #define DMU_POOL_ROOT_DATASET "root_dataset" -#define DMU_POOL_SYNC_BPLIST "sync_bplist" +#define DMU_POOL_SYNC_BPOBJ "sync_bplist" #define DMU_POOL_ERRLOG_SCRUB "errlog_scrub" #define DMU_POOL_ERRLOG_LAST "errlog_last" #define DMU_POOL_SPARES "spares" @@ -209,19 +223,12 @@ typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); #define DMU_POOL_HISTORY "history" #define DMU_POOL_PROPS "pool_props" #define DMU_POOL_L2CACHE "l2cache" - -/* 4x8 zbookmark_t */ -#define DMU_POOL_SCRUB_BOOKMARK "scrub_bookmark" -/* 1x8 zap obj DMU_OT_SCRUB_QUEUE */ -#define DMU_POOL_SCRUB_QUEUE "scrub_queue" -/* 1x8 txg */ -#define DMU_POOL_SCRUB_MIN_TXG "scrub_min_txg" -/* 1x8 txg */ -#define DMU_POOL_SCRUB_MAX_TXG "scrub_max_txg" -/* 1x4 enum scrub_func */ -#define DMU_POOL_SCRUB_FUNC "scrub_func" -/* 1x8 count */ -#define DMU_POOL_SCRUB_ERRORS "scrub_errors" +#define DMU_POOL_TMP_USERREFS "tmp_userrefs" +#define DMU_POOL_DDT "DDT-%s-%s-%s" +#define DMU_POOL_DDT_STATS "DDT-statistics" +#define DMU_POOL_CREATION_VERSION "creation_version" +#define DMU_POOL_SCAN "scan" +#define DMU_POOL_FREE_BPOBJ "free_bpobj" /* * Allocate an object from this objset. The range of object numbers @@ -306,11 +313,14 @@ void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, dmu_tx_t *tx); /* - * Decide how many copies of a given block we should make. Can be from - * 1 to SPA_DVAS_PER_BP. + * Decide how to write a block: checksum, compression, number of copies, etc. */ -int dmu_get_replication_level(struct objset_impl *, struct zbookmark *zb, - dmu_object_type_t ot); +#define WP_NOFILL 0x1 +#define WP_DMU_SYNC 0x2 +#define WP_SPILL 0x4 + +void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp, + struct zio_prop *zp); /* * The bonus data is accessed more or less like a regular buffer. * You must dmu_bonus_hold() to get the buffer, which will give you a @@ -324,6 +334,17 @@ int dmu_get_replication_level(struct objset_impl *, struct zbookmark *zb, int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **); int dmu_bonus_max(void); int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *); +int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *); +int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *); + +/* + * Special spill buffer support used by "SA" framework + */ + +int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp); +int dmu_spill_hold_by_dnode(struct dnode *dn, uint32_t flags, + void *tag, dmu_buf_t **dbp); +int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp); /* * Obtain the DMU buffer from the specified object which contains the @@ -340,7 +361,7 @@ int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *); * The object number must be a valid, allocated object number. */ int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, - void *tag, dmu_buf_t **); + void *tag, dmu_buf_t **, int flags); void dmu_buf_add_ref(dmu_buf_t *db, void* tag); void dmu_buf_rele(dmu_buf_t *db, void *tag); uint64_t dmu_buf_refcount(dmu_buf_t *db); @@ -437,12 +458,35 @@ void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len); void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name); void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object); +void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object); +void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow); +void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size); void dmu_tx_abort(dmu_tx_t *tx); int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); void dmu_tx_wait(dmu_tx_t *tx); void dmu_tx_commit(dmu_tx_t *tx); /* + * To register a commit callback, dmu_tx_callback_register() must be called. + * + * dcb_data is a pointer to caller private data that is passed on as a + * callback parameter. The caller is responsible for properly allocating and + * freeing it. + * + * When registering a callback, the transaction must be already created, but + * it cannot be committed or aborted. It can be assigned to a txg or not. + * + * The callback will be called after the transaction has been safely written + * to stable storage and will also be called if the dmu_tx is aborted. + * If there is any error which prevents the transaction from being committed to + * disk, the callback will be called with a value of error != 0. + */ +typedef void dmu_tx_callback_func_t(void *dcb_data, int error); + +void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func, + void *dcb_data); + +/* * Free up the data blocks for a defined range of a file. If size is * zero, the range from offset to end-of-file is freed. */ @@ -469,12 +513,23 @@ void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size); int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size, dmu_tx_t *tx); +int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size, + dmu_tx_t *tx); int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, struct page *pp, dmu_tx_t *tx); struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size); void dmu_return_arcbuf(struct arc_buf *buf); void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf, dmu_tx_t *tx); +int dmu_xuio_init(struct xuio *uio, int niov); +void dmu_xuio_fini(struct xuio *uio); +int dmu_xuio_add(struct xuio *uio, struct arc_buf *abuf, offset_t off, + size_t n); +int dmu_xuio_cnt(struct xuio *uio); +struct arc_buf *dmu_xuio_arcbuf(struct xuio *uio, int i); +void dmu_xuio_clear(struct xuio *uio, int i); +void xuio_stat_wbuf_copied(); +void xuio_stat_wbuf_nocopy(); extern int zfs_prefetch_disable; @@ -485,19 +540,19 @@ void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len); typedef struct dmu_object_info { - /* All sizes are in bytes. */ + /* All sizes are in bytes unless otherwise indicated. */ uint32_t doi_data_block_size; uint32_t doi_metadata_block_size; - uint64_t doi_bonus_size; dmu_object_type_t doi_type; dmu_object_type_t doi_bonus_type; + uint64_t doi_bonus_size; uint8_t doi_indirection; /* 2 = dnode->indirect->data */ uint8_t doi_checksum; uint8_t doi_compress; uint8_t doi_pad[5]; - /* Values below are number of 512-byte blocks. */ - uint64_t doi_physical_blks; /* data + metadata */ - uint64_t doi_max_block_offset; + uint64_t doi_physical_blocks_512; /* data + metadata, 512b blks */ + uint64_t doi_max_offset; + uint64_t doi_fill_count; /* number of non-empty blocks */ } dmu_object_info_t; typedef void arc_byteswap_func_t(void *buf, size_t size); @@ -566,6 +621,11 @@ void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, */ uint64_t dmu_objset_fsid_guid(objset_t *os); +/* + * Get the [cm]time for an objset's snapshot dir + */ +timestruc_t dmu_objset_snap_cmtime(objset_t *os); + int dmu_objset_is_snapshot(objset_t *os); extern struct spa *dmu_objset_spa(objset_t *os); @@ -575,6 +635,8 @@ extern struct dsl_dataset *dmu_objset_ds(objset_t *os); extern void dmu_objset_name(objset_t *os, char *buf); extern dmu_objset_type_t dmu_objset_type(objset_t *os); extern uint64_t dmu_objset_id(objset_t *os); +extern uint64_t dmu_objset_syncprop(objset_t *os); +extern uint64_t dmu_objset_logbias(objset_t *os); extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, uint64_t *id, uint64_t *offp, boolean_t *case_conflict); extern int dmu_snapshot_realname(objset_t *os, char *name, char *real, @@ -582,9 +644,8 @@ extern int dmu_snapshot_realname(objset_t *os, char *name, char *real, extern int dmu_dir_list_next(objset_t *os, int namelen, char *name, uint64_t *idp, uint64_t *offp); -typedef void objset_used_cb_t(objset_t *os, dmu_object_type_t bonustype, - void *oldbonus, void *newbonus, uint64_t oldused, uint64_t newused, - dmu_tx_t *tx); +typedef int objset_used_cb_t(dmu_object_type_t bonustype, + void *bonus, uint64_t *userp, uint64_t *groupp); extern void dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb); extern void dmu_objset_set_user(objset_t *os, void *user_ptr); @@ -605,9 +666,20 @@ uint64_t dmu_tx_get_txg(dmu_tx_t *tx); * storage when the write completes this new data does not become a * permanent part of the file until the associated transaction commits. */ -typedef void dmu_sync_cb_t(dmu_buf_t *db, void *arg); -int dmu_sync(struct zio *zio, dmu_buf_t *db, - struct blkptr *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg); + +/* + * {zfs,zvol,ztest}_get_done() args + */ +typedef struct zgd { + struct zilog *zgd_zilog; + struct blkptr *zgd_bp; + dmu_buf_t *zgd_db; + struct rl *zgd_rl; + void *zgd_private; +} zgd_t; + +typedef void dmu_sync_cb_t(zgd_t *arg, int error); +int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd); /* * Find the next hole or data block in file starting at *off @@ -642,11 +714,12 @@ typedef struct dmu_recv_cookie { struct dsl_dataset *drc_real_ds; struct drr_begin *drc_drrb; char *drc_tosnap; + char *drc_top_ds; boolean_t drc_newfs; boolean_t drc_force; } dmu_recv_cookie_t; -int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *, +int dmu_recv_begin(char *tofs, char *tosnap, char *topds, struct drr_begin *, boolean_t force, objset_t *origin, dmu_recv_cookie_t *); int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp); int dmu_recv_end(dmu_recv_cookie_t *drc); diff --git a/module/zfs/include/sys/dmu_impl.h b/module/zfs/include/sys/dmu_impl.h index 3868a5816..22f9f5f8c 100644 --- a/module/zfs/include/sys/dmu_impl.h +++ b/module/zfs/include/sys/dmu_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -210,8 +210,7 @@ extern "C" { * * ds_lock * protects: - * ds_user_ptr - * ds_user_evict_func + * ds_objset * ds_open_refcount * ds_snapname * ds_phys accounting @@ -233,6 +232,39 @@ extern "C" { struct objset; struct dmu_pool; +typedef struct dmu_xuio { + int next; + int cnt; + struct arc_buf **bufs; + iovec_t *iovp; +} dmu_xuio_t; + +typedef struct xuio_stats { + /* loaned yet not returned arc_buf */ + kstat_named_t xuiostat_onloan_rbuf; + kstat_named_t xuiostat_onloan_wbuf; + /* whether a copy is made when loaning out a read buffer */ + kstat_named_t xuiostat_rbuf_copied; + kstat_named_t xuiostat_rbuf_nocopy; + /* whether a copy is made when assigning a write buffer */ + kstat_named_t xuiostat_wbuf_copied; + kstat_named_t xuiostat_wbuf_nocopy; +} xuio_stats_t; + +static xuio_stats_t xuio_stats = { + { "onloan_read_buf", KSTAT_DATA_UINT64 }, + { "onloan_write_buf", KSTAT_DATA_UINT64 }, + { "read_buf_copied", KSTAT_DATA_UINT64 }, + { "read_buf_nocopy", KSTAT_DATA_UINT64 }, + { "write_buf_copied", KSTAT_DATA_UINT64 }, + { "write_buf_nocopy", KSTAT_DATA_UINT64 } +}; + +#define XUIOSTAT_INCR(stat, val) \ + atomic_add_64(&xuio_stats.stat.value.ui64, (val)) +#define XUIOSTAT_BUMP(stat) XUIOSTAT_INCR(stat, 1) + + #ifdef __cplusplus } #endif diff --git a/module/zfs/include/sys/dmu_objset.h b/module/zfs/include/sys/dmu_objset.h index 052cb8dd9..5c5119a20 100644 --- a/module/zfs/include/sys/dmu_objset.h +++ b/module/zfs/include/sys/dmu_objset.h @@ -19,10 +19,11 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ +/* Portions Copyright 2010 Robert Milkowski */ + #ifndef _SYS_DMU_OBJSET_H #define _SYS_DMU_OBJSET_H @@ -33,6 +34,7 @@ #include <sys/dnode.h> #include <sys/zio.h> #include <sys/zil.h> +#include <sys/sa.h> #ifdef __cplusplus extern "C" { @@ -40,11 +42,13 @@ extern "C" { struct dsl_dataset; struct dmu_tx; -struct objset_impl; #define OBJSET_PHYS_SIZE 2048 #define OBJSET_OLD_PHYS_SIZE 1024 +#define OBJSET_BUF_HAS_USERUSED(buf) \ + (arc_buf_size(buf) > OBJSET_OLD_PHYS_SIZE) + #define OBJSET_FLAG_USERACCOUNTING_COMPLETE (1ULL<<0) typedef struct objset_phys { @@ -59,11 +63,6 @@ typedef struct objset_phys { } objset_phys_t; struct objset { - struct objset_impl *os; - int os_mode; -}; - -typedef struct objset_impl { /* Immutable: */ struct dsl_dataset *os_dsl_dataset; spa_t *os_spa; @@ -73,12 +72,17 @@ typedef struct objset_impl { dnode_t *os_userused_dnode; dnode_t *os_groupused_dnode; zilog_t *os_zil; - objset_t os; - uint8_t os_checksum; /* can change, under dsl_dir's locks */ - uint8_t os_compress; /* can change, under dsl_dir's locks */ - uint8_t os_copies; /* can change, under dsl_dir's locks */ - uint8_t os_primary_cache; /* can change, under dsl_dir's locks */ - uint8_t os_secondary_cache; /* can change, under dsl_dir's locks */ + + /* can change, under dsl_dir's locks: */ + uint8_t os_checksum; + uint8_t os_compress; + uint8_t os_copies; + uint8_t os_dedup_checksum; + uint8_t os_dedup_verify; + uint8_t os_logbias; + uint8_t os_primary_cache; + uint8_t os_secondary_cache; + uint8_t os_sync; /* no lock needed: */ struct dmu_tx *os_synctx; /* XXX sketchy */ @@ -101,8 +105,12 @@ typedef struct objset_impl { /* stuff we store for the user */ kmutex_t os_user_ptr_lock; void *os_user_ptr; -} objset_impl_t; + /* SA layout/attribute registration */ + sa_os_t *os_sa; +}; + +#define DMU_META_OBJSET 0 #define DMU_META_DNODE_OBJECT 0 #define DMU_OBJECT_IS_SPECIAL(obj) ((int64_t)(obj) <= 0) @@ -111,14 +119,18 @@ typedef struct objset_impl { (os)->os_secondary_cache == ZFS_CACHE_METADATA) /* called from zpl */ -int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, - objset_t **osp); -void dmu_objset_close(objset_t *os); -int dmu_objset_create(const char *name, dmu_objset_type_t type, - objset_t *clone_parent, uint64_t flags, +int dmu_objset_hold(const char *name, void *tag, objset_t **osp); +int dmu_objset_own(const char *name, dmu_objset_type_t type, + boolean_t readonly, void *tag, objset_t **osp); +void dmu_objset_rele(objset_t *os, void *tag); +void dmu_objset_disown(objset_t *os, void *tag); +int dmu_objset_from_ds(struct dsl_dataset *ds, objset_t **osp); + +int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg); +int dmu_objset_clone(const char *name, struct dsl_dataset *clone_origin, + uint64_t flags); int dmu_objset_destroy(const char *name, boolean_t defer); -int dmu_objset_rollback(objset_t *os); int dmu_objset_snapshot(char *fsname, char *snapname, nvlist_t *props, boolean_t recursive); void dmu_objset_stats(objset_t *os, nvlist_t *nv); @@ -126,23 +138,26 @@ void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp); uint64_t dmu_objset_fsid_guid(objset_t *os); -int dmu_objset_find(char *name, int func(char *, void *), void *arg, +int dmu_objset_find(char *name, int func(const char *, void *), void *arg, int flags); int dmu_objset_find_spa(spa_t *spa, const char *name, int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags); -int dmu_objset_prefetch(char *name, void *arg); +int dmu_objset_prefetch(const char *name, void *arg); void dmu_objset_byteswap(void *buf, size_t size); int dmu_objset_evict_dbufs(objset_t *os); +timestruc_t dmu_objset_snap_cmtime(objset_t *os); /* called from dsl */ -void dmu_objset_sync(objset_impl_t *os, zio_t *zio, dmu_tx_t *tx); -objset_impl_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds, +void dmu_objset_sync(objset_t *os, zio_t *zio, dmu_tx_t *tx); +boolean_t dmu_objset_is_dirty(objset_t *os, uint64_t txg); +objset_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx); int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp, - objset_impl_t **osip); -void dmu_objset_evict(struct dsl_dataset *ds, void *arg); -void dmu_objset_do_userquota_callbacks(objset_impl_t *os, dmu_tx_t *tx); -boolean_t dmu_objset_userused_enabled(objset_impl_t *os); + objset_t **osp); +void dmu_objset_evict(objset_t *os); +void dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx); +void dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx); +boolean_t dmu_objset_userused_enabled(objset_t *os); int dmu_objset_userspace_upgrade(objset_t *os); boolean_t dmu_objset_userspace_present(objset_t *os); diff --git a/module/zfs/include/sys/dmu_traverse.h b/module/zfs/include/sys/dmu_traverse.h index 3e0268911..844e7f1ae 100644 --- a/module/zfs/include/sys/dmu_traverse.h +++ b/module/zfs/include/sys/dmu_traverse.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_DMU_TRAVERSE_H @@ -36,19 +35,24 @@ extern "C" { struct dnode_phys; struct dsl_dataset; +struct zilog; +struct arc_buf; -typedef int (blkptr_cb_t)(spa_t *spa, blkptr_t *bp, - const zbookmark_t *zb, const struct dnode_phys *dnp, void *arg); +typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + struct arc_buf *pbuf, const zbookmark_t *zb, const struct dnode_phys *dnp, + void *arg); #define TRAVERSE_PRE (1<<0) #define TRAVERSE_POST (1<<1) #define TRAVERSE_PREFETCH_METADATA (1<<2) #define TRAVERSE_PREFETCH_DATA (1<<3) #define TRAVERSE_PREFETCH (TRAVERSE_PREFETCH_METADATA | TRAVERSE_PREFETCH_DATA) +#define TRAVERSE_HARD (1<<4) -int traverse_dataset(struct dsl_dataset *ds, uint64_t txg_start, - int flags, blkptr_cb_t func, void *arg); -int traverse_pool(spa_t *spa, blkptr_cb_t func, void *arg); +int traverse_dataset(struct dsl_dataset *ds, + uint64_t txg_start, int flags, blkptr_cb_t func, void *arg); +int traverse_pool(spa_t *spa, + uint64_t txg_start, int flags, blkptr_cb_t func, void *arg); #ifdef __cplusplus } diff --git a/module/zfs/include/sys/dmu_tx.h b/module/zfs/include/sys/dmu_tx.h index 2727daaaa..c5ea50fa8 100644 --- a/module/zfs/include/sys/dmu_tx.h +++ b/module/zfs/include/sys/dmu_tx.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_DMU_TX_H #define _SYS_DMU_TX_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/inttypes.h> #include <sys/dmu.h> #include <sys/txg.h> @@ -59,6 +57,7 @@ struct dmu_tx { txg_handle_t tx_txgh; void *tx_tempreserve_cookie; struct dmu_tx_hold *tx_needassign_txh; + list_t tx_callbacks; /* list of dmu_tx_callback_t on this dmu_tx */ uint8_t tx_anyobj; int tx_err; #ifdef ZFS_DEBUG @@ -78,6 +77,7 @@ enum dmu_tx_hold_type { THT_FREE, THT_ZAP, THT_SPACE, + THT_SPILL, THT_NUMTYPES }; @@ -98,6 +98,11 @@ typedef struct dmu_tx_hold { #endif } dmu_tx_hold_t; +typedef struct dmu_tx_callback { + list_node_t dcb_node; /* linked to tx_callbacks list */ + dmu_tx_callback_func_t *dcb_func; /* caller function pointer */ + void *dcb_data; /* caller private data */ +} dmu_tx_callback_t; /* * These routines are defined in dmu.h, and are called by the user. @@ -109,6 +114,10 @@ void dmu_tx_abort(dmu_tx_t *tx); uint64_t dmu_tx_get_txg(dmu_tx_t *tx); void dmu_tx_wait(dmu_tx_t *tx); +void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func, + void *dcb_data); +void dmu_tx_do_callbacks(list_t *cb_list, int error); + /* * These routines are defined in dmu_spa.h, and are called by the SPA. */ diff --git a/module/zfs/include/sys/dmu_zfetch.h b/module/zfs/include/sys/dmu_zfetch.h index c94bced93..78cadd2b1 100644 --- a/module/zfs/include/sys/dmu_zfetch.h +++ b/module/zfs/include/sys/dmu_zfetch.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _DFETCH_H #define _DFETCH_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #ifdef __cplusplus @@ -63,6 +61,9 @@ typedef struct zfetch { uint64_t zf_alloc_fail; /* # of failed attempts to alloc strm */ } zfetch_t; +void zfetch_init(void); +void zfetch_fini(void); + void dmu_zfetch_init(zfetch_t *, struct dnode *); void dmu_zfetch_rele(zfetch_t *); void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, int); diff --git a/module/zfs/include/sys/dnode.h b/module/zfs/include/sys/dnode.h index 48e4da8cd..8bae1602e 100644 --- a/module/zfs/include/sys/dnode.h +++ b/module/zfs/include/sys/dnode.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_DNODE_H @@ -63,6 +62,18 @@ extern "C" { #define DN_MAX_OFFSET_SHIFT 64 /* 2^64 bytes in a dnode */ /* + * dnode id flags + * + * Note: a file will never ever have its + * ids moved from bonus->spill + * and only in a crypto environment would it be on spill + */ +#define DN_ID_CHKED_BONUS 0x1 +#define DN_ID_CHKED_SPILL 0x2 +#define DN_ID_OLD_EXIST 0x4 +#define DN_ID_NEW_EXIST 0x8 + +/* * Derived constants. */ #define DNODE_SIZE (1 << DNODE_SHIFT) @@ -70,10 +81,12 @@ extern "C" { #define DN_MAX_BONUSLEN (DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT)) #define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT) #define DN_ZERO_BONUSLEN (DN_MAX_BONUSLEN + 1) +#define DN_KILL_SPILLBLK (1) #define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT) #define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT) #define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT) +#define DNODES_PER_LEVEL (1ULL << DNODES_PER_LEVEL_SHIFT) /* The +2 here is a cheesy way to round up */ #define DN_MAX_LEVELS (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \ @@ -88,7 +101,7 @@ extern "C" { #define EPB(blkshift, typeshift) (1 << (blkshift - typeshift)) struct dmu_buf_impl; -struct objset_impl; +struct objset; struct zio; enum dnode_dirtycontext { @@ -101,6 +114,9 @@ enum dnode_dirtycontext { #define DNODE_FLAG_USED_BYTES (1<<0) #define DNODE_FLAG_USERUSED_ACCOUNTED (1<<1) +/* Does dnode have a SA spill blkptr in bonus? */ +#define DNODE_FLAG_SPILL_BLKPTR (1<<2) + typedef struct dnode_phys { uint8_t dn_type; /* dmu_object_type_t */ uint8_t dn_indblkshift; /* ln2(indirect block size) */ @@ -121,7 +137,8 @@ typedef struct dnode_phys { uint64_t dn_pad3[4]; blkptr_t dn_blkptr[1]; - uint8_t dn_bonus[DN_MAX_BONUSLEN]; + uint8_t dn_bonus[DN_MAX_BONUSLEN - sizeof (blkptr_t)]; + blkptr_t dn_spill; } dnode_phys_t; typedef struct dnode { @@ -136,7 +153,7 @@ typedef struct dnode { list_node_t dn_link; /* immutable: */ - struct objset_impl *dn_objset; + struct objset *dn_objset; uint64_t dn_object; struct dmu_buf_impl *dn_dbuf; dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */ @@ -161,6 +178,8 @@ typedef struct dnode { uint8_t dn_next_nblkptr[TXG_SIZE]; uint8_t dn_next_nlevels[TXG_SIZE]; uint8_t dn_next_indblkshift[TXG_SIZE]; + uint8_t dn_next_bonustype[TXG_SIZE]; + uint8_t dn_rm_spillblk[TXG_SIZE]; /* for removing spill blk */ uint16_t dn_next_bonuslen[TXG_SIZE]; uint32_t dn_next_blksz[TXG_SIZE]; /* next block size in bytes */ @@ -185,12 +204,17 @@ typedef struct dnode { kmutex_t dn_dbufs_mtx; list_t dn_dbufs; /* linked list of descendent dbuf_t's */ struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */ + boolean_t dn_have_spill; /* have spill or are spilling */ /* parent IO for current sync write */ zio_t *dn_zio; /* used in syncing context */ - dnode_phys_t *dn_oldphys; + uint64_t dn_oldused; /* old phys used bytes */ + uint64_t dn_oldflags; /* old phys dn_flags */ + uint64_t dn_olduid, dn_oldgid; + uint64_t dn_newuid, dn_newgid; + int dn_id_flags; /* holds prefetch structure */ struct zfetch dn_zfetch; @@ -202,14 +226,17 @@ typedef struct free_range { uint64_t fr_nblks; } free_range_t; -dnode_t *dnode_special_open(struct objset_impl *dd, dnode_phys_t *dnp, +dnode_t *dnode_special_open(struct objset *dd, dnode_phys_t *dnp, uint64_t object); void dnode_special_close(dnode_t *dn); void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx); -int dnode_hold(struct objset_impl *dd, uint64_t object, +void dnode_setbonus_type(dnode_t *dn, dmu_object_type_t, dmu_tx_t *tx); +void dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx); + +int dnode_hold(struct objset *dd, uint64_t object, void *ref, dnode_t **dnp); -int dnode_hold_impl(struct objset_impl *dd, uint64_t object, int flag, +int dnode_hold_impl(struct objset *dd, uint64_t object, int flag, void *ref, dnode_t **dnp); boolean_t dnode_add_ref(dnode_t *dn, void *ref); void dnode_rele(dnode_t *dn, void *ref); diff --git a/module/zfs/include/sys/dsl_dataset.h b/module/zfs/include/sys/dsl_dataset.h index b51036d38..58414e133 100644 --- a/module/zfs/include/sys/dsl_dataset.h +++ b/module/zfs/include/sys/dsl_dataset.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_DSL_DATASET_H @@ -33,6 +32,7 @@ #include <sys/bplist.h> #include <sys/dsl_synctask.h> #include <sys/zfs_context.h> +#include <sys/dsl_deadlist.h> #ifdef __cplusplus extern "C" { @@ -42,8 +42,6 @@ struct dsl_dataset; struct dsl_dir; struct dsl_pool; -typedef void dsl_dataset_evict_func_t(struct dsl_dataset *, void *); - #define DS_FLAG_INCONSISTENT (1ULL<<0) #define DS_IS_INCONSISTENT(ds) \ ((ds)->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) @@ -85,7 +83,7 @@ typedef struct dsl_dataset_phys { uint64_t ds_num_children; /* clone/snap children; ==0 for head */ uint64_t ds_creation_time; /* seconds since 1970 */ uint64_t ds_creation_txg; - uint64_t ds_deadlist_obj; /* DMU_OT_BPLIST */ + uint64_t ds_deadlist_obj; /* DMU_OT_DEADLIST */ uint64_t ds_used_bytes; uint64_t ds_compressed_bytes; uint64_t ds_uncompressed_bytes; @@ -115,10 +113,10 @@ typedef struct dsl_dataset { /* only used in syncing context, only valid for non-snapshots: */ struct dsl_dataset *ds_prev; - uint64_t ds_origin_txg; /* has internal locking: */ - bplist_t ds_deadlist; + dsl_deadlist_t ds_deadlist; + bplist_t ds_pending_deadlist; /* to protect against multiple concurrent incremental recv */ kmutex_t ds_recvlock; @@ -132,8 +130,7 @@ typedef struct dsl_dataset { * Protected by ds_lock: */ kmutex_t ds_lock; - void *ds_user_ptr; - dsl_dataset_evict_func_t *ds_user_evict_func; + objset_t *ds_objset; uint64_t ds_userrefs; /* @@ -165,7 +162,7 @@ struct dsl_ds_destroyarg { boolean_t need_prep; /* do we need to retry due to EBUSY? */ }; -#define dsl_dataset_is_snapshot(ds) \ +#define dsl_dataset_is_snapshot(ds) \ ((ds)->ds_phys->ds_num_children != 0) #define DS_UNIQUE_IS_ACCURATE(ds) \ @@ -174,17 +171,17 @@ struct dsl_ds_destroyarg { int dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp); int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj, void *tag, dsl_dataset_t **); -int dsl_dataset_own(const char *name, int flags, void *owner, - dsl_dataset_t **dsp); +int dsl_dataset_own(const char *name, boolean_t inconsistentok, + void *tag, dsl_dataset_t **dsp); int dsl_dataset_own_obj(struct dsl_pool *dp, uint64_t dsobj, - int flags, void *owner, dsl_dataset_t **); + boolean_t inconsistentok, void *tag, dsl_dataset_t **dsp); void dsl_dataset_name(dsl_dataset_t *ds, char *name); void dsl_dataset_rele(dsl_dataset_t *ds, void *tag); -void dsl_dataset_disown(dsl_dataset_t *ds, void *owner); +void dsl_dataset_disown(dsl_dataset_t *ds, void *tag); void dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag); boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, - void *owner); -void dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner); + void *tag); +void dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *tag); uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname, dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *); uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, @@ -195,21 +192,18 @@ dsl_checkfunc_t dsl_dataset_destroy_check; dsl_syncfunc_t dsl_dataset_destroy_sync; dsl_checkfunc_t dsl_dataset_snapshot_check; dsl_syncfunc_t dsl_dataset_snapshot_sync; -int dsl_dataset_rollback(dsl_dataset_t *ds, dmu_objset_type_t ost); int dsl_dataset_rename(char *name, const char *newname, boolean_t recursive); -int dsl_dataset_promote(const char *name); +int dsl_dataset_promote(const char *name, char *conflsnap); int dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, boolean_t force); int dsl_dataset_user_hold(char *dsname, char *snapname, char *htag, - boolean_t recursive); + boolean_t recursive, boolean_t temphold); int dsl_dataset_user_release(char *dsname, char *snapname, char *htag, boolean_t recursive); +int dsl_dataset_user_release_tmp(struct dsl_pool *dp, uint64_t dsobj, + char *htag); int dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp); -void *dsl_dataset_set_user_ptr(dsl_dataset_t *ds, - void *p, dsl_dataset_evict_func_t func); -void *dsl_dataset_get_user_ptr(dsl_dataset_t *ds); - blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds); void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx); @@ -219,10 +213,12 @@ boolean_t dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds); void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx); -void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx); -int dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, +void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx); -boolean_t dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth); +int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, + dmu_tx_t *tx, boolean_t async); +boolean_t dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp, + uint64_t blk_birth); uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds); void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx); @@ -238,13 +234,13 @@ int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf); int dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv); -int dsl_dataset_set_quota(const char *dsname, uint64_t quota); -void dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr, - dmu_tx_t *tx); -int dsl_dataset_set_reservation(const char *dsname, uint64_t reservation); -void dsl_dataset_set_flags(dsl_dataset_t *ds, uint64_t flags); -int64_t dsl_dataset_new_refreservation(dsl_dataset_t *ds, uint64_t reservation, - dmu_tx_t *tx); +int dsl_dataset_set_quota(const char *dsname, zprop_source_t source, + uint64_t quota); +dsl_syncfunc_t dsl_dataset_set_quota_sync; +int dsl_dataset_set_reservation(const char *dsname, zprop_source_t source, + uint64_t reservation); + +int dsl_destroy_inconsistent(const char *dsname, void *arg); #ifdef ZFS_DEBUG #define dprintf_ds(ds, fmt, ...) do { \ diff --git a/module/zfs/include/sys/dsl_deadlist.h b/module/zfs/include/sys/dsl_deadlist.h new file mode 100644 index 000000000..d2c16d72c --- /dev/null +++ b/module/zfs/include/sys/dsl_deadlist.h @@ -0,0 +1,87 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _SYS_DSL_DEADLIST_H +#define _SYS_DSL_DEADLIST_H + +#include <sys/bpobj.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct dmu_buf; +struct dsl_dataset; + +typedef struct dsl_deadlist_phys { + uint64_t dl_used; + uint64_t dl_comp; + uint64_t dl_uncomp; + uint64_t dl_pad[37]; /* pad out to 320b for future expansion */ +} dsl_deadlist_phys_t; + +typedef struct dsl_deadlist { + objset_t *dl_os; + uint64_t dl_object; + avl_tree_t dl_tree; + boolean_t dl_havetree; + struct dmu_buf *dl_dbuf; + dsl_deadlist_phys_t *dl_phys; + kmutex_t dl_lock; + + /* if it's the old on-disk format: */ + bpobj_t dl_bpobj; + boolean_t dl_oldfmt; +} dsl_deadlist_t; + +typedef struct dsl_deadlist_entry { + avl_node_t dle_node; + uint64_t dle_mintxg; + bpobj_t dle_bpobj; +} dsl_deadlist_entry_t; + +void dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object); +void dsl_deadlist_close(dsl_deadlist_t *dl); +uint64_t dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx); +void dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx); +void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx); +void dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx); +void dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx); +uint64_t dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg, + uint64_t mrs_obj, dmu_tx_t *tx); +void dsl_deadlist_space(dsl_deadlist_t *dl, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); +void dsl_deadlist_space_range(dsl_deadlist_t *dl, + uint64_t mintxg, uint64_t maxtxg, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); +void dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx); +void dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, + dmu_tx_t *tx); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_DEADLIST_H */ diff --git a/module/zfs/include/sys/dsl_dir.h b/module/zfs/include/sys/dsl_dir.h index 56d06388c..2191635dd 100644 --- a/module/zfs/include/sys/dsl_dir.h +++ b/module/zfs/include/sys/dsl_dir.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_DSL_DIR_H @@ -70,7 +69,8 @@ typedef struct dsl_dir_phys { uint64_t dd_deleg_zapobj; /* dataset delegation permissions */ uint64_t dd_flags; uint64_t dd_used_breakdown[DD_USED_NUM]; - uint64_t dd_pad[14]; /* pad out to 256 bytes for good measure */ + uint64_t dd_clones; /* dsl_dir objects */ + uint64_t dd_pad[13]; /* pad out to 256 bytes for good measure */ } dsl_dir_phys_t; struct dsl_dir { @@ -89,6 +89,8 @@ struct dsl_dir { /* Protected by dd_lock */ kmutex_t dd_lock; list_t dd_prop_cbs; /* list of dsl_prop_cb_record_t's */ + timestruc_t dd_snap_cmtime; /* last time snapshot namespace changed */ + uint64_t dd_origin_txg; /* gross estimate of space used by in-flight tx's */ uint64_t dd_tempreserved[TXG_SIZE]; @@ -125,18 +127,24 @@ void dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx); void dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx); -int dsl_dir_set_quota(const char *ddname, uint64_t quota); -int dsl_dir_set_reservation(const char *ddname, uint64_t reservation); +int dsl_dir_set_quota(const char *ddname, zprop_source_t source, + uint64_t quota); +int dsl_dir_set_reservation(const char *ddname, zprop_source_t source, + uint64_t reservation); int dsl_dir_rename(dsl_dir_t *dd, const char *newname); int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space); int dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx); boolean_t dsl_dir_is_clone(dsl_dir_t *dd); void dsl_dir_new_refreservation(dsl_dir_t *dd, struct dsl_dataset *ds, uint64_t reservation, cred_t *cr, dmu_tx_t *tx); +void dsl_dir_snap_cmtime_update(dsl_dir_t *dd); +timestruc_t dsl_dir_snap_cmtime(dsl_dir_t *dd); /* internal reserved dir name */ #define MOS_DIR_NAME "$MOS" #define ORIGIN_DIR_NAME "$ORIGIN" +#define XLATION_DIR_NAME "$XLATION" +#define FREE_DIR_NAME "$FREE" #ifdef ZFS_DEBUG #define dprintf_dd(dd, fmt, ...) do { \ diff --git a/module/zfs/include/sys/dsl_pool.h b/module/zfs/include/sys/dsl_pool.h index d8da295f3..7d25bd7c0 100644 --- a/module/zfs/include/sys/dsl_pool.h +++ b/module/zfs/include/sys/dsl_pool.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_DSL_POOL_H @@ -32,6 +31,9 @@ #include <sys/zfs_context.h> #include <sys/zio.h> #include <sys/dnode.h> +#include <sys/ddt.h> +#include <sys/arc.h> +#include <sys/bpobj.h> #ifdef __cplusplus extern "C" { @@ -42,12 +44,7 @@ struct dsl_dir; struct dsl_dataset; struct dsl_pool; struct dmu_tx; - -enum scrub_func { - SCRUB_FUNC_NONE, - SCRUB_FUNC_CLEAN, - SCRUB_FUNC_NUMFUNCS -}; +struct dsl_scan; /* These macros are for indexing into the zfs_all_blkstats_t. */ #define DMU_OT_DEFERRED DMU_OT_NONE @@ -75,6 +72,7 @@ typedef struct dsl_pool { struct objset *dp_meta_objset; struct dsl_dir *dp_root_dir; struct dsl_dir *dp_mos_dir; + struct dsl_dir *dp_free_dir; struct dsl_dataset *dp_origin_snap; uint64_t dp_root_dir_obj; struct taskq *dp_vnrele_taskq; @@ -83,25 +81,18 @@ typedef struct dsl_pool { blkptr_t dp_meta_rootbp; list_t dp_synced_datasets; hrtime_t dp_read_overhead; - uint64_t dp_throughput; + uint64_t dp_throughput; /* bytes per millisec */ uint64_t dp_write_limit; + uint64_t dp_tmp_userrefs_obj; + bpobj_t dp_free_bpobj; + + struct dsl_scan *dp_scan; /* Uses dp_lock */ kmutex_t dp_lock; uint64_t dp_space_towrite[TXG_SIZE]; uint64_t dp_tempreserved[TXG_SIZE]; - enum scrub_func dp_scrub_func; - uint64_t dp_scrub_queue_obj; - uint64_t dp_scrub_min_txg; - uint64_t dp_scrub_max_txg; - zbookmark_t dp_scrub_bookmark; - boolean_t dp_scrub_pausing; - boolean_t dp_scrub_isresilver; - uint64_t dp_scrub_start_time; - kmutex_t dp_scrub_cancel_lock; /* protects dp_scrub_restart */ - boolean_t dp_scrub_restart; - /* Has its own locking */ tx_state_t dp_tx; txg_list_t dp_dirty_datasets; @@ -123,29 +114,36 @@ int dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp); void dsl_pool_close(dsl_pool_t *dp); dsl_pool_t *dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg); void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg); -void dsl_pool_zil_clean(dsl_pool_t *dp); +void dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg); int dsl_pool_sync_context(dsl_pool_t *dp); uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree); +uint64_t dsl_pool_adjustedfree(dsl_pool_t *dp, boolean_t netfree); int dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx); void dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); void dsl_pool_memory_pressure(dsl_pool_t *dp); void dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); -int dsl_free(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp, - zio_done_func_t *done, void *private, uint32_t arc_flags); -void dsl_pool_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx); -void dsl_pool_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx); -void dsl_pool_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2, - struct dmu_tx *tx); +void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp); +void dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, + const blkptr_t *bpp); +int dsl_read(zio_t *pio, spa_t *spa, const blkptr_t *bpp, arc_buf_t *pbuf, + arc_done_func_t *done, void *private, int priority, int zio_flags, + uint32_t *arc_flags, const zbookmark_t *zb); +int dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp, + arc_done_func_t *done, void *private, int priority, int zio_flags, + uint32_t *arc_flags, const zbookmark_t *zb); void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx); void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx); - -int dsl_pool_scrub_cancel(dsl_pool_t *dp); -int dsl_pool_scrub_clean(dsl_pool_t *dp); -void dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx); -void dsl_pool_scrub_restart(dsl_pool_t *dp); +void dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx); taskq_t *dsl_pool_vnrele_taskq(dsl_pool_t *dp); +extern int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, + const char *tag, uint64_t *now, dmu_tx_t *tx); +extern int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, + const char *tag, dmu_tx_t *tx); +extern void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp); +int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **); + #ifdef __cplusplus } #endif diff --git a/module/zfs/include/sys/dsl_prop.h b/module/zfs/include/sys/dsl_prop.h index 5afaa1f0d..a636ad350 100644 --- a/module/zfs/include/sys/dsl_prop.h +++ b/module/zfs/include/sys/dsl_prop.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_DSL_PROP_H @@ -49,6 +48,25 @@ typedef struct dsl_prop_cb_record { void *cbr_arg; } dsl_prop_cb_record_t; +typedef struct dsl_props_arg { + nvlist_t *pa_props; + zprop_source_t pa_source; +} dsl_props_arg_t; + +typedef struct dsl_prop_set_arg { + const char *psa_name; + zprop_source_t psa_source; + int psa_intsz; + int psa_numints; + const void *psa_value; + + /* + * Used to handle the special requirements of the quota and reservation + * properties. + */ + uint64_t psa_effective_value; +} dsl_prop_setarg_t; + int dsl_prop_register(struct dsl_dataset *ds, const char *propname, dsl_prop_changed_cb_t *callback, void *cbarg); int dsl_prop_unregister(struct dsl_dataset *ds, const char *propname, @@ -59,18 +77,36 @@ int dsl_prop_get(const char *ddname, const char *propname, int intsz, int numints, void *buf, char *setpoint); int dsl_prop_get_integer(const char *ddname, const char *propname, uint64_t *valuep, char *setpoint); -int dsl_prop_get_all(objset_t *os, nvlist_t **nvp, boolean_t local); +int dsl_prop_get_all(objset_t *os, nvlist_t **nvp); +int dsl_prop_get_received(objset_t *os, nvlist_t **nvp); int dsl_prop_get_ds(struct dsl_dataset *ds, const char *propname, int intsz, int numints, void *buf, char *setpoint); int dsl_prop_get_dd(struct dsl_dir *dd, const char *propname, - int intsz, int numints, void *buf, char *setpoint); + int intsz, int numints, void *buf, char *setpoint, + boolean_t snapshot); dsl_syncfunc_t dsl_props_set_sync; int dsl_prop_set(const char *ddname, const char *propname, - int intsz, int numints, const void *buf); -int dsl_props_set(const char *dsname, nvlist_t *nvl); + zprop_source_t source, int intsz, int numints, const void *buf); +int dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *nvl); void dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, - cred_t *cr, dmu_tx_t *tx); + dmu_tx_t *tx); + +void dsl_prop_setarg_init_uint64(dsl_prop_setarg_t *psa, const char *propname, + zprop_source_t source, uint64_t *value); +int dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa); +#ifdef ZFS_DEBUG +void dsl_prop_check_prediction(dsl_dir_t *dd, dsl_prop_setarg_t *psa); +#define DSL_PROP_CHECK_PREDICTION(dd, psa) \ + dsl_prop_check_prediction((dd), (psa)) +#else +#define DSL_PROP_CHECK_PREDICTION(dd, psa) /* nothing */ +#endif + +/* flag first receive on or after SPA_VERSION_RECVD_PROPS */ +boolean_t dsl_prop_get_hasrecvd(objset_t *os); +void dsl_prop_set_hasrecvd(objset_t *os); +void dsl_prop_unset_hasrecvd(objset_t *os); void dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value); void dsl_prop_nvlist_add_string(nvlist_t *nv, diff --git a/module/zfs/include/sys/dsl_scan.h b/module/zfs/include/sys/dsl_scan.h new file mode 100644 index 000000000..c79666e67 --- /dev/null +++ b/module/zfs/include/sys/dsl_scan.h @@ -0,0 +1,108 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _SYS_DSL_SCAN_H +#define _SYS_DSL_SCAN_H + +#include <sys/zfs_context.h> +#include <sys/zio.h> +#include <sys/ddt.h> +#include <sys/bplist.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct objset; +struct dsl_dir; +struct dsl_dataset; +struct dsl_pool; +struct dmu_tx; + +/* + * All members of this structure must be uint64_t, for byteswap + * purposes. + */ +typedef struct dsl_scan_phys { + uint64_t scn_func; /* pool_scan_func_t */ + uint64_t scn_state; /* dsl_scan_state_t */ + uint64_t scn_queue_obj; + uint64_t scn_min_txg; + uint64_t scn_max_txg; + uint64_t scn_cur_min_txg; + uint64_t scn_cur_max_txg; + uint64_t scn_start_time; + uint64_t scn_end_time; + uint64_t scn_to_examine; /* total bytes to be scanned */ + uint64_t scn_examined; /* bytes scanned so far */ + uint64_t scn_to_process; + uint64_t scn_processed; + uint64_t scn_errors; /* scan I/O error count */ + uint64_t scn_ddt_class_max; + ddt_bookmark_t scn_ddt_bookmark; + zbookmark_t scn_bookmark; + uint64_t scn_flags; /* dsl_scan_flags_t */ +} dsl_scan_phys_t; + +#define SCAN_PHYS_NUMINTS (sizeof (dsl_scan_phys_t) / sizeof (uint64_t)) + +typedef enum dsl_scan_flags { + DSF_VISIT_DS_AGAIN = 1<<0, +} dsl_scan_flags_t; + +typedef struct dsl_scan { + struct dsl_pool *scn_dp; + + boolean_t scn_pausing; + uint64_t scn_restart_txg; + uint64_t scn_sync_start_time; + zio_t *scn_zio_root; + + /* for debugging / information */ + uint64_t scn_visited_this_txg; + + dsl_scan_phys_t scn_phys; +} dsl_scan_t; + +int dsl_scan_init(struct dsl_pool *dp, uint64_t txg); +void dsl_scan_fini(struct dsl_pool *dp); +void dsl_scan_sync(struct dsl_pool *, dmu_tx_t *); +int dsl_scan_cancel(struct dsl_pool *); +int dsl_scan(struct dsl_pool *, pool_scan_func_t); +void dsl_resilver_restart(struct dsl_pool *, uint64_t txg); +boolean_t dsl_scan_resilvering(struct dsl_pool *dp); +boolean_t dsl_dataset_unstable(struct dsl_dataset *ds); +void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, + ddt_entry_t *dde, dmu_tx_t *tx); +void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx); +void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx); +void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2, + struct dmu_tx *tx); +boolean_t dsl_scan_active(dsl_scan_t *scn); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_SCAN_H */ diff --git a/module/zfs/include/sys/dsl_synctask.h b/module/zfs/include/sys/dsl_synctask.h index 4995bfe5a..9126290cd 100644 --- a/module/zfs/include/sys/dsl_synctask.h +++ b/module/zfs/include/sys/dsl_synctask.h @@ -19,15 +19,12 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_DSL_SYNCTASK_H #define _SYS_DSL_SYNCTASK_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/txg.h> #include <sys/zfs_context.h> @@ -38,7 +35,7 @@ extern "C" { struct dsl_pool; typedef int (dsl_checkfunc_t)(void *, void *, dmu_tx_t *); -typedef void (dsl_syncfunc_t)(void *, void *, cred_t *, dmu_tx_t *); +typedef void (dsl_syncfunc_t)(void *, void *, dmu_tx_t *); typedef struct dsl_sync_task { list_node_t dst_node; @@ -53,7 +50,6 @@ typedef struct dsl_sync_task_group { txg_node_t dstg_node; list_t dstg_tasks; struct dsl_pool *dstg_pool; - cred_t *dstg_cr; uint64_t dstg_txg; int dstg_err; int dstg_space; diff --git a/module/zfs/include/sys/fm/fs/zfs.h b/module/zfs/include/sys/fm/fs/zfs.h index 21b7dbe52..c752edc99 100644 --- a/module/zfs/include/sys/fm/fs/zfs.h +++ b/module/zfs/include/sys/fm/fs/zfs.h @@ -68,6 +68,18 @@ extern "C" { #define FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET "zio_offset" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE "zio_size" #define FM_EREPORT_PAYLOAD_ZFS_PREV_STATE "prev_state" +#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED "cksum_expected" +#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL "cksum_actual" +#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO "cksum_algorithm" +#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP "cksum_byteswap" +#define FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES "bad_ranges" +#define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP "bad_ranges_min_gap" +#define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS "bad_range_sets" +#define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS "bad_range_clears" +#define FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS "bad_set_bits" +#define FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS "bad_cleared_bits" +#define FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM "bad_set_histogram" +#define FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM "bad_cleared_histogram" #define FM_EREPORT_FAILMODE_WAIT "wait" #define FM_EREPORT_FAILMODE_CONTINUE "continue" @@ -75,6 +87,7 @@ extern "C" { #define FM_RESOURCE_REMOVED "removed" #define FM_RESOURCE_AUTOREPLACE "autoreplace" +#define FM_RESOURCE_STATECHANGE "statechange" #ifdef __cplusplus } diff --git a/module/zfs/include/sys/fm/protocol.h b/module/zfs/include/sys/fm/protocol.h index 767fb07d8..c4103c48a 100644 --- a/module/zfs/include/sys/fm/protocol.h +++ b/module/zfs/include/sys/fm/protocol.h @@ -20,8 +20,7 @@ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_FM_PROTOCOL_H @@ -47,6 +46,7 @@ extern "C" { /* FM event class values */ #define FM_EREPORT_CLASS "ereport" #define FM_FAULT_CLASS "fault" +#define FM_DEFECT_CLASS "defect" #define FM_RSRC_CLASS "resource" #define FM_LIST_EVENT "list" @@ -83,6 +83,7 @@ extern "C" { #define FM_SUSPECT_FAULT_LIST "fault-list" #define FM_SUSPECT_FAULT_SZ "fault-list-sz" #define FM_SUSPECT_FAULT_STATUS "fault-status" +#define FM_SUSPECT_INJECTED "__injected" #define FM_SUSPECT_MESSAGE "message" #define FM_SUSPECT_RETIRE "retire" #define FM_SUSPECT_RESPONSE "response" @@ -122,6 +123,7 @@ extern "C" { #define FM_RSRC_ASRU_REPAIRED "repaired" #define FM_RSRC_ASRU_REPLACED "replaced" #define FM_RSRC_ASRU_ACQUITTED "acquitted" +#define FM_RSRC_ASRU_RESOLVED "resolved" #define FM_RSRC_ASRU_UNUSABLE "unusable" #define FM_RSRC_ASRU_EVENT "event" @@ -170,6 +172,7 @@ extern "C" { /* FMRI authority-type member names */ #define FM_FMRI_AUTH_CHASSIS "chassis-id" +#define FM_FMRI_AUTH_PRODUCT_SN "product-sn" #define FM_FMRI_AUTH_PRODUCT "product-id" #define FM_FMRI_AUTH_DOMAIN "domain-id" #define FM_FMRI_AUTH_SERVER "server-id" @@ -243,6 +246,7 @@ extern "C" { /* dev scheme member names */ #define FM_FMRI_DEV_ID "devid" +#define FM_FMRI_DEV_TGTPTLUN0 "target-port-l0id" #define FM_FMRI_DEV_PATH "device-path" /* pkg scheme member names */ @@ -311,7 +315,7 @@ extern int i_fm_payload_set(nvlist_t *, const char *, va_list); extern void fm_fmri_hc_set(nvlist_t *, int, const nvlist_t *, nvlist_t *, int, ...); extern void fm_fmri_dev_set(nvlist_t *, int, const nvlist_t *, const char *, - const char *); + const char *, const char *); extern void fm_fmri_de_set(nvlist_t *, int, const nvlist_t *, const char *); extern void fm_fmri_cpu_set(nvlist_t *, int, const nvlist_t *, uint32_t, uint8_t *, const char *); @@ -320,6 +324,8 @@ extern void fm_fmri_mem_set(nvlist_t *, int, const nvlist_t *, const char *, extern void fm_authority_set(nvlist_t *, int, const char *, const char *, const char *, const char *); extern void fm_fmri_zfs_set(nvlist_t *, int, uint64_t, uint64_t); +extern void fm_fmri_hc_create(nvlist_t *, int, const nvlist_t *, nvlist_t *, + nvlist_t *, int, ...); extern uint64_t fm_ena_increment(uint64_t); extern uint64_t fm_ena_generate(uint64_t, uchar_t); diff --git a/module/zfs/include/sys/metaslab.h b/module/zfs/include/sys/metaslab.h index 5d3e11c97..583d6303b 100644 --- a/module/zfs/include/sys/metaslab.h +++ b/module/zfs/include/sys/metaslab.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_METASLAB_H @@ -36,9 +35,6 @@ extern "C" { #endif -typedef struct metaslab_class metaslab_class_t; -typedef struct metaslab_group metaslab_group_t; - extern space_map_ops_t *zfs_metaslab_ops; extern metaslab_t *metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, @@ -46,6 +42,7 @@ extern metaslab_t *metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, extern void metaslab_fini(metaslab_t *msp); extern void metaslab_sync(metaslab_t *msp, uint64_t txg); extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg); +extern void metaslab_sync_reassess(metaslab_group_t *mg); #define METASLAB_HINTBP_FAVOR 0x0 #define METASLAB_HINTBP_AVOID 0x1 @@ -57,14 +54,24 @@ extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now); extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg); -extern metaslab_class_t *metaslab_class_create(space_map_ops_t *ops); +extern metaslab_class_t *metaslab_class_create(spa_t *spa, + space_map_ops_t *ops); extern void metaslab_class_destroy(metaslab_class_t *mc); -extern void metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg); -extern void metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg); +extern int metaslab_class_validate(metaslab_class_t *mc); + +extern void metaslab_class_space_update(metaslab_class_t *mc, + int64_t alloc_delta, int64_t defer_delta, + int64_t space_delta, int64_t dspace_delta); +extern uint64_t metaslab_class_get_alloc(metaslab_class_t *mc); +extern uint64_t metaslab_class_get_space(metaslab_class_t *mc); +extern uint64_t metaslab_class_get_dspace(metaslab_class_t *mc); +extern uint64_t metaslab_class_get_deferred(metaslab_class_t *mc); extern metaslab_group_t *metaslab_group_create(metaslab_class_t *mc, vdev_t *vd); extern void metaslab_group_destroy(metaslab_group_t *mg); +extern void metaslab_group_activate(metaslab_group_t *mg); +extern void metaslab_group_passivate(metaslab_group_t *mg); #ifdef __cplusplus } diff --git a/module/zfs/include/sys/metaslab_impl.h b/module/zfs/include/sys/metaslab_impl.h index d67dea7e9..07988dd51 100644 --- a/module/zfs/include/sys/metaslab_impl.h +++ b/module/zfs/include/sys/metaslab_impl.h @@ -37,16 +37,23 @@ extern "C" { #endif struct metaslab_class { + spa_t *mc_spa; metaslab_group_t *mc_rotor; - uint64_t mc_allocated; space_map_ops_t *mc_ops; + uint64_t mc_aliquot; + uint64_t mc_alloc; /* total allocated space */ + uint64_t mc_deferred; /* total deferred frees */ + uint64_t mc_space; /* total space (alloc + free) */ + uint64_t mc_dspace; /* total deflated space */ }; struct metaslab_group { kmutex_t mg_lock; avl_tree_t mg_metaslab_tree; uint64_t mg_aliquot; + uint64_t mg_bonus_area; int64_t mg_bias; + int64_t mg_activation_count; metaslab_class_t *mg_class; vdev_t *mg_vd; metaslab_group_t *mg_prev; @@ -66,7 +73,9 @@ struct metaslab { space_map_obj_t ms_smo_syncing; /* syncing space map object */ space_map_t ms_allocmap[TXG_SIZE]; /* allocated this txg */ space_map_t ms_freemap[TXG_SIZE]; /* freed this txg */ + space_map_t ms_defermap[TXG_DEFER_SIZE]; /* deferred frees */ space_map_t ms_map; /* in-core free space map */ + int64_t ms_deferspace; /* sum of ms_defermap[] space */ uint64_t ms_weight; /* weight vs. others in group */ metaslab_group_t *ms_group; /* metaslab group */ avl_node_t ms_group_node; /* node in metaslab group tree */ diff --git a/module/zfs/include/sys/refcount.h b/module/zfs/include/sys/refcount.h index d3fe7b1f8..bc3ade80f 100644 --- a/module/zfs/include/sys/refcount.h +++ b/module/zfs/include/sys/refcount.h @@ -19,15 +19,12 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_REFCOUNT_H #define _SYS_REFCOUNT_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/inttypes.h> #include <sys/list.h> #include <sys/zfs_context.h> @@ -91,6 +88,11 @@ typedef struct refcount { atomic_add_64_nv(&(rc)->rc_count, number) #define refcount_remove_many(rc, number, holder) \ atomic_add_64_nv(&(rc)->rc_count, -number) +#define refcount_transfer(dst, src) { \ + uint64_t __tmp = (src)->rc_count; \ + atomic_add_64(&(src)->rc_count, -__tmp); \ + atomic_add_64(&(dst)->rc_count, __tmp); \ +} #define refcount_init() #define refcount_fini() diff --git a/module/zfs/include/sys/sa.h b/module/zfs/include/sys/sa.h new file mode 100644 index 000000000..e9a96a0f9 --- /dev/null +++ b/module/zfs/include/sys/sa.h @@ -0,0 +1,171 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_SA_H +#define _SYS_SA_H + +#include <sys/dmu.h> + +/* + * Currently available byteswap functions. + * If it all possible new attributes should used + * one of the already defined byteswap functions. + * If a new byteswap function is added then the + * ZPL/Pool version will need to be bumped. + */ + +typedef enum sa_bswap_type { + SA_UINT64_ARRAY, + SA_UINT32_ARRAY, + SA_UINT16_ARRAY, + SA_UINT8_ARRAY, + SA_ACL, +} sa_bswap_type_t; + +typedef uint16_t sa_attr_type_t; + +/* + * Attribute to register support for. + */ +typedef struct sa_attr_reg { + char *sa_name; /* attribute name */ + uint16_t sa_length; + sa_bswap_type_t sa_byteswap; /* bswap functon enum */ + sa_attr_type_t sa_attr; /* filled in during registration */ +} sa_attr_reg_t; + + +typedef void (sa_data_locator_t)(void **, uint32_t *, uint32_t, + boolean_t, void *userptr); + +/* + * array of attributes to store. + * + * This array should be treated as opaque/private data. + * The SA_BULK_ADD_ATTR() macro should be used for manipulating + * the array. + * + * When sa_replace_all_by_template() is used the attributes + * will be stored in the order defined in the array, except that + * the attributes may be split between the bonus and the spill buffer + * + */ +typedef struct sa_bulk_attr { + void *sa_data; + sa_data_locator_t *sa_data_func; + uint16_t sa_length; + sa_attr_type_t sa_attr; + /* the following are private to the sa framework */ + void *sa_addr; + uint16_t sa_buftype; + uint16_t sa_size; +} sa_bulk_attr_t; + + +/* + * special macro for adding entries for bulk attr support + * bulk - sa_bulk_attr_t + * count - integer that will be incremented during each add + * attr - attribute to manipulate + * func - function for accessing data. + * data - pointer to data. + * len - length of data + */ + +#define SA_ADD_BULK_ATTR(b, idx, attr, func, data, len) \ +{ \ + b[idx].sa_attr = attr;\ + b[idx].sa_data_func = func; \ + b[idx].sa_data = data; \ + b[idx++].sa_length = len; \ +} + +typedef struct sa_os sa_os_t; + +typedef enum sa_handle_type { + SA_HDL_SHARED, + SA_HDL_PRIVATE +} sa_handle_type_t; + +struct sa_handle; +typedef void *sa_lookup_tab_t; +typedef struct sa_handle sa_handle_t; + +typedef void (sa_update_cb_t)(sa_handle_t *, dmu_tx_t *tx); + +int sa_handle_get(objset_t *, uint64_t, void *userp, + sa_handle_type_t, sa_handle_t **); +int sa_handle_get_from_db(objset_t *, dmu_buf_t *, void *userp, + sa_handle_type_t, sa_handle_t **); +void sa_handle_destroy(sa_handle_t *); +int sa_buf_hold(objset_t *, uint64_t, void *, dmu_buf_t **); +void sa_buf_rele(dmu_buf_t *, void *); +int sa_lookup(sa_handle_t *, sa_attr_type_t, void *buf, uint32_t buflen); +int sa_update(sa_handle_t *, sa_attr_type_t, void *buf, + uint32_t buflen, dmu_tx_t *); +int sa_remove(sa_handle_t *, sa_attr_type_t, dmu_tx_t *); +int sa_bulk_lookup(sa_handle_t *, sa_bulk_attr_t *, int count); +int sa_bulk_lookup_locked(sa_handle_t *, sa_bulk_attr_t *, int count); +int sa_bulk_update(sa_handle_t *, sa_bulk_attr_t *, int count, dmu_tx_t *); +int sa_size(sa_handle_t *, sa_attr_type_t, int *); +int sa_update_from_cb(sa_handle_t *, sa_attr_type_t, + uint32_t buflen, sa_data_locator_t *, void *userdata, dmu_tx_t *); +void sa_object_info(sa_handle_t *, dmu_object_info_t *); +void sa_object_size(sa_handle_t *, uint32_t *, u_longlong_t *); +void sa_update_user(sa_handle_t *, sa_handle_t *); +void *sa_get_userdata(sa_handle_t *); +void sa_set_userp(sa_handle_t *, void *); +dmu_buf_t *sa_get_db(sa_handle_t *); +uint64_t sa_handle_object(sa_handle_t *); +boolean_t sa_attr_would_spill(sa_handle_t *, sa_attr_type_t, int size); +void sa_register_update_callback(objset_t *, sa_update_cb_t *); +sa_attr_type_t *sa_setup(objset_t *, uint64_t, sa_attr_reg_t *, int); +void sa_tear_down(objset_t *); +int sa_replace_all_by_template(sa_handle_t *, sa_bulk_attr_t *, + int, dmu_tx_t *); +int sa_replace_all_by_template_locked(sa_handle_t *, sa_bulk_attr_t *, + int, dmu_tx_t *); +boolean_t sa_enabled(objset_t *); +void sa_cache_init(); +void sa_cache_fini(); +int sa_set_sa_object(objset_t *, uint64_t); +int sa_hdrsize(void *); +void sa_handle_lock(sa_handle_t *); +void sa_handle_unlock(sa_handle_t *); + +#ifdef _KERNEL +int sa_lookup_uio(sa_handle_t *, sa_attr_type_t, uio_t *); +#endif + +#ifdef __cplusplus +extern "C" { +#endif + + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SA_H */ diff --git a/module/zfs/include/sys/sa_impl.h b/module/zfs/include/sys/sa_impl.h new file mode 100644 index 000000000..62497e702 --- /dev/null +++ b/module/zfs/include/sys/sa_impl.h @@ -0,0 +1,288 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_SA_IMPL_H +#define _SYS_SA_IMPL_H + +#include <sys/dmu.h> +#include <sys/refcount.h> +#include <sys/list.h> + +/* + * Array of known attributes and their + * various characteristics. + */ +typedef struct sa_attr_table { + sa_attr_type_t sa_attr; + uint8_t sa_registered; + uint16_t sa_length; + sa_bswap_type_t sa_byteswap; + char *sa_name; +} sa_attr_table_t; + +/* + * Zap attribute format for attribute registration + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * | unused | len | bswap | attr num | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * Zap attribute format for layout information. + * + * layout information is stored as an array of attribute numbers + * The name of the attribute is the layout number (0, 1, 2, ...) + * + * 16 0 + * +---- ---+ + * | attr # | + * +--------+ + * | attr # | + * +--- ----+ + * ...... + * + */ + +#define ATTR_BSWAP(x) BF32_GET(x, 16, 8) +#define ATTR_LENGTH(x) BF32_GET(x, 24, 16) +#define ATTR_NUM(x) BF32_GET(x, 0, 16) +#define ATTR_ENCODE(x, attr, length, bswap) \ +{ \ + BF64_SET(x, 24, 16, length); \ + BF64_SET(x, 16, 8, bswap); \ + BF64_SET(x, 0, 16, attr); \ +} + +#define TOC_OFF(x) BF32_GET(x, 0, 23) +#define TOC_ATTR_PRESENT(x) BF32_GET(x, 31, 1) +#define TOC_LEN_IDX(x) BF32_GET(x, 24, 4) +#define TOC_ATTR_ENCODE(x, len_idx, offset) \ +{ \ + BF32_SET(x, 31, 1, 1); \ + BF32_SET(x, 24, 7, len_idx); \ + BF32_SET(x, 0, 24, offset); \ +} + +#define SA_LAYOUTS "LAYOUTS" +#define SA_REGISTRY "REGISTRY" + +/* + * Each unique layout will have their own table + * sa_lot (layout_table) + */ +typedef struct sa_lot { + avl_node_t lot_num_node; + avl_node_t lot_hash_node; + uint64_t lot_num; + uint64_t lot_hash; + sa_attr_type_t *lot_attrs; /* array of attr #'s */ + uint32_t lot_var_sizes; /* how many aren't fixed size */ + uint32_t lot_attr_count; /* total attr count */ + list_t lot_idx_tab; /* should be only a couple of entries */ + int lot_instance; /* used with lot_hash to identify entry */ +} sa_lot_t; + +/* index table of offsets */ +typedef struct sa_idx_tab { + list_node_t sa_next; + sa_lot_t *sa_layout; + uint16_t *sa_variable_lengths; + refcount_t sa_refcount; + uint32_t *sa_idx_tab; /* array of offsets */ +} sa_idx_tab_t; + +/* + * Since the offset/index information into the actual data + * will usually be identical we can share that information with + * all handles that have the exact same offsets. + * + * You would typically only have a large number of different table of + * contents if you had a several variable sized attributes. + * + * Two AVL trees are used to track the attribute layout numbers. + * one is keyed by number and will be consulted when a DMU_OT_SA + * object is first read. The second tree is keyed by the hash signature + * of the attributes and will be consulted when an attribute is added + * to determine if we already have an instance of that layout. Both + * of these tree's are interconnected. The only difference is that + * when an entry is found in the "hash" tree the list of attributes will + * need to be compared against the list of attributes you have in hand. + * The assumption is that typically attributes will just be updated and + * adding a completely new attribute is a very rare operation. + */ +struct sa_os { + kmutex_t sa_lock; + boolean_t sa_need_attr_registration; + boolean_t sa_force_spill; + uint64_t sa_master_obj; + uint64_t sa_reg_attr_obj; + uint64_t sa_layout_attr_obj; + int sa_num_attrs; + sa_attr_table_t *sa_attr_table; /* private attr table */ + sa_update_cb_t *sa_update_cb; + avl_tree_t sa_layout_num_tree; /* keyed by layout number */ + avl_tree_t sa_layout_hash_tree; /* keyed by layout hash value */ + int sa_user_table_sz; + sa_attr_type_t *sa_user_table; /* user name->attr mapping table */ +}; + +/* + * header for all bonus and spill buffers. + * The header has a fixed portion with a variable number + * of "lengths" depending on the number of variable sized + * attribues which are determined by the "layout number" + */ + +#define SA_MAGIC 0x2F505A /* ZFS SA */ +typedef struct sa_hdr_phys { + uint32_t sa_magic; + uint16_t sa_layout_info; /* Encoded with hdrsize and layout number */ + uint16_t sa_lengths[1]; /* optional sizes for variable length attrs */ + /* ... Data follows the lengths. */ +} sa_hdr_phys_t; + +/* + * sa_hdr_phys -> sa_layout_info + * + * 16 10 0 + * +--------+-------+ + * | hdrsz |layout | + * +--------+-------+ + * + * Bits 0-10 are the layout number + * Bits 11-16 are the size of the header. + * The hdrsize is the number * 8 + * + * For example. + * hdrsz of 1 ==> 8 byte header + * 2 ==> 16 byte header + * + */ + +#define SA_HDR_LAYOUT_NUM(hdr) BF32_GET(hdr->sa_layout_info, 0, 10) +#define SA_HDR_SIZE(hdr) BF32_GET_SB(hdr->sa_layout_info, 10, 16, 3, 0) +#define SA_HDR_LAYOUT_INFO_ENCODE(x, num, size) \ +{ \ + BF32_SET_SB(x, 10, 6, 3, 0, size); \ + BF32_SET(x, 0, 10, num); \ +} + +typedef enum sa_buf_type { + SA_BONUS = 1, + SA_SPILL = 2 +} sa_buf_type_t; + +typedef enum sa_data_op { + SA_LOOKUP, + SA_UPDATE, + SA_ADD, + SA_REPLACE, + SA_REMOVE +} sa_data_op_t; + +/* + * Opaque handle used for most sa functions + * + * This needs to be kept as small as possible. + */ + +struct sa_handle { + kmutex_t sa_lock; + dmu_buf_t *sa_bonus; + dmu_buf_t *sa_spill; + objset_t *sa_os; + void *sa_userp; + sa_idx_tab_t *sa_bonus_tab; /* idx of bonus */ + sa_idx_tab_t *sa_spill_tab; /* only present if spill activated */ +}; + +#define SA_GET_DB(hdl, type) \ + (dmu_buf_impl_t *)((type == SA_BONUS) ? hdl->sa_bonus : hdl->sa_spill) + +#define SA_GET_HDR(hdl, type) \ + ((sa_hdr_phys_t *)((dmu_buf_impl_t *)(SA_GET_DB(hdl, \ + type))->db.db_data)) + +#define SA_IDX_TAB_GET(hdl, type) \ + (type == SA_BONUS ? hdl->sa_bonus_tab : hdl->sa_spill_tab) + +#define IS_SA_BONUSTYPE(a) \ + ((a == DMU_OT_SA) ? B_TRUE : B_FALSE) + +#define SA_BONUSTYPE_FROM_DB(db) \ + (((dmu_buf_impl_t *)db)->db_dnode->dn_bonustype) + +#define SA_BLKPTR_SPACE (DN_MAX_BONUSLEN - sizeof (blkptr_t)) + +#define SA_LAYOUT_NUM(x, type) \ + ((!IS_SA_BONUSTYPE(type) ? 0 : (((IS_SA_BONUSTYPE(type)) && \ + ((SA_HDR_LAYOUT_NUM(x)) == 0)) ? 1 : SA_HDR_LAYOUT_NUM(x)))) + + +#define SA_REGISTERED_LEN(sa, attr) sa->sa_attr_table[attr].sa_length + +#define SA_ATTR_LEN(sa, idx, attr, hdr) ((SA_REGISTERED_LEN(sa, attr) == 0) ?\ + hdr->sa_lengths[TOC_LEN_IDX(idx->sa_idx_tab[attr])] : \ + SA_REGISTERED_LEN(sa, attr)) + +#define SA_SET_HDR(hdr, num, size) \ + { \ + hdr->sa_magic = SA_MAGIC; \ + SA_HDR_LAYOUT_INFO_ENCODE(hdr->sa_layout_info, num, size); \ + } + +#define SA_ATTR_INFO(sa, idx, hdr, attr, bulk, type, hdl) \ + { \ + bulk.sa_size = SA_ATTR_LEN(sa, idx, attr, hdr); \ + bulk.sa_buftype = type; \ + bulk.sa_addr = \ + (void *)((uintptr_t)TOC_OFF(idx->sa_idx_tab[attr]) + \ + (uintptr_t)hdr); \ +} + +#define SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb) \ + (SA_HDR_SIZE(hdr) == (sizeof (sa_hdr_phys_t) + \ + (tb->lot_var_sizes > 1 ? P2ROUNDUP((tb->lot_var_sizes - 1) * \ + sizeof (uint16_t), 8) : 0))) + +int sa_add_impl(sa_handle_t *, sa_attr_type_t, + uint32_t, sa_data_locator_t, void *, dmu_tx_t *); + +void sa_register_update_callback_locked(objset_t *, sa_update_cb_t *); +int sa_size_locked(sa_handle_t *, sa_attr_type_t, int *); + +void sa_default_locator(void **, uint32_t *, uint32_t, boolean_t, void *); +int sa_attr_size(sa_os_t *, sa_idx_tab_t *, sa_attr_type_t, + uint16_t *, sa_hdr_phys_t *); + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SA_IMPL_H */ diff --git a/module/zfs/include/sys/spa.h b/module/zfs/include/sys/spa.h index 0a4d55097..41a40300e 100644 --- a/module/zfs/include/sys/spa.h +++ b/module/zfs/include/sys/spa.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_SPA_H @@ -43,8 +42,13 @@ extern "C" { typedef struct spa spa_t; typedef struct vdev vdev_t; typedef struct metaslab metaslab_t; +typedef struct metaslab_group metaslab_group_t; +typedef struct metaslab_class metaslab_class_t; +typedef struct zio zio_t; typedef struct zilog zilog_t; typedef struct spa_aux_vdev spa_aux_vdev_t; +typedef struct ddt ddt_t; +typedef struct ddt_entry ddt_entry_t; struct dsl_pool; /* @@ -134,15 +138,15 @@ typedef struct zio_cksum { * +-------+-------+-------+-------+-------+-------+-------+-------+ * 5 |G| offset3 | * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 6 |E| lvl | type | cksum | comp | PSIZE | LSIZE | + * 6 |BDX|lvl| type | cksum | comp | PSIZE | LSIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 7 | padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 8 | padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 9 | padding | + * 9 | physical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ - * a | birth txg | + * a | logical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ * b | fill count | * +-------+-------+-------+-------+-------+-------+-------+-------+ @@ -166,25 +170,29 @@ typedef struct zio_cksum { * cksum checksum function * comp compression function * G gang block indicator - * E endianness - * type DMU object type + * B byteorder (endianness) + * D dedup + * X unused * lvl level of indirection - * birth txg transaction group in which the block was born + * type DMU object type + * phys birth txg of block allocation; zero if same as logical birth txg + * log. birth transaction group in which the block was logically born * fill count number of non-zero blocks under this bp * checksum[4] 256-bit checksum of the data this bp describes */ -typedef struct blkptr { - dva_t blk_dva[3]; /* 128-bit Data Virtual Address */ - uint64_t blk_prop; /* size, compression, type, etc */ - uint64_t blk_pad[3]; /* Extra space for the future */ - uint64_t blk_birth; /* transaction group at birth */ - uint64_t blk_fill; /* fill count */ - zio_cksum_t blk_cksum; /* 256-bit checksum */ -} blkptr_t; - #define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */ #define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */ +typedef struct blkptr { + dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */ + uint64_t blk_prop; /* size, compression, type, etc */ + uint64_t blk_pad[2]; /* Extra space for the future */ + uint64_t blk_phys_birth; /* txg when block was allocated */ + uint64_t blk_birth; /* transaction group at birth */ + uint64_t blk_fill; /* fill count */ + zio_cksum_t blk_cksum; /* 256-bit checksum */ +} blkptr_t; + /* * Macros to get and set fields in a bp or DVA. */ @@ -208,8 +216,7 @@ typedef struct blkptr { #define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x) #define BP_GET_LSIZE(bp) \ - (BP_IS_HOLE(bp) ? 0 : \ - BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)) + BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1) #define BP_SET_LSIZE(bp, x) \ BF64_SET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x) @@ -218,20 +225,35 @@ typedef struct blkptr { #define BP_SET_PSIZE(bp, x) \ BF64_SET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x) -#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8) -#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x) +#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8) +#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x) + +#define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8) +#define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x) -#define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8) -#define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x) +#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8) +#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x) -#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8) -#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x) +#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5) +#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x) -#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5) -#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x) +#define BP_GET_PROP_BIT_61(bp) BF64_GET((bp)->blk_prop, 61, 1) +#define BP_SET_PROP_BIT_61(bp, x) BF64_SET((bp)->blk_prop, 61, 1, x) -#define BP_GET_BYTEORDER(bp) (0 - BF64_GET((bp)->blk_prop, 63, 1)) -#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x) +#define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1) +#define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x) + +#define BP_GET_BYTEORDER(bp) (0 - BF64_GET((bp)->blk_prop, 63, 1)) +#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x) + +#define BP_PHYSICAL_BIRTH(bp) \ + ((bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth) + +#define BP_SET_BIRTH(bp, logical, physical) \ +{ \ + (bp)->blk_birth = (logical); \ + (bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \ +} #define BP_GET_ASIZE(bp) \ (DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ @@ -239,7 +261,7 @@ typedef struct blkptr { #define BP_GET_UCSIZE(bp) \ ((BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) ? \ - BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)); + BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)) #define BP_GET_NDVAS(bp) \ (!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ @@ -255,6 +277,12 @@ typedef struct blkptr { ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \ (dva1)->dva_word[0] == (dva2)->dva_word[0]) +#define BP_EQUAL(bp1, bp2) \ + (BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) && \ + DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) && \ + DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) && \ + DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2])) + #define ZIO_CHECKSUM_EQUAL(zc1, zc2) \ (0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \ ((zc1).zc_word[1] - (zc2).zc_word[1]) | \ @@ -274,7 +302,10 @@ typedef struct blkptr { #define BP_IDENTITY(bp) (&(bp)->blk_dva[0]) #define BP_IS_GANG(bp) DVA_GET_GANG(BP_IDENTITY(bp)) #define BP_IS_HOLE(bp) ((bp)->blk_birth == 0) -#define BP_IS_OLDER(bp, txg) (!BP_IS_HOLE(bp) && (bp)->blk_birth < (txg)) + +/* BP_IS_RAIDZ(bp) assumes no block compression */ +#define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \ + BP_GET_PSIZE(bp)) #define BP_ZERO(bp) \ { \ @@ -287,14 +318,12 @@ typedef struct blkptr { (bp)->blk_prop = 0; \ (bp)->blk_pad[0] = 0; \ (bp)->blk_pad[1] = 0; \ - (bp)->blk_pad[2] = 0; \ + (bp)->blk_phys_birth = 0; \ (bp)->blk_birth = 0; \ (bp)->blk_fill = 0; \ ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \ } -#define BLK_FILL_ALREADY_FREED (-1ULL) - /* * Note: the byteorder is either 0 or -1, both of which are palindromes. * This simplifies the endianness handling a bit. @@ -309,17 +338,81 @@ typedef struct blkptr { #define BP_SPRINTF_LEN 320 +/* + * This macro allows code sharing between zfs, libzpool, and mdb. + * 'func' is either snprintf() or mdb_snprintf(). + * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line. + */ +#define SPRINTF_BLKPTR(func, ws, buf, bp, type, checksum, compress) \ +{ \ + static const char *copyname[] = \ + { "zero", "single", "double", "triple" }; \ + int size = BP_SPRINTF_LEN; \ + int len = 0; \ + int copies = 0; \ + \ + if (bp == NULL) { \ + len = func(buf + len, size - len, "<NULL>"); \ + } else if (BP_IS_HOLE(bp)) { \ + len = func(buf + len, size - len, "<hole>"); \ + } else { \ + for (int d = 0; d < BP_GET_NDVAS(bp); d++) { \ + const dva_t *dva = &bp->blk_dva[d]; \ + if (DVA_IS_VALID(dva)) \ + copies++; \ + len += func(buf + len, size - len, \ + "DVA[%d]=<%llu:%llx:%llx>%c", d, \ + (u_longlong_t)DVA_GET_VDEV(dva), \ + (u_longlong_t)DVA_GET_OFFSET(dva), \ + (u_longlong_t)DVA_GET_ASIZE(dva), \ + ws); \ + } \ + if (BP_IS_GANG(bp) && \ + DVA_GET_ASIZE(&bp->blk_dva[2]) <= \ + DVA_GET_ASIZE(&bp->blk_dva[1]) / 2) \ + copies--; \ + len += func(buf + len, size - len, \ + "[L%llu %s] %s %s %s %s %s %s%c" \ + "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c" \ + "cksum=%llx:%llx:%llx:%llx", \ + (u_longlong_t)BP_GET_LEVEL(bp), \ + type, \ + checksum, \ + compress, \ + BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE", \ + BP_IS_GANG(bp) ? "gang" : "contiguous", \ + BP_GET_DEDUP(bp) ? "dedup" : "unique", \ + copyname[copies], \ + ws, \ + (u_longlong_t)BP_GET_LSIZE(bp), \ + (u_longlong_t)BP_GET_PSIZE(bp), \ + (u_longlong_t)bp->blk_birth, \ + (u_longlong_t)BP_PHYSICAL_BIRTH(bp), \ + (u_longlong_t)bp->blk_fill, \ + ws, \ + (u_longlong_t)bp->blk_cksum.zc_word[0], \ + (u_longlong_t)bp->blk_cksum.zc_word[1], \ + (u_longlong_t)bp->blk_cksum.zc_word[2], \ + (u_longlong_t)bp->blk_cksum.zc_word[3]); \ + } \ + ASSERT(len < size); \ +} + #include <sys/dmu.h> #define BP_GET_BUFC_TYPE(bp) \ (((BP_GET_LEVEL(bp) > 0) || (dmu_ot[BP_GET_TYPE(bp)].ot_metadata)) ? \ ARC_BUFC_METADATA : ARC_BUFC_DATA); -/* - * Routines found in spa.c - */ + +typedef enum spa_import_type { + SPA_IMPORT_EXISTING, + SPA_IMPORT_ASSEMBLE +} spa_import_type_t; /* state manipulation functions */ extern int spa_open(const char *pool, spa_t **, void *tag); +extern int spa_open_rewind(const char *pool, spa_t **, void *tag, + nvlist_t *policy, nvlist_t **config); extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot, size_t buflen); extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props, @@ -338,6 +431,8 @@ extern void spa_async_suspend(spa_t *spa); extern void spa_async_resume(spa_t *spa); extern spa_t *spa_inject_addref(char *pool); extern void spa_inject_delref(spa_t *spa); +extern void spa_scan_stat_init(spa_t *spa); +extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps); #define SPA_ASYNC_CONFIG_UPDATE 0x01 #define SPA_ASYNC_REMOVE 0x02 @@ -345,6 +440,14 @@ extern void spa_inject_delref(spa_t *spa); #define SPA_ASYNC_RESILVER_DONE 0x08 #define SPA_ASYNC_RESILVER 0x10 #define SPA_ASYNC_AUTOEXPAND 0x20 +#define SPA_ASYNC_REMOVE_DONE 0x40 +#define SPA_ASYNC_REMOVE_STOP 0x80 + +/* + * Controls the behavior of spa_vdev_remove(). + */ +#define SPA_REMOVE_UNSPARE 0x01 +#define SPA_REMOVE_DONE 0x02 /* device manipulation */ extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); @@ -353,8 +456,11 @@ extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done); extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare); +extern boolean_t spa_vdev_remove_active(spa_t *spa); extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath); extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru); +extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, + nvlist_t *props, boolean_t exp); /* spare state (which is global across all pools) */ extern void spa_spare_add(vdev_t *vd); @@ -368,15 +474,23 @@ extern void spa_l2cache_remove(vdev_t *vd); extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool); extern void spa_l2cache_activate(vdev_t *vd); extern void spa_l2cache_drop(spa_t *spa); -extern void spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc); -/* scrubbing */ -extern int spa_scrub(spa_t *spa, pool_scrub_type_t type); +/* scanning */ +extern int spa_scan(spa_t *spa, pool_scan_func_t func); +extern int spa_scan_stop(spa_t *spa); /* spa syncing */ extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */ extern void spa_sync_allpools(void); +/* + * DEFERRED_FREE must be large enough that regular blocks are not + * deferred. XXX so can't we change it back to 1? + */ +#define SYNC_PASS_DEFERRED_FREE 2 /* defer frees after this pass */ +#define SYNC_PASS_DONT_COMPRESS 4 /* don't compress after this pass */ +#define SYNC_PASS_REWRITE 1 /* rewrite new bps after this pass */ + /* spa namespace global mutex */ extern kmutex_t spa_namespace_lock; @@ -394,7 +508,6 @@ extern void spa_config_set(spa_t *spa, nvlist_t *config); extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats); extern void spa_config_update(spa_t *spa, int what); -extern void spa_config_update_common(spa_t *spa, int what, boolean_t isroot); /* * Miscellaneous SPA routines in spa_misc.c @@ -402,7 +515,7 @@ extern void spa_config_update_common(spa_t *spa, int what, boolean_t isroot); /* Namespace manipulation */ extern spa_t *spa_lookup(const char *name); -extern spa_t *spa_add(const char *name, const char *altroot); +extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot); extern void spa_remove(spa_t *spa); extern spa_t *spa_next(spa_t *prev); @@ -411,6 +524,7 @@ extern void spa_open_ref(spa_t *spa, void *tag); extern void spa_close(spa_t *spa, void *tag); extern boolean_t spa_refcount_zero(spa_t *spa); +#define SCL_NONE 0x00 #define SCL_CONFIG 0x01 #define SCL_STATE 0x02 #define SCL_L2ARC 0x04 /* hack until L2ARC 2.0 */ @@ -430,12 +544,30 @@ extern int spa_config_held(spa_t *spa, int locks, krw_t rw); /* Pool vdev add/remove lock */ extern uint64_t spa_vdev_enter(spa_t *spa); +extern uint64_t spa_vdev_config_enter(spa_t *spa); +extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, + int error, char *tag); extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error); /* Pool vdev state change lock */ -extern void spa_vdev_state_enter(spa_t *spa); +extern void spa_vdev_state_enter(spa_t *spa, int oplock); extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error); +/* Log state */ +typedef enum spa_log_state { + SPA_LOG_UNKNOWN = 0, /* unknown log state */ + SPA_LOG_MISSING, /* missing log(s) */ + SPA_LOG_CLEAR, /* clear the log(s) */ + SPA_LOG_GOOD, /* log(s) are good */ +} spa_log_state_t; + +extern spa_log_state_t spa_get_log_state(spa_t *spa); +extern void spa_set_log_state(spa_t *spa, spa_log_state_t state); +extern int spa_offline_log(spa_t *spa); + +/* Log claim callback */ +extern void spa_claim_notify(zio_t *zio); + /* Accessor functions */ extern boolean_t spa_shutting_down(spa_t *spa); extern struct dsl_pool *spa_get_dsl(spa_t *spa); @@ -447,18 +579,26 @@ extern char *spa_name(spa_t *spa); extern uint64_t spa_guid(spa_t *spa); extern uint64_t spa_last_synced_txg(spa_t *spa); extern uint64_t spa_first_txg(spa_t *spa); +extern uint64_t spa_syncing_txg(spa_t *spa); extern uint64_t spa_version(spa_t *spa); extern pool_state_t spa_state(spa_t *spa); +extern spa_load_state_t spa_load_state(spa_t *spa); extern uint64_t spa_freeze_txg(spa_t *spa); -extern uint64_t spa_get_alloc(spa_t *spa); -extern uint64_t spa_get_space(spa_t *spa); -extern uint64_t spa_get_dspace(spa_t *spa); extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize); +extern uint64_t spa_get_dspace(spa_t *spa); +extern void spa_update_dspace(spa_t *spa); extern uint64_t spa_version(spa_t *spa); +extern boolean_t spa_deflate(spa_t *spa); +extern metaslab_class_t *spa_normal_class(spa_t *spa); +extern metaslab_class_t *spa_log_class(spa_t *spa); extern int spa_max_replication(spa_t *spa); +extern int spa_prev_software_version(spa_t *spa); extern int spa_busy(void); extern uint8_t spa_get_failmode(spa_t *spa); extern boolean_t spa_suspended(spa_t *spa); +extern uint64_t spa_bootfs(spa_t *spa); +extern uint64_t spa_delegation(spa_t *spa); +extern objset_t *spa_meta_objset(spa_t *spa); /* Miscellaneous support routines */ extern int spa_rename(const char *oldname, const char *newname); @@ -466,18 +606,24 @@ extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid); extern char *spa_strdup(const char *); extern void spa_strfree(char *); extern uint64_t spa_get_random(uint64_t range); -extern void sprintf_blkptr(char *buf, int len, const blkptr_t *bp); +extern uint64_t spa_generate_guid(spa_t *spa); +extern void sprintf_blkptr(char *buf, const blkptr_t *bp); extern void spa_freeze(spa_t *spa); extern void spa_upgrade(spa_t *spa, uint64_t version); extern void spa_evict_all(void); extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t l2cache); extern boolean_t spa_has_spare(spa_t *, uint64_t guid); -extern uint64_t bp_get_dasize(spa_t *spa, const blkptr_t *bp); +extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva); +extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp); +extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp); extern boolean_t spa_has_slogs(spa_t *spa); extern boolean_t spa_is_root(spa_t *spa); extern boolean_t spa_writeable(spa_t *spa); +extern void spa_rewind_data_to_nvlist(spa_t *spa, nvlist_t *to); + extern int spa_mode(spa_t *spa); +extern uint64_t strtonum(const char *str, char **nptr); /* history logging */ typedef enum history_log_type { @@ -487,10 +633,11 @@ typedef enum history_log_type { } history_log_type_t; typedef struct history_arg { - const char *ha_history_str; + char *ha_history_str; history_log_type_t ha_log_type; history_internal_events_t ha_event; - char ha_zone[MAXPATHLEN]; + char *ha_zone; + uid_t ha_uid; } history_arg_t; extern char *spa_his_ievent_table[]; @@ -500,17 +647,17 @@ extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read, char *his_buf); extern int spa_history_log(spa_t *spa, const char *his_buf, history_log_type_t what); -extern void spa_history_internal_log(history_internal_events_t event, - spa_t *spa, dmu_tx_t *tx, cred_t *cr, const char *fmt, ...); +extern void spa_history_log_internal(history_internal_events_t event, + spa_t *spa, dmu_tx_t *tx, const char *fmt, ...); extern void spa_history_log_version(spa_t *spa, history_internal_events_t evt); /* error handling */ struct zbookmark; -struct zio; -extern void spa_log_error(spa_t *spa, struct zio *zio); +extern void spa_log_error(spa_t *spa, zio_t *zio); extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd, - struct zio *zio, uint64_t stateoroffset, uint64_t length); + zio_t *zio, uint64_t stateoroffset, uint64_t length); extern void zfs_post_remove(spa_t *spa, vdev_t *vd); +extern void zfs_post_state_change(spa_t *spa, vdev_t *vd); extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd); extern uint64_t spa_get_errlog_size(spa_t *spa); extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count); @@ -541,7 +688,7 @@ extern void spa_event_notify(spa_t *spa, vdev_t *vdev, const char *name); #define dprintf_bp(bp, fmt, ...) do { \ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \ - sprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp)); \ + sprintf_blkptr(__blkbuf, (bp)); \ dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf); \ kmem_free(__blkbuf, BP_SPRINTF_LEN); \ } \ diff --git a/module/zfs/include/sys/spa_impl.h b/module/zfs/include/sys/spa_impl.h index 84da68488..e2e1851ec 100644 --- a/module/zfs/include/sys/spa_impl.h +++ b/module/zfs/include/sys/spa_impl.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_SPA_IMPL_H @@ -36,6 +35,7 @@ #include <sys/avl.h> #include <sys/refcount.h> #include <sys/bplist.h> +#include <sys/bpobj.h> #ifdef __cplusplus extern "C" { @@ -78,19 +78,33 @@ typedef struct spa_config_dirent { char *scd_path; } spa_config_dirent_t; -typedef enum spa_log_state { - SPA_LOG_UNKNOWN = 0, /* unknown log state */ - SPA_LOG_MISSING, /* missing log(s) */ - SPA_LOG_CLEAR, /* clear the log(s) */ - SPA_LOG_GOOD, /* log(s) are good */ -} spa_log_state_t; - enum zio_taskq_type { ZIO_TASKQ_ISSUE = 0, + ZIO_TASKQ_ISSUE_HIGH, ZIO_TASKQ_INTERRUPT, + ZIO_TASKQ_INTERRUPT_HIGH, ZIO_TASKQ_TYPES }; +/* + * State machine for the zpool-pooname process. The states transitions + * are done as follows: + * + * From To Routine + * PROC_NONE -> PROC_CREATED spa_activate() + * PROC_CREATED -> PROC_ACTIVE spa_thread() + * PROC_ACTIVE -> PROC_DEACTIVATE spa_deactivate() + * PROC_DEACTIVATE -> PROC_GONE spa_thread() + * PROC_GONE -> PROC_NONE spa_deactivate() + */ +typedef enum spa_proc_state { + SPA_PROC_NONE, /* spa_proc = &p0, no process created */ + SPA_PROC_CREATED, /* spa_activate() has proc, is waiting */ + SPA_PROC_ACTIVE, /* taskqs created, spa_proc set */ + SPA_PROC_DEACTIVATE, /* spa_deactivate() requests process exit */ + SPA_PROC_GONE /* spa_thread() is exiting, spa_proc = &p0 */ +} spa_proc_state_t; + struct spa { /* * Fields protected by spa_namespace_lock. @@ -99,6 +113,7 @@ struct spa { avl_node_t spa_avl; /* node in spa_namespace_avl */ nvlist_t *spa_config; /* last synced config */ nvlist_t *spa_config_syncing; /* currently syncing config */ + nvlist_t *spa_config_splitting; /* config for splitting */ uint64_t spa_config_txg; /* txg of last config change */ int spa_sync_pass; /* iterate-to-convergence */ pool_state_t spa_state; /* pool state */ @@ -113,6 +128,8 @@ struct spa { uint64_t spa_first_txg; /* first txg after spa_open() */ uint64_t spa_final_txg; /* txg of export/destroy */ uint64_t spa_freeze_txg; /* freeze pool at this txg */ + uint64_t spa_load_max_txg; /* best initial ub_txg */ + uint64_t spa_claim_max_txg; /* highest claimed birth txg */ objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */ txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */ vdev_t *spa_root_vdev; /* top-level vdev container */ @@ -122,21 +139,24 @@ struct spa { spa_aux_vdev_t spa_spares; /* hot spares */ spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */ uint64_t spa_config_object; /* MOS object for pool config */ + uint64_t spa_config_generation; /* config generation number */ uint64_t spa_syncing_txg; /* txg currently syncing */ - uint64_t spa_sync_bplist_obj; /* object for deferred frees */ - bplist_t spa_sync_bplist; /* deferred-free bplist */ + bpobj_t spa_deferred_bpobj; /* deferred-free bplist */ + bplist_t spa_free_bplist[TXG_SIZE]; /* bplist of stuff to free */ uberblock_t spa_ubsync; /* last synced uberblock */ uberblock_t spa_uberblock; /* current uberblock */ + boolean_t spa_extreme_rewind; /* rewind past deferred frees */ kmutex_t spa_scrub_lock; /* resilver/scrub lock */ uint64_t spa_scrub_inflight; /* in-flight scrub I/Os */ uint64_t spa_scrub_maxinflight; /* max in-flight scrub I/Os */ - uint64_t spa_scrub_errors; /* scrub I/O error count */ kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */ uint8_t spa_scrub_active; /* active or suspended? */ uint8_t spa_scrub_type; /* type of scrub we're doing */ uint8_t spa_scrub_finished; /* indicator to rotate logs */ uint8_t spa_scrub_started; /* started since last boot */ uint8_t spa_scrub_reopen; /* scrub doing vdev_reopen */ + uint64_t spa_scan_pass_start; /* start time per pass/reboot */ + uint64_t spa_scan_pass_exam; /* examined bytes per pass */ kmutex_t spa_async_lock; /* protect async state */ kthread_t *spa_async_thread; /* thread doing async task */ int spa_async_suspended; /* async tasks suspended */ @@ -144,7 +164,14 @@ struct spa { uint16_t spa_async_tasks; /* async task mask */ char *spa_root; /* alternate root directory */ uint64_t spa_ena; /* spa-wide ereport ENA */ - boolean_t spa_last_open_failed; /* true if last open faled */ + int spa_last_open_failed; /* error if last open failed */ + uint64_t spa_last_ubsync_txg; /* "best" uberblock txg */ + uint64_t spa_last_ubsync_txg_ts; /* timestamp from that ub */ + uint64_t spa_load_txg; /* ub txg that loaded */ + uint64_t spa_load_txg_ts; /* timestamp from that ub */ + uint64_t spa_load_meta_errors; /* verify metadata err count */ + uint64_t spa_load_data_errors; /* verify data err count */ + uint64_t spa_verify_min_txg; /* start txg of verify scrub */ kmutex_t spa_errlog_lock; /* error log lock */ uint64_t spa_errlog_last; /* last error log object */ uint64_t spa_errlog_scrub; /* scrub error log object */ @@ -166,11 +193,27 @@ struct spa { kmutex_t spa_suspend_lock; /* protects suspend_zio_root */ kcondvar_t spa_suspend_cv; /* notification of resume */ uint8_t spa_suspended; /* pool is suspended */ + uint8_t spa_claiming; /* pool is doing zil_claim() */ boolean_t spa_is_root; /* pool is root */ int spa_minref; /* num refs when first opened */ int spa_mode; /* FREAD | FWRITE */ spa_log_state_t spa_log_state; /* log state */ uint64_t spa_autoexpand; /* lun expansion on/off */ + ddt_t *spa_ddt[ZIO_CHECKSUM_FUNCTIONS]; /* in-core DDTs */ + uint64_t spa_ddt_stat_object; /* DDT statistics */ + uint64_t spa_dedup_ditto; /* dedup ditto threshold */ + uint64_t spa_dedup_checksum; /* default dedup checksum */ + uint64_t spa_dspace; /* dspace in normal class */ + kmutex_t spa_vdev_top_lock; /* dueling offline/remove */ + kmutex_t spa_proc_lock; /* protects spa_proc* */ + kcondvar_t spa_proc_cv; /* spa_proc_state transitions */ + spa_proc_state_t spa_proc_state; /* see definition */ + struct proc *spa_proc; /* "zpool-poolname" process */ + uint64_t spa_did; /* if procp != p0, did of t1 */ + boolean_t spa_autoreplace; /* autoreplace set in open */ + int spa_vdev_locks; /* locks grabbed */ + uint64_t spa_creation_version; /* version at pool creation */ + uint64_t spa_prev_software_version; /* * spa_refcnt & spa_config_lock must be the last elements * because refcount_t changes size based on compilation options. @@ -183,12 +226,6 @@ struct spa { extern const char *spa_config_path; -#define BOOTFS_COMPRESS_VALID(compress) \ - ((compress) == ZIO_COMPRESS_LZJB || \ - ((compress) == ZIO_COMPRESS_ON && \ - ZIO_COMPRESS_ON_VALUE == ZIO_COMPRESS_LZJB) || \ - (compress) == ZIO_COMPRESS_OFF) - #ifdef __cplusplus } #endif diff --git a/module/zfs/include/sys/space_map.h b/module/zfs/include/sys/space_map.h index a682bbd40..6f935c9db 100644 --- a/module/zfs/include/sys/space_map.h +++ b/module/zfs/include/sys/space_map.h @@ -77,6 +77,7 @@ struct space_map_ops { void (*smop_claim)(space_map_t *sm, uint64_t start, uint64_t size); void (*smop_free)(space_map_t *sm, uint64_t start, uint64_t size); uint64_t (*smop_max)(space_map_t *sm); + boolean_t (*smop_fragmented)(space_map_t *sm); }; /* diff --git a/module/zfs/include/sys/txg.h b/module/zfs/include/sys/txg.h index 23bdff211..e323d5efa 100644 --- a/module/zfs/include/sys/txg.h +++ b/module/zfs/include/sys/txg.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_TXG_H #define _SYS_TXG_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/spa.h> #include <sys/zfs_context.h> @@ -41,6 +39,9 @@ extern "C" { #define TXG_INITIAL TXG_SIZE /* initial txg */ #define TXG_IDX (txg & TXG_MASK) +/* Number of txgs worth of frees we defer adding to in-core spacemaps */ +#define TXG_DEFER_SIZE 2 + #define TXG_WAIT 1ULL #define TXG_NOWAIT 2ULL @@ -71,8 +72,7 @@ extern void txg_sync_stop(struct dsl_pool *dp); extern uint64_t txg_hold_open(struct dsl_pool *dp, txg_handle_t *txghp); extern void txg_rele_to_quiesce(txg_handle_t *txghp); extern void txg_rele_to_sync(txg_handle_t *txghp); -extern void txg_suspend(struct dsl_pool *dp); -extern void txg_resume(struct dsl_pool *dp); +extern void txg_register_callbacks(txg_handle_t *txghp, list_t *tx_callbacks); /* * Delay the caller by the specified number of ticks or until @@ -117,6 +117,7 @@ extern void txg_list_create(txg_list_t *tl, size_t offset); extern void txg_list_destroy(txg_list_t *tl); extern int txg_list_empty(txg_list_t *tl, uint64_t txg); extern int txg_list_add(txg_list_t *tl, void *p, uint64_t txg); +extern int txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg); extern void *txg_list_remove(txg_list_t *tl, uint64_t txg); extern void *txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg); extern int txg_list_member(txg_list_t *tl, void *p, uint64_t txg); diff --git a/module/zfs/include/sys/txg_impl.h b/module/zfs/include/sys/txg_impl.h index 7413c662b..7b356eac1 100644 --- a/module/zfs/include/sys/txg_impl.h +++ b/module/zfs/include/sys/txg_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -37,13 +37,13 @@ struct tx_cpu { kmutex_t tc_lock; kcondvar_t tc_cv[TXG_SIZE]; uint64_t tc_count[TXG_SIZE]; + list_t tc_callbacks[TXG_SIZE]; /* commit cb list */ char tc_pad[16]; }; typedef struct tx_state { tx_cpu_t *tx_cpu; /* protects right to enter txg */ kmutex_t tx_sync_lock; /* protects tx_state_t */ - krwlock_t tx_suspend; uint64_t tx_open_txg; /* currently open txg id */ uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */ uint64_t tx_syncing_txg; /* currently syncing txg id */ @@ -64,6 +64,8 @@ typedef struct tx_state { kthread_t *tx_sync_thread; kthread_t *tx_quiesce_thread; + + taskq_t *tx_commit_cb_taskq; /* commit callback taskq */ } tx_state_t; #ifdef __cplusplus diff --git a/module/zfs/include/sys/uberblock.h b/module/zfs/include/sys/uberblock.h index 93d936ae4..b5bb91573 100644 --- a/module/zfs/include/sys/uberblock.h +++ b/module/zfs/include/sys/uberblock.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,19 +19,16 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_UBERBLOCK_H #define _SYS_UBERBLOCK_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/spa.h> #include <sys/vdev.h> #include <sys/zio.h> -#include <sys/zio_checksum.h> #ifdef __cplusplus extern "C" { diff --git a/module/zfs/include/sys/uberblock_impl.h b/module/zfs/include/sys/uberblock_impl.h index b49df8ae0..6ab6aa313 100644 --- a/module/zfs/include/sys/uberblock_impl.h +++ b/module/zfs/include/sys/uberblock_impl.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_UBERBLOCK_IMPL_H @@ -33,11 +32,6 @@ extern "C" { #endif /* - * For zdb use and debugging purposes only - */ -extern uint64_t ub_max_txg; - -/* * The uberblock version is incremented whenever an incompatible on-disk * format change is made to the SPA, DMU, or ZAP. * @@ -57,6 +51,9 @@ struct uberblock { uint64_t ub_guid_sum; /* sum of all vdev guids */ uint64_t ub_timestamp; /* UTC time of last sync */ blkptr_t ub_rootbp; /* MOS objset_phys_t */ + + /* highest SPA_VERSION supported by software that wrote this txg */ + uint64_t ub_software_version; }; #ifdef __cplusplus diff --git a/module/zfs/include/sys/vdev.h b/module/zfs/include/sys/vdev.h index 7e53f62d2..941f234dc 100644 --- a/module/zfs/include/sys/vdev.h +++ b/module/zfs/include/sys/vdev.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_VDEV_H @@ -47,7 +46,8 @@ typedef enum vdev_dtl_type { extern boolean_t zfs_nocacheflush; extern int vdev_open(vdev_t *); -extern void vdev_open_children(vdev_t *vd); +extern void vdev_open_children(vdev_t *); +extern boolean_t vdev_uses_zvols(vdev_t *); extern int vdev_validate(vdev_t *); extern void vdev_close(vdev_t *); extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace); @@ -69,28 +69,31 @@ extern boolean_t vdev_dtl_required(vdev_t *vd); extern boolean_t vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp); +extern void vdev_hold(vdev_t *); +extern void vdev_rele(vdev_t *); + extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg); extern void vdev_metaslab_fini(vdev_t *vd); extern void vdev_metaslab_set_size(vdev_t *); extern void vdev_expand(vdev_t *vd, uint64_t txg); +extern void vdev_split(vdev_t *vd); + extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); extern void vdev_clear_stats(vdev_t *vd); extern void vdev_stat_update(zio_t *zio, uint64_t psize); -extern void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, - boolean_t complete); -extern int vdev_getspec(spa_t *spa, uint64_t vdev, char **vdev_spec); +extern void vdev_scan_stat_init(vdev_t *vd); extern void vdev_propagate_state(vdev_t *vd); extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux); -extern void vdev_space_update(vdev_t *vd, int64_t space_delta, - int64_t alloc_delta, boolean_t update_root); +extern void vdev_space_update(vdev_t *vd, + int64_t alloc_delta, int64_t defer_delta, int64_t space_delta); extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize); -extern int vdev_fault(spa_t *spa, uint64_t guid); -extern int vdev_degrade(spa_t *spa, uint64_t guid); +extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux); +extern int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux); extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *); extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags); @@ -121,8 +124,15 @@ extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg, extern void vdev_state_dirty(vdev_t *vd); extern void vdev_state_clean(vdev_t *vd); +typedef enum vdev_config_flag { + VDEV_CONFIG_SPARE = 1 << 0, + VDEV_CONFIG_L2CACHE = 1 << 1, + VDEV_CONFIG_REMOVING = 1 << 2 +} vdev_config_flag_t; + +extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config); extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd, - boolean_t getstats, boolean_t isspare, boolean_t isl2cache); + boolean_t getstats, vdev_config_flag_t flags); /* * Label routines @@ -138,7 +148,8 @@ typedef enum { VDEV_LABEL_REPLACE, /* replace an existing device */ VDEV_LABEL_SPARE, /* add a new hot spare */ VDEV_LABEL_REMOVE, /* remove an existing device */ - VDEV_LABEL_L2CACHE /* add an L2ARC cache device */ + VDEV_LABEL_L2CACHE, /* add an L2ARC cache device */ + VDEV_LABEL_SPLIT /* generating new label for split-off dev */ } vdev_labeltype_t; extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason); diff --git a/module/zfs/include/sys/vdev_impl.h b/module/zfs/include/sys/vdev_impl.h index 23780430d..2b886bc58 100644 --- a/module/zfs/include/sys/vdev_impl.h +++ b/module/zfs/include/sys/vdev_impl.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_VDEV_IMPL_H @@ -62,6 +61,8 @@ typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize); typedef int vdev_io_start_func_t(zio_t *zio); typedef void vdev_io_done_func_t(zio_t *zio); typedef void vdev_state_change_func_t(vdev_t *vd, int, int); +typedef void vdev_hold_func_t(vdev_t *vd); +typedef void vdev_rele_func_t(vdev_t *vd); typedef struct vdev_ops { vdev_open_func_t *vdev_op_open; @@ -70,6 +71,8 @@ typedef struct vdev_ops { vdev_io_start_func_t *vdev_op_io_start; vdev_io_done_func_t *vdev_op_io_done; vdev_state_change_func_t *vdev_op_state_change; + vdev_hold_func_t *vdev_op_hold; + vdev_rele_func_t *vdev_op_rele; char vdev_op_type[16]; boolean_t vdev_op_leaf; } vdev_ops_t; @@ -112,6 +115,7 @@ struct vdev { uint64_t vdev_id; /* child number in vdev parent */ uint64_t vdev_guid; /* unique ID for this vdev */ uint64_t vdev_guid_sum; /* self guid + all child guids */ + uint64_t vdev_orig_guid; /* orig. guid prior to remove */ uint64_t vdev_asize; /* allocatable device capacity */ uint64_t vdev_min_asize; /* min acceptable asize */ uint64_t vdev_ashift; /* block alignment shift */ @@ -120,6 +124,8 @@ struct vdev { vdev_ops_t *vdev_ops; /* vdev operations */ spa_t *vdev_spa; /* spa for this vdev */ void *vdev_tsd; /* type-specific data */ + vnode_t *vdev_name_vp; /* vnode for pathname */ + vnode_t *vdev_devid_vp; /* vnode for devid */ vdev_t *vdev_top; /* top-level vdev */ vdev_t *vdev_parent; /* parent vdev */ vdev_t **vdev_child; /* array of children */ @@ -127,8 +133,10 @@ struct vdev { space_map_t vdev_dtl[DTL_TYPES]; /* in-core dirty time logs */ vdev_stat_t vdev_stat; /* virtual device statistics */ boolean_t vdev_expanding; /* expand the vdev? */ + boolean_t vdev_reopening; /* reopen in progress? */ int vdev_open_error; /* error on last open */ kthread_t *vdev_open_thread; /* thread opening children */ + uint64_t vdev_crtxg; /* txg when top-level was added */ /* * Top-level vdev state. @@ -143,10 +151,12 @@ struct vdev { txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ boolean_t vdev_remove_wanted; /* async remove wanted? */ boolean_t vdev_probe_wanted; /* async probe wanted? */ + uint64_t vdev_removing; /* device is being removed? */ list_node_t vdev_config_dirty_node; /* config dirty list */ list_node_t vdev_state_dirty_node; /* state dirty list */ uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */ uint64_t vdev_islog; /* is an intent log device */ + uint64_t vdev_ishole; /* is a hole in the namespace */ /* * Leaf vdev state. @@ -170,6 +180,8 @@ struct vdev { boolean_t vdev_nowritecache; /* true if flushwritecache failed */ boolean_t vdev_checkremove; /* temporary online test */ boolean_t vdev_forcefault; /* force online fault */ + boolean_t vdev_splitting; /* split or repair in progress */ + boolean_t vdev_delayed_close; /* delayed device close? */ uint8_t vdev_tmpoffline; /* device taken offline temporarily? */ uint8_t vdev_detached; /* device detached? */ uint8_t vdev_cant_read; /* vdev is failing all reads */ @@ -180,6 +192,7 @@ struct vdev { vdev_cache_t vdev_cache; /* physical block cache */ spa_aux_vdev_t *vdev_aux; /* for l2cache vdevs */ zio_t *vdev_probe_zio; /* root of current probe */ + vdev_aux_t vdev_label_aux; /* on-disk aux state */ /* * For DTrace to work in userland (libzpool) context, these fields must @@ -193,6 +206,8 @@ struct vdev { kmutex_t vdev_probe_lock; /* protects vdev_probe_zio */ }; +#define VDEV_RAIDZ_MAXPARITY 3 + #define VDEV_PAD_SIZE (8 << 10) /* 2 padding areas (vl_pad1 and vl_pad2) to skip */ #define VDEV_SKIP_SIZE VDEV_PAD_SIZE * 2 @@ -208,8 +223,8 @@ struct vdev { #define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd)) typedef struct vdev_phys { - char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_block_tail_t)]; - zio_block_tail_t vp_zbt; + char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_eck_t)]; + zio_eck_t vp_zbt; } vdev_phys_t; typedef struct vdev_label { @@ -244,10 +259,13 @@ typedef struct vdev_label { #define VDEV_ALLOC_SPARE 2 #define VDEV_ALLOC_L2CACHE 3 #define VDEV_ALLOC_ROOTPOOL 4 +#define VDEV_ALLOC_SPLIT 5 /* * Allocate or free a vdev */ +extern vdev_t *vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, + vdev_ops_t *ops); extern int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *config, vdev_t *parent, uint_t id, int alloctype); extern void vdev_free(vdev_t *vd); @@ -264,7 +282,7 @@ extern void vdev_remove_parent(vdev_t *cvd); /* * vdev sync load and sync */ -extern void vdev_load_log_state(vdev_t *vd, nvlist_t *nv); +extern void vdev_load_log_state(vdev_t *nvd, vdev_t *ovd); extern void vdev_load(vdev_t *vd); extern void vdev_sync(vdev_t *vd, uint64_t txg); extern void vdev_sync_done(vdev_t *vd, uint64_t txg); @@ -280,6 +298,7 @@ extern vdev_ops_t vdev_raidz_ops; extern vdev_ops_t vdev_disk_ops; extern vdev_ops_t vdev_file_ops; extern vdev_ops_t vdev_missing_ops; +extern vdev_ops_t vdev_hole_ops; extern vdev_ops_t vdev_spare_ops; /* diff --git a/module/zfs/include/sys/zap.h b/module/zfs/include/sys/zap.h index 967174be4..a1130bbba 100644 --- a/module/zfs/include/sys/zap.h +++ b/module/zfs/include/sys/zap.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_ZAP_H @@ -101,6 +100,18 @@ typedef enum matchtype MT_FIRST } matchtype_t; +typedef enum zap_flags { + /* Use 64-bit hash value (serialized cursors will always use 64-bits) */ + ZAP_FLAG_HASH64 = 1 << 0, + /* Key is binary, not string (zap_add_uint64() can be used) */ + ZAP_FLAG_UINT64_KEY = 1 << 1, + /* + * First word of key (which must be an array of uint64) is + * already randomly distributed. + */ + ZAP_FLAG_PRE_HASHED_KEY = 1 << 2, +} zap_flags_t; + /* * Create a new zapobj with no attributes and return its object number. * MT_EXACT will cause the zap object to only support MT_EXACT lookups, @@ -118,6 +129,9 @@ uint64_t zap_create(objset_t *ds, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); uint64_t zap_create_norm(objset_t *ds, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); +uint64_t zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, + dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); /* * Create a new zapobj with no attributes from the given (unallocated) @@ -180,6 +194,11 @@ int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf, matchtype_t mt, char *realname, int rn_len, boolean_t *normalization_conflictp); +int zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf); +int zap_contains(objset_t *ds, uint64_t zapobj, const char *name); +int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints); int zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add, uint64_t *towrite, uint64_t *tooverwrite); @@ -190,9 +209,12 @@ int zap_count_write(objset_t *os, uint64_t zapobj, const char *name, * If an attribute with the given name already exists, the call will * fail and return EEXIST. */ -int zap_add(objset_t *ds, uint64_t zapobj, const char *name, +int zap_add(objset_t *ds, uint64_t zapobj, const char *key, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); +int zap_add_uint64(objset_t *ds, uint64_t zapobj, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx); /* * Set the attribute with the given name to the given value. If an @@ -204,6 +226,9 @@ int zap_add(objset_t *ds, uint64_t zapobj, const char *name, */ int zap_update(objset_t *ds, uint64_t zapobj, const char *name, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); +int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); /* * Get the length (in integers) and the integer size of the specified @@ -214,6 +239,8 @@ int zap_update(objset_t *ds, uint64_t zapobj, const char *name, */ int zap_length(objset_t *ds, uint64_t zapobj, const char *name, uint64_t *integer_size, uint64_t *num_integers); +int zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, uint64_t *integer_size, uint64_t *num_integers); /* * Remove the specified attribute. @@ -224,6 +251,8 @@ int zap_length(objset_t *ds, uint64_t zapobj, const char *name, int zap_remove(objset_t *ds, uint64_t zapobj, const char *name, dmu_tx_t *tx); int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name, matchtype_t mt, dmu_tx_t *tx); +int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, dmu_tx_t *tx); /* * Returns (in *count) the number of attributes in the specified zap @@ -231,7 +260,6 @@ int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name, */ int zap_count(objset_t *ds, uint64_t zapobj, uint64_t *count); - /* * Returns (in name) the name of the entry whose (value & mask) * (za_first_integer) is value, or ENOENT if not found. The string @@ -248,6 +276,14 @@ int zap_value_search(objset_t *os, uint64_t zapobj, */ int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx); +/* Same as zap_join, but set the values to 'value'. */ +int zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj, + uint64_t value, dmu_tx_t *tx); + +/* Same as zap_join, but add together any duplicated entries. */ +int zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj, + dmu_tx_t *tx); + /* * Manipulate entries where the name + value are the "same" (the name is * a stringified version of the value). @@ -255,6 +291,23 @@ int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx); int zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx); int zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx); int zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value); +int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, + dmu_tx_t *tx); + +/* Here the key is an int and the value is a different int. */ +int zap_add_int_key(objset_t *os, uint64_t obj, + uint64_t key, uint64_t value, dmu_tx_t *tx); +int zap_lookup_int_key(objset_t *os, uint64_t obj, + uint64_t key, uint64_t *valuep); + +/* + * They name is a stringified version of key; increment its value by + * delta. Zero values will be zap_remove()-ed. + */ +int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, + dmu_tx_t *tx); +int zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta, + dmu_tx_t *tx); struct zap; struct zap_leaf; @@ -264,6 +317,7 @@ typedef struct zap_cursor { struct zap *zc_zap; struct zap_leaf *zc_leaf; uint64_t zc_zapobj; + uint64_t zc_serialized; uint64_t zc_hash; uint32_t zc_cd; } zap_cursor_t; @@ -315,6 +369,11 @@ void zap_cursor_advance(zap_cursor_t *zc); uint64_t zap_cursor_serialize(zap_cursor_t *zc); /* + * Advance the cursor to the attribute having the given key. + */ +int zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt); + +/* * Initialize a zap cursor pointing to the position recorded by * zap_cursor_serialize (in the "serialized" argument). You can also * use a "serialized" argument of 0 to start at the beginning of the diff --git a/module/zfs/include/sys/zap_impl.h b/module/zfs/include/sys/zap_impl.h index c86bb16de..1dc322e02 100644 --- a/module/zfs/include/sys/zap_impl.h +++ b/module/zfs/include/sys/zap_impl.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_ZAP_IMPL_H @@ -40,13 +39,13 @@ extern int fzap_default_block_shift; #define FZAP_BLOCK_SHIFT(zap) ((zap)->zap_f.zap_block_shift) -#define ZAP_MAXCD (uint32_t)(-1) -#define ZAP_HASHBITS 28 #define MZAP_ENT_LEN 64 #define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2) #define MZAP_MAX_BLKSHIFT SPA_MAXBLOCKSHIFT #define MZAP_MAX_BLKSZ (1 << MZAP_MAX_BLKSHIFT) +#define ZAP_NEED_CD (-1U) + typedef struct mzap_ent_phys { uint64_t mze_value; uint32_t mze_cd; @@ -67,9 +66,11 @@ typedef struct mzap_ent { avl_node_t mze_node; int mze_chunkid; uint64_t mze_hash; - mzap_ent_phys_t mze_phys; + uint32_t mze_cd; /* copy from mze_phys->mze_cd */ } mzap_ent_t; +#define MZE_PHYS(zap, mze) \ + (&(zap)->zap_m.zap_phys->mz_chunk[(mze)->mze_chunkid]) /* * The (fat) zap is stored in one object. It is an array of @@ -127,6 +128,7 @@ typedef struct zap_phys { uint64_t zap_num_entries; /* number of entries */ uint64_t zap_salt; /* salt to stir into hash function */ uint64_t zap_normflags; /* flags for u8_textprep_str() */ + uint64_t zap_flags; /* zap_flags_t */ /* * This structure is followed by padding, and then the embedded * pointer table. The embedded pointer table takes up second @@ -168,10 +170,13 @@ typedef struct zap { typedef struct zap_name { zap_t *zn_zap; - const char *zn_name_orij; + int zn_key_intlen; + const void *zn_key_orig; + int zn_key_orig_numints; + const void *zn_key_norm; + int zn_key_norm_numints; uint64_t zn_hash; matchtype_t zn_matchtype; - const char *zn_name_norm; char zn_normbuf[ZAP_MAXNAMELEN]; } zap_name_t; @@ -183,8 +188,11 @@ int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp); void zap_unlockdir(zap_t *zap); void zap_evict(dmu_buf_t *db, void *vmzap); -zap_name_t *zap_name_alloc(zap_t *zap, const char *name, matchtype_t mt); +zap_name_t *zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt); void zap_name_free(zap_name_t *zn); +int zap_hashbits(zap_t *zap); +uint32_t zap_maxcd(zap_t *zap); +uint64_t zap_getflags(zap_t *zap); #define ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n)))) @@ -193,6 +201,7 @@ int fzap_count(zap_t *zap, uint64_t *count); int fzap_lookup(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, void *buf, char *realname, int rn_len, boolean_t *normalization_conflictp); +void fzap_prefetch(zap_name_t *zn); int fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite, uint64_t *tooverwrite); int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, @@ -209,7 +218,8 @@ void zap_put_leaf(struct zap_leaf *l); int fzap_add_cd(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, const void *val, uint32_t cd, dmu_tx_t *tx); -void fzap_upgrade(zap_t *zap, dmu_tx_t *tx); +void fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags); +int fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn); #ifdef __cplusplus } diff --git a/module/zfs/include/sys/zap_leaf.h b/module/zfs/include/sys/zap_leaf.h index 14144e059..3a3363674 100644 --- a/module/zfs/include/sys/zap_leaf.h +++ b/module/zfs/include/sys/zap_leaf.h @@ -19,20 +19,21 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_ZAP_LEAF_H #define _SYS_ZAP_LEAF_H -#pragma ident "%Z%%M% %I% %E% SMI" +#include <sys/zap.h> #ifdef __cplusplus extern "C" { #endif struct zap; +struct zap_name; +struct zap_stats; #define ZAP_LEAF_MAGIC 0x2AB1EAF @@ -129,12 +130,12 @@ typedef struct zap_leaf_phys { typedef union zap_leaf_chunk { struct zap_leaf_entry { uint8_t le_type; /* always ZAP_CHUNK_ENTRY */ - uint8_t le_int_size; /* size of ints */ + uint8_t le_value_intlen; /* size of value's ints */ uint16_t le_next; /* next entry in hash chain */ uint16_t le_name_chunk; /* first chunk of the name */ - uint16_t le_name_length; /* bytes in name, incl null */ + uint16_t le_name_numints; /* ints in name (incl null) */ uint16_t le_value_chunk; /* first chunk of the value */ - uint16_t le_value_length; /* value length in ints */ + uint16_t le_value_numints; /* value length in ints */ uint32_t le_cd; /* collision differentiator */ uint64_t le_hash; /* hash value of the name */ } l_entry; @@ -177,7 +178,7 @@ typedef struct zap_entry_handle { * value must equal zap_hash(name). */ extern int zap_leaf_lookup(zap_leaf_t *l, - zap_name_t *zn, zap_entry_handle_t *zeh); + struct zap_name *zn, zap_entry_handle_t *zeh); /* * Return a handle to the entry with this hash+cd, or the entry with the @@ -193,10 +194,10 @@ extern int zap_leaf_lookup_closest(zap_leaf_t *l, * num_integers in the attribute. */ extern int zap_entry_read(const zap_entry_handle_t *zeh, - uint8_t integer_size, uint64_t num_integers, void *buf); + uint8_t integer_size, uint64_t num_integers, void *buf); -extern int zap_entry_read_name(const zap_entry_handle_t *zeh, - uint16_t buflen, char *buf); +extern int zap_entry_read_name(struct zap *zap, const zap_entry_handle_t *zeh, + uint16_t buflen, char *buf); /* * Replace the value of an existing entry. @@ -204,7 +205,7 @@ extern int zap_entry_read_name(const zap_entry_handle_t *zeh, * zap_entry_update may fail if it runs out of space (ENOSPC). */ extern int zap_entry_update(zap_entry_handle_t *zeh, - uint8_t integer_size, uint64_t num_integers, const void *buf); + uint8_t integer_size, uint64_t num_integers, const void *buf); /* * Remove an entry. @@ -216,17 +217,16 @@ extern void zap_entry_remove(zap_entry_handle_t *zeh); * belong in this leaf (according to its hash value). Fills in the * entry handle on success. Returns 0 on success or ENOSPC on failure. */ -extern int zap_entry_create(zap_leaf_t *l, - const char *name, uint64_t h, uint32_t cd, - uint8_t integer_size, uint64_t num_integers, const void *buf, - zap_entry_handle_t *zeh); +extern int zap_entry_create(zap_leaf_t *l, struct zap_name *zn, uint32_t cd, + uint8_t integer_size, uint64_t num_integers, const void *buf, + zap_entry_handle_t *zeh); /* * Return true if there are additional entries with the same normalized * form. */ extern boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh, - zap_name_t *zn, const char *name, zap_t *zap); + struct zap_name *zn, const char *name, struct zap *zap); /* * Other stuff. @@ -235,7 +235,8 @@ extern boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh, extern void zap_leaf_init(zap_leaf_t *l, boolean_t sort); extern void zap_leaf_byteswap(zap_leaf_phys_t *buf, int len); extern void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort); -extern void zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs); +extern void zap_leaf_stats(struct zap *zap, zap_leaf_t *l, + struct zap_stats *zs); #ifdef __cplusplus } diff --git a/module/zfs/include/sys/zfs_acl.h b/module/zfs/include/sys/zfs_acl.h index 3488962e2..72e868fab 100644 --- a/module/zfs/include/sys/zfs_acl.h +++ b/module/zfs/include/sys/zfs_acl.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_FS_ZFS_ACL_H @@ -33,6 +32,7 @@ #include <sys/acl.h> #include <sys/dmu.h> #include <sys/zfs_fuid.h> +#include <sys/sa.h> #ifdef __cplusplus extern "C" { @@ -106,12 +106,18 @@ typedef struct zfs_acl_phys_v0 { #define ZFS_ACE_SPACE (sizeof (zfs_oldace_t) * ACE_SLOT_CNT) +/* + * Size of ACL count is always 2 bytes. + * Necessary to for dealing with both V0 ACL and V1 ACL layout + */ +#define ZFS_ACL_COUNT_SIZE (sizeof (uint16_t)) + typedef struct zfs_acl_phys { uint64_t z_acl_extern_obj; /* ext acl pieces */ uint32_t z_acl_size; /* Number of bytes in ACL */ uint16_t z_acl_version; /* acl version */ uint16_t z_acl_count; /* ace count */ - uint8_t z_ace_data[ZFS_ACE_SPACE]; /* space for embedded ACEs */ + uint8_t z_ace_data[ZFS_ACE_SPACE]; /* space for embedded ACEs */ } zfs_acl_phys_t; typedef struct acl_ops { @@ -146,21 +152,26 @@ typedef struct zfs_acl_node { void *z_allocdata; /* pointer to kmem allocated memory */ size_t z_allocsize; /* Size of blob in bytes */ size_t z_size; /* length of ACL data */ - int z_ace_count; /* number of ACEs in this acl node */ + uint64_t z_ace_count; /* number of ACEs in this acl node */ int z_ace_idx; /* ace iterator positioned on */ } zfs_acl_node_t; typedef struct zfs_acl { - int z_acl_count; /* Number of ACEs */ + uint64_t z_acl_count; /* Number of ACEs */ size_t z_acl_bytes; /* Number of bytes in ACL */ uint_t z_version; /* version of ACL */ void *z_next_ace; /* pointer to next ACE */ - int z_hints; /* ACL hints (ZFS_INHERIT_ACE ...) */ + uint64_t z_hints; /* ACL hints (ZFS_INHERIT_ACE ...) */ zfs_acl_node_t *z_curr_node; /* current node iterator is handling */ list_t z_acl; /* chunks of ACE data */ acl_ops_t z_ops; /* ACL operations */ } zfs_acl_t; +typedef struct acl_locator_cb { + zfs_acl_t *cb_aclp; + zfs_acl_node_t *cb_acl_node; +} zfs_acl_locator_cb_t; + #define ACL_DATA_ALLOCED 0x1 #define ZFS_ACL_SIZE(aclcnt) (sizeof (ace_t) * (aclcnt)) @@ -174,6 +185,10 @@ typedef struct zfs_acl_ids { struct zfs_fuid_info *z_fuidp; /* for tracking fuids for log */ } zfs_acl_ids_t; +#define ZFS_EXTERNAL_ACL(zp) \ + (zp->z_is_sa ? 0 : zfs_external_acl(zp)) +#define ZNODE_ACL_VERSION(zp) \ + (zp->z_is_sa ? ZFS_ACL_VERSION_FUID : zfs_znode_acl_version(zp)) /* * Property values for acl_mode and acl_inherit. * @@ -215,6 +230,16 @@ void zfs_acl_free(zfs_acl_t *); int zfs_vsec_2_aclp(struct zfsvfs *, vtype_t, vsecattr_t *, cred_t *, struct zfs_fuid_info **, zfs_acl_t **); int zfs_aclset_common(struct znode *, zfs_acl_t *, cred_t *, dmu_tx_t *); +uint64_t zfs_external_acl(struct znode *); +int zfs_znode_acl_version(struct znode *); +int zfs_acl_size(struct znode *, int *); +zfs_acl_t *zfs_acl_alloc(int); +zfs_acl_node_t *zfs_acl_node_alloc(size_t); +void zfs_acl_xform(struct znode *, zfs_acl_t *, cred_t *); +void zfs_acl_data_locator(void **, uint32_t *, uint32_t, boolean_t, void *); +uint64_t zfs_mode_compute(uint64_t, zfs_acl_t *, + uint64_t *, uint64_t, uint64_t); +int zfs_acl_chown_setattr(struct znode *); #endif diff --git a/module/zfs/include/sys/zfs_context.h b/module/zfs/include/sys/zfs_context.h index 40de32084..558e9e188 100644 --- a/module/zfs/include/sys/zfs_context.h +++ b/module/zfs/include/sys/zfs_context.h @@ -62,6 +62,7 @@ extern "C" { #include <sys/sysevent/eventdefs.h> #include <sys/sysevent/dev.h> #include <sys/fm/util.h> +#include <sys/sunddi.h> #define CPU_SEQID (CPU->cpu_seqid) diff --git a/module/zfs/include/sys/zfs_ctldir.h b/module/zfs/include/sys/zfs_ctldir.h index c15c946d5..f88ef95fd 100644 --- a/module/zfs/include/sys/zfs_ctldir.h +++ b/module/zfs/include/sys/zfs_ctldir.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _ZFS_CTLDIR_H @@ -49,6 +48,7 @@ void zfsctl_destroy(zfsvfs_t *); vnode_t *zfsctl_root(znode_t *); void zfsctl_init(void); void zfsctl_fini(void); +boolean_t zfsctl_is_node(vnode_t *); int zfsctl_rename_snapshot(const char *from, const char *to); int zfsctl_destroy_snapshot(const char *snapname, int force); diff --git a/module/zfs/include/sys/zfs_debug.h b/module/zfs/include/sys/zfs_debug.h index 450ac1c81..50ecf9b36 100644 --- a/module/zfs/include/sys/zfs_debug.h +++ b/module/zfs/include/sys/zfs_debug.h @@ -19,15 +19,12 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_ZFS_DEBUG_H #define _SYS_ZFS_DEBUG_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -68,6 +65,16 @@ extern void __dprintf(const char *file, const char *func, extern void zfs_panic_recover(const char *fmt, ...); +typedef struct zfs_dbgmsg { + list_node_t zdm_node; + time_t zdm_timestamp; + char zdm_msg[1]; /* variable length allocation */ +} zfs_dbgmsg_t; + +extern void zfs_dbgmsg_init(void); +extern void zfs_dbgmsg_fini(void); +extern void zfs_dbgmsg(const char *fmt, ...); + #ifdef __cplusplus } #endif diff --git a/module/zfs/include/sys/zfs_dir.h b/module/zfs/include/sys/zfs_dir.h index 650315be2..349f8ef37 100644 --- a/module/zfs/include/sys/zfs_dir.h +++ b/module/zfs/include/sys/zfs_dir.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -42,11 +42,11 @@ extern "C" { #define ZRENAMING 0x0010 /* znode is being renamed */ #define ZCILOOK 0x0020 /* case-insensitive lookup requested */ #define ZCIEXACT 0x0040 /* c-i requires c-s match (rename) */ +#define ZHAVELOCK 0x0080 /* z_name_lock is already held */ /* mknode flags */ #define IS_ROOT_NODE 0x01 /* create a root node */ #define IS_XATTR 0x02 /* create an extended attribute node */ -#define IS_REPLAY 0x04 /* we are replaying intent log */ extern int zfs_dirent_lock(zfs_dirlock_t **, znode_t *, char *, znode_t **, int, int *, pathname_t *); @@ -57,7 +57,7 @@ extern int zfs_link_destroy(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int, extern int zfs_dirlook(znode_t *, char *, vnode_t **, int, int *, pathname_t *); extern void zfs_mknode(znode_t *, vattr_t *, dmu_tx_t *, cred_t *, - uint_t, znode_t **, int, zfs_acl_ids_t *); + uint_t, znode_t **, zfs_acl_ids_t *); extern void zfs_rmnode(znode_t *); extern void zfs_dl_name_switch(zfs_dirlock_t *dl, char *new, char **old); extern boolean_t zfs_dirempty(znode_t *); diff --git a/module/zfs/include/sys/zfs_fuid.h b/module/zfs/include/sys/zfs_fuid.h index f81ddf4a5..0feb3ce4b 100644 --- a/module/zfs/include/sys/zfs_fuid.h +++ b/module/zfs/include/sys/zfs_fuid.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -100,6 +100,8 @@ typedef struct zfs_fuid_info { #ifdef _KERNEL struct znode; extern uid_t zfs_fuid_map_id(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t); +extern void zfs_fuid_node_add(zfs_fuid_info_t **, const char *, uint32_t, + uint64_t, uint64_t, zfs_fuid_type_t); extern void zfs_fuid_destroy(zfsvfs_t *); extern uint64_t zfs_fuid_create_cred(zfsvfs_t *, zfs_fuid_type_t, cred_t *, zfs_fuid_info_t **); diff --git a/module/zfs/include/sys/zfs_ioctl.h b/module/zfs/include/sys/zfs_ioctl.h index 3a3e6e711..b0cb4955e 100644 --- a/module/zfs/include/sys/zfs_ioctl.h +++ b/module/zfs/include/sys/zfs_ioctl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -30,6 +30,7 @@ #include <sys/dmu.h> #include <sys/zio.h> #include <sys/dsl_deleg.h> +#include <sys/spa.h> #ifdef _KERNEL #include <sys/nvpair.h> @@ -45,26 +46,86 @@ extern "C" { #define ZFS_SNAPDIR_HIDDEN 0 #define ZFS_SNAPDIR_VISIBLE 1 -#define DMU_BACKUP_STREAM_VERSION (1ULL) -#define DMU_BACKUP_HEADER_VERSION (2ULL) +/* + * Field manipulation macros for the drr_versioninfo field of the + * send stream header. + */ + +/* + * Header types for zfs send streams. + */ +typedef enum drr_headertype { + DMU_SUBSTREAM = 0x1, + DMU_COMPOUNDSTREAM = 0x2 +} drr_headertype_t; + +#define DMU_GET_STREAM_HDRTYPE(vi) BF64_GET((vi), 0, 2) +#define DMU_SET_STREAM_HDRTYPE(vi, x) BF64_SET((vi), 0, 2, x) + +#define DMU_GET_FEATUREFLAGS(vi) BF64_GET((vi), 2, 30) +#define DMU_SET_FEATUREFLAGS(vi, x) BF64_SET((vi), 2, 30, x) + +/* + * Feature flags for zfs send streams (flags in drr_versioninfo) + */ + +#define DMU_BACKUP_FEATURE_DEDUP (0x1) +#define DMU_BACKUP_FEATURE_DEDUPPROPS (0x2) +#define DMU_BACKUP_FEATURE_SA_SPILL (0x4) + +/* + * Mask of all supported backup features + */ +#define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \ + DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL) + +/* Are all features in the given flag word currently supported? */ +#define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK)) + +/* + * The drr_versioninfo field of the dmu_replay_record has the + * following layout: + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * | reserved | feature-flags |C|S| + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * The low order two bits indicate the header type: SUBSTREAM (0x1) + * or COMPOUNDSTREAM (0x2). Using two bits for this is historical: + * this field used to be a version number, where the two version types + * were 1 and 2. Using two bits for this allows earlier versions of + * the code to be able to recognize send streams that don't use any + * of the features indicated by feature flags. + */ + #define DMU_BACKUP_MAGIC 0x2F5bacbacULL #define DRR_FLAG_CLONE (1<<0) #define DRR_FLAG_CI_DATA (1<<1) /* + * flags in the drr_checksumflags field in the DRR_WRITE and + * DRR_WRITE_BYREF blocks + */ +#define DRR_CHECKSUM_DEDUP (1<<0) + +#define DRR_IS_DEDUP_CAPABLE(flags) ((flags) & DRR_CHECKSUM_DEDUP) + +/* * zfs ioctl command structure */ typedef struct dmu_replay_record { enum { DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS, - DRR_WRITE, DRR_FREE, DRR_END, + DRR_WRITE, DRR_FREE, DRR_END, DRR_WRITE_BYREF, + DRR_SPILL, DRR_NUMTYPES } drr_type; uint32_t drr_payloadlen; union { struct drr_begin { uint64_t drr_magic; - uint64_t drr_version; + uint64_t drr_versioninfo; /* was drr_version */ uint64_t drr_creation_time; dmu_objset_type_t drr_type; uint32_t drr_flags; @@ -74,6 +135,7 @@ typedef struct dmu_replay_record { } drr_begin; struct drr_end { zio_cksum_t drr_checksum; + uint64_t drr_toguid; } drr_end; struct drr_object { uint64_t drr_object; @@ -81,14 +143,16 @@ typedef struct dmu_replay_record { dmu_object_type_t drr_bonustype; uint32_t drr_blksz; uint32_t drr_bonuslen; - uint8_t drr_checksum; + uint8_t drr_checksumtype; uint8_t drr_compress; uint8_t drr_pad[6]; + uint64_t drr_toguid; /* bonus content follows */ } drr_object; struct drr_freeobjects { uint64_t drr_firstobj; uint64_t drr_numobjs; + uint64_t drr_toguid; } drr_freeobjects; struct drr_write { uint64_t drr_object; @@ -96,13 +160,42 @@ typedef struct dmu_replay_record { uint32_t drr_pad; uint64_t drr_offset; uint64_t drr_length; + uint64_t drr_toguid; + uint8_t drr_checksumtype; + uint8_t drr_checksumflags; + uint8_t drr_pad2[6]; + ddt_key_t drr_key; /* deduplication key */ /* content follows */ } drr_write; struct drr_free { uint64_t drr_object; uint64_t drr_offset; uint64_t drr_length; + uint64_t drr_toguid; } drr_free; + struct drr_write_byref { + /* where to put the data */ + uint64_t drr_object; + uint64_t drr_offset; + uint64_t drr_length; + uint64_t drr_toguid; + /* where to find the prior copy of the data */ + uint64_t drr_refguid; + uint64_t drr_refobject; + uint64_t drr_refoffset; + /* properties of the data */ + uint8_t drr_checksumtype; + uint8_t drr_checksumflags; + uint8_t drr_pad2[6]; + ddt_key_t drr_key; /* deduplication key */ + } drr_write_byref; + struct drr_spill { + uint64_t drr_object; + uint64_t drr_length; + uint64_t drr_toguid; + uint64_t drr_pad[4]; /* needed for crypto */ + /* spill data follows */ + } drr_spill; } drr_u; } dmu_replay_record_t; @@ -117,6 +210,10 @@ typedef struct zinject_record { uint64_t zi_type; uint32_t zi_freq; uint32_t zi_failfast; + char zi_func[MAXNAMELEN]; + uint32_t zi_iotype; + int32_t zi_duration; + uint64_t zi_timer; } zinject_record_t; #define ZINJECT_NULL 0x1 @@ -146,6 +243,7 @@ typedef struct zfs_cmd { char zc_name[MAXPATHLEN]; char zc_value[MAXPATHLEN * 2]; char zc_string[MAXNAMELEN]; + char zc_top_ds[MAXPATHLEN]; uint64_t zc_guid; uint64_t zc_nvlist_conf; /* really (char *) */ uint64_t zc_nvlist_conf_size; @@ -166,6 +264,7 @@ typedef struct zfs_cmd { struct drr_begin zc_begin_record; zinject_record_t zc_inject_record; boolean_t zc_defer_destroy; + boolean_t zc_temphold; } zfs_cmd_t; typedef struct zfs_useracct { @@ -178,6 +277,8 @@ typedef struct zfs_useracct { #define ZVOL_MAX_MINOR (1 << 16) #define ZFS_MIN_MINOR (ZVOL_MAX_MINOR + 1) +#define ZPOOL_EXPORT_AFTER_SPLIT 0x1 + #ifdef _KERNEL typedef struct zfs_creat { @@ -192,7 +293,7 @@ extern int zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr); extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr); extern int zfs_busy(void); -extern int zfs_unmount_snap(char *, void *); +extern int zfs_unmount_snap(const char *, void *); #endif /* _KERNEL */ diff --git a/module/zfs/include/sys/zfs_sa.h b/module/zfs/include/sys/zfs_sa.h new file mode 100644 index 000000000..cd312b27a --- /dev/null +++ b/module/zfs/include/sys/zfs_sa.h @@ -0,0 +1,143 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZFS_SA_H +#define _SYS_ZFS_SA_H + +#ifdef _KERNEL +#include <sys/types32.h> +#include <sys/list.h> +#include <sys/dmu.h> +#include <sys/zfs_acl.h> +#include <sys/zfs_znode.h> +#include <sys/sa.h> +#include <sys/zil.h> + + +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * This is the list of known attributes + * to the ZPL. The values of the actual + * attributes are not defined by the order + * the enums. It is controlled by the attribute + * registration mechanism. Two different file system + * could have different numeric values for the same + * attributes. this list is only used for dereferencing + * into the table that will hold the actual numeric value. + */ +typedef enum zpl_attr { + ZPL_ATIME, + ZPL_MTIME, + ZPL_CTIME, + ZPL_CRTIME, + ZPL_GEN, + ZPL_MODE, + ZPL_SIZE, + ZPL_PARENT, + ZPL_LINKS, + ZPL_XATTR, + ZPL_RDEV, + ZPL_FLAGS, + ZPL_UID, + ZPL_GID, + ZPL_PAD, + ZPL_ZNODE_ACL, + ZPL_DACL_COUNT, + ZPL_SYMLINK, + ZPL_SCANSTAMP, + ZPL_DACL_ACES, + ZPL_END +} zpl_attr_t; + +#define ZFS_OLD_ZNODE_PHYS_SIZE 0x108 +#define ZFS_SA_BASE_ATTR_SIZE (ZFS_OLD_ZNODE_PHYS_SIZE - \ + sizeof (zfs_acl_phys_t)) + +#define SA_MODE_OFFSET 0 +#define SA_SIZE_OFFSET 8 +#define SA_GEN_OFFSET 16 +#define SA_UID_OFFSET 24 +#define SA_GID_OFFSET 32 +#define SA_PARENT_OFFSET 40 + +extern sa_attr_reg_t zfs_attr_table[ZPL_END + 1]; +extern sa_attr_reg_t zfs_legacy_attr_table[ZPL_END + 1]; + +/* + * This is a deprecated data structure that only exists for + * dealing with file systems create prior to ZPL version 5. + */ +typedef struct znode_phys { + uint64_t zp_atime[2]; /* 0 - last file access time */ + uint64_t zp_mtime[2]; /* 16 - last file modification time */ + uint64_t zp_ctime[2]; /* 32 - last file change time */ + uint64_t zp_crtime[2]; /* 48 - creation time */ + uint64_t zp_gen; /* 64 - generation (txg of creation) */ + uint64_t zp_mode; /* 72 - file mode bits */ + uint64_t zp_size; /* 80 - size of file */ + uint64_t zp_parent; /* 88 - directory parent (`..') */ + uint64_t zp_links; /* 96 - number of links to file */ + uint64_t zp_xattr; /* 104 - DMU object for xattrs */ + uint64_t zp_rdev; /* 112 - dev_t for VBLK & VCHR files */ + uint64_t zp_flags; /* 120 - persistent flags */ + uint64_t zp_uid; /* 128 - file owner */ + uint64_t zp_gid; /* 136 - owning group */ + uint64_t zp_zap; /* 144 - extra attributes */ + uint64_t zp_pad[3]; /* 152 - future */ + zfs_acl_phys_t zp_acl; /* 176 - 263 ACL */ + /* + * Data may pad out any remaining bytes in the znode buffer, eg: + * + * |<---------------------- dnode_phys (512) ------------------------>| + * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->| + * |<---- znode (264) ---->|<---- data (56) ---->| + * + * At present, we use this space for the following: + * - symbolic links + * - 32-byte anti-virus scanstamp (regular files only) + */ +} znode_phys_t; + +#ifdef _KERNEL +int zfs_sa_readlink(struct znode *, uio_t *); +void zfs_sa_symlink(struct znode *, char *link, int len, dmu_tx_t *); +void zfs_sa_upgrade(struct sa_handle *, dmu_tx_t *); +void zfs_sa_get_scanstamp(struct znode *, xvattr_t *); +void zfs_sa_set_scanstamp(struct znode *, xvattr_t *, dmu_tx_t *); +void zfs_sa_uprade_pre(struct sa_handle *, void *, dmu_tx_t *); +void zfs_sa_upgrade_post(struct sa_handle *, void *, dmu_tx_t *); +void zfs_sa_upgrade_txholds(dmu_tx_t *, struct znode *); +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFS_SA_H */ diff --git a/module/zfs/include/sys/zfs_vfsops.h b/module/zfs/include/sys/zfs_vfsops.h index 28555232b..86dcdacc0 100644 --- a/module/zfs/include/sys/zfs_vfsops.h +++ b/module/zfs/include/sys/zfs_vfsops.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_FS_ZFS_VFSOPS_H @@ -31,6 +30,7 @@ #include <sys/list.h> #include <sys/vfs.h> #include <sys/zil.h> +#include <sys/sa.h> #include <sys/rrwlock.h> #include <sys/zfs_ioctl.h> @@ -39,6 +39,7 @@ extern "C" { #endif typedef struct zfsvfs zfsvfs_t; +struct znode; struct zfsvfs { vfs_t *z_vfs; /* generic fs struct */ @@ -56,7 +57,6 @@ struct zfsvfs { boolean_t z_fuid_dirty; /* need to sync fuid table ? */ struct zfs_fuid_info *z_fuid_replay; /* fuid info for replay */ zilog_t *z_log; /* intent log pointer */ - uint_t z_acl_mode; /* acl chmod/mode behavior */ uint_t z_acl_inherit; /* acl inheritance behavior */ zfs_case_t z_case; /* case-sense */ boolean_t z_utf8; /* utf8-only */ @@ -73,11 +73,13 @@ struct zfsvfs { boolean_t z_vscan; /* virus scan on/off */ boolean_t z_use_fuids; /* version allows fuids */ boolean_t z_replay; /* set during ZIL replay */ + boolean_t z_use_sa; /* version allow system attributes */ uint64_t z_version; /* ZPL version */ uint64_t z_shares_dir; /* hidden shares dir */ kmutex_t z_lock; uint64_t z_userquota_obj; uint64_t z_groupquota_obj; + sa_attr_type_t *z_attr_table; /* SA attr mapping->id */ #define ZFS_OBJ_MTX_SZ 64 kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */ }; @@ -132,19 +134,22 @@ typedef struct zfid_long { extern uint_t zfs_fsyncer_key; -extern int zfs_suspend_fs(zfsvfs_t *zfsvfs, char *osname, int *mode); -extern int zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode); +extern int zfs_suspend_fs(zfsvfs_t *zfsvfs); +extern int zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname); extern int zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, const char *domain, uint64_t rid, uint64_t *valuep); extern int zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, uint64_t *cookiep, void *vbuf, uint64_t *bufsizep); extern int zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, const char *domain, uint64_t rid, uint64_t quota); -extern boolean_t zfs_usergroup_overquota(zfsvfs_t *zfsvfs, - boolean_t isgroup, uint64_t fuid); +extern boolean_t zfs_owner_overquota(zfsvfs_t *zfsvfs, struct znode *, + boolean_t isgroup); +extern boolean_t zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, + uint64_t fuid); extern int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers); -extern int zfsvfs_create(const char *name, int mode, zfsvfs_t **zvp); +extern int zfsvfs_create(const char *name, zfsvfs_t **zfvp); extern void zfsvfs_free(zfsvfs_t *zfsvfs); +extern int zfs_check_global_label(const char *dsname, const char *hexsl); #ifdef __cplusplus } diff --git a/module/zfs/include/sys/zfs_znode.h b/module/zfs/include/sys/zfs_znode.h index 5db5b8d51..4781ee686 100644 --- a/module/zfs/include/sys/zfs_znode.h +++ b/module/zfs/include/sys/zfs_znode.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -32,8 +32,10 @@ #include <sys/attr.h> #include <sys/list.h> #include <sys/dmu.h> +#include <sys/sa.h> #include <sys/zfs_vfsops.h> #include <sys/rrwlock.h> +#include <sys/zfs_sa.h> #endif #include <sys/zfs_acl.h> #include <sys/zil.h> @@ -57,13 +59,16 @@ extern "C" { #define ZFS_OPAQUE 0x0000010000000000 #define ZFS_AV_QUARANTINED 0x0000020000000000 #define ZFS_AV_MODIFIED 0x0000040000000000 +#define ZFS_REPARSE 0x0000080000000000 -#define ZFS_ATTR_SET(zp, attr, value) \ +#define ZFS_ATTR_SET(zp, attr, value, pflags, tx) \ { \ if (value) \ - zp->z_phys->zp_flags |= attr; \ + pflags |= attr; \ else \ - zp->z_phys->zp_flags &= ~attr; \ + pflags &= ~attr; \ + VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(zp->z_zfsvfs), \ + &pflags, sizeof (pflags), tx)); \ } /* @@ -79,6 +84,27 @@ extern "C" { #define ZFS_BONUS_SCANSTAMP 0x80 /* Scanstamp in bonus area */ #define ZFS_NO_EXECS_DENIED 0x100 /* exec was given to everyone */ +#define SA_ZPL_ATIME(z) z->z_attr_table[ZPL_ATIME] +#define SA_ZPL_MTIME(z) z->z_attr_table[ZPL_MTIME] +#define SA_ZPL_CTIME(z) z->z_attr_table[ZPL_CTIME] +#define SA_ZPL_CRTIME(z) z->z_attr_table[ZPL_CRTIME] +#define SA_ZPL_GEN(z) z->z_attr_table[ZPL_GEN] +#define SA_ZPL_DACL_ACES(z) z->z_attr_table[ZPL_DACL_ACES] +#define SA_ZPL_XATTR(z) z->z_attr_table[ZPL_XATTR] +#define SA_ZPL_SYMLINK(z) z->z_attr_table[ZPL_SYMLINK] +#define SA_ZPL_RDEV(z) z->z_attr_table[ZPL_RDEV] +#define SA_ZPL_SCANSTAMP(z) z->z_attr_table[ZPL_SCANSTAMP] +#define SA_ZPL_UID(z) z->z_attr_table[ZPL_UID] +#define SA_ZPL_GID(z) z->z_attr_table[ZPL_GID] +#define SA_ZPL_PARENT(z) z->z_attr_table[ZPL_PARENT] +#define SA_ZPL_LINKS(z) z->z_attr_table[ZPL_LINKS] +#define SA_ZPL_MODE(z) z->z_attr_table[ZPL_MODE] +#define SA_ZPL_DACL_COUNT(z) z->z_attr_table[ZPL_DACL_COUNT] +#define SA_ZPL_FLAGS(z) z->z_attr_table[ZPL_FLAGS] +#define SA_ZPL_SIZE(z) z->z_attr_table[ZPL_SIZE] +#define SA_ZPL_ZNODE_ACL(z) z->z_attr_table[ZPL_ZNODE_ACL] +#define SA_ZPL_PAD(z) z->z_attr_table[ZPL_PAD] + /* * Is ID ephemeral? */ @@ -87,8 +113,10 @@ extern "C" { /* * Should we use FUIDs? */ -#define USE_FUIDS(version, os) (version >= ZPL_VERSION_FUID &&\ +#define USE_FUIDS(version, os) (version >= ZPL_VERSION_FUID && \ spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID) +#define USE_SA(version, os) (version >= ZPL_VERSION_SA && \ + spa_version(dmu_objset_spa(os)) >= SPA_VERSION_SA) #define MASTER_NODE_OBJ 1 @@ -103,6 +131,7 @@ extern "C" { #define ZPL_VERSION_STR "VERSION" #define ZFS_FUID_TABLES "FUID" #define ZFS_SHARES_DIR "SHARES" +#define ZFS_SA_ATTRS "SA_ATTRS" #define ZFS_MAX_BLOCKSIZE (SPA_MAXBLOCKSIZE) @@ -131,42 +160,6 @@ extern "C" { #define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48) /* - * This is the persistent portion of the znode. It is stored - * in the "bonus buffer" of the file. Short symbolic links - * are also stored in the bonus buffer. - */ -typedef struct znode_phys { - uint64_t zp_atime[2]; /* 0 - last file access time */ - uint64_t zp_mtime[2]; /* 16 - last file modification time */ - uint64_t zp_ctime[2]; /* 32 - last file change time */ - uint64_t zp_crtime[2]; /* 48 - creation time */ - uint64_t zp_gen; /* 64 - generation (txg of creation) */ - uint64_t zp_mode; /* 72 - file mode bits */ - uint64_t zp_size; /* 80 - size of file */ - uint64_t zp_parent; /* 88 - directory parent (`..') */ - uint64_t zp_links; /* 96 - number of links to file */ - uint64_t zp_xattr; /* 104 - DMU object for xattrs */ - uint64_t zp_rdev; /* 112 - dev_t for VBLK & VCHR files */ - uint64_t zp_flags; /* 120 - persistent flags */ - uint64_t zp_uid; /* 128 - file owner */ - uint64_t zp_gid; /* 136 - owning group */ - uint64_t zp_zap; /* 144 - extra attributes */ - uint64_t zp_pad[3]; /* 152 - future */ - zfs_acl_phys_t zp_acl; /* 176 - 263 ACL */ - /* - * Data may pad out any remaining bytes in the znode buffer, eg: - * - * |<---------------------- dnode_phys (512) ------------------------>| - * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->| - * |<---- znode (264) ---->|<---- data (56) ---->| - * - * At present, we use this space for the following: - * - symbolic links - * - 32-byte anti-virus scanstamp (regular files only) - */ -} znode_phys_t; - -/* * Directory entry locks control access to directory entries. * They are used to protect creates, deletes, and renames. * Each directory znode has a mutex and a list of locked names. @@ -175,6 +168,7 @@ typedef struct znode_phys { typedef struct zfs_dirlock { char *dl_name; /* directory entry being locked */ uint32_t dl_sharecnt; /* 0 if exclusive, > 0 if shared */ + uint8_t dl_namelock; /* 1 if z_name_lock is NOT held */ uint16_t dl_namesize; /* set if dl_name was allocated */ kcondvar_t dl_cv; /* wait for entry to be unlocked */ struct znode *dl_dzp; /* directory znode */ @@ -198,16 +192,20 @@ typedef struct znode { uint_t z_seq; /* modification sequence number */ uint64_t z_mapcnt; /* number of pages mapped to file */ uint64_t z_last_itx; /* last ZIL itx on this znode */ - uint64_t z_gen; /* generation (same as zp_gen) */ + uint64_t z_gen; /* generation (cached) */ + uint64_t z_size; /* file size (cached) */ + uint64_t z_atime[2]; /* atime (cached) */ + uint64_t z_links; /* file links (cached) */ + uint64_t z_pflags; /* pflags (cached) */ + uid_t z_uid; /* uid mapped (cached) */ + uid_t z_gid; /* gid mapped (cached) */ + mode_t z_mode; /* mode (cached) */ uint32_t z_sync_cnt; /* synchronous open count */ kmutex_t z_acl_lock; /* acl data lock */ zfs_acl_t *z_acl_cached; /* cached acl */ list_node_t z_link_node; /* all znodes in fs link */ - /* - * These are dmu managed fields. - */ - znode_phys_t *z_phys; /* pointer to persistent znode */ - dmu_buf_t *z_dbuf; /* buffer containing the z_phys */ + sa_handle_t *z_sa_hdl; /* handle to sa data */ + boolean_t z_is_sa; /* are we native sa? */ } znode_t; @@ -250,7 +248,7 @@ typedef struct znode { #define ZFS_EXIT(zfsvfs) rrw_exit(&(zfsvfs)->z_teardown_lock, FTAG) #define ZFS_VERIFY_ZP(zp) \ - if ((zp)->z_dbuf == NULL) { \ + if ((zp)->z_sa_hdl == NULL) { \ ZFS_EXIT((zp)->z_zfsvfs); \ return (EIO); \ } \ @@ -292,14 +290,14 @@ typedef struct znode { #define ZFS_ACCESSTIME_STAMP(zfsvfs, zp) \ if ((zfsvfs)->z_atime && !((zfsvfs)->z_vfs->vfs_flag & VFS_RDONLY)) \ - zfs_time_stamper(zp, ACCESSED, NULL) + zfs_tstamp_update_setup(zp, ACCESSED, NULL, NULL, B_FALSE); extern int zfs_init_fs(zfsvfs_t *, znode_t **); extern void zfs_set_dataprop(objset_t *); extern void zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *, dmu_tx_t *tx); -extern void zfs_time_stamper(znode_t *, uint_t, dmu_tx_t *); -extern void zfs_time_stamper_locked(znode_t *, uint_t, dmu_tx_t *); +extern void zfs_tstamp_update_setup(znode_t *, uint_t, uint64_t [2], + uint64_t [2], boolean_t); extern void zfs_grow_blocksize(znode_t *, uint64_t, dmu_tx_t *); extern int zfs_freesp(znode_t *, uint64_t, uint64_t, int, boolean_t); extern void zfs_znode_init(void); @@ -338,7 +336,7 @@ extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp); extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, vsecattr_t *vsecp, zfs_fuid_info_t *fuidp); -extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap); +extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx); extern void zfs_upgrade(zfsvfs_t *zfsvfs, dmu_tx_t *tx); extern int zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx); diff --git a/module/zfs/include/sys/zil.h b/module/zfs/include/sys/zil.h index 2aff8cd68..2f01cf922 100644 --- a/module/zfs/include/sys/zil.h +++ b/module/zfs/include/sys/zil.h @@ -19,10 +19,11 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ +/* Portions Copyright 2010 Robert Milkowski */ + #ifndef _SYS_ZIL_H #define _SYS_ZIL_H @@ -55,34 +56,40 @@ typedef struct zil_header { uint64_t zh_claim_txg; /* txg in which log blocks were claimed */ uint64_t zh_replay_seq; /* highest replayed sequence number */ blkptr_t zh_log; /* log chain */ - uint64_t zh_claim_seq; /* highest claimed sequence number */ + uint64_t zh_claim_blk_seq; /* highest claimed block sequence number */ uint64_t zh_flags; /* header flags */ - uint64_t zh_pad[4]; + uint64_t zh_claim_lr_seq; /* highest claimed lr sequence number */ + uint64_t zh_pad[3]; } zil_header_t; /* * zh_flags bit settings */ -#define ZIL_REPLAY_NEEDED 0x1 /* replay needed - internal only */ +#define ZIL_REPLAY_NEEDED 0x1 /* replay needed - internal only */ +#define ZIL_CLAIM_LR_SEQ_VALID 0x2 /* zh_claim_lr_seq field is valid */ /* - * Log block trailer - structure at the end of the header and each log block + * Log block chaining. + * + * Log blocks are chained together. Originally they were chained at the + * end of the block. For performance reasons the chain was moved to the + * beginning of the block which allows writes for only the data being used. + * The older position is supported for backwards compatability. * - * The zit_bt contains a zbt_cksum which for the intent log is + * The zio_eck_t contains a zec_cksum which for the intent log is * the sequence number of this log block. A seq of 0 is invalid. - * The zbt_cksum is checked by the SPA against the sequence + * The zec_cksum is checked by the SPA against the sequence * number passed in the blk_cksum field of the blkptr_t */ -typedef struct zil_trailer { - uint64_t zit_pad; - blkptr_t zit_next_blk; /* next block in chain */ - uint64_t zit_nused; /* bytes in log block used */ - zio_block_tail_t zit_bt; /* block trailer */ -} zil_trailer_t; +typedef struct zil_chain { + uint64_t zc_pad; + blkptr_t zc_next_blk; /* next block in chain */ + uint64_t zc_nused; /* bytes in log block used */ + zio_eck_t zc_eck; /* block trailer */ +} zil_chain_t; #define ZIL_MIN_BLKSZ 4096ULL #define ZIL_MAX_BLKSZ SPA_MAXBLOCKSIZE -#define ZIL_BLK_DATA_SZ(lwb) ((lwb)->lwb_sz - sizeof (zil_trailer_t)) /* * The words of a log block checksum. @@ -139,7 +146,8 @@ typedef enum zil_create { #define TX_MKDIR_ACL 17 /* mkdir with ACL */ #define TX_MKDIR_ATTR 18 /* mkdir with attr */ #define TX_MKDIR_ACL_ATTR 19 /* mkdir with ACL + attrs */ -#define TX_MAX_TYPE 20 /* Max transaction type */ +#define TX_WRITE2 20 /* dmu_sync EALREADY write */ +#define TX_MAX_TYPE 21 /* Max transaction type */ /* * The transactions for mkdir, symlink, remove, rmdir, link, and rename @@ -149,6 +157,20 @@ typedef enum zil_create { #define TX_CI ((uint64_t)0x1 << 63) /* case-insensitive behavior requested */ /* + * Transactions for write, truncate, setattr, acl_v0, and acl can be logged + * out of order. For convenience in the code, all such records must have + * lr_foid at the same offset. + */ +#define TX_OOO(txtype) \ + ((txtype) == TX_WRITE || \ + (txtype) == TX_TRUNCATE || \ + (txtype) == TX_SETATTR || \ + (txtype) == TX_ACL_V0 || \ + (txtype) == TX_ACL || \ + (txtype) == TX_WRITE2) + + +/* * Format of log records. * The fields are carefully defined to allow them to be aligned * and sized the same on sparc & intel architectures. @@ -168,6 +190,14 @@ typedef struct { /* common log record header */ } lr_t; /* + * Common start of all out-of-order record types (TX_OOO() above). + */ +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_foid; /* object id */ +} lr_ooo_t; + +/* * Handle option extended vattr attributes. * * Whenever new attributes are added the version number @@ -257,7 +287,7 @@ typedef struct { uint64_t lr_foid; /* file object to write */ uint64_t lr_offset; /* offset to write to */ uint64_t lr_length; /* user data length to write */ - uint64_t lr_blkoff; /* offset represented by lr_blkptr */ + uint64_t lr_blkoff; /* no longer used */ blkptr_t lr_blkptr; /* spa block pointer for replay */ /* write data will follow for small writes */ } lr_write_t; @@ -332,6 +362,7 @@ typedef enum { /* and put blkptr in log, rather than actual data) */ WR_COPIED, /* immediate - data is copied into lr_write_t */ WR_NEED_COPY, /* immediate - data needs to be copied if pushed */ + WR_NUM_STATES /* number of states */ } itx_wr_state_t; typedef struct itx { @@ -344,26 +375,14 @@ typedef struct itx { /* followed by type-specific part of lr_xx_t and its immediate data */ } itx_t; - -/* - * zgd_t is passed through dmu_sync() to the callback routine zfs_get_done() - * to handle the cleanup of the dmu_sync() buffer write - */ -typedef struct { - zilog_t *zgd_zilog; /* zilog */ - blkptr_t *zgd_bp; /* block pointer */ - struct rl *zgd_rl; /* range lock */ -} zgd_t; - - -typedef void zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg, +typedef int zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t txg); -typedef void zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg, +typedef int zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg, uint64_t txg); typedef int zil_replay_func_t(); typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, zio_t *zio); -extern uint64_t zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, +extern int zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg); extern void zil_init(void); @@ -377,27 +396,33 @@ extern void zil_close(zilog_t *zilog); extern void zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE]); +extern boolean_t zil_replaying(zilog_t *zilog, dmu_tx_t *tx); extern void zil_destroy(zilog_t *zilog, boolean_t keep_first); extern void zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx); extern itx_t *zil_itx_create(uint64_t txtype, size_t lrsize); +extern void zil_itx_destroy(itx_t *itx); extern uint64_t zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx); extern void zil_commit(zilog_t *zilog, uint64_t seq, uint64_t oid); -extern int zil_vdev_offline(char *osname, void *txarg); -extern int zil_claim(char *osname, void *txarg); -extern int zil_check_log_chain(char *osname, void *txarg); +extern int zil_vdev_offline(const char *osname, void *txarg); +extern int zil_claim(const char *osname, void *txarg); +extern int zil_check_log_chain(const char *osname, void *txarg); extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx); extern void zil_clean(zilog_t *zilog); -extern int zil_is_committed(zilog_t *zilog); extern int zil_suspend(zilog_t *zilog); extern void zil_resume(zilog_t *zilog); -extern void zil_add_block(zilog_t *zilog, blkptr_t *bp); +extern void zil_add_block(zilog_t *zilog, const blkptr_t *bp); +extern int zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp); + +extern void zil_set_sync(zilog_t *zilog, uint64_t syncval); + +extern void zil_set_logbias(zilog_t *zilog, uint64_t slogval); -extern int zil_disable; +extern int zil_replay_disable; #ifdef __cplusplus } diff --git a/module/zfs/include/sys/zil_impl.h b/module/zfs/include/sys/zil_impl.h index 685305fb5..6560a7942 100644 --- a/module/zfs/include/sys/zil_impl.h +++ b/module/zfs/include/sys/zil_impl.h @@ -19,10 +19,11 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ +/* Portions Copyright 2010 Robert Milkowski */ + #ifndef _SYS_ZIL_IMPL_H #define _SYS_ZIL_IMPL_H @@ -43,8 +44,8 @@ typedef struct lwb { int lwb_sz; /* size of block and buffer */ char *lwb_buf; /* log write buffer */ zio_t *lwb_zio; /* zio for this buffer */ + dmu_tx_t *lwb_tx; /* tx for log block allocation */ uint64_t lwb_max_txg; /* highest txg in this lwb */ - txg_handle_t lwb_txgh; /* txg handle for txg_exit() */ list_node_t lwb_node; /* zilog->zl_lwb_list linkage */ } lwb_t; @@ -57,6 +58,8 @@ typedef struct zil_vdev_node { avl_node_t zv_node; /* AVL tree linkage */ } zil_vdev_node_t; +#define ZIL_PREV_BLKS 16 + /* * Stable storage intent log management structure. One per dataset. */ @@ -68,9 +71,10 @@ struct zilog { objset_t *zl_os; /* object set we're logging */ zil_get_data_t *zl_get_data; /* callback to get object content */ zio_t *zl_root_zio; /* log writer root zio */ - uint64_t zl_itx_seq; /* next itx sequence number */ + uint64_t zl_itx_seq; /* next in-core itx sequence number */ + uint64_t zl_lr_seq; /* on-disk log record sequence number */ uint64_t zl_commit_seq; /* committed upto this number */ - uint64_t zl_lr_seq; /* log record sequence number */ + uint64_t zl_commit_lr_seq; /* last committed on-disk lr seq */ uint64_t zl_destroy_txg; /* txg of last zil_destroy() */ uint64_t zl_replayed_seq[TXG_SIZE]; /* last replayed rec seq */ uint64_t zl_replaying_seq; /* current replay seq number */ @@ -82,7 +86,13 @@ struct zilog { uint8_t zl_replay; /* replaying records while set */ uint8_t zl_stop_sync; /* for debugging */ uint8_t zl_writer; /* boolean: write setup in progress */ - uint8_t zl_log_error; /* boolean: log write error */ + uint8_t zl_logbias; /* latency or throughput */ + uint8_t zl_sync; /* synchronous or asynchronous */ + int zl_parse_error; /* last zil_parse() error */ + uint64_t zl_parse_blk_seq; /* highest blk seq on last parse */ + uint64_t zl_parse_lr_seq; /* highest lr seq on last parse */ + uint64_t zl_parse_blk_count; /* number of blocks parsed */ + uint64_t zl_parse_lr_count; /* number of log records parsed */ list_t zl_itx_list; /* in-memory itx list */ uint64_t zl_itx_list_sz; /* total size of records on list */ uint64_t zl_cur_used; /* current commit log size used */ @@ -91,17 +101,20 @@ struct zilog { kmutex_t zl_vdev_lock; /* protects zl_vdev_tree */ avl_tree_t zl_vdev_tree; /* vdevs to flush in zil_commit() */ taskq_t *zl_clean_taskq; /* runs lwb and itx clean tasks */ - avl_tree_t zl_dva_tree; /* track DVAs during log parse */ + avl_tree_t zl_bp_tree; /* track bps during log parse */ clock_t zl_replay_time; /* lbolt of when replay started */ uint64_t zl_replay_blks; /* number of log blocks replayed */ + zil_header_t zl_old_header; /* debugging aid */ + uint_t zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */ + uint_t zl_prev_rotor; /* rotor for zl_prev[] */ }; -typedef struct zil_dva_node { +typedef struct zil_bp_node { dva_t zn_dva; avl_node_t zn_node; -} zil_dva_node_t; +} zil_bp_node_t; -#define ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_trailer_t) - \ +#define ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_chain_t) - \ sizeof (lr_write_t)) #ifdef __cplusplus diff --git a/module/zfs/include/sys/zio.h b/module/zfs/include/sys/zio.h index e47d8f468..0400c1702 100644 --- a/module/zfs/include/sys/zio.h +++ b/module/zfs/include/sys/zio.h @@ -20,8 +20,7 @@ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _ZIO_H @@ -38,12 +37,15 @@ extern "C" { #endif -#define ZBT_MAGIC 0x210da7ab10c7a11ULL /* zio data bloc tail */ +/* + * Embedded checksum + */ +#define ZEC_MAGIC 0x210da7ab10c7a11ULL -typedef struct zio_block_tail { - uint64_t zbt_magic; /* for validation, endianness */ - zio_cksum_t zbt_cksum; /* 256-bit checksum */ -} zio_block_tail_t; +typedef struct zio_eck { + uint64_t zec_magic; /* for validation, endianness */ + zio_cksum_t zec_cksum; /* 256-bit checksum */ +} zio_eck_t; /* * Gang block headers are self-checksumming and contain an array @@ -51,16 +53,16 @@ typedef struct zio_block_tail { */ #define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE #define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \ - sizeof (zio_block_tail_t)) / sizeof (blkptr_t)) + sizeof (zio_eck_t)) / sizeof (blkptr_t)) #define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \ - sizeof (zio_block_tail_t) - \ + sizeof (zio_eck_t) - \ (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\ sizeof (uint64_t)) typedef struct zio_gbh { blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS]; uint64_t zg_filler[SPA_GBH_FILLER]; - zio_block_tail_t zg_tail; + zio_eck_t zg_tail; } zio_gbh_phys_t; enum zio_checksum { @@ -73,12 +75,19 @@ enum zio_checksum { ZIO_CHECKSUM_FLETCHER_2, ZIO_CHECKSUM_FLETCHER_4, ZIO_CHECKSUM_SHA256, + ZIO_CHECKSUM_ZILOG2, ZIO_CHECKSUM_FUNCTIONS }; #define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_4 #define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON +#define ZIO_CHECKSUM_MASK 0xffULL +#define ZIO_CHECKSUM_VERIFY (1 << 8) + +#define ZIO_DEDUPCHECKSUM ZIO_CHECKSUM_SHA256 +#define ZIO_DEDUPDITTO_MIN 100 + enum zio_compress { ZIO_COMPRESS_INHERIT = 0, ZIO_COMPRESS_ON, @@ -94,12 +103,19 @@ enum zio_compress { ZIO_COMPRESS_GZIP_7, ZIO_COMPRESS_GZIP_8, ZIO_COMPRESS_GZIP_9, + ZIO_COMPRESS_ZLE, ZIO_COMPRESS_FUNCTIONS }; #define ZIO_COMPRESS_ON_VALUE ZIO_COMPRESS_LZJB #define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF +#define BOOTFS_COMPRESS_VALID(compress) \ + ((compress) == ZIO_COMPRESS_LZJB || \ + ((compress) == ZIO_COMPRESS_ON && \ + ZIO_COMPRESS_ON_VALUE == ZIO_COMPRESS_LZJB) || \ + (compress) == ZIO_COMPRESS_OFF) + #define ZIO_FAILURE_MODE_WAIT 0 #define ZIO_FAILURE_MODE_CONTINUE 1 #define ZIO_FAILURE_MODE_PANIC 2 @@ -107,84 +123,89 @@ enum zio_compress { #define ZIO_PRIORITY_NOW (zio_priority_table[0]) #define ZIO_PRIORITY_SYNC_READ (zio_priority_table[1]) #define ZIO_PRIORITY_SYNC_WRITE (zio_priority_table[2]) -#define ZIO_PRIORITY_ASYNC_READ (zio_priority_table[3]) -#define ZIO_PRIORITY_ASYNC_WRITE (zio_priority_table[4]) -#define ZIO_PRIORITY_FREE (zio_priority_table[5]) -#define ZIO_PRIORITY_CACHE_FILL (zio_priority_table[6]) -#define ZIO_PRIORITY_LOG_WRITE (zio_priority_table[7]) -#define ZIO_PRIORITY_RESILVER (zio_priority_table[8]) -#define ZIO_PRIORITY_SCRUB (zio_priority_table[9]) -#define ZIO_PRIORITY_TABLE_SIZE 10 - -#define ZIO_FLAG_MUSTSUCCEED 0x000000 -#define ZIO_FLAG_CANFAIL 0x000001 -#define ZIO_FLAG_SPECULATIVE 0x000002 -#define ZIO_FLAG_CONFIG_WRITER 0x000004 -#define ZIO_FLAG_DONT_RETRY 0x000008 - -#define ZIO_FLAG_DONT_CACHE 0x000010 -#define ZIO_FLAG_DONT_QUEUE 0x000020 -#define ZIO_FLAG_DONT_AGGREGATE 0x000040 -#define ZIO_FLAG_DONT_PROPAGATE 0x000080 - -#define ZIO_FLAG_IO_BYPASS 0x000100 -#define ZIO_FLAG_IO_REPAIR 0x000200 -#define ZIO_FLAG_IO_RETRY 0x000400 -#define ZIO_FLAG_IO_REWRITE 0x000800 - -#define ZIO_FLAG_SELF_HEAL 0x001000 -#define ZIO_FLAG_RESILVER 0x002000 -#define ZIO_FLAG_SCRUB 0x004000 -#define ZIO_FLAG_SCRUB_THREAD 0x008000 - -#define ZIO_FLAG_PROBE 0x010000 -#define ZIO_FLAG_GANG_CHILD 0x020000 -#define ZIO_FLAG_RAW 0x040000 -#define ZIO_FLAG_GODFATHER 0x080000 - -#define ZIO_FLAG_TRYHARD 0x100000 -#define ZIO_FLAG_NODATA 0x200000 -#define ZIO_FLAG_OPTIONAL 0x400000 - -#define ZIO_FLAG_GANG_INHERIT \ - (ZIO_FLAG_CANFAIL | \ - ZIO_FLAG_SPECULATIVE | \ - ZIO_FLAG_CONFIG_WRITER | \ - ZIO_FLAG_DONT_RETRY | \ - ZIO_FLAG_DONT_CACHE | \ - ZIO_FLAG_DONT_AGGREGATE | \ - ZIO_FLAG_SELF_HEAL | \ - ZIO_FLAG_RESILVER | \ - ZIO_FLAG_SCRUB | \ - ZIO_FLAG_SCRUB_THREAD) - -#define ZIO_FLAG_VDEV_INHERIT \ - (ZIO_FLAG_GANG_INHERIT | \ - ZIO_FLAG_IO_REPAIR | \ - ZIO_FLAG_IO_RETRY | \ - ZIO_FLAG_PROBE | \ - ZIO_FLAG_TRYHARD | \ - ZIO_FLAG_NODATA | \ - ZIO_FLAG_OPTIONAL) - -#define ZIO_FLAG_AGG_INHERIT \ - (ZIO_FLAG_DONT_AGGREGATE | \ - ZIO_FLAG_IO_REPAIR | \ - ZIO_FLAG_SELF_HEAL | \ - ZIO_FLAG_RESILVER | \ - ZIO_FLAG_SCRUB | \ - ZIO_FLAG_SCRUB_THREAD) +#define ZIO_PRIORITY_LOG_WRITE (zio_priority_table[3]) +#define ZIO_PRIORITY_CACHE_FILL (zio_priority_table[4]) +#define ZIO_PRIORITY_AGG (zio_priority_table[5]) +#define ZIO_PRIORITY_FREE (zio_priority_table[6]) +#define ZIO_PRIORITY_ASYNC_WRITE (zio_priority_table[7]) +#define ZIO_PRIORITY_ASYNC_READ (zio_priority_table[8]) +#define ZIO_PRIORITY_RESILVER (zio_priority_table[9]) +#define ZIO_PRIORITY_SCRUB (zio_priority_table[10]) +#define ZIO_PRIORITY_DDT_PREFETCH (zio_priority_table[11]) +#define ZIO_PRIORITY_TABLE_SIZE 12 #define ZIO_PIPELINE_CONTINUE 0x100 #define ZIO_PIPELINE_STOP 0x101 +enum zio_flag { + /* + * Flags inherited by gang, ddt, and vdev children, + * and that must be equal for two zios to aggregate + */ + ZIO_FLAG_DONT_AGGREGATE = 1 << 0, + ZIO_FLAG_IO_REPAIR = 1 << 1, + ZIO_FLAG_SELF_HEAL = 1 << 2, + ZIO_FLAG_RESILVER = 1 << 3, + ZIO_FLAG_SCRUB = 1 << 4, + ZIO_FLAG_SCRUB_THREAD = 1 << 5, + +#define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1) + + /* + * Flags inherited by ddt, gang, and vdev children. + */ + ZIO_FLAG_CANFAIL = 1 << 6, /* must be first for INHERIT */ + ZIO_FLAG_SPECULATIVE = 1 << 7, + ZIO_FLAG_CONFIG_WRITER = 1 << 8, + ZIO_FLAG_DONT_RETRY = 1 << 9, + ZIO_FLAG_DONT_CACHE = 1 << 10, + ZIO_FLAG_NODATA = 1 << 11, + ZIO_FLAG_INDUCE_DAMAGE = 1 << 12, + +#define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1) +#define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1) + + /* + * Flags inherited by vdev children. + */ + ZIO_FLAG_IO_RETRY = 1 << 13, /* must be first for INHERIT */ + ZIO_FLAG_PROBE = 1 << 14, + ZIO_FLAG_TRYHARD = 1 << 15, + ZIO_FLAG_OPTIONAL = 1 << 16, + +#define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1) + + /* + * Flags not inherited by any children. + */ + ZIO_FLAG_DONT_QUEUE = 1 << 17, /* must be first for INHERIT */ + ZIO_FLAG_DONT_PROPAGATE = 1 << 18, + ZIO_FLAG_IO_BYPASS = 1 << 19, + ZIO_FLAG_IO_REWRITE = 1 << 20, + ZIO_FLAG_RAW = 1 << 21, + ZIO_FLAG_GANG_CHILD = 1 << 22, + ZIO_FLAG_DDT_CHILD = 1 << 23, + ZIO_FLAG_GODFATHER = 1 << 24 +}; + +#define ZIO_FLAG_MUSTSUCCEED 0 + +#define ZIO_DDT_CHILD_FLAGS(zio) \ + (((zio)->io_flags & ZIO_FLAG_DDT_INHERIT) | \ + ZIO_FLAG_DDT_CHILD | ZIO_FLAG_CANFAIL) + #define ZIO_GANG_CHILD_FLAGS(zio) \ (((zio)->io_flags & ZIO_FLAG_GANG_INHERIT) | \ ZIO_FLAG_GANG_CHILD | ZIO_FLAG_CANFAIL) +#define ZIO_VDEV_CHILD_FLAGS(zio) \ + (((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) | \ + ZIO_FLAG_CANFAIL) + enum zio_child { ZIO_CHILD_VDEV = 0, ZIO_CHILD_GANG, + ZIO_CHILD_DDT, ZIO_CHILD_LOGICAL, ZIO_CHILD_TYPES }; @@ -202,7 +223,6 @@ enum zio_wait_type { #define ECKSUM EBADE #define EFRAGS EBADR -typedef struct zio zio_t; typedef void zio_done_func_t(zio_t *zio); extern uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE]; @@ -211,18 +231,15 @@ extern char *zio_type_name[ZIO_TYPES]; /* * A bookmark is a four-tuple <objset, object, level, blkid> that uniquely * identifies any block in the pool. By convention, the meta-objset (MOS) - * is objset 0, the meta-dnode is object 0, the root block (osphys_t) is - * level -1 of the meta-dnode, and intent log blocks (which are chained - * off the root block) have blkid == sequence number. In summary: + * is objset 0, and the meta-dnode is object 0. This covers all blocks + * except root blocks and ZIL blocks, which are defined as follows: * - * mos is objset 0 - * meta-dnode is object 0 - * root block is <objset, 0, -1, 0> - * intent log is <objset, 0, -1, ZIL sequence number> + * Root blocks (objset_phys_t) are object 0, level -1: <objset, 0, -1, 0>. + * ZIL blocks are bookmarked <objset, 0, -2, blkid == ZIL sequence number>. + * dmu_sync()ed ZIL data blocks are bookmarked <objset, object, -2, blkid>. * - * Note: this structure is called a bookmark because its first purpose was - * to remember where to resume a pool-wide traverse. The absolute ordering - * for block visitation during traversal is defined in compare_bookmark(). + * Note: this structure is called a bookmark because its original purpose + * was to remember where to resume a pool-wide traverse. * * Note: this structure is passed between userland and the kernel. * Therefore it must not change size or alignment between 32/64 bit @@ -235,14 +252,66 @@ typedef struct zbookmark { uint64_t zb_blkid; } zbookmark_t; +#define SET_BOOKMARK(zb, objset, object, level, blkid) \ +{ \ + (zb)->zb_objset = objset; \ + (zb)->zb_object = object; \ + (zb)->zb_level = level; \ + (zb)->zb_blkid = blkid; \ +} + +#define ZB_DESTROYED_OBJSET (-1ULL) + +#define ZB_ROOT_OBJECT (0ULL) +#define ZB_ROOT_LEVEL (-1LL) +#define ZB_ROOT_BLKID (0ULL) + +#define ZB_ZIL_OBJECT (0ULL) +#define ZB_ZIL_LEVEL (-2LL) + typedef struct zio_prop { enum zio_checksum zp_checksum; enum zio_compress zp_compress; dmu_object_type_t zp_type; uint8_t zp_level; - uint8_t zp_ndvas; + uint8_t zp_copies; + uint8_t zp_dedup; + uint8_t zp_dedup_verify; } zio_prop_t; +typedef struct zio_cksum_report zio_cksum_report_t; + +typedef void zio_cksum_finish_f(zio_cksum_report_t *rep, + const void *good_data); +typedef void zio_cksum_free_f(void *cbdata, size_t size); + +struct zio_bad_cksum; /* defined in zio_checksum.h */ + +struct zio_cksum_report { + struct zio_cksum_report *zcr_next; + nvlist_t *zcr_ereport; + nvlist_t *zcr_detector; + void *zcr_cbdata; + size_t zcr_cbinfo; /* passed to zcr_free() */ + uint64_t zcr_align; + uint64_t zcr_length; + zio_cksum_finish_f *zcr_finish; + zio_cksum_free_f *zcr_free; + + /* internal use only */ + struct zio_bad_cksum *zcr_ckinfo; /* information from failure */ +}; + +typedef void zio_vsd_cksum_report_f(zio_t *zio, zio_cksum_report_t *zcr, + void *arg); + +zio_vsd_cksum_report_f zio_vsd_default_cksum_report; + +typedef struct zio_vsd_ops { + zio_done_func_t *vsd_free; + zio_vsd_cksum_report_f *vsd_cksum_report; +} zio_vsd_ops_t; + typedef struct zio_gang_node { zio_gbh_phys_t *gn_gbh; struct zio_gang_node *gn_child[SPA_GBH_NBLKPTRS]; @@ -293,6 +362,7 @@ struct zio { uint64_t io_txg; spa_t *io_spa; blkptr_t *io_bp; + blkptr_t *io_bp_override; blkptr_t io_bp_copy; list_t io_parent_list; list_t io_child_list; @@ -304,16 +374,20 @@ struct zio { zio_done_func_t *io_ready; zio_done_func_t *io_done; void *io_private; + int64_t io_prev_space_delta; /* DMU private */ blkptr_t io_bp_orig; /* Data represented by this I/O */ void *io_data; + void *io_orig_data; uint64_t io_size; + uint64_t io_orig_size; /* Stuff for the vdev stack */ vdev_t *io_vd; void *io_vsd; - zio_done_func_t *io_vsd_free; + const zio_vsd_ops_t *io_vsd_ops; + uint64_t io_offset; uint64_t io_deadline; avl_node_t io_offset_node; @@ -321,15 +395,17 @@ struct zio { avl_tree_t *io_vdev_tree; /* Internal pipeline state */ - int io_flags; - zio_stage_t io_stage; - uint32_t io_pipeline; - int io_orig_flags; - zio_stage_t io_orig_stage; - uint32_t io_orig_pipeline; + enum zio_flag io_flags; + enum zio_stage io_stage; + enum zio_stage io_pipeline; + enum zio_flag io_orig_flags; + enum zio_stage io_orig_stage; + enum zio_stage io_orig_pipeline; int io_error; int io_child_error[ZIO_CHILD_TYPES]; uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES]; + uint64_t io_child_count; + uint64_t io_parent_count; uint64_t *io_stall; zio_t *io_gang_leader; zio_gang_node_t *io_gang_tree; @@ -339,53 +415,58 @@ struct zio { kcondvar_t io_cv; /* FMA state */ + zio_cksum_report_t *io_cksum_report; uint64_t io_ena; }; extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, - zio_done_func_t *done, void *private, int flags); + zio_done_func_t *done, void *private, enum zio_flag flags); extern zio_t *zio_root(spa_t *spa, - zio_done_func_t *done, void *private, int flags); + zio_done_func_t *done, void *private, enum zio_flag flags); extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, - int priority, int flags, const zbookmark_t *zb); + int priority, enum zio_flag flags, const zbookmark_t *zb); extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - void *data, uint64_t size, zio_prop_t *zp, + void *data, uint64_t size, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *done, void *private, - int priority, int flags, const zbookmark_t *zb); + int priority, enum zio_flag flags, const zbookmark_t *zb); extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, - int priority, int flags, zbookmark_t *zb); + int priority, enum zio_flag flags, zbookmark_t *zb); -extern void zio_skip_write(zio_t *zio); +extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies); -extern zio_t *zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - zio_done_func_t *done, void *private, int flags); +extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp); -extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - zio_done_func_t *done, void *private, int flags); +extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, + const blkptr_t *bp, + zio_done_func_t *done, void *private, enum zio_flag flags); extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, - zio_done_func_t *done, void *private, int priority, int flags); + zio_done_func_t *done, void *private, int priority, enum zio_flag flags); extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, void *data, int checksum, - zio_done_func_t *done, void *private, int priority, int flags, + zio_done_func_t *done, void *private, int priority, enum zio_flag flags, boolean_t labels); extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, void *data, int checksum, - zio_done_func_t *done, void *private, int priority, int flags, + zio_done_func_t *done, void *private, int priority, enum zio_flag flags, boolean_t labels); -extern int zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, - blkptr_t *old_bp, uint64_t txg); -extern void zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg); +extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, + const blkptr_t *bp, enum zio_flag flags); + +extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, + blkptr_t *old_bp, uint64_t size, boolean_t use_slog); +extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp); extern void zio_flush(zio_t *zio, vdev_t *vd); +extern void zio_shrink(zio_t *zio, uint64_t size); extern int zio_wait(zio_t *zio); extern void zio_nowait(zio_t *zio); @@ -406,11 +487,11 @@ extern void zio_resubmit_stage_async(void *); extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset, void *data, uint64_t size, int type, int priority, - int flags, zio_done_func_t *done, void *private); + enum zio_flag flags, zio_done_func_t *done, void *private); extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, int type, int priority, - int flags, zio_done_func_t *done, void *private); + enum zio_flag flags, zio_done_func_t *done, void *private); extern void zio_vdev_io_bypass(zio_t *zio); extern void zio_vdev_io_reissue(zio_t *zio); @@ -419,8 +500,12 @@ extern void zio_vdev_io_redone(zio_t *zio); extern void zio_checksum_verified(zio_t *zio); extern int zio_worst_error(int e1, int e2); -extern uint8_t zio_checksum_select(uint8_t child, uint8_t parent); -extern uint8_t zio_compress_select(uint8_t child, uint8_t parent); +extern enum zio_checksum zio_checksum_select(enum zio_checksum child, + enum zio_checksum parent); +extern enum zio_checksum zio_checksum_dedup_select(spa_t *spa, + enum zio_checksum child, enum zio_checksum parent); +extern enum zio_compress zio_compress_select(enum zio_compress child, + enum zio_compress parent); extern void zio_suspend(spa_t *spa, zio_t *zio); extern int zio_resume(spa_t *spa); @@ -442,9 +527,30 @@ extern int zio_inject_fault(char *name, int flags, int *id, extern int zio_inject_list_next(int *id, char *name, size_t buflen, struct zinject_record *record); extern int zio_clear_fault(int id); +extern void zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type); extern int zio_handle_fault_injection(zio_t *zio, int error); extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error); extern int zio_handle_label_injection(zio_t *zio, int error); +extern void zio_handle_ignored_writes(zio_t *zio); + +/* + * Checksum ereport functions + */ +extern void zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, struct zio *zio, + uint64_t offset, uint64_t length, void *arg, struct zio_bad_cksum *info); +extern void zfs_ereport_finish_checksum(zio_cksum_report_t *report, + const void *good_data, const void *bad_data, boolean_t drop_if_identical); + +extern void zfs_ereport_send_interim_checksum(zio_cksum_report_t *report); +extern void zfs_ereport_free_checksum(zio_cksum_report_t *report); + +/* If we have the good data in hand, this function can be used */ +extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, + struct zio *zio, uint64_t offset, uint64_t length, + const void *good_data, const void *bad_data, struct zio_bad_cksum *info); + +/* Called from spa_sync(), but primarily an injection handler */ +extern void spa_handle_ignored_writes(spa_t *spa); #ifdef __cplusplus } diff --git a/module/zfs/include/sys/zio_checksum.h b/module/zfs/include/sys/zio_checksum.h index da407399d..0956c04ab 100644 --- a/module/zfs/include/sys/zio_checksum.h +++ b/module/zfs/include/sys/zio_checksum.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_ZIO_CHECKSUM_H @@ -43,28 +42,31 @@ typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp); typedef struct zio_checksum_info { zio_checksum_t *ci_func[2]; /* checksum function for each byteorder */ int ci_correctable; /* number of correctable bits */ - int ci_zbt; /* uses zio block tail? */ + int ci_eck; /* uses zio embedded checksum? */ + int ci_dedup; /* strong enough for dedup? */ char *ci_name; /* descriptive name */ } zio_checksum_info_t; +typedef struct zio_bad_cksum { + zio_cksum_t zbc_expected; + zio_cksum_t zbc_actual; + const char *zbc_checksum_name; + uint8_t zbc_byteswapped; + uint8_t zbc_injected; + uint8_t zbc_has_cksum; /* expected/actual valid */ +} zio_bad_cksum_t; + extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS]; /* * Checksum routines. */ -extern zio_checksum_t fletcher_2_native; -extern zio_checksum_t fletcher_4_native; -extern zio_checksum_t fletcher_4_incremental_native; - -extern zio_checksum_t fletcher_2_byteswap; -extern zio_checksum_t fletcher_4_byteswap; -extern zio_checksum_t fletcher_4_incremental_byteswap; - extern zio_checksum_t zio_checksum_SHA256; extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, void *data, uint64_t size); -extern int zio_checksum_error(zio_t *zio); +extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out); +extern enum zio_checksum spa_dedup_checksum(spa_t *spa); #ifdef __cplusplus } diff --git a/module/zfs/include/sys/zio_compress.h b/module/zfs/include/sys/zio_compress.h index 66ee8d45b..30bed1a67 100644 --- a/module/zfs/include/sys/zio_compress.h +++ b/module/zfs/include/sys/zio_compress.h @@ -20,15 +20,13 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_ZIO_COMPRESS_H #define _SYS_ZIO_COMPRESS_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zio.h> #ifdef __cplusplus @@ -66,14 +64,18 @@ extern size_t gzip_compress(void *src, void *dst, size_t s_len, size_t d_len, int level); extern int gzip_decompress(void *src, void *dst, size_t s_len, size_t d_len, int level); +extern size_t zle_compress(void *src, void *dst, size_t s_len, size_t d_len, + int level); +extern int zle_decompress(void *src, void *dst, size_t s_len, size_t d_len, + int level); /* * Compress and decompress data if necessary. */ -extern int zio_compress_data(int cpfunc, void *src, uint64_t srcsize, - void **destp, uint64_t *destsizep, uint64_t *destbufsizep); -extern int zio_decompress_data(int cpfunc, void *src, uint64_t srcsize, - void *dest, uint64_t destsize); +extern size_t zio_compress_data(enum zio_compress c, void *src, void *dst, + size_t s_len); +extern int zio_decompress_data(enum zio_compress c, void *src, void *dst, + size_t s_len, size_t d_len); #ifdef __cplusplus } diff --git a/module/zfs/include/sys/zio_impl.h b/module/zfs/include/sys/zio_impl.h index e7503b733..d90bd8bd5 100644 --- a/module/zfs/include/sys/zio_impl.h +++ b/module/zfs/include/sys/zio_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -34,104 +34,136 @@ extern "C" { #endif /* - * I/O Groups: pipeline stage definitions. + * zio pipeline stage definitions */ -typedef enum zio_stage { - ZIO_STAGE_OPEN = 0, /* RWFCI */ +enum zio_stage { + ZIO_STAGE_OPEN = 1 << 0, /* RWFCI */ - ZIO_STAGE_ISSUE_ASYNC, /* -W--- */ + ZIO_STAGE_READ_BP_INIT = 1 << 1, /* R---- */ + ZIO_STAGE_FREE_BP_INIT = 1 << 2, /* --F-- */ + ZIO_STAGE_ISSUE_ASYNC = 1 << 3, /* RWF-- */ + ZIO_STAGE_WRITE_BP_INIT = 1 << 4, /* -W--- */ - ZIO_STAGE_READ_BP_INIT, /* R---- */ - ZIO_STAGE_WRITE_BP_INIT, /* -W--- */ + ZIO_STAGE_CHECKSUM_GENERATE = 1 << 5, /* -W--- */ - ZIO_STAGE_CHECKSUM_GENERATE, /* -W--- */ + ZIO_STAGE_DDT_READ_START = 1 << 6, /* R---- */ + ZIO_STAGE_DDT_READ_DONE = 1 << 7, /* R---- */ + ZIO_STAGE_DDT_WRITE = 1 << 8, /* -W--- */ + ZIO_STAGE_DDT_FREE = 1 << 9, /* --F-- */ - ZIO_STAGE_GANG_ASSEMBLE, /* RWFC- */ - ZIO_STAGE_GANG_ISSUE, /* RWFC- */ + ZIO_STAGE_GANG_ASSEMBLE = 1 << 10, /* RWFC- */ + ZIO_STAGE_GANG_ISSUE = 1 << 11, /* RWFC- */ - ZIO_STAGE_DVA_ALLOCATE, /* -W--- */ - ZIO_STAGE_DVA_FREE, /* --F-- */ - ZIO_STAGE_DVA_CLAIM, /* ---C- */ + ZIO_STAGE_DVA_ALLOCATE = 1 << 12, /* -W--- */ + ZIO_STAGE_DVA_FREE = 1 << 13, /* --F-- */ + ZIO_STAGE_DVA_CLAIM = 1 << 14, /* ---C- */ - ZIO_STAGE_READY, /* RWFCI */ + ZIO_STAGE_READY = 1 << 15, /* RWFCI */ - ZIO_STAGE_VDEV_IO_START, /* RW--I */ - ZIO_STAGE_VDEV_IO_DONE, /* RW--I */ - ZIO_STAGE_VDEV_IO_ASSESS, /* RW--I */ + ZIO_STAGE_VDEV_IO_START = 1 << 16, /* RW--I */ + ZIO_STAGE_VDEV_IO_DONE = 1 << 17, /* RW--I */ + ZIO_STAGE_VDEV_IO_ASSESS = 1 << 18, /* RW--I */ - ZIO_STAGE_CHECKSUM_VERIFY, /* R---- */ + ZIO_STAGE_CHECKSUM_VERIFY = 1 << 19, /* R---- */ - ZIO_STAGE_DONE, /* RWFCI */ - ZIO_STAGES -} zio_stage_t; + ZIO_STAGE_DONE = 1 << 20 /* RWFCI */ +}; -#define ZIO_INTERLOCK_STAGES \ - ((1U << ZIO_STAGE_READY) | \ - (1U << ZIO_STAGE_DONE)) +#define ZIO_INTERLOCK_STAGES \ + (ZIO_STAGE_READY | \ + ZIO_STAGE_DONE) -#define ZIO_INTERLOCK_PIPELINE \ +#define ZIO_INTERLOCK_PIPELINE \ ZIO_INTERLOCK_STAGES -#define ZIO_VDEV_IO_STAGES \ - ((1U << ZIO_STAGE_VDEV_IO_START) | \ - (1U << ZIO_STAGE_VDEV_IO_DONE) | \ - (1U << ZIO_STAGE_VDEV_IO_ASSESS)) +#define ZIO_VDEV_IO_STAGES \ + (ZIO_STAGE_VDEV_IO_START | \ + ZIO_STAGE_VDEV_IO_DONE | \ + ZIO_STAGE_VDEV_IO_ASSESS) -#define ZIO_VDEV_CHILD_PIPELINE \ - (ZIO_VDEV_IO_STAGES | \ - (1U << ZIO_STAGE_DONE)) +#define ZIO_VDEV_CHILD_PIPELINE \ + (ZIO_VDEV_IO_STAGES | \ + ZIO_STAGE_DONE) -#define ZIO_READ_COMMON_STAGES \ - (ZIO_INTERLOCK_STAGES | \ - ZIO_VDEV_IO_STAGES | \ - (1U << ZIO_STAGE_CHECKSUM_VERIFY)) +#define ZIO_READ_COMMON_STAGES \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_VDEV_IO_STAGES | \ + ZIO_STAGE_CHECKSUM_VERIFY) -#define ZIO_READ_PHYS_PIPELINE \ +#define ZIO_READ_PHYS_PIPELINE \ ZIO_READ_COMMON_STAGES -#define ZIO_READ_PIPELINE \ - (ZIO_READ_COMMON_STAGES | \ - (1U << ZIO_STAGE_READ_BP_INIT)) +#define ZIO_READ_PIPELINE \ + (ZIO_READ_COMMON_STAGES | \ + ZIO_STAGE_READ_BP_INIT) -#define ZIO_WRITE_COMMON_STAGES \ - (ZIO_INTERLOCK_STAGES | \ - ZIO_VDEV_IO_STAGES | \ - (1U << ZIO_STAGE_ISSUE_ASYNC) | \ - (1U << ZIO_STAGE_CHECKSUM_GENERATE)) - -#define ZIO_WRITE_PHYS_PIPELINE \ - ZIO_WRITE_COMMON_STAGES - -#define ZIO_REWRITE_PIPELINE \ - (ZIO_WRITE_COMMON_STAGES | \ - (1U << ZIO_STAGE_WRITE_BP_INIT)) - -#define ZIO_WRITE_PIPELINE \ - (ZIO_WRITE_COMMON_STAGES | \ - (1U << ZIO_STAGE_WRITE_BP_INIT) | \ - (1U << ZIO_STAGE_DVA_ALLOCATE)) - -#define ZIO_GANG_STAGES \ - ((1U << ZIO_STAGE_GANG_ASSEMBLE) | \ - (1U << ZIO_STAGE_GANG_ISSUE)) +#define ZIO_DDT_CHILD_READ_PIPELINE \ + ZIO_READ_COMMON_STAGES -#define ZIO_FREE_PIPELINE \ - (ZIO_INTERLOCK_STAGES | \ - (1U << ZIO_STAGE_DVA_FREE)) +#define ZIO_DDT_READ_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_STAGE_READ_BP_INIT | \ + ZIO_STAGE_DDT_READ_START | \ + ZIO_STAGE_DDT_READ_DONE) -#define ZIO_CLAIM_PIPELINE \ - (ZIO_INTERLOCK_STAGES | \ - (1U << ZIO_STAGE_DVA_CLAIM)) +#define ZIO_WRITE_COMMON_STAGES \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_VDEV_IO_STAGES | \ + ZIO_STAGE_ISSUE_ASYNC | \ + ZIO_STAGE_CHECKSUM_GENERATE) -#define ZIO_IOCTL_PIPELINE \ - (ZIO_INTERLOCK_STAGES | \ - (1U << ZIO_STAGE_VDEV_IO_START) | \ - (1U << ZIO_STAGE_VDEV_IO_ASSESS)) +#define ZIO_WRITE_PHYS_PIPELINE \ + ZIO_WRITE_COMMON_STAGES -#define ZIO_CONFIG_LOCK_BLOCKING_STAGES \ - ((1U << ZIO_STAGE_VDEV_IO_START) | \ - (1U << ZIO_STAGE_DVA_ALLOCATE) | \ - (1U << ZIO_STAGE_DVA_CLAIM)) +#define ZIO_REWRITE_PIPELINE \ + (ZIO_WRITE_COMMON_STAGES | \ + ZIO_STAGE_WRITE_BP_INIT) + +#define ZIO_WRITE_PIPELINE \ + (ZIO_WRITE_COMMON_STAGES | \ + ZIO_STAGE_WRITE_BP_INIT | \ + ZIO_STAGE_DVA_ALLOCATE) + +#define ZIO_DDT_CHILD_WRITE_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_VDEV_IO_STAGES | \ + ZIO_STAGE_DVA_ALLOCATE) + +#define ZIO_DDT_WRITE_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_STAGE_ISSUE_ASYNC | \ + ZIO_STAGE_WRITE_BP_INIT | \ + ZIO_STAGE_CHECKSUM_GENERATE | \ + ZIO_STAGE_DDT_WRITE) + +#define ZIO_GANG_STAGES \ + (ZIO_STAGE_GANG_ASSEMBLE | \ + ZIO_STAGE_GANG_ISSUE) + +#define ZIO_FREE_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_STAGE_FREE_BP_INIT | \ + ZIO_STAGE_DVA_FREE) + +#define ZIO_DDT_FREE_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_STAGE_FREE_BP_INIT | \ + ZIO_STAGE_ISSUE_ASYNC | \ + ZIO_STAGE_DDT_FREE) + +#define ZIO_CLAIM_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_STAGE_DVA_CLAIM) + +#define ZIO_IOCTL_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_STAGE_VDEV_IO_START | \ + ZIO_STAGE_VDEV_IO_ASSESS) + +#define ZIO_BLOCKING_STAGES \ + (ZIO_STAGE_DVA_ALLOCATE | \ + ZIO_STAGE_DVA_CLAIM | \ + ZIO_STAGE_VDEV_IO_START) extern void zio_inject_init(void); extern void zio_inject_fini(void); diff --git a/module/zfs/include/sys/zvol.h b/module/zfs/include/sys/zvol.h index 06adc667e..0059bf510 100644 --- a/module/zfs/include/sys/zvol.h +++ b/module/zfs/include/sys/zvol.h @@ -20,15 +20,12 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_ZVOL_H #define _SYS_ZVOL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #ifdef __cplusplus @@ -43,10 +40,10 @@ extern int zvol_check_volsize(uint64_t volsize, uint64_t blocksize); extern int zvol_check_volblocksize(uint64_t volblocksize); extern int zvol_get_stats(objset_t *os, nvlist_t *nv); extern void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); -extern int zvol_create_minor(const char *, major_t); +extern int zvol_create_minor(const char *); extern int zvol_remove_minor(const char *); +extern void zvol_remove_minors(const char *); extern int zvol_set_volsize(const char *, major_t, uint64_t); -extern int zvol_set_volblocksize(const char *, uint64_t); extern int zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr); extern int zvol_dump(dev_t dev, caddr_t addr, daddr_t offset, int nblocks); @@ -61,6 +58,15 @@ extern int zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, extern int zvol_busy(void); extern void zvol_init(void); extern void zvol_fini(void); + +extern int zvol_get_volume_params(minor_t minor, uint64_t *blksize, + uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl, + void **rl_hdl, void **bonus_hdl); +extern uint64_t zvol_get_volume_size(void *minor_hdl); +extern int zvol_get_volume_wce(void *minor_hdl); +extern void zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off, + ssize_t resid, boolean_t sync); + #endif #ifdef __cplusplus |