diff options
Diffstat (limited to 'module/zfs/include/sys')
52 files changed, 8662 insertions, 0 deletions
diff --git a/module/zfs/include/sys/arc.h b/module/zfs/include/sys/arc.h new file mode 100644 index 000000000..749bf53e5 --- /dev/null +++ b/module/zfs/include/sys/arc.h @@ -0,0 +1,138 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ARC_H +#define _SYS_ARC_H + +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/zio.h> +#include <sys/dmu.h> +#include <sys/spa.h> + +typedef struct arc_buf_hdr arc_buf_hdr_t; +typedef struct arc_buf arc_buf_t; +typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *private); +typedef int arc_evict_func_t(void *private); + +/* generic arc_done_func_t's which you can use */ +arc_done_func_t arc_bcopy_func; +arc_done_func_t arc_getbuf_func; + +struct arc_buf { + arc_buf_hdr_t *b_hdr; + arc_buf_t *b_next; + krwlock_t b_lock; + void *b_data; + arc_evict_func_t *b_efunc; + void *b_private; +}; + +typedef enum arc_buf_contents { + ARC_BUFC_DATA, /* buffer contains data */ + ARC_BUFC_METADATA, /* buffer contains metadata */ + ARC_BUFC_NUMTYPES +} arc_buf_contents_t; +/* + * These are the flags we pass into calls to the arc + */ +#define ARC_WAIT (1 << 1) /* perform I/O synchronously */ +#define ARC_NOWAIT (1 << 2) /* perform I/O asynchronously */ +#define ARC_PREFETCH (1 << 3) /* I/O is a prefetch */ +#define ARC_CACHED (1 << 4) /* I/O was already in cache */ +#define ARC_L2CACHE (1 << 5) /* cache in L2ARC */ + +void arc_space_consume(uint64_t space); +void arc_space_return(uint64_t space); +void *arc_data_buf_alloc(uint64_t space); +void arc_data_buf_free(void *buf, uint64_t space); +arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag, + arc_buf_contents_t type); +void arc_buf_add_ref(arc_buf_t *buf, void *tag); +int arc_buf_remove_ref(arc_buf_t *buf, void *tag); +int arc_buf_size(arc_buf_t *buf); +void arc_release(arc_buf_t *buf, void *tag); +int arc_released(arc_buf_t *buf); +int arc_has_callback(arc_buf_t *buf); +void arc_buf_freeze(arc_buf_t *buf); +void arc_buf_thaw(arc_buf_t *buf); +#ifdef ZFS_DEBUG +int arc_referenced(arc_buf_t *buf); +#endif + +typedef struct writeprops { + dmu_object_type_t wp_type; + uint8_t wp_level; + uint8_t wp_copies; + uint8_t wp_dncompress, wp_oscompress; + uint8_t wp_dnchecksum, wp_oschecksum; +} writeprops_t; + +void write_policy(spa_t *spa, const writeprops_t *wp, zio_prop_t *zp); +int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf, + arc_done_func_t *done, void *private, int priority, int zio_flags, + uint32_t *arc_flags, const zbookmark_t *zb); +int arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, + arc_done_func_t *done, void *private, int priority, int flags, + uint32_t *arc_flags, const zbookmark_t *zb); +zio_t *arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp, + boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, + arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority, + int zio_flags, const zbookmark_t *zb); +int arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, + zio_done_func_t *done, void *private, uint32_t arc_flags); +int arc_tryread(spa_t *spa, blkptr_t *bp, void *data); + +void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private); +int arc_buf_evict(arc_buf_t *buf); + +void arc_flush(spa_t *spa); +void arc_tempreserve_clear(uint64_t reserve); +int arc_tempreserve_space(uint64_t reserve, uint64_t txg); + +void arc_init(void); +void arc_fini(void); + +/* + * Level 2 ARC + */ + +void l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end); +void l2arc_remove_vdev(vdev_t *vd); +boolean_t l2arc_vdev_present(vdev_t *vd); +void l2arc_init(void); +void l2arc_fini(void); +void l2arc_start(void); +void l2arc_stop(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ARC_H */ diff --git a/module/zfs/include/sys/bplist.h b/module/zfs/include/sys/bplist.h new file mode 100644 index 000000000..cdb93a6c3 --- /dev/null +++ b/module/zfs/include/sys/bplist.h @@ -0,0 +1,89 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_BPLIST_H +#define _SYS_BPLIST_H + +#include <sys/dmu.h> +#include <sys/spa.h> +#include <sys/txg.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct bplist_phys { + /* + * This is the bonus buffer for the dead lists. The object's + * contents is an array of bpl_entries blkptr_t's, representing + * a total of bpl_bytes physical space. + */ + uint64_t bpl_entries; + uint64_t bpl_bytes; + uint64_t bpl_comp; + uint64_t bpl_uncomp; +} bplist_phys_t; + +#define BPLIST_SIZE_V0 (2 * sizeof (uint64_t)) + +typedef struct bplist_q { + blkptr_t bpq_blk; + void *bpq_next; +} bplist_q_t; + +typedef struct bplist { + kmutex_t bpl_lock; + objset_t *bpl_mos; + uint64_t bpl_object; + uint8_t bpl_blockshift; + uint8_t bpl_bpshift; + uint8_t bpl_havecomp; + bplist_q_t *bpl_queue; + bplist_phys_t *bpl_phys; + dmu_buf_t *bpl_dbuf; + dmu_buf_t *bpl_cached_dbuf; +} bplist_t; + +extern uint64_t bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx); +extern void bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx); +extern int bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object); +extern void bplist_close(bplist_t *bpl); +extern boolean_t bplist_empty(bplist_t *bpl); +extern int bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp); +extern int bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx); +extern void bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp); +extern void bplist_sync(bplist_t *bpl, dmu_tx_t *tx); +extern void bplist_vacate(bplist_t *bpl, dmu_tx_t *tx); +extern int bplist_space(bplist_t *bpl, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); +extern int bplist_space_birthrange(bplist_t *bpl, + uint64_t mintxg, uint64_t maxtxg, uint64_t *dasizep); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BPLIST_H */ diff --git a/module/zfs/include/sys/dbuf.h b/module/zfs/include/sys/dbuf.h new file mode 100644 index 000000000..75ce27264 --- /dev/null +++ b/module/zfs/include/sys/dbuf.h @@ -0,0 +1,347 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DBUF_H +#define _SYS_DBUF_H + +#include <sys/dmu.h> +#include <sys/spa.h> +#include <sys/txg.h> +#include <sys/zio.h> +#include <sys/arc.h> +#include <sys/zfs_context.h> +#include <sys/refcount.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define DB_BONUS_BLKID (-1ULL) +#define IN_DMU_SYNC 2 + +/* + * define flags for dbuf_read + */ + +#define DB_RF_MUST_SUCCEED (1 << 0) +#define DB_RF_CANFAIL (1 << 1) +#define DB_RF_HAVESTRUCT (1 << 2) +#define DB_RF_NOPREFETCH (1 << 3) +#define DB_RF_NEVERWAIT (1 << 4) +#define DB_RF_CACHED (1 << 5) + +/* + * The simplified state transition diagram for dbufs looks like: + * + * +----> READ ----+ + * | | + * | V + * (alloc)-->UNCACHED CACHED-->EVICTING-->(free) + * | ^ ^ + * | | | + * +----> FILL ----+ | + * | | + * | | + * +--------> NOFILL -------+ + */ +typedef enum dbuf_states { + DB_UNCACHED, + DB_FILL, + DB_NOFILL, + DB_READ, + DB_CACHED, + DB_EVICTING +} dbuf_states_t; + +struct objset_impl; +struct dnode; +struct dmu_tx; + +/* + * level = 0 means the user data + * level = 1 means the single indirect block + * etc. + */ + +#define LIST_LINK_INACTIVE(link) \ + ((link)->list_next == NULL && (link)->list_prev == NULL) + +struct dmu_buf_impl; + +typedef enum override_states { + DR_NOT_OVERRIDDEN, + DR_IN_DMU_SYNC, + DR_OVERRIDDEN +} override_states_t; + +typedef struct dbuf_dirty_record { + /* link on our parents dirty list */ + list_node_t dr_dirty_node; + + /* transaction group this data will sync in */ + uint64_t dr_txg; + + /* zio of outstanding write IO */ + zio_t *dr_zio; + + /* pointer back to our dbuf */ + struct dmu_buf_impl *dr_dbuf; + + /* pointer to next dirty record */ + struct dbuf_dirty_record *dr_next; + + /* pointer to parent dirty record */ + struct dbuf_dirty_record *dr_parent; + + union dirty_types { + struct dirty_indirect { + + /* protect access to list */ + kmutex_t dr_mtx; + + /* Our list of dirty children */ + list_t dr_children; + } di; + struct dirty_leaf { + + /* + * dr_data is set when we dirty the buffer + * so that we can retain the pointer even if it + * gets COW'd in a subsequent transaction group. + */ + arc_buf_t *dr_data; + blkptr_t dr_overridden_by; + override_states_t dr_override_state; + } dl; + } dt; +} dbuf_dirty_record_t; + +typedef struct dmu_buf_impl { + /* + * The following members are immutable, with the exception of + * db.db_data, which is protected by db_mtx. + */ + + /* the publicly visible structure */ + dmu_buf_t db; + + /* the objset we belong to */ + struct objset_impl *db_objset; + + /* + * the dnode we belong to (NULL when evicted) + */ + struct dnode *db_dnode; + + /* + * our parent buffer; if the dnode points to us directly, + * db_parent == db_dnode->dn_dbuf + * only accessed by sync thread ??? + * (NULL when evicted) + */ + struct dmu_buf_impl *db_parent; + + /* + * link for hash table of all dmu_buf_impl_t's + */ + struct dmu_buf_impl *db_hash_next; + + /* our block number */ + uint64_t db_blkid; + + /* + * Pointer to the blkptr_t which points to us. May be NULL if we + * don't have one yet. (NULL when evicted) + */ + blkptr_t *db_blkptr; + + /* + * Our indirection level. Data buffers have db_level==0. + * Indirect buffers which point to data buffers have + * db_level==1. etc. Buffers which contain dnodes have + * db_level==0, since the dnodes are stored in a file. + */ + uint8_t db_level; + + /* db_mtx protects the members below */ + kmutex_t db_mtx; + + /* + * Current state of the buffer + */ + dbuf_states_t db_state; + + /* + * Refcount accessed by dmu_buf_{hold,rele}. + * If nonzero, the buffer can't be destroyed. + * Protected by db_mtx. + */ + refcount_t db_holds; + + /* buffer holding our data */ + arc_buf_t *db_buf; + + kcondvar_t db_changed; + dbuf_dirty_record_t *db_data_pending; + + /* pointer to most recent dirty record for this buffer */ + dbuf_dirty_record_t *db_last_dirty; + + /* + * Our link on the owner dnodes's dn_dbufs list. + * Protected by its dn_dbufs_mtx. + */ + list_node_t db_link; + + /* Data which is unique to data (leaf) blocks: */ + + /* stuff we store for the user (see dmu_buf_set_user) */ + void *db_user_ptr; + void **db_user_data_ptr_ptr; + dmu_buf_evict_func_t *db_evict_func; + + uint8_t db_immediate_evict; + uint8_t db_freed_in_flight; + + uint8_t db_dirtycnt; +} dmu_buf_impl_t; + +/* Note: the dbuf hash table is exposed only for the mdb module */ +#define DBUF_MUTEXES 256 +#define DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)]) +typedef struct dbuf_hash_table { + uint64_t hash_table_mask; + dmu_buf_impl_t **hash_table; + kmutex_t hash_mutexes[DBUF_MUTEXES]; +} dbuf_hash_table_t; + + +uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset); + +dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data); +void dbuf_create_bonus(struct dnode *dn); + +dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag); +dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid, + void *tag); +int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, int create, + void *tag, dmu_buf_impl_t **dbp); + +void dbuf_prefetch(struct dnode *dn, uint64_t blkid); + +void dbuf_add_ref(dmu_buf_impl_t *db, void *tag); +uint64_t dbuf_refcount(dmu_buf_impl_t *db); + +void dbuf_rele(dmu_buf_impl_t *db, void *tag); + +dmu_buf_impl_t *dbuf_find(struct dnode *dn, uint8_t level, uint64_t blkid); + +int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags); +void dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); +void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx); +void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx); +void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); +void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx); +dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); + +void dbuf_clear(dmu_buf_impl_t *db); +void dbuf_evict(dmu_buf_impl_t *db); + +void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx); +void dbuf_unoverride(dbuf_dirty_record_t *dr); +void dbuf_sync_list(list_t *list, dmu_tx_t *tx); + +void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end, + struct dmu_tx *); + +void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx); + +void dbuf_init(void); +void dbuf_fini(void); + +#define DBUF_IS_METADATA(db) \ + ((db)->db_level > 0 || dmu_ot[(db)->db_dnode->dn_type].ot_metadata) + +#define DBUF_GET_BUFC_TYPE(db) \ + (DBUF_IS_METADATA(db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA) + +#define DBUF_IS_CACHEABLE(db) \ + ((db)->db_objset->os_primary_cache == ZFS_CACHE_ALL || \ + (DBUF_IS_METADATA(db) && \ + ((db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA))) + +#define DBUF_IS_L2CACHEABLE(db) \ + ((db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL || \ + (DBUF_IS_METADATA(db) && \ + ((db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA))) + +#ifdef ZFS_DEBUG + +/* + * There should be a ## between the string literal and fmt, to make it + * clear that we're joining two strings together, but gcc does not + * support that preprocessor token. + */ +#define dprintf_dbuf(dbuf, fmt, ...) do { \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ + char __db_buf[32]; \ + uint64_t __db_obj = (dbuf)->db.db_object; \ + if (__db_obj == DMU_META_DNODE_OBJECT) \ + (void) strcpy(__db_buf, "mdn"); \ + else \ + (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \ + (u_longlong_t)__db_obj); \ + dprintf_ds((dbuf)->db_objset->os_dsl_dataset, \ + "obj=%s lvl=%u blkid=%lld " fmt, \ + __db_buf, (dbuf)->db_level, \ + (u_longlong_t)(dbuf)->db_blkid, __VA_ARGS__); \ + } \ +_NOTE(CONSTCOND) } while (0) + +#define dprintf_dbuf_bp(db, bp, fmt, ...) do { \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ + char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \ + sprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, bp); \ + dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf); \ + kmem_free(__blkbuf, BP_SPRINTF_LEN); \ + } \ +_NOTE(CONSTCOND) } while (0) + +#define DBUF_VERIFY(db) dbuf_verify(db) + +#else + +#define dprintf_dbuf(db, fmt, ...) +#define dprintf_dbuf_bp(db, bp, fmt, ...) +#define DBUF_VERIFY(db) + +#endif + + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DBUF_H */ diff --git a/module/zfs/include/sys/dmu.h b/module/zfs/include/sys/dmu.h new file mode 100644 index 000000000..3b1e5c8fb --- /dev/null +++ b/module/zfs/include/sys/dmu.h @@ -0,0 +1,638 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DMU_H +#define _SYS_DMU_H + +/* + * This file describes the interface that the DMU provides for its + * consumers. + * + * The DMU also interacts with the SPA. That interface is described in + * dmu_spa.h. + */ + +#include <sys/inttypes.h> +#include <sys/types.h> +#include <sys/param.h> +#include <sys/cred.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct uio; +struct page; +struct vnode; +struct spa; +struct zilog; +struct zio; +struct blkptr; +struct zap_cursor; +struct dsl_dataset; +struct dsl_pool; +struct dnode; +struct drr_begin; +struct drr_end; +struct zbookmark; +struct spa; +struct nvlist; +struct objset_impl; + +typedef struct objset objset_t; +typedef struct dmu_tx dmu_tx_t; +typedef struct dsl_dir dsl_dir_t; + +typedef enum dmu_object_type { + DMU_OT_NONE, + /* general: */ + DMU_OT_OBJECT_DIRECTORY, /* ZAP */ + DMU_OT_OBJECT_ARRAY, /* UINT64 */ + DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */ + DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */ + DMU_OT_BPLIST, /* UINT64 */ + DMU_OT_BPLIST_HDR, /* UINT64 */ + /* spa: */ + DMU_OT_SPACE_MAP_HEADER, /* UINT64 */ + DMU_OT_SPACE_MAP, /* UINT64 */ + /* zil: */ + DMU_OT_INTENT_LOG, /* UINT64 */ + /* dmu: */ + DMU_OT_DNODE, /* DNODE */ + DMU_OT_OBJSET, /* OBJSET */ + /* dsl: */ + DMU_OT_DSL_DIR, /* UINT64 */ + DMU_OT_DSL_DIR_CHILD_MAP, /* ZAP */ + DMU_OT_DSL_DS_SNAP_MAP, /* ZAP */ + DMU_OT_DSL_PROPS, /* ZAP */ + DMU_OT_DSL_DATASET, /* UINT64 */ + /* zpl: */ + DMU_OT_ZNODE, /* ZNODE */ + DMU_OT_OLDACL, /* Old ACL */ + DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */ + DMU_OT_DIRECTORY_CONTENTS, /* ZAP */ + DMU_OT_MASTER_NODE, /* ZAP */ + DMU_OT_UNLINKED_SET, /* ZAP */ + /* zvol: */ + DMU_OT_ZVOL, /* UINT8 */ + DMU_OT_ZVOL_PROP, /* ZAP */ + /* other; for testing only! */ + DMU_OT_PLAIN_OTHER, /* UINT8 */ + DMU_OT_UINT64_OTHER, /* UINT64 */ + DMU_OT_ZAP_OTHER, /* ZAP */ + /* new object types: */ + DMU_OT_ERROR_LOG, /* ZAP */ + DMU_OT_SPA_HISTORY, /* UINT8 */ + DMU_OT_SPA_HISTORY_OFFSETS, /* spa_his_phys_t */ + DMU_OT_POOL_PROPS, /* ZAP */ + DMU_OT_DSL_PERMS, /* ZAP */ + DMU_OT_ACL, /* ACL */ + DMU_OT_SYSACL, /* SYSACL */ + DMU_OT_FUID, /* FUID table (Packed NVLIST UINT8) */ + DMU_OT_FUID_SIZE, /* FUID table size UINT64 */ + DMU_OT_NEXT_CLONES, /* ZAP */ + DMU_OT_SCRUB_QUEUE, /* ZAP */ + DMU_OT_NUMTYPES +} dmu_object_type_t; + +typedef enum dmu_objset_type { + DMU_OST_NONE, + DMU_OST_META, + DMU_OST_ZFS, + DMU_OST_ZVOL, + DMU_OST_OTHER, /* For testing only! */ + DMU_OST_ANY, /* Be careful! */ + DMU_OST_NUMTYPES +} dmu_objset_type_t; + +void byteswap_uint64_array(void *buf, size_t size); +void byteswap_uint32_array(void *buf, size_t size); +void byteswap_uint16_array(void *buf, size_t size); +void byteswap_uint8_array(void *buf, size_t size); +void zap_byteswap(void *buf, size_t size); +void zfs_oldacl_byteswap(void *buf, size_t size); +void zfs_acl_byteswap(void *buf, size_t size); +void zfs_znode_byteswap(void *buf, size_t size); + +#define DS_MODE_NOHOLD 0 /* internal use only */ +#define DS_MODE_USER 1 /* simple access, no special needs */ +#define DS_MODE_OWNER 2 /* the "main" access, e.g. a mount */ +#define DS_MODE_TYPE_MASK 0x3 +#define DS_MODE_TYPE(x) ((x) & DS_MODE_TYPE_MASK) +#define DS_MODE_READONLY 0x8 +#define DS_MODE_IS_READONLY(x) ((x) & DS_MODE_READONLY) +#define DS_MODE_INCONSISTENT 0x10 +#define DS_MODE_IS_INCONSISTENT(x) ((x) & DS_MODE_INCONSISTENT) + +#define DS_FIND_SNAPSHOTS (1<<0) +#define DS_FIND_CHILDREN (1<<1) + +/* + * The maximum number of bytes that can be accessed as part of one + * operation, including metadata. + */ +#define DMU_MAX_ACCESS (10<<20) /* 10MB */ +#define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */ + +/* + * Public routines to create, destroy, open, and close objsets. + */ +int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, + objset_t **osp); +int dmu_objset_open_ds(struct dsl_dataset *ds, dmu_objset_type_t type, + objset_t **osp); +void dmu_objset_close(objset_t *os); +int dmu_objset_evict_dbufs(objset_t *os); +int dmu_objset_create(const char *name, dmu_objset_type_t type, + objset_t *clone_parent, uint64_t flags, + void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg); +int dmu_objset_destroy(const char *name); +int dmu_snapshots_destroy(char *fsname, char *snapname); +int dmu_objset_rollback(objset_t *os); +int dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive); +int dmu_objset_rename(const char *name, const char *newname, + boolean_t recursive); +int dmu_objset_find(char *name, int func(char *, void *), void *arg, + int flags); +void dmu_objset_byteswap(void *buf, size_t size); + +typedef struct dmu_buf { + uint64_t db_object; /* object that this buffer is part of */ + uint64_t db_offset; /* byte offset in this object */ + uint64_t db_size; /* size of buffer in bytes */ + void *db_data; /* data in buffer */ +} dmu_buf_t; + +typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); + +/* + * The names of zap entries in the DIRECTORY_OBJECT of the MOS. + */ +#define DMU_POOL_DIRECTORY_OBJECT 1 +#define DMU_POOL_CONFIG "config" +#define DMU_POOL_ROOT_DATASET "root_dataset" +#define DMU_POOL_SYNC_BPLIST "sync_bplist" +#define DMU_POOL_ERRLOG_SCRUB "errlog_scrub" +#define DMU_POOL_ERRLOG_LAST "errlog_last" +#define DMU_POOL_SPARES "spares" +#define DMU_POOL_DEFLATE "deflate" +#define DMU_POOL_HISTORY "history" +#define DMU_POOL_PROPS "pool_props" +#define DMU_POOL_L2CACHE "l2cache" + +/* 4x8 zbookmark_t */ +#define DMU_POOL_SCRUB_BOOKMARK "scrub_bookmark" +/* 1x8 zap obj DMU_OT_SCRUB_QUEUE */ +#define DMU_POOL_SCRUB_QUEUE "scrub_queue" +/* 1x8 txg */ +#define DMU_POOL_SCRUB_MIN_TXG "scrub_min_txg" +/* 1x8 txg */ +#define DMU_POOL_SCRUB_MAX_TXG "scrub_max_txg" +/* 1x4 enum scrub_func */ +#define DMU_POOL_SCRUB_FUNC "scrub_func" +/* 1x8 count */ +#define DMU_POOL_SCRUB_ERRORS "scrub_errors" + +/* + * Allocate an object from this objset. The range of object numbers + * available is (0, DN_MAX_OBJECT). Object 0 is the meta-dnode. + * + * The transaction must be assigned to a txg. The newly allocated + * object will be "held" in the transaction (ie. you can modify the + * newly allocated object in this transaction). + * + * dmu_object_alloc() chooses an object and returns it in *objectp. + * + * dmu_object_claim() allocates a specific object number. If that + * number is already allocated, it fails and returns EEXIST. + * + * Return 0 on success, or ENOSPC or EEXIST as specified above. + */ +uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot, + int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); +int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, + int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); +int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, + int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); + +/* + * Free an object from this objset. + * + * The object's data will be freed as well (ie. you don't need to call + * dmu_free(object, 0, -1, tx)). + * + * The object need not be held in the transaction. + * + * If there are any holds on this object's buffers (via dmu_buf_hold()), + * or tx holds on the object (via dmu_tx_hold_object()), you can not + * free it; it fails and returns EBUSY. + * + * If the object is not allocated, it fails and returns ENOENT. + * + * Return 0 on success, or EBUSY or ENOENT as specified above. + */ +int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx); + +/* + * Find the next allocated or free object. + * + * The objectp parameter is in-out. It will be updated to be the next + * object which is allocated. Ignore objects which have not been + * modified since txg. + * + * XXX Can only be called on a objset with no dirty data. + * + * Returns 0 on success, or ENOENT if there are no more objects. + */ +int dmu_object_next(objset_t *os, uint64_t *objectp, + boolean_t hole, uint64_t txg); + +/* + * Set the data blocksize for an object. + * + * The object cannot have any blocks allcated beyond the first. If + * the first block is allocated already, the new size must be greater + * than the current block size. If these conditions are not met, + * ENOTSUP will be returned. + * + * Returns 0 on success, or EBUSY if there are any holds on the object + * contents, or ENOTSUP as described above. + */ +int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, + int ibs, dmu_tx_t *tx); + +/* + * Set the checksum property on a dnode. The new checksum algorithm will + * apply to all newly written blocks; existing blocks will not be affected. + */ +void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, + dmu_tx_t *tx); + +/* + * Set the compress property on a dnode. The new compression algorithm will + * apply to all newly written blocks; existing blocks will not be affected. + */ +void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, + dmu_tx_t *tx); + +/* + * Decide how many copies of a given block we should make. Can be from + * 1 to SPA_DVAS_PER_BP. + */ +int dmu_get_replication_level(struct objset_impl *, struct zbookmark *zb, + dmu_object_type_t ot); +/* + * The bonus data is accessed more or less like a regular buffer. + * You must dmu_bonus_hold() to get the buffer, which will give you a + * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus + * data. As with any normal buffer, you must call dmu_buf_read() to + * read db_data, dmu_buf_will_dirty() before modifying it, and the + * object must be held in an assigned transaction before calling + * dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus + * buffer as well. You must release your hold with dmu_buf_rele(). + */ +int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **); +int dmu_bonus_max(void); +int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *); + +/* + * Obtain the DMU buffer from the specified object which contains the + * specified offset. dmu_buf_hold() puts a "hold" on the buffer, so + * that it will remain in memory. You must release the hold with + * dmu_buf_rele(). You musn't access the dmu_buf_t after releasing your + * hold. You must have a hold on any dmu_buf_t* you pass to the DMU. + * + * You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill + * on the returned buffer before reading or writing the buffer's + * db_data. The comments for those routines describe what particular + * operations are valid after calling them. + * + * The object number must be a valid, allocated object number. + */ +int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, + void *tag, dmu_buf_t **); +void dmu_buf_add_ref(dmu_buf_t *db, void* tag); +void dmu_buf_rele(dmu_buf_t *db, void *tag); +uint64_t dmu_buf_refcount(dmu_buf_t *db); + +/* + * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a + * range of an object. A pointer to an array of dmu_buf_t*'s is + * returned (in *dbpp). + * + * dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and + * frees the array. The hold on the array of buffers MUST be released + * with dmu_buf_rele_array. You can NOT release the hold on each buffer + * individually with dmu_buf_rele. + */ +int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, + uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp); +void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag); + +/* + * Returns NULL on success, or the existing user ptr if it's already + * been set. + * + * user_ptr is for use by the user and can be obtained via dmu_buf_get_user(). + * + * user_data_ptr_ptr should be NULL, or a pointer to a pointer which + * will be set to db->db_data when you are allowed to access it. Note + * that db->db_data (the pointer) can change when you do dmu_buf_read(), + * dmu_buf_tryupgrade(), dmu_buf_will_dirty(), or dmu_buf_will_fill(). + * *user_data_ptr_ptr will be set to the new value when it changes. + * + * If non-NULL, pageout func will be called when this buffer is being + * excised from the cache, so that you can clean up the data structure + * pointed to by user_ptr. + * + * dmu_evict_user() will call the pageout func for all buffers in a + * objset with a given pageout func. + */ +void *dmu_buf_set_user(dmu_buf_t *db, void *user_ptr, void *user_data_ptr_ptr, + dmu_buf_evict_func_t *pageout_func); +/* + * set_user_ie is the same as set_user, but request immediate eviction + * when hold count goes to zero. + */ +void *dmu_buf_set_user_ie(dmu_buf_t *db, void *user_ptr, + void *user_data_ptr_ptr, dmu_buf_evict_func_t *pageout_func); +void *dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, + void *user_ptr, void *user_data_ptr_ptr, + dmu_buf_evict_func_t *pageout_func); +void dmu_evict_user(objset_t *os, dmu_buf_evict_func_t *func); + +/* + * Returns the user_ptr set with dmu_buf_set_user(), or NULL if not set. + */ +void *dmu_buf_get_user(dmu_buf_t *db); + +/* + * Indicate that you are going to modify the buffer's data (db_data). + * + * The transaction (tx) must be assigned to a txg (ie. you've called + * dmu_tx_assign()). The buffer's object must be held in the tx + * (ie. you've called dmu_tx_hold_object(tx, db->db_object)). + */ +void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx); + +/* + * You must create a transaction, then hold the objects which you will + * (or might) modify as part of this transaction. Then you must assign + * the transaction to a transaction group. Once the transaction has + * been assigned, you can modify buffers which belong to held objects as + * part of this transaction. You can't modify buffers before the + * transaction has been assigned; you can't modify buffers which don't + * belong to objects which this transaction holds; you can't hold + * objects once the transaction has been assigned. You may hold an + * object which you are going to free (with dmu_object_free()), but you + * don't have to. + * + * You can abort the transaction before it has been assigned. + * + * Note that you may hold buffers (with dmu_buf_hold) at any time, + * regardless of transaction state. + */ + +#define DMU_NEW_OBJECT (-1ULL) +#define DMU_OBJECT_END (-1ULL) + +dmu_tx_t *dmu_tx_create(objset_t *os); +void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); +void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, + uint64_t len); +void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name); +void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object); +void dmu_tx_abort(dmu_tx_t *tx); +int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); +void dmu_tx_wait(dmu_tx_t *tx); +void dmu_tx_commit(dmu_tx_t *tx); + +/* + * Free up the data blocks for a defined range of a file. If size is + * zero, the range from offset to end-of-file is freed. + */ +int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, + uint64_t size, dmu_tx_t *tx); +int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset, + uint64_t size); +int dmu_free_object(objset_t *os, uint64_t object); + +/* + * Convenience functions. + * + * Canfail routines will return 0 on success, or an errno if there is a + * nonrecoverable I/O error. + */ +int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + void *buf); +void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx); +void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + dmu_tx_t *tx); +int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size); +int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size, + dmu_tx_t *tx); +int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, + uint64_t size, struct page *pp, dmu_tx_t *tx); + +extern int zfs_prefetch_disable; + +/* + * Asynchronously try to read in the data. + */ +void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, + uint64_t len); + +typedef struct dmu_object_info { + /* All sizes are in bytes. */ + uint32_t doi_data_block_size; + uint32_t doi_metadata_block_size; + uint64_t doi_bonus_size; + dmu_object_type_t doi_type; + dmu_object_type_t doi_bonus_type; + uint8_t doi_indirection; /* 2 = dnode->indirect->data */ + uint8_t doi_checksum; + uint8_t doi_compress; + uint8_t doi_pad[5]; + /* Values below are number of 512-byte blocks. */ + uint64_t doi_physical_blks; /* data + metadata */ + uint64_t doi_max_block_offset; +} dmu_object_info_t; + +typedef void arc_byteswap_func_t(void *buf, size_t size); + +typedef struct dmu_object_type_info { + arc_byteswap_func_t *ot_byteswap; + boolean_t ot_metadata; + char *ot_name; +} dmu_object_type_info_t; + +extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES]; + +/* + * Get information on a DMU object. + * + * Return 0 on success or ENOENT if object is not allocated. + * + * If doi is NULL, just indicates whether the object exists. + */ +int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi); +void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi); +void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi); +void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, + u_longlong_t *nblk512); + +typedef struct dmu_objset_stats { + uint64_t dds_num_clones; /* number of clones of this */ + uint64_t dds_creation_txg; + uint64_t dds_guid; + dmu_objset_type_t dds_type; + uint8_t dds_is_snapshot; + uint8_t dds_inconsistent; + char dds_origin[MAXNAMELEN]; +} dmu_objset_stats_t; + +/* + * Get stats on a dataset. + */ +void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); + +/* + * Add entries to the nvlist for all the objset's properties. See + * zfs_prop_table[] and zfs(1m) for details on the properties. + */ +void dmu_objset_stats(objset_t *os, struct nvlist *nv); + +/* + * Get the space usage statistics for statvfs(). + * + * refdbytes is the amount of space "referenced" by this objset. + * availbytes is the amount of space available to this objset, taking + * into account quotas & reservations, assuming that no other objsets + * use the space first. These values correspond to the 'referenced' and + * 'available' properties, described in the zfs(1m) manpage. + * + * usedobjs and availobjs are the number of objects currently allocated, + * and available. + */ +void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, + uint64_t *usedobjsp, uint64_t *availobjsp); + +/* + * The fsid_guid is a 56-bit ID that can change to avoid collisions. + * (Contrast with the ds_guid which is a 64-bit ID that will never + * change, so there is a small probability that it will collide.) + */ +uint64_t dmu_objset_fsid_guid(objset_t *os); + +int dmu_objset_is_snapshot(objset_t *os); + +extern struct spa *dmu_objset_spa(objset_t *os); +extern struct zilog *dmu_objset_zil(objset_t *os); +extern struct dsl_pool *dmu_objset_pool(objset_t *os); +extern struct dsl_dataset *dmu_objset_ds(objset_t *os); +extern void dmu_objset_name(objset_t *os, char *buf); +extern dmu_objset_type_t dmu_objset_type(objset_t *os); +extern uint64_t dmu_objset_id(objset_t *os); +extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, + uint64_t *id, uint64_t *offp, boolean_t *case_conflict); +extern int dmu_snapshot_realname(objset_t *os, char *name, char *real, + int maxlen, boolean_t *conflict); +extern int dmu_dir_list_next(objset_t *os, int namelen, char *name, + uint64_t *idp, uint64_t *offp); +extern void dmu_objset_set_user(objset_t *os, void *user_ptr); +extern void *dmu_objset_get_user(objset_t *os); + +/* + * Return the txg number for the given assigned transaction. + */ +uint64_t dmu_tx_get_txg(dmu_tx_t *tx); + +/* + * Synchronous write. + * If a parent zio is provided this function initiates a write on the + * provided buffer as a child of the parent zio. + * In the absence of a parent zio, the write is completed synchronously. + * At write completion, blk is filled with the bp of the written block. + * Note that while the data covered by this function will be on stable + * storage when the write completes this new data does not become a + * permanent part of the file until the associated transaction commits. + */ +typedef void dmu_sync_cb_t(dmu_buf_t *db, void *arg); +int dmu_sync(struct zio *zio, dmu_buf_t *db, + struct blkptr *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg); + +/* + * Find the next hole or data block in file starting at *off + * Return found offset in *off. Return ESRCH for end of file. + */ +int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, + uint64_t *off); + +/* + * Initial setup and final teardown. + */ +extern void dmu_init(void); +extern void dmu_fini(void); + +typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp, + uint64_t object, uint64_t offset, int len); +void dmu_traverse_objset(objset_t *os, uint64_t txg_start, + dmu_traverse_cb_t cb, void *arg); + +int dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, + struct vnode *vp, offset_t *off); + +typedef struct dmu_recv_cookie { + /* + * This structure is opaque! + * + * If logical and real are different, we are recving the stream + * into the "real" temporary clone, and then switching it with + * the "logical" target. + */ + struct dsl_dataset *drc_logical_ds; + struct dsl_dataset *drc_real_ds; + struct drr_begin *drc_drrb; + char *drc_tosnap; + boolean_t drc_newfs; + boolean_t drc_force; +} dmu_recv_cookie_t; + +int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *, + boolean_t force, objset_t *origin, boolean_t online, dmu_recv_cookie_t *); +int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp); +int dmu_recv_end(dmu_recv_cookie_t *drc); +void dmu_recv_abort_cleanup(dmu_recv_cookie_t *drc); + +/* CRC64 table */ +#define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */ +extern uint64_t zfs_crc64_table[256]; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DMU_H */ diff --git a/module/zfs/include/sys/dmu_impl.h b/module/zfs/include/sys/dmu_impl.h new file mode 100644 index 000000000..96ce688e1 --- /dev/null +++ b/module/zfs/include/sys/dmu_impl.h @@ -0,0 +1,239 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DMU_IMPL_H +#define _SYS_DMU_IMPL_H + +#include <sys/txg_impl.h> +#include <sys/zio.h> +#include <sys/dnode.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * This is the locking strategy for the DMU. Numbers in parenthesis are + * cases that use that lock order, referenced below: + * + * ARC is self-contained + * bplist is self-contained + * refcount is self-contained + * txg is self-contained (hopefully!) + * zst_lock + * zf_rwlock + * + * XXX try to improve evicting path? + * + * dp_config_rwlock > os_obj_lock > dn_struct_rwlock > + * dn_dbufs_mtx > hash_mutexes > db_mtx > dd_lock > leafs + * + * dp_config_rwlock + * must be held before: everything + * protects dd namespace changes + * protects property changes globally + * held from: + * dsl_dir_open/r: + * dsl_dir_create_sync/w: + * dsl_dir_sync_destroy/w: + * dsl_dir_rename_sync/w: + * dsl_prop_changed_notify/r: + * + * os_obj_lock + * must be held before: + * everything except dp_config_rwlock + * protects os_obj_next + * held from: + * dmu_object_alloc: dn_dbufs_mtx, db_mtx, hash_mutexes, dn_struct_rwlock + * + * dn_struct_rwlock + * must be held before: + * everything except dp_config_rwlock and os_obj_lock + * protects structure of dnode (eg. nlevels) + * db_blkptr can change when syncing out change to nlevels + * dn_maxblkid + * dn_nlevels + * dn_*blksz* + * phys nlevels, maxblkid, physical blkptr_t's (?) + * held from: + * callers of dbuf_read_impl, dbuf_hold[_impl], dbuf_prefetch + * dmu_object_info_from_dnode: dn_dirty_mtx (dn_datablksz) + * dmu_tx_count_free: + * dbuf_read_impl: db_mtx, dmu_zfetch() + * dmu_zfetch: zf_rwlock/r, zst_lock, dbuf_prefetch() + * dbuf_new_size: db_mtx + * dbuf_dirty: db_mtx + * dbuf_findbp: (callers, phys? - the real need) + * dbuf_create: dn_dbufs_mtx, hash_mutexes, db_mtx (phys?) + * dbuf_prefetch: dn_dirty_mtx, hash_mutexes, db_mtx, dn_dbufs_mtx + * dbuf_hold_impl: hash_mutexes, db_mtx, dn_dbufs_mtx, dbuf_findbp() + * dnode_sync/w (increase_indirection): db_mtx (phys) + * dnode_set_blksz/w: dn_dbufs_mtx (dn_*blksz*) + * dnode_new_blkid/w: (dn_maxblkid) + * dnode_free_range/w: dn_dirty_mtx (dn_maxblkid) + * dnode_next_offset: (phys) + * + * dn_dbufs_mtx + * must be held before: + * db_mtx, hash_mutexes + * protects: + * dn_dbufs + * dn_evicted + * held from: + * dmu_evict_user: db_mtx (dn_dbufs) + * dbuf_free_range: db_mtx (dn_dbufs) + * dbuf_remove_ref: db_mtx, callees: + * dbuf_hash_remove: hash_mutexes, db_mtx + * dbuf_create: hash_mutexes, db_mtx (dn_dbufs) + * dnode_set_blksz: (dn_dbufs) + * + * hash_mutexes (global) + * must be held before: + * db_mtx + * protects dbuf_hash_table (global) and db_hash_next + * held from: + * dbuf_find: db_mtx + * dbuf_hash_insert: db_mtx + * dbuf_hash_remove: db_mtx + * + * db_mtx (meta-leaf) + * must be held before: + * dn_mtx, dn_dirty_mtx, dd_lock (leaf mutexes) + * protects: + * db_state + * db_holds + * db_buf + * db_changed + * db_data_pending + * db_dirtied + * db_link + * db_dirty_node (??) + * db_dirtycnt + * db_d.* + * db.* + * held from: + * dbuf_dirty: dn_mtx, dn_dirty_mtx + * dbuf_dirty->dsl_dir_willuse_space: dd_lock + * dbuf_dirty->dbuf_new_block->dsl_dataset_block_freeable: dd_lock + * dbuf_undirty: dn_dirty_mtx (db_d) + * dbuf_write_done: dn_dirty_mtx (db_state) + * dbuf_* + * dmu_buf_update_user: none (db_d) + * dmu_evict_user: none (db_d) (maybe can eliminate) + * dbuf_find: none (db_holds) + * dbuf_hash_insert: none (db_holds) + * dmu_buf_read_array_impl: none (db_state, db_changed) + * dmu_sync: none (db_dirty_node, db_d) + * dnode_reallocate: none (db) + * + * dn_mtx (leaf) + * protects: + * dn_dirty_dbufs + * dn_ranges + * phys accounting + * dn_allocated_txg + * dn_free_txg + * dn_assigned_txg + * dd_assigned_tx + * dn_notxholds + * dn_dirtyctx + * dn_dirtyctx_firstset + * (dn_phys copy fields?) + * (dn_phys contents?) + * held from: + * dnode_* + * dbuf_dirty: none + * dbuf_sync: none (phys accounting) + * dbuf_undirty: none (dn_ranges, dn_dirty_dbufs) + * dbuf_write_done: none (phys accounting) + * dmu_object_info_from_dnode: none (accounting) + * dmu_tx_commit: none + * dmu_tx_hold_object_impl: none + * dmu_tx_try_assign: dn_notxholds(cv) + * dmu_tx_unassign: none + * + * dd_lock + * must be held before: + * ds_lock + * ancestors' dd_lock + * protects: + * dd_prop_cbs + * dd_sync_* + * dd_used_bytes + * dd_tempreserved + * dd_space_towrite + * dd_myname + * dd_phys accounting? + * held from: + * dsl_dir_* + * dsl_prop_changed_notify: none (dd_prop_cbs) + * dsl_prop_register: none (dd_prop_cbs) + * dsl_prop_unregister: none (dd_prop_cbs) + * dsl_dataset_block_freeable: none (dd_sync_*) + * + * os_lock (leaf) + * protects: + * os_dirty_dnodes + * os_free_dnodes + * os_dnodes + * os_downgraded_dbufs + * dn_dirtyblksz + * dn_dirty_link + * held from: + * dnode_create: none (os_dnodes) + * dnode_destroy: none (os_dnodes) + * dnode_setdirty: none (dn_dirtyblksz, os_*_dnodes) + * dnode_free: none (dn_dirtyblksz, os_*_dnodes) + * + * ds_lock + * protects: + * ds_user_ptr + * ds_user_evice_func + * ds_open_refcount + * ds_snapname + * ds_phys accounting + * ds_reserved + * held from: + * dsl_dataset_* + * + * dr_mtx (leaf) + * protects: + * dr_children + * held from: + * dbuf_dirty + * dbuf_undirty + * dbuf_sync_indirect + * dnode_new_blkid + */ + +struct objset; +struct dmu_pool; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DMU_IMPL_H */ diff --git a/module/zfs/include/sys/dmu_objset.h b/module/zfs/include/sys/dmu_objset.h new file mode 100644 index 000000000..15df29a17 --- /dev/null +++ b/module/zfs/include/sys/dmu_objset.h @@ -0,0 +1,136 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DMU_OBJSET_H +#define _SYS_DMU_OBJSET_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/spa.h> +#include <sys/arc.h> +#include <sys/txg.h> +#include <sys/zfs_context.h> +#include <sys/dnode.h> +#include <sys/zio.h> +#include <sys/zil.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct dsl_dataset; +struct dmu_tx; +struct objset_impl; + +typedef struct objset_phys { + dnode_phys_t os_meta_dnode; + zil_header_t os_zil_header; + uint64_t os_type; + char os_pad[1024 - sizeof (dnode_phys_t) - sizeof (zil_header_t) - + sizeof (uint64_t)]; +} objset_phys_t; + +struct objset { + struct objset_impl *os; + int os_mode; +}; + +typedef struct objset_impl { + /* Immutable: */ + struct dsl_dataset *os_dsl_dataset; + spa_t *os_spa; + arc_buf_t *os_phys_buf; + objset_phys_t *os_phys; + dnode_t *os_meta_dnode; + zilog_t *os_zil; + objset_t os; + uint8_t os_checksum; /* can change, under dsl_dir's locks */ + uint8_t os_compress; /* can change, under dsl_dir's locks */ + uint8_t os_copies; /* can change, under dsl_dir's locks */ + uint8_t os_primary_cache; /* can change, under dsl_dir's locks */ + uint8_t os_secondary_cache; /* can change, under dsl_dir's locks */ + + /* no lock needed: */ + struct dmu_tx *os_synctx; /* XXX sketchy */ + blkptr_t *os_rootbp; + zil_header_t os_zil_header; + + /* Protected by os_obj_lock */ + kmutex_t os_obj_lock; + uint64_t os_obj_next; + + /* Protected by os_lock */ + kmutex_t os_lock; + list_t os_dirty_dnodes[TXG_SIZE]; + list_t os_free_dnodes[TXG_SIZE]; + list_t os_dnodes; + list_t os_downgraded_dbufs; + + /* stuff we store for the user */ + kmutex_t os_user_ptr_lock; + void *os_user_ptr; +} objset_impl_t; + +#define DMU_META_DNODE_OBJECT 0 + +#define DMU_OS_IS_L2CACHEABLE(os) \ + ((os)->os_secondary_cache == ZFS_CACHE_ALL || \ + (os)->os_secondary_cache == ZFS_CACHE_METADATA) + +/* called from zpl */ +int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, + objset_t **osp); +void dmu_objset_close(objset_t *os); +int dmu_objset_create(const char *name, dmu_objset_type_t type, + objset_t *clone_parent, uint64_t flags, + void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg); +int dmu_objset_destroy(const char *name); +int dmu_objset_rollback(objset_t *os); +int dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive); +void dmu_objset_stats(objset_t *os, nvlist_t *nv); +void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); +void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, + uint64_t *usedobjsp, uint64_t *availobjsp); +uint64_t dmu_objset_fsid_guid(objset_t *os); +int dmu_objset_find(char *name, int func(char *, void *), void *arg, + int flags); +int dmu_objset_find_spa(spa_t *spa, const char *name, + int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags); +void dmu_objset_byteswap(void *buf, size_t size); +int dmu_objset_evict_dbufs(objset_t *os); + +/* called from dsl */ +void dmu_objset_sync(objset_impl_t *os, zio_t *zio, dmu_tx_t *tx); +objset_impl_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds, + blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx); +int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp, + objset_impl_t **osip); +void dmu_objset_evict(struct dsl_dataset *ds, void *arg); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DMU_OBJSET_H */ diff --git a/module/zfs/include/sys/dmu_traverse.h b/module/zfs/include/sys/dmu_traverse.h new file mode 100644 index 000000000..3e0268911 --- /dev/null +++ b/module/zfs/include/sys/dmu_traverse.h @@ -0,0 +1,57 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DMU_TRAVERSE_H +#define _SYS_DMU_TRAVERSE_H + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/zio.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct dnode_phys; +struct dsl_dataset; + +typedef int (blkptr_cb_t)(spa_t *spa, blkptr_t *bp, + const zbookmark_t *zb, const struct dnode_phys *dnp, void *arg); + +#define TRAVERSE_PRE (1<<0) +#define TRAVERSE_POST (1<<1) +#define TRAVERSE_PREFETCH_METADATA (1<<2) +#define TRAVERSE_PREFETCH_DATA (1<<3) +#define TRAVERSE_PREFETCH (TRAVERSE_PREFETCH_METADATA | TRAVERSE_PREFETCH_DATA) + +int traverse_dataset(struct dsl_dataset *ds, uint64_t txg_start, + int flags, blkptr_cb_t func, void *arg); +int traverse_pool(spa_t *spa, blkptr_cb_t func, void *arg); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DMU_TRAVERSE_H */ diff --git a/module/zfs/include/sys/dmu_tx.h b/module/zfs/include/sys/dmu_tx.h new file mode 100644 index 000000000..2727daaaa --- /dev/null +++ b/module/zfs/include/sys/dmu_tx.h @@ -0,0 +1,139 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DMU_TX_H +#define _SYS_DMU_TX_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/inttypes.h> +#include <sys/dmu.h> +#include <sys/txg.h> +#include <sys/refcount.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct dmu_buf_impl; +struct dmu_tx_hold; +struct dnode_link; +struct dsl_pool; +struct dnode; +struct dsl_dir; + +struct dmu_tx { + /* + * No synchronization is needed because a tx can only be handled + * by one thread. + */ + list_t tx_holds; /* list of dmu_tx_hold_t */ + objset_t *tx_objset; + struct dsl_dir *tx_dir; + struct dsl_pool *tx_pool; + uint64_t tx_txg; + uint64_t tx_lastsnap_txg; + uint64_t tx_lasttried_txg; + txg_handle_t tx_txgh; + void *tx_tempreserve_cookie; + struct dmu_tx_hold *tx_needassign_txh; + uint8_t tx_anyobj; + int tx_err; +#ifdef ZFS_DEBUG + uint64_t tx_space_towrite; + uint64_t tx_space_tofree; + uint64_t tx_space_tooverwrite; + uint64_t tx_space_tounref; + refcount_t tx_space_written; + refcount_t tx_space_freed; +#endif +}; + +enum dmu_tx_hold_type { + THT_NEWOBJECT, + THT_WRITE, + THT_BONUS, + THT_FREE, + THT_ZAP, + THT_SPACE, + THT_NUMTYPES +}; + +typedef struct dmu_tx_hold { + dmu_tx_t *txh_tx; + list_node_t txh_node; + struct dnode *txh_dnode; + uint64_t txh_space_towrite; + uint64_t txh_space_tofree; + uint64_t txh_space_tooverwrite; + uint64_t txh_space_tounref; + uint64_t txh_memory_tohold; + uint64_t txh_fudge; +#ifdef ZFS_DEBUG + enum dmu_tx_hold_type txh_type; + uint64_t txh_arg1; + uint64_t txh_arg2; +#endif +} dmu_tx_hold_t; + + +/* + * These routines are defined in dmu.h, and are called by the user. + */ +dmu_tx_t *dmu_tx_create(objset_t *dd); +int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); +void dmu_tx_commit(dmu_tx_t *tx); +void dmu_tx_abort(dmu_tx_t *tx); +uint64_t dmu_tx_get_txg(dmu_tx_t *tx); +void dmu_tx_wait(dmu_tx_t *tx); + +/* + * These routines are defined in dmu_spa.h, and are called by the SPA. + */ +extern dmu_tx_t *dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg); + +/* + * These routines are only called by the DMU. + */ +dmu_tx_t *dmu_tx_create_dd(dsl_dir_t *dd); +int dmu_tx_is_syncing(dmu_tx_t *tx); +int dmu_tx_private_ok(dmu_tx_t *tx); +void dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object); +void dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta); +void dmu_tx_dirty_buf(dmu_tx_t *tx, struct dmu_buf_impl *db); +int dmu_tx_holds(dmu_tx_t *tx, uint64_t object); +void dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space); + +#ifdef ZFS_DEBUG +#define DMU_TX_DIRTY_BUF(tx, db) dmu_tx_dirty_buf(tx, db) +#else +#define DMU_TX_DIRTY_BUF(tx, db) +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DMU_TX_H */ diff --git a/module/zfs/include/sys/dmu_zfetch.h b/module/zfs/include/sys/dmu_zfetch.h new file mode 100644 index 000000000..c94bced93 --- /dev/null +++ b/module/zfs/include/sys/dmu_zfetch.h @@ -0,0 +1,75 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _DFETCH_H +#define _DFETCH_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern uint64_t zfetch_array_rd_sz; + +struct dnode; /* so we can reference dnode */ + +typedef enum zfetch_dirn { + ZFETCH_FORWARD = 1, /* prefetch increasing block numbers */ + ZFETCH_BACKWARD = -1 /* prefetch decreasing block numbers */ +} zfetch_dirn_t; + +typedef struct zstream { + uint64_t zst_offset; /* offset of starting block in range */ + uint64_t zst_len; /* length of range, in blocks */ + zfetch_dirn_t zst_direction; /* direction of prefetch */ + uint64_t zst_stride; /* length of stride, in blocks */ + uint64_t zst_ph_offset; /* prefetch offset, in blocks */ + uint64_t zst_cap; /* prefetch limit (cap), in blocks */ + kmutex_t zst_lock; /* protects stream */ + clock_t zst_last; /* lbolt of last prefetch */ + avl_node_t zst_node; /* embed avl node here */ +} zstream_t; + +typedef struct zfetch { + krwlock_t zf_rwlock; /* protects zfetch structure */ + list_t zf_stream; /* AVL tree of zstream_t's */ + struct dnode *zf_dnode; /* dnode that owns this zfetch */ + uint32_t zf_stream_cnt; /* # of active streams */ + uint64_t zf_alloc_fail; /* # of failed attempts to alloc strm */ +} zfetch_t; + +void dmu_zfetch_init(zfetch_t *, struct dnode *); +void dmu_zfetch_rele(zfetch_t *); +void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, int); + + +#ifdef __cplusplus +} +#endif + +#endif /* _DFETCH_H */ diff --git a/module/zfs/include/sys/dnode.h b/module/zfs/include/sys/dnode.h new file mode 100644 index 000000000..c79ff48a6 --- /dev/null +++ b/module/zfs/include/sys/dnode.h @@ -0,0 +1,275 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DNODE_H +#define _SYS_DNODE_H + +#include <sys/zfs_context.h> +#include <sys/avl.h> +#include <sys/spa.h> +#include <sys/txg.h> +#include <sys/zio.h> +#include <sys/refcount.h> +#include <sys/dmu_zfetch.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * dnode_hold() flags. + */ +#define DNODE_MUST_BE_ALLOCATED 1 +#define DNODE_MUST_BE_FREE 2 + +/* + * dnode_next_offset() flags. + */ +#define DNODE_FIND_HOLE 1 +#define DNODE_FIND_BACKWARDS 2 +#define DNODE_FIND_HAVELOCK 4 + +/* + * Fixed constants. + */ +#define DNODE_SHIFT 9 /* 512 bytes */ +#define DN_MIN_INDBLKSHIFT 10 /* 1k */ +#define DN_MAX_INDBLKSHIFT 14 /* 16k */ +#define DNODE_BLOCK_SHIFT 14 /* 16k */ +#define DNODE_CORE_SIZE 64 /* 64 bytes for dnode sans blkptrs */ +#define DN_MAX_OBJECT_SHIFT 48 /* 256 trillion (zfs_fid_t limit) */ +#define DN_MAX_OFFSET_SHIFT 64 /* 2^64 bytes in a dnode */ + +/* + * Derived constants. + */ +#define DNODE_SIZE (1 << DNODE_SHIFT) +#define DN_MAX_NBLKPTR ((DNODE_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT) +#define DN_MAX_BONUSLEN (DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT)) +#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT) +#define DN_ZERO_BONUSLEN (DN_MAX_BONUSLEN + 1) + +#define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT) +#define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT) +#define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT) + +/* The +2 here is a cheesy way to round up */ +#define DN_MAX_LEVELS (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \ + (DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT))) + +#define DN_BONUS(dnp) ((void*)((dnp)->dn_bonus + \ + (((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t)))) + +#define DN_USED_BYTES(dnp) (((dnp)->dn_flags & DNODE_FLAG_USED_BYTES) ? \ + (dnp)->dn_used : (dnp)->dn_used << SPA_MINBLOCKSHIFT) + +#define EPB(blkshift, typeshift) (1 << (blkshift - typeshift)) + +struct dmu_buf_impl; +struct objset_impl; +struct zio; + +enum dnode_dirtycontext { + DN_UNDIRTIED, + DN_DIRTY_OPEN, + DN_DIRTY_SYNC +}; + +/* Is dn_used in bytes? if not, it's in multiples of SPA_MINBLOCKSIZE */ +#define DNODE_FLAG_USED_BYTES (1<<0) + +typedef struct dnode_phys { + uint8_t dn_type; /* dmu_object_type_t */ + uint8_t dn_indblkshift; /* ln2(indirect block size) */ + uint8_t dn_nlevels; /* 1=dn_blkptr->data blocks */ + uint8_t dn_nblkptr; /* length of dn_blkptr */ + uint8_t dn_bonustype; /* type of data in bonus buffer */ + uint8_t dn_checksum; /* ZIO_CHECKSUM type */ + uint8_t dn_compress; /* ZIO_COMPRESS type */ + uint8_t dn_flags; /* DNODE_FLAG_* */ + uint16_t dn_datablkszsec; /* data block size in 512b sectors */ + uint16_t dn_bonuslen; /* length of dn_bonus */ + uint8_t dn_pad2[4]; + + /* accounting is protected by dn_dirty_mtx */ + uint64_t dn_maxblkid; /* largest allocated block ID */ + uint64_t dn_used; /* bytes (or sectors) of disk space */ + + uint64_t dn_pad3[4]; + + blkptr_t dn_blkptr[1]; + uint8_t dn_bonus[DN_MAX_BONUSLEN]; +} dnode_phys_t; + +typedef struct dnode { + /* + * dn_struct_rwlock protects the structure of the dnode, + * including the number of levels of indirection (dn_nlevels), + * dn_maxblkid, and dn_next_* + */ + krwlock_t dn_struct_rwlock; + + /* + * Our link on dataset's dd_dnodes list. + * Protected by dd_accounting_mtx. + */ + list_node_t dn_link; + + /* immutable: */ + struct objset_impl *dn_objset; + uint64_t dn_object; + struct dmu_buf_impl *dn_dbuf; + dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */ + + /* + * Copies of stuff in dn_phys. They're valid in the open + * context (eg. even before the dnode is first synced). + * Where necessary, these are protected by dn_struct_rwlock. + */ + dmu_object_type_t dn_type; /* object type */ + uint16_t dn_bonuslen; /* bonus length */ + uint8_t dn_bonustype; /* bonus type */ + uint8_t dn_nblkptr; /* number of blkptrs (immutable) */ + uint8_t dn_checksum; /* ZIO_CHECKSUM type */ + uint8_t dn_compress; /* ZIO_COMPRESS type */ + uint8_t dn_nlevels; + uint8_t dn_indblkshift; + uint8_t dn_datablkshift; /* zero if blksz not power of 2! */ + uint16_t dn_datablkszsec; /* in 512b sectors */ + uint32_t dn_datablksz; /* in bytes */ + uint64_t dn_maxblkid; + uint8_t dn_next_nlevels[TXG_SIZE]; + uint8_t dn_next_indblkshift[TXG_SIZE]; + uint16_t dn_next_bonuslen[TXG_SIZE]; + uint32_t dn_next_blksz[TXG_SIZE]; /* next block size in bytes */ + + /* protected by os_lock: */ + list_node_t dn_dirty_link[TXG_SIZE]; /* next on dataset's dirty */ + + /* protected by dn_mtx: */ + kmutex_t dn_mtx; + list_t dn_dirty_records[TXG_SIZE]; + avl_tree_t dn_ranges[TXG_SIZE]; + uint64_t dn_allocated_txg; + uint64_t dn_free_txg; + uint64_t dn_assigned_txg; + kcondvar_t dn_notxholds; + enum dnode_dirtycontext dn_dirtyctx; + uint8_t *dn_dirtyctx_firstset; /* dbg: contents meaningless */ + + /* protected by own devices */ + refcount_t dn_tx_holds; + refcount_t dn_holds; + + kmutex_t dn_dbufs_mtx; + list_t dn_dbufs; /* linked list of descendent dbuf_t's */ + struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */ + + /* parent IO for current sync write */ + zio_t *dn_zio; + + /* holds prefetch structure */ + struct zfetch dn_zfetch; +} dnode_t; + +typedef struct free_range { + avl_node_t fr_node; + uint64_t fr_blkid; + uint64_t fr_nblks; +} free_range_t; + +dnode_t *dnode_special_open(struct objset_impl *dd, dnode_phys_t *dnp, + uint64_t object); +void dnode_special_close(dnode_t *dn); + +void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx); +int dnode_hold(struct objset_impl *dd, uint64_t object, + void *ref, dnode_t **dnp); +int dnode_hold_impl(struct objset_impl *dd, uint64_t object, int flag, + void *ref, dnode_t **dnp); +boolean_t dnode_add_ref(dnode_t *dn, void *ref); +void dnode_rele(dnode_t *dn, void *ref); +void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx); +void dnode_sync(dnode_t *dn, dmu_tx_t *tx); +void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); +void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); +void dnode_free(dnode_t *dn, dmu_tx_t *tx); +void dnode_byteswap(dnode_phys_t *dnp); +void dnode_buf_byteswap(void *buf, size_t size); +void dnode_verify(dnode_t *dn); +int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx); +uint64_t dnode_current_max_length(dnode_t *dn); +void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx); +void dnode_clear_range(dnode_t *dn, uint64_t blkid, + uint64_t nblks, dmu_tx_t *tx); +void dnode_diduse_space(dnode_t *dn, int64_t space); +void dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx); +void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t); +uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid); +void dnode_init(void); +void dnode_fini(void); +int dnode_next_offset(dnode_t *dn, int flags, uint64_t *off, + int minlvl, uint64_t blkfill, uint64_t txg); +void dnode_evict_dbufs(dnode_t *dn); + +#ifdef ZFS_DEBUG + +/* + * There should be a ## between the string literal and fmt, to make it + * clear that we're joining two strings together, but that piece of shit + * gcc doesn't support that preprocessor token. + */ +#define dprintf_dnode(dn, fmt, ...) do { \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ + char __db_buf[32]; \ + uint64_t __db_obj = (dn)->dn_object; \ + if (__db_obj == DMU_META_DNODE_OBJECT) \ + (void) strcpy(__db_buf, "mdn"); \ + else \ + (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \ + (u_longlong_t)__db_obj);\ + dprintf_ds((dn)->dn_objset->os_dsl_dataset, "obj=%s " fmt, \ + __db_buf, __VA_ARGS__); \ + } \ +_NOTE(CONSTCOND) } while (0) + +#define DNODE_VERIFY(dn) dnode_verify(dn) +#define FREE_VERIFY(db, start, end, tx) free_verify(db, start, end, tx) + +#else + +#define dprintf_dnode(db, fmt, ...) +#define DNODE_VERIFY(dn) +#define FREE_VERIFY(db, start, end, tx) + +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DNODE_H */ diff --git a/module/zfs/include/sys/dsl_dataset.h b/module/zfs/include/sys/dsl_dataset.h new file mode 100644 index 000000000..8665aec2d --- /dev/null +++ b/module/zfs/include/sys/dsl_dataset.h @@ -0,0 +1,239 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DSL_DATASET_H +#define _SYS_DSL_DATASET_H + +#include <sys/dmu.h> +#include <sys/spa.h> +#include <sys/txg.h> +#include <sys/zio.h> +#include <sys/bplist.h> +#include <sys/dsl_synctask.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct dsl_dataset; +struct dsl_dir; +struct dsl_pool; + +typedef void dsl_dataset_evict_func_t(struct dsl_dataset *, void *); + +#define DS_FLAG_INCONSISTENT (1ULL<<0) +#define DS_IS_INCONSISTENT(ds) \ + ((ds)->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) +/* + * NB: nopromote can not yet be set, but we want support for it in this + * on-disk version, so that we don't need to upgrade for it later. It + * will be needed when we implement 'zfs split' (where the split off + * clone should not be promoted). + */ +#define DS_FLAG_NOPROMOTE (1ULL<<1) + +/* + * DS_FLAG_UNIQUE_ACCURATE is set if ds_unique_bytes has been correctly + * calculated for head datasets (starting with SPA_VERSION_UNIQUE_ACCURATE, + * refquota/refreservations). + */ +#define DS_FLAG_UNIQUE_ACCURATE (1ULL<<2) + +/* + * DS_FLAG_CI_DATASET is set if the dataset contains a file system whose + * name lookups should be performed case-insensitively. + */ +#define DS_FLAG_CI_DATASET (1ULL<<16) + +typedef struct dsl_dataset_phys { + uint64_t ds_dir_obj; /* DMU_OT_DSL_DIR */ + uint64_t ds_prev_snap_obj; /* DMU_OT_DSL_DATASET */ + uint64_t ds_prev_snap_txg; + uint64_t ds_next_snap_obj; /* DMU_OT_DSL_DATASET */ + uint64_t ds_snapnames_zapobj; /* DMU_OT_DSL_DS_SNAP_MAP 0 for snaps */ + uint64_t ds_num_children; /* clone/snap children; ==0 for head */ + uint64_t ds_creation_time; /* seconds since 1970 */ + uint64_t ds_creation_txg; + uint64_t ds_deadlist_obj; /* DMU_OT_BPLIST */ + uint64_t ds_used_bytes; + uint64_t ds_compressed_bytes; + uint64_t ds_uncompressed_bytes; + uint64_t ds_unique_bytes; /* only relevant to snapshots */ + /* + * The ds_fsid_guid is a 56-bit ID that can change to avoid + * collisions. The ds_guid is a 64-bit ID that will never + * change, so there is a small probability that it will collide. + */ + uint64_t ds_fsid_guid; + uint64_t ds_guid; + uint64_t ds_flags; /* DS_FLAG_* */ + blkptr_t ds_bp; + uint64_t ds_next_clones_obj; /* DMU_OT_DSL_CLONES */ + uint64_t ds_props_obj; /* DMU_OT_DSL_PROPS for snaps */ + uint64_t ds_pad[6]; /* pad out to 320 bytes for good measure */ +} dsl_dataset_phys_t; + +typedef struct dsl_dataset { + /* Immutable: */ + struct dsl_dir *ds_dir; + dsl_dataset_phys_t *ds_phys; + dmu_buf_t *ds_dbuf; + uint64_t ds_object; + uint64_t ds_fsid_guid; + + /* only used in syncing context, only valid for non-snapshots: */ + struct dsl_dataset *ds_prev; + uint64_t ds_origin_txg; + + /* has internal locking: */ + bplist_t ds_deadlist; + + /* protected by lock on pool's dp_dirty_datasets list */ + txg_node_t ds_dirty_link; + list_node_t ds_synced_link; + + /* + * ds_phys->ds_<accounting> is also protected by ds_lock. + * Protected by ds_lock: + */ + kmutex_t ds_lock; + void *ds_user_ptr; + dsl_dataset_evict_func_t *ds_user_evict_func; + + /* + * ds_owner is protected by the ds_rwlock and the ds_lock + */ + krwlock_t ds_rwlock; + kcondvar_t ds_exclusive_cv; + void *ds_owner; + + /* no locking; only for making guesses */ + uint64_t ds_trysnap_txg; + + /* for objset_open() */ + kmutex_t ds_opening_lock; + + uint64_t ds_reserved; /* cached refreservation */ + uint64_t ds_quota; /* cached refquota */ + + /* Protected by ds_lock; keep at end of struct for better locality */ + char ds_snapname[MAXNAMELEN]; +} dsl_dataset_t; + +#define dsl_dataset_is_snapshot(ds) \ + ((ds)->ds_phys->ds_num_children != 0) + +#define DS_UNIQUE_IS_ACCURATE(ds) \ + (((ds)->ds_phys->ds_flags & DS_FLAG_UNIQUE_ACCURATE) != 0) + +int dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp); +int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj, + void *tag, dsl_dataset_t **); +int dsl_dataset_own(const char *name, int flags, void *owner, + dsl_dataset_t **dsp); +int dsl_dataset_own_obj(struct dsl_pool *dp, uint64_t dsobj, + int flags, void *owner, dsl_dataset_t **); +void dsl_dataset_name(dsl_dataset_t *ds, char *name); +void dsl_dataset_rele(dsl_dataset_t *ds, void *tag); +void dsl_dataset_disown(dsl_dataset_t *ds, void *owner); +void dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag); +boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, + void *owner); +void dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner); +uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname, + dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *); +uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, + uint64_t flags, dmu_tx_t *tx); +int dsl_dataset_destroy(dsl_dataset_t *ds, void *tag); +int dsl_snapshots_destroy(char *fsname, char *snapname); +dsl_checkfunc_t dsl_dataset_destroy_check; +dsl_syncfunc_t dsl_dataset_destroy_sync; +dsl_checkfunc_t dsl_dataset_snapshot_check; +dsl_syncfunc_t dsl_dataset_snapshot_sync; +int dsl_dataset_rollback(dsl_dataset_t *ds, dmu_objset_type_t ost); +int dsl_dataset_rename(char *name, const char *newname, boolean_t recursive); +int dsl_dataset_promote(const char *name); +int dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, + boolean_t force); + +void *dsl_dataset_set_user_ptr(dsl_dataset_t *ds, + void *p, dsl_dataset_evict_func_t func); +void *dsl_dataset_get_user_ptr(dsl_dataset_t *ds); + +blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds); +void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx); + +spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds); + +boolean_t dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds); + +void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx); + +void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx); +int dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, + dmu_tx_t *tx); +int dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth); +uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds); + +void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx); +void dsl_dataset_stats(dsl_dataset_t *os, nvlist_t *nv); +void dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat); +void dsl_dataset_space(dsl_dataset_t *ds, + uint64_t *refdbytesp, uint64_t *availbytesp, + uint64_t *usedobjsp, uint64_t *availobjsp); +uint64_t dsl_dataset_fsid_guid(dsl_dataset_t *ds); + +int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf); + +int dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, + uint64_t asize, uint64_t inflight, uint64_t *used, + uint64_t *ref_rsrv); +int dsl_dataset_set_quota(const char *dsname, uint64_t quota); +void dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr, + dmu_tx_t *tx); +int dsl_dataset_set_reservation(const char *dsname, uint64_t reservation); +void dsl_dataset_set_flags(dsl_dataset_t *ds, uint64_t flags); +int64_t dsl_dataset_new_refreservation(dsl_dataset_t *ds, uint64_t reservation, + dmu_tx_t *tx); + +#ifdef ZFS_DEBUG +#define dprintf_ds(ds, fmt, ...) do { \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ + char *__ds_name = kmem_alloc(MAXNAMELEN, KM_SLEEP); \ + dsl_dataset_name(ds, __ds_name); \ + dprintf("ds=%s " fmt, __ds_name, __VA_ARGS__); \ + kmem_free(__ds_name, MAXNAMELEN); \ + } \ +_NOTE(CONSTCOND) } while (0) +#else +#define dprintf_ds(dd, fmt, ...) +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_DATASET_H */ diff --git a/module/zfs/include/sys/dsl_deleg.h b/module/zfs/include/sys/dsl_deleg.h new file mode 100644 index 000000000..a29e44e67 --- /dev/null +++ b/module/zfs/include/sys/dsl_deleg.h @@ -0,0 +1,73 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DSL_DELEG_H +#define _SYS_DSL_DELEG_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/dmu.h> +#include <sys/dsl_pool.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZFS_DELEG_PERM_NONE "" +#define ZFS_DELEG_PERM_CREATE "create" +#define ZFS_DELEG_PERM_DESTROY "destroy" +#define ZFS_DELEG_PERM_SNAPSHOT "snapshot" +#define ZFS_DELEG_PERM_ROLLBACK "rollback" +#define ZFS_DELEG_PERM_CLONE "clone" +#define ZFS_DELEG_PERM_PROMOTE "promote" +#define ZFS_DELEG_PERM_RENAME "rename" +#define ZFS_DELEG_PERM_MOUNT "mount" +#define ZFS_DELEG_PERM_SHARE "share" +#define ZFS_DELEG_PERM_SEND "send" +#define ZFS_DELEG_PERM_RECEIVE "receive" +#define ZFS_DELEG_PERM_ALLOW "allow" +#define ZFS_DELEG_PERM_USERPROP "userprop" +#define ZFS_DELEG_PERM_VSCAN "vscan" + +/* + * Note: the names of properties that are marked delegatable are also + * valid delegated permissions + */ + +int dsl_deleg_get(const char *ddname, nvlist_t **nvp); +int dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset); +int dsl_deleg_access(const char *ddname, const char *perm, cred_t *cr); +void dsl_deleg_set_create_perms(dsl_dir_t *dd, dmu_tx_t *tx, cred_t *cr); +int dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr); +int dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr); +int dsl_deleg_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx); +boolean_t dsl_delegation_on(objset_t *os); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_DELEG_H */ diff --git a/module/zfs/include/sys/dsl_dir.h b/module/zfs/include/sys/dsl_dir.h new file mode 100644 index 000000000..86b9636ce --- /dev/null +++ b/module/zfs/include/sys/dsl_dir.h @@ -0,0 +1,160 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DSL_DIR_H +#define _SYS_DSL_DIR_H + +#include <sys/dmu.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_synctask.h> +#include <sys/refcount.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct dsl_dataset; + +typedef enum dd_used { + DD_USED_HEAD, + DD_USED_SNAP, + DD_USED_CHILD, + DD_USED_CHILD_RSRV, + DD_USED_REFRSRV, + DD_USED_NUM +} dd_used_t; + +#define DD_FLAG_USED_BREAKDOWN (1<<0) + +typedef struct dsl_dir_phys { + uint64_t dd_creation_time; /* not actually used */ + uint64_t dd_head_dataset_obj; + uint64_t dd_parent_obj; + uint64_t dd_origin_obj; + uint64_t dd_child_dir_zapobj; + /* + * how much space our children are accounting for; for leaf + * datasets, == physical space used by fs + snaps + */ + uint64_t dd_used_bytes; + uint64_t dd_compressed_bytes; + uint64_t dd_uncompressed_bytes; + /* Administrative quota setting */ + uint64_t dd_quota; + /* Administrative reservation setting */ + uint64_t dd_reserved; + uint64_t dd_props_zapobj; + uint64_t dd_deleg_zapobj; /* dataset delegation permissions */ + uint64_t dd_flags; + uint64_t dd_used_breakdown[DD_USED_NUM]; + uint64_t dd_pad[14]; /* pad out to 256 bytes for good measure */ +} dsl_dir_phys_t; + +struct dsl_dir { + /* These are immutable; no lock needed: */ + uint64_t dd_object; + dsl_dir_phys_t *dd_phys; + dmu_buf_t *dd_dbuf; + dsl_pool_t *dd_pool; + + /* protected by lock on pool's dp_dirty_dirs list */ + txg_node_t dd_dirty_link; + + /* protected by dp_config_rwlock */ + dsl_dir_t *dd_parent; + + /* Protected by dd_lock */ + kmutex_t dd_lock; + list_t dd_prop_cbs; /* list of dsl_prop_cb_record_t's */ + + /* gross estimate of space used by in-flight tx's */ + uint64_t dd_tempreserved[TXG_SIZE]; + /* amount of space we expect to write; == amount of dirty data */ + int64_t dd_space_towrite[TXG_SIZE]; + + /* protected by dd_lock; keep at end of struct for better locality */ + char dd_myname[MAXNAMELEN]; +}; + +void dsl_dir_close(dsl_dir_t *dd, void *tag); +int dsl_dir_open(const char *name, void *tag, dsl_dir_t **, const char **tail); +int dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, dsl_dir_t **, + const char **tailp); +int dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, + const char *tail, void *tag, dsl_dir_t **); +void dsl_dir_name(dsl_dir_t *dd, char *buf); +int dsl_dir_namelen(dsl_dir_t *dd); +int dsl_dir_is_private(dsl_dir_t *dd); +uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, + const char *name, dmu_tx_t *tx); +dsl_checkfunc_t dsl_dir_destroy_check; +dsl_syncfunc_t dsl_dir_destroy_sync; +void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv); +uint64_t dsl_dir_space_available(dsl_dir_t *dd, + dsl_dir_t *ancestor, int64_t delta, int ondiskonly); +void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx); +void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx); +int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t mem, + uint64_t asize, uint64_t fsize, uint64_t usize, void **tr_cookiep, + dmu_tx_t *tx); +void dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx); +void dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx); +void dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, + int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx); +void dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, + dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx); +int dsl_dir_set_quota(const char *ddname, uint64_t quota); +int dsl_dir_set_reservation(const char *ddname, uint64_t reservation); +int dsl_dir_rename(dsl_dir_t *dd, const char *newname); +int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space); +int dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx); +boolean_t dsl_dir_is_clone(dsl_dir_t *dd); +void dsl_dir_new_refreservation(dsl_dir_t *dd, struct dsl_dataset *ds, + uint64_t reservation, cred_t *cr, dmu_tx_t *tx); + +/* internal reserved dir name */ +#define MOS_DIR_NAME "$MOS" +#define ORIGIN_DIR_NAME "$ORIGIN" + +#ifdef ZFS_DEBUG +#define dprintf_dd(dd, fmt, ...) do { \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ + char *__ds_name = kmem_alloc(MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, \ + KM_SLEEP); \ + dsl_dir_name(dd, __ds_name); \ + dprintf("dd=%s " fmt, __ds_name, __VA_ARGS__); \ + kmem_free(__ds_name, MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); \ + } \ +_NOTE(CONSTCOND) } while (0) +#else +#define dprintf_dd(dd, fmt, ...) +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_DIR_H */ diff --git a/module/zfs/include/sys/dsl_pool.h b/module/zfs/include/sys/dsl_pool.h new file mode 100644 index 000000000..3bb4ad4ef --- /dev/null +++ b/module/zfs/include/sys/dsl_pool.h @@ -0,0 +1,150 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DSL_POOL_H +#define _SYS_DSL_POOL_H + +#include <sys/spa.h> +#include <sys/txg.h> +#include <sys/txg_impl.h> +#include <sys/zfs_context.h> +#include <sys/zio.h> +#include <sys/dnode.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct objset; +struct dsl_dir; +struct dsl_dataset; +struct dsl_pool; +struct dmu_tx; + +enum scrub_func { + SCRUB_FUNC_NONE, + SCRUB_FUNC_CLEAN, + SCRUB_FUNC_NUMFUNCS +}; + +/* These macros are for indexing into the zfs_all_blkstats_t. */ +#define DMU_OT_DEFERRED DMU_OT_NONE +#define DMU_OT_TOTAL DMU_OT_NUMTYPES + +typedef struct zfs_blkstat { + uint64_t zb_count; + uint64_t zb_asize; + uint64_t zb_lsize; + uint64_t zb_psize; + uint64_t zb_gangs; + uint64_t zb_ditto_2_of_2_samevdev; + uint64_t zb_ditto_2_of_3_samevdev; + uint64_t zb_ditto_3_of_3_samevdev; +} zfs_blkstat_t; + +typedef struct zfs_all_blkstats { + zfs_blkstat_t zab_type[DN_MAX_LEVELS + 1][DMU_OT_TOTAL + 1]; +} zfs_all_blkstats_t; + + +typedef struct dsl_pool { + /* Immutable */ + spa_t *dp_spa; + struct objset *dp_meta_objset; + struct dsl_dir *dp_root_dir; + struct dsl_dir *dp_mos_dir; + struct dsl_dataset *dp_origin_snap; + uint64_t dp_root_dir_obj; + + /* No lock needed - sync context only */ + blkptr_t dp_meta_rootbp; + list_t dp_synced_datasets; + hrtime_t dp_read_overhead; + uint64_t dp_throughput; + uint64_t dp_write_limit; + + /* Uses dp_lock */ + kmutex_t dp_lock; + uint64_t dp_space_towrite[TXG_SIZE]; + uint64_t dp_tempreserved[TXG_SIZE]; + + enum scrub_func dp_scrub_func; + uint64_t dp_scrub_queue_obj; + uint64_t dp_scrub_min_txg; + uint64_t dp_scrub_max_txg; + zbookmark_t dp_scrub_bookmark; + boolean_t dp_scrub_pausing; + boolean_t dp_scrub_isresilver; + uint64_t dp_scrub_start_time; + kmutex_t dp_scrub_cancel_lock; /* protects dp_scrub_restart */ + boolean_t dp_scrub_restart; + + /* Has its own locking */ + tx_state_t dp_tx; + txg_list_t dp_dirty_datasets; + txg_list_t dp_dirty_dirs; + txg_list_t dp_sync_tasks; + + /* + * Protects administrative changes (properties, namespace) + * It is only held for write in syncing context. Therefore + * syncing context does not need to ever have it for read, since + * nobody else could possibly have it for write. + */ + krwlock_t dp_config_rwlock; + + zfs_all_blkstats_t *dp_blkstats; +} dsl_pool_t; + +int dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp); +void dsl_pool_close(dsl_pool_t *dp); +dsl_pool_t *dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg); +void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg); +void dsl_pool_zil_clean(dsl_pool_t *dp); +int dsl_pool_sync_context(dsl_pool_t *dp); +uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree); +int dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx); +void dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); +void dsl_pool_memory_pressure(dsl_pool_t *dp); +void dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); +int dsl_free(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp, + zio_done_func_t *done, void *private, uint32_t arc_flags); +void dsl_pool_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx); +void dsl_pool_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx); +void dsl_pool_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2, + struct dmu_tx *tx); +void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx); +void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx); + +int dsl_pool_scrub_cancel(dsl_pool_t *dp); +int dsl_pool_scrub_clean(dsl_pool_t *dp); +void dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx); +void dsl_pool_scrub_restart(dsl_pool_t *dp); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_POOL_H */ diff --git a/module/zfs/include/sys/dsl_prop.h b/module/zfs/include/sys/dsl_prop.h new file mode 100644 index 000000000..d66caa86c --- /dev/null +++ b/module/zfs/include/sys/dsl_prop.h @@ -0,0 +1,82 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DSL_PROP_H +#define _SYS_DSL_PROP_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/dmu.h> +#include <sys/dsl_pool.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct dsl_dataset; +struct dsl_dir; + +/* The callback func may not call into the DMU or DSL! */ +typedef void (dsl_prop_changed_cb_t)(void *arg, uint64_t newval); + +typedef struct dsl_prop_cb_record { + list_node_t cbr_node; /* link on dd_prop_cbs */ + struct dsl_dataset *cbr_ds; + const char *cbr_propname; + dsl_prop_changed_cb_t *cbr_func; + void *cbr_arg; +} dsl_prop_cb_record_t; + +int dsl_prop_register(struct dsl_dataset *ds, const char *propname, + dsl_prop_changed_cb_t *callback, void *cbarg); +int dsl_prop_unregister(struct dsl_dataset *ds, const char *propname, + dsl_prop_changed_cb_t *callback, void *cbarg); +int dsl_prop_numcb(struct dsl_dataset *ds); + +int dsl_prop_get(const char *ddname, const char *propname, + int intsz, int numints, void *buf, char *setpoint); +int dsl_prop_get_integer(const char *ddname, const char *propname, + uint64_t *valuep, char *setpoint); +int dsl_prop_get_all(objset_t *os, nvlist_t **nvp, boolean_t local); +int dsl_prop_get_ds(struct dsl_dataset *ds, const char *propname, + int intsz, int numints, void *buf, char *setpoint); +int dsl_prop_get_dd(struct dsl_dir *dd, const char *propname, + int intsz, int numints, void *buf, char *setpoint); + +int dsl_prop_set(const char *ddname, const char *propname, + int intsz, int numints, const void *buf); +void dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, + cred_t *cr, dmu_tx_t *tx); + +void dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value); +void dsl_prop_nvlist_add_string(nvlist_t *nv, + zfs_prop_t prop, const char *value); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_PROP_H */ diff --git a/module/zfs/include/sys/dsl_synctask.h b/module/zfs/include/sys/dsl_synctask.h new file mode 100644 index 000000000..4995bfe5a --- /dev/null +++ b/module/zfs/include/sys/dsl_synctask.h @@ -0,0 +1,83 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DSL_SYNCTASK_H +#define _SYS_DSL_SYNCTASK_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/txg.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct dsl_pool; + +typedef int (dsl_checkfunc_t)(void *, void *, dmu_tx_t *); +typedef void (dsl_syncfunc_t)(void *, void *, cred_t *, dmu_tx_t *); + +typedef struct dsl_sync_task { + list_node_t dst_node; + dsl_checkfunc_t *dst_checkfunc; + dsl_syncfunc_t *dst_syncfunc; + void *dst_arg1; + void *dst_arg2; + int dst_err; +} dsl_sync_task_t; + +typedef struct dsl_sync_task_group { + txg_node_t dstg_node; + list_t dstg_tasks; + struct dsl_pool *dstg_pool; + cred_t *dstg_cr; + uint64_t dstg_txg; + int dstg_err; + int dstg_space; + boolean_t dstg_nowaiter; +} dsl_sync_task_group_t; + +dsl_sync_task_group_t *dsl_sync_task_group_create(struct dsl_pool *dp); +void dsl_sync_task_create(dsl_sync_task_group_t *dstg, + dsl_checkfunc_t *, dsl_syncfunc_t *, + void *arg1, void *arg2, int blocks_modified); +int dsl_sync_task_group_wait(dsl_sync_task_group_t *dstg); +void dsl_sync_task_group_nowait(dsl_sync_task_group_t *dstg, dmu_tx_t *tx); +void dsl_sync_task_group_destroy(dsl_sync_task_group_t *dstg); +void dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx); + +int dsl_sync_task_do(struct dsl_pool *dp, + dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, + void *arg1, void *arg2, int blocks_modified); +void dsl_sync_task_do_nowait(struct dsl_pool *dp, + dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, + void *arg1, void *arg2, int blocks_modified, dmu_tx_t *tx); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_SYNCTASK_H */ diff --git a/module/zfs/include/sys/metaslab.h b/module/zfs/include/sys/metaslab.h new file mode 100644 index 000000000..1c9d89e8f --- /dev/null +++ b/module/zfs/include/sys/metaslab.h @@ -0,0 +1,71 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_METASLAB_H +#define _SYS_METASLAB_H + +#include <sys/spa.h> +#include <sys/space_map.h> +#include <sys/txg.h> +#include <sys/zio.h> +#include <sys/avl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct metaslab_class metaslab_class_t; +typedef struct metaslab_group metaslab_group_t; + +extern metaslab_t *metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, + uint64_t start, uint64_t size, uint64_t txg); +extern void metaslab_fini(metaslab_t *msp); +extern void metaslab_sync(metaslab_t *msp, uint64_t txg); +extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg); + +#define METASLAB_HINTBP_FAVOR 0x0 +#define METASLAB_HINTBP_AVOID 0x1 +#define METASLAB_GANG_HEADER 0x2 + +extern int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, + blkptr_t *bp, int ncopies, uint64_t txg, blkptr_t *hintbp, int flags); +extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, + boolean_t now); +extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg); + +extern metaslab_class_t *metaslab_class_create(void); +extern void metaslab_class_destroy(metaslab_class_t *mc); +extern void metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg); +extern void metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg); + +extern metaslab_group_t *metaslab_group_create(metaslab_class_t *mc, + vdev_t *vd); +extern void metaslab_group_destroy(metaslab_group_t *mg); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_METASLAB_H */ diff --git a/module/zfs/include/sys/metaslab_impl.h b/module/zfs/include/sys/metaslab_impl.h new file mode 100644 index 000000000..5980cbc84 --- /dev/null +++ b/module/zfs/include/sys/metaslab_impl.h @@ -0,0 +1,81 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_METASLAB_IMPL_H +#define _SYS_METASLAB_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/metaslab.h> +#include <sys/space_map.h> +#include <sys/vdev.h> +#include <sys/txg.h> +#include <sys/avl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct metaslab_class { + metaslab_group_t *mc_rotor; + uint64_t mc_allocated; +}; + +struct metaslab_group { + kmutex_t mg_lock; + avl_tree_t mg_metaslab_tree; + uint64_t mg_aliquot; + int64_t mg_bias; + metaslab_class_t *mg_class; + vdev_t *mg_vd; + metaslab_group_t *mg_prev; + metaslab_group_t *mg_next; +}; + +/* + * Each metaslab's free space is tracked in space map object in the MOS, + * which is only updated in syncing context. Each time we sync a txg, + * we append the allocs and frees from that txg to the space map object. + * When the txg is done syncing, metaslab_sync_done() updates ms_smo + * to ms_smo_syncing. Everything in ms_smo is always safe to allocate. + */ +struct metaslab { + kmutex_t ms_lock; /* metaslab lock */ + space_map_obj_t ms_smo; /* synced space map object */ + space_map_obj_t ms_smo_syncing; /* syncing space map object */ + space_map_t ms_allocmap[TXG_SIZE]; /* allocated this txg */ + space_map_t ms_freemap[TXG_SIZE]; /* freed this txg */ + space_map_t ms_map; /* in-core free space map */ + uint64_t ms_weight; /* weight vs. others in group */ + metaslab_group_t *ms_group; /* metaslab group */ + avl_node_t ms_group_node; /* node in metaslab group tree */ + txg_node_t ms_txg_node; /* per-txg dirty metaslab links */ +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_METASLAB_IMPL_H */ diff --git a/module/zfs/include/sys/refcount.h b/module/zfs/include/sys/refcount.h new file mode 100644 index 000000000..d3fe7b1f8 --- /dev/null +++ b/module/zfs/include/sys/refcount.h @@ -0,0 +1,104 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_REFCOUNT_H +#define _SYS_REFCOUNT_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/inttypes.h> +#include <sys/list.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * If the reference is held only by the calling function and not any + * particular object, use FTAG (which is a string) for the holder_tag. + * Otherwise, use the object that holds the reference. + */ +#define FTAG ((char *)__func__) + +#if defined(DEBUG) || !defined(_KERNEL) +typedef struct reference { + list_node_t ref_link; + void *ref_holder; + uint64_t ref_number; + uint8_t *ref_removed; +} reference_t; + +typedef struct refcount { + kmutex_t rc_mtx; + list_t rc_list; + list_t rc_removed; + int64_t rc_count; + int64_t rc_removed_count; +} refcount_t; + +/* Note: refcount_t must be initialized with refcount_create() */ + +void refcount_create(refcount_t *rc); +void refcount_destroy(refcount_t *rc); +void refcount_destroy_many(refcount_t *rc, uint64_t number); +int refcount_is_zero(refcount_t *rc); +int64_t refcount_count(refcount_t *rc); +int64_t refcount_add(refcount_t *rc, void *holder_tag); +int64_t refcount_remove(refcount_t *rc, void *holder_tag); +int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag); +int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag); + +void refcount_init(void); +void refcount_fini(void); + +#else /* DEBUG */ + +typedef struct refcount { + uint64_t rc_count; +} refcount_t; + +#define refcount_create(rc) ((rc)->rc_count = 0) +#define refcount_destroy(rc) ((rc)->rc_count = 0) +#define refcount_destroy_many(rc, number) ((rc)->rc_count = 0) +#define refcount_is_zero(rc) ((rc)->rc_count == 0) +#define refcount_count(rc) ((rc)->rc_count) +#define refcount_add(rc, holder) atomic_add_64_nv(&(rc)->rc_count, 1) +#define refcount_remove(rc, holder) atomic_add_64_nv(&(rc)->rc_count, -1) +#define refcount_add_many(rc, number, holder) \ + atomic_add_64_nv(&(rc)->rc_count, number) +#define refcount_remove_many(rc, number, holder) \ + atomic_add_64_nv(&(rc)->rc_count, -number) + +#define refcount_init() +#define refcount_fini() + +#endif /* DEBUG */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_REFCOUNT_H */ diff --git a/module/zfs/include/sys/rrwlock.h b/module/zfs/include/sys/rrwlock.h new file mode 100644 index 000000000..19a43c97f --- /dev/null +++ b/module/zfs/include/sys/rrwlock.h @@ -0,0 +1,80 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_RR_RW_LOCK_H +#define _SYS_RR_RW_LOCK_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/inttypes.h> +#include <sys/zfs_context.h> +#include <sys/refcount.h> + +/* + * A reader-writer lock implementation that allows re-entrant reads, but + * still gives writers priority on "new" reads. + * + * See rrwlock.c for more details about the implementation. + * + * Fields of the rrwlock_t structure: + * - rr_lock: protects modification and reading of rrwlock_t fields + * - rr_cv: cv for waking up readers or waiting writers + * - rr_writer: thread id of the current writer + * - rr_anon_rount: number of active anonymous readers + * - rr_linked_rcount: total number of non-anonymous active readers + * - rr_writer_wanted: a writer wants the lock + */ +typedef struct rrwlock { + kmutex_t rr_lock; + kcondvar_t rr_cv; + kthread_t *rr_writer; + refcount_t rr_anon_rcount; + refcount_t rr_linked_rcount; + boolean_t rr_writer_wanted; +} rrwlock_t; + +/* + * 'tag' is used in reference counting tracking. The + * 'tag' must be the same in a rrw_enter() as in its + * corresponding rrw_exit(). + */ +void rrw_init(rrwlock_t *rrl); +void rrw_destroy(rrwlock_t *rrl); +void rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag); +void rrw_exit(rrwlock_t *rrl, void *tag); +boolean_t rrw_held(rrwlock_t *rrl, krw_t rw); + +#define RRW_READ_HELD(x) rrw_held(x, RW_READER) +#define RRW_WRITE_HELD(x) rrw_held(x, RW_WRITER) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_RR_RW_LOCK_H */ diff --git a/module/zfs/include/sys/spa.h b/module/zfs/include/sys/spa.h new file mode 100644 index 000000000..24b3ca447 --- /dev/null +++ b/module/zfs/include/sys/spa.h @@ -0,0 +1,554 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_SPA_H +#define _SYS_SPA_H + +#include <sys/avl.h> +#include <sys/zfs_context.h> +#include <sys/nvpair.h> +#include <sys/sysmacros.h> +#include <sys/types.h> +#include <sys/fs/zfs.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Forward references that lots of things need. + */ +typedef struct spa spa_t; +typedef struct vdev vdev_t; +typedef struct metaslab metaslab_t; +typedef struct zilog zilog_t; +typedef struct spa_aux_vdev spa_aux_vdev_t; +struct dsl_pool; + +/* + * General-purpose 32-bit and 64-bit bitfield encodings. + */ +#define BF32_DECODE(x, low, len) P2PHASE((x) >> (low), 1U << (len)) +#define BF64_DECODE(x, low, len) P2PHASE((x) >> (low), 1ULL << (len)) +#define BF32_ENCODE(x, low, len) (P2PHASE((x), 1U << (len)) << (low)) +#define BF64_ENCODE(x, low, len) (P2PHASE((x), 1ULL << (len)) << (low)) + +#define BF32_GET(x, low, len) BF32_DECODE(x, low, len) +#define BF64_GET(x, low, len) BF64_DECODE(x, low, len) + +#define BF32_SET(x, low, len, val) \ + ((x) ^= BF32_ENCODE((x >> low) ^ (val), low, len)) +#define BF64_SET(x, low, len, val) \ + ((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len)) + +#define BF32_GET_SB(x, low, len, shift, bias) \ + ((BF32_GET(x, low, len) + (bias)) << (shift)) +#define BF64_GET_SB(x, low, len, shift, bias) \ + ((BF64_GET(x, low, len) + (bias)) << (shift)) + +#define BF32_SET_SB(x, low, len, shift, bias, val) \ + BF32_SET(x, low, len, ((val) >> (shift)) - (bias)) +#define BF64_SET_SB(x, low, len, shift, bias, val) \ + BF64_SET(x, low, len, ((val) >> (shift)) - (bias)) + +/* + * We currently support nine block sizes, from 512 bytes to 128K. + * We could go higher, but the benefits are near-zero and the cost + * of COWing a giant block to modify one byte would become excessive. + */ +#define SPA_MINBLOCKSHIFT 9 +#define SPA_MAXBLOCKSHIFT 17 +#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT) +#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT) + +#define SPA_BLOCKSIZES (SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1) + +/* + * Size of block to hold the configuration data (a packed nvlist) + */ +#define SPA_CONFIG_BLOCKSIZE (1 << 14) + +/* + * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB. + * The ASIZE encoding should be at least 64 times larger (6 more bits) + * to support up to 4-way RAID-Z mirror mode with worst-case gang block + * overhead, three DVAs per bp, plus one more bit in case we do anything + * else that expands the ASIZE. + */ +#define SPA_LSIZEBITS 16 /* LSIZE up to 32M (2^16 * 512) */ +#define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */ +#define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */ + +/* + * All SPA data is represented by 128-bit data virtual addresses (DVAs). + * The members of the dva_t should be considered opaque outside the SPA. + */ +typedef struct dva { + uint64_t dva_word[2]; +} dva_t; + +/* + * Each block has a 256-bit checksum -- strong enough for cryptographic hashes. + */ +typedef struct zio_cksum { + uint64_t zc_word[4]; +} zio_cksum_t; + +/* + * Each block is described by its DVAs, time of birth, checksum, etc. + * The word-by-word, bit-by-bit layout of the blkptr is as follows: + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 0 | vdev1 | GRID | ASIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 1 |G| offset1 | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 2 | vdev2 | GRID | ASIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 3 |G| offset2 | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 4 | vdev3 | GRID | ASIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 5 |G| offset3 | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 6 |E| lvl | type | cksum | comp | PSIZE | LSIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 7 | padding | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 8 | padding | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 9 | padding | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * a | birth txg | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * b | fill count | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * c | checksum[0] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * d | checksum[1] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * e | checksum[2] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * f | checksum[3] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * Legend: + * + * vdev virtual device ID + * offset offset into virtual device + * LSIZE logical size + * PSIZE physical size (after compression) + * ASIZE allocated size (including RAID-Z parity and gang block headers) + * GRID RAID-Z layout information (reserved for future use) + * cksum checksum function + * comp compression function + * G gang block indicator + * E endianness + * type DMU object type + * lvl level of indirection + * birth txg transaction group in which the block was born + * fill count number of non-zero blocks under this bp + * checksum[4] 256-bit checksum of the data this bp describes + */ +typedef struct blkptr { + dva_t blk_dva[3]; /* 128-bit Data Virtual Address */ + uint64_t blk_prop; /* size, compression, type, etc */ + uint64_t blk_pad[3]; /* Extra space for the future */ + uint64_t blk_birth; /* transaction group at birth */ + uint64_t blk_fill; /* fill count */ + zio_cksum_t blk_cksum; /* 256-bit checksum */ +} blkptr_t; + +#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */ +#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */ + +/* + * Macros to get and set fields in a bp or DVA. + */ +#define DVA_GET_ASIZE(dva) \ + BF64_GET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0) +#define DVA_SET_ASIZE(dva, x) \ + BF64_SET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0, x) + +#define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8) +#define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x) + +#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, 32) +#define DVA_SET_VDEV(dva, x) BF64_SET((dva)->dva_word[0], 32, 32, x) + +#define DVA_GET_OFFSET(dva) \ + BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0) +#define DVA_SET_OFFSET(dva, x) \ + BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x) + +#define DVA_GET_GANG(dva) BF64_GET((dva)->dva_word[1], 63, 1) +#define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x) + +#define BP_GET_LSIZE(bp) \ + (BP_IS_HOLE(bp) ? 0 : \ + BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)) +#define BP_SET_LSIZE(bp, x) \ + BF64_SET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x) + +#define BP_GET_PSIZE(bp) \ + BF64_GET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1) +#define BP_SET_PSIZE(bp, x) \ + BF64_SET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x) + +#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8) +#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x) + +#define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8) +#define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x) + +#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8) +#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x) + +#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5) +#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x) + +#define BP_GET_BYTEORDER(bp) (0 - BF64_GET((bp)->blk_prop, 63, 1)) +#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x) + +#define BP_GET_ASIZE(bp) \ + (DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ + DVA_GET_ASIZE(&(bp)->blk_dva[2])) + +#define BP_GET_UCSIZE(bp) \ + ((BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) ? \ + BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)); + +#define BP_GET_NDVAS(bp) \ + (!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ + !!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ + !!DVA_GET_ASIZE(&(bp)->blk_dva[2])) + +#define BP_COUNT_GANG(bp) \ + (DVA_GET_GANG(&(bp)->blk_dva[0]) + \ + DVA_GET_GANG(&(bp)->blk_dva[1]) + \ + DVA_GET_GANG(&(bp)->blk_dva[2])) + +#define DVA_EQUAL(dva1, dva2) \ + ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \ + (dva1)->dva_word[0] == (dva2)->dva_word[0]) + +#define ZIO_CHECKSUM_EQUAL(zc1, zc2) \ + (0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \ + ((zc1).zc_word[1] - (zc2).zc_word[1]) | \ + ((zc1).zc_word[2] - (zc2).zc_word[2]) | \ + ((zc1).zc_word[3] - (zc2).zc_word[3]))) + +#define DVA_IS_VALID(dva) (DVA_GET_ASIZE(dva) != 0) + +#define ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3) \ +{ \ + (zcp)->zc_word[0] = w0; \ + (zcp)->zc_word[1] = w1; \ + (zcp)->zc_word[2] = w2; \ + (zcp)->zc_word[3] = w3; \ +} + +#define BP_IDENTITY(bp) (&(bp)->blk_dva[0]) +#define BP_IS_GANG(bp) DVA_GET_GANG(BP_IDENTITY(bp)) +#define BP_IS_HOLE(bp) ((bp)->blk_birth == 0) +#define BP_IS_OLDER(bp, txg) (!BP_IS_HOLE(bp) && (bp)->blk_birth < (txg)) + +#define BP_ZERO(bp) \ +{ \ + (bp)->blk_dva[0].dva_word[0] = 0; \ + (bp)->blk_dva[0].dva_word[1] = 0; \ + (bp)->blk_dva[1].dva_word[0] = 0; \ + (bp)->blk_dva[1].dva_word[1] = 0; \ + (bp)->blk_dva[2].dva_word[0] = 0; \ + (bp)->blk_dva[2].dva_word[1] = 0; \ + (bp)->blk_prop = 0; \ + (bp)->blk_pad[0] = 0; \ + (bp)->blk_pad[1] = 0; \ + (bp)->blk_pad[2] = 0; \ + (bp)->blk_birth = 0; \ + (bp)->blk_fill = 0; \ + ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \ +} + +#define BLK_FILL_ALREADY_FREED (-1ULL) + +/* + * Note: the byteorder is either 0 or -1, both of which are palindromes. + * This simplifies the endianness handling a bit. + */ +#ifdef _BIG_ENDIAN +#define ZFS_HOST_BYTEORDER (0ULL) +#else +#define ZFS_HOST_BYTEORDER (-1ULL) +#endif + +#define BP_SHOULD_BYTESWAP(bp) (BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER) + +#define BP_SPRINTF_LEN 320 + +#include <sys/dmu.h> + +#define BP_GET_BUFC_TYPE(bp) \ + (((BP_GET_LEVEL(bp) > 0) || (dmu_ot[BP_GET_TYPE(bp)].ot_metadata)) ? \ + ARC_BUFC_METADATA : ARC_BUFC_DATA); +/* + * Routines found in spa.c + */ + +/* state manipulation functions */ +extern int spa_open(const char *pool, spa_t **, void *tag); +extern int spa_get_stats(const char *pool, nvlist_t **config, + char *altroot, size_t buflen); +extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props, + const char *history_str, nvlist_t *zplprops); +extern int spa_check_rootconf(char *devpath, char *devid, + nvlist_t **bestconf, uint64_t *besttxg); +extern boolean_t spa_rootdev_validate(nvlist_t *nv); +extern int spa_import_rootpool(char *devpath, char *devid); +extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props); +extern int spa_import_faulted(const char *, nvlist_t *, nvlist_t *); +extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); +extern int spa_destroy(char *pool); +extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force); +extern int spa_reset(char *pool); +extern void spa_async_request(spa_t *spa, int flag); +extern void spa_async_unrequest(spa_t *spa, int flag); +extern void spa_async_suspend(spa_t *spa); +extern void spa_async_resume(spa_t *spa); +extern spa_t *spa_inject_addref(char *pool); +extern void spa_inject_delref(spa_t *spa); + +#define SPA_ASYNC_CONFIG_UPDATE 0x01 +#define SPA_ASYNC_REMOVE 0x02 +#define SPA_ASYNC_PROBE 0x04 +#define SPA_ASYNC_RESILVER_DONE 0x08 +#define SPA_ASYNC_RESILVER 0x10 + +/* device manipulation */ +extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); +extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, + int replacing); +extern int spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done); +extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare); +extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath); + +/* spare state (which is global across all pools) */ +extern void spa_spare_add(vdev_t *vd); +extern void spa_spare_remove(vdev_t *vd); +extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt); +extern void spa_spare_activate(vdev_t *vd); + +/* L2ARC state (which is global across all pools) */ +extern void spa_l2cache_add(vdev_t *vd); +extern void spa_l2cache_remove(vdev_t *vd); +extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool); +extern void spa_l2cache_activate(vdev_t *vd); +extern void spa_l2cache_drop(spa_t *spa); +extern void spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc); + +/* scrubbing */ +extern int spa_scrub(spa_t *spa, pool_scrub_type_t type); + +/* spa syncing */ +extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */ +extern void spa_sync_allpools(void); + +/* spa namespace global mutex */ +extern kmutex_t spa_namespace_lock; + +/* + * SPA configuration functions in spa_config.c + */ + +#define SPA_CONFIG_UPDATE_POOL 0 +#define SPA_CONFIG_UPDATE_VDEVS 1 + +extern void spa_config_sync(spa_t *, boolean_t, boolean_t); +extern void spa_config_load(void); +extern nvlist_t *spa_all_configs(uint64_t *); +extern void spa_config_set(spa_t *spa, nvlist_t *config); +extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, + int getstats); +extern void spa_config_update(spa_t *spa, int what); +extern void spa_config_update_common(spa_t *spa, int what, boolean_t isroot); + +/* + * Miscellaneous SPA routines in spa_misc.c + */ + +/* Namespace manipulation */ +extern spa_t *spa_lookup(const char *name); +extern spa_t *spa_add(const char *name, const char *altroot); +extern void spa_remove(spa_t *spa); +extern spa_t *spa_next(spa_t *prev); + +/* Refcount functions */ +extern void spa_open_ref(spa_t *spa, void *tag); +extern void spa_close(spa_t *spa, void *tag); +extern boolean_t spa_refcount_zero(spa_t *spa); + +#define SCL_CONFIG 0x01 +#define SCL_STATE 0x02 +#define SCL_L2ARC 0x04 /* hack until L2ARC 2.0 */ +#define SCL_ALLOC 0x08 +#define SCL_ZIO 0x10 +#define SCL_FREE 0x20 +#define SCL_VDEV 0x40 +#define SCL_LOCKS 7 +#define SCL_ALL ((1 << SCL_LOCKS) - 1) +#define SCL_STATE_ALL (SCL_STATE | SCL_L2ARC | SCL_ZIO) + +/* Pool configuration locks */ +extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw); +extern void spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw); +extern void spa_config_exit(spa_t *spa, int locks, void *tag); +extern int spa_config_held(spa_t *spa, int locks, krw_t rw); + +/* Pool vdev add/remove lock */ +extern uint64_t spa_vdev_enter(spa_t *spa); +extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error); + +/* Pool vdev state change lock */ +extern void spa_vdev_state_enter(spa_t *spa); +extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error); + +/* Accessor functions */ +extern boolean_t spa_shutting_down(spa_t *spa); +extern struct dsl_pool *spa_get_dsl(spa_t *spa); +extern blkptr_t *spa_get_rootblkptr(spa_t *spa); +extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp); +extern void spa_altroot(spa_t *, char *, size_t); +extern int spa_sync_pass(spa_t *spa); +extern char *spa_name(spa_t *spa); +extern uint64_t spa_guid(spa_t *spa); +extern uint64_t spa_last_synced_txg(spa_t *spa); +extern uint64_t spa_first_txg(spa_t *spa); +extern uint64_t spa_version(spa_t *spa); +extern pool_state_t spa_state(spa_t *spa); +extern uint64_t spa_freeze_txg(spa_t *spa); +extern uint64_t spa_get_alloc(spa_t *spa); +extern uint64_t spa_get_space(spa_t *spa); +extern uint64_t spa_get_dspace(spa_t *spa); +extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize); +extern uint64_t spa_version(spa_t *spa); +extern int spa_max_replication(spa_t *spa); +extern int spa_busy(void); +extern uint8_t spa_get_failmode(spa_t *spa); +extern boolean_t spa_suspended(spa_t *spa); + +/* Miscellaneous support routines */ +extern int spa_rename(const char *oldname, const char *newname); +extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid); +extern char *spa_strdup(const char *); +extern void spa_strfree(char *); +extern uint64_t spa_get_random(uint64_t range); +extern void sprintf_blkptr(char *buf, int len, const blkptr_t *bp); +extern void spa_freeze(spa_t *spa); +extern void spa_upgrade(spa_t *spa, uint64_t version); +extern void spa_evict_all(void); +extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid, + boolean_t l2cache); +extern boolean_t spa_has_spare(spa_t *, uint64_t guid); +extern uint64_t bp_get_dasize(spa_t *spa, const blkptr_t *bp); +extern boolean_t spa_has_slogs(spa_t *spa); +extern boolean_t spa_is_root(spa_t *spa); + +/* history logging */ +typedef enum history_log_type { + LOG_CMD_POOL_CREATE, + LOG_CMD_NORMAL, + LOG_INTERNAL +} history_log_type_t; + +typedef struct history_arg { + const char *ha_history_str; + history_log_type_t ha_log_type; + history_internal_events_t ha_event; + char ha_zone[MAXPATHLEN]; +} history_arg_t; + +extern char *spa_his_ievent_table[]; + +extern void spa_history_create_obj(spa_t *spa, dmu_tx_t *tx); +extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read, + char *his_buf); +extern int spa_history_log(spa_t *spa, const char *his_buf, + history_log_type_t what); +void spa_history_internal_log(history_internal_events_t event, spa_t *spa, + dmu_tx_t *tx, cred_t *cr, const char *fmt, ...); + +/* error handling */ +struct zbookmark; +struct zio; +extern void spa_log_error(spa_t *spa, struct zio *zio); +extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd, + struct zio *zio, uint64_t stateoroffset, uint64_t length); +extern void zfs_post_remove(spa_t *spa, vdev_t *vd); +extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd); +extern uint64_t spa_get_errlog_size(spa_t *spa); +extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count); +extern void spa_errlog_rotate(spa_t *spa); +extern void spa_errlog_drain(spa_t *spa); +extern void spa_errlog_sync(spa_t *spa, uint64_t txg); +extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub); + +/* vdev cache */ +extern void vdev_cache_stat_init(void); +extern void vdev_cache_stat_fini(void); + +/* Initialization and termination */ +extern void spa_init(int flags); +extern void spa_fini(void); +extern void spa_boot_init(); + +/* properties */ +extern int spa_prop_set(spa_t *spa, nvlist_t *nvp); +extern int spa_prop_get(spa_t *spa, nvlist_t **nvp); +extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx); + +/* asynchronous event notification */ +extern void spa_event_notify(spa_t *spa, vdev_t *vdev, const char *name); + +#ifdef ZFS_DEBUG +#define dprintf_bp(bp, fmt, ...) do { \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ + char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \ + sprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp)); \ + dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf); \ + kmem_free(__blkbuf, BP_SPRINTF_LEN); \ + } \ +_NOTE(CONSTCOND) } while (0) +#else +#define dprintf_bp(bp, fmt, ...) +#endif + +extern int spa_mode; /* mode, e.g. FREAD | FWRITE */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SPA_H */ diff --git a/module/zfs/include/sys/spa_boot.h b/module/zfs/include/sys/spa_boot.h new file mode 100644 index 000000000..b56073b97 --- /dev/null +++ b/module/zfs/include/sys/spa_boot.h @@ -0,0 +1,45 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_SPA_BOOT_H +#define _SYS_SPA_BOOT_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/nvpair.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern char *spa_get_bootprop(char *prop); +extern void spa_free_bootprop(char *prop); +extern int spa_get_rootconf(char *devpath, char *devid, nvlist_t **bestconf_p); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SPA_BOOT_H */ diff --git a/module/zfs/include/sys/spa_impl.h b/module/zfs/include/sys/spa_impl.h new file mode 100644 index 000000000..8aeb414fe --- /dev/null +++ b/module/zfs/include/sys/spa_impl.h @@ -0,0 +1,196 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_SPA_IMPL_H +#define _SYS_SPA_IMPL_H + +#include <sys/spa.h> +#include <sys/vdev.h> +#include <sys/metaslab.h> +#include <sys/dmu.h> +#include <sys/dsl_pool.h> +#include <sys/uberblock_impl.h> +#include <sys/zfs_context.h> +#include <sys/avl.h> +#include <sys/refcount.h> +#include <sys/bplist.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct spa_error_entry { + zbookmark_t se_bookmark; + char *se_name; + avl_node_t se_avl; +} spa_error_entry_t; + +typedef struct spa_history_phys { + uint64_t sh_pool_create_len; /* ending offset of zpool create */ + uint64_t sh_phys_max_off; /* physical EOF */ + uint64_t sh_bof; /* logical BOF */ + uint64_t sh_eof; /* logical EOF */ + uint64_t sh_records_lost; /* num of records overwritten */ +} spa_history_phys_t; + +struct spa_aux_vdev { + uint64_t sav_object; /* MOS object for device list */ + nvlist_t *sav_config; /* cached device config */ + vdev_t **sav_vdevs; /* devices */ + int sav_count; /* number devices */ + boolean_t sav_sync; /* sync the device list */ + nvlist_t **sav_pending; /* pending device additions */ + uint_t sav_npending; /* # pending devices */ +}; + +typedef struct spa_config_lock { + kmutex_t scl_lock; + kthread_t *scl_writer; + int scl_write_wanted; + kcondvar_t scl_cv; + refcount_t scl_count; +} spa_config_lock_t; + +typedef struct spa_config_dirent { + list_node_t scd_link; + char *scd_path; +} spa_config_dirent_t; + +typedef enum spa_log_state { + SPA_LOG_UNKNOWN = 0, /* unknown log state */ + SPA_LOG_MISSING, /* missing log(s) */ + SPA_LOG_CLEAR, /* clear the log(s) */ + SPA_LOG_GOOD, /* log(s) are good */ +} spa_log_state_t; + +enum zio_taskq_type { + ZIO_TASKQ_ISSUE = 0, + ZIO_TASKQ_INTERRUPT, + ZIO_TASKQ_TYPES +}; + +struct spa { + /* + * Fields protected by spa_namespace_lock. + */ + char spa_name[MAXNAMELEN]; /* pool name */ + avl_node_t spa_avl; /* node in spa_namespace_avl */ + nvlist_t *spa_config; /* last synced config */ + nvlist_t *spa_config_syncing; /* currently syncing config */ + uint64_t spa_config_txg; /* txg of last config change */ + int spa_sync_pass; /* iterate-to-convergence */ + pool_state_t spa_state; /* pool state */ + int spa_inject_ref; /* injection references */ + uint8_t spa_sync_on; /* sync threads are running */ + spa_load_state_t spa_load_state; /* current load operation */ + taskq_t *spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES]; + dsl_pool_t *spa_dsl_pool; + metaslab_class_t *spa_normal_class; /* normal data class */ + metaslab_class_t *spa_log_class; /* intent log data class */ + uint64_t spa_first_txg; /* first txg after spa_open() */ + uint64_t spa_final_txg; /* txg of export/destroy */ + uint64_t spa_freeze_txg; /* freeze pool at this txg */ + objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */ + txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */ + vdev_t *spa_root_vdev; /* top-level vdev container */ + uint64_t spa_load_guid; /* initial guid for spa_load */ + list_t spa_config_dirty_list; /* vdevs with dirty config */ + list_t spa_state_dirty_list; /* vdevs with dirty state */ + spa_aux_vdev_t spa_spares; /* hot spares */ + spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */ + uint64_t spa_config_object; /* MOS object for pool config */ + uint64_t spa_syncing_txg; /* txg currently syncing */ + uint64_t spa_sync_bplist_obj; /* object for deferred frees */ + bplist_t spa_sync_bplist; /* deferred-free bplist */ + uberblock_t spa_ubsync; /* last synced uberblock */ + uberblock_t spa_uberblock; /* current uberblock */ + kmutex_t spa_scrub_lock; /* resilver/scrub lock */ + uint64_t spa_scrub_inflight; /* in-flight scrub I/Os */ + uint64_t spa_scrub_maxinflight; /* max in-flight scrub I/Os */ + uint64_t spa_scrub_errors; /* scrub I/O error count */ + kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */ + uint8_t spa_scrub_active; /* active or suspended? */ + uint8_t spa_scrub_type; /* type of scrub we're doing */ + uint8_t spa_scrub_finished; /* indicator to rotate logs */ + uint8_t spa_scrub_started; /* started since last boot */ + uint8_t spa_scrub_reopen; /* scrub doing vdev_reopen */ + kmutex_t spa_async_lock; /* protect async state */ + kthread_t *spa_async_thread; /* thread doing async task */ + int spa_async_suspended; /* async tasks suspended */ + kcondvar_t spa_async_cv; /* wait for thread_exit() */ + uint16_t spa_async_tasks; /* async task mask */ + kmutex_t spa_async_root_lock; /* protects async root count */ + uint64_t spa_async_root_count; /* number of async root zios */ + kcondvar_t spa_async_root_cv; /* notify when count == 0 */ + char *spa_root; /* alternate root directory */ + uint64_t spa_ena; /* spa-wide ereport ENA */ + boolean_t spa_last_open_failed; /* true if last open faled */ + kmutex_t spa_errlog_lock; /* error log lock */ + uint64_t spa_errlog_last; /* last error log object */ + uint64_t spa_errlog_scrub; /* scrub error log object */ + kmutex_t spa_errlist_lock; /* error list/ereport lock */ + avl_tree_t spa_errlist_last; /* last error list */ + avl_tree_t spa_errlist_scrub; /* scrub error list */ + uint64_t spa_deflate; /* should we deflate? */ + uint64_t spa_history; /* history object */ + kmutex_t spa_history_lock; /* history lock */ + vdev_t *spa_pending_vdev; /* pending vdev additions */ + kmutex_t spa_props_lock; /* property lock */ + uint64_t spa_pool_props_object; /* object for properties */ + uint64_t spa_bootfs; /* default boot filesystem */ + uint64_t spa_failmode; /* failure mode for the pool */ + uint64_t spa_delegation; /* delegation on/off */ + list_t spa_config_list; /* previous cache file(s) */ + zio_t *spa_suspend_zio_root; /* root of all suspended I/O */ + kmutex_t spa_suspend_lock; /* protects suspend_zio_root */ + kcondvar_t spa_suspend_cv; /* notification of resume */ + uint8_t spa_suspended; /* pool is suspended */ + boolean_t spa_import_faulted; /* allow faulted vdevs */ + boolean_t spa_is_root; /* pool is root */ + int spa_minref; /* num refs when first opened */ + spa_log_state_t spa_log_state; /* log state */ + /* + * spa_refcnt & spa_config_lock must be the last elements + * because refcount_t changes size based on compilation options. + * In order for the MDB module to function correctly, the other + * fields must remain in the same location. + */ + spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */ + refcount_t spa_refcount; /* number of opens */ +}; + +extern const char *spa_config_path; + +#define BOOTFS_COMPRESS_VALID(compress) \ + ((compress) == ZIO_COMPRESS_LZJB || \ + ((compress) == ZIO_COMPRESS_ON && \ + ZIO_COMPRESS_ON_VALUE == ZIO_COMPRESS_LZJB) || \ + (compress) == ZIO_COMPRESS_OFF) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SPA_IMPL_H */ diff --git a/module/zfs/include/sys/space_map.h b/module/zfs/include/sys/space_map.h new file mode 100644 index 000000000..db9daef1f --- /dev/null +++ b/module/zfs/include/sys/space_map.h @@ -0,0 +1,162 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_SPACE_MAP_H +#define _SYS_SPACE_MAP_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/avl.h> +#include <sys/dmu.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct space_map_ops space_map_ops_t; + +typedef struct space_map { + avl_tree_t sm_root; /* AVL tree of map segments */ + uint64_t sm_space; /* sum of all segments in the map */ + uint64_t sm_start; /* start of map */ + uint64_t sm_size; /* size of map */ + uint8_t sm_shift; /* unit shift */ + uint8_t sm_pad[3]; /* unused */ + uint8_t sm_loaded; /* map loaded? */ + uint8_t sm_loading; /* map loading? */ + kcondvar_t sm_load_cv; /* map load completion */ + space_map_ops_t *sm_ops; /* space map block picker ops vector */ + void *sm_ppd; /* picker-private data */ + kmutex_t *sm_lock; /* pointer to lock that protects map */ +} space_map_t; + +typedef struct space_seg { + avl_node_t ss_node; /* AVL node */ + uint64_t ss_start; /* starting offset of this segment */ + uint64_t ss_end; /* ending offset (non-inclusive) */ +} space_seg_t; + +typedef struct space_map_obj { + uint64_t smo_object; /* on-disk space map object */ + uint64_t smo_objsize; /* size of the object */ + uint64_t smo_alloc; /* space allocated from the map */ +} space_map_obj_t; + +struct space_map_ops { + void (*smop_load)(space_map_t *sm); + void (*smop_unload)(space_map_t *sm); + uint64_t (*smop_alloc)(space_map_t *sm, uint64_t size); + void (*smop_claim)(space_map_t *sm, uint64_t start, uint64_t size); + void (*smop_free)(space_map_t *sm, uint64_t start, uint64_t size); +}; + +/* + * debug entry + * + * 1 3 10 50 + * ,---+--------+------------+---------------------------------. + * | 1 | action | syncpass | txg (lower bits) | + * `---+--------+------------+---------------------------------' + * 63 62 60 59 50 49 0 + * + * + * + * non-debug entry + * + * 1 47 1 15 + * ,-----------------------------------------------------------. + * | 0 | offset (sm_shift units) | type | run | + * `-----------------------------------------------------------' + * 63 62 17 16 15 0 + */ + +/* All this stuff takes and returns bytes */ +#define SM_RUN_DECODE(x) (BF64_DECODE(x, 0, 15) + 1) +#define SM_RUN_ENCODE(x) BF64_ENCODE((x) - 1, 0, 15) +#define SM_TYPE_DECODE(x) BF64_DECODE(x, 15, 1) +#define SM_TYPE_ENCODE(x) BF64_ENCODE(x, 15, 1) +#define SM_OFFSET_DECODE(x) BF64_DECODE(x, 16, 47) +#define SM_OFFSET_ENCODE(x) BF64_ENCODE(x, 16, 47) +#define SM_DEBUG_DECODE(x) BF64_DECODE(x, 63, 1) +#define SM_DEBUG_ENCODE(x) BF64_ENCODE(x, 63, 1) + +#define SM_DEBUG_ACTION_DECODE(x) BF64_DECODE(x, 60, 3) +#define SM_DEBUG_ACTION_ENCODE(x) BF64_ENCODE(x, 60, 3) + +#define SM_DEBUG_SYNCPASS_DECODE(x) BF64_DECODE(x, 50, 10) +#define SM_DEBUG_SYNCPASS_ENCODE(x) BF64_ENCODE(x, 50, 10) + +#define SM_DEBUG_TXG_DECODE(x) BF64_DECODE(x, 0, 50) +#define SM_DEBUG_TXG_ENCODE(x) BF64_ENCODE(x, 0, 50) + +#define SM_RUN_MAX SM_RUN_DECODE(~0ULL) + +#define SM_ALLOC 0x0 +#define SM_FREE 0x1 + +/* + * The data for a given space map can be kept on blocks of any size. + * Larger blocks entail fewer i/o operations, but they also cause the + * DMU to keep more data in-core, and also to waste more i/o bandwidth + * when only a few blocks have changed since the last transaction group. + * This could use a lot more research, but for now, set the freelist + * block size to 4k (2^12). + */ +#define SPACE_MAP_BLOCKSHIFT 12 + +typedef void space_map_func_t(space_map_t *sm, uint64_t start, uint64_t size); + +extern void space_map_create(space_map_t *sm, uint64_t start, uint64_t size, + uint8_t shift, kmutex_t *lp); +extern void space_map_destroy(space_map_t *sm); +extern void space_map_add(space_map_t *sm, uint64_t start, uint64_t size); +extern void space_map_remove(space_map_t *sm, uint64_t start, uint64_t size); +extern int space_map_contains(space_map_t *sm, uint64_t start, uint64_t size); +extern void space_map_vacate(space_map_t *sm, + space_map_func_t *func, space_map_t *mdest); +extern void space_map_walk(space_map_t *sm, + space_map_func_t *func, space_map_t *mdest); +extern void space_map_excise(space_map_t *sm, uint64_t start, uint64_t size); +extern void space_map_union(space_map_t *smd, space_map_t *sms); + +extern void space_map_load_wait(space_map_t *sm); +extern int space_map_load(space_map_t *sm, space_map_ops_t *ops, + uint8_t maptype, space_map_obj_t *smo, objset_t *os); +extern void space_map_unload(space_map_t *sm); + +extern uint64_t space_map_alloc(space_map_t *sm, uint64_t size); +extern void space_map_claim(space_map_t *sm, uint64_t start, uint64_t size); +extern void space_map_free(space_map_t *sm, uint64_t start, uint64_t size); + +extern void space_map_sync(space_map_t *sm, uint8_t maptype, + space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx); +extern void space_map_truncate(space_map_obj_t *smo, + objset_t *os, dmu_tx_t *tx); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SPACE_MAP_H */ diff --git a/module/zfs/include/sys/txg.h b/module/zfs/include/sys/txg.h new file mode 100644 index 000000000..23bdff211 --- /dev/null +++ b/module/zfs/include/sys/txg.h @@ -0,0 +1,130 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_TXG_H +#define _SYS_TXG_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/spa.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define TXG_CONCURRENT_STATES 3 /* open, quiescing, syncing */ +#define TXG_SIZE 4 /* next power of 2 */ +#define TXG_MASK (TXG_SIZE - 1) /* mask for size */ +#define TXG_INITIAL TXG_SIZE /* initial txg */ +#define TXG_IDX (txg & TXG_MASK) + +#define TXG_WAIT 1ULL +#define TXG_NOWAIT 2ULL + +typedef struct tx_cpu tx_cpu_t; + +typedef struct txg_handle { + tx_cpu_t *th_cpu; + uint64_t th_txg; +} txg_handle_t; + +typedef struct txg_node { + struct txg_node *tn_next[TXG_SIZE]; + uint8_t tn_member[TXG_SIZE]; +} txg_node_t; + +typedef struct txg_list { + kmutex_t tl_lock; + size_t tl_offset; + txg_node_t *tl_head[TXG_SIZE]; +} txg_list_t; + +struct dsl_pool; + +extern void txg_init(struct dsl_pool *dp, uint64_t txg); +extern void txg_fini(struct dsl_pool *dp); +extern void txg_sync_start(struct dsl_pool *dp); +extern void txg_sync_stop(struct dsl_pool *dp); +extern uint64_t txg_hold_open(struct dsl_pool *dp, txg_handle_t *txghp); +extern void txg_rele_to_quiesce(txg_handle_t *txghp); +extern void txg_rele_to_sync(txg_handle_t *txghp); +extern void txg_suspend(struct dsl_pool *dp); +extern void txg_resume(struct dsl_pool *dp); + +/* + * Delay the caller by the specified number of ticks or until + * the txg closes (whichever comes first). This is intended + * to be used to throttle writers when the system nears its + * capacity. + */ +extern void txg_delay(struct dsl_pool *dp, uint64_t txg, int ticks); + +/* + * Wait until the given transaction group has finished syncing. + * Try to make this happen as soon as possible (eg. kick off any + * necessary syncs immediately). If txg==0, wait for the currently open + * txg to finish syncing. + */ +extern void txg_wait_synced(struct dsl_pool *dp, uint64_t txg); + +/* + * Wait until the given transaction group, or one after it, is + * the open transaction group. Try to make this happen as soon + * as possible (eg. kick off any necessary syncs immediately). + * If txg == 0, wait for the next open txg. + */ +extern void txg_wait_open(struct dsl_pool *dp, uint64_t txg); + +/* + * Returns TRUE if we are "backed up" waiting for the syncing + * transaction to complete; otherwise returns FALSE. + */ +extern boolean_t txg_stalled(struct dsl_pool *dp); + +/* returns TRUE if someone is waiting for the next txg to sync */ +extern boolean_t txg_sync_waiting(struct dsl_pool *dp); + +/* + * Per-txg object lists. + */ + +#define TXG_CLEAN(txg) ((txg) - 1) + +extern void txg_list_create(txg_list_t *tl, size_t offset); +extern void txg_list_destroy(txg_list_t *tl); +extern int txg_list_empty(txg_list_t *tl, uint64_t txg); +extern int txg_list_add(txg_list_t *tl, void *p, uint64_t txg); +extern void *txg_list_remove(txg_list_t *tl, uint64_t txg); +extern void *txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg); +extern int txg_list_member(txg_list_t *tl, void *p, uint64_t txg); +extern void *txg_list_head(txg_list_t *tl, uint64_t txg); +extern void *txg_list_next(txg_list_t *tl, void *p, uint64_t txg); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_TXG_H */ diff --git a/module/zfs/include/sys/txg_impl.h b/module/zfs/include/sys/txg_impl.h new file mode 100644 index 000000000..7413c662b --- /dev/null +++ b/module/zfs/include/sys/txg_impl.h @@ -0,0 +1,73 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_TXG_IMPL_H +#define _SYS_TXG_IMPL_H + +#include <sys/spa.h> +#include <sys/txg.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct tx_cpu { + kmutex_t tc_lock; + kcondvar_t tc_cv[TXG_SIZE]; + uint64_t tc_count[TXG_SIZE]; + char tc_pad[16]; +}; + +typedef struct tx_state { + tx_cpu_t *tx_cpu; /* protects right to enter txg */ + kmutex_t tx_sync_lock; /* protects tx_state_t */ + krwlock_t tx_suspend; + uint64_t tx_open_txg; /* currently open txg id */ + uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */ + uint64_t tx_syncing_txg; /* currently syncing txg id */ + uint64_t tx_synced_txg; /* last synced txg id */ + + uint64_t tx_sync_txg_waiting; /* txg we're waiting to sync */ + uint64_t tx_quiesce_txg_waiting; /* txg we're waiting to open */ + + kcondvar_t tx_sync_more_cv; + kcondvar_t tx_sync_done_cv; + kcondvar_t tx_quiesce_more_cv; + kcondvar_t tx_quiesce_done_cv; + kcondvar_t tx_timeout_cv; + kcondvar_t tx_exit_cv; /* wait for all threads to exit */ + + uint8_t tx_threads; /* number of threads */ + uint8_t tx_exiting; /* set when we're exiting */ + + kthread_t *tx_sync_thread; + kthread_t *tx_quiesce_thread; +} tx_state_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_TXG_IMPL_H */ diff --git a/module/zfs/include/sys/uberblock.h b/module/zfs/include/sys/uberblock.h new file mode 100644 index 000000000..93d936ae4 --- /dev/null +++ b/module/zfs/include/sys/uberblock.h @@ -0,0 +1,50 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_UBERBLOCK_H +#define _SYS_UBERBLOCK_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/spa.h> +#include <sys/vdev.h> +#include <sys/zio.h> +#include <sys/zio_checksum.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct uberblock uberblock_t; + +extern int uberblock_verify(uberblock_t *ub); +extern int uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_UBERBLOCK_H */ diff --git a/module/zfs/include/sys/uberblock_impl.h b/module/zfs/include/sys/uberblock_impl.h new file mode 100644 index 000000000..55a0dd5ae --- /dev/null +++ b/module/zfs/include/sys/uberblock_impl.h @@ -0,0 +1,63 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_UBERBLOCK_IMPL_H +#define _SYS_UBERBLOCK_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/uberblock.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The uberblock version is incremented whenever an incompatible on-disk + * format change is made to the SPA, DMU, or ZAP. + * + * Note: the first two fields should never be moved. When a storage pool + * is opened, the uberblock must be read off the disk before the version + * can be checked. If the ub_version field is moved, we may not detect + * version mismatch. If the ub_magic field is moved, applications that + * expect the magic number in the first word won't work. + */ +#define UBERBLOCK_MAGIC 0x00bab10c /* oo-ba-bloc! */ +#define UBERBLOCK_SHIFT 10 /* up to 1K */ + +struct uberblock { + uint64_t ub_magic; /* UBERBLOCK_MAGIC */ + uint64_t ub_version; /* SPA_VERSION */ + uint64_t ub_txg; /* txg of last sync */ + uint64_t ub_guid_sum; /* sum of all vdev guids */ + uint64_t ub_timestamp; /* UTC time of last sync */ + blkptr_t ub_rootbp; /* MOS objset_phys_t */ +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_UBERBLOCK_IMPL_H */ diff --git a/module/zfs/include/sys/unique.h b/module/zfs/include/sys/unique.h new file mode 100644 index 000000000..2ef3093ed --- /dev/null +++ b/module/zfs/include/sys/unique.h @@ -0,0 +1,59 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_UNIQUE_H +#define _SYS_UNIQUE_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* The number of significant bits in each unique value. */ +#define UNIQUE_BITS 56 + +void unique_init(void); +void unique_fini(void); + +/* + * Return a new unique value (which will not be uniquified against until + * it is unique_insert()-ed. + */ +uint64_t unique_create(void); + +/* Return a unique value, which equals the one passed in if possible. */ +uint64_t unique_insert(uint64_t value); + +/* Indicate that this value no longer needs to be uniquified against. */ +void unique_remove(uint64_t value); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_UNIQUE_H */ diff --git a/module/zfs/include/sys/vdev.h b/module/zfs/include/sys/vdev.h new file mode 100644 index 000000000..c070d6f3d --- /dev/null +++ b/module/zfs/include/sys/vdev.h @@ -0,0 +1,135 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_VDEV_H +#define _SYS_VDEV_H + +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/dmu.h> +#include <sys/space_map.h> +#include <sys/fs/zfs.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern boolean_t zfs_nocacheflush; + +extern int vdev_open(vdev_t *); +extern int vdev_validate(vdev_t *); +extern void vdev_close(vdev_t *); +extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace); +extern void vdev_init(vdev_t *, uint64_t txg); +extern void vdev_reopen(vdev_t *); +extern int vdev_validate_aux(vdev_t *vd); +extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio); + +extern boolean_t vdev_is_bootable(vdev_t *vd); +extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev); +extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid); +extern void vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size); +extern int vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size); +extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, + int scrub_done); +extern boolean_t vdev_resilver_needed(vdev_t *vd, + uint64_t *minp, uint64_t *maxp); + +extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg); +extern void vdev_metaslab_fini(vdev_t *vd); + +extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); +extern void vdev_clear_stats(vdev_t *vd); +extern void vdev_stat_update(zio_t *zio, uint64_t psize); +extern void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, + boolean_t complete); +extern int vdev_getspec(spa_t *spa, uint64_t vdev, char **vdev_spec); +extern void vdev_propagate_state(vdev_t *vd); +extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, + vdev_aux_t aux); + +extern void vdev_space_update(vdev_t *vd, int64_t space_delta, + int64_t alloc_delta, boolean_t update_root); + +extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize); + +extern int vdev_fault(spa_t *spa, uint64_t guid); +extern int vdev_degrade(spa_t *spa, uint64_t guid); +extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, + vdev_state_t *); +extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags); +extern void vdev_clear(spa_t *spa, vdev_t *vd); + +extern boolean_t vdev_is_dead(vdev_t *vd); +extern boolean_t vdev_readable(vdev_t *vd); +extern boolean_t vdev_writeable(vdev_t *vd); +extern boolean_t vdev_allocatable(vdev_t *vd); +extern boolean_t vdev_accessible(vdev_t *vd, zio_t *zio); + +extern void vdev_cache_init(vdev_t *vd); +extern void vdev_cache_fini(vdev_t *vd); +extern int vdev_cache_read(zio_t *zio); +extern void vdev_cache_write(zio_t *zio); +extern void vdev_cache_purge(vdev_t *vd); + +extern void vdev_queue_init(vdev_t *vd); +extern void vdev_queue_fini(vdev_t *vd); +extern zio_t *vdev_queue_io(zio_t *zio); +extern void vdev_queue_io_done(zio_t *zio); + +extern void vdev_config_dirty(vdev_t *vd); +extern void vdev_config_clean(vdev_t *vd); +extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg); + +extern void vdev_state_dirty(vdev_t *vd); +extern void vdev_state_clean(vdev_t *vd); + +extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd, + boolean_t getstats, boolean_t isspare, boolean_t isl2cache); + +/* + * Label routines + */ +struct uberblock; +extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset); +extern int vdev_label_number(uint64_t psise, uint64_t offset); +extern nvlist_t *vdev_label_read_config(vdev_t *vd); +extern void vdev_uberblock_load(zio_t *zio, vdev_t *vd, struct uberblock *ub); + +typedef enum { + VDEV_LABEL_CREATE, /* create/add a new device */ + VDEV_LABEL_REPLACE, /* replace an existing device */ + VDEV_LABEL_SPARE, /* add a new hot spare */ + VDEV_LABEL_REMOVE, /* remove an existing device */ + VDEV_LABEL_L2CACHE /* add an L2ARC cache device */ +} vdev_labeltype_t; + +extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_H */ diff --git a/module/zfs/include/sys/vdev_file.h b/module/zfs/include/sys/vdev_file.h new file mode 100644 index 000000000..cd4967357 --- /dev/null +++ b/module/zfs/include/sys/vdev_file.h @@ -0,0 +1,46 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_VDEV_FILE_H +#define _SYS_VDEV_FILE_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/vdev.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct vdev_file { + vnode_t *vf_vnode; +} vdev_file_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_FILE_H */ diff --git a/module/zfs/include/sys/vdev_impl.h b/module/zfs/include/sys/vdev_impl.h new file mode 100644 index 000000000..26904d089 --- /dev/null +++ b/module/zfs/include/sys/vdev_impl.h @@ -0,0 +1,305 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_VDEV_IMPL_H +#define _SYS_VDEV_IMPL_H + +#include <sys/avl.h> +#include <sys/dmu.h> +#include <sys/metaslab.h> +#include <sys/nvpair.h> +#include <sys/space_map.h> +#include <sys/vdev.h> +#include <sys/dkio.h> +#include <sys/uberblock_impl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Virtual device descriptors. + * + * All storage pool operations go through the virtual device framework, + * which provides data replication and I/O scheduling. + */ + +/* + * Forward declarations that lots of things need. + */ +typedef struct vdev_queue vdev_queue_t; +typedef struct vdev_cache vdev_cache_t; +typedef struct vdev_cache_entry vdev_cache_entry_t; + +/* + * Virtual device operations + */ +typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *ashift); +typedef void vdev_close_func_t(vdev_t *vd); +typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize); +typedef int vdev_io_start_func_t(zio_t *zio); +typedef void vdev_io_done_func_t(zio_t *zio); +typedef void vdev_state_change_func_t(vdev_t *vd, int, int); + +typedef struct vdev_ops { + vdev_open_func_t *vdev_op_open; + vdev_close_func_t *vdev_op_close; + vdev_asize_func_t *vdev_op_asize; + vdev_io_start_func_t *vdev_op_io_start; + vdev_io_done_func_t *vdev_op_io_done; + vdev_state_change_func_t *vdev_op_state_change; + char vdev_op_type[16]; + boolean_t vdev_op_leaf; +} vdev_ops_t; + +/* + * Virtual device properties + */ +struct vdev_cache_entry { + char *ve_data; + uint64_t ve_offset; + uint64_t ve_lastused; + avl_node_t ve_offset_node; + avl_node_t ve_lastused_node; + uint32_t ve_hits; + uint16_t ve_missed_update; + zio_t *ve_fill_io; +}; + +struct vdev_cache { + avl_tree_t vc_offset_tree; + avl_tree_t vc_lastused_tree; + kmutex_t vc_lock; +}; + +struct vdev_queue { + avl_tree_t vq_deadline_tree; + avl_tree_t vq_read_tree; + avl_tree_t vq_write_tree; + avl_tree_t vq_pending_tree; + kmutex_t vq_lock; +}; + +/* + * Virtual device descriptor + */ +struct vdev { + /* + * Common to all vdev types. + */ + uint64_t vdev_id; /* child number in vdev parent */ + uint64_t vdev_guid; /* unique ID for this vdev */ + uint64_t vdev_guid_sum; /* self guid + all child guids */ + uint64_t vdev_asize; /* allocatable device capacity */ + uint64_t vdev_ashift; /* block alignment shift */ + uint64_t vdev_state; /* see VDEV_STATE_* #defines */ + uint64_t vdev_prevstate; /* used when reopening a vdev */ + vdev_ops_t *vdev_ops; /* vdev operations */ + spa_t *vdev_spa; /* spa for this vdev */ + void *vdev_tsd; /* type-specific data */ + vdev_t *vdev_top; /* top-level vdev */ + vdev_t *vdev_parent; /* parent vdev */ + vdev_t **vdev_child; /* array of children */ + uint64_t vdev_children; /* number of children */ + space_map_t vdev_dtl_map; /* dirty time log in-core state */ + space_map_t vdev_dtl_scrub; /* DTL for scrub repair writes */ + vdev_stat_t vdev_stat; /* virtual device statistics */ + + /* + * Top-level vdev state. + */ + uint64_t vdev_ms_array; /* metaslab array object */ + uint64_t vdev_ms_shift; /* metaslab size shift */ + uint64_t vdev_ms_count; /* number of metaslabs */ + metaslab_group_t *vdev_mg; /* metaslab group */ + metaslab_t **vdev_ms; /* metaslab array */ + txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */ + txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */ + txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ + boolean_t vdev_remove_wanted; /* async remove wanted? */ + boolean_t vdev_probe_wanted; /* async probe wanted? */ + list_node_t vdev_config_dirty_node; /* config dirty list */ + list_node_t vdev_state_dirty_node; /* state dirty list */ + uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */ + uint64_t vdev_islog; /* is an intent log device */ + + /* + * Leaf vdev state. + */ + uint64_t vdev_psize; /* physical device capacity */ + space_map_obj_t vdev_dtl; /* dirty time log on-disk state */ + txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */ + uint64_t vdev_wholedisk; /* true if this is a whole disk */ + uint64_t vdev_offline; /* persistent offline state */ + uint64_t vdev_faulted; /* persistent faulted state */ + uint64_t vdev_degraded; /* persistent degraded state */ + uint64_t vdev_removed; /* persistent removed state */ + uint64_t vdev_nparity; /* number of parity devices for raidz */ + char *vdev_path; /* vdev path (if any) */ + char *vdev_devid; /* vdev devid (if any) */ + char *vdev_physpath; /* vdev device path (if any) */ + uint64_t vdev_not_present; /* not present during import */ + uint64_t vdev_unspare; /* unspare when resilvering done */ + hrtime_t vdev_last_try; /* last reopen time */ + boolean_t vdev_nowritecache; /* true if flushwritecache failed */ + boolean_t vdev_checkremove; /* temporary online test */ + boolean_t vdev_forcefault; /* force online fault */ + uint8_t vdev_tmpoffline; /* device taken offline temporarily? */ + uint8_t vdev_detached; /* device detached? */ + uint8_t vdev_cant_read; /* vdev is failing all reads */ + uint8_t vdev_cant_write; /* vdev is failing all writes */ + uint64_t vdev_isspare; /* was a hot spare */ + uint64_t vdev_isl2cache; /* was a l2cache device */ + vdev_queue_t vdev_queue; /* I/O deadline schedule queue */ + vdev_cache_t vdev_cache; /* physical block cache */ + spa_aux_vdev_t *vdev_aux; /* for l2cache vdevs */ + zio_t *vdev_probe_zio; /* root of current probe */ + + /* + * For DTrace to work in userland (libzpool) context, these fields must + * remain at the end of the structure. DTrace will use the kernel's + * CTF definition for 'struct vdev', and since the size of a kmutex_t is + * larger in userland, the offsets for the rest fields would be + * incorrect. + */ + kmutex_t vdev_dtl_lock; /* vdev_dtl_{map,resilver} */ + kmutex_t vdev_stat_lock; /* vdev_stat */ + kmutex_t vdev_probe_lock; /* protects vdev_probe_zio */ +}; + +#define VDEV_SKIP_SIZE (8 << 10) +#define VDEV_BOOT_HEADER_SIZE (8 << 10) +#define VDEV_PHYS_SIZE (112 << 10) +#define VDEV_UBERBLOCK_RING (128 << 10) + +#define VDEV_UBERBLOCK_SHIFT(vd) \ + MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT) +#define VDEV_UBERBLOCK_COUNT(vd) \ + (VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd)) +#define VDEV_UBERBLOCK_OFFSET(vd, n) \ + offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)]) +#define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd)) + +/* ZFS boot block */ +#define VDEV_BOOT_MAGIC 0x2f5b007b10cULL +#define VDEV_BOOT_VERSION 1 /* version number */ + +typedef struct vdev_boot_header { + uint64_t vb_magic; /* VDEV_BOOT_MAGIC */ + uint64_t vb_version; /* VDEV_BOOT_VERSION */ + uint64_t vb_offset; /* start offset (bytes) */ + uint64_t vb_size; /* size (bytes) */ + char vb_pad[VDEV_BOOT_HEADER_SIZE - 4 * sizeof (uint64_t)]; +} vdev_boot_header_t; + +typedef struct vdev_phys { + char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_block_tail_t)]; + zio_block_tail_t vp_zbt; +} vdev_phys_t; + +typedef struct vdev_label { + char vl_pad[VDEV_SKIP_SIZE]; /* 8K */ + vdev_boot_header_t vl_boot_header; /* 8K */ + vdev_phys_t vl_vdev_phys; /* 112K */ + char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */ +} vdev_label_t; /* 256K total */ + +/* + * vdev_dirty() flags + */ +#define VDD_METASLAB 0x01 +#define VDD_DTL 0x02 + +/* + * Size and offset of embedded boot loader region on each label. + * The total size of the first two labels plus the boot area is 4MB. + */ +#define VDEV_BOOT_OFFSET (2 * sizeof (vdev_label_t)) +#define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */ + +/* + * Size of label regions at the start and end of each leaf device. + */ +#define VDEV_LABEL_START_SIZE (2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE) +#define VDEV_LABEL_END_SIZE (2 * sizeof (vdev_label_t)) +#define VDEV_LABELS 4 + +#define VDEV_ALLOC_LOAD 0 +#define VDEV_ALLOC_ADD 1 +#define VDEV_ALLOC_SPARE 2 +#define VDEV_ALLOC_L2CACHE 3 + +/* + * Allocate or free a vdev + */ +extern int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *config, + vdev_t *parent, uint_t id, int alloctype); +extern void vdev_free(vdev_t *vd); + +/* + * Add or remove children and parents + */ +extern void vdev_add_child(vdev_t *pvd, vdev_t *cvd); +extern void vdev_remove_child(vdev_t *pvd, vdev_t *cvd); +extern void vdev_compact_children(vdev_t *pvd); +extern vdev_t *vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops); +extern void vdev_remove_parent(vdev_t *cvd); + +/* + * vdev sync load and sync + */ +extern void vdev_load(vdev_t *vd); +extern void vdev_sync(vdev_t *vd, uint64_t txg); +extern void vdev_sync_done(vdev_t *vd, uint64_t txg); +extern void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg); + +/* + * Available vdev types. + */ +extern vdev_ops_t vdev_root_ops; +extern vdev_ops_t vdev_mirror_ops; +extern vdev_ops_t vdev_replacing_ops; +extern vdev_ops_t vdev_raidz_ops; +extern vdev_ops_t vdev_disk_ops; +extern vdev_ops_t vdev_file_ops; +extern vdev_ops_t vdev_missing_ops; +extern vdev_ops_t vdev_spare_ops; + +/* + * Common size functions + */ +extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize); +extern uint64_t vdev_get_rsize(vdev_t *vd); + +/* + * zdb uses this tunable, so it must be declared here to make lint happy. + */ +extern int zfs_vdev_cache_size; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_IMPL_H */ diff --git a/module/zfs/include/sys/zap.h b/module/zfs/include/sys/zap.h new file mode 100644 index 000000000..f88cc068b --- /dev/null +++ b/module/zfs/include/sys/zap.h @@ -0,0 +1,425 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZAP_H +#define _SYS_ZAP_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * ZAP - ZFS Attribute Processor + * + * The ZAP is a module which sits on top of the DMU (Data Management + * Unit) and implements a higher-level storage primitive using DMU + * objects. Its primary consumer is the ZPL (ZFS Posix Layer). + * + * A "zapobj" is a DMU object which the ZAP uses to stores attributes. + * Users should use only zap routines to access a zapobj - they should + * not access the DMU object directly using DMU routines. + * + * The attributes stored in a zapobj are name-value pairs. The name is + * a zero-terminated string of up to ZAP_MAXNAMELEN bytes (including + * terminating NULL). The value is an array of integers, which may be + * 1, 2, 4, or 8 bytes long. The total space used by the array (number + * of integers * integer length) can be up to ZAP_MAXVALUELEN bytes. + * Note that an 8-byte integer value can be used to store the location + * (object number) of another dmu object (which may be itself a zapobj). + * Note that you can use a zero-length attribute to store a single bit + * of information - the attribute is present or not. + * + * The ZAP routines are thread-safe. However, you must observe the + * DMU's restriction that a transaction may not be operated on + * concurrently. + * + * Any of the routines that return an int may return an I/O error (EIO + * or ECHECKSUM). + * + * + * Implementation / Performance Notes: + * + * The ZAP is intended to operate most efficiently on attributes with + * short (49 bytes or less) names and single 8-byte values, for which + * the microzap will be used. The ZAP should be efficient enough so + * that the user does not need to cache these attributes. + * + * The ZAP's locking scheme makes its routines thread-safe. Operations + * on different zapobjs will be processed concurrently. Operations on + * the same zapobj which only read data will be processed concurrently. + * Operations on the same zapobj which modify data will be processed + * concurrently when there are many attributes in the zapobj (because + * the ZAP uses per-block locking - more than 128 * (number of cpus) + * small attributes will suffice). + */ + +/* + * We're using zero-terminated byte strings (ie. ASCII or UTF-8 C + * strings) for the names of attributes, rather than a byte string + * bounded by an explicit length. If some day we want to support names + * in character sets which have embedded zeros (eg. UTF-16, UTF-32), + * we'll have to add routines for using length-bounded strings. + */ + +#include <sys/dmu.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZAP_MAXNAMELEN 256 +#define ZAP_MAXVALUELEN 1024 + +/* + * The matchtype specifies which entry will be accessed. + * MT_EXACT: only find an exact match (non-normalized) + * MT_FIRST: find the "first" normalized (case and Unicode + * form) match; the designated "first" match will not change as long + * as the set of entries with this normalization doesn't change + * MT_BEST: if there is an exact match, find that, otherwise find the + * first normalized match + */ +typedef enum matchtype +{ + MT_EXACT, + MT_BEST, + MT_FIRST +} matchtype_t; + +/* + * Create a new zapobj with no attributes and return its object number. + * MT_EXACT will cause the zap object to only support MT_EXACT lookups, + * otherwise any matchtype can be used for lookups. + * + * normflags specifies what normalization will be done. values are: + * 0: no normalization (legacy on-disk format, supports MT_EXACT matching + * only) + * U8_TEXTPREP_TOLOWER: case normalization will be performed. + * MT_FIRST/MT_BEST matching will find entries that match without + * regard to case (eg. looking for "foo" can find an entry "Foo"). + * Eventually, other flags will permit unicode normalization as well. + */ +uint64_t zap_create(objset_t *ds, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); +uint64_t zap_create_norm(objset_t *ds, int normflags, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); + +/* + * Create a new zapobj with no attributes from the given (unallocated) + * object number. + */ +int zap_create_claim(objset_t *ds, uint64_t obj, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); +int zap_create_claim_norm(objset_t *ds, uint64_t obj, + int normflags, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); + +/* + * The zapobj passed in must be a valid ZAP object for all of the + * following routines. + */ + +/* + * Destroy this zapobj and all its attributes. + * + * Frees the object number using dmu_object_free. + */ +int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx); + +/* + * Manipulate attributes. + * + * 'integer_size' is in bytes, and must be 1, 2, 4, or 8. + */ + +/* + * Retrieve the contents of the attribute with the given name. + * + * If the requested attribute does not exist, the call will fail and + * return ENOENT. + * + * If 'integer_size' is smaller than the attribute's integer size, the + * call will fail and return EINVAL. + * + * If 'integer_size' is equal to or larger than the attribute's integer + * size, the call will succeed and return 0. * When converting to a + * larger integer size, the integers will be treated as unsigned (ie. no + * sign-extension will be performed). + * + * 'num_integers' is the length (in integers) of 'buf'. + * + * If the attribute is longer than the buffer, as many integers as will + * fit will be transferred to 'buf'. If the entire attribute was not + * transferred, the call will return EOVERFLOW. + * + * If rn_len is nonzero, realname will be set to the name of the found + * entry (which may be different from the requested name if matchtype is + * not MT_EXACT). + * + * If normalization_conflictp is not NULL, it will be set if there is + * another name with the same case/unicode normalized form. + */ +int zap_lookup(objset_t *ds, uint64_t zapobj, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf); +int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf, + matchtype_t mt, char *realname, int rn_len, + boolean_t *normalization_conflictp); + +/* + * Create an attribute with the given name and value. + * + * If an attribute with the given name already exists, the call will + * fail and return EEXIST. + */ +int zap_add(objset_t *ds, uint64_t zapobj, const char *name, + int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx); + +/* + * Set the attribute with the given name to the given value. If an + * attribute with the given name does not exist, it will be created. If + * an attribute with the given name already exists, the previous value + * will be overwritten. The integer_size may be different from the + * existing attribute's integer size, in which case the attribute's + * integer size will be updated to the new value. + */ +int zap_update(objset_t *ds, uint64_t zapobj, const char *name, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); + +/* + * Get the length (in integers) and the integer size of the specified + * attribute. + * + * If the requested attribute does not exist, the call will fail and + * return ENOENT. + */ +int zap_length(objset_t *ds, uint64_t zapobj, const char *name, + uint64_t *integer_size, uint64_t *num_integers); + +/* + * Remove the specified attribute. + * + * If the specified attribute does not exist, the call will fail and + * return ENOENT. + */ +int zap_remove(objset_t *ds, uint64_t zapobj, const char *name, dmu_tx_t *tx); +int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name, + matchtype_t mt, dmu_tx_t *tx); + +/* + * Returns (in *count) the number of attributes in the specified zap + * object. + */ +int zap_count(objset_t *ds, uint64_t zapobj, uint64_t *count); + + +/* + * Returns (in name) the name of the entry whose (value & mask) + * (za_first_integer) is value, or ENOENT if not found. The string + * pointed to by name must be at least 256 bytes long. If mask==0, the + * match must be exact (ie, same as mask=-1ULL). + */ +int zap_value_search(objset_t *os, uint64_t zapobj, + uint64_t value, uint64_t mask, char *name); + +/* + * Transfer all the entries from fromobj into intoobj. Only works on + * int_size=8 num_integers=1 values. Fails if there are any duplicated + * entries. + */ +int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx); + +/* + * Manipulate entries where the name + value are the "same" (the name is + * a stringified version of the value). + */ +int zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx); +int zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx); +int zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value); + +struct zap; +struct zap_leaf; +typedef struct zap_cursor { + /* This structure is opaque! */ + objset_t *zc_objset; + struct zap *zc_zap; + struct zap_leaf *zc_leaf; + uint64_t zc_zapobj; + uint64_t zc_hash; + uint32_t zc_cd; +} zap_cursor_t; + +typedef struct { + int za_integer_length; + /* + * za_normalization_conflict will be set if there are additional + * entries with this normalized form (eg, "foo" and "Foo"). + */ + boolean_t za_normalization_conflict; + uint64_t za_num_integers; + uint64_t za_first_integer; /* no sign extension for <8byte ints */ + char za_name[MAXNAMELEN]; +} zap_attribute_t; + +/* + * The interface for listing all the attributes of a zapobj can be + * thought of as cursor moving down a list of the attributes one by + * one. The cookie returned by the zap_cursor_serialize routine is + * persistent across system calls (and across reboot, even). + */ + +/* + * Initialize a zap cursor, pointing to the "first" attribute of the + * zapobj. You must _fini the cursor when you are done with it. + */ +void zap_cursor_init(zap_cursor_t *zc, objset_t *ds, uint64_t zapobj); +void zap_cursor_fini(zap_cursor_t *zc); + +/* + * Get the attribute currently pointed to by the cursor. Returns + * ENOENT if at the end of the attributes. + */ +int zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za); + +/* + * Advance the cursor to the next attribute. + */ +void zap_cursor_advance(zap_cursor_t *zc); + +/* + * Get a persistent cookie pointing to the current position of the zap + * cursor. The low 4 bits in the cookie are always zero, and thus can + * be used as to differentiate a serialized cookie from a different type + * of value. The cookie will be less than 2^32 as long as there are + * fewer than 2^22 (4.2 million) entries in the zap object. + */ +uint64_t zap_cursor_serialize(zap_cursor_t *zc); + +/* + * Initialize a zap cursor pointing to the position recorded by + * zap_cursor_serialize (in the "serialized" argument). You can also + * use a "serialized" argument of 0 to start at the beginning of the + * zapobj (ie. zap_cursor_init_serialized(..., 0) is equivalent to + * zap_cursor_init(...).) + */ +void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *ds, + uint64_t zapobj, uint64_t serialized); + + +#define ZAP_HISTOGRAM_SIZE 10 + +typedef struct zap_stats { + /* + * Size of the pointer table (in number of entries). + * This is always a power of 2, or zero if it's a microzap. + * In general, it should be considerably greater than zs_num_leafs. + */ + uint64_t zs_ptrtbl_len; + + uint64_t zs_blocksize; /* size of zap blocks */ + + /* + * The number of blocks used. Note that some blocks may be + * wasted because old ptrtbl's and large name/value blocks are + * not reused. (Although their space is reclaimed, we don't + * reuse those offsets in the object.) + */ + uint64_t zs_num_blocks; + + /* + * Pointer table values from zap_ptrtbl in the zap_phys_t + */ + uint64_t zs_ptrtbl_nextblk; /* next (larger) copy start block */ + uint64_t zs_ptrtbl_blks_copied; /* number source blocks copied */ + uint64_t zs_ptrtbl_zt_blk; /* starting block number */ + uint64_t zs_ptrtbl_zt_numblks; /* number of blocks */ + uint64_t zs_ptrtbl_zt_shift; /* bits to index it */ + + /* + * Values of the other members of the zap_phys_t + */ + uint64_t zs_block_type; /* ZBT_HEADER */ + uint64_t zs_magic; /* ZAP_MAGIC */ + uint64_t zs_num_leafs; /* The number of leaf blocks */ + uint64_t zs_num_entries; /* The number of zap entries */ + uint64_t zs_salt; /* salt to stir into hash function */ + + /* + * Histograms. For all histograms, the last index + * (ZAP_HISTOGRAM_SIZE-1) includes any values which are greater + * than what can be represented. For example + * zs_leafs_with_n5_entries[ZAP_HISTOGRAM_SIZE-1] is the number + * of leafs with more than 45 entries. + */ + + /* + * zs_leafs_with_n_pointers[n] is the number of leafs with + * 2^n pointers to it. + */ + uint64_t zs_leafs_with_2n_pointers[ZAP_HISTOGRAM_SIZE]; + + /* + * zs_leafs_with_n_entries[n] is the number of leafs with + * [n*5, (n+1)*5) entries. In the current implementation, there + * can be at most 55 entries in any block, but there may be + * fewer if the name or value is large, or the block is not + * completely full. + */ + uint64_t zs_blocks_with_n5_entries[ZAP_HISTOGRAM_SIZE]; + + /* + * zs_leafs_n_tenths_full[n] is the number of leafs whose + * fullness is in the range [n/10, (n+1)/10). + */ + uint64_t zs_blocks_n_tenths_full[ZAP_HISTOGRAM_SIZE]; + + /* + * zs_entries_using_n_chunks[n] is the number of entries which + * consume n 24-byte chunks. (Note, large names/values only use + * one chunk, but contribute to zs_num_blocks_large.) + */ + uint64_t zs_entries_using_n_chunks[ZAP_HISTOGRAM_SIZE]; + + /* + * zs_buckets_with_n_entries[n] is the number of buckets (each + * leaf has 64 buckets) with n entries. + * zs_buckets_with_n_entries[1] should be very close to + * zs_num_entries. + */ + uint64_t zs_buckets_with_n_entries[ZAP_HISTOGRAM_SIZE]; +} zap_stats_t; + +/* + * Get statistics about a ZAP object. Note: you need to be aware of the + * internal implementation of the ZAP to correctly interpret some of the + * statistics. This interface shouldn't be relied on unless you really + * know what you're doing. + */ +int zap_get_stats(objset_t *ds, uint64_t zapobj, zap_stats_t *zs); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZAP_H */ diff --git a/module/zfs/include/sys/zap_impl.h b/module/zfs/include/sys/zap_impl.h new file mode 100644 index 000000000..0dc02ab6b --- /dev/null +++ b/module/zfs/include/sys/zap_impl.h @@ -0,0 +1,218 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZAP_IMPL_H +#define _SYS_ZAP_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zap.h> +#include <sys/zfs_context.h> +#include <sys/avl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern int fzap_default_block_shift; + +#define ZAP_MAGIC 0x2F52AB2ABULL + +#define FZAP_BLOCK_SHIFT(zap) ((zap)->zap_f.zap_block_shift) + +#define ZAP_MAXCD (uint32_t)(-1) +#define ZAP_HASHBITS 28 +#define MZAP_ENT_LEN 64 +#define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2) +#define MZAP_MAX_BLKSHIFT SPA_MAXBLOCKSHIFT +#define MZAP_MAX_BLKSZ (1 << MZAP_MAX_BLKSHIFT) + +typedef struct mzap_ent_phys { + uint64_t mze_value; + uint32_t mze_cd; + uint16_t mze_pad; /* in case we want to chain them someday */ + char mze_name[MZAP_NAME_LEN]; +} mzap_ent_phys_t; + +typedef struct mzap_phys { + uint64_t mz_block_type; /* ZBT_MICRO */ + uint64_t mz_salt; + uint64_t mz_normflags; + uint64_t mz_pad[5]; + mzap_ent_phys_t mz_chunk[1]; + /* actually variable size depending on block size */ +} mzap_phys_t; + +typedef struct mzap_ent { + avl_node_t mze_node; + int mze_chunkid; + uint64_t mze_hash; + mzap_ent_phys_t mze_phys; +} mzap_ent_t; + + +/* + * The (fat) zap is stored in one object. It is an array of + * 1<<FZAP_BLOCK_SHIFT byte blocks. The layout looks like one of: + * + * ptrtbl fits in first block: + * [zap_phys_t zap_ptrtbl_shift < 6] [zap_leaf_t] ... + * + * ptrtbl too big for first block: + * [zap_phys_t zap_ptrtbl_shift >= 6] [zap_leaf_t] [ptrtbl] ... + * + */ + +struct dmu_buf; +struct zap_leaf; + +#define ZBT_LEAF ((1ULL << 63) + 0) +#define ZBT_HEADER ((1ULL << 63) + 1) +#define ZBT_MICRO ((1ULL << 63) + 3) +/* any other values are ptrtbl blocks */ + +/* + * the embedded pointer table takes up half a block: + * block size / entry size (2^3) / 2 + */ +#define ZAP_EMBEDDED_PTRTBL_SHIFT(zap) (FZAP_BLOCK_SHIFT(zap) - 3 - 1) + +/* + * The embedded pointer table starts half-way through the block. Since + * the pointer table itself is half the block, it starts at (64-bit) + * word number (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)). + */ +#define ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) \ + ((uint64_t *)(zap)->zap_f.zap_phys) \ + [(idx) + (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap))] + +/* + * TAKE NOTE: + * If zap_phys_t is modified, zap_byteswap() must be modified. + */ +typedef struct zap_phys { + uint64_t zap_block_type; /* ZBT_HEADER */ + uint64_t zap_magic; /* ZAP_MAGIC */ + + struct zap_table_phys { + uint64_t zt_blk; /* starting block number */ + uint64_t zt_numblks; /* number of blocks */ + uint64_t zt_shift; /* bits to index it */ + uint64_t zt_nextblk; /* next (larger) copy start block */ + uint64_t zt_blks_copied; /* number source blocks copied */ + } zap_ptrtbl; + + uint64_t zap_freeblk; /* the next free block */ + uint64_t zap_num_leafs; /* number of leafs */ + uint64_t zap_num_entries; /* number of entries */ + uint64_t zap_salt; /* salt to stir into hash function */ + uint64_t zap_normflags; /* flags for u8_textprep_str() */ + /* + * This structure is followed by padding, and then the embedded + * pointer table. The embedded pointer table takes up second + * half of the block. It is accessed using the + * ZAP_EMBEDDED_PTRTBL_ENT() macro. + */ +} zap_phys_t; + +typedef struct zap_table_phys zap_table_phys_t; + +typedef struct zap { + objset_t *zap_objset; + uint64_t zap_object; + struct dmu_buf *zap_dbuf; + krwlock_t zap_rwlock; + boolean_t zap_ismicro; + int zap_normflags; + uint64_t zap_salt; + union { + struct { + zap_phys_t *zap_phys; + + /* + * zap_num_entries_mtx protects + * zap_num_entries + */ + kmutex_t zap_num_entries_mtx; + int zap_block_shift; + } zap_fat; + struct { + mzap_phys_t *zap_phys; + int16_t zap_num_entries; + int16_t zap_num_chunks; + int16_t zap_alloc_next; + avl_tree_t zap_avl; + } zap_micro; + } zap_u; +} zap_t; + +typedef struct zap_name { + zap_t *zn_zap; + const char *zn_name_orij; + uint64_t zn_hash; + matchtype_t zn_matchtype; + const char *zn_name_norm; + char zn_normbuf[ZAP_MAXNAMELEN]; +} zap_name_t; + +#define zap_f zap_u.zap_fat +#define zap_m zap_u.zap_micro + +boolean_t zap_match(zap_name_t *zn, const char *matchname); +int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, + krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp); +void zap_unlockdir(zap_t *zap); +void zap_evict(dmu_buf_t *db, void *vmzap); +zap_name_t *zap_name_alloc(zap_t *zap, const char *name, matchtype_t mt); +void zap_name_free(zap_name_t *zn); + +#define ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n)))) + +void fzap_byteswap(void *buf, size_t size); +int fzap_count(zap_t *zap, uint64_t *count); +int fzap_lookup(zap_name_t *zn, + uint64_t integer_size, uint64_t num_integers, void *buf, + char *realname, int rn_len, boolean_t *normalization_conflictp); +int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx); +int fzap_update(zap_name_t *zn, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); +int fzap_length(zap_name_t *zn, + uint64_t *integer_size, uint64_t *num_integers); +int fzap_remove(zap_name_t *zn, dmu_tx_t *tx); +int fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za); +void fzap_get_stats(zap_t *zap, zap_stats_t *zs); +void zap_put_leaf(struct zap_leaf *l); + +int fzap_add_cd(zap_name_t *zn, + uint64_t integer_size, uint64_t num_integers, + const void *val, uint32_t cd, dmu_tx_t *tx); +void fzap_upgrade(zap_t *zap, dmu_tx_t *tx); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZAP_IMPL_H */ diff --git a/module/zfs/include/sys/zap_leaf.h b/module/zfs/include/sys/zap_leaf.h new file mode 100644 index 000000000..14144e059 --- /dev/null +++ b/module/zfs/include/sys/zap_leaf.h @@ -0,0 +1,244 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZAP_LEAF_H +#define _SYS_ZAP_LEAF_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +struct zap; + +#define ZAP_LEAF_MAGIC 0x2AB1EAF + +/* chunk size = 24 bytes */ +#define ZAP_LEAF_CHUNKSIZE 24 + +/* + * The amount of space available for chunks is: + * block size (1<<l->l_bs) - hash entry size (2) * number of hash + * entries - header space (2*chunksize) + */ +#define ZAP_LEAF_NUMCHUNKS(l) \ + (((1<<(l)->l_bs) - 2*ZAP_LEAF_HASH_NUMENTRIES(l)) / \ + ZAP_LEAF_CHUNKSIZE - 2) + +/* + * The amount of space within the chunk available for the array is: + * chunk size - space for type (1) - space for next pointer (2) + */ +#define ZAP_LEAF_ARRAY_BYTES (ZAP_LEAF_CHUNKSIZE - 3) + +#define ZAP_LEAF_ARRAY_NCHUNKS(bytes) \ + (((bytes)+ZAP_LEAF_ARRAY_BYTES-1)/ZAP_LEAF_ARRAY_BYTES) + +/* + * Low water mark: when there are only this many chunks free, start + * growing the ptrtbl. Ideally, this should be larger than a + * "reasonably-sized" entry. 20 chunks is more than enough for the + * largest directory entry (MAXNAMELEN (256) byte name, 8-byte value), + * while still being only around 3% for 16k blocks. + */ +#define ZAP_LEAF_LOW_WATER (20) + +/* + * The leaf hash table has block size / 2^5 (32) number of entries, + * which should be more than enough for the maximum number of entries, + * which is less than block size / CHUNKSIZE (24) / minimum number of + * chunks per entry (3). + */ +#define ZAP_LEAF_HASH_SHIFT(l) ((l)->l_bs - 5) +#define ZAP_LEAF_HASH_NUMENTRIES(l) (1 << ZAP_LEAF_HASH_SHIFT(l)) + +/* + * The chunks start immediately after the hash table. The end of the + * hash table is at l_hash + HASH_NUMENTRIES, which we simply cast to a + * chunk_t. + */ +#define ZAP_LEAF_CHUNK(l, idx) \ + ((zap_leaf_chunk_t *) \ + ((l)->l_phys->l_hash + ZAP_LEAF_HASH_NUMENTRIES(l)))[idx] +#define ZAP_LEAF_ENTRY(l, idx) (&ZAP_LEAF_CHUNK(l, idx).l_entry) + +typedef enum zap_chunk_type { + ZAP_CHUNK_FREE = 253, + ZAP_CHUNK_ENTRY = 252, + ZAP_CHUNK_ARRAY = 251, + ZAP_CHUNK_TYPE_MAX = 250 +} zap_chunk_type_t; + +#define ZLF_ENTRIES_CDSORTED (1<<0) + +/* + * TAKE NOTE: + * If zap_leaf_phys_t is modified, zap_leaf_byteswap() must be modified. + */ +typedef struct zap_leaf_phys { + struct zap_leaf_header { + uint64_t lh_block_type; /* ZBT_LEAF */ + uint64_t lh_pad1; + uint64_t lh_prefix; /* hash prefix of this leaf */ + uint32_t lh_magic; /* ZAP_LEAF_MAGIC */ + uint16_t lh_nfree; /* number free chunks */ + uint16_t lh_nentries; /* number of entries */ + uint16_t lh_prefix_len; /* num bits used to id this */ + +/* above is accessable to zap, below is zap_leaf private */ + + uint16_t lh_freelist; /* chunk head of free list */ + uint8_t lh_flags; /* ZLF_* flags */ + uint8_t lh_pad2[11]; + } l_hdr; /* 2 24-byte chunks */ + + /* + * The header is followed by a hash table with + * ZAP_LEAF_HASH_NUMENTRIES(zap) entries. The hash table is + * followed by an array of ZAP_LEAF_NUMCHUNKS(zap) + * zap_leaf_chunk structures. These structures are accessed + * with the ZAP_LEAF_CHUNK() macro. + */ + + uint16_t l_hash[1]; +} zap_leaf_phys_t; + +typedef union zap_leaf_chunk { + struct zap_leaf_entry { + uint8_t le_type; /* always ZAP_CHUNK_ENTRY */ + uint8_t le_int_size; /* size of ints */ + uint16_t le_next; /* next entry in hash chain */ + uint16_t le_name_chunk; /* first chunk of the name */ + uint16_t le_name_length; /* bytes in name, incl null */ + uint16_t le_value_chunk; /* first chunk of the value */ + uint16_t le_value_length; /* value length in ints */ + uint32_t le_cd; /* collision differentiator */ + uint64_t le_hash; /* hash value of the name */ + } l_entry; + struct zap_leaf_array { + uint8_t la_type; /* always ZAP_CHUNK_ARRAY */ + uint8_t la_array[ZAP_LEAF_ARRAY_BYTES]; + uint16_t la_next; /* next blk or CHAIN_END */ + } l_array; + struct zap_leaf_free { + uint8_t lf_type; /* always ZAP_CHUNK_FREE */ + uint8_t lf_pad[ZAP_LEAF_ARRAY_BYTES]; + uint16_t lf_next; /* next in free list, or CHAIN_END */ + } l_free; +} zap_leaf_chunk_t; + +typedef struct zap_leaf { + krwlock_t l_rwlock; + uint64_t l_blkid; /* 1<<ZAP_BLOCK_SHIFT byte block off */ + int l_bs; /* block size shift */ + dmu_buf_t *l_dbuf; + zap_leaf_phys_t *l_phys; +} zap_leaf_t; + + +typedef struct zap_entry_handle { + /* below is set by zap_leaf.c and is public to zap.c */ + uint64_t zeh_num_integers; + uint64_t zeh_hash; + uint32_t zeh_cd; + uint8_t zeh_integer_size; + + /* below is private to zap_leaf.c */ + uint16_t zeh_fakechunk; + uint16_t *zeh_chunkp; + zap_leaf_t *zeh_leaf; +} zap_entry_handle_t; + +/* + * Return a handle to the named entry, or ENOENT if not found. The hash + * value must equal zap_hash(name). + */ +extern int zap_leaf_lookup(zap_leaf_t *l, + zap_name_t *zn, zap_entry_handle_t *zeh); + +/* + * Return a handle to the entry with this hash+cd, or the entry with the + * next closest hash+cd. + */ +extern int zap_leaf_lookup_closest(zap_leaf_t *l, + uint64_t hash, uint32_t cd, zap_entry_handle_t *zeh); + +/* + * Read the first num_integers in the attribute. Integer size + * conversion will be done without sign extension. Return EINVAL if + * integer_size is too small. Return EOVERFLOW if there are more than + * num_integers in the attribute. + */ +extern int zap_entry_read(const zap_entry_handle_t *zeh, + uint8_t integer_size, uint64_t num_integers, void *buf); + +extern int zap_entry_read_name(const zap_entry_handle_t *zeh, + uint16_t buflen, char *buf); + +/* + * Replace the value of an existing entry. + * + * zap_entry_update may fail if it runs out of space (ENOSPC). + */ +extern int zap_entry_update(zap_entry_handle_t *zeh, + uint8_t integer_size, uint64_t num_integers, const void *buf); + +/* + * Remove an entry. + */ +extern void zap_entry_remove(zap_entry_handle_t *zeh); + +/* + * Create an entry. An equal entry must not exist, and this entry must + * belong in this leaf (according to its hash value). Fills in the + * entry handle on success. Returns 0 on success or ENOSPC on failure. + */ +extern int zap_entry_create(zap_leaf_t *l, + const char *name, uint64_t h, uint32_t cd, + uint8_t integer_size, uint64_t num_integers, const void *buf, + zap_entry_handle_t *zeh); + +/* + * Return true if there are additional entries with the same normalized + * form. + */ +extern boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh, + zap_name_t *zn, const char *name, zap_t *zap); + +/* + * Other stuff. + */ + +extern void zap_leaf_init(zap_leaf_t *l, boolean_t sort); +extern void zap_leaf_byteswap(zap_leaf_phys_t *buf, int len); +extern void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort); +extern void zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZAP_LEAF_H */ diff --git a/module/zfs/include/sys/zfs_acl.h b/module/zfs/include/sys/zfs_acl.h new file mode 100644 index 000000000..bd91b33d1 --- /dev/null +++ b/module/zfs/include/sys/zfs_acl.h @@ -0,0 +1,214 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_FS_ZFS_ACL_H +#define _SYS_FS_ZFS_ACL_H + +#ifdef _KERNEL +#include <sys/isa_defs.h> +#include <sys/types32.h> +#endif +#include <sys/acl.h> +#include <sys/dmu.h> +#include <sys/zfs_fuid.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct znode_phys; + +#define ACE_SLOT_CNT 6 +#define ZFS_ACL_VERSION_INITIAL 0ULL +#define ZFS_ACL_VERSION_FUID 1ULL +#define ZFS_ACL_VERSION ZFS_ACL_VERSION_FUID + +/* + * ZFS ACLs are store in various forms. + * Files created with ACL version ZFS_ACL_VERSION_INITIAL + * will all be created with fixed length ACEs of type + * zfs_oldace_t. + * + * Files with ACL version ZFS_ACL_VERSION_FUID will be created + * with various sized ACEs. The abstraction entries will utilize + * zfs_ace_hdr_t, normal user/group entries will use zfs_ace_t + * and some specialized CIFS ACEs will use zfs_object_ace_t. + */ + +/* + * All ACEs have a common hdr. For + * owner@, group@, and everyone@ this is all + * thats needed. + */ +typedef struct zfs_ace_hdr { + uint16_t z_type; + uint16_t z_flags; + uint32_t z_access_mask; +} zfs_ace_hdr_t; + +typedef zfs_ace_hdr_t zfs_ace_abstract_t; + +/* + * Standard ACE + */ +typedef struct zfs_ace { + zfs_ace_hdr_t z_hdr; + uint64_t z_fuid; +} zfs_ace_t; + +/* + * The following type only applies to ACE_ACCESS_ALLOWED|DENIED_OBJECT_ACE_TYPE + * and will only be set/retrieved in a CIFS context. + */ + +typedef struct zfs_object_ace { + zfs_ace_t z_ace; + uint8_t z_object_type[16]; /* object type */ + uint8_t z_inherit_type[16]; /* inherited object type */ +} zfs_object_ace_t; + +typedef struct zfs_oldace { + uint32_t z_fuid; /* "who" */ + uint32_t z_access_mask; /* access mask */ + uint16_t z_flags; /* flags, i.e inheritance */ + uint16_t z_type; /* type of entry allow/deny */ +} zfs_oldace_t; + +typedef struct zfs_acl_phys_v0 { + uint64_t z_acl_extern_obj; /* ext acl pieces */ + uint32_t z_acl_count; /* Number of ACEs */ + uint16_t z_acl_version; /* acl version */ + uint16_t z_acl_pad; /* pad */ + zfs_oldace_t z_ace_data[ACE_SLOT_CNT]; /* 6 standard ACEs */ +} zfs_acl_phys_v0_t; + +#define ZFS_ACE_SPACE (sizeof (zfs_oldace_t) * ACE_SLOT_CNT) + +typedef struct zfs_acl_phys { + uint64_t z_acl_extern_obj; /* ext acl pieces */ + uint32_t z_acl_size; /* Number of bytes in ACL */ + uint16_t z_acl_version; /* acl version */ + uint16_t z_acl_count; /* ace count */ + uint8_t z_ace_data[ZFS_ACE_SPACE]; /* space for embedded ACEs */ +} zfs_acl_phys_t; + + + +typedef struct acl_ops { + uint32_t (*ace_mask_get) (void *acep); /* get access mask */ + void (*ace_mask_set) (void *acep, + uint32_t mask); /* set access mask */ + uint16_t (*ace_flags_get) (void *acep); /* get flags */ + void (*ace_flags_set) (void *acep, + uint16_t flags); /* set flags */ + uint16_t (*ace_type_get)(void *acep); /* get type */ + void (*ace_type_set)(void *acep, + uint16_t type); /* set type */ + uint64_t (*ace_who_get)(void *acep); /* get who/fuid */ + void (*ace_who_set)(void *acep, + uint64_t who); /* set who/fuid */ + size_t (*ace_size)(void *acep); /* how big is this ace */ + size_t (*ace_abstract_size)(void); /* sizeof abstract entry */ + int (*ace_mask_off)(void); /* off of access mask in ace */ + int (*ace_data)(void *acep, void **datap); + /* ptr to data if any */ +} acl_ops_t; + +/* + * A zfs_acl_t structure is composed of a list of zfs_acl_node_t's. + * Each node will have one or more ACEs associated with it. You will + * only have multiple nodes during a chmod operation. Normally only + * one node is required. + */ +typedef struct zfs_acl_node { + list_node_t z_next; /* Next chunk of ACEs */ + void *z_acldata; /* pointer into actual ACE(s) */ + void *z_allocdata; /* pointer to kmem allocated memory */ + size_t z_allocsize; /* Size of blob in bytes */ + size_t z_size; /* length of ACL data */ + int z_ace_count; /* number of ACEs in this acl node */ + int z_ace_idx; /* ace iterator positioned on */ +} zfs_acl_node_t; + +typedef struct zfs_acl { + int z_acl_count; /* Number of ACEs */ + size_t z_acl_bytes; /* Number of bytes in ACL */ + uint_t z_version; /* version of ACL */ + void *z_next_ace; /* pointer to next ACE */ + int z_hints; /* ACL hints (ZFS_INHERIT_ACE ...) */ + zfs_acl_node_t *z_curr_node; /* current node iterator is handling */ + list_t z_acl; /* chunks of ACE data */ + acl_ops_t z_ops; /* ACL operations */ + boolean_t z_has_fuids; /* FUIDs present in ACL? */ +} zfs_acl_t; + +#define ACL_DATA_ALLOCED 0x1 +#define ZFS_ACL_SIZE(aclcnt) (sizeof (ace_t) * (aclcnt)) + +/* + * Property values for acl_mode and acl_inherit. + * + * acl_mode can take discard, noallow, groupmask and passthrough. + * whereas acl_inherit has secure instead of groupmask. + */ + +#define ZFS_ACL_DISCARD 0 +#define ZFS_ACL_NOALLOW 1 +#define ZFS_ACL_GROUPMASK 2 +#define ZFS_ACL_PASSTHROUGH 3 +#define ZFS_ACL_RESTRICTED 4 +#define ZFS_ACL_PASSTHROUGH_X 5 + +struct znode; +struct zfsvfs; +struct zfs_fuid_info; + +#ifdef _KERNEL +void zfs_perm_init(struct znode *, struct znode *, int, vattr_t *, + dmu_tx_t *, cred_t *, zfs_acl_t *, zfs_fuid_info_t **); +int zfs_getacl(struct znode *, vsecattr_t *, boolean_t, cred_t *); +int zfs_setacl(struct znode *, vsecattr_t *, boolean_t, cred_t *); +void zfs_acl_rele(void *); +void zfs_oldace_byteswap(ace_t *, int); +void zfs_ace_byteswap(void *, size_t, boolean_t); +extern int zfs_zaccess(struct znode *, int, int, boolean_t, cred_t *); +extern int zfs_zaccess_rwx(struct znode *, mode_t, int, cred_t *); +extern int zfs_zaccess_unix(struct znode *, mode_t, cred_t *); +extern int zfs_acl_access(struct znode *, int, cred_t *); +int zfs_acl_chmod_setattr(struct znode *, zfs_acl_t **, uint64_t); +int zfs_zaccess_delete(struct znode *, struct znode *, cred_t *); +int zfs_zaccess_rename(struct znode *, struct znode *, + struct znode *, struct znode *, cred_t *cr); +void zfs_acl_free(zfs_acl_t *); +int zfs_vsec_2_aclp(struct zfsvfs *, vtype_t, vsecattr_t *, zfs_acl_t **); +int zfs_aclset_common(struct znode *, zfs_acl_t *, cred_t *, + struct zfs_fuid_info **, dmu_tx_t *); + +#endif + +#ifdef __cplusplus +} +#endif +#endif /* _SYS_FS_ZFS_ACL_H */ diff --git a/module/zfs/include/sys/zfs_context.h b/module/zfs/include/sys/zfs_context.h new file mode 100644 index 000000000..a5be3e130 --- /dev/null +++ b/module/zfs/include/sys/zfs_context.h @@ -0,0 +1,73 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZFS_CONTEXT_H +#define _SYS_ZFS_CONTEXT_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/note.h> +#include <sys/types.h> +#include <sys/t_lock.h> +#include <sys/atomic.h> +#include <sys/sysmacros.h> +#include <sys/bitmap.h> +#include <sys/cmn_err.h> +#include <sys/kmem.h> +#include <sys/taskq.h> +#include <sys/buf.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/cpuvar.h> +#include <sys/kobj.h> +#include <sys/conf.h> +#include <sys/disp.h> +#include <sys/debug.h> +#include <sys/random.h> +#include <sys/byteorder.h> +#include <sys/systm.h> +#include <sys/list.h> +#include <sys/uio.h> +#include <sys/dirent.h> +#include <sys/time.h> +#include <vm/seg_kmem.h> +#include <sys/zone.h> +#include <sys/uio.h> +#include <sys/zfs_debug.h> +#include <sys/sysevent.h> +#include <sys/sysevent/eventdefs.h> +#include <sys/fm/util.h> + +#define CPU_SEQID (CPU->cpu_seqid) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFS_CONTEXT_H */ diff --git a/module/zfs/include/sys/zfs_ctldir.h b/module/zfs/include/sys/zfs_ctldir.h new file mode 100644 index 000000000..ce29625d1 --- /dev/null +++ b/module/zfs/include/sys/zfs_ctldir.h @@ -0,0 +1,74 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _ZFS_CTLDIR_H +#define _ZFS_CTLDIR_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/pathname.h> +#include <sys/vnode.h> +#include <sys/zfs_vfsops.h> +#include <sys/zfs_znode.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZFS_CTLDIR_NAME ".zfs" + +#define zfs_has_ctldir(zdp) \ + ((zdp)->z_id == (zdp)->z_zfsvfs->z_root && \ + ((zdp)->z_zfsvfs->z_ctldir != NULL)) +#define zfs_show_ctldir(zdp) \ + (zfs_has_ctldir(zdp) && \ + ((zdp)->z_zfsvfs->z_show_ctldir)) + +void zfsctl_create(zfsvfs_t *); +void zfsctl_destroy(zfsvfs_t *); +vnode_t *zfsctl_root(znode_t *); +void zfsctl_init(void); +void zfsctl_fini(void); + +int zfsctl_rename_snapshot(const char *from, const char *to); +int zfsctl_destroy_snapshot(const char *snapname, int force); +int zfsctl_umount_snapshots(vfs_t *, int, cred_t *); + +int zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp); + +int zfsctl_make_fid(zfsvfs_t *zfsvfsp, uint64_t object, uint32_t gen, + fid_t *fidp); +int zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp); + +#define ZFSCTL_INO_ROOT 0x1 +#define ZFSCTL_INO_SNAPDIR 0x2 + +#ifdef __cplusplus +} +#endif + +#endif /* _ZFS_CTLDIR_H */ diff --git a/module/zfs/include/sys/zfs_debug.h b/module/zfs/include/sys/zfs_debug.h new file mode 100644 index 000000000..450ac1c81 --- /dev/null +++ b/module/zfs/include/sys/zfs_debug.h @@ -0,0 +1,75 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZFS_DEBUG_H +#define _SYS_ZFS_DEBUG_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef TRUE +#define TRUE 1 +#endif + +#ifndef FALSE +#define FALSE 0 +#endif + +/* + * ZFS debugging + */ + +#if defined(DEBUG) || !defined(_KERNEL) +#define ZFS_DEBUG +#endif + +extern int zfs_flags; + +#define ZFS_DEBUG_DPRINTF 0x0001 +#define ZFS_DEBUG_DBUF_VERIFY 0x0002 +#define ZFS_DEBUG_DNODE_VERIFY 0x0004 +#define ZFS_DEBUG_SNAPNAMES 0x0008 +#define ZFS_DEBUG_MODIFY 0x0010 + +#ifdef ZFS_DEBUG +extern void __dprintf(const char *file, const char *func, + int line, const char *fmt, ...); +#define dprintf(...) \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) \ + __dprintf(__FILE__, __func__, __LINE__, __VA_ARGS__) +#else +#define dprintf(...) ((void)0) +#endif /* ZFS_DEBUG */ + +extern void zfs_panic_recover(const char *fmt, ...); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFS_DEBUG_H */ diff --git a/module/zfs/include/sys/zfs_dir.h b/module/zfs/include/sys/zfs_dir.h new file mode 100644 index 000000000..ebb66e8ae --- /dev/null +++ b/module/zfs/include/sys/zfs_dir.h @@ -0,0 +1,76 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_FS_ZFS_DIR_H +#define _SYS_FS_ZFS_DIR_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/pathname.h> +#include <sys/dmu.h> +#include <sys/zfs_znode.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* zfs_dirent_lock() flags */ +#define ZNEW 0x0001 /* entry should not exist */ +#define ZEXISTS 0x0002 /* entry should exist */ +#define ZSHARED 0x0004 /* shared access (zfs_dirlook()) */ +#define ZXATTR 0x0008 /* we want the xattr dir */ +#define ZRENAMING 0x0010 /* znode is being renamed */ +#define ZCILOOK 0x0020 /* case-insensitive lookup requested */ +#define ZCIEXACT 0x0040 /* c-i requires c-s match (rename) */ + +/* mknode flags */ +#define IS_ROOT_NODE 0x01 /* create a root node */ +#define IS_XATTR 0x02 /* create an extended attribute node */ +#define IS_REPLAY 0x04 /* we are replaying intent log */ + +extern int zfs_dirent_lock(zfs_dirlock_t **, znode_t *, char *, znode_t **, + int, int *, pathname_t *); +extern void zfs_dirent_unlock(zfs_dirlock_t *); +extern int zfs_link_create(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int); +extern int zfs_link_destroy(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int, + boolean_t *); +extern int zfs_dirlook(znode_t *, char *, vnode_t **, int, int *, + pathname_t *); +extern void zfs_mknode(znode_t *, vattr_t *, dmu_tx_t *, cred_t *, + uint_t, znode_t **, int, zfs_acl_t *, zfs_fuid_info_t **); +extern void zfs_rmnode(znode_t *); +extern void zfs_dl_name_switch(zfs_dirlock_t *dl, char *new, char **old); +extern boolean_t zfs_dirempty(znode_t *); +extern void zfs_unlinked_add(znode_t *, dmu_tx_t *); +extern void zfs_unlinked_drain(zfsvfs_t *zfsvfs); +extern int zfs_sticky_remove_access(znode_t *, znode_t *, cred_t *cr); +extern int zfs_get_xattrdir(znode_t *, vnode_t **, cred_t *, int); +extern int zfs_make_xattrdir(znode_t *, vattr_t *, vnode_t **, cred_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_DIR_H */ diff --git a/module/zfs/include/sys/zfs_fuid.h b/module/zfs/include/sys/zfs_fuid.h new file mode 100644 index 000000000..810ffc81a --- /dev/null +++ b/module/zfs/include/sys/zfs_fuid.h @@ -0,0 +1,125 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_FS_ZFS_FUID_H +#define _SYS_FS_ZFS_FUID_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef _KERNEL +#include <sys/kidmap.h> +#include <sys/sid.h> +#include <sys/dmu.h> +#include <sys/zfs_vfsops.h> +#endif +#include <sys/avl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { + ZFS_OWNER, + ZFS_GROUP, + ZFS_ACE_USER, + ZFS_ACE_GROUP +} zfs_fuid_type_t; + +/* + * Estimate space needed for one more fuid table entry. + * for now assume its current size + 1K + */ +#define FUID_SIZE_ESTIMATE(z) (z->z_fuid_size + (SPA_MINBLOCKSIZE << 1)) + +#define FUID_INDEX(x) (x >> 32) +#define FUID_RID(x) (x & 0xffffffff) +#define FUID_ENCODE(idx, rid) ((idx << 32) | rid) +/* + * FUIDs cause problems for the intent log + * we need to replay the creation of the FUID, + * but we can't count on the idmapper to be around + * and during replay the FUID index may be different than + * before. Also, if an ACL has 100 ACEs and 12 different + * domains we don't want to log 100 domain strings, but rather + * just the unique 12. + */ + +/* + * The FUIDs in the log will index into + * domain string table and the bottom half will be the rid. + * Used for mapping ephemeral uid/gid during ACL setting to FUIDs + */ +typedef struct zfs_fuid { + list_node_t z_next; + uint64_t z_id; /* uid/gid being converted to fuid */ + uint64_t z_domidx; /* index in AVL domain table */ + uint64_t z_logfuid; /* index for domain in log */ +} zfs_fuid_t; + +/* list of unique domains */ +typedef struct zfs_fuid_domain { + list_node_t z_next; + uint64_t z_domidx; /* AVL tree idx */ + const char *z_domain; /* domain string */ +} zfs_fuid_domain_t; + +/* + * FUID information necessary for logging create, setattr, and setacl. + */ +typedef struct zfs_fuid_info { + list_t z_fuids; + list_t z_domains; + uint64_t z_fuid_owner; + uint64_t z_fuid_group; + char **z_domain_table; /* Used during replay */ + uint32_t z_fuid_cnt; /* How many fuids in z_fuids */ + uint32_t z_domain_cnt; /* How many domains */ + size_t z_domain_str_sz; /* len of domain strings z_domain list */ +} zfs_fuid_info_t; + +#ifdef _KERNEL +struct znode; +extern uid_t zfs_fuid_map_id(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t); +extern void zfs_fuid_destroy(zfsvfs_t *); +extern uint64_t zfs_fuid_create_cred(zfsvfs_t *, zfs_fuid_type_t, + dmu_tx_t *, cred_t *, zfs_fuid_info_t **); +extern uint64_t zfs_fuid_create(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t, + dmu_tx_t *, zfs_fuid_info_t **); +extern void zfs_fuid_map_ids(struct znode *zp, cred_t *cr, uid_t *uid, + uid_t *gid); +extern zfs_fuid_info_t *zfs_fuid_info_alloc(void); +extern void zfs_fuid_info_free(); +extern boolean_t zfs_groupmember(zfsvfs_t *, uint64_t, cred_t *); +#endif + +char *zfs_fuid_idx_domain(avl_tree_t *, uint32_t); +uint64_t zfs_fuid_table_load(objset_t *, uint64_t, avl_tree_t *, avl_tree_t *); +void zfs_fuid_table_destroy(avl_tree_t *, avl_tree_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_FUID_H */ diff --git a/module/zfs/include/sys/zfs_ioctl.h b/module/zfs/include/sys/zfs_ioctl.h new file mode 100644 index 000000000..1692608bb --- /dev/null +++ b/module/zfs/include/sys/zfs_ioctl.h @@ -0,0 +1,196 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZFS_IOCTL_H +#define _SYS_ZFS_IOCTL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/cred.h> +#include <sys/dmu.h> +#include <sys/zio.h> +#include <sys/dsl_deleg.h> + +#ifdef _KERNEL +#include <sys/nvpair.h> +#endif /* _KERNEL */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Property values for snapdir + */ +#define ZFS_SNAPDIR_HIDDEN 0 +#define ZFS_SNAPDIR_VISIBLE 1 + +#define DMU_BACKUP_STREAM_VERSION (1ULL) +#define DMU_BACKUP_HEADER_VERSION (2ULL) +#define DMU_BACKUP_MAGIC 0x2F5bacbacULL + +#define DRR_FLAG_CLONE (1<<0) +#define DRR_FLAG_CI_DATA (1<<1) + +/* + * zfs ioctl command structure + */ +typedef struct dmu_replay_record { + enum { + DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS, + DRR_WRITE, DRR_FREE, DRR_END, + } drr_type; + uint32_t drr_payloadlen; + union { + struct drr_begin { + uint64_t drr_magic; + uint64_t drr_version; + uint64_t drr_creation_time; + dmu_objset_type_t drr_type; + uint32_t drr_flags; + uint64_t drr_toguid; + uint64_t drr_fromguid; + char drr_toname[MAXNAMELEN]; + } drr_begin; + struct drr_end { + zio_cksum_t drr_checksum; + } drr_end; + struct drr_object { + uint64_t drr_object; + dmu_object_type_t drr_type; + dmu_object_type_t drr_bonustype; + uint32_t drr_blksz; + uint32_t drr_bonuslen; + uint8_t drr_checksum; + uint8_t drr_compress; + uint8_t drr_pad[6]; + /* bonus content follows */ + } drr_object; + struct drr_freeobjects { + uint64_t drr_firstobj; + uint64_t drr_numobjs; + } drr_freeobjects; + struct drr_write { + uint64_t drr_object; + dmu_object_type_t drr_type; + uint32_t drr_pad; + uint64_t drr_offset; + uint64_t drr_length; + /* content follows */ + } drr_write; + struct drr_free { + uint64_t drr_object; + uint64_t drr_offset; + uint64_t drr_length; + } drr_free; + } drr_u; +} dmu_replay_record_t; + +typedef struct zinject_record { + uint64_t zi_objset; + uint64_t zi_object; + uint64_t zi_start; + uint64_t zi_end; + uint64_t zi_guid; + uint32_t zi_level; + uint32_t zi_error; + uint64_t zi_type; + uint32_t zi_freq; + uint32_t zi_pad; /* pad out to 64 bit alignment */ +} zinject_record_t; + +#define ZINJECT_NULL 0x1 +#define ZINJECT_FLUSH_ARC 0x2 +#define ZINJECT_UNLOAD_SPA 0x4 + +typedef struct zfs_share { + uint64_t z_exportdata; + uint64_t z_sharedata; + uint64_t z_sharetype; /* 0 = share, 1 = unshare */ + uint64_t z_sharemax; /* max length of share string */ +} zfs_share_t; + +/* + * ZFS file systems may behave the usual, POSIX-compliant way, where + * name lookups are case-sensitive. They may also be set up so that + * all the name lookups are case-insensitive, or so that only some + * lookups, the ones that set an FIGNORECASE flag, are case-insensitive. + */ +typedef enum zfs_case { + ZFS_CASE_SENSITIVE, + ZFS_CASE_INSENSITIVE, + ZFS_CASE_MIXED +} zfs_case_t; + +typedef struct zfs_cmd { + char zc_name[MAXPATHLEN]; + char zc_value[MAXPATHLEN * 2]; + char zc_string[MAXNAMELEN]; + uint64_t zc_guid; + uint64_t zc_nvlist_conf; /* really (char *) */ + uint64_t zc_nvlist_conf_size; + uint64_t zc_nvlist_src; /* really (char *) */ + uint64_t zc_nvlist_src_size; + uint64_t zc_nvlist_dst; /* really (char *) */ + uint64_t zc_nvlist_dst_size; + uint64_t zc_cookie; + uint64_t zc_objset_type; + uint64_t zc_perm_action; + uint64_t zc_history; /* really (char *) */ + uint64_t zc_history_len; + uint64_t zc_history_offset; + uint64_t zc_obj; + zfs_share_t zc_share; + dmu_objset_stats_t zc_objset_stats; + struct drr_begin zc_begin_record; + zinject_record_t zc_inject_record; +} zfs_cmd_t; + +#define ZVOL_MAX_MINOR (1 << 16) +#define ZFS_MIN_MINOR (ZVOL_MAX_MINOR + 1) + +#ifdef _KERNEL + +typedef struct zfs_creat { + nvlist_t *zct_zplprops; + nvlist_t *zct_props; +} zfs_creat_t; + +extern dev_info_t *zfs_dip; + +extern int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr); +extern int zfs_secpolicy_rename_perms(const char *from, + const char *to, cred_t *cr); +extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr); +extern int zfs_busy(void); +extern int zfs_unmount_snap(char *, void *); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFS_IOCTL_H */ diff --git a/module/zfs/include/sys/zfs_rlock.h b/module/zfs/include/sys/zfs_rlock.h new file mode 100644 index 000000000..f302b663e --- /dev/null +++ b/module/zfs/include/sys/zfs_rlock.h @@ -0,0 +1,89 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_FS_ZFS_RLOCK_H +#define _SYS_FS_ZFS_RLOCK_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +#include <sys/zfs_znode.h> + +typedef enum { + RL_READER, + RL_WRITER, + RL_APPEND +} rl_type_t; + +typedef struct rl { + znode_t *r_zp; /* znode this lock applies to */ + avl_node_t r_node; /* avl node link */ + uint64_t r_off; /* file range offset */ + uint64_t r_len; /* file range length */ + uint_t r_cnt; /* range reference count in tree */ + rl_type_t r_type; /* range type */ + kcondvar_t r_wr_cv; /* cv for waiting writers */ + kcondvar_t r_rd_cv; /* cv for waiting readers */ + uint8_t r_proxy; /* acting for original range */ + uint8_t r_write_wanted; /* writer wants to lock this range */ + uint8_t r_read_wanted; /* reader wants to lock this range */ +} rl_t; + +/* + * Lock a range (offset, length) as either shared (READER) + * or exclusive (WRITER or APPEND). APPEND is a special type that + * is converted to WRITER that specified to lock from the start of the + * end of file. zfs_range_lock() returns the range lock structure. + */ +rl_t *zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type); + +/* + * Unlock range and destroy range lock structure. + */ +void zfs_range_unlock(rl_t *rl); + +/* + * Reduce range locked as RW_WRITER from whole file to specified range. + * Asserts the whole file was previously locked. + */ +void zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len); + +/* + * AVL comparison function used to compare range locks + */ +int zfs_range_compare(const void *arg1, const void *arg2); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_RLOCK_H */ diff --git a/module/zfs/include/sys/zfs_vfsops.h b/module/zfs/include/sys/zfs_vfsops.h new file mode 100644 index 000000000..87b75e6e7 --- /dev/null +++ b/module/zfs/include/sys/zfs_vfsops.h @@ -0,0 +1,140 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_FS_ZFS_VFSOPS_H +#define _SYS_FS_ZFS_VFSOPS_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/isa_defs.h> +#include <sys/types32.h> +#include <sys/list.h> +#include <sys/vfs.h> +#include <sys/zil.h> +#include <sys/rrwlock.h> +#include <sys/zfs_ioctl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct zfsvfs zfsvfs_t; + +struct zfsvfs { + vfs_t *z_vfs; /* generic fs struct */ + zfsvfs_t *z_parent; /* parent fs */ + objset_t *z_os; /* objset reference */ + uint64_t z_root; /* id of root znode */ + uint64_t z_unlinkedobj; /* id of unlinked zapobj */ + uint64_t z_max_blksz; /* maximum block size for files */ + uint64_t z_assign; /* TXG_NOWAIT or set by zil_replay() */ + uint64_t z_fuid_obj; /* fuid table object number */ + uint64_t z_fuid_size; /* fuid table size */ + avl_tree_t z_fuid_idx; /* fuid tree keyed by index */ + avl_tree_t z_fuid_domain; /* fuid tree keyed by domain */ + krwlock_t z_fuid_lock; /* fuid lock */ + boolean_t z_fuid_loaded; /* fuid tables are loaded */ + struct zfs_fuid_info *z_fuid_replay; /* fuid info for replay */ + zilog_t *z_log; /* intent log pointer */ + uint_t z_acl_mode; /* acl chmod/mode behavior */ + uint_t z_acl_inherit; /* acl inheritance behavior */ + zfs_case_t z_case; /* case-sense */ + boolean_t z_utf8; /* utf8-only */ + int z_norm; /* normalization flags */ + boolean_t z_atime; /* enable atimes mount option */ + boolean_t z_unmounted; /* unmounted */ + rrwlock_t z_teardown_lock; + krwlock_t z_teardown_inactive_lock; + list_t z_all_znodes; /* all vnodes in the fs */ + kmutex_t z_znodes_lock; /* lock for z_all_znodes */ + vnode_t *z_ctldir; /* .zfs directory pointer */ + boolean_t z_show_ctldir; /* expose .zfs in the root dir */ + boolean_t z_issnap; /* true if this is a snapshot */ + boolean_t z_vscan; /* virus scan on/off */ + boolean_t z_use_fuids; /* version allows fuids */ + kmutex_t z_online_recv_lock; /* recv in prog grabs as WRITER */ + uint64_t z_version; /* ZPL version */ +#define ZFS_OBJ_MTX_SZ 64 + kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */ +}; + +/* + * Normal filesystems (those not under .zfs/snapshot) have a total + * file ID size limited to 12 bytes (including the length field) due to + * NFSv2 protocol's limitation of 32 bytes for a filehandle. For historical + * reasons, this same limit is being imposed by the Solaris NFSv3 implementation + * (although the NFSv3 protocol actually permits a maximum of 64 bytes). It + * is not possible to expand beyond 12 bytes without abandoning support + * of NFSv2. + * + * For normal filesystems, we partition up the available space as follows: + * 2 bytes fid length (required) + * 6 bytes object number (48 bits) + * 4 bytes generation number (32 bits) + * + * We reserve only 48 bits for the object number, as this is the limit + * currently defined and imposed by the DMU. + */ +typedef struct zfid_short { + uint16_t zf_len; + uint8_t zf_object[6]; /* obj[i] = obj >> (8 * i) */ + uint8_t zf_gen[4]; /* gen[i] = gen >> (8 * i) */ +} zfid_short_t; + +/* + * Filesystems under .zfs/snapshot have a total file ID size of 22 bytes + * (including the length field). This makes files under .zfs/snapshot + * accessible by NFSv3 and NFSv4, but not NFSv2. + * + * For files under .zfs/snapshot, we partition up the available space + * as follows: + * 2 bytes fid length (required) + * 6 bytes object number (48 bits) + * 4 bytes generation number (32 bits) + * 6 bytes objset id (48 bits) + * 4 bytes currently just zero (32 bits) + * + * We reserve only 48 bits for the object number and objset id, as these are + * the limits currently defined and imposed by the DMU. + */ +typedef struct zfid_long { + zfid_short_t z_fid; + uint8_t zf_setid[6]; /* obj[i] = obj >> (8 * i) */ + uint8_t zf_setgen[4]; /* gen[i] = gen >> (8 * i) */ +} zfid_long_t; + +#define SHORT_FID_LEN (sizeof (zfid_short_t) - sizeof (uint16_t)) +#define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t)) + +extern uint_t zfs_fsyncer_key; + +extern int zfs_suspend_fs(zfsvfs_t *zfsvfs, char *osname, int *mode); +extern int zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_VFSOPS_H */ diff --git a/module/zfs/include/sys/zfs_znode.h b/module/zfs/include/sys/zfs_znode.h new file mode 100644 index 000000000..a5416525c --- /dev/null +++ b/module/zfs/include/sys/zfs_znode.h @@ -0,0 +1,356 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_FS_ZFS_ZNODE_H +#define _SYS_FS_ZFS_ZNODE_H + +#ifdef _KERNEL +#include <sys/isa_defs.h> +#include <sys/types32.h> +#include <sys/attr.h> +#include <sys/list.h> +#include <sys/dmu.h> +#include <sys/zfs_vfsops.h> +#include <sys/rrwlock.h> +#endif +#include <sys/zfs_acl.h> +#include <sys/zil.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Additional file level attributes, that are stored + * in the upper half of zp_flags + */ +#define ZFS_READONLY 0x0000000100000000 +#define ZFS_HIDDEN 0x0000000200000000 +#define ZFS_SYSTEM 0x0000000400000000 +#define ZFS_ARCHIVE 0x0000000800000000 +#define ZFS_IMMUTABLE 0x0000001000000000 +#define ZFS_NOUNLINK 0x0000002000000000 +#define ZFS_APPENDONLY 0x0000004000000000 +#define ZFS_NODUMP 0x0000008000000000 +#define ZFS_OPAQUE 0x0000010000000000 +#define ZFS_AV_QUARANTINED 0x0000020000000000 +#define ZFS_AV_MODIFIED 0x0000040000000000 + +#define ZFS_ATTR_SET(zp, attr, value) \ +{ \ + if (value) \ + zp->z_phys->zp_flags |= attr; \ + else \ + zp->z_phys->zp_flags &= ~attr; \ +} + +/* + * Define special zfs pflags + */ +#define ZFS_XATTR 0x1 /* is an extended attribute */ +#define ZFS_INHERIT_ACE 0x2 /* ace has inheritable ACEs */ +#define ZFS_ACL_TRIVIAL 0x4 /* files ACL is trivial */ +#define ZFS_ACL_OBJ_ACE 0x8 /* ACL has CMPLX Object ACE */ +#define ZFS_ACL_PROTECTED 0x10 /* ACL protected */ +#define ZFS_ACL_DEFAULTED 0x20 /* ACL should be defaulted */ +#define ZFS_ACL_AUTO_INHERIT 0x40 /* ACL should be inherited */ +#define ZFS_BONUS_SCANSTAMP 0x80 /* Scanstamp in bonus area */ + +/* + * Is ID ephemeral? + */ +#define IS_EPHEMERAL(x) (x > MAXUID) + +/* + * Should we use FUIDs? + */ +#define USE_FUIDS(version, os) (version >= ZPL_VERSION_FUID &&\ + spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID) + +#define MASTER_NODE_OBJ 1 + +/* + * Special attributes for master node. + */ +#define ZFS_FSID "FSID" +#define ZFS_UNLINKED_SET "DELETE_QUEUE" +#define ZFS_ROOT_OBJ "ROOT" +#define ZPL_VERSION_STR "VERSION" +#define ZFS_FUID_TABLES "FUID" + +#define ZFS_MAX_BLOCKSIZE (SPA_MAXBLOCKSIZE) + +/* Path component length */ +/* + * The generic fs code uses MAXNAMELEN to represent + * what the largest component length is. Unfortunately, + * this length includes the terminating NULL. ZFS needs + * to tell the users via pathconf() and statvfs() what the + * true maximum length of a component is, excluding the NULL. + */ +#define ZFS_MAXNAMELEN (MAXNAMELEN - 1) + +/* + * Convert mode bits (zp_mode) to BSD-style DT_* values for storing in + * the directory entries. + */ +#define IFTODT(mode) (((mode) & S_IFMT) >> 12) + +/* + * The directory entry has the type (currently unused on Solaris) in the + * top 4 bits, and the object number in the low 48 bits. The "middle" + * 12 bits are unused. + */ +#define ZFS_DIRENT_TYPE(de) BF64_GET(de, 60, 4) +#define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48) + +/* + * This is the persistent portion of the znode. It is stored + * in the "bonus buffer" of the file. Short symbolic links + * are also stored in the bonus buffer. + */ +typedef struct znode_phys { + uint64_t zp_atime[2]; /* 0 - last file access time */ + uint64_t zp_mtime[2]; /* 16 - last file modification time */ + uint64_t zp_ctime[2]; /* 32 - last file change time */ + uint64_t zp_crtime[2]; /* 48 - creation time */ + uint64_t zp_gen; /* 64 - generation (txg of creation) */ + uint64_t zp_mode; /* 72 - file mode bits */ + uint64_t zp_size; /* 80 - size of file */ + uint64_t zp_parent; /* 88 - directory parent (`..') */ + uint64_t zp_links; /* 96 - number of links to file */ + uint64_t zp_xattr; /* 104 - DMU object for xattrs */ + uint64_t zp_rdev; /* 112 - dev_t for VBLK & VCHR files */ + uint64_t zp_flags; /* 120 - persistent flags */ + uint64_t zp_uid; /* 128 - file owner */ + uint64_t zp_gid; /* 136 - owning group */ + uint64_t zp_zap; /* 144 - extra attributes */ + uint64_t zp_pad[3]; /* 152 - future */ + zfs_acl_phys_t zp_acl; /* 176 - 263 ACL */ + /* + * Data may pad out any remaining bytes in the znode buffer, eg: + * + * |<---------------------- dnode_phys (512) ------------------------>| + * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->| + * |<---- znode (264) ---->|<---- data (56) ---->| + * + * At present, we use this space for the following: + * - symbolic links + * - 32-byte anti-virus scanstamp (regular files only) + */ +} znode_phys_t; + +/* + * Directory entry locks control access to directory entries. + * They are used to protect creates, deletes, and renames. + * Each directory znode has a mutex and a list of locked names. + */ +#ifdef _KERNEL +typedef struct zfs_dirlock { + char *dl_name; /* directory entry being locked */ + uint32_t dl_sharecnt; /* 0 if exclusive, > 0 if shared */ + uint16_t dl_namesize; /* set if dl_name was allocated */ + kcondvar_t dl_cv; /* wait for entry to be unlocked */ + struct znode *dl_dzp; /* directory znode */ + struct zfs_dirlock *dl_next; /* next in z_dirlocks list */ +} zfs_dirlock_t; + +typedef struct znode { + struct zfsvfs *z_zfsvfs; + vnode_t *z_vnode; + uint64_t z_id; /* object ID for this znode */ + kmutex_t z_lock; /* znode modification lock */ + krwlock_t z_map_lock; /* page map lock */ + krwlock_t z_parent_lock; /* parent lock for directories */ + krwlock_t z_name_lock; /* "master" lock for dirent locks */ + zfs_dirlock_t *z_dirlocks; /* directory entry lock list */ + kmutex_t z_range_lock; /* protects changes to z_range_avl */ + avl_tree_t z_range_avl; /* avl tree of file range locks */ + uint8_t z_unlinked; /* file has been unlinked */ + uint8_t z_atime_dirty; /* atime needs to be synced */ + uint8_t z_zn_prefetch; /* Prefetch znodes? */ + uint_t z_blksz; /* block size in bytes */ + uint_t z_seq; /* modification sequence number */ + uint64_t z_mapcnt; /* number of pages mapped to file */ + uint64_t z_last_itx; /* last ZIL itx on this znode */ + uint64_t z_gen; /* generation (same as zp_gen) */ + uint32_t z_sync_cnt; /* synchronous open count */ + kmutex_t z_acl_lock; /* acl data lock */ + list_node_t z_link_node; /* all znodes in fs link */ + /* + * These are dmu managed fields. + */ + znode_phys_t *z_phys; /* pointer to persistent znode */ + dmu_buf_t *z_dbuf; /* buffer containing the z_phys */ +} znode_t; + + +/* + * Range locking rules + * -------------------- + * 1. When truncating a file (zfs_create, zfs_setattr, zfs_space) the whole + * file range needs to be locked as RL_WRITER. Only then can the pages be + * freed etc and zp_size reset. zp_size must be set within range lock. + * 2. For writes and punching holes (zfs_write & zfs_space) just the range + * being written or freed needs to be locked as RL_WRITER. + * Multiple writes at the end of the file must coordinate zp_size updates + * to ensure data isn't lost. A compare and swap loop is currently used + * to ensure the file size is at least the offset last written. + * 3. For reads (zfs_read, zfs_get_data & zfs_putapage) just the range being + * read needs to be locked as RL_READER. A check against zp_size can then + * be made for reading beyond end of file. + */ + +/* + * Convert between znode pointers and vnode pointers + */ +#define ZTOV(ZP) ((ZP)->z_vnode) +#define VTOZ(VP) ((znode_t *)(VP)->v_data) + +/* + * ZFS_ENTER() is called on entry to each ZFS vnode and vfs operation. + * ZFS_EXIT() must be called before exitting the vop. + * ZFS_VERIFY_ZP() verifies the znode is valid. + */ +#define ZFS_ENTER(zfsvfs) \ + { \ + rrw_enter(&(zfsvfs)->z_teardown_lock, RW_READER, FTAG); \ + if ((zfsvfs)->z_unmounted) { \ + ZFS_EXIT(zfsvfs); \ + return (EIO); \ + } \ + } + +#define ZFS_EXIT(zfsvfs) rrw_exit(&(zfsvfs)->z_teardown_lock, FTAG) + +#define ZFS_VERIFY_ZP(zp) \ + if ((zp)->z_dbuf == NULL) { \ + ZFS_EXIT((zp)->z_zfsvfs); \ + return (EIO); \ + } \ + +/* + * Macros for dealing with dmu_buf_hold + */ +#define ZFS_OBJ_HASH(obj_num) ((obj_num) & (ZFS_OBJ_MTX_SZ - 1)) +#define ZFS_OBJ_MUTEX(zfsvfs, obj_num) \ + (&(zfsvfs)->z_hold_mtx[ZFS_OBJ_HASH(obj_num)]) +#define ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num) \ + mutex_enter(ZFS_OBJ_MUTEX((zfsvfs), (obj_num))) +#define ZFS_OBJ_HOLD_TRYENTER(zfsvfs, obj_num) \ + mutex_tryenter(ZFS_OBJ_MUTEX((zfsvfs), (obj_num))) +#define ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num) \ + mutex_exit(ZFS_OBJ_MUTEX((zfsvfs), (obj_num))) + +/* + * Macros to encode/decode ZFS stored time values from/to struct timespec + */ +#define ZFS_TIME_ENCODE(tp, stmp) \ +{ \ + (stmp)[0] = (uint64_t)(tp)->tv_sec; \ + (stmp)[1] = (uint64_t)(tp)->tv_nsec; \ +} + +#define ZFS_TIME_DECODE(tp, stmp) \ +{ \ + (tp)->tv_sec = (time_t)(stmp)[0]; \ + (tp)->tv_nsec = (long)(stmp)[1]; \ +} + +/* + * Timestamp defines + */ +#define ACCESSED (AT_ATIME) +#define STATE_CHANGED (AT_CTIME) +#define CONTENT_MODIFIED (AT_MTIME | AT_CTIME) + +#define ZFS_ACCESSTIME_STAMP(zfsvfs, zp) \ + if ((zfsvfs)->z_atime && !((zfsvfs)->z_vfs->vfs_flag & VFS_RDONLY)) \ + zfs_time_stamper(zp, ACCESSED, NULL) + +extern int zfs_init_fs(zfsvfs_t *, znode_t **); +extern void zfs_set_dataprop(objset_t *); +extern void zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *, + dmu_tx_t *tx); +extern void zfs_time_stamper(znode_t *, uint_t, dmu_tx_t *); +extern void zfs_time_stamper_locked(znode_t *, uint_t, dmu_tx_t *); +extern void zfs_grow_blocksize(znode_t *, uint64_t, dmu_tx_t *); +extern int zfs_freesp(znode_t *, uint64_t, uint64_t, int, boolean_t); +extern void zfs_znode_init(void); +extern void zfs_znode_fini(void); +extern int zfs_zget(zfsvfs_t *, uint64_t, znode_t **); +extern int zfs_rezget(znode_t *); +extern void zfs_zinactive(znode_t *); +extern void zfs_znode_delete(znode_t *, dmu_tx_t *); +extern void zfs_znode_free(znode_t *); +extern void zfs_remove_op_tables(); +extern int zfs_create_op_tables(); +extern int zfs_sync(vfs_t *vfsp, short flag, cred_t *cr); +extern dev_t zfs_cmpldev(uint64_t); +extern int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value); +extern int zfs_set_version(const char *name, uint64_t newvers); +extern int zfs_get_stats(objset_t *os, nvlist_t *nv); +extern void zfs_znode_dmu_fini(znode_t *); + +extern void zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *dzp, znode_t *zp, char *name, vsecattr_t *, zfs_fuid_info_t *, + vattr_t *vap); +extern int zfs_log_create_txtype(zil_create_t, vsecattr_t *vsecp, + vattr_t *vap); +extern void zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *dzp, char *name); +extern void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *dzp, znode_t *zp, char *name); +extern void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *dzp, znode_t *zp, char *name, char *link); +extern void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp); +extern void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, offset_t off, ssize_t len, int ioflag); +extern void zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, uint64_t off, uint64_t len); +extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp); +extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, + vsecattr_t *vsecp, zfs_fuid_info_t *fuidp); +extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap); +extern void zfs_upgrade(zfsvfs_t *zfsvfs, dmu_tx_t *tx); + +extern caddr_t zfs_map_page(page_t *, enum seg_rw); +extern void zfs_unmap_page(page_t *, caddr_t); + +extern zil_get_data_t zfs_get_data; +extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE]; +extern int zfsfstype; + +#endif /* _KERNEL */ + +extern int zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_ZNODE_H */ diff --git a/module/zfs/include/sys/zil.h b/module/zfs/include/sys/zil.h new file mode 100644 index 000000000..4d02d14f7 --- /dev/null +++ b/module/zfs/include/sys/zil.h @@ -0,0 +1,382 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZIL_H +#define _SYS_ZIL_H + +#include <sys/types.h> +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/dmu.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Intent log format: + * + * Each objset has its own intent log. The log header (zil_header_t) + * for objset N's intent log is kept in the Nth object of the SPA's + * intent_log objset. The log header points to a chain of log blocks, + * each of which contains log records (i.e., transactions) followed by + * a log block trailer (zil_trailer_t). The format of a log record + * depends on the record (or transaction) type, but all records begin + * with a common structure that defines the type, length, and txg. + */ + +/* + * Intent log header - this on disk structure holds fields to manage + * the log. All fields are 64 bit to easily handle cross architectures. + */ +typedef struct zil_header { + uint64_t zh_claim_txg; /* txg in which log blocks were claimed */ + uint64_t zh_replay_seq; /* highest replayed sequence number */ + blkptr_t zh_log; /* log chain */ + uint64_t zh_claim_seq; /* highest claimed sequence number */ + uint64_t zh_pad[5]; +} zil_header_t; + +/* + * Log block trailer - structure at the end of the header and each log block + * + * The zit_bt contains a zbt_cksum which for the intent log is + * the sequence number of this log block. A seq of 0 is invalid. + * The zbt_cksum is checked by the SPA against the sequence + * number passed in the blk_cksum field of the blkptr_t + */ +typedef struct zil_trailer { + uint64_t zit_pad; + blkptr_t zit_next_blk; /* next block in chain */ + uint64_t zit_nused; /* bytes in log block used */ + zio_block_tail_t zit_bt; /* block trailer */ +} zil_trailer_t; + +#define ZIL_MIN_BLKSZ 4096ULL +#define ZIL_MAX_BLKSZ SPA_MAXBLOCKSIZE +#define ZIL_BLK_DATA_SZ(lwb) ((lwb)->lwb_sz - sizeof (zil_trailer_t)) + +/* + * The words of a log block checksum. + */ +#define ZIL_ZC_GUID_0 0 +#define ZIL_ZC_GUID_1 1 +#define ZIL_ZC_OBJSET 2 +#define ZIL_ZC_SEQ 3 + +typedef enum zil_create { + Z_FILE, + Z_DIR, + Z_XATTRDIR, +} zil_create_t; + +/* + * size of xvattr log section. + * its composed of lr_attr_t + xvattr bitmap + 2 64 bit timestamps + * for create time and a single 64 bit integer for all of the attributes, + * and 4 64 bit integers (32 bytes) for the scanstamp. + * + */ + +#define ZIL_XVAT_SIZE(mapsize) \ + sizeof (lr_attr_t) + (sizeof (uint32_t) * (mapsize - 1)) + \ + (sizeof (uint64_t) * 7) + +/* + * Size of ACL in log. The ACE data is padded out to properly align + * on 8 byte boundary. + */ + +#define ZIL_ACE_LENGTH(x) (roundup(x, sizeof (uint64_t))) + +/* + * Intent log transaction types and record structures + */ +#define TX_CREATE 1 /* Create file */ +#define TX_MKDIR 2 /* Make directory */ +#define TX_MKXATTR 3 /* Make XATTR directory */ +#define TX_SYMLINK 4 /* Create symbolic link to a file */ +#define TX_REMOVE 5 /* Remove file */ +#define TX_RMDIR 6 /* Remove directory */ +#define TX_LINK 7 /* Create hard link to a file */ +#define TX_RENAME 8 /* Rename a file */ +#define TX_WRITE 9 /* File write */ +#define TX_TRUNCATE 10 /* Truncate a file */ +#define TX_SETATTR 11 /* Set file attributes */ +#define TX_ACL_V0 12 /* Set old formatted ACL */ +#define TX_ACL 13 /* Set ACL */ +#define TX_CREATE_ACL 14 /* create with ACL */ +#define TX_CREATE_ATTR 15 /* create + attrs */ +#define TX_CREATE_ACL_ATTR 16 /* create with ACL + attrs */ +#define TX_MKDIR_ACL 17 /* mkdir with ACL */ +#define TX_MKDIR_ATTR 18 /* mkdir with attr */ +#define TX_MKDIR_ACL_ATTR 19 /* mkdir with ACL + attrs */ +#define TX_MAX_TYPE 20 /* Max transaction type */ + +/* + * The transactions for mkdir, symlink, remove, rmdir, link, and rename + * may have the following bit set, indicating the original request + * specified case-insensitive handling of names. + */ +#define TX_CI ((uint64_t)0x1 << 63) /* case-insensitive behavior requested */ + +/* + * Format of log records. + * The fields are carefully defined to allow them to be aligned + * and sized the same on sparc & intel architectures. + * Each log record has a common structure at the beginning. + * + * Note, lrc_seq holds two different sequence numbers. Whilst in memory + * it contains the transaction sequence number. The log record on + * disk holds the sequence number of all log records which is used to + * ensure we don't replay the same record. The two sequence numbers are + * different because the transactions can now be pushed out of order. + */ +typedef struct { /* common log record header */ + uint64_t lrc_txtype; /* intent log transaction type */ + uint64_t lrc_reclen; /* transaction record length */ + uint64_t lrc_txg; /* dmu transaction group number */ + uint64_t lrc_seq; /* see comment above */ +} lr_t; + +/* + * Handle option extended vattr attributes. + * + * Whenever new attributes are added the version number + * will need to be updated as will code in + * zfs_log.c and zfs_replay.c + */ +typedef struct { + uint32_t lr_attr_masksize; /* number of elements in array */ + uint32_t lr_attr_bitmap; /* First entry of array */ + /* remainder of array and any additional fields */ +} lr_attr_t; + +/* + * log record for creates without optional ACL. + * This log record does support optional xvattr_t attributes. + */ +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_doid; /* object id of directory */ + uint64_t lr_foid; /* object id of created file object */ + uint64_t lr_mode; /* mode of object */ + uint64_t lr_uid; /* uid of object */ + uint64_t lr_gid; /* gid of object */ + uint64_t lr_gen; /* generation (txg of creation) */ + uint64_t lr_crtime[2]; /* creation time */ + uint64_t lr_rdev; /* rdev of object to create */ + /* name of object to create follows this */ + /* for symlinks, link content follows name */ + /* for creates with xvattr data, the name follows the xvattr info */ +} lr_create_t; + +/* + * FUID ACL record will be an array of ACEs from the original ACL. + * If this array includes ephemeral IDs, the record will also include + * an array of log-specific FUIDs to replace the ephemeral IDs. + * Only one copy of each unique domain will be present, so the log-specific + * FUIDs will use an index into a compressed domain table. On replay this + * information will be used to construct real FUIDs (and bypass idmap, + * since it may not be available). + */ + +/* + * Log record for creates with optional ACL + * This log record is also used for recording any FUID + * information needed for replaying the create. If the + * file doesn't have any actual ACEs then the lr_aclcnt + * would be zero. + */ +typedef struct { + lr_create_t lr_create; /* common create portion */ + uint64_t lr_aclcnt; /* number of ACEs in ACL */ + uint64_t lr_domcnt; /* number of unique domains */ + uint64_t lr_fuidcnt; /* number of real fuids */ + uint64_t lr_acl_bytes; /* number of bytes in ACL */ + uint64_t lr_acl_flags; /* ACL flags */ + /* lr_acl_bytes number of variable sized ace's follows */ + /* if create is also setting xvattr's, then acl data follows xvattr */ + /* if ACE FUIDs are needed then they will follow the xvattr_t */ + /* Following the FUIDs will be the domain table information. */ + /* The FUIDs for the owner and group will be in the lr_create */ + /* portion of the record. */ + /* name follows ACL data */ +} lr_acl_create_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_doid; /* obj id of directory */ + /* name of object to remove follows this */ +} lr_remove_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_doid; /* obj id of directory */ + uint64_t lr_link_obj; /* obj id of link */ + /* name of object to link follows this */ +} lr_link_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_sdoid; /* obj id of source directory */ + uint64_t lr_tdoid; /* obj id of target directory */ + /* 2 strings: names of source and destination follow this */ +} lr_rename_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_foid; /* file object to write */ + uint64_t lr_offset; /* offset to write to */ + uint64_t lr_length; /* user data length to write */ + uint64_t lr_blkoff; /* offset represented by lr_blkptr */ + blkptr_t lr_blkptr; /* spa block pointer for replay */ + /* write data will follow for small writes */ +} lr_write_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_foid; /* object id of file to truncate */ + uint64_t lr_offset; /* offset to truncate from */ + uint64_t lr_length; /* length to truncate */ +} lr_truncate_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_foid; /* file object to change attributes */ + uint64_t lr_mask; /* mask of attributes to set */ + uint64_t lr_mode; /* mode to set */ + uint64_t lr_uid; /* uid to set */ + uint64_t lr_gid; /* gid to set */ + uint64_t lr_size; /* size to set */ + uint64_t lr_atime[2]; /* access time */ + uint64_t lr_mtime[2]; /* modification time */ + /* optional attribute lr_attr_t may be here */ +} lr_setattr_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_foid; /* obj id of file */ + uint64_t lr_aclcnt; /* number of acl entries */ + /* lr_aclcnt number of ace_t entries follow this */ +} lr_acl_v0_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_foid; /* obj id of file */ + uint64_t lr_aclcnt; /* number of ACEs in ACL */ + uint64_t lr_domcnt; /* number of unique domains */ + uint64_t lr_fuidcnt; /* number of real fuids */ + uint64_t lr_acl_bytes; /* number of bytes in ACL */ + uint64_t lr_acl_flags; /* ACL flags */ + /* lr_acl_bytes number of variable sized ace's follows */ +} lr_acl_t; + +/* + * ZIL structure definitions, interface function prototype and globals. + */ + +/* + * ZFS intent log transaction structure + */ +typedef enum { + WR_INDIRECT, /* indirect - a large write (dmu_sync() data */ + /* and put blkptr in log, rather than actual data) */ + WR_COPIED, /* immediate - data is copied into lr_write_t */ + WR_NEED_COPY, /* immediate - data needs to be copied if pushed */ +} itx_wr_state_t; + +typedef struct itx { + list_node_t itx_node; /* linkage on zl_itx_list */ + void *itx_private; /* type-specific opaque data */ + itx_wr_state_t itx_wr_state; /* write state */ + uint8_t itx_sync; /* synchronous transaction */ + uint64_t itx_sod; /* record size on disk */ + lr_t itx_lr; /* common part of log record */ + /* followed by type-specific part of lr_xx_t and its immediate data */ +} itx_t; + + +/* + * zgd_t is passed through dmu_sync() to the callback routine zfs_get_done() + * to handle the cleanup of the dmu_sync() buffer write + */ +typedef struct { + zilog_t *zgd_zilog; /* zilog */ + blkptr_t *zgd_bp; /* block pointer */ + struct rl *zgd_rl; /* range lock */ +} zgd_t; + + +typedef void zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg, + uint64_t txg); +typedef void zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg, + uint64_t txg); +typedef int zil_replay_func_t(); +typedef void zil_replay_cleaner_t(); +typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, zio_t *zio); + +extern uint64_t zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, + zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg); + +extern void zil_init(void); +extern void zil_fini(void); + +extern zilog_t *zil_alloc(objset_t *os, zil_header_t *zh_phys); +extern void zil_free(zilog_t *zilog); + +extern zilog_t *zil_open(objset_t *os, zil_get_data_t *get_data); +extern void zil_close(zilog_t *zilog); + +extern void zil_replay(objset_t *os, void *arg, uint64_t *txgp, + zil_replay_func_t *replay_func[TX_MAX_TYPE], + zil_replay_cleaner_t *replay_cleaner); +extern void zil_destroy(zilog_t *zilog, boolean_t keep_first); +extern void zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx); + +extern itx_t *zil_itx_create(uint64_t txtype, size_t lrsize); +extern uint64_t zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx); + +extern void zil_commit(zilog_t *zilog, uint64_t seq, uint64_t oid); + +extern int zil_claim(char *osname, void *txarg); +extern int zil_check_log_chain(char *osname, void *txarg); +extern int zil_clear_log_chain(char *osname, void *txarg); +extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx); +extern void zil_clean(zilog_t *zilog); +extern int zil_is_committed(zilog_t *zilog); + +extern int zil_suspend(zilog_t *zilog); +extern void zil_resume(zilog_t *zilog); + +extern void zil_add_block(zilog_t *zilog, blkptr_t *bp); + +extern int zil_disable; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZIL_H */ diff --git a/module/zfs/include/sys/zil_impl.h b/module/zfs/include/sys/zil_impl.h new file mode 100644 index 000000000..0fc800b96 --- /dev/null +++ b/module/zfs/include/sys/zil_impl.h @@ -0,0 +1,109 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZIL_IMPL_H +#define _SYS_ZIL_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zil.h> +#include <sys/dmu_objset.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Log write buffer. + */ +typedef struct lwb { + zilog_t *lwb_zilog; /* back pointer to log struct */ + blkptr_t lwb_blk; /* on disk address of this log blk */ + int lwb_nused; /* # used bytes in buffer */ + int lwb_sz; /* size of block and buffer */ + char *lwb_buf; /* log write buffer */ + zio_t *lwb_zio; /* zio for this buffer */ + uint64_t lwb_max_txg; /* highest txg in this lwb */ + txg_handle_t lwb_txgh; /* txg handle for txg_exit() */ + list_node_t lwb_node; /* zilog->zl_lwb_list linkage */ +} lwb_t; + +/* + * Vdev flushing: during a zil_commit(), we build up an AVL tree of the vdevs + * we've touched so we know which ones need a write cache flush at the end. + */ +typedef struct zil_vdev_node { + uint64_t zv_vdev; /* vdev to be flushed */ + avl_node_t zv_node; /* AVL tree linkage */ +} zil_vdev_node_t; + +/* + * Stable storage intent log management structure. One per dataset. + */ +struct zilog { + kmutex_t zl_lock; /* protects most zilog_t fields */ + struct dsl_pool *zl_dmu_pool; /* DSL pool */ + spa_t *zl_spa; /* handle for read/write log */ + const zil_header_t *zl_header; /* log header buffer */ + objset_t *zl_os; /* object set we're logging */ + zil_get_data_t *zl_get_data; /* callback to get object content */ + zio_t *zl_root_zio; /* log writer root zio */ + uint64_t zl_itx_seq; /* next itx sequence number */ + uint64_t zl_commit_seq; /* committed upto this number */ + uint64_t zl_lr_seq; /* log record sequence number */ + uint64_t zl_destroy_txg; /* txg of last zil_destroy() */ + uint64_t zl_replay_seq[TXG_SIZE]; /* seq of last replayed rec */ + uint32_t zl_suspend; /* log suspend count */ + kcondvar_t zl_cv_writer; /* log writer thread completion */ + kcondvar_t zl_cv_suspend; /* log suspend completion */ + uint8_t zl_suspending; /* log is currently suspending */ + uint8_t zl_keep_first; /* keep first log block in destroy */ + uint8_t zl_stop_replay; /* don't replay any further */ + uint8_t zl_stop_sync; /* for debugging */ + uint8_t zl_writer; /* boolean: write setup in progress */ + uint8_t zl_log_error; /* boolean: log write error */ + list_t zl_itx_list; /* in-memory itx list */ + uint64_t zl_itx_list_sz; /* total size of records on list */ + uint64_t zl_cur_used; /* current commit log size used */ + uint64_t zl_prev_used; /* previous commit log size used */ + list_t zl_lwb_list; /* in-flight log write list */ + kmutex_t zl_vdev_lock; /* protects zl_vdev_tree */ + avl_tree_t zl_vdev_tree; /* vdevs to flush in zil_commit() */ + taskq_t *zl_clean_taskq; /* runs lwb and itx clean tasks */ + avl_tree_t zl_dva_tree; /* track DVAs during log parse */ + clock_t zl_replay_time; /* lbolt of when replay started */ + uint64_t zl_replay_blks; /* number of log blocks replayed */ +}; + +typedef struct zil_dva_node { + dva_t zn_dva; + avl_node_t zn_node; +} zil_dva_node_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZIL_IMPL_H */ diff --git a/module/zfs/include/sys/zio.h b/module/zfs/include/sys/zio.h new file mode 100644 index 000000000..4de78dfee --- /dev/null +++ b/module/zfs/include/sys/zio.h @@ -0,0 +1,424 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _ZIO_H +#define _ZIO_H + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/txg.h> +#include <sys/avl.h> +#include <sys/fs/zfs.h> +#include <sys/zio_impl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZBT_MAGIC 0x210da7ab10c7a11ULL /* zio data bloc tail */ + +typedef struct zio_block_tail { + uint64_t zbt_magic; /* for validation, endianness */ + zio_cksum_t zbt_cksum; /* 256-bit checksum */ +} zio_block_tail_t; + +/* + * Gang block headers are self-checksumming and contain an array + * of block pointers. + */ +#define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE +#define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \ + sizeof (zio_block_tail_t)) / sizeof (blkptr_t)) +#define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \ + sizeof (zio_block_tail_t) - \ + (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\ + sizeof (uint64_t)) + +typedef struct zio_gbh { + blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS]; + uint64_t zg_filler[SPA_GBH_FILLER]; + zio_block_tail_t zg_tail; +} zio_gbh_phys_t; + +enum zio_checksum { + ZIO_CHECKSUM_INHERIT = 0, + ZIO_CHECKSUM_ON, + ZIO_CHECKSUM_OFF, + ZIO_CHECKSUM_LABEL, + ZIO_CHECKSUM_GANG_HEADER, + ZIO_CHECKSUM_ZILOG, + ZIO_CHECKSUM_FLETCHER_2, + ZIO_CHECKSUM_FLETCHER_4, + ZIO_CHECKSUM_SHA256, + ZIO_CHECKSUM_FUNCTIONS +}; + +#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_2 +#define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON + +enum zio_compress { + ZIO_COMPRESS_INHERIT = 0, + ZIO_COMPRESS_ON, + ZIO_COMPRESS_OFF, + ZIO_COMPRESS_LZJB, + ZIO_COMPRESS_EMPTY, + ZIO_COMPRESS_GZIP_1, + ZIO_COMPRESS_GZIP_2, + ZIO_COMPRESS_GZIP_3, + ZIO_COMPRESS_GZIP_4, + ZIO_COMPRESS_GZIP_5, + ZIO_COMPRESS_GZIP_6, + ZIO_COMPRESS_GZIP_7, + ZIO_COMPRESS_GZIP_8, + ZIO_COMPRESS_GZIP_9, + ZIO_COMPRESS_FUNCTIONS +}; + +#define ZIO_COMPRESS_ON_VALUE ZIO_COMPRESS_LZJB +#define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF + +#define ZIO_FAILURE_MODE_WAIT 0 +#define ZIO_FAILURE_MODE_CONTINUE 1 +#define ZIO_FAILURE_MODE_PANIC 2 + +#define ZIO_PRIORITY_NOW (zio_priority_table[0]) +#define ZIO_PRIORITY_SYNC_READ (zio_priority_table[1]) +#define ZIO_PRIORITY_SYNC_WRITE (zio_priority_table[2]) +#define ZIO_PRIORITY_ASYNC_READ (zio_priority_table[3]) +#define ZIO_PRIORITY_ASYNC_WRITE (zio_priority_table[4]) +#define ZIO_PRIORITY_FREE (zio_priority_table[5]) +#define ZIO_PRIORITY_CACHE_FILL (zio_priority_table[6]) +#define ZIO_PRIORITY_LOG_WRITE (zio_priority_table[7]) +#define ZIO_PRIORITY_RESILVER (zio_priority_table[8]) +#define ZIO_PRIORITY_SCRUB (zio_priority_table[9]) +#define ZIO_PRIORITY_TABLE_SIZE 10 + +#define ZIO_FLAG_MUSTSUCCEED 0x00000 +#define ZIO_FLAG_CANFAIL 0x00001 +#define ZIO_FLAG_SPECULATIVE 0x00002 +#define ZIO_FLAG_CONFIG_WRITER 0x00004 +#define ZIO_FLAG_DONT_RETRY 0x00008 + +#define ZIO_FLAG_DONT_CACHE 0x00010 +#define ZIO_FLAG_DONT_QUEUE 0x00020 +#define ZIO_FLAG_DONT_AGGREGATE 0x00040 +#define ZIO_FLAG_DONT_PROPAGATE 0x00080 + +#define ZIO_FLAG_IO_BYPASS 0x00100 +#define ZIO_FLAG_IO_REPAIR 0x00200 +#define ZIO_FLAG_IO_RETRY 0x00400 +#define ZIO_FLAG_IO_REWRITE 0x00800 + +#define ZIO_FLAG_PROBE 0x01000 +#define ZIO_FLAG_RESILVER 0x02000 +#define ZIO_FLAG_SCRUB 0x04000 +#define ZIO_FLAG_SCRUB_THREAD 0x08000 + +#define ZIO_FLAG_GANG_CHILD 0x10000 + +#define ZIO_FLAG_GANG_INHERIT \ + (ZIO_FLAG_CANFAIL | \ + ZIO_FLAG_SPECULATIVE | \ + ZIO_FLAG_CONFIG_WRITER | \ + ZIO_FLAG_DONT_RETRY | \ + ZIO_FLAG_DONT_CACHE | \ + ZIO_FLAG_DONT_AGGREGATE | \ + ZIO_FLAG_RESILVER | \ + ZIO_FLAG_SCRUB | \ + ZIO_FLAG_SCRUB_THREAD) + +#define ZIO_FLAG_VDEV_INHERIT \ + (ZIO_FLAG_GANG_INHERIT | \ + ZIO_FLAG_IO_REPAIR | \ + ZIO_FLAG_IO_RETRY | \ + ZIO_FLAG_PROBE) + +#define ZIO_PIPELINE_CONTINUE 0x100 +#define ZIO_PIPELINE_STOP 0x101 + +#define ZIO_GANG_CHILD_FLAGS(zio) \ + (((zio)->io_flags & ZIO_FLAG_GANG_INHERIT) | \ + ZIO_FLAG_GANG_CHILD | ZIO_FLAG_CANFAIL) + +enum zio_child { + ZIO_CHILD_VDEV = 0, + ZIO_CHILD_GANG, + ZIO_CHILD_LOGICAL, + ZIO_CHILD_TYPES +}; + +enum zio_wait_type { + ZIO_WAIT_READY = 0, + ZIO_WAIT_DONE, + ZIO_WAIT_TYPES +}; + +/* + * We'll take the unused errnos, 'EBADE' and 'EBADR' (from the Convergent + * graveyard) to indicate checksum errors and fragmentation. + */ +#define ECKSUM EBADE +#define EFRAGS EBADR + +typedef struct zio zio_t; +typedef void zio_done_func_t(zio_t *zio); + +extern uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE]; +extern char *zio_type_name[ZIO_TYPES]; + +/* + * A bookmark is a four-tuple <objset, object, level, blkid> that uniquely + * identifies any block in the pool. By convention, the meta-objset (MOS) + * is objset 0, the meta-dnode is object 0, the root block (osphys_t) is + * level -1 of the meta-dnode, and intent log blocks (which are chained + * off the root block) have blkid == sequence number. In summary: + * + * mos is objset 0 + * meta-dnode is object 0 + * root block is <objset, 0, -1, 0> + * intent log is <objset, 0, -1, ZIL sequence number> + * + * Note: this structure is called a bookmark because its first purpose was + * to remember where to resume a pool-wide traverse. The absolute ordering + * for block visitation during traversal is defined in compare_bookmark(). + * + * Note: this structure is passed between userland and the kernel. + * Therefore it must not change size or alignment between 32/64 bit + * compilation options. + */ +typedef struct zbookmark { + uint64_t zb_objset; + uint64_t zb_object; + int64_t zb_level; + uint64_t zb_blkid; +} zbookmark_t; + +typedef struct zio_prop { + enum zio_checksum zp_checksum; + enum zio_compress zp_compress; + dmu_object_type_t zp_type; + uint8_t zp_level; + uint8_t zp_ndvas; +} zio_prop_t; + +typedef struct zio_gang_node { + zio_gbh_phys_t *gn_gbh; + struct zio_gang_node *gn_child[SPA_GBH_NBLKPTRS]; +} zio_gang_node_t; + +typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp, + zio_gang_node_t *gn, void *data); + +typedef void zio_transform_func_t(zio_t *zio, void *data, uint64_t size); + +typedef struct zio_transform { + void *zt_orig_data; + uint64_t zt_orig_size; + uint64_t zt_bufsize; + zio_transform_func_t *zt_transform; + struct zio_transform *zt_next; +} zio_transform_t; + +typedef int zio_pipe_stage_t(zio_t *zio); + +/* + * The io_reexecute flags are distinct from io_flags because the child must + * be able to propagate them to the parent. The normal io_flags are local + * to the zio, not protected by any lock, and not modifiable by children; + * the reexecute flags are protected by io_lock, modifiable by children, + * and always propagated -- even when ZIO_FLAG_DONT_PROPAGATE is set. + */ +#define ZIO_REEXECUTE_NOW 0x01 +#define ZIO_REEXECUTE_SUSPEND 0x02 + +struct zio { + /* Core information about this I/O */ + zbookmark_t io_bookmark; + zio_prop_t io_prop; + zio_type_t io_type; + enum zio_child io_child_type; + int io_cmd; + uint8_t io_priority; + uint8_t io_reexecute; + uint8_t io_async_root; + uint64_t io_txg; + spa_t *io_spa; + blkptr_t *io_bp; + blkptr_t io_bp_copy; + zio_t *io_parent; + zio_t *io_child; + zio_t *io_sibling_prev; + zio_t *io_sibling_next; + zio_t *io_logical; + zio_transform_t *io_transform_stack; + + /* Callback info */ + zio_done_func_t *io_ready; + zio_done_func_t *io_done; + void *io_private; + blkptr_t io_bp_orig; + + /* Data represented by this I/O */ + void *io_data; + uint64_t io_size; + + /* Stuff for the vdev stack */ + vdev_t *io_vd; + void *io_vsd; + zio_done_func_t *io_vsd_free; + uint64_t io_offset; + uint64_t io_deadline; + avl_node_t io_offset_node; + avl_node_t io_deadline_node; + avl_tree_t *io_vdev_tree; + zio_t *io_delegate_list; + zio_t *io_delegate_next; + + /* Internal pipeline state */ + int io_flags; + zio_stage_t io_stage; + uint32_t io_pipeline; + int io_orig_flags; + zio_stage_t io_orig_stage; + uint32_t io_orig_pipeline; + int io_error; + int io_child_error[ZIO_CHILD_TYPES]; + uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES]; + uint64_t *io_stall; + zio_gang_node_t *io_gang_tree; + void *io_executor; + void *io_waiter; + kmutex_t io_lock; + kcondvar_t io_cv; + + /* FMA state */ + uint64_t io_ena; +}; + +extern zio_t *zio_null(zio_t *pio, spa_t *spa, + zio_done_func_t *done, void *private, int flags); + +extern zio_t *zio_root(spa_t *spa, + zio_done_func_t *done, void *private, int flags); + +extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data, + uint64_t size, zio_done_func_t *done, void *private, + int priority, int flags, const zbookmark_t *zb); + +extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, + void *data, uint64_t size, zio_prop_t *zp, + zio_done_func_t *ready, zio_done_func_t *done, void *private, + int priority, int flags, const zbookmark_t *zb); + +extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, + void *data, uint64_t size, zio_done_func_t *done, void *private, + int priority, int flags, zbookmark_t *zb); + +extern void zio_skip_write(zio_t *zio); + +extern zio_t *zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, + zio_done_func_t *done, void *private, int flags); + +extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, + zio_done_func_t *done, void *private, int flags); + +extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, + zio_done_func_t *done, void *private, int priority, int flags); + +extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, + uint64_t size, void *data, int checksum, + zio_done_func_t *done, void *private, int priority, int flags, + boolean_t labels); + +extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, + uint64_t size, void *data, int checksum, + zio_done_func_t *done, void *private, int priority, int flags, + boolean_t labels); + +extern int zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, + blkptr_t *old_bp, uint64_t txg); +extern void zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg); +extern void zio_flush(zio_t *zio, vdev_t *vd); + +extern int zio_wait(zio_t *zio); +extern void zio_nowait(zio_t *zio); +extern void zio_execute(zio_t *zio); +extern void zio_interrupt(zio_t *zio); + +extern void *zio_buf_alloc(size_t size); +extern void zio_buf_free(void *buf, size_t size); +extern void *zio_data_buf_alloc(size_t size); +extern void zio_data_buf_free(void *buf, size_t size); + +extern void zio_resubmit_stage_async(void *); + +extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, + uint64_t offset, void *data, uint64_t size, int type, int priority, + int flags, zio_done_func_t *done, void *private); + +extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, + void *data, uint64_t size, int type, int priority, + int flags, zio_done_func_t *done, void *private); + +extern void zio_vdev_io_bypass(zio_t *zio); +extern void zio_vdev_io_reissue(zio_t *zio); +extern void zio_vdev_io_redone(zio_t *zio); + +extern void zio_checksum_verified(zio_t *zio); +extern int zio_worst_error(int e1, int e2); + +extern uint8_t zio_checksum_select(uint8_t child, uint8_t parent); +extern uint8_t zio_compress_select(uint8_t child, uint8_t parent); + +extern void zio_suspend(spa_t *spa, zio_t *zio); +extern void zio_resume(spa_t *spa); +extern void zio_resume_wait(spa_t *spa); + +/* + * Initial setup and teardown. + */ +extern void zio_init(void); +extern void zio_fini(void); + +/* + * Fault injection + */ +struct zinject_record; +extern uint32_t zio_injection_enabled; +extern int zio_inject_fault(char *name, int flags, int *id, + struct zinject_record *record); +extern int zio_inject_list_next(int *id, char *name, size_t buflen, + struct zinject_record *record); +extern int zio_clear_fault(int id); +extern int zio_handle_fault_injection(zio_t *zio, int error); +extern int zio_handle_device_injection(vdev_t *vd, int error); +extern int zio_handle_label_injection(zio_t *zio, int error); + +#ifdef __cplusplus +} +#endif + +#endif /* _ZIO_H */ diff --git a/module/zfs/include/sys/zio_checksum.h b/module/zfs/include/sys/zio_checksum.h new file mode 100644 index 000000000..da407399d --- /dev/null +++ b/module/zfs/include/sys/zio_checksum.h @@ -0,0 +1,73 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZIO_CHECKSUM_H +#define _SYS_ZIO_CHECKSUM_H + +#include <sys/zio.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Signature for checksum functions. + */ +typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp); + +/* + * Information about each checksum function. + */ +typedef struct zio_checksum_info { + zio_checksum_t *ci_func[2]; /* checksum function for each byteorder */ + int ci_correctable; /* number of correctable bits */ + int ci_zbt; /* uses zio block tail? */ + char *ci_name; /* descriptive name */ +} zio_checksum_info_t; + +extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS]; + +/* + * Checksum routines. + */ +extern zio_checksum_t fletcher_2_native; +extern zio_checksum_t fletcher_4_native; +extern zio_checksum_t fletcher_4_incremental_native; + +extern zio_checksum_t fletcher_2_byteswap; +extern zio_checksum_t fletcher_4_byteswap; +extern zio_checksum_t fletcher_4_incremental_byteswap; + +extern zio_checksum_t zio_checksum_SHA256; + +extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, + void *data, uint64_t size); +extern int zio_checksum_error(zio_t *zio); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZIO_CHECKSUM_H */ diff --git a/module/zfs/include/sys/zio_compress.h b/module/zfs/include/sys/zio_compress.h new file mode 100644 index 000000000..66ee8d45b --- /dev/null +++ b/module/zfs/include/sys/zio_compress.h @@ -0,0 +1,82 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZIO_COMPRESS_H +#define _SYS_ZIO_COMPRESS_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zio.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Common signature for all zio compress/decompress functions. + */ +typedef size_t zio_compress_func_t(void *src, void *dst, + size_t s_len, size_t d_len, int); +typedef int zio_decompress_func_t(void *src, void *dst, + size_t s_len, size_t d_len, int); + +/* + * Information about each compression function. + */ +typedef struct zio_compress_info { + zio_compress_func_t *ci_compress; /* compression function */ + zio_decompress_func_t *ci_decompress; /* decompression function */ + int ci_level; /* level parameter */ + char *ci_name; /* algorithm name */ +} zio_compress_info_t; + +extern zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS]; + +/* + * Compression routines. + */ +extern size_t lzjb_compress(void *src, void *dst, size_t s_len, size_t d_len, + int level); +extern int lzjb_decompress(void *src, void *dst, size_t s_len, size_t d_len, + int level); +extern size_t gzip_compress(void *src, void *dst, size_t s_len, size_t d_len, + int level); +extern int gzip_decompress(void *src, void *dst, size_t s_len, size_t d_len, + int level); + +/* + * Compress and decompress data if necessary. + */ +extern int zio_compress_data(int cpfunc, void *src, uint64_t srcsize, + void **destp, uint64_t *destsizep, uint64_t *destbufsizep); +extern int zio_decompress_data(int cpfunc, void *src, uint64_t srcsize, + void *dest, uint64_t destsize); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZIO_COMPRESS_H */ diff --git a/module/zfs/include/sys/zio_impl.h b/module/zfs/include/sys/zio_impl.h new file mode 100644 index 000000000..e7503b733 --- /dev/null +++ b/module/zfs/include/sys/zio_impl.h @@ -0,0 +1,143 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _ZIO_IMPL_H +#define _ZIO_IMPL_H + +#include <sys/zfs_context.h> +#include <sys/zio.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * I/O Groups: pipeline stage definitions. + */ +typedef enum zio_stage { + ZIO_STAGE_OPEN = 0, /* RWFCI */ + + ZIO_STAGE_ISSUE_ASYNC, /* -W--- */ + + ZIO_STAGE_READ_BP_INIT, /* R---- */ + ZIO_STAGE_WRITE_BP_INIT, /* -W--- */ + + ZIO_STAGE_CHECKSUM_GENERATE, /* -W--- */ + + ZIO_STAGE_GANG_ASSEMBLE, /* RWFC- */ + ZIO_STAGE_GANG_ISSUE, /* RWFC- */ + + ZIO_STAGE_DVA_ALLOCATE, /* -W--- */ + ZIO_STAGE_DVA_FREE, /* --F-- */ + ZIO_STAGE_DVA_CLAIM, /* ---C- */ + + ZIO_STAGE_READY, /* RWFCI */ + + ZIO_STAGE_VDEV_IO_START, /* RW--I */ + ZIO_STAGE_VDEV_IO_DONE, /* RW--I */ + ZIO_STAGE_VDEV_IO_ASSESS, /* RW--I */ + + ZIO_STAGE_CHECKSUM_VERIFY, /* R---- */ + + ZIO_STAGE_DONE, /* RWFCI */ + ZIO_STAGES +} zio_stage_t; + +#define ZIO_INTERLOCK_STAGES \ + ((1U << ZIO_STAGE_READY) | \ + (1U << ZIO_STAGE_DONE)) + +#define ZIO_INTERLOCK_PIPELINE \ + ZIO_INTERLOCK_STAGES + +#define ZIO_VDEV_IO_STAGES \ + ((1U << ZIO_STAGE_VDEV_IO_START) | \ + (1U << ZIO_STAGE_VDEV_IO_DONE) | \ + (1U << ZIO_STAGE_VDEV_IO_ASSESS)) + +#define ZIO_VDEV_CHILD_PIPELINE \ + (ZIO_VDEV_IO_STAGES | \ + (1U << ZIO_STAGE_DONE)) + +#define ZIO_READ_COMMON_STAGES \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_VDEV_IO_STAGES | \ + (1U << ZIO_STAGE_CHECKSUM_VERIFY)) + +#define ZIO_READ_PHYS_PIPELINE \ + ZIO_READ_COMMON_STAGES + +#define ZIO_READ_PIPELINE \ + (ZIO_READ_COMMON_STAGES | \ + (1U << ZIO_STAGE_READ_BP_INIT)) + +#define ZIO_WRITE_COMMON_STAGES \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_VDEV_IO_STAGES | \ + (1U << ZIO_STAGE_ISSUE_ASYNC) | \ + (1U << ZIO_STAGE_CHECKSUM_GENERATE)) + +#define ZIO_WRITE_PHYS_PIPELINE \ + ZIO_WRITE_COMMON_STAGES + +#define ZIO_REWRITE_PIPELINE \ + (ZIO_WRITE_COMMON_STAGES | \ + (1U << ZIO_STAGE_WRITE_BP_INIT)) + +#define ZIO_WRITE_PIPELINE \ + (ZIO_WRITE_COMMON_STAGES | \ + (1U << ZIO_STAGE_WRITE_BP_INIT) | \ + (1U << ZIO_STAGE_DVA_ALLOCATE)) + +#define ZIO_GANG_STAGES \ + ((1U << ZIO_STAGE_GANG_ASSEMBLE) | \ + (1U << ZIO_STAGE_GANG_ISSUE)) + +#define ZIO_FREE_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + (1U << ZIO_STAGE_DVA_FREE)) + +#define ZIO_CLAIM_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + (1U << ZIO_STAGE_DVA_CLAIM)) + +#define ZIO_IOCTL_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + (1U << ZIO_STAGE_VDEV_IO_START) | \ + (1U << ZIO_STAGE_VDEV_IO_ASSESS)) + +#define ZIO_CONFIG_LOCK_BLOCKING_STAGES \ + ((1U << ZIO_STAGE_VDEV_IO_START) | \ + (1U << ZIO_STAGE_DVA_ALLOCATE) | \ + (1U << ZIO_STAGE_DVA_CLAIM)) + +extern void zio_inject_init(void); +extern void zio_inject_fini(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _ZIO_IMPL_H */ diff --git a/module/zfs/include/sys/zvol.h b/module/zfs/include/sys/zvol.h new file mode 100644 index 000000000..06adc667e --- /dev/null +++ b/module/zfs/include/sys/zvol.h @@ -0,0 +1,70 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZVOL_H +#define _SYS_ZVOL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZVOL_OBJ 1ULL +#define ZVOL_ZAP_OBJ 2ULL + +#ifdef _KERNEL +extern int zvol_check_volsize(uint64_t volsize, uint64_t blocksize); +extern int zvol_check_volblocksize(uint64_t volblocksize); +extern int zvol_get_stats(objset_t *os, nvlist_t *nv); +extern void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); +extern int zvol_create_minor(const char *, major_t); +extern int zvol_remove_minor(const char *); +extern int zvol_set_volsize(const char *, major_t, uint64_t); +extern int zvol_set_volblocksize(const char *, uint64_t); + +extern int zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr); +extern int zvol_dump(dev_t dev, caddr_t addr, daddr_t offset, int nblocks); +extern int zvol_close(dev_t dev, int flag, int otyp, cred_t *cr); +extern int zvol_strategy(buf_t *bp); +extern int zvol_read(dev_t dev, uio_t *uiop, cred_t *cr); +extern int zvol_write(dev_t dev, uio_t *uiop, cred_t *cr); +extern int zvol_aread(dev_t dev, struct aio_req *aio, cred_t *cr); +extern int zvol_awrite(dev_t dev, struct aio_req *aio, cred_t *cr); +extern int zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, + int *rvalp); +extern int zvol_busy(void); +extern void zvol_init(void); +extern void zvol_fini(void); +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZVOL_H */ |