diff options
Diffstat (limited to 'include/sys')
-rw-r--r-- | include/sys/Makefile.am | 1 | ||||
-rw-r--r-- | include/sys/dsl_pool.h | 1 | ||||
-rw-r--r-- | include/sys/fs/zfs.h | 20 | ||||
-rw-r--r-- | include/sys/mmp.h | 63 | ||||
-rw-r--r-- | include/sys/spa.h | 5 | ||||
-rw-r--r-- | include/sys/spa_impl.h | 2 | ||||
-rw-r--r-- | include/sys/uberblock.h | 3 | ||||
-rw-r--r-- | include/sys/uberblock_impl.h | 6 | ||||
-rw-r--r-- | include/sys/vdev.h | 2 | ||||
-rw-r--r-- | include/sys/vdev_impl.h | 7 |
10 files changed, 108 insertions, 2 deletions
diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am index 956643801..be606b8c6 100644 --- a/include/sys/Makefile.am +++ b/include/sys/Makefile.am @@ -36,6 +36,7 @@ COMMON_H = \ $(top_srcdir)/include/sys/efi_partition.h \ $(top_srcdir)/include/sys/metaslab.h \ $(top_srcdir)/include/sys/metaslab_impl.h \ + $(top_srcdir)/include/sys/mmp.h \ $(top_srcdir)/include/sys/mntent.h \ $(top_srcdir)/include/sys/multilist.h \ $(top_srcdir)/include/sys/nvpair.h \ diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h index 674dd25b5..d2dabda6d 100644 --- a/include/sys/dsl_pool.h +++ b/include/sys/dsl_pool.h @@ -38,6 +38,7 @@ #include <sys/bpobj.h> #include <sys/bptree.h> #include <sys/rrwlock.h> +#include <sys/mmp.h> #ifdef __cplusplus extern "C" { diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 9fbcfbef3..13b25a695 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -223,6 +223,7 @@ typedef enum { ZPOOL_PROP_MAXBLOCKSIZE, ZPOOL_PROP_TNAME, ZPOOL_PROP_MAXDNODESIZE, + ZPOOL_PROP_MULTIHOST, ZPOOL_NUM_PROPS } zpool_prop_t; @@ -651,6 +652,11 @@ typedef struct zpool_rewind_policy { #define ZPOOL_CONFIG_VDEV_TOP_ZAP "com.delphix:vdev_zap_top" #define ZPOOL_CONFIG_VDEV_LEAF_ZAP "com.delphix:vdev_zap_leaf" #define ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS "com.delphix:has_per_vdev_zaps" +#define ZPOOL_CONFIG_MMP_STATE "mmp_state" /* not stored on disk */ +#define ZPOOL_CONFIG_MMP_TXG "mmp_txg" /* not stored on disk */ +#define ZPOOL_CONFIG_MMP_HOSTNAME "mmp_hostname" /* not stored on disk */ +#define ZPOOL_CONFIG_MMP_HOSTID "mmp_hostid" /* not stored on disk */ + /* * The persistent vdev state is stored as separate values rather than a single * 'vdev_state' entry. This is because a device can be in multiple states, such @@ -744,7 +750,8 @@ typedef enum vdev_aux { VDEV_AUX_EXTERNAL, /* external diagnosis or forced fault */ VDEV_AUX_SPLIT_POOL, /* vdev was split off into another pool */ VDEV_AUX_BAD_ASHIFT, /* vdev ashift is invalid */ - VDEV_AUX_EXTERNAL_PERSIST /* persistent forced fault */ + VDEV_AUX_EXTERNAL_PERSIST, /* persistent forced fault */ + VDEV_AUX_ACTIVE, /* vdev active on a different host */ } vdev_aux_t; /* @@ -765,6 +772,16 @@ typedef enum pool_state { } pool_state_t; /* + * mmp state. The following states provide additional detail describing + * why a pool couldn't be safely imported. + */ +typedef enum mmp_state { + MMP_STATE_ACTIVE = 0, /* In active use */ + MMP_STATE_INACTIVE, /* Inactive and safe to import */ + MMP_STATE_NO_HOSTID /* System hostid is not set */ +} mmp_state_t; + +/* * Scan Functions. */ typedef enum pool_scan_func { @@ -1126,6 +1143,7 @@ typedef enum { #define ZFS_IMPORT_MISSING_LOG 0x4 #define ZFS_IMPORT_ONLY 0x8 #define ZFS_IMPORT_TEMP_NAME 0x10 +#define ZFS_IMPORT_SKIP_MMP 0x20 /* * Sysevent payload members. ZFS will generate the following sysevents with the diff --git a/include/sys/mmp.h b/include/sys/mmp.h new file mode 100644 index 000000000..4da612d6a --- /dev/null +++ b/include/sys/mmp.h @@ -0,0 +1,63 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (C) 2017 by Lawrence Livermore National Security, LLC. + */ + +#ifndef _SYS_MMP_H +#define _SYS_MMP_H + +#include <sys/spa.h> +#include <sys/zfs_context.h> +#include <sys/uberblock_impl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define MMP_MIN_INTERVAL 100 /* ms */ +#define MMP_DEFAULT_INTERVAL 1000 /* ms */ +#define MMP_DEFAULT_IMPORT_INTERVALS 10 +#define MMP_DEFAULT_FAIL_INTERVALS 5 + +typedef struct mmp_thread { + kmutex_t mmp_thread_lock; /* protect thread mgmt fields */ + kcondvar_t mmp_thread_cv; + kthread_t *mmp_thread; + uint8_t mmp_thread_exiting; + kmutex_t mmp_io_lock; /* protect below */ + hrtime_t mmp_last_write; /* last successful MMP write */ + uint64_t mmp_delay; /* decaying avg ns between MMP writes */ + uberblock_t mmp_ub; /* last ub written by sync */ + zio_t *mmp_zio_root; /* root of mmp write zios */ +} mmp_thread_t; + + +extern void mmp_init(struct spa *spa); +extern void mmp_fini(struct spa *spa); +extern void mmp_thread_start(struct spa *spa); +extern void mmp_thread_stop(struct spa *spa); +extern void mmp_update_uberblock(struct spa *spa, struct uberblock *ub); + +/* Global tuning */ +extern ulong_t zfs_multihost_interval; +extern uint_t zfs_multihost_fail_intervals; +extern uint_t zfs_multihost_import_intervals; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_MMP_H */ diff --git a/include/sys/spa.h b/include/sys/spa.h index dd86aad40..de942ad2b 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -729,6 +729,7 @@ typedef struct spa_stats { spa_stats_history_t txg_history; spa_stats_history_t tx_assign_histogram; spa_stats_history_t io_history; + spa_stats_history_t mmp_history; } spa_stats_t; typedef enum txg_state { @@ -758,6 +759,8 @@ extern txg_stat_t *spa_txg_history_init_io(spa_t *, uint64_t, struct dsl_pool *); extern void spa_txg_history_fini_io(spa_t *, txg_stat_t *); extern void spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs); +extern void spa_mmp_history_add(uint64_t txg, uint64_t timestamp, + uint64_t mmp_delay, vdev_t *vd, int label); /* Pool configuration locks */ extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw); @@ -860,6 +863,8 @@ extern boolean_t spa_has_pending_synctask(spa_t *spa); extern int spa_maxblocksize(spa_t *spa); extern int spa_maxdnodesize(spa_t *spa); extern void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp); +extern boolean_t spa_multihost(spa_t *spa); +extern unsigned long spa_get_hostid(void); extern int spa_mode(spa_t *spa); extern uint64_t zfs_strtonum(const char *str, char **nptr); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 9142242cb..06de24421 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -275,6 +275,8 @@ struct spa { spa_stats_t spa_stats; /* assorted spa statistics */ hrtime_t spa_ccw_fail_time; /* Conf cache write fail time */ taskq_t *spa_zvol_taskq; /* Taskq for minor management */ + uint64_t spa_multihost; /* multihost aware (mmp) */ + mmp_thread_t spa_mmp; /* multihost mmp thread */ /* * spa_refcount & spa_config_lock must be the last elements diff --git a/include/sys/uberblock.h b/include/sys/uberblock.h index 21e7ae0de..044e43838 100644 --- a/include/sys/uberblock.h +++ b/include/sys/uberblock.h @@ -40,7 +40,8 @@ extern "C" { typedef struct uberblock uberblock_t; extern int uberblock_verify(uberblock_t *); -extern boolean_t uberblock_update(uberblock_t *, vdev_t *, uint64_t); +extern boolean_t uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg, + uint64_t mmp_delay); #ifdef __cplusplus } diff --git a/include/sys/uberblock_impl.h b/include/sys/uberblock_impl.h index 6ab6aa313..9fdc70b91 100644 --- a/include/sys/uberblock_impl.h +++ b/include/sys/uberblock_impl.h @@ -43,6 +43,7 @@ extern "C" { */ #define UBERBLOCK_MAGIC 0x00bab10c /* oo-ba-bloc! */ #define UBERBLOCK_SHIFT 10 /* up to 1K */ +#define MMP_MAGIC 0xa11cea11 /* all-see-all */ struct uberblock { uint64_t ub_magic; /* UBERBLOCK_MAGIC */ @@ -54,6 +55,11 @@ struct uberblock { /* highest SPA_VERSION supported by software that wrote this txg */ uint64_t ub_software_version; + + /* Maybe missing in uberblocks we read, but always written */ + uint64_t ub_mmp_magic; /* MMP_MAGIC */ + uint64_t ub_mmp_delay; /* nanosec since last MMP write */ + uint64_t ub_mmp_seq; /* reserved for sequence number */ }; #ifdef __cplusplus diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 63b4904c5..7157ef43f 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -155,6 +155,8 @@ extern int vdev_label_number(uint64_t psise, uint64_t offset); extern nvlist_t *vdev_label_read_config(vdev_t *vd, uint64_t txg); extern void vdev_uberblock_load(vdev_t *, struct uberblock *, nvlist_t **); extern void vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv); +extern void vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t + offset, uint64_t size, zio_done_func_t *done, void *private, int flags); typedef enum { VDEV_LABEL_CREATE, /* create/add a new device */ diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 835d2dbbf..7c5e54b08 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -238,6 +238,7 @@ struct vdev { zio_t *vdev_probe_zio; /* root of current probe */ vdev_aux_t vdev_label_aux; /* on-disk aux state */ uint64_t vdev_leaf_zap; + hrtime_t vdev_mmp_pending; /* 0 if write finished */ /* * For DTrace to work in userland (libzpool) context, these fields must @@ -268,6 +269,12 @@ struct vdev { #define VDEV_PHYS_SIZE (112 << 10) #define VDEV_UBERBLOCK_RING (128 << 10) +/* + * MMP blocks occupy the last MMP_BLOCKS_PER_LABEL slots in the uberblock + * ring when MMP is enabled. + */ +#define MMP_BLOCKS_PER_LABEL 1 + /* The largest uberblock we support is 8k. */ #define MAX_UBERBLOCK_SHIFT (13) #define VDEV_UBERBLOCK_SHIFT(vd) \ |