summaryrefslogtreecommitdiffstats
path: root/module/zfs
diff options
context:
space:
mode:
authorBrian Behlendorf <[email protected]>2009-07-02 15:44:48 -0700
committerBrian Behlendorf <[email protected]>2009-07-02 15:44:48 -0700
commit9babb37438b58e77bad04e820d5702e15b79e6a6 (patch)
treee369da81095eca3fc155b0c02bdd4a9f06506781 /module/zfs
parentd164b2093561a9771db07346e6fffc9ca19427a2 (diff)
Rebase master to b117
Diffstat (limited to 'module/zfs')
-rw-r--r--module/zfs/arc.c84
-rw-r--r--module/zfs/dbuf.c154
-rw-r--r--module/zfs/dmu.c82
-rw-r--r--module/zfs/dmu_object.c44
-rw-r--r--module/zfs/dmu_objset.c307
-rw-r--r--module/zfs/dmu_send.c44
-rw-r--r--module/zfs/dmu_traverse.c62
-rw-r--r--module/zfs/dmu_tx.c179
-rw-r--r--module/zfs/dnode.c86
-rw-r--r--module/zfs/dnode_sync.c22
-rw-r--r--module/zfs/dsl_dataset.c59
-rw-r--r--module/zfs/dsl_dir.c15
-rw-r--r--module/zfs/dsl_pool.c67
-rw-r--r--module/zfs/dsl_prop.c87
-rw-r--r--module/zfs/dsl_scrub.c56
-rw-r--r--module/zfs/fletcher.c104
-rw-r--r--module/zfs/include/sys/arc.h4
-rw-r--r--module/zfs/include/sys/dbuf.h3
-rw-r--r--module/zfs/include/sys/dmu.h34
-rw-r--r--module/zfs/include/sys/dmu_objset.h26
-rw-r--r--module/zfs/include/sys/dnode.h11
-rw-r--r--module/zfs/include/sys/dsl_dataset.h4
-rw-r--r--module/zfs/include/sys/dsl_deleg.h8
-rw-r--r--module/zfs/include/sys/dsl_dir.h3
-rw-r--r--module/zfs/include/sys/dsl_pool.h5
-rw-r--r--module/zfs/include/sys/dsl_prop.h7
-rw-r--r--module/zfs/include/sys/metaslab.h6
-rw-r--r--module/zfs/include/sys/metaslab_impl.h5
-rw-r--r--module/zfs/include/sys/spa.h7
-rw-r--r--module/zfs/include/sys/spa_boot.h5
-rw-r--r--module/zfs/include/sys/spa_impl.h8
-rw-r--r--module/zfs/include/sys/space_map.h6
-rw-r--r--module/zfs/include/sys/vdev.h8
-rw-r--r--module/zfs/include/sys/vdev_impl.h31
-rw-r--r--module/zfs/include/sys/zap.h11
-rw-r--r--module/zfs/include/sys/zap_impl.h6
-rw-r--r--module/zfs/include/sys/zfs_acl.h29
-rw-r--r--module/zfs/include/sys/zfs_context.h5
-rw-r--r--module/zfs/include/sys/zfs_ctldir.h5
-rw-r--r--module/zfs/include/sys/zfs_dir.h6
-rw-r--r--module/zfs/include/sys/zfs_fuid.h28
-rw-r--r--module/zfs/include/sys/zfs_ioctl.h14
-rw-r--r--module/zfs/include/sys/zfs_vfsops.h20
-rw-r--r--module/zfs/include/sys/zfs_znode.h5
-rw-r--r--module/zfs/include/sys/zil.h34
-rw-r--r--module/zfs/include/sys/zil_impl.h5
-rw-r--r--module/zfs/include/sys/zio.h54
-rw-r--r--module/zfs/metaslab.c242
-rw-r--r--module/zfs/spa.c965
-rw-r--r--module/zfs/spa_config.c7
-rw-r--r--module/zfs/spa_errlog.c11
-rw-r--r--module/zfs/spa_history.c13
-rw-r--r--module/zfs/spa_misc.c6
-rw-r--r--module/zfs/space_map.c37
-rw-r--r--module/zfs/vdev.c359
-rw-r--r--module/zfs/vdev_label.c65
-rw-r--r--module/zfs/vdev_queue.c42
-rw-r--r--module/zfs/vdev_raidz.c4
-rw-r--r--module/zfs/zap.c61
-rw-r--r--module/zfs/zap_leaf.c9
-rw-r--r--module/zfs/zap_micro.c84
-rw-r--r--module/zfs/zfs_acl.c458
-rw-r--r--module/zfs/zfs_ctldir.c180
-rw-r--r--module/zfs/zfs_dir.c41
-rw-r--r--module/zfs/zfs_fm.c59
-rw-r--r--module/zfs/zfs_fuid.c230
-rw-r--r--module/zfs/zfs_ioctl.c1036
-rw-r--r--module/zfs/zfs_log.c30
-rw-r--r--module/zfs/zfs_vfsops.c608
-rw-r--r--module/zfs/zfs_vnops.c435
-rw-r--r--module/zfs/zfs_znode.c227
-rw-r--r--module/zfs/zil.c172
-rw-r--r--module/zfs/zio.c194
-rw-r--r--module/zfs/zio_inject.c10
-rw-r--r--module/zfs/zvol.c143
75 files changed, 5278 insertions, 2305 deletions
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 3a9598a92..d5e5aa544 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -124,6 +124,7 @@
#include <sys/arc.h>
#include <sys/refcount.h>
#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
#ifdef _KERNEL
#include <sys/vmsystm.h>
#include <vm/anon.h>
@@ -397,6 +398,7 @@ static arc_state_t *arc_l2c_only;
static int arc_no_grow; /* Don't try to grow cache size */
static uint64_t arc_tempreserve;
+static uint64_t arc_loaned_bytes;
static uint64_t arc_meta_used;
static uint64_t arc_meta_limit;
static uint64_t arc_meta_max = 0;
@@ -610,7 +612,7 @@ typedef struct l2arc_write_callback {
struct l2arc_buf_hdr {
/* protected by arc_buf_hdr mutex */
l2arc_dev_t *b_dev; /* L2ARC device */
- daddr_t b_daddr; /* disk address, offset byte */
+ uint64_t b_daddr; /* disk address, offset byte */
};
typedef struct l2arc_data_free {
@@ -1203,6 +1205,41 @@ arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
return (buf);
}
+static char *arc_onloan_tag = "onloan";
+
+/*
+ * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
+ * flight data by arc_tempreserve_space() until they are "returned". Loaned
+ * buffers must be returned to the arc before they can be used by the DMU or
+ * freed.
+ */
+arc_buf_t *
+arc_loan_buf(spa_t *spa, int size)
+{
+ arc_buf_t *buf;
+
+ buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
+
+ atomic_add_64(&arc_loaned_bytes, size);
+ return (buf);
+}
+
+/*
+ * Return a loaned arc buffer to the arc.
+ */
+void
+arc_return_buf(arc_buf_t *buf, void *tag)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ ASSERT(hdr->b_state == arc_anon);
+ ASSERT(buf->b_data != NULL);
+ VERIFY(refcount_remove(&hdr->b_refcnt, arc_onloan_tag) == 0);
+ VERIFY(refcount_add(&hdr->b_refcnt, tag) == 1);
+
+ atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
+}
+
static arc_buf_t *
arc_buf_clone(arc_buf_t *from)
{
@@ -2504,7 +2541,6 @@ arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf,
uint32_t *arc_flags, const zbookmark_t *zb)
{
int err;
- arc_buf_hdr_t *hdr = pbuf->b_hdr;
ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt));
ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size);
@@ -2512,9 +2548,8 @@ arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf,
err = arc_read_nolock(pio, spa, bp, done, private, priority,
zio_flags, arc_flags, zb);
-
- ASSERT3P(hdr, ==, pbuf->b_hdr);
rw_exit(&pbuf->b_lock);
+
return (err);
}
@@ -2604,7 +2639,7 @@ top:
uint64_t size = BP_GET_LSIZE(bp);
arc_callback_t *acb;
vdev_t *vd = NULL;
- daddr_t addr;
+ uint64_t addr;
boolean_t devw = B_FALSE;
if (hdr == NULL) {
@@ -2923,6 +2958,7 @@ arc_release(arc_buf_t *buf, void *tag)
kmutex_t *hash_lock;
l2arc_buf_hdr_t *l2hdr;
uint64_t buf_size;
+ boolean_t released = B_FALSE;
rw_enter(&buf->b_lock, RW_WRITER);
hdr = buf->b_hdr;
@@ -2938,12 +2974,12 @@ arc_release(arc_buf_t *buf, void *tag)
ASSERT(buf->b_efunc == NULL);
arc_buf_thaw(buf);
rw_exit(&buf->b_lock);
- return;
+ released = B_TRUE;
+ } else {
+ hash_lock = HDR_LOCK(hdr);
+ mutex_enter(hash_lock);
}
- hash_lock = HDR_LOCK(hdr);
- mutex_enter(hash_lock);
-
l2hdr = hdr->b_l2hdr;
if (l2hdr) {
mutex_enter(&l2arc_buflist_mtx);
@@ -2951,6 +2987,9 @@ arc_release(arc_buf_t *buf, void *tag)
buf_size = hdr->b_size;
}
+ if (released)
+ goto out;
+
/*
* Do we have more than one buf?
*/
@@ -3018,6 +3057,7 @@ arc_release(arc_buf_t *buf, void *tag)
buf->b_efunc = NULL;
buf->b_private = NULL;
+out:
if (l2hdr) {
list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
@@ -3311,10 +3351,9 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
}
static int
-arc_memory_throttle(uint64_t reserve, uint64_t txg)
+arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
{
#ifdef _KERNEL
- uint64_t inflight_data = arc_anon->arcs_size;
uint64_t available_memory = ptob(freemem);
static uint64_t page_load = 0;
static uint64_t last_txg = 0;
@@ -3376,6 +3415,7 @@ int
arc_tempreserve_space(uint64_t reserve, uint64_t txg)
{
int error;
+ uint64_t anon_size;
#ifdef ZFS_DEBUG
/*
@@ -3392,11 +3432,18 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg)
return (ENOMEM);
/*
+ * Don't count loaned bufs as in flight dirty data to prevent long
+ * network delays from blocking transactions that are ready to be
+ * assigned to a txg.
+ */
+ anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
+
+ /*
* Writes will, almost always, require additional memory allocations
* in order to compress/encrypt/etc the data. We therefor need to
* make sure that there is sufficient available memory for this.
*/
- if (error = arc_memory_throttle(reserve, txg))
+ if (error = arc_memory_throttle(reserve, anon_size, txg))
return (error);
/*
@@ -3406,8 +3453,9 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg)
* Note: if two requests come in concurrently, we might let them
* both succeed, when one of them should fail. Not a huge deal.
*/
- if (reserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 &&
- arc_anon->arcs_size > arc_c / 4) {
+
+ if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
+ anon_size > arc_c / 4) {
dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
"anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
arc_tempreserve>>10,
@@ -3592,6 +3640,8 @@ arc_fini(void)
mutex_destroy(&zfs_write_limit_lock);
buf_fini();
+
+ ASSERT(arc_loaned_bytes == 0);
}
/*
@@ -4486,7 +4536,7 @@ l2arc_vdev_present(vdev_t *vd)
* validated the vdev and opened it.
*/
void
-l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end)
+l2arc_add_vdev(spa_t *spa, vdev_t *vd)
{
l2arc_dev_t *adddev;
@@ -4500,8 +4550,8 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end)
adddev->l2ad_vdev = vd;
adddev->l2ad_write = l2arc_write_max;
adddev->l2ad_boost = l2arc_write_boost;
- adddev->l2ad_start = start;
- adddev->l2ad_end = end;
+ adddev->l2ad_start = VDEV_LABEL_START_SIZE;
+ adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
adddev->l2ad_hand = adddev->l2ad_start;
adddev->l2ad_evict = adddev->l2ad_start;
adddev->l2ad_first = B_TRUE;
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 113fa1f03..1b6f242dd 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -329,7 +329,7 @@ dbuf_verify(dmu_buf_impl_t *db)
if (db->db_parent == dn->dn_dbuf) {
/* db is pointed to by the dnode */
/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
- if (db->db.db_object == DMU_META_DNODE_OBJECT)
+ if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
ASSERT(db->db_parent == NULL);
else
ASSERT(db->db_parent != NULL);
@@ -465,15 +465,15 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
ASSERT(db->db_buf == NULL);
if (db->db_blkid == DB_BONUS_BLKID) {
- int bonuslen = dn->dn_bonuslen;
+ int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
ASSERT3U(bonuslen, <=, db->db.db_size);
db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
if (bonuslen < DN_MAX_BONUSLEN)
bzero(db->db.db_data, DN_MAX_BONUSLEN);
- bcopy(DN_BONUS(dn->dn_phys), db->db.db_data,
- bonuslen);
+ if (bonuslen)
+ bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
dbuf_update_data(db);
db->db_state = DB_CACHED;
mutex_exit(&db->db_mtx);
@@ -908,15 +908,11 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
* Shouldn't dirty a regular buffer in syncing context. Private
* objects may be dirtied in syncing context, but only if they
* were already pre-dirtied in open context.
- * XXX We may want to prohibit dirtying in syncing context even
- * if they did pre-dirty.
*/
ASSERT(!dmu_tx_is_syncing(tx) ||
BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
- dn->dn_object == DMU_META_DNODE_OBJECT ||
- dn->dn_objset->os_dsl_dataset == NULL ||
- dsl_dir_is_private(dn->dn_objset->os_dsl_dataset->ds_dir));
-
+ DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
+ dn->dn_objset->os_dsl_dataset == NULL);
/*
* We make this assert for private objects as well, but after we
* check if we're already dirty. They are allowed to re-dirty
@@ -975,7 +971,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
/*
* Only valid if not already dirty.
*/
- ASSERT(dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
+ ASSERT(dn->dn_object == 0 ||
+ dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
(dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
ASSERT3U(dn->dn_nlevels, >, db->db_level);
@@ -987,15 +984,13 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
/*
* We should only be dirtying in syncing context if it's the
- * mos, a spa os, or we're initializing the os. However, we are
- * allowed to dirty in syncing context provided we already
- * dirtied it in open context. Hence we must make this
- * assertion only if we're not already dirty.
+ * mos or we're initializing the os or it's a special object.
+ * However, we are allowed to dirty in syncing context provided
+ * we already dirtied it in open context. Hence we must make
+ * this assertion only if we're not already dirty.
*/
- ASSERT(!dmu_tx_is_syncing(tx) ||
- os->os_dsl_dataset == NULL ||
- !dsl_dir_is_private(os->os_dsl_dataset->ds_dir) ||
- !BP_IS_HOLE(os->os_rootbp));
+ ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
+ os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
ASSERT(db->db.db_size != 0);
dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
@@ -1312,6 +1307,68 @@ dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
}
/*
+ * Directly assign a provided arc buf to a given dbuf if it's not referenced
+ * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
+ */
+void
+dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
+{
+ ASSERT(!refcount_is_zero(&db->db_holds));
+ ASSERT(db->db_dnode->dn_object != DMU_META_DNODE_OBJECT);
+ ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ ASSERT(db->db_level == 0);
+ ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
+ ASSERT(buf != NULL);
+ ASSERT(arc_buf_size(buf) == db->db.db_size);
+ ASSERT(tx->tx_txg != 0);
+
+ arc_return_buf(buf, db);
+ ASSERT(arc_released(buf));
+
+ mutex_enter(&db->db_mtx);
+
+ while (db->db_state == DB_READ || db->db_state == DB_FILL)
+ cv_wait(&db->db_changed, &db->db_mtx);
+
+ ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
+
+ if (db->db_state == DB_CACHED &&
+ refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
+ mutex_exit(&db->db_mtx);
+ (void) dbuf_dirty(db, tx);
+ bcopy(buf->b_data, db->db.db_data, db->db.db_size);
+ VERIFY(arc_buf_remove_ref(buf, db) == 1);
+ return;
+ }
+
+ if (db->db_state == DB_CACHED) {
+ dbuf_dirty_record_t *dr = db->db_last_dirty;
+
+ ASSERT(db->db_buf != NULL);
+ if (dr != NULL && dr->dr_txg == tx->tx_txg) {
+ ASSERT(dr->dt.dl.dr_data == db->db_buf);
+ if (!arc_released(db->db_buf)) {
+ ASSERT(dr->dt.dl.dr_override_state ==
+ DR_OVERRIDDEN);
+ arc_release(db->db_buf, db);
+ }
+ dr->dt.dl.dr_data = buf;
+ VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1);
+ } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
+ arc_release(db->db_buf, db);
+ VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1);
+ }
+ db->db_buf = NULL;
+ }
+ ASSERT(db->db_buf == NULL);
+ dbuf_set_data(db, buf);
+ db->db_state = DB_FILL;
+ mutex_exit(&db->db_mtx);
+ (void) dbuf_dirty(db, tx);
+ dbuf_fill_done(db, tx);
+}
+
+/*
* "Clear" the contents of this dbuf. This will mark the dbuf
* EVICTING and clear *most* of its references. Unfortunetely,
* when we are not holding the dn_dbufs_mtx, we can't clear the
@@ -1855,6 +1912,19 @@ dmu_buf_get_user(dmu_buf_t *db_fake)
return (db->db_user_ptr);
}
+boolean_t
+dmu_buf_freeable(dmu_buf_t *dbuf)
+{
+ boolean_t res = B_FALSE;
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
+
+ if (db->db_blkptr)
+ res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
+ db->db_blkptr->blk_birth);
+
+ return (res);
+}
+
static void
dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
{
@@ -1943,7 +2013,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
dnode_t *dn = db->db_dnode;
objset_impl_t *os = dn->dn_objset;
uint64_t txg = tx->tx_txg;
- int blksz;
ASSERT(dmu_tx_is_syncing(tx));
@@ -2049,32 +2118,25 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
return;
}
- if (db->db_state != DB_NOFILL) {
- blksz = arc_buf_size(*datap);
-
- if (dn->dn_object != DMU_META_DNODE_OBJECT) {
- /*
- * If this buffer is currently "in use" (i.e., there
- * are active holds and db_data still references it),
- * then make a copy before we start the write so that
- * any modifications from the open txg will not leak
- * into this write.
- *
- * NOTE: this copy does not need to be made for
- * objects only modified in the syncing context (e.g.
- * DNONE_DNODE blocks).
- */
- if (refcount_count(&db->db_holds) > 1 &&
- *datap == db->db_buf) {
- arc_buf_contents_t type =
- DBUF_GET_BUFC_TYPE(db);
- *datap =
- arc_buf_alloc(os->os_spa, blksz, db, type);
- bcopy(db->db.db_data, (*datap)->b_data, blksz);
- }
- }
-
- ASSERT(*datap != NULL);
+ if (db->db_state != DB_NOFILL &&
+ dn->dn_object != DMU_META_DNODE_OBJECT &&
+ refcount_count(&db->db_holds) > 1 &&
+ *datap == db->db_buf) {
+ /*
+ * If this buffer is currently "in use" (i.e., there
+ * are active holds and db_data still references it),
+ * then make a copy before we start the write so that
+ * any modifications from the open txg will not leak
+ * into this write.
+ *
+ * NOTE: this copy does not need to be made for
+ * objects only modified in the syncing context (e.g.
+ * DNONE_DNODE blocks).
+ */
+ int blksz = arc_buf_size(*datap);
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+ *datap = arc_buf_alloc(os->os_spa, blksz, db, type);
+ bcopy(db->db.db_data, (*datap)->b_data, blksz);
}
db->db_data_pending = dr;
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index b6205bd50..785c7c621 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -85,6 +85,8 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{ byteswap_uint64_array, TRUE, "FUID table size" },
{ zap_byteswap, TRUE, "DSL dataset next clones"},
{ zap_byteswap, TRUE, "scrub work queue" },
+ { zap_byteswap, TRUE, "ZFS user/group used" },
+ { zap_byteswap, TRUE, "ZFS user/group quota" },
};
int
@@ -180,22 +182,22 @@ dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
* whose dnodes are in the same block.
*/
static int
-dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset,
- uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
+dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
+ int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
{
dsl_pool_t *dp = NULL;
dmu_buf_t **dbp;
uint64_t blkid, nblks, i;
- uint32_t flags;
+ uint32_t dbuf_flags;
int err;
zio_t *zio;
hrtime_t start;
ASSERT(length <= DMU_MAX_ACCESS);
- flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT;
- if (length > zfetch_array_rd_sz)
- flags |= DB_RF_NOPREFETCH;
+ dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT;
+ if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz)
+ dbuf_flags |= DB_RF_NOPREFETCH;
rw_enter(&dn->dn_struct_rwlock, RW_READER);
if (dn->dn_datablkshift) {
@@ -233,7 +235,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset,
/* initiate async i/o */
if (read) {
rw_exit(&dn->dn_struct_rwlock);
- (void) dbuf_read(db, zio, flags);
+ (void) dbuf_read(db, zio, dbuf_flags);
rw_enter(&dn->dn_struct_rwlock, RW_READER);
}
dbp[i] = &db->db;
@@ -285,7 +287,7 @@ dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
return (err);
err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
- numbufsp, dbpp);
+ numbufsp, dbpp, DMU_READ_PREFETCH);
dnode_rele(dn, FTAG);
@@ -300,7 +302,7 @@ dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
int err;
err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
- numbufsp, dbpp);
+ numbufsp, dbpp, DMU_READ_PREFETCH);
return (err);
}
@@ -442,7 +444,8 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
object_size = align == 1 ? dn->dn_datablksz :
(dn->dn_maxblkid + 1) << dn->dn_datablkshift;
- if (trunc || (end = offset + length) > object_size)
+ end = offset + length;
+ if (trunc || end > object_size)
end = object_size;
if (end <= offset)
return (0);
@@ -450,6 +453,7 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
while (length) {
start = end;
+ /* assert(offset <= start) */
err = get_next_chunk(dn, &start, offset);
if (err)
return (err);
@@ -540,7 +544,7 @@ dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
int
dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
- void *buf)
+ void *buf, uint32_t flags)
{
dnode_t *dn;
dmu_buf_t **dbp;
@@ -570,7 +574,7 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
* to be reading in parallel.
*/
err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
- TRUE, FTAG, &numbufs, &dbp);
+ TRUE, FTAG, &numbufs, &dbp, flags);
if (err)
break;
@@ -810,6 +814,58 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
}
#endif
+/*
+ * Allocate a loaned anonymous arc buffer.
+ */
+arc_buf_t *
+dmu_request_arcbuf(dmu_buf_t *handle, int size)
+{
+ dnode_t *dn = ((dmu_buf_impl_t *)handle)->db_dnode;
+
+ return (arc_loan_buf(dn->dn_objset->os_spa, size));
+}
+
+/*
+ * Free a loaned arc buffer.
+ */
+void
+dmu_return_arcbuf(arc_buf_t *buf)
+{
+ arc_return_buf(buf, FTAG);
+ VERIFY(arc_buf_remove_ref(buf, FTAG) == 1);
+}
+
+/*
+ * When possible directly assign passed loaned arc buffer to a dbuf.
+ * If this is not possible copy the contents of passed arc buf via
+ * dmu_write().
+ */
+void
+dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
+ dmu_tx_t *tx)
+{
+ dnode_t *dn = ((dmu_buf_impl_t *)handle)->db_dnode;
+ dmu_buf_impl_t *db;
+ uint32_t blksz = (uint32_t)arc_buf_size(buf);
+ uint64_t blkid;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ blkid = dbuf_whichblock(dn, offset);
+ VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
+ rw_exit(&dn->dn_struct_rwlock);
+
+ if (offset == db->db.db_offset && blksz == db->db.db_size) {
+ dbuf_assign_arcbuf(db, buf, tx);
+ dbuf_rele(db, FTAG);
+ } else {
+ dbuf_rele(db, FTAG);
+ ASSERT(dn->dn_objset->os.os == dn->dn_objset);
+ dmu_write(&dn->dn_objset->os, dn->dn_object, offset, blksz,
+ buf->b_data, tx);
+ dmu_return_arcbuf(buf);
+ }
+}
+
typedef struct {
dbuf_dirty_record_t *dr;
dmu_sync_cb_t *done;
diff --git a/module/zfs/dmu_object.c b/module/zfs/dmu_object.c
index 1b9247d66..1f91fc1ad 100644
--- a/module/zfs/dmu_object.c
+++ b/module/zfs/dmu_object.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
@@ -108,22 +106,56 @@ dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
int
dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
- int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+ int blocksize, dmu_object_type_t bonustype, int bonuslen)
{
dnode_t *dn;
+ dmu_tx_t *tx;
+ int nblkptr;
int err;
- if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
+ if (object == DMU_META_DNODE_OBJECT)
return (EBADF);
err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED,
FTAG, &dn);
if (err)
return (err);
+
+ if (dn->dn_type == ot && dn->dn_datablksz == blocksize &&
+ dn->dn_bonustype == bonustype && dn->dn_bonuslen == bonuslen) {
+ /* nothing is changing, this is a noop */
+ dnode_rele(dn, FTAG);
+ return (0);
+ }
+
+ nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+
+ /*
+ * If we are losing blkptrs or changing the block size this must
+ * be a new file instance. We must clear out the previous file
+ * contents before we can change this type of metadata in the dnode.
+ */
+ if (dn->dn_nblkptr > nblkptr || dn->dn_datablksz != blocksize) {
+ err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END);
+ if (err)
+ goto out;
+ }
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_bonus(tx, object);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err) {
+ dmu_tx_abort(tx);
+ goto out;
+ }
+
dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, tx);
+
+ dmu_tx_commit(tx);
+out:
dnode_rele(dn, FTAG);
- return (0);
+ return (err);
}
int
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
index c9e00d511..e962c4b88 100644
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -164,10 +164,15 @@ dmu_objset_byteswap(void *buf, size_t size)
{
objset_phys_t *osp = buf;
- ASSERT(size == sizeof (objset_phys_t));
+ ASSERT(size == OBJSET_OLD_PHYS_SIZE || size == sizeof (objset_phys_t));
dnode_byteswap(&osp->os_meta_dnode);
byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t));
osp->os_type = BSWAP_64(osp->os_type);
+ osp->os_flags = BSWAP_64(osp->os_flags);
+ if (size == sizeof (objset_phys_t)) {
+ dnode_byteswap(&osp->os_userused_dnode);
+ dnode_byteswap(&osp->os_groupused_dnode);
+ }
}
int
@@ -210,12 +215,30 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
err = EIO;
return (err);
}
+
+ /* Increase the blocksize if we are permitted. */
+ if (spa_version(spa) >= SPA_VERSION_USERSPACE &&
+ arc_buf_size(osi->os_phys_buf) < sizeof (objset_phys_t)) {
+ arc_buf_t *buf = arc_buf_alloc(spa,
+ sizeof (objset_phys_t), &osi->os_phys_buf,
+ ARC_BUFC_METADATA);
+ bzero(buf->b_data, sizeof (objset_phys_t));
+ bcopy(osi->os_phys_buf->b_data, buf->b_data,
+ arc_buf_size(osi->os_phys_buf));
+ (void) arc_buf_remove_ref(osi->os_phys_buf,
+ &osi->os_phys_buf);
+ osi->os_phys_buf = buf;
+ }
+
osi->os_phys = osi->os_phys_buf->b_data;
+ osi->os_flags = osi->os_phys->os_flags;
} else {
- osi->os_phys_buf = arc_buf_alloc(spa, sizeof (objset_phys_t),
+ int size = spa_version(spa) >= SPA_VERSION_USERSPACE ?
+ sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE;
+ osi->os_phys_buf = arc_buf_alloc(spa, size,
&osi->os_phys_buf, ARC_BUFC_METADATA);
osi->os_phys = osi->os_phys_buf->b_data;
- bzero(osi->os_phys, sizeof (objset_phys_t));
+ bzero(osi->os_phys, size);
}
/*
@@ -276,6 +299,12 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
osi->os_meta_dnode = dnode_special_open(osi,
&osi->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT);
+ if (arc_buf_size(osi->os_phys_buf) >= sizeof (objset_phys_t)) {
+ osi->os_userused_dnode = dnode_special_open(osi,
+ &osi->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT);
+ osi->os_groupused_dnode = dnode_special_open(osi,
+ &osi->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT);
+ }
/*
* We should be the only thread trying to do this because we
@@ -456,13 +485,15 @@ dmu_objset_evict(dsl_dataset_t *ds, void *arg)
os.os = osi;
(void) dmu_objset_evict_dbufs(&os);
- ASSERT3P(list_head(&osi->os_dnodes), ==, osi->os_meta_dnode);
- ASSERT3P(list_tail(&osi->os_dnodes), ==, osi->os_meta_dnode);
- ASSERT3P(list_head(&osi->os_meta_dnode->dn_dbufs), ==, NULL);
-
dnode_special_close(osi->os_meta_dnode);
+ if (osi->os_userused_dnode) {
+ dnode_special_close(osi->os_userused_dnode);
+ dnode_special_close(osi->os_groupused_dnode);
+ }
zil_free(osi->os_zil);
+ ASSERT3P(list_head(&osi->os_dnodes), ==, NULL);
+
VERIFY(arc_buf_remove_ref(osi->os_phys_buf, &osi->os_phys_buf) == 1);
mutex_destroy(&osi->os_lock);
mutex_destroy(&osi->os_obj_lock);
@@ -520,6 +551,10 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
ASSERT(type != DMU_OST_ANY);
ASSERT(type < DMU_OST_NUMTYPES);
osi->os_phys->os_type = type;
+ if (dmu_objset_userused_enabled(osi)) {
+ osi->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
+ osi->os_flags = osi->os_phys->os_flags;
+ }
dsl_dataset_dirty(ds, tx);
@@ -704,13 +739,33 @@ struct snaparg {
char *snapname;
char failed[MAXPATHLEN];
boolean_t checkperms;
- list_t objsets;
+ nvlist_t *props;
};
-struct osnode {
- list_node_t node;
- objset_t *os;
-};
+static int
+snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ objset_t *os = arg1;
+ struct snaparg *sn = arg2;
+
+ /* The props have already been checked by zfs_check_userprops(). */
+
+ return (dsl_dataset_snapshot_check(os->os->os_dsl_dataset,
+ sn->snapname, tx));
+}
+
+static void
+snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+ objset_t *os = arg1;
+ dsl_dataset_t *ds = os->os->os_dsl_dataset;
+ struct snaparg *sn = arg2;
+
+ dsl_dataset_snapshot_sync(ds, sn->snapname, cr, tx);
+
+ if (sn->props)
+ dsl_props_set_sync(ds->ds_prev, sn->props, cr, tx);
+}
static int
dmu_objset_snapshot_one(char *name, void *arg)
@@ -747,13 +802,8 @@ dmu_objset_snapshot_one(char *name, void *arg)
*/
err = zil_suspend(dmu_objset_zil(os));
if (err == 0) {
- struct osnode *osn;
- dsl_sync_task_create(sn->dstg, dsl_dataset_snapshot_check,
- dsl_dataset_snapshot_sync, os->os->os_dsl_dataset,
- sn->snapname, 3);
- osn = kmem_alloc(sizeof (struct osnode), KM_SLEEP);
- osn->os = os;
- list_insert_tail(&sn->objsets, osn);
+ dsl_sync_task_create(sn->dstg, snapshot_check,
+ snapshot_sync, os, sn, 3);
} else {
dmu_objset_close(os);
}
@@ -762,11 +812,11 @@ dmu_objset_snapshot_one(char *name, void *arg)
}
int
-dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive)
+dmu_objset_snapshot(char *fsname, char *snapname,
+ nvlist_t *props, boolean_t recursive)
{
dsl_sync_task_t *dst;
- struct osnode *osn;
- struct snaparg sn = { 0 };
+ struct snaparg sn;
spa_t *spa;
int err;
@@ -778,8 +828,7 @@ dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive)
sn.dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
sn.snapname = snapname;
- list_create(&sn.objsets, sizeof (struct osnode),
- offsetof(struct osnode, node));
+ sn.props = props;
if (recursive) {
sn.checkperms = B_TRUE;
@@ -790,27 +839,19 @@ dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive)
err = dmu_objset_snapshot_one(fsname, &sn);
}
- if (err)
- goto out;
-
- err = dsl_sync_task_group_wait(sn.dstg);
+ if (err == 0)
+ err = dsl_sync_task_group_wait(sn.dstg);
for (dst = list_head(&sn.dstg->dstg_tasks); dst;
dst = list_next(&sn.dstg->dstg_tasks, dst)) {
- dsl_dataset_t *ds = dst->dst_arg1;
+ objset_t *os = dst->dst_arg1;
+ dsl_dataset_t *ds = os->os->os_dsl_dataset;
if (dst->dst_err)
dsl_dataset_name(ds, sn.failed);
+ zil_resume(dmu_objset_zil(os));
+ dmu_objset_close(os);
}
-out:
- while (osn = list_head(&sn.objsets)) {
- list_remove(&sn.objsets, osn);
- zil_resume(dmu_objset_zil(osn->os));
- dmu_objset_close(osn->os);
- kmem_free(osn, sizeof (struct osnode));
- }
- list_destroy(&sn.objsets);
-
if (err)
(void) strcpy(fsname, sn.failed);
dsl_sync_task_group_destroy(sn.dstg);
@@ -819,7 +860,7 @@ out:
}
static void
-dmu_objset_sync_dnodes(list_t *list, dmu_tx_t *tx)
+dmu_objset_sync_dnodes(list_t *list, list_t *newlist, dmu_tx_t *tx)
{
dnode_t *dn;
@@ -827,14 +868,20 @@ dmu_objset_sync_dnodes(list_t *list, dmu_tx_t *tx)
ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
ASSERT(dn->dn_dbuf->db_data_pending);
/*
- * Initialize dn_zio outside dnode_sync()
- * to accomodate meta-dnode
+ * Initialize dn_zio outside dnode_sync() because the
+ * meta-dnode needs to set it ouside dnode_sync().
*/
dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio;
ASSERT(dn->dn_zio);
ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
list_remove(list, dn);
+
+ if (newlist) {
+ (void) dnode_add_ref(dn, newlist);
+ list_insert_tail(newlist, dn);
+ }
+
dnode_sync(dn, tx);
}
}
@@ -853,9 +900,12 @@ ready(zio_t *zio, arc_buf_t *abuf, void *arg)
ASSERT(BP_GET_LEVEL(bp) == 0);
/*
- * Update rootbp fill count.
+ * Update rootbp fill count: it should be the number of objects
+ * allocated in the object set (not counting the "special"
+ * objects that are stored in the objset_phys_t -- the meta
+ * dnode and user/group accounting objects).
*/
- bp->blk_fill = 1; /* count the meta-dnode */
+ bp->blk_fill = 0;
for (int i = 0; i < dnp->dn_nblkptr; i++)
bp->blk_fill += dnp->dn_blkptr[i].blk_fill;
@@ -878,6 +928,7 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx)
writeprops_t wp = { 0 };
zio_t *zio;
list_t *list;
+ list_t *newlist = NULL;
dbuf_dirty_record_t *dr;
dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);
@@ -915,20 +966,41 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx)
}
arc_release(os->os_phys_buf, &os->os_phys_buf);
+
zio = arc_write(pio, os->os_spa, &wp, DMU_OS_IS_L2CACHEABLE(os),
tx->tx_txg, os->os_rootbp, os->os_phys_buf, ready, NULL, os,
ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
/*
- * Sync meta-dnode - the parent IO for the sync is the root block
+ * Sync special dnodes - the parent IO for the sync is the root block
*/
os->os_meta_dnode->dn_zio = zio;
dnode_sync(os->os_meta_dnode, tx);
+ os->os_phys->os_flags = os->os_flags;
+
+ if (os->os_userused_dnode &&
+ os->os_userused_dnode->dn_type != DMU_OT_NONE) {
+ os->os_userused_dnode->dn_zio = zio;
+ dnode_sync(os->os_userused_dnode, tx);
+ os->os_groupused_dnode->dn_zio = zio;
+ dnode_sync(os->os_groupused_dnode, tx);
+ }
+
txgoff = tx->tx_txg & TXG_MASK;
- dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], tx);
- dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], tx);
+ if (dmu_objset_userused_enabled(os)) {
+ newlist = &os->os_synced_dnodes;
+ /*
+ * We must create the list here because it uses the
+ * dn_dirty_link[] of this txg.
+ */
+ list_create(newlist, sizeof (dnode_t),
+ offsetof(dnode_t, dn_dirty_link[txgoff]));
+ }
+
+ dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx);
+ dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx);
list = &os->os_meta_dnode->dn_dirty_records[txgoff];
while (dr = list_head(list)) {
@@ -945,6 +1017,145 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx)
zio_nowait(zio);
}
+static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES];
+
+void
+dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb)
+{
+ used_cbs[ost] = cb;
+}
+
+boolean_t
+dmu_objset_userused_enabled(objset_impl_t *os)
+{
+ return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE &&
+ used_cbs[os->os_phys->os_type] &&
+ os->os_userused_dnode);
+}
+
+void
+dmu_objset_do_userquota_callbacks(objset_impl_t *os, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ list_t *list = &os->os_synced_dnodes;
+ static const char zerobuf[DN_MAX_BONUSLEN] = {0};
+
+ ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os));
+
+ while (dn = list_head(list)) {
+ dmu_object_type_t bonustype;
+
+ ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
+ ASSERT(dn->dn_oldphys);
+ ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE ||
+ dn->dn_phys->dn_flags &
+ DNODE_FLAG_USERUSED_ACCOUNTED);
+
+ /* Allocate the user/groupused objects if necessary. */
+ if (os->os_userused_dnode->dn_type == DMU_OT_NONE) {
+ VERIFY(0 == zap_create_claim(&os->os,
+ DMU_USERUSED_OBJECT,
+ DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
+ VERIFY(0 == zap_create_claim(&os->os,
+ DMU_GROUPUSED_OBJECT,
+ DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
+ }
+
+ /*
+ * If the object was not previously
+ * accounted, pretend that it was free.
+ */
+ if (!(dn->dn_oldphys->dn_flags &
+ DNODE_FLAG_USERUSED_ACCOUNTED)) {
+ bzero(dn->dn_oldphys, sizeof (dnode_phys_t));
+ }
+
+ /*
+ * If the object was freed, use the previous bonustype.
+ */
+ bonustype = dn->dn_phys->dn_bonustype ?
+ dn->dn_phys->dn_bonustype : dn->dn_oldphys->dn_bonustype;
+ ASSERT(dn->dn_phys->dn_type != 0 ||
+ (bcmp(DN_BONUS(dn->dn_phys), zerobuf,
+ DN_MAX_BONUSLEN) == 0 &&
+ DN_USED_BYTES(dn->dn_phys) == 0));
+ ASSERT(dn->dn_oldphys->dn_type != 0 ||
+ (bcmp(DN_BONUS(dn->dn_oldphys), zerobuf,
+ DN_MAX_BONUSLEN) == 0 &&
+ DN_USED_BYTES(dn->dn_oldphys) == 0));
+ used_cbs[os->os_phys->os_type](&os->os, bonustype,
+ DN_BONUS(dn->dn_oldphys), DN_BONUS(dn->dn_phys),
+ DN_USED_BYTES(dn->dn_oldphys),
+ DN_USED_BYTES(dn->dn_phys), tx);
+
+ /*
+ * The mutex is needed here for interlock with dnode_allocate.
+ */
+ mutex_enter(&dn->dn_mtx);
+ zio_buf_free(dn->dn_oldphys, sizeof (dnode_phys_t));
+ dn->dn_oldphys = NULL;
+ mutex_exit(&dn->dn_mtx);
+
+ list_remove(list, dn);
+ dnode_rele(dn, list);
+ }
+}
+
+boolean_t
+dmu_objset_userspace_present(objset_t *os)
+{
+ return (os->os->os_phys->os_flags &
+ OBJSET_FLAG_USERACCOUNTING_COMPLETE);
+}
+
+int
+dmu_objset_userspace_upgrade(objset_t *os)
+{
+ uint64_t obj;
+ int err = 0;
+
+ if (dmu_objset_userspace_present(os))
+ return (0);
+ if (!dmu_objset_userused_enabled(os->os))
+ return (ENOTSUP);
+ if (dmu_objset_is_snapshot(os))
+ return (EINVAL);
+
+ /*
+ * We simply need to mark every object dirty, so that it will be
+ * synced out and now accounted. If this is called
+ * concurrently, or if we already did some work before crashing,
+ * that's fine, since we track each object's accounted state
+ * independently.
+ */
+
+ for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) {
+ dmu_tx_t *tx = dmu_tx_create(os);
+ dmu_buf_t *db;
+ int objerr;
+
+ if (issig(JUSTLOOKING) && issig(FORREAL))
+ return (EINTR);
+
+ objerr = dmu_bonus_hold(os, obj, FTAG, &db);
+ if (objerr)
+ continue;
+ dmu_tx_hold_bonus(tx, obj);
+ objerr = dmu_tx_assign(tx, TXG_WAIT);
+ if (objerr) {
+ dmu_tx_abort(tx);
+ continue;
+ }
+ dmu_buf_will_dirty(db, tx);
+ dmu_buf_rele(db, FTAG);
+ dmu_tx_commit(tx);
+ }
+
+ os->os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
+ txg_wait_synced(dmu_objset_pool(os), 0);
+ return (0);
+}
+
void
dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
uint64_t *usedobjsp, uint64_t *availobjsp)
@@ -978,6 +1189,8 @@ dmu_objset_stats(objset_t *os, nvlist_t *nv)
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE,
os->os->os_phys->os_type);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING,
+ dmu_objset_userspace_present(os));
}
int
diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c
index 6043df0aa..9ca3999dd 100644
--- a/module/zfs/dmu_send.c
+++ b/module/zfs/dmu_send.c
@@ -161,7 +161,9 @@ backup_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
if (issig(JUSTLOOKING) && issig(FORREAL))
return (EINTR);
- if (bp == NULL && zb->zb_object == 0) {
+ if (zb->zb_object != 0 && DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
+ return (0);
+ } else if (bp == NULL && zb->zb_object == 0) {
uint64_t span = BP_SPAN(dnp, zb->zb_level);
uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT);
@@ -775,11 +777,6 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
dmu_tx_t *tx;
void *data = NULL;
- err = dmu_object_info(os, drro->drr_object, NULL);
-
- if (err != 0 && err != ENOENT)
- return (EINVAL);
-
if (drro->drr_type == DMU_OT_NONE ||
drro->drr_type >= DMU_OT_NUMTYPES ||
drro->drr_bonustype >= DMU_OT_NUMTYPES ||
@@ -792,18 +789,21 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
return (EINVAL);
}
+ err = dmu_object_info(os, drro->drr_object, NULL);
+
+ if (err != 0 && err != ENOENT)
+ return (EINVAL);
+
if (drro->drr_bonuslen) {
data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8));
if (ra->err)
return (ra->err);
}
- tx = dmu_tx_create(os);
-
if (err == ENOENT) {
/* currently free, want to be allocated */
+ tx = dmu_tx_create(os);
dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1);
err = dmu_tx_assign(tx, TXG_WAIT);
if (err) {
dmu_tx_abort(tx);
@@ -812,28 +812,22 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
err = dmu_object_claim(os, drro->drr_object,
drro->drr_type, drro->drr_blksz,
drro->drr_bonustype, drro->drr_bonuslen, tx);
+ dmu_tx_commit(tx);
} else {
/* currently allocated, want to be allocated */
- dmu_tx_hold_bonus(tx, drro->drr_object);
- /*
- * We may change blocksize and delete old content,
- * so need to hold_write and hold_free.
- */
- dmu_tx_hold_write(tx, drro->drr_object, 0, 1);
- dmu_tx_hold_free(tx, drro->drr_object, 0, DMU_OBJECT_END);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err) {
- dmu_tx_abort(tx);
- return (err);
- }
-
err = dmu_object_reclaim(os, drro->drr_object,
drro->drr_type, drro->drr_blksz,
- drro->drr_bonustype, drro->drr_bonuslen, tx);
+ drro->drr_bonustype, drro->drr_bonuslen);
}
- if (err) {
- dmu_tx_commit(tx);
+ if (err)
return (EINVAL);
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_bonus(tx, drro->drr_object);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err) {
+ dmu_tx_abort(tx);
+ return (err);
}
dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksum, tx);
diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c
index 197284e1e..89cbfad29 100644
--- a/module/zfs/dmu_traverse.c
+++ b/module/zfs/dmu_traverse.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -64,6 +64,9 @@ struct traverse_data {
void *td_arg;
};
+static int traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp,
+ arc_buf_t *buf, uint64_t objset, uint64_t object);
+
/* ARGSUSED */
static void
traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
@@ -189,7 +192,7 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
}
} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
uint32_t flags = ARC_WAIT;
- int i, j;
+ int i;
int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
err = arc_read(NULL, td->td_spa, bp, pbuf,
@@ -201,20 +204,15 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
/* recursively visitbp() blocks below this */
dnp = buf->b_data;
for (i = 0; i < epb && err == 0; i++, dnp++) {
- for (j = 0; j < dnp->dn_nblkptr; j++) {
- SET_BOOKMARK(&czb, zb->zb_objset,
- zb->zb_blkid * epb + i,
- dnp->dn_nlevels - 1, j);
- err = traverse_visitbp(td, dnp, buf,
- (blkptr_t *)&dnp->dn_blkptr[j], &czb);
- if (err)
- break;
- }
+ err = traverse_dnode(td, dnp, buf, zb->zb_objset,
+ zb->zb_blkid * epb + i);
+ if (err)
+ break;
}
} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
uint32_t flags = ARC_WAIT;
objset_phys_t *osp;
- int j;
+ dnode_phys_t *dnp;
err = arc_read_nolock(NULL, td->td_spa, bp,
arc_getbuf_func, &buf,
@@ -223,20 +221,19 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
return (err);
osp = buf->b_data;
- /*
- * traverse_zil is just here for zdb's leak checking.
- * For other consumers, there will be no ZIL blocks.
- */
traverse_zil(td, &osp->os_zil_header);
- for (j = 0; j < osp->os_meta_dnode.dn_nblkptr; j++) {
- SET_BOOKMARK(&czb, zb->zb_objset, 0,
- osp->os_meta_dnode.dn_nlevels - 1, j);
- err = traverse_visitbp(td, &osp->os_meta_dnode, buf,
- (blkptr_t *)&osp->os_meta_dnode.dn_blkptr[j],
- &czb);
- if (err)
- break;
+ dnp = &osp->os_meta_dnode;
+ err = traverse_dnode(td, dnp, buf, zb->zb_objset, 0);
+ if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
+ dnp = &osp->os_userused_dnode;
+ err = traverse_dnode(td, dnp, buf, zb->zb_objset,
+ DMU_USERUSED_OBJECT);
+ }
+ if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
+ dnp = &osp->os_groupused_dnode;
+ err = traverse_dnode(td, dnp, buf, zb->zb_objset,
+ DMU_GROUPUSED_OBJECT);
}
}
@@ -249,6 +246,23 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
return (err);
}
+static int
+traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp,
+ arc_buf_t *buf, uint64_t objset, uint64_t object)
+{
+ int j, err = 0;
+ zbookmark_t czb;
+
+ for (j = 0; j < dnp->dn_nblkptr; j++) {
+ SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
+ err = traverse_visitbp(td, dnp, buf,
+ (blkptr_t *)&dnp->dn_blkptr[j], &czb);
+ if (err)
+ break;
+ }
+ return (err);
+}
+
/* ARGSUSED */
static int
traverse_prefetcher(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c
index bf560e565..af2b049a7 100644
--- a/module/zfs/dmu_tx.c
+++ b/module/zfs/dmu_tx.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -160,6 +160,41 @@ dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
return (err);
}
+static void
+dmu_tx_count_indirects(dmu_tx_hold_t *txh, dmu_buf_impl_t *db,
+ boolean_t freeable, dmu_buf_impl_t **history)
+{
+ int i = db->db_level + 1;
+ dnode_t *dn = db->db_dnode;
+
+ if (i >= dn->dn_nlevels)
+ return;
+
+ db = db->db_parent;
+ if (db == NULL) {
+ uint64_t lvls = dn->dn_nlevels - i;
+
+ txh->txh_space_towrite += lvls << dn->dn_indblkshift;
+ return;
+ }
+
+ if (db != history[i]) {
+ dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
+ uint64_t space = 1ULL << dn->dn_indblkshift;
+
+ freeable = (db->db_blkptr && (freeable ||
+ dsl_dataset_block_freeable(ds, db->db_blkptr->blk_birth)));
+ if (freeable)
+ txh->txh_space_tooverwrite += space;
+ else
+ txh->txh_space_towrite += space;
+ if (db->db_blkptr)
+ txh->txh_space_tounref += space;
+ history[i] = db;
+ dmu_tx_count_indirects(txh, db, freeable, history);
+ }
+}
+
/* ARGSUSED */
static void
dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
@@ -177,18 +212,24 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
min_ibs = DN_MIN_INDBLKSHIFT;
max_ibs = DN_MAX_INDBLKSHIFT;
- /*
- * For i/o error checking, read the first and last level-0
- * blocks (if they are not aligned), and all the level-1 blocks.
- */
-
if (dn) {
+ dmu_buf_impl_t *last[DN_MAX_LEVELS];
+ int nlvls = dn->dn_nlevels;
+ int delta;
+
+ /*
+ * For i/o error checking, read the first and last level-0
+ * blocks (if they are not aligned), and all the level-1 blocks.
+ */
if (dn->dn_maxblkid == 0) {
- if ((off > 0 || len < dn->dn_datablksz) &&
- off < dn->dn_datablksz) {
+ delta = dn->dn_datablksz;
+ start = (off < dn->dn_datablksz) ? 0 : 1;
+ end = (off+len <= dn->dn_datablksz) ? 0 : 1;
+ if (start == 0 && (off > 0 || len < dn->dn_datablksz)) {
err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
if (err)
goto out;
+ delta -= off;
}
} else {
zio_t *zio = zio_root(dn->dn_objset->os_spa,
@@ -213,10 +254,9 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
}
/* level-1 blocks */
- if (dn->dn_nlevels > 1) {
- start >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT;
- end >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT;
- for (i = start+1; i < end; i++) {
+ if (nlvls > 1) {
+ int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ for (i = (start>>shft)+1; i < end>>shft; i++) {
err = dmu_tx_check_ioerr(zio, dn, 1, i);
if (err)
goto out;
@@ -226,20 +266,70 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
err = zio_wait(zio);
if (err)
goto out;
+ delta = P2NPHASE(off, dn->dn_datablksz);
}
- }
- /*
- * If there's more than one block, the blocksize can't change,
- * so we can make a more precise estimate. Alternatively,
- * if the dnode's ibs is larger than max_ibs, always use that.
- * This ensures that if we reduce DN_MAX_INDBLKSHIFT,
- * the code will still work correctly on existing pools.
- */
- if (dn && (dn->dn_maxblkid != 0 || dn->dn_indblkshift > max_ibs)) {
- min_ibs = max_ibs = dn->dn_indblkshift;
- if (dn->dn_datablkshift != 0)
+ if (dn->dn_maxblkid > 0) {
+ /*
+ * The blocksize can't change,
+ * so we can make a more precise estimate.
+ */
+ ASSERT(dn->dn_datablkshift != 0);
min_bs = max_bs = dn->dn_datablkshift;
+ min_ibs = max_ibs = dn->dn_indblkshift;
+ } else if (dn->dn_indblkshift > max_ibs) {
+ /*
+ * This ensures that if we reduce DN_MAX_INDBLKSHIFT,
+ * the code will still work correctly on older pools.
+ */
+ min_ibs = max_ibs = dn->dn_indblkshift;
+ }
+
+ /*
+ * If this write is not off the end of the file
+ * we need to account for overwrites/unref.
+ */
+ if (start <= dn->dn_maxblkid)
+ bzero(last, sizeof (dmu_buf_impl_t *) * DN_MAX_LEVELS);
+ while (start <= dn->dn_maxblkid) {
+ spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
+ dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
+ dmu_buf_impl_t *db;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ db = dbuf_hold_level(dn, 0, start, FTAG);
+ rw_exit(&dn->dn_struct_rwlock);
+ if (db->db_blkptr && dsl_dataset_block_freeable(ds,
+ db->db_blkptr->blk_birth)) {
+ dprintf_bp(db->db_blkptr, "can free old%s", "");
+ txh->txh_space_tooverwrite += dn->dn_datablksz;
+ txh->txh_space_tounref += dn->dn_datablksz;
+ dmu_tx_count_indirects(txh, db, TRUE, last);
+ } else {
+ txh->txh_space_towrite += dn->dn_datablksz;
+ if (db->db_blkptr)
+ txh->txh_space_tounref +=
+ bp_get_dasize(spa, db->db_blkptr);
+ dmu_tx_count_indirects(txh, db, FALSE, last);
+ }
+ dbuf_rele(db, FTAG);
+ if (++start > end) {
+ /*
+ * Account for new indirects appearing
+ * before this IO gets assigned into a txg.
+ */
+ bits = 64 - min_bs;
+ epbs = min_ibs - SPA_BLKPTRSHIFT;
+ for (bits -= epbs * (nlvls - 1);
+ bits >= 0; bits -= epbs)
+ txh->txh_fudge += 1ULL << max_ibs;
+ goto out;
+ }
+ off += delta;
+ if (len >= delta)
+ len -= delta;
+ delta = dn->dn_datablksz;
+ }
}
/*
@@ -262,20 +352,22 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
for (bits = 64 - min_bs; bits >= 0; bits -= epbs) {
start >>= epbs;
end >>= epbs;
- /*
- * If we increase the number of levels of indirection,
- * we'll need new blkid=0 indirect blocks. If start == 0,
- * we're already accounting for that blocks; and if end == 0,
- * we can't increase the number of levels beyond that.
- */
- if (start != 0 && end != 0)
- txh->txh_space_towrite += 1ULL << max_ibs;
+ ASSERT3U(end, >=, start);
txh->txh_space_towrite += (end - start + 1) << max_ibs;
+ if (start != 0) {
+ /*
+ * We also need a new blkid=0 indirect block
+ * to reference any existing file data.
+ */
+ txh->txh_space_towrite += 1ULL << max_ibs;
+ }
}
- ASSERT(txh->txh_space_towrite < 2 * DMU_MAX_ACCESS);
-
out:
+ if (txh->txh_space_towrite + txh->txh_space_tooverwrite >
+ 2 * DMU_MAX_ACCESS)
+ err = EFBIG;
+
if (err)
txh->txh_tx->tx_err = err;
}
@@ -292,6 +384,7 @@ dmu_tx_count_dnode(dmu_tx_hold_t *txh)
dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
dn->dn_dbuf->db_blkptr->blk_birth)) {
txh->txh_space_tooverwrite += space;
+ txh->txh_space_tounref += space;
} else {
txh->txh_space_towrite += space;
if (dn && dn->dn_dbuf->db_blkptr)
@@ -535,7 +628,7 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
}
void
-dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name)
+dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
{
dmu_tx_hold_t *txh;
dnode_t *dn;
@@ -584,9 +677,9 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name)
txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
} else {
txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
- txh->txh_space_tounref +=
- BP_GET_ASIZE(dn->dn_phys->dn_blkptr);
}
+ if (dn->dn_phys->dn_blkptr[0].blk_birth)
+ txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
return;
}
@@ -603,12 +696,9 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name)
}
}
- /*
- * 3 blocks overwritten: target leaf, ptrtbl block, header block
- * 3 new blocks written if adding: new split leaf, 2 grown ptrtbl blocks
- */
- dmu_tx_count_write(txh, dn->dn_maxblkid * dn->dn_datablksz,
- (3 + (add ? 3 : 0)) << dn->dn_datablkshift);
+ err = zap_count_write(&dn->dn_objset->os, dn->dn_object, name, add,
+ &txh->txh_space_towrite, &txh->txh_space_tooverwrite,
+ txh->txh_dnode->dn_datablkshift);
/*
* If the modified blocks are scattered to the four winds,
@@ -616,7 +706,10 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name)
*/
epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs)
- txh->txh_space_towrite += 3 << dn->dn_indblkshift;
+ if (dn->dn_objset->os_dsl_dataset->ds_phys->ds_prev_snap_obj)
+ txh->txh_space_towrite += 3 << dn->dn_indblkshift;
+ else
+ txh->txh_space_tooverwrite += 3 << dn->dn_indblkshift;
}
void
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index 538a141b0..cf49b97f1 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -156,7 +156,7 @@ dnode_verify(dnode_t *dn)
}
if (dn->dn_phys->dn_type != DMU_OT_NONE)
ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels);
- ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || dn->dn_dbuf != NULL);
+ ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL);
if (dn->dn_dbuf != NULL) {
ASSERT3P(dn->dn_phys, ==,
(dnode_phys_t *)dn->dn_dbuf->db.db_data +
@@ -321,6 +321,7 @@ dnode_destroy(dnode_t *dn)
}
ASSERT(NULL == list_head(&dn->dn_dbufs));
#endif
+ ASSERT(dn->dn_oldphys == NULL);
mutex_enter(&os->os_lock);
list_remove(&os->os_dnodes, dn);
@@ -417,8 +418,7 @@ void
dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
- int i, nblkptr;
- dmu_buf_impl_t *db = NULL;
+ int nblkptr;
ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE);
@@ -430,42 +430,25 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
ASSERT3U(bonustype, <, DMU_OT_NUMTYPES);
ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
- for (i = 0; i < TXG_SIZE; i++)
- ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
-
/* clean up any unreferenced dbufs */
dnode_evict_dbufs(dn);
- ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
-
- /*
- * XXX I should really have a generation number to tell if we
- * need to do this...
- */
- if (blocksize != dn->dn_datablksz ||
- dn->dn_bonustype != bonustype || dn->dn_bonuslen != bonuslen) {
- /* free all old data */
- dnode_free_range(dn, 0, -1ULL, tx);
- }
-
- nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
- /* change blocksize */
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- if (blocksize != dn->dn_datablksz &&
- (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) ||
- list_head(&dn->dn_dbufs) != NULL)) {
- db = dbuf_hold(dn, 0, FTAG);
- dbuf_new_size(db, blocksize, tx);
- }
- dnode_setdblksz(dn, blocksize);
dnode_setdirty(dn, tx);
- dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen;
- dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize;
+ if (dn->dn_datablksz != blocksize) {
+ /* change blocksize */
+ ASSERT(dn->dn_maxblkid == 0 &&
+ (BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) ||
+ dnode_block_freed(dn, 0)));
+ dnode_setdblksz(dn, blocksize);
+ dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize;
+ }
+ if (dn->dn_bonuslen != bonuslen)
+ dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen;
+ nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
if (dn->dn_nblkptr != nblkptr)
dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr;
rw_exit(&dn->dn_struct_rwlock);
- if (db)
- dbuf_rele(db, FTAG);
/* change type */
dn->dn_type = ot;
@@ -569,6 +552,22 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag,
*/
ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0);
+ if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) {
+ dn = (object == DMU_USERUSED_OBJECT) ?
+ os->os_userused_dnode : os->os_groupused_dnode;
+ if (dn == NULL)
+ return (ENOENT);
+ type = dn->dn_type;
+ if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE)
+ return (ENOENT);
+ if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)
+ return (EEXIST);
+ DNODE_VERIFY(dn);
+ (void) refcount_add(&dn->dn_holds, tag);
+ *dnp = dn;
+ return (0);
+ }
+
if (object == 0 || object >= DN_MAX_OBJECT)
return (EINVAL);
@@ -627,7 +626,8 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag,
type = dn->dn_type;
if (dn->dn_free_txg ||
((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ||
- ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)) {
+ ((flag & DNODE_MUST_BE_FREE) &&
+ (type != DMU_OT_NONE || dn->dn_oldphys))) {
mutex_exit(&dn->dn_mtx);
dbuf_rele(db, FTAG);
return (type == DMU_OT_NONE ? ENOENT : EEXIST);
@@ -692,8 +692,10 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
objset_impl_t *os = dn->dn_objset;
uint64_t txg = tx->tx_txg;
- if (dn->dn_object == DMU_META_DNODE_OBJECT)
+ if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
+ dsl_dataset_dirty(os->os_dsl_dataset, tx);
return;
+ }
DNODE_VERIFY(dn);
@@ -1189,11 +1191,6 @@ dnode_block_freed(dnode_t *dn, uint64_t blkid)
if (dn->dn_free_txg)
return (TRUE);
- /*
- * If dn_datablkshift is not set, then there's only a single
- * block, in which case there will never be a free range so it
- * won't matter.
- */
range_tofind.fr_blkid = blkid;
mutex_enter(&dn->dn_mtx);
for (i = 0; i < TXG_SIZE; i++) {
@@ -1278,7 +1275,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
dprintf("probing object %llu offset %llx level %d of %u\n",
dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels);
- hole = flags & DNODE_FIND_HOLE;
+ hole = ((flags & DNODE_FIND_HOLE) != 0);
inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
ASSERT(txg == 0 || !hole);
@@ -1325,16 +1322,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
for (i = (*offset >> span) & (blkfill - 1);
i >= 0 && i < blkfill; i += inc) {
- boolean_t newcontents = B_TRUE;
- if (txg) {
- int j;
- newcontents = B_FALSE;
- for (j = 0; j < dnp[i].dn_nblkptr; j++) {
- if (dnp[i].dn_blkptr[j].blk_birth > txg)
- newcontents = B_TRUE;
- }
- }
- if (!dnp[i].dn_type == hole && newcontents)
+ if ((dnp[i].dn_type == DMU_OT_NONE) == hole)
break;
*offset += (1ULL << span) * inc;
}
diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c
index 23dcb4c7b..184fe292b 100644
--- a/module/zfs/dnode_sync.c
+++ b/module/zfs/dnode_sync.c
@@ -504,9 +504,6 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
/*
* Write out the dnode's dirty buffers.
- *
- * NOTE: The dnode is kept in memory by being dirty. Once the
- * dirty bit is cleared, it may be evicted. Beware of this!
*/
void
dnode_sync(dnode_t *dn, dmu_tx_t *tx)
@@ -515,20 +512,33 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
dnode_phys_t *dnp = dn->dn_phys;
int txgoff = tx->tx_txg & TXG_MASK;
list_t *list = &dn->dn_dirty_records[txgoff];
+ static const dnode_phys_t zerodn = { 0 };
ASSERT(dmu_tx_is_syncing(tx));
ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
+ ASSERT(dnp->dn_type != DMU_OT_NONE ||
+ bcmp(dnp, &zerodn, DNODE_SIZE) == 0);
DNODE_VERIFY(dn);
ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf));
+ if (dmu_objset_userused_enabled(dn->dn_objset) &&
+ !DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
+ ASSERT(dn->dn_oldphys == NULL);
+ dn->dn_oldphys = zio_buf_alloc(sizeof (dnode_phys_t));
+ *dn->dn_oldphys = *dn->dn_phys; /* struct assignment */
+ dn->dn_phys->dn_flags |= DNODE_FLAG_USERUSED_ACCOUNTED;
+ } else {
+ /* Once we account for it, we should always account for it. */
+ ASSERT(!(dn->dn_phys->dn_flags &
+ DNODE_FLAG_USERUSED_ACCOUNTED));
+ }
+
mutex_enter(&dn->dn_mtx);
if (dn->dn_allocated_txg == tx->tx_txg) {
/* The dnode is newly allocated or reallocated */
if (dnp->dn_type == DMU_OT_NONE) {
/* this is a first alloc, not a realloc */
- /* XXX shouldn't the phys already be zeroed? */
- bzero(dnp, DNODE_CORE_SIZE);
dnp->dn_nlevels = 1;
dnp->dn_nblkptr = dn->dn_nblkptr;
}
@@ -626,7 +636,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
dbuf_sync_list(list, tx);
- if (dn->dn_object != DMU_META_DNODE_OBJECT) {
+ if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
ASSERT3P(list_head(list), ==, NULL);
dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
}
diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c
index a68b12d33..0fe7eb583 100644
--- a/module/zfs/dsl_dataset.c
+++ b/module/zfs/dsl_dataset.c
@@ -229,7 +229,7 @@ dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap));
}
-int
+boolean_t
dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth)
{
return (blk_birth > dsl_dataset_prev_snap_txg(ds));
@@ -548,6 +548,7 @@ dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, int flags, void *owner,
return (err);
if (!dsl_dataset_tryown(*dsp, DS_MODE_IS_INCONSISTENT(flags), owner)) {
dsl_dataset_rele(*dsp, owner);
+ *dsp = NULL;
return (EBUSY);
}
return (0);
@@ -974,6 +975,27 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag)
(void) dmu_free_object(os, obj);
}
+ /*
+ * We need to sync out all in-flight IO before we try to evict
+ * (the dataset evict func is trying to clear the cached entries
+ * for this dataset in the ARC).
+ */
+ txg_wait_synced(dd->dd_pool, 0);
+
+ /*
+ * If we managed to free all the objects in open
+ * context, the user space accounting should be zero.
+ */
+ if (ds->ds_phys->ds_bp.blk_fill == 0 &&
+ dmu_objset_userused_enabled(os->os)) {
+ uint64_t count;
+
+ ASSERT(zap_count(os, DMU_USERUSED_OBJECT, &count) != 0 ||
+ count == 0);
+ ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT, &count) != 0 ||
+ count == 0);
+ }
+
dmu_objset_close(os);
if (err != ESRCH)
goto out;
@@ -1058,7 +1080,6 @@ dsl_dataset_get_user_ptr(dsl_dataset_t *ds)
return (ds->ds_user_ptr);
}
-
blkptr_t *
dsl_dataset_get_blkptr(dsl_dataset_t *ds)
{
@@ -1164,8 +1185,18 @@ kill_blkptr(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
if (bp == NULL)
return (0);
- ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg);
- (void) dsl_dataset_block_kill(ka->ds, bp, ka->zio, ka->tx);
+ if ((zb->zb_level == -1ULL && zb->zb_blkid != 0) ||
+ (zb->zb_object != 0 && dnp == NULL)) {
+ /*
+ * It's a block in the intent log. It has no
+ * accounting, so just free it.
+ */
+ VERIFY3U(0, ==, dsl_free(ka->zio, ka->tx->tx_pool,
+ ka->tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT));
+ } else {
+ ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg);
+ (void) dsl_dataset_block_kill(ka->ds, bp, ka->zio, ka->tx);
+ }
return (0);
}
@@ -1209,13 +1240,7 @@ dsl_dataset_rollback_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
dmu_buf_will_dirty(ds->ds_dbuf, tx);
- /*
- * Before the roll back destroy the zil.
- */
if (ds->ds_user_ptr != NULL) {
- zil_rollback_destroy(
- ((objset_impl_t *)ds->ds_user_ptr)->os_zil, tx);
-
/*
* We need to make sure that the objset_impl_t is reopened after
* we do the rollback, otherwise it will have the wrong
@@ -1248,7 +1273,10 @@ dsl_dataset_rollback_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
ds->ds_phys->ds_deadlist_obj));
{
- /* Free blkptrs that we gave birth to */
+ /*
+ * Free blkptrs that we gave birth to - this covers
+ * claimed but not played log blocks too.
+ */
zio_t *zio;
struct killarg ka;
@@ -1262,8 +1290,7 @@ dsl_dataset_rollback_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
(void) zio_wait(zio);
}
- ASSERT(!(ds->ds_phys->ds_flags & DS_FLAG_UNIQUE_ACCURATE) ||
- ds->ds_phys->ds_unique_bytes == 0);
+ ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0);
if (ds->ds_prev && ds->ds_prev != ds->ds_dir->dd_pool->dp_origin_snap) {
/* Change our contents to that of the prev snapshot */
@@ -1481,7 +1508,7 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
if (after_branch_point &&
ds_prev->ds_phys->ds_next_clones_obj != 0) {
- VERIFY(0 == zap_remove_int(mos,
+ VERIFY3U(0, ==, zap_remove_int(mos,
ds_prev->ds_phys->ds_next_clones_obj, obj, tx));
if (ds->ds_phys->ds_next_snap_obj != 0) {
VERIFY(0 == zap_add_int(mos,
@@ -1654,7 +1681,7 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
err = traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
TRAVERSE_POST, kill_blkptr, &ka);
ASSERT3U(err, ==, 0);
- ASSERT(spa_version(dp->dp_spa) < SPA_VERSION_UNIQUE_ACCURATE ||
+ ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
ds->ds_phys->ds_unique_bytes == 0);
}
@@ -2583,7 +2610,7 @@ snaplist_destroy(list_t *l, boolean_t own)
{
struct promotenode *snap;
- if (!list_link_active(&l->list_head))
+ if (!l || !list_link_active(&l->list_head))
return;
while ((snap = list_tail(l)) != NULL) {
diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c
index e5e18f428..f19653d92 100644
--- a/module/zfs/dsl_dir.c
+++ b/module/zfs/dsl_dir.c
@@ -227,24 +227,11 @@ dsl_dir_namelen(dsl_dir_t *dd)
return (result);
}
-int
-dsl_dir_is_private(dsl_dir_t *dd)
-{
- int rv = FALSE;
-
- if (dd->dd_parent && dsl_dir_is_private(dd->dd_parent))
- rv = TRUE;
- if (dataset_name_hidden(dd->dd_myname))
- rv = TRUE;
- return (rv);
-}
-
-
static int
getcomponent(const char *path, char *component, const char **nextp)
{
char *p;
- if (path == NULL)
+ if ((path == NULL) || (path[0] == '\0'))
return (ENOENT);
/* This would be a good place to reserve some namespace... */
p = strpbrk(path, "/@");
diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c
index dacc57c81..2c5dfca53 100644
--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -90,6 +90,9 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&dp->dp_scrub_cancel_lock, NULL, MUTEX_DEFAULT, NULL);
+ dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri,
+ 1, 4, 0);
+
return (dp);
}
@@ -129,14 +132,15 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
goto out;
err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj,
FTAG, &ds);
+ if (err == 0) {
+ err = dsl_dataset_hold_obj(dp,
+ ds->ds_phys->ds_prev_snap_obj, dp,
+ &dp->dp_origin_snap);
+ dsl_dataset_rele(ds, FTAG);
+ }
+ dsl_dir_close(dd, dp);
if (err)
goto out;
- err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
- dp, &dp->dp_origin_snap);
- if (err)
- goto out;
- dsl_dataset_rele(ds, FTAG);
- dsl_dir_close(dd, dp);
}
/* get scrub status */
@@ -226,6 +230,7 @@ dsl_pool_close(dsl_pool_t *dp)
rw_destroy(&dp->dp_config_rwlock);
mutex_destroy(&dp->dp_lock);
mutex_destroy(&dp->dp_scrub_cancel_lock);
+ taskq_destroy(dp->dp_vnrele_taskq);
if (dp->dp_blkstats)
kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
kmem_free(dp, sizeof (dsl_pool_t));
@@ -296,24 +301,52 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
tx = dmu_tx_create_assigned(dp, txg);
dp->dp_read_overhead = 0;
+ start = gethrtime();
+
zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
- if (!list_link_active(&ds->ds_synced_link))
- list_insert_tail(&dp->dp_synced_datasets, ds);
- else
- dmu_buf_rele(ds->ds_dbuf, ds);
+ /*
+ * We must not sync any non-MOS datasets twice, because
+ * we may have taken a snapshot of them. However, we
+ * may sync newly-created datasets on pass 2.
+ */
+ ASSERT(!list_link_active(&ds->ds_synced_link));
+ list_insert_tail(&dp->dp_synced_datasets, ds);
dsl_dataset_sync(ds, zio, tx);
}
DTRACE_PROBE(pool_sync__1setup);
-
- start = gethrtime();
err = zio_wait(zio);
+
write_time = gethrtime() - start;
ASSERT(err == 0);
DTRACE_PROBE(pool_sync__2rootzio);
- while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg))
+ for (ds = list_head(&dp->dp_synced_datasets); ds;
+ ds = list_next(&dp->dp_synced_datasets, ds))
+ dmu_objset_do_userquota_callbacks(ds->ds_user_ptr, tx);
+
+ /*
+ * Sync the datasets again to push out the changes due to
+ * userquota updates. This must be done before we process the
+ * sync tasks, because that could cause a snapshot of a dataset
+ * whose ds_bp will be rewritten when we do this 2nd sync.
+ */
+ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+ while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
+ ASSERT(list_link_active(&ds->ds_synced_link));
+ dmu_buf_rele(ds->ds_dbuf, ds);
+ dsl_dataset_sync(ds, zio, tx);
+ }
+ err = zio_wait(zio);
+
+ while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg)) {
+ /*
+ * No more sync tasks should have been added while we
+ * were syncing.
+ */
+ ASSERT(spa_sync_pass(dp->dp_spa) == 1);
dsl_sync_task_group_sync(dstg, tx);
+ }
DTRACE_PROBE(pool_sync__3task);
start = gethrtime();
@@ -611,3 +644,9 @@ dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
dsl_dataset_rele(ds, FTAG);
rw_exit(&dp->dp_config_rwlock);
}
+
+taskq_t *
+dsl_pool_vnrele_taskq(dsl_pool_t *dp)
+{
+ return (dp->dp_vnrele_taskq);
+}
diff --git a/module/zfs/dsl_prop.c b/module/zfs/dsl_prop.c
index 212acbbc5..664ccff45 100644
--- a/module/zfs/dsl_prop.c
+++ b/module/zfs/dsl_prop.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
@@ -416,6 +414,34 @@ dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
}
void
+dsl_props_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+ nvlist_t *nvl = arg2;
+ nvpair_t *elem = NULL;
+
+ while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
+ struct prop_set_arg psa;
+
+ psa.name = nvpair_name(elem);
+
+ if (nvpair_type(elem) == DATA_TYPE_STRING) {
+ VERIFY(nvpair_value_string(elem,
+ (char **)&psa.buf) == 0);
+ psa.intsz = 1;
+ psa.numints = strlen(psa.buf) + 1;
+ } else {
+ uint64_t intval;
+ VERIFY(nvpair_value_uint64(elem, &intval) == 0);
+ psa.intsz = sizeof (intval);
+ psa.numints = 1;
+ psa.buf = &intval;
+ }
+ dsl_prop_set_sync(ds, &psa, cr, tx);
+ }
+}
+
+void
dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
cred_t *cr, dmu_tx_t *tx)
{
@@ -438,6 +464,7 @@ dsl_prop_set(const char *dsname, const char *propname,
int intsz, int numints, const void *buf)
{
dsl_dataset_t *ds;
+ uint64_t version;
int err;
struct prop_set_arg psa;
@@ -447,15 +474,19 @@ dsl_prop_set(const char *dsname, const char *propname,
*/
if (strlen(propname) >= ZAP_MAXNAMELEN)
return (ENAMETOOLONG);
- if (intsz * numints >= ZAP_MAXVALUELEN)
- return (E2BIG);
err = dsl_dataset_hold(dsname, FTAG, &ds);
if (err)
return (err);
+ version = spa_version(ds->ds_dir->dd_pool->dp_spa);
+ if (intsz * numints >= (version < SPA_VERSION_STMF_PROP ?
+ ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (E2BIG);
+ }
if (dsl_dataset_is_snapshot(ds) &&
- spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_SNAP_PROPS) {
+ version < SPA_VERSION_SNAP_PROPS) {
dsl_dataset_rele(ds, FTAG);
return (ENOTSUP);
}
@@ -471,6 +502,50 @@ dsl_prop_set(const char *dsname, const char *propname,
return (err);
}
+int
+dsl_props_set(const char *dsname, nvlist_t *nvl)
+{
+ dsl_dataset_t *ds;
+ uint64_t version;
+ nvpair_t *elem = NULL;
+ int err;
+
+ if (err = dsl_dataset_hold(dsname, FTAG, &ds))
+ return (err);
+ /*
+ * Do these checks before the syncfunc, since it can't fail.
+ */
+ version = spa_version(ds->ds_dir->dd_pool->dp_spa);
+ while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
+ if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) {
+ dsl_dataset_rele(ds, FTAG);
+ return (ENAMETOOLONG);
+ }
+ if (nvpair_type(elem) == DATA_TYPE_STRING) {
+ char *valstr;
+ VERIFY(nvpair_value_string(elem, &valstr) == 0);
+ if (strlen(valstr) >= (version <
+ SPA_VERSION_STMF_PROP ?
+ ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (E2BIG);
+ }
+ }
+ }
+
+ if (dsl_dataset_is_snapshot(ds) &&
+ version < SPA_VERSION_SNAP_PROPS) {
+ dsl_dataset_rele(ds, FTAG);
+ return (ENOTSUP);
+ }
+
+ err = dsl_sync_task_do(ds->ds_dir->dd_pool,
+ NULL, dsl_props_set_sync, ds, nvl, 2);
+
+ dsl_dataset_rele(ds, FTAG);
+ return (err);
+}
+
/*
* Iterate over all properties for this dataset and return them in an nvlist.
*/
diff --git a/module/zfs/dsl_scrub.c b/module/zfs/dsl_scrub.c
index 783604101..8a802b53a 100644
--- a/module/zfs/dsl_scrub.c
+++ b/module/zfs/dsl_scrub.c
@@ -45,6 +45,8 @@ typedef int (scrub_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);
static scrub_cb_t dsl_pool_scrub_clean_cb;
static dsl_syncfunc_t dsl_pool_scrub_cancel_sync;
+static void scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf,
+ uint64_t objset, uint64_t object);
int zfs_scrub_min_time = 1; /* scrub for at least 1 sec each txg */
int zfs_resilver_min_time = 3; /* resilver for at least 3 sec each txg */
@@ -348,6 +350,12 @@ traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
if (bp->blk_birth <= dp->dp_scrub_min_txg)
return;
+ /*
+ * One block ("stubby") can be allocated a long time ago; we
+ * want to visit that one because it has been allocated
+ * (on-disk) even if it hasn't been claimed (even though for
+ * plain scrub there's nothing to do to it).
+ */
if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
return;
@@ -373,6 +381,11 @@ traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
if (bp->blk_birth <= dp->dp_scrub_min_txg)
return;
+ /*
+ * birth can be < claim_txg if this record's txg is
+ * already txg sync'ed (but this log block contains
+ * other records that are not synced)
+ */
if (claim_txg == 0 || bp->blk_birth < claim_txg)
return;
@@ -472,7 +485,7 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp,
} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
uint32_t flags = ARC_WAIT;
dnode_phys_t *child_dnp;
- int i, j;
+ int i;
int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
err = arc_read(NULL, dp->dp_spa, bp, pbuf,
@@ -487,20 +500,12 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp,
child_dnp = buf->b_data;
for (i = 0; i < epb; i++, child_dnp++) {
- for (j = 0; j < child_dnp->dn_nblkptr; j++) {
- zbookmark_t czb;
-
- SET_BOOKMARK(&czb, zb->zb_objset,
- zb->zb_blkid * epb + i,
- child_dnp->dn_nlevels - 1, j);
- scrub_visitbp(dp, child_dnp, buf,
- &child_dnp->dn_blkptr[j], &czb);
- }
+ scrub_visitdnode(dp, child_dnp, buf, zb->zb_objset,
+ zb->zb_blkid * epb + i);
}
} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
uint32_t flags = ARC_WAIT;
objset_phys_t *osp;
- int j;
err = arc_read_nolock(NULL, dp->dp_spa, bp,
arc_getbuf_func, &buf,
@@ -516,13 +521,13 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp,
traverse_zil(dp, &osp->os_zil_header);
- for (j = 0; j < osp->os_meta_dnode.dn_nblkptr; j++) {
- zbookmark_t czb;
-
- SET_BOOKMARK(&czb, zb->zb_objset, 0,
- osp->os_meta_dnode.dn_nlevels - 1, j);
- scrub_visitbp(dp, &osp->os_meta_dnode, buf,
- &osp->os_meta_dnode.dn_blkptr[j], &czb);
+ scrub_visitdnode(dp, &osp->os_meta_dnode,
+ buf, zb->zb_objset, 0);
+ if (arc_buf_size(buf) >= sizeof (objset_phys_t)) {
+ scrub_visitdnode(dp, &osp->os_userused_dnode,
+ buf, zb->zb_objset, 0);
+ scrub_visitdnode(dp, &osp->os_groupused_dnode,
+ buf, zb->zb_objset, 0);
}
}
@@ -532,6 +537,21 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp,
}
static void
+scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf,
+ uint64_t objset, uint64_t object)
+{
+ int j;
+
+ for (j = 0; j < dnp->dn_nblkptr; j++) {
+ zbookmark_t czb;
+
+ SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
+ scrub_visitbp(dp, dnp, buf, &dnp->dn_blkptr[j], &czb);
+ }
+
+}
+
+static void
scrub_visit_rootbp(dsl_pool_t *dp, dsl_dataset_t *ds, blkptr_t *bp)
{
zbookmark_t zb;
diff --git a/module/zfs/fletcher.c b/module/zfs/fletcher.c
index edda3c9a9..54247d724 100644
--- a/module/zfs/fletcher.c
+++ b/module/zfs/fletcher.c
@@ -19,11 +19,111 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
+/*
+ * Fletcher Checksums
+ * ------------------
+ *
+ * ZFS's 2nd and 4th order Fletcher checksums are defined by the following
+ * recurrence relations:
+ *
+ * a = a + f
+ * i i-1 i-1
+ *
+ * b = b + a
+ * i i-1 i
+ *
+ * c = c + b (fletcher-4 only)
+ * i i-1 i
+ *
+ * d = d + c (fletcher-4 only)
+ * i i-1 i
+ *
+ * Where
+ * a_0 = b_0 = c_0 = d_0 = 0
+ * and
+ * f_0 .. f_(n-1) are the input data.
+ *
+ * Using standard techniques, these translate into the following series:
+ *
+ * __n_ __n_
+ * \ | \ |
+ * a = > f b = > i * f
+ * n /___| n - i n /___| n - i
+ * i = 1 i = 1
+ *
+ *
+ * __n_ __n_
+ * \ | i*(i+1) \ | i*(i+1)*(i+2)
+ * c = > ------- f d = > ------------- f
+ * n /___| 2 n - i n /___| 6 n - i
+ * i = 1 i = 1
+ *
+ * For fletcher-2, the f_is are 64-bit, and [ab]_i are 64-bit accumulators.
+ * Since the additions are done mod (2^64), errors in the high bits may not
+ * be noticed. For this reason, fletcher-2 is deprecated.
+ *
+ * For fletcher-4, the f_is are 32-bit, and [abcd]_i are 64-bit accumulators.
+ * A conservative estimate of how big the buffer can get before we overflow
+ * can be estimated using f_i = 0xffffffff for all i:
+ *
+ * % bc
+ * f=2^32-1;d=0; for (i = 1; d<2^64; i++) { d += f*i*(i+1)*(i+2)/6 }; (i-1)*4
+ * 2264
+ * quit
+ * %
+ *
+ * So blocks of up to 2k will not overflow. Our largest block size is
+ * 128k, which has 32k 4-byte words, so we can compute the largest possible
+ * accumulators, then divide by 2^64 to figure the max amount of overflow:
+ *
+ * % bc
+ * a=b=c=d=0; f=2^32-1; for (i=1; i<=32*1024; i++) { a+=f; b+=a; c+=b; d+=c }
+ * a/2^64;b/2^64;c/2^64;d/2^64
+ * 0
+ * 0
+ * 1365
+ * 11186858
+ * quit
+ * %
+ *
+ * So a and b cannot overflow. To make sure each bit of input has some
+ * effect on the contents of c and d, we can look at what the factors of
+ * the coefficients in the equations for c_n and d_n are. The number of 2s
+ * in the factors determines the lowest set bit in the multiplier. Running
+ * through the cases for n*(n+1)/2 reveals that the highest power of 2 is
+ * 2^14, and for n*(n+1)*(n+2)/6 it is 2^15. So while some data may overflow
+ * the 64-bit accumulators, every bit of every f_i effects every accumulator,
+ * even for 128k blocks.
+ *
+ * If we wanted to make a stronger version of fletcher4 (fletcher4c?),
+ * we could do our calculations mod (2^32 - 1) by adding in the carries
+ * periodically, and store the number of carries in the top 32-bits.
+ *
+ * --------------------
+ * Checksum Performance
+ * --------------------
+ *
+ * There are two interesting components to checksum performance: cached and
+ * uncached performance. With cached data, fletcher-2 is about four times
+ * faster than fletcher-4. With uncached data, the performance difference is
+ * negligible, since the cost of a cache fill dominates the processing time.
+ * Even though fletcher-4 is slower than fletcher-2, it is still a pretty
+ * efficient pass over the data.
+ *
+ * In normal operation, the data which is being checksummed is in a buffer
+ * which has been filled either by:
+ *
+ * 1. a compression step, which will be mostly cached, or
+ * 2. a bcopy() or copyin(), which will be uncached (because the
+ * copy is cache-bypassing).
+ *
+ * For both cached and uncached data, both fletcher checksums are much faster
+ * than sha-256, and slower than 'off', which doesn't touch the data at all.
+ */
#include <sys/types.h>
#include <sys/sysmacros.h>
diff --git a/module/zfs/include/sys/arc.h b/module/zfs/include/sys/arc.h
index c402d3d58..6e5955b7c 100644
--- a/module/zfs/include/sys/arc.h
+++ b/module/zfs/include/sys/arc.h
@@ -85,6 +85,8 @@ void *arc_data_buf_alloc(uint64_t space);
void arc_data_buf_free(void *buf, uint64_t space);
arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag,
arc_buf_contents_t type);
+arc_buf_t *arc_loan_buf(spa_t *spa, int size);
+void arc_return_buf(arc_buf_t *buf, void *tag);
void arc_buf_add_ref(arc_buf_t *buf, void *tag);
int arc_buf_remove_ref(arc_buf_t *buf, void *tag);
int arc_buf_size(arc_buf_t *buf);
@@ -134,7 +136,7 @@ void arc_fini(void);
* Level 2 ARC
*/
-void l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end);
+void l2arc_add_vdev(spa_t *spa, vdev_t *vd);
void l2arc_remove_vdev(vdev_t *vd);
boolean_t l2arc_vdev_present(vdev_t *vd);
void l2arc_init(void);
diff --git a/module/zfs/include/sys/dbuf.h b/module/zfs/include/sys/dbuf.h
index 75ce27264..267852519 100644
--- a/module/zfs/include/sys/dbuf.h
+++ b/module/zfs/include/sys/dbuf.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -264,6 +264,7 @@ void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx);
void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
+void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
void dbuf_clear(dmu_buf_impl_t *db);
diff --git a/module/zfs/include/sys/dmu.h b/module/zfs/include/sys/dmu.h
index 3b1e5c8fb..a363d7032 100644
--- a/module/zfs/include/sys/dmu.h
+++ b/module/zfs/include/sys/dmu.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -60,6 +60,7 @@ struct zbookmark;
struct spa;
struct nvlist;
struct objset_impl;
+struct arc_buf;
typedef struct objset objset_t;
typedef struct dmu_tx dmu_tx_t;
@@ -114,6 +115,8 @@ typedef enum dmu_object_type {
DMU_OT_FUID_SIZE, /* FUID table size UINT64 */
DMU_OT_NEXT_CLONES, /* ZAP */
DMU_OT_SCRUB_QUEUE, /* ZAP */
+ DMU_OT_USERGROUP_USED, /* ZAP */
+ DMU_OT_USERGROUP_QUOTA, /* ZAP */
DMU_OT_NUMTYPES
} dmu_object_type_t;
@@ -156,6 +159,9 @@ void zfs_znode_byteswap(void *buf, size_t size);
#define DMU_MAX_ACCESS (10<<20) /* 10MB */
#define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */
+#define DMU_USERUSED_OBJECT (-1ULL)
+#define DMU_GROUPUSED_OBJECT (-2ULL)
+
/*
* Public routines to create, destroy, open, and close objsets.
*/
@@ -171,7 +177,8 @@ int dmu_objset_create(const char *name, dmu_objset_type_t type,
int dmu_objset_destroy(const char *name);
int dmu_snapshots_destroy(char *fsname, char *snapname);
int dmu_objset_rollback(objset_t *os);
-int dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive);
+int dmu_objset_snapshot(char *fsname, char *snapname, struct nvlist *props,
+ boolean_t recursive);
int dmu_objset_rename(const char *name, const char *newname,
boolean_t recursive);
int dmu_objset_find(char *name, int func(char *, void *), void *arg,
@@ -235,7 +242,7 @@ uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot,
int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
- int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+ int blocksize, dmu_object_type_t bonustype, int bonuslen);
/*
* Free an object from this objset.
@@ -398,6 +405,11 @@ void *dmu_buf_get_user(dmu_buf_t *db);
void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx);
/*
+ * Tells if the given dbuf is freeable.
+ */
+boolean_t dmu_buf_freeable(dmu_buf_t *);
+
+/*
* You must create a transaction, then hold the objects which you will
* (or might) modify as part of this transaction. Then you must assign
* the transaction to a transaction group. Once the transaction has
@@ -422,7 +434,7 @@ dmu_tx_t *dmu_tx_create(objset_t *os);
void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len);
void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off,
uint64_t len);
-void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name);
+void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name);
void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object);
void dmu_tx_abort(dmu_tx_t *tx);
int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
@@ -445,8 +457,10 @@ int dmu_free_object(objset_t *os, uint64_t object);
* Canfail routines will return 0 on success, or an errno if there is a
* nonrecoverable I/O error.
*/
+#define DMU_READ_PREFETCH 0 /* prefetch */
+#define DMU_READ_NO_PREFETCH 1 /* don't prefetch */
int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
- void *buf);
+ void *buf, uint32_t flags);
void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
const void *buf, dmu_tx_t *tx);
void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
@@ -456,6 +470,10 @@ int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size,
dmu_tx_t *tx);
int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
uint64_t size, struct page *pp, dmu_tx_t *tx);
+struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);
+void dmu_return_arcbuf(struct arc_buf *buf);
+void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf,
+ dmu_tx_t *tx);
extern int zfs_prefetch_disable;
@@ -562,6 +580,12 @@ extern int dmu_snapshot_realname(objset_t *os, char *name, char *real,
int maxlen, boolean_t *conflict);
extern int dmu_dir_list_next(objset_t *os, int namelen, char *name,
uint64_t *idp, uint64_t *offp);
+
+typedef void objset_used_cb_t(objset_t *os, dmu_object_type_t bonustype,
+ void *oldbonus, void *newbonus, uint64_t oldused, uint64_t newused,
+ dmu_tx_t *tx);
+extern void dmu_objset_register_type(dmu_objset_type_t ost,
+ objset_used_cb_t *cb);
extern void dmu_objset_set_user(objset_t *os, void *user_ptr);
extern void *dmu_objset_get_user(objset_t *os);
diff --git a/module/zfs/include/sys/dmu_objset.h b/module/zfs/include/sys/dmu_objset.h
index 1d6572780..82cb6ad7d 100644
--- a/module/zfs/include/sys/dmu_objset.h
+++ b/module/zfs/include/sys/dmu_objset.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -42,12 +42,20 @@ struct dsl_dataset;
struct dmu_tx;
struct objset_impl;
+#define OBJSET_PHYS_SIZE 2048
+#define OBJSET_OLD_PHYS_SIZE 1024
+
+#define OBJSET_FLAG_USERACCOUNTING_COMPLETE (1ULL<<0)
+
typedef struct objset_phys {
dnode_phys_t os_meta_dnode;
zil_header_t os_zil_header;
uint64_t os_type;
- char os_pad[1024 - sizeof (dnode_phys_t) - sizeof (zil_header_t) -
- sizeof (uint64_t)];
+ uint64_t os_flags;
+ char os_pad[OBJSET_PHYS_SIZE - sizeof (dnode_phys_t)*3 -
+ sizeof (zil_header_t) - sizeof (uint64_t)*2];
+ dnode_phys_t os_userused_dnode;
+ dnode_phys_t os_groupused_dnode;
} objset_phys_t;
struct objset {
@@ -62,6 +70,8 @@ typedef struct objset_impl {
arc_buf_t *os_phys_buf;
objset_phys_t *os_phys;
dnode_t *os_meta_dnode;
+ dnode_t *os_userused_dnode;
+ dnode_t *os_groupused_dnode;
zilog_t *os_zil;
objset_t os;
uint8_t os_checksum; /* can change, under dsl_dir's locks */
@@ -74,6 +84,8 @@ typedef struct objset_impl {
struct dmu_tx *os_synctx; /* XXX sketchy */
blkptr_t *os_rootbp;
zil_header_t os_zil_header;
+ list_t os_synced_dnodes;
+ uint64_t os_flags;
/* Protected by os_obj_lock */
kmutex_t os_obj_lock;
@@ -92,6 +104,7 @@ typedef struct objset_impl {
} objset_impl_t;
#define DMU_META_DNODE_OBJECT 0
+#define DMU_OBJECT_IS_SPECIAL(obj) ((int64_t)(obj) <= 0)
#define DMU_OS_IS_L2CACHEABLE(os) \
((os)->os_secondary_cache == ZFS_CACHE_ALL || \
@@ -106,7 +119,8 @@ int dmu_objset_create(const char *name, dmu_objset_type_t type,
void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg);
int dmu_objset_destroy(const char *name);
int dmu_objset_rollback(objset_t *os);
-int dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive);
+int dmu_objset_snapshot(char *fsname, char *snapname, nvlist_t *props,
+ boolean_t recursive);
void dmu_objset_stats(objset_t *os, nvlist_t *nv);
void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);
void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
@@ -127,6 +141,10 @@ objset_impl_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds,
int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp,
objset_impl_t **osip);
void dmu_objset_evict(struct dsl_dataset *ds, void *arg);
+void dmu_objset_do_userquota_callbacks(objset_impl_t *os, dmu_tx_t *tx);
+boolean_t dmu_objset_userused_enabled(objset_impl_t *os);
+int dmu_objset_userspace_upgrade(objset_t *os);
+boolean_t dmu_objset_userspace_present(objset_t *os);
#ifdef __cplusplus
}
diff --git a/module/zfs/include/sys/dnode.h b/module/zfs/include/sys/dnode.h
index be9e56908..48e4da8cd 100644
--- a/module/zfs/include/sys/dnode.h
+++ b/module/zfs/include/sys/dnode.h
@@ -98,7 +98,8 @@ enum dnode_dirtycontext {
};
/* Is dn_used in bytes? if not, it's in multiples of SPA_MINBLOCKSIZE */
-#define DNODE_FLAG_USED_BYTES (1<<0)
+#define DNODE_FLAG_USED_BYTES (1<<0)
+#define DNODE_FLAG_USERUSED_ACCOUNTED (1<<1)
typedef struct dnode_phys {
uint8_t dn_type; /* dmu_object_type_t */
@@ -131,10 +132,7 @@ typedef struct dnode {
*/
krwlock_t dn_struct_rwlock;
- /*
- * Our link on dataset's dd_dnodes list.
- * Protected by dd_accounting_mtx.
- */
+ /* Our link on dn_objset->os_dnodes list; protected by os_lock. */
list_node_t dn_link;
/* immutable: */
@@ -191,6 +189,9 @@ typedef struct dnode {
/* parent IO for current sync write */
zio_t *dn_zio;
+ /* used in syncing context */
+ dnode_phys_t *dn_oldphys;
+
/* holds prefetch structure */
struct zfetch dn_zfetch;
} dnode_t;
diff --git a/module/zfs/include/sys/dsl_dataset.h b/module/zfs/include/sys/dsl_dataset.h
index 8665aec2d..a1c2896e3 100644
--- a/module/zfs/include/sys/dsl_dataset.h
+++ b/module/zfs/include/sys/dsl_dataset.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -195,7 +195,7 @@ void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx);
void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
int dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
dmu_tx_t *tx);
-int dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth);
+boolean_t dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth);
uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds);
void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx);
diff --git a/module/zfs/include/sys/dsl_deleg.h b/module/zfs/include/sys/dsl_deleg.h
index a29e44e67..b064c9228 100644
--- a/module/zfs/include/sys/dsl_deleg.h
+++ b/module/zfs/include/sys/dsl_deleg.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_DSL_DELEG_H
#define _SYS_DSL_DELEG_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/dmu.h>
#include <sys/dsl_pool.h>
#include <sys/zfs_context.h>
@@ -51,6 +49,10 @@ extern "C" {
#define ZFS_DELEG_PERM_ALLOW "allow"
#define ZFS_DELEG_PERM_USERPROP "userprop"
#define ZFS_DELEG_PERM_VSCAN "vscan"
+#define ZFS_DELEG_PERM_USERQUOTA "userquota"
+#define ZFS_DELEG_PERM_GROUPQUOTA "groupquota"
+#define ZFS_DELEG_PERM_USERUSED "userused"
+#define ZFS_DELEG_PERM_GROUPUSED "groupused"
/*
* Note: the names of properties that are marked delegatable are also
diff --git a/module/zfs/include/sys/dsl_dir.h b/module/zfs/include/sys/dsl_dir.h
index 86b9636ce..56d06388c 100644
--- a/module/zfs/include/sys/dsl_dir.h
+++ b/module/zfs/include/sys/dsl_dir.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -107,7 +107,6 @@ int dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
const char *tail, void *tag, dsl_dir_t **);
void dsl_dir_name(dsl_dir_t *dd, char *buf);
int dsl_dir_namelen(dsl_dir_t *dd);
-int dsl_dir_is_private(dsl_dir_t *dd);
uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds,
const char *name, dmu_tx_t *tx);
dsl_checkfunc_t dsl_dir_destroy_check;
diff --git a/module/zfs/include/sys/dsl_pool.h b/module/zfs/include/sys/dsl_pool.h
index 3bb4ad4ef..d8da295f3 100644
--- a/module/zfs/include/sys/dsl_pool.h
+++ b/module/zfs/include/sys/dsl_pool.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -77,6 +77,7 @@ typedef struct dsl_pool {
struct dsl_dir *dp_mos_dir;
struct dsl_dataset *dp_origin_snap;
uint64_t dp_root_dir_obj;
+ struct taskq *dp_vnrele_taskq;
/* No lock needed - sync context only */
blkptr_t dp_meta_rootbp;
@@ -143,6 +144,8 @@ int dsl_pool_scrub_clean(dsl_pool_t *dp);
void dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx);
void dsl_pool_scrub_restart(dsl_pool_t *dp);
+taskq_t *dsl_pool_vnrele_taskq(dsl_pool_t *dp);
+
#ifdef __cplusplus
}
#endif
diff --git a/module/zfs/include/sys/dsl_prop.h b/module/zfs/include/sys/dsl_prop.h
index d66caa86c..26018a46d 100644
--- a/module/zfs/include/sys/dsl_prop.h
+++ b/module/zfs/include/sys/dsl_prop.h
@@ -19,18 +19,17 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_DSL_PROP_H
#define _SYS_DSL_PROP_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/dmu.h>
#include <sys/dsl_pool.h>
#include <sys/zfs_context.h>
+#include <sys/dsl_synctask.h>
#ifdef __cplusplus
extern "C" {
@@ -66,8 +65,10 @@ int dsl_prop_get_ds(struct dsl_dataset *ds, const char *propname,
int dsl_prop_get_dd(struct dsl_dir *dd, const char *propname,
int intsz, int numints, void *buf, char *setpoint);
+dsl_syncfunc_t dsl_props_set_sync;
int dsl_prop_set(const char *ddname, const char *propname,
int intsz, int numints, const void *buf);
+int dsl_props_set(const char *dsname, nvlist_t *nvl);
void dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
cred_t *cr, dmu_tx_t *tx);
diff --git a/module/zfs/include/sys/metaslab.h b/module/zfs/include/sys/metaslab.h
index 1c9d89e8f..5d3e11c97 100644
--- a/module/zfs/include/sys/metaslab.h
+++ b/module/zfs/include/sys/metaslab.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -39,6 +39,8 @@ extern "C" {
typedef struct metaslab_class metaslab_class_t;
typedef struct metaslab_group metaslab_group_t;
+extern space_map_ops_t *zfs_metaslab_ops;
+
extern metaslab_t *metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
uint64_t start, uint64_t size, uint64_t txg);
extern void metaslab_fini(metaslab_t *msp);
@@ -55,7 +57,7 @@ extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg,
boolean_t now);
extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg);
-extern metaslab_class_t *metaslab_class_create(void);
+extern metaslab_class_t *metaslab_class_create(space_map_ops_t *ops);
extern void metaslab_class_destroy(metaslab_class_t *mc);
extern void metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg);
extern void metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg);
diff --git a/module/zfs/include/sys/metaslab_impl.h b/module/zfs/include/sys/metaslab_impl.h
index 5980cbc84..d67dea7e9 100644
--- a/module/zfs/include/sys/metaslab_impl.h
+++ b/module/zfs/include/sys/metaslab_impl.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_METASLAB_IMPL_H
#define _SYS_METASLAB_IMPL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/metaslab.h>
#include <sys/space_map.h>
#include <sys/vdev.h>
@@ -41,6 +39,7 @@ extern "C" {
struct metaslab_class {
metaslab_group_t *mc_rotor;
uint64_t mc_allocated;
+ space_map_ops_t *mc_ops;
};
struct metaslab_group {
diff --git a/module/zfs/include/sys/spa.h b/module/zfs/include/sys/spa.h
index 029123dfe..c7ae4022e 100644
--- a/module/zfs/include/sys/spa.h
+++ b/module/zfs/include/sys/spa.h
@@ -324,12 +324,9 @@ extern int spa_get_stats(const char *pool, nvlist_t **config,
char *altroot, size_t buflen);
extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props,
const char *history_str, nvlist_t *zplprops);
-extern int spa_check_rootconf(char *devpath, char *devid,
- nvlist_t **bestconf, uint64_t *besttxg);
-extern boolean_t spa_rootdev_validate(nvlist_t *nv);
extern int spa_import_rootpool(char *devpath, char *devid);
extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props);
-extern int spa_import_faulted(const char *, nvlist_t *, nvlist_t *);
+extern int spa_import_verbatim(const char *, nvlist_t *, nvlist_t *);
extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
extern int spa_destroy(char *pool);
extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
@@ -347,6 +344,7 @@ extern void spa_inject_delref(spa_t *spa);
#define SPA_ASYNC_PROBE 0x04
#define SPA_ASYNC_RESILVER_DONE 0x08
#define SPA_ASYNC_RESILVER 0x10
+#define SPA_ASYNC_AUTOEXPAND 0x20
/* device manipulation */
extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot);
@@ -356,6 +354,7 @@ extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid,
int replace_done);
extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare);
extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath);
+extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru);
/* spare state (which is global across all pools) */
extern void spa_spare_add(vdev_t *vd);
diff --git a/module/zfs/include/sys/spa_boot.h b/module/zfs/include/sys/spa_boot.h
index b56073b97..1d3622f5a 100644
--- a/module/zfs/include/sys/spa_boot.h
+++ b/module/zfs/include/sys/spa_boot.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_SPA_BOOT_H
#define _SYS_SPA_BOOT_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/nvpair.h>
#ifdef __cplusplus
@@ -36,7 +34,6 @@ extern "C" {
extern char *spa_get_bootprop(char *prop);
extern void spa_free_bootprop(char *prop);
-extern int spa_get_rootconf(char *devpath, char *devid, nvlist_t **bestconf_p);
#ifdef __cplusplus
}
diff --git a/module/zfs/include/sys/spa_impl.h b/module/zfs/include/sys/spa_impl.h
index 588b4f5a9..12999ee9e 100644
--- a/module/zfs/include/sys/spa_impl.h
+++ b/module/zfs/include/sys/spa_impl.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -141,9 +141,6 @@ struct spa {
int spa_async_suspended; /* async tasks suspended */
kcondvar_t spa_async_cv; /* wait for thread_exit() */
uint16_t spa_async_tasks; /* async task mask */
- kmutex_t spa_async_root_lock; /* protects async root count */
- uint64_t spa_async_root_count; /* number of async root zios */
- kcondvar_t spa_async_root_cv; /* notify when count == 0 */
char *spa_root; /* alternate root directory */
uint64_t spa_ena; /* spa-wide ereport ENA */
boolean_t spa_last_open_failed; /* true if last open faled */
@@ -163,15 +160,16 @@ struct spa {
uint64_t spa_failmode; /* failure mode for the pool */
uint64_t spa_delegation; /* delegation on/off */
list_t spa_config_list; /* previous cache file(s) */
+ zio_t *spa_async_zio_root; /* root of all async I/O */
zio_t *spa_suspend_zio_root; /* root of all suspended I/O */
kmutex_t spa_suspend_lock; /* protects suspend_zio_root */
kcondvar_t spa_suspend_cv; /* notification of resume */
uint8_t spa_suspended; /* pool is suspended */
- boolean_t spa_import_faulted; /* allow faulted vdevs */
boolean_t spa_is_root; /* pool is root */
int spa_minref; /* num refs when first opened */
int spa_mode; /* FREAD | FWRITE */
spa_log_state_t spa_log_state; /* log state */
+ uint64_t spa_autoexpand; /* lun expansion on/off */
/*
* spa_refcnt & spa_config_lock must be the last elements
* because refcount_t changes size based on compilation options.
diff --git a/module/zfs/include/sys/space_map.h b/module/zfs/include/sys/space_map.h
index 8d7860660..a682bbd40 100644
--- a/module/zfs/include/sys/space_map.h
+++ b/module/zfs/include/sys/space_map.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -46,12 +46,14 @@ typedef struct space_map {
uint8_t sm_loading; /* map loading? */
kcondvar_t sm_load_cv; /* map load completion */
space_map_ops_t *sm_ops; /* space map block picker ops vector */
+ avl_tree_t *sm_pp_root; /* picker-private AVL tree */
void *sm_ppd; /* picker-private data */
kmutex_t *sm_lock; /* pointer to lock that protects map */
} space_map_t;
typedef struct space_seg {
avl_node_t ss_node; /* AVL node */
+ avl_node_t ss_pp_node; /* AVL picker-private node */
uint64_t ss_start; /* starting offset of this segment */
uint64_t ss_end; /* ending offset (non-inclusive) */
} space_seg_t;
@@ -74,6 +76,7 @@ struct space_map_ops {
uint64_t (*smop_alloc)(space_map_t *sm, uint64_t size);
void (*smop_claim)(space_map_t *sm, uint64_t start, uint64_t size);
void (*smop_free)(space_map_t *sm, uint64_t start, uint64_t size);
+ uint64_t (*smop_max)(space_map_t *sm);
};
/*
@@ -152,6 +155,7 @@ extern void space_map_unload(space_map_t *sm);
extern uint64_t space_map_alloc(space_map_t *sm, uint64_t size);
extern void space_map_claim(space_map_t *sm, uint64_t start, uint64_t size);
extern void space_map_free(space_map_t *sm, uint64_t start, uint64_t size);
+extern uint64_t space_map_maxsize(space_map_t *sm);
extern void space_map_sync(space_map_t *sm, uint8_t maptype,
space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx);
diff --git a/module/zfs/include/sys/vdev.h b/module/zfs/include/sys/vdev.h
index b8313a920..71b9b12d6 100644
--- a/module/zfs/include/sys/vdev.h
+++ b/module/zfs/include/sys/vdev.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -50,7 +50,6 @@ extern int vdev_open(vdev_t *);
extern int vdev_validate(vdev_t *);
extern void vdev_close(vdev_t *);
extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace);
-extern void vdev_init(vdev_t *, uint64_t txg);
extern void vdev_reopen(vdev_t *);
extern int vdev_validate_aux(vdev_t *vd);
extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio);
@@ -71,6 +70,8 @@ extern boolean_t vdev_resilver_needed(vdev_t *vd,
extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg);
extern void vdev_metaslab_fini(vdev_t *vd);
+extern void vdev_metaslab_set_size(vdev_t *);
+extern void vdev_expand(vdev_t *vd, uint64_t txg);
extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
extern void vdev_clear_stats(vdev_t *vd);
@@ -113,7 +114,8 @@ extern void vdev_queue_io_done(zio_t *zio);
extern void vdev_config_dirty(vdev_t *vd);
extern void vdev_config_clean(vdev_t *vd);
-extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg);
+extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg,
+ boolean_t);
extern void vdev_state_dirty(vdev_t *vd);
extern void vdev_state_clean(vdev_t *vd);
diff --git a/module/zfs/include/sys/vdev_impl.h b/module/zfs/include/sys/vdev_impl.h
index 652349341..8240b66ac 100644
--- a/module/zfs/include/sys/vdev_impl.h
+++ b/module/zfs/include/sys/vdev_impl.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -113,6 +113,7 @@ struct vdev {
uint64_t vdev_guid; /* unique ID for this vdev */
uint64_t vdev_guid_sum; /* self guid + all child guids */
uint64_t vdev_asize; /* allocatable device capacity */
+ uint64_t vdev_min_asize; /* min acceptable asize */
uint64_t vdev_ashift; /* block alignment shift */
uint64_t vdev_state; /* see VDEV_STATE_* #defines */
uint64_t vdev_prevstate; /* used when reopening a vdev */
@@ -125,6 +126,7 @@ struct vdev {
uint64_t vdev_children; /* number of children */
space_map_t vdev_dtl[DTL_TYPES]; /* in-core dirty time logs */
vdev_stat_t vdev_stat; /* virtual device statistics */
+ boolean_t vdev_expanding; /* expand the vdev? */
/*
* Top-level vdev state.
@@ -159,6 +161,7 @@ struct vdev {
char *vdev_path; /* vdev path (if any) */
char *vdev_devid; /* vdev devid (if any) */
char *vdev_physpath; /* vdev device path (if any) */
+ char *vdev_fru; /* physical FRU location */
uint64_t vdev_not_present; /* not present during import */
uint64_t vdev_unspare; /* unspare when resilvering done */
hrtime_t vdev_last_try; /* last reopen time */
@@ -188,8 +191,9 @@ struct vdev {
kmutex_t vdev_probe_lock; /* protects vdev_probe_zio */
};
-#define VDEV_SKIP_SIZE (8 << 10)
-#define VDEV_BOOT_HEADER_SIZE (8 << 10)
+#define VDEV_PAD_SIZE (8 << 10)
+/* 2 padding areas (vl_pad1 and vl_pad2) to skip */
+#define VDEV_SKIP_SIZE VDEV_PAD_SIZE * 2
#define VDEV_PHYS_SIZE (112 << 10)
#define VDEV_UBERBLOCK_RING (128 << 10)
@@ -201,26 +205,14 @@ struct vdev {
offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)])
#define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd))
-/* ZFS boot block */
-#define VDEV_BOOT_MAGIC 0x2f5b007b10cULL
-#define VDEV_BOOT_VERSION 1 /* version number */
-
-typedef struct vdev_boot_header {
- uint64_t vb_magic; /* VDEV_BOOT_MAGIC */
- uint64_t vb_version; /* VDEV_BOOT_VERSION */
- uint64_t vb_offset; /* start offset (bytes) */
- uint64_t vb_size; /* size (bytes) */
- char vb_pad[VDEV_BOOT_HEADER_SIZE - 4 * sizeof (uint64_t)];
-} vdev_boot_header_t;
-
typedef struct vdev_phys {
char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_block_tail_t)];
zio_block_tail_t vp_zbt;
} vdev_phys_t;
typedef struct vdev_label {
- char vl_pad[VDEV_SKIP_SIZE]; /* 8K */
- vdev_boot_header_t vl_boot_header; /* 8K */
+ char vl_pad1[VDEV_PAD_SIZE]; /* 8K */
+ char vl_pad2[VDEV_PAD_SIZE]; /* 8K */
vdev_phys_t vl_vdev_phys; /* 112K */
char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */
} vdev_label_t; /* 256K total */
@@ -249,6 +241,7 @@ typedef struct vdev_label {
#define VDEV_ALLOC_ADD 1
#define VDEV_ALLOC_SPARE 2
#define VDEV_ALLOC_L2CACHE 3
+#define VDEV_ALLOC_ROOTPOOL 4
/*
* Allocate or free a vdev
@@ -269,6 +262,7 @@ extern void vdev_remove_parent(vdev_t *cvd);
/*
* vdev sync load and sync
*/
+extern void vdev_load_log_state(vdev_t *vd, nvlist_t *nv);
extern void vdev_load(vdev_t *vd);
extern void vdev_sync(vdev_t *vd, uint64_t txg);
extern void vdev_sync_done(vdev_t *vd, uint64_t txg);
@@ -290,7 +284,8 @@ extern vdev_ops_t vdev_spare_ops;
* Common size functions
*/
extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize);
-extern uint64_t vdev_get_rsize(vdev_t *vd);
+extern uint64_t vdev_get_min_asize(vdev_t *vd);
+extern void vdev_set_min_asize(vdev_t *vd);
/*
* zdb uses this tunable, so it must be declared here to make lint happy.
diff --git a/module/zfs/include/sys/zap.h b/module/zfs/include/sys/zap.h
index f88cc068b..de2053812 100644
--- a/module/zfs/include/sys/zap.h
+++ b/module/zfs/include/sys/zap.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_ZAP_H
#define _SYS_ZAP_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* ZAP - ZFS Attribute Processor
*
@@ -87,9 +85,6 @@
extern "C" {
#endif
-#define ZAP_MAXNAMELEN 256
-#define ZAP_MAXVALUELEN 1024
-
/*
* The matchtype specifies which entry will be accessed.
* MT_EXACT: only find an exact match (non-normalized)
@@ -186,6 +181,10 @@ int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name,
matchtype_t mt, char *realname, int rn_len,
boolean_t *normalization_conflictp);
+int zap_count_write(objset_t *os, uint64_t zapobj, const char *name,
+ int add, uint64_t *towrite, uint64_t *tooverwrite,
+ uint64_t dn_datablkshift);
+
/*
* Create an attribute with the given name and value.
*
diff --git a/module/zfs/include/sys/zap_impl.h b/module/zfs/include/sys/zap_impl.h
index 0dc02ab6b..c86bb16de 100644
--- a/module/zfs/include/sys/zap_impl.h
+++ b/module/zfs/include/sys/zap_impl.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_ZAP_IMPL_H
#define _SYS_ZAP_IMPL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/zap.h>
#include <sys/zfs_context.h>
#include <sys/avl.h>
@@ -195,6 +193,8 @@ int fzap_count(zap_t *zap, uint64_t *count);
int fzap_lookup(zap_name_t *zn,
uint64_t integer_size, uint64_t num_integers, void *buf,
char *realname, int rn_len, boolean_t *normalization_conflictp);
+int fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite,
+ uint64_t *tooverwrite);
int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers,
const void *val, dmu_tx_t *tx);
int fzap_update(zap_name_t *zn,
diff --git a/module/zfs/include/sys/zfs_acl.h b/module/zfs/include/sys/zfs_acl.h
index bd91b33d1..f5e5aa7f4 100644
--- a/module/zfs/include/sys/zfs_acl.h
+++ b/module/zfs/include/sys/zfs_acl.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -114,8 +114,6 @@ typedef struct zfs_acl_phys {
uint8_t z_ace_data[ZFS_ACE_SPACE]; /* space for embedded ACEs */
} zfs_acl_phys_t;
-
-
typedef struct acl_ops {
uint32_t (*ace_mask_get) (void *acep); /* get access mask */
void (*ace_mask_set) (void *acep,
@@ -161,12 +159,21 @@ typedef struct zfs_acl {
zfs_acl_node_t *z_curr_node; /* current node iterator is handling */
list_t z_acl; /* chunks of ACE data */
acl_ops_t z_ops; /* ACL operations */
- boolean_t z_has_fuids; /* FUIDs present in ACL? */
} zfs_acl_t;
#define ACL_DATA_ALLOCED 0x1
#define ZFS_ACL_SIZE(aclcnt) (sizeof (ace_t) * (aclcnt))
+struct zfs_fuid_info;
+
+typedef struct zfs_acl_ids {
+ uint64_t z_fuid; /* file owner fuid */
+ uint64_t z_fgid; /* file group owner fuid */
+ uint64_t z_mode; /* mode to set on create */
+ zfs_acl_t *z_aclp; /* ACL to create with file */
+ struct zfs_fuid_info *z_fuidp; /* for tracking fuids for log */
+} zfs_acl_ids_t;
+
/*
* Property values for acl_mode and acl_inherit.
*
@@ -183,16 +190,18 @@ typedef struct zfs_acl {
struct znode;
struct zfsvfs;
-struct zfs_fuid_info;
#ifdef _KERNEL
-void zfs_perm_init(struct znode *, struct znode *, int, vattr_t *,
- dmu_tx_t *, cred_t *, zfs_acl_t *, zfs_fuid_info_t **);
+int zfs_acl_ids_create(struct znode *, int, vattr_t *,
+ cred_t *, vsecattr_t *, zfs_acl_ids_t *);
+void zfs_acl_ids_free(zfs_acl_ids_t *);
+boolean_t zfs_acl_ids_overquota(struct zfsvfs *, zfs_acl_ids_t *);
int zfs_getacl(struct znode *, vsecattr_t *, boolean_t, cred_t *);
int zfs_setacl(struct znode *, vsecattr_t *, boolean_t, cred_t *);
void zfs_acl_rele(void *);
void zfs_oldace_byteswap(ace_t *, int);
void zfs_ace_byteswap(void *, size_t, boolean_t);
+extern boolean_t zfs_has_access(struct znode *zp, cred_t *cr);
extern int zfs_zaccess(struct znode *, int, int, boolean_t, cred_t *);
extern int zfs_zaccess_rwx(struct znode *, mode_t, int, cred_t *);
extern int zfs_zaccess_unix(struct znode *, mode_t, cred_t *);
@@ -202,9 +211,9 @@ int zfs_zaccess_delete(struct znode *, struct znode *, cred_t *);
int zfs_zaccess_rename(struct znode *, struct znode *,
struct znode *, struct znode *, cred_t *cr);
void zfs_acl_free(zfs_acl_t *);
-int zfs_vsec_2_aclp(struct zfsvfs *, vtype_t, vsecattr_t *, zfs_acl_t **);
-int zfs_aclset_common(struct znode *, zfs_acl_t *, cred_t *,
- struct zfs_fuid_info **, dmu_tx_t *);
+int zfs_vsec_2_aclp(struct zfsvfs *, vtype_t, vsecattr_t *, cred_t *,
+ struct zfs_fuid_info **, zfs_acl_t **);
+int zfs_aclset_common(struct znode *, zfs_acl_t *, cred_t *, dmu_tx_t *);
#endif
diff --git a/module/zfs/include/sys/zfs_context.h b/module/zfs/include/sys/zfs_context.h
index a5be3e130..40de32084 100644
--- a/module/zfs/include/sys/zfs_context.h
+++ b/module/zfs/include/sys/zfs_context.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_ZFS_CONTEXT_H
#define _SYS_ZFS_CONTEXT_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -62,6 +60,7 @@ extern "C" {
#include <sys/zfs_debug.h>
#include <sys/sysevent.h>
#include <sys/sysevent/eventdefs.h>
+#include <sys/sysevent/dev.h>
#include <sys/fm/util.h>
#define CPU_SEQID (CPU->cpu_seqid)
diff --git a/module/zfs/include/sys/zfs_ctldir.h b/module/zfs/include/sys/zfs_ctldir.h
index ce29625d1..c15c946d5 100644
--- a/module/zfs/include/sys/zfs_ctldir.h
+++ b/module/zfs/include/sys/zfs_ctldir.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _ZFS_CTLDIR_H
#define _ZFS_CTLDIR_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/pathname.h>
#include <sys/vnode.h>
#include <sys/zfs_vfsops.h>
@@ -66,6 +64,7 @@ int zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp);
#define ZFSCTL_INO_ROOT 0x1
#define ZFSCTL_INO_SNAPDIR 0x2
+#define ZFSCTL_INO_SHARES 0x3
#ifdef __cplusplus
}
diff --git a/module/zfs/include/sys/zfs_dir.h b/module/zfs/include/sys/zfs_dir.h
index ebb66e8ae..650315be2 100644
--- a/module/zfs/include/sys/zfs_dir.h
+++ b/module/zfs/include/sys/zfs_dir.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_FS_ZFS_DIR_H
#define _SYS_FS_ZFS_DIR_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/pathname.h>
#include <sys/dmu.h>
#include <sys/zfs_znode.h>
@@ -59,7 +57,7 @@ extern int zfs_link_destroy(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int,
extern int zfs_dirlook(znode_t *, char *, vnode_t **, int, int *,
pathname_t *);
extern void zfs_mknode(znode_t *, vattr_t *, dmu_tx_t *, cred_t *,
- uint_t, znode_t **, int, zfs_acl_t *, zfs_fuid_info_t **);
+ uint_t, znode_t **, int, zfs_acl_ids_t *);
extern void zfs_rmnode(znode_t *);
extern void zfs_dl_name_switch(zfs_dirlock_t *dl, char *new, char **old);
extern boolean_t zfs_dirempty(znode_t *);
diff --git a/module/zfs/include/sys/zfs_fuid.h b/module/zfs/include/sys/zfs_fuid.h
index 810ffc81a..f81ddf4a5 100644
--- a/module/zfs/include/sys/zfs_fuid.h
+++ b/module/zfs/include/sys/zfs_fuid.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_FS_ZFS_FUID_H
#define _SYS_FS_ZFS_FUID_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef _KERNEL
#include <sys/kidmap.h>
#include <sys/sid.h>
@@ -51,11 +49,11 @@ typedef enum {
* Estimate space needed for one more fuid table entry.
* for now assume its current size + 1K
*/
-#define FUID_SIZE_ESTIMATE(z) (z->z_fuid_size + (SPA_MINBLOCKSIZE << 1))
+#define FUID_SIZE_ESTIMATE(z) ((z)->z_fuid_size + (SPA_MINBLOCKSIZE << 1))
-#define FUID_INDEX(x) (x >> 32)
-#define FUID_RID(x) (x & 0xffffffff)
-#define FUID_ENCODE(idx, rid) ((idx << 32) | rid)
+#define FUID_INDEX(x) ((x) >> 32)
+#define FUID_RID(x) ((x) & 0xffffffff)
+#define FUID_ENCODE(idx, rid) (((uint64_t)(idx) << 32) | (rid))
/*
* FUIDs cause problems for the intent log
* we need to replay the creation of the FUID,
@@ -104,17 +102,23 @@ struct znode;
extern uid_t zfs_fuid_map_id(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t);
extern void zfs_fuid_destroy(zfsvfs_t *);
extern uint64_t zfs_fuid_create_cred(zfsvfs_t *, zfs_fuid_type_t,
- dmu_tx_t *, cred_t *, zfs_fuid_info_t **);
+ cred_t *, zfs_fuid_info_t **);
extern uint64_t zfs_fuid_create(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t,
- dmu_tx_t *, zfs_fuid_info_t **);
-extern void zfs_fuid_map_ids(struct znode *zp, cred_t *cr, uid_t *uid,
- uid_t *gid);
+ zfs_fuid_info_t **);
+extern void zfs_fuid_map_ids(struct znode *zp, cred_t *cr,
+ uid_t *uid, uid_t *gid);
extern zfs_fuid_info_t *zfs_fuid_info_alloc(void);
-extern void zfs_fuid_info_free();
+extern void zfs_fuid_info_free(zfs_fuid_info_t *);
extern boolean_t zfs_groupmember(zfsvfs_t *, uint64_t, cred_t *);
+void zfs_fuid_sync(zfsvfs_t *, dmu_tx_t *);
+extern int zfs_fuid_find_by_domain(zfsvfs_t *, const char *domain,
+ char **retdomain, boolean_t addok);
+extern const char *zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx);
+extern void zfs_fuid_txhold(zfsvfs_t *zfsvfs, dmu_tx_t *tx);
#endif
char *zfs_fuid_idx_domain(avl_tree_t *, uint32_t);
+void zfs_fuid_avl_tree_create(avl_tree_t *, avl_tree_t *);
uint64_t zfs_fuid_table_load(objset_t *, uint64_t, avl_tree_t *, avl_tree_t *);
void zfs_fuid_table_destroy(avl_tree_t *, avl_tree_t *);
diff --git a/module/zfs/include/sys/zfs_ioctl.h b/module/zfs/include/sys/zfs_ioctl.h
index 1692608bb..1e9f35155 100644
--- a/module/zfs/include/sys/zfs_ioctl.h
+++ b/module/zfs/include/sys/zfs_ioctl.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_ZFS_IOCTL_H
#define _SYS_ZFS_IOCTL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/cred.h>
#include <sys/dmu.h>
#include <sys/zio.h>
@@ -118,7 +116,7 @@ typedef struct zinject_record {
uint32_t zi_error;
uint64_t zi_type;
uint32_t zi_freq;
- uint32_t zi_pad; /* pad out to 64 bit alignment */
+ uint32_t zi_failfast;
} zinject_record_t;
#define ZINJECT_NULL 0x1
@@ -162,12 +160,20 @@ typedef struct zfs_cmd {
uint64_t zc_history_len;
uint64_t zc_history_offset;
uint64_t zc_obj;
+ uint64_t zc_iflags; /* internal to zfs(7fs) */
zfs_share_t zc_share;
dmu_objset_stats_t zc_objset_stats;
struct drr_begin zc_begin_record;
zinject_record_t zc_inject_record;
} zfs_cmd_t;
+typedef struct zfs_useracct {
+ char zu_domain[256];
+ uid_t zu_rid;
+ uint32_t zu_pad;
+ uint64_t zu_space;
+} zfs_useracct_t;
+
#define ZVOL_MAX_MINOR (1 << 16)
#define ZFS_MIN_MINOR (ZVOL_MAX_MINOR + 1)
diff --git a/module/zfs/include/sys/zfs_vfsops.h b/module/zfs/include/sys/zfs_vfsops.h
index 7e0440be4..b8ed7b27f 100644
--- a/module/zfs/include/sys/zfs_vfsops.h
+++ b/module/zfs/include/sys/zfs_vfsops.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -53,6 +53,7 @@ struct zfsvfs {
avl_tree_t z_fuid_domain; /* fuid tree keyed by domain */
krwlock_t z_fuid_lock; /* fuid lock */
boolean_t z_fuid_loaded; /* fuid tables are loaded */
+ boolean_t z_fuid_dirty; /* need to sync fuid table ? */
struct zfs_fuid_info *z_fuid_replay; /* fuid info for replay */
zilog_t *z_log; /* intent log pointer */
uint_t z_acl_mode; /* acl chmod/mode behavior */
@@ -72,8 +73,12 @@ struct zfsvfs {
boolean_t z_vscan; /* virus scan on/off */
boolean_t z_use_fuids; /* version allows fuids */
boolean_t z_replay; /* set during ZIL replay */
- kmutex_t z_online_recv_lock; /* recv in prog grabs as WRITER */
+ kmutex_t z_online_recv_lock; /* held while recv in progress */
uint64_t z_version; /* ZPL version */
+ uint64_t z_shares_dir; /* hidden shares dir */
+ kmutex_t z_lock;
+ uint64_t z_userquota_obj;
+ uint64_t z_groupquota_obj;
#define ZFS_OBJ_MTX_SZ 64
kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */
};
@@ -130,6 +135,17 @@ extern uint_t zfs_fsyncer_key;
extern int zfs_suspend_fs(zfsvfs_t *zfsvfs, char *osname, int *mode);
extern int zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode);
+extern int zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
+ const char *domain, uint64_t rid, uint64_t *valuep);
+extern int zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
+ uint64_t *cookiep, void *vbuf, uint64_t *bufsizep);
+extern int zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
+ const char *domain, uint64_t rid, uint64_t quota);
+extern boolean_t zfs_usergroup_overquota(zfsvfs_t *zfsvfs,
+ boolean_t isgroup, uint64_t fuid);
+extern int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers);
+extern int zfsvfs_create(const char *name, int mode, zfsvfs_t **zvp);
+extern void zfsvfs_free(zfsvfs_t *zfsvfs);
#ifdef __cplusplus
}
diff --git a/module/zfs/include/sys/zfs_znode.h b/module/zfs/include/sys/zfs_znode.h
index 9192abcd7..69f4b50f5 100644
--- a/module/zfs/include/sys/zfs_znode.h
+++ b/module/zfs/include/sys/zfs_znode.h
@@ -93,12 +93,15 @@ extern "C" {
/*
* Special attributes for master node.
+ * "userquota@" and "groupquota@" are also valid (from
+ * zfs_userquota_prop_prefixes[]).
*/
#define ZFS_FSID "FSID"
#define ZFS_UNLINKED_SET "DELETE_QUEUE"
#define ZFS_ROOT_OBJ "ROOT"
#define ZPL_VERSION_STR "VERSION"
#define ZFS_FUID_TABLES "FUID"
+#define ZFS_SHARES_DIR "SHARES"
#define ZFS_MAX_BLOCKSIZE (SPA_MAXBLOCKSIZE)
@@ -309,7 +312,6 @@ extern int zfs_create_op_tables();
extern int zfs_sync(vfs_t *vfsp, short flag, cred_t *cr);
extern dev_t zfs_cmpldev(uint64_t);
extern int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value);
-extern int zfs_set_version(const char *name, uint64_t newvers);
extern int zfs_get_stats(objset_t *os, nvlist_t *nv);
extern void zfs_znode_dmu_fini(znode_t *);
@@ -336,6 +338,7 @@ extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
vsecattr_t *vsecp, zfs_fuid_info_t *fuidp);
extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap);
extern void zfs_upgrade(zfsvfs_t *zfsvfs, dmu_tx_t *tx);
+extern int zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx);
extern caddr_t zfs_map_page(page_t *, enum seg_rw);
extern void zfs_unmap_page(page_t *, caddr_t);
diff --git a/module/zfs/include/sys/zil.h b/module/zfs/include/sys/zil.h
index b69323cfa..2aff8cd68 100644
--- a/module/zfs/include/sys/zil.h
+++ b/module/zfs/include/sys/zil.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -56,10 +56,16 @@ typedef struct zil_header {
uint64_t zh_replay_seq; /* highest replayed sequence number */
blkptr_t zh_log; /* log chain */
uint64_t zh_claim_seq; /* highest claimed sequence number */
- uint64_t zh_pad[5];
+ uint64_t zh_flags; /* header flags */
+ uint64_t zh_pad[4];
} zil_header_t;
/*
+ * zh_flags bit settings
+ */
+#define ZIL_REPLAY_NEEDED 0x1 /* replay needed - internal only */
+
+/*
* Log block trailer - structure at the end of the header and each log block
*
* The zit_bt contains a zbt_cksum which for the intent log is
@@ -299,7 +305,27 @@ typedef struct {
*/
/*
- * ZFS intent log transaction structure
+ * Writes are handled in three different ways:
+ *
+ * WR_INDIRECT:
+ * In this mode, if we need to commit the write later, then the block
+ * is immediately written into the file system (using dmu_sync),
+ * and a pointer to the block is put into the log record.
+ * When the txg commits the block is linked in.
+ * This saves additionally writing the data into the log record.
+ * There are a few requirements for this to occur:
+ * - write is greater than zfs/zvol_immediate_write_sz
+ * - not using slogs (as slogs are assumed to always be faster
+ * than writing into the main pool)
+ * - the write occupies only one block
+ * WR_COPIED:
+ * If we know we'll immediately be committing the
+ * transaction (FSYNC or FDSYNC), the we allocate a larger
+ * log record here for the data and copy the data in.
+ * WR_NEED_COPY:
+ * Otherwise we don't allocate a buffer, and *if* we need to
+ * flush the write later then a buffer is allocated and
+ * we retrieve the data using the dmu.
*/
typedef enum {
WR_INDIRECT, /* indirect - a large write (dmu_sync() data */
@@ -359,9 +385,9 @@ extern uint64_t zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
extern void zil_commit(zilog_t *zilog, uint64_t seq, uint64_t oid);
+extern int zil_vdev_offline(char *osname, void *txarg);
extern int zil_claim(char *osname, void *txarg);
extern int zil_check_log_chain(char *osname, void *txarg);
-extern int zil_clear_log_chain(char *osname, void *txarg);
extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx);
extern void zil_clean(zilog_t *zilog);
extern int zil_is_committed(zilog_t *zilog);
diff --git a/module/zfs/include/sys/zil_impl.h b/module/zfs/include/sys/zil_impl.h
index 3f2582931..685305fb5 100644
--- a/module/zfs/include/sys/zil_impl.h
+++ b/module/zfs/include/sys/zil_impl.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -101,6 +101,9 @@ typedef struct zil_dva_node {
avl_node_t zn_node;
} zil_dva_node_t;
+#define ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_trailer_t) - \
+ sizeof (lr_write_t))
+
#ifdef __cplusplus
}
#endif
diff --git a/module/zfs/include/sys/zio.h b/module/zfs/include/sys/zio.h
index 67adc3b4c..5c51717c1 100644
--- a/module/zfs/include/sys/zio.h
+++ b/module/zfs/include/sys/zio.h
@@ -76,7 +76,7 @@ enum zio_checksum {
ZIO_CHECKSUM_FUNCTIONS
};
-#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_2
+#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_4
#define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON
enum zio_compress {
@@ -116,30 +116,33 @@ enum zio_compress {
#define ZIO_PRIORITY_SCRUB (zio_priority_table[9])
#define ZIO_PRIORITY_TABLE_SIZE 10
-#define ZIO_FLAG_MUSTSUCCEED 0x00000
-#define ZIO_FLAG_CANFAIL 0x00001
-#define ZIO_FLAG_SPECULATIVE 0x00002
-#define ZIO_FLAG_CONFIG_WRITER 0x00004
-#define ZIO_FLAG_DONT_RETRY 0x00008
+#define ZIO_FLAG_MUSTSUCCEED 0x000000
+#define ZIO_FLAG_CANFAIL 0x000001
+#define ZIO_FLAG_SPECULATIVE 0x000002
+#define ZIO_FLAG_CONFIG_WRITER 0x000004
+#define ZIO_FLAG_DONT_RETRY 0x000008
-#define ZIO_FLAG_DONT_CACHE 0x00010
-#define ZIO_FLAG_DONT_QUEUE 0x00020
-#define ZIO_FLAG_DONT_AGGREGATE 0x00040
-#define ZIO_FLAG_DONT_PROPAGATE 0x00080
+#define ZIO_FLAG_DONT_CACHE 0x000010
+#define ZIO_FLAG_DONT_QUEUE 0x000020
+#define ZIO_FLAG_DONT_AGGREGATE 0x000040
+#define ZIO_FLAG_DONT_PROPAGATE 0x000080
-#define ZIO_FLAG_IO_BYPASS 0x00100
-#define ZIO_FLAG_IO_REPAIR 0x00200
-#define ZIO_FLAG_IO_RETRY 0x00400
-#define ZIO_FLAG_IO_REWRITE 0x00800
+#define ZIO_FLAG_IO_BYPASS 0x000100
+#define ZIO_FLAG_IO_REPAIR 0x000200
+#define ZIO_FLAG_IO_RETRY 0x000400
+#define ZIO_FLAG_IO_REWRITE 0x000800
-#define ZIO_FLAG_SELF_HEAL 0x01000
-#define ZIO_FLAG_RESILVER 0x02000
-#define ZIO_FLAG_SCRUB 0x04000
-#define ZIO_FLAG_SCRUB_THREAD 0x08000
+#define ZIO_FLAG_SELF_HEAL 0x001000
+#define ZIO_FLAG_RESILVER 0x002000
+#define ZIO_FLAG_SCRUB 0x004000
+#define ZIO_FLAG_SCRUB_THREAD 0x008000
-#define ZIO_FLAG_PROBE 0x10000
-#define ZIO_FLAG_GANG_CHILD 0x20000
-#define ZIO_FLAG_RAW 0x40000
+#define ZIO_FLAG_PROBE 0x010000
+#define ZIO_FLAG_GANG_CHILD 0x020000
+#define ZIO_FLAG_RAW 0x040000
+#define ZIO_FLAG_GODFATHER 0x080000
+
+#define ZIO_FLAG_TRYHARD 0x100000
#define ZIO_FLAG_GANG_INHERIT \
(ZIO_FLAG_CANFAIL | \
@@ -157,7 +160,8 @@ enum zio_compress {
(ZIO_FLAG_GANG_INHERIT | \
ZIO_FLAG_IO_REPAIR | \
ZIO_FLAG_IO_RETRY | \
- ZIO_FLAG_PROBE)
+ ZIO_FLAG_PROBE | \
+ ZIO_FLAG_TRYHARD)
#define ZIO_FLAG_AGG_INHERIT \
(ZIO_FLAG_DONT_AGGREGATE | \
@@ -281,7 +285,6 @@ struct zio {
int io_cmd;
uint8_t io_priority;
uint8_t io_reexecute;
- uint8_t io_async_root;
uint8_t io_state[ZIO_WAIT_TYPES];
uint64_t io_txg;
spa_t *io_spa;
@@ -324,6 +327,7 @@ struct zio {
int io_child_error[ZIO_CHILD_TYPES];
uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
uint64_t *io_stall;
+ zio_t *io_gang_leader;
zio_gang_node_t *io_gang_tree;
void *io_executor;
void *io_waiter;
@@ -415,7 +419,7 @@ extern uint8_t zio_checksum_select(uint8_t child, uint8_t parent);
extern uint8_t zio_compress_select(uint8_t child, uint8_t parent);
extern void zio_suspend(spa_t *spa, zio_t *zio);
-extern void zio_resume(spa_t *spa);
+extern int zio_resume(spa_t *spa);
extern void zio_resume_wait(spa_t *spa);
/*
@@ -435,7 +439,7 @@ extern int zio_inject_list_next(int *id, char *name, size_t buflen,
struct zinject_record *record);
extern int zio_clear_fault(int id);
extern int zio_handle_fault_injection(zio_t *zio, int error);
-extern int zio_handle_device_injection(vdev_t *vd, int error);
+extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error);
extern int zio_handle_label_injection(zio_t *zio, int error);
#ifdef __cplusplus
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 412832968..77556ac5d 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -36,18 +36,35 @@ uint64_t metaslab_aliquot = 512ULL << 10;
uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
/*
+ * Minimum size which forces the dynamic allocator to change
+ * it's allocation strategy. Once the space map cannot satisfy
+ * an allocation of this size then it switches to using more
+ * aggressive strategy (i.e search by size rather than offset).
+ */
+uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
+
+/*
+ * The minimum free space, in percent, which must be available
+ * in a space map to continue allocations in a first-fit fashion.
+ * Once the space_map's free space drops below this level we dynamically
+ * switch to using best-fit allocations.
+ */
+int metaslab_df_free_pct = 30;
+
+/*
* ==========================================================================
* Metaslab classes
* ==========================================================================
*/
metaslab_class_t *
-metaslab_class_create(void)
+metaslab_class_create(space_map_ops_t *ops)
{
metaslab_class_t *mc;
mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
mc->mc_rotor = NULL;
+ mc->mc_ops = ops;
return (mc);
}
@@ -202,30 +219,14 @@ metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
}
/*
- * ==========================================================================
- * The first-fit block allocator
- * ==========================================================================
+ * This is a helper function that can be used by the allocator to find
+ * a suitable block to allocate. This will search the specified AVL
+ * tree looking for a block that matches the specified criteria.
*/
-static void
-metaslab_ff_load(space_map_t *sm)
-{
- ASSERT(sm->sm_ppd == NULL);
- sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
-}
-
-static void
-metaslab_ff_unload(space_map_t *sm)
-{
- kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
- sm->sm_ppd = NULL;
-}
-
static uint64_t
-metaslab_ff_alloc(space_map_t *sm, uint64_t size)
+metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
+ uint64_t align)
{
- avl_tree_t *t = &sm->sm_root;
- uint64_t align = size & -size;
- uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
space_seg_t *ss, ssearch;
avl_index_t where;
@@ -254,7 +255,37 @@ metaslab_ff_alloc(space_map_t *sm, uint64_t size)
return (-1ULL);
*cursor = 0;
- return (metaslab_ff_alloc(sm, size));
+ return (metaslab_block_picker(t, cursor, size, align));
+}
+
+/*
+ * ==========================================================================
+ * The first-fit block allocator
+ * ==========================================================================
+ */
+static void
+metaslab_ff_load(space_map_t *sm)
+{
+ ASSERT(sm->sm_ppd == NULL);
+ sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
+ sm->sm_pp_root = NULL;
+}
+
+static void
+metaslab_ff_unload(space_map_t *sm)
+{
+ kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
+ sm->sm_ppd = NULL;
+}
+
+static uint64_t
+metaslab_ff_alloc(space_map_t *sm, uint64_t size)
+{
+ avl_tree_t *t = &sm->sm_root;
+ uint64_t align = size & -size;
+ uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
+
+ return (metaslab_block_picker(t, cursor, size, align));
}
/* ARGSUSED */
@@ -276,9 +307,136 @@ static space_map_ops_t metaslab_ff_ops = {
metaslab_ff_unload,
metaslab_ff_alloc,
metaslab_ff_claim,
- metaslab_ff_free
+ metaslab_ff_free,
+ NULL /* maxsize */
+};
+
+/*
+ * Dynamic block allocator -
+ * Uses the first fit allocation scheme until space get low and then
+ * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
+ * and metaslab_df_free_pct to determine when to switch the allocation scheme.
+ */
+
+uint64_t
+metaslab_df_maxsize(space_map_t *sm)
+{
+ avl_tree_t *t = sm->sm_pp_root;
+ space_seg_t *ss;
+
+ if (t == NULL || (ss = avl_last(t)) == NULL)
+ return (0ULL);
+
+ return (ss->ss_end - ss->ss_start);
+}
+
+static int
+metaslab_df_seg_compare(const void *x1, const void *x2)
+{
+ const space_seg_t *s1 = x1;
+ const space_seg_t *s2 = x2;
+ uint64_t ss_size1 = s1->ss_end - s1->ss_start;
+ uint64_t ss_size2 = s2->ss_end - s2->ss_start;
+
+ if (ss_size1 < ss_size2)
+ return (-1);
+ if (ss_size1 > ss_size2)
+ return (1);
+
+ if (s1->ss_start < s2->ss_start)
+ return (-1);
+ if (s1->ss_start > s2->ss_start)
+ return (1);
+
+ return (0);
+}
+
+static void
+metaslab_df_load(space_map_t *sm)
+{
+ space_seg_t *ss;
+
+ ASSERT(sm->sm_ppd == NULL);
+ sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
+
+ sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
+ avl_create(sm->sm_pp_root, metaslab_df_seg_compare,
+ sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));
+
+ for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
+ avl_add(sm->sm_pp_root, ss);
+}
+
+static void
+metaslab_df_unload(space_map_t *sm)
+{
+ void *cookie = NULL;
+
+ kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
+ sm->sm_ppd = NULL;
+
+ while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
+ /* tear down the tree */
+ }
+
+ avl_destroy(sm->sm_pp_root);
+ kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
+ sm->sm_pp_root = NULL;
+}
+
+static uint64_t
+metaslab_df_alloc(space_map_t *sm, uint64_t size)
+{
+ avl_tree_t *t = &sm->sm_root;
+ uint64_t align = size & -size;
+ uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
+ uint64_t max_size = metaslab_df_maxsize(sm);
+ int free_pct = sm->sm_space * 100 / sm->sm_size;
+
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+ ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
+
+ if (max_size < size)
+ return (-1ULL);
+
+ /*
+ * If we're running low on space switch to using the size
+ * sorted AVL tree (best-fit).
+ */
+ if (max_size < metaslab_df_alloc_threshold ||
+ free_pct < metaslab_df_free_pct) {
+ t = sm->sm_pp_root;
+ *cursor = 0;
+ }
+
+ return (metaslab_block_picker(t, cursor, size, 1ULL));
+}
+
+/* ARGSUSED */
+static void
+metaslab_df_claim(space_map_t *sm, uint64_t start, uint64_t size)
+{
+ /* No need to update cursor */
+}
+
+/* ARGSUSED */
+static void
+metaslab_df_free(space_map_t *sm, uint64_t start, uint64_t size)
+{
+ /* No need to update cursor */
+}
+
+static space_map_ops_t metaslab_df_ops = {
+ metaslab_df_load,
+ metaslab_df_unload,
+ metaslab_df_alloc,
+ metaslab_df_claim,
+ metaslab_df_free,
+ metaslab_df_maxsize
};
+space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
+
/*
* ==========================================================================
* Metaslabs
@@ -414,20 +572,28 @@ metaslab_weight(metaslab_t *msp)
}
static int
-metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
+metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
{
space_map_t *sm = &msp->ms_map;
+ space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;
ASSERT(MUTEX_HELD(&msp->ms_lock));
if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
- int error = space_map_load(sm, &metaslab_ff_ops,
- SM_FREE, &msp->ms_smo,
+ int error = space_map_load(sm, sm_ops, SM_FREE, &msp->ms_smo,
msp->ms_group->mg_vd->vdev_spa->spa_meta_objset);
if (error) {
metaslab_group_sort(msp->ms_group, msp, 0);
return (error);
}
+
+ /*
+ * If we were able to load the map then make sure
+ * that this map is still able to satisfy our request.
+ */
+ if (msp->ms_weight < size)
+ return (ENOSPC);
+
metaslab_group_sort(msp->ms_group, msp,
msp->ms_weight | activation_weight);
}
@@ -636,11 +802,16 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
int i;
activation_weight = METASLAB_WEIGHT_PRIMARY;
- for (i = 0; i < d; i++)
- if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id)
+ for (i = 0; i < d; i++) {
+ if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
activation_weight = METASLAB_WEIGHT_SECONDARY;
+ break;
+ }
+ }
for (;;) {
+ boolean_t was_active;
+
mutex_enter(&mg->mg_lock);
for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
if (msp->ms_weight < size) {
@@ -648,6 +819,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
return (-1ULL);
}
+ was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
if (activation_weight == METASLAB_WEIGHT_PRIMARY)
break;
@@ -673,7 +845,9 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
* another thread may have changed the weight while we
* were blocked on the metaslab lock.
*/
- if (msp->ms_weight < size) {
+ if (msp->ms_weight < size || (was_active &&
+ !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
+ activation_weight == METASLAB_WEIGHT_PRIMARY)) {
mutex_exit(&msp->ms_lock);
continue;
}
@@ -686,7 +860,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
continue;
}
- if (metaslab_activate(msp, activation_weight) != 0) {
+ if (metaslab_activate(msp, activation_weight, size) != 0) {
mutex_exit(&msp->ms_lock);
continue;
}
@@ -869,7 +1043,7 @@ next:
goto top;
}
- if (!zio_lock) {
+ if (!allocatable && !zio_lock) {
dshift = 3;
zio_lock = B_TRUE;
goto top;
@@ -955,7 +1129,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
mutex_enter(&msp->ms_lock);
- error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
+ error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0);
if (error || txg == 0) { /* txg == 0 indicates dry run */
mutex_exit(&msp->ms_lock);
return (error);
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index df1b7e125..6a95b399b 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -59,6 +59,7 @@
#include <sys/systeminfo.h>
#include <sys/sunddi.h>
#include <sys/spa_boot.h>
+#include <sys/zfs_ioctl.h>
#ifdef _KERNEL
#include <sys/zone.h>
@@ -67,16 +68,44 @@
#include "zfs_prop.h"
#include "zfs_comutil.h"
-int zio_taskq_threads[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
- /* ISSUE INTR */
- { 1, 1 }, /* ZIO_TYPE_NULL */
- { 1, 8 }, /* ZIO_TYPE_READ */
- { 8, 1 }, /* ZIO_TYPE_WRITE */
- { 1, 1 }, /* ZIO_TYPE_FREE */
- { 1, 1 }, /* ZIO_TYPE_CLAIM */
- { 1, 1 }, /* ZIO_TYPE_IOCTL */
+enum zti_modes {
+ zti_mode_fixed, /* value is # of threads (min 1) */
+ zti_mode_online_percent, /* value is % of online CPUs */
+ zti_mode_tune, /* fill from zio_taskq_tune_* */
+ zti_nmodes
};
+#define ZTI_THREAD_FIX(n) { zti_mode_fixed, (n) }
+#define ZTI_THREAD_PCT(n) { zti_mode_online_percent, (n) }
+#define ZTI_THREAD_TUNE { zti_mode_tune, 0 }
+
+#define ZTI_THREAD_ONE ZTI_THREAD_FIX(1)
+
+typedef struct zio_taskq_info {
+ const char *zti_name;
+ struct {
+ enum zti_modes zti_mode;
+ uint_t zti_value;
+ } zti_nthreads[ZIO_TASKQ_TYPES];
+} zio_taskq_info_t;
+
+static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
+ "issue", "intr"
+};
+
+const zio_taskq_info_t zio_taskqs[ZIO_TYPES] = {
+ /* ISSUE INTR */
+ { "spa_zio_null", { ZTI_THREAD_ONE, ZTI_THREAD_ONE } },
+ { "spa_zio_read", { ZTI_THREAD_FIX(8), ZTI_THREAD_TUNE } },
+ { "spa_zio_write", { ZTI_THREAD_TUNE, ZTI_THREAD_FIX(8) } },
+ { "spa_zio_free", { ZTI_THREAD_ONE, ZTI_THREAD_ONE } },
+ { "spa_zio_claim", { ZTI_THREAD_ONE, ZTI_THREAD_ONE } },
+ { "spa_zio_ioctl", { ZTI_THREAD_ONE, ZTI_THREAD_ONE } },
+};
+
+enum zti_modes zio_taskq_tune_mode = zti_mode_online_percent;
+uint_t zio_taskq_tune_value = 80; /* #threads = 80% of # online CPUs */
+
static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);
@@ -304,12 +333,18 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
case ZPOOL_PROP_DELEGATION:
case ZPOOL_PROP_AUTOREPLACE:
case ZPOOL_PROP_LISTSNAPS:
+ case ZPOOL_PROP_AUTOEXPAND:
error = nvpair_value_uint64(elem, &intval);
if (!error && intval > 1)
error = EINVAL;
break;
case ZPOOL_PROP_BOOTFS:
+ /*
+ * If the pool version is less than SPA_VERSION_BOOTFS,
+ * or the pool is still being created (version == 0),
+ * the bootfs property cannot be set.
+ */
if (spa_version(spa) < SPA_VERSION_BOOTFS) {
error = ENOTSUP;
break;
@@ -541,14 +576,50 @@ spa_activate(spa_t *spa, int mode)
spa->spa_state = POOL_STATE_ACTIVE;
spa->spa_mode = mode;
- spa->spa_normal_class = metaslab_class_create();
- spa->spa_log_class = metaslab_class_create();
+ spa->spa_normal_class = metaslab_class_create(zfs_metaslab_ops);
+ spa->spa_log_class = metaslab_class_create(zfs_metaslab_ops);
for (int t = 0; t < ZIO_TYPES; t++) {
+ const zio_taskq_info_t *ztip = &zio_taskqs[t];
for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
- spa->spa_zio_taskq[t][q] = taskq_create("spa_zio",
- zio_taskq_threads[t][q], maxclsyspri, 50,
- INT_MAX, TASKQ_PREPOPULATE);
+ enum zti_modes mode = ztip->zti_nthreads[q].zti_mode;
+ uint_t value = ztip->zti_nthreads[q].zti_value;
+ char name[32];
+
+ (void) snprintf(name, sizeof (name),
+ "%s_%s", ztip->zti_name, zio_taskq_types[q]);
+
+ if (mode == zti_mode_tune) {
+ mode = zio_taskq_tune_mode;
+ value = zio_taskq_tune_value;
+ if (mode == zti_mode_tune)
+ mode = zti_mode_online_percent;
+ }
+
+ switch (mode) {
+ case zti_mode_fixed:
+ ASSERT3U(value, >=, 1);
+ value = MAX(value, 1);
+
+ spa->spa_zio_taskq[t][q] = taskq_create(name,
+ value, maxclsyspri, 50, INT_MAX,
+ TASKQ_PREPOPULATE);
+ break;
+
+ case zti_mode_online_percent:
+ spa->spa_zio_taskq[t][q] = taskq_create(name,
+ value, maxclsyspri, 50, INT_MAX,
+ TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);
+ break;
+
+ case zti_mode_tune:
+ default:
+ panic("unrecognized mode for "
+ "zio_taskqs[%u]->zti_nthreads[%u] (%u:%u) "
+ "in spa_activate()",
+ t, q, mode, value);
+ break;
+ }
}
}
@@ -577,7 +648,7 @@ spa_deactivate(spa_t *spa)
ASSERT(spa->spa_sync_on == B_FALSE);
ASSERT(spa->spa_dsl_pool == NULL);
ASSERT(spa->spa_root_vdev == NULL);
-
+ ASSERT(spa->spa_async_zio_root == NULL);
ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
txg_list_destroy(&spa->spa_vdev_txg_list);
@@ -621,7 +692,7 @@ spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
uint_t id, int atype)
{
nvlist_t **child;
- uint_t c, children;
+ uint_t children;
int error;
if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
@@ -642,7 +713,7 @@ spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
return (EINVAL);
}
- for (c = 0; c < children; c++) {
+ for (int c = 0; c < children; c++) {
vdev_t *vd;
if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
atype)) != 0) {
@@ -683,10 +754,10 @@ spa_unload(spa_t *spa)
/*
* Wait for any outstanding async I/O to complete.
*/
- mutex_enter(&spa->spa_async_root_lock);
- while (spa->spa_async_root_count != 0)
- cv_wait(&spa->spa_async_root_cv, &spa->spa_async_root_lock);
- mutex_exit(&spa->spa_async_root_lock);
+ if (spa->spa_async_zio_root != NULL) {
+ (void) zio_wait(spa->spa_async_zio_root);
+ spa->spa_async_zio_root = NULL;
+ }
/*
* Close the dsl pool.
@@ -828,6 +899,7 @@ spa_load_spares(spa_t *spa)
}
vd->vdev_top = vd;
+ vd->vdev_aux = &spa->spa_spares;
if (vdev_open(vd) != 0)
continue;
@@ -869,7 +941,7 @@ spa_load_l2cache(spa_t *spa)
nvlist_t **l2cache;
uint_t nl2cache;
int i, j, oldnvdevs;
- uint64_t guid, size;
+ uint64_t guid;
vdev_t *vd, **oldvdevs, **newvdevs;
spa_aux_vdev_t *sav = &spa->spa_l2cache;
@@ -933,12 +1005,8 @@ spa_load_l2cache(spa_t *spa)
(void) vdev_validate_aux(vd);
- if (!vdev_is_dead(vd)) {
- size = vdev_get_rsize(vd);
- l2arc_add_vdev(spa, vd,
- VDEV_LABEL_START_SIZE,
- size - VDEV_LABEL_START_SIZE);
- }
+ if (!vdev_is_dead(vd))
+ l2arc_add_vdev(spa, vd);
}
}
@@ -1001,7 +1069,8 @@ load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
dmu_buf_rele(db, FTAG);
packed = kmem_alloc(nvsize, KM_SLEEP);
- error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed);
+ error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
+ DMU_READ_PREFETCH);
if (error == 0)
error = nvlist_unpack(packed, nvsize, value, 0);
kmem_free(packed, nvsize);
@@ -1016,9 +1085,7 @@ load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
static void
spa_check_removed(vdev_t *vd)
{
- int c;
-
- for (c = 0; c < vd->vdev_children; c++)
+ for (int c = 0; c < vd->vdev_children; c++)
spa_check_removed(vd->vdev_child[c]);
if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) {
@@ -1028,6 +1095,33 @@ spa_check_removed(vdev_t *vd)
}
/*
+ * Load the slog device state from the config object since it's possible
+ * that the label does not contain the most up-to-date information.
+ */
+void
+spa_load_log_state(spa_t *spa)
+{
+ nvlist_t *nv, *nvroot, **child;
+ uint64_t is_log;
+ uint_t children;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ VERIFY(load_nvlist(spa, spa->spa_config_object, &nv) == 0);
+ VERIFY(nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
+ VERIFY(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) == 0);
+
+ for (int c = 0; c < children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+
+ if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
+ &is_log) == 0 && is_log)
+ vdev_load_log_state(tvd, child[c]);
+ }
+ nvlist_free(nv);
+}
+
+/*
* Check for missing log devices
*/
int
@@ -1043,13 +1137,7 @@ spa_check_logs(spa_t *spa)
return (1);
}
break;
-
- case SPA_LOG_CLEAR:
- (void) dmu_objset_find(spa->spa_name, zil_clear_log_chain, NULL,
- DS_FIND_CHILDREN);
- break;
}
- spa->spa_log_state = SPA_LOG_GOOD;
return (0);
}
@@ -1107,6 +1195,12 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
spa->spa_load_guid = pool_guid;
/*
+ * Create "The Godfather" zio to hold all async IOs
+ */
+ spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
+
+ /*
* Parse the configuration into a vdev tree. We explicitly set the
* value that will be returned by spa_version() since parsing the
* configuration requires knowing the version number.
@@ -1132,16 +1226,19 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
goto out;
/*
- * Validate the labels for all leaf vdevs. We need to grab the config
- * lock because all label I/O is done with ZIO_FLAG_CONFIG_WRITER.
+ * We need to validate the vdev labels against the configuration that
+ * we have in hand, which is dependent on the setting of mosconfig. If
+ * mosconfig is true then we're validating the vdev labels based on
+ * that config. Otherwise, we're validating against the cached config
+ * (zpool.cache) that was read when we loaded the zfs module, and then
+ * later we will recursively call spa_load() and validate against
+ * the vdev config.
*/
- if (mosconfig) {
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
- error = vdev_validate(rvd);
- spa_config_exit(spa, SCL_ALL, FTAG);
- if (error != 0)
- goto out;
- }
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ error = vdev_validate(rvd);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ if (error != 0)
+ goto out;
if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
error = ENXIO;
@@ -1372,6 +1469,8 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
spa_config_exit(spa, SCL_ALL, FTAG);
}
+ spa_load_log_state(spa);
+
if (spa_check_logs(spa)) {
vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_BAD_LOG);
@@ -1410,6 +1509,10 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
spa->spa_pool_props_object,
zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE),
sizeof (uint64_t), 1, &spa->spa_failmode);
+ (void) zap_lookup(spa->spa_meta_objset,
+ spa->spa_pool_props_object,
+ zpool_prop_to_name(ZPOOL_PROP_AUTOEXPAND),
+ sizeof (uint64_t), 1, &spa->spa_autoexpand);
}
/*
@@ -1459,6 +1562,7 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
zil_claim, tx, DS_FIND_CHILDREN);
dmu_tx_commit(tx);
+ spa->spa_log_state = SPA_LOG_GOOD;
spa->spa_sync_on = B_TRUE;
txg_sync_start(spa->spa_dsl_pool);
@@ -1646,6 +1750,8 @@ spa_add_spares(spa_t *spa, nvlist_t *config)
uint_t vsc;
uint64_t pool;
+ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
+
if (spa->spa_spares.sav_count == 0)
return;
@@ -1693,11 +1799,11 @@ spa_add_l2cache(spa_t *spa, nvlist_t *config)
vdev_stat_t *vs;
uint_t vsc;
+ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
+
if (spa->spa_l2cache.sav_count == 0)
return;
- spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
-
VERIFY(nvlist_lookup_nvlist(config,
ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
@@ -1731,8 +1837,6 @@ spa_add_l2cache(spa_t *spa, nvlist_t *config)
vdev_get_stats(vd, vs);
}
}
-
- spa_config_exit(spa, SCL_CONFIG, FTAG);
}
int
@@ -1744,16 +1848,27 @@ spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
*config = NULL;
error = spa_open_common(name, &spa, FTAG, config);
- if (spa && *config != NULL) {
- VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
- spa_get_errlog_size(spa)) == 0);
+ if (spa != NULL) {
+ /*
+ * This still leaves a window of inconsistency where the spares
+ * or l2cache devices could change and the config would be
+ * self-inconsistent.
+ */
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
- if (spa_suspended(spa))
+ if (*config != NULL) {
VERIFY(nvlist_add_uint64(*config,
- ZPOOL_CONFIG_SUSPENDED, spa->spa_failmode) == 0);
+ ZPOOL_CONFIG_ERRCOUNT,
+ spa_get_errlog_size(spa)) == 0);
+
+ if (spa_suspended(spa))
+ VERIFY(nvlist_add_uint64(*config,
+ ZPOOL_CONFIG_SUSPENDED,
+ spa->spa_failmode) == 0);
- spa_add_spares(spa, *config);
- spa_add_l2cache(spa, *config);
+ spa_add_spares(spa, *config);
+ spa_add_l2cache(spa, *config);
+ }
}
/*
@@ -1775,8 +1890,10 @@ spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
}
}
- if (spa != NULL)
+ if (spa != NULL) {
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
spa_close(spa, FTAG);
+ }
return (error);
}
@@ -1969,7 +2086,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
vdev_t *rvd;
dsl_pool_t *dp;
dmu_tx_t *tx;
- int c, error = 0;
+ int error = 0;
uint64_t txg = TXG_INITIAL;
nvlist_t **spares, **l2cache;
uint_t nspares, nl2cache;
@@ -1995,7 +2112,6 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
spa->spa_uberblock.ub_txg = txg - 1;
if (props && (error = spa_prop_validate(spa, props))) {
- spa_unload(spa);
spa_deactivate(spa);
spa_remove(spa);
mutex_exit(&spa_namespace_lock);
@@ -2010,6 +2126,12 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
spa->spa_ubsync = spa->spa_uberblock;
/*
+ * Create "The Godfather" zio to hold all async IOs
+ */
+ spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
+
+ /*
* Create the root vdev.
*/
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
@@ -2026,9 +2148,10 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
(error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
(error = spa_validate_aux(spa, nvroot, txg,
VDEV_ALLOC_ADD)) == 0) {
- for (c = 0; c < rvd->vdev_children; c++)
- vdev_init(rvd->vdev_child[c], txg);
- vdev_config_dirty(rvd);
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_metaslab_set_size(rvd->vdev_child[c]);
+ vdev_expand(rvd->vdev_child[c], txg);
+ }
}
spa_config_exit(spa, SCL_ALL, FTAG);
@@ -2127,6 +2250,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
+ spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
if (props != NULL) {
spa_configfile_set(spa, props, B_FALSE);
spa_sync_props(spa, props, CRED(), tx);
@@ -2155,17 +2279,239 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
return (0);
}
+#ifdef _KERNEL
/*
- * Import the given pool into the system. We set up the necessary spa_t and
- * then call spa_load() to do the dirty work.
+ * Get the root pool information from the root disk, then import the root pool
+ * during the system boot up time.
*/
-static int
-spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props,
- boolean_t isroot, boolean_t allowfaulted)
+extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
+
+static nvlist_t *
+spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
+{
+ nvlist_t *config;
+ nvlist_t *nvtop, *nvroot;
+ uint64_t pgid;
+
+ if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
+ return (NULL);
+
+ /*
+ * Add this top-level vdev to the child array.
+ */
+ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvtop) == 0);
+ VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+ &pgid) == 0);
+ VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);
+
+ /*
+ * Put this pool's top-level vdevs into a root vdev.
+ */
+ VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
+ VDEV_TYPE_ROOT) == 0);
+ VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
+ VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
+ VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+ &nvtop, 1) == 0);
+
+ /*
+ * Replace the existing vdev_tree with the new root vdev in
+ * this pool's configuration (remove the old, add the new).
+ */
+ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
+ nvlist_free(nvroot);
+ return (config);
+}
+
+/*
+ * Walk the vdev tree and see if we can find a device with "better"
+ * configuration. A configuration is "better" if the label on that
+ * device has a more recent txg.
+ */
+static void
+spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
+{
+ for (int c = 0; c < vd->vdev_children; c++)
+ spa_alt_rootvdev(vd->vdev_child[c], avd, txg);
+
+ if (vd->vdev_ops->vdev_op_leaf) {
+ nvlist_t *label;
+ uint64_t label_txg;
+
+ if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
+ &label) != 0)
+ return;
+
+ VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
+ &label_txg) == 0);
+
+ /*
+ * Do we have a better boot device?
+ */
+ if (label_txg > *txg) {
+ *txg = label_txg;
+ *avd = vd;
+ }
+ nvlist_free(label);
+ }
+}
+
+/*
+ * Import a root pool.
+ *
+ * For x86. devpath_list will consist of devid and/or physpath name of
+ * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
+ * The GRUB "findroot" command will return the vdev we should boot.
+ *
+ * For Sparc, devpath_list consists the physpath name of the booting device
+ * no matter the rootpool is a single device pool or a mirrored pool.
+ * e.g.
+ * "/pci@1f,0/ide@d/disk@0,0:a"
+ */
+int
+spa_import_rootpool(char *devpath, char *devid)
+{
+ spa_t *spa;
+ vdev_t *rvd, *bvd, *avd = NULL;
+ nvlist_t *config, *nvtop;
+ uint64_t guid, txg;
+ char *pname;
+ int error;
+
+ /*
+ * Read the label from the boot device and generate a configuration.
+ */
+ if ((config = spa_generate_rootconf(devpath, devid, &guid)) == NULL) {
+ cmn_err(CE_NOTE, "Can not read the pool label from '%s'",
+ devpath);
+ return (EIO);
+ }
+
+ VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
+ &pname) == 0);
+ VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
+
+ mutex_enter(&spa_namespace_lock);
+ if ((spa = spa_lookup(pname)) != NULL) {
+ /*
+ * Remove the existing root pool from the namespace so that we
+ * can replace it with the correct config we just read in.
+ */
+ spa_remove(spa);
+ }
+
+ spa = spa_add(pname, NULL);
+ spa->spa_is_root = B_TRUE;
+
+ /*
+ * Build up a vdev tree based on the boot device's label config.
+ */
+ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvtop) == 0);
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
+ VDEV_ALLOC_ROOTPOOL);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ if (error) {
+ mutex_exit(&spa_namespace_lock);
+ nvlist_free(config);
+ cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
+ pname);
+ return (error);
+ }
+
+ /*
+ * Get the boot vdev.
+ */
+ if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
+ cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
+ (u_longlong_t)guid);
+ error = ENOENT;
+ goto out;
+ }
+
+ /*
+ * Determine if there is a better boot device.
+ */
+ avd = bvd;
+ spa_alt_rootvdev(rvd, &avd, &txg);
+ if (avd != bvd) {
+ cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
+ "try booting from '%s'", avd->vdev_path);
+ error = EINVAL;
+ goto out;
+ }
+
+ /*
+ * If the boot device is part of a spare vdev then ensure that
+ * we're booting off the active spare.
+ */
+ if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
+ !bvd->vdev_isspare) {
+ cmn_err(CE_NOTE, "The boot device is currently spared. Please "
+ "try booting from '%s'",
+ bvd->vdev_parent->vdev_child[1]->vdev_path);
+ error = EINVAL;
+ goto out;
+ }
+
+ VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
+ error = 0;
+out:
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ vdev_free(rvd);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ mutex_exit(&spa_namespace_lock);
+
+ nvlist_free(config);
+ return (error);
+}
+
+#endif
+
+/*
+ * Take a pool and insert it into the namespace as if it had been loaded at
+ * boot.
+ */
+int
+spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props)
{
spa_t *spa;
char *altroot = NULL;
- int error, loaderr;
+
+ mutex_enter(&spa_namespace_lock);
+ if (spa_lookup(pool) != NULL) {
+ mutex_exit(&spa_namespace_lock);
+ return (EEXIST);
+ }
+
+ (void) nvlist_lookup_string(props,
+ zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
+ spa = spa_add(pool, altroot);
+
+ VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
+
+ if (props != NULL)
+ spa_configfile_set(spa, props, B_FALSE);
+
+ spa_config_sync(spa, B_FALSE, B_TRUE);
+
+ mutex_exit(&spa_namespace_lock);
+
+ return (0);
+}
+
+/*
+ * Import a non-root pool into the system.
+ */
+int
+spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
+{
+ spa_t *spa;
+ char *altroot = NULL;
+ int error;
nvlist_t *nvroot;
nvlist_t **spares, **l2cache;
uint_t nspares, nl2cache;
@@ -2175,18 +2521,8 @@ spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props,
*/
mutex_enter(&spa_namespace_lock);
if ((spa = spa_lookup(pool)) != NULL) {
- if (isroot) {
- /*
- * Remove the existing root pool from the
- * namespace so that we can replace it with
- * the correct config we just read in.
- */
- ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
- spa_remove(spa);
- } else {
- mutex_exit(&spa_namespace_lock);
- return (EEXIST);
- }
+ mutex_exit(&spa_namespace_lock);
+ return (EEXIST);
}
/*
@@ -2197,29 +2533,29 @@ spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props,
spa = spa_add(pool, altroot);
spa_activate(spa, spa_mode_global);
- if (allowfaulted)
- spa->spa_import_faulted = B_TRUE;
- spa->spa_is_root = isroot;
+ /*
+ * Don't start async tasks until we know everything is healthy.
+ */
+ spa_async_suspend(spa);
/*
- * Pass off the heavy lifting to spa_load().
- * Pass TRUE for mosconfig (unless this is a root pool) because
- * the user-supplied config is actually the one to trust when
+ * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig
+ * because the user-supplied config is actually the one to trust when
* doing an import.
*/
- loaderr = error = spa_load(spa, config, SPA_LOAD_IMPORT, !isroot);
+ error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
/*
- * Toss any existing sparelist, as it doesn't have any validity anymore,
- * and conflicts with spa_has_spare().
+ * Toss any existing sparelist, as it doesn't have any validity
+ * anymore, and conflicts with spa_has_spare().
*/
- if (!isroot && spa->spa_spares.sav_config) {
+ if (spa->spa_spares.sav_config) {
nvlist_free(spa->spa_spares.sav_config);
spa->spa_spares.sav_config = NULL;
spa_load_spares(spa);
}
- if (!isroot && spa->spa_l2cache.sav_config) {
+ if (spa->spa_l2cache.sav_config) {
nvlist_free(spa->spa_l2cache.sav_config);
spa->spa_l2cache.sav_config = NULL;
spa_load_l2cache(spa);
@@ -2228,7 +2564,8 @@ spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props,
VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
&nvroot) == 0);
if (error == 0)
- error = spa_validate_aux(spa, nvroot, -1ULL, VDEV_ALLOC_SPARE);
+ error = spa_validate_aux(spa, nvroot, -1ULL,
+ VDEV_ALLOC_SPARE);
if (error == 0)
error = spa_validate_aux(spa, nvroot, -1ULL,
VDEV_ALLOC_L2CACHE);
@@ -2239,29 +2576,15 @@ spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props,
if (error != 0 || (props && spa_writeable(spa) &&
(error = spa_prop_set(spa, props)))) {
- if (loaderr != 0 && loaderr != EINVAL && allowfaulted) {
- /*
- * If we failed to load the pool, but 'allowfaulted' is
- * set, then manually set the config as if the config
- * passed in was specified in the cache file.
- */
- error = 0;
- spa->spa_import_faulted = B_FALSE;
- if (spa->spa_config == NULL)
- spa->spa_config = spa_config_generate(spa,
- NULL, -1ULL, B_TRUE);
- spa_unload(spa);
- spa_deactivate(spa);
- spa_config_sync(spa, B_FALSE, B_TRUE);
- } else {
- spa_unload(spa);
- spa_deactivate(spa);
- spa_remove(spa);
- }
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_remove(spa);
mutex_exit(&spa_namespace_lock);
return (error);
}
+ spa_async_resume(spa);
+
/*
* Override any spares and level 2 cache devices as specified by
* the user, as these may have correct device names/devids, etc.
@@ -2301,246 +2624,20 @@ spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props,
/*
* Update the config cache to include the newly-imported pool.
*/
- spa_config_update_common(spa, SPA_CONFIG_UPDATE_POOL, isroot);
+ spa_config_update_common(spa, SPA_CONFIG_UPDATE_POOL, B_FALSE);
}
- spa->spa_import_faulted = B_FALSE;
- mutex_exit(&spa_namespace_lock);
-
- return (0);
-}
-
-#ifdef _KERNEL
-/*
- * Build a "root" vdev for a top level vdev read in from a rootpool
- * device label.
- */
-static void
-spa_build_rootpool_config(nvlist_t *config)
-{
- nvlist_t *nvtop, *nvroot;
- uint64_t pgid;
-
- /*
- * Add this top-level vdev to the child array.
- */
- VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtop)
- == 0);
- VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pgid)
- == 0);
-
- /*
- * Put this pool's top-level vdevs into a root vdev.
- */
- VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT)
- == 0);
- VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
- VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
- VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
- &nvtop, 1) == 0);
-
/*
- * Replace the existing vdev_tree with the new root vdev in
- * this pool's configuration (remove the old, add the new).
+ * It's possible that the pool was expanded while it was exported.
+ * We kick off an async task to handle this for us.
*/
- VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
- nvlist_free(nvroot);
-}
+ spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
-/*
- * Get the root pool information from the root disk, then import the root pool
- * during the system boot up time.
- */
-extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
-
-int
-spa_check_rootconf(char *devpath, char *devid, nvlist_t **bestconf,
- uint64_t *besttxg)
-{
- nvlist_t *config;
- uint64_t txg;
- int error;
-
- if (error = vdev_disk_read_rootlabel(devpath, devid, &config))
- return (error);
-
- VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
-
- if (bestconf != NULL)
- *bestconf = config;
- else
- nvlist_free(config);
- *besttxg = txg;
- return (0);
-}
-
-boolean_t
-spa_rootdev_validate(nvlist_t *nv)
-{
- uint64_t ival;
-
- if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &ival) == 0 ||
- nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &ival) == 0 ||
- nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &ival) == 0)
- return (B_FALSE);
-
- return (B_TRUE);
-}
-
-
-/*
- * Given the boot device's physical path or devid, check if the device
- * is in a valid state. If so, return the configuration from the vdev
- * label.
- */
-int
-spa_get_rootconf(char *devpath, char *devid, nvlist_t **bestconf)
-{
- nvlist_t *conf = NULL;
- uint64_t txg = 0;
- nvlist_t *nvtop, **child;
- char *type;
- char *bootpath = NULL;
- uint_t children, c;
- char *tmp;
- int error;
-
- if (devpath && ((tmp = strchr(devpath, ' ')) != NULL))
- *tmp = '\0';
- if (error = spa_check_rootconf(devpath, devid, &conf, &txg)) {
- cmn_err(CE_NOTE, "error reading device label");
- return (error);
- }
- if (txg == 0) {
- cmn_err(CE_NOTE, "this device is detached");
- nvlist_free(conf);
- return (EINVAL);
- }
-
- VERIFY(nvlist_lookup_nvlist(conf, ZPOOL_CONFIG_VDEV_TREE,
- &nvtop) == 0);
- VERIFY(nvlist_lookup_string(nvtop, ZPOOL_CONFIG_TYPE, &type) == 0);
-
- if (strcmp(type, VDEV_TYPE_DISK) == 0) {
- if (spa_rootdev_validate(nvtop)) {
- goto out;
- } else {
- nvlist_free(conf);
- return (EINVAL);
- }
- }
-
- ASSERT(strcmp(type, VDEV_TYPE_MIRROR) == 0);
-
- VERIFY(nvlist_lookup_nvlist_array(nvtop, ZPOOL_CONFIG_CHILDREN,
- &child, &children) == 0);
-
- /*
- * Go thru vdevs in the mirror to see if the given device
- * has the most recent txg. Only the device with the most
- * recent txg has valid information and should be booted.
- */
- for (c = 0; c < children; c++) {
- char *cdevid, *cpath;
- uint64_t tmptxg;
-
- cpath = NULL;
- cdevid = NULL;
- if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_PHYS_PATH,
- &cpath) != 0 && nvlist_lookup_string(child[c],
- ZPOOL_CONFIG_DEVID, &cdevid) != 0)
- return (EINVAL);
- if ((spa_check_rootconf(cpath, cdevid, NULL,
- &tmptxg) == 0) && (tmptxg > txg)) {
- txg = tmptxg;
- VERIFY(nvlist_lookup_string(child[c],
- ZPOOL_CONFIG_PATH, &bootpath) == 0);
- }
- }
+ mutex_exit(&spa_namespace_lock);
- /* Does the best device match the one we've booted from? */
- if (bootpath) {
- cmn_err(CE_NOTE, "try booting from '%s'", bootpath);
- return (EINVAL);
- }
-out:
- *bestconf = conf;
return (0);
}
-/*
- * Import a root pool.
- *
- * For x86. devpath_list will consist of devid and/or physpath name of
- * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
- * The GRUB "findroot" command will return the vdev we should boot.
- *
- * For Sparc, devpath_list consists the physpath name of the booting device
- * no matter the rootpool is a single device pool or a mirrored pool.
- * e.g.
- * "/pci@1f,0/ide@d/disk@0,0:a"
- */
-int
-spa_import_rootpool(char *devpath, char *devid)
-{
- nvlist_t *conf = NULL;
- char *pname;
- int error;
-
- /*
- * Get the vdev pathname and configuation from the most
- * recently updated vdev (highest txg).
- */
- if (error = spa_get_rootconf(devpath, devid, &conf))
- goto msg_out;
-
- /*
- * Add type "root" vdev to the config.
- */
- spa_build_rootpool_config(conf);
-
- VERIFY(nvlist_lookup_string(conf, ZPOOL_CONFIG_POOL_NAME, &pname) == 0);
-
- /*
- * We specify 'allowfaulted' for this to be treated like spa_open()
- * instead of spa_import(). This prevents us from marking vdevs as
- * persistently unavailable, and generates FMA ereports as if it were a
- * pool open, not import.
- */
- error = spa_import_common(pname, conf, NULL, B_TRUE, B_TRUE);
- ASSERT(error != EEXIST);
-
- nvlist_free(conf);
- return (error);
-
-msg_out:
- cmn_err(CE_NOTE, "\n"
- " *************************************************** \n"
- " * This device is not bootable! * \n"
- " * It is either offlined or detached or faulted. * \n"
- " * Please try to boot from a different device. * \n"
- " *************************************************** ");
-
- return (error);
-}
-#endif
-
-/*
- * Import a non-root pool into the system.
- */
-int
-spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
-{
- return (spa_import_common(pool, config, props, B_FALSE, B_FALSE));
-}
-
-int
-spa_import_faulted(const char *pool, nvlist_t *config, nvlist_t *props)
-{
- return (spa_import_common(pool, config, props, B_FALSE, B_TRUE));
-}
-
/*
* This (illegal) pool name is used when temporarily importing a spa_t in order
@@ -2624,8 +2721,10 @@ spa_tryimport(nvlist_t *tryconfig)
/*
* Add the list of hot spares and level 2 cache devices.
*/
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
spa_add_spares(spa, config);
spa_add_l2cache(spa, config);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
}
spa_unload(spa);
@@ -2971,10 +3070,9 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
}
/*
- * Compare the new device size with the replaceable/attachable
- * device size.
+ * Make sure the new device is big enough.
*/
- if (newvd->vdev_psize < vdev_get_rsize(oldvd))
+ if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
/*
@@ -3018,12 +3116,6 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
newvd->vdev_id = pvd->vdev_children;
vdev_add_child(pvd, newvd);
- /*
- * If newvd is smaller than oldvd, but larger than its rsize,
- * the addition of newvd may have decreased our parent's asize.
- */
- pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize);
-
tvd = newvd->vdev_top;
ASSERT(pvd->vdev_top == tvd);
ASSERT(tvd->vdev_parent == rvd);
@@ -3039,8 +3131,11 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
vdev_dtl_dirty(newvd, DTL_MISSING,
TXG_INITIAL, open_txg - TXG_INITIAL + 1);
- if (newvd->vdev_isspare)
+ if (newvd->vdev_isspare) {
spa_spare_activate(newvd);
+ spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE);
+ }
+
oldvdpath = spa_strdup(oldvd->vdev_path);
newvdpath = spa_strdup(newvd->vdev_path);
newvd_isspare = newvd->vdev_isspare;
@@ -3237,12 +3332,16 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
vdev_propagate_state(cvd);
/*
- * If the device we just detached was smaller than the others, it may be
- * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init()
- * can't fail because the existing metaslabs are already in core, so
- * there's nothing to read from disk.
+ * If the 'autoexpand' property is set on the pool then automatically
+ * try to expand the size of the pool. For example if the device we
+ * just detached was smaller than the others, it may be possible to
+ * add metaslabs (i.e. grow the pool). We need to reopen the vdev
+ * first so that we can obtain the updated sizes of the leaf vdevs.
*/
- VERIFY(vdev_metaslab_init(tvd, txg) == 0);
+ if (spa->spa_autoexpand) {
+ vdev_reopen(tvd);
+ vdev_expand(tvd, txg);
+ }
vdev_config_dirty(tvd);
@@ -3400,9 +3499,8 @@ static vdev_t *
spa_vdev_resilver_done_hunt(vdev_t *vd)
{
vdev_t *newvd, *oldvd;
- int c;
- for (c = 0; c < vd->vdev_children; c++) {
+ for (int c = 0; c < vd->vdev_children; c++) {
oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
if (oldvd != NULL)
return (oldvd);
@@ -3475,59 +3573,50 @@ spa_vdev_resilver_done(spa_t *spa)
}
/*
- * Update the stored path for this vdev. Dirty the vdev configuration, relying
- * on spa_vdev_enter/exit() to synchronize the labels and cache.
+ * Update the stored path or FRU for this vdev. Dirty the vdev configuration,
+ * relying on spa_vdev_enter/exit() to synchronize the labels and cache.
*/
int
-spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
+spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
+ boolean_t ispath)
{
vdev_t *vd;
uint64_t txg;
txg = spa_vdev_enter(spa);
- if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) {
- /*
- * Determine if this is a reference to a hot spare device. If
- * it is, update the path manually as there is no associated
- * vdev_t that can be synced to disk.
- */
- nvlist_t **spares;
- uint_t i, nspares;
-
- if (spa->spa_spares.sav_config != NULL) {
- VERIFY(nvlist_lookup_nvlist_array(
- spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
- &spares, &nspares) == 0);
- for (i = 0; i < nspares; i++) {
- uint64_t theguid;
- VERIFY(nvlist_lookup_uint64(spares[i],
- ZPOOL_CONFIG_GUID, &theguid) == 0);
- if (theguid == guid) {
- VERIFY(nvlist_add_string(spares[i],
- ZPOOL_CONFIG_PATH, newpath) == 0);
- spa_load_spares(spa);
- spa->spa_spares.sav_sync = B_TRUE;
- return (spa_vdev_exit(spa, NULL, txg,
- 0));
- }
- }
- }
-
+ if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
return (spa_vdev_exit(spa, NULL, txg, ENOENT));
- }
if (!vd->vdev_ops->vdev_op_leaf)
return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
- spa_strfree(vd->vdev_path);
- vd->vdev_path = spa_strdup(newpath);
+ if (ispath) {
+ spa_strfree(vd->vdev_path);
+ vd->vdev_path = spa_strdup(value);
+ } else {
+ if (vd->vdev_fru != NULL)
+ spa_strfree(vd->vdev_fru);
+ vd->vdev_fru = spa_strdup(value);
+ }
vdev_config_dirty(vd->vdev_top);
return (spa_vdev_exit(spa, NULL, txg, 0));
}
+int
+spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
+{
+ return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
+}
+
+int
+spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
+{
+ return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
+}
+
/*
* ==========================================================================
* SPA Scrubbing
@@ -3599,6 +3688,37 @@ spa_async_probe(spa_t *spa, vdev_t *vd)
}
static void
+spa_async_autoexpand(spa_t *spa, vdev_t *vd)
+{
+ sysevent_id_t eid;
+ nvlist_t *attr;
+ char *physpath;
+
+ if (!spa->spa_autoexpand)
+ return;
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+ spa_async_autoexpand(spa, cvd);
+ }
+
+ if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
+ return;
+
+ physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
+ (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath);
+
+ VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);
+
+ (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
+ ESC_DEV_DLE, attr, &eid, DDI_SLEEP);
+
+ nvlist_free(attr);
+ kmem_free(physpath, MAXPATHLEN);
+}
+
+static void
spa_async_thread(spa_t *spa)
{
int tasks;
@@ -3614,9 +3734,33 @@ spa_async_thread(spa_t *spa)
* See if the config needs to be updated.
*/
if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
+ uint64_t oldsz, space_update;
+
mutex_enter(&spa_namespace_lock);
+ oldsz = spa_get_space(spa);
spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
+ space_update = spa_get_space(spa) - oldsz;
mutex_exit(&spa_namespace_lock);
+
+ /*
+ * If the pool grew as a result of the config update,
+ * then log an internal history event.
+ */
+ if (space_update) {
+ dmu_tx_t *tx;
+
+ tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ if (dmu_tx_assign(tx, TXG_WAIT) == 0) {
+ spa_history_internal_log(LOG_POOL_VDEV_ONLINE,
+ spa, tx, CRED(),
+ "pool '%s' size: %llu(+%llu)",
+ spa_name(spa), spa_get_space(spa),
+ space_update);
+ dmu_tx_commit(tx);
+ } else {
+ dmu_tx_abort(tx);
+ }
+ }
}
/*
@@ -3632,6 +3776,12 @@ spa_async_thread(spa_t *spa)
(void) spa_vdev_state_exit(spa, NULL, 0);
}
+ if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ spa_async_autoexpand(spa, spa->spa_root_vdev);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ }
+
/*
* See if any devices need to be probed.
*/
@@ -3944,6 +4094,10 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
case ZPOOL_PROP_FAILUREMODE:
spa->spa_failmode = intval;
break;
+ case ZPOOL_PROP_AUTOEXPAND:
+ spa->spa_autoexpand = intval;
+ spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
+ break;
default:
break;
}
@@ -4105,9 +4259,8 @@ spa_sync(spa_t *spa, uint64_t txg)
int svdcount = 0;
int children = rvd->vdev_children;
int c0 = spa_get_random(children);
- int c;
- for (c = 0; c < children; c++) {
+ for (int c = 0; c < children; c++) {
vd = rvd->vdev_child[(c0 + c) % children];
if (vd->vdev_ms_array == 0 || vd->vdev_islog)
continue;
@@ -4115,10 +4268,16 @@ spa_sync(spa_t *spa, uint64_t txg)
if (svdcount == SPA_DVAS_PER_BP)
break;
}
- error = vdev_config_sync(svd, svdcount, txg);
+ error = vdev_config_sync(svd, svdcount, txg, B_FALSE);
+ if (error != 0)
+ error = vdev_config_sync(svd, svdcount, txg,
+ B_TRUE);
} else {
error = vdev_config_sync(rvd->vdev_child,
- rvd->vdev_children, txg);
+ rvd->vdev_children, txg, B_FALSE);
+ if (error != 0)
+ error = vdev_config_sync(rvd->vdev_child,
+ rvd->vdev_children, txg, B_TRUE);
}
spa_config_exit(spa, SCL_STATE, FTAG);
@@ -4239,7 +4398,7 @@ spa_evict_all(void)
}
vdev_t *
-spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t l2cache)
+spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
{
vdev_t *vd;
int i;
@@ -4247,12 +4406,18 @@ spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t l2cache)
if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
return (vd);
- if (l2cache) {
+ if (aux) {
for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
vd = spa->spa_l2cache.sav_vdevs[i];
if (vd->vdev_guid == guid)
return (vd);
}
+
+ for (i = 0; i < spa->spa_spares.sav_count; i++) {
+ vd = spa->spa_spares.sav_vdevs[i];
+ if (vd->vdev_guid == guid)
+ return (vd);
+ }
}
return (NULL);
diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c
index 1c47efd0c..7103e179b 100644
--- a/module/zfs/spa_config.c
+++ b/module/zfs/spa_config.c
@@ -432,10 +432,9 @@ spa_config_update_common(spa_t *spa, int what, boolean_t isroot)
*/
for (c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
- if (tvd->vdev_ms_array == 0) {
- vdev_init(tvd, txg);
- vdev_config_dirty(tvd);
- }
+ if (tvd->vdev_ms_array == 0)
+ vdev_metaslab_set_size(tvd);
+ vdev_expand(tvd, txg);
}
}
spa_config_exit(spa, SCL_ALL, FTAG);
diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c
index c642bd768..ac0a20aaf 100644
--- a/module/zfs/spa_errlog.c
+++ b/module/zfs/spa_errlog.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* Routines to manage the on-disk persistent error log.
*
@@ -61,8 +59,8 @@
* lowercase hexidecimal numbers that don't overflow.
*/
#ifdef _KERNEL
-static uint64_t
-strtonum(char *str, char **nptr)
+uint64_t
+strtonum(const char *str, char **nptr)
{
uint64_t val = 0;
char c;
@@ -82,7 +80,8 @@ strtonum(char *str, char **nptr)
str++;
}
- *nptr = str;
+ if (nptr)
+ *nptr = (char *)str;
return (val);
}
diff --git a/module/zfs/spa_history.c b/module/zfs/spa_history.c
index c997240c1..97d97d847 100644
--- a/module/zfs/spa_history.c
+++ b/module/zfs/spa_history.c
@@ -20,12 +20,10 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zap.h>
@@ -127,12 +125,12 @@ spa_history_advance_bof(spa_t *spa, spa_history_phys_t *shpp)
firstread = MIN(sizeof (reclen), shpp->sh_phys_max_off - phys_bof);
if ((err = dmu_read(mos, spa->spa_history, phys_bof, firstread,
- buf)) != 0)
+ buf, DMU_READ_PREFETCH)) != 0)
return (err);
if (firstread != sizeof (reclen)) {
if ((err = dmu_read(mos, spa->spa_history,
shpp->sh_pool_create_len, sizeof (reclen) - firstread,
- buf + firstread)) != 0)
+ buf + firstread, DMU_READ_PREFETCH)) != 0)
return (err);
}
@@ -380,10 +378,11 @@ spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf)
return (0);
}
- err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf);
+ err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf,
+ DMU_READ_PREFETCH);
if (leftover && err == 0) {
err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len,
- leftover, buf + read_len);
+ leftover, buf + read_len, DMU_READ_PREFETCH);
}
mutex_exit(&spa->spa_history_lock);
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 485e83fce..aea3f5625 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -425,7 +425,6 @@ spa_add(const char *name, const char *altroot)
spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);
mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa->spa_async_root_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -434,7 +433,6 @@ spa_add(const char *name, const char *altroot)
mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
- cv_init(&spa->spa_async_root_cv, NULL, CV_DEFAULT, NULL);
cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);
@@ -508,12 +506,10 @@ spa_remove(spa_t *spa)
spa_config_lock_destroy(spa);
cv_destroy(&spa->spa_async_cv);
- cv_destroy(&spa->spa_async_root_cv);
cv_destroy(&spa->spa_scrub_io_cv);
cv_destroy(&spa->spa_suspend_cv);
mutex_destroy(&spa->spa_async_lock);
- mutex_destroy(&spa->spa_async_root_lock);
mutex_destroy(&spa->spa_scrub_lock);
mutex_destroy(&spa->spa_errlog_lock);
mutex_destroy(&spa->spa_errlist_lock);
diff --git a/module/zfs/space_map.c b/module/zfs/space_map.c
index 1cdacc81d..75b55d5c1 100644
--- a/module/zfs/space_map.c
+++ b/module/zfs/space_map.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -116,12 +116,23 @@ space_map_add(space_map_t *sm, uint64_t start, uint64_t size)
if (merge_before && merge_after) {
avl_remove(&sm->sm_root, ss_before);
+ if (sm->sm_pp_root) {
+ avl_remove(sm->sm_pp_root, ss_before);
+ avl_remove(sm->sm_pp_root, ss_after);
+ }
ss_after->ss_start = ss_before->ss_start;
kmem_free(ss_before, sizeof (*ss_before));
+ ss = ss_after;
} else if (merge_before) {
ss_before->ss_end = end;
+ if (sm->sm_pp_root)
+ avl_remove(sm->sm_pp_root, ss_before);
+ ss = ss_before;
} else if (merge_after) {
ss_after->ss_start = start;
+ if (sm->sm_pp_root)
+ avl_remove(sm->sm_pp_root, ss_after);
+ ss = ss_after;
} else {
ss = kmem_alloc(sizeof (*ss), KM_SLEEP);
ss->ss_start = start;
@@ -129,6 +140,9 @@ space_map_add(space_map_t *sm, uint64_t start, uint64_t size)
avl_insert(&sm->sm_root, ss, where);
}
+ if (sm->sm_pp_root)
+ avl_add(sm->sm_pp_root, ss);
+
sm->sm_space += size;
}
@@ -163,12 +177,17 @@ space_map_remove(space_map_t *sm, uint64_t start, uint64_t size)
left_over = (ss->ss_start != start);
right_over = (ss->ss_end != end);
+ if (sm->sm_pp_root)
+ avl_remove(sm->sm_pp_root, ss);
+
if (left_over && right_over) {
newseg = kmem_alloc(sizeof (*newseg), KM_SLEEP);
newseg->ss_start = end;
newseg->ss_end = ss->ss_end;
ss->ss_end = start;
avl_insert_here(&sm->sm_root, newseg, ss, AVL_AFTER);
+ if (sm->sm_pp_root)
+ avl_add(sm->sm_pp_root, newseg);
} else if (left_over) {
ss->ss_end = start;
} else if (right_over) {
@@ -176,8 +195,12 @@ space_map_remove(space_map_t *sm, uint64_t start, uint64_t size)
} else {
avl_remove(&sm->sm_root, ss);
kmem_free(ss, sizeof (*ss));
+ ss = NULL;
}
+ if (sm->sm_pp_root && ss != NULL)
+ avl_add(sm->sm_pp_root, ss);
+
sm->sm_space -= size;
}
@@ -288,7 +311,8 @@ space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype,
smo->smo_object, offset, size);
mutex_exit(sm->sm_lock);
- error = dmu_read(os, smo->smo_object, offset, size, entry_map);
+ error = dmu_read(os, smo->smo_object, offset, size, entry_map,
+ DMU_READ_PREFETCH);
mutex_enter(sm->sm_lock);
if (error != 0)
break;
@@ -342,6 +366,15 @@ space_map_unload(space_map_t *sm)
}
uint64_t
+space_map_maxsize(space_map_t *sm)
+{
+ if (sm->sm_loaded && sm->sm_ops != NULL)
+ return (sm->sm_ops->smop_max(sm));
+ else
+ return (-1ULL);
+}
+
+uint64_t
space_map_alloc(space_map_t *sm, uint64_t size)
{
uint64_t start;
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 2554f96a9..3fa677e05 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -39,6 +39,7 @@
#include <sys/zap.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
+#include <sys/zil.h>
/*
* Virtual device management.
@@ -83,9 +84,8 @@ vdev_default_asize(vdev_t *vd, uint64_t psize)
{
uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
uint64_t csize;
- uint64_t c;
- for (c = 0; c < vd->vdev_children; c++) {
+ for (int c = 0; c < vd->vdev_children; c++) {
csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
asize = MAX(asize, csize);
}
@@ -94,40 +94,47 @@ vdev_default_asize(vdev_t *vd, uint64_t psize)
}
/*
- * Get the replaceable or attachable device size.
- * If the parent is a mirror or raidz, the replaceable size is the minimum
- * psize of all its children. For the rest, just return our own psize.
- *
- * e.g.
- * psize rsize
- * root - -
- * mirror/raidz - -
- * disk1 20g 20g
- * disk2 40g 20g
- * disk3 80g 80g
+ * Get the minimum allocatable size. We define the allocatable size as
+ * the vdev's asize rounded to the nearest metaslab. This allows us to
+ * replace or attach devices which don't have the same physical size but
+ * can still satisfy the same number of allocations.
*/
uint64_t
-vdev_get_rsize(vdev_t *vd)
+vdev_get_min_asize(vdev_t *vd)
{
- vdev_t *pvd, *cvd;
- uint64_t c, rsize;
+ vdev_t *pvd = vd->vdev_parent;
- pvd = vd->vdev_parent;
+ /*
+ * The our parent is NULL (inactive spare or cache) or is the root,
+ * just return our own asize.
+ */
+ if (pvd == NULL)
+ return (vd->vdev_asize);
/*
- * If our parent is NULL or the root, just return our own psize.
+ * The top-level vdev just returns the allocatable size rounded
+ * to the nearest metaslab.
*/
- if (pvd == NULL || pvd->vdev_parent == NULL)
- return (vd->vdev_psize);
+ if (vd == vd->vdev_top)
+ return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));
- rsize = 0;
+ /*
+ * The allocatable space for a raidz vdev is N * sizeof(smallest child),
+ * so each child must provide at least 1/Nth of its asize.
+ */
+ if (pvd->vdev_ops == &vdev_raidz_ops)
+ return (pvd->vdev_min_asize / pvd->vdev_children);
- for (c = 0; c < pvd->vdev_children; c++) {
- cvd = pvd->vdev_child[c];
- rsize = MIN(rsize - 1, cvd->vdev_psize - 1) + 1;
- }
+ return (pvd->vdev_min_asize);
+}
+
+void
+vdev_set_min_asize(vdev_t *vd)
+{
+ vd->vdev_min_asize = vdev_get_min_asize(vd);
- return (rsize);
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_set_min_asize(vd->vdev_child[c]);
}
vdev_t *
@@ -148,13 +155,12 @@ vdev_lookup_top(spa_t *spa, uint64_t vdev)
vdev_t *
vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
{
- int c;
vdev_t *mvd;
if (vd->vdev_guid == guid)
return (vd);
- for (c = 0; c < vd->vdev_children; c++)
+ for (int c = 0; c < vd->vdev_children; c++)
if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
NULL)
return (mvd);
@@ -250,17 +256,17 @@ vdev_compact_children(vdev_t *pvd)
{
vdev_t **newchild, *cvd;
int oldc = pvd->vdev_children;
- int newc, c;
+ int newc;
ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
- for (c = newc = 0; c < oldc; c++)
+ for (int c = newc = 0; c < oldc; c++)
if (pvd->vdev_child[c])
newc++;
newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);
- for (c = newc = 0; c < oldc; c++) {
+ for (int c = newc = 0; c < oldc; c++) {
if ((cvd = pvd->vdev_child[c]) != NULL) {
newchild[newc] = cvd;
cvd->vdev_id = newc++;
@@ -372,6 +378,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
} else if (alloctype == VDEV_ALLOC_L2CACHE) {
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
return (EINVAL);
+ } else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
+ return (EINVAL);
}
/*
@@ -435,6 +444,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
&vd->vdev_physpath) == 0)
vd->vdev_physpath = spa_strdup(vd->vdev_physpath);
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0)
+ vd->vdev_fru = spa_strdup(vd->vdev_fru);
/*
* Set the whole_disk property. If it's not specified, leave the value
@@ -448,9 +459,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
* Look for the 'not present' flag. This will only be set if the device
* was not present at the time of import.
*/
- if (!spa->spa_import_faulted)
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
- &vd->vdev_not_present);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
+ &vd->vdev_not_present);
/*
* Get the alignment requirement.
@@ -473,13 +483,23 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
* If we're a leaf vdev, try to load the DTL object and other state.
*/
if (vd->vdev_ops->vdev_op_leaf &&
- (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE)) {
+ (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
+ alloctype == VDEV_ALLOC_ROOTPOOL)) {
if (alloctype == VDEV_ALLOC_LOAD) {
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
&vd->vdev_dtl_smo.smo_object);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
&vd->vdev_unspare);
}
+
+ if (alloctype == VDEV_ALLOC_ROOTPOOL) {
+ uint64_t spare = 0;
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
+ &spare) == 0 && spare)
+ spa_spare_add(vd);
+ }
+
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
&vd->vdev_offline);
@@ -511,7 +531,6 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
void
vdev_free(vdev_t *vd)
{
- int c;
spa_t *spa = vd->vdev_spa;
/*
@@ -525,7 +544,7 @@ vdev_free(vdev_t *vd)
/*
* Free all children.
*/
- for (c = 0; c < vd->vdev_children; c++)
+ for (int c = 0; c < vd->vdev_children; c++)
vdev_free(vd->vdev_child[c]);
ASSERT(vd->vdev_child == NULL);
@@ -560,6 +579,8 @@ vdev_free(vdev_t *vd)
spa_strfree(vd->vdev_devid);
if (vd->vdev_physpath)
spa_strfree(vd->vdev_physpath);
+ if (vd->vdev_fru)
+ spa_strfree(vd->vdev_fru);
if (vd->vdev_isspare)
spa_spare_remove(vd);
@@ -653,14 +674,12 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
static void
vdev_top_update(vdev_t *tvd, vdev_t *vd)
{
- int c;
-
if (vd == NULL)
return;
vd->vdev_top = tvd;
- for (c = 0; c < vd->vdev_children; c++)
+ for (int c = 0; c < vd->vdev_children; c++)
vdev_top_update(tvd, vd->vdev_child[c]);
}
@@ -679,6 +698,7 @@ vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
mvd->vdev_asize = cvd->vdev_asize;
+ mvd->vdev_min_asize = cvd->vdev_min_asize;
mvd->vdev_ashift = cvd->vdev_ashift;
mvd->vdev_state = cvd->vdev_state;
@@ -751,6 +771,15 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
if (vd->vdev_ms_shift == 0) /* not being allocated from yet */
return (0);
+ /*
+ * Compute the raidz-deflation ratio. Note, we hard-code
+ * in 128k (1 << 17) because it is the current "typical" blocksize.
+ * Even if SPA_MAXBLOCKSIZE changes, this algorithm must never change,
+ * or we will inconsistently account for existing bp's.
+ */
+ vd->vdev_deflate_ratio = (1 << 17) /
+ (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
+
ASSERT(oldc <= newc);
if (vd->vdev_islog)
@@ -776,7 +805,8 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
if (txg == 0) {
uint64_t object = 0;
error = dmu_read(mos, vd->vdev_ms_array,
- m * sizeof (uint64_t), sizeof (uint64_t), &object);
+ m * sizeof (uint64_t), sizeof (uint64_t), &object,
+ DMU_READ_PREFETCH);
if (error)
return (error);
if (object != 0) {
@@ -903,7 +933,7 @@ vdev_probe(vdev_t *vd, zio_t *zio)
vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE |
- ZIO_FLAG_DONT_RETRY;
+ ZIO_FLAG_TRYHARD;
if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
/*
@@ -950,8 +980,8 @@ vdev_probe(vdev_t *vd, zio_t *zio)
for (int l = 1; l < VDEV_LABELS; l++) {
zio_nowait(zio_read_phys(pio, vd,
vdev_label_offset(vd->vdev_psize, l,
- offsetof(vdev_label_t, vl_pad)),
- VDEV_SKIP_SIZE, zio_buf_alloc(VDEV_SKIP_SIZE),
+ offsetof(vdev_label_t, vl_pad2)),
+ VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE),
ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
}
@@ -971,7 +1001,6 @@ vdev_open(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
int error;
- int c;
uint64_t osize = 0;
uint64_t asize, psize;
uint64_t ashift = 0;
@@ -983,6 +1012,9 @@ vdev_open(vdev_t *vd)
vd->vdev_state == VDEV_STATE_OFFLINE);
vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
+ vd->vdev_cant_read = B_FALSE;
+ vd->vdev_cant_write = B_FALSE;
+ vd->vdev_min_asize = vdev_get_min_asize(vd);
if (!vd->vdev_removed && vd->vdev_faulted) {
ASSERT(vd->vdev_children == 0);
@@ -998,7 +1030,7 @@ vdev_open(vdev_t *vd)
error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift);
if (zio_injection_enabled && error == 0)
- error = zio_handle_device_injection(vd, ENXIO);
+ error = zio_handle_device_injection(vd, NULL, ENXIO);
if (error) {
if (vd->vdev_removed &&
@@ -1020,12 +1052,13 @@ vdev_open(vdev_t *vd)
vd->vdev_state = VDEV_STATE_HEALTHY;
}
- for (c = 0; c < vd->vdev_children; c++)
+ for (int c = 0; c < vd->vdev_children; c++) {
if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
VDEV_AUX_NONE);
break;
}
+ }
osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
@@ -1050,6 +1083,15 @@ vdev_open(vdev_t *vd)
vd->vdev_psize = psize;
+ /*
+ * Make sure the allocatable size hasn't shrunk.
+ */
+ if (asize < vd->vdev_min_asize) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_BAD_LABEL);
+ return (EINVAL);
+ }
+
if (vd->vdev_asize == 0) {
/*
* This is the first-ever open, so use the computed values.
@@ -1066,25 +1108,18 @@ vdev_open(vdev_t *vd)
VDEV_AUX_BAD_LABEL);
return (EINVAL);
}
+ }
- /*
- * Make sure the device hasn't shrunk.
- */
- if (asize < vd->vdev_asize) {
- vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_BAD_LABEL);
- return (EINVAL);
- }
+ /*
+ * If all children are healthy and the asize has increased,
+ * then we've experienced dynamic LUN growth. If automatic
+ * expansion is enabled then use the additional space.
+ */
+ if (vd->vdev_state == VDEV_STATE_HEALTHY && asize > vd->vdev_asize &&
+ (vd->vdev_expanding || spa->spa_autoexpand))
+ vd->vdev_asize = asize;
- /*
- * If all children are healthy and the asize has increased,
- * then we've experienced dynamic LUN growth.
- */
- if (vd->vdev_state == VDEV_STATE_HEALTHY &&
- asize > vd->vdev_asize) {
- vd->vdev_asize = asize;
- }
- }
+ vdev_set_min_asize(vd);
/*
* Ensure we can issue some IO before declaring the
@@ -1098,18 +1133,6 @@ vdev_open(vdev_t *vd)
}
/*
- * If this is a top-level vdev, compute the raidz-deflation
- * ratio. Note, we hard-code in 128k (1<<17) because it is the
- * current "typical" blocksize. Even if SPA_MAXBLOCKSIZE
- * changes, this algorithm must never change, or we will
- * inconsistently account for existing bp's.
- */
- if (vd->vdev_top == vd) {
- vd->vdev_deflate_ratio = (1<<17) /
- (vdev_psize_to_asize(vd, 1<<17) >> SPA_MINBLOCKSHIFT);
- }
-
- /*
* If a leaf vdev has a DTL, and seems healthy, then kick off a
* resilver. But don't do this if we are doing a reopen for a scrub,
* since this would just restart the scrub we are already doing.
@@ -1135,12 +1158,11 @@ int
vdev_validate(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
- int c;
nvlist_t *label;
uint64_t guid, top_guid;
uint64_t state;
- for (c = 0; c < vd->vdev_children; c++)
+ for (int c = 0; c < vd->vdev_children; c++)
if (vdev_validate(vd->vdev_child[c]) != 0)
return (EBADF);
@@ -1226,7 +1248,7 @@ vdev_close(vdev_t *vd)
vdev_cache_purge(vd);
/*
- * We record the previous state before we close it, so that if we are
+ * We record the previous state before we close it, so that if we are
* doing a reopen(), we don't generate FMA ereports if we notice that
* it's still faulted.
*/
@@ -1257,12 +1279,9 @@ vdev_reopen(vdev_t *vd)
if (vd->vdev_aux) {
(void) vdev_validate_aux(vd);
if (vdev_readable(vd) && vdev_writeable(vd) &&
- !l2arc_vdev_present(vd)) {
- uint64_t size = vdev_get_rsize(vd);
- l2arc_add_vdev(spa, vd,
- VDEV_LABEL_START_SIZE,
- size - VDEV_LABEL_START_SIZE);
- }
+ vd->vdev_aux == &spa->spa_l2cache &&
+ !l2arc_vdev_present(vd))
+ l2arc_add_vdev(spa, vd);
} else {
(void) vdev_validate(vd);
}
@@ -1302,26 +1321,14 @@ vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
return (0);
}
-/*
- * The is the latter half of vdev_create(). It is distinct because it
- * involves initiating transactions in order to do metaslab creation.
- * For creation, we want to try to create all vdevs at once and then undo it
- * if anything fails; this is much harder if we have pending transactions.
- */
void
-vdev_init(vdev_t *vd, uint64_t txg)
+vdev_metaslab_set_size(vdev_t *vd)
{
/*
* Aim for roughly 200 metaslabs per vdev.
*/
vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
-
- /*
- * Initialize the vdev's metaslabs. This can't fail because
- * there's nothing to read when creating all new metaslabs.
- */
- VERIFY(vdev_metaslab_init(vd, txg) == 0);
}
void
@@ -1879,7 +1886,7 @@ vdev_degrade(spa_t *spa, uint64_t guid)
int
vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
{
- vdev_t *vd;
+ vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;
spa_vdev_state_enter(spa);
@@ -1889,13 +1896,26 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
if (!vd->vdev_ops->vdev_op_leaf)
return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
+ tvd = vd->vdev_top;
vd->vdev_offline = B_FALSE;
vd->vdev_tmpoffline = B_FALSE;
vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE);
vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT);
- vdev_reopen(vd->vdev_top);
+
+ /* XXX - L2ARC 1.0 does not support expansion */
+ if (!vd->vdev_aux) {
+ for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
+ pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND);
+ }
+
+ vdev_reopen(tvd);
vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;
+ if (!vd->vdev_aux) {
+ for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
+ pvd->vdev_expanding = B_FALSE;
+ }
+
if (newstate)
*newstate = vd->vdev_state;
if ((flags & ZFS_ONLINE_UNSPARE) &&
@@ -1904,13 +1924,21 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
vd->vdev_parent->vdev_child[0] == vd)
vd->vdev_unspare = B_TRUE;
+ if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) {
+
+ /* XXX - L2ARC 1.0 does not support expansion */
+ if (vd->vdev_aux)
+ return (spa_vdev_state_exit(spa, vd, ENOTSUP));
+ spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
+ }
return (spa_vdev_state_exit(spa, vd, 0));
}
int
vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
{
- vdev_t *vd;
+ vdev_t *vd, *tvd;
+ int error;
spa_vdev_state_enter(spa);
@@ -1920,34 +1948,58 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
if (!vd->vdev_ops->vdev_op_leaf)
return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
+ tvd = vd->vdev_top;
+
/*
* If the device isn't already offline, try to offline it.
*/
if (!vd->vdev_offline) {
/*
* If this device has the only valid copy of some data,
- * don't allow it to be offlined.
+ * don't allow it to be offlined. Log devices are always
+ * expendable.
*/
- if (vd->vdev_aux == NULL && vdev_dtl_required(vd))
+ if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
+ vdev_dtl_required(vd))
return (spa_vdev_state_exit(spa, NULL, EBUSY));
/*
* Offline this device and reopen its top-level vdev.
- * If this action results in the top-level vdev becoming
- * unusable, undo it and fail the request.
+ * If the top-level vdev is a log device then just offline
+ * it. Otherwise, if this action results in the top-level
+ * vdev becoming unusable, undo it and fail the request.
*/
vd->vdev_offline = B_TRUE;
- vdev_reopen(vd->vdev_top);
- if (vd->vdev_aux == NULL && vdev_is_dead(vd->vdev_top)) {
+ vdev_reopen(tvd);
+
+ if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
+ vdev_is_dead(tvd)) {
vd->vdev_offline = B_FALSE;
- vdev_reopen(vd->vdev_top);
+ vdev_reopen(tvd);
return (spa_vdev_state_exit(spa, NULL, EBUSY));
}
}
vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);
- return (spa_vdev_state_exit(spa, vd, 0));
+ if (!tvd->vdev_islog || !vdev_is_dead(tvd))
+ return (spa_vdev_state_exit(spa, vd, 0));
+
+ (void) spa_vdev_state_exit(spa, vd, 0);
+
+ error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
+ NULL, DS_FIND_CHILDREN);
+ if (error) {
+ (void) vdev_online(spa, guid, 0, NULL);
+ return (error);
+ }
+ /*
+ * If we successfully offlined the log device then we need to
+ * sync out the current txg so that the "stubby" block can be
+ * removed by zil_sync().
+ */
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+ return (0);
}
/*
@@ -2062,7 +2114,9 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
vs->vs_scrub_errors = vd->vdev_spa->spa_scrub_errors;
vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
vs->vs_state = vd->vdev_state;
- vs->vs_rsize = vdev_get_rsize(vd);
+ vs->vs_rsize = vdev_get_min_asize(vd);
+ if (vd->vdev_ops->vdev_op_leaf)
+ vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
mutex_exit(&vd->vdev_stat_lock);
/*
@@ -2155,14 +2209,24 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
if (flags & ZIO_FLAG_SPECULATIVE)
return;
+ /*
+ * If this is an I/O error that is going to be retried, then ignore the
+ * error. Otherwise, the user may interpret B_FAILFAST I/O errors as
+ * hard errors, when in reality they can happen for any number of
+ * innocuous reasons (bus resets, MPxIO link failure, etc).
+ */
+ if (zio->io_error == EIO &&
+ !(zio->io_flags & ZIO_FLAG_IO_RETRY))
+ return;
+
mutex_enter(&vd->vdev_stat_lock);
- if (type == ZIO_TYPE_READ) {
+ if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) {
if (zio->io_error == ECKSUM)
vs->vs_checksum_errors++;
else
vs->vs_read_errors++;
}
- if (type == ZIO_TYPE_WRITE)
+ if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd))
vs->vs_write_errors++;
mutex_exit(&vd->vdev_stat_lock);
@@ -2205,10 +2269,9 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
void
vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
{
- int c;
vdev_stat_t *vs = &vd->vdev_stat;
- for (c = 0; c < vd->vdev_children; c++)
+ for (int c = 0; c < vd->vdev_children; c++)
vdev_scrub_stat_update(vd->vdev_child[c], type, complete);
mutex_enter(&vd->vdev_stat_lock);
@@ -2252,6 +2315,7 @@ vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta,
* childrens', thus not accurate enough for us.
*/
ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0);
+ ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) *
vd->vdev_deflate_ratio;
@@ -2293,8 +2357,8 @@ vdev_config_dirty(vdev_t *vd)
int c;
/*
- * If this is an aux vdev (as with l2cache devices), then we update the
- * vdev config manually and set the sync flag.
+ * If this is an aux vdev (as with l2cache and spare devices), then we
+ * update the vdev config manually and set the sync flag.
*/
if (vd->vdev_aux != NULL) {
spa_aux_vdev_t *sav = vd->vdev_aux;
@@ -2316,8 +2380,11 @@ vdev_config_dirty(vdev_t *vd)
sav->sav_sync = B_TRUE;
- VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
- ZPOOL_CONFIG_L2CACHE, &aux, &naux) == 0);
+ if (nvlist_lookup_nvlist_array(sav->sav_config,
+ ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) {
+ VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
+ ZPOOL_CONFIG_SPARES, &aux, &naux) == 0);
+ }
ASSERT(c < naux);
@@ -2415,11 +2482,10 @@ vdev_propagate_state(vdev_t *vd)
vdev_t *rvd = spa->spa_root_vdev;
int degraded = 0, faulted = 0;
int corrupted = 0;
- int c;
vdev_t *child;
if (vd->vdev_children > 0) {
- for (c = 0; c < vd->vdev_children; c++) {
+ for (int c = 0; c < vd->vdev_children; c++) {
child = vd->vdev_child[c];
if (!vdev_readable(child) ||
@@ -2523,7 +2589,6 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
* an error.
*/
if (spa->spa_load_state == SPA_LOAD_IMPORT &&
- !spa->spa_import_faulted &&
vd->vdev_ops->vdev_op_leaf)
vd->vdev_not_present = 1;
@@ -2582,8 +2647,8 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
vd->vdev_removed = B_FALSE;
}
- if (!isopen)
- vdev_propagate_state(vd);
+ if (!isopen && vd->vdev_parent)
+ vdev_propagate_state(vd->vdev_parent);
}
/*
@@ -2595,8 +2660,6 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
boolean_t
vdev_is_bootable(vdev_t *vd)
{
- int c;
-
if (!vd->vdev_ops->vdev_op_leaf) {
char *vdev_type = vd->vdev_ops->vdev_op_type;
@@ -2611,9 +2674,53 @@ vdev_is_bootable(vdev_t *vd)
return (B_FALSE);
}
- for (c = 0; c < vd->vdev_children; c++) {
+ for (int c = 0; c < vd->vdev_children; c++) {
if (!vdev_is_bootable(vd->vdev_child[c]))
return (B_FALSE);
}
return (B_TRUE);
}
+
+void
+vdev_load_log_state(vdev_t *vd, nvlist_t *nv)
+{
+ uint_t children;
+ nvlist_t **child;
+ uint64_t val;
+ spa_t *spa = vd->vdev_spa;
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) == 0) {
+ for (int c = 0; c < children; c++)
+ vdev_load_log_state(vd->vdev_child[c], child[c]);
+ }
+
+ if (vd->vdev_ops->vdev_op_leaf && nvlist_lookup_uint64(nv,
+ ZPOOL_CONFIG_OFFLINE, &val) == 0 && val) {
+
+ /*
+ * It would be nice to call vdev_offline()
+ * directly but the pool isn't fully loaded and
+ * the txg threads have not been started yet.
+ */
+ spa_config_enter(spa, SCL_STATE_ALL, FTAG, RW_WRITER);
+ vd->vdev_offline = val;
+ vdev_reopen(vd->vdev_top);
+ spa_config_exit(spa, SCL_STATE_ALL, FTAG);
+ }
+}
+
+/*
+ * Expand a vdev if possible.
+ */
+void
+vdev_expand(vdev_t *vd, uint64_t txg)
+{
+ ASSERT(vd->vdev_top == vd);
+ ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) {
+ VERIFY(vdev_metaslab_init(vd, txg) == 0);
+ vdev_config_dirty(vd);
+ }
+}
diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c
index f61de3c4e..48d5fc232 100644
--- a/module/zfs/vdev_label.c
+++ b/module/zfs/vdev_label.c
@@ -233,6 +233,10 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
vd->vdev_physpath) == 0);
+ if (vd->vdev_fru != NULL)
+ VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_FRU,
+ vd->vdev_fru) == 0);
+
if (vd->vdev_nparity != 0) {
ASSERT(strcmp(vd->vdev_ops->vdev_op_type,
VDEV_TYPE_RAIDZ) == 0);
@@ -335,8 +339,8 @@ vdev_label_read_config(vdev_t *vd)
nvlist_t *config = NULL;
vdev_phys_t *vp;
zio_t *zio;
- int flags =
- ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
+ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_SPECULATIVE;
ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
@@ -345,6 +349,7 @@ vdev_label_read_config(vdev_t *vd)
vp = zio_buf_alloc(sizeof (vdev_phys_t));
+retry:
for (int l = 0; l < VDEV_LABELS; l++) {
zio = zio_root(spa, NULL, NULL, flags);
@@ -364,6 +369,11 @@ vdev_label_read_config(vdev_t *vd)
}
}
+ if (config == NULL && !(flags & ZIO_FLAG_TRYHARD)) {
+ flags |= ZIO_FLAG_TRYHARD;
+ goto retry;
+ }
+
zio_buf_free(vp, sizeof (vdev_phys_t));
return (config);
@@ -488,7 +498,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
spa_t *spa = vd->vdev_spa;
nvlist_t *label;
vdev_phys_t *vp;
- vdev_boot_header_t *vb;
+ char *pad2;
uberblock_t *ub;
zio_t *zio;
char *buf;
@@ -630,16 +640,6 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
}
/*
- * Initialize boot block header.
- */
- vb = zio_buf_alloc(sizeof (vdev_boot_header_t));
- bzero(vb, sizeof (vdev_boot_header_t));
- vb->vb_magic = VDEV_BOOT_MAGIC;
- vb->vb_version = VDEV_BOOT_VERSION;
- vb->vb_offset = VDEV_BOOT_OFFSET;
- vb->vb_size = VDEV_BOOT_SIZE;
-
- /*
* Initialize uberblock template.
*/
ub = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd));
@@ -647,9 +647,14 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
*ub = spa->spa_uberblock;
ub->ub_txg = 0;
+ /* Initialize the 2nd padding area. */
+ pad2 = zio_buf_alloc(VDEV_PAD_SIZE);
+ bzero(pad2, VDEV_PAD_SIZE);
+
/*
* Write everything in parallel.
*/
+retry:
zio = zio_root(spa, NULL, NULL, flags);
for (int l = 0; l < VDEV_LABELS; l++) {
@@ -658,9 +663,14 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
offsetof(vdev_label_t, vl_vdev_phys),
sizeof (vdev_phys_t), NULL, NULL, flags);
- vdev_label_write(zio, vd, l, vb,
- offsetof(vdev_label_t, vl_boot_header),
- sizeof (vdev_boot_header_t), NULL, NULL, flags);
+ /*
+ * Skip the 1st padding area.
+ * Zero out the 2nd padding area where it might have
+ * left over data from previous filesystem format.
+ */
+ vdev_label_write(zio, vd, l, pad2,
+ offsetof(vdev_label_t, vl_pad2),
+ VDEV_PAD_SIZE, NULL, NULL, flags);
for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
vdev_label_write(zio, vd, l, ub,
@@ -671,9 +681,14 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
error = zio_wait(zio);
+ if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) {
+ flags |= ZIO_FLAG_TRYHARD;
+ goto retry;
+ }
+
nvlist_free(label);
+ zio_buf_free(pad2, VDEV_PAD_SIZE);
zio_buf_free(ub, VDEV_UBERBLOCK_SIZE(vd));
- zio_buf_free(vb, sizeof (vdev_boot_header_t));
zio_buf_free(vp, sizeof (vdev_phys_t));
/*
@@ -757,8 +772,8 @@ vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest)
{
spa_t *spa = vd->vdev_spa;
vdev_t *rvd = spa->spa_root_vdev;
- int flags =
- ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
+ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD;
if (vd == rvd) {
ASSERT(zio == NULL);
@@ -996,7 +1011,7 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags)
* at any time, you can just call it again, and it will resume its work.
*/
int
-vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg)
+vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg, boolean_t tryhard)
{
spa_t *spa = svd[0]->vdev_spa;
uberblock_t *ub = &spa->spa_uberblock;
@@ -1005,6 +1020,16 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg)
int error;
int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
+ /*
+ * Normally, we don't want to try too hard to write every label and
+ * uberblock. If there is a flaky disk, we don't want the rest of the
+ * sync process to block while we retry. But if we can't write a
+ * single label out, we should retry with ZIO_FLAG_TRYHARD before
+ * bailing out and declaring the pool faulted.
+ */
+ if (tryhard)
+ flags |= ZIO_FLAG_TRYHARD;
+
ASSERT(ub->ub_txg <= txg);
/*
diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c
index 4e3c20acd..5e57a1513 100644
--- a/module/zfs/vdev_queue.c
+++ b/module/zfs/vdev_queue.c
@@ -48,10 +48,11 @@ int zfs_vdev_time_shift = 6;
int zfs_vdev_ramp_rate = 2;
/*
- * i/os will be aggregated into a single large i/o up to
- * zfs_vdev_aggregation_limit bytes long.
+ * To reduce IOPs, we aggregate small adjacent i/os into one large i/o.
+ * For read i/os, we also aggregate across small adjacency gaps.
*/
int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE;
+int zfs_vdev_read_gap_limit = 32 << 10;
/*
* Virtual device vector for disk I/O scheduling.
@@ -159,16 +160,23 @@ vdev_queue_agg_io_done(zio_t *aio)
zio_buf_free(aio->io_data, aio->io_size);
}
-#define IS_ADJACENT(io, nio) \
- ((io)->io_offset + (io)->io_size == (nio)->io_offset)
+/*
+ * Compute the range spanned by two i/os, which is the endpoint of the last
+ * (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset).
+ * Conveniently, the gap between fio and lio is given by -IO_SPAN(lio, fio);
+ * thus fio and lio are adjacent if and only if IO_SPAN(lio, fio) == 0.
+ */
+#define IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset)
+#define IO_GAP(fio, lio) (-IO_SPAN(lio, fio))
static zio_t *
vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
{
zio_t *fio, *lio, *aio, *dio, *nio;
avl_tree_t *t;
- uint64_t size;
int flags;
+ uint64_t maxspan = zfs_vdev_aggregation_limit;
+ uint64_t maxgap;
ASSERT(MUTEX_HELD(&vq->vq_lock));
@@ -179,8 +187,8 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
fio = lio = avl_first(&vq->vq_deadline_tree);
t = fio->io_vdev_tree;
- size = fio->io_size;
flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT;
+ maxgap = (t == &vq->vq_read_tree) ? zfs_vdev_read_gap_limit : 0;
if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) {
/*
@@ -191,22 +199,18 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
* scrub/resilver, can be preserved in the aggregate.
*/
while ((dio = AVL_PREV(t, fio)) != NULL &&
- IS_ADJACENT(dio, fio) &&
(dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
- size + dio->io_size <= zfs_vdev_aggregation_limit) {
+ IO_SPAN(dio, lio) <= maxspan && IO_GAP(dio, fio) <= maxgap)
fio = dio;
- size += dio->io_size;
- }
+
while ((dio = AVL_NEXT(t, lio)) != NULL &&
- IS_ADJACENT(lio, dio) &&
(dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
- size + dio->io_size <= zfs_vdev_aggregation_limit) {
+ IO_SPAN(fio, dio) <= maxspan && IO_GAP(lio, dio) <= maxgap)
lio = dio;
- size += dio->io_size;
- }
}
if (fio != lio) {
+ uint64_t size = IO_SPAN(fio, lio);
ASSERT(size <= zfs_vdev_aggregation_limit);
aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset,
@@ -214,9 +218,10 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
vdev_queue_agg_io_done, NULL);
- /* We want to process lio, then stop */
- lio = AVL_NEXT(t, lio);
- for (dio = fio; dio != lio; dio = nio) {
+ nio = fio;
+ do {
+ dio = nio;
+ nio = AVL_NEXT(t, dio);
ASSERT(dio->io_type == aio->io_type);
ASSERT(dio->io_vdev_tree == t);
@@ -224,13 +229,12 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
bcopy(dio->io_data, (char *)aio->io_data +
(dio->io_offset - aio->io_offset),
dio->io_size);
- nio = AVL_NEXT(t, dio);
zio_add_child(dio, aio);
vdev_queue_io_remove(vq, dio);
zio_vdev_io_bypass(dio);
zio_execute(dio);
- }
+ } while (dio != lio);
avl_add(&vq->vq_pending_tree, aio);
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index ad997f528..92753d871 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -697,7 +697,7 @@ vdev_raidz_io_start(zio_t *zio)
continue;
}
if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
- (zio->io_flags & ZIO_FLAG_SCRUB)) {
+ (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
rc->rc_offset, rc->rc_data, rc->rc_size,
zio->io_type, zio->io_priority, 0,
diff --git a/module/zfs/zap.c b/module/zfs/zap.c
index ca859ec35..2dc2705b9 100644
--- a/module/zfs/zap.c
+++ b/module/zfs/zap.c
@@ -19,13 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-
/*
* This file contains the top half of the zfs directory structure
* implementation. The bottom half is in zap_leaf.c.
@@ -45,6 +42,7 @@
#include <sys/dmu.h>
#include <sys/zfs_context.h>
#include <sys/zfs_znode.h>
+#include <sys/fs/zfs.h>
#include <sys/zap.h>
#include <sys/refcount.h>
#include <sys/zap_impl.h>
@@ -1134,3 +1132,58 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs)
}
}
}
+
+int
+fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite,
+ uint64_t *tooverwrite)
+{
+ zap_t *zap = zn->zn_zap;
+ zap_leaf_t *l;
+ int err;
+
+ /*
+ * Account for the header block of the fatzap.
+ */
+ if (!add && dmu_buf_freeable(zap->zap_dbuf)) {
+ tooverwrite += zap->zap_dbuf->db_size;
+ } else {
+ towrite += zap->zap_dbuf->db_size;
+ }
+
+ /*
+ * Account for the pointer table blocks.
+ * If we are adding we need to account for the following cases :
+ * - If the pointer table is embedded, this operation could force an
+ * external pointer table.
+ * - If this already has an external pointer table this operation
+ * could extend the table.
+ */
+ if (add) {
+ if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0)
+ towrite += zap->zap_dbuf->db_size;
+ else
+ towrite += (zap->zap_dbuf->db_size * 3);
+ }
+
+ /*
+ * Now, check if the block containing leaf is freeable
+ * and account accordingly.
+ */
+ err = zap_deref_leaf(zap, zn->zn_hash, NULL, RW_READER, &l);
+ if (err != 0) {
+ return (err);
+ }
+
+ if (!add && dmu_buf_freeable(l->l_dbuf)) {
+ tooverwrite += l->l_dbuf->db_size;
+ } else {
+ /*
+ * If this an add operation, the leaf block could split.
+ * Hence, we need to account for an additional leaf block.
+ */
+ towrite += (add ? 2 : 1) * l->l_dbuf->db_size;
+ }
+
+ zap_put_leaf(l);
+ return (0);
+}
diff --git a/module/zfs/zap_leaf.c b/module/zfs/zap_leaf.c
index da498b6bc..9d8354e73 100644
--- a/module/zfs/zap_leaf.c
+++ b/module/zfs/zap_leaf.c
@@ -19,24 +19,23 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* The 512-byte leaf is broken into 32 16-byte chunks.
* chunk number n means l_chunk[n], even though the header precedes it.
* the names are stored null-terminated.
*/
+#include <sys/spa.h>
+#include <sys/dmu.h>
#include <sys/zfs_context.h>
+#include <sys/fs/zfs.h>
#include <sys/zap.h>
#include <sys/zap_impl.h>
#include <sys/zap_leaf.h>
-#include <sys/spa.h>
-#include <sys/dmu.h>
static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry);
diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c
index abba42775..fbc93b423 100644
--- a/module/zfs/zap_micro.c
+++ b/module/zfs/zap_micro.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/zfs_context.h>
@@ -78,8 +76,8 @@ zap_normalize(zap_t *zap, const char *name, char *namenorm)
err = 0;
(void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen,
- zap->zap_normflags | U8_TEXTPREP_IGNORE_NULL, U8_UNICODE_LATEST,
- &err);
+ zap->zap_normflags | U8_TEXTPREP_IGNORE_NULL |
+ U8_TEXTPREP_IGNORE_INVALID, U8_UNICODE_LATEST, &err);
return (err);
}
@@ -1067,3 +1065,79 @@ zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
zap_unlockdir(zap);
return (0);
}
+
+int
+zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
+ uint64_t *towrite, uint64_t *tooverwrite, uint64_t dn_datablkshift)
+{
+ zap_t *zap;
+ int err = 0;
+
+
+ /*
+ * Since, we don't have a name, we cannot figure out which blocks will
+ * be affected in this operation. So, account for the worst case :
+ * - 3 blocks overwritten: target leaf, ptrtbl block, header block
+ * - 4 new blocks written if adding:
+ * - 2 blocks for possibly split leaves,
+ * - 2 grown ptrtbl blocks
+ *
+ * This also accomodates the case where an add operation to a fairly
+ * large microzap results in a promotion to fatzap.
+ */
+ if (name == NULL) {
+ *towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE;
+ return (err);
+ }
+
+ /*
+ * We lock the zap with adding == FALSE. Because, if we pass
+ * the actual value of add, it could trigger a mzap_upgrade().
+ * At present we are just evaluating the possibility of this operation
+ * and hence we donot want to trigger an upgrade.
+ */
+ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
+ if (err)
+ return (err);
+
+ if (!zap->zap_ismicro) {
+ zap_name_t *zn = zap_name_alloc(zap, name, MT_EXACT);
+ if (zn) {
+ err = fzap_count_write(zn, add, towrite,
+ tooverwrite);
+ zap_name_free(zn);
+ } else {
+ /*
+ * We treat this case as similar to (name == NULL)
+ */
+ *towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE;
+ }
+ } else {
+ if (!add) {
+ if (dmu_buf_freeable(zap->zap_dbuf))
+ *tooverwrite += SPA_MAXBLOCKSIZE;
+ else
+ *towrite += SPA_MAXBLOCKSIZE;
+ } else {
+ /*
+ * We are here if we are adding and (name != NULL).
+ * It is hard to find out if this add will promote this
+ * microzap to fatzap. Hence, we assume the worst case
+ * and account for the blocks assuming this microzap
+ * would be promoted to a fatzap.
+ *
+ * 1 block overwritten : header block
+ * 4 new blocks written : 2 new split leaf, 2 grown
+ * ptrtbl blocks
+ */
+ if (dmu_buf_freeable(zap->zap_dbuf))
+ *tooverwrite += 1 << dn_datablkshift;
+ else
+ *towrite += 1 << dn_datablkshift;
+ *towrite += 4 << dn_datablkshift;
+ }
+ }
+
+ zap_unlockdir(zap);
+ return (err);
+}
diff --git a/module/zfs/zfs_acl.c b/module/zfs/zfs_acl.c
index fdf92a156..734bd8395 100644
--- a/module/zfs/zfs_acl.c
+++ b/module/zfs/zfs_acl.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -65,15 +65,16 @@
ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
#define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \
ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
-#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS)
#define ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \
ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \
ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \
ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE)
-#define WRITE_MASK (WRITE_MASK_DATA|ACE_WRITE_ATTRIBUTES|ACE_WRITE_ACL|\
- ACE_WRITE_OWNER|ACE_DELETE|ACE_DELETE_CHILD)
+#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS)
+#define WRITE_MASK_ATTRS (ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| \
+ ACE_DELETE|ACE_DELETE_CHILD)
+#define WRITE_MASK (WRITE_MASK_DATA|WRITE_MASK_ATTRS)
#define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
@@ -538,8 +539,9 @@ zfs_acl_curr_node(zfs_acl_t *aclp)
* ACE FUIDs will be created later.
*/
int
-zfs_copy_ace_2_fuid(vtype_t obj_type, zfs_acl_t *aclp, void *datap,
- zfs_ace_t *z_acl, int aclcnt, size_t *size)
+zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, vtype_t obj_type, zfs_acl_t *aclp,
+ void *datap, zfs_ace_t *z_acl, int aclcnt, size_t *size,
+ zfs_fuid_info_t **fuidp, cred_t *cr)
{
int i;
uint16_t entry_type;
@@ -555,9 +557,9 @@ zfs_copy_ace_2_fuid(vtype_t obj_type, zfs_acl_t *aclp, void *datap,
entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS;
if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP &&
entry_type != ACE_EVERYONE) {
- if (!aclp->z_has_fuids)
- aclp->z_has_fuids = IS_EPHEMERAL(acep->a_who);
- aceptr->z_fuid = (uint64_t)acep->a_who;
+ aceptr->z_fuid = zfs_fuid_create(zfsvfs, acep->a_who,
+ cr, (entry_type == 0) ?
+ ZFS_ACE_USER : ZFS_ACE_GROUP, fuidp);
}
/*
@@ -682,7 +684,7 @@ zfs_copy_ace_2_oldace(vtype_t obj_type, zfs_acl_t *aclp, ace_t *acep,
* convert old ACL format to new
*/
void
-zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp)
+zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp, cred_t *cr)
{
zfs_oldace_t *oldaclp;
int i;
@@ -714,9 +716,9 @@ zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp)
newaclnode = zfs_acl_node_alloc(aclp->z_acl_count *
sizeof (zfs_object_ace_t));
aclp->z_ops = zfs_acl_fuid_ops;
- VERIFY(zfs_copy_ace_2_fuid(ZTOV(zp)->v_type, aclp, oldaclp,
- newaclnode->z_acldata, aclp->z_acl_count,
- &newaclnode->z_size) == 0);
+ VERIFY(zfs_copy_ace_2_fuid(zp->z_zfsvfs, ZTOV(zp)->v_type, aclp,
+ oldaclp, newaclnode->z_acldata, aclp->z_acl_count,
+ &newaclnode->z_size, NULL, cr) == 0);
newaclnode->z_ace_count = aclp->z_acl_count;
aclp->z_version = ZFS_ACL_VERSION;
kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t));
@@ -770,8 +772,7 @@ zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask,
* Also, create FUIDs for any User/Group ACEs
*/
static uint64_t
-zfs_mode_fuid_compute(znode_t *zp, zfs_acl_t *aclp, cred_t *cr,
- zfs_fuid_info_t **fuidp, dmu_tx_t *tx)
+zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp)
{
int entry_type;
mode_t mode;
@@ -905,15 +906,6 @@ zfs_mode_fuid_compute(znode_t *zp, zfs_acl_t *aclp, cred_t *cr,
}
}
}
- /*
- * Now handle FUID create for user/group ACEs
- */
- if (entry_type == 0 || entry_type == ACE_IDENTIFIER_GROUP) {
- aclp->z_ops.ace_who_set(acep,
- zfs_fuid_create(zp->z_zfsvfs, who, cr,
- (entry_type == 0) ? ZFS_ACE_USER : ZFS_ACE_GROUP,
- tx, fuidp));
- }
}
return (mode);
}
@@ -989,7 +981,7 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify)
aclnode = zfs_acl_node_alloc(aclsize);
list_insert_head(&aclp->z_acl, aclnode);
error = dmu_read(zp->z_zfsvfs->z_os, extacl, 0,
- aclsize, aclnode->z_acldata);
+ aclsize, aclnode->z_acldata, DMU_READ_PREFETCH);
aclnode->z_ace_count = acl_count;
aclp->z_acl_count = acl_count;
aclp->z_acl_bytes = aclsize;
@@ -1014,8 +1006,7 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify)
* already checked the acl and knows whether to inherit.
*/
int
-zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr,
- zfs_fuid_info_t **fuidp, dmu_tx_t *tx)
+zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx)
{
int error;
znode_phys_t *zphys = zp->z_phys;
@@ -1026,12 +1017,9 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr,
dmu_object_type_t otype;
zfs_acl_node_t *aclnode;
- ASSERT(MUTEX_HELD(&zp->z_lock));
- ASSERT(MUTEX_HELD(&zp->z_acl_lock));
-
dmu_buf_will_dirty(zp->z_dbuf, tx);
- zphys->zp_mode = zfs_mode_fuid_compute(zp, aclp, cr, fuidp, tx);
+ zphys->zp_mode = zfs_mode_compute(zp, aclp);
/*
* Decide which opbject type to use. If we are forced to
@@ -1043,7 +1031,7 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr,
} else {
if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) &&
(zfsvfs->z_version >= ZPL_VERSION_FUID))
- zfs_acl_xform(zp, aclp);
+ zfs_acl_xform(zp, aclp, cr);
ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID);
otype = DMU_OT_ACL;
}
@@ -1125,7 +1113,6 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr,
if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0)
zp->z_phys->zp_flags |= ZFS_ACL_TRIVIAL;
- zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
return (0);
}
@@ -1336,7 +1323,7 @@ zfs_acl_ace_insert(zfs_acl_t *aclp, void *acep)
* Prepend deny ACE
*/
static void *
-zfs_acl_prepend_deny(znode_t *zp, zfs_acl_t *aclp, void *acep,
+zfs_acl_prepend_deny(uint64_t uid, zfs_acl_t *aclp, void *acep,
mode_t mode)
{
zfs_acl_node_t *aclnode;
@@ -1349,7 +1336,7 @@ zfs_acl_prepend_deny(znode_t *zp, zfs_acl_t *aclp, void *acep,
fuid = aclp->z_ops.ace_who_get(acep);
flags = aclp->z_ops.ace_flags_get(acep);
zfs_set_ace(aclp, newacep, 0, DENY, fuid, (flags & ACE_TYPE_FLAGS));
- zfs_acl_prepend_fixup(aclp, newacep, acep, mode, zp->z_phys->zp_uid);
+ zfs_acl_prepend_fixup(aclp, newacep, acep, mode, uid);
return (newacep);
}
@@ -1473,9 +1460,9 @@ zfs_fixup_group_entries(zfs_acl_t *aclp, void *acep, void *prevacep,
* in PSARC/2002/240
*/
static void
-zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp)
+zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t uid,
+ uint64_t mode, zfs_acl_t *aclp)
{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
void *acep = NULL, *prevacep = NULL;
uint64_t who;
int i;
@@ -1485,11 +1472,6 @@ zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp)
uint16_t iflags, type;
uint32_t access_mask;
- ASSERT(MUTEX_HELD(&zp->z_acl_lock));
- ASSERT(MUTEX_HELD(&zp->z_lock));
-
- aclp->z_hints = (zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS);
-
/*
* If discard then just discard all ACL nodes which
* represent the ACEs.
@@ -1554,17 +1536,15 @@ zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp)
if (!reuse_deny) {
prevacep =
- zfs_acl_prepend_deny(zp,
+ zfs_acl_prepend_deny(uid,
aclp, acep, mode);
} else {
zfs_acl_prepend_fixup(
aclp, prevacep,
- acep, mode,
- zp->z_phys->zp_uid);
+ acep, mode, uid);
}
zfs_fixup_group_entries(aclp, acep,
prevacep, mode);
-
}
}
}
@@ -1623,8 +1603,10 @@ zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode)
mutex_enter(&zp->z_acl_lock);
*aclp = NULL;
error = zfs_acl_node_read(zp, aclp, B_TRUE);
- if (error == 0)
- zfs_acl_chmod(zp, mode, *aclp);
+ if (error == 0) {
+ (*aclp)->z_hints = zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS;
+ zfs_acl_chmod(zp->z_zfsvfs, zp->z_phys->zp_uid, mode, *aclp);
+ }
mutex_exit(&zp->z_acl_lock);
mutex_exit(&zp->z_lock);
return (error);
@@ -1649,9 +1631,8 @@ zfs_restricted_update(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, void *acep)
* Should ACE be inherited?
*/
static int
-zfs_ace_can_use(znode_t *zp, uint16_t acep_flags)
+zfs_ace_can_use(vtype_t vtype, uint16_t acep_flags)
{
- int vtype = ZTOV(zp)->v_type;
int iflags = (acep_flags & 0xf);
if ((vtype == VDIR) && (iflags & ACE_DIRECTORY_INHERIT_ACE))
@@ -1666,10 +1647,9 @@ zfs_ace_can_use(znode_t *zp, uint16_t acep_flags)
* inherit inheritable ACEs from parent
*/
static zfs_acl_t *
-zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp, uint64_t mode,
- boolean_t *need_chmod)
+zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp,
+ uint64_t mode, boolean_t *need_chmod)
{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
void *pacep;
void *acep, *acep2;
zfs_acl_node_t *aclnode, *aclnode2;
@@ -1680,8 +1660,8 @@ zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp, uint64_t mode,
size_t ace_size;
void *data1, *data2;
size_t data1sz, data2sz;
- boolean_t vdir = ZTOV(zp)->v_type == VDIR;
- boolean_t vreg = ZTOV(zp)->v_type == VREG;
+ boolean_t vdir = vtype == VDIR;
+ boolean_t vreg = vtype == VREG;
boolean_t passthrough, passthrough_x, noallow;
passthrough_x =
@@ -1710,7 +1690,7 @@ zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp, uint64_t mode,
ace_size = aclp->z_ops.ace_size(pacep);
- if (!zfs_ace_can_use(zp, iflags))
+ if (!zfs_ace_can_use(vtype, iflags))
continue;
/*
@@ -1806,55 +1786,58 @@ zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp, uint64_t mode,
* Create file system object initial permissions
* including inheritable ACEs.
*/
-void
-zfs_perm_init(znode_t *zp, znode_t *parent, int flag,
- vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
- zfs_acl_t *setaclp, zfs_fuid_info_t **fuidp)
+int
+zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
+ vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids)
{
- uint64_t mode, fuid, fgid;
int error;
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- zfs_acl_t *aclp = NULL;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
zfs_acl_t *paclp;
- xvattr_t *xvap = (xvattr_t *)vap;
gid_t gid;
boolean_t need_chmod = B_TRUE;
- if (setaclp)
- aclp = setaclp;
+ bzero(acl_ids, sizeof (zfs_acl_ids_t));
+ acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode);
- mode = MAKEIMODE(vap->va_type, vap->va_mode);
+ if (vsecp)
+ if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, cr,
+ &acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0)
+ return (error);
/*
* Determine uid and gid.
*/
if ((flag & (IS_ROOT_NODE | IS_REPLAY)) ||
((flag & IS_XATTR) && (vap->va_type == VDIR))) {
- fuid = zfs_fuid_create(zfsvfs, vap->va_uid, cr,
- ZFS_OWNER, tx, fuidp);
- fgid = zfs_fuid_create(zfsvfs, vap->va_gid, cr,
- ZFS_GROUP, tx, fuidp);
+ acl_ids->z_fuid = zfs_fuid_create(zfsvfs,
+ (uint64_t)vap->va_uid, cr,
+ ZFS_OWNER, &acl_ids->z_fuidp);
+ acl_ids->z_fgid = zfs_fuid_create(zfsvfs,
+ (uint64_t)vap->va_gid, cr,
+ ZFS_GROUP, &acl_ids->z_fuidp);
gid = vap->va_gid;
} else {
- fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER, tx, cr, fuidp);
- fgid = 0;
+ acl_ids->z_fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER,
+ cr, &acl_ids->z_fuidp);
+ acl_ids->z_fgid = 0;
if (vap->va_mask & AT_GID) {
- fgid = zfs_fuid_create(zfsvfs, vap->va_gid, cr,
- ZFS_GROUP, tx, fuidp);
+ acl_ids->z_fgid = zfs_fuid_create(zfsvfs,
+ (uint64_t)vap->va_gid,
+ cr, ZFS_GROUP, &acl_ids->z_fuidp);
gid = vap->va_gid;
- if (fgid != parent->z_phys->zp_gid &&
+ if (acl_ids->z_fgid != dzp->z_phys->zp_gid &&
!groupmember(vap->va_gid, cr) &&
secpolicy_vnode_create_gid(cr) != 0)
- fgid = 0;
+ acl_ids->z_fgid = 0;
}
- if (fgid == 0) {
- if (parent->z_phys->zp_mode & S_ISGID) {
- fgid = parent->z_phys->zp_gid;
- gid = zfs_fuid_map_id(zfsvfs, fgid,
+ if (acl_ids->z_fgid == 0) {
+ if (dzp->z_phys->zp_mode & S_ISGID) {
+ acl_ids->z_fgid = dzp->z_phys->zp_gid;
+ gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid,
cr, ZFS_GROUP);
} else {
- fgid = zfs_fuid_create_cred(zfsvfs,
- ZFS_GROUP, tx, cr, fuidp);
+ acl_ids->z_fgid = zfs_fuid_create_cred(zfsvfs,
+ ZFS_GROUP, cr, &acl_ids->z_fuidp);
gid = crgetgid(cr);
}
}
@@ -1867,57 +1850,61 @@ zfs_perm_init(znode_t *zp, znode_t *parent, int flag,
* file's new group, clear the file's set-GID bit.
*/
- if ((parent->z_phys->zp_mode & S_ISGID) && (vap->va_type == VDIR)) {
- mode |= S_ISGID;
+ if (!(flag & IS_ROOT_NODE) && (dzp->z_phys->zp_mode & S_ISGID) &&
+ (vap->va_type == VDIR)) {
+ acl_ids->z_mode |= S_ISGID;
} else {
- if ((mode & S_ISGID) &&
+ if ((acl_ids->z_mode & S_ISGID) &&
secpolicy_vnode_setids_setgids(cr, gid) != 0)
- mode &= ~S_ISGID;
- }
-
- zp->z_phys->zp_uid = fuid;
- zp->z_phys->zp_gid = fgid;
- zp->z_phys->zp_mode = mode;
-
- if (aclp == NULL) {
- mutex_enter(&parent->z_lock);
- if ((ZTOV(parent)->v_type == VDIR &&
- (parent->z_phys->zp_flags & ZFS_INHERIT_ACE)) &&
- !(zp->z_phys->zp_flags & ZFS_XATTR)) {
- mutex_enter(&parent->z_acl_lock);
- VERIFY(0 == zfs_acl_node_read(parent, &paclp, B_FALSE));
- mutex_exit(&parent->z_acl_lock);
- aclp = zfs_acl_inherit(zp, paclp, mode, &need_chmod);
+ acl_ids->z_mode &= ~S_ISGID;
+ }
+
+ if (acl_ids->z_aclp == NULL) {
+ mutex_enter(&dzp->z_lock);
+ if (!(flag & IS_ROOT_NODE) && (ZTOV(dzp)->v_type == VDIR &&
+ (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)) &&
+ !(dzp->z_phys->zp_flags & ZFS_XATTR)) {
+ mutex_enter(&dzp->z_acl_lock);
+ VERIFY(0 == zfs_acl_node_read(dzp, &paclp, B_FALSE));
+ mutex_exit(&dzp->z_acl_lock);
+ acl_ids->z_aclp = zfs_acl_inherit(zfsvfs,
+ vap->va_type, paclp, acl_ids->z_mode, &need_chmod);
zfs_acl_free(paclp);
} else {
- aclp = zfs_acl_alloc(zfs_acl_version_zp(zp));
+ acl_ids->z_aclp =
+ zfs_acl_alloc(zfs_acl_version_zp(dzp));
+ }
+ mutex_exit(&dzp->z_lock);
+ if (need_chmod) {
+ acl_ids->z_aclp->z_hints = (vap->va_type == VDIR) ?
+ ZFS_ACL_AUTO_INHERIT : 0;
+ zfs_acl_chmod(zfsvfs, acl_ids->z_fuid,
+ acl_ids->z_mode, acl_ids->z_aclp);
}
- mutex_exit(&parent->z_lock);
- mutex_enter(&zp->z_lock);
- mutex_enter(&zp->z_acl_lock);
- if (need_chmod)
- zfs_acl_chmod(zp, mode, aclp);
- } else {
- mutex_enter(&zp->z_lock);
- mutex_enter(&zp->z_acl_lock);
}
- /* Force auto_inherit on all new directory objects */
- if (vap->va_type == VDIR)
- aclp->z_hints |= ZFS_ACL_AUTO_INHERIT;
-
- error = zfs_aclset_common(zp, aclp, cr, fuidp, tx);
-
- /* Set optional attributes if any */
- if (vap->va_mask & AT_XVATTR)
- zfs_xvattr_set(zp, xvap);
+ return (0);
+}
- mutex_exit(&zp->z_lock);
- mutex_exit(&zp->z_acl_lock);
- ASSERT3U(error, ==, 0);
+/*
+ * Free ACL and fuid_infop, but not the acl_ids structure
+ */
+void
+zfs_acl_ids_free(zfs_acl_ids_t *acl_ids)
+{
+ if (acl_ids->z_aclp)
+ zfs_acl_free(acl_ids->z_aclp);
+ if (acl_ids->z_fuidp)
+ zfs_fuid_info_free(acl_ids->z_fuidp);
+ acl_ids->z_aclp = NULL;
+ acl_ids->z_fuidp = NULL;
+}
- if (aclp != setaclp)
- zfs_acl_free(aclp);
+boolean_t
+zfs_acl_ids_overquota(zfsvfs_t *zfsvfs, zfs_acl_ids_t *acl_ids)
+{
+ return (zfs_usergroup_overquota(zfsvfs, B_FALSE, acl_ids->z_fuid) ||
+ zfs_usergroup_overquota(zfsvfs, B_TRUE, acl_ids->z_fgid));
}
/*
@@ -2018,7 +2005,7 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
int
zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, vtype_t obj_type,
- vsecattr_t *vsecp, zfs_acl_t **zaclp)
+ vsecattr_t *vsecp, cred_t *cr, zfs_fuid_info_t **fuidp, zfs_acl_t **zaclp)
{
zfs_acl_t *aclp;
zfs_acl_node_t *aclnode;
@@ -2041,9 +2028,9 @@ zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, vtype_t obj_type,
return (error);
}
} else {
- if ((error = zfs_copy_ace_2_fuid(obj_type, aclp,
+ if ((error = zfs_copy_ace_2_fuid(zfsvfs, obj_type, aclp,
vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt,
- &aclnode->z_size)) != 0) {
+ &aclnode->z_size, fuidp, cr)) != 0) {
zfs_acl_free(aclp);
zfs_acl_node_free(aclnode);
return (error);
@@ -2084,6 +2071,7 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
int error;
zfs_acl_t *aclp;
zfs_fuid_info_t *fuidp = NULL;
+ boolean_t fuid_dirtied;
if (mask == 0)
return (ENOSYS);
@@ -2094,7 +2082,8 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
if (error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr))
return (error);
- error = zfs_vsec_2_aclp(zfsvfs, ZTOV(zp)->v_type, vsecp, &aclp);
+ error = zfs_vsec_2_aclp(zfsvfs, ZTOV(zp)->v_type, vsecp, cr, &fuidp,
+ &aclp);
if (error)
return (error);
@@ -2135,18 +2124,9 @@ top:
} else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes);
}
- if (aclp->z_has_fuids) {
- if (zfsvfs->z_fuid_obj == 0) {
- dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
- FUID_SIZE_ESTIMATE(zfsvfs));
- dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
- } else {
- dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
- dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
- FUID_SIZE_ESTIMATE(zfsvfs));
- }
- }
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) {
@@ -2163,9 +2143,13 @@ top:
return (error);
}
- error = zfs_aclset_common(zp, aclp, cr, &fuidp, tx);
+ error = zfs_aclset_common(zp, aclp, cr, tx);
ASSERT(error == 0);
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
zfs_log_acl(zilog, tx, zp, vsecp, fuidp);
if (fuidp)
@@ -2180,45 +2164,17 @@ done:
}
/*
- * working_mode returns the permissions that were not granted
+ * Check accesses of interest (AoI) against attributes of the dataset
+ * such as read-only. Returns zero if no AoI conflict with dataset
+ * attributes, otherwise an appropriate errno is returned.
*/
static int
-zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode,
- boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr)
+zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode)
{
- zfs_acl_t *aclp;
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- int error;
- uid_t uid = crgetuid(cr);
- uint64_t who;
- uint16_t type, iflags;
- uint16_t entry_type;
- uint32_t access_mask;
- uint32_t deny_mask = 0;
- zfs_ace_hdr_t *acep = NULL;
- boolean_t checkit;
- uid_t fowner;
- uid_t gowner;
-
- /*
- * Short circuit empty requests
- */
- if (v4_mode == 0)
- return (0);
-
- *check_privs = B_TRUE;
-
- if (zfsvfs->z_replay) {
- *working_mode = 0;
- return (0);
- }
-
- *working_mode = v4_mode;
-
if ((v4_mode & WRITE_MASK) &&
(zp->z_zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) &&
- (!IS_DEVVP(ZTOV(zp)))) {
- *check_privs = B_FALSE;
+ (!IS_DEVVP(ZTOV(zp)) ||
+ (IS_DEVVP(ZTOV(zp)) && (v4_mode & WRITE_MASK_ATTRS)))) {
return (EROFS);
}
@@ -2230,31 +2186,64 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode,
(zp->z_phys->zp_flags & (ZFS_READONLY | ZFS_IMMUTABLE))) ||
(ZTOV(zp)->v_type == VDIR &&
(zp->z_phys->zp_flags & ZFS_IMMUTABLE)))) {
- *check_privs = B_FALSE;
return (EPERM);
}
if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) &&
(zp->z_phys->zp_flags & ZFS_NOUNLINK)) {
- *check_privs = B_FALSE;
return (EPERM);
}
if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) &&
(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED))) {
- *check_privs = B_FALSE;
return (EACCES);
}
- /*
- * The caller requested that the ACL check be skipped. This
- * would only happen if the caller checked VOP_ACCESS() with a
- * 32 bit ACE mask and already had the appropriate permissions.
- */
- if (skipaclchk) {
- *working_mode = 0;
- return (0);
- }
+ return (0);
+}
+
+/*
+ * The primary usage of this function is to loop through all of the
+ * ACEs in the znode, determining what accesses of interest (AoI) to
+ * the caller are allowed or denied. The AoI are expressed as bits in
+ * the working_mode parameter. As each ACE is processed, bits covered
+ * by that ACE are removed from the working_mode. This removal
+ * facilitates two things. The first is that when the working mode is
+ * empty (= 0), we know we've looked at all the AoI. The second is
+ * that the ACE interpretation rules don't allow a later ACE to undo
+ * something granted or denied by an earlier ACE. Removing the
+ * discovered access or denial enforces this rule. At the end of
+ * processing the ACEs, all AoI that were found to be denied are
+ * placed into the working_mode, giving the caller a mask of denied
+ * accesses. Returns:
+ * 0 if all AoI granted
+ * EACCESS if the denied mask is non-zero
+ * other error if abnormal failure (e.g., IO error)
+ *
+ * A secondary usage of the function is to determine if any of the
+ * AoI are granted. If an ACE grants any access in
+ * the working_mode, we immediately short circuit out of the function.
+ * This mode is chosen by setting anyaccess to B_TRUE. The
+ * working_mode is not a denied access mask upon exit if the function
+ * is used in this manner.
+ */
+static int
+zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode,
+ boolean_t anyaccess, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ zfs_acl_t *aclp;
+ int error;
+ uid_t uid = crgetuid(cr);
+ uint64_t who;
+ uint16_t type, iflags;
+ uint16_t entry_type;
+ uint32_t access_mask;
+ uint32_t deny_mask = 0;
+ zfs_ace_hdr_t *acep = NULL;
+ boolean_t checkit;
+ uid_t fowner;
+ uid_t gowner;
zfs_fuid_map_ids(zp, cr, &fowner, &gowner);
@@ -2268,6 +2257,7 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode,
while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask,
&iflags, &type)) {
+ uint32_t mask_matched;
if (!zfs_acl_valid_ace_type(type, iflags))
continue;
@@ -2275,6 +2265,11 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode,
if (ZTOV(zp)->v_type == VDIR && (iflags & ACE_INHERIT_ONLY_ACE))
continue;
+ /* Skip ACE if it does not affect any AoI */
+ mask_matched = (access_mask & *working_mode);
+ if (!mask_matched)
+ continue;
+
entry_type = (iflags & ACE_TYPE_FLAGS);
checkit = B_FALSE;
@@ -2313,14 +2308,24 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode,
}
if (checkit) {
- uint32_t mask_matched = (access_mask & *working_mode);
-
- if (mask_matched) {
- if (type == DENY)
- deny_mask |= mask_matched;
-
- *working_mode &= ~mask_matched;
+ if (type == DENY) {
+ DTRACE_PROBE3(zfs__ace__denies,
+ znode_t *, zp,
+ zfs_ace_hdr_t *, acep,
+ uint32_t, mask_matched);
+ deny_mask |= mask_matched;
+ } else {
+ DTRACE_PROBE3(zfs__ace__allows,
+ znode_t *, zp,
+ zfs_ace_hdr_t *, acep,
+ uint32_t, mask_matched);
+ if (anyaccess) {
+ mutex_exit(&zp->z_acl_lock);
+ zfs_acl_free(aclp);
+ return (0);
+ }
}
+ *working_mode &= ~mask_matched;
}
/* Are we done? */
@@ -2342,6 +2347,69 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode,
return (0);
}
+/*
+ * Return true if any access whatsoever granted, we don't actually
+ * care what access is granted.
+ */
+boolean_t
+zfs_has_access(znode_t *zp, cred_t *cr)
+{
+ uint32_t have = ACE_ALL_PERMS;
+
+ if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) {
+ uid_t owner;
+
+ owner = zfs_fuid_map_id(zp->z_zfsvfs,
+ zp->z_phys->zp_uid, cr, ZFS_OWNER);
+
+ return (
+ secpolicy_vnode_access(cr, ZTOV(zp), owner, VREAD) == 0 ||
+ secpolicy_vnode_access(cr, ZTOV(zp), owner, VWRITE) == 0 ||
+ secpolicy_vnode_access(cr, ZTOV(zp), owner, VEXEC) == 0 ||
+ secpolicy_vnode_chown(cr, B_TRUE) == 0 ||
+ secpolicy_vnode_chown(cr, B_FALSE) == 0 ||
+ secpolicy_vnode_setdac(cr, owner) == 0 ||
+ secpolicy_vnode_remove(cr) == 0);
+ }
+ return (B_TRUE);
+}
+
+static int
+zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode,
+ boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int err;
+
+ *working_mode = v4_mode;
+ *check_privs = B_TRUE;
+
+ /*
+ * Short circuit empty requests
+ */
+ if (v4_mode == 0 || zfsvfs->z_replay) {
+ *working_mode = 0;
+ return (0);
+ }
+
+ if ((err = zfs_zaccess_dataset_check(zp, v4_mode)) != 0) {
+ *check_privs = B_FALSE;
+ return (err);
+ }
+
+ /*
+ * The caller requested that the ACL check be skipped. This
+ * would only happen if the caller checked VOP_ACCESS() with a
+ * 32 bit ACE mask and already had the appropriate permissions.
+ */
+ if (skipaclchk) {
+ *working_mode = 0;
+ return (0);
+ }
+
+ return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr));
+}
+
static int
zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs,
cred_t *cr)
diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c
index 0da026100..27c2c51a3 100644
--- a/module/zfs/zfs_ctldir.c
+++ b/module/zfs/zfs_ctldir.c
@@ -114,12 +114,16 @@ snapentry_compare(const void *a, const void *b)
vnodeops_t *zfsctl_ops_root;
vnodeops_t *zfsctl_ops_snapdir;
vnodeops_t *zfsctl_ops_snapshot;
+vnodeops_t *zfsctl_ops_shares;
+vnodeops_t *zfsctl_ops_shares_dir;
static const fs_operation_def_t zfsctl_tops_root[];
static const fs_operation_def_t zfsctl_tops_snapdir[];
static const fs_operation_def_t zfsctl_tops_snapshot[];
+static const fs_operation_def_t zfsctl_tops_shares[];
static vnode_t *zfsctl_mknode_snapdir(vnode_t *);
+static vnode_t *zfsctl_mknode_shares(vnode_t *);
static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset);
static int zfsctl_unmount_snap(zfs_snapentry_t *, int, cred_t *);
@@ -127,14 +131,18 @@ static gfs_opsvec_t zfsctl_opsvec[] = {
{ ".zfs", zfsctl_tops_root, &zfsctl_ops_root },
{ ".zfs/snapshot", zfsctl_tops_snapdir, &zfsctl_ops_snapdir },
{ ".zfs/snapshot/vnode", zfsctl_tops_snapshot, &zfsctl_ops_snapshot },
+ { ".zfs/shares", zfsctl_tops_shares, &zfsctl_ops_shares_dir },
+ { ".zfs/shares/vnode", zfsctl_tops_shares, &zfsctl_ops_shares },
{ NULL }
};
/*
- * Root directory elements. We have only a single static entry, 'snapshot'.
+ * Root directory elements. We only have two entries
+ * snapshot and shares.
*/
static gfs_dirent_t zfsctl_root_entries[] = {
{ "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE },
+ { "shares", zfsctl_mknode_shares, GFS_CACHE_VNODE },
{ NULL }
};
@@ -166,21 +174,34 @@ zfsctl_fini(void)
vn_freevnodeops(zfsctl_ops_snapdir);
if (zfsctl_ops_snapshot)
vn_freevnodeops(zfsctl_ops_snapshot);
+ if (zfsctl_ops_shares)
+ vn_freevnodeops(zfsctl_ops_shares);
+ if (zfsctl_ops_shares_dir)
+ vn_freevnodeops(zfsctl_ops_shares_dir);
zfsctl_ops_root = NULL;
zfsctl_ops_snapdir = NULL;
zfsctl_ops_snapshot = NULL;
+ zfsctl_ops_shares = NULL;
+ zfsctl_ops_shares_dir = NULL;
}
/*
- * Return the inode number associated with the 'snapshot' directory.
+ * Return the inode number associated with the 'snapshot' or
+ * 'shares' directory.
*/
/* ARGSUSED */
static ino64_t
zfsctl_root_inode_cb(vnode_t *vp, int index)
{
- ASSERT(index == 0);
- return (ZFSCTL_INO_SNAPDIR);
+ zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+
+ ASSERT(index <= 2);
+
+ if (index == 0)
+ return (ZFSCTL_INO_SNAPDIR);
+
+ return (zfsvfs->z_shares_dir);
}
/*
@@ -348,6 +369,30 @@ zfsctl_common_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
return (0);
}
+
+/*ARGSUSED*/
+static int
+zfsctl_shares_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
+{
+ zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+ znode_t *dzp;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+
+ if (zfsvfs->z_shares_dir == 0) {
+ ZFS_EXIT(zfsvfs);
+ return (ENOTSUP);
+ }
+
+ if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
+ error = VOP_FID(ZTOV(dzp), fidp, ct);
+ VN_RELE(ZTOV(dzp));
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
/*
* .zfs inode namespace
*
@@ -478,7 +523,7 @@ zfsctl_unmount_snap(zfs_snapentry_t *sep, int fflags, cred_t *cr)
VN_RELE(svp);
return (error);
}
- VFS_RELE(svp->v_vfsp);
+
/*
* We can't use VN_RELE(), as that will try to invoke
* zfsctl_snapdir_inactive(), which would cause us to destroy
@@ -691,7 +736,7 @@ zfsctl_snapdir_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp,
return (err);
if (err == 0) {
- err = dmu_objset_snapshot(name, dirname, B_FALSE);
+ err = dmu_objset_snapshot(name, dirname, NULL, B_FALSE);
if (err)
return (err);
err = lookupnameat(dirname, seg, follow, NULL, vpp, dvp);
@@ -732,9 +777,6 @@ zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
ASSERT(dvp->v_type == VDIR);
- if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0)
- return (0);
-
/*
* If we get a recursive call, that means we got called
* from the domount() code while it was trying to look up the
@@ -746,6 +788,11 @@ zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
ZFS_ENTER(zfsvfs);
+ if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) {
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
if (flags & FIGNORECASE) {
boolean_t conflict = B_FALSE;
@@ -844,7 +891,7 @@ domount:
* Return the mounted root rather than the covered mount point.
* Takes the GFS vnode at .zfs/snapshot/<snapname> and returns
* the ZFS vnode mounted on top of the GFS node. This ZFS
- * vnode is the root the newly created vfsp.
+ * vnode is the root of the newly created vfsp.
*/
VFS_RELE(vfsp);
err = traverse(vpp);
@@ -879,6 +926,37 @@ domount:
/* ARGSUSED */
static int
+zfsctl_shares_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
+ int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
+ int *direntflags, pathname_t *realpnp)
+{
+ zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
+ znode_t *dzp;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+
+ if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) {
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ if (zfsvfs->z_shares_dir == 0) {
+ ZFS_EXIT(zfsvfs);
+ return (ENOTSUP);
+ }
+ if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0)
+ error = VOP_LOOKUP(ZTOV(dzp), nm, vpp, pnp,
+ flags, rdir, cr, ct, direntflags, realpnp);
+
+ VN_RELE(ZTOV(dzp));
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
+/* ARGSUSED */
+static int
zfsctl_snapdir_readdir_cb(vnode_t *vp, void *dp, int *eofp,
offset_t *offp, offset_t *nextp, void *data, int flags)
{
@@ -921,6 +999,33 @@ zfsctl_snapdir_readdir_cb(vnode_t *vp, void *dp, int *eofp,
return (0);
}
+/* ARGSUSED */
+static int
+zfsctl_shares_readdir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp,
+ caller_context_t *ct, int flags)
+{
+ zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+ znode_t *dzp;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+
+ if (zfsvfs->z_shares_dir == 0) {
+ ZFS_EXIT(zfsvfs);
+ return (ENOTSUP);
+ }
+ if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
+ error = VOP_READDIR(ZTOV(dzp), uiop, cr, eofp, ct, flags);
+ VN_RELE(ZTOV(dzp));
+ } else {
+ *eofp = 1;
+ error = ENOENT;
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
/*
* pvp is the '.zfs' directory (zfsctl_node_t).
* Creates vp, which is '.zfs/snapshot' (zfsctl_snapdir_t).
@@ -946,6 +1051,45 @@ zfsctl_mknode_snapdir(vnode_t *pvp)
return (vp);
}
+vnode_t *
+zfsctl_mknode_shares(vnode_t *pvp)
+{
+ vnode_t *vp;
+ zfsctl_node_t *sdp;
+
+ vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp,
+ zfsctl_ops_shares, NULL, NULL, MAXNAMELEN,
+ NULL, NULL);
+ sdp = vp->v_data;
+ sdp->zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime;
+ return (vp);
+
+}
+
+/* ARGSUSED */
+static int
+zfsctl_shares_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+ znode_t *dzp;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ if (zfsvfs->z_shares_dir == 0) {
+ ZFS_EXIT(zfsvfs);
+ return (ENOTSUP);
+ }
+ if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
+ error = VOP_GETATTR(ZTOV(dzp), vap, flags, cr, ct);
+ VN_RELE(ZTOV(dzp));
+ }
+ ZFS_EXIT(zfsvfs);
+ return (error);
+
+
+}
+
/* ARGSUSED */
static int
zfsctl_snapdir_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
@@ -996,6 +1140,20 @@ static const fs_operation_def_t zfsctl_tops_snapdir[] = {
{ NULL }
};
+static const fs_operation_def_t zfsctl_tops_shares[] = {
+ { VOPNAME_OPEN, { .vop_open = zfsctl_common_open } },
+ { VOPNAME_CLOSE, { .vop_close = zfsctl_common_close } },
+ { VOPNAME_IOCTL, { .error = fs_inval } },
+ { VOPNAME_GETATTR, { .vop_getattr = zfsctl_shares_getattr } },
+ { VOPNAME_ACCESS, { .vop_access = zfsctl_common_access } },
+ { VOPNAME_READDIR, { .vop_readdir = zfsctl_shares_readdir } },
+ { VOPNAME_LOOKUP, { .vop_lookup = zfsctl_shares_lookup } },
+ { VOPNAME_SEEK, { .vop_seek = fs_seek } },
+ { VOPNAME_INACTIVE, { .vop_inactive = gfs_vop_inactive } },
+ { VOPNAME_FID, { .vop_fid = zfsctl_shares_fid } },
+ { NULL }
+};
+
/*
* pvp is the GFS vnode '.zfs/snapshot'.
*
@@ -1013,7 +1171,6 @@ zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset)
zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL);
zcp = vp->v_data;
zcp->zc_id = objset;
- VFS_HOLD(vp->v_vfsp);
return (vp);
}
@@ -1052,7 +1209,6 @@ zfsctl_snapshot_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
mutex_exit(&sdp->sd_lock);
VN_RELE(dvp);
- VFS_RELE(vp->v_vfsp);
/*
* Dispose of the vnode for the snapshot mount point.
diff --git a/module/zfs/zfs_dir.c b/module/zfs/zfs_dir.c
index 9353d0199..b3f768328 100644
--- a/module/zfs/zfs_dir.c
+++ b/module/zfs/zfs_dir.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -805,44 +805,49 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr)
znode_t *xzp;
dmu_tx_t *tx;
int error;
- zfs_fuid_info_t *fuidp = NULL;
+ zfs_acl_ids_t acl_ids;
+ boolean_t fuid_dirtied;
*xvpp = NULL;
if (error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr))
return (error);
+ if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL,
+ &acl_ids)) != 0)
+ return (error);
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
+ zfs_acl_ids_free(&acl_ids);
+ return (EDQUOT);
+ }
+
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_bonus(tx, zp->z_id);
dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
- if (IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))) {
- if (zfsvfs->z_fuid_obj == 0) {
- dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
- FUID_SIZE_ESTIMATE(zfsvfs));
- dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
- } else {
- dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
- dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
- FUID_SIZE_ESTIMATE(zfsvfs));
- }
- }
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) {
+ zfs_acl_ids_free(&acl_ids);
if (error == ERESTART)
dmu_tx_wait(tx);
dmu_tx_abort(tx);
return (error);
}
- zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, 0, NULL, &fuidp);
+ zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, 0, &acl_ids);
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
ASSERT(xzp->z_phys->zp_parent == zp->z_id);
dmu_buf_will_dirty(zp->z_dbuf, tx);
zp->z_phys->zp_xattr = xzp->z_id;
(void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp,
- xzp, "", NULL, fuidp, vap);
- if (fuidp)
- zfs_fuid_info_free(fuidp);
+ xzp, "", NULL, acl_ids.z_fuidp, vap);
+
+ zfs_acl_ids_free(&acl_ids);
dmu_tx_commit(tx);
*xvpp = ZTOV(xzp);
diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c
index 236d69e7e..8b7785fa8 100644
--- a/module/zfs/zfs_fm.c
+++ b/module/zfs/zfs_fm.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -96,7 +96,6 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
nvlist_t *ereport, *detector;
uint64_t ena;
char class[64];
- int state;
/*
* If we are doing a spa_tryimport(), ignore errors.
@@ -130,15 +129,39 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
return;
/*
- * If the vdev has already been marked as failing due to a
- * failed probe, then ignore any subsequent I/O errors, as the
- * DE will automatically fault the vdev on the first such
- * failure.
+ * If this I/O is not a retry I/O, don't post an ereport.
+ * Otherwise, we risk making bad diagnoses based on B_FAILFAST
+ * I/Os.
*/
- if (vd != NULL &&
- (!vdev_readable(vd) || !vdev_writeable(vd)) &&
- strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) != 0)
+ if (zio->io_error == EIO &&
+ !(zio->io_flags & ZIO_FLAG_IO_RETRY))
return;
+
+ if (vd != NULL) {
+ /*
+ * If the vdev has already been marked as failing due
+ * to a failed probe, then ignore any subsequent I/O
+ * errors, as the DE will automatically fault the vdev
+ * on the first such failure. This also catches cases
+ * where vdev_remove_wanted is set and the device has
+ * not yet been asynchronously placed into the REMOVED
+ * state.
+ */
+ if (zio->io_vd == vd &&
+ !vdev_accessible(vd, zio) &&
+ strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) != 0)
+ return;
+
+ /*
+ * Ignore checksum errors for reads from DTL regions of
+ * leaf vdevs.
+ */
+ if (zio->io_type == ZIO_TYPE_READ &&
+ zio->io_error == ECKSUM &&
+ vd->vdev_ops->vdev_op_leaf &&
+ vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1))
+ return;
+ }
}
if ((ereport = fm_nvlist_create(NULL)) == NULL)
@@ -189,21 +212,13 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
*/
/*
- * If we are importing a faulted pool, then we treat it like an open,
- * not an import. Otherwise, the DE will ignore all faults during
- * import, since the default behavior is to mark the devices as
- * persistently unavailable, not leave them in the faulted state.
- */
- state = spa->spa_import_faulted ? SPA_LOAD_OPEN : spa->spa_load_state;
-
- /*
* Generic payload members common to all ereports.
*/
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL,
DATA_TYPE_STRING, spa_name(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
DATA_TYPE_UINT64, spa_guid(spa),
FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32,
- state, NULL);
+ spa->spa_load_state, NULL);
if (spa != NULL) {
fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE,
@@ -222,14 +237,18 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
DATA_TYPE_UINT64, vd->vdev_guid,
FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL);
- if (vd->vdev_path)
+ if (vd->vdev_path != NULL)
fm_payload_set(ereport,
FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH,
DATA_TYPE_STRING, vd->vdev_path, NULL);
- if (vd->vdev_devid)
+ if (vd->vdev_devid != NULL)
fm_payload_set(ereport,
FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID,
DATA_TYPE_STRING, vd->vdev_devid, NULL);
+ if (vd->vdev_fru != NULL)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU,
+ DATA_TYPE_STRING, vd->vdev_fru, NULL);
if (pvd != NULL) {
fm_payload_set(ereport,
diff --git a/module/zfs/zfs_fuid.c b/module/zfs/zfs_fuid.c
index 286dafba8..8e481dffb 100644
--- a/module/zfs/zfs_fuid.c
+++ b/module/zfs/zfs_fuid.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -47,8 +47,10 @@
* During file system initialization the nvlist(s) are read and
* two AVL trees are created. One tree is keyed by the index number
* and the other by the domain string. Nodes are never removed from
- * trees, but new entries may be added. If a new entry is added then the
- * on-disk packed nvlist will also be updated.
+ * trees, but new entries may be added. If a new entry is added then
+ * the zfsvfs->z_fuid_dirty flag is set to true and the caller will then
+ * be responsible for calling zfs_fuid_sync() to sync the changes to disk.
+ *
*/
#define FUID_IDX "fuid_idx"
@@ -97,6 +99,15 @@ domain_compare(const void *arg1, const void *arg2)
return (val > 0 ? 1 : -1);
}
+void
+zfs_fuid_avl_tree_create(avl_tree_t *idx_tree, avl_tree_t *domain_tree)
+{
+ avl_create(idx_tree, idx_compare,
+ sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_idxnode));
+ avl_create(domain_tree, domain_compare,
+ sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_domnode));
+}
+
/*
* load initial fuid domain and idx trees. This function is used by
* both the kernel and zdb.
@@ -108,12 +119,9 @@ zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree,
dmu_buf_t *db;
uint64_t fuid_size;
- avl_create(idx_tree, idx_compare,
- sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_idxnode));
- avl_create(domain_tree, domain_compare,
- sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_domnode));
-
- VERIFY(0 == dmu_bonus_hold(os, fuid_obj, FTAG, &db));
+ ASSERT(fuid_obj != 0);
+ VERIFY(0 == dmu_bonus_hold(os, fuid_obj,
+ FTAG, &db));
fuid_size = *(uint64_t *)db->db_data;
dmu_buf_rele(db, FTAG);
@@ -125,7 +133,8 @@ zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree,
int i;
packed = kmem_alloc(fuid_size, KM_SLEEP);
- VERIFY(dmu_read(os, fuid_obj, 0, fuid_size, packed) == 0);
+ VERIFY(dmu_read(os, fuid_obj, 0,
+ fuid_size, packed, DMU_READ_PREFETCH) == 0);
VERIFY(nvlist_unpack(packed, fuid_size,
&nvp, 0) == 0);
VERIFY(nvlist_lookup_nvlist_array(nvp, FUID_NVP_ARRAY,
@@ -189,10 +198,8 @@ zfs_fuid_idx_domain(avl_tree_t *idx_tree, uint32_t idx)
* Load the fuid table(s) into memory.
*/
static void
-zfs_fuid_init(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
+zfs_fuid_init(zfsvfs_t *zfsvfs)
{
- int error = 0;
-
rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER);
if (zfsvfs->z_fuid_loaded) {
@@ -200,41 +207,101 @@ zfs_fuid_init(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
return;
}
- if (zfsvfs->z_fuid_obj == 0) {
-
- /* first make sure we need to allocate object */
-
- error = zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ,
- ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj);
- if (error == ENOENT && tx != NULL) {
- zfsvfs->z_fuid_obj = dmu_object_alloc(zfsvfs->z_os,
- DMU_OT_FUID, 1 << 14, DMU_OT_FUID_SIZE,
- sizeof (uint64_t), tx);
- VERIFY(zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
- ZFS_FUID_TABLES, sizeof (uint64_t), 1,
- &zfsvfs->z_fuid_obj, tx) == 0);
- }
- }
+ zfs_fuid_avl_tree_create(&zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain);
+ (void) zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ,
+ ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj);
if (zfsvfs->z_fuid_obj != 0) {
zfsvfs->z_fuid_size = zfs_fuid_table_load(zfsvfs->z_os,
zfsvfs->z_fuid_obj, &zfsvfs->z_fuid_idx,
&zfsvfs->z_fuid_domain);
- zfsvfs->z_fuid_loaded = B_TRUE;
}
+ zfsvfs->z_fuid_loaded = B_TRUE;
+ rw_exit(&zfsvfs->z_fuid_lock);
+}
+
+/*
+ * sync out AVL trees to persistent storage.
+ */
+void
+zfs_fuid_sync(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
+{
+ nvlist_t *nvp;
+ nvlist_t **fuids;
+ size_t nvsize = 0;
+ char *packed;
+ dmu_buf_t *db;
+ fuid_domain_t *domnode;
+ int numnodes;
+ int i;
+
+ if (!zfsvfs->z_fuid_dirty) {
+ return;
+ }
+
+ rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER);
+
+ /*
+ * First see if table needs to be created?
+ */
+ if (zfsvfs->z_fuid_obj == 0) {
+ zfsvfs->z_fuid_obj = dmu_object_alloc(zfsvfs->z_os,
+ DMU_OT_FUID, 1 << 14, DMU_OT_FUID_SIZE,
+ sizeof (uint64_t), tx);
+ VERIFY(zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
+ ZFS_FUID_TABLES, sizeof (uint64_t), 1,
+ &zfsvfs->z_fuid_obj, tx) == 0);
+ }
+
+ VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ numnodes = avl_numnodes(&zfsvfs->z_fuid_idx);
+ fuids = kmem_alloc(numnodes * sizeof (void *), KM_SLEEP);
+ for (i = 0, domnode = avl_first(&zfsvfs->z_fuid_domain); domnode; i++,
+ domnode = AVL_NEXT(&zfsvfs->z_fuid_domain, domnode)) {
+ VERIFY(nvlist_alloc(&fuids[i], NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_uint64(fuids[i], FUID_IDX,
+ domnode->f_idx) == 0);
+ VERIFY(nvlist_add_uint64(fuids[i], FUID_OFFSET, 0) == 0);
+ VERIFY(nvlist_add_string(fuids[i], FUID_DOMAIN,
+ domnode->f_ksid->kd_name) == 0);
+ }
+ VERIFY(nvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY,
+ fuids, numnodes) == 0);
+ for (i = 0; i != numnodes; i++)
+ nvlist_free(fuids[i]);
+ kmem_free(fuids, numnodes * sizeof (void *));
+ VERIFY(nvlist_size(nvp, &nvsize, NV_ENCODE_XDR) == 0);
+ packed = kmem_alloc(nvsize, KM_SLEEP);
+ VERIFY(nvlist_pack(nvp, &packed, &nvsize,
+ NV_ENCODE_XDR, KM_SLEEP) == 0);
+ nvlist_free(nvp);
+ zfsvfs->z_fuid_size = nvsize;
+ dmu_write(zfsvfs->z_os, zfsvfs->z_fuid_obj, 0,
+ zfsvfs->z_fuid_size, packed, tx);
+ kmem_free(packed, zfsvfs->z_fuid_size);
+ VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, zfsvfs->z_fuid_obj,
+ FTAG, &db));
+ dmu_buf_will_dirty(db, tx);
+ *(uint64_t *)db->db_data = zfsvfs->z_fuid_size;
+ dmu_buf_rele(db, FTAG);
+
+ zfsvfs->z_fuid_dirty = B_FALSE;
rw_exit(&zfsvfs->z_fuid_lock);
}
/*
* Query domain table for a given domain.
*
- * If domain isn't found it is added to AVL trees and
- * the results are pushed out to disk.
+ * If domain isn't found and addok is set, it is added to AVL trees and
+ * the zfsvfs->z_fuid_dirty flag will be set to TRUE. It will then be
+ * necessary for the caller or another thread to detect the dirty table
+ * and sync out the changes.
*/
int
-zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain, char **retdomain,
- dmu_tx_t *tx)
+zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain,
+ char **retdomain, boolean_t addok)
{
fuid_domain_t searchnode, *findnode;
avl_index_t loc;
@@ -246,16 +313,16 @@ zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain, char **retdomain,
* for the user nobody.
*/
if (domain[0] == '\0') {
- *retdomain = nulldomain;
+ if (retdomain)
+ *retdomain = nulldomain;
return (0);
}
searchnode.f_ksid = ksid_lookupdomain(domain);
- if (retdomain) {
+ if (retdomain)
*retdomain = searchnode.f_ksid->kd_name;
- }
if (!zfsvfs->z_fuid_loaded)
- zfs_fuid_init(zfsvfs, tx);
+ zfs_fuid_init(zfsvfs);
retry:
rw_enter(&zfsvfs->z_fuid_lock, rw);
@@ -265,15 +332,9 @@ retry:
rw_exit(&zfsvfs->z_fuid_lock);
ksiddomain_rele(searchnode.f_ksid);
return (findnode->f_idx);
- } else {
+ } else if (addok) {
fuid_domain_t *domnode;
- nvlist_t *nvp;
- nvlist_t **fuids;
uint64_t retidx;
- size_t nvsize = 0;
- char *packed;
- dmu_buf_t *db;
- int i = 0;
if (rw == RW_READER && !rw_tryupgrade(&zfsvfs->z_fuid_lock)) {
rw_exit(&zfsvfs->z_fuid_lock);
@@ -288,46 +349,11 @@ retry:
avl_add(&zfsvfs->z_fuid_domain, domnode);
avl_add(&zfsvfs->z_fuid_idx, domnode);
- /*
- * Now resync the on-disk nvlist.
- */
- VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-
- domnode = avl_first(&zfsvfs->z_fuid_domain);
- fuids = kmem_alloc(retidx * sizeof (void *), KM_SLEEP);
- while (domnode) {
- VERIFY(nvlist_alloc(&fuids[i],
- NV_UNIQUE_NAME, KM_SLEEP) == 0);
- VERIFY(nvlist_add_uint64(fuids[i], FUID_IDX,
- domnode->f_idx) == 0);
- VERIFY(nvlist_add_uint64(fuids[i],
- FUID_OFFSET, 0) == 0);
- VERIFY(nvlist_add_string(fuids[i++], FUID_DOMAIN,
- domnode->f_ksid->kd_name) == 0);
- domnode = AVL_NEXT(&zfsvfs->z_fuid_domain, domnode);
- }
- VERIFY(nvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY,
- fuids, retidx) == 0);
- for (i = 0; i != retidx; i++)
- nvlist_free(fuids[i]);
- kmem_free(fuids, retidx * sizeof (void *));
- VERIFY(nvlist_size(nvp, &nvsize, NV_ENCODE_XDR) == 0);
- packed = kmem_alloc(nvsize, KM_SLEEP);
- VERIFY(nvlist_pack(nvp, &packed, &nvsize,
- NV_ENCODE_XDR, KM_SLEEP) == 0);
- nvlist_free(nvp);
- zfsvfs->z_fuid_size = nvsize;
- dmu_write(zfsvfs->z_os, zfsvfs->z_fuid_obj, 0,
- zfsvfs->z_fuid_size, packed, tx);
- kmem_free(packed, zfsvfs->z_fuid_size);
- VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, zfsvfs->z_fuid_obj,
- FTAG, &db));
- dmu_buf_will_dirty(db, tx);
- *(uint64_t *)db->db_data = zfsvfs->z_fuid_size;
- dmu_buf_rele(db, FTAG);
-
+ zfsvfs->z_fuid_dirty = B_TRUE;
rw_exit(&zfsvfs->z_fuid_lock);
return (retidx);
+ } else {
+ return (-1);
}
}
@@ -337,7 +363,7 @@ retry:
* Returns a pointer from an avl node of the domain string.
*
*/
-static char *
+const char *
zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx)
{
char *domain;
@@ -346,7 +372,7 @@ zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx)
return (NULL);
if (!zfsvfs->z_fuid_loaded)
- zfs_fuid_init(zfsvfs, NULL);
+ zfs_fuid_init(zfsvfs);
rw_enter(&zfsvfs->z_fuid_lock, RW_READER);
@@ -374,7 +400,7 @@ zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid,
cred_t *cr, zfs_fuid_type_t type)
{
uint32_t index = FUID_INDEX(fuid);
- char *domain;
+ const char *domain;
uid_t id;
if (index == 0)
@@ -439,6 +465,7 @@ zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid,
}
if (type == ZFS_ACE_USER || type == ZFS_ACE_GROUP) {
+
/*
* Now allocate fuid entry and add it on the end of the list
*/
@@ -463,7 +490,7 @@ zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid,
*/
uint64_t
zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type,
- dmu_tx_t *tx, cred_t *cr, zfs_fuid_info_t **fuidp)
+ cred_t *cr, zfs_fuid_info_t **fuidp)
{
uint64_t idx;
ksid_t *ksid;
@@ -490,7 +517,7 @@ zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type,
rid = ksid_getrid(ksid);
domain = ksid_getdomain(ksid);
- idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, tx);
+ idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE);
zfs_fuid_node_add(fuidp, kdomain, rid, idx, id, type);
@@ -511,7 +538,7 @@ zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type,
*/
uint64_t
zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr,
- zfs_fuid_type_t type, dmu_tx_t *tx, zfs_fuid_info_t **fuidpp)
+ zfs_fuid_type_t type, zfs_fuid_info_t **fuidpp)
{
const char *domain;
char *kdomain;
@@ -581,10 +608,11 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr,
}
}
- idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, tx);
+ idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE);
if (!zfsvfs->z_replay)
- zfs_fuid_node_add(fuidpp, kdomain, rid, idx, id, type);
+ zfs_fuid_node_add(fuidpp, kdomain,
+ rid, idx, id, type);
else if (zfuid != NULL) {
list_remove(&fuidp->z_fuids, zfuid);
kmem_free(zfuid, sizeof (zfs_fuid_t));
@@ -658,16 +686,15 @@ boolean_t
zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr)
{
ksid_t *ksid = crgetsid(cr, KSID_GROUP);
+ ksidlist_t *ksidlist = crgetsidlist(cr);
uid_t gid;
- if (ksid) {
+ if (ksid && ksidlist) {
int i;
ksid_t *ksid_groups;
- ksidlist_t *ksidlist = crgetsidlist(cr);
uint32_t idx = FUID_INDEX(id);
uint32_t rid = FUID_RID(id);
- ASSERT(ksidlist);
ksid_groups = ksidlist->ksl_sids;
for (i = 0; i != ksidlist->ksl_nsid; i++) {
@@ -677,7 +704,7 @@ zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr)
return (B_TRUE);
}
} else {
- char *domain;
+ const char *domain;
domain = zfs_fuid_find_by_idx(zfsvfs, idx);
ASSERT(domain != NULL);
@@ -700,4 +727,19 @@ zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr)
gid = zfs_fuid_map_id(zfsvfs, id, cr, ZFS_GROUP);
return (groupmember(gid, cr));
}
+
+void
+zfs_fuid_txhold(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
+{
+ if (zfsvfs->z_fuid_obj == 0) {
+ dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+ FUID_SIZE_ESTIMATE(zfsvfs));
+ dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
+ } else {
+ dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
+ dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
+ FUID_SIZE_ESTIMATE(zfsvfs));
+ }
+}
#endif
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index d032648b5..07dd03c35 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -79,17 +79,29 @@ dev_info_t *zfs_dip;
typedef int zfs_ioc_func_t(zfs_cmd_t *);
typedef int zfs_secpolicy_func_t(zfs_cmd_t *, cred_t *);
+typedef enum {
+ NO_NAME,
+ POOL_NAME,
+ DATASET_NAME
+} zfs_ioc_namecheck_t;
+
typedef struct zfs_ioc_vec {
zfs_ioc_func_t *zvec_func;
zfs_secpolicy_func_t *zvec_secpolicy;
- enum {
- NO_NAME,
- POOL_NAME,
- DATASET_NAME
- } zvec_namecheck;
+ zfs_ioc_namecheck_t zvec_namecheck;
boolean_t zvec_his_log;
+ boolean_t zvec_pool_check;
} zfs_ioc_vec_t;
+/* This array is indexed by zfs_userquota_prop_t */
+static const char *userquota_perms[] = {
+ ZFS_DELEG_PERM_USERUSED,
+ ZFS_DELEG_PERM_USERQUOTA,
+ ZFS_DELEG_PERM_GROUPUSED,
+ ZFS_DELEG_PERM_GROUPQUOTA,
+};
+
+static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc);
static void clear_props(char *dataset, nvlist_t *props, nvlist_t *newprops);
static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *,
boolean_t *);
@@ -389,6 +401,30 @@ zfs_secpolicy_send(zfs_cmd_t *zc, cred_t *cr)
ZFS_DELEG_PERM_SEND, cr));
}
+static int
+zfs_secpolicy_deleg_share(zfs_cmd_t *zc, cred_t *cr)
+{
+ vnode_t *vp;
+ int error;
+
+ if ((error = lookupname(zc->zc_value, UIO_SYSSPACE,
+ NO_FOLLOW, NULL, &vp)) != 0)
+ return (error);
+
+ /* Now make sure mntpnt and dataset are ZFS */
+
+ if (vp->v_vfsp->vfs_fstype != zfsfstype ||
+ (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource),
+ zc->zc_name) != 0)) {
+ VN_RELE(vp);
+ return (EPERM);
+ }
+
+ VN_RELE(vp);
+ return (dsl_deleg_access(zc->zc_name,
+ ZFS_DELEG_PERM_SHARE, cr));
+}
+
int
zfs_secpolicy_share(zfs_cmd_t *zc, cred_t *cr)
{
@@ -398,25 +434,20 @@ zfs_secpolicy_share(zfs_cmd_t *zc, cred_t *cr)
if (secpolicy_nfs(cr) == 0) {
return (0);
} else {
- vnode_t *vp;
- int error;
-
- if ((error = lookupname(zc->zc_value, UIO_SYSSPACE,
- NO_FOLLOW, NULL, &vp)) != 0)
- return (error);
-
- /* Now make sure mntpnt and dataset are ZFS */
+ return (zfs_secpolicy_deleg_share(zc, cr));
+ }
+}
- if (vp->v_vfsp->vfs_fstype != zfsfstype ||
- (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource),
- zc->zc_name) != 0)) {
- VN_RELE(vp);
- return (EPERM);
- }
+int
+zfs_secpolicy_smb_acl(zfs_cmd_t *zc, cred_t *cr)
+{
+ if (!INGLOBALZONE(curproc))
+ return (EPERM);
- VN_RELE(vp);
- return (dsl_deleg_access(zc->zc_name,
- ZFS_DELEG_PERM_SHARE, cr));
+ if (secpolicy_smb(cr) == 0) {
+ return (0);
+ } else {
+ return (zfs_secpolicy_deleg_share(zc, cr));
}
}
@@ -681,11 +712,60 @@ zfs_secpolicy_inherit(zfs_cmd_t *zc, cred_t *cr)
}
}
+static int
+zfs_secpolicy_userspace_one(zfs_cmd_t *zc, cred_t *cr)
+{
+ int err = zfs_secpolicy_read(zc, cr);
+ if (err)
+ return (err);
+
+ if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS)
+ return (EINVAL);
+
+ if (zc->zc_value[0] == 0) {
+ /*
+ * They are asking about a posix uid/gid. If it's
+ * themself, allow it.
+ */
+ if (zc->zc_objset_type == ZFS_PROP_USERUSED ||
+ zc->zc_objset_type == ZFS_PROP_USERQUOTA) {
+ if (zc->zc_guid == crgetuid(cr))
+ return (0);
+ } else {
+ if (groupmember(zc->zc_guid, cr))
+ return (0);
+ }
+ }
+
+ return (zfs_secpolicy_write_perms(zc->zc_name,
+ userquota_perms[zc->zc_objset_type], cr));
+}
+
+static int
+zfs_secpolicy_userspace_many(zfs_cmd_t *zc, cred_t *cr)
+{
+ int err = zfs_secpolicy_read(zc, cr);
+ if (err)
+ return (err);
+
+ if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS)
+ return (EINVAL);
+
+ return (zfs_secpolicy_write_perms(zc->zc_name,
+ userquota_perms[zc->zc_objset_type], cr));
+}
+
+static int
+zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, cred_t *cr)
+{
+ return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION, cr));
+}
+
/*
* Returns the nvlist as specified by the user in the zfs_cmd_t.
*/
static int
-get_nvlist(uint64_t nvl, uint64_t size, nvlist_t **nvp)
+get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp)
{
char *packed;
int error;
@@ -699,7 +779,8 @@ get_nvlist(uint64_t nvl, uint64_t size, nvlist_t **nvp)
packed = kmem_alloc(size, KM_SLEEP);
- if ((error = xcopyin((void *)(uintptr_t)nvl, packed, size)) != 0) {
+ if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size,
+ iflag)) != 0) {
kmem_free(packed, size);
return (error);
}
@@ -730,8 +811,8 @@ put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl)
packed = kmem_alloc(size, KM_SLEEP);
VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE,
KM_SLEEP) == 0);
- error = xcopyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst,
- size);
+ error = ddi_copyout(packed,
+ (void *)(uintptr_t)zc->zc_nvlist_dst, size, zc->zc_iflags);
kmem_free(packed, size);
}
@@ -740,6 +821,69 @@ put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl)
}
static int
+getzfsvfs(const char *dsname, zfsvfs_t **zvp)
+{
+ objset_t *os;
+ int error;
+
+ error = dmu_objset_open(dsname, DMU_OST_ZFS,
+ DS_MODE_USER | DS_MODE_READONLY, &os);
+ if (error)
+ return (error);
+
+ mutex_enter(&os->os->os_user_ptr_lock);
+ *zvp = dmu_objset_get_user(os);
+ if (*zvp) {
+ VFS_HOLD((*zvp)->z_vfs);
+ } else {
+ error = ESRCH;
+ }
+ mutex_exit(&os->os->os_user_ptr_lock);
+ dmu_objset_close(os);
+ return (error);
+}
+
+/*
+ * Find a zfsvfs_t for a mounted filesystem, or create our own, in which
+ * case its z_vfs will be NULL, and it will be opened as the owner.
+ */
+static int
+zfsvfs_hold(const char *name, boolean_t readonly, void *tag, zfsvfs_t **zvp)
+{
+ int error = 0;
+ int mode = DS_MODE_OWNER | (readonly ? DS_MODE_READONLY : 0);
+
+ if (getzfsvfs(name, zvp) != 0)
+ error = zfsvfs_create(name, mode, zvp);
+ if (error == 0) {
+ rrw_enter(&(*zvp)->z_teardown_lock, RW_READER, tag);
+ if ((*zvp)->z_unmounted) {
+ /*
+ * XXX we could probably try again, since the unmounting
+ * thread should be just about to disassociate the
+ * objset from the zfsvfs.
+ */
+ rrw_exit(&(*zvp)->z_teardown_lock, tag);
+ return (EBUSY);
+ }
+ }
+ return (error);
+}
+
+static void
+zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag)
+{
+ rrw_exit(&zfsvfs->z_teardown_lock, tag);
+
+ if (zfsvfs->z_vfs) {
+ VFS_RELE(zfsvfs->z_vfs);
+ } else {
+ dmu_objset_close(zfsvfs->z_os);
+ zfsvfs_free(zfsvfs);
+ }
+}
+
+static int
zfs_ioc_pool_create(zfs_cmd_t *zc)
{
int error;
@@ -749,11 +893,12 @@ zfs_ioc_pool_create(zfs_cmd_t *zc)
char *buf;
if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
- &config))
+ zc->zc_iflags, &config))
return (error);
if (zc->zc_nvlist_src_size != 0 && (error =
- get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, &props))) {
+ get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &props))) {
nvlist_free(config);
return (error);
}
@@ -825,11 +970,12 @@ zfs_ioc_pool_import(zfs_cmd_t *zc)
uint64_t guid;
if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
- &config)) != 0)
+ zc->zc_iflags, &config)) != 0)
return (error);
if (zc->zc_nvlist_src_size != 0 && (error =
- get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, &props))) {
+ get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &props))) {
nvlist_free(config);
return (error);
}
@@ -838,7 +984,7 @@ zfs_ioc_pool_import(zfs_cmd_t *zc)
guid != zc->zc_guid)
error = EINVAL;
else if (zc->zc_cookie)
- error = spa_import_faulted(zc->zc_name, config,
+ error = spa_import_verbatim(zc->zc_name, config,
props);
else
error = spa_import(zc->zc_name, config, props);
@@ -917,7 +1063,7 @@ zfs_ioc_pool_tryimport(zfs_cmd_t *zc)
int error;
if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
- &tryconfig)) != 0)
+ zc->zc_iflags, &tryconfig)) != 0)
return (error);
config = spa_tryimport(tryconfig);
@@ -1005,9 +1151,9 @@ zfs_ioc_pool_get_history(zfs_cmd_t *zc)
hist_buf = kmem_alloc(size, KM_SLEEP);
if ((error = spa_history_get(spa, &zc->zc_history_offset,
&zc->zc_history_len, hist_buf)) == 0) {
- error = xcopyout(hist_buf,
- (char *)(uintptr_t)zc->zc_history,
- zc->zc_history_len);
+ error = ddi_copyout(hist_buf,
+ (void *)(uintptr_t)zc->zc_history,
+ zc->zc_history_len, zc->zc_iflags);
}
spa_close(spa, FTAG);
@@ -1055,7 +1201,7 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc)
return (error);
error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
- &config);
+ zc->zc_iflags, &config);
(void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_L2CACHE,
&l2cache, &nl2cache);
@@ -1145,7 +1291,7 @@ zfs_ioc_vdev_attach(zfs_cmd_t *zc)
return (error);
if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
- &config)) == 0) {
+ zc->zc_iflags, &config)) == 0) {
error = spa_vdev_attach(spa, zc->zc_guid, config, replacing);
nvlist_free(config);
}
@@ -1186,6 +1332,23 @@ zfs_ioc_vdev_setpath(zfs_cmd_t *zc)
return (error);
}
+static int
+zfs_ioc_vdev_setfru(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ char *fru = zc->zc_value;
+ uint64_t guid = zc->zc_guid;
+ int error;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = spa_vdev_setfru(spa, guid, fru);
+ spa_close(spa, FTAG);
+ return (error);
+}
+
/*
* inputs:
* zc_name name of filesystem
@@ -1291,6 +1454,23 @@ zfs_ioc_objset_zplprops(zfs_cmd_t *zc)
return (err);
}
+static boolean_t
+dataset_name_hidden(const char *name)
+{
+ /*
+ * Skip over datasets that are not visible in this zone,
+ * internal datasets (which have a $ in their name), and
+ * temporary datasets (which have a % in their name).
+ */
+ if (strchr(name, '$') != NULL)
+ return (B_TRUE);
+ if (strchr(name, '%') != NULL)
+ return (B_TRUE);
+ if (!INGLOBALZONE(curproc) && !zone_dataset_visible(name, NULL))
+ return (B_TRUE);
+ return (B_FALSE);
+}
+
/*
* inputs:
* zc_name name of filesystem
@@ -1299,6 +1479,7 @@ zfs_ioc_objset_zplprops(zfs_cmd_t *zc)
*
* outputs:
* zc_name name of next filesystem
+ * zc_cookie zap cursor
* zc_objset_stats stats
* zc_nvlist_dst property nvlist
* zc_nvlist_dst_size size of property nvlist
@@ -1322,12 +1503,16 @@ zfs_ioc_dataset_list_next(zfs_cmd_t *zc)
(void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name));
p = zc->zc_name + strlen(zc->zc_name);
+ /*
+ * Pre-fetch the datasets. dmu_objset_prefetch() always returns 0
+ * but is not declared void because its called by dmu_objset_find().
+ */
if (zc->zc_cookie == 0) {
uint64_t cookie = 0;
int len = sizeof (zc->zc_name) - (p - zc->zc_name);
while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0)
- dmu_objset_prefetch(p, NULL);
+ (void) dmu_objset_prefetch(p, NULL);
}
do {
@@ -1336,15 +1521,10 @@ zfs_ioc_dataset_list_next(zfs_cmd_t *zc)
NULL, &zc->zc_cookie);
if (error == ENOENT)
error = ESRCH;
- } while (error == 0 && !INGLOBALZONE(curproc) &&
- !zone_dataset_visible(zc->zc_name, NULL));
+ } while (error == 0 && dataset_name_hidden(zc->zc_name));
dmu_objset_close(os);
- /*
- * If it's a hidden dataset (ie. with a '$' in its name), don't
- * try to get stats for it. Userland will skip over it.
- */
- if (error == 0 && strchr(zc->zc_name, '$') == NULL)
+ if (error == 0)
error = zfs_ioc_objset_stats(zc); /* fill in the stats */
return (error);
@@ -1373,9 +1553,10 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc)
if (error)
return (error == ENOENT ? ESRCH : error);
- if (zc->zc_cookie == 0)
- dmu_objset_find(zc->zc_name, dmu_objset_prefetch,
+ if (zc->zc_cookie == 0) {
+ (void) dmu_objset_find(zc->zc_name, dmu_objset_prefetch,
NULL, DS_FIND_SNAPSHOTS);
+ }
/*
* A dataset name of maximum length cannot have any snapshots,
* so exit immediately.
@@ -1404,9 +1585,11 @@ int
zfs_set_prop_nvlist(const char *name, nvlist_t *nvl)
{
nvpair_t *elem;
- int error;
+ int error = 0;
uint64_t intval;
char *strval;
+ nvlist_t *genericnvl;
+ boolean_t issnap = (strchr(name, '@') != NULL);
/*
* First validate permission to set all of the properties
@@ -1421,16 +1604,35 @@ zfs_set_prop_nvlist(const char *name, nvlist_t *nvl)
* If this is a user-defined property, it must be a
* string, and there is no further validation to do.
*/
- if (!zfs_prop_user(propname) ||
- nvpair_type(elem) != DATA_TYPE_STRING)
- return (EINVAL);
+ if (zfs_prop_user(propname) &&
+ nvpair_type(elem) == DATA_TYPE_STRING) {
+ if (error = zfs_secpolicy_write_perms(name,
+ ZFS_DELEG_PERM_USERPROP, CRED()))
+ return (error);
+ continue;
+ }
- if (error = zfs_secpolicy_write_perms(name,
- ZFS_DELEG_PERM_USERPROP, CRED()))
- return (error);
- continue;
+ if (!issnap && zfs_prop_userquota(propname) &&
+ nvpair_type(elem) == DATA_TYPE_UINT64_ARRAY) {
+ const char *perm;
+ const char *up = zfs_userquota_prop_prefixes
+ [ZFS_PROP_USERQUOTA];
+ if (strncmp(propname, up, strlen(up)) == 0)
+ perm = ZFS_DELEG_PERM_USERQUOTA;
+ else
+ perm = ZFS_DELEG_PERM_GROUPQUOTA;
+ if (error = zfs_secpolicy_write_perms(name,
+ perm, CRED()))
+ return (error);
+ continue;
+ }
+
+ return (EINVAL);
}
+ if (issnap)
+ return (EINVAL);
+
if ((error = zfs_secpolicy_setprop(name, prop, CRED())) != 0)
return (error);
@@ -1466,8 +1668,7 @@ zfs_set_prop_nvlist(const char *name, nvlist_t *nvl)
break;
case ZFS_PROP_COPIES:
- if (zfs_earlier_version(name,
- SPA_VERSION_DITTO_BLOCKS))
+ if (zfs_earlier_version(name, SPA_VERSION_DITTO_BLOCKS))
return (ENOTSUP);
break;
@@ -1486,77 +1687,122 @@ zfs_set_prop_nvlist(const char *name, nvlist_t *nvl)
}
}
+ VERIFY(nvlist_alloc(&genericnvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
elem = NULL;
while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
const char *propname = nvpair_name(elem);
zfs_prop_t prop = zfs_name_to_prop(propname);
if (prop == ZPROP_INVAL) {
- VERIFY(nvpair_value_string(elem, &strval) == 0);
- error = dsl_prop_set(name, propname, 1,
- strlen(strval) + 1, strval);
- if (error == 0)
- continue;
- else
- return (error);
+ if (zfs_prop_userquota(propname)) {
+ uint64_t *valary;
+ unsigned int vallen;
+ const char *domain;
+ zfs_userquota_prop_t type;
+ uint64_t rid;
+ uint64_t quota;
+ zfsvfs_t *zfsvfs;
+
+ VERIFY(nvpair_value_uint64_array(elem,
+ &valary, &vallen) == 0);
+ VERIFY(vallen == 3);
+ type = valary[0];
+ rid = valary[1];
+ quota = valary[2];
+ domain = propname +
+ strlen(zfs_userquota_prop_prefixes[type]);
+
+ error = zfsvfs_hold(name, B_FALSE, FTAG,
+ &zfsvfs);
+ if (error == 0) {
+ error = zfs_set_userquota(zfsvfs,
+ type, domain, rid, quota);
+ zfsvfs_rele(zfsvfs, FTAG);
+ }
+ if (error == 0)
+ continue;
+ else
+ goto out;
+ } else if (zfs_prop_user(propname)) {
+ VERIFY(nvpair_value_string(elem, &strval) == 0);
+ error = dsl_prop_set(name, propname, 1,
+ strlen(strval) + 1, strval);
+ if (error == 0)
+ continue;
+ else
+ goto out;
+ }
}
switch (prop) {
case ZFS_PROP_QUOTA:
if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
(error = dsl_dir_set_quota(name, intval)) != 0)
- return (error);
+ goto out;
break;
case ZFS_PROP_REFQUOTA:
if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
(error = dsl_dataset_set_quota(name, intval)) != 0)
- return (error);
+ goto out;
break;
case ZFS_PROP_RESERVATION:
if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
(error = dsl_dir_set_reservation(name,
intval)) != 0)
- return (error);
+ goto out;
break;
case ZFS_PROP_REFRESERVATION:
if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
(error = dsl_dataset_set_reservation(name,
intval)) != 0)
- return (error);
+ goto out;
break;
case ZFS_PROP_VOLSIZE:
if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
(error = zvol_set_volsize(name,
ddi_driver_major(zfs_dip), intval)) != 0)
- return (error);
+ goto out;
break;
case ZFS_PROP_VOLBLOCKSIZE:
if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
(error = zvol_set_volblocksize(name, intval)) != 0)
- return (error);
+ goto out;
break;
case ZFS_PROP_VERSION:
- if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
- (error = zfs_set_version(name, intval)) != 0)
- return (error);
+ {
+ zfsvfs_t *zfsvfs;
+
+ if ((error = nvpair_value_uint64(elem, &intval)) != 0)
+ goto out;
+ if ((error = zfsvfs_hold(name, B_FALSE, FTAG,
+ &zfsvfs)) != 0)
+ goto out;
+ error = zfs_set_version(zfsvfs, intval);
+ zfsvfs_rele(zfsvfs, FTAG);
+
+ if (error == 0 && intval >= ZPL_VERSION_USERSPACE) {
+ zfs_cmd_t zc = { 0 };
+ (void) strcpy(zc.zc_name, name);
+ (void) zfs_ioc_userspace_upgrade(&zc);
+ }
+ if (error)
+ goto out;
break;
+ }
default:
if (nvpair_type(elem) == DATA_TYPE_STRING) {
if (zfs_prop_get_type(prop) !=
- PROP_TYPE_STRING)
- return (EINVAL);
- VERIFY(nvpair_value_string(elem, &strval) == 0);
- if ((error = dsl_prop_set(name,
- nvpair_name(elem), 1, strlen(strval) + 1,
- strval)) != 0)
- return (error);
+ PROP_TYPE_STRING) {
+ error = EINVAL;
+ goto out;
+ }
} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
const char *unused;
@@ -1566,35 +1812,72 @@ zfs_set_prop_nvlist(const char *name, nvlist_t *nvl)
case PROP_TYPE_NUMBER:
break;
case PROP_TYPE_STRING:
- return (EINVAL);
+ error = EINVAL;
+ goto out;
case PROP_TYPE_INDEX:
if (zfs_prop_index_to_string(prop,
- intval, &unused) != 0)
- return (EINVAL);
+ intval, &unused) != 0) {
+ error = EINVAL;
+ goto out;
+ }
break;
default:
cmn_err(CE_PANIC,
"unknown property type");
break;
}
-
- if ((error = dsl_prop_set(name, propname,
- 8, 1, &intval)) != 0)
- return (error);
} else {
- return (EINVAL);
+ error = EINVAL;
+ goto out;
}
- break;
+ if ((error = nvlist_add_nvpair(genericnvl, elem)) != 0)
+ goto out;
}
}
+ if (nvlist_next_nvpair(genericnvl, NULL) != NULL) {
+ error = dsl_props_set(name, genericnvl);
+ }
+out:
+ nvlist_free(genericnvl);
+ return (error);
+}
+
+/*
+ * Check that all the properties are valid user properties.
+ */
+static int
+zfs_check_userprops(char *fsname, nvlist_t *nvl)
+{
+ nvpair_t *elem = NULL;
+ int error = 0;
+
+ while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
+ const char *propname = nvpair_name(elem);
+ char *valstr;
+
+ if (!zfs_prop_user(propname) ||
+ nvpair_type(elem) != DATA_TYPE_STRING)
+ return (EINVAL);
+
+ if (error = zfs_secpolicy_write_perms(fsname,
+ ZFS_DELEG_PERM_USERPROP, CRED()))
+ return (error);
+
+ if (strlen(propname) >= ZAP_MAXNAMELEN)
+ return (ENAMETOOLONG);
+
+ VERIFY(nvpair_value_string(elem, &valstr) == 0);
+ if (strlen(valstr) >= ZAP_MAXVALUELEN)
+ return (E2BIG);
+ }
return (0);
}
/*
* inputs:
* zc_name name of filesystem
- * zc_value name of property to inherit
+ * zc_value name of property to set
* zc_nvlist_src{_size} nvlist of properties to apply
* zc_cookie clear existing local props?
*
@@ -1607,7 +1890,7 @@ zfs_ioc_set_prop(zfs_cmd_t *zc)
int error;
if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
- &nvl)) != 0)
+ zc->zc_iflags, &nvl)) != 0)
return (error);
if (zc->zc_cookie) {
@@ -1654,7 +1937,7 @@ zfs_ioc_pool_set_props(zfs_cmd_t *zc)
nvpair_t *elem;
if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
- &props)))
+ zc->zc_iflags, &props)))
return (error);
/*
@@ -1731,7 +2014,7 @@ zfs_ioc_iscsi_perm_check(zfs_cmd_t *zc)
cred_t *usercred;
if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
- &nvp)) != 0) {
+ zc->zc_iflags, &nvp)) != 0) {
return (error);
}
@@ -1781,7 +2064,7 @@ zfs_ioc_set_fsacl(zfs_cmd_t *zc)
nvlist_t *fsaclnv = NULL;
if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
- &fsaclnv)) != 0)
+ zc->zc_iflags, &fsaclnv)) != 0)
return (error);
/*
@@ -1918,11 +2201,10 @@ zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
* processing.
*/
static int
-zfs_fill_zplprops_impl(objset_t *os, uint64_t default_zplver,
+zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver,
boolean_t fuids_ok, nvlist_t *createprops, nvlist_t *zplprops,
boolean_t *is_ci)
{
- uint64_t zplver = default_zplver;
uint64_t sense = ZFS_PROP_UNDEFINED;
uint64_t norm = ZFS_PROP_UNDEFINED;
uint64_t u8 = ZFS_PROP_UNDEFINED;
@@ -2010,6 +2292,8 @@ zfs_fill_zplprops(const char *dataset, nvlist_t *createprops,
ASSERT(cp != NULL);
cp[0] = '\0';
+ if (zfs_earlier_version(dataset, SPA_VERSION_USERSPACE))
+ zplver = ZPL_VERSION_USERSPACE - 1;
if (zfs_earlier_version(dataset, SPA_VERSION_FUID)) {
zplver = ZPL_VERSION_FUID - 1;
fuids_ok = B_FALSE;
@@ -2085,7 +2369,7 @@ zfs_ioc_create(zfs_cmd_t *zc)
if (zc->zc_nvlist_src != NULL &&
(error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
- &nvprops)) != 0)
+ zc->zc_iflags, &nvprops)) != 0)
return (error);
zct.zct_zplprops = NULL;
@@ -2188,32 +2472,12 @@ zfs_ioc_create(zfs_cmd_t *zc)
return (error);
}
-struct snap_prop_arg {
- nvlist_t *nvprops;
- const char *snapname;
-};
-
-static int
-set_snap_props(char *name, void *arg)
-{
- struct snap_prop_arg *snpa = arg;
- int len = strlen(name) + strlen(snpa->snapname) + 2;
- char *buf = kmem_alloc(len, KM_SLEEP);
- int err;
-
- (void) snprintf(buf, len, "%s@%s", name, snpa->snapname);
- err = zfs_set_prop_nvlist(buf, snpa->nvprops);
- if (err)
- (void) dmu_objset_destroy(buf);
- kmem_free(buf, len);
- return (err);
-}
-
/*
* inputs:
* zc_name name of filesystem
* zc_value short name of snapshot
* zc_cookie recursive flag
+ * zc_nvlist_src[_size] property list
*
* outputs: none
*/
@@ -2229,29 +2493,23 @@ zfs_ioc_snapshot(zfs_cmd_t *zc)
if (zc->zc_nvlist_src != NULL &&
(error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
- &nvprops)) != 0)
+ zc->zc_iflags, &nvprops)) != 0)
return (error);
- error = dmu_objset_snapshot(zc->zc_name, zc->zc_value, recursive);
+ error = zfs_check_userprops(zc->zc_name, nvprops);
+ if (error)
+ goto out;
- /*
- * It would be nice to do this atomically.
- */
- if (error == 0) {
- struct snap_prop_arg snpa;
- snpa.nvprops = nvprops;
- snpa.snapname = zc->zc_value;
- if (recursive) {
- error = dmu_objset_find(zc->zc_name,
- set_snap_props, &snpa, DS_FIND_CHILDREN);
- if (error) {
- (void) dmu_snapshots_destroy(zc->zc_name,
- zc->zc_value);
- }
- } else {
- error = set_snap_props(zc->zc_name, &snpa);
- }
+ if (nvprops != NULL && nvlist_next_nvpair(nvprops, NULL) != NULL &&
+ zfs_earlier_version(zc->zc_name, SPA_VERSION_SNAP_PROPS)) {
+ error = ENOTSUP;
+ goto out;
}
+
+ error = dmu_objset_snapshot(zc->zc_name, zc->zc_value,
+ nvprops, recursive);
+
+out:
nvlist_free(nvprops);
return (error);
}
@@ -2355,31 +2613,19 @@ zfs_ioc_rollback(zfs_cmd_t *zc)
if (error)
return (error);
- if (dmu_objset_type(os) == DMU_OST_ZFS) {
- mutex_enter(&os->os->os_user_ptr_lock);
- zfsvfs = dmu_objset_get_user(os);
- if (zfsvfs != NULL)
- VFS_HOLD(zfsvfs->z_vfs);
- mutex_exit(&os->os->os_user_ptr_lock);
- }
-
- if (zfsvfs != NULL) {
- char *osname;
+ if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) {
int mode;
- osname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
- error = zfs_suspend_fs(zfsvfs, osname, &mode);
+ error = zfs_suspend_fs(zfsvfs, NULL, &mode);
if (error == 0) {
int resume_err;
- ASSERT(strcmp(osname, zc->zc_name) == 0);
error = dmu_objset_rollback(os);
- resume_err = zfs_resume_fs(zfsvfs, osname, mode);
+ resume_err = zfs_resume_fs(zfsvfs, zc->zc_name, mode);
error = error ? error : resume_err;
} else {
dmu_objset_close(os);
}
- kmem_free(osname, MAXNAMELEN);
VFS_RELE(zfsvfs->z_vfs);
} else {
error = dmu_objset_rollback(os);
@@ -2484,7 +2730,7 @@ zfs_ioc_recv(zfs_cmd_t *zc)
if (zc->zc_nvlist_src != NULL &&
(error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
- &props)) != 0)
+ zc->zc_iflags, &props)) != 0)
return (error);
fd = zc->zc_cookie;
@@ -2494,32 +2740,26 @@ zfs_ioc_recv(zfs_cmd_t *zc)
return (EBADF);
}
- if (dmu_objset_open(tofs, DMU_OST_ANY,
- DS_MODE_USER | DS_MODE_READONLY, &os) == 0) {
- /*
- * Try to get the zfsvfs for the receiving objset.
- * There won't be one if we're operating on a zvol,
- * if the objset doesn't exist yet, or is not mounted.
- */
- mutex_enter(&os->os->os_user_ptr_lock);
- if (zfsvfs = dmu_objset_get_user(os)) {
- if (!mutex_tryenter(&zfsvfs->z_online_recv_lock)) {
- mutex_exit(&os->os->os_user_ptr_lock);
- dmu_objset_close(os);
- zfsvfs = NULL;
- error = EBUSY;
- goto out;
- }
- VFS_HOLD(zfsvfs->z_vfs);
+ if (getzfsvfs(tofs, &zfsvfs) == 0) {
+ if (!mutex_tryenter(&zfsvfs->z_online_recv_lock)) {
+ VFS_RELE(zfsvfs->z_vfs);
+ zfsvfs = NULL;
+ error = EBUSY;
+ goto out;
}
- mutex_exit(&os->os->os_user_ptr_lock);
-
/*
* If new properties are supplied, they are to completely
* replace the existing ones, so stash away the existing ones.
*/
if (props)
- (void) dsl_prop_get_all(os, &origprops, TRUE);
+ (void) dsl_prop_get_all(zfsvfs->z_os, &origprops, TRUE);
+ } else if (props && dmu_objset_open(tofs, DMU_OST_ANY,
+ DS_MODE_USER | DS_MODE_READONLY, &os) == 0) {
+ /*
+ * Get the props even if there was no zfsvfs (zvol or
+ * unmounted zpl).
+ */
+ (void) dsl_prop_get_all(os, &origprops, TRUE);
dmu_objset_close(os);
}
@@ -2759,11 +2999,12 @@ zfs_ioc_clear(zfs_cmd_t *zc)
/*
* Resume any suspended I/Os.
*/
- zio_resume(spa);
+ if (zio_resume(spa) != 0)
+ error = EIO;
spa_close(spa, FTAG);
- return (0);
+ return (error);
}
/*
@@ -2791,6 +3032,120 @@ zfs_ioc_promote(zfs_cmd_t *zc)
}
/*
+ * Retrieve a single {user|group}{used|quota}@... property.
+ *
+ * inputs:
+ * zc_name name of filesystem
+ * zc_objset_type zfs_userquota_prop_t
+ * zc_value domain name (eg. "S-1-234-567-89")
+ * zc_guid RID/UID/GID
+ *
+ * outputs:
+ * zc_cookie property value
+ */
+static int
+zfs_ioc_userspace_one(zfs_cmd_t *zc)
+{
+ zfsvfs_t *zfsvfs;
+ int error;
+
+ if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS)
+ return (EINVAL);
+
+ error = zfsvfs_hold(zc->zc_name, B_TRUE, FTAG, &zfsvfs);
+ if (error)
+ return (error);
+
+ error = zfs_userspace_one(zfsvfs,
+ zc->zc_objset_type, zc->zc_value, zc->zc_guid, &zc->zc_cookie);
+ zfsvfs_rele(zfsvfs, FTAG);
+
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_cookie zap cursor
+ * zc_objset_type zfs_userquota_prop_t
+ * zc_nvlist_dst[_size] buffer to fill (not really an nvlist)
+ *
+ * outputs:
+ * zc_nvlist_dst[_size] data buffer (array of zfs_useracct_t)
+ * zc_cookie zap cursor
+ */
+static int
+zfs_ioc_userspace_many(zfs_cmd_t *zc)
+{
+ zfsvfs_t *zfsvfs;
+ int error;
+
+ error = zfsvfs_hold(zc->zc_name, B_TRUE, FTAG, &zfsvfs);
+ if (error)
+ return (error);
+
+ int bufsize = zc->zc_nvlist_dst_size;
+ void *buf = kmem_alloc(bufsize, KM_SLEEP);
+
+ error = zfs_userspace_many(zfsvfs, zc->zc_objset_type, &zc->zc_cookie,
+ buf, &zc->zc_nvlist_dst_size);
+
+ if (error == 0) {
+ error = xcopyout(buf,
+ (void *)(uintptr_t)zc->zc_nvlist_dst,
+ zc->zc_nvlist_dst_size);
+ }
+ kmem_free(buf, bufsize);
+ zfsvfs_rele(zfsvfs, FTAG);
+
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ *
+ * outputs:
+ * none
+ */
+static int
+zfs_ioc_userspace_upgrade(zfs_cmd_t *zc)
+{
+ objset_t *os;
+ int error;
+ zfsvfs_t *zfsvfs;
+
+ if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) {
+ if (!dmu_objset_userused_enabled(zfsvfs->z_os->os)) {
+ /*
+ * If userused is not enabled, it may be because the
+ * objset needs to be closed & reopened (to grow the
+ * objset_phys_t). Suspend/resume the fs will do that.
+ */
+ int mode;
+ error = zfs_suspend_fs(zfsvfs, NULL, &mode);
+ if (error == 0) {
+ error = zfs_resume_fs(zfsvfs,
+ zc->zc_name, mode);
+ }
+ }
+ if (error == 0)
+ error = dmu_objset_userspace_upgrade(zfsvfs->z_os);
+ VFS_RELE(zfsvfs->z_vfs);
+ } else {
+ error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
+ DS_MODE_USER, &os);
+ if (error)
+ return (error);
+
+ error = dmu_objset_userspace_upgrade(os);
+ dmu_objset_close(os);
+ }
+
+ return (error);
+}
+
+/*
* We don't want to have a hard dependency
* against some special symbols in sharefs
* nfs, and smbsrv. Determine them if needed when
@@ -2903,7 +3258,7 @@ zfs_ioc_share(zfs_cmd_t *zc)
if (error = zsmbexport_fs((void *)
(uintptr_t)zc->zc_share.z_exportdata,
zc->zc_share.z_sharetype == ZFS_SHARE_SMB ?
- B_TRUE : B_FALSE)) {
+ B_TRUE: B_FALSE)) {
return (error);
}
break;
@@ -2924,64 +3279,280 @@ zfs_ioc_share(zfs_cmd_t *zc)
}
+ace_t full_access[] = {
+ {(uid_t)-1, ACE_ALL_PERMS, ACE_EVERYONE, 0}
+};
+
+/*
+ * Remove all ACL files in shares dir
+ */
+static int
+zfs_smb_acl_purge(znode_t *dzp)
+{
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ int error;
+
+ for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);
+ (error = zap_cursor_retrieve(&zc, &zap)) == 0;
+ zap_cursor_advance(&zc)) {
+ if ((error = VOP_REMOVE(ZTOV(dzp), zap.za_name, kcred,
+ NULL, 0)) != 0)
+ break;
+ }
+ zap_cursor_fini(&zc);
+ return (error);
+}
+
+static int
+zfs_ioc_smb_acl(zfs_cmd_t *zc)
+{
+ vnode_t *vp;
+ znode_t *dzp;
+ vnode_t *resourcevp = NULL;
+ znode_t *sharedir;
+ zfsvfs_t *zfsvfs;
+ nvlist_t *nvlist;
+ char *src, *target;
+ vattr_t vattr;
+ vsecattr_t vsec;
+ int error = 0;
+
+ if ((error = lookupname(zc->zc_value, UIO_SYSSPACE,
+ NO_FOLLOW, NULL, &vp)) != 0)
+ return (error);
+
+ /* Now make sure mntpnt and dataset are ZFS */
+
+ if (vp->v_vfsp->vfs_fstype != zfsfstype ||
+ (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource),
+ zc->zc_name) != 0)) {
+ VN_RELE(vp);
+ return (EINVAL);
+ }
+
+ dzp = VTOZ(vp);
+ zfsvfs = dzp->z_zfsvfs;
+ ZFS_ENTER(zfsvfs);
+
+ /*
+ * Create share dir if its missing.
+ */
+ mutex_enter(&zfsvfs->z_lock);
+ if (zfsvfs->z_shares_dir == 0) {
+ dmu_tx_t *tx;
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, TRUE,
+ ZFS_SHARES_DIR);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ } else {
+ error = zfs_create_share_dir(zfsvfs, tx);
+ dmu_tx_commit(tx);
+ }
+ if (error) {
+ mutex_exit(&zfsvfs->z_lock);
+ VN_RELE(vp);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+ mutex_exit(&zfsvfs->z_lock);
+
+ ASSERT(zfsvfs->z_shares_dir);
+ if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &sharedir)) != 0) {
+ VN_RELE(vp);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ switch (zc->zc_cookie) {
+ case ZFS_SMB_ACL_ADD:
+ vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
+ vattr.va_type = VREG;
+ vattr.va_mode = S_IFREG|0777;
+ vattr.va_uid = 0;
+ vattr.va_gid = 0;
+
+ vsec.vsa_mask = VSA_ACE;
+ vsec.vsa_aclentp = &full_access;
+ vsec.vsa_aclentsz = sizeof (full_access);
+ vsec.vsa_aclcnt = 1;
+
+ error = VOP_CREATE(ZTOV(sharedir), zc->zc_string,
+ &vattr, EXCL, 0, &resourcevp, kcred, 0, NULL, &vsec);
+ if (resourcevp)
+ VN_RELE(resourcevp);
+ break;
+
+ case ZFS_SMB_ACL_REMOVE:
+ error = VOP_REMOVE(ZTOV(sharedir), zc->zc_string, kcred,
+ NULL, 0);
+ break;
+
+ case ZFS_SMB_ACL_RENAME:
+ if ((error = get_nvlist(zc->zc_nvlist_src,
+ zc->zc_nvlist_src_size, zc->zc_iflags, &nvlist)) != 0) {
+ VN_RELE(vp);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ if (nvlist_lookup_string(nvlist, ZFS_SMB_ACL_SRC, &src) ||
+ nvlist_lookup_string(nvlist, ZFS_SMB_ACL_TARGET,
+ &target)) {
+ VN_RELE(vp);
+ VN_RELE(ZTOV(sharedir));
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ error = VOP_RENAME(ZTOV(sharedir), src, ZTOV(sharedir), target,
+ kcred, NULL, 0);
+ nvlist_free(nvlist);
+ break;
+
+ case ZFS_SMB_ACL_PURGE:
+ error = zfs_smb_acl_purge(sharedir);
+ break;
+
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ VN_RELE(vp);
+ VN_RELE(ZTOV(sharedir));
+
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
/*
* pool create, destroy, and export don't log the history as part of
* zfsdev_ioctl, but rather zfs_ioc_pool_create, and zfs_ioc_pool_export
* do the logging of those commands.
*/
static zfs_ioc_vec_t zfs_ioc_vec[] = {
- { zfs_ioc_pool_create, zfs_secpolicy_config, POOL_NAME, B_FALSE },
- { zfs_ioc_pool_destroy, zfs_secpolicy_config, POOL_NAME, B_FALSE },
- { zfs_ioc_pool_import, zfs_secpolicy_config, POOL_NAME, B_TRUE },
- { zfs_ioc_pool_export, zfs_secpolicy_config, POOL_NAME, B_FALSE },
- { zfs_ioc_pool_configs, zfs_secpolicy_none, NO_NAME, B_FALSE },
- { zfs_ioc_pool_stats, zfs_secpolicy_read, POOL_NAME, B_FALSE },
- { zfs_ioc_pool_tryimport, zfs_secpolicy_config, NO_NAME, B_FALSE },
- { zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, B_TRUE },
- { zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE },
- { zfs_ioc_pool_upgrade, zfs_secpolicy_config, POOL_NAME, B_TRUE },
- { zfs_ioc_pool_get_history, zfs_secpolicy_config, POOL_NAME, B_FALSE },
- { zfs_ioc_vdev_add, zfs_secpolicy_config, POOL_NAME, B_TRUE },
- { zfs_ioc_vdev_remove, zfs_secpolicy_config, POOL_NAME, B_TRUE },
- { zfs_ioc_vdev_set_state, zfs_secpolicy_config, POOL_NAME, B_TRUE },
- { zfs_ioc_vdev_attach, zfs_secpolicy_config, POOL_NAME, B_TRUE },
- { zfs_ioc_vdev_detach, zfs_secpolicy_config, POOL_NAME, B_TRUE },
- { zfs_ioc_vdev_setpath, zfs_secpolicy_config, POOL_NAME, B_FALSE },
- { zfs_ioc_objset_stats, zfs_secpolicy_read, DATASET_NAME, B_FALSE },
- { zfs_ioc_objset_zplprops, zfs_secpolicy_read, DATASET_NAME, B_FALSE },
- { zfs_ioc_dataset_list_next, zfs_secpolicy_read,
- DATASET_NAME, B_FALSE },
- { zfs_ioc_snapshot_list_next, zfs_secpolicy_read,
- DATASET_NAME, B_FALSE },
- { zfs_ioc_set_prop, zfs_secpolicy_none, DATASET_NAME, B_TRUE },
- { zfs_ioc_create_minor, zfs_secpolicy_minor, DATASET_NAME, B_FALSE },
- { zfs_ioc_remove_minor, zfs_secpolicy_minor, DATASET_NAME, B_FALSE },
- { zfs_ioc_create, zfs_secpolicy_create, DATASET_NAME, B_TRUE },
- { zfs_ioc_destroy, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE },
- { zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, B_TRUE },
- { zfs_ioc_rename, zfs_secpolicy_rename, DATASET_NAME, B_TRUE },
- { zfs_ioc_recv, zfs_secpolicy_receive, DATASET_NAME, B_TRUE },
- { zfs_ioc_send, zfs_secpolicy_send, DATASET_NAME, B_TRUE },
- { zfs_ioc_inject_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE },
- { zfs_ioc_clear_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE },
- { zfs_ioc_inject_list_next, zfs_secpolicy_inject, NO_NAME, B_FALSE },
- { zfs_ioc_error_log, zfs_secpolicy_inject, POOL_NAME, B_FALSE },
- { zfs_ioc_clear, zfs_secpolicy_config, POOL_NAME, B_TRUE },
- { zfs_ioc_promote, zfs_secpolicy_promote, DATASET_NAME, B_TRUE },
- { zfs_ioc_destroy_snaps, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE },
- { zfs_ioc_snapshot, zfs_secpolicy_snapshot, DATASET_NAME, B_TRUE },
- { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_config, POOL_NAME, B_FALSE },
- { zfs_ioc_obj_to_path, zfs_secpolicy_config, NO_NAME, B_FALSE },
- { zfs_ioc_pool_set_props, zfs_secpolicy_config, POOL_NAME, B_TRUE },
- { zfs_ioc_pool_get_props, zfs_secpolicy_read, POOL_NAME, B_FALSE },
- { zfs_ioc_set_fsacl, zfs_secpolicy_fsacl, DATASET_NAME, B_TRUE },
- { zfs_ioc_get_fsacl, zfs_secpolicy_read, DATASET_NAME, B_FALSE },
- { zfs_ioc_iscsi_perm_check, zfs_secpolicy_iscsi,
- DATASET_NAME, B_FALSE },
- { zfs_ioc_share, zfs_secpolicy_share, DATASET_NAME, B_FALSE },
- { zfs_ioc_inherit_prop, zfs_secpolicy_inherit, DATASET_NAME, B_TRUE },
+ { zfs_ioc_pool_create, zfs_secpolicy_config, POOL_NAME, B_FALSE,
+ B_FALSE },
+ { zfs_ioc_pool_destroy, zfs_secpolicy_config, POOL_NAME, B_FALSE,
+ B_FALSE },
+ { zfs_ioc_pool_import, zfs_secpolicy_config, POOL_NAME, B_TRUE,
+ B_FALSE },
+ { zfs_ioc_pool_export, zfs_secpolicy_config, POOL_NAME, B_FALSE,
+ B_FALSE },
+ { zfs_ioc_pool_configs, zfs_secpolicy_none, NO_NAME, B_FALSE,
+ B_FALSE },
+ { zfs_ioc_pool_stats, zfs_secpolicy_read, POOL_NAME, B_FALSE,
+ B_FALSE },
+ { zfs_ioc_pool_tryimport, zfs_secpolicy_config, NO_NAME, B_FALSE,
+ B_FALSE },
+ { zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, B_TRUE,
+ B_TRUE },
+ { zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE,
+ B_FALSE },
+ { zfs_ioc_pool_upgrade, zfs_secpolicy_config, POOL_NAME, B_TRUE,
+ B_TRUE },
+ { zfs_ioc_pool_get_history, zfs_secpolicy_config, POOL_NAME, B_FALSE,
+ B_FALSE },
+ { zfs_ioc_vdev_add, zfs_secpolicy_config, POOL_NAME, B_TRUE,
+ B_TRUE },
+ { zfs_ioc_vdev_remove, zfs_secpolicy_config, POOL_NAME, B_TRUE,
+ B_TRUE },
+ { zfs_ioc_vdev_set_state, zfs_secpolicy_config, POOL_NAME, B_TRUE,
+ B_FALSE },
+ { zfs_ioc_vdev_attach, zfs_secpolicy_config, POOL_NAME, B_TRUE,
+ B_TRUE },
+ { zfs_ioc_vdev_detach, zfs_secpolicy_config, POOL_NAME, B_TRUE,
+ B_TRUE },
+ { zfs_ioc_vdev_setpath, zfs_secpolicy_config, POOL_NAME, B_FALSE,
+ B_TRUE },
+ { zfs_ioc_vdev_setfru, zfs_secpolicy_config, POOL_NAME, B_FALSE,
+ B_TRUE },
+ { zfs_ioc_objset_stats, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
+ B_FALSE },
+ { zfs_ioc_objset_zplprops, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
+ B_FALSE },
+ { zfs_ioc_dataset_list_next, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
+ B_FALSE },
+ { zfs_ioc_snapshot_list_next, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
+ B_FALSE },
+ { zfs_ioc_set_prop, zfs_secpolicy_none, DATASET_NAME, B_TRUE, B_TRUE },
+ { zfs_ioc_create_minor, zfs_secpolicy_minor, DATASET_NAME, B_FALSE,
+ B_FALSE },
+ { zfs_ioc_remove_minor, zfs_secpolicy_minor, DATASET_NAME, B_FALSE,
+ B_FALSE },
+ { zfs_ioc_create, zfs_secpolicy_create, DATASET_NAME, B_TRUE, B_TRUE },
+ { zfs_ioc_destroy, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE,
+ B_TRUE},
+ { zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, B_TRUE,
+ B_TRUE },
+ { zfs_ioc_rename, zfs_secpolicy_rename, DATASET_NAME, B_TRUE, B_TRUE },
+ { zfs_ioc_recv, zfs_secpolicy_receive, DATASET_NAME, B_TRUE, B_TRUE },
+ { zfs_ioc_send, zfs_secpolicy_send, DATASET_NAME, B_TRUE, B_FALSE },
+ { zfs_ioc_inject_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE,
+ B_FALSE },
+ { zfs_ioc_clear_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE,
+ B_FALSE },
+ { zfs_ioc_inject_list_next, zfs_secpolicy_inject, NO_NAME, B_FALSE,
+ B_FALSE },
+ { zfs_ioc_error_log, zfs_secpolicy_inject, POOL_NAME, B_FALSE,
+ B_FALSE },
+ { zfs_ioc_clear, zfs_secpolicy_config, POOL_NAME, B_TRUE, B_FALSE },
+ { zfs_ioc_promote, zfs_secpolicy_promote, DATASET_NAME, B_TRUE,
+ B_TRUE },
+ { zfs_ioc_destroy_snaps, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE,
+ B_TRUE },
+ { zfs_ioc_snapshot, zfs_secpolicy_snapshot, DATASET_NAME, B_TRUE,
+ B_TRUE },
+ { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_config, POOL_NAME, B_FALSE,
+ B_FALSE },
+ { zfs_ioc_obj_to_path, zfs_secpolicy_config, NO_NAME, B_FALSE,
+ B_FALSE },
+ { zfs_ioc_pool_set_props, zfs_secpolicy_config, POOL_NAME, B_TRUE,
+ B_TRUE },
+ { zfs_ioc_pool_get_props, zfs_secpolicy_read, POOL_NAME, B_FALSE,
+ B_FALSE },
+ { zfs_ioc_set_fsacl, zfs_secpolicy_fsacl, DATASET_NAME, B_TRUE,
+ B_TRUE },
+ { zfs_ioc_get_fsacl, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
+ B_FALSE },
+ { zfs_ioc_iscsi_perm_check, zfs_secpolicy_iscsi, DATASET_NAME, B_FALSE,
+ B_FALSE },
+ { zfs_ioc_share, zfs_secpolicy_share, DATASET_NAME, B_FALSE, B_FALSE },
+ { zfs_ioc_inherit_prop, zfs_secpolicy_inherit, DATASET_NAME, B_TRUE,
+ B_TRUE },
+ { zfs_ioc_smb_acl, zfs_secpolicy_smb_acl, DATASET_NAME, B_FALSE,
+ B_FALSE },
+ { zfs_ioc_userspace_one, zfs_secpolicy_userspace_one,
+ DATASET_NAME, B_FALSE, B_FALSE },
+ { zfs_ioc_userspace_many, zfs_secpolicy_userspace_many,
+ DATASET_NAME, B_FALSE, B_FALSE },
+ { zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade,
+ DATASET_NAME, B_FALSE, B_TRUE },
};
+int
+pool_status_check(const char *name, zfs_ioc_namecheck_t type)
+{
+ spa_t *spa;
+ int error;
+
+ ASSERT(type == POOL_NAME || type == DATASET_NAME);
+
+ error = spa_open(name, &spa, FTAG);
+ if (error == 0) {
+ if (spa_suspended(spa))
+ error = EAGAIN;
+ spa_close(spa, FTAG);
+ }
+ return (error);
+}
+
static int
zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
{
@@ -3000,7 +3571,7 @@ zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
- error = xcopyin((void *)arg, zc, sizeof (zfs_cmd_t));
+ error = ddi_copyin((void *)arg, zc, sizeof (zfs_cmd_t), flag);
if (error == 0)
error = zfs_ioc_vec[vec].zvec_secpolicy(zc, cr);
@@ -3011,15 +3582,22 @@ zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
*/
if (error == 0) {
zc->zc_name[sizeof (zc->zc_name) - 1] = '\0';
+ zc->zc_iflags = flag & FKIOCTL;
switch (zfs_ioc_vec[vec].zvec_namecheck) {
case POOL_NAME:
if (pool_namecheck(zc->zc_name, NULL, NULL) != 0)
error = EINVAL;
+ if (zfs_ioc_vec[vec].zvec_pool_check)
+ error = pool_status_check(zc->zc_name,
+ zfs_ioc_vec[vec].zvec_namecheck);
break;
case DATASET_NAME:
if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0)
error = EINVAL;
+ if (zfs_ioc_vec[vec].zvec_pool_check)
+ error = pool_status_check(zc->zc_name,
+ zfs_ioc_vec[vec].zvec_namecheck);
break;
case NO_NAME:
@@ -3030,10 +3608,10 @@ zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
if (error == 0)
error = zfs_ioc_vec[vec].zvec_func(zc);
- rc = xcopyout(zc, (void *)arg, sizeof (zfs_cmd_t));
+ rc = ddi_copyout(zc, (void *)arg, sizeof (zfs_cmd_t), flag);
if (error == 0) {
error = rc;
- if (zfs_ioc_vec[vec].zvec_his_log == B_TRUE)
+ if (zfs_ioc_vec[vec].zvec_his_log)
zfs_log_history(zc);
}
diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c
index 84d64b4df..3f0b6b0ed 100644
--- a/module/zfs/zfs_log.c
+++ b/module/zfs/zfs_log.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -467,9 +467,6 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
*/
ssize_t zfs_immediate_write_sz = 32768;
-#define ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_trailer_t) - \
- sizeof (lr_write_t))
-
void
zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
znode_t *zp, offset_t off, ssize_t resid, int ioflag)
@@ -483,29 +480,6 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
- /*
- * Writes are handled in three different ways:
- *
- * WR_INDIRECT:
- * In this mode, if we need to commit the write later, then the block
- * is immediately written into the file system (using dmu_sync),
- * and a pointer to the block is put into the log record.
- * When the txg commits the block is linked in.
- * This saves additionally writing the data into the log record.
- * There are a few requirements for this to occur:
- * - write is greater than zfs_immediate_write_sz
- * - not using slogs (as slogs are assumed to always be faster
- * than writing into the main pool)
- * - the write occupies only one block
- * WR_COPIED:
- * If we know we'll immediately be committing the
- * transaction (FSYNC or FDSYNC), the we allocate a larger
- * log record here for the data and copy the data in.
- * WR_NEED_COPY:
- * Otherwise we don't allocate a buffer, and *if* we need to
- * flush the write later then a buffer is allocated and
- * we retrieve the data using the dmu.
- */
slogging = spa_has_slogs(zilog->zl_spa);
if (resid > zfs_immediate_write_sz && !slogging && resid <= zp->z_blksz)
write_state = WR_INDIRECT;
@@ -535,7 +509,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
(write_state == WR_COPIED ? len : 0));
lr = (lr_write_t *)&itx->itx_lr;
if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
- zp->z_id, off, len, lr + 1) != 0) {
+ zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
kmem_free(itx, offsetof(itx_t, itx_lr) +
itx->itx_lr.lrc_reclen);
itx = zil_itx_create(txtype, sizeof (*lr));
diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c
index 1bf1bc527..8a859b575 100644
--- a/module/zfs/zfs_vfsops.c
+++ b/module/zfs/zfs_vfsops.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -67,6 +67,8 @@ static major_t zfs_major;
static minor_t zfs_minor;
static kmutex_t zfs_dev_mtx;
+extern int sys_shutdown;
+
static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr);
static int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr);
static int zfs_mountroot(vfs_t *vfsp, enum whymountroot);
@@ -145,12 +147,24 @@ zfs_sync(vfs_t *vfsp, short flag, cred_t *cr)
* Sync a specific filesystem.
*/
zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ dsl_pool_t *dp;
ZFS_ENTER(zfsvfs);
+ dp = dmu_objset_pool(zfsvfs->z_os);
+
+ /*
+ * If the system is shutting down, then skip any
+ * filesystems which may exist on a suspended pool.
+ */
+ if (sys_shutdown && spa_suspended(dp->dp_spa)) {
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
if (zfsvfs->z_log != NULL)
zil_commit(zfsvfs->z_log, UINT64_MAX, 0);
else
- txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
+ txg_wait_synced(dp, 0);
ZFS_EXIT(zfsvfs);
} else {
/*
@@ -554,6 +568,393 @@ unregister:
}
+static void
+uidacct(objset_t *os, boolean_t isgroup, uint64_t fuid,
+ int64_t delta, dmu_tx_t *tx)
+{
+ uint64_t used = 0;
+ char buf[32];
+ int err;
+ uint64_t obj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
+
+ if (delta == 0)
+ return;
+
+ (void) snprintf(buf, sizeof (buf), "%llx", (longlong_t)fuid);
+ err = zap_lookup(os, obj, buf, 8, 1, &used);
+ ASSERT(err == 0 || err == ENOENT);
+ /* no underflow/overflow */
+ ASSERT(delta > 0 || used >= -delta);
+ ASSERT(delta < 0 || used + delta > used);
+ used += delta;
+ if (used == 0)
+ err = zap_remove(os, obj, buf, tx);
+ else
+ err = zap_update(os, obj, buf, 8, 1, &used, tx);
+ ASSERT(err == 0);
+}
+
+static void
+zfs_space_delta_cb(objset_t *os, dmu_object_type_t bonustype,
+ void *oldbonus, void *newbonus,
+ uint64_t oldused, uint64_t newused, dmu_tx_t *tx)
+{
+ znode_phys_t *oldznp = oldbonus;
+ znode_phys_t *newznp = newbonus;
+
+ if (bonustype != DMU_OT_ZNODE)
+ return;
+
+ /* We charge 512 for the dnode (if it's allocated). */
+ if (oldznp->zp_gen != 0)
+ oldused += DNODE_SIZE;
+ if (newznp->zp_gen != 0)
+ newused += DNODE_SIZE;
+
+ if (oldznp->zp_uid == newznp->zp_uid) {
+ uidacct(os, B_FALSE, oldznp->zp_uid, newused-oldused, tx);
+ } else {
+ uidacct(os, B_FALSE, oldznp->zp_uid, -oldused, tx);
+ uidacct(os, B_FALSE, newznp->zp_uid, newused, tx);
+ }
+
+ if (oldznp->zp_gid == newznp->zp_gid) {
+ uidacct(os, B_TRUE, oldznp->zp_gid, newused-oldused, tx);
+ } else {
+ uidacct(os, B_TRUE, oldznp->zp_gid, -oldused, tx);
+ uidacct(os, B_TRUE, newznp->zp_gid, newused, tx);
+ }
+}
+
+static void
+fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr,
+ char *domainbuf, int buflen, uid_t *ridp)
+{
+ extern uint64_t strtonum(const char *str, char **nptr);
+ uint64_t fuid;
+ const char *domain;
+
+ fuid = strtonum(fuidstr, NULL);
+
+ domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid));
+ if (domain)
+ (void) strlcpy(domainbuf, domain, buflen);
+ else
+ domainbuf[0] = '\0';
+ *ridp = FUID_RID(fuid);
+}
+
+static uint64_t
+zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type)
+{
+ switch (type) {
+ case ZFS_PROP_USERUSED:
+ return (DMU_USERUSED_OBJECT);
+ case ZFS_PROP_GROUPUSED:
+ return (DMU_GROUPUSED_OBJECT);
+ case ZFS_PROP_USERQUOTA:
+ return (zfsvfs->z_userquota_obj);
+ case ZFS_PROP_GROUPQUOTA:
+ return (zfsvfs->z_groupquota_obj);
+ }
+ return (0);
+}
+
+int
+zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
+ uint64_t *cookiep, void *vbuf, uint64_t *bufsizep)
+{
+ int error;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ zfs_useracct_t *buf = vbuf;
+ uint64_t obj;
+
+ if (!dmu_objset_userspace_present(zfsvfs->z_os))
+ return (ENOTSUP);
+
+ obj = zfs_userquota_prop_to_obj(zfsvfs, type);
+ if (obj == 0) {
+ *bufsizep = 0;
+ return (0);
+ }
+
+ for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep);
+ (error = zap_cursor_retrieve(&zc, &za)) == 0;
+ zap_cursor_advance(&zc)) {
+ if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) >
+ *bufsizep)
+ break;
+
+ fuidstr_to_sid(zfsvfs, za.za_name,
+ buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid);
+
+ buf->zu_space = za.za_first_integer;
+ buf++;
+ }
+ if (error == ENOENT)
+ error = 0;
+
+ ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep);
+ *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf;
+ *cookiep = zap_cursor_serialize(&zc);
+ zap_cursor_fini(&zc);
+ return (error);
+}
+
+/*
+ * buf must be big enough (eg, 32 bytes)
+ */
+static int
+id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid,
+ char *buf, boolean_t addok)
+{
+ uint64_t fuid;
+ int domainid = 0;
+
+ if (domain && domain[0]) {
+ domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok);
+ if (domainid == -1)
+ return (ENOENT);
+ }
+ fuid = FUID_ENCODE(domainid, rid);
+ (void) sprintf(buf, "%llx", (longlong_t)fuid);
+ return (0);
+}
+
+int
+zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
+ const char *domain, uint64_t rid, uint64_t *valp)
+{
+ char buf[32];
+ int err;
+ uint64_t obj;
+
+ *valp = 0;
+
+ if (!dmu_objset_userspace_present(zfsvfs->z_os))
+ return (ENOTSUP);
+
+ obj = zfs_userquota_prop_to_obj(zfsvfs, type);
+ if (obj == 0)
+ return (0);
+
+ err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE);
+ if (err)
+ return (err);
+
+ err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp);
+ if (err == ENOENT)
+ err = 0;
+ return (err);
+}
+
+int
+zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
+ const char *domain, uint64_t rid, uint64_t quota)
+{
+ char buf[32];
+ int err;
+ dmu_tx_t *tx;
+ uint64_t *objp;
+ boolean_t fuid_dirtied;
+
+ if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA)
+ return (EINVAL);
+
+ if (zfsvfs->z_version < ZPL_VERSION_USERSPACE)
+ return (ENOTSUP);
+
+ objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj :
+ &zfsvfs->z_groupquota_obj;
+
+ err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE);
+ if (err)
+ return (err);
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL);
+ if (*objp == 0) {
+ dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
+ zfs_userquota_prop_prefixes[type]);
+ }
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+
+ mutex_enter(&zfsvfs->z_lock);
+ if (*objp == 0) {
+ *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA,
+ DMU_OT_NONE, 0, tx);
+ VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[type], 8, 1, objp, tx));
+ }
+ mutex_exit(&zfsvfs->z_lock);
+
+ if (quota == 0) {
+ err = zap_remove(zfsvfs->z_os, *objp, buf, tx);
+ if (err == ENOENT)
+ err = 0;
+ } else {
+ err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx);
+ }
+ ASSERT(err == 0);
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+ dmu_tx_commit(tx);
+ return (err);
+}
+
+boolean_t
+zfs_usergroup_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
+{
+ char buf[32];
+ uint64_t used, quota, usedobj, quotaobj;
+ int err;
+
+ usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
+ quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
+
+ if (quotaobj == 0 || zfsvfs->z_replay)
+ return (B_FALSE);
+
+ (void) sprintf(buf, "%llx", (longlong_t)fuid);
+ err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota);
+ if (err != 0)
+ return (B_FALSE);
+
+ err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used);
+ if (err != 0)
+ return (B_FALSE);
+ return (used >= quota);
+}
+
+int
+zfsvfs_create(const char *osname, int mode, zfsvfs_t **zvp)
+{
+ objset_t *os;
+ zfsvfs_t *zfsvfs;
+ uint64_t zval;
+ int i, error;
+
+ if (error = dsl_prop_get_integer(osname, "readonly", &zval, NULL))
+ return (error);
+ if (zval)
+ mode |= DS_MODE_READONLY;
+
+ error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &os);
+ if (error == EROFS) {
+ mode |= DS_MODE_READONLY;
+ error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &os);
+ }
+ if (error)
+ return (error);
+
+ /*
+ * Initialize the zfs-specific filesystem structure.
+ * Should probably make this a kmem cache, shuffle fields,
+ * and just bzero up to z_hold_mtx[].
+ */
+ zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
+ zfsvfs->z_vfs = NULL;
+ zfsvfs->z_parent = zfsvfs;
+ zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
+ zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
+ zfsvfs->z_os = os;
+
+ error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
+ if (error) {
+ goto out;
+ } else if (zfsvfs->z_version > ZPL_VERSION) {
+ (void) printf("Mismatched versions: File system "
+ "is version %llu on-disk format, which is "
+ "incompatible with this software version %lld!",
+ (u_longlong_t)zfsvfs->z_version, ZPL_VERSION);
+ error = ENOTSUP;
+ goto out;
+ }
+
+ if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0)
+ goto out;
+ zfsvfs->z_norm = (int)zval;
+
+ if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0)
+ goto out;
+ zfsvfs->z_utf8 = (zval != 0);
+
+ if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0)
+ goto out;
+ zfsvfs->z_case = (uint_t)zval;
+
+ /*
+ * Fold case on file systems that are always or sometimes case
+ * insensitive.
+ */
+ if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
+ zfsvfs->z_case == ZFS_CASE_MIXED)
+ zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
+
+ zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
+ &zfsvfs->z_root);
+ if (error)
+ goto out;
+ ASSERT(zfsvfs->z_root != 0);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
+ &zfsvfs->z_unlinkedobj);
+ if (error)
+ goto out;
+
+ error = zap_lookup(os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
+ 8, 1, &zfsvfs->z_userquota_obj);
+ if (error && error != ENOENT)
+ goto out;
+
+ error = zap_lookup(os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
+ 8, 1, &zfsvfs->z_groupquota_obj);
+ if (error && error != ENOENT)
+ goto out;
+
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
+ &zfsvfs->z_fuid_obj);
+ if (error && error != ENOENT)
+ goto out;
+
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
+ &zfsvfs->z_shares_dir);
+ if (error && error != ENOENT)
+ goto out;
+
+ mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&zfsvfs->z_online_recv_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
+ offsetof(znode_t, z_link_node));
+ rrw_init(&zfsvfs->z_teardown_lock);
+ rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
+ rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
+ for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+ mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
+
+ *zvp = zfsvfs;
+ return (0);
+
+out:
+ dmu_objset_close(os);
+ *zvp = NULL;
+ kmem_free(zfsvfs, sizeof (zfsvfs_t));
+ return (error);
+}
+
static int
zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
{
@@ -570,6 +971,12 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock);
+ zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
+ if (zil_disable) {
+ zil_destroy(zfsvfs->z_log, 0);
+ zfsvfs->z_log = NULL;
+ }
+
/*
* If we are not mounting (ie: online recv), then we don't
* have to worry about replaying the log as we blocked all
@@ -588,11 +995,7 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
else
zfs_unlinked_drain(zfsvfs);
- zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
- if (zil_disable) {
- zil_destroy(zfsvfs->z_log, 0);
- zfsvfs->z_log = NULL;
- } else {
+ if (zfsvfs->z_log) {
/*
* Parse and replay the intent log.
*
@@ -630,49 +1033,63 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
return (0);
}
-static void
-zfs_freezfsvfs(zfsvfs_t *zfsvfs)
+void
+zfsvfs_free(zfsvfs_t *zfsvfs)
{
+ int i;
+ extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */
+
+ /*
+ * This is a barrier to prevent the filesystem from going away in
+ * zfs_znode_move() until we can safely ensure that the filesystem is
+ * not unmounted. We consider the filesystem valid before the barrier
+ * and invalid after the barrier.
+ */
+ rw_enter(&zfsvfs_lock, RW_READER);
+ rw_exit(&zfsvfs_lock);
+
+ zfs_fuid_destroy(zfsvfs);
+
mutex_destroy(&zfsvfs->z_znodes_lock);
mutex_destroy(&zfsvfs->z_online_recv_lock);
+ mutex_destroy(&zfsvfs->z_lock);
list_destroy(&zfsvfs->z_all_znodes);
rrw_destroy(&zfsvfs->z_teardown_lock);
rw_destroy(&zfsvfs->z_teardown_inactive_lock);
rw_destroy(&zfsvfs->z_fuid_lock);
+ for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+ mutex_destroy(&zfsvfs->z_hold_mtx[i]);
kmem_free(zfsvfs, sizeof (zfsvfs_t));
}
+static void
+zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
+{
+ zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
+ if (zfsvfs->z_use_fuids && zfsvfs->z_vfs) {
+ vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
+ vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
+ vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
+ vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
+ vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
+ }
+}
+
static int
zfs_domount(vfs_t *vfsp, char *osname)
{
dev_t mount_dev;
- uint64_t recordsize, readonly;
+ uint64_t recordsize, fsid_guid;
int error = 0;
- int mode;
zfsvfs_t *zfsvfs;
- znode_t *zp = NULL;
ASSERT(vfsp);
ASSERT(osname);
- /*
- * Initialize the zfs-specific filesystem structure.
- * Should probably make this a kmem cache, shuffle fields,
- * and just bzero up to z_hold_mtx[].
- */
- zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
+ error = zfsvfs_create(osname, DS_MODE_OWNER, &zfsvfs);
+ if (error)
+ return (error);
zfsvfs->z_vfs = vfsp;
- zfsvfs->z_parent = zfsvfs;
- zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
- zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
-
- mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&zfsvfs->z_online_recv_lock, NULL, MUTEX_DEFAULT, NULL);
- list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
- offsetof(znode_t, z_link_node));
- rrw_init(&zfsvfs->z_teardown_lock);
- rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
- rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
/* Initialize the generic filesystem structure. */
vfsp->vfs_bcount = 0;
@@ -694,39 +1111,24 @@ zfs_domount(vfs_t *vfsp, char *osname)
vfsp->vfs_flag |= VFS_NOTRUNC;
vfsp->vfs_data = zfsvfs;
- if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL))
- goto out;
-
- mode = DS_MODE_OWNER;
- if (readonly)
- mode |= DS_MODE_READONLY;
-
- error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
- if (error == EROFS) {
- mode = DS_MODE_OWNER | DS_MODE_READONLY;
- error = dmu_objset_open(osname, DMU_OST_ZFS, mode,
- &zfsvfs->z_os);
- }
-
- if (error)
- goto out;
-
- if (error = zfs_init_fs(zfsvfs, &zp))
- goto out;
-
- /* The call to zfs_init_fs leaves the vnode held, release it here. */
- VN_RELE(ZTOV(zp));
+ /*
+ * The fsid is 64 bits, composed of an 8-bit fs type, which
+ * separates our fsid from any other filesystem types, and a
+ * 56-bit objset unique ID. The objset unique ID is unique to
+ * all objsets open on this system, provided by unique_create().
+ * The 8-bit fs type must be put in the low bits of fsid[1]
+ * because that's where other Solaris filesystems put it.
+ */
+ fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
+ ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
+ vfsp->vfs_fsid.val[0] = fsid_guid;
+ vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
+ zfsfstype & 0xFF;
/*
* Set features for file system.
*/
- zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
- if (zfsvfs->z_use_fuids) {
- vfs_set_feature(vfsp, VFSFT_XVATTR);
- vfs_set_feature(vfsp, VFSFT_SYSATTR_VIEWS);
- vfs_set_feature(vfsp, VFSFT_ACEMASKONACCESS);
- vfs_set_feature(vfsp, VFSFT_ACLONCREATE);
- }
+ zfs_set_fuid_feature(zfsvfs);
if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
@@ -739,13 +1141,16 @@ zfs_domount(vfs_t *vfsp, char *osname)
if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
uint64_t pval;
- ASSERT(mode & DS_MODE_READONLY);
atime_changed_cb(zfsvfs, B_FALSE);
readonly_changed_cb(zfsvfs, B_TRUE);
if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))
goto out;
xattr_changed_cb(zfsvfs, pval);
zfsvfs->z_issnap = B_TRUE;
+
+ mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock);
+ dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
+ mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock);
} else {
error = zfsvfs_setup(zfsvfs, B_TRUE);
}
@@ -754,9 +1159,8 @@ zfs_domount(vfs_t *vfsp, char *osname)
zfsctl_create(zfsvfs);
out:
if (error) {
- if (zfsvfs->z_os)
- dmu_objset_close(zfsvfs->z_os);
- zfs_freezfsvfs(zfsvfs);
+ dmu_objset_close(zfsvfs->z_os);
+ zfsvfs_free(zfsvfs);
} else {
atomic_add_32(&zfs_active_fs_count, 1);
}
@@ -1067,6 +1471,13 @@ zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
error = zfs_domount(vfsp, osname);
+ /*
+ * Add an extra VFS_HOLD on our parent vfs so that it can't
+ * disappear due to a forced unmount.
+ */
+ if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap)
+ VFS_HOLD(mvp->v_vfsp);
+
out:
pn_free(&spn);
return (error);
@@ -1426,15 +1837,16 @@ zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
* 'z_teardown_inactive_lock' write held.
*/
int
-zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *mode)
+zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *modep)
{
int error;
if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
return (error);
- *mode = zfsvfs->z_os->os_mode;
- dmu_objset_name(zfsvfs->z_os, name);
+ *modep = zfsvfs->z_os->os_mode;
+ if (name)
+ dmu_objset_name(zfsvfs->z_os, name);
dmu_objset_close(zfsvfs->z_os);
return (0);
@@ -1493,13 +1905,15 @@ static void
zfs_freevfs(vfs_t *vfsp)
{
zfsvfs_t *zfsvfs = vfsp->vfs_data;
- int i;
- for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
- mutex_destroy(&zfsvfs->z_hold_mtx[i]);
+ /*
+ * If this is a snapshot, we have an extra VFS_HOLD on our parent
+ * from zfs_mount(). Release it here.
+ */
+ if (zfsvfs->z_issnap)
+ VFS_RELE(zfsvfs->z_parent->z_vfs);
- zfs_fuid_destroy(zfsvfs);
- zfs_freezfsvfs(zfsvfs);
+ zfsvfs_free(zfsvfs);
atomic_add_32(&zfs_active_fs_count, -1);
}
@@ -1558,6 +1972,8 @@ zfs_init(void)
* Initialize znode cache, vnode ops, etc...
*/
zfs_znode_init();
+
+ dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb);
}
void
@@ -1574,54 +1990,46 @@ zfs_busy(void)
}
int
-zfs_set_version(const char *name, uint64_t newvers)
+zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
{
int error;
- objset_t *os;
+ objset_t *os = zfsvfs->z_os;
dmu_tx_t *tx;
- uint64_t curvers;
-
- /*
- * XXX for now, require that the filesystem be unmounted. Would
- * be nice to find the zfsvfs_t and just update that if
- * possible.
- */
if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
return (EINVAL);
- error = dmu_objset_open(name, DMU_OST_ZFS, DS_MODE_OWNER, &os);
- if (error)
- return (error);
-
- error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
- 8, 1, &curvers);
- if (error)
- goto out;
- if (newvers < curvers) {
- error = EINVAL;
- goto out;
- }
+ if (newvers < zfsvfs->z_version)
+ return (EINVAL);
tx = dmu_tx_create(os);
- dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, 0, ZPL_VERSION_STR);
+ dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
- goto out;
+ return (error);
+ }
+ error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
+ 8, 1, &newvers, tx);
+
+ if (error) {
+ dmu_tx_commit(tx);
+ return (error);
}
- error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1,
- &newvers, tx);
spa_history_internal_log(LOG_DS_UPGRADE,
dmu_objset_spa(os), tx, CRED(),
- "oldver=%llu newver=%llu dataset = %llu", curvers, newvers,
- dmu_objset_id(os));
+ "oldver=%llu newver=%llu dataset = %llu",
+ zfsvfs->z_version, newvers, dmu_objset_id(os));
+
dmu_tx_commit(tx);
-out:
- dmu_objset_close(os);
- return (error);
+ zfsvfs->z_version = newvers;
+
+ if (zfsvfs->z_version >= ZPL_VERSION_FUID)
+ zfs_set_fuid_feature(zfsvfs);
+
+ return (0);
}
/*
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index e38abb28c..88d4e52c1 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -101,6 +101,7 @@
* pushing cached pages (which acquires range locks) and syncing out
* cached atime changes. Third, zfs_zinactive() may require a new tx,
* which could deadlock the system if you were already holding one.
+ * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
*
* (3) All range locks must be grabbed before calling dmu_tx_assign(),
* as they can span dmu_tx_assign() calls.
@@ -363,7 +364,8 @@ update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid)
caddr_t va;
va = zfs_map_page(pp, S_WRITE);
- (void) dmu_read(os, oid, start+off, nbytes, va+off);
+ (void) dmu_read(os, oid, start+off, nbytes, va+off,
+ DMU_READ_PREFETCH);
zfs_unmap_page(pp, va);
page_unlock(pp);
}
@@ -567,6 +569,7 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
int max_blksz = zfsvfs->z_max_blksz;
uint64_t pflags;
int error;
+ arc_buf_t *abuf;
/*
* Fasttrack empty write
@@ -663,10 +666,46 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
* and allows us to do more fine-grained space accounting.
*/
while (n > 0) {
+ abuf = NULL;
+ woff = uio->uio_loffset;
+
+again:
+ if (zfs_usergroup_overquota(zfsvfs,
+ B_FALSE, zp->z_phys->zp_uid) ||
+ zfs_usergroup_overquota(zfsvfs,
+ B_TRUE, zp->z_phys->zp_gid)) {
+ if (abuf != NULL)
+ dmu_return_arcbuf(abuf);
+ error = EDQUOT;
+ break;
+ }
+
+ /*
+ * If dmu_assign_arcbuf() is expected to execute with minimum
+ * overhead loan an arc buffer and copy user data to it before
+ * we enter a txg. This avoids holding a txg forever while we
+ * pagefault on a hanging NFS server mapping.
+ */
+ if (abuf == NULL && n >= max_blksz &&
+ woff >= zp->z_phys->zp_size &&
+ P2PHASE(woff, max_blksz) == 0 &&
+ zp->z_blksz == max_blksz) {
+ size_t cbytes;
+
+ abuf = dmu_request_arcbuf(zp->z_dbuf, max_blksz);
+ ASSERT(abuf != NULL);
+ ASSERT(arc_buf_size(abuf) == max_blksz);
+ if (error = uiocopy(abuf->b_data, max_blksz,
+ UIO_WRITE, uio, &cbytes)) {
+ dmu_return_arcbuf(abuf);
+ break;
+ }
+ ASSERT(cbytes == max_blksz);
+ }
+
/*
* Start a transaction.
*/
- woff = uio->uio_loffset;
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_bonus(tx, zp->z_id);
dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
@@ -675,9 +714,11 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
if (error == ERESTART) {
dmu_tx_wait(tx);
dmu_tx_abort(tx);
- continue;
+ goto again;
}
dmu_tx_abort(tx);
+ if (abuf != NULL)
+ dmu_return_arcbuf(abuf);
break;
}
@@ -706,12 +747,22 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
*/
nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
- tx_bytes = uio->uio_resid;
- error = dmu_write_uio(zfsvfs->z_os, zp->z_id, uio, nbytes, tx);
- tx_bytes -= uio->uio_resid;
- if (tx_bytes && vn_has_cached_data(vp))
+ if (abuf == NULL) {
+ tx_bytes = uio->uio_resid;
+ error = dmu_write_uio(zfsvfs->z_os, zp->z_id, uio,
+ nbytes, tx);
+ tx_bytes -= uio->uio_resid;
+ } else {
+ tx_bytes = nbytes;
+ ASSERT(tx_bytes == max_blksz);
+ dmu_assign_arcbuf(zp->z_dbuf, woff, abuf, tx);
+ ASSERT(tx_bytes <= uio->uio_resid);
+ uioskip(uio, tx_bytes);
+ }
+ if (tx_bytes && vn_has_cached_data(vp)) {
update_pages(vp, woff,
tx_bytes, zfsvfs->z_os, zp->z_id);
+ }
/*
* If we made no progress, we're done. If we made even
@@ -791,10 +842,15 @@ zfs_get_done(dmu_buf_t *db, void *vzgd)
zgd_t *zgd = (zgd_t *)vzgd;
rl_t *rl = zgd->zgd_rl;
vnode_t *vp = ZTOV(rl->r_zp);
+ objset_t *os = rl->r_zp->z_zfsvfs->z_os;
dmu_buf_rele(db, vzgd);
zfs_range_unlock(rl);
- VN_RELE(vp);
+ /*
+ * Release the vnode asynchronously as we currently have the
+ * txg stopped from syncing.
+ */
+ VN_RELE_ASYNC(vp, dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
kmem_free(zgd, sizeof (zgd_t));
}
@@ -824,7 +880,12 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0)
return (ENOENT);
if (zp->z_unlinked) {
- VN_RELE(ZTOV(zp));
+ /*
+ * Release the vnode asynchronously as we currently have the
+ * txg stopped from syncing.
+ */
+ VN_RELE_ASYNC(ZTOV(zp),
+ dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
return (ENOENT);
}
@@ -842,7 +903,8 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
error = ENOENT;
goto out;
}
- VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf));
+ VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf,
+ DMU_READ_NO_PREFETCH));
} else { /* indirect write */
uint64_t boff; /* block starting offset */
@@ -896,7 +958,11 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
}
out:
zfs_range_unlock(rl);
- VN_RELE(ZTOV(zp));
+ /*
+ * Release the vnode asynchronously as we currently have the
+ * txg stopped from syncing.
+ */
+ VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
return (error);
}
@@ -1074,11 +1140,11 @@ zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl,
zfs_dirlock_t *dl;
dmu_tx_t *tx;
int error;
- zfs_acl_t *aclp = NULL;
- zfs_fuid_info_t *fuidp = NULL;
ksid_t *ksid;
uid_t uid;
gid_t gid = crgetgid(cr);
+ zfs_acl_ids_t acl_ids;
+ boolean_t fuid_dirtied;
/*
* If we have an ephemeral id, ACL, or XVATTR then
@@ -1141,21 +1207,9 @@ top:
if (strcmp(name, "..") == 0)
error = EISDIR;
ZFS_EXIT(zfsvfs);
- if (aclp)
- zfs_acl_free(aclp);
return (error);
}
}
- if (vsecp && aclp == NULL) {
- error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp);
- if (error) {
- ZFS_EXIT(zfsvfs);
- if (dl)
- zfs_dirent_unlock(dl);
- return (error);
- }
- }
-
if (zp == NULL) {
uint64_t txtype;
@@ -1177,30 +1231,28 @@ top:
goto out;
}
+ if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp,
+ &acl_ids)) != 0)
+ goto out;
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
+ error = EDQUOT;
+ goto out;
+ }
+
tx = dmu_tx_create(os);
dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
- if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(uid) ||
- IS_EPHEMERAL(gid)) {
- if (zfsvfs->z_fuid_obj == 0) {
- dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
- FUID_SIZE_ESTIMATE(zfsvfs));
- dmu_tx_hold_zap(tx, MASTER_NODE_OBJ,
- FALSE, NULL);
- } else {
- dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
- dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
- FUID_SIZE_ESTIMATE(zfsvfs));
- }
- }
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
dmu_tx_hold_bonus(tx, dzp->z_id);
dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
- if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp) {
+ if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
0, SPA_MAXBLOCKSIZE);
}
error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) {
+ zfs_acl_ids_free(&acl_ids);
zfs_dirent_unlock(dl);
if (error == ERESTART) {
dmu_tx_wait(tx);
@@ -1209,19 +1261,21 @@ top:
}
dmu_tx_abort(tx);
ZFS_EXIT(zfsvfs);
- if (aclp)
- zfs_acl_free(aclp);
return (error);
}
- zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp);
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids);
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
(void) zfs_link_create(dl, zp, tx, ZNEW);
+
txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
if (flag & FIGNORECASE)
txtype |= TX_CI;
zfs_log_create(zilog, tx, txtype, dzp, zp, name,
- vsecp, fuidp, vap);
- if (fuidp)
- zfs_fuid_info_free(fuidp);
+ vsecp, acl_ids.z_fuidp, vap);
+ zfs_acl_ids_free(&acl_ids);
dmu_tx_commit(tx);
} else {
int aflags = (flag & FAPPEND) ? V_APPEND : 0;
@@ -1292,8 +1346,6 @@ out:
*vpp = svp;
}
}
- if (aclp)
- zfs_acl_free(aclp);
ZFS_EXIT(zfsvfs);
return (error);
@@ -1528,12 +1580,12 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
uint64_t txtype;
dmu_tx_t *tx;
int error;
- zfs_acl_t *aclp = NULL;
- zfs_fuid_info_t *fuidp = NULL;
int zf = ZNEW;
ksid_t *ksid;
uid_t uid;
gid_t gid = crgetgid(cr);
+ zfs_acl_ids_t acl_ids;
+ boolean_t fuid_dirtied;
ASSERT(vap->va_type == VDIR);
@@ -1594,38 +1646,33 @@ top:
return (error);
}
- if (vsecp && aclp == NULL) {
- error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp);
- if (error) {
- zfs_dirent_unlock(dl);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
+ if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp,
+ &acl_ids)) != 0) {
+ zfs_dirent_unlock(dl);
+ ZFS_EXIT(zfsvfs);
+ return (error);
}
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
+ zfs_dirent_unlock(dl);
+ ZFS_EXIT(zfsvfs);
+ return (EDQUOT);
+ }
+
/*
* Add a new entry to the directory.
*/
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
- if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(uid) ||
- IS_EPHEMERAL(gid)) {
- if (zfsvfs->z_fuid_obj == 0) {
- dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
- FUID_SIZE_ESTIMATE(zfsvfs));
- dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
- } else {
- dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
- dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
- FUID_SIZE_ESTIMATE(zfsvfs));
- }
- }
- if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp)
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE)
dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
0, SPA_MAXBLOCKSIZE);
error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) {
+ zfs_acl_ids_free(&acl_ids);
zfs_dirent_unlock(dl);
if (error == ERESTART) {
dmu_tx_wait(tx);
@@ -1634,19 +1681,16 @@ top:
}
dmu_tx_abort(tx);
ZFS_EXIT(zfsvfs);
- if (aclp)
- zfs_acl_free(aclp);
return (error);
}
/*
* Create new node.
*/
- zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp);
-
- if (aclp)
- zfs_acl_free(aclp);
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids);
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
/*
* Now put new name in parent dir.
*/
@@ -1657,10 +1701,10 @@ top:
txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
if (flags & FIGNORECASE)
txtype |= TX_CI;
- zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, fuidp, vap);
+ zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
+ acl_ids.z_fuidp, vap);
- if (fuidp)
- zfs_fuid_info_free(fuidp);
+ zfs_acl_ids_free(&acl_ids);
dmu_tx_commit(tx);
zfs_dirent_unlock(dl);
@@ -1969,6 +2013,21 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp,
}
}
+ if (flags & V_RDDIR_ACCFILTER) {
+ /*
+ * If we have no access at all, don't include
+ * this entry in the returned information
+ */
+ znode_t *ezp;
+ if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
+ goto skip_entry;
+ if (!zfs_has_access(ezp, cr)) {
+ VN_RELE(ZTOV(ezp));
+ goto skip_entry;
+ }
+ VN_RELE(ZTOV(ezp));
+ }
+
if (flags & V_RDDIR_ENTFLAGS)
reclen = EDIRENT_RECLEN(strlen(zap.za_name));
else
@@ -2020,6 +2079,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp,
if (prefetch)
dmu_prefetch(os, objnum, 0, 0);
+ skip_entry:
/*
* Move to the next entry, fill in the previous offset.
*/
@@ -2120,8 +2180,6 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
ZFS_VERIFY_ZP(zp);
pzp = zp->z_phys;
- mutex_enter(&zp->z_lock);
-
/*
* If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
* Also, if we are the owner don't bother, since owner should
@@ -2131,7 +2189,6 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
(pzp->zp_uid != crgetuid(cr))) {
if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
skipaclchk, cr)) {
- mutex_exit(&zp->z_lock);
ZFS_EXIT(zfsvfs);
return (error);
}
@@ -2142,6 +2199,7 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
* than to determine whether we were asked the question.
*/
+ mutex_enter(&zp->z_lock);
vap->va_type = vp->v_type;
vap->va_mode = pzp->zp_mode & MODEMASK;
zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
@@ -2312,6 +2370,7 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
uint_t saved_mask;
int trim_mask = 0;
uint64_t new_mode;
+ uint64_t new_uid, new_gid;
znode_t *attrzp;
int need_policy = FALSE;
int err;
@@ -2320,6 +2379,7 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
xoptattr_t *xoap;
zfs_acl_t *aclp = NULL;
boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+ boolean_t fuid_dirtied = B_FALSE;
if (mask == 0)
return (0);
@@ -2610,30 +2670,14 @@ top:
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_bonus(tx, zp->z_id);
- if (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
- ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid))) {
- if (zfsvfs->z_fuid_obj == 0) {
- dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
- FUID_SIZE_ESTIMATE(zfsvfs));
- dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
- } else {
- dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
- dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
- FUID_SIZE_ESTIMATE(zfsvfs));
- }
- }
if (mask & AT_MODE) {
uint64_t pmode = pzp->zp_mode;
new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
- if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) {
- dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
- return (err);
- }
+ if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
+ goto out;
if (pzp->zp_acl.z_acl_extern_obj) {
/* Are we upgrading ACL from old V0 format to new V1 */
if (zfsvfs->z_version <= ZPL_VERSION_FUID &&
@@ -2655,36 +2699,53 @@ top:
}
}
- if ((mask & (AT_UID | AT_GID)) && pzp->zp_xattr != 0) {
- err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp);
- if (err) {
- dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
- if (aclp)
- zfs_acl_free(aclp);
- return (err);
+ if (mask & (AT_UID | AT_GID)) {
+ if (pzp->zp_xattr) {
+ err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp);
+ if (err)
+ goto out;
+ dmu_tx_hold_bonus(tx, attrzp->z_id);
+ }
+ if (mask & AT_UID) {
+ new_uid = zfs_fuid_create(zfsvfs,
+ (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
+ if (new_uid != pzp->zp_uid &&
+ zfs_usergroup_overquota(zfsvfs, B_FALSE, new_uid)) {
+ err = EDQUOT;
+ goto out;
+ }
+ }
+
+ if (mask & AT_GID) {
+ new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
+ cr, ZFS_GROUP, &fuidp);
+ if (new_gid != pzp->zp_gid &&
+ zfs_usergroup_overquota(zfsvfs, B_TRUE, new_gid)) {
+ err = EDQUOT;
+ goto out;
+ }
+ }
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied) {
+ if (zfsvfs->z_fuid_obj == 0) {
+ dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+ FUID_SIZE_ESTIMATE(zfsvfs));
+ dmu_tx_hold_zap(tx, MASTER_NODE_OBJ,
+ FALSE, NULL);
+ } else {
+ dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
+ dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
+ FUID_SIZE_ESTIMATE(zfsvfs));
+ }
}
- dmu_tx_hold_bonus(tx, attrzp->z_id);
}
err = dmu_tx_assign(tx, TXG_NOWAIT);
if (err) {
- if (attrzp)
- VN_RELE(ZTOV(attrzp));
-
- if (aclp) {
- zfs_acl_free(aclp);
- aclp = NULL;
- }
-
- if (err == ERESTART) {
+ if (err == ERESTART)
dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
- dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
- return (err);
+ goto out;
}
dmu_buf_will_dirty(zp->z_dbuf, tx);
@@ -2702,7 +2763,7 @@ top:
if (mask & AT_MODE) {
mutex_enter(&zp->z_acl_lock);
zp->z_phys->zp_mode = new_mode;
- err = zfs_aclset_common(zp, aclp, cr, &fuidp, tx);
+ err = zfs_aclset_common(zp, aclp, cr, tx);
ASSERT3U(err, ==, 0);
mutex_exit(&zp->z_acl_lock);
}
@@ -2711,25 +2772,17 @@ top:
mutex_enter(&attrzp->z_lock);
if (mask & AT_UID) {
- pzp->zp_uid = zfs_fuid_create(zfsvfs,
- vap->va_uid, cr, ZFS_OWNER, tx, &fuidp);
- if (attrzp) {
- attrzp->z_phys->zp_uid = zfs_fuid_create(zfsvfs,
- vap->va_uid, cr, ZFS_OWNER, tx, &fuidp);
- }
+ pzp->zp_uid = new_uid;
+ if (attrzp)
+ attrzp->z_phys->zp_uid = new_uid;
}
if (mask & AT_GID) {
- pzp->zp_gid = zfs_fuid_create(zfsvfs, vap->va_gid,
- cr, ZFS_GROUP, tx, &fuidp);
+ pzp->zp_gid = new_gid;
if (attrzp)
- attrzp->z_phys->zp_gid = zfs_fuid_create(zfsvfs,
- vap->va_gid, cr, ZFS_GROUP, tx, &fuidp);
+ attrzp->z_phys->zp_gid = new_gid;
}
- if (aclp)
- zfs_acl_free(aclp);
-
if (attrzp)
mutex_exit(&attrzp->z_lock);
@@ -2791,17 +2844,35 @@ top:
zfs_xvattr_set(zp, xvap);
}
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
if (mask != 0)
zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
- if (fuidp)
- zfs_fuid_info_free(fuidp);
mutex_exit(&zp->z_lock);
+out:
if (attrzp)
VN_RELE(ZTOV(attrzp));
- dmu_tx_commit(tx);
+ if (aclp) {
+ zfs_acl_free(aclp);
+ aclp = NULL;
+ }
+
+ if (fuidp) {
+ zfs_fuid_info_free(fuidp);
+ fuidp = NULL;
+ }
+
+ if (err)
+ dmu_tx_abort(tx);
+ else
+ dmu_tx_commit(tx);
+
+ if (err == ERESTART)
+ goto top;
ZFS_EXIT(zfsvfs);
return (err);
@@ -3232,7 +3303,8 @@ zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr,
int len = strlen(link);
int error;
int zflg = ZNEW;
- zfs_fuid_info_t *fuidp = NULL;
+ zfs_acl_ids_t acl_ids;
+ boolean_t fuid_dirtied;
ASSERT(vap->va_type == VLNK);
@@ -3267,26 +3339,25 @@ top:
return (error);
}
+ VERIFY(0 == zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids));
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
+ zfs_acl_ids_free(&acl_ids);
+ zfs_dirent_unlock(dl);
+ ZFS_EXIT(zfsvfs);
+ return (EDQUOT);
+ }
tx = dmu_tx_create(zfsvfs->z_os);
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
dmu_tx_hold_bonus(tx, dzp->z_id);
dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
- if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
+ if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE)
dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE);
- if (IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))) {
- if (zfsvfs->z_fuid_obj == 0) {
- dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
- FUID_SIZE_ESTIMATE(zfsvfs));
- dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
- } else {
- dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
- dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
- FUID_SIZE_ESTIMATE(zfsvfs));
- }
- }
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) {
+ zfs_acl_ids_free(&acl_ids);
zfs_dirent_unlock(dl);
if (error == ERESTART) {
dmu_tx_wait(tx);
@@ -3306,13 +3377,16 @@ top:
* otherwise, store it just like any other file data.
*/
if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) {
- zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, NULL, &fuidp);
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, &acl_ids);
if (len != 0)
bcopy(link, zp->z_phys + 1, len);
} else {
dmu_buf_t *dbp;
- zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, NULL, &fuidp);
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids);
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
/*
* Nothing can access the znode yet so no locking needed
* for growing the znode's blocksize.
@@ -3333,15 +3407,14 @@ top:
* Insert the new object into the directory.
*/
(void) zfs_link_create(dl, zp, tx, ZNEW);
-out:
if (error == 0) {
uint64_t txtype = TX_SYMLINK;
if (flags & FIGNORECASE)
txtype |= TX_CI;
zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
}
- if (fuidp)
- zfs_fuid_info_free(fuidp);
+
+ zfs_acl_ids_free(&acl_ids);
dmu_tx_commit(tx);
@@ -3618,6 +3691,12 @@ zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
pvn_write_done(trunc, flags);
len = filesz - off;
}
+
+ if (zfs_usergroup_overquota(zfsvfs, B_FALSE, zp->z_phys->zp_uid) ||
+ zfs_usergroup_overquota(zfsvfs, B_TRUE, zp->z_phys->zp_gid)) {
+ err = EDQUOT;
+ goto out;
+ }
top:
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_write(tx, zp->z_id, off, len);
@@ -3705,7 +3784,7 @@ zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
else
io_off = 0;
if (len > 0 && ISP2(blksz))
- io_len = P2ROUNDUP_TYPED(len + (io_off - off), blksz, size_t);
+ io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);
else
io_len = 0;
@@ -3869,7 +3948,8 @@ zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
* If we can't find a page in the cache, we will create a new page
* and fill it with file data. For efficiency, we may try to fill
* multiple pages at once (klustering) to fill up the supplied page
- * list.
+ * list. Note that the pages to be filled are held with an exclusive
+ * lock to prevent access by other threads while they are being filled.
*/
static int
zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
@@ -3888,7 +3968,8 @@ zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
*/
io_off = off;
io_len = PAGESIZE;
- pp = page_create_va(vp, io_off, io_len, PG_WAIT, seg, addr);
+ pp = page_create_va(vp, io_off, io_len,
+ PG_EXCL | PG_WAIT, seg, addr);
} else {
/*
* Try to find enough pages to fill the page list
@@ -3913,7 +3994,8 @@ zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
ASSERT3U(io_off, ==, cur_pp->p_offset);
va = zfs_map_page(cur_pp, S_WRITE);
- err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va);
+ err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
+ DMU_READ_PREFETCH);
zfs_unmap_page(cur_pp, va);
if (err) {
/* On error, toss the entire kluster */
@@ -3991,7 +4073,7 @@ zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
*protp = PROT_ALL;
/*
- * Loop through the requested range [off, off + len] looking
+ * Loop through the requested range [off, off + len) looking
* for pages. If we don't find a page, we will need to create
* a new page and fill it with data from the file.
*/
@@ -4337,6 +4419,11 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
(vp->v_type == VREG || vp->v_type == VDIR);
return (0);
+ case _PC_ACCESS_FILTERING:
+ *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
+ vp->v_type == VDIR;
+ return (0);
+
case _PC_ACL_ENABLED:
*valp = _ACL_ACE_ENABLED;
return (0);
@@ -4489,6 +4576,22 @@ const fs_operation_def_t zfs_symvnodeops_template[] = {
};
/*
+ * special share hidden files vnode operations template
+ */
+vnodeops_t *zfs_sharevnodeops;
+const fs_operation_def_t zfs_sharevnodeops_template[] = {
+ VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
+ VOPNAME_ACCESS, { .vop_access = zfs_access },
+ VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
+ VOPNAME_FID, { .vop_fid = zfs_fid },
+ VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
+ VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
+ VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
+ VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
+ NULL, NULL
+};
+
+/*
* Extended attribute directory vnode operations template
* This template is identical to the directory vnodes
* operation template except for restricted operations:
diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c
index 74983cdc5..8ced95174 100644
--- a/module/zfs/zfs_znode.c
+++ b/module/zfs/zfs_znode.c
@@ -87,6 +87,12 @@
* (such as VFS logic) that will not compile easily in userland.
*/
#ifdef _KERNEL
+/*
+ * Needed to close a small window in zfs_znode_move() that allows the zfsvfs to
+ * be freed before it can be safely accessed.
+ */
+krwlock_t zfsvfs_lock;
+
static kmem_cache_t *znode_cache = NULL;
/*ARGSUSED*/
@@ -154,8 +160,9 @@ zfs_znode_cache_destructor(void *buf, void *arg)
#ifdef ZNODE_STATS
static struct {
uint64_t zms_zfsvfs_invalid;
+ uint64_t zms_zfsvfs_recheck1;
uint64_t zms_zfsvfs_unmounted;
- uint64_t zms_zfsvfs_recheck_invalid;
+ uint64_t zms_zfsvfs_recheck2;
uint64_t zms_obj_held;
uint64_t zms_vnode_locked;
uint64_t zms_not_only_dnlc;
@@ -206,17 +213,6 @@ zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
POINTER_INVALIDATE(&ozp->z_zfsvfs);
}
-/*
- * Wrapper function for ZFS_ENTER that returns 0 if successful and otherwise
- * returns a non-zero error code.
- */
-static int
-zfs_enter(zfsvfs_t *zfsvfs)
-{
- ZFS_ENTER(zfsvfs);
- return (0);
-}
-
/*ARGSUSED*/
static kmem_cbrc_t
zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg)
@@ -240,12 +236,32 @@ zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg)
}
/*
- * Ensure that the filesystem is not unmounted during the move.
+ * Close a small window in which it's possible that the filesystem could
+ * be unmounted and freed, and zfsvfs, though valid in the previous
+ * statement, could point to unrelated memory by the time we try to
+ * prevent the filesystem from being unmounted.
+ */
+ rw_enter(&zfsvfs_lock, RW_WRITER);
+ if (zfsvfs != ozp->z_zfsvfs) {
+ rw_exit(&zfsvfs_lock);
+ ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck1);
+ return (KMEM_CBRC_DONT_KNOW);
+ }
+
+ /*
+ * If the znode is still valid, then so is the file system. We know that
+ * no valid file system can be freed while we hold zfsvfs_lock, so we
+ * can safely ensure that the filesystem is not and will not be
+ * unmounted. The next statement is equivalent to ZFS_ENTER().
*/
- if (zfs_enter(zfsvfs) != 0) { /* ZFS_ENTER */
+ rrw_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG);
+ if (zfsvfs->z_unmounted) {
+ ZFS_EXIT(zfsvfs);
+ rw_exit(&zfsvfs_lock);
ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted);
return (KMEM_CBRC_DONT_KNOW);
}
+ rw_exit(&zfsvfs_lock);
mutex_enter(&zfsvfs->z_znodes_lock);
/*
@@ -255,7 +271,7 @@ zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg)
if (zfsvfs != ozp->z_zfsvfs) {
mutex_exit(&zfsvfs->z_znodes_lock);
ZFS_EXIT(zfsvfs);
- ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck_invalid);
+ ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck2);
return (KMEM_CBRC_DONT_KNOW);
}
@@ -311,6 +327,7 @@ zfs_znode_init(void)
/*
* Initialize zcache
*/
+ rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL);
ASSERT(znode_cache == NULL);
znode_cache = kmem_cache_create("zfs_znode_cache",
sizeof (znode_t), 0, zfs_znode_cache_constructor,
@@ -332,6 +349,7 @@ zfs_znode_fini(void)
if (znode_cache)
kmem_cache_destroy(znode_cache);
znode_cache = NULL;
+ rw_destroy(&zfsvfs_lock);
}
struct vnodeops *zfs_dvnodeops;
@@ -339,6 +357,7 @@ struct vnodeops *zfs_fvnodeops;
struct vnodeops *zfs_symvnodeops;
struct vnodeops *zfs_xdvnodeops;
struct vnodeops *zfs_evnodeops;
+struct vnodeops *zfs_sharevnodeops;
void
zfs_remove_op_tables()
@@ -363,12 +382,15 @@ zfs_remove_op_tables()
vn_freevnodeops(zfs_xdvnodeops);
if (zfs_evnodeops)
vn_freevnodeops(zfs_evnodeops);
+ if (zfs_sharevnodeops)
+ vn_freevnodeops(zfs_sharevnodeops);
zfs_dvnodeops = NULL;
zfs_fvnodeops = NULL;
zfs_symvnodeops = NULL;
zfs_xdvnodeops = NULL;
zfs_evnodeops = NULL;
+ zfs_sharevnodeops = NULL;
}
extern const fs_operation_def_t zfs_dvnodeops_template[];
@@ -376,6 +398,7 @@ extern const fs_operation_def_t zfs_fvnodeops_template[];
extern const fs_operation_def_t zfs_xdvnodeops_template[];
extern const fs_operation_def_t zfs_symvnodeops_template[];
extern const fs_operation_def_t zfs_evnodeops_template[];
+extern const fs_operation_def_t zfs_sharevnodeops_template[];
int
zfs_create_op_tables()
@@ -412,103 +435,58 @@ zfs_create_op_tables()
error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template,
&zfs_evnodeops);
+ if (error)
+ return (error);
+
+ error = vn_make_ops(MNTTYPE_ZFS, zfs_sharevnodeops_template,
+ &zfs_sharevnodeops);
return (error);
}
-/*
- * zfs_init_fs - Initialize the zfsvfs struct and the file system
- * incore "master" object. Verify version compatibility.
- */
int
-zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp)
+zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
{
- extern int zfsfstype;
-
- objset_t *os = zfsvfs->z_os;
- int i, error;
- uint64_t fsid_guid;
- uint64_t zval;
-
- *zpp = NULL;
-
- error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
- if (error) {
- return (error);
- } else if (zfsvfs->z_version > ZPL_VERSION) {
- (void) printf("Mismatched versions: File system "
- "is version %llu on-disk format, which is "
- "incompatible with this software version %lld!",
- (u_longlong_t)zfsvfs->z_version, ZPL_VERSION);
- return (ENOTSUP);
- }
-
- if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0)
- return (error);
- zfsvfs->z_norm = (int)zval;
- if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0)
- return (error);
- zfsvfs->z_utf8 = (zval != 0);
- if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0)
- return (error);
- zfsvfs->z_case = (uint_t)zval;
- /*
- * Fold case on file systems that are always or sometimes case
- * insensitive.
- */
- if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
- zfsvfs->z_case == ZFS_CASE_MIXED)
- zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
+ zfs_acl_ids_t acl_ids;
+ vattr_t vattr;
+ znode_t *sharezp;
+ vnode_t *vp;
+ znode_t *zp;
+ int error;
- /*
- * The fsid is 64 bits, composed of an 8-bit fs type, which
- * separates our fsid from any other filesystem types, and a
- * 56-bit objset unique ID. The objset unique ID is unique to
- * all objsets open on this system, provided by unique_create().
- * The 8-bit fs type must be put in the low bits of fsid[1]
- * because that's where other Solaris filesystems put it.
- */
- fsid_guid = dmu_objset_fsid_guid(os);
- ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
- zfsvfs->z_vfs->vfs_fsid.val[0] = fsid_guid;
- zfsvfs->z_vfs->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
- zfsfstype & 0xFF;
-
- error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
- &zfsvfs->z_root);
- if (error)
- return (error);
- ASSERT(zfsvfs->z_root != 0);
+ vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
+ vattr.va_type = VDIR;
+ vattr.va_mode = S_IFDIR|0555;
+ vattr.va_uid = crgetuid(kcred);
+ vattr.va_gid = crgetgid(kcred);
- error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
- &zfsvfs->z_unlinkedobj);
- if (error)
- return (error);
+ sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP);
+ sharezp->z_unlinked = 0;
+ sharezp->z_atime_dirty = 0;
+ sharezp->z_zfsvfs = zfsvfs;
- /*
- * Initialize zget mutex's
- */
- for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
- mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
+ vp = ZTOV(sharezp);
+ vn_reinit(vp);
+ vp->v_type = VDIR;
- error = zfs_zget(zfsvfs, zfsvfs->z_root, zpp);
- if (error) {
- /*
- * On error, we destroy the mutexes here since it's not
- * possible for the caller to determine if the mutexes were
- * initialized properly.
- */
- for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
- mutex_destroy(&zfsvfs->z_hold_mtx[i]);
- return (error);
- }
- ASSERT3U((*zpp)->z_id, ==, zfsvfs->z_root);
- error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
- &zfsvfs->z_fuid_obj);
- if (error == ENOENT)
- error = 0;
+ VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
+ kcred, NULL, &acl_ids));
+ zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE,
+ &zp, 0, &acl_ids);
+ ASSERT3P(zp, ==, sharezp);
+ ASSERT(!vn_in_dnlc(ZTOV(sharezp))); /* not valid to move */
+ POINTER_INVALIDATE(&sharezp->z_zfsvfs);
+ error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
+ ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx);
+ zfsvfs->z_shares_dir = sharezp->z_id;
+
+ zfs_acl_ids_free(&acl_ids);
+ ZTOV(sharezp)->v_count = 0;
+ dmu_buf_rele(sharezp->z_dbuf, NULL);
+ sharezp->z_dbuf = NULL;
+ kmem_cache_free(znode_cache, sharezp);
- return (0);
+ return (error);
}
/*
@@ -676,7 +654,10 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz)
break;
case VREG:
vp->v_flag |= VMODSORT;
- vn_setops(vp, zfs_fvnodeops);
+ if (zp->z_phys->zp_parent == zfsvfs->z_shares_dir)
+ vn_setops(vp, zfs_sharevnodeops);
+ else
+ vn_setops(vp, zfs_fvnodeops);
break;
case VLNK:
vn_setops(vp, zfs_symvnodeops);
@@ -720,8 +701,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz)
*/
void
zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
- uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_t *setaclp,
- zfs_fuid_info_t **fuidp)
+ uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_ids_t *acl_ids)
{
dmu_buf_t *db;
znode_phys_t *pzp;
@@ -846,7 +826,12 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
*/
*zpp = dzp;
}
- zfs_perm_init(*zpp, dzp, flag, vap, tx, cr, setaclp, fuidp);
+ pzp->zp_uid = acl_ids->z_fuid;
+ pzp->zp_gid = acl_ids->z_fgid;
+ pzp->zp_mode = acl_ids->z_mode;
+ VERIFY(0 == zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
+ if (vap->va_mask & AT_XVATTR)
+ zfs_xvattr_set(*zpp, (xvattr_t *)vap);
}
void
@@ -1474,7 +1459,7 @@ void
zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
{
zfsvfs_t zfsvfs;
- uint64_t moid, doid, version;
+ uint64_t moid, obj, version;
uint64_t sense = ZFS_CASE_SENSITIVE;
uint64_t norm = 0;
nvpair_t *elem;
@@ -1483,6 +1468,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
vnode_t *vp;
vattr_t vattr;
znode_t *zp;
+ zfs_acl_ids_t acl_ids;
/*
* First attempt to create master node.
@@ -1499,12 +1485,12 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
/*
* Set starting attributes.
*/
- if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID)
+ if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_USERSPACE)
version = ZPL_VERSION;
+ else if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID)
+ version = ZPL_VERSION_USERSPACE - 1;
else
version = ZPL_VERSION_FUID - 1;
- error = zap_update(os, moid, ZPL_VERSION_STR,
- 8, 1, &version, tx);
elem = NULL;
while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
/* For the moment we expect all zpl props to be uint64_ts */
@@ -1515,9 +1501,8 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
VERIFY(nvpair_value_uint64(elem, &val) == 0);
name = nvpair_name(elem);
if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
- version = val;
- error = zap_update(os, moid, ZPL_VERSION_STR,
- 8, 1, &version, tx);
+ if (val < version)
+ version = val;
} else {
error = zap_update(os, moid, name, 8, 1, &val, tx);
}
@@ -1528,13 +1513,14 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
sense = val;
}
ASSERT(version != 0);
+ error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
/*
* Create a delete queue.
*/
- doid = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
+ obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
- error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &doid, tx);
+ error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
ASSERT(error == 0);
/*
@@ -1575,17 +1561,28 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
rootzp->z_zfsvfs = &zfsvfs;
- zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, NULL, NULL);
+ VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
+ cr, NULL, &acl_ids));
+ zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, &acl_ids);
ASSERT3P(zp, ==, rootzp);
ASSERT(!vn_in_dnlc(ZTOV(rootzp))); /* not valid to move */
error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
ASSERT(error == 0);
+ zfs_acl_ids_free(&acl_ids);
POINTER_INVALIDATE(&rootzp->z_zfsvfs);
ZTOV(rootzp)->v_count = 0;
dmu_buf_rele(rootzp->z_dbuf, NULL);
rootzp->z_dbuf = NULL;
kmem_cache_free(znode_cache, rootzp);
+
+ /*
+ * Create shares directory
+ */
+
+ error = zfs_create_share_dir(&zfsvfs, tx);
+
+ ASSERT(error == 0);
}
#endif /* _KERNEL */
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index 83fef0d87..53d9d9bf7 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -19,12 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include <sys/zfs_context.h>
#include <sys/spa.h>
+#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/zap.h>
#include <sys/arc.h>
@@ -471,34 +472,22 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first)
}
/*
- * zil_rollback_destroy() is only called by the rollback code.
- * We already have a syncing tx. Rollback has exclusive access to the
- * dataset, so we don't have to worry about concurrent zil access.
- * The actual freeing of any log blocks occurs in zil_sync() later in
- * this txg syncing phase.
+ * return true if the initial log block is not valid
*/
-void
-zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx)
+static boolean_t
+zil_empty(zilog_t *zilog)
{
const zil_header_t *zh = zilog->zl_header;
- uint64_t txg;
+ arc_buf_t *abuf = NULL;
if (BP_IS_HOLE(&zh->zh_log))
- return;
+ return (B_TRUE);
- txg = dmu_tx_get_txg(tx);
- ASSERT3U(zilog->zl_destroy_txg, <, txg);
- zilog->zl_destroy_txg = txg;
- zilog->zl_keep_first = B_FALSE;
+ if (zil_read_log_block(zilog, &zh->zh_log, &abuf) != 0)
+ return (B_TRUE);
- /*
- * Ensure there's no outstanding ZIL IO. No lwbs or just the
- * unused one that allocated in advance is ok.
- */
- ASSERT(zilog->zl_lwb_list.list_head.list_next ==
- zilog->zl_lwb_list.list_head.list_prev);
- (void) zil_parse(zilog, zil_free_log_block, zil_free_log_record,
- tx, zh->zh_claim_txg);
+ VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
+ return (B_FALSE);
}
int
@@ -520,6 +509,30 @@ zil_claim(char *osname, void *txarg)
zilog = dmu_objset_zil(os);
zh = zil_header_in_syncing_context(zilog);
+ if (zilog->zl_spa->spa_log_state == SPA_LOG_CLEAR) {
+ if (!BP_IS_HOLE(&zh->zh_log))
+ zio_free_blk(zilog->zl_spa, &zh->zh_log, first_txg);
+ BP_ZERO(&zh->zh_log);
+ dsl_dataset_dirty(dmu_objset_ds(os), tx);
+ }
+
+ /*
+ * Record here whether the zil has any records to replay.
+ * If the header block pointer is null or the block points
+ * to the stubby then we know there are no valid log records.
+ * We use the header to store this state as the the zilog gets
+ * freed later in dmu_objset_close().
+ * The flags (and the rest of the header fields) are cleared in
+ * zil_sync() as a result of a zil_destroy(), after replaying the log.
+ *
+ * Note, the intent log can be empty but still need the
+ * stubby to be claimed.
+ */
+ if (!zil_empty(zilog)) {
+ zh->zh_flags |= ZIL_REPLAY_NEEDED;
+ dsl_dataset_dirty(dmu_objset_ds(os), tx);
+ }
+
/*
* Claim all log blocks if we haven't already done so, and remember
* the highest claimed sequence number. This ensures that if we can
@@ -587,36 +600,6 @@ zil_check_log_chain(char *osname, void *txarg)
return (error);
}
-/*
- * Clear a log chain
- */
-/* ARGSUSED */
-int
-zil_clear_log_chain(char *osname, void *txarg)
-{
- zilog_t *zilog;
- zil_header_t *zh;
- objset_t *os;
- dmu_tx_t *tx;
- int error;
-
- error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os);
- if (error) {
- cmn_err(CE_WARN, "can't open objset for %s", osname);
- return (0);
- }
-
- zilog = dmu_objset_zil(os);
- tx = dmu_tx_create(zilog->zl_os);
- (void) dmu_tx_assign(tx, TXG_WAIT);
- zh = zil_header_in_syncing_context(zilog);
- BP_ZERO(&zh->zh_log);
- dsl_dataset_dirty(dmu_objset_ds(os), tx);
- dmu_tx_commit(tx);
- dmu_objset_close(os);
- return (0);
-}
-
static int
zil_vdev_compare(const void *x1, const void *x2)
{
@@ -719,18 +702,26 @@ zil_lwb_write_done(zio_t *zio)
ASSERT(zio->io_bp->blk_fill == 0);
/*
- * Now that we've written this log block, we have a stable pointer
- * to the next block in the chain, so it's OK to let the txg in
- * which we allocated the next block sync.
+ * Ensure the lwb buffer pointer is cleared before releasing
+ * the txg. If we have had an allocation failure and
+ * the txg is waiting to sync then we want want zil_sync()
+ * to remove the lwb so that it's not picked up as the next new
+ * one in zil_commit_writer(). zil_sync() will only remove
+ * the lwb if lwb_buf is null.
*/
- txg_rele_to_sync(&lwb->lwb_txgh);
-
zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
mutex_enter(&zilog->zl_lock);
lwb->lwb_buf = NULL;
if (zio->io_error)
zilog->zl_log_error = B_TRUE;
mutex_exit(&zilog->zl_lock);
+
+ /*
+ * Now that we've written this log block, we have a stable pointer
+ * to the next block in the chain, so it's OK to let the txg in
+ * which we allocated the next block sync.
+ */
+ txg_rele_to_sync(&lwb->lwb_txgh);
}
/*
@@ -752,9 +743,9 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
}
if (lwb->lwb_zio == NULL) {
lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
- 0, &lwb->lwb_blk, lwb->lwb_buf,
- lwb->lwb_sz, zil_lwb_write_done, lwb,
- ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_CANFAIL, &zb);
+ 0, &lwb->lwb_blk, lwb->lwb_buf, lwb->lwb_sz,
+ zil_lwb_write_done, lwb, ZIO_PRIORITY_LOG_WRITE,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb);
}
}
@@ -1040,7 +1031,7 @@ zil_clean(zilog_t *zilog)
if ((itx != NULL) &&
(itx->itx_lr.lrc_txg <= spa_last_synced_txg(zilog->zl_spa))) {
(void) taskq_dispatch(zilog->zl_clean_taskq,
- (void (*)(void *))zil_itx_clean, zilog, TQ_NOSLEEP);
+ (task_func_t *)zil_itx_clean, zilog, TQ_SLEEP);
}
mutex_exit(&zilog->zl_lock);
}
@@ -1216,6 +1207,13 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)
spa_t *spa = zilog->zl_spa;
lwb_t *lwb;
+ /*
+ * We don't zero out zl_destroy_txg, so make sure we don't try
+ * to destroy it twice.
+ */
+ if (spa_sync_pass(spa) != 1)
+ return;
+
mutex_enter(&zilog->zl_lock);
ASSERT(zilog->zl_stop_sync == 0);
@@ -1226,7 +1224,6 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)
blkptr_t blk = zh->zh_log;
ASSERT(list_head(&zilog->zl_lwb_list) == NULL);
- ASSERT(spa_sync_pass(spa) == 1);
bzero(zh, sizeof (zil_header_t));
bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq));
@@ -1245,12 +1242,7 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)
}
}
- for (;;) {
- lwb = list_head(&zilog->zl_lwb_list);
- if (lwb == NULL) {
- mutex_exit(&zilog->zl_lock);
- return;
- }
+ while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
zh->zh_log = lwb->lwb_blk;
if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg)
break;
@@ -1344,25 +1336,6 @@ zil_free(zilog_t *zilog)
}
/*
- * return true if the initial log block is not valid
- */
-static boolean_t
-zil_empty(zilog_t *zilog)
-{
- const zil_header_t *zh = zilog->zl_header;
- arc_buf_t *abuf = NULL;
-
- if (BP_IS_HOLE(&zh->zh_log))
- return (B_TRUE);
-
- if (zil_read_log_block(zilog, &zh->zh_log, &abuf) != 0)
- return (B_TRUE);
-
- VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
- return (B_FALSE);
-}
-
-/*
* Open an intent log.
*/
zilog_t *
@@ -1417,7 +1390,7 @@ zil_suspend(zilog_t *zilog)
const zil_header_t *zh = zilog->zl_header;
mutex_enter(&zilog->zl_lock);
- if (zh->zh_claim_txg != 0) { /* unplayed log */
+ if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */
mutex_exit(&zilog->zl_lock);
return (EBUSY);
}
@@ -1601,7 +1574,7 @@ zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE])
const zil_header_t *zh = zilog->zl_header;
zil_replay_arg_t zr;
- if (zil_empty(zilog)) {
+ if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) {
zil_destroy(zilog, B_TRUE);
return;
}
@@ -1671,3 +1644,24 @@ out:
mutex_exit(&zilog->zl_lock);
return (ret);
}
+
+/* ARGSUSED */
+int
+zil_vdev_offline(char *osname, void *arg)
+{
+ objset_t *os;
+ zilog_t *zilog;
+ int error;
+
+ error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os);
+ if (error)
+ return (error);
+
+ zilog = dmu_objset_zil(os);
+ if (zil_suspend(zilog) != 0)
+ error = EEXIST;
+ else
+ zil_resume(zilog);
+ dmu_objset_close(os);
+ return (error);
+}
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index a669ad64a..a2bdab9a7 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -490,11 +490,10 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zio->io_bp_orig = *bp;
if (type != ZIO_TYPE_WRITE)
zio->io_bp = &zio->io_bp_copy; /* so caller can free */
- if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
- if (BP_IS_GANG(bp))
- pipeline |= ZIO_GANG_STAGES;
+ if (zio->io_child_type == ZIO_CHILD_LOGICAL)
zio->io_logical = zio;
- }
+ if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
+ pipeline |= ZIO_GANG_STAGES;
}
zio->io_spa = spa;
@@ -520,6 +519,8 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
if (pio != NULL) {
if (zio->io_logical == NULL)
zio->io_logical = pio->io_logical;
+ if (zio->io_child_type == ZIO_CHILD_GANG)
+ zio->io_gang_leader = pio->io_gang_leader;
zio_add_child(pio, zio);
}
@@ -529,21 +530,11 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
static void
zio_destroy(zio_t *zio)
{
- spa_t *spa = zio->io_spa;
- uint8_t async_root = zio->io_async_root;
-
list_destroy(&zio->io_parent_list);
list_destroy(&zio->io_child_list);
mutex_destroy(&zio->io_lock);
cv_destroy(&zio->io_cv);
kmem_cache_free(zio_cache, zio);
-
- if (async_root) {
- mutex_enter(&spa->spa_async_root_lock);
- if (--spa->spa_async_root_count == 0)
- cv_broadcast(&spa->spa_async_root_cv);
- mutex_exit(&spa->spa_async_root_lock);
- }
}
zio_t *
@@ -836,7 +827,8 @@ zio_read_bp_init(zio_t *zio)
blkptr_t *bp = zio->io_bp;
if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
- zio->io_logical == zio && !(zio->io_flags & ZIO_FLAG_RAW)) {
+ zio->io_child_type == ZIO_CHILD_LOGICAL &&
+ !(zio->io_flags & ZIO_FLAG_RAW)) {
uint64_t csize = BP_GET_PSIZE(bp);
void *cbuf = zio_buf_alloc(csize);
@@ -885,16 +877,10 @@ zio_write_bp_init(zio_t *zio)
* few passes, stop compressing to ensure convergence.
*/
pass = spa_sync_pass(zio->io_spa);
- ASSERT(pass > 1);
if (pass > SYNC_PASS_DONT_COMPRESS)
compress = ZIO_COMPRESS_OFF;
- /*
- * Only MOS (objset 0) data should need to be rewritten.
- */
- ASSERT(zio->io_logical->io_bookmark.zb_objset == 0);
-
/* Make sure someone doesn't change their mind on overwrites */
ASSERT(MIN(zp->zp_ndvas + BP_IS_GANG(bp),
spa_max_replication(zio->io_spa)) == BP_GET_NDVAS(bp));
@@ -956,11 +942,11 @@ zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q)
zio_type_t t = zio->io_type;
/*
- * If we're a config writer, the normal issue and interrupt threads
- * may all be blocked waiting for the config lock. In this case,
- * select the otherwise-unused taskq for ZIO_TYPE_NULL.
+ * If we're a config writer or a probe, the normal issue and
+ * interrupt threads may all be blocked waiting for the config lock.
+ * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
*/
- if (zio->io_flags & ZIO_FLAG_CONFIG_WRITER)
+ if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
t = ZIO_TYPE_NULL;
/*
@@ -1092,14 +1078,12 @@ zio_nowait(zio_t *zio)
zio_unique_parent(zio) == NULL) {
/*
* This is a logical async I/O with no parent to wait for it.
- * Track how many outstanding I/Os of this type exist so
- * that spa_unload() knows when they are all done.
+ * We add it to the spa_async_root_zio "Godfather" I/O which
+ * will ensure they complete prior to unloading the pool.
*/
spa_t *spa = zio->io_spa;
- zio->io_async_root = B_TRUE;
- mutex_enter(&spa->spa_async_root_lock);
- spa->spa_async_root_count++;
- mutex_exit(&spa->spa_async_root_lock);
+
+ zio_add_child(spa->spa_async_zio_root, zio);
}
zio_execute(zio);
@@ -1118,6 +1102,8 @@ zio_reexecute(zio_t *pio)
ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
+ ASSERT(pio->io_gang_leader == NULL);
+ ASSERT(pio->io_gang_tree == NULL);
pio->io_flags = pio->io_orig_flags;
pio->io_stage = pio->io_orig_stage;
@@ -1161,8 +1147,11 @@ zio_reexecute(zio_t *pio)
/*
* Now that all children have been reexecuted, execute the parent.
+ * We don't reexecute "The Godfather" I/O here as it's the
+ * responsibility of the caller to wait on him.
*/
- zio_execute(pio);
+ if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
+ zio_execute(pio);
}
void
@@ -1178,11 +1167,14 @@ zio_suspend(spa_t *spa, zio_t *zio)
mutex_enter(&spa->spa_suspend_lock);
if (spa->spa_suspend_zio_root == NULL)
- spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, 0);
+ spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
+ ZIO_FLAG_GODFATHER);
spa->spa_suspended = B_TRUE;
if (zio != NULL) {
+ ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
ASSERT(zio != spa->spa_suspend_zio_root);
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
ASSERT(zio_unique_parent(zio) == NULL);
@@ -1193,10 +1185,10 @@ zio_suspend(spa_t *spa, zio_t *zio)
mutex_exit(&spa->spa_suspend_lock);
}
-void
+int
zio_resume(spa_t *spa)
{
- zio_t *pio, *cio, *cio_next;
+ zio_t *pio;
/*
* Reexecute all previously suspended i/o.
@@ -1209,18 +1201,10 @@ zio_resume(spa_t *spa)
mutex_exit(&spa->spa_suspend_lock);
if (pio == NULL)
- return;
-
- for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
- zio_link_t *zl = pio->io_walk_link;
- cio_next = zio_walk_children(pio);
- zio_remove_child(pio, cio, zl);
- zio_reexecute(cio);
- }
+ return (0);
- ASSERT(pio->io_children[ZIO_CHILD_LOGICAL][ZIO_WAIT_DONE] == 0);
-
- (void) zio_wait(pio);
+ zio_reexecute(pio);
+ return (zio_wait(pio));
}
void
@@ -1327,7 +1311,7 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
* (Presently, nothing actually uses interior data checksums;
* this is just good hygiene.)
*/
- if (gn != pio->io_logical->io_gang_tree) {
+ if (gn != pio->io_gang_leader->io_gang_tree) {
zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
data, BP_GET_PSIZE(bp));
}
@@ -1409,27 +1393,26 @@ zio_gang_tree_free(zio_gang_node_t **gnpp)
}
static void
-zio_gang_tree_assemble(zio_t *lio, blkptr_t *bp, zio_gang_node_t **gnpp)
+zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
{
zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
- ASSERT(lio->io_logical == lio);
+ ASSERT(gio->io_gang_leader == gio);
ASSERT(BP_IS_GANG(bp));
- zio_nowait(zio_read(lio, lio->io_spa, bp, gn->gn_gbh,
+ zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
- lio->io_priority, ZIO_GANG_CHILD_FLAGS(lio), &lio->io_bookmark));
+ gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
}
static void
zio_gang_tree_assemble_done(zio_t *zio)
{
- zio_t *lio = zio->io_logical;
+ zio_t *gio = zio->io_gang_leader;
zio_gang_node_t *gn = zio->io_private;
blkptr_t *bp = zio->io_bp;
- zio_t *pio = zio_unique_parent(zio);
- ASSERT(pio == lio);
+ ASSERT(gio == zio_unique_parent(zio));
ASSERT(zio_walk_children(zio) == NULL);
if (zio->io_error)
@@ -1446,25 +1429,25 @@ zio_gang_tree_assemble_done(zio_t *zio)
blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
if (!BP_IS_GANG(gbp))
continue;
- zio_gang_tree_assemble(lio, gbp, &gn->gn_child[g]);
+ zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
}
}
static void
zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
{
- zio_t *lio = pio->io_logical;
+ zio_t *gio = pio->io_gang_leader;
zio_t *zio;
ASSERT(BP_IS_GANG(bp) == !!gn);
- ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(lio->io_bp));
- ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == lio->io_gang_tree);
+ ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
+ ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);
/*
* If you're a gang header, your data is in gn->gn_gbh.
* If you're a gang member, your data is in 'data' and gn == NULL.
*/
- zio = zio_gang_issue_func[lio->io_type](pio, bp, gn, data);
+ zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);
if (gn != NULL) {
ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC);
@@ -1478,8 +1461,8 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
}
}
- if (gn == lio->io_gang_tree)
- ASSERT3P((char *)lio->io_data + lio->io_size, ==, data);
+ if (gn == gio->io_gang_tree)
+ ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);
if (zio != pio)
zio_nowait(zio);
@@ -1490,7 +1473,10 @@ zio_gang_assemble(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
- ASSERT(BP_IS_GANG(bp) && zio == zio->io_logical);
+ ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
+ ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
+
+ zio->io_gang_leader = zio;
zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);
@@ -1500,18 +1486,18 @@ zio_gang_assemble(zio_t *zio)
static int
zio_gang_issue(zio_t *zio)
{
- zio_t *lio = zio->io_logical;
blkptr_t *bp = zio->io_bp;
if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
return (ZIO_PIPELINE_STOP);
- ASSERT(BP_IS_GANG(bp) && zio == lio);
+ ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
+ ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
- zio_gang_tree_issue(lio, lio->io_gang_tree, bp, lio->io_data);
+ zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
else
- zio_gang_tree_free(&lio->io_gang_tree);
+ zio_gang_tree_free(&zio->io_gang_tree);
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
@@ -1522,7 +1508,7 @@ static void
zio_write_gang_member_ready(zio_t *zio)
{
zio_t *pio = zio_unique_parent(zio);
- zio_t *lio = zio->io_logical;
+ zio_t *gio = zio->io_gang_leader;
dva_t *cdva = zio->io_bp->blk_dva;
dva_t *pdva = pio->io_bp->blk_dva;
uint64_t asize;
@@ -1533,7 +1519,7 @@ zio_write_gang_member_ready(zio_t *zio)
ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
- ASSERT3U(zio->io_prop.zp_ndvas, ==, lio->io_prop.zp_ndvas);
+ ASSERT3U(zio->io_prop.zp_ndvas, ==, gio->io_prop.zp_ndvas);
ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(zio->io_bp));
ASSERT3U(pio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(pio->io_bp));
ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
@@ -1553,28 +1539,28 @@ zio_write_gang_block(zio_t *pio)
{
spa_t *spa = pio->io_spa;
blkptr_t *bp = pio->io_bp;
- zio_t *lio = pio->io_logical;
+ zio_t *gio = pio->io_gang_leader;
zio_t *zio;
zio_gang_node_t *gn, **gnpp;
zio_gbh_phys_t *gbh;
uint64_t txg = pio->io_txg;
uint64_t resid = pio->io_size;
uint64_t lsize;
- int ndvas = lio->io_prop.zp_ndvas;
+ int ndvas = gio->io_prop.zp_ndvas;
int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa));
zio_prop_t zp;
int error;
error = metaslab_alloc(spa, spa->spa_normal_class, SPA_GANGBLOCKSIZE,
- bp, gbh_ndvas, txg, pio == lio ? NULL : lio->io_bp,
+ bp, gbh_ndvas, txg, pio == gio ? NULL : gio->io_bp,
METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
if (error) {
pio->io_error = error;
return (ZIO_PIPELINE_CONTINUE);
}
- if (pio == lio) {
- gnpp = &lio->io_gang_tree;
+ if (pio == gio) {
+ gnpp = &gio->io_gang_tree;
} else {
gnpp = pio->io_private;
ASSERT(pio->io_ready == zio_write_gang_member_ready);
@@ -1598,11 +1584,11 @@ zio_write_gang_block(zio_t *pio)
SPA_MINBLOCKSIZE);
ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
- zp.zp_checksum = lio->io_prop.zp_checksum;
+ zp.zp_checksum = gio->io_prop.zp_checksum;
zp.zp_compress = ZIO_COMPRESS_OFF;
zp.zp_type = DMU_OT_NONE;
zp.zp_level = 0;
- zp.zp_ndvas = lio->io_prop.zp_ndvas;
+ zp.zp_ndvas = gio->io_prop.zp_ndvas;
zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
(char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
@@ -1635,6 +1621,11 @@ zio_dva_allocate(zio_t *zio)
blkptr_t *bp = zio->io_bp;
int error;
+ if (zio->io_gang_leader == NULL) {
+ ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
+ zio->io_gang_leader = zio;
+ }
+
ASSERT(BP_IS_HOLE(bp));
ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
ASSERT3U(zio->io_prop.zp_ndvas, >, 0);
@@ -1864,7 +1855,8 @@ zio_vdev_io_done(zio_t *zio)
vdev_cache_write(zio);
if (zio_injection_enabled && zio->io_error == 0)
- zio->io_error = zio_handle_device_injection(vd, EIO);
+ zio->io_error = zio_handle_device_injection(vd,
+ zio, EIO);
if (zio_injection_enabled && zio->io_error == 0)
zio->io_error = zio_handle_label_injection(zio, EIO);
@@ -2079,11 +2071,10 @@ zio_ready(zio_t *zio)
blkptr_t *bp = zio->io_bp;
zio_t *pio, *pio_next;
- if (zio->io_ready) {
- if (BP_IS_GANG(bp) &&
- zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY))
- return (ZIO_PIPELINE_STOP);
+ if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY))
+ return (ZIO_PIPELINE_STOP);
+ if (zio->io_ready) {
ASSERT(IO_IS_ALLOCATING(zio));
ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
@@ -2128,7 +2119,7 @@ zio_done(zio_t *zio)
zio_t *pio, *pio_next;
/*
- * If our of children haven't all completed,
+ * If our children haven't all completed,
* wait for them and then repeat this pipeline stage.
*/
if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
@@ -2219,6 +2210,21 @@ zio_done(zio_t *zio)
*/
zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
+ if ((zio->io_error || zio->io_reexecute) && IO_IS_ALLOCATING(zio) &&
+ zio->io_child_type == ZIO_CHILD_LOGICAL) {
+ ASSERT(zio->io_child_type != ZIO_CHILD_GANG);
+ zio_dva_unallocate(zio, zio->io_gang_tree, bp);
+ }
+
+ zio_gang_tree_free(&zio->io_gang_tree);
+
+ /*
+ * Godfather I/Os should never suspend.
+ */
+ if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
+ (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
+ zio->io_reexecute = 0;
+
if (zio->io_reexecute) {
/*
* This is a logical I/O that wants to reexecute.
@@ -2235,21 +2241,37 @@ zio_done(zio_t *zio)
*/
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
- if (IO_IS_ALLOCATING(zio))
- zio_dva_unallocate(zio, zio->io_gang_tree, bp);
-
- zio_gang_tree_free(&zio->io_gang_tree);
+ zio->io_gang_leader = NULL;
mutex_enter(&zio->io_lock);
zio->io_state[ZIO_WAIT_DONE] = 1;
mutex_exit(&zio->io_lock);
+ /*
+ * "The Godfather" I/O monitors its children but is
+ * not a true parent to them. It will track them through
+ * the pipeline but severs its ties whenever they get into
+ * trouble (e.g. suspended). This allows "The Godfather"
+ * I/O to return status without blocking.
+ */
+ for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
+ zio_link_t *zl = zio->io_walk_link;
+ pio_next = zio_walk_parents(zio);
+
+ if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
+ (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
+ zio_remove_child(pio, zio, zl);
+ zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
+ }
+ }
+
if ((pio = zio_unique_parent(zio)) != NULL) {
/*
* We're not a root i/o, so there's nothing to do
* but notify our parent. Don't propagate errors
* upward since we haven't permanently failed yet.
*/
+ ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
} else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
@@ -2282,8 +2304,6 @@ zio_done(zio_t *zio)
if (zio->io_done)
zio->io_done(zio);
- zio_gang_tree_free(&zio->io_gang_tree);
-
mutex_enter(&zio->io_lock);
zio->io_state[ZIO_WAIT_DONE] = 1;
mutex_exit(&zio->io_lock);
diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c
index b3469fdd5..f8e6880c9 100644
--- a/module/zfs/zio_inject.c
+++ b/module/zfs/zio_inject.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -195,7 +195,7 @@ zio_handle_label_injection(zio_t *zio, int error)
int
-zio_handle_device_injection(vdev_t *vd, int error)
+zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
{
inject_handler_t *handler;
int ret = 0;
@@ -210,6 +210,12 @@ zio_handle_device_injection(vdev_t *vd, int error)
continue;
if (vd->vdev_guid == handler->zi_record.zi_guid) {
+ if (handler->zi_record.zi_failfast &&
+ (zio == NULL || (zio->io_flags &
+ (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))) {
+ continue;
+ }
+
if (handler->zi_record.zi_error == error) {
/*
* For a failed open, pretend like the device
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index d3be8fb50..cfd3b3dbd 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -107,7 +107,7 @@ typedef struct zvol_state {
uint64_t zv_volblocksize; /* volume block size */
minor_t zv_minor; /* minor number */
uint8_t zv_min_bs; /* minimum addressable block shift */
- uint8_t zv_flags; /* readonly; dumpified */
+ uint8_t zv_flags; /* readonly, dumpified, etc. */
objset_t *zv_objset; /* objset handle */
uint32_t zv_mode; /* DS_MODE_* flags at open time */
uint32_t zv_open_count[OTYPCNT]; /* open counts */
@@ -123,6 +123,7 @@ typedef struct zvol_state {
#define ZVOL_RDONLY 0x1
#define ZVOL_DUMPIFIED 0x2
#define ZVOL_EXCL 0x4
+#define ZVOL_WCE 0x8
/*
* zvol maximum transfer in one DMU tx.
@@ -741,6 +742,27 @@ zvol_set_volsize(const char *name, major_t maj, uint64_t volsize)
}
}
+ /*
+ * Generate a LUN expansion event.
+ */
+ if (error == 0) {
+ sysevent_id_t eid;
+ nvlist_t *attr;
+ char *physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
+
+ (void) snprintf(physpath, MAXPATHLEN, "%s%uc", ZVOL_PSEUDO_DEV,
+ zv->zv_minor);
+
+ VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);
+
+ (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
+ ESC_DEV_DLE, attr, &eid, DDI_SLEEP);
+
+ nvlist_free(attr);
+ kmem_free(physpath, MAXPATHLEN);
+ }
+
out:
if (state.zv_objset)
dmu_objset_close(state.zv_objset);
@@ -924,7 +946,8 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
* we don't have to write the data twice.
*/
if (buf != NULL) /* immediate write */
- return (dmu_read(os, ZVOL_OBJ, lr->lr_offset, dlen, buf));
+ return (dmu_read(os, ZVOL_OBJ, lr->lr_offset, dlen, buf,
+ DMU_READ_NO_PREFETCH));
zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
zgd->zgd_zilog = zv->zv_zilog;
@@ -968,11 +991,15 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
ssize_t zvol_immediate_write_sz = 32768;
static void
-zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len)
+zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid,
+ boolean_t sync)
{
uint32_t blocksize = zv->zv_volblocksize;
zilog_t *zilog = zv->zv_zilog;
- lr_write_t *lr;
+ boolean_t slogging;
+
+ if (zil_disable)
+ return;
if (zilog->zl_replay) {
dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
@@ -981,23 +1008,58 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len)
return;
}
- while (len) {
- ssize_t nbytes = MIN(len, blocksize - P2PHASE(off, blocksize));
- itx_t *itx = zil_itx_create(TX_WRITE, sizeof (*lr));
+ slogging = spa_has_slogs(zilog->zl_spa);
- itx->itx_wr_state =
- len > zvol_immediate_write_sz ? WR_INDIRECT : WR_NEED_COPY;
- itx->itx_private = zv;
+ while (resid) {
+ itx_t *itx;
+ lr_write_t *lr;
+ ssize_t len;
+ itx_wr_state_t write_state;
+
+ /*
+ * Unlike zfs_log_write() we can be called with
+ * upto DMU_MAX_ACCESS/2 (5MB) writes.
+ */
+ if (blocksize > zvol_immediate_write_sz && !slogging &&
+ resid >= blocksize && off % blocksize == 0) {
+ write_state = WR_INDIRECT; /* uses dmu_sync */
+ len = blocksize;
+ } else if (sync) {
+ write_state = WR_COPIED;
+ len = MIN(ZIL_MAX_LOG_DATA, resid);
+ } else {
+ write_state = WR_NEED_COPY;
+ len = MIN(ZIL_MAX_LOG_DATA, resid);
+ }
+
+ itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
+ (write_state == WR_COPIED ? len : 0));
lr = (lr_write_t *)&itx->itx_lr;
+ if (write_state == WR_COPIED && dmu_read(zv->zv_objset,
+ ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
+ kmem_free(itx, offsetof(itx_t, itx_lr) +
+ itx->itx_lr.lrc_reclen);
+ itx = zil_itx_create(TX_WRITE, sizeof (*lr));
+ lr = (lr_write_t *)&itx->itx_lr;
+ write_state = WR_NEED_COPY;
+ }
+
+ itx->itx_wr_state = write_state;
+ if (write_state == WR_NEED_COPY)
+ itx->itx_sod += len;
lr->lr_foid = ZVOL_OBJ;
lr->lr_offset = off;
- lr->lr_length = nbytes;
+ lr->lr_length = len;
lr->lr_blkoff = off - P2ALIGN_TYPED(off, blocksize, uint64_t);
BP_ZERO(&lr->lr_blkptr);
+ itx->itx_private = zv;
+ itx->itx_sync = sync;
+
(void) zil_itx_assign(zilog, itx, tx);
- len -= nbytes;
- off += nbytes;
+
+ off += len;
+ resid -= len;
}
}
@@ -1010,7 +1072,9 @@ zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t size,
int numerrors = 0;
for (c = 0; c < vd->vdev_children; c++) {
- ASSERT(vd->vdev_ops == &vdev_mirror_ops);
+ ASSERT(vd->vdev_ops == &vdev_mirror_ops ||
+ vd->vdev_ops == &vdev_replacing_ops ||
+ vd->vdev_ops == &vdev_spare_ops);
int err = zvol_dumpio_vdev(vd->vdev_child[c],
addr, offset, size, doread, isdump);
if (err != 0) {
@@ -1086,6 +1150,7 @@ zvol_strategy(buf_t *bp)
int error = 0;
boolean_t doread = bp->b_flags & B_READ;
boolean_t is_dump = zv->zv_flags & ZVOL_DUMPIFIED;
+ boolean_t sync;
if (zv == NULL) {
bioerror(bp, ENXIO);
@@ -1123,6 +1188,9 @@ zvol_strategy(buf_t *bp)
return (0);
}
+ sync = !(bp->b_flags & B_ASYNC) && !doread && !is_dump &&
+ !(zv->zv_flags & ZVOL_WCE) && !zil_disable;
+
/*
* There must be no buffer changes when doing a dmu_sync() because
* we can't change the data whilst calculating the checksum.
@@ -1137,7 +1205,8 @@ zvol_strategy(buf_t *bp)
error = zvol_dumpio(zv, addr, off, size,
doread, B_FALSE);
} else if (doread) {
- error = dmu_read(os, ZVOL_OBJ, off, size, addr);
+ error = dmu_read(os, ZVOL_OBJ, off, size, addr,
+ DMU_READ_PREFETCH);
} else {
dmu_tx_t *tx = dmu_tx_create(os);
dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
@@ -1146,7 +1215,7 @@ zvol_strategy(buf_t *bp)
dmu_tx_abort(tx);
} else {
dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
- zvol_log_write(zv, tx, off, size);
+ zvol_log_write(zv, tx, off, size, sync);
dmu_tx_commit(tx);
}
}
@@ -1165,7 +1234,7 @@ zvol_strategy(buf_t *bp)
if ((bp->b_resid = resid) == bp->b_bcount)
bioerror(bp, off > volsize ? EINVAL : error);
- if (!(bp->b_flags & B_ASYNC) && !doread && !zil_disable && !is_dump)
+ if (sync)
zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ);
biodone(bp);
@@ -1280,6 +1349,7 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
uint64_t volsize;
rl_t *rl;
int error = 0;
+ boolean_t sync;
if (minor == 0) /* This is the control device */
return (ENXIO);
@@ -1299,6 +1369,8 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
return (error);
}
+ sync = !(zv->zv_flags & ZVOL_WCE) && !zil_disable;
+
rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
RL_WRITER);
while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
@@ -1317,14 +1389,14 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
}
error = dmu_write_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes, tx);
if (error == 0)
- zvol_log_write(zv, tx, off, bytes);
+ zvol_log_write(zv, tx, off, bytes, sync);
dmu_tx_commit(tx);
if (error)
break;
}
zfs_range_unlock(rl);
- if (!zil_disable)
+ if (sync)
zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ);
return (error);
}
@@ -1408,6 +1480,7 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
mutex_exit(&zvol_state_lock);
return (ENXIO);
}
+ ASSERT(zv->zv_total_opens > 0);
switch (cmd) {
@@ -1444,12 +1517,40 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
case DKIOCFLUSHWRITECACHE:
dkc = (struct dk_callback *)arg;
+ mutex_exit(&zvol_state_lock);
zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ);
if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) {
(*dkc->dkc_callback)(dkc->dkc_cookie, error);
error = 0;
}
- break;
+ return (error);
+
+ case DKIOCGETWCE:
+ {
+ int wce = (zv->zv_flags & ZVOL_WCE) ? 1 : 0;
+ if (ddi_copyout(&wce, (void *)arg, sizeof (int),
+ flag))
+ error = EFAULT;
+ break;
+ }
+ case DKIOCSETWCE:
+ {
+ int wce;
+ if (ddi_copyin((void *)arg, &wce, sizeof (int),
+ flag)) {
+ error = EFAULT;
+ break;
+ }
+ if (wce) {
+ zv->zv_flags |= ZVOL_WCE;
+ mutex_exit(&zvol_state_lock);
+ } else {
+ zv->zv_flags &= ~ZVOL_WCE;
+ mutex_exit(&zvol_state_lock);
+ zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ);
+ }
+ return (0);
+ }
case DKIOCGGEOM:
case DKIOCGVTOC:
@@ -1468,6 +1569,8 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
break;
case DKIOCDUMPFINI:
+ if (!(zv->zv_flags & ZVOL_DUMPIFIED))
+ break;
rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
RL_WRITER);
error = zvol_dump_fini(zv);