Add Linux user disk support

This topic branch contains all the changes needed to integrate the user-side ZFS tools with Linux-style devices. Primarily this includes fixing up the Solaris libefi library to be Linux-friendly, and integrating with the libblkid library, which is provided by e2fsprogs.

Signed-off-by: Brian Behlendorf <[email protected]>
author: Brian Behlendorf <[email protected]> 2010-08-26 11:56:53 -0700
committer: Brian Behlendorf <[email protected]> 2010-08-31 13:42:00 -0700
commit: d603ed6c278f9c25b17ba8e75e9bce6e5d715ac0 (patch)
tree: 61dc04f07ed9772dd97faefd879412e2a6578be0 /lib
parent: f1fb119f6bb0c3185ec88912e4488fdd9ec08ab2 (diff)
12 files changed, 997 insertions, 450 deletions
diff --git a/lib/libefi/include/sys/uuid.h b/lib/libefi/include/sys/uuid.h
index 9ce872e34..eab4622a6 100644
--- a/lib/libefi/include/sys/uuid.h
+++ b/lib/libefi/include/sys/uuid.h
@@ -74,12 +74,8 @@ struct uuid {
 	uint8_t		node_addr[6];
 };
 
-#define	UUID_LEN	16
-
 #define	UUID_PRINTABLE_STRING_LENGTH 37
 
-typedef uchar_t		uuid_t[UUID_LEN];
-
 /*
  * Convert a uuid to/from little-endian format
  */
diff --git a/lib/libefi/rdwr_efi.c b/lib/libefi/rdwr_efi.c
index e682b840a..da71e3486 100644
--- a/lib/libefi/rdwr_efi.c
+++ b/lib/libefi/rdwr_efi.c
@@ -29,6 +29,7 @@
 #include <strings.h>
 #include <unistd.h>
 #include <uuid/uuid.h>
+#include <zlib.h>
 #include <libintl.h>
 #include <sys/types.h>
 #include <sys/dkio.h>
@@ -38,7 +39,9 @@
 #include <sys/dktp/fdisk.h>
 #include <sys/efi_partition.h>
 #include <sys/byteorder.h>
-#include <sys/ddi.h>
+#if defined(__linux__)
+#include <linux/fs.h>
+#endif
 
 static struct uuid_to_ptag {
 	struct uuid	uuid;
@@ -49,11 +52,11 @@ static struct uuid_to_ptag {
 	{ EFI_SWAP },
 	{ EFI_USR },
 	{ EFI_BACKUP },
-	{ 0 },			/* STAND is never used */
+	{ EFI_UNUSED },		/* STAND is never used */
 	{ EFI_VAR },
 	{ EFI_HOME },
 	{ EFI_ALTSCTR },
-	{ 0 },			/* CACHE (cachefs) is never used */
+	{ EFI_UNUSED },		/* CACHE (cachefs) is never used */
 	{ EFI_RESERVED },
 	{ EFI_SYSTEM },
 	{ EFI_LEGACY_MBR },
@@ -107,19 +110,142 @@ int efi_debug = 1;
 int efi_debug = 0;
 #endif
 
-extern unsigned int	efi_crc32(const unsigned char *, unsigned int);
-static int		efi_read(int, struct dk_gpt *);
+static int efi_read(int, struct dk_gpt *);
+
+/*
+ * Return a 32-bit CRC of the contents of the buffer.  The pre- and
+ * post-conditioning (one's complement) is handled by crc32() internally.
+ */
+static uint32_t
+efi_crc32(const unsigned char *buf, unsigned int size)
+{
+	uint32_t crc = crc32(0, Z_NULL, 0);
+
+	crc = crc32(crc, buf, size);
+
+	return (crc);
+}
 
 static int
 read_disk_info(int fd, diskaddr_t *capacity, uint_t *lbsize)
 {
-	struct dk_minfo		disk_info;
+	int sector_size;
+	unsigned long long capacity_size;
+
+        if (ioctl(fd, BLKSSZGET, &sector_size) < 0)
+                return (-1);
+
+	if (ioctl(fd, BLKGETSIZE64, &capacity_size) < 0)
+		return (-1);
+
+	*lbsize = (uint_t)sector_size;
+	*capacity = (diskaddr_t)(capacity_size / sector_size);
+
+	return (0);
+}
 
-	if ((ioctl(fd, DKIOCGMEDIAINFO, (caddr_t)&disk_info)) == -1)
-		return (errno);
-	*capacity = disk_info.dki_capacity;
-	*lbsize = disk_info.dki_lbsize;
+static int
+efi_get_info(int fd, struct dk_cinfo *dki_info)
+{
+#if defined(__linux__)
+	char *path;
+	char *dev_path;
+	int rval = 0;
+
+	memset(dki_info, 0, sizeof(*dki_info));
+
+	path = calloc(PATH_MAX, 1);
+	if (path == NULL)
+		goto error;
+
+	/*
+	 * The simplest way to get the partition number under Linux is
+	 * to parse it out of the /dev/<disk><partition> block device name.
+	 * The kernel creates this using the partition number when it
+	 * populates /dev/ so it may be trusted.  The tricky bit here is
+	 * that the naming convention is based on the block device type,
+	 * so we need to take this into account when parsing out the
+	 * partition information.  Another issue is that the libefi
+	 * API only provides the open fd and not the file path.  To handle
+	 * this, realpath(3) is used to resolve the block device name from
+	 * /proc/self/fd/<fd>.  Aside from the partition number we collect
+	 * some additional device info.
+	 */
+	(void) sprintf(path, "/proc/self/fd/%d", fd);
+	dev_path = realpath(path, NULL);
+	free(path);
+
+	if (dev_path == NULL)
+		goto error;
+
+	if ((strncmp(dev_path, "/dev/sd", 7) == 0)) {
+		strcpy(dki_info->dki_cname, "sd");
+		dki_info->dki_ctype = DKC_SCSI_CCS;
+		rval = sscanf(dev_path, "/dev/%[a-zA-Z]%hu",
+			      dki_info->dki_dname,
+			      &dki_info->dki_partition);
+	} else if ((strncmp(dev_path, "/dev/hd", 7) == 0)) {
+		strcpy(dki_info->dki_cname, "hd");
+		dki_info->dki_ctype = DKC_DIRECT;
+		rval = sscanf(dev_path, "/dev/%[a-zA-Z]%hu",
+			      dki_info->dki_dname,
+			      &dki_info->dki_partition);
+	} else if ((strncmp(dev_path, "/dev/md", 7) == 0)) {
+		strcpy(dki_info->dki_cname, "pseudo");
+		dki_info->dki_ctype = DKC_MD;
+		rval = sscanf(dev_path, "/dev/%[a-zA-Z0-9]p%hu",
+			      dki_info->dki_dname,
+			      &dki_info->dki_partition);
+	} else if ((strncmp(dev_path, "/dev/dm-", 8) == 0)) {
+		strcpy(dki_info->dki_cname, "pseudo");
+		dki_info->dki_ctype = DKC_VBD;
+		rval = sscanf(dev_path, "/dev/%[a-zA-Z0-9-]p%hu",
+			      dki_info->dki_dname,
+			      &dki_info->dki_partition);
+	} else if ((strncmp(dev_path, "/dev/ram", 8) == 0)) {
+		strcpy(dki_info->dki_cname, "pseudo");
+		dki_info->dki_ctype = DKC_PCMCIA_MEM;
+		rval = sscanf(dev_path, "/dev/%[a-zA-Z0-9]p%hu",
+			      dki_info->dki_dname,
+			      &dki_info->dki_partition);
+	} else if ((strncmp(dev_path, "/dev/loop", 9) == 0)) {
+		strcpy(dki_info->dki_cname, "pseudo");
+		dki_info->dki_ctype = DKC_VBD;
+		rval = sscanf(dev_path, "/dev/%[a-zA-Z0-9]p%hu",
+			      dki_info->dki_dname,
+			      &dki_info->dki_partition);
+	} else {
+		strcpy(dki_info->dki_dname, "unknown");
+		strcpy(dki_info->dki_cname, "unknown");
+		dki_info->dki_ctype = DKC_UNKNOWN;
+	}
+
+	switch (rval) {
+	case 0:
+		errno = EINVAL;
+		goto error;
+	case 1:
+		dki_info->dki_partition = 0;
+	}
+
+	free(dev_path);
+#else
+	if (ioctl(fd, DKIOCINFO, (caddr_t)dki_info) == -1)
+		goto error;
+#endif
 	return (0);
+error:
+	if (efi_debug)
+		(void) fprintf(stderr, "DKIOCINFO errno 0x%x\n", errno);
+
+	switch (errno) {
+	case EIO:
+		return (VT_EIO);
+	case EINVAL:
+		return (VT_EINVAL);
+	default:
+		return (VT_ERROR);
+	}
 }
 
 /*
@@ -135,12 +261,13 @@ read_disk_info(int fd, diskaddr_t *capacity, uint_t *lbsize)
 int
 efi_alloc_and_init(int fd, uint32_t nparts, struct dk_gpt **vtoc)
 {
-	diskaddr_t	capacity;
-	uint_t		lbsize;
+	diskaddr_t	capacity = 0;
+	uint_t		lbsize = 0;
 	uint_t		nblocks;
 	size_t		length;
 	struct dk_gpt	*vptr;
 	struct uuid	uuid;
+	struct dk_cinfo	dki_info;
 
 	if (read_disk_info(fd, &capacity, &lbsize) != 0) {
 		if (efi_debug)
@@ -148,6 +275,22 @@ efi_alloc_and_init(int fd, uint32_t nparts, struct dk_gpt **vtoc)
 			    "couldn't read disk information\n");
 		return (-1);
 	}
+#if defined(__linux__)
+	if (efi_get_info(fd, &dki_info) != 0) {
+		if (efi_debug)
+			(void) fprintf(stderr,
+			    "couldn't read disk information\n");
+		return (-1);
+	}
+
+	if (dki_info.dki_partition != 0)
+		return (-1);
+
+	if ((dki_info.dki_ctype == DKC_PCMCIA_MEM) ||
+	    (dki_info.dki_ctype == DKC_VBD) ||
+	    (dki_info.dki_ctype == DKC_UNKNOWN))
+		return (-1);
+#endif
 
 	nblocks = NBLOCKS(nparts, lbsize);
 	if ((nblocks * lbsize) < EFI_MIN_ARRAY_SIZE + lbsize) {
@@ -243,14 +386,138 @@ efi_ioctl(int fd, int cmd, dk_efi_t *dk_ioc)
 {
 	void *data = dk_ioc->dki_data;
 	int error;
+#if defined(__linux__)
+	diskaddr_t capacity;
+	uint_t lbsize;
+
+	/*
+	 * When the IO is not being performed in kernel as an ioctl we need
+	 * to know the sector size so we can seek to the proper byte offset.
+	 */
+	if (read_disk_info(fd, &capacity, &lbsize) == -1) {
+		if (efi_debug)
+			fprintf(stderr,"unable to read disk info: %d",errno);
+
+		errno = EIO;
+		return -1;
+	}
+
+	switch (cmd) {
+	case DKIOCGETEFI:
+		if (lbsize == 0) {
+			if (efi_debug)
+				(void) fprintf(stderr, "DKIOCGETEFI assuming "
+					       "LBA %d bytes\n", DEV_BSIZE);
+
+			lbsize = DEV_BSIZE;
+		}
+
+		error = lseek(fd, dk_ioc->dki_lba * lbsize, SEEK_SET);
+		if (error == -1) {
+			if (efi_debug)
+				(void) fprintf(stderr, "DKIOCGETEFI lseek "
+				               "error: %d\n", errno);
+			return error;
+		}
+
+		error = read(fd, data, dk_ioc->dki_length);
+		if (error == -1) {
+			if (efi_debug)
+				(void) fprintf(stderr, "DKIOCGETEFI read "
+				               "error: %d\n", errno);
+			return error;
+		}
 
+		if (error != dk_ioc->dki_length) {
+			if (efi_debug)
+				(void) fprintf(stderr, "DKIOCGETEFI short "
+					       "read of %d bytes\n", error);
+			errno = EIO;
+			return -1;
+		}
+		error = 0;
+		break;
+
+	case DKIOCSETEFI:
+		if (lbsize == 0) {
+			if (efi_debug)
+				(void) fprintf(stderr, "DKIOCSETEFI unknown "
+					       "LBA size\n");
+			errno = EIO;
+			return -1;
+		}
+
+		error = lseek(fd, dk_ioc->dki_lba * lbsize, SEEK_SET);
+		if (error == -1) {
+			if (efi_debug)
+				(void) fprintf(stderr, "DKIOCSETEFI lseek "
+				               "error: %d\n", errno);
+			return error;
+		}
+
+		error = write(fd, data, dk_ioc->dki_length);
+		if (error == -1) {
+			if (efi_debug)
+				(void) fprintf(stderr, "DKIOCSETEFI write "
+				               "error: %d\n", errno);
+			return error;
+		}
+
+		if (error != dk_ioc->dki_length) {
+			if (efi_debug)
+				(void) fprintf(stderr, "DKIOCSETEFI short "
+					       "write of %d bytes\n", error);
+			errno = EIO;
+			return -1;
+		}
+
+		/* Sync the new EFI table to disk */
+		error = fsync(fd);
+		if (error == -1)
+			return error;
+
+		/* Ensure any local disk cache is also flushed */
+		if (ioctl(fd, BLKFLSBUF, 0) == -1)
+			return error;
+
+		error = 0;
+		break;
+
+	default:
+		if (efi_debug)
+			(void) fprintf(stderr, "unsupported ioctl()\n");
+
+		errno = EIO;
+		return -1;
+	}
+#else
 	dk_ioc->dki_data_64 = (uint64_t)(uintptr_t)data;
 	error = ioctl(fd, cmd, (void *)dk_ioc);
 	dk_ioc->dki_data = data;
-
+#endif
 	return (error);
 }
 
+#if defined(__linux__)
+static int
+efi_rescan(int fd)
+{
+	int retry = 5;
+	int error;
+
+	/* Notify the kernel that a device's partition table was updated */
+	while ((error = ioctl(fd, BLKRRPART)) != 0) {
+		if (--retry == 0) {
+			(void) fprintf(stderr, "the kernel failed to rescan "
+				       "the partition table: %d\n", errno);
+			return (-1);
+		}
+	}
+
+	return (0);
+}
+#endif
+
 static int
 check_label(int fd, dk_efi_t *dk_ioc)
 {
@@ -305,6 +572,8 @@ efi_read(int fd, struct dk_gpt *vtoc)
 	int			rval = 0;
 	int			md_flag = 0;
 	int			vdc_flag = 0;
+	diskaddr_t		capacity = 0;
+	uint_t			lbsize = 0;
 	struct dk_minfo		disk_info;
 	dk_efi_t		dk_ioc;
 	efi_gpt_t		*efi;
@@ -316,19 +585,9 @@ efi_read(int fd, struct dk_gpt *vtoc)
 	/*
 	 * get the partition number for this file descriptor.
 	 */
-	if (ioctl(fd, DKIOCINFO, (caddr_t)&dki_info) == -1) {
-		if (efi_debug) {
-			(void) fprintf(stderr, "DKIOCINFO errno 0x%x\n", errno);
-		}
-		switch (errno) {
-		case EIO:
-			return (VT_EIO);
-		case EINVAL:
-			return (VT_EINVAL);
-		default:
-			return (VT_ERROR);
-		}
-	}
+	if ((rval = efi_get_info(fd, &dki_info)) != 0)
+		return rval;
+
 	if ((strncmp(dki_info.dki_cname, "pseudo", 7) == 0) &&
 	    (strncmp(dki_info.dki_dname, "md", 3) == 0)) {
 		md_flag++;
@@ -342,14 +601,18 @@ efi_read(int fd, struct dk_gpt *vtoc)
 	}
 
 	/* get the LBA size */
-	if (ioctl(fd, DKIOCGMEDIAINFO, (caddr_t)&disk_info) == -1) {
+	if (read_disk_info(fd, &capacity, &lbsize) == -1) {
 		if (efi_debug) {
 			(void) fprintf(stderr,
-			    "assuming LBA 512 bytes %d\n",
-			    errno);
+				       "unable to read disk info: %d",
+				       errno);
 		}
-		disk_info.dki_lbsize = DEV_BSIZE;
+		return (VT_EINVAL);
 	}
+
+	disk_info.dki_lbsize = lbsize;
+	disk_info.dki_capacity = capacity;
+
 	if (disk_info.dki_lbsize == 0) {
 		if (efi_debug) {
 			(void) fprintf(stderr,
@@ -374,9 +637,11 @@ efi_read(int fd, struct dk_gpt *vtoc)
 		}
 	}
 
-	if ((dk_ioc.dki_data = calloc(label_len, 1)) == NULL)
+	if (posix_memalign((void **)&dk_ioc.dki_data,
+		           disk_info.dki_lbsize, label_len))
 		return (VT_ERROR);
 
+	memset(dk_ioc.dki_data, 0, label_len);
 	dk_ioc.dki_length = disk_info.dki_lbsize;
 	user_length = vtoc->efi_nparts;
 	efi = dk_ioc.dki_data;
@@ -572,12 +837,14 @@ write_pmbr(int fd, struct dk_gpt *vtoc)
 	int		len;
 
 	len = (vtoc->efi_lbasize == 0) ? sizeof (mb) : vtoc->efi_lbasize;
-	buf = calloc(len, 1);
+	if (posix_memalign((void **)&buf, len, len))
+		return (VT_ERROR);
 
 	/*
 	 * Preserve any boot code and disk signature if the first block is
 	 * already an MBR.
 	 */
+	memset(buf, 0, len);
 	dk_ioc.dki_lba = 0;
 	dk_ioc.dki_length = len;
 	/* LINTED -- always longlong aligned */
@@ -663,10 +930,9 @@ check_input(struct dk_gpt *vtoc)
 		if ((vtoc->efi_parts[i].p_tag == V_UNASSIGNED) &&
 		    (vtoc->efi_parts[i].p_size != 0)) {
 			if (efi_debug) {
-				(void) fprintf(stderr,
-"partition %d is \"unassigned\" but has a size of %llu",
-				    i,
-				    vtoc->efi_parts[i].p_size);
+				(void) fprintf(stderr, "partition %d is "
+				    "\"unassigned\" but has a size of %llu",
+				    i, vtoc->efi_parts[i].p_size);
 			}
 			return (VT_EINVAL);
 		}
@@ -679,9 +945,9 @@ check_input(struct dk_gpt *vtoc)
 		if (vtoc->efi_parts[i].p_tag == V_RESERVED) {
 			if (resv_part != -1) {
 				if (efi_debug) {
-					(void) fprintf(stderr,
-"found duplicate reserved partition at %d\n",
-					    i);
+					(void) fprintf(stderr, "found "
+					    "duplicate reserved partition "
+					    "at %d\n", i);
 				}
 				return (VT_EINVAL);
 			}
@@ -732,8 +998,8 @@ check_input(struct dk_gpt *vtoc)
 				    (istart <= endsect)) {
 					if (efi_debug) {
 						(void) fprintf(stderr,
-"Partition %d overlaps partition %d.",
-						    i, j);
+						    "Partition %d overlaps "
+						    "partition %d.", i, j);
 					}
 					return (VT_EINVAL);
 				}
@@ -839,22 +1105,13 @@ efi_write(int fd, struct dk_gpt *vtoc)
 	efi_gpe_t		*efi_parts;
 	int			i, j;
 	struct dk_cinfo		dki_info;
+	int			rval;
 	int			md_flag = 0;
 	int			nblocks;
 	diskaddr_t		lba_backup_gpt_hdr;
 
-	if (ioctl(fd, DKIOCINFO, (caddr_t)&dki_info) == -1) {
-		if (efi_debug)
-			(void) fprintf(stderr, "DKIOCINFO errno 0x%x\n", errno);
-		switch (errno) {
-		case EIO:
-			return (VT_EIO);
-		case EINVAL:
-			return (VT_EINVAL);
-		default:
-			return (VT_ERROR);
-		}
-	}
+	if ((rval = efi_get_info(fd, &dki_info)) != 0)
+		return rval;
 
 	/* check if we are dealing wih a metadevice */
 	if ((strncmp(dki_info.dki_cname, "pseudo", 7) == 0) &&
@@ -892,9 +1149,11 @@ efi_write(int fd, struct dk_gpt *vtoc)
 	 * for backup GPT header.
 	 */
 	lba_backup_gpt_hdr = vtoc->efi_last_u_lba + 1 + nblocks;
-	if ((dk_ioc.dki_data = calloc(dk_ioc.dki_length, 1)) == NULL)
+	if (posix_memalign((void **)&dk_ioc.dki_data,
+		           vtoc->efi_lbasize, dk_ioc.dki_length))
 		return (VT_ERROR);
 
+	memset(dk_ioc.dki_data, 0, dk_ioc.dki_length);
 	efi = dk_ioc.dki_data;
 
 	/* stuff user's input into EFI struct */
@@ -941,6 +1200,10 @@ efi_write(int fd, struct dk_gpt *vtoc)
 			return (VT_EINVAL);
 		}
 
+		/* Zeros should be written for empty partitions */
+		if (vtoc->efi_parts[i].p_tag == V_UNASSIGNED)
+			continue;
+
 		efi_parts[i].efi_gpe_StartingLBA =
 		    LE_64(vtoc->efi_parts[i].p_start);
 		efi_parts[i].efi_gpe_EndingLBA =
@@ -1032,6 +1295,13 @@ efi_write(int fd, struct dk_gpt *vtoc)
 	/* write the PMBR */
 	(void) write_pmbr(fd, vtoc);
 	free(dk_ioc.dki_data);
+
+#if defined(__linux__)
+	rval = efi_rescan(fd);
+	if (rval)
+		return (VT_ERROR);
+#endif
+
 	return (0);
 }
 
@@ -1049,6 +1319,7 @@ efi_free(struct dk_gpt *ptr)
 int
 efi_type(int fd)
 {
+#if 0
 	struct vtoc vtoc;
 	struct extvtoc extvtoc;
 
@@ -1062,6 +1333,9 @@ efi_type(int fd)
 		}
 	}
 	return (0);
+#else
+	return (ENOSYS);
+#endif
 }
 
 void
@@ -1175,7 +1449,7 @@ efi_auto_sense(int fd, struct dk_gpt **vtoc)
 		return (-1);
 	}
 
-	for (i = 0; i < min((*vtoc)->efi_nparts, V_NUMPAR); i++) {
+	for (i = 0; i < MIN((*vtoc)->efi_nparts, V_NUMPAR); i++) {
 		(*vtoc)->efi_parts[i].p_tag = default_vtoc_map[i].p_tag;
 		(*vtoc)->efi_parts[i].p_flag = default_vtoc_map[i].p_flag;
 		(*vtoc)->efi_parts[i].p_start = 0;
diff --git a/lib/libzfs/include/libzfs.h b/lib/libzfs/include/libzfs.h
index 197e2eefc..dcbd283ac 100644
--- a/lib/libzfs/include/libzfs.h
+++ b/lib/libzfs/include/libzfs.h
@@ -49,6 +49,26 @@ extern "C" {
 #define	ZPOOL_MAXPROPLEN	MAXPATHLEN
 
 /*
+ * Default device paths
+ */
+
+#if defined(__sun__) || defined(__sun)
+#define	DISK_ROOT	"/dev/dsk"
+#define	RDISK_ROOT	"/dev/rdsk"
+#define	UDISK_ROOT	RDISK_ROOT
+#define	FIRST_SLICE	"s0"
+#define	BACKUP_SLICE	"s2"
+#endif
+
+#ifdef __linux__
+#define	DISK_ROOT	"/dev"
+#define	RDISK_ROOT	DISK_ROOT
+#define	UDISK_ROOT	"/dev/disk"
+#define	FIRST_SLICE	"1"
+#define	BACKUP_SLICE	""
+#endif
+
+/*
  * libzfs errors
  */
 enum {
@@ -248,6 +268,7 @@ extern nvlist_t *zpool_find_vdev(zpool_handle_t *, const char *, boolean_t *,
     boolean_t *, boolean_t *);
 extern nvlist_t *zpool_find_vdev_by_physpath(zpool_handle_t *, const char *,
     boolean_t *, boolean_t *, boolean_t *);
+extern int zpool_label_disk_wait(char *, int);
 extern int zpool_label_disk(libzfs_handle_t *, zpool_handle_t *, char *);
 
 /*
@@ -661,9 +682,6 @@ extern int zpool_in_use(libzfs_handle_t *, int, pool_state_t *, char **,
 extern int zpool_read_label(int, nvlist_t **);
 extern int zpool_clear_label(int);
 
-/* is this zvol valid for use as a dump device? */
-extern int zvol_check_dump_config(char *);
-
 /*
  * Management interfaces for SMB ACL files
  */
diff --git a/lib/libzfs/include/libzfs_impl.h b/lib/libzfs/include/libzfs_impl.h
index 3d001df07..2389b7823 100644
--- a/lib/libzfs/include/libzfs_impl.h
+++ b/lib/libzfs/include/libzfs_impl.h
@@ -191,6 +191,8 @@ zfs_handle_t *make_dataset_handle(libzfs_handle_t *, const char *);
 
 int zpool_open_silent(libzfs_handle_t *, const char *, zpool_handle_t **);
 
+int zvol_create_link(libzfs_handle_t *, const char *);
+int zvol_remove_link(libzfs_handle_t *, const char *);
 boolean_t zpool_name_valid(libzfs_handle_t *, boolean_t, const char *);
 
 int zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type,
diff --git a/lib/libzfs/libzfs_changelist.c b/lib/libzfs/libzfs_changelist.c
index 0bcfc0423..6f067d563 100644
--- a/lib/libzfs/libzfs_changelist.c
+++ b/lib/libzfs/libzfs_changelist.c
@@ -93,6 +93,7 @@ struct prop_changelist {
 int
 changelist_prefix(prop_changelist_t *clp)
 {
+#ifdef HAVE_ZPL
 	prop_changenode_t *cn;
 	int ret = 0;
 
@@ -141,6 +142,9 @@ changelist_prefix(prop_changelist_t *clp)
 		(void) changelist_postfix(clp);
 
 	return (ret);
+#else
+	return 0;
+#endif  /* HAVE_ZPL */
 }
 
 /*
@@ -155,6 +159,7 @@ changelist_prefix(prop_changelist_t *clp)
 int
 changelist_postfix(prop_changelist_t *clp)
 {
+#ifdef HAVE_ZPL
 	prop_changenode_t *cn;
 	char shareopts[ZFS_MAXPROPLEN];
 	int errors = 0;
@@ -255,6 +260,9 @@ changelist_postfix(prop_changelist_t *clp)
 	}
 
 	return (errors ? -1 : 0);
+#else
+	return 0;
+#endif  /* HAVE_ZPL */
 }
 
 /*
@@ -317,6 +325,7 @@ changelist_rename(prop_changelist_t *clp, const char *src, const char *dst)
 int
 changelist_unshare(prop_changelist_t *clp, zfs_share_proto_t *proto)
 {
+#ifdef HAVE_ZPL
 	prop_changenode_t *cn;
 	int ret = 0;
 
@@ -331,6 +340,9 @@ changelist_unshare(prop_changelist_t *clp, zfs_share_proto_t *proto)
 	}
 
 	return (ret);
+#else
+	return 0;
+#endif
 }
 
 /*
diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c
index baf289b64..d876e5d1f 100644
--- a/lib/libzfs/libzfs_dataset.c
+++ b/lib/libzfs/libzfs_dataset.c
@@ -57,6 +57,7 @@
 #include "libzfs_impl.h"
 #include "zfs_deleg.h"
 
+static int zvol_create_link_common(libzfs_handle_t *, const char *, int);
 static int userquota_propname_decode(const char *propname, boolean_t zoned,
     zfs_userquota_prop_t *typep, char *domain, int domainlen, uint64_t *ridp);
 
@@ -994,6 +995,7 @@ badlabel:
 
 			/*FALLTHRU*/
 
+#ifdef HAVE_ZPL
 		case ZFS_PROP_SHARESMB:
 		case ZFS_PROP_SHARENFS:
 			/*
@@ -1104,6 +1106,7 @@ badlabel:
 			}
 
 			break;
+#endif /* HAVE_ZPL */
 		case ZFS_PROP_UTF8ONLY:
 			chosen_utf = (int)intval;
 			break;
@@ -2742,6 +2745,7 @@ create_parents(libzfs_handle_t *hdl, char *target, int prefixlen)
 			goto ancestorerr;
 		}
 
+#ifdef HAVE_ZPL
 		if (zfs_mount(h, NULL, 0) != 0) {
 			opname = dgettext(TEXT_DOMAIN, "mount");
 			goto ancestorerr;
@@ -2751,6 +2755,7 @@ create_parents(libzfs_handle_t *hdl, char *target, int prefixlen)
 			opname = dgettext(TEXT_DOMAIN, "share");
 			goto ancestorerr;
 		}
+#endif /* HAVE_ZPL */
 
 		zfs_close(h);
 	}
@@ -2887,6 +2892,18 @@ zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type,
 	/* create the dataset */
 	ret = zfs_ioctl(hdl, ZFS_IOC_CREATE, &zc);
 
+	if (ret == 0 && type == ZFS_TYPE_VOLUME) {
+		ret = zvol_create_link(hdl, path);
+		if (ret) {
+			(void) zfs_standard_error(hdl, errno,
+			    dgettext(TEXT_DOMAIN,
+			    "Volume successfully created, but device links "
+			    "were not created"));
+			zcmd_free_nvlists(&zc);
+			return (-1);
+		}
+	}
+
 	zcmd_free_nvlists(&zc);
 
 	/* check for failure */
@@ -2949,6 +2966,9 @@ zfs_destroy(zfs_handle_t *zhp, boolean_t defer)
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 
 	if (ZFS_IS_VOLUME(zhp)) {
+		if (zvol_remove_link(zhp->zfs_hdl, zhp->zfs_name) != 0)
+			return (-1);
+
 		zc.zc_objset_type = DMU_OST_ZVOL;
 	} else {
 		zc.zc_objset_type = DMU_OST_ZFS;
@@ -2991,9 +3011,17 @@ zfs_check_snap_cb(zfs_handle_t *zhp, void *arg)
 		zfs_close(szhp);
 	}
 
+	if (zhp->zfs_type == ZFS_TYPE_VOLUME) {
+		(void) zvol_remove_link(zhp->zfs_hdl, name);
+		/*
+		 * NB: this is simply a best-effort.  We don't want to
+		 * return an error, because then we wouldn't visit all
+		 * the volumes.
+		 */
+	}
+
 	dd->closezhp = B_TRUE;
-	if (!dd->gotone)
-		rv = zfs_iter_filesystems(zhp, zfs_check_snap_cb, arg);
+	rv = zfs_iter_filesystems(zhp, zfs_check_snap_cb, arg);
 	if (closezhp)
 		zfs_close(zhp);
 	return (rv);
@@ -3128,11 +3156,70 @@ zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t *props)
 			return (zfs_standard_error(zhp->zfs_hdl, errno,
 			    errbuf));
 		}
+	} else if (ZFS_IS_VOLUME(zhp)) {
+		ret = zvol_create_link(zhp->zfs_hdl, target);
 	}
 
 	return (ret);
 }
 
+typedef struct promote_data {
+	char cb_mountpoint[MAXPATHLEN];
+	const char *cb_target;
+	const char *cb_errbuf;
+	uint64_t cb_pivot_txg;
+} promote_data_t;
+
+static int
+promote_snap_cb(zfs_handle_t *zhp, void *data)
+{
+	promote_data_t *pd = data;
+	zfs_handle_t *szhp;
+	char snapname[MAXPATHLEN];
+	int rv = 0;
+
+	/* We don't care about snapshots after the pivot point */
+	if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > pd->cb_pivot_txg) {
+		zfs_close(zhp);
+		return (0);
+	}
+
+	/* Remove the device link if it's a zvol. */
+	if (ZFS_IS_VOLUME(zhp))
+		(void) zvol_remove_link(zhp->zfs_hdl, zhp->zfs_name);
+
+	/* Check for conflicting names */
+	(void) strlcpy(snapname, pd->cb_target, sizeof (snapname));
+	(void) strlcat(snapname, strchr(zhp->zfs_name, '@'), sizeof (snapname));
+	szhp = make_dataset_handle(zhp->zfs_hdl, snapname);
+	if (szhp != NULL) {
+		zfs_close(szhp);
+		zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
+		    "snapshot name '%s' from origin \n"
+		    "conflicts with '%s' from target"),
+		    zhp->zfs_name, snapname);
+		rv = zfs_error(zhp->zfs_hdl, EZFS_EXISTS, pd->cb_errbuf);
+	}
+	zfs_close(zhp);
+	return (rv);
+}
+
+static int
+promote_snap_done_cb(zfs_handle_t *zhp, void *data)
+{
+	promote_data_t *pd = data;
+
+	/* Only snapshots at or before the pivot point need new links */
+	if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) <= pd->cb_pivot_txg) {
+		/* Create the device link if it's a zvol. */
+		if (ZFS_IS_VOLUME(zhp))
+			(void) zvol_create_link(zhp->zfs_hdl, zhp->zfs_name);
+	}
+
+	zfs_close(zhp);
+	return (0);
+}
+
 /*
  * Promotes the given clone fs to be the clone parent.
  */
@@ -3142,7 +3229,10 @@ zfs_promote(zfs_handle_t *zhp)
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 };
 	char parent[MAXPATHLEN];
+	char *cp;
 	int ret;
+	zfs_handle_t *pzhp;
+	promote_data_t pd;
 	char errbuf[1024];
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
@@ -3160,7 +3250,29 @@ zfs_promote(zfs_handle_t *zhp)
 		    "not a cloned filesystem"));
 		return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
 	}
+	cp = strchr(parent, '@');
+	*cp = '\0';
+
+	/* Walk the snapshots we will be moving */
+	pzhp = zfs_open(hdl, zhp->zfs_dmustats.dds_origin, ZFS_TYPE_SNAPSHOT);
+	if (pzhp == NULL)
+		return (-1);
+	pd.cb_pivot_txg = zfs_prop_get_int(pzhp, ZFS_PROP_CREATETXG);
+	zfs_close(pzhp);
+	pd.cb_target = zhp->zfs_name;
+	pd.cb_errbuf = errbuf;
+	pzhp = zfs_open(hdl, parent, ZFS_TYPE_DATASET);
+	if (pzhp == NULL)
+		return (-1);
+	(void) zfs_prop_get(pzhp, ZFS_PROP_MOUNTPOINT, pd.cb_mountpoint,
+	    sizeof (pd.cb_mountpoint), NULL, NULL, 0, FALSE);
+	ret = zfs_iter_snapshots(pzhp, promote_snap_cb, &pd);
+	if (ret != 0) {
+		zfs_close(pzhp);
+		return (-1);
+	}
 
+	/* issue the ioctl */
 	(void) strlcpy(zc.zc_value, zhp->zfs_dmustats.dds_origin,
 	    sizeof (zc.zc_value));
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
@@ -3169,9 +3281,16 @@ zfs_promote(zfs_handle_t *zhp)
 	if (ret != 0) {
 		int save_errno = errno;
 
+		(void) zfs_iter_snapshots(pzhp, promote_snap_done_cb, &pd);
+		zfs_close(pzhp);
+
 		switch (save_errno) {
 		case EEXIST:
-			/* There is a conflicting snapshot name. */
+			/*
+			 * There is a conflicting snapshot name.  We
+			 * should have caught this above, but they could
+			 * have renamed something in the meantime.
+			 */
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "conflicting snapshot '%s' from parent '%s'"),
 			    zc.zc_string, parent);
@@ -3180,7 +3299,44 @@ zfs_promote(zfs_handle_t *zhp)
 		default:
 			return (zfs_standard_error(hdl, save_errno, errbuf));
 		}
+	} else {
+		(void) zfs_iter_snapshots(zhp, promote_snap_done_cb, &pd);
 	}
+
+	zfs_close(pzhp);
+	return (ret);
+}
+
+struct createdata {
+	const char *cd_snapname;
+	int cd_ifexists;
+};
+
+static int
+zfs_create_link_cb(zfs_handle_t *zhp, void *arg)
+{
+	struct createdata *cd = arg;
+	int ret;
+
+	if (zhp->zfs_type == ZFS_TYPE_VOLUME) {
+		char name[MAXPATHLEN];
+
+		(void) strlcpy(name, zhp->zfs_name, sizeof (name));
+		(void) strlcat(name, "@", sizeof (name));
+		(void) strlcat(name, cd->cd_snapname, sizeof (name));
+		(void) zvol_create_link_common(zhp->zfs_hdl, name,
+		    cd->cd_ifexists);
+		/*
+		 * NB: this is simply a best-effort.  We don't want to
+		 * return an error, because then we wouldn't visit all
+		 * the volumes.
+		 */
+	}
+
+	ret = zfs_iter_filesystems(zhp, zfs_create_link_cb, cd);
+
+	zfs_close(zhp);
+
 	return (ret);
 }
 
@@ -3244,12 +3400,32 @@ zfs_snapshot(libzfs_handle_t *hdl, const char *path, boolean_t recursive,
 	 * if it was recursive, the one that actually failed will be in
 	 * zc.zc_name.
 	 */
-	if (ret != 0) {
+	if (ret != 0)
 		(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 		    "cannot create snapshot '%s@%s'"), zc.zc_name, zc.zc_value);
-		(void) zfs_standard_error(hdl, errno, errbuf);
+
+	if (ret == 0 && recursive) {
+		struct createdata cd;
+
+		cd.cd_snapname = delim + 1;
+		cd.cd_ifexists = B_FALSE;
+		(void) zfs_iter_filesystems(zhp, zfs_create_link_cb, &cd);
+	}
+	if (ret == 0 && zhp->zfs_type == ZFS_TYPE_VOLUME) {
+		ret = zvol_create_link(zhp->zfs_hdl, path);
+		if (ret != 0) {
+			(void) zfs_standard_error(hdl, errno,
+			    dgettext(TEXT_DOMAIN,
+			    "Volume successfully snapshotted, but device links "
+			    "were not created"));
+			zfs_close(zhp);
+			return (-1);
+		}
 	}
 
+	if (ret != 0)
+		(void) zfs_standard_error(hdl, errno, errbuf);
+
 	zfs_close(zhp);
 
 	return (ret);
@@ -3351,6 +3527,8 @@ zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force)
 	 */
 
 	if (zhp->zfs_type == ZFS_TYPE_VOLUME) {
+		if (zvol_remove_link(zhp->zfs_hdl, zhp->zfs_name) != 0)
+			return (-1);
 		if (zfs_which_resv_prop(zhp, &resv_prop) < 0)
 			return (-1);
 		old_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE);
@@ -3388,6 +3566,10 @@ zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force)
 	 */
 	if ((zhp->zfs_type == ZFS_TYPE_VOLUME) &&
 	    (zhp = make_dataset_handle(zhp->zfs_hdl, zhp->zfs_name))) {
+		if ((err = zvol_create_link(zhp->zfs_hdl, zhp->zfs_name))) {
+			zfs_close(zhp);
+			return (err);
+		}
 		if (restore_resv) {
 			new_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE);
 			if (old_volsize != new_volsize)
@@ -3536,6 +3718,7 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive)
 	}
 
 	if (recursive) {
+		struct destroydata dd;
 
 		parentname = zfs_strdup(zhp->zfs_hdl, zhp->zfs_name);
 		if (parentname == NULL) {
@@ -3550,6 +3733,15 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive)
 			goto error;
 		}
 
+		dd.snapname = delim + 1;
+		dd.gotone = B_FALSE;
+		dd.closezhp = B_TRUE;
+
+		/* We remove any zvol links prior to renaming them */
+		ret = zfs_iter_filesystems(zhrp, zfs_check_snap_cb, &dd);
+		if (ret) {
+			goto error;
+		}
 	} else {
 		if ((cl = changelist_gather(zhp, ZFS_PROP_NAME, 0, 0)) == NULL)
 			return (-1);
@@ -3598,10 +3790,27 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive)
 		 * On failure, we still want to remount any filesystems that
 		 * were previously mounted, so we don't alter the system state.
 		 */
-		if (!recursive)
+		if (recursive) {
+			struct createdata cd;
+
+			/* only create links for datasets that had existed */
+			cd.cd_snapname = delim + 1;
+			cd.cd_ifexists = B_TRUE;
+			(void) zfs_iter_filesystems(zhrp, zfs_create_link_cb,
+			    &cd);
+		} else {
 			(void) changelist_postfix(cl);
+		}
 	} else {
-		if (!recursive) {
+		if (recursive) {
+			struct createdata cd;
+
+			/* only create links for datasets that had existed */
+			cd.cd_snapname = strchr(target, '@') + 1;
+			cd.cd_ifexists = B_TRUE;
+			ret = zfs_iter_filesystems(zhrp, zfs_create_link_cb,
+			    &cd);
+		} else {
 			changelist_rename(cl, zfs_get_name(zhp), target);
 			ret = changelist_postfix(cl);
 		}
@@ -3620,19 +3829,103 @@ error:
 	return (ret);
 }
 
-nvlist_t *
-zfs_get_user_props(zfs_handle_t *zhp)
+/*
+ * Given a zvol dataset, issue the ioctl to create the appropriate minor node,
+ * and wait briefly for udev to create the /dev link.
+ */
+int
+zvol_create_link(libzfs_handle_t *hdl, const char *dataset)
 {
-	return (zhp->zfs_user_props);
+	return (zvol_create_link_common(hdl, dataset, B_FALSE));
+}
+
+static int
+zvol_create_link_common(libzfs_handle_t *hdl, const char *dataset, int ifexists)
+{
+	zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 };
+	char path[MAXPATHLEN];
+	int error;
+
+	(void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
+
+	/*
+	 * Issue the appropriate ioctl.
+	 */
+	if (ioctl(hdl->libzfs_fd, ZFS_IOC_CREATE_MINOR, &zc) != 0) {
+		switch (errno) {
+		case EEXIST:
+			/*
+			 * Silently ignore the case where the link already
+			 * exists.  This allows 'zfs volinit' to be run multiple
+			 * times without errors.
+			 */
+			return (0);
+
+		case ENOENT:
+			/*
+			 * Dataset does not exist in the kernel.  If we
+			 * don't care (see zfs_rename), then ignore the
+			 * error quietly.
+			 */
+			if (ifexists) {
+				return (0);
+			}
+
+			/* FALLTHROUGH */
+
+		default:
+			return (zfs_standard_error_fmt(hdl, errno,
+			    dgettext(TEXT_DOMAIN, "cannot create device links "
+			    "for '%s'"), dataset));
+		}
+	}
+
+	/*
+	 * Wait up to 10 seconds for udev to create the device.
+	 */
+	(void) snprintf(path, sizeof (path), "%s/%s", ZVOL_DIR, dataset);
+	error = zpool_label_disk_wait(path, 10000);
+	if (error)
+		(void) printf(gettext("%s may not be immediately "
+		    "available\n"), path);
+
+	return (0);
+}
+
+/*
+ * Remove a minor node for the given zvol and the associated /dev links.
+ */
+int
+zvol_remove_link(libzfs_handle_t *hdl, const char *dataset)
+{
+	zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 };
+
+	(void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
+
+	if (ioctl(hdl->libzfs_fd, ZFS_IOC_REMOVE_MINOR, &zc) != 0) {
+		switch (errno) {
+		case ENXIO:
+			/*
+			 * Silently ignore the case where the link no longer
+			 * exists, so that 'zfs volfini' can be run multiple
+			 * times without errors.
+			 */
+			return (0);
+
+		default:
+			return (zfs_standard_error_fmt(hdl, errno,
+			    dgettext(TEXT_DOMAIN, "cannot remove device "
+			    "links for '%s'"), dataset));
+		}
+	}
+
+	return (0);
 }
 
 nvlist_t *
-zfs_get_recvd_props(zfs_handle_t *zhp)
+zfs_get_user_props(zfs_handle_t *zhp)
 {
-	if (zhp->zfs_recvd_props == NULL)
-		if (get_recvd_props_ioctl(zhp) != 0)
-			return (NULL);
-	return (zhp->zfs_recvd_props);
+	return (zhp->zfs_user_props);
 }
 
 /*
@@ -3744,6 +4037,7 @@ zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp, boolean_t received)
 	return (0);
 }
 
+#ifdef HAVE_ZPL
 int
 zfs_deleg_share_nfs(libzfs_handle_t *hdl, char *dataset, char *path,
     char *resource, void *export, void *sharetab,
@@ -3763,6 +4057,7 @@ zfs_deleg_share_nfs(libzfs_handle_t *hdl, char *dataset, char *path,
 	error = ioctl(hdl->libzfs_fd, ZFS_IOC_SHARE, &zc);
 	return (error);
 }
+#endif /* HAVE_ZPL */
 
 void
 zfs_prune_proplist(zfs_handle_t *zhp, uint8_t *props)
diff --git a/lib/libzfs/libzfs_import.c b/lib/libzfs/libzfs_import.c
index 386ab002f..ee0064892 100644
--- a/lib/libzfs/libzfs_import.c
+++ b/lib/libzfs/libzfs_import.c
@@ -52,9 +52,11 @@
 #include <sys/vtoc.h>
 #include <sys/dktp/fdisk.h>
 #include <sys/efi_partition.h>
-#include <thread_pool.h>
 
 #include <sys/vdev_impl.h>
+#ifdef HAVE_LIBBLKID
+#include <blkid/blkid.h>
+#endif
 
 #include "libzfs.h"
 #include "libzfs_impl.h"
@@ -904,211 +906,76 @@ zpool_read_label(int fd, nvlist_t **config)
 	return (0);
 }
 
-typedef struct rdsk_node {
-	char *rn_name;
-	int rn_dfd;
-	libzfs_handle_t *rn_hdl;
-	nvlist_t *rn_config;
-	avl_tree_t *rn_avl;
-	avl_node_t rn_node;
-	boolean_t rn_nozpool;
-} rdsk_node_t;
-
+#ifdef HAVE_LIBBLKID
+/*
+ * Use libblkid to quickly search for zfs devices
+ */
 static int
-slice_cache_compare(const void *arg1, const void *arg2)
-{
-	const char  *nm1 = ((rdsk_node_t *)arg1)->rn_name;
-	const char  *nm2 = ((rdsk_node_t *)arg2)->rn_name;
-	char *nm1slice, *nm2slice;
-	int rv;
-
-	/*
-	 * slices zero and two are the most likely to provide results,
-	 * so put those first
-	 */
-	nm1slice = strstr(nm1, "s0");
-	nm2slice = strstr(nm2, "s0");
-	if (nm1slice && !nm2slice) {
-		return (-1);
-	}
-	if (!nm1slice && nm2slice) {
-		return (1);
-	}
-	nm1slice = strstr(nm1, "s2");
-	nm2slice = strstr(nm2, "s2");
-	if (nm1slice && !nm2slice) {
-		return (-1);
-	}
-	if (!nm1slice && nm2slice) {
-		return (1);
-	}
-
-	rv = strcmp(nm1, nm2);
-	if (rv == 0)
-		return (0);
-	return (rv > 0 ? 1 : -1);
-}
-
-static void
-check_one_slice(avl_tree_t *r, char *diskname, uint_t partno,
-    diskaddr_t size, uint_t blksz)
-{
-	rdsk_node_t tmpnode;
-	rdsk_node_t *node;
-	char sname[MAXNAMELEN];
-
-	tmpnode.rn_name = &sname[0];
-	(void) snprintf(tmpnode.rn_name, MAXNAMELEN, "%s%u",
-	    diskname, partno);
-	/*
-	 * protect against division by zero for disk labels that
-	 * contain a bogus sector size
-	 */
-	if (blksz == 0)
-		blksz = DEV_BSIZE;
-	/* too small to contain a zpool? */
-	if ((size < (SPA_MINDEVSIZE / blksz)) &&
-	    (node = avl_find(r, &tmpnode, NULL)))
-		node->rn_nozpool = B_TRUE;
-}
-
-static void
-nozpool_all_slices(avl_tree_t *r, const char *sname)
-{
-	char diskname[MAXNAMELEN];
-	char *ptr;
-	int i;
-
-	(void) strncpy(diskname, sname, MAXNAMELEN);
-	if (((ptr = strrchr(diskname, 's')) == NULL) &&
-	    ((ptr = strrchr(diskname, 'p')) == NULL))
-		return;
-	ptr[0] = 's';
-	ptr[1] = '\0';
-	for (i = 0; i < NDKMAP; i++)
-		check_one_slice(r, diskname, i, 0, 1);
-	ptr[0] = 'p';
-	for (i = 0; i <= FD_NUMPART; i++)
-		check_one_slice(r, diskname, i, 0, 1);
-}
-
-static void
-check_slices(avl_tree_t *r, int fd, const char *sname)
+zpool_find_import_blkid(libzfs_handle_t *hdl, pool_list_t *pools)
 {
-	struct extvtoc vtoc;
-	struct dk_gpt *gpt;
-	char diskname[MAXNAMELEN];
-	char *ptr;
-	int i;
-
-	(void) strncpy(diskname, sname, MAXNAMELEN);
-	if ((ptr = strrchr(diskname, 's')) == NULL || !isdigit(ptr[1]))
-		return;
-	ptr[1] = '\0';
-
-	if (read_extvtoc(fd, &vtoc) >= 0) {
-		for (i = 0; i < NDKMAP; i++)
-			check_one_slice(r, diskname, i,
-			    vtoc.v_part[i].p_size, vtoc.v_sectorsz);
-	} else if (efi_alloc_and_read(fd, &gpt) >= 0) {
-		/*
-		 * on x86 we'll still have leftover links that point
-		 * to slices s[9-15], so use NDKMAP instead
-		 */
-		for (i = 0; i < NDKMAP; i++)
-			check_one_slice(r, diskname, i,
-			    gpt->efi_parts[i].p_size, gpt->efi_lbasize);
-		/* nodes p[1-4] are never used with EFI labels */
-		ptr[0] = 'p';
-		for (i = 1; i <= FD_NUMPART; i++)
-			check_one_slice(r, diskname, i, 0, 1);
-		efi_free(gpt);
-	}
-}
-
-static void
-zpool_open_func(void *arg)
-{
-	rdsk_node_t *rn = arg;
-	struct stat64 statbuf;
+	blkid_cache cache;
+	blkid_dev_iterate iter;
+	blkid_dev dev;
+	const char *devname;
 	nvlist_t *config;
-	int fd;
+	int fd, err;
 
-	if (rn->rn_nozpool)
-		return;
-	if ((fd = openat64(rn->rn_dfd, rn->rn_name, O_RDONLY)) < 0) {
-		/* symlink to a device that's no longer there */
-		if (errno == ENOENT)
-			nozpool_all_slices(rn->rn_avl, rn->rn_name);
-		return;
-	}
-	/*
-	 * Ignore failed stats.  We only want regular
-	 * files, character devs and block devs.
-	 */
-	if (fstat64(fd, &statbuf) != 0 ||
-	    (!S_ISREG(statbuf.st_mode) &&
-	    !S_ISCHR(statbuf.st_mode) &&
-	    !S_ISBLK(statbuf.st_mode))) {
-		(void) close(fd);
-		return;
-	}
-	/* this file is too small to hold a zpool */
-	if (S_ISREG(statbuf.st_mode) &&
-	    statbuf.st_size < SPA_MINDEVSIZE) {
-		(void) close(fd);
-		return;
-	} else if (!S_ISREG(statbuf.st_mode)) {
-		/*
-		 * Try to read the disk label first so we don't have to
-		 * open a bunch of minor nodes that can't have a zpool.
-		 */
-		check_slices(rn->rn_avl, fd, rn->rn_name);
+	err = blkid_get_cache(&cache, NULL);
+	if (err != 0) {
+		(void) zfs_error_fmt(hdl, EZFS_BADCACHE,
+		    dgettext(TEXT_DOMAIN, "blkid_get_cache() %d"), err);
+		goto err_blkid1;
 	}
 
-	if ((zpool_read_label(fd, &config)) != 0) {
-		(void) close(fd);
-		(void) no_memory(rn->rn_hdl);
-		return;
+	err = blkid_probe_all(cache);
+	if (err != 0) {
+		(void) zfs_error_fmt(hdl, EZFS_BADCACHE,
+		    dgettext(TEXT_DOMAIN, "blkid_probe_all() %d"), err);
+		goto err_blkid2;
 	}
-	(void) close(fd);
 
+	if ((iter = blkid_dev_iterate_begin(cache)) == NULL) {
+		err = -1;	/* still 0 from blkid_probe_all(); flag failure */
+		(void) zfs_error_fmt(hdl, EZFS_BADCACHE,
+		    dgettext(TEXT_DOMAIN, "blkid_dev_iterate_begin()"));
+		goto err_blkid2;
+	}
 
-	rn->rn_config = config;
-	if (config != NULL) {
-		assert(rn->rn_nozpool == B_FALSE);
+	err = blkid_dev_set_search(iter, "TYPE", "zfs");
+	if (err != 0) {
+		(void) zfs_error_fmt(hdl, EZFS_BADCACHE,
+		    dgettext(TEXT_DOMAIN, "blkid_dev_set_search() %d"), err);
+		goto err_blkid3;
 	}
-}
 
-/*
- * Given a file descriptor, clear (zero) the label information.  This function
- * is currently only used in the appliance stack as part of the ZFS sysevent
- * module.
- */
-int
-zpool_clear_label(int fd)
-{
-	struct stat64 statbuf;
-	int l;
-	vdev_label_t *label;
-	uint64_t size;
+	while (blkid_dev_next(iter, &dev) == 0) {
+		devname = blkid_dev_devname(dev);
+		if ((fd = open64(devname, O_RDONLY)) < 0)
+			continue;
 
-	if (fstat64(fd, &statbuf) == -1)
-		return (0);
-	size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t);
+		err = zpool_read_label(fd, &config);
+		(void) close(fd);
 
-	if ((label = calloc(sizeof (vdev_label_t), 1)) == NULL)
-		return (-1);
+		if (err != 0) {
+			(void) no_memory(hdl);
+			goto err_blkid3;
+		}
 
-	for (l = 0; l < VDEV_LABELS; l++) {
-		if (pwrite64(fd, label, sizeof (vdev_label_t),
-		    label_offset(size, l)) != sizeof (vdev_label_t))
-			return (-1);
+		if (config != NULL) {
+			err = add_config(hdl, pools, devname, config);
+			if (err != 0)
+				goto err_blkid3;
+		}
 	}
 
-	free(label);
-	return (0);
+err_blkid3:
+	blkid_dev_iterate_end(iter);
+err_blkid2:
+	blkid_put_cache(cache);
+err_blkid1:
+	return err;
 }
+#endif /* HAVE_LIBBLKID */
 
 /*
  * Given a list of directories to search, find all pools stored on disk.  This
@@ -1126,18 +993,28 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg)
 	char path[MAXPATHLEN];
 	char *end, **dir = iarg->path;
 	size_t pathleft;
-	nvlist_t *ret = NULL;
-	static char *default_dir = "/dev/dsk";
+	struct stat64 statbuf;
+	nvlist_t *ret = NULL, *config;
+	static char *default_dir = DISK_ROOT;
+	int fd;
 	pool_list_t pools = { 0 };
 	pool_entry_t *pe, *penext;
 	vdev_entry_t *ve, *venext;
 	config_entry_t *ce, *cenext;
 	name_entry_t *ne, *nenext;
-	avl_tree_t slice_cache;
-	rdsk_node_t *slice;
-	void *cookie;
+
+	verify(iarg->poolname == NULL || iarg->guid == 0);
 
 	if (dirs == 0) {
+#ifdef HAVE_LIBBLKID
+		/* Use libblkid to scan all devices for their type */
+		if (zpool_find_import_blkid(hdl, &pools) == 0)
+			goto skip_scanning;
+
+		(void) zfs_error_fmt(hdl, EZFS_BADCACHE,
+		    dgettext(TEXT_DOMAIN, "blkid failure falling back "
+		    "to manual probing"));
+#endif /* HAVE_LIBBLKID */
 		dirs = 1;
 		dir = &default_dir;
 	}
@@ -1148,7 +1025,6 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg)
 	 * and toplevel GUID.
 	 */
 	for (i = 0; i < dirs; i++) {
-		tpool_t *t;
 		char *rdsk;
 		int dfd;
 
@@ -1182,8 +1058,6 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg)
 			goto error;
 		}
 
-		avl_create(&slice_cache, slice_cache_compare,
-		    sizeof (rdsk_node_t), offsetof(rdsk_node_t, rn_node));
 		/*
 		 * This is not MT-safe, but we have no MT consumers of libzfs
 		 */
@@ -1193,37 +1067,51 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg)
 			    (name[1] == 0 || (name[1] == '.' && name[2] == 0)))
 				continue;
 
-			slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
-			slice->rn_name = zfs_strdup(hdl, name);
-			slice->rn_avl = &slice_cache;
-			slice->rn_dfd = dfd;
-			slice->rn_hdl = hdl;
-			slice->rn_nozpool = B_FALSE;
-			avl_add(&slice_cache, slice);
-		}
-		/*
-		 * create a thread pool to do all of this in parallel;
-		 * rn_nozpool is not protected, so this is racy in that
-		 * multiple tasks could decide that the same slice can
-		 * not hold a zpool, which is benign.  Also choose
-		 * double the number of processors; we hold a lot of
-		 * locks in the kernel, so going beyond this doesn't
-		 * buy us much.
-		 */
-		t = tpool_create(1, 2 * sysconf(_SC_NPROCESSORS_ONLN),
-		    0, NULL);
-		for (slice = avl_first(&slice_cache); slice;
-		    (slice = avl_walk(&slice_cache, slice,
-		    AVL_AFTER)))
-			(void) tpool_dispatch(t, zpool_open_func, slice);
-		tpool_wait(t);
-		tpool_destroy(t);
-
-		cookie = NULL;
-		while ((slice = avl_destroy_nodes(&slice_cache,
-		    &cookie)) != NULL) {
-			if (slice->rn_config != NULL) {
-				nvlist_t *config = slice->rn_config;
+			/*
+			 * Skip checking devices with well known prefixes:
+			 * watchdog - A special close is required to avoid
+			 *            triggering it and resetting the system.
+			 * fuse     - Fuse control device.
+			 * ppp      - Generic PPP driver.
+			 * tty*     - Generic serial interface.
+			 * vcs*     - Virtual console memory.
+			 * parport* - Parallel port interface.
+			 * lp*      - Printer interface.
+			 * fd*      - Floppy interface.
+			 */
+			if ((strncmp(name, "watchdog", 8) == 0) ||
+			    (strncmp(name, "fuse", 4) == 0)     ||
+			    (strncmp(name, "ppp", 3) == 0)      ||
+			    (strncmp(name, "tty", 3) == 0)      ||
+			    (strncmp(name, "vcs", 3) == 0)      ||
+			    (strncmp(name, "parport", 7) == 0)  ||
+			    (strncmp(name, "lp", 2) == 0)       ||
+			    (strncmp(name, "fd", 2) == 0))
+				continue;
+
+			if ((fd = openat64(dfd, name, O_RDONLY)) < 0)
+				continue;
+
+			/*
+			 * Ignore failed stats.  We only want regular
+			 * files and block devs.
+			 */
+			if (fstat64(fd, &statbuf) != 0 ||
+			    (!S_ISREG(statbuf.st_mode) &&
+			    !S_ISBLK(statbuf.st_mode))) {
+				(void) close(fd);
+				continue;
+			}
+
+			if ((zpool_read_label(fd, &config)) != 0) {
+				(void) close(fd);
+				(void) no_memory(hdl);
+				goto error;
+			}
+
+			(void) close(fd);
+
+			if (config != NULL) {
 				boolean_t matched = B_TRUE;
 
 				if (iarg->poolname != NULL) {
@@ -1247,19 +1135,19 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg)
 					continue;
 				}
 				/* use the non-raw path for the config */
-				(void) strlcpy(end, slice->rn_name, pathleft);
+				(void) strlcpy(end, name, pathleft);
 				if (add_config(hdl, &pools, path, config) != 0)
 					goto error;
 			}
-			free(slice->rn_name);
-			free(slice);
 		}
-		avl_destroy(&slice_cache);
 
 		(void) closedir(dirp);
 		dirp = NULL;
 	}
 
+#ifdef HAVE_LIBBLKID
+skip_scanning:
+#endif
 	ret = get_configs(hdl, &pools, iarg->can_be_active);
 
 error:
diff --git a/lib/libzfs/libzfs_mount.c b/lib/libzfs/libzfs_mount.c
index c31a12371..4b9038de8 100644
--- a/lib/libzfs/libzfs_mount.c
+++ b/lib/libzfs/libzfs_mount.c
@@ -81,6 +81,7 @@
 #include <sys/systeminfo.h>
 #define	MAXISALEN	257	/* based on sysinfo(2) man page */
 
+#ifdef HAVE_ZPL
 static int zfs_share_proto(zfs_handle_t *, zfs_share_proto_t *);
 zfs_share_type_t zfs_is_shared_proto(zfs_handle_t *, char **,
     zfs_share_proto_t);
@@ -1268,3 +1269,53 @@ out:
 
 	return (ret);
 }
+
+#else  /* HAVE_ZPL */
+
+int
+zfs_unshare_iscsi(zfs_handle_t *zhp)
+{
+	return 0;
+}
+
+int
+zfs_unmount(zfs_handle_t *zhp, const char *mountpoint, int flags)
+{
+	return 0;
+}
+
+void
+remove_mountpoint(zfs_handle_t *zhp) {
+	return;
+}
+
+boolean_t
+is_mounted(libzfs_handle_t *zfs_hdl, const char *special, char **where)
+{
+	return B_FALSE;
+}
+
+boolean_t
+zfs_is_mounted(zfs_handle_t *zhp, char **where)
+{
+	return is_mounted(zhp->zfs_hdl, zfs_get_name(zhp), where);
+}
+
+boolean_t
+zfs_is_shared(zfs_handle_t *zhp)
+{
+	return B_FALSE;
+}
+
+int
+zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags)
+{
+	return (0);	/* stub: nothing is done without HAVE_ZPL */
+}
+
+int
+zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force)
+{
+	return (0);	/* stub: nothing is done without HAVE_ZPL */
+}
+#endif /* HAVE_ZPL */
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index 42f303894..ec27b5756 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -32,6 +32,8 @@
 #include <stdlib.h>
 #include <strings.h>
 #include <unistd.h>
+#include <zone.h>
+#include <sys/stat.h>
 #include <sys/efi_partition.h>
 #include <sys/vtoc.h>
 #include <sys/zfs_ioctl.h>
@@ -44,10 +46,6 @@
 
 static int read_efi_label(nvlist_t *config, diskaddr_t *sb);
 
-#define	DISK_ROOT	"/dev/dsk"
-#define	RDISK_ROOT	"/dev/rdsk"
-#define	BACKUP_SLICE	"s2"
-
 typedef struct prop_flags {
 	int create:1;	/* Validate property on creation */
 	int import:1;	/* Validate property on import */
@@ -651,9 +649,12 @@ zpool_expand_proplist(zpool_handle_t *zhp, zprop_list_t **plp)
 
 /*
  * Don't start the slice at the default block of 34; many storage
- * devices will use a stripe width of 128k, so start there instead.
+ * devices will use a stripe width of 128k, other vendors prefer a 1m
+ * alignment.  It is best to play it safe and ensure a 1m alignment
+ * given 512b blocks.  When the block size is larger by a power of 2
+ * we will still be 1m aligned.
  */
-#define	NEW_START_BLOCK	256
+#define	NEW_START_BLOCK	2048
 
 /*
  * Validate the given pool name, optionally putting an extended error message in
@@ -948,10 +949,12 @@ zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot,
 			 * This can happen if the user has specified the same
 			 * device multiple times.  We can't reliably detect this
 			 * until we try to add it and see we already have a
-			 * label.
+			 * label.  This can also happen if the device is
+			 * part of an active md or lvm device.
 			 */
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-			    "one or more vdevs refer to the same device"));
+			    "one or more vdevs refer to the same device, or one of\n"
+			    "the devices is part of an active md or lvm device"));
 			return (zfs_error(hdl, EZFS_BADDEV, msg));
 
 		case EOVERFLOW:
@@ -1928,7 +1931,7 @@ zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare,
 	} else if (zpool_vdev_is_interior(path)) {
 		verify(nvlist_add_string(search, ZPOOL_CONFIG_TYPE, path) == 0);
 	} else if (path[0] != '/') {
-		(void) snprintf(buf, sizeof (buf), "%s%s", "/dev/dsk/", path);
+		(void) snprintf(buf, sizeof (buf), "%s/%s", DISK_ROOT, path);
 		verify(nvlist_add_string(search, ZPOOL_CONFIG_PATH, buf) == 0);
 	} else {
 		verify(nvlist_add_string(search, ZPOOL_CONFIG_PATH, path) == 0);
@@ -2101,22 +2104,14 @@ zpool_get_physpath(zpool_handle_t *zhp, char *physpath, size_t phypath_size)
  * the disk to use the new unallocated space.
  */
 static int
-zpool_relabel_disk(libzfs_handle_t *hdl, const char *name)
+zpool_relabel_disk(libzfs_handle_t *hdl, const char *path)
 {
-	char path[MAXPATHLEN];
 	char errbuf[1024];
 	int fd, error;
-	int (*_efi_use_whole_disk)(int);
-
-	if ((_efi_use_whole_disk = (int (*)(int))dlsym(RTLD_DEFAULT,
-	    "efi_use_whole_disk")) == NULL)
-		return (-1);
 
-	(void) snprintf(path, sizeof (path), "%s/%s", RDISK_ROOT, name);
-
-	if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) {
+	if ((fd = open(path, O_RDWR|O_DIRECT)) < 0) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot "
-		    "relabel '%s': unable to open device"), name);
+		    "relabel '%s': unable to open device"), path);
 		return (zfs_error(hdl, EZFS_OPENFAILED, errbuf));
 	}
 
@@ -2125,11 +2120,11 @@ zpool_relabel_disk(libzfs_handle_t *hdl, const char *name)
 	 * does not have any unallocated space left. If so, we simply
 	 * ignore that error and continue on.
 	 */
-	error = _efi_use_whole_disk(fd);
+	error = efi_use_whole_disk(fd);
 	(void) close(fd);
 	if (error && error != VT_ENOSPC) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot "
-		    "relabel '%s': unable to read disk capacity"), name);
+		    "relabel '%s': unable to read disk capacity"), path);
 		return (zfs_error(hdl, EZFS_NOCAP, errbuf));
 	}
 	return (0);
@@ -3071,7 +3066,7 @@ char *
 zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv,
     boolean_t verbose)
 {
-	char *path, *devid;
+	char *path, *devid, *type;
 	uint64_t value;
 	char buf[64];
 	vdev_stat_t *vs;
@@ -3085,7 +3080,6 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv,
 		    (u_longlong_t)value);
 		path = buf;
 	} else if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) {
-
 		/*
 		 * If the device is dead (faulted, offline, etc) then don't
 		 * bother opening it.  Otherwise we may be forcing the user to
@@ -3124,9 +3118,19 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv,
 				devid_str_free(newdevid);
 		}
 
-		if (strncmp(path, "/dev/dsk/", 9) == 0)
-			path += 9;
+		/*
+		 * For a block device only use the name.  Leave the path
+		 * untouched if it has no '/', rather than advancing NULL.
+		 */
+		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
+		if (strcmp(type, VDEV_TYPE_DISK) == 0 &&
+		    strrchr(path, '/') != NULL)
+			path = strrchr(path, '/') + 1;
 
+#if defined(__sun__) || defined(__sun)
+		/*
+		 * The following code strips the slice from the device path.
+		 */
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
 		    &value) == 0 && value) {
 			int pathlen = strlen(path);
@@ -3148,6 +3152,7 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv,
 			}
 			return (tmp);
 		}
+#endif
 	} else {
 		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &path) == 0);
 
@@ -3629,7 +3634,7 @@ read_efi_label(nvlist_t *config, diskaddr_t *sb)
 
 	(void) snprintf(diskname, sizeof (diskname), "%s%s", RDISK_ROOT,
 	    strrchr(path, '/'));
-	if ((fd = open(diskname, O_RDONLY|O_NDELAY)) >= 0) {
+	if ((fd = open(diskname, O_RDWR|O_DIRECT)) >= 0) {
 		struct dk_gpt *vtoc;
 
 		if ((err = efi_alloc_and_read(fd, &vtoc)) >= 0) {
@@ -3675,6 +3680,54 @@ find_start_block(nvlist_t *config)
 	return (MAXOFFSET_T);
 }
 
+/*
+ * Wait up to 'timeout' milliseconds for a newly created device to appear
+ * at 'path', sleeping 1ms before each check.  There is a small window when
+ * the /dev/ device exists but its udev symlink does not, and depending on
+ * the udev rules that may last a few seconds.  Returns 0 once stat64()
+ * succeeds on the path, or ENOENT if it never does within the timeout.
+ */
+int
+zpool_label_disk_wait(char *path, int timeout)
+{
+	struct stat64 statbuf;
+	int i;
+
+	for (i = 0; i < timeout; i++) {
+		usleep(1000);
+		/* errno is only meaningful on failure; trust the return. */
+		if (stat64(path, &statbuf) == 0)
+			return (0);
+	}
+
+	return (ENOENT);
+}
+
+/*
+ * Re-read the EFI label at 'path'.  Returns 0 if it is readable and the
+ * primary GPT is intact, EIDRM if the primary GPT is flagged corrupt, the
+ * open() errno, or the negative efi_alloc_and_read() error code.
+ */
+int
+zpool_label_disk_check(char *path)
+{
+	struct dk_gpt *vtoc;
+	int fd, err;
+
+	if ((fd = open(path, O_RDWR|O_DIRECT)) < 0)
+		return (errno);
+
+	if ((err = efi_alloc_and_read(fd, &vtoc)) < 0) {
+		(void) close(fd);
+		return (err);
+	}
+
+	err = (vtoc->efi_flags & EFI_GPT_PRIMARY_CORRUPT) ? EIDRM : 0;
+	efi_free(vtoc);
+	(void) close(fd);
+	return (err);
+}
+
 /*
  * Label an individual disk.  The name provided is the short name,
  * stripped of any leading /dev path.
@@ -3684,7 +3737,7 @@ zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, char *name)
 {
 	char path[MAXPATHLEN];
 	struct dk_gpt *vtoc;
-	int fd;
+	int rval, fd;
 	size_t resv = EFI_MIN_RESV_SIZE;
 	uint64_t slice_size;
 	diskaddr_t start_block;
@@ -3720,13 +3773,13 @@ zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, char *name)
 	(void) snprintf(path, sizeof (path), "%s/%s%s", RDISK_ROOT, name,
 	    BACKUP_SLICE);
 
-	if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) {
+	if ((fd = open(path, O_RDWR|O_DIRECT)) < 0) {
 		/*
 		 * This shouldn't happen.  We've long since verified that this
 		 * is a valid device.
 		 */
-		zfs_error_aux(hdl,
-		    dgettext(TEXT_DOMAIN, "unable to open device"));
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "unable to open device '%s': %d"), path, errno);
 		return (zfs_error(hdl, EZFS_OPENFAILED, errbuf));
 	}
 
@@ -3769,7 +3822,7 @@ zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, char *name)
 	vtoc->efi_parts[8].p_size = resv;
 	vtoc->efi_parts[8].p_tag = V_RESERVED;
 
-	if (efi_write(fd, vtoc) != 0) {
+	if ((rval = efi_write(fd, vtoc)) != 0) {
 		/*
 		 * Some block drivers (like pcata) may not support EFI
 		 * GPT labels.  Print out a helpful error message dir-
@@ -3779,123 +3832,34 @@ zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, char *name)
 		(void) close(fd);
 		efi_free(vtoc);
 
-		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-		    "try using fdisk(1M) and then provide a specific slice"));
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "try using "
+		    "parted(8) and then provide a specific slice: %d"), rval);
 		return (zfs_error(hdl, EZFS_LABELFAILED, errbuf));
 	}
 
 	(void) close(fd);
 	efi_free(vtoc);
-	return (0);
-}
-
-static boolean_t
-supported_dump_vdev_type(libzfs_handle_t *hdl, nvlist_t *config, char *errbuf)
-{
-	char *type;
-	nvlist_t **child;
-	uint_t children, c;
-
-	verify(nvlist_lookup_string(config, ZPOOL_CONFIG_TYPE, &type) == 0);
-	if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 ||
-	    strcmp(type, VDEV_TYPE_FILE) == 0 ||
-	    strcmp(type, VDEV_TYPE_LOG) == 0 ||
-	    strcmp(type, VDEV_TYPE_HOLE) == 0 ||
-	    strcmp(type, VDEV_TYPE_MISSING) == 0) {
-		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-		    "vdev type '%s' is not supported"), type);
-		(void) zfs_error(hdl, EZFS_VDEVNOTSUP, errbuf);
-		return (B_FALSE);
-	}
-	if (nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_CHILDREN,
-	    &child, &children) == 0) {
-		for (c = 0; c < children; c++) {
-			if (!supported_dump_vdev_type(hdl, child[c], errbuf))
-				return (B_FALSE);
-		}
-	}
-	return (B_TRUE);
-}
-
-/*
- * check if this zvol is allowable for use as a dump device; zero if
- * it is, > 0 if it isn't, < 0 if it isn't a zvol
- */
-int
-zvol_check_dump_config(char *arg)
-{
-	zpool_handle_t *zhp = NULL;
-	nvlist_t *config, *nvroot;
-	char *p, *volname;
-	nvlist_t **top;
-	uint_t toplevels;
-	libzfs_handle_t *hdl;
-	char errbuf[1024];
-	char poolname[ZPOOL_MAXNAMELEN];
-	int pathlen = strlen(ZVOL_FULL_DEV_DIR);
-	int ret = 1;
-
-	if (strncmp(arg, ZVOL_FULL_DEV_DIR, pathlen)) {
-		return (-1);
-	}
-
-	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
-	    "dump is not supported on device '%s'"), arg);
 
-	if ((hdl = libzfs_init()) == NULL)
-		return (1);
-	libzfs_print_on_error(hdl, B_TRUE);
-
-	volname = arg + pathlen;
-
-	/* check the configuration of the pool */
-	if ((p = strchr(volname, '/')) == NULL) {
-		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-		    "malformed dataset name"));
-		(void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf);
-		return (1);
-	} else if (p - volname >= ZFS_MAXNAMELEN) {
-		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-		    "dataset name is too long"));
-		(void) zfs_error(hdl, EZFS_NAMETOOLONG, errbuf);
-		return (1);
-	} else {
-		(void) strncpy(poolname, volname, p - volname);
-		poolname[p - volname] = '\0';
-	}
-
-	if ((zhp = zpool_open(hdl, poolname)) == NULL) {
-		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-		    "could not open pool '%s'"), poolname);
-		(void) zfs_error(hdl, EZFS_OPENFAILED, errbuf);
-		goto out;
-	}
-	config = zpool_get_config(zhp, NULL);
-	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
-	    &nvroot) != 0) {
-		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-		    "could not obtain vdev configuration for  '%s'"), poolname);
-		(void) zfs_error(hdl, EZFS_INVALCONFIG, errbuf);
-		goto out;
-	}
-
-	verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
-	    &top, &toplevels) == 0);
-	if (toplevels != 1) {
-		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-		    "'%s' has multiple top level vdevs"), poolname);
-		(void) zfs_error(hdl, EZFS_DEVOVERFLOW, errbuf);
-		goto out;
+	/* Wait for the first expected slice to appear. */
+	(void) snprintf(path, sizeof (path), "%s/%s%s%s", DISK_ROOT, name,
+	    isdigit(name[strlen(name)-1]) ? "p" : "", FIRST_SLICE);
+	rval = zpool_label_disk_wait(path, 3000);
+	if (rval) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "failed to "
+		    "detect device partitions on '%s': %d"), path, rval);
+		return (zfs_error(hdl, EZFS_LABELFAILED, errbuf));
 	}
 
-	if (!supported_dump_vdev_type(hdl, top[0], errbuf)) {
-		goto out;
+	/* We can't be too paranoid.  Read the label back and verify it. */
+	(void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, name);
+	rval = zpool_label_disk_check(path);
+	if (rval) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "freshly written "
+		    "EFI label on '%s' is damaged.  Ensure\nthis device "
+		    "is not in use, and is functioning properly: %d"),
+		    path, rval);
+		return (zfs_error(hdl, EZFS_LABELFAILED, errbuf));
 	}
-	ret = 0;
 
-out:
-	if (zhp)
-		zpool_close(zhp);
-	libzfs_fini(hdl);
-	return (ret);
+	return 0;
 }
diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c
index 87ffd124f..40d1d2e53 100644
--- a/lib/libzfs/libzfs_sendrecv.c
+++ b/lib/libzfs/libzfs_sendrecv.c
@@ -2608,6 +2608,12 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
 				return (-1);
 			}
 		}
+		if (!flags.dryrun && zhp->zfs_type == ZFS_TYPE_VOLUME &&
+		    zvol_remove_link(hdl, zhp->zfs_name) != 0) {
+			zfs_close(zhp);
+			zcmd_free_nvlists(&zc);
+			return (-1);
+		}
 		zfs_close(zhp);
 	} else {
 		/*
@@ -2813,6 +2819,10 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
 		if (h != NULL) {
 			if (h->zfs_type == ZFS_TYPE_VOLUME) {
 				*cp = '@';
+				err = zvol_create_link(hdl, h->zfs_name);
+				if (err == 0 && ioctl_err == 0)
+					err = zvol_create_link(hdl,
+					    zc.zc_value);
 			} else if (newfs || stream_avl) {
 				/*
 				 * Track the first/top of hierarchy fs,
diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c
index cb7d87cb2..71f81831b 100644
--- a/lib/libzfs/libzfs_util.c
+++ b/lib/libzfs/libzfs_util.c
@@ -36,6 +36,7 @@
 #include <unistd.h>
 #include <ctype.h>
 #include <math.h>
+#include <sys/stat.h>
 #include <sys/mnttab.h>
 #include <sys/mntent.h>
 #include <sys/types.h>
@@ -648,7 +649,9 @@ libzfs_fini(libzfs_handle_t *hdl)
 #endif
 	if (hdl->libzfs_sharetab)
 		(void) fclose(hdl->libzfs_sharetab);
+#ifdef HAVE_ZPL
 	zfs_uninit_libshare(hdl);
+#endif
 	if (hdl->libzfs_log_str)
 		(void) free(hdl->libzfs_log_str);
 	zpool_free_handles(hdl);
diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c
index 494e544ea..6f06f4001 100644
--- a/lib/libzpool/kernel.c
+++ b/lib/libzpool/kernel.c
@@ -35,6 +35,8 @@
 #include <sys/processor.h>
 #include <sys/zfs_context.h>
 #include <sys/utsname.h>
+#include <sys/time.h>
+#include <sys/mount.h> /* for BLKGETSIZE64 */
 #include <sys/systeminfo.h>
 
 /*
@@ -533,7 +535,11 @@ vn_open(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, int x3)
 	 * for its size.  So -- gag -- we open the block device to get
 	 * its size, and remember it for subsequent VOP_GETATTR().
 	 */
+#if defined(__sun__) || defined(__sun)
 	if (strncmp(path, "/dev/", 5) == 0) {
+#else
+	if (0) {
+#endif
 		char *dsk;
 		fd = open64(path, O_RDONLY);
 		if (fd == -1) {
@@ -562,6 +568,14 @@ vn_open(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, int x3)
 		}
 	}
 
+	if (!(flags & FCREAT) && S_ISBLK(st.st_mode)) {
+#ifdef __linux__
+		flags |= O_DIRECT;
+#endif
+		/* We shouldn't be writing to block devices in userspace */
+		VERIFY(!(flags & FWRITE));
+	}
+
 	if (flags & FCREAT)
 		old_umask = umask(0);
 
@@ -584,6 +598,16 @@ vn_open(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, int x3)
 		return (err);
 	}
 
+#ifdef __linux__
+	/* In Linux, use an ioctl to get the size of a block device. */
+	if (S_ISBLK(st.st_mode)) {
+		if (ioctl(fd, BLKGETSIZE64, &st.st_size) != 0) {
+			err = errno;
+			close(fd);
+			return (err);
+		}
+	}
+#endif
 	(void) fcntl(fd, F_SETFD, FD_CLOEXEC);
 
 	*vpp = vp = umem_zalloc(sizeof (vnode_t), UMEM_NOFAIL);
@@ -637,6 +661,16 @@ vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len, offset_t offset,
 		}
 	}
 
+#ifdef __linux__
+	if (rc == -1 && errno == EINVAL) {
+		/*
+		 * Under Linux, this most likely means an alignment issue
+		 * (memory or disk) due to O_DIRECT, so we abort() in order to
+		 * catch the offender.
+		 */
+		 abort();
+	}
+#endif
 	if (rc == -1)
 		return (errno);
author	Brian Behlendorf <[email protected]>	2010-08-26 11:56:53 -0700
committer	Brian Behlendorf <[email protected]>	2010-08-31 13:42:00 -0700
commit	d603ed6c278f9c25b17ba8e75e9bce6e5d715ac0 (patch)
tree	61dc04f07ed9772dd97faefd879412e2a6578be0 /lib
parent	f1fb119f6bb0c3185ec88912e4488fdd9ec08ab2 (diff)