From 24f3d6e49e5a63555995bc837248a7b383c62b07 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 23 Oct 2009 11:57:59 -0700 Subject: Misc fixed based on testing with the dragon config. In check_disk() we should only check the entire device if it not a whole disk. It is a whole disk with an EFI label on it, it is possible that libblkid will misidentify the device as a filesystem. I had a case yesterday where 2 bytes in the EFI GUID happened we set to the right values such that libblkid decided there was a minux filesystem there. If it's a whole device we look for a EFI label. If we are able to read the backup EFI label from a device but the primary is corrupt. Then don't bother trying to stat the partitions in /dev/ the kernel will not create devices using the backup label when the primary is damaged. Add code to determine if we have a udev path instead of a normal device path. In this case use the -part# partition naming scheme instead of the /dev/disk# scheme. This is important because we always want to access devices using the full path provided at configuration time. Readded support for zpool_relabel_disk() now that we have the full libefi library in place we do have access to this functionality. Lots of additional paranoia to ensure EFI label are written correctly. These changes include: 1) Removing the O_NDELAY flag when opening a file descriptor for libefi. This flag should really only be used when you do not intend to do any file IO. Under Solaris only ioctl()'s were performed under linux we do perform reads and writes. 2) Use O_DIRECT to ensure any caching is bypassed while writing or reading the EFI labels. This change forces the use of sector aligned memory buffers which are allocated using posix_memalign(). 3) Add additional efi_debug error messages to efi_ioctl(). 4) While doing a fsync is good to ensure the EFI label is on disk we can, and should go one step futher by issuing the BLKFLSBUF ioctl(). This signals the kernel to instruct the drive to flush it's on-disk cache. 5) Because of some initial strangeness I observed in testing with some flakey drives be extra paranoid in zpool_label_disk(). After we've written the device without error, flushed the drive caches, correctly detected the new partitions created by the kernel. Then additionally read back the EFI label from user space to make sure it is intact and correct. I don't think we can ever be to careful here. NOTE: The was recently some concern expressed that writing EFI labels from user space on Linux was not the right way to do this. That instead two kernel ioctl()s should be used to create and remove partitions. After some investigation it's clear to me using those ioctl() would be a bad idea. The in fact don't actually write partition tables to the disk, they only create the partition devices in the kernel. So what you really want to do is write the label out from user space, then prompt the kernel to re-read the partition from disk to create the partitions. This is in fact exactly what newer version of parted do. --- cmd/zpool/zpool_vdev.c | 53 +++++++++++++++++++++++--------------------------- 1 file changed, 24 insertions(+), 29 deletions(-) (limited to 'cmd') diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c index 7d4d87ede..113abb608 100644 --- a/cmd/zpool/zpool_vdev.c +++ b/cmd/zpool/zpool_vdev.c @@ -237,14 +237,9 @@ check_disk(const char *path, blkid_cache cache, int force, int err = 0; int fd, i; - /* Check the device as given */ - err = check_slice(path, cache, force, isspare); - if (err) - return (err); - - /* Additional checking is only required for whole disks */ + /* This is not a wholedisk we only check the given partition */ if (!iswholedisk) - return 0; + return check_slice(path, cache, force, isspare); /* * When the device is a whole disk try to read the efi partition @@ -252,9 +247,9 @@ check_disk(const char *path, blkid_cache cache, int force, * partitions. However, when it fails it may simply be because * the disk is partitioned via the MBR. Since we currently can * not easily decode the MBR return a failure and prompt to the - * user to use --force since we cannot check the partitions. + * user to use force option since we cannot check the partitions. */ - if ((fd = open(path, O_RDONLY|O_NDELAY)) < 0) { + if ((fd = open(path, O_RDWR|O_DIRECT)) < 0) { check_error(errno); return -1; } @@ -265,11 +260,9 @@ check_disk(const char *path, blkid_cache cache, int force, if (force) { return 0; } else { - vdev_error(gettext( - "%s may contain a non-efi partition table " - "describing existing\nfilesystems. If you are " - "sure you want to use this device use the\n" - "force command line option.\n"), path); + vdev_error(gettext("%s does not contain an EFI " + "label but it may contain partition\n" + "information in the MBR.\n"), path); return -1; } } @@ -279,14 +272,18 @@ check_disk(const char *path, blkid_cache cache, int force, * label at the end of the device is intact. Rather than use this * label we should play it safe and treat this as a non efi device. */ - if (!force && vtoc->efi_flags & EFI_GPT_PRIMARY_CORRUPT) { - vdev_error(gettext( - "%s contains a corrupt primary efi partition table. " - "If you are\nsure you want to use this device use " - "the force command line option.\n"), path); + if (vtoc->efi_flags & EFI_GPT_PRIMARY_CORRUPT) { efi_free(vtoc); (void) close(fd); - return -1; + + if (force) { + /* Partitions will no be created using the backup */ + return 0; + } else { + vdev_error(gettext("%s contains a corrupt primary " + "EFI label.\n"), path); + return -1; + } } for (i = 0; i < vtoc->efi_nparts; i++) { @@ -295,15 +292,13 @@ check_disk(const char *path, blkid_cache cache, int force, uuid_is_null((uchar_t *)&vtoc->efi_parts[i].p_guid)) continue; - /* Resolve possible symlink to safely append partition */ - if (realpath(path, slice_path) == NULL) { - (void) fprintf(stderr, - gettext("cannot resolve path '%s'\n"), slice_path); - err = errno; - break; - } + if (strncmp(path, UDISK_ROOT, strlen(UDISK_ROOT)) == 0) + (void) snprintf(slice_path, sizeof (slice_path), + "%s%s%d", path, "-part", i+1); + else + (void) snprintf(slice_path, sizeof (slice_path), + "%s%d", path, i+1); - sprintf(slice_path, "%s%d", slice_path, i+1); err = check_slice(slice_path, cache, force, isspare); if (err) break; @@ -369,7 +364,7 @@ is_whole_disk(const char *arg) (void) snprintf(path, sizeof (path), "%s%s%s", RDISK_ROOT, strrchr(arg, '/'), BACKUP_SLICE); - if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) + if ((fd = open(path, O_RDWR|O_DIRECT)) < 0) return (B_FALSE); if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) { (void) close(fd); -- cgit v1.2.3