summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBrian Behlendorf <[email protected]>2012-09-02 16:34:12 -0700
committerBrian Behlendorf <[email protected]>2012-09-04 15:35:32 -0700
commit395350c85d9903beba43bac7ae79092ae25f1526 (patch)
treee1bd38ebaa3c1ba06d5eaa61bca7e3afee64bd6c
parent594b4dd82a6ba6b046b894a24986ce727f4d7391 (diff)
Improve AF hard disk detection
Use the bdev_physical_block_size() interface to determine the minimum write size which can be issued without incurring a read-modify-write operation. This is used to set the ashift correctly to prevent a performance penalty when using AF hard disks. Unfortunately, this interface isn't entirely reliable because it's not uncommon for disks to misreport this value. For this reason you may still need to manually set your ashift with: zpool create -o ashift=12 ... The solution to this in the upstream Illumos source was to add a whitelist of known offending drives. Maintaining such a list will be a burden, but it still may be worth doing if we can detect a large number of these drives. This should be considered as future work. Reported-by: Richard Yao <[email protected]> Signed-off-by: Brian Behlendorf <[email protected]> Closes #916
-rw-r--r--config/kernel-bdev-physical-size.m439
-rw-r--r--config/kernel.m41
-rw-r--r--include/linux/blkdev_compat.h24
-rw-r--r--zfs_config.h.in3
4 files changed, 62 insertions, 5 deletions
diff --git a/config/kernel-bdev-physical-size.m4 b/config/kernel-bdev-physical-size.m4
new file mode 100644
index 000000000..0a1fe8e26
--- /dev/null
+++ b/config/kernel-bdev-physical-size.m4
@@ -0,0 +1,39 @@
+dnl #
+dnl # 2.6.30 API change
+dnl #
+dnl # The bdev_physical_block_size() interface was added to provide a way
+dnl # to determine the smallest write which can be performed without a
+dnl # read-modify-write operation. From the kernel documentation:
+dnl #
+dnl # What: /sys/block/<disk>/queue/physical_block_size
+dnl # Date: May 2009
+dnl # Contact: Martin K. Petersen <[email protected]>
+dnl # Description:
+dnl # This is the smallest unit the storage device can write
+dnl # without resorting to read-modify-write operation. It is
+dnl # usually the same as the logical block size but may be
+dnl # bigger. One example is SATA drives with 4KB sectors
+dnl # that expose a 512-byte logical block size to the
+dnl # operating system.
+dnl #
+dnl # Unfortunately, this interface isn't entirely reliable because
+dnl # drives are sometimes known to misreport this value.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_BDEV_PHYSICAL_BLOCK_SIZE], [
+ AC_MSG_CHECKING([whether bdev_physical_block_size() is available])
+ tmp_flags="$EXTRA_KCFLAGS"
+ EXTRA_KCFLAGS="-Wno-unused-but-set-variable"
+ ZFS_LINUX_TRY_COMPILE([
+ #include <linux/blkdev.h>
+ ],[
+ struct block_device *bdev = NULL;
+ bdev_physical_block_size(bdev);
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_BDEV_PHYSICAL_BLOCK_SIZE, 1,
+ [bdev_physical_block_size() is available])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+ EXTRA_KCFLAGS="$tmp_flags"
+])
diff --git a/config/kernel.m4 b/config/kernel.m4
index d10c6e628..71b0161a8 100644
--- a/config/kernel.m4
+++ b/config/kernel.m4
@@ -14,6 +14,7 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [
ZFS_AC_KERNEL_OPEN_BDEV_EXCLUSIVE
ZFS_AC_KERNEL_INVALIDATE_BDEV_ARGS
ZFS_AC_KERNEL_BDEV_LOGICAL_BLOCK_SIZE
+ ZFS_AC_KERNEL_BDEV_PHYSICAL_BLOCK_SIZE
ZFS_AC_KERNEL_BIO_EMPTY_BARRIER
ZFS_AC_KERNEL_BIO_FAILFAST
ZFS_AC_KERNEL_BIO_FAILFAST_DTD
diff --git a/include/linux/blkdev_compat.h b/include/linux/blkdev_compat.h
index a5294ceba..1ff8eeaf3 100644
--- a/include/linux/blkdev_compat.h
+++ b/include/linux/blkdev_compat.h
@@ -394,13 +394,27 @@ bio_set_flags_failfast(struct block_device *bdev, int *flags)
/*
* 2.6.30 API change
- * Change to make it explicit there this is the logical block size.
+ * To ensure good performance preferentially use the physical block size
+ * for proper alignment. The physical size is supposed to be the internal
+ * sector size used by the device. This is often 4096 bytes for AF devices,
+ * while a smaller 512 byte logical size is supported for compatibility.
+ *
+ * Unfortunately, many drives still misreport their physical sector size.
+ * For devices which are known to lie you may need to manually set this
+ * at pool creation time with 'zpool create -o ashift=12 ...'.
+ *
+ * When the physical block size interface isn't available, we fall back to
+ * the logical block size interface and then the older hard sector size.
*/
-#ifdef HAVE_BDEV_LOGICAL_BLOCK_SIZE
-# define vdev_bdev_block_size(bdev) bdev_logical_block_size(bdev)
+#ifdef HAVE_BDEV_PHYSICAL_BLOCK_SIZE
+# define vdev_bdev_block_size(bdev) bdev_physical_block_size(bdev)
#else
-# define vdev_bdev_block_size(bdev) bdev_hardsect_size(bdev)
-#endif
+# ifdef HAVE_BDEV_LOGICAL_BLOCK_SIZE
+# define vdev_bdev_block_size(bdev) bdev_logical_block_size(bdev)
+# else
+# define vdev_bdev_block_size(bdev) bdev_hardsect_size(bdev)
+# endif /* HAVE_BDEV_LOGICAL_BLOCK_SIZE */
+#endif /* HAVE_BDEV_PHYSICAL_BLOCK_SIZE */
/*
* 2.6.37 API change
diff --git a/zfs_config.h.in b/zfs_config.h.in
index 260bd5ab7..9ea8d2c53 100644
--- a/zfs_config.h.in
+++ b/zfs_config.h.in
@@ -24,6 +24,9 @@
/* bdev_logical_block_size() is available */
#undef HAVE_BDEV_LOGICAL_BLOCK_SIZE
+/* bdev_physical_block_size() is available */
+#undef HAVE_BDEV_PHYSICAL_BLOCK_SIZE
+
/* struct super_block has s_bdi */
#undef HAVE_BDI