1 files changed, 335 insertions, 85 deletions
diff --git a/lib/libzfs/libzfs_mount.c b/lib/libzfs/libzfs_mount.c
index 23e45d0d3..ef18bafab 100644
--- a/lib/libzfs/libzfs_mount.c
+++ b/lib/libzfs/libzfs_mount.c
@@ -22,7 +22,7 @@
 /*
  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
  * Copyright 2016 Igor Kozhukhov <[email protected]>
  * Copyright 2017 RackTop Systems.
  * Copyright (c) 2018 Datto Inc.
@@ -34,25 +34,25 @@
  * they are used by mount and unmount and when changing a filesystem's
  * mountpoint.
  *
- * 	zfs_is_mounted()
- * 	zfs_mount()
- * 	zfs_unmount()
- * 	zfs_unmountall()
+ *	zfs_is_mounted()
+ *	zfs_mount()
+ *	zfs_unmount()
+ *	zfs_unmountall()
  *
  * This file also contains the functions used to manage sharing filesystems via
  * NFS and iSCSI:
  *
- * 	zfs_is_shared()
- * 	zfs_share()
- * 	zfs_unshare()
+ *	zfs_is_shared()
+ *	zfs_share()
+ *	zfs_unshare()
  *
- * 	zfs_is_shared_nfs()
- * 	zfs_is_shared_smb()
- * 	zfs_share_proto()
- * 	zfs_shareall();
- * 	zfs_unshare_nfs()
- * 	zfs_unshare_smb()
- * 	zfs_unshareall_nfs()
+ *	zfs_is_shared_nfs()
+ *	zfs_is_shared_smb()
+ *	zfs_share_proto()
+ *	zfs_shareall();
+ *	zfs_unshare_nfs()
+ *	zfs_unshare_smb()
+ *	zfs_unshareall_nfs()
  *	zfs_unshareall_smb()
  *	zfs_unshareall()
  *	zfs_unshareall_bypath()
@@ -60,8 +60,8 @@
  * The following functions are available for pool consumers, and will
  * mount/unmount and share/unshare all datasets within pool:
  *
- * 	zpool_enable_datasets()
- * 	zpool_disable_datasets()
+ *	zpool_enable_datasets()
+ *	zpool_disable_datasets()
  */
 
 #include <dirent.h>
@@ -84,11 +84,15 @@
 #include <libzfs.h>
 
 #include "libzfs_impl.h"
+#include <thread_pool.h>
 
 #include <libshare.h>
 #include <sys/systeminfo.h>
 #define	MAXISALEN	257	/* based on sysinfo(2) man page */
 
+static int mount_tp_nthr = 512;	/* tpool threads for multi-threaded mounting */
+
+static void zfs_mount_task(void *);
 static int zfs_share_proto(zfs_handle_t *, zfs_share_proto_t *);
 zfs_share_type_t zfs_is_shared_proto(zfs_handle_t *, char **,
     zfs_share_proto_t);
@@ -1146,25 +1150,32 @@ remove_mountpoint(zfs_handle_t *zhp)
 	}
 }
 
+/*
+ * Add the given zfs handle to the cb_handles array, dynamically reallocating
+ * the array if it is out of space.
+ */
 void
 libzfs_add_handle(get_all_cb_t *cbp, zfs_handle_t *zhp)
 {
 	if (cbp->cb_alloc == cbp->cb_used) {
 		size_t newsz;
-		void *ptr;
+		zfs_handle_t **newhandles;
 
-		newsz = cbp->cb_alloc ? cbp->cb_alloc * 2 : 64;
-		ptr = zfs_realloc(zhp->zfs_hdl,
-		    cbp->cb_handles, cbp->cb_alloc * sizeof (void *),
-		    newsz * sizeof (void *));
-		cbp->cb_handles = ptr;
+		newsz = cbp->cb_alloc != 0 ? cbp->cb_alloc * 2 : 64;
+		newhandles = zfs_realloc(zhp->zfs_hdl,
+		    cbp->cb_handles, cbp->cb_alloc * sizeof (zfs_handle_t *),
+		    newsz * sizeof (zfs_handle_t *));
+		cbp->cb_handles = newhandles;
 		cbp->cb_alloc = newsz;
 	}
 	cbp->cb_handles[cbp->cb_used++] = zhp;
 }
 
+/*
+ * Recursive helper function used during file system enumeration
+ */
 static int
-mount_cb(zfs_handle_t *zhp, void *data)
+zfs_iter_cb(zfs_handle_t *zhp, void *data)
 {
 	get_all_cb_t *cbp = data;
 
@@ -1196,112 +1207,351 @@ mount_cb(zfs_handle_t *zhp, void *data)
 	}
 
 	libzfs_add_handle(cbp, zhp);
-	if (zfs_iter_filesystems(zhp, mount_cb, cbp) != 0) {
+	if (zfs_iter_filesystems(zhp, zfs_iter_cb, cbp) != 0) {
 		zfs_close(zhp);
 		return (-1);
 	}
 	return (0);
 }
 
+/*
+ * Sort comparator that compares two mountpoint paths. We sort these paths so
+ * that subdirectories immediately follow their parents. This means that we
+ * effectively treat the '/' character as the lowest value non-nul char. An
+ * example sorted list using this comparator would look like:
+ *
+ * /foo
+ * /foo/bar
+ * /foo/bar/baz
+ * /foo/baz
+ * /foo.bar
+ *
+ * The mounting code depends on this ordering to deterministically iterate
+ * over filesystems in order to spawn parallel mount tasks.
+ */
 int
-libzfs_dataset_cmp(const void *a, const void *b)
+mountpoint_cmp(const void *arga, const void *argb)
 {
-	zfs_handle_t **za = (zfs_handle_t **)a;
-	zfs_handle_t **zb = (zfs_handle_t **)b;
+	zfs_handle_t *const *zap = arga;
+	zfs_handle_t *za = *zap;
+	zfs_handle_t *const *zbp = argb;
+	zfs_handle_t *zb = *zbp;
 	char mounta[MAXPATHLEN];
 	char mountb[MAXPATHLEN];
+	const char *a = mounta;
+	const char *b = mountb;
 	boolean_t gota, gotb;
 
-	if ((gota = (zfs_get_type(*za) == ZFS_TYPE_FILESYSTEM)) != 0)
-		verify(zfs_prop_get(*za, ZFS_PROP_MOUNTPOINT, mounta,
+	gota = (zfs_get_type(za) == ZFS_TYPE_FILESYSTEM);
+	if (gota) {
+		verify(zfs_prop_get(za, ZFS_PROP_MOUNTPOINT, mounta,
 		    sizeof (mounta), NULL, NULL, 0, B_FALSE) == 0);
-	if ((gotb = (zfs_get_type(*zb) == ZFS_TYPE_FILESYSTEM)) != 0)
-		verify(zfs_prop_get(*zb, ZFS_PROP_MOUNTPOINT, mountb,
+	}
+	gotb = (zfs_get_type(zb) == ZFS_TYPE_FILESYSTEM);
+	if (gotb) {
+		verify(zfs_prop_get(zb, ZFS_PROP_MOUNTPOINT, mountb,
 		    sizeof (mountb), NULL, NULL, 0, B_FALSE) == 0);
+	}
 
-	if (gota && gotb)
-		return (strcmp(mounta, mountb));
+	if (gota && gotb) {
+		while (*a != '\0' && (*a == *b)) {
+			a++;
+			b++;
+		}
+		if (*a == *b)
+			return (0);
+		if (*a == '\0')
+			return (-1);
+		if (*b == '\0')
+			return (1);
+		if (*a == '/')
+			return (-1);
+		if (*b == '/')
+			return (1);
+		return (*a < *b ? -1 : *a > *b);
+	}
 
 	if (gota)
 		return (-1);
 	if (gotb)
 		return (1);
 
-	return (strcmp(zfs_get_name(*za), zfs_get_name(*zb)));
+	/*
+	 * If neither filesystem has a mountpoint, revert to sorting by
+	 * dataset name.
+	 */
+	return (strcmp(zfs_get_name(za), zfs_get_name(zb)));
 }
 
 /*
- * Mount and share all datasets within the given pool.  This assumes that no
- * datasets within the pool are currently mounted.  Because users can create
- * complicated nested hierarchies of mountpoints, we first gather all the
- * datasets and mountpoints within the pool, and sort them by mountpoint.  Once
- * we have the list of all filesystems, we iterate over them in order and mount
- * and/or share each one.
+ * Return true if path2 is a child of path1. NOTE: path1 "/" is not handled.
  */
-#pragma weak zpool_mount_datasets = zpool_enable_datasets
-int
-zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags)
+static boolean_t
+libzfs_path_contains(const char *path1, const char *path2)
 {
-	get_all_cb_t cb = { 0 };
-	libzfs_handle_t *hdl = zhp->zpool_hdl;
-	zfs_handle_t *zfsp;
-	int i, ret = -1;
-	int *good;
+	return (strstr(path2, path1) == path2 && path2[strlen(path1)] == '/');
+}
+
+/*
+ * Given a mountpoint specified by idx in the handles array, find the first
+ * non-descendant of that mountpoint and return its index. Descendant paths
+ * start with the parent's path. This function relies on the ordering
+ * enforced by mountpoint_cmp().
+ */
+static int
+non_descendant_idx(zfs_handle_t **handles, size_t num_handles, int idx)
+{
+	char parent[ZFS_MAXPROPLEN];
+	char child[ZFS_MAXPROPLEN];
+	int i;
+
+	verify(zfs_prop_get(handles[idx], ZFS_PROP_MOUNTPOINT, parent,
+	    sizeof (parent), NULL, NULL, 0, B_FALSE) == 0);
+
+	for (i = idx + 1; i < num_handles; i++) {
+		verify(zfs_prop_get(handles[i], ZFS_PROP_MOUNTPOINT, child,
+		    sizeof (child), NULL, NULL, 0, B_FALSE) == 0);
+		if (!libzfs_path_contains(parent, child))
+			break;
+	}
+	return (i);
+}
+
+typedef struct mnt_param {
+	libzfs_handle_t	*mnt_hdl;
+	tpool_t		*mnt_tp;
+	zfs_handle_t	**mnt_zhps; /* filesystems to mount */
+	size_t		mnt_num_handles;
+	int		mnt_idx;	/* Index of selected entry to mount */
+	zfs_iter_f	mnt_func;
+	void		*mnt_data;
+} mnt_param_t;
+
+/*
+ * Allocate and populate the parameter struct for the mount task and queue
+ * the entry selected by idx on tp, running it inline if it cannot be queued.
+ */
+static void
+zfs_dispatch_mount(libzfs_handle_t *hdl, zfs_handle_t **handles,
+    size_t num_handles, int idx, zfs_iter_f func, void *data, tpool_t *tp)
+{
+	mnt_param_t *mnt_param = zfs_alloc(hdl, sizeof (mnt_param_t));
+
+	mnt_param->mnt_hdl = hdl;
+	mnt_param->mnt_tp = tp;
+	mnt_param->mnt_zhps = handles;
+	mnt_param->mnt_num_handles = num_handles;
+	mnt_param->mnt_idx = idx;
+	mnt_param->mnt_func = func;
+	mnt_param->mnt_data = data;
+	if (tpool_dispatch(tp, zfs_mount_task, mnt_param) != 0)
+		zfs_mount_task(mnt_param);	/* not queued: run it here */
+}
+
+/*
+ * This is the structure used to keep state of mounting or sharing operations
+ * during a call to zpool_enable_datasets().
+ */
+typedef struct mount_state {
+	/*
+	 * ms_mntstatus is set to -1 if any mount fails. While multiple threads
+	 * could update this variable concurrently, no synchronization is
+	 * needed as it's only ever set to -1.
+	 */
+	int		ms_mntstatus;
+	int		ms_mntflags;
+	const char	*ms_mntopts;
+} mount_state_t;
+
+static int
+zfs_mount_one(zfs_handle_t *zhp, void *arg)
+{
+	mount_state_t *ms = arg;
+	int ret = 0;
 
 	/*
-	 * Gather all non-snap datasets within the pool.
+	 * don't attempt to mount encrypted datasets with
+	 * unloaded keys
 	 */
-	if ((zfsp = zfs_open(hdl, zhp->zpool_name, ZFS_TYPE_DATASET)) == NULL)
-		goto out;
+	if (zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS) ==
+	    ZFS_KEYSTATUS_UNAVAILABLE)
+		return (0);
+
+	if (zfs_mount(zhp, ms->ms_mntopts, ms->ms_mntflags) != 0)
+		ret = ms->ms_mntstatus = -1;
+	return (ret);
+}
+
+static int
+zfs_share_one(zfs_handle_t *zhp, void *arg)
+{
+	mount_state_t *ms = arg;
+	int ret = 0;
+
+	if (zfs_share(zhp) != 0)
+		ret = ms->ms_mntstatus = -1;
+	return (ret);
+}
+
+/*
+ * Thread pool function to mount one file system. On success it dispatches
+ * tasks for its children, relying on the sort order established in
+ * zfs_foreach_mountpoint(); if the callback fails, no children are
+ * dispatched. A chain of entries each descending from the previous gets no
+ * parallelism, since a parent must finish before its children are scheduled.
+ * The task owns arg (a mnt_param_t) and frees it on every exit path.
+ */
+static void
+zfs_mount_task(void *arg)
+{
+	mnt_param_t *mp = arg;
+	int idx = mp->mnt_idx;
+	zfs_handle_t **handles = mp->mnt_zhps;
+	size_t num_handles = mp->mnt_num_handles;
+	char mountpoint[ZFS_MAXPROPLEN];
+
+	verify(zfs_prop_get(handles[idx], ZFS_PROP_MOUNTPOINT, mountpoint,
+	    sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0);
+
+	if (mp->mnt_func(handles[idx], mp->mnt_data) != 0)
+		goto out;	/* callback failed: skip descendants */
 
-	libzfs_add_handle(&cb, zfsp);
-	if (zfs_iter_filesystems(zfsp, mount_cb, &cb) != 0)
-		goto out;
 	/*
-	 * Sort the datasets by mountpoint.
+	 * We dispatch tasks to mount filesystems with mountpoints underneath
+	 * this one. We do this by dispatching the next filesystem with a
+	 * descendant mountpoint of the one we just mounted, then skip all of
+	 * its descendants, dispatch the next descendant mountpoint, and so on.
+	 * The non_descendant_idx() function skips over filesystems that are
+	 * descendants of the filesystem we just dispatched.
 	 */
-	qsort(cb.cb_handles, cb.cb_used, sizeof (void *),
-	    libzfs_dataset_cmp);
+	for (int i = idx + 1; i < num_handles;
+	    i = non_descendant_idx(handles, num_handles, i)) {
+		char child[ZFS_MAXPROPLEN];
+		verify(zfs_prop_get(handles[i], ZFS_PROP_MOUNTPOINT,
+		    child, sizeof (child), NULL, NULL, 0, B_FALSE) == 0);
+		if (!libzfs_path_contains(mountpoint, child))
+			break; /* not a descendant, return */
+		zfs_dispatch_mount(mp->mnt_hdl, handles, num_handles, i,
+		    mp->mnt_func, mp->mnt_data, mp->mnt_tp);
+	}
+out:
+	free(mp);
+}
 
+/*
+ * Issue the func callback for each ZFS handle contained in the handles
+ * array. This function is used to mount all datasets, and so this function
+ * guarantees that filesystems for parent mountpoints are called before their
+ * children. As such, before issuing any callbacks, we first sort the array
+ * of handles by mountpoint.
+ *
+ * Callbacks are issued in one of two ways:
+ *
+ * 1. Sequentially: If the parallel argument is B_FALSE or the ZFS_SERIAL_MOUNT
+ *    environment variable is set, then we issue callbacks sequentially.
+ *
+ * 2. In parallel: If the parallel argument is B_TRUE and the ZFS_SERIAL_MOUNT
+ *    environment variable is not set, then we use a tpool to dispatch threads
+ *    to mount filesystems in parallel. This function dispatches tasks to mount
+ *    the filesystems at the top-level mountpoints, and these tasks in turn
+ *    are responsible for recursively mounting filesystems in their children
+ *    mountpoints.
+ */
+void
+zfs_foreach_mountpoint(libzfs_handle_t *hdl, zfs_handle_t **handles,
+    size_t num_handles, zfs_iter_f func, void *data, boolean_t parallel)
+{
 	/*
-	 * And mount all the datasets, keeping track of which ones
-	 * succeeded or failed.
+	 * The ZFS_SERIAL_MOUNT environment variable is an undocumented
+	 * variable that can be used as a convenience to do a/b comparison
+	 * of serial vs. parallel mounting.
 	 */
-	if ((good = zfs_alloc(zhp->zpool_hdl,
-	    cb.cb_used * sizeof (int))) == NULL)
-		goto out;
+	boolean_t serial_mount = !parallel ||
+	    (getenv("ZFS_SERIAL_MOUNT") != NULL);
 
-	ret = 0;
-	for (i = 0; i < cb.cb_used; i++) {
-		/*
-		 * don't attempt to mount encrypted datasets with
-		 * unloaded keys
-		 */
-		if (zfs_prop_get_int(cb.cb_handles[i], ZFS_PROP_KEYSTATUS) ==
-		    ZFS_KEYSTATUS_UNAVAILABLE)
-			continue;
+	/*
+	 * Sort the datasets by mountpoint. See mountpoint_cmp for details
+	 * of how these are sorted.
+	 */
+	qsort(handles, num_handles, sizeof (zfs_handle_t *), mountpoint_cmp);
 
-		if (zfs_mount(cb.cb_handles[i], mntopts, flags) != 0)
-			ret = -1;
-		else
-			good[i] = 1;
+	if (serial_mount) {
+		for (int i = 0; i < num_handles; i++) {
+			func(handles[i], data);
+		}
+		return;
 	}
 
 	/*
-	 * Then share all the ones that need to be shared. This needs
-	 * to be a separate pass in order to avoid excessive reloading
-	 * of the configuration. Good should never be NULL since
-	 * zfs_alloc is supposed to exit if memory isn't available.
+	 * Issue the callback function for each dataset using a parallel
+	 * algorithm that uses a thread pool to manage threads.
+	 */
+	tpool_t *tp = tpool_create(1, mount_tp_nthr, 0, NULL);
+
+	/*
+	 * There may be multiple "top level" mountpoints outside of the pool's
+	 * root mountpoint, e.g.: /foo /bar. Dispatch a mount task for each of
+	 * these.
 	 */
-	for (i = 0; i < cb.cb_used; i++) {
-		if (good[i] && zfs_share(cb.cb_handles[i]) != 0)
-			ret = -1;
+	for (int i = 0; i < num_handles;
+	    i = non_descendant_idx(handles, num_handles, i)) {
+		zfs_dispatch_mount(hdl, handles, num_handles, i, func, data,
+		    tp);
 	}
 
-	free(good);
+	tpool_wait(tp);	/* wait for all scheduled mounts to complete */
+	tpool_destroy(tp);
+}
+
+/*
+ * Mount and share all datasets within the given pool.  This assumes that no
+ * datasets within the pool are currently mounted.
+ */
+#pragma weak zpool_mount_datasets = zpool_enable_datasets
+int
+zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags)
+{
+	get_all_cb_t cb = { 0 };
+	mount_state_t ms = { 0 };
+	zfs_handle_t *zfsp;
+	int ret = 0;
+
+	if ((zfsp = zfs_open(zhp->zpool_hdl, zhp->zpool_name,
+	    ZFS_TYPE_DATASET)) == NULL)
+		goto out;
+
+	/*
+	 * Gather all non-snapshot datasets within the pool. Start by adding
+	 * the root filesystem for this pool to the list, and then iterate
+	 * over all child filesystems.
+	 */
+	libzfs_add_handle(&cb, zfsp);
+	if (zfs_iter_filesystems(zfsp, zfs_iter_cb, &cb) != 0)
+		goto out;
+
+	/*
+	 * Mount all filesystems
+	 */
+	ms.ms_mntopts = mntopts;
+	ms.ms_mntflags = flags;
+	zfs_foreach_mountpoint(zhp->zpool_hdl, cb.cb_handles, cb.cb_used,
+	    zfs_mount_one, &ms, B_TRUE);
+	if (ms.ms_mntstatus != 0)
+		ret = ms.ms_mntstatus;
+
+	/*
+	 * Share all filesystems that need to be shared. This needs to be
+	 * a separate pass because libshare is not mt-safe, and so we need
+	 * to share serially.
+	 */
+	ms.ms_mntstatus = 0;
+	zfs_foreach_mountpoint(zhp->zpool_hdl, cb.cb_handles, cb.cb_used,
+	    zfs_share_one, &ms, B_FALSE);
+	if (ms.ms_mntstatus != 0)
+		ret = ms.ms_mntstatus;
 
 out:
-	for (i = 0; i < cb.cb_used; i++)
+	for (int i = 0; i < cb.cb_used; i++)
 		zfs_close(cb.cb_handles[i]);
 	free(cb.cb_handles);