author:    Brian Behlendorf <[email protected]>  2008-11-20 12:01:55 -0800
committer: Brian Behlendorf <[email protected]>  2008-11-20 12:01:55 -0800
commit:    34dc7c2f2553220ebc6e29ca195fb6d57155f95f (patch)
tree:      634a0df4aa30200d83c16025768c9ef76a26136d /zfs/lib/libumem
Initial Linux ZFS GIT Repo
Diffstat (limited to 'zfs/lib/libumem')
32 files changed, 9521 insertions, 0 deletions
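The diff that follows reproduces only the zfs/lib/libumem portion of the full import commit. As a hedged aside, assuming a local clone that contains commit 34dc7c2f2553220ebc6e29ca195fb6d57155f95f, the same path-limited stat and patch can be regenerated with stock git:

    # summary of changed files, restricted to the libumem subtree
    git show --stat 34dc7c2f2553220ebc6e29ca195fb6d57155f95f -- zfs/lib/libumem
    # full patch text for the same subtree
    git show 34dc7c2f2553220ebc6e29ca195fb6d57155f95f -- zfs/lib/libumem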
diff --git a/zfs/lib/libumem/COPYING b/zfs/lib/libumem/COPYING new file mode 100644 index 000000000..90f44901b --- /dev/null +++ b/zfs/lib/libumem/COPYING @@ -0,0 +1,2 @@ +Please see COPYRIGHT and OPENSOLARIS.LICENSE for the copyright +and license details. diff --git a/zfs/lib/libumem/COPYRIGHT b/zfs/lib/libumem/COPYRIGHT new file mode 100644 index 000000000..2ca159945 --- /dev/null +++ b/zfs/lib/libumem/COPYRIGHT @@ -0,0 +1,6 @@ +The bulk of the library is Copyright 2004 Sun Microsystems, Inc. +Portions are Copyright 2006 OmniTI, Inc. + +The library is distributed under the terms of the CDDL. +See the file OPENSOLARIS.LICENSE for more information. + diff --git a/zfs/lib/libumem/Makefile.in b/zfs/lib/libumem/Makefile.in new file mode 100644 index 000000000..008748e7c --- /dev/null +++ b/zfs/lib/libumem/Makefile.in @@ -0,0 +1,40 @@ +subdir-m += include sys +DISTFILES = COPYING COPYRIGHT OPENSOLARIS.LICENSE README README-alpha TODO +DISTFILES += config.h envvar.c getpcstack.c init_lib.c misc.c misc.h +DISTFILES += sol_compat.h umem.c umem_agent_support.c umem_base.h +DISTFILES += umem_fail.c umem_fork.c umem_impl.h umem_update_thread.c +DISTFILES += vmem.c vmem_base.c vmem_base.h vmem_mmap.c vmem_sbrk.c +DISTFILES += vmem_stand.h + +LIBRARY := libumem + +# Compile as shared library. There's an extra useless host program +# here called 'zu' because it was the easiest way I could convince +# the kernel build system to construct a user space shared library. + +HOSTCFLAGS += @HOSTCFLAGS@ +HOSTCFLAGS += -I@LIBDIR@/libumem +HOSTCFLAGS += -I@LIBDIR@/libumem/include +HOSTCFLAGS += -I@LIBDIR@/libumem/sys + +HOSTLDFLAGS += -pthread +HOSTLDFLAGS += -ldl + +hostprogs-y := zu +always := $(hostprogs-y) + +zu-objs := zu.o ${LIBRARY}.so + +${LIBRARY}-objs += envvar.o +${LIBRARY}-objs += getpcstack.o +${LIBRARY}-objs += init_lib.o +${LIBRARY}-objs += misc.o +${LIBRARY}-objs += umem.o +${LIBRARY}-objs += umem_agent_support.o +${LIBRARY}-objs += umem_fail.o +${LIBRARY}-objs += umem_fork.o +${LIBRARY}-objs += umem_update_thread.o +${LIBRARY}-objs += vmem.o +${LIBRARY}-objs += vmem_base.o +${LIBRARY}-objs += vmem_mmap.o +${LIBRARY}-objs += vmem_sbrk.o diff --git a/zfs/lib/libumem/OPENSOLARIS.LICENSE b/zfs/lib/libumem/OPENSOLARIS.LICENSE new file mode 100644 index 000000000..535dec222 --- /dev/null +++ b/zfs/lib/libumem/OPENSOLARIS.LICENSE @@ -0,0 +1,385 @@ +Unless otherwise noted, all files in this distribution are released +under the Common Development and Distribution License (CDDL), +Version 1.0 only. Exceptions are noted within the associated +source files. + +-------------------------------------------------------------------- + + +COMMON DEVELOPMENT AND DISTRIBUTION LICENSE Version 1.0 + +1. Definitions. + + 1.1. "Contributor" means each individual or entity that creates + or contributes to the creation of Modifications. + + 1.2. "Contributor Version" means the combination of the Original + Software, prior Modifications used by a Contributor (if any), + and the Modifications made by that particular Contributor. + + 1.3. "Covered Software" means (a) the Original Software, or (b) + Modifications, or (c) the combination of files containing + Original Software with files containing Modifications, in + each case including portions thereof. + + 1.4. "Executable" means the Covered Software in any form other + than Source Code. + + 1.5. "Initial Developer" means the individual or entity that first + makes Original Software available under this License. + + 1.6. 
"Larger Work" means a work which combines Covered Software or + portions thereof with code not governed by the terms of this + License. + + 1.7. "License" means this document. + + 1.8. "Licensable" means having the right to grant, to the maximum + extent possible, whether at the time of the initial grant or + subsequently acquired, any and all of the rights conveyed + herein. + + 1.9. "Modifications" means the Source Code and Executable form of + any of the following: + + A. Any file that results from an addition to, deletion from or + modification of the contents of a file containing Original + Software or previous Modifications; + + B. Any new file that contains any part of the Original + Software or previous Modifications; or + + C. Any new file that is contributed or otherwise made + available under the terms of this License. + + 1.10. "Original Software" means the Source Code and Executable + form of computer software code that is originally released + under this License. + + 1.11. "Patent Claims" means any patent claim(s), now owned or + hereafter acquired, including without limitation, method, + process, and apparatus claims, in any patent Licensable by + grantor. + + 1.12. "Source Code" means (a) the common form of computer software + code in which modifications are made and (b) associated + documentation included in or with such code. + + 1.13. "You" (or "Your") means an individual or a legal entity + exercising rights under, and complying with all of the terms + of, this License. For legal entities, "You" includes any + entity which controls, is controlled by, or is under common + control with You. For purposes of this definition, + "control" means (a) the power, direct or indirect, to cause + the direction or management of such entity, whether by + contract or otherwise, or (b) ownership of more than fifty + percent (50%) of the outstanding shares or beneficial + ownership of such entity. + +2. License Grants. + + 2.1. The Initial Developer Grant. + + Conditioned upon Your compliance with Section 3.1 below and + subject to third party intellectual property claims, the Initial + Developer hereby grants You a world-wide, royalty-free, + non-exclusive license: + + (a) under intellectual property rights (other than patent or + trademark) Licensable by Initial Developer, to use, + reproduce, modify, display, perform, sublicense and + distribute the Original Software (or portions thereof), + with or without Modifications, and/or as part of a Larger + Work; and + + (b) under Patent Claims infringed by the making, using or + selling of Original Software, to make, have made, use, + practice, sell, and offer for sale, and/or otherwise + dispose of the Original Software (or portions thereof). + + (c) The licenses granted in Sections 2.1(a) and (b) are + effective on the date Initial Developer first distributes + or otherwise makes the Original Software available to a + third party under the terms of this License. + + (d) Notwithstanding Section 2.1(b) above, no patent license is + granted: (1) for code that You delete from the Original + Software, or (2) for infringements caused by: (i) the + modification of the Original Software, or (ii) the + combination of the Original Software with other software + or devices. + + 2.2. Contributor Grant. 
+ + Conditioned upon Your compliance with Section 3.1 below and + subject to third party intellectual property claims, each + Contributor hereby grants You a world-wide, royalty-free, + non-exclusive license: + + (a) under intellectual property rights (other than patent or + trademark) Licensable by Contributor to use, reproduce, + modify, display, perform, sublicense and distribute the + Modifications created by such Contributor (or portions + thereof), either on an unmodified basis, with other + Modifications, as Covered Software and/or as part of a + Larger Work; and + + (b) under Patent Claims infringed by the making, using, or + selling of Modifications made by that Contributor either + alone and/or in combination with its Contributor Version + (or portions of such combination), to make, use, sell, + offer for sale, have made, and/or otherwise dispose of: + (1) Modifications made by that Contributor (or portions + thereof); and (2) the combination of Modifications made by + that Contributor with its Contributor Version (or portions + of such combination). + + (c) The licenses granted in Sections 2.2(a) and 2.2(b) are + effective on the date Contributor first distributes or + otherwise makes the Modifications available to a third + party. + + (d) Notwithstanding Section 2.2(b) above, no patent license is + granted: (1) for any code that Contributor has deleted + from the Contributor Version; (2) for infringements caused + by: (i) third party modifications of Contributor Version, + or (ii) the combination of Modifications made by that + Contributor with other software (except as part of the + Contributor Version) or other devices; or (3) under Patent + Claims infringed by Covered Software in the absence of + Modifications made by that Contributor. + +3. Distribution Obligations. + + 3.1. Availability of Source Code. + + Any Covered Software that You distribute or otherwise make + available in Executable form must also be made available in Source + Code form and that Source Code form must be distributed only under + the terms of this License. You must include a copy of this + License with every copy of the Source Code form of the Covered + Software You distribute or otherwise make available. You must + inform recipients of any such Covered Software in Executable form + as to how they can obtain such Covered Software in Source Code + form in a reasonable manner on or through a medium customarily + used for software exchange. + + 3.2. Modifications. + + The Modifications that You create or to which You contribute are + governed by the terms of this License. You represent that You + believe Your Modifications are Your original creation(s) and/or + You have sufficient rights to grant the rights conveyed by this + License. + + 3.3. Required Notices. + + You must include a notice in each of Your Modifications that + identifies You as the Contributor of the Modification. You may + not remove or alter any copyright, patent or trademark notices + contained within the Covered Software, or any notices of licensing + or any descriptive text giving attribution to any Contributor or + the Initial Developer. + + 3.4. Application of Additional Terms. + + You may not offer or impose any terms on any Covered Software in + Source Code form that alters or restricts the applicable version + of this License or the recipients' rights hereunder. You may + choose to offer, and to charge a fee for, warranty, support, + indemnity or liability obligations to one or more recipients of + Covered Software. 
However, you may do so only on Your own behalf, + and not on behalf of the Initial Developer or any Contributor. + You must make it absolutely clear that any such warranty, support, + indemnity or liability obligation is offered by You alone, and You + hereby agree to indemnify the Initial Developer and every + Contributor for any liability incurred by the Initial Developer or + such Contributor as a result of warranty, support, indemnity or + liability terms You offer. + + 3.5. Distribution of Executable Versions. + + You may distribute the Executable form of the Covered Software + under the terms of this License or under the terms of a license of + Your choice, which may contain terms different from this License, + provided that You are in compliance with the terms of this License + and that the license for the Executable form does not attempt to + limit or alter the recipient's rights in the Source Code form from + the rights set forth in this License. If You distribute the + Covered Software in Executable form under a different license, You + must make it absolutely clear that any terms which differ from + this License are offered by You alone, not by the Initial + Developer or Contributor. You hereby agree to indemnify the + Initial Developer and every Contributor for any liability incurred + by the Initial Developer or such Contributor as a result of any + such terms You offer. + + 3.6. Larger Works. + + You may create a Larger Work by combining Covered Software with + other code not governed by the terms of this License and + distribute the Larger Work as a single product. In such a case, + You must make sure the requirements of this License are fulfilled + for the Covered Software. + +4. Versions of the License. + + 4.1. New Versions. + + Sun Microsystems, Inc. is the initial license steward and may + publish revised and/or new versions of this License from time to + time. Each version will be given a distinguishing version number. + Except as provided in Section 4.3, no one other than the license + steward has the right to modify this License. + + 4.2. Effect of New Versions. + + You may always continue to use, distribute or otherwise make the + Covered Software available under the terms of the version of the + License under which You originally received the Covered Software. + If the Initial Developer includes a notice in the Original + Software prohibiting it from being distributed or otherwise made + available under any subsequent version of the License, You must + distribute and make the Covered Software available under the terms + of the version of the License under which You originally received + the Covered Software. Otherwise, You may also choose to use, + distribute or otherwise make the Covered Software available under + the terms of any subsequent version of the License published by + the license steward. + + 4.3. Modified Versions. + + When You are an Initial Developer and You want to create a new + license for Your Original Software, You may create and use a + modified version of this License if You: (a) rename the license + and remove any references to the name of the license steward + (except to note that the license differs from this License); and + (b) otherwise make it clear that the license contains terms which + differ from this License. + +5. DISCLAIMER OF WARRANTY. 
+ + COVERED SOFTWARE IS PROVIDED UNDER THIS LICENSE ON AN "AS IS" + BASIS, WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, + INCLUDING, WITHOUT LIMITATION, WARRANTIES THAT THE COVERED + SOFTWARE IS FREE OF DEFECTS, MERCHANTABLE, FIT FOR A PARTICULAR + PURPOSE OR NON-INFRINGING. THE ENTIRE RISK AS TO THE QUALITY AND + PERFORMANCE OF THE COVERED SOFTWARE IS WITH YOU. SHOULD ANY + COVERED SOFTWARE PROVE DEFECTIVE IN ANY RESPECT, YOU (NOT THE + INITIAL DEVELOPER OR ANY OTHER CONTRIBUTOR) ASSUME THE COST OF ANY + NECESSARY SERVICING, REPAIR OR CORRECTION. THIS DISCLAIMER OF + WARRANTY CONSTITUTES AN ESSENTIAL PART OF THIS LICENSE. NO USE OF + ANY COVERED SOFTWARE IS AUTHORIZED HEREUNDER EXCEPT UNDER THIS + DISCLAIMER. + +6. TERMINATION. + + 6.1. This License and the rights granted hereunder will terminate + automatically if You fail to comply with terms herein and fail to + cure such breach within 30 days of becoming aware of the breach. + Provisions which, by their nature, must remain in effect beyond + the termination of this License shall survive. + + 6.2. If You assert a patent infringement claim (excluding + declaratory judgment actions) against Initial Developer or a + Contributor (the Initial Developer or Contributor against whom You + assert such claim is referred to as "Participant") alleging that + the Participant Software (meaning the Contributor Version where + the Participant is a Contributor or the Original Software where + the Participant is the Initial Developer) directly or indirectly + infringes any patent, then any and all rights granted directly or + indirectly to You by such Participant, the Initial Developer (if + the Initial Developer is not the Participant) and all Contributors + under Sections 2.1 and/or 2.2 of this License shall, upon 60 days + notice from Participant terminate prospectively and automatically + at the expiration of such 60 day notice period, unless if within + such 60 day period You withdraw Your claim with respect to the + Participant Software against such Participant either unilaterally + or pursuant to a written agreement with Participant. + + 6.3. In the event of termination under Sections 6.1 or 6.2 above, + all end user licenses that have been validly granted by You or any + distributor hereunder prior to termination (excluding licenses + granted to You by any distributor) shall survive termination. + +7. LIMITATION OF LIABILITY. + + UNDER NO CIRCUMSTANCES AND UNDER NO LEGAL THEORY, WHETHER TORT + (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE, SHALL YOU, THE + INITIAL DEVELOPER, ANY OTHER CONTRIBUTOR, OR ANY DISTRIBUTOR OF + COVERED SOFTWARE, OR ANY SUPPLIER OF ANY OF SUCH PARTIES, BE + LIABLE TO ANY PERSON FOR ANY INDIRECT, SPECIAL, INCIDENTAL, OR + CONSEQUENTIAL DAMAGES OF ANY CHARACTER INCLUDING, WITHOUT + LIMITATION, DAMAGES FOR LOST PROFITS, LOSS OF GOODWILL, WORK + STOPPAGE, COMPUTER FAILURE OR MALFUNCTION, OR ANY AND ALL OTHER + COMMERCIAL DAMAGES OR LOSSES, EVEN IF SUCH PARTY SHALL HAVE BEEN + INFORMED OF THE POSSIBILITY OF SUCH DAMAGES. THIS LIMITATION OF + LIABILITY SHALL NOT APPLY TO LIABILITY FOR DEATH OR PERSONAL + INJURY RESULTING FROM SUCH PARTY'S NEGLIGENCE TO THE EXTENT + APPLICABLE LAW PROHIBITS SUCH LIMITATION. SOME JURISDICTIONS DO + NOT ALLOW THE EXCLUSION OR LIMITATION OF INCIDENTAL OR + CONSEQUENTIAL DAMAGES, SO THIS EXCLUSION AND LIMITATION MAY NOT + APPLY TO YOU. + +8. U.S. GOVERNMENT END USERS. + + The Covered Software is a "commercial item," as that term is + defined in 48 C.F.R. 2.101 (Oct. 
1995), consisting of "commercial + computer software" (as that term is defined at 48 + C.F.R. 252.227-7014(a)(1)) and "commercial computer software + documentation" as such terms are used in 48 C.F.R. 12.212 + (Sept. 1995). Consistent with 48 C.F.R. 12.212 and 48 + C.F.R. 227.7202-1 through 227.7202-4 (June 1995), all + U.S. Government End Users acquire Covered Software with only those + rights set forth herein. This U.S. Government Rights clause is in + lieu of, and supersedes, any other FAR, DFAR, or other clause or + provision that addresses Government rights in computer software + under this License. + +9. MISCELLANEOUS. + + This License represents the complete agreement concerning subject + matter hereof. If any provision of this License is held to be + unenforceable, such provision shall be reformed only to the extent + necessary to make it enforceable. This License shall be governed + by the law of the jurisdiction specified in a notice contained + within the Original Software (except to the extent applicable law, + if any, provides otherwise), excluding such jurisdiction's + conflict-of-law provisions. Any litigation relating to this + License shall be subject to the jurisdiction of the courts located + in the jurisdiction and venue specified in a notice contained + within the Original Software, with the losing party responsible + for costs, including, without limitation, court costs and + reasonable attorneys' fees and expenses. The application of the + United Nations Convention on Contracts for the International Sale + of Goods is expressly excluded. Any law or regulation which + provides that the language of a contract shall be construed + against the drafter shall not apply to this License. You agree + that You alone are responsible for compliance with the United + States export administration regulations (and the export control + laws and regulation of any other countries) when You use, + distribute or otherwise make available any Covered Software. + +10. RESPONSIBILITY FOR CLAIMS. + + As between Initial Developer and the Contributors, each party is + responsible for claims and damages arising, directly or + indirectly, out of its utilization of rights under this License + and You agree to work with Initial Developer and Contributors to + distribute such responsibility on an equitable basis. Nothing + herein is intended or shall be deemed to constitute any admission + of liability. + +-------------------------------------------------------------------- + +NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND +DISTRIBUTION LICENSE (CDDL) + +For Covered Software in this distribution, this License shall +be governed by the laws of the State of California (excluding +conflict-of-law provisions). + +Any litigation relating to this License shall be subject to the +jurisdiction of the Federal Courts of the Northern District of +California and the state courts of the State of California, with +venue lying in Santa Clara County, California. diff --git a/zfs/lib/libumem/README b/zfs/lib/libumem/README new file mode 100644 index 000000000..63ec2486d --- /dev/null +++ b/zfs/lib/libumem/README @@ -0,0 +1,23 @@ +Portable libumem. +================ + +This is a port of Solaris libumem to non-Solaris systems. + +The port was made while integrating libumem with our Ecelerity MTA product, so +your initial experience will not be 100% out-of-the-box, because there is no +standalone configure script for the library at this time. (patches welcome!) 
+ +In addition, since our deployment is threaded, we force the library into +threaded mode. + +While the library is itself stable (it's the memory allocator used by the +Solaris OS), the port may have a few rough edges. We're shipping umem with +Linux and Windows versions of our product as we have found it to be stable. + +We will continue to update this project as and when we make improvements, and +welcome third-party patches that improve the usability for everyone. + + +Wez Furlong, +OmniTI, Inc. + diff --git a/zfs/lib/libumem/README-alpha b/zfs/lib/libumem/README-alpha new file mode 100644 index 000000000..44573d8f2 --- /dev/null +++ b/zfs/lib/libumem/README-alpha @@ -0,0 +1,56 @@ +This is the libumem package. +This document describes the actions needed to build the pre-release +or CVS version of the package. See end of file for copying conditions. + +* Introduction + + This is a *pre-release* version, and not ready for production use +yet. If you are taking source from CVS, you will need to have libtool, +automake, and autoconf installed to help contribute. See the chapter +`Building' for the detailed instructions. The script autogen.sh is +provided to help autoconfigure libumem from the cvs src. After you +run autogen.sh, there should be a file 'INSTALL' with (generic) +installation instructions. Package-specific installation instructions +are set forth in the file README. + + Please, note that the accompanying documentation may be inaccurate +or incomplete. The CVS history is the authoritative documentation of +all recent changes. + +Report bugs to (TODO: mailing list <>) + +* Checking Out the Sources + +The following instructions apply if you wish to obtain sources from +the CVS repository: + +To checkout the source tree from CVS issue the following command: + + cvs -z3 -d:pserver:[email protected]:/cvsroot/umem co \ + -P umem + + This will give you read-only access. If you think you need write access, +contact the mailing list. + +* Building + + In order to build this you will first need to have right versions +of autotools and some auxiliary GNU programs. At the time of this +writing these are: + + Package Version (>=) + ======== ============ + automake 1.4 + autoconf 2.50 + libtool 1.5.0 + + To prepare the package for building run autogen.sh. Then run +./configure with the desired options (See INSTALL and README for the +detailed instructions). Finally, run make. Notice that the first make +of the package should be made in the source directory. Subsequent +makes can use build directory different from the source one. + +* Copyright information: + +Please see COPYRIGHT and OPENSOLARIS.LICENSE for the copyright +and license details. diff --git a/zfs/lib/libumem/TODO b/zfs/lib/libumem/TODO new file mode 100644 index 000000000..81ab0c3a5 --- /dev/null +++ b/zfs/lib/libumem/TODO @@ -0,0 +1,19 @@ +To-do List for the Linux port of umem +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +* Autconf'ery for <stdint.h> in "sol_compat.h". + +* Fix #define wreakage for THR_RETURN, THR_API in "sol_compat.h". + +* Replace use of Ecelerity's portable atomic locking header with something + equivalent in "sol_compat.h". + +* ec_debug_vprintf -> something else? + +* test suite. + +* static library support. + +* doxygen'ate the headers/code, to produce reference docs. + +* HAVE_DOT in Doxyfile.in should be detected by configure. 
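The build procedure spelled out in README-alpha above is the standard autotools flow. Below is a minimal sketch of those steps, assuming the prerequisite tool versions it lists (automake >= 1.4, autoconf >= 2.50, libtool >= 1.5.0) are already installed; the commands are illustrative rather than prescriptive:

    # regenerate configure and the generic INSTALL file from the checked-out sources
    ./autogen.sh
    # configure with whatever options INSTALL and README call for
    ./configure
    # the first make must run in the source directory; later builds
    # may use a build directory separate from the source tree
    make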
diff --git a/zfs/lib/libumem/config.h b/zfs/lib/libumem/config.h new file mode 100644 index 000000000..9994cd102 --- /dev/null +++ b/zfs/lib/libumem/config.h @@ -0,0 +1,13 @@ +#ifndef CONFIG_H +#define CONFIG_H + +#define UMEM_PTHREAD_MUTEX_TOO_BIG 1 + +#define HAVE_SYS_TIME_H 1 +#define HAVE_DLFCN_H 1 +#define HAVE_UNISTD_H 1 +#define HAVE_SYS_MMAN_H 1 +#define HAVE_SYS_SYSMACROS_H 1 +#define HAVE_STRINGS_H 1 + +#endif diff --git a/zfs/lib/libumem/envvar.c b/zfs/lib/libumem/envvar.c new file mode 100644 index 000000000..7452de7c5 --- /dev/null +++ b/zfs/lib/libumem/envvar.c @@ -0,0 +1,693 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Portions Copyright 2006 OmniTI, Inc. + */ + +/* #pragma ident "@(#)envvar.c 1.5 05/06/08 SMI" */ + +#include "config.h" +#include <ctype.h> +#include <errno.h> +#include <limits.h> +#include <stdlib.h> +#include <string.h> +#if HAVE_DLFCN_H +#include <dlfcn.h> +#endif + +#include "umem_base.h" +#include "vmem_base.h" + +/* + * A umem environment variable, like UMEM_DEBUG, is set to a series + * of items, seperated by ',': + * + * UMEM_DEBUG="audit=10,guards,firewall=512" + * + * This structure describes items. Each item has a name, type, and + * description. During processing, an item read from the user may + * be either "valid" or "invalid". + * + * A valid item has an argument, if required, and it is of the right + * form (doesn't overflow, doesn't contain any unexpected characters). + * + * If the item is valid, item_flag_target != NULL, and: + * type is not CLEARFLAG, then (*item_flag_target) |= item_flag_value + * type is CLEARFLAG, then (*item_flag_target) &= ~item_flag_value + */ + +#define UMEM_ENV_ITEM_MAX 512 + +struct umem_env_item; + +typedef int arg_process_t(const struct umem_env_item *item, const char *value); +#define ARG_SUCCESS 0 /* processing successful */ +#define ARG_BAD 1 /* argument had a bad value */ + +typedef struct umem_env_item { + const char *item_name; /* tag in environment variable */ + const char *item_interface_stability; + enum { + ITEM_INVALID, + ITEM_FLAG, /* only a flag. 
No argument allowed */ + ITEM_CLEARFLAG, /* only a flag, but clear instead of set */ + ITEM_OPTUINT, /* optional integer argument */ + ITEM_UINT, /* required integer argument */ + ITEM_OPTSIZE, /* optional size_t argument */ + ITEM_SIZE, /* required size_t argument */ + ITEM_SPECIAL /* special argument processing */ + } item_type; + const char *item_description; + uint_t *item_flag_target; /* the variable containing the flag */ + uint_t item_flag_value; /* the value to OR in */ + uint_t *item_uint_target; /* the variable to hold the integer */ + size_t *item_size_target; + arg_process_t *item_special; /* callback for special handling */ +} umem_env_item_t; + +#ifndef UMEM_STANDALONE +static arg_process_t umem_backend_process; +#endif + +static arg_process_t umem_log_process; + +const char *____umem_environ_msg_options = "-- UMEM_OPTIONS --"; + +static umem_env_item_t umem_options_items[] = { +#ifndef UMEM_STANDALONE + { "backend", "Evolving", ITEM_SPECIAL, + "=sbrk for sbrk(2), =mmap for mmap(2)", + NULL, 0, NULL, NULL, + &umem_backend_process + }, +#endif + + { "concurrency", "Private", ITEM_UINT, + "Max concurrency", + NULL, 0, &umem_max_ncpus + }, + { "max_contention", "Private", ITEM_UINT, + "Maximum contention in a reap interval before the depot is " + "resized.", + NULL, 0, &umem_depot_contention + }, + { "nomagazines", "Private", ITEM_FLAG, + "no caches will be multithreaded, and no caching will occur.", + &umem_flags, UMF_NOMAGAZINE + }, + { "reap_interval", "Private", ITEM_UINT, + "Minimum time between reaps and updates, in seconds.", + NULL, 0, &umem_reap_interval + }, + +#ifndef _WIN32 +#ifndef UMEM_STANDALONE + { "sbrk_pagesize", "Private", ITEM_SIZE, + "The preferred page size for the sbrk(2) heap.", + NULL, 0, NULL, &vmem_sbrk_pagesize + }, +#endif +#endif + + { NULL, "-- end of UMEM_OPTIONS --", ITEM_INVALID } +}; + +const char *____umem_environ_msg_debug = "-- UMEM_DEBUG --"; + +static umem_env_item_t umem_debug_items[] = { + { "default", "Unstable", ITEM_FLAG, + "audit,contents,guards", + &umem_flags, + UMF_AUDIT | UMF_CONTENTS | UMF_DEADBEEF | UMF_REDZONE + }, + { "audit", "Unstable", ITEM_OPTUINT, + "Enable auditing. optionally =frames to set the number of " + "stored stack frames", + &umem_flags, UMF_AUDIT, &umem_stack_depth + }, + { "contents", "Unstable", ITEM_OPTSIZE, + "Enable contents storing. UMEM_LOGGING=contents also " + "required. optionally =bytes to set the number of stored " + "bytes", + &umem_flags, UMF_CONTENTS, NULL, &umem_content_maxsave + }, + { "guards", "Unstable", ITEM_FLAG, + "Enables guards and special patterns", + &umem_flags, UMF_DEADBEEF | UMF_REDZONE + }, + { "verbose", "Unstable", ITEM_FLAG, + "Enables writing error messages to stderr", + &umem_output, 1 + }, + + { "nosignal", "Private", ITEM_FLAG, + "Abort if called from a signal handler. Turns on 'audit'. " + "Note that this is not always a bug.", + &umem_flags, UMF_AUDIT | UMF_CHECKSIGNAL + }, + { "firewall", "Private", ITEM_SIZE, + "=minbytes. Every object >= minbytes in size will have its " + "end against an unmapped page", + &umem_flags, UMF_FIREWALL, NULL, &umem_minfirewall + }, + { "lite", "Private", ITEM_FLAG, + "debugging-lite", + &umem_flags, UMF_LITE + }, + { "maxverify", "Private", ITEM_SIZE, + "=maxbytes, Maximum bytes to check when 'guards' is active. " + "Normally all bytes are checked.", + NULL, 0, NULL, &umem_maxverify + }, + { "noabort", "Private", ITEM_CLEARFLAG, + "umem will not abort when a recoverable error occurs " + "(i.e. 
double frees, certain kinds of corruption)", + &umem_abort, 1 + }, + { "mtbf", "Private", ITEM_UINT, + "=mtbf, the mean time between injected failures. Works best " + "if prime.\n", + NULL, 0, &umem_mtbf + }, + { "random", "Private", ITEM_FLAG, + "randomize flags on a per-cache basis", + &umem_flags, UMF_RANDOMIZE + }, + { "allverbose", "Private", ITEM_FLAG, + "Enables writing all logged messages to stderr", + &umem_output, 2 + }, + + { NULL, "-- end of UMEM_DEBUG --", ITEM_INVALID } +}; + +const char *____umem_environ_msg_logging = "-- UMEM_LOGGING --"; + +static umem_env_item_t umem_logging_items[] = { + { "transaction", "Unstable", ITEM_SPECIAL, + "If 'audit' is set in UMEM_DEBUG, the audit structures " + "from previous transactions are entered into this log.", + NULL, 0, NULL, + &umem_transaction_log_size, &umem_log_process + }, + { "contents", "Unstable", ITEM_SPECIAL, + "If 'audit' is set in UMEM_DEBUG, the contents of objects " + "are recorded in this log as they are freed. If the " + "'contents' option is not set in UMEM_DEBUG, the first " + "256 bytes of each freed buffer will be saved.", + &umem_flags, UMF_CONTENTS, NULL, + &umem_content_log_size, &umem_log_process + }, + { "fail", "Unstable", ITEM_SPECIAL, + "Records are entered into this log for every failed " + "allocation.", + NULL, 0, NULL, + &umem_failure_log_size, &umem_log_process + }, + + { "slab", "Private", ITEM_SPECIAL, + "Every slab created will be entered into this log.", + NULL, 0, NULL, + &umem_slab_log_size, &umem_log_process + }, + + { NULL, "-- end of UMEM_LOGGING --", ITEM_INVALID } +}; + +typedef struct umem_envvar { + const char *env_name; + const char *env_func; + umem_env_item_t *env_item_list; + const char *env_getenv_result; + const char *env_func_result; +} umem_envvar_t; + +static umem_envvar_t umem_envvars[] = { + { "UMEM_DEBUG", "_umem_debug_init", umem_debug_items }, + { "UMEM_OPTIONS", "_umem_options_init", umem_options_items }, + { "UMEM_LOGGING", "_umem_logging_init", umem_logging_items }, + { NULL, NULL, NULL } +}; + +static umem_envvar_t *env_current; +#define CURRENT (env_current->env_name) + +static int +empty(const char *str) +{ + char c; + + while ((c = *str) != '\0' && isspace(c)) + str++; + + return (*str == '\0'); +} + +static int +item_uint_process(const umem_env_item_t *item, const char *item_arg) +{ + ulong_t result; + char *endptr = ""; + int olderrno; + + olderrno = errno; + errno = 0; + + if (empty(item_arg)) { + goto badnumber; + } + + result = strtoul(item_arg, &endptr, 10); + + if (result == ULONG_MAX && errno == ERANGE) { + errno = olderrno; + goto overflow; + } + errno = olderrno; + + if (*endptr != '\0') + goto badnumber; + if ((uint_t)result != result) + goto overflow; + + (*item->item_uint_target) = (uint_t)result; + return (ARG_SUCCESS); + +badnumber: + log_message("%s: %s: not a number\n", CURRENT, item->item_name); + return (ARG_BAD); + +overflow: + log_message("%s: %s: overflowed\n", CURRENT, item->item_name); + return (ARG_BAD); +} + +static int +item_size_process(const umem_env_item_t *item, const char *item_arg) +{ + ulong_t result; + ulong_t result_arg; + char *endptr = ""; + int olderrno; + + if (empty(item_arg)) + goto badnumber; + + olderrno = errno; + errno = 0; + + result_arg = strtoul(item_arg, &endptr, 10); + + if (result_arg == ULONG_MAX && errno == ERANGE) { + errno = olderrno; + goto overflow; + } + errno = olderrno; + + result = result_arg; + + switch (*endptr) { + case 't': + case 'T': + result *= 1024; + if (result < result_arg) + goto overflow; + 
/*FALLTHRU*/ + case 'g': + case 'G': + result *= 1024; + if (result < result_arg) + goto overflow; + /*FALLTHRU*/ + case 'm': + case 'M': + result *= 1024; + if (result < result_arg) + goto overflow; + /*FALLTHRU*/ + case 'k': + case 'K': + result *= 1024; + if (result < result_arg) + goto overflow; + endptr++; /* skip over the size character */ + break; + default: + break; /* handled later */ + } + + if (*endptr != '\0') + goto badnumber; + + (*item->item_size_target) = result; + return (ARG_SUCCESS); + +badnumber: + log_message("%s: %s: not a number\n", CURRENT, item->item_name); + return (ARG_BAD); + +overflow: + log_message("%s: %s: overflowed\n", CURRENT, item->item_name); + return (ARG_BAD); +} + +static int +umem_log_process(const umem_env_item_t *item, const char *item_arg) +{ + if (item_arg != NULL) { + int ret; + ret = item_size_process(item, item_arg); + if (ret != ARG_SUCCESS) + return (ret); + + if (*item->item_size_target == 0) + return (ARG_SUCCESS); + } else + *item->item_size_target = 64*1024; + + umem_logging = 1; + return (ARG_SUCCESS); +} + +#ifndef UMEM_STANDALONE +static int +umem_backend_process(const umem_env_item_t *item, const char *item_arg) +{ + const char *name = item->item_name; + + if (item_arg == NULL) + goto fail; + + if (strcmp(item_arg, "sbrk") == 0) + vmem_backend |= VMEM_BACKEND_SBRK; + else if (strcmp(item_arg, "mmap") == 0) + vmem_backend |= VMEM_BACKEND_MMAP; + else + goto fail; + + return (ARG_SUCCESS); + +fail: + log_message("%s: %s: must be %s=sbrk or %s=mmap\n", + CURRENT, name, name, name); + return (ARG_BAD); +} +#endif + +static int +process_item(const umem_env_item_t *item, const char *item_arg) +{ + int arg_required = 0; + arg_process_t *processor; + + switch (item->item_type) { + case ITEM_FLAG: + case ITEM_CLEARFLAG: + case ITEM_OPTUINT: + case ITEM_OPTSIZE: + case ITEM_SPECIAL: + arg_required = 0; + break; + + case ITEM_UINT: + case ITEM_SIZE: + arg_required = 1; + break; + + default: + log_message("%s: %s: Invalid type. Ignored\n", + CURRENT, item->item_name); + return (1); + } + + switch (item->item_type) { + case ITEM_FLAG: + case ITEM_CLEARFLAG: + if (item_arg != NULL) { + log_message("%s: %s: does not take a value. ignored\n", + CURRENT, item->item_name); + return (1); + } + processor = NULL; + break; + + case ITEM_UINT: + case ITEM_OPTUINT: + processor = item_uint_process; + break; + + case ITEM_SIZE: + case ITEM_OPTSIZE: + processor = item_size_process; + break; + + case ITEM_SPECIAL: + processor = item->item_special; + break; + + default: + log_message("%s: %s: Invalid type. 
Ignored\n", + CURRENT, item->item_name); + return (1); + } + + if (arg_required && item_arg == NULL) { + log_message("%s: %s: Required value missing\n", + CURRENT, item->item_name); + goto invalid; + } + + if (item_arg != NULL || item->item_type == ITEM_SPECIAL) { + if (processor(item, item_arg) != ARG_SUCCESS) + goto invalid; + } + + if (item->item_flag_target) { + if (item->item_type == ITEM_CLEARFLAG) + (*item->item_flag_target) &= ~item->item_flag_value; + else + (*item->item_flag_target) |= item->item_flag_value; + } + return (0); + +invalid: + return (1); +} + +#define ENV_SHORT_BYTES 10 /* bytes to print on error */ +void +umem_process_value(umem_env_item_t *item_list, const char *beg, const char *end) +{ + char buf[UMEM_ENV_ITEM_MAX]; + char *argptr; + + size_t count; + + while (beg < end && isspace(*beg)) + beg++; + + while (beg < end && isspace(*(end - 1))) + end--; + + if (beg >= end) { + log_message("%s: empty option\n", CURRENT); + return; + } + + count = end - beg; + + if (count + 1 > sizeof (buf)) { + char outbuf[ENV_SHORT_BYTES + 1]; + /* + * Have to do this, since sprintf("%10s",...) calls malloc() + */ + (void) strncpy(outbuf, beg, ENV_SHORT_BYTES); + outbuf[ENV_SHORT_BYTES] = 0; + + log_message("%s: argument \"%s...\" too long\n", CURRENT, + outbuf); + return; + } + + (void) strncpy(buf, beg, count); + buf[count] = 0; + + argptr = strchr(buf, '='); + + if (argptr != NULL) + *argptr++ = 0; + + for (; item_list->item_name != NULL; item_list++) { + if (strcmp(buf, item_list->item_name) == 0) { + (void) process_item(item_list, argptr); + return; + } + } + log_message("%s: '%s' not recognized\n", CURRENT, buf); +} + +/*ARGSUSED*/ +void +umem_setup_envvars(int invalid) +{ + umem_envvar_t *cur_env; + static volatile enum { + STATE_START, + STATE_GETENV, + STATE_DLSYM, + STATE_FUNC, + STATE_DONE + } state = STATE_START; +#ifndef UMEM_STANDALONE + void *h; +#endif + + if (invalid) { + const char *where; + /* + * One of the calls below invoked malloc() recursively. We + * remove any partial results and return. 
+ */ + + switch (state) { + case STATE_START: + where = "before getenv(3C) calls -- " + "getenv(3C) results ignored."; + break; + case STATE_GETENV: + where = "during getenv(3C) calls -- " + "getenv(3C) results ignored."; + break; + case STATE_DLSYM: + where = "during dlsym(3C) call -- " + "_umem_*() results ignored."; + break; + case STATE_FUNC: + where = "during _umem_*() call -- " + "_umem_*() results ignored."; + break; + case STATE_DONE: + where = "after dlsym() or _umem_*() calls."; + break; + default: + where = "at unknown point -- " + "_umem_*() results ignored."; + break; + } + + log_message("recursive allocation %s\n", where); + + for (cur_env = umem_envvars; cur_env->env_name != NULL; + cur_env++) { + if (state == STATE_GETENV) + cur_env->env_getenv_result = NULL; + if (state != STATE_DONE) + cur_env->env_func_result = NULL; + } + + state = STATE_DONE; + return; + } + + state = STATE_GETENV; + + for (cur_env = umem_envvars; cur_env->env_name != NULL; cur_env++) { + cur_env->env_getenv_result = getenv(cur_env->env_name); + if (state == STATE_DONE) + return; /* recursed */ + } + +#ifndef UMEM_STANDALONE +#ifdef _WIN32 +# define dlopen(a, b) GetModuleHandle(NULL) +# define dlsym(a, b) GetProcAddress((HANDLE)a, b) +# define dlclose(a) 0 +# define dlerror() 0 +#endif + /* get a handle to the "a.out" object */ + if ((h = dlopen(0, RTLD_FIRST | RTLD_LAZY)) != NULL) { + for (cur_env = umem_envvars; cur_env->env_name != NULL; + cur_env++) { + const char *(*func)(void); + const char *value; + + state = STATE_DLSYM; + func = (const char *(*)(void))dlsym(h, + cur_env->env_func); + + if (state == STATE_DONE) + break; /* recursed */ + + state = STATE_FUNC; + if (func != NULL) { + value = func(); + if (state == STATE_DONE) + break; /* recursed */ + cur_env->env_func_result = value; + } + } + (void) dlclose(h); + } else { + (void) dlerror(); /* snarf dlerror() */ + } +#endif /* UMEM_STANDALONE */ + + state = STATE_DONE; +} + +/* + * Process the environment variables. + */ +void +umem_process_envvars(void) +{ + const char *value; + const char *end, *next; + umem_envvar_t *cur_env; + + for (cur_env = umem_envvars; cur_env->env_name != NULL; cur_env++) { + env_current = cur_env; + + value = cur_env->env_getenv_result; + if (value == NULL) + value = cur_env->env_func_result; + + /* ignore if missing or empty */ + if (value == NULL) + continue; + + for (end = value; *end != '\0'; value = next) { + end = strchr(value, ','); + if (end != NULL) + next = end + 1; /* skip the comma */ + else + next = end = value + strlen(value); + + umem_process_value(cur_env->env_item_list, value, end); + } + } +} diff --git a/zfs/lib/libumem/getpcstack.c b/zfs/lib/libumem/getpcstack.c new file mode 100644 index 000000000..dffcc8591 --- /dev/null +++ b/zfs/lib/libumem/getpcstack.c @@ -0,0 +1,211 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Portions Copyright 2006 OmniTI, Inc. + */ + +/* #pragma ident "@(#)getpcstack.c 1.5 05/06/08 SMI" */ + +#include "config.h" +#include "misc.h" + +#if HAVE_UCONTEXT_H +#include <ucontext.h> +#endif + +#if HAVE_SYS_FRAME_H +#include <sys/frame.h> +#endif +#if HAVE_SYS_STACK_H +#include <sys/stack.h> +#endif + +#include <stdio.h> + +#if defined(__MACH__) +/* + * Darwin doesn't have any exposed frame info, so give it some space. + */ +#define UMEM_FRAMESIZE (2 * sizeof(long long)) + +#elif defined(__sparc) || defined(__sparcv9) +extern void flush_windows(void); +#define UMEM_FRAMESIZE MINFRAME + +#elif defined(__i386) || defined(__amd64) +/* + * On x86, MINFRAME is defined to be 0, but we want to be sure we can + * dereference the entire frame structure. + */ +#define UMEM_FRAMESIZE (sizeof (struct frame)) + +#elif !defined(EC_UMEM_DUMMY_PCSTACK) +#error needs update for new architecture +#endif + +/* + * Get a pc-only stacktrace. Used for kmem_alloc() buffer ownership tracking. + * Returns MIN(current stack depth, pcstack_limit). + */ +/*ARGSUSED*/ +int +getpcstack(uintptr_t *pcstack, int pcstack_limit, int check_signal) +{ +#ifdef EC_UMEM_DUMMY_PCSTACK + return 0; +#else + struct frame *fp; + struct frame *nextfp, *minfp; + int depth = 0; + uintptr_t base = 0; + size_t size = 0; +#ifndef UMEM_STANDALONE + int on_altstack = 0; + uintptr_t sigbase = 0; + size_t sigsize = 0; + + stack_t st; + + if (stack_getbounds(&st) != 0) { + if (thr_stksegment(&st) != 0 || + (uintptr_t)st.ss_sp < st.ss_size) { + return (0); /* unable to get stack bounds */ + } + /* + * thr_stksegment(3C) has a slightly different interface than + * stack_getbounds(3C) -- correct it + */ + st.ss_sp = (void *)(((uintptr_t)st.ss_sp) - st.ss_size); + st.ss_flags = 0; /* can't be on-stack */ + } + on_altstack = (st.ss_flags & SS_ONSTACK); + + if (st.ss_size != 0) { + base = (uintptr_t)st.ss_sp; + size = st.ss_size; + } else { + /* + * If size == 0, then ss_sp is the *top* of the stack. + * + * Since we only allow increasing frame pointers, and we + * know our caller set his up correctly, we can treat ss_sp + * as an upper bound safely. + */ + base = 0; + size = (uintptr_t)st.ss_sp; + } + + if (check_signal != 0) { + void (*sigfunc)() = NULL; + int sigfuncsize = 0; + extern void thr_sighndlrinfo(void (**)(), int *); + + thr_sighndlrinfo(&sigfunc, &sigfuncsize); + sigbase = (uintptr_t)sigfunc; + sigsize = sigfuncsize; + } +#else /* UMEM_STANDALONE */ + base = (uintptr_t)umem_min_stack; + size = umem_max_stack - umem_min_stack; +#endif + + /* + * shorten size so that fr_savfp and fr_savpc will be within the stack + * bounds. + */ + if (size >= UMEM_FRAMESIZE - 1) + size -= (UMEM_FRAMESIZE - 1); + else + size = 0; + +#if defined(__sparc) || defined(__sparcv9) + flush_windows(); +#endif + + /* LINTED alignment */ + fp = (struct frame *)((caddr_t)getfp() + STACK_BIAS); + + minfp = fp; + + if (((uintptr_t)fp - base) >= size) + return (0); /* the frame pointer isn't in our stack */ + + while (depth < pcstack_limit) { + uintptr_t tmp; + + /* LINTED alignment */ + nextfp = (struct frame *)((caddr_t)fp->fr_savfp + STACK_BIAS); + tmp = (uintptr_t)nextfp; + + /* + * Check nextfp for validity. 
It must be properly aligned, + * increasing compared to the last %fp (or the top of the + * stack we just switched to), and it must be inside + * [base, base + size). + */ + if (tmp != SA(tmp)) + break; + else if (nextfp <= minfp || (tmp - base) >= size) { +#ifndef UMEM_STANDALONE + if (tmp == NULL || !on_altstack) + break; + /* + * If we're on an alternate signal stack, try jumping + * to the main thread stack. + * + * If the main thread stack has an unlimited size, we + * punt, since we don't know where the frame pointer's + * been. + * + * (thr_stksegment() returns the *top of stack* + * in ss_sp, not the bottom) + */ + if (thr_stksegment(&st) == 0) { + if (st.ss_size >= (uintptr_t)st.ss_sp || + st.ss_size < UMEM_FRAMESIZE - 1) + break; + + on_altstack = 0; + base = (uintptr_t)st.ss_sp - st.ss_size; + size = st.ss_size - (UMEM_FRAMESIZE - 1); + minfp = (struct frame *)base; + continue; /* try again */ + } +#endif + break; + } + +#ifndef UMEM_STANDALONE + if (check_signal && (fp->fr_savpc - sigbase) <= sigsize) + umem_panic("called from signal handler"); +#endif + pcstack[depth++] = fp->fr_savpc; + fp = nextfp; + minfp = fp; + } + return (depth); +#endif +} diff --git a/zfs/lib/libumem/include/Makefile.in b/zfs/lib/libumem/include/Makefile.in new file mode 100644 index 000000000..b16a5d217 --- /dev/null +++ b/zfs/lib/libumem/include/Makefile.in @@ -0,0 +1 @@ +DISTFILES = umem.h diff --git a/zfs/lib/libumem/include/umem.h b/zfs/lib/libumem/include/umem.h new file mode 100644 index 000000000..ac6ed92ee --- /dev/null +++ b/zfs/lib/libumem/include/umem.h @@ -0,0 +1,86 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _UMEM_H +#define _UMEM_H + + + +#include <sys/types.h> +#include <sys/vmem.h> +#include <stdlib.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define UMEM_DEFAULT 0x0000 /* normal -- may fail */ +#define UMEM_NOFAIL 0x0100 /* Never fails -- may call exit(2) */ + +#define UMEM_FLAGS 0xffff /* all settable umem flags */ + +extern void *umem_alloc(size_t, int); +extern void *umem_alloc_align(size_t, size_t, int); +extern void *umem_zalloc(size_t, int); +extern void umem_free(void *, size_t); +extern void umem_free_align(void *, size_t); + +/* + * Flags for umem_cache_create() + */ +#define UMC_NOTOUCH 0x00010000 +#define UMC_NODEBUG 0x00020000 +#define UMC_NOMAGAZINE 0x00040000 +#define UMC_NOHASH 0x00080000 + +struct umem_cache; /* cache structure is opaque to umem clients */ + +typedef struct umem_cache umem_cache_t; +typedef int umem_constructor_t(void *, void *, int); +typedef void umem_destructor_t(void *, void *); +typedef void umem_reclaim_t(void *); + +typedef int umem_nofail_callback_t(void); +#define UMEM_CALLBACK_RETRY 0 +#define UMEM_CALLBACK_EXIT(status) (0x100 | ((status) & 0xFF)) + +extern void umem_nofail_callback(umem_nofail_callback_t *); + +extern umem_cache_t *umem_cache_create(char *, size_t, + size_t, umem_constructor_t *, umem_destructor_t *, umem_reclaim_t *, + void *, vmem_t *, int); +extern void umem_cache_destroy(umem_cache_t *); + +extern void *umem_cache_alloc(umem_cache_t *, int); +extern void umem_cache_free(umem_cache_t *, void *); + +extern void umem_reap(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _UMEM_H */ diff --git a/zfs/lib/libumem/init_lib.c b/zfs/lib/libumem/init_lib.c new file mode 100644 index 000000000..bc1b3819f --- /dev/null +++ b/zfs/lib/libumem/init_lib.c @@ -0,0 +1,149 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Portions Copyright 2006 OmniTI, Inc. + */ + +/* #pragma ident "@(#)init_lib.c 1.2 05/06/08 SMI" */ + +/* + * Initialization routines for the library version of libumem. 
+ */ + +#include "config.h" +#include "umem_base.h" +#include "vmem_base.h" + +#if HAVE_UNISTD_H +#include <unistd.h> +#endif +#if HAVE_DLFCN_H +#include <dlfcn.h> +#endif + +#include <fcntl.h> +#include <string.h> + +#ifdef __FreeBSD__ +#include <machine/param.h> +#endif + +void +vmem_heap_init(void) +{ +#ifdef _WIN32 + vmem_backend = VMEM_BACKEND_MMAP; +#else +#if 0 + void *handle = dlopen("libmapmalloc.so.1", RTLD_NOLOAD); + + if (handle != NULL) { +#endif + log_message("sbrk backend disabled\n"); + vmem_backend = VMEM_BACKEND_MMAP; +#if 0 + } +#endif +#endif + + if ((vmem_backend & VMEM_BACKEND_MMAP) != 0) { + vmem_backend = VMEM_BACKEND_MMAP; + (void) vmem_mmap_arena(NULL, NULL); + } else { +#ifndef _WIN32 + vmem_backend = VMEM_BACKEND_SBRK; + (void) vmem_sbrk_arena(NULL, NULL); +#endif + } +} + +/*ARGSUSED*/ +void +umem_type_init(caddr_t start, size_t len, size_t pgsize) +{ +#ifdef _WIN32 + SYSTEM_INFO info; + GetSystemInfo(&info); + pagesize = info.dwPageSize; +#elif !defined(__FreeBSD__) + pagesize = _sysconf(_SC_PAGESIZE); +#else + pagesize = PAGE_SIZE; +#endif +} + +int +umem_get_max_ncpus(void) +{ +#ifdef linux + /* + * HACK: sysconf() will invoke malloc() on Linux as part of reading + * in /proc/stat. To avoid recursion in the malloc replacement + * version of libumem, read /proc/stat into a static buffer. + */ + static char proc_stat[8192]; + int fd; + int ncpus = 1; + + fd = open("/proc/stat", O_RDONLY); + if (fd >= 0) { + const ssize_t n = read(fd, proc_stat, sizeof(proc_stat) - 1); + if (n >= 0) { + const char *cur; + const char *next; + + proc_stat[n] = '\0'; + cur = proc_stat; + while (*cur && (next = strstr(cur + 3, "cpu"))) { + cur = next; + } + + if (*cur) + ncpus = atoi(cur + 3) + 1; + } + + close(fd); + } + + return ncpus; + +#else /* !linux */ + +#if _SC_NPROCESSORS_ONLN + return (2 * sysconf(_SC_NPROCESSORS_ONLN)); +#elif defined(_SC_NPROCESSORS_CONF) + return (2 * sysconf(_SC_NPROCESSORS_CONF)); +#elif defined(_WIN32) + SYSTEM_INFO info; + GetSystemInfo(&info); + return info.dwNumberOfProcessors; +#else + /* XXX: determine CPU count on other platforms */ + return (1); +#endif + +#endif /* linux */ +} diff --git a/zfs/lib/libumem/misc.c b/zfs/lib/libumem/misc.c new file mode 100644 index 000000000..aa4d63ff0 --- /dev/null +++ b/zfs/lib/libumem/misc.c @@ -0,0 +1,298 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Portions Copyright 2006 OmniTI, Inc. 
+ */ + +/* #pragma ident "@(#)misc.c 1.6 05/06/08 SMI" */ + +#define _BUILDING_UMEM_MISC_C +#include "config.h" +/* #include "mtlib.h" */ +#if HAVE_UNISTD_H +#include <unistd.h> +#endif +#if HAVE_DLFCN_H +#include <dlfcn.h> +#endif +#include <signal.h> +#include <stdarg.h> +#include <stdio.h> +#include <string.h> + +#if HAVE_SYS_MACHELF_H +#include <sys/machelf.h> +#endif + +#include <umem_impl.h> +#include "misc.h" + +#ifdef ECELERITY +#include "util.h" +#endif + +#define UMEM_ERRFD 2 /* goes to standard error */ +#define UMEM_MAX_ERROR_SIZE 4096 /* error messages are truncated to this */ + +/* + * This is a circular buffer for holding error messages. + * umem_error_enter appends to the buffer, adding "..." to the beginning + * if data has been lost. + */ + +#define ERR_SIZE 8192 /* must be a power of 2 */ + +static mutex_t umem_error_lock = DEFAULTMUTEX; + +static char umem_error_buffer[ERR_SIZE] = ""; +static uint_t umem_error_begin = 0; +static uint_t umem_error_end = 0; + +#define WRITE_AND_INC(var, value) { \ + umem_error_buffer[(var)++] = (value); \ + var = P2PHASE((var), ERR_SIZE); \ +} + +static void +umem_log_enter(const char *error_str, int serious) +{ + int looped; + char c; + + looped = 0; +#ifdef ECELERITY + mem_printf(serious ? DCRITICAL : DINFO, "umem: %s", error_str); +#endif + + (void) mutex_lock(&umem_error_lock); + + while ((c = *error_str++) != '\0') { + WRITE_AND_INC(umem_error_end, c); + if (umem_error_end == umem_error_begin) + looped = 1; + } + + umem_error_buffer[umem_error_end] = 0; + + if (looped) { + uint_t idx; + umem_error_begin = P2PHASE(umem_error_end + 1, ERR_SIZE); + + idx = umem_error_begin; + WRITE_AND_INC(idx, '.'); + WRITE_AND_INC(idx, '.'); + WRITE_AND_INC(idx, '.'); + } + + (void) mutex_unlock(&umem_error_lock); +} + +void +umem_error_enter(const char *error_str) +{ +#ifndef UMEM_STANDALONE + if (umem_output && !issetugid()) + (void) write(UMEM_ERRFD, error_str, strlen(error_str)); +#endif + + umem_log_enter(error_str, 1); +} + +int +highbit(ulong_t i) +{ + register int h = 1; + + if (i == 0) + return (0); +#ifdef _LP64 + if (i & 0xffffffff00000000ul) { + h += 32; i >>= 32; + } +#endif + if (i & 0xffff0000) { + h += 16; i >>= 16; + } + if (i & 0xff00) { + h += 8; i >>= 8; + } + if (i & 0xf0) { + h += 4; i >>= 4; + } + if (i & 0xc) { + h += 2; i >>= 2; + } + if (i & 0x2) { + h += 1; + } + return (h); +} + +int +lowbit(ulong_t i) +{ + register int h = 1; + + if (i == 0) + return (0); +#ifdef _LP64 + if (!(i & 0xffffffff)) { + h += 32; i >>= 32; + } +#endif + if (!(i & 0xffff)) { + h += 16; i >>= 16; + } + if (!(i & 0xff)) { + h += 8; i >>= 8; + } + if (!(i & 0xf)) { + h += 4; i >>= 4; + } + if (!(i & 0x3)) { + h += 2; i >>= 2; + } + if (!(i & 0x1)) { + h += 1; + } + return (h); +} + +void +hrt2ts(hrtime_t hrt, timestruc_t *tsp) +{ + tsp->tv_sec = hrt / NANOSEC; + tsp->tv_nsec = hrt % NANOSEC; +} + +void +log_message(const char *format, ...) +{ + char buf[UMEM_MAX_ERROR_SIZE] = ""; + + va_list va; + + va_start(va, format); + (void) vsnprintf(buf, UMEM_MAX_ERROR_SIZE-1, format, va); + va_end(va); + +#ifndef UMEM_STANDALONE + if (umem_output > 1) + (void) write(UMEM_ERRFD, buf, strlen(buf)); +#endif + + umem_log_enter(buf, 0); +} + +#ifndef UMEM_STANDALONE +void +debug_printf(const char *format, ...) 
+{ + char buf[UMEM_MAX_ERROR_SIZE] = ""; + + va_list va; + + va_start(va, format); + (void) vsnprintf(buf, UMEM_MAX_ERROR_SIZE-1, format, va); + va_end(va); + + (void) write(UMEM_ERRFD, buf, strlen(buf)); +} +#endif + +void +umem_vprintf(const char *format, va_list va) +{ + char buf[UMEM_MAX_ERROR_SIZE] = ""; + + (void) vsnprintf(buf, UMEM_MAX_ERROR_SIZE-1, format, va); + + umem_error_enter(buf); +} + +void +umem_printf(const char *format, ...) +{ + va_list va; + + va_start(va, format); + umem_vprintf(format, va); + va_end(va); +} + +/*ARGSUSED*/ +void +umem_printf_warn(void *ignored, const char *format, ...) +{ + va_list va; + + va_start(va, format); + umem_vprintf(format, va); + va_end(va); +} + +/* + * print_sym tries to print out the symbol and offset of a pointer + */ +int +print_sym(void *pointer) +{ +#if HAVE_SYS_MACHELF_H + int result; + Dl_info sym_info; + + uintptr_t end = NULL; + + Sym *ext_info = NULL; + + result = dladdr1(pointer, &sym_info, (void **)&ext_info, + RTLD_DL_SYMENT); + + if (result != 0) { + const char *endpath; + + end = (uintptr_t)sym_info.dli_saddr + ext_info->st_size; + + endpath = strrchr(sym_info.dli_fname, '/'); + if (endpath) + endpath++; + else + endpath = sym_info.dli_fname; + umem_printf("%s'", endpath); + } + + if (result == 0 || (uintptr_t)pointer > end) { + umem_printf("?? (0x%p)", pointer); + return (0); + } else { + umem_printf("%s+0x%p", sym_info.dli_sname, + (char *)pointer - (char *)sym_info.dli_saddr); + return (1); + } +#else + return 0; +#endif +} diff --git a/zfs/lib/libumem/misc.h b/zfs/lib/libumem/misc.h new file mode 100644 index 000000000..41e1fde3a --- /dev/null +++ b/zfs/lib/libumem/misc.h @@ -0,0 +1,150 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Portions Copyright 2006 OmniTI, Inc. + */ + +#ifndef _MISC_H +#define _MISC_H + +/* #pragma ident "@(#)misc.h 1.6 05/06/08 SMI" */ + +#include "config.h" +#include <sys/types.h> +#ifndef _WIN32 +#include <sys/time.h> +#endif +#ifdef HAVE_THREAD_H +# include <thread.h> +#else +# include "sol_compat.h" +#endif +#include <stdarg.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern uint_t umem_abort; /* abort when errors occur */ +extern uint_t umem_output; /* output error messages to stderr */ +extern caddr_t umem_min_stack; /* max stack address for audit log */ +extern caddr_t umem_max_stack; /* min stack address for audit log */ + +/* + * various utility functions + * These are globally implemented. + */ + +#undef offsetof +#define offsetof(s, m) ((size_t)(&(((s *)0)->m))) + +/* + * a safe printf -- do not use for error messages. 
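print_sym() above depends on dladdr1() with RTLD_DL_SYMENT to learn the symbol's size. Where only plain dladdr() is available, a reduced "symbol+offset" printer can still be built, just without the past-the-end check. A hedged standalone sketch (assumes dladdr() is present; on Linux compile with -D_GNU_SOURCE and, for older glibc, link with -ldl):

#define _GNU_SOURCE
#include <dlfcn.h>
#include <stdint.h>
#include <stdio.h>

/* Print "symbol+0xoffset" for a pointer, or the raw address on a miss. */
static void
print_sym_simple(void *pointer)
{
	Dl_info info;

	if (dladdr(pointer, &info) != 0 && info.dli_sname != NULL) {
		(void) printf("%s+0x%lx\n", info.dli_sname,
		    (unsigned long)((char *)pointer -
		    (char *)info.dli_saddr));
	} else {
		(void) printf("?? (%p)\n", pointer);
	}
}

int
main(void)
{
	/* a libc symbol is always in the dynamic symbol table */
	print_sym_simple((void *)(uintptr_t)printf);
	return (0);
}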
+ */ +void debug_printf(const char *format, ...); + +/* + * adds a message to the log without writing it out. + */ +void log_message(const char *format, ...); + +/* + * returns the index of the (high/low) bit + 1 + */ +int highbit(ulong_t) __attribute__ ((pure)); +int lowbit(ulong_t) __attribute__ ((pure)); +/* #pragma no_side_effect(highbit, lowbit) */ + +/* + * Converts a hrtime_t to a timestruc_t + */ +void hrt2ts(hrtime_t hrt, timestruc_t *tsp); + +/* + * tries to print out the symbol and offset of a pointer using umem_error_info + */ +int print_sym(void *pointer); + +/* + * Information about the current error. Can be called multiple times, should + * be followed eventually with a call to umem_err or umem_err_recoverable. + */ +void umem_printf(const char *format, ...); +void umem_vprintf(const char *format, va_list); + +void umem_printf_warn(void *ignored, const char *format, ...); + +void umem_error_enter(const char *); + +/* + * prints error message and stack trace, then aborts. Cannot return. + */ +void umem_panic(const char *format, ...) __attribute__((noreturn)); +/* #pragma does_not_return(umem_panic) */ +/* #pragma rarely_called(umem_panic) */ + +/* + * like umem_err, but only aborts if umem_abort > 0 + */ +void umem_err_recoverable(const char *format, ...); + +/* + * We define our own assertion handling since libc's assert() calls malloc() + */ +#ifdef NDEBUG +#define ASSERT(assertion) (void)0 +#else +#define ASSERT(assertion) (void)((assertion) || \ + __umem_assert_failed(#assertion, __FILE__, __LINE__)) +#endif + +int __umem_assert_failed(const char *assertion, const char *file, int line) __attribute__ ((noreturn)); +/* #pragma does_not_return(__umem_assert_failed) */ +/* #pragma rarely_called(__umem_assert_failed) */ +/* + * These have architecture-specific implementations. + */ + +/* + * Returns the current function's frame pointer. + */ +extern void *getfp(void); + +/* + * puts a pc-only stack trace of up to pcstack_limit frames into pcstack. + * Returns the number of stacks written. + * + * if check_sighandler != 0, and we are in a signal context, calls + * umem_err_recoverable. + */ +extern int getpcstack(uintptr_t *pcstack, int pcstack_limit, + int check_sighandler); + +#ifdef __cplusplus +} +#endif + +#endif /* _MISC_H */ diff --git a/zfs/lib/libumem/sol_compat.h b/zfs/lib/libumem/sol_compat.h new file mode 100644 index 000000000..4b7e6cf32 --- /dev/null +++ b/zfs/lib/libumem/sol_compat.h @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2006 OmniTI, Inc. All rights reserved + * This header file distributed under the terms of the CDDL. + * Portions Copyright 2004 Sun Microsystems, Inc. All Rights reserved. 
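The highbit()/lowbit() prototypes above return a 1-based bit index and 0 for a zero argument, so highbit(0x80) == 8 and lowbit(0x18) == 4. A standalone cross-check of that convention using compiler builtins (assumes a GCC-compatible compiler; this is not the library's implementation, which is the portable shift ladder in misc.c above):

#include <assert.h>
#include <limits.h>
#include <stdio.h>

typedef unsigned long ulong_t;

/* 1-based index of the highest set bit; 0 if no bit is set. */
static int
highbit_ref(ulong_t i)
{
	if (i == 0)
		return (0);
	return ((int)(sizeof (ulong_t) * CHAR_BIT) - __builtin_clzl(i));
}

/* 1-based index of the lowest set bit; 0 if no bit is set. */
static int
lowbit_ref(ulong_t i)
{
	if (i == 0)
		return (0);
	return (__builtin_ctzl(i) + 1);
}

int
main(void)
{
	assert(highbit_ref(0) == 0 && lowbit_ref(0) == 0);
	assert(highbit_ref(1) == 1 && lowbit_ref(1) == 1);
	assert(highbit_ref(0x80) == 8);
	assert(lowbit_ref(0x18) == 4);
	(void) printf("bit index convention verified\n");
	return (0);
}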
+ */ +#ifndef _EC_UMEM_SOL_COMPAT_H_ +#define _EC_UMEM_SOL_COMPAT_H_ + +#include "config.h" + +#include <stdint.h> +#include <pthread.h> + +#ifdef HAVE_SYS_TIME_H +#include <sys/time.h> +#endif + +#ifdef _WIN32 +# define THR_RETURN DWORD +# define THR_API WINAPI +# define INLINE __inline +#else +# define THR_RETURN void * +# define THR_API +# define INLINE inline +#endif + +#if defined(__MACH__) || defined(_WIN32) +#define NO_WEAK_SYMBOLS +#define _umem_cache_alloc(a,b) umem_cache_alloc(a,b) +#define _umem_cache_free(a,b) umem_cache_free(a,b) +#define _umem_zalloc(a,b) umem_zalloc(a,b) +#define _umem_alloc(a,b) umem_alloc(a,b) +#define _umem_alloc_align(a,b,c) umem_alloc_align(a,b,c) +#define _umem_free(a,b) umem_free(a,b) +#define _umem_free_align(a,b) umem_free_align(a,b) +#endif + +#ifdef _WIN32 +#define bcopy(s, d, n) memcpy(d, s, n) +#define bzero(m, s) memset(m, 0, s) +#endif + +typedef pthread_t thread_t; +typedef pthread_mutex_t mutex_t; +typedef pthread_cond_t cond_t; +typedef u_int64_t hrtime_t; +typedef uint32_t uint_t; +typedef unsigned long ulong_t; +typedef struct timespec timestruc_t; +typedef long long longlong_t; +typedef struct timespec timespec_t; +static INLINE hrtime_t gethrtime(void) { + struct timeval tv; + gettimeofday(&tv, NULL); + return (((u_int64_t)tv.tv_sec) << 32) | tv.tv_usec; +} +# define thr_self() pthread_self() +static INLINE thread_t _thr_self(void) { + return thr_self(); +} +#if defined(_MACH_PORT_T) +#define CPUHINT() (pthread_mach_thread_np(pthread_self())) +#endif +# define thr_sigsetmask pthread_sigmask + +#define THR_BOUND 1 +#define THR_DETACHED 2 +#define THR_DAEMON 4 + +static INLINE int thr_create(void *stack_base, + size_t stack_size, THR_RETURN (THR_API *start_func)(void*), + void *arg, long flags, thread_t *new_thread_ID) +{ + int ret; + pthread_attr_t attr; + + pthread_attr_init(&attr); + + if (flags & THR_DETACHED) { + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + } + ret = pthread_create(new_thread_ID, &attr, start_func, arg); + pthread_attr_destroy(&attr); + return ret; +} + + +# define mutex_init(mp, type, arg) pthread_mutex_init(mp, NULL) +# define mutex_lock(mp) pthread_mutex_lock(mp) +# define mutex_unlock(mp) pthread_mutex_unlock(mp) +# define mutex_destroy(mp) pthread_mutex_destroy(mp) +# define mutex_trylock(mp) pthread_mutex_trylock(mp) +# define DEFAULTMUTEX PTHREAD_MUTEX_INITIALIZER +# define DEFAULTCV PTHREAD_COND_INITIALIZER +# define MUTEX_HELD(mp) 1 /* not really, but only used in an assert */ + +# define cond_init(c, type, arg) pthread_cond_init(c, NULL) +# define cond_wait(c, m) pthread_cond_wait(c, m) +# define _cond_wait(c, m) pthread_cond_wait(c, m) +# define cond_signal(c) pthread_cond_signal(c) +# define cond_broadcast(c) pthread_cond_broadcast(c) +# define cond_destroy(c) pthread_cond_destroy(c) +# define cond_timedwait pthread_cond_timedwait +# define _cond_timedwait pthread_cond_timedwait + +#ifndef RTLD_FIRST +# define RTLD_FIRST 0 +#endif + +#ifdef ECELERITY +# include "ec_atomic.h" +#else +# ifdef _WIN32 +# define ec_atomic_inc(a) InterlockedIncrement(a) +# define ec_atomic_inc64(a) InterlockedIncrement64(a) +# elif (defined(__i386__) || defined(__x86_64__)) && defined(__GNUC__) +static INLINE uint_t ec_atomic_cas(uint_t *mem, uint_t with, uint_t cmp) +{ + uint_t prev; + __asm volatile ("lock; cmpxchgl %1, %2" + : "=a" (prev) + : "r" (with), "m" (*(mem)), "0" (cmp) + : "memory"); + return prev; +} +# elif defined(__sparc__) && defined(__GNUC__) +static INLINE uint_t ec_atomic_cas(uint_t *mem, 
uint_t with, uint_t cmp) +{ + __asm volatile ("cas [%3],%2,%0" + : "+r"(with), "=m"(*(mem)) + : "r"(cmp), "r"(mem), "m"(*(mem)) ); + return with; +} +# endif + +# ifndef ec_atomic_inc +static INLINE uint_t ec_atomic_inc(uint_t *mem) +{ + register uint_t last; + do { + last = *mem; + } while (ec_atomic_cas(mem, last+1, last) != last); + return ++last; +} +# endif +# ifndef ec_atomic_inc64 + /* yeah, it's not great. It's only used to bump failed allocation + * counts, so it is not critical right now. */ +# define ec_atomic_inc64(a) (*a)++ +# endif + +#endif + +#define P2PHASE(x, align) ((x) & ((align) - 1)) +#define P2ALIGN(x, align) ((x) & -(align)) +#define P2NPHASE(x, align) (-(x) & ((align) - 1)) +#define P2ROUNDUP(x, align) (-(-(x) & -(align))) +#define P2END(x, align) (-(~(x) & -(align))) +#define P2PHASEUP(x, align, phase) ((phase) - (((phase) - (x)) & -(align))) +#define P2CROSS(x, y, align) (((x) ^ (y)) > (align) - 1) +#define P2SAMEHIGHBIT(x, y) (((x) ^ (y)) < ((x) & (y))) +#define IS_P2ALIGNED(v, a) ((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0) +#define ISP2(x) (((x) & ((x) - 1)) == 0) + +/* beware! umem only uses these atomic adds for incrementing by 1 */ +#define atomic_add_64(lvalptr, delta) ec_atomic_inc64(lvalptr) +#define atomic_add_32_nv(a, b) ec_atomic_inc(a) + +#ifndef NANOSEC +#define NANOSEC 1000000000 +#endif + +#ifdef _WIN32 +#define issetugid() 0 +#elif !defined(__FreeBSD__) +#define issetugid() (geteuid() == 0) +#endif + +#define _sysconf(a) sysconf(a) +#define __NORETURN __attribute__ ((noreturn)) + +#define EC_UMEM_DUMMY_PCSTACK 1 +static INLINE int __nthreads(void) +{ + /* or more; just to force multi-threaded mode */ + return 2; +} + +#if (SIZEOF_VOID_P == 8) +# define _LP64 1 +#endif + +#ifndef MIN +# define MIN(a,b) ((a) < (b) ? (a) : (b)) +#endif +#ifndef MAX +# define MAX(a,b) ((a) > (b) ? (a) : (b)) +#endif + + +#endif diff --git a/zfs/lib/libumem/sys/Makefile.in b/zfs/lib/libumem/sys/Makefile.in new file mode 100644 index 000000000..eed2c3118 --- /dev/null +++ b/zfs/lib/libumem/sys/Makefile.in @@ -0,0 +1 @@ +DISTFILES = vmem.h vmem_impl_user.h diff --git a/zfs/lib/libumem/sys/vmem.h b/zfs/lib/libumem/sys/vmem.h new file mode 100644 index 000000000..fed44c3d4 --- /dev/null +++ b/zfs/lib/libumem/sys/vmem.h @@ -0,0 +1,142 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
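The P2* macros above do power-of-two alignment arithmetic: P2ALIGN rounds down, P2ROUNDUP rounds up, P2PHASE returns the offset within an alignment boundary, and ISP2 tests for a power of two; all of them assume the alignment itself is a power of two. A small standalone check of those identities (macro copies only, illustrative values):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define	P2PHASE(x, align)	((x) & ((align) - 1))
#define	P2ALIGN(x, align)	((x) & -(align))
#define	P2ROUNDUP(x, align)	(-(-(x) & -(align)))
#define	ISP2(x)			(((x) & ((x) - 1)) == 0)

int
main(void)
{
	uintptr_t x = 0x1234;

	assert(ISP2(64) && !ISP2(96));
	assert(P2ALIGN(x, 64) == 0x1200);
	assert(P2PHASE(x, 64) == 0x34);
	assert(P2ROUNDUP(x, 64) == 0x1240);
	assert(P2ROUNDUP(0x1200, 64) == 0x1200);	/* already aligned */
	(void) printf("0x%lx -> down 0x%lx, up 0x%lx\n",
	    (unsigned long)x, (unsigned long)P2ALIGN(x, 64),
	    (unsigned long)P2ROUNDUP(x, 64));
	return (0);
}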
+ */ + +#ifndef _SYS_VMEM_H +#define _SYS_VMEM_H + +/* #pragma ident "@(#)vmem.h 1.13 05/06/08 SMI" */ + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + + +/* + * Per-allocation flags + */ +#define VM_SLEEP 0x00000000 /* same as KM_SLEEP */ +#define VM_NOSLEEP 0x00000001 /* same as KM_NOSLEEP */ +#define VM_PANIC 0x00000002 /* same as KM_PANIC */ +#define VM_PUSHPAGE 0x00000004 /* same as KM_PUSHPAGE */ +#define VM_KMFLAGS 0x000000ff /* flags that must match KM_* flags */ + +#define VM_BESTFIT 0x00000100 +#define VM_FIRSTFIT 0x00000200 +#define VM_NEXTFIT 0x00000400 + +/* + * The following flags are restricted for use only within the kernel. + * VM_MEMLOAD is for use by the HAT to avoid infinite recursion. + * VM_NORELOC is used by the kernel when static VA->PA mappings are required. + */ +#define VM_MEMLOAD 0x00000800 +#define VM_NORELOC 0x00001000 +/* + * VM_ABORT requests that vmem_alloc() *ignore* the VM_SLEEP/VM_NOSLEEP flags + * and forgo reaping if the allocation or attempted import, fails. This + * flag is a segkmem-specific flag, and should not be used by anyone else. + */ +#define VM_ABORT 0x00002000 + +#define VM_FLAGS 0x0000FFFF + +/* + * Arena creation flags + */ +#define VMC_POPULATOR 0x00010000 +#define VMC_NO_QCACHE 0x00020000 /* cannot use quantum caches */ +#define VMC_IDENTIFIER 0x00040000 /* not backed by memory */ +/* + * internal use only; the import function uses the vmem_ximport_t interface + * and may increase the request size if it so desires + */ +#define VMC_XALLOC 0x00080000 +#define VMC_FLAGS 0xFFFF0000 + +/* + * Public segment types + */ +#define VMEM_ALLOC 0x01 +#define VMEM_FREE 0x02 + +/* + * Implementation-private segment types + */ +#define VMEM_SPAN 0x10 +#define VMEM_ROTOR 0x20 +#define VMEM_WALKER 0x40 + +/* + * VMEM_REENTRANT indicates to vmem_walk() that the callback routine may + * call back into the arena being walked, so vmem_walk() must drop the + * arena lock before each callback. The caveat is that since the arena + * isn't locked, its state can change. Therefore it is up to the callback + * routine to handle cases where the segment isn't of the expected type. + * For example, we use this to walk heap_arena when generating a crash dump; + * see segkmem_dump() for sample usage. + */ +#define VMEM_REENTRANT 0x80000000 + +typedef struct vmem vmem_t; +typedef void *(vmem_alloc_t)(vmem_t *, size_t, int); +typedef void (vmem_free_t)(vmem_t *, void *, size_t); + +/* + * Alternate import style; the requested size is passed in a pointer, + * which can be increased by the import function if desired. 
+ */ +typedef void *(vmem_ximport_t)(vmem_t *, size_t *, int); + +#if 0 +extern vmem_t *vmem_init(const char *, void *, size_t, size_t, + vmem_alloc_t *, vmem_free_t *); +extern void vmem_update(void *); +extern int vmem_is_populator(); +extern size_t vmem_seg_size; +#endif + +extern vmem_t *vmem_create(const char *, void *, size_t, size_t, + vmem_alloc_t *, vmem_free_t *, vmem_t *, size_t, int); +extern vmem_t *vmem_xcreate(const char *, void *, size_t, size_t, + vmem_ximport_t *, vmem_free_t *, vmem_t *, size_t, int); +extern void vmem_destroy(vmem_t *); +extern void *vmem_alloc(vmem_t *, size_t, int); +extern void *vmem_xalloc(vmem_t *, size_t, size_t, size_t, size_t, + void *, void *, int); +extern void vmem_free(vmem_t *, void *, size_t); +extern void vmem_xfree(vmem_t *, void *, size_t); +extern void *vmem_add(vmem_t *, void *, size_t, int); +extern int vmem_contains(vmem_t *, void *, size_t); +extern void vmem_walk(vmem_t *, int, void (*)(void *, void *, size_t), void *); +extern size_t vmem_size(vmem_t *, int); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VMEM_H */ diff --git a/zfs/lib/libumem/sys/vmem_impl_user.h b/zfs/lib/libumem/sys/vmem_impl_user.h new file mode 100644 index 000000000..c7dd5cc46 --- /dev/null +++ b/zfs/lib/libumem/sys/vmem_impl_user.h @@ -0,0 +1,165 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 1999-2002 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Portions Copyright 2006 OmniTI, Inc. + */ + +#ifndef _SYS_VMEM_IMPL_USER_H +#define _SYS_VMEM_IMPL_USER_H + +/* #pragma ident "@(#)vmem_impl_user.h 1.2 05/06/08 SMI" */ + +#if HAVE_SYS_KSTAT +#include <sys/kstat.h> +#endif +#ifndef _WIN32 +#include <sys/time.h> +#endif +#include <sys/vmem.h> +#if HAVE_THREAD_H +#include <thread.h> +#else +# include "sol_compat.h" +#endif +#if HAVE_SYNC_H +#include <synch.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct vmem_seg vmem_seg_t; + +#define VMEM_STACK_DEPTH 20 + +struct vmem_seg { + /* + * The first four fields must match vmem_freelist_t exactly. + */ + uintptr_t vs_start; /* start of segment (inclusive) */ + uintptr_t vs_end; /* end of segment (exclusive) */ + vmem_seg_t *vs_knext; /* next of kin (alloc, free, span) */ + vmem_seg_t *vs_kprev; /* prev of kin */ + + vmem_seg_t *vs_anext; /* next in arena */ + vmem_seg_t *vs_aprev; /* prev in arena */ + uint8_t vs_type; /* alloc, free, span */ + uint8_t vs_import; /* non-zero if segment was imported */ + uint8_t vs_depth; /* stack depth if UMF_AUDIT active */ + /* + * The following fields are present only when UMF_AUDIT is set. 
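The vmem_create()/vmem_alloc()/vmem_free() declarations above can also manage resources that are not memory at all, e.g. a numeric ID space, by describing the ID range as the arena's initial span and marking the arena VMC_IDENTIFIER. A speculative sketch of that usage (whether a given libumem build exports these vmem_* entry points, and accepts VMC_* flags OR'ed into vmem_create()'s final argument, is an assumption here):

#include <stdint.h>
#include <stdio.h>
#include <sys/vmem.h>	/* the header above */

/*
 * Sketch: a vmem arena as an ID allocator over [1, 1000].  VMC_IDENTIFIER
 * says the arena is not backed by memory; the "addresses" are just numbers.
 */
int
main(void)
{
	vmem_t *ids;
	void *id;

	ids = vmem_create("id_arena", (void *)1, 1000, 1,
	    NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
	if (ids == NULL)
		return (1);

	id = vmem_alloc(ids, 1, VM_BESTFIT | VM_NOSLEEP);
	(void) printf("got id %lu\n", (unsigned long)(uintptr_t)id);

	vmem_free(ids, id, 1);
	vmem_destroy(ids);
	return (0);
}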
+ */ + thread_t vs_thread; + hrtime_t vs_timestamp; + uintptr_t vs_stack[VMEM_STACK_DEPTH]; +}; + +typedef struct vmem_freelist { + uintptr_t vs_start; /* always zero */ + uintptr_t vs_end; /* segment size */ + vmem_seg_t *vs_knext; /* next of kin */ + vmem_seg_t *vs_kprev; /* prev of kin */ +} vmem_freelist_t; + +#define VS_SIZE(vsp) ((vsp)->vs_end - (vsp)->vs_start) + +/* + * Segment hashing + */ +#define VMEM_HASH_INDEX(a, s, q, m) \ + ((((a) + ((a) >> (s)) + ((a) >> ((s) << 1))) >> (q)) & (m)) + +#define VMEM_HASH(vmp, addr) \ + (&(vmp)->vm_hash_table[VMEM_HASH_INDEX(addr, \ + (vmp)->vm_hash_shift, (vmp)->vm_qshift, (vmp)->vm_hash_mask)]) + +#define VMEM_NAMELEN 30 +#define VMEM_HASH_INITIAL 16 +#define VMEM_NQCACHE_MAX 16 +#define VMEM_FREELISTS (sizeof (void *) * 8) + +typedef struct vmem_kstat { + uint64_t vk_mem_inuse; /* memory in use */ + uint64_t vk_mem_import; /* memory imported */ + uint64_t vk_mem_total; /* total memory in arena */ + uint32_t vk_source_id; /* vmem id of vmem source */ + uint64_t vk_alloc; /* number of allocations */ + uint64_t vk_free; /* number of frees */ + uint64_t vk_wait; /* number of allocations that waited */ + uint64_t vk_fail; /* number of allocations that failed */ + uint64_t vk_lookup; /* hash lookup count */ + uint64_t vk_search; /* freelist search count */ + uint64_t vk_populate_wait; /* populates that waited */ + uint64_t vk_populate_fail; /* populates that failed */ + uint64_t vk_contains; /* vmem_contains() calls */ + uint64_t vk_contains_search; /* vmem_contains() search cnt */ +} vmem_kstat_t; + +struct vmem { + char vm_name[VMEM_NAMELEN]; /* arena name */ + cond_t vm_cv; /* cv for blocking allocations */ + mutex_t vm_lock; /* arena lock */ + uint32_t vm_id; /* vmem id */ + uint32_t vm_mtbf; /* induced alloc failure rate */ + int vm_cflags; /* arena creation flags */ + int vm_qshift; /* log2(vm_quantum) */ + size_t vm_quantum; /* vmem quantum */ + size_t vm_qcache_max; /* maximum size to front by umem */ + vmem_alloc_t *vm_source_alloc; + vmem_free_t *vm_source_free; + vmem_t *vm_source; /* vmem source for imported memory */ + vmem_t *vm_next; /* next in vmem_list */ + ssize_t vm_nsegfree; /* number of free vmem_seg_t's */ + vmem_seg_t *vm_segfree; /* free vmem_seg_t list */ + vmem_seg_t **vm_hash_table; /* allocated-segment hash table */ + size_t vm_hash_mask; /* hash_size - 1 */ + size_t vm_hash_shift; /* log2(vm_hash_mask + 1) */ + ulong_t vm_freemap; /* bitmap of non-empty freelists */ + vmem_seg_t vm_seg0; /* anchor segment */ + vmem_seg_t vm_rotor; /* rotor for VM_NEXTFIT allocations */ + vmem_seg_t *vm_hash0[VMEM_HASH_INITIAL]; /* initial hash table */ + void *vm_qcache[VMEM_NQCACHE_MAX]; /* quantum caches */ + vmem_freelist_t vm_freelist[VMEM_FREELISTS + 1]; /* power-of-2 flists */ + vmem_kstat_t vm_kstat; /* kstat data */ +}; + +/* + * We cannot use a mutex_t and MUTEX_HELD, since that will not work + * when libthread is not linked. + */ +typedef struct vmem_populate_lock { + mutex_t vmpl_mutex; + thread_t vmpl_thr; +} vmem_populate_lock_t; + +#define VM_UMFLAGS VM_KMFLAGS + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VMEM_IMPL_USER_H */ diff --git a/zfs/lib/libumem/umem.c b/zfs/lib/libumem/umem.c new file mode 100644 index 000000000..635c19e1a --- /dev/null +++ b/zfs/lib/libumem/umem.c @@ -0,0 +1,3208 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). 
You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Portions Copyright 2006 OmniTI, Inc. + */ + +/* #pragma ident "@(#)umem.c 1.11 05/06/08 SMI" */ + +/*! + * \mainpage Main Page + * + * \section README + * + * \include README + * + * \section Nuances + * + * There is a nuance in the behaviour of the umem port compared + * with umem on Solaris. + * + * On Linux umem will not return memory back to the OS until umem fails + * to allocate a chunk. On failure, umem_reap() will be called automatically, + * to return memory to the OS. If your code is going to be running + * for a long time on Linux and mixes calls to different memory allocators + * (e.g.: malloc()) and umem, your code will need to call + * umem_reap() periodically. + * + * This doesn't happen on Solaris, because malloc is replaced + * with umem calls, meaning that umem_reap() is called automatically. + * + * \section References + * + * http://docs.sun.com/app/docs/doc/816-5173/6mbb8advq?a=view + * + * http://access1.sun.com/techarticles/libumem.html + * + * \section Overview + * + * \code + * based on usr/src/uts/common/os/kmem.c r1.64 from 2001/12/18 + * + * The slab allocator, as described in the following two papers: + * + * Jeff Bonwick, + * The Slab Allocator: An Object-Caching Kernel Memory Allocator. + * Proceedings of the Summer 1994 Usenix Conference. + * Available as /shared/sac/PSARC/1994/028/materials/kmem.pdf. + * + * Jeff Bonwick and Jonathan Adams, + * Magazines and vmem: Extending the Slab Allocator to Many CPUs and + * Arbitrary Resources. + * Proceedings of the 2001 Usenix Conference. + * Available as /shared/sac/PSARC/2000/550/materials/vmem.pdf. + * + * 1. Overview + * ----------- + * umem is very close to kmem in implementation. There are four major + * areas of divergence: + * + * * Initialization + * + * * CPU handling + * + * * umem_update() + * + * * KM_SLEEP v.s. UMEM_NOFAIL + * + * + * 2. Initialization + * ----------------- + * kmem is initialized early on in boot, and knows that no one will call + * into it before it is ready. umem does not have these luxuries. Instead, + * initialization is divided into two phases: + * + * * library initialization, and + * + * * first use + * + * umem's full initialization happens at the time of the first allocation + * request (via malloc() and friends, umem_alloc(), or umem_zalloc()), + * or the first call to umem_cache_create(). + * + * umem_free(), and umem_cache_alloc() do not require special handling, + * since the only way to get valid arguments for them is to successfully + * call a function from the first group. + * + * 2.1. Library Initialization: umem_startup() + * ------------------------------------------- + * umem_startup() is libumem.so's .init section. 
It calls pthread_atfork() + * to install the handlers necessary for umem's Fork1-Safety. Because of + * race condition issues, all other pre-umem_init() initialization is done + * statically (i.e. by the dynamic linker). + * + * For standalone use, umem_startup() returns everything to its initial + * state. + * + * 2.2. First use: umem_init() + * ------------------------------ + * The first time any memory allocation function is used, we have to + * create the backing caches and vmem arenas which are needed for it. + * umem_init() is the central point for that task. When it completes, + * umem_ready is either UMEM_READY (all set) or UMEM_READY_INIT_FAILED (unable + * to initialize, probably due to lack of memory). + * + * There are four different paths from which umem_init() is called: + * + * * from umem_alloc() or umem_zalloc(), with 0 < size < UMEM_MAXBUF, + * + * * from umem_alloc() or umem_zalloc(), with size > UMEM_MAXBUF, + * + * * from umem_cache_create(), and + * + * * from memalign(), with align > UMEM_ALIGN. + * + * The last three just check if umem is initialized, and call umem_init() + * if it is not. For performance reasons, the first case is more complicated. + * + * 2.2.1. umem_alloc()/umem_zalloc(), with 0 < size < UMEM_MAXBUF + * ----------------------------------------------------------------- + * In this case, umem_cache_alloc(&umem_null_cache, ...) is called. + * There is special case code in which causes any allocation on + * &umem_null_cache to fail by returning (NULL), regardless of the + * flags argument. + * + * So umem_cache_alloc() returns NULL, and umem_alloc()/umem_zalloc() call + * umem_alloc_retry(). umem_alloc_retry() sees that the allocation + * was agains &umem_null_cache, and calls umem_init(). + * + * If initialization is successful, umem_alloc_retry() returns 1, which + * causes umem_alloc()/umem_zalloc() to start over, which causes it to load + * the (now valid) cache pointer from umem_alloc_table. + * + * 2.2.2. Dealing with race conditions + * ----------------------------------- + * There are a couple race conditions resulting from the initialization + * code that we have to guard against: + * + * * In umem_cache_create(), there is a special UMC_INTERNAL cflag + * that is passed for caches created during initialization. It + * is illegal for a user to try to create a UMC_INTERNAL cache. + * This allows initialization to proceed, but any other + * umem_cache_create()s will block by calling umem_init(). + * + * * Since umem_null_cache has a 1-element cache_cpu, it's cache_cpu_mask + * is always zero. umem_cache_alloc uses cp->cache_cpu_mask to + * mask the cpu number. This prevents a race between grabbing a + * cache pointer out of umem_alloc_table and growing the cpu array. + * + * + * 3. CPU handling + * --------------- + * kmem uses the CPU's sequence number to determine which "cpu cache" to + * use for an allocation. Currently, there is no way to get the sequence + * number in userspace. + * + * umem keeps track of cpu information in umem_cpus, an array of umem_max_ncpus + * umem_cpu_t structures. CURCPU() is a a "hint" function, which we then mask + * with either umem_cpu_mask or cp->cache_cpu_mask to find the actual "cpu" id. + * The mechanics of this is all in the CPU(mask) macro. + * + * Currently, umem uses _lwp_self() as its hint. + * + * + * 4. The update thread + * -------------------- + * kmem uses a task queue, kmem_taskq, to do periodic maintenance on + * every kmem cache. vmem has a periodic timeout for hash table resizing. 
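The CPU(mask) scheme described above only needs a cheap, stable per-thread hint; masking it with a power-of-two-minus-one mask spreads threads over the per-CPU caches even when the hint is not a real CPU number. A minimal standalone sketch of that slot selection (generic names; casting pthread_t to an integer mirrors this port's use of the thread id as the hint and is a Linux assumption):

#include <pthread.h>
#include <stdio.h>

#define	NCACHE		8		/* per-"cpu" slots; power of two */
#define	SLOT_MASK	(NCACHE - 1)

static unsigned long counters[NCACHE];

/*
 * Pick a slot from a thread-id hint the way CPU(mask) does: the hint only
 * has to spread threads out, and masking works because the slot count is
 * a power of two.
 */
static unsigned int
my_slot(void)
{
	return ((unsigned int)(unsigned long)pthread_self() & SLOT_MASK);
}

int
main(void)
{
	unsigned int s = my_slot();

	counters[s]++;
	(void) printf("thread hashed to slot %u\n", s);
	return (0);
}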
+ * The kmem_taskq also provides a separate context for kmem_cache_reap()'s + * to be done in, avoiding issues of the context of kmem_reap() callers. + * + * Instead, umem has the concept of "updates", which are asynchronous requests + * for work attached to single caches. All caches with pending work are + * on a doubly linked list rooted at the umem_null_cache. All update state + * is protected by the umem_update_lock mutex, and the umem_update_cv is used + * for notification between threads. + * + * 4.1. Cache states with regards to updates + * ----------------------------------------- + * A given cache is in one of three states: + * + * Inactive cache_uflags is zero, cache_u{next,prev} are NULL + * + * Work Requested cache_uflags is non-zero (but UMU_ACTIVE is not set), + * cache_u{next,prev} link the cache onto the global + * update list + * + * Active cache_uflags has UMU_ACTIVE set, cache_u{next,prev} + * are NULL, and either umem_update_thr or + * umem_st_update_thr are actively doing work on the + * cache. + * + * An update can be added to any cache in any state -- if the cache is + * Inactive, it transitions to being Work Requested. If the cache is + * Active, the worker will notice the new update and act on it before + * transitioning the cache to the Inactive state. + * + * If a cache is in the Active state, UMU_NOTIFY can be set, which asks + * the worker to broadcast the umem_update_cv when it has finished. + * + * 4.2. Update interface + * --------------------- + * umem_add_update() adds an update to a particular cache. + * umem_updateall() adds an update to all caches. + * umem_remove_updates() returns a cache to the Inactive state. + * + * umem_process_updates() process all caches in the Work Requested state. + * + * 4.3. Reaping + * ------------ + * When umem_reap() is called (at the time of heap growth), it schedule + * UMU_REAP updates on every cache. It then checks to see if the update + * thread exists (umem_update_thr != 0). If it is, it broadcasts + * the umem_update_cv to wake the update thread up, and returns. + * + * If the update thread does not exist (umem_update_thr == 0), and the + * program currently has multiple threads, umem_reap() attempts to create + * a new update thread. + * + * If the process is not multithreaded, or the creation fails, umem_reap() + * calls umem_st_update() to do an inline update. + * + * 4.4. The update thread + * ---------------------- + * The update thread spends most of its time in cond_timedwait() on the + * umem_update_cv. It wakes up under two conditions: + * + * * The timedwait times out, in which case it needs to run a global + * update, or + * + * * someone cond_broadcast(3THR)s the umem_update_cv, in which case + * it needs to check if there are any caches in the Work Requested + * state. + * + * When it is time for another global update, umem calls umem_cache_update() + * on every cache, then calls vmem_update(), which tunes the vmem structures. + * umem_cache_update() can request further work using umem_add_update(). + * + * After any work from the global update completes, the update timer is + * reset to umem_reap_interval seconds in the future. This makes the + * updates self-throttling. + * + * Reaps are similarly self-throttling. After a UMU_REAP update has + * been scheduled on all caches, umem_reap() sets a flag and wakes up the + * update thread. The update thread notices the flag, and resets the + * reap state. + * + * 4.5. 
Inline updates + * ------------------- + * If the update thread is not running, umem_st_update() is used instead. It + * immediately does a global update (as above), then calls + * umem_process_updates() to process both the reaps that umem_reap() added and + * any work generated by the global update. Afterwards, it resets the reap + * state. + * + * While the umem_st_update() is running, umem_st_update_thr holds the thread + * id of the thread performing the update. + * + * 4.6. Updates and fork1() + * ------------------------ + * umem has fork1() pre- and post-handlers which lock up (and release) every + * mutex in every cache. They also lock up the umem_update_lock. Since + * fork1() only copies over a single lwp, other threads (including the update + * thread) could have been actively using a cache in the parent. This + * can lead to inconsistencies in the child process. + * + * Because we locked all of the mutexes, the only possible inconsistancies are: + * + * * a umem_cache_alloc() could leak its buffer. + * + * * a caller of umem_depot_alloc() could leak a magazine, and all the + * buffers contained in it. + * + * * a cache could be in the Active update state. In the child, there + * would be no thread actually working on it. + * + * * a umem_hash_rescale() could leak the new hash table. + * + * * a umem_magazine_resize() could be in progress. + * + * * a umem_reap() could be in progress. + * + * The memory leaks we can't do anything about. umem_release_child() resets + * the update state, moves any caches in the Active state to the Work Requested + * state. This might cause some updates to be re-run, but UMU_REAP and + * UMU_HASH_RESCALE are effectively idempotent, and the worst that can + * happen from umem_magazine_resize() is resizing the magazine twice in close + * succession. + * + * Much of the cleanup in umem_release_child() is skipped if + * umem_st_update_thr == thr_self(). This is so that applications which call + * fork1() from a cache callback does not break. Needless to say, any such + * application is tremendously broken. + * + * + * 5. KM_SLEEP v.s. UMEM_NOFAIL + * ---------------------------- + * Allocations against kmem and vmem have two basic modes: SLEEP and + * NOSLEEP. A sleeping allocation is will go to sleep (waiting for + * more memory) instead of failing (returning NULL). + * + * SLEEP allocations presume an extremely multithreaded model, with + * a lot of allocation and deallocation activity. umem cannot presume + * that its clients have any particular type of behavior. Instead, + * it provides two types of allocations: + * + * * UMEM_DEFAULT, equivalent to KM_NOSLEEP (i.e. return NULL on + * failure) + * + * * UMEM_NOFAIL, which, on failure, calls an optional callback + * (registered with umem_nofail_callback()). + * + * The callback is invoked with no locks held, and can do an arbitrary + * amount of work. It then has a choice between: + * + * * Returning UMEM_CALLBACK_RETRY, which will cause the allocation + * to be restarted. + * + * * Returning UMEM_CALLBACK_EXIT(status), which will cause exit(2) + * to be invoked with status. If multiple threads attempt to do + * this simultaneously, only one will call exit(2). + * + * * Doing some kind of non-local exit (thr_exit(3thr), longjmp(3C), + * etc.) + * + * The default callback returns UMEM_CALLBACK_EXIT(255). 
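The UMEM_NOFAIL contract above is driven from application code through umem_nofail_callback(). A hedged usage sketch, registering a callback that reaps once and then exits with a distinctive status (public umem.h interfaces; link against libumem; treating umem_reap() as safe to call from the callback is an assumption based on the "arbitrary amount of work" note above):

#include <stdio.h>
#include <umem.h>

/*
 * Nofail policy: try to release cached memory once, then give up with a
 * distinctive exit status instead of letting callers ever see NULL.
 */
static int
out_of_memory(void)
{
	static int retried;

	if (!retried) {
		retried = 1;
		umem_reap();			/* hand cached memory back */
		return (UMEM_CALLBACK_RETRY);
	}
	(void) fprintf(stderr, "out of memory\n");
	return (UMEM_CALLBACK_EXIT(42));
}

int
main(void)
{
	void *p;

	umem_nofail_callback(out_of_memory);
	p = umem_alloc(128, UMEM_NOFAIL);	/* never returns NULL */
	umem_free(p, 128);
	return (0);
}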
+ * + * To have these callbacks without risk of state corruption (in the case of + * a non-local exit), we have to ensure that the callbacks get invoked + * close to the original allocation, with no inconsistent state or held + * locks. The following steps are taken: + * + * * All invocations of vmem are VM_NOSLEEP. + * + * * All constructor callbacks (which can themselves to allocations) + * are passed UMEM_DEFAULT as their required allocation argument. This + * way, the constructor will fail, allowing the highest-level allocation + * invoke the nofail callback. + * + * If a constructor callback _does_ do a UMEM_NOFAIL allocation, and + * the nofail callback does a non-local exit, we will leak the + * partially-constructed buffer. + * \endcode + */ + +#include "config.h" +/* #include "mtlib.h" */ +#include <umem_impl.h> +#include <sys/vmem_impl_user.h> +#include "umem_base.h" +#include "vmem_base.h" + +#if HAVE_SYS_PROCESSOR_H +#include <sys/processor.h> +#endif +#if HAVE_SYS_SYSMACROS_H +#include <sys/sysmacros.h> +#endif + +#if HAVE_ALLOCA_H +#include <alloca.h> +#endif +#include <errno.h> +#include <limits.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#if HAVE_STRINGS_H +#include <strings.h> +#endif +#include <signal.h> +#if HAVE_UNISTD_H +#include <unistd.h> +#endif +#if HAVE_ATOMIC_H +#include <atomic.h> +#endif + +#include "misc.h" + +#define UMEM_VMFLAGS(umflag) (VM_NOSLEEP) + +size_t pagesize; + +/* + * The default set of caches to back umem_alloc(). + * These sizes should be reevaluated periodically. + * + * We want allocations that are multiples of the coherency granularity + * (64 bytes) to be satisfied from a cache which is a multiple of 64 + * bytes, so that it will be 64-byte aligned. For all multiples of 64, + * the next kmem_cache_size greater than or equal to it must be a + * multiple of 64. + */ +static const int umem_alloc_sizes[] = { +#ifdef _LP64 + 1 * 8, + 1 * 16, + 2 * 16, + 3 * 16, +#else + 1 * 8, + 2 * 8, + 3 * 8, + 4 * 8, 5 * 8, 6 * 8, 7 * 8, +#endif + 4 * 16, 5 * 16, 6 * 16, 7 * 16, + 4 * 32, 5 * 32, 6 * 32, 7 * 32, + 4 * 64, 5 * 64, 6 * 64, 7 * 64, + 4 * 128, 5 * 128, 6 * 128, 7 * 128, + P2ALIGN(8192 / 7, 64), + P2ALIGN(8192 / 6, 64), + P2ALIGN(8192 / 5, 64), + P2ALIGN(8192 / 4, 64), + P2ALIGN(8192 / 3, 64), + P2ALIGN(8192 / 2, 64), + P2ALIGN(8192 / 1, 64), + 4096 * 3, + 8192 * 2, +}; +#define NUM_ALLOC_SIZES (sizeof (umem_alloc_sizes) / sizeof (*umem_alloc_sizes)) + +#define UMEM_MAXBUF 16384 + +static umem_magtype_t umem_magtype[] = { + { 1, 8, 3200, 65536 }, + { 3, 16, 256, 32768 }, + { 7, 32, 64, 16384 }, + { 15, 64, 0, 8192 }, + { 31, 64, 0, 4096 }, + { 47, 64, 0, 2048 }, + { 63, 64, 0, 1024 }, + { 95, 64, 0, 512 }, + { 143, 64, 0, 0 }, +}; + +/* + * umem tunables + */ +uint32_t umem_max_ncpus; /* # of CPU caches. 
*/ + +uint32_t umem_stack_depth = 15; /* # stack frames in a bufctl_audit */ +uint32_t umem_reap_interval = 10; /* max reaping rate (seconds) */ +uint_t umem_depot_contention = 2; /* max failed trylocks per real interval */ +uint_t umem_abort = 1; /* whether to abort on error */ +uint_t umem_output = 0; /* whether to write to standard error */ +uint_t umem_logging = 0; /* umem_log_enter() override */ +uint32_t umem_mtbf = 0; /* mean time between failures [default: off] */ +size_t umem_transaction_log_size; /* size of transaction log */ +size_t umem_content_log_size; /* size of content log */ +size_t umem_failure_log_size; /* failure log [4 pages per CPU] */ +size_t umem_slab_log_size; /* slab create log [4 pages per CPU] */ +size_t umem_content_maxsave = 256; /* UMF_CONTENTS max bytes to log */ +size_t umem_lite_minsize = 0; /* minimum buffer size for UMF_LITE */ +size_t umem_lite_maxalign = 1024; /* maximum buffer alignment for UMF_LITE */ +size_t umem_maxverify; /* maximum bytes to inspect in debug routines */ +size_t umem_minfirewall; /* hardware-enforced redzone threshold */ + +uint_t umem_flags = 0; + +mutex_t umem_init_lock = DEFAULTMUTEX; /* locks initialization */ +cond_t umem_init_cv = DEFAULTCV; /* initialization CV */ +thread_t umem_init_thr; /* thread initializing */ +int umem_init_env_ready; /* environ pre-initted */ +int umem_ready = UMEM_READY_STARTUP; + +static umem_nofail_callback_t *nofail_callback; +static mutex_t umem_nofail_exit_lock = DEFAULTMUTEX; +static thread_t umem_nofail_exit_thr; + +static umem_cache_t *umem_slab_cache; +static umem_cache_t *umem_bufctl_cache; +static umem_cache_t *umem_bufctl_audit_cache; + +mutex_t umem_flags_lock = DEFAULTMUTEX; + +static vmem_t *heap_arena; +static vmem_alloc_t *heap_alloc; +static vmem_free_t *heap_free; + +static vmem_t *umem_internal_arena; +static vmem_t *umem_cache_arena; +static vmem_t *umem_hash_arena; +static vmem_t *umem_log_arena; +static vmem_t *umem_oversize_arena; +static vmem_t *umem_va_arena; +static vmem_t *umem_default_arena; +static vmem_t *umem_firewall_va_arena; +static vmem_t *umem_firewall_arena; + +vmem_t *umem_memalign_arena; + +umem_log_header_t *umem_transaction_log; +umem_log_header_t *umem_content_log; +umem_log_header_t *umem_failure_log; +umem_log_header_t *umem_slab_log; + +extern thread_t _thr_self(void); +#if defined(__MACH__) || defined(__FreeBSD__) +# define CPUHINT() ((int)(_thr_self())) +#endif + +#ifndef CPUHINT +#define CPUHINT() (_thr_self()) +#endif + +#define CPUHINT_MAX() INT_MAX + +#define CPU(mask) (umem_cpus + (CPUHINT() & (mask))) +static umem_cpu_t umem_startup_cpu = { /* initial, single, cpu */ + UMEM_CACHE_SIZE(0), + 0 +}; + +static uint32_t umem_cpu_mask = 0; /* global cpu mask */ +static umem_cpu_t *umem_cpus = &umem_startup_cpu; /* cpu list */ + +volatile uint32_t umem_reaping; + +thread_t umem_update_thr; +struct timeval umem_update_next; /* timeofday of next update */ +volatile thread_t umem_st_update_thr; /* only used when single-thd */ + +#define IN_UPDATE() (thr_self() == umem_update_thr || \ + thr_self() == umem_st_update_thr) +#define IN_REAP() IN_UPDATE() + +mutex_t umem_update_lock = DEFAULTMUTEX; /* cache_u{next,prev,flags} */ +cond_t umem_update_cv = DEFAULTCV; + +volatile hrtime_t umem_reap_next; /* min hrtime of next reap */ + +mutex_t umem_cache_lock = DEFAULTMUTEX; /* inter-cache linkage only */ + +#ifdef UMEM_STANDALONE +umem_cache_t umem_null_cache; +static const umem_cache_t umem_null_cache_template = { +#else +umem_cache_t umem_null_cache = { +#endif + 
0, 0, 0, 0, 0, + 0, 0, + 0, 0, + 0, 0, + "invalid_cache", + 0, 0, + NULL, NULL, NULL, NULL, + NULL, + 0, 0, 0, 0, + &umem_null_cache, &umem_null_cache, + &umem_null_cache, &umem_null_cache, + 0, + DEFAULTMUTEX, /* start of slab layer */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + &umem_null_cache.cache_nullslab, + { + &umem_null_cache, + NULL, + &umem_null_cache.cache_nullslab, + &umem_null_cache.cache_nullslab, + NULL, + -1, + 0 + }, + NULL, + NULL, + DEFAULTMUTEX, /* start of depot layer */ + NULL, { + NULL, 0, 0, 0, 0 + }, { + NULL, 0, 0, 0, 0 + }, { + { + DEFAULTMUTEX, /* start of CPU cache */ + 0, 0, NULL, NULL, -1, -1, 0 + } + } +}; + +#define ALLOC_TABLE_4 \ + &umem_null_cache, &umem_null_cache, &umem_null_cache, &umem_null_cache + +#define ALLOC_TABLE_64 \ + ALLOC_TABLE_4, ALLOC_TABLE_4, ALLOC_TABLE_4, ALLOC_TABLE_4, \ + ALLOC_TABLE_4, ALLOC_TABLE_4, ALLOC_TABLE_4, ALLOC_TABLE_4, \ + ALLOC_TABLE_4, ALLOC_TABLE_4, ALLOC_TABLE_4, ALLOC_TABLE_4, \ + ALLOC_TABLE_4, ALLOC_TABLE_4, ALLOC_TABLE_4, ALLOC_TABLE_4 + +#define ALLOC_TABLE_1024 \ + ALLOC_TABLE_64, ALLOC_TABLE_64, ALLOC_TABLE_64, ALLOC_TABLE_64, \ + ALLOC_TABLE_64, ALLOC_TABLE_64, ALLOC_TABLE_64, ALLOC_TABLE_64, \ + ALLOC_TABLE_64, ALLOC_TABLE_64, ALLOC_TABLE_64, ALLOC_TABLE_64, \ + ALLOC_TABLE_64, ALLOC_TABLE_64, ALLOC_TABLE_64, ALLOC_TABLE_64 + +static umem_cache_t *umem_alloc_table[UMEM_MAXBUF >> UMEM_ALIGN_SHIFT] = { + ALLOC_TABLE_1024, + ALLOC_TABLE_1024 +}; + + +/* Used to constrain audit-log stack traces */ +caddr_t umem_min_stack; +caddr_t umem_max_stack; + + +/* + * we use the _ versions, since we don't want to be cancelled. + * Actually, this is automatically taken care of by including "mtlib.h". + */ +extern int _cond_wait(cond_t *cv, mutex_t *mutex); + +#define UMERR_MODIFIED 0 /* buffer modified while on freelist */ +#define UMERR_REDZONE 1 /* redzone violation (write past end of buf) */ +#define UMERR_DUPFREE 2 /* freed a buffer twice */ +#define UMERR_BADADDR 3 /* freed a bad (unallocated) address */ +#define UMERR_BADBUFTAG 4 /* buftag corrupted */ +#define UMERR_BADBUFCTL 5 /* bufctl corrupted */ +#define UMERR_BADCACHE 6 /* freed a buffer to the wrong cache */ +#define UMERR_BADSIZE 7 /* alloc size != free size */ +#define UMERR_BADBASE 8 /* buffer base address wrong */ + +struct { + hrtime_t ump_timestamp; /* timestamp of error */ + int ump_error; /* type of umem error (UMERR_*) */ + void *ump_buffer; /* buffer that induced abort */ + void *ump_realbuf; /* real start address for buffer */ + umem_cache_t *ump_cache; /* buffer's cache according to client */ + umem_cache_t *ump_realcache; /* actual cache containing buffer */ + umem_slab_t *ump_slab; /* slab accoring to umem_findslab() */ + umem_bufctl_t *ump_bufctl; /* bufctl */ +} umem_abort_info; + +static void +copy_pattern(uint64_t pattern, void *buf_arg, size_t size) +{ + uint64_t *bufend = (uint64_t *)((char *)buf_arg + size); + uint64_t *buf = buf_arg; + + while (buf < bufend) + *buf++ = pattern; +} + +static void * +verify_pattern(uint64_t pattern, void *buf_arg, size_t size) +{ + uint64_t *bufend = (uint64_t *)((char *)buf_arg + size); + uint64_t *buf; + + for (buf = buf_arg; buf < bufend; buf++) + if (*buf != pattern) + return (buf); + return (NULL); +} + +static void * +verify_and_copy_pattern(uint64_t old, uint64_t new, void *buf_arg, size_t size) +{ + uint64_t *bufend = (uint64_t *)((char *)buf_arg + size); + uint64_t *buf; + + for (buf = buf_arg; buf < bufend; buf++) { + if (*buf != old) { + copy_pattern(old, buf_arg, + (char *)buf - (char *)buf_arg); + 
return (buf); + } + *buf = new; + } + + return (NULL); +} + +void +umem_cache_applyall(void (*func)(umem_cache_t *)) +{ + umem_cache_t *cp; + + (void) mutex_lock(&umem_cache_lock); + for (cp = umem_null_cache.cache_next; cp != &umem_null_cache; + cp = cp->cache_next) + func(cp); + (void) mutex_unlock(&umem_cache_lock); +} + +static void +umem_add_update_unlocked(umem_cache_t *cp, int flags) +{ + umem_cache_t *cnext, *cprev; + + flags &= ~UMU_ACTIVE; + + if (!flags) + return; + + if (cp->cache_uflags & UMU_ACTIVE) { + cp->cache_uflags |= flags; + } else { + if (cp->cache_unext != NULL) { + ASSERT(cp->cache_uflags != 0); + cp->cache_uflags |= flags; + } else { + ASSERT(cp->cache_uflags == 0); + cp->cache_uflags = flags; + cp->cache_unext = cnext = &umem_null_cache; + cp->cache_uprev = cprev = umem_null_cache.cache_uprev; + cnext->cache_uprev = cp; + cprev->cache_unext = cp; + } + } +} + +static void +umem_add_update(umem_cache_t *cp, int flags) +{ + (void) mutex_lock(&umem_update_lock); + + umem_add_update_unlocked(cp, flags); + + if (!IN_UPDATE()) + (void) cond_broadcast(&umem_update_cv); + + (void) mutex_unlock(&umem_update_lock); +} + +/* + * Remove a cache from the update list, waiting for any in-progress work to + * complete first. + */ +static void +umem_remove_updates(umem_cache_t *cp) +{ + (void) mutex_lock(&umem_update_lock); + + /* + * Get it out of the active state + */ + while (cp->cache_uflags & UMU_ACTIVE) { + ASSERT(cp->cache_unext == NULL); + + cp->cache_uflags |= UMU_NOTIFY; + + /* + * Make sure the update state is sane, before we wait + */ + ASSERT(umem_update_thr != 0 || umem_st_update_thr != 0); + ASSERT(umem_update_thr != thr_self() && + umem_st_update_thr != thr_self()); + + (void) _cond_wait(&umem_update_cv, &umem_update_lock); + } + /* + * Get it out of the Work Requested state + */ + if (cp->cache_unext != NULL) { + cp->cache_uprev->cache_unext = cp->cache_unext; + cp->cache_unext->cache_uprev = cp->cache_uprev; + cp->cache_uprev = cp->cache_unext = NULL; + cp->cache_uflags = 0; + } + /* + * Make sure it is in the Inactive state + */ + ASSERT(cp->cache_unext == NULL && cp->cache_uflags == 0); + (void) mutex_unlock(&umem_update_lock); +} + +static void +umem_updateall(int flags) +{ + umem_cache_t *cp; + + /* + * NOTE: To prevent deadlock, umem_cache_lock is always acquired first. + * + * (umem_add_update is called from things run via umem_cache_applyall) + */ + (void) mutex_lock(&umem_cache_lock); + (void) mutex_lock(&umem_update_lock); + + for (cp = umem_null_cache.cache_next; cp != &umem_null_cache; + cp = cp->cache_next) + umem_add_update_unlocked(cp, flags); + + if (!IN_UPDATE()) + (void) cond_broadcast(&umem_update_cv); + + (void) mutex_unlock(&umem_update_lock); + (void) mutex_unlock(&umem_cache_lock); +} + +/* + * Debugging support. Given a buffer address, find its slab. 
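umem_add_update_unlocked() and umem_remove_updates() above maintain the Work Requested list as a circular doubly linked list whose sentinel is umem_null_cache: insertion links the cache in just before the sentinel, removal splices it out and clears its linkage. A standalone sketch of the same linkage pattern (generic node type, not umem's cache structure):

#include <stdio.h>

typedef struct node {
	struct node *next;
	struct node *prev;
	int flags;
} node_t;

/* circular list with a sentinel: empty means both links point at itself */
static node_t sentinel = { &sentinel, &sentinel, 0 };

/* Append at the tail (just before the sentinel). */
static void
list_append(node_t *np)
{
	np->next = &sentinel;
	np->prev = sentinel.prev;
	np->next->prev = np;
	np->prev->next = np;
}

/* Unlink a node and reset its linkage, as umem_remove_updates() does. */
static void
list_remove(node_t *np)
{
	np->prev->next = np->next;
	np->next->prev = np->prev;
	np->next = np->prev = NULL;
}

int
main(void)
{
	node_t a = { NULL, NULL, 1 }, b = { NULL, NULL, 2 };
	node_t *np;

	list_append(&a);
	list_append(&b);
	for (np = sentinel.next; np != &sentinel; np = np->next)
		(void) printf("flags=%d\n", np->flags);
	list_remove(&a);
	list_remove(&b);
	return (0);
}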
+ */ +static umem_slab_t * +umem_findslab(umem_cache_t *cp, void *buf) +{ + umem_slab_t *sp; + + (void) mutex_lock(&cp->cache_lock); + for (sp = cp->cache_nullslab.slab_next; + sp != &cp->cache_nullslab; sp = sp->slab_next) { + if (UMEM_SLAB_MEMBER(sp, buf)) { + (void) mutex_unlock(&cp->cache_lock); + return (sp); + } + } + (void) mutex_unlock(&cp->cache_lock); + + return (NULL); +} + +static void +umem_error(int error, umem_cache_t *cparg, void *bufarg) +{ + umem_buftag_t *btp = NULL; + umem_bufctl_t *bcp = NULL; + umem_cache_t *cp = cparg; + umem_slab_t *sp; + uint64_t *off; + void *buf = bufarg; + + int old_logging = umem_logging; + + umem_logging = 0; /* stop logging when a bad thing happens */ + + umem_abort_info.ump_timestamp = gethrtime(); + + sp = umem_findslab(cp, buf); + if (sp == NULL) { + for (cp = umem_null_cache.cache_prev; cp != &umem_null_cache; + cp = cp->cache_prev) { + if ((sp = umem_findslab(cp, buf)) != NULL) + break; + } + } + + if (sp == NULL) { + cp = NULL; + error = UMERR_BADADDR; + } else { + if (cp != cparg) + error = UMERR_BADCACHE; + else + buf = (char *)bufarg - ((uintptr_t)bufarg - + (uintptr_t)sp->slab_base) % cp->cache_chunksize; + if (buf != bufarg) + error = UMERR_BADBASE; + if (cp->cache_flags & UMF_BUFTAG) + btp = UMEM_BUFTAG(cp, buf); + if (cp->cache_flags & UMF_HASH) { + (void) mutex_lock(&cp->cache_lock); + for (bcp = *UMEM_HASH(cp, buf); bcp; bcp = bcp->bc_next) + if (bcp->bc_addr == buf) + break; + (void) mutex_unlock(&cp->cache_lock); + if (bcp == NULL && btp != NULL) + bcp = btp->bt_bufctl; + if (umem_findslab(cp->cache_bufctl_cache, bcp) == + NULL || P2PHASE((uintptr_t)bcp, UMEM_ALIGN) || + bcp->bc_addr != buf) { + error = UMERR_BADBUFCTL; + bcp = NULL; + } + } + } + + umem_abort_info.ump_error = error; + umem_abort_info.ump_buffer = bufarg; + umem_abort_info.ump_realbuf = buf; + umem_abort_info.ump_cache = cparg; + umem_abort_info.ump_realcache = cp; + umem_abort_info.ump_slab = sp; + umem_abort_info.ump_bufctl = bcp; + + umem_printf("umem allocator: "); + + switch (error) { + + case UMERR_MODIFIED: + umem_printf("buffer modified after being freed\n"); + off = verify_pattern(UMEM_FREE_PATTERN, buf, cp->cache_verify); + if (off == NULL) /* shouldn't happen */ + off = buf; + umem_printf("modification occurred at offset 0x%lx " + "(0x%llx replaced by 0x%llx)\n", + (uintptr_t)off - (uintptr_t)buf, + (longlong_t)UMEM_FREE_PATTERN, (longlong_t)*off); + break; + + case UMERR_REDZONE: + umem_printf("redzone violation: write past end of buffer\n"); + break; + + case UMERR_BADADDR: + umem_printf("invalid free: buffer not in cache\n"); + break; + + case UMERR_DUPFREE: + umem_printf("duplicate free: buffer freed twice\n"); + break; + + case UMERR_BADBUFTAG: + umem_printf("boundary tag corrupted\n"); + umem_printf("bcp ^ bxstat = %lx, should be %lx\n", + (intptr_t)btp->bt_bufctl ^ btp->bt_bxstat, + UMEM_BUFTAG_FREE); + break; + + case UMERR_BADBUFCTL: + umem_printf("bufctl corrupted\n"); + break; + + case UMERR_BADCACHE: + umem_printf("buffer freed to wrong cache\n"); + umem_printf("buffer was allocated from %s,\n", cp->cache_name); + umem_printf("caller attempting free to %s.\n", + cparg->cache_name); + break; + + case UMERR_BADSIZE: + umem_printf("bad free: free size (%u) != alloc size (%u)\n", + UMEM_SIZE_DECODE(((uint32_t *)btp)[0]), + UMEM_SIZE_DECODE(((uint32_t *)btp)[1])); + break; + + case UMERR_BADBASE: + umem_printf("bad free: free address (%p) != alloc address " + "(%p)\n", bufarg, buf); + break; + } + + umem_printf("buffer=%p bufctl=%p cache: 
%s\n", + bufarg, (void *)bcp, cparg->cache_name); + + if (bcp != NULL && (cp->cache_flags & UMF_AUDIT) && + error != UMERR_BADBUFCTL) { + int d; + timespec_t ts; + hrtime_t diff; + umem_bufctl_audit_t *bcap = (umem_bufctl_audit_t *)bcp; + + diff = umem_abort_info.ump_timestamp - bcap->bc_timestamp; + ts.tv_sec = diff / NANOSEC; + ts.tv_nsec = diff % NANOSEC; + + umem_printf("previous transaction on buffer %p:\n", buf); + umem_printf("thread=%p time=T-%ld.%09ld slab=%p cache: %s\n", + (void *)(intptr_t)bcap->bc_thread, ts.tv_sec, ts.tv_nsec, + (void *)sp, cp->cache_name); + for (d = 0; d < MIN(bcap->bc_depth, umem_stack_depth); d++) { + (void) print_sym((void *)bcap->bc_stack[d]); + umem_printf("\n"); + } + } + + umem_err_recoverable("umem: heap corruption detected"); + + umem_logging = old_logging; /* resume logging */ +} + +void +umem_nofail_callback(umem_nofail_callback_t *cb) +{ + nofail_callback = cb; +} + +static int +umem_alloc_retry(umem_cache_t *cp, int umflag) +{ + if (cp == &umem_null_cache) { + if (umem_init()) + return (1); /* retry */ + /* + * Initialization failed. Do normal failure processing. + */ + } + if (umflag & UMEM_NOFAIL) { + int def_result = UMEM_CALLBACK_EXIT(255); + int result = def_result; + umem_nofail_callback_t *callback = nofail_callback; + + if (callback != NULL) + result = callback(); + + if (result == UMEM_CALLBACK_RETRY) + return (1); + + if ((result & ~0xFF) != UMEM_CALLBACK_EXIT(0)) { + log_message("nofail callback returned %x\n", result); + result = def_result; + } + + /* + * only one thread will call exit + */ + if (umem_nofail_exit_thr == thr_self()) + umem_panic("recursive UMEM_CALLBACK_EXIT()\n"); + + (void) mutex_lock(&umem_nofail_exit_lock); + umem_nofail_exit_thr = thr_self(); + exit(result & 0xFF); + /*NOTREACHED*/ + } + return (0); +} + +static umem_log_header_t * +umem_log_init(size_t logsize) +{ + umem_log_header_t *lhp; + int nchunks = 4 * umem_max_ncpus; + size_t lhsize = offsetof(umem_log_header_t, lh_cpu[umem_max_ncpus]); + int i; + + if (logsize == 0) + return (NULL); + + /* + * Make sure that lhp->lh_cpu[] is nicely aligned + * to prevent false sharing of cache lines. 
+ */ + lhsize = P2ROUNDUP(lhsize, UMEM_ALIGN); + lhp = vmem_xalloc(umem_log_arena, lhsize, 64, P2NPHASE(lhsize, 64), 0, + NULL, NULL, VM_NOSLEEP); + if (lhp == NULL) + goto fail; + + bzero(lhp, lhsize); + + (void) mutex_init(&lhp->lh_lock, USYNC_THREAD, NULL); + lhp->lh_nchunks = nchunks; + lhp->lh_chunksize = P2ROUNDUP(logsize / nchunks, PAGESIZE); + if (lhp->lh_chunksize == 0) + lhp->lh_chunksize = PAGESIZE; + + lhp->lh_base = vmem_alloc(umem_log_arena, + lhp->lh_chunksize * nchunks, VM_NOSLEEP); + if (lhp->lh_base == NULL) + goto fail; + + lhp->lh_free = vmem_alloc(umem_log_arena, + nchunks * sizeof (int), VM_NOSLEEP); + if (lhp->lh_free == NULL) + goto fail; + + bzero(lhp->lh_base, lhp->lh_chunksize * nchunks); + + for (i = 0; i < umem_max_ncpus; i++) { + umem_cpu_log_header_t *clhp = &lhp->lh_cpu[i]; + (void) mutex_init(&clhp->clh_lock, USYNC_THREAD, NULL); + clhp->clh_chunk = i; + } + + for (i = umem_max_ncpus; i < nchunks; i++) + lhp->lh_free[i] = i; + + lhp->lh_head = umem_max_ncpus; + lhp->lh_tail = 0; + + return (lhp); + +fail: + if (lhp != NULL) { + if (lhp->lh_base != NULL) + vmem_free(umem_log_arena, lhp->lh_base, + lhp->lh_chunksize * nchunks); + + vmem_xfree(umem_log_arena, lhp, lhsize); + } + return (NULL); +} + +static void * +umem_log_enter(umem_log_header_t *lhp, void *data, size_t size) +{ + void *logspace; + umem_cpu_log_header_t *clhp = + &(lhp->lh_cpu[CPU(umem_cpu_mask)->cpu_number]); + + if (lhp == NULL || umem_logging == 0) + return (NULL); + + (void) mutex_lock(&clhp->clh_lock); + clhp->clh_hits++; + if (size > clhp->clh_avail) { + (void) mutex_lock(&lhp->lh_lock); + lhp->lh_hits++; + lhp->lh_free[lhp->lh_tail] = clhp->clh_chunk; + lhp->lh_tail = (lhp->lh_tail + 1) % lhp->lh_nchunks; + clhp->clh_chunk = lhp->lh_free[lhp->lh_head]; + lhp->lh_head = (lhp->lh_head + 1) % lhp->lh_nchunks; + clhp->clh_current = lhp->lh_base + + clhp->clh_chunk * lhp->lh_chunksize; + clhp->clh_avail = lhp->lh_chunksize; + if (size > lhp->lh_chunksize) + size = lhp->lh_chunksize; + (void) mutex_unlock(&lhp->lh_lock); + } + logspace = clhp->clh_current; + clhp->clh_current += size; + clhp->clh_avail -= size; + bcopy(data, logspace, size); + (void) mutex_unlock(&clhp->clh_lock); + return (logspace); +} + +#define UMEM_AUDIT(lp, cp, bcp) \ +{ \ + umem_bufctl_audit_t *_bcp = (umem_bufctl_audit_t *)(bcp); \ + _bcp->bc_timestamp = gethrtime(); \ + _bcp->bc_thread = thr_self(); \ + _bcp->bc_depth = getpcstack(_bcp->bc_stack, umem_stack_depth, \ + (cp != NULL) && (cp->cache_flags & UMF_CHECKSIGNAL)); \ + _bcp->bc_lastlog = umem_log_enter((lp), _bcp, \ + UMEM_BUFCTL_AUDIT_SIZE); \ +} + +static void +umem_log_event(umem_log_header_t *lp, umem_cache_t *cp, + umem_slab_t *sp, void *addr) +{ + umem_bufctl_audit_t *bcp; + UMEM_LOCAL_BUFCTL_AUDIT(&bcp); + + bzero(bcp, UMEM_BUFCTL_AUDIT_SIZE); + bcp->bc_addr = addr; + bcp->bc_slab = sp; + bcp->bc_cache = cp; + UMEM_AUDIT(lp, cp, bcp); +} + +/* + * Create a new slab for cache cp. 
+ */ +static umem_slab_t * +umem_slab_create(umem_cache_t *cp, int umflag) +{ + size_t slabsize = cp->cache_slabsize; + size_t chunksize = cp->cache_chunksize; + int cache_flags = cp->cache_flags; + size_t color, chunks; + char *buf, *slab; + umem_slab_t *sp; + umem_bufctl_t *bcp; + vmem_t *vmp = cp->cache_arena; + + color = cp->cache_color + cp->cache_align; + if (color > cp->cache_maxcolor) + color = cp->cache_mincolor; + cp->cache_color = color; + + slab = vmem_alloc(vmp, slabsize, UMEM_VMFLAGS(umflag)); + + if (slab == NULL) + goto vmem_alloc_failure; + + ASSERT(P2PHASE((uintptr_t)slab, vmp->vm_quantum) == 0); + + if (!(cp->cache_cflags & UMC_NOTOUCH) && + (cp->cache_flags & UMF_DEADBEEF)) + copy_pattern(UMEM_UNINITIALIZED_PATTERN, slab, slabsize); + + if (cache_flags & UMF_HASH) { + if ((sp = _umem_cache_alloc(umem_slab_cache, umflag)) == NULL) + goto slab_alloc_failure; + chunks = (slabsize - color) / chunksize; + } else { + sp = UMEM_SLAB(cp, slab); + chunks = (slabsize - sizeof (umem_slab_t) - color) / chunksize; + } + + sp->slab_cache = cp; + sp->slab_head = NULL; + sp->slab_refcnt = 0; + sp->slab_base = buf = slab + color; + sp->slab_chunks = chunks; + + ASSERT(chunks > 0); + while (chunks-- != 0) { + if (cache_flags & UMF_HASH) { + bcp = _umem_cache_alloc(cp->cache_bufctl_cache, umflag); + if (bcp == NULL) + goto bufctl_alloc_failure; + if (cache_flags & UMF_AUDIT) { + umem_bufctl_audit_t *bcap = + (umem_bufctl_audit_t *)bcp; + bzero(bcap, UMEM_BUFCTL_AUDIT_SIZE); + bcap->bc_cache = cp; + } + bcp->bc_addr = buf; + bcp->bc_slab = sp; + } else { + bcp = UMEM_BUFCTL(cp, buf); + } + if (cache_flags & UMF_BUFTAG) { + umem_buftag_t *btp = UMEM_BUFTAG(cp, buf); + btp->bt_redzone = UMEM_REDZONE_PATTERN; + btp->bt_bufctl = bcp; + btp->bt_bxstat = (intptr_t)bcp ^ UMEM_BUFTAG_FREE; + if (cache_flags & UMF_DEADBEEF) { + copy_pattern(UMEM_FREE_PATTERN, buf, + cp->cache_verify); + } + } + bcp->bc_next = sp->slab_head; + sp->slab_head = bcp; + buf += chunksize; + } + + umem_log_event(umem_slab_log, cp, sp, slab); + + return (sp); + +bufctl_alloc_failure: + + while ((bcp = sp->slab_head) != NULL) { + sp->slab_head = bcp->bc_next; + _umem_cache_free(cp->cache_bufctl_cache, bcp); + } + _umem_cache_free(umem_slab_cache, sp); + +slab_alloc_failure: + + vmem_free(vmp, slab, slabsize); + +vmem_alloc_failure: + + umem_log_event(umem_failure_log, cp, NULL, NULL); + atomic_add_64(&cp->cache_alloc_fail, 1); + + return (NULL); +} + +/* + * Destroy a slab. + */ +static void +umem_slab_destroy(umem_cache_t *cp, umem_slab_t *sp) +{ + vmem_t *vmp = cp->cache_arena; + void *slab = (void *)P2ALIGN((uintptr_t)sp->slab_base, vmp->vm_quantum); + + if (cp->cache_flags & UMF_HASH) { + umem_bufctl_t *bcp; + while ((bcp = sp->slab_head) != NULL) { + sp->slab_head = bcp->bc_next; + _umem_cache_free(cp->cache_bufctl_cache, bcp); + } + _umem_cache_free(umem_slab_cache, sp); + } + vmem_free(vmp, slab, cp->cache_slabsize); +} + +/* + * Allocate a raw (unconstructed) buffer from cp's slab layer. + */ +static void * +umem_slab_alloc(umem_cache_t *cp, int umflag) +{ + umem_bufctl_t *bcp, **hash_bucket; + umem_slab_t *sp; + void *buf; + + (void) mutex_lock(&cp->cache_lock); + cp->cache_slab_alloc++; + sp = cp->cache_freelist; + ASSERT(sp->slab_cache == cp); + if (sp->slab_head == NULL) { + /* + * The freelist is empty. Create a new slab. 
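umem_slab_create() above also advances the slab color on every new slab, so that the buffer at a given index starts at a different offset in successive slabs. A quick trace of that cycling with made-up numbers:

#include <stdio.h>

int
main(void)
{
        size_t mincolor = 0, maxcolor = 192, align = 64;
        size_t color = mincolor;
        int slab;

        for (slab = 0; slab < 6; slab++) {
                color += align;         /* color = cache_color + cache_align */
                if (color > maxcolor)
                        color = mincolor;
                printf("slab %d: buffers begin at offset %zu\n", slab, color);
        }
        return (0);
}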
+ */ + (void) mutex_unlock(&cp->cache_lock); + if (cp == &umem_null_cache) + return (NULL); + if ((sp = umem_slab_create(cp, umflag)) == NULL) + return (NULL); + (void) mutex_lock(&cp->cache_lock); + cp->cache_slab_create++; + if ((cp->cache_buftotal += sp->slab_chunks) > cp->cache_bufmax) + cp->cache_bufmax = cp->cache_buftotal; + sp->slab_next = cp->cache_freelist; + sp->slab_prev = cp->cache_freelist->slab_prev; + sp->slab_next->slab_prev = sp; + sp->slab_prev->slab_next = sp; + cp->cache_freelist = sp; + } + + sp->slab_refcnt++; + ASSERT(sp->slab_refcnt <= sp->slab_chunks); + + /* + * If we're taking the last buffer in the slab, + * remove the slab from the cache's freelist. + */ + bcp = sp->slab_head; + if ((sp->slab_head = bcp->bc_next) == NULL) { + cp->cache_freelist = sp->slab_next; + ASSERT(sp->slab_refcnt == sp->slab_chunks); + } + + if (cp->cache_flags & UMF_HASH) { + /* + * Add buffer to allocated-address hash table. + */ + buf = bcp->bc_addr; + hash_bucket = UMEM_HASH(cp, buf); + bcp->bc_next = *hash_bucket; + *hash_bucket = bcp; + if ((cp->cache_flags & (UMF_AUDIT | UMF_BUFTAG)) == UMF_AUDIT) { + UMEM_AUDIT(umem_transaction_log, cp, bcp); + } + } else { + buf = UMEM_BUF(cp, bcp); + } + + ASSERT(UMEM_SLAB_MEMBER(sp, buf)); + + (void) mutex_unlock(&cp->cache_lock); + + return (buf); +} + +/* + * Free a raw (unconstructed) buffer to cp's slab layer. + */ +static void +umem_slab_free(umem_cache_t *cp, void *buf) +{ + umem_slab_t *sp; + umem_bufctl_t *bcp, **prev_bcpp; + + ASSERT(buf != NULL); + + (void) mutex_lock(&cp->cache_lock); + cp->cache_slab_free++; + + if (cp->cache_flags & UMF_HASH) { + /* + * Look up buffer in allocated-address hash table. + */ + prev_bcpp = UMEM_HASH(cp, buf); + while ((bcp = *prev_bcpp) != NULL) { + if (bcp->bc_addr == buf) { + *prev_bcpp = bcp->bc_next; + sp = bcp->bc_slab; + break; + } + cp->cache_lookup_depth++; + prev_bcpp = &bcp->bc_next; + } + } else { + bcp = UMEM_BUFCTL(cp, buf); + sp = UMEM_SLAB(cp, buf); + } + + if (bcp == NULL || sp->slab_cache != cp || !UMEM_SLAB_MEMBER(sp, buf)) { + (void) mutex_unlock(&cp->cache_lock); + umem_error(UMERR_BADADDR, cp, buf); + return; + } + + if ((cp->cache_flags & (UMF_AUDIT | UMF_BUFTAG)) == UMF_AUDIT) { + if (cp->cache_flags & UMF_CONTENTS) + ((umem_bufctl_audit_t *)bcp)->bc_contents = + umem_log_enter(umem_content_log, buf, + cp->cache_contents); + UMEM_AUDIT(umem_transaction_log, cp, bcp); + } + + /* + * If this slab isn't currently on the freelist, put it there. + */ + if (sp->slab_head == NULL) { + ASSERT(sp->slab_refcnt == sp->slab_chunks); + ASSERT(cp->cache_freelist != sp); + sp->slab_next->slab_prev = sp->slab_prev; + sp->slab_prev->slab_next = sp->slab_next; + sp->slab_next = cp->cache_freelist; + sp->slab_prev = cp->cache_freelist->slab_prev; + sp->slab_next->slab_prev = sp; + sp->slab_prev->slab_next = sp; + cp->cache_freelist = sp; + } + + bcp->bc_next = sp->slab_head; + sp->slab_head = bcp; + + ASSERT(sp->slab_refcnt >= 1); + if (--sp->slab_refcnt == 0) { + /* + * There are no outstanding allocations from this slab, + * so we can reclaim the memory. 
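For UMF_HASH caches, both the allocation and free paths above locate a buffer's bufctl through the per-cache hash table. Assuming UMEM_HASH() reduces the address with the cache_hash_shift and cache_hash_mask values set up in umem_cache_create() (an assumption here; the macro itself lives in umem_impl.h), bucket selection looks roughly like this:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
        unsigned hash_shift = 9;        /* e.g. highbit(512-byte chunk) - 1 */
        unsigned hash_mask = 64 - 1;    /* e.g. a 64-bucket initial table */
        uintptr_t buf = (uintptr_t)0x08052600;  /* pretend buffer address */

        unsigned bucket = (unsigned)((buf >> hash_shift) & hash_mask);
        printf("buffer %#lx -> hash bucket %u\n", (unsigned long)buf, bucket);
        return (0);
}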
+ */ + sp->slab_next->slab_prev = sp->slab_prev; + sp->slab_prev->slab_next = sp->slab_next; + if (sp == cp->cache_freelist) + cp->cache_freelist = sp->slab_next; + cp->cache_slab_destroy++; + cp->cache_buftotal -= sp->slab_chunks; + (void) mutex_unlock(&cp->cache_lock); + umem_slab_destroy(cp, sp); + return; + } + (void) mutex_unlock(&cp->cache_lock); +} + +static int +umem_cache_alloc_debug(umem_cache_t *cp, void *buf, int umflag) +{ + umem_buftag_t *btp = UMEM_BUFTAG(cp, buf); + umem_bufctl_audit_t *bcp = (umem_bufctl_audit_t *)btp->bt_bufctl; + uint32_t mtbf; + int flags_nfatal; + + if (btp->bt_bxstat != ((intptr_t)bcp ^ UMEM_BUFTAG_FREE)) { + umem_error(UMERR_BADBUFTAG, cp, buf); + return (-1); + } + + btp->bt_bxstat = (intptr_t)bcp ^ UMEM_BUFTAG_ALLOC; + + if ((cp->cache_flags & UMF_HASH) && bcp->bc_addr != buf) { + umem_error(UMERR_BADBUFCTL, cp, buf); + return (-1); + } + + btp->bt_redzone = UMEM_REDZONE_PATTERN; + + if (cp->cache_flags & UMF_DEADBEEF) { + if (verify_and_copy_pattern(UMEM_FREE_PATTERN, + UMEM_UNINITIALIZED_PATTERN, buf, cp->cache_verify)) { + umem_error(UMERR_MODIFIED, cp, buf); + return (-1); + } + } + + if ((mtbf = umem_mtbf | cp->cache_mtbf) != 0 && + gethrtime() % mtbf == 0 && + (umflag & (UMEM_FATAL_FLAGS)) == 0) { + umem_log_event(umem_failure_log, cp, NULL, NULL); + } else { + mtbf = 0; + } + + /* + * We do not pass fatal flags on to the constructor. This prevents + * leaking buffers in the event of a subordinate constructor failing. + */ + flags_nfatal = UMEM_DEFAULT; + if (mtbf || (cp->cache_constructor != NULL && + cp->cache_constructor(buf, cp->cache_private, flags_nfatal) != 0)) { + atomic_add_64(&cp->cache_alloc_fail, 1); + btp->bt_bxstat = (intptr_t)bcp ^ UMEM_BUFTAG_FREE; + copy_pattern(UMEM_FREE_PATTERN, buf, cp->cache_verify); + umem_slab_free(cp, buf); + return (-1); + } + + if (cp->cache_flags & UMF_AUDIT) { + UMEM_AUDIT(umem_transaction_log, cp, bcp); + } + + return (0); +} + +static int +umem_cache_free_debug(umem_cache_t *cp, void *buf) +{ + umem_buftag_t *btp = UMEM_BUFTAG(cp, buf); + umem_bufctl_audit_t *bcp = (umem_bufctl_audit_t *)btp->bt_bufctl; + umem_slab_t *sp; + + if (btp->bt_bxstat != ((intptr_t)bcp ^ UMEM_BUFTAG_ALLOC)) { + if (btp->bt_bxstat == ((intptr_t)bcp ^ UMEM_BUFTAG_FREE)) { + umem_error(UMERR_DUPFREE, cp, buf); + return (-1); + } + sp = umem_findslab(cp, buf); + if (sp == NULL || sp->slab_cache != cp) + umem_error(UMERR_BADADDR, cp, buf); + else + umem_error(UMERR_REDZONE, cp, buf); + return (-1); + } + + btp->bt_bxstat = (intptr_t)bcp ^ UMEM_BUFTAG_FREE; + + if ((cp->cache_flags & UMF_HASH) && bcp->bc_addr != buf) { + umem_error(UMERR_BADBUFCTL, cp, buf); + return (-1); + } + + if (btp->bt_redzone != UMEM_REDZONE_PATTERN) { + umem_error(UMERR_REDZONE, cp, buf); + return (-1); + } + + if (cp->cache_flags & UMF_AUDIT) { + if (cp->cache_flags & UMF_CONTENTS) + bcp->bc_contents = umem_log_enter(umem_content_log, + buf, cp->cache_contents); + UMEM_AUDIT(umem_transaction_log, cp, bcp); + } + + if (cp->cache_destructor != NULL) + cp->cache_destructor(buf, cp->cache_private); + + if (cp->cache_flags & UMF_DEADBEEF) + copy_pattern(UMEM_FREE_PATTERN, buf, cp->cache_verify); + + return (0); +} + +/* + * Free each object in magazine mp to cp's slab layer, and free mp itself. 
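The buftag checks in umem_cache_alloc_debug() and umem_cache_free_debug() above hinge on bt_bxstat holding the bufctl pointer XOR-ed with a state pattern, so a corrupted tag and a duplicate free are both detectable. A toy version; the pattern values are stand-ins, not the real UMEM_BUFTAG_* constants:

#include <stdio.h>
#include <stdint.h>

#define TAG_ALLOC       0xa110c8edUL    /* stand-in for UMEM_BUFTAG_ALLOC */
#define TAG_FREE        0xf4eef4eeUL    /* stand-in for UMEM_BUFTAG_FREE */

int
main(void)
{
        uintptr_t bcp = (uintptr_t)0x08064a40;  /* pretend bufctl address */
        uintptr_t bxstat = bcp ^ TAG_FREE;      /* buffer starts out free */

        /* Allocation path: the tag must currently decode as "free". */
        if (bxstat != (bcp ^ TAG_FREE)) {
                printf("UMERR_BADBUFTAG\n");
                return (1);
        }
        bxstat = bcp ^ TAG_ALLOC;               /* mark it allocated */

        /* Free path: a tag that already decodes as "free" means the
           buffer was freed twice. */
        bxstat = bcp ^ TAG_FREE;                /* first free: legal */
        if (bxstat == (bcp ^ TAG_FREE))
                printf("a second free would now raise UMERR_DUPFREE\n");
        return (0);
}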
+ */ +static void +umem_magazine_destroy(umem_cache_t *cp, umem_magazine_t *mp, int nrounds) +{ + int round; + + ASSERT(cp->cache_next == NULL || IN_UPDATE()); + + for (round = 0; round < nrounds; round++) { + void *buf = mp->mag_round[round]; + + if ((cp->cache_flags & UMF_DEADBEEF) && + verify_pattern(UMEM_FREE_PATTERN, buf, + cp->cache_verify) != NULL) { + umem_error(UMERR_MODIFIED, cp, buf); + continue; + } + + if (!(cp->cache_flags & UMF_BUFTAG) && + cp->cache_destructor != NULL) + cp->cache_destructor(buf, cp->cache_private); + + umem_slab_free(cp, buf); + } + ASSERT(UMEM_MAGAZINE_VALID(cp, mp)); + _umem_cache_free(cp->cache_magtype->mt_cache, mp); +} + +/* + * Allocate a magazine from the depot. + */ +static umem_magazine_t * +umem_depot_alloc(umem_cache_t *cp, umem_maglist_t *mlp) +{ + umem_magazine_t *mp; + + /* + * If we can't get the depot lock without contention, + * update our contention count. We use the depot + * contention rate to determine whether we need to + * increase the magazine size for better scalability. + */ + if (mutex_trylock(&cp->cache_depot_lock) != 0) { + (void) mutex_lock(&cp->cache_depot_lock); + cp->cache_depot_contention++; + } + + if ((mp = mlp->ml_list) != NULL) { + ASSERT(UMEM_MAGAZINE_VALID(cp, mp)); + mlp->ml_list = mp->mag_next; + if (--mlp->ml_total < mlp->ml_min) + mlp->ml_min = mlp->ml_total; + mlp->ml_alloc++; + } + + (void) mutex_unlock(&cp->cache_depot_lock); + + return (mp); +} + +/* + * Free a magazine to the depot. + */ +static void +umem_depot_free(umem_cache_t *cp, umem_maglist_t *mlp, umem_magazine_t *mp) +{ + (void) mutex_lock(&cp->cache_depot_lock); + ASSERT(UMEM_MAGAZINE_VALID(cp, mp)); + mp->mag_next = mlp->ml_list; + mlp->ml_list = mp; + mlp->ml_total++; + (void) mutex_unlock(&cp->cache_depot_lock); +} + +/* + * Update the working set statistics for cp's depot. + */ +static void +umem_depot_ws_update(umem_cache_t *cp) +{ + (void) mutex_lock(&cp->cache_depot_lock); + cp->cache_full.ml_reaplimit = cp->cache_full.ml_min; + cp->cache_full.ml_min = cp->cache_full.ml_total; + cp->cache_empty.ml_reaplimit = cp->cache_empty.ml_min; + cp->cache_empty.ml_min = cp->cache_empty.ml_total; + (void) mutex_unlock(&cp->cache_depot_lock); +} + +/* + * Reap all magazines that have fallen out of the depot's working set. + */ +static void +umem_depot_ws_reap(umem_cache_t *cp) +{ + long reap; + umem_magazine_t *mp; + + ASSERT(cp->cache_next == NULL || IN_REAP()); + + reap = MIN(cp->cache_full.ml_reaplimit, cp->cache_full.ml_min); + while (reap-- && (mp = umem_depot_alloc(cp, &cp->cache_full)) != NULL) + umem_magazine_destroy(cp, mp, cp->cache_magtype->mt_magsize); + + reap = MIN(cp->cache_empty.ml_reaplimit, cp->cache_empty.ml_min); + while (reap-- && (mp = umem_depot_alloc(cp, &cp->cache_empty)) != NULL) + umem_magazine_destroy(cp, mp, 0); +} + +static void +umem_cpu_reload(umem_cpu_cache_t *ccp, umem_magazine_t *mp, int rounds) +{ + ASSERT((ccp->cc_loaded == NULL && ccp->cc_rounds == -1) || + (ccp->cc_loaded && ccp->cc_rounds + rounds == ccp->cc_magsize)); + ASSERT(ccp->cc_magsize > 0); + + ccp->cc_ploaded = ccp->cc_loaded; + ccp->cc_prounds = ccp->cc_rounds; + ccp->cc_loaded = mp; + ccp->cc_rounds = rounds; +} + +/* + * Allocate a constructed object from cache cp. 
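A toy walk-through of the depot working-set accounting above, with made-up counts: each update interval records how low a magazine list dipped, and the next reap releases at most that many magazines, since anything below the low-water mark went unused for a whole interval.

#include <stdio.h>

#define MIN(a, b)       ((a) < (b) ? (a) : (b))

int
main(void)
{
        long ml_total = 10;     /* magazines on the full list right now */
        long ml_min = 4;        /* lowest the list dipped this interval */
        long ml_reaplimit;

        /* umem_depot_ws_update(): remember the low-water mark, restart ml_min */
        ml_reaplimit = ml_min;
        ml_min = ml_total;

        /* umem_depot_ws_reap(): only the unused surplus is destroyed */
        printf("reap %ld of %ld magazines\n", MIN(ml_reaplimit, ml_min), ml_total);
        return (0);
}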
+ */ +#ifndef NO_WEAK_SYMBOLS +#pragma weak umem_cache_alloc = _umem_cache_alloc +#endif +void * +_umem_cache_alloc(umem_cache_t *cp, int umflag) +{ + umem_cpu_cache_t *ccp; + umem_magazine_t *fmp; + void *buf; + int flags_nfatal; + +retry: + ccp = UMEM_CPU_CACHE(cp, CPU(cp->cache_cpu_mask)); + (void) mutex_lock(&ccp->cc_lock); + for (;;) { + /* + * If there's an object available in the current CPU's + * loaded magazine, just take it and return. + */ + if (ccp->cc_rounds > 0) { + buf = ccp->cc_loaded->mag_round[--ccp->cc_rounds]; + ccp->cc_alloc++; + (void) mutex_unlock(&ccp->cc_lock); + if ((ccp->cc_flags & UMF_BUFTAG) && + umem_cache_alloc_debug(cp, buf, umflag) == -1) { + if (umem_alloc_retry(cp, umflag)) { + goto retry; + } + + return (NULL); + } + return (buf); + } + + /* + * The loaded magazine is empty. If the previously loaded + * magazine was full, exchange them and try again. + */ + if (ccp->cc_prounds > 0) { + umem_cpu_reload(ccp, ccp->cc_ploaded, ccp->cc_prounds); + continue; + } + + /* + * If the magazine layer is disabled, break out now. + */ + if (ccp->cc_magsize == 0) + break; + + /* + * Try to get a full magazine from the depot. + */ + fmp = umem_depot_alloc(cp, &cp->cache_full); + if (fmp != NULL) { + if (ccp->cc_ploaded != NULL) + umem_depot_free(cp, &cp->cache_empty, + ccp->cc_ploaded); + umem_cpu_reload(ccp, fmp, ccp->cc_magsize); + continue; + } + + /* + * There are no full magazines in the depot, + * so fall through to the slab layer. + */ + break; + } + (void) mutex_unlock(&ccp->cc_lock); + + /* + * We couldn't allocate a constructed object from the magazine layer, + * so get a raw buffer from the slab layer and apply its constructor. + */ + buf = umem_slab_alloc(cp, umflag); + + if (buf == NULL) { + if (cp == &umem_null_cache) + return (NULL); + if (umem_alloc_retry(cp, umflag)) { + goto retry; + } + + return (NULL); + } + + if (cp->cache_flags & UMF_BUFTAG) { + /* + * Let umem_cache_alloc_debug() apply the constructor for us. + */ + if (umem_cache_alloc_debug(cp, buf, umflag) == -1) { + if (umem_alloc_retry(cp, umflag)) { + goto retry; + } + return (NULL); + } + return (buf); + } + + /* + * We do not pass fatal flags on to the constructor. This prevents + * leaking buffers in the event of a subordinate constructor failing. + */ + flags_nfatal = UMEM_DEFAULT; + if (cp->cache_constructor != NULL && + cp->cache_constructor(buf, cp->cache_private, flags_nfatal) != 0) { + atomic_add_64(&cp->cache_alloc_fail, 1); + umem_slab_free(cp, buf); + + if (umem_alloc_retry(cp, umflag)) { + goto retry; + } + return (NULL); + } + + return (buf); +} + +/* + * Free a constructed object to cache cp. + */ +#ifndef NO_WEAK_SYMBOLS +#pragma weak umem_cache_free = _umem_cache_free +#endif +void +_umem_cache_free(umem_cache_t *cp, void *buf) +{ + umem_cpu_cache_t *ccp = UMEM_CPU_CACHE(cp, CPU(cp->cache_cpu_mask)); + umem_magazine_t *emp; + umem_magtype_t *mtp; + + if (ccp->cc_flags & UMF_BUFTAG) + if (umem_cache_free_debug(cp, buf) == -1) + return; + + (void) mutex_lock(&ccp->cc_lock); + for (;;) { + /* + * If there's a slot available in the current CPU's + * loaded magazine, just put the object there and return. + */ + if ((uint_t)ccp->cc_rounds < ccp->cc_magsize) { + ccp->cc_loaded->mag_round[ccp->cc_rounds++] = buf; + ccp->cc_free++; + (void) mutex_unlock(&ccp->cc_lock); + return; + } + + /* + * The loaded magazine is full. If the previously loaded + * magazine was empty, exchange them and try again. 
+ */ + if (ccp->cc_prounds == 0) { + umem_cpu_reload(ccp, ccp->cc_ploaded, ccp->cc_prounds); + continue; + } + + /* + * If the magazine layer is disabled, break out now. + */ + if (ccp->cc_magsize == 0) + break; + + /* + * Try to get an empty magazine from the depot. + */ + emp = umem_depot_alloc(cp, &cp->cache_empty); + if (emp != NULL) { + if (ccp->cc_ploaded != NULL) + umem_depot_free(cp, &cp->cache_full, + ccp->cc_ploaded); + umem_cpu_reload(ccp, emp, 0); + continue; + } + + /* + * There are no empty magazines in the depot, + * so try to allocate a new one. We must drop all locks + * across umem_cache_alloc() because lower layers may + * attempt to allocate from this cache. + */ + mtp = cp->cache_magtype; + (void) mutex_unlock(&ccp->cc_lock); + emp = _umem_cache_alloc(mtp->mt_cache, UMEM_DEFAULT); + (void) mutex_lock(&ccp->cc_lock); + + if (emp != NULL) { + /* + * We successfully allocated an empty magazine. + * However, we had to drop ccp->cc_lock to do it, + * so the cache's magazine size may have changed. + * If so, free the magazine and try again. + */ + if (ccp->cc_magsize != mtp->mt_magsize) { + (void) mutex_unlock(&ccp->cc_lock); + _umem_cache_free(mtp->mt_cache, emp); + (void) mutex_lock(&ccp->cc_lock); + continue; + } + + /* + * We got a magazine of the right size. Add it to + * the depot and try the whole dance again. + */ + umem_depot_free(cp, &cp->cache_empty, emp); + continue; + } + + /* + * We couldn't allocate an empty magazine, + * so fall through to the slab layer. + */ + break; + } + (void) mutex_unlock(&ccp->cc_lock); + + /* + * We couldn't free our constructed object to the magazine layer, + * so apply its destructor and free it to the slab layer. + * Note that if UMF_BUFTAG is in effect, umem_cache_free_debug() + * will have already applied the destructor. 
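These two routines back the public umem_cache_alloc()/umem_cache_free() pair. A minimal consumer-side sketch, assuming the standard <umem.h> declarations; the cache name, object type and sizes are all made up:

#include <umem.h>
#include <string.h>

typedef struct node {
        struct node     *n_next;
        char            n_name[32];
} node_t;

/*ARGSUSED*/
static int
node_ctor(void *buf, void *arg, int flags)
{
        memset(buf, 0, sizeof (node_t));        /* hand out zeroed nodes */
        return (0);
}

int
main(void)
{
        umem_cache_t *nc;
        node_t *n;

        nc = umem_cache_create("node_cache", sizeof (node_t), 0,
            node_ctor, NULL, NULL, NULL, NULL, 0);
        if (nc == NULL)
                return (1);

        n = umem_cache_alloc(nc, UMEM_DEFAULT);
        if (n != NULL)
                umem_cache_free(nc, n);
        umem_cache_destroy(nc);
        return (0);
}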
+ */ + if (!(cp->cache_flags & UMF_BUFTAG) && cp->cache_destructor != NULL) + cp->cache_destructor(buf, cp->cache_private); + + umem_slab_free(cp, buf); +} + +#ifndef NO_WEAK_SYMBOLS +#pragma weak umem_zalloc = _umem_zalloc +#endif +void * +_umem_zalloc(size_t size, int umflag) +{ + size_t index = (size - 1) >> UMEM_ALIGN_SHIFT; + void *buf; + +retry: + if (index < UMEM_MAXBUF >> UMEM_ALIGN_SHIFT) { + umem_cache_t *cp = umem_alloc_table[index]; + buf = _umem_cache_alloc(cp, umflag); + if (buf != NULL) { + if (cp->cache_flags & UMF_BUFTAG) { + umem_buftag_t *btp = UMEM_BUFTAG(cp, buf); + ((uint8_t *)buf)[size] = UMEM_REDZONE_BYTE; + ((uint32_t *)btp)[1] = UMEM_SIZE_ENCODE(size); + } + bzero(buf, size); + } else if (umem_alloc_retry(cp, umflag)) + goto retry; + } else { + buf = _umem_alloc(size, umflag); /* handles failure */ + if (buf != NULL) + bzero(buf, size); + } + return (buf); +} + +#ifndef NO_WEAK_SYMBOLS +#pragma weak umem_alloc = _umem_alloc +#endif +void * +_umem_alloc(size_t size, int umflag) +{ + size_t index = (size - 1) >> UMEM_ALIGN_SHIFT; + void *buf; +umem_alloc_retry: + if (index < UMEM_MAXBUF >> UMEM_ALIGN_SHIFT) { + umem_cache_t *cp = umem_alloc_table[index]; + buf = _umem_cache_alloc(cp, umflag); + if ((cp->cache_flags & UMF_BUFTAG) && buf != NULL) { + umem_buftag_t *btp = UMEM_BUFTAG(cp, buf); + ((uint8_t *)buf)[size] = UMEM_REDZONE_BYTE; + ((uint32_t *)btp)[1] = UMEM_SIZE_ENCODE(size); + } + if (buf == NULL && umem_alloc_retry(cp, umflag)) + goto umem_alloc_retry; + return (buf); + } + if (size == 0) + return (NULL); + if (umem_oversize_arena == NULL) { + if (umem_init()) + ASSERT(umem_oversize_arena != NULL); + else + return (NULL); + } + buf = vmem_alloc(umem_oversize_arena, size, UMEM_VMFLAGS(umflag)); + if (buf == NULL) { + umem_log_event(umem_failure_log, NULL, NULL, (void *)size); + if (umem_alloc_retry(NULL, umflag)) + goto umem_alloc_retry; + } + return (buf); +} + +#ifndef NO_WEAK_SYMBOLS +#pragma weak umem_alloc_align = _umem_alloc_align +#endif +void * +_umem_alloc_align(size_t size, size_t align, int umflag) +{ + void *buf; + + if (size == 0) + return (NULL); + if ((align & (align - 1)) != 0) + return (NULL); + if (align < UMEM_ALIGN) + align = UMEM_ALIGN; + +umem_alloc_align_retry: + if (umem_memalign_arena == NULL) { + if (umem_init()) + ASSERT(umem_oversize_arena != NULL); + else + return (NULL); + } + buf = vmem_xalloc(umem_memalign_arena, size, align, 0, 0, NULL, NULL, + UMEM_VMFLAGS(umflag)); + if (buf == NULL) { + umem_log_event(umem_failure_log, NULL, NULL, (void *)size); + if (umem_alloc_retry(NULL, umflag)) + goto umem_alloc_align_retry; + } + return (buf); +} + +#ifndef NO_WEAK_SYMBOLS +#pragma weak umem_free = _umem_free +#endif +void +_umem_free(void *buf, size_t size) +{ + size_t index = (size - 1) >> UMEM_ALIGN_SHIFT; + + if (index < UMEM_MAXBUF >> UMEM_ALIGN_SHIFT) { + umem_cache_t *cp = umem_alloc_table[index]; + if (cp->cache_flags & UMF_BUFTAG) { + umem_buftag_t *btp = UMEM_BUFTAG(cp, buf); + uint32_t *ip = (uint32_t *)btp; + if (ip[1] != UMEM_SIZE_ENCODE(size)) { + if (*(uint64_t *)buf == UMEM_FREE_PATTERN) { + umem_error(UMERR_DUPFREE, cp, buf); + return; + } + if (UMEM_SIZE_VALID(ip[1])) { + ip[0] = UMEM_SIZE_ENCODE(size); + umem_error(UMERR_BADSIZE, cp, buf); + } else { + umem_error(UMERR_REDZONE, cp, buf); + } + return; + } + if (((uint8_t *)buf)[size] != UMEM_REDZONE_BYTE) { + umem_error(UMERR_REDZONE, cp, buf); + return; + } + btp->bt_redzone = UMEM_REDZONE_PATTERN; + } + _umem_cache_free(cp, buf); + } else { + if (buf == NULL && 
size == 0) + return; + vmem_free(umem_oversize_arena, buf, size); + } +} + +#ifndef NO_WEAK_SYMBOLS +#pragma weak umem_free_align = _umem_free_align +#endif +void +_umem_free_align(void *buf, size_t size) +{ + if (buf == NULL && size == 0) + return; + vmem_xfree(umem_memalign_arena, buf, size); +} + +static void * +umem_firewall_va_alloc(vmem_t *vmp, size_t size, int vmflag) +{ + size_t realsize = size + vmp->vm_quantum; + + /* + * Annoying edge case: if 'size' is just shy of ULONG_MAX, adding + * vm_quantum will cause integer wraparound. Check for this, and + * blow off the firewall page in this case. Note that such a + * giant allocation (the entire address space) can never be + * satisfied, so it will either fail immediately (VM_NOSLEEP) + * or sleep forever (VM_SLEEP). Thus, there is no need for a + * corresponding check in umem_firewall_va_free(). + */ + if (realsize < size) + realsize = size; + + return (vmem_alloc(vmp, realsize, vmflag | VM_NEXTFIT)); +} + +static void +umem_firewall_va_free(vmem_t *vmp, void *addr, size_t size) +{ + vmem_free(vmp, addr, size + vmp->vm_quantum); +} + +/* + * Reclaim all unused memory from a cache. + */ +static void +umem_cache_reap(umem_cache_t *cp) +{ + /* + * Ask the cache's owner to free some memory if possible. + * The idea is to handle things like the inode cache, which + * typically sits on a bunch of memory that it doesn't truly + * *need*. Reclaim policy is entirely up to the owner; this + * callback is just an advisory plea for help. + */ + if (cp->cache_reclaim != NULL) + cp->cache_reclaim(cp->cache_private); + + umem_depot_ws_reap(cp); +} + +/* + * Purge all magazines from a cache and set its magazine limit to zero. + * All calls are serialized by being done by the update thread, except for + * the final call from umem_cache_destroy(). + */ +static void +umem_cache_magazine_purge(umem_cache_t *cp) +{ + umem_cpu_cache_t *ccp; + umem_magazine_t *mp, *pmp; + int rounds, prounds, cpu_seqid; + + ASSERT(cp->cache_next == NULL || IN_UPDATE()); + + for (cpu_seqid = 0; cpu_seqid < umem_max_ncpus; cpu_seqid++) { + ccp = &cp->cache_cpu[cpu_seqid]; + + (void) mutex_lock(&ccp->cc_lock); + mp = ccp->cc_loaded; + pmp = ccp->cc_ploaded; + rounds = ccp->cc_rounds; + prounds = ccp->cc_prounds; + ccp->cc_loaded = NULL; + ccp->cc_ploaded = NULL; + ccp->cc_rounds = -1; + ccp->cc_prounds = -1; + ccp->cc_magsize = 0; + (void) mutex_unlock(&ccp->cc_lock); + + if (mp) + umem_magazine_destroy(cp, mp, rounds); + if (pmp) + umem_magazine_destroy(cp, pmp, prounds); + } + + /* + * Updating the working set statistics twice in a row has the + * effect of setting the working set size to zero, so everything + * is eligible for reaping. + */ + umem_depot_ws_update(cp); + umem_depot_ws_update(cp); + + umem_depot_ws_reap(cp); +} + +/* + * Enable per-cpu magazines on a cache. + */ +static void +umem_cache_magazine_enable(umem_cache_t *cp) +{ + int cpu_seqid; + + if (cp->cache_flags & UMF_NOMAGAZINE) + return; + + for (cpu_seqid = 0; cpu_seqid < umem_max_ncpus; cpu_seqid++) { + umem_cpu_cache_t *ccp = &cp->cache_cpu[cpu_seqid]; + (void) mutex_lock(&ccp->cc_lock); + ccp->cc_magsize = cp->cache_magtype->mt_magsize; + (void) mutex_unlock(&ccp->cc_lock); + } + +} + +/* + * Recompute a cache's magazine size. The trade-off is that larger magazines + * provide a higher transfer rate with the depot, while smaller magazines + * reduce memory consumption. Magazine resizing is an expensive operation; + * it should not be done frequently. 
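One earlier detail that benefits from a concrete example is the wraparound guard in umem_firewall_va_alloc() above: adding the quantum-sized firewall page to a near-SIZE_MAX request simply wraps the unsigned sum, and the guard falls back to the unpadded size.

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
        size_t quantum = 4096;                  /* made-up vm_quantum */
        size_t size = SIZE_MAX - 100;           /* pathological request */
        size_t realsize = size + quantum;       /* wraps around to a tiny value */

        if (realsize < size)                    /* the check used above */
                realsize = size;

        printf("firewall page %s\n",
            realsize == size ? "dropped (sum wrapped)" : "added");
        return (0);
}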
+ * + * Changes to the magazine size are serialized by only having one thread + * doing updates. (the update thread) + * + * Note: at present this only grows the magazine size. It might be useful + * to allow shrinkage too. + */ +static void +umem_cache_magazine_resize(umem_cache_t *cp) +{ + umem_magtype_t *mtp = cp->cache_magtype; + + ASSERT(IN_UPDATE()); + + if (cp->cache_chunksize < mtp->mt_maxbuf) { + umem_cache_magazine_purge(cp); + (void) mutex_lock(&cp->cache_depot_lock); + cp->cache_magtype = ++mtp; + cp->cache_depot_contention_prev = + cp->cache_depot_contention + INT_MAX; + (void) mutex_unlock(&cp->cache_depot_lock); + umem_cache_magazine_enable(cp); + } +} + +/* + * Rescale a cache's hash table, so that the table size is roughly the + * cache size. We want the average lookup time to be extremely small. + */ +static void +umem_hash_rescale(umem_cache_t *cp) +{ + umem_bufctl_t **old_table, **new_table, *bcp; + size_t old_size, new_size, h; + + ASSERT(IN_UPDATE()); + + new_size = MAX(UMEM_HASH_INITIAL, + 1 << (highbit(3 * cp->cache_buftotal + 4) - 2)); + old_size = cp->cache_hash_mask + 1; + + if ((old_size >> 1) <= new_size && new_size <= (old_size << 1)) + return; + + new_table = vmem_alloc(umem_hash_arena, new_size * sizeof (void *), + VM_NOSLEEP); + if (new_table == NULL) + return; + bzero(new_table, new_size * sizeof (void *)); + + (void) mutex_lock(&cp->cache_lock); + + old_size = cp->cache_hash_mask + 1; + old_table = cp->cache_hash_table; + + cp->cache_hash_mask = new_size - 1; + cp->cache_hash_table = new_table; + cp->cache_rescale++; + + for (h = 0; h < old_size; h++) { + bcp = old_table[h]; + while (bcp != NULL) { + void *addr = bcp->bc_addr; + umem_bufctl_t *next_bcp = bcp->bc_next; + umem_bufctl_t **hash_bucket = UMEM_HASH(cp, addr); + bcp->bc_next = *hash_bucket; + *hash_bucket = bcp; + bcp = next_bcp; + } + } + + (void) mutex_unlock(&cp->cache_lock); + + vmem_free(umem_hash_arena, old_table, old_size * sizeof (void *)); +} + +/* + * Perform periodic maintenance on a cache: hash rescaling, + * depot working-set update, and magazine resizing. + */ +void +umem_cache_update(umem_cache_t *cp) +{ + int update_flags = 0; + + ASSERT(MUTEX_HELD(&umem_cache_lock)); + + /* + * If the cache has become much larger or smaller than its hash table, + * fire off a request to rescale the hash table. + */ + (void) mutex_lock(&cp->cache_lock); + + if ((cp->cache_flags & UMF_HASH) && + (cp->cache_buftotal > (cp->cache_hash_mask << 1) || + (cp->cache_buftotal < (cp->cache_hash_mask >> 1) && + cp->cache_hash_mask > UMEM_HASH_INITIAL))) + update_flags |= UMU_HASH_RESCALE; + + (void) mutex_unlock(&cp->cache_lock); + + /* + * Update the depot working set statistics. + */ + umem_depot_ws_update(cp); + + /* + * If there's a lot of contention in the depot, + * increase the magazine size. + */ + (void) mutex_lock(&cp->cache_depot_lock); + + if (cp->cache_chunksize < cp->cache_magtype->mt_maxbuf && + (int)(cp->cache_depot_contention - + cp->cache_depot_contention_prev) > umem_depot_contention) + update_flags |= UMU_MAGAZINE_RESIZE; + + cp->cache_depot_contention_prev = cp->cache_depot_contention; + + (void) mutex_unlock(&cp->cache_depot_lock); + + if (update_flags) + umem_add_update(cp, update_flags); +} + +/* + * Runs all pending updates. + * + * The update lock must be held on entrance, and will be held on exit. 
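umem_hash_rescale() above sizes the new table as MAX(UMEM_HASH_INITIAL, 1 << (highbit(3 * cache_buftotal + 4) - 2)), i.e. on the order of one bucket per buffer, rounded to a power of two. A quick standalone calculation; the 64-entry UMEM_HASH_INITIAL and the highbit() body are assumptions that mirror the usual definitions:

#include <stdio.h>

#define MAX(a, b)               ((a) > (b) ? (a) : (b))
#define UMEM_HASH_INITIAL       64      /* assumed initial table size */

/* 1-based index of the most significant set bit, as in the Solaris helper */
static int
highbit(unsigned long v)
{
        int h = 0;

        while (v != 0) {
                h++;
                v >>= 1;
        }
        return (h);
}

int
main(void)
{
        unsigned long totals[] = { 10, 100, 1000, 100000 };
        size_t i;

        for (i = 0; i < sizeof (totals) / sizeof (totals[0]); i++) {
                unsigned long new_size = MAX(UMEM_HASH_INITIAL,
                    1UL << (highbit(3 * totals[i] + 4) - 2));
                printf("%7lu buffers -> %6lu hash buckets\n",
                    totals[i], new_size);
        }
        return (0);
}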
+ */ +void +umem_process_updates(void) +{ + ASSERT(MUTEX_HELD(&umem_update_lock)); + + while (umem_null_cache.cache_unext != &umem_null_cache) { + int notify = 0; + umem_cache_t *cp = umem_null_cache.cache_unext; + + cp->cache_uprev->cache_unext = cp->cache_unext; + cp->cache_unext->cache_uprev = cp->cache_uprev; + cp->cache_uprev = cp->cache_unext = NULL; + + ASSERT(!(cp->cache_uflags & UMU_ACTIVE)); + + while (cp->cache_uflags) { + int uflags = (cp->cache_uflags |= UMU_ACTIVE); + (void) mutex_unlock(&umem_update_lock); + + /* + * The order here is important. Each step can speed up + * later steps. + */ + + if (uflags & UMU_HASH_RESCALE) + umem_hash_rescale(cp); + + if (uflags & UMU_MAGAZINE_RESIZE) + umem_cache_magazine_resize(cp); + + if (uflags & UMU_REAP) + umem_cache_reap(cp); + + (void) mutex_lock(&umem_update_lock); + + /* + * check if anyone has requested notification + */ + if (cp->cache_uflags & UMU_NOTIFY) { + uflags |= UMU_NOTIFY; + notify = 1; + } + cp->cache_uflags &= ~uflags; + } + if (notify) + (void) cond_broadcast(&umem_update_cv); + } +} + +#ifndef UMEM_STANDALONE +static void +umem_st_update(void) +{ + ASSERT(MUTEX_HELD(&umem_update_lock)); + ASSERT(umem_update_thr == 0 && umem_st_update_thr == 0); + + umem_st_update_thr = thr_self(); + + (void) mutex_unlock(&umem_update_lock); + + vmem_update(NULL); + umem_cache_applyall(umem_cache_update); + + (void) mutex_lock(&umem_update_lock); + + umem_process_updates(); /* does all of the requested work */ + + umem_reap_next = gethrtime() + + (hrtime_t)umem_reap_interval * NANOSEC; + + umem_reaping = UMEM_REAP_DONE; + + umem_st_update_thr = 0; +} +#endif + +/* + * Reclaim all unused memory from all caches. Called from vmem when memory + * gets tight. Must be called with no locks held. + * + * This just requests a reap on all caches, and notifies the update thread. + */ +void +umem_reap(void) +{ +#ifndef UMEM_STANDALONE + extern int __nthreads(void); +#endif + + if (umem_ready != UMEM_READY || umem_reaping != UMEM_REAP_DONE || + gethrtime() < umem_reap_next) + return; + + (void) mutex_lock(&umem_update_lock); + + if (umem_reaping != UMEM_REAP_DONE || gethrtime() < umem_reap_next) { + (void) mutex_unlock(&umem_update_lock); + return; + } + + umem_reaping = UMEM_REAP_ADDING; /* lock out other reaps */ + + (void) mutex_unlock(&umem_update_lock); + + umem_updateall(UMU_REAP); + + (void) mutex_lock(&umem_update_lock); + + umem_reaping = UMEM_REAP_ACTIVE; + + /* Standalone is single-threaded */ +#ifndef UMEM_STANDALONE + if (umem_update_thr == 0) { + /* + * The update thread does not exist. If the process is + * multi-threaded, create it. If not, or the creation fails, + * do the update processing inline. 
+ */ + ASSERT(umem_st_update_thr == 0); + + if (__nthreads() <= 1 || umem_create_update_thread() == 0) + umem_st_update(); + } + + (void) cond_broadcast(&umem_update_cv); /* wake up the update thread */ +#endif + + (void) mutex_unlock(&umem_update_lock); +} + +umem_cache_t * +umem_cache_create( + char *name, /* descriptive name for this cache */ + size_t bufsize, /* size of the objects it manages */ + size_t align, /* required object alignment */ + umem_constructor_t *constructor, /* object constructor */ + umem_destructor_t *destructor, /* object destructor */ + umem_reclaim_t *reclaim, /* memory reclaim callback */ + void *private, /* pass-thru arg for constr/destr/reclaim */ + vmem_t *vmp, /* vmem source for slab allocation */ + int cflags) /* cache creation flags */ +{ + int cpu_seqid; + size_t chunksize; + umem_cache_t *cp, *cnext, *cprev; + umem_magtype_t *mtp; + size_t csize; + size_t phase; + + /* + * The init thread is allowed to create internal and quantum caches. + * + * Other threads must wait until until initialization is complete. + */ + if (umem_init_thr == thr_self()) + ASSERT((cflags & (UMC_INTERNAL | UMC_QCACHE)) != 0); + else { + ASSERT(!(cflags & UMC_INTERNAL)); + if (umem_ready != UMEM_READY && umem_init() == 0) { + errno = EAGAIN; + return (NULL); + } + } + + csize = UMEM_CACHE_SIZE(umem_max_ncpus); + phase = P2NPHASE(csize, UMEM_CPU_CACHE_SIZE); + + if (vmp == NULL) + vmp = umem_default_arena; + + ASSERT(P2PHASE(phase, UMEM_ALIGN) == 0); + + /* + * Check that the arguments are reasonable + */ + if ((align & (align - 1)) != 0 || align > vmp->vm_quantum || + ((cflags & UMC_NOHASH) && (cflags & UMC_NOTOUCH)) || + name == NULL || bufsize == 0) { + errno = EINVAL; + return (NULL); + } + + /* + * If align == 0, we set it to the minimum required alignment. + * + * If align < UMEM_ALIGN, we round it up to UMEM_ALIGN, unless + * UMC_NOTOUCH was passed. + */ + if (align == 0) { + if (P2ROUNDUP(bufsize, UMEM_ALIGN) >= UMEM_SECOND_ALIGN) + align = UMEM_SECOND_ALIGN; + else + align = UMEM_ALIGN; + } else if (align < UMEM_ALIGN && (cflags & UMC_NOTOUCH) == 0) + align = UMEM_ALIGN; + + + /* + * Get a umem_cache structure. We arrange that cp->cache_cpu[] + * is aligned on a UMEM_CPU_CACHE_SIZE boundary to prevent + * false sharing of per-CPU data. + */ + cp = vmem_xalloc(umem_cache_arena, csize, UMEM_CPU_CACHE_SIZE, phase, + 0, NULL, NULL, VM_NOSLEEP); + + if (cp == NULL) { + errno = EAGAIN; + return (NULL); + } + + bzero(cp, csize); + + (void) mutex_lock(&umem_flags_lock); + if (umem_flags & UMF_RANDOMIZE) + umem_flags = (((umem_flags | ~UMF_RANDOM) + 1) & UMF_RANDOM) | + UMF_RANDOMIZE; + cp->cache_flags = umem_flags | (cflags & UMF_DEBUG); + (void) mutex_unlock(&umem_flags_lock); + + /* + * Make sure all the various flags are reasonable. 
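The argument check above rejects alignments that are not powers of two using the usual (align & (align - 1)) trick, the same test _umem_alloc_align() applies; for reference:

#include <stdio.h>

static int
is_p2(size_t align)
{
        /* A power of two has exactly one bit set, so clearing the lowest
           set bit must leave zero.  Zero also passes, because the callers
           above treat align == 0 as "use the default alignment". */
        return ((align & (align - 1)) == 0);
}

int
main(void)
{
        size_t vals[] = { 0, 1, 8, 24, 64, 100, 4096 };
        size_t i;

        for (i = 0; i < sizeof (vals) / sizeof (vals[0]); i++)
                printf("align %4zu: %s\n", vals[i],
                    is_p2(vals[i]) ? "accepted" : "EINVAL");
        return (0);
}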
+ */ + if (cp->cache_flags & UMF_LITE) { + if (bufsize >= umem_lite_minsize && + align <= umem_lite_maxalign && + P2PHASE(bufsize, umem_lite_maxalign) != 0) { + cp->cache_flags |= UMF_BUFTAG; + cp->cache_flags &= ~(UMF_AUDIT | UMF_FIREWALL); + } else { + cp->cache_flags &= ~UMF_DEBUG; + } + } + + if ((cflags & UMC_QCACHE) && (cp->cache_flags & UMF_AUDIT)) + cp->cache_flags |= UMF_NOMAGAZINE; + + if (cflags & UMC_NODEBUG) + cp->cache_flags &= ~UMF_DEBUG; + + if (cflags & UMC_NOTOUCH) + cp->cache_flags &= ~UMF_TOUCH; + + if (cflags & UMC_NOHASH) + cp->cache_flags &= ~(UMF_AUDIT | UMF_FIREWALL); + + if (cflags & UMC_NOMAGAZINE) + cp->cache_flags |= UMF_NOMAGAZINE; + + if ((cp->cache_flags & UMF_AUDIT) && !(cflags & UMC_NOTOUCH)) + cp->cache_flags |= UMF_REDZONE; + + if ((cp->cache_flags & UMF_BUFTAG) && bufsize >= umem_minfirewall && + !(cp->cache_flags & UMF_LITE) && !(cflags & UMC_NOHASH)) + cp->cache_flags |= UMF_FIREWALL; + + if (vmp != umem_default_arena || umem_firewall_arena == NULL) + cp->cache_flags &= ~UMF_FIREWALL; + + if (cp->cache_flags & UMF_FIREWALL) { + cp->cache_flags &= ~UMF_BUFTAG; + cp->cache_flags |= UMF_NOMAGAZINE; + ASSERT(vmp == umem_default_arena); + vmp = umem_firewall_arena; + } + + /* + * Set cache properties. + */ + (void) strncpy(cp->cache_name, name, sizeof (cp->cache_name) - 1); + cp->cache_bufsize = bufsize; + cp->cache_align = align; + cp->cache_constructor = constructor; + cp->cache_destructor = destructor; + cp->cache_reclaim = reclaim; + cp->cache_private = private; + cp->cache_arena = vmp; + cp->cache_cflags = cflags; + cp->cache_cpu_mask = umem_cpu_mask; + + /* + * Determine the chunk size. + */ + chunksize = bufsize; + + if (align >= UMEM_ALIGN) { + chunksize = P2ROUNDUP(chunksize, UMEM_ALIGN); + cp->cache_bufctl = chunksize - UMEM_ALIGN; + } + + if (cp->cache_flags & UMF_BUFTAG) { + cp->cache_bufctl = chunksize; + cp->cache_buftag = chunksize; + chunksize += sizeof (umem_buftag_t); + } + + if (cp->cache_flags & UMF_DEADBEEF) { + cp->cache_verify = MIN(cp->cache_buftag, umem_maxverify); + if (cp->cache_flags & UMF_LITE) + cp->cache_verify = MIN(cp->cache_verify, UMEM_ALIGN); + } + + cp->cache_contents = MIN(cp->cache_bufctl, umem_content_maxsave); + + cp->cache_chunksize = chunksize = P2ROUNDUP(chunksize, align); + + if (chunksize < bufsize) { + errno = ENOMEM; + goto fail; + } + + /* + * Now that we know the chunk size, determine the optimal slab size. 
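To make the chunk layout above concrete: with a made-up 100-byte buffer, UMEM_ALIGN of 8 and an illustrative 16-byte buftag (the real sizeof (umem_buftag_t) is platform-dependent), the debugging metadata lands as follows.

#include <stdio.h>

#define UMEM_ALIGN      8
#define P2ROUNDUP(x, a) (-(-(x) & -(a)))

int
main(void)
{
        size_t bufsize = 100, align = UMEM_ALIGN;
        size_t buftag_size = 16;        /* illustrative sizeof (umem_buftag_t) */

        size_t chunksize = P2ROUNDUP(bufsize, UMEM_ALIGN);      /* 104 */
        size_t bufctl_off = chunksize - UMEM_ALIGN;             /* trailing word */

        /* With UMF_BUFTAG, the tag follows the buffer instead. */
        size_t buftag_off = chunksize;
        chunksize = P2ROUNDUP(chunksize + buftag_size, align);

        printf("bufctl at %zu, buftag at %zu, chunk is %zu bytes\n",
            bufctl_off, buftag_off, chunksize);
        return (0);
}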
+ */ + if (vmp == umem_firewall_arena) { + cp->cache_slabsize = P2ROUNDUP(chunksize, vmp->vm_quantum); + cp->cache_mincolor = cp->cache_slabsize - chunksize; + cp->cache_maxcolor = cp->cache_mincolor; + cp->cache_flags |= UMF_HASH; + ASSERT(!(cp->cache_flags & UMF_BUFTAG)); + } else if ((cflags & UMC_NOHASH) || (!(cflags & UMC_NOTOUCH) && + !(cp->cache_flags & UMF_AUDIT) && + chunksize < vmp->vm_quantum / UMEM_VOID_FRACTION)) { + cp->cache_slabsize = vmp->vm_quantum; + cp->cache_mincolor = 0; + cp->cache_maxcolor = + (cp->cache_slabsize - sizeof (umem_slab_t)) % chunksize; + + if (chunksize + sizeof (umem_slab_t) > cp->cache_slabsize) { + errno = EINVAL; + goto fail; + } + ASSERT(!(cp->cache_flags & UMF_AUDIT)); + } else { + size_t chunks, bestfit, waste, slabsize; + size_t minwaste = LONG_MAX; + + for (chunks = 1; chunks <= UMEM_VOID_FRACTION; chunks++) { + slabsize = P2ROUNDUP(chunksize * chunks, + vmp->vm_quantum); + /* + * check for overflow + */ + if ((slabsize / chunks) < chunksize) { + errno = ENOMEM; + goto fail; + } + chunks = slabsize / chunksize; + waste = (slabsize % chunksize) / chunks; + if (waste < minwaste) { + minwaste = waste; + bestfit = slabsize; + } + } + if (cflags & UMC_QCACHE) + bestfit = MAX(1 << highbit(3 * vmp->vm_qcache_max), 64); + cp->cache_slabsize = bestfit; + cp->cache_mincolor = 0; + cp->cache_maxcolor = bestfit % chunksize; + cp->cache_flags |= UMF_HASH; + } + + if (cp->cache_flags & UMF_HASH) { + ASSERT(!(cflags & UMC_NOHASH)); + cp->cache_bufctl_cache = (cp->cache_flags & UMF_AUDIT) ? + umem_bufctl_audit_cache : umem_bufctl_cache; + } + + if (cp->cache_maxcolor >= vmp->vm_quantum) + cp->cache_maxcolor = vmp->vm_quantum - 1; + + cp->cache_color = cp->cache_mincolor; + + /* + * Initialize the rest of the slab layer. + */ + (void) mutex_init(&cp->cache_lock, USYNC_THREAD, NULL); + + cp->cache_freelist = &cp->cache_nullslab; + cp->cache_nullslab.slab_cache = cp; + cp->cache_nullslab.slab_refcnt = -1; + cp->cache_nullslab.slab_next = &cp->cache_nullslab; + cp->cache_nullslab.slab_prev = &cp->cache_nullslab; + + if (cp->cache_flags & UMF_HASH) { + cp->cache_hash_table = vmem_alloc(umem_hash_arena, + UMEM_HASH_INITIAL * sizeof (void *), VM_NOSLEEP); + if (cp->cache_hash_table == NULL) { + errno = EAGAIN; + goto fail_lock; + } + bzero(cp->cache_hash_table, + UMEM_HASH_INITIAL * sizeof (void *)); + cp->cache_hash_mask = UMEM_HASH_INITIAL - 1; + cp->cache_hash_shift = highbit((ulong_t)chunksize) - 1; + } + + /* + * Initialize the depot. + */ + (void) mutex_init(&cp->cache_depot_lock, USYNC_THREAD, NULL); + + for (mtp = umem_magtype; chunksize <= mtp->mt_minbuf; mtp++) + continue; + + cp->cache_magtype = mtp; + + /* + * Initialize the CPU layer. + */ + for (cpu_seqid = 0; cpu_seqid < umem_max_ncpus; cpu_seqid++) { + umem_cpu_cache_t *ccp = &cp->cache_cpu[cpu_seqid]; + (void) mutex_init(&ccp->cc_lock, USYNC_THREAD, NULL); + ccp->cc_flags = cp->cache_flags; + ccp->cc_rounds = -1; + ccp->cc_prounds = -1; + } + + /* + * Add the cache to the global list. This makes it visible + * to umem_update(), so the cache must be ready for business. 
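The best-fit search above can be replayed standalone. UMEM_VOID_FRACTION and the cache parameters below are assumptions chosen only to show how the loop trades internal fragmentation against slab size:

#include <stdio.h>

#define P2ROUNDUP(x, a)         (-(-(x) & -(a)))
#define UMEM_VOID_FRACTION      8       /* assumed value for this sketch */

int
main(void)
{
        size_t chunksize = 3000, quantum = 4096;        /* made-up cache */
        size_t chunks, waste, slabsize, bestfit = 0;
        size_t minwaste = (size_t)-1;

        for (chunks = 1; chunks <= UMEM_VOID_FRACTION; chunks++) {
                slabsize = P2ROUNDUP(chunksize * chunks, quantum);
                chunks = slabsize / chunksize;  /* same re-derivation as above */
                waste = (slabsize % chunksize) / chunks;
                if (waste < minwaste) {
                        minwaste = waste;
                        bestfit = slabsize;
                }
        }
        printf("best slab size %zu (%zu wasted bytes per chunk)\n",
            bestfit, minwaste);
        return (0);
}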
+ */ + (void) mutex_lock(&umem_cache_lock); + cp->cache_next = cnext = &umem_null_cache; + cp->cache_prev = cprev = umem_null_cache.cache_prev; + cnext->cache_prev = cp; + cprev->cache_next = cp; + (void) mutex_unlock(&umem_cache_lock); + + if (umem_ready == UMEM_READY) + umem_cache_magazine_enable(cp); + + return (cp); + +fail_lock: + (void) mutex_destroy(&cp->cache_lock); +fail: + vmem_xfree(umem_cache_arena, cp, csize); + return (NULL); +} + +void +umem_cache_destroy(umem_cache_t *cp) +{ + int cpu_seqid; + + /* + * Remove the cache from the global cache list so that no new updates + * will be scheduled on its behalf, wait for any pending tasks to + * complete, purge the cache, and then destroy it. + */ + (void) mutex_lock(&umem_cache_lock); + cp->cache_prev->cache_next = cp->cache_next; + cp->cache_next->cache_prev = cp->cache_prev; + cp->cache_prev = cp->cache_next = NULL; + (void) mutex_unlock(&umem_cache_lock); + + umem_remove_updates(cp); + + umem_cache_magazine_purge(cp); + + (void) mutex_lock(&cp->cache_lock); + if (cp->cache_buftotal != 0) + log_message("umem_cache_destroy: '%s' (%p) not empty\n", + cp->cache_name, (void *)cp); + cp->cache_reclaim = NULL; + /* + * The cache is now dead. There should be no further activity. + * We enforce this by setting land mines in the constructor and + * destructor routines that induce a segmentation fault if invoked. + */ + cp->cache_constructor = (umem_constructor_t *)1; + cp->cache_destructor = (umem_destructor_t *)2; + (void) mutex_unlock(&cp->cache_lock); + + if (cp->cache_hash_table != NULL) + vmem_free(umem_hash_arena, cp->cache_hash_table, + (cp->cache_hash_mask + 1) * sizeof (void *)); + + for (cpu_seqid = 0; cpu_seqid < umem_max_ncpus; cpu_seqid++) + (void) mutex_destroy(&cp->cache_cpu[cpu_seqid].cc_lock); + + (void) mutex_destroy(&cp->cache_depot_lock); + (void) mutex_destroy(&cp->cache_lock); + + vmem_free(umem_cache_arena, cp, UMEM_CACHE_SIZE(umem_max_ncpus)); +} + +static int +umem_cache_init(void) +{ + int i; + size_t size, max_size; + umem_cache_t *cp; + umem_magtype_t *mtp; + char name[UMEM_CACHE_NAMELEN + 1]; + umem_cache_t *umem_alloc_caches[NUM_ALLOC_SIZES]; + + for (i = 0; i < sizeof (umem_magtype) / sizeof (*mtp); i++) { + mtp = &umem_magtype[i]; + (void) snprintf(name, sizeof (name), "umem_magazine_%d", + mtp->mt_magsize); + mtp->mt_cache = umem_cache_create(name, + (mtp->mt_magsize + 1) * sizeof (void *), + mtp->mt_align, NULL, NULL, NULL, NULL, + umem_internal_arena, UMC_NOHASH | UMC_INTERNAL); + if (mtp->mt_cache == NULL) + return (0); + } + + umem_slab_cache = umem_cache_create("umem_slab_cache", + sizeof (umem_slab_t), 0, NULL, NULL, NULL, NULL, + umem_internal_arena, UMC_NOHASH | UMC_INTERNAL); + + if (umem_slab_cache == NULL) + return (0); + + umem_bufctl_cache = umem_cache_create("umem_bufctl_cache", + sizeof (umem_bufctl_t), 0, NULL, NULL, NULL, NULL, + umem_internal_arena, UMC_NOHASH | UMC_INTERNAL); + + if (umem_bufctl_cache == NULL) + return (0); + + /* + * The size of the umem_bufctl_audit structure depends upon + * umem_stack_depth. See umem_impl.h for details on the size + * restrictions. 
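A simplified model of that size computation: the structure below is a stand-in with made-up fields, not the real umem_bufctl_audit_t, but it shows why the allocation size is offsetof(..., bc_stack[depth]) and therefore scales with the recorded stack depth.

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

typedef struct audit {
        void            *bc_addr;
        uint64_t        bc_timestamp;
        uint32_t        bc_depth;
        uintptr_t       bc_stack[1];    /* really bc_stack[umem_stack_depth] */
} audit_t;

#define AUDIT_SIZE_DEPTH(d)     (offsetof(audit_t, bc_stack[d]))

int
main(void)
{
        printf("depth 15 -> %zu bytes per bufctl, depth 64 -> %zu bytes\n",
            AUDIT_SIZE_DEPTH(15), AUDIT_SIZE_DEPTH(64));
        return (0);
}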
+ */ + + size = UMEM_BUFCTL_AUDIT_SIZE_DEPTH(umem_stack_depth); + max_size = UMEM_BUFCTL_AUDIT_MAX_SIZE; + + if (size > max_size) { /* too large -- truncate */ + int max_frames = UMEM_MAX_STACK_DEPTH; + + ASSERT(UMEM_BUFCTL_AUDIT_SIZE_DEPTH(max_frames) <= max_size); + + umem_stack_depth = max_frames; + size = UMEM_BUFCTL_AUDIT_SIZE_DEPTH(umem_stack_depth); + } + + umem_bufctl_audit_cache = umem_cache_create("umem_bufctl_audit_cache", + size, 0, NULL, NULL, NULL, NULL, umem_internal_arena, + UMC_NOHASH | UMC_INTERNAL); + + if (umem_bufctl_audit_cache == NULL) + return (0); + + if (vmem_backend & VMEM_BACKEND_MMAP) + umem_va_arena = vmem_create("umem_va", + NULL, 0, pagesize, + vmem_alloc, vmem_free, heap_arena, + 8 * pagesize, VM_NOSLEEP); + else + umem_va_arena = heap_arena; + + if (umem_va_arena == NULL) + return (0); + + umem_default_arena = vmem_create("umem_default", + NULL, 0, pagesize, + heap_alloc, heap_free, umem_va_arena, + 0, VM_NOSLEEP); + + if (umem_default_arena == NULL) + return (0); + + /* + * make sure the umem_alloc table initializer is correct + */ + i = sizeof (umem_alloc_table) / sizeof (*umem_alloc_table); + ASSERT(umem_alloc_table[i - 1] == &umem_null_cache); + + /* + * Create the default caches to back umem_alloc() + */ + for (i = 0; i < NUM_ALLOC_SIZES; i++) { + size_t cache_size = umem_alloc_sizes[i]; + size_t align = 0; + /* + * If they allocate a multiple of the coherency granularity, + * they get a coherency-granularity-aligned address. + */ + if (IS_P2ALIGNED(cache_size, 64)) + align = 64; + if (IS_P2ALIGNED(cache_size, pagesize)) + align = pagesize; + (void) snprintf(name, sizeof (name), "umem_alloc_%lu", + (long)cache_size); + + cp = umem_cache_create(name, cache_size, align, + NULL, NULL, NULL, NULL, NULL, UMC_INTERNAL); + if (cp == NULL) + return (0); + + umem_alloc_caches[i] = cp; + } + + /* + * Initialization cannot fail at this point. Make the caches + * visible to umem_alloc() and friends. + */ + size = UMEM_ALIGN; + for (i = 0; i < NUM_ALLOC_SIZES; i++) { + size_t cache_size = umem_alloc_sizes[i]; + + cp = umem_alloc_caches[i]; + + while (size <= cache_size) { + umem_alloc_table[(size - 1) >> UMEM_ALIGN_SHIFT] = cp; + size += UMEM_ALIGN; + } + } + return (1); +} + +/* + * umem_startup() is called early on, and must be called explicitly if we're + * the standalone version. 
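The table-filling loop at the end of umem_cache_init() above is what gives umem_alloc() its constant-time size-to-cache lookup: every UMEM_ALIGN-sized step up to a cache's size points at that cache. A toy version (UMEM_ALIGN of 8 and the size classes are illustrative):

#include <stdio.h>

#define UMEM_ALIGN              8
#define UMEM_ALIGN_SHIFT        3

int
main(void)
{
        /* A few made-up size classes; the real list is umem_alloc_sizes[]. */
        size_t classes[] = { 16, 32, 48, 64 };
        size_t nclasses = sizeof (classes) / sizeof (classes[0]);
        size_t table[64 >> UMEM_ALIGN_SHIFT];   /* one slot per 8-byte step */
        size_t size = UMEM_ALIGN, i;

        for (i = 0; i < nclasses; i++) {
                while (size <= classes[i]) {
                        table[(size - 1) >> UMEM_ALIGN_SHIFT] = classes[i];
                        size += UMEM_ALIGN;
                }
        }

        /* A 40-byte request rounds up to the 48-byte cache. */
        printf("40-byte request -> umem_alloc_%zu cache\n",
            table[(40 - 1) >> UMEM_ALIGN_SHIFT]);
        return (0);
}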
+ */ +static void +umem_startup() __attribute__((constructor)); + +void +umem_startup() +{ + caddr_t start = NULL; + size_t len = 0; + size_t pagesize = 0; + +#ifdef UMEM_STANDALONE + int idx; + /* Standalone doesn't fork */ +#else + umem_forkhandler_init(); /* register the fork handler */ +#endif + +#ifdef __lint + /* make lint happy */ + minstack = maxstack; +#endif + +#ifdef UMEM_STANDALONE + umem_ready = UMEM_READY_STARTUP; + umem_init_env_ready = 0; + + umem_min_stack = minstack; + umem_max_stack = maxstack; + + nofail_callback = NULL; + umem_slab_cache = NULL; + umem_bufctl_cache = NULL; + umem_bufctl_audit_cache = NULL; + heap_arena = NULL; + heap_alloc = NULL; + heap_free = NULL; + umem_internal_arena = NULL; + umem_cache_arena = NULL; + umem_hash_arena = NULL; + umem_log_arena = NULL; + umem_oversize_arena = NULL; + umem_va_arena = NULL; + umem_default_arena = NULL; + umem_firewall_va_arena = NULL; + umem_firewall_arena = NULL; + umem_memalign_arena = NULL; + umem_transaction_log = NULL; + umem_content_log = NULL; + umem_failure_log = NULL; + umem_slab_log = NULL; + umem_cpu_mask = 0; + + umem_cpus = &umem_startup_cpu; + umem_startup_cpu.cpu_cache_offset = UMEM_CACHE_SIZE(0); + umem_startup_cpu.cpu_number = 0; + + bcopy(&umem_null_cache_template, &umem_null_cache, + sizeof (umem_cache_t)); + + for (idx = 0; idx < (UMEM_MAXBUF >> UMEM_ALIGN_SHIFT); idx++) + umem_alloc_table[idx] = &umem_null_cache; +#endif + + /* + * Perform initialization specific to the way we've been compiled + * (library or standalone) + */ + umem_type_init(start, len, pagesize); + + vmem_startup(); +} + +int +umem_init(void) +{ + size_t maxverify, minfirewall; + size_t size; + int idx; + umem_cpu_t *new_cpus; + + vmem_t *memalign_arena, *oversize_arena; + + if (thr_self() != umem_init_thr) { + /* + * The usual case -- non-recursive invocation of umem_init(). + */ + (void) mutex_lock(&umem_init_lock); + if (umem_ready != UMEM_READY_STARTUP) { + /* + * someone else beat us to initializing umem. Wait + * for them to complete, then return. + */ + while (umem_ready == UMEM_READY_INITING) + (void) _cond_wait(&umem_init_cv, + &umem_init_lock); + ASSERT(umem_ready == UMEM_READY || + umem_ready == UMEM_READY_INIT_FAILED); + (void) mutex_unlock(&umem_init_lock); + return (umem_ready == UMEM_READY); + } + + ASSERT(umem_ready == UMEM_READY_STARTUP); + ASSERT(umem_init_env_ready == 0); + + umem_ready = UMEM_READY_INITING; + umem_init_thr = thr_self(); + + (void) mutex_unlock(&umem_init_lock); + umem_setup_envvars(0); /* can recurse -- see below */ + if (umem_init_env_ready) { + /* + * initialization was completed already + */ + ASSERT(umem_ready == UMEM_READY || + umem_ready == UMEM_READY_INIT_FAILED); + ASSERT(umem_init_thr == 0); + return (umem_ready == UMEM_READY); + } + } else if (!umem_init_env_ready) { + /* + * The umem_setup_envvars() call (above) makes calls into + * the dynamic linker and directly into user-supplied code. + * Since we cannot know what that code will do, we could be + * recursively invoked (by, say, a malloc() call in the code + * itself, or in a (C++) _init section it causes to be fired). + * + * This code is where we end up if such recursion occurs. We + * first clean up any partial results in the envvar code, then + * proceed to finish initialization processing in the recursive + * call. The original call will notice this, and return + * immediately. 
+ */ + umem_setup_envvars(1); /* clean up any partial state */ + } else { + umem_panic( + "recursive allocation while initializing umem\n"); + } + umem_init_env_ready = 1; + + /* + * From this point until we finish, recursion into umem_init() will + * cause a umem_panic(). + */ + maxverify = minfirewall = ULONG_MAX; + + /* LINTED constant condition */ + if (sizeof (umem_cpu_cache_t) != UMEM_CPU_CACHE_SIZE) { + umem_panic("sizeof (umem_cpu_cache_t) = %d, should be %d\n", + sizeof (umem_cpu_cache_t), UMEM_CPU_CACHE_SIZE); + } + + umem_max_ncpus = umem_get_max_ncpus(); + + /* + * load tunables from environment + */ + umem_process_envvars(); + + if (issetugid()) + umem_mtbf = 0; + + /* + * set up vmem + */ + if (!(umem_flags & UMF_AUDIT)) + vmem_no_debug(); + + heap_arena = vmem_heap_arena(&heap_alloc, &heap_free); + + pagesize = heap_arena->vm_quantum; + + umem_internal_arena = vmem_create("umem_internal", NULL, 0, pagesize, + heap_alloc, heap_free, heap_arena, 0, VM_NOSLEEP); + + umem_default_arena = umem_internal_arena; + + if (umem_internal_arena == NULL) + goto fail; + + umem_cache_arena = vmem_create("umem_cache", NULL, 0, UMEM_ALIGN, + vmem_alloc, vmem_free, umem_internal_arena, 0, VM_NOSLEEP); + + umem_hash_arena = vmem_create("umem_hash", NULL, 0, UMEM_ALIGN, + vmem_alloc, vmem_free, umem_internal_arena, 0, VM_NOSLEEP); + + umem_log_arena = vmem_create("umem_log", NULL, 0, UMEM_ALIGN, + heap_alloc, heap_free, heap_arena, 0, VM_NOSLEEP); + + umem_firewall_va_arena = vmem_create("umem_firewall_va", + NULL, 0, pagesize, + umem_firewall_va_alloc, umem_firewall_va_free, heap_arena, + 0, VM_NOSLEEP); + + if (umem_cache_arena == NULL || umem_hash_arena == NULL || + umem_log_arena == NULL || umem_firewall_va_arena == NULL) + goto fail; + + umem_firewall_arena = vmem_create("umem_firewall", NULL, 0, pagesize, + heap_alloc, heap_free, umem_firewall_va_arena, 0, + VM_NOSLEEP); + + if (umem_firewall_arena == NULL) + goto fail; + + oversize_arena = vmem_create("umem_oversize", NULL, 0, pagesize, + heap_alloc, heap_free, minfirewall < ULONG_MAX ? + umem_firewall_va_arena : heap_arena, 0, VM_NOSLEEP); + + memalign_arena = vmem_create("umem_memalign", NULL, 0, UMEM_ALIGN, + heap_alloc, heap_free, minfirewall < ULONG_MAX ? 
+ umem_firewall_va_arena : heap_arena, 0, VM_NOSLEEP); + + if (oversize_arena == NULL || memalign_arena == NULL) + goto fail; + + if (umem_max_ncpus > CPUHINT_MAX()) + umem_max_ncpus = CPUHINT_MAX(); + + while ((umem_max_ncpus & (umem_max_ncpus - 1)) != 0) + umem_max_ncpus++; + + if (umem_max_ncpus == 0) + umem_max_ncpus = 1; + + size = umem_max_ncpus * sizeof (umem_cpu_t); + new_cpus = vmem_alloc(umem_internal_arena, size, VM_NOSLEEP); + if (new_cpus == NULL) + goto fail; + + bzero(new_cpus, size); + for (idx = 0; idx < umem_max_ncpus; idx++) { + new_cpus[idx].cpu_number = idx; + new_cpus[idx].cpu_cache_offset = UMEM_CACHE_SIZE(idx); + } + umem_cpus = new_cpus; + umem_cpu_mask = (umem_max_ncpus - 1); + + if (umem_maxverify == 0) + umem_maxverify = maxverify; + + if (umem_minfirewall == 0) + umem_minfirewall = minfirewall; + + /* + * Set up updating and reaping + */ + umem_reap_next = gethrtime() + NANOSEC; + +#ifndef UMEM_STANDALONE + (void) gettimeofday(&umem_update_next, NULL); +#endif + + /* + * Set up logging -- failure here is okay, since it will just disable + * the logs + */ + if (umem_logging) { + umem_transaction_log = umem_log_init(umem_transaction_log_size); + umem_content_log = umem_log_init(umem_content_log_size); + umem_failure_log = umem_log_init(umem_failure_log_size); + umem_slab_log = umem_log_init(umem_slab_log_size); + } + + /* + * Set up caches -- if successful, initialization cannot fail, since + * allocations from other threads can now succeed. + */ + if (umem_cache_init() == 0) { + log_message("unable to create initial caches\n"); + goto fail; + } + umem_oversize_arena = oversize_arena; + umem_memalign_arena = memalign_arena; + + umem_cache_applyall(umem_cache_magazine_enable); + + /* + * initialization done, ready to go + */ + (void) mutex_lock(&umem_init_lock); + umem_ready = UMEM_READY; + umem_init_thr = 0; + (void) cond_broadcast(&umem_init_cv); + (void) mutex_unlock(&umem_init_lock); + return (1); + +fail: + log_message("umem initialization failed\n"); + + (void) mutex_lock(&umem_init_lock); + umem_ready = UMEM_READY_INIT_FAILED; + umem_init_thr = 0; + (void) cond_broadcast(&umem_init_cv); + (void) mutex_unlock(&umem_init_lock); + return (0); +} + +size_t +umem_cache_get_bufsize(umem_cache_t *cache) +{ + return cache->cache_bufsize; +} + diff --git a/zfs/lib/libumem/umem_agent_support.c b/zfs/lib/libumem/umem_agent_support.c new file mode 100644 index 000000000..55db5e6eb --- /dev/null +++ b/zfs/lib/libumem/umem_agent_support.c @@ -0,0 +1,50 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2002 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Portions Copyright 2006 OmniTI, Inc. 
+ */ + +/* #pragma ident "@(#)umem_agent_support.c 1.2 05/06/08 SMI" */ + +#include "config.h" +#include "umem_base.h" + +#define AGENT_STACK_SIZE 4096 + +#if 0 +char __umem_agent_stack_beg[AGENT_STACK_SIZE]; +char *__umem_agent_stack_end = __umem_agent_stack_beg + AGENT_STACK_SIZE; + +void +__umem_agent_free_bp(umem_cache_t *cp, void *buf) +{ + extern void _breakpoint(void); /* inline asm */ + + _umem_cache_free(cp, buf); + _breakpoint(); +} +#endif + diff --git a/zfs/lib/libumem/umem_base.h b/zfs/lib/libumem/umem_base.h new file mode 100644 index 000000000..ad3cc1ef0 --- /dev/null +++ b/zfs/lib/libumem/umem_base.h @@ -0,0 +1,143 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _UMEM_BASE_H +#define _UMEM_BASE_H + +/* #pragma ident "@(#)umem_base.h 1.4 05/06/08 SMI" */ + +#include <umem_impl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#include "misc.h" + +extern size_t pagesize; +#undef PAGESIZE +#define PAGESIZE pagesize + +/* + * umem.c: non-tunables + */ +extern vmem_t *umem_memalign_arena; + +extern int umem_ready; +extern thread_t umem_init_thr; /* the thread doing the init */ + +extern int umem_init(void); /* do umem's initialization */ +/* #pragma rarely_called(umem_init) */ + +extern umem_log_header_t *umem_transaction_log; +extern umem_log_header_t *umem_content_log; +extern umem_log_header_t *umem_failure_log; +extern umem_log_header_t *umem_slab_log; + +extern mutex_t umem_init_lock; + +extern mutex_t umem_cache_lock; +extern umem_cache_t umem_null_cache; + +extern mutex_t umem_flags_lock; + +extern mutex_t umem_update_lock; +extern cond_t umem_update_cv; +extern volatile thread_t umem_st_update_thr; +extern thread_t umem_update_thr; +extern struct timeval umem_update_next; + +extern volatile hrtime_t umem_reap_next; +extern volatile uint32_t umem_reaping; +#define UMEM_REAP_DONE 0x00000000 /* inactive */ +#define UMEM_REAP_ADDING 0x00000001 /* umem_reap() is active */ +#define UMEM_REAP_ACTIVE 0x00000002 /* update thread is reaping */ + +/* + * umem.c: tunables + */ +extern uint32_t umem_max_ncpus; + +extern uint32_t umem_stack_depth; +extern uint32_t umem_reap_interval; +extern uint32_t umem_update_interval; +extern uint32_t umem_depot_contention; +extern uint32_t umem_abort; +extern uint32_t umem_output; +extern uint32_t umem_logging; +extern uint32_t umem_mtbf; +extern size_t umem_transaction_log_size; +extern size_t umem_content_log_size; +extern size_t umem_failure_log_size; +extern size_t umem_slab_log_size; +extern size_t umem_content_maxsave; +extern size_t umem_lite_minsize; +extern size_t umem_lite_maxalign; +extern size_t umem_maxverify; 
+extern size_t umem_minfirewall; + +extern uint32_t umem_flags; + +/* + * umem.c: Internal aliases (to avoid PLTs) + */ +extern void *_umem_alloc(size_t size, int umflags); +extern void *_umem_zalloc(size_t size, int umflags); +extern void _umem_free(void *buf, size_t size); + +extern void *_umem_cache_alloc(umem_cache_t *cache, int flags); +extern void _umem_cache_free(umem_cache_t *cache, void *buffer); + +/* + * umem.c: private interfaces + */ +extern void umem_type_init(caddr_t, size_t, size_t); +extern int umem_get_max_ncpus(void); +extern void umem_process_updates(void); +extern void umem_cache_applyall(void (*)(umem_cache_t *)); +extern void umem_cache_update(umem_cache_t *); + +/* + * umem_fork.c: private interfaces + */ +extern void umem_forkhandler_init(void); + +/* + * umem_update_thread.c + */ +extern int umem_create_update_thread(void); + +/* + * envvar.c: + */ +void umem_setup_envvars(int); +void umem_process_envvars(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _UMEM_BASE_H */ diff --git a/zfs/lib/libumem/umem_fail.c b/zfs/lib/libumem/umem_fail.c new file mode 100644 index 000000000..2bafd2682 --- /dev/null +++ b/zfs/lib/libumem/umem_fail.c @@ -0,0 +1,176 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Portions Copyright 2006 OmniTI, Inc. + */ + +/* #pragma ident "@(#)umem_fail.c 1.4 05/06/08 SMI" */ + +/* + * Failure routines for libumem (not standalone) + */ + +#include "config.h" +#include <sys/types.h> +#include <signal.h> +#include <stdarg.h> +#include <string.h> +#include <stdio.h> + +#include "misc.h" + +static volatile int umem_exiting = 0; +#define UMEM_EXIT_ABORT 1 + +static mutex_t umem_exit_lock = DEFAULTMUTEX; /* protects umem_exiting */ + +static int +firstexit(int type) +{ + if (umem_exiting) + return (0); + + (void) mutex_lock(&umem_exit_lock); + if (umem_exiting) { + (void) mutex_unlock(&umem_exit_lock); + return (0); + } + umem_exiting = type; + (void) mutex_unlock(&umem_exit_lock); + + return (1); +} + +/* + * We can't use abort(3C), since it closes all of the standard library + * FILEs, which can call free(). + * + * In addition, we can't just raise(SIGABRT), since the current handler + * might do allocation. We give them once chance, though. 
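+ *
+ * (That is: the first caller of umem_do_abort() raises SIGABRT once with
+ * the user's handler still installed; if that handler returns, or another
+ * thread is already exiting, the loop below resets SIGABRT to SIG_DFL,
+ * unblocks it, and re-raises until the process terminates.)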
+ */ +static void __NORETURN +umem_do_abort(void) +{ +#ifdef _WIN32 + abort(); +#else + if (firstexit(UMEM_EXIT_ABORT)) { + (void) raise(SIGABRT); + } + + for (;;) { +#if defined(__FreeBSD__) + sigset_t set; + struct sigaction sa; + + sa.sa_handler = SIG_DFL; + (void) sigaction(SIGABRT, &sa, NULL); + (void) sigemptyset (&set); + (void) sigaddset (&set, SIGABRT); + (void) sigprocmask (SIG_UNBLOCK, &set, NULL); + (void) raise (SIGABRT); +#else + (void) signal(SIGABRT, SIG_DFL); + (void) sigrelse(SIGABRT); + (void) raise(SIGABRT); +#endif + } +#endif +} + +#define SKIP_FRAMES 1 /* skip the panic frame */ +#define ERR_STACK_FRAMES 128 + +static void +print_stacktrace(void) +{ + uintptr_t cur_stack[ERR_STACK_FRAMES]; + + /* + * if we are in a signal context, checking for it will recurse + */ + uint_t nframes = getpcstack(cur_stack, ERR_STACK_FRAMES, 0); + uint_t idx; + + if (nframes > SKIP_FRAMES) { + umem_printf("stack trace:\n"); + + for (idx = SKIP_FRAMES; idx < nframes; idx++) { + (void) print_sym((void *)cur_stack[idx]); + umem_printf("\n"); + } + } +} + +void +umem_panic(const char *format, ...) +{ + va_list va; + + va_start(va, format); + umem_vprintf(format, va); + va_end(va); + + if (format[strlen(format)-1] != '\n') + umem_error_enter("\n"); + +#ifdef ECELERITY + va_start(va, format); + ec_debug_vprintf(DCRITICAL, DMEM, format, va); + va_end(va); +#endif + + print_stacktrace(); + + umem_do_abort(); +} + +void +umem_err_recoverable(const char *format, ...) +{ + va_list va; + + va_start(va, format); + umem_vprintf(format, va); + va_end(va); + + if (format[strlen(format)-1] != '\n') + umem_error_enter("\n"); + + print_stacktrace(); + + if (umem_abort > 0) + umem_do_abort(); +} + +int +__umem_assert_failed(const char *assertion, const char *file, int line) +{ + umem_panic("Assertion failed: %s, file %s, line %d\n", + assertion, file, line); + umem_do_abort(); + /*NOTREACHED*/ +} diff --git a/zfs/lib/libumem/umem_fork.c b/zfs/lib/libumem/umem_fork.c new file mode 100644 index 000000000..2f701026d --- /dev/null +++ b/zfs/lib/libumem/umem_fork.c @@ -0,0 +1,214 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Portions Copyright 2006 OmniTI, Inc. + */ + +/* #pragma ident "@(#)umem_fork.c 1.3 05/06/08 SMI" */ + +#include "config.h" +/* #include "mtlib.h" */ +#include "umem_base.h" +#include "vmem_base.h" + +#ifndef _WIN32 +#include <unistd.h> + +/* + * The following functions are for pre- and post-fork1(2) handling. 
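+ *
+ * umem_lockup() runs in the parent just before fork(), umem_release()
+ * runs in the parent after fork() returns, and umem_release_child()
+ * runs in the new child; umem_forkhandler_init() at the bottom of this
+ * file registers them with pthread_atfork() in that order, so the child
+ * starts with the allocator's locks released and its state consistent.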
+ */ + +static void +umem_lockup_cache(umem_cache_t *cp) +{ + int idx; + int ncpus = cp->cache_cpu_mask + 1; + + for (idx = 0; idx < ncpus; idx++) + (void) mutex_lock(&cp->cache_cpu[idx].cc_lock); + + (void) mutex_lock(&cp->cache_depot_lock); + (void) mutex_lock(&cp->cache_lock); +} + +static void +umem_release_cache(umem_cache_t *cp) +{ + int idx; + int ncpus = cp->cache_cpu_mask + 1; + + (void) mutex_unlock(&cp->cache_lock); + (void) mutex_unlock(&cp->cache_depot_lock); + + for (idx = 0; idx < ncpus; idx++) + (void) mutex_unlock(&cp->cache_cpu[idx].cc_lock); +} + +static void +umem_lockup_log_header(umem_log_header_t *lhp) +{ + int idx; + if (lhp == NULL) + return; + for (idx = 0; idx < umem_max_ncpus; idx++) + (void) mutex_lock(&lhp->lh_cpu[idx].clh_lock); + + (void) mutex_lock(&lhp->lh_lock); +} + +static void +umem_release_log_header(umem_log_header_t *lhp) +{ + int idx; + if (lhp == NULL) + return; + + (void) mutex_unlock(&lhp->lh_lock); + + for (idx = 0; idx < umem_max_ncpus; idx++) + (void) mutex_unlock(&lhp->lh_cpu[idx].clh_lock); +} + +static void +umem_lockup(void) +{ + umem_cache_t *cp; + + (void) mutex_lock(&umem_init_lock); + /* + * If another thread is busy initializing the library, we must + * wait for it to complete (by calling umem_init()) before allowing + * the fork() to proceed. + */ + if (umem_ready == UMEM_READY_INITING && umem_init_thr != thr_self()) { + (void) mutex_unlock(&umem_init_lock); + (void) umem_init(); + (void) mutex_lock(&umem_init_lock); + } + (void) mutex_lock(&umem_cache_lock); + (void) mutex_lock(&umem_update_lock); + (void) mutex_lock(&umem_flags_lock); + + umem_lockup_cache(&umem_null_cache); + for (cp = umem_null_cache.cache_prev; cp != &umem_null_cache; + cp = cp->cache_prev) + umem_lockup_cache(cp); + + umem_lockup_log_header(umem_transaction_log); + umem_lockup_log_header(umem_content_log); + umem_lockup_log_header(umem_failure_log); + umem_lockup_log_header(umem_slab_log); + + (void) cond_broadcast(&umem_update_cv); + + vmem_sbrk_lockup(); + vmem_lockup(); +} + +static void +umem_release(void) +{ + umem_cache_t *cp; + + vmem_release(); + vmem_sbrk_release(); + + umem_release_log_header(umem_slab_log); + umem_release_log_header(umem_failure_log); + umem_release_log_header(umem_content_log); + umem_release_log_header(umem_transaction_log); + + for (cp = umem_null_cache.cache_next; cp != &umem_null_cache; + cp = cp->cache_next) + umem_release_cache(cp); + umem_release_cache(&umem_null_cache); + + (void) mutex_unlock(&umem_flags_lock); + (void) mutex_unlock(&umem_update_lock); + (void) mutex_unlock(&umem_cache_lock); + (void) mutex_unlock(&umem_init_lock); +} + +static void +umem_release_child(void) +{ + umem_cache_t *cp; + + /* + * Clean up the update state + */ + umem_update_thr = 0; + + if (umem_st_update_thr != thr_self()) { + umem_st_update_thr = 0; + umem_reaping = UMEM_REAP_DONE; + + for (cp = umem_null_cache.cache_next; cp != &umem_null_cache; + cp = cp->cache_next) { + if (cp->cache_uflags & UMU_NOTIFY) + cp->cache_uflags &= ~UMU_NOTIFY; + + /* + * If the cache is active, we just re-add it to + * the update list. This will re-do any active + * updates on the cache, but that won't break + * anything. + * + * The worst that can happen is a cache has + * its magazines rescaled twice, instead of once. 
+ */ + if (cp->cache_uflags & UMU_ACTIVE) { + umem_cache_t *cnext, *cprev; + + ASSERT(cp->cache_unext == NULL && + cp->cache_uprev == NULL); + + cp->cache_uflags &= ~UMU_ACTIVE; + cp->cache_unext = cnext = &umem_null_cache; + cp->cache_uprev = cprev = + umem_null_cache.cache_uprev; + cnext->cache_uprev = cp; + cprev->cache_unext = cp; + } + } + } + + umem_release(); +} +#endif + +void +umem_forkhandler_init(void) +{ +#ifndef _WIN32 + /* + * There is no way to unregister these atfork functions, + * but we don't need to. The dynamic linker and libc take + * care of unregistering them if/when the library is unloaded. + */ + (void) pthread_atfork(umem_lockup, umem_release, umem_release_child); +#endif +} diff --git a/zfs/lib/libumem/umem_impl.h b/zfs/lib/libumem/umem_impl.h new file mode 100644 index 000000000..a2e886f25 --- /dev/null +++ b/zfs/lib/libumem/umem_impl.h @@ -0,0 +1,424 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Portions Copyright 2006 OmniTI, Inc. 
+ */ + +#ifndef _UMEM_IMPL_H +#define _UMEM_IMPL_H + +/* #pragma ident "@(#)umem_impl.h 1.6 05/06/08 SMI" */ + +#include <umem.h> + +#ifdef HAVE_SYS_SYSMACROS_H +#include <sys/sysmacros.h> +#endif + +#if HAVE_SYS_TIME_H +#include <sys/time.h> +#endif + +#include <sys/vmem.h> +#ifdef HAVE_THREAD_H +# include <thread.h> +#else +# include "sol_compat.h" +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * umem memory allocator: implementation-private data structures + */ + +/* + * Internal flags for umem_cache_create + */ +#define UMC_QCACHE 0x00100000 +#define UMC_INTERNAL 0x80000000 + +/* + * Cache flags + */ +#define UMF_AUDIT 0x00000001 /* transaction auditing */ +#define UMF_DEADBEEF 0x00000002 /* deadbeef checking */ +#define UMF_REDZONE 0x00000004 /* redzone checking */ +#define UMF_CONTENTS 0x00000008 /* freed-buffer content logging */ +#define UMF_CHECKSIGNAL 0x00000010 /* abort when in signal context */ +#define UMF_NOMAGAZINE 0x00000020 /* disable per-cpu magazines */ +#define UMF_FIREWALL 0x00000040 /* put all bufs before unmapped pages */ +#define UMF_LITE 0x00000100 /* lightweight debugging */ + +#define UMF_HASH 0x00000200 /* cache has hash table */ +#define UMF_RANDOMIZE 0x00000400 /* randomize other umem_flags */ + +#define UMF_BUFTAG (UMF_DEADBEEF | UMF_REDZONE) +#define UMF_TOUCH (UMF_BUFTAG | UMF_LITE | UMF_CONTENTS) +#define UMF_RANDOM (UMF_TOUCH | UMF_AUDIT | UMF_NOMAGAZINE) +#define UMF_DEBUG (UMF_RANDOM | UMF_FIREWALL) + +#define UMEM_STACK_DEPTH umem_stack_depth + +#define UMEM_FREE_PATTERN 0xdeadbeefdeadbeefULL +#define UMEM_UNINITIALIZED_PATTERN 0xbaddcafebaddcafeULL +#define UMEM_REDZONE_PATTERN 0xfeedfacefeedfaceULL +#define UMEM_REDZONE_BYTE 0xbb + +#define UMEM_FATAL_FLAGS (UMEM_NOFAIL) +#define UMEM_SLEEP_FLAGS (0) + +/* + * Redzone size encodings for umem_alloc() / umem_free(). We encode the + * allocation size, rather than storing it directly, so that umem_free() + * can distinguish frees of the wrong size from redzone violations. + */ +#define UMEM_SIZE_ENCODE(x) (251 * (x) + 1) +#define UMEM_SIZE_DECODE(x) ((x) / 251) +#define UMEM_SIZE_VALID(x) ((x) % 251 == 1) + +/* + * The bufctl (buffer control) structure keeps some minimal information + * about each buffer: its address, its slab, and its current linkage, + * which is either on the slab's freelist (if the buffer is free), or + * on the cache's buf-to-bufctl hash table (if the buffer is allocated). + * In the case of non-hashed, or "raw", caches (the common case), only + * the freelist linkage is necessary: the buffer address is at a fixed + * offset from the bufctl address, and the slab is at the end of the page. + * + * NOTE: bc_next must be the first field; raw buffers have linkage only. + */ +typedef struct umem_bufctl { + struct umem_bufctl *bc_next; /* next bufctl struct */ + void *bc_addr; /* address of buffer */ + struct umem_slab *bc_slab; /* controlling slab */ +} umem_bufctl_t; + +/* + * The UMF_AUDIT version of the bufctl structure. The beginning of this + * structure must be identical to the normal bufctl structure so that + * pointers are interchangeable. + */ + +#define UMEM_BUFCTL_AUDIT_SIZE_DEPTH(frames) \ + ((size_t)(&((umem_bufctl_audit_t *)0)->bc_stack[frames])) + +/* + * umem_bufctl_audits must be allocated from a UMC_NOHASH cache, so we + * require that 2 of them, plus 2 buftags, plus a umem_slab_t, all fit on + * a single page. + * + * For ILP32, this is about 1000 frames. + * For LP64, this is about 490 frames. 
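+ *
+ * (The exact limit depends on PAGESIZE and on the ILP32/LP64 sizes of
+ * umem_slab_t, umem_buftag_t, and the fixed portion of
+ * umem_bufctl_audit_t; UMEM_MAX_STACK_DEPTH below captures the
+ * calculation.)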
+ */ + +#define UMEM_BUFCTL_AUDIT_ALIGN 32 + +#define UMEM_BUFCTL_AUDIT_MAX_SIZE \ + (P2ALIGN((PAGESIZE - sizeof (umem_slab_t))/2 - \ + sizeof (umem_buftag_t), UMEM_BUFCTL_AUDIT_ALIGN)) + +#define UMEM_MAX_STACK_DEPTH \ + ((UMEM_BUFCTL_AUDIT_MAX_SIZE - \ + UMEM_BUFCTL_AUDIT_SIZE_DEPTH(0)) / sizeof (uintptr_t)) + +typedef struct umem_bufctl_audit { + struct umem_bufctl *bc_next; /* next bufctl struct */ + void *bc_addr; /* address of buffer */ + struct umem_slab *bc_slab; /* controlling slab */ + umem_cache_t *bc_cache; /* controlling cache */ + hrtime_t bc_timestamp; /* transaction time */ + thread_t bc_thread; /* thread doing transaction */ + struct umem_bufctl *bc_lastlog; /* last log entry */ + void *bc_contents; /* contents at last free */ + int bc_depth; /* stack depth */ + uintptr_t bc_stack[1]; /* pc stack */ +} umem_bufctl_audit_t; + +#define UMEM_LOCAL_BUFCTL_AUDIT(bcpp) \ + *(bcpp) = (umem_bufctl_audit_t *) \ + alloca(UMEM_BUFCTL_AUDIT_SIZE) + +#define UMEM_BUFCTL_AUDIT_SIZE \ + UMEM_BUFCTL_AUDIT_SIZE_DEPTH(UMEM_STACK_DEPTH) + +/* + * A umem_buftag structure is appended to each buffer whenever any of the + * UMF_BUFTAG flags (UMF_DEADBEEF, UMF_REDZONE, UMF_VERIFY) are set. + */ +typedef struct umem_buftag { + uint64_t bt_redzone; /* 64-bit redzone pattern */ + umem_bufctl_t *bt_bufctl; /* bufctl */ + intptr_t bt_bxstat; /* bufctl ^ (alloc/free) */ +} umem_buftag_t; + +#define UMEM_BUFTAG(cp, buf) \ + ((umem_buftag_t *)((char *)(buf) + (cp)->cache_buftag)) + +#define UMEM_BUFCTL(cp, buf) \ + ((umem_bufctl_t *)((char *)(buf) + (cp)->cache_bufctl)) + +#define UMEM_BUF(cp, bcp) \ + ((void *)((char *)(bcp) - (cp)->cache_bufctl)) + +#define UMEM_SLAB(cp, buf) \ + ((umem_slab_t *)P2END((uintptr_t)(buf), (cp)->cache_slabsize) - 1) + +#define UMEM_CPU_CACHE(cp, cpu) \ + (umem_cpu_cache_t *)((char *)cp + cpu->cpu_cache_offset) + +#define UMEM_MAGAZINE_VALID(cp, mp) \ + (((umem_slab_t *)P2END((uintptr_t)(mp), PAGESIZE) - 1)->slab_cache == \ + (cp)->cache_magtype->mt_cache) + +#define UMEM_SLAB_MEMBER(sp, buf) \ + ((size_t)(buf) - (size_t)(sp)->slab_base < \ + (sp)->slab_cache->cache_slabsize) + +#define UMEM_BUFTAG_ALLOC 0xa110c8edUL +#define UMEM_BUFTAG_FREE 0xf4eef4eeUL + +typedef struct umem_slab { + struct umem_cache *slab_cache; /* controlling cache */ + void *slab_base; /* base of allocated memory */ + struct umem_slab *slab_next; /* next slab on freelist */ + struct umem_slab *slab_prev; /* prev slab on freelist */ + struct umem_bufctl *slab_head; /* first free buffer */ + long slab_refcnt; /* outstanding allocations */ + long slab_chunks; /* chunks (bufs) in this slab */ +} umem_slab_t; + +#define UMEM_HASH_INITIAL 64 + +#define UMEM_HASH(cp, buf) \ + ((cp)->cache_hash_table + \ + (((uintptr_t)(buf) >> (cp)->cache_hash_shift) & (cp)->cache_hash_mask)) + +typedef struct umem_magazine { + void *mag_next; + void *mag_round[1]; /* one or more rounds */ +} umem_magazine_t; + +/* + * The magazine types for fast per-cpu allocation + */ +typedef struct umem_magtype { + int mt_magsize; /* magazine size (number of rounds) */ + int mt_align; /* magazine alignment */ + size_t mt_minbuf; /* all smaller buffers qualify */ + size_t mt_maxbuf; /* no larger buffers qualify */ + umem_cache_t *mt_cache; /* magazine cache */ +} umem_magtype_t; + +#if (defined(__PTHREAD_MUTEX_SIZE__) && __PTHREAD_MUTEX_SIZE__ >= 24) || defined(UMEM_PTHREAD_MUTEX_TOO_BIG) +#define UMEM_CPU_CACHE_SIZE 128 /* must be power of 2 */ +#else +#define UMEM_CPU_CACHE_SIZE 64 /* must be power of 2 */ +#endif +#define UMEM_CPU_PAD 
(UMEM_CPU_CACHE_SIZE - sizeof (mutex_t) - \ + 2 * sizeof (uint_t) - 2 * sizeof (void *) - 4 * sizeof (int)) +#define UMEM_CACHE_SIZE(ncpus) \ + ((size_t)(&((umem_cache_t *)0)->cache_cpu[ncpus])) + +typedef struct umem_cpu_cache { + mutex_t cc_lock; /* protects this cpu's local cache */ + uint_t cc_alloc; /* allocations from this cpu */ + uint_t cc_free; /* frees to this cpu */ + umem_magazine_t *cc_loaded; /* the currently loaded magazine */ + umem_magazine_t *cc_ploaded; /* the previously loaded magazine */ + int cc_rounds; /* number of objects in loaded mag */ + int cc_prounds; /* number of objects in previous mag */ + int cc_magsize; /* number of rounds in a full mag */ + int cc_flags; /* CPU-local copy of cache_flags */ +#if (!defined(_LP64) || defined(UMEM_PTHREAD_MUTEX_TOO_BIG)) && !defined(_WIN32) + /* on win32, UMEM_CPU_PAD evaluates to zero, and the MS compiler + * won't allow static initialization of arrays containing structures + * that contain zero size arrays */ + char cc_pad[UMEM_CPU_PAD]; /* for nice alignment (32-bit) */ +#endif +} umem_cpu_cache_t; + +/* + * The magazine lists used in the depot. + */ +typedef struct umem_maglist { + umem_magazine_t *ml_list; /* magazine list */ + long ml_total; /* number of magazines */ + long ml_min; /* min since last update */ + long ml_reaplimit; /* max reapable magazines */ + uint64_t ml_alloc; /* allocations from this list */ +} umem_maglist_t; + +#define UMEM_CACHE_NAMELEN 31 + +struct umem_cache { + /* + * Statistics + */ + uint64_t cache_slab_create; /* slab creates */ + uint64_t cache_slab_destroy; /* slab destroys */ + uint64_t cache_slab_alloc; /* slab layer allocations */ + uint64_t cache_slab_free; /* slab layer frees */ + uint64_t cache_alloc_fail; /* total failed allocations */ + uint64_t cache_buftotal; /* total buffers */ + uint64_t cache_bufmax; /* max buffers ever */ + uint64_t cache_rescale; /* # of hash table rescales */ + uint64_t cache_lookup_depth; /* hash lookup depth */ + uint64_t cache_depot_contention; /* mutex contention count */ + uint64_t cache_depot_contention_prev; /* previous snapshot */ + + /* + * Cache properties + */ + char cache_name[UMEM_CACHE_NAMELEN + 1]; + size_t cache_bufsize; /* object size */ + size_t cache_align; /* object alignment */ + umem_constructor_t *cache_constructor; + umem_destructor_t *cache_destructor; + umem_reclaim_t *cache_reclaim; + void *cache_private; /* opaque arg to callbacks */ + vmem_t *cache_arena; /* vmem source for slabs */ + int cache_cflags; /* cache creation flags */ + int cache_flags; /* various cache state info */ + int cache_uflags; /* UMU_* flags */ + uint32_t cache_mtbf; /* induced alloc failure rate */ + umem_cache_t *cache_next; /* forward cache linkage */ + umem_cache_t *cache_prev; /* backward cache linkage */ + umem_cache_t *cache_unext; /* next in update list */ + umem_cache_t *cache_uprev; /* prev in update list */ + uint32_t cache_cpu_mask; /* mask for cpu offset */ + + /* + * Slab layer + */ + mutex_t cache_lock; /* protects slab layer */ + size_t cache_chunksize; /* buf + alignment [+ debug] */ + size_t cache_slabsize; /* size of a slab */ + size_t cache_bufctl; /* buf-to-bufctl distance */ + size_t cache_buftag; /* buf-to-buftag distance */ + size_t cache_verify; /* bytes to verify */ + size_t cache_contents; /* bytes of saved content */ + size_t cache_color; /* next slab color */ + size_t cache_mincolor; /* maximum slab color */ + size_t cache_maxcolor; /* maximum slab color */ + size_t cache_hash_shift; /* get to interesting bits */ + size_t 
cache_hash_mask; /* hash table mask */ + umem_slab_t *cache_freelist; /* slab free list */ + umem_slab_t cache_nullslab; /* end of freelist marker */ + umem_cache_t *cache_bufctl_cache; /* source of bufctls */ + umem_bufctl_t **cache_hash_table; /* hash table base */ + /* + * Depot layer + */ + mutex_t cache_depot_lock; /* protects depot */ + umem_magtype_t *cache_magtype; /* magazine type */ + umem_maglist_t cache_full; /* full magazines */ + umem_maglist_t cache_empty; /* empty magazines */ + + /* + * Per-CPU layer + */ + umem_cpu_cache_t cache_cpu[1]; /* cache_cpu_mask + 1 entries */ +}; + +typedef struct umem_cpu_log_header { + mutex_t clh_lock; + char *clh_current; + size_t clh_avail; + int clh_chunk; + int clh_hits; + char clh_pad[UMEM_CPU_CACHE_SIZE - + sizeof (mutex_t) - sizeof (char *) - + sizeof (size_t) - 2 * sizeof (int)]; +} umem_cpu_log_header_t; + +typedef struct umem_log_header { + mutex_t lh_lock; + char *lh_base; + int *lh_free; + size_t lh_chunksize; + int lh_nchunks; + int lh_head; + int lh_tail; + int lh_hits; + umem_cpu_log_header_t lh_cpu[1]; /* actually umem_max_ncpus */ +} umem_log_header_t; + +typedef struct umem_cpu { + uint32_t cpu_cache_offset; + uint32_t cpu_number; +} umem_cpu_t; + +#define UMEM_MAXBUF 16384 + +#define UMEM_ALIGN 8 /* min guaranteed alignment */ +#define UMEM_ALIGN_SHIFT 3 /* log2(UMEM_ALIGN) */ +#define UMEM_VOID_FRACTION 8 /* never waste more than 1/8 of slab */ + +/* + * For 64 bits, buffers >= 16 bytes must be 16-byte aligned + */ +#ifdef _LP64 +#define UMEM_SECOND_ALIGN 16 +#else +#define UMEM_SECOND_ALIGN UMEM_ALIGN +#endif + +#define MALLOC_MAGIC 0x3a10c000 /* 8-byte tag */ +#define MEMALIGN_MAGIC 0x3e3a1000 + +#ifdef _LP64 +#define MALLOC_SECOND_MAGIC 0x16ba7000 /* 8-byte tag, 16-aligned */ +#define MALLOC_OVERSIZE_MAGIC 0x06e47000 /* 16-byte tag, _LP64 */ +#endif + +#define UMEM_MALLOC_ENCODE(type, sz) (uint32_t)((type) - (sz)) +#define UMEM_MALLOC_DECODE(stat, sz) (uint32_t)((stat) + (sz)) +#define UMEM_FREE_PATTERN_32 (uint32_t)(UMEM_FREE_PATTERN) + +#define UMU_MAGAZINE_RESIZE 0x00000001 +#define UMU_HASH_RESCALE 0x00000002 +#define UMU_REAP 0x00000004 +#define UMU_NOTIFY 0x08000000 +#define UMU_ACTIVE 0x80000000 + +#define UMEM_READY_INIT_FAILED -1 +#define UMEM_READY_STARTUP 1 +#define UMEM_READY_INITING 2 +#define UMEM_READY 3 + +#ifdef UMEM_STANDALONE +extern void umem_startup(caddr_t, size_t, size_t, caddr_t, caddr_t); +extern int umem_add(caddr_t, size_t); +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _UMEM_IMPL_H */ diff --git a/zfs/lib/libumem/umem_update_thread.c b/zfs/lib/libumem/umem_update_thread.c new file mode 100644 index 000000000..033d606be --- /dev/null +++ b/zfs/lib/libumem/umem_update_thread.c @@ -0,0 +1,153 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Portions Copyright 2006 OmniTI, Inc. + */ + +/* #pragma ident "@(#)umem_update_thread.c 1.2 05/06/08 SMI" */ + +#include "config.h" +#include "umem_base.h" +#include "vmem_base.h" + +#include <signal.h> + +/* + * we use the _ version, since we don't want to be cancelled. + */ +extern int _cond_timedwait(cond_t *cv, mutex_t *mutex, const timespec_t *delay); + +/*ARGSUSED*/ +static THR_RETURN +THR_API umem_update_thread(void *arg) +{ + struct timeval now; + int in_update = 0; + + (void) mutex_lock(&umem_update_lock); + + ASSERT(umem_update_thr == thr_self()); + ASSERT(umem_st_update_thr == 0); + + for (;;) { + umem_process_updates(); + + if (in_update) { + in_update = 0; + /* + * we wait until now to set the next update time + * so that the updates are self-throttling + */ + (void) gettimeofday(&umem_update_next, NULL); + umem_update_next.tv_sec += umem_reap_interval; + } + + switch (umem_reaping) { + case UMEM_REAP_DONE: + case UMEM_REAP_ADDING: + break; + + case UMEM_REAP_ACTIVE: + umem_reap_next = gethrtime() + + (hrtime_t)umem_reap_interval * NANOSEC; + umem_reaping = UMEM_REAP_DONE; + break; + + default: + ASSERT(umem_reaping == UMEM_REAP_DONE || + umem_reaping == UMEM_REAP_ADDING || + umem_reaping == UMEM_REAP_ACTIVE); + break; + } + + (void) gettimeofday(&now, NULL); + if (now.tv_sec > umem_update_next.tv_sec || + (now.tv_sec == umem_update_next.tv_sec && + now.tv_usec >= umem_update_next.tv_usec)) { + /* + * Time to run an update + */ + (void) mutex_unlock(&umem_update_lock); + + vmem_update(NULL); + /* + * umem_cache_update can use umem_add_update to + * request further work. The update is not complete + * until all such work is finished. + */ + umem_cache_applyall(umem_cache_update); + + (void) mutex_lock(&umem_update_lock); + in_update = 1; + continue; /* start processing immediately */ + } + + /* + * if there is no work to do, we wait until it is time for + * next update, or someone wakes us. 
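+ *
+ * (umem_update_next is an absolute wall-clock deadline kept as a
+ * struct timeval; it is converted to an absolute timespec below so
+ * _cond_timedwait() returns at the next scheduled update even if
+ * nothing signals umem_update_cv in the meantime.)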
+ */ + if (umem_null_cache.cache_unext == &umem_null_cache) { + timespec_t abs_time; + abs_time.tv_sec = umem_update_next.tv_sec; + abs_time.tv_nsec = umem_update_next.tv_usec * 1000; + + (void) _cond_timedwait(&umem_update_cv, + &umem_update_lock, &abs_time); + } + } + /* LINTED no return statement */ +} + +int +umem_create_update_thread(void) +{ +#ifndef _WIN32 + sigset_t sigmask, oldmask; +#endif + + ASSERT(MUTEX_HELD(&umem_update_lock)); + ASSERT(umem_update_thr == 0); + +#ifndef _WIN32 + /* + * The update thread handles no signals + */ + (void) sigfillset(&sigmask); + (void) thr_sigsetmask(SIG_BLOCK, &sigmask, &oldmask); +#endif + if (thr_create(NULL, 0, umem_update_thread, NULL, + THR_BOUND | THR_DAEMON | THR_DETACHED, &umem_update_thr) == 0) { +#ifndef _WIN32 + (void) thr_sigsetmask(SIG_SETMASK, &oldmask, NULL); +#endif + return (1); + } + umem_update_thr = 0; +#ifndef _WIN32 + (void) thr_sigsetmask(SIG_SETMASK, &oldmask, NULL); +#endif + return (0); +} diff --git a/zfs/lib/libumem/vmem.c b/zfs/lib/libumem/vmem.c new file mode 100644 index 000000000..1b8981a91 --- /dev/null +++ b/zfs/lib/libumem/vmem.c @@ -0,0 +1,1807 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* #pragma ident "@(#)vmem.c 1.10 05/06/08 SMI" */ + +/* + * For a more complete description of the main ideas, see: + * + * Jeff Bonwick and Jonathan Adams, + * + * Magazines and vmem: Extending the Slab Allocator to Many CPUs and + * Arbitrary Resources. + * + * Proceedings of the 2001 Usenix Conference. + * Available as /shared/sac/PSARC/2000/550/materials/vmem.pdf. + * + * For the "Big Theory Statement", see usr/src/common/os/vmem.c + * + * 1. Overview of changes + * ------------------------------ + * There have been a few changes to vmem in order to support umem. The + * main areas are: + * + * * VM_SLEEP unsupported + * + * * Reaping changes + * + * * initialization changes + * + * * _vmem_extend_alloc + * + * + * 2. VM_SLEEP Removed + * ------------------- + * Since VM_SLEEP allocations can hold locks (in vmem_populate()) for + * possibly infinite amounts of time, they are not supported in this + * version of vmem. Sleep-like behavior can be achieved through + * UMEM_NOFAIL umem allocations. + * + * + * 3. Reaping changes + * ------------------ + * Unlike kmem_reap(), which just asynchronously schedules work, umem_reap() + * can do allocations and frees synchronously. This is a problem if it + * occurs during a vmem_populate() allocation. + * + * Instead, we delay reaps while populates are active. + * + * + * 4. 
Initialization changes + * ------------------------- + * In the kernel, vmem_init() allows you to create a single, top-level arena, + * which has vmem_internal_arena as a child. For umem, we want to be able + * to extend arenas dynamically. It is much easier to support this if we + * allow a two-level "heap" arena: + * + * +----------+ + * | "fake" | + * +----------+ + * | + * +----------+ + * | "heap" | + * +----------+ + * | \ \ + * | +-+-- ... <other children> + * | + * +---------------+ + * | vmem_internal | + * +---------------+ + * | | | | + * <children> + * + * The new vmem_init() allows you to specify a "parent" of the heap, along + * with allocation functions. + * + * + * 5. _vmem_extend_alloc + * --------------------- + * The other part of extending is _vmem_extend_alloc. This function allows + * you to extend (expand current spans, if possible) an arena and allocate + * a chunk of the newly extened span atomically. This is needed to support + * extending the heap while vmem_populate()ing it. + * + * In order to increase the usefulness of extending, non-imported spans are + * sorted in address order. + */ + +#include "config.h" +/* #include "mtlib.h" */ +#include <sys/vmem_impl_user.h> +#if HAVE_ALLOCA_H +#include <alloca.h> +#endif +#ifdef HAVE_SYS_SYSMACROS_H +#include <sys/sysmacros.h> +#endif +#include <stdio.h> +#if HAVE_STRINGS_H +#include <strings.h> +#endif +#if HAVE_ATOMIC_H +#include <atomic.h> +#endif + +#include "vmem_base.h" +#include "umem_base.h" + +#define VMEM_INITIAL 6 /* early vmem arenas */ +#define VMEM_SEG_INITIAL 100 /* early segments */ + +/* + * Adding a new span to an arena requires two segment structures: one to + * represent the span, and one to represent the free segment it contains. + */ +#define VMEM_SEGS_PER_SPAN_CREATE 2 + +/* + * Allocating a piece of an existing segment requires 0-2 segment structures + * depending on how much of the segment we're allocating. + * + * To allocate the entire segment, no new segment structures are needed; we + * simply move the existing segment structure from the freelist to the + * allocation hash table. + * + * To allocate a piece from the left or right end of the segment, we must + * split the segment into two pieces (allocated part and remainder), so we + * need one new segment structure to represent the remainder. + * + * To allocate from the middle of a segment, we need two new segment strucures + * to represent the remainders on either side of the allocated part. + */ +#define VMEM_SEGS_PER_EXACT_ALLOC 0 +#define VMEM_SEGS_PER_LEFT_ALLOC 1 +#define VMEM_SEGS_PER_RIGHT_ALLOC 1 +#define VMEM_SEGS_PER_MIDDLE_ALLOC 2 + +/* + * vmem_populate() preallocates segment structures for vmem to do its work. + * It must preallocate enough for the worst case, which is when we must import + * a new span and then allocate from the middle of it. + */ +#define VMEM_SEGS_PER_ALLOC_MAX \ + (VMEM_SEGS_PER_SPAN_CREATE + VMEM_SEGS_PER_MIDDLE_ALLOC) + +/* + * The segment structures themselves are allocated from vmem_seg_arena, so + * we have a recursion problem when vmem_seg_arena needs to populate itself. + * We address this by working out the maximum number of segment structures + * this act will require, and multiplying by the maximum number of threads + * that we'll allow to do it simultaneously. 
+ * + * The worst-case segment consumption to populate vmem_seg_arena is as + * follows (depicted as a stack trace to indicate why events are occurring): + * + * vmem_alloc(vmem_seg_arena) -> 2 segs (span create + exact alloc) + * vmem_alloc(vmem_internal_arena) -> 2 segs (span create + exact alloc) + * heap_alloc(heap_arena) + * vmem_alloc(heap_arena) -> 4 seg (span create + alloc) + * parent_alloc(parent_arena) + * _vmem_extend_alloc(parent_arena) -> 3 seg (span create + left alloc) + * + * Note: The reservation for heap_arena must be 4, since vmem_xalloc() + * is overly pessimistic on allocations where parent_arena has a stricter + * alignment than heap_arena. + * + * The worst-case consumption for any arena is 4 segment structures. + * For now, we only support VM_NOSLEEP allocations, so as long as we + * serialize all vmem_populates, a 4-seg reserve is sufficient. + */ +#define VMEM_POPULATE_SEGS_PER_ARENA 4 +#define VMEM_POPULATE_LOCKS 1 + +#define VMEM_POPULATE_RESERVE \ + (VMEM_POPULATE_SEGS_PER_ARENA * VMEM_POPULATE_LOCKS) + +/* + * vmem_populate() ensures that each arena has VMEM_MINFREE seg structures + * so that it can satisfy the worst-case allocation *and* participate in + * worst-case allocation from vmem_seg_arena. + */ +#define VMEM_MINFREE (VMEM_POPULATE_RESERVE + VMEM_SEGS_PER_ALLOC_MAX) + +/* Don't assume new statics are zeroed - see vmem_startup() */ +static vmem_t vmem0[VMEM_INITIAL]; +static vmem_t *vmem_populator[VMEM_INITIAL]; +static uint32_t vmem_id; +static uint32_t vmem_populators; +static vmem_seg_t vmem_seg0[VMEM_SEG_INITIAL]; +static vmem_seg_t *vmem_segfree; +static mutex_t vmem_list_lock = DEFAULTMUTEX; +static mutex_t vmem_segfree_lock = DEFAULTMUTEX; +static vmem_populate_lock_t vmem_nosleep_lock = { + DEFAULTMUTEX, + 0 +}; +#define IN_POPULATE() (vmem_nosleep_lock.vmpl_thr == thr_self()) +static vmem_t *vmem_list; +static vmem_t *vmem_internal_arena; +static vmem_t *vmem_seg_arena; +static vmem_t *vmem_hash_arena; +static vmem_t *vmem_vmem_arena; + +vmem_t *vmem_heap; +vmem_alloc_t *vmem_heap_alloc; +vmem_free_t *vmem_heap_free; + +uint32_t vmem_mtbf; /* mean time between failures [default: off] */ +size_t vmem_seg_size = sizeof (vmem_seg_t); + +/* + * we use the _ version, since we don't want to be cancelled. + * Actually, this is automatically taken care of by including "mtlib.h". + */ +extern int _cond_wait(cond_t *cv, mutex_t *mutex); + +/* + * Insert/delete from arena list (type 'a') or next-of-kin list (type 'k'). + */ +#define VMEM_INSERT(vprev, vsp, type) \ +{ \ + vmem_seg_t *vnext = (vprev)->vs_##type##next; \ + (vsp)->vs_##type##next = (vnext); \ + (vsp)->vs_##type##prev = (vprev); \ + (vprev)->vs_##type##next = (vsp); \ + (vnext)->vs_##type##prev = (vsp); \ +} + +#define VMEM_DELETE(vsp, type) \ +{ \ + vmem_seg_t *vprev = (vsp)->vs_##type##prev; \ + vmem_seg_t *vnext = (vsp)->vs_##type##next; \ + (vprev)->vs_##type##next = (vnext); \ + (vnext)->vs_##type##prev = (vprev); \ +} + +/* + * Get a vmem_seg_t from the global segfree list. + */ +static vmem_seg_t * +vmem_getseg_global(void) +{ + vmem_seg_t *vsp; + + (void) mutex_lock(&vmem_segfree_lock); + if ((vsp = vmem_segfree) != NULL) + vmem_segfree = vsp->vs_knext; + (void) mutex_unlock(&vmem_segfree_lock); + + return (vsp); +} + +/* + * Put a vmem_seg_t on the global segfree list. 
+ */ +static void +vmem_putseg_global(vmem_seg_t *vsp) +{ + (void) mutex_lock(&vmem_segfree_lock); + vsp->vs_knext = vmem_segfree; + vmem_segfree = vsp; + (void) mutex_unlock(&vmem_segfree_lock); +} + +/* + * Get a vmem_seg_t from vmp's segfree list. + */ +static vmem_seg_t * +vmem_getseg(vmem_t *vmp) +{ + vmem_seg_t *vsp; + + ASSERT(vmp->vm_nsegfree > 0); + + vsp = vmp->vm_segfree; + vmp->vm_segfree = vsp->vs_knext; + vmp->vm_nsegfree--; + + return (vsp); +} + +/* + * Put a vmem_seg_t on vmp's segfree list. + */ +static void +vmem_putseg(vmem_t *vmp, vmem_seg_t *vsp) +{ + vsp->vs_knext = vmp->vm_segfree; + vmp->vm_segfree = vsp; + vmp->vm_nsegfree++; +} + +/* + * Add vsp to the appropriate freelist. + */ +static void +vmem_freelist_insert(vmem_t *vmp, vmem_seg_t *vsp) +{ + vmem_seg_t *vprev; + + ASSERT(*VMEM_HASH(vmp, vsp->vs_start) != vsp); + + vprev = (vmem_seg_t *)&vmp->vm_freelist[highbit(VS_SIZE(vsp)) - 1]; + vsp->vs_type = VMEM_FREE; + vmp->vm_freemap |= VS_SIZE(vprev); + VMEM_INSERT(vprev, vsp, k); + + (void) cond_broadcast(&vmp->vm_cv); +} + +/* + * Take vsp from the freelist. + */ +static void +vmem_freelist_delete(vmem_t *vmp, vmem_seg_t *vsp) +{ + ASSERT(*VMEM_HASH(vmp, vsp->vs_start) != vsp); + ASSERT(vsp->vs_type == VMEM_FREE); + + if (vsp->vs_knext->vs_start == 0 && vsp->vs_kprev->vs_start == 0) { + /* + * The segments on both sides of 'vsp' are freelist heads, + * so taking vsp leaves the freelist at vsp->vs_kprev empty. + */ + ASSERT(vmp->vm_freemap & VS_SIZE(vsp->vs_kprev)); + vmp->vm_freemap ^= VS_SIZE(vsp->vs_kprev); + } + VMEM_DELETE(vsp, k); +} + +/* + * Add vsp to the allocated-segment hash table and update kstats. + */ +static void +vmem_hash_insert(vmem_t *vmp, vmem_seg_t *vsp) +{ + vmem_seg_t **bucket; + + vsp->vs_type = VMEM_ALLOC; + bucket = VMEM_HASH(vmp, vsp->vs_start); + vsp->vs_knext = *bucket; + *bucket = vsp; + + if (vmem_seg_size == sizeof (vmem_seg_t)) { + vsp->vs_depth = (uint8_t)getpcstack(vsp->vs_stack, + VMEM_STACK_DEPTH, 0); + vsp->vs_thread = thr_self(); + vsp->vs_timestamp = gethrtime(); + } else { + vsp->vs_depth = 0; + } + + vmp->vm_kstat.vk_alloc++; + vmp->vm_kstat.vk_mem_inuse += VS_SIZE(vsp); +} + +/* + * Remove vsp from the allocated-segment hash table and update kstats. + */ +static vmem_seg_t * +vmem_hash_delete(vmem_t *vmp, uintptr_t addr, size_t size) +{ + vmem_seg_t *vsp, **prev_vspp; + + prev_vspp = VMEM_HASH(vmp, addr); + while ((vsp = *prev_vspp) != NULL) { + if (vsp->vs_start == addr) { + *prev_vspp = vsp->vs_knext; + break; + } + vmp->vm_kstat.vk_lookup++; + prev_vspp = &vsp->vs_knext; + } + + if (vsp == NULL) { + umem_panic("vmem_hash_delete(%p, %lx, %lu): bad free", + vmp, addr, size); + } + if (VS_SIZE(vsp) != size) { + umem_panic("vmem_hash_delete(%p, %lx, %lu): wrong size " + "(expect %lu)", vmp, addr, size, VS_SIZE(vsp)); + } + + vmp->vm_kstat.vk_free++; + vmp->vm_kstat.vk_mem_inuse -= size; + + return (vsp); +} + +/* + * Create a segment spanning the range [start, end) and add it to the arena. + */ +static vmem_seg_t * +vmem_seg_create(vmem_t *vmp, vmem_seg_t *vprev, uintptr_t start, uintptr_t end) +{ + vmem_seg_t *newseg = vmem_getseg(vmp); + + newseg->vs_start = start; + newseg->vs_end = end; + newseg->vs_type = 0; + newseg->vs_import = 0; + + VMEM_INSERT(vprev, newseg, a); + + return (newseg); +} + +/* + * Remove segment vsp from the arena. 
+ */ +static void +vmem_seg_destroy(vmem_t *vmp, vmem_seg_t *vsp) +{ + ASSERT(vsp->vs_type != VMEM_ROTOR); + VMEM_DELETE(vsp, a); + + vmem_putseg(vmp, vsp); +} + +/* + * Add the span [vaddr, vaddr + size) to vmp and update kstats. + */ +static vmem_seg_t * +vmem_span_create(vmem_t *vmp, void *vaddr, size_t size, uint8_t import) +{ + vmem_seg_t *knext; + vmem_seg_t *newseg, *span; + uintptr_t start = (uintptr_t)vaddr; + uintptr_t end = start + size; + + knext = &vmp->vm_seg0; + if (!import && vmp->vm_source_alloc == NULL) { + vmem_seg_t *kend, *kprev; + /* + * non-imported spans are sorted in address order. This + * makes vmem_extend_unlocked() much more effective. + * + * We search in reverse order, since new spans are + * generally at higher addresses. + */ + kend = &vmp->vm_seg0; + for (kprev = kend->vs_kprev; kprev != kend; + kprev = kprev->vs_kprev) { + if (!kprev->vs_import && (kprev->vs_end - 1) < start) + break; + } + knext = kprev->vs_knext; + } + + ASSERT(MUTEX_HELD(&vmp->vm_lock)); + + if ((start | end) & (vmp->vm_quantum - 1)) { + umem_panic("vmem_span_create(%p, %p, %lu): misaligned", + vmp, vaddr, size); + } + + span = vmem_seg_create(vmp, knext->vs_aprev, start, end); + span->vs_type = VMEM_SPAN; + VMEM_INSERT(knext->vs_kprev, span, k); + + newseg = vmem_seg_create(vmp, span, start, end); + vmem_freelist_insert(vmp, newseg); + + newseg->vs_import = import; + if (import) + vmp->vm_kstat.vk_mem_import += size; + vmp->vm_kstat.vk_mem_total += size; + + return (newseg); +} + +/* + * Remove span vsp from vmp and update kstats. + */ +static void +vmem_span_destroy(vmem_t *vmp, vmem_seg_t *vsp) +{ + vmem_seg_t *span = vsp->vs_aprev; + size_t size = VS_SIZE(vsp); + + ASSERT(MUTEX_HELD(&vmp->vm_lock)); + ASSERT(span->vs_type == VMEM_SPAN); + + if (vsp->vs_import) + vmp->vm_kstat.vk_mem_import -= size; + vmp->vm_kstat.vk_mem_total -= size; + + VMEM_DELETE(span, k); + + vmem_seg_destroy(vmp, vsp); + vmem_seg_destroy(vmp, span); +} + +/* + * Allocate the subrange [addr, addr + size) from segment vsp. + * If there are leftovers on either side, place them on the freelist. + * Returns a pointer to the segment representing [addr, addr + size). + */ +static vmem_seg_t * +vmem_seg_alloc(vmem_t *vmp, vmem_seg_t *vsp, uintptr_t addr, size_t size) +{ + uintptr_t vs_start = vsp->vs_start; + uintptr_t vs_end = vsp->vs_end; + size_t vs_size = vs_end - vs_start; + size_t realsize = P2ROUNDUP(size, vmp->vm_quantum); + uintptr_t addr_end = addr + realsize; + + ASSERT(P2PHASE(vs_start, vmp->vm_quantum) == 0); + ASSERT(P2PHASE(addr, vmp->vm_quantum) == 0); + ASSERT(vsp->vs_type == VMEM_FREE); + ASSERT(addr >= vs_start && addr_end - 1 <= vs_end - 1); + ASSERT(addr - 1 <= addr_end - 1); + + /* + * If we're allocating from the start of the segment, and the + * remainder will be on the same freelist, we can save quite + * a bit of work. 
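+ *
+ * For example, carving 1K off the front of a 12K free segment leaves
+ * an 11K remainder; 12K and 11K share the same high bit (both sit on
+ * the [8K, 16K) freelist), so the free segment stays where it is and
+ * only its start address is advanced.  Carving 8K instead would leave
+ * a 4K remainder, which belongs on a different freelist and must take
+ * the general path below.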
+ */ + if (P2SAMEHIGHBIT(vs_size, vs_size - realsize) && addr == vs_start) { + ASSERT(highbit(vs_size) == highbit(vs_size - realsize)); + vsp->vs_start = addr_end; + vsp = vmem_seg_create(vmp, vsp->vs_aprev, addr, addr + size); + vmem_hash_insert(vmp, vsp); + return (vsp); + } + + vmem_freelist_delete(vmp, vsp); + + if (vs_end != addr_end) + vmem_freelist_insert(vmp, + vmem_seg_create(vmp, vsp, addr_end, vs_end)); + + if (vs_start != addr) + vmem_freelist_insert(vmp, + vmem_seg_create(vmp, vsp->vs_aprev, vs_start, addr)); + + vsp->vs_start = addr; + vsp->vs_end = addr + size; + + vmem_hash_insert(vmp, vsp); + return (vsp); +} + +/* + * We cannot reap if we are in the middle of a vmem_populate(). + */ +void +vmem_reap(void) +{ + if (!IN_POPULATE()) + umem_reap(); +} + +/* + * Populate vmp's segfree list with VMEM_MINFREE vmem_seg_t structures. + */ +static int +vmem_populate(vmem_t *vmp, int vmflag) +{ + char *p; + vmem_seg_t *vsp; + ssize_t nseg; + size_t size; + vmem_populate_lock_t *lp; + int i; + + while (vmp->vm_nsegfree < VMEM_MINFREE && + (vsp = vmem_getseg_global()) != NULL) + vmem_putseg(vmp, vsp); + + if (vmp->vm_nsegfree >= VMEM_MINFREE) + return (1); + + /* + * If we're already populating, tap the reserve. + */ + if (vmem_nosleep_lock.vmpl_thr == thr_self()) { + ASSERT(vmp->vm_cflags & VMC_POPULATOR); + return (1); + } + + (void) mutex_unlock(&vmp->vm_lock); + + ASSERT(vmflag & VM_NOSLEEP); /* we do not allow sleep allocations */ + lp = &vmem_nosleep_lock; + + /* + * Cannot be just a mutex_lock(), since that has no effect if + * libthread is not linked. + */ + (void) mutex_lock(&lp->vmpl_mutex); + ASSERT(lp->vmpl_thr == 0); + lp->vmpl_thr = thr_self(); + + nseg = VMEM_MINFREE + vmem_populators * VMEM_POPULATE_RESERVE; + size = P2ROUNDUP(nseg * vmem_seg_size, vmem_seg_arena->vm_quantum); + nseg = size / vmem_seg_size; + + /* + * The following vmem_alloc() may need to populate vmem_seg_arena + * and all the things it imports from. When doing so, it will tap + * each arena's reserve to prevent recursion (see the block comment + * above the definition of VMEM_POPULATE_RESERVE). + * + * During this allocation, vmem_reap() is a no-op. If the allocation + * fails, we call vmem_reap() after dropping the population lock. + */ + p = vmem_alloc(vmem_seg_arena, size, vmflag & VM_UMFLAGS); + if (p == NULL) { + lp->vmpl_thr = 0; + (void) mutex_unlock(&lp->vmpl_mutex); + vmem_reap(); + + (void) mutex_lock(&vmp->vm_lock); + vmp->vm_kstat.vk_populate_fail++; + return (0); + } + /* + * Restock the arenas that may have been depleted during population. + */ + for (i = 0; i < vmem_populators; i++) { + (void) mutex_lock(&vmem_populator[i]->vm_lock); + while (vmem_populator[i]->vm_nsegfree < VMEM_POPULATE_RESERVE) + vmem_putseg(vmem_populator[i], + (vmem_seg_t *)(p + --nseg * vmem_seg_size)); + (void) mutex_unlock(&vmem_populator[i]->vm_lock); + } + + lp->vmpl_thr = 0; + (void) mutex_unlock(&lp->vmpl_mutex); + (void) mutex_lock(&vmp->vm_lock); + + /* + * Now take our own segments. + */ + ASSERT(nseg >= VMEM_MINFREE); + while (vmp->vm_nsegfree < VMEM_MINFREE) + vmem_putseg(vmp, (vmem_seg_t *)(p + --nseg * vmem_seg_size)); + + /* + * Give the remainder to charity. + */ + while (nseg > 0) + vmem_putseg_global((vmem_seg_t *)(p + --nseg * vmem_seg_size)); + + return (1); +} + +/* + * Advance a walker from its previous position to 'afterme'. + * Note: may drop and reacquire vmp->vm_lock. 
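+ *
+ * A walker is a placeholder segment threaded onto the arena's
+ * address-ordered list (the VM_NEXTFIT rotor is one such walker).
+ * Because a walker can sit between two free segments and keep them
+ * from coalescing, moving it may trigger that coalesce, and possibly
+ * the return of a now-complete imported span to the source arena.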
+ */ +static void +vmem_advance(vmem_t *vmp, vmem_seg_t *walker, vmem_seg_t *afterme) +{ + vmem_seg_t *vprev = walker->vs_aprev; + vmem_seg_t *vnext = walker->vs_anext; + vmem_seg_t *vsp = NULL; + + VMEM_DELETE(walker, a); + + if (afterme != NULL) + VMEM_INSERT(afterme, walker, a); + + /* + * The walker segment's presence may have prevented its neighbors + * from coalescing. If so, coalesce them now. + */ + if (vprev->vs_type == VMEM_FREE) { + if (vnext->vs_type == VMEM_FREE) { + ASSERT(vprev->vs_end == vnext->vs_start); + vmem_freelist_delete(vmp, vnext); + vmem_freelist_delete(vmp, vprev); + vprev->vs_end = vnext->vs_end; + vmem_freelist_insert(vmp, vprev); + vmem_seg_destroy(vmp, vnext); + } + vsp = vprev; + } else if (vnext->vs_type == VMEM_FREE) { + vsp = vnext; + } + + /* + * vsp could represent a complete imported span, + * in which case we must return it to the source. + */ + if (vsp != NULL && vsp->vs_import && vmp->vm_source_free != NULL && + vsp->vs_aprev->vs_type == VMEM_SPAN && + vsp->vs_anext->vs_type == VMEM_SPAN) { + void *vaddr = (void *)vsp->vs_start; + size_t size = VS_SIZE(vsp); + ASSERT(size == VS_SIZE(vsp->vs_aprev)); + vmem_freelist_delete(vmp, vsp); + vmem_span_destroy(vmp, vsp); + (void) mutex_unlock(&vmp->vm_lock); + vmp->vm_source_free(vmp->vm_source, vaddr, size); + (void) mutex_lock(&vmp->vm_lock); + } +} + +/* + * VM_NEXTFIT allocations deliberately cycle through all virtual addresses + * in an arena, so that we avoid reusing addresses for as long as possible. + * This helps to catch used-after-freed bugs. It's also the perfect policy + * for allocating things like process IDs, where we want to cycle through + * all values in order. + */ +static void * +vmem_nextfit_alloc(vmem_t *vmp, size_t size, int vmflag) +{ + vmem_seg_t *vsp, *rotor; + uintptr_t addr; + size_t realsize = P2ROUNDUP(size, vmp->vm_quantum); + size_t vs_size; + + (void) mutex_lock(&vmp->vm_lock); + + if (vmp->vm_nsegfree < VMEM_MINFREE && !vmem_populate(vmp, vmflag)) { + (void) mutex_unlock(&vmp->vm_lock); + return (NULL); + } + + /* + * The common case is that the segment right after the rotor is free, + * and large enough that extracting 'size' bytes won't change which + * freelist it's on. In this case we can avoid a *lot* of work. + * Instead of the normal vmem_seg_alloc(), we just advance the start + * address of the victim segment. Instead of moving the rotor, we + * create the new segment structure *behind the rotor*, which has + * the same effect. And finally, we know we don't have to coalesce + * the rotor's neighbors because the new segment lies between them. + */ + rotor = &vmp->vm_rotor; + vsp = rotor->vs_anext; + if (vsp->vs_type == VMEM_FREE && (vs_size = VS_SIZE(vsp)) > realsize && + P2SAMEHIGHBIT(vs_size, vs_size - realsize)) { + ASSERT(highbit(vs_size) == highbit(vs_size - realsize)); + addr = vsp->vs_start; + vsp->vs_start = addr + realsize; + vmem_hash_insert(vmp, + vmem_seg_create(vmp, rotor->vs_aprev, addr, addr + size)); + (void) mutex_unlock(&vmp->vm_lock); + return ((void *)addr); + } + + /* + * Starting at the rotor, look for a segment large enough to + * satisfy the allocation. + */ + for (;;) { + vmp->vm_kstat.vk_search++; + if (vsp->vs_type == VMEM_FREE && VS_SIZE(vsp) >= size) + break; + vsp = vsp->vs_anext; + if (vsp == rotor) { + /* + * We've come full circle. One possibility is that the + * there's actually enough space, but the rotor itself + * is preventing the allocation from succeeding because + * it's sitting between two free segments. 
Therefore, + * we advance the rotor and see if that liberates a + * suitable segment. + */ + vmem_advance(vmp, rotor, rotor->vs_anext); + vsp = rotor->vs_aprev; + if (vsp->vs_type == VMEM_FREE && VS_SIZE(vsp) >= size) + break; + /* + * If there's a lower arena we can import from, or it's + * a VM_NOSLEEP allocation, let vmem_xalloc() handle it. + * Otherwise, wait until another thread frees something. + */ + if (vmp->vm_source_alloc != NULL || + (vmflag & VM_NOSLEEP)) { + (void) mutex_unlock(&vmp->vm_lock); + return (vmem_xalloc(vmp, size, vmp->vm_quantum, + 0, 0, NULL, NULL, vmflag & VM_UMFLAGS)); + } + vmp->vm_kstat.vk_wait++; + (void) _cond_wait(&vmp->vm_cv, &vmp->vm_lock); + vsp = rotor->vs_anext; + } + } + + /* + * We found a segment. Extract enough space to satisfy the allocation. + */ + addr = vsp->vs_start; + vsp = vmem_seg_alloc(vmp, vsp, addr, size); + ASSERT(vsp->vs_type == VMEM_ALLOC && + vsp->vs_start == addr && vsp->vs_end == addr + size); + + /* + * Advance the rotor to right after the newly-allocated segment. + * That's where the next VM_NEXTFIT allocation will begin searching. + */ + vmem_advance(vmp, rotor, vsp); + (void) mutex_unlock(&vmp->vm_lock); + return ((void *)addr); +} + +/* + * Allocate size bytes at offset phase from an align boundary such that the + * resulting segment [addr, addr + size) is a subset of [minaddr, maxaddr) + * that does not straddle a nocross-aligned boundary. + */ +void * +vmem_xalloc(vmem_t *vmp, size_t size, size_t align, size_t phase, + size_t nocross, void *minaddr, void *maxaddr, int vmflag) +{ + vmem_seg_t *vsp; + vmem_seg_t *vbest = NULL; + uintptr_t addr, taddr, start, end; + void *vaddr; + int hb, flist, resv; + uint32_t mtbf; + + if (phase > 0 && phase >= align) + umem_panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): " + "invalid phase", + (void *)vmp, size, align, phase, nocross, + minaddr, maxaddr, vmflag); + + if (align == 0) + align = vmp->vm_quantum; + + if ((align | phase | nocross) & (vmp->vm_quantum - 1)) { + umem_panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): " + "parameters not vm_quantum aligned", + (void *)vmp, size, align, phase, nocross, + minaddr, maxaddr, vmflag); + } + + if (nocross != 0 && + (align > nocross || P2ROUNDUP(phase + size, align) > nocross)) { + umem_panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): " + "overconstrained allocation", + (void *)vmp, size, align, phase, nocross, + minaddr, maxaddr, vmflag); + } + + if ((mtbf = vmem_mtbf | vmp->vm_mtbf) != 0 && gethrtime() % mtbf == 0 && + (vmflag & (VM_NOSLEEP | VM_PANIC)) == VM_NOSLEEP) + return (NULL); + + (void) mutex_lock(&vmp->vm_lock); + for (;;) { + if (vmp->vm_nsegfree < VMEM_MINFREE && + !vmem_populate(vmp, vmflag)) + break; + + /* + * highbit() returns the highest bit + 1, which is exactly + * what we want: we want to search the first freelist whose + * members are *definitely* large enough to satisfy our + * allocation. However, there are certain cases in which we + * want to look at the next-smallest freelist (which *might* + * be able to satisfy the allocation): + * + * (1) The size is exactly a power of 2, in which case + * the smaller freelist is always big enough; + * + * (2) All other freelists are empty; + * + * (3) We're in the highest possible freelist, which is + * always empty (e.g. the 4GB freelist on 32-bit systems); + * + * (4) We're doing a best-fit or first-fit allocation. 
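+ *
+ * For example, if size is exactly 8192, lowbit(P2ALIGN(vm_freemap,
+ * 8192)) picks the lowest non-empty freelist whose segments are all
+ * at least 8K.  If size is 12288, highbit(size) points at the
+ * [16K, 32K) freelist, whose members are certainly large enough; only
+ * when that freelist and everything above it are empty, when it is the
+ * highest (always empty) freelist, or when VM_BESTFIT/VM_FIRSTFIT was
+ * requested do we step down to the [8K, 16K) freelist and test its
+ * candidates individually in the loop below.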
+ */ + if ((size & (size - 1)) == 0) { + flist = lowbit(P2ALIGN(vmp->vm_freemap, size)); + } else { + hb = highbit(size); + if ((vmp->vm_freemap >> hb) == 0 || + hb == VMEM_FREELISTS || + (vmflag & (VM_BESTFIT | VM_FIRSTFIT))) + hb--; + flist = lowbit(P2ALIGN(vmp->vm_freemap, 1UL << hb)); + } + + for (vbest = NULL, vsp = (flist == 0) ? NULL : + vmp->vm_freelist[flist - 1].vs_knext; + vsp != NULL; vsp = vsp->vs_knext) { + vmp->vm_kstat.vk_search++; + if (vsp->vs_start == 0) { + /* + * We're moving up to a larger freelist, + * so if we've already found a candidate, + * the fit can't possibly get any better. + */ + if (vbest != NULL) + break; + /* + * Find the next non-empty freelist. + */ + flist = lowbit(P2ALIGN(vmp->vm_freemap, + VS_SIZE(vsp))); + if (flist-- == 0) + break; + vsp = (vmem_seg_t *)&vmp->vm_freelist[flist]; + ASSERT(vsp->vs_knext->vs_type == VMEM_FREE); + continue; + } + if (vsp->vs_end - 1 < (uintptr_t)minaddr) + continue; + if (vsp->vs_start > (uintptr_t)maxaddr - 1) + continue; + start = MAX(vsp->vs_start, (uintptr_t)minaddr); + end = MIN(vsp->vs_end - 1, (uintptr_t)maxaddr - 1) + 1; + taddr = P2PHASEUP(start, align, phase); + if (P2CROSS(taddr, taddr + size - 1, nocross)) + taddr += + P2ROUNDUP(P2NPHASE(taddr, nocross), align); + if ((taddr - start) + size > end - start || + (vbest != NULL && VS_SIZE(vsp) >= VS_SIZE(vbest))) + continue; + vbest = vsp; + addr = taddr; + if (!(vmflag & VM_BESTFIT) || VS_SIZE(vbest) == size) + break; + } + if (vbest != NULL) + break; + if (size == 0) + umem_panic("vmem_xalloc(): size == 0"); + if (vmp->vm_source_alloc != NULL && nocross == 0 && + minaddr == NULL && maxaddr == NULL) { + size_t asize = P2ROUNDUP(size + phase, + MAX(align, vmp->vm_source->vm_quantum)); + if (asize < size) { /* overflow */ + (void) mutex_unlock(&vmp->vm_lock); + if (vmflag & VM_NOSLEEP) + return (NULL); + + umem_panic("vmem_xalloc(): " + "overflow on VM_SLEEP allocation"); + } + /* + * Determine how many segment structures we'll consume. + * The calculation must be presise because if we're + * here on behalf of vmem_populate(), we are taking + * segments from a very limited reserve. + */ + resv = (size == asize) ? 
+ VMEM_SEGS_PER_SPAN_CREATE + + VMEM_SEGS_PER_EXACT_ALLOC : + VMEM_SEGS_PER_ALLOC_MAX; + ASSERT(vmp->vm_nsegfree >= resv); + vmp->vm_nsegfree -= resv; /* reserve our segs */ + (void) mutex_unlock(&vmp->vm_lock); + vaddr = vmp->vm_source_alloc(vmp->vm_source, asize, + vmflag & VM_UMFLAGS); + (void) mutex_lock(&vmp->vm_lock); + vmp->vm_nsegfree += resv; /* claim reservation */ + if (vaddr != NULL) { + vbest = vmem_span_create(vmp, vaddr, asize, 1); + addr = P2PHASEUP(vbest->vs_start, align, phase); + break; + } + } + (void) mutex_unlock(&vmp->vm_lock); + vmem_reap(); + (void) mutex_lock(&vmp->vm_lock); + if (vmflag & VM_NOSLEEP) + break; + vmp->vm_kstat.vk_wait++; + (void) _cond_wait(&vmp->vm_cv, &vmp->vm_lock); + } + if (vbest != NULL) { + ASSERT(vbest->vs_type == VMEM_FREE); + ASSERT(vbest->vs_knext != vbest); + (void) vmem_seg_alloc(vmp, vbest, addr, size); + (void) mutex_unlock(&vmp->vm_lock); + ASSERT(P2PHASE(addr, align) == phase); + ASSERT(!P2CROSS(addr, addr + size - 1, nocross)); + ASSERT(addr >= (uintptr_t)minaddr); + ASSERT(addr + size - 1 <= (uintptr_t)maxaddr - 1); + return ((void *)addr); + } + vmp->vm_kstat.vk_fail++; + (void) mutex_unlock(&vmp->vm_lock); + if (vmflag & VM_PANIC) + umem_panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): " + "cannot satisfy mandatory allocation", + (void *)vmp, size, align, phase, nocross, + minaddr, maxaddr, vmflag); + return (NULL); +} + +/* + * Free the segment [vaddr, vaddr + size), where vaddr was a constrained + * allocation. vmem_xalloc() and vmem_xfree() must always be paired because + * both routines bypass the quantum caches. + */ +void +vmem_xfree(vmem_t *vmp, void *vaddr, size_t size) +{ + vmem_seg_t *vsp, *vnext, *vprev; + + (void) mutex_lock(&vmp->vm_lock); + + vsp = vmem_hash_delete(vmp, (uintptr_t)vaddr, size); + vsp->vs_end = P2ROUNDUP(vsp->vs_end, vmp->vm_quantum); + + /* + * Attempt to coalesce with the next segment. + */ + vnext = vsp->vs_anext; + if (vnext->vs_type == VMEM_FREE) { + ASSERT(vsp->vs_end == vnext->vs_start); + vmem_freelist_delete(vmp, vnext); + vsp->vs_end = vnext->vs_end; + vmem_seg_destroy(vmp, vnext); + } + + /* + * Attempt to coalesce with the previous segment. + */ + vprev = vsp->vs_aprev; + if (vprev->vs_type == VMEM_FREE) { + ASSERT(vprev->vs_end == vsp->vs_start); + vmem_freelist_delete(vmp, vprev); + vprev->vs_end = vsp->vs_end; + vmem_seg_destroy(vmp, vsp); + vsp = vprev; + } + + /* + * If the entire span is free, return it to the source. + */ + if (vsp->vs_import && vmp->vm_source_free != NULL && + vsp->vs_aprev->vs_type == VMEM_SPAN && + vsp->vs_anext->vs_type == VMEM_SPAN) { + vaddr = (void *)vsp->vs_start; + size = VS_SIZE(vsp); + ASSERT(size == VS_SIZE(vsp->vs_aprev)); + vmem_span_destroy(vmp, vsp); + (void) mutex_unlock(&vmp->vm_lock); + vmp->vm_source_free(vmp->vm_source, vaddr, size); + } else { + vmem_freelist_insert(vmp, vsp); + (void) mutex_unlock(&vmp->vm_lock); + } +} + +/* + * Allocate size bytes from arena vmp. Returns the allocated address + * on success, NULL on failure. vmflag specifies VM_SLEEP or VM_NOSLEEP, + * and may also specify best-fit, first-fit, or next-fit allocation policy + * instead of the default instant-fit policy. VM_SLEEP allocations are + * guaranteed to succeed. 
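+ *
+ * [Editorial illustration, not part of the original source: a caller
+ * typically pairs the two routines as
+ *
+ *     void *p = vmem_alloc(vmp, 8192, VM_NOSLEEP);
+ *     if (p != NULL) {
+ *             ...
+ *             vmem_free(vmp, p, 8192);
+ *     }
+ *
+ * where the size passed to vmem_free() must equal the size that was
+ * allocated, since the segment is looked up by (address, size).]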
+ */ +void * +vmem_alloc(vmem_t *vmp, size_t size, int vmflag) +{ + vmem_seg_t *vsp; + uintptr_t addr; + int hb; + int flist = 0; + uint32_t mtbf; + + if (size - 1 < vmp->vm_qcache_max) { + ASSERT(vmflag & VM_NOSLEEP); + return (_umem_cache_alloc(vmp->vm_qcache[(size - 1) >> + vmp->vm_qshift], UMEM_DEFAULT)); + } + + if ((mtbf = vmem_mtbf | vmp->vm_mtbf) != 0 && gethrtime() % mtbf == 0 && + (vmflag & (VM_NOSLEEP | VM_PANIC)) == VM_NOSLEEP) + return (NULL); + + if (vmflag & VM_NEXTFIT) + return (vmem_nextfit_alloc(vmp, size, vmflag)); + + if (vmflag & (VM_BESTFIT | VM_FIRSTFIT)) + return (vmem_xalloc(vmp, size, vmp->vm_quantum, 0, 0, + NULL, NULL, vmflag)); + + /* + * Unconstrained instant-fit allocation from the segment list. + */ + (void) mutex_lock(&vmp->vm_lock); + + if (vmp->vm_nsegfree >= VMEM_MINFREE || vmem_populate(vmp, vmflag)) { + if ((size & (size - 1)) == 0) + flist = lowbit(P2ALIGN(vmp->vm_freemap, size)); + else if ((hb = highbit(size)) < VMEM_FREELISTS) + flist = lowbit(P2ALIGN(vmp->vm_freemap, 1UL << hb)); + } + + if (flist-- == 0) { + (void) mutex_unlock(&vmp->vm_lock); + return (vmem_xalloc(vmp, size, vmp->vm_quantum, + 0, 0, NULL, NULL, vmflag)); + } + + ASSERT(size <= (1UL << flist)); + vsp = vmp->vm_freelist[flist].vs_knext; + addr = vsp->vs_start; + (void) vmem_seg_alloc(vmp, vsp, addr, size); + (void) mutex_unlock(&vmp->vm_lock); + return ((void *)addr); +} + +/* + * Free the segment [vaddr, vaddr + size). + */ +void +vmem_free(vmem_t *vmp, void *vaddr, size_t size) +{ + if (size - 1 < vmp->vm_qcache_max) + _umem_cache_free(vmp->vm_qcache[(size - 1) >> vmp->vm_qshift], + vaddr); + else + vmem_xfree(vmp, vaddr, size); +} + +/* + * Determine whether arena vmp contains the segment [vaddr, vaddr + size). + */ +int +vmem_contains(vmem_t *vmp, void *vaddr, size_t size) +{ + uintptr_t start = (uintptr_t)vaddr; + uintptr_t end = start + size; + vmem_seg_t *vsp; + vmem_seg_t *seg0 = &vmp->vm_seg0; + + (void) mutex_lock(&vmp->vm_lock); + vmp->vm_kstat.vk_contains++; + for (vsp = seg0->vs_knext; vsp != seg0; vsp = vsp->vs_knext) { + vmp->vm_kstat.vk_contains_search++; + ASSERT(vsp->vs_type == VMEM_SPAN); + if (start >= vsp->vs_start && end - 1 <= vsp->vs_end - 1) + break; + } + (void) mutex_unlock(&vmp->vm_lock); + return (vsp != seg0); +} + +/* + * Add the span [vaddr, vaddr + size) to arena vmp. + */ +void * +vmem_add(vmem_t *vmp, void *vaddr, size_t size, int vmflag) +{ + if (vaddr == NULL || size == 0) { + umem_panic("vmem_add(%p, %p, %lu): bad arguments", + vmp, vaddr, size); + } + + ASSERT(!vmem_contains(vmp, vaddr, size)); + + (void) mutex_lock(&vmp->vm_lock); + if (vmem_populate(vmp, vmflag)) + (void) vmem_span_create(vmp, vaddr, size, 0); + else + vaddr = NULL; + (void) cond_broadcast(&vmp->vm_cv); + (void) mutex_unlock(&vmp->vm_lock); + return (vaddr); +} + +/* + * Adds the address range [addr, endaddr) to arena vmp, by either: + * 1. joining two existing spans, [x, addr), and [endaddr, y) (which + * are in that order) into a single [x, y) span, + * 2. expanding an existing [x, addr) span to [x, endaddr), + * 3. expanding an existing [endaddr, x) span to [addr, x), or + * 4. creating a new [addr, endaddr) span. + * + * Called with vmp->vm_lock held, and a successful vmem_populate() completed. + * Cannot fail. Returns the new segment. + * + * NOTE: this algorithm is linear-time in the number of spans, but is + * constant-time when you are extending the last (highest-addressed) + * span. 
+ */
+static vmem_seg_t *
+vmem_extend_unlocked(vmem_t *vmp, uintptr_t addr, uintptr_t endaddr)
+{
+ vmem_seg_t *span;
+ vmem_seg_t *vsp;
+
+ vmem_seg_t *end = &vmp->vm_seg0;
+
+ ASSERT(MUTEX_HELD(&vmp->vm_lock));
+
+ /*
+ * the second "if" clause below relies on the direction of this search
+ */
+ for (span = end->vs_kprev; span != end; span = span->vs_kprev) {
+ if (span->vs_end == addr || span->vs_start == endaddr)
+ break;
+ }
+
+ if (span == end)
+ return (vmem_span_create(vmp, (void *)addr, endaddr - addr, 0));
+ if (span->vs_kprev->vs_end == addr && span->vs_start == endaddr) {
+ vmem_seg_t *prevspan = span->vs_kprev;
+ vmem_seg_t *nextseg = span->vs_anext;
+ vmem_seg_t *prevseg = span->vs_aprev;
+
+ /*
+ * prevspan becomes the span marker for the full range
+ */
+ prevspan->vs_end = span->vs_end;
+
+ /*
+ * Notionally, span becomes a free segment representing
+ * [addr, endaddr).
+ *
+ * However, if either of its neighbors is free, we coalesce
+ * by destroying span and changing the free segment.
+ */
+ if (prevseg->vs_type == VMEM_FREE &&
+ nextseg->vs_type == VMEM_FREE) {
+ /*
+ * coalesce both ways
+ */
+ ASSERT(prevseg->vs_end == addr &&
+ nextseg->vs_start == endaddr);
+
+ vmem_freelist_delete(vmp, prevseg);
+ prevseg->vs_end = nextseg->vs_end;
+
+ vmem_freelist_delete(vmp, nextseg);
+ VMEM_DELETE(span, k);
+ vmem_seg_destroy(vmp, nextseg);
+ vmem_seg_destroy(vmp, span);
+
+ vsp = prevseg;
+ } else if (prevseg->vs_type == VMEM_FREE) {
+ /*
+ * coalesce left
+ */
+ ASSERT(prevseg->vs_end == addr);
+
+ VMEM_DELETE(span, k);
+ vmem_seg_destroy(vmp, span);
+
+ vmem_freelist_delete(vmp, prevseg);
+ prevseg->vs_end = endaddr;
+
+ vsp = prevseg;
+ } else if (nextseg->vs_type == VMEM_FREE) {
+ /*
+ * coalesce right
+ */
+ ASSERT(nextseg->vs_start == endaddr);
+
+ VMEM_DELETE(span, k);
+ vmem_seg_destroy(vmp, span);
+
+ vmem_freelist_delete(vmp, nextseg);
+ nextseg->vs_start = addr;
+
+ vsp = nextseg;
+ } else {
+ /*
+ * cannot coalesce
+ */
+ VMEM_DELETE(span, k);
+ span->vs_start = addr;
+ span->vs_end = endaddr;
+
+ vsp = span;
+ }
+ } else if (span->vs_end == addr) {
+ vmem_seg_t *oldseg = span->vs_knext->vs_aprev;
+ span->vs_end = endaddr;
+
+ ASSERT(oldseg->vs_type != VMEM_SPAN);
+ if (oldseg->vs_type == VMEM_FREE) {
+ ASSERT(oldseg->vs_end == addr);
+ vmem_freelist_delete(vmp, oldseg);
+ oldseg->vs_end = endaddr;
+ vsp = oldseg;
+ } else
+ vsp = vmem_seg_create(vmp, oldseg, addr, endaddr);
+ } else {
+ vmem_seg_t *oldseg = span->vs_anext;
+ ASSERT(span->vs_start == endaddr);
+ span->vs_start = addr;
+
+ ASSERT(oldseg->vs_type != VMEM_SPAN);
+ if (oldseg->vs_type == VMEM_FREE) {
+ ASSERT(oldseg->vs_start == endaddr);
+ vmem_freelist_delete(vmp, oldseg);
+ oldseg->vs_start = addr;
+ vsp = oldseg;
+ } else
+ vsp = vmem_seg_create(vmp, span, addr, endaddr);
+ }
+ vmem_freelist_insert(vmp, vsp);
+ vmp->vm_kstat.vk_mem_total += (endaddr - addr);
+ return (vsp);
+}
+
+/*
+ * Does some error checking, calls vmem_extend_unlocked to add
+ * [vaddr, vaddr+size) to vmp, then allocates alloc bytes from the
+ * newly merged segment.
+ */ +void * +_vmem_extend_alloc(vmem_t *vmp, void *vaddr, size_t size, size_t alloc, + int vmflag) +{ + uintptr_t addr = (uintptr_t)vaddr; + uintptr_t endaddr = addr + size; + vmem_seg_t *vsp; + + ASSERT(vaddr != NULL && size != 0 && endaddr > addr); + ASSERT(alloc <= size && alloc != 0); + ASSERT(((addr | size | alloc) & (vmp->vm_quantum - 1)) == 0); + + ASSERT(!vmem_contains(vmp, vaddr, size)); + + (void) mutex_lock(&vmp->vm_lock); + if (!vmem_populate(vmp, vmflag)) { + (void) mutex_unlock(&vmp->vm_lock); + return (NULL); + } + /* + * if there is a source, we can't mess with the spans + */ + if (vmp->vm_source_alloc != NULL) + vsp = vmem_span_create(vmp, vaddr, size, 0); + else + vsp = vmem_extend_unlocked(vmp, addr, endaddr); + + ASSERT(VS_SIZE(vsp) >= alloc); + + addr = vsp->vs_start; + (void) vmem_seg_alloc(vmp, vsp, addr, alloc); + vaddr = (void *)addr; + + (void) cond_broadcast(&vmp->vm_cv); + (void) mutex_unlock(&vmp->vm_lock); + + return (vaddr); +} + +/* + * Walk the vmp arena, applying func to each segment matching typemask. + * If VMEM_REENTRANT is specified, the arena lock is dropped across each + * call to func(); otherwise, it is held for the duration of vmem_walk() + * to ensure a consistent snapshot. Note that VMEM_REENTRANT callbacks + * are *not* necessarily consistent, so they may only be used when a hint + * is adequate. + */ +void +vmem_walk(vmem_t *vmp, int typemask, + void (*func)(void *, void *, size_t), void *arg) +{ + vmem_seg_t *vsp; + vmem_seg_t *seg0 = &vmp->vm_seg0; + vmem_seg_t walker; + + if (typemask & VMEM_WALKER) + return; + + bzero(&walker, sizeof (walker)); + walker.vs_type = VMEM_WALKER; + + (void) mutex_lock(&vmp->vm_lock); + VMEM_INSERT(seg0, &walker, a); + for (vsp = seg0->vs_anext; vsp != seg0; vsp = vsp->vs_anext) { + if (vsp->vs_type & typemask) { + void *start = (void *)vsp->vs_start; + size_t size = VS_SIZE(vsp); + if (typemask & VMEM_REENTRANT) { + vmem_advance(vmp, &walker, vsp); + (void) mutex_unlock(&vmp->vm_lock); + func(arg, start, size); + (void) mutex_lock(&vmp->vm_lock); + vsp = &walker; + } else { + func(arg, start, size); + } + } + } + vmem_advance(vmp, &walker, NULL); + (void) mutex_unlock(&vmp->vm_lock); +} + +/* + * Return the total amount of memory whose type matches typemask. Thus: + * + * typemask VMEM_ALLOC yields total memory allocated (in use). + * typemask VMEM_FREE yields total memory free (available). + * typemask (VMEM_ALLOC | VMEM_FREE) yields total arena size. + */ +size_t +vmem_size(vmem_t *vmp, int typemask) +{ + uint64_t size = 0; + + if (typemask & VMEM_ALLOC) + size += vmp->vm_kstat.vk_mem_inuse; + if (typemask & VMEM_FREE) + size += vmp->vm_kstat.vk_mem_total - + vmp->vm_kstat.vk_mem_inuse; + return ((size_t)size); +} + +/* + * Create an arena called name whose initial span is [base, base + size). + * The arena's natural unit of currency is quantum, so vmem_alloc() + * guarantees quantum-aligned results. The arena may import new spans + * by invoking afunc() on source, and may return those spans by invoking + * ffunc() on source. To make small allocations fast and scalable, + * the arena offers high-performance caching for each integer multiple + * of quantum up to qcache_max. 
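+ *
+ * [Editorial illustration, not part of the original source: the
+ * arenas built by vmem_init() below follow this pattern; for
+ * example, a hypothetical child arena importing from an existing
+ * arena "heap" could be created as
+ *
+ *     vmem_t *vmp = vmem_create("example_arena", NULL, 0,
+ *         heap_quantum, vmem_alloc, vmem_free, heap, 0, VM_SLEEP);
+ *
+ * passing NULL/0 so the arena starts empty, and 0 for qcache_max so
+ * no quantum caches are created; every request is then satisfied by
+ * importing spans from the source arena.]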
+ */ +vmem_t * +vmem_create(const char *name, void *base, size_t size, size_t quantum, + vmem_alloc_t *afunc, vmem_free_t *ffunc, vmem_t *source, + size_t qcache_max, int vmflag) +{ + int i; + size_t nqcache; + vmem_t *vmp, *cur, **vmpp; + vmem_seg_t *vsp; + vmem_freelist_t *vfp; + uint32_t id = atomic_add_32_nv(&vmem_id, 1); + + if (vmem_vmem_arena != NULL) { + vmp = vmem_alloc(vmem_vmem_arena, sizeof (vmem_t), + vmflag & VM_UMFLAGS); + } else { + ASSERT(id <= VMEM_INITIAL); + vmp = &vmem0[id - 1]; + } + + if (vmp == NULL) + return (NULL); + bzero(vmp, sizeof (vmem_t)); + + (void) snprintf(vmp->vm_name, VMEM_NAMELEN, "%s", name); + (void) mutex_init(&vmp->vm_lock, USYNC_THREAD, NULL); + (void) cond_init(&vmp->vm_cv, USYNC_THREAD, NULL); + vmp->vm_cflags = vmflag; + vmflag &= VM_UMFLAGS; + + vmp->vm_quantum = quantum; + vmp->vm_qshift = highbit(quantum) - 1; + nqcache = MIN(qcache_max >> vmp->vm_qshift, VMEM_NQCACHE_MAX); + + for (i = 0; i <= VMEM_FREELISTS; i++) { + vfp = &vmp->vm_freelist[i]; + vfp->vs_end = 1UL << i; + vfp->vs_knext = (vmem_seg_t *)(vfp + 1); + vfp->vs_kprev = (vmem_seg_t *)(vfp - 1); + } + + vmp->vm_freelist[0].vs_kprev = NULL; + vmp->vm_freelist[VMEM_FREELISTS].vs_knext = NULL; + vmp->vm_freelist[VMEM_FREELISTS].vs_end = 0; + vmp->vm_hash_table = vmp->vm_hash0; + vmp->vm_hash_mask = VMEM_HASH_INITIAL - 1; + vmp->vm_hash_shift = highbit(vmp->vm_hash_mask); + + vsp = &vmp->vm_seg0; + vsp->vs_anext = vsp; + vsp->vs_aprev = vsp; + vsp->vs_knext = vsp; + vsp->vs_kprev = vsp; + vsp->vs_type = VMEM_SPAN; + + vsp = &vmp->vm_rotor; + vsp->vs_type = VMEM_ROTOR; + VMEM_INSERT(&vmp->vm_seg0, vsp, a); + + vmp->vm_id = id; + if (source != NULL) + vmp->vm_kstat.vk_source_id = source->vm_id; + vmp->vm_source = source; + vmp->vm_source_alloc = afunc; + vmp->vm_source_free = ffunc; + + if (nqcache != 0) { + vmp->vm_qcache_max = nqcache << vmp->vm_qshift; + for (i = 0; i < nqcache; i++) { + char buf[VMEM_NAMELEN + 21]; + (void) snprintf(buf, sizeof (buf), "%s_%lu", + vmp->vm_name, (long)((i + 1) * quantum)); + vmp->vm_qcache[i] = umem_cache_create(buf, + (i + 1) * quantum, quantum, NULL, NULL, NULL, + NULL, vmp, UMC_QCACHE | UMC_NOTOUCH); + if (vmp->vm_qcache[i] == NULL) { + vmp->vm_qcache_max = i * quantum; + break; + } + } + } + + (void) mutex_lock(&vmem_list_lock); + vmpp = &vmem_list; + while ((cur = *vmpp) != NULL) + vmpp = &cur->vm_next; + *vmpp = vmp; + (void) mutex_unlock(&vmem_list_lock); + + if (vmp->vm_cflags & VMC_POPULATOR) { + uint_t pop_id = atomic_add_32_nv(&vmem_populators, 1); + ASSERT(pop_id <= VMEM_INITIAL); + vmem_populator[pop_id - 1] = vmp; + (void) mutex_lock(&vmp->vm_lock); + (void) vmem_populate(vmp, vmflag | VM_PANIC); + (void) mutex_unlock(&vmp->vm_lock); + } + + if ((base || size) && vmem_add(vmp, base, size, vmflag) == NULL) { + vmem_destroy(vmp); + return (NULL); + } + + return (vmp); +} + +/* + * Destroy arena vmp. 
+ */ +void +vmem_destroy(vmem_t *vmp) +{ + vmem_t *cur, **vmpp; + vmem_seg_t *seg0 = &vmp->vm_seg0; + vmem_seg_t *vsp; + size_t leaked; + int i; + + (void) mutex_lock(&vmem_list_lock); + vmpp = &vmem_list; + while ((cur = *vmpp) != vmp) + vmpp = &cur->vm_next; + *vmpp = vmp->vm_next; + (void) mutex_unlock(&vmem_list_lock); + + for (i = 0; i < VMEM_NQCACHE_MAX; i++) + if (vmp->vm_qcache[i]) + umem_cache_destroy(vmp->vm_qcache[i]); + + leaked = vmem_size(vmp, VMEM_ALLOC); + if (leaked != 0) + umem_printf("vmem_destroy('%s'): leaked %lu bytes", + vmp->vm_name, leaked); + + if (vmp->vm_hash_table != vmp->vm_hash0) + vmem_free(vmem_hash_arena, vmp->vm_hash_table, + (vmp->vm_hash_mask + 1) * sizeof (void *)); + + /* + * Give back the segment structures for anything that's left in the + * arena, e.g. the primary spans and their free segments. + */ + VMEM_DELETE(&vmp->vm_rotor, a); + for (vsp = seg0->vs_anext; vsp != seg0; vsp = vsp->vs_anext) + vmem_putseg_global(vsp); + + while (vmp->vm_nsegfree > 0) + vmem_putseg_global(vmem_getseg(vmp)); + + (void) mutex_destroy(&vmp->vm_lock); + (void) cond_destroy(&vmp->vm_cv); + vmem_free(vmem_vmem_arena, vmp, sizeof (vmem_t)); +} + +/* + * Resize vmp's hash table to keep the average lookup depth near 1.0. + */ +static void +vmem_hash_rescale(vmem_t *vmp) +{ + vmem_seg_t **old_table, **new_table, *vsp; + size_t old_size, new_size, h, nseg; + + nseg = (size_t)(vmp->vm_kstat.vk_alloc - vmp->vm_kstat.vk_free); + + new_size = MAX(VMEM_HASH_INITIAL, 1 << (highbit(3 * nseg + 4) - 2)); + old_size = vmp->vm_hash_mask + 1; + + if ((old_size >> 1) <= new_size && new_size <= (old_size << 1)) + return; + + new_table = vmem_alloc(vmem_hash_arena, new_size * sizeof (void *), + VM_NOSLEEP); + if (new_table == NULL) + return; + bzero(new_table, new_size * sizeof (void *)); + + (void) mutex_lock(&vmp->vm_lock); + + old_size = vmp->vm_hash_mask + 1; + old_table = vmp->vm_hash_table; + + vmp->vm_hash_mask = new_size - 1; + vmp->vm_hash_table = new_table; + vmp->vm_hash_shift = highbit(vmp->vm_hash_mask); + + for (h = 0; h < old_size; h++) { + vsp = old_table[h]; + while (vsp != NULL) { + uintptr_t addr = vsp->vs_start; + vmem_seg_t *next_vsp = vsp->vs_knext; + vmem_seg_t **hash_bucket = VMEM_HASH(vmp, addr); + vsp->vs_knext = *hash_bucket; + *hash_bucket = vsp; + vsp = next_vsp; + } + } + + (void) mutex_unlock(&vmp->vm_lock); + + if (old_table != vmp->vm_hash0) + vmem_free(vmem_hash_arena, old_table, + old_size * sizeof (void *)); +} + +/* + * Perform periodic maintenance on all vmem arenas. + */ +/*ARGSUSED*/ +void +vmem_update(void *dummy) +{ + vmem_t *vmp; + + (void) mutex_lock(&vmem_list_lock); + for (vmp = vmem_list; vmp != NULL; vmp = vmp->vm_next) { + /* + * If threads are waiting for resources, wake them up + * periodically so they can issue another vmem_reap() + * to reclaim resources cached by the slab allocator. + */ + (void) cond_broadcast(&vmp->vm_cv); + + /* + * Rescale the hash table to keep the hash chains short. + */ + vmem_hash_rescale(vmp); + } + (void) mutex_unlock(&vmem_list_lock); +} + +/* + * If vmem_init is called again, we need to be able to reset the world. + * That includes resetting the statics back to their original values. 
+ */ +void +vmem_startup(void) +{ +#ifdef UMEM_STANDALONE + vmem_id = 0; + vmem_populators = 0; + vmem_segfree = NULL; + vmem_list = NULL; + vmem_internal_arena = NULL; + vmem_seg_arena = NULL; + vmem_hash_arena = NULL; + vmem_vmem_arena = NULL; + vmem_heap = NULL; + vmem_heap_alloc = NULL; + vmem_heap_free = NULL; + + bzero(vmem0, sizeof (vmem0)); + bzero(vmem_populator, sizeof (vmem_populator)); + bzero(vmem_seg0, sizeof (vmem_seg0)); +#endif +} + +/* + * Prepare vmem for use. + */ +vmem_t * +vmem_init(const char *parent_name, size_t parent_quantum, + vmem_alloc_t *parent_alloc, vmem_free_t *parent_free, + const char *heap_name, void *heap_start, size_t heap_size, + size_t heap_quantum, vmem_alloc_t *heap_alloc, vmem_free_t *heap_free) +{ + uint32_t id; + int nseg = VMEM_SEG_INITIAL; + vmem_t *parent, *heap; + + ASSERT(vmem_internal_arena == NULL); + + while (--nseg >= 0) + vmem_putseg_global(&vmem_seg0[nseg]); + + if (parent_name != NULL) { + parent = vmem_create(parent_name, + heap_start, heap_size, parent_quantum, + NULL, NULL, NULL, 0, + VM_SLEEP | VMC_POPULATOR); + heap_start = NULL; + heap_size = 0; + } else { + ASSERT(parent_alloc == NULL && parent_free == NULL); + parent = NULL; + } + + heap = vmem_create(heap_name, + heap_start, heap_size, heap_quantum, + parent_alloc, parent_free, parent, 0, + VM_SLEEP | VMC_POPULATOR); + + vmem_heap = heap; + vmem_heap_alloc = heap_alloc; + vmem_heap_free = heap_free; + + vmem_internal_arena = vmem_create("vmem_internal", + NULL, 0, heap_quantum, + heap_alloc, heap_free, heap, 0, + VM_SLEEP | VMC_POPULATOR); + + vmem_seg_arena = vmem_create("vmem_seg", + NULL, 0, heap_quantum, + vmem_alloc, vmem_free, vmem_internal_arena, 0, + VM_SLEEP | VMC_POPULATOR); + + vmem_hash_arena = vmem_create("vmem_hash", + NULL, 0, 8, + vmem_alloc, vmem_free, vmem_internal_arena, 0, + VM_SLEEP); + + vmem_vmem_arena = vmem_create("vmem_vmem", + vmem0, sizeof (vmem0), 1, + vmem_alloc, vmem_free, vmem_internal_arena, 0, + VM_SLEEP); + + for (id = 0; id < vmem_id; id++) + (void) vmem_xalloc(vmem_vmem_arena, sizeof (vmem_t), + 1, 0, 0, &vmem0[id], &vmem0[id + 1], + VM_NOSLEEP | VM_BESTFIT | VM_PANIC); + + return (heap); +} + +void +vmem_no_debug(void) +{ + /* + * This size must be a multiple of the minimum required alignment, + * since vmem_populate allocates them compactly. + */ + vmem_seg_size = P2ROUNDUP(offsetof(vmem_seg_t, vs_thread), + sizeof (hrtime_t)); +} + +/* + * Lockup and release, for fork1(2) handling. + */ +void +vmem_lockup(void) +{ + vmem_t *cur; + + (void) mutex_lock(&vmem_list_lock); + (void) mutex_lock(&vmem_nosleep_lock.vmpl_mutex); + + /* + * Lock up and broadcast all arenas. + */ + for (cur = vmem_list; cur != NULL; cur = cur->vm_next) { + (void) mutex_lock(&cur->vm_lock); + (void) cond_broadcast(&cur->vm_cv); + } + + (void) mutex_lock(&vmem_segfree_lock); +} + +void +vmem_release(void) +{ + vmem_t *cur; + + (void) mutex_unlock(&vmem_nosleep_lock.vmpl_mutex); + + for (cur = vmem_list; cur != NULL; cur = cur->vm_next) + (void) mutex_unlock(&cur->vm_lock); + + (void) mutex_unlock(&vmem_segfree_lock); + (void) mutex_unlock(&vmem_list_lock); +} diff --git a/zfs/lib/libumem/vmem_base.c b/zfs/lib/libumem/vmem_base.c new file mode 100644 index 000000000..d43ecded4 --- /dev/null +++ b/zfs/lib/libumem/vmem_base.c @@ -0,0 +1,58 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). 
You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* #pragma ident "@(#)vmem_base.c 1.6 05/06/08 SMI" */ + +/* #include "mtlib.h" */ +#include "config.h" +#include "vmem_base.h" +#include "umem_base.h" + +uint_t vmem_backend = 0; + +vmem_t * +vmem_heap_arena(vmem_alloc_t **allocp, vmem_free_t **freep) +{ + static mutex_t arena_mutex = DEFAULTMUTEX; + + /* + * Allow the init thread through, block others until the init completes + */ + if (umem_ready != UMEM_READY && umem_init_thr != thr_self() && + umem_init() == 0) + return (NULL); + + (void) mutex_lock(&arena_mutex); + if (vmem_heap == NULL) + vmem_heap_init(); + (void) mutex_unlock(&arena_mutex); + + if (allocp != NULL) + *allocp = vmem_heap_alloc; + if (freep != NULL) + *freep = vmem_heap_free; + return (vmem_heap); +} diff --git a/zfs/lib/libumem/vmem_base.h b/zfs/lib/libumem/vmem_base.h new file mode 100644 index 000000000..06c1efc52 --- /dev/null +++ b/zfs/lib/libumem/vmem_base.h @@ -0,0 +1,85 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _VMEM_BASE_H +#define _VMEM_BASE_H + +/* #pragma ident "@(#)vmem_base.h 1.3 05/06/08 SMI" */ + +#include <sys/vmem.h> +#include <umem.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#include "misc.h" + +extern void vmem_startup(void); +extern vmem_t *vmem_init(const char *parent_name, size_t parent_quantum, + vmem_alloc_t *parent_alloc, vmem_free_t *parent_free, + const char *heap_name, + void *heap_start, size_t heap_size, size_t heap_quantum, + vmem_alloc_t *heap_alloc, vmem_free_t *heap_free); + +extern void *_vmem_extend_alloc(vmem_t *vmp, void *vaddr, size_t size, + size_t alloc, int vmflag); + +extern vmem_t *vmem_heap_arena(vmem_alloc_t **, vmem_free_t **); +extern void vmem_heap_init(void); + +extern vmem_t *vmem_sbrk_arena(vmem_alloc_t **, vmem_free_t **); +extern vmem_t *vmem_mmap_arena(vmem_alloc_t **, vmem_free_t **); +extern vmem_t *vmem_stand_arena(vmem_alloc_t **, vmem_free_t **); + +extern void vmem_update(void *); +extern void vmem_reap(void); /* vmem_populate()-safe reap */ + +extern size_t pagesize; +extern size_t vmem_sbrk_pagesize; + +extern uint_t vmem_backend; +#define VMEM_BACKEND_SBRK 0x0000001 +#define VMEM_BACKEND_MMAP 0x0000002 +#define VMEM_BACKEND_STAND 0x0000003 + +extern vmem_t *vmem_heap; +extern vmem_alloc_t *vmem_heap_alloc; +extern vmem_free_t *vmem_heap_free; + +extern void vmem_lockup(void); +extern void vmem_release(void); + +extern void vmem_sbrk_lockup(void); +extern void vmem_sbrk_release(void); + +extern void vmem_no_debug(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _VMEM_BASE_H */ diff --git a/zfs/lib/libumem/vmem_mmap.c b/zfs/lib/libumem/vmem_mmap.c new file mode 100644 index 000000000..f59e48dc1 --- /dev/null +++ b/zfs/lib/libumem/vmem_mmap.c @@ -0,0 +1,186 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2002 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Portions Copyright 2006 OmniTI, Inc. 
+ */ + +/* #pragma ident "@(#)vmem_mmap.c 1.2 05/06/08 SMI" */ + +#include "config.h" +#include <errno.h> + +#if HAVE_SYS_MMAN_H +#include <sys/mman.h> +#endif + +#ifdef HAVE_SYS_SYSMACROS_H +#include <sys/sysmacros.h> +#endif + +#include <unistd.h> + +#include "vmem_base.h" + +#define ALLOC_PROT PROT_READ | PROT_WRITE | PROT_EXEC +#define FREE_PROT PROT_NONE + +#define ALLOC_FLAGS MAP_PRIVATE | MAP_ANON +#define FREE_FLAGS MAP_PRIVATE | MAP_ANON | MAP_NORESERVE + +#ifdef MAP_ALIGN +#define CHUNKSIZE (64*1024) /* 64 kilobytes */ +#else +static size_t CHUNKSIZE; +#endif + +static vmem_t *mmap_heap; + +static void * +vmem_mmap_alloc(vmem_t *src, size_t size, int vmflags) +{ + void *ret; + int old_errno = errno; + + ret = vmem_alloc(src, size, vmflags); +#ifndef _WIN32 + if (ret != NULL + && + mmap(ret, size, ALLOC_PROT, ALLOC_FLAGS | MAP_FIXED, -1, 0) == + MAP_FAILED + ) { + vmem_free(src, ret, size); + vmem_reap(); + + ASSERT((vmflags & VM_NOSLEEP) == VM_NOSLEEP); + errno = old_errno; + return (NULL); + } +#endif + + errno = old_errno; + return (ret); +} + +static void +vmem_mmap_free(vmem_t *src, void *addr, size_t size) +{ + int old_errno = errno; +#ifdef _WIN32 + VirtualFree(addr, size, MEM_RELEASE); +#else + (void) mmap(addr, size, FREE_PROT, FREE_FLAGS | MAP_FIXED, -1, 0); +#endif + vmem_free(src, addr, size); + errno = old_errno; +} + +static void * +vmem_mmap_top_alloc(vmem_t *src, size_t size, int vmflags) +{ + void *ret; + void *buf; + int old_errno = errno; + + ret = vmem_alloc(src, size, VM_NOSLEEP); + + if (ret) { + errno = old_errno; + return (ret); + } + /* + * Need to grow the heap + */ +#ifdef _WIN32 + buf = VirtualAlloc(NULL, size, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE); + if (buf == NULL) buf = MAP_FAILED; +#else + buf = mmap( +#ifdef MAP_ALIGN + (void *)CHUNKSIZE, +#else + 0, +#endif + size, FREE_PROT, FREE_FLAGS +#ifdef MAP_ALIGN + | MAP_ALIGN +#endif + , -1, 0); +#endif + + if (buf != MAP_FAILED) { + ret = _vmem_extend_alloc(src, buf, size, size, vmflags); + if (ret != NULL) + return (ret); + else { + (void) munmap(buf, size); + errno = old_errno; + return (NULL); + } + } else { + /* + * Growing the heap failed. The allocation above will + * already have called umem_reap(). + */ + ASSERT((vmflags & VM_NOSLEEP) == VM_NOSLEEP); + + errno = old_errno; + return (NULL); + } +} + +vmem_t * +vmem_mmap_arena(vmem_alloc_t **a_out, vmem_free_t **f_out) +{ +#ifdef _WIN32 + SYSTEM_INFO info; + size_t pagesize; +#else + size_t pagesize = _sysconf(_SC_PAGESIZE); +#endif + +#ifdef _WIN32 + GetSystemInfo(&info); + pagesize = info.dwPageSize; + CHUNKSIZE = info.dwAllocationGranularity; +#elif !defined(MAP_ALIGN) + CHUNKSIZE = pagesize; +#endif + + if (mmap_heap == NULL) { + mmap_heap = vmem_init("mmap_top", + CHUNKSIZE, + vmem_mmap_top_alloc, vmem_free, + "mmap_heap", NULL, 0, pagesize, + vmem_mmap_alloc, vmem_mmap_free); + } + + if (a_out != NULL) + *a_out = vmem_mmap_alloc; + if (f_out != NULL) + *f_out = vmem_mmap_free; + + return (mmap_heap); +} diff --git a/zfs/lib/libumem/vmem_sbrk.c b/zfs/lib/libumem/vmem_sbrk.c new file mode 100644 index 000000000..a7c91bbbd --- /dev/null +++ b/zfs/lib/libumem/vmem_sbrk.c @@ -0,0 +1,326 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. 
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Portions Copyright 2006 OmniTI, Inc.
+ */
+
+/* #pragma ident "@(#)vmem_sbrk.c 1.4 05/06/08 SMI" */
+
+/*
+ * The structure of the sbrk backend:
+ *
+ * +-----------+
+ * | sbrk_top |
+ * +-----------+
+ * | (vmem_sbrk_alloc(), vmem_free())
+ * |
+ * +-----------+
+ * | sbrk_heap |
+ * +-----------+
+ * | | ... | (vmem_alloc(), vmem_free())
+ * <other arenas>
+ *
+ * The sbrk_top arena holds all controlled memory. vmem_sbrk_alloc() handles
+ * allocations from it, including growing the heap when we run low.
+ *
+ * Growing the heap is complicated by the fact that we have to extend the
+ * sbrk_top arena (using _vmem_extend_alloc()), and that can fail. Since
+ * other threads may be actively allocating, we can't return the memory.
+ *
+ * Instead, we put it on a doubly-linked list, sbrk_fails, which we search
+ * before calling sbrk().
+ */
+
+#include "config.h"
+/* #include "mtlib.h" */
+#include <errno.h>
+#include <limits.h>
+#ifdef HAVE_SYS_SYSMACROS_H
+#include <sys/sysmacros.h>
+#endif
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include "vmem_base.h"
+
+#include "misc.h"
+
+size_t vmem_sbrk_pagesize = 0; /* the preferred page size of the heap */
+
+#define MIN_ALLOC (64*1024)
+
+static size_t real_pagesize;
+static vmem_t *sbrk_heap;
+
+typedef struct sbrk_fail {
+ struct sbrk_fail *sf_next;
+ struct sbrk_fail *sf_prev;
+ void *sf_base; /* == the sbrk_fail's address */
+ size_t sf_size; /* the size of this buffer */
+} sbrk_fail_t;
+
+static sbrk_fail_t sbrk_fails = {
+ &sbrk_fails,
+ &sbrk_fails,
+ NULL,
+ 0
+};
+
+static mutex_t sbrk_faillock = DEFAULTMUTEX;
+
+/*
+ * _sbrk_grow_aligned() aligns the old break to a low_align boundary,
+ * adds min_size, aligns to a high_align boundary, and calls brk(2)
+ * to set the new break. The low_align-aligned value is returned, and
+ * the actual space allocated is returned through actual_size.
+ *
+ * Unlike sbrk(2), _sbrk_grow_aligned takes an unsigned size, and does
+ * not allow shrinking the heap.
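+ *
+ * [Editorial illustration with hypothetical numbers, not part of the
+ * original source: if the current break is 0x10010, then a call with
+ * min_size = 0x5000, low_align = 0x1000 and high_align = 0x10000
+ * rounds the break up to 0x11000 (the returned value), computes
+ * high_brk = 0x16000, sets the new break to 0x20000, and reports
+ * *actual_size = 0xf000.]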
+ */ +void * +_sbrk_grow_aligned(size_t min_size, size_t low_align, size_t high_align, + size_t *actual_size) +{ + uintptr_t old_brk; + uintptr_t ret_brk; + uintptr_t high_brk; + uintptr_t new_brk; + int brk_result; + +#define ALIGNSZ 16 +#define BRKALIGN(x) (caddr_t)P2ROUNDUP((uintptr_t)(x), ALIGNSZ) + + if ((low_align & (low_align - 1)) != 0 || + (high_align & (high_align - 1)) != 0) { + errno = EINVAL; + return ((void *)-1); + } + low_align = MAX(low_align, ALIGNSZ); + high_align = MAX(high_align, ALIGNSZ); + + old_brk = (uintptr_t)BRKALIGN(sbrk(0)); + ret_brk = P2ROUNDUP(old_brk, low_align); + high_brk = ret_brk + min_size; + new_brk = P2ROUNDUP(high_brk, high_align); + + /* + * Check for overflow + */ + if (ret_brk < old_brk || high_brk < ret_brk || new_brk < high_brk) { + errno = ENOMEM; + return ((void *)-1); + } + + brk_result = brk((void *)new_brk); + + if (brk_result != 0) + return ((void *)-1); + + if (actual_size != NULL) + *actual_size = (new_brk - ret_brk); + return ((void *)ret_brk); +} + +/* + * Try to extend src with [pos, pos + size). + * + * If it fails, add the block to the sbrk_fails list. + */ +static void * +vmem_sbrk_extend_alloc(vmem_t *src, void *pos, size_t size, size_t alloc, + int vmflags) +{ + sbrk_fail_t *fnext, *fprev, *fp; + void *ret; + + ret = _vmem_extend_alloc(src, pos, size, alloc, vmflags); + if (ret != NULL) + return (ret); + + fp = (sbrk_fail_t *)pos; + + ASSERT(sizeof (sbrk_fail_t) <= size); + + fp->sf_base = pos; + fp->sf_size = size; + + (void) mutex_lock(&sbrk_faillock); + fp->sf_next = fnext = &sbrk_fails; + fp->sf_prev = fprev = sbrk_fails.sf_prev; + fnext->sf_prev = fp; + fprev->sf_next = fp; + (void) mutex_unlock(&sbrk_faillock); + + return (NULL); +} + +/* + * Try to add at least size bytes to src, using the sbrk_fails list + */ +static void * +vmem_sbrk_tryfail(vmem_t *src, size_t size, int vmflags) +{ + sbrk_fail_t *fp; + + (void) mutex_lock(&sbrk_faillock); + for (fp = sbrk_fails.sf_next; fp != &sbrk_fails; fp = fp->sf_next) { + if (fp->sf_size >= size) { + fp->sf_next->sf_prev = fp->sf_prev; + fp->sf_prev->sf_next = fp->sf_next; + fp->sf_next = fp->sf_prev = NULL; + break; + } + } + (void) mutex_unlock(&sbrk_faillock); + + if (fp != &sbrk_fails) { + ASSERT(fp->sf_base == (void *)fp); + return (vmem_sbrk_extend_alloc(src, fp, fp->sf_size, size, + vmflags)); + } + /* + * nothing of the right size on the freelist + */ + return (NULL); +} + +static void * +vmem_sbrk_alloc(vmem_t *src, size_t size, int vmflags) +{ + extern void *_sbrk_grow_aligned(size_t min_size, size_t low_align, + size_t high_align, size_t *actual_size); + + void *ret; + void *buf; + size_t buf_size; + + int old_errno = errno; + + ret = vmem_alloc(src, size, VM_NOSLEEP); + if (ret != NULL) { + errno = old_errno; + return (ret); + } + + /* + * The allocation failed. We need to grow the heap. + * + * First, try to use any buffers which failed earlier. + */ + if (sbrk_fails.sf_next != &sbrk_fails && + (ret = vmem_sbrk_tryfail(src, size, vmflags)) != NULL) + return (ret); + + buf_size = MAX(size, MIN_ALLOC); + + /* + * buf_size gets overwritten with the actual allocated size + */ + buf = _sbrk_grow_aligned(buf_size, real_pagesize, vmem_sbrk_pagesize, + &buf_size); + + if (buf != MAP_FAILED) { + ret = vmem_sbrk_extend_alloc(src, buf, buf_size, size, vmflags); + if (ret != NULL) { + errno = old_errno; + return (ret); + } + } + + /* + * Growing the heap failed. The vmem_alloc() above called umem_reap(). 
+ */ + ASSERT((vmflags & VM_NOSLEEP) == VM_NOSLEEP); + + errno = old_errno; + return (NULL); +} + +/* + * fork1() support + */ +void +vmem_sbrk_lockup(void) +{ + (void) mutex_lock(&sbrk_faillock); +} + +void +vmem_sbrk_release(void) +{ + (void) mutex_unlock(&sbrk_faillock); +} + +vmem_t * +vmem_sbrk_arena(vmem_alloc_t **a_out, vmem_free_t **f_out) +{ + if (sbrk_heap == NULL) { + size_t heap_size; + + real_pagesize = sysconf(_SC_PAGESIZE); + + heap_size = vmem_sbrk_pagesize; + + if (issetugid()) { + heap_size = 0; + } else if (heap_size != 0 && !ISP2(heap_size)) { + heap_size = 0; + log_message("ignoring bad pagesize: 0x%p\n", heap_size); + } + if (heap_size <= real_pagesize) { + heap_size = real_pagesize; + } else { +#ifdef MHA_MAPSIZE_BSSBRK + struct memcntl_mha mha; + mha.mha_cmd = MHA_MAPSIZE_BSSBRK; + mha.mha_flags = 0; + mha.mha_pagesize = heap_size; + + if (memcntl(NULL, 0, MC_HAT_ADVISE, (char *)&mha, 0, 0) + == -1) { + log_message("unable to set MAPSIZE_BSSBRK to " + "0x%p\n", heap_size); + heap_size = real_pagesize; + } +#else + heap_size = real_pagesize; +#endif + } + vmem_sbrk_pagesize = heap_size; + + sbrk_heap = vmem_init("sbrk_top", real_pagesize, + vmem_sbrk_alloc, vmem_free, + "sbrk_heap", NULL, 0, real_pagesize, + vmem_alloc, vmem_free); + } + + if (a_out != NULL) + *a_out = vmem_alloc; + if (f_out != NULL) + *f_out = vmem_free; + + return (sbrk_heap); +} diff --git a/zfs/lib/libumem/vmem_stand.h b/zfs/lib/libumem/vmem_stand.h new file mode 100644 index 000000000..aeb8d11a3 --- /dev/null +++ b/zfs/lib/libumem/vmem_stand.h @@ -0,0 +1,49 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _VMEM_STAND_H +#define _VMEM_STAND_H + +/* #pragma ident "@(#)vmem_stand.h 1.3 05/06/08 SMI" */ + +/* + * additional functions defined by the standalone backend + */ + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern void vmem_stand_init(void); +extern int vmem_stand_add(caddr_t, size_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _VMEM_STAND_H */ |