diff options
Diffstat (limited to 'module')
-rw-r--r-- | module/Kbuild.in | 1 | ||||
-rw-r--r-- | module/Makefile.bsd | 12 | ||||
-rw-r--r-- | module/Makefile.in | 2 | ||||
-rw-r--r-- | module/zcommon/zfeature_common.c | 13 | ||||
-rw-r--r-- | module/zcommon/zfs_prop.c | 89 | ||||
-rw-r--r-- | module/zfs/arc.c | 76 | ||||
-rw-r--r-- | module/zfs/blkptr.c | 2 | ||||
-rw-r--r-- | module/zfs/dbuf.c | 8 | ||||
-rw-r--r-- | module/zfs/dmu.c | 6 | ||||
-rw-r--r-- | module/zfs/dmu_objset.c | 9 | ||||
-rw-r--r-- | module/zfs/dmu_recv.c | 20 | ||||
-rw-r--r-- | module/zfs/dmu_send.c | 22 | ||||
-rw-r--r-- | module/zfs/dsl_dataset.c | 85 | ||||
-rw-r--r-- | module/zfs/spa_misc.c | 1 | ||||
-rw-r--r-- | module/zfs/zfs_ioctl.c | 36 | ||||
-rw-r--r-- | module/zfs/zio.c | 18 | ||||
-rw-r--r-- | module/zfs/zio_compress.c | 95 | ||||
-rw-r--r-- | module/zstd/Makefile.in | 33 | ||||
-rw-r--r-- | module/zstd/README.md | 60 | ||||
-rw-r--r-- | module/zstd/include/aarch64_compat.h | 37 | ||||
-rw-r--r-- | module/zstd/include/limits.h | 63 | ||||
-rw-r--r-- | module/zstd/include/stddef.h | 62 | ||||
-rw-r--r-- | module/zstd/include/stdint.h | 62 | ||||
-rw-r--r-- | module/zstd/include/stdio.h | 54 | ||||
-rw-r--r-- | module/zstd/include/stdlib.h | 58 | ||||
-rw-r--r-- | module/zstd/include/string.h | 62 | ||||
-rw-r--r-- | module/zstd/zfs_zstd.c | 737 |
27 files changed, 1652 insertions, 71 deletions
diff --git a/module/Kbuild.in b/module/Kbuild.in index 031b5a9a8..1507965c5 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -9,6 +9,7 @@ ZFS_MODULES += nvpair/ ZFS_MODULES += unicode/ ZFS_MODULES += zcommon/ ZFS_MODULES += zfs/ +ZFS_MODULES += zstd/ # The rest is only relevant when run by kbuild ifneq ($(KERNELRELEASE),) diff --git a/module/Makefile.bsd b/module/Makefile.bsd index c6ace9fb5..d0b4a5bd6 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -16,7 +16,10 @@ KMOD= openzfs ${SRCDIR}/os/freebsd/zfs \ ${SRCDIR}/unicode \ ${SRCDIR}/zcommon \ - ${SRCDIR}/zfs + ${SRCDIR}/zfs \ + ${SRCDIR}/zstd \ + ${SRCDIR}/zstd/lib + CFLAGS+= -I${.OBJDIR:H}/include @@ -25,6 +28,7 @@ CFLAGS+= -I${INCDIR}/spl CFLAGS+= -I${INCDIR}/os/freebsd CFLAGS+= -I${INCDIR}/os/freebsd/spl CFLAGS+= -I${INCDIR}/os/freebsd/zfs +CFLAGS+= -I${SRCDIR}/zstd/include CFLAGS+= -include ${INCDIR}/os/freebsd/spl/sys/ccompile.h CFLAGS+= -D__KERNEL__ -DFREEBSD_NAMECACHE -DBUILDING_ZFS -D__BSD_VISIBLE=1 \ @@ -292,6 +296,10 @@ SRCS+= abd.c \ zthr.c \ zvol.c +#zstd +SRCS+= zfs_zstd.c \ + zstd.c + beforeinstall: .if ${MK_DEBUG_FILES} != "no" mtree -eu \ @@ -347,3 +355,5 @@ CFLAGS.zfs_ioctl.c= -Wno-cast-qual CFLAGS.zil.c= -Wno-cast-qual CFLAGS.zio.c= -Wno-cast-qual CFLAGS.zrlock.c= -Wno-cast-qual +CFLAGS.zfs_zstd.c= -Wno-cast-qual -Wno-pointer-arith +CFLAGS.zstd.c= -fno-tree-vectorize diff --git a/module/Makefile.in b/module/Makefile.in index 59b485d55..ead4ff136 100644 --- a/module/Makefile.in +++ b/module/Makefile.in @@ -2,7 +2,7 @@ include Kbuild INSTALL_MOD_DIR ?= extra -SUBDIR_TARGETS = icp lua +SUBDIR_TARGETS = icp lua zstd all: modules distclean maintainer-clean: clean diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index ed7967dc1..97ddacbab 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -25,6 +25,8 @@ * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014, Nexenta Systems, Inc. 
All rights reserved. * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, Allan Jude */ #ifndef _KERNEL @@ -576,6 +578,17 @@ zpool_feature_init(void) "org.openzfs:device_rebuild", "device_rebuild", "Support for sequential device rebuilds", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL); + + { + static const spa_feature_t zstd_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_ZSTD_COMPRESS, + "org.freebsd:zstd_compress", "zstd_compress", + "zstd compression algorithm support.", + ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, zstd_deps); + } } #if defined(_KERNEL) diff --git a/module/zcommon/zfs_prop.c b/module/zcommon/zfs_prop.c index 3a005b687..272e0e93c 100644 --- a/module/zcommon/zfs_prop.c +++ b/module/zcommon/zfs_prop.c @@ -23,6 +23,8 @@ * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright 2016, Joyent, Inc. + * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, Allan Jude */ /* Portions Copyright 2010 Robert Milkowski */ @@ -125,6 +127,87 @@ zfs_prop_init(void) { "gzip-9", ZIO_COMPRESS_GZIP_9 }, { "zle", ZIO_COMPRESS_ZLE }, { "lz4", ZIO_COMPRESS_LZ4 }, + { "zstd", ZIO_COMPRESS_ZSTD }, + { "zstd-fast", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_DEFAULT) }, + + /* + * ZSTD 1-19 are synthetic. We store the compression level in a + * separate hidden property to avoid wasting a large amount of + * space in the ZIO_COMPRESS enum. + * + * The compression level is also stored within the header of the + * compressed block since we may need it for later recompression + * to avoid checksum errors (L2ARC). + * + * Note that the level here is defined as bit shifted mask on + * top of the method. 
+ */ + { "zstd-1", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_1) }, + { "zstd-2", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_2) }, + { "zstd-3", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_3) }, + { "zstd-4", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_4) }, + { "zstd-5", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_5) }, + { "zstd-6", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_6) }, + { "zstd-7", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_7) }, + { "zstd-8", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_8) }, + { "zstd-9", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_9) }, + { "zstd-10", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_10) }, + { "zstd-11", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_11) }, + { "zstd-12", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_12) }, + { "zstd-13", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_13) }, + { "zstd-14", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_14) }, + { "zstd-15", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_15) }, + { "zstd-16", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_16) }, + { "zstd-17", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_17) }, + { "zstd-18", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_18) }, + { "zstd-19", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_19) }, + + /* + * The ZSTD-Fast levels are also synthetic. 
+ */ + { "zstd-fast-1", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_1) }, + { "zstd-fast-2", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_2) }, + { "zstd-fast-3", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_3) }, + { "zstd-fast-4", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_4) }, + { "zstd-fast-5", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_5) }, + { "zstd-fast-6", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_6) }, + { "zstd-fast-7", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_7) }, + { "zstd-fast-8", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_8) }, + { "zstd-fast-9", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_9) }, + { "zstd-fast-10", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_10) }, + { "zstd-fast-20", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_20) }, + { "zstd-fast-30", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_30) }, + { "zstd-fast-40", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_40) }, + { "zstd-fast-50", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_50) }, + { "zstd-fast-60", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_60) }, + { "zstd-fast-70", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_70) }, + { "zstd-fast-80", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_80) }, + { "zstd-fast-90", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_90) }, + { "zstd-fast-100", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_100) }, + { "zstd-fast-500", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_500) }, + { "zstd-fast-1000", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_1000) }, { NULL } }; @@ -330,8 +413,10 @@ zfs_prop_init(void) zprop_register_index(ZFS_PROP_COMPRESSION, "compression", ZIO_COMPRESS_DEFAULT, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, - "on | off | lzjb | gzip | gzip-[1-9] | zle | lz4", "COMPRESS", - compress_table); + "on | off | lzjb | gzip | gzip-[1-9] | zle | lz4 | " + "zstd | zstd-[1-19] | " + "zstd-fast-[1-10,20,30,40,50,60,70,80,90,100,500,1000]", + "COMPRESS", compress_table); zprop_register_index(ZFS_PROP_SNAPDIR, "snapdir", ZFS_SNAPDIR_HIDDEN, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "hidden | visible", "SNAPDIR", 
snapdir_table); diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 0512497d5..ff2621194 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -26,6 +26,8 @@ * Copyright (c) 2017, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2019, loli10K <[email protected]>. All rights reserved. * Copyright (c) 2020, George Amanakis. All rights reserved. + * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, Allan Jude * Copyright (c) 2020, The FreeBSD Foundation [1] * * [1] Portions of this software were developed by Allan Jude @@ -1362,6 +1364,12 @@ arc_hdr_get_compress(arc_buf_hdr_t *hdr) HDR_GET_COMPRESS(hdr) : ZIO_COMPRESS_OFF); } +uint8_t +arc_get_complevel(arc_buf_t *buf) +{ + return (buf->b_hdr->b_complevel); +} + static inline boolean_t arc_buf_is_shared(arc_buf_t *buf) { @@ -1707,7 +1715,8 @@ arc_buf_try_copy_decompressed_data(arc_buf_t *buf) static arc_buf_hdr_t * arc_buf_alloc_l2only(size_t size, arc_buf_contents_t type, l2arc_dev_t *dev, dva_t dva, uint64_t daddr, int32_t psize, uint64_t birth, - enum zio_compress compress, boolean_t protected, boolean_t prefetch) + enum zio_compress compress, uint8_t complevel, boolean_t protected, + boolean_t prefetch) { arc_buf_hdr_t *hdr; @@ -1720,6 +1729,7 @@ arc_buf_alloc_l2only(size_t size, arc_buf_contents_t type, l2arc_dev_t *dev, HDR_SET_LSIZE(hdr, size); HDR_SET_PSIZE(hdr, psize); arc_hdr_set_compress(hdr, compress); + hdr->b_complevel = complevel; if (protected) arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); if (prefetch) @@ -1779,9 +1789,8 @@ arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj) tmpbuf = zio_buf_alloc(lsize); abd = abd_get_from_buf(tmpbuf, lsize); abd_take_ownership_of_buf(abd, B_TRUE); - csize = zio_compress_data(HDR_GET_COMPRESS(hdr), - hdr->b_l1hdr.b_pabd, tmpbuf, lsize); + hdr->b_l1hdr.b_pabd, tmpbuf, lsize, hdr->b_complevel); ASSERT3U(csize, <=, psize); abd_zero_off(abd, csize, psize - csize); } @@ -1867,7 +1876,7 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, 
const zbookmark_phys_t *zb) ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr), - HDR_GET_LSIZE(hdr)); + HDR_GET_LSIZE(hdr), &hdr->b_complevel); if (ret != 0) { abd_return_buf(cabd, tmp, arc_hdr_size(hdr)); goto error; @@ -2114,7 +2123,8 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, } else { error = zio_decompress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, buf->b_data, - HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); + HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr), + &hdr->b_complevel); /* * Absent hardware errors or software bugs, this should @@ -2865,10 +2875,10 @@ arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size) arc_buf_t * arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize, - enum zio_compress compression_type) + enum zio_compress compression_type, uint8_t complevel) { arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag, - psize, lsize, compression_type); + psize, lsize, compression_type, complevel); arc_loaned_bytes_update(arc_buf_size(buf)); @@ -2879,10 +2889,11 @@ arc_buf_t * arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_object_type_t ot, uint64_t psize, uint64_t lsize, - enum zio_compress compression_type) + enum zio_compress compression_type, uint8_t complevel) { arc_buf_t *buf = arc_alloc_raw_buf(spa, arc_onloan_tag, dsobj, - byteorder, salt, iv, mac, ot, psize, lsize, compression_type); + byteorder, salt, iv, mac, ot, psize, lsize, compression_type, + complevel); atomic_add_64(&arc_loaned_bytes, psize); return (buf); @@ -3249,7 +3260,7 @@ arc_hdr_free_abd(arc_buf_hdr_t *hdr, boolean_t free_rdata) static arc_buf_hdr_t * arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, - boolean_t protected, enum zio_compress compression_type, + boolean_t protected, enum zio_compress compression_type, uint8_t complevel, arc_buf_contents_t type, boolean_t alloc_rdata) { arc_buf_hdr_t 
*hdr; @@ -3272,6 +3283,7 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, hdr->b_flags = 0; arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR); arc_hdr_set_compress(hdr, compression_type); + hdr->b_complevel = complevel; if (protected) arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); @@ -3574,7 +3586,7 @@ arc_buf_t * arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size) { arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, - B_FALSE, ZIO_COMPRESS_OFF, type, B_FALSE); + B_FALSE, ZIO_COMPRESS_OFF, 0, type, B_FALSE); arc_buf_t *buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_FALSE, @@ -3590,7 +3602,7 @@ arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size) */ arc_buf_t * arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, - enum zio_compress compression_type) + enum zio_compress compression_type, uint8_t complevel) { ASSERT3U(lsize, >, 0); ASSERT3U(lsize, >=, psize); @@ -3598,7 +3610,7 @@ arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS); arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, - B_FALSE, compression_type, ARC_BUFC_DATA, B_FALSE); + B_FALSE, compression_type, complevel, ARC_BUFC_DATA, B_FALSE); arc_buf_t *buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, @@ -3624,7 +3636,7 @@ arc_buf_t * arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_object_type_t ot, uint64_t psize, uint64_t lsize, - enum zio_compress compression_type) + enum zio_compress compression_type, uint8_t complevel) { arc_buf_hdr_t *hdr; arc_buf_t *buf; @@ -3637,7 +3649,7 @@ arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder, ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS); hdr = arc_hdr_alloc(spa_load_guid(spa), psize, 
lsize, B_TRUE, - compression_type, type, B_TRUE); + compression_type, complevel, type, B_TRUE); hdr->b_crypt_hdr.b_dsobj = dsobj; hdr->b_crypt_hdr.b_ot = ot; @@ -5579,6 +5591,9 @@ arc_read_done(zio_t *zio) } else { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; } + if (!HDR_L2_READING(hdr)) { + hdr->b_complevel = zio->io_prop.zp_complevel; + } } arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED); @@ -5982,7 +5997,7 @@ top: arc_buf_hdr_t *exists = NULL; arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, - BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), type, + BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), 0, type, encrypted_read); if (!embedded_bp) { @@ -6549,7 +6564,7 @@ arc_release(arc_buf_t *buf, void *tag) * buffer which will be freed in arc_write(). */ nhdr = arc_hdr_alloc(spa, psize, lsize, protected, - compress, type, HDR_HAS_RABD(hdr)); + compress, hdr->b_complevel, type, HDR_HAS_RABD(hdr)); ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL); ASSERT0(nhdr->b_l1hdr.b_bufcnt); ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt)); @@ -6713,6 +6728,7 @@ arc_write_ready(zio_t *zio) } HDR_SET_PSIZE(hdr, psize); arc_hdr_set_compress(hdr, compress); + hdr->b_complevel = zio->io_prop.zp_complevel; if (zio->io_error != 0 || psize == 0) goto out; @@ -6902,6 +6918,7 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, ASSERT(ARC_BUF_COMPRESSED(buf)); localprop.zp_encrypt = B_TRUE; localprop.zp_compress = HDR_GET_COMPRESS(hdr); + localprop.zp_complevel = hdr->b_complevel; localprop.zp_byteorder = (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ? 
ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER; @@ -6920,6 +6937,7 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, } else if (ARC_BUF_COMPRESSED(buf)) { ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf)); localprop.zp_compress = HDR_GET_COMPRESS(hdr); + localprop.zp_complevel = hdr->b_complevel; zio_flags |= ZIO_FLAG_RAW_COMPRESS; } callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); @@ -8252,7 +8270,7 @@ l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb) ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr), - HDR_GET_LSIZE(hdr)); + HDR_GET_LSIZE(hdr), &hdr->b_complevel); if (ret != 0) { abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr)); arc_free_data_abd(hdr, cabd, arc_hdr_size(hdr), hdr); @@ -8351,6 +8369,7 @@ l2arc_read_done(zio_t *zio) (HDR_HAS_RABD(hdr) && zio->io_abd == hdr->b_crypt_hdr.b_rabd)); zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ + zio->io_prop.zp_complevel = hdr->b_complevel; valid_cksum = arc_cksum_is_equal(hdr, zio); @@ -8763,7 +8782,18 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, cabd = abd_alloc_for_io(asize, ismd); tmp = abd_borrow_buf(cabd, asize); - psize = zio_compress_data(compress, to_write, tmp, size); + psize = zio_compress_data(compress, to_write, tmp, size, + hdr->b_complevel); + + if (psize >= size) { + abd_return_buf(cabd, tmp, asize); + HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); + to_write = cabd; + abd_copy(to_write, hdr->b_l1hdr.b_pabd, size); + if (size != asize) + abd_zero_off(to_write, size, asize - size); + goto encrypt; + } ASSERT3U(psize, <=, HDR_GET_PSIZE(hdr)); if (psize < asize) bzero((char *)tmp + psize, asize - psize); @@ -8772,6 +8802,7 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, to_write = cabd; } +encrypt: if (HDR_ENCRYPTED(hdr)) { eabd = abd_alloc_for_io(asize, ismd); @@ -9922,7 +9953,7 @@ l2arc_log_blk_read(l2arc_dev_t *dev, 
abd_copy_from_buf_off(abd, this_lb, 0, asize); if ((err = zio_decompress_data( L2BLK_GET_COMPRESS((this_lbp)->lbp_prop), - abd, this_lb, asize, sizeof (*this_lb))) != 0) { + abd, this_lb, asize, sizeof (*this_lb), NULL)) != 0) { err = SET_ERROR(EINVAL); goto cleanup; } @@ -10021,7 +10052,7 @@ l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev) hdr = arc_buf_alloc_l2only(L2BLK_GET_LSIZE((le)->le_prop), type, dev, le->le_dva, le->le_daddr, L2BLK_GET_PSIZE((le)->le_prop), le->le_birth, - L2BLK_GET_COMPRESS((le)->le_prop), + L2BLK_GET_COMPRESS((le)->le_prop), le->le_complevel, L2BLK_GET_PROTECTED((le)->le_prop), L2BLK_GET_PREFETCH((le)->le_prop)); asize = vdev_psize_to_asize(dev->l2ad_vdev, @@ -10197,7 +10228,7 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) /* try to compress the buffer */ psize = zio_compress_data(ZIO_COMPRESS_LZ4, - abd_buf->abd, tmpbuf, sizeof (*lb)); + abd_buf->abd, tmpbuf, sizeof (*lb), 0); /* a log block is never entirely zero */ ASSERT(psize != 0); @@ -10354,6 +10385,7 @@ l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr) L2BLK_SET_LSIZE((le)->le_prop, HDR_GET_LSIZE(hdr)); L2BLK_SET_PSIZE((le)->le_prop, HDR_GET_PSIZE(hdr)); L2BLK_SET_COMPRESS((le)->le_prop, HDR_GET_COMPRESS(hdr)); + le->le_complevel = hdr->b_complevel; L2BLK_SET_TYPE((le)->le_prop, hdr->b_type); L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr))); L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr))); diff --git a/module/zfs/blkptr.c b/module/zfs/blkptr.c index 73600e4ab..aa09ded8d 100644 --- a/module/zfs/blkptr.c +++ b/module/zfs/blkptr.c @@ -143,7 +143,7 @@ decode_embedded_bp(const blkptr_t *bp, void *buf, int buflen) uint8_t dstbuf[BPE_PAYLOAD_SIZE]; decode_embedded_bp_compressed(bp, dstbuf); VERIFY0(zio_decompress_data_buf(BP_GET_COMPRESS(bp), - dstbuf, buf, psize, buflen)); + dstbuf, buf, psize, buflen, NULL)); } else { ASSERT3U(lsize, ==, psize); decode_embedded_bp_compressed(bp, buf); diff --git 
a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 83b2c3721..2de1f4e4c 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -24,6 +24,8 @@ * Copyright (c) 2012, 2019 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, Allan Jude */ #include <sys/zfs_context.h> @@ -1095,11 +1097,13 @@ dbuf_alloc_arcbuf_from_arcbuf(dmu_buf_impl_t *db, arc_buf_t *data) spa_t *spa = os->os_spa; arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); enum zio_compress compress_type; + uint8_t complevel; int psize, lsize; psize = arc_buf_size(data); lsize = arc_buf_lsize(data); compress_type = arc_get_compression(data); + complevel = arc_get_complevel(data); if (arc_is_encrypted(data)) { boolean_t byteorder; @@ -1111,11 +1115,11 @@ dbuf_alloc_arcbuf_from_arcbuf(dmu_buf_impl_t *db, arc_buf_t *data) arc_get_raw_params(data, &byteorder, salt, iv, mac); data = arc_alloc_raw_buf(spa, db, dmu_objset_id(os), byteorder, salt, iv, mac, dn->dn_type, psize, lsize, - compress_type); + compress_type, complevel); } else if (compress_type != ZIO_COMPRESS_OFF) { ASSERT3U(type, ==, ARC_BUFC_DATA); data = arc_alloc_compressed_buf(spa, db, - psize, lsize, compress_type); + psize, lsize, compress_type, complevel); } else { data = arc_alloc_buf(spa, db, type, psize); } diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 5cc7bfe11..06d6df618 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -26,6 +26,8 @@ * Copyright (c) 2016, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Copyright (c) 2019 Datto Inc. + * Copyright (c) 2019, Klara Inc. 
+ * Copyright (c) 2019, Allan Jude */ #include <sys/dmu.h> @@ -2067,6 +2069,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) (wp & WP_SPILL)); enum zio_checksum checksum = os->os_checksum; enum zio_compress compress = os->os_compress; + uint8_t complevel = os->os_complevel; enum zio_checksum dedup_checksum = os->os_dedup_checksum; boolean_t dedup = B_FALSE; boolean_t nopwrite = B_FALSE; @@ -2123,6 +2126,8 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) } else { compress = zio_compress_select(os->os_spa, dn->dn_compress, compress); + complevel = zio_complevel_select(os->os_spa, compress, + complevel, complevel); checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ? zio_checksum_select(dn->dn_checksum, checksum) : @@ -2181,6 +2186,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) } zp->zp_compress = compress; + zp->zp_complevel = complevel; zp->zp_checksum = checksum; zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type; zp->zp_level = level; diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index bf488384d..b1590d7db 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -30,6 +30,8 @@ * Copyright 2017 Nexenta Systems, Inc. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2018, loli10K <[email protected]>. All rights reserved. + * Copyright (c) 2019, Klara Inc. 
+ * Copyright (c) 2019, Allan Jude */ /* Portions Copyright 2010 Robert Milkowski */ @@ -192,8 +194,10 @@ compression_changed_cb(void *arg, uint64_t newval) */ ASSERT(newval != ZIO_COMPRESS_INHERIT); - os->os_compress = zio_compress_select(os->os_spa, newval, - ZIO_COMPRESS_ON); + os->os_compress = zio_compress_select(os->os_spa, + ZIO_COMPRESS_ALGO(newval), ZIO_COMPRESS_ON); + os->os_complevel = zio_complevel_select(os->os_spa, os->os_compress, + ZIO_COMPRESS_LEVEL(newval), ZIO_COMPLEVEL_DEFAULT); } static void @@ -580,6 +584,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, /* It's the meta-objset. */ os->os_checksum = ZIO_CHECKSUM_FLETCHER_4; os->os_compress = ZIO_COMPRESS_ON; + os->os_complevel = ZIO_COMPLEVEL_DEFAULT; os->os_encrypted = B_FALSE; os->os_copies = spa_max_replication(spa); os->os_dedup_checksum = ZIO_CHECKSUM_OFF; diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index 2f3507914..2eee19a28 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -25,6 +25,8 @@ * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright 2014 HybridCluster. All rights reserved. * Copyright (c) 2018, loli10K <[email protected]>. All rights reserved. + * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, Allan Jude */ #include <sys/dmu.h> @@ -529,14 +531,18 @@ recv_begin_check_feature_flags_impl(uint64_t featureflags, spa_t *spa) return (SET_ERROR(ENOTSUP)); /* - * LZ4 compressed, embedded, mooched, large blocks, and large_dnodes - * in the stream can only be used if those pool features are enabled - * because we don't attempt to decompress / un-embed / un-mooch / - * split up the blocks / dnodes during the receive process. + * LZ4 compressed, ZSTD compressed, embedded, mooched, large blocks, + * and large_dnodes in the stream can only be used if those pool + * features are enabled because we don't attempt to decompress / + * un-embed / un-mooch / split up the blocks / dnodes during the + * receive process. 
*/ if ((featureflags & DMU_BACKUP_FEATURE_LZ4) && !spa_feature_is_enabled(spa, SPA_FEATURE_LZ4_COMPRESS)) return (SET_ERROR(ENOTSUP)); + if ((featureflags & DMU_BACKUP_FEATURE_ZSTD) && + !spa_feature_is_enabled(spa, SPA_FEATURE_ZSTD_COMPRESS)) + return (SET_ERROR(ENOTSUP)); if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) && !spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) return (SET_ERROR(ENOTSUP)); @@ -2457,7 +2463,7 @@ receive_read_record(dmu_recv_cookie_t *drc) drrw->drr_object, byteorder, drrw->drr_salt, drrw->drr_iv, drrw->drr_mac, drrw->drr_type, drrw->drr_compressed_size, drrw->drr_logical_size, - drrw->drr_compressiontype); + drrw->drr_compressiontype, 0); } else if (DRR_WRITE_COMPRESSED(drrw)) { ASSERT3U(drrw->drr_compressed_size, >, 0); ASSERT3U(drrw->drr_logical_size, >=, @@ -2466,7 +2472,7 @@ receive_read_record(dmu_recv_cookie_t *drc) abuf = arc_loan_compressed_buf( dmu_objset_spa(drc->drc_os), drrw->drr_compressed_size, drrw->drr_logical_size, - drrw->drr_compressiontype); + drrw->drr_compressiontype, 0); } else { abuf = arc_loan_buf(dmu_objset_spa(drc->drc_os), is_meta, drrw->drr_logical_size); @@ -2541,7 +2547,7 @@ receive_read_record(dmu_recv_cookie_t *drc) drrs->drr_object, byteorder, drrs->drr_salt, drrs->drr_iv, drrs->drr_mac, drrs->drr_type, drrs->drr_compressed_size, drrs->drr_length, - drrs->drr_compressiontype); + drrs->drr_compressiontype, 0); } else { abuf = arc_loan_buf(dmu_objset_spa(drc->drc_os), DMU_OT_IS_METADATA(drrs->drr_type), diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 403e85592..33e99c2e0 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -26,6 +26,8 @@ * Copyright 2014 HybridCluster. All rights reserved. * Copyright 2016 RackTop Systems. * Copyright (c) 2016 Actifio, Inc. All rights reserved. + * Copyright (c) 2019, Klara Inc. 
+ * Copyright (c) 2019, Allan Jude */ #include <sys/dmu.h> @@ -863,6 +865,14 @@ send_do_embed(const blkptr_t *bp, uint64_t featureflags) return (B_FALSE); /* + * If we have not set the ZSTD feature flag, we can't send ZSTD + * compressed embedded blocks, as the receiver may not support them. + */ + if ((BP_GET_COMPRESS(bp) == ZIO_COMPRESS_ZSTD && + !(featureflags & DMU_BACKUP_FEATURE_ZSTD))) + return (B_FALSE); + + /* * Embed type must be explicitly enabled. */ switch (BPE_GET_ETYPE(bp)) { @@ -1954,6 +1964,7 @@ setup_featureflags(struct dmu_send_params *dspp, objset_t *os, /* raw send implies compressok */ if (dspp->compressok || dspp->rawok) *featureflags |= DMU_BACKUP_FEATURE_COMPRESSED; + if (dspp->rawok && os->os_encrypted) *featureflags |= DMU_BACKUP_FEATURE_RAW; @@ -1964,6 +1975,17 @@ setup_featureflags(struct dmu_send_params *dspp, objset_t *os, *featureflags |= DMU_BACKUP_FEATURE_LZ4; } + /* + * We specifically do not include DMU_BACKUP_FEATURE_EMBED_DATA here to + * allow sending ZSTD compressed datasets to a receiver that does not + * support ZSTD + */ + if ((*featureflags & + (DMU_BACKUP_FEATURE_COMPRESSED | DMU_BACKUP_FEATURE_RAW)) != 0 && + dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_ZSTD_COMPRESS)) { + *featureflags |= DMU_BACKUP_FEATURE_ZSTD; + } + if (dspp->resumeobj != 0 || dspp->resumeoff != 0) { *featureflags |= DMU_BACKUP_FEATURE_RESUMING; } diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index c5143ac5a..1fcd83db7 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -28,6 +28,12 @@ * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright 2016, OmniTI Computer Consulting, Inc. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. + * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, Allan Jude + * Copyright (c) 2020 The FreeBSD Foundation [1] + * + * [1] Portions of this software were developed by Allan Jude + * under sponsorship from the FreeBSD Foundation. 
*/ #include <sys/dmu_objset.h> @@ -127,6 +133,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) int compressed = BP_GET_PSIZE(bp); int uncompressed = BP_GET_UCSIZE(bp); int64_t delta; + spa_feature_t f; dprintf_bp(bp, "ds=%p", ds); @@ -156,7 +163,15 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) (void *)B_TRUE; } - spa_feature_t f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp)); + + f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp)); + if (f != SPA_FEATURE_NONE) { + ASSERT3S(spa_feature_table[f].fi_type, ==, + ZFEATURE_TYPE_BOOLEAN); + ds->ds_feature_activation[f] = (void *)B_TRUE; + } + + f = zio_compress_to_feature(BP_GET_COMPRESS(bp)); if (f != SPA_FEATURE_NONE) { ASSERT3S(spa_feature_table[f].fi_type, ==, ZFEATURE_TYPE_BOOLEAN); @@ -4507,6 +4522,74 @@ dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } +typedef struct dsl_dataset_set_compression_arg { + const char *ddsca_name; + zprop_source_t ddsca_source; + uint64_t ddsca_value; +} dsl_dataset_set_compression_arg_t; + +/* ARGSUSED */ +static int +dsl_dataset_set_compression_check(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_set_compression_arg_t *ddsca = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + + uint64_t compval = ZIO_COMPRESS_ALGO(ddsca->ddsca_value); + spa_feature_t f = zio_compress_to_feature(compval); + + if (f == SPA_FEATURE_NONE) + return (SET_ERROR(EINVAL)); + + if (!spa_feature_is_enabled(dp->dp_spa, f)) + return (SET_ERROR(ENOTSUP)); + + return (0); +} + +static void +dsl_dataset_set_compression_sync(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_set_compression_arg_t *ddsca = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds = NULL; + + uint64_t compval = ZIO_COMPRESS_ALGO(ddsca->ddsca_value); + spa_feature_t f = zio_compress_to_feature(compval); + ASSERT3S(spa_feature_table[f].fi_type, ==, ZFEATURE_TYPE_BOOLEAN); + + VERIFY0(dsl_dataset_hold(dp, ddsca->ddsca_name, FTAG, &ds)); 
+ if (zfeature_active(f, ds->ds_feature[f]) != B_TRUE) { + ds->ds_feature_activation[f] = (void *)B_TRUE; + dsl_dataset_activate_feature(ds->ds_object, f, + ds->ds_feature_activation[f], tx); + ds->ds_feature[f] = ds->ds_feature_activation[f]; + } + dsl_dataset_rele(ds, FTAG); +} + +int +dsl_dataset_set_compression(const char *dsname, zprop_source_t source, + uint64_t compression) +{ + dsl_dataset_set_compression_arg_t ddsca; + + /* + * The sync task is only required for zstd in order to activate + * the feature flag when the property is first set. + */ + if (ZIO_COMPRESS_ALGO(compression) != ZIO_COMPRESS_ZSTD) + return (0); + + ddsca.ddsca_name = dsname; + ddsca.ddsca_source = source; + ddsca.ddsca_value = compression; + + return (dsl_sync_task(dsname, dsl_dataset_set_compression_check, + dsl_dataset_set_compression_sync, &ddsca, 0, + ZFS_SPACE_CHECK_EXTRA_RESERVED)); +} + /* * Return (in *usedp) the amount of space referenced by "new" that was not * referenced at the time the bookmark corresponds to. "New" may be a diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 4c884409a..41f0ddbde 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -62,6 +62,7 @@ #include <sys/btree.h> #include <sys/zfeature.h> #include <sys/qat.h> +#include <sys/zstd/zstd.h> /* * SPA locking diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 463704c14..7f623bb04 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -38,6 +38,8 @@ * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved. + * Copyright (c) 2019, Klara Inc. 
+ * Copyright (c) 2019, Allan Jude */ /* @@ -2464,6 +2466,15 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source, case ZFS_PROP_REFRESERVATION: err = dsl_dataset_set_refreservation(dsname, source, intval); break; + case ZFS_PROP_COMPRESSION: + err = dsl_dataset_set_compression(dsname, source, intval); + /* + * Set err to -1 to force the zfs_set_prop_nvlist code down the + * default path to set the value in the nvlist. + */ + if (err == 0) + err = -1; + break; case ZFS_PROP_VOLSIZE: err = zvol_set_volsize(dsname, intval); break; @@ -4355,7 +4366,7 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) const char *propname = nvpair_name(pair); boolean_t issnap = (strchr(dsname, '@') != NULL); zfs_prop_t prop = zfs_name_to_prop(propname); - uint64_t intval; + uint64_t intval, compval; int err; if (prop == ZPROP_INVAL) { @@ -4437,19 +4448,20 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) * we'll catch them later. */ if (nvpair_value_uint64(pair, &intval) == 0) { - if (intval >= ZIO_COMPRESS_GZIP_1 && - intval <= ZIO_COMPRESS_GZIP_9 && + compval = ZIO_COMPRESS_ALGO(intval); + if (compval >= ZIO_COMPRESS_GZIP_1 && + compval <= ZIO_COMPRESS_GZIP_9 && zfs_earlier_version(dsname, SPA_VERSION_GZIP_COMPRESSION)) { return (SET_ERROR(ENOTSUP)); } - if (intval == ZIO_COMPRESS_ZLE && + if (compval == ZIO_COMPRESS_ZLE && zfs_earlier_version(dsname, SPA_VERSION_ZLE_COMPRESSION)) return (SET_ERROR(ENOTSUP)); - if (intval == ZIO_COMPRESS_LZ4) { + if (compval == ZIO_COMPRESS_LZ4) { spa_t *spa; if ((err = spa_open(dsname, &spa, FTAG)) != 0) @@ -4462,6 +4474,20 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) } spa_close(spa, FTAG); } + + if (compval == ZIO_COMPRESS_ZSTD) { + spa_t *spa; + + if ((err = spa_open(dsname, &spa, FTAG)) != 0) + return (err); + + if (!spa_feature_is_enabled(spa, + SPA_FEATURE_ZSTD_COMPRESS)) { + spa_close(spa, FTAG); + return (SET_ERROR(ENOTSUP)); + } + spa_close(spa, FTAG); + } } break; 
diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 93d6b115c..2628cc029 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -23,6 +23,8 @@ * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, Allan Jude */ #include <sys/sysmacros.h> @@ -409,7 +411,8 @@ zio_decompress(zio_t *zio, abd_t *data, uint64_t size) if (zio->io_error == 0) { void *tmp = abd_borrow_buf(data, size); int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), - zio->io_abd, tmp, zio->io_size, size); + zio->io_abd, tmp, zio->io_size, size, + &zio->io_prop.zp_complevel); abd_return_buf_copy(data, tmp, size); if (zio_injection_enabled && ret == 0) @@ -459,7 +462,8 @@ zio_decrypt(zio_t *zio, abd_t *data, uint64_t size) */ tmp = zio_buf_alloc(lsize); ret = zio_decompress_data(BP_GET_COMPRESS(bp), - zio->io_abd, tmp, zio->io_size, lsize); + zio->io_abd, tmp, zio->io_size, lsize, + &zio->io_prop.zp_complevel); if (ret != 0) { ret = SET_ERROR(EIO); goto error; @@ -1678,8 +1682,9 @@ zio_write_compress(zio_t *zio) if (compress != ZIO_COMPRESS_OFF && !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) { void *cbuf = zio_buf_alloc(lsize); - psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize); - if (psize == 0 || psize == lsize) { + psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize, + zp->zp_complevel); + if (psize == 0 || psize >= lsize) { compress = ZIO_COMPRESS_OFF; zio_buf_free(cbuf, lsize); } else if (!zp->zp_dedup && !zp->zp_encrypt && @@ -1741,8 +1746,8 @@ zio_write_compress(zio_t *zio) * to a hole. 
*/ psize = zio_compress_data(ZIO_COMPRESS_EMPTY, - zio->io_abd, NULL, lsize); - if (psize == 0) + zio->io_abd, NULL, lsize, zp->zp_complevel); + if (psize == 0 || psize >= lsize) compress = ZIO_COMPRESS_OFF; } else { ASSERT3U(psize, !=, 0); @@ -2849,6 +2854,7 @@ zio_write_gang_block(zio_t *pio) zp.zp_checksum = gio->io_prop.zp_checksum; zp.zp_compress = ZIO_COMPRESS_OFF; + zp.zp_complevel = gio->io_prop.zp_complevel; zp.zp_type = DMU_OT_NONE; zp.zp_level = 0; zp.zp_copies = gio->io_prop.zp_copies; diff --git a/module/zfs/zio_compress.c b/module/zfs/zio_compress.c index 01c51347f..d91e82d9e 100644 --- a/module/zfs/zio_compress.c +++ b/module/zfs/zio_compress.c @@ -29,6 +29,8 @@ /* * Copyright (c) 2013, 2018 by Delphix. All rights reserved. + * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, Allan Jude */ #include <sys/zfs_context.h> @@ -36,6 +38,7 @@ #include <sys/zfeature.h> #include <sys/zio.h> #include <sys/zio_compress.h> +#include <sys/zstd/zstd.h> /* * If nonzero, every 1/X decompression attempts will fail, simulating @@ -47,24 +50,42 @@ unsigned long zio_decompress_fail_fraction = 0; * Compression vectors. 
*/ zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { - {"inherit", 0, NULL, NULL}, - {"on", 0, NULL, NULL}, - {"uncompressed", 0, NULL, NULL}, - {"lzjb", 0, lzjb_compress, lzjb_decompress}, - {"empty", 0, NULL, NULL}, - {"gzip-1", 1, gzip_compress, gzip_decompress}, - {"gzip-2", 2, gzip_compress, gzip_decompress}, - {"gzip-3", 3, gzip_compress, gzip_decompress}, - {"gzip-4", 4, gzip_compress, gzip_decompress}, - {"gzip-5", 5, gzip_compress, gzip_decompress}, - {"gzip-6", 6, gzip_compress, gzip_decompress}, - {"gzip-7", 7, gzip_compress, gzip_decompress}, - {"gzip-8", 8, gzip_compress, gzip_decompress}, - {"gzip-9", 9, gzip_compress, gzip_decompress}, - {"zle", 64, zle_compress, zle_decompress}, - {"lz4", 0, lz4_compress_zfs, lz4_decompress_zfs} + {"inherit", 0, NULL, NULL, NULL}, + {"on", 0, NULL, NULL, NULL}, + {"uncompressed", 0, NULL, NULL, NULL}, + {"lzjb", 0, lzjb_compress, lzjb_decompress, NULL}, + {"empty", 0, NULL, NULL, NULL}, + {"gzip-1", 1, gzip_compress, gzip_decompress, NULL}, + {"gzip-2", 2, gzip_compress, gzip_decompress, NULL}, + {"gzip-3", 3, gzip_compress, gzip_decompress, NULL}, + {"gzip-4", 4, gzip_compress, gzip_decompress, NULL}, + {"gzip-5", 5, gzip_compress, gzip_decompress, NULL}, + {"gzip-6", 6, gzip_compress, gzip_decompress, NULL}, + {"gzip-7", 7, gzip_compress, gzip_decompress, NULL}, + {"gzip-8", 8, gzip_compress, gzip_decompress, NULL}, + {"gzip-9", 9, gzip_compress, gzip_decompress, NULL}, + {"zle", 64, zle_compress, zle_decompress, NULL}, + {"lz4", 0, lz4_compress_zfs, lz4_decompress_zfs, NULL}, + {"zstd", ZIO_ZSTD_LEVEL_DEFAULT, zstd_compress, zstd_decompress, + zstd_decompress_level}, }; +uint8_t +zio_complevel_select(spa_t *spa, enum zio_compress compress, uint8_t child, + uint8_t parent) +{ + uint8_t result; + + if (!ZIO_COMPRESS_HASLEVEL(compress)) + return (0); + + result = child; + if (result == ZIO_COMPLEVEL_INHERIT) + result = parent; + + return (result); +} + enum zio_compress zio_compress_select(spa_t 
*spa, enum zio_compress child, enum zio_compress parent) @@ -102,9 +123,11 @@ zio_compress_zeroed_cb(void *data, size_t len, void *private) } size_t -zio_compress_data(enum zio_compress c, abd_t *src, void *dst, size_t s_len) +zio_compress_data(enum zio_compress c, abd_t *src, void *dst, size_t s_len, + uint8_t level) { size_t c_len, d_len; + uint8_t complevel; zio_compress_info_t *ci = &zio_compress_table[c]; ASSERT((uint_t)c < ZIO_COMPRESS_FUNCTIONS); @@ -123,9 +146,24 @@ zio_compress_data(enum zio_compress c, abd_t *src, void *dst, size_t s_len) /* Compress at least 12.5% */ d_len = s_len - (s_len >> 3); + complevel = ci->ci_level; + + if (c == ZIO_COMPRESS_ZSTD) { + /* If we don't know the level, we can't compress it */ + if (level == ZIO_COMPLEVEL_INHERIT) + return (s_len); + + if (level == ZIO_COMPLEVEL_DEFAULT) + complevel = ZIO_ZSTD_LEVEL_DEFAULT; + else + complevel = level; + + ASSERT3U(complevel, !=, ZIO_COMPLEVEL_INHERIT); + } + /* No compression algorithms can read from ABDs directly */ void *tmp = abd_borrow_buf_copy(src, s_len); - c_len = ci->ci_compress(tmp, dst, s_len, d_len, ci->ci_level); + c_len = ci->ci_compress(tmp, dst, s_len, d_len, complevel); abd_return_buf(src, tmp, s_len); if (c_len > d_len) @@ -137,21 +175,24 @@ zio_compress_data(enum zio_compress c, abd_t *src, void *dst, size_t s_len) int zio_decompress_data_buf(enum zio_compress c, void *src, void *dst, - size_t s_len, size_t d_len) + size_t s_len, size_t d_len, uint8_t *level) { zio_compress_info_t *ci = &zio_compress_table[c]; if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL) return (SET_ERROR(EINVAL)); + if (ci->ci_decompress_level != NULL && level != NULL) + return (ci->ci_decompress_level(src, dst, s_len, d_len, level)); + return (ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level)); } int zio_decompress_data(enum zio_compress c, abd_t *src, void *dst, - size_t s_len, size_t d_len) + size_t s_len, size_t d_len, uint8_t *level) { void *tmp = 
abd_borrow_buf_copy(src, s_len); - int ret = zio_decompress_data_buf(c, tmp, dst, s_len, d_len); + int ret = zio_decompress_data_buf(c, tmp, dst, s_len, d_len, level); abd_return_buf(src, tmp, s_len); /* @@ -165,3 +206,15 @@ zio_decompress_data(enum zio_compress c, abd_t *src, void *dst, return (ret); } + +int +zio_compress_to_feature(enum zio_compress comp) +{ + switch (comp) { + case ZIO_COMPRESS_ZSTD: + return (SPA_FEATURE_ZSTD_COMPRESS); + default: + /* fallthru */; + } + return (SPA_FEATURE_NONE); +} diff --git a/module/zstd/Makefile.in b/module/zstd/Makefile.in new file mode 100644 index 000000000..a7f91a435 --- /dev/null +++ b/module/zstd/Makefile.in @@ -0,0 +1,33 @@ +ifneq ($(KBUILD_EXTMOD),) +src = @abs_srcdir@ +obj = @abs_builddir@ +zstd_include = $(src)/include +else +zstd_include = $(srctree)/$(src)/include +endif + +MODULE := zzstd + +obj-$(CONFIG_ZFS) := $(MODULE).o + +asflags-y := -I$(zstd_include) +ccflags-y := -I$(zstd_include) + +# Zstd uses -O3 by default, so we should follow +ccflags-y += -O3 + +# -fno-tree-vectorize gets set for gcc in zstd/common/compiler.h +# Set it for other compilers, too. +$(obj)/lib/zstd.o: c_flags += -fno-tree-vectorize + +# Quiet warnings about frame size due to unused code in unmodified zstd lib +$(obj)/lib/zstd.o: c_flags += -Wframe-larger-than=20480 + +# Disable aarch64 neon SIMD instructions for kernel mode +$(obj)/lib/zstd.o: c_flags += -include $(zstd_include)/aarch64_compat.h + +$(MODULE)-objs += zfs_zstd.o +$(MODULE)-objs += lib/zstd.o + +all: + mkdir -p lib diff --git a/module/zstd/README.md b/module/zstd/README.md new file mode 100644 index 000000000..b08a41906 --- /dev/null +++ b/module/zstd/README.md @@ -0,0 +1,60 @@ +# ZSTD-On-ZFS Library Manual + +## Introduction + +This subtree contains the ZSTD library used in ZFS. It is heavily cut-down by +dropping any unneeded files, and combined into a single file, but otherwise is +intentionally unmodified. 
Please do not alter the file containing the zstd +library, besides upgrading to a newer ZSTD release. + +Tree structure: + +* `zfs_zstd.c` is the actual `zzstd` kernel module. +* `lib/` contains the unmodified, [_"amalgamated"_](https://github.com/facebook/zstd/blob/dev/contrib/single_file_libs/README.md) + version of the `Zstandard` library, generated from our template file +* `zstd-in.c` is our template file for generating the library +* `include/`: This directory contains supplemental includes for platform + compatibility, which are not expected to be used by ZFS elsewhere in the + future. Thus we keep them private to ZSTD. + +## Updating ZSTD + +To update ZSTD the following steps need to be taken: + +1. Grab the latest release of [ZSTD](https://github.com/facebook/zstd/releases). +2. Update `module/zstd/zstd-in.c` if required. (see + `zstd/contrib/single_file_libs/zstd-in.c` in the zstd repository) +3. Generate the "single-file-library" and put it to `module/zstd/lib/`. +4. Copy the following files to `module/zstd/lib/`: + - `zstd/lib/zstd.h` + - `zstd/lib/common/zstd_errors.h` + +This can be done using a few shell commands from inside the zfs repo: + +~~~sh +cd PATH/TO/ZFS + +url="https://github.com/facebook/zstd" +release="$(curl -s "${url}"/releases/latest | grep -oP '(?<=v)[\d\.]+')" +zstd="/tmp/zstd-${release}/" + +wget -O /tmp/zstd.tar.gz \ + "${url}/releases/download/v${release}/zstd-${release}.tar.gz" +tar -C /tmp -xzf /tmp/zstd.tar.gz + +cp ${zstd}/lib/zstd.h module/zstd/lib/ +cp ${zstd}/lib/common/zstd_errors.h module/zstd/lib/ +${zstd}/contrib/single_file_libs/combine.sh \ + -r ${zstd}/lib -o module/zstd/lib/zstd.c module/zstd/zstd-in.c +~~~ + + +## Altering ZSTD and breaking changes + +If ZSTD made changes that break compatibility or you need to make breaking +changes to the way we handle ZSTD, it is required to maintain backwards +compatibility. 
+ +We already save the ZSTD version number within the block header to be used +to add future compatibility checks and/or fixes. However, currently it is +not actually used in such a way. diff --git a/module/zstd/include/aarch64_compat.h b/module/zstd/include/aarch64_compat.h new file mode 100644 index 000000000..088517d3d --- /dev/null +++ b/module/zstd/include/aarch64_compat.h @@ -0,0 +1,37 @@ +/* + * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 2018-2020, Sebastian Gottschall + */ + +#ifdef _KERNEL +#undef __aarch64__ +#endif diff --git a/module/zstd/include/limits.h b/module/zstd/include/limits.h new file mode 100644 index 000000000..3bf5b6776 --- /dev/null +++ b/module/zstd/include/limits.h @@ -0,0 +1,63 @@ +/* + * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 2014-2019, Allan Jude + * Copyright (c) 2020, Brian Behlendorf + * Copyright (c) 2020, Michael Niewöhner + */ + +#ifndef _ZSTD_LIMITS_H +#define _ZSTD_LIMITS_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +#if defined(__FreeBSD__) +#include <sys/limits.h> +#elif defined(__linux__) +#include <linux/limits.h> +#include <linux/kernel.h> +#else +#error "Unsupported platform" +#endif + +#else /* !_KERNEL */ +#include_next <limits.h> +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _ZSTD_LIMITS_H */ diff --git a/module/zstd/include/stddef.h b/module/zstd/include/stddef.h new file mode 100644 index 000000000..3f46fb8b0 --- /dev/null +++ b/module/zstd/include/stddef.h @@ -0,0 +1,62 @@ +/* + * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 2014-2019, Allan Jude + * Copyright (c) 2020, Brian Behlendorf + * Copyright (c) 2020, Michael Niewöhner + */ + +#ifndef _ZSTD_STDDEF_H +#define _ZSTD_STDDEF_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +#if defined(__FreeBSD__) +#include <sys/types.h> +#elif defined(__linux__) +#include <linux/types.h> +#else +#error "Unsupported platform" +#endif + +#else /* !_KERNEL */ +#include_next <stddef.h> +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _ZSTD_STDDEF_H */ diff --git a/module/zstd/include/stdint.h b/module/zstd/include/stdint.h new file mode 100644 index 000000000..2d98a556c --- /dev/null +++ b/module/zstd/include/stdint.h @@ -0,0 +1,62 @@ +/* + * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* + * Copyright (c) 2014-2019, Allan Jude + * Copyright (c) 2020, Brian Behlendorf + * Copyright (c) 2020, Michael Niewöhner + */ + +#ifndef _ZSTD_STDINT_H +#define _ZSTD_STDINT_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +#if defined(__FreeBSD__) +#include <sys/stdint.h> +#elif defined(__linux__) +#include <linux/types.h> +#else +#error "Unsupported platform" +#endif + +#else /* !_KERNEL */ +#include_next <stdint.h> +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _ZSTD_STDINT_H */ diff --git a/module/zstd/include/stdio.h b/module/zstd/include/stdio.h new file mode 100644 index 000000000..5a7c6ec69 --- /dev/null +++ b/module/zstd/include/stdio.h @@ -0,0 +1,54 @@ +/* + * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 2014-2019, Allan Jude + * Copyright (c) 2020, Brian Behlendorf + * Copyright (c) 2020, Michael Niewöhner + */ + +#ifndef _ZSTD_STDIO_H +#define _ZSTD_STDIO_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef _KERNEL + +#include_next <stdio.h> + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _ZSTD_STDIO_H */ diff --git a/module/zstd/include/stdlib.h b/module/zstd/include/stdlib.h new file mode 100644 index 000000000..c341a0c84 --- /dev/null +++ b/module/zstd/include/stdlib.h @@ -0,0 +1,58 @@ +/* + * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 2014-2019, Allan Jude + * Copyright (c) 2020, Brian Behlendorf + * Copyright (c) 2020, Michael Niewöhner + */ + +#ifndef _ZSTD_STDLIB_H +#define _ZSTD_STDLIB_H + +#ifdef __cplusplus +extern "C" { +#endif + +#undef GCC_VERSION + +/* + * Define calloc, malloc, free to make building work. They are never really used + * in lib/zstd.c since allocation is done in zfs_zstd.c. + */ +#define calloc(n, sz) NULL +#define malloc(sz) NULL +#define free(ptr) + +#ifdef __cplusplus +} +#endif + +#endif /* _ZSTD_STDLIB_H */ diff --git a/module/zstd/include/string.h b/module/zstd/include/string.h new file mode 100644 index 000000000..78998d3c4 --- /dev/null +++ b/module/zstd/include/string.h @@ -0,0 +1,62 @@ +/* + * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* + * Copyright (c) 2014-2019, Allan Jude + * Copyright (c) 2020, Brian Behlendorf + * Copyright (c) 2020, Michael Niewöhner + */ + +#ifndef _ZSTD_STRING_H +#define _ZSTD_STRING_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +#if defined(__FreeBSD__) +#include <sys/systm.h> /* memcpy, memset */ +#elif defined(__linux__) +#include <linux/string.h> /* memcpy, memset */ +#else +#error "Unsupported platform" +#endif + +#else /* !_KERNEL */ +#include_next <string.h> +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _ZSTD_STRING_H */ diff --git a/module/zstd/zfs_zstd.c b/module/zstd/zfs_zstd.c new file mode 100644 index 000000000..b6dd7efcc --- /dev/null +++ b/module/zstd/zfs_zstd.c @@ -0,0 +1,737 @@ +/* + * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 2016-2018, Klara Inc. + * Copyright (c) 2016-2018, Allan Jude + * Copyright (c) 2018-2020, Sebastian Gottschall + * Copyright (c) 2019-2020, Michael Niewöhner + * Copyright (c) 2020, The FreeBSD Foundation [1] + * + * [1] Portions of this software were developed by Allan Jude + * under sponsorship from the FreeBSD Foundation. + */ + +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/zfs_context.h> +#include <sys/zio_compress.h> +#include <sys/spa.h> +#include <sys/zstd/zstd.h> + +#define ZSTD_STATIC_LINKING_ONLY +#include "lib/zstd.h" +#include "lib/zstd_errors.h" + +kstat_t *zstd_ksp = NULL; + +typedef struct zstd_stats { + kstat_named_t zstd_stat_alloc_fail; + kstat_named_t zstd_stat_alloc_fallback; + kstat_named_t zstd_stat_com_alloc_fail; + kstat_named_t zstd_stat_dec_alloc_fail; + kstat_named_t zstd_stat_com_inval; + kstat_named_t zstd_stat_dec_inval; + kstat_named_t zstd_stat_dec_header_inval; + kstat_named_t zstd_stat_com_fail; + kstat_named_t zstd_stat_dec_fail; +} zstd_stats_t; + +static zstd_stats_t zstd_stats = { + { "alloc_fail", KSTAT_DATA_UINT64 }, + { "alloc_fallback", KSTAT_DATA_UINT64 }, + { "compress_alloc_fail", KSTAT_DATA_UINT64 }, + { "decompress_alloc_fail", KSTAT_DATA_UINT64 }, + { "compress_level_invalid", KSTAT_DATA_UINT64 }, + { "decompress_level_invalid", KSTAT_DATA_UINT64 }, + { "decompress_header_invalid", KSTAT_DATA_UINT64 }, + { "compress_failed", 
KSTAT_DATA_UINT64 }, + { "decompress_failed", KSTAT_DATA_UINT64 }, +}; + +/* Enums describing the allocator type specified by kmem_type in zstd_kmem */ +enum zstd_kmem_type { + ZSTD_KMEM_UNKNOWN = 0, + /* Allocation type using kmem_vmalloc */ + ZSTD_KMEM_DEFAULT, + /* Pool based allocation using mempool_alloc */ + ZSTD_KMEM_POOL, + /* Reserved fallback memory for decompression only */ + ZSTD_KMEM_DCTX, + ZSTD_KMEM_COUNT, +}; + +/* Structure for pooled memory objects */ +struct zstd_pool { + void *mem; + size_t size; + kmutex_t barrier; + hrtime_t timeout; +}; + +/* Global structure for handling memory allocations */ +struct zstd_kmem { + enum zstd_kmem_type kmem_type; + size_t kmem_size; + struct zstd_pool *pool; +}; + +/* Fallback memory structure used for decompression only if memory runs out */ +struct zstd_fallback_mem { + size_t mem_size; + void *mem; + kmutex_t barrier; +}; + +struct zstd_levelmap { + int16_t zstd_level; + enum zio_zstd_levels level; +}; + +/* + * ZSTD memory handlers + * + * For decompression we use a different handler which also provides fallback + * memory allocation in case memory runs out. + * + * The ZSTD handlers were split up for the most simplified implementation. 
+ */ +static void *zstd_alloc(void *opaque, size_t size); +static void *zstd_dctx_alloc(void *opaque, size_t size); +static void zstd_free(void *opaque, void *ptr); + +/* Compression memory handler */ +static const ZSTD_customMem zstd_malloc = { + zstd_alloc, + zstd_free, + NULL, +}; + +/* Decompression memory handler */ +static const ZSTD_customMem zstd_dctx_malloc = { + zstd_dctx_alloc, + zstd_free, + NULL, +}; + +/* Level map for converting ZFS internal levels to ZSTD levels and vice versa */ +static struct zstd_levelmap zstd_levels[] = { + {ZIO_ZSTD_LEVEL_1, ZIO_ZSTD_LEVEL_1}, + {ZIO_ZSTD_LEVEL_2, ZIO_ZSTD_LEVEL_2}, + {ZIO_ZSTD_LEVEL_3, ZIO_ZSTD_LEVEL_3}, + {ZIO_ZSTD_LEVEL_4, ZIO_ZSTD_LEVEL_4}, + {ZIO_ZSTD_LEVEL_5, ZIO_ZSTD_LEVEL_5}, + {ZIO_ZSTD_LEVEL_6, ZIO_ZSTD_LEVEL_6}, + {ZIO_ZSTD_LEVEL_7, ZIO_ZSTD_LEVEL_7}, + {ZIO_ZSTD_LEVEL_8, ZIO_ZSTD_LEVEL_8}, + {ZIO_ZSTD_LEVEL_9, ZIO_ZSTD_LEVEL_9}, + {ZIO_ZSTD_LEVEL_10, ZIO_ZSTD_LEVEL_10}, + {ZIO_ZSTD_LEVEL_11, ZIO_ZSTD_LEVEL_11}, + {ZIO_ZSTD_LEVEL_12, ZIO_ZSTD_LEVEL_12}, + {ZIO_ZSTD_LEVEL_13, ZIO_ZSTD_LEVEL_13}, + {ZIO_ZSTD_LEVEL_14, ZIO_ZSTD_LEVEL_14}, + {ZIO_ZSTD_LEVEL_15, ZIO_ZSTD_LEVEL_15}, + {ZIO_ZSTD_LEVEL_16, ZIO_ZSTD_LEVEL_16}, + {ZIO_ZSTD_LEVEL_17, ZIO_ZSTD_LEVEL_17}, + {ZIO_ZSTD_LEVEL_18, ZIO_ZSTD_LEVEL_18}, + {ZIO_ZSTD_LEVEL_19, ZIO_ZSTD_LEVEL_19}, + {-1, ZIO_ZSTD_LEVEL_FAST_1}, + {-2, ZIO_ZSTD_LEVEL_FAST_2}, + {-3, ZIO_ZSTD_LEVEL_FAST_3}, + {-4, ZIO_ZSTD_LEVEL_FAST_4}, + {-5, ZIO_ZSTD_LEVEL_FAST_5}, + {-6, ZIO_ZSTD_LEVEL_FAST_6}, + {-7, ZIO_ZSTD_LEVEL_FAST_7}, + {-8, ZIO_ZSTD_LEVEL_FAST_8}, + {-9, ZIO_ZSTD_LEVEL_FAST_9}, + {-10, ZIO_ZSTD_LEVEL_FAST_10}, + {-20, ZIO_ZSTD_LEVEL_FAST_20}, + {-30, ZIO_ZSTD_LEVEL_FAST_30}, + {-40, ZIO_ZSTD_LEVEL_FAST_40}, + {-50, ZIO_ZSTD_LEVEL_FAST_50}, + {-60, ZIO_ZSTD_LEVEL_FAST_60}, + {-70, ZIO_ZSTD_LEVEL_FAST_70}, + {-80, ZIO_ZSTD_LEVEL_FAST_80}, + {-90, ZIO_ZSTD_LEVEL_FAST_90}, + {-100, ZIO_ZSTD_LEVEL_FAST_100}, + {-500, ZIO_ZSTD_LEVEL_FAST_500}, + {-1000, 
ZIO_ZSTD_LEVEL_FAST_1000}, +}; + +/* + * This variable represents the maximum count of the pool based on the number + * of CPUs plus some buffer. We default to cpu count * 4, see init_zstd. + */ +static int pool_count = 16; + +#define ZSTD_POOL_MAX pool_count +#define ZSTD_POOL_TIMEOUT 60 * 2 + +static struct zstd_fallback_mem zstd_dctx_fallback; +static struct zstd_pool *zstd_mempool_cctx; +static struct zstd_pool *zstd_mempool_dctx; + +/* + * Try to get a cached allocated buffer from memory pool or allocate a new one + * if necessary. If a object is older than 2 minutes and does not fit the + * requested size, it will be released and a new cached entry will be allocated. + * If other pooled objects are detected without being used for 2 minutes, they + * will be released, too. + * + * The concept is that high frequency memory allocations of bigger objects are + * expensive. So if a lot of work is going on, allocations will be kept for a + * while and can be reused in that time frame. + * + * The scheduled release will be updated every time a object is reused. + */ +static void * +zstd_mempool_alloc(struct zstd_pool *zstd_mempool, size_t size) +{ + struct zstd_pool *pool; + struct zstd_kmem *mem = NULL; + + if (!zstd_mempool) { + return (NULL); + } + + /* Seek for preallocated memory slot and free obsolete slots */ + for (int i = 0; i < ZSTD_POOL_MAX; i++) { + pool = &zstd_mempool[i]; + /* + * This lock is simply a marker for a pool object beeing in use. + * If it's already hold, it will be skipped. + * + * We need to create it before checking it to avoid race + * conditions caused by running in a threaded context. + * + * The lock is later released by zstd_mempool_free. + */ + if (mutex_tryenter(&pool->barrier)) { + /* + * Check if objects fits the size, if so we take it and + * update the timestamp. 
+ */ + if (!mem && pool->mem && size <= pool->size) { + pool->timeout = gethrestime_sec() + + ZSTD_POOL_TIMEOUT; + mem = pool->mem; + continue; + } + + /* Free memory if unused object older than 2 minutes */ + if (pool->mem && gethrestime_sec() > pool->timeout) { + vmem_free(pool->mem, pool->size); + pool->mem = NULL; + pool->size = 0; + pool->timeout = 0; + } + + mutex_exit(&pool->barrier); + } + } + + if (mem) { + return (mem); + } + + /* + * If no preallocated slot was found, try to fill in a new one. + * + * We run a similar algorithm twice here to avoid pool fragmentation. + * The first one may generate holes in the list if objects get released. + * We always make sure that these holes get filled instead of adding new + * allocations constantly at the end. + */ + for (int i = 0; i < ZSTD_POOL_MAX; i++) { + pool = &zstd_mempool[i]; + if (mutex_tryenter(&pool->barrier)) { + /* Object is free, try to allocate new one */ + if (!pool->mem) { + mem = vmem_alloc(size, KM_SLEEP); + pool->mem = mem; + + if (pool->mem) { + /* Keep track for later release */ + mem->pool = pool; + pool->size = size; + mem->kmem_type = ZSTD_KMEM_POOL; + mem->kmem_size = size; + } + } + + if (size <= pool->size) { + /* Update timestamp */ + pool->timeout = gethrestime_sec() + + ZSTD_POOL_TIMEOUT; + + return (pool->mem); + } + + mutex_exit(&pool->barrier); + } + } + + /* + * If the pool is full or the allocation failed, try lazy allocation + * instead. 
+ */ + if (!mem) { + mem = vmem_alloc(size, KM_NOSLEEP); + if (mem) { + mem->pool = NULL; + mem->kmem_type = ZSTD_KMEM_DEFAULT; + mem->kmem_size = size; + } + } + + return (mem); +} + +/* Mark object as released by releasing the barrier mutex */ +static void +zstd_mempool_free(struct zstd_kmem *z) +{ + mutex_exit(&z->pool->barrier); +} + +/* Convert ZFS internal enum to ZSTD level */ +static int +zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level) +{ + if (level > 0 && level <= ZIO_ZSTD_LEVEL_19) { + *zstd_level = zstd_levels[level - 1].zstd_level; + return (0); + } + if (level >= ZIO_ZSTD_LEVEL_FAST_1 && + level <= ZIO_ZSTD_LEVEL_FAST_1000) { + *zstd_level = zstd_levels[level - ZIO_ZSTD_LEVEL_FAST_1 + + ZIO_ZSTD_LEVEL_19].zstd_level; + return (0); + } + + /* Invalid/unknown zfs compression enum - this should never happen. */ + return (1); +} + +/* Compress block using zstd */ +size_t +zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, + int level) +{ + size_t c_len; + int16_t zstd_level; + zfs_zstdhdr_t *hdr; + ZSTD_CCtx *cctx; + + hdr = (zfs_zstdhdr_t *)d_start; + + /* Skip compression if the specified level is invalid */ + if (zstd_enum_to_level(level, &zstd_level)) { + ZSTDSTAT_BUMP(zstd_stat_com_inval); + return (s_len); + } + + ASSERT3U(d_len, >=, sizeof (*hdr)); + ASSERT3U(d_len, <=, s_len); + ASSERT3U(zstd_level, !=, 0); + + cctx = ZSTD_createCCtx_advanced(zstd_malloc); + + /* + * Out of kernel memory, gently fall through - this will disable + * compression in zio_compress_data + */ + if (!cctx) { + ZSTDSTAT_BUMP(zstd_stat_com_alloc_fail); + return (s_len); + } + + /* Set the compression level */ + ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, zstd_level); + + /* Use the "magicless" zstd header which saves us 4 header bytes */ + ZSTD_CCtx_setParameter(cctx, ZSTD_c_format, ZSTD_f_zstd1_magicless); + + /* + * Disable redundant checksum calculation and content size storage since + * this is already done by ZFS 
itself. + */ + ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 0); + ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, 0); + + c_len = ZSTD_compress2(cctx, + hdr->data, + d_len - sizeof (*hdr), + s_start, s_len); + + ZSTD_freeCCtx(cctx); + + /* Error in the compression routine, disable compression. */ + if (ZSTD_isError(c_len)) { + /* + * If we are aborting the compression because the saves are + * too small, that is not a failure. Everything else is a + * failure, so increment the compression failure counter. + */ + if (ZSTD_getErrorCode(c_len) != ZSTD_error_dstSize_tooSmall) { + ZSTDSTAT_BUMP(zstd_stat_com_fail); + } + return (s_len); + } + + /* + * Encode the compressed buffer size at the start. We'll need this in + * decompression to counter the effects of padding which might be added + * to the compressed buffer and which, if unhandled, would confuse the + * hell out of our decompression function. + */ + hdr->c_len = BE_32(c_len); + + /* + * Check version for overflow. + * The limit of 24 bits must not be exceeded. This allows a maximum + * version 1677.72.15 which we don't expect to be ever reached. + */ + ASSERT3U(ZSTD_VERSION_NUMBER, <=, 0xFFFFFF); + + /* + * Encode the compression level as well. We may need to know the + * original compression level if compressed_arc is disabled, to match + * the compression settings to write this block to the L2ARC. + * + * Encode the actual level, so if the enum changes in the future, we + * will be compatible. + * + * The upper 24 bits store the ZSTD version to be able to provide + * future compatibility, since new versions might enhance the + * compression algorithm in a way, where the compressed data will + * change. + * + * As soon as such incompatibility occurs, handling code needs to be + * added, differentiating between the versions. 
+ */ + hdr->version = ZSTD_VERSION_NUMBER; + hdr->level = level; + hdr->raw_version_level = BE_32(hdr->raw_version_level); + + return (c_len + sizeof (*hdr)); +} + +/* Decompress block using zstd and return its stored level */ +int +zstd_decompress_level(void *s_start, void *d_start, size_t s_len, size_t d_len, + uint8_t *level) +{ + ZSTD_DCtx *dctx; + size_t result; + int16_t zstd_level; + uint32_t c_len; + const zfs_zstdhdr_t *hdr; + zfs_zstdhdr_t hdr_copy; + + hdr = (const zfs_zstdhdr_t *)s_start; + c_len = BE_32(hdr->c_len); + + /* + * Make a copy instead of directly converting the header, since we must + * not modify the original data that may be used again later. + */ + hdr_copy.raw_version_level = BE_32(hdr->raw_version_level); + + /* + * NOTE: We ignore the ZSTD version for now. As soon as any + * incompatibility occurrs, it has to be handled accordingly. + * The version can be accessed via `hdr_copy.version`. + */ + + /* + * Convert and check the level + * An invalid level is a strong indicator for data corruption! In such + * case return an error so the upper layers can try to fix it. 
+ */ + if (zstd_enum_to_level(hdr_copy.level, &zstd_level)) { + ZSTDSTAT_BUMP(zstd_stat_dec_inval); + return (1); + } + + ASSERT3U(d_len, >=, s_len); + ASSERT3U(hdr_copy.level, !=, ZIO_COMPLEVEL_INHERIT); + + /* Invalid compressed buffer size encoded at start */ + if (c_len + sizeof (*hdr) > s_len) { + ZSTDSTAT_BUMP(zstd_stat_dec_header_inval); + return (1); + } + + dctx = ZSTD_createDCtx_advanced(zstd_dctx_malloc); + if (!dctx) { + ZSTDSTAT_BUMP(zstd_stat_dec_alloc_fail); + return (1); + } + + /* Set header type to "magicless" */ + ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, ZSTD_f_zstd1_magicless); + + /* Decompress the data and release the context */ + result = ZSTD_decompressDCtx(dctx, d_start, d_len, hdr->data, c_len); + ZSTD_freeDCtx(dctx); + + /* + * Returns 0 on success (decompression function returned non-negative) + * and non-zero on failure (decompression function returned negative. + */ + if (ZSTD_isError(result)) { + ZSTDSTAT_BUMP(zstd_stat_dec_fail); + return (1); + } + + if (level) { + *level = hdr_copy.level; + } + + return (0); +} + +/* Decompress datablock using zstd */ +int +zstd_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, + int level __maybe_unused) +{ + + return (zstd_decompress_level(s_start, d_start, s_len, d_len, NULL)); +} + +/* Allocator for zstd compression context using mempool_allocator */ +static void * +zstd_alloc(void *opaque __maybe_unused, size_t size) +{ + size_t nbytes = sizeof (struct zstd_kmem) + size; + struct zstd_kmem *z = NULL; + + z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_cctx, nbytes); + + if (!z) { + ZSTDSTAT_BUMP(zstd_stat_alloc_fail); + return (NULL); + } + + return ((void*)z + (sizeof (struct zstd_kmem))); +} + +/* + * Allocator for zstd decompression context using mempool_allocator with + * fallback to reserved memory if allocation fails + */ +static void * +zstd_dctx_alloc(void *opaque __maybe_unused, size_t size) +{ + size_t nbytes = sizeof (struct zstd_kmem) + size; + struct 
zstd_kmem *z = NULL; + enum zstd_kmem_type type = ZSTD_KMEM_DEFAULT; + + z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_dctx, nbytes); + if (!z) { + /* Try harder, decompression shall not fail */ + z = vmem_alloc(nbytes, KM_SLEEP); + if (z) { + z->pool = NULL; + } + ZSTDSTAT_BUMP(zstd_stat_alloc_fail); + } else { + return ((void*)z + (sizeof (struct zstd_kmem))); + } + + /* Fallback if everything fails */ + if (!z) { + /* + * Barrier since we only can handle it in a single thread. All + * other following threads need to wait here until decompression + * is completed. zstd_free will release this barrier later. + */ + mutex_enter(&zstd_dctx_fallback.barrier); + + z = zstd_dctx_fallback.mem; + type = ZSTD_KMEM_DCTX; + ZSTDSTAT_BUMP(zstd_stat_alloc_fallback); + } + + /* Allocation should always be successful */ + if (!z) { + return (NULL); + } + + z->kmem_type = type; + z->kmem_size = nbytes; + + return ((void*)z + (sizeof (struct zstd_kmem))); +} + +/* Free allocated memory by its specific type */ +static void +zstd_free(void *opaque __maybe_unused, void *ptr) +{ + struct zstd_kmem *z = (ptr - sizeof (struct zstd_kmem)); + enum zstd_kmem_type type; + + ASSERT3U(z->kmem_type, <, ZSTD_KMEM_COUNT); + ASSERT3U(z->kmem_type, >, ZSTD_KMEM_UNKNOWN); + + type = z->kmem_type; + switch (type) { + case ZSTD_KMEM_DEFAULT: + vmem_free(z, z->kmem_size); + break; + case ZSTD_KMEM_POOL: + zstd_mempool_free(z); + break; + case ZSTD_KMEM_DCTX: + mutex_exit(&zstd_dctx_fallback.barrier); + break; + default: + break; + } +} + +/* Allocate fallback memory to ensure safe decompression */ +static void __init +create_fallback_mem(struct zstd_fallback_mem *mem, size_t size) +{ + mem->mem_size = size; + mem->mem = vmem_zalloc(mem->mem_size, KM_SLEEP); + mutex_init(&mem->barrier, NULL, MUTEX_DEFAULT, NULL); +} + +/* Initialize memory pool barrier mutexes */ +static void __init +zstd_mempool_init(void) +{ + zstd_mempool_cctx = (struct zstd_pool *) + kmem_zalloc(ZSTD_POOL_MAX * sizeof 
(struct zstd_pool), KM_SLEEP); + zstd_mempool_dctx = (struct zstd_pool *) + kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP); + + for (int i = 0; i < ZSTD_POOL_MAX; i++) { + mutex_init(&zstd_mempool_cctx[i].barrier, NULL, + MUTEX_DEFAULT, NULL); + mutex_init(&zstd_mempool_dctx[i].barrier, NULL, + MUTEX_DEFAULT, NULL); + } +} + +/* Initialize zstd-related memory handling */ +static int __init +zstd_meminit(void) +{ + zstd_mempool_init(); + + /* + * Estimate the size of the fallback decompression context. + * The expected size on x64 with current ZSTD should be about 160 KB. + */ + create_fallback_mem(&zstd_dctx_fallback, + P2ROUNDUP(ZSTD_estimateDCtxSize() + sizeof (struct zstd_kmem), + PAGESIZE)); + + return (0); +} + +/* Release object from pool and free memory */ +static void __exit +release_pool(struct zstd_pool *pool) +{ + mutex_destroy(&pool->barrier); + vmem_free(pool->mem, pool->size); + pool->mem = NULL; + pool->size = 0; +} + +/* Release memory pool objects */ +static void __exit +zstd_mempool_deinit(void) +{ + for (int i = 0; i < ZSTD_POOL_MAX; i++) { + release_pool(&zstd_mempool_cctx[i]); + release_pool(&zstd_mempool_dctx[i]); + } + + kmem_free(zstd_mempool_dctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool)); + kmem_free(zstd_mempool_cctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool)); + zstd_mempool_dctx = NULL; + zstd_mempool_cctx = NULL; +} + +extern int __init +zstd_init(void) +{ + /* Set pool size by using maximum sane thread count * 4 */ + pool_count = (boot_ncpus * 4); + zstd_meminit(); + + /* Initialize kstat */ + zstd_ksp = kstat_create("zfs", 0, "zstd", "misc", + KSTAT_TYPE_NAMED, sizeof (zstd_stats) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + if (zstd_ksp != NULL) { + zstd_ksp->ks_data = &zstd_stats; + kstat_install(zstd_ksp); + } + + return (0); +} + +extern void __exit +zstd_fini(void) +{ + /* Deinitialize kstat */ + if (zstd_ksp != NULL) { + kstat_delete(zstd_ksp); + zstd_ksp = NULL; + } + + /* Release fallback memory */ + 
vmem_free(zstd_dctx_fallback.mem, zstd_dctx_fallback.mem_size); + mutex_destroy(&zstd_dctx_fallback.barrier); + + /* Deinit memory pool */ + zstd_mempool_deinit(); +} + +#if defined(_KERNEL) +module_init(zstd_init); +module_exit(zstd_fini); + +ZFS_MODULE_DESCRIPTION("ZSTD Compression for ZFS"); +ZFS_MODULE_LICENSE("BSD"); +ZFS_MODULE_VERSION(ZSTD_VERSION_STRING); + +EXPORT_SYMBOL(zstd_compress); +EXPORT_SYMBOL(zstd_decompress_level); +EXPORT_SYMBOL(zstd_decompress); +#endif |