From: Alex Zhuravlev Date: Tue, 29 May 2012 10:56:48 +0000 (+0400) Subject: LU-1305 osd: dmu helpers X-Git-Tag: 2.2.59~14 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=a6179606f842d8fb0961296011ed70d383cc0f4e LU-1305 osd: dmu helpers few simple functions to wrap native dmu funtions. to be replaced with direct calls at some point. Signed-off-by: Alex Zhuravlev Change-Id: Ibf9331b5668b1e7db22670caf395391bca002061 Reviewed-on: http://review.whamcloud.com/2971 Reviewed-by: Andreas Dilger Tested-by: Hudson Reviewed-by: Johann Lombardi --- diff --git a/lustre/osd-zfs/udmu.c b/lustre/osd-zfs/udmu.c new file mode 100644 index 0000000..1bb5d2a --- /dev/null +++ b/lustre/osd-zfs/udmu.c @@ -0,0 +1,533 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2011, 2012 Whamcloud, Inc. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/osd-zfs/udmu.c + * Module that interacts with the ZFS DMU and provides an abstraction + * to the rest of Lustre. + * + * Author: Alex Zhuravlev + * Author: Atul Vidwansa + * Author: Manoj Joseph + * Author: Mike Pershin + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include /* OBD_OBJECT_EOF */ +#include /* struct obd_statfs */ + +#include "udmu.h" + +int udmu_blk_insert_cost(void) +{ + int max_blockshift, nr_blkptrshift; + + /* max_blockshift is the log2 of the number of blocks needed to reach + * the maximum filesize (that's to say 2^64) */ + max_blockshift = DN_MAX_OFFSET_SHIFT - SPA_MAXBLOCKSHIFT; + + /* nr_blkptrshift is the log2 of the number of block pointers that can + * be stored in an indirect block */ + CLASSERT(DN_MAX_INDBLKSHIFT > SPA_BLKPTRSHIFT); + nr_blkptrshift = DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT; + + /* max_blockshift / nr_blkptrshift is thus the maximum depth of the + * tree. We add +1 for rounding purpose. + * The tree depth times the indirect block size gives us the maximum + * cost of inserting a block in the tree */ + return (max_blockshift / nr_blkptrshift + 1) * (1 << DN_MAX_INDBLKSHIFT); +} + +int udmu_objset_open(char *osname, udmu_objset_t *uos) +{ + uint64_t refdbytes, availbytes, usedobjs, availobjs; + uint64_t version = ZPL_VERSION; + uint64_t sa_obj; + int error; + + memset(uos, 0, sizeof(udmu_objset_t)); + + error = dmu_objset_own(osname, DMU_OST_ZFS, B_FALSE, uos, &uos->os); + if (error) { + uos->os = NULL; + goto out; + } + + /* Check ZFS version */ + error = zap_lookup(uos->os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1, + &version); + if (error) { + CERROR("%s: Error looking up ZPL VERSION\n", osname); + /* + * We can't return ENOENT because that would mean the objset + * didn't exist. + */ + error = EIO; + goto out; + } + + error = zap_lookup(uos->os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, + &sa_obj); + if (error) + goto out; + + error = sa_setup(uos->os, sa_obj, zfs_attr_table, ZPL_END, + &uos->z_attr_table); + if (error) + goto out; + + error = zap_lookup(uos->os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, + &uos->root); + if (error) { + CERROR("%s: Error looking up ZFS root object.\n", osname); + error = EIO; + goto out; + } + ASSERT(uos->root != 0); + + /* Check that user/group usage tracking is supported */ + if (!dmu_objset_userused_enabled(uos->os) || + DMU_USERUSED_DNODE(uos->os)->dn_type != DMU_OT_USERGROUP_USED || + DMU_GROUPUSED_DNODE(uos->os)->dn_type != DMU_OT_USERGROUP_USED) { + CERROR("%s: Space accounting not supported by this target, " + "aborting\n", osname); + error = ENOTSUPP; + goto out; + } + + /* + * as DMU doesn't maintain f_files absolutely actual (it's updated + * at flush, not when object is create/destroyed) we've implemented + * own counter which is initialized from on-disk at mount, then is + * being maintained by DMU OSD + */ + dmu_objset_space(uos->os, &refdbytes, &availbytes, &usedobjs, + &availobjs); + uos->objects = usedobjs; + cfs_spin_lock_init(&uos->lock); + +out: + if (error && uos->os != NULL) + dmu_objset_disown(uos->os, uos); + + return error; +} + +void udmu_objset_close(udmu_objset_t *uos) +{ + ASSERT(uos->os != NULL); + + /* + * Force a txg sync. This should not be needed, neither for + * correctness nor safety. Presumably, we are only doing + * this to force commit callbacks to be called sooner. + */ + txg_wait_synced(dmu_objset_pool(uos->os), 0ULL); + + /* close the object set */ + dmu_objset_disown(uos->os, uos); + + uos->os = NULL; +} + +/* Estimate the number of objects from a number of blocks */ +static uint64_t udmu_objs_count_estimate(uint64_t refdbytes, + uint64_t usedobjs, + uint64_t nrblocks) +{ + uint64_t est_objs, est_refdblocks, est_usedobjs; + + /* Compute an nrblocks estimate based on the actual number of + * dnodes that could fit in the space. Since we don't know the + * overhead associated with each dnode (xattrs, SAs, VDEV overhead, + * etc) just using DNODE_SHIFT isn't going to give a good estimate. + * Instead, compute an estimate based on the average space usage per + * dnode, with an upper and lower cap. + * + * In case there aren't many dnodes or blocks used yet, add a small + * correction factor using OSD_DNODE_EST_SHIFT. This correction + * factor gradually disappears as the number of real dnodes grows. + * This also avoids the need to check for divide-by-zero later. + */ + CLASSERT(OSD_DNODE_MIN_BLKSHIFT > 0); + CLASSERT(OSD_DNODE_EST_BLKSHIFT > 0); + + est_refdblocks = (refdbytes >> SPA_MAXBLOCKSHIFT) + + (OSD_DNODE_EST_COUNT << OSD_DNODE_EST_BLKSHIFT); + est_usedobjs = usedobjs + OSD_DNODE_EST_COUNT; + + /* Average space/dnode more than maximum dnode size, use max dnode + * size to estimate free dnodes from adjusted free blocks count. + * OSTs typically use more than one block dnode so this case applies. */ + if (est_usedobjs <= est_refdblocks * 2) { + est_objs = nrblocks; + + /* Average space/dnode smaller than min dnode size (probably due to + * metadnode compression), use min dnode size to estimate the number of + * objects. + * An MDT typically uses below 512 bytes/dnode so this case applies. */ + } else if (est_usedobjs >= (est_refdblocks << OSD_DNODE_MIN_BLKSHIFT)) { + est_objs = nrblocks << OSD_DNODE_MIN_BLKSHIFT; + + /* Between the extremes, we try to use the average size of + * existing dnodes to compute the number of dnodes that fit + * into nrblocks: + * + * est_objs = nrblocks * (est_usedobjs / est_refblocks); + * + * but this may overflow 64 bits or become 0 if not handled well + * + * We know nrblocks is below (64 - 17 = 47) bits from + * SPA_MAXBLKSHIFT, and est_usedobjs is under 48 bits due to + * DN_MAX_OBJECT_SHIFT, which means that multiplying them may + * get as large as 2 ^ 95. + * + * We also know (est_usedobjs / est_refdblocks) is between 2 and + * 256, due to above checks, so we can safely compute this first. + * We care more about accuracy on the MDT (many dnodes/block) + * which is good because this is where truncation errors are + * smallest. This adds 8 bits to nrblocks so we can use 7 bits + * to compute a fixed-point fraction and nrblocks can still fit + * in 64 bits. */ + } else { + unsigned dnodes_per_block = (est_usedobjs << 7)/est_refdblocks; + + est_objs = (nrblocks * dnodes_per_block) >> 7; + } + return est_objs; +} + +int udmu_objset_statfs(udmu_objset_t *uos, struct obd_statfs *osfs) +{ + uint64_t refdbytes, availbytes, usedobjs, availobjs; + uint64_t est_availobjs; + uint64_t reserved; + + dmu_objset_space(uos->os, &refdbytes, &availbytes, &usedobjs, + &availobjs); + + /* + * ZFS allows multiple block sizes. For statfs, Linux makes no + * proper distinction between bsize and frsize. For calculations + * of free and used blocks incorrectly uses bsize instead of frsize, + * but bsize is also used as the optimal blocksize. We return the + * largest possible block size as IO size for the optimum performance + * and scale the free and used blocks count appropriately. + */ + osfs->os_bsize = 1ULL << SPA_MAXBLOCKSHIFT; + + osfs->os_blocks = (refdbytes + availbytes) >> SPA_MAXBLOCKSHIFT; + osfs->os_bfree = availbytes >> SPA_MAXBLOCKSHIFT; + osfs->os_bavail = osfs->os_bfree; /* no extra root reservation */ + + /* Take replication (i.e. number of copies) into account */ + osfs->os_bavail /= uos->os->os_copies; + + /* + * Reserve some space so we don't run into ENOSPC due to grants not + * accounting for metadata overhead in ZFS, and to avoid fragmentation. + * Rather than report this via os_bavail (which makes users unhappy if + * they can't fill the filesystem 100%), reduce os_blocks as well. + * + * Reserve 0.78% of total space, at least 4MB for small filesystems, + * for internal files to be created/unlinked when space is tight. + */ + CLASSERT(OSD_STATFS_RESERVED_BLKS > 0); + if (likely(osfs->os_blocks >= + OSD_STATFS_RESERVED_BLKS << OSD_STATFS_RESERVED_SHIFT)) + reserved = osfs->os_blocks >> OSD_STATFS_RESERVED_SHIFT; + else + reserved = OSD_STATFS_RESERVED_BLKS; + + osfs->os_blocks -= reserved; + osfs->os_bfree -= MIN(reserved, osfs->os_bfree); + osfs->os_bavail -= MIN(reserved, osfs->os_bavail); + + /* + * The availobjs value returned from dmu_objset_space() is largely + * useless, since it reports the number of objects that might + * theoretically still fit into the dataset, independent of minor + * issues like how much space is actually available in the pool. + * Compute a better estimate in udmu_objs_count_estimate(). + */ + est_availobjs = udmu_objs_count_estimate(refdbytes, usedobjs, + osfs->os_bfree); + + osfs->os_ffree = min(availobjs, est_availobjs); + osfs->os_files = osfs->os_ffree + uos->objects; + + /* ZFS XXX: fill in backing dataset FSID/UUID + memcpy(osfs->os_fsid, .... );*/ + + /* We're a zfs filesystem. */ + osfs->os_type = UBERBLOCK_MAGIC; + + /* ZFS XXX: fill in appropriate OS_STATE_{DEGRADED,READONLY} flags + osfs->os_state = vf_to_stf(vfsp->vfs_flag); + if (sb->s_flags & MS_RDONLY) + osfs->os_state = OS_STATE_READONLY; + */ + + osfs->os_namelen = MAXNAMELEN; + osfs->os_maxbytes = OBD_OBJECT_EOF; + + return 0; +} + +/** + * Helper function to estimate the number of inodes in use for a give uid/gid + * from the block usage + */ +uint64_t udmu_objset_user_iused(udmu_objset_t *uos, uint64_t uidbytes) +{ + uint64_t refdbytes, availbytes, usedobjs, availobjs; + uint64_t uidobjs; + + /* get fresh statfs info */ + dmu_objset_space(uos->os, &refdbytes, &availbytes, &usedobjs, + &availobjs); + + /* estimate the number of objects based on the disk usage */ + uidobjs = udmu_objs_count_estimate(refdbytes, usedobjs, + uidbytes >> SPA_MAXBLOCKSHIFT); + if (uidbytes > 0) + /* if we have at least 1 byte, we have at least one dnode ... */ + uidobjs = max_t(uint64_t, uidobjs, 1); + return uidobjs; +} + +/* Get the objset name. + buf must have at least MAXNAMELEN bytes */ +void udmu_objset_name_get(udmu_objset_t *uos, char *buf) +{ + dmu_objset_name(uos->os, buf); +} + +static int udmu_userprop_setup(udmu_objset_t *uos, const char *prop_name, + char **os_name, char **real_prop) +{ + if (os_name != NULL) { + *os_name = kmem_alloc(MAXNAMELEN, KM_SLEEP); + udmu_objset_name_get(uos, *os_name); + } + + *real_prop = kmem_alloc(MAXNAMELEN, KM_SLEEP); + + if (snprintf(*real_prop, MAXNAMELEN, "lustre:%s", prop_name) >= + MAXNAMELEN) { + if (os_name != NULL) + kmem_free(*os_name, MAXNAMELEN); + kmem_free(*real_prop, MAXNAMELEN); + + CERROR("property name too long: %s\n", prop_name); + return ENAMETOOLONG; + } + + return 0; +} + +static void udmu_userprop_cleanup(char **os_name, char **real_prop) +{ + if (os_name != NULL) + kmem_free(*os_name, MAXNAMELEN); + kmem_free(*real_prop, MAXNAMELEN); +} + +/* Set ZFS user property 'prop_name' of objset 'uos' to string 'val' */ +int udmu_userprop_set_str(udmu_objset_t *uos, const char *prop_name, + const char *val) +{ + char *os_name; + char *real_prop; + int rc; + + rc = udmu_userprop_setup(uos, prop_name, &os_name, &real_prop); + if (rc != 0) + return rc; + + rc = dsl_prop_set(os_name, real_prop, ZPROP_SRC_LOCAL, 1, + strlen(val) + 1, val); + udmu_userprop_cleanup(&os_name, &real_prop); + + return rc; +} + +/* Get ZFS user property 'prop_name' of objset 'uos' into buffer 'buf' of size + 'buf_size' */ +int udmu_userprop_get_str(udmu_objset_t *uos, const char *prop_name, char *buf, + size_t buf_size) +{ + char *real_prop; + char *nvp_val; + size_t nvp_len; + nvlist_t *nvl = NULL; + nvlist_t *nvl_val; + nvpair_t *elem = NULL; + int rc; + + rc = udmu_userprop_setup(uos, prop_name, NULL, &real_prop); + if (rc != 0) + return rc; + + /* We can't just pass buf_size to dsl_prop_get() because it expects the + exact value size (zap_lookup() requirement), so we must get all props + and extract the one we want. */ + rc = dsl_prop_get_all(uos->os, &nvl); + if (rc != 0) { + nvl = NULL; + goto out; + } + + while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { + const char *name = nvpair_name(elem); + if (strcmp(name, real_prop) != 0) + continue; + + /* Got the property we were looking for, but the val is not the + string yet, it's an nvlist */ + + rc = nvpair_value_nvlist(elem, &nvl_val); + if (rc != 0) + goto out; + + rc = nvlist_lookup_string(nvl_val, ZPROP_VALUE, &nvp_val); + if (rc != 0) + goto out; + + nvp_len = strlen(nvp_val); + if (buf_size < nvp_len + 1) { + rc = EOVERFLOW; + goto out; + } + strcpy(buf, nvp_val); + goto out; + } + /* Not found */ + rc = ENOENT; +out: + if (nvl != NULL) + nvlist_free(nvl); + udmu_userprop_cleanup(NULL, &real_prop); + + return rc; +} + +/* We don't actually have direct access to the zap_hashbits() function + * so just pretend like we do for now. If this ever breaks we can look at + * it at that time. */ +#define zap_hashbits(zc) 48 +/* + * ZFS hash format: + * | cd (16 bits) | hash (48 bits) | + * we need it in other form: + * |0| hash (48 bit) | cd (15 bit) | + * to be a full 64-bit ordered hash so that Lustre readdir can use it to merge + * the readdir hashes from multiple directory stripes uniformly on the client. + * Another point is sign bit, the hash range should be in [0, 2^63-1] because + * loff_t (for llseek) needs to be a positive value. This means the "cd" field + * should only be the low 15 bits. + */ +uint64_t udmu_zap_cursor_serialize(zap_cursor_t *zc) +{ + uint64_t zfs_hash = zap_cursor_serialize(zc) & (~0ULL >> 1); + + return (zfs_hash >> zap_hashbits(zc)) | + (zfs_hash << (63 - zap_hashbits(zc))); +} + +void udmu_zap_cursor_init_serialized(zap_cursor_t *zc, udmu_objset_t *uos, + uint64_t zapobj, uint64_t dirhash) +{ + uint64_t zfs_hash = ((dirhash << zap_hashbits(zc)) & (~0ULL >> 1)) | + (dirhash >> (63 - zap_hashbits(zc))); + zap_cursor_init_serialized(zc, uos->os, zapobj, zfs_hash); +} + +/* + * Zap cursor APIs + */ +int udmu_zap_cursor_init(zap_cursor_t **zc, udmu_objset_t *uos, + uint64_t zapobj, uint64_t dirhash) +{ + zap_cursor_t *t; + + t = kmem_alloc(sizeof(*t), KM_NOSLEEP); + if (t) { + udmu_zap_cursor_init_serialized(t, uos, zapobj, dirhash); + *zc = t; + return 0; + } + return (ENOMEM); +} + +void udmu_zap_cursor_fini(zap_cursor_t *zc) +{ + zap_cursor_fini(zc); + kmem_free(zc, sizeof(*zc)); +} + +/* + * Get the object id from dmu_buf_t + */ +int udmu_object_is_zap(dmu_buf_t *db) +{ + dmu_buf_impl_t *dbi = (dmu_buf_impl_t *) db; + dnode_t *dn; + int rc; + + DB_DNODE_ENTER(dbi); + + dn = DB_DNODE(dbi); + rc = (dn->dn_type == DMU_OT_DIRECTORY_CONTENTS || + dn->dn_type == DMU_OT_USERGROUP_USED); + + DB_DNODE_EXIT(dbi); + + return rc; +} + diff --git a/lustre/osd-zfs/udmu.h b/lustre/osd-zfs/udmu.h new file mode 100644 index 0000000..45487b6 --- /dev/null +++ b/lustre/osd-zfs/udmu.h @@ -0,0 +1,119 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2011, 2012 Whamcloud, Inc. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/osd-zfs/udmu.h + * + * Author: Alex Tomas + * Author: Atul Vidwansa + * Author: Manoj Joseph + */ + +#ifndef _DMU_H +#define _DMU_H + +#include +#include +#include +#include + +#include + +typedef struct udmu_objset { + struct objset *os; + uint64_t root; /* id of root znode */ + cfs_spinlock_t lock; /* protects objects below */ + uint64_t objects; /* in-core counter of objects */ + /* SA attr mapping->id, + * name is the same as in ZFS to use defines SA_ZPL_...*/ + sa_attr_type_t *z_attr_table; +} udmu_objset_t; + +#ifndef _SYS_TXG_H +#define TXG_WAIT 1ULL +#define TXG_NOWAIT 2ULL +#endif + +#define ZFS_DIRENT_MAKE(type, obj) (((uint64_t)type << 60) | obj) + +/* Statfs space reservation for grant, fragmentation, and unlink space. */ +#define OSD_STATFS_RESERVED_BLKS (1ULL << (22 - SPA_MAXBLOCKSHIFT)) /* 4MB */ +#define OSD_STATFS_RESERVED_SHIFT (7) /* reserve 0.78% of all space */ + +/* Statfs {minimum, safe estimate, and maximum} dnodes per block */ +#define OSD_DNODE_MIN_BLKSHIFT (SPA_MAXBLOCKSHIFT - DNODE_SHIFT) /* 17-9 =8 */ +#define OSD_DNODE_EST_BLKSHIFT (SPA_MAXBLOCKSHIFT - 12) /* 17-12=5 */ +#define OSD_DNODE_EST_COUNT 1024 + +#define OSD_GRANT_FOR_LOCAL_OIDS (2ULL << 20) /* 2MB for last_rcvd, ... */ + +void udmu_init(void); +void udmu_fini(void); + +/* udmu object-set API */ +int udmu_objset_open(char *osname, udmu_objset_t *uos); +void udmu_objset_close(udmu_objset_t *uos); +int udmu_objset_statfs(udmu_objset_t *uos, struct obd_statfs *osfs); +uint64_t udmu_objset_user_iused(udmu_objset_t *uos, uint64_t uidbytes); +int udmu_objset_root(udmu_objset_t *uos, dmu_buf_t **dbp, void *tag); +uint64_t udmu_get_txg(udmu_objset_t *uos, dmu_tx_t *tx); +int udmu_blk_insert_cost(void); + +/* buf must have at least MAXNAMELEN bytes */ +void udmu_objset_name_get(udmu_objset_t *uos, char *buf); + +/* get/set ZFS user properties */ +int udmu_userprop_set_str(udmu_objset_t *uos, const char *prop_name, + const char *val); +int udmu_userprop_get_str(udmu_objset_t *uos, const char *prop_name, char *buf, + size_t buf_size); + +/* zap cursor apis */ +int udmu_zap_cursor_init(zap_cursor_t **zc, udmu_objset_t *uos, + uint64_t zapobj, uint64_t hash); + +void udmu_zap_cursor_fini(zap_cursor_t *zc); + +void udmu_zap_cursor_advance(zap_cursor_t *zc); + +uint64_t udmu_zap_cursor_serialize(zap_cursor_t *zc); + +int udmu_zap_cursor_move_to_key(zap_cursor_t *zc, const char *name); + +/* Commit callbacks */ +int udmu_object_is_zap(dmu_buf_t *); + +#endif /* _DMU_H */