LU-1305 osd: dmu helpers

author Alex Zhuravlev <bzzz@whamcloud.com>

Tue, 29 May 2012 10:56:48 +0000 (14:56 +0400)

committer Andreas Dilger <adilger@whamcloud.com>

Fri, 29 Jun 2012 03:49:07 +0000 (23:49 -0400)
author Alex Zhuravlev <bzzz@whamcloud.com>
Tue, 29 May 2012 10:56:48 +0000 (14:56 +0400)
committer Andreas Dilger <adilger@whamcloud.com>
Fri, 29 Jun 2012 03:49:07 +0000 (23:49 -0400)
diff --git a/lustre/osd-zfs/udmu.c b/lustre/osd-zfs/udmu.c

new file mode 100644 (file)

index 0000000..1bb5d2a
--- /dev/null
+++ b/lustre/osd-zfs/udmu.c
@@ -0,0 +1,533 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2011, 2012 Whamcloud, Inc.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/osd-zfs/udmu.c
+ * Module that interacts with the ZFS DMU and provides an abstraction
+ * to the rest of Lustre.
+ *
+ * Author: Alex Zhuravlev <bzzz@whamcloud.com>
+ * Author: Atul Vidwansa <atul.vidwansa@sun.com>
+ * Author: Manoj Joseph <manoj.joseph@sun.com>
+ * Author: Mike Pershin <tappro@whamcloud.com>
+ */
+
+#include <sys/dnode.h>
+#include <sys/dbuf.h>
+#include <sys/spa.h>
+#include <sys/stat.h>
+#include <sys/zap.h>
+#include <sys/spa_impl.h>
+#include <sys/zfs_znode.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_prop.h>
+#include <sys/sa_impl.h>
+#include <sys/txg.h>
+
+#include <lustre/lustre_idl.h>  /* OBD_OBJECT_EOF */
+#include <lustre/lustre_user.h> /* struct obd_statfs */
+
+#include "udmu.h"
+
+/*
+ * Worst-case cost of inserting one block into an object's block tree:
+ * the maximum tree depth multiplied by the maximum indirect block size.
+ */
+int udmu_blk_insert_cost(void)
+{
+       int leaf_shift;
+       int fanout_shift;
+       int depth;
+
+       /* an indirect block must be larger than a single block pointer */
+       CLASSERT(DN_MAX_INDBLKSHIFT > SPA_BLKPTRSHIFT);
+
+       /* log2 of the number of blocks needed to reach the maximum
+        * filesize (that's to say 2^64) */
+       leaf_shift = DN_MAX_OFFSET_SHIFT - SPA_MAXBLOCKSHIFT;
+
+       /* log2 of the number of block pointers that can be stored in an
+        * indirect block */
+       fanout_shift = DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT;
+
+       /* maximum depth of the tree; +1 for rounding purpose */
+       depth = leaf_shift / fanout_shift + 1;
+
+       return depth * (1 << DN_MAX_INDBLKSHIFT);
+}
+
+/*
+ * Take ownership of the dataset named \a osname and initialize \a uos.
+ *
+ * \a uos is zeroed first.  On success uos->os is owned with \a uos as the
+ * ownership tag, the SA attribute table is set up, uos->root holds the id
+ * stored under ZFS_ROOT_OBJ, and uos->objects is seeded from the object
+ * count reported by dmu_objset_space().
+ *
+ * \retval 0        success
+ * \retval EIO      ZPL version or root object entry could not be looked up
+ * \retval ENOTSUPP user/group space accounting is not available
+ * \retval other    positive error passed through from the DMU/ZAP/SA calls
+ *
+ * On any failure the dataset is disowned again and uos->os is NULL.
+ */
+int udmu_objset_open(char *osname, udmu_objset_t *uos)
+{
+       uint64_t refdbytes, availbytes, usedobjs, availobjs;
+       uint64_t version = ZPL_VERSION;
+       uint64_t sa_obj;
+       int      error;
+
+       memset(uos, 0, sizeof(udmu_objset_t));
+
+       error = dmu_objset_own(osname, DMU_OST_ZFS, B_FALSE, uos, &uos->os);
+       if (error) {
+               uos->os = NULL;
+               goto out;
+       }
+
+       /* Check that the ZPL version entry exists.
+        * NOTE(review): the value read into "version" is not compared
+        * against anything below - confirm whether a range check is
+        * wanted here. */
+       error = zap_lookup(uos->os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1,
+                          &version);
+       if (error) {
+               CERROR("%s: Error looking up ZPL VERSION\n", osname);
+               /*
+                * We can't return ENOENT because that would mean the objset
+                * didn't exist.
+                */
+               error = EIO;
+               goto out;
+       }
+
+       /* locate the SA attribute registry and build the attr table */
+       error = zap_lookup(uos->os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
+                          &sa_obj);
+       if (error)
+               goto out;
+
+       error = sa_setup(uos->os, sa_obj, zfs_attr_table, ZPL_END,
+                        &uos->z_attr_table);
+       if (error)
+               goto out;
+
+       error = zap_lookup(uos->os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
+                          &uos->root);
+       if (error) {
+               CERROR("%s: Error looking up ZFS root object.\n", osname);
+               error = EIO;
+               goto out;
+       }
+       ASSERT(uos->root != 0);
+
+       /* Check that user/group usage tracking is supported */
+       if (!dmu_objset_userused_enabled(uos->os) ||
+               DMU_USERUSED_DNODE(uos->os)->dn_type != DMU_OT_USERGROUP_USED ||
+               DMU_GROUPUSED_DNODE(uos->os)->dn_type != DMU_OT_USERGROUP_USED) {
+               CERROR("%s: Space accounting not supported by this target, "
+                       "aborting\n", osname);
+               error = ENOTSUPP;
+               goto out;
+       }
+
+       /*
+        * as DMU doesn't maintain f_files absolutely actual (it's updated
+        * at flush, not when object is create/destroyed) we've implemented
+        * own counter which is initialized from on-disk at mount, then is
+        * being maintained by DMU OSD
+        */
+       dmu_objset_space(uos->os, &refdbytes, &availbytes, &usedobjs,
+                        &availobjs);
+       uos->objects = usedobjs;
+       cfs_spin_lock_init(&uos->lock);
+
+out:
+       if (error && uos->os != NULL) {
+               dmu_objset_disown(uos->os, uos);
+               /* do not leave a pointer to a dataset we no longer own;
+                * this matches the dmu_objset_own() failure path above and
+                * udmu_objset_close() */
+               uos->os = NULL;
+       }
+
+       return error;
+}
+
+/*
+ * Release the dataset owned by \a uos.  uos->os must be non-NULL on entry
+ * and is NULL on return.
+ */
+void udmu_objset_close(udmu_objset_t *uos)
+{
+       struct objset *os;
+
+       ASSERT(uos->os != NULL);
+       os = uos->os;
+
+       /*
+        * Wait for a txg sync before letting go of the dataset.  This
+        * should not be needed, neither for correctness nor safety;
+        * presumably it is only done so that commit callbacks get
+        * called sooner.
+        */
+       txg_wait_synced(dmu_objset_pool(os), 0ULL);
+
+       /* drop ownership (the tag is the udmu_objset itself) */
+       dmu_objset_disown(os, uos);
+       uos->os = NULL;
+}
+
+/*
+ * Estimate the number of objects (dnodes) that fit into \a nrblocks blocks
+ * of 2^SPA_MAXBLOCKSHIFT bytes, based on the current usage of the dataset.
+ *
+ * \param refdbytes  referenced bytes, as reported by dmu_objset_space()
+ * \param usedobjs   used objects, as reported by dmu_objset_space()
+ * \param nrblocks   number of blocks to convert into an object count
+ *
+ * \retval estimated object count, roughly between \a nrblocks and
+ *         \a nrblocks << OSD_DNODE_MIN_BLKSHIFT
+ */
+static uint64_t udmu_objs_count_estimate(uint64_t refdbytes,
+                                       uint64_t usedobjs,
+                                       uint64_t nrblocks)
+{
+       uint64_t est_objs, est_refdblocks, est_usedobjs;
+
+       /* Compute an nrblocks estimate based on the actual number of
+        * dnodes that could fit in the space.  Since we don't know the
+        * overhead associated with each dnode (xattrs, SAs, VDEV overhead,
+        * etc) just using DNODE_SHIFT isn't going to give a good estimate.
+        * Instead, compute an estimate based on the average space usage per
+        * dnode, with an upper and lower cap.
+        *
+        * In case there aren't many dnodes or blocks used yet, add a small
+        * correction factor: pretend OSD_DNODE_EST_COUNT extra dnodes exist,
+        * packed at 2^OSD_DNODE_EST_BLKSHIFT dnodes per block.  This
+        * correction factor gradually disappears as the number of real
+        * dnodes grows.  It also avoids the need to check for
+        * divide-by-zero later.
+        */
+       CLASSERT(OSD_DNODE_MIN_BLKSHIFT > 0);
+       CLASSERT(OSD_DNODE_EST_BLKSHIFT > 0);
+
+       /* OSD_DNODE_EST_BLKSHIFT is log2(dnodes per block), so the number
+        * of blocks taken by OSD_DNODE_EST_COUNT dnodes is obtained with a
+        * right shift.  A left shift would add 32 blocks per synthetic
+        * dnode instead of 32 dnodes per block and force the "one object
+        * per block" branch below until the dataset is very full. */
+       est_refdblocks = (refdbytes >> SPA_MAXBLOCKSHIFT) +
+               (OSD_DNODE_EST_COUNT >> OSD_DNODE_EST_BLKSHIFT);
+       est_usedobjs   = usedobjs + OSD_DNODE_EST_COUNT;
+
+       if (est_usedobjs <= est_refdblocks * 2) {
+               /* Average space/dnode more than maximum dnode size, use max
+                * dnode size to estimate free dnodes from adjusted free
+                * blocks count.  OSTs typically use more than one block per
+                * dnode so this case applies. */
+               est_objs = nrblocks;
+       } else if (est_usedobjs >= (est_refdblocks << OSD_DNODE_MIN_BLKSHIFT)) {
+               /* Average space/dnode smaller than min dnode size (probably
+                * due to metadnode compression), use min dnode size to
+                * estimate the number of objects.
+                * An MDT typically uses below 512 bytes/dnode so this case
+                * applies. */
+               est_objs = nrblocks << OSD_DNODE_MIN_BLKSHIFT;
+       } else {
+               /* Between the extremes, we try to use the average size of
+                * existing dnodes to compute the number of dnodes that fit
+                * into nrblocks:
+                *
+                * est_objs = nrblocks * (est_usedobjs / est_refdblocks);
+                *
+                * but this may overflow 64 bits or become 0 if not handled
+                * well.
+                *
+                * We know nrblocks is below (64 - 17 = 47) bits from
+                * SPA_MAXBLOCKSHIFT, and est_usedobjs is under 48 bits due
+                * to DN_MAX_OBJECT_SHIFT, which means that multiplying them
+                * may get as large as 2 ^ 95.
+                *
+                * We also know (est_usedobjs / est_refdblocks) is between 2
+                * and 256, due to above checks, so we can safely compute
+                * this first.  We care more about accuracy on the MDT (many
+                * dnodes/block) which is good because this is where
+                * truncation errors are smallest.  This adds 8 bits to
+                * nrblocks so we can use 7 bits to compute a fixed-point
+                * fraction and nrblocks can still fit in 64 bits. */
+               unsigned dnodes_per_block = (est_usedobjs << 7)/est_refdblocks;
+
+               est_objs = (nrblocks * dnodes_per_block) >> 7;
+       }
+       return est_objs;
+}
+
+/*
+ * Fill \a osfs with space and object statistics for the dataset in \a uos.
+ *
+ * Block counts are expressed in units of 2^SPA_MAXBLOCKSHIFT bytes, with a
+ * reserve taken off the total, free and available counts.  The free object
+ * count is the smaller of the DMU's own figure and an estimate derived from
+ * the free block count.
+ *
+ * \retval 0 always
+ */
+int udmu_objset_statfs(udmu_objset_t *uos, struct obd_statfs *osfs)
+{
+       uint64_t refdbytes, availbytes, usedobjs, availobjs;
+       uint64_t est_availobjs;
+       uint64_t reserved;
+
+       dmu_objset_space(uos->os, &refdbytes, &availbytes, &usedobjs,
+                       &availobjs);
+
+       /*
+        * ZFS allows multiple block sizes.  For statfs, Linux makes no
+        * proper distinction between bsize and frsize.  For calculations
+        * of free and used blocks incorrectly uses bsize instead of frsize,
+        * but bsize is also used as the optimal blocksize.  We return the
+        * largest possible block size as IO size for the optimum performance
+        * and scale the free and used blocks count appropriately.
+        */
+       osfs->os_bsize = 1ULL << SPA_MAXBLOCKSHIFT;
+
+       osfs->os_blocks = (refdbytes + availbytes) >> SPA_MAXBLOCKSHIFT;
+       osfs->os_bfree = availbytes >> SPA_MAXBLOCKSHIFT;
+       osfs->os_bavail = osfs->os_bfree; /* no extra root reservation */
+
+       /* Take replication (i.e. number of copies) into account */
+       osfs->os_bavail /= uos->os->os_copies;
+
+       /*
+        * Reserve some space so we don't run into ENOSPC due to grants not
+        * accounting for metadata overhead in ZFS, and to avoid fragmentation.
+        * Rather than report this via os_bavail (which makes users unhappy if
+        * they can't fill the filesystem 100%), reduce os_blocks as well.
+        *
+        * Reserve 0.78% of total space, at least 4MB for small filesystems,
+        * for internal files to be created/unlinked when space is tight.
+        */
+       CLASSERT(OSD_STATFS_RESERVED_BLKS > 0);
+       if (likely(osfs->os_blocks >=
+                       OSD_STATFS_RESERVED_BLKS << OSD_STATFS_RESERVED_SHIFT))
+               reserved = osfs->os_blocks >> OSD_STATFS_RESERVED_SHIFT;
+       else
+               reserved = OSD_STATFS_RESERVED_BLKS;
+
+       /* Clamp every subtraction: on a dataset smaller than the minimum
+        * reserve, "reserved" exceeds os_blocks and an unclamped subtraction
+        * would wrap the unsigned total around to a huge value. */
+       osfs->os_blocks -= MIN(reserved, osfs->os_blocks);
+       osfs->os_bfree  -= MIN(reserved, osfs->os_bfree);
+       osfs->os_bavail -= MIN(reserved, osfs->os_bavail);
+
+       /*
+        * The availobjs value returned from dmu_objset_space() is largely
+        * useless, since it reports the number of objects that might
+        * theoretically still fit into the dataset, independent of minor
+        * issues like how much space is actually available in the pool.
+        * Compute a better estimate in udmu_objs_count_estimate().
+        */
+       est_availobjs = udmu_objs_count_estimate(refdbytes, usedobjs,
+                                               osfs->os_bfree);
+
+       osfs->os_ffree = min(availobjs, est_availobjs);
+       /* total = free + the in-core object counter kept in uos */
+       osfs->os_files = osfs->os_ffree + uos->objects;
+
+       /* ZFS XXX: fill in backing dataset FSID/UUID
+          memcpy(osfs->os_fsid, .... );*/
+
+       /* We're a zfs filesystem. */
+       osfs->os_type = UBERBLOCK_MAGIC;
+
+       /* ZFS XXX: fill in appropriate OS_STATE_{DEGRADED,READONLY} flags
+          osfs->os_state = vf_to_stf(vfsp->vfs_flag);
+          if (sb->s_flags & MS_RDONLY)
+          osfs->os_state = OS_STATE_READONLY;
+        */
+
+       osfs->os_namelen = MAXNAMELEN;
+       osfs->os_maxbytes = OBD_OBJECT_EOF;
+
+       return 0;
+}
+
+/**
+ * Helper function to estimate the number of inodes in use for a given
+ * uid/gid from its block usage.
+ *
+ * \param uos       dataset to take the current usage figures from
+ * \param uidbytes  bytes accounted to the uid/gid
+ *
+ * \retval estimated object count; at least 1 whenever \a uidbytes is non-zero
+ */
+uint64_t udmu_objset_user_iused(udmu_objset_t *uos, uint64_t uidbytes)
+{
+       uint64_t refdbytes, availbytes, usedobjs, availobjs;
+       uint64_t estimate;
+
+       /* get fresh statfs info */
+       dmu_objset_space(uos->os, &refdbytes, &availbytes, &usedobjs,
+                       &availobjs);
+
+       /* estimate the number of objects based on the disk usage */
+       estimate = udmu_objs_count_estimate(refdbytes, usedobjs,
+                                       uidbytes >> SPA_MAXBLOCKSHIFT);
+
+       /* if we have at least 1 byte, we have at least one dnode ... */
+       if (uidbytes > 0 && estimate < 1)
+               estimate = 1;
+
+       return estimate;
+}
+
+/*
+ * Get the objset name: copies the name of the dataset behind \a uos into
+ * \a buf by calling dmu_objset_name().
+ * buf must have at least MAXNAMELEN bytes.
+ */
+void udmu_objset_name_get(udmu_objset_t *uos, char *buf)
+{
+       dmu_objset_name(uos->os, buf);
+}
+
+/*
+ * Allocate the name buffers needed to access a user property.
+ *
+ * *real_prop receives a MAXNAMELEN buffer holding "lustre:" followed by
+ * \a prop_name.  If \a os_name is not NULL, *os_name receives a MAXNAMELEN
+ * buffer holding the dataset name.  On success both buffers must be released
+ * with udmu_userprop_cleanup().
+ *
+ * \retval 0             success
+ * \retval ENAMETOOLONG  prefixed name does not fit in MAXNAMELEN; all
+ *                       buffers have already been freed
+ */
+static int udmu_userprop_setup(udmu_objset_t *uos, const char *prop_name,
+               char **os_name, char **real_prop)
+{
+       int len;
+
+       if (os_name != NULL) {
+               *os_name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
+               udmu_objset_name_get(uos, *os_name);
+       }
+
+       *real_prop = kmem_alloc(MAXNAMELEN, KM_SLEEP);
+       len = snprintf(*real_prop, MAXNAMELEN, "lustre:%s", prop_name);
+       if (len < MAXNAMELEN)
+               return 0;
+
+       /* the prefixed name was truncated: undo the allocations */
+       if (os_name != NULL)
+               kmem_free(*os_name, MAXNAMELEN);
+       kmem_free(*real_prop, MAXNAMELEN);
+
+       CERROR("property name too long: %s\n", prop_name);
+       return ENAMETOOLONG;
+}
+
+/*
+ * Free the MAXNAMELEN buffers handed out by udmu_userprop_setup().
+ * \a os_name may be NULL when no dataset name buffer was requested.
+ */
+static void udmu_userprop_cleanup(char **os_name, char **real_prop)
+{
+       kmem_free(*real_prop, MAXNAMELEN);
+
+       if (os_name == NULL)
+               return;
+       kmem_free(*os_name, MAXNAMELEN);
+}
+
+/*
+ * Set ZFS user property "lustre:<prop_name>" of objset 'uos' to string 'val'
+ * (the terminating NUL is stored as part of the value).
+ *
+ * \retval 0 on success, otherwise the error from udmu_userprop_setup() or
+ *         dsl_prop_set()
+ */
+int udmu_userprop_set_str(udmu_objset_t *uos, const char *prop_name,
+               const char *val)
+{
+       char *dataset;
+       char *full_name;
+       int rc;
+
+       rc = udmu_userprop_setup(uos, prop_name, &dataset, &full_name);
+       if (rc == 0) {
+               rc = dsl_prop_set(dataset, full_name, ZPROP_SRC_LOCAL, 1,
+                               strlen(val) + 1, val);
+               udmu_userprop_cleanup(&dataset, &full_name);
+       }
+
+       return rc;
+}
+
+/*
+ * Get ZFS user property "lustre:<prop_name>" of objset 'uos' into buffer
+ * 'buf' of size 'buf_size'.
+ *
+ * \retval 0          value copied into \a buf, NUL-terminated
+ * \retval ENOENT     the dataset has no such property
+ * \retval EOVERFLOW  \a buf is too small for the value and its NUL
+ * \retval other      error from udmu_userprop_setup(), dsl_prop_get_all()
+ *                    or the nvlist accessors
+ */
+int udmu_userprop_get_str(udmu_objset_t *uos, const char *prop_name, char *buf,
+                               size_t buf_size)
+{
+       nvlist_t *props = NULL;
+       nvlist_t *entry;
+       nvpair_t *pair = NULL;
+       char *full_name;
+       char *value;
+       int rc;
+
+       rc = udmu_userprop_setup(uos, prop_name, NULL, &full_name);
+       if (rc != 0)
+               return rc;
+
+       /* We can't just pass buf_size to dsl_prop_get() because it expects the
+          exact value size (zap_lookup() requirement), so we must get all props
+          and extract the one we want. */
+       rc = dsl_prop_get_all(uos->os, &props);
+       if (rc != 0) {
+               props = NULL;
+               goto out;
+       }
+
+       /* stays ENOENT unless a pair with the wanted name is seen */
+       rc = ENOENT;
+       while ((pair = nvlist_next_nvpair(props, pair)) != NULL) {
+               if (strcmp(nvpair_name(pair), full_name) != 0)
+                       continue;
+
+               /* Got the property we were looking for, but the val is not the
+                  string yet, it's an nvlist */
+               rc = nvpair_value_nvlist(pair, &entry);
+               if (rc != 0)
+                       break;
+
+               rc = nvlist_lookup_string(entry, ZPROP_VALUE, &value);
+               if (rc != 0)
+                       break;
+
+               /* room is needed for the string plus its terminator */
+               if (buf_size < strlen(value) + 1)
+                       rc = EOVERFLOW;
+               else
+                       strcpy(buf, value);
+               break;
+       }
+out:
+       if (props != NULL)
+               nvlist_free(props);
+       udmu_userprop_cleanup(NULL, &full_name);
+
+       return rc;
+}
+
+/* We don't actually have direct access to the zap_hashbits() function
+ * so just pretend like we do for now.  If this ever breaks we can look at
+ * it at that time.
+ *
+ * This stand-in ignores its argument and always evaluates to 48, i.e. it
+ * assumes every serialized cursor carries a 48-bit hash. */
+#define zap_hashbits(zc) 48
+/*
+ * ZFS hash format:
+ * | cd (16 bits) | hash (48 bits) |
+ * we need it in other form:
+ * |0| hash (48 bit) | cd (15 bit) |
+ * to be a full 64-bit ordered hash so that Lustre readdir can use it to merge
+ * the readdir hashes from multiple directory stripes uniformly on the client.
+ * Another point is sign bit, the hash range should be in [0, 2^63-1] because
+ * loff_t (for llseek) needs to be a positive value.  This means the "cd" field
+ * should only be the low 15 bits.
+ *
+ * NOTE(review): only the top bit of the ZFS cursor is masked off before the
+ * rotation, so bit 48 (the lowest "cd" bit) is shifted into bit 63 of the
+ * result.  The returned value is therefore not confined to [0, 2^63-1] when
+ * that bit is set - confirm whether callers depend on this before changing.
+ */
+uint64_t udmu_zap_cursor_serialize(zap_cursor_t *zc)
+{
+       const uint64_t low63 = ~0ULL >> 1;
+       uint64_t cursor = zap_cursor_serialize(zc) & low63;
+       /* "cd" field moved down to the low bits */
+       uint64_t cd_part = cursor >> zap_hashbits(zc);
+       /* hash moved up above the "cd" field */
+       uint64_t hash_part = cursor << (63 - zap_hashbits(zc));
+
+       return cd_part | hash_part;
+}
+
+/*
+ * Initialize \a zc on ZAP object \a zapobj of \a uos at the position encoded
+ * in \a dirhash, converting from the layout produced by
+ * udmu_zap_cursor_serialize() back to the layout expected by
+ * zap_cursor_init_serialized().
+ */
+void udmu_zap_cursor_init_serialized(zap_cursor_t *zc, udmu_objset_t *uos,
+               uint64_t zapobj, uint64_t dirhash)
+{
+       const uint64_t low63 = ~0ULL >> 1;
+       /* low bits go back to the top, with bit 63 kept clear */
+       uint64_t cd_part = (dirhash << zap_hashbits(zc)) & low63;
+       /* remaining high bits go back to the bottom */
+       uint64_t hash_part = dirhash >> (63 - zap_hashbits(zc));
+
+       zap_cursor_init_serialized(zc, uos->os, zapobj, cd_part | hash_part);
+}
+
+/*
+ * Zap cursor APIs
+ */
+
+/*
+ * Allocate a cursor and position it on ZAP object \a zapobj of \a uos at
+ * \a dirhash.  The cursor is returned through \a zc and must be released
+ * with udmu_zap_cursor_fini().
+ *
+ * \retval 0       success, *zc is valid
+ * \retval ENOMEM  allocation failed, *zc is left untouched
+ */
+int udmu_zap_cursor_init(zap_cursor_t **zc, udmu_objset_t *uos,
+               uint64_t zapobj, uint64_t dirhash)
+{
+       zap_cursor_t *cursor = kmem_alloc(sizeof(*cursor), KM_NOSLEEP);
+
+       if (cursor == NULL)
+               return (ENOMEM);
+
+       udmu_zap_cursor_init_serialized(cursor, uos, zapobj, dirhash);
+       *zc = cursor;
+
+       return 0;
+}
+
+/*
+ * Finalize \a zc with zap_cursor_fini() and free its memory.  Counterpart
+ * of udmu_zap_cursor_init(); \a zc must not be used afterwards.
+ */
+void udmu_zap_cursor_fini(zap_cursor_t *zc)
+{
+       zap_cursor_fini(zc);
+       kmem_free(zc, sizeof(*zc));
+}
+
+/*
+ * Tell whether the object behind \a db has one of the ZAP-backed dnode types
+ * this module cares about: DMU_OT_DIRECTORY_CONTENTS or
+ * DMU_OT_USERGROUP_USED.
+ *
+ * \retval 1 the dnode type is one of the two above
+ * \retval 0 otherwise
+ */
+int udmu_object_is_zap(dmu_buf_t *db)
+{
+       dmu_buf_impl_t *impl = (dmu_buf_impl_t *) db;
+       dnode_t *dnode;
+       int is_zap;
+
+       /* the dnode may only be dereferenced inside ENTER/EXIT */
+       DB_DNODE_ENTER(impl);
+
+       dnode = DB_DNODE(impl);
+       switch (dnode->dn_type) {
+       case DMU_OT_DIRECTORY_CONTENTS:
+       case DMU_OT_USERGROUP_USED:
+               is_zap = 1;
+               break;
+       default:
+               is_zap = 0;
+               break;
+       }
+
+       DB_DNODE_EXIT(impl);
+
+       return is_zap;
+}
+
diff --git a/lustre/osd-zfs/udmu.h b/lustre/osd-zfs/udmu.h

new file mode 100644 (file)

index 0000000..45487b6
--- /dev/null
+++ b/lustre/osd-zfs/udmu.h
@@ -0,0 +1,119 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2011, 2012 Whamcloud, Inc.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/osd-zfs/udmu.h
+ *
+ * Author: Alex Tomas <alex@clusterfs.com>
+ * Author: Atul Vidwansa <atul.vidwansa@sun.com>
+ * Author: Manoj Joseph <manoj.joseph@sun.com>
+ */
+
+#ifndef _DMU_H
+#define _DMU_H
+
+#include <sys/zap.h>
+#include <sys/vnode.h>
+#include <sys/mode.h>
+#include <sys/sa.h>
+
+#include <lustre/lustre_user.h>
+
+/*
+ * Per-dataset state kept by the udmu layer.  Filled in by
+ * udmu_objset_open() and torn down by udmu_objset_close().
+ */
+typedef struct udmu_objset {
+       /* owned dataset; NULL when not open */
+       struct objset   *os;
+       uint64_t        root;  /* id of root znode */
+       cfs_spinlock_t  lock;  /* protects objects below */
+       /* seeded from the on-disk used-object count at open time */
+       uint64_t        objects; /* in-core counter of objects */
+       /* SA attr mapping->id,
+        * name is the same as in ZFS to use defines SA_ZPL_...*/
+       sa_attr_type_t *z_attr_table;
+} udmu_objset_t;
+
+/* Fallback values, defined only when _SYS_TXG_H is not defined.
+ * NOTE(review): presumably that is the include guard of <sys/txg.h> and
+ * these mirror its values - confirm against the ZFS headers in use. */
+#ifndef _SYS_TXG_H
+#define TXG_WAIT        1ULL
+#define TXG_NOWAIT      2ULL
+#endif
+
+/* Build a directory entry value: the type goes into the top 4 bits (shifted
+ * left by 60) and is OR-ed with the object number.  Both arguments are
+ * parenthesized so that compound expressions (e.g. "a | b") are cast and
+ * shifted as a whole. */
+#define ZFS_DIRENT_MAKE(type, obj) (((uint64_t)(type) << 60) | (obj))
+
+/* Statfs space reservation for grant, fragmentation, and unlink space.
+ * OSD_STATFS_RESERVED_BLKS is the minimum reserve, counted in blocks of
+ * 2^SPA_MAXBLOCKSHIFT bytes; OSD_STATFS_RESERVED_SHIFT is the right shift
+ * applied to the total block count (1/128 of it) on larger datasets. */
+#define OSD_STATFS_RESERVED_BLKS  (1ULL << (22 - SPA_MAXBLOCKSHIFT)) /* 4MB */
+#define OSD_STATFS_RESERVED_SHIFT (7)         /* reserve 0.78% of all space */
+
+/* Statfs dnodes-per-block parameters, as log2 values:
+ * OSD_DNODE_MIN_BLKSHIFT - dnodes per block at the minimum dnode size
+ *                          (2^DNODE_SHIFT bytes), the densest packing
+ * OSD_DNODE_EST_BLKSHIFT - dnodes per block at 4KB (2^12 bytes) per dnode,
+ *                          the safe estimate
+ * OSD_DNODE_EST_COUNT    - number of synthetic dnodes added to the usage
+ *                          figures by udmu_objs_count_estimate() */
+#define OSD_DNODE_MIN_BLKSHIFT (SPA_MAXBLOCKSHIFT - DNODE_SHIFT) /* 17-9 =8 */
+#define OSD_DNODE_EST_BLKSHIFT (SPA_MAXBLOCKSHIFT - 12)          /* 17-12=5 */
+#define OSD_DNODE_EST_COUNT    1024
+
+#define OSD_GRANT_FOR_LOCAL_OIDS (2ULL << 20) /* 2MB for last_rcvd, ... */
+
+/* NOTE(review): udmu_init(), udmu_fini(), udmu_objset_root(),
+ * udmu_get_txg(), udmu_zap_cursor_advance() and
+ * udmu_zap_cursor_move_to_key() are declared here but have no definition in
+ * the udmu.c added by this patch - confirm they are provided elsewhere. */
+void udmu_init(void);
+void udmu_fini(void);
+
+/* udmu object-set API */
+/* own the dataset "osname" and fill in *uos; 0 or a positive errno */
+int udmu_objset_open(char *osname, udmu_objset_t *uos);
+/* disown the dataset; uos->os must be non-NULL */
+void udmu_objset_close(udmu_objset_t *uos);
+/* fill *osfs with space/object statistics; always returns 0 */
+int udmu_objset_statfs(udmu_objset_t *uos, struct obd_statfs *osfs);
+/* estimate the number of objects used by an id owning "uidbytes" bytes */
+uint64_t udmu_objset_user_iused(udmu_objset_t *uos, uint64_t uidbytes);
+int udmu_objset_root(udmu_objset_t *uos, dmu_buf_t **dbp, void *tag);
+uint64_t udmu_get_txg(udmu_objset_t *uos, dmu_tx_t *tx);
+/* worst-case cost of inserting one block into an object's block tree */
+int udmu_blk_insert_cost(void);
+
+/* buf must have at least MAXNAMELEN bytes */
+void udmu_objset_name_get(udmu_objset_t *uos, char *buf);
+
+/* get/set ZFS user properties; prop_name is given without the "lustre:"
+ * prefix, which is added internally */
+int udmu_userprop_set_str(udmu_objset_t *uos, const char *prop_name,
+                         const char *val);
+int udmu_userprop_get_str(udmu_objset_t *uos, const char *prop_name, char *buf,
+                         size_t buf_size);
+
+/* zap cursor apis */
+/* allocate a cursor positioned at "hash"; 0 or ENOMEM */
+int udmu_zap_cursor_init(zap_cursor_t **zc, udmu_objset_t *uos,
+               uint64_t zapobj, uint64_t hash);
+
+/* finalize and free a cursor from udmu_zap_cursor_init() */
+void udmu_zap_cursor_fini(zap_cursor_t *zc);
+
+void udmu_zap_cursor_advance(zap_cursor_t *zc);
+
+/* current cursor position in the Lustre hash layout */
+uint64_t udmu_zap_cursor_serialize(zap_cursor_t *zc);
+
+int udmu_zap_cursor_move_to_key(zap_cursor_t *zc, const char *name);
+
+/* non-zero if the dnode behind the buffer has type
+ * DMU_OT_DIRECTORY_CONTENTS or DMU_OT_USERGROUP_USED */
+int udmu_object_is_zap(dmu_buf_t *);
+
+#endif /* _DMU_H */
author	Alex Zhuravlev <bzzz@whamcloud.com>
	Tue, 29 May 2012 10:56:48 +0000 (14:56 +0400)
committer	Andreas Dilger <adilger@whamcloud.com>
	Fri, 29 Jun 2012 03:49:07 +0000 (23:49 -0400)
lustre/osd-zfs/udmu.c	[new file with mode: 0644]	patch \| blob
lustre/osd-zfs/udmu.h	[new file with mode: 0644]	patch \| blob