From 745c19c70319915a55b71b81b4e89d68e3a4e272 Mon Sep 17 00:00:00 2001 From: Alex Zhuravlev Date: Wed, 19 Mar 2014 12:20:16 +0400 Subject: [PATCH] LU-4821 osd: cleanups in osd-zfs many small changes to get rid of udmu wrappers. Signed-off-by: Alex Zhuravlev Change-Id: Ic8746345da1e6695149bacf066be10bf284aecdf Reviewed-on: http://review.whamcloud.com/9721 Tested-by: Jenkins Reviewed-by: Nathaniel Clark Tested-by: Maloo Reviewed-by: Mike Pershin Reviewed-by: Oleg Drokin --- lustre/osd-zfs/Makefile.in | 2 +- lustre/osd-zfs/autoMakefile.am | 2 +- lustre/osd-zfs/osd_handler.c | 293 +++++++++++++++++++++++++---- lustre/osd-zfs/osd_index.c | 234 ++++++++++++----------- lustre/osd-zfs/osd_internal.h | 54 ++++-- lustre/osd-zfs/osd_io.c | 23 +-- lustre/osd-zfs/osd_object.c | 214 +++++++++------------ lustre/osd-zfs/osd_oi.c | 30 ++- lustre/osd-zfs/osd_quota.c | 120 +++++++++--- lustre/osd-zfs/osd_xattr.c | 146 +++++++-------- lustre/osd-zfs/udmu.c | 417 ----------------------------------------- lustre/osd-zfs/udmu.h | 109 ----------- lustre/tests/test-framework.sh | 38 ++-- lustre/utils/mount_utils_zfs.c | 5 +- 14 files changed, 717 insertions(+), 970 deletions(-) delete mode 100644 lustre/osd-zfs/udmu.c delete mode 100644 lustre/osd-zfs/udmu.h diff --git a/lustre/osd-zfs/Makefile.in b/lustre/osd-zfs/Makefile.in index 7def377..6ffa654 100644 --- a/lustre/osd-zfs/Makefile.in +++ b/lustre/osd-zfs/Makefile.in @@ -1,5 +1,5 @@ MODULES := osd_zfs -osd_zfs-objs := osd_handler.o osd_lproc.o udmu.o osd_quota.o +osd_zfs-objs := osd_handler.o osd_lproc.o osd_quota.o osd_zfs-objs += osd_object.o osd_io.o osd_oi.o osd_xattr.o osd_index.o EXTRA_PRE_CFLAGS += -include @SPL_OBJ@/spl_config.h diff --git a/lustre/osd-zfs/autoMakefile.am b/lustre/osd-zfs/autoMakefile.am index 52af68c..d9bb4c2 100644 --- a/lustre/osd-zfs/autoMakefile.am +++ b/lustre/osd-zfs/autoMakefile.am @@ -43,4 +43,4 @@ endif endif MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ -EXTRA_DIST := $(osd_zfs-objs:%.o=%.c) osd_internal.h udmu.h +EXTRA_DIST := $(osd_zfs-objs:%.o=%.c) osd_internal.h diff --git a/lustre/osd-zfs/osd_handler.c b/lustre/osd-zfs/osd_handler.c index a966487..dfbd45f 100644 --- a/lustre/osd-zfs/osd_handler.c +++ b/lustre/osd-zfs/osd_handler.c @@ -74,8 +74,6 @@ struct lu_context_key osd_key; -static char *root_tag = "osd_mount, rootdb"; - /* Slab for OSD object allocation */ struct kmem_cache *osd_object_kmem; @@ -271,7 +269,7 @@ static int osd_trans_stop(const struct lu_env *env, struct dt_device *dt, dmu_tx_commit(oh->ot_tx); if (th->th_sync) - txg_wait_synced(dmu_objset_pool(osd->od_objset.os), txg); + txg_wait_synced(dmu_objset_pool(osd->od_os), txg); RETURN(rc); } @@ -285,7 +283,7 @@ static struct thandle *osd_trans_create(const struct lu_env *env, dmu_tx_t *tx; ENTRY; - tx = dmu_tx_create(osd->od_objset.os); + tx = dmu_tx_create(osd->od_os); if (tx == NULL) RETURN(ERR_PTR(-ENOMEM)); @@ -310,6 +308,149 @@ static struct thandle *osd_trans_create(const struct lu_env *env, RETURN(th); } +/* Estimate the number of objects from a number of blocks */ +uint64_t osd_objs_count_estimate(uint64_t refdbytes, uint64_t usedobjs, + uint64_t nrblocks) +{ + uint64_t est_objs, est_refdblocks, est_usedobjs; + + /* Compute an nrblocks estimate based on the actual number of + * dnodes that could fit in the space. Since we don't know the + * overhead associated with each dnode (xattrs, SAs, VDEV overhead, + * etc) just using DNODE_SHIFT isn't going to give a good estimate. 
+ * Instead, compute an estimate based on the average space usage per + * dnode, with an upper and lower cap. + * + * In case there aren't many dnodes or blocks used yet, add a small + * correction factor using OSD_DNODE_EST_SHIFT. This correction + * factor gradually disappears as the number of real dnodes grows. + * This also avoids the need to check for divide-by-zero later. + */ + CLASSERT(OSD_DNODE_MIN_BLKSHIFT > 0); + CLASSERT(OSD_DNODE_EST_BLKSHIFT > 0); + + est_refdblocks = (refdbytes >> SPA_MAXBLOCKSHIFT) + + (OSD_DNODE_EST_COUNT >> OSD_DNODE_EST_BLKSHIFT); + est_usedobjs = usedobjs + OSD_DNODE_EST_COUNT; + + /* Average space/dnode more than maximum dnode size, use max dnode + * size to estimate free dnodes from adjusted free blocks count. + * OSTs typically use more than one block dnode so this case applies. */ + if (est_usedobjs <= est_refdblocks * 2) { + est_objs = nrblocks; + + /* Average space/dnode smaller than min dnode size (probably due to + * metadnode compression), use min dnode size to estimate the number of + * objects. + * An MDT typically uses below 512 bytes/dnode so this case applies. */ + } else if (est_usedobjs >= (est_refdblocks << OSD_DNODE_MIN_BLKSHIFT)) { + est_objs = nrblocks << OSD_DNODE_MIN_BLKSHIFT; + + /* Between the extremes, we try to use the average size of + * existing dnodes to compute the number of dnodes that fit + * into nrblocks: + * + * est_objs = nrblocks * (est_usedobjs / est_refblocks); + * + * but this may overflow 64 bits or become 0 if not handled well + * + * We know nrblocks is below (64 - 17 = 47) bits from + * SPA_MAXBLKSHIFT, and est_usedobjs is under 48 bits due to + * DN_MAX_OBJECT_SHIFT, which means that multiplying them may + * get as large as 2 ^ 95. + * + * We also know (est_usedobjs / est_refdblocks) is between 2 and + * 256, due to above checks, we can safely compute this first. + * We care more about accuracy on the MDT (many dnodes/block) + * which is good because this is where truncation errors are + * smallest. This adds 8 bits to nrblocks so we can use 7 bits + * to compute a fixed-point fraction and nrblocks can still fit + * in 64 bits. */ + } else { + unsigned dnodes_per_block = (est_usedobjs << 7)/est_refdblocks; + + est_objs = (nrblocks * dnodes_per_block) >> 7; + } + return est_objs; +} + +static int osd_objset_statfs(struct objset *os, struct obd_statfs *osfs) +{ + uint64_t refdbytes, availbytes, usedobjs, availobjs; + uint64_t est_availobjs; + uint64_t reserved; + + dmu_objset_space(os, &refdbytes, &availbytes, &usedobjs, + &availobjs); + + /* + * ZFS allows multiple block sizes. For statfs, Linux makes no + * proper distinction between bsize and frsize. For calculations + * of free and used blocks incorrectly uses bsize instead of frsize, + * but bsize is also used as the optimal blocksize. We return the + * largest possible block size as IO size for the optimum performance + * and scale the free and used blocks count appropriately. + */ + osfs->os_bsize = 1ULL << SPA_MAXBLOCKSHIFT; + + osfs->os_blocks = (refdbytes + availbytes) >> SPA_MAXBLOCKSHIFT; + osfs->os_bfree = availbytes >> SPA_MAXBLOCKSHIFT; + osfs->os_bavail = osfs->os_bfree; /* no extra root reservation */ + + /* Take replication (i.e. number of copies) into account */ + osfs->os_bavail /= os->os_copies; + + /* + * Reserve some space so we don't run into ENOSPC due to grants not + * accounting for metadata overhead in ZFS, and to avoid fragmentation. 
+ * Rather than report this via os_bavail (which makes users unhappy if + * they can't fill the filesystem 100%), reduce os_blocks as well. + * + * Reserve 0.78% of total space, at least 4MB for small filesystems, + * for internal files to be created/unlinked when space is tight. + */ + CLASSERT(OSD_STATFS_RESERVED_BLKS > 0); + if (likely(osfs->os_blocks >= + OSD_STATFS_RESERVED_BLKS << OSD_STATFS_RESERVED_SHIFT)) + reserved = osfs->os_blocks >> OSD_STATFS_RESERVED_SHIFT; + else + reserved = OSD_STATFS_RESERVED_BLKS; + + osfs->os_blocks -= reserved; + osfs->os_bfree -= MIN(reserved, osfs->os_bfree); + osfs->os_bavail -= MIN(reserved, osfs->os_bavail); + + /* + * The availobjs value returned from dmu_objset_space() is largely + * useless, since it reports the number of objects that might + * theoretically still fit into the dataset, independent of minor + * issues like how much space is actually available in the pool. + * Compute a better estimate in udmu_objs_count_estimate(). + */ + est_availobjs = osd_objs_count_estimate(refdbytes, usedobjs, + osfs->os_bfree); + + osfs->os_ffree = min(availobjs, est_availobjs); + osfs->os_files = osfs->os_ffree + usedobjs; + + /* ZFS XXX: fill in backing dataset FSID/UUID + memcpy(osfs->os_fsid, .... );*/ + + /* We're a zfs filesystem. */ + osfs->os_type = UBERBLOCK_MAGIC; + + /* ZFS XXX: fill in appropriate OS_STATE_{DEGRADED,READONLY} flags + osfs->os_state = vf_to_stf(vfsp->vfs_flag); + if (sb->s_flags & MS_RDONLY) + osfs->os_state = OS_STATE_READONLY; + */ + + osfs->os_namelen = MAXNAMELEN; + osfs->os_maxbytes = OBD_OBJECT_EOF; + + return 0; +} + /* * Concurrency: shouldn't matter. */ @@ -320,15 +461,36 @@ int osd_statfs(const struct lu_env *env, struct dt_device *d, int rc; ENTRY; - rc = udmu_objset_statfs(&osd->od_objset, osfs); - if (unlikely(rc)) + rc = osd_objset_statfs(osd->od_os, osfs); + if (unlikely(rc != 0)) RETURN(rc); + osfs->os_bavail -= min_t(obd_size, OSD_GRANT_FOR_LOCAL_OIDS / osfs->os_bsize, osfs->os_bavail); RETURN(0); } +static int osd_blk_insert_cost(void) +{ + int max_blockshift, nr_blkptrshift; + + /* max_blockshift is the log2 of the number of blocks needed to reach + * the maximum filesize (that's to say 2^64) */ + max_blockshift = DN_MAX_OFFSET_SHIFT - SPA_MAXBLOCKSHIFT; + + /* nr_blkptrshift is the log2 of the number of block pointers that can + * be stored in an indirect block */ + CLASSERT(DN_MAX_INDBLKSHIFT > SPA_BLKPTRSHIFT); + nr_blkptrshift = DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT; + + /* max_blockshift / nr_blkptrshift is thus the maximum depth of the + * tree. We add +1 for rounding purpose. 
+ * The tree depth times the indirect block size gives us the maximum + * cost of inserting a block in the tree */ + return (max_blockshift / nr_blkptrshift + 1) * (1<ddp_inodespace = OSD_DNODE_EST_COUNT; /* per-fragment overhead to be used by the client code */ - param->ddp_grant_frag = udmu_blk_insert_cost(); + param->ddp_grant_frag = osd_blk_insert_cost(); } /* @@ -377,14 +539,14 @@ static int osd_sync(const struct lu_env *env, struct dt_device *d) { struct osd_device *osd = osd_dt_dev(d); CDEBUG(D_HA, "syncing OSD %s\n", LUSTRE_OSD_ZFS_NAME); - txg_wait_synced(dmu_objset_pool(osd->od_objset.os), 0ULL); + txg_wait_synced(dmu_objset_pool(osd->od_os), 0ULL); return 0; } static int osd_commit_async(const struct lu_env *env, struct dt_device *dev) { struct osd_device *osd = osd_dt_dev(dev); - tx_state_t *tx = &dmu_objset_pool(osd->od_objset.os)->dp_tx; + tx_state_t *tx = &dmu_objset_pool(osd->od_os)->dp_tx; uint64_t txg; mutex_enter(&tx->tx_sync_lock); @@ -409,7 +571,7 @@ static int osd_ro(const struct lu_env *env, struct dt_device *d) CERROR("%s: *** setting device %s read-only ***\n", osd->od_svname, LUSTRE_OSD_ZFS_NAME); osd->od_rdonly = 1; - spa_freeze(dmu_objset_spa(osd->od_objset.os)); + spa_freeze(dmu_objset_spa(osd->od_os)); RETURN(0); } @@ -516,6 +678,65 @@ static void osd_xattr_changed_cb(void *arg, uint64_t newval) osd->od_xattr_in_sa = (newval == ZFS_XATTR_SA); } +static int osd_objset_open(struct osd_device *o) +{ + uint64_t version = ZPL_VERSION; + uint64_t sa_obj; + int rc; + ENTRY; + + rc = -dmu_objset_own(o->od_mntdev, DMU_OST_ZFS, B_FALSE, o, &o->od_os); + if (rc) { + o->od_os = NULL; + goto out; + } + + /* Check ZFS version */ + rc = -zap_lookup(o->od_os, MASTER_NODE_OBJ, + ZPL_VERSION_STR, 8, 1, &version); + if (rc) { + CERROR("%s: Error looking up ZPL VERSION\n", o->od_mntdev); + /* + * We can't return ENOENT because that would mean the objset + * didn't exist. 
+ */ + GOTO(out, rc = -EIO); + } + + rc = -zap_lookup(o->od_os, MASTER_NODE_OBJ, + ZFS_SA_ATTRS, 8, 1, &sa_obj); + if (rc) + GOTO(out, rc); + + rc = -sa_setup(o->od_os, sa_obj, zfs_attr_table, + ZPL_END, &o->z_attr_table); + if (rc) + GOTO(out, rc); + + rc = -zap_lookup(o->od_os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, + 8, 1, &o->od_rootid); + if (rc) { + CERROR("%s: lookup for root failed: rc = %d\n", + o->od_svname, rc); + GOTO(out, rc); + } + + /* Check that user/group usage tracking is supported */ + if (!dmu_objset_userused_enabled(o->od_os) || + DMU_USERUSED_DNODE(o->od_os)->dn_type != DMU_OT_USERGROUP_USED || + DMU_GROUPUSED_DNODE(o->od_os)->dn_type != DMU_OT_USERGROUP_USED) { + CERROR("%s: Space accounting not supported by this target, " + "aborting\n", o->od_svname); + GOTO(out, -ENOTSUPP); + } + +out: + if (rc != 0 && o->od_os != NULL) + dmu_objset_disown(o->od_os, o); + + RETURN(rc); +} + static int osd_mount(const struct lu_env *env, struct osd_device *o, struct lustre_cfg *cfg) { @@ -528,7 +749,7 @@ static int osd_mount(const struct lu_env *env, int rc; ENTRY; - if (o->od_objset.os != NULL) + if (o->od_os != NULL) RETURN(0); if (mntdev == NULL || svname == NULL) @@ -545,33 +766,34 @@ static int osd_mount(const struct lu_env *env, if (server_name_is_ost(o->od_svname)) o->od_is_ost = 1; - rc = -udmu_objset_open(o->od_mntdev, &o->od_objset); + rc = osd_objset_open(o); if (rc) { - CERROR("can't open objset %s: %d\n", o->od_mntdev, rc); + CERROR("%s: can't open objset %s: rc = %d\n", o->od_svname, + o->od_mntdev, rc); RETURN(rc); } - ds = dmu_objset_ds(o->od_objset.os); - dp = dmu_objset_pool(o->od_objset.os); + ds = dmu_objset_ds(o->od_os); + dp = dmu_objset_pool(o->od_os); LASSERT(ds); LASSERT(dp); dsl_pool_config_enter(dp, FTAG); rc = dsl_prop_register(ds, "xattr", osd_xattr_changed_cb, o); dsl_pool_config_exit(dp, FTAG); if (rc) - CERROR("%s: cat not register xattr callback, ignore: %d\n", - o->od_svname, rc); + CWARN("%s: can't register xattr callback, ignore: rc=%d\n", + o->od_svname, rc); - rc = __osd_obj2dbuf(env, o->od_objset.os, o->od_objset.root, - &rootdb, root_tag); + rc = __osd_obj2dbuf(env, o->od_os, o->od_rootid, &rootdb); if (rc) { - CERROR("udmu_obj2dbuf() failed with error %d\n", rc); - udmu_objset_close(&o->od_objset); + CERROR("%s: obj2dbuf() failed: rc = %d\n", o->od_svname, rc); + dmu_objset_disown(o->od_os, o); + o->od_os = NULL; RETURN(rc); } o->od_root = rootdb->db_object; - sa_buf_rele(rootdb, root_tag); + sa_buf_rele(rootdb, osd_obj_tag); /* 1. 
initialize oi before any file create or file open */ rc = osd_oi_init(env, o); @@ -633,8 +855,15 @@ static void osd_umount(const struct lu_env *env, struct osd_device *o) CERROR("%s: lost %d pinned dbuf(s)\n", o->od_svname, atomic_read(&o->od_zerocopy_pin)); - if (o->od_objset.os != NULL) - udmu_objset_close(&o->od_objset); + if (o->od_os != NULL) { + /* force a txg sync to get all commit callbacks */ + txg_wait_synced(dmu_objset_pool(o->od_os), 0ULL); + + /* close the object set */ + dmu_objset_disown(o->od_os, o); + + o->od_os = NULL; + } EXIT; } @@ -727,8 +956,8 @@ static struct lu_device *osd_device_fini(const struct lu_env *env, osd_shutdown(env, o); osd_oi_fini(env, o); - if (o->od_objset.os) { - ds = dmu_objset_ds(o->od_objset.os); + if (o->od_os) { + ds = dmu_objset_ds(o->od_os); rc = dsl_prop_unregister(ds, "xattr", osd_xattr_changed_cb, o); if (rc) CERROR("%s: dsl_prop_unregister xattr error %d\n", @@ -738,7 +967,7 @@ static struct lu_device *osd_device_fini(const struct lu_env *env, o->arc_prune_cb = NULL; } osd_sync(env, lu2dt_dev(d)); - txg_wait_callbacks(spa_get_dsl(dmu_objset_spa(o->od_objset.os))); + txg_wait_callbacks(spa_get_dsl(dmu_objset_spa(o->od_os))); } rc = osd_procfs_fini(o); @@ -747,7 +976,7 @@ static struct lu_device *osd_device_fini(const struct lu_env *env, RETURN(ERR_PTR(rc)); } - if (o->od_objset.os) + if (o->od_os) osd_umount(env, o); RETURN(NULL); @@ -829,9 +1058,9 @@ static int osd_obd_connect(const struct lu_env *env, struct obd_export **exp, *exp = class_conn2export(&conn); - spin_lock(&osd->od_objset.lock); + spin_lock(&obd->obd_dev_lock); osd->od_connects++; - spin_unlock(&osd->od_objset.lock); + spin_unlock(&obd->obd_dev_lock); RETURN(0); } @@ -848,11 +1077,11 @@ static int osd_obd_disconnect(struct obd_export *exp) ENTRY; /* Only disconnect the underlying layers on the final disconnect. */ - spin_lock(&osd->od_objset.lock); + spin_lock(&obd->obd_dev_lock); osd->od_connects--; if (osd->od_connects == 0) release = 1; - spin_unlock(&osd->od_objset.lock); + spin_unlock(&obd->obd_dev_lock); rc = class_disconnect(exp); /* bz 9811 */ diff --git a/lustre/osd-zfs/osd_index.c b/lustre/osd-zfs/osd_index.c index b635943..2f420a9 100644 --- a/lustre/osd-zfs/osd_index.c +++ b/lustre/osd-zfs/osd_index.c @@ -67,6 +67,89 @@ #include #include +static inline int osd_object_is_zap(dmu_buf_t *db) +{ + dmu_buf_impl_t *dbi = (dmu_buf_impl_t *) db; + dnode_t *dn; + int rc; + + DB_DNODE_ENTER(dbi); + dn = DB_DNODE(dbi); + rc = (dn->dn_type == DMU_OT_DIRECTORY_CONTENTS || + dn->dn_type == DMU_OT_USERGROUP_USED); + DB_DNODE_EXIT(dbi); + + return rc; +} + +/* We don't actually have direct access to the zap_hashbits() function + * so just pretend like we do for now. If this ever breaks we can look at + * it at that time. */ +#define zap_hashbits(zc) 48 +/* + * ZFS hash format: + * | cd (16 bits) | hash (48 bits) | + * we need it in other form: + * |0| hash (48 bit) | cd (15 bit) | + * to be a full 64-bit ordered hash so that Lustre readdir can use it to merge + * the readdir hashes from multiple directory stripes uniformly on the client. + * Another point is sign bit, the hash range should be in [0, 2^63-1] because + * loff_t (for llseek) needs to be a positive value. This means the "cd" field + * should only be the low 15 bits. 
+ */ +uint64_t osd_zap_cursor_serialize(zap_cursor_t *zc) +{ + uint64_t zfs_hash = zap_cursor_serialize(zc) & (~0ULL >> 1); + + return (zfs_hash >> zap_hashbits(zc)) | + (zfs_hash << (63 - zap_hashbits(zc))); +} + +void osd_zap_cursor_init_serialized(zap_cursor_t *zc, struct objset *os, + uint64_t id, uint64_t dirhash) +{ + uint64_t zfs_hash = ((dirhash << zap_hashbits(zc)) & (~0ULL >> 1)) | + (dirhash >> (63 - zap_hashbits(zc))); + + zap_cursor_init_serialized(zc, os, id, zfs_hash); +} + +int osd_zap_cursor_init(zap_cursor_t **zc, struct objset *os, + uint64_t id, uint64_t dirhash) +{ + zap_cursor_t *t; + + OBD_ALLOC_PTR(t); + if (unlikely(t == NULL)) + return -ENOMEM; + + osd_zap_cursor_init_serialized(t, os, id, dirhash); + *zc = t; + + return 0; +} + +void osd_zap_cursor_fini(zap_cursor_t *zc) +{ + zap_cursor_fini(zc); + OBD_FREE_PTR(zc); +} + +static inline void osd_obj_cursor_init_serialized(zap_cursor_t *zc, + struct osd_object *o, + uint64_t dirhash) +{ + struct osd_device *d = osd_obj2dev(o); + zap_cursor_init_serialized(zc, d->od_os, o->oo_db->db_object, dirhash); +} + +static inline int osd_obj_cursor_init(zap_cursor_t **zc, struct osd_object *o, + uint64_t dirhash) +{ + struct osd_device *d = osd_obj2dev(o); + return osd_zap_cursor_init(zc, d->od_os, o->oo_db->db_object, dirhash); +} + static struct dt_it *osd_index_it_init(const struct lu_env *env, struct dt_object *dt, __u32 unused, @@ -75,22 +158,22 @@ static struct dt_it *osd_index_it_init(const struct lu_env *env, struct osd_thread_info *info = osd_oti_get(env); struct osd_zap_it *it; struct osd_object *obj = osd_dt_obj(dt); - struct osd_device *osd = osd_obj2dev(obj); struct lu_object *lo = &dt->do_lu; + int rc; ENTRY; /* XXX: check capa ? */ LASSERT(lu_object_exists(lo)); LASSERT(obj->oo_db); - LASSERT(udmu_object_is_zap(obj->oo_db)); + LASSERT(osd_object_is_zap(obj->oo_db)); LASSERT(info); it = &info->oti_it_zap; - if (udmu_zap_cursor_init(&it->ozi_zc, &osd->od_objset, - obj->oo_db->db_object, 0)) - RETURN(ERR_PTR(-ENOMEM)); + rc = osd_obj_cursor_init(&it->ozi_zc, obj, 0); + if (rc != 0) + RETURN(ERR_PTR(rc)); it->ozi_obj = obj; it->ozi_capa = capa; @@ -111,7 +194,7 @@ static void osd_index_it_fini(const struct lu_env *env, struct dt_it *di) obj = it->ozi_obj; - udmu_zap_cursor_fini(it->ozi_zc); + osd_zap_cursor_fini(it->ozi_zc); lu_object_put(env, &obj->oo_dt.do_lu); EXIT; @@ -124,57 +207,6 @@ static void osd_index_it_put(const struct lu_env *env, struct dt_it *di) * next/finish. */ } -int udmu_zap_cursor_retrieve_key(const struct lu_env *env, - zap_cursor_t *zc, char *key, int max) -{ - zap_attribute_t *za = &osd_oti_get(env)->oti_za; - int err; - - if ((err = zap_cursor_retrieve(zc, za))) - return err; - - if (key) - strcpy(key, za->za_name); - - return 0; -} - -/* - * zap_cursor_retrieve read from current record. - * to read bytes we need to call zap_lookup explicitly. 
- */ -int udmu_zap_cursor_retrieve_value(const struct lu_env *env, - zap_cursor_t *zc, char *buf, - int buf_size, int *bytes_read) -{ - zap_attribute_t *za = &osd_oti_get(env)->oti_za; - int err, actual_size; - - if ((err = zap_cursor_retrieve(zc, za))) - return err; - - if (za->za_integer_length <= 0) - return (ERANGE); - - actual_size = za->za_integer_length * za->za_num_integers; - - if (actual_size > buf_size) { - actual_size = buf_size; - buf_size = actual_size / za->za_integer_length; - } else { - buf_size = za->za_num_integers; - } - - err = -zap_lookup(zc->zc_objset, zc->zc_zapobj, - za->za_name, za->za_integer_length, - buf_size, buf); - - if (!err) - *bytes_read = actual_size; - - return err; -} - static inline void osd_it_append_attrs(struct lu_dirent *ent, __u32 attr, int len, __u16 type) { @@ -202,8 +234,8 @@ static int osd_find_parent_by_dnode(const struct lu_env *env, struct dt_object *o, struct lu_fid *fid) { + struct osd_device *osd = osd_obj2dev(osd_dt_obj(o)); struct lustre_mdt_attrs *lma; - udmu_objset_t *uos = &osd_obj2dev(osd_dt_obj(o))->od_objset; struct lu_buf buf; sa_handle_t *sa_hdl; nvlist_t *nvbuf = NULL; @@ -214,19 +246,19 @@ static int osd_find_parent_by_dnode(const struct lu_env *env, /* first of all, get parent dnode from own attributes */ LASSERT(osd_dt_obj(o)->oo_db); - rc = -sa_handle_get(uos->os, osd_dt_obj(o)->oo_db->db_object, + rc = -sa_handle_get(osd->od_os, osd_dt_obj(o)->oo_db->db_object, NULL, SA_HDL_PRIVATE, &sa_hdl); if (rc) RETURN(rc); dnode = ZFS_NO_OBJECT; - rc = -sa_lookup(sa_hdl, SA_ZPL_PARENT(uos), &dnode, 8); + rc = -sa_lookup(sa_hdl, SA_ZPL_PARENT(osd), &dnode, 8); sa_handle_destroy(sa_hdl); if (rc) RETURN(rc); /* now get EA buffer */ - rc = __osd_xattr_load(uos, dnode, &nvbuf); + rc = __osd_xattr_load(osd, dnode, &nvbuf); if (rc) GOTO(regular, rc); @@ -246,12 +278,12 @@ regular: /* no LMA attribute in SA, let's try regular EA */ /* first of all, get parent dnode storing regular EA */ - rc = -sa_handle_get(uos->os, dnode, NULL, SA_HDL_PRIVATE, &sa_hdl); + rc = -sa_handle_get(osd->od_os, dnode, NULL, SA_HDL_PRIVATE, &sa_hdl); if (rc) GOTO(out, rc); dnode = ZFS_NO_OBJECT; - rc = -sa_lookup(sa_hdl, SA_ZPL_XATTR(uos), &dnode, 8); + rc = -sa_lookup(sa_hdl, SA_ZPL_XATTR(osd), &dnode, 8); sa_handle_destroy(sa_hdl); if (rc) GOTO(out, rc); @@ -261,7 +293,7 @@ regular: buf.lb_len = sizeof(osd_oti_get(env)->oti_buf); /* now try to find LMA */ - rc = __osd_xattr_get_large(env, uos, dnode, &buf, + rc = __osd_xattr_get_large(env, osd, dnode, &buf, XATTR_NAME_LMA, &size); if (rc == 0 && size >= sizeof(*lma)) { lma = buf.lb_buf; @@ -361,7 +393,7 @@ static int osd_dir_lookup(const struct lu_env *env, struct dt_object *dt, int rc; ENTRY; - LASSERT(udmu_object_is_zap(obj->oo_db)); + LASSERT(osd_object_is_zap(obj->oo_db)); if (name[0] == '.') { if (name[1] == 0) { @@ -374,7 +406,7 @@ static int osd_dir_lookup(const struct lu_env *env, struct dt_object *dt, } } - rc = -zap_lookup(osd->od_objset.os, obj->oo_db->db_object, + rc = -zap_lookup(osd->od_os, obj->oo_db->db_object, (char *)key, 8, sizeof(oti->oti_zde) / 8, (void *)&oti->oti_zde); memcpy(rec, &oti->oti_zde.lzd_fid, sizeof(struct lu_fid)); @@ -396,7 +428,7 @@ static int osd_declare_dir_insert(const struct lu_env *env, oh = container_of0(th, struct osd_thandle, ot_super); LASSERT(obj->oo_db); - LASSERT(udmu_object_is_zap(obj->oo_db)); + LASSERT(osd_object_is_zap(obj->oo_db)); dmu_tx_hold_bonus(oh->ot_tx, obj->oo_db->db_object); dmu_tx_hold_zap(oh->ot_tx, obj->oo_db->db_object, TRUE, (char *)key); @@ 
-547,7 +579,7 @@ static int osd_dir_insert(const struct lu_env *env, struct dt_object *dt, ENTRY; LASSERT(parent->oo_db); - LASSERT(udmu_object_is_zap(parent->oo_db)); + LASSERT(osd_object_is_zap(parent->oo_db)); LASSERT(dt_object_exists(dt)); LASSERT(osd_invariant(parent)); @@ -585,9 +617,8 @@ static int osd_dir_insert(const struct lu_env *env, struct dt_object *dt, } else if (name[1] == '.' && name[2] == 0) { /* update parent dnode in the child. * later it will be used to generate ".." */ - udmu_objset_t *uos = &osd->od_objset; rc = osd_object_sa_update(parent, - SA_ZPL_PARENT(uos), + SA_ZPL_PARENT(osd), &child->oo_db->db_object, 8, oh); GOTO(out, rc); @@ -602,7 +633,7 @@ static int osd_dir_insert(const struct lu_env *env, struct dt_object *dt, oti->oti_zde.lzd_fid = *fid; /* Insert (key,oid) into ZAP */ - rc = -zap_add(osd->od_objset.os, parent->oo_db->db_object, + rc = -zap_add(osd->od_os, parent->oo_db->db_object, (char *)key, 8, sizeof(oti->oti_zde) / 8, (void *)&oti->oti_zde, oh->ot_tx); @@ -629,7 +660,7 @@ static int osd_declare_dir_delete(const struct lu_env *env, oh = container_of0(th, struct osd_thandle, ot_super); LASSERT(obj->oo_db); - LASSERT(udmu_object_is_zap(obj->oo_db)); + LASSERT(osd_object_is_zap(obj->oo_db)); dmu_tx_hold_zap(oh->ot_tx, obj->oo_db->db_object, TRUE, (char *)key); @@ -649,7 +680,7 @@ static int osd_dir_delete(const struct lu_env *env, struct dt_object *dt, ENTRY; LASSERT(obj->oo_db); - LASSERT(udmu_object_is_zap(obj->oo_db)); + LASSERT(osd_object_is_zap(obj->oo_db)); LASSERT(th != NULL); oh = container_of0(th, struct osd_thandle, ot_super); @@ -667,7 +698,7 @@ static int osd_dir_delete(const struct lu_env *env, struct dt_object *dt, } /* Remove key from the ZAP */ - rc = -zap_remove(osd->od_objset.os, zap_db->db_object, + rc = -zap_remove(osd->od_os, zap_db->db_object, (char *) key, oh->ot_tx); if (unlikely(rc && rc != -ENOENT)) @@ -705,7 +736,6 @@ static int osd_dir_it_get(const struct lu_env *env, { struct osd_zap_it *it = (struct osd_zap_it *)di; struct osd_object *obj = it->ozi_obj; - struct osd_device *osd = osd_obj2dev(obj); char *name = (char *)key; int rc; ENTRY; @@ -713,11 +743,9 @@ static int osd_dir_it_get(const struct lu_env *env, LASSERT(it); LASSERT(it->ozi_zc); - udmu_zap_cursor_fini(it->ozi_zc); - - if (udmu_zap_cursor_init(&it->ozi_zc, &osd->od_objset, - obj->oo_db->db_object, 0)) - RETURN(-ENOMEM); + /* reset the cursor */ + zap_cursor_fini(it->ozi_zc); + osd_obj_cursor_init_serialized(it->ozi_zc, obj, 0); /* XXX: implementation of the API is broken at the moment */ LASSERT(((const char *)key)[0] == 0); @@ -916,7 +944,7 @@ static int osd_dir_it_rec(const struct lu_env *env, const struct dt_it *di, if (unlikely(rc != 0)) GOTO(out, rc); - lde->lde_hash = cpu_to_le64(udmu_zap_cursor_serialize(it->ozi_zc)); + lde->lde_hash = cpu_to_le64(osd_zap_cursor_serialize(it->ozi_zc)); namelen = strlen(za->za_name); if (namelen > NAME_MAX) GOTO(out, rc = -EOVERFLOW); @@ -995,7 +1023,7 @@ static __u64 osd_dir_it_store(const struct lu_env *env, const struct dt_it *di) if (it->ozi_pos <= 2) pos = it->ozi_pos; else - pos = udmu_zap_cursor_serialize(it->ozi_zc); + pos = osd_zap_cursor_serialize(it->ozi_zc); RETURN(pos); } @@ -1011,15 +1039,13 @@ static int osd_dir_it_load(const struct lu_env *env, { struct osd_zap_it *it = (struct osd_zap_it *)di; struct osd_object *obj = it->ozi_obj; - struct osd_device *osd = osd_obj2dev(obj); zap_attribute_t *za = &osd_oti_get(env)->oti_za; int rc; ENTRY; - udmu_zap_cursor_fini(it->ozi_zc); - if 
(udmu_zap_cursor_init(&it->ozi_zc, &osd->od_objset, - obj->oo_db->db_object, hash)) - RETURN(-ENOMEM); + /* reset the cursor */ + zap_cursor_fini(it->ozi_zc); + osd_obj_cursor_init_serialized(it->ozi_zc, obj, hash); if (hash <= 2) { it->ozi_pos = hash; @@ -1096,7 +1122,7 @@ static int osd_index_lookup(const struct lu_env *env, struct dt_object *dt, rc = osd_prepare_key_uint64(obj, k, key); - rc = -zap_lookup_uint64(osd->od_objset.os, obj->oo_db->db_object, + rc = -zap_lookup_uint64(osd->od_os, obj->oo_db->db_object, k, rc, obj->oo_recusize, obj->oo_recsize, (void *)rec); RETURN(rc == 0 ? 1 : rc); @@ -1149,7 +1175,7 @@ static int osd_index_insert(const struct lu_env *env, struct dt_object *dt, rc = osd_prepare_key_uint64(obj, k, key); /* Insert (key,oid) into ZAP */ - rc = -zap_add_uint64(osd->od_objset.os, obj->oo_db->db_object, + rc = -zap_add_uint64(osd->od_os, obj->oo_db->db_object, k, rc, obj->oo_recusize, obj->oo_recsize, (void *)rec, oh->ot_tx); RETURN(rc); @@ -1193,7 +1219,7 @@ static int osd_index_delete(const struct lu_env *env, struct dt_object *dt, rc = osd_prepare_key_uint64(obj, k, key); /* Remove binary key from the ZAP */ - rc = -zap_remove_uint64(osd->od_objset.os, obj->oo_db->db_object, + rc = -zap_remove_uint64(osd->od_os, obj->oo_db->db_object, k, rc, oh->ot_tx); RETURN(rc); } @@ -1217,8 +1243,7 @@ static int osd_index_it_get(const struct lu_env *env, struct dt_it *di, *((__u64 *)key)); zap_cursor_fini(it->ozi_zc); - memset(it->ozi_zc, 0, sizeof(*it->ozi_zc)); - zap_cursor_init(it->ozi_zc, osd->od_objset.os, obj->oo_db->db_object); + zap_cursor_init(it->ozi_zc, osd->od_os, obj->oo_db->db_object); it->ozi_reset = 1; RETURN(+1); @@ -1294,7 +1319,7 @@ static int osd_index_it_rec(const struct lu_env *env, const struct dt_it *di, rc = osd_prepare_key_uint64(obj, k, (const struct dt_key *)za->za_name); - rc = -zap_lookup_uint64(osd->od_objset.os, obj->oo_db->db_object, + rc = -zap_lookup_uint64(osd->od_os, obj->oo_db->db_object, k, rc, obj->oo_recusize, obj->oo_recsize, (void *)rec); RETURN(rc); @@ -1319,12 +1344,9 @@ static int osd_index_it_load(const struct lu_env *env, const struct dt_it *di, int rc; ENTRY; - /* close the current cursor */ + /* reset the cursor */ zap_cursor_fini(it->ozi_zc); - - /* create a new one starting at hash */ - memset(it->ozi_zc, 0, sizeof(*it->ozi_zc)); - zap_cursor_init_serialized(it->ozi_zc, osd->od_objset.os, + zap_cursor_init_serialized(it->ozi_zc, osd->od_os, obj->oo_db->db_object, hash); it->ozi_reset = 0; @@ -1382,7 +1404,7 @@ static struct dt_it *osd_zfs_otable_it_init(const struct lu_env *env, /* XXX: dmu_object_next() does NOT find dnodes allocated * in the current non-committed txg, so we force txg * commit to find all existing dnodes ... 
*/ - txg_wait_synced(dmu_objset_pool(dev->od_objset.os), 0ULL); + txg_wait_synced(dmu_objset_pool(dev->od_os), 0ULL); RETURN((struct dt_it *)it); } @@ -1410,7 +1432,6 @@ static void osd_zfs_otable_prefetch(const struct lu_env *env, struct osd_metadnode_it *it) { struct osd_device *dev = it->mit_dev; - udmu_objset_t *uos = &dev->od_objset; int rc; /* can go negative on the very first access to the iterator @@ -1425,7 +1446,7 @@ static void osd_zfs_otable_prefetch(const struct lu_env *env, it->mit_prefetched_dnode = it->mit_pos; while (it->mit_prefetched < OTABLE_PREFETCH) { - rc = -dmu_object_next(uos->os, &it->mit_prefetched_dnode, + rc = -dmu_object_next(dev->od_os, &it->mit_prefetched_dnode, B_FALSE, 0); if (unlikely(rc != 0)) break; @@ -1433,7 +1454,7 @@ static void osd_zfs_otable_prefetch(const struct lu_env *env, /* dmu_prefetch() was exported in 0.6.2, if you use with * an older release, just comment it out - this is an * optimization */ - dmu_prefetch(uos->os, it->mit_prefetched_dnode, 0, 0); + dmu_prefetch(dev->od_os, it->mit_prefetched_dnode, 0, 0); it->mit_prefetched++; } @@ -1444,7 +1465,6 @@ static int osd_zfs_otable_it_next(const struct lu_env *env, struct dt_it *di) struct osd_metadnode_it *it = (struct osd_metadnode_it *)di; struct lustre_mdt_attrs *lma; struct osd_device *dev = it->mit_dev; - udmu_objset_t *uos = &dev->od_objset; nvlist_t *nvbuf = NULL; uchar_t *v; __u64 dnode; @@ -1454,14 +1474,14 @@ static int osd_zfs_otable_it_next(const struct lu_env *env, struct dt_it *di) dnode = it->mit_pos; do { - rc = -dmu_object_next(uos->os, &it->mit_pos, B_FALSE, 0); + rc = -dmu_object_next(dev->od_os, &it->mit_pos, B_FALSE, 0); if (unlikely(rc != 0)) GOTO(out, rc = 1); it->mit_prefetched--; /* LMA is required for this to be a Lustre object. * If there is no xattr skip it. */ - rc = __osd_xattr_load(uos, it->mit_pos, &nvbuf); + rc = __osd_xattr_load(dev, it->mit_pos, &nvbuf); if (unlikely(rc != 0)) continue; @@ -1589,14 +1609,14 @@ int osd_index_try(const struct lu_env *env, struct dt_object *dt, LASSERT(obj->oo_db != NULL); if (likely(feat == &dt_directory_features)) { - if (udmu_object_is_zap(obj->oo_db)) + if (osd_object_is_zap(obj->oo_db)) dt->do_index_ops = &osd_dir_ops; else RETURN(-ENOTDIR); } else if (unlikely(feat == &dt_acct_features)) { LASSERT(fid_is_acct(lu_object_fid(&dt->do_lu))); dt->do_index_ops = &osd_acct_index_ops; - } else if (udmu_object_is_zap(obj->oo_db) && + } else if (osd_object_is_zap(obj->oo_db) && dt->do_index_ops == NULL) { /* For index file, we don't support variable key & record sizes * and the key has to be unique */ diff --git a/lustre/osd-zfs/osd_internal.h b/lustre/osd-zfs/osd_internal.h index dd3822f..3807fe1 100644 --- a/lustre/osd-zfs/osd_internal.h +++ b/lustre/osd-zfs/osd_internal.h @@ -59,11 +59,9 @@ #endif #include - #include - #include -#include "udmu.h" +#include #define LUSTRE_ROOT_FID_SEQ 0 #define DMU_OSD_SVNAME "svname" @@ -71,6 +69,17 @@ #define OSD_GFP_IO (GFP_NOFS | __GFP_HIGHMEM) +/* Statfs space reservation for grant, fragmentation, and unlink space. */ +#define OSD_STATFS_RESERVED_BLKS (1ULL << (22 - SPA_MAXBLOCKSHIFT)) /* 4MB */ +#define OSD_STATFS_RESERVED_SHIFT (7) /* reserve 0.78% of all space */ + +/* Statfs {minimum, safe estimate, and maximum} dnodes per block */ +#define OSD_DNODE_MIN_BLKSHIFT (SPA_MAXBLOCKSHIFT - DNODE_SHIFT) /* 17-9 =8 */ +#define OSD_DNODE_EST_BLKSHIFT (SPA_MAXBLOCKSHIFT - 12) /* 17-12=5 */ +#define OSD_DNODE_EST_COUNT 1024 + +#define OSD_GRANT_FOR_LOCAL_OIDS (2ULL << 20) /* 2MB for last_rcvd, ... 
*/ + /** * Iterator's in-memory data structure for quota file. */ @@ -233,7 +242,11 @@ struct osd_device { /* super-class */ struct dt_device od_dt_dev; /* information about underlying file system */ - udmu_objset_t od_objset; + struct objset *od_os; + uint64_t od_rootid; /* id of root znode */ + /* SA attr mapping->id, + * name is the same as in ZFS to use defines SA_ZPL_...*/ + sa_attr_type_t *z_attr_table; /* * Fid Capability @@ -323,6 +336,8 @@ int osd_declare_quota(const struct lu_env *env, struct osd_device *osd, qid_t uid, qid_t gid, long long space, struct osd_thandle *oh, bool is_blk, int *flags, bool force); +uint64_t osd_objs_count_estimate(uint64_t refdbytes, uint64_t usedobjs, + uint64_t nrblocks); /* * Helpers. @@ -367,7 +382,7 @@ static inline struct lu_device *osd2lu_dev(struct osd_device *osd) static inline struct objset * osd_dtobj2objset(struct dt_object *o) { - return osd_dev(o->do_lu.lo_dev)->od_objset.os; + return osd_dev(o->do_lu.lo_dev)->od_os; } static inline int osd_invariant(const struct osd_object *obj) @@ -411,28 +426,22 @@ extern struct lprocfs_seq_vars lprocfs_osd_obd_vars[]; int osd_procfs_init(struct osd_device *osd, const char *name); int osd_procfs_fini(struct osd_device *osd); -int udmu_zap_cursor_retrieve_key(const struct lu_env *env, - zap_cursor_t *zc, char *key, int max); -int udmu_zap_cursor_retrieve_value(const struct lu_env *env, - zap_cursor_t *zc, char *buf, - int buf_size, int *bytes_read); - /* osd_object.c */ +extern char *osd_obj_tag; void osd_object_sa_dirty_rele(struct osd_thandle *oh); int __osd_obj2dbuf(const struct lu_env *env, objset_t *os, - uint64_t oid, dmu_buf_t **dbp, void *tag); + uint64_t oid, dmu_buf_t **dbp); struct lu_object *osd_object_alloc(const struct lu_env *env, const struct lu_object_header *hdr, struct lu_device *d); int osd_object_sa_update(struct osd_object *obj, sa_attr_type_t type, void *buf, uint32_t buflen, struct osd_thandle *oh); -int __osd_zap_create(const struct lu_env *env, udmu_objset_t *uos, +int __osd_zap_create(const struct lu_env *env, struct osd_device *osd, dmu_buf_t **zap_dbp, dmu_tx_t *tx, struct lu_attr *la, - uint64_t parent, void *tag, zap_flags_t flags); -int __osd_object_create(const struct lu_env *env, udmu_objset_t *uos, + uint64_t parent, zap_flags_t flags); +int __osd_object_create(const struct lu_env *env, struct osd_device *osd, dmu_buf_t **dbp, dmu_tx_t *tx, struct lu_attr *la, - uint64_t parent, void *tag); -int __osd_object_free(udmu_objset_t *uos, uint64_t oid, dmu_tx_t *tx); + uint64_t parent); /* osd_oi.c */ int osd_oi_init(const struct lu_env *env, struct osd_device *o); @@ -451,10 +460,17 @@ int osd_index_try(const struct lu_env *env, struct dt_object *dt, const struct dt_index_features *feat); int osd_fld_lookup(const struct lu_env *env, struct osd_device *osd, obd_seq seq, struct lu_seq_range *range); +void osd_zap_cursor_init_serialized(zap_cursor_t *zc, struct objset *os, + uint64_t id, uint64_t dirhash); +int osd_zap_cursor_init(zap_cursor_t **zc, struct objset *os, + uint64_t id, uint64_t dirhash); +void osd_zap_cursor_fini(zap_cursor_t *zc); +uint64_t osd_zap_cursor_serialize(zap_cursor_t *zc); /* osd_xattr.c */ -int __osd_xattr_load(udmu_objset_t *uos, uint64_t dnode, nvlist_t **sa_xattr); -int __osd_xattr_get_large(const struct lu_env *env, udmu_objset_t *uos, +int __osd_xattr_load(struct osd_device *osd, uint64_t dnode, + nvlist_t **sa_xattr); +int __osd_xattr_get_large(const struct lu_env *env, struct osd_device *osd, uint64_t xattr, struct lu_buf *buf, const char 
*name, int *sizep); int osd_xattr_get(const struct lu_env *env, struct dt_object *dt, diff --git a/lustre/osd-zfs/osd_io.c b/lustre/osd-zfs/osd_io.c index e0b0748..284e886 100644 --- a/lustre/osd-zfs/osd_io.c +++ b/lustre/osd-zfs/osd_io.c @@ -94,7 +94,7 @@ static ssize_t osd_read(const struct lu_env *env, struct dt_object *dt, size = old_size - *pos; } - rc = -dmu_read(osd->od_objset.os, obj->oo_db->db_object, *pos, size, + rc = -dmu_read(osd->od_os, obj->oo_db->db_object, *pos, size, buf->lb_buf, DMU_READ_PREFETCH); if (rc == 0) { rc = size; @@ -161,7 +161,6 @@ static ssize_t osd_write(const struct lu_env *env, struct dt_object *dt, { struct osd_object *obj = osd_dt_obj(dt); struct osd_device *osd = osd_obj2dev(obj); - udmu_objset_t *uos = &osd->od_objset; struct osd_thandle *oh; uint64_t offset = *pos; int rc; @@ -173,7 +172,7 @@ static ssize_t osd_write(const struct lu_env *env, struct dt_object *dt, LASSERT(th != NULL); oh = container_of0(th, struct osd_thandle, ot_super); - dmu_write(osd->od_objset.os, obj->oo_db->db_object, offset, + dmu_write(osd->od_os, obj->oo_db->db_object, offset, (uint64_t)buf->lb_len, buf->lb_buf, oh->ot_tx); write_lock(&obj->oo_attr_lock); if (obj->oo_attr.la_size < offset + buf->lb_len) { @@ -182,7 +181,7 @@ static ssize_t osd_write(const struct lu_env *env, struct dt_object *dt, /* osd_object_sa_update() will be copying directly from oo_attr * into dbuf. any update within a single txg will copy the * most actual */ - rc = osd_object_sa_update(obj, SA_ZPL_SIZE(uos), + rc = osd_object_sa_update(obj, SA_ZPL_SIZE(osd), &obj->oo_attr.la_size, 8, oh); if (unlikely(rc)) GOTO(out, rc); @@ -609,7 +608,7 @@ static int osd_declare_write_commit(const struct lu_env *env, /* backend zfs filesystem might be configured to store multiple data * copies */ - space *= osd->od_objset.os->os_copies; + space *= osd->od_os->os_copies; space = toqb(space); CDEBUG(D_QUOTA, "writting %d pages, reserving "LPD64"K of quota " "space\n", npages, space); @@ -645,7 +644,6 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt, { struct osd_object *obj = osd_dt_obj(dt); struct osd_device *osd = osd_obj2dev(obj); - udmu_objset_t *uos = &osd->od_objset; struct osd_thandle *oh; uint64_t new_size = 0; int i, rc = 0; @@ -673,7 +671,7 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt, } if (lnb[i].lnb_page->mapping == (void *)obj) { - dmu_write(osd->od_objset.os, obj->oo_db->db_object, + dmu_write(osd->od_os, obj->oo_db->db_object, lnb[i].lnb_file_offset, lnb[i].lnb_len, kmap(lnb[i].lnb_page), oh->ot_tx); kunmap(lnb[i].lnb_page); @@ -710,8 +708,8 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt, /* osd_object_sa_update() will be copying directly from * oo_attr into dbuf. 
any update within a single txg will copy * the most actual */ - rc = osd_object_sa_update(obj, SA_ZPL_SIZE(uos), - &obj->oo_attr.la_size, 8, oh); + rc = osd_object_sa_update(obj, SA_ZPL_SIZE(osd), + &obj->oo_attr.la_size, 8, oh); } else { write_unlock(&obj->oo_attr_lock); } @@ -792,7 +790,6 @@ static int osd_punch(const struct lu_env *env, struct dt_object *dt, { struct osd_object *obj = osd_dt_obj(dt); struct osd_device *osd = osd_obj2dev(obj); - udmu_objset_t *uos = &osd->od_objset; struct osd_thandle *oh; __u64 len; int rc = 0; @@ -812,15 +809,15 @@ static int osd_punch(const struct lu_env *env, struct dt_object *dt, len = end - start; write_unlock(&obj->oo_attr_lock); - rc = __osd_object_punch(osd->od_objset.os, obj->oo_db, oh->ot_tx, + rc = __osd_object_punch(osd->od_os, obj->oo_db, oh->ot_tx, obj->oo_attr.la_size, start, len); /* set new size */ if (len == DMU_OBJECT_END) { write_lock(&obj->oo_attr_lock); obj->oo_attr.la_size = start; write_unlock(&obj->oo_attr_lock); - rc = osd_object_sa_update(obj, SA_ZPL_SIZE(uos), - &obj->oo_attr.la_size, 8, oh); + rc = osd_object_sa_update(obj, SA_ZPL_SIZE(osd), + &obj->oo_attr.la_size, 8, oh); } RETURN(rc); } diff --git a/lustre/osd-zfs/osd_object.c b/lustre/osd-zfs/osd_object.c index e4a2649..6bf8185 100644 --- a/lustre/osd-zfs/osd_object.c +++ b/lustre/osd-zfs/osd_object.c @@ -68,7 +68,7 @@ #include #include -static char *osd_obj_tag = "osd_object"; +char *osd_obj_tag = "osd_object"; static struct dt_object_operations osd_obj_ops; static struct lu_object_operations osd_lu_obj_ops; @@ -87,20 +87,20 @@ osd_object_sa_fini(struct osd_object *obj) } static int -osd_object_sa_init(struct osd_object *obj, udmu_objset_t *uos) +osd_object_sa_init(struct osd_object *obj, struct osd_device *o) { int rc; LASSERT(obj->oo_sa_hdl == NULL); LASSERT(obj->oo_db != NULL); - rc = -sa_handle_get(uos->os, obj->oo_db->db_object, obj, + rc = -sa_handle_get(o->od_os, obj->oo_db->db_object, obj, SA_HDL_PRIVATE, &obj->oo_sa_hdl); if (rc) return rc; /* Cache the xattr object id, valid for the life of the object */ - rc = -sa_lookup(obj->oo_sa_hdl, SA_ZPL_XATTR(uos), &obj->oo_xattr, 8); + rc = -sa_lookup(obj->oo_sa_hdl, SA_ZPL_XATTR(o), &obj->oo_xattr, 8); if (rc == -ENOENT) { obj->oo_xattr = ZFS_NO_OBJECT; rc = 0; @@ -185,7 +185,7 @@ osd_object_sa_bulk_update(struct osd_object *obj, sa_bulk_attr_t *attrs, /* * Retrieve the attributes of a DMU object */ -int __osd_object_attr_get(const struct lu_env *env, udmu_objset_t *uos, +int __osd_object_attr_get(const struct lu_env *env, struct osd_device *o, struct osd_object *obj, struct lu_attr *la) { struct osa_attr *osa = &osd_oti_get(env)->oti_osa; @@ -197,7 +197,7 @@ int __osd_object_attr_get(const struct lu_env *env, udmu_objset_t *uos, LASSERT(obj->oo_db != NULL); - rc = -sa_handle_get(uos->os, obj->oo_db->db_object, NULL, + rc = -sa_handle_get(o->od_os, obj->oo_db->db_object, NULL, SA_HDL_PRIVATE, &sa_hdl); if (rc) RETURN(rc); @@ -209,15 +209,15 @@ int __osd_object_attr_get(const struct lu_env *env, udmu_objset_t *uos, la->la_valid |= LA_ATIME | LA_MTIME | LA_CTIME | LA_MODE | LA_TYPE | LA_SIZE | LA_UID | LA_GID | LA_FLAGS | LA_NLINK; - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(uos), NULL, osa->atime, 16); - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(uos), NULL, osa->mtime, 16); - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(uos), NULL, osa->ctime, 16); - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(uos), NULL, &osa->mode, 8); - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_SIZE(uos), NULL, &osa->size, 8); - SA_ADD_BULK_ATTR(bulk, cnt, 
SA_ZPL_LINKS(uos), NULL, &osa->nlink, 8); - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_UID(uos), NULL, &osa->uid, 8); - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_GID(uos), NULL, &osa->gid, 8); - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(uos), NULL, &osa->flags, 8); + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(o), NULL, osa->atime, 16); + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(o), NULL, osa->mtime, 16); + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(o), NULL, osa->ctime, 16); + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(o), NULL, &osa->mode, 8); + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_SIZE(o), NULL, &osa->size, 8); + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_LINKS(o), NULL, &osa->nlink, 8); + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_UID(o), NULL, &osa->uid, 8); + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_GID(o), NULL, &osa->gid, 8); + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(o), NULL, &osa->flags, 8); rc = -sa_bulk_lookup(sa_hdl, bulk, cnt); if (rc) @@ -234,7 +234,7 @@ int __osd_object_attr_get(const struct lu_env *env, udmu_objset_t *uos, la->la_size = osa->size; if (S_ISCHR(la->la_mode) || S_ISBLK(la->la_mode)) { - rc = -sa_lookup(sa_hdl, SA_ZPL_RDEV(uos), &osa->rdev, 8); + rc = -sa_lookup(sa_hdl, SA_ZPL_RDEV(o), &osa->rdev, 8); if (rc) GOTO(out_bulk, rc); la->la_rdev = osa->rdev; @@ -249,21 +249,19 @@ out_sa: } int __osd_obj2dbuf(const struct lu_env *env, objset_t *os, - uint64_t oid, dmu_buf_t **dbp, void *tag) + uint64_t oid, dmu_buf_t **dbp) { dmu_object_info_t *doi = &osd_oti_get(env)->oti_doi; int rc; - LASSERT(tag); - - rc = -sa_buf_hold(os, oid, tag, dbp); + rc = -sa_buf_hold(os, oid, osd_obj_tag, dbp); if (rc) return rc; dmu_object_info_from_db(*dbp, doi); if (unlikely (oid != DMU_USERUSED_OBJECT && oid != DMU_GROUPUSED_OBJECT && doi->doi_bonus_type != DMU_OT_SA)) { - sa_buf_rele(*dbp, tag); + sa_buf_rele(*dbp, osd_obj_tag); *dbp = NULL; return -EINVAL; } @@ -310,7 +308,6 @@ struct lu_object *osd_object_alloc(const struct lu_env *env, int osd_object_init0(const struct lu_env *env, struct osd_object *obj) { struct osd_device *osd = osd_obj2dev(obj); - udmu_objset_t *uos = &osd->od_objset; const struct lu_fid *fid = lu_object_fid(&obj->oo_dt.do_lu); int rc = 0; ENTRY; @@ -320,13 +317,12 @@ int osd_object_init0(const struct lu_env *env, struct osd_object *obj) /* object exist */ - rc = osd_object_sa_init(obj, uos); + rc = osd_object_sa_init(obj, osd); if (rc) RETURN(rc); /* cache attrs in object */ - rc = __osd_object_attr_get(env, &osd->od_objset, - obj, &obj->oo_attr); + rc = __osd_object_attr_get(env, osd, obj, &obj->oo_attr); if (rc) RETURN(rc); @@ -402,8 +398,7 @@ static int osd_object_init(const struct lu_env *env, struct lu_object *l, rc = osd_fid_lookup(env, osd, lu_object_fid(l), &oid); if (rc == 0) { LASSERT(obj->oo_db == NULL); - rc = __osd_obj2dbuf(env, osd->od_objset.os, oid, - &obj->oo_db, osd_obj_tag); + rc = __osd_obj2dbuf(env, osd->od_os, oid, &obj->oo_db); if (rc != 0) { CERROR("%s: lookup "DFID"/"LPX64" failed: rc = %d\n", osd->od_svname, PFID(lu_object_fid(l)), oid, rc); @@ -444,7 +439,6 @@ static void __osd_declare_object_destroy(const struct lu_env *env, struct osd_thandle *oh) { struct osd_device *osd = osd_obj2dev(obj); - udmu_objset_t *uos = &osd->od_objset; dmu_buf_t *db = obj->oo_db; zap_attribute_t *za = &osd_oti_get(env)->oti_za; uint64_t oid = db->db_object, xid; @@ -460,7 +454,7 @@ static void __osd_declare_object_destroy(const struct lu_env *env, dmu_tx_hold_free(tx, oid, 0, DMU_OBJECT_END); - rc = -udmu_zap_cursor_init(&zc, uos, oid, 0); + rc = osd_zap_cursor_init(&zc, osd->od_os, oid, 0); if 
(rc) goto out; @@ -468,7 +462,7 @@ static void __osd_declare_object_destroy(const struct lu_env *env, BUG_ON(za->za_integer_length != sizeof(uint64_t)); BUG_ON(za->za_num_integers != 1); - rc = -zap_lookup(uos->os, obj->oo_xattr, za->za_name, + rc = -zap_lookup(osd->od_os, obj->oo_xattr, za->za_name, sizeof(uint64_t), 1, &xid); if (rc) { CERROR("%s: xattr lookup failed: rc = %d\n", @@ -482,7 +476,7 @@ static void __osd_declare_object_destroy(const struct lu_env *env, if (rc == -ENOENT) rc = 0; out_err: - udmu_zap_cursor_fini(zc); + osd_zap_cursor_fini(zc); } out: if (rc && tx->tx_err == 0) @@ -534,16 +528,6 @@ static int osd_declare_object_destroy(const struct lu_env *env, RETURN(rc); } -int __osd_object_free(udmu_objset_t *uos, uint64_t oid, dmu_tx_t *tx) -{ - LASSERT(uos->objects != 0); - spin_lock(&uos->lock); - uos->objects--; - spin_unlock(&uos->lock); - - return -dmu_object_free(uos->os, oid, tx); -} - /* * Delete a DMU object * @@ -558,7 +542,6 @@ static int __osd_object_destroy(const struct lu_env *env, dmu_tx_t *tx, void *tag) { struct osd_device *osd = osd_obj2dev(obj); - udmu_objset_t *uos = &osd->od_objset; uint64_t xid; zap_attribute_t *za = &osd_oti_get(env)->oti_za; zap_cursor_t *zc; @@ -570,35 +553,35 @@ static int __osd_object_destroy(const struct lu_env *env, /* zap holding xattrs */ if (obj->oo_xattr != ZFS_NO_OBJECT) { - rc = -udmu_zap_cursor_init(&zc, uos, obj->oo_xattr, 0); + rc = osd_zap_cursor_init(&zc, osd->od_os, obj->oo_xattr, 0); if (rc) return rc; while ((rc = -zap_cursor_retrieve(zc, za)) == 0) { BUG_ON(za->za_integer_length != sizeof(uint64_t)); BUG_ON(za->za_num_integers != 1); - rc = -zap_lookup(uos->os, obj->oo_xattr, za->za_name, + rc = -zap_lookup(osd->od_os, obj->oo_xattr, za->za_name, sizeof(uint64_t), 1, &xid); if (rc) { CERROR("%s: lookup xattr %s failed: rc = %d\n", osd->od_svname, za->za_name, rc); continue; } - rc = __osd_object_free(uos, xid, tx); + rc = -dmu_object_free(osd->od_os, xid, tx); if (rc) CERROR("%s: fetch xattr %s failed: rc = %d\n", osd->od_svname, za->za_name, rc); zap_cursor_advance(zc); } - udmu_zap_cursor_fini(zc); + osd_zap_cursor_fini(zc); - rc = __osd_object_free(uos, obj->oo_xattr, tx); + rc = -dmu_object_free(osd->od_os, obj->oo_xattr, tx); if (rc) CERROR("%s: freeing xattr failed: rc = %d\n", osd->od_svname, rc); } - return __osd_object_free(uos, obj->oo_db->db_object, tx); + return -dmu_object_free(osd->od_os, obj->oo_db->db_object, tx); } static int osd_object_destroy(const struct lu_env *env, @@ -625,7 +608,7 @@ static int osd_object_destroy(const struct lu_env *env, zapid = osd_get_name_n_idx(env, osd, fid, buf); /* remove obj ref from index dir (it depends) */ - rc = -zap_remove(osd->od_objset.os, zapid, buf, oh->ot_tx); + rc = -zap_remove(osd->od_os, zapid, buf, oh->ot_tx); if (rc) { CERROR("%s: zap_remove() failed: rc = %d\n", osd->od_svname, rc); @@ -635,13 +618,13 @@ static int osd_object_destroy(const struct lu_env *env, /* Remove object from inode accounting. 
It is not fatal for the destroy * operation if something goes wrong while updating accounting, but we * still log an error message to notify the administrator */ - rc = -zap_increment_int(osd->od_objset.os, osd->od_iusr_oid, + rc = -zap_increment_int(osd->od_os, osd->od_iusr_oid, obj->oo_attr.la_uid, -1, oh->ot_tx); if (rc) CERROR("%s: failed to remove "DFID" from accounting ZAP for usr" " %d: rc = %d\n", osd->od_svname, PFID(fid), obj->oo_attr.la_uid, rc); - rc = -zap_increment_int(osd->od_objset.os, osd->od_igrp_oid, + rc = -zap_increment_int(osd->od_os, osd->od_igrp_oid, obj->oo_attr.la_gid, -1, oh->ot_tx); if (rc) CERROR("%s: failed to remove "DFID" from accounting ZAP for grp" @@ -929,7 +912,6 @@ static int osd_attr_set(const struct lu_env *env, struct dt_object *dt, { struct osd_object *obj = osd_dt_obj(dt); struct osd_device *osd = osd_obj2dev(obj); - udmu_objset_t *uos = &osd->od_objset; struct osd_thandle *oh; struct osa_attr *osa = &osd_oti_get(env)->oti_osa; sa_bulk_attr_t *bulk; @@ -963,12 +945,12 @@ static int osd_attr_set(const struct lu_env *env, struct dt_object *dt, if ((valid & LA_UID) && (la->la_uid != obj->oo_attr.la_uid)) { /* Update user accounting. Failure isn't fatal, but we still * log an error message */ - rc = -zap_increment_int(osd->od_objset.os, osd->od_iusr_oid, + rc = -zap_increment_int(osd->od_os, osd->od_iusr_oid, la->la_uid, 1, oh->ot_tx); if (rc) CERROR("%s: failed to update accounting ZAP for user " "%d (%d)\n", osd->od_svname, la->la_uid, rc); - rc = -zap_increment_int(osd->od_objset.os, osd->od_iusr_oid, + rc = -zap_increment_int(osd->od_os, osd->od_iusr_oid, obj->oo_attr.la_uid, -1, oh->ot_tx); if (rc) CERROR("%s: failed to update accounting ZAP for user " @@ -978,12 +960,12 @@ static int osd_attr_set(const struct lu_env *env, struct dt_object *dt, if ((valid & LA_GID) && (la->la_gid != obj->oo_attr.la_gid)) { /* Update group accounting. 
Failure isn't fatal, but we still * log an error message */ - rc = -zap_increment_int(osd->od_objset.os, osd->od_igrp_oid, + rc = -zap_increment_int(osd->od_os, osd->od_igrp_oid, la->la_gid, 1, oh->ot_tx); if (rc) CERROR("%s: failed to update accounting ZAP for user " "%d (%d)\n", osd->od_svname, la->la_gid, rc); - rc = -zap_increment_int(osd->od_objset.os, osd->od_igrp_oid, + rc = -zap_increment_int(osd->od_os, osd->od_igrp_oid, obj->oo_attr.la_gid, -1, oh->ot_tx); if (rc) CERROR("%s: failed to update accounting ZAP for user " @@ -995,17 +977,17 @@ static int osd_attr_set(const struct lu_env *env, struct dt_object *dt, cnt = 0; if (valid & LA_ATIME) { osa->atime[0] = obj->oo_attr.la_atime = la->la_atime; - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(uos), NULL, + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(osd), NULL, osa->atime, 16); } if (valid & LA_MTIME) { osa->mtime[0] = obj->oo_attr.la_mtime = la->la_mtime; - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(uos), NULL, + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(osd), NULL, osa->mtime, 16); } if (valid & LA_CTIME) { osa->ctime[0] = obj->oo_attr.la_ctime = la->la_ctime; - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(uos), NULL, + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(osd), NULL, osa->ctime, 16); } if (valid & LA_MODE) { @@ -1013,22 +995,22 @@ static int osd_attr_set(const struct lu_env *env, struct dt_object *dt, obj->oo_attr.la_mode = (obj->oo_attr.la_mode & S_IFMT) | (la->la_mode & ~S_IFMT); osa->mode = obj->oo_attr.la_mode; - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(uos), NULL, + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(osd), NULL, &osa->mode, 8); } if (valid & LA_SIZE) { osa->size = obj->oo_attr.la_size = la->la_size; - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_SIZE(uos), NULL, + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_SIZE(osd), NULL, &osa->size, 8); } if (valid & LA_NLINK) { osa->nlink = obj->oo_attr.la_nlink = la->la_nlink; - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_LINKS(uos), NULL, + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_LINKS(osd), NULL, &osa->nlink, 8); } if (valid & LA_RDEV) { osa->rdev = obj->oo_attr.la_rdev = la->la_rdev; - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_RDEV(uos), NULL, + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_RDEV(osd), NULL, &osa->rdev, 8); } if (valid & LA_FLAGS) { @@ -1036,17 +1018,17 @@ static int osd_attr_set(const struct lu_env *env, struct dt_object *dt, /* many flags are not supported by zfs, so ensure a good cached * copy */ obj->oo_attr.la_flags = attrs_zfs2fs(osa->flags); - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(uos), NULL, + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(osd), NULL, &osa->flags, 8); } if (valid & LA_UID) { osa->uid = obj->oo_attr.la_uid = la->la_uid; - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_UID(uos), NULL, + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_UID(osd), NULL, &osa->uid, 8); } if (valid & LA_GID) { osa->gid = obj->oo_attr.la_gid = la->la_gid; - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_GID(uos), NULL, + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_GID(osd), NULL, &osa->gid, 8); } obj->oo_attr.la_valid |= valid; @@ -1147,8 +1129,9 @@ static int osd_declare_object_create(const struct lu_env *env, RETURN(rc); } -int __osd_attr_init(const struct lu_env *env, udmu_objset_t *uos, uint64_t oid, - dmu_tx_t *tx, struct lu_attr *la, uint64_t parent) +int __osd_attr_init(const struct lu_env *env, struct osd_device *osd, + uint64_t oid, dmu_tx_t *tx, struct lu_attr *la, + uint64_t parent) { sa_bulk_attr_t *bulk; sa_handle_t *sa_hdl; @@ -1176,7 +1159,7 @@ int __osd_attr_init(const struct lu_env *env, udmu_objset_t *uos, uint64_t oid, osa->size = la->la_size; /* Now 
add in all of the "SA" attributes */ - rc = -sa_handle_get(uos->os, oid, NULL, SA_HDL_PRIVATE, &sa_hdl); + rc = -sa_handle_get(osd->od_os, oid, NULL, SA_HDL_PRIVATE, &sa_hdl); if (rc) return rc; @@ -1196,19 +1179,19 @@ int __osd_attr_init(const struct lu_env *env, udmu_objset_t *uos, uint64_t oid, * work around the problem. See ORI-610. */ cnt = 0; - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(uos), NULL, &osa->mode, 8); - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_SIZE(uos), NULL, &osa->size, 8); - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_GEN(uos), NULL, &gen, 8); - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_UID(uos), NULL, &osa->uid, 8); - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_GID(uos), NULL, &osa->gid, 8); - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_PARENT(uos), NULL, &parent, 8); - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(uos), NULL, &osa->flags, 8); - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(uos), NULL, osa->atime, 16); - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(uos), NULL, osa->mtime, 16); - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(uos), NULL, osa->ctime, 16); - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CRTIME(uos), NULL, crtime, 16); - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_LINKS(uos), NULL, &osa->nlink, 8); - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_RDEV(uos), NULL, &osa->rdev, 8); + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(osd), NULL, &osa->mode, 8); + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_SIZE(osd), NULL, &osa->size, 8); + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_GEN(osd), NULL, &gen, 8); + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_UID(osd), NULL, &osa->uid, 8); + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_GID(osd), NULL, &osa->gid, 8); + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_PARENT(osd), NULL, &parent, 8); + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(osd), NULL, &osa->flags, 8); + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(osd), NULL, osa->atime, 16); + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(osd), NULL, osa->mtime, 16); + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(osd), NULL, osa->ctime, 16); + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CRTIME(osd), NULL, crtime, 16); + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_LINKS(osd), NULL, &osa->nlink, 8); + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_RDEV(osd), NULL, &osa->rdev, 8); rc = -sa_replace_all_by_template(sa_hdl, bulk, cnt, tx); @@ -1223,39 +1206,35 @@ out: * dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT) called and then assigned * to a transaction group. */ -int __osd_object_create(const struct lu_env *env, udmu_objset_t *uos, +int __osd_object_create(const struct lu_env *env, struct osd_device *osd, dmu_buf_t **dbp, dmu_tx_t *tx, struct lu_attr *la, - uint64_t parent, void *tag) + uint64_t parent) { uint64_t oid; int rc; - LASSERT(tag); /* Assert that the transaction has been assigned to a transaction group. */ LASSERT(tx->tx_txg != 0); /* Create a new DMU object. 
*/ - oid = dmu_object_alloc(uos->os, DMU_OT_PLAIN_FILE_CONTENTS, 0, + oid = dmu_object_alloc(osd->od_os, DMU_OT_PLAIN_FILE_CONTENTS, 0, DMU_OT_SA, DN_MAX_BONUSLEN, tx); - rc = -sa_buf_hold(uos->os, oid, tag, dbp); + rc = -sa_buf_hold(osd->od_os, oid, osd_obj_tag, dbp); LASSERTF(rc == 0, "sa_buf_hold "LPU64" failed: %d\n", oid, rc); LASSERT(la->la_valid & LA_MODE); la->la_size = 0; la->la_nlink = 1; - rc = __osd_attr_init(env, uos, oid, tx, la, parent); + rc = __osd_attr_init(env, osd, oid, tx, la, parent); if (rc != 0) { - sa_buf_rele(*dbp, tag); + sa_buf_rele(*dbp, osd_obj_tag); *dbp = NULL; - dmu_object_free(uos->os, oid, tx); + dmu_object_free(osd->od_os, oid, tx); return rc; } - spin_lock(&uos->lock); - uos->objects++; - spin_unlock(&uos->lock); return 0; } @@ -1269,31 +1248,24 @@ int __osd_object_create(const struct lu_env *env, udmu_objset_t *uos, * will also require a FAT ZAP. If there is a new type of micro ZAP created * then we might need to re-evaluate the use of this flag and instead do * a conversion from the different internal ZAP hash formats being used. */ -int __osd_zap_create(const struct lu_env *env, udmu_objset_t *uos, +int __osd_zap_create(const struct lu_env *env, struct osd_device *osd, dmu_buf_t **zap_dbp, dmu_tx_t *tx, - struct lu_attr *la, uint64_t parent, - void *tag, zap_flags_t flags) + struct lu_attr *la, uint64_t parent, zap_flags_t flags) { uint64_t oid; int rc; - LASSERT(tag); - - spin_lock(&uos->lock); - uos->objects++; - spin_unlock(&uos->lock); - /* Assert that the transaction has been assigned to a transaction group. */ LASSERT(tx->tx_txg != 0); - oid = zap_create_flags(uos->os, 0, flags | ZAP_FLAG_HASH64, + oid = zap_create_flags(osd->od_os, 0, flags | ZAP_FLAG_HASH64, DMU_OT_DIRECTORY_CONTENTS, 14, /* == ZFS fzap_default_block_shift */ DN_MAX_INDBLKSHIFT, /* indirect block shift */ DMU_OT_SA, DN_MAX_BONUSLEN, tx); - rc = -sa_buf_hold(uos->os, oid, tag, zap_dbp); + rc = -sa_buf_hold(osd->od_os, oid, osd_obj_tag, zap_dbp); if (rc) return rc; @@ -1301,7 +1273,7 @@ int __osd_zap_create(const struct lu_env *env, udmu_objset_t *uos, la->la_size = 2; la->la_nlink = 1; - return __osd_attr_init(env, uos, oid, tx, la, parent); + return __osd_attr_init(env, osd, oid, tx, la, parent); } static dmu_buf_t *osd_mkidx(const struct lu_env *env, struct osd_device *osd, @@ -1316,8 +1288,8 @@ static dmu_buf_t *osd_mkidx(const struct lu_env *env, struct osd_device *osd, * We set ZAP_FLAG_UINT64_KEY to let ZFS know than we are going to use * binary keys */ LASSERT(S_ISREG(la->la_mode)); - rc = __osd_zap_create(env, &osd->od_objset, &db, oh->ot_tx, la, - parent, osd_obj_tag, ZAP_FLAG_UINT64_KEY); + rc = __osd_zap_create(env, osd, &db, oh->ot_tx, la, parent, + ZAP_FLAG_UINT64_KEY); if (rc) return ERR_PTR(rc); return db; @@ -1331,8 +1303,7 @@ static dmu_buf_t *osd_mkdir(const struct lu_env *env, struct osd_device *osd, int rc; LASSERT(S_ISDIR(la->la_mode)); - rc = __osd_zap_create(env, &osd->od_objset, &db, oh->ot_tx, la, - parent, osd_obj_tag, 0); + rc = __osd_zap_create(env, osd, &db, oh->ot_tx, la, parent, 0); if (rc) return ERR_PTR(rc); return db; @@ -1346,8 +1317,7 @@ static dmu_buf_t* osd_mkreg(const struct lu_env *env, struct osd_device *osd, int rc; LASSERT(S_ISREG(la->la_mode)); - rc = __osd_object_create(env, &osd->od_objset, &db, oh->ot_tx, la, - parent, osd_obj_tag); + rc = __osd_object_create(env, osd, &db, oh->ot_tx, la, parent); if (rc) return ERR_PTR(rc); @@ -1356,7 +1326,7 @@ static dmu_buf_t* osd_mkreg(const struct lu_env *env, struct osd_device *osd, * a 
method in OSD API to control this from OFD/MDD */ if (!lu_device_is_md(osd2lu_dev(osd))) { - rc = -dmu_object_set_blocksize(osd->od_objset.os, + rc = -dmu_object_set_blocksize(osd->od_os, db->db_object, 128 << 10, 0, oh->ot_tx); if (unlikely(rc)) { @@ -1377,8 +1347,7 @@ static dmu_buf_t *osd_mksym(const struct lu_env *env, struct osd_device *osd, int rc; LASSERT(S_ISLNK(la->la_mode)); - rc = __osd_object_create(env, &osd->od_objset, &db, oh->ot_tx, la, - parent, osd_obj_tag); + rc = __osd_object_create(env, osd, &db, oh->ot_tx, la, parent); if (rc) return ERR_PTR(rc); return db; @@ -1395,8 +1364,7 @@ static dmu_buf_t *osd_mknod(const struct lu_env *env, struct osd_device *osd, if (S_ISCHR(la->la_mode) || S_ISBLK(la->la_mode)) la->la_valid |= LA_RDEV; - rc = __osd_object_create(env, &osd->od_objset, &db, oh->ot_tx, la, - parent, osd_obj_tag); + rc = __osd_object_create(env, osd, &db, oh->ot_tx, la, parent); if (rc) return ERR_PTR(rc); return db; @@ -1512,19 +1480,19 @@ static int osd_object_create(const struct lu_env *env, struct dt_object *dt, zapid = osd_get_name_n_idx(env, osd, fid, buf); - rc = -zap_add(osd->od_objset.os, zapid, buf, 8, 1, zde, oh->ot_tx); + rc = -zap_add(osd->od_os, zapid, buf, 8, 1, zde, oh->ot_tx); if (rc) GOTO(out, rc); /* Add new object to inode accounting. * Errors are not considered as fatal */ - rc = -zap_increment_int(osd->od_objset.os, osd->od_iusr_oid, + rc = -zap_increment_int(osd->od_os, osd->od_iusr_oid, (attr->la_valid & LA_UID) ? attr->la_uid : 0, 1, oh->ot_tx); if (rc) CERROR("%s: failed to add "DFID" to accounting ZAP for usr %d " "(%d)\n", osd->od_svname, PFID(fid), attr->la_uid, rc); - rc = -zap_increment_int(osd->od_objset.os, osd->od_igrp_oid, + rc = -zap_increment_int(osd->od_os, osd->od_igrp_oid, (attr->la_valid & LA_GID) ? attr->la_gid : 0, 1, oh->ot_tx); if (rc) @@ -1567,7 +1535,6 @@ static int osd_object_ref_add(const struct lu_env *env, struct osd_object *obj = osd_dt_obj(dt); struct osd_thandle *oh; struct osd_device *osd = osd_obj2dev(obj); - udmu_objset_t *uos = &osd->od_objset; uint64_t nlink; int rc; @@ -1583,7 +1550,7 @@ static int osd_object_ref_add(const struct lu_env *env, nlink = ++obj->oo_attr.la_nlink; write_unlock(&obj->oo_attr_lock); - rc = osd_object_sa_update(obj, SA_ZPL_LINKS(uos), &nlink, 8, oh); + rc = osd_object_sa_update(obj, SA_ZPL_LINKS(osd), &nlink, 8, oh); return rc; } @@ -1604,7 +1571,6 @@ static int osd_object_ref_del(const struct lu_env *env, struct osd_object *obj = osd_dt_obj(dt); struct osd_thandle *oh; struct osd_device *osd = osd_obj2dev(obj); - udmu_objset_t *uos = &osd->od_objset; uint64_t nlink; int rc; @@ -1621,7 +1587,7 @@ static int osd_object_ref_del(const struct lu_env *env, nlink = --obj->oo_attr.la_nlink; write_unlock(&obj->oo_attr_lock); - rc = osd_object_sa_update(obj, SA_ZPL_LINKS(uos), &nlink, 8, oh); + rc = osd_object_sa_update(obj, SA_ZPL_LINKS(osd), &nlink, 8, oh); return rc; } @@ -1774,7 +1740,7 @@ static int osd_object_sync(const struct lu_env *env, struct dt_object *dt, * support ZIL. If the object tracked the txg that it was last * modified in, it could pass that txg here instead of "0". Maybe * the changes are already committed, so no wait is needed at all? 
*/ - txg_wait_synced(dmu_objset_pool(osd->od_objset.os), 0ULL); + txg_wait_synced(dmu_objset_pool(osd->od_os), 0ULL); RETURN(0); } diff --git a/lustre/osd-zfs/osd_oi.c b/lustre/osd-zfs/osd_oi.c index 1e3e990..371c442 100644 --- a/lustre/osd-zfs/osd_oi.c +++ b/lustre/osd-zfs/osd_oi.c @@ -69,8 +69,6 @@ #include #include -static char *oi_tag = "osd_mount, oi"; - #define OSD_OI_FID_NR (1UL << 7) #define OSD_OI_FID_NR_MAX (1UL << OSD_OI_FID_OID_BITS_MAX) unsigned int osd_oi_count = OSD_OI_FID_NR; @@ -123,7 +121,7 @@ osd_oi_lookup(const struct lu_env *env, struct osd_device *o, struct zpl_direntry *zde = &osd_oti_get(env)->oti_zde.lzd_reg; int rc; - rc = -zap_lookup(o->od_objset.os, parent, name, 8, 1, (void *)zde); + rc = -zap_lookup(o->od_os, parent, name, 8, 1, (void *)zde); if (rc) return rc; @@ -151,12 +149,12 @@ osd_oi_create(const struct lu_env *env, struct osd_device *o, int rc; /* verify it doesn't already exist */ - rc = -zap_lookup(o->od_objset.os, parent, name, 8, 1, (void *)zde); + rc = -zap_lookup(o->od_os, parent, name, 8, 1, (void *)zde); if (rc == 0) return -EEXIST; /* create fid-to-dnode index */ - tx = dmu_tx_create(o->od_objset.os); + tx = dmu_tx_create(o->od_os); if (tx == NULL) return -ENOMEM; @@ -175,18 +173,18 @@ osd_oi_create(const struct lu_env *env, struct osd_device *o, la->la_valid = LA_MODE | LA_UID | LA_GID; la->la_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; la->la_uid = la->la_gid = 0; - __osd_zap_create(env, &o->od_objset, &db, tx, la, parent, oi_tag, 0); + __osd_zap_create(env, o, &db, tx, la, parent, 0); zde->zde_dnode = db->db_object; zde->zde_pad = 0; zde->zde_type = IFTODT(S_IFDIR); - rc = -zap_add(o->od_objset.os, parent, name, 8, 1, (void *)zde, tx); + rc = -zap_add(o->od_os, parent, name, 8, 1, (void *)zde, tx); dmu_tx_commit(tx); *child = db->db_object; - sa_buf_rele(db, oi_tag); + sa_buf_rele(db, osd_obj_tag); return rc; } @@ -484,7 +482,7 @@ int osd_fid_lookup(const struct lu_env *env, struct osd_device *dev, } else { zapid = osd_get_name_n_idx(env, dev, fid, buf); - rc = -zap_lookup(dev->od_objset.os, zapid, buf, + rc = -zap_lookup(dev->od_os, zapid, buf, 8, 1, &info->oti_zde); if (rc) RETURN(rc); @@ -492,7 +490,7 @@ int osd_fid_lookup(const struct lu_env *env, struct osd_device *dev, } if (rc == 0) - dmu_prefetch(dev->od_objset.os, *oid, 0, 0); + dmu_prefetch(dev->od_os, *oid, 0, 0); RETURN(rc); } @@ -710,7 +708,7 @@ int osd_convert_root_to_new_seq(const struct lu_env *env, RETURN(0); /* lookup /ROOT */ - rc = -zap_lookup(o->od_objset.os, o->od_root, root2convert, 8, + rc = -zap_lookup(o->od_os, o->od_root, root2convert, 8, sizeof(*lze) / 8, (void *)lze); /* doesn't exist or let actual user to handle the error */ if (rc) @@ -723,7 +721,7 @@ int osd_convert_root_to_new_seq(const struct lu_env *env, if (fid_seq(&lze->lzd_fid) == FID_SEQ_ROOT) return 0; - tx = dmu_tx_create(o->od_objset.os); + tx = dmu_tx_create(o->od_os); if (tx == NULL) return -ENOMEM; @@ -750,25 +748,25 @@ int osd_convert_root_to_new_seq(const struct lu_env *env, if (rc) GOTO(err, rc); - rc = -zap_remove(o->od_objset.os, o->od_root, root2convert, tx); + rc = -zap_remove(o->od_os, o->od_root, root2convert, tx); if (rc) GOTO(err, rc); /* remove from OI */ zapid = osd_get_name_n_idx(env, o, &lze->lzd_fid, buf); - rc = -zap_remove(o->od_objset.os, zapid, buf, tx); + rc = -zap_remove(o->od_os, zapid, buf, tx); if (rc) GOTO(err, rc); lze->lzd_fid = newfid; - rc = -zap_add(o->od_objset.os, o->od_root, root2convert, + rc = -zap_add(o->od_os, o->od_root, root2convert, 8, sizeof(*lze) / 8, 
(void *)lze, tx); if (rc) GOTO(err, rc); /* add to OI with the new fid */ zapid = osd_get_name_n_idx(env, o, &newfid, buf); - rc = -zap_add(o->od_objset.os, zapid, buf, 8, 1, &lze->lzd_reg, tx); + rc = -zap_add(o->od_os, zapid, buf, 8, 1, &lze->lzd_reg, tx); if (rc) GOTO(err, rc); diff --git a/lustre/osd-zfs/osd_quota.c b/lustre/osd-zfs/osd_quota.c index b51be3b..ee2293f 100644 --- a/lustre/osd-zfs/osd_quota.c +++ b/lustre/osd-zfs/osd_quota.c @@ -43,6 +43,29 @@ uint64_t osd_quota_fid2dmu(const struct lu_fid *fid) } /** + * Helper function to estimate the number of inodes in use for a give uid/gid + * from the block usage + */ +static uint64_t osd_objset_user_iused(struct osd_device *osd, uint64_t uidbytes) +{ + uint64_t refdbytes, availbytes, usedobjs, availobjs; + uint64_t uidobjs; + + /* get fresh statfs info */ + dmu_objset_space(osd->od_os, &refdbytes, &availbytes, + &usedobjs, &availobjs); + + /* estimate the number of objects based on the disk usage */ + uidobjs = osd_objs_count_estimate(refdbytes, usedobjs, + uidbytes >> SPA_MAXBLOCKSHIFT); + if (uidbytes > 0) + /* if we have at least 1 byte, we have at least one dnode ... */ + uidobjs = max_t(uint64_t, uidobjs, 1); + + return uidobjs; +} + +/** * Space Accounting Management */ @@ -90,7 +113,7 @@ static int osd_acct_index_lookup(const struct lu_env *env, * not associated with any dmu_but_t (see dnode_special_open()). * As a consequence, we cannot use udmu_zap_lookup() here since it * requires a valid oo_db. */ - rc = -zap_lookup(osd->od_objset.os, oid, buf, sizeof(uint64_t), 1, + rc = -zap_lookup(osd->od_os, oid, buf, sizeof(uint64_t), 1, &rec->bspace); if (rc == -ENOENT) /* user/group has not created anything yet */ @@ -102,14 +125,13 @@ static int osd_acct_index_lookup(const struct lu_env *env, if (osd->od_quota_iused_est) { if (rec->bspace != 0) /* estimate #inodes in use */ - rec->ispace = udmu_objset_user_iused(&osd->od_objset, - rec->bspace); + rec->ispace = osd_objset_user_iused(osd, rec->bspace); RETURN(+1); } /* as for inode accounting, it is not maintained by DMU, so we just * use our own ZAP to track inode usage */ - rc = -zap_lookup(osd->od_objset.os, obj->oo_db->db_object, + rc = -zap_lookup(osd->od_os, obj->oo_db->db_object, buf, sizeof(uint64_t), 1, &rec->ispace); if (rc == -ENOENT) /* user/group has not created any file yet */ @@ -150,7 +172,7 @@ static struct dt_it *osd_it_acct_init(const struct lu_env *env, it->oiq_oid = osd_quota_fid2dmu(lu_object_fid(lo)); /* initialize zap cursor */ - rc = -udmu_zap_cursor_init(&it->oiq_zc, &osd->od_objset, it->oiq_oid,0); + rc = osd_zap_cursor_init(&it->oiq_zc, osd->od_os, it->oiq_oid, 0); if (rc) RETURN(ERR_PTR(rc)); @@ -171,7 +193,7 @@ static void osd_it_acct_fini(const struct lu_env *env, struct dt_it *di) { struct osd_it_quota *it = (struct osd_it_quota *)di; ENTRY; - udmu_zap_cursor_fini(it->oiq_zc); + osd_zap_cursor_fini(it->oiq_zc); lu_object_put(env, &it->oiq_obj->oo_dt.do_lu); EXIT; } @@ -188,15 +210,16 @@ static void osd_it_acct_fini(const struct lu_env *env, struct dt_it *di) static int osd_it_acct_next(const struct lu_env *env, struct dt_it *di) { struct osd_it_quota *it = (struct osd_it_quota *)di; + zap_attribute_t *za = &osd_oti_get(env)->oti_za; int rc; ENTRY; if (it->oiq_reset == 0) zap_cursor_advance(it->oiq_zc); it->oiq_reset = 0; - rc = -udmu_zap_cursor_retrieve_key(env, it->oiq_zc, NULL, 32); + rc = -zap_cursor_retrieve(it->oiq_zc, za); if (rc == -ENOENT) /* reached the end */ - RETURN(+1); + rc = 1; RETURN(rc); } @@ -209,17 +232,16 @@ static struct dt_key 
*osd_it_acct_key(const struct lu_env *env, const struct dt_it *di) { struct osd_it_quota *it = (struct osd_it_quota *)di; - struct osd_thread_info *info = osd_oti_get(env); - char *buf = info->oti_buf; - char *p; + zap_attribute_t *za = &osd_oti_get(env)->oti_za; int rc; ENTRY; it->oiq_reset = 0; - rc = -udmu_zap_cursor_retrieve_key(env, it->oiq_zc, buf, 32); + rc = -zap_cursor_retrieve(it->oiq_zc, za); if (rc) RETURN(ERR_PTR(rc)); - it->oiq_id = simple_strtoull(buf, &p, 16); + rc = kstrtoull(za->za_name, 16, &it->oiq_id); + RETURN((struct dt_key *) &it->oiq_id); } @@ -235,6 +257,43 @@ static int osd_it_acct_key_size(const struct lu_env *env, RETURN((int)sizeof(uint64_t)); } +/* + * zap_cursor_retrieve read from current record. + * to read bytes we need to call zap_lookup explicitly. + */ +static int osd_zap_cursor_retrieve_value(const struct lu_env *env, + zap_cursor_t *zc, char *buf, + int buf_size, int *bytes_read) +{ + zap_attribute_t *za = &osd_oti_get(env)->oti_za; + int rc, actual_size; + + rc = -zap_cursor_retrieve(zc, za); + if (unlikely(rc != 0)) + return -rc; + + if (unlikely(za->za_integer_length <= 0)) + return -ERANGE; + + actual_size = za->za_integer_length * za->za_num_integers; + + if (actual_size > buf_size) { + actual_size = buf_size; + buf_size = actual_size / za->za_integer_length; + } else { + buf_size = za->za_num_integers; + } + + rc = -zap_lookup(zc->zc_objset, zc->zc_zapobj, + za->za_name, za->za_integer_length, + buf_size, buf); + + if (likely(rc == 0)) + *bytes_read = actual_size; + + return rc; +} + /** * Return pointer to the record under iterator. * @@ -246,7 +305,7 @@ static int osd_it_acct_rec(const struct lu_env *env, struct dt_rec *dtrec, __u32 attr) { struct osd_thread_info *info = osd_oti_get(env); - char *buf = info->oti_buf; + zap_attribute_t *za = &info->oti_za; struct osd_it_quota *it = (struct osd_it_quota *)di; struct lquota_acct_rec *rec = (struct lquota_acct_rec *)dtrec; struct osd_object *obj = it->oiq_obj; @@ -259,33 +318,32 @@ static int osd_it_acct_rec(const struct lu_env *env, rec->ispace = rec->bspace = 0; /* retrieve block usage from the DMU accounting object */ - rc = -udmu_zap_cursor_retrieve_value(env, it->oiq_zc, - (char *)&rec->bspace, - sizeof(uint64_t), &bytes_read); + rc = osd_zap_cursor_retrieve_value(env, it->oiq_zc, + (char *)&rec->bspace, + sizeof(uint64_t), &bytes_read); if (rc) RETURN(rc); if (osd->od_quota_iused_est) { if (rec->bspace != 0) /* estimate #inodes in use */ - rec->ispace = udmu_objset_user_iused(&osd->od_objset, - rec->bspace); + rec->ispace = osd_objset_user_iused(osd, rec->bspace); RETURN(0); } /* retrieve key associated with the current cursor */ - rc = -udmu_zap_cursor_retrieve_key(env, it->oiq_zc, buf, 32); - if (rc) + rc = -zap_cursor_retrieve(it->oiq_zc, za); + if (unlikely(rc != 0)) RETURN(rc); /* inode accounting is not maintained by DMU, so we use our own ZAP to * track inode usage */ - rc = -zap_lookup(osd->od_objset.os, it->oiq_obj->oo_db->db_object, - buf, sizeof(uint64_t), 1, &rec->ispace); + rc = -zap_lookup(osd->od_os, it->oiq_obj->oo_db->db_object, + za->za_name, sizeof(uint64_t), 1, &rec->ispace); if (rc == -ENOENT) /* user/group has not created any file yet */ CDEBUG(D_QUOTA, "%s: id %s not found in accounting ZAP\n", - osd->od_svname, buf); + osd->od_svname, za->za_name); else if (rc) RETURN(rc); @@ -303,7 +361,7 @@ static __u64 osd_it_acct_store(const struct lu_env *env, struct osd_it_quota *it = (struct osd_it_quota *)di; ENTRY; it->oiq_reset = 0; - 
RETURN(udmu_zap_cursor_serialize(it->oiq_zc)); + RETURN(osd_zap_cursor_serialize(it->oiq_zc)); } /** @@ -322,23 +380,25 @@ static int osd_it_acct_load(const struct lu_env *env, { struct osd_it_quota *it = (struct osd_it_quota *)di; struct osd_device *osd = osd_obj2dev(it->oiq_obj); + zap_attribute_t *za = &osd_oti_get(env)->oti_za; zap_cursor_t *zc; int rc; ENTRY; /* create new cursor pointing to the new hash */ - rc = -udmu_zap_cursor_init(&zc, &osd->od_objset, it->oiq_oid, hash); + rc = osd_zap_cursor_init(&zc, osd->od_os, it->oiq_oid, hash); if (rc) RETURN(rc); - udmu_zap_cursor_fini(it->oiq_zc); + osd_zap_cursor_fini(it->oiq_zc); it->oiq_zc = zc; it->oiq_reset = 0; - rc = -udmu_zap_cursor_retrieve_key(env, it->oiq_zc, NULL, 32); + rc = -zap_cursor_retrieve(it->oiq_zc, za); if (rc == 0) - RETURN(+1); + rc = 1; else if (rc == -ENOENT) - RETURN(0); + rc = 0; + RETURN(rc); } diff --git a/lustre/osd-zfs/osd_xattr.c b/lustre/osd-zfs/osd_xattr.c index f66f986..a24aa34 100644 --- a/lustre/osd-zfs/osd_xattr.c +++ b/lustre/osd-zfs/osd_xattr.c @@ -81,7 +81,7 @@ * * No locking is done here. */ -int __osd_xattr_load(udmu_objset_t *uos, uint64_t dnode, nvlist_t **sa_xattr) +int __osd_xattr_load(struct osd_device *osd, uint64_t dnode, nvlist_t **sa) { sa_handle_t *sa_hdl; char *buf; @@ -90,14 +90,14 @@ int __osd_xattr_load(udmu_objset_t *uos, uint64_t dnode, nvlist_t **sa_xattr) if (unlikely(dnode == ZFS_NO_OBJECT)) return -ENOENT; - rc = -sa_handle_get(uos->os, dnode, NULL, SA_HDL_PRIVATE, &sa_hdl); + rc = -sa_handle_get(osd->od_os, dnode, NULL, SA_HDL_PRIVATE, &sa_hdl); if (rc) return rc; - rc = -sa_size(sa_hdl, SA_ZPL_DXATTR(uos), &size); + rc = -sa_size(sa_hdl, SA_ZPL_DXATTR(osd), &size); if (rc) { if (rc == -ENOENT) - rc = -nvlist_alloc(sa_xattr, NV_UNIQUE_NAME, KM_SLEEP); + rc = -nvlist_alloc(sa, NV_UNIQUE_NAME, KM_SLEEP); goto out_sa; } @@ -106,9 +106,9 @@ int __osd_xattr_load(udmu_objset_t *uos, uint64_t dnode, nvlist_t **sa_xattr) rc = -ENOMEM; goto out_sa; } - rc = -sa_lookup(sa_hdl, SA_ZPL_DXATTR(uos), buf, size); + rc = -sa_lookup(sa_hdl, SA_ZPL_DXATTR(osd), buf, size); if (rc == 0) - rc = -nvlist_unpack(buf, size, sa_xattr, KM_SLEEP); + rc = -nvlist_unpack(buf, size, sa, KM_SLEEP); sa_spill_free(buf); out_sa: sa_handle_destroy(sa_hdl); @@ -122,12 +122,12 @@ static inline int __osd_xattr_cache(const struct lu_env *env, LASSERT(obj->oo_sa_xattr == NULL); LASSERT(obj->oo_db != NULL); - return __osd_xattr_load(&osd_obj2dev(obj)->od_objset, - obj->oo_db->db_object, &obj->oo_sa_xattr); + return __osd_xattr_load(osd_obj2dev(obj), obj->oo_db->db_object, + &obj->oo_sa_xattr); } int __osd_sa_xattr_get(const struct lu_env *env, struct osd_object *obj, - const struct lu_buf *buf, const char *name, int *sizep) + const struct lu_buf *buf, const char *name, int *sizep) { uchar_t *nv_value; int rc; @@ -158,7 +158,7 @@ int __osd_sa_xattr_get(const struct lu_env *env, struct osd_object *obj, return 0; } -int __osd_xattr_get_large(const struct lu_env *env, udmu_objset_t *uos, +int __osd_xattr_get_large(const struct lu_env *env, struct osd_device *osd, uint64_t xattr, struct lu_buf *buf, const char *name, int *sizep) { @@ -172,22 +172,22 @@ int __osd_xattr_get_large(const struct lu_env *env, udmu_objset_t *uos, return -ENOENT; /* Lookup the object number containing the xattr data */ - rc = -zap_lookup(uos->os, xattr, name, sizeof(uint64_t), 1, + rc = -zap_lookup(osd->od_os, xattr, name, sizeof(uint64_t), 1, &xa_data_obj); if (rc) return rc; - rc = __osd_obj2dbuf(env, uos->os, xa_data_obj, &xa_data_db, 
FTAG); + rc = __osd_obj2dbuf(env, osd->od_os, xa_data_obj, &xa_data_db); if (rc) return rc; - rc = -sa_handle_get(uos->os, xa_data_obj, NULL, SA_HDL_PRIVATE, + rc = -sa_handle_get(osd->od_os, xa_data_obj, NULL, SA_HDL_PRIVATE, &sa_hdl); if (rc) goto out_rele; /* Get the xattr value length / object size */ - rc = -sa_lookup(sa_hdl, SA_ZPL_SIZE(uos), &size, 8); + rc = -sa_lookup(sa_hdl, SA_ZPL_SIZE(osd), &size, 8); if (rc) goto out; @@ -207,7 +207,7 @@ int __osd_xattr_get_large(const struct lu_env *env, udmu_objset_t *uos, goto out; } - rc = -dmu_read(uos->os, xa_data_db->db_object, 0, + rc = -dmu_read(osd->od_os, xa_data_db->db_object, 0, size, buf->lb_buf, DMU_READ_PREFETCH); out: @@ -219,7 +219,7 @@ out_rele: } int __osd_xattr_get(const struct lu_env *env, struct osd_object *obj, - struct lu_buf *buf, const char *name, int *sizep) + struct lu_buf *buf, const char *name, int *sizep) { int rc; @@ -228,15 +228,15 @@ int __osd_xattr_get(const struct lu_env *env, struct osd_object *obj, if (rc != -ENOENT) return rc; - rc = __osd_xattr_get_large(env, &osd_obj2dev(obj)->od_objset, - obj->oo_xattr, buf, name, sizep); + rc = __osd_xattr_get_large(env, osd_obj2dev(obj), obj->oo_xattr, + buf, name, sizep); return rc; } int osd_xattr_get(const struct lu_env *env, struct dt_object *dt, - struct lu_buf *buf, const char *name, - struct lustre_capa *capa) + struct lu_buf *buf, const char *name, + struct lustre_capa *capa) { struct osd_object *obj = osd_dt_obj(dt); int rc, size = 0; @@ -263,10 +263,10 @@ int osd_xattr_get(const struct lu_env *env, struct dt_object *dt, } void __osd_xattr_declare_set(const struct lu_env *env, struct osd_object *obj, - int vallen, const char *name, struct osd_thandle *oh) + int vallen, const char *name, + struct osd_thandle *oh) { struct osd_device *osd = osd_obj2dev(obj); - udmu_objset_t *uos = &osd->od_objset; dmu_buf_t *db = obj->oo_db; dmu_tx_t *tx = oh->ot_tx; uint64_t xa_data_obj; @@ -300,7 +300,7 @@ void __osd_xattr_declare_set(const struct lu_env *env, struct osd_object *obj, return; } - rc = -zap_lookup(uos->os, obj->oo_xattr, name, sizeof(uint64_t), 1, + rc = -zap_lookup(osd->od_os, obj->oo_xattr, name, sizeof(uint64_t), 1, &xa_data_obj); if (rc == 0) { /* @@ -328,8 +328,8 @@ void __osd_xattr_declare_set(const struct lu_env *env, struct osd_object *obj, } int osd_declare_xattr_set(const struct lu_env *env, struct dt_object *dt, - const struct lu_buf *buf, const char *name, - int fl, struct thandle *handle) + const struct lu_buf *buf, const char *name, + int fl, struct thandle *handle) { struct osd_object *obj = osd_dt_obj(dt); struct osd_thandle *oh; @@ -355,10 +355,9 @@ int osd_declare_xattr_set(const struct lu_env *env, struct dt_object *dt, */ static int __osd_sa_xattr_update(const struct lu_env *env, struct osd_object *obj, - struct osd_thandle *oh) + struct osd_thandle *oh) { struct osd_device *osd = osd_obj2dev(obj); - udmu_objset_t *uos = &osd->od_objset; char *dxattr; size_t sa_size; int rc; @@ -381,15 +380,15 @@ __osd_sa_xattr_update(const struct lu_env *env, struct osd_object *obj, if (rc) GOTO(out_free, rc); - rc = osd_object_sa_update(obj, SA_ZPL_DXATTR(uos), dxattr, sa_size, oh); + rc = osd_object_sa_update(obj, SA_ZPL_DXATTR(osd), dxattr, sa_size, oh); out_free: sa_spill_free(dxattr); RETURN(rc); } int __osd_sa_xattr_set(const struct lu_env *env, struct osd_object *obj, - const struct lu_buf *buf, const char *name, int fl, - struct osd_thandle *oh) + const struct lu_buf *buf, const char *name, int fl, + struct osd_thandle *oh) { uchar_t *nv_value; 
size_t size; @@ -445,14 +444,14 @@ int __osd_sa_xattr_set(const struct lu_env *env, struct osd_object *obj, /* Ensure xattr doesn't exist in ZAP */ if (obj->oo_xattr != ZFS_NO_OBJECT) { - udmu_objset_t *uos = &osd_obj2dev(obj)->od_objset; - uint64_t xa_data_obj; - rc = -zap_lookup(uos->os, obj->oo_xattr, - name, 8, 1, &xa_data_obj); + struct osd_device *osd = osd_obj2dev(obj); + uint64_t objid; + rc = -zap_lookup(osd->od_os, obj->oo_xattr, + name, 8, 1, &objid); if (rc == 0) { - rc = __osd_object_free(uos, xa_data_obj, oh->ot_tx); + rc = -dmu_object_free(osd->od_os, objid, oh->ot_tx); if (rc == 0) - zap_remove(uos->os, obj->oo_xattr, + zap_remove(osd->od_os, obj->oo_xattr, name, oh->ot_tx); } } @@ -472,7 +471,6 @@ __osd_xattr_set(const struct lu_env *env, struct osd_object *obj, struct osd_thandle *oh) { struct osd_device *osd = osd_obj2dev(obj); - udmu_objset_t *uos = &osd->od_objset; dmu_buf_t *xa_zap_db = NULL; dmu_buf_t *xa_data_db = NULL; uint64_t xa_data_obj; @@ -488,20 +486,20 @@ __osd_xattr_set(const struct lu_env *env, struct osd_object *obj, la->la_valid = LA_MODE; la->la_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; - rc = __osd_zap_create(env, uos, &xa_zap_db, tx, la, - obj->oo_db->db_object, FTAG, 0); + rc = __osd_zap_create(env, osd, &xa_zap_db, tx, la, + obj->oo_db->db_object, 0); if (rc) return rc; obj->oo_xattr = xa_zap_db->db_object; - rc = osd_object_sa_update(obj, SA_ZPL_XATTR(uos), + rc = osd_object_sa_update(obj, SA_ZPL_XATTR(osd), &obj->oo_xattr, 8, oh); if (rc) goto out; } - rc = -zap_lookup(uos->os, obj->oo_xattr, name, sizeof(uint64_t), 1, - &xa_data_obj); + rc = -zap_lookup(osd->od_os, obj->oo_xattr, name, sizeof(uint64_t), 1, + &xa_data_obj); if (rc == 0) { if (fl & LU_XATTR_CREATE) { rc = -EEXIST; @@ -511,21 +509,21 @@ __osd_xattr_set(const struct lu_env *env, struct osd_object *obj, * Entry already exists. * We'll truncate the existing object. 
*/ - rc = __osd_obj2dbuf(env, uos->os, xa_data_obj, - &xa_data_db, FTAG); + rc = __osd_obj2dbuf(env, osd->od_os, xa_data_obj, + &xa_data_db); if (rc) goto out; - rc = -sa_handle_get(uos->os, xa_data_obj, NULL, + rc = -sa_handle_get(osd->od_os, xa_data_obj, NULL, SA_HDL_PRIVATE, &sa_hdl); if (rc) goto out; - rc = -sa_lookup(sa_hdl, SA_ZPL_SIZE(uos), &size, 8); + rc = -sa_lookup(sa_hdl, SA_ZPL_SIZE(osd), &size, 8); if (rc) goto out_sa; - rc = -dmu_free_range(uos->os, xa_data_db->db_object, + rc = -dmu_free_range(osd->od_os, xa_data_db->db_object, 0, DMU_OBJECT_END, tx); if (rc) goto out_sa; @@ -544,18 +542,18 @@ __osd_xattr_set(const struct lu_env *env, struct osd_object *obj, la->la_valid = LA_MODE; la->la_mode = S_IFREG | S_IRUGO | S_IWUSR; - rc = __osd_object_create(env, uos, &xa_data_db, tx, la, - obj->oo_xattr, FTAG); + rc = __osd_object_create(env, osd, &xa_data_db, tx, la, + obj->oo_xattr); if (rc) goto out; xa_data_obj = xa_data_db->db_object; - rc = -sa_handle_get(uos->os, xa_data_obj, NULL, + rc = -sa_handle_get(osd->od_os, xa_data_obj, NULL, SA_HDL_PRIVATE, &sa_hdl); if (rc) goto out; - rc = -zap_add(uos->os, obj->oo_xattr, name, sizeof(uint64_t), + rc = -zap_add(osd->od_os, obj->oo_xattr, name, sizeof(uint64_t), 1, &xa_data_obj, tx); if (rc) goto out_sa; @@ -565,10 +563,10 @@ __osd_xattr_set(const struct lu_env *env, struct osd_object *obj, } /* Finally write the xattr value */ - dmu_write(uos->os, xa_data_obj, 0, buf->lb_len, buf->lb_buf, tx); + dmu_write(osd->od_os, xa_data_obj, 0, buf->lb_len, buf->lb_buf, tx); size = buf->lb_len; - rc = -sa_update(sa_hdl, SA_ZPL_SIZE(uos), &size, 8, tx); + rc = -sa_update(sa_hdl, SA_ZPL_SIZE(osd), &size, 8, tx); out_sa: sa_handle_destroy(sa_hdl); @@ -616,7 +614,6 @@ __osd_xattr_declare_del(const struct lu_env *env, struct osd_object *obj, const char *name, struct osd_thandle *oh) { struct osd_device *osd = osd_obj2dev(obj); - udmu_objset_t *uos = &osd->od_objset; dmu_tx_t *tx = oh->ot_tx; uint64_t xa_data_obj; int rc; @@ -627,7 +624,7 @@ __osd_xattr_declare_del(const struct lu_env *env, struct osd_object *obj, if (obj->oo_xattr == ZFS_NO_OBJECT) return; - rc = -zap_lookup(uos->os, obj->oo_xattr, name, 8, 1, &xa_data_obj); + rc = -zap_lookup(osd->od_os, obj->oo_xattr, name, 8, 1, &xa_data_obj); if (rc == 0) { /* * Entry exists. 
@@ -649,7 +646,7 @@ __osd_xattr_declare_del(const struct lu_env *env, struct osd_object *obj, } int osd_declare_xattr_del(const struct lu_env *env, struct dt_object *dt, - const char *name, struct thandle *handle) + const char *name, struct thandle *handle) { struct osd_object *obj = osd_dt_obj(dt); struct osd_thandle *oh; @@ -671,7 +668,7 @@ int osd_declare_xattr_del(const struct lu_env *env, struct dt_object *dt, } int __osd_sa_xattr_del(const struct lu_env *env, struct osd_object *obj, - const char *name, struct osd_thandle *oh) + const char *name, struct osd_thandle *oh) { int rc; @@ -688,10 +685,9 @@ int __osd_sa_xattr_del(const struct lu_env *env, struct osd_object *obj, } int __osd_xattr_del(const struct lu_env *env, struct osd_object *obj, - const char *name, struct osd_thandle *oh) + const char *name, struct osd_thandle *oh) { struct osd_device *osd = osd_obj2dev(obj); - udmu_objset_t *uos = &osd->od_objset; uint64_t xa_data_obj; int rc; @@ -703,7 +699,7 @@ int __osd_xattr_del(const struct lu_env *env, struct osd_object *obj, if (obj->oo_xattr == ZFS_NO_OBJECT) return 0; - rc = -zap_lookup(uos->os, obj->oo_xattr, name, sizeof(uint64_t), 1, + rc = -zap_lookup(osd->od_os, obj->oo_xattr, name, sizeof(uint64_t), 1, &xa_data_obj); if (rc == -ENOENT) { rc = 0; @@ -712,19 +708,19 @@ int __osd_xattr_del(const struct lu_env *env, struct osd_object *obj, * Entry exists. * We'll delete the existing object and ZAP entry. */ - rc = __osd_object_free(uos, xa_data_obj, oh->ot_tx); + rc = -dmu_object_free(osd->od_os, xa_data_obj, oh->ot_tx); if (rc) return rc; - rc = -zap_remove(uos->os, obj->oo_xattr, name, oh->ot_tx); + rc = -zap_remove(osd->od_os, obj->oo_xattr, name, oh->ot_tx); } return rc; } int osd_xattr_del(const struct lu_env *env, struct dt_object *dt, - const char *name, struct thandle *handle, - struct lustre_capa *capa) + const char *name, struct thandle *handle, + struct lustre_capa *capa) { struct osd_object *obj = osd_dt_obj(dt); struct osd_thandle *oh; @@ -752,7 +748,7 @@ int osd_xattr_del(const struct lu_env *env, struct dt_object *dt, static int osd_sa_xattr_list(const struct lu_env *env, struct osd_object *obj, - struct lu_buf *lb) + struct lu_buf *lb) { nvpair_t *nvp = NULL; int len, counted = 0, remain = lb->lb_len; @@ -791,12 +787,11 @@ osd_sa_xattr_list(const struct lu_env *env, struct osd_object *obj, } int osd_xattr_list(const struct lu_env *env, struct dt_object *dt, - struct lu_buf *lb, struct lustre_capa *capa) + struct lu_buf *lb, struct lustre_capa *capa) { - struct osd_thread_info *oti = osd_oti_get(env); struct osd_object *obj = osd_dt_obj(dt); struct osd_device *osd = osd_obj2dev(obj); - udmu_objset_t *uos = &osd->od_objset; + zap_attribute_t *za = &osd_oti_get(env)->oti_za; zap_cursor_t *zc; int rc, counted = 0, remain = lb->lb_len; ENTRY; @@ -817,25 +812,24 @@ int osd_xattr_list(const struct lu_env *env, struct dt_object *dt, if (obj->oo_xattr == ZFS_NO_OBJECT) GOTO(out, rc = counted); - rc = -udmu_zap_cursor_init(&zc, uos, obj->oo_xattr, 0); + rc = osd_zap_cursor_init(&zc, osd->od_os, obj->oo_xattr, 0); if (rc) GOTO(out, rc); - while ((rc = -udmu_zap_cursor_retrieve_key(env, zc, oti->oti_key, - MAXNAMELEN)) == 0) { + while ((rc = -zap_cursor_retrieve(zc, za)) == 0) { if (!osd_obj2dev(obj)->od_posix_acl && - (strcmp(oti->oti_key, POSIX_ACL_XATTR_ACCESS) == 0 || - strcmp(oti->oti_key, POSIX_ACL_XATTR_DEFAULT) == 0)) { + (strcmp(za->za_name, POSIX_ACL_XATTR_ACCESS) == 0 || + strcmp(za->za_name, POSIX_ACL_XATTR_DEFAULT) == 0)) { zap_cursor_advance(zc); continue; } 
- rc = strlen(oti->oti_key); + rc = strlen(za->za_name); if (lb->lb_buf != NULL) { if (rc + 1 > remain) RETURN(-ERANGE); - memcpy(lb->lb_buf, oti->oti_key, rc); + memcpy(lb->lb_buf, za->za_name, rc); lb->lb_buf += rc; *((char *)lb->lb_buf) = '\0'; lb->lb_buf++; @@ -852,7 +846,7 @@ int osd_xattr_list(const struct lu_env *env, struct dt_object *dt, rc = counted; out_fini: - udmu_zap_cursor_fini(zc); + osd_zap_cursor_fini(zc); out: up(&obj->oo_guard); RETURN(rc); diff --git a/lustre/osd-zfs/udmu.c b/lustre/osd-zfs/udmu.c deleted file mode 100644 index ce3df6a..0000000 --- a/lustre/osd-zfs/udmu.c +++ /dev/null @@ -1,417 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. - * - * GPL HEADER END - */ -/* - * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2012, Intel Corporation. - * Use is subject to license terms. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/osd-zfs/udmu.c - * Module that interacts with the ZFS DMU and provides an abstraction - * to the rest of Lustre. - * - * Author: Alex Zhuravlev - * Author: Atul Vidwansa - * Author: Manoj Joseph - * Author: Mike Pershin - */ - -#include /* OBD_OBJECT_EOF */ -#include /* struct obd_statfs */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "udmu.h" - -int udmu_blk_insert_cost(void) -{ - int max_blockshift, nr_blkptrshift; - - /* max_blockshift is the log2 of the number of blocks needed to reach - * the maximum filesize (that's to say 2^64) */ - max_blockshift = DN_MAX_OFFSET_SHIFT - SPA_MAXBLOCKSHIFT; - - /* nr_blkptrshift is the log2 of the number of block pointers that can - * be stored in an indirect block */ - CLASSERT(DN_MAX_INDBLKSHIFT > SPA_BLKPTRSHIFT); - nr_blkptrshift = DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT; - - /* max_blockshift / nr_blkptrshift is thus the maximum depth of the - * tree. We add +1 for rounding purpose. 
- * The tree depth times the indirect block size gives us the maximum - * cost of inserting a block in the tree */ - return (max_blockshift / nr_blkptrshift + 1) * (1 << DN_MAX_INDBLKSHIFT); -} - -int udmu_objset_open(char *osname, udmu_objset_t *uos) -{ - uint64_t refdbytes, availbytes, usedobjs, availobjs; - uint64_t version = ZPL_VERSION; - uint64_t sa_obj; - int error; - - memset(uos, 0, sizeof(udmu_objset_t)); - - error = dmu_objset_own(osname, DMU_OST_ZFS, B_FALSE, uos, &uos->os); - if (error) { - uos->os = NULL; - goto out; - } - - /* Check ZFS version */ - error = zap_lookup(uos->os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1, - &version); - if (error) { - CERROR("%s: Error looking up ZPL VERSION\n", osname); - /* - * We can't return ENOENT because that would mean the objset - * didn't exist. - */ - error = EIO; - goto out; - } - - error = zap_lookup(uos->os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, - &sa_obj); - if (error) - goto out; - - error = sa_setup(uos->os, sa_obj, zfs_attr_table, ZPL_END, - &uos->z_attr_table); - if (error) - goto out; - - error = zap_lookup(uos->os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, - &uos->root); - if (error) { - CERROR("%s: Error looking up ZFS root object.\n", osname); - error = EIO; - goto out; - } - ASSERT(uos->root != 0); - - /* Check that user/group usage tracking is supported */ - if (!dmu_objset_userused_enabled(uos->os) || - DMU_USERUSED_DNODE(uos->os)->dn_type != DMU_OT_USERGROUP_USED || - DMU_GROUPUSED_DNODE(uos->os)->dn_type != DMU_OT_USERGROUP_USED) { - CERROR("%s: Space accounting not supported by this target, " - "aborting\n", osname); - error = ENOTSUPP; - goto out; - } - - /* - * as DMU doesn't maintain f_files absolutely actual (it's updated - * at flush, not when object is create/destroyed) we've implemented - * own counter which is initialized from on-disk at mount, then is - * being maintained by DMU OSD - */ - dmu_objset_space(uos->os, &refdbytes, &availbytes, &usedobjs, - &availobjs); - uos->objects = usedobjs; - spin_lock_init(&uos->lock); - -out: - if (error && uos->os != NULL) - dmu_objset_disown(uos->os, uos); - - return error; -} - -void udmu_objset_close(udmu_objset_t *uos) -{ - ASSERT(uos->os != NULL); - - /* - * Force a txg sync. This should not be needed, neither for - * correctness nor safety. Presumably, we are only doing - * this to force commit callbacks to be called sooner. - */ - txg_wait_synced(dmu_objset_pool(uos->os), 0ULL); - - /* close the object set */ - dmu_objset_disown(uos->os, uos); - - uos->os = NULL; -} - -/* Estimate the number of objects from a number of blocks */ -static uint64_t udmu_objs_count_estimate(uint64_t refdbytes, - uint64_t usedobjs, - uint64_t nrblocks) -{ - uint64_t est_objs, est_refdblocks, est_usedobjs; - - /* Compute an nrblocks estimate based on the actual number of - * dnodes that could fit in the space. Since we don't know the - * overhead associated with each dnode (xattrs, SAs, VDEV overhead, - * etc) just using DNODE_SHIFT isn't going to give a good estimate. - * Instead, compute an estimate based on the average space usage per - * dnode, with an upper and lower cap. - * - * In case there aren't many dnodes or blocks used yet, add a small - * correction factor using OSD_DNODE_EST_SHIFT. This correction - * factor gradually disappears as the number of real dnodes grows. - * This also avoids the need to check for divide-by-zero later. 
- */ - CLASSERT(OSD_DNODE_MIN_BLKSHIFT > 0); - CLASSERT(OSD_DNODE_EST_BLKSHIFT > 0); - - est_refdblocks = (refdbytes >> SPA_MAXBLOCKSHIFT) + - (OSD_DNODE_EST_COUNT >> OSD_DNODE_EST_BLKSHIFT); - est_usedobjs = usedobjs + OSD_DNODE_EST_COUNT; - - /* Average space/dnode more than maximum dnode size, use max dnode - * size to estimate free dnodes from adjusted free blocks count. - * OSTs typically use more than one block dnode so this case applies. */ - if (est_usedobjs <= est_refdblocks * 2) { - est_objs = nrblocks; - - /* Average space/dnode smaller than min dnode size (probably due to - * metadnode compression), use min dnode size to estimate the number of - * objects. - * An MDT typically uses below 512 bytes/dnode so this case applies. */ - } else if (est_usedobjs >= (est_refdblocks << OSD_DNODE_MIN_BLKSHIFT)) { - est_objs = nrblocks << OSD_DNODE_MIN_BLKSHIFT; - - /* Between the extremes, we try to use the average size of - * existing dnodes to compute the number of dnodes that fit - * into nrblocks: - * - * est_objs = nrblocks * (est_usedobjs / est_refblocks); - * - * but this may overflow 64 bits or become 0 if not handled well - * - * We know nrblocks is below (64 - 17 = 47) bits from - * SPA_MAXBLKSHIFT, and est_usedobjs is under 48 bits due to - * DN_MAX_OBJECT_SHIFT, which means that multiplying them may - * get as large as 2 ^ 95. - * - * We also know (est_usedobjs / est_refdblocks) is between 2 and - * 256, due to above checks, so we can safely compute this first. - * We care more about accuracy on the MDT (many dnodes/block) - * which is good because this is where truncation errors are - * smallest. This adds 8 bits to nrblocks so we can use 7 bits - * to compute a fixed-point fraction and nrblocks can still fit - * in 64 bits. */ - } else { - unsigned dnodes_per_block = (est_usedobjs << 7)/est_refdblocks; - - est_objs = (nrblocks * dnodes_per_block) >> 7; - } - return est_objs; -} - -int udmu_objset_statfs(udmu_objset_t *uos, struct obd_statfs *osfs) -{ - uint64_t refdbytes, availbytes, usedobjs, availobjs; - uint64_t est_availobjs; - uint64_t reserved; - - dmu_objset_space(uos->os, &refdbytes, &availbytes, &usedobjs, - &availobjs); - - /* - * ZFS allows multiple block sizes. For statfs, Linux makes no - * proper distinction between bsize and frsize. For calculations - * of free and used blocks incorrectly uses bsize instead of frsize, - * but bsize is also used as the optimal blocksize. We return the - * largest possible block size as IO size for the optimum performance - * and scale the free and used blocks count appropriately. - */ - osfs->os_bsize = 1ULL << SPA_MAXBLOCKSHIFT; - - osfs->os_blocks = (refdbytes + availbytes) >> SPA_MAXBLOCKSHIFT; - osfs->os_bfree = availbytes >> SPA_MAXBLOCKSHIFT; - osfs->os_bavail = osfs->os_bfree; /* no extra root reservation */ - - /* Take replication (i.e. number of copies) into account */ - osfs->os_bavail /= uos->os->os_copies; - - /* - * Reserve some space so we don't run into ENOSPC due to grants not - * accounting for metadata overhead in ZFS, and to avoid fragmentation. - * Rather than report this via os_bavail (which makes users unhappy if - * they can't fill the filesystem 100%), reduce os_blocks as well. - * - * Reserve 0.78% of total space, at least 4MB for small filesystems, - * for internal files to be created/unlinked when space is tight. 
- */ - CLASSERT(OSD_STATFS_RESERVED_BLKS > 0); - if (likely(osfs->os_blocks >= - OSD_STATFS_RESERVED_BLKS << OSD_STATFS_RESERVED_SHIFT)) - reserved = osfs->os_blocks >> OSD_STATFS_RESERVED_SHIFT; - else - reserved = OSD_STATFS_RESERVED_BLKS; - - osfs->os_blocks -= reserved; - osfs->os_bfree -= MIN(reserved, osfs->os_bfree); - osfs->os_bavail -= MIN(reserved, osfs->os_bavail); - - /* - * The availobjs value returned from dmu_objset_space() is largely - * useless, since it reports the number of objects that might - * theoretically still fit into the dataset, independent of minor - * issues like how much space is actually available in the pool. - * Compute a better estimate in udmu_objs_count_estimate(). - */ - est_availobjs = udmu_objs_count_estimate(refdbytes, usedobjs, - osfs->os_bfree); - - osfs->os_ffree = min(availobjs, est_availobjs); - osfs->os_files = osfs->os_ffree + uos->objects; - - /* ZFS XXX: fill in backing dataset FSID/UUID - memcpy(osfs->os_fsid, .... );*/ - - /* We're a zfs filesystem. */ - osfs->os_type = UBERBLOCK_MAGIC; - - /* ZFS XXX: fill in appropriate OS_STATE_{DEGRADED,READONLY} flags - osfs->os_state = vf_to_stf(vfsp->vfs_flag); - if (sb->s_flags & MS_RDONLY) - osfs->os_state = OS_STATE_READONLY; - */ - - osfs->os_namelen = MAXNAMELEN; - osfs->os_maxbytes = OBD_OBJECT_EOF; - - return 0; -} - -/** - * Helper function to estimate the number of inodes in use for a give uid/gid - * from the block usage - */ -uint64_t udmu_objset_user_iused(udmu_objset_t *uos, uint64_t uidbytes) -{ - uint64_t refdbytes, availbytes, usedobjs, availobjs; - uint64_t uidobjs; - - /* get fresh statfs info */ - dmu_objset_space(uos->os, &refdbytes, &availbytes, &usedobjs, - &availobjs); - - /* estimate the number of objects based on the disk usage */ - uidobjs = udmu_objs_count_estimate(refdbytes, usedobjs, - uidbytes >> SPA_MAXBLOCKSHIFT); - if (uidbytes > 0) - /* if we have at least 1 byte, we have at least one dnode ... */ - uidobjs = max_t(uint64_t, uidobjs, 1); - return uidobjs; -} - -/* We don't actually have direct access to the zap_hashbits() function - * so just pretend like we do for now. If this ever breaks we can look at - * it at that time. */ -#define zap_hashbits(zc) 48 -/* - * ZFS hash format: - * | cd (16 bits) | hash (48 bits) | - * we need it in other form: - * |0| hash (48 bit) | cd (15 bit) | - * to be a full 64-bit ordered hash so that Lustre readdir can use it to merge - * the readdir hashes from multiple directory stripes uniformly on the client. - * Another point is sign bit, the hash range should be in [0, 2^63-1] because - * loff_t (for llseek) needs to be a positive value. This means the "cd" field - * should only be the low 15 bits. 
- */ -uint64_t udmu_zap_cursor_serialize(zap_cursor_t *zc) -{ - uint64_t zfs_hash = zap_cursor_serialize(zc) & (~0ULL >> 1); - - return (zfs_hash >> zap_hashbits(zc)) | - (zfs_hash << (63 - zap_hashbits(zc))); -} - -void udmu_zap_cursor_init_serialized(zap_cursor_t *zc, udmu_objset_t *uos, - uint64_t zapobj, uint64_t dirhash) -{ - uint64_t zfs_hash = ((dirhash << zap_hashbits(zc)) & (~0ULL >> 1)) | - (dirhash >> (63 - zap_hashbits(zc))); - zap_cursor_init_serialized(zc, uos->os, zapobj, zfs_hash); -} - -/* - * Zap cursor APIs - */ -int udmu_zap_cursor_init(zap_cursor_t **zc, udmu_objset_t *uos, - uint64_t zapobj, uint64_t dirhash) -{ - zap_cursor_t *t; - - t = kmem_alloc(sizeof(*t), KM_NOSLEEP); - if (t) { - udmu_zap_cursor_init_serialized(t, uos, zapobj, dirhash); - *zc = t; - return 0; - } - return (ENOMEM); -} - -void udmu_zap_cursor_fini(zap_cursor_t *zc) -{ - zap_cursor_fini(zc); - kmem_free(zc, sizeof(*zc)); -} - -/* - * Get the object id from dmu_buf_t - */ -int udmu_object_is_zap(dmu_buf_t *db) -{ - dmu_buf_impl_t *dbi = (dmu_buf_impl_t *) db; - dnode_t *dn; - int rc; - - DB_DNODE_ENTER(dbi); - - dn = DB_DNODE(dbi); - rc = (dn->dn_type == DMU_OT_DIRECTORY_CONTENTS || - dn->dn_type == DMU_OT_USERGROUP_USED); - - DB_DNODE_EXIT(dbi); - - return rc; -} - diff --git a/lustre/osd-zfs/udmu.h b/lustre/osd-zfs/udmu.h deleted file mode 100644 index 9fec38f..0000000 --- a/lustre/osd-zfs/udmu.h +++ /dev/null @@ -1,109 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. - * - * GPL HEADER END - */ -/* - * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2012, Intel Corporation. - * Use is subject to license terms. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/osd-zfs/udmu.h - * - * Author: Alex Tomas - * Author: Atul Vidwansa - * Author: Manoj Joseph - */ - -#ifndef _DMU_H -#define _DMU_H - -#include -#include -#include - -#include - -typedef struct udmu_objset { - struct objset *os; - uint64_t root; /* id of root znode */ - spinlock_t lock; /* protects objects below */ - uint64_t objects; /* in-core counter of objects */ - /* SA attr mapping->id, - * name is the same as in ZFS to use defines SA_ZPL_...*/ - sa_attr_type_t *z_attr_table; -} udmu_objset_t; - -#ifndef _SYS_TXG_H -#define TXG_WAIT 1ULL -#define TXG_NOWAIT 2ULL -#endif - -#define ZFS_DIRENT_MAKE(type, obj) (((uint64_t)type << 60) | obj) - -/* Statfs space reservation for grant, fragmentation, and unlink space. 
*/ -#define OSD_STATFS_RESERVED_BLKS (1ULL << (22 - SPA_MAXBLOCKSHIFT)) /* 4MB */ -#define OSD_STATFS_RESERVED_SHIFT (7) /* reserve 0.78% of all space */ - -/* Statfs {minimum, safe estimate, and maximum} dnodes per block */ -#define OSD_DNODE_MIN_BLKSHIFT (SPA_MAXBLOCKSHIFT - DNODE_SHIFT) /* 17-9 =8 */ -#define OSD_DNODE_EST_BLKSHIFT (SPA_MAXBLOCKSHIFT - 12) /* 17-12=5 */ -#define OSD_DNODE_EST_COUNT 1024 - -#define OSD_GRANT_FOR_LOCAL_OIDS (2ULL << 20) /* 2MB for last_rcvd, ... */ - -void udmu_init(void); -void udmu_fini(void); - -/* udmu object-set API */ -int udmu_objset_open(char *osname, udmu_objset_t *uos); -void udmu_objset_close(udmu_objset_t *uos); -int udmu_objset_statfs(udmu_objset_t *uos, struct obd_statfs *osfs); -uint64_t udmu_objset_user_iused(udmu_objset_t *uos, uint64_t uidbytes); -int udmu_objset_root(udmu_objset_t *uos, dmu_buf_t **dbp, void *tag); -uint64_t udmu_get_txg(udmu_objset_t *uos, dmu_tx_t *tx); -int udmu_blk_insert_cost(void); - -/* zap cursor apis */ -int udmu_zap_cursor_init(zap_cursor_t **zc, udmu_objset_t *uos, - uint64_t zapobj, uint64_t hash); - -void udmu_zap_cursor_fini(zap_cursor_t *zc); - -void udmu_zap_cursor_advance(zap_cursor_t *zc); - -uint64_t udmu_zap_cursor_serialize(zap_cursor_t *zc); - -int udmu_zap_cursor_move_to_key(zap_cursor_t *zc, const char *name); - -/* Commit callbacks */ -int udmu_object_is_zap(dmu_buf_t *); - -#endif /* _DMU_H */ diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index 0eeaf3cf..1bc644b 100755 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -5608,30 +5608,24 @@ check_catastrophe() { } # CMD: determine mds index where directory inode presents -get_mds_dir () { +get_mds_dir() { local dir=$1 - local file=$dir/f0.get_mds_dir_tmpfile + local SEQ - mkdir -p $dir - rm -f $file - sleep 1 - local iused=$(lfs df -i $dir | grep MDT | awk '{print $3}') - local -a oldused=($iused) - - openfile -f O_CREAT:O_LOV_DELAY_CREATE -m 0644 $file > /dev/null - sleep 1 - iused=$(lfs df -i $dir | grep MDT | awk '{print $3}') - local -a newused=($iused) - - local num=0 - for ((i=0; i<${#newused[@]}; i++)); do - if [ ${oldused[$i]} -lt ${newused[$i]} ]; then - echo $(( i + 1 )) - rm -f $file - return 0 - fi - done - error "mdt-s : inodes count OLD ${oldused[@]} NEW ${newused[@]}" + SEQ=$(lfs path2fid $dir | tr '[:]' ' '|cut -f2 -d ' ') + if [ "$SEQ" == "" ]; then + error "can't get sequence for $dir" + return 1 + fi + export SEQ + + do_facet mds1 "cat /proc/fs/lustre/fld/srv-*-MDT0000/fldb" | \ + tr '[)]:-' ' ' | \ + while read SS EE IDX TYP; do \ + if let "SEQ >= SS && SEQ < EE"; then \ + echo $IDX; \ + fi; \ + done } mdsrate_cleanup () { diff --git a/lustre/utils/mount_utils_zfs.c b/lustre/utils/mount_utils_zfs.c index 0a0280a..9db2f8c 100644 --- a/lustre/utils/mount_utils_zfs.c +++ b/lustre/utils/mount_utils_zfs.c @@ -528,13 +528,12 @@ int zfs_make_lustre(struct mkfs_opts *mop) /* * Create the ZFS filesystem with any required mkfs options: * - canmount=off is set to prevent zfs automounting - * - version=4 is set because SA are not yet handled by the osd + * - xattr=sa is set to use system attribute based xattrs */ memset(mkfs_cmd, 0, PATH_MAX); snprintf(mkfs_cmd, PATH_MAX, "zfs create -o canmount=off -o xattr=sa%s %s", - zfs_mkfs_opts(mop, mkfs_tmp, PATH_MAX), - ds); + zfs_mkfs_opts(mop, mkfs_tmp, PATH_MAX), ds); vprint("mkfs_cmd = %s\n", mkfs_cmd); ret = run_command(mkfs_cmd, PATH_MAX); -- 1.8.3.1
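Illustration (not part of the patch): the readdir cursor repacking that the removed udmu_zap_cursor_serialize()/udmu_zap_cursor_init_serialized() performed, and which the replacement osd_zap_cursor_serialize() called from osd_it_acct_store() is presumably expected to keep, can be exercised in user space. The mask and shifts below mirror the removed code; the 48-bit constant matches the removed zap_hashbits() macro, and the sample hash/cd values are invented.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define ZAP_HASHBITS	48	/* value the removed zap_hashbits() macro used */

/* raw ZFS cursor |cd (16)|hash (48)| -> Lustre readdir offset
 * |0|hash (48)|cd (15)|, as in the removed udmu_zap_cursor_serialize() */
static uint64_t cursor_to_offset(uint64_t zfs_cursor)
{
	uint64_t h = zfs_cursor & (~0ULL >> 1);	/* keep 63 bits, as the removed code did */

	return (h >> ZAP_HASHBITS) | (h << (63 - ZAP_HASHBITS));
}

/* inverse transform, as in the removed udmu_zap_cursor_init_serialized() */
static uint64_t offset_to_cursor(uint64_t offset)
{
	return ((offset << ZAP_HASHBITS) & (~0ULL >> 1)) |
	       (offset >> (63 - ZAP_HASHBITS));
}

int main(void)
{
	uint64_t hash = 0x0000badc0ffee123ULL;	/* sample 48-bit ZAP hash */
	uint64_t cd   = 0x1234;			/* sample collision differentiator */
	uint64_t zfs  = (cd << ZAP_HASHBITS) | hash;
	uint64_t off  = cursor_to_offset(zfs);

	printf("zfs cursor 0x%016llx -> readdir offset 0x%016llx\n",
	       (unsigned long long)zfs, (unsigned long long)off);

	/* the repacking is reversible for the 63 bits that were kept */
	assert(offset_to_cursor(off) == (zfs & (~0ULL >> 1)));
	return 0;
}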
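Illustration (not part of the patch): the dnode-count estimator removed from udmu.c and re-introduced as osd_objs_count_estimate() (now also used by osd_objset_user_iused() for quota inode estimates) avoids overflowing 64 bits in nrblocks * usedobjs / refdblocks by folding the dnodes-per-block ratio into a 7-bit fixed-point factor first. The sketch below checks only that middle branch in user space; the real code additionally applies the upper/lower caps and the OSD_DNODE_EST_COUNT correction, and the sample numbers are invented.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Middle branch of the estimator: express dnodes-per-block as a 7-bit
 * fixed-point fraction, then scale the free block count by it. */
static uint64_t est_objs_fixed_point(uint64_t refdblocks, uint64_t usedobjs,
				     uint64_t nrblocks)
{
	uint64_t dnodes_per_block = (usedobjs << 7) / refdblocks;

	return (nrblocks * dnodes_per_block) >> 7;
}

int main(void)
{
	/* invented MDT-like numbers: 2^30 referenced 128K blocks holding
	 * 2^38 dnodes (256 dnodes/block), and 2^28 blocks still free */
	uint64_t refdblocks = 1ULL << 30;
	uint64_t usedobjs   = 1ULL << 38;
	uint64_t nrblocks   = 1ULL << 28;

	/* the naive nrblocks * usedobjs product would be 2^66 and overflow */
	uint64_t est   = est_objs_fixed_point(refdblocks, usedobjs, nrblocks);
	uint64_t exact = usedobjs / refdblocks * nrblocks; /* divides evenly here */

	printf("estimated free dnodes: %llu (exact %llu)\n",
	       (unsigned long long)est, (unsigned long long)exact);
	assert(est == exact);
	return 0;
}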
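Illustration (not part of the patch): the rewritten quota iterator no longer copies the cursor key into a scratch buffer; it reads za_name from zap_cursor_retrieve() and parses it as a hexadecimal id with kstrtoull(za->za_name, 16, &it->oiq_id). A user-space equivalent of that conversion; the entry name used here is hypothetical.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	const char *za_name = "3e8";	/* hypothetical accounting ZAP entry name */
	char *end;
	uint64_t id;

	/* same base-16 parse the iterator performs with kstrtoull() */
	errno = 0;
	id = strtoull(za_name, &end, 16);
	if (errno != 0 || *end != '\0') {
		fprintf(stderr, "bad accounting key: %s\n", za_name);
		return 1;
	}
	printf("entry \"%s\" -> uid/gid %llu\n", za_name, (unsigned long long)id);
	return 0;
}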