From 79b4ae9139cba69b2a81db25276cb2c0ae12a31c Mon Sep 17 00:00:00 2001 From: Alex Zhuravlev Date: Tue, 29 May 2012 14:45:32 +0400 Subject: [PATCH] LU-1305 osd: osd_handler.c for osd-zfs functions to setup/mount zfs backend Signed-off-by: Alex Zhuravlev Change-Id: I200709e0c86e4f7aae35528cbae7f7b08e094f47 Reviewed-on: http://review.whamcloud.com/2963 Tested-by: Hudson Reviewed-by: Andreas Dilger --- lustre/osd-zfs/osd_handler.c | 797 ++++++++++++++++++++++++++++++++++++++++++ lustre/osd-zfs/osd_internal.h | 403 +++++++++++++++++++++ 2 files changed, 1200 insertions(+) create mode 100644 lustre/osd-zfs/osd_handler.c create mode 100644 lustre/osd-zfs/osd_internal.h diff --git a/lustre/osd-zfs/osd_handler.c b/lustre/osd-zfs/osd_handler.c new file mode 100644 index 0000000..4efe29e --- /dev/null +++ b/lustre/osd-zfs/osd_handler.c @@ -0,0 +1,797 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. 
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2011, 2012 Whamcloud, Inc.
+ * Use is subject to license terms.
+ *
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/osd-zfs/osd_handler.c
+ * Top-level entry points into osd module
+ *
+ * Author: Alex Zhuravlev
+ * Author: Mike Pershin
+ * Author: Johann Lombardi
+ *
+ * NOTE(review): the header names on the bare #include lines below are
+ * missing from this copy of the patch; restore them from the original
+ * commit before applying.
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_OSD
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "osd_internal.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+struct lu_context_key osd_key;
+
+static char *root_tag = "osd_mount, rootdb"; /* hold tag used for the root dbuf in osd_mount() */
+
+/* Slab for OSD object allocation */
+cfs_mem_cache_t *osd_object_kmem;
+/*
+ * Cache descriptors handed to lu_kmem_init()/lu_kmem_fini() by
+ * osd_init()/osd_exit(); the entry with a NULL ckd_cache ends the table.
+ */
+static struct lu_kmem_descr osd_caches[] = {
+	{
+		.ckd_cache = &osd_object_kmem,
+		.ckd_name = "zfs_osd_obj",
+		.ckd_size = sizeof(struct osd_object)
+	},
+	{
+		.ckd_cache = NULL
+	}
+};
+/*
+ * Prune callback registered by osd_mount() via arc_add_prune_callback().
+ *
+ * \param bytes   amount passed in by the caller of the callback
+ * \param private the struct osd_device given at registration time
+ *
+ * Sets up a temporary LCT_SHRINKER environment and calls lu_site_purge()
+ * on the device's site with (bytes >> 10) as the count argument.  If the
+ * environment cannot be initialised the error is logged and nothing is
+ * purged.
+ */
+static void arc_prune_func(int64_t bytes, void *private)
+{
+	struct osd_device *od = private;
+	struct lu_site *site = &od->od_site;
+	struct lu_env env;
+	int rc;
+
+	rc = lu_env_init(&env, LCT_SHRINKER);
+	if (rc) {
+		CERROR("%s: can't initialize shrinker env: rc = %d\n",
+		       od->od_svname, rc);
+		return;
+	}
+
+	lu_site_purge(&env, site, (bytes >> 10));
+
+	lu_env_fini(&env);
+}
+
+/*
+ * Concurrency: doesn't access mutable data
+ *
+ * Fill \a f with the local-object FID built from OSD_FS_ROOT_OID.
+ * \a env and \a dev are not used.  Always returns 0.
+ */
+static int osd_root_get(const struct lu_env *env,
+			struct dt_device *dev, struct lu_fid *f)
+{
+	lu_local_obj_fid(f, OSD_FS_ROOT_OID);
+	return 0;
+}
+
+/*
+ * OSD object methods.
+ */
+
+/*
+ * Concurrency: shouldn't matter.
+ *
+ * Commit callback registered with dmu_tx_callback_register() by
+ * osd_trans_start().
+ *
+ * \param cb_data the struct osd_thandle of the transaction
+ * \param error   0 on success, otherwise the error reported for the tx
+ *
+ * Logs a non-zero \a error, runs dt_txn_hook_commit() and every callback
+ * queued on ot_dcb_list, then releases what osd_trans_start() took (the
+ * device reference and the entered th_ctx) and frees the handle.
+ */
+static void osd_trans_commit_cb(void *cb_data, int error)
+{
+	struct osd_thandle *oh = cb_data;
+	struct thandle *th = &oh->ot_super;
+	struct lu_device *lud = &th->th_dev->dd_lu_dev;
+	struct dt_txn_commit_cb *dcb, *tmp;
+
+	ENTRY;
+
+	if (error) {
+		if (error == ECANCELED)
+			CWARN("%s: transaction @0x%p was aborted\n",
+			      osd_dt_dev(th->th_dev)->od_svname, th);
+		else
+			CERROR("%s: transaction @0x%p commit error: rc = %d\n",
+			       osd_dt_dev(th->th_dev)->od_svname, th, error);
+	}
+
+	dt_txn_hook_commit(th);
+
+	/* call per-transaction callbacks if any */
+	cfs_list_for_each_entry_safe(dcb, tmp, &oh->ot_dcb_list, dcb_linkage)
+		dcb->dcb_func(NULL, th, dcb, error);
+
+	lu_device_put(lud); /* lud was captured above because th_dev is cleared next */
+	th->th_dev = NULL;
+	lu_context_exit(&th->th_ctx);
+	lu_context_fini(&th->th_ctx);
+	OBD_FREE_PTR(oh);
+
+	EXIT;
+}
+/*
+ * Queue \a dcb on the handle's ot_dcb_list; osd_trans_commit_cb() walks
+ * that list when the transaction commits.  Always returns 0.
+ */
+static int osd_trans_cb_add(struct thandle *th, struct dt_txn_commit_cb *dcb)
+{
+	struct osd_thandle *oh;
+
+	oh = container_of0(th, struct osd_thandle, ot_super);
+	cfs_list_add(&dcb->dcb_linkage, &oh->ot_dcb_list);
+
+	return 0;
+}
+
+/*
+ * Concurrency: shouldn't matter.
+ *
+ * Start a transaction created by osd_trans_create(): run
+ * dt_txn_hook_start(), then assign the DMU tx with TXG_WAIT.
+ *
+ * On success the commit callback is registered, the handle is marked
+ * assigned, th_ctx is initialised and entered and a device reference is
+ * taken; osd_trans_commit_cb() undoes all of that.
+ *
+ * \retval 0       transaction assigned
+ * \retval -ENOSPC OBD_FAIL_OST_MAPBLK_ENOSPC triggered on a handle with
+ *                 ot_write_commit set, or returned by the assignment
+ * \retval other   negative value from the start hook or the assignment
+ */
+static int osd_trans_start(const struct lu_env *env, struct dt_device *d,
+			   struct thandle *th)
+{
+	struct osd_thandle *oh;
+	int rc;
+	ENTRY;
+
+	oh = container_of0(th, struct osd_thandle, ot_super);
+	LASSERT(oh);
+	LASSERT(oh->ot_tx);
+
+	rc = dt_txn_hook_start(env, d, th);
+	if (rc != 0)
+		RETURN(rc);
+
+	if (oh->ot_write_commit && OBD_FAIL_CHECK(OBD_FAIL_OST_MAPBLK_ENOSPC))
+		/* Unlike ldiskfs, ZFS checks for available space and returns
+		 * -ENOSPC when assigning txg */
+		RETURN(-ENOSPC);
+
+	rc = -dmu_tx_assign(oh->ot_tx, TXG_WAIT); /* result is negated before use */
+	if (unlikely(rc != 0)) {
+		struct osd_device *osd = osd_dt_dev(d);
+		/* dmu will call commit callback with error code during abort */
+		if (!lu_device_is_md(&d->dd_lu_dev) && rc == -ENOSPC)
+			CERROR("%s: failed to start transaction due to ENOSPC. "
+			       "Metadata overhead is underestimated or "
+			       "grant_ratio is too low.\n",
+			       osd->od_dt_dev.dd_lu_dev.ld_obd->obd_name);
+		else
+			CERROR("%s: can't assign tx: rc = %d\n",
+			       osd->od_dt_dev.dd_lu_dev.ld_obd->obd_name, rc);
+	} else {
+		/* add commit callback */
+		dmu_tx_callback_register(oh->ot_tx, osd_trans_commit_cb, oh);
+		oh->ot_assigned = 1;
+		lu_context_init(&th->th_ctx, th->th_tags);
+		lu_context_enter(&th->th_ctx);
+		lu_device_get(&d->dd_lu_dev);
+	}
+
+	RETURN(rc);
+}
+
+/*
+ * Concurrency: shouldn't matter.
+ *
+ * Finish a transaction.
+ *
+ * If the handle was never assigned (ot_assigned == 0) the DMU tx is
+ * aborted and the handle is freed here; 0 is returned.
+ *
+ * Otherwise dt_txn_hook_stop() is run, the tx is committed and, when
+ * th_sync is set, the call waits for the tx's txg to be synced.  The
+ * handle is not freed on this path; osd_trans_commit_cb() frees it.
+ * The return value is the result of the stop hook.
+ */
+static int osd_trans_stop(const struct lu_env *env, struct thandle *th)
+{
+	struct osd_device *osd = osd_dt_dev(th->th_dev);
+	struct osd_thandle *oh;
+	uint64_t txg;
+	int rc;
+	ENTRY;
+
+	oh = container_of0(th, struct osd_thandle, ot_super);
+
+	if (oh->ot_assigned == 0) {
+		LASSERT(oh->ot_tx);
+		dmu_tx_abort(oh->ot_tx);
+		osd_object_sa_dirty_rele(oh);
+		OBD_FREE_PTR(oh);
+		RETURN(0);
+	}
+
+	rc = dt_txn_hook_stop(env, th);
+	if (rc != 0)
+		CDEBUG(D_OTHER, "%s: transaction hook failed: rc = %d\n",
+		       osd->od_svname, rc);
+
+	LASSERT(oh->ot_tx);
+	txg = oh->ot_tx->tx_txg; /* read before dmu_tx_commit() */
+
+	osd_object_sa_dirty_rele(oh);
+	dmu_tx_commit(oh->ot_tx);
+
+	if (th->th_sync)
+		txg_wait_synced(dmu_objset_pool(osd->od_objset.os), txg);
+
+	RETURN(rc);
+}
+/*
+ * Create a DMU tx on the device's objset and wrap it in a zeroed
+ * struct osd_thandle with empty callback and SA lists.
+ *
+ * \retval the embedded struct thandle (th_tags = LCT_TX_HANDLE)
+ * \retval ERR_PTR(-ENOMEM) if the tx or the handle cannot be allocated
+ */
+static struct thandle *osd_trans_create(const struct lu_env *env,
+					struct dt_device *dt)
+{
+	struct osd_device *osd = osd_dt_dev(dt);
+	struct osd_thandle *oh;
+	struct thandle *th;
+	dmu_tx_t *tx;
+	ENTRY;
+
+	tx = dmu_tx_create(osd->od_objset.os);
+	if (tx == NULL)
+		RETURN(ERR_PTR(-ENOMEM));
+
+	/* alloc callback data */
+	OBD_ALLOC_PTR(oh);
+	if (oh == NULL) {
+		dmu_tx_abort(tx);
+		RETURN(ERR_PTR(-ENOMEM));
+	}
+
+	oh->ot_tx = tx;
+	CFS_INIT_LIST_HEAD(&oh->ot_dcb_list);
+	CFS_INIT_LIST_HEAD(&oh->ot_sa_list);
+	cfs_sema_init(&oh->ot_sa_lock, 1);
+	th = &oh->ot_super;
+	th->th_dev = dt;
+	th->th_result = 0;
+	th->th_tags = LCT_TX_HANDLE;
+	RETURN(th);
+}
+
+/*
+ * Concurrency: shouldn't matter.
+ *
+ * Fill \a osfs from udmu_objset_statfs() and then lower os_bavail by
+ * OSD_GRANT_FOR_LOCAL_OIDS expressed in os_bsize blocks, clamped so the
+ * result never drops below zero.
+ *
+ * \retval 0     success
+ * \retval other value returned by udmu_objset_statfs()
+ */
+int osd_statfs(const struct lu_env *env, struct dt_device *d,
+	       struct obd_statfs *osfs)
+{
+	struct osd_device *zdev = osd_dt_dev(d);
+	obd_size reserved;
+	obd_size avail;
+	int err;
+	ENTRY;
+
+	err = udmu_objset_statfs(&zdev->od_objset, osfs);
+	if (err != 0)
+		RETURN(err);
+
+	/* subtract the smaller of the reservation and what is available */
+	reserved = OSD_GRANT_FOR_LOCAL_OIDS / osfs->os_bsize;
+	avail = osfs->os_bavail;
+	if (avail < reserved)
+		reserved = avail;
+	osfs->os_bavail -= reserved;
+
+	RETURN(0);
+}
+
+/*
+ * Concurrency: doesn't access mutable data.
+ *
+ * Report the static device parameters of this backend in \a param.
+ * \a env and \a dev are not used.
+ */
+static void osd_conf_get(const struct lu_env *env,
+			 const struct dt_device *dev,
+			 struct dt_device_param *param)
+{
+	struct dt_device_param *p = param;
+
+	/* backend identity and mount options */
+	p->ddp_mount_type = LDD_MT_ZFS;
+	p->ddp_mntopts = MNTOPT_USERXATTR | MNTOPT_ACL;
+
+	/*
+	 * Limits.
+	 * XXX should be taken from not-yet-existing fs abstraction layer.
+	 */
+	p->ddp_max_name_len = MAXNAMELEN;
+	p->ddp_max_nlink = 1 << 31; /* it's 8byte on a disk */
+	p->ddp_block_shift = 12; /* XXX */
+	p->ddp_max_ea_size = DXATTR_MAX_ENTRY_SIZE;
+	/* maxbytes: same value as the ZPL reports */
+	p->ddp_maxbytes = MAX_LFS_FILESIZE;
+
+	/*
+	 * Grant accounting.
+	 *
+	 * 20% of the available space is held back by default and not given
+	 * out as grant: many factors influence the overhead with zfs, so
+	 * the margin is deliberately cautious.  The value can be changed on
+	 * a live system via procfs.
+	 */
+	p->ddp_grant_reserved = 20;
+	/*
+	 * Inodes are allocated dynamically, so a per-inode space estimate
+	 * is reported to the upper layers.  The static value is not really
+	 * accurate; the logic of udmu_objset_statfs() should be reused to
+	 * estimate the real size consumed by an object.
+	 */
+	p->ddp_inodespace = OSD_DNODE_EST_COUNT;
+	/* per-fragment overhead, consumed by the client code */
+	p->ddp_grant_frag = udmu_blk_insert_cost();
+}
+
+/*
+ * Concurrency: shouldn't matter.
+ *
+ * Wait until everything written so far is on stable storage:
+ * txg_wait_synced() is called on the objset's pool with txg 0.
+ * Always returns 0.
+ */
+static int osd_sync(const struct lu_env *env, struct dt_device *d)
+{
+	struct osd_device *osd = osd_dt_dev(d);
+	CDEBUG(D_HA, "syncing OSD %s\n", LUSTRE_OSD_ZFS_NAME);
+	txg_wait_synced(dmu_objset_pool(osd->od_objset.os), 0ULL);
+	return 0;
+}
+
+/*
+ * Ask the pool to start committing the currently open txg without
+ * waiting for it.
+ *
+ * The quiesce target is raised to (open txg + 1) if it is not already
+ * that high, and waiters on tx_quiesce_more_cv are woken.  The txg state
+ * is read and updated under tx_sync_lock; the lock was previously
+ * released here without having been taken.  Always returns 0.
+ */
+static int osd_commit_async(const struct lu_env *env, struct dt_device *dev)
+{
+	struct osd_device *osd = osd_dt_dev(dev);
+	tx_state_t *tx = &dmu_objset_pool(osd->od_objset.os)->dp_tx;
+	uint64_t txg;
+
+	mutex_enter(&tx->tx_sync_lock);
+	txg = tx->tx_open_txg + 1;
+	if (tx->tx_quiesce_txg_waiting < txg) {
+		tx->tx_quiesce_txg_waiting = txg;
+		cv_broadcast(&tx->tx_quiesce_more_cv);
+	}
+	mutex_exit(&tx->tx_sync_lock);
+
+	return 0;
+}
+
+/*
+ * Concurrency: shouldn't matter.
+ *
+ * Switch the device to read-only: log the event, set od_rdonly and call
+ * spa_freeze() on the objset's pool.  Always returns 0.
+ */
+static int osd_ro(const struct lu_env *env, struct dt_device *d)
+{
+	struct osd_device *osd = osd_dt_dev(d);
+	ENTRY;
+
+	CERROR("%s: *** setting device %s read-only ***\n",
+	       osd->od_svname, LUSTRE_OSD_ZFS_NAME);
+	osd->od_rdonly = 1;
+	spa_freeze(dmu_objset_spa(osd->od_objset.os));
+
+	RETURN(0);
+}
+
+/*
+ * Concurrency: serialization provided by callers.
+ *
+ * Store the capability settings (\a mode, \a timeout, \a alg, \a keys)
+ * in the device.  Always returns 0.
+ */
+static int osd_init_capa_ctxt(const struct lu_env *env, struct dt_device *d,
+			      int mode, unsigned long timeout, __u32 alg,
+			      struct lustre_capa_key *keys)
+{
+	struct osd_device *dev = osd_dt_dev(d);
+	ENTRY;
+
+	dev->od_fl_capa = mode;
+	dev->od_capa_timeout = timeout;
+	dev->od_capa_alg = alg;
+	dev->od_capa_keys = keys;
+
+	RETURN(0);
+}
+/*
+ * Read the DMU_OSD_SVNAME user property of the objset into od_svname.
+ *
+ * \retval pointer to od_svname on success
+ * \retval NULL if the property cannot be read; a warning is logged when
+ *         the failure is -EOVERFLOW
+ */
+static char *osd_label_get(const struct lu_env *env, const struct dt_device *d)
+{
+	struct osd_device *dev = osd_dt_dev(d);
+	int rc;
+	ENTRY;
+
+	rc = -udmu_userprop_get_str(&dev->od_objset, DMU_OSD_SVNAME,
+				    dev->od_svname, sizeof(dev->od_svname));
+	if (rc != 0) {
+		if (rc == -EOVERFLOW)
+			CWARN("%s: buffer too small\n", dev->od_svname);
+		RETURN(NULL);
+	}
+
+	RETURN(&dev->od_svname[0]);
+}
+/* dt_device method table; installed on the device by osd_device_init0() */
+static struct dt_device_operations osd_dt_ops = {
+	.dt_root_get = osd_root_get,
+	.dt_statfs = osd_statfs,
+	.dt_trans_create = osd_trans_create,
+	.dt_trans_start = osd_trans_start,
+	.dt_trans_stop = osd_trans_stop,
+	.dt_trans_cb_add = osd_trans_cb_add,
+	.dt_conf_get = osd_conf_get,
+	.dt_sync = osd_sync,
+	.dt_commit_async = osd_commit_async,
+	.dt_ro = osd_ro,
+	.dt_init_capa_ctxt = osd_init_capa_ctxt,
+};
+
+/*
+ * DMU OSD device type methods
+ *
+ * osd_type_init()/osd_type_fini() register and deregister osd_key.
+ */
+static int osd_type_init(struct lu_device_type *t)
+{
+	LU_CONTEXT_KEY_INIT(&osd_key);
+	return lu_context_key_register(&osd_key);
+}
+
+static void osd_type_fini(struct lu_device_type *t)
+{
+	lu_context_key_degister(&osd_key);
+}
+/*
+ * osd_key constructor: allocate the per-context struct osd_thread_info
+ * and record the enclosing lu_env in oti_env.
+ * Returns the new info or ERR_PTR(-ENOMEM).
+ */
+static void *osd_key_init(const struct lu_context *ctx,
+			  struct lu_context_key *key)
+{
+	struct osd_thread_info *info;
+
+	OBD_ALLOC_PTR(info);
+	if (info != NULL)
+		info->oti_env = container_of(ctx, struct lu_env, le_ctx);
+	else
+		info = ERR_PTR(-ENOMEM);
+	return info;
+}
+/* osd_key destructor: free the struct osd_thread_info in \a data */
+static void osd_key_fini(const struct lu_context *ctx,
+			 struct lu_context_key *key, void *data)
+{
+	struct osd_thread_info *info = data;
+
+	OBD_FREE_PTR(info);
+}
+/*
+ * osd_key exit hook: zero the whole struct osd_thread_info.
+ * NOTE(review): this also clears oti_env, which only osd_key_init()
+ * sets - confirm nothing reads oti_env after a context exit.
+ */
+static void osd_key_exit(const struct lu_context *ctx,
+			 struct lu_context_key *key, void *data)
+{
+	struct osd_thread_info *info = data;
+
+	memset(info, 0, sizeof(*info));
+}
+
+struct lu_context_key osd_key = {
+	.lct_tags = LCT_DT_THREAD | LCT_MD_THREAD | LCT_MG_THREAD | LCT_LOCAL,
+	.lct_init = osd_key_init,
+	.lct_fini = osd_key_fini,
+	.lct_exit = osd_key_exit
+};
+/* LCFG_CLEANUP handler; does nothing and returns 0 */
+static int osd_shutdown(const struct lu_env *env, struct osd_device *o)
+{
+	RETURN(0);
+}
+/*
+ * LCFG_SETUP handler: open the ZFS objset behind the device and bring
+ * up the pieces that depend on it.
+ *
+ * Steps, in order: look up the mount named by config string 0, copy its
+ * lmd_dev into od_mntdev, open the objset, record the root object id,
+ * initialise the object index, read the label, register procfs entries
+ * and finally register arc_prune_func() as a prune callback.
+ *
+ * \retval 0       mounted, or the objset was already open
+ * \retval -ENODEV unknown mount point, or the label cannot be read
+ * \retval -E2BIG  device name does not fit in od_mntdev
+ * \retval other   negative value from the failing step
+ *
+ * NOTE(review): nothing is unwound on the -E2BIG return, on the objset
+ * open failure or at the "err" label (the result of server_get_mount(),
+ * the open objset, the object index).  Confirm against the callers
+ * whether osd_device_fini() is relied on for that.
+ */
+static int osd_mount(const struct lu_env *env,
+		     struct osd_device *o, struct lustre_cfg *cfg)
+{
+	char *dev = lustre_cfg_string(cfg, 0);
+	struct lustre_mount_info *lmi;
+	struct lustre_sb_info *lsi;
+	dmu_buf_t *rootdb;
+	char *label;
+	int rc;
+	ENTRY;
+
+	if (o->od_objset.os != NULL)
+		RETURN(0);
+
+	lmi = server_get_mount(dev);
+	if (lmi == NULL) {
+		CERROR("Unknown mount point: '%s'\n", dev);
+		RETURN(-ENODEV);
+	}
+
+	lsi = s2lsi(lmi->lmi_sb);
+	dev = lsi->lsi_lmd->lmd_dev; /* from here on dev is the mount's device name */
+
+	if (strlen(dev) >= sizeof(o->od_mntdev))
+		RETURN(-E2BIG);
+
+	strcpy(o->od_mntdev, dev); /* length was checked just above */
+
+	/* the first character of od_mntdev is skipped when opening;
+	 * NOTE(review): presumably a leading separator - confirm */
+	rc = -udmu_objset_open(o->od_mntdev + 1, &o->od_objset);
+	if (rc) {
+		CERROR("can't open objset %s: %d\n", o->od_mntdev, rc);
+		RETURN(rc);
+	}
+
+	rc = __osd_obj2dbuf(env, o->od_objset.os, o->od_objset.root,
+			    &rootdb, root_tag);
+	if (rc) {
+		CERROR("udmu_obj2dbuf() failed with error %d\n", rc);
+		udmu_objset_close(&o->od_objset);
+		RETURN(rc);
+	}
+
+	o->od_root = rootdb->db_object;
+	sa_buf_rele(rootdb, root_tag);
+
+	/* 1. initialize oi before any file create or file open */
+	rc = osd_oi_init(env, o);
+	if (rc)
+		GOTO(err, rc);
+
+	label = osd_label_get(env, &o->od_dt_dev);
+	if (label == NULL)
+		GOTO(err, rc = -ENODEV);
+
+	/* Use our own ZAP for inode accounting by default, this can be changed
+	 * via procfs to estimate the inode usage from the block usage */
+	o->od_quota_iused_est = 0;
+
+	rc = osd_procfs_init(o, label);
+	if (rc)
+		GOTO(err, rc);
+
+	o->arc_prune_cb = arc_add_prune_callback(arc_prune_func, o);
+
+err:
+	RETURN(rc);
+}
+/*
+ * Report any zerocopy buffers still accounted for (non-zero
+ * od_zerocopy_alloc/loan/pin counters) and close the objset if open.
+ */
+static void osd_umount(const struct lu_env *env, struct osd_device *o)
+{
+	ENTRY;
+
+	if (cfs_atomic_read(&o->od_zerocopy_alloc))
+		CERROR("%s: lost %d allocated page(s)\n", o->od_svname,
+		       cfs_atomic_read(&o->od_zerocopy_alloc));
+	if (cfs_atomic_read(&o->od_zerocopy_loan))
+		CERROR("%s: lost %d loaned abuf(s)\n", o->od_svname,
+		       cfs_atomic_read(&o->od_zerocopy_loan));
+	if (cfs_atomic_read(&o->od_zerocopy_pin))
+		CERROR("%s: lost %d pinned dbuf(s)\n", o->od_svname,
+		       cfs_atomic_read(&o->od_zerocopy_pin));
+
+	if (o->od_objset.os != NULL)
+		udmu_objset_close(&o->od_objset);
+
+	EXIT;
+}
+/*
+ * Second half of osd_device_alloc(): refill the env keys, install
+ * osd_lu_ops and osd_dt_ops and create the capability hash.
+ * \a cfg is not used.
+ *
+ * \retval 0       success
+ * \retval -ENOMEM init_capa_hash() returned NULL
+ * \retval other   value returned by lu_env_refill()
+ */
+static int osd_device_init0(const struct lu_env *env,
+			    struct osd_device *o,
+			    struct lustre_cfg *cfg)
+{
+	struct lu_device *l = osd2lu_dev(o);
+	int rc;
+
+	/* if the module was re-loaded, env can lose its keys */
+	rc = lu_env_refill((struct lu_env *) env);
+	if (rc)
+		GOTO(out, rc);
+
+	l->ld_ops = &osd_lu_ops;
+	o->od_dt_dev.dd_ops = &osd_dt_ops;
+
+	o->od_capa_hash = init_capa_hash();
+	if (o->od_capa_hash == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+out:
+	RETURN(rc);
+}
+/*
+ * ldto_device_alloc method: allocate a struct osd_device, initialise its
+ * dt_device part and run osd_device_init0().
+ *
+ * Returns the embedded lu_device, or ERR_PTR() with -ENOMEM or the
+ * error of the failing step; on failure everything set up here is
+ * released again.
+ */
+static struct lu_device *osd_device_alloc(const struct lu_env *env,
+					  struct lu_device_type *t,
+					  struct lustre_cfg *cfg)
+{
+	struct osd_device *o;
+	int rc;
+
+	OBD_ALLOC_PTR(o);
+	if (o == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	rc = dt_device_init(&o->od_dt_dev, t);
+	if (rc == 0) {
+		rc = osd_device_init0(env, o, cfg);
+		if (rc)
+			dt_device_fini(&o->od_dt_dev);
+	}
+
+	if (unlikely(rc != 0))
+		OBD_FREE_PTR(o);
+
+	return rc == 0 ? osd2lu_dev(o) : ERR_PTR(rc);
+}
+/*
+ * ldto_device_free method: drop the capability hash, finalise the
+ * dt_device part and free the struct osd_device.  Always returns NULL.
+ */
+static struct lu_device *osd_device_free(const struct lu_env *env,
+					 struct lu_device *d)
+{
+	struct osd_device *o = osd_dev(d);
+	ENTRY;
+
+	cleanup_capa_hash(o->od_capa_hash);
+	/* XXX: make osd top device in order to release reference */
+	/*d->ld_site->ls_top_dev = d;
+	lu_site_purge(env, d->ld_site, -1);
+	lu_site_fini(&o->od_site);*/
+	dt_device_fini(&o->od_dt_dev);
+	OBD_FREE_PTR(o);
+
+	RETURN (NULL);
+}
+/*
+ * ldto_device_fini method: tear down what osd_mount() set up.
+ *
+ * Order: finalise the object index; if the objset is open, remove the
+ * prune callback, sync and wait for outstanding txg callbacks; remove
+ * the procfs entries; unmount; finally look the mount up by od_svname
+ * and call server_put_mount() on it.
+ *
+ * Returns NULL on success, or ERR_PTR(rc) if osd_procfs_fini() fails,
+ * in which case the unmount and the server_put_mount() are skipped.
+ */
+static struct lu_device *osd_device_fini(const struct lu_env *env,
+					 struct lu_device *d)
+{
+	struct osd_device *o = osd_dev(d);
+	struct lustre_mount_info *lmi;
+	int rc;
+	ENTRY;
+
+
+	osd_oi_fini(env, o);
+
+	if (o->od_objset.os) {
+		arc_remove_prune_callback(o->arc_prune_cb);
+		o->arc_prune_cb = NULL;
+		osd_sync(env, lu2dt_dev(d));
+		txg_wait_callbacks(spa_get_dsl(dmu_objset_spa(o->od_objset.os)));
+	}
+
+	rc = osd_procfs_fini(o);
+	if (rc) {
+		CERROR("proc fini error %d\n", rc);
+		RETURN(ERR_PTR(rc));
+	}
+
+	if (o->od_objset.os)
+		osd_umount(env, o);
+
+	lmi = server_get_mount_2(o->od_svname);
+	LASSERT(lmi);
+	server_put_mount(lmi->lmi_name, lmi->lmi_mnt);
+
+	RETURN(NULL);
+}
+/* ldto_device_init method; nothing to do here, returns 0 */
+static int osd_device_init(const struct lu_env *env, struct lu_device *d,
+			   const char *name, struct lu_device *next)
+{
+	return 0;
+}
+
+/*
+ * To be removed, setup is performed by osd_device_{init,alloc} and
+ * cleanup is performed by osd_device_{fini,free}.
+ *
+ * Dispatch a configuration command: LCFG_SETUP mounts the backend,
+ * LCFG_CLEANUP shuts it down, anything else yields -ENOTTY.
+ */
+static int osd_process_config(const struct lu_env *env,
+			      struct lu_device *d, struct lustre_cfg *cfg)
+{
+	struct osd_device *osd = osd_dev(d);
+	int rc;
+	ENTRY;
+
+	if (cfg->lcfg_command == LCFG_SETUP)
+		rc = osd_mount(env, osd, cfg);
+	else if (cfg->lcfg_command == LCFG_CLEANUP)
+		rc = osd_shutdown(env, osd);
+	else
+		rc = -ENOTTY;
+
+	RETURN(rc);
+}
+
+/* ldo_recovery_complete method: nothing to do, reports success */
+static int osd_recovery_complete(const struct lu_env *env, struct lu_device *d)
+{
+	ENTRY;
+	RETURN(0);
+}
+
+/* ldo_prepare method: nothing to do, reports success */
+static int osd_prepare(const struct lu_env *env, struct lu_device *pdev,
+		       struct lu_device *dev)
+{
+	return 0;
+}
+
+/* lu_device methods; also used by lu_device_is_osd() to recognise us */
+struct lu_device_operations osd_lu_ops = {
+	.ldo_process_config = osd_process_config,
+	.ldo_prepare = osd_prepare,
+	.ldo_recovery_complete = osd_recovery_complete,
+	.ldo_object_alloc = osd_object_alloc,
+};
+
+/* ldto_start method: intentionally empty */
+static void osd_type_start(struct lu_device_type *t)
+{
+}
+
+/* ldto_stop method: intentionally empty */
+static void osd_type_stop(struct lu_device_type *t)
+{
+}
+
+/* device-type methods: type setup, then per-device alloc/init/fini/free */
+static struct lu_device_type_operations osd_device_type_ops = {
+	.ldto_init = osd_type_init,
+	.ldto_start = osd_type_start,
+	.ldto_stop = osd_type_stop,
+	.ldto_fini = osd_type_fini,
+
+	.ldto_device_alloc = osd_device_alloc,
+	.ldto_device_init = osd_device_init,
+	.ldto_device_fini = osd_device_fini,
+	.ldto_device_free = osd_device_free
+};
+
+/* the device type registered by osd_init() */
+static struct lu_device_type osd_device_type = {
+	.ldt_name = LUSTRE_OSD_ZFS_NAME,
+	.ldt_tags = LU_DEVICE_DT,
+	.ldt_ctx_tags = LCT_LOCAL,
+	.ldt_ops = &osd_device_type_ops
+};
+
+
+static struct obd_ops osd_obd_device_ops = {
+	.o_owner = THIS_MODULE,
+};
+
+/*
+ * Module entry point: set up the options, then the slab caches, then
+ * register the device type.  The caches are released again if the
+ * registration fails.  Returns 0 or the error of the failing step.
+ */
+int __init osd_init(void)
+{
+	int err;
+
+	err = osd_options_init();
+	if (err == 0)
+		err = lu_kmem_init(osd_caches);
+	if (err != 0)
+		return err;
+
+	err = class_register_type(&osd_obd_device_ops, NULL,
+				  lprocfs_osd_module_vars,
+				  LUSTRE_OSD_ZFS_NAME, &osd_device_type);
+	if (err != 0)
+		lu_kmem_fini(osd_caches);
+
+	return err;
+}
+
+/* Module exit point: unregister the device type, then drop the caches */
+void __exit osd_exit(void)
+{
+	class_unregister_type(LUSTRE_OSD_ZFS_NAME);
+	lu_kmem_fini(osd_caches);
+}
+ +extern unsigned int osd_oi_count; +CFS_MODULE_PARM(osd_oi_count, "i", int, 0444, + "Number of Object Index containers to be created, " + "it's only valid for new filesystem."); + +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("Lustre Object Storage Device ("LUSTRE_OSD_ZFS_NAME")"); +MODULE_LICENSE("GPL"); + +cfs_module(osd, LUSTRE_VERSION_STRING, osd_init, osd_exit); diff --git a/lustre/osd-zfs/osd_internal.h b/lustre/osd-zfs/osd_internal.h new file mode 100644 index 0000000..45bac12 --- /dev/null +++ b/lustre/osd-zfs/osd_internal.h @@ -0,0 +1,403 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2011, 2012 Whamcloud, Inc. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ *
+ * lustre/osd-zfs/osd_internal.h
+ * Shared definitions and declarations for zfs/dmu osd
+ *
+ * Author: Alex Zhuravlev
+ * Author: Mike Pershin
+ * Author: Johann Lombardi
+ *
+ * NOTE(review): the header names on the bare #include lines below are
+ * missing from this copy of the patch; restore them from the original
+ * commit before applying.
+ */
+
+#ifndef _OSD_INTERNAL_H
+#define _OSD_INTERNAL_H
+
+#include
+#include
+
+#include
+
+#include "udmu.h"
+
+#define LUSTRE_ROOT_FID_SEQ	0
+#define DMU_OSD_SVNAME		"svname" /* user property read by osd_label_get() */
+#define DMU_OSD_OI_NAME_BASE	"oi"
+
+#define OSD_GFP_IO		(GFP_NOFS | __GFP_HIGHMEM)
+
+/**
+ * Iterator's in-memory data structure for quota file.
+ */
+struct osd_it_quota {
+	struct osd_object	*oiq_obj;
+	/* DMU accounting object id */
+	uint64_t		 oiq_oid;
+	/* ZAP cursor */
+	zap_cursor_t		*oiq_zc;
+	/** identifier for current quota record */
+	__u64			 oiq_id;
+	unsigned		 oiq_reset:1; /* 1 -- no need to advance */
+};
+
+/**
+ * Iterator's in-memory data structure for ZAPs
+ */
+struct osd_zap_it {
+	zap_cursor_t		*ozi_zc;
+	struct osd_object	*ozi_obj;
+	struct lustre_capa	*ozi_capa;
+	unsigned		 ozi_reset:1; /* 1 -- no need to advance */
+	union {
+		char		 ozi_name[NAME_MAX + 1]; /* file name for dir */
+		__u64		 ozi_key; /* binary key for index files */
+	};
+};
+#define DT_IT2DT(it) (&((struct osd_zap_it *)it)->ozi_obj->oo_dt)
+
+/*
+ * regular ZFS direntry
+ */
+struct zpl_direntry {
+	uint64_t	zde_dnode:48,
+			zde_pad:12,
+			zde_type:4;
+} __attribute__((packed));
+
+/*
+ * lustre direntry adds a fid to regular ZFS direntry
+ */
+struct luz_direntry {
+	struct zpl_direntry	lzd_reg;
+	struct lu_fid		lzd_fid;
+} __attribute__((packed));
+
+
+/* cached SA attributes */
+struct osa_attr {
+	uint64_t	mode;
+	uint64_t	gid;
+	uint64_t	uid;
+	uint64_t	nlink;
+	uint64_t	rdev;
+	uint64_t	flags;
+	uint64_t	size;
+	uint64_t	atime[2];
+	uint64_t	mtime[2];
+	uint64_t	ctime[2];
+};
+/*
+ * Per-context scratch data attached through osd_key: allocated by
+ * osd_key_init(), zeroed by osd_key_exit(), freed by osd_key_fini().
+ */
+struct osd_thread_info {
+	const struct lu_env	*oti_env; /* set by osd_key_init() */
+
+	struct lu_fid		 oti_fid;
+	/*
+	 * XXX temporary: for ->i_op calls.
+	 */
+	struct timespec		 oti_time;
+	/*
+	 * XXX temporary: for capa operations.
+	 */
+	struct lustre_capa_key	 oti_capa_key;
+	struct lustre_capa	 oti_capa;
+
+	struct ost_id		 oti_ostid;
+
+	char			 oti_buf[64];
+
+	/** osd iterator context used for iterator session */
+	union {
+		struct osd_zap_it	oti_it_zap;
+		struct osd_it_quota	oti_it_quota;
+	};
+
+	char			 oti_str[64];
+	char			 oti_key[MAXNAMELEN + 1];
+
+	struct lu_attr		 oti_la;
+	struct osa_attr		 oti_osa;
+	zap_attribute_t		 oti_za;
+	dmu_object_info_t	 oti_doi;
+	struct luz_direntry	 oti_zde;
+};
+
+extern struct lu_context_key osd_key;
+/* Return the value stored under osd_key in the context of \a env */
+static inline struct osd_thread_info *osd_oti_get(const struct lu_env *env)
+{
+	return lu_context_key_get(&env->le_ctx, &osd_key);
+}
+/*
+ * Transaction handle: a struct thandle plus the DMU tx behind it.
+ * Created by osd_trans_create().
+ */
+struct osd_thandle {
+	struct thandle		 ot_super;
+	cfs_list_t		 ot_dcb_list; /* commit callbacks, see osd_trans_cb_add() */
+	cfs_list_t		 ot_sa_list;
+	cfs_semaphore_t		 ot_sa_lock;
+	dmu_tx_t		*ot_tx;
+	__u32			 ot_write_commit:1,
+				 ot_assigned:1; /* set once osd_trans_start() assigned ot_tx */
+};
+
+#define OSD_OI_NAME_SIZE	16
+
+/*
+ * Object Index (OI) instance.
+ */
+struct osd_oi {
+	char		oi_name[OSD_OI_NAME_SIZE]; /* unused */
+	uint64_t	oi_zapid;
+};
+
+#define OSD_OST_MAP_SIZE	32
+
+/*
+ * osd device.
+ */
+struct osd_device {
+	/* super-class */
+	struct dt_device	 od_dt_dev;
+	/* information about underlying file system */
+	udmu_objset_t		 od_objset;
+
+	/*
+	 * Fid Capability
+	 */
+	unsigned int		 od_fl_capa:1;
+	unsigned long		 od_capa_timeout;
+	__u32			 od_capa_alg;
+	struct lustre_capa_key	*od_capa_keys;
+	cfs_hlist_head_t	*od_capa_hash;
+
+	cfs_proc_dir_entry_t	*od_proc_entry;
+	struct lprocfs_stats	*od_stats;
+
+	uint64_t		 od_root;
+	struct osd_oi		**od_oi_table;
+	unsigned int		 od_oi_count;
+	uint64_t		 od_ost_compat_dirs[OSD_OST_MAP_SIZE];
+	uint64_t		 od_ost_compat_grp0;
+
+	unsigned int		 od_rdonly:1,
+				 od_quota_iused_est:1;
+	char			 od_mntdev[128];
+	char			 od_svname[128];
+
+	int			 od_connects;
+	struct lu_site		 od_site;
+
+	/* object IDs of the inode accounting indexes */
+	uint64_t		 od_iusr_oid;
+	uint64_t		 od_igrp_oid;
+
+	/* used to debug zerocopy logic: the fields track all
+	 * allocated, loaned and referenced buffers in use.
+	 * to be removed once the change is tested well. */
+	cfs_atomic_t		 od_zerocopy_alloc;
+	cfs_atomic_t		 od_zerocopy_loan;
+	cfs_atomic_t		 od_zerocopy_pin;
+
+	arc_prune_t		*arc_prune_cb; /* handle from arc_add_prune_callback() */
+};
+
+struct osd_object {
+	struct dt_object	 oo_dt;
+	/*
+	 * Inode for file system object represented by this osd_object. This
+	 * inode is pinned for the whole duration of lu_object life.
+	 *
+	 * Not modified concurrently (either setup early during object
+	 * creation, or assigned by osd_object_create() under write lock).
+	 */
+	dmu_buf_t		*oo_db;
+	sa_handle_t		*oo_sa_hdl;
+	nvlist_t		*oo_sa_xattr;
+	cfs_list_t		 oo_sa_linkage;
+
+	cfs_rw_semaphore_t	 oo_sem;
+
+	/* cached attributes */
+	cfs_rwlock_t		 oo_attr_lock;
+	struct lu_attr		 oo_attr;
+
+	/* protects extended attributes */
+	cfs_semaphore_t		 oo_guard;
+	uint64_t		 oo_xattr;
+
+	/* record size for index file */
+	int			 oo_recsize;
+};
+
+int osd_statfs(const struct lu_env *, struct dt_device *, struct obd_statfs *);
+extern const struct dt_index_operations osd_acct_index_ops;
+uint64_t osd_quota_fid2dmu(const struct lu_fid *fid);
+extern struct lu_device_operations osd_lu_ops;
+
+/*
+ * Helpers.
+ *
+ * The conversion helpers below assert, through lu_device_is_osd(), that
+ * the device they are given uses osd_lu_ops before converting with
+ * container_of0().
+ */
+static inline int lu_device_is_osd(const struct lu_device *d)
+{
+	return ergo(d != NULL && d->ld_ops != NULL, d->ld_ops == &osd_lu_ops);
+}
+
+static inline struct osd_object *osd_obj(const struct lu_object *o)
+{
+	LASSERT(lu_device_is_osd(o->lo_dev));
+	return container_of0(o, struct osd_object, oo_dt.do_lu);
+}
+
+static inline struct osd_device *osd_dt_dev(const struct dt_device *d)
+{
+	LASSERT(lu_device_is_osd(&d->dd_lu_dev));
+	return container_of0(d, struct osd_device, od_dt_dev);
+}
+
+static inline struct osd_device *osd_dev(const struct lu_device *d)
+{
+	LASSERT(lu_device_is_osd(d));
+	return osd_dt_dev(container_of0(d, struct dt_device, dd_lu_dev));
+}
+
+static inline struct osd_object *osd_dt_obj(const struct dt_object *d)
+{
+	return osd_obj(&d->do_lu);
+}
+
+static inline struct osd_device *osd_obj2dev(const struct osd_object *o)
+{
+	return osd_dev(o->oo_dt.do_lu.lo_dev);
+}
+
+static inline struct lu_device *osd2lu_dev(struct osd_device *osd)
+{
+	return &osd->od_dt_dev.dd_lu_dev;
+}
+
+static inline struct objset * osd_dtobj2objset(struct dt_object *o)
+{
+	return osd_dev(o->do_lu.lo_dev)->od_objset.os;
+}
+/* placeholder invariant check: always true */
+static inline int osd_invariant(const struct osd_object *obj)
+{
+	return 1;
+}
+
+static inline int osd_object_invariant(const struct lu_object *l)
+{
+	return osd_invariant(osd_obj(l));
+}
+
+
+#ifdef LPROCFS /* NOTE(review): the matching #endif is the one right before
+		* the include-guard #endif, so every prototype below is only
+		* declared when LPROCFS is defined - confirm this is intended */
+enum {
+	LPROC_OSD_READ_BYTES = 0,
+	LPROC_OSD_WRITE_BYTES = 1,
+	LPROC_OSD_GET_PAGE = 2,
+	LPROC_OSD_NO_PAGE = 3,
+	LPROC_OSD_CACHE_ACCESS = 4,
+	LPROC_OSD_CACHE_HIT = 5,
+	LPROC_OSD_CACHE_MISS = 6,
+	LPROC_OSD_COPY_IO = 7,
+	LPROC_OSD_ZEROCOPY_IO = 8,
+	LPROC_OSD_TAIL_IO = 9,
+	LPROC_OSD_LAST,
+};
+
+/* osd_lproc.c */
+extern struct lprocfs_vars lprocfs_osd_obd_vars[];
+extern struct lprocfs_vars lprocfs_osd_module_vars[];
+
+int osd_procfs_init(struct osd_device *osd, const char *name);
+int osd_procfs_fini(struct osd_device *osd);
+
+int udmu_zap_cursor_retrieve_key(const struct lu_env *env,
+				 zap_cursor_t *zc, char *key, int max);
+int udmu_zap_cursor_retrieve_value(const struct lu_env *env,
+				   zap_cursor_t *zc, char *buf,
+				   int buf_size, int *bytes_read);
+
+/* osd_object.c */
+void osd_object_sa_dirty_rele(struct osd_thandle *oh);
+int __osd_obj2dbuf(const struct lu_env *env, objset_t *os,
+		   uint64_t oid, dmu_buf_t **dbp, void *tag);
+struct lu_object *osd_object_alloc(const struct lu_env *env,
+				   const struct lu_object_header *hdr,
+				   struct lu_device *d);
+int osd_object_sa_update(struct osd_object *obj, sa_attr_type_t type,
+			 void *buf, uint32_t buflen, struct osd_thandle *oh);
+int __osd_zap_create(const struct lu_env *env, udmu_objset_t *uos,
+		     dmu_buf_t **zap_dbp, dmu_tx_t *tx,
+		     struct lu_attr *la, void *tag, zap_flags_t flags);
+int __osd_object_create(const struct lu_env *env, udmu_objset_t *uos,
+			dmu_buf_t **dbp, dmu_tx_t *tx,
+			struct lu_attr *la, void *tag);
+int __osd_object_free(udmu_objset_t *uos, uint64_t oid, dmu_tx_t *tx);
+
+/* osd_oi.c */
+int osd_oi_init(const struct lu_env *env, struct osd_device *o);
+void osd_oi_fini(const struct lu_env *env, struct osd_device *o);
+int osd_fid_lookup(const struct lu_env *env,
+		   struct osd_device *, const struct lu_fid *, uint64_t *);
+uint64_t osd_get_name_n_idx(const struct lu_env *env, struct osd_device *osd,
+			    const struct lu_fid *fid, char *buf);
+int osd_options_init(void);
+
+/* osd_index.c */
+int osd_index_try(const struct lu_env *env, struct dt_object *dt,
+		  const struct dt_index_features *feat);
+
+
+/* osd_xattr.c */
+int osd_xattr_get(const struct lu_env *env, struct dt_object *dt,
+		  struct lu_buf *buf, const char *name,
+		  struct lustre_capa *capa);
+int osd_declare_xattr_set(const struct lu_env *env, struct dt_object *dt,
+			  const struct lu_buf *buf, const char *name,
+			  int fl, struct thandle *handle);
+int osd_xattr_set(const struct lu_env *env, struct dt_object *dt,
+		  const struct lu_buf *buf, const char *name, int fl,
+		  struct thandle *handle, struct lustre_capa *capa);
+int osd_declare_xattr_del(const struct lu_env *env, struct dt_object *dt,
+			  const char *name, struct thandle *handle);
+int osd_xattr_del(const struct lu_env *env, struct dt_object *dt,
+		  const char *name, struct thandle *handle,
+		  struct lustre_capa *capa);
+int osd_xattr_list(const struct lu_env *env, struct dt_object *dt,
+		   struct lu_buf *lb, struct lustre_capa *capa);
+
+#endif
+#endif /* _OSD_INTERNAL_H */
-- 
1.8.3.1