From: Alex Zhuravlev Date: Wed, 19 Sep 2012 11:16:24 +0000 (+0400) Subject: LU-1303 lod: introduce lod device X-Git-Tag: 2.3.51~34 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=b2287531b63ac56692746a67ea17f576a6d2ab81 LU-1303 lod: introduce lod device this is the first patch in the series landing lod. LOD stands for LOgical Device, it's purpose is to hide striping for other layers like MDD. Signed-off-by: Alex Zhuravlev Change-Id: Ib33b3e8b2de6e816397a17af65a0bffab912960b Reviewed-on: http://review.whamcloud.com/4045 Tested-by: Hudson Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: wangdi --- diff --git a/lustre/Makefile.in b/lustre/Makefile.in index 94714bb..80db6b6 100644 --- a/lustre/Makefile.in +++ b/lustre/Makefile.in @@ -8,7 +8,7 @@ subdir-m += obdecho subdir-m += mgc @SERVER_TRUE@subdir-m += mds obdfilter ost mgs mdt cmm mdd ofd osd-ldiskfs -@SERVER_TRUE@subdir-m += quota osp +@SERVER_TRUE@subdir-m += quota osp lod @CLIENT_TRUE@subdir-m += mdc lmv llite fld @ZFS_ENABLED_TRUE@subdir-m += osd-zfs diff --git a/lustre/autoMakefile.am b/lustre/autoMakefile.am index 6ed9139..f31442a 100644 --- a/lustre/autoMakefile.am +++ b/lustre/autoMakefile.am @@ -43,7 +43,7 @@ ALWAYS_SUBDIRS = include lvfs obdclass ldlm ptlrpc osc lov obdecho \ mgc fid fld doc utils tests scripts autoconf contrib conf SERVER_SUBDIRS = obdfilter ost mds mgs mdt cmm mdd ofd osd-zfs osd-ldiskfs \ - quota osp + quota osp lod CLIENT_SUBDIRS = mdc lmv llite lclient diff --git a/lustre/autoconf/lustre-core.m4 b/lustre/autoconf/lustre-core.m4 index fae89c8..f30df7f 100644 --- a/lustre/autoconf/lustre-core.m4 +++ b/lustre/autoconf/lustre-core.m4 @@ -2497,6 +2497,8 @@ lustre/utils/Makefile lustre/utils/gss/Makefile lustre/osp/Makefile lustre/osp/autoMakefile +lustre/lod/Makefile +lustre/lod/autoMakefile lustre/obdclass/darwin/Makefile ]) ]) diff --git a/lustre/include/dt_object.h b/lustre/include/dt_object.h index 7d2b97e..9aa277e 100644 --- 
a/lustre/include/dt_object.h +++ b/lustre/include/dt_object.h @@ -1375,6 +1375,18 @@ static inline int dt_commit_async(const struct lu_env *env, return dev->dd_ops->dt_commit_async(env, dev); } +static inline int dt_init_capa_ctxt(const struct lu_env *env, + struct dt_device *dev, + int mode, unsigned long timeout, + __u32 alg, struct lustre_capa_key *keys) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_init_capa_ctxt); + return dev->dd_ops->dt_init_capa_ctxt(env, dev, mode, + timeout, alg, keys); +} + static inline int dt_lookup(const struct lu_env *env, struct dt_object *dt, struct dt_rec *rec, diff --git a/lustre/lod/Makefile.in b/lustre/lod/Makefile.in new file mode 100644 index 0000000..1a3913a --- /dev/null +++ b/lustre/lod/Makefile.in @@ -0,0 +1,6 @@ +MODULES := lod +lod-objs := lod_dev.o lod_lov.o + +EXTRA_DIST = $(lod-objs:.o=.c) lod_internal.h + +@INCLUDE_RULES@ diff --git a/lustre/lod/autoMakefile.am b/lustre/lod/autoMakefile.am new file mode 100644 index 0000000..d34f420 --- /dev/null +++ b/lustre/lod/autoMakefile.am @@ -0,0 +1,42 @@ +# +# GPL HEADER START +# +# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 only, +# as published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License version 2 for more details (a copy is included +# in the LICENSE file that accompanied this code). 
+# +# You should have received a copy of the GNU General Public License +# version 2 along with this program; If not, see +# http://www.sun.com/software/products/lustre/docs/GPLv2.pdf +# +# Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, +# CA 95054 USA or visit www.sun.com if you need additional information or +# have any questions. +# +# GPL HEADER END +# + +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved +# Use is subject to license terms. +# + +# +# This file is part of Lustre, http://www.lustre.org/ +# Lustre is a trademark of Sun Microsystems, Inc. +# + +if MODULES +modulefs_DATA = lod$(KMODEXT) +endif + +MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ +EXTRA_DIST := $(lod-objs:%.o=%.c) lod_internal.h diff --git a/lustre/lod/lod_dev.c b/lustre/lod/lod_dev.c new file mode 100644 index 0000000..28ce3bd --- /dev/null +++ b/lustre/lod/lod_dev.c @@ -0,0 +1,642 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel, Inc. 
+ * + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/lod/lod_dev.c + * + * Lustre Logical Object Device + * + * Author: Alex Zhuravlev + * Author: Mikhail Pershin + */ + +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif +#define DEBUG_SUBSYSTEM S_MDS + +#include +#include + +#include "lod_internal.h" + +/* Slab for OSD object allocation */ +cfs_mem_cache_t *lod_object_kmem; + +static struct lu_kmem_descr lod_caches[] = { + { + .ckd_cache = &lod_object_kmem, + .ckd_name = "lod_obj", + .ckd_size = sizeof(struct lod_object) + }, + { + .ckd_cache = NULL + } +}; + +static int lod_process_config(const struct lu_env *env, + struct lu_device *dev, + struct lustre_cfg *lcfg) +{ + struct lod_device *lod = lu2lod_dev(dev); + struct lu_device *next = &lod->lod_child->dd_lu_dev; + char *arg1; + int rc, i; + ENTRY; + + switch(lcfg->lcfg_command) { + + case LCFG_LOV_DEL_OBD: + case LCFG_LOV_ADD_INA: + case LCFG_LOV_ADD_OBD: { + __u32 index; + int gen; + /* lov_modify_tgts add 0:lov_mdsA 1:osp 2:0 3:1 */ + arg1 = lustre_cfg_string(lcfg, 1); + + if (sscanf(lustre_cfg_buf(lcfg, 2), "%d", &index) != 1) + GOTO(out, rc = -EINVAL); + if (sscanf(lustre_cfg_buf(lcfg, 3), "%d", &gen) != 1) + GOTO(out, rc = -EINVAL); + + rc = -EINVAL; + break; + } + + case LCFG_PARAM: { + struct lprocfs_static_vars v = { 0 }; + struct obd_device *obd = lod2obd(lod); + + rc = class_process_proc_param(PARAM_LOV, v.obd_vars, lcfg, obd); + if (rc > 0) + rc = 0; + GOTO(out, rc); + } + + case LCFG_CLEANUP: + lu_dev_del_linkage(dev->ld_site, dev); + lod_getref(lod); + lod_foreach_ost(lod, i) { + struct lod_ost_desc *ost; + ost = OST_TGT(lod, i); + LASSERT(ost && ost->ltd_ost); + next = &ost->ltd_ost->dd_lu_dev; + rc = next->ld_ops->ldo_process_config(env, next, lcfg); + if (rc) + CERROR("%s: can't process %u: %d\n", + lod2obd(lod)->obd_name, + lcfg->lcfg_command, rc); + } + lod_putref(lod); + + /* + * do cleanup on underlying 
storage only when + * all OSPs are cleaned up, as they use that OSD as well + */ + next = &lod->lod_child->dd_lu_dev; + rc = next->ld_ops->ldo_process_config(env, next, lcfg); + if (rc) + CERROR("%s: can't process %u: %d\n", + lod2obd(lod)->obd_name, lcfg->lcfg_command, rc); + break; + + default: + CERROR("%s: unknown command %u\n", lod2obd(lod)->obd_name, + lcfg->lcfg_command); + rc = -EINVAL; + break; + } + +out: + RETURN(rc); +} + +static int lod_recovery_complete(const struct lu_env *env, + struct lu_device *dev) +{ + struct lod_device *lod = lu2lod_dev(dev); + struct lu_device *next = &lod->lod_child->dd_lu_dev; + struct lod_ost_desc *ost; + int i, rc; + ENTRY; + + LASSERT(lod->lod_recovery_completed == 0); + lod->lod_recovery_completed = 1; + + rc = next->ld_ops->ldo_recovery_complete(env, next); + + lod_getref(lod); + lod_foreach_ost(lod, i) { + ost = OST_TGT(lod, i); + LASSERT(ost && ost->ltd_ost); + next = &ost->ltd_ost->dd_lu_dev; + rc = next->ld_ops->ldo_recovery_complete(env, next); + if (rc) + CERROR("%s: can't complete recovery on #%d: %d\n", + lod2obd(lod)->obd_name, i, rc); + } + lod_putref(lod); + + RETURN(rc); +} + +const struct lu_device_operations lod_lu_ops = { + .ldo_process_config = lod_process_config, + .ldo_recovery_complete = lod_recovery_complete, +}; + +static int lod_root_get(const struct lu_env *env, + struct dt_device *dev, struct lu_fid *f) +{ + return dt_root_get(env, dt2lod_dev(dev)->lod_child, f); +} + +static int lod_statfs(const struct lu_env *env, + struct dt_device *dev, struct obd_statfs *sfs) +{ + return dt_statfs(env, dt2lod_dev(dev)->lod_child, sfs); +} + +static struct thandle *lod_trans_create(const struct lu_env *env, + struct dt_device *dev) +{ + return dt_trans_create(env, dt2lod_dev(dev)->lod_child); +} + +static int lod_trans_start(const struct lu_env *env, struct dt_device *dev, + struct thandle *th) +{ + return dt_trans_start(env, dt2lod_dev(dev)->lod_child, th); +} + +static int lod_trans_stop(const struct lu_env 
*env, struct thandle *th) +{ + /* XXX: we don't know next device, will be fixed with DNE */ + return dt_trans_stop(env, th->th_dev, th); +} + +static void lod_conf_get(const struct lu_env *env, + const struct dt_device *dev, + struct dt_device_param *param) +{ + dt_conf_get(env, dt2lod_dev((struct dt_device *)dev)->lod_child, param); +} + +static int lod_sync(const struct lu_env *env, struct dt_device *dev) +{ + struct lod_device *lod = dt2lod_dev(dev); + struct lod_ost_desc *ost; + int rc = 0, i; + ENTRY; + + lod_getref(lod); + lod_foreach_ost(lod, i) { + ost = OST_TGT(lod, i); + LASSERT(ost && ost->ltd_ost); + rc = dt_sync(env, ost->ltd_ost); + if (rc) { + CERROR("%s: can't sync %u: %d\n", + lod2obd(lod)->obd_name, i, rc); + break; + } + } + lod_putref(lod); + if (rc == 0) + rc = dt_sync(env, lod->lod_child); + + RETURN(rc); +} + +static int lod_ro(const struct lu_env *env, struct dt_device *dev) +{ + return dt_ro(env, dt2lod_dev(dev)->lod_child); +} + +static int lod_commit_async(const struct lu_env *env, struct dt_device *dev) +{ + return dt_commit_async(env, dt2lod_dev(dev)->lod_child); +} + +static int lod_init_capa_ctxt(const struct lu_env *env, struct dt_device *dev, + int mode, unsigned long timeout, + __u32 alg, struct lustre_capa_key *keys) +{ + struct dt_device *next = dt2lod_dev(dev)->lod_child; + return dt_init_capa_ctxt(env, next, mode, timeout, alg, keys); +} + +static const struct dt_device_operations lod_dt_ops = { + .dt_root_get = lod_root_get, + .dt_statfs = lod_statfs, + .dt_trans_create = lod_trans_create, + .dt_trans_start = lod_trans_start, + .dt_trans_stop = lod_trans_stop, + .dt_conf_get = lod_conf_get, + .dt_sync = lod_sync, + .dt_ro = lod_ro, + .dt_commit_async = lod_commit_async, + .dt_init_capa_ctxt = lod_init_capa_ctxt, +}; + +static int lod_connect_to_osd(const struct lu_env *env, struct lod_device *lod, + struct lustre_cfg *cfg) +{ + struct obd_connect_data *data = NULL; + struct obd_device *obd; + char *nextdev = NULL, *p, *s; + 
int rc, len = 0; + ENTRY; + + LASSERT(cfg); + LASSERT(lod->lod_child_exp == NULL); + + /* compatibility hack: we still use old config logs + * which specify LOV, but we need to learn underlying + * OSD device, which is supposed to be: + * -MDTxxxx-osd + * + * 2.x MGS generates lines like the following: + * #03 (176)lov_setup 0:lustre-MDT0000-mdtlov 1:(struct lov_desc) + * 1.8 MGS generates lines like the following: + * #03 (168)lov_setup 0:lustre-mdtlov 1:(struct lov_desc) + * + * we use "-MDT" to differentiate 2.x from 1.8 */ + + if ((p = lustre_cfg_string(cfg, 0)) && strstr(p, "-mdtlov")) { + len = strlen(p) + 1; + OBD_ALLOC(nextdev, len); + if (nextdev == NULL) + GOTO(out, rc = -ENOMEM); + + strcpy(nextdev, p); + s = strstr(nextdev, "-mdtlov"); + if (unlikely(s == NULL)) { + CERROR("unable to parse device name %s\n", + lustre_cfg_string(cfg, 0)); + GOTO(out, rc = -EINVAL); + } + + if (strstr(nextdev, "-MDT")) { + /* 2.x config */ + strcpy(s, "-osd"); + } else { + /* 1.8 config */ + strcpy(s, "-MDT0000-osd"); + } + } else { + CERROR("unable to parse device name %s\n", + lustre_cfg_string(cfg, 0)); + GOTO(out, rc = -EINVAL); + } + + OBD_ALLOC_PTR(data); + if (data == NULL) + GOTO(out, rc = -ENOMEM); + + obd = class_name2obd(nextdev); + if (obd == NULL) { + CERROR("can not locate next device: %s\n", nextdev); + GOTO(out, rc = -ENOTCONN); + } + + data->ocd_connect_flags = OBD_CONNECT_VERSION; + data->ocd_version = LUSTRE_VERSION_CODE; + + rc = obd_connect(env, &lod->lod_child_exp, obd, &obd->obd_uuid, + data, NULL); + if (rc) { + CERROR("cannot connect to next dev %s (%d)\n", nextdev, rc); + GOTO(out, rc); + } + + lod->lod_dt_dev.dd_lu_dev.ld_site = + lod->lod_child_exp->exp_obd->obd_lu_dev->ld_site; + LASSERT(lod->lod_dt_dev.dd_lu_dev.ld_site); + lod->lod_child = lu2dt_dev(lod->lod_child_exp->exp_obd->obd_lu_dev); + +out: + if (data) + OBD_FREE_PTR(data); + if (nextdev) + OBD_FREE(nextdev, len); + RETURN(rc); +} + +static int lod_init0(const struct lu_env *env, 
struct lod_device *lod, + struct lu_device_type *ldt, struct lustre_cfg *cfg) +{ + struct dt_device_param ddp; + struct proc_dir_entry *lov_proc_dir; + struct obd_device *obd; + int rc; + ENTRY; + + obd = class_name2obd(lustre_cfg_string(cfg, 0)); + if (obd == NULL) { + CERROR("Cannot find obd with name %s\n", + lustre_cfg_string(cfg, 0)); + RETURN(-ENODEV); + } + + lod->lod_dt_dev.dd_lu_dev.ld_ops = &lod_lu_ops; + lod->lod_dt_dev.dd_ops = &lod_dt_ops; + + rc = lod_connect_to_osd(env, lod, cfg); + if (rc) + RETURN(rc); + + dt_conf_get(env, &lod->lod_dt_dev, &ddp); + lod->lod_osd_max_easize = ddp.ddp_max_ea_size; + + /* for compatibility we link old procfs's OSC entries to osp ones */ + lov_proc_dir = lprocfs_srch(proc_lustre_root, "lov"); + if (lov_proc_dir) { + cfs_proc_dir_entry_t *symlink = NULL; + char *name; + OBD_ALLOC(name, strlen(obd->obd_name) + 1); + if (name) { + strcpy(name, obd->obd_name); + if (strstr(name, "lov")) + symlink = lprocfs_add_symlink(name, + lov_proc_dir, + "../lod/%s", + obd->obd_name); + OBD_FREE(name, strlen(obd->obd_name) + 1); + lod->lod_symlink = symlink; + } + } + + cfs_mutex_init(&lod->lod_mutex); + cfs_init_rwsem(&lod->lod_rw_sem); + cfs_spin_lock_init(&lod->lod_desc_lock); + + RETURN(0); + + obd_disconnect(lod->lod_child_exp); + RETURN(rc); +} + +static struct lu_device *lod_device_free(const struct lu_env *env, + struct lu_device *lu) +{ + struct lod_device *lod = lu2lod_dev(lu); + struct lu_device *next = &lod->lod_child->dd_lu_dev; + ENTRY; + + LASSERT(cfs_atomic_read(&lu->ld_ref) == 0); + dt_device_fini(&lod->lod_dt_dev); + OBD_FREE_PTR(lod); + RETURN(next); +} + +static struct lu_device *lod_device_alloc(const struct lu_env *env, + struct lu_device_type *type, + struct lustre_cfg *lcfg) +{ + struct lod_device *lod; + struct lu_device *lu_dev; + + OBD_ALLOC_PTR(lod); + if (lod == NULL) { + lu_dev = ERR_PTR(-ENOMEM); + } else { + int rc; + + lu_dev = lod2lu_dev(lod); + dt_device_init(&lod->lod_dt_dev, type); + rc = 
lod_init0(env, lod, type, lcfg); + if (rc != 0) { + lod_device_free(env, lu_dev); + lu_dev = ERR_PTR(rc); + } + } + + return lu_dev; +} + +static struct lu_device *lod_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + struct lod_device *lod = lu2lod_dev(d); + int rc; + ENTRY; + + if (lod->lod_symlink) + lprocfs_remove(&lod->lod_symlink); + + rc = obd_disconnect(lod->lod_child_exp); + if (rc) + CERROR("error in disconnect from storage: %d\n", rc); + + RETURN(NULL); +} + +/* + * we use exports to track all LOD users + */ +static int lod_obd_connect(const struct lu_env *env, struct obd_export **exp, + struct obd_device *obd, struct obd_uuid *cluuid, + struct obd_connect_data *data, void *localdata) +{ + struct lod_device *lod = lu2lod_dev(obd->obd_lu_dev); + struct lustre_handle conn; + int rc; + ENTRY; + + CDEBUG(D_CONFIG, "connect #%d\n", lod->lod_connects); + + rc = class_connect(&conn, obd, cluuid); + if (rc) + RETURN(rc); + + *exp = class_conn2export(&conn); + + cfs_mutex_lock(&lod->lod_mutex); + lod->lod_connects++; + /* at the moment we expect the only user */ + LASSERT(lod->lod_connects == 1); + cfs_mutex_unlock(&lod->lod_mutex); + + RETURN(0); +} + +/* + * once last export (we don't count self-export) disappeared + * lod can be released + */ +static int lod_obd_disconnect(struct obd_export *exp) +{ + struct obd_device *obd = exp->exp_obd; + struct lod_device *lod = lu2lod_dev(obd->obd_lu_dev); + int rc, release = 0; + ENTRY; + + /* Only disconnect the underlying layers on the final disconnect. */ + cfs_mutex_lock(&lod->lod_mutex); + lod->lod_connects--; + if (lod->lod_connects != 0) { + /* why should there be more than 1 connect? 
*/ + cfs_mutex_unlock(&lod->lod_mutex); + CERROR("%s: disconnect #%d\n", exp->exp_obd->obd_name, + lod->lod_connects); + goto out; + } + cfs_mutex_unlock(&lod->lod_mutex); + + /* the last user of lod has gone, let's release the device */ + release = 1; + +out: + rc = class_disconnect(exp); /* bz 9811 */ + + if (rc == 0 && release) + class_manual_cleanup(obd); + RETURN(rc); +} + +LU_KEY_INIT(lod, struct lod_thread_info); + +static void lod_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct lod_thread_info *info = data; + /* allocated in lod_get_lov_ea + * XXX: this is overload, a tread may have such store but used only + * once. Probably better would be pool of such stores per LOD. + */ + if (info->lti_ea_store) { + OBD_FREE_LARGE(info->lti_ea_store, info->lti_ea_store_size); + info->lti_ea_store = NULL; + info->lti_ea_store_size = 0; + } + OBD_FREE_PTR(info); +} + +/* context key: lod_thread_key */ +LU_CONTEXT_KEY_DEFINE(lod, LCT_MD_THREAD); + +LU_TYPE_INIT_FINI(lod, &lod_thread_key); + +static struct lu_device_type_operations lod_device_type_ops = { + .ldto_init = lod_type_init, + .ldto_fini = lod_type_fini, + + .ldto_start = lod_type_start, + .ldto_stop = lod_type_stop, + + .ldto_device_alloc = lod_device_alloc, + .ldto_device_free = lod_device_free, + + .ldto_device_fini = lod_device_fini +}; + +static struct lu_device_type lod_device_type = { + .ldt_tags = LU_DEVICE_DT, + .ldt_name = LUSTRE_LOD_NAME, + .ldt_ops = &lod_device_type_ops, + .ldt_ctx_tags = LCT_MD_THREAD, +}; + +static int lod_obd_health_check(const struct lu_env *env, + struct obd_device *obd) +{ + struct lod_device *d = lu2lod_dev(obd->obd_lu_dev); + struct lod_ost_desc *ost; + int i, rc = 1; + ENTRY; + + LASSERT(d); + lod_getref(d); + cfs_foreach_bit(d->lod_ost_bitmap, i) { + ost = OST_TGT(d, i); + LASSERT(ost && ost->ltd_ost); + rc = obd_health_check(env, ost->ltd_exp->exp_obd); + /* one healthy device is enough */ + if (rc == 0) + break; + } + 
lod_putref(d); + RETURN(rc); +} + +static struct obd_ops lod_obd_device_ops = { + .o_owner = THIS_MODULE, + .o_connect = lod_obd_connect, + .o_disconnect = lod_obd_disconnect, + .o_health_check = lod_obd_health_check, +}; + +static int __init lod_mod_init(void) +{ + struct lprocfs_static_vars lvars = { 0 }; + cfs_proc_dir_entry_t *lov_proc_dir; + int rc; + + rc = lu_kmem_init(lod_caches); + if (rc) + return rc; + + rc = class_register_type(&lod_obd_device_ops, NULL, lvars.module_vars, + LUSTRE_LOD_NAME, &lod_device_type); + if (rc) { + lu_kmem_fini(lod_caches); + return rc; + } + + /* create "lov" entry in procfs for compatibility purposes */ + lov_proc_dir = lprocfs_srch(proc_lustre_root, "lov"); + if (lov_proc_dir == NULL) { + lov_proc_dir = lprocfs_register("lov", proc_lustre_root, + NULL, NULL); + if (IS_ERR(lov_proc_dir)) + CERROR("lod: can't create compat entry \"lov\": %d\n", + (int)PTR_ERR(lov_proc_dir)); + } + + return rc; +} + +static void __exit lod_mod_exit(void) +{ + + lprocfs_try_remove_proc_entry("lov", proc_lustre_root); + + class_unregister_type(LUSTRE_LOD_NAME); + lu_kmem_fini(lod_caches); +} + +MODULE_AUTHOR("Whamcloud, Inc. "); +MODULE_DESCRIPTION("Lustre Logical Object Device ("LUSTRE_LOD_NAME")"); +MODULE_LICENSE("GPL"); + +module_init(lod_mod_init); +module_exit(lod_mod_exit); + diff --git a/lustre/lod/lod_internal.h b/lustre/lod/lod_internal.h new file mode 100644 index 0000000..bc6283f8 --- /dev/null +++ b/lustre/lod/lod_internal.h @@ -0,0 +1,248 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
/*
 * Descriptor of one OST target known to the LOD.  Descriptors are reached
 * through OST_TGT() and are freed by lod_putref() once they were marked
 * with ltd_reap and the last table reference is dropped.
 */
struct lod_ost_desc {
	/* device of the target; config, recovery and sync calls go here */
	struct dt_device  *ltd_ost;
	/* linkage on the local list lod_putref() uses while reaping */
	struct list_head   ltd_kill;
	/* export to the target; disconnected when the descriptor is reaped */
	struct obd_export *ltd_exp;
	/* UUID of the target, printed in error messages */
	struct obd_uuid    ltd_uuid;
	__u32              ltd_gen;
	__u32              ltd_index;
	struct ltd_qos     ltd_qos; /* qos info per target */
	struct obd_statfs  ltd_statfs;
	unsigned long      ltd_active:1,/* is this target up for requests */
			   ltd_activate:1,/* should target be activated */
			   ltd_reap:1;  /* should this target be deleted */
};
ld_active_tgt_count and all ltd_active */ + cfs_spinlock_t lod_desc_lock; + + /* list of known OSTs */ + struct lod_ost_desc_idx *lod_ost_idx[OST_PTRS]; + + /* Size of the lod_osts array, granted to be a power of 2 */ + __u32 lod_osts_size; + /* number of registered OSTs */ + int lod_ostnr; + /* OSTs scheduled to be deleted */ + __u32 lod_death_row; + /* bitmap of OSTs available */ + cfs_bitmap_t *lod_ost_bitmap; + + /* maximum EA size underlied OSD may have */ + unsigned int lod_osd_max_easize; + + /* Table refcount used for delayed deletion */ + int lod_refcount; + /* mutex to serialize concurrent updates to the ost table */ + cfs_mutex_t lod_mutex; + /* read/write semaphore used for array relocation */ + cfs_rw_semaphore_t lod_rw_sem; + + /* QoS info per LOD */ + struct lov_qos lod_qos; /* qos info per lod */ + + /* OST pool data */ + struct ost_pool lod_pool_info; /* all OSTs in a packed array */ + int lod_pool_count; + cfs_hash_t *lod_pools_hash_body; /* used for key access */ + cfs_list_t lod_pool_list; /* used for sequential access */ + cfs_proc_dir_entry_t *lod_pool_proc_entry; + + enum lustre_sec_part lod_sp_me; + + cfs_proc_dir_entry_t *lod_symlink; +}; + +/* + * XXX: shrink this structure, currently it's 72bytes on 32bit arch, + * so, slab will be allocating 128bytes + */ +struct lod_object { + struct dt_object ldo_obj; + + /* if object is striped, then the next fields describe stripes */ + __u16 ldo_stripenr; + __u16 ldo_layout_gen; + __u32 ldo_stripe_size; + char *ldo_pool; + struct dt_object **ldo_stripe; + /* to know how much memory to free, ldo_stripenr can be less */ + int ldo_stripes_allocated; + /* default striping for directory represented by this object + * is cached in stripenr/stripe_size */ + int ldo_striping_cached:1; + int ldo_def_striping_set:1; + __u32 ldo_def_stripe_size; + __u16 ldo_def_stripenr; + __u16 ldo_def_stripe_offset; +}; + + +struct lod_thread_info { + /* per-thread buffer for LOV EA */ + void *lti_ea_store; + int 
lti_ea_store_size; + struct lu_buf lti_buf; + struct ost_id lti_ostid; + struct lu_fid lti_fid; + struct obd_statfs lti_osfs; + struct lu_attr lti_attr; +}; + +extern const struct lu_device_operations lod_lu_ops; + +static inline int lu_device_is_lod(struct lu_device *d) +{ + return ergo(d != NULL && d->ld_ops != NULL, d->ld_ops == &lod_lu_ops); +} + +static inline struct lod_device* lu2lod_dev(struct lu_device *d) +{ + LASSERT(lu_device_is_lod(d)); + return container_of0(d, struct lod_device, lod_dt_dev.dd_lu_dev); +} + +static inline struct lu_device *lod2lu_dev(struct lod_device *d) +{ + return &d->lod_dt_dev.dd_lu_dev; +} + +static inline struct obd_device *lod2obd(struct lod_device *d) +{ + return d->lod_dt_dev.dd_lu_dev.ld_obd; +} + +static inline struct lod_device *dt2lod_dev(struct dt_device *d) +{ + LASSERT(lu_device_is_lod(&d->dd_lu_dev)); + return container_of0(d, struct lod_device, lod_dt_dev); +} + +static inline struct lod_object *lu2lod_obj(struct lu_object *o) +{ + LASSERT(ergo(o != NULL, lu_device_is_lod(o->lo_dev))); + return container_of0(o, struct lod_object, ldo_obj.do_lu); +} + +static inline struct lu_object *lod2lu_obj(struct lod_object *obj) +{ + return &obj->ldo_obj.do_lu; +} + +static inline struct lod_object *lod_obj(const struct lu_object *o) +{ + LASSERT(lu_device_is_lod(o->lo_dev)); + return container_of0(o, struct lod_object, ldo_obj.do_lu); +} + +static inline struct lod_object *lod_dt_obj(const struct dt_object *d) +{ + return lod_obj(&d->do_lu); +} + +static inline struct dt_object* lod_object_child(struct lod_object *o) +{ + return container_of0(lu_object_next(lod2lu_obj(o)), + struct dt_object, do_lu); +} + +static inline struct dt_object *lu2dt_obj(struct lu_object *o) +{ + LASSERT(ergo(o != NULL, lu_device_is_dt(o->lo_dev))); + return container_of0(o, struct dt_object, do_lu); +} + +static inline struct dt_object *dt_object_child(struct dt_object *o) +{ + return container_of0(lu_object_next(&(o)->do_lu), + struct dt_object, 
/*
 * Iterate \a index over every bit set in the OST bitmap of \a __dev; the
 * bitmap is not touched at all while the OST table is empty.
 *
 * The guard is written as "if (empty) {} else <loop>" so that the macro
 * already carries its own else: an "else" following an unbraced use
 *
 *	if (cond)
 *		lod_foreach_ost(dev, i) ...;
 *	else
 *		...
 *
 * then binds to the caller's "if" and not to the hidden one.
 */
#define lod_foreach_ost(__dev, index)				\
	if ((__dev)->lod_osts_size == 0) {			\
	} else							\
		cfs_foreach_bit((__dev)->lod_ost_bitmap, (index))
Any function that expects lov_tgts to remain stationary + * must take a ref. + * + * \param lod - is the lod device from which we want to grab a reference + */ +void lod_getref(struct lod_device *lod) +{ + cfs_down_read(&lod->lod_rw_sem); + cfs_mutex_lock(&lod->lod_mutex); + lod->lod_refcount++; + cfs_mutex_unlock(&lod->lod_mutex); +} + +/* + * Companion of lod_getref() to release a reference on the lod table. + * If this is the last reference and the ost entry was scheduled for deletion, + * the descriptor is removed from the array. + * + * \param lod - is the lod device from which we release a reference + */ +void lod_putref(struct lod_device *lod) +{ + cfs_mutex_lock(&lod->lod_mutex); + lod->lod_refcount--; + if (lod->lod_refcount == 0 && lod->lod_death_row) { + struct lod_ost_desc *ost_desc, *tmp; + int idx; + CFS_LIST_HEAD(kill); + + CDEBUG(D_CONFIG, "destroying %d lod desc\n", + lod->lod_death_row); + + cfs_foreach_bit(lod->lod_ost_bitmap, idx) { + ost_desc = OST_TGT(lod, idx); + LASSERT(ost_desc); + + if (!ost_desc->ltd_reap) + continue; + + cfs_list_add(&ost_desc->ltd_kill, &kill); + /* XXX: remove from the pool */ + OST_TGT(lod, idx) = NULL; + lod->lod_ostnr--; + cfs_bitmap_clear(lod->lod_ost_bitmap, idx); + if (ost_desc->ltd_active) + lod->lod_desc.ld_active_tgt_count--; + lod->lod_death_row--; + } + cfs_mutex_unlock(&lod->lod_mutex); + cfs_up_read(&lod->lod_rw_sem); + + cfs_list_for_each_entry_safe(ost_desc, tmp, &kill, ltd_kill) { + int rc; + cfs_list_del(&ost_desc->ltd_kill); + /* XXX: remove from QoS structures */ + /* disconnect from OSP */ + rc = obd_disconnect(ost_desc->ltd_exp); + if (rc) + CERROR("%s: failed to disconnect %s (%d)\n", + lod2obd(lod)->obd_name, + obd_uuid2str(&ost_desc->ltd_uuid), rc); + OBD_FREE_PTR(ost_desc); + } + } else { + cfs_mutex_unlock(&lod->lod_mutex); + cfs_up_read(&lod->lod_rw_sem); + } +} +