Whamcloud - gitweb
LU-1303 lod: introduce lod device
authorAlex Zhuravlev <bzzz@whamcloud.com>
Wed, 19 Sep 2012 11:16:24 +0000 (15:16 +0400)
committerOleg Drokin <green@whamcloud.com>
Wed, 26 Sep 2012 14:54:06 +0000 (10:54 -0400)
This is the first patch in the series landing LOD.
LOD stands for LOgical Device; its purpose is to
hide striping from other layers like MDD.

Signed-off-by: Alex Zhuravlev <bzzz@whamcloud.com>
Change-Id: Ib33b3e8b2de6e816397a17af65a0bffab912960b
Reviewed-on: http://review.whamcloud.com/4045
Tested-by: Hudson
Tested-by: Maloo <whamcloud.maloo@gmail.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: wangdi <di.wang@whamcloud.com>
lustre/Makefile.in
lustre/autoMakefile.am
lustre/autoconf/lustre-core.m4
lustre/include/dt_object.h
lustre/lod/Makefile.in [new file with mode: 0644]
lustre/lod/autoMakefile.am [new file with mode: 0644]
lustre/lod/lod_dev.c [new file with mode: 0644]
lustre/lod/lod_internal.h [new file with mode: 0644]
lustre/lod/lod_lov.c [new file with mode: 0644]

index 94714bb..80db6b6 100644 (file)
@@ -8,7 +8,7 @@ subdir-m += obdecho
 subdir-m += mgc
 
 @SERVER_TRUE@subdir-m += mds obdfilter ost mgs mdt cmm mdd ofd osd-ldiskfs
-@SERVER_TRUE@subdir-m += quota osp
+@SERVER_TRUE@subdir-m += quota osp lod
 @CLIENT_TRUE@subdir-m += mdc lmv llite fld
 @ZFS_ENABLED_TRUE@subdir-m += osd-zfs
 
index 6ed9139..f31442a 100644 (file)
@@ -43,7 +43,7 @@ ALWAYS_SUBDIRS = include lvfs obdclass ldlm ptlrpc osc lov obdecho \
        mgc fid fld doc utils tests scripts autoconf contrib conf
 
 SERVER_SUBDIRS = obdfilter ost mds mgs mdt cmm mdd ofd osd-zfs osd-ldiskfs \
-       quota osp
+       quota osp lod
 
 CLIENT_SUBDIRS = mdc lmv llite lclient
 
index fae89c8..f30df7f 100644 (file)
@@ -2497,6 +2497,8 @@ lustre/utils/Makefile
 lustre/utils/gss/Makefile
 lustre/osp/Makefile
 lustre/osp/autoMakefile
+lustre/lod/Makefile
+lustre/lod/autoMakefile
 lustre/obdclass/darwin/Makefile
 ])
 ])
index 7d2b97e..9aa277e 100644 (file)
@@ -1375,6 +1375,18 @@ static inline int dt_commit_async(const struct lu_env *env,
         return dev->dd_ops->dt_commit_async(env, dev);
 }
 
+/**
+ * Forward a capability-context initialization request to \a dev through
+ * the dt_init_capa_ctxt method of its operations table.
+ *
+ * The device, its operations table and the method itself are asserted to
+ * be present; the remaining arguments are passed on unchanged and the
+ * method's return value is handed back to the caller.
+ */
+static inline int dt_init_capa_ctxt(const struct lu_env *env,
+                                   struct dt_device *dev,
+                                   int mode, unsigned long timeout,
+                                   __u32 alg, struct lustre_capa_key *keys)
+{
+       int rc;
+
+       LASSERT(dev);
+       LASSERT(dev->dd_ops);
+       LASSERT(dev->dd_ops->dt_init_capa_ctxt);
+
+       rc = dev->dd_ops->dt_init_capa_ctxt(env, dev, mode, timeout,
+                                           alg, keys);
+       return rc;
+}
+
 static inline int dt_lookup(const struct lu_env *env,
                             struct dt_object *dt,
                             struct dt_rec *rec,
diff --git a/lustre/lod/Makefile.in b/lustre/lod/Makefile.in
new file mode 100644 (file)
index 0000000..1a3913a
--- /dev/null
@@ -0,0 +1,6 @@
+MODULES := lod
+lod-objs := lod_dev.o lod_lov.o
+
+EXTRA_DIST = $(lod-objs:.o=.c) lod_internal.h
+
+@INCLUDE_RULES@
diff --git a/lustre/lod/autoMakefile.am b/lustre/lod/autoMakefile.am
new file mode 100644 (file)
index 0000000..d34f420
--- /dev/null
@@ -0,0 +1,42 @@
+#
+# GPL HEADER START
+#
+# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 only,
+# as published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License version 2 for more details (a copy is included
+# in the LICENSE file that accompanied this code).
+#
+# You should have received a copy of the GNU General Public License
+# version 2 along with this program; If not, see
+# http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+#
+# Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+# CA 95054 USA or visit www.sun.com if you need additional information or
+# have any questions.
+#
+# GPL HEADER END
+#
+
+#
+# Copyright  2008 Sun Microsystems, Inc. All rights reserved
+# Use is subject to license terms.
+#
+
+#
+# This file is part of Lustre, http://www.lustre.org/
+# Lustre is a trademark of Sun Microsystems, Inc.
+#
+
+if MODULES
+modulefs_DATA = lod$(KMODEXT)
+endif
+
+MOSTLYCLEANFILES := @MOSTLYCLEANFILES@
+EXTRA_DIST := $(lod-objs:%.o=%.c) lod_internal.h
diff --git a/lustre/lod/lod_dev.c b/lustre/lod/lod_dev.c
new file mode 100644 (file)
index 0000000..28ce3bd
--- /dev/null
@@ -0,0 +1,642 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2009 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel, Inc.
+ *
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lod/lod_dev.c
+ *
+ * Lustre Logical Object Device
+ *
+ * Author: Alex Zhuravlev <alexey.zhuravlev@intel.com>
+ * Author: Mikhail Pershin <mike.pershin@intel.com>
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_MDS
+
+#include <obd_class.h>
+#include <lustre_param.h>
+
+#include "lod_internal.h"
+
+/* Slab cache for struct lod_object allocation (see lod_caches below) */
+cfs_mem_cache_t *lod_object_kmem;
+
+static struct lu_kmem_descr lod_caches[] = {
+       {
+               .ckd_cache = &lod_object_kmem,
+               .ckd_name  = "lod_obj",
+               .ckd_size  = sizeof(struct lod_object)
+       },
+       {       /* closing entry; presumably the table terminator - confirm */
+               .ckd_cache = NULL
+       }
+};
+
+/**
+ * Handle a configuration command addressed to the LOD device.
+ *
+ * - LCFG_LOV_ADD_OBD / LCFG_LOV_ADD_INA / LCFG_LOV_DEL_OBD: the target
+ *   index and generation are parsed from config buffers 2 and 3, but
+ *   nothing is done with them here yet, so -EINVAL is returned.
+ * - LCFG_PARAM: handed to class_process_proc_param() with the PARAM_LOV
+ *   prefix; a positive result is reported as success.
+ * - LCFG_CLEANUP: forwarded to every registered OST device first and to
+ *   the underlying child device last; the child's result is returned.
+ * - anything else is logged and rejected with -EINVAL.
+ *
+ * \param[in] env   execution environment
+ * \param[in] dev   LOD device, as lu_device
+ * \param[in] lcfg  configuration command
+ *
+ * \retval 0 on success, negative errno on failure
+ */
+static int lod_process_config(const struct lu_env *env,
+                             struct lu_device *dev,
+                             struct lustre_cfg *lcfg)
+{
+       struct lod_device *lod = lu2lod_dev(dev);
+       struct lu_device  *next = &lod->lod_child->dd_lu_dev;
+       char              *arg1;
+       int                rc, i;
+       ENTRY;
+
+       switch (lcfg->lcfg_command) {
+
+       case LCFG_LOV_DEL_OBD:
+       case LCFG_LOV_ADD_INA:
+       case LCFG_LOV_ADD_OBD: {
+               __u32 index;
+               int gen;
+               /* lov_modify_tgts add  0:lov_mdsA  1:osp  2:0  3:1 */
+               arg1 = lustre_cfg_string(lcfg, 1);
+
+               /* index is unsigned, so it has to be scanned with %u */
+               if (sscanf(lustre_cfg_buf(lcfg, 2), "%u", &index) != 1)
+                       GOTO(out, rc = -EINVAL);
+               if (sscanf(lustre_cfg_buf(lcfg, 3), "%d", &gen) != 1)
+                       GOTO(out, rc = -EINVAL);
+
+               /* the parsed values are not acted upon yet */
+               rc = -EINVAL;
+               break;
+       }
+
+       case LCFG_PARAM: {
+               struct lprocfs_static_vars  v = { 0 };
+               struct obd_device         *obd = lod2obd(lod);
+
+               rc = class_process_proc_param(PARAM_LOV, v.obd_vars, lcfg, obd);
+               if (rc > 0)
+                       rc = 0;
+               GOTO(out, rc);
+       }
+
+       case LCFG_CLEANUP:
+               lu_dev_del_linkage(dev->ld_site, dev);
+               lod_getref(lod);
+               lod_foreach_ost(lod, i) {
+                       struct lod_ost_desc *ost;
+                       ost = OST_TGT(lod, i);
+                       LASSERT(ost && ost->ltd_ost);
+                       next = &ost->ltd_ost->dd_lu_dev;
+                       rc = next->ld_ops->ldo_process_config(env, next, lcfg);
+                       if (rc)
+                               CERROR("%s: can't process %u: %d\n",
+                                      lod2obd(lod)->obd_name,
+                                      lcfg->lcfg_command, rc);
+               }
+               lod_putref(lod);
+
+               /*
+                * do cleanup on underlying storage only when
+                * all OSPs are cleaned up, as they use that OSD as well
+                */
+               next = &lod->lod_child->dd_lu_dev;
+               rc = next->ld_ops->ldo_process_config(env, next, lcfg);
+               if (rc)
+                       CERROR("%s: can't process %u: %d\n",
+                              lod2obd(lod)->obd_name, lcfg->lcfg_command, rc);
+               break;
+
+       default:
+               CERROR("%s: unknown command %u\n", lod2obd(lod)->obd_name,
+                      lcfg->lcfg_command);
+               rc = -EINVAL;
+               break;
+       }
+
+out:
+       RETURN(rc);
+}
+
+/**
+ * Propagate the "recovery complete" notification down the stack.
+ *
+ * Marks the LOD as having completed recovery (asserting that this happens
+ * only once), notifies the underlying child device and then every
+ * registered OST device. A failing OST is logged and the walk continues.
+ *
+ * \retval result of the last ldo_recovery_complete() call made: that of
+ *         the last OST visited or, when no OST is visited, of the child
+ */
+static int lod_recovery_complete(const struct lu_env *env,
+                                struct lu_device *dev)
+{
+       struct lod_device   *lod = lu2lod_dev(dev);
+       struct lu_device    *child = &lod->lod_child->dd_lu_dev;
+       struct lu_device    *tgt;
+       struct lod_ost_desc *ost;
+       int                  rc;
+       int                  i;
+       ENTRY;
+
+       LASSERT(lod->lod_recovery_completed == 0);
+       lod->lod_recovery_completed = 1;
+
+       /* underlying storage first ... */
+       rc = child->ld_ops->ldo_recovery_complete(env, child);
+
+       /* ... then each OST device, holding a table reference meanwhile */
+       lod_getref(lod);
+       lod_foreach_ost(lod, i) {
+               ost = OST_TGT(lod, i);
+               LASSERT(ost && ost->ltd_ost);
+               tgt = &ost->ltd_ost->dd_lu_dev;
+               rc = tgt->ld_ops->ldo_recovery_complete(env, tgt);
+               if (rc != 0)
+                       CERROR("%s: can't complete recovery on #%d: %d\n",
+                              lod2obd(lod)->obd_name, i, rc);
+       }
+       lod_putref(lod);
+
+       RETURN(rc);
+}
+
+const struct lu_device_operations lod_lu_ops = {
+       .ldo_process_config     = lod_process_config,    /* config commands */
+       .ldo_recovery_complete  = lod_recovery_complete, /* end of recovery */
+};
+
+/* dt_root_get method of LOD: delegated to the underlying child device. */
+static int lod_root_get(const struct lu_env *env,
+                       struct dt_device *dev, struct lu_fid *f)
+{
+       struct lod_device *lod = dt2lod_dev(dev);
+
+       return dt_root_get(env, lod->lod_child, f);
+}
+
+/* dt_statfs method of LOD: delegated to the underlying child device. */
+static int lod_statfs(const struct lu_env *env,
+                     struct dt_device *dev, struct obd_statfs *sfs)
+{
+       struct lod_device *lod = dt2lod_dev(dev);
+
+       return dt_statfs(env, lod->lod_child, sfs);
+}
+
+/* dt_trans_create method of LOD: the handle comes from the child device. */
+static struct thandle *lod_trans_create(const struct lu_env *env,
+                                       struct dt_device *dev)
+{
+       struct lod_device *lod = dt2lod_dev(dev);
+
+       return dt_trans_create(env, lod->lod_child);
+}
+
+/* dt_trans_start method of LOD: delegated to the underlying child device. */
+static int lod_trans_start(const struct lu_env *env, struct dt_device *dev,
+                          struct thandle *th)
+{
+       struct lod_device *lod = dt2lod_dev(dev);
+
+       return dt_trans_start(env, lod->lod_child, th);
+}
+
+/*
+ * dt_trans_stop method of LOD. No dt_device is passed in, so the device
+ * recorded in the handle itself is the one asked to stop the transaction.
+ */
+static int lod_trans_stop(const struct lu_env *env, struct thandle *th)
+{
+       /* XXX: we don't know next device, will be fixed with DNE */
+       struct dt_device *next = th->th_dev;
+
+       return dt_trans_stop(env, next, th);
+}
+
+/*
+ * dt_conf_get method of LOD: the parameters are those of the underlying
+ * child device. The const qualifier of \a dev is cast away because
+ * dt2lod_dev() takes a non-const pointer.
+ */
+static void lod_conf_get(const struct lu_env *env,
+                        const struct dt_device *dev,
+                        struct dt_device_param *param)
+{
+       struct lod_device *lod = dt2lod_dev((struct dt_device *)dev);
+
+       dt_conf_get(env, lod->lod_child, param);
+}
+
+/**
+ * dt_sync method of LOD.
+ *
+ * Syncs every registered OST device and, only if all of them succeeded,
+ * the underlying child device. The walk stops at the first OST failure,
+ * which is logged.
+ *
+ * \retval 0 on success, otherwise the result of the failed dt_sync()
+ */
+static int lod_sync(const struct lu_env *env, struct dt_device *dev)
+{
+       struct lod_device   *lod = dt2lod_dev(dev);
+       struct lod_ost_desc *ost;
+       int                  i;
+       int                  rc = 0;
+       ENTRY;
+
+       lod_getref(lod);
+       lod_foreach_ost(lod, i) {
+               ost = OST_TGT(lod, i);
+               LASSERT(ost && ost->ltd_ost);
+               rc = dt_sync(env, ost->ltd_ost);
+               if (rc != 0) {
+                       CERROR("%s: can't sync %u: %d\n",
+                              lod2obd(lod)->obd_name, i, rc);
+                       break;
+               }
+       }
+       lod_putref(lod);
+
+       /* an OST failed: leave the local storage alone */
+       if (rc != 0)
+               RETURN(rc);
+
+       rc = dt_sync(env, lod->lod_child);
+       RETURN(rc);
+}
+
+/* dt_ro method of LOD: delegated to the underlying child device. */
+static int lod_ro(const struct lu_env *env, struct dt_device *dev)
+{
+       struct lod_device *lod = dt2lod_dev(dev);
+
+       return dt_ro(env, lod->lod_child);
+}
+
+/* dt_commit_async method of LOD: delegated to the underlying child device. */
+static int lod_commit_async(const struct lu_env *env, struct dt_device *dev)
+{
+       struct lod_device *lod = dt2lod_dev(dev);
+
+       return dt_commit_async(env, lod->lod_child);
+}
+
+/*
+ * dt_init_capa_ctxt method of LOD: all arguments are passed through to the
+ * underlying child device.
+ */
+static int lod_init_capa_ctxt(const struct lu_env *env, struct dt_device *dev,
+                             int mode, unsigned long timeout,
+                             __u32 alg, struct lustre_capa_key *keys)
+{
+       return dt_init_capa_ctxt(env, dt2lod_dev(dev)->lod_child,
+                                mode, timeout, alg, keys);
+}
+
+static const struct dt_device_operations lod_dt_ops = {
+       .dt_root_get         = lod_root_get,        /* -> lod_child */
+       .dt_statfs           = lod_statfs,          /* -> lod_child */
+       .dt_trans_create     = lod_trans_create,    /* -> lod_child */
+       .dt_trans_start      = lod_trans_start,     /* -> lod_child */
+       .dt_trans_stop       = lod_trans_stop,      /* -> th->th_dev */
+       .dt_conf_get         = lod_conf_get,        /* -> lod_child */
+       .dt_sync             = lod_sync,            /* OSTs, then lod_child */
+       .dt_ro               = lod_ro,              /* -> lod_child */
+       .dt_commit_async     = lod_commit_async,    /* -> lod_child */
+       .dt_init_capa_ctxt   = lod_init_capa_ctxt,  /* -> lod_child */
+};
+
+/**
+ * Connect LOD to the local OSD device it sits on top of.
+ *
+ * The OSD name is derived from the LOV name found in config buffer 0 by
+ * overwriting it from "-mdtlov" on: with "-osd" for 2.x style names
+ * (those containing "-MDT"), or with "-MDT0000-osd" for 1.8 style names.
+ * On success lod_child_exp, lod_child and the device's ld_site are set.
+ *
+ * \param[in] env  execution environment
+ * \param[in] lod  LOD device being set up
+ * \param[in] cfg  setup command carrying the device name
+ *
+ * \retval 0 on success
+ * \retval -EINVAL if the name can't be parsed
+ * \retval -ENOMEM on allocation failure
+ * \retval -ENOTCONN if the OSD device can't be found
+ * \retval result of obd_connect() if connecting fails
+ */
+static int lod_connect_to_osd(const struct lu_env *env, struct lod_device *lod,
+                             struct lustre_cfg *cfg)
+{
+       struct obd_connect_data *data = NULL;
+       struct obd_device       *obd;
+       char                    *nextdev = NULL, *p, *s;
+       int                      rc, len = 0;
+       ENTRY;
+
+       LASSERT(cfg);
+       LASSERT(lod->lod_child_exp == NULL);
+
+       /* compatibility hack: we still use old config logs
+        * which specify LOV, but we need to learn underlying
+        * OSD device, which is supposed to be:
+        *  <fsname>-MDTxxxx-osd
+        *
+        * 2.x MGS generates lines like the following:
+        *   #03 (176)lov_setup 0:lustre-MDT0000-mdtlov  1:(struct lov_desc)
+        * 1.8 MGS generates lines like the following:
+        *   #03 (168)lov_setup 0:lustre-mdtlov  1:(struct lov_desc)
+        *
+        * we use "-MDT" to differentiate 2.x from 1.8 */
+
+       if ((p = lustre_cfg_string(cfg, 0)) && strstr(p, "-mdtlov")) {
+               /* the 1.8 replacement "-MDT0000-osd" is longer than the
+                * "-mdtlov" it overwrites, so strlen(p) + 1 bytes would be
+                * overrun by the strcpy() below; leave room for the longest
+                * replacement on top of the original name */
+               len = strlen(p) + strlen("-MDT0000-osd") + 1;
+               OBD_ALLOC(nextdev, len);
+               if (nextdev == NULL)
+                       GOTO(out, rc = -ENOMEM);
+
+               strcpy(nextdev, p);
+               s = strstr(nextdev, "-mdtlov");
+               if (unlikely(s == NULL)) {
+                       CERROR("unable to parse device name %s\n",
+                              lustre_cfg_string(cfg, 0));
+                       GOTO(out, rc = -EINVAL);
+               }
+
+               if (strstr(nextdev, "-MDT")) {
+                       /* 2.x config */
+                       strcpy(s, "-osd");
+               } else {
+                       /* 1.8 config */
+                       strcpy(s, "-MDT0000-osd");
+               }
+       } else {
+               CERROR("unable to parse device name %s\n",
+                      lustre_cfg_string(cfg, 0));
+               GOTO(out, rc = -EINVAL);
+       }
+
+       OBD_ALLOC_PTR(data);
+       if (data == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       obd = class_name2obd(nextdev);
+       if (obd == NULL) {
+               CERROR("can not locate next device: %s\n", nextdev);
+               GOTO(out, rc = -ENOTCONN);
+       }
+
+       data->ocd_connect_flags = OBD_CONNECT_VERSION;
+       data->ocd_version = LUSTRE_VERSION_CODE;
+
+       rc = obd_connect(env, &lod->lod_child_exp, obd, &obd->obd_uuid,
+                        data, NULL);
+       if (rc) {
+               CERROR("cannot connect to next dev %s (%d)\n", nextdev, rc);
+               GOTO(out, rc);
+       }
+
+       /* LOD shares the site of the device it is stacked upon */
+       lod->lod_dt_dev.dd_lu_dev.ld_site =
+               lod->lod_child_exp->exp_obd->obd_lu_dev->ld_site;
+       LASSERT(lod->lod_dt_dev.dd_lu_dev.ld_site);
+       lod->lod_child = lu2dt_dev(lod->lod_child_exp->exp_obd->obd_lu_dev);
+
+out:
+       /* both temporaries are released on success and failure alike */
+       if (data)
+               OBD_FREE_PTR(data);
+       if (nextdev)
+               OBD_FREE(nextdev, len);
+       RETURN(rc);
+}
+
+/**
+ * Set up a freshly allocated LOD device.
+ *
+ * Installs the LOD operation tables, connects to the underlying OSD,
+ * caches the maximum EA size it reports, adds a compatibility symlink
+ * under the procfs "lov" directory (when that directory exists and the
+ * device name contains "lov") and initializes the device locks.
+ *
+ * \param[in] env  execution environment
+ * \param[in] lod  device to set up
+ * \param[in] ldt  device type (not used here)
+ * \param[in] cfg  setup command; buffer 0 holds the device name
+ *
+ * \retval 0 on success
+ * \retval -ENODEV if no obd device matches the name in config buffer 0
+ * \retval result of lod_connect_to_osd() if connecting fails
+ */
+static int lod_init0(const struct lu_env *env, struct lod_device *lod,
+                    struct lu_device_type *ldt, struct lustre_cfg *cfg)
+{
+       struct dt_device_param ddp;
+       struct proc_dir_entry *lov_proc_dir;
+       struct obd_device     *obd;
+       int                    rc;
+       ENTRY;
+
+       obd = class_name2obd(lustre_cfg_string(cfg, 0));
+       if (obd == NULL) {
+               CERROR("Cannot find obd with name %s\n",
+                      lustre_cfg_string(cfg, 0));
+               RETURN(-ENODEV);
+       }
+
+       lod->lod_dt_dev.dd_lu_dev.ld_ops = &lod_lu_ops;
+       lod->lod_dt_dev.dd_ops = &lod_dt_ops;
+
+       rc = lod_connect_to_osd(env, lod, cfg);
+       if (rc)
+               RETURN(rc);
+
+       /* goes through lod_conf_get(), i.e. these are the OSD's values */
+       dt_conf_get(env, &lod->lod_dt_dev, &ddp);
+       lod->lod_osd_max_easize = ddp.ddp_max_ea_size;
+
+       /* for compatibility we link old procfs's OSC entries to osp ones */
+       lov_proc_dir = lprocfs_srch(proc_lustre_root, "lov");
+       if (lov_proc_dir) {
+               cfs_proc_dir_entry_t *symlink = NULL;
+               char *name;
+               OBD_ALLOC(name, strlen(obd->obd_name) + 1);
+               if (name) {
+                       strcpy(name, obd->obd_name);
+                       if (strstr(name, "lov"))
+                               symlink = lprocfs_add_symlink(name,
+                                               lov_proc_dir,
+                                               "../lod/%s",
+                                               obd->obd_name);
+                       OBD_FREE(name, strlen(obd->obd_name) + 1);
+                       lod->lod_symlink = symlink;
+               }
+       }
+
+       cfs_mutex_init(&lod->lod_mutex);
+       cfs_init_rwsem(&lod->lod_rw_sem);
+       cfs_spin_lock_init(&lod->lod_desc_lock);
+
+       /* nothing after the connect can fail, so there is no unwind path */
+       RETURN(0);
+}
+
+/**
+ * ldto_device_free method of LOD: finalize the embedded dt_device and
+ * free the LOD device. The device must hold no references (asserted).
+ *
+ * \retval lu_device of the underlying child, computed before the free
+ */
+static struct lu_device *lod_device_free(const struct lu_env *env,
+                                        struct lu_device *lu)
+{
+       struct lod_device *lod = lu2lod_dev(lu);
+       struct lu_device  *next;
+       ENTRY;
+
+       /* has to be taken now: lod is gone after OBD_FREE_PTR() */
+       next = &lod->lod_child->dd_lu_dev;
+
+       LASSERT(cfs_atomic_read(&lu->ld_ref) == 0);
+       dt_device_fini(&lod->lod_dt_dev);
+       OBD_FREE_PTR(lod);
+
+       RETURN(next);
+}
+
+/**
+ * ldto_device_alloc method of LOD: allocate a LOD device, initialize its
+ * embedded dt_device and run lod_init0() on it.
+ *
+ * \retval the new device as lu_device on success
+ * \retval ERR_PTR(-ENOMEM) if the allocation fails
+ * \retval ERR_PTR(rc) if lod_init0() fails with rc; the device is freed
+ */
+static struct lu_device *lod_device_alloc(const struct lu_env *env,
+                                         struct lu_device_type *type,
+                                         struct lustre_cfg *lcfg)
+{
+       struct lod_device *lod;
+       struct lu_device  *lu_dev;
+       int                rc;
+
+       OBD_ALLOC_PTR(lod);
+       if (lod == NULL)
+               return ERR_PTR(-ENOMEM);
+
+       lu_dev = lod2lu_dev(lod);
+       dt_device_init(&lod->lod_dt_dev, type);
+
+       rc = lod_init0(env, lod, type, lcfg);
+       if (rc == 0)
+               return lu_dev;
+
+       /* setup failed: undo the allocation and report the error */
+       lod_device_free(env, lu_dev);
+       return ERR_PTR(rc);
+}
+
+/**
+ * ldto_device_fini method of LOD: remove the compatibility symlink, if
+ * one was created, and disconnect from the underlying storage. A failed
+ * disconnect is only logged.
+ *
+ * \retval NULL always
+ */
+static struct lu_device *lod_device_fini(const struct lu_env *env,
+                                        struct lu_device *d)
+{
+       struct lod_device *lod = lu2lod_dev(d);
+       int                rc;
+       ENTRY;
+
+       if (lod->lod_symlink != NULL)
+               lprocfs_remove(&lod->lod_symlink);
+
+       rc = obd_disconnect(lod->lod_child_exp);
+       if (rc != 0)
+               CERROR("error in disconnect from storage: %d\n", rc);
+
+       RETURN(NULL);
+}
+
+/*
+ * o_connect method of LOD. Exports are used to track all LOD users: each
+ * successful connect creates an export and bumps lod_connects under
+ * lod_mutex. For now exactly one user is expected, which is asserted.
+ */
+static int lod_obd_connect(const struct lu_env *env, struct obd_export **exp,
+                          struct obd_device *obd, struct obd_uuid *cluuid,
+                          struct obd_connect_data *data, void *localdata)
+{
+       struct lod_device    *lod = lu2lod_dev(obd->obd_lu_dev);
+       struct lustre_handle  conn;
+       int                   rc;
+       ENTRY;
+
+       CDEBUG(D_CONFIG, "connect #%d\n", lod->lod_connects);
+
+       rc = class_connect(&conn, obd, cluuid);
+       if (rc != 0)
+               RETURN(rc);
+
+       *exp = class_conn2export(&conn);
+
+       cfs_mutex_lock(&lod->lod_mutex);
+       lod->lod_connects++;
+       /* at the moment we expect the only user */
+       LASSERT(lod->lod_connects == 1);
+       cfs_mutex_unlock(&lod->lod_mutex);
+
+       RETURN(0);
+}
+
+/*
+ * o_disconnect method of LOD. Once the last export (the self-export is
+ * not counted) has disappeared, the lod can be released: the user count
+ * is dropped under lod_mutex and, if it reached zero and the export was
+ * disconnected successfully, the device is cleaned up.
+ */
+static int lod_obd_disconnect(struct obd_export *exp)
+{
+       struct obd_device *obd = exp->exp_obd;
+       struct lod_device *lod = lu2lod_dev(obd->obd_lu_dev);
+       int                rc, release;
+       ENTRY;
+
+       /* Only disconnect the underlying layers on the final disconnect. */
+       cfs_mutex_lock(&lod->lod_mutex);
+       lod->lod_connects--;
+       release = (lod->lod_connects == 0);
+       cfs_mutex_unlock(&lod->lod_mutex);
+
+       if (!release) {
+               /* why should there be more than 1 connect? */
+               CERROR("%s: disconnect #%d\n", exp->exp_obd->obd_name,
+                      lod->lod_connects);
+       }
+
+       rc = class_disconnect(exp); /* bz 9811 */
+
+       /* the last user of lod has gone, let's release the device */
+       if (rc == 0 && release)
+               class_manual_cleanup(obd);
+       RETURN(rc);
+}
+
+LU_KEY_INIT(lod, struct lod_thread_info);
+
+/*
+ * Release the per-thread LOD data: the EA buffer, if one was allocated,
+ * and then the lod_thread_info itself.
+ */
+static void lod_key_fini(const struct lu_context *ctx,
+               struct lu_context_key *key, void *data)
+{
+       struct lod_thread_info *info = data;
+
+       /*
+        * The EA buffer is allocated in lod_get_lov_ea.
+        * XXX: this is overkill, a thread may keep such a buffer although
+        * it used it only once. A pool of such buffers per LOD would
+        * probably be better.
+        */
+       if (info->lti_ea_store != NULL) {
+               OBD_FREE_LARGE(info->lti_ea_store, info->lti_ea_store_size);
+               info->lti_ea_store = NULL;
+               info->lti_ea_store_size = 0;
+       }
+
+       OBD_FREE_PTR(info);
+}
+
+/* context key: lod_thread_key */
+LU_CONTEXT_KEY_DEFINE(lod, LCT_MD_THREAD);
+
+LU_TYPE_INIT_FINI(lod, &lod_thread_key);
+
+static struct lu_device_type_operations lod_device_type_ops = {
+       .ldto_init           = lod_type_init,   /* lod_type_*(): presumably */
+       .ldto_fini           = lod_type_fini,   /* from LU_TYPE_INIT_FINI() */
+
+       .ldto_start          = lod_type_start,
+       .ldto_stop           = lod_type_stop,
+
+       .ldto_device_alloc   = lod_device_alloc, /* allocate + lod_init0() */
+       .ldto_device_free    = lod_device_free,  /* also alloc's error path */
+
+       .ldto_device_fini    = lod_device_fini   /* symlink, OSD disconnect */
+};
+
+static struct lu_device_type lod_device_type = {
+       .ldt_tags     = LU_DEVICE_DT,
+       .ldt_name     = LUSTRE_LOD_NAME,        /* registered in lod_mod_init() */
+       .ldt_ops      = &lod_device_type_ops,
+       .ldt_ctx_tags = LCT_MD_THREAD,          /* same tag as the lod context key */
+};
+
+/**
+ * o_health_check method of LOD.
+ *
+ * Runs obd_health_check() on the registered OSTs until one of them
+ * returns 0: one healthy device is enough.
+ *
+ * \retval 0 as soon as an OST reports 0
+ * \retval 1 if no OST is visited, otherwise the result of the last
+ *         obd_health_check() call
+ */
+static int lod_obd_health_check(const struct lu_env *env,
+               struct obd_device *obd)
+{
+       struct lod_device   *d = lu2lod_dev(obd->obd_lu_dev);
+       struct lod_ost_desc *ost;
+       int                  i, rc = 1;
+       ENTRY;
+
+       LASSERT(d);
+       lod_getref(d);
+       /* walk the OSTs the way every other loop in this file does:
+        * lod_foreach_ost() leaves lod_ost_bitmap alone while
+        * lod_osts_size is 0, a bare cfs_foreach_bit() does not */
+       lod_foreach_ost(d, i) {
+               ost = OST_TGT(d, i);
+               LASSERT(ost && ost->ltd_ost);
+               rc = obd_health_check(env, ost->ltd_exp->exp_obd);
+               /* one healthy device is enough */
+               if (rc == 0)
+                       break;
+       }
+       lod_putref(d);
+       RETURN(rc);
+}
+
+static struct obd_ops lod_obd_device_ops = {
+       .o_owner        = THIS_MODULE,
+       .o_connect      = lod_obd_connect,      /* counts LOD users */
+       .o_disconnect   = lod_obd_disconnect,   /* last user triggers cleanup */
+       .o_health_check = lod_obd_health_check, /* one healthy OST is enough */
+};
+
+/**
+ * Module entry point: create the LOD slab caches, register the LOD obd
+ * type and make sure a "lov" procfs directory exists for compatibility.
+ *
+ * Failing to create the "lov" directory is logged but is not an error.
+ *
+ * \retval 0 on success
+ * \retval result of lu_kmem_init() or class_register_type() on failure
+ */
+static int __init lod_mod_init(void)
+{
+       struct lprocfs_static_vars  lvars = { 0 };
+       cfs_proc_dir_entry_t       *lov_proc_dir;
+       int                         rc;
+
+       rc = lu_kmem_init(lod_caches);
+       if (rc != 0)
+               return rc;
+
+       rc = class_register_type(&lod_obd_device_ops, NULL, lvars.module_vars,
+                                LUSTRE_LOD_NAME, &lod_device_type);
+       if (rc != 0) {
+               lu_kmem_fini(lod_caches);
+               return rc;
+       }
+
+       /* create "lov" entry in procfs for compatibility purposes */
+       lov_proc_dir = lprocfs_srch(proc_lustre_root, "lov");
+       if (lov_proc_dir != NULL)
+               return 0;
+
+       lov_proc_dir = lprocfs_register("lov", proc_lustre_root, NULL, NULL);
+       if (IS_ERR(lov_proc_dir))
+               CERROR("lod: can't create compat entry \"lov\": %d\n",
+                      (int)PTR_ERR(lov_proc_dir));
+
+       return 0;
+}
+
+static void __exit lod_mod_exit(void)
+{
+       /* undo lod_mod_init(): compat "lov" entry, obd type, slab caches */
+       lprocfs_try_remove_proc_entry("lov", proc_lustre_root);
+
+       class_unregister_type(LUSTRE_LOD_NAME);
+       lu_kmem_fini(lod_caches);
+}
+
+MODULE_AUTHOR("Whamcloud, Inc. <http://www.whamcloud.com/>");
+MODULE_DESCRIPTION("Lustre Logical Object Device ("LUSTRE_LOD_NAME")");
+MODULE_LICENSE("GPL");
+
+module_init(lod_mod_init);
+module_exit(lod_mod_exit);
+
diff --git a/lustre/lod/lod_internal.h b/lustre/lod/lod_internal.h
new file mode 100644 (file)
index 0000000..bc6283f
--- /dev/null
@@ -0,0 +1,248 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2009 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lod/lod_internal.h
+ *
+ * Author: Alex Zhuravlev <alexey.zhuravlev@intel.com>
+ * Author: Mikhail Pershin <mike.pershin@intel.com>
+ */
+
+#ifndef _LOD_INTERNAL_H
+#define _LOD_INTERNAL_H
+
+#include <libcfs/libcfs.h>
+#include <obd.h>
+#include <dt_object.h>
+
+#define LOV_USES_ASSIGNED_STRIPE        0
+#define LOV_USES_DEFAULT_STRIPE         1
+
+struct lod_ost_desc {                  /* what OST_TGT() yields per OST */
+       struct dt_device  *ltd_ost;     /* dt device of the target */
+       struct list_head   ltd_kill;
+       struct obd_export *ltd_exp;     /* its exp_obd is health-checked */
+       struct obd_uuid    ltd_uuid;
+       __u32              ltd_gen;
+       __u32              ltd_index;
+       struct ltd_qos     ltd_qos; /* qos info per target */
+       struct obd_statfs  ltd_statfs;
+       unsigned long      ltd_active:1,/* is this target up for requests */
+                          ltd_activate:1,/* should  target be activated */
+                          ltd_reap:1;  /* should this target be deleted */
+};
+
+#define OST_PTRS                256     /* number of pointers at 1st level */
+#define OST_PTRS_PER_BLOCK      256     /* number of pointers at 2nd level */
+
+struct lod_ost_desc_idx {       /* one 2nd-level block of the OST table */
+       struct lod_ost_desc *ldi_ost[OST_PTRS_PER_BLOCK];
+};
+/* OST_TGT(): descriptor of OST #index, found through the two-level table;
+ * neither the index range nor the 2nd-level block pointer is checked */
+#define OST_TGT(dev,index)      \
+       ((dev)->lod_ost_idx[(index) / \
+       OST_PTRS_PER_BLOCK]->ldi_ost[(index)%OST_PTRS_PER_BLOCK])
+
+struct lod_device {
+       struct dt_device      lod_dt_dev;
+       struct obd_export    *lod_child_exp;   /* export of the underlying OSD */
+       struct dt_device     *lod_child;       /* dt device of that OSD */
+       cfs_proc_dir_entry_t *lod_proc_entry;
+       struct lprocfs_stats *lod_stats;
+       int                   lod_connects;    /* users; updated under lod_mutex */
+       int                   lod_recovery_completed; /* set once, see lod_recovery_complete() */
+
+       /* lov settings descriptor storing static information */
+       struct lov_desc       lod_desc;
+
+       /* used to protect ld_active_tgt_count and all ltd_active */
+       cfs_spinlock_t        lod_desc_lock;
+
+       /* two-level table of known OSTs, looked up through OST_TGT() */
+       struct lod_ost_desc_idx *lod_ost_idx[OST_PTRS];
+
+       /* Size of the lod_osts array, guaranteed to be a power of 2;
+        * lod_foreach_ost() visits nothing while it is 0 */
+       __u32                 lod_osts_size;
+       /* number of registered OSTs */
+       int                   lod_ostnr;
+       /* OSTs scheduled to be deleted */
+       __u32                 lod_death_row;
+       /* bitmap of OSTs available */
+       cfs_bitmap_t         *lod_ost_bitmap;
+
+       /* maximum EA size the underlying OSD may have */
+       unsigned int          lod_osd_max_easize;
+
+       /* Table refcount used for delayed deletion */
+       int                   lod_refcount;
+       /* mutex to serialize concurrent updates to the ost table */
+       cfs_mutex_t           lod_mutex;
+       /* read/write semaphore used for array relocation */
+       cfs_rw_semaphore_t    lod_rw_sem;
+
+       /* QoS info per LOD */
+       struct lov_qos        lod_qos; /* qos info per lod */
+
+       /* OST pool data */
+       struct ost_pool       lod_pool_info; /* all OSTs in a packed array */
+       int                   lod_pool_count;
+       cfs_hash_t           *lod_pools_hash_body; /* used for key access */
+       cfs_list_t            lod_pool_list; /* used for sequential access */
+       cfs_proc_dir_entry_t *lod_pool_proc_entry;
+
+       enum lustre_sec_part   lod_sp_me;
+
+       cfs_proc_dir_entry_t *lod_symlink;     /* compat symlink under procfs "lov" */
+};
+
+/*
+ * LOD object: a dt_object plus the striping information LOD keeps for it.
+ *
+ * XXX: shrink this structure, currently it's 72bytes on 32bit arch,
+ *      so, slab will be allocating 128bytes
+ */
+struct lod_object {
+       struct dt_object   ldo_obj;
+
+       /* if object is striped, then the next fields describe stripes */
+       __u16              ldo_stripenr;
+       __u16              ldo_layout_gen;
+       __u32              ldo_stripe_size;
+       char              *ldo_pool;
+       struct dt_object **ldo_stripe;
+       /* to know how much memory to free, ldo_stripenr can be less */
+       int                ldo_stripes_allocated;
+       /* default striping for directory represented by this object
+        * is cached in stripenr/stripe_size.
+        * The two flags are explicitly unsigned: a plain int bit-field of
+        * width 1 may be signed, in which case it holds 0 and -1 and
+        * never compares equal to 1. */
+       unsigned int       ldo_striping_cached:1;
+       unsigned int       ldo_def_striping_set:1;
+       __u32              ldo_def_stripe_size;
+       __u16              ldo_def_stripenr;
+       __u16              ldo_def_stripe_offset;
+};
+
+
+struct lod_thread_info {       /* per-thread data, see lod_env_info() */
+       /* per-thread buffer for LOV EA */
+       void             *lti_ea_store;
+       int               lti_ea_store_size;    /* size lti_ea_store is freed with */
+       struct lu_buf     lti_buf;
+       struct ost_id     lti_ostid;
+       struct lu_fid     lti_fid;
+       struct obd_statfs lti_osfs;
+       struct lu_attr    lti_attr;
+};
+
+extern const struct lu_device_operations lod_lu_ops;
+
+/*
+ * Check whether \a d is a LOD device, judged by its ld_ops pointing at
+ * lod_lu_ops. The check is phrased with ergo(), its premise being that
+ * both \a d and d->ld_ops are non-NULL.
+ */
+static inline int lu_device_is_lod(struct lu_device *d)
+{
+       int is_lod;
+
+       is_lod = ergo(d != NULL && d->ld_ops != NULL,
+                     d->ld_ops == &lod_lu_ops);
+       return is_lod;
+}
+
+/* Convert a lu_device, asserted to pass lu_device_is_lod(), to its lod_device. */
+static inline struct lod_device *lu2lod_dev(struct lu_device *d)
+{
+       struct lod_device *lod;
+
+       LASSERT(lu_device_is_lod(d));
+       lod = container_of0(d, struct lod_device, lod_dt_dev.dd_lu_dev);
+       return lod;
+}
+
+/* The lu_device embedded (via its dt_device) in a lod_device. */
+static inline struct lu_device *lod2lu_dev(struct lod_device *d)
+{
+       struct dt_device *dt = &d->lod_dt_dev;
+
+       return &dt->dd_lu_dev;
+}
+
+/* The obd_device recorded in the lu_device of a lod_device. */
+static inline struct obd_device *lod2obd(struct lod_device *d)
+{
+       return lod2lu_dev(d)->ld_obd;
+}
+
+/* Convert a dt_device, asserted to pass lu_device_is_lod(), to its lod_device. */
+static inline struct lod_device *dt2lod_dev(struct dt_device *d)
+{
+       struct lod_device *lod;
+
+       LASSERT(lu_device_is_lod(&d->dd_lu_dev));
+       lod = container_of0(d, struct lod_device, lod_dt_dev);
+       return lod;
+}
+
+/*
+ * Convert a lu_object to the lod_object containing it. A non-NULL object
+ * is asserted to belong to a device that passes lu_device_is_lod().
+ */
+static inline struct lod_object *lu2lod_obj(struct lu_object *o)
+{
+       struct lod_object *obj;
+
+       LASSERT(ergo(o != NULL, lu_device_is_lod(o->lo_dev)));
+       obj = container_of0(o, struct lod_object, ldo_obj.do_lu);
+       return obj;
+}
+
+/* The lu_object embedded (via its dt_object) in a lod_object. */
+static inline struct lu_object *lod2lu_obj(struct lod_object *obj)
+{
+       struct dt_object *dt = &obj->ldo_obj;
+
+       return &dt->do_lu;
+}
+
+/*
+ * Convert a const lu_object to the lod_object containing it; the object's
+ * device is asserted to pass lu_device_is_lod().
+ */
+static inline struct lod_object *lod_obj(const struct lu_object *o)
+{
+       struct lod_object *obj;
+
+       LASSERT(lu_device_is_lod(o->lo_dev));
+       obj = container_of0(o, struct lod_object, ldo_obj.do_lu);
+       return obj;
+}
+
+/* Convert a dt_object to its lod_object by way of lod_obj(). */
+static inline struct lod_object *lod_dt_obj(const struct dt_object *d)
+{
+       const struct lu_object *lu = &d->do_lu;
+
+       return lod_obj(lu);
+}
+
+/*
+ * The dt_object containing the lu_object that lu_object_next() returns
+ * for this lod_object.
+ */
+static inline struct dt_object *lod_object_child(struct lod_object *o)
+{
+       return container_of0(lu_object_next(&o->ldo_obj.do_lu),
+                            struct dt_object, do_lu);
+}
+
+/*
+ * Convert a lu_object to the dt_object containing it. A non-NULL object
+ * is asserted to belong to a device that passes lu_device_is_dt().
+ */
+static inline struct dt_object *lu2dt_obj(struct lu_object *o)
+{
+       struct dt_object *dt;
+
+       LASSERT(ergo(o != NULL, lu_device_is_dt(o->lo_dev)));
+       dt = container_of0(o, struct dt_object, do_lu);
+       return dt;
+}
+
+/*
+ * The dt_object containing the lu_object that lu_object_next() returns
+ * for this dt_object.
+ */
+static inline struct dt_object *dt_object_child(struct dt_object *o)
+{
+       return container_of0(lu_object_next(&o->do_lu),
+                            struct dt_object, do_lu);
+}
+
+extern struct lu_context_key lod_thread_key;
+
+/*
+ * Fetch the LOD per-thread data registered under lod_thread_key from the
+ * context of \a env; the data is asserted to exist.
+ */
+static inline struct lod_thread_info *lod_env_info(const struct lu_env *env)
+{
+       struct lod_thread_info *info = lu_context_key_get(&env->le_ctx,
+                                                          &lod_thread_key);
+
+       LASSERT(info);
+       return info;
+}
+
+/*
+ * Iterate \a index over the bits set in lod_ost_bitmap; nothing is
+ * visited while lod_osts_size is 0.
+ *
+ * The empty-table test is written as "if (...) { } else <loop>" so that
+ * the macro already owns its else branch: an "else" that follows the
+ * loop in the caller can't be captured by the macro's hidden "if".
+ */
+#define lod_foreach_ost(__dev, index)  \
+       if ((__dev)->lod_osts_size == 0) { \
+       } else \
+               cfs_foreach_bit((__dev)->lod_ost_bitmap, (index))
+
+/* lod_lov.c */
+void lod_getref(struct lod_device *lod);
+void lod_putref(struct lod_device *lod);
+
+
+#endif
+
diff --git a/lustre/lod/lod_lov.c b/lustre/lod/lod_lov.c
new file mode 100644 (file)
index 0000000..5973a67
--- /dev/null
@@ -0,0 +1,112 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2009 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * lustre/lod/lod_lov.c
+ *
+ * Author: Alex Zhuravlev <alexey.zhuravlev@intel.com> 
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_MDS
+
+#include <obd_class.h>
+#include <obd_lov.h>
+
+#include "lod_internal.h"
+
+/*
+ * Keep a refcount of lod->lod_osts usage to prevent racing with
+ * addition/deletion. Any function that expects the lod OST table to remain
+ * stationary must take a ref.
+ *
+ * On return lod_rw_sem is still held for read; the matching lod_putref()
+ * is what releases it.  lod_refcount is updated under lod_mutex.
+ *
+ * \param lod - is the lod device from which we want to grab a reference
+ */
+void lod_getref(struct lod_device *lod)
+{
+       /* read hold is intentionally kept past return, see lod_putref() */
+       cfs_down_read(&lod->lod_rw_sem);
+       cfs_mutex_lock(&lod->lod_mutex);
+       lod->lod_refcount++;
+       cfs_mutex_unlock(&lod->lod_mutex);
+}
+
+/*
+ * Companion of lod_getref(): drops one reference on the lod OST table and
+ * releases the read hold on lod_rw_sem.
+ *
+ * When the count reaches zero and lod_death_row is non-zero, every OST
+ * descriptor flagged ltd_reap is unhooked from the table while lod_mutex is
+ * held.  Once both locks are dropped, each unhooked descriptor is
+ * disconnected from its export and freed.
+ *
+ * \param lod - is the lod device from which we release a reference
+ */
+void lod_putref(struct lod_device *lod)
+{
+       struct lod_ost_desc *ost_desc, *next;
+       int                  bit;
+       CFS_LIST_HEAD(kill);
+
+       cfs_mutex_lock(&lod->lod_mutex);
+       lod->lod_refcount--;
+
+       if (lod->lod_refcount == 0 && lod->lod_death_row) {
+               CDEBUG(D_CONFIG, "destroying %d lod desc\n",
+                      lod->lod_death_row);
+
+               /* collect every descriptor marked for reaping on "kill" */
+               cfs_foreach_bit(lod->lod_ost_bitmap, bit) {
+                       ost_desc = OST_TGT(lod, bit);
+                       LASSERT(ost_desc);
+
+                       if (!ost_desc->ltd_reap)
+                               continue;
+
+                       cfs_list_add(&ost_desc->ltd_kill, &kill);
+                       /* XXX: remove from the pool */
+                       OST_TGT(lod, bit) = NULL;
+                       lod->lod_ostnr--;
+                       cfs_bitmap_clear(lod->lod_ost_bitmap, bit);
+                       if (ost_desc->ltd_active)
+                               lod->lod_desc.ld_active_tgt_count--;
+                       lod->lod_death_row--;
+               }
+       }
+
+       /* locks are released before any descriptor is torn down */
+       cfs_mutex_unlock(&lod->lod_mutex);
+       cfs_up_read(&lod->lod_rw_sem);
+
+       /* "kill" is empty, and this loop a no-op, when nothing was reaped */
+       cfs_list_for_each_entry_safe(ost_desc, next, &kill, ltd_kill) {
+               int rc;
+
+               cfs_list_del(&ost_desc->ltd_kill);
+               /* XXX: remove from QoS structures */
+               /* disconnect from OSP */
+               rc = obd_disconnect(ost_desc->ltd_exp);
+               if (rc)
+                       CERROR("%s: failed to disconnect %s (%d)\n",
+                              lod2obd(lod)->obd_name,
+                              obd_uuid2str(&ost_desc->ltd_uuid), rc);
+               OBD_FREE_PTR(ost_desc);
+       }
+}
+