Whamcloud - gitweb
b=8341
authornathan <nathan>
Thu, 27 Apr 2006 23:51:36 +0000 (23:51 +0000)
committernathan <nathan>
Thu, 27 Apr 2006 23:51:36 +0000 (23:51 +0000)
Land mountconf on b1_5

134 files changed:
lustre/Makefile.in
lustre/autoMakefile.am
lustre/autoconf/lustre-core.m4
lustre/include/liblustre.h
lustre/include/linux/Makefile.am
lustre/include/linux/lustre_disk.h [deleted file]
lustre/include/linux/lustre_fsfilt.h
lustre/include/lustre/liblustreapi.h
lustre/include/lustre/lustre_idl.h
lustre/include/lustre/lustre_user.h
lustre/include/lustre_cfg.h
lustre/include/lustre_disk.h [new file with mode: 0644]
lustre/include/lustre_export.h
lustre/include/lustre_import.h
lustre/include/lustre_lib.h
lustre/include/lustre_log.h
lustre/include/lustre_net.h
lustre/include/lustre_param.h [new file with mode: 0644]
lustre/include/obd.h
lustre/include/obd_class.h
lustre/include/obd_support.h
lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-i686-smp.config
lustre/ldlm/ldlm_inodebits.c
lustre/ldlm/ldlm_internal.h
lustre/ldlm/ldlm_lib.c
lustre/ldlm/ldlm_lockd.c
lustre/liblustre/llite_lib.c
lustre/liblustre/llite_lib.h
lustre/liblustre/super.c
lustre/llite/llite_internal.h
lustre/llite/llite_lib.c
lustre/llite/rw.c
lustre/llite/super.c
lustre/llite/super25.c
lustre/lov/lov_log.c
lustre/lov/lov_obd.c
lustre/lvfs/fsfilt.c
lustre/lvfs/fsfilt_ext3.c
lustre/lvfs/lvfs_linux.c
lustre/mdc/mdc_request.c
lustre/mds/handler.c
lustre/mds/mds_fs.c
lustre/mds/mds_internal.h
lustre/mds/mds_log.c
lustre/mds/mds_lov.c
lustre/mds/mds_open.c
lustre/mgc/.cvsignore [new file with mode: 0644]
lustre/mgc/Makefile.in [new file with mode: 0644]
lustre/mgc/autoMakefile.am [new file with mode: 0644]
lustre/mgc/mgc_request.c [new file with mode: 0644]
lustre/mgs/.cvsignore [new file with mode: 0644]
lustre/mgs/Makefile.in [new file with mode: 0644]
lustre/mgs/autoMakefile.am [new file with mode: 0644]
lustre/mgs/lproc_mgs.c [new file with mode: 0644]
lustre/mgs/mgs_fs.c [new file with mode: 0644]
lustre/mgs/mgs_handler.c [new file with mode: 0644]
lustre/mgs/mgs_internal.h [new file with mode: 0644]
lustre/mgs/mgs_llog.c [new file with mode: 0644]
lustre/obdclass/Makefile.in
lustre/obdclass/class_obd.c
lustre/obdclass/genops.c
lustre/obdclass/llog.c
lustre/obdclass/llog_ioctl.c
lustre/obdclass/llog_lvfs.c
lustre/obdclass/llog_obd.c
lustre/obdclass/llog_swab.c
lustre/obdclass/lustre_peer.c
lustre/obdclass/obd_config.c
lustre/obdclass/obd_mount.c [new file with mode: 0644]
lustre/obdclass/uuid.c
lustre/obdfilter/filter.c
lustre/obdfilter/filter_internal.h
lustre/osc/osc_request.c
lustre/ost/ost_handler.c
lustre/ptlrpc/import.c
lustre/ptlrpc/llog_client.c
lustre/ptlrpc/pack_generic.c
lustre/ptlrpc/pinger.c
lustre/ptlrpc/ptlrpc_module.c
lustre/ptlrpc/recover.c
lustre/ptlrpc/service.c
lustre/quota/quota_check.c
lustre/tests/Makefile.am
lustre/tests/acceptance-small.sh
lustre/tests/cfg/insanity-local.sh
lustre/tests/cfg/insanity-ltest.sh
lustre/tests/cfg/local.sh
lustre/tests/cfg/mdev.sh
lustre/tests/conf-sanity.sh
lustre/tests/insanity.sh
lustre/tests/llmount.sh
lustre/tests/llmountcleanup.sh
lustre/tests/llrmount.sh [deleted file]
lustre/tests/local.sh
lustre/tests/lov.sh
lustre/tests/mmap_sanity.c
lustre/tests/mountconf.sh [new file with mode: 0755]
lustre/tests/oos.sh
lustre/tests/oos2.sh
lustre/tests/recovery-small.sh
lustre/tests/replay-dual.sh
lustre/tests/replay-ost-single.sh
lustre/tests/replay-single.sh
lustre/tests/runtests
lustre/tests/sanity-quota.sh
lustre/tests/sanity.sh
lustre/tests/sanityN.sh
lustre/tests/test-framework.sh
lustre/utils/.cvsignore
lustre/utils/Makefile.am
lustre/utils/cluster_scripts/1uml.csv [new file with mode: 0644]
lustre/utils/cluster_scripts/cluster_config.sh [new file with mode: 0755]
lustre/utils/cluster_scripts/gen_clumanager_config.sh [new file with mode: 0755]
lustre/utils/cluster_scripts/gen_hb_config.sh [new file with mode: 0755]
lustre/utils/cluster_scripts/module_config.sh [new file with mode: 0755]
lustre/utils/cluster_scripts/verify_cluster_net.sh [new file with mode: 0755]
lustre/utils/cluster_scripts/verify_serviceIP.sh [new file with mode: 0755]
lustre/utils/ha_assist.sh [deleted file]
lustre/utils/ha_assist2.sh [deleted file]
lustre/utils/lconf
lustre/utils/lctl.c
lustre/utils/lfs.c
lustre/utils/liblustreapi.c
lustre/utils/llog_reader.c
lustre/utils/load_ldap.sh [deleted file]
lustre/utils/lustre_cfg.c
lustre/utils/mkfs_lustre.c [new file with mode: 0644]
lustre/utils/module_setup.sh
lustre/utils/mount_lustre.c [new file with mode: 0644]
lustre/utils/obd.c
lustre/utils/obdctl.h
lustre/utils/rmmod_all.sh
lustre/utils/wirecheck.c
lustre/utils/wiretest.c

index 6da79a1..1b7a9be 100644 (file)
@@ -6,9 +6,10 @@ subdir-m += lov
 subdir-m += ptlrpc
 subdir-m += osc
 subdir-m += obdecho
+subdir-m += mgc
 
-@SERVER_TRUE@subdir-m += mds obdfilter ost
-@CLIENT_TRUE@subdir-m += mdc llite
+@SERVER_TRUE@subdir-m += mds obdfilter ost mgs
+@CLIENT_TRUE@subdir-m += mdc llite 
 @QUOTA_TRUE@subdir-m += quota
 
 @INCLUDE_RULES@
index 27db5ce..368c081 100644 (file)
@@ -6,9 +6,9 @@
 AUTOMAKE_OPTIONS = foreign
 
 ALWAYS_SUBDIRS := include lvfs obdclass ldlm ptlrpc osc lov obdecho \
-       doc utils tests conf scripts autoconf
+       mgc doc utils tests conf scripts autoconf
 
-SERVER_SUBDIRS := ldiskfs obdfilter ost mds
+SERVER_SUBDIRS := ldiskfs obdfilter ost mds mgs
 
 CLIENT_SUBDIRS := mdc llite
 
index 638763a..d695f43 100644 (file)
@@ -702,6 +702,10 @@ lustre/osc/Makefile
 lustre/osc/autoMakefile
 lustre/ost/Makefile
 lustre/ost/autoMakefile
+lustre/mgc/Makefile
+lustre/mgc/autoMakefile
+lustre/mgs/Makefile
+lustre/mgs/autoMakefile
 lustre/ptlrpc/Makefile
 lustre/ptlrpc/autoMakefile
 lustre/quota/Makefile
index d35d750..19fad73 100644 (file)
@@ -318,13 +318,15 @@ static inline void spin_unlock_irqrestore(spinlock_t *a, unsigned long b) {}
 
 #ifndef min_t
 #define min_t(type,x,y) \
-       ({ type __x = (x); type __y = (y); __x < __y ? __x: __y; })
+        ({ type __x = (x); type __y = (y); __x < __y ? __x: __y; })
 #endif
 #ifndef max_t
 #define max_t(type,x,y) \
-       ({ type __x = (x); type __y = (y); __x > __y ? __x: __y; })
+        ({ type __x = (x); type __y = (y); __x > __y ? __x: __y; })
 #endif
 
+#define simple_strtol strtol
+
 /* registering symbols */
 #ifndef ERESTARTSYS
 #define ERESTARTSYS ERESTART
@@ -664,7 +666,7 @@ static inline int schedule_timeout(signed long t)
 })
 #define time_after(a, b) ((long)(b) - (long)(a) < 0)
 #define time_before(a, b) time_after(b,a)
-#define time_after_eq(a,b)     ((long)(a) - (long)(b) >= 0)
+#define time_after_eq(a,b)      ((long)(a) - (long)(b) >= 0)
 
 struct timer_list {
         struct list_head tl_list;
index 1f6af34..1d58500 100644 (file)
@@ -13,4 +13,5 @@ EXTRA_DIST = lprocfs_status.h lustre_debug.h lustre_lib.h \
   lustre_dlm.h lustre_handles.h lustre_net.h obd_class.h obd_support.h  \
   lustre_log.h lustre_compat25.h lustre_fsfilt.h lustre_mds.h obd.h \
   lvfs.h lvfs_linux.h lustre_lite.h  lustre_quota.h \
-  lustre_disk.h lustre_user.h lustre_types.h
+  lustre_disk.h lustre_user.h lustre_types.h lustre_param.h
+
diff --git a/lustre/include/linux/lustre_disk.h b/lustre/include/linux/lustre_disk.h
deleted file mode 100644 (file)
index 43cfba2..0000000
+++ /dev/null
@@ -1,55 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- *   This file is part of Lustre, http://www.lustre.org
- *
- * Lustre disk format definitions.
- */
-#ifndef _LUSTRE_DISK_H
-#define _LUSTRE_DISK_H_
-
-#include <linux/types.h>
-
-#include <lnet/types.h>
-
-/****************** last_rcvd file *********************/
-
-#define LAST_RCVD "last_rcvd"
-#define LOV_OBJID "lov_objid"
-
-#define LR_SERVER_SIZE   512
-#define LR_CLIENT_START 8192
-#define LR_CLIENT_SIZE   128
-#if LR_CLIENT_START < LR_SERVER_SIZE
-#error "Can't have LR_CLIENT_START < LR_SERVER_SIZE"
-#endif
-/* This limit is arbitrary (32k clients on x86), but it is convenient to use
- * 2^n * PAGE_SIZE * 8 for the number of bits that fit an order-n allocation. */
-#define LR_MAX_CLIENTS (PAGE_SIZE * 8)
-
-#define OBD_COMPAT_OST          0x00000002 /* this is an OST (temporary) */
-#define OBD_COMPAT_MDT          0x00000004 /* this is an MDT (temporary) */
-
-#define OBD_ROCOMPAT_LOVOBJID   0x00000001 /* MDS handles LOV_OBJID file */
-#define OBD_ROCOMPAT_CROW       0x00000002 /* OST will CROW create objects */
-
-#define OBD_INCOMPAT_GROUPS     0x00000001 /* OST handles group subdirs */
-#define OBD_INCOMPAT_OST        0x00000002 /* this is an OST (permanent) */
-#define OBD_INCOMPAT_MDT        0x00000004 /* this is an MDT (permanent) */
-
-/* Data stored per client in the last_rcvd file.  In le32 order. */
-struct lsd_client_data {
-        __u8 lcd_uuid[40];      /* client UUID */
-        __u64 lcd_last_transno; /* last completed transaction ID */
-        __u64 lcd_last_xid;     /* xid for the last transaction */
-        __u32 lcd_last_result;  /* result from last RPC */
-        __u32 lcd_last_data;    /* per-op data (disposition for open &c.) */
-        /* for MDS_CLOSE requests */
-        __u64 lcd_last_close_transno; /* last completed transaction ID */
-        __u64 lcd_last_close_xid;     /* xid for the last transaction */
-        __u32 lcd_last_close_result;  /* result from last RPC */
-        __u32 lcd_last_close_data;    /* per-op data */
-        __u8 lcd_padding[LR_CLIENT_SIZE - 88];
-};
-
-#endif /* _LUSTRE_DISK_H_ */
index 7bc0602..84e9af9 100644 (file)
@@ -49,7 +49,8 @@ struct fsfilt_operations {
         struct list_head fs_list;
         struct module *fs_owner;
         char   *fs_type;
-        char   *(* fs_label)(struct super_block *sb);
+        char   *(* fs_getlabel)(struct super_block *sb);
+        int     (* fs_setlabel)(struct super_block *sb, char *label);
         char   *(* fs_uuid)(struct super_block *sb);
         void   *(* fs_start)(struct inode *inode, int op, void *desc_private,
                              int logs);
@@ -113,14 +114,23 @@ extern void fsfilt_unregister_ops(struct fsfilt_operations *fs_ops);
 extern struct fsfilt_operations *fsfilt_get_ops(const char *type);
 extern void fsfilt_put_ops(struct fsfilt_operations *fs_ops);
 
-static inline char *fsfilt_label(struct obd_device *obd, struct super_block *sb)
+static inline char *fsfilt_get_label(struct obd_device *obd,
+                                     struct super_block *sb)
 {
-        if (obd->obd_fsops->fs_label == NULL)
+        if (obd->obd_fsops->fs_getlabel == NULL)
                 return NULL;
-        if (obd->obd_fsops->fs_label(sb)[0] == '\0')
+        if (obd->obd_fsops->fs_getlabel(sb)[0] == '\0')
                 return NULL;
 
-        return obd->obd_fsops->fs_label(sb);
+        return obd->obd_fsops->fs_getlabel(sb);
+}
+
+static inline int fsfilt_set_label(struct obd_device *obd,
+                                   struct super_block *sb, char *label)
+{
+        if (obd->obd_fsops->fs_setlabel == NULL)
+                return -ENOSYS;
+        return (obd->obd_fsops->fs_setlabel(sb, label));
 }
 
 static inline __u8 *fsfilt_uuid(struct obd_device *obd, struct super_block *sb)
index 557c3ab..08f8786 100644 (file)
@@ -23,7 +23,7 @@ extern int llapi_ping(char *obd_type, char *obd_name);
 extern int llapi_target_check(int num_types, char **obd_types, char *dir);
 extern int llapi_catinfo(char *dir, char *keyword, char *node_name);
 extern int llapi_lov_get_uuids(int fd, struct obd_uuid *uuidp, int *ost_count);
-extern int llapi_is_lustre_mnttype(char *type);
+extern int llapi_is_lustre_mnttype(struct mntent *mnt);
 extern int llapi_quotachown(char *path, int flag);
 extern int llapi_quotacheck(char *mnt, int check_type);
 extern int llapi_poll_quotacheck(char *mnt, struct if_quotacheck *qchk);
index 438402c..d7322cf 100644 (file)
@@ -43,6 +43,8 @@
 #error Unsupported operating system.
 #endif
 
+#include <lnet/types.h>   /* for lnet_nid_t */
+
 /* Defn's shared with user-space. */
 #include <lustre/lustre_user.h>
 
@@ -86,6 +88,9 @@
 #define MDS_SETATTR_PORTAL             22
 #define MDS_READPAGE_PORTAL            23
 
+#define MGC_REPLY_PORTAL               25
+#define MGS_REQUEST_PORTAL             26
+#define MGS_REPLY_PORTAL               27
 #define OST_REQUEST_PORTAL             28
 
 #define SVC_KILLED               1
 #define LUSTRE_OST_VERSION  0x00030000
 #define LUSTRE_DLM_VERSION  0x00040000
 #define LUSTRE_LOG_VERSION  0x00050000
+#define LUSTRE_MGS_VERSION  0x00060000
+
 
 struct lustre_handle {
         __u64 cookie;
@@ -225,6 +232,8 @@ static inline void lustre_msg_set_op_flags(struct lustre_msg *msg, int flags)
 #define OBD_CONNECT_IBITS     0x1000ULL /* support for inodebits locks */
 #define OBD_CONNECT_JOIN      0x2000ULL /* files can be concatenated */
 #define OBD_CONNECT_NODEVOH   0x8000ULL /* No open handle for special nodes */
+#define OBD_CONNECT_EMPTY 0x80000000ULL /* fake: these are empty connect flags*/
+
 /* also update obd_connect_names[] for lprocfs_rd_connect_flags() */
 
 #define MDS_CONNECT_SUPPORTED  (OBD_CONNECT_RDONLY | OBD_CONNECT_VERSION | \
@@ -235,6 +244,7 @@ static inline void lustre_msg_set_op_flags(struct lustre_msg *msg, int flags)
                                 OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \
                                 OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_INDEX)
 #define ECHO_CONNECT_SUPPORTED (0)
+#define MGS_CONNECT_SUPPORTED  (OBD_CONNECT_VERSION)
 
 #define OBD_OCD_VERSION(major,minor,patch,fix) (((major)<<24) + ((minor)<<16) +\
                                                 ((patch)<<8) + (fix))
@@ -964,6 +974,57 @@ struct ldlm_reply {
 
 extern void lustre_swab_ldlm_reply (struct ldlm_reply *r);
 
+
+/*
+ * Opcodes for mountconf (mgs and mgc)
+ */
+typedef enum {
+        MGS_CONNECT = 250,
+        MGS_DISCONNECT,
+        MGS_EXCEPTION,         /* node died, etc. */
+        MGS_TARGET_REG,        /* whenever target starts up */
+        MGS_TARGET_DEL,
+        MGS_LAST_OPC
+} mgs_cmd_t;
+
+#define MTI_NAME_MAXLEN 64
+#define MTI_UUID_MAXLEN MTI_NAME_MAXLEN + 5
+/* each host can have multiple nids, and multiple failover hosts, and I don't
+   want to run out of room... */
+#define MTI_NIDS_MAX 64 /* match lustre_disk.h */
+
+struct mgs_target_info {
+        char             mti_fsname[MTI_NAME_MAXLEN];
+        char             mti_svname[MTI_NAME_MAXLEN];
+        char             mti_uuid[sizeof(struct obd_uuid)];
+        lnet_nid_t       mti_nids[MTI_NIDS_MAX];     /* host nids */
+        lnet_nid_t       mti_failnids[MTI_NIDS_MAX]; /* partner nids */
+        __u16            mti_failnodes[8];  /* last nid index of each partner */
+        __u32            mti_stripe_index;
+        __u32            mti_nid_count;
+        __u32            mti_failnid_count;
+        __u32            mti_config_ver;
+        __u32            mti_flags;
+        char             mti_params[2048];
+};
+
+extern void lustre_swab_mgs_target_info(struct mgs_target_info *oinfo);
+
+#define CM_START       0x01
+#define CM_END         0x02
+#define CM_SKIP        0x04
+#define CM_UPGRADE146  0x08
+#define CM_START_SKIP (CM_START | CM_SKIP)
+
+struct cfg_marker {
+        __u32             cm_step;  /* aka config version */
+        __u32             cm_flags;
+        time_t            cm_createtime; /*when this record was first created */
+        time_t            cm_canceltime; /*when this record is no longer valid*/
+        char              cm_svname[16];
+        char              cm_comment[40];
+};
+
 /*
  * Opcodes for multiple servers.
  */
index 1d226ea..a9b9812 100644 (file)
@@ -128,6 +128,19 @@ static inline void obd_str2uuid(struct obd_uuid *uuid, char *tmp)
         uuid->uuid[sizeof(*uuid) - 1] = '\0';
 }
 
+/* For printf's only, make sure uuid is terminated */
+static inline char *obd_uuid2str(struct obd_uuid *uuid) 
+{
+        if (uuid->uuid[sizeof(*uuid) - 1] != '\0') {
+                /* Obviously not safe, but for printfs, no real harm done...*/
+                static char temp[sizeof(*uuid)];
+                memcpy(temp, uuid->uuid, sizeof(*uuid));
+                temp[sizeof(*uuid) - 1] = '\0';
+                return temp;
+        }
+        return (char *)(uuid->uuid);
+}
+
 #define LUSTRE_Q_QUOTAON  0x800002     /* turn quotas on */
 #define LUSTRE_Q_QUOTAOFF 0x800003     /* turn quotas off */
 #define LUSTRE_Q_GETINFO  0x800005     /* get information about quota files */
index cd13b97..c75b49d 100644 (file)
@@ -33,6 +33,9 @@
 #define LCFG_HDR_SIZE(count) \
     size_round(offsetof (struct lustre_cfg, lcfg_buflens[(count)]))
 
+/* If not LCFG_REQUIRED, we can ignore this cmd and go on. */
+#define LCFG_REQUIRED         0x0001000
+
 enum lcfg_command_type {
         LCFG_ATTACH         = 0x00cf001,
         LCFG_DETACH         = 0x00cf002,
@@ -48,8 +51,11 @@ enum lcfg_command_type {
         LCFG_DEL_CONN       = 0x00cf00c,
         LCFG_LOV_ADD_OBD    = 0x00cf00d,
         LCFG_LOV_DEL_OBD    = 0x00cf00e,
-        LCFG_PARAM          = 0x00cf00f,
-        LCFG_MARKER         = 0x00cf010
+        LCFG_PARAM          = 0x00ce00f,
+        LCFG_MARKER         = 0x00ce010,
+        LCFG_LOG_START      = 0x00ce011,
+        LCFG_LOG_END        = 0x00ce012,
+        LCFG_LOV_ADD_INA    = 0x00ce013,
 };
 
 struct lustre_cfg_bufs {
@@ -151,9 +157,14 @@ static inline char *lustre_cfg_string(struct lustre_cfg *lcfg, int index)
                 return NULL;
 
         /* make sure it's NULL terminated, even if this kills a char
-         * of data
+         * of data.  Try to use the padding first though.
          */
-        s[lcfg->lcfg_buflens[index] - 1] = '\0';
+        if (s[lcfg->lcfg_buflens[index] - 1] != '\0') {
+                int last = min((int)lcfg->lcfg_buflens[index], 
+                               size_round(lcfg->lcfg_buflens[index]) - 1);
+                s[last] = '\0';
+                CWARN("Truncating buf %d to '%s'\n", index, s);
+        }
         return s;
 }
 
@@ -223,6 +234,7 @@ static inline int lustre_cfg_sanity_check(void *buf, int len)
 
         if (lcfg->lcfg_version != LUSTRE_CFG_VERSION)
                 RETURN(-EINVAL);
+        
         if (lcfg->lcfg_bufcount >= LUSTRE_CFG_MAX_BUFCOUNT)
                 RETURN(-EINVAL);
 
@@ -237,50 +249,4 @@ static inline int lustre_cfg_sanity_check(void *buf, int len)
         RETURN(0);
 }
 
-
-#define LMD_MAGIC       0xbdacbd03
-#define LMD_MAGIC_MASK (0xffffff00 & LMD_MAGIC)
-
-#define lmd_bad_magic(LMDP)                                             \
-({                                                                      \
-        struct lustre_mount_data *_lmd__ = (LMDP);                      \
-        int _ret__ = 0;                                                 \
-        if (!_lmd__) {                                                  \
-                LCONSOLE_ERROR("Missing mount data: "                   \
-                       "check that /sbin/mount.lustre is installed.\n");\
-                _ret__ = 1;                                             \
-        } else if (_lmd__->lmd_magic == LMD_MAGIC) {                    \
-                _ret__ = 0;                                             \
-        } else if ((_lmd__->lmd_magic & LMD_MAGIC_MASK) == LMD_MAGIC_MASK) { \
-                LCONSOLE_ERROR("You're using an old version of "        \
-                       "/sbin/mount.lustre.  Please install version "   \
-                       "1.%d\n", LMD_MAGIC & 0xFF);                     \
-                _ret__ = 1;                                             \
-        } else {                                                        \
-                LCONSOLE_ERROR("Invalid mount data (%#x != %#x): "      \
-                       "check that /sbin/mount.lustre is installed\n",  \
-                       _lmd__->lmd_magic, LMD_MAGIC);                   \
-                _ret__ = 1;                                             \
-        }                                                               \
-        _ret__;                                                         \
-})
-
-#define MAX_FAILOVER_NIDS 10
-
-/* Passed by mount */
-/* Any changes in the alignment of elements in this stuct require a change to
-   LMD_MAGIC */
-struct lustre_mount_data {
-        uint32_t   lmd_magic;
-        uint32_t   lmd_flags;
-        uint16_t   lmd_nid_count; /* how many failover nids we have for the MDS */
-        lnet_nid_t lmd_nid[MAX_FAILOVER_NIDS];
-        char       lmd_mds[64];
-        char       lmd_profile[64];
-};
-
-#define LMD_FLG_FLOCK           0x0001
-#define LMD_FLG_USER_XATTR      0x0002
-#define LMD_FLG_ACL             0x0004
-
 #endif // _LUSTRE_CFG_H
diff --git a/lustre/include/lustre_disk.h b/lustre/include/lustre_disk.h
new file mode 100644 (file)
index 0000000..8430107
--- /dev/null
@@ -0,0 +1,307 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
+ *   Author: Nathan Rutman <nathan@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ *
+ * Lustre disk format definitions.
+ */
+
+#ifndef _LUSTRE_DISK_H
+#define _LUSTRE_DISK_H
+
+#include <linux/types.h>
+#include <lnet/types.h>
+
+
+/****************** persistent mount data *********************/
+
+/* Persistent mount data are stored on the disk in this file.
+   Used before the setup llog can be read. */
+#define MOUNT_CONFIGS_DIR "CONFIGS"
+#define MOUNT_DATA_FILE   MOUNT_CONFIGS_DIR"/mountdata"
+#define MDT_LOGS_DIR      "LOGS"  /* COMPAT_146 */
+
+#define LDD_F_SV_TYPE_MDT   0x0001
+#define LDD_F_SV_TYPE_OST   0x0002
+#define LDD_F_SV_TYPE_MGS   0x0004
+#define LDD_F_NEED_INDEX    0x0010 /* need an index assignment */
+#define LDD_F_VIRGIN        0x0020 /* never registered */
+#define LDD_F_UPDATE        0x0040 /* update the config logs for this server*/
+#define LDD_F_REWRITE_LDD   0x0080 /* rewrite the LDD */
+#define LDD_F_WRITECONF     0x0100 /* regenerate all logs for this fs */
+#define LDD_F_UPGRADE14     0x0200 /* COMPAT_14 */
+#define MTI_F_IOCTL         0x0400 /* only used in mti  */
+
+
+enum ldd_mount_type {
+        LDD_MT_EXT3 = 0, 
+        LDD_MT_LDISKFS,
+        LDD_MT_SMFS,   
+        LDD_MT_REISERFS,
+        LDD_MT_LAST
+};
+       
+static inline char *mt_str(enum ldd_mount_type mt)
+{
+        static char *mount_type_string[] = {
+                "ext3",
+                "ldiskfs",
+                "smfs",
+                "reiserfs",
+        };
+        //LASSERT(mt < LDD_MT_LAST);
+        return mount_type_string[mt];
+}
+
+#ifndef MTI_NIDS_MAX  /* match lustre_idl.h */
+#define MTI_NIDS_MAX 64
+#endif
+
+#define LDD_INCOMPAT_SUPP 0
+#define LDD_ROCOMPAT_SUPP 0
+
+#define LDD_MAGIC 0x1dd00001
+
+/* FIXME does on-disk ldd have to be a fixed endianness? (like last_rcvd) */
+struct lustre_disk_data {
+        __u32      ldd_magic;
+        __u32      ldd_feature_compat;  /* compatible feature flags */
+        __u32      ldd_feature_rocompat;/* read-only compatible feature flags */
+        __u32      ldd_feature_incompat;/* incompatible feature flags */
+        
+        __u32      ldd_config_ver;      /* config rewrite count - not used */
+        __u32      ldd_flags;           /* LDD_SV_TYPE */
+        __u32      ldd_svindex;         /* server index (0001), must match 
+                                           svname */
+        __u32      ldd_mount_type;      /* target fs type LDD_MT_* */
+        char       ldd_fsname[64];      /* filesystem this server is part of */
+        char       ldd_svname[64];      /* this server's name (lustre-mdt0001)*/
+        __u8       ldd_uuid[40];        /* server UUID (COMPAT_146) */
+   
+/*200*/ __u8       ldd_padding[4096 - 200];
+/*4096*/char       ldd_mount_opts[4096]; /* target fs mount opts */
+/*8192*/char       ldd_params[4096];     /* key=value pairs */
+};
+
+#define IS_MDT(data)   ((data)->ldd_flags & LDD_F_SV_TYPE_MDT)
+#define IS_OST(data)   ((data)->ldd_flags & LDD_F_SV_TYPE_OST)
+#define IS_MGS(data)  ((data)->ldd_flags & LDD_F_SV_TYPE_MGS)
+#define MT_STR(data)   mt_str((data)->ldd_mount_type)
+
+/* Make the mdt/ost server obd name based on the filesystem name */
+static inline int server_make_name(__u32 flags, __u16 index, char *fs,
+                                   char *name)
+{
+        if (flags & (LDD_F_SV_TYPE_MDT | LDD_F_SV_TYPE_OST)) {
+                sprintf(name, "%.8s-%s%04x", fs,
+                        (flags & LDD_F_SV_TYPE_MDT) ? "MDT" : "OST",  
+                        index);
+        } else if (flags & LDD_F_SV_TYPE_MGS) {
+                sprintf(name, "MGS");
+        } else {
+                CERROR("unknown server type %#x\n", flags);
+                return 1;
+        }
+        return 0;
+}
+
+/* Get the index from the obd name */
+int server_name2index(char *svname, __u32 *idx, char **endptr);
+
+
+/****************** mount command *********************/
+
+/* The lmd is only used internally by Lustre; mount simply passes 
+   everything as string options */
+
+#define LMD_MAGIC    0xbdacbd03
+
+/* gleaned from the mount command - no persistent info here */
+struct lustre_mount_data {
+        __u32      lmd_magic;
+        __u32      lmd_flags;         /* lustre mount flags */
+        int        lmd_mgs_failnodes; /* mgs failover node count */
+        int        lmd_exclude_count;
+        char      *lmd_dev;           /* device name */
+        char      *lmd_profile;       /* client only */
+        char      *lmd_opts;          /* lustre mount options (as opposed to 
+                                         _device_ mount options) */
+        __u32     *lmd_exclude;       /* array of OSTs to ignore */
+};
+
+#define LMD_FLG_CLIENT       0x0002  /* Mounting a client only */
+#define LMD_FLG_RECOVER      0x0004  /* Allow recovery */
+#define LMD_FLG_NOSVC        0x0008  /* Only start MGS/MGC for servers, 
+                                        no other services */
+
+#define lmd_is_client(x) ((x)->lmd_flags & LMD_FLG_CLIENT) 
+
+/****************** mkfs command *********************/
+
+#define MO_IS_LOOP     0x01
+#define MO_FORCEFORMAT 0x02
+
+/* used to describe the options to format the lustre disk, not persistent */
+struct mkfs_opts {
+        struct lustre_disk_data mo_ldd; /* to be written in MOUNT_DATA_FILE */
+        char  mo_mount_type_string[20]; /* "ext3", "ldiskfs", ... */
+        char  mo_device[128];           /* disk device name */
+        char  mo_mkfsopts[128];         /* options to the backing-store mkfs */
+        char  mo_loopdev[128];          /* in case a loop dev is needed */
+        __u64 mo_device_sz;             /* in KB */
+        int   mo_stripe_count;
+        int   mo_flags; 
+        int   mo_mgs_failnodes;
+};
+
+/****************** on-disk files *********************/
+
+#define LAST_RCVD    "last_rcvd"
+#define LOV_OBJID    "lov_objid"
+#define HEALTH_CHECK "health_check"
+
+/****************** last_rcvd file *********************/
+
+#define LR_SERVER_SIZE   512
+#define LR_CLIENT_START 8192
+#define LR_CLIENT_SIZE   128
+#if LR_CLIENT_START < LR_SERVER_SIZE
+#error "Can't have LR_CLIENT_START < LR_SERVER_SIZE"
+#endif
+/* This limit is arbitrary (32k clients on x86), but it is convenient to use
+ * 2^n * PAGE_SIZE * 8 for the number of bits that fit an order-n allocation. */
+#define LR_MAX_CLIENTS (PAGE_SIZE * 8)
+
+                                                                                
+/* COMPAT_146 */
+#define OBD_COMPAT_OST          0x00000002 /* this is an OST (temporary) */
+#define OBD_COMPAT_MDT          0x00000004 /* this is an MDT (temporary) */
+/* end COMPAT_146 */
+
+#define OBD_ROCOMPAT_LOVOBJID   0x00000001 /* MDS handles LOV_OBJID file */
+#define OBD_ROCOMPAT_CROW       0x00000002 /* OST will CROW create objects */
+
+#define OBD_INCOMPAT_GROUPS     0x00000001 /* OST handles group subdirs */
+#define OBD_INCOMPAT_OST        0x00000002 /* this is an OST */
+#define OBD_INCOMPAT_MDT        0x00000004 /* this is an MDT */
+#define OBD_INCOMPAT_COMMON_LR  0x00000008 /* common last_rvcd format */
+
+
+/* Data stored per server at the head of the last_rcvd file.  In le32 order.
+   This should be common to filter_internal.h, lustre_mds.h */
+struct lr_server_data {
+        __u8  lsd_uuid[40];        /* server UUID */
+        __u64 lsd_unused;          /* was fsd_last_objid - don't use for now */
+        __u64 lsd_last_transno;    /* last completed transaction ID */
+        __u64 lsd_mount_count;     /* incarnation number */
+        __u32 lsd_feature_compat;  /* compatible feature flags */
+        __u32 lsd_feature_rocompat;/* read-only compatible feature flags */
+        __u32 lsd_feature_incompat;/* incompatible feature flags */
+        __u32 lsd_server_size;     /* size of server data area */
+        __u32 lsd_client_start;    /* start of per-client data area */
+        __u16 lsd_client_size;     /* size of per-client data area */
+        __u16 lsd_subdir_count;    /* number of subdirectories for objects */
+        __u64 lsd_catalog_oid;     /* recovery catalog object id */
+        __u32 lsd_catalog_ogen;    /* recovery catalog inode generation */
+        __u8  lsd_peeruuid[40];    /* UUID of MDS associated with this OST */
+        __u32 lsd_ost_index;       /* index number of OST in LOV */
+        __u32 lsd_mdt_index;       /* index number of MDT in LMV */
+        __u8  lsd_padding[LR_SERVER_SIZE - 148];
+};
+
+/* Data stored per client in the last_rcvd file.  In le32 order. */
+struct lsd_client_data {
+        __u8  lcd_uuid[40];      /* client UUID */
+        __u64 lcd_last_transno; /* last completed transaction ID */
+        __u64 lcd_last_xid;     /* xid for the last transaction */
+        __u32 lcd_last_result;  /* result from last RPC */
+        __u32 lcd_last_data;    /* per-op data (disposition for open &c.) */
+        /* for MDS_CLOSE requests */
+        __u64 lcd_last_close_transno; /* last completed transaction ID */
+        __u64 lcd_last_close_xid;     /* xid for the last transaction */
+        __u32 lcd_last_close_result;  /* result from last RPC */
+        __u32 lcd_last_close_data;    /* per-op data */
+        __u8  lcd_padding[LR_CLIENT_SIZE - 88];
+};
+
+
+#ifdef __KERNEL__
+/****************** superblock additional info *********************/
+struct ll_sb_info;
+
+struct lustre_sb_info {
+        int                       lsi_flags;
+        struct obd_device        *lsi_mgc;     /* mgc obd */
+        struct lustre_mount_data *lsi_lmd;     /* mount command info */
+        struct lustre_disk_data  *lsi_ldd;     /* mount info on-disk */
+        struct ll_sb_info        *lsi_llsbi;   /* add'l client sbi info */
+        struct vfsmount          *lsi_srv_mnt; /* the one server mount */
+        atomic_t                  lsi_mounts;  /* references to the srv_mnt */
+};
+
+#define LSI_SERVER                       0x00000001
+#define LSI_UMOUNT_FORCE                 0x00000010
+#define LSI_UMOUNT_FAILOVER              0x00000020
+
+#if  (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+# define    s2lsi(sb)        ((struct lustre_sb_info *)((sb)->s_fs_info))
+# define    s2lsi_nocast(sb) ((sb)->s_fs_info)
+#else  /* 2.4 here */
+# define    s2lsi(sb)        ((struct lustre_sb_info *)((sb)->u.generic_sbp))
+# define    s2lsi_nocast(sb) ((sb)->u.generic_sbp)
+#endif
+
+#define     get_profile_name(sb)   (s2lsi(sb)->lsi_lmd->lmd_profile)
+
+#endif /* __KERNEL__ */
+
+/****************** mount lookup info *********************/
+
+struct lustre_mount_info {
+        char               *lmi_name;
+        struct super_block *lmi_sb;
+        struct vfsmount    *lmi_mnt;
+        struct list_head    lmi_list_chain;
+};
+
+/****************** prototypes *********************/
+
+#ifdef __KERNEL__
+#include <obd_class.h>
+
+/* obd_mount.c */
+void lustre_register_client_fill_super(int (*cfs)(struct super_block *sb));
+int lustre_common_put_super(struct super_block *sb);
+int lustre_process_log(struct super_block *sb, char *logname, 
+                     struct config_llog_instance *cfg);
+int lustre_end_log(struct super_block *sb, char *logname, 
+                       struct config_llog_instance *cfg);
+struct lustre_mount_info *server_get_mount(char *name);
+int server_put_mount(char *name, struct vfsmount *mnt);
+int server_register_target(struct super_block *sb);
+struct mgs_target_info;
+int server_mti_print(char *title, struct mgs_target_info *mti);
+
+/* mgc_request.c */
+int mgc_logname2resid(char *logname, struct ldlm_res_id *res_id);
+
+#endif
+
+#endif // _LUSTRE_DISK_H
index a7f3a34..f6e3f36 100644 (file)
@@ -8,6 +8,7 @@
 #include <lustre/lustre_idl.h>
 #include <lustre_dlm.h>
 
+/* Data stored per client in the last_rcvd file.  In le32 order. */
 struct mds_client_data;
 
 struct mds_export_data {
index d172dec..ff74277 100644 (file)
@@ -86,11 +86,17 @@ struct obd_import {
         spinlock_t                imp_lock;
 
         /* flags */
-        unsigned int              imp_invalid:1, imp_replayable:1,
-                                  imp_dlm_fake:1, imp_server_timeout:1,
-                                  imp_initial_recov:1, imp_initial_recov_bk:1,
-                                  imp_force_verify:1, imp_pingable:1,
-                                  imp_resend_replay:1, imp_deactive:1;
+        unsigned int             
+                imp_invalid:1,          /* evicted */
+                imp_replayable:1,       /* try to recover the import */
+                imp_dlm_fake:1,         /* don't run recovery (timeout instead) */
+                imp_server_timeout:1,   /* use 1/2 timeout on MDS' OSCs */
+                imp_initial_recov:1,    /* retry the initial connection */  
+                imp_initial_recov_bk:1, /* turn off init_recov after trying all failover nids */
+                imp_force_verify:1,     /* force an immidiate ping */
+                imp_pingable:1,         /* pingable */
+                imp_resend_replay:1,    /* resend for replay */
+                imp_deactive:1;         /* administratively disabled */
         __u32                     imp_connect_op;
         struct obd_connect_data   imp_connect_data;
         __u64                     imp_connect_flags_orig;
index c98cfc8..d83db00 100644 (file)
@@ -471,6 +471,7 @@ static inline void obd_ioctl_freedata(char *buf, int len)
 #define OBD_IOC_PROCESS_CFG            _IOWR('f', 184, OBD_IOC_DATA_TYPE)
 #define OBD_IOC_DUMP_LOG               _IOWR('f', 185, OBD_IOC_DATA_TYPE)
 #define OBD_IOC_CLEAR_LOG              _IOWR('f', 186, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PARAM                  _IOW ('f', 187, OBD_IOC_DATA_TYPE)
 
 #define OBD_IOC_CATLOGLIST             _IOWR('f', 190, OBD_IOC_DATA_TYPE)
 #define OBD_IOC_LLOG_INFO              _IOWR('f', 191, OBD_IOC_DATA_TYPE)
index 2bc951d..c05ce65 100644 (file)
@@ -92,6 +92,7 @@ int llog_reverse_process(struct llog_handle *loghandle, llog_cb_t cb,
                          void *data, void *catdata);
 extern int llog_cancel_rec(struct llog_handle *loghandle, int index);
 extern int llog_close(struct llog_handle *cathandle);
+extern int llog_get_size(struct llog_handle *loghandle);
 
 /* llog_cat.c   -  catalog api */
 struct llog_process_data {
@@ -217,7 +218,7 @@ static inline void llog_gen_init(struct llog_ctxt *ctxt)
 
         if (!strcmp(obd->obd_type->typ_name, LUSTRE_MDS_NAME))
                 ctxt->loc_gen.mnt_cnt = obd->u.mds.mds_mount_count;
-        else if (!strstr(obd->obd_type->typ_name, LUSTRE_FILTER_NAME))
+        else if (!strstr(obd->obd_type->typ_name, LUSTRE_OST_NAME))
                 ctxt->loc_gen.mnt_cnt = obd->u.filter.fo_mount_count;
         else
                 ctxt->loc_gen.mnt_cnt = 0;
index 857c29d..bd596bf 100644 (file)
 #define MDS_MAXREQSIZE  (5 * 1024)
 #define MDS_MAXREPSIZE  max(9 * 1024, 280 + LOV_MAX_STRIPE_COUNT * 56)
 
+/* FIXME fix all constants here.  Andreas suggests dyamically adding threads. */
+#define MGS_MAX_THREADS 8UL
+#define MGS_NUM_THREADS max(2UL, min_t(unsigned long, MGS_MAX_THREADS, \
+                            num_physpages * smp_num_cpus >> (26 - PAGE_SHIFT)))
+                                  
+#define MGS_NBUFS       (64 * smp_num_cpus)
+#define MGS_BUFSIZE     (8 * 1024)
+#define MGS_MAXREQSIZE  (5 * 1024)
+#define MGS_MAXREPSIZE  (9 * 1024)
+
 #define OST_MAX_THREADS 512UL
 #define OST_DEF_THREADS max_t(unsigned long, 2, \
                               (num_physpages >> (26-PAGE_SHIFT)) * smp_num_cpus)
diff --git a/lustre/include/lustre_param.h b/lustre/include/lustre_param.h
new file mode 100644 (file)
index 0000000..142c1f1
--- /dev/null
@@ -0,0 +1,46 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (C) 2006 Cluster File Systems, Inc.
+ *   Author: Nathan Rutman <nathan@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ *
+ * User-settable parameter keys
+ */
+
+#ifndef _LUSTRE_PARAM_H
+#define _LUSTRE_PARAM_H
+
+/* obd_mount.c */
+int class_find_param(char *buf, char *key, char **valp);
+int class_match_param(char *buf, char *key, char **valp);
+int class_parse_nid(char *buf, lnet_nid_t *nid, char **endh);
+
+
+/****************** User-settable parameter keys *********************/
+
+#define PARAM_MGSNODE          "mgsnode="
+#define PARAM_FAILNODE         "failnode="
+#define PARAM_OBD_TIMEOUT      "obd_timeout="
+#define PARAM_DEFAULT_STRIPE   "default_stripe_"
+#define PARAM_D_STRIPE_SIZE    PARAM_DEFAULT_STRIPE"size"
+#define PARAM_D_STRIPE_COUNT   PARAM_DEFAULT_STRIPE"count"
+#define PARAM_D_STRIPE_OFFSET  PARAM_DEFAULT_STRIPE"offset"
+#define PARAM_D_STRIPE_PATTERN PARAM_DEFAULT_STRIPE"pattern"
+
+#endif // _LUSTRE_PARAM_H
index 1f03420..8c22e02 100644 (file)
@@ -101,7 +101,7 @@ struct lov_stripe_md {
         struct {
                 /* Public members. */
                 __u64 lw_object_id;        /* lov object id */
-                __u64 lw_object_gr;        /* lov object id */
+                __u64 lw_object_gr;        /* lov object group */
                 __u64 lw_maxbytes;         /* maximum possible file size */
                 unsigned long lw_xfersize; /* optimal transfer size */
 
@@ -235,7 +235,7 @@ struct filter_obd {
         spinlock_t           fo_translock;      /* protect fsd_last_transno */
         struct file         *fo_rcvd_filp;
         struct file         *fo_health_check_filp;
-        struct filter_server_data *fo_fsd;
+        struct lr_server_data *fo_fsd;
         unsigned long       *fo_last_rcvd_slots;
         __u64                fo_mount_count;
 
@@ -297,8 +297,6 @@ struct filter_obd {
         atomic_t                 fo_quotachecking;
 };
 
-struct mds_server_data;
-
 #define OSC_MAX_RIF_DEFAULT       8
 #define OSC_MAX_RIF_MAX         256
 #define OSC_MAX_DIRTY_DEFAULT  (OSC_MAX_RIF_DEFAULT * 4)
@@ -368,6 +366,13 @@ struct client_obd {
         struct mdc_rpc_lock     *cl_setattr_lock;
         struct osc_creator       cl_oscc;
 
+        /* mgc datastruct */
+        struct semaphore         cl_mgc_sem;
+        struct vfsmount         *cl_mgc_vfsmnt;
+        struct dentry           *cl_mgc_configs_dir;
+        atomic_t                 cl_mgc_refcount;
+        struct obd_export       *cl_mgc_mgsexp;
+
         /* Flags section */
         unsigned int             cl_checksum:1; /* debug checksums */
 
@@ -381,6 +386,16 @@ struct client_obd {
 
 #define CL_NOT_QUOTACHECKED 1   /* client->cl_qchk_stat init value */
 
+struct mgs_obd {
+        struct ptlrpc_service           *mgs_service;
+        struct vfsmount                 *mgs_vfsmnt;
+        struct super_block              *mgs_sb;
+        struct dentry                   *mgs_configs_dir;
+        struct dentry                   *mgs_fid_de;
+        struct list_head                 mgs_fs_db_list;
+        struct semaphore                 mgs_sem;
+};
+
 struct mds_obd {
         /* NB this field MUST be first */
         struct obd_device_target         mds_obt;
@@ -399,7 +414,7 @@ struct mds_obd {
         unsigned long                    mds_atime_diff;
         struct semaphore                 mds_epoch_sem;
         struct ll_fid                    mds_rootfid;
-        struct mds_server_data          *mds_server_data;
+        struct lr_server_data           *mds_server_data;
         cfs_dentry_t                    *mds_pending_dir;
         cfs_dentry_t                    *mds_logs_dir;
         cfs_dentry_t                    *mds_objects_dir;
@@ -409,9 +424,11 @@ struct mds_obd {
         struct obd_uuid                  mds_lov_uuid;
         char                            *mds_profile;
         struct obd_export               *mds_osc_exp; /* XXX lov_exp */
-        int                              mds_has_lov_desc;
         struct lov_desc                  mds_lov_desc;
         obd_id                          *mds_lov_objids;
+        int                              mds_lov_objids_size;
+        __u32                            mds_lov_objids_in_file;
+        unsigned int                     mds_lov_objids_dirty:1;
         int                              mds_lov_nextid_set;
         struct file                     *mds_lov_objid_filp;
         struct file                     *mds_health_check_filp;
@@ -464,6 +481,7 @@ struct lov_obd {
         struct semaphore lov_lock;
         atomic_t refcount;
         struct lov_desc desc;
+        struct obd_connect_data ocd;
         int bufsize;
         int connects;
         int death_row;      /* Do we have tgts scheduled to be deleted?
@@ -484,19 +502,27 @@ struct niobuf_local {
 };
 
 /* obd device type names */
+ /* FIXME all the references to LUSTRE_MDS_NAME should be swapped with LUSTRE_MDT_NAME */
 #define LUSTRE_MDS_NAME         "mds"
 #define LUSTRE_MDT_NAME         "mdt"
 #define LUSTRE_MDC_NAME         "mdc"
-#define LUSTRE_FILTER_NAME      "obdfilter"
-#define LUSTRE_OST_NAME         "ost"
+#define LUSTRE_OSS_NAME         "ost" /*FIXME change name to oss*/
+#define LUSTRE_OST_NAME         "obdfilter" /* FIXME change name to ost*/
 #define LUSTRE_OSC_NAME         "osc"
+#define LUSTRE_LOV_NAME         "lov"
+#define LUSTRE_MGS_NAME         "mgs"
+#define LUSTRE_MGC_NAME         "mgc"
+
+#define LUSTRE_OSTSAN_NAME      "sanobdfilter"
 #define LUSTRE_SANOSC_NAME      "sanosc"
 #define LUSTRE_SANOST_NAME      "sanost"
-#define LUSTRE_LOV_NAME         "lov"
 #define LUSTRE_CACHEOBD_NAME    "cobd"
 #define LUSTRE_ECHO_NAME        "obdecho"
 #define LUSTRE_ECHO_CLIENT_NAME "echo_client"
 
+/* Constant obd names */
+#define LUSTRE_MGS_OBDNAME "MGS"
+#define LUSTRE_MGC_OBDNAME "MGC"
 
 /* Don't conflict with on-wire flags OBD_BRW_WRITE, etc */
 #define N_LOCAL_TEMP_PAGE 0x10000000
@@ -589,7 +615,10 @@ enum obd_notify_event {
         /* Device deactivated */
         OBD_NOTIFY_INACTIVE,
         /* Connect data for import were changed */
-        OBD_NOTIFY_OCD
+        OBD_NOTIFY_OCD,
+        /* Sync request */
+        OBD_NOTIFY_SYNC_NONBLOCK,
+        OBD_NOTIFY_SYNC
 };
 
 /*
@@ -605,13 +634,12 @@ struct obd_notify_upcall {
 
 /* corresponds to one of the obd's */
 struct obd_device {
-        struct obd_type *obd_type;
-
+        struct obd_type        *obd_type;
         /* common and UUID name of this device */
-        char *obd_name;
-        struct obd_uuid obd_uuid;
+        char                   *obd_name;
+        struct obd_uuid         obd_uuid;
 
-        int obd_minor;
+        int                     obd_minor;
         unsigned int obd_attached:1, obd_set_up:1, obd_recovering:1,
                 obd_abort_recovery:1, obd_replayable:1, obd_no_transno:1,
                 obd_no_recov:1, obd_stopping:1, obd_starting:1,
@@ -667,6 +695,7 @@ struct obd_device {
                 struct echo_client_obd echo_client;
                 struct echo_obd echo;
                 struct lov_obd lov;
+                struct mgs_obd mgs;
         } u;
         /* Fields used by LProcFS */
         unsigned int           obd_cntr_base;
@@ -694,6 +723,13 @@ enum obd_cleanup_stage {
         OBD_CLEANUP_OBD,
 };
 
+/* get/set_info keys */
+#define KEY_MDS_CONN "mds_conn"
+#define KEY_NEXT_ID  "next_id"
+#define KEY_LOVDESC  "lovdesc"
+#define KEY_INIT_RECOV "initial_recov"
+#define KEY_INIT_RECOV_BACKUP "init_recov_bk"
+
 struct obd_ops {
         struct module *o_owner;
         int (*o_iocontrol)(unsigned int cmd, struct obd_export *exp, int len,
@@ -848,7 +884,7 @@ struct obd_ops {
                               enum obd_import_event);
 
         int (*o_notify)(struct obd_device *obd, struct obd_device *watched,
-                        enum obd_notify_event ev);
+                        enum obd_notify_event ev, void *data);
 
         int (*o_health_check)(struct obd_device *);
 
index 78ec204..2a6fbf4 100644 (file)
@@ -62,6 +62,7 @@ int class_name2dev(char *name);
 struct obd_device *class_name2obd(char *name);
 int class_uuid2dev(struct obd_uuid *uuid);
 struct obd_device *class_uuid2obd(struct obd_uuid *uuid);
+void class_obd_list(void);
 struct obd_device * class_find_client_obd(struct obd_uuid *tgt_uuid,
                                           char * typ_name,
                                           struct obd_uuid *grp_uuid);
@@ -83,7 +84,7 @@ char *obd_export_nid2str(struct obd_export *exp);
 int obd_export_evict_by_nid(struct obd_device *obd, char *nid);
 int obd_export_evict_by_uuid(struct obd_device *obd, char *uuid);
 
-/* config.c */
+/* obd_config.c */
 int class_process_config(struct lustre_cfg *lcfg);
 int class_attach(struct lustre_cfg *lcfg);
 int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg);
@@ -92,16 +93,36 @@ int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg);
 struct obd_device *class_incref(struct obd_device *obd);
 void class_decref(struct obd_device *obd);
 
+#define CFG_F_START     0x01   /* Set when we start updating from a log */
+#define CFG_F_MARKER    0x02   /* We are within a maker */
+#define CFG_F_SKIP      0x04   /* We should ignore this cfg command */
+#define CFG_F_COMPAT146 0x08   /* Translation to new obd names required */
+#define CFG_F_EXCLUDE   0x10   /* OST exclusion list */
+
+
 /* Passed as data param to class_config_parse_llog */
 struct config_llog_instance {
-        char * cfg_instance;
-        struct obd_uuid cfg_uuid;
+        char *              cfg_instance;
+        struct super_block *cfg_sb;
+        struct obd_uuid     cfg_uuid;
+        int                 cfg_last_idx; /* for partial llog processing */
+        int                 cfg_flags; 
 };
 int class_config_parse_llog(struct llog_ctxt *ctxt, char *name,
                             struct config_llog_instance *cfg);
 int class_config_dump_llog(struct llog_ctxt *ctxt, char *name,
                            struct config_llog_instance *cfg);
 
+/* list of active configuration logs  */
+struct config_llog_data {
+        char               *cld_logname;
+        struct ldlm_res_id  cld_resid;
+        struct config_llog_instance cld_cfg;
+        struct list_head    cld_list_chain;
+        atomic_t            cld_refcount;
+        unsigned int        cld_stopping:1;
+};
+
 struct lustre_profile {
         struct list_head lp_list;
         char * lp_profile;
@@ -112,6 +133,7 @@ struct lustre_profile {
 struct lustre_profile *class_get_profile(char * prof);
 void class_del_profile(char *prof);
 
+/* genops.c */
 #define class_export_get(exp)                                                  \
 ({                                                                             \
         struct obd_export *exp_ = exp;                                         \
@@ -140,6 +162,7 @@ void class_import_put(struct obd_import *);
 struct obd_import *class_new_import(struct obd_device *obd);
 void class_destroy_import(struct obd_import *exp);
 
+struct obd_type *class_search_type(char *name);
 struct obd_type *class_get_type(char *name);
 void class_put_type(struct obd_type *type);
 int class_connect(struct lustre_handle *conn, struct obd_device *obd,
@@ -148,7 +171,7 @@ int class_disconnect(struct obd_export *exp);
 void class_fail_export(struct obd_export *exp);
 void class_disconnect_exports(struct obd_device *obddev);
 void class_disconnect_stale_exports(struct obd_device *obddev);
-void class_manual_cleanup(struct obd_device *obd);
+int class_manual_cleanup(struct obd_device *obd);
 
 /* obdo.c */
 void obdo_cpy_md(struct obdo *dst, struct obdo *src, obd_flag valid);
@@ -1042,11 +1065,16 @@ static inline void obd_import_event(struct obd_device *obd,
 
 static inline int obd_notify(struct obd_device *obd,
                              struct obd_device *watched,
-                             enum obd_notify_event ev)
+                             enum obd_notify_event ev, void *data)
 {
         ENTRY;
         OBD_CHECK_DEV(obd);
-        if (!obd->obd_set_up) {
+
+        /* the check for async_recov is a complete hack - I'm hereby
+           overloading the meaning to also mean "this was called from
+           mds_postsetup".  I know that my mds is able to handle notifies
+           by this point, and it needs to get them to execute mds_postrecov. */                                                                                
+        if (!obd->obd_set_up && !obd->obd_async_recov) {
                 CDEBUG(D_HA, "obd %s not set up\n", obd->obd_name);
                 RETURN(-EINVAL);
         }
@@ -1057,12 +1085,12 @@ static inline int obd_notify(struct obd_device *obd,
         }
 
         OBD_COUNTER_INCREMENT(obd, notify);
-        RETURN(OBP(obd, notify)(obd, watched, ev));
+        RETURN(OBP(obd, notify)(obd, watched, ev, data));
 }
 
 static inline int obd_notify_observer(struct obd_device *observer,
                                       struct obd_device *observed,
-                                      enum obd_notify_event ev)
+                                      enum obd_notify_event ev, void *data)
 {
         int rc1;
         int rc2;
@@ -1070,7 +1098,7 @@ static inline int obd_notify_observer(struct obd_device *observer,
         struct obd_notify_upcall *onu;
 
         if (observer->obd_observer)
-                rc1 = obd_notify(observer->obd_observer, observed, ev);
+                rc1 = obd_notify(observer->obd_observer, observed, ev, data);
         else
                 rc1 = 0;
         /*
@@ -1181,7 +1209,7 @@ extern void obd_sysctl_clean (void);
 
 /* uuid.c  */
 typedef __u8 class_uuid_t[16];
-//int class_uuid_parse(struct obd_uuid in, class_uuid_t out);
+void class_generate_random_uuid(class_uuid_t uuid);
 void class_uuid_unparse(class_uuid_t in, struct obd_uuid *out);
 
 /* lustre_peer.c    */
index 2fa9852..beca205 100644 (file)
@@ -166,6 +166,10 @@ extern cfs_waitq_t obd_race_waitq;
 
 #define OBD_FAIL_MDC_REVALIDATE_PAUSE    0x800
 
+#define OBD_FAIL_MGS                     0x900
+#define OBD_FAIL_MGS_ALL_REQUEST_NET     0x901
+#define OBD_FAIL_MGS_ALL_REPLY_NET       0x902
+
 /* preparation for a more advanced failure testbed (not functional yet) */
 #define OBD_FAIL_MASK_SYS    0x0000FF00
 #define OBD_FAIL_MASK_LOC    (0x000000FF | OBD_FAIL_MASK_SYS)
index 8629266..e7eb927 100644 (file)
@@ -2365,7 +2365,7 @@ CONFIG_MAGIC_SYSRQ=y
 CONFIG_DEBUG_SPINLOCK=y
 CONFIG_DEBUG_SPINLOCK_SLEEP=y
 CONFIG_DEBUG_HIGHMEM=y
-CONFIG_DEBUG_INFO=y
+# CONFIG_DEBUG_INFO is not set
 # CONFIG_FRAME_POINTER is not set
 CONFIG_EARLY_PRINTK=y
 CONFIG_DEBUG_STACKOVERFLOW=y
index 58229f3..8c473dd 100644 (file)
@@ -44,7 +44,7 @@ ldlm_inodebits_compat_queue(struct list_head *queue, struct ldlm_lock *req,
         int compat = 1;
         ENTRY;
 
-        LASSERT(req_bits); /* There is no sence in lock with no bits set,
+        LASSERT(req_bits); /* There is no sense in lock with no bits set,
                               I think. Also such a lock would be compatible
                                with any other bit lock */
         list_for_each(tmp, queue) {
index a2dcf4b..f9f6c43 100644 (file)
@@ -50,7 +50,6 @@ int ldlm_process_extent_lock(struct ldlm_lock *lock, int *flags, int first_enq,
 int ldlm_process_flock_lock(struct ldlm_lock *lock, int *flags, int first_enq,
                             ldlm_error_t *err);
 
-
 /* ldlm_inodebits.c */
 int ldlm_process_inodebits_lock(struct ldlm_lock *lock, int *flags,
                                 int first_enq, ldlm_error_t *err);
index 6ee4f03..0b9945d 100644 (file)
@@ -207,6 +207,10 @@ int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf)
                 rq_portal = MDS_REQUEST_PORTAL;
                 rp_portal = MDC_REPLY_PORTAL;
                 connect_op = MDS_CONNECT;
+        } else if (!strcmp(name, LUSTRE_MGC_NAME)) {
+                rq_portal = MGS_REQUEST_PORTAL;
+                rp_portal = MGC_REPLY_PORTAL;
+                connect_op = MGS_CONNECT;
         } else {
                 CERROR("unknown client OBD type \"%s\", can't setup\n",
                        name);
@@ -234,6 +238,7 @@ int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf)
         }
 
         sema_init(&cli->cl_sem, 1);
+        sema_init(&cli->cl_mgc_sem, 1);
         cli->cl_conn_count = 0;
         memcpy(server_uuid.uuid, lustre_cfg_buf(lcfg, 2),
                min_t(unsigned int, LUSTRE_CFG_BUFLEN(lcfg, 2),
@@ -284,6 +289,7 @@ int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf)
         imp->imp_client = &obddev->obd_ldlm_client;
         imp->imp_connect_op = connect_op;
         imp->imp_initial_recov = 1;
+        imp->imp_initial_recov_bk = 0;
         CFS_INIT_LIST_HEAD(&imp->imp_pinger_chain);
         memcpy(cli->cl_target_uuid.uuid, lustre_cfg_buf(lcfg, 1),
                LUSTRE_CFG_BUFLEN(lcfg, 1));
@@ -331,7 +337,7 @@ int client_obd_cleanup(struct obd_device *obddev)
         RETURN(0);
 }
 
-/* ->o_connect() method for client side (OSC and MDC) */
+/* ->o_connect() method for client side (OSC and MDC and MGC) */
 int client_connect_import(struct lustre_handle *dlm_handle,
                           struct obd_device *obd, struct obd_uuid *cluuid,
                           struct obd_connect_data *data)
@@ -531,8 +537,16 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler)
 
         obd_str2uuid (&tgtuuid, str);
         target = class_uuid2obd(&tgtuuid);
+        /* COMPAT_146 */
+        /* old (pre 1.6) lustre_process_log tries to connect to mdsname
+           (eg. mdsA) instead of uuid. */
+        if (!target) {
+                snprintf((char *)tgtuuid.uuid, sizeof(tgtuuid), "%s_UUID", str);
+                target = class_uuid2obd(&tgtuuid);
+        }
         if (!target)
                 target = class_name2obd(str);
+        /* end COMPAT_146 */
 
         if (!target || target->obd_stopping || !target->obd_set_up) {
                 DEBUG_REQ(D_ERROR, req, "UUID '%s' is not available "
@@ -585,12 +599,12 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler)
 
         if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT) {
                 if (!data) {
-                        DEBUG_REQ(D_INFO, req, "Refusing old (unversioned) "
+                        DEBUG_REQ(D_WARNING, req, "Refusing old (unversioned) "
                                   "libclient connection attempt\n");
                         GOTO(out, rc = -EPROTO);
                 } else if (data->ocd_version < LUSTRE_VERSION_CODE -
                                                LUSTRE_VERSION_ALLOWED_OFFSET) {
-                        DEBUG_REQ(D_INFO, req, "Refusing old (%d.%d.%d.%d) "
+                        DEBUG_REQ(D_WARNING, req, "Refusing old (%d.%d.%d.%d) "
                                   "libclient connection attempt\n",
                                   OBD_OCD_VERSION_MAJOR(data->ocd_version),
                                   OBD_OCD_VERSION_MINOR(data->ocd_version),
index 293733e..b801f01 100644 (file)
@@ -713,6 +713,11 @@ int ldlm_handle_enqueue(struct ptlrpc_request *req,
                 GOTO(out, rc = -EPROTO);
         }
 
+#if 0
+        /* FIXME this makes it impossible to use LDLM_PLAIN locks -- check 
+           against server's _CONNECT_SUPPORTED flags? (I don't want to use
+           ibits for mgc/mgs) */
+
         /* INODEBITS_INTEROP: Perform conversion from plain lock to
          * inodebits lock if client does not support them. */
         if (!(req->rq_export->exp_connect_flags & OBD_CONNECT_IBITS) &&
@@ -723,6 +728,7 @@ int ldlm_handle_enqueue(struct ptlrpc_request *req,
                 if (dlm_req->lock_desc.l_req_mode == LCK_PR)
                         dlm_req->lock_desc.l_req_mode = LCK_CR;
         }
+#endif
 
         if (flags & LDLM_FL_REPLAY) {
                 lock = find_existing_lock(req->rq_export,
@@ -1579,6 +1585,9 @@ static int ldlm_setup(void)
         spin_lock_init(&waiting_locks_spinlock);
         cfs_timer_init(&waiting_locks_timer, waiting_locks_callback, 0);
 
+        /* Using CLONE_FILES instead of CLONE_FS here causes failures in 
+           conf-sanity test 21.  But using CLONE_FS can cause problems
+           if the daemonize happens between push/pop_ctxt... */
         rc = cfs_kernel_thread(expired_lock_main, NULL, CLONE_VM | CLONE_FS);
         if (rc < 0) {
                 CERROR("Cannot start ldlm expired-lock thread: %d\n", rc);
index 054fa0d..cdb8f04 100644 (file)
@@ -101,7 +101,7 @@ int liblustre_process_log(struct config_llog_instance *cfg,
                 GOTO(out, rc);
 
         lustre_cfg_bufs_reset(&bufs, name);
-        lustre_cfg_bufs_set_string(&bufs, 1, LUSTRE_MDC_NAME);
+        lustre_cfg_bufs_set_string(&bufs, 1, LUSTRE_MDC_NAME);//FIXME connect to mgc
         lustre_cfg_bufs_set_string(&bufs, 2, mdc_uuid.uuid);
         lcfg = lustre_cfg_new(LCFG_ATTACH, &bufs);
         rc = class_process_config(lcfg);
@@ -131,7 +131,7 @@ int liblustre_process_log(struct config_llog_instance *cfg,
 
         /* Disable initial recovery on this import */
         rc = obd_set_info_async(obd->obd_self_export,
-                                strlen("initial_recov"), "initial_recov",
+                                strlen(KEY_INIT_RECOV), KEY_INIT_RECOV,
                                 sizeof(allow_recov), &allow_recov, NULL);
 
         rc = obd_connect(&mdc_conn, obd, &mdc_uuid, ocd);
index 96979ee..d5dfd8c 100644 (file)
@@ -52,7 +52,7 @@ struct llu_inode_info {
         char                   *lli_symlink_name;
         struct semaphore        lli_open_sem;
         __u64                   lli_maxbytes;
-        unsigned long          lli_flags;
+        unsigned long           lli_flags;
 
         /* for libsysio */
         struct file_identifier  lli_sysio_fid;
index d262d5c..7517ef1 100644 (file)
@@ -347,7 +347,7 @@ int llu_inode_getattr(struct inode *inode, struct lov_stripe_md *lsm)
 static struct inode* llu_new_inode(struct filesys *fs,
                                    struct ll_fid *fid)
 {
-       struct inode *inode;
+        struct inode *inode;
         struct llu_inode_info *lli;
         struct intnl_stat st = {
                 .st_dev  = 0,
@@ -377,11 +377,11 @@ static struct inode* llu_new_inode(struct filesys *fs,
         lli->lli_fid = *fid;
 
         /* file identifier is needed by functions like _sysio_i_find() */
-       inode = _sysio_i_new(fs, &lli->lli_sysio_fid,
+        inode = _sysio_i_new(fs, &lli->lli_sysio_fid,
                              &st, 0, &llu_inode_ops, lli);
 
-       if (!inode)
-               OBD_FREE(lli, sizeof(*lli));
+        if (!inode)
+                OBD_FREE(lli, sizeof(*lli));
 
         return inode;
 }
@@ -719,10 +719,10 @@ int llu_setattr_raw(struct inode *inode, struct iattr *attr)
                                     (rc = ll_permission(inode, MAY_WRITE)) != 0)
                                         RETURN(rc);
                         } else {
-                               /* from inode_change_ok() */
-                               if (current->fsuid != st->st_uid &&
-                                   !capable(CAP_FOWNER))
-                                       RETURN(-EPERM);
+                                /* from inode_change_ok() */
+                                if (current->fsuid != st->st_uid &&
+                                    !capable(CAP_FOWNER))
+                                        RETURN(-EPERM);
                         }
                 }
 
@@ -1692,7 +1692,7 @@ llu_fsswop_mount(const char *source,
         struct config_llog_instance cfg;
         char ll_instance[sizeof(sbi) * 2 + 1];
         struct lustre_profile *lprof;
-       char *zconf_mdsnid, *zconf_mdsname, *zconf_profile;
+        char *zconf_mdsnid, *zconf_mdsname, *zconf_profile;
         char *osc = NULL, *mdc = NULL;
         int async = 1, err = -EINVAL;
         struct obd_connect_data ocd = {0,};
@@ -1842,19 +1842,19 @@ llu_fsswop_mount(const char *source,
                 GOTO(out_request, err = -EBADF);
         }
 
-       /*
-        * Generate base path-node for root.
-        */
-       rootpb = _sysio_pb_new(&noname, NULL, root);
-       if (!rootpb) {
-               err = -ENOMEM;
-               goto out_inode;
-       }
+        /*
+         * Generate base path-node for root.
+         */
+        rootpb = _sysio_pb_new(&noname, NULL, root);
+        if (!rootpb) {
+                err = -ENOMEM;
+                goto out_inode;
+        }
 
-       err = _sysio_do_mount(fs, rootpb, flags, tocover, mntp);
-       if (err) {
+        err = _sysio_do_mount(fs, rootpb, flags, tocover, mntp);
+        if (err) {
                 _sysio_pb_gone(rootpb);
-               goto out_inode;
+                goto out_inode;
         }
 
         ptlrpc_req_finished(request);
index 68c8658..4b2132e 100644 (file)
@@ -13,7 +13,8 @@
 #include <lustre_debug.h>
 #include <lustre_ver.h>
 #include <linux/lustre_version.h>
-
+#include <lustre_disk.h>  /* for s2sbi */
 /*
 struct lustre_intent_data {
         __u64 it_lock_handle[2];
@@ -158,8 +159,6 @@ struct ll_sb_info {
         struct proc_dir_entry*    ll_proc_root;
         obd_id                    ll_rootino; /* number of root inode */
 
-        struct lustre_mount_data *ll_lmd;
-
         int                       ll_flags;
         struct list_head          ll_conn_chain; /* per-conn chain of SBs */
         struct lustre_client_ocd  ll_lco;
@@ -424,12 +423,10 @@ int ll_dcompare(struct dentry *parent, struct qstr *d_name, struct qstr *name);
 extern struct super_operations lustre_super_operations;
 
 char *ll_read_opt(const char *opt, char *data);
-int ll_set_opt(const char *opt, char *data, int fl);
-void ll_options(char *options, char **ost, char **mds, int *flags);
+void ll_options(char *options, int *flags);
 void ll_lli_init(struct ll_inode_info *lli);
-int ll_fill_super(struct super_block *sb, void *data, int silent);
-int lustre_fill_super(struct super_block *sb, void *data, int silent);
-void lustre_put_super(struct super_block *sb);
+int ll_fill_super(struct super_block *sb);
+void ll_put_super(struct super_block *sb);
 struct inode *ll_inode_from_lock(struct ldlm_lock *lock);
 void ll_clear_inode(struct inode *inode);
 int ll_setattr_raw(struct inode *inode, struct iattr *attr);
@@ -442,7 +439,7 @@ void ll_read_inode2(struct inode *inode, void *opaque);
 int ll_iocontrol(struct inode *inode, struct file *file,
                  unsigned int cmd, unsigned long arg);
 void ll_umount_begin(struct super_block *sb);
-int lustre_remount_fs(struct super_block *sb, int *flags, char *data);
+int ll_remount_fs(struct super_block *sb, int *flags, char *data);
 int ll_prep_inode(struct obd_export *exp, struct inode **inode,
                   struct ptlrpc_request *req, int offset, struct super_block *);
 void lustre_dump_dentry(struct dentry *, int recur);
@@ -513,8 +510,9 @@ int ll_tree_unlock(struct ll_lock_tree *tree);
 
 #define LL_MAX_BLKSIZE          (4UL * 1024 * 1024)
 
+#define    ll_s2sbi(sb)        (s2lsi(sb)->lsi_llsbi)
+
 #if  (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
-#define    ll_s2sbi_nocast(sb) ((sb)->s_fs_info)
 void __d_rehash(struct dentry * entry, int lock);
 static inline __u64 ll_ts2u64(struct timespec *time)
 {
@@ -522,13 +520,11 @@ static inline __u64 ll_ts2u64(struct timespec *time)
         return t;
 }
 #else  /* 2.4 here */
-#define    ll_s2sbi_nocast(sb) ((sb)->u.generic_sbp)
 static inline __u64 ll_ts2u64(time_t *time)
 {
         return *time;
 }
 #endif
-#define    ll_s2sbi(sb)        ((struct ll_sb_info *)ll_s2sbi_nocast(sb))
 
 /* don't need an addref as the sb_info should be holding one */
 static inline struct obd_export *ll_s2obdexp(struct super_block *sb)
index 90dd73c..cf0fc28 100644 (file)
@@ -32,6 +32,7 @@
 #include <lustre_ha.h>
 #include <lustre_dlm.h>
 #include <lprocfs_status.h>
+#include <lustre_disk.h>
 #include "llite_internal.h"
 
 kmem_cache_t *ll_file_data_slab;
@@ -46,33 +47,8 @@ extern struct address_space_operations ll_dir_aops;
 #define log2(n) ffz(~(n))
 #endif
 
-/* We need to have some extra twiddling here because some systems have
- * no random state when they start up. */
-static void
-lustre_generate_random_uuid(class_uuid_t uuid)
-{
-        struct timeval t;
-        int *i, j, k;
-
-        ENTRY;
-        LASSERT(sizeof(class_uuid_t) % sizeof(*i) == 0);
-
-        j = jiffies;
-        do_gettimeofday(&t);
-        k = t.tv_usec;
-
-        generate_random_uuid(uuid);
-
-        for (i = (int *)uuid; (char *)i < (char *)uuid + sizeof(class_uuid_t); i++) {
-                *i ^= j ^ k;
-                j = ((j << 8) & 0xffffff00) | ((j >> 24) & 0x000000ff);
-                k = ((k >> 8) & 0x00ffffff) | ((k << 24) & 0xff000000);
-        }
-
-        EXIT;
-}
 
-struct ll_sb_info *lustre_init_sbi(struct super_block *sb)
+struct ll_sb_info *ll_init_sbi(void)
 {
         struct ll_sb_info *sbi = NULL;
         class_uuid_t uuid;
@@ -97,9 +73,8 @@ struct ll_sb_info *lustre_init_sbi(struct super_block *sb)
 
         INIT_LIST_HEAD(&sbi->ll_conn_chain);
         INIT_HLIST_HEAD(&sbi->ll_orphan_dentry_list);
-        ll_s2sbi_nocast(sb) = sbi;
 
-        lustre_generate_random_uuid(uuid);
+        class_generate_random_uuid(uuid);
         class_uuid_unparse(uuid, &sbi->ll_sb_uuid);
         CDEBUG(D_HA, "generated uuid: %s\n", sbi->ll_sb_uuid.uuid);
 
@@ -112,7 +87,7 @@ struct ll_sb_info *lustre_init_sbi(struct super_block *sb)
         RETURN(sbi);
 }
 
-void lustre_free_sbi(struct super_block *sb)
+void ll_free_sbi(struct super_block *sb)
 {
         struct ll_sb_info *sbi = ll_s2sbi(sb);
         ENTRY;
@@ -123,7 +98,6 @@ void lustre_free_sbi(struct super_block *sb)
                 spin_unlock(&ll_sb_lock);
                 OBD_FREE(sbi, sizeof(*sbi));
         }
-        ll_s2sbi_nocast(sb) = NULL;
         EXIT;
 }
 
@@ -131,7 +105,7 @@ static struct dentry_operations ll_d_root_ops = {
         .d_compare = ll_dcompare,
 };
 
-int lustre_common_fill_super(struct super_block *sb, char *mdc, char *osc)
+int client_common_fill_super(struct super_block *sb, char *mdc, char *osc)
 {
         struct inode *root = 0;
         struct ll_sb_info *sbi = ll_s2sbi(sb);
@@ -184,7 +158,7 @@ int lustre_common_fill_super(struct super_block *sb, char *mdc, char *osc)
 
         err = obd_connect(&mdc_conn, obd, &sbi->ll_sb_uuid, data);
         if (err == -EBUSY) {
-                CERROR("An MDS (mdc %s) is performing recovery, of which this"
+                CERROR("An MDT (mdc %s) is performing recovery, of which this"
                        " client is not a part.  Please wait for recovery to "
                        "complete, abort, or time out.\n", mdc);
                 GOTO(out, err);
@@ -269,11 +243,18 @@ int lustre_common_fill_super(struct super_block *sb, char *mdc, char *osc)
 
         mdc_init_ea_size(sbi->ll_mdc_exp, sbi->ll_osc_exp);
 
+        err = obd_prep_async_page(sbi->ll_osc_exp, NULL, NULL, NULL,
+                                  0, NULL, NULL, NULL);
+        if (err < 0) {
+                LCONSOLE_ERROR("There are no OST's in this filesystem. "
+                               "There must be at least one active OST for "
+                               "a client to start.\n");
+                GOTO(out_osc, err);
+        }
+
         if (!ll_async_page_slab) {
                 ll_async_page_slab_size =
-                        size_round(sizeof(struct ll_async_page)) +
-                        obd_prep_async_page(sbi->ll_osc_exp, NULL, NULL, NULL,
-                                            0, NULL, NULL, NULL);
+                        size_round(sizeof(struct ll_async_page)) + err;
                 ll_async_page_slab = kmem_cache_create("ll_async_page",
                                                        ll_async_page_slab_size,
                                                        0, 0, NULL, NULL);
@@ -475,7 +456,7 @@ static void prune_deathrow(struct ll_sb_info *sbi, int try)
         EXIT;
 }
 
-void lustre_common_put_super(struct super_block *sb)
+void client_common_put_super(struct super_block *sb)
 {
         struct ll_sb_info *sbi = ll_s2sbi(sb);
         ENTRY;
@@ -524,24 +505,19 @@ char *ll_read_opt(const char *opt, char *data)
         RETURN(retval);
 }
 
-int ll_set_opt(const char *opt, char *data, int fl)
+static inline int ll_set_opt(const char *opt, char *data, int fl)
 {
-        ENTRY;
-
-        CDEBUG(D_SUPER, "option: %s, data %s\n", opt, data);
-        if (strncmp(opt, data, strlen(opt)))
-                RETURN(0);
+        if (strncmp(opt, data, strlen(opt)) != 0)
+                return(0);
         else
-                RETURN(fl);
+                return(fl);
 }
 
-void ll_options(char *options, char **ost, char **mdc, int *flags)
+/* non-client-specific mount options are parsed in lmd_parse */
+void ll_options(char *options, int *flags)
 {
         int tmp;
-        char *this_char;
-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
-        char *opt_ptr = options;
-#endif
+        char *s1 = options, *s2;
         ENTRY;
 
         if (!options) {
@@ -549,59 +525,57 @@ void ll_options(char *options, char **ost, char **mdc, int *flags)
                 return;
         }
 
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-        for (this_char = strtok (options, ",");
-             this_char != NULL;
-             this_char = strtok (NULL, ","))
-#else
-        while ((this_char = strsep (&opt_ptr, ",")) != NULL)
-#endif
-        {
-                CDEBUG(D_SUPER, "this_char %s\n", this_char);
-                if (!*ost && (*ost = ll_read_opt(LUSTRE_OSC_NAME, this_char)))
-                        continue;
-                if (!*mdc && (*mdc = ll_read_opt(LUSTRE_MDC_NAME, this_char)))
-                        continue;
-                tmp = ll_set_opt("nolock", this_char, LL_SBI_NOLCK);
+        CDEBUG(D_CONFIG, "Parsing opts %s\n", options);
+
+        while (*s1) {
+                CDEBUG(D_SUPER, "next opt=%s\n", s1);
+                tmp = ll_set_opt("nolock", s1, LL_SBI_NOLCK);
                 if (tmp) {
                         *flags |= tmp;
-                        continue;
+                        goto next;
                 }
-                tmp = ll_set_opt("flock", this_char, LL_SBI_FLOCK);
+                tmp = ll_set_opt("flock", s1, LL_SBI_FLOCK);
                 if (tmp) {
                         *flags |= tmp;
-                        continue;
+                        goto next;
                 }
-                tmp = ll_set_opt("noflock", this_char, LL_SBI_FLOCK);
+                tmp = ll_set_opt("noflock", s1, LL_SBI_FLOCK);
                 if (tmp) {
                         *flags &= ~tmp;
-                        continue;
+                        goto next;
                 }
-                tmp = ll_set_opt("user_xattr", this_char, LL_SBI_USER_XATTR);
+                tmp = ll_set_opt("user_xattr", s1, LL_SBI_USER_XATTR);
                 if (tmp) {
                         *flags |= tmp;
-                        continue;
+                        goto next;
                 }
-                tmp = ll_set_opt("nouser_xattr", this_char, LL_SBI_USER_XATTR);
+                tmp = ll_set_opt("nouser_xattr", s1, LL_SBI_USER_XATTR);
                 if (tmp) {
                         *flags &= ~tmp;
-                        continue;
+                        goto next;
                 }
-                tmp = ll_set_opt("acl", this_char, LL_SBI_ACL);
+                tmp = ll_set_opt("acl", s1, LL_SBI_ACL);
                 if (tmp) {
                         /* Ignore deprecated mount option.  The client will
                          * always try to mount with ACL support, whether this
                          * is used depends on whether server supports it. */
-                        continue;
+                        goto next;
                 }
-                tmp = ll_set_opt("noacl", this_char, LL_SBI_ACL);
+                tmp = ll_set_opt("noacl", s1, LL_SBI_ACL);
                 if (tmp) {
-                        continue;
+                        goto next;
                 }
+
+next:
+                /* Find next opt */
+                s2 = strchr(s1, ',');
+                if (s2 == NULL) 
+                        break;
+                s1 = s2 + 1;
         }
         EXIT;
 }
-
+                
 void ll_lli_init(struct ll_inode_info *lli)
 {
         sema_init(&lli->lli_open_sem, 1);
@@ -614,346 +588,136 @@ void ll_lli_init(struct ll_inode_info *lli)
         INIT_LIST_HEAD(&lli->lli_dead_list);
 }
 
-int ll_fill_super(struct super_block *sb, void *data, int silent)
+int ll_fill_super(struct super_block *sb)
 {
+        struct lustre_profile *lprof;
+        struct lustre_sb_info *lsi = s2lsi(sb);
         struct ll_sb_info *sbi;
-        char *osc = NULL;
-        char *mdc = NULL;
-        int err;
+        char  *osc = NULL;
+        char  *mdc = NULL;
+        char  *profilenm = get_profile_name(sb);
+        struct config_llog_instance cfg;
+        char   ll_instance[sizeof(sb) * 2 + 1];
+        int    err;
         ENTRY;
-
+                                                                                 
         CDEBUG(D_VFSTRACE, "VFS Op: sb %p\n", sb);
 
-        sbi = lustre_init_sbi(sb);
-        if (!sbi)
+        /* client additional sb info */
+        lsi->lsi_llsbi = sbi = ll_init_sbi();
+        if (!sbi) 
                 RETURN(-ENOMEM);
 
-        ll_options(data, &osc, &mdc, &sbi->ll_flags);
-
-        if (!osc) {
-                CERROR("no osc\n");
-                GOTO(out, err = -EINVAL);
-        }
-
-        if (!mdc) {
-                CERROR("no mdc\n");
-                GOTO(out, err = -EINVAL);
-        }
-
-        err = lustre_common_fill_super(sb, mdc, osc);
-out:
-        if (err)
-                lustre_free_sbi(sb);
-
-        if (mdc)
-                OBD_FREE(mdc, strlen(mdc) + 1);
-        if (osc)
-                OBD_FREE(osc, strlen(osc) + 1);
-
-        RETURN(err);
-} /* ll_read_super */
-
-static int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd,
-                   char *s1, char *s2)
-{
-        struct lustre_cfg_bufs bufs;
-        struct lustre_cfg    * lcfg = NULL;
-        int err;
-               
-        CDEBUG(D_TRACE, "lcfg %s %#x %s %s\n", cfgname, cmd, s1, s2); 
-
-        lustre_cfg_bufs_reset(&bufs, cfgname);
-        if (s1) 
-                lustre_cfg_bufs_set_string(&bufs, 1, s1);
-        if (s2) 
-                lustre_cfg_bufs_set_string(&bufs, 2, s2);
-
-        lcfg = lustre_cfg_new(cmd, &bufs);
-        lcfg->lcfg_nid = nid;
-        err = class_process_config(lcfg);
-        lustre_cfg_free(lcfg);
-        return(err);
-}
-
-static int lustre_process_log(struct lustre_mount_data *lmd, char * profile,
-                       struct config_llog_instance *cfg)
-{
-        struct obd_device *obd;
-        struct lustre_handle mdc_conn = {0, };
-        struct obd_export *exp;
-        char * name = "mdc_dev";
-        class_uuid_t uuid;
-        struct obd_uuid mdc_uuid;
-        struct llog_ctxt *ctxt;
-        struct obd_connect_data ocd = { 0 };
-        lnet_nid_t nid;
-        int i, rc = 0, recov_bk = 1;
-        int err;
-        ENTRY;
-
-        if (lmd_bad_magic(lmd))
-                RETURN(-EINVAL);
-
-        lustre_generate_random_uuid(uuid);
-        class_uuid_unparse(uuid, &mdc_uuid);
-        CDEBUG(D_HA, "generated uuid: %s\n", mdc_uuid.uuid);
+        ll_options(lsi->lsi_lmd->lmd_opts, &sbi->ll_flags);
         
-        nid = lmd->lmd_nid[0];
-        LASSERT(nid != LNET_NID_ANY);
-        rc = do_lcfg(name, nid, LCFG_ADD_UUID, libcfs_nid2str(nid), 0);
-        if (rc < 0)
-                GOTO(out, rc);
-
-        rc = do_lcfg(name, 0, LCFG_ATTACH, LUSTRE_MDC_NAME, mdc_uuid.uuid);
-        if (rc < 0)
-                GOTO(out_del_uuid, rc);
-
-        rc = do_lcfg(name, 0, LCFG_SETUP, lmd->lmd_mds, libcfs_nid2str(nid));
-        if (rc < 0) {
-                LCONSOLE_ERROR("I couldn't establish a connection with the MDS."
-                               " Check that the MDS host NID is correct and the"
-                               " networks are up.\n");
-                GOTO(out_detach, rc);
-        }
-
-        obd = class_name2obd(name);
-        if (obd == NULL)
-                GOTO(out_cleanup, rc = -EINVAL);
-
-        /* Add the redundant MDS nids */
-        for (i = 1; i < lmd->lmd_nid_count; i++) {
-                nid = lmd->lmd_nid[i];
-                rc = do_lcfg(name, nid, LCFG_ADD_UUID, libcfs_nid2str(nid), 0);
-                if (rc) {
-                        CERROR("Add uuid for %s failed %d\n", 
-                               libcfs_nid2str(nid), rc);
-                        continue;
-                }
-                rc = do_lcfg(name, 0, LCFG_ADD_CONN, libcfs_nid2str(nid), 0);
-                if (rc) 
-                        CERROR("Add conn for %s failed %d\n", 
-                               libcfs_nid2str(nid), rc);
-        }
-
-        /* Try all connections, but only once. */
-        rc = obd_set_info_async(obd->obd_self_export,
-                                strlen("init_recov_bk"), "init_recov_bk",
-                                sizeof(recov_bk), &recov_bk, NULL);
-        if (rc)
-                GOTO(out_cleanup, rc);
-
-        ocd.ocd_connect_flags = OBD_CONNECT_ACL;
-
-        rc = obd_connect(&mdc_conn, obd, &mdc_uuid, &ocd);
-        if (rc) {
-                CERROR("cannot connect to %s: rc = %d\n", lmd->lmd_mds, rc);
-                GOTO(out_cleanup, rc);
-        }
-
-        exp = class_conn2export(&mdc_conn);
-
-        ctxt = llog_get_context(exp->exp_obd, LLOG_CONFIG_REPL_CTXT);
-#if 1
-        rc = class_config_parse_llog(ctxt, profile, cfg);
-#else
-        /*
-         * For debugging, it's useful to just dump the log
-         */
-        rc = class_config_dump_llog(ctxt, profile, cfg);
-#endif
-        switch (rc) {
-        case 0:
-                break;
-        case -EINVAL:
-                LCONSOLE_ERROR("%s: The configuration '%s' could not be read "
-                               "from the MDS '%s'.  Make sure this client and "
-                               "the MDS are running compatible versions of "
-                               "Lustre.\n",
-                               obd->obd_name, profile, lmd->lmd_mds);
-                /* fall through */
-        default:
-                LCONSOLE_ERROR("%s: The configuration '%s' could not be read "
-                               "from the MDS '%s'.  This may be the result of "
-                               "communication errors between the client and "
-                               "the MDS, or if the MDS is not running.\n",
-                               obd->obd_name, profile, lmd->lmd_mds);
-                break;
-        }
-
-        /* We don't so much care about errors in cleaning up the config llog
-         * connection, as we have already read the config by this point. */
-        err = obd_disconnect(exp);
-        if (err)
-                CERROR("obd_disconnect failed: rc = %d\n", err);
-
-out_cleanup:
-        err = do_lcfg(name, 0, LCFG_CLEANUP, 0, 0);
-        if (err)
-                CERROR("mdc_cleanup failed: rc = %d\n", err);
-
-out_detach:
-        err = do_lcfg(name, 0, LCFG_DETACH, 0, 0);
-        if (err)
-                CERROR("mdc_detach failed: rc = %d\n", err);
-
-out_del_uuid:
-        /* class_add_uuid adds a nid even if the same uuid exists; we might
-           delete any copy here.  So they all better match. */
-        for (i = 0; i < lmd->lmd_nid_count; i++) {
-                nid = lmd->lmd_nid[i];
-                err = do_lcfg(name, nid, LCFG_DEL_UUID, libcfs_nid2str(nid), 0);
-                if (err)
-                        CERROR("del MDC UUID %s failed: rc = %d\n", 
-                               libcfs_nid2str(nid), err);
-        }
-        /* class_import_put will get rid of the additional connections */
-out:
-        RETURN(rc);
-}
-
-static void lustre_manual_cleanup(struct ll_sb_info *sbi)
-{
-        struct obd_device *obd;
-        int next = 0;
-
-        while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next)) !=NULL) {
-                class_manual_cleanup(obd);
-        }
-
-        if (sbi->ll_lmd != NULL)
-                class_del_profile(sbi->ll_lmd->lmd_profile);
-}
-
-int lustre_fill_super(struct super_block *sb, void *data, int silent)
-{
-        struct lustre_mount_data * lmd = data;
-        struct ll_sb_info *sbi;
-        char *osc = NULL;
-        char *mdc = NULL;
-        int err;
-        ENTRY;
-
-        CDEBUG(D_VFSTRACE, "VFS Op: sb %p\n", sb);
-        if (lmd_bad_magic(lmd))
-                RETURN(-EINVAL);
-
-        sbi = lustre_init_sbi(sb);
-        if (!sbi)
-                RETURN(-ENOMEM);
-
-        if (lmd->lmd_profile) {
-                struct lustre_profile *lprof;
-                struct config_llog_instance cfg;
-                char ll_instance[sizeof(sb) * 2 + 1];
-
-                if (lmd->lmd_mds[0] == '\0') {
-                        CERROR("no mds name\n");
-                        GOTO(out_free, err = -EINVAL);
-                }
-
-                OBD_ALLOC(sbi->ll_lmd, sizeof(*sbi->ll_lmd));
-                if (sbi->ll_lmd == NULL)
-                        GOTO(out_free, err = -ENOMEM);
-                memcpy(sbi->ll_lmd, lmd, sizeof(*lmd));
-                if (lmd->lmd_flags & LMD_FLG_FLOCK)
-                        sbi->ll_flags |= LL_SBI_FLOCK;
-                if (lmd->lmd_flags & LMD_FLG_USER_XATTR)
-                        sbi->ll_flags |= LL_SBI_USER_XATTR;
-
-                /* generate a string unique to this super, let's try
-                 the address of the super itself.*/
-                sprintf(ll_instance, "%p", sb);
-
-                cfg.cfg_instance = ll_instance;
-                cfg.cfg_uuid = sbi->ll_sb_uuid;
-                err = lustre_process_log(lmd, lmd->lmd_profile, &cfg);
-                if (err < 0) {
-                        CERROR("Unable to process log: %s\n", lmd->lmd_profile);
-                        GOTO(out_free, err);
-                }
-
-                lprof = class_get_profile(lmd->lmd_profile);
-                if (lprof == NULL) {
-                        CERROR("No profile found: %s\n", lmd->lmd_profile);
-                        GOTO(out_free, err = -EINVAL);
-                }
-                if (osc)
-                        OBD_FREE(osc, strlen(osc) + 1);
-                OBD_ALLOC(osc, strlen(lprof->lp_osc) +
-                          strlen(ll_instance) + 2);
-                sprintf(osc, "%s-%s", lprof->lp_osc, ll_instance);
-
-                if (mdc)
-                        OBD_FREE(mdc, strlen(mdc) + 1);
-                OBD_ALLOC(mdc, strlen(lprof->lp_mdc) +
-                          strlen(ll_instance) + 2);
-                sprintf(mdc, "%s-%s", lprof->lp_mdc, ll_instance);
-        }
-
-        if (!osc) {
-                CERROR("no osc\n");
-                GOTO(out_free, err = -EINVAL);
+        /* Generate a string unique to this super, in case some joker tries
+           to mount the same fs at two mount points. 
+           Use the address of the super itself.*/
+        sprintf(ll_instance, "%p", sb);
+        cfg.cfg_instance = ll_instance;
+        cfg.cfg_uuid = lsi->lsi_llsbi->ll_sb_uuid;
+        cfg.cfg_last_idx = 0;
+
+        /* set up client obds */
+        err = lustre_process_log(sb, profilenm, &cfg);
+        if (err < 0) {
+                CERROR("Unable to process log: %d\n", err);
+                GOTO(out_free, err);
         }
 
-        if (!mdc) {
-                CERROR("no mdc\n");
+        lprof = class_get_profile(profilenm);
+        if (lprof == NULL) {
+                CERROR("No profile found: %s\n", profilenm);
                 GOTO(out_free, err = -EINVAL);
         }
-
-        err = lustre_common_fill_super(sb, mdc, osc);
-
-        if (err)
-                GOTO(out_free, err);
-
-out_dev:
+        CDEBUG(D_CONFIG, "Found profile %s: mdc=%s osc=%s\n", profilenm, 
+               lprof->lp_mdc, lprof->lp_osc);
+
+        OBD_ALLOC(osc, strlen(lprof->lp_osc) +
+                  strlen(ll_instance) + 2);
+        if (!osc) 
+                GOTO(out_free, err = -ENOMEM);
+        sprintf(osc, "%s-%s", lprof->lp_osc, ll_instance);
+
+        OBD_ALLOC(mdc, strlen(lprof->lp_mdc) +
+                  strlen(ll_instance) + 2);
+        if (!mdc) 
+                GOTO(out_free, err = -ENOMEM);
+        sprintf(mdc, "%s-%s", lprof->lp_mdc, ll_instance);
+  
+        /* connections, registrations, sb setup */
+        err = client_common_fill_super(sb, mdc, osc);
+  
+out_free:
         if (mdc)
                 OBD_FREE(mdc, strlen(mdc) + 1);
         if (osc)
                 OBD_FREE(osc, strlen(osc) + 1);
-
-        RETURN(err);
-
-out_free:
-        if (sbi->ll_lmd) {
-                lustre_manual_cleanup(sbi);
-                OBD_FREE(sbi->ll_lmd, sizeof(*sbi->ll_lmd));
+        if (err) {
+                struct obd_device *obd;
+                int next = 0;
+                /* like ll_put_super below */
+                lustre_end_log(sb, NULL, &cfg);
+                while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next)) 
+                       != NULL) {
+                        class_manual_cleanup(obd);
+                }                       
+                class_del_profile(profilenm);
+                ll_free_sbi(sb);
+                lsi->lsi_llsbi = NULL;
+                lustre_common_put_super(sb);
         }
-        lustre_free_sbi(sb);
+        RETURN(err);
+} /* ll_fill_super */
 
-        goto out_dev;
-} /* lustre_fill_super */
 
-void lustre_put_super(struct super_block *sb)
+void ll_put_super(struct super_block *sb)
 {
+        struct config_llog_instance cfg;
+        char   ll_instance[sizeof(sb) * 2 + 1];
         struct obd_device *obd;
+        struct lustre_sb_info *lsi = s2lsi(sb);
         struct ll_sb_info *sbi = ll_s2sbi(sb);
-        int force = 0;
+        char *profilenm = get_profile_name(sb);
+        int next;
         ENTRY;
 
-        CDEBUG(D_VFSTRACE, "VFS Op: sb %p\n", sb);
+        CDEBUG(D_VFSTRACE, "VFS Op: sb %p - %s\n", sb, profilenm);
+        
+        sprintf(ll_instance, "%p", sb);
+        cfg.cfg_instance = ll_instance;
+        lustre_end_log(sb, NULL, &cfg);
+        
         obd = class_exp2obd(sbi->ll_mdc_exp);
         if (obd) {
-                int next = 0;
-                /* We need to set force before the lov_disconnect in
+                int force = obd->obd_no_recov;
+                /* We need to set force before the lov_disconnect in 
                 lustre_common_put_super, since l_d cleans up osc's as well. */
-                force = obd->obd_no_recov;
-                while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next))
-                       !=NULL) {
+                next = 0;
+                while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next)) 
+                       != NULL) {
                         obd->obd_force = force;
-                }
+                }                       
         }
 
-        lustre_common_put_super(sb);
+        client_common_put_super(sb);
+                
+        next = 0;
+        while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next)) !=NULL) {
+                class_manual_cleanup(obd);
+        }                       
+        
+        if (profilenm) 
+                class_del_profile(profilenm);
 
-        if (sbi->ll_lmd != NULL) {
-                lustre_manual_cleanup(sbi);
-                OBD_FREE(sbi->ll_lmd, sizeof(*sbi->ll_lmd));
-        }
+        ll_free_sbi(sb);
+        lsi->lsi_llsbi = NULL;
 
-        lustre_free_sbi(sb);
+        lustre_common_put_super(sb);
 
+        LCONSOLE_WARN("client umount complete\n");
         EXIT;
-} /* lustre_put_super */
+} /* client_put_super */
 
 #ifdef HAVE_REGISTER_CACHE
 #include <linux/cache_def.h>
@@ -1666,12 +1430,18 @@ int ll_iocontrol(struct inode *inode, struct file *file,
         RETURN(0);
 }
 
+/* umount -f client means force down, don't save state */
 void ll_umount_begin(struct super_block *sb)
 {
+        struct lustre_sb_info *lsi = s2lsi(sb);
         struct ll_sb_info *sbi = ll_s2sbi(sb);
         struct obd_device *obd;
         struct obd_ioctl_data ioc_data = { 0 };
         ENTRY;
+
+        /* Tell the MGC we got umount -f */
+        lsi->lsi_flags |= LSI_UMOUNT_FORCE;
+
         CDEBUG(D_VFSTRACE, "VFS Op: superblock %p count %d active %d\n", sb,
                sb->s_count, atomic_read(&sb->s_active));
 
@@ -1707,12 +1477,12 @@ void ll_umount_begin(struct super_block *sb)
         EXIT;
 }
 
-int lustre_remount_fs(struct super_block *sb, int *flags, char *data)
+int ll_remount_fs(struct super_block *sb, int *flags, char *data)
 {
         struct ll_sb_info *sbi = ll_s2sbi(sb);
         int err;
         __u32 read_only;
-
         if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
                 read_only = *flags & MS_RDONLY;
                 err = obd_set_info_async(sbi->ll_mdc_exp, strlen("read-only"),
@@ -1723,7 +1493,7 @@ int lustre_remount_fs(struct super_block *sb, int *flags, char *data)
                                "remount: %d\n", err);
                         return err;
                 }
-
                 if (read_only)
                         sb->s_flags |= MS_RDONLY;
                 else
index 0a54eca..d655cf4 100644 (file)
@@ -486,7 +486,7 @@ int llap_shrink_cache(struct ll_sb_info *sbi, int shrink_fraction)
                         ll_teardown_mmaps(page->mapping,
                                          (__u64)page->index<<PAGE_CACHE_SHIFT,
                                          ((__u64)page->index<<PAGE_CACHE_SHIFT)|
-                                         ~PAGE_CACHE_MASK);
+                                          ~PAGE_CACHE_MASK);
                         if (!PageDirty(page) && !page_mapped(page)) {
                                 ll_ra_accounting(llap, page->mapping);
                                 ll_truncate_complete_page(page);
index e2f60fb..3d3fef2 100644 (file)
 extern struct address_space_operations ll_aops;
 extern struct address_space_operations ll_dir_aops;
 
-static struct super_block *ll_read_super(struct super_block *sb,
-                                         void *data, int silent)
-{
-        int err;
-        ENTRY;
-        err = ll_fill_super(sb, data, silent);
-        if (err)
-                RETURN(NULL);
-        RETURN(sb);
-}
-
-static struct super_block *lustre_read_super(struct super_block *sb,
-                                             void *data, int silent)
-{
-        int err;
-        ENTRY;
-        err = lustre_fill_super(sb, data, silent);
-        if (err)
-                RETURN(NULL);
-        RETURN(sb);
-}
-
-static struct file_system_type lustre_lite_fs_type = {
-        .owner          = THIS_MODULE,
-        .name           = "lustre_lite",
-        .fs_flags       = FS_NFSEXP_FSID,
-        .read_super     = ll_read_super,
-};
 
 /* exported operations */
 struct super_operations lustre_super_operations =
 {
         .read_inode2    = ll_read_inode2,
         .clear_inode    = ll_clear_inode,
-        .put_super      = lustre_put_super,
+        .put_super      = ll_put_super,
         .statfs         = ll_statfs,
         .umount_begin   = ll_umount_begin,
         .fh_to_dentry   = ll_fh_to_dentry,
         .dentry_to_fh   = ll_dentry_to_fh,
-        .remount_fs     = lustre_remount_fs,
-};
-
-static struct file_system_type lustre_fs_type = {
-        .owner          = THIS_MODULE,
-        .name           = "lustre",
-        .fs_flags       = FS_NFSEXP_FSID,
-        .read_super     = lustre_read_super,
+        .remount_fs     = ll_remount_fs,
 };
 
 static int __init init_lustre_lite(void)
 {
         int rc, seed[2];
 
-        printk(KERN_INFO "Lustre: Lustre Lite Client File System; "
+        printk(KERN_INFO "Lustre: Lustre Client File System; "
                "info@clusterfs.com\n");
         ll_file_data_slab = kmem_cache_create("ll_file_data",
                                               sizeof(struct ll_file_data), 0,
@@ -107,14 +72,7 @@ static int __init init_lustre_lite(void)
 
         ll_register_cache(&ll_cache_definition);
 
-        rc = register_filesystem(&lustre_lite_fs_type);
-        if (rc == 0)
-                rc = register_filesystem(&lustre_fs_type);
-        if (rc) {
-                /* This is safe even if lustre_lite_fs_type isn't registered */
-                unregister_filesystem(&lustre_lite_fs_type);
-                ll_unregister_cache(&ll_cache_definition);
-        }
+        lustre_register_client_fill_super(ll_fill_super);
 
         get_random_bytes(seed, sizeof(seed));
         ll_srand(seed[0], seed[1]);
@@ -126,9 +84,8 @@ static void __exit exit_lustre_lite(void)
 {
         int rc;
 
-        unregister_filesystem(&lustre_lite_fs_type);
-        unregister_filesystem(&lustre_fs_type);
-
+        lustre_register_client_fill_super(NULL);
+        
         ll_unregister_cache(&ll_cache_definition);
 
         rc = kmem_cache_destroy(ll_file_data_slab);
index c3072a3..b6e7d51 100644 (file)
 #include <lprocfs_status.h>
 #include "llite_internal.h"
 
-struct super_block * ll_get_sb(struct file_system_type *fs_type,
-                               int flags, const char *devname, void * data)
-{
-        /* calls back in fill super */
-        return get_sb_nodev(fs_type, flags, data, ll_fill_super);
-}
-
-struct super_block * lustre_get_sb(struct file_system_type *fs_type,
-                               int flags, const char *devname, void * data)
-{
-        /* calls back in fill super */
-        return get_sb_nodev(fs_type, flags, data, lustre_fill_super);
-}
-
 static kmem_cache_t *ll_inode_cachep;
 
 static struct inode *ll_alloc_inode(struct super_block *sb)
@@ -105,33 +91,17 @@ struct super_operations lustre_super_operations =
         .alloc_inode   = ll_alloc_inode,
         .destroy_inode = ll_destroy_inode,
         .clear_inode   = ll_clear_inode,
-        .put_super     = lustre_put_super,
+        .put_super     = ll_put_super,
         .statfs        = ll_statfs,
         .umount_begin  = ll_umount_begin,
-        .remount_fs    = lustre_remount_fs,
-};
-
-
-struct file_system_type lustre_lite_fs_type = {
-        .owner        = THIS_MODULE,
-        .name         = "lustre_lite",
-        .get_sb       = ll_get_sb,
-        .kill_sb      = kill_anon_super,
-        .fs_flags     = FS_BINARY_MOUNTDATA,
+        .remount_fs    = ll_remount_fs,
 };
 
-struct file_system_type lustre_fs_type = {
-        .owner        = THIS_MODULE,
-        .name         = "lustre",
-        .get_sb       = lustre_get_sb,
-        .kill_sb      = kill_anon_super,
-        .fs_flags     = FS_BINARY_MOUNTDATA,
-};
 
 static int __init init_lustre_lite(void)
 {
         int rc, seed[2];
-        printk(KERN_INFO "Lustre: Lustre Lite Client File System; "
+        printk(KERN_INFO "Lustre: Lustre Client File System; "
                "info@clusterfs.com\n");
         rc = ll_init_inodecache();
         if (rc)
@@ -148,19 +118,12 @@ static int __init init_lustre_lite(void)
                               proc_mkdir("llite", proc_lustre_root) : NULL;
 
         ll_register_cache(&ll_cache_definition);
-
-        rc = register_filesystem(&lustre_lite_fs_type);
-        if (rc == 0)
-                rc = register_filesystem(&lustre_fs_type);
-        if (rc) {
-                /* This is safe even if lustre_lite_fs_type isn't registered */
-                unregister_filesystem(&lustre_lite_fs_type);
-                ll_unregister_cache(&ll_cache_definition);
-        }
-
+        
+        lustre_register_client_fill_super(ll_fill_super);
+        
         get_random_bytes(seed, sizeof(seed));
         ll_srand(seed[0], seed[1]);
-
+        
         return rc;
 }
 
@@ -168,8 +131,7 @@ static void __exit exit_lustre_lite(void)
 {
         int rc;
 
-        unregister_filesystem(&lustre_fs_type);
-        unregister_filesystem(&lustre_lite_fs_type);
+        lustre_register_client_fill_super(NULL);
 
         ll_unregister_cache(&ll_cache_definition);
 
index 23ccc08..454b5a6 100644 (file)
@@ -107,8 +107,12 @@ static int lov_llog_origin_connect(struct llog_ctxt *ctxt, int count,
         int i, rc = 0;
         ENTRY;
 
-        LASSERT(lov->desc.ld_tgt_count  == count);
-        for (i = 0, tgt = lov->tgts; i < lov->desc.ld_tgt_count; i++, tgt++) {
+        /* We might have added an osc and not told the mds yet */
+        if (count != lov->desc.ld_tgt_count)
+                CERROR("Origin connect mds cnt %d != lov cnt %d\n", count,
+                       lov->desc.ld_tgt_count);
+
+        for (i = 0, tgt = lov->tgts; i < count; i++, tgt++) {
                 struct obd_device *child;
                 struct llog_ctxt *cctxt;
                 
@@ -121,7 +125,7 @@ static int lov_llog_origin_connect(struct llog_ctxt *ctxt, int count,
 
                 rc = llog_connect(cctxt, 1, logid, gen, uuid);
                 if (rc) {
-                        CERROR("error osc_llog_connect %d\n", i);
+                        CERROR("error osc_llog_connect tgt %d (%d)\n", i, rc);
                         break;
                 }
         }
@@ -188,6 +192,7 @@ int lov_llog_init(struct obd_device *obd, struct obd_device *tgt,
         if (rc)
                 RETURN(rc);
 
+        CDEBUG(D_CONFIG, "llog init with %d targets\n", count);
         LASSERT(lov->desc.ld_tgt_count == count);
         for (i = 0, ctgt = lov->tgts; i < lov->desc.ld_tgt_count; i++, ctgt++) {
                 struct obd_device *child;
@@ -196,7 +201,7 @@ int lov_llog_init(struct obd_device *obd, struct obd_device *tgt,
                 child = ctgt->ltd_exp->exp_obd;
                 rc = obd_llog_init(child, tgt, 1, logid + i);
                 if (rc) {
-                        CERROR("error osc_llog_init %d\n", i);
+                        CERROR("error osc_llog_init %d (%d)\n", i, rc);
                         break;
                 }
         }
index 4fcc9d1..4b22292 100644 (file)
@@ -46,6 +46,7 @@
 #include <obd_lov.h>
 #include <obd_ost.h>
 #include <lprocfs_status.h>
+#include <lustre_param.h>
 
 #include "lov_internal.h"
 
@@ -107,17 +108,22 @@ static int lov_connect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt,
                                         &obd->obd_uuid);
 
         if (!tgt_obd) {
-                CERROR("Target %s not attached\n", tgt_uuid->uuid);
+                CERROR("Target %s not attached\n", obd_uuid2str(tgt_uuid));
                 RETURN(-EINVAL);
         }
+        
+        CDEBUG(D_CONFIG, "Connect tgt %s (%s)\n", obd_uuid2str(tgt_uuid),
+               tgt_obd->obd_name);
 
         if (!tgt_obd->obd_set_up) {
-                CERROR("Target %s not set up\n", tgt_uuid->uuid);
+                CERROR("Target %s not set up\n", obd_uuid2str(tgt_uuid));
                 RETURN(-EINVAL);
         }
 
         if (activate) {
                 tgt_obd->obd_no_recov = 0;
+                /* FIXME this is probably supposed to be 
+                   ptlrpc_set_import_active.  Horrible naming. */
                 ptlrpc_activate_import(tgt_obd->u.cli.cl_import);
         }
 
@@ -128,19 +134,20 @@ static int lov_connect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt,
 
         if (imp->imp_invalid) {
                 CERROR("not connecting OSC %s; administratively "
-                       "disabled\n", tgt_uuid->uuid);
+                       "disabled\n", obd_uuid2str(tgt_uuid));
                 rc = obd_register_observer(tgt_obd, obd);
                 if (rc) {
                         CERROR("Target %s register_observer error %d; "
                                "will not be able to reactivate\n",
-                               tgt_uuid->uuid, rc);
+                               obd_uuid2str(tgt_uuid), rc);
                 }
                 RETURN(0);
         }
 
         rc = obd_connect(&conn, tgt_obd, &lov_osc_uuid, data);
         if (rc) {
-                CERROR("Target %s connect error %d\n", tgt_uuid->uuid, rc);
+                CERROR("Target %s connect error %d\n",
+                       obd_uuid2str(tgt_uuid), rc);
                 RETURN(rc);
         }
         tgt->ltd_exp = class_conn2export(&conn);
@@ -148,7 +155,7 @@ static int lov_connect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt,
         rc = obd_register_observer(tgt_obd, obd);
         if (rc) {
                 CERROR("Target %s register_observer error %d\n",
-                       tgt_uuid->uuid, rc);
+                       obd_uuid2str(tgt_uuid), rc);
                 obd_disconnect(tgt->ltd_exp);
                 tgt->ltd_exp = NULL;
                 RETURN(rc);
@@ -191,58 +198,20 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd,
                        struct obd_uuid *cluuid, struct obd_connect_data *data)
 {
         struct lov_obd *lov = &obd->u.lov;
-        struct lov_tgt_desc *tgt;
-        struct obd_export *exp;
-        __u64 connect_flags = data ? data->ocd_connect_flags : 0;
-        int rc, rc2, i;
+        int rc;
         ENTRY;
 
-        rc = class_connect(conn, obd, cluuid);
-        if (rc)
-                RETURN(rc);
-
-        exp = class_conn2export(conn);
+        lov->ocd.ocd_connect_flags = OBD_CONNECT_EMPTY; 
+        if (data) 
+                lov->ocd = *data;
 
-        /* We don't want to actually do the underlying connections more than
-         * once, so keep track. */
-        lov->connects++;
-        if (lov->connects > 1) {
-                class_export_put(exp);
-                RETURN(0);
-        }
-
-        for (i = 0, tgt = lov->tgts; i < lov->desc.ld_tgt_count; i++, tgt++) {
-                if (obd_uuid_empty(&tgt->uuid))
-                        continue;
-                if (connect_flags & OBD_CONNECT_INDEX)
-                        data->ocd_index = i;
-                rc = lov_connect_obd(obd, tgt, 0, data);
-                if (rc)
-                        GOTO(out_disc, rc);
-                if (data)
-                        connect_flags &= data->ocd_connect_flags;
-        }
-
-        if (data)
-                data->ocd_connect_flags = connect_flags;
+        rc = class_connect(conn, obd, cluuid);
+        if (!rc) 
+                lov->connects++;
+        CDEBUG(D_CONFIG, "connect #%d\n", lov->connects);
 
-        class_export_put(exp);
-        RETURN (0);
+        /* target connects are done in lov_add_target */
 
- out_disc:
-        while (i-- > 0) {
-                struct obd_uuid uuid;
-                --tgt;
-                --lov->desc.ld_active_tgt_count;
-                tgt->active = 0;
-                /* save for CERROR below; (we know it's terminated) */
-                uuid = tgt->uuid;
-                rc2 = obd_disconnect(tgt->ltd_exp);
-                if (rc2)
-                        CERROR("error: LOV target %s disconnect on OST idx %d: "
-                               "rc = %d\n", uuid.uuid, i, rc2);
-        }
-        class_disconnect(exp);
         RETURN (rc);
 }
 
@@ -254,7 +223,8 @@ static int lov_disconnect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt)
         int rc;
         ENTRY;
 
-        CDEBUG(D_CONFIG, "Disconnecting lov target %s\n", obd->obd_uuid.uuid);
+        CDEBUG(D_CONFIG, "%s: disconnecting target %s\n", 
+               obd->obd_name, osc_obd->obd_name);
 
         lov_proc_dir = lprocfs_srch(obd->obd_proc_entry, "target_obds");
         if (lov_proc_dir) {
@@ -299,8 +269,8 @@ static int lov_disconnect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt)
         RETURN(0);
 }
 
-static int lov_del_obd(struct obd_device *obd, struct obd_uuid *uuidp,
-                       int index, int gen);
+static int lov_del_target(struct obd_device *obd, struct obd_uuid *uuidp,
+                          int index, int gen);
 
 static int lov_disconnect(struct obd_export *exp)
 {
@@ -315,8 +285,11 @@ static int lov_disconnect(struct obd_export *exp)
 
         /* Only disconnect the underlying layers on the final disconnect. */
         lov->connects--;
-        if (lov->connects != 0)
+        if (lov->connects != 0) {
+                /* why should there be more than 1 connect? */
+                CERROR("disconnect #%d\n", lov->connects);
                 goto out;
+        }
 
         /* Let's hold another reference so lov_del_obd doesn't spin through
            putref every time */
@@ -324,13 +297,13 @@ static int lov_disconnect(struct obd_export *exp)
         for (i = 0, tgt = lov->tgts; i < lov->desc.ld_tgt_count; i++, tgt++) {
                 if (tgt->ltd_exp) {
                         /* Disconnection is the last we know about an obd */
-                        lov_del_obd(obd, &tgt->uuid, i, tgt->ltd_gen);
+                        lov_del_target(obd, &tgt->uuid, i, tgt->ltd_gen);
                 }
         }
         lov_putref(obd);
 
 out:
-        rc = class_disconnect(exp);
+        rc = class_disconnect(exp); /* bz 9811 */
         RETURN(rc);
 }
 
@@ -384,21 +357,24 @@ static int lov_set_osc_active(struct lov_obd *lov, struct obd_uuid *uuid,
 }
 
 static int lov_notify(struct obd_device *obd, struct obd_device *watched,
-                      enum obd_notify_event ev)
+                      enum obd_notify_event ev, void *data)
 {
-        struct obd_uuid *uuid;
-        int rc;
+        int rc = 0;
         ENTRY;
 
-        if (strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME)) {
-                CERROR("unexpected notification of %s %s!\n",
-                       watched->obd_type->typ_name,
-                       watched->obd_name);
-                RETURN(-EINVAL);
-        }
-        uuid = &watched->u.cli.cl_target_uuid;
-
         if (ev == OBD_NOTIFY_ACTIVE || ev == OBD_NOTIFY_INACTIVE) {
+                struct obd_uuid *uuid;
+
+                LASSERT(watched);
+                
+                if (strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME)) {
+                        CERROR("unexpected notification of %s %s!\n",
+                               watched->obd_type->typ_name,
+                               watched->obd_name);
+                        RETURN(-EINVAL);
+                }
+                uuid = &watched->u.cli.cl_target_uuid;
+
                 /* Set OSC as active before notifying the observer, so the
                  * observer can use the OSC normally.
                  */
@@ -409,29 +385,51 @@ static int lov_notify(struct obd_device *obd, struct obd_device *watched,
                 if (rc) {
                         CERROR("%sactivation of %s failed: %d\n",
                                (ev == OBD_NOTIFY_ACTIVE) ? "" : "de",
-                               uuid->uuid, rc);
+                               obd_uuid2str(uuid), rc);
                         RETURN(rc);
                 }
         }
 
         /* Pass the notification up the chain. */
-        rc = obd_notify_observer(obd, watched, ev);
+        if (watched) {
+                rc = obd_notify_observer(obd, watched, ev, data);
+        } else {
+                /* NULL watched means all osc's in the lov (only for syncs) */
+                struct lov_obd *lov = &obd->u.lov;
+                struct lov_tgt_desc *tgt;
+                struct obd_device *tgt_obd;
+                int i;
+                for (i = 0, tgt = lov->tgts; i < lov->desc.ld_tgt_count; 
+                      i++, tgt++) {
+                        if (obd_uuid_empty(&tgt->uuid))
+                                continue;
+                        tgt_obd = class_exp2obd(tgt->ltd_exp);
+                        rc = obd_notify_observer(obd, tgt_obd, ev, data);
+                        if (rc) {
+                                CERROR("%s: notify %s of %s failed %d\n",
+                                       obd->obd_name, 
+                                       obd->obd_observer->obd_name,
+                                       tgt_obd->obd_name, rc);
+                                break;
+                        }
+                }
+        }
 
         RETURN(rc);
 }
 
-static int
-lov_add_obd(struct obd_device *obd, struct obd_uuid *uuidp, int index, int gen)
+static int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp,
+                          int index, int gen, int active)
 {
         struct lov_obd *lov = &obd->u.lov;
         struct lov_tgt_desc *tgt;
-        obd_id params[2];
-        int rc, old_count;
-        __u32 bufsize, size = 2;
+        struct obd_connect_data *ocd = NULL;
+        __u32 bufsize, idx;
+        int rc;
         ENTRY;
 
-        CDEBUG(D_CONFIG, "uuid: %s idx: %d gen: %d\n",
-               uuidp->uuid, index, gen);
+        CDEBUG(D_CONFIG, "uuid:%s idx:%d gen:%d active:%d\n",
+               uuidp->uuid, index, gen, active);
 
         if (index < 0) {
                 CERROR("request to add OBD %s at invalid index: %d\n",
@@ -474,8 +472,8 @@ lov_add_obd(struct obd_device *obd, struct obd_uuid *uuidp, int index, int gen)
 
         tgt = &lov->tgts[index];
         if (!obd_uuid_empty(&tgt->uuid)) {
-                CERROR("OBD already assigned at LOV target index %d\n",
-                       index);
+                CERROR("UUID %s already assigned at LOV target index %d\n",
+                       obd_uuid2str(&tgt->uuid), index);
                 RETURN(-EEXIST);
         }
 
@@ -485,18 +483,12 @@ lov_add_obd(struct obd_device *obd, struct obd_uuid *uuidp, int index, int gen)
         tgt->index = index;
         INIT_LIST_HEAD(&tgt->qos_bavail_list);
 
-        old_count = lov->desc.ld_tgt_count;
         if (index >= lov->desc.ld_tgt_count)
                 lov->desc.ld_tgt_count = index + 1;
 
         CDEBUG(D_CONFIG, "idx=%d ltd_gen=%d ld_tgt_count=%d\n",
                 index, tgt->ltd_gen, lov->desc.ld_tgt_count);
 
-        if (lov->connects == 0)
-                /* lov_connect hasn't been called yet. So we'll do the
-                   lov_connect_obd on this obd when that fn first runs. */
-                RETURN(0);
-
         if (tgt->ltd_exp) {
                 struct obd_device *osc_obd;
 
@@ -505,37 +497,36 @@ lov_add_obd(struct obd_device *obd, struct obd_uuid *uuidp, int index, int gen)
                         osc_obd->obd_no_recov = 0;
         }
 
-        /* NULL may need to change when we use flags for osc's */
-        rc = lov_connect_obd(obd, tgt, 1, NULL);
-        if (rc || !obd->obd_observer)
-                RETURN(rc);
-
-        /* tell the mds_lov about the new target */
-        obd_llog_finish(obd->obd_observer, old_count);
-        llog_cat_initialize(obd->obd_observer, lov->desc.ld_tgt_count);
-
-        params[0] = index;
-        rc = obd_get_info(tgt->ltd_exp, strlen("last_id"), "last_id", &size,
-                          &params[1]);
+        if (lov->ocd.ocd_connect_flags != OBD_CONNECT_EMPTY) { 
+                /* Keep the original connect flags pristine */
+                OBD_ALLOC(ocd, sizeof(*ocd));
+                if (!ocd) 
+                        RETURN(-ENOMEM);
+                *ocd = lov->ocd;
+        }
+        rc = lov_connect_obd(obd, tgt, active, ocd);
+        if (ocd)
+                OBD_FREE(ocd, sizeof(*ocd));
         if (rc)
                 GOTO(out, rc);
 
-        rc = obd_set_info_async(obd->obd_observer->obd_self_export,
-                                strlen("next_id"),"next_id", 2, params, NULL);
-        if (rc)
-                GOTO(out, rc);
+        idx = index;
+        rc = lov_notify(obd, tgt->ltd_exp->exp_obd, 
+                        active ? OBD_NOTIFY_ACTIVE : OBD_NOTIFY_INACTIVE,
+                        (void *)&idx);
 
-        rc = lov_notify(obd, tgt->ltd_exp->exp_obd, OBD_NOTIFY_ACTIVE);
-        GOTO(out, rc);
- out:
-        if (rc && tgt->ltd_exp != NULL)
-                lov_disconnect_obd(obd, tgt);
-        return rc;
+out:
+        if (rc) {
+                CERROR("add failed (%d), deleting %s\n", rc, 
+                       (char *)tgt->uuid.uuid);
+                lov_del_target(obd, &tgt->uuid, index, 0);
+        }
+        RETURN(rc);
 }
 
 /* Schedule a target for deletion */
-static int
-lov_del_obd(struct obd_device *obd, struct obd_uuid *uuidp, int index, int gen)
+static int lov_del_target(struct obd_device *obd, struct obd_uuid *uuidp, 
+                          int index, int gen)
 {
         struct lov_obd *lov = &obd->u.lov;
         struct lov_tgt_desc *tgt;
@@ -556,7 +547,7 @@ lov_del_obd(struct obd_device *obd, struct obd_uuid *uuidp, int index, int gen)
                 RETURN(-EINVAL);
         }
 
-        if (strncmp(uuidp->uuid, tgt->uuid.uuid, sizeof uuidp->uuid) != 0) {
+        if (!obd_uuid_equals(uuidp, &tgt->uuid)) {
                 CERROR("LOV target UUID %s at index %d doesn't match %s.\n",
                        tgt->uuid.uuid, index, uuidp->uuid);
                 RETURN(-EINVAL);
@@ -581,6 +572,9 @@ static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt)
         LASSERT(tgt->reap);
         osc_obd = class_exp2obd(tgt->ltd_exp);
 
+        CDEBUG(D_CONFIG, "Removing tgt %s : %s\n", tgt->uuid.uuid, 
+               osc_obd ? osc_obd->obd_name : "<no obd>");
+
         if (tgt->ltd_exp)
                 lov_disconnect_obd(obd, tgt);
 
@@ -602,6 +596,31 @@ static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt)
         }
 }
 
+static void lov_fix_desc(struct lov_desc *desc)
+{
+        if (desc->ld_default_stripe_size < PTLRPC_MAX_BRW_SIZE) {
+                CWARN("Increasing default_stripe_size "LPU64" to %u\n",
+                      desc->ld_default_stripe_size, PTLRPC_MAX_BRW_SIZE);
+                desc->ld_default_stripe_size = PTLRPC_MAX_BRW_SIZE;
+        } else if (desc->ld_default_stripe_size & (LOV_MIN_STRIPE_SIZE - 1)) {
+                CWARN("default_stripe_size "LPU64" isn't a multiple of %u\n",
+                      desc->ld_default_stripe_size, LOV_MIN_STRIPE_SIZE);
+                desc->ld_default_stripe_size &= ~(LOV_MIN_STRIPE_SIZE - 1);
+                CWARN("changing to "LPU64"\n", desc->ld_default_stripe_size);
+       }
+
+        if (desc->ld_default_stripe_count == 0)
+                desc->ld_default_stripe_count = 1;
+
+        /* from lov_setstripe */
+        if ((desc->ld_pattern != 0) && 
+            (desc->ld_pattern != LOV_PATTERN_RAID0)) {
+                CDEBUG(D_IOCTL, "bad userland stripe pattern: %#x\n",
+                       desc->ld_pattern);
+                desc->ld_pattern = 0;
+        }
+}
+
 static int lov_setup(struct obd_device *obd, obd_count len, void *buf)
 {
         struct lprocfs_static_vars lvars;
@@ -637,22 +656,7 @@ static int lov_setup(struct obd_device *obd, obd_count len, void *buf)
                 }
         }
 
-        if (desc->ld_default_stripe_size < PTLRPC_MAX_BRW_SIZE) {
-                CWARN("Increasing default_stripe_size "LPU64" to %u\n",
-                      desc->ld_default_stripe_size, PTLRPC_MAX_BRW_SIZE);
-                CWARN("Please update config and run --write-conf on MDS\n");
-
-                desc->ld_default_stripe_size = PTLRPC_MAX_BRW_SIZE;
-        } else if (desc->ld_default_stripe_size & (LOV_MIN_STRIPE_SIZE - 1)) {
-                CWARN("default_stripe_size "LPU64" isn't a multiple of %u\n",
-                      desc->ld_default_stripe_size, LOV_MIN_STRIPE_SIZE);
-                CWARN("Please update config and run --write-conf on MDS\n");
-
-                desc->ld_default_stripe_size &= ~(LOV_MIN_STRIPE_SIZE - 1);
-       }
-
-        if (desc->ld_default_stripe_count == 0)
-                desc->ld_default_stripe_count = 1;
+        lov_fix_desc(desc);
 
         /* Because of 64-bit divide/mod operations only work with a 32-bit
          * divisor in a 32-bit kernel, we cannot support a stripe width
@@ -746,8 +750,11 @@ static int lov_cleanup(struct obd_device *obd)
                         /* We should never get here - these should have
                            been removed in the disconnect. */
                         if (!obd_uuid_empty(&tgt->uuid)) {
-                                CERROR("lov tgt %d not cleaned!\n", i);
-                                lov_del_obd(obd, &tgt->uuid, i, 0);
+                                CERROR("lov tgt %d not cleaned!"
+                                       " deathrow=%d, lovrc=%d\n",
+                                       i, lov->death_row, 
+                                       atomic_read(&lov->refcount));
+                                lov_del_target(obd, &tgt->uuid, i, 0);
                         }
                 }
                 OBD_FREE(lov->tgts, lov->bufsize);
@@ -767,7 +774,9 @@ static int lov_process_config(struct obd_device *obd, obd_count len, void *buf)
 
         switch(cmd = lcfg->lcfg_command) {
         case LCFG_LOV_ADD_OBD:
+        case LCFG_LOV_ADD_INA:
         case LCFG_LOV_DEL_OBD: {
+                /* lov_modify_tgts add  0:lov_mdsA  1:ost1_UUID  2:0  3:1 */
                 if (LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(obd_uuid.uuid))
                         GOTO(out, rc = -EINVAL);
 
@@ -778,9 +787,52 @@ static int lov_process_config(struct obd_device *obd, obd_count len, void *buf)
                 if (sscanf(lustre_cfg_buf(lcfg, 3), "%d", &gen) != 1)
                         GOTO(out, rc = -EINVAL);
                 if (cmd == LCFG_LOV_ADD_OBD)
-                        rc = lov_add_obd(obd, &obd_uuid, index, gen);
+                        rc = lov_add_target(obd, &obd_uuid, index, gen, 1);
+                else if (cmd == LCFG_LOV_ADD_INA)
+                        rc = lov_add_target(obd, &obd_uuid, index, gen, 0);
                 else
-                        rc = lov_del_obd(obd, &obd_uuid, index, gen);
+                        rc = lov_del_target(obd, &obd_uuid, index, gen);
+                GOTO(out, rc);
+        }
+        case LCFG_PARAM: {
+                int i;
+                struct lov_obd *lov = &obd->u.lov;
+                struct lov_desc *desc = &(lov->desc);
+                if (!desc)
+                        GOTO(out, rc = -EINVAL);
+                /* see jt_obd_lov_getconfig for variable names */
+                /* setparam 0:lov_mdsA 1:default_stripe_size=1048576 
+                   2:default_stripe_pattern=0 3:default_stripe_offset=0 */
+                for (i = 1; i < lcfg->lcfg_bufcount; i++) {
+                        char *key, *sval;
+                        long val;
+                        key = lustre_cfg_buf(lcfg, i);
+                        sval = strchr(key, '=');
+                        if (!sval || (*(sval + 1) == 0)) {
+                                CERROR("Can't parse param %s\n", key);
+                                rc = -EINVAL;
+                                /* continue parsing other params */
+                                continue;
+                        }
+                        *sval = 0;
+                        val = simple_strtol(sval + 1, NULL, 0);
+                        if (strcmp(key, PARAM_D_STRIPE_SIZE) == 0)
+                                desc->ld_default_stripe_size = val;
+                        else if (strcmp(key, PARAM_D_STRIPE_COUNT) == 0)
+                                desc->ld_default_stripe_count = val;
+                        else if (strcmp(key, PARAM_D_STRIPE_OFFSET) == 0)
+                                desc->ld_default_stripe_offset = val;
+                        else if (strcmp(key, PARAM_D_STRIPE_PATTERN) == 0)
+                                desc->ld_pattern = val;
+                        else {
+                                CERROR("Unknown param %s\n", key);
+                                rc = -EINVAL;
+                                /* continue parsing other params */
+                                continue;
+                        }
+                        LCONSOLE_INFO("set %s to %ld\n", key, val);
+                }
+                lov_fix_desc(desc);
                 GOTO(out, rc);
         }
         default: {
@@ -837,6 +889,8 @@ static int lov_clear_orphans(struct obd_export *export, struct obdo *src_oa,
                 if (ost_uuid && !obd_uuid_equals(ost_uuid, &lov->tgts[i].uuid))
                         continue;
 
+                CDEBUG(D_CONFIG,"Clear orphans for %d:%s\n", i, ost_uuid->uuid);
+
                 memcpy(tmp_oa, src_oa, sizeof(*tmp_oa));
 
                 LASSERT(lov->tgts[i].ltd_exp);
@@ -1482,14 +1536,24 @@ int lov_prep_async_page(struct obd_export *exp, struct lov_stripe_md *lsm,
 {
         struct lov_obd *lov = &exp->exp_obd->u.lov;
         struct lov_async_page *lap;
-        int rc;
+        int rc = 0;
         ENTRY;
 
-        if (!page)
-                return size_round(sizeof(*lap)) +
-                       obd_prep_async_page(lov->tgts[0].ltd_exp, NULL, NULL,
-                                           NULL, 0, NULL, NULL, NULL);
-
+        if (!page) {
+                int i = 0;
+                /* Find an existing osc so we can get it's stupid sizeof(*oap).
+                   Only because of this layering limitation will a client 
+                   mount with no osts fail */
+                while (!lov->tgts[i].ltd_exp) {
+                        i++;
+                        if (i >= lov->desc.ld_tgt_count) 
+                                RETURN(-ENOTBLK);
+                }
+                rc = size_round(sizeof(*lap)) +
+                        obd_prep_async_page(lov->tgts[i].ltd_exp, NULL, NULL,
+                                            NULL, 0, NULL, NULL, NULL);
+                RETURN(rc);
+        }
         ASSERT_LSM_MAGIC(lsm);
         LASSERT(loi == NULL);
 
@@ -2142,7 +2206,8 @@ static int lov_get_info(struct obd_export *exp, __u32 keylen,
                                 GOTO(out, rc);
                 }
                 GOTO(out, rc = 0);
-        } else if (keylen >= strlen("lovdesc") && strcmp(key, "lovdesc") == 0) {
+        } else if (keylen >= strlen(KEY_LOVDESC) && 
+                   strcmp(key, KEY_LOVDESC) == 0) {
                 struct lov_desc *desc_ret = val;
                 *desc_ret = lov->desc;
 
@@ -2171,15 +2236,15 @@ static int lov_set_info_async(struct obd_export *exp, obd_count keylen,
                         RETURN(-ENOMEM);
         }
 
-        if (KEY_IS("next_id")) {
-                if (vallen != lov->desc.ld_tgt_count)
+        if (KEY_IS(KEY_NEXT_ID)) {
+                if (vallen > lov->desc.ld_tgt_count)
                         RETURN(-EINVAL);
                 vallen = sizeof(obd_id);
         }
 
         lov_getref(obddev);
 
-        if (KEY_IS("next_id") || KEY_IS("checksum")) {
+        if (KEY_IS(KEY_NEXT_ID) || KEY_IS("checksum")) {
                 for (i = 0; i < lov->desc.ld_tgt_count; i++) {
                         /* OST was disconnected */
                         if (!lov->tgts[i].ltd_exp)
@@ -2209,7 +2274,7 @@ static int lov_set_info_async(struct obd_export *exp, obd_count keylen,
                 GOTO(out, rc);
         }
 
-        if (KEY_IS("mds_conn") || KEY_IS("unlinked")) {
+        if (KEY_IS(KEY_MDS_CONN) || KEY_IS("unlinked")) {
                 if (vallen != 0)
                         GOTO(out, rc = -EINVAL);
         } else {
index fee6b69..6f88917 100644 (file)
@@ -35,16 +35,16 @@ int fsfilt_register_ops(struct fsfilt_operations *fs_ops)
         if ((found = fsfilt_search_type(fs_ops->fs_type))) {
                 if (found != fs_ops) {
                         CERROR("different operations for type %s\n",
-                              fs_ops->fs_type);
+                               fs_ops->fs_type);
                         /* unlock fsfilt_types list */
                         RETURN(-EEXIST);
                 }
         } else {
                 PORTAL_MODULE_USE;
-               list_add(&fs_ops->fs_list, &fsfilt_types);
-       }
+                list_add(&fs_ops->fs_list, &fsfilt_types);
+        }
 
-       /* unlock fsfilt_types list */
+        /* unlock fsfilt_types list */
         return 0;
 }
 
@@ -54,7 +54,7 @@ void fsfilt_unregister_ops(struct fsfilt_operations *fs_ops)
 
         /* lock fsfilt_types list */
         list_for_each(p, &fsfilt_types) {
-               struct fsfilt_operations *found;
+                struct fsfilt_operations *found;
 
                 found = list_entry(p, typeof(*found), fs_list);
                 if (found == fs_ops) {
@@ -86,9 +86,9 @@ struct fsfilt_operations *fsfilt_get_ops(const char *type)
                 }
 
                 if (rc) {
-                        CERROR("Can't find fsfilt_%s interface\n", name);
-                        RETURN(ERR_PTR(rc));
-                       /* unlock fsfilt_types list */
+                        CERROR("Can't find %s interface\n", name);
+                        RETURN(ERR_PTR(rc < 0 ? rc : -rc));
+                        /* unlock fsfilt_types list */
                 }
         }
         try_module_get(fs_ops->fs_owner);
index 34da8d1..533f0d3 100644 (file)
@@ -73,11 +73,44 @@ struct fsfilt_cb_data {
 #define EXT3_XATTR_INDEX_TRUSTED        4
 #endif
 
-static char *fsfilt_ext3_label(struct super_block *sb)
+static char *fsfilt_ext3_get_label(struct super_block *sb)
 {
         return EXT3_SB(sb)->s_es->s_volume_name;
 }
 
+static int fsfilt_ext3_set_label(struct super_block *sb, char *label)
+{
+        /* see e.g. fsfilt_ext3_write_record() */
+        journal_t *journal;
+        handle_t *handle;
+        int err;
+
+        journal = EXT3_SB(sb)->s_journal;
+        lock_24kernel();
+        handle = journal_start(journal, 1);
+        unlock_24kernel();
+        if (IS_ERR(handle)) {
+                CERROR("can't start transaction\n");
+                return(PTR_ERR(handle));
+        }
+
+        err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
+        if (err)
+                goto out;
+
+        memcpy(EXT3_SB(sb)->s_es->s_volume_name, label,
+               sizeof(EXT3_SB(sb)->s_es->s_volume_name));
+
+        err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
+
+out:
+        lock_24kernel();
+        journal_stop(handle);
+        unlock_24kernel();
+
+        return(err);
+}
+
 static char *fsfilt_ext3_uuid(struct super_block *sb)
 {
         return EXT3_SB(sb)->s_es->s_uuid;
@@ -693,7 +726,7 @@ static int fsfilt_ext3_sync(struct super_block *sb)
 #undef EXT3_MULTIBLOCK_ALLOCATOR
 #endif
 #ifndef EXT3_EXTENTS_FL
-#define EXT3_EXTENTS_FL                        0x00080000 /* Inode uses extents */
+#define EXT3_EXTENTS_FL                 0x00080000 /* Inode uses extents */
 #endif
 
 #ifdef EXT3_MULTIBLOCK_ALLOCATOR
@@ -1924,7 +1957,8 @@ static int fsfilt_ext3_dquot(struct lustre_dquot *dquot, int cmd)
 static struct fsfilt_operations fsfilt_ext3_ops = {
         .fs_type                = "ext3",
         .fs_owner               = THIS_MODULE,
-        .fs_label               = fsfilt_ext3_label,
+        .fs_getlabel            = fsfilt_ext3_get_label,
+        .fs_setlabel            = fsfilt_ext3_set_label,
         .fs_uuid                = fsfilt_ext3_uuid,
         .fs_start               = fsfilt_ext3_start,
         .fs_brw_start           = fsfilt_ext3_brw_start,
index 188f8be..969f1c3 100644 (file)
@@ -294,8 +294,9 @@ struct dentry *simple_mkdir(struct dentry *dir, char *name, int mode, int fix)
 
                 /* Fixup directory permissions if necessary */
                 if (fix && (old_mode & S_IALLUGO) != (mode & S_IALLUGO)) {
-                        CWARN("fixing permissions on %s from %o to %o\n",
-                              name, old_mode, mode);
+                        CDEBUG(D_CONFIG, 
+                               "fixing permissions on %s from %o to %o\n",
+                               name, old_mode, mode);
                         dchild->d_inode->i_mode = (mode & S_IALLUGO) |
                                                   (old_mode & ~S_IALLUGO);
                         mark_inode_dirty(dchild->d_inode);
index 580cebc..dc9fc62 100644 (file)
@@ -838,16 +838,16 @@ int mdc_set_info_async(struct obd_export *exp, obd_count keylen,
         struct obd_import *imp = class_exp2cliimp(exp);
         int rc = -EINVAL;
 
-        if (KEY_IS("initial_recov")) {
+        if (KEY_IS(KEY_INIT_RECOV)) {
                 if (vallen != sizeof(int))
                         RETURN(-EINVAL);
                 imp->imp_initial_recov = *(int *)val;
-                CDEBUG(D_HA, "%s: set imp_no_init_recov = %d\n",
+                CDEBUG(D_HA, "%s: set imp_initial_recov = %d\n",
                        exp->exp_obd->obd_name, imp->imp_initial_recov);
                 RETURN(0);
         }
         /* Turn off initial_recov after we try all backup servers once */
-        if (KEY_IS("init_recov_bk")) {
+        if (KEY_IS(KEY_INIT_RECOV_BACKUP)) {
                 if (vallen != sizeof(int))
                         RETURN(-EINVAL);
                 imp->imp_initial_recov_bk = *(int *)val;
@@ -1074,7 +1074,7 @@ static int mdc_import_event(struct obd_device *obd, struct obd_import *imp,
                 break;
         }
         case IMP_EVENT_INACTIVE: {
-                rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE);
+                rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE, NULL);
                 break;
         }
         case IMP_EVENT_INVALIDATE: {
@@ -1085,7 +1085,7 @@ static int mdc_import_event(struct obd_device *obd, struct obd_import *imp,
                 break;
         }
         case IMP_EVENT_ACTIVE: {
-                rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE);
+                rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE, NULL);
                 break;
         }
         case IMP_EVENT_OCD:
@@ -1154,7 +1154,7 @@ int mdc_init_ea_size(struct obd_export *mdc_exp, struct obd_export *lov_exp)
         int rc, size;
         ENTRY;
 
-        rc = obd_get_info(lov_exp, strlen("lovdesc") + 1, "lovdesc",
+        rc = obd_get_info(lov_exp, strlen(KEY_LOVDESC) + 1, KEY_LOVDESC,
                           &valsize, &desc);
         if (rc)
                 RETURN(rc);
@@ -1187,13 +1187,17 @@ static int mdc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
         int rc = 0;
         ENTRY;
 
-        if (stage < OBD_CLEANUP_SELF_EXP)
-                RETURN(0);
-
-        rc = obd_llog_finish(obd, 0);
-        if (rc != 0)
-                CERROR("failed to cleanup llogging subsystems\n");
-
+        switch (stage) {
+        case OBD_CLEANUP_EARLY: 
+        case OBD_CLEANUP_EXPORTS:
+                break;
+        case OBD_CLEANUP_SELF_EXP:
+                rc = obd_llog_finish(obd, 0);
+                if (rc != 0)
+                        CERROR("failed to cleanup llogging subsystems\n");
+        case OBD_CLEANUP_OBD:
+                break;
+        }
         RETURN(rc);
 }
 
index 71d31f8..9a1f1fe 100644 (file)
 #endif
 #define DEBUG_SUBSYSTEM S_MDS
 
-#include <linux/module.h>
 #include <lustre_mds.h>
-#include <lustre_dlm.h>
+#include <linux/module.h>
 #include <linux/init.h>
-#include <obd_class.h>
 #include <linux/random.h>
 #include <linux/fs.h>
 #include <linux/jbd.h>
 #else
 # include <linux/locks.h>
 #endif
+
+#include <obd_class.h>
+#include <lustre_dlm.h>
 #include <obd_lov.h>
-#include <lustre_mds.h>
 #include <lustre_fsfilt.h>
 #include <lprocfs_status.h>
 #include <lustre_commit_confd.h>
 #include <lustre_quota.h>
+#include <lustre_disk.h>
 #include <lustre_ver.h>
 
 #include "mds_internal.h"
@@ -995,7 +996,6 @@ out_ucred:
         return rc;
 }
 
-
 static int mds_obd_statfs(struct obd_device *obd, struct obd_statfs *osfs,
                           unsigned long max_age)
 {
@@ -1228,7 +1228,7 @@ static char *reint_names[] = {
         [REINT_OPEN]    "open",
 };
 
-static int mds_set_info(struct obd_export *exp, struct ptlrpc_request *req)
+static int mds_set_info_rpc(struct obd_export *exp, struct ptlrpc_request *req)
 {
         char *key;
         __u32 *val;
@@ -1585,7 +1585,7 @@ int mds_handle(struct ptlrpc_request *req)
 
         case MDS_SET_INFO:
                 DEBUG_REQ(D_INODE, req, "set_info");
-                rc = mds_set_info(req->rq_export, req);
+                rc = mds_set_info_rpc(req->rq_export, req);
                 break;
 
         case MDS_QUOTACHECK:
@@ -1708,22 +1708,44 @@ int mds_handle(struct ptlrpc_request *req)
 int mds_update_server_data(struct obd_device *obd, int force_sync)
 {
         struct mds_obd *mds = &obd->u.mds;
-        struct mds_server_data *msd = mds->mds_server_data;
+        struct lr_server_data *lsd = mds->mds_server_data;
+        struct lr_server_data *lsd_copy = NULL;
         struct file *filp = mds->mds_rcvd_filp;
         struct lvfs_run_ctxt saved;
         loff_t off = 0;
         int rc;
         ENTRY;
 
-        push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
-        msd->msd_last_transno = cpu_to_le64(mds->mds_last_transno);
-
         CDEBUG(D_SUPER, "MDS mount_count is "LPU64", last_transno is "LPU64"\n",
                mds->mds_mount_count, mds->mds_last_transno);
-        rc = fsfilt_write_record(obd, filp, msd, sizeof(*msd), &off,force_sync);
+
+        lsd->lsd_last_transno = cpu_to_le64(mds->mds_last_transno);
+
+        if (!(lsd->lsd_feature_incompat & cpu_to_le32(OBD_INCOMPAT_COMMON_LR))){
+                /* Swap to the old mds_server_data format, in case
+                   someone wants to revert to a pre-1.6 lustre */
+                CDEBUG(D_CONFIG, "writing old last_rcvd format\n");
+                /* malloc new struct instead of swap in-place because 
+                   we don't have a lock on the last_trasno or mount count -
+                   someone may modify it while we're here, and we don't want
+                   them to inc the wrong thing. */
+                OBD_ALLOC(lsd_copy, sizeof(*lsd_copy));
+                if (!lsd_copy) 
+                        RETURN(-ENOMEM);
+                *lsd_copy = *lsd;
+                lsd_copy->lsd_unused = lsd->lsd_last_transno;
+                lsd_copy->lsd_last_transno = lsd->lsd_mount_count;
+                lsd = lsd_copy;
+        }
+
+        push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+        rc = fsfilt_write_record(obd, filp, lsd, sizeof(*lsd), &off,force_sync);
+        pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
         if (rc)
                 CERROR("error writing MDS server data: rc = %d\n", rc);
-        pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+
+        if (lsd_copy) 
+                OBD_FREE(lsd_copy, sizeof(*lsd_copy));
 
         RETURN(rc);
 }
@@ -1768,6 +1790,7 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf)
         struct lprocfs_static_vars lvars;
         struct lustre_cfg* lcfg = buf;
         struct mds_obd *mds = &obd->u.mds;
+        struct lustre_mount_info *lmi;
         struct vfsmount *mnt;
         struct obd_uuid uuid;
         __u8 *uuid_ptr;
@@ -1777,6 +1800,8 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf)
         int rc = 0;
         ENTRY;
 
+        /* setup 1:/dev/loop/0 2:ext3 3:mdsA 4:errors=remount-ro,iopen_nopriv */
+
         CLASSERT(offsetof(struct obd_device, u.obt) ==
                  offsetof(struct obd_device, u.mds.mds_obt));
 
@@ -1786,37 +1811,50 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf)
         if (LUSTRE_CFG_BUFLEN(lcfg, 1) == 0 || LUSTRE_CFG_BUFLEN(lcfg, 2) == 0)
                 RETURN(rc = -EINVAL);
 
-        obd->obd_fsops = fsfilt_get_ops(lustre_cfg_string(lcfg, 2));
-        if (IS_ERR(obd->obd_fsops))
-                RETURN(rc = PTR_ERR(obd->obd_fsops));
-
-        page = __get_free_page(GFP_KERNEL);
-        if (!page)
-                RETURN(-ENOMEM);
-
-        options = (char *)page;
-        memset(options, 0, PAGE_SIZE);
-
-        /* here we use "iopen_nopriv" hardcoded, because it affects MDS utility
-         * and the rest of options are passed by mount options. Probably this
-         * should be moved to somewhere else like startup scripts or lconf. */
-        strcpy(options, "iopen_nopriv");
-
-        if (LUSTRE_CFG_BUFLEN(lcfg, 4) > 0 && lustre_cfg_buf(lcfg, 4)) {
-                sprintf(options + strlen(options), ",%s",
-                        lustre_cfg_string(lcfg, 4));
-                fsoptions_to_mds_flags(mds, options);
-        }
+        lmi = server_get_mount(obd->obd_name);
+        if (lmi) {
+                /* We already mounted in lustre_fill_super.
+                   lcfg bufs 1, 2, 4 (device, fstype, mount opts) are ignored.*/
+                struct lustre_sb_info *lsi = s2lsi(lmi->lmi_sb);
+                mnt = lmi->lmi_mnt;
+                obd->obd_fsops = fsfilt_get_ops(MT_STR(lsi->lsi_ldd));
+        } else {
+                /* old path - used by lctl */
+                CERROR("Using old MDS mount method\n");
+                page = __get_free_page(GFP_KERNEL);
+                if (!page)
+                        RETURN(-ENOMEM);
+
+                options = (char *)page;
+                memset(options, 0, PAGE_SIZE);
+
+                /* here we use "iopen_nopriv" hardcoded, because it affects
+                 * MDS utility and the rest of options are passed by mount
+                 * options. Probably this should be moved to somewhere else
+                 * like startup scripts or lconf. */
+                strcpy(options, "iopen_nopriv");
+
+                if (LUSTRE_CFG_BUFLEN(lcfg, 4) > 0 && lustre_cfg_buf(lcfg, 4)) {
+                        sprintf(options + strlen(options), ",%s",
+                                lustre_cfg_string(lcfg, 4));
+                        fsoptions_to_mds_flags(mds, options);
+                }
+                
+                mnt = do_kern_mount(lustre_cfg_string(lcfg, 2), 0,
+                                    lustre_cfg_string(lcfg, 1), 
+                                    (void *)options);
+                free_page(page);
+                if (IS_ERR(mnt)) {
+                        rc = PTR_ERR(mnt);
+                        LCONSOLE_ERROR("Can't mount disk %s (%d)\n",
+                                       lustre_cfg_string(lcfg, 1), rc);
+                        RETURN(rc);
+                }
 
-        mnt = do_kern_mount(lustre_cfg_string(lcfg, 2), 0,
-                            lustre_cfg_string(lcfg, 1), (void *)options);
-        free_page(page);
-        if (IS_ERR(mnt)) {
-                rc = PTR_ERR(mnt);
-                LCONSOLE_ERROR("Can't mount disk %s (%d)\n",
-                               lustre_cfg_string(lcfg, 1), rc);
-                GOTO(err_ops, rc);
+                obd->obd_fsops = fsfilt_get_ops(lustre_cfg_string(lcfg, 2));
         }
+        if (IS_ERR(obd->obd_fsops))
+                GOTO(err_put, rc = PTR_ERR(obd->obd_fsops));
 
         CDEBUG(D_SUPER, "%s: mnt = %p\n", lustre_cfg_string(lcfg, 1), mnt);
 
@@ -1833,7 +1871,7 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf)
         obd->obd_namespace = ldlm_namespace_new(ns_name, LDLM_NAMESPACE_SERVER);
         if (obd->obd_namespace == NULL) {
                 mds_cleanup(obd);
-                GOTO(err_put, rc = -ENOMEM);
+                GOTO(err_ops, rc = -ENOMEM);
         }
         ldlm_register_intent(obd->obd_namespace, mds_intent_policy);
 
@@ -1880,9 +1918,9 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf)
         /* Don't wait for mds_postrecov trying to clear orphans */
         obd->obd_async_recov = 1;
         rc = mds_postsetup(obd);
+        obd->obd_async_recov = 0;
         if (rc)
                 GOTO(err_qctxt, rc);
-        obd->obd_async_recov = 0;
 
         lprocfs_init_vars(mds, &lvars);
         lprocfs_obd_setup(obd, lvars.obd_vars);
@@ -1895,7 +1933,7 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf)
                 str = "no UUID";
         }
 
-        label = fsfilt_label(obd, obd->u.obt.obt_sb);
+        label = fsfilt_get_label(obd, obd->u.obt.obt_sb);
         if (obd->obd_recovering) {
                 LCONSOLE_WARN("MDT %s now serving %s (%s%s%s), but will be in "
                               "recovery until %d %s reconnect, or if no clients"
@@ -1932,13 +1970,18 @@ err_fs:
 err_ns:
         ldlm_namespace_free(obd->obd_namespace, 0);
         obd->obd_namespace = NULL;
-err_put:
-        unlock_kernel();
-        mntput(mds->mds_vfsmnt);
-        obd->u.obt.obt_sb = NULL;
-        lock_kernel();
 err_ops:
         fsfilt_put_ops(obd->obd_fsops);
+err_put:
+        if (lmi) {
+                server_put_mount(obd->obd_name, mds->mds_vfsmnt);
+        } else {
+                /* old method */
+                unlock_kernel();
+                mntput(mds->mds_vfsmnt);
+                lock_kernel();
+        }               
+        obd->u.obt.obt_sb = NULL;
         return rc;
 }
 
@@ -1957,7 +2000,6 @@ static int mds_lov_clean(struct obd_device *obd)
         /* There better be a lov */
         if (!osc)
                 RETURN(0);
-        
         if (IS_ERR(osc))
                 RETURN(PTR_ERR(osc));
 
@@ -1992,33 +2034,12 @@ static int mds_postsetup(struct obd_device *obd)
                 RETURN(rc);
 
         if (mds->mds_profile) {
-                struct lvfs_run_ctxt saved;
                 struct lustre_profile *lprof;
-                struct config_llog_instance cfg;
-
-                cfg.cfg_instance = NULL;
-                cfg.cfg_uuid = mds->mds_lov_uuid;
-                push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
-                rc = class_config_parse_llog(llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT),
-                                             mds->mds_profile, &cfg);
-                pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
-                switch (rc) {
-                case 0:
-                        break;
-                case -EINVAL:
-                        LCONSOLE_ERROR("%s: the profile %s could not be read. "
-                                       "If you recently installed a new "
-                                       "version of Lustre, you may need to "
-                                       "re-run 'lconf --write_conf "
-                                       "<yourconfig>.xml' command line before "
-                                       "restarting the MDS.\n",
-                                       obd->obd_name, mds->mds_profile);
-                        /* fall through */
-                default:
-                        GOTO(err_llog, rc);
-                        break;
-                }
-
+                /* The profile defines which osc and mdc to connect to, for a 
+                   client.  We reuse that here to figure out the name of the
+                   lov to use (and ignore lprof->lp_mdc).
+                   The profile was set in the config log with 
+                   LCFG_MOUNTOPT profilenm oscnm mdcnm */
                 lprof = class_get_profile(mds->mds_profile);
                 if (lprof == NULL) {
                         CERROR("No profile found: %s\n", mds->mds_profile);
@@ -2033,7 +2054,6 @@ static int mds_postsetup(struct obd_device *obd)
 
 err_cleanup:
         mds_lov_clean(obd);
-err_llog:
         llog_cleanup(llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT));
         llog_cleanup(llog_get_context(obd, LLOG_LOVEA_ORIG_CTXT));
         RETURN(rc);
@@ -2050,11 +2070,12 @@ int mds_postrecov(struct obd_device *obd)
         LASSERT(!obd->obd_recovering);
         LASSERT(llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT) != NULL);
 
+        /* FIXME why not put this in the synchronize? */
         /* set nextid first, so we are sure it happens */
         rc = mds_lov_set_nextid(obd);
         if (rc) {
-                CERROR("%s: mds_lov_set_nextid failed\n",
-                       obd->obd_name);
+                CERROR("%s: mds_lov_set_nextid failed %d\n",
+                       obd->obd_name, rc);
                 GOTO(out, rc);
         }
 
@@ -2063,8 +2084,13 @@ int mds_postrecov(struct obd_device *obd)
         if (rc < 0)
                 GOTO(out, rc);
 
-        /* Does anyone need this to be synchronous ever? */
-        mds_lov_start_synchronize(obd, NULL, obd->obd_async_recov);
+        /* FIXME Does target_finish_recovery really need this to block? */
+        /* Notify the LOV, which will in turn call mds_notify for each tgt */
+        /* This means that we have to hack obd_notify to think we're obd_set_up
+           during mds_lov_connect. */
+        obd_notify(obd->u.mds.mds_osc_obd, NULL, 
+                   obd->obd_async_recov ? OBD_NOTIFY_SYNC_NONBLOCK :
+                   OBD_NOTIFY_SYNC, NULL);
 
         /* quota recovery */
         lquota_recovery(quota_interface, obd);
@@ -2115,6 +2141,7 @@ static int mds_cleanup(struct obd_device *obd)
 {
         struct mds_obd *mds = &obd->u.mds;
         lvfs_sbdev_type save_dev;
+        int must_put = 0;
         int must_relock = 0;
         ENTRY;
 
@@ -2132,20 +2159,15 @@ static int mds_cleanup(struct obd_device *obd)
         lquota_cleanup(quota_interface, obd);
 
         mds_update_server_data(obd, 1);
-        if (mds->mds_lov_objids != NULL) {
-                OBD_FREE(mds->mds_lov_objids,
-                         mds->mds_lov_desc.ld_tgt_count * sizeof(obd_id));
-        }
+        if (mds->mds_lov_objids != NULL) 
+                OBD_FREE(mds->mds_lov_objids, mds->mds_lov_objids_size);
         mds_fs_cleanup(obd);
 
         upcall_cache_cleanup(mds->mds_group_hash);
         mds->mds_group_hash = NULL;
 
-        /* 2 seems normal on mds, (may_umount() also expects 2
-          fwiw), but we only see 1 at this point in obdfilter. */
-        if (atomic_read(&obd->u.mds.mds_vfsmnt->mnt_count) > 2)
-                CERROR("%s: mount busy, mnt_count %d != 2\n", obd->obd_name,
-                       atomic_read(&obd->u.mds.mds_vfsmnt->mnt_count));
+        must_put = server_put_mount(obd->obd_name, mds->mds_vfsmnt);
+        /* must_put is for old method (l_p_m returns non-0 on err) */
 
         /* We can only unlock kernel if we are in the context of sys_ioctl,
            otherwise we never called lock_kernel */
@@ -2153,8 +2175,10 @@ static int mds_cleanup(struct obd_device *obd)
                 unlock_kernel();
                 must_relock++;
         }
-
-        mntput(mds->mds_vfsmnt);
+        
+        if (must_put) 
+                /* In case we didn't mount with lustre_get_mount -- old method*/
+                mntput(mds->mds_vfsmnt);
         obd->u.obt.obt_sb = NULL;
 
         ldlm_namespace_free(obd->obd_namespace, obd->obd_force);
index 8cd2dc9..2bae298 100644 (file)
 #include <obd_support.h>
 #include <lustre_lib.h>
 #include <lustre_fsfilt.h>
+#include <lustre_disk.h>
 #include <libcfs/list.h>
 
 #include "mds_internal.h"
 
-#define HEALTH_CHECK "health_check"
 
 /* Add client data to the MDS.  We use a bitmap to locate a free space
  * in the last_rcvd file if cl_off is -1 (i.e. a new client).
@@ -100,8 +100,8 @@ int mds_client_add(struct obd_device *obd, struct mds_obd *mds,
                cl_idx, med->med_mcd->mcd_uuid);
 
         med->med_lr_idx = cl_idx;
-        med->med_lr_off = le32_to_cpu(mds->mds_server_data->msd_client_start) +
-                (cl_idx * le16_to_cpu(mds->mds_server_data->msd_client_size));
+        med->med_lr_off = le32_to_cpu(mds->mds_server_data->lsd_client_start) +
+                (cl_idx * le16_to_cpu(mds->mds_server_data->lsd_client_size));
         LASSERTF(med->med_lr_off > 0, "med_lr_off = %llu\n", med->med_lr_off);
 
         if (new_client) {
@@ -209,7 +209,7 @@ static int mds_server_free_data(struct mds_obd *mds)
 static int mds_init_server_data(struct obd_device *obd, struct file *file)
 {
         struct mds_obd *mds = &obd->u.mds;
-        struct mds_server_data *msd;
+        struct lr_server_data *lsd;
         struct mds_client_data *mcd = NULL;
         loff_t off = 0;
         unsigned long last_rcvd_size = file->f_dentry->d_inode->i_size;
@@ -218,87 +218,104 @@ static int mds_init_server_data(struct obd_device *obd, struct file *file)
         ENTRY;
 
         /* ensure padding in the struct is the correct size */
-        LASSERT(offsetof(struct mds_server_data, msd_padding) +
-                sizeof(msd->msd_padding) == LR_SERVER_SIZE);
+        LASSERT(offsetof(struct lr_server_data, lsd_padding) +
+                sizeof(lsd->lsd_padding) == LR_SERVER_SIZE);
         LASSERT(offsetof(struct mds_client_data, mcd_padding) +
                 sizeof(mcd->mcd_padding) == LR_CLIENT_SIZE);
 
-        OBD_ALLOC_WAIT(msd, sizeof(*msd));
-        if (!msd)
+        OBD_ALLOC_WAIT(lsd, sizeof(*lsd));
+        if (!lsd)
                 RETURN(-ENOMEM);
 
         OBD_ALLOC_WAIT(mds->mds_client_bitmap, LR_MAX_CLIENTS / 8);
         if (!mds->mds_client_bitmap) {
-                OBD_FREE(msd, sizeof(*msd));
+                OBD_FREE(lsd, sizeof(*lsd));
                 RETURN(-ENOMEM);
         }
 
-        mds->mds_server_data = msd;
+        mds->mds_server_data = lsd;
 
         if (last_rcvd_size == 0) {
-                CWARN("%s: initializing new %s\n", obd->obd_name, LAST_RCVD);
-
-                memcpy(msd->msd_uuid, obd->obd_uuid.uuid,sizeof(msd->msd_uuid));
-                msd->msd_last_transno = 0;
-                mount_count = msd->msd_mount_count = 0;
-                msd->msd_server_size = cpu_to_le32(LR_SERVER_SIZE);
-                msd->msd_client_start = cpu_to_le32(LR_CLIENT_START);
-                msd->msd_client_size = cpu_to_le16(LR_CLIENT_SIZE);
-                msd->msd_feature_rocompat = cpu_to_le32(OBD_ROCOMPAT_LOVOBJID);
+                LCONSOLE_WARN("%s: new disk, initializing\n", obd->obd_name);
+
+                memcpy(lsd->lsd_uuid, obd->obd_uuid.uuid,sizeof(lsd->lsd_uuid));
+                lsd->lsd_last_transno = 0;
+                mount_count = lsd->lsd_mount_count = 0;
+                lsd->lsd_server_size = cpu_to_le32(LR_SERVER_SIZE);
+                lsd->lsd_client_start = cpu_to_le32(LR_CLIENT_START);
+                lsd->lsd_client_size = cpu_to_le16(LR_CLIENT_SIZE);
+                lsd->lsd_feature_rocompat = cpu_to_le32(OBD_ROCOMPAT_LOVOBJID);
+                lsd->lsd_feature_incompat = cpu_to_le32(OBD_INCOMPAT_MDT |
+                                                        OBD_INCOMPAT_COMMON_LR);
         } else {
-                rc = fsfilt_read_record(obd, file, msd, sizeof(*msd), &off);
+                rc = fsfilt_read_record(obd, file, lsd, sizeof(*lsd), &off);
                 if (rc) {
                         CERROR("error reading MDS %s: rc %d\n", LAST_RCVD, rc);
                         GOTO(err_msd, rc);
                 }
-                if (strcmp(msd->msd_uuid, obd->obd_uuid.uuid) != 0) {
+                if (strcmp(lsd->lsd_uuid, obd->obd_uuid.uuid) != 0) {
                         LCONSOLE_ERROR("Trying to start OBD %s using the wrong"
                                        " disk %s. Were the /dev/ assignments "
                                        "rearranged?\n",
-                                       obd->obd_uuid.uuid, msd->msd_uuid);
+                                       obd->obd_uuid.uuid, lsd->lsd_uuid);
                         GOTO(err_msd, rc = -EINVAL);
                 }
-                mount_count = le64_to_cpu(msd->msd_mount_count);
+                mount_count = le64_to_cpu(lsd->lsd_mount_count);
         }
-        if (msd->msd_feature_incompat & ~cpu_to_le32(MDT_INCOMPAT_SUPP)) {
+
+        if (lsd->lsd_feature_incompat & ~cpu_to_le32(MDT_INCOMPAT_SUPP)) {
                 CERROR("%s: unsupported incompat filesystem feature(s) %x\n",
-                       obd->obd_name, le32_to_cpu(msd->msd_feature_incompat) &
+                       obd->obd_name, le32_to_cpu(lsd->lsd_feature_incompat) &
                        ~MDT_INCOMPAT_SUPP);
                 GOTO(err_msd, rc = -EINVAL);
         }
-
-        if (msd->msd_feature_rocompat & ~cpu_to_le32(MDT_ROCOMPAT_SUPP)) {
+        if (lsd->lsd_feature_rocompat & ~cpu_to_le32(MDT_ROCOMPAT_SUPP)) {
                 CERROR("%s: unsupported read-only filesystem feature(s) %x\n",
-                       obd->obd_name, le32_to_cpu(msd->msd_feature_rocompat) &
+                       obd->obd_name, le32_to_cpu(lsd->lsd_feature_rocompat) &
                        ~MDT_ROCOMPAT_SUPP);
                 /* Do something like remount filesystem read-only */
                 GOTO(err_msd, rc = -EINVAL);
         }
+        if (!(lsd->lsd_feature_incompat & cpu_to_le32(OBD_INCOMPAT_COMMON_LR))){
+                CDEBUG(D_WARNING, "using old last_rcvd format\n");
+                lsd->lsd_mount_count = lsd->lsd_last_transno;
+                lsd->lsd_last_transno = lsd->lsd_unused;
+                /* If we update the last_rcvd, we can never go back to 
+                   an old install, so leave this in the old format for now.
+                lsd->lsd_feature_incompat |= cpu_to_le32(LR_INCOMPAT_COMMON_LR);
+                */
+        }
+        lsd->lsd_feature_compat = cpu_to_le32(OBD_COMPAT_MDT);
+        
+        mds->mds_last_transno = le64_to_cpu(lsd->lsd_last_transno);
 
-        mds->mds_last_transno = le64_to_cpu(msd->msd_last_transno);
-
-        msd->msd_feature_compat = cpu_to_le32(OBD_COMPAT_MDT);
         CDEBUG(D_INODE, "%s: server last_transno: "LPU64"\n",
                obd->obd_name, mds->mds_last_transno);
         CDEBUG(D_INODE, "%s: server mount_count: "LPU64"\n",
                obd->obd_name, mount_count + 1);
         CDEBUG(D_INODE, "%s: server data size: %u\n",
-               obd->obd_name, le32_to_cpu(msd->msd_server_size));
+               obd->obd_name, le32_to_cpu(lsd->lsd_server_size));
         CDEBUG(D_INODE, "%s: per-client data start: %u\n",
-               obd->obd_name, le32_to_cpu(msd->msd_client_start));
+               obd->obd_name, le32_to_cpu(lsd->lsd_client_start));
         CDEBUG(D_INODE, "%s: per-client data size: %u\n",
-               obd->obd_name, le32_to_cpu(msd->msd_client_size));
+               obd->obd_name, le32_to_cpu(lsd->lsd_client_size));
         CDEBUG(D_INODE, "%s: last_rcvd size: %lu\n",
                obd->obd_name, last_rcvd_size);
         CDEBUG(D_INODE, "%s: last_rcvd clients: %lu\n", obd->obd_name,
-               last_rcvd_size <= le32_to_cpu(msd->msd_client_start) ? 0 :
-               (last_rcvd_size - le32_to_cpu(msd->msd_client_start)) /
-                le16_to_cpu(msd->msd_client_size));
+               last_rcvd_size <= le32_to_cpu(lsd->lsd_client_start) ? 0 :
+               (last_rcvd_size - le32_to_cpu(lsd->lsd_client_start)) /
+                le16_to_cpu(lsd->lsd_client_size));
+
+        if (!lsd->lsd_server_size || !lsd->lsd_client_start ||
+            !lsd->lsd_client_size) {
+                CERROR("Bad last_rcvd contents!\n");
+                GOTO(err_msd, rc = -EINVAL);
+        }
 
         /* When we do a clean MDS shutdown, we save the last_transno into
          * the header.  If we find clients with higher last_transno values
          * then those clients may need recovery done. */
-        for (cl_idx = 0, off = le32_to_cpu(msd->msd_client_start);
+        for (cl_idx = 0, off = le32_to_cpu(lsd->lsd_client_start);
              off < last_rcvd_size; cl_idx++) {
                 __u64 last_transno;
                 struct obd_export *exp;
@@ -312,9 +329,9 @@ static int mds_init_server_data(struct obd_device *obd, struct file *file)
 
                 /* Don't assume off is incremented properly by
                  * fsfilt_read_record(), in case sizeof(*mcd)
-                 * isn't the same as msd->msd_client_size.  */
-                off = le32_to_cpu(msd->msd_client_start) +
-                        cl_idx * le16_to_cpu(msd->msd_client_size);
+                 * isn't the same as lsd->lsd_client_size.  */
+                off = le32_to_cpu(lsd->lsd_client_start) +
+                        cl_idx * le16_to_cpu(lsd->lsd_client_size);
                 rc = fsfilt_read_record(obd, file, mcd, sizeof(*mcd), &off);
                 if (rc) {
                         CERROR("error reading MDS %s idx %d, off %llu: rc %d\n",
@@ -335,7 +352,7 @@ static int mds_init_server_data(struct obd_device *obd, struct file *file)
                  */
                 CDEBUG(D_HA, "RCVRNG CLIENT uuid: %s idx: %d lr: "LPU64
                        " srv lr: "LPU64" lx: "LPU64"\n", mcd->mcd_uuid, cl_idx,
-                       last_transno, le64_to_cpu(msd->msd_last_transno),
+                       last_transno, le64_to_cpu(lsd->lsd_last_transno),
                        le64_to_cpu(mcd->mcd_last_xid));
 
                 exp = class_new_export(obd, (struct obd_uuid *)mcd->mcd_uuid);
@@ -380,7 +397,7 @@ static int mds_init_server_data(struct obd_device *obd, struct file *file)
         }
 
         mds->mds_mount_count = mount_count + 1;
-        msd->msd_mount_count = cpu_to_le64(mds->mds_mount_count);
+        lsd->lsd_mount_count = cpu_to_le64(mds->mds_mount_count);
 
         /* save it, so mount count and last_transno is current */
         rc = mds_update_server_data(obd, 1);
@@ -410,6 +427,7 @@ int mds_fs_setup(struct obd_device *obd, struct vfsmount *mnt)
                 RETURN(rc);
 
         mds->mds_vfsmnt = mnt;
+        /* why not mnt->mnt_sb instead of mnt->mnt_root->d_inode->i_sb? */
         obd->u.obt.obt_sb = mnt->mnt_root->d_inode->i_sb;
 
         fsfilt_setup(obd, obd->u.obt.obt_sb);
@@ -458,13 +476,16 @@ int mds_fs_setup(struct obd_device *obd, struct vfsmount *mnt)
         }
         mds->mds_pending_dir = dentry;
 
-        dentry = simple_mkdir(current->fs->pwd, "LOGS", 0777, 1);
+        /* COMPAT_146 */
+        dentry = simple_mkdir(current->fs->pwd, MDT_LOGS_DIR, 0777, 1);
         if (IS_ERR(dentry)) {
                 rc = PTR_ERR(dentry);
-                CERROR("cannot create LOGS directory: rc = %d\n", rc);
+                CERROR("cannot create %s directory: rc = %d\n",
+                       MDT_LOGS_DIR, rc);
                 GOTO(err_pending, rc);
         }
         mds->mds_logs_dir = dentry;
+        /* end COMPAT_146 */
 
         dentry = simple_mkdir(current->fs->pwd, "OBJECTS", 0777, 1);
         if (IS_ERR(dentry)) {
@@ -560,8 +581,8 @@ int mds_fs_cleanup(struct obd_device *obd)
         int rc = 0;
 
         if (obd->obd_fail)
-                CWARN("%s: shutting down for failover; client state will "
-                      "be preserved.\n", obd->obd_name);
+                LCONSOLE_WARN("%s: shutting down for failover; client state "
+                              "will be preserved.\n", obd->obd_name);
 
         class_disconnect_exports(obd); /* cleans up client info too */
         mds_server_free_data(mds);
index 78f6768..d90664a 100644 (file)
@@ -5,34 +5,11 @@
 #ifndef _MDS_INTERNAL_H
 #define _MDS_INTERNAL_H
 
-#include <linux/lustre_disk.h>
+#include <lustre_disk.h>
 #include <lustre_mds.h>
 
 #define MDT_ROCOMPAT_SUPP       (OBD_ROCOMPAT_LOVOBJID)
-
-#define MDT_INCOMPAT_SUPP       (OBD_INCOMPAT_MDT)
-
-/* Data stored per server at the head of the last_rcvd file.  In le32 order.
- * Try to keep this the same as fsd_server_data so we might one day merge. */
-struct mds_server_data {
-        __u8  msd_uuid[40];        /* server UUID */
-        __u64 msd_last_transno;    /* last completed transaction ID */
-        __u64 msd_mount_count;     /* MDS incarnation number */
-        __u64 msd_mount_count_new; /* future MDS incarnation number */
-        __u32 msd_feature_compat;  /* compatible feature flags */
-        __u32 msd_feature_rocompat;/* read-only compatible feature flags */
-        __u32 msd_feature_incompat;/* incompatible feature flags */
-        __u32 msd_server_size;     /* size of server data area */
-        __u32 msd_client_start;    /* start of per-client data area */
-        __u16 msd_client_size;     /* size of per-client data area */
-        __u16 msd_subdir_count;    /* number of subdirectories for objects */
-        __u64 msd_catalog_oid;     /* recovery catalog object id */
-        __u32 msd_catalog_ogen;    /* recovery catalog inode generation */
-        __u8  msd_peeruuid[40];    /* UUID of LOV/OSC associated with MDS */
-        __u32 msd_ost_index;       /* index number of OST in LOV */
-        __u32 msd_mds_index;       /* index number of MDS in LMV */
-        __u8  msd_padding[LR_SERVER_SIZE - 148];
-};
+#define MDT_INCOMPAT_SUPP       (OBD_INCOMPAT_MDT | OBD_INCOMPAT_COMMON_LR)
 
 /* Data stored per client in the last_rcvd file.  In le32 order. */
 struct mds_client_data {
@@ -210,11 +187,12 @@ int mds_lov_write_objids(struct obd_device *obd);
 void mds_lov_update_objids(struct obd_device *obd, obd_id *ids);
 int mds_lov_clear_orphans(struct mds_obd *mds, struct obd_uuid *ost_uuid);
 int mds_lov_set_nextid(struct obd_device *obd);
-int mds_lov_start_synchronize(struct obd_device *obd, struct obd_uuid *uuid,
-                              int nonblock);
+int mds_lov_start_synchronize(struct obd_device *obd, 
+                              struct obd_device *watched,
+                              void *data, int nonblock);
 int mds_post_mds_lovconf(struct obd_device *obd);
 int mds_notify(struct obd_device *obd, struct obd_device *watched,
-               enum obd_notify_event ev);
+               enum obd_notify_event ev, void *data);
 int mds_convert_lov_ea(struct obd_device *obd, struct inode *inode,
                        struct lov_mds_md *lmm, int lmm_size);
 void mds_objids_from_lmm(obd_id *ids, struct lov_mds_md *lmm,
index a76be7d..43a63e3 100644 (file)
@@ -83,7 +83,7 @@ static int mds_llog_repl_cancel(struct llog_ctxt *ctxt, struct lov_stripe_md *ls
         ENTRY;
 
         lctxt = llog_get_context(lov_obd, ctxt->loc_idx);
-        rc = llog_cancel(lctxt, lsm, count, cookies,flags);
+        rc = llog_cancel(lctxt, lsm, count, cookies, flags);
         RETURN(rc);
 }
 
index 4135e9b..0f95347 100644 (file)
@@ -50,8 +50,10 @@ void mds_lov_update_objids(struct obd_device *obd, obd_id *ids)
 
         lock_kernel();
         for (i = 0; i < mds->mds_lov_desc.ld_tgt_count; i++)
-                if (ids[i] > (mds->mds_lov_objids)[i])
+                if (ids[i] > (mds->mds_lov_objids)[i]) {
                         (mds->mds_lov_objids)[i] = ids[i];
+                        mds->mds_lov_objids_dirty = 1;
+                }
         unlock_kernel();
         EXIT;
 }
@@ -61,47 +63,67 @@ static int mds_lov_read_objids(struct obd_device *obd)
         struct mds_obd *mds = &obd->u.mds;
         obd_id *ids;
         loff_t off = 0;
-        int i, rc, size = mds->mds_lov_desc.ld_tgt_count * sizeof(*ids);
+        int i, rc, size;
         ENTRY;
 
-        if (mds->mds_lov_objids != NULL)
+        LASSERT(!mds->mds_lov_objids_size);
+        LASSERT(!mds->mds_lov_objids_dirty);
+
+        /* Read everything in the file, even if our current lov desc 
+           has fewer targets. Old targets not in the lov descriptor 
+           during mds setup may still have valid objids. */
+        size = mds->mds_lov_objid_filp->f_dentry->d_inode->i_size;
+        if (size == 0)
                 RETURN(0);
 
         OBD_ALLOC(ids, size);
         if (ids == NULL)
                 RETURN(-ENOMEM);
         mds->mds_lov_objids = ids;
+        mds->mds_lov_objids_size = size;
 
-        if (mds->mds_lov_objid_filp->f_dentry->d_inode->i_size == 0)
-                RETURN(0);
         rc = fsfilt_read_record(obd, mds->mds_lov_objid_filp, ids, size, &off);
         if (rc < 0) {
                 CERROR("Error reading objids %d\n", rc);
-        } else {
-                mds->mds_lov_objids_valid = 1;
-                rc = 0;
+                RETURN(rc);
         }
-
-        for (i = 0; i < mds->mds_lov_desc.ld_tgt_count; i++)
+                
+        mds->mds_lov_objids_in_file = size / sizeof(*ids); 
+        
+        for (i = 0; i < mds->mds_lov_objids_in_file; i++) {
                 CDEBUG(D_INFO, "read last object "LPU64" for idx %d\n",
                        mds->mds_lov_objids[i], i);
-
-        RETURN(rc);
+        }
+        RETURN(0);
 }
 
 int mds_lov_write_objids(struct obd_device *obd)
 {
         struct mds_obd *mds = &obd->u.mds;
         loff_t off = 0;
-        int i, rc, size = mds->mds_lov_desc.ld_tgt_count * sizeof(obd_id);
+        int i, rc, tgts; 
         ENTRY;
 
-        for (i = 0; i < mds->mds_lov_desc.ld_tgt_count; i++)
+        if (!mds->mds_lov_objids_dirty)
+                RETURN(0);
+
+        tgts = max(mds->mds_lov_desc.ld_tgt_count, mds->mds_lov_objids_in_file);
+
+        if (!tgts)
+                RETURN(0);
+
+        for (i = 0; i < tgts; i++)
                 CDEBUG(D_INFO, "writing last object "LPU64" for idx %d\n",
                        mds->mds_lov_objids[i], i);
 
         rc = fsfilt_write_record(obd, mds->mds_lov_objid_filp,
-                                 mds->mds_lov_objids, size, &off, 0);
+                                 mds->mds_lov_objids, tgts * sizeof(obd_id),
+                                 &off, 0);
+        if (rc >= 0) {
+                mds->mds_lov_objids_dirty = 0;
+                rc = 0;
+        }
+
         RETURN(rc);
 }
 
@@ -141,41 +163,146 @@ int mds_lov_set_nextid(struct obd_device *obd)
 
         LASSERT(mds->mds_lov_objids != NULL);
 
-        rc = obd_set_info_async(mds->mds_osc_exp, strlen("next_id"), "next_id",
+        rc = obd_set_info_async(mds->mds_osc_exp, strlen(KEY_NEXT_ID),
+                                KEY_NEXT_ID,
                                 mds->mds_lov_desc.ld_tgt_count,
                                 mds->mds_lov_objids, NULL);
+        
+        if (rc) 
+                CERROR ("%s: mds_lov_set_nextid failed (%d)\n", 
+                        obd->obd_name, rc);
+
         RETURN(rc);
 }
 
-int mds_init_lov_desc(struct obd_device *obd, struct obd_export *osc_exp)
+/* Update the lov desc for a new size lov. */
+static int mds_lov_update_desc(struct obd_device *obd, struct obd_export *lov)
 {
         struct mds_obd *mds = &obd->u.mds;
-        int valsize, rc, tgt_count;
-        __u32 stripes;
+        struct lov_desc *ld; 
+        __u32 size, stripes, valsize = sizeof(mds->mds_lov_desc);
+        int rc = 0;
         ENTRY;
 
-        mds->mds_has_lov_desc = 0;
-        valsize = sizeof(mds->mds_lov_desc);
-        rc = obd_get_info(mds->mds_osc_exp, strlen("lovdesc") + 1,
-                          "lovdesc", &valsize, &mds->mds_lov_desc);
-        if (rc) {
-                CERROR("can't get lov_desc, rc %d\n", rc);
-                RETURN(rc);
+        OBD_ALLOC(ld, sizeof(*ld));
+        if (!ld)
+                RETURN(-ENOMEM);
+
+        rc = obd_get_info(lov, strlen(KEY_LOVDESC) + 1, KEY_LOVDESC,
+                          &valsize, ld);
+        if (rc)
+                GOTO(out, rc);
+
+        /* The size of the LOV target table may have increased. */
+        size = ld->ld_tgt_count * sizeof(obd_id);
+        if ((mds->mds_lov_objids_size == 0) || 
+            (size > mds->mds_lov_objids_size)) {
+                obd_id *ids;
+                
+                /* add room by powers of 2 */
+                size = 1;
+                while (size < ld->ld_tgt_count) 
+                        size = size << 1;
+                size = size * sizeof(obd_id);
+
+                OBD_ALLOC(ids, size);
+                if (ids == NULL)
+                        GOTO(out, rc = -ENOMEM);
+                memset(ids, 0, size);
+                if (mds->mds_lov_objids_size) {
+                        obd_id *old_ids = mds->mds_lov_objids;
+                        memcpy(ids, mds->mds_lov_objids, 
+                               mds->mds_lov_objids_size);
+                        mds->mds_lov_objids = ids;
+                        OBD_FREE(old_ids, mds->mds_lov_objids_size);
+                }
+                mds->mds_lov_objids = ids;
+                mds->mds_lov_objids_size = size;
         }
 
-        mds->mds_has_lov_desc = 1;
-        tgt_count = mds->mds_lov_desc.ld_tgt_count;
-        stripes = min(tgt_count, LOV_MAX_STRIPE_COUNT);
+        /* Don't change the mds_lov_desc until the objids size matches the
+           count (paranoia) */
+        mds->mds_lov_desc = *ld;
+        CDEBUG(D_CONFIG, "updated lov_desc, tgt_count: %d\n",
+               mds->mds_lov_desc.ld_tgt_count);
 
+        stripes = min((__u32)LOV_MAX_STRIPE_COUNT, 
+                      max(mds->mds_lov_desc.ld_tgt_count,
+                          mds->mds_lov_objids_in_file));
         mds->mds_max_mdsize = lov_mds_md_size(stripes);
         mds->mds_max_cookiesize = stripes * sizeof(struct llog_cookie);
+        CDEBUG(D_CONFIG, "updated max_mdsize/max_cookiesize: %d/%d\n",
+               mds->mds_max_mdsize, mds->mds_max_cookiesize);
+
+out:
+        OBD_FREE(ld, sizeof(*ld));
+        RETURN(rc);
+}
 
-        CDEBUG(D_HA, "updated lov_desc, tgt_count: %d\n", tgt_count);
 
-        CDEBUG(D_HA, "updating max_mdsize/max_cookiesize: %d/%d\n",
-               mds->mds_max_mdsize, mds->mds_max_cookiesize);
+#define MDSLOV_NO_INDEX -1
 
-        RETURN(0);
+/* Inform MDS about new/updated target */
+static int mds_lov_update_mds(struct obd_device *obd,   
+                              struct obd_device *watched, 
+                              __u32 idx)
+{
+        struct mds_obd *mds = &obd->u.mds;
+        int old_count;
+        int rc = 0;
+        ENTRY;
+
+        old_count = mds->mds_lov_desc.ld_tgt_count;
+        rc = mds_lov_update_desc(obd, mds->mds_osc_exp);
+        if (rc)
+                RETURN(rc);
+
+        CDEBUG(D_CONFIG, "idx=%d, recov=%d/%d, cnt=%d/%d\n",
+               idx, obd->obd_recovering, obd->obd_async_recov, old_count, 
+               mds->mds_lov_desc.ld_tgt_count);
+
+        /* idx is set as data from lov_notify. */
+        if (idx != MDSLOV_NO_INDEX && !obd->obd_recovering) {
+                if (idx >= mds->mds_lov_desc.ld_tgt_count) {
+                        CERROR("index %d > count %d!\n", idx, 
+                               mds->mds_lov_desc.ld_tgt_count);
+                        RETURN(-EINVAL);
+                }
+                
+                if (idx >= mds->mds_lov_objids_in_file) {
+                        /* We never read this lastid; ask the osc */
+                        obd_id lastid;
+                        __u32 size = sizeof(lastid);
+                        rc = obd_get_info(watched->obd_self_export,
+                                          strlen("last_id"), 
+                                          "last_id", &size, &lastid);
+                        if (rc)
+                                RETURN(rc);
+                        mds->mds_lov_objids[idx] = lastid;
+                        mds->mds_lov_objids_dirty = 1;
+                        mds_lov_write_objids(obd);
+                } else {
+                        /* We have read this lastid from disk; tell the osc.
+                           Don't call this during recovery. */ 
+                        rc = mds_lov_set_nextid(obd);
+                }
+        
+                CDEBUG(D_CONFIG, "last object "LPU64" from OST %d\n",
+                      mds->mds_lov_objids[idx], idx);
+        }
+
+        /* If we added a target we have to reconnect the llogs */
+        /* Only do this at first add (idx), or the first time after recovery */
+        if (idx != MDSLOV_NO_INDEX || 1/*FIXME*/) {
+                CDEBUG(D_CONFIG, "reset llogs idx=%d\n", idx);
+                /* These two must be atomic */
+                down(&mds->mds_orphan_recovery_sem);
+                obd_llog_finish(obd, old_count);
+                llog_cat_initialize(obd, mds->mds_lov_desc.ld_tgt_count);
+                up(&mds->mds_orphan_recovery_sem);
+        }
+
+        RETURN(rc);
 }
 
 /* update the LOV-OSC knowledge of the last used object id's */
@@ -223,17 +350,17 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name)
                 GOTO(err_discon, rc);
         }
 
-        /* init lov_desc + easize */
-        rc = mds_init_lov_desc(obd, mds->mds_osc_exp);
-        if (rc)
-                GOTO(err_reg, rc);
-
         rc = mds_lov_read_objids(obd);
         if (rc) {
                 CERROR("cannot read %s: rc = %d\n", "lov_objids", rc);
                 GOTO(err_reg, rc);
         }
 
+        rc = mds_lov_update_desc(obd, mds->mds_osc_exp);
+        if (rc)
+                GOTO(err_reg, rc);
+
+        /* tgt_count may be 0! */
         rc = llog_cat_initialize(obd, mds->mds_lov_desc.ld_tgt_count);
         if (rc) {
                 CERROR("failed to initialize catalog %d\n", rc);
@@ -242,7 +369,7 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name)
 
         /* If we're mounting this code for the first time on an existing FS,
          * we need to populate the objids array from the real OST values */
-        if (!mds->mds_lov_objids_valid) {
+        if (mds->mds_lov_desc.ld_tgt_count > mds->mds_lov_objids_in_file) {
                 int size = sizeof(obd_id) * mds->mds_lov_desc.ld_tgt_count;
                 rc = obd_get_info(mds->mds_osc_exp, strlen("last_id"),
                                   "last_id", &size, mds->mds_lov_objids);
@@ -250,7 +377,7 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name)
                         for (i = 0; i < mds->mds_lov_desc.ld_tgt_count; i++)
                                 CWARN("got last object "LPU64" from OST %d\n",
                                       mds->mds_lov_objids[i], i);
-                        mds->mds_lov_objids_valid = 1;
+                        mds->mds_lov_objids_dirty = 1;
                         rc = mds_lov_write_objids(obd);
                         if (rc)
                                 CERROR("got last objids from OSTs, but error "
@@ -461,8 +588,9 @@ int mds_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
                 rc = llog_ioctl(ctxt, cmd, data);
                 pop_ctxt(&saved, &ctxt->loc_exp->exp_obd->obd_lvfs_ctxt, NULL);
                 llog_cat_initialize(obd, mds->mds_lov_desc.ld_tgt_count);
-                rc2 = obd_set_info_async(mds->mds_osc_exp, strlen("mds_conn"),
-                                         "mds_conn", 0, NULL, NULL);
+                rc2 = obd_set_info_async(mds->mds_osc_exp,
+                                         strlen(KEY_MDS_CONN), KEY_MDS_CONN,
+                                         0, NULL, NULL);
                 if (!rc)
                         rc = rc2;
                 RETURN(rc);
@@ -493,33 +621,47 @@ int mds_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
 }
 
 struct mds_lov_sync_info {
-        struct obd_device *mlsi_obd; /* the lov device to sync */
-        struct obd_uuid   *mlsi_uuid;  /* target to sync */
+        struct obd_device *mlsi_obd;     /* the lov device to sync */
+        struct obd_device *mlsi_watched; /* target osc */
+        __u32              mlsi_index;   /* index of target */
 };
 
-static int __mds_lov_syncronize(void *data)
+/* We only sync one osc at a time, so that we don't have to hold
+   any kind of lock on the whole mds_lov_desc, which may change 
+   (grow) as a result of mds_lov_add_ost.  This also avoids any
+   kind of mismatch between the lov_desc and the mds_lov_desc, 
+   which are not in lock-step during lov_add_obd */
+static int __mds_lov_synchronize(void *data)
 {
         struct mds_lov_sync_info *mlsi = data;
-        struct obd_device *obd;
+        struct obd_device *obd = mlsi->mlsi_obd;
+        struct obd_device *watched = mlsi->mlsi_watched;
+        struct mds_obd *mds = &obd->u.mds;
         struct obd_uuid *uuid;
+        __u32  idx = mlsi->mlsi_index;
         int rc = 0;
         ENTRY;
 
-        obd = mlsi->mlsi_obd;
-        uuid = mlsi->mlsi_uuid;
-
         OBD_FREE(mlsi, sizeof(*mlsi));
 
-        LASSERT(obd != NULL);
+        LASSERT(obd);
+        LASSERT(watched);
+        uuid = &watched->u.cli.cl_target_uuid;
+        LASSERT(uuid);
 
-        rc = obd_set_info_async(obd->u.mds.mds_osc_exp, strlen("mds_conn"),
-                          "mds_conn", 0, uuid, NULL);
+        rc = mds_lov_update_mds(obd, watched, idx);
+        if (rc != 0)
+                GOTO(out, rc);
+        
+        rc = obd_set_info_async(mds->mds_osc_exp, strlen(KEY_MDS_CONN),
+                                KEY_MDS_CONN, 0, uuid, NULL);
         if (rc != 0)
                 GOTO(out, rc);
 
         rc = llog_connect(llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT),
-                          obd->u.mds.mds_lov_desc.ld_tgt_count,
+                          mds->mds_lov_desc.ld_tgt_count,
                           NULL, NULL, uuid);
+        
         if (rc != 0) {
                 CERROR("%s: failed at llog_origin_connect: %d\n",
                        obd->obd_name, rc);
@@ -527,50 +669,60 @@ static int __mds_lov_syncronize(void *data)
         }
 
         LCONSOLE_INFO("MDS %s: %s now active, resetting orphans\n",
-                      obd->obd_name, uuid ? (char *)uuid->uuid : "All OSCs");
+              obd->obd_name, obd_uuid2str(uuid));
 
         if (obd->obd_stopping)
                 GOTO(out, rc = -ENODEV);
 
-        rc = mds_lov_clear_orphans(&obd->u.mds, uuid);
+        rc = mds_lov_clear_orphans(mds, uuid);
         if (rc != 0) {
                 CERROR("%s: failed at mds_lov_clear_orphans: %d\n",
                        obd->obd_name, rc);
                 GOTO(out, rc);
         }
 
-        EXIT;
 out:
         class_decref(obd);
-        return rc;
+        RETURN(rc);
 }
 
 int mds_lov_synchronize(void *data)
 {
-        ptlrpc_daemonize("mds_lov_sync");
+        struct mds_lov_sync_info *mlsi = data;
+        char name[20];
 
-        return (__mds_lov_syncronize(data));
+        sprintf(name, "ll_mlov_sync_%02u", mlsi->mlsi_index);
+        ptlrpc_daemonize(name);
+
+        RETURN(__mds_lov_synchronize(data));
 }
 
-int mds_lov_start_synchronize(struct obd_device *obd, struct obd_uuid *uuid,
-                              int nonblock)
+int mds_lov_start_synchronize(struct obd_device *obd, 
+                              struct obd_device *watched,
+                              void *data, int nonblock)
 {
         struct mds_lov_sync_info *mlsi;
         int rc;
 
         ENTRY;
 
+        LASSERT(watched);
+
         OBD_ALLOC(mlsi, sizeof(*mlsi));
         if (mlsi == NULL)
                 RETURN(-ENOMEM);
 
         mlsi->mlsi_obd = obd;
-        mlsi->mlsi_uuid = uuid;
+        mlsi->mlsi_watched = watched;
+        if (data) 
+                mlsi->mlsi_index = *(__u32 *)data;
+        else
+                mlsi->mlsi_index = MDSLOV_NO_INDEX;
 
         /* Although class_export_get(obd->obd_self_export) would lock
            the MDS in place, since it's only a self-export
            it doesn't lock the LOV in place.  The LOV can be disconnected
-           during MDS precleanup, leaving nothing for __mds_lov_syncronize.
+           during MDS precleanup, leaving nothing for __mds_lov_synchronize.
            Simply taking an export ref on the LOV doesn't help, because it's
            still disconnected. Taking an obd reference insures that we don't
            disconnect the LOV.  This of course means a cleanup won't
@@ -578,61 +730,67 @@ int mds_lov_start_synchronize(struct obd_device *obd, struct obd_uuid *uuid,
         class_incref(obd);
 
         if (nonblock) {
-                /* Syncronize in the background */
-                rc = kernel_thread(mds_lov_synchronize, mlsi, CLONE_VM | CLONE_FILES);
+                /* Synchronize in the background */
+                rc = cfs_kernel_thread(mds_lov_synchronize, mlsi,
+                                       CLONE_VM | CLONE_FILES);
                 if (rc < 0) {
                         CERROR("%s: error starting mds_lov_synchronize: %d\n",
                                obd->obd_name, rc);
                         class_decref(obd);
                 } else {
-                        CDEBUG(D_HA, "%s: mds_lov_synchronize thread: %d\n",
-                               obd->obd_name, rc);
+                        CDEBUG(D_HA, "%s: mds_lov_synchronize idx=%d "
+                               "thread=%d\n", obd->obd_name,
+                               mlsi->mlsi_index, rc);
                         rc = 0;
                 }
         } else {
-                rc = __mds_lov_syncronize((void *)mlsi);
+                rc = __mds_lov_synchronize((void *)mlsi);
         }
 
         RETURN(rc);
 }
 
 int mds_notify(struct obd_device *obd, struct obd_device *watched,
-               enum obd_notify_event ev)
+               enum obd_notify_event ev, void *data)
 {
-        struct mds_obd *mds = &obd->u.mds;
-        struct obd_uuid *uuid;
         int rc = 0;
         ENTRY;
 
-        if (ev != OBD_NOTIFY_ACTIVE)
+        switch (ev) {
+        /* We only handle these: */
+        case OBD_NOTIFY_ACTIVE:
+        case OBD_NOTIFY_SYNC:
+        case OBD_NOTIFY_SYNC_NONBLOCK:
+                break;
+        default:
                 RETURN(0);
+        }
+
+        CDEBUG(D_CONFIG, "notify %s ev=%d\n", watched->obd_name, ev);
 
-        if (strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME)) {
+        if (strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME) != 0) {
                 CERROR("unexpected notification of %s %s!\n",
                        watched->obd_type->typ_name, watched->obd_name);
                 RETURN(-EINVAL);
         }
 
-        uuid = &watched->u.cli.cl_target_uuid;
         if (obd->obd_recovering) {
-                /* in the case OBD is in recovery we do not reinit desc and
-                 * easize, as that will be done in mds_lov_connect() after
-                 * recovery is finished. */
                 CWARN("MDS %s: in recovery, not resetting orphans on %s\n",
-                      obd->obd_name, uuid->uuid);
-        } else {
-                LASSERT(llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT) != NULL);
-
-                /* this may be called also in case of adding new OST, thus, we
-                 * have to update MDS lov_desc and re-init MDS easize. The same
-                 * should be done on clients. */
-                rc = mds_init_lov_desc(obd, mds->mds_osc_exp);
-                if (rc)
-                        RETURN(rc);
-
-                rc = mds_lov_start_synchronize(obd, uuid, 1);
-                lquota_recovery(quota_interface, obd);
+                      obd->obd_name, 
+                      obd_uuid2str(&watched->u.cli.cl_target_uuid));
+                /* We still have to fix the lov descriptor for ost's added 
+                   after the mdt in the config log.  They didn't make it into
+                   mds_lov_connect. */
+                rc = mds_lov_update_desc(obd, obd->u.mds.mds_osc_exp);
+                RETURN(rc);
         }
+
+        LASSERT(llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT) != NULL);
+        rc = mds_lov_start_synchronize(obd, watched, data, 
+                                       !(ev == OBD_NOTIFY_SYNC));
+        
+        lquota_recovery(quota_interface, obd);
+                
         RETURN(rc);
 }
 
index f2c8d1b..585cbb0 100644 (file)
@@ -314,7 +314,7 @@ static int mds_create_objects(struct ptlrpc_request *req, int offset,
         struct obd_trans_info oti = { 0 };
         struct lov_stripe_md *lsm = NULL;
         struct lov_mds_md *lmm = NULL;
-        int rc, lmm_bufsize, lmm_size;
+        int rc, lmm_size;
         struct mds_body *body;
         struct obdo *oa;
         void *lmm_buf;
@@ -359,15 +359,13 @@ static int mds_create_objects(struct ptlrpc_request *req, int offset,
 
                 mds_objids_from_lmm(*ids, lmm, &mds->mds_lov_desc);
 
-                lmm_buf = lustre_msg_buf(req->rq_repmsg, offset, 0);
-                lmm_bufsize = req->rq_repmsg->buflens[offset];
+                rc = fsfilt_set_md(obd, inode, *handle, lmm, lmm_size, "lov");
+                lmm_buf = lustre_msg_buf(req->rq_repmsg, offset, lmm_size);
                 LASSERT(lmm_buf);
-                LASSERT(lmm_bufsize >= lmm_size);
                 memcpy(lmm_buf, lmm, lmm_size);
-                rc = fsfilt_set_md(obd, inode, *handle, lmm, lmm_size, "lov");
                 if (rc)
                         CERROR("open replay failed to set md:%d\n", rc);
-                RETURN(0);
+                RETURN(rc);
         }
 
         if (OBD_FAIL_CHECK_ONCE(OBD_FAIL_MDS_ALLOC_OBDO))
@@ -478,11 +476,8 @@ static int mds_create_objects(struct ptlrpc_request *req, int offset,
         }
 
         rc = fsfilt_set_md(obd, inode, *handle, lmm, lmm_size, "lov");
-        lmm_buf = lustre_msg_buf(req->rq_repmsg, offset, 0);
-        lmm_bufsize = req->rq_repmsg->buflens[offset];
+        lmm_buf = lustre_msg_buf(req->rq_repmsg, offset, lmm_size);
         LASSERT(lmm_buf);
-        LASSERT(lmm_bufsize >= lmm_size);
-
         memcpy(lmm_buf, lmm, lmm_size);
         obd_free_diskmd(mds->mds_osc_exp, &lmm);
  out_oa:
diff --git a/lustre/mgc/.cvsignore b/lustre/mgc/.cvsignore
new file mode 100644 (file)
index 0000000..d5103fa
--- /dev/null
@@ -0,0 +1,15 @@
+.Xrefs
+config.log
+config.status
+configure
+Makefile
+.deps
+TAGS
+.*.cmd
+autoMakefile.in
+autoMakefile
+*.ko
+*.mod.c
+.*.o.flags
+.tmp_versions
+.depend
diff --git a/lustre/mgc/Makefile.in b/lustre/mgc/Makefile.in
new file mode 100644 (file)
index 0000000..2d7cad5
--- /dev/null
@@ -0,0 +1,4 @@
+MODULES := mgc
+mgc-objs := mgc_request.o
+
+@INCLUDE_RULES@
diff --git a/lustre/mgc/autoMakefile.am b/lustre/mgc/autoMakefile.am
new file mode 100644 (file)
index 0000000..2b3a807
--- /dev/null
@@ -0,0 +1,11 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+if MODULES
+modulefs_DATA = mgc$(KMODEXT)
+endif
+
+MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ 
+DIST_SOURCES := $(mgc-objs:%.o=%.c)
diff --git a/lustre/mgc/mgc_request.c b/lustre/mgc/mgc_request.c
new file mode 100644 (file)
index 0000000..0b419ec
--- /dev/null
@@ -0,0 +1,1118 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  lustre/mgc/mgc_request.c
+ *  Lustre Management Client
+ *
+ *  Copyright (C) 2006 Cluster File Systems, Inc.
+ *   Author: Nathan Rutman <nathan@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_MGC
+#define D_MGC D_CONFIG /*|D_WARNING*/
+
+#ifdef __KERNEL__
+# include <linux/module.h>
+# include <linux/pagemap.h>
+# include <linux/miscdevice.h>
+# include <linux/init.h>
+#else
+# include <liblustre.h>
+#endif
+
+#include <obd_class.h>
+#include <lustre_dlm.h>
+#include <lustre_log.h>
+#include <lustre_fsfilt.h>
+#include <lustre_disk.h>
+
+
+int mgc_logname2resid(char *logname, struct ldlm_res_id *res_id)
+{
+        char *name_end;
+        int len;
+        __u64 resname = 0;
+        
+        /* fsname is at most 8 chars long at the beginning of the logname
+           e.g. "lustre-MDT0001" or "lustre" */
+        name_end = strrchr(logname, '-');
+        if (name_end)
+                len = name_end - logname;
+        else
+                len = strlen(logname);
+        if (len > 8) {
+                CERROR("fsname too long: %s\n", logname);
+                return -EINVAL;
+        }
+        if (len <= 0) {
+                CERROR("missing fsname: %s\n", logname);
+                return -EINVAL;
+        }
+        memcpy(&resname, logname, len);
+
+        memset(res_id, 0, sizeof(*res_id));
+        /* FIXME are resid names swabbed across the wire? */
+        res_id->name[0] = cpu_to_le64(resname);
+        CDEBUG(D_MGC, "log %s to resid "LPX64"/"LPX64" (%.8s)\n", logname,
+               res_id->name[0], res_id->name[1], (char *)&res_id->name[0]);
+        return 0;
+}
+EXPORT_SYMBOL(mgc_logname2resid);
+
+/********************** config llog list **********************/
+static struct list_head config_llog_list = LIST_HEAD_INIT(config_llog_list);
+static spinlock_t       config_list_lock = SPIN_LOCK_UNLOCKED;
+
+/* Take a reference to a config log */
+static int config_log_get(struct config_llog_data *cld)
+{
+        ENTRY;
+        CDEBUG(D_INFO, "log %s refs %d\n", cld->cld_logname,
+               atomic_read(&cld->cld_refcount));
+        if (cld->cld_stopping)
+                RETURN(1);
+        atomic_inc(&cld->cld_refcount);
+        RETURN(0);
+}
+
+/* Drop a reference to a config log.  When no longer referenced, 
+   we can free the config log data */
+static void config_log_put(struct config_llog_data *cld)
+{
+        ENTRY;
+        CDEBUG(D_INFO, "log %s refs %d\n", cld->cld_logname,
+               atomic_read(&cld->cld_refcount));
+        if (atomic_dec_and_test(&cld->cld_refcount)) {
+                CDEBUG(D_MGC, "dropping config log %s\n", cld->cld_logname);
+                OBD_FREE(cld->cld_logname, strlen(cld->cld_logname) + 1);
+                if (cld->cld_cfg.cfg_instance != NULL)
+                        OBD_FREE(cld->cld_cfg.cfg_instance, 
+                                 strlen(cld->cld_cfg.cfg_instance) + 1);
+                OBD_FREE(cld, sizeof(*cld));
+        }
+        EXIT;
+}
+
+/* Find a config log by name */
+static struct config_llog_data *config_log_find(char *logname, 
+                                               struct config_llog_instance *cfg)
+{
+        struct list_head *tmp;
+        struct config_llog_data *cld;
+        char *logid = logname;
+        int match_instance = 0;
+        ENTRY;
+
+        if (cfg && cfg->cfg_instance) {
+                match_instance++;
+                logid = cfg->cfg_instance;
+        }
+        if (!logid) {
+                CERROR("No log specified\n");
+                RETURN(ERR_PTR(-EINVAL));
+        }
+
+        spin_lock(&config_list_lock);
+        list_for_each(tmp, &config_llog_list) {
+                cld = list_entry(tmp, struct config_llog_data, cld_list_chain);
+                if (match_instance && cld->cld_cfg.cfg_instance && 
+                    strcmp(logid, cld->cld_cfg.cfg_instance) == 0)
+                        goto out_found;
+                if (!match_instance &&  
+                    strcmp(logid, cld->cld_logname) == 0)
+                        goto out_found;
+        }
+        spin_unlock(&config_list_lock);
+
+        CERROR("can't get log %s\n", logid);
+        RETURN(ERR_PTR(-ENOENT));
+out_found:
+        atomic_inc(&cld->cld_refcount);
+        spin_unlock(&config_list_lock);
+        RETURN(cld);
+}
+
+/* Add this log to our list of active logs. 
+   We have one active log per "mount" - client instance or servername.
+   Each instance may be at a different point in the log. */
+static int config_log_add(char *logname, struct config_llog_instance *cfg,
+                          struct super_block *sb)
+{
+        struct config_llog_data *cld;
+        int rc;
+        ENTRY;
+
+        CDEBUG(D_MGC, "adding config log %s:%s\n", logname, cfg->cfg_instance);
+        
+        OBD_ALLOC(cld, sizeof(*cld));
+        if (!cld) 
+                RETURN(-ENOMEM);
+        OBD_ALLOC(cld->cld_logname, strlen(logname) + 1);
+        if (!cld->cld_logname) { 
+                OBD_FREE(cld, sizeof(*cld));
+                RETURN(-ENOMEM);
+        }
+        strcpy(cld->cld_logname, logname);
+        cld->cld_cfg = *cfg;
+        cld->cld_cfg.cfg_last_idx = 0;
+        cld->cld_cfg.cfg_flags = 0;
+        cld->cld_cfg.cfg_sb = sb;
+        atomic_set(&cld->cld_refcount, 1);
+        if (cfg->cfg_instance != NULL) {
+                OBD_ALLOC(cld->cld_cfg.cfg_instance, 
+                          strlen(cfg->cfg_instance) + 1);
+                strcpy(cld->cld_cfg.cfg_instance, cfg->cfg_instance);
+        }
+        rc = mgc_logname2resid(logname, &cld->cld_resid);
+        if (rc) {
+                config_log_put(cld);
+                RETURN(rc);
+        }
+        spin_lock(&config_list_lock);
+        list_add(&cld->cld_list_chain, &config_llog_list);
+        spin_unlock(&config_list_lock);
+        
+        RETURN(rc);
+}
+
+/* Stop watching for updates on this log. */
+static int config_log_end(char *logname, struct config_llog_instance *cfg)
+{       
+        struct config_llog_data *cld;
+        int rc = 0;
+        ENTRY;
+                                       
+        cld = config_log_find(logname, cfg);
+        if (IS_ERR(cld)) 
+                RETURN(PTR_ERR(cld));
+        /* drop the ref from the find */
+        config_log_put(cld);
+
+        cld->cld_stopping = 1;
+        spin_lock(&config_list_lock);
+        list_del(&cld->cld_list_chain);
+        spin_unlock(&config_list_lock);
+        /* drop the start ref */
+        config_log_put(cld);
+        CDEBUG(D_MGC, "end config log %s (%d)\n", logname ? logname : "client",
+               rc);
+        RETURN(rc);
+}
+
+/* Failsafe */
+static void config_log_end_all(void)
+{
+        struct list_head *tmp, *n;
+        struct config_llog_data *cld;
+        ENTRY;
+        
+        spin_lock(&config_list_lock);
+        list_for_each_safe(tmp, n, &config_llog_list) {
+                cld = list_entry(tmp, struct config_llog_data, cld_list_chain);
+                CERROR("conflog failsafe %s\n", cld->cld_logname);
+                list_del(&cld->cld_list_chain);
+                config_log_put(cld);
+        }
+        spin_unlock(&config_list_lock);
+        EXIT;
+}
+
+
+/********************** class fns **********************/
+
+static int mgc_fs_setup(struct obd_device *obd, struct super_block *sb, 
+                        struct vfsmount *mnt)
+{
+        struct lvfs_run_ctxt saved;
+        struct lustre_sb_info *lsi = s2lsi(sb);
+        struct client_obd *cli = &obd->u.cli;
+        struct dentry *dentry;
+        char *label;
+        int err = 0;
+        ENTRY;
+
+        LASSERT(lsi);
+        LASSERT(lsi->lsi_srv_mnt == mnt);
+
+        /* The mgc fs exclusion sem. Only one fs can be setup at a time. */
+        down(&cli->cl_mgc_sem);
+
+        cleanup_group_info();
+
+        obd->obd_fsops = fsfilt_get_ops(MT_STR(lsi->lsi_ldd));
+        if (IS_ERR(obd->obd_fsops)) {
+                up(&cli->cl_mgc_sem);
+                CERROR("No fstype %s rc=%ld\n", MT_STR(lsi->lsi_ldd), 
+                       PTR_ERR(obd->obd_fsops));
+                RETURN(PTR_ERR(obd->obd_fsops));
+        }
+
+        cli->cl_mgc_vfsmnt = mnt;
+        fsfilt_setup(obd, mnt->mnt_sb);
+
+        OBD_SET_CTXT_MAGIC(&obd->obd_lvfs_ctxt);
+        obd->obd_lvfs_ctxt.pwdmnt = mnt;
+        obd->obd_lvfs_ctxt.pwd = mnt->mnt_root;
+        obd->obd_lvfs_ctxt.fs = get_ds();
+
+        push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+        dentry = lookup_one_len(MOUNT_CONFIGS_DIR, current->fs->pwd,
+                                strlen(MOUNT_CONFIGS_DIR));
+        pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+        if (IS_ERR(dentry)) {
+                err = PTR_ERR(dentry);
+                CERROR("cannot lookup %s directory: rc = %d\n", 
+                       MOUNT_CONFIGS_DIR, err);
+                GOTO(err_ops, err);
+        }
+        cli->cl_mgc_configs_dir = dentry;
+
+        /* We take an obd ref to insure that we can't get to mgc_cleanup
+           without calling mgc_fs_cleanup first. */
+        class_incref(obd);
+
+        label = fsfilt_get_label(obd, mnt->mnt_sb);
+        if (label)
+                CDEBUG(D_MGC, "MGC using disk labelled=%s\n", label);
+
+        /* We keep the cl_mgc_sem until mgc_fs_cleanup */
+        RETURN(0);
+
+err_ops:        
+        fsfilt_put_ops(obd->obd_fsops);
+        obd->obd_fsops = NULL;
+        cli->cl_mgc_vfsmnt = NULL;
+        up(&cli->cl_mgc_sem);
+        RETURN(err);
+}
+
+static int mgc_fs_cleanup(struct obd_device *obd)
+{
+        struct client_obd *cli = &obd->u.cli;
+        int rc = 0;
+        ENTRY;
+
+        LASSERT(cli->cl_mgc_vfsmnt != NULL);
+
+        if (cli->cl_mgc_configs_dir != NULL) {
+                struct lvfs_run_ctxt saved;
+                push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+                l_dput(cli->cl_mgc_configs_dir);
+                cli->cl_mgc_configs_dir = NULL; 
+                pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+                class_decref(obd);
+        }
+
+        cli->cl_mgc_vfsmnt = NULL;
+        if (obd->obd_fsops) 
+                fsfilt_put_ops(obd->obd_fsops);
+        
+        up(&cli->cl_mgc_sem);
+        RETURN(rc);
+}
+
+static int mgc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
+{
+        int rc = 0;
+        ENTRY;
+
+        switch (stage) {
+        case OBD_CLEANUP_EARLY: 
+        case OBD_CLEANUP_EXPORTS:
+                break;
+        case OBD_CLEANUP_SELF_EXP:
+                rc = obd_llog_finish(obd, 0);
+                if (rc != 0)
+                        CERROR("failed to cleanup llogging subsystems\n");
+                break;
+        case OBD_CLEANUP_OBD:
+                break;
+        }
+        RETURN(rc);
+}
+
+static int mgc_cleanup(struct obd_device *obd)
+{
+        struct client_obd *cli = &obd->u.cli;
+        int rc;
+        ENTRY;
+
+        LASSERT(cli->cl_mgc_vfsmnt == NULL);
+        
+        config_log_end_all();
+
+        ptlrpcd_decref();
+
+        rc = client_obd_cleanup(obd);
+        RETURN(rc);
+}
+
+static struct obd_device *the_mgc;
+
+static int mgc_setup(struct obd_device *obd, obd_count len, void *buf)
+{
+        int rc;
+        ENTRY;
+
+        ptlrpcd_addref();
+
+        rc = client_obd_setup(obd, len, buf);
+        if (rc)
+                GOTO(err_decref, rc);
+
+        rc = obd_llog_init(obd, obd, 0, NULL);
+        if (rc) {
+                CERROR("failed to setup llogging subsystems\n");
+                GOTO(err_cleanup, rc);
+        }
+
+        the_mgc = obd;
+        RETURN(rc);
+
+err_cleanup:
+        client_obd_cleanup(obd);
+err_decref:
+        ptlrpcd_decref();
+        RETURN(rc);
+}
+
+static int mgc_process_log(struct obd_device *mgc, 
+                           struct config_llog_data *cld);
+
+/* FIXME I don't want a thread for every cld; make a list of cld's to requeue
+   and use only 1 thread. */
+/* reenqueue the lock, reparse the log */
+static int mgc_async_requeue(void *data)
+{
+        wait_queue_head_t   waitq;
+        struct l_wait_info  lwi;
+        struct config_llog_data *cld = (struct config_llog_data *)data;
+        char name[24];
+        int rc = 0;
+        ENTRY;
+
+        if (!data) 
+                RETURN(-EINVAL);
+        if (cld->cld_stopping) 
+                GOTO(out, rc = 0);
+
+        snprintf(name, sizeof(name), "ll_log_%s", cld->cld_logname);
+        name[sizeof(name)-1] = '\0';
+        ptlrpc_daemonize(name);
+
+        CDEBUG(D_MGC, "requeue "LPX64" %s:%s\n", 
+               cld->cld_resid.name[0], cld->cld_logname, 
+               cld->cld_cfg.cfg_instance);
+        
+        /* Sleep a few seconds to allow the server who caused
+           the lock revocation to finish its setup, plus some random
+           so everyone doesn't try to reconnect at once. */
+        init_waitqueue_head(&waitq);
+        lwi = LWI_TIMEOUT(3 * HZ + (ll_rand() & 0x7f), NULL, NULL);
+        l_wait_event(waitq, 0, &lwi);
+
+        LASSERT(the_mgc);
+
+        class_export_get(the_mgc->obd_self_export);
+#if 0
+        /* Re-send server info every time, in case MGS needs to regen its
+           logs (for write_conf).  Do we need this?  It's extra RPCs for
+           every server at every update.  Turning it off until I'm sure
+           it's needed. */
+        server_register_target(cld->cld_cfg.cfg_sb);
+#endif 
+        rc = mgc_process_log(the_mgc, cld);
+        class_export_put(the_mgc->obd_self_export);
+out:
+        /* Whether we enqueued again or not in mgc_process_log, 
+           we're done with the ref from the old mgc_blocking_ast */        
+        config_log_put(cld);                                                    
+
+        RETURN(rc);
+}
+
+/* based on ll_mdc_blocking_ast */
+static int mgc_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
+                            void *data, int flag)
+{
+        struct lustre_handle lockh;
+        struct config_llog_data *cld = (struct config_llog_data *)data;
+        int rc = 0;
+        ENTRY;
+
+        switch (flag) {
+        case LDLM_CB_BLOCKING:
+                /* mgs wants the lock, give it up... */
+                LDLM_DEBUG(lock, "MGC blocking CB");
+                ldlm_lock2handle(lock, &lockh);
+                rc = ldlm_cli_cancel(&lockh);
+                break;
+        case LDLM_CB_CANCELING: {
+                /* We've given up the lock, prepare ourselves to update. */
+                LDLM_DEBUG(lock, "MGC cancel CB");
+                
+                CDEBUG(D_MGC, "Lock res "LPX64" (%.8s)\n",
+                       lock->l_resource->lr_name.name[0], 
+                       (char *)&lock->l_resource->lr_name.name[0]);
+
+                /* Make sure not to re-enqueue when the mgc is stopping
+                   (we get called from client_disconnect_export) */
+                if (!lock->l_conn_export ||
+                    !lock->l_conn_export->exp_obd->u.cli.cl_conn_count) {
+                        CDEBUG(D_MGC, "Disconnecting, don't requeue\n");
+                        goto out_drop;
+                }
+                if (lock->l_req_mode != lock->l_granted_mode) {
+                        CERROR("original grant failed, won't requeue\n");
+                        goto out_drop;
+                }
+                if (!data) {
+                        CERROR("missing data, won't requeue\n");
+                        goto out_drop;
+                }
+                if (cld->cld_stopping) {
+                        CERROR("stopping, won't requeue\n");
+                        goto out_drop;
+                }
+
+                /* Re-enqueue the lock in a separate thread, because we must
+                   return from this fn before that lock can be taken. */
+                rc = cfs_kernel_thread(mgc_async_requeue, data,
+                                       CLONE_VM | CLONE_FILES);
+                if (rc < 0) {
+                        CERROR("Cannot re-enqueue thread: %d\n", rc);
+                } else {
+                        rc = 0;
+                        break;
+                }
+out_drop:
+                /* Drop this here or in mgc_async_requeue,
+                   in either case, we're done with the reference
+                   after this. */
+                config_log_put(cld);    
+                break;
+        }
+        default:
+                LBUG();
+        }
+
+
+        if (rc) {
+                CERROR("%s CB failed %d:\n", flag == LDLM_CB_BLOCKING ? 
+                       "blocking" : "cancel", rc);
+                LDLM_ERROR(lock, "MGC ast");
+        }
+        RETURN(rc);
+}
+
+/* Take a config lock so we can get cancel notifications */
+static int mgc_enqueue(struct obd_export *exp, struct lov_stripe_md *lsm,
+                       __u32 type, ldlm_policy_data_t *policy, __u32 mode,
+                       int *flags, void *bl_cb, void *cp_cb, void *gl_cb,
+                       void *data, __u32 lvb_len, void *lvb_swabber,
+                       struct lustre_handle *lockh)
+{                       
+        struct config_llog_data *cld = (struct config_llog_data *)data;
+        struct obd_device *obd = class_exp2obd(exp);
+        int rc;
+        ENTRY;
+
+        CDEBUG(D_MGC, "Enqueue for %s (res "LPX64")\n", cld->cld_logname,
+               cld->cld_resid.name[0]);
+                
+        /* We can only drop this config log ref when we drop the lock */
+        if (config_log_get(cld))
+                RETURN(ELDLM_LOCK_ABORTED);
+
+        /* We need a callback for every lockholder, so don't try to
+           ldlm_lock_match (see rev 1.1.2.11.2.47) */
+
+        rc = ldlm_cli_enqueue(exp, NULL, obd->obd_namespace, cld->cld_resid,
+                              type, NULL, mode, flags, 
+                              mgc_blocking_ast, ldlm_completion_ast, NULL,
+                              data, NULL, 0, NULL, lockh);
+
+        RETURN(rc);
+}
+
+static int mgc_cancel(struct obd_export *exp, struct lov_stripe_md *md,
+                      __u32 mode, struct lustre_handle *lockh)
+{
+        ENTRY;
+
+        ldlm_lock_decref(lockh, mode);
+
+        RETURN(0);
+}
+
+#if 0
+static int mgc_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
+                         void *karg, void *uarg)
+{
+        struct obd_device *obd = exp->exp_obd;
+        struct obd_ioctl_data *data = karg;
+        struct llog_ctxt *ctxt;
+        struct lvfs_run_ctxt saved;
+        int rc;
+        ENTRY;
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+        MOD_INC_USE_COUNT;
+#else
+        if (!try_module_get(THIS_MODULE)) {
+                CERROR("Can't get module. Is it alive?");
+                return -EINVAL;
+        }
+#endif
+        switch (cmd) {
+        /* REPLicator context */
+        case OBD_IOC_PARSE: {
+                CERROR("MGC parsing llog %s\n", data->ioc_inlbuf1);
+                ctxt = llog_get_context(exp->exp_obd, LLOG_CONFIG_REPL_CTXT);
+                rc = class_config_parse_llog(ctxt, data->ioc_inlbuf1, NULL);
+                GOTO(out, rc);
+        }
+#ifdef __KERNEL__
+        case OBD_IOC_LLOG_INFO:
+        case OBD_IOC_LLOG_PRINT: {
+                ctxt = llog_get_context(obd, LLOG_CONFIG_REPL_CTXT);
+                rc = llog_ioctl(ctxt, cmd, data);
+
+                GOTO(out, rc);
+        }
+#endif
+        /* ORIGinator context */
+        case OBD_IOC_DUMP_LOG: {
+                ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT);
+                push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+                rc = class_config_dump_llog(ctxt, data->ioc_inlbuf1, NULL);
+                pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+                if (rc)
+                        RETURN(rc);
+
+                GOTO(out, rc);
+        }
+        default:
+                CERROR("mgc_ioctl(): unrecognised ioctl %#x\n", cmd);
+                GOTO(out, rc = -ENOTTY);
+        }
+out:
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+        MOD_DEC_USE_COUNT;
+#else
+        module_put(THIS_MODULE);
+#endif
+
+        return rc;
+}
+#endif
+
+/* Send target_reg message to MGS */
+static int mgc_target_register(struct obd_export *exp,
+                               struct mgs_target_info *mti)
+{
+        struct ptlrpc_request *req;
+        struct mgs_target_info *req_mti, *rep_mti;
+        int size = sizeof(*req_mti);
+        int rep_size = sizeof(*mti);
+        int rc;
+        ENTRY;
+
+        req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MGS_VERSION,
+                              MGS_TARGET_REG, 1, &size, NULL);
+        if (!req)
+                RETURN(-ENOMEM);
+
+        req_mti = lustre_msg_buf(req->rq_reqmsg, 0, sizeof(*req_mti));
+        if (!req_mti) 
+                RETURN(-ENOMEM);
+        memcpy(req_mti, mti, sizeof(*req_mti));
+
+        req->rq_replen = lustre_msg_size(1, &rep_size);
+
+        CDEBUG(D_MGC, "register %s\n", mti->mti_svname);
+        
+        rc = ptlrpc_queue_wait(req);
+        if (!rc) {
+                rep_mti = lustre_swab_repbuf(req, 0, sizeof(*rep_mti),
+                                             lustre_swab_mgs_target_info);
+                memcpy(mti, rep_mti, sizeof(*rep_mti));
+                CDEBUG(D_MGC, "register %s got index = %d\n",
+                       mti->mti_svname, mti->mti_stripe_index);
+        } else {
+                CERROR("register failed. rc=%d\n", rc);
+        }
+        ptlrpc_req_finished(req);
+
+        RETURN(rc);
+}
+
+int mgc_set_info_async(struct obd_export *exp, obd_count keylen,
+                       void *key, obd_count vallen, void *val, 
+                       struct ptlrpc_request_set *set)
+{
+        struct obd_import *imp = class_exp2cliimp(exp);
+        int rc = -EINVAL;
+        ENTRY;
+
+        /* Try to "recover" the initial connection; i.e. retry */
+        if (KEY_IS(KEY_INIT_RECOV)) {
+                if (vallen != sizeof(int))
+                        RETURN(-EINVAL);
+                imp->imp_initial_recov = *(int *)val;
+                CDEBUG(D_HA, "%s: set imp_initial_recov = %d\n",
+                       exp->exp_obd->obd_name, imp->imp_initial_recov);
+                RETURN(0);
+        }
+        /* Turn off initial_recov after we try all backup servers once */
+        if (KEY_IS(KEY_INIT_RECOV_BACKUP)) {
+                int value;
+                if (vallen != sizeof(int))
+                        RETURN(-EINVAL);
+                value = *(int *)val;
+                imp->imp_initial_recov_bk = value > 0;
+                if (imp->imp_invalid || value > 1) {
+                        /* Resurrect if we previously died */
+                        CDEBUG(D_MGC, "Reactivate %s %d:%d:%d:%s\n", 
+                               imp->imp_obd->obd_name, value,
+                               imp->imp_deactive, imp->imp_invalid, 
+                               ptlrpc_import_state_name(imp->imp_state));
+                        /* can't put this in obdclass, module loop with ptlrpc*/
+                        /* This seems to be necessary when restarting a 
+                           combo mgs/mdt while the mgc is alive */
+                        ptlrpc_invalidate_import(imp);
+                        /* Remove 'invalid' flag */
+                        ptlrpc_activate_import(imp);
+                        /* Attempt a new connect */
+                        ptlrpc_recover_import(imp, NULL);
+                }
+                RETURN(0);
+        }
+        /* Hack alert */
+        if (KEY_IS("register_target")) {
+                struct mgs_target_info *mti;
+                if (vallen != sizeof(struct mgs_target_info))
+                        RETURN(-EINVAL);
+                mti = (struct mgs_target_info *)val;
+                CDEBUG(D_MGC, "register_target %s %#x\n",
+                       mti->mti_svname, mti->mti_flags);
+                rc =  mgc_target_register(exp, mti);
+                RETURN(rc);
+        }
+        if (KEY_IS("set_fs")) {
+                struct super_block *sb = (struct super_block *)val;
+                struct lustre_sb_info *lsi;
+                if (vallen != sizeof(struct super_block))
+                        RETURN(-EINVAL);
+                lsi = s2lsi(sb);
+                rc = mgc_fs_setup(exp->exp_obd, sb, lsi->lsi_srv_mnt);
+                if (rc) {
+                        CERROR("set_fs got %d\n", rc);
+                }
+                RETURN(rc);
+        }
+        if (KEY_IS("clear_fs")) {
+                if (vallen != 0)
+                        RETURN(-EINVAL);
+                rc = mgc_fs_cleanup(exp->exp_obd);
+                if (rc) {
+                        CERROR("clear_fs got %d\n", rc);
+                }
+                RETURN(rc);
+        }
+
+        RETURN(rc);
+}               
+
+static int mgc_import_event(struct obd_device *obd,
+                            struct obd_import *imp,
+                            enum obd_import_event event)
+{
+        int rc = 0;
+
+        LASSERT(imp->imp_obd == obd);
+        CDEBUG(D_MGC, "import event %#x\n", event);
+
+        switch (event) {
+        case IMP_EVENT_INVALIDATE: {
+                struct ldlm_namespace *ns = obd->obd_namespace;
+                ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
+                break;
+        }
+        case IMP_EVENT_DISCON: 
+                /* MGC imports should not wait for recovery */
+                ptlrpc_invalidate_import(imp);
+                break;
+        case IMP_EVENT_INACTIVE: 
+        case IMP_EVENT_ACTIVE: 
+        case IMP_EVENT_OCD:
+                break;
+        default:
+                CERROR("Unknown import event %#x\n", event);
+                LBUG();
+        }
+        RETURN(rc);
+}
+
+static int mgc_llog_init(struct obd_device *obd, struct obd_device *tgt,
+                         int count, struct llog_catid *logid)
+{
+        struct llog_ctxt *ctxt;
+        int rc;
+        ENTRY;
+
+        rc = llog_setup(obd, LLOG_CONFIG_ORIG_CTXT, tgt, 0, NULL,
+                        &llog_lvfs_ops);
+        if (rc)
+                RETURN(rc);
+
+        rc = llog_setup(obd, LLOG_CONFIG_REPL_CTXT, tgt, 0, NULL,
+                        &llog_client_ops);
+        if (rc == 0) {
+                ctxt = llog_get_context(obd, LLOG_CONFIG_REPL_CTXT);
+                ctxt->loc_imp = obd->u.cli.cl_import;
+        }
+
+        RETURN(rc);
+}
+
+static int mgc_llog_finish(struct obd_device *obd, int count)
+{
+        int rc;
+        ENTRY;
+
+        rc = llog_cleanup(llog_get_context(obd, LLOG_CONFIG_REPL_CTXT));
+        rc = llog_cleanup(llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT));
+
+        RETURN(rc);
+}
+
+/* identical to mgs_log_is_empty */
+static int mgc_llog_is_empty(struct obd_device *obd, struct llog_ctxt *ctxt,
+                            char *name)
+{
+        struct lvfs_run_ctxt saved;
+        struct llog_handle *llh;
+        int rc = 0;
+
+        push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+        rc = llog_create(ctxt, &llh, NULL, name);
+        if (rc == 0) {
+                llog_init_handle(llh, LLOG_F_IS_PLAIN, NULL);
+                rc = llog_get_size(llh);
+                llog_close(llh);
+        }
+        pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+        /* header is record 1 */
+        return(rc <= 1);
+}
+
+static int mgc_copy_handler(struct llog_handle *llh, struct llog_rec_hdr *rec, 
+                            void *data)
+{
+        struct llog_rec_hdr local_rec = *rec;
+        struct llog_handle *local_llh = (struct llog_handle *)data;
+        char *cfg_buf = (char*) (rec + 1);
+        struct lustre_cfg *lcfg;
+        int rc = 0;
+        ENTRY;
+
+        lcfg = (struct lustre_cfg *)cfg_buf;
+
+        /* FIXME we should always write to an empty log, so remove this check.*/
+        /* append new records */
+        if (rec->lrh_index >= llog_get_size(local_llh)) { 
+                rc = llog_write_rec(local_llh, &local_rec, NULL, 0, 
+                                    (void *)cfg_buf, -1);
+
+                CDEBUG(D_INFO, "idx=%d, rc=%d, len=%d, cmd %x %s %s\n", 
+                       rec->lrh_index, rc, rec->lrh_len, lcfg->lcfg_command, 
+                       lustre_cfg_string(lcfg, 0), lustre_cfg_string(lcfg, 1));
+        } else {
+                CDEBUG(D_INFO, "skip idx=%d\n",  rec->lrh_index);
+        }
+
+        RETURN(rc);
+}
+
+static int mgc_copy_llog(struct obd_device *obd, struct llog_ctxt *rctxt,
+                         struct llog_ctxt *lctxt, char *logname)
+{
+        struct llog_handle *local_llh, *remote_llh;
+        struct obd_uuid *uuid;
+        int rc, rc2;
+        ENTRY;
+
+        /* open local log */
+        rc = llog_create(lctxt, &local_llh, NULL, logname);
+        if (rc)
+                RETURN(rc);
+        /* set the log header uuid for fun */
+        OBD_ALLOC_PTR(uuid);
+        obd_str2uuid(uuid, logname);
+        rc = llog_init_handle(local_llh, LLOG_F_IS_PLAIN, uuid);
+        OBD_FREE_PTR(uuid);
+        if (rc)
+                GOTO(out_closel, rc);
+
+        /* FIXME write new log to a temp name, then vfs_rename over logname
+           upon successful completion. */
+
+        /* open remote log */
+        rc = llog_create(rctxt, &remote_llh, NULL, logname);
+        if (rc)
+                GOTO(out_closel, rc);
+        rc = llog_init_handle(remote_llh, LLOG_F_IS_PLAIN, NULL);
+        if (rc)
+                GOTO(out_closer, rc);
+
+        rc = llog_process(remote_llh, mgc_copy_handler,(void *)local_llh, NULL);
+
+out_closer:
+        rc2 = llog_close(remote_llh);
+        if (!rc)
+                rc = rc2;
+out_closel:
+        rc2 = llog_close(local_llh);
+        if (!rc)
+                rc = rc2;
+
+        CDEBUG(D_MGC, "Copied remote log %s (%d)\n", logname, rc);
+        RETURN(rc);
+}
+
+DECLARE_MUTEX(llog_process_lock);
+
+/* Get a config log from the MGS and process it.
+   This func is called for both clients and servers. */
+static int mgc_process_log(struct obd_device *mgc, 
+                           struct config_llog_data *cld)
+{
+        struct llog_ctxt *ctxt, *lctxt;
+        struct lustre_handle lockh;
+        struct client_obd *cli = &mgc->u.cli;
+        struct lvfs_run_ctxt saved;
+        struct lustre_sb_info *lsi;
+        int rc = 0, rcl, flags = 0, must_pop = 0;
+        ENTRY;
+
+        if (!cld || !cld->cld_cfg.cfg_sb) {
+                /* This should never happen */
+                CERROR("Missing cld, aborting log update\n");
+                RETURN(-EINVAL);
+        }
+        if (cld->cld_stopping) 
+                RETURN(0);
+
+        lsi = s2lsi(cld->cld_cfg.cfg_sb);
+
+        CDEBUG(D_MGC, "Process log %s:%s from %d\n", cld->cld_logname, 
+               cld->cld_cfg.cfg_instance, cld->cld_cfg.cfg_last_idx + 1);
+
+        ctxt = llog_get_context(mgc, LLOG_CONFIG_REPL_CTXT);
+        if (!ctxt) {
+                CERROR("missing llog context\n");
+                RETURN(-EINVAL);
+        }
+
+        /* I don't want mutliple processes running process_log at once -- 
+           sounds like badness.  It actually might be fine, as long as 
+           we're not trying to update from the same log
+           simultaneously (in which case we should use a per-log sem.) */
+        down(&llog_process_lock);
+
+        /* Get the cfg lock on the llog */
+        rcl = mgc_enqueue(mgc->u.cli.cl_mgc_mgsexp, NULL, LDLM_PLAIN, NULL, 
+                          LCK_CR, &flags, NULL, NULL, NULL, 
+                          cld, 0, NULL, &lockh);
+        if (rcl) 
+                CERROR("Can't get cfg lock: %d\n", rcl);
+        
+        lctxt = llog_get_context(mgc, LLOG_CONFIG_ORIG_CTXT);
+
+        /* Copy the setup log locally if we can. Don't mess around if we're 
+           running an MGS though (logs are already local). */
+        if (lctxt && lsi && (lsi->lsi_flags & LSI_SERVER) && 
+            (lsi->lsi_srv_mnt == cli->cl_mgc_vfsmnt) &&
+            !IS_MGS(lsi->lsi_ldd)) {
+                push_ctxt(&saved, &mgc->obd_lvfs_ctxt, NULL);
+                must_pop++;
+                if (rcl == 0) 
+                        /* Only try to copy log if we have the lock. */
+                        rc = mgc_copy_llog(mgc, ctxt, lctxt, cld->cld_logname);
+                if (rcl || rc) {
+                        if (mgc_llog_is_empty(mgc, lctxt, cld->cld_logname)) {
+                                LCONSOLE_ERROR("Failed to get MGS log %s "
+                                               "and no local copy.\n",
+                                               cld->cld_logname);
+                                GOTO(out_pop, rc = -ENOTCONN);
+                        }
+                        LCONSOLE_WARN("Failed to get MGS log %s, using "
+                                      "local copy.\n", cld->cld_logname);
+                }
+                /* Now, whether we copied or not, start using the local llog.
+                   If we failed to copy, we'll start using whatever the old 
+                   log has. */
+                ctxt = lctxt;
+        }
+
+        /* logname and instance info should be the same, so use our 
+           copy of the instance for the update.  The cfg_last_idx will
+           be updated here. */
+        rc = class_config_parse_llog(ctxt, cld->cld_logname, &cld->cld_cfg);
+        
+ out_pop:
+        if (must_pop) 
+                pop_ctxt(&saved, &mgc->obd_lvfs_ctxt, NULL);
+
+        /* Now drop the lock so MGS can revoke it */ 
+        if (!rcl) {
+                rcl = mgc_cancel(mgc->u.cli.cl_mgc_mgsexp, NULL, 
+                                 LCK_CR, &lockh);
+                if (rcl) 
+                        CERROR("Can't drop cfg lock: %d\n", rcl);
+        }
+        
+        if (rc) {
+                CERROR("%s: the configuration '%s' could not be read "
+                       "(%d) from the MGS.\n",
+                       mgc->obd_name, cld->cld_logname, rc);
+        }
+
+        up(&llog_process_lock);
+        
+        RETURN(rc);
+}
+
+static int mgc_process_config(struct obd_device *obd, obd_count len, void *buf)
+{
+        struct lustre_cfg *lcfg = buf;
+        int cmd;
+        int rc = 0;
+        ENTRY;
+
+        switch(cmd = lcfg->lcfg_command) {
+        case LCFG_LOV_ADD_OBD: {
+                struct mgs_target_info *mti;
+
+                if (LUSTRE_CFG_BUFLEN(lcfg, 1) != 
+                    sizeof(struct mgs_target_info))
+                        GOTO(out, rc = -EINVAL);
+
+                mti = (struct mgs_target_info *)lustre_cfg_buf(lcfg, 1);
+                CDEBUG(D_MGC, "add_target %s %#x\n",    
+                       mti->mti_svname, mti->mti_flags);
+                rc = mgc_target_register(obd->u.cli.cl_mgc_mgsexp, mti);
+                break;
+        }
+        case LCFG_LOV_DEL_OBD: 
+                /* FIXME */
+                CERROR("lov_del_obd unimplemented\n");
+                rc = -ENOSYS;
+                break;
+        case LCFG_LOG_START: {
+                struct config_llog_data *cld;
+                struct config_llog_instance *cfg;
+                struct super_block *sb;
+                char *logname = lustre_cfg_string(lcfg, 1);
+                cfg = (struct config_llog_instance *)lustre_cfg_buf(lcfg, 2);
+                sb = *(struct super_block **)lustre_cfg_buf(lcfg, 3);
+                
+                CDEBUG(D_MGC, "parse_log %s from %d\n", logname, 
+                       cfg->cfg_last_idx);
+
+                /* We're only called through here on the initial mount */
+                rc = config_log_add(logname, cfg, sb);
+                if (rc) 
+                        break;
+                cld = config_log_find(logname, cfg);
+                if (IS_ERR(cld)) {
+                        rc = PTR_ERR(cld);
+                        break;
+                }
+                
+                /* COMPAT_146 */
+                /* For old logs, there was no start marker. */
+                /* FIXME only set this for old logs! */
+                cld->cld_cfg.cfg_flags |= CFG_F_MARKER;
+                
+                rc = mgc_process_log(obd, cld);
+                config_log_put(cld);
+                
+                break;       
+        }
+        case LCFG_LOG_END: {
+                struct config_llog_instance *cfg = NULL;
+                char *logname = lustre_cfg_string(lcfg, 1);
+                if (lcfg->lcfg_bufcount >= 2)
+                        cfg = (struct config_llog_instance *)lustre_cfg_buf(
+                                lcfg, 2);
+                rc = config_log_end(logname, cfg);
+                break;
+        }
+        default: {
+                CERROR("Unknown command: %d\n", lcfg->lcfg_command);
+                GOTO(out, rc = -EINVAL);
+
+        }
+        }
+out:
+        RETURN(rc);
+}
+
+struct obd_ops mgc_obd_ops = {
+        .o_owner        = THIS_MODULE,
+        .o_setup        = mgc_setup,
+        .o_precleanup   = mgc_precleanup,
+        .o_cleanup      = mgc_cleanup,
+        .o_add_conn     = client_import_add_conn,
+        .o_del_conn     = client_import_del_conn,
+        .o_connect      = client_connect_import,
+        .o_disconnect   = client_disconnect_export,
+        //.o_enqueue      = mgc_enqueue,
+        .o_cancel       = mgc_cancel,
+        //.o_iocontrol    = mgc_iocontrol,
+        .o_set_info_async = mgc_set_info_async,
+        .o_import_event = mgc_import_event,
+        .o_llog_init    = mgc_llog_init,
+        .o_llog_finish  = mgc_llog_finish,
+        .o_process_config = mgc_process_config,
+};
+
+int __init mgc_init(void)
+{
+        return class_register_type(&mgc_obd_ops, NULL, LUSTRE_MGC_NAME);
+}
+
+#ifdef __KERNEL__
+static void /*__exit*/ mgc_exit(void)
+{
+        class_unregister_type(LUSTRE_MGC_NAME);
+}
+
+MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_DESCRIPTION("Lustre Management Client");
+MODULE_LICENSE("GPL");
+
+module_init(mgc_init);
+module_exit(mgc_exit);
+#endif
diff --git a/lustre/mgs/.cvsignore b/lustre/mgs/.cvsignore
new file mode 100644 (file)
index 0000000..d5103fa
--- /dev/null
@@ -0,0 +1,15 @@
+.Xrefs
+config.log
+config.status
+configure
+Makefile
+.deps
+TAGS
+.*.cmd
+autoMakefile.in
+autoMakefile
+*.ko
+*.mod.c
+.*.o.flags
+.tmp_versions
+.depend
diff --git a/lustre/mgs/Makefile.in b/lustre/mgs/Makefile.in
new file mode 100644 (file)
index 0000000..8bb6a5f
--- /dev/null
@@ -0,0 +1,4 @@
+MODULES := mgs
+mgs-objs := mgs_handler.o mgs_fs.o mgs_llog.o lproc_mgs.o
+
+@INCLUDE_RULES@
diff --git a/lustre/mgs/autoMakefile.am b/lustre/mgs/autoMakefile.am
new file mode 100644 (file)
index 0000000..53734b0
--- /dev/null
@@ -0,0 +1,11 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+if MODULES
+modulefs_DATA = mgs$(KMODEXT)
+endif
+
+MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ 
+DIST_SOURCES := $(mgs-objs:%.o=%.c) mgs_internal.h
diff --git a/lustre/mgs/lproc_mgs.c b/lustre/mgs/lproc_mgs.c
new file mode 100644 (file)
index 0000000..d1ce512
--- /dev/null
@@ -0,0 +1,55 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (C) 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of the Lustre file system, http://www.lustre.org
+ *   Lustre is a trademark of Cluster File Systems, Inc.
+ *
+ *   You may have signed or agreed to another license before downloading
+ *   this software.  If so, you are bound by the terms and conditions
+ *   of that agreement, and the following does not apply to you.  See the
+ *   LICENSE file included with this distribution for more information.
+ *
+ *   If you did not agree to a different license, then this copy of Lustre
+ *   is open source software; you can redistribute it and/or modify it
+ *   under the terms of version 2 of the GNU General Public License as
+ *   published by the Free Software Foundation.
+ *
+ *   In either case, Lustre is distributed in the hope that it will be
+ *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   license text for more details.
+ *
+ */
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/version.h>
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+#include <asm/statfs.h>
+#endif
+#include <obd.h>
+#include <obd_class.h>
+#include <lprocfs_status.h>
+#include "mgs_internal.h"
+
+#ifdef LPROCFS
+struct lprocfs_vars lprocfs_mgs_obd_vars[] = {
+        { 0 }
+};
+
+struct lprocfs_vars lprocfs_mgs_module_vars[] = {
+        { 0 }
+};
+
+struct lprocfs_vars lprocfs_mgt_obd_vars[] = {
+        { 0 }
+};
+
+struct lprocfs_vars lprocfs_mgt_module_vars[] = {
+        { 0 }
+};
+
+LPROCFS_INIT_VARS(mgs, lprocfs_mgs_module_vars, lprocfs_mgs_obd_vars);
+LPROCFS_INIT_VARS(mgt, lprocfs_mgt_module_vars, lprocfs_mgt_obd_vars);
+#endif
diff --git a/lustre/mgs/mgs_fs.c b/lustre/mgs/mgs_fs.c
new file mode 100644 (file)
index 0000000..8151705
--- /dev/null
@@ -0,0 +1,200 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  lustre/mgs/mgs_fs.c
+ *  Lustre Management Server (MGS) filesystem interface code
+ *
+ *  Copyright (C) 2006 Cluster File Systems, Inc.
+ *   Author: Nathan Rutman <nathan@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_MGS
+
+#include <linux/module.h>
+#include <linux/kmod.h>
+#include <linux/version.h>
+#include <linux/sched.h>
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+#include <linux/mount.h>
+#endif
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_disk.h>
+#include <lustre_lib.h>
+#include <lustre_fsfilt.h>
+#include <libcfs/list.h>
+#include "mgs_internal.h"
+
+/* Same as mds_fid2dentry */
+/* Look up an entry by inode number. */
+/* this function ONLY returns valid dget'd dentries with an initialized inode
+   or errors */
+static struct dentry *mgs_fid2dentry(struct mgs_obd *mgs, struct ll_fid *fid)
+{
+        char fid_name[32];
+        unsigned long ino = fid->id;
+        __u32 generation = fid->generation;
+        struct inode *inode;
+        struct dentry *result;
+
+        CDEBUG(D_DENTRY, "--> mgs_fid2dentry: ino/gen %lu/%u, sb %p\n",
+               ino, generation, mgs->mgs_sb);
+
+        if (ino == 0)
+                RETURN(ERR_PTR(-ESTALE));
+        
+        snprintf(fid_name, sizeof(fid_name), "0x%lx", ino);
+        
+        /* under ext3 this is neither supposed to return bad inodes
+           nor NULL inodes. */
+        result = ll_lookup_one_len(fid_name, mgs->mgs_fid_de, strlen(fid_name));
+        if (IS_ERR(result))
+                RETURN(result);
+
+        inode = result->d_inode;
+        if (!inode)
+                RETURN(ERR_PTR(-ENOENT));
+
+        if (inode->i_generation == 0 || inode->i_nlink == 0) {
+                LCONSOLE_WARN("Found inode with zero generation or link -- this"
+                              " may indicate disk corruption (inode: %lu, link:"
+                              " %lu, count: %d)\n", inode->i_ino,
+                              (unsigned long)inode->i_nlink,
+                              atomic_read(&inode->i_count));
+                l_dput(result);
+                RETURN(ERR_PTR(-ENOENT));
+        }
+
+        if (generation && inode->i_generation != generation) {
+                /* we didn't find the right inode.. */
+                CDEBUG(D_INODE, "found wrong generation: inode %lu, link: %lu, "
+                       "count: %d, generation %u/%u\n", inode->i_ino,
+                       (unsigned long)inode->i_nlink,
+                       atomic_read(&inode->i_count), inode->i_generation,
+                       generation);
+                l_dput(result);
+                RETURN(ERR_PTR(-ENOENT));
+        }
+
+        RETURN(result);
+}
+
+static struct dentry *mgs_lvfs_fid2dentry(__u64 id, __u32 gen, __u64 gr,
+                                          void *data)
+{
+        struct obd_device *obd = data;
+        struct ll_fid fid;
+        fid.id = id;
+        fid.generation = gen;
+        return mgs_fid2dentry(&obd->u.mgs, &fid);
+}
+
+struct lvfs_callback_ops mgs_lvfs_ops = {
+        l_fid2dentry:     mgs_lvfs_fid2dentry,
+};
+
+int mgs_fs_setup(struct obd_device *obd, struct vfsmount *mnt)
+{
+        struct mgs_obd *mgs = &obd->u.mgs;
+        struct lvfs_run_ctxt saved;
+        struct dentry *dentry;
+        int rc;
+        ENTRY;
+
+        // FIXME what's this?
+        rc = cleanup_group_info();
+        if (rc)
+                RETURN(rc);
+
+        mgs->mgs_vfsmnt = mnt;
+        mgs->mgs_sb = mnt->mnt_root->d_inode->i_sb;
+
+        fsfilt_setup(obd, mgs->mgs_sb);
+
+        OBD_SET_CTXT_MAGIC(&obd->obd_lvfs_ctxt);
+        obd->obd_lvfs_ctxt.pwdmnt = mnt;
+        obd->obd_lvfs_ctxt.pwd = mnt->mnt_root;
+        obd->obd_lvfs_ctxt.fs = get_ds();
+        obd->obd_lvfs_ctxt.cb_ops = mgs_lvfs_ops;
+
+        push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+
+        /* Setup the configs dir */
+        dentry = simple_mkdir(current->fs->pwd, MOUNT_CONFIGS_DIR, 0777, 1);
+        if (IS_ERR(dentry)) {
+                rc = PTR_ERR(dentry);
+                CERROR("cannot create %s directory: rc = %d\n", 
+                       MOUNT_CONFIGS_DIR, rc);
+                GOTO(err_pop, rc);
+        }
+        mgs->mgs_configs_dir = dentry;
+
+        /* Need the iopen dir for fid2dentry, required by
+           LLOG_ORIGIN_HANDLE_READ_HEADER */
+        dentry = lookup_one_len("__iopen__", current->fs->pwd,
+                                strlen("__iopen__"));
+        if (IS_ERR(dentry)) {
+                rc = PTR_ERR(dentry);
+                CERROR("cannot lookup __iopen__ directory: rc = %d\n", rc);
+                GOTO(err_configs, rc);
+        }
+        mgs->mgs_fid_de = dentry;
+        if (!dentry->d_inode || is_bad_inode(dentry->d_inode)) {
+                rc = -ENOENT;
+                CERROR("__iopen__ directory has no inode? rc = %d\n", rc);
+                GOTO(err_fid, rc);
+        }
+
+err_pop:
+        pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+        return rc;
+err_fid:
+        dput(mgs->mgs_fid_de);
+err_configs:
+        dput(mgs->mgs_configs_dir);
+        goto err_pop;
+}
+
+int mgs_fs_cleanup(struct obd_device *obd)
+{
+        struct mgs_obd *mgs = &obd->u.mgs;
+        struct lvfs_run_ctxt saved;
+        int rc = 0;
+
+        class_disconnect_exports(obd); /* cleans up client info too */
+
+        push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+
+        if (mgs->mgs_configs_dir) {
+                /*CERROR("configs dir dcount=%d\n",
+                       atomic_read(&mgs->mgs_configs_dir->d_count));*/
+                l_dput(mgs->mgs_configs_dir);
+                mgs->mgs_configs_dir = NULL;
+        }
+
+        shrink_dcache_parent(mgs->mgs_fid_de);
+        /*CERROR("fid dir dcount=%d\n",
+               atomic_read(&mgs->mgs_fid_de->d_count));*/
+        dput(mgs->mgs_fid_de);
+
+        pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+
+        return rc;
+}
diff --git a/lustre/mgs/mgs_handler.c b/lustre/mgs/mgs_handler.c
new file mode 100644 (file)
index 0000000..94dc87c
--- /dev/null
@@ -0,0 +1,698 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  lustre/mgs/mgs_handler.c
+ *  Lustre Management Server (mgs) request handler
+ *
+ *  Copyright (C) 2006 Cluster File Systems, Inc.
+ *   Author: Nathan Rutman <nathan@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_MGS
+#define D_MGS D_CONFIG/*|D_WARNING*/
+
+#ifdef __KERNEL__
+# include <linux/module.h>
+# include <linux/pagemap.h>
+# include <linux/miscdevice.h>
+# include <linux/init.h>
+#else
+# include <liblustre.h>
+#endif
+
+#include <obd_class.h>
+#include <lustre_dlm.h>
+#include <lprocfs_status.h>
+#include <lustre_fsfilt.h>
+#include <lustre_commit_confd.h>
+#include <lustre_disk.h>
+#include <lustre_ver.h>
+#include "mgs_internal.h"
+
+
+/* Establish a connection to the MGS.*/
+static int mgs_connect(struct lustre_handle *conn, struct obd_device *obd,
+                       struct obd_uuid *cluuid, struct obd_connect_data *data)
+{
+        struct obd_export *exp;
+        int rc;
+        ENTRY;
+
+        if (!conn || !obd || !cluuid)
+                RETURN(-EINVAL);
+
+        rc = class_connect(conn, obd, cluuid);
+        if (rc)
+                RETURN(rc);
+        exp = class_conn2export(conn);
+        LASSERT(exp);
+
+        if (data != NULL) {
+                data->ocd_connect_flags &= MGS_CONNECT_SUPPORTED;
+                exp->exp_connect_flags = data->ocd_connect_flags;
+                data->ocd_version = LUSTRE_VERSION_CODE;
+        }
+
+        if (rc) {
+                class_disconnect(exp);
+        } else {
+                class_export_put(exp);
+        }
+
+        RETURN(rc);
+}
+
+static int mgs_disconnect(struct obd_export *exp)
+{
+        unsigned long irqflags;
+        int rc;
+        ENTRY;
+
+        LASSERT(exp);
+        class_export_get(exp);
+
+        /* Disconnect early so that clients can't keep using export */
+        rc = class_disconnect(exp);
+        ldlm_cancel_locks_for_export(exp);
+
+        /* complete all outstanding replies */
+        spin_lock_irqsave(&exp->exp_lock, irqflags);
+        while (!list_empty(&exp->exp_outstanding_replies)) {
+                struct ptlrpc_reply_state *rs =
+                        list_entry(exp->exp_outstanding_replies.next,
+                                   struct ptlrpc_reply_state, rs_exp_list);
+                struct ptlrpc_service *svc = rs->rs_service;
+
+                spin_lock(&svc->srv_lock);
+                list_del_init(&rs->rs_exp_list);
+                ptlrpc_schedule_difficult_reply(rs);
+                spin_unlock(&svc->srv_lock);
+        }
+        spin_unlock_irqrestore(&exp->exp_lock, irqflags);
+
+        class_export_put(exp);
+        RETURN(rc);
+}
+
+static int mgs_cleanup(struct obd_device *obd);
+static int mgs_handle(struct ptlrpc_request *req);
+
+/* Start the MGS obd */
+static int mgs_setup(struct obd_device *obd, obd_count len, void *buf)
+{
+        struct lprocfs_static_vars lvars;
+        struct mgs_obd *mgs = &obd->u.mgs;
+        struct lustre_mount_info *lmi;
+        struct lustre_sb_info *lsi;
+        struct vfsmount *mnt;
+        int rc = 0;
+        ENTRY;
+
+        CDEBUG(D_CONFIG, "Starting MGS\n");
+
+        /* Find our disk */
+        lmi = server_get_mount(obd->obd_name);
+        if (!lmi) 
+                RETURN(rc = -EINVAL);
+
+        mnt = lmi->lmi_mnt;
+        lsi = s2lsi(lmi->lmi_sb);
+        obd->obd_fsops = fsfilt_get_ops(MT_STR(lsi->lsi_ldd));
+        if (IS_ERR(obd->obd_fsops))
+                GOTO(err_put, rc = PTR_ERR(obd->obd_fsops));
+
+        /* namespace for mgs llog */
+        obd->obd_namespace = ldlm_namespace_new("MGS", LDLM_NAMESPACE_SERVER);
+        if (obd->obd_namespace == NULL) {
+                mgs_cleanup(obd);
+                GOTO(err_ops, rc = -ENOMEM);
+        }
+
+        /* ldlm setup */
+        ptlrpc_init_client(LDLM_CB_REQUEST_PORTAL, LDLM_CB_REPLY_PORTAL,
+                           "mgs_ldlm_client", &obd->obd_ldlm_client);
+
+        LASSERT(!lvfs_check_rdonly(lvfs_sbdev(mnt->mnt_sb)));
+
+        rc = mgs_fs_setup(obd, mnt);
+        if (rc) {
+                CERROR("%s: MGS filesystem method init failed: rc = %d\n",
+                       obd->obd_name, rc);
+                GOTO(err_ns, rc);
+        }
+
+        rc = llog_start_commit_thread();
+        if (rc < 0)
+                GOTO(err_fs, rc);
+
+        rc = llog_setup(obd, LLOG_CONFIG_ORIG_CTXT, obd, 0, NULL,
+                        &llog_lvfs_ops);
+        if (rc)
+                GOTO(err_fs, rc);
+
+        /* Allow reconnect attempts */
+        obd->obd_replayable = 1;
+
+        /* Internal mgs setup */
+        mgs_init_fsdb_list(obd);
+        sema_init(&mgs->mgs_sem, 1);
+
+        /* Start the service threads */
+        mgs->mgs_service =
+                ptlrpc_init_svc(MGS_NBUFS, MGS_BUFSIZE, MGS_MAXREQSIZE,
+                                MGS_MAXREPSIZE, MGS_REQUEST_PORTAL, 
+                                MGC_REPLY_PORTAL, MGS_SERVICE_WATCHDOG_TIMEOUT,
+                                mgs_handle, LUSTRE_MGS_NAME, 
+                                obd->obd_proc_entry, NULL, MGS_NUM_THREADS);
+
+        if (!mgs->mgs_service) {
+                CERROR("failed to start service\n");
+                GOTO(err_fs, rc = -ENOMEM);
+        }
+
+        rc = ptlrpc_start_threads(obd, mgs->mgs_service, "ll_mgs");
+        if (rc)
+                GOTO(err_thread, rc);
+
+        /* Setup proc */
+        lprocfs_init_vars(mgs, &lvars);
+        lprocfs_obd_setup(obd, lvars.obd_vars);
+
+        ping_evictor_start();
+
+        LCONSOLE_INFO("MGS %s started\n", obd->obd_name);
+
+        RETURN(0);
+
+err_thread:
+        ptlrpc_unregister_service(mgs->mgs_service);
+err_fs:
+        /* No extra cleanup needed for llog_init_commit_thread() */
+        mgs_fs_cleanup(obd);
+err_ns:
+        ldlm_namespace_free(obd->obd_namespace, 0);
+        obd->obd_namespace = NULL;
+err_ops:
+        fsfilt_put_ops(obd->obd_fsops);
+err_put:
+        server_put_mount(obd->obd_name, mgs->mgs_vfsmnt);
+        mgs->mgs_sb = 0;
+        return rc;
+}
+
+static int mgs_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
+{
+        int rc = 0;
+        ENTRY;
+
+        switch (stage) {
+        case OBD_CLEANUP_EARLY:
+        case OBD_CLEANUP_EXPORTS:
+                break;
+        case OBD_CLEANUP_SELF_EXP:
+                llog_cleanup(llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT));
+                rc = obd_llog_finish(obd, 0);
+                break;
+        case OBD_CLEANUP_OBD:
+                break;
+        }
+        RETURN(rc);
+}
+
+static int mgs_ldlm_nsfree(void *data)
+{
+        struct ldlm_namespace *ns = (struct ldlm_namespace *)data;
+        int rc;
+        ENTRY;
+
+        ptlrpc_daemonize("ll_mgs_nsfree");
+        rc = ldlm_namespace_free(ns, 1 /* obd_force should always be on */);
+        RETURN(rc);
+}
+
+static int mgs_cleanup(struct obd_device *obd)
+{
+        struct mgs_obd *mgs = &obd->u.mgs;
+        lvfs_sbdev_type save_dev;
+        ENTRY;
+
+        ping_evictor_stop();
+
+        if (mgs->mgs_sb == NULL)
+                RETURN(0);
+
+        save_dev = lvfs_sbdev(mgs->mgs_sb);
+        
+        ptlrpc_unregister_service(mgs->mgs_service);
+
+        lprocfs_obd_cleanup(obd);
+
+        mgs_cleanup_fsdb_list(obd);
+
+        mgs_fs_cleanup(obd);
+
+        server_put_mount(obd->obd_name, mgs->mgs_vfsmnt);
+        mgs->mgs_sb = NULL;
+
+        /* Free the namespace in it's own thread, so that if the 
+           ldlm_cancel_handler put the last mgs obd ref, we won't 
+           deadlock here. */
+        cfs_kernel_thread(mgs_ldlm_nsfree, obd->obd_namespace, 
+                          CLONE_VM | CLONE_FILES);
+
+        lvfs_clear_rdonly(save_dev);
+
+        fsfilt_put_ops(obd->obd_fsops);
+
+        LCONSOLE_INFO("%s has stopped.\n", obd->obd_name);
+        RETURN(0);
+}
+
+/* similar to filter_prepare_destroy */
+static int mgs_get_cfg_lock(struct obd_device *obd, char *fsname,
+                            struct lustre_handle *lockh)
+{
+        struct ldlm_res_id res_id;
+        int rc, flags = 0;
+        ENTRY;
+
+        rc = mgc_logname2resid(fsname, &res_id);
+        if (!rc) 
+                rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, res_id,
+                                      LDLM_PLAIN, NULL, LCK_EX, &flags,
+                                      ldlm_blocking_ast, ldlm_completion_ast, 
+                                      NULL, fsname, NULL, 0, NULL, lockh);
+        if (rc) 
+                CERROR("can't take cfg lock for %s (%d)\n", fsname, rc);
+        
+        RETURN(rc);
+}
+
+static int mgs_put_cfg_lock(struct lustre_handle *lockh)
+{
+        ENTRY;
+        ldlm_lock_decref(lockh, LCK_EX);
+        RETURN(0);
+}
+
+/* rc=0 means ok */
+static int mgs_check_target(struct obd_device *obd, struct mgs_target_info *mti)
+{
+        int rc;
+        ENTRY;
+
+        rc = mgs_check_index(obd, mti);
+        if (rc == 0) {
+                LCONSOLE_ERROR("Index for %s has disappeared!  "
+                               "Regenerating this portion of the logs."
+                               "\n", mti->mti_svname);
+                mti->mti_flags |= LDD_F_UPDATE;
+                rc = 1;
+        } else if (rc == -1) {
+                LCONSOLE_ERROR("Client log %s-client has disappeared! "
+                               "Regenerating all logs.\n",
+                               mti->mti_fsname);
+                mti->mti_flags |= LDD_F_WRITECONF;
+                rc = 1;
+        } else {
+                /* Index is correctly marked as used */
+
+                /* If the logs don't contain the mti_nids then add 
+                   them as failover nids */
+                rc = mgs_check_failnid(obd, mti);
+        }
+
+
+        RETURN(rc);
+}
+
+/* Called whenever a target starts up.  Flags indicate first connect, etc. */
+static int mgs_handle_target_reg(struct ptlrpc_request *req)
+{    
+        struct obd_device *obd = req->rq_export->exp_obd;
+        struct lustre_handle lockh;
+        struct mgs_target_info *mti, *rep_mti;
+        int rep_size = sizeof(*mti);
+        int rc = 0, lockrc;
+        ENTRY;
+
+        mti = lustre_swab_reqbuf(req, 0, sizeof(*mti),
+                                 lustre_swab_mgs_target_info);
+        
+        if (!(mti->mti_flags & (LDD_F_WRITECONF | LDD_F_UPGRADE14 |
+                                LDD_F_UPDATE))) {
+                /* We're just here as a startup ping. */
+                CDEBUG(D_MGS, "Server %s is running on %s\n",
+                       mti->mti_svname, obd_export_nid2str(req->rq_export));
+                rc = mgs_check_target(obd, mti);
+                /* above will set appropriate mti flags */
+                if (!rc) 
+                        /* Nothing wrong, don't revoke lock */
+                        GOTO(out_nolock, rc);
+        }
+
+        /* Revoke the config lock to make sure nobody is reading. */
+        /* Although actually I think it should be alright if
+           someone was reading while we were updating the logs - if we 
+           revoke at the end they will just update from where they left off. */
+        lockrc = mgs_get_cfg_lock(obd, mti->mti_fsname, &lockh);
+        if (lockrc != ELDLM_OK) {
+                LCONSOLE_ERROR("%s: Can't signal other nodes to update "
+                               "their configuration (%d). Updating local logs "
+                               "anyhow; you might have to manually restart "
+                               "other nodes to get the latest configuration.\n",
+                               obd->obd_name, lockrc);
+        }
+
+        /* Log writing contention is handled by the fsdb_sem */
+
+        if (mti->mti_flags & LDD_F_WRITECONF) {
+                rc = mgs_erase_logs(obd, mti->mti_fsname);
+                mti->mti_flags |= LDD_F_UPDATE;
+                LCONSOLE_WARN("%s: Logs for fs %s were removed by user request."
+                              " All servers must re-register in order to "
+                              "regenerate the client log.\n",
+                              obd->obd_name, mti->mti_fsname);
+                mti->mti_flags &= ~LDD_F_WRITECONF;
+        }
+
+        /* COMPAT_146 */
+        if (mti->mti_flags & LDD_F_UPGRADE14) {
+                rc = mgs_upgrade_sv_14(obd, mti);
+                if (rc) {
+                        CERROR("Can't upgrade from 1.4 (%d)\n", rc);
+                        GOTO(out, rc);
+                }
+                
+                mti->mti_flags &= ~LDD_F_UPGRADE14;
+                /* Turn off the upgrade flag permanently */
+                mti->mti_flags |= LDD_F_REWRITE_LDD;
+        }
+        /* end COMPAT_146 */
+
+        if (mti->mti_flags & LDD_F_UPDATE) {
+                CDEBUG(D_MGS, "adding %s, index=%d\n", mti->mti_svname, 
+                       mti->mti_stripe_index);
+                
+                /* create the log for the new target 
+                   and update the client/mdt logs */
+                rc = mgs_write_log_target(obd, mti);
+                if (rc) {
+                        CERROR("Failed to write %s log (%d)\n", 
+                               mti->mti_svname, rc);
+                        GOTO(out, rc);
+                }
+
+                mti->mti_flags &= ~(LDD_F_VIRGIN | LDD_F_UPDATE | 
+                                    LDD_F_NEED_INDEX);
+                mti->mti_flags |= LDD_F_REWRITE_LDD;
+        }
+
+out:
+        /* done with log update */
+        if (lockrc == ELDLM_OK)
+                mgs_put_cfg_lock(&lockh);
+out_nolock:
+        CDEBUG(D_MGS, "replying with %s, index=%d, rc=%d\n", mti->mti_svname, 
+               mti->mti_stripe_index, rc);
+        lustre_pack_reply(req, 1, &rep_size, NULL); 
+        /* send back the whole mti in the reply */
+        rep_mti = lustre_msg_buf(req->rq_repmsg, 0, sizeof(*rep_mti));
+        memcpy(rep_mti, mti, sizeof(*rep_mti));
+        RETURN(rc);
+}
+
+int mgs_handle(struct ptlrpc_request *req)
+{
+        int fail = OBD_FAIL_MGS_ALL_REPLY_NET;
+        int rc = 0;
+        ENTRY;
+
+        OBD_FAIL_RETURN(OBD_FAIL_MGS_ALL_REQUEST_NET | OBD_FAIL_ONCE, 0);
+
+        LASSERT(current->journal_info == NULL);
+        if (req->rq_reqmsg->opc != MGS_CONNECT) {
+                if (req->rq_export == NULL) {
+                        CERROR("lustre_mgs: operation %d on unconnected MGS\n",
+                               req->rq_reqmsg->opc);
+                        req->rq_status = -ENOTCONN;
+                        GOTO(out, rc = -ENOTCONN);
+                }
+        }
+
+        switch (req->rq_reqmsg->opc) {
+        case MGS_CONNECT:
+                DEBUG_REQ(D_MGS, req, "connect");
+                rc = target_handle_connect(req, mgs_handle);
+                if (!rc && (req->rq_reqmsg->conn_cnt > 1))
+                        /* Make clients trying to reconnect after a MGS restart
+                           happy; also requires obd_replayable */
+                        lustre_msg_add_op_flags(req->rq_repmsg,
+                                                MSG_CONNECT_RECONNECT);
+                break;
+        case MGS_DISCONNECT:
+                DEBUG_REQ(D_MGS, req, "disconnect");
+                rc = target_handle_disconnect(req);
+                req->rq_status = rc;            /* superfluous? */
+                break;
+        case MGS_TARGET_REG:
+                DEBUG_REQ(D_MGS, req, "target add\n");
+                rc = mgs_handle_target_reg(req);
+                break;
+        case MGS_TARGET_DEL:
+                DEBUG_REQ(D_MGS, req, "target del\n");
+                //rc = mgs_handle_target_del(req);
+                break;
+
+        case LDLM_ENQUEUE:
+                DEBUG_REQ(D_MGS, req, "enqueue");
+                rc = ldlm_handle_enqueue(req, ldlm_server_completion_ast,
+                                         ldlm_server_blocking_ast, NULL);
+                fail = OBD_FAIL_LDLM_REPLY;
+                break;
+        case LDLM_BL_CALLBACK:
+        case LDLM_CP_CALLBACK:
+                DEBUG_REQ(D_MGS, req, "callback");
+                CERROR("callbacks should not happen on MGS\n");
+                LBUG();
+                break;
+
+        case OBD_PING:
+                DEBUG_REQ(D_INFO, req, "ping");
+                rc = target_handle_ping(req);
+                break;
+        case OBD_LOG_CANCEL:
+                DEBUG_REQ(D_MGS, req, "log cancel\n");
+                rc = -ENOTSUPP; /* la la la */
+                break;
+
+        case LLOG_ORIGIN_HANDLE_CREATE:
+                DEBUG_REQ(D_MGS, req, "llog_init");
+                rc = llog_origin_handle_create(req);
+                break;
+        case LLOG_ORIGIN_HANDLE_NEXT_BLOCK:
+                DEBUG_REQ(D_MGS, req, "llog next block");
+                rc = llog_origin_handle_next_block(req);
+                break;
+        case LLOG_ORIGIN_HANDLE_READ_HEADER:
+                DEBUG_REQ(D_MGS, req, "llog read header");
+                rc = llog_origin_handle_read_header(req);
+                break;
+        case LLOG_ORIGIN_HANDLE_CLOSE:
+                DEBUG_REQ(D_MGS, req, "llog close");
+                rc = llog_origin_handle_close(req);
+                break;
+        case LLOG_CATINFO:
+                DEBUG_REQ(D_MGS, req, "llog catinfo");
+                rc = llog_catinfo(req);
+                break;
+        default:
+                req->rq_status = -ENOTSUPP;
+                rc = ptlrpc_error(req);
+                RETURN(rc);
+        }
+
+        LASSERT(current->journal_info == NULL);
+        
+        CDEBUG(D_CONFIG | (rc?D_ERROR:0), "MGS handle cmd=%d rc=%d\n",
+               req->rq_reqmsg->opc, rc);
+
+ out:
+        target_send_reply(req, rc, fail);
+        RETURN(0);
+}
+
+static inline int mgs_destroy_export(struct obd_export *exp)
+{
+        ENTRY;
+
+        target_destroy_export(exp);
+
+        RETURN(0);
+}
+
+/* from mdt_iocontrol */
+int mgs_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
+                  void *karg, void *uarg)
+{
+        struct obd_device *obd = exp->exp_obd;
+        struct obd_ioctl_data *data = karg;
+        struct lvfs_run_ctxt saved;
+        int rc = 0;
+
+        ENTRY;
+        CDEBUG(D_IOCTL, "handling ioctl cmd %#x\n", cmd);
+
+        switch (cmd) {
+
+        case OBD_IOC_PARAM: {
+                struct lustre_handle lockh;
+                struct lustre_cfg *lcfg;
+                struct llog_rec_hdr rec;
+                char fsname[32], *devname;
+                int lockrc;
+
+                CERROR("MGS param\n");
+
+                rec.lrh_len = llog_data_len(data->ioc_plen1);
+
+                if (data->ioc_type == LUSTRE_CFG_TYPE) {
+                        rec.lrh_type = OBD_CFG_REC;
+                } else {
+                        CERROR("unknown cfg record type:%d \n", data->ioc_type);
+                        RETURN(-EINVAL);
+                }
+
+                OBD_ALLOC(lcfg, data->ioc_plen1);
+                if (lcfg == NULL)
+                        RETURN(-ENOMEM);
+                rc = copy_from_user(lcfg, data->ioc_pbuf1, data->ioc_plen1);
+                if (rc) 
+                        GOTO(out_free, rc);
+
+                if (lcfg->lcfg_bufcount < 1)
+                        GOTO(out_free, rc = -EINVAL);
+
+                /* Extract fsname */
+                memset(fsname, 0, sizeof(fsname));
+                devname = lustre_cfg_string(lcfg, 0);
+                if (devname) {
+                        char *ptr = strchr(devname, '-');
+                        if (!ptr) {
+                                /* assume devname is the fsname */
+                                strncpy(fsname, devname, sizeof(fsname));
+                        } else {  
+                                strncpy(fsname, devname, ptr - devname);
+                        }
+                        CDEBUG(D_MGS, "set param on fs %s device %s\n", 
+                               fsname, devname);
+                } else {
+                        CDEBUG(D_MGS, "set global param\n");
+                }
+
+                rc = mgs_setparam(obd, fsname, lcfg);
+                if (rc) {
+                        CERROR("setparam err %d\n", rc);
+                        GOTO(out_free, rc);
+                }
+
+                /* Revoke lock so everyone updates.  Should be alright if
+                   someone was already reading while we were updating the logs,
+                   so we don't really need to hold the lock while we're
+                   writing (above). */
+                if (fsname) {
+                        lockrc = mgs_get_cfg_lock(obd, fsname, &lockh);
+                        if (lockrc != ELDLM_OK) 
+                                CERROR("lock error %d for fs %s\n", lockrc, 
+                                       fsname);
+                        else
+                                mgs_put_cfg_lock(&lockh);
+                }
+out_free:
+                OBD_FREE(lcfg, data->ioc_plen1);
+                RETURN(rc);
+        }
+
+        case OBD_IOC_DUMP_LOG: {
+                struct llog_ctxt *ctxt =
+                        llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT);
+                push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+                rc = class_config_dump_llog(ctxt, data->ioc_inlbuf1, NULL);
+                pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+                if (rc)
+                        RETURN(rc);
+
+                RETURN(rc);
+        }
+
+        case OBD_IOC_LLOG_CHECK:
+        case OBD_IOC_LLOG_INFO:
+        case OBD_IOC_LLOG_PRINT: {
+                struct llog_ctxt *ctxt =
+                        llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT);
+
+                push_ctxt(&saved, &ctxt->loc_exp->exp_obd->obd_lvfs_ctxt, NULL);
+                rc = llog_ioctl(ctxt, cmd, data);
+                pop_ctxt(&saved, &ctxt->loc_exp->exp_obd->obd_lvfs_ctxt, NULL);
+
+                RETURN(rc);
+        }
+
+        default:
+                CDEBUG(D_INFO, "unknown command %x\n", cmd);
+                RETURN(-EINVAL);
+        }
+        RETURN(0);
+}
+
+/* use obd ops to offer management infrastructure */
+static struct obd_ops mgs_obd_ops = {
+        .o_owner           = THIS_MODULE,
+        .o_connect         = mgs_connect,
+        .o_disconnect      = mgs_disconnect,
+        .o_setup           = mgs_setup,
+        .o_precleanup      = mgs_precleanup,
+        .o_cleanup         = mgs_cleanup,
+        .o_destroy_export  = mgs_destroy_export,
+        .o_iocontrol       = mgs_iocontrol,
+};
+
+static int __init mgs_init(void)
+{
+        struct lprocfs_static_vars lvars;
+
+        lprocfs_init_vars(mgs, &lvars);
+        class_register_type(&mgs_obd_ops, lvars.module_vars, LUSTRE_MGS_NAME);
+
+        return 0;
+}
+
+static void /*__exit*/ mgs_exit(void)
+{
+        class_unregister_type(LUSTRE_MGS_NAME);
+}
+
+MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_DESCRIPTION("Lustre  Management Server (MGS)");
+MODULE_LICENSE("GPL");
+
+module_init(mgs_init);
+module_exit(mgs_exit);
diff --git a/lustre/mgs/mgs_internal.h b/lustre/mgs/mgs_internal.h
new file mode 100644 (file)
index 0000000..688055c
--- /dev/null
@@ -0,0 +1,49 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+
+#ifndef _MGS_INTERNAL_H
+#define _MGS_INTERNAL_H
+
+#ifdef __KERNEL__
+# include <linux/fs.h>
+#endif
+#include <libcfs/kp30.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_lib.h>
+#include <lustre_dlm.h>
+#include <lustre_log.h>
+#include <lustre_export.h>
+
+
+/* MDS has o_t * 1000 */
+#define MGS_SERVICE_WATCHDOG_TIMEOUT (obd_timeout * 10)
+
+/* mgs_llog.c */
+#define FSDB_EMPTY 0x0001
+
+struct fs_db {
+        char              fsdb_name[8];
+        struct list_head  fsdb_list;
+        struct semaphore  fsdb_sem;
+        void*             fsdb_ost_index_map;
+        void*             fsdb_mdt_index_map;
+        __u32             fsdb_flags;
+        __u32             fsdb_gen;
+};
+
+int mgs_init_fsdb_list(struct obd_device *obd);
+int mgs_cleanup_fsdb_list(struct obd_device *obd);
+int mgs_check_index(struct obd_device *obd, struct mgs_target_info *mti);
+int mgs_check_failnid(struct obd_device *obd, struct mgs_target_info *mti);
+int mgs_write_log_target(struct obd_device *obd, struct mgs_target_info *mti);
+int mgs_upgrade_sv_14(struct obd_device *obd, struct mgs_target_info *mti);
+int mgs_erase_logs(struct obd_device *obd, char *fsname);
+int mgs_setparam(struct obd_device *obd, char *fsname, struct lustre_cfg *lcfg);
+
+/* mgs_fs.c */
+int mgs_fs_setup(struct obd_device *obd, struct vfsmount *mnt);
+int mgs_fs_cleanup(struct obd_device *obddev);
+
+
+#endif
diff --git a/lustre/mgs/mgs_llog.c b/lustre/mgs/mgs_llog.c
new file mode 100644 (file)
index 0000000..3face3f
--- /dev/null
@@ -0,0 +1,1654 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  lustre/mgs/mgs_llog.c
+ *  Lustre Management Server (mgs) config llog creation
+ *
+ *  Copyright (C) 2006 Cluster File Systems, Inc.
+ *   Author: Nathan Rutman <nathan@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef EXPORT_SYMTAB
+#define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_MGS
+#define D_MGS D_CONFIG/*|D_WARNING*/
+
+#ifdef __KERNEL__
+#include <linux/module.h>
+#include <linux/pagemap.h>
+#include <linux/fs.h>
+#endif
+
+#include <obd.h>
+#include <obd_class.h>
+#include <lustre_log.h>
+#include <obd_ost.h>
+#include <libcfs/list.h>
+#include <linux/lvfs.h>
+#include <lustre_fsfilt.h>
+#include <lustre_disk.h>
+#include <lustre_param.h>
+#include "mgs_internal.h"
+
+/********************** Class fns ********************/
+
+static int class_dentry_readdir(struct obd_device *obd, struct dentry *dir,
+                                struct vfsmount *inmnt, 
+                                struct list_head *dentry_list){
+        /* see mds_cleanup_pending */
+        struct lvfs_run_ctxt saved;
+        struct file *file;
+        struct dentry *dentry;
+        struct vfsmount *mnt;
+        int rc = 0;
+        ENTRY;
+                                                                                
+        push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+        dentry = dget(dir);
+        if (IS_ERR(dentry))
+                GOTO(out_pop, rc = PTR_ERR(dentry));
+        mnt = mntget(inmnt);
+        if (IS_ERR(mnt)) {
+                l_dput(dentry);
+                GOTO(out_pop, rc = PTR_ERR(mnt));
+        }
+
+        file = dentry_open(dentry, mnt, O_RDONLY);
+        if (IS_ERR(file))
+                /* dentry_open_it() drops the dentry, mnt refs */
+                GOTO(out_pop, rc = PTR_ERR(file));
+                                                                                
+        INIT_LIST_HEAD(dentry_list);
+        rc = l_readdir(file, dentry_list);
+        filp_close(file, 0);
+        /*  filp_close->fput() drops the dentry, mnt refs */
+                                                                                
+out_pop:
+        pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+        RETURN(rc);
+}
+
+/******************** DB functions *********************/
+
+/* from the (client) config log, figure out:
+        1. which ost's/mdt's are configured (by index)
+        2. what the last config step is
+*/
+/* FIXME is it better to have a separate db file, instead of parsing the info
+   out of the client log? */
+static int mgs_fsdb_handler(struct llog_handle *llh, struct llog_rec_hdr *rec, 
+                            void *data)
+{
+        struct fs_db *fsdb = (struct fs_db *)data;
+        int cfg_len = rec->lrh_len;
+        char *cfg_buf = (char*) (rec + 1);
+        struct lustre_cfg *lcfg;
+        __u32 index;
+        int rc = 0;
+        ENTRY;
+
+        if (rec->lrh_type != OBD_CFG_REC) {
+                CERROR("unhandled lrh_type: %#x\n", rec->lrh_type);
+                RETURN(-EINVAL);
+        }
+
+        rc = lustre_cfg_sanity_check(cfg_buf, cfg_len);
+        if (rc) {
+                CERROR("Insane cfg\n");
+                RETURN(rc);
+        }
+
+        lcfg = (struct lustre_cfg *)cfg_buf;
+
+        CDEBUG(D_INFO, "cmd %x %s %s\n", lcfg->lcfg_command, 
+               lustre_cfg_string(lcfg, 0), lustre_cfg_string(lcfg, 1));
+
+        /* Figure out ost indicies */ 
+        /* lov_modify_tgts add 0:lov1  1:ost1_UUID  2(index):0  3(gen):1 */
+        if (lcfg->lcfg_command == LCFG_LOV_ADD_OBD ||
+            lcfg->lcfg_command == LCFG_LOV_DEL_OBD) {
+                index = simple_strtoul(lustre_cfg_string(lcfg, 2),
+                                       NULL, 10);
+                CDEBUG(D_MGS, "OST index for %s is %u (%s)\n",
+                       lustre_cfg_string(lcfg, 1), index, 
+                       lustre_cfg_string(lcfg, 2));
+                set_bit(index, fsdb->fsdb_ost_index_map);
+        }
+        
+        /* Figure out mdt indicies */
+        /* attach   0:MDC_uml1_mdsA_MNT_client  1:mdc  2:1d834_MNT_client_03f */
+        if ((lcfg->lcfg_command == LCFG_ATTACH) &&
+            (strcmp(lustre_cfg_string(lcfg, 1), LUSTRE_MDC_NAME) == 0)) {
+                rc = server_name2index(lustre_cfg_string(lcfg, 0),
+                                       &index, NULL);
+                if (rc != LDD_F_SV_TYPE_MDT) {
+                        CWARN("Unparsable MDC name %s, assuming index 0\n",
+                              lustre_cfg_string(lcfg, 0));
+                        index = 0;
+                }
+                rc = 0;
+                CDEBUG(D_MGS, "MDT index is %u\n", index);
+                set_bit(index, fsdb->fsdb_mdt_index_map);
+        }
+
+        /* Keep track of the latest marker step */
+        if (lcfg->lcfg_command == LCFG_MARKER) {
+                struct cfg_marker *marker;
+                marker = lustre_cfg_buf(lcfg, 1);
+                fsdb->fsdb_gen = max(fsdb->fsdb_gen, marker->cm_step);
+        }
+
+        RETURN(rc);
+}
+
+static int mgs_get_fsdb_from_llog(struct obd_device *obd, char *logname,
+                                struct fs_db *fsdb)
+{
+        struct llog_handle *loghandle;
+        struct lvfs_run_ctxt saved;
+        int rc, rc2;
+        ENTRY;
+
+        down(&fsdb->fsdb_sem);
+        push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+        
+        rc = llog_create(llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT),
+                         &loghandle, NULL, logname);
+        if (rc)
+                GOTO(out_pop, rc);
+
+        rc = llog_init_handle(loghandle, LLOG_F_IS_PLAIN, NULL);
+        if (rc)
+                GOTO(out_close, rc);
+
+        if (llog_get_size(loghandle) <= 1)
+                fsdb->fsdb_flags |= FSDB_EMPTY;
+
+        rc = llog_process(loghandle, mgs_fsdb_handler, (void *)fsdb, NULL);
+        CDEBUG(D_MGS, "get_db = %d\n", rc);
+out_close:
+        rc2 = llog_close(loghandle);
+        if (!rc)
+                rc = rc2;
+
+out_pop:
+        pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+        up(&fsdb->fsdb_sem);
+
+        RETURN(rc);
+}
+
+static int next_index(void *index_map, int map_len)
+{
+        int i;
+        for (i = 0; i < map_len * 8; i++)
+                 if (!test_bit(i, index_map)) {
+                         return i;
+                 }
+        CERROR("max index %d exceeded.\n", i);
+        return -1;
+}
+
+#if 0
+static int count_osts(void *index_map, int map_len)
+{
+       int i, num;
+       for (i = 0, num = 0; i < map_len * 8; i++)
+               if (test_bit(i, index_map))
+                        num++;
+       return num;
+}
+#endif
+
+static struct fs_db *mgs_find_fsdb(struct obd_device *obd, char *fsname)
+{
+        struct mgs_obd *mgs = &obd->u.mgs;
+        struct fs_db *fsdb;
+        struct list_head *tmp;
+
+        list_for_each(tmp, &mgs->mgs_fs_db_list) {
+                fsdb = list_entry(tmp, struct fs_db, fsdb_list);
+                if (strcmp(fsdb->fsdb_name, fsname) == 0)
+                        return fsdb;
+        }
+        return NULL;
+}
+
+#define INDEX_MAP_SIZE 4096
+
+/* caller must hold the mgs->mgs_fs_db_lock */
+static struct fs_db *mgs_new_fsdb(struct obd_device *obd, char *fsname)
+{
+        struct mgs_obd *mgs = &obd->u.mgs;
+        struct fs_db *fsdb;
+        ENTRY;
+        
+        OBD_ALLOC_PTR(fsdb);
+        if (!fsdb) 
+                RETURN(NULL);
+
+        OBD_ALLOC(fsdb->fsdb_ost_index_map, INDEX_MAP_SIZE);
+        OBD_ALLOC(fsdb->fsdb_mdt_index_map, INDEX_MAP_SIZE);
+        if (!fsdb->fsdb_ost_index_map || !fsdb->fsdb_mdt_index_map) {
+                CERROR("No memory for index maps\n");
+                GOTO(err, 0);
+        }
+        
+        strncpy(fsdb->fsdb_name, fsname, sizeof(fsdb->fsdb_name));
+        sema_init(&fsdb->fsdb_sem, 1);
+        list_add(&fsdb->fsdb_list, &mgs->mgs_fs_db_list);
+
+        RETURN(fsdb);
+err:
+        if (fsdb->fsdb_ost_index_map) 
+                OBD_FREE(fsdb->fsdb_ost_index_map, INDEX_MAP_SIZE);
+        if (fsdb->fsdb_mdt_index_map) 
+                OBD_FREE(fsdb->fsdb_mdt_index_map, INDEX_MAP_SIZE);
+        OBD_FREE_PTR(fsdb);
+        RETURN(NULL);
+}
+
+static void mgs_free_fsdb(struct fs_db *fsdb)
+{
+        /* wait for anyone with the sem */
+        down(&fsdb->fsdb_sem);
+        list_del(&fsdb->fsdb_list);
+        OBD_FREE(fsdb->fsdb_ost_index_map, INDEX_MAP_SIZE);
+        OBD_FREE(fsdb->fsdb_mdt_index_map, INDEX_MAP_SIZE);
+        OBD_FREE_PTR(fsdb);
+}
+
+int mgs_init_fsdb_list(struct obd_device *obd)
+{
+        struct mgs_obd *mgs = &obd->u.mgs;
+        INIT_LIST_HEAD(&mgs->mgs_fs_db_list);
+        return 0;
+}
+
+int mgs_cleanup_fsdb_list(struct obd_device *obd)
+{
+        struct mgs_obd *mgs = &obd->u.mgs;
+        struct fs_db *fsdb;
+        struct list_head *tmp, *tmp2;
+        down(&mgs->mgs_sem);
+        list_for_each_safe(tmp, tmp2, &mgs->mgs_fs_db_list) {
+                fsdb = list_entry(tmp, struct fs_db, fsdb_list);
+                mgs_free_fsdb(fsdb);
+        }
+        up(&mgs->mgs_sem);
+        return 0;
+}
+
+static inline int name_create(char *prefix, char *suffix, char **newname)
+{
+        LASSERT(newname);
+        OBD_ALLOC(*newname, strlen(prefix) + strlen(suffix) + 1);
+        if (!*newname) 
+                return -ENOMEM;
+        sprintf(*newname, "%s%s", prefix, suffix);
+        return 0;
+}
+
+static inline void name_destroy(char *name)
+{        
+        if (name)
+                OBD_FREE(name, strlen(name) + 1);
+}
+
+
+static int mgs_find_or_make_fsdb(struct obd_device *obd, char *name, 
+                               struct fs_db **dbh)
+{
+        struct mgs_obd *mgs = &obd->u.mgs;
+        struct fs_db *fsdb;
+        char *cliname;
+        int rc = 0;
+
+        down(&mgs->mgs_sem);
+        fsdb = mgs_find_fsdb(obd, name);
+        if (fsdb) {
+                up(&mgs->mgs_sem);
+                *dbh = fsdb;
+                return 0;
+        }
+
+        CDEBUG(D_MGS, "Creating new db\n");
+        fsdb = mgs_new_fsdb(obd, name);
+        up(&mgs->mgs_sem);
+        if (!fsdb) 
+                return -ENOMEM;
+
+        /* populate the db from the client llog */
+        name_create(name, "-client", &cliname);
+        rc = mgs_get_fsdb_from_llog(obd, cliname, fsdb);
+        name_destroy(cliname);
+        if (rc) {
+                CERROR("Can't get db from llog %d\n", rc);
+                mgs_free_fsdb(fsdb);
+                return rc;
+        }
+
+        *dbh = fsdb;
+        
+        return 0;
+}
+
+/* 1 = index in use
+   0 = index unused 
+   -1= empty client log */
+int mgs_check_index(struct obd_device *obd, struct mgs_target_info *mti)
+{
+        struct fs_db *fsdb;
+        void *imap;
+        int rc = 0;
+        ENTRY;
+
+        LASSERT(!(mti->mti_flags & LDD_F_NEED_INDEX));
+
+        rc = mgs_find_or_make_fsdb(obd, mti->mti_fsname, &fsdb); 
+        if (rc) {
+                CERROR("Can't get db for %s\n", mti->mti_fsname);
+                RETURN(rc);
+        }
+
+        if (fsdb->fsdb_flags & FSDB_EMPTY) 
+                RETURN(-1);
+
+        if (mti->mti_flags & LDD_F_SV_TYPE_OST) 
+                imap = fsdb->fsdb_ost_index_map;
+        else if (mti->mti_flags & LDD_F_SV_TYPE_MDT) 
+                imap = fsdb->fsdb_mdt_index_map;
+        else
+                RETURN(-EINVAL);
+
+        if (test_bit(mti->mti_stripe_index, imap)) 
+                RETURN(1);
+        RETURN(0);
+}
+
+
+int mgs_set_index(struct obd_device *obd, struct mgs_target_info *mti)
+{
+        struct fs_db *fsdb;
+        void *imap;
+        int rc = 0;
+        ENTRY;
+
+        rc = mgs_find_or_make_fsdb(obd, mti->mti_fsname, &fsdb); 
+        if (rc) {
+                CERROR("Can't get db for %s\n", mti->mti_fsname);
+                RETURN(rc);
+        }
+
+        if (mti->mti_flags & LDD_F_SV_TYPE_OST) 
+                imap = fsdb->fsdb_ost_index_map;
+        else if (mti->mti_flags & LDD_F_SV_TYPE_MDT) 
+                imap = fsdb->fsdb_mdt_index_map;
+        else
+                RETURN(-EINVAL);
+
+        if (mti->mti_flags & LDD_F_NEED_INDEX) {
+                rc = next_index(imap, INDEX_MAP_SIZE);
+                if (rc == -1)
+                        RETURN(-ERANGE);
+                mti->mti_stripe_index = rc;
+        }
+
+        /* Remove after CMD */
+        if ((mti->mti_flags & LDD_F_SV_TYPE_MDT) && 
+            (mti->mti_stripe_index > 0)) {
+                LCONSOLE_ERROR("MDT index must = 0 (until Clustered MetaData "
+                               "feature is ready.)\n");
+                mti->mti_stripe_index = 0;
+        }
+
+        if (mti->mti_stripe_index >= INDEX_MAP_SIZE * 8) {
+                LCONSOLE_ERROR("Server %s requested index %d, but the"
+                               "max index is %d.\n", 
+                               mti->mti_svname, mti->mti_stripe_index,
+                               INDEX_MAP_SIZE * 8);
+                RETURN(-ERANGE);
+        }
+         
+        if (test_bit(mti->mti_stripe_index, imap)) {
+                if (mti->mti_flags & LDD_F_VIRGIN) {
+                        LCONSOLE_ERROR("Server %s requested index %d, but that "
+                                       "index is already in use\n",
+                                       mti->mti_svname, mti->mti_stripe_index);
+                        RETURN(-EADDRINUSE);
+                } else {
+                        CERROR("Server %s updating index %d\n",
+                               mti->mti_svname, mti->mti_stripe_index);
+                        RETURN(EALREADY);
+                }
+        }
+
+        set_bit(mti->mti_stripe_index, imap);
+        fsdb->fsdb_flags &= ~FSDB_EMPTY;
+        server_make_name(mti->mti_flags, mti->mti_stripe_index,
+                         mti->mti_fsname, mti->mti_svname);
+
+        CDEBUG(D_MGS, "Set index for %s to %d\n", mti->mti_svname, 
+               mti->mti_stripe_index);
+
+        RETURN(0);
+}
+                           
+/******************** config log recording functions *********************/
+
+static int record_lcfg(struct obd_device *obd, struct llog_handle *llh,
+                         struct lustre_cfg *lcfg)
+{
+        struct lvfs_run_ctxt   saved;
+        struct llog_rec_hdr    rec;
+        int buflen, rc;
+
+        LASSERT(llh);
+        LASSERT(llh->lgh_ctxt);        
+
+        buflen = lustre_cfg_len(lcfg->lcfg_bufcount,
+                                lcfg->lcfg_buflens);
+        rec.lrh_len = llog_data_len(buflen);
+        rec.lrh_type = OBD_CFG_REC;
+
+        push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+        /* idx = -1 means append */
+        rc = llog_write_rec(llh, &rec, NULL, 0, (void *)lcfg, -1);
+        pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+        if (rc) {
+                CERROR("failed %d\n", rc);
+        }
+        LASSERT(!rc);
+        return rc;
+}
+
+static int record_base(struct obd_device *obd, struct llog_handle *llh,
+                     char *cfgname, lnet_nid_t nid, int cmd,
+                     char *s1, char *s2, char *s3, char *s4)
+{
+        struct lustre_cfg_bufs bufs;
+        struct lustre_cfg     *lcfg;
+        int rc;
+               
+        CDEBUG(D_MGS, "lcfg %s %#x %s %s %s %s\n", cfgname,
+               cmd, s1, s2, s3, s4); 
+
+        lustre_cfg_bufs_reset(&bufs, cfgname);
+        if (s1) 
+                lustre_cfg_bufs_set_string(&bufs, 1, s1);
+        if (s2) 
+                lustre_cfg_bufs_set_string(&bufs, 2, s2);
+        if (s3) 
+                lustre_cfg_bufs_set_string(&bufs, 3, s3);
+        if (s4) 
+                lustre_cfg_bufs_set_string(&bufs, 4, s4);
+
+        lcfg = lustre_cfg_new(cmd, &bufs);
+        lcfg->lcfg_nid = nid;
+
+        rc = record_lcfg(obd, llh, lcfg);
+        
+        lustre_cfg_free(lcfg);
+        
+        if (rc) {
+                CERROR("error %d: lcfg %s %#x %s %s %s %s\n", rc, cfgname,
+                       cmd, s1, s2, s3, s4); 
+        }
+        return(rc);
+}
+
+
+static inline int record_add_uuid(struct obd_device *obd, 
+                                  struct llog_handle *llh, 
+                                  uint64_t nid, char *uuid)
+{
+        return record_base(obd,llh,NULL,nid,LCFG_ADD_UUID,uuid,0,0,0);
+
+}
+
+static inline int record_add_conn(struct obd_device *obd, 
+                                  struct llog_handle *llh,
+                                  char *devname,
+                                  char *uuid)
+{
+        return record_base(obd,llh,devname,0,LCFG_ADD_CONN,uuid,0,0,0);
+}
+
+static inline int record_attach(struct obd_device *obd, struct llog_handle *llh,
+                                char *devname, char *type, char *uuid)
+{
+        return record_base(obd,llh,devname,0,LCFG_ATTACH,type,uuid,0,0);
+}
+
+static inline int record_setup(struct obd_device *obd, struct llog_handle *llh,
+                               char *devname, 
+                               char *s1, char *s2, char *s3, char *s4)
+{
+        return record_base(obd,llh,devname,0,LCFG_SETUP,s1,s2,s3,s4);
+}
+
+static int record_lov_setup(struct obd_device *obd, struct llog_handle *llh,
+                            char *devname, struct lov_desc *desc)
+{
+        struct lustre_cfg_bufs bufs;
+        struct lustre_cfg *lcfg;
+        int rc;
+
+        lustre_cfg_bufs_reset(&bufs, devname);
+        lustre_cfg_bufs_set(&bufs, 1, desc, sizeof(*desc));
+        lcfg = lustre_cfg_new(LCFG_SETUP, &bufs);
+
+        rc = record_lcfg(obd, llh, lcfg);
+
+        lustre_cfg_free(lcfg);
+        return rc;
+}
+
+static inline int record_lov_add(struct obd_device *obd,
+                                 struct llog_handle *llh,
+                                 char *lov_name, char *ost_uuid,
+                                 char *index, char *gen)
+{
+        return record_base(obd,llh,lov_name,0,LCFG_LOV_ADD_OBD,
+                           ost_uuid,index,gen,0);
+}                                  
+
+static inline int record_mount_opt(struct obd_device *obd, 
+                                   struct llog_handle *llh,
+                                   char *profile, char *lov_name,
+                                   char *mdc_name)
+{
+        return record_base(obd,llh,NULL,0,LCFG_MOUNTOPT,
+                           profile,lov_name,mdc_name,0);
+}                
+
+static int record_marker(struct obd_device *obd, struct llog_handle *llh,
+                         struct fs_db *fsdb, __u32 flags,
+                         char *svname, char *comment)
+{
+        struct cfg_marker marker;
+        struct timeval tv;
+        struct lustre_cfg_bufs bufs;
+        struct lustre_cfg *lcfg;
+        int rc;
+
+        if (flags & CM_START) 
+                fsdb->fsdb_gen++;
+        marker.cm_step = fsdb->fsdb_gen;
+        marker.cm_flags = flags;
+        strncpy(marker.cm_svname, svname, sizeof(marker.cm_svname)); 
+        strncpy(marker.cm_comment, comment, sizeof(marker.cm_comment)); 
+        do_gettimeofday(&tv);
+        marker.cm_createtime = tv.tv_sec;
+        marker.cm_canceltime = 0;
+        lustre_cfg_bufs_reset(&bufs, NULL);
+        lustre_cfg_bufs_set(&bufs, 1, &marker, sizeof(marker));
+        lcfg = lustre_cfg_new(LCFG_MARKER, &bufs);
+
+        rc = record_lcfg(obd, llh, lcfg);
+
+        lustre_cfg_free(lcfg);
+        return rc;
+}
+
+static int record_start_log(struct obd_device *obd, 
+                            struct llog_handle **llh, char *name)
+{
+        static struct obd_uuid cfg_uuid = { .uuid = "config_uuid" };
+        struct lvfs_run_ctxt saved;
+        int rc = 0;
+        
+        if (*llh) {
+                GOTO(out, rc = -EBUSY);
+        }
+
+        push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+
+        rc = llog_create(llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT),
+                         llh, NULL, name);
+        if (rc == 0)
+                llog_init_handle(*llh, LLOG_F_IS_PLAIN, &cfg_uuid);
+        else
+                *llh = NULL;
+
+        pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+
+out:
+        if (rc) {
+                CERROR("Can't start log %s: %d\n", name, rc);
+        }
+        RETURN(rc);
+}
+
+static int record_end_log(struct obd_device *obd, struct llog_handle **llh)
+{
+        struct lvfs_run_ctxt saved;
+        int rc = 0;
+
+        push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+        
+        rc = llog_close(*llh);
+        *llh = NULL;
+        
+        pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+        RETURN(rc);
+}
+
+static int mgs_log_is_empty(struct obd_device *obd, char *name)
+{
+        struct lvfs_run_ctxt saved;
+        struct llog_handle *llh;
+        int rc = 0;
+
+        /* FIXME cache the empty state in the db */
+
+        push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+        rc = llog_create(llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT),
+                         &llh, NULL, name);
+        if (rc == 0) {
+                llog_init_handle(llh, LLOG_F_IS_PLAIN, NULL);
+                rc = llog_get_size(llh);
+                llog_close(llh);
+        }
+        pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+        /* header is record 1 */
+        return(rc <= 1);
+}
+
+/******************** config "macros" *********************/
+
+/* write an lcfg directly into a log (with markers) */
+static int mgs_write_log_direct(struct obd_device *obd, struct fs_db *fsdb,
+                                char *logname, char *obdname,  
+                                struct lustre_cfg *lcfg)
+{
+        struct llog_handle *llh = NULL;
+        int rc;
+        ENTRY;
+
+        rc = record_start_log(obd, &llh, logname);
+        rc = record_marker(obd, llh, fsdb, CM_START, obdname, "param"); 
+        
+        rc = record_lcfg(obd, llh, lcfg);
+
+        rc = record_marker(obd, llh, fsdb, CM_END, obdname, "param"); 
+        rc = record_end_log(obd, &llh);
+        
+        RETURN(rc);
+}
+
+/* write the lcfg in all logs for the given fs */
+int mgs_write_log_direct_all(struct obd_device *obd, struct fs_db *fsdb,
+                          char *fsname, struct lustre_cfg *lcfg)
+{
+        struct mgs_obd *mgs = &obd->u.mgs;
+        struct list_head dentry_list;
+        struct l_linux_dirent *dirent, *n;
+        char *logname;
+        int rc, len = strlen(fsname);
+        ENTRY;
+        
+        /* We need to set params for any future logs 
+           as well. FIXME Append this file to every new log. */
+        name_create(fsname, "-params", &logname);
+        if (mgs_log_is_empty(obd, logname)) {
+                struct llog_handle *llh = NULL;
+                rc = record_start_log(obd, &llh, logname);
+                rc = record_end_log(obd, &llh);
+        }
+        name_destroy(logname);
+
+        /* Find all the logs in the CONFIGS directory */
+        rc = class_dentry_readdir(obd, mgs->mgs_configs_dir,
+                                  mgs->mgs_vfsmnt, &dentry_list);
+        if (rc) {
+                CERROR("Can't read %s dir\n", MOUNT_CONFIGS_DIR);
+                RETURN(rc);
+        }
+                                                                                
+        /* Could use fsdb index maps instead of directory listing */
+        list_for_each_entry_safe(dirent, n, &dentry_list, lld_list) {
+                list_del(&dirent->lld_list);
+                if (strncmp(fsname, dirent->lld_name, len) == 0) {
+                        CDEBUG(D_MGS, "Changing log %s\n", dirent->lld_name);
+                        rc = mgs_write_log_direct(obd, fsdb, dirent->lld_name,
+                                                  dirent->lld_name, lcfg);
+                }
+                OBD_FREE(dirent, sizeof(*dirent));
+        }
+        
+        RETURN(rc);
+}
+
+/* lov is the first thing in the mdt and client logs */
+static int mgs_write_log_lov(struct obd_device *obd, struct fs_db *fsdb, 
+                             struct mgs_target_info *mti,
+                             char *logname, char *lovname)
+{
+        struct llog_handle *llh = NULL;
+        struct lov_desc *lovdesc;
+        char *uuid;
+        int rc = 0;
+        ENTRY;
+
+        CDEBUG(D_MGS, "Writing log %s\n", logname);
+
+        /*
+        #01 L attach   0:lov_mdsA  1:lov  2:71ccb_lov_mdsA_19f961a9e1
+        #02 L lov_setup 0:lov_mdsA 1:(struct lov_desc)
+              uuid=lov1_UUID, stripe count=1, size=1048576, offset=0, pattern=0
+        */
+
+        /* FIXME just make lov_setup accept empty desc (put uuid in buf 2) */
+        OBD_ALLOC(lovdesc, sizeof(*lovdesc));
+        if (lovdesc == NULL)
+                RETURN(-ENOMEM);
+        lovdesc->ld_magic = LOV_DESC_MAGIC;
+        lovdesc->ld_tgt_count = 0;
+        /* Defaults.  Can be changed later by lcfg config_param */ 
+        lovdesc->ld_default_stripe_count = 1;
+        lovdesc->ld_pattern = LOV_PATTERN_RAID0;
+        lovdesc->ld_default_stripe_size = 1024 * 1024;
+        lovdesc->ld_default_stripe_offset = 0;
+        sprintf((char*)lovdesc->ld_uuid.uuid, "%s_UUID", lovname);
+        /* can these be the same? */
+        uuid = (char *)lovdesc->ld_uuid.uuid;
+
+        /* This should always be the first entry in a log.
+        rc = mgs_clear_log(obd, logname); */
+        rc = record_start_log(obd, &llh, logname);
+        rc = record_marker(obd, llh, fsdb, CM_START, lovname, "lov setup"); 
+        rc = record_attach(obd, llh, lovname, "lov", uuid);
+        rc = record_lov_setup(obd, llh, lovname, lovdesc);
+        rc = record_marker(obd, llh, fsdb, CM_END, lovname, "lov setup"); 
+        rc = record_end_log(obd, &llh);
+        
+        OBD_FREE(lovdesc, sizeof(*lovdesc));
+        RETURN(rc);
+}
+
+/* add failnids to open log */
+static int mgs_write_log_failnids(struct obd_device *obd,
+                                  struct mgs_target_info *mti,
+                                  struct llog_handle *llh,
+                                  char *cliname)
+{
+        char *failnodeuuid = NULL;
+        char *ptr = mti->mti_params;
+        lnet_nid_t nid;
+        int rc = 0;
+
+        /*
+        #03 L add_uuid  nid=uml1@tcp(0x20000c0a80201) nal=90 0:  1:uml1_UUID
+        #04 L add_uuid  nid=1@elan(0x1000000000001)   nal=90 0:  1:uml1_UUID
+        #05 L setup    0:OSC_uml1_ost1_mdsA  1:ost1_UUID  2:uml1_UUID
+        #06 L add_uuid  nid=uml2@tcp(0x20000c0a80202) nal=90 0:  1:uml2_UUID
+        #0x L add_uuid  nid=2@elan(0x1000000000002)   nal=90 0:  1:uml2_UUID
+        #07 L add_conn 0:OSC_uml1_ost1_mdsA  1:uml2_UUID
+        */
+
+        /* Pull failnid info out of params string */
+        while (class_find_param(ptr, PARAM_FAILNODE, &ptr) == 0) {
+                while (class_parse_nid(ptr, &nid, &ptr) == 0) {
+                        if (failnodeuuid == NULL) {
+                                /* We don't know the failover node name, 
+                                   so just use the first nid as the uuid */
+                                rc = name_create(libcfs_nid2str(nid), "",
+                                                 &failnodeuuid);
+                                if (rc) 
+                                        return rc;
+                        }
+                        CDEBUG(D_MGS, "add nid %s for failover uuid %s, "
+                               "client %s\n", libcfs_nid2str(nid),
+                               failnodeuuid, cliname);
+                        rc = record_add_uuid(obd, llh, nid, failnodeuuid);
+                }
+                if (failnodeuuid) {
+                        rc = record_add_conn(obd, llh, cliname, failnodeuuid);
+                        name_destroy(failnodeuuid);
+                        failnodeuuid = NULL;
+                }
+        }
+
+        return rc;
+}
+
+static int mgs_write_log_mdt(struct obd_device *obd, struct fs_db *fsdb,
+                             struct mgs_target_info *mti)
+{
+        struct llog_handle *llh = NULL;
+        char *cliname, *mdcname, *lovname, *nodeuuid, *mdcuuid;
+        int rc, i, first_log = 0;
+        ENTRY;
+
+        CDEBUG(D_MGS, "writing new mdt %s\n", mti->mti_svname);
+
+        /* COMPAT_146 */
+        if (mti->mti_flags & LDD_F_UPGRADE14) {
+                char *ptr, oldname[sizeof(mti->mti_uuid)];
+                /* We're starting with an old uuid.  Assume old name for lov
+                   as well since the lov entry already exists in the log. */
+                CERROR("old mds uuid %s\n", mti->mti_uuid);
+                strcpy(oldname, mti->mti_uuid);
+                ptr = strstr(oldname, "_UUID");
+                if (!ptr) {
+                        CERROR("Can't get old MDT name from %s\n", 
+                               mti->mti_uuid);
+                        RETURN(-EINVAL);
+                }
+                *ptr = '\0';
+                name_create("lov_", oldname, &lovname);
+                CERROR("lov name: %s\n", lovname);
+        } else {
+                /* Make up our own uuid and lov name */
+                snprintf(mti->mti_uuid, sizeof(mti->mti_uuid),
+                         "%s_UUID", mti->mti_svname);
+                name_create(mti->mti_fsname, "-mdtlov", &lovname);
+        }
+
+        /* Append mdt info to mdt log */
+        if (mgs_log_is_empty(obd, mti->mti_svname)) {
+                /* This is the first time for all logs for this fs, 
+                   since any ost should have already started the mdt log. */
+                first_log++;
+                rc = mgs_write_log_lov(obd, fsdb, mti, mti->mti_svname,
+                                       lovname);
+        } 
+        /* else there's already some ost entries in the mdt log. */
+
+        /* We added the lov, maybe some osc's, now for the mdt.
+           We might add more ost's after this. Note that during the parsing
+           of this log, this is when the mdt will start. (This was not 
+           formerly part of the old mds log, it was directly executed by
+           lconf.) */ 
+        /*
+        #09 L mount_option 0:  1:mdsA  2:lov_mdsA
+        attach mds mdsA mdsA_UUID
+        setup /dev/loop2 ldiskfs mdsA errors=remount-ro,user_xattr
+        */
+        rc = record_start_log(obd, &llh, mti->mti_svname);
+        rc = record_marker(obd, llh, fsdb, CM_START, mti->mti_svname,"add mdt"); 
+        rc = record_mount_opt(obd, llh, mti->mti_svname, lovname, 0);
+        rc = record_attach(obd, llh, mti->mti_svname, LUSTRE_MDS_NAME, 
+                           mti->mti_uuid);
+        rc = record_setup(obd, llh, mti->mti_svname,
+                          "dev"/*ignored*/, "type"/*ignored*/,
+                          mti->mti_svname, 0/*options*/);
+        rc = record_marker(obd, llh, fsdb, CM_END, mti->mti_svname, "add mdt"); 
+        rc = record_end_log(obd, &llh);
+
+        /* Append the mdt info to the client log */
+        name_create(mti->mti_fsname, "-client", &cliname);
+        name_destroy(lovname);
+        name_create(mti->mti_fsname, "-clilov", &lovname);
+        if (first_log || 
+            /* If we're upgrading, the MDT log will exist but not the client. */
+            ((mti->mti_flags & LDD_F_UPGRADE14) && 
+             mgs_log_is_empty(obd, cliname))) {
+                /* Start client log */
+                rc = mgs_write_log_lov(obd, fsdb, mti, cliname, lovname);
+        }
+
+        name_create(libcfs_nid2str(mti->mti_nids[0]), /*"_UUID"*/"", &nodeuuid);
+        name_create(mti->mti_svname, "-mdc", &mdcname);
+        name_create(mdcname, "_UUID", &mdcuuid);
+        /* 
+        #09 L add_uuid nid=uml1@tcp(0x20000c0a80201) 0:  1:uml1_UUID
+        #10 L attach   0:MDC_uml1_mdsA_MNT_client  1:mdc  2:1d834_MNT_client_03f
+        #11 L setup    0:MDC_uml1_mdsA_MNT_client  1:mdsA_UUID  2:uml1_UUID
+        #12 L add_uuid nid=uml2@tcp(0x20000c0a80202) 0:  1:uml2_UUID
+        #13 L add_conn 0:MDC_uml1_mdsA_MNT_client  1:uml2_UUID
+        #14 L mount_option 0:  1:client  2:lov1  3:MDC_uml1_mdsA_MNT_client
+        */
+        rc = record_start_log(obd, &llh, cliname);
+        rc = record_marker(obd, llh, fsdb, CM_START, mti->mti_svname,"add mdc");
+        for (i = 0; i < mti->mti_nid_count; i++) {
+                CDEBUG(D_MGS, "add nid %s\n", libcfs_nid2str(mti->mti_nids[i]));
+                rc = record_add_uuid(obd, llh, mti->mti_nids[i], nodeuuid);
+        }
+        rc = record_attach(obd, llh, mdcname, LUSTRE_MDC_NAME, mdcuuid);
+        rc = record_setup(obd, llh, mdcname, mti->mti_uuid,nodeuuid, 0, 0);
+        rc = mgs_write_log_failnids(obd, mti, llh, mdcname);
+        rc = record_mount_opt(obd, llh, cliname, lovname, mdcname);
+        rc = record_marker(obd, llh, fsdb, CM_END, mti->mti_svname, "add mdc"); 
+        rc = record_end_log(obd, &llh);
+
+        name_destroy(mdcuuid);
+        name_destroy(mdcname);
+        name_destroy(nodeuuid);
+        name_destroy(cliname);
+        name_destroy(lovname);
+        RETURN(rc);
+}
+
+/* Add the ost info to the client/mdt lov */
+static int mgs_write_log_osc(struct obd_device *obd, struct fs_db *fsdb,
+                             struct mgs_target_info *mti,
+                             char *logname, char *lovname, int flags)
+{
+        struct llog_handle *llh = NULL;
+        char *nodeuuid, *oscname, *oscuuid, *lovuuid;
+        char index[5];
+        int i, rc;
+
+        if (mgs_log_is_empty(obd, logname)) {
+                /* The first item in the log must be the lov, so we have
+                   somewhere to add our osc. */
+                rc = mgs_write_log_lov(obd, fsdb, mti, logname, lovname);
+        }
+  
+        CDEBUG(D_MGS, "adding osc for %s to log %s\n",
+               mti->mti_svname, logname);
+
+        name_create(libcfs_nid2str(mti->mti_nids[0]), "", &nodeuuid);
+        name_create(mti->mti_svname, "-osc", &oscname);
+        name_create(oscname, "_UUID", &oscuuid);
+        name_create(lovname, "_UUID", &lovuuid);
+
+        /*
+        #03 L add_uuid nid=uml1@tcp(0x20000c0a80201) 0:  1:uml1_UUID
+        multihomed (#4)
+        #04 L add_uuid  nid=1@elan(0x1000000000001)  nal=90 0:  1:uml1_UUID
+        #04 L attach   0:OSC_uml1_ost1_MNT_client  1:osc  2:89070_lov1_a41dff51a
+        #05 L setup    0:OSC_uml1_ost1_MNT_client  1:ost1_UUID  2:uml1_UUID
+        failover (#6,7)
+        #06 L add_uuid nid=uml2@tcp(0x20000c0a80202) 0:  1:uml2_UUID
+        #07 L add_conn 0:OSC_uml1_ost1_MNT_client  1:uml2_UUID
+        #08 L lov_modify_tgts add 0:lov1  1:ost1_UUID  2(index):0  3(gen):1
+        */
+        rc = record_start_log(obd, &llh, logname);
+        rc = record_marker(obd, llh, fsdb, CM_START | flags, mti->mti_svname,
+                           "add osc"); 
+        for (i = 0; i < mti->mti_nid_count; i++) {
+                CDEBUG(D_MGS, "add nid %s\n", libcfs_nid2str(mti->mti_nids[i]));
+                rc = record_add_uuid(obd, llh, mti->mti_nids[i], nodeuuid);
+        }
+        rc = record_attach(obd, llh, oscname, LUSTRE_OSC_NAME, lovuuid);
+        rc = record_setup(obd, llh, oscname, mti->mti_uuid, nodeuuid, 0, 0);
+        rc = mgs_write_log_failnids(obd, mti, llh, oscname);
+        snprintf(index, sizeof(index), "%d", mti->mti_stripe_index);
+        rc = record_lov_add(obd, llh, lovname, mti->mti_uuid, index, "1");
+        rc = record_marker(obd, llh, fsdb, CM_END | flags, mti->mti_svname,
+                           "add osc"); 
+        rc = record_end_log(obd, &llh);
+        
+        name_destroy(lovuuid);
+        name_destroy(oscuuid);
+        name_destroy(oscname);
+        name_destroy(nodeuuid);
+        return rc;
+}
+
+static int mgs_write_log_ost(struct obd_device *obd, struct fs_db *fsdb,
+                             struct mgs_target_info *mti)
+{
+        struct llog_handle *llh = NULL;
+        char *logname, *lovname;
+        int rc, flags = 0;
+        ENTRY;
+        
+        CDEBUG(D_MGS, "writing new ost %s\n", mti->mti_svname);
+
+        /* The ost startup log */
+
+        /* If the ost log already exists, that means that someone reformatted
+           the ost and it called target_add again.
+           FIXME check and warn here, maybe inc config ver #?  Or abort, 
+           and claim there's already a server with that name?  Maybe need 
+           another flag to say it's okay to rewrite. 
+           Heck, what do we do about the client and mds logs? We better
+           abort. */
+        if (!mgs_log_is_empty(obd, mti->mti_svname)) {
+                LCONSOLE_ERROR("The config log for %s already exists, yet the "
+                               "server claims it never registered.  It may have"
+                               " been reformatted, or the index changed. Use "
+                               " tunefs.lustre --writeconf to regenerate "
+                               " all logs.\n", mti->mti_svname);
+                return -EALREADY;
+        }
+        /*
+        attach obdfilter ost1 ost1_UUID
+        setup /dev/loop2 ldiskfs f|n errors=remount-ro,user_xattr
+        */
+        rc = record_start_log(obd, &llh, mti->mti_svname);
+        rc = record_marker(obd, llh, fsdb, CM_START, mti->mti_svname,"add ost"); 
+        if (*mti->mti_uuid == '\0') 
+                snprintf(mti->mti_uuid, sizeof(mti->mti_uuid),
+                         "%s_UUID", mti->mti_svname);
+        rc = record_attach(obd, llh, mti->mti_svname,
+                           "obdfilter"/*LUSTRE_OST_NAME*/, mti->mti_uuid);
+        rc = record_setup(obd,llh,mti->mti_svname,
+                          "dev"/*ignored*/,"type"/*ignored*/,
+                          "f", 0/*options*/);
+        rc = record_marker(obd, llh, fsdb, CM_END, mti->mti_svname, "add ost"); 
+        rc = record_end_log(obd, &llh);
+
+        /* We also have to update the other logs where this osc is part of 
+           the lov */
+
+        /* Append ost info to mdt log */
+        if (mti->mti_flags & LDD_F_UPGRADE14) 
+                /* If we're upgrading, the old mdt log already has our
+                   entry. Let's do a fake one for fun. */
+                flags = CM_SKIP | CM_UPGRADE146;
+        /* FIXME add to all mdt logs for CMD */
+        // FIXME need real mdt name -- but MDT may not have registered yet!
+        name_create(mti->mti_fsname, "-MDT0000", &logname);
+        name_create(mti->mti_fsname, "-mdtlov", &lovname);
+        mgs_write_log_osc(obd, fsdb, mti, logname, lovname, flags);
+        name_destroy(lovname);
+        name_destroy(logname);
+        
+        /* Append ost info to the client log */
+        name_create(mti->mti_fsname, "-client", &logname);
+        name_create(mti->mti_fsname, "-clilov", &lovname);
+        mgs_write_log_osc(obd, fsdb, mti, logname, lovname, 0);
+        name_destroy(lovname);
+        name_destroy(logname);
+        
+        RETURN(rc);
+}
+
+/* Add additional failnids to an existing log.  
+   The mdc/osc must have been added to logs first */
+/* tcp nids must be in dotted-quad ascii -
+   we can't resolve hostnames from the kernel. */
+static int mgs_write_log_add_failnid(struct obd_device *obd, struct fs_db *fsdb,
+                                     struct mgs_target_info *mti)
+{
+        char *logname, *cliname;
+        struct llog_handle *llh = NULL;
+        int rc;
+        ENTRY;
+
+        /* Verify that we know about this target */
+        if (mgs_log_is_empty(obd, mti->mti_svname)) {
+                LCONSOLE_ERROR("The target %s has not registered yet. "
+                               "It must be started before failnids can "
+                               "be added.\n", mti->mti_svname);
+                RETURN(-ENOENT);
+        }
+
+        /* Create mdc/osc client name (e.g. lustre-OST0001-osc) */
+        if (mti->mti_flags & LDD_F_SV_TYPE_MDT) {
+                name_create(mti->mti_svname, "-mdc", &cliname);
+        } else if (mti->mti_flags & LDD_F_SV_TYPE_OST) {
+                name_create(mti->mti_svname, "-osc", &cliname);
+        } else {
+                RETURN(-EINVAL);
+        }
+        
+        /* Add failover nids to client log */
+        name_create(mti->mti_fsname, "-client", &logname);
+        rc = record_start_log(obd, &llh, logname);
+        rc = record_marker(obd, llh, fsdb, CM_START, mti->mti_svname,
+                           "add failnid");
+        rc = mgs_write_log_failnids(obd, mti, llh, cliname);
+        rc = record_marker(obd, llh, fsdb, CM_END, mti->mti_svname,
+                           "add failnid"); 
+        rc = record_end_log(obd, &llh);
+        name_destroy(logname);
+
+        if (mti->mti_flags & LDD_F_SV_TYPE_OST) {
+                /* Add OST failover nids to the MDT log as well */
+                name_create(mti->mti_fsname, "-MDT0000", &logname);
+                rc = record_start_log(obd, &llh, logname);
+                rc = record_marker(obd, llh, fsdb, CM_START, mti->mti_svname,
+                                   "add failnid");
+                rc = mgs_write_log_failnids(obd, mti, llh, cliname);
+                rc = record_marker(obd, llh, fsdb, CM_END, mti->mti_svname,
+                                   "add failnid"); 
+                rc = record_end_log(obd, &llh);
+                name_destroy(logname);
+        }
+
+        name_destroy(cliname);
+        RETURN(rc);
+}
+
+static int mgs_write_log_params(struct obd_device *obd, struct fs_db *fsdb,
+                                struct mgs_target_info *mti)
+{
+        struct lustre_cfg_bufs bufs;
+        struct lustre_cfg *lcfg;
+        char *ptr = mti->mti_params;
+        char *endptr;
+        char *end = mti->mti_params + sizeof(mti->mti_params);
+        int rc = 0, len;
+        ENTRY;
+
+        if (!mti->mti_params) 
+                RETURN(0);
+
+        while (ptr < end) {
+                while (*ptr == ' ') 
+                        ptr++;
+                if (*ptr == '\0' || (ptr >= end))
+                        break;
+                endptr = strchr(ptr, ' ');
+                if (endptr)
+                        len = endptr - ptr;
+                else
+                        len = strlen(ptr);
+                CDEBUG(D_MGS, "next param '%.*s'\n", len, ptr);
+
+                if (class_match_param(ptr, PARAM_MGSNODE, &endptr) == 0) 
+                        GOTO(end_while, rc);
+
+                if (class_match_param(ptr, PARAM_FAILNODE, &endptr) == 0) {
+                        /* Add a failover nidlist */
+                        rc = 0;
+                        /* We already processed failovers params for new
+                           targets in mgs_write_log_target */
+                        if (mti->mti_flags & MTI_F_IOCTL) {
+                                CDEBUG(D_MGS, "Adding failnode\n");
+                                rc = mgs_write_log_add_failnid(obd, fsdb, mti);
+                        }
+                        GOTO(end_while, rc);
+                }
+
+                if (class_match_param(ptr, PARAM_OBD_TIMEOUT, &endptr) == 0) {
+                        /* Change obd timeout */
+                        int timeout;
+                        timeout = simple_strtoul(endptr, &endptr, 0);
+
+                        CDEBUG(D_MGS, "obd timeout %d\n", timeout);
+                        lustre_cfg_bufs_reset(&bufs, NULL);
+                        lcfg = lustre_cfg_new(LCFG_SET_TIMEOUT, &bufs);
+                        lcfg->lcfg_num = timeout;
+                        /* modify all servers and clients */
+                        rc = mgs_write_log_direct_all(obd, fsdb, mti->mti_fsname,
+                                                   lcfg); 
+                        lustre_cfg_free(lcfg);
+                        GOTO(end_while, rc);
+                }
+
+                if (class_match_param(ptr, PARAM_DEFAULT_STRIPE, &endptr) == 0){
+                        /* Change lov default stripe params */
+                        char *lovname, *logname;
+                        CDEBUG(D_MGS, "lov param %s\n", ptr);
+                        if (!(mti->mti_flags & LDD_F_SV_TYPE_MDT)) {
+                                LCONSOLE_ERROR("Default stripe params must be "
+                                               "set on the MDT, not %s. "
+                                               "Ignoring.\n",
+                                               mti->mti_svname);
+                                GOTO(end_while, rc = 0);
+                        }
+
+                        /* Modify mdtlov */
+                        name_create(mti->mti_fsname, "-mdtlov", &lovname);
+                        if (mgs_log_is_empty(obd, mti->mti_svname)) {
+                                name_destroy(lovname);
+                                GOTO(end_while, rc = -ENODEV);
+                        }
+                        lustre_cfg_bufs_reset(&bufs, lovname);
+                        lustre_cfg_bufs_set(&bufs, 1, ptr, len);
+                        lcfg = lustre_cfg_new(LCFG_PARAM, &bufs);
+                        rc = mgs_write_log_direct(obd, fsdb, mti->mti_svname, 
+                                                  lovname, lcfg);
+                        lustre_cfg_free(lcfg);
+                        name_destroy(lovname);
+                        if (rc) 
+                                GOTO(end_while, rc);
+
+                        /* Modify clilov */
+                        name_create(mti->mti_fsname, "-client", &logname);
+                        name_create(mti->mti_fsname, "-clilov", &lovname);
+                        lustre_cfg_bufs_reset(&bufs, lovname);
+                        lustre_cfg_bufs_set(&bufs, 1, ptr, len);
+                        lcfg = lustre_cfg_new(LCFG_PARAM, &bufs);
+                        rc = mgs_write_log_direct(obd, fsdb, logname,
+                                                  lovname, lcfg);
+                        lustre_cfg_free(lcfg);
+                        name_destroy(lovname);
+                        name_destroy(logname);
+                        GOTO(end_while, rc);
+                }
+
+                LCONSOLE_WARN("Ignoring unrecognized param '%.*s'\n", len, ptr);
+
+end_while:
+                if (rc) {
+                        CERROR("err %d on param '%.*s\n", rc, len, ptr);
+                        break;
+                }
+                ptr += len;
+        }
+
+        RETURN(rc);
+}
+
+int mgs_check_failnid(struct obd_device *obd, struct mgs_target_info *mti)
+{
+        /* Not implementing automatic failover nid addition at this time. */
+        return 0;
+#if 0
+        struct fs_db *fsdb;
+        int rc;
+        ENTRY;
+
+        rc = mgs_find_or_make_fsdb(obd, fsname, &fsdb); 
+        if (rc) 
+                RETURN(rc);
+
+        if (mgs_log_is_empty(obd, mti->mti_svname)) 
+                /* should never happen */
+                RETURN(-ENOENT);
+
+        CDEBUG(D_MGS, "Checking for new failnids for %s\n", mti->mti_svname);
+        // FIXME check logs
+        /* FIXME we need a real database lookup.  Create on-disk db of known 
+           size, lookup by index */
+        /* Check each nid, or check only nid0 and add all if nid0 is missing?
+           What if someone adds a net to a node? Better check everything. */
+        /* if nid 0 is missing, mgs_write_log_add_failnid.
+           if just one nid is missing, add uuid for nodeuuid[nid0]).
+        */
+
+        /* Hey, we can just check mti->params to see if we're already in
+           the failover list */
+        
+        down(&fsdb->fsdb_sem);
+        rc = mgs_write_log_add_failnid(obd, fsdb, mti);
+        up(&fsdb->fsdb_sem);
+
+        RETURN(rc);
+#endif
+}
+
+int mgs_write_log_target(struct obd_device *obd,
+                         struct mgs_target_info *mti)
+{
+        struct fs_db *fsdb;
+        int rc = -EINVAL;
+        ENTRY;
+
+        /* set/check the new target index */
+        rc = mgs_set_index(obd, mti);
+        if (rc < 0) {
+                CERROR("Can't get index (%d)\n", rc);
+                RETURN(rc);
+        }
+        if (rc == EALREADY) {
+                // FIXME mark old log sections as invalid, add new.
+                CERROR("updates not yet implemented\n");
+                RETURN(-EALREADY);
+        }
+
+        rc = mgs_find_or_make_fsdb(obd, mti->mti_fsname, &fsdb); 
+        if (rc) {
+                CERROR("Can't get db for %s\n", mti->mti_fsname);
+                RETURN(rc);
+        }
+
+        down(&fsdb->fsdb_sem);
+
+        if (mti->mti_flags & LDD_F_SV_TYPE_MDT) {
+                rc = mgs_write_log_mdt(obd, fsdb, mti);
+        } else if (mti->mti_flags & LDD_F_SV_TYPE_OST) {
+                rc = mgs_write_log_ost(obd, fsdb, mti);
+        } else {
+                CERROR("Unknown target type %#x, can't create log for %s\n",
+                       mti->mti_flags, mti->mti_svname);
+        }
+        if (rc) {
+                CERROR("Can't write logs for %s (%d)\n", mti->mti_svname, rc);
+                GOTO(out_up, rc);
+        }
+
+        rc = mgs_write_log_params(obd, fsdb, mti);
+
+out_up:
+        up(&fsdb->fsdb_sem);
+        RETURN(rc);
+}
+
+
+/* COMPAT_146 */
+/***************** upgrade pre-mountconf logs to mountconf *****************/
+
+#if 0
+int mgs_upgrade_logs_14(struct obd_device *obd, struct fs_db *fsdb, 
+                        struct mgs_target_info *mti)
+{
+        int rc = 0;
+        ENTRY;
+
+        CDEBUG(D_MGS, "Upgrading old logs for %s\n", mti->mti_fsname);
+
+        /* If we get here, we know: 
+                the client log fsname-client exists
+                the logs have not been updated
+           so
+        1. parse the old client log (client log name?) to find out UUIDs for
+           all servers
+        2. regen all ost logs: servers will get new
+           name based on index, but will keep their old uuids.
+        3. append mdt startup to the end of the mdt log
+        4. append marker to old client log signifying we did the upgrade
+        ?  translate mds/client logs to new names?
+                  2 UP mdt MDS MDS_uuid 3
+                  3 UP lov lov_mdsA 47d06_lov_mdsA_61f31f85bc 4
+                  4 UP osc OSC_uml1_ost1_mdsA 47d06_lov_mdsA_61f31f85bc 4
+                  5 UP osc OSC_uml1_ost2_mdsA 47d06_lov_mdsA_61f31f85bc 4
+                  6 UP mds lustre-MDT0000 mdsA_UUID 3
+                to
+        ?  update server uuids?
+        */
+
+        
+        /* old mdt log: 
+        old osc's were part of old lov,
+        mount opt connects mdt to lov
+        so need to use old lov name.
+        old client logs starts old mdc and lov,
+        so need to use old lov,mdc names in mount opt - but new client name 
+        *-client instead of just "client" */
+
+
+        if ((mti->mti_flags & LDD_F_SV_TYPE_MDT)) {
+                CDEBUG(D_MGS, "Upgrade MDT\n");
+                if (mgs_log_is_empty(obd, mti->mti_svname)) {
+                        CERROR("The MDT log %s is missing.\n", mti->mti_svname);
+                        RETURN(-ENOENT);
+                }
+                /* Append the MDT startup sequence to the old log 
+                   (lconf used to start the MDT directly) */
+                rc = mgs_write_log_mdt(obd, fsdb, mti);
+                if (rc) 
+                        RETURN(rc);
+
+                /* this would be for trying to update an old client log */
+                struct llog_handle *llh = NULL;
+                char *cliname;
+                CDEBUG(D_MGS, "Upgrade client\n");
+
+                name_create(mti->mti_fsname, "-client", &cliname);
+
+                rc = record_start_log(obd, &llh, cliname);
+                rc = record_marker(obd, llh, fsdb, CM_START, "client",
+                                   "upgrade from 1.4"); 
+                /* FIXME find the old lovname and mdcname from old log */
+                /* old: mount_option 0:  1:client  2:lov1  3:MDC_uml1_mdsA_MNT_client */
+                /* new: mount_option 0:  1:lustre-client  2:lustre-clilov  3:lustre-MDT0000-mdc */
+                rc = record_mount_opt(obd, llh, cliname, "lov1", 
+                                      "MDC_uml1_mdsA_MNT_client");
+                rc = record_marker(obd, llh, fsdb, CM_END, "client", 
+                                   "upgrade to 1.6"); 
+                rc = record_end_log(obd, &llh);
+                name_destroy(cliname);
+        }
+        
+        if ((mti->mti_flags & LDD_F_SV_TYPE_OST)) {
+                CDEBUG(D_MGS, "Upgrade OST\n");
+                /* A regular new ost log, but don't update client or MDT logs */
+                rc = mgs_write_log_ost(obd, fsdb, mti);
+        }
+
+        RETURN(rc);
+}
+#endif        
+
+/* first connect of upgraded servers */ 
+int mgs_upgrade_sv_14(struct obd_device *obd, struct mgs_target_info *mti)
+{
+        struct fs_db *fsdb;
+        int rc = 0;
+        ENTRY;
+
+        /* Create client and ost log normally, as servers register.
+         That way logs are modern (except have old uuids (from last_rcvd))
+         - Old clients can continue to use upgraded OSTs
+         - New clients will only start with upgraded OSTs
+         - MDT won't know about old OSTs, only upgraded, so we need the old
+           MDT log in order for old clients to work. (Old clients connect to
+           the MDT, not the MGS, for their logs, and will therefore receive
+           the old client log from the MDT /LOGS dir.) */ 
+
+        CDEBUG(D_MGS, "upgrading server %s from pre-1.6\n", 
+               mti->mti_svname); 
+        server_mti_print("upgrade", mti);
+        
+        rc = mgs_find_or_make_fsdb(obd, mti->mti_fsname, &fsdb);
+        if (rc) 
+                RETURN(rc);
+
+        if (fsdb->fsdb_flags & FSDB_EMPTY)
+                /* First server to upgrade sees this */
+                CWARN("info: missing client log\n");
+
+        if (!(fsdb->fsdb_flags & FSDB_EMPTY) && (fsdb->fsdb_gen == 0)) {
+                /* There were no markers in the client log, meaning we have 
+                   not updated the logs for this fs */
+                CWARN("info: found old, unupdated client log\n");
+        }
+
+        if ((mti->mti_flags & LDD_F_SV_TYPE_MDT) && 
+            mgs_log_is_empty(obd, mti->mti_svname)) {
+                LCONSOLE_ERROR("The old MDT log %s is missing.  Was "
+                               "tunefs.lustre successful?\n",
+                               mti->mti_svname);
+                RETURN(-ENOENT);
+        }
+
+        /* FIXME Old MDT log already has an old mount opt 
+           which we should drop */
+        rc = mgs_write_log_target(obd, mti);
+        RETURN(rc);
+}
+/* end COMPAT_146 */
+
+static int mgs_clear_log(struct obd_device *obd, char *name)
+{
+        struct lvfs_run_ctxt saved;
+        struct llog_handle *llh;
+        int rc = 0;
+
+        push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+        rc = llog_create(llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT),
+                         &llh, NULL, name);
+        if (rc == 0) {
+                llog_init_handle(llh, LLOG_F_IS_PLAIN, NULL);
+                rc = llog_destroy(llh);
+                llog_free_handle(llh);
+        }
+        pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+
+        if (rc)
+                CERROR("failed to clear log %s: %d\n", name, rc);
+
+        return(rc);
+}
+
+/* erase all logs for the given fs */
+int mgs_erase_logs(struct obd_device *obd, char *fsname)
+{
+        struct mgs_obd *mgs = &obd->u.mgs;
+        static struct fs_db *fsdb;
+        struct list_head dentry_list;
+        struct l_linux_dirent *dirent, *n;
+        int rc, len = strlen(fsname);
+        ENTRY;
+        
+        /* Find all the logs in the CONFIGS directory */
+        rc = class_dentry_readdir(obd, mgs->mgs_configs_dir,
+                                  mgs->mgs_vfsmnt, &dentry_list);
+        if (rc) {
+                CERROR("Can't read %s dir\n", MOUNT_CONFIGS_DIR);
+                RETURN(rc);
+        }
+                                                                                
+        /* Delete the fs db */
+        down(&mgs->mgs_sem);
+        fsdb = mgs_find_fsdb(obd, fsname);
+        if (fsdb) 
+                mgs_free_fsdb(fsdb);
+        up(&mgs->mgs_sem);
+
+        list_for_each_entry_safe(dirent, n, &dentry_list, lld_list) {
+                list_del(&dirent->lld_list);
+                if (strncmp(fsname, dirent->lld_name, len) == 0) {
+                        CDEBUG(D_MGS, "Removing log %s\n", dirent->lld_name);
+                        mgs_clear_log(obd, dirent->lld_name);
+                }
+                OBD_FREE(dirent, sizeof(*dirent));
+        }
+        
+        RETURN(rc);
+}
+
+/* from llog_swab */
+static void print_lustre_cfg(struct lustre_cfg *lcfg)
+{
+        int i;
+        ENTRY;
+
+        CDEBUG(D_MGS, "lustre_cfg: %p\n", lcfg);
+        CDEBUG(D_MGS, "\tlcfg->lcfg_version: %#x\n", lcfg->lcfg_version);
+
+        CDEBUG(D_MGS, "\tlcfg->lcfg_command: %#x\n", lcfg->lcfg_command);
+        CDEBUG(D_MGS, "\tlcfg->lcfg_num: %#x\n", lcfg->lcfg_num);
+        CDEBUG(D_MGS, "\tlcfg->lcfg_flags: %#x\n", lcfg->lcfg_flags);
+        CDEBUG(D_MGS, "\tlcfg->lcfg_nid: %s\n", libcfs_nid2str(lcfg->lcfg_nid));
+
+        CDEBUG(D_MGS, "\tlcfg->lcfg_bufcount: %d\n", lcfg->lcfg_bufcount);
+        if (lcfg->lcfg_bufcount < LUSTRE_CFG_MAX_BUFCOUNT)
+                for (i = 0; i < lcfg->lcfg_bufcount; i++) {
+                        CDEBUG(D_MGS, "\tlcfg->lcfg_buflens[%d]: %d %s\n",
+                               i, lcfg->lcfg_buflens[i], 
+                               lustre_cfg_string(lcfg, i));
+                }
+        EXIT;
+}
+
+/* Set a permanent (config log) param for a target or fs */
+int mgs_setparam(struct obd_device *obd, char *fsname, struct lustre_cfg *lcfg)
+{
+        struct fs_db *fsdb;
+        struct mgs_target_info *mti;
+        char *devname;
+        int rc = 0;
+        ENTRY;
+
+        print_lustre_cfg(lcfg);
+        
+        /* lustre, lustre-mdtlov, lustre-client, lustre-MDT0000 */
+        devname = lustre_cfg_string(lcfg, 0);
+
+        if (devname == NULL) {
+                /* Global setting across all fs's? */
+                LCONSOLE_ERROR("Global settings not implemented yet!\n");
+                RETURN(-ENOSYS);
+        }
+        
+        CDEBUG(D_MGS, "target: %s\n", devname);
+
+        rc = mgs_find_or_make_fsdb(obd, fsname, &fsdb); 
+        if (rc) 
+                RETURN(rc);
+        if (fsdb->fsdb_flags & FSDB_EMPTY) {
+                CERROR("No filesystem targets for %s\n", fsname);
+                RETURN(-EINVAL);
+        }
+
+        /* Create a fake mti to hold everything */
+        OBD_ALLOC_PTR(mti);
+        if (!mti) 
+                GOTO(out, rc = -ENOMEM);
+        strcpy(mti->mti_fsname, fsname);
+        strcpy(mti->mti_svname, devname);
+        rc = server_name2index(devname, &mti->mti_stripe_index, NULL);
+        if (rc < 0) 
+                GOTO(out, rc);
+        mti->mti_flags = rc | MTI_F_IOCTL;
+        strncpy(mti->mti_params, lustre_cfg_string(lcfg, 1), 
+                sizeof(mti->mti_params));
+
+        down(&fsdb->fsdb_sem);
+        rc = mgs_write_log_params(obd, fsdb, mti); 
+        up(&fsdb->fsdb_sem);
+
+out:
+        OBD_FREE_PTR(mti);
+        RETURN(rc);
+}
+
+
+#if 0
+/******************** unused *********************/
+static int mgs_backup_llog(struct obd_device *obd, char* fsname)
+{
+        struct file *filp, *bak_filp;
+        struct lvfs_run_ctxt saved;
+        char *logname, *buf;
+        loff_t soff = 0 , doff = 0;
+        int count = 4096, len;
+        int rc = 0;
+
+        OBD_ALLOC(logname, PATH_MAX);
+        if (logname == NULL)
+                return -ENOMEM;
+
+        OBD_ALLOC(buf, count);
+        if (!buf)
+                GOTO(out , rc = -ENOMEM);
+
+        len = snprintf(logname, PATH_MAX, "%s/%s.bak",
+                       MOUNT_CONFIGS_DIR, fsname);
+
+        if (len >= PATH_MAX - 1) {
+                GOTO(out, -ENAMETOOLONG);
+        } 
+
+        push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+                
+        bak_filp = l_filp_open(logname, O_RDWR|O_CREAT|O_TRUNC, 0660);
+        if (IS_ERR(bak_filp)) {
+                rc = PTR_ERR(bak_filp);
+                CERROR("backup logfile open %s: %d\n", logname, rc);
+                GOTO(pop, rc);
+        }
+        sprintf(logname, "%s/%s", MOUNT_CONFIGS_DIR, fsname);
+        filp = l_filp_open(logname, O_RDONLY, 0);
+        if (IS_ERR(filp)) {
+                rc = PTR_ERR(filp);
+                CERROR("logfile open %s: %d\n", logname, rc);
+                GOTO(close1f, rc);
+        }
+
+        while ((rc = lustre_fread(filp, buf, count, &soff)) > 0) {
+                rc = lustre_fwrite(bak_filp, buf, count, &doff);
+                break;
+        }
+
+        filp_close(filp, 0);
+close1f:
+        filp_close(bak_filp, 0);
+pop:
+        pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+out:
+        if (buf)
+                OBD_FREE(buf, count);
+        OBD_FREE(logname, PATH_MAX);
+        return rc;
+}
+
+
+
+#endif
index ff70e59..a33afd2 100644 (file)
@@ -23,7 +23,7 @@ obdclass-all-objs := llog.o llog_cat.o llog_lvfs.o llog_obd.o llog_swab.o
 obdclass-all-objs += class_obd.o
 obdclass-all-objs += debug.o genops.o uuid.o llog_ioctl.o
 obdclass-all-objs += lprocfs_status.o lustre_handles.o lustre_peer.o
-obdclass-all-objs += statfs_pack.o obdo.o obd_config.o prng.o
+obdclass-all-objs += statfs_pack.o obdo.o obd_config.o obd_mount.o prng.o
 
 obdclass-objs := $(obdclass-linux-objs) $(obdclass-all-objs)
 
index 180ce86..b5fea25 100644 (file)
@@ -386,12 +386,14 @@ EXPORT_SYMBOL(proc_lustre_root);
 
 EXPORT_SYMBOL(class_register_type);
 EXPORT_SYMBOL(class_unregister_type);
+EXPORT_SYMBOL(class_search_type);
 EXPORT_SYMBOL(class_get_type);
 EXPORT_SYMBOL(class_put_type);
 EXPORT_SYMBOL(class_name2dev);
 EXPORT_SYMBOL(class_name2obd);
 EXPORT_SYMBOL(class_uuid2dev);
 EXPORT_SYMBOL(class_uuid2obd);
+EXPORT_SYMBOL(class_obd_list);
 EXPORT_SYMBOL(class_find_client_obd);
 EXPORT_SYMBOL(class_find_client_notype);
 EXPORT_SYMBOL(class_devices_in_group);
@@ -403,6 +405,7 @@ EXPORT_SYMBOL(class_conn2cliimp);
 EXPORT_SYMBOL(class_disconnect);
 
 /* uuid.c */
+EXPORT_SYMBOL(class_generate_random_uuid);
 EXPORT_SYMBOL(class_uuid_unparse);
 EXPORT_SYMBOL(lustre_uuid_to_peer);
 
@@ -410,7 +413,7 @@ EXPORT_SYMBOL(class_handle_hash);
 EXPORT_SYMBOL(class_handle_unhash);
 EXPORT_SYMBOL(class_handle2object);
 
-/* config.c */
+/* obd_config.c */
 EXPORT_SYMBOL(class_incref);
 EXPORT_SYMBOL(class_decref);
 EXPORT_SYMBOL(class_get_profile);
@@ -508,17 +511,18 @@ static int __init init_obdclass(void)
 int init_obdclass(void)
 #endif
 {
+        int i, err;
         struct obd_device *obd;
-        int err;
-        int i;
-
 #ifdef __KERNEL__
+        int lustre_register_fs(void);
+
         printk(KERN_INFO "Lustre: OBD class driver Build Version: "
                BUILD_VERSION", info@clusterfs.com\n");
 #else
         CDEBUG(D_INFO, "Lustre: OBD class driver Build Version: "
                BUILD_VERSION", info@clusterfs.com\n");
 #endif
+
         spin_lock_init(&obd_types_lock);
         spin_lock_init(&handle_lock);
         cfs_waitq_init(&obd_race_waitq);
@@ -550,6 +554,7 @@ int init_obdclass(void)
                 return err;
 #ifdef __KERNEL__
         err = class_procfs_init();
+        lustre_register_fs();
 #endif
 
         return err;
@@ -561,8 +566,11 @@ int init_obdclass(void)
 static void cleanup_obdclass(void)
 {
         int i;
+        int lustre_unregister_fs(void);
         ENTRY;
 
+        lustre_unregister_fs();
+
         cfs_psdev_deregister(&obd_psdev);
         for (i = 0; i < MAX_OBD_DEVICES; i++) {
                 struct obd_device *obd = &obd_dev[i];
index 361c2d4..7ed2f36 100644 (file)
@@ -46,7 +46,7 @@ int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c);
  * support functions: we could use inter-module communication, but this
  * is more portable to other OS's
  */
-static struct obd_type *class_search_type(char *name)
+struct obd_type *class_search_type(char *name)
 {
         struct list_head *tmp;
         struct obd_type *type;
@@ -69,11 +69,15 @@ struct obd_type *class_get_type(char *name)
 
 #ifdef CONFIG_KMOD
         if (!type) {
-                if (!request_module(name)) {
-                        CDEBUG(D_INFO, "Loaded module '%s'\n", name);
+                char *modname = name;
+                if (strcmp(modname, LUSTRE_MDT_NAME) == 0) 
+                        modname = LUSTRE_MDS_NAME;
+                if (!request_module(modname)) {
+                        CDEBUG(D_INFO, "Loaded module '%s'\n", modname);
                         type = class_search_type(name);
-                } else
-                        CDEBUG(D_INFO, "Can't load module '%s'\n", name);
+                } else {
+                        LCONSOLE_ERROR("Can't load module '%s'\n", modname);
+                }
         }
 #endif
         if (type)
@@ -274,6 +278,33 @@ struct obd_device *class_uuid2obd(struct obd_uuid *uuid)
         return &obd_dev[dev];
 }
 
+void class_obd_list(void)
+{
+        char *status;
+        int i;
+
+        spin_lock(&obd_dev_lock);
+        for (i = 0; i < MAX_OBD_DEVICES; i++) {
+                struct obd_device *obd = &obd_dev[i];
+                if (obd->obd_type == NULL)
+                        continue;
+                if (obd->obd_stopping)
+                        status = "ST";
+                else if (obd->obd_set_up)
+                        status = "UP";
+                else if (obd->obd_attached)
+                        status = "AT";
+                else
+                        status = "--";
+                LCONSOLE(D_CONFIG, "%3d %s %s %s %s %d\n",
+                         i, status, obd->obd_type->typ_name,
+                         obd->obd_name, obd->obd_uuid.uuid,
+                         atomic_read(&obd->obd_refcount));
+        }
+        spin_unlock(&obd_dev_lock);
+        return;
+}
+
 /* Search for a client OBD connected to tgt_uuid.  If grp_uuid is
    specified, then only the client with that uuid is returned,
    otherwise any client connected to the tgt is returned. */
index a6edbb7..2af0105 100644 (file)
@@ -142,8 +142,11 @@ int llog_init_handle(struct llog_handle *handle, int flags,
         rc = llog_read_header(handle);
         if (rc == 0) {
                 flags = llh->llh_flags;
-                if (uuid)
-                        LASSERT(obd_uuid_equals(uuid, &llh->llh_tgtuuid));
+                if (uuid && !obd_uuid_equals(uuid, &llh->llh_tgtuuid)) {
+                        CERROR("uuid mismatch: %s/%s\n", (char *)uuid->uuid,
+                               (char *)llh->llh_tgtuuid.uuid);
+                        rc = -EEXIST;
+                }
                 GOTO(out, rc);
         } else if (rc != LLOG_EEMPTY || !flags) {
                 /* set a pesudo flag for initialization */
@@ -209,15 +212,19 @@ int llog_process(struct llog_handle *loghandle, llog_cb_t cb,
         char *buf;
         __u64 cur_offset = LLOG_CHUNK_SIZE;
         int rc = 0, index = 1, last_index;
-        int saved_index = 0;
+        int saved_index = 0, last_called_index = 0;
         ENTRY;
 
+        LASSERT(llh);
+
         OBD_ALLOC(buf, LLOG_CHUNK_SIZE);
         if (!buf)
                 RETURN(-ENOMEM);
 
-        if (cd != NULL)
+        if (cd != NULL) {
+                last_called_index = cd->first_idx;
                 index = cd->first_idx + 1;
+        }
         if (cd != NULL && cd->last_idx)
                 last_index = cd->last_idx;
         else
@@ -285,6 +292,7 @@ int llog_process(struct llog_handle *loghandle, llog_cb_t cb,
                         /* if set, process the callback on this record */
                         if (ext2_test_bit(index, llh->llh_bitmap)) {
                                 rc = cb(loghandle, rec, data);
+                                last_called_index = index;
                                 if (rc == LLOG_PROC_BREAK) {
                                         CWARN("recovery from log: "LPX64":%x"
                                               " stopped\n",
@@ -309,12 +317,22 @@ int llog_process(struct llog_handle *loghandle, llog_cb_t cb,
         }
 
  out:
+        if (cd != NULL)
+                cd->last_idx = last_called_index;
         if (buf)
                 OBD_FREE(buf, LLOG_CHUNK_SIZE);
         RETURN(rc);
 }
 EXPORT_SYMBOL(llog_process);
 
+inline int llog_get_size(struct llog_handle *loghandle)
+{
+        if (loghandle && loghandle->lgh_hdr)
+                return loghandle->lgh_hdr->llh_count;
+        return 0;
+}
+EXPORT_SYMBOL(llog_get_size);
+
 int llog_reverse_process(struct llog_handle *loghandle, llog_cb_t cb,
                          void *data, void *catdata)
 {
index f9c1ec0..9bdea74 100644 (file)
@@ -129,7 +129,6 @@ static int llog_check_cb(struct llog_handle *handle, struct llog_rec_hdr *rec,
                 case MDS_UNLINK_REC:
                 case MDS_SETATTR_REC:
                 case OBD_CFG_REC:
-                case PTL_CFG_REC:               /* obsolete */
                 case LLOG_HDR_MAGIC: {
                          l = snprintf(out, remains, "[index]: %05d  [type]: "
                                       "%02x  [len]: %04d ok\n",
index 6d56707..2eedc32 100644 (file)
@@ -46,6 +46,7 @@
 #include <libcfs/list.h>
 #include <lvfs.h>
 #include <lustre_fsfilt.h>
+#include <lustre_disk.h>
 #include "llog_internal.h"
 
 #if defined(__KERNEL__) && defined(LLOG_LVFS)
@@ -494,7 +495,7 @@ static int llog_lvfs_prev_block(struct llog_handle *loghandle,
         RETURN(-EIO);
 }
 
-static struct file *llog_filp_open(char *name, int flags, int mode)
+static struct file *llog_filp_open(char *dir, char *name, int flags, int mode)
 {
         char *logname;
         struct file *filp;
@@ -504,7 +505,7 @@ static struct file *llog_filp_open(char *name, int flags, int mode)
         if (logname == NULL)
                 return ERR_PTR(-ENOMEM);
 
-        len = snprintf(logname, PATH_MAX, "LOGS/%s", name);
+        len = snprintf(logname, PATH_MAX, "%s/%s", dir, name);
         if (len >= PATH_MAX - 1) {
                 filp = ERR_PTR(-ENAMETOOLONG);
         } else {
@@ -513,7 +514,6 @@ static struct file *llog_filp_open(char *name, int flags, int mode)
                         CERROR("logfile creation %s: %ld\n", logname,
                                PTR_ERR(filp));
         }
-
         OBD_FREE(logname, PATH_MAX);
         return filp;
 }
@@ -572,7 +572,16 @@ static int llog_lvfs_create(struct llog_ctxt *ctxt, struct llog_handle **res,
                 handle->lgh_id = *logid;
 
         } else if (name) {
-                handle->lgh_file = llog_filp_open(name, open_flags, 0644);
+                /* COMPAT_146 */
+                if (strcmp(obd->obd_type->typ_name, LUSTRE_MDS_NAME) == 0) {
+                        handle->lgh_file = llog_filp_open(MDT_LOGS_DIR, name, 
+                                                          open_flags, 0644);
+                } else {
+                        /* end COMPAT_146 */
+                        handle->lgh_file = llog_filp_open(MOUNT_CONFIGS_DIR,
+                                                          name, open_flags, 
+                                                          0644);
+                }
                 if (IS_ERR(handle->lgh_file))
                         GOTO(cleanup, rc = PTR_ERR(handle->lgh_file));
 
@@ -639,12 +648,20 @@ static int llog_lvfs_destroy(struct llog_handle *handle)
 {
         struct dentry *fdentry;
         struct obdo *oa;
+        struct obd_device *obd = handle->lgh_ctxt->loc_exp->exp_obd;
+        char *dir;
         int rc;
         ENTRY;
 
+        /* COMPAT_146 */
+        if (strcmp(obd->obd_type->typ_name, LUSTRE_MDS_NAME) == 0)
+                dir = MDT_LOGS_DIR;
+        else
+                /* end COMPAT_146 */
+                dir = MOUNT_CONFIGS_DIR;
+
         fdentry = handle->lgh_file->f_dentry;
-        if (!strcmp(fdentry->d_parent->d_name.name, "LOGS")) {
-                struct obd_device *obd = handle->lgh_ctxt->loc_exp->exp_obd;
+        if (strcmp(fdentry->d_parent->d_name.name, dir) == 0) {
                 struct inode *inode = fdentry->d_parent->d_inode;
                 struct lvfs_run_ctxt saved;
 
@@ -692,7 +709,8 @@ int llog_get_cat_list(struct obd_device *obd, struct obd_device *disk_obd,
         int size = sizeof(*idarray) * count;
         loff_t off = 0;
 
-        LASSERT(count);
+        if (!count) 
+                return (0);
 
         push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
         file = filp_open(name, O_RDWR | O_CREAT | O_LARGEFILE, 0700);
@@ -702,17 +720,19 @@ int llog_get_cat_list(struct obd_device *obd, struct obd_device *disk_obd,
                        name, rc);
                 GOTO(out, rc);
         }
-
+        
         if (!S_ISREG(file->f_dentry->d_inode->i_mode)) {
                 CERROR("%s is not a regular file!: mode = %o\n", name,
                        file->f_dentry->d_inode->i_mode);
                 GOTO(out, rc = -ENOENT);
         }
 
+        CDEBUG(D_CONFIG, "cat list: disk size=%d, read=%d\n", 
+               (int)file->f_dentry->d_inode->i_size, size);
+
         rc = fsfilt_read_record(disk_obd, file, idarray, size, &off);
         if (rc) {
-                CDEBUG(D_INODE,"OBD filter: error reading %s: rc %d\n",
-                       name, rc);
+                CERROR("OBD filter: error reading %s: rc %d\n", name, rc);
                 GOTO(out, rc);
         }
 
@@ -734,7 +754,8 @@ int llog_put_cat_list(struct obd_device *obd, struct obd_device *disk_obd,
         int size = sizeof(*idarray) * count;
         loff_t off = 0;
 
-        LASSERT(count);
+        if (!count) 
+                return (0);
 
         push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
         file = filp_open(name, O_RDWR | O_CREAT | O_LARGEFILE, 0700);
index 4833c29..5f6e680 100644 (file)
 
 /* helper functions for calling the llog obd methods */
 
+int llog_cleanup(struct llog_ctxt *ctxt)
+{
+        int rc = 0;
+        ENTRY;
+
+        if (!ctxt) {
+                CERROR("No ctxt\n");
+                RETURN(-ENODEV);
+        }
+        
+        if (CTXTP(ctxt, cleanup))
+                rc = CTXTP(ctxt, cleanup)(ctxt);
+
+        ctxt->loc_obd->obd_llog_ctxt[ctxt->loc_idx] = NULL;
+        if (ctxt->loc_exp)
+                class_export_put(ctxt->loc_exp);
+        OBD_FREE(ctxt, sizeof(*ctxt));
+
+        RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cleanup);
+
 int llog_setup(struct obd_device *obd, int index, struct obd_device *disk_obd,
                int count, struct llog_logid *logid, struct llog_operations *op)
 {
@@ -49,6 +71,17 @@ int llog_setup(struct obd_device *obd, int index, struct obd_device *disk_obd,
         if (index < 0 || index >= LLOG_MAX_CTXTS)
                 RETURN(-EFAULT);
 
+        if (obd->obd_llog_ctxt[index]) {
+        /* During an mds_lov_add_ost, we try to tear down and resetup llogs.
+           But the mdt teardown does not flow down to the lov/osc's as the 
+           setup does, because the lov/osc must clean up only when they are
+           done, not when the mdt is done. So instead, we just assume that
+           if the lov llogs are already set up then we must cleanup first. */
+                CDEBUG(D_CONFIG, "obd %s ctxt %d already set up\n", 
+                       obd->obd_name, index);
+                llog_cleanup(obd->obd_llog_ctxt[index]);
+        }
+
         OBD_ALLOC(ctxt, sizeof(*ctxt));
         if (!ctxt)
                 RETURN(-ENOMEM);
@@ -67,28 +100,6 @@ int llog_setup(struct obd_device *obd, int index, struct obd_device *disk_obd,
 }
 EXPORT_SYMBOL(llog_setup);
 
-int llog_cleanup(struct llog_ctxt *ctxt)
-{
-        int rc = 0;
-        ENTRY;
-
-        if (!ctxt) {
-                CERROR("No ctxt\n");
-                RETURN(-ENODEV);
-        }
-        
-        if (CTXTP(ctxt, cleanup))
-                rc = CTXTP(ctxt, cleanup)(ctxt);
-
-        ctxt->loc_obd->obd_llog_ctxt[ctxt->loc_idx] = NULL;
-        if (ctxt->loc_exp)
-                class_export_put(ctxt->loc_exp);
-        OBD_FREE(ctxt, sizeof(*ctxt));
-
-        RETURN(rc);
-}
-EXPORT_SYMBOL(llog_cleanup);
-
 int llog_sync(struct llog_ctxt *ctxt, struct obd_export *exp)
 {
         int rc = 0;
index e12003f..4f45df0 100644 (file)
@@ -239,11 +239,9 @@ void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg)
         }
 
         __swab32s(&lcfg->lcfg_command);
-
         __swab32s(&lcfg->lcfg_num);
         __swab32s(&lcfg->lcfg_flags);
         __swab64s(&lcfg->lcfg_nid);
-
         __swab32s(&lcfg->lcfg_bufcount);
         for (i = 0; i < lcfg->lcfg_bufcount && i < LUSTRE_CFG_MAX_BUFCOUNT; i++)
                 __swab32s(&lcfg->lcfg_buflens[i]);
index 2a42138..920ba03 100644 (file)
@@ -81,6 +81,8 @@ int lustre_uuid_to_peer(char *uuid, lnet_nid_t *peer_nid, int index)
         return -ENOENT;
 }
 
+/* Add a nid to a niduuid.  Multiple nids can be added to a single uuid; 
+   LNET will choose the best one. */
 int class_add_uuid(char *uuid, __u64 nid)
 {
         struct uuid_nid_data *data;
@@ -114,7 +116,7 @@ int class_add_uuid(char *uuid, __u64 nid)
         return 0;
 }
 
-/* delete only one entry if uuid is specified, otherwise delete all */
+/* Delete the nids for one uuid if specified, otherwise delete all */
 int class_del_uuid (char *uuid)
 {
         struct list_head  deathrow;
index 6befd70..ff70f56 100644 (file)
@@ -1,7 +1,7 @@
 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
  * vim:expandtab:shiftwidth=8:tabstop=8:
  *
- *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2006 Cluster File Systems, Inc.
  *
  *   This file is part of the Lustre file system, http://www.lustre.org
  *   Lustre is a trademark of Cluster File Systems, Inc.
@@ -38,6 +38,8 @@
 #include <libcfs/list.h>
 
 
+/********************** class fns **********************/
+
 /* Create a new device and set the type, name and uuid.  If
  * successful, the new device can be accessed by either name or uuid.
  */
@@ -222,6 +224,7 @@ int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
         RETURN(0);
 
 err_exp:
+        CERROR("setup %s failed (%d)\n", obd->obd_name, err);
         class_unlink_export(obd->obd_self_export);
         obd->obd_self_export = NULL;
         obd->obd_starting = 0;
@@ -334,8 +337,8 @@ int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg)
                                 obd->obd_force = 1;
                                 break;
                         case 'A':
-                                LCONSOLE_WARN("Failing %s by user command\n",
-                                       obd->obd_name);
+                                LCONSOLE_WARN("Failing over %s\n", 
+                                              obd->obd_name);
                                 obd->obd_fail = 1;
                                 obd->obd_no_transno = 1;
                                 obd->obd_no_recov = 1;
@@ -458,7 +461,8 @@ int class_add_conn(struct obd_device *obd, struct lustre_cfg *lcfg)
                 RETURN(-EINVAL);
         }
         if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) &&
-            strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME)) {
+            strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) && 
+            strcmp(obd->obd_type->typ_name, LUSTRE_MGC_NAME)) {
                 CERROR("can't add connection on non-client dev\n");
                 RETURN(-EINVAL);
         }
@@ -529,7 +533,7 @@ int class_add_profile(int proflen, char *prof, int osclen, char *osc,
         ENTRY;
         OBD_ALLOC(lprof, sizeof(*lprof));
         if (lprof == NULL)
-                GOTO(out, err = -ENOMEM);
+                RETURN(-ENOMEM);
         CFS_INIT_LIST_HEAD(&lprof->lp_list);
 
         LASSERT(proflen == (strlen(prof) + 1));
@@ -540,7 +544,7 @@ int class_add_profile(int proflen, char *prof, int osclen, char *osc,
 
         LASSERT(osclen == (strlen(osc) + 1));
         OBD_ALLOC(lprof->lp_osc, osclen);
-        if (lprof->lp_profile == NULL)
+        if (lprof->lp_osc == NULL)
                 GOTO(out, err = -ENOMEM);
         memcpy(lprof->lp_osc, osc, osclen);
 
@@ -553,8 +557,16 @@ int class_add_profile(int proflen, char *prof, int osclen, char *osc,
         }
 
         list_add(&lprof->lp_list, &lustre_profile_list);
+        RETURN(err);
 
 out:
+        if (lprof->lp_mdc)
+                OBD_FREE(lprof->lp_mdc, mdclen);
+        if (lprof->lp_osc)
+                OBD_FREE(lprof->lp_osc, osclen);
+        if (lprof->lp_profile)
+                OBD_FREE(lprof->lp_profile, proflen);
+        OBD_FREE(lprof, sizeof(*lprof));        
         RETURN(err);
 }
 
@@ -621,8 +633,6 @@ int class_process_config(struct lustre_cfg *lcfg)
         case LCFG_DEL_MOUNTOPT: {
                 CDEBUG(D_IOCTL, "mountopt: profile %s\n",
                        lustre_cfg_string(lcfg, 1));
-                /* set these mount options somewhere, so ll_fill_super
-                 * can find them. */
                 class_del_profile(lustre_cfg_string(lcfg, 1));
                 GOTO(out, err = 0);
         }
@@ -643,9 +653,11 @@ int class_process_config(struct lustre_cfg *lcfg)
                         sizeof (obd_lustre_upcall));
                 GOTO(out, err = 0);
         }
-        case LCFG_PARAM: 
         case LCFG_MARKER: {
-                LCONSOLE_WARN("LCFG_MARKER not yet implemented.\n");
+                struct cfg_marker *marker;
+                marker = lustre_cfg_buf(lcfg, 1);
+                CDEBUG(D_IOCTL, "marker %d (%#x) %.16s %s\n", marker->cm_step,
+                       marker->cm_flags, marker->cm_svname, marker->cm_comment);
                 GOTO(out, err = 0);
         }
         }
@@ -690,17 +702,34 @@ int class_process_config(struct lustre_cfg *lcfg)
         }
         }
 out:
+        if ((err == -ENOSYS || err == -EINVAL) && 
+            !(lcfg->lcfg_command & LCFG_REQUIRED)) {
+                CWARN("Skipping optional command %#x\n", lcfg->lcfg_command);
+                err = 0;
+        }
         return err;
 }
 
+int class_config_dump_handler(struct llog_handle * handle,
+                              struct llog_rec_hdr *rec, void *data);
+
+#ifdef __KERNEL__
+extern int lustre_check_exclusion(struct super_block *sb, char *svname);
+#else
+#define lustre_check_exclusion(a,b)  0
+#endif
+
 static int class_config_llog_handler(struct llog_handle * handle,
                                      struct llog_rec_hdr *rec, void *data)
 {
-        struct config_llog_instance *cfg = data;
+        struct config_llog_instance *clli = data;
         int cfg_len = rec->lrh_len;
         char *cfg_buf = (char*) (rec + 1);
         int rc = 0;
         ENTRY;
+        
+        //class_config_dump_handler(handle, rec, data);
+
         switch (rec->lrh_type) {
         case OBD_CFG_REC: {
                 struct lustre_cfg *lcfg, *lcfg_new;
@@ -717,23 +746,74 @@ static int class_config_llog_handler(struct llog_handle * handle,
                 if (rc)
                         GOTO(out, rc);
 
+                /* Figure out config state info */
+                if (lcfg->lcfg_command == LCFG_MARKER) {
+                        struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1);
+                        CDEBUG(D_CONFIG, "Marker, cfg_flg=%#x\n",
+                               clli->cfg_flags);
+                        if (marker->cm_flags & CM_START) {
+                                /* all previous flags off */
+                                clli->cfg_flags = CFG_F_MARKER;
+                                if (marker->cm_flags & CM_SKIP) { 
+                                        clli->cfg_flags |= CFG_F_SKIP;
+                                        CDEBUG(D_CONFIG, "SKIP #%d\n",
+                                               marker->cm_step);
+                                } else if (lustre_check_exclusion(clli->cfg_sb, 
+                                                          marker->cm_svname)) {
+                                        clli->cfg_flags |= CFG_F_EXCLUDE;
+                                        CDEBUG(D_CONFIG, "EXCLUDE %d\n",
+                                               marker->cm_step);
+                                }
+                        } else if (marker->cm_flags & CM_END) {
+                                clli->cfg_flags = 0;
+                        }
+                }
+                /* A config command without a start marker before it is 
+                   illegal (1.4.6. compat must set it artificially) */
+                if (!(clli->cfg_flags & CFG_F_MARKER) && 
+                    (lcfg->lcfg_command != LCFG_MARKER)) {
+                        CWARN("Config not inside markers, ignoring! (%#x)\n", 
+                              clli->cfg_flags);
+                        clli->cfg_flags |= CFG_F_SKIP;
+                }
+                
+                if (clli->cfg_flags & CFG_F_SKIP) {
+                        // FIXME warning
+                        CDEBUG(D_CONFIG|D_WARNING, "skipping %#x\n",
+                               clli->cfg_flags);
+                        rc = 0;
+                        /* No processing! */
+                        break;
+                }
+
+                if ((clli->cfg_flags & CFG_F_EXCLUDE) && 
+                    (lcfg->lcfg_command == LCFG_LOV_ADD_OBD))
+                        /* Add inactive instead */
+                        lcfg->lcfg_command = LCFG_LOV_ADD_INA;
+
                 lustre_cfg_bufs_init(&bufs, lcfg);
 
-                if (cfg && cfg->cfg_instance && LUSTRE_CFG_BUFLEN(lcfg, 0) > 0) {
+                if (clli && clli->cfg_instance && LUSTRE_CFG_BUFLEN(lcfg, 0) > 0){
                         inst = 1;
                         inst_len = LUSTRE_CFG_BUFLEN(lcfg, 0) +
-                                strlen(cfg->cfg_instance) + 1;
+                                strlen(clli->cfg_instance) + 1;
                         OBD_ALLOC(inst_name, inst_len);
                         if (inst_name == NULL)
                                 GOTO(out, rc = -ENOMEM);
                         sprintf(inst_name, "%s-%s",
                                 lustre_cfg_string(lcfg, 0),
-                                cfg->cfg_instance);
+                                clli->cfg_instance);
                         lustre_cfg_bufs_set_string(&bufs, 0, inst_name);
+                        CDEBUG(D_CONFIG, "cmd %x, instance name: %s\n", 
+                               lcfg->lcfg_command, inst_name);
                 }
 
-                if (cfg && lcfg->lcfg_command == LCFG_ATTACH) {
-                        lustre_cfg_bufs_set_string(&bufs, 2, cfg->cfg_uuid.uuid);
+                /* we override the llog's uuid for clients, to insure they
+                are unique */
+                if (clli && clli->cfg_instance && 
+                    lcfg->lcfg_command == LCFG_ATTACH) {
+                        lustre_cfg_bufs_set_string(&bufs, 2,
+                                                   clli->cfg_uuid.uuid);
                 }
 
                 lcfg_new = lustre_cfg_new(lcfg->lcfg_command, &bufs);
@@ -765,22 +845,23 @@ static int class_config_llog_handler(struct llog_handle * handle,
                         OBD_FREE(inst_name, inst_len);
                 break;
         }
-        case PTL_CFG_REC: {
-                CWARN("Ignoring obsolete portals config\n");
-                break;
-        }
         default:
                 CERROR("Unknown llog record type %#x encountered\n",
                        rec->lrh_type);
                 break;
         }
 out:
+        if (rc) {
+                CERROR("Err %d on cfg command:\n", rc);
+                class_config_dump_handler(handle, rec, data);
+        }
         RETURN(rc);
 }
 
 int class_config_parse_llog(struct llog_ctxt *ctxt, char *name,
                             struct config_llog_instance *cfg)
 {
+        struct llog_process_cat_data cd = {0, 0};
         struct llog_handle *llh;
         int rc, rc2;
         ENTRY;
@@ -794,14 +875,25 @@ int class_config_parse_llog(struct llog_ctxt *ctxt, char *name,
         if (rc)
                 GOTO(parse_out, rc);
 
-        rc = llog_process(llh, class_config_llog_handler, cfg, NULL);
+        /* continue processing from where we last stopped to end-of-log */
+        if (cfg)
+                cd.first_idx = cfg->cfg_last_idx;
+        cd.last_idx = 0;
+
+        rc = llog_process(llh, class_config_llog_handler, cfg, &cd);
+
+        /* FIXME remove warning */
+        CDEBUG(D_CONFIG|D_WARNING, "Processed log %s gen %d-%d (rc=%d)\n", name, 
+               cd.first_idx + 1, cd.last_idx, rc);
+        if (cfg)
+                cfg->cfg_last_idx = cd.last_idx;
+
 parse_out:
         rc2 = llog_close(llh);
         if (rc == 0)
                 rc = rc2;
 
         RETURN(rc);
-
 }
 
 int class_config_dump_handler(struct llog_handle * handle,
@@ -809,8 +901,16 @@ int class_config_dump_handler(struct llog_handle * handle,
 {
         int cfg_len = rec->lrh_len;
         char *cfg_buf = (char*) (rec + 1);
+        char *outstr, *ptr, *end;
         int rc = 0;
         ENTRY;
+
+        OBD_ALLOC(outstr, 256);
+        end = outstr + 256;
+        ptr = outstr;
+        if (!outstr) {
+                RETURN(-ENOMEM);
+        }
         if (rec->lrh_type == OBD_CFG_REC) {
                 struct lustre_cfg *lcfg;
                 int i;
@@ -820,30 +920,39 @@ int class_config_dump_handler(struct llog_handle * handle,
                         GOTO(out, rc);
                 lcfg = (struct lustre_cfg *)cfg_buf;
 
-                CDEBUG(D_INFO, "lcfg command: %x\n", lcfg->lcfg_command);
-                if (LUSTRE_CFG_BUFLEN(lcfg, 0) > 0)
-                        CDEBUG(D_INFO, "     devname: %s\n",
-                               lustre_cfg_string(lcfg, 0));
-                if (lcfg->lcfg_flags)
-                        CDEBUG(D_INFO, "       flags: %x\n", lcfg->lcfg_flags);
-                if (lcfg->lcfg_nid)
-                        CDEBUG(D_INFO, "         nid: %s\n",
-                               libcfs_nid2str(lcfg->lcfg_nid));
-                if (lcfg->lcfg_nal)
-                        CDEBUG(D_INFO, "         nal: %x (obsolete)\n", lcfg->lcfg_nal);
-                if (lcfg->lcfg_num)
-                        CDEBUG(D_INFO, "         num: %x\n", lcfg->lcfg_num);
-                for (i = 1; i < lcfg->lcfg_bufcount; i++)
-                        if (LUSTRE_CFG_BUFLEN(lcfg, i) > 0)
-                                CDEBUG(D_INFO, "     inlbuf%d: %s\n", i,
-                                       lustre_cfg_string(lcfg, i));
-        } else if (rec->lrh_type == PTL_CFG_REC) {
-                CDEBUG(D_INFO, "Obsolete pcfg command\n");
+                ptr += snprintf(ptr, end-ptr, "cmd=%05x ",
+                                lcfg->lcfg_command);
+                if (lcfg->lcfg_flags) {
+                        ptr += snprintf(ptr, end-ptr, "flags=%#08x ",
+                                        lcfg->lcfg_flags);
+                }
+                if (lcfg->lcfg_num) {
+                        ptr += snprintf(ptr, end-ptr, "num=%#08x ",
+                                        lcfg->lcfg_num);
+                }
+                if (lcfg->lcfg_nid) {
+                        ptr += snprintf(ptr, end-ptr, "nid=%s("LPX64")\n     ",
+                                        libcfs_nid2str(lcfg->lcfg_nid),
+                                        lcfg->lcfg_nid);
+                }
+                if (lcfg->lcfg_command == LCFG_MARKER) {
+                        struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1);
+                        ptr += snprintf(ptr, end-ptr, "marker=%d(%#x)%s '%s'",
+                                        marker->cm_step, marker->cm_flags, 
+                                        marker->cm_svname, marker->cm_comment);
+                } else {
+                        for (i = 0; i <  lcfg->lcfg_bufcount; i++) {
+                                ptr += snprintf(ptr, end-ptr, "%d:%s  ", i,
+                                                lustre_cfg_string(lcfg, i));
+                        }
+                }
+                LCONSOLE(D_WARNING, "   %s\n", outstr);
         } else {
-                CERROR("unhandled lrh_type: %#x\n", rec->lrh_type);
+                LCONSOLE(D_WARNING, "unhandled lrh_type: %#x\n", rec->lrh_type);
                 rc = -EINVAL;
         }
 out:
+        OBD_FREE(outstr, 256);
         RETURN(rc);
 }
 
@@ -854,6 +963,8 @@ int class_config_dump_llog(struct llog_ctxt *ctxt, char *name,
         int rc, rc2;
         ENTRY;
 
+        LCONSOLE_INFO("Dumping config log %s\n", name);
+
         rc = llog_create(ctxt, &llh, NULL, name);
         if (rc)
                 RETURN(rc);
@@ -868,23 +979,23 @@ parse_out:
         if (rc == 0)
                 rc = rc2;
 
+        LCONSOLE_INFO("End config log %s\n", name);
         RETURN(rc);
 
 }
 
 /* Cleanup and detach */
-void class_manual_cleanup(struct obd_device *obd)
+int class_manual_cleanup(struct obd_device *obd)
 {
         struct lustre_cfg *lcfg;
         struct lustre_cfg_bufs bufs;
-        int err;
+        int rc;
         char flags[3]="";
         ENTRY;
 
         if (!obd) {
                 CERROR("empty cleanup\n");
-                EXIT;
-                return;
+                RETURN(-EALREADY);
         }
 
         if (obd->obd_force)
@@ -899,16 +1010,19 @@ void class_manual_cleanup(struct obd_device *obd)
         lustre_cfg_bufs_set_string(&bufs, 1, flags);
         lcfg = lustre_cfg_new(LCFG_CLEANUP, &bufs);
 
-        err = class_process_config(lcfg);
-        if (err)
-                CERROR("cleanup failed %d: %s\n", err, obd->obd_name);
+        rc = class_process_config(lcfg);
+        if (rc) {
+                CERROR("cleanup failed %d: %s\n", rc, obd->obd_name);
+                GOTO(out, rc);
+        }
 
         /* the lcfg is almost the same for both ops */
         lcfg->lcfg_command = LCFG_DETACH;
-        err = class_process_config(lcfg);
+        rc = class_process_config(lcfg);
+        if (rc)
+                CERROR("detach failed %d: %s\n", rc, obd->obd_name);
+out:
         lustre_cfg_free(lcfg);
-        if (err)
-                CERROR("detach failed %d: %s\n", err, obd->obd_name);
-        EXIT;
+        RETURN(rc);
 }
 
diff --git a/lustre/obdclass/obd_mount.c b/lustre/obdclass/obd_mount.c
new file mode 100644 (file)
index 0000000..01a9b1b
--- /dev/null
@@ -0,0 +1,1919 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  lustre/obdclass/obd_mount.c
+ *  Client/server mount routines
+ *
+ *  Copyright (c) 2006 Cluster File Systems, Inc.
+ *   Author: Nathan Rutman <nathan@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org/
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+
+#define DEBUG_SUBSYSTEM S_MGMT
+#define D_MOUNT D_SUPER|D_CONFIG /*|D_WARNING */
+#define PRINT_CMD LCONSOLE
+#define PRINT_MASK D_SUPER
+
+#include <obd.h>
+#include <lvfs.h>
+#include <lustre_fsfilt.h>
+#include <obd_class.h>
+#include <lustre/lustre_user.h>
+#include <linux/version.h> 
+#include <lustre_log.h>
+#include <lustre_disk.h>
+#include <lustre_param.h>
+#include <lustre_ver.h>
+                      
+static int (*client_fill_super)(struct super_block *sb) = NULL;
+
+/*********** string parsing utils *********/
+
+/* returns 0 if we find this key in the buffer, else 1 */
+int class_find_param(char *buf, char *key, char **valp)
+{
+        char *ptr;
+
+        if (!buf) 
+                return 1;
+
+        if ((ptr = strstr(buf, key)) == NULL) 
+                return 1;
+
+        if (valp) 
+                *valp = ptr + strlen(key);
+        
+        return 0;
+}
+
+/* returns 0 if this is the first key in the buffer, else 1 */
+int class_match_param(char *buf, char *key, char **valp)
+{
+        if (!buf) 
+                return 1;
+
+        if (memcmp(buf, key, strlen(key)) != 0) 
+                return 1;
+
+        if (valp) 
+                *valp = buf + strlen(key);
+        
+        return 0;
+}
+
+/* 0 is good nid, 
+   1 not found
+   < 0 error
+   endh is set to next separator */
+int class_parse_nid(char *buf, lnet_nid_t *nid, char **endh)
+{
+        char tmp, *endp;
+
+        if (!buf) 
+                return 1;
+        while (*buf == ',' || *buf == ':') 
+                buf++;
+        if (*buf == ' ' || *buf == '/' || *buf == '\0') 
+                return 1;
+
+        /* nid separators or end of nids */
+        endp = strpbrk(buf, ",: /");
+        if (endp == NULL) 
+                endp = buf + strlen(buf);
+
+        tmp = *endp;
+        *endp = '\0';
+        *nid = libcfs_str2nid(buf);
+        if (*nid == LNET_NID_ANY) {
+                LCONSOLE_ERROR("Can't parse NID '%s'\n", buf);
+                *endp = tmp;
+                return -EINVAL;
+        }
+        *endp = tmp;
+
+        if (endh) 
+                *endh = endp;
+        CDEBUG(D_MOUNT, "Nid %s\n", libcfs_nid2str(*nid));
+        return 0;
+}
+
+/*********** mount lookup *********/
+
+DECLARE_MUTEX(lustre_mount_info_lock);
+struct list_head server_mount_info_list = LIST_HEAD_INIT(server_mount_info_list);
+
+static struct lustre_mount_info *server_find_mount(char *name)
+{
+        struct list_head *tmp;
+        struct lustre_mount_info *lmi;
+        ENTRY;
+
+        list_for_each(tmp, &server_mount_info_list) {
+                lmi = list_entry(tmp, struct lustre_mount_info, lmi_list_chain);
+                if (strcmp(name, lmi->lmi_name) == 0) 
+                        RETURN(lmi);
+        }
+        RETURN(NULL);
+}
+
+/* we must register an obd for a mount before we call the setup routine.  
+   *_setup will call lustre_get_mount to get the mnt struct
+   by obd_name, since we can't pass the pointer to setup. */
+static int server_register_mount(char *name, struct super_block *sb,
+                          struct vfsmount *mnt)
+{
+        struct lustre_mount_info *lmi;
+        char *name_cp;
+        ENTRY;
+
+        LASSERT(mnt);
+        LASSERT(sb);
+
+        OBD_ALLOC(lmi, sizeof(*lmi));
+        if (!lmi) 
+                RETURN(-ENOMEM);
+        OBD_ALLOC(name_cp, strlen(name) + 1);
+        if (!name_cp) { 
+                OBD_FREE(lmi, sizeof(*lmi));
+                RETURN(-ENOMEM);
+        }
+        strcpy(name_cp, name);
+
+        down(&lustre_mount_info_lock);
+        
+        if (server_find_mount(name)) {
+                up(&lustre_mount_info_lock);
+                OBD_FREE(lmi, sizeof(*lmi));
+                OBD_FREE(name_cp, strlen(name) + 1);
+                CERROR("Already registered %s\n", name);
+                RETURN(-EEXIST);
+        }
+        lmi->lmi_name = name_cp;
+        lmi->lmi_sb = sb;
+        lmi->lmi_mnt = mnt;
+        list_add(&lmi->lmi_list_chain, &server_mount_info_list);
+         
+        up(&lustre_mount_info_lock);
+
+        CDEBUG(D_MOUNT, "reg_mnt %p from %s, vfscount=%d\n", 
+               lmi->lmi_mnt, name, atomic_read(&lmi->lmi_mnt->mnt_count));
+
+        RETURN(0);
+}
+
+/* when an obd no longer needs a mount */
+static int server_deregister_mount(char *name)
+{
+        struct lustre_mount_info *lmi;
+        ENTRY;
+
+        down(&lustre_mount_info_lock);
+        lmi = server_find_mount(name);
+        if (!lmi) {
+                up(&lustre_mount_info_lock);
+                CERROR("%s not registered\n", name);
+                RETURN(-ENOENT);
+        }
+        
+        CDEBUG(D_MOUNT, "dereg_mnt %p from %s, vfscount=%d\n", 
+               lmi->lmi_mnt, name, atomic_read(&lmi->lmi_mnt->mnt_count));
+        
+        OBD_FREE(lmi->lmi_name, strlen(lmi->lmi_name) + 1);
+        list_del(&lmi->lmi_list_chain);
+        OBD_FREE(lmi, sizeof(*lmi));
+        up(&lustre_mount_info_lock);
+
+        RETURN(0);
+}
+
+/* Deregister anyone referencing the mnt. Everyone should have
+   put_mount in *_cleanup, but this is a catch-all in case of err... */
+/* FIXME this should be removed from lustre_free_lsi, which may be called
+   from server_put_mount _before_ it gets to server_deregister_mount. 
+   Leave it here for now for the error message it shows... */
+static void server_deregister_mount_all(struct vfsmount *mnt)
+{
+        struct list_head *tmp, *n;
+        struct lustre_mount_info *lmi;
+        ENTRY;
+
+        if (!mnt) {
+                EXIT;
+                return;
+        }
+
+        //down(&lustre_mount_info_lock);
+        list_for_each_safe(tmp, n, &server_mount_info_list) {
+                lmi = list_entry(tmp, struct lustre_mount_info, lmi_list_chain);
+                if (lmi->lmi_mnt == mnt) {
+                        CERROR("Mount %p still referenced by %s\n", mnt,
+                               lmi->lmi_name);
+                        //OBD_FREE(lmi->lmi_name, strlen(lmi->lmi_name) + 1);
+                        //list_del(&lmi->lmi_list_chain);
+                        //OBD_FREE(lmi, sizeof(*lmi));
+                }
+        }
+        //up(&lustre_mount_info_lock);
+        EXIT;
+}
+
+/* obd's look up a registered mount using their name. This is just
+   for initial obd setup to find the mount struct.  It should not be
+   called every time you want to mntget. */
+struct lustre_mount_info *server_get_mount(char *name)
+{
+        struct lustre_mount_info *lmi;
+        struct lustre_sb_info *lsi;
+        ENTRY;
+
+        down(&lustre_mount_info_lock);
+        lmi = server_find_mount(name);
+        up(&lustre_mount_info_lock);
+        if (!lmi) {
+                CERROR("Can't find mount for %s\n", name);
+                RETURN(NULL);
+        }
+        lsi = s2lsi(lmi->lmi_sb);
+        mntget(lmi->lmi_mnt);
+        atomic_inc(&lsi->lsi_mounts);
+        
+        CDEBUG(D_MOUNT, "get_mnt %p from %s, refs=%d, vfscount=%d\n", 
+               lmi->lmi_mnt, name, atomic_read(&lsi->lsi_mounts),
+               atomic_read(&lmi->lmi_mnt->mnt_count));
+
+        RETURN(lmi);
+}
+
+static void unlock_mntput(struct vfsmount *mnt)
+{
+        if (kernel_locked()) {
+                unlock_kernel();
+                mntput(mnt);
+                lock_kernel();
+        } else {
+                mntput(mnt);
+        }
+}
+
+static int lustre_put_lsi(struct super_block *sb);
+
+/* to be called from obd_cleanup methods */
+int server_put_mount(char *name, struct vfsmount *mnt)
+{
+        struct lustre_mount_info *lmi;
+        struct lustre_sb_info *lsi;
+        ENTRY;
+
+        down(&lustre_mount_info_lock);
+        lmi = server_find_mount(name);
+        up(&lustre_mount_info_lock);
+        if (!lmi) {
+                CERROR("Can't find mount for %s\n", name);
+                RETURN(-ENOENT);
+        }
+        lsi = s2lsi(lmi->lmi_sb);
+        LASSERT(lmi->lmi_mnt == mnt);
+        unlock_mntput(lmi->lmi_mnt);
+
+        CDEBUG(D_MOUNT, "put_mnt %p from %s, refs=%d, vfscount=%d\n", 
+               lmi->lmi_mnt, name, atomic_read(&lsi->lsi_mounts),
+               atomic_read(&lmi->lmi_mnt->mnt_count));
+
+        if (lustre_put_lsi(lmi->lmi_sb)) {
+                CDEBUG(D_MOUNT, "Last put of mnt %p from %s, vfscount=%d\n", 
+                       lmi->lmi_mnt, name, 
+                       atomic_read(&lmi->lmi_mnt->mnt_count));
+                /* last mount is the One True Mount */
+                if (atomic_read(&lmi->lmi_mnt->mnt_count) > 1)
+                        CERROR("%s: mount busy, vfscount=%d!\n", name,
+                               atomic_read(&lmi->lmi_mnt->mnt_count));
+        }
+
+        /* this obd should never need the mount again */
+        server_deregister_mount(name);
+        
+        RETURN(0);
+}
+
+
+/******* mount helper utilities *********/
+
+static void ldd_print(struct lustre_disk_data *ldd)
+{
+        PRINT_CMD(PRINT_MASK, "  disk data:\n"); 
+        PRINT_CMD(PRINT_MASK, "config:  %d\n", ldd->ldd_config_ver);
+        PRINT_CMD(PRINT_MASK, "fs:      %s\n", ldd->ldd_fsname);
+        PRINT_CMD(PRINT_MASK, "server:  %s\n", ldd->ldd_svname);
+        PRINT_CMD(PRINT_MASK, "index:   %04x\n", ldd->ldd_svindex);
+        PRINT_CMD(PRINT_MASK, "flags:   %#x\n", ldd->ldd_flags);
+        PRINT_CMD(PRINT_MASK, "diskfs:  %s\n", MT_STR(ldd));
+        PRINT_CMD(PRINT_MASK, "options: %s\n", ldd->ldd_mount_opts);
+        PRINT_CMD(PRINT_MASK, "params: %s\n", ldd->ldd_params);
+}
+
+static int ldd_parse(struct lvfs_run_ctxt *mount_ctxt, 
+                           struct lustre_disk_data *ldd)
+{       
+        struct lvfs_run_ctxt saved;
+        struct file *file;
+        loff_t off = 0;
+        unsigned long len;
+        int rc;
+        ENTRY;
+               
+        push_ctxt(&saved, mount_ctxt, NULL);
+        
+        file = filp_open(MOUNT_DATA_FILE, O_RDONLY, 0644);
+        if (IS_ERR(file)) {
+                rc = PTR_ERR(file);
+                CERROR("cannot open %s: rc = %d\n", MOUNT_DATA_FILE, rc);
+                GOTO(out, rc);
+        }
+        len = file->f_dentry->d_inode->i_size;
+        CDEBUG(D_MOUNT, "Have %s, size %lu\n", MOUNT_DATA_FILE, len);
+        if (len != sizeof(*ldd)) {
+                CERROR("disk data size does not match: see %lu expect %u\n", 
+                       len, sizeof(*ldd));
+                GOTO(out_close, rc = -EINVAL);
+        }
+
+        rc = lustre_fread(file, ldd, len, &off);
+        if (rc != len) {
+                CERROR("error reading %s: read %d of %lu\n", 
+                       MOUNT_DATA_FILE, rc, len);
+                GOTO(out_close, rc = -EINVAL);
+        }
+        rc = 0;
+
+        if (ldd->ldd_magic != LDD_MAGIC) {
+                /* FIXME add swabbing support */
+                CERROR("Bad magic in %s: %x!=%x\n", MOUNT_DATA_FILE, 
+                       ldd->ldd_magic, LDD_MAGIC);
+                GOTO(out_close, rc = -EINVAL);
+        }
+        
+        if (ldd->ldd_feature_incompat & ~LDD_INCOMPAT_SUPP) {
+                CERROR("%s: unsupported incompat filesystem feature(s) %x\n",
+                       ldd->ldd_svname, 
+                       ldd->ldd_feature_incompat & ~LDD_INCOMPAT_SUPP);
+                GOTO(out_close, rc = -EINVAL);
+        }
+        if (ldd->ldd_feature_rocompat & ~LDD_ROCOMPAT_SUPP) {
+                CERROR("%s: unsupported read-only filesystem feature(s) %x\n",
+                       ldd->ldd_svname,  
+                       ldd->ldd_feature_rocompat & ~LDD_ROCOMPAT_SUPP);
+                /* Do something like remount filesystem read-only */
+                GOTO(out_close, rc = -EINVAL);
+        }
+
+        ldd_print(ldd);
+
+out_close:
+        filp_close(file, 0);
+out:
+        pop_ctxt(&saved, mount_ctxt, NULL);
+        RETURN(rc);
+}
+
+static int ldd_write(struct lvfs_run_ctxt *mount_ctxt, 
+                     struct lustre_disk_data *ldd)
+{       
+        struct lvfs_run_ctxt saved;
+        struct file *file;
+        loff_t off = 0;
+        unsigned long len = sizeof(struct lustre_disk_data);
+        int rc = 0;
+        ENTRY;
+
+        LASSERT(ldd->ldd_magic == LDD_MAGIC);
+        
+        ldd->ldd_config_ver++;  
+
+        push_ctxt(&saved, mount_ctxt, NULL);
+        
+        file = filp_open(MOUNT_DATA_FILE, O_RDWR, 0644);
+        if (IS_ERR(file)) {
+                rc = PTR_ERR(file);
+                CERROR("cannot open %s: rc = %d\n", MOUNT_DATA_FILE, rc);
+                GOTO(out, rc);
+        }
+        rc = lustre_fwrite(file, ldd, len, &off);
+        if (rc != len) {
+                CERROR("error writing %s: read %d of %lu\n", 
+                       MOUNT_DATA_FILE, rc, len);
+                GOTO(out_close, rc = -EINVAL);
+        }
+
+        rc = 0;
+        ldd_print(ldd);
+
+out_close:
+        filp_close(file, 0);
+out:
+        pop_ctxt(&saved, mount_ctxt, NULL);
+        RETURN(rc);
+}
+
+
+/**************** config llog ********************/
+
+/* Get a config log from the MGS and process it.
+   This func is called for both clients and servers.
+   Continue to process new statements appended to the logs
+   (whenever the config lock is revoked) until lustre_end_log
+   is called. */
+int lustre_process_log(struct super_block *sb, char *logname, 
+                     struct config_llog_instance *cfg)
+{
+        struct lustre_cfg *lcfg;
+        struct lustre_cfg_bufs bufs;
+        struct lustre_sb_info *lsi = s2lsi(sb);
+        struct obd_device *mgc = lsi->lsi_mgc;
+        int rc;
+        ENTRY;
+
+        LASSERT(mgc);
+        LASSERT(cfg);
+
+        /* mgc_process_config */
+        lustre_cfg_bufs_reset(&bufs, mgc->obd_name);
+        lustre_cfg_bufs_set_string(&bufs, 1, logname);
+        lustre_cfg_bufs_set(&bufs, 2, cfg, sizeof(*cfg));
+        lustre_cfg_bufs_set(&bufs, 3, &sb, sizeof(sb));
+        lcfg = lustre_cfg_new(LCFG_LOG_START, &bufs);
+        rc = obd_process_config(mgc, sizeof(*lcfg), lcfg);
+        lustre_cfg_free(lcfg);
+
+        if (rc) 
+                LCONSOLE_ERROR("%s: The configuration '%s' could not be read "
+                               "from the MGS (%d).  This may be the result of "
+                               "communication errors between this node and "
+                               "the MGS, or the MGS may not be running.\n",
+                               mgc->obd_name, logname, rc);
+
+        class_obd_list();
+        RETURN(rc);
+}
+
+/* Stop watching this config log for updates */
+int lustre_end_log(struct super_block *sb, char *logname, 
+                       struct config_llog_instance *cfg)
+{
+        struct lustre_cfg *lcfg;
+        struct lustre_cfg_bufs bufs;
+        struct lustre_sb_info *lsi = s2lsi(sb);
+        struct obd_device *mgc = lsi->lsi_mgc;
+        int rc;
+        ENTRY;
+
+        if (!mgc)
+                RETURN(-ENOENT);
+
+        /* mgc_process_config */
+        lustre_cfg_bufs_reset(&bufs, mgc->obd_name);
+        lustre_cfg_bufs_set_string(&bufs, 1, logname);
+        if (cfg)
+                lustre_cfg_bufs_set(&bufs, 2, cfg, sizeof(*cfg));
+        lcfg = lustre_cfg_new(LCFG_LOG_END, &bufs);
+        rc = obd_process_config(mgc, sizeof(*lcfg), lcfg);
+        lustre_cfg_free(lcfg);
+        RETURN(rc);
+}
+
+/**************** obd start *******************/
+
+static int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd,
+                   char *s1, char *s2, char *s3, char *s4)
+{
+        struct lustre_cfg_bufs bufs;
+        struct lustre_cfg    * lcfg = NULL;
+        int rc;
+
+        CDEBUG(D_TRACE, "lcfg %s %#x %s %s %s %s\n", cfgname,
+               cmd, s1, s2, s3, s4); 
+
+        lustre_cfg_bufs_reset(&bufs, cfgname);
+        if (s1) 
+                lustre_cfg_bufs_set_string(&bufs, 1, s1);
+        if (s2) 
+                lustre_cfg_bufs_set_string(&bufs, 2, s2);
+        if (s3) 
+                lustre_cfg_bufs_set_string(&bufs, 3, s3);
+        if (s4) 
+                lustre_cfg_bufs_set_string(&bufs, 4, s4);
+
+        lcfg = lustre_cfg_new(cmd, &bufs);
+        lcfg->lcfg_nid = nid;
+        rc = class_process_config(lcfg);
+        lustre_cfg_free(lcfg);
+        return(rc);
+}
+
+static int lustre_start_simple(char *obdname, char *type, char *uuid, 
+                               char *s1, char *s2)
+{
+        int rc;
+        CDEBUG(D_MOUNT, "Starting obd %s (typ=%s)\n", obdname, type);
+
+        rc = do_lcfg(obdname, 0, LCFG_ATTACH, type, uuid, 0, 0);
+        if (rc) {
+                CERROR("%s attach error %d\n", obdname, rc);
+                return(rc);
+        }
+        rc = do_lcfg(obdname, 0, LCFG_SETUP, s1, s2, 0, 0);
+        if (rc) {
+                CERROR("%s setup error %d\n", obdname, rc);
+                do_lcfg(obdname, 0, LCFG_DETACH, 0, 0, 0, 0);
+        }
+        return rc;
+}
+
+/* Set up a MGS to serve startup logs */
+static int server_start_mgs(struct super_block *sb)
+{
+        struct lustre_sb_info    *lsi = s2lsi(sb);
+        struct vfsmount          *mnt = lsi->lsi_srv_mnt;
+        struct lustre_mount_info *lmi;
+        int    rc = 0;
+        ENTRY;
+        LASSERT(mnt);
+
+        /* It is impossible to have more than 1 MGS per node, since
+           MGC wouldn't know which to connect to */
+        lmi = server_find_mount(LUSTRE_MGS_OBDNAME);
+        if (lmi) {
+                lsi = s2lsi(lmi->lmi_sb);
+                LCONSOLE_ERROR("The MGS service was already started from "
+                               "server %s\n", lsi->lsi_ldd->ldd_svname);
+                RETURN(-EALREADY);
+        }
+
+        CDEBUG(D_CONFIG, "Start MGS service %s\n", LUSTRE_MGS_OBDNAME);
+
+        rc = server_register_mount(LUSTRE_MGS_OBDNAME, sb, mnt);
+
+        if (!rc &&
+            ((rc = lustre_start_simple(LUSTRE_MGS_OBDNAME, LUSTRE_MGS_NAME, 
+                                       LUSTRE_MGS_OBDNAME, 0, 0)))) 
+                server_deregister_mount(LUSTRE_MGS_OBDNAME);
+        
+        if (rc)                                
+                LCONSOLE_ERROR("Failed to start MGS '%s' (%d).  Is the 'mgs' "
+                               "module loaded?\n", LUSTRE_MGS_OBDNAME, rc);
+
+        RETURN(rc);
+}
+
+static int server_stop_mgs(struct super_block *sb)
+{
+        struct obd_device *obd;
+        int rc;
+        ENTRY;
+
+        CDEBUG(D_MOUNT, "Stop MGS service %s\n", LUSTRE_MGS_OBDNAME);
+
+        /* There better be only one MGS */
+        obd = class_name2obd(LUSTRE_MGS_OBDNAME);
+        if (!obd) {
+                CDEBUG(D_CONFIG, "mgs %s not running\n", LUSTRE_MGS_OBDNAME);
+                RETURN(-EALREADY);
+        }
+
+        /* The MGS should always stop when we say so */
+        obd->obd_force = 1;
+        rc = class_manual_cleanup(obd);
+        RETURN(rc);
+}
+
+/* Set up a mgcobd to process startup logs */
+static int lustre_start_mgc(struct super_block *sb)
+{
+        struct lustre_handle mgc_conn = {0, };
+        struct obd_connect_data ocd = { 0 };
+        struct lustre_sb_info *lsi = s2lsi(sb);
+        struct obd_device *obd;
+        struct obd_export *exp;
+        struct obd_uuid *uuid;
+        class_uuid_t uuidc;
+        lnet_nid_t nid;
+        char niduuid[10];
+        char *ptr;
+        int recov_bk = 0;
+        int rc = 0, i = 0, j;
+        ENTRY;
+
+        LASSERT(lsi->lsi_lmd);
+        
+        obd = class_name2obd(LUSTRE_MGC_OBDNAME);
+        if (obd) {
+                atomic_inc(&obd->u.cli.cl_mgc_refcount);
+                /* FIXME There's only one MGC, but users could give different
+                   MGS nids on the mount line.  So now do we add new MGS uuids
+                   or not?  If there's truly one MGS per site, the MGS uuids
+                   _should_ all be the same. Maybe check here?
+                */
+                
+                /* If we are restarting the MGS, don't try to keep the MGC's
+                   old connection, or registration will fail. */
+                if ((lsi->lsi_flags & LSI_SERVER) && IS_MGS(lsi->lsi_ldd)) {
+                        CDEBUG(D_MOUNT|D_ERROR, "New MGS with live MGC\n");
+                        recov_bk = 1;
+                }
+
+                /* Try all connections, but only once (again). 
+                   We don't want to block another target from starting
+                   (using its local copy of the log), but we do want to connect
+                   if at all possible. */
+                recov_bk++;
+                CDEBUG(D_MOUNT, "Set MGS reconnect %d\n", recov_bk);
+                rc = obd_set_info_async(obd->obd_self_export,
+                                        strlen(KEY_INIT_RECOV_BACKUP),
+                                        KEY_INIT_RECOV_BACKUP,
+                                        sizeof(recov_bk), &recov_bk, NULL);
+                GOTO(out, rc = 0);
+        }
+
+        CDEBUG(D_MOUNT, "Start MGC '%s'\n", LUSTRE_MGC_OBDNAME);
+
+        /* Add the primary nids for the MGS */
+        if (lsi->lsi_flags & LSI_SERVER) {
+                ptr = lsi->lsi_ldd->ldd_params;
+                if (IS_MGS(lsi->lsi_ldd)) {
+                        /* Use local nids (including LO) */
+                        lnet_process_id_t id;
+                        while ((rc = LNetGetId(i++, &id)) != -ENOENT) {
+                                rc = do_lcfg(LUSTRE_MGC_OBDNAME, id.nid,
+                                             LCFG_ADD_UUID, "mgsnid0", 0,0,0);
+                        }
+                } else {
+                        /* Use mgsnode= nids */
+                        if (class_find_param(ptr, PARAM_MGSNODE, &ptr) != 0) {
+                                CERROR("No MGS nids given.\n");
+                                RETURN(-EINVAL);
+                        }
+                        while (class_parse_nid(ptr, &nid, &ptr) == 0) {
+                                rc = do_lcfg(LUSTRE_MGC_OBDNAME, nid,
+                                             LCFG_ADD_UUID, "mgsnid0", 0,0,0);
+                                i++;
+                        }
+                }
+        } else { /* client */
+                /* use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */
+                ptr = lsi->lsi_lmd->lmd_dev;
+                while (class_parse_nid(ptr, &nid, &ptr) == 0) {
+                        rc = do_lcfg(LUSTRE_MGC_OBDNAME, nid,
+                                     LCFG_ADD_UUID, "mgsnid0", 0,0,0);
+                        i++;
+                        if (*ptr == ':') 
+                                break;
+                }
+        }
+        if (i == 0) {
+                CERROR("No valid MGS nids found.\n");
+                RETURN(-EINVAL);
+        }
+        lsi->lsi_lmd->lmd_mgs_failnodes = 1;
+
+        /* Random uuid for MGC allows easier reconnects */
+        OBD_ALLOC_PTR(uuid);
+        class_generate_random_uuid(uuidc);
+        class_uuid_unparse(uuidc, uuid);
+
+        /* Start the MGC */
+        rc = lustre_start_simple(LUSTRE_MGC_OBDNAME, LUSTRE_MGC_NAME, 
+                                 (char *)uuid->uuid, LUSTRE_MGS_OBDNAME,
+                                 "mgsnid0");
+        OBD_FREE_PTR(uuid);
+        if (rc) 
+                RETURN(rc);
+        
+        /* Add any failover MGS nids */
+        i = 1;
+        while ((*ptr == ':' || 
+                class_find_param(ptr, PARAM_MGSNODE, &ptr) == 0)) {
+                /* New failover node */
+                sprintf(niduuid, "mgsnid%d", i);
+                j = 0;
+                while (class_parse_nid(ptr, &nid, &ptr) == 0) {
+                        j++;
+                        rc = do_lcfg(LUSTRE_MGC_OBDNAME, nid,
+                                     LCFG_ADD_UUID, niduuid, 0,0,0);
+                        if (*ptr == ':') 
+                                break;
+                }
+                if (j > 0) {
+                        rc = do_lcfg(LUSTRE_MGC_OBDNAME, 0, LCFG_ADD_CONN,
+                                     niduuid, 0, 0, 0);
+                        i++;
+                } else {
+                        /* at ":/fsname" */
+                        break;
+                }
+        }
+        lsi->lsi_lmd->lmd_mgs_failnodes = i;
+        
+        obd = class_name2obd(LUSTRE_MGC_OBDNAME);
+        if (!obd) {
+                CERROR("Can't find mgcobd %s\n", LUSTRE_MGC_OBDNAME);
+                RETURN(-ENOTCONN);
+        }
+
+        /* Try all connections, but only once. */
+        recov_bk = 1;
+        rc = obd_set_info_async(obd->obd_self_export,
+                                strlen(KEY_INIT_RECOV_BACKUP), 
+                                KEY_INIT_RECOV_BACKUP,
+                                sizeof(recov_bk), &recov_bk, NULL);
+        if (rc) 
+                /* nonfatal */
+                CERROR("can't set %s %d\n", KEY_INIT_RECOV_BACKUP, rc);
+       
+        /* FIXME add ACL support? */
+        //ocd.ocd_connect_flags = OBD_CONNECT_ACL;
+
+        /* We connect to the MGS at setup, and don't disconnect until cleanup */
+        rc = obd_connect(&mgc_conn, obd, &(obd->obd_uuid), &ocd);
+        if (rc) {
+                CERROR("connect failed %d\n", rc);
+                GOTO(out, rc);
+        }
+        
+        exp = class_conn2export(&mgc_conn);
+        obd->u.cli.cl_mgc_mgsexp = exp;
+
+        /* And keep a refcount of servers/clients who started with "mount",
+           so we know when we can get rid of the mgc. */
+        atomic_set(&obd->u.cli.cl_mgc_refcount, 1);
+
+out:
+        /* Keep the mgc info in the sb. Note that many lsi's can point
+           to the same mgc.*/
+        lsi->lsi_mgc = obd;
+        RETURN(rc);
+}
+
+static int lustre_stop_mgc(struct super_block *sb)
+{
+        struct lustre_sb_info *lsi = s2lsi(sb);
+        struct obd_device *obd;
+        char niduuid[10];
+        int i, rc;
+        ENTRY;
+
+        if (!lsi)
+                RETURN(-ENOENT);
+        obd = lsi->lsi_mgc;
+        if (!obd)
+                RETURN(-ENOENT);
+
+        lsi->lsi_mgc = NULL;
+        if (!atomic_dec_and_test(&obd->u.cli.cl_mgc_refcount)) {
+                /* This is not fatal, every client that stops 
+                   will call in here. */
+                CDEBUG(D_MOUNT, "mgc still has %d references.\n", 
+                       atomic_read(&obd->u.cli.cl_mgc_refcount));
+                RETURN(-EBUSY); 
+        }
+
+        /* MGC must always stop */
+        obd->obd_force = 1;
+        /* client_disconnect_export uses the no_recov flag to decide whether it
+           should disconnect or just invalidate.  (The MGC has no
+           recoverable data in any case.) */
+        obd->obd_no_recov = 1;
+
+        if (obd->u.cli.cl_mgc_mgsexp)
+                obd_disconnect(obd->u.cli.cl_mgc_mgsexp);
+
+        rc = class_manual_cleanup(obd);
+        if (rc)
+                RETURN(rc);
+        
+        for (i = 0; i < lsi->lsi_lmd->lmd_mgs_failnodes; i++) {
+                sprintf(niduuid, "mgsnid%d", i);
+                rc = do_lcfg(obd->obd_name, 0, LCFG_DEL_UUID, 
+                             niduuid, 0, 0, 0);
+                if (rc)
+                        CERROR("del MDC UUID %s failed: rc = %d\n", 
+                               niduuid, rc);
+        }
+        /* class_import_put will get rid of the additional connections */
+
+        RETURN(0);
+}
+          
+/* Since there's only one mgc per node, we have to change it's fs to get
+   access to the right disk. */
+static int server_mgc_set_fs(struct obd_device *mgc, struct super_block *sb)
+{
+        struct lustre_sb_info *lsi = s2lsi(sb);
+        int rc;
+        ENTRY;
+
+        CDEBUG(D_MOUNT, "Set mgc disk for %s\n", lsi->lsi_lmd->lmd_dev);
+
+        /* cl_mgc_sem in mgc insures we sleep if the mgc_fs is busy */
+        rc = obd_set_info_async(mgc->obd_self_export,
+                                strlen("set_fs"), "set_fs",
+                                sizeof(*sb), sb, NULL);
+        if (rc) {
+                CERROR("can't set_fs %d\n", rc);
+        }
+
+        RETURN(rc);
+}
+
+static int server_mgc_clear_fs(struct obd_device *mgc)
+{
+        int rc;
+        ENTRY;
+
+        CDEBUG(D_MOUNT, "Unassign mgc disk\n");
+        
+        rc = obd_set_info_async(mgc->obd_self_export,
+                                strlen("clear_fs"), "clear_fs",
+                                0, NULL, NULL);
+        RETURN(rc);
+}
+
+/* Stop MDS/OSS if nobody is using them */
+static int server_stop_servers(int lddflags, int lsiflags)
+{
+        struct obd_device *obd = NULL;
+        struct obd_type *type = NULL;
+        int rc = 0;
+        ENTRY;
+
+        /* Either an MDT or an OST or neither  */
+
+        /* if this was an MDT, and there are no more MDT's, clean up the MDS */
+        if ((lddflags & LDD_F_SV_TYPE_MDT) && (obd = class_name2obd("MDS"))) {
+                //FIXME pre-rename, should eventually be LUSTRE_MDT_NAME
+                type = class_search_type(LUSTRE_MDS_NAME);
+        } 
+        /* if this was an OST, and there are no more OST's, clean up the OSS */
+        if ((lddflags & LDD_F_SV_TYPE_OST) && (obd = class_name2obd("OSS"))) {
+                type = class_search_type(LUSTRE_OST_NAME);
+        }
+
+        if (obd && (!type || !type->typ_refcnt)) {
+                int err;
+                obd->obd_force = 1;
+                /* obd_fail doesn't mean much on a server obd */
+                err = class_manual_cleanup(obd);
+                if (!rc) 
+                        rc = err;
+        }
+
+        RETURN(rc);
+}
+
+int server_mti_print(char *title, struct mgs_target_info *mti)
+{
+        PRINT_CMD(PRINT_MASK, "mti %s\n", title); 
+        PRINT_CMD(PRINT_MASK, "server: %s\n", mti->mti_svname); 
+        PRINT_CMD(PRINT_MASK, "fs:     %s\n", mti->mti_fsname); 
+        PRINT_CMD(PRINT_MASK, "uuid:   %s\n", mti->mti_uuid); 
+        PRINT_CMD(PRINT_MASK, "ver: %d  flags: %#x\n",
+                  mti->mti_config_ver, mti->mti_flags);
+        return(0);
+}
+
+static int server_sb2mti(struct super_block *sb, struct mgs_target_info *mti)
+{       
+        struct lustre_sb_info   *lsi = s2lsi(sb);
+        struct lustre_disk_data *ldd = lsi->lsi_ldd;
+        lnet_process_id_t        id;
+        int i = 0;
+        ENTRY;
+
+        if (!(lsi->lsi_flags & LSI_SERVER))
+                RETURN(-EINVAL);
+
+        strncpy(mti->mti_fsname, ldd->ldd_fsname,
+                sizeof(mti->mti_fsname));
+        strncpy(mti->mti_svname, ldd->ldd_svname,
+                sizeof(mti->mti_svname));
+        
+        mti->mti_nid_count = 0;
+        while (LNetGetId(i++, &id) != -ENOENT) {
+                if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND) 
+                        continue;
+                mti->mti_nids[mti->mti_nid_count] = id.nid;
+                mti->mti_nid_count++;
+                if (mti->mti_nid_count >= MTI_NIDS_MAX) {
+                        CWARN("Only using first %d nids for %s\n",
+                              mti->mti_nid_count, mti->mti_svname);
+                        break;
+                }
+        }    
+
+        mti->mti_config_ver = 0;
+        mti->mti_flags = ldd->ldd_flags;
+        mti->mti_stripe_index = ldd->ldd_svindex;
+        memcpy(mti->mti_uuid, ldd->ldd_uuid, sizeof(mti->mti_uuid));
+        if (strlen(ldd->ldd_params) > sizeof(mti->mti_params)) {
+                CERROR("params too big for mti\n");
+                RETURN(-ENOMEM);
+                /* FIXME we can't send a msg much bigger than 4k - use bulk? */
+        }
+        memcpy(mti->mti_params, ldd->ldd_params, sizeof(mti->mti_params));
+        RETURN(0);
+}
+
+/* Register an old or new target with the MGS. If needed MGS will construct
+   startup logs and assign index */
+int server_register_target(struct super_block *sb)
+{       
+        struct lustre_sb_info *lsi = s2lsi(sb);
+        struct obd_device *mgc = lsi->lsi_mgc;
+        struct lustre_disk_data *ldd = lsi->lsi_ldd;
+        struct mgs_target_info *mti = NULL;
+        int rc;
+        ENTRY;
+
+        LASSERT(mgc);
+
+        if (!(lsi->lsi_flags & LSI_SERVER))
+                RETURN(-EINVAL);
+
+        OBD_ALLOC_PTR(mti);
+        if (!mti) 
+                RETURN(-ENOMEM);
+        rc = server_sb2mti(sb, mti);
+        if (rc) 
+                GOTO(out, rc);
+
+        CDEBUG(D_MOUNT, "Registration %s, fs=%s, %s, index=%04x, flags=%#x\n",
+               mti->mti_svname, mti->mti_fsname,
+               libcfs_nid2str(mti->mti_nids[0]), mti->mti_stripe_index,
+               mti->mti_flags);
+
+        /* Register the target */
+        /* FIXME use mdc_process_config instead */
+        rc = obd_set_info_async(mgc->u.cli.cl_mgc_mgsexp,
+                                strlen("register_target"), "register_target",
+                                sizeof(*mti), mti, NULL);
+        if (rc) {
+                CERROR("registration with the MGS failed (%d)\n", rc);
+                GOTO(out, rc);
+        }
+
+        /* Always update our flags */
+        ldd->ldd_flags = mti->mti_flags & ~LDD_F_REWRITE_LDD;
+
+        /* If this flag is set, it means the MGS wants us to change our
+           on-disk data. (So far this means just the index.) */
+        if (mti->mti_flags & LDD_F_REWRITE_LDD) {
+                char *label;
+                int err;
+                CDEBUG(D_MOUNT, "Changing on-disk index from %#x to %#x "
+                       "for %s\n", ldd->ldd_svindex, mti->mti_stripe_index, 
+                       mti->mti_svname);
+                ldd->ldd_svindex = mti->mti_stripe_index;
+                strncpy(ldd->ldd_svname, mti->mti_svname, 
+                        sizeof(ldd->ldd_svname));
+                /* or ldd_make_sv_name(ldd); */
+                ldd_write(&mgc->obd_lvfs_ctxt, ldd);
+
+                err = fsfilt_set_label(mgc, lsi->lsi_srv_mnt->mnt_sb,
+                                       mti->mti_svname);
+                if (err)
+                        CERROR("Label set error %d\n", err);
+                label = fsfilt_get_label(mgc, lsi->lsi_srv_mnt->mnt_sb);
+                if (label) 
+                        CDEBUG(D_MOUNT, "Disk label changed to %s\n", label);
+        }
+
+out:
+        if (mti)        
+                OBD_FREE_PTR(mti);
+        RETURN(rc);
+}
+
+/* Start targets */
+static int server_start_targets(struct super_block *sb, struct vfsmount *mnt)
+{
+        struct obd_device *obd;
+        struct lustre_sb_info *lsi = s2lsi(sb);
+        struct config_llog_instance cfg;
+        int rc;
+        ENTRY;
+
+        CDEBUG(D_MOUNT, "starting target %s\n", lsi->lsi_ldd->ldd_svname);
+        
+        /* If we're an MDT, make sure the global MDS is running */
+        if (lsi->lsi_ldd->ldd_flags & LDD_F_SV_TYPE_MDT) {
+                /* make sure (what will be called) the MDS is started */
+                obd = class_name2obd("MDS");
+                if (!obd) {
+                        //FIXME pre-rename, should eventually be LUSTRE_MDS_NAME
+                        rc = lustre_start_simple("MDS", LUSTRE_MDT_NAME, 
+                                                 "MDS_uuid", 0, 0);
+                        if (rc) {
+                                CERROR("failed to start MDS: %d\n", rc);
+                                GOTO(out_servers, rc);
+                        }
+                }
+        }
+
+        /* If we're an OST, make sure the global OSS is running */
+        if (lsi->lsi_ldd->ldd_flags & LDD_F_SV_TYPE_OST) {
+                /* make sure OSS is started */
+                obd = class_name2obd("OSS");
+                if (!obd) {
+                        rc = lustre_start_simple("OSS", LUSTRE_OSS_NAME,
+                                                 "OSS_uuid", 0, 0);
+                        if (rc) {
+                                CERROR("failed to start OSS: %d\n", rc);
+                                GOTO(out_servers, rc);
+                        }
+                }
+        }
+
+        /* Set the mgc fs to our server disk.  This allows the MGC
+           to read and write configs locally. */
+        server_mgc_set_fs(lsi->lsi_mgc, sb);
+
+        /* Register with MGS */
+        rc = server_register_target(sb);
+        if (rc && (lsi->lsi_ldd->ldd_flags & 
+                   (LDD_F_NEED_INDEX | LDD_F_UPDATE | LDD_F_UPGRADE14))){
+                CERROR("Required registration failed for %s: %d\n", 
+                       lsi->lsi_ldd->ldd_svname, rc);
+                if (rc == -EIO) {
+                        LCONSOLE_ERROR("Communication error with the MGS.  Is "
+                                       "the MGS running?\n");
+                }
+                GOTO(out, rc);
+        }
+
+        /* Let the target look up the mount using the target's name 
+           (we can't pass the sb or mnt through class_process_config.) */
+        rc = server_register_mount(lsi->lsi_ldd->ldd_svname, sb, mnt);
+        if (rc) 
+                GOTO(out, rc);
+
+        /* Start targets using the llog named for the target */
+        memset(&cfg, 0, sizeof(cfg));
+        rc = lustre_process_log(sb, lsi->lsi_ldd->ldd_svname, &cfg);
+        if (rc) {
+                CERROR("failed to start server %s: %d\n",
+                       lsi->lsi_ldd->ldd_svname, rc);
+                GOTO(out, rc);
+        }
+
+        if (!class_name2obd(lsi->lsi_ldd->ldd_svname)) {
+                CERROR("no server named %s was started\n",
+                       lsi->lsi_ldd->ldd_svname);
+                rc = -ENXIO;
+        }
+        
+out:
+        /* Release the mgc fs for others to use */
+        server_mgc_clear_fs(lsi->lsi_mgc);
+
+out_servers:
+        RETURN(rc);
+}
+
+/***************** lustre superblock **************/
+
+struct lustre_sb_info *lustre_init_lsi(struct super_block *sb)
+{
+        struct lustre_sb_info *lsi = NULL;
+        ENTRY;
+
+        OBD_ALLOC(lsi, sizeof(*lsi));
+        if (!lsi)
+                RETURN(NULL);
+        OBD_ALLOC(lsi->lsi_lmd, sizeof(*lsi->lsi_lmd));
+        if (!lsi->lsi_lmd) {
+                OBD_FREE(lsi, sizeof(*lsi));
+                RETURN(NULL);
+        }
+
+        lsi->lsi_lmd->lmd_exclude_count = 0;
+        s2lsi_nocast(sb) = lsi;
+        /* we take 1 extra ref for our setup */
+        atomic_set(&lsi->lsi_mounts, 1);
+
+        /* Default umount style */
+        lsi->lsi_flags = LSI_UMOUNT_FAILOVER;
+        RETURN(lsi);
+}
+
+static int lustre_free_lsi(struct super_block *sb)
+{
+        struct lustre_sb_info *lsi = s2lsi(sb);
+        ENTRY;
+
+        if (!lsi)
+                RETURN(0);
+                
+        CDEBUG(D_MOUNT, "Freeing lsi\n");
+        
+        /* someone didn't call server_put_mount. */
+        LASSERT(atomic_read(&lsi->lsi_mounts) == 0);
+
+        if (lsi->lsi_ldd != NULL) 
+                OBD_FREE(lsi->lsi_ldd, sizeof(*lsi->lsi_ldd));
+        
+        if (lsi->lsi_lmd != NULL) {
+                if (lsi->lsi_lmd->lmd_dev != NULL) 
+                        OBD_FREE(lsi->lsi_lmd->lmd_dev, 
+                                 strlen(lsi->lsi_lmd->lmd_dev) + 1);
+                if (lsi->lsi_lmd->lmd_profile != NULL) 
+                        OBD_FREE(lsi->lsi_lmd->lmd_profile, 
+                                 strlen(lsi->lsi_lmd->lmd_profile) + 1);
+                if (lsi->lsi_lmd->lmd_opts != NULL) 
+                        OBD_FREE(lsi->lsi_lmd->lmd_opts, 
+                                 strlen(lsi->lsi_lmd->lmd_opts) + 1);
+                if (lsi->lsi_lmd->lmd_exclude_count)
+                        OBD_FREE(lsi->lsi_lmd->lmd_exclude,
+                                 sizeof(lsi->lsi_lmd->lmd_exclude[0]) * 
+                                 lsi->lsi_lmd->lmd_exclude_count);
+                OBD_FREE(lsi->lsi_lmd, sizeof(*lsi->lsi_lmd));
+        }
+        
+        LASSERT(lsi->lsi_llsbi == NULL);
+        
+        server_deregister_mount_all(lsi->lsi_srv_mnt);
+        
+        OBD_FREE(lsi, sizeof(*lsi));
+        s2lsi_nocast(sb) = NULL;
+        
+        RETURN(0);
+}
+           
+static int lustre_put_lsi(struct super_block *sb)
+{
+        struct lustre_sb_info *lsi = s2lsi(sb);
+        ENTRY;
+
+        LASSERT(lsi);
+        
+        CDEBUG(D_MOUNT, "put %p %d\n", sb, atomic_read(&lsi->lsi_mounts));
+
+        if (atomic_dec_and_test(&lsi->lsi_mounts)) {
+                lustre_free_lsi(sb);
+                RETURN(1);
+        }
+        RETURN(0);
+}
+
+/*************** server mount ******************/
+
+/* Kernel mount using mount options in MOUNT_DATA_FILE */
+static struct vfsmount *server_kernel_mount(struct super_block *sb)
+{
+        struct lvfs_run_ctxt mount_ctxt;
+        struct lustre_sb_info *lsi = s2lsi(sb);
+        struct lustre_disk_data *ldd;
+        struct lustre_mount_data *lmd = lsi->lsi_lmd;
+        struct vfsmount *mnt;
+        char *options = NULL;
+        unsigned long page, s_flags;
+        int rc;
+        ENTRY;
+
+        OBD_ALLOC(ldd, sizeof(*ldd));
+        if (!ldd)
+                RETURN(ERR_PTR(-ENOMEM));
+
+        /* In the past, we have always used flags = 0.
+           Note ext3/ldiskfs can't be mounted ro. */
+        s_flags = sb->s_flags;
+
+        /* Pre-mount ext3 to read the MOUNT_DATA_FILE */
+        CDEBUG(D_MOUNT, "Pre-mount ext3 %s\n", lmd->lmd_dev);
+        mnt = do_kern_mount("ext3", s_flags, lmd->lmd_dev, 0);
+        if (IS_ERR(mnt)) {
+                rc = PTR_ERR(mnt);
+                CERROR("premount ext3 failed (%d), trying ldiskfs\n", rc);
+                /* If ext3 fails (bec. of mballoc, extents), try ldiskfs */
+                mnt = do_kern_mount("ldiskfs", s_flags, lmd->lmd_dev, 0);
+                if (IS_ERR(mnt)) {
+                        rc = PTR_ERR(mnt);
+                        CERROR("premount ldiskfs failed: rc = %d\n", rc);
+                        GOTO(out_free, rc);
+                }
+        }
+
+        OBD_SET_CTXT_MAGIC(&mount_ctxt);
+        mount_ctxt.pwdmnt = mnt;
+        mount_ctxt.pwd = mnt->mnt_root;
+        mount_ctxt.fs = get_ds();
+
+        rc = ldd_parse(&mount_ctxt, ldd); 
+        unlock_mntput(mnt);
+
+        if (rc) {
+                CERROR("premount parse options failed: rc = %d\n", rc);
+                GOTO(out_free, rc);
+        }
+
+        /* Done with our pre-mount, now do the real mount. */
+
+        /* Glom up mount options */
+        page = __get_free_page(GFP_KERNEL);
+        if (!page) 
+                GOTO(out_free, rc = -ENOMEM);
+
+        options = (char *)page;
+        memset(options, 0, PAGE_SIZE);
+        strncpy(options, ldd->ldd_mount_opts, PAGE_SIZE - 2);
+        
+        /* Add in any mount-line options */
+        if (lmd->lmd_opts && (*(lmd->lmd_opts) != 0)) {
+                int len = PAGE_SIZE - strlen(options) - 2;
+                if (*options != 0) 
+                        strcat(options, ",");
+                strncat(options, lmd->lmd_opts, len);
+        }
+
+        /* Special permanent mount flags */
+        if (IS_OST(ldd)) 
+            s_flags |= MS_NOATIME | MS_NODIRATIME;
+
+        CDEBUG(D_MOUNT, "kern_mount: %s %s %s\n",
+               MT_STR(ldd), lmd->lmd_dev, options);
+        mnt = do_kern_mount(MT_STR(ldd), s_flags, lmd->lmd_dev, 
+                            (void *)options);
+        free_page(page);
+        if (IS_ERR(mnt)) {
+                rc = PTR_ERR(mnt);
+                CERROR("do_kern_mount failed: rc = %d\n", rc);
+                GOTO(out_free, rc);
+        }
+
+        lsi->lsi_ldd = ldd;   /* freed at lsi cleanup */
+        CDEBUG(D_SUPER, "%s: mnt = %p\n", lmd->lmd_dev, mnt);
+        RETURN(mnt);
+
+out_free:
+        OBD_FREE(ldd, sizeof(*ldd));
+        lsi->lsi_ldd = NULL;    
+        RETURN(ERR_PTR(rc));
+}
+                      
+static void server_wait_finished(struct vfsmount *mnt)
+{
+        wait_queue_head_t   waitq;
+        struct l_wait_info  lwi;
+        int                 retries = 10;
+        
+        init_waitqueue_head(&waitq);
+
+        while ((atomic_read(&mnt->mnt_count) > 0) && retries--) {
+                CWARN("Mount still busy with %d refs\n",
+                       atomic_read(&mnt->mnt_count));
+
+                /* Wait for a bit */
+                lwi = LWI_TIMEOUT(2 * HZ, NULL, NULL);
+                l_wait_event(waitq, 0, &lwi);
+        }
+        if (atomic_read(&mnt->mnt_count)) {
+                CERROR("Mount is still busy, giving up.\n");
+        }
+}
+
+static void server_put_super(struct super_block *sb)
+{
+        struct lustre_sb_info *lsi = s2lsi(sb);
+        struct obd_device     *obd;
+        struct vfsmount       *mnt = lsi->lsi_srv_mnt;
+        char *tmpname;
+        int tmpname_sz;
+        int lddflags = lsi->lsi_ldd->ldd_flags;
+        int lsiflags = lsi->lsi_flags;
+        int rc;
+        ENTRY;
+
+        LASSERT(lsiflags & LSI_SERVER);
+        
+        tmpname_sz = strlen(lsi->lsi_ldd->ldd_svname) + 1;
+        OBD_ALLOC(tmpname, tmpname_sz);
+        memcpy(tmpname, lsi->lsi_ldd->ldd_svname, tmpname_sz);
+        CDEBUG(D_MOUNT, "server put_super %s\n", tmpname);
+                                                                                       
+        /* Stop the target */
+        if (IS_MDT(lsi->lsi_ldd) || IS_OST(lsi->lsi_ldd)) {
+
+                /* tell the mgc to drop the config log */
+                lustre_end_log(sb, lsi->lsi_ldd->ldd_svname, NULL);
+                
+                obd = class_name2obd(lsi->lsi_ldd->ldd_svname);
+                if (obd) {
+                        CDEBUG(D_MOUNT, "stopping %s\n", obd->obd_name);
+                        if (lsi->lsi_flags & LSI_UMOUNT_FORCE)
+                                obd->obd_force = 1;
+                        if (lsi->lsi_flags & LSI_UMOUNT_FAILOVER)
+                                obd->obd_fail = 1;
+                        /* We can't seem to give an error return code
+                           to .put_super, so we better make sure we clean up!
+                           FIXME is there a way to get around this? */
+                        obd->obd_force = 1;
+                        class_manual_cleanup(obd);
+                } else {
+                        CERROR("no obd %s\n", lsi->lsi_ldd->ldd_svname);
+                        server_deregister_mount(lsi->lsi_ldd->ldd_svname);
+                }
+        }
+
+        /* If they wanted the mgs to stop separately from the mdt, they
+           should have put it on a different device. */ 
+        if (IS_MGS(lsi->lsi_ldd)) {
+                /* stop the mgc before the mgs so the connection gets cleaned
+                   up */
+                lustre_stop_mgc(sb);
+                server_stop_mgs(sb);
+        }
+
+        /* clean the mgc and sb */
+        rc = lustre_common_put_super(sb);
+        // FIXME how do I return a failure? 
+
+        /* drop the One True Mount */
+        unlock_mntput(mnt);
+
+        /* Wait for the targets to really clean up - can't exit (and let the
+           sb get destroyed) while the mount is still in use */
+        server_wait_finished(mnt);
+        
+        /* Stop the servers (MDS, OSS) if no longer needed.  We must wait
+           until the target is really gone so that our type refcount check
+           is right. */
+        server_stop_servers(lddflags, lsiflags);
+
+        LCONSOLE_WARN("server umount %s complete\n", tmpname);
+        OBD_FREE(tmpname, tmpname_sz);
+        EXIT;
+}
+
+static void server_umount_begin(struct super_block *sb)
+{
+        struct lustre_sb_info *lsi = s2lsi(sb);
+        ENTRY;
+
+        CDEBUG(D_MOUNT, "umount -f\n");
+        /* umount = failover
+           umount -f = force
+           no third way to do non-force, non-failover */
+        lsi->lsi_flags &= ~LSI_UMOUNT_FAILOVER;
+        lsi->lsi_flags |= LSI_UMOUNT_FORCE;
+        EXIT;
+}
+
+static int server_statfs (struct super_block *sb, struct kstatfs *buf)
+{
+        struct vfsmount *mnt = s2lsi(sb)->lsi_srv_mnt;
+        ENTRY;
+
+        if (mnt && mnt->mnt_sb && mnt->mnt_sb->s_op->statfs) {
+                int rc = mnt->mnt_sb->s_op->statfs(mnt->mnt_sb, buf);
+                if (!rc) {
+                        buf->f_type = sb->s_magic;
+                        RETURN(0);
+                }
+        }
+        
+        /* just return 0 */
+        buf->f_type = sb->s_magic;
+        buf->f_bsize = sb->s_blocksize;
+        buf->f_blocks = 1;
+        buf->f_bfree = 0;
+        buf->f_bavail = 0;
+        buf->f_files = 1;
+        buf->f_ffree = 0;
+        buf->f_namelen = NAME_MAX;
+        RETURN(0);
+}
+
+static struct super_operations server_ops =
+{
+        .put_super      = server_put_super,
+        .umount_begin   = server_umount_begin, /* umount -f */
+        .statfs         = server_statfs,
+};
+
+#define log2(n) ffz(~(n))
+#define LUSTRE_SUPER_MAGIC 0x0BD00BD1
+
+static int server_fill_super_common(struct super_block *sb)
+{
+        struct inode *root = 0;
+        ENTRY;
+                                                                                 
+        CDEBUG(D_MOUNT, "Server sb, dev=%d\n", (int)sb->s_dev);
+                                                                                 
+        sb->s_blocksize = 4096;
+        sb->s_blocksize_bits = log2(sb->s_blocksize);
+        sb->s_magic = LUSTRE_SUPER_MAGIC;
+        sb->s_maxbytes = 0; //PAGE_CACHE_MAXBYTES;
+        sb->s_flags |= MS_RDONLY;
+        sb->s_op = &server_ops;
+        root = new_inode(sb);
+        if (!root) {
+                CERROR("Can't make root inode\n");
+                RETURN(-EIO);
+        }
+                                                                                 
+        /* returns -EIO for every operation */
+        /* make_bad_inode(root); -- badness - can't umount */
+        /* apparently we need to be a directory for the mount to finish */
+        root->i_mode = S_IFDIR;
+                                                                                 
+        sb->s_root = d_alloc_root(root);
+        if (!sb->s_root) {
+                CERROR("Can't make root dentry\n");
+                iput(root);
+                RETURN(-EIO);
+        }
+                                                                                 
+        RETURN(0);
+}
+     
+static int server_fill_super(struct super_block *sb)
+{
+        struct lustre_sb_info *lsi = s2lsi(sb);
+        struct vfsmount *mnt;
+        int rc;
+        ENTRY;
+
+        /* the One True Mount */
+        mnt = server_kernel_mount(sb);
+        if (IS_ERR(mnt)) {
+                rc = PTR_ERR(mnt);
+                CERROR("Unable to mount device %s: %d\n", 
+                      lsi->lsi_lmd->lmd_dev, rc);
+                GOTO(out, rc);
+        }
+        lsi->lsi_srv_mnt = mnt;
+
+        LASSERT(lsi->lsi_ldd);
+        CDEBUG(D_MOUNT, "Found service %s for fs '%s' on device %s\n",
+               lsi->lsi_ldd->ldd_svname, lsi->lsi_ldd->ldd_fsname, 
+               lsi->lsi_lmd->lmd_dev);
+
+        if (class_name2obd(lsi->lsi_ldd->ldd_svname)) {
+                LCONSOLE_ERROR("The target named %s is already running. "
+                               "Double-mount may have compromised the disk "
+                               "journal.\n", lsi->lsi_ldd->ldd_svname);
+                unlock_mntput(mnt);
+                lustre_put_lsi(sb);
+                GOTO(out, rc = -EALREADY);
+        }
+
+        /* start MGS before MGC */
+        if (IS_MGS(lsi->lsi_ldd)) {
+                rc = server_start_mgs(sb);
+                if (rc) {
+                        CERROR("ignoring Failed MGS start!!\n");
+                        //GOTO(out_mnt, rc);
+                }
+        }
+
+        rc = lustre_start_mgc(sb);
+        if (rc) 
+                GOTO(out_mnt, rc);
+
+        /* Set up all obd devices for service */
+        if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC) && 
+                (IS_OST(lsi->lsi_ldd) || IS_MDT(lsi->lsi_ldd))) {
+                rc = server_start_targets(sb, mnt);
+                if (rc < 0) {
+                        CERROR("Unable to start targets: %d\n", rc);
+                        GOTO(out_mnt, rc);
+                }
+        /* FIXME overmount client here,
+           or can we just start a client log and client_fill_super on this sb? 
+           We need to make sure server_put_super gets called too - ll_put_super
+           calls lustre_common_put_super; check there for LSI_SERVER flag, 
+           call s_p_s if so. 
+           Probably should start client from new thread so we can return.
+           Client will not finish until all servers are connected.
+           Note - MGMT-only server does NOT get a client, since there is no
+           lustre fs associated - the MGMT is for all lustre fs's */
+        }
+
+        rc = server_fill_super_common(sb);
+        if (rc) 
+                GOTO(out_mnt, rc);
+
+        RETURN(0);
+
+out_mnt:
+        server_put_super(sb);
+out:
+        RETURN(rc);
+}
+
+/* Get the index from the obd name.
+   rc = server type, or
+   rc < 0  on error 
+   if endptr isn't NULL it is set to end of name */
+int server_name2index(char *svname, __u32 *idx, char **endptr)
+{
+        unsigned long index;
+        int rc;
+        char *dash = strchr(svname, '-');
+        if (!dash) {
+                CERROR("Can't understand server name %s\n", svname);
+                return(-EINVAL);
+        }
+
+        if (strncmp(dash + 1, "MDT", 3) == 0) 
+                rc = LDD_F_SV_TYPE_MDT;
+        else if (strncmp(dash + 1, "OST", 3) == 0) 
+                rc = LDD_F_SV_TYPE_OST;
+        else 
+                return(-EINVAL);
+
+        index = simple_strtoul(dash + 4, endptr, 16);
+        *idx = index;
+        return rc;
+}
+
+/*************** mount common betweeen server and client ***************/
+
+/* Common umount */
+int lustre_common_put_super(struct super_block *sb)
+{
+        int rc;
+        ENTRY;
+
+        CDEBUG(D_MOUNT, "dropping sb %p\n", sb);
+        
+        rc = lustre_stop_mgc(sb);
+        if (rc && (rc != -ENOENT)) {
+                if (rc != -EBUSY) {
+                        CERROR("Can't stop MGC: %d\n", rc);
+                        RETURN(rc);
+                }
+                /* BUSY just means that there's some other obd that
+                   needs the mgc.  Let him clean it up. */
+                CDEBUG(D_MOUNT, "MGC still in use\n");
+        }
+        lustre_put_lsi(sb);
+        RETURN(rc);
+}      
+
+static void lmd_print(struct lustre_mount_data *lmd)
+{
+        int i;
+
+        PRINT_CMD(PRINT_MASK, "  mount data:\n"); 
+        if (lmd_is_client(lmd)) 
+                PRINT_CMD(PRINT_MASK, "profile: %s\n", lmd->lmd_profile);
+        PRINT_CMD(PRINT_MASK, "device:  %s\n", lmd->lmd_dev);
+        PRINT_CMD(PRINT_MASK, "flags:   %x\n", lmd->lmd_flags);
+        if (lmd->lmd_opts)
+                PRINT_CMD(PRINT_MASK, "options: %s\n", lmd->lmd_opts);
+        for (i = 0; i < lmd->lmd_exclude_count; i++) {
+                PRINT_CMD(PRINT_MASK, "exclude %d:  OST%04x\n", i, 
+                          lmd->lmd_exclude[i]);
+        }
+}
+
+/* Is this server on the exclusion list */
+int lustre_check_exclusion(struct super_block *sb, char *svname)
+{
+        struct lustre_sb_info *lsi = s2lsi(sb);
+        struct lustre_mount_data *lmd = lsi->lsi_lmd;
+        __u32 index;
+        int i, rc;
+        ENTRY;
+
+        rc = server_name2index(svname, &index, NULL);
+        if (rc != LDD_F_SV_TYPE_OST) 
+                RETURN(0);
+
+        CDEBUG(D_MOUNT, "Check exclusion %s (%d) in %d of %s\n", svname, 
+               index, lmd->lmd_exclude_count, lmd->lmd_dev);
+        
+        for(i = 0; i < lmd->lmd_exclude_count; i++) {
+                if (index == lmd->lmd_exclude[i]) {
+                        CWARN("Excluding %s (on exclusion list)\n", svname);
+                        RETURN(1);
+                }
+        }
+        RETURN(0);
+}
+
+/* mount -v  -o exclude=lustre-OST0001:lustre-OST0002 -t lustre ... */
+static int lmd_make_exclusion(struct lustre_mount_data *lmd, char *ptr)
+{
+        char *s1 = ptr, *s2;
+        __u32 index, *exclude_list;
+        int rc = 0;
+        ENTRY;
+
+        /* temp storage until we figure out how many we have */
+        OBD_ALLOC(exclude_list, sizeof(index) * MAX_OBD_DEVICES);
+        if (!exclude_list)
+                RETURN(-ENOMEM);
+
+        /* we enter this fn pointing at the '=' */
+        while (*s1 && *s1 != ' ' && *s1 != ',') {
+                s1++;
+                rc = server_name2index(s1, &index, &s2);
+                if (rc < 0) {
+                        CERROR("Can't parse %s\n", s1);
+                        break;
+                }
+                if (rc == LDD_F_SV_TYPE_OST) 
+                        exclude_list[lmd->lmd_exclude_count++] = index;
+                else
+                        CDEBUG(D_MOUNT, "ignoring exclude %.7s\n", s1);
+                s1 = s2;
+                /* now we are pointing at ':' (next exclude) 
+                   or ',' (end of excludes) */
+                
+                if (lmd->lmd_exclude_count >= MAX_OBD_DEVICES)
+                        break;
+        }
+        if (rc >= 0) /* non-err */
+                rc = 0;
+
+        if (lmd->lmd_exclude_count) {
+                /* permanent, freed in lustre_free_lsi */
+                OBD_ALLOC(lmd->lmd_exclude, sizeof(index) * 
+                          lmd->lmd_exclude_count);
+                if (lmd->lmd_exclude) {
+                        memcpy(lmd->lmd_exclude, exclude_list, 
+                               sizeof(index) * lmd->lmd_exclude_count);
+                } else { 
+                        rc = -ENOMEM;
+                        lmd->lmd_exclude_count = 0;
+                }
+        }
+        OBD_FREE(exclude_list, sizeof(index) * MAX_OBD_DEVICES); 
+        RETURN(rc);
+}
+
+/* mount -v -t lustre uml1:uml2:/lustre-client /mnt/lustre */
+static int lmd_parse(char *options, struct lustre_mount_data *lmd)
+{
+        char *s1, *s2, *devname = NULL;
+        struct lustre_mount_data *raw = (struct lustre_mount_data *)options;
+        int rc = 0;
+        ENTRY;
+
+        LASSERT(lmd);
+        if (!options) {
+                LCONSOLE_ERROR("Missing mount data: check that " 
+                               "/sbin/mount.lustre is installed.\n");
+                RETURN(-EINVAL);          
+        }
+        
+        /* Options should be a string - try to detect old lmd data */
+        if ((raw->lmd_magic & 0xffffff00) == (LMD_MAGIC & 0xffffff00)) { 
+                LCONSOLE_ERROR("You're using an old version of "        
+                               "/sbin/mount.lustre.  Please install version "
+                               "%s\n", LUSTRE_VERSION_STRING);     
+                RETURN(-EINVAL);
+        }
+        lmd->lmd_magic = LMD_MAGIC;
+
+        /* Default flags */
+        lmd->lmd_flags |= LMD_FLG_RECOVER;
+
+        s1 = options;
+        while (*s1) {
+                /* Skip whitespace and extra commas */
+                while (*s1 == ' ' || *s1 == ',')
+                        s1++;
+
+                /* Client options are parsed in ll_options: eg. flock, 
+                   user_xattr, acl */
+                
+                if (strncmp(s1, "recov", 5) == 0) 
+                        /* FIXME do something with the RECOVER flag - see lconf */
+                        lmd->lmd_flags |= LMD_FLG_RECOVER;
+                else if (strncmp(s1, "norecov", 7) == 0)
+                        lmd->lmd_flags &= ~LMD_FLG_RECOVER;
+                else if (strncmp(s1, "nosvc", 5) == 0)
+                        lmd->lmd_flags |= LMD_FLG_NOSVC;
+
+                /* ost exclusion list */
+                else if (strncmp(s1, "exclude=", 8) == 0) {
+                        rc = lmd_make_exclusion(lmd, s1 + 7);
+                        if (rc) 
+                                goto invalid;
+                }
+
+                /* Linux 2.4 doesn't pass the device, so we stuck it at the 
+                   end of the options. */
+                else if (strncmp(s1, "device=", 7) == 0) {
+                        devname = s1 + 7;
+                        /* terminate options right before device.  device
+                           must be the last one. */
+                        *s1 = 0;
+                        break;
+                }
+
+                /* Find next opt */
+                s2 = strchr(s1, ',');
+                if (s2 == NULL) 
+                        break;
+                s1 = s2 + 1;
+        }
+
+        if (!devname) {
+                LCONSOLE_ERROR("Can't find the device name "
+                               "(need mount option 'device=...')\n");
+                goto invalid;
+        }
+
+        s1 = strrchr(devname, ':');
+        if (s1) {
+                lmd->lmd_flags = LMD_FLG_CLIENT;
+                /* Remove leading /s from fsname */
+                while (*++s1 == '/') ;
+                /* Freed in lustre_free_lsi */
+                OBD_ALLOC(lmd->lmd_profile, strlen(s1) + 8);
+                if (!lmd->lmd_profile) 
+                        RETURN(-ENOMEM);
+                sprintf(lmd->lmd_profile, "%s-client", s1);
+        }
+
+        /* Freed in lustre_free_lsi */
+        OBD_ALLOC(lmd->lmd_dev, strlen(devname) + 1);
+        if (!lmd->lmd_dev) 
+                RETURN(-ENOMEM);
+        strcpy(lmd->lmd_dev, devname);
+        
+        /* Save mount options */
+        s1 = options + strlen(options) - 1;
+        while (s1 >= options && (*s1 == ',' || *s1 == ' ')) 
+                *s1-- = 0;
+        if (*options != 0) {
+                /* Freed in lustre_free_lsi */
+                OBD_ALLOC(lmd->lmd_opts, strlen(options) + 1);
+                if (!lmd->lmd_opts) 
+                        RETURN(-ENOMEM);
+                strcpy(lmd->lmd_opts, options);
+        }
+
+        lmd->lmd_magic = LMD_MAGIC;
+
+        lmd_print(lmd);
+        RETURN(rc);
+
+invalid:
+        CERROR("Bad mount options %s\n", options);
+        RETURN(-EINVAL);          
+}
+
+
+/* Common mount */
+int lustre_fill_super(struct super_block *sb, void *data, int silent)
+{
+        struct lustre_mount_data *lmd;
+        struct lustre_sb_info *lsi;
+        int rc;
+        ENTRY;
+        CDEBUG(D_MOUNT|D_VFSTRACE, "VFS Op: sb %p\n", sb);
+        
+        lsi = lustre_init_lsi(sb);
+        if (!lsi) 
+                RETURN(-ENOMEM);
+        lmd = lsi->lsi_lmd;
+
+        /* Figure out the lmd from the mount options */
+        if (lmd_parse((char *)data, lmd)) {
+                lustre_put_lsi(sb);
+                RETURN(-EINVAL);
+        }
+
+        if (lmd_is_client(lmd)) {
+                CDEBUG(D_MOUNT, "Mounting client %s\n", lmd->lmd_profile);
+                if (!client_fill_super) {
+                        LCONSOLE_ERROR("Nothing registered for client mount!"
+                               " Is llite module loaded?\n");
+                        rc = -ENODEV;
+                } else {
+                        rc = lustre_start_mgc(sb);
+                        if (rc) 
+                                goto out;
+                        /* Connect and start */
+                        /* (should always be ll_fill_super) */
+                        rc = (*client_fill_super)(sb);
+                        /* c_f_s will call lustre_common_put_super on failure */
+
+                }
+        } else {
+                CDEBUG(D_MOUNT, "Mounting server from %s\n", lmd->lmd_dev);
+                lsi->lsi_flags |= LSI_SERVER;
+                rc = server_fill_super(sb);
+                /* s_f_s calls lustre_start_mgc after the mount because we need
+                   the MGS nids which are stored on disk.  Plus, we may
+                   need to start the MGS first. */
+                /* s_f_s will call server_put_super on failure */
+        }
+                                                                                
+out:
+        if (rc){
+                CERROR("Unable to mount %s\n", 
+                       s2lsi(sb) ? lmd->lmd_dev : "");
+        } else {
+                LCONSOLE_WARN("mount %s complete\n", lmd->lmd_dev);
+        }
+        RETURN(rc);
+} 
+                                                                               
+
+/* We can't call ll_fill_super by name because it lives in a module that
+   must be loaded after this one. */
+void lustre_register_client_fill_super(int (*cfs)(struct super_block *sb))
+{
+        client_fill_super = cfs;
+}
+
+/***************** FS registration ******************/
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+/* 2.5 and later */
+struct super_block * lustre_get_sb(struct file_system_type *fs_type,
+                               int flags, const char *devname, void * data)
+{
+        /* calls back in fill super */
+        /* we could append devname= onto options (*data) here, 
+           but 2.4 doesn't get devname.  So we do it in mount_lustre.c */
+        return get_sb_nodev(fs_type, flags, data, lustre_fill_super);
+}
+
+struct file_system_type lustre_fs_type = {
+        .owner        = THIS_MODULE,
+        .name         = "lustre",
+        .get_sb       = lustre_get_sb,
+        .kill_sb      = kill_anon_super,
+        .fs_flags     = FS_BINARY_MOUNTDATA,
+};
+
+#else
+/* 2.4 */
+static struct super_block *lustre_read_super(struct super_block *sb,
+                                             void *data, int silent)
+{
+        int rc;
+        ENTRY;
+
+        rc = lustre_fill_super(sb, data, silent);
+        if (rc)
+                RETURN(NULL);
+        RETURN(sb);
+}
+
+static struct file_system_type lustre_fs_type = {
+        .owner          = THIS_MODULE,
+        .name           = "lustre",
+        .fs_flags       = FS_NFSEXP_FSID,
+        .read_super     = lustre_read_super,
+};
+#endif
+
+int lustre_register_fs(void)
+{
+        return register_filesystem(&lustre_fs_type);
+}
+
+int lustre_unregister_fs(void)
+{
+        return unregister_filesystem(&lustre_fs_type);
+}
+
+EXPORT_SYMBOL(lustre_register_client_fill_super);
+EXPORT_SYMBOL(lustre_common_put_super);
+EXPORT_SYMBOL(lustre_process_log);
+EXPORT_SYMBOL(lustre_end_log);
+EXPORT_SYMBOL(server_get_mount);
+EXPORT_SYMBOL(server_put_mount);
+EXPORT_SYMBOL(server_register_target);
+EXPORT_SYMBOL(server_name2index);
+EXPORT_SYMBOL(server_mti_print);
+EXPORT_SYMBOL(class_find_param);
+EXPORT_SYMBOL(class_match_param);
+EXPORT_SYMBOL(class_parse_nid);
+
+
index eb85aea..09302bd 100644 (file)
 #include <obd_class.h>
 
 struct uuid {
-       __u32   time_low;
-       __u16   time_mid;
-       __u16   time_hi_and_version;
-       __u16   clock_seq;
-       __u8    node[6];
+        __u32   time_low;
+        __u16   time_mid;
+        __u16   time_hi_and_version;
+        __u16   clock_seq;
+        __u8    node[6];
 };
 
 static void uuid_unpack(class_uuid_t in, struct uuid *uu)
 {
-       __u8    *ptr = in;
-       __u32   tmp;
+        __u8    *ptr = in;
+        __u32   tmp;
 
-       tmp = *ptr++;
-       tmp = (tmp << 8) | *ptr++;
-       tmp = (tmp << 8) | *ptr++;
-       tmp = (tmp << 8) | *ptr++;
-       uu->time_low = tmp;
+        tmp = *ptr++;
+        tmp = (tmp << 8) | *ptr++;
+        tmp = (tmp << 8) | *ptr++;
+        tmp = (tmp << 8) | *ptr++;
+        uu->time_low = tmp;
 
-       tmp = *ptr++;
-       tmp = (tmp << 8) | *ptr++;
-       uu->time_mid = tmp;
+        tmp = *ptr++;
+        tmp = (tmp << 8) | *ptr++;
+        uu->time_mid = tmp;
 
-       tmp = *ptr++;
-       tmp = (tmp << 8) | *ptr++;
-       uu->time_hi_and_version = tmp;
+        tmp = *ptr++;
+        tmp = (tmp << 8) | *ptr++;
+        uu->time_hi_and_version = tmp;
 
-       tmp = *ptr++;
-       tmp = (tmp << 8) | *ptr++;
-       uu->clock_seq = tmp;
+        tmp = *ptr++;
+        tmp = (tmp << 8) | *ptr++;
+        uu->clock_seq = tmp;
 
-       memcpy(uu->node, ptr, 6);
+        memcpy(uu->node, ptr, 6);
 }
 
 #if 0
 static void uuid_pack(struct uuid *uu, class_uuid_t ptr)
 {
-       __u32   tmp;
-       unsigned char   *out = ptr;
-
-       tmp = uu->time_low;
-       out[3] = (unsigned char) tmp;
-       tmp >>= 8;
-       out[2] = (unsigned char) tmp;
-       tmp >>= 8;
-       out[1] = (unsigned char) tmp;
-       tmp >>= 8;
-       out[0] = (unsigned char) tmp;
-
-       tmp = uu->time_mid;
-       out[5] = (unsigned char) tmp;
-       tmp >>= 8;
-       out[4] = (unsigned char) tmp;
-
-       tmp = uu->time_hi_and_version;
-       out[7] = (unsigned char) tmp;
-       tmp >>= 8;
-       out[6] = (unsigned char) tmp;
-
-       tmp = uu->clock_seq;
-       out[9] = (unsigned char) tmp;
-       tmp >>= 8;
-       out[8] = (unsigned char) tmp;
-
-       memcpy(out+10, uu->node, 6);
+        __u32   tmp;
+        unsigned char   *out = ptr;
+
+        tmp = uu->time_low;
+        out[3] = (unsigned char) tmp;
+        tmp >>= 8;
+        out[2] = (unsigned char) tmp;
+        tmp >>= 8;
+        out[1] = (unsigned char) tmp;
+        tmp >>= 8;
+        out[0] = (unsigned char) tmp;
+
+        tmp = uu->time_mid;
+        out[5] = (unsigned char) tmp;
+        tmp >>= 8;
+        out[4] = (unsigned char) tmp;
+
+        tmp = uu->time_hi_and_version;
+        out[7] = (unsigned char) tmp;
+        tmp >>= 8;
+        out[6] = (unsigned char) tmp;
+
+        tmp = uu->clock_seq;
+        out[9] = (unsigned char) tmp;
+        tmp >>= 8;
+        out[8] = (unsigned char) tmp;
+
+        memcpy(out+10, uu->node, 6);
 }
 
 int class_uuid_parse(struct obd_uuid in, class_uuid_t uu)
 {
-       struct uuid uuid;
-       int i;
-       char *cp, buf[3];
-
-       if (strlen(in) != 36)
-               return -1;
-       for (i=0, cp = in; i <= 36; i++,cp++) {
-               if ((i == 8) || (i == 13) || (i == 18) ||
-                   (i == 23))
-                       if (*cp == '-')
-                               continue;
-               if (i== 36)
-                       if (*cp == 0)
-                               continue;
-               if (!isxdigit(*cp))
-                       return -1;
-       }
-       uuid.time_low = simple_strtoul(in, NULL, 16);
-       uuid.time_mid = simple_strtoul(in+9, NULL, 16);
-       uuid.time_hi_and_version = simple_strtoul(in+14, NULL, 16);
-       uuid.clock_seq = simple_strtoul(in+19, NULL, 16);
-       cp = in+24;
-       buf[2] = 0;
-       for (i=0; i < 6; i++) {
-               buf[0] = *cp++;
-               buf[1] = *cp++;
-               uuid.node[i] = simple_strtoul(buf, NULL, 16);
-       }
-
-       uuid_pack(&uuid, uu);
-       return 0;
+        struct uuid uuid;
+        int i;
+        char *cp, buf[3];
+
+        if (strlen(in) != 36)
+                return -1;
+        for (i=0, cp = in; i <= 36; i++,cp++) {
+                if ((i == 8) || (i == 13) || (i == 18) ||
+                    (i == 23))
+                        if (*cp == '-')
+                                continue;
+                if (i== 36)
+                        if (*cp == 0)
+                                continue;
+                if (!isxdigit(*cp))
+                        return -1;
+        }
+        uuid.time_low = simple_strtoul(in, NULL, 16);
+        uuid.time_mid = simple_strtoul(in+9, NULL, 16);
+        uuid.time_hi_and_version = simple_strtoul(in+14, NULL, 16);
+        uuid.clock_seq = simple_strtoul(in+19, NULL, 16);
+        cp = in+24;
+        buf[2] = 0;
+        for (i=0; i < 6; i++) {
+                buf[0] = *cp++;
+                buf[1] = *cp++;
+                uuid.node[i] = simple_strtoul(buf, NULL, 16);
+        }
+
+        uuid_pack(&uuid, uu);
+        return 0;
 }
 #endif
 
+
+void generate_random_uuid(unsigned char uuid_out[16]);
+
+/* We need to have some extra twiddling here because some systems have
+ * no random state when they start up. */
+void class_generate_random_uuid(class_uuid_t uuid)
+{
+        struct timeval t;
+        int *i, j, k;
+
+        LASSERT(sizeof(class_uuid_t) % sizeof(*i) == 0);
+
+        j = jiffies;
+        do_gettimeofday(&t);
+        k = t.tv_usec;
+
+        generate_random_uuid(uuid);
+
+        for (i = (int *)uuid; (char *)i < (char *)uuid + sizeof(class_uuid_t); i++) {
+                *i ^= j ^ k;
+                j = ((j << 8) & 0xffffff00) | ((j >> 24) & 0x000000ff);
+                k = ((k >> 8) & 0x00ffffff) | ((k << 24) & 0xff000000);
+        }
+}
+
 void class_uuid_unparse(class_uuid_t uu, struct obd_uuid *out)
 {
-       struct uuid uuid;
-
-       uuid_unpack(uu, &uuid);
-       sprintf(out->uuid,
-               "%08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x",
-               uuid.time_low, uuid.time_mid, uuid.time_hi_and_version,
-               uuid.clock_seq >> 8, uuid.clock_seq & 0xFF,
-               uuid.node[0], uuid.node[1], uuid.node[2],
-               uuid.node[3], uuid.node[4], uuid.node[5]);
+        struct uuid uuid;
+
+        uuid_unpack(uu, &uuid);
+        sprintf(out->uuid,
+                "%08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x",
+                uuid.time_low, uuid.time_mid, uuid.time_hi_and_version,
+                uuid.clock_seq >> 8, uuid.clock_seq & 0xFF,
+                uuid.node[0], uuid.node[1], uuid.node[2],
+                uuid.node[3], uuid.node[4], uuid.node[5]);
 }
index 458c355..a7c115b 100644 (file)
@@ -55,6 +55,7 @@
 #include <lustre_log.h>
 #include <lustre_commit_confd.h>
 #include <libcfs/list.h>
+#include <lustre_disk.h>
 #include <lustre_quota.h>
 #include <lustre_ver.h>
 
@@ -89,15 +90,15 @@ int filter_finish_transno(struct obd_export *exp, struct obd_trans_info *oti,
         /* we don't allocate new transnos for replayed requests */
         if (oti->oti_transno == 0) {
                 spin_lock(&filter->fo_translock);
-                last_rcvd = le64_to_cpu(filter->fo_fsd->fsd_last_transno) + 1;
-                filter->fo_fsd->fsd_last_transno = cpu_to_le64(last_rcvd);
+                last_rcvd = le64_to_cpu(filter->fo_fsd->lsd_last_transno) + 1;
+                filter->fo_fsd->lsd_last_transno = cpu_to_le64(last_rcvd);
                 spin_unlock(&filter->fo_translock);
                 oti->oti_transno = last_rcvd;
         } else {
                 spin_lock(&filter->fo_translock);
                 last_rcvd = oti->oti_transno;
-                if (last_rcvd > le64_to_cpu(filter->fo_fsd->fsd_last_transno))
-                        filter->fo_fsd->fsd_last_transno =
+                if (last_rcvd > le64_to_cpu(filter->fo_fsd->lsd_last_transno))
+                        filter->fo_fsd->lsd_last_transno =
                                 cpu_to_le64(last_rcvd);
                 spin_unlock(&filter->fo_translock);
         }
@@ -181,8 +182,8 @@ static int filter_client_add(struct obd_device *obd, struct filter_obd *filter,
         }
 
         fed->fed_lr_idx = cl_idx;
-        fed->fed_lr_off = le32_to_cpu(filter->fo_fsd->fsd_client_start) +
-                cl_idx * le16_to_cpu(filter->fo_fsd->fsd_client_size);
+        fed->fed_lr_off = le32_to_cpu(filter->fo_fsd->lsd_client_start) +
+                cl_idx * le16_to_cpu(filter->fo_fsd->lsd_client_size);
         LASSERTF(fed->fed_lr_off > 0, "fed_lr_off = %llu\n", fed->fed_lr_off);
 
         CDEBUG(D_INFO, "client at index %d (%llu) with UUID '%s' added\n",
@@ -317,21 +318,21 @@ static int filter_free_server_data(struct filter_obd *filter)
 
 /* assumes caller is already in kernel ctxt */
 int filter_update_server_data(struct obd_device *obd, struct file *filp,
-                              struct filter_server_data *fsd, int force_sync)
+                              struct lr_server_data *fsd, int force_sync)
 {
         loff_t off = 0;
         int rc;
         ENTRY;
 
-        CDEBUG(D_INODE, "server uuid      : %s\n", fsd->fsd_uuid);
+        CDEBUG(D_INODE, "server uuid      : %s\n", fsd->lsd_uuid);
         CDEBUG(D_INODE, "server last_rcvd : "LPU64"\n",
-               le64_to_cpu(fsd->fsd_last_transno));
+               le64_to_cpu(fsd->lsd_last_transno));
         CDEBUG(D_INODE, "server last_mount: "LPU64"\n",
-               le64_to_cpu(fsd->fsd_mount_count));
+               le64_to_cpu(fsd->lsd_mount_count));
 
         rc = fsfilt_write_record(obd, filp, fsd, sizeof(*fsd), &off,force_sync);
         if (rc)
-                CERROR("error writing filter_server_data: rc = %d\n", rc);
+                CERROR("error writing lr_server_data: rc = %d\n", rc);
 
         RETURN(rc);
 }
@@ -367,7 +368,7 @@ int filter_update_last_objid(struct obd_device *obd, obd_gr group,
 static int filter_init_server_data(struct obd_device *obd, struct file * filp)
 {
         struct filter_obd *filter = &obd->u.filter;
-        struct filter_server_data *fsd;
+        struct lr_server_data *fsd;
         struct filter_client_data *fcd = NULL;
         struct inode *inode = filp->f_dentry->d_inode;
         unsigned long last_rcvd_size = inode->i_size;
@@ -377,9 +378,9 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp)
         int rc;
 
         /* ensure padding in the struct is the correct size */
-        CLASSERT(offsetof(struct filter_server_data, fsd_padding) +
-                 sizeof(fsd->fsd_padding) == LR_SERVER_SIZE);
-        CLASSERT(offsetof(struct filter_client_data, fcd_padding) +
+        CLASSERT (offsetof(struct lr_server_data, lsd_padding) +
+                 sizeof(fsd->lsd_padding) == LR_SERVER_SIZE);
+        CLASSERT (offsetof(struct filter_client_data, fcd_padding) +
                  sizeof(fcd->fcd_padding) == LR_CLIENT_SIZE);
 
         OBD_ALLOC(fsd, sizeof(*fsd));
@@ -394,16 +395,17 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp)
         }
 
         if (last_rcvd_size == 0) {
-                CWARN("%s: initializing new %s\n", obd->obd_name, LAST_RCVD);
-
-                memcpy(fsd->fsd_uuid, obd->obd_uuid.uuid,sizeof(fsd->fsd_uuid));
-                fsd->fsd_last_transno = 0;
-                mount_count = fsd->fsd_mount_count = 0;
-                fsd->fsd_server_size = cpu_to_le32(LR_SERVER_SIZE);
-                fsd->fsd_client_start = cpu_to_le32(LR_CLIENT_START);
-                fsd->fsd_client_size = cpu_to_le16(LR_CLIENT_SIZE);
-                fsd->fsd_subdir_count = cpu_to_le16(FILTER_SUBDIR_COUNT);
+                LCONSOLE_WARN("%s: new disk, initializing\n", obd->obd_name);
+
+                memcpy(fsd->lsd_uuid, obd->obd_uuid.uuid,sizeof(fsd->lsd_uuid));
+                fsd->lsd_last_transno = 0;
+                mount_count = fsd->lsd_mount_count = 0;
+                fsd->lsd_server_size = cpu_to_le32(LR_SERVER_SIZE);
+                fsd->lsd_client_start = cpu_to_le32(LR_CLIENT_START);
+                fsd->lsd_client_size = cpu_to_le16(LR_CLIENT_SIZE);
+                fsd->lsd_subdir_count = cpu_to_le16(FILTER_SUBDIR_COUNT);
                 filter->fo_subdir_count = FILTER_SUBDIR_COUNT;
+                fsd->lsd_feature_incompat = cpu_to_le32(OBD_INCOMPAT_OST);
         } else {
                 rc = fsfilt_read_record(obd, filp, fsd, sizeof(*fsd), &off);
                 if (rc) {
@@ -411,54 +413,54 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp)
                                LAST_RCVD, rc);
                         GOTO(err_fsd, rc);
                 }
-                if (strcmp(fsd->fsd_uuid, obd->obd_uuid.uuid) != 0) {
+                if (strcmp(fsd->lsd_uuid, obd->obd_uuid.uuid) != 0) {
                         LCONSOLE_ERROR("Trying to start OBD %s using the wrong"
                                        " disk %s. Were the /dev/ assignments "
                                        "rearranged?\n",
-                                       obd->obd_uuid.uuid, fsd->fsd_uuid);
+                                       obd->obd_uuid.uuid, fsd->lsd_uuid);
                         GOTO(err_fsd, rc = -EINVAL);
                 }
-                mount_count = le64_to_cpu(fsd->fsd_mount_count);
-                filter->fo_subdir_count = le16_to_cpu(fsd->fsd_subdir_count);
+                mount_count = le64_to_cpu(fsd->lsd_mount_count);
+                filter->fo_subdir_count = le16_to_cpu(fsd->lsd_subdir_count);
         }
 
-        if (fsd->fsd_feature_incompat & ~cpu_to_le32(FILTER_INCOMPAT_SUPP)) {
+        if (fsd->lsd_feature_incompat & ~cpu_to_le32(FILTER_INCOMPAT_SUPP)) {
                 CERROR("%s: unsupported incompat filesystem feature(s) %x\n",
-                       obd->obd_name, le32_to_cpu(fsd->fsd_feature_incompat) &
+                       obd->obd_name, le32_to_cpu(fsd->lsd_feature_incompat) &
                        ~FILTER_INCOMPAT_SUPP);
                 GOTO(err_fsd, rc = -EINVAL);
         }
-        if (fsd->fsd_feature_rocompat & ~cpu_to_le32(FILTER_ROCOMPAT_SUPP)) {
+        if (fsd->lsd_feature_rocompat & ~cpu_to_le32(FILTER_ROCOMPAT_SUPP)) {
                 CERROR("%s: unsupported read-only filesystem feature(s) %x\n",
-                       obd->obd_name, le32_to_cpu(fsd->fsd_feature_rocompat) &
+                       obd->obd_name, le32_to_cpu(fsd->lsd_feature_rocompat) &
                        ~FILTER_ROCOMPAT_SUPP);
                 /* Do something like remount filesystem read-only */
                 GOTO(err_fsd, rc = -EINVAL);
         }
 
         CDEBUG(D_INODE, "%s: server last_rcvd : "LPU64"\n",
-               obd->obd_name, le64_to_cpu(fsd->fsd_last_transno));
+               obd->obd_name, le64_to_cpu(fsd->lsd_last_transno));
         CDEBUG(D_INODE, "%s: server mount_count: "LPU64"\n",
                obd->obd_name, mount_count + 1);
         CDEBUG(D_INODE, "%s: server data size: %u\n",
-               obd->obd_name, le32_to_cpu(fsd->fsd_server_size));
+               obd->obd_name, le32_to_cpu(fsd->lsd_server_size));
         CDEBUG(D_INODE, "%s: per-client data start: %u\n",
-               obd->obd_name, le32_to_cpu(fsd->fsd_client_start));
+               obd->obd_name, le32_to_cpu(fsd->lsd_client_start));
         CDEBUG(D_INODE, "%s: per-client data size: %u\n",
-               obd->obd_name, le32_to_cpu(fsd->fsd_client_size));
+               obd->obd_name, le32_to_cpu(fsd->lsd_client_size));
         CDEBUG(D_INODE, "%s: server subdir_count: %u\n",
-               obd->obd_name, le16_to_cpu(fsd->fsd_subdir_count));
+               obd->obd_name, le16_to_cpu(fsd->lsd_subdir_count));
         CDEBUG(D_INODE, "%s: last_rcvd clients: %lu\n", obd->obd_name,
-               last_rcvd_size <= le32_to_cpu(fsd->fsd_client_start) ? 0 :
-               (last_rcvd_size - le32_to_cpu(fsd->fsd_client_start)) /
-                le16_to_cpu(fsd->fsd_client_size));
+               last_rcvd_size <= le32_to_cpu(fsd->lsd_client_start) ? 0 :
+               (last_rcvd_size - le32_to_cpu(fsd->lsd_client_start)) /
+                le16_to_cpu(fsd->lsd_client_size));
 
         if (!obd->obd_replayable) {
                 CWARN("%s: recovery support OFF\n", obd->obd_name);
                 GOTO(out, rc = 0);
         }
 
-        for (cl_idx = 0, off = le32_to_cpu(fsd->fsd_client_start);
+        for (cl_idx = 0, off = le32_to_cpu(fsd->lsd_client_start);
              off < last_rcvd_size; cl_idx++) {
                 __u64 last_rcvd;
                 struct obd_export *exp;
@@ -472,9 +474,9 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp)
 
                 /* Don't assume off is incremented properly by
                  * fsfilt_read_record(), in case sizeof(*fcd)
-                 * isn't the same as fsd->fsd_client_size.  */
-                off = le32_to_cpu(fsd->fsd_client_start) +
-                        cl_idx * le16_to_cpu(fsd->fsd_client_size);
+                 * isn't the same as fsd->lsd_client_size.  */
+                off = le32_to_cpu(fsd->lsd_client_start) +
+                        cl_idx * le16_to_cpu(fsd->lsd_client_size);
                 rc = fsfilt_read_record(obd, filp, fcd, sizeof(*fcd), &off);
                 if (rc) {
                         CERROR("error reading FILT %s idx %d off %llu: rc %d\n",
@@ -496,7 +498,7 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp)
                 exp = class_new_export(obd, (struct obd_uuid *)fcd->fcd_uuid);
                 CDEBUG(D_HA, "RCVRNG CLIENT uuid: %s idx: %d lr: "LPU64
                        " srv lr: "LPU64"\n", fcd->fcd_uuid, cl_idx,
-                       last_rcvd, le64_to_cpu(fsd->fsd_last_transno));
+                       last_rcvd, le64_to_cpu(fsd->lsd_last_transno));
                 if (IS_ERR(exp))
                         GOTO(err_client, rc = PTR_ERR(exp));
 
@@ -516,21 +518,21 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp)
                 CDEBUG(D_OTHER, "client at idx %d has last_rcvd = "LPU64"\n",
                        cl_idx, last_rcvd);
 
-                if (last_rcvd > le64_to_cpu(fsd->fsd_last_transno))
-                        fsd->fsd_last_transno = cpu_to_le64(last_rcvd);
+                if (last_rcvd > le64_to_cpu(fsd->lsd_last_transno))
+                        fsd->lsd_last_transno = cpu_to_le64(last_rcvd);
 
         }
 
         if (fcd)
                 OBD_FREE(fcd, sizeof(*fcd));
 
-        obd->obd_last_committed = le64_to_cpu(fsd->fsd_last_transno);
+        obd->obd_last_committed = le64_to_cpu(fsd->lsd_last_transno);
 
         if (obd->obd_recoverable_clients) {
                 CWARN("RECOVERY: service %s, %d recoverable clients, "
                       "last_rcvd "LPU64"\n", obd->obd_name,
                       obd->obd_recoverable_clients,
-                      le64_to_cpu(fsd->fsd_last_transno));
+                      le64_to_cpu(fsd->lsd_last_transno));
                 obd->obd_next_recovery_transno = obd->obd_last_committed + 1;
                 obd->obd_recovering = 1;
                 obd->obd_recovery_start = CURRENT_SECONDS;
@@ -541,7 +543,7 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp)
 
 out:
         filter->fo_mount_count = mount_count + 1;
-        fsd->fsd_mount_count = cpu_to_le64(filter->fo_mount_count);
+        fsd->lsd_mount_count = cpu_to_le64(filter->fo_mount_count);
 
         /* save it, so mount count and last_transno is current */
         rc = filter_update_server_data(obd, filp, filter->fo_fsd, 1);
@@ -659,7 +661,7 @@ static int filter_prep_groups(struct obd_device *obd)
                         CERROR("error renaming O/R to O/0: rc %d\n", rc);
                         GOTO(cleanup_O0, rc);
                 }
-                filter->fo_fsd->fsd_feature_incompat |=
+                filter->fo_fsd->lsd_feature_incompat |=
                         cpu_to_le32(OBD_INCOMPAT_GROUPS);
                 rc = filter_update_server_data(obd, filter->fo_rcvd_filp,
                                                filter->fo_fsd, 1);
@@ -1393,6 +1395,7 @@ int filter_common_setup(struct obd_device *obd, obd_count len, void *buf,
         struct lustre_cfg* lcfg = buf;
         struct filter_obd *filter = &obd->u.filter;
         struct vfsmount *mnt;
+        struct lustre_mount_info *lmi;
         struct obd_uuid uuid;
         __u8 *uuid_ptr;
         char *str, *label;
@@ -1405,25 +1408,38 @@ int filter_common_setup(struct obd_device *obd, obd_count len, void *buf,
             LUSTRE_CFG_BUFLEN(lcfg, 2) < 1)
                 RETURN(-EINVAL);
 
-        obd->obd_fsops = fsfilt_get_ops(lustre_cfg_string(lcfg, 2));
+        lmi = server_get_mount(obd->obd_name);
+        if (lmi) {
+                /* We already mounted in lustre_fill_super.
+                   lcfg bufs 1, 2, 4 (device, fstype, mount opts) are ignored.*/
+                struct lustre_sb_info *lsi = s2lsi(lmi->lmi_sb);
+                mnt = lmi->lmi_mnt;
+                obd->obd_fsops = fsfilt_get_ops(MT_STR(lsi->lsi_ldd));
+        } else {
+                /* old path - used by lctl */
+                CERROR("Using old MDS mount method\n");
+                mnt = do_kern_mount(lustre_cfg_string(lcfg, 2),
+                                    MS_NOATIME|MS_NODIRATIME,
+                                    lustre_cfg_string(lcfg, 1), option);    
+                if (IS_ERR(mnt)) {
+                        rc = PTR_ERR(mnt);
+                        LCONSOLE_ERROR("Can't mount disk %s (%d)\n",
+                                       lustre_cfg_string(lcfg, 1), rc);
+                        RETURN(rc);
+                }
+
+                obd->obd_fsops = fsfilt_get_ops(lustre_cfg_string(lcfg, 2));
+        }
         if (IS_ERR(obd->obd_fsops))
-                RETURN(PTR_ERR(obd->obd_fsops));
+                GOTO(err_mntput, rc = PTR_ERR(obd->obd_fsops));
 
         rc = filter_iobuf_pool_init(filter);
         if (rc != 0)
                 GOTO(err_ops, rc);
 
-        mnt = do_kern_mount(lustre_cfg_string(lcfg, 2),MS_NOATIME|MS_NODIRATIME,
-                            lustre_cfg_string(lcfg, 1), option);
-        if (IS_ERR(mnt)) {
-                rc = PTR_ERR(mnt);
-                LCONSOLE_ERROR("Can't mount disk %s (%d)\n",
-                               lustre_cfg_string(lcfg, 1), rc);
-                GOTO(err_ops, rc);
-        }
-
         LASSERT(!lvfs_check_rdonly(lvfs_sbdev(mnt->mnt_sb)));
 
+        /* failover is the default */
         obd->obd_replayable = 1;
 
         if (lcfg->lcfg_bufcount > 3 && LUSTRE_CFG_BUFLEN(lcfg, 3) > 0) {
@@ -1447,7 +1463,7 @@ int filter_common_setup(struct obd_device *obd, obd_count len, void *buf,
 
         rc = filter_prep(obd);
         if (rc)
-                GOTO(err_mntput, rc);
+                GOTO(err_ops, rc);
 
         filter->fo_destroy_in_progress = 0;
         sema_init(&filter->fo_create_lock, 1);
@@ -1498,10 +1514,11 @@ int filter_common_setup(struct obd_device *obd, obd_count len, void *buf,
         } else {
                 str = "no UUID";
         }
-        label = fsfilt_label(obd, obd->u.obt.obt_sb);
+        
+        label = fsfilt_get_label(obd, obd->u.obt.obt_sb);
 
         if (obd->obd_recovering) {
-                LCONSOLE_WARN("OST %s now serving %s (%s%s%s), but will be in"
+                LCONSOLE_WARN("OST %s now serving %s (%s%s%s), but will be in "
                               "recovery until %d %s reconnect, or if no clients"
                               " reconnect for %d:%.02d; during that time new "
                               "clients will not be allowed to connect. "
@@ -1526,14 +1543,19 @@ int filter_common_setup(struct obd_device *obd, obd_count len, void *buf,
 
 err_post:
         filter_post(obd);
-err_mntput:
-        unlock_kernel();
-        mntput(mnt);
-        obd->u.obt.obt_sb = 0;
-        lock_kernel();
 err_ops:
         fsfilt_put_ops(obd->obd_fsops);
         filter_iobuf_pool_done(filter);
+err_mntput:
+        if (lmi) {
+                server_put_mount(obd->obd_name, mnt);
+        } else {
+                /* old method */
+                unlock_kernel();
+                mntput(mnt);
+                lock_kernel();
+        }
+        obd->u.obt.obt_sb = 0;
         return rc;
 }
 
@@ -1654,12 +1676,12 @@ static int filter_cleanup(struct obd_device *obd)
 {
         struct filter_obd *filter = &obd->u.filter;
         lvfs_sbdev_type save_dev;
-        int must_relock = 0;
+        int must_relock = 0, must_put = 0;
         ENTRY;
 
         if (obd->obd_fail)
-                CERROR("%s: shutting down for failover; client state will"
-                       " be preserved.\n", obd->obd_name);
+                LCONSOLE_WARN("%s: shutting down for failover; client state "
+                              "will be preserved.\n", obd->obd_name);
 
         if (!list_empty(&obd->obd_exports)) {
                 CERROR("%s: still has clients!\n", obd->obd_name);
@@ -1687,10 +1709,8 @@ static int filter_cleanup(struct obd_device *obd)
 
         LL_DQUOT_OFF(obd->u.obt.obt_sb);
 
-        if (atomic_read(&filter->fo_vfsmnt->mnt_count) > 1)
-                CERROR("%s: mount point %p busy, mnt_count: %d\n",
-                       obd->obd_name, filter->fo_vfsmnt,
-                       atomic_read(&filter->fo_vfsmnt->mnt_count));
+        must_put = server_put_mount(obd->obd_name, filter->fo_vfsmnt);
+        /* must_put is for old method (l_p_m returns non-0 on err) */
 
         /* We can only unlock kernel if we are in the context of sys_ioctl,
            otherwise we never called lock_kernel */
@@ -1698,9 +1718,10 @@ static int filter_cleanup(struct obd_device *obd)
                 unlock_kernel();
                 must_relock++;
         }
-
-        mntput(filter->fo_vfsmnt);
-        //destroy_buffers(obd->u.obt.obt_sb->s_dev);
+        
+        if (must_put) 
+                /* In case we didn't mount with lustre_get_mount -- old method*/
+                mntput(filter->fo_vfsmnt);
         obd->u.obt.obt_sb = NULL;
 
         lvfs_clear_rdonly(save_dev);
@@ -1750,16 +1771,16 @@ static int filter_connect_internal(struct obd_export *exp,
 
         if (data->ocd_connect_flags & OBD_CONNECT_INDEX) {
                 struct filter_obd *filter = &exp->exp_obd->u.filter;
-                struct filter_server_data *fsd = filter->fo_fsd;
-                int index = le32_to_cpu(fsd->fsd_ost_index);
-
-                if (!(fsd->fsd_feature_compat &
+                struct lr_server_data *lsd = filter->fo_fsd;
+                int index = le32_to_cpu(lsd->lsd_ost_index);
+                
+                if (!(lsd->lsd_feature_compat &
                       cpu_to_le32(OBD_COMPAT_OST))) {
                         /* this will only happen on the first connect */
-                        fsd->fsd_ost_index = cpu_to_le32(data->ocd_index);
-                        fsd->fsd_feature_compat |= cpu_to_le32(OBD_COMPAT_OST);
+                        lsd->lsd_ost_index = cpu_to_le32(data->ocd_index);
+                        lsd->lsd_feature_compat |= cpu_to_le32(OBD_COMPAT_OST);
                         filter_update_server_data(exp->exp_obd,
-                                                  filter->fo_rcvd_filp, fsd, 1);
+                                                  filter->fo_rcvd_filp, lsd, 1);
                 } else if (index != data->ocd_index) {
                         LCONSOLE_ERROR("Connection from %s to index "
                                        "%u doesn't match actual OST "
@@ -2943,12 +2964,12 @@ static int filter_set_info_async(struct obd_export *exp, __u32 keylen,
                 RETURN(-EINVAL);
         }
 
-        if (keylen < strlen("mds_conn") ||
-            memcmp(key, "mds_conn", keylen) != 0)
+        if (keylen < strlen(KEY_MDS_CONN) ||
+            memcmp(key, KEY_MDS_CONN, keylen) != 0)
                 RETURN(-EINVAL);
 
-        CWARN("%s: received MDS connection from %s\n", obd->obd_name,
-              obd_export_nid2str(exp));
+        LCONSOLE_WARN("%s: received MDS connection from %s\n", obd->obd_name,
+                      obd_export_nid2str(exp));
         obd->u.filter.fo_mdc_conn.cookie = exp->exp_handle.h_cookie;
 
         /* setup llog imports */
@@ -3137,14 +3158,14 @@ static int __init obdfilter_init(void)
         init_obd_quota_ops(quota_interface, &filter_sanobd_ops);
 
         rc = class_register_type(&filter_obd_ops, lvars.module_vars,
-                                 OBD_FILTER_DEVICENAME);
+                                 LUSTRE_OST_NAME);
         if (rc)
                 GOTO(out, rc);
 
         rc = class_register_type(&filter_sanobd_ops, lvars.module_vars,
-                                 OBD_FILTER_SAN_DEVICENAME);
+                                 LUSTRE_OSTSAN_NAME);
         if (rc) {
-                class_unregister_type(OBD_FILTER_DEVICENAME);
+                class_unregister_type(LUSTRE_OST_NAME);
 out:
                 if (quota_interface)
                         PORTAL_SYMBOL_PUT(filter_quota_interface);
@@ -3162,8 +3183,9 @@ static void __exit obdfilter_exit(void)
         if (quota_interface)
                 PORTAL_SYMBOL_PUT(filter_quota_interface);
 
-        class_unregister_type(OBD_FILTER_SAN_DEVICENAME);
-        class_unregister_type(OBD_FILTER_DEVICENAME);
+        class_unregister_type(LUSTRE_OSTSAN_NAME);
+        class_unregister_type(LUSTRE_OST_NAME);
+        
         OBD_FREE(obdfilter_created_scratchpad,
                  OBDFILTER_CREATED_SCRATCHPAD_ENTRIES *
                  sizeof(*obdfilter_created_scratchpad));
index 6b08d94..81d9406 100644 (file)
@@ -8,22 +8,13 @@
 #ifdef __KERNEL__
 # include <linux/spinlock.h>
 #endif
-#include <linux/lustre_disk.h>
+#include <lustre_disk.h>
 #include <lustre_handles.h>
 #include <lustre_debug.h>
 #include <obd.h>
 
 #define FILTER_LAYOUT_VERSION "2"
 
-#ifndef OBD_FILTER_DEVICENAME
-# define OBD_FILTER_DEVICENAME "obdfilter"
-#endif
-
-#ifndef OBD_FILTER_SAN_DEVICENAME
-# define OBD_FILTER_SAN_DEVICENAME "sanobdfilter"
-#endif
-
-#define HEALTH_CHECK "health_check"
 #define FILTER_INIT_OBJID 0
 
 #define FILTER_SUBDIR_COUNT      32            /* set to zero for no subdirs */
 
 #define FILTER_RECOVERY_TIMEOUT (obd_timeout * 5 * HZ / 2) /* *waves hands* */
 
-#define FILTER_INCOMPAT_SUPP   (OBD_INCOMPAT_GROUPS)
+#define FILTER_INCOMPAT_SUPP (OBD_INCOMPAT_GROUPS | OBD_INCOMPAT_OST | \
+                              OBD_INCOMPAT_COMMON_LR)
 
 #define FILTER_GRANT_CHUNK (2ULL * PTLRPC_MAX_BRW_SIZE)
 #define GRANT_FOR_LLOG(obd) 16
 
-/* Data stored per server at the head of the last_rcvd file.  In le32 order.
- * Try to keep this the same as mds_server_data so we might one day merge. */
-struct filter_server_data {
-/* 00*/ __u8  fsd_uuid[40];        /* server UUID */
-/* 28*/ __u64 fsd_last_transno_new;/* future last completed transaction ID */
-/* 30*/ __u64 fsd_last_transno;    /* last completed transaction ID */
-        __u64 fsd_mount_count;     /* FILTER incarnation number */
-/* 40*/ __u32 fsd_feature_compat;  /* compatible feature flags */
-        __u32 fsd_feature_rocompat;/* read-only compatible feature flags */
-        __u32 fsd_feature_incompat;/* incompatible feature flags */
-        __u32 fsd_server_size;     /* size of server data area */
-/* 50*/ __u32 fsd_client_start;    /* start of per-client data area */
-        __u16 fsd_client_size;     /* size of per-client data area */
-        __u16 fsd_subdir_count;    /* number of subdirectories for objects */
-        __u64 fsd_catalog_oid;     /* recovery catalog object id */
-/* 60*/ __u32 fsd_catalog_ogen;    /* recovery catalog inode generation */
-        __u8  fsd_peeruuid[40];    /* UUID of MDS associated with this OST */
-/* 8c*/ __u32 fsd_ost_index;       /* index number of OST in LOV */
-        __u32 fsd_mds_index;       /* index number of MDS in LMV */
-/* 94*/ __u8  fsd_padding[LR_SERVER_SIZE - 148];
-};
-
 /* Data stored per client in the last_rcvd file.  In le32 order. */
 struct filter_client_data {
         __u8  fcd_uuid[40];        /* client UUID */
@@ -107,7 +77,7 @@ __u64 filter_last_id(struct filter_obd *, struct obdo *);
 int filter_update_fidea(struct obd_export *exp, struct inode *inode,
                         void *handle, struct obdo *oa);
 int filter_update_server_data(struct obd_device *, struct file *,
-                              struct filter_server_data *, int force_sync);
+                              struct lr_server_data *, int force_sync);
 int filter_update_last_objid(struct obd_device *, obd_gr, int force_sync);
 int filter_common_setup(struct obd_device *, obd_count len, void *buf,
                         void *option);
@@ -133,8 +103,8 @@ int filter_commitrw(int cmd, struct obd_export *, struct obdo *, int objcount,
                     struct obd_ioobj *, int niocount, struct niobuf_local *,
                     struct obd_trans_info *, int rc);
 int filter_brw(int cmd, struct obd_export *, struct obdo *,
-              struct lov_stripe_md *, obd_count oa_bufs, struct brw_page *,
-              struct obd_trans_info *);
+               struct lov_stripe_md *, obd_count oa_bufs, struct brw_page *,
+               struct obd_trans_info *);
 void flip_into_page_cache(struct inode *inode, struct page *new_page);
 
 /* filter_io_*.c */
index 0f2321a..6392f30 100644 (file)
@@ -3104,7 +3104,7 @@ static int osc_set_info_async(struct obd_export *exp, obd_count keylen,
 
         OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_SHUTDOWN, 10);
 
-        if (KEY_IS("next_id")) {
+        if (KEY_IS(KEY_NEXT_ID)) {
                 if (vallen != sizeof(obd_id))
                         RETURN(-EINVAL);
                 obd->u.cli.cl_oscc.oscc_next_id = *((obd_id*)val) + 1;
@@ -3123,11 +3123,11 @@ static int osc_set_info_async(struct obd_export *exp, obd_count keylen,
                 RETURN(0);
         }
 
-        if (KEY_IS("initial_recov")) {
+        if (KEY_IS(KEY_INIT_RECOV)) {
                 if (vallen != sizeof(int))
                         RETURN(-EINVAL);
                 imp->imp_initial_recov = *(int *)val;
-                CDEBUG(D_HA, "%s: set imp_no_init_recov = %d\n",
+                CDEBUG(D_HA, "%s: set imp_initial_recov = %d\n",
                        exp->exp_obd->obd_name,
                        imp->imp_initial_recov);
                 RETURN(0);
@@ -3277,7 +3277,7 @@ static int osc_import_event(struct obd_device *obd,
                 break;
         }
         case IMP_EVENT_INACTIVE: {
-                rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE);
+                rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE, NULL);
                 break;
         }
         case IMP_EVENT_INVALIDATE: {
@@ -3305,7 +3305,7 @@ static int osc_import_event(struct obd_device *obd,
                         oscc->oscc_flags &= ~OSCC_FLAG_NOSPC;
                         spin_unlock(&oscc->oscc_lock);
                 }
-                rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE);
+                rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE, NULL);
                 break;
         }
         case IMP_EVENT_OCD: {
@@ -3318,7 +3318,7 @@ static int osc_import_event(struct obd_device *obd,
                 if (ocd->ocd_connect_flags & OBD_CONNECT_REQPORTAL)
                         imp->imp_client->cli_request_portal =OST_REQUEST_PORTAL;
 
-                rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD);
+                rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD, NULL);
                 break;
         }
         default:
index 2cc87af..11da088 100644 (file)
@@ -486,6 +486,8 @@ static void ost_nio_pages_put(struct ptlrpc_request *req,
         EXIT;
 }
 
+#if 0
+/* see ldlm_blocking_ast */
 /* cut-n-paste of mds_blocking_ast() */
 static int ost_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
                             void *data, int flag)
@@ -529,7 +531,8 @@ static int ost_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
         }
         RETURN(0);
 }
-
+#endif
+                           
 static int ost_brw_lock_get(int mode, struct obd_export *exp,
                             struct obd_ioobj *obj, struct niobuf_remote *nb,
                             struct lustre_handle *lh)
@@ -1629,7 +1632,7 @@ static int ost_setup(struct obd_device *obd, obd_count len, void *buf)
                 ptlrpc_init_svc(OST_NBUFS, OST_BUFSIZE, OST_MAXREQSIZE,
                                 OST_MAXREPSIZE, OST_REQUEST_PORTAL,
                                 OSC_REPLY_PORTAL,
-                                obd_timeout * 1000, ost_handle, LUSTRE_OST_NAME,
+                                obd_timeout * 1000, ost_handle, LUSTRE_OSS_NAME,
                                 obd->obd_proc_entry, ost_print_req,
                                 ost_num_threads);
         if (ost->ost_service == NULL) {
@@ -1764,15 +1767,15 @@ static int __init ost_init(void)
         int rc;
         ENTRY;
 
-        lprocfs_init_vars(ost,&lvars);
+        lprocfs_init_vars(ost, &lvars);
         rc = class_register_type(&ost_obd_ops, lvars.module_vars,
-                                 LUSTRE_OST_NAME);
+                                 LUSTRE_OSS_NAME);
         RETURN(rc);
 }
 
 static void /*__exit*/ ost_exit(void)
 {
-        class_unregister_type(LUSTRE_OST_NAME);
+        class_unregister_type(LUSTRE_OSS_NAME);
 }
 
 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
index 715f65b..8b89cd9 100644 (file)
@@ -88,6 +88,7 @@ int ptlrpc_init_import(struct obd_import *imp)
 
         return 0;
 }
+EXPORT_SYMBOL(ptlrpc_init_import);
 
 #define UUID_STR "_UUID"
 static void deuuidify(char *uuid, const char *prefix, char **uuid_start, int *uuid_len)
@@ -209,6 +210,7 @@ void ptlrpc_invalidate_import(struct obd_import *imp)
         obd_import_event(imp->imp_obd, imp, IMP_EVENT_INVALIDATE);
 }
 
+/* unset imp_invalid */
 void ptlrpc_activate_import(struct obd_import *imp)
 {
         struct obd_device *obd = imp->imp_obd;
@@ -366,7 +368,7 @@ int ptlrpc_connect_import(struct obd_import *imp, char * new_uuid)
                 /* Don't retry if connect fails */
                 rc = 0;
                 obd_set_info_async(obd->obd_self_export,
-                                   strlen("initial_recov"), "initial_recov",
+                                   strlen(KEY_INIT_RECOV), KEY_INIT_RECOV,
                                    sizeof(rc), &rc, NULL);
         }
 
@@ -414,6 +416,7 @@ out:
 
         RETURN(rc);
 }
+EXPORT_SYMBOL(ptlrpc_connect_import);
 
 static void ptlrpc_maybe_ping_import_soon(struct obd_import *imp)
 {
@@ -845,6 +848,7 @@ int ptlrpc_disconnect_import(struct obd_import *imp)
         switch (imp->imp_connect_op) {
         case OST_CONNECT: rq_opc = OST_DISCONNECT; break;
         case MDS_CONNECT: rq_opc = MDS_DISCONNECT; break;
+        case MGS_CONNECT: rq_opc = MGS_DISCONNECT; break;
         default:
                 CERROR("don't know how to disconnect from %s (connect_op %d)\n",
                        obd2cli_tgt(imp->imp_obd), imp->imp_connect_op);
index d714a84..bcab551 100644 (file)
@@ -309,10 +309,10 @@ out:
 
 static int llog_client_close(struct llog_handle *handle)
 {
-        int rc = 0;
-
-        ENTRY;
-        RETURN(rc);
+        /* this doesn't call LLOG_ORIGIN_HANDLE_CLOSE because
+           the servers all close the file at the end of every
+           other LLOG_ RPC. */
+        return(0);
 }
 
 
index ba80326..7926a3f 100644 (file)
@@ -408,7 +408,7 @@ void *lustre_msg_buf(struct lustre_msg *m, int n, int min_size)
 
         bufcount = m->bufcount;
         if (n >= bufcount) {
-                CDEBUG(D_INFO, "msg %p buffer[%d] not present (count %d)\n",
+                CERROR("msg %p buffer[%d] not present (count %d)\n",
                        m, n, bufcount);
                 return NULL;
         }
@@ -628,6 +628,24 @@ void lustre_swab_mds_body (struct mds_body *b)
         __swab32s (&b->padding_4);
 }
 
+void lustre_swab_mgs_target_info(struct mgs_target_info *mti)
+{
+        int i;
+        LASSERT(sizeof(lnet_nid_t) == sizeof(__u64));
+        for (i = 0; i < MTI_NIDS_MAX; i++) {
+                __swab64s(&mti->mti_nids[i]);
+                __swab64s(&mti->mti_failnids[i]);
+        }
+        for (i = 0; i < 8; i++) {
+                __swab16s(&mti->mti_failnodes[i]);
+        }
+        __swab32s(&mti->mti_stripe_index);
+        __swab32s(&mti->mti_nid_count);
+        __swab32s(&mti->mti_failnid_count);
+        __swab32s(&mti->mti_config_ver);
+        __swab32s(&mti->mti_flags);
+}
+
 static void lustre_swab_obd_dqinfo (struct obd_dqinfo *i)
 {
         __swab64s (&i->dqi_bgrace);
@@ -1070,6 +1088,16 @@ void lustre_assert_wire_constants(void)
                  (long long)MDS_STATUS_CONN);
         LASSERTF(MDS_STATUS_LOV == 2, " found %lld\n",
                  (long long)MDS_STATUS_LOV);
+        LASSERTF(MGS_CONNECT == 250, " found %lld\n",
+                 (long long)MGS_CONNECT);
+        LASSERTF(MGS_DISCONNECT == 251, " found %lld\n",
+                 (long long)MGS_DISCONNECT);
+        LASSERTF(MGS_EXCEPTION == 252, " found %lld\n",
+                 (long long)MGS_EXCEPTION);
+        LASSERTF(MGS_TARGET_REG == 253, " found %lld\n",
+                 (long long)MGS_TARGET_REG);
+        LASSERTF(MGS_TARGET_DEL == 254, " found %lld\n",
+                 (long long)MGS_TARGET_DEL);
         LASSERTF(LDLM_ENQUEUE == 101, " found %lld\n",
                  (long long)LDLM_ENQUEUE);
         LASSERTF(LDLM_CONVERT == 102, " found %lld\n",
index db5eb7c..d9007e1 100644 (file)
@@ -342,7 +342,7 @@ static int ping_evictor_main(void *arg)
         time_t expire_time;
         ENTRY;
 
-        ptlrpc_daemonize("ping_evictor");
+        ptlrpc_daemonize("ll_evictor");
 
         CDEBUG(D_HA, "Starting Ping Evictor\n");
         pet_exp = NULL;
@@ -418,7 +418,7 @@ void ping_evictor_start(void)
 
         init_waitqueue_head(&pet_waitq);
 
-        rc = kernel_thread(ping_evictor_main, NULL, CLONE_VM | CLONE_FS);
+        rc = cfs_kernel_thread(ping_evictor_main, NULL, CLONE_VM | CLONE_FILES);
         if (rc < 0) {
                 pet_refcount--;
                 CERROR("Cannot start ping evictor thread: %d\n", rc);
index a3df637..60adc71 100644 (file)
@@ -216,6 +216,7 @@ EXPORT_SYMBOL(lustre_swab_ldlm_lock_desc);
 EXPORT_SYMBOL(lustre_swab_ldlm_request);
 EXPORT_SYMBOL(lustre_swab_ldlm_reply);
 EXPORT_SYMBOL(lustre_swab_qdata);
+EXPORT_SYMBOL(lustre_swab_mgs_target_info);
 
 /* recover.c */
 EXPORT_SYMBOL(ptlrpc_run_recovery_over_upcall);
index 4d41dc0..cbbed63 100644 (file)
@@ -302,8 +302,10 @@ void ptlrpc_request_handle_notconn(struct ptlrpc_request *failed_req)
 }
 
 /*
+ * Administratively active/deactive a client. 
  * This should only be called by the ioctl interface, currently
- * with the lctl deactivate and activate commands.
+ * with the lctl deactivate and activate commands, and
+ * client umount -f (ll_umount_begin)
  */
 int ptlrpc_set_import_active(struct obd_import *imp, int active)
 {
@@ -333,6 +335,7 @@ int ptlrpc_set_import_active(struct obd_import *imp, int active)
         RETURN(rc);
 }
 
+/* Attempt to reconnect an import */
 int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid)
 {
         int rc;
@@ -370,6 +373,7 @@ static int ptlrpc_recover_import_no_retry(struct obd_import *imp,
         ENTRY;
 
         spin_lock_irqsave(&imp->imp_lock, flags);
+        /* Check if reconnect is already in progress */
         if (imp->imp_state != LUSTRE_IMP_DISCON) {
                 in_recovery = 1;
         }
index 631f096..55fd5b3 100644 (file)
@@ -841,7 +841,9 @@ ptlrpc_check_rqbd_pool(struct ptlrpc_service *svc)
         if (avail <= low_water)
                 ptlrpc_grow_req_bufs(svc);
 
-        lprocfs_counter_add(svc->srv_stats, PTLRPC_REQBUF_AVAIL_CNTR, avail);
+        if (svc->srv_stats)
+                lprocfs_counter_add(svc->srv_stats, PTLRPC_REQBUF_AVAIL_CNTR,
+                                    avail);
 }
 
 static int
index ea4f574..6086088 100644 (file)
@@ -201,10 +201,11 @@ int client_quota_poll_check(struct obd_export *exp, struct if_quotacheck *qchk)
                 rc = -EINTR;
 
         qchk->obd_uuid = cli->cl_target_uuid;
+        /* FIXME change strncmp to strcmp and save the strlen op */
         if (strncmp(exp->exp_obd->obd_type->typ_name, LUSTRE_OSC_NAME,
             strlen(LUSTRE_OSC_NAME)))
-                memcpy(qchk->obd_type, LUSTRE_FILTER_NAME,
-                       strlen(LUSTRE_FILTER_NAME));
+                memcpy(qchk->obd_type, LUSTRE_OST_NAME,
+                       strlen(LUSTRE_OST_NAME));
         else if (strncmp(exp->exp_obd->obd_type->typ_name, LUSTRE_MDC_NAME,
                  strlen(LUSTRE_MDC_NAME)))
                 memcpy(qchk->obd_type, LUSTRE_MDS_NAME,
index c04cf4f..a7d0a0a 100644 (file)
@@ -8,7 +8,7 @@ pkgexample_scripts = llmount.sh llmountcleanup.sh llecho.sh llechocleanup.sh
 pkgexample_scripts += local.sh echo.sh uml.sh lov.sh
 noinst_DATA =
 noinst_SCRIPTS = leak_finder.pl llecho.sh llmount.sh llmountcleanup.sh
-noinst_SCRIPTS += llrmount.sh runfailure-mds runvmstat runfailure-net
+noinst_SCRIPTS += runfailure-mds runvmstat runfailure-net
 noinst_SCRIPTS += runfailure-ost runiozone runregression-net.sh runtests
 noinst_SCRIPTS += sanity.sh rundbench
 
index c769134..a4bacc2 100755 (executable)
@@ -5,7 +5,7 @@ set -vxe
 
 PATH=`dirname $0`/../utils:$PATH
 
-[ "$CONFIGS" ] || CONFIGS="local lov"
+[ "$CONFIGS" ] || CONFIGS="local"  #"local lov"
 [ "$MAX_THREADS" ] || MAX_THREADS=10
 if [ -z "$THREADS" ]; then
        KB=`awk '/MemTotal:/ { print $2 }' /proc/meminfo`
@@ -19,19 +19,29 @@ fi
 [ "$MOUNT2" ] || MOUNT2=${MOUNT}2
 [ "$TMP" ] || TMP=/tmp
 [ "$COUNT" ] || COUNT=1000
-#[ "$DEBUG_LVL" ] || DEBUG_LVL=0x370200
 [ "$DEBUG_LVL" ] || DEBUG_LVL=0
 [ "$DEBUG_OFF" ] || DEBUG_OFF="sysctl -w lnet.debug=$DEBUG_LVL"
-[ "$DEBUG_ON" ] || DEBUG_ON="sysctl -w lnet.debug=0x33f0480"
+[ "$DEBUG_ON" ] || DEBUG_ON="sysctl -w lnet.debug=0x33f0484"
 
 LIBLUSTRE=${LIBLUSTRE:-../liblustre}
 LIBLUSTRETESTS=${LIBLUSTRETESTS:-$LIBLUSTRE/tests}
 
+LUSTRE=${LUSTRE:-`dirname $0`/..}
+. $LUSTRE/tests/test-framework.sh
+init_test_env $@
+. mountconf.sh
+
+SETUP=${SETUP:-mcsetup}
+FORMAT=${FORMAT:-mcformat}
+CLEANUP=${CLEANUP:-mcstopall}
+
 for NAME in $CONFIGS; do
        export NAME MOUNT START CLEAN
-       [ -e $NAME.sh ] && sh $NAME.sh
-       [ ! -e $NAME.xml ] && [ -z "$LDAPURL" ] && \
-               echo "no config '$NAME.xml'" 1>&2 && exit 1
+       . $LUSTRE/tests/cfg/$NAME.sh
+       
+       assert_env mds_HOST MDS_MKFS_OPTS MDSDEV
+       assert_env ost_HOST ost2_HOST OST_MKFS_OPTS OSTDEV
+       assert_env FSNAME
 
        if [ "$RUNTESTS" != "no" ]; then
                sh runtests
@@ -42,7 +52,7 @@ for NAME in $CONFIGS; do
        fi
 
        if [ "$DBENCH" != "no" ]; then
-               mount | grep $MOUNT || sh llmount.sh
+               mount_client $MOUNT
                SPACE=`df -P $MOUNT | tail -n 1 | awk '{ print $4 }'`
                DB_THREADS=`expr $SPACE / 50000`
                [ $THREADS -lt $DB_THREADS ] && DB_THREADS=$THREADS
@@ -50,43 +60,44 @@ for NAME in $CONFIGS; do
                $DEBUG_OFF
                sh rundbench 1
                $DEBUG_ON
-               sh llmountcleanup.sh
-               sh llrmount.sh
+               $CLEANUP
+               $SETUP
                if [ $DB_THREADS -gt 1 ]; then
                        $DEBUG_OFF
                        sh rundbench $DB_THREADS
                        $DEBUG_ON
-                       sh llmountcleanup.sh
-                       sh llrmount.sh
+                       $CLEANUP
+                       $SETUP
                fi
                rm -f /mnt/lustre/`hostname`/client.txt
        fi
+
        chown $UID $MOUNT && chmod 700 $MOUNT
        if [ "$BONNIE" != "no" ]; then
-               mount | grep $MOUNT || sh llmount.sh
+               mount_client $MOUNT
                $DEBUG_OFF
                bonnie++ -f -r 0 -s $(($SIZE / 1024)) -n 10 -u $UID -d $MOUNT
                $DEBUG_ON
-               sh llmountcleanup.sh
-               sh llrmount.sh
+               $CLEANUP
+               $SETUP
        fi
 
        IOZONE_OPTS="-i 0 -i 1 -i 2 -e -+d -r $RSIZE -s $SIZE"
        IOZFILE="-f $MOUNT/iozone"
        if [ "$IOZONE" != "no" ]; then
-               mount | grep $MOUNT || sh llmount.sh
+               mount_client $MOUNT
                $DEBUG_OFF
                iozone $IOZONE_OPTS $IOZFILE
                $DEBUG_ON
-               sh llmountcleanup.sh
-               sh llrmount.sh
+               $CLEANUP
+               $SETUP
 
                if [ "$O_DIRECT" != "no" -a "$IOZONE_DIR" != "no" ]; then
                        $DEBUG_OFF
                        iozone -I $IOZONE_OPTS $IOZFILE.odir
                        $DEBUG_ON
-                       sh llmountcleanup.sh
-                       sh llrmount.sh
+                       $CLEANUP
+                       $SETUP
                fi
 
                SPACE=`df -P $MOUNT | tail -n 1 | awk '{ print $4 }'`
@@ -103,21 +114,22 @@ for NAME in $CONFIGS; do
                        done
                        iozone $IOZONE_OPTS -t $IOZ_THREADS $IOZFILE
                        $DEBUG_ON
-                       sh llmountcleanup.sh
-                       sh llrmount.sh
+                       $CLEANUP
+                       $SETUP
                elif [ $IOZVER -lt 3145 ]; then
                        VER=`iozone -v | awk '/Revision:/ { print $3 }'`
                        echo "iozone $VER too old for multi-thread test"
                fi
        fi
+
        if [ "$FSX" != "no" ]; then
-               mount | grep $MOUNT || sh llmount.sh
+               mount | grep $MOUNT || $SETUP
                $DEBUG_OFF
                ./fsx -c 50 -p 1000 -P $TMP -l $SIZE \
                        -N $(($COUNT * 100)) $MOUNT/fsxfile
                $DEBUG_ON
-               sh llmountcleanup.sh
-               sh llrmount.sh
+               $CLEANUP
+               $SETUP
        fi      
 
        mkdir -p $MOUNT2
@@ -132,11 +144,11 @@ for NAME in $CONFIGS; do
        esac
 
        if [ "$SANITYN" != "no" ]; then
-               mount | grep $MOUNT || sh llmount.sh
+               mount_client $MOUNT
                $DEBUG_OFF
 
                if [ "$MDSNODE" -a "$MDSNAME" -a "$CLIENT" ]; then
-                       llmount $MDSNODE:/$MDSNAME/$CLIENT $MOUNT2
+                       mount_client $MOUNT2
                        SANITYLOG=$TMP/sanity.log START=: CLEAN=: sh sanityN.sh
                        umount $MOUNT2
                else
@@ -145,12 +157,12 @@ for NAME in $CONFIGS; do
                fi
 
                $DEBUG_ON
-               sh llmountcleanup.sh
-               sh llrmount.sh
+               $CLEANUP
+               $SETUP
        fi
 
        if [ "$LIBLUSTRE" != "no" ]; then
-               mount | grep $MOUNT || sh llmount.sh
+               mount_client $MOUNT
                export LIBLUSTRE_MOUNT_POINT=$MOUNT2
                export LIBLUSTRE_MOUNT_TARGET=$MDSNODE:/$MDSNAME/$CLIENT
                export LIBLUSTRE_TIMEOUT=`cat /proc/sys/lustre/timeout`
@@ -158,11 +170,11 @@ for NAME in $CONFIGS; do
                if [ -x $LIBLUSTRETESTS/sanity ]; then
                        $LIBLUSTRETESTS/sanity --target=$LIBLUSTRE_MOUNT_TARGET
                fi
-               sh llmountcleanup.sh
-               #sh llrmount.sh
+               $CLEANUP
+               #$SETUP
        fi
 
-       mount | grep $MOUNT && sh llmountcleanup.sh
+       $CLEANUP
 done
 
 if [ "$REPLAY_SINGLE" != "no" ]; then
index 2b185f9..e8d323e 100644 (file)
@@ -1,33 +1,59 @@
+FSNAME=lustre
 mds_HOST=${mds_HOST:-`hostname`}
+mgs_HOST=${mgs_HOST:-$mds_HOST}
 mdsfailover_HOST=${mdsfailover_HOST:-""}
 ost1_HOST=${ost1_HOST:-"`hostname`"}
 ost2_HOST=${ost2_HOST:-"`hostname`"}
 EXTRA_OSTS=${EXTRA_OSTS:-"`hostname`"}
-client_HOST=${client_HOST:-"'*'"}
 LIVE_CLIENT=${LIVE_CLIENT:-"`hostname`"}
 # This should always be a list, not a regexp
 FAIL_CLIENTS=${FAIL_CLIENTS:-""}
 
+MDSDEV=${MDSDEV:-$TMP/${FSNAME}-mdt}
+MDSSIZE=${MDSSIZE:-10000} #50000000
+OSTDEV=${OSTDEV:-"$TMP/${FSNAME}-ost%d"}
+OSTSIZE=${OSTSIZE:=10000} #50000000
+
 NETTYPE=${NETTYPE:-tcp}
+MGSNID=`h2$NETTYPE $mgs_HOST`
+FSTYPE=${FSTYPE:-ext3}
+STRIPE_BYTES=${STRIPE_BYTES:-1048576}
+STRIPES_PER_OBJ=${STRIPES_PER_OBJ:-0}
 TIMEOUT=${TIMEOUT:-30}
-PTLDEBUG=${PTLDEBUG:-0x3f0400}
+PTLDEBUG=${PTLDEBUG:-0x33f0404}
 SUBSYSTEM=${SUBSYSTEM:- 0xffb7e3ff}
-MOUNT=${MOUNT:-"/mnt/lustre"}
-#CLIENT_UPCALL=${CLIENT_UPCALL:-`pwd`/client-upcall-mdev.sh}
-#UPCALL=${CLIENT_UPCALL:-`pwd`/replay-single-upcall.sh}
 
-MDSDEV=${MDSDEV:-$TMP/mds1-`hostname`}
-MDSSIZE=${MDSSIZE:-10000} #50000000
-MDSJOURNALSIZE=${MDSJOURNALSIZE:-0}
+MKFSOPT=""
+MOUNTOPT=""
+[ "x$MDSJOURNALSIZE" != "x" ] &&
+    MKFSOPT=$MKFSOPT" -J size=$MDSJOURNALSIZE"
+[ "x$MDSISIZE" != "x" ] &&
+    MKFSOPT=$MKFSOPT" -i $MDSISIZE"
+[ "x$MKFSOPT" != "x" ] &&
+    MKFSOPT="--mkfsoptions=\"$MKFSOPT\""
+[ "x$mdsfailover_HOST" != "x" ] &&
+    MOUNTOPT=$MOUNTOPT" --failnode=`h2$NETTYPE $mdsfailover_HOST`"
+[ "x$STRIPE_BYTES" != "x" ] &&
+    MOUNTOPT=$MOUNTOPT" --param default_stripe_size=$STRIPE_BYTES"
+[ "x$STRIPES_PER_OBJ" != "x" ] &&
+    MOUNTOPT=$MOUNTOPT" --param default_stripe_count=$STRIPES_PER_OBJ"
+MDS_MKFS_OPTS="--mgs --mdt --device-size=$MDSSIZE $MKFSOPT $MOUNTOPT $MDSOPT"
 
-OSTDEV=${OSTDEV:-"$TMP/ost%d-`hostname`"}
-OSTSIZE=${OSTSIZE:=10000} #50000000
-OSTJOURNALSIZE=${OSTJOURNALSIZE:-0}
+MKFSOPT=""
+MOUNTOPT=""
+[ "x$OSTJOURNALSIZE" != "x" ] &&
+    MKFSOPT=$MKFSOPT" -J size=$OSTJOURNALSIZE"
+[ "x$MKFSOPT" != "x" ] &&
+    MKFSOPT="--mkfsoptions=\"$MKFSOPT\""
+[ "x$ostfailover_HOST" != "x" ] &&
+    MOUNTOPT=$MOUNTOPT" --failnode=`h2$NETTYPE $ostfailover_HOST`"
+OST_MKFS_OPTS="--ost --device-size=$OSTSIZE --mgsnode=$MGSNID $MKFSOPT $MOUNTOPT $OSTOPT"
 
-FSTYPE=${FSTYPE:-ext3}
-STRIPE_BYTES=${STRIPE_BYTES:-65536} #1048576
-STRIPES_PER_OBJ=${STRIPES_PER_OBJ:-0}
+MDS_MOUNT_OPTS="-o loop"
+OST_MOUNT_OPTS="-o loop"
+MOUNT=${MOUNT:-"/mnt/lustre"}
 
+PDSH=${PDSH:-no_dsh}
 FAILURE_MODE=${FAILURE_MODE:-SOFT} # or HARD
 POWER_DOWN=${POWER_DOWN:-"powerman --off"}
 POWER_UP=${POWER_UP:-"powerman --on"}
index 47a7b0c..38ad798 100644 (file)
@@ -62,7 +62,6 @@ fi
 OSTJOURNALSIZE=${OSTJOURNALSIZE:-0}
 
 FSTYPE=${FSTYPE:-ext3}
-#STRIPE_BYTES=${STRIPE_BYTES:-65536} 
 STRIPE_BYTES=${STRIPE_BYTES:-1048576} 
 STRIPES_PER_OBJ=${STRIPES_PER_OBJ:-0}
 
index c68419b..ca7258e 100644 (file)
@@ -3,38 +3,72 @@ MDSNODE=${MDSNODE:-`hostname`}
 OSTNODE=${OSTNODE:-`hostname`}
 CLIENT=${CLIENT:-client}
 
+FSNAME=lustre
 mds_HOST=${mds_HOST:-$MDSNODE}
 mdsfailover_HOST=${mdsfailover_HOST}
+mgs_HOST=${mgs_HOST:-$mds_HOST}
 ost_HOST=${ost_HOST:-$OSTNODE}
+ostfailover_HOST=${ostfailover_HOST}
 ost2_HOST=${ost2_HOST:-$ost_HOST}
-client_HOST=${client_HOST:-$CLIENT}
-NETTYPE=${NETTYPE:-tcp}
-
-MOUNT=${MOUNT:-"/mnt/lustre"}
-MOUNT1=${MOUNT1:-$MOUNT}
-MOUNT2=${MOUNT2:-${MOUNT}2}
-DIR=${DIR:-$MOUNT}
-DIR2=${DIR2:-$MOUNT1}
-PTLDEBUG=${PTLDEBUG:-0x3f0400}
-SUBSYSTEM=${SUBSYSTEM:- 0xffb7e3ff}
-PDSH=${PDSH:-no_dsh}
 
 TMP=${TMP:-/tmp}
-MDSDEV=${MDSDEV:-$TMP/mds1-`hostname`}
+MDSDEV=${MDSDEV:-$TMP/${FSNAME}-mdt}
 MDSSIZE=${MDSSIZE:-100000}
-OSTDEV=${OSTDEV:-$TMP/ost1-`hostname`}
+MDSOPT=${MDSOPT:-"--mountfsoptions=acl"}
+OSTDEV=${OSTDEV:-$TMP/${FSNAME}-ost0}
 OSTSIZE=${OSTSIZE:-200000}
-FSTYPE=${FSTYPE:-ext3}
+OSTDEV2=${OSTDEV2:-$TMP/${FSNAME}-ost1}
+
+NETTYPE=${NETTYPE:-tcp}
+MGSNID=`h2$NETTYPE $mgs_HOST`
+FSTYPE=${FSTYPE:-ldiskfs}
+STRIPE_BYTES=${STRIPE_BYTES:-1048576}
+STRIPES_PER_OBJ=${STRIPES_PER_OBJ:-0}
 TIMEOUT=${TIMEOUT:-20}
 UPCALL=${UPCALL:-DEFAULT}
+PTLDEBUG=${PTLDEBUG:-0x33f0404}
+SUBSYSTEM=${SUBSYSTEM:- 0xffb7e3ff}
 
-MDSOPT=${MDSOPT:-"user_xattr,acl"}
-CLIENTOPT=${CLIENTOPT:-"user_xattr,acl"}
-MOUNTOPT=${MOUNTOPT:-"user_xattr,acl"}
+MKFSOPT=""
+MOUNTOPT=""
+[ "x$MDSJOURNALSIZE" != "x" ] &&
+    MKFSOPT=$MKFSOPT" -J size=$MDSJOURNALSIZE"
+[ "x$MDSISIZE" != "x" ] &&
+    MKFSOPT=$MKFSOPT" -i $MDSISIZE"
+[ "x$MKFSOPT" != "x" ] &&
+    MKFSOPT="--mkfsoptions=\"$MKFSOPT\""
+[ "x$mdsfailover_HOST" != "x" ] &&
+    MOUNTOPT=$MOUNTOPT" --failnode=`h2$NETTYPE $mdsfailover_HOST`"
+[ "x$STRIPE_BYTES" != "x" ] &&
+    MOUNTOPT=$MOUNTOPT" --param default_stripe_size=$STRIPE_BYTES"
+[ "x$STRIPES_PER_OBJ" != "x" ] &&
+    MOUNTOPT=$MOUNTOPT" --param default_stripe_count=$STRIPES_PER_OBJ"
+MDS_MKFS_OPTS="--mgs --mdt --device-size=$MDSSIZE --param obd_timeout=$TIMEOUT $MKFSOPT $MOUNTOPT $MDSOPT"
 
-STRIPE_BYTES=${STRIPE_BYTES:-1048576}
-STRIPES_PER_OBJ=${STRIPES_PER_OBJ:-0}
+MKFSOPT=""
+MOUNTOPT=""
+[ "x$OSTJOURNALSIZE" != "x" ] &&
+    MKFSOPT=$MKFSOPT" -J size=$OSTJOURNALSIZE"
+[ "x$MKFSOPT" != "x" ] &&
+    MKFSOPT="--mkfsoptions=\"$MKFSOPT\""
+[ "x$ostfailover_HOST" != "x" ] &&
+    MOUNTOPT=$MOUNTOPT" --failnode=`h2$NETTYPE $ostfailover_HOST`"
+OST_MKFS_OPTS="--ost --device-size=$OSTSIZE --mgsnode=$MGSNID --param obd_timeout=$TIMEOUT $MKFSOPT $MOUNTOPT $OSTOPT"
+OST2_MKFS_OPTS=${OST2_MKFS_OPTS:-${OST_MKFS_OPTS}}
+
+MDS_MOUNT_OPTS="-o loop"
+OST_MOUNT_OPTS="-o loop"
+OST2_MOUNT_OPTS="-o loop"
 
+MOUNT=${MOUNT:-/mnt/${FSNAME}}
+MOUNT1=${MOUNT1:-$MOUNT}
+MOUNT2=${MOUNT2:-${MOUNT}2}
+DIR=${DIR:-$MOUNT}
+DIR1=${DIR:-$MOUNT1}
+DIR2=${DIR2:-$MOUNT2}
+MOUNTOPT=${MOUNTOPT:-"user_xattr,acl"}
+
+PDSH=${PDSH:-no_dsh}
 FAILURE_MODE=${FAILURE_MODE:-SOFT} # or HARD
 POWER_DOWN=${POWER_DOWN:-"powerman --off"}
 POWER_UP=${POWER_UP:-"powerman --on"}
index c7f7674..b0d6101 100644 (file)
@@ -23,7 +23,7 @@ FSTYPE=${FSTYPE:-ext3}
 TIMEOUT=${TIMEOUT:-10}
 #UPCALL=${UPCALL:-$PWD/replay-single-upcall.sh}
 
-STRIPE_BYTES=${STRIPE_BYTES:-65536}
+STRIPE_BYTES=${STRIPE_BYTES:-1048576}
 STRIPES_PER_OBJ=${STRIPES_PER_OBJ:-0}
 
 FAILURE_MODE=${FAILURE_MODE:-SOFT} # or HARD
index 37f33a4..23ea6ac 100644 (file)
@@ -10,8 +10,8 @@
 set -e
 
 ONLY=${ONLY:-"$*"}
-# bug number for skipped test:
-ALWAYS_EXCEPT=" $CONF_SANITY_EXCEPT"
+# bug number for skipped test:      mc mc mc mc mc mc  mc mc mc
+ALWAYS_EXCEPT=" $CONF_SANITY_EXCEPT 9  10 11 12 13 13b 14 15 18"
 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
 
 SRCDIR=`dirname $0`
@@ -20,6 +20,7 @@ PATH=$PWD/$SRCDIR:$SRCDIR:$SRCDIR/../utils:$PATH
 LUSTRE=${LUSTRE:-`dirname $0`/..}
 RLUSTRE=${RLUSTRE:-$LUSTRE}
 MOUNTLUSTRE=${MOUNTLUSTRE:-/sbin/mount.lustre}
+MKFSLUSTRE=${MKFSLUSTRE:-/usr/sbin/mkfs.lustre}
 HOSTNAME=`hostname`
 
 . $LUSTRE/tests/test-framework.sh
@@ -28,61 +29,76 @@ init_test_env $@
 
 . ${CONFIG:=$LUSTRE/tests/cfg/local.sh}
 
-gen_config() {
-       rm -f $XMLCONFIG
-
-       add_mds mds --dev $MDSDEV --size $MDSSIZE
-       add_lov lov1 mds --stripe_sz $STRIPE_BYTES\
-           --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0
-       add_ost ost --lov lov1 --dev $OSTDEV --size $OSTSIZE
-       add_client client mds --lov lov1 --path $MOUNT
+reformat() {
+        grep " $MOUNT " /proc/mounts && zconf_umount `hostname` $MOUNT
+       stop ost -f
+       stop ost2 -f
+       stop mds -f
+       echo Formatting mds, ost, ost2
+       add mds $MDS_MKFS_OPTS --reformat $MDSDEV  > /dev/null
+       add ost $OST_MKFS_OPTS --reformat $OSTDEV  > /dev/null
+       add ost2 $OST2_MKFS_OPTS --reformat $OSTDEV2  > /dev/null
 }
 
-gen_second_config() {
-       rm -f $XMLCONFIG
-
-       add_mds mds2 --dev $MDSDEV --size $MDSSIZE
-       add_lov lov2 mds2 --stripe_sz $STRIPE_BYTES\
-           --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0
-       add_ost ost2 --lov lov2 --dev $OSTDEV --size $OSTSIZE
-       add_client client mds2 --lov lov2 --path $MOUNT2
+gen_config() {
+        reformat
+        # The MGS must be started before the OSTs for a new fs, so start
+        # and stop to generate the startup logs. 
+       start_mds
+       start_ost
+       sleep 5
+       stop_ost
+       stop_mds
 }
 
 start_mds() {
        echo "start mds service on `facet_active_host mds`"
-       start mds --reformat $MDSLCONFARGS  || return 94
+       start mds $MDSDEV $MDS_MOUNT_OPTS || return 94
 }
 
 stop_mds() {
        echo "stop mds service on `facet_active_host mds`"
-       stop mds $@  || return 97
+       # These tests all use non-failover stop
+       stop mds -f  || return 97
 }
 
 start_ost() {
        echo "start ost service on `facet_active_host ost`"
-       start ost --reformat $OSTLCONFARGS  || return 95
+       start ost $OSTDEV $OST_MOUNT_OPTS || return 95
 }
 
 stop_ost() {
        echo "stop ost service on `facet_active_host ost`"
-       stop ost $@  || return 98
+       # These tests all use non-failover stop
+       stop ost -f  || return 98
+}
+
+start_ost2() {
+       echo "start ost2 service on `facet_active_host ost2`"
+       start ost2 $OSTDEV2 $OST2_MOUNT_OPTS || return 92
+}
+
+stop_ost2() {
+       echo "stop ost2 service on `facet_active_host ost2`"
+       # These tests all use non-failover stop
+       stop ost2 -f  || return 93
 }
 
 mount_client() {
        local MOUNTPATH=$1
        echo "mount lustre on ${MOUNTPATH}....."
-       zconf_mount `hostname`  $MOUNTPATH  || return 96
+       zconf_mount `hostname` $MOUNTPATH  || return 96
 }
 
 umount_client() {
        local MOUNTPATH=$1
        echo "umount lustre on ${MOUNTPATH}....."
-       zconf_umount `hostname`  $MOUNTPATH || return 97
+       zconf_umount `hostname` $MOUNTPATH || return 97
 }
 
 manual_umount_client(){
-       echo "manual umount lustre on ${MOUNTPATH}...."
-       do_facet client "umount $MOUNT"
+       echo "manual umount lustre on ${MOUNT}...."
+       do_facet client "umount -d $MOUNT"
 }
 
 setup() {
@@ -91,22 +107,23 @@ setup() {
        mount_client $MOUNT
 }
 
+cleanup_nocli() {
+       stop_mds || return 201
+       stop_ost || return 202
+       unload_modules || return 203
+}
+
 cleanup() {
-       umount_client $MOUNT $FORCE || return 200
-       stop_mds $FORCE || return 201
-       stop_ost $FORCE || return 202
-       # catch case where these return just fine, but modules are still not unloaded
-       /sbin/lsmod | egrep -q "lnet|libcfs"
-       if [ 1 -ne $? ]; then
-               echo "modules still loaded..."
-               /sbin/lsmod
-               return 203
-       fi
+       umount_client $MOUNT || return 200
+       cleanup_nocli || return $?
 }
 
 check_mount() {
-       do_facet client "touch $DIR/a" || return 71     
-       do_facet client "rm $DIR/a" || return 72        
+       do_facet client "cp /etc/passwd $DIR/a" || return 71
+       do_facet client "rm $DIR/a" || return 72
+       # make sure lustre is actually mounted (touch will block, 
+        # but grep won't, so do it after) 
+        do_facet client "grep $MOUNT' ' /proc/mounts > /dev/null" || return 73
        echo "setup single mount lustre success"
 }
 
@@ -136,9 +153,7 @@ gen_config
 
 
 test_0() {
-       start_ost
-       start_mds       
-       mount_client $MOUNT
+        setup
        check_mount || return 41
        cleanup || return $?
 }
@@ -147,9 +162,7 @@ run_test 0 "single mount setup"
 test_1() {
        start_ost
        echo "start ost second time..."
-       start ost --reformat $OSTLCONFARGS
-       start_mds       
-       mount_client $MOUNT
+       setup
        check_mount || return 42
        cleanup || return $?
 }
@@ -159,8 +172,7 @@ test_2() {
        start_ost
        start_mds       
        echo "start mds second time.."
-       start mds --reformat $MDSLCONFARGS
-       
+       start_mds
        mount_client $MOUNT
        check_mount || return 43
        cleanup || return $?
@@ -169,19 +181,17 @@ run_test 2 "start up mds twice"
 
 test_3() {
        setup
-       mount_client $MOUNT
-
+       #mount.lustre returns an error if already in mtab
+       mount_client $MOUNT && return $?
        check_mount || return 44
-       
-       umount_client $MOUNT    
-       cleanup  || return $?
+       cleanup || return $?
 }
 run_test 3 "mount client twice"
 
 test_4() {
        setup
        touch $DIR/$tfile || return 85
-       stop_ost --force
+       stop_ost -f
        cleanup
        eno=$?
        # ok for ost to fail shutdown
@@ -195,12 +205,12 @@ run_test 4 "force cleanup ost, then cleanup"
 test_5() {
        setup
        touch $DIR/$tfile || return 1
-       stop_mds --force || return 2
+       stop_mds -f || return 2
 
        # cleanup may return an error from the failed
        # disconnects; for now I'll consider this successful
        # if all the modules have unloaded.
-       umount $MOUNT &
+       umount -d $MOUNT &
        UMOUNT_PID=$!
        sleep 6
        echo "killing umount"
@@ -219,102 +229,66 @@ test_5() {
                grep " $MOUNT " /etc/mtab && echo "test 5: mtab after second umount" && return 11
        fi
 
-       # cleanup client modules
-       $LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null
-       
+       manual_umount_client
        # stop_mds is a no-op here, and should not fail
-       stop_mds  || return 4
-       stop_ost || return 5
-
-       lsmod | grep -q lnet && return 6
-       return 0
+       cleanup_nocli || return $?
+       # df may have lingering entry
+       manual_umount_client
+       # mtab may have lingering entry
+       grep -v $MOUNT" " /etc/mtab > $TMP/mtabtemp
+       mv $TMP/mtabtemp /etc/mtab
 }
 run_test 5 "force cleanup mds, then cleanup"
 
 test_5b() {
        start_ost
-
        [ -d $MOUNT ] || mkdir -p $MOUNT
-       grep " $MOUNT " /etc/mtab && echo "test 5b: mtab before lconf" && return 9
-       $LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null
        grep " $MOUNT " /etc/mtab && echo "test 5b: mtab before mount" && return 10
-       llmount -o nettype=$NETTYPE,$MOUNTOPT $mds_HOST:/mds_svc/client_facet $MOUNT  && return 1
+       mount_client $MOUNT && return 1
        grep " $MOUNT " /etc/mtab && echo "test 5b: mtab after failed mount" && return 11
-
-       # cleanup client modules
-       $LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null
-       
+       umount_client $MOUNT    
        # stop_mds is a no-op here, and should not fail
-       stop_mds || return 2
-       stop_ost || return 3
-
-       lsmod | grep -q lnet && return 4
+       cleanup_nocli || return $?
        return 0
-
 }
 run_test 5b "mds down, cleanup after failed mount (bug 2712)"
 
 test_5c() {
        start_ost
        start_mds
-
        [ -d $MOUNT ] || mkdir -p $MOUNT
-       grep " $MOUNT " /etc/mtab && echo "test 5c: mtab before lconf" && return 9
-       $LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null
        grep " $MOUNT " /etc/mtab && echo "test 5c: mtab before mount" && return 10
-       llmount -vv -o nettype=$NETTYPE,$MOUNTOPT $mds_HOST:/wrong_mds_svc/client_facet $MOUNT && return 1
+       mount -t lustre `facet_nid mgs`:/wrong.$FSNAME $MOUNT || :
        grep " $MOUNT " /etc/mtab && echo "test 5c: mtab after failed mount" && return 11
-
-       # cleanup client modules
-       $LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null
-       
-       stop_mds || return 2
-       stop_ost || return 3
-
-       lsmod | grep -q lnet && return 4
-       return 0
-
+       umount_client $MOUNT
+       cleanup_nocli  || return $?
 }
 run_test 5c "cleanup after failed mount (bug 2712)"
 
 test_5d() {
        start_ost
        start_mds
-       stop_ost --force
-
-       [ -d $MOUNT ] || mkdir -p $MOUNT
-       grep " $MOUNT " /etc/mtab && echo "test 5d: mtab before lconf" && return 9
-       $LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null
+       stop_ost -f
        grep " $MOUNT " /etc/mtab && echo "test 5d: mtab before mount" && return 10
-       llmount -vv -o nettype=$NETTYPE,$MOUNTOPT `facet_nid mds`:/mds_svc/client_facet $MOUNT || return 1
-
-       umount_client $MOUNT || return 2
+       mount_client $MOUNT || return 1
+       cleanup  || return $?
        grep " $MOUNT " /etc/mtab && echo "test 5d: mtab after unmount" && return 11
-       
-       stop_mds || return 3
-
-       lsmod | grep -q lnet && return 4
        return 0
-
 }
-run_test 5d "ost down, don't crash during mount attempt"
+run_test 5d "mount with ost down"
 
 test_5e() {
        start_ost
        start_mds
-       sleep 5 # give MDS a chance to connect to OSTs before delaying requests
+        # give MDS a chance to connect to OSTs (bz 10476)
+       sleep 5 
 
 #define OBD_FAIL_PTLRPC_DELAY_SEND       0x506
        do_facet client "sysctl -w lustre.fail_loc=0x80000506"
        grep " $MOUNT " /etc/mtab && echo "test 5e: mtab before mount" && return 10
        mount_client $MOUNT || echo "mount failed (not fatal)"
-       umount_client $MOUNT || return 2
+       cleanup  || return $?
        grep " $MOUNT " /etc/mtab && echo "test 5e: mtab after unmount" && return 11
-       
-       stop_mds || return 3
-       stop_ost || return 3
-
-       lsmod | grep -q lnet && return 4
        return 0
 }
 run_test 5e "delayed connect, don't crash (bug 10268)"
@@ -331,23 +305,16 @@ run_test 6 "manual umount, then mount again"
 test_7() {
        setup
        manual_umount_client
-       cleanup || return $?
+       cleanup_nocli || return $?
 }
 run_test 7 "manual umount, then cleanup"
 
 test_8() {
-       start_ost
-       start_mds
-
-       mount_client $MOUNT
+       setup
        mount_client $MOUNT2
-
        check_mount2 || return 45
-       umount $MOUNT
        umount_client $MOUNT2
-       
-       stop_mds
-       stop_ost
+       cleanup  || return $?
 }
 run_test 8 "double mount setup"
 
@@ -663,7 +630,7 @@ test_15() {
        echo "mount lustre on $MOUNT with $MOUNTLUSTRE: success"
        [ -d /r ] && $LCTL modules > /r/tmp/ogdb-`hostname`
        check_mount || return 41
-       do_node `hostname` umount $MOUNT
+       do_node `hostname` umount -d $MOUNT
 
        [ -f "$MOUNTLUSTRE" ] && rm -f $MOUNTLUSTRE
        echo "mount lustre on ${MOUNT} without $MOUNTLUSTRE....."
@@ -680,9 +647,7 @@ test_16() {
 
         if [ ! -f "$MDSDEV" ]; then
             echo "no $MDSDEV existing, so mount Lustre to create one"
-            start_ost
-            start_mds
-            mount_client $MOUNT
+           setup
             check_mount || return 41
             cleanup || return $?
         fi
@@ -691,12 +656,10 @@ test_16() {
         do_facet mds "[ -d $TMPMTPT ] || mkdir -p $TMPMTPT;
                       mount -o loop -t ext3 $MDSDEV $TMPMTPT || return \$?;
                       chmod 555 $TMPMTPT/{OBJECTS,LOGS,PENDING} || return \$?;
-                      umount $TMPMTPT || return \$?" || return $?
+                      umount -d $TMPMTPT || return \$?" || return $?
 
         echo "mount Lustre to change the mode of OBJECTS/LOGS/PENDING, then umount Lustre"
-        start_ost
-        start_mds
-        mount_client $MOUNT
+       setup
         check_mount || return 41
         cleanup || return $?
 
@@ -729,23 +692,19 @@ test_16() {
 run_test 16 "verify that lustre will correct the mode of OBJECTS/LOGS/PENDING"
 
 test_17() {
-        TMPMTPT="${MOUNT%/*}/conf17"
-
         if [ ! -f "$MDSDEV" ]; then
             echo "no $MDSDEV existing, so mount Lustre to create one"
-            start_ost
-            start_mds
-            mount_client $MOUNT
+           setup
             check_mount || return 41
             cleanup || return $?
         fi
 
         echo "Remove mds config log"
-        do_facet mds "debugfs -w -R 'unlink LOGS/mds_svc' $MDSDEV || return \$?" || return $?
+        do_facet mds "debugfs -w -R 'unlink CONFIGS/$FSNAME-MDT0000' $MDSDEV || return \$?" || return $?
 
         start_ost
-       start mds $MDSLCONFARGS && return 42
-        cleanup || return $?
+       start_mds && return 42
+       gen_config
 }
 run_test 17 "Verify failed mds_postsetup won't fail assertion (2936)"
 
@@ -754,12 +713,11 @@ test_18() {
         echo "mount mds with large journal..."
         OLDMDSSIZE=$MDSSIZE
         MDSSIZE=2000000
+       #FIXME have to change MDS_MKFS_OPTS
         gen_config
 
         echo "mount lustre system..."
-        start_ost
-        start_mds
-        mount_client $MOUNT
+       setup
         check_mount || return 41
 
         echo "check journal size..."
@@ -779,15 +737,73 @@ test_18() {
 }
 run_test 18 "check lconf creates large journals"
 
-test_19() {
-        # first format the ost/mdt
+test_19a() {
+       start_mds || return 1
+       stop_mds -f || return 2
+}
+run_test 19a "start/stop MDS without OSTs"
+
+test_19b() {
+       start_ost || return 1
+       stop_ost -f || return 2
+}
+run_test 19b "start/stop OSTs without MDS"
+
+test_20a() {
+        start_mds
+       start_ost
+       stop_ost
+       stop_mds
+}
+run_test 20a "start mds before ost, stop ost first"
+
+test_20b() {
         start_ost
        start_mds
        stop_mds
        stop_ost
-       start mds $MDSLCONFARGS || return 1
-       stop mds --force || return 2
 }
-run_test 19 "start/stop MDS without OSTs"
+run_test 20b "start ost before mds, stop mds first"
+
+test_20c() {
+        start_ost
+       start_mds
+       start_ost2
+       stop_ost
+       stop_ost2
+       stop_mds
+}
+run_test 20c "start mds between two osts, stop mds last"
+
+test_21() {
+        reformat
+       start_mds
+       echo Client mount before any osts are in the logs
+       mount_client $MOUNT
+       check_mount && return 41
+       pass
+
+       echo Client mount with ost in logs, but none running
+       start_ost
+       stop_ost
+       mount_client $MOUNT
+       # check_mount will block trying to contact ost
+       umount_client $MOUNT
+       pass
+
+       echo Client mount with a running ost
+       start_ost
+       mount_client $MOUNT
+       sleep 5 #bz10476
+       check_mount || return 41
+       pass
+
+       cleanup
+}
+run_test 21 "start a client before osts"
+
+
+umount_client $MOUNT   
+cleanup_nocli
 
 equals_msg "Done"
index 03a8f7d..d399036 100755 (executable)
@@ -18,7 +18,9 @@ UPCALL=${UPCALL:-DEFAULT}
 
 build_test_filter
 
-assert_env mds_HOST ost1_HOST ost2_HOST client_HOST LIVE_CLIENT 
+assert_env mds_HOST MDS_MKFS_OPTS MDSDEV
+assert_env ost1_HOST ost2_HOST OST_MKFS_OPTS OSTDEV
+assert_env LIVE_CLIENT FSNAME
 
 ####
 # Initialize all the ostN_HOST 
@@ -111,48 +113,36 @@ reintegrate_clients() {
     DOWN_NUM=0
 }
 
-gen_config() {
-    rm -f $XMLCONFIG
-    add_mds mds --dev $MDSDEV --size $MDSSIZE --journal-size $MDSJOURNALSIZE
-
-    if [ ! -z "$mdsfailover_HOST" ]; then
-        add_mdsfailover mds --dev $MDSDEV --size $MDSSIZE
-    fi
-
-    add_lov lov1 mds --stripe_sz $STRIPE_BYTES\
-       --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0
-    for i in `seq $NUMOST`; do
-       dev=`printf $OSTDEV $i`
-       add_ost ost$i --lov lov1 --dev $dev --size $OSTSIZE \
-           --journal-size $OSTJOURNALSIZE
-    done
-     
-
-    add_client client mds --lov lov1 --path $MOUNT
+start_ost() {
+    local dev=`printf $OSTDEV $1`
+    start ost$1 $dev $OST_MOUNT_OPTS
 }
 
 setup() {
-    gen_config
-
+    cleanup
     rm -rf logs/*
+    wait_for mds
+    add mds $MDS_MKFS_OPTS --reformat $MDSDEV >> /dev/null
+    start mds $MDSDEV $MDS_MOUNT_OPTS
     for i in `seq $NUMOST`; do
+       local dev=`printf $OSTDEV $i`
+       local index=$((i - 1))
        wait_for ost$i
-       start ost$i ${REFORMAT} $OSTLCONFARGS 
+       echo Adding ost$i at index $index dev $dev
+       add ost$i $OST_MKFS_OPTS --reformat --index=$index $dev >> /dev/null
+       start ost$i $dev $OST_MOUNT_OPTS
     done
     [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
-    wait_for mds
-    start mds $MDSLCONFARGS ${REFORMAT}
+
     while ! do_node $CLIENTS "ls -d $LUSTRE" > /dev/null; do sleep 5; done
     grep " $MOUNT " /proc/mounts || zconf_mount $CLIENTS $MOUNT
-
 }
 
 cleanup() {
     zconf_umount $CLIENTS $MOUNT
-
-    stop mds ${FORCE} $MDSLCONFARGS || :
+    stop mds -f
     for i in `seq $NUMOST`; do
-        stop ost$i ${FORCE} $OSTLCONFARGS  || :
+        stop ost$i -f
     done
 }
 
@@ -205,11 +195,9 @@ node_to_ost() {
     done
     echo "No ost found for node; $node"
     return 1
-    
 }
 
 
-
 if [ "$ONLY" == "cleanup" ]; then
     $CLEANUP
     exit
@@ -230,17 +218,14 @@ fi
 echo "Starting Test 17 at `date`"
 
 test_0() {
-    echo "Failover MDS"
     facet_failover mds
     echo "Waiting for df pid: $DFPID"
     wait $DFPID || { echo "df returned $?" && return 1; }
 
-    echo "Failing OST1"
     facet_failover ost1
     echo "Waiting for df pid: $DFPID"
     wait $DFPID || { echo "df returned $?" && return 2; }
 
-    echo "Failing OST2"
     facet_failover ost2
     echo "Waiting for df pid: $DFPID"
     wait $DFPID || { echo "df returned $?" && return 3; }
@@ -261,7 +246,6 @@ test_2() {
     echo "Verify Lustre filesystem is up and running"
     client_df
 
-    echo "Failing MDS"
     shutdown_facet mds
     reboot_facet mds
 
@@ -273,17 +257,15 @@ test_2() {
     DFPID=$!
     sleep 5
 
-    echo "Failing OST"
     shutdown_facet ost1
 
     echo "Reintegrating OST"
     reboot_facet ost1
     wait_for ost1
-    start ost1
+    start_ost 1
 
-    echo "Failover MDS"
     wait_for mds
-    start mds
+    start mds $MDSDEV $MDS_MOUNT_OPTS
 
     #Check FS
     wait $DFPID
@@ -331,7 +313,6 @@ test_4() {
     echo "Fourth Failure Mode: OST/MDS `date`"
 
     #OST Portion
-    echo "Failing OST ost1"
     shutdown_facet ost1
  
     #Check FS
@@ -341,7 +322,6 @@ test_4() {
     sleep 5
 
     #MDS Portion
-    echo "Failing MDS"
     shutdown_facet mds
     reboot_facet mds
 
@@ -357,11 +337,10 @@ test_4() {
     echo "Reintegrating OST"
     reboot_facet ost1
     wait_for ost1
-    start ost1
+    start_ost 1
     
-    echo "Failover MDS"
     wait_for mds
-    start mds
+    start mds $MDSDEV $MDS_MOUNT_OPTS
     #Check FS
     
     wait $DFPIDA
@@ -382,7 +361,6 @@ test_5() {
     client_df
     
     #OST Portion
-    echo "Failing OST"
     shutdown_facet ost1
     reboot_facet ost1
     
@@ -393,7 +371,6 @@ test_5() {
     sleep 5
     
     #OST Portion
-    echo "Failing OST"
     shutdown_facet ost2
     reboot_facet ost2
 
@@ -406,9 +383,9 @@ test_5() {
     #Reintegration
     echo "Reintegrating OSTs"
     wait_for ost1
-    start ost1
+    start_ost 1
     wait_for ost2
-    start ost2
+    start_ost 2
     
     clients_recover_osts ost1
     clients_recover_osts ost2
@@ -431,7 +408,6 @@ test_6() {
     client_touch testfile || return 2
        
     #OST Portion
-    echo "Failing OST"
     shutdown_facet ost1
     reboot_facet ost1
 
@@ -454,7 +430,7 @@ test_6() {
     #Reintegration
     echo "Reintegrating OST/CLIENTs"
     wait_for ost1
-    start ost1
+    start_ost 1
     reintegrate_clients
     sleep 5 
 
@@ -496,7 +472,6 @@ test_7() {
     client_rm testfile
 
     #MDS Portion
-    echo "Failing MDS"
     facet_failover mds
 
     #Check FS
@@ -548,7 +523,6 @@ test_8() {
 
 
     #OST Portion
-    echo "Failing OST"
     shutdown_facet ost1
     reboot_facet ost1
 
@@ -565,7 +539,7 @@ test_8() {
     echo "Reintegrating CLIENTs/OST"
     reintegrate_clients
     wait_for ost1
-    start ost1
+    start_ost 1
     wait $DFPID
     client_df || return 1
     client_touch testfile2 || return 2
@@ -637,5 +611,4 @@ test_10() {
 run_test 10 "Running Availability for 6 hours..."
 
 equals_msg "Done, cleaning up"
-# we need to force cleanup for the stale MDS conns until bug 5921 is fixed
-FORCE=--force $CLEANUP
+$CLEANUP
index ca26b2a..80fa2b3 100755 (executable)
@@ -29,13 +29,16 @@ fi
 [ "$DEBUG" ] && debug_opt="--ptldebug=$DEBUG"
 [ "$PTLDEBUG" ] && debug_opt="--ptldebug=$PTLDEBUG"
 
-${LCONF} $NOMOD $portals_opt $lustre_opt $debug_opt $node_opt ${REFORMAT:---reformat} $@ \
-       $conf_opt  || {
+echo llmount: FIXME replace llmount.sh with ". mountconf.sh" and "$SETUP"
+
+exit 1
+
+#${LCONF} $NOMOD $portals_opt $lustre_opt $debug_opt $node_opt ${REFORMAT:---reformat} $@ $conf_opt  || {
     # maybe acceptor error, dump tcp port usage
-    netstat -tpn
-    exit 2
-}
+#    netstat -tpn
+#    exit 2
+#}
 
-if [ "$MOUNT2" ]; then
-       $LLMOUNT -v -o user_xattr,acl `hostname`:/mds1/client $MOUNT2 || exit 3
-fi
+#if [ "$MOUNT2" ]; then
+#      $LLMOUNT -v -o user_xattr,acl `hostname`:/mds1/client $MOUNT2 || exit 3
+#fi
index 3293050..7d8eda9 100755 (executable)
@@ -30,8 +30,12 @@ fi
 
 [ "$MOUNT2" ] && umount $MOUNT2
 
-${LCONF} $NOMOD $portals_opt $lustre_opt $node_opt --cleanup $@ \
+#${LCONF} $NOMOD $portals_opt $lustre_opt $node_opt --cleanup $@ \
     --dump $TMP/debug $conf_opt
+
+echo FIXME llmountcleanup should be replaced with $CLEANUP
+exit 1
+
 rc=$?
 echo "lconf DONE"
 BUSY=`dmesg | grep -i destruct`
diff --git a/lustre/tests/llrmount.sh b/lustre/tests/llrmount.sh
deleted file mode 100755 (executable)
index 434ef44..0000000
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/sh
-# vim:expandtab:shiftwidth=4:softtabstop=4:tabstop=4:
-
-export PATH=`dirname $0`/../utils:$PATH
-
-LCONF=${LCONF:-lconf}
-NAME=${NAME:-local}
-LLMOUNT=${LLMOUNT:-llmount}
-
-config=$NAME.xml
-mkconfig=$NAME.sh
-
-if [ "$PORTALS" ]; then
-  portals_opt="--portals=$PORTALS"
-fi
-
-if [ "$LUSTRE" ]; then
-  lustre_opt="--lustre=$LUSTRE"
-fi
-
-if [ "$LDAPURL" ]; then
-    conf_opt="--ldapurl $LDAPURL --config $NAME"
-else
-    if [ ! -f $config -o $mkconfig -nt $config ]; then
-       sh $mkconfig $config || exit 1
-    fi
-    conf_opt="$config"
-fi    
-
-[ "$NODE" ] && node_opt="--node $NODE"
-[ "$DEBUG" ] && portals_opt="$portals_opt --ptldebug=$DEBUG"
-[ "$PTLDEBUG" ] && portals_opt="$portals_opt --ptldebug=$PTLDEBUG"
-
-${LCONF} $NOMOD $portals_opt $lustre_opt $node_opt $@ $conf_opt || {
-    # maybe acceptor error, dump tcp port usage
-    netstat -tpn
-    exit 2
-}
-
-
-if [ "$MOUNT2" ]; then
-       $LLMOUNT -v -o user_xattr,acl `hostname`:/mds1/client $MOUNT2 || exit 3
-fi
index 0a8cc71..fb19ac6 100755 (executable)
@@ -4,29 +4,30 @@ export PATH=`dirname $0`/../utils:$PATH
 
 config=${1:-`basename $0 .sh`.xml}
 
-LMC="${LMC:-lmc} -m $config"
+LMC=echo 
 TMP=${TMP:-/tmp}
 
+FSNAME=lustre
 HOSTNAME=`hostname`
-MDSDEV=${MDSDEV:-$TMP/mds1-`hostname`}
+MDSDEV=${MDSDEV:-$TMP/mdt-${FSNAME}}
 MDSSIZE=${MDSSIZE:-400000}
-FSTYPE=${FSTYPE:-ext3}
-MOUNT=${MOUNT:-/mnt/lustre}
+MOUNT=${MOUNT:-/mnt/${FSNAME}}
 MOUNT2=${MOUNT2:-${MOUNT}2}
 NETTYPE=${NETTYPE:-tcp}
 [ "$ACCEPTOR_PORT" ] && PORT_OPT="--port $ACCEPTOR_PORT"
 
-OSTDEV=${OSTDEV:-$TMP/ost1-`hostname`}
+OSTDEV=${OSTDEV:-$TMP/ost0-${FSNAME}}
 OSTSIZE=${OSTSIZE:-400000}
+OSTDEV2=${OSTDEV2:-$TMP/ost1-${FSNAME}}
 
 MDS_MOUNT_OPTS="user_xattr,acl,${MDS_MOUNT_OPTS:-""}"
 CLIENTOPT="user_xattr,acl,${CLIENTOPT:-""}"
 
 # specific journal size for the ost, in MB
 JSIZE=${JSIZE:-0}
-[ "$JSIZE" -gt 0 ] && JARG="--journal_size $JSIZE"
+[ "$JSIZE" -gt 0 ] && OST_MKFS_OPTS=$OST_MKFS_OPTS" -J size=$JSIZE"
 MDSISIZE=${MDSISIZE:-0}
-[ "$MDSISIZE" -gt 0 ] && IARG="--inode_size $MDSISIZE"
+[ "$MDSISIZE" -gt 0 ] && MDS_MKFS_OPTS=$MDS_MKFS_OPTS" -i $MDSISIZE"
 
 STRIPE_BYTES=${STRIPE_BYTES:-1048576}
 STRIPES_PER_OBJ=1      # 0 means stripe over all OSTs
@@ -58,38 +59,28 @@ h2iib () {
        esac
 }
 
-# create nodes
-${LMC} --add node --node $HOSTNAME || exit 10
-${LMC} --add net --node $HOSTNAME --nid `h2$NETTYPE $HOSTNAME` \
-    --nettype $NETTYPE $PORT_OPT || exit 11
-${LMC} --add net --node client --nid '*' --nettype $NETTYPE $PORT_OPT|| exit 12
+MGSNID=`h2$NETTYPE $HOSTNAME`
 
 # configure mds server
 [ "x$MDS_MOUNT_OPTS" != "x" ] &&
-    MDS_MOUNT_OPTS="--mountfsoptions $MDS_MOUNT_OPTS"
-
+    MDS_MOUNT_OPTS="--mountfsoptions=$MDS_MOUNT_OPTS"
+[ "x$MDS_MKFS_OPTS" != "x" ] &&
+    MDS_MOUNT_OPTS="--mkfsoptions=\"$MDS_MOUNT_OPTS\""
 [ "x$QUOTA_OPTS" != "x" ] &&
     QUOTA_OPTS="--quota $QUOTA_OPTS"
-    
-# configure mds server
-${LMC} --add mds --node $HOSTNAME --mds mds1 --fstype $FSTYPE \
-       --dev $MDSDEV $MDS_MOUNT_OPTS $QUOTA_OPTS\
-       --size $MDSSIZE $JARG $IARG $MDSOPT || exit 20
+[ ! -z "$mdsfailover_HOST" ] && MDS_FAIL_OPT="--failnode=$mdsfailover_HOST"    
+
+MDS_OPTS="--mgs $MDS_FAIL_OPT --device-size=$MDSSIZE $MDS_MOUNT_OPTS $MDS_MKFS_OPTS"
+echo mkfs.lustre --mdt $MDS_OPTS --reformat $MDSDEV
 
 [ "x$OST_MOUNT_OPTS" != "x" ] &&
-    OST_MOUNT_OPTS="--mountfsoptions $OST_MOUNT_OPTS"
-
-# configure ost
-${LMC} --add lov --lov lov1 --mds mds1 --stripe_sz $STRIPE_BYTES \
-       --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0 $LOVOPT || exit 20
-
-${LMC} --add ost --node $HOSTNAME --lov lov1 --fstype $FSTYPE \
-       --dev $OSTDEV $QUOTA_OPTS\
-       $OST_MOUNT_OPTS --size $OSTSIZE $JARG $OSTOPT || exit 30
-
-# create client config
-[ "x$CLIENTOPT" != "x" ] && CLIENTOPT="--clientoptions $CLIENTOPT"
-${LMC} --add mtpt --node $HOSTNAME --path $MOUNT \
-       --mds mds1 --lov lov1 $CLIENTOPT || exit 40
-${LMC} --add mtpt --node client --path $MOUNT2 \
-       --mds mds1 --lov lov1 $CLIENTOPT || exit 41
+    OST_MOUNT_OPTS="--mountfsoptions=$OST_MOUNT_OPTS"
+[ "x$OST_MKFS_OPTS" != "x" ] &&
+    OST_MOUNT_OPTS="--mkfsoptions=\"$OST_MOUNT_OPTS\""
+
+OST_OPTS="--mgsnode=`h2$NETTYPE $HOSTNAME` $OST_FAIL_OPT --device-size=$OSTSIZE $OST_MOUNT_OPTS $OST_MKFS_OPTS"
+echo mkfs.lustre --ost $OST_OPTS --reformat $OSTDEV
+
+OST2_OPTS="--mgsnode=`h2$NETTYPE $HOSTNAME` $OST_FAIL_OPT --device-size=$OSTSIZE $OST_MOUNT_OPTS $OST_MKFS_OPTS"
+echo mkfs.lustre --ost $OST2_OPTS --reformat $OSTDEV2
+
index 352c2b9..62c3b14 100755 (executable)
@@ -19,7 +19,7 @@ NETTYPE=${NETTYPE:-tcp}
 [ "$ACCEPTOR_PORT" ] && PORT_OPT="--port $ACCEPTOR_PORT"
 
 OSTCOUNT=${OSTCOUNT:-5}
-# OSTDEVN will still override the device for OST N
+# OSTDEVn will still override the device for OST n
 
 OSTSIZE=${OSTSIZE:-150000}
 # 1 to config an echo client instead of llite
index 5a61806..91b6a2f 100644 (file)
@@ -198,7 +198,7 @@ out_close:
         return rc;
 }
 
-/* cocurrent mmap operations on two nodes */
+/* concurrent mmap operations on two nodes */
 static int mmap_tst3(char *mnt)
 {
         char *ptr, mmap_file[256];
@@ -403,7 +403,7 @@ static int cancel_lru_locks(char *prefix)
         }
 
         if (prefix)
-                sprintf(cmd, "ls /proc/fs/lustre/ldlm/namespaces/%s_*/lru_size", prefix);
+                sprintf(cmd, "ls /proc/fs/lustre/ldlm/namespaces/*-%s-*/lru_size", prefix);
         else
                 sprintf(cmd, "ls /proc/fs/lustre/ldlm/namespaces/*/lru_size");
 
@@ -472,7 +472,7 @@ static int mmap_tst5(char *mnt)
         memset(ptr, 'a', region);
 
         /* cancel unused locks */
-        cancel_lru_locks("OSC");
+        cancel_lru_locks("osc");
         if (rc)
                 goto out_unmap;
 
@@ -538,7 +538,7 @@ static int mmap_tst6(char *mnt)
                 goto out;
         }
 
-        cancel_lru_locks("OSC");
+        cancel_lru_locks("osc");
         if (rc)
                 goto out;
 
@@ -594,11 +594,11 @@ struct test_case {
 struct test_case tests[] = {
         { 1, "mmap test1: basic mmap operation", mmap_tst1, 1 },
         { 2, "mmap test2: MAP_PRIVATE not write back", mmap_tst2, 1 },
-        { 3, "mmap test3: cocurrent mmap ops on two nodes", mmap_tst3, 2 },
-        { 4, "mmap test4: c1 write to f1 from mmaped f2, " 
-             "c2 write to f1 from mmaped f1", mmap_tst4, 2 },
+        { 3, "mmap test3: concurrent mmap ops on two nodes", mmap_tst3, 2 },
+        { 4, "mmap test4: c1 write to f1 from mmapped f2, " 
+             "c2 write to f1 from mmapped f1", mmap_tst4, 2 },
         { 5, "mmap test5: read/write file to/from the buffer "
-             "which mmaped to just this file", mmap_tst5, 1 },
+             "which mmapped to just this file", mmap_tst5, 1 },
         { 6, "mmap test6: check mmap write/read content on two nodes", 
                 mmap_tst6, 2 },
         { 0, NULL, 0, 0 }
diff --git a/lustre/tests/mountconf.sh b/lustre/tests/mountconf.sh
new file mode 100755 (executable)
index 0000000..0d71f75
--- /dev/null
@@ -0,0 +1,59 @@
+#!/bin/sh
+
+#set -vx
+
+# mountconf setup of MDS and two OSTs
+
+#export PATH=`dirname $0`/../utils:$PATH
+#LUSTRE=${LUSTRE:-`dirname $0`/..}
+#. $LUSTRE/tests/test-framework.sh
+#init_test_env $@
+
+mcstopall() {
+    # make sure we are using the primary server, so test-framework will
+    # be able to clean up properly.
+    activemds=`facet_active mds`
+    if [ $activemds != "mds" ]; then
+        fail mds
+    fi
+
+    grep " $MOUNT " /proc/mounts && zconf_umount `hostname` $MOUNT $*
+    stop ost -f
+    stop ost2 -f
+    stop mds -f
+    return 0
+}
+
+mccleanup() {
+    echo "mountconf cleanup $*"
+    mcstopall $*
+    unload_modules
+}
+
+mcformat() {
+    mcstopall
+    echo Formatting mds, ost, ost2
+    add mds $MDS_MKFS_OPTS --reformat $MDSDEV    > /dev/null || exit 10
+    add ost $OST_MKFS_OPTS --reformat $OSTDEV    > /dev/null || exit 10
+    add ost2 $OST2_MKFS_OPTS --reformat $OSTDEV2 > /dev/null || exit 10
+}
+export MCFORMAT=${MCFORMAT:-"mcformat"}
+
+mount_client() {
+    grep " $1 " /proc/mounts || zconf_mount `hostname` $*
+}
+
+mcsetup() {
+    echo Setup mds, ost, ost2
+    start mds $MDSDEV $MDS_MOUNT_OPTS
+    start ost $OSTDEV $OST_MOUNT_OPTS
+    start ost2 $OSTDEV2 $OST2_MOUNT_OPTS
+    [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
+
+    mount_client $MOUNT
+    sleep 5
+}
+
+export MCSETUP=${MCSETUP:-"mcsetup"}
+export MCCLEANUP=${MCCLEANUP:-"mccleanup"}
+
index 0d12568..3da2ceb 100755 (executable)
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 set -e
-set -vx
+#set -vx
 
 export PATH=`dirname $0`/../utils:$PATH
 LFS=${LFS:-lfs}
@@ -52,7 +52,7 @@ fi
 # flush cache to OST(s) so avail numbers are correct
 sync; sleep 1 ; sync
 
-for OSC in /proc/fs/lustre/osc/OSC*MNT*; do
+for OSC in /proc/fs/lustre/osc/*-osc-*; do
        AVAIL=`cat $OSC/kbytesavail`
        GRANT=`cat $OSC/cur_grant_bytes`
        [ $(($AVAIL - $GRANT / 1024)) -lt 400 ] && OSCFULL=full
@@ -60,7 +60,7 @@ done
 
 if [ -z "$OSCFULL" ]; then
        echo "no OSTs are close to full"
-       grep "[0-9]" /proc/fs/lustre/osc/OSC*MNT*/{kbytesavail,cur*}
+       grep "[0-9]" /proc/fs/lustre/osc/*-osc-*/{kbytesavail,cur*}
        SUCCESS=0
 fi
 
index f7682bb..1eb5dbd 100644 (file)
@@ -54,14 +54,14 @@ fi
 # flush cache to OST(s) so avail numbers are correct
 sync; sleep 1 ; sync
 
-for OSC in /proc/fs/lustre/osc/OSC*MNT*; do
+for OSC in /proc/fs/lustre/osc/*-osc-*; do
        AVAIL=`cat $OSC/kbytesavail`
        GRANT=`cat $OSC/cur_grant_bytes`
        [ $(($AVAIL - $GRANT / 1024)) -lt 400 ] && OSCFULL=full
 done
 if [ -z "$OSCFULL" ]; then
        echo "no OSTs are close to full"
-       grep "[0-9]" /proc/fs/lustre/osc/OSC*MNT*/{kbytesavail,cur*}|tee -a $LOG
+       grep "[0-9]" /proc/fs/lustre/osc/*-osc-*/{kbytesavail,cur*}|tee -a $LOG
        SUCCESS=0
 fi
 
index 25d613e..36e90f3 100755 (executable)
@@ -5,57 +5,33 @@ set -e
 #         bug  2986 5494 7288
 ALWAYS_EXCEPT="20b  24   27 $RECOVERY_SMALL_EXCEPT"
 
-LUSTRE=${LUSTRE:-`dirname $0`/..}
+# Tests that always fail with mountconf -- FIXME
+# 16 fails with 1, not evicted
+EXCEPT="$EXCEPT 16"
 
-. $LUSTRE/tests/test-framework.sh
 
+LUSTRE=${LUSTRE:-`dirname $0`/..}
+. $LUSTRE/tests/test-framework.sh
 init_test_env $@
-
 . ${CONFIG:=$LUSTRE/tests/cfg/local.sh}
 
 build_test_filter
 
-
 # Allow us to override the setup if we already have a mounted system by
 # setting SETUP=" " and CLEANUP=" "
 SETUP=${SETUP:-"setup"}
 CLEANUP=${CLEANUP:-"cleanup"}
-FORCE=${FORCE:-"--force"}
 
-make_config() {
-    rm -f $XMLCONFIG
-    add_mds mds --dev $MDSDEV --size $MDSSIZE
-    add_lov lov1 mds --stripe_sz $STRIPE_BYTES \
-       --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0
-    add_ost ost --lov lov1 --dev $OSTDEV --size $OSTSIZE
-    add_ost ost2 --lov lov1 --dev ${OSTDEV}-2 --size $OSTSIZE
-    add_client client mds --lov lov1 --path $MOUNT
-}
+# for MCSETUP and MCCLEANUP
+. mountconf.sh
 
 setup() {
-    make_config
-    start ost --reformat $OSTLCONFARGS 
-    start ost2 --reformat $OSTLCONFARGS 
-    [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
-    start mds $MDSLCONFARGS --reformat
-    grep " $MOUNT " /proc/mounts || zconf_mount `hostname`  $MOUNT
+    $MCFORMAT
+    $MCSETUP
 }
 
 cleanup() {
-    zconf_umount `hostname` $MOUNT
-    stop mds ${FORCE} $MDSLCONFARGS
-    stop ost2 ${FORCE}
-    stop ost ${FORCE} --dump $TMP/recovery-small-`hostname`.log
-}
-
-replay() {
-    do_mds "sync"
-    do_mds 'echo -e "device \$mds1\\nprobe\\nnotransno\\nreadonly" | lctl'
-    do_client "$1" &
-    shutdown_mds -f
-    start_mds
-    wait
-    do_client "df -h $MOUNT" # trigger failover, if we haven't already
+       $MCCLEANUP > /dev/null || { echo "FAILed to clean up"; exit 20; }
 }
 
 if [ ! -z "$EVAL" ]; then
@@ -65,12 +41,11 @@ fi
 
 if [ "$ONLY" == "cleanup" ]; then
     sysctl -w lnet.debug=0 || true
-    FORCE=--force cleanup
+    cleanup
     exit
 fi
 
-REFORMAT=--reformat $SETUP
-unset REFORMAT
+$SETUP
 
 [ "$ONLY" == "setup" ] && exit
 
@@ -93,14 +68,14 @@ test_3() {
 run_test 3 "stat: drop req, drop rep"
 
 test_4() {
-    do_facet client "cp /etc/resolv.conf $MOUNT/resolv.conf" || return 1
-    drop_request "cat $MOUNT/resolv.conf > /dev/null"   || return 2
-    drop_reply "cat $MOUNT/resolv.conf > /dev/null"     || return 3
+    do_facet client "cp /etc/passwd $MOUNT/passwd" || return 1
+    drop_request "cat $MOUNT/passwd > /dev/null"   || return 2
+    drop_reply "cat $MOUNT/passwd > /dev/null"     || return 3
 }
 run_test 4 "open: drop req, drop rep"
 
 test_5() {
-    drop_request "mv $MOUNT/resolv.conf $MOUNT/renamed" || return 1
+    drop_request "mv $MOUNT/passwd $MOUNT/renamed" || return 1
     drop_reint_reply "mv $MOUNT/renamed $MOUNT/renamed-again" || return 2
     do_facet client "checkstat -v $MOUNT/renamed-again"  || return 3
 }
@@ -152,7 +127,7 @@ test_11(){
     do_facet client multiop $MOUNT/$tfile Ow  || return 1
     do_facet client multiop $MOUNT/$tfile or  || return 2
 
-    cancel_lru_locks OSC
+    cancel_lru_locks osc
 
     do_facet client multiop $MOUNT/$tfile or  || return 3
     drop_bl_callback multiop $MOUNT/$tfile Ow || echo "evicted as expected"
@@ -207,15 +182,15 @@ test_15() {
 }
 run_test 15 "failed open (-ENOMEM)"
 
-READ_AHEAD=`cat /proc/fs/lustre/llite/*/max_read_ahead_mb | head -n 1`
+READ_AHEAD=`cat $LPROC/llite/*/max_read_ahead_mb | head -n 1`
 stop_read_ahead() {
-   for f in /proc/fs/lustre/llite/*/max_read_ahead_mb; do 
+   for f in $LPROC/llite/*/max_read_ahead_mb; do 
       echo 0 > $f
    done
 }
 
 start_read_ahead() {
-   for f in /proc/fs/lustre/llite/*/max_read_ahead_mb; do 
+   for f in $LPROC/llite/*/max_read_ahead_mb; do 
       echo $READ_AHEAD > $f
    done
 }
@@ -227,7 +202,7 @@ test_16() {
 
 #define OBD_FAIL_PTLRPC_BULK_PUT_NET 0x504 | OBD_FAIL_ONCE
     do_facet ost sysctl -w lustre.fail_loc=0x80000504
-    cancel_lru_locks OSC
+    cancel_lru_locks osc
     # will get evicted here
     do_facet client "cmp /etc/termcap $MOUNT/termcap"  && return 1
     sysctl -w lustre.fail_loc=0
@@ -260,7 +235,7 @@ test_18a() {
     do_facet client mkdir -p $MOUNT/$tdir
     f=$MOUNT/$tdir/$tfile
 
-    cancel_lru_locks OSC
+    cancel_lru_locks osc
     pgcache_empty || return 1
 
     # 1 stripe on ost2
@@ -268,14 +243,13 @@ test_18a() {
 
     do_facet client cp /etc/termcap $f
     sync
-    local osc2_dev=`$LCTL device_list | \
-       awk '(/ost2.*client_facet/){print $4}' `
-    $LCTL --device %$osc2_dev deactivate
+    local osc2dev=`grep ${ost2_svc}-osc- $LPROC/devices | awk '{print $1}'`
+    $LCTL --device $osc2dev deactivate || return 3
     # my understanding is that there should be nothing in the page
     # cache after the client reconnects?     
     rc=0
     pgcache_empty || rc=2
-    $LCTL --device %$osc2_dev activate
+    $LCTL --device $osc2dev activate
     rm -f $f
     return $rc
 }
@@ -286,7 +260,7 @@ test_18b() {
     f=$MOUNT/$tdir/$tfile
     f2=$MOUNT/$tdir/${tfile}-2
 
-    cancel_lru_locks OSC
+    cancel_lru_locks osc
     pgcache_empty || return 1
 
     # shouldn't have to set stripe size of count==1
@@ -329,7 +303,7 @@ test_19b() {
     do_facet client multiop $f Ow  || return 1
     do_facet client multiop $f or  || return 2
 
-    cancel_lru_locks OSC
+    cancel_lru_locks osc
 
     do_facet client multiop $f or  || return 3
     drop_ldlm_cancel multiop $f Ow  || echo "client evicted, as expected"
@@ -343,7 +317,7 @@ test_20a() {        # bug 2983 - ldlm_handle_enqueue cleanup
        multiop $DIR/$tdir/${tfile} O_wc &
        MULTI_PID=$!
        sleep 1
-       cancel_lru_locks OSC
+       cancel_lru_locks osc
 #define OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR 0x308
        do_facet ost sysctl -w lustre.fail_loc=0x80000308
        kill -USR1 $MULTI_PID
@@ -356,7 +330,7 @@ run_test 20a "ldlm_handle_enqueue error (should return error)"
 test_20b() {   # bug 2986 - ldlm_handle_enqueue error during open
        mkdir -p $DIR/$tdir
        touch $DIR/$tdir/${tfile}
-       cancel_lru_locks OSC
+       cancel_lru_locks osc
 #define OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR 0x308
        do_facet ost sysctl -w lustre.fail_loc=0x80000308
        dd if=/etc/hosts of=$DIR/$tdir/$tfile && \
@@ -377,7 +351,7 @@ run_test 20b "ldlm_handle_enqueue error (should return error)"
 
 test_24() {    # bug 2248 - eviction fails writeback but app doesn't see it
        mkdir -p $DIR/$tdir
-       cancel_lru_locks OSC
+       cancel_lru_locks osc
        multiop $DIR/$tdir/$tfile Owy_wyc &
        MULTI_PID=$!
        usleep 500
@@ -399,7 +373,7 @@ test_26() {      # bug 5921 - evict dead exports by pinger
            echo "skipping test 26 (local OST)" && return
        [ "`lsmod | grep mds`" ] && \
            echo "skipping test 26 (local MDS)" && return
-       OST_FILE=/proc/fs/lustre/obdfilter/ost_svc/num_exports
+       OST_FILE=$LPROC/obdfilter/ost_svc/num_exports
         OST_EXP="`do_facet ost cat $OST_FILE`"
        OST_NEXP1=`echo $OST_EXP | cut -d' ' -f2`
        echo starting with $OST_NEXP1 OST exports
@@ -421,9 +395,9 @@ run_test 26 "evict dead exports"
 
 test_26b() {      # bug 10140 - evict dead exports by pinger
        zconf_mount `hostname` $MOUNT2
-       MDS_FILE=/proc/fs/lustre/mds/mds_svc/num_exports
+       MDS_FILE=$LPROC/mds/${mds_svc}/num_exports
         MDS_NEXP1="`do_facet mds cat $MDS_FILE | cut -d' ' -f2`"
-       OST_FILE=/proc/fs/lustre/obdfilter/ost_svc/num_exports
+       OST_FILE=$LPROC/obdfilter/${ost_svc}/num_exports
         OST_NEXP1="`do_facet ost cat $OST_FILE | cut -d' ' -f2`"
        echo starting with $OST_NEXP1 OST and $MDS_NEXP1 MDS exports
        zconf_umount `hostname` $MOUNT2 -f
@@ -566,5 +540,4 @@ test_52() {
 }
 run_test 52 "failover OST under load"
 
-
-FORCE=--force $CLEANUP
+$CLEANUP
index 5fe9d3a..05dfdde 100755 (executable)
@@ -14,57 +14,42 @@ init_test_env $@
 
 SETUP=${SETUP:-"setup"}
 CLEANUP=${CLEANUP:-"cleanup"}
-FORCE=${FORCE:-"--force"}
-
-gen_config() {
-    rm -f $XMLCONFIG
-    add_mds mds --dev $MDSDEV --size $MDSSIZE
-    if [ ! -z "$mdsfailover_HOST" ]; then
-    add_mdsfailover mds --dev $MDSDEV --size $MDSSIZE
-    fi
-    
-    add_lov lov1 mds --stripe_sz $STRIPE_BYTES \
-   --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0
-    add_ost ost --lov lov1 --dev $OSTDEV --size $OSTSIZE --failover
-    add_ost ost2 --lov lov1 --dev ${OSTDEV}-2 --size $OSTSIZE --failover
-    add_client client mds --lov lov1 --path $MOUNT
-}
-
-
 
 build_test_filter
 
 cleanup() {
-    # make sure we are using the primary MDS, so the config log will
+    # make sure we are using the primary server, so test-framework will
     # be able to clean up properly.
     activemds=`facet_active mds`
     if [ $activemds != "mds" ]; then
         fail mds
     fi
 
-    umount $MOUNT2 || true
-    umount $MOUNT  || true
-    rmmod llite || true
-    stop mds ${FORCE}
-    stop ost2 ${FORCE}
-    stop ost ${FORCE}  --dump $TMP/replay-dual-`hostname`.log
+    grep " $MOUNT " /proc/mounts && zconf_umount `hostname` $MOUNT
+    grep " $MOUNT2 " /proc/mounts && zconf_umount `hostname` $MOUNT2
+    stop mds -f
+    stop ost2 -f
+    stop ost -f
 }
 
 if [ "$ONLY" == "cleanup" ]; then
     sysctl -w lnet.debug=0
-    FORCE=--force cleanup
+    cleanup
     exit
 fi
 
 setup() {
-    gen_config
-    start ost --reformat $OSTLCONFARGS 
-    start ost2 --reformat $OSTLCONFARGS 
-    start mds $MDSLCONFARGS --reformat
-    grep " $MOUNT " /proc/mounts || zconf_mount `hostname` $MOUNT
-    grep " $MOUNT2 " /proc/mounts || zconf_mount `hostname` $MOUNT2
-
-#    echo $TIMEOUT > /proc/sys/lustre/timeout
+    cleanup
+    add mds $MDS_MKFS_OPTS --reformat $MDSDEV
+    add ost $OST_MKFS_OPTS --reformat $OSTDEV
+    add ost2 $OST2_MKFS_OPTS --reformat $OSTDEV2
+    start mds $MDSDEV $MDS_MOUNT_OPTS
+    start ost $OSTDEV $OST_MOUNT_OPTS
+    start ost2 $OSTDEV2 $OST2_MOUNT_OPTS
+    # client actions will get EIO until MDT contacts OSTs, so give it a sec
+    sleep 5
+    zconf_mount `hostname` $MOUNT
+    zconf_mount `hostname` $MOUNT2
 }
 
 $SETUP
@@ -459,7 +444,7 @@ test_18() { # bug 3822 - evicting client with enqueued lock
    sleep 1
 #define OBD_FAIL_LDLM_BL_CALLBACK        0x305
    do_facet client sysctl -w lustre.fail_loc=0x80000305  # drop cb, evict
-   cancel_lru_locks MDC
+   cancel_lru_locks mdc
    usleep 500 # wait to ensure first client is one that will be evicted
    openfile -f O_RDONLY $MOUNT2/$tdir/f0
    wait $OPENPID
@@ -472,5 +457,5 @@ if [ "$ONLY" != "setup" ]; then
    equals_msg test complete, cleaning up
    SLEEP=$((`date +%s` - $NOW))
    [ $SLEEP -lt $TIMEOUT ] && sleep $SLEEP
-   FORCE=--force $CLEANUP
+   $CLEANUP
 fi
index c9ae901..f74e4f4 100755 (executable)
@@ -10,38 +10,39 @@ init_test_env $@
 . ${CONFIG:=$LUSTRE/tests/cfg/local.sh}
 
 ostfailover_HOST=${ostfailover_HOST:-$ost_HOST}
+#failover= must be defined in OST_MKFS_OPTIONS if ostfailover_HOST != ost_HOST
 
 # Skip these tests
 # BUG NUMBER: 2766?
 ALWAYS_EXCEPT="5 $REPLAY_OST_SINGLE_EXCEPT"
 
 gen_config() {
-    rm -f $XMLCONFIG
-    add_mds mds --dev $MDSDEV --size $MDSSIZE
-    add_lov lov1 mds --stripe_sz $STRIPE_BYTES \
-       --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0
-    add_ost ost --lov lov1 --dev $OSTDEV --size $OSTSIZE --failover
-    if [ ! -z "$ostfailover_HOST" ]; then
-        add_ostfailover ost --dev $OSTDEV --size $OSTSIZE
-    fi
-    add_client client mds --lov lov1 --path $MOUNT
+    grep " $MOUNT " /proc/mounts && zconf_umount `hostname` $MOUNT
+    stop ost -f
+    stop ost2 -f
+    stop mds -f
+    echo Formatting mds, ost
+    add mds $MDS_MKFS_OPTS --reformat $MDSDEV
+    add ost $OST_MKFS_OPTS --reformat $OSTDEV
 }
 
 cleanup() {
-    # make sure we are using the primary MDS, so the config log will
+    # make sure we are using the primary server, so test-framework will
     # be able to clean up properly.
     activeost=`facet_active ost`
     if [ $activeost != "ost" ]; then
         fail ost
     fi
+
     zconf_umount `hostname` $MOUNT
-    stop mds ${FORCE} $MDSLCONFARGS
-    stop ost ${FORCE} --dump $TMP/replay-ost-single-`hostname`.log
+    stop mds
+    stop ost
+    unload_modules
 }
 
 if [ "$ONLY" == "cleanup" ]; then
     sysctl -w lnet.debug=0
-    FORCE=--force cleanup
+    cleanup
     exit
 fi
 
@@ -52,18 +53,15 @@ CLEANUP=${CLEANUP:-"cleanup"}
 
 setup() {
     gen_config
-
-    start ost --reformat $OSTLCONFARGS
+    start mds $MDSDEV $MDS_MOUNT_OPTS
+    start ost $OSTDEV $OST_MOUNT_OPTS
     [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
-    start mds --reformat $MDSLCONFARGS
 
     if [ -z "`grep " $MOUNT " /proc/mounts`" ]; then
        # test "-1" needed during initial client->OST connection
        log "== test 00: target handle mismatch (bug 5317) === `date +%H:%M:%S`"
-
        #define OBD_FAIL_OST_ALL_REPLY_NET       0x211
        do_facet ost "sysctl -w lustre.fail_loc=0x80000211"
-
        zconf_mount `hostname` $MOUNT && df $MOUNT && pass || error "mount fail"
     fi
 }
@@ -117,7 +115,7 @@ test_4() {
     verify=$ROOT/tmp/verify-$$
     dd if=/dev/urandom bs=4096 count=1280 | tee $verify > $DIR/$tfile
     # invalidate cache, so that we're reading over the wire
-    for i in /proc/fs/lustre/ldlm/namespaces/OSC_*MNT*; do
+    for i in /proc/fs/lustre/ldlm/namespaces/*-osc-*; do
         echo -n clear > $i/lru_size
     done
     cmp $verify $DIR/$tfile &
@@ -145,7 +143,7 @@ test_5() {
 run_test 5 "Fail OST during iozone"
 
 kbytesfree() {
-   awk '{total+=$1} END {print total}' /proc/fs/lustre/osc/OSC_*MNT*/kbytesfree
+   awk '{total+=$1} END {print total}' /proc/fs/lustre/osc/*-osc-*/kbytesfree
 }
 
 test_6() {
@@ -199,4 +197,4 @@ test_7() {
 run_test 7 "Fail OST before obd_destroy"
 
 equals_msg test complete, cleaning up
-FORCE=--force $CLEANUP
+$CLEANUP
index 8352be3..513766c 100755 (executable)
@@ -1,6 +1,7 @@
 #!/bin/sh
 
 set -e
+#set -v
 
 #
 # This test needs to be run on the client
@@ -12,57 +13,26 @@ LUSTRE=${LUSTRE:-`dirname $0`/..}
 init_test_env $@
 
 . ${CONFIG:=$LUSTRE/tests/cfg/local.sh}
+. mountconf.sh 
 
 # Skip these tests
-# bug number: 2766 9930
+# bug number: 2766
 ALWAYS_EXCEPT="0b   $REPLAY_SINGLE_EXCEPT"
 
-gen_config() {
-    rm -f $XMLCONFIG
-    add_mds mds --dev $MDSDEV --size $MDSSIZE
-    if [ ! -z "$mdsfailover_HOST" ]; then
-        add_mdsfailover mds --dev $MDSDEV --size $MDSSIZE
-    fi
-    
-    add_lov lov1 mds --stripe_sz $STRIPE_BYTES \
-       --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0
-    add_ost ost --lov lov1 --dev $OSTDEV --size $OSTSIZE
-    add_ost ost2 --lov lov1 --dev ${OSTDEV}-2 --size $OSTSIZE
-    add_client client mds --lov lov1 --path $MOUNT
-}
-
 build_test_filter
 
-cleanup() {
-    # make sure we are using the primary MDS, so the config log will
-    # be able to clean up properly.
-    activemds=`facet_active mds`
-    if [ $activemds != "mds" ]; then
-        fail mds
-    fi
-    zconf_umount `hostname` $MOUNT
-    stop mds ${FORCE} $MDSLCONFARGS
-    stop ost2 ${FORCE}
-    stop ost ${FORCE} --dump $TMP/replay-single-`hostname`.log
-}
+SETUP=${SETUP:-"setup"}
+CLEANUP=${CLEANUP:-"mcstopall"}
 
 if [ "$ONLY" == "cleanup" ]; then
     sysctl -w lnet.debug=0 || true
-    FORCE=--force cleanup
-    exit
+    $CLEANUP
+    exit 0
 fi
 
-SETUP=${SETUP:-"setup"}
-CLEANUP=${CLEANUP:-"cleanup"}
-
 setup() {
-    gen_config
-
-    start ost --reformat $OSTLCONFARGS 
-    start ost2 --reformat $OSTLCONFARGS 
-    [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
-    start mds $MDSLCONFARGS --reformat
-    grep " $MOUNT " /proc/mounts || zconf_mount `hostname` $MOUNT
+    mcformat
+    mcsetup
 }
 
 $SETUP
@@ -101,20 +71,20 @@ test_1a() {
     do_facet ost "sysctl -w lustre.fail_loc=0"
 
     rm -fr $DIR/$tfile
-    local old_last_id=`cat /proc/fs/lustre/obdfilter/*/last_id`
+    local old_last_id=`cat $LPROC/obdfilter/*/last_id`
     touch -o $DIR/$tfile 1
     sync
-    local new_last_id=`cat /proc/fs/lustre/obdfilter/*/last_id`
+    local new_last_id=`cat $LPROC/obdfilter/*/last_id`
     
     test "$old_last_id" = "$new_last_id" || {
        echo "OST object create is caused by MDS"
        return 1
     }
     
-    old_last_id=`cat /proc/fs/lustre/obdfilter/*/last_id`
+    old_last_id=`cat $LPROC/obdfilter/*/last_id`
     echo "data" > $DIR/$tfile
     sync
-    new_last_id=`cat /proc/fs/lustre/obdfilter/*/last_id`
+    new_last_id=`cat $LPROC/obdfilter/*/last_id`
     test "$old_last_id" = "$new_last_id "&& {
        echo "CROW does not work on write"
        return 1
@@ -126,10 +96,10 @@ test_1a() {
     do_facet ost "sysctl -w lustre.fail_loc=0x80000801"
 
     rm -fr $DIR/1a1
-    old_last_id=`cat /proc/fs/lustre/obdfilter/*/last_id`
+    old_last_id=`cat $LPROC/obdfilter/*/last_id`
     echo "data" > $DIR/1a1
     sync
-    new_last_id=`cat /proc/fs/lustre/obdfilter/*/last_id`
+    new_last_id=`cat $LPROC/obdfilter/*/last_id`
     test "$old_last_id" = "$new_last_id" || {
        echo "CROW does work with fail_loc=0x80000801"
        return 1
@@ -760,7 +730,7 @@ test_36() {
     touch $DIR/$tfile
     checkstat $DIR/$tfile
     facet_failover mds
-    cancel_lru_locks MDC
+    cancel_lru_locks mdc
     if dmesg | grep "unknown lock cookie"; then 
        echo "cancel after replay failed"
        return 1
@@ -812,8 +782,7 @@ test_39() { # bug 4176
 run_test 39 "test recovery from unlink llog (test llog_gen_rec) "
 
 count_ost_writes() {
-        cat /proc/fs/lustre/osc/*/stats |
-            awk -vwrites=0 '/ost_write/ { writes += $2 } END { print writes; }'
+    awk -vwrites=0 '/ost_write/ { writes += $2 } END { print writes; }' $LPROC/osc/*/stats
 }
 
 #b=2477,2532
@@ -864,13 +833,13 @@ test_41() {
     # make sure the start of the file is ost1
     lfs setstripe $f $((128 * 1024)) 0 0 
     do_facet client dd if=/dev/zero of=$f bs=4k count=1 || return 3
-    cancel_lru_locks OSC
+    cancel_lru_locks osc
     # fail ost2 and read from ost1
-    local osc2_dev=`$LCTL device_list | \
-               awk '(/ost2.*client_facet/){print $4}' `
-    $LCTL --device %$osc2_dev deactivate
+    local osc2dev=`grep ${ost2_svc}-osc- $LPROC/devices | awk '{print $1}'`
+    [ "$osc2dev" ] || return 4
+    $LCTL --device $osc2dev deactivate || return 1
     do_facet client dd if=$f of=/dev/null bs=4k count=1 || return 3
-    $LCTL --device %$osc2_dev activate
+    $LCTL --device $osc2dev activate || return 2
     return 0
 }
 run_test 41 "read from a valid osc while other oscs are invalid"
@@ -911,8 +880,10 @@ test_43() { # bug 2530
 run_test 43 "mds osc import failure during recovery; don't LBUG"
 
 test_44() {
-    mdcdev=`awk '/mds_svc_MNT/ {print $1}' < /proc/fs/lustre/devices`
+    mdcdev=`awk '/-mdc-/ {print $1}' $LPROC/devices`
+    [ "$mdcdev" ] || exit 2
     for i in `seq 1 10`; do
+       echo iteration $i
         #define OBD_FAIL_TGT_CONN_RACE     0x701
         do_facet mds "sysctl -w lustre.fail_loc=0x80000701"
         $LCTL --device $mdcdev recover
@@ -924,8 +895,10 @@ test_44() {
 run_test 44 "race in target handle connect"
 
 test_44b() {
-    mdcdev=`awk '/mds_svc_MNT/ {print $1}' < /proc/fs/lustre/devices`
+    mdcdev=`awk '/-mdc-/ {print $1}' $LPROC/devices`
+    [ "$mdcdev" ] || exit 2
     for i in `seq 1 10`; do
+       echo iteration $i
         #define OBD_FAIL_TGT_DELAY_RECONNECT 0x704
         do_facet mds "sysctl -w lustre.fail_loc=0x80000704"
         $LCTL --device $mdcdev recover
@@ -938,7 +911,8 @@ run_test 44b "race in target handle connect"
 
 # Handle failed close
 test_45() {
-    mdcdev=`awk '/mds_svc_MNT/ {print $1}' < /proc/fs/lustre/devices`
+    mdcdev=`awk '/-mdc-/ {print $1}' $LPROC/devices`
+    [ "$mdcdev" ] || exit 2
     $LCTL --device $mdcdev recover
 
     multiop $DIR/$tfile O_c &
@@ -947,13 +921,13 @@ test_45() {
 
     # This will cause the CLOSE to fail before even 
     # allocating a reply buffer
-    $LCTL --device $mdcdev deactivate
+    $LCTL --device $mdcdev deactivate || return 4
 
     # try the close
     kill -USR1 $pid
     wait $pid || return 1
 
-    $LCTL --device $mdcdev activate
+    $LCTL --device $mdcdev activate || return 5
     sleep 1
 
     $CHECKSTAT -t file $DIR/$tfile || return 2
@@ -1012,9 +986,9 @@ test_48() {
 run_test 48 "MDS->OSC failure during precreate cleanup (2824)"
 
 test_50() {
-    local osc_dev=`$LCTL device_list | \
-               awk '(/ost_svc_mds_svc/){print $4}' `
-    $LCTL --device %$osc_dev recover &&  $LCTL --device %$osc_dev recover
+    local oscdev=`grep ${ost_svc}-osc- $LPROC/devices | awk '{print $1}'`
+    [ "$oscdev" ] || return 1
+    $LCTL --device $oscdev recover &&  $LCTL --device $oscdev recover
     # give the mds_lov_sync threads a chance to run
     sleep 5
 }
@@ -1023,7 +997,7 @@ run_test 50 "Double OSC recovery, don't LASSERT (3812)"
 # b3764 timed out lock replay
 test_52() {
     touch $DIR/$tfile
-    cancel_lru_locks MDC
+    cancel_lru_locks mdc
 
     multiop $DIR/$tfile s
     replay_barrier mds
@@ -1092,4 +1066,4 @@ test_58() {
 run_test 58 "test recovery from llog for setattr op (test llog_gen_rec)"
 
 equals_msg test complete, cleaning up
-FORCE=--force $CLEANUP
+$CLEANUP
index 0969f23..7071490 100755 (executable)
@@ -4,7 +4,19 @@
 # Probably a good idea to run this before doing any checkins.
 # In the future this can become more fancy, but it's OK for now.
 
+LUSTRE=${LUSTRE:-`dirname $0`/..}
 SRCDIR="`dirname $0`"
+export PATH=/sbin:/usr/sbin:$SRCDIR:$SRCDIR/../utils:$PATH
+
+. $LUSTRE/tests/test-framework.sh
+init_test_env $@
+. ${CONFIG:=$LUSTRE/tests/cfg/local.sh}
+. mountconf.sh
+
+SETUP=${SETUP:-mcsetup}
+FORMAT=${FORMAT:-mcformat}
+CLEANUP=${CLEANUP:-mcstopall}
+
 fail() { 
        echo "ERROR: $1" 1>&2
        [ $2 ] && RC=$2 || RC=1
@@ -16,14 +28,11 @@ log() {
        lctl mark "$*"
 }
 
-export PATH=/sbin:/usr/sbin:$SRCDIR:$SRCDIR/../utils:$PATH
 
 ERROR=
 SRC=/etc
 [ "$COUNT" ] || COUNT=1000
 
-[ "$LCONF" ] || LCONF=lconf
-
 [ "$MCREATE" ] || MCREATE=mcreate
 
 [ "$MKDIRMANY" ] || MKDIRMANY="createmany -d"
@@ -36,10 +45,11 @@ while [ "$1" ]; do
        shift
 done
 
-EXISTING_MOUNT="`mount | awk '/ lustre(_lite)? / { print $3 }' | tail -n 1`"
+EXISTING_MOUNT=`awk '($3 ~ "lustre" && $1 ~ ":") { print $2 }' /proc/mounts`
 if [ -z "$EXISTING_MOUNT" ]; then
-       sh llmount.sh $OPTS
-       EXISTING_MOUNT="`mount | awk '/ lustre(_lite)? / { print $3 }' | tail -n 1`"
+        $FORMAT
+        $SETUP
+       EXISTING_MOUNT=`awk '($3 ~ "lustre" && $1 ~ ":") { print $2 }' /proc/mounts`
        [ -z "$EXISTING_MOUNT" ] && fail "no lustre filesystem mounted" 1
        I_MOUNTED="yes"
 fi
@@ -93,8 +103,8 @@ done
 [ "$ERROR" ] && fail "old and new files are different" $ERROR
 log "finished at `date` ($(($(date +%s) - START)))"
 
-sh llmountcleanup.sh || exit 19
-sh llrmount.sh $OPTS || exit 20
+$CLEANUP || exit 19
+$SETUP || exit 20
 
 log "comparing previously copied files"
 for f in $FILES; do
@@ -104,8 +114,8 @@ done
 
 [ "$ERROR" ] && fail "old and new files are different on second diff" $ERROR
 
-sh llmountcleanup.sh || exit 19
-sh llrmount.sh $OPTS || exit 20
+$CLEANUP || exit 19
+$SETUP || exit 20
 
 log "removing $DST"
 rm -r $V $DST || fail "can't remove $DST" 37
@@ -134,5 +144,5 @@ fi
 
 if [ "$I_MOUNTED" = "yes" ]; then
        sync && sleep 2 && sync     # wait for delete thread
-       sh llmountcleanup.sh || exit 29
+       $CLEANUP
 fi
index 8c1e164..b30fc42 100644 (file)
@@ -128,7 +128,7 @@ pass() {
 }
 
 mounted_lustre_filesystems() {
-       awk '($3 ~ "lustre") { print $2 }' /proc/mounts
+       awk '($3 ~ "lustre" && $1 ~ ":") { print $2 }' /proc/mounts
 }
 MOUNT="`mounted_lustre_filesystems`"
 if [ -z "$MOUNT" ]; then
@@ -589,8 +589,7 @@ test_7()
        echo 0 > /proc/sys/lustre/fail_loc
 
        echo "  Trigger recovery..."
-       OSC0_UUID="`$LCTL dl | awk '/.* OSC_[^ ]+_OST.* / { print $1 }'`"
-       [ -z "$OSC0_UUID" ] && OSC0_UUID="`$LCTL dl | awk '/.* OSC_[^ ]+_ost1.* / { print $1 }'`"
+       OSC0_UUID="`$LCTL dl | awk '/.* *-osc-* / { print $1 }'`"
        for i in $OSC0_UUID; do
                $LCTL --device $i activate > /dev/null 2>&1 || error "activate osc failed!"
        done
index 72ecbc5..b68cb58 100644 (file)
@@ -12,6 +12,12 @@ ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"42a 42b  42c  42d  45   68"}
 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
 
 [ "$SLOW" = "no" ] && EXCEPT="$EXCEPT 24o 27m 51b 51c 63 64b 71 77 101"
+# Tests that fail on uml
+[ "$UML" = "true" ] && EXCEPT="$EXCEPT 31d"
+
+# Tests that always fail with mountconf -- FIXME
+# 48a moving the working dir succeeds
+EXCEPT="$EXCEPT 48a"
 
 case `uname -r` in
 2.4*) FSTYPE=${FSTYPE:-ext3};    ALWAYS_EXCEPT="$ALWAYS_EXCEPT 76" ;;
@@ -62,22 +68,31 @@ else
     fi
 fi
 
+SANITYLOG=${SANITYLOG:-/tmp/sanity.log}
+
 export NAME=${NAME:-local}
 
 SAVE_PWD=$PWD
 
-clean() {
+# for MCSETUP and MCCLEANUP
+LUSTRE=${LUSTRE:-`dirname $0`/..}
+. $LUSTRE/tests/test-framework.sh
+init_test_env $@
+. ${CONFIG:=$LUSTRE/tests/cfg/local.sh}
+. mountconf.sh
+
+cleanup() {
        echo -n "cln.."
-       sh llmountcleanup.sh ${FORCE} > /dev/null || { echo "FAILed to clean up"; exit 20; }
+       $MCCLEANUP ${FORCE} $* || { echo "FAILed to clean up"; exit 20; }
 }
-CLEAN=${CLEAN:-:}
+CLEANUP=${CLEANUP:-:}
 
-start() {
+setup() {
        echo -n "mnt.."
-       sh llrmount.sh > /dev/null || exit 10
+       $MCSETUP || exit 10
        echo "done"
 }
-START=${START:-:}
+SETUP=${SETUP:-:}
 
 log() {
        echo "$*"
@@ -93,7 +108,6 @@ trace() {
 }
 TRACE=${TRACE:-""}
 
-LPROC=/proc/fs/lustre
 check_kernel_version() {
        VERSION_FILE=$LPROC/kernel_version
        WANT_VER=$1
@@ -113,8 +127,8 @@ basetest() {
 }
 
 run_one() {
-       if ! mount | grep -q $DIR; then
-               $START
+       if ! grep -q $DIR /proc/mounts; then
+               $SETUP
        fi
        testnum=$1
        message=$2
@@ -127,7 +141,7 @@ run_one() {
        unset TESTNAME
        pass "($((`date +%s` - $BEFORE))s)"
        cd $SAVE_PWD
-       $CLEAN
+       $CLEANUP
 }
 
 build_test_filter() {
@@ -198,13 +212,15 @@ pass() {
 }
 
 mounted_lustre_filesystems() {
-       awk '($3 ~ "lustre") { print $2 }' /proc/mounts
+       awk '($3 ~ "lustre" && $1 ~ ":") { print $2 }' /proc/mounts
 }
-MOUNT="`mounted_lustre_filesystems`"
-if [ -z "$MOUNT" ]; then
-       sh llmount.sh
-       MOUNT="`mounted_lustre_filesystems`"
-       [ -z "$MOUNT" ] && error "NAME=$NAME not mounted"
+
+MOUNTED="`mounted_lustre_filesystems`"
+if [ -z "$MOUNTED" ]; then
+        $MCFORMAT
+       $MCSETUP
+       MOUNTED="`mounted_lustre_filesystems`"
+       [ -z "$MOUNTED" ] && error "NAME=$NAME not mounted"
        I_MOUNTED=yes
 fi
 
@@ -731,7 +747,7 @@ test_24n() {
     $CHECKSTAT ${f}.rename
     $CHECKSTAT -a ${f}
 }
-run_test 24n "Statting the old file after renameing (Posix rename 2)"
+run_test 24n "Statting the old file after renaming (Posix rename 2)"
 
 test_24o() {
        check_kernel_version 37 || return 0
@@ -985,11 +1001,11 @@ reset_enospc() {
 
 exhaust_precreations() {
        OSTIDX=$1
-       OST=$(head -n $((OSTIDX + 1)) $LPROC/lov/${LOVNAME}/target_obd |\
-               tail -n 1 | awk '{print $2}' | sed -e 's/_UUID$//')
-
-       last_id=$(cat $LPROC/osc/OSC_*_${OST}_${MDS}/prealloc_last_id)
-       next_id=$(cat $LPROC/osc/OSC_*_${OST}_${MDS}/prealloc_next_id)
+       OST=$(grep ${OSTIDX}": " $LPROC/lov/${LOVNAME}/target_obd | \
+           awk '{print $2}' | sed -e 's/_UUID$//')
+       # on the mdt's osc
+       last_id=$(cat $LPROC/osc/${OST}-osc/prealloc_last_id)
+       next_id=$(cat $LPROC/osc/${OST}-osc/prealloc_next_id)
 
        mkdir -p $DIR/d27/${OST}
        $LSTRIPE $DIR/d27/${OST} 0 $OSTIDX 1
@@ -997,7 +1013,7 @@ exhaust_precreations() {
        sysctl -w lustre.fail_loc=0x215
        echo "Creating to objid $last_id on ost $OST..."
        createmany -o $DIR/d27/${OST}/f $next_id $((last_id - next_id + 2))
-       grep '[0-9]' $LPROC/osc/OSC_*_${OST}_${MDS}/prealloc*
+       grep '[0-9]' $LPROC/osc/${OST}-osc/prealloc*
        reset_enospc $2
 }
 
@@ -1093,19 +1109,19 @@ test_28() {
 run_test 28 "create/mknod/mkdir with bad file types ============"
 
 cancel_lru_locks() {
-       for d in $LPROC/ldlm/namespaces/$1*; do
+       for d in $LPROC/ldlm/namespaces/*-$1-*; do
                echo clear > $d/lru_size
        done
-       grep "[0-9]" $LPROC/ldlm/namespaces/$1*/lock_unused_count /dev/null
+       grep "[0-9]" $LPROC/ldlm/namespaces/*-$1-*/lock_unused_count /dev/null
 }
 
 test_29() {
-       cancel_lru_locks MDC
+       cancel_lru_locks mdc
        mkdir $DIR/d29
        touch $DIR/d29/foo
        log 'first d29'
        ls -l $DIR/d29
-       MDCDIR=${MDCDIR:-$LPROC/ldlm/namespaces/MDC_*}
+       MDCDIR=${MDCDIR:-$LPROC/ldlm/namespaces/*-mdc-*}
        LOCKCOUNTORIG=`cat $MDCDIR/lock_count`
        LOCKUNUSEDCOUNTORIG=`cat $MDCDIR/lock_unused_count`
        log 'second d29'
@@ -1637,11 +1653,11 @@ setup_test42() {
 # file truncation, and file removal.
 test_42a() {
        setup_test42
-       cancel_lru_locks OSC
+       cancel_lru_locks osc
        stop_writeback
        sync; sleep 1; sync # just to be safe
        BEFOREWRITES=`count_ost_writes`
-       grep "[0-9]" $LPROC/osc/OSC*MNT*/cur_grant_bytes
+       grep "[0-9]" $LPROC/osc/*-osc-*/cur_grant_bytes
        dd if=/dev/zero of=$DIR/f42a bs=1024 count=100
        AFTERWRITES=`count_ost_writes`
        [ $BEFOREWRITES -eq $AFTERWRITES ] || \
@@ -1652,7 +1668,7 @@ run_test 42a "ensure that we don't flush on close =============="
 
 test_42b() {
        setup_test42
-       cancel_lru_locks OSC
+       cancel_lru_locks osc
        stop_writeback
         sync
         dd if=/dev/zero of=$DIR/f42b bs=1024 count=100
@@ -1691,7 +1707,7 @@ trunc_test() {
         test=$1
         file=$DIR/$test
         offset=$2
-       cancel_lru_locks OSC
+       cancel_lru_locks osc
        stop_writeback
        # prime the file with 0,EOF PW to match
        touch $file
@@ -1701,7 +1717,7 @@ trunc_test() {
         dd if=/dev/zero of=$file bs=1024 count=100
         BEFOREWRITES=`count_ost_writes`
         $TRUNCATE $file $offset
-        cancel_lru_locks OSC
+        cancel_lru_locks osc
         AFTERWRITES=`count_ost_writes`
        start_writeback
 }
@@ -1835,7 +1851,7 @@ test_45() {
        [ $before -gt $after ] || error "writeback didn't lower dirty count"
        do_dirty_record "echo blah > $f"
        [ $before -eq $after ] && error "write wasn't cached"
-       do_dirty_record "cancel_lru_locks OSC"
+       do_dirty_record "cancel_lru_locks osc"
        [ $before -gt $after ] || error "lock cancellation didn't lower dirty count"
        start_writeback
 }
@@ -2064,8 +2080,8 @@ test_52b() {
 run_test 52b "immutable flag test (should return errors) ======="
 
 test_53() {
-        for i in `ls -d $LPROC/osc/OSC*mds1 2> /dev/null` ; do
-                ostname=`echo $i | cut -d _ -f 3-4 | sed -e s/_mds1//`
+        for i in `ls -d $LPROC/osc/*-osc 2> /dev/null` ; do
+                ostname=`basename $i | cut -d - -f 1-2`
                 ost_last=`cat $LPROC/obdfilter/$ostname/last_id`
                 mds_last=`cat $i/prealloc_last_id`
                 echo "$ostname.last_id=$ost_last ; MDS.last_id=$mds_last"
@@ -2304,7 +2320,7 @@ run_test 60b "limit repeated messages from CERROR/CWARN ========"
 test_61() {
        f="$DIR/f61"
        dd if=/dev/zero of=$f bs=`page_size` count=1
-       cancel_lru_locks OSC
+       cancel_lru_locks osc
        multiop $f OSMWUc || error
        sync
 }
@@ -2314,7 +2330,7 @@ run_test 61 "mmap() writes don't make sync hang ================"
 test_62() {
         f="$DIR/f62"
         echo foo > $f
-        cancel_lru_locks OSC
+        cancel_lru_locks osc
         sysctl -w lustre.fail_loc=0x405
         cat $f && error "cat succeeded, expect -EIO"
         sysctl -w lustre.fail_loc=0
@@ -2367,7 +2383,7 @@ run_test 63b "async write errors should be returned to fsync ==="
 
 test_64a () {
        df $DIR
-       grep "[0-9]" $LPROC/osc/OSC*MNT*/cur*
+       grep "[0-9]" $LPROC/osc/*-osc-*/cur*
 }
 run_test 64a "verify filter grant calculations (in kernel) ====="
 
@@ -2454,9 +2470,9 @@ run_test 65i "set default striping on root directory (bug 6367)="
 
 test_65j() { # bug6367
        # if we aren't already remounting for each test, do so for this test
-       if [ "$CLEAN" = ":" ]; then
-               clean || error "failed to unmount"
-               start || error "failed to remount"
+       if [ "$CLEANUP" = ":" ]; then
+               cleanup -f || error "failed to unmount"
+               setup || error "failed to remount"
        fi
        $LSTRIPE -d $MOUNT || true
 }
@@ -2553,7 +2569,7 @@ test_69() {
        sysctl -w lustre.fail_loc=0
        $DIRECTIO write $f 0 2 || error "write error"
 
-       cancel_lru_locks OSC
+       cancel_lru_locks osc
        $DIRECTIO read $f 0 1 || error "read error"
 
        sysctl -w lustre.fail_loc=0x217
@@ -2603,7 +2619,7 @@ test_72() { # bug 5695 - Test that on 2.6 remove_suid works properly
        # See if we are still setuid/sgid
        test -u $DIR/f72 -o -g $DIR/f72 && error "S/gid is not dropped on write"
        # Now test that MDS is updated too
-       cancel_lru_locks MDC
+       cancel_lru_locks mdc
        test -u $DIR/f72 -o -g $DIR/f72 && error "S/gid is not dropped on MDS"
        true
 }
@@ -2819,7 +2835,7 @@ test_101() {
        local nreads=10000
        local cache_limit=32
 
-       for s in $LPROC/osc/OSC_*/rpc_stats; do
+       for s in $LPROC/osc/*-osc*/rpc_stats; do
                echo 0 > $s
        done
        trap cleanup_101 EXIT
@@ -2841,7 +2857,7 @@ test_101() {
        cleanup_101
 
        if [ $(($discard * 10)) -gt $nreads ] ;then
-               cat $LPROC/osc/OSC_*/rpc_stats
+               cat $LPROC/osc/*-osc*/rpc_stats
                cat $LPROC/llite/*/read_ahead_stats
                error "too many ($discard) discarded pages" 
        fi
@@ -2856,7 +2872,7 @@ test_102() {
         touch $testfile
 
        [ "$UID" != 0 ] && echo "skipping $TESTNAME (must run as root)" && return
-       [ -z "`grep xattr $LPROC/mdc/MDC*MNT*/connect_flags`" ] && echo "skipping $TESTNAME (must have user_xattr)" && return
+       [ -z "`grep xattr $LPROC/mdc/*-mdc-*/connect_flags`" ] && echo "skipping $TESTNAME (must have user_xattr)" && return
        echo "set/get xattr..."
         setfattr -n trusted.name1 -v value1 $testfile || error
         [ "`getfattr -n trusted.name1 $testfile 2> /dev/null | \
@@ -2907,8 +2923,8 @@ test_103 () {
 
     [ "$UID" != 0 ] && echo "skipping $TESTNAME (must run as root)" && return
     [ -z "`mount | grep " $DIR .*\<acl\>"`" ] && echo "skipping $TESTNAME (must have acl)" && return
-    [ -z "`grep acl $LPROC/mdc/MDC*MNT*/connect_flags`" ] && echo "skipping $TESTNAME (must have acl)" && return
-    which setfacl 2>/dev/null || (echo "skipping $TESTNAME (could not find setfacl)" && return)
+    [ -z "`grep acl $LPROC/mdc/*-mdc-*/connect_flags`" ] && echo "skipping $TESTNAME (must have acl)" && return
+    $(which setfacl 2>/dev/null) || echo "skipping $TESTNAME (could not find setfacl)" && return
 
     echo "performing cp ..."
     run_acl_subtest cp || error
@@ -2943,7 +2959,7 @@ test_104() {
        lfs df $DIR/$tfile || error "lfs df $DIR/$tfile failed"
        lfs df -ih $DIR/$tfile || error "lfs df -ih $DIR/$tfile failed"
        
-       OSC=`lctl dl | awk '/OSC.*MNT/ {print $4}' | head -n 1`
+       OSC=`awk '/-osc-/ {print $4}' $LPROC/devices | head -n 1`
        lctl --device %$OSC deactivate
        lfs df || error "lfs df with deactivated OSC failed"
        lctl --device %$OSC recover
@@ -2957,11 +2973,12 @@ HOME=$OLDHOME
 
 log "cleanup: ======================================================"
 if [ "`mount | grep ^$NAME`" ]; then
-       rm -rf $DIR/[Rdfs][1-9]*
-       if [ "$I_MOUNTED" = "yes" ]; then
-               sh llmountcleanup.sh || error "llmountcleanup failed"
-       fi
+    rm -rf $DIR/[Rdfs][1-9]*
 fi
+if [ "$I_MOUNTED" = "yes" ]; then
+    $MCCLEANUP -f || error "cleanup failed"
+fi
+
 
 echo '=========================== finished ==============================='
 [ -f "$SANITYLOG" ] && cat $SANITYLOG && exit 1 || true
index 57cfaa8..a32f2b6 100644 (file)
@@ -7,6 +7,9 @@ ONLY=${ONLY:-"$*"}
 ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"14b  14c"}
 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
 
+# Tests that fail on uml
+[ "$UML" = "true" ] && EXCEPT="$EXCEPT 7"
+
 SRCDIR=`dirname $0`
 PATH=$PWD/$SRCDIR:$SRCDIR:$SRCDIR/../utils:$PATH
 
@@ -33,18 +36,26 @@ fi
 
 SAVE_PWD=$PWD
 
-clean() {
+# for MCSETUP and MCCLEANUP
+LUSTRE=${LUSTRE:-`dirname $0`/..}
+. $LUSTRE/tests/test-framework.sh
+init_test_env $@
+. ${CONFIG:=$LUSTRE/tests/cfg/local.sh}
+. mountconf.sh
+
+cleanup() {
        echo -n "cln.."
-       sh llmountcleanup.sh ${FORCE} > /dev/null || exit 20
+       grep " $MOUNT2 " /proc/mounts && zconf_umount `hostname` $MOUNT2 ${FORCE}
+       $MCCLEANUP ${FORCE} > /dev/null || { echo "FAILed to clean up"; exit 20; }
 }
-CLEAN=${CLEAN:-}
+CLEANUP=${CLEANUP:-:}
 
-start() {
+setup() {
        echo -n "mnt.."
-       sh llrmount.sh > /dev/null || exit 10
+       $MCSETUP || exit 10
        echo "done"
 }
-START=${START:-}
+SETUP=${SETUP:-:}
 
 log() {
        echo "$*"
@@ -61,8 +72,8 @@ trace() {
 TRACE=${TRACE:-""}
 
 run_one() {
-       if ! mount | grep -q $DIR1; then
-               $START
+       if ! grep -q $DIR /proc/mounts; then
+               $SETUP
        fi
        testnum=$1
        message=$2
@@ -75,7 +86,27 @@ run_one() {
        unset TESTNAME
        pass "($((`date +%s` - $BEFORE))s)"
        cd $SAVE_PWD
-       $CLEAN
+       $CLEANUP
+}
+
+build_test_filter() {
+       [ "$ALWAYS_EXCEPT$EXCEPT$SANITYN_EXCEPT" ] && \
+           echo "Skipping tests: `echo $ALWAYS_EXCEPT $EXCEPT $SANITYN_EXCEPT`"
+
+        for O in $ONLY; do
+            eval ONLY_${O}=true
+        done
+        for E in $EXCEPT $ALWAYS_EXCEPT $SANITY_EXCEPT; do
+            eval EXCEPT_${E}=true
+        done
+}
+
+_basetest() {
+    echo $*
+}
+
+basetest() {
+    IFS=abcdefghijklmnopqrstuvwxyz _basetest $1
 }
 
 build_test_filter() {
@@ -143,11 +174,23 @@ pass() {
        echo PASS $@
 }
 
-export MOUNT1=`mount| awk '/ lustre/ { print $3 }'| head -n 1`
-export MOUNT2=`mount| awk '/ lustre/ { print $3 }'| tail -n 1`
+mounted_lustre_filesystems() {
+       awk '($3 ~ "lustre" && $1 ~ ":") { print $2 }' /proc/mounts
+}
+MOUNTED="`mounted_lustre_filesystems`"
+if [ -z "$MOUNTED" ]; then
+    $MCFORMAT
+    $MCSETUP
+    mount_client $MOUNT2
+    MOUNTED="`mounted_lustre_filesystems`"
+    [ -z "$MOUNTED" ] && error "NAME=$NAME not mounted"
+    I_MOUNTED=yes
+fi
+export MOUNT1=`mounted_lustre_filesystems | head -n 1`
 [ -z "$MOUNT1" ] && error "NAME=$NAME not mounted once"
+export MOUNT2=`mounted_lustre_filesystems | tail -n 1`
 [ "$MOUNT1" = "$MOUNT2" ] && error "NAME=$NAME not mounted twice"
-[ `mount| awk '/ lustre/ { print $3 }'| wc -l` -ne 2 ] && \
+[ `mounted_lustre_filesystems | wc -l` -ne 2 ] && \
        error "NAME=$NAME mounted more than twice"
 
 export DIR1=${DIR1:-$MOUNT1}
@@ -387,17 +430,17 @@ test_16() {
 run_test 16 "2500 iterations of dual-mount fsx ================="
 
 cancel_lru_locks() {
-       for d in /proc/fs/lustre/ldlm/namespaces/$1*; do
+       for d in /proc/fs/lustre/ldlm/namespaces/*-$1-*; do
                echo clear > $d/lru_size
        done
-       grep "[0-9]" /proc/fs/lustre/ldlm/namespaces/$1*/lock_unused_count /dev/null
+       grep "[0-9]" /proc/fs/lustre/ldlm/namespaces/*-$1-*/lock_unused_count /dev/null
 }
 
 test_17() { # bug 3513, 3667
        [ ! -d /proc/fs/lustre/ost ] && echo "skipping OST-only test" && return
 
        cp /etc/termcap $DIR1/f17
-       cancel_lru_locks OSC > /dev/null
+       cancel_lru_locks osc > /dev/null
        #define OBD_FAIL_ONCE|OBD_FAIL_LDLM_CREATE_RESOURCE    0x30a
        echo 0x8000030a > /proc/sys/lustre/fail_loc
        ls -ls $DIR1/f17 | awk '{ print $1,$6 }' > $DIR1/f17-1 & \
@@ -417,7 +460,7 @@ test_19() { # bug3811
        [ -d /proc/fs/lustre/obdfilter ] || return 0
 
        MAX=`cat /proc/fs/lustre/obdfilter/*/readcache_max_filesize | head -n 1`
-       for O in /proc/fs/lustre/obdfilter/OST*; do
+       for O in /proc/fs/lustre/obdfilter/*OST*; do
                echo 4096 > $O/readcache_max_filesize
        done
        dd if=/dev/urandom of=$TMP/f19b bs=512k count=32
@@ -425,7 +468,7 @@ test_19() { # bug3811
        cp $TMP/f19b $DIR1/f19b
        for i in `seq 1 20`; do
                [ $((i % 5)) -eq 0 ] && log "test_18 loop $i"
-               cancel_lru_locks OSC > /dev/null
+               cancel_lru_locks osc > /dev/null
                cksum $DIR1/f19b | cut -d" " -f 1,2 > $TMP/sum1 & \
                cksum $DIR2/f19b | cut -d" " -f 1,2 > $TMP/sum2
                wait
@@ -434,7 +477,7 @@ test_19() { # bug3811
                [ "`cat $TMP/sum2`" = "$SUM" ] || \
                        error "$DIR2/f19b `cat $TMP/sum2` != $SUM"
        done
-       for O in /proc/fs/lustre/obdfilter/OST*; do
+       for O in /proc/fs/lustre/obdfilter/*OST*; do
                echo $MAX > $O/readcache_max_filesize
        done
        rm $DIR1/f19b
@@ -443,12 +486,12 @@ test_19() { # bug3811
 
 test_20() {
        mkdir $DIR1/d20
-       cancel_lru_locks OSC
+       cancel_lru_locks osc
        CNT=$((`cat /proc/fs/lustre/llite/fs0/dump_page_cache | wc -l`))
        multiop $DIR1/f20 Ow8190c
        multiop $DIR2/f20 Oz8194w8190c
        multiop $DIR1/f20 Oz0r8190c
-       cancel_lru_locks OSC
+       cancel_lru_locks osc
        CNTD=$((`cat /proc/fs/lustre/llite/fs0/dump_page_cache | wc -l` - $CNT))
        [ $CNTD -gt 0 ] && \
            error $CNTD" page left in cache after lock cancel" || true
@@ -497,7 +540,7 @@ test_23() { # Bug 5972
        echo "others should see updated atime while another read" > $DIR1/f23
        
        # clear the lock(mode: LCK_PW) gotten from creating operation
-       cancel_lru_locks OSC
+       cancel_lru_locks osc
        
        time1=`date +%s`        
        sleep 2
@@ -560,6 +603,10 @@ run_test 25 "change ACL on one mountpoint be seen on another ==="
 
 log "cleanup: ======================================================"
 rm -rf $DIR1/[df][0-9]* $DIR1/lnk || true
+if [ "$I_MOUNTED" = "yes" ]; then
+    cleanup
+fi
 
 echo '=========================== finished ==============================='
 [ -f "$SANITYLOG" ] && cat $SANITYLOG && exit 1 || true
+
index 3b10909..8ca56d9 100644 (file)
@@ -2,6 +2,7 @@
 # vim:expandtab:shiftwidth=4:softtabstop=4:tabstop=4:
 
 set -e
+#set -vx
 
 export REFORMAT=""
 export VERBOSE=false
@@ -36,12 +37,11 @@ init_test_env() {
     export TMP=${TMP:-$ROOT/tmp}
 
     export PATH=:$PATH:$LUSTRE/utils:$LUSTRE/tests
-    export LLMOUNT=${LLMOUNT:-"llmount"}
-    export LCONF=${LCONF:-"lconf"}
-    export LMC=${LMC:-"lmc"}
     export LCTL=${LCTL:-"$LUSTRE/utils/lctl"}
+    export MKFS=${MKFS:-"$LUSTRE/utils/mkfs.lustre"}
     export CHECKSTAT="${CHECKSTAT:-checkstat} "
     export FSYTPE=${FSTYPE:-"ext3"}
+    export LPROC=/proc/fs/lustre
 
     if [ "$ACCEPTOR_PORT" ]; then
         export PORT_OPT="--port $ACCEPTOR_PORT"
@@ -70,55 +70,87 @@ init_test_env() {
 #    echo "CONFIG=`canonical_path $CONFIG`"  > $LUSTRE/tests/CONFIG
 }
 
+unload_modules() {
+    lsmod | grep lnet > /dev/null && $LCTL dk $TMP/debug
+    local MODULES=`$LCTL modules | awk '{ print $2 }'`
+    rmmod $MODULES >/dev/null 2>&1 
+     # do it again, in case we tried to unload ksocklnd too early
+    lsmod | grep lnet > /dev/null && rmmod $MODULES >/dev/null 2>&1 
+    lsmod | grep lnet && echo "modules still loaded" && return 1
+
+    LEAK_LUSTRE=`dmesg | tail -n 30 | grep "obd mem.*leaked"`
+    LEAK_PORTALS=`dmesg | tail -n 20 | grep "Portals memory leaked"`
+    if [ "$LEAK_LUSTRE" -o "$LEAK_PORTALS" ]; then
+       echo "$LEAK_LUSTRE" 1>&2
+       echo "$LEAK_PORTALS" 1>&2
+       mv $TMP/debug $TMP/debug-leak.`date +%s`
+       echo "Memory leaks detected"
+       return 254
+    fi
+}
+
 # Facet functions
+# start facet device options 
 start() {
     facet=$1
     shift
-    active=`facet_active $facet`
-    do_facet $facet $LCONF --select ${facet}_svc=${active}_facet \
-        --node ${active}_facet  --ptldebug $PTLDEBUG --subsystem $SUBSYSTEM \
-        $@ $XMLCONFIG
+    device=$1
+    shift
+    echo "Starting ${facet}: $@ ${device} /mnt/${facet}"
+    do_facet ${facet} mkdir -p /mnt/${facet}
+    do_facet ${facet} mount -t lustre $@ ${device} /mnt/${facet} 
+    #do_facet $facet $LCONF --select ${facet}_svc=${active}_facet \
+    #    --node ${active}_facet  --ptldebug $PTLDEBUG --subsystem $SUBSYSTEM \
+    #    $@ $XMLCONFIG
     RC=${PIPESTATUS[0]}
     if [ $RC -ne 0 ]; then
-        # maybe acceptor error, dump tcp port usage
-        netstat -tpn
+       echo mount -t lustre $@ ${device} /mnt/${facet} 
+        echo Start of ${device} on ${facet} failed ${RC}
+    else 
+       do_facet ${facet} sync
+       # need the awk in case running with -v 
+       label=`do_facet ${facet} "e2label ${device}" | awk '{print $(NF)}'`
+       eval export ${facet}_svc=${label}
+       eval export ${facet}_dev=${device}
+       eval export ${facet}_opt=\"$@\"
+       echo Started ${label}
     fi
     return $RC
 }
 
 stop() {
     facet=$1
-    active=`facet_active $facet`
     shift
-    do_facet $facet $LCONF --select ${facet}_svc=${active}_facet \
-        --node ${active}_facet  --ptldebug $PTLDEBUG --subsystem $SUBSYSTEM \
-        $@ --cleanup $XMLCONFIG
+    # the following line fails with VERBOSE set 
+    local running=`do_facet ${facet} "grep -c /mnt/${facet}' ' /proc/mounts" | awk '{print $(NF)}'`
+    if [ $running -ne 0 ]; then
+       echo "Stopping /mnt/${facet} (opts:$@)"
+       do_facet ${facet} umount -d $@ /mnt/${facet}
+    fi
+    #do_facet ${facet} umount -d $@ /mnt/${facet} >> /dev/null 2>&1 || :
+    [ -e /proc/fs/lustre ] && grep "ST " /proc/fs/lustre/devices && echo "service didn't stop" && exit 1
+    return 0
 }
 
 zconf_mount() {
     local OPTIONS
-    client=$1
-    mnt=$2
-
-    do_node $client mkdir $mnt 2> /dev/null || :
-
+    local client=$1
+    local mnt=$2
     # Only supply -o to mount if we have options
     if [ -n "$MOUNTOPT" ]; then
         OPTIONS="-o $MOUNTOPT"
     fi
-
-    if [ -x /sbin/mount.lustre ] ; then
-       do_node $client mount -t lustre $OPTIONS \
-               `facet_nid mds`:/mds_svc/client_facet $mnt || return 1
-        do_node $client "sysctl -w lnet.debug=$PTLDEBUG; sysctl -w lnet.subsystem_debug=${SUBSYSTEM# }"
-    else
-       # this is so cheating
-       do_node $client $LCONF --nosetup --node client_facet $XMLCONFIG > \
-               /dev/null || return 2
-       do_node $client $LLMOUNT $OPTIONS \
-               `facet_nid mds`:/mds_svc/client_facet $mnt || return 4
+    local device=`facet_nid mgs`:/$FSNAME
+    if [ -z "$mnt" -o -z "$FSNAME" ]; then
+       echo Bad zconf mount command: opt=$OPTIONS dev=$device mnt=$mnt
+       exit 1
     fi
 
+    echo "Starting client: $OPTIONS $device $mnt" 
+    do_node $client mkdir -p $mnt
+    do_node $client mount -t lustre $OPTIONS $device $mnt || return 1
+
+    do_node $client "sysctl -w lnet.debug=$PTLDEBUG; sysctl -w lnet.subsystem_debug=${SUBSYSTEM# }"
     [ -d /r ] && $LCTL modules > /r/tmp/ogdb-`hostname`
     return 0
 }
@@ -127,8 +159,11 @@ zconf_umount() {
     client=$1
     mnt=$2
     [ "$3" ] && force=-f
-    do_node $client umount $force  $mnt || :
-    do_node $client $LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null || :
+    local running=`do_node $client "grep -c $mnt' ' /proc/mounts" | awk '{print $(NF)}'`
+    if [ $running -ne 0 ]; then
+       echo "Stopping client $mnt (opts:$force)"
+       do_node $client umount $force $mnt
+    fi
 }
 
 shutdown_facet() {
@@ -137,7 +172,7 @@ shutdown_facet() {
        $POWER_DOWN `facet_active_host $facet`
        sleep 2 
     elif [ "$FAILURE_MODE" = SOFT ]; then
-       stop $facet --force --failover --nomod
+       stop $facet
     fi
 }
 
@@ -182,7 +217,7 @@ client_reconnect() {
 
 facet_failover() {
     facet=$1
-    echo "Failing $facet node `facet_active_host $facet`"
+    echo "Failing $facet on node `facet_active_host $facet`"
     shutdown_facet $facet
     reboot_facet $facet
     client_df &
@@ -192,52 +227,64 @@ facet_failover() {
     TO=`facet_active_host $facet`
     echo "Failover $facet to $TO"
     wait_for $facet
-    start $facet
+    local dev=${facet}_dev
+    local opt=${facet}_opt
+    start $facet ${!dev} ${!opt}
+}
+
+obd_name() {
+    local facet=$1
 }
 
 replay_barrier() {
     local facet=$1
     do_facet $facet sync
     df $MOUNT
-    do_facet $facet $LCTL --device %${facet}_svc readonly
-    do_facet $facet $LCTL --device %${facet}_svc notransno
-    do_facet $facet $LCTL mark "$facet REPLAY BARRIER"
-    $LCTL mark "local REPLAY BARRIER"
+    local svc=${facet}_svc
+    do_facet $facet $LCTL --device %${!svc} readonly
+    do_facet $facet $LCTL --device %${!svc} notransno
+    do_facet $facet $LCTL mark "$facet REPLAY BARRIER on ${!svc}"
+    $LCTL mark "local REPLAY BARRIER on ${!svc}"
 }
 
 replay_barrier_nodf() {
     local facet=$1
     do_facet $facet sync
-    do_facet $facet $LCTL --device %${facet}_svc readonly
-    do_facet $facet $LCTL --device %${facet}_svc notransno
-    do_facet $facet $LCTL mark "$facet REPLAY BARRIER"
-    $LCTL mark "local REPLAY BARRIER"
+    local svc=${facet}_svc
+    echo Replay barrier on ${!svc}
+    do_facet $facet $LCTL --device %${!svc} readonly
+    do_facet $facet $LCTL --device %${!svc} notransno
+    do_facet $facet $LCTL mark "$facet REPLAY BARRIER on ${!svc}"
+    $LCTL mark "local REPLAY BARRIER on ${!svc}"
 }
 
 mds_evict_client() {
     UUID=`cat /proc/fs/lustre/mdc/*_MNT_*/uuid`
-    do_facet mds "echo $UUID > /proc/fs/lustre/mds/mds_svc/evict_client"
+    do_facet mds "echo $UUID > /proc/fs/lustre/mds/${mds_svc}/evict_client"
 }
 
 fail() {
-    local facet=$1
-    facet_failover $facet
+    facet_failover $*
     df $MOUNT || error "post-failover df: $?"
 }
 
 fail_abort() {
     local facet=$1
-    stop $facet --force --failover --nomod
+    stop $facet
     change_active $facet
-    start $facet
-    do_facet $facet lctl --device %${facet}_svc abort_recovery
+    local svc=${facet}_svc
+    local dev=${facet}_dev
+    local opt=${facet}_opt
+    start $facet ${!dev} ${!opt}
+    do_facet $facet lctl --device %${!svc} abort_recovery
     df $MOUNT || echo "first df failed: $?"
     sleep 1
     df $MOUNT || error "post-failover df: $?"
 }
 
 do_lmc() {
-    $LMC -m ${XMLCONFIG} $@
+    echo There is no lmc.  This is mountconf, baby.
+    exit 1
 }
 
 h2gm () {
@@ -353,69 +400,13 @@ do_facet() {
     do_node $HOST $@
 }
 
-add_facet() {
+add() {
     local facet=$1
     shift
-    echo "add facet $facet: `facet_host $facet`"
-    do_lmc --add node --node ${facet}_facet $@ --timeout $TIMEOUT \
-        --lustre_upcall $UPCALL --ptldebug $PTLDEBUG --subsystem $SUBSYSTEM
-    do_lmc --add net --node ${facet}_facet --nid `facet_nid $facet` \
-        --nettype lnet $PORT_OPT
-}
-
-add_mds() {
-    local MOUNT_OPTS
-    local facet=$1
-    shift
-    rm -f ${facet}active
-    add_facet $facet
-    [ "x$MDSOPT" != "x" ] && MOUNT_OPTS="--mountfsoptions $MDSOPT"
-    do_lmc --add mds --node ${facet}_facet --mds ${facet}_svc \
-       --fstype $FSTYPE $* $MOUNT_OPTS
-}
-
-add_mdsfailover() {
-    local MOUNT_OPTS
-    local facet=$1
-    shift
-    add_facet ${facet}failover  --lustre_upcall $UPCALL
-    [ "x$MDSOPT" != "x" ] && MOUNT_OPTS="--mountfsoptions $MDSOPT"
-    do_lmc --add mds  --node ${facet}failover_facet --mds ${facet}_svc \
-       --fstype $FSTYPE $* $MOUNT_OPTS
-}
-
-add_ost() {
-    facet=$1
-    shift
+    # failsafe
+    stop ${facet} -f
     rm -f ${facet}active
-    add_facet $facet
-    do_lmc --add ost --node ${facet}_facet --ost ${facet}_svc \
-       --fstype $FSTYPE $* $OSTOPT
-}
-
-add_ostfailover() {
-    facet=$1
-    shift
-    add_facet ${facet}failover
-    do_lmc --add ost --failover --node ${facet}failover_facet \
-       --ost ${facet}_svc --fstype $FSTYPE $* $OSTOPT
-}
-
-add_lov() {
-    lov=$1
-    mds_facet=$2
-    shift; shift
-    do_lmc --add lov --mds ${mds_facet}_svc --lov $lov $* $LOVOPT
-}
-
-add_client() {
-    local MOUNT_OPTS
-    local facet=$1
-    mds=$2
-    shift; shift
-    [ "x$CLIENTOPT" != "x" ] && MOUNT_OPTS="--clientoptions $CLIENTOPT"
-    add_facet $facet --lustre_upcall $UPCALL
-    do_lmc --add mtpt --node ${facet}_facet --mds ${mds}_svc $* $MOUNT_OPTS
+    $MKFS $*
 }
 
 
@@ -622,6 +613,7 @@ equals_msg() {
 
 log() {
        echo "$*"
+       lsmod | grep lnet > /dev/null || modprobe lnet
        $LCTL mark "$*" 2> /dev/null || true
 }
 
index dc22291..d730e27 100644 (file)
@@ -15,9 +15,15 @@ obdbarrier
 lload
 wirecheck
 lfs
+mkfs.lustre
+mkfs_lustre
+mount.lustre
+mount_lustre
+tunefs.lustre
+tunefs_lustre
+llog_reader
 llmount
 l_getgroups
-mount.lustre
 wiretest
 llog_reader
 .*.cmd
index efc7547..eb43617 100644 (file)
@@ -8,26 +8,30 @@ AM_LDFLAGS := -L$(top_builddir)/lnet/utils
 
 LIBPTLCTL := $(top_builddir)/lnet/utils/libptlctl.a
 
-sbin_scripts = lconf lmc llanalyze llstat.pl llobdstat.pl lactive      \
-       load_ldap.sh lrun 
+sbin_scripts = llanalyze llstat.pl llobdstat.pl lactive lrun 
 bin_scripts = lfind lstripe
 
 if UTILS
-rootsbin_SCRIPTS = mount.lustre
-sbin_PROGRAMS = lctl obdio obdbarrier lload wirecheck wiretest llmount  \
-       l_getgroups
+# mount only finds helpers in /sbin
+rootsbin_PROGRAMS = mount.lustre
+sbin_PROGRAMS = lctl obdio obdbarrier lload wirecheck wiretest \
+       mount_lustre mkfs_lustre mkfs.lustre \
+       tunefs_lustre tunefs.lustre l_getgroups
 bin_PROGRAMS = lfs llog_reader
-lib_LIBRARIES = liblustreapi.a 
+lib_LIBRARIES = liblustreapi.a
 sbin_SCRIPTS = $(sbin_scripts)
 bin_SCRIPTS = $(bin_scripts)
 endif # UTILS
 
+lctl_SOURCES = parser.c obd.c lustre_cfg.c lctl.c parser.h obdctl.h platform.h
 lctl_LDADD := $(LIBREADLINE) $(LIBPTLCTL) 
 lctl_DEPENDENCIES := $(LIBPTLCTL) 
 
+lfs_SOURCES = lfs.c parser.c obd.c
 lfs_LDADD := $(LIBREADLINE) liblustreapi.a $(LIBPTLCTL)
 lfs_DEPENDENCIES := $(LIBPTLCTL) liblustreapi.a 
 
+lload_SOURCES = lload.c 
 lload_LDADD := $(LIBREADLINE) $(LIBPTLCTL)
 lload_DEPENDENCIES := $(LIBPTLCTL)
 
@@ -35,22 +39,29 @@ liblustreapi_a_SOURCES = liblustreapi.c
 
 wirecheck_SOURCES = wirecheck.c
 wirecheck_CPPFLAGS = -DCC="\"$(CC)\""
+
 wiretest_SOURCES = wiretest.c
 
-lctl_SOURCES = parser.c obd.c lustre_cfg.c lctl.c parser.h obdctl.h platform.h
-lload_SOURCES = lload.c 
 obdio_SOURCES = obdio.c obdiolib.c obdiolib.h
 obdbarrier_SOURCES = obdbarrier.c obdiolib.c obdiolib.h
-lfs_SOURCES = lfs.c parser.c obd.c
 
-llog_reader_LDADD := $(LIBREADLINE) $(LIBPTLCTL)
-llog_reader_DEPENDENCIES := $(LIBPTLCTL)
 llog_reader_SOURCES = llog_reader.c
+llog_reader_LDADD := $(LIBPTLCTL)
+llog_reader_DEPENDENCIES := $(LIBPTLCTL)
+
+mount_lustre_SOURCES = mount_lustre.c 
+mount_lustre_LDADD := $(LIBPTLCTL)
+mount_lustre_DEPENDENCIES := $(LIBPTLCTL)
 
-llmount_SOURCES = llmount.c 
-llmount_CFLAGS = $(LLMOUNT_GM_CFLAGS)
-llmount_LDADD = $(LIBREADLINE) $(LIBPTLCTL) $(LLMOUNT_GM_LDADD) 
-llmount_DEPENDENCIES := $(LIBPTLCTL) 
+mkfs_lustre_SOURCES = mkfs_lustre.c
+mkfs_lustre_CPPFLAGS = -UTUNEFS $(AM_CPPFLAGS)
+mkfs_lustre_LDADD := $(LIBPTLCTL)
+mkfs_lustre_DEPENDENCIES := $(LIBPTLCTL)
+
+tunefs_lustre_SOURCES = $(mkfs_lustre_SOURCES)
+tunefs_lustre_CPPFLAGS = -DTUNEFS $(AM_CPPFLAGS)
+tunefs_lustre_LDADD := $(mkfs_lustre_LDADD)
+tunefs_lustre_DEPENDENCIES := $(mkfs_lustre_DEPENDENCIES)
 
 EXTRA_DIST = $(bin_scripts) $(sbin_scripts)
 
@@ -59,5 +70,12 @@ newwiretest: wirehdr.c wirecheck
        cp wirehdr.c wiretest.c
        ./wirecheck >> wiretest.c
 
-mount.lustre$(EXEEXT): llmount
+# Apparently I can't use .'s in automake names
+mount.lustre$(EXEEXT): mount_lustre
+       cp $< $@
+
+mkfs.lustre$(EXEEXT): mkfs_lustre
+       cp $< $@
+
+tunefs.lustre$(EXEEXT): tunefs_lustre
        cp $< $@
diff --git a/lustre/utils/cluster_scripts/1uml.csv b/lustre/utils/cluster_scripts/1uml.csv
new file mode 100644 (file)
index 0000000..d6f23a4
--- /dev/null
@@ -0,0 +1,5 @@
+# combo mdt/mgs
+uml1,options lnet networks=tcp,/r/tmp/mdt,mdt|mgs,,,,--device-size=10240
+# ost0
+uml1,options lnet networks=tcp,/r/tmp/ost0,ost,,"uml1@tcp0",,--device-size=10240
+
diff --git a/lustre/utils/cluster_scripts/cluster_config.sh b/lustre/utils/cluster_scripts/cluster_config.sh
new file mode 100755 (executable)
index 0000000..818d884
--- /dev/null
@@ -0,0 +1,705 @@
+#!/bin/bash
+#
+# cluster_config.sh - configure multiple lustre servers from a csv file
+#
+# This script is used to parse each line of a spreadsheet (csv file) and 
+# execute remote pdsh commands to format (mkfs.lustre) every Lustre target 
+# that will be part of the Lustre cluster.
+# 
+# In addition, it can also verify the network connectivity and hostnames in 
+# the cluster and produce High-Availability software configurations for
+# Heartbeat or CluManager
+#
+################################################################################
+
+# Usage
+usage() {
+       cat >&2 <<EOF
+
+Usage: `basename $0` [-t HAtype] [-n] [-f] [-h] [-v] <csv file>
+
+       -t HAtype       produce High-Availability software configurations
+
+                       The argument following -t is used to indicate the High-
+                       Availability software type. The HA software types which 
+                       are currently supported are: hbv1 (Heartbeat v1), hbv2 
+                       (Heartbeat v2) and clumanager (CluManager).
+       -n              don't verify network connectivity and hostnames in the 
+                       cluster
+       -f              force-format the Lustre targets using --reformat option
+       -h              help and examples
+       -v              verbose mode
+       csv file        a spreadsheet that contains configuration parameters
+                        (separated by commas) for each target in a Lustre cl-
+                        uster
+EOF
+       exit 1
+}
+
+# Samples 
+sample() {
+       cat >&2 <<EOF
+
+Each line in the csv file represents one Lustre target.
+The format is:
+hostname,module_opts,device name,device type,fsname,mgs nids,index,
+format options,mkfs options,mount options,failover nids,heartbeat channels,
+service address,heartbeat options
+
+Items left blank will be set to defaults.
+
+Sample 1 for csv file (Simple one without HA software configuration options):
+-------------------------------------------------------------------------------
+# combo mdt/mgs
+lustre-mgs,options lnet networks=tcp,/r/tmp/mgs,mdt|mgs,,,,--device-size=10240
+
+# ost0
+lustre-ost,options lnet networks=tcp,/r/tmp/ost0,ost,,lustre-mgs@tcp0,,
+--device-size=10240
+-------------------------------------------------------------------------------
+
+Sample 2 for csv file (Complex one without HA software configuration options):
+-------------------------------------------------------------------------------
+# mgs
+lustre-mgs1,options lnet 'networks="tcp,elan"',/r/tmp/mgs,mgs,,,,
+--device-size=10240,-J size=4,,"lustre-mgs2,2@elan"
+
+# mdt
+lustre-mdt1,options lnet 'networks="tcp,elan"',/r/tmp/mdt,mdt,,
+"lustre-mgs1,1@elan:lustre-mgs2,2@elan",,--device-size=10240,
+-J size=4,,lustre-mdt2
+
+# ost
+lustre-ost1,options lnet 'networks="tcp,elan"',/r/tmp/ost,ost,,
+"lustre-mgs1,1@elan:lustre-mgs2,2@elan",,--device-size=10240,
+-J size=4,"extents,mballoc",lustre-ost2
+-------------------------------------------------------------------------------
+
+Sample 3 for csv file (with Heartbeat version 1 configuration options):
+-------------------------------------------------------------------------------
+# mgs
+lustre-mgs1,options lnet networks=tcp,/r/tmp/mgs,mgs,,,,--device-size=10240,,,
+lustre-mgs2,serial /dev/ttyS0:bcast eth1,192.168.1.170,auto_failback off:
+ping 192.168.1.169:respawn hacluster /usr/lib/heartbeat/ipfail
+
+# mdt
+lustre-mdt1,options lnet networks=tcp,/r/tmp/mdt,mdt,,"lustre-mgs1:lustre-mgs2",
+,--device-size=10240,,,lustre-mdt2,bcast eth1,192.168.1.173,auto_failback off
+
+# ost
+lustre-ost1,options lnet networks=tcp,/r/tmp/ost,ost,,"lustre-mgs1:lustre-mgs2",
+,--device-size=10240,,,lustre-ost2,bcast eth1,192.168.1.171,auto_failback on
+-------------------------------------------------------------------------------
+
+Sample 4 for csv file (with Heartbeat version 2 configuration options):
+-------------------------------------------------------------------------------
+# combo mdt/mgs
+lustre-mgs1,options lnet networks=tcp,/r/tmp/mgs,mgs|mdt,,,,--device-size=10240,
+,,"lustre-mgs2:lustre-mgs3",bcast eth1,192.168.1.170,auto_failback off
+
+# ost
+lustre-ost1,options lnet networks=tcp,/r/tmp/ost,ost,,"lustre-mgs1:lustre-mgs2:
+lustre-mgs3",,--device-size=10240,,,lustre-ost2,bcast eth1,192.168.1.171,
+auto_failback on:crm yes
+-------------------------------------------------------------------------------
+
+Sample 5 for csv file (with Red Hat's Cluster Manager configuration options):
+-------------------------------------------------------------------------------
+# mgs
+lustre-mgs1,options lnet networks=tcp,/r/tmp/mgs,mgs,,,,--device-size=10240,,,
+lustre-mgs2,broadcast,192.168.1.170,--clumembd--interval=1000000 --tko_count=20
+
+# mdt
+lustre-mdt1,options lnet networks=tcp,/r/tmp/mdt,mdt,,"lustre-mgs1:lustre-mgs2",
+,--device-size=10240,,,lustre-mdt2,multicast225.0.0.12,192.168.1.173,
+
+# ost
+lustre-ost1,options lnet networks=tcp,/r/tmp/ost,ost,,"lustre-mgs1:lustre-mgs2",
+,--device-size=10240,,,lustre-ost2,,192.168.1.171:192.168.1.172,
+-------------------------------------------------------------------------------
+
+EOF
+       exit 1
+}
+
+# Global variables
+PDSH=${PDSH:-"pdsh -R ssh"}
+export PDSH
+
+CMD_PATH=${CMD_PATH:-"/sbin/"}
+
+# Some scripts to be called
+SCRIPTS_PATH=${CLUSTER_SCRIPTS_PATH:-"./"}
+MODULE_CONFIG=${SCRIPTS_PATH}$"module_config.sh"
+VERIFY_CLUSTER_NET=${SCRIPTS_PATH}$"verify_cluster_net.sh"
+GEN_HB_CONFIG=${SCRIPTS_PATH}$"gen_hb_config.sh"
+GEN_CLUMGR_CONFIG=${SCRIPTS_PATH}$"gen_clumanager_config.sh"
+
+HATYPE_HBV1=$"hbv1"                    # Heartbeat version 1
+HATYPE_HBV2=$"hbv2"                    # Heartbeat version 2
+HATYPE_CLUMGR=$"clumanager"            # Cluster Manager
+
+HB_TMP_DIR=$"/tmp/heartbeat/"          # Temporary directory
+CLUMGR_TMP_DIR=$"/tmp/clumanager/"
+TMP_DIRS=$"${HB_TMP_DIR} ${CLUMGR_TMP_DIR}"
+
+declare -a CONFIG_ITEM                 # fields in each line of the csv file
+declare -a NODE_NAMES                  # node names in the failover group
+
+# Get and check the positional parameters
+while getopts "t:nfhv" OPTION; do
+       case $OPTION in
+       t) 
+               HATYPE_OPT=$OPTARG
+               if [ "${HATYPE_OPT}" != "${HATYPE_HBV1}" ] \
+               && [ "${HATYPE_OPT}" != "${HATYPE_HBV2}" ] \
+               && [ "${HATYPE_OPT}" != "${HATYPE_CLUMGR}" ]; then
+                       echo >&2 $"`basename $0`: Invalid HA software type" \
+                                 "- ${HATYPE_OPT}!"
+                       usage
+               fi
+               ;;
+        n) 
+               VERIFY_CONNECT=$"no"
+               ;;
+        f) 
+               REFORMAT_OPTION=$"--reformat "
+               ;;
+        h) 
+               sample  
+               ;;
+       v) 
+               VERBOSE_OPT=$" -v"
+               ;;
+        ?) 
+               usage 
+       esac
+done
+
+# Toss out the parameters we've already processed
+shift  `expr $OPTIND - 1`
+
+# Here we expect the csv file
+if [ $# -eq 0 ]; then
+       echo >&2 $"`basename $0`: Missing csv file!"
+       usage
+fi
+
+# Output verbose informations
+verbose_output() {
+       if [ -n "${VERBOSE_OPT}" ]; then
+               echo "`basename $0`: $*"
+       fi
+       return 0
+}
+
+# Check the csv file
+check_file() {
+        # Check argument
+        if [ $# -eq 0 ]; then
+                echo >&2 $"`basename $0`: check_file() error: Lack argument"\
+                         "for function check_file()!"
+                return 1
+        fi
+
+       CSV_FILE=$1
+       if [ ! -s ${CSV_FILE} ]; then
+                echo >&2 $"`basename $0`: check_file() error: ${CSV_FILE}"\
+                         "does not exist or is empty!"
+                return 1
+        fi
+
+        return 0
+}
+
+# Parse a line in the csv file
+parse_line() {
+        # Check argument
+        if [ $# -eq 0 ]; then
+                echo >&2 $"`basename $0`: parse_line() error: Lack argument"\
+                         "for function parse_line()!"
+                return 1
+        fi
+
+       declare -i i=0
+       declare -i length=0 
+       declare -i idx=0
+       declare -i s_quote_flag=0 
+       declare -i d_quote_flag=0
+       local TMP_LETTER LINE
+       LINE=$*
+
+       # Initialize the CONFIG_ITEM array
+       for ((i = 0; i < ${#CONFIG_ITEM[@]}; i++)); do
+               CONFIG_ITEM[i]=$""
+       done
+
+       # Get the length of the line
+        length=${#LINE}
+
+       i=0
+       while [ ${idx} -lt ${length} ]; do
+               # Get a letter from the line
+               TMP_LETTER=${LINE:${idx}:1}
+
+               case "${TMP_LETTER}" in
+               ",")
+                               if [ ${s_quote_flag} -eq 1 ] || [ ${d_quote_flag} -eq 1 ]; then
+                                       CONFIG_ITEM[i]=${CONFIG_ITEM[i]}${TMP_LETTER}
+                               else
+                               i=$i+1
+                               fi
+                               idx=${idx}+1
+                       continue
+                       ;;
+               "'")
+                               if [ ${s_quote_flag} -eq 0 ]; then
+                                       s_quote_flag=1
+                               else
+                                       s_quote_flag=0
+                               fi
+                       ;;
+               "\"")
+                               if [ ${d_quote_flag} -eq 0 ]; then
+                                       d_quote_flag=1
+                               else
+                                       d_quote_flag=0
+                               fi
+
+                               if [ ${i} -eq 1 ]; then
+                               CONFIG_ITEM[i]=${CONFIG_ITEM[i]}$"\\"${TMP_LETTER}
+                               idx=${idx}+1
+                               continue
+                       fi
+                       ;;
+               "\r")
+                               idx=${idx}+1
+                       continue
+                       ;;
+               *)
+                       ;;
+               esac
+                CONFIG_ITEM[i]=${CONFIG_ITEM[i]}${TMP_LETTER}
+                idx=${idx}+1
+               done
+       return 0
+}
+
+# Check the elements required for OSTs, MDTs and MGS
+#
+# When formatting an OST, the following elements: hostname, module_opts,
+# device name, device type and mgs nids, cannot have null value.
+#
+# When formatting an MDT or MGS, the following elements: hostname,
+# module_opts, device name and device type, cannot have null value.
+check_element() {
+        # Check hostname, module_opts, device name and device type
+        if [ -z "${HOST_NAME}" ]||[ -z "${MODULE_OPTS}" ]||[ -z "${DEVICE_NAME}" ]\
+          ||[ -z "${DEVICE_TYPE}" ]; then
+                echo >&2 $"`basename $0`: check_element() error: Some required"\
+                         "element has null value! Check hostname, module_opts,"\
+                         "device name and device type!"
+                return 1
+        fi
+
+        # Check mgs nids
+        if [ "${DEVICE_TYPE}" = "ost" ]&&[ -z "${MGS_NIDS}" ]; then
+                echo >&2 $"`basename $0`: check_element() error: OST's mgs nids"\
+                         "element has null value!"
+                return 1
+        fi
+
+        return 0
+}
+
+# Check the elements required for HA configuration
+check_ha_element() {
+       if [ -z "${HATYPE_OPT}" ]; then
+               return 0
+       fi
+
+       # Check service IP element
+       if [ -z "${SRV_IPADDRS}" ]; then
+                echo >&2 $"`basename $0`: check_ha_element() error: Service IP"\
+                         "element has null value!"
+                return 1
+        fi
+
+       # Check heartbeat channel element
+       if [ "${HATYPE_OPT}" != "${HATYPE_CLUMGR}" -a -z "${HB_CHANNELS}" ]
+       then
+                echo >&2 $"`basename $0`: check_ha_element() error: Heartbeat"\
+                         "channel element has null value!"
+                return 1
+        fi
+
+       return 0
+}
+
+# Check the number of MGS.
+# There should be no more than one MGS specified in the entire csv file.
+check_mgs() {
+       # Check the number of explicit MGS
+       if [ "${DEVICE_TYPE#*mgs*}" != "${DEVICE_TYPE}" ]; then 
+               if [ "${EXP_MGS}" = "${HOST_NAME}" ]; then
+                       echo >&2 $"`basename $0`: check_mgs() error: More than"\
+                                 "one explicit MGS in the csv file!"
+                       return 1
+               fi
+
+               if [ -z "${EXP_MGS}" ]; then
+                       EXP_MGS=${HOST_NAME}
+               fi
+
+               if [ "${EXP_MGS}" != "${HOST_NAME}" ]; then
+                       if [ "${FAILOVERS#*$EXP_MGS*}" = "${FAILOVERS}" ]; then
+                               echo >&2 $"`basename $0`: check_mgs() error:"\
+                                         "More than one explicit MGS in the"\
+                                         "csv file!"
+                       else
+                               echo >&2 $"`basename $0`: check_mgs() error:"\
+                                         "There should not be two entries for"\
+                                         "a server and its failover partner"\
+                                         "in the csv file!"
+                       fi
+                       return 1
+               fi
+       fi
+
+       # Check the number of implicit MGS
+        if [ "${DEVICE_TYPE}" = "mdt" ]&&[ -z "${MGS_NIDS}" ]; then
+               if [ "${IMP_MGS}" = "${HOST_NAME}" ]; then
+                       echo >&2 $"`basename $0`: check_mgs() error: More than"\
+                                 "one implicit MGS in the csv file!"
+                       return 1
+               fi
+
+               if [ -z "${IMP_MGS}" ]; then
+                       IMP_MGS=${HOST_NAME}
+               fi
+
+               if [ "${IMP_MGS}" != "${HOST_NAME}" ]; then
+                       if [ "${FAILOVERS#*$IMP_MGS*}" = "${FAILOVERS}" ]; then
+                               echo >&2 $"`basename $0`: check_mgs() error:"\
+                                         "More than one implicit MGS in the"\
+                                         "csv file!"
+                       else
+                               echo >&2 $"`basename $0`: check_mgs() error:"\
+                                         "There should not be two entries for"\
+                                         "a server and its failover partner"\
+                                         "in the csv file!"
+                       fi
+                       return 1
+               fi
+       fi
+
+       if [ -n "${EXP_MGS}" -a -n "${IMP_MGS}" ]; then
+               echo >&2 $"`basename $0`: check_mgs() error: More than one"\
+                         "MGS in the csv file!"
+               return 1
+       fi
+       
+       return 0
+}
+
+# Construct the command line of mkfs.lustre
+construct_mkfs_cmdline() {
+       MKFS_CMD=${CMD_PATH}$"mkfs.lustre "
+       MKFS_CMD=${MKFS_CMD}${REFORMAT_OPTION}
+
+       case "${DEVICE_TYPE}" in
+       "ost")
+               MKFS_CMD=${MKFS_CMD}$"--ost "
+               ;;
+       "mdt")
+               MKFS_CMD=${MKFS_CMD}$"--mdt "
+               ;;
+       "mgs")
+               MKFS_CMD=${MKFS_CMD}$"--mgs "
+               ;;
+       "mdt|mgs")
+               MKFS_CMD=${MKFS_CMD}$"--mdt --mgs "
+               ;;
+       "mgs|mdt")
+               MKFS_CMD=${MKFS_CMD}$"--mdt --mgs "
+               ;;
+       *)
+               echo >&2 $"`basename $0`: construct_mkfs_cmdline() error:"\
+                         "Invalid device type - \"${DEVICE_TYPE}\""
+               return 1
+               ;;
+       esac
+
+       if [ -n "${FS_NAME}" ]; then
+               MKFS_CMD=${MKFS_CMD}$"--fsname="${FS_NAME}$" "
+       fi
+
+       if [ -n "${MGS_NIDS}" ]; then
+               MGS_NIDS=`echo "${MGS_NIDS}" | sed 's/^"//' | sed 's/"$//'`
+               MKFS_CMD=${MKFS_CMD}$"--mgsnode="${MGS_NIDS}$" "
+       fi
+
+       if [ -n "${INDEX}" ]; then
+               MKFS_CMD=${MKFS_CMD}$"--index="${INDEX}$" "
+       fi
+
+       if [ -n "${FORMAT_OPTIONS}" ]; then
+               FORMAT_OPTIONS=`echo "${FORMAT_OPTIONS}" | sed 's/^"//' | sed 's/"$//'`
+               MKFS_CMD=${MKFS_CMD}${FORMAT_OPTIONS}$" "
+       fi
+
+       if [ -n "${MKFS_OPTIONS}" ]; then
+               MKFS_OPTIONS=`echo "${MKFS_OPTIONS}" | sed 's/^"//' | sed 's/"$//'`
+               MKFS_CMD=${MKFS_CMD}$"--mkfsoptions="$"\""${MKFS_OPTIONS}$"\""$" "
+       fi
+
+       if [ -n "${MOUNT_OPTIONS}" ]; then
+               MOUNT_OPTIONS=`echo "${MOUNT_OPTIONS}" | sed 's/^"//' | sed 's/"$//'`
+               MKFS_CMD=${MKFS_CMD}$"--mountfsoptions="$"\""${MOUNT_OPTIONS}$"\""$" "
+       fi
+
+       if [ -n "${FAILOVERS}" ]; then
+               FAILOVERS=`echo "${FAILOVERS}" | sed 's/^"//' | sed 's/"$//'`
+               MKFS_CMD=${MKFS_CMD}$"--failnode="${FAILOVERS}$" "
+       fi
+
+       MKFS_CMD=${MKFS_CMD}${DEVICE_NAME}
+       return 0
+} 
+
+# Get all the node names in this failover group
+get_nodenames() {
+        declare -i idx
+        local failover_nids failover_nid first_nid
+
+       NODE_NAMES[0]=${HOST_NAME}
+
+        failover_nids=`echo ${FAILOVERS}|awk '{split($FAILOVERS, a, ":")}\
+                       END {for (i in a) print a[i]}'`
+
+       # XXX: Suppose the first nid of one failover node contains the node name
+       idx=1
+        for failover_nid in ${failover_nids}
+        do
+               first_nid=`echo ${failover_nid} | awk -F, '{print $1}'`
+                NODE_NAMES[idx]=${first_nid%@*}
+                idx=$idx+1
+        done
+
+        return 0
+}
+
+# Produce HA software's configuration files
+gen_ha_config() {
+        local  cmd_line
+        declare -i idx
+
+       if [ -z "${HATYPE_OPT}" ]; then
+               return 0
+       fi
+
+       # Prepare parameters
+       # Hostnames option
+       HOSTNAME_OPT=${HOST_NAME}
+
+       if ! get_nodenames; then
+               return 1
+       fi
+
+        for ((idx = 1; idx < ${#NODE_NAMES[@]}; idx++)); do
+                HOSTNAME_OPT=${HOSTNAME_OPT}$":"${NODE_NAMES[idx]}
+        done
+
+       # Target device option
+       TARGET_TYPE=${DEVICE_TYPE}
+       if [ "${TARGET_TYPE}" = "mdt|mgs" -o "${TARGET_TYPE}" = "mgs|mdt" ]
+       then
+               TARGET_TYPE=$"mgs_mdt"
+       fi
+       TARGET_OPT=${DEVICE_NAME}:${TARGET_TYPE}
+
+       # Service IP address option
+       SRVADDR_OPT=${SRV_IPADDRS}
+
+       # Heartbeat channels option
+       HBCHANNEL_OPT=$"\""${HB_CHANNELS}$"\""
+
+       # Heartbeat options option
+       HBOPT_OPT=$"\""${HB_OPTIONS}$"\""
+
+       # Construct the generation script command line
+       case "${HATYPE_OPT}" in
+       "${HATYPE_HBV1}"|"${HATYPE_HBV2}")      # Heartbeat 
+               cmd_line=${GEN_HB_CONFIG}$" -r ${HATYPE_OPT} -n ${HOSTNAME_OPT}"
+               cmd_line=${cmd_line}$" -d ${TARGET_OPT} -c ${HBCHANNEL_OPT}"
+               cmd_line=${cmd_line}$" -s ${SRVADDR_OPT}"${VERBOSE_OPT}
+
+               if [ -n "${HB_OPTIONS}" ]; then
+                       cmd_line=${cmd_line}$" -o ${HBOPT_OPT}"
+               fi
+               ;;
+        "${HATYPE_CLUMGR}")                    # CluManager
+               cmd_line=${GEN_CLUMGR_CONFIG}$" -n ${HOSTNAME_OPT}"
+               cmd_line=${cmd_line}$" -d ${TARGET_OPT} -s ${SRVADDR_OPT}"
+               cmd_line=${cmd_line}${VERBOSE_OPT}
+
+               if [ -n "${HBCHANNEL_OPT}" ]; then
+                       cmd_line=${cmd_line}$" -c ${HBCHANNEL_OPT}"
+               fi
+
+               if [ -n "${HB_OPTIONS}" ]; then
+                       cmd_line=${cmd_line}$" -o ${HBOPT_OPT}"
+               fi
+               ;;
+       esac
+       
+       # Execute script to generate HA software's configuration files
+       verbose_output "${cmd_line}"
+       eval $(echo "${cmd_line}")
+       if [ $? -ne 0 ]; then
+               return 1
+       fi
+       
+       return 0
+}
+
+# Execute pdsh commands to add lnet options lines to remote nodes'
+# modprobe.conf/modules.conf and format(mkfs.lustre) Lustre targets
+#
+# If -t option exists, then also to produce the HA software's 
+# configuration files
+mass_config() {
+       # Check argument
+        if [ $# -eq 0 ]; then
+                echo >&2 $"`basename $0`: mass_config() error: Lack argument"\
+                         "for function mass_config()!"
+                return 1
+        fi
+
+        CSV_FILE=$1
+       local LINE COMMAND
+       declare -a PDSH_PID 
+       declare -a PDSH_CMD 
+       declare -i line_num=1
+       declare -i pid_num=0
+
+       while read -r LINE; do
+               # Get rid of the empty line
+               if [ -z "`echo ${LINE} | awk '/[[:alnum:]]/{print $0}'`" ]; then
+                       line_num=${line_num}+1
+                       continue
+               fi
+
+               # Get rid of the comment line
+               if [ -z "`echo \"${LINE}\" | egrep -v \"([[:space:]]|^)#\"`" ]; then
+                       line_num=${line_num}+1
+                       continue
+               fi
+
+               # Parse the config line into CONFIG_ITEM
+               if ! parse_line $LINE; then
+                       return 1        
+               fi
+
+               HOST_NAME=${CONFIG_ITEM[0]}
+               MODULE_OPTS=${CONFIG_ITEM[1]}
+               DEVICE_NAME=${CONFIG_ITEM[2]}
+               DEVICE_TYPE=${CONFIG_ITEM[3]}
+               FS_NAME=${CONFIG_ITEM[4]}
+               MGS_NIDS=${CONFIG_ITEM[5]}
+               INDEX=${CONFIG_ITEM[6]}
+               FORMAT_OPTIONS=${CONFIG_ITEM[7]}
+               MKFS_OPTIONS=${CONFIG_ITEM[8]}
+               MOUNT_OPTIONS=${CONFIG_ITEM[9]}
+               FAILOVERS=${CONFIG_ITEM[10]}
+
+               HB_CHANNELS=${CONFIG_ITEM[11]}
+               SRV_IPADDRS=${CONFIG_ITEM[12]}
+               HB_OPTIONS=${CONFIG_ITEM[13]}
+
+               # Check some required elements for formatting target
+               if ! check_element; then
+                       echo >&2 $"`basename $0`: check_element() error:"\
+                                 "Occurred on line ${line_num} in ${CSV_FILE}"
+                       return 1        
+               fi
+               
+               # Check the number of MGS
+               if ! check_mgs; then
+                       echo >&2 $"`basename $0`: check_mgs() error:"\
+                                 "Occurred on line ${line_num} in ${CSV_FILE}"
+                       return 1
+               fi
+               
+               # Construct the command line of mkfs.lustre
+               if ! construct_mkfs_cmdline; then
+                       echo >&2 $"`basename $0`: construct_mkfs_cmdline() error:"\
+                                 "Occurred on line ${line_num} in ${CSV_FILE}"
+                       return 1        
+               fi
+
+               # Produce HA software's configuration files
+               if ! gen_ha_config; then
+                       return 1
+               fi
+
+               # Execute pdsh command to add lnet options lines to 
+               # modprobe.conf/modules.conf
+               COMMAND=$"echo \"${MODULE_OPTS}\"|${MODULE_CONFIG}"
+               verbose_output "Adding module options to ${HOST_NAME}"
+               verbose_output ${COMMAND}
+               ${PDSH} -w ${HOST_NAME} ${COMMAND} >&2 &
+               PDSH_PID[${pid_num}]=$!
+               PDSH_CMD[${pid_num}]="${PDSH} -w ${HOST_NAME} ${COMMAND}"
+               pid_num=${pid_num}+1
+
+               # Execute pdsh command to format Lustre target
+               verbose_output "Formatting Lustre target on ${HOST_NAME}..."
+               verbose_output "Format command line is: ${MKFS_CMD}"
+               ${PDSH} -w ${HOST_NAME} ${MKFS_CMD} >&2 &  
+               PDSH_PID[${pid_num}]=$!
+               PDSH_CMD[${pid_num}]="${PDSH} -w ${HOST_NAME} ${MKFS_CMD}"
+               pid_num=${pid_num}+1
+
+               line_num=${line_num}+1
+       done < ${CSV_FILE}
+
+       # Wait for the exit status of the background pdsh command
+       verbose_output "Waiting for the return of the pdsh command..."
+       for ((pid_num = 0; pid_num < ${#PDSH_PID[@]}; pid_num++)); do
+               wait ${PDSH_PID[${pid_num}]}
+               if [ $? -ne 0 ]; then
+                       echo >&2 "`basename $0`: mass_config() error:"\
+                                "Fail to execute \"${PDSH_CMD[${pid_num}]}\"!"
+               fi
+       done    
+
+       rm -rf ${TMP_DIRS}
+       return 0
+}
+
+# Main flow
+# Check the csv file
+if ! check_file $1; then
+       exit 1  
+fi
+
+if [ "${VERIFY_CONNECT}" != "no" ]; then
+# Check the network connectivity and hostnames
+       verbose_output "Checking the network connectivity and hostnames..."
+       if ! ${VERIFY_CLUSTER_NET} ${VERBOSE_OPT} ${CSV_FILE}; then
+               exit 1
+       fi
+       verbose_output "Check the network connectivity and hostnames OK!"
+fi
+
+# Configure the Lustre cluster
+verbose_output "******** Lustre cluster configuration START ********"
+if ! mass_config ${CSV_FILE}; then
+       rm -rf ${TMP_DIRS}
+       exit 1
+fi
+verbose_output "******** Lustre cluster configuration END **********"
+
+exit 0
diff --git a/lustre/utils/cluster_scripts/gen_clumanager_config.sh b/lustre/utils/cluster_scripts/gen_clumanager_config.sh
new file mode 100755 (executable)
index 0000000..8469f7d
--- /dev/null
@@ -0,0 +1,379 @@
+#!/bin/bash
+#
+# gen_clumanager_config.sh - script for generating the Red Hat's Cluster Manager
+#                           HA software's configuration files
+#
+################################################################################
+
+# Usage
+usage() {
+       cat >&2 <<EOF
+
+Usage:  `basename $0` <-n hostnames> <-d target device> <-s service addresses> 
+                     [-c heartbeat channels] [-o heartbeat options] [-v]
+
+       -n hostnames            the nodenames of the primary node and its fail-
+                               overs
+                               Multiple nodenames are separated by colon (:)
+                               delimeter. The first one is the nodename of the 
+                               primary node, the others are failover nodenames.
+       -d target device        the target device name and type
+                               The name and type are separated by colon (:)
+                               delimeter. The type values are: mgs, mdt, ost or
+                               mgs_mdt.
+       -s service addresses    the IP addresses to failover
+                               Multiple addresses are separated by colon (:)
+                               delimeter.
+       -c heartbeat channels   the methods to send/rcv heartbeats on
+                               The default method is multicast, and multicast_
+                               ipaddress is "225.0.0.11".
+       -o heartbeat options    a "catchall" for other heartbeat configuration 
+                               options
+       -v                      verbose mode
+
+EOF
+       exit 1
+}
+
+# Global variables
+SCRIPTS_PATH=${CLUSTER_SCRIPTS_PATH:-"./"}
+SCRIPT_VERIFY_SRVIP=${SCRIPTS_PATH}$"verify_serviceIP.sh"
+
+LUSTRE_SRV_SCRIPT=$"/etc/rc.d/init.d/lustre"   # service script for lustre
+
+TMP_DIR=$"/tmp/clumanager/"            # temporary directory
+CLUMGR_DIR=$"/etc/"                    # CluManager configuration directory
+
+CONFIG_CMD=$"redhat-config-cluster-cmd"
+
+declare -a NODE_NAMES                  # node names in the failover group
+declare -a SRV_IPADDRS                 # service IP addresses
+
+# Get and check the positional parameters
+while getopts "n:d:s:c:o:v" OPTION; do
+       case $OPTION in
+        n)
+               HOSTNAME_OPT=$OPTARG 
+               HOSTNAME_NUM=`echo ${HOSTNAME_OPT} | awk -F":" '{print NF}'`
+               if [ ${HOSTNAME_NUM} -lt 2 ]; then
+                       echo >&2 $"`basename $0`: Lack failover nodenames!"
+                       usage
+               fi
+               ;;
+        d)
+               DEVICE_OPT=$OPTARG 
+               TARGET_DEV=`echo ${DEVICE_OPT} | awk -F":" '{print $1}'`
+               TARGET_TYPE=`echo ${DEVICE_OPT} | awk -F":" '{print $2}'`
+               if [ -z "${TARGET_TYPE}" ]; then
+                       echo >&2 $"`basename $0`: Lack target device type!"
+                       usage
+               fi
+               if [ "${TARGET_TYPE}" != "mgs" ]&&[ "${TARGET_TYPE}" != "mdt" ]\
+               &&[ "${TARGET_TYPE}" != "ost" ]&&[ "${TARGET_TYPE}" != "mgs_mdt" ]
+               then
+                       echo >&2 $"`basename $0`: Invalid target device type" \
+                                 "- ${TARGET_TYPE}!"
+                       usage
+               fi
+               ;;
+        s)
+               SRVADDR_OPT=$OPTARG 
+               ;;
+        c)
+               HBCHANNEL_OPT=$OPTARG
+               HBCHANNEL_OPT=`echo "${HBCHANNEL_OPT}" | sed 's/^"//' \
+                               | sed 's/"$//'` 
+               if [ -n "${HBCHANNEL_OPT}" ] \
+               && [ "${HBCHANNEL_OPT}" = "${HBCHANNEL_OPT#*broadcast*}" ] \
+               && [ "${HBCHANNEL_OPT}" = "${HBCHANNEL_OPT#*multicast*}" ]; then
+                       echo >&2 $"`basename $0`: Invalid Heartbeat channel" \
+                                 "- ${HBCHANNEL_OPT}!"
+                       usage
+               fi
+               ;;
+        o)
+               HBOPT_OPT=$OPTARG 
+               HBOPT_OPT=`echo "${HBOPT_OPT}" | sed 's/^"//' | sed 's/"$//'`
+               ;;
+       v) 
+               VERBOSE_OPT=$"yes"
+               ;;
+        ?) 
+               usage 
+       esac
+done
+
+# Check the required parameters
+if [ -z "${HOSTNAME_OPT}" ]; then
+       echo >&2 $"`basename $0`: Lack -n option!"
+       usage
+fi
+
+if [ -z "${DEVICE_OPT}" ]; then
+       echo >&2 $"`basename $0`: Lack -d option!"
+       usage
+fi
+
+if [ -z "${SRVADDR_OPT}" ]; then
+       echo >&2 $"`basename $0`: Lack -s option!"
+       usage
+fi
+
+# Output verbose informations
+verbose_output() {
+       if [ "${VERBOSE_OPT}" = "yes" ]; then
+               echo "`basename $0`: $*"
+       fi
+       return 0
+}
+
+# get_nodenames
+#
+# Get all the node names in this failover group
+get_nodenames() {
+       PRIM_NODENAME=`echo ${HOSTNAME_OPT} | awk -F":" '{print $1}'`
+
+       declare -i idx
+       local nodename_str nodename
+
+       nodename_str=`echo ${HOSTNAME_OPT}|awk '{split($HOSTNAME_OPT, a, ":")}\
+                     END {for (i in a) print a[i]}'`
+       idx=0
+       for nodename in ${nodename_str}
+        do
+               NODE_NAMES[idx]=${nodename}
+               idx=$idx+1
+        done
+
+       return 0
+}
+
+# get_check_srvIPaddrs
+#
+# Get and check all the service IP addresses in this failover group
+get_check_srvIPaddrs() {
+       declare -i idx
+       declare -i i
+       local srvIPaddr_str srvIPaddr
+
+       srvIPaddr_str=`echo ${SRVADDR_OPT}|awk '{split($SRVADDR_OPT, a, ":")}\
+                     END {for (i in a) print a[i]}'`
+       idx=0
+       for srvIPaddr in ${srvIPaddr_str}
+        do
+               SRV_IPADDRS[idx]=${srvIPaddr}
+               idx=$idx+1
+        done
+
+       for ((idx = 0; idx < ${#SRV_IPADDRS[@]}; idx++)); do
+         for ((i = 0; i < ${#NODE_NAMES[@]}; i++)); do
+           # Check service IP address
+           verbose_output "Verifying service IP ${SRV_IPADDRS[idx]} and" \
+                          "real IP of host ${NODE_NAMES[i]} are in the" \
+                          "same subnet..."
+           if ! ${SCRIPT_VERIFY_SRVIP} ${SRV_IPADDRS[idx]} ${NODE_NAMES[i]}
+           then
+             return 1
+           fi
+           verbose_output "OK"
+         done
+       done
+
+       return 0
+}
+
+# stop_clumanager
+#
+# Run pdsh command to stop each node's clumanager service
+stop_clumanager() {
+       declare -i idx
+       local nodename_str=${PRIM_NODENAME}
+
+       for ((idx = 1; idx < ${#NODE_NAMES[@]}; idx++)); do
+               nodename_str=${nodename_str}$","${NODE_NAMES[idx]}
+       done
+
+       ${PDSH} -w ${nodename_str} /sbin/service clumanager stop
+       if [ $? -ne 0 ]; then
+               echo >&2 "`basename $0`: stop_clumanager() error:"\
+                        "Fail to execute pdsh command!"
+               return 1
+       fi
+
+       return 0
+}
+
+# check_retval retval
+#
+# Check the return value of redhat-config-cluster-cmd
+check_retval() {
+       if [ $1 -ne 0 ]; then
+               echo >&2 "`basename $0`: Fail to run ${CONFIG_CMD}!"
+               return 1
+       fi
+
+       return 0
+}
+
+# gen_cluster_xml
+#
+# Run redhat-config-cluster-cmd to create the cluster.xml file
+gen_cluster_xml() {
+       declare -i idx
+       local mcast_IPaddr
+       local hbopt_str hbopt
+
+       # Run redhat-config-cluster-cmd to generate cluster.xml
+       # Add clumembd tag
+       if [ "${HBCHANNEL_OPT}" != "${HBCHANNEL_OPT#*broadcast*}" ]; then
+               ${CONFIG_CMD} --clumembd --broadcast=yes
+               if ! check_retval $?; then
+                       return 1
+               fi
+       elif [ "${HBCHANNEL_OPT}" != "${HBCHANNEL_OPT#*multicast*}" ]; then
+               mcast_IPaddr=`echo ${HBCHANNEL_OPT} | awk '{print $2}'`
+               if [ -n "${mcast_IPaddr}" ]; then
+                       ${CONFIG_CMD} --clumembd --multicast=yes\
+                                     --multicast_ipaddress=${mcast_IPaddr}
+                       if ! check_retval $?; then
+                               return 1
+                       fi
+               fi
+       fi
+
+       # Add cluster tag
+       ${CONFIG_CMD} --cluster --name='${TARGET_TYPE} failover group'
+       if ! check_retval $?; then
+               return 1
+       fi
+
+       # Add member tag
+       for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do
+               ${CONFIG_CMD} --add_member --name=${NODE_NAMES[idx]}
+               if ! check_retval $?; then
+                       return 1
+               fi
+       done
+
+       # Add failoverdomain tag
+       ${CONFIG_CMD} --add_failoverdomain --name=${TARGET_TYPE}-domain
+       if ! check_retval $?; then
+               return 1
+       fi
+
+       for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do
+               ${CONFIG_CMD} --failoverdomain=${TARGET_TYPE}-domain\
+                       --add_failoverdomainnode --name=${NODE_NAMES[idx]}
+               if ! check_retval $?; then
+                       return 1
+               fi
+       done
+
+       # Add service tag
+       ${CONFIG_CMD} --add_service --name=${TARGET_TYPE}-service
+       if ! check_retval $?; then
+               return 1
+       fi
+
+       ${CONFIG_CMD} --service=${TARGET_TYPE}-service \
+               --userscript=${LUSTRE_SRV_SCRIPT}
+       if ! check_retval $?; then
+               return 1
+       fi
+
+       ${CONFIG_CMD} --service=${TARGET_TYPE}-service \
+               --failoverdomain=${TARGET_TYPE}-domain
+       if ! check_retval $?; then
+               return 1
+       fi
+
+       for ((idx = 0; idx < ${#SRV_IPADDRS[@]}; idx++)); do
+               ${CONFIG_CMD} --service=mgs-service \
+                       --add_service_ipaddress --ipaddress=${SRV_IPADDRS[idx]}
+               if ! check_retval $?; then
+                       return 1
+               fi
+       done
+
+       # Add other tags
+       if [ -n "${HBOPT_OPT}"]; then
+               hbopt_str=`echo ${HBOPT_OPT}|awk '{split($HBOPT_OPT, a, ":")}\
+                         END {for (i in a) print a[i]}'`
+               idx=0
+               for hbopt in ${hbopt_str}
+               do
+                       ${CONFIG_CMD} ${hbopt}
+                       if ! check_retval $?; then
+                               return 1
+                       fi
+                       idx=$idx+1
+               done
+       fi
+
+       return 0
+}
+
+# create_config
+#
+# Create the cluster.xml file and scp it to the each node's /etc/
+create_config() {
+       CONFIG_PRIMNODE=${TMP_DIR}$"cluster.xml."${PRIM_NODENAME}
+       declare -i idx
+
+       if [ -e ${CONFIG_PRIMNODE} ]; then
+               verbose_output "${CONFIG_PRIMNODE} already exists."
+               return 0
+       fi
+
+       # Run redhat-config-cluster-cmd to generate cluster.xml
+       verbose_output "Creating cluster.xml file for" \
+                      "${PRIM_NODENAME} failover group hosts..."
+       if ! gen_cluster_xml; then
+               return 1
+       fi
+       verbose_output "OK"
+
+       /bin/cp -f ${CLUMGR_DIR}cluster.xml ${CONFIG_PRIMNODE}
+
+       # scp the cluster.xml file to all the nodes
+       verbose_output "Remote copying cluster.xml file to" \
+                      "${PRIM_NODENAME} failover group hosts..."
+       for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do
+               touch ${TMP_DIR}$"cluster.xml."${NODE_NAMES[idx]}
+               scp ${CONFIG_PRIMNODE} ${NODE_NAMES[idx]}:${CLUMGR_DIR}cluster.xml
+               if [ $? -ne 0 ]; then
+                       echo >&2 "`basename $0`: Fail to scp cluster.xml file"\
+                                "to node ${NODE_NAMES[idx]}!"
+                       return 1
+               fi
+       done
+       verbose_output "OK"
+
+       return 0
+}
+
+# Main flow
+# Get all the node names
+if ! get_nodenames; then
+       exit 1
+fi
+
+# Get and check all the service IP addresses
+if ! get_check_srvIPaddrs; then
+       exit 1
+fi
+
+# Stop clumanager services
+verbose_output "Stopping clumanager service in the ${PRIM_NODENAME}"\
+              "failover group hosts..."
+if ! stop_clumanager; then
+       exit 1
+fi
+verbose_output "OK"
+
+# Generate configuration files
+if ! create_config; then
+       exit 1
+fi
+
+exit 0
diff --git a/lustre/utils/cluster_scripts/gen_hb_config.sh b/lustre/utils/cluster_scripts/gen_hb_config.sh
new file mode 100755 (executable)
index 0000000..bf66368
--- /dev/null
@@ -0,0 +1,591 @@
+#!/bin/bash
+#
+# gen_hb_config.sh - script for generating the Heartbeat HA software's
+#                   configuration files
+#
+###############################################################################
+
+# Usage
+usage() {
+       cat >&2 <<EOF
+
+Usage:  `basename $0` <-r HBver> <-n hostnames> <-d target device>
+                     <-c heartbeat channels> <-s service address>
+                     [-o heartbeat options] [-v]
+
+       -r HBver                the version of Heartbeat software
+                               The Heartbeat software versions which are curr-
+                               ently supported are: hbv1 (Heartbeat version 1) 
+                               and hbv2 (Heartbeat version 2).
+       -n hostnames            the nodenames of the primary node and its fail-
+                               overs
+                               Multiple nodenames are separated by colon (:)
+                               delimeter. The first one is the nodename of the 
+                               primary node, the others are failover nodenames.
+       -d target device        the target device name and type
+                               The name and type are separated by colon (:)
+                               delimeter. The type values are: mgs, mdt, ost or
+                               mgs_mdt.
+       -c heartbeat channels   the methods and devices to send/rcv heartbeats on
+       -s service address      the IP address to failover
+       -o heartbeat options    a "catchall" for other heartbeat configuration 
+                               options
+       -v                      verbose mode
+
+EOF
+       exit 1
+}
+
+# Global variables
+SCRIPTS_PATH=${CLUSTER_SCRIPTS_PATH:-"./"}
+SCRIPT_VERIFY_SRVIP=${SCRIPTS_PATH}$"verify_serviceIP.sh"
+
+LUSTRE_SRV_SCRIPT=$"lustre"            # service script for lustre
+MON_SRV_SCRIPT=$"mon"                  # service script for mon
+LUSTRE_MON_SCRIPT=$"simple.health_check.monitor"
+LUSTRE_ALERT_SCRIPT=$"fail_lustre.alert"
+CIB_GEN_SCRIPT=$"/usr/lib/heartbeat/cts/haresources2cib.py"
+
+TMP_DIR=$"/tmp/heartbeat/"             # temporary directory
+HACF_TEMP=${TMP_DIR}$"ha.cf.temp"
+AUTHKEYS_TEMP=${TMP_DIR}$"authkeys.temp"
+MONCF_TEMP=${TMP_DIR}$"mon.cf.temp"
+
+HA_DIR=$"/etc/ha.d/"                   # Heartbeat configuration directory
+MON_DIR=$"/etc/mon/"                   # mon configuration directory
+CIB_DIR=$"/var/lib/heartbeat/crm/"     # cib.xml directory
+
+HBVER_HBV1=$"hbv1"                     # Heartbeat version 1
+HBVER_HBV2=$"hbv2"                     # Heartbeat version 2
+
+declare -a NODE_NAMES                  # node names in the failover group
+
+# Get and check the positional parameters
+while getopts "r:n:d:c:s:o:v" OPTION; do
+       case $OPTION in
+       r) 
+               HBVER_OPT=$OPTARG
+               if [ "${HBVER_OPT}" != "${HBVER_HBV1}" ] \
+               && [ "${HBVER_OPT}" != "${HBVER_HBV2}" ]; then
+                       echo >&2 $"`basename $0`: Invalid Heartbeat software" \
+                                 "version - ${HBVER_OPT}!"
+                       usage
+               fi
+               ;;
+        n)
+               HOSTNAME_OPT=$OPTARG 
+               HOSTNAME_NUM=`echo ${HOSTNAME_OPT} | awk -F":" '{print NF}'`
+               if [ ${HOSTNAME_NUM} -lt 2 ]; then
+                       echo >&2 $"`basename $0`: Lack failover nodenames!"
+                       usage
+               fi
+               ;;
+        d)
+               DEVICE_OPT=$OPTARG 
+               TARGET_DEV=`echo ${DEVICE_OPT} | awk -F":" '{print $1}'`
+               TARGET_TYPE=`echo ${DEVICE_OPT} | awk -F":" '{print $2}'`
+               if [ -z "${TARGET_TYPE}" ]; then
+                       echo >&2 $"`basename $0`: Lack target device type!"
+                       usage
+               fi
+               if [ "${TARGET_TYPE}" != "mgs" ]&&[ "${TARGET_TYPE}" != "mdt" ]\
+               &&[ "${TARGET_TYPE}" != "ost" ]&&[ "${TARGET_TYPE}" != "mgs_mdt" ]
+               then
+                       echo >&2 $"`basename $0`: Invalid target device type" \
+                                 "- ${TARGET_TYPE}!"
+                       usage
+               fi
+               ;;
+        c)
+               HBCHANNEL_OPT=$OPTARG 
+               HBCHANNEL_OPT=`echo "${HBCHANNEL_OPT}" | sed 's/^"//' \
+                              | sed 's/"$//'`
+               if [ "${HBCHANNEL_OPT}" = "${HBCHANNEL_OPT#*serial*}" ] \
+               && [ "${HBCHANNEL_OPT}" = "${HBCHANNEL_OPT#*bcast*}" ] \
+               && [ "${HBCHANNEL_OPT}" = "${HBCHANNEL_OPT#*ucast*}" ] \
+               && [ "${HBCHANNEL_OPT}" = "${HBCHANNEL_OPT#*mcast*}" ]; then
+                       echo >&2 $"`basename $0`: Invalid Heartbeat channel" \
+                                 "- ${HBCHANNEL_OPT}!"
+                       usage
+               fi
+               ;;
+        s)
+               SRVADDR_OPT=$OPTARG 
+               ;;
+        o)
+               HBOPT_OPT=$OPTARG 
+               HBOPT_OPT=`echo "${HBOPT_OPT}" | sed 's/^"//' | sed 's/"$//'`
+               ;;
+       v) 
+               VERBOSE_OPT=$"yes"
+               ;;
+        ?) 
+               usage 
+       esac
+done
+
+# Check the required parameters
+if [ -z "${HBVER_OPT}" ]; then
+       echo >&2 $"`basename $0`: Lack -r option!"
+       usage
+fi
+
+if [ -z "${HOSTNAME_OPT}" ]; then
+       echo >&2 $"`basename $0`: Lack -n option!"
+       usage
+fi
+
+if [ -z "${DEVICE_OPT}" ]; then
+       echo >&2 $"`basename $0`: Lack -d option!"
+       usage
+fi
+
+if [ -z "${HBCHANNEL_OPT}" ]; then
+       echo >&2 $"`basename $0`: Lack -c option!"
+       usage
+fi
+
+if [ -z "${SRVADDR_OPT}" ]; then
+       echo >&2 $"`basename $0`: Lack -s option!"
+       usage
+fi
+
+if [ "${HBVER_OPT}" = "${HBVER_HBV1}" -a ${HOSTNAME_NUM} -gt 2 ]; then
+       echo >&2 $"`basename $0`: Heartbeat version 1 can only support 2 nodes!"
+       usage
+fi
+
+# Output verbose informations
+verbose_output() {
+       if [ "${VERBOSE_OPT}" = "yes" ]; then
+               echo "`basename $0`: $*"
+       fi
+       return 0
+}
+
+# get_nodenames
+#
+# Get all the node names in this failover group
+get_nodenames() {
+       PRIM_NODENAME=`echo ${HOSTNAME_OPT} | awk -F":" '{print $1}'`
+
+       declare -i idx
+       local nodename_str nodename
+
+       nodename_str=`echo ${HOSTNAME_OPT}|awk '{split($HOSTNAME_OPT, a, ":")}\
+                     END {for (i in a) print a[i]}'`
+       idx=0
+       for nodename in ${nodename_str}
+        do
+               NODE_NAMES[idx]=${nodename}
+               idx=$idx+1
+        done
+
+       return 0
+}
+
+# check_srvIPaddr
+#
+# Check service IP address in this failover group
+check_srvIPaddr() {
+       declare -i idx
+
+       for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do
+               # Check service IP address
+               verbose_output "Verifying service IP ${SRVADDR_OPT} and" \
+                          "real IP of host ${NODE_NAMES[idx]} are in the" \
+                          "same subnet..."
+               if ! ${SCRIPT_VERIFY_SRVIP} ${SRVADDR_OPT} ${NODE_NAMES[idx]}
+               then
+                       return 1
+               fi
+               verbose_output "OK"
+       done
+
+       return 0
+}
+
+# stop_heartbeat
+#
+# Run pdsh command to stop each node's heartbeat service
+stop_heartbeat() {
+       declare -i idx
+       local nodename_str=${PRIM_NODENAME}
+
+       for ((idx = 1; idx < ${#NODE_NAMES[@]}; idx++)); do
+               nodename_str=${nodename_str}$","${NODE_NAMES[idx]}
+       done
+
+       ${PDSH} -w ${nodename_str} /sbin/service heartbeat stop
+       if [ $? -ne 0 ]; then
+               echo >&2 "`basename $0`: stop_heartbeat() error:"\
+                        "Fail to execute pdsh command!"
+               return 1
+       fi
+
+       return 0
+}
+
+# create_template
+#
+# Create the templates for ha.cf, authkeys and mon.cf files
+create_template() {
+       /bin/mkdir -p ${TMP_DIR}
+
+       # Create the template for ha.cf
+       if [ "${HBVER_OPT}" = "${HBVER_HBV1}" ]; then
+               cat >${HACF_TEMP} <<EOF
+debugfile /var/log/ha-debug
+logfile /var/log/ha-log
+logfacility     local0
+keepalive 2
+deadtime 15
+warntime 10
+initdead 120
+
+EOF
+       elif [ "${HBVER_OPT}" = "${HBVER_HBV2}" ]; then
+               cat >${HACF_TEMP} <<EOF
+logfacility     daemon
+use_logd        yes
+keepalive 2
+deadtime 15
+warntime 10
+initdead 120
+
+EOF
+       fi
+
+       # Create the template for authkeys
+       if [ ! -s ${AUTHKEYS_TEMP} ]; then
+               cat >${AUTHKEYS_TEMP} <<EOF
+auth 1
+1 sha1 HelloLustre!
+EOF
+       fi
+
+       # Create the template for mon.cf 
+       if [ ! -s ${MONCF_TEMP} ]; then
+               cat >${MONCF_TEMP} <<EOF
+cfbasedir   = /etc/mon
+alertdir   = /usr/lib/mon/alert.d
+mondir     = /usr/lib/mon/mon.d
+statedir    = /usr/lib/mon/state.d
+logdir    = /usr/lib/mon/log.d
+dtlogfile    = /usr/lib/mon/log.d/downtime.log
+maxprocs    = 20
+histlength  = 100
+randstart   = 60s
+
+authtype = getpwnam
+
+EOF
+       fi
+
+       return 0
+}
+
+# gen_udpport
+#
+# Generate the UDP port number for Heartbeat bcast/ucast communication
+# The default value for udpport option in ha.cf is 694. If there are multiple 
+# bcast failover groups on the same subnet, this value should be different for 
+# each of the failover groups.
+gen_udpport() {
+       local port_file
+       declare -i default_port=694
+       declare -i dynamic_port=49152
+       declare -i port=0
+       declare -i tmp_port
+       declare -i idx
+
+       UDPPORT_PRIMNODE=${TMP_DIR}$"udpport."${PRIM_NODENAME}
+
+       if [ -s ${UDPPORT_PRIMNODE} ]; then
+               cat ${UDPPORT_PRIMNODE}
+               return 0
+       fi
+
+       # Get the current maximum UDP port number in the cluster
+       for port_file in `ls ${TMP_DIR}udpport.*`
+       do
+               if [ $? -ne 0 ]; then
+                       break
+               fi
+               tmp_port=$(cat ${port_file})
+               if [ $? -ne 0 ]; then
+                       break
+               fi
+               
+               if [ ${tmp_port} -gt ${port} ]; then
+                       port=${tmp_port}
+               fi
+       done
+
+       # Generate and check a new UDP port number
+       if [ ${port} -eq 0 ]; then
+               port=${default_port}
+       elif [ ${port} -eq ${default_port} ]; then
+               port=${dynamic_port}
+       else
+               port=${port}+1
+               if [ ${port} -gt 65535 ]; then
+                       echo >&2 $"`basename $0`: Invalid UDP port" \
+                                 "- ${port}!"
+                       return 1
+               fi
+       fi
+
+        # Add the UDP port number into each failover node's udpport file
+        for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do
+                UDPPORT_NODE=${TMP_DIR}$"udpport."${NODE_NAMES[idx]}
+               echo ${port} > ${UDPPORT_NODE}
+        done
+
+       echo ${port}
+       return 0
+}
+
+# create_hacf
+#
+# Create the ha.cf file and scp it to each node's /etc/ha.d/
+create_hacf() {
+       HACF_PRIMNODE=${TMP_DIR}$"ha.cf."${PRIM_NODENAME}
+
+       declare -i idx
+
+       if [ -s ${HACF_PRIMNODE} ]; then
+               # The ha.cf file for the primary node has already existed.
+               verbose_output "${HACF_PRIMNODE} already exists."
+               return 0
+       fi
+
+       /bin/cp -f ${HACF_TEMP} ${HACF_PRIMNODE}
+
+       if [ "${HBCHANNEL_OPT}" != "${HBCHANNEL_OPT#*bcast*}" ] \
+       || [ "${HBCHANNEL_OPT}" != "${HBCHANNEL_OPT#*ucast*}" ]; then
+               UDPPORT_OPT=$(gen_udpport)
+               if [ $? -ne 0 ]; then
+                       return 1
+               fi      
+               echo "udpport ${UDPPORT_OPT}" >> ${HACF_PRIMNODE}
+       fi
+
+       if [ "${HBCHANNEL_OPT}" != "${HBCHANNEL_OPT#*serial*}" ]; then
+               echo "baud    19200" >> ${HACF_PRIMNODE}
+       fi
+
+       echo ${HBCHANNEL_OPT} | awk '{split($HBCHANNEL_OPT, a, ":")} \
+       END {for (i in a) print a[i]}' >> ${HACF_PRIMNODE}
+
+        for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do
+               echo "node    ${NODE_NAMES[idx]}" >> ${HACF_PRIMNODE}
+        done
+
+       echo ${HBOPT_OPT} | awk '{split($HBOPT_OPT, a, ":")} \
+       END {for (i in a) print a[i]}' >> ${HACF_PRIMNODE}
+
+       # scp ha.cf file to all the nodes
+       for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do
+               touch ${TMP_DIR}$"ha.cf."${NODE_NAMES[idx]}
+               scp ${HACF_PRIMNODE} ${NODE_NAMES[idx]}:${HA_DIR}ha.cf
+               if [ $? -ne 0 ]; then
+                       echo >&2 "`basename $0`: Fail to scp ha.cf file"\
+                                "to node ${NODE_NAMES[idx]}!"
+                       return 1
+               fi
+       done
+
+       return 0
+}
+
+# create_haresources
+#
+# Create the haresources file and scp it to the each node's /etc/ha.d/
+create_haresources() {
+       HARES_PRIMNODE=${TMP_DIR}$"haresources."${PRIM_NODENAME}
+       declare -i idx
+
+       if [ -s ${HARES_PRIMNODE} ]; then
+               # The haresources file for the primary node has already existed
+               verbose_output "${HARES_PRIMNODE} already exists."
+               return 0
+       fi
+               
+       # Add the resource group line into the haresources file
+       echo "${PRIM_NODENAME} ${SRVADDR_OPT} "\
+            "${LUSTRE_SRV_SCRIPT}::${TARGET_TYPE}::${TARGET_DEV} "\
+            "${MON_SRV_SCRIPT}" > ${HARES_PRIMNODE}
+
+       # Generate the cib.xml file
+       if [ "${HBVER_OPT}" = "${HBVER_HBV2}" ]; then
+               CIB_PRIMNODE=${TMP_DIR}$"cib.xml."${PRIM_NODENAME}
+               python ${CIB_GEN_SCRIPT} ${HARES_PRIMNODE} > ${CIB_PRIMNODE}
+               if [ $? -ne 0 ]; then
+                       echo >&2 "`basename $0`: Fail to generate cib.xml file"\
+                                "for node ${PRIM_NODENAME}!"
+                       return 1
+               fi
+       fi
+
+       # scp the haresources file or cib.xml file
+       for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do
+               touch ${TMP_DIR}$"haresources."${NODE_NAMES[idx]}
+               if [ "${HBVER_OPT}" = "${HBVER_HBV2}" ]; then
+                       scp ${CIB_PRIMNODE} ${NODE_NAMES[idx]}:${CIB_DIR}cib.xml
+               else
+                       scp ${HARES_PRIMNODE} ${NODE_NAMES[idx]}:${HA_DIR}haresources
+               fi
+
+               if [ $? -ne 0 ]; then
+                       echo >&2 "`basename $0`: Fail to scp haresources file"\
+                                "to node ${NODE_NAMES[idx]}!"
+                       return 1
+               fi
+       done
+
+       return 0
+}
+
+# create_authkeys
+#
+# Create the authkeys file and scp it to the each node's /etc/ha.d/
+create_authkeys() {
+       AUTHKEYS_PRIMNODE=${TMP_DIR}$"authkeys."${PRIM_NODENAME}
+       declare -i idx
+
+       if [ -e ${AUTHKEYS_PRIMNODE} ]; then
+               verbose_output "${AUTHKEYS_PRIMNODE} already exists."
+               return 0
+       fi
+
+       # scp the authkeys file to all the nodes
+       for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do
+               touch ${TMP_DIR}$"authkeys."${NODE_NAMES[idx]}
+               scp ${AUTHKEYS_TEMP} ${NODE_NAMES[idx]}:${HA_DIR}authkeys
+               if [ $? -ne 0 ]; then
+                       echo >&2 "`basename $0`: Fail to scp authkeys file"\
+                                "to node ${NODE_NAMES[idx]}!"
+                       return 1
+               fi
+       done
+
+       return 0
+}
+
+# create_moncf
+#
+# Create the mon.cf file and scp it to the each node's /etc/mon/
+create_moncf() {
+       MONCF_PRIMNODE=${TMP_DIR}$"mon.cf."${PRIM_NODENAME}
+       declare -i idx
+       local hostgroup_str=$"hostgroup ${TARGET_TYPE}-group"
+
+       if [ -e ${MONCF_PRIMNODE} ]; then
+               verbose_output "${MONCF_PRIMNODE} already exists."
+               return 0
+       fi
+
+       /bin/cp -f ${MONCF_TEMP} ${MONCF_PRIMNODE}
+
+       for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do
+               hostgroup_str=${hostgroup_str}$" "${NODE_NAMES[idx]}
+       done
+
+       echo ${hostgroup_str} >> ${MONCF_PRIMNODE}
+
+       cat >>${MONCF_PRIMNODE} <<EOF
+
+watch ${TARGET_TYPE}-group
+    service ${LUSTRE_SRV_SCRIPT}
+        description Lustre health check
+        interval 1m
+        monitor ${LUSTRE_MON_SCRIPT} -o ${TARGET_TYPE}
+        period wd {Sat-Sun}
+            alert ${LUSTRE_ALERT_SCRIPT}
+
+EOF
+       # scp the mon.cf file to all the nodes
+       for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do
+               touch ${TMP_DIR}$"mon.cf."${NODE_NAMES[idx]}
+               scp ${MONCF_PRIMNODE} ${NODE_NAMES[idx]}:${MON_DIR}mon.cf
+               if [ $? -ne 0 ]; then
+                       echo >&2 "`basename $0`: Fail to scp mon.cf file"\
+                                "to node ${NODE_NAMES[idx]}!"
+                       return 1
+               fi
+       done
+
+       return 0
+}
+
+# generate_config
+#
+# Generate the configuration files for Heartbeat and scp them to all the nodes
+generate_config() {
+       if ! create_template; then
+               return 1
+       fi
+
+       verbose_output "Creating and remote copying ha.cf file to"\
+                      "${PRIM_NODENAME} failover group hosts..." 
+       if ! create_hacf; then
+               return 1
+       fi
+       verbose_output "OK"
+
+       if [ "${HBVER_OPT}" = "${HBVER_HBV1}" ]; then
+               verbose_output "Creating and remote copying haresources file"\
+                              "to ${PRIM_NODENAME} failover group hosts..."
+       else
+               verbose_output "Creating and remote copying cib.xml file"\
+                              "to ${PRIM_NODENAME} failover group hosts..."
+       fi
+
+       if ! create_haresources; then
+               return 1
+       fi
+       verbose_output "OK"
+
+       verbose_output "Creating and remote copying authkeys file to" \
+                      "${PRIM_NODENAME} failover group hosts..."
+       if ! create_authkeys; then
+               return 1
+       fi
+       verbose_output "OK"
+
+       verbose_output "Creating and remote copying mon.cf file to" \
+                      "${PRIM_NODENAME} failover group hosts..."
+       if ! create_moncf; then
+               return 1
+       fi
+       verbose_output "OK"
+
+       return 0
+}
+
+# Main flow
+# Get all the node names
+if ! get_nodenames; then
+       exit 1
+fi
+
+# Check service IP address
+if ! check_srvIPaddr; then
+       exit 1
+fi
+
+# Stop heartbeat services
+verbose_output "Stopping heartbeat service in the ${PRIM_NODENAME}"\
+              "failover group hosts..."
+if ! stop_heartbeat; then
+       exit 1
+fi
+verbose_output "OK"
+
+# Generate configuration files
+if ! generate_config; then
+       exit 1
+fi
+
+exit 0
diff --git a/lustre/utils/cluster_scripts/module_config.sh b/lustre/utils/cluster_scripts/module_config.sh
new file mode 100755 (executable)
index 0000000..baff1eb
--- /dev/null
@@ -0,0 +1,61 @@
+#!/bin/bash
+#
+# module_config.sh - add lustre options lines into modprobe.conf or 
+#   modules.conf
+#
+#################################################################################
+
+# Check the kernel version
+KERNEL_VERSION=`uname -r`
+KERNEL_VERSION=${KERNEL_VERSION:0:3}
+
+if [ "${KERNEL_VERSION}" = "2.4" ]; then
+       MODULE_CONF=/etc/modules.conf
+else
+       MODULE_CONF=/etc/modprobe.conf
+fi
+
+read -r NETWORKS
+MODLINES_FILE=/tmp/modlines$$.txt
+START_MARKER=$"# start lustre config"
+END_MARKER=$"# end lustre config"
+
+# Generate a temp file contains lnet options lines 
+generate_lnet_lines() {
+       local LNET_LINE TMP_LINE
+
+       TMP_LINE="${NETWORKS}"
+
+       echo ${START_MARKER} > ${MODLINES_FILE}
+       while true; do
+               LNET_LINE=${TMP_LINE%%\\n*}
+               echo ${LNET_LINE} >> ${MODLINES_FILE}
+
+               TMP_LINE=${TMP_LINE#*\\n}
+
+               if [ "${TMP_LINE}" == "${LNET_LINE}" ]; then
+                       break
+               fi
+       done
+       echo ${END_MARKER} >> ${MODLINES_FILE}
+
+       #echo "--------------${MODLINES_FILE}--------------"
+       #cat ${MODLINES_FILE}
+       #echo -e "------------------------------------------\n"
+
+       return 0
+}
+
+if ! generate_lnet_lines; then
+       exit 1  
+fi
+
+# Add lnet options lines to the module configuration file
+if [ -e ${MODULE_CONF} ]; then
+       # Delete the old options
+       sed -i "/${START_MARKER}/,/${END_MARKER}/d" ${MODULE_CONF}
+fi
+
+cat ${MODLINES_FILE} >> ${MODULE_CONF}
+rm -f ${MODLINES_FILE}
+exit 0
diff --git a/lustre/utils/cluster_scripts/verify_cluster_net.sh b/lustre/utils/cluster_scripts/verify_cluster_net.sh
new file mode 100755 (executable)
index 0000000..f5f59c4
--- /dev/null
@@ -0,0 +1,296 @@
+#!/bin/bash
+#
+# verify_cluster_net.sh - script for Lustre cluster network verification
+#
+###############################################################################
+
+# Usage
+usage() {
+       cat >&2 <<EOF
+
+Usage: `basename $0` [-v] <csv file>
+
+       -v              verbose mode
+       csv file        a spreadsheet that contains configuration parameters 
+                       (separated by commas) for each target in a Lustre cl-
+                       uster, the first field of each line is the host name 
+                       of the cluster node
+
+EOF
+       exit 1
+}
+
+# Get and check the positional parameters
+while getopts "v" OPTION; do
+       case $OPTION in
+       v) 
+               VERBOSE_OPT=$"yes"
+               ;;
+        ?) 
+               usage 
+       esac
+done
+
+# Toss out the parameters we've already processed
+shift  `expr $OPTIND - 1`
+
+# Here we expect the csv file
+if [ $# -eq 0 ]; then
+       echo >&2 $"`basename $0`: Lack csv file!"
+       usage
+fi
+
+# Global variables
+CSV_FILE=$1
+declare -a HOST_NAMES
+declare -a HOST_IPADDRS
+
+# Output verbose informations
+verbose_output() {
+       if [ "${VERBOSE_OPT}" = "yes" ]; then
+               echo "`basename $0`: $*"
+       fi
+       return 0
+}
+
+# Check the csv file
+check_file() {
+       if [ ! -s ${CSV_FILE} ]; then
+                echo >&2 $"`basename $0`: check_file() error: ${CSV_FILE}" \
+                         "does not exist or is empty!"
+                return 1
+        fi
+
+        return 0
+}
+
+# Get the host names from the csv file
+get_hostnames() {
+       local NAME CHECK_STR
+       declare -i i
+
+       # Initialize the HOST_NAMES array
+       for ((i = 0; i < ${#HOST_NAMES[@]}; i++)); do
+               HOST_NAMES[i]=$""
+       done
+
+       CHECK_STR=`egrep -v "([[:space:]]|^)#" ${CSV_FILE} | awk -F, \
+                 '/[[:alnum:]]/{if ($1 !~/[[:alnum:]]/) print $0}'`
+       if [ -n "${CHECK_STR}" ]; then
+                echo >&2 $"`basename $0`: get_hostnames() error: Lack hostname"\
+                         "field in the line - ${CHECK_STR}"
+               return 1
+       fi
+
+       i=0
+       for NAME in `egrep -v "([[:space:]]|^)#" ${CSV_FILE}\
+                   | awk -F, '/[[:alnum:]]/{print $1}'`
+       do
+               HOST_NAMES[i]=${NAME}
+               i=$i+1
+       done
+
+       return 0
+}
+
+# Check whether the host name matches the name in the local /etc/hosts table
+# and whether the IP address according to the host name is correct
+local_check() {
+       # Check argument
+        if [ $# -ne 2 ]; then
+                echo >&2 $"`basename $0`: local_check() error: Lack argument" \
+                         "for function local_check()!"
+                return 1
+        fi
+
+       local RET_STR REAL_NAME
+
+       # Get the IP address according to the host name from /etc/hosts table
+       # of the current host 
+       HOST_IPADDRS[$2]=`egrep "[[:space:]]$1([[:space:]]|$)" /etc/hosts \
+                    | awk '{print $1}'`
+       if [ -z "${HOST_IPADDRS[$2]}" ]; then
+               echo >&2 "`basename $0`: local_check() error: $1 does not" \
+                        "exist in the local /etc/hosts table!"
+               return 1
+       fi
+
+       if [ ${#HOST_IPADDRS[$2]} -gt 15 ]; then
+               echo >&2 "`basename $0`: local_check() error: More than one" \
+                        "IP address line according to $1 in the local" \
+                        "/etc/hosts table!"
+               return 1
+       fi
+
+       # Execute pdsh command to get the real host name
+       RET_STR=`${PDSH} -w ${HOST_IPADDRS[$2]} hostname 2>&1`
+       if [ $? -ne 0 ] || [ "${RET_STR}" != "${RET_STR#*connect:*}" ]; then
+               echo >&2 "`basename $0`: local_check() error: pdsh error:" \
+                        "${RET_STR}"
+               return 1
+       fi
+
+       if [ -z "${RET_STR}" ]; then
+               echo >&2 "`basename $0`: local_check() error: pdsh error:" \
+                        "No results from pdsh! Check the network connectivity"\
+                        "between the local host and ${HOST_IPADDRS[$2]}" \
+                        "or check the two hosts' rcmd module!"
+               return 1
+       fi
+
+       REAL_NAME=`echo ${RET_STR} | awk '{print $2}'`
+       if [ "$1" != "${REAL_NAME}" ]; then
+               echo >&2 "`basename $0`: local_check() error: The real hostname"\
+                        "according to ${HOST_IPADDRS[$2]} is ${REAL_NAME}," \
+                        "not $1! Check the local /etc/hosts table!"
+               return 1
+       fi
+
+       return 0
+}
+
+# Check whether the correct host name and IP address pair matches 
+# the one in the remote /etc/hosts tables
+remote_check() {
+       # Check argument
+        if [ $# -ne 2 ]; then
+                echo >&2 $"`basename $0`: remote_check() error: Lack argument"\
+                         "for function remote_check()!"
+                return 1
+        fi
+
+       declare -i i
+       local RET_STR COMMAND IP_ADDR
+
+       COMMAND=$"egrep \"[[:space:]]$1([[:space:]]|$)\" /etc/hosts"
+
+       # Execute pdsh command to check remote /etc/hosts tables
+       for ((i = 0; i < ${#HOST_NAMES[@]}; i++)); do
+               RET_STR=`${PDSH} -w ${HOST_NAMES[i]} ${COMMAND} 2>&1`
+               if [ $? -ne 0 ] || [ "${RET_STR}" != "${RET_STR#*connect:*}" ]
+               then
+                       echo >&2 "`basename $0`: remote_check() error:" \
+                                "pdsh error: ${RET_STR}"
+                       return 1
+               fi
+
+               IP_ADDR=`echo ${RET_STR} | awk '{print $2}'`
+               if [ -z "${IP_ADDR}" ]; then
+                       echo >&2 "`basename $0`: remote_check() error:" \
+                                "$1 does not exist in the ${HOST_NAMES[i]}'s"\
+                                "/etc/hosts table!"
+                       return 1
+               fi
+
+               if [ "${IP_ADDR}" != "${HOST_IPADDRS[$2]}" ]; then
+                       echo >&2 "`basename $0`: remote_check() error:" \
+                                "IP address ${IP_ADDR} of $1 in the" \
+                                "${HOST_NAMES[i]}'s /etc/hosts is incorrect!"
+                       return 1
+               fi
+               
+       done
+
+       return 0
+}
+
+# Verify forward and reverse network connectivity of the Lustre cluster
+network_check () {
+       # Check argument
+        if [ $# -eq 0 ]; then
+                echo >&2 $"`basename $0`: network_check() error: Lack argument" \
+                         "for function network_check()!"
+                return 1
+        fi
+
+       declare -i i
+       local RET_STR COMMAND REAL_NAME
+
+       # Execute pdsh command to check network connectivity
+       for ((i = 0; i < ${#HOST_NAMES[@]}; i++)); do
+               COMMAND=$"${PDSH} -w ${HOST_NAMES[i]} hostname"
+               RET_STR=`${PDSH} -w $1 ${COMMAND} 2>&1`
+               if [ $? -ne 0 ] || [ "${RET_STR}" != "${RET_STR#*connect:*}" ]
+               then
+                       echo >&2 "`basename $0`: network_check() error:" \
+                                "pdsh error: ${RET_STR}"
+                       return 1
+               fi
+
+               if [ -z "${RET_STR}" ]; then
+                       echo >&2 "`basename $0`: network_check() error:" \
+                                "pdsh error: Nothing get from pdsh! Check" \
+                                "the network connectivity between $1 and" \
+                                "${HOST_NAMES[i]} or the two hosts' rcmd module!"
+                       return 1
+               fi
+
+               REAL_NAME=`echo ${RET_STR} | awk '{print $3}'`
+               if [ "${HOST_NAMES[i]}" != "${REAL_NAME}" ]; then
+                       echo >&2 "`basename $0`: network_check() error:" \
+                                "${RET_STR}"
+                       return 1
+               fi
+       done
+
+       return 0
+}
+
+# Verify forward and reverse network connectivity of the Lustre cluster,
+# and that hostnames match the names in the /etc/hosts tables.
+network_verify() {
+       declare -i i
+
+       # Initialize the HOST_IPADDRS array
+       for ((i = 0; i < ${#HOST_IPADDRS[@]}; i++)); do
+               HOST_IPADDRS[i]=$""
+       done
+
+       # Get all the host names from the csv file
+       if ! get_hostnames; then
+               return 1
+       fi
+
+       # Check whether all the host names match the names in 
+       # all the /etc/hosts tables of the Lustre cluster
+       for ((i = 0; i < ${#HOST_NAMES[@]}; i++)); do
+               verbose_output "Verifying IP address of host" \
+                              "${HOST_NAMES[i]} in the local /etc/hosts..."
+               if ! local_check ${HOST_NAMES[i]} $i; then
+                       return 1
+               fi
+               verbose_output "OK"
+       done
+
+       for ((i = 0; i < ${#HOST_NAMES[@]}; i++)); do
+               verbose_output "Verifying IP address of host" \
+                              "${HOST_NAMES[i]} in the remote /etc/hosts..."
+               if ! remote_check ${HOST_NAMES[i]} $i; then
+                       return 1
+               fi
+               verbose_output "OK"
+       done
+
+       # Verify network connectivity of the Lustre cluster
+       for ((i = 0; i < ${#HOST_NAMES[@]}; i++)); do
+               verbose_output "Verifying network connectivity of host" \
+                              "${HOST_NAMES[i]} to other hosts..."
+               if ! network_check ${HOST_NAMES[i]}; then
+                       return 1
+               fi
+               verbose_output "OK"
+       done
+
+       return 0
+}
+
+# Main flow
+if ! check_file; then
+       exit 1  
+fi
+
+if ! network_verify; then
+       exit 1  
+fi
+
+exit 0
diff --git a/lustre/utils/cluster_scripts/verify_serviceIP.sh b/lustre/utils/cluster_scripts/verify_serviceIP.sh
new file mode 100755 (executable)
index 0000000..cdc749d
--- /dev/null
@@ -0,0 +1,228 @@
+#!/bin/bash
+#
+# verify_serviceIP.sh - script for verifying the service IP and the real
+#                      interface IP in a remote host are in the same subnet
+#
+###############################################################################
+
+# Usage
+usage() {
+       cat >&2 <<EOF
+
+Usage:  `basename $0` <service IPaddr> <hostname>
+       
+       service IPaddr          the IP address to failover
+       hostname                the hostname of the remote node
+
+EOF
+       exit 1
+}
+
+# Check arguments
+if [ $# -lt 2 ]; then
+        usage
+fi
+
+#
+# inSameIPsubnet serviceIPaddr interfaceIPaddr mask
+#
+# Given two IP addresses and a subnet mask determine if these IP
+# addresses are in the same subnet. If they are, return 0, else return 1.
+#
+inSameIPsubnet() {
+       declare -i n
+       declare -ia mask 
+       declare -ia ip1 ip2             # IP addresses given
+       declare -i quad1 quad2          # calculated quad words
+
+       #
+       # Remove '.' characters from dotted decimal notation and save
+       # in arrays. i.e.
+       #
+       #       192.168.1.163 -> array[0] = 192
+       #                        array[1] = 168
+       #                        array[2] = 1
+       #                        array[3] = 163
+       #
+       let n=0
+       for quad in $(echo $1 | awk -F. '{print $1 " " $2 " " $3 " " $4}')
+       do
+               ip1[n]=$quad
+               let n=n+1
+       done
+
+       let n=0
+       for quad in $(echo $2 | awk -F. '{print $1 " " $2 " " $3 " " $4}')
+       do
+               ip2[n]=$quad
+               let n=n+1
+       done
+
+       let n=0
+       for quad in $(echo $3 | awk -F. '{print $1 " " $2 " " $3 " " $4}')
+       do
+               mask[n]=$quad
+               let n=n+1
+       done
+
+       #
+       # For each quad word, logically AND the IP address with the subnet
+       # mask to get the network/subnet quad word.  If the resulting
+       # quad words for both IP addresses are the same they are in the 
+       # same IP subnet.
+       #
+       for n in 0 1 2 3
+       do
+               let $((quad1=${ip1[n]} & ${mask[n]}))
+               let $((quad2=${ip2[n]} & ${mask[n]}))
+
+               if [ $quad1 != $quad2 ]; then
+                       echo >&2 $"`basename $0`: Service IP address $1 and"\
+                                 "real interface IP address $2 are in"\
+                                 "different subnets!"
+                       return 1        # in different subnets
+               fi
+       done
+
+       return 0        # in the same subnet, all quad words matched
+}
+
+#
+# findInterface IPaddr hostname
+#
+# Given a target IP address and a hostname, find the interface in which 
+# this address is configured.  If found return 0, if not return 1.  The
+# interface name is returned to stdout.
+#
+findInterface() {
+       declare host 
+       declare line
+       declare intf
+       declare addr
+       declare state
+
+       declare target=$1
+       declare hostname=$2
+
+       {
+       while read host intf line
+       do
+               while read host line
+               do
+                       if [ "$line" = "" ]; then       # go to next interface
+                               continue 2
+                       fi
+
+                       set - $line
+                       addr=
+                       while [ $# -gt 0 ]; do
+                               case $1 in
+                               addr:*)
+                                       addr=${1##addr:}
+                                       if [ -n "$addr" -a "$addr" = "$target" ]
+                                       then
+                                               echo $intf
+                                               return 0
+                                       fi
+                                       ;;
+                               esac
+                               shift
+                       done
+               done
+       done
+       } < <(${PDSH} -w $hostname /sbin/ifconfig)
+
+       echo >&2 "`basename $0`: Cannot find the interface in which" \
+                 "$target is configured in the host $hostname!"
+       return 1
+}
+
+#
+# findNetmask interface hostname
+#
+# Given an interface find the netmask addresses associated with it.
+# Return 0 when found, else return 1. The netmask is returned to stdout.
+#
+findNetmask() {
+       declare line
+       declare addr
+       declare target=$1
+       declare hostname=$2
+
+       while read line
+       do
+               set - $line
+
+               while [ $# -gt 0 ]; do
+                       case $1 in
+                       Mask:*)
+                               echo ${1##*:}   # return netmask addr
+                               return 0 
+                               ;;
+                       esac
+                       shift
+               done
+       done < <(${PDSH} -w $hostname /sbin/ifconfig $target)
+
+       echo >&2 "`basename $0`: Cannot find the netmask associated with" \
+                 "the interface $target in the host $hostname!"
+       return 1 
+}
+
+#
+# check_srvIPaddr serviceIPaddr hostname
+#
+# Given a service IP address and hostname, check whether the service IP address
+# and the real interface IP address of hostname are in the same subnet. 
+# If they are, return 0, else return 1.
+#
+check_srvIPaddr() {
+       declare real_IPaddr
+       declare real_intf
+       declare netmask
+       declare srv_IPaddr=$1
+       declare hostname=$2
+
+       # Get the IP address from /etc/hosts table according to the hostname
+       real_IPaddr=`egrep "[[:space:]]$hostname([[:space:]]|$)" /etc/hosts \
+                     | awk '{print $1}'`
+        if [ -z "$real_IPaddr" ]; then
+                echo >&2 "`basename $0`: $hostname does not exist in" \
+                         "the local /etc/hosts table!"
+                return 1
+        fi
+
+        if [ ${#real_IPaddr} -gt 15 ]; then
+                echo >&2 "`basename $0`: More than one IP address line" \
+                         "according to $hostname in the local /etc/hosts table!"
+                return 1
+        fi
+
+       # Get the interface in which the real IP address is configured
+       real_intf=$(findInterface $real_IPaddr $hostname)
+       if [ $? -ne 0 ]; then
+               return 1
+       fi
+       real_intf=${real_intf%%:*}
+
+       # Get the netmask address associated with the real interface
+       netmask=$(findNetmask $real_intf $hostname)
+       if [ $? -ne 0 ]; then
+               return 1
+       fi
+
+       # Determine if the service IP address and the real IP address
+       # are in the same subnet
+       inSameIPsubnet $srv_IPaddr $real_IPaddr $netmask
+       if [ $? -ne 0 ]; then
+               return 1
+       fi
+
+       return 0
+}
+
+# Check service IP address
+if ! check_srvIPaddr $1 $2; then
+       exit 1
+fi
+exit 0
diff --git a/lustre/utils/ha_assist.sh b/lustre/utils/ha_assist.sh
deleted file mode 100755 (executable)
index 0f737f5..0000000
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/bin/sh
-
-echo primary `date` >> /tmp/halog
-
-
diff --git a/lustre/utils/ha_assist2.sh b/lustre/utils/ha_assist2.sh
deleted file mode 100755 (executable)
index a07d8b5..0000000
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/bin/bash 
-set -vx
-date
-echo "ha assist checking for problems"
-sleep 3
-if [ ! -e /tmp/halog ]; then
-   echo "no problems, exiting"
-    exit 
-fi
-
-echo "removing /tmp/halog"
-rm /tmp/halog
-
-echo secondary start `date`
-echo "- please supply a new mds"
-
-# invoke ldap client here
-
-
-/usr/src/portals/linux/utils/ptlctl <<EOF3
-setup tcp
-close_uuid mds
-del_uuid mds
-connect dev5 988
-add_uuid mds
-quit
-EOF3
-
-echo "connected to new MDS!"
-
-/usr/src/obd/utils/obdctl  <<EOF2
-name2dev RPCDEV
-newconn
-quit
-EOF2
index 44419e8..fa92ab1 100755 (executable)
@@ -1457,7 +1457,6 @@ class MDSDEV(Module):
                 log(self.module_name, "create mtime LOGS cmdfile failed: ", self.name)
             else:
                 mtimecmdfile = string.split(mktemp[0])[0]
-                #mtimecmdfile="/tmp/lustre-cmd.XXXXXXXX"
                 fd = os.open(mtimecmdfile, os.O_RDWR | os.O_CREAT)
                 os.write(fd, "\n\n\n\n\n%s\n\n" %mtime)
                 os.close(fd)
@@ -2186,7 +2185,7 @@ def doMakeServiceScript(services):
                 extra_error = ""
             panic("Error creating " + target_symlink + ": " + e[1] + extra_error)
 
-# Check mtime of LLOG
+# Check mtime of config logs
 def doCheckMtime(lustreDB, hosts):
     for h in hosts:
         node_db = lustreDB.lookup_name(h, 'node')
index 6561ed5..63e015f 100644 (file)
@@ -96,7 +96,7 @@ command_t cmdlist[] = {
          "setup for elan/myrinet networks.\n"
          "usage: mynid [<nid>]"},
         {"add_uuid", jt_lcfg_add_uuid, 0, "associate a UUID with a nid\n"
-         "usage: add_uuid <uuid> <nid> <net_type>"},
+         "usage: add_uuid <uuid> <nid>"},
         {"close_uuid", jt_obd_close_uuid, 0, "disconnect a UUID\n"
          "usage: close_uuid <uuid> <net_type>"},
         {"del_uuid", jt_lcfg_del_uuid, 0, "delete a UUID association\n"
@@ -144,8 +144,8 @@ command_t cmdlist[] = {
         /* Device configuration commands */
         {"==== device config =====", jt_noop, 0, "device config"},
         {"attach", jt_lcfg_attach, 0,
-         "set the type of the current device (with <name> and <uuid>)\n"
-         "usage: attach type [name [uuid]]"},
+         "set the type, name, and uuid of the current device\n"
+         "usage: attach type name uuid"},
         {"setup", jt_lcfg_setup, 0,
          "type specific device configuration information\n"
          "usage: setup <args...>"},
@@ -172,6 +172,9 @@ command_t cmdlist[] = {
          "usage: dump_log config-uuid-name"},
         {"clear_log", jt_cfg_clear_log, 0, "delete current config log of recorded commands\n"
          "usage: clear_log config-name"},
+        {"conf_param", jt_lcfg_mgsparam, 0, "set a permanent config param\n"
+         "usage: conf_param <keyword=val> ...\n"},
+
 
         /* Device operations */
         {"=== device operations ==", jt_noop, 0, "device operations"},
@@ -231,14 +234,16 @@ command_t cmdlist[] = {
         {"del_mount_option", jt_lcfg_del_mount_option, 0,
          "usage: del_mount_option profile\n"},
         {"set_timeout", jt_lcfg_set_timeout, 0,
-         "usage: set_timeout <secs>\n"},
+         "usage: conf_param obd_timeout=<secs>\n"},
         {"set_lustre_upcall", jt_lcfg_set_lustre_upcall, 0,
          "usage: set_lustre_upcall </full/path/to/upcall> \n"},
         {"add_conn ", jt_lcfg_add_conn, 0,
          "usage: add_conn <conn_uuid> [priority]\n"},
         {"del_conn ", jt_lcfg_del_conn, 0,
          "usage: del_conn <conn_uuid> \n"},
-
+        {"local_param", jt_lcfg_param, 0, "set a temporary, local param\n"
+         "usage: local_param <keyword=val> ...\n"},
+       
         /* Llog operations */ 
         {"llog_catlist", jt_llog_catlist, 0, 
          "list all catalog logs on current device.\n"
index aa27001..349c703 100644 (file)
@@ -322,7 +322,7 @@ static int lfs_osts(int argc, char **argv)
         } else {
                 mnt = getmntent(fp);
                 while (feof(fp) == 0 && ferror(fp) ==0) {
-                        if (llapi_is_lustre_mnttype(mnt->mnt_type)) {
+                        if (llapi_is_lustre_mnttype(mnt)) {
                                 rc = llapi_find(mnt->mnt_dir, obduuid, 0, 0, 0);
                                 if (rc)
                                         fprintf(stderr,
@@ -370,7 +370,7 @@ static int path2mnt(char *path, FILE *fp, char *mntdir, int dir_len)
         len = 0;
         mnt = getmntent(fp);
         while (feof(fp) == 0 && ferror(fp) == 0) {
-                if (llapi_is_lustre_mnttype(mnt->mnt_type)) {
+                if (llapi_is_lustre_mnttype(mnt)) {
                         len = strlen(mnt->mnt_dir);
                         if (len > out_len &&
                             !strncmp(rpath, mnt->mnt_dir, len)) {
@@ -585,7 +585,7 @@ static int lfs_df(int argc, char **argv)
         } else {
                 mnt = getmntent(fp);
                 while (feof(fp) == 0 && ferror(fp) == 0) {
-                        if (llapi_is_lustre_mnttype(mnt->mnt_type)) {
+                        if (llapi_is_lustre_mnttype(mnt)) {
                                 rc = mntdf(mnt->mnt_dir, ishow, cooked);
                                 if (rc)
                                         break;
@@ -636,7 +636,7 @@ static int lfs_check(int argc, char **argv)
         } else {
                 mnt = getmntent(fp);
                 while (feof(fp) == 0 && ferror(fp) ==0) {
-                        if (llapi_is_lustre_mnttype(mnt->mnt_type))
+                        if (llapi_is_lustre_mnttype(mnt))
                                 break;
                         mnt = getmntent(fp);
                 }
@@ -677,7 +677,7 @@ static int lfs_catinfo(int argc, char **argv)
         } else {
                 mnt = getmntent(fp);
                 while (feof(fp) == 0 && ferror(fp) == 0) {
-                        if (llapi_is_lustre_mnttype(mnt->mnt_type))
+                        if (llapi_is_lustre_mnttype(mnt))
                                 break;
                         mnt = getmntent(fp);
                 }
index 2c10da6..c320aed 100644 (file)
@@ -889,9 +889,12 @@ int llapi_catinfo(char *dir, char *keyword, char *node_name)
         return rc;
 }
 
-int llapi_is_lustre_mnttype(char *type)
+/* Is this a lustre client fs? */
+int llapi_is_lustre_mnttype(struct mntent *mnt)
 {
-        return (strcmp(type,"lustre") == 0 || strcmp(type,"lustre_lite") == 0);
+        char *type = mnt->mnt_type;
+        return ((strcmp(type, "lustre") == 0 || strcmp(type,"lustre_lite") == 0)
+                && (strstr(mnt->mnt_fsname, ":/") != NULL));
 }
 
 int llapi_quotacheck(char *mnt, int check_type)
index 16ce965..03f04c7 100644 (file)
@@ -224,6 +224,7 @@ static void print_1_cfg(struct lustre_cfg *lcfg)
         return;
 }
 
+
 static void print_setup_cfg(struct lustre_cfg *lcfg)
 {
         struct lov_desc *desc;
@@ -246,10 +247,13 @@ static void print_setup_cfg(struct lustre_cfg *lcfg)
         return;
 }
 
-void print_lustre_cfg(struct lustre_cfg *lcfg)
+void print_lustre_cfg(struct lustre_cfg *lcfg, int *skip)
 {
         enum lcfg_command_type cmd = le32_to_cpu(lcfg->lcfg_command);
         
+        if (*skip > 0)
+                printf("SKIP ");
+
         switch(cmd){
         case(LCFG_ATTACH):{
                 printf("attach    ");
@@ -326,8 +330,19 @@ void print_lustre_cfg(struct lustre_cfg *lcfg)
                 break;
         }
         case(LCFG_MARKER):{
-                printf("marker ");
-                print_1_cfg(lcfg);
+                struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1);
+
+                if (marker->cm_flags & CM_SKIP) {
+                        if (marker->cm_flags & CM_START) 
+                                (*skip)++;
+                        if (marker->cm_flags & CM_END)
+                                (*skip)--;
+                }
+                printf("marker %d (flags=%#x) %.16s '%s' %s:%s", marker->cm_step,
+                       marker->cm_flags, marker->cm_svname, 
+                       marker->cm_comment, ctime(&marker->cm_createtime),
+                       marker->cm_canceltime ? 
+                       ctime(&marker->cm_canceltime) : "");
                 break;
         }
         default:
@@ -340,9 +355,9 @@ void print_lustre_cfg(struct lustre_cfg *lcfg)
 void print_records(struct llog_rec_hdr** recs,int rec_number)
 {
         __u32 lopt;
-        int i;
+        int i, skip = 0;
         
-        for(i=0;i<rec_number;i++){
+        for(i = 0; i < rec_number; i++){
         
                 printf("#%.2d ", le32_to_cpu(recs[i]->lrh_index));
 
@@ -353,7 +368,7 @@ void print_records(struct llog_rec_hdr** recs,int rec_number)
                         printf("L "); 
                         lcfg = (struct lustre_cfg *)
                                 ((char*)(recs[i]) + sizeof(struct llog_rec_hdr));
-                        print_lustre_cfg(lcfg);
+                        print_lustre_cfg(lcfg, &skip);
                 }
 
                 if (lopt == PTL_CFG_REC){
diff --git a/lustre/utils/load_ldap.sh b/lustre/utils/load_ldap.sh
deleted file mode 100755 (executable)
index 0163b85..0000000
+++ /dev/null
@@ -1,50 +0,0 @@
-#!/bin/bash
-#
-# Load a lustre config xml into an openldap database.
-# See https://projects.clusterfs.com/lustre/LustreLDAP
-# for more details.
-#
-# Usage: load_ldap.sh <xml_file>
-set -e
-
-LDAP_BASE=${LDAP_BASE:-fs=lustre}
-LDAP_ROOTDN=${LDAP_ROOTDN:-cn=Manager,fs=lustre}
-LDAP_PW=${LDAP_PW:-secret}
-LDAP_AUTH="-x -D $LDAP_ROOTDN -w $LDAP_PW"
-LUSTRE=${LUSTRE:-`dirname $0`/..}
-
-if [ -f $LUSTRE/autoMakefile.am ]; then
-  CONFDIR=$LUSTRE/conf
-else
-  CONFDIR=/usr/lib/lustre
-fi
-
-TOP=$CONFDIR/top.ldif
-XSL=$CONFDIR/lustre2ldif.xsl
-
-[ ! -z $LDAPURL ] && LDAP_AUTH="$LDAP_AUTH -H $LDAPURL"
-
-XML=${XML:-$1}
-
-if [ -z "$XML" ] || [  ! -r $XML ]; then
-     echo "usage: $0 xmlfile"
-     exit 1
-fi
-
-NAME=`basename $XML .xml`
-LDIF=/tmp/$NAME.ldif
-
-# add the top level record, if needed
-ldapsearch $LDAP_AUTH -b $LDAP_BASE > /dev/null 2>&1 ||
-    ldapadd $LDAP_AUTH -f $TOP 
-
-# If this config already exists, then delete it
-ldapsearch $LDAP_AUTH -b config=$NAME,$LDAP_BASE > /dev/null 2>&1 && 
-    ldapdelete $LDAP_AUTH -r config=$NAME,$LDAP_BASE
-
-4xslt -D config=$NAME $XML $XSL   > $LDIF
-
-echo "Loading config to 'config=$NAME,$LDAP_BASE' ..."
-ldapadd $LDAP_AUTH -f $LDIF
-
-rm -f $LDIF
index 07df82b..342a4da 100644 (file)
@@ -98,23 +98,14 @@ int jt_lcfg_attach(int argc, char **argv)
         struct lustre_cfg *lcfg;
         int rc;
 
-        if (argc != 2 && argc != 3 && argc != 4)
+        if (argc != 4)
                 return CMD_HELP;
 
         lustre_cfg_bufs_reset(&bufs, NULL);
 
         lustre_cfg_bufs_set_string(&bufs, 1, argv[1]);
-        if (argc >= 3) {
-                lustre_cfg_bufs_set_string(&bufs, 0, argv[2]);
-        } else {
-                fprintf(stderr, "error: %s: LCFG_ATTACH requires a name\n",
-                        jt_cmdname(argv[0])); 
-                return -EINVAL;
-        }
-
-        if (argc == 4) {
-                lustre_cfg_bufs_set_string(&bufs, 2, argv[3]);
-        }
+        lustre_cfg_bufs_set_string(&bufs, 0, argv[2]);
+        lustre_cfg_bufs_set_string(&bufs, 2, argv[3]);
 
         lcfg = lustre_cfg_new(LCFG_ATTACH, &bufs);
         rc = lcfg_ioctl(argv[0], OBD_DEV_ID, lcfg);
@@ -548,6 +539,12 @@ int jt_lcfg_set_timeout(int argc, char **argv)
         struct lustre_cfg_bufs bufs;
         struct lustre_cfg *lcfg;
 
+        fprintf(stderr, "%s has been deprecated. Use conf_param instead.\n"
+                "e.g. conf_param lustre-MDT0000 obd_timeout=50\n",
+                jt_cmdname(argv[0]));
+        return CMD_HELP;
+
+
         if (argc != 2)
                 return CMD_HELP;
 
@@ -556,6 +553,8 @@ int jt_lcfg_set_timeout(int argc, char **argv)
         lcfg->lcfg_num = atoi(argv[1]);
         
         rc = lcfg_ioctl(argv[0], OBD_DEV_ID, lcfg);
+        //rc = lcfg_mgs_ioctl(argv[0], OBD_DEV_ID, lcfg);
+
         lustre_cfg_free(lcfg);
         if (rc < 0) {
                 fprintf(stderr, "error: %s: %s\n", jt_cmdname(argv[0]),
@@ -659,3 +658,77 @@ int jt_lcfg_del_conn(int argc, char **argv)
 
         return rc;
 }
+
+/* Param set locally, directly on target */
+int jt_lcfg_param(int argc, char **argv)
+{
+        int i, rc;
+        struct lustre_cfg_bufs bufs;
+        struct lustre_cfg *lcfg;
+
+        if (argc >= LUSTRE_CFG_MAX_BUFCOUNT)
+                return CMD_HELP;
+
+        lustre_cfg_bufs_reset(&bufs, lcfg_devname);
+
+        for (i = 1; i < argc; i++) {
+                lustre_cfg_bufs_set_string(&bufs, i, argv[i]);
+        }
+
+        lcfg = lustre_cfg_new(LCFG_PARAM, &bufs);
+        
+        rc = lcfg_ioctl(argv[0], OBD_DEV_ID, lcfg);
+        lustre_cfg_free(lcfg);
+        if (rc < 0) {
+                fprintf(stderr, "error: %s: %s\n", jt_cmdname(argv[0]),
+                        strerror(rc = errno));
+        }
+        return rc;
+}
+
+/* Param set in config log on MGS */
+/* conf_param <cfg_device> key1=value1 [key2=value2...] */
+int jt_lcfg_mgsparam(int argc, char **argv)
+{
+        int i, rc, index_offset = 0;
+        struct lustre_cfg_bufs bufs;
+        struct lustre_cfg *lcfg;
+
+        if ((argc >= LUSTRE_CFG_MAX_BUFCOUNT) || (argc <= 1))
+                return CMD_HELP;
+
+        if (!strchr(argv[1], '=')) {
+                /* Not key=val, assume <cfg_device> */
+                rc = jt_lcfg_device(2, argv);
+                if (rc) 
+                        return rc;
+                index_offset = 1;
+        }
+
+        if (lcfg_devname == NULL) {
+                fprintf(stderr, "%s: please use 'cfg_device name' to set the "
+                        "device name for config commands.\n", 
+                        jt_cmdname(argv[0])); 
+                return -EINVAL;
+        }
+
+        lustre_cfg_bufs_reset(&bufs, lcfg_devname);
+
+        for (i = 1; i < (argc - index_offset); i++) {
+                lustre_cfg_bufs_set_string(&bufs, i, argv[i + index_offset]);
+        }
+
+        /* We could put other opcodes here. */
+        lcfg = lustre_cfg_new(LCFG_PARAM, &bufs);
+
+        rc = lcfg_mgs_ioctl(argv[0], OBD_DEV_ID, lcfg);
+        lustre_cfg_free(lcfg);
+        if (rc < 0) {
+                fprintf(stderr, "error: %s: %s\n", jt_cmdname(argv[0]),
+                        strerror(rc = errno));
+        }
+        
+        return rc;
+}
+
+
diff --git a/lustre/utils/mkfs_lustre.c b/lustre/utils/mkfs_lustre.c
new file mode 100644 (file)
index 0000000..1f88563
--- /dev/null
@@ -0,0 +1,1271 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *   Copyright (C) 2002 Cluster File Systems, Inc.
+ *   Author: Lin Song Tao <lincent@clusterfs.com>
+ *   Author: Nathan Rutman <nathan@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdarg.h>
+#include <mntent.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+
+#include <string.h>
+#include <getopt.h>
+
+#include <linux/types.h>
+//#define HAVE_SYS_VFS_H 1
+#include <linux/fs.h> // for BLKGETSIZE64
+#include <lustre_disk.h>
+#include <lustre_param.h>
+#include <lnet/lnetctl.h>
+#include <lustre_ver.h>
+
+
+#define MAX_LOOP_DEVICES 16
+#define L_BLOCK_SIZE 4096
+#define INDEX_UNASSIGNED 0xFFFF
+
+static char *progname;
+static int verbose = 1;
+static int print_only = 0;
+
+
+void usage(FILE *out)
+{
+        fprintf(out, "%s v"LUSTRE_VERSION_STRING"\n", progname);
+        fprintf(out, "usage: %s <target types> [options] <device>\n", progname);
+        fprintf(out, 
+                "\t<device>:block device or file (e.g /dev/sda or /tmp/ost1)\n"
+                "\ttarget types:\n"
+                "\t\t--ost: object storage, mutually exclusive with mdt\n"
+                "\t\t--mdt: metadata storage, mutually exclusive with ost\n"
+                "\t\t--mgs: configuration management service - one per site\n"
+                "\toptions (in order of popularity):\n"
+                "\t\t--mgsnode=<nid>[,<...>] : NID(s) of a remote mgs node\n"
+                "\t\t\trequired for all targets other than the mgs node\n"
+                "\t\t--fsname=<filesystem_name> : default is 'lustre'\n"
+                "\t\t--failnode=<nid>[,<...>] : NID(s) of a failover partner\n"
+                "\t\t--param <key>=<value> : set a permanent parameter\n"
+                "\t\t--index=#N : target index\n"
+                /* FIXME implement 1.6.x
+                "\t\t--configdev=<altdevice|file>: store configuration info\n"
+                "\t\t\tfor this device on an alternate device\n"
+                */
+                "\t\t--mountfsoptions=<opts> : permanent mount options\n"
+                "\t\t--backfstype=<fstype> : backing fs type (ext3, ldiskfs)\n"
+                "\t\t--device-size=#N(KB) : device size for loop devices\n"
+#ifndef TUNEFS
+                "\t\t--mkfsoptions=<opts> : format options\n"
+                "\t\t--reformat: overwrite an existing disk\n"
+                "\t\t--stripe-count-hint=#N : used for optimizing MDT inode size\n"
+#else
+                "\t\t--erase-params : erase all old parameter settings\n"
+                "\t\t--nomgs: turn off MGS service on this MDT\n"
+                "\t\t--writeconf: erase all config logs for this fs.\n"
+#endif
+                "\t\t--print: just report what we would do; don't write to "
+                "disk\n"
+                "\t\t--verbose\n"
+                "\t\t--quiet\n");
+        return;
+}
+
+#define vprint if (verbose > 0) printf
+
+static void fatal(void)
+{
+        verbose = 0;
+        fprintf(stderr, "\n%s FATAL: ", progname);
+}
+
+/*================ utility functions =====================*/
+
+inline unsigned int 
+dev_major (unsigned long long int __dev)
+{
+        return ((__dev >> 8) & 0xfff) | ((unsigned int) (__dev >> 32) & ~0xfff);
+}
+
+inline unsigned int
+dev_minor (unsigned long long int __dev)
+{
+        return (__dev & 0xff) | ((unsigned int) (__dev >> 12) & ~0xff);
+}
+
+int get_os_version()
+{
+        static int version = 0;
+
+        if (!version) {
+                int fd;
+                char release[4] = "";
+
+                fd = open("/proc/sys/kernel/osrelease", O_RDONLY);
+                if (fd < 0) 
+                        fprintf(stderr, "%s: Warning: Can't resolve kernel "
+                                "version, assuming 2.6\n", progname);
+                else {
+                        read(fd, release, 4);
+                        close(fd);
+                }
+                if (strncmp(release, "2.4.", 4) == 0) 
+                        version = 24;
+                else 
+                        version = 26;
+        }
+        return version;
+}
+
+int run_command(char *cmd)
+{
+        char log[] = "/tmp/mkfs_logXXXXXX";
+        int fd, rc;
+        
+        if (verbose > 1)
+                printf("cmd: %s\n", cmd);
+        
+        if ((fd = mkstemp(log)) >= 0) {
+                close(fd);
+                strcat(cmd, " >");
+                strcat(cmd, log);
+        }
+        strcat(cmd, " 2>&1");
+
+        /* Can't use popen because we need the rv of the command */
+        rc = system(cmd);
+        if (rc && fd >= 0) {
+                char buf[128];
+                FILE *fp;
+                fp = fopen(log, "r");
+                if (fp) {
+                        while (fgets(buf, sizeof(buf), fp) != NULL) {
+                                if (rc || verbose > 2) 
+                                        printf("   %s", buf);
+                        }
+                        fclose(fp);
+                }
+        }
+        if (fd >= 0) 
+                remove(log);
+        return rc;
+}                                                       
+
+static int check_mtab_entry(char *spec, char *type)
+{
+        FILE *fp;
+        struct mntent *mnt;
+
+        fp = setmntent(MOUNTED, "r");
+        if (fp == NULL)
+                return(0);
+
+        while ((mnt = getmntent(fp)) != NULL) {
+                if (strcmp(mnt->mnt_fsname, spec) == 0 &&
+                        strcmp(mnt->mnt_type, type) == 0) {
+                        endmntent(fp);
+                        fprintf(stderr, "%s: according to %s %s is "
+                                "already mounted on %s\n",
+                                progname, MOUNTED, spec, mnt->mnt_dir);
+                        return(EEXIST);
+                }
+        }
+        endmntent(fp);
+
+        return(0);
+}
+
+/*============ disk dev functions ===================*/
+
+/* Setup a file in the first unused loop_device */
+int loop_setup(struct mkfs_opts *mop)
+{
+        char loop_base[20];
+        char l_device[64];
+        int i,ret = 0;
+
+        /* Figure out the loop device names */
+        if (!access("/dev/loop0", F_OK | R_OK))
+                strcpy(loop_base, "/dev/loop\0");
+        else if (!access("/dev/loop/0", F_OK | R_OK))
+                strcpy(loop_base, "/dev/loop/\0");
+        else {
+                fprintf(stderr, "%s: can't access loop devices\n", progname);
+                return 1;
+        }
+
+        /* Find unused loop device */
+        for (i = 0; i < MAX_LOOP_DEVICES; i++) {
+                char cmd[128];
+                sprintf(l_device, "%s%d", loop_base, i);
+                if (access(l_device, F_OK | R_OK)) 
+                        break;
+                sprintf(cmd, "losetup %s > /dev/null 2>&1", l_device);
+                ret = system(cmd);
+                /* losetup gets 1 (ret=256) for non-set-up device */
+                if (ret) {
+                        /* Set up a loopback device to our file */
+                        sprintf(cmd, "losetup %s %s", l_device, mop->mo_device);
+                        ret = run_command(cmd);
+                        if (ret) {
+                                fprintf(stderr, "%s: error %d on losetup: %s\n",
+                                        progname, ret, strerror(ret));
+                                return ret;
+                        }
+                        strcpy(mop->mo_loopdev, l_device);
+                        return ret;
+                }
+        }
+        
+        fprintf(stderr, "%s: out of loop devices!\n", progname);
+        return EMFILE;
+}       
+
+int loop_cleanup(struct mkfs_opts *mop)
+{
+        char cmd[128];
+        int ret = 1;
+        if ((mop->mo_flags & MO_IS_LOOP) && *mop->mo_loopdev) {
+                sprintf(cmd, "losetup -d %s", mop->mo_loopdev);
+                ret = run_command(cmd);
+        }
+        return ret;
+}
+
+/* Determine if a device is a block device (as opposed to a file) */
+int is_block(char* devname)
+{
+        struct stat st;
+        int ret = 0;
+
+        ret = access(devname, F_OK);
+        if (ret != 0) 
+                return 0;
+        ret = stat(devname, &st);
+        if (ret != 0) {
+                fprintf(stderr, "%s: cannot stat %s\n", progname, devname);
+                return -1;
+        }
+        return S_ISBLK(st.st_mode);
+}
+
+__u64 get_device_size(char* device) 
+{
+        int ret, fd;
+        __u64 size = 0;
+
+        fd = open(device, O_RDONLY);
+        if (fd < 0) {
+                fprintf(stderr, "%s: cannot open %s: %s\n", 
+                        progname, device, strerror(errno));
+                return 0;
+        }
+
+        /* size in bytes. bz5831 */
+        ret = ioctl(fd, BLKGETSIZE64, (void*)&size);
+        close(fd);
+        if (ret < 0) {
+                fprintf(stderr, "%s: size ioctl failed: %s\n", 
+                        progname, strerror(errno));
+                return 0;
+        }
+        
+        vprint("device size = "LPU64"MB\n", size >> 20);
+        /* return value in KB */
+        return size >> 10;
+}
+
+int loop_format(struct mkfs_opts *mop)
+{
+        int ret = 0;
+       
+        if (mop->mo_device_sz == 0) {
+                fatal();
+                fprintf(stderr, "loop device requires a --device-size= "
+                        "param\n");
+                return EINVAL;
+        }
+
+        ret = creat(mop->mo_device, S_IRUSR|S_IWUSR);
+        ret = truncate(mop->mo_device, mop->mo_device_sz * 1024);
+        if (ret != 0) {
+                ret = errno;
+                fprintf(stderr, "%s: Unable to create backing store: %d\n", 
+                        progname, ret);
+        }
+
+        return ret;
+}
+
+/* Check whether the file exists in the device */
+static int file_in_dev(char *file_name, char *dev_name)
+{
+        FILE *fp;
+        char debugfs_cmd[256];
+        unsigned int inode_num;
+        int i;
+
+        /* Construct debugfs command line. */
+        memset(debugfs_cmd, 0, sizeof(debugfs_cmd));
+        sprintf(debugfs_cmd,
+                "debugfs -c -R 'stat %s' %s 2>&1 | egrep '(Inode|unsupported)'",
+                file_name, dev_name);
+
+        fp = popen(debugfs_cmd, "r");
+        if (!fp) {
+                fprintf(stderr, "%s: %s\n", progname, strerror(errno));
+                return 0;
+        }
+
+        if (fscanf(fp, "Inode: %u", &inode_num) == 1) { /* exist */
+                pclose(fp);
+                return 1;
+        }
+        i = fread(debugfs_cmd, 1, sizeof(debugfs_cmd), fp);
+        if (i) {
+                /* Filesystem has unsupported feature */
+                vprint("%.*s", i, debugfs_cmd);
+                /* in all likelihood, the "unsupported feature" is
+                  'extents', which older debugfs does not understand.  
+                  Use e2fsprogs-1.38-cfs1 or later, available from 
+                  ftp://ftp.lustre.org/pub/lustre/other/e2fsprogs/ */
+                return -1;
+        }
+        pclose(fp);
+        return 0;
+}
+
+/* Check whether the device has already been used with lustre */
+static int is_lustre_target(struct mkfs_opts *mop)
+{
+        int rc;
+        vprint("checking for existing Lustre data\n");
+        
+        if ((rc = file_in_dev(MOUNT_DATA_FILE, mop->mo_device))
+            || (rc = file_in_dev(LAST_RCVD, mop->mo_device))) { 
+                vprint("found Lustre data\n");
+                /* in the -1 case, 'extents' means this really IS a lustre
+                   target */
+                return rc; 
+        }
+
+        return 0; /* The device is not a lustre target. */
+}
+
+/* Build fs according to type */
+int make_lustre_backfs(struct mkfs_opts *mop)
+{
+        char mkfs_cmd[512];
+        char buf[40];
+        char *dev;
+        int ret = 0;
+        int block_count = 0;
+
+        if (mop->mo_device_sz != 0) {
+                if (mop->mo_device_sz < 8096){
+                        fprintf(stderr, "%s: size of filesystem must be larger "
+                                "than 8MB, but is set to %lldKB\n",
+                                progname, mop->mo_device_sz);
+                        return EINVAL;
+                }
+                block_count = mop->mo_device_sz / (L_BLOCK_SIZE >> 10);
+        }       
+        
+        if ((mop->mo_ldd.ldd_mount_type == LDD_MT_EXT3) ||
+            (mop->mo_ldd.ldd_mount_type == LDD_MT_LDISKFS)) { 
+                __u64 device_sz = mop->mo_device_sz;
+
+                /* we really need the size */
+                if (device_sz == 0) {
+                        device_sz = get_device_size(mop->mo_device);
+                        if (device_sz == 0) 
+                                return ENODEV;
+                }
+
+                /* Journal size in MB */
+                if (strstr(mop->mo_mkfsopts, "-J") == NULL) {
+                        /* Choose our own default journal size */
+                        long journal_sz = 0, max_sz;
+                        if (device_sz > 1024 * 1024) /* 1GB */
+                                journal_sz = (device_sz / 102400) * 4;
+                        /* man mkfs.ext3 */
+                        max_sz = (102400 * L_BLOCK_SIZE) >> 20; /* 400MB */
+                        if (journal_sz > max_sz)
+                                journal_sz = max_sz;
+                        if (journal_sz) {
+                                sprintf(buf, " -J size=%ld", journal_sz);
+                                strcat(mop->mo_mkfsopts, buf);
+                        }
+                }
+
+                /* Default bytes_per_inode is block size */
+                if (strstr(mop->mo_mkfsopts, "-i") == NULL) {
+                        long bytes_per_inode = 0;
+                                        
+                        if (IS_MDT(&mop->mo_ldd)) 
+                                bytes_per_inode = 4096;
+
+                        /* Allocate fewer inodes on large OST devices.  Most
+                           filesystems can be much more aggressive than even 
+                           this. */
+                        if ((IS_OST(&mop->mo_ldd) && (device_sz > 1000000))) 
+                                bytes_per_inode = 16384;
+                        
+                        if (bytes_per_inode > 0) {
+                                sprintf(buf, " -i %ld", bytes_per_inode);
+                                strcat(mop->mo_mkfsopts, buf);
+                        }
+                }
+                
+                /* This is an undocumented mke2fs option. Default is 128. */
+                if (strstr(mop->mo_mkfsopts, "-I") == NULL) {
+                        long inode_size = 0;
+                        if (IS_MDT(&mop->mo_ldd)) {
+                                if (mop->mo_stripe_count > 77)
+                                        inode_size = 512; /* bz 7241 */
+                                /* cray stripes across all osts (>60) */
+                                else if (mop->mo_stripe_count > 34)
+                                        inode_size = 2048;
+                                else if (mop->mo_stripe_count > 13)
+                                        inode_size = 1024;
+                                else 
+                                        inode_size = 512;
+                        } else if (IS_OST(&mop->mo_ldd)) {
+                                /* now as we store fids in EA on OST we need 
+                                   to make inode bigger */
+                                inode_size = 256;
+                        }
+
+                        if (inode_size > 0) {
+                                sprintf(buf, " -I %ld", inode_size);
+                                strcat(mop->mo_mkfsopts, buf);
+                        }
+                        
+                }
+
+                if (verbose < 2) {
+                        strcat(mop->mo_mkfsopts, " -q");
+                }
+
+                /* Enable hashed b-tree directory lookup in large dirs bz6224 */
+                if (strstr(mop->mo_mkfsopts, "-O") == NULL) {
+                        strcat(mop->mo_mkfsopts, " -O dir_index");
+                }
+
+                /* Allow reformat of full devices (as opposed to 
+                   partitions.)  We already checked for mounted dev. */
+                strcat(mop->mo_mkfsopts, " -F");
+
+                sprintf(mkfs_cmd, "mkfs.ext2 -j -b %d -L %s ", L_BLOCK_SIZE,
+                        mop->mo_ldd.ldd_svname);
+
+        } else if (mop->mo_ldd.ldd_mount_type == LDD_MT_REISERFS) {
+                long journal_sz = 0; /* FIXME default journal size */
+                if (journal_sz > 0) { 
+                        sprintf(buf, " --journal_size %ld", journal_sz);
+                        strcat(mop->mo_mkfsopts, buf);
+                }
+                sprintf(mkfs_cmd, "mkreiserfs -ff ");
+
+        } else {
+                fprintf(stderr,"%s: unsupported fs type: %d (%s)\n",
+                        progname, mop->mo_ldd.ldd_mount_type, 
+                        MT_STR(&mop->mo_ldd));
+                return EINVAL;
+        }
+
+        /* For loop device format the dev, not the filename */
+        dev = mop->mo_device;
+        if (mop->mo_flags & MO_IS_LOOP) 
+                dev = mop->mo_loopdev;
+        
+        vprint("formatting backing filesystem %s on %s\n",
+               MT_STR(&mop->mo_ldd), dev);
+        vprint("\ttarget name  %s\n", mop->mo_ldd.ldd_svname);
+        vprint("\t4k blocks     %d\n", block_count);
+        vprint("\toptions       %s\n", mop->mo_mkfsopts);
+
+        /* mkfs_cmd's trailing space is important! */
+        strcat(mkfs_cmd, mop->mo_mkfsopts);
+        strcat(mkfs_cmd, " ");
+        strcat(mkfs_cmd, dev);
+        if (block_count != 0) {
+                sprintf(buf, " %d", block_count);
+                strcat(mkfs_cmd, buf);
+        }
+
+        vprint("mkfs_cmd = %s\n", mkfs_cmd);
+        ret = run_command(mkfs_cmd);
+        if (ret) {
+                fatal();
+                fprintf(stderr, "Unable to build fs %s (%d)\n", dev, ret);
+                goto out;
+        }
+
+out:
+        return ret;
+}
+
+/* ==================== Lustre config functions =============*/
+
+void print_ldd(char *str, struct lustre_disk_data *ldd)
+{
+        printf("\n   %s:\n", str);
+        printf("Target:     %s\n", ldd->ldd_svname);
+        if (ldd->ldd_svindex == INDEX_UNASSIGNED) 
+                printf("Index:      unassigned\n");
+        else
+                printf("Index:      %d\n", ldd->ldd_svindex);
+        printf("UUID:       %s\n", (char *)ldd->ldd_uuid);
+        printf("Lustre FS:  %s\n", ldd->ldd_fsname);
+        printf("Mount type: %s\n", MT_STR(ldd));
+        printf("Flags:      %#x\n", ldd->ldd_flags);
+        printf("              (%s%s%s%s%s%s%s%s)\n",
+               IS_MDT(ldd) ? "MDT ":"", 
+               IS_OST(ldd) ? "OST ":"",
+               IS_MGS(ldd) ? "MGS ":"",
+               ldd->ldd_flags & LDD_F_NEED_INDEX ? "needs_index ":"",
+               ldd->ldd_flags & LDD_F_VIRGIN     ? "first_time ":"",
+               ldd->ldd_flags & LDD_F_UPDATE     ? "update ":"",
+               ldd->ldd_flags & LDD_F_WRITECONF  ? "writeconf ":"",
+               ldd->ldd_flags & LDD_F_UPGRADE14  ? "upgrade1.4 ":"");
+        printf("Persistent mount opts: %s\n", ldd->ldd_mount_opts);
+        printf("Parameters:%s\n", ldd->ldd_params);
+        printf("\n");
+}
+
+/* Write the server config files */
+int write_local_files(struct mkfs_opts *mop)
+{
+        char mntpt[] = "/tmp/mntXXXXXX";
+        char filepnm[128];
+        char *dev;
+        FILE *filep;
+        int ret = 0;
+
+        /* Mount this device temporarily in order to write these files */
+        if (!mkdtemp(mntpt)) {
+                fprintf(stderr, "%s: Can't create temp mount point %s: %s\n",
+                        progname, mntpt, strerror(errno));
+                return errno;
+        }
+
+        dev = mop->mo_device;
+        if (mop->mo_flags & MO_IS_LOOP) 
+                dev = mop->mo_loopdev;
+        
+        ret = mount(dev, mntpt, MT_STR(&mop->mo_ldd), 0, NULL);
+        if (ret) {
+                fprintf(stderr, "%s: Unable to mount %s: %s\n", 
+                        progname, dev, strerror(errno));
+                if (errno == ENODEV) {
+                        fprintf(stderr, "Is the %s module available?\n", 
+                                MT_STR(&mop->mo_ldd));
+                }
+                goto out_rmdir;
+        }
+
+        /* Set up initial directories */
+        sprintf(filepnm, "%s/%s", mntpt, MOUNT_CONFIGS_DIR);
+        ret = mkdir(filepnm, 0777);
+        if ((ret != 0) && (errno != EEXIST)) {
+                fprintf(stderr, "%s: Can't make configs dir %s (%d)\n", 
+                        progname, filepnm, ret);
+                goto out_umnt;
+        } else if (errno == EEXIST) {
+                ret = 0;
+        }
+
+        /* Save the persistent mount data into a file. Lustre must pre-read
+           this file to get the real mount options. */
+        vprint("Writing %s\n", MOUNT_DATA_FILE);
+        sprintf(filepnm, "%s/%s", mntpt, MOUNT_DATA_FILE);
+        filep = fopen(filepnm, "w");
+        if (!filep) {
+                fprintf(stderr, "%s: Unable to create %s file\n",
+                        progname, filepnm);
+                goto out_umnt;
+        }
+        fwrite(&mop->mo_ldd, sizeof(mop->mo_ldd), 1, filep);
+        fclose(filep);
+        
+        /* COMPAT_146 */
+#ifdef TUNEFS
+        /* Check for upgrade */
+        if ((mop->mo_ldd.ldd_flags & (LDD_F_UPGRADE14 | LDD_F_SV_TYPE_MGS)) 
+            == (LDD_F_UPGRADE14 | LDD_F_SV_TYPE_MGS)) {
+                char cmd[128];
+                char *term;
+                vprint("Copying old logs\n");
+#if 0
+ /* Generate new client log as servers upgrade.  Starting a new client 
+    may end up with short lov's, so will be degraded until all servers
+    upgrade */
+                /* Copy the old client log to fsname-client */
+                sprintf(filepnm, "%s/%s/%s-client", 
+                        mntpt, MOUNT_CONFIGS_DIR, mop->mo_ldd.ldd_fsname);
+                sprintf(cmd, "cp %s/%s/client %s", mntpt, MDT_LOGS_DIR,
+                        filepnm);
+                if (verbose > 1) 
+                        printf("cmd: %s\n", cmd);
+                ret = run_command(cmd);
+                if (ret) {
+                        fprintf(stderr, "%s: Can't copy 1.4 config %s/client "
+                                "(%d)\n", progname, MDT_LOGS_DIR, ret);
+                        fprintf(stderr, "mount -t ext3 %s somewhere, "
+                                "find the client log for fs %s and "
+                                "copy it manually into %s/%s-client, "
+                                "then umount.\n",
+                                mop->mo_device, 
+                                mop->mo_ldd.ldd_fsname, MOUNT_CONFIGS_DIR,
+                                mop->mo_ldd.ldd_fsname);
+                        goto out_umnt;
+                }
+ #endif
+                /* We need to use the old mdt log because otherwise mdt won't
+                   have complete lov if old clients connect before all 
+                   servers upgrade. */
+                /* Copy the old mdt log to fsname-MDT0000 (get old
+                   name from mdt_UUID) */
+                ret = 1;
+                strcpy(filepnm, mop->mo_ldd.ldd_uuid);
+                term = strstr(filepnm, "_UUID");
+                if (term) {
+                        *term = '\0';
+                        sprintf(cmd, "cp %s/%s/%s %s/%s/%s",
+                                mntpt, MDT_LOGS_DIR, filepnm, 
+                                mntpt, MOUNT_CONFIGS_DIR,
+                                mop->mo_ldd.ldd_svname);
+                        if (verbose > 1) 
+                                printf("cmd: %s\n", cmd);
+                        ret = run_command(cmd);
+                }
+                if (ret) {
+                        fprintf(stderr, "%s: Can't copy 1.4 config %s/%s "
+                                "(%d)\n", progname, MDT_LOGS_DIR, filepnm, ret);
+                        fprintf(stderr, "mount -t ext3 %s somewhere, "
+                                "find the MDT log for fs %s and "
+                                "copy it manually into %s/%s, "
+                                "then umount.\n",
+                                mop->mo_device, 
+                                mop->mo_ldd.ldd_fsname, MOUNT_CONFIGS_DIR,
+                                mop->mo_ldd.ldd_svname);
+                        goto out_umnt;
+                }
+        }
+#endif
+        /* end COMPAT_146 */
+
+
+out_umnt:
+        umount(mntpt);    
+out_rmdir:
+        rmdir(mntpt);
+        return ret;
+}
+
+int read_local_files(struct mkfs_opts *mop)
+{
+        char mntpt[] = "/tmp/mntXXXXXX";
+        char filepnm[128];
+        char *dev;
+        FILE *filep;
+        int ret = 0;
+
+        /* Mount this device temporarily in order to read these files */
+        if (!mkdtemp(mntpt)) {
+                fprintf(stderr, "%s: Can't create temp mount point %s: %s\n",
+                        progname, mntpt, strerror(errno));
+                return errno;
+        }
+
+        dev = mop->mo_device;
+        if (mop->mo_flags & MO_IS_LOOP) 
+                dev = mop->mo_loopdev;
+        
+        ret = mount(dev, mntpt, MT_STR(&mop->mo_ldd), 0, NULL);
+        if (ret) {
+                fprintf(stderr, "%s: Unable to mount %s: %s\n", 
+                        progname, dev, strerror(errno));
+                goto out_rmdir;
+        }
+
+        sprintf(filepnm, "%s/%s", mntpt, MOUNT_DATA_FILE);
+        filep = fopen(filepnm, "r");
+        if (filep) {
+                vprint("Reading %s\n", MOUNT_DATA_FILE);
+                fread(&mop->mo_ldd, sizeof(mop->mo_ldd), 1, filep);
+        } else {
+                /* COMPAT_146 */
+                /* Try to read pre-1.6 config from last_rcvd */
+                struct lr_server_data lsd;
+                vprint("%s: Unable to read %s, trying last_rcvd\n",
+                       progname, MOUNT_DATA_FILE);
+                sprintf(filepnm, "%s/%s", mntpt, LAST_RCVD);
+                filep = fopen(filepnm, "r");
+                if (!filep) {
+                        fprintf(stderr, "%s: Unable to read old data\n",
+                                progname);
+                        ret = -errno;
+                        goto out_umnt;
+                }
+                vprint("Reading %s\n", LAST_RCVD);
+                ret = fread(&lsd, 1, sizeof(lsd), filep);
+                if (ret < sizeof(lsd)) {
+                        fprintf(stderr, "%s: Short read (%d of %d)\n",
+                                progname, ret, sizeof(lsd));
+                        ret = -ferror(filep);
+                        if (ret) 
+                                goto out_close;
+                }
+                ret = 0;
+                if (lsd.lsd_feature_compat & OBD_COMPAT_OST) {
+                        mop->mo_ldd.ldd_flags = LDD_F_SV_TYPE_OST;
+                        mop->mo_ldd.ldd_svindex = lsd.lsd_ost_index;
+                } else if (lsd.lsd_feature_compat & OBD_COMPAT_MDT) {
+                        /* We must co-locate so mgs can see old logs.
+                           If user doesn't want this, they can copy the old
+                           logs manually and re-tunefs. */
+                        mop->mo_ldd.ldd_flags = 
+                                LDD_F_SV_TYPE_MDT | LDD_F_SV_TYPE_MGS;
+                        mop->mo_ldd.ldd_svindex = lsd.lsd_mdt_index;
+                } else  {
+                        /* If neither is set, we're pre-1.4.6, make a guess. */
+                        sprintf(filepnm, "%s/%s", mntpt, MDT_LOGS_DIR);
+                        if (lsd.lsd_ost_index > 0) {
+                                mop->mo_ldd.ldd_flags = LDD_F_SV_TYPE_OST;
+                                mop->mo_ldd.ldd_svindex = lsd.lsd_ost_index;
+                        } else {
+                                /* If there's a LOGS dir, it's an MDT */
+                                if ((ret = access(filepnm, F_OK)) == 0) {
+                                        mop->mo_ldd.ldd_flags =
+                                        LDD_F_SV_TYPE_MDT | 
+                                        LDD_F_SV_TYPE_MGS;
+                                        /* Old MDT's are always index 0 
+                                           (pre CMD) */
+                                        mop->mo_ldd.ldd_svindex = 0;
+                                } else {
+                                        /* The index won't be correct */
+                                        mop->mo_ldd.ldd_flags =
+                                        LDD_F_SV_TYPE_OST | LDD_F_NEED_INDEX;
+                                }
+                        }
+                }
+
+                memcpy(mop->mo_ldd.ldd_uuid, lsd.lsd_uuid, 
+                       sizeof(mop->mo_ldd.ldd_uuid));
+                mop->mo_ldd.ldd_flags |= LDD_F_UPGRADE14;
+        }
+        /* end COMPAT_146 */
+out_close:        
+        fclose(filep);
+        
+out_umnt:
+        umount(mntpt);    
+out_rmdir:
+        rmdir(mntpt);
+        return ret;
+}
+
+
+void set_defaults(struct mkfs_opts *mop)
+{
+        mop->mo_ldd.ldd_magic = LDD_MAGIC;
+        mop->mo_ldd.ldd_config_ver = 1;
+        mop->mo_ldd.ldd_flags = LDD_F_NEED_INDEX | LDD_F_UPDATE | LDD_F_VIRGIN;
+        mop->mo_mgs_failnodes = 0;
+        strcpy(mop->mo_ldd.ldd_fsname, "lustre");
+        if (get_os_version() == 24) 
+                mop->mo_ldd.ldd_mount_type = LDD_MT_EXT3;
+        else 
+                mop->mo_ldd.ldd_mount_type = LDD_MT_LDISKFS;
+        
+        mop->mo_ldd.ldd_svindex = INDEX_UNASSIGNED;
+        mop->mo_stripe_count = 1;
+}
+
+static inline void badopt(const char *opt, char *type)
+{
+        fprintf(stderr, "%s: '--%s' only valid for %s\n",
+                progname, opt, type);
+        usage(stderr);
+}
+
+static int add_param(char *buf, char *key, char *val)
+{
+        int end = sizeof(((struct lustre_disk_data *)0)->ldd_params);
+        int start = strlen(buf);
+        int keylen = 0;
+
+        if (key) 
+                keylen = strlen(key);
+        if (start + 1 + keylen + strlen(val) >= end) {
+                fprintf(stderr, "%s: params are too long-\n%s %s%s\n",
+                        progname, buf, key ? key : "", val);
+                return 1;
+        }
+
+        sprintf(buf + start, " %s%s", key ? key : "", val);
+        return 0;
+}
+
+/* from mount_lustre */
+/* Get rid of symbolic hostnames for tcp, since kernel can't do lookups */
+#define MAXNIDSTR 1024
+static char *convert_hostnames(char *s1)
+{
+        char *converted, *s2 = 0, *c;
+        int left = MAXNIDSTR;
+        lnet_nid_t nid;
+        
+        converted = malloc(left);
+        c = converted;
+        while ((left > 0) && ((s2 = strsep(&s1, ",: \0")))) {
+                nid = libcfs_str2nid(s2);
+                if (nid == LNET_NID_ANY) {
+                        if (*s2 == '/') 
+                                /* end of nids */
+                                break;
+                        fprintf(stderr, "%s: Can't parse NID '%s'\n", 
+                                progname, s2);
+                        free(converted);
+                        return NULL;
+                }
+                if (LNET_NETTYP(LNET_NIDNET(nid)) == SOCKLND) {
+                        __u32 addr = LNET_NIDADDR(nid);
+                        c += snprintf(c, left, "%u.%u.%u.%u@%s%u,",
+                                      (addr >> 24) & 0xff, (addr >> 16) & 0xff,
+                                      (addr >> 8) & 0xff, addr & 0xff,
+                                      libcfs_lnd2str(SOCKLND), 
+                                      LNET_NETNUM(LNET_NIDNET(nid)));
+                } else {
+                        c += snprintf(c, left, "%s,", s2);
+                }
+                left = converted + MAXNIDSTR - c;
+        }
+        *(c - 1) = '\0';
+        return converted;
+}
+
+int parse_opts(int argc, char *const argv[], struct mkfs_opts *mop,
+               char **mountopts)
+{
+        static struct option long_opt[] = {
+                {"backfstype", 1, 0, 'b'},
+                {"stripe-count-hint", 1, 0, 'c'},
+                {"configdev", 1, 0, 'C'},
+                {"device-size", 1, 0, 'd'},
+                {"erase-params", 0, 0, 'e'},
+                {"failnode", 1, 0, 'f'},
+                {"failover", 1, 0, 'f'},
+                {"mgs", 0, 0, 'G'},
+                {"help", 0, 0, 'h'},
+                {"index", 1, 0, 'i'},
+                {"mkfsoptions", 1, 0, 'k'},
+                {"mgsnode", 1, 0, 'm'},
+                {"mgsnid", 1, 0, 'm'},
+                {"mdt", 0, 0, 'M'},
+                {"fsname",1, 0, 'n'},
+                {"nomgs", 0, 0, 'N'},
+                {"mountfsoptions", 1, 0, 'o'},
+                {"ost", 0, 0, 'O'},
+                {"param", 1, 0, 'p'},
+                {"print", 0, 0, 'P'},
+                {"quiet", 0, 0, 'q'},
+                {"reformat", 0, 0, 'r'},
+                {"verbose", 0, 0, 'v'},
+                {"writeconf", 0, 0, 'w'},
+                {0, 0, 0, 0}
+        };
+        char *optstring = "b:c:C:d:ef:Ghi:k:m:Mn:No:Op:Pqrvw";
+        char opt;
+        int rc, longidx;
+
+        while ((opt = getopt_long(argc, argv, optstring, long_opt, &longidx)) != 
+               EOF) {
+                switch (opt) {
+                case 'b': {
+                        int i = 0;
+                        while (i < LDD_MT_LAST) {
+                                if (strcmp(optarg, mt_str(i)) == 0) {
+                                        mop->mo_ldd.ldd_mount_type = i;
+                                        break;
+                                }
+                                i++;
+                        }
+                        break;
+                }
+                case 'c':
+                        if (IS_MDT(&mop->mo_ldd)) {
+                                int stripe_count = atol(optarg);
+                                if (stripe_count <= 0) {
+                                        fprintf(stderr, "%s: bad stripe count "
+                                                "%d\n", progname, stripe_count);
+                                        return 1;
+                                }
+                                mop->mo_stripe_count = stripe_count;
+                        } else {
+                                badopt(long_opt[longidx].name, "MDT");
+                                return 1;
+                        }
+                        break;
+                case 'C': /* Configdev */
+                        //FIXME
+                        printf("Configdev not implemented\n");
+                        return 1;
+                case 'd':
+                        mop->mo_device_sz = atol(optarg); 
+                        break;
+                case 'e':
+                        mop->mo_ldd.ldd_params[0] = '\0';
+                        break;
+                case 'f': {
+                        char *nids = convert_hostnames(optarg);
+                        if (!nids) 
+                                return 1;
+                        rc = add_param(mop->mo_ldd.ldd_params, PARAM_FAILNODE, 
+                                       nids); 
+                        free(nids);
+                        if (rc) 
+                                return rc;
+                        break;
+                }
+                case 'G':
+                        mop->mo_ldd.ldd_flags |= LDD_F_SV_TYPE_MGS;
+                        break;
+                case 'h':
+                        usage(stdout);
+                        return 1;
+                case 'i':
+                        if (IS_MDT(&mop->mo_ldd) || IS_OST(&mop->mo_ldd)) {
+                                mop->mo_ldd.ldd_svindex = atol(optarg);
+                                mop->mo_ldd.ldd_flags &= ~LDD_F_NEED_INDEX;
+                        } else {
+                                badopt(long_opt[longidx].name, "MDT,OST");
+                                return 1;
+                        }
+                        break;
+                case 'k':
+                        strncpy(mop->mo_mkfsopts, optarg, 
+                                sizeof(mop->mo_mkfsopts) - 1);
+                        break;
+                case 'm': {
+                        char *nids = convert_hostnames(optarg);
+                        if (!nids) 
+                                return 1;
+                        rc = add_param(mop->mo_ldd.ldd_params, PARAM_MGSNODE, 
+                                       nids); 
+                        free(nids);
+                        if (rc) 
+                                return rc;
+                        mop->mo_mgs_failnodes++;
+                        break;
+                }
+                case 'M':
+                        mop->mo_ldd.ldd_flags |= LDD_F_SV_TYPE_MDT;
+                        break;
+                case 'n':
+                        if (!(IS_MDT(&mop->mo_ldd) || IS_OST(&mop->mo_ldd))) {
+                                badopt(long_opt[longidx].name, "MDT,OST");
+                                return 1;
+                        }
+                        if (strlen(optarg) > 8) {
+                                fprintf(stderr, "%s: filesystem name must be "
+                                        "<= 8 chars\n", progname);
+                                return 1;
+                        }
+                        if (optarg[0] != 0) 
+                                strncpy(mop->mo_ldd.ldd_fsname, optarg, 
+                                        sizeof(mop->mo_ldd.ldd_fsname) - 1);
+                        break;
+                case 'N':
+                        mop->mo_ldd.ldd_flags &= ~LDD_F_SV_TYPE_MGS;
+                        break;
+                case 'o':
+                        *mountopts = optarg;
+                        break;
+                case 'O':
+                        mop->mo_ldd.ldd_flags |= LDD_F_SV_TYPE_OST;
+                        break;
+                case 'p':
+                        rc = add_param(mop->mo_ldd.ldd_params, NULL, optarg);
+                        if (rc) 
+                                return rc;
+                        break;
+                case 'P':
+                        print_only++;
+                        break;
+                case 'q':
+                        verbose--;
+                        break;
+                case 'r':
+                        mop->mo_flags |= MO_FORCEFORMAT;
+                        break;
+                case 'v':
+                        verbose++;
+                        break;
+                case 'w':
+                        mop->mo_ldd.ldd_flags |= LDD_F_WRITECONF;
+                        break;
+                default:
+                        if (opt != '?') {
+                                fatal();
+                                fprintf(stderr, "Unknown option '%c'\n", opt);
+                        }
+                        usage(stderr);
+                        return 1;
+                }
+        }//while
+        if (optind >= argc) {
+                fatal();
+                fprintf(stderr, "Bad arguments\n");
+                usage(stderr);
+                return 1;
+        }
+
+        return 0;
+}
+
+int main(int argc, char *const argv[])
+{
+        struct mkfs_opts mop;
+        struct lustre_disk_data *ldd;
+        char *mountopts = NULL;
+        char always_mountopts[512] = "";
+        char default_mountopts[512] = "";
+        int  ret = 0;
+
+        //printf("pad %d\n", offsetof(struct lustre_disk_data, ldd_padding));
+        assert(offsetof(struct lustre_disk_data, ldd_padding) == 200);
+        
+        if ((progname = strrchr(argv[0], '/')) != NULL)
+                progname++;
+        else
+                progname = argv[0];
+
+        if (argc < 2) {
+                usage(stderr);
+                ret = 1;
+                goto out;
+        }
+
+        memset(&mop, 0, sizeof(mop));
+        set_defaults(&mop);
+
+        /* device is last arg */
+        strcpy(mop.mo_device, argv[argc - 1]);
+
+        if (check_mtab_entry(mop.mo_device, "lustre"))
+                return(EEXIST);
+
+        /* Are we using a loop device? */
+        ret = is_block(mop.mo_device);
+        if (ret < 0) 
+                goto out;
+        if (ret == 0) 
+                mop.mo_flags |= MO_IS_LOOP;
+
+#ifdef TUNEFS
+        /* For tunefs, we must read in the old values before parsing any
+           new ones. */
+        /* Create the loopback file */
+        if (mop.mo_flags & MO_IS_LOOP) {
+                ret = access(mop.mo_device, F_OK);
+                if (ret == 0)  
+                        ret = loop_setup(&mop);
+                if (ret) {
+                        fatal();
+                        fprintf(stderr, "Loop device setup for %s failed: %s\n", 
+                                mop.mo_device, strerror(ret));
+                        goto out;
+                }
+        }
+        
+        /* Check whether the disk has already been formatted by mkfs.lustre */
+        ret = is_lustre_target(&mop);
+        if (ret == 0) {
+                fatal();
+                fprintf(stderr, "Device %s has not been formatted with "
+                        "mkfs.lustre\n", mop.mo_device);
+                goto out;
+        }
+
+        ret = read_local_files(&mop);
+        if (ret) {
+                fatal();
+                fprintf(stderr, "Failed to read previous Lustre data from %s\n",
+                        mop.mo_device);
+                goto out;
+        }
+
+        if (verbose > 0) 
+                print_ldd("Read previous values", &(mop.mo_ldd));
+#endif
+
+        ret = parse_opts(argc, argv, &mop, &mountopts);
+        if (ret) 
+                goto out;
+
+        ldd = &mop.mo_ldd;
+        if (!(IS_MDT(ldd) || IS_OST(ldd) || IS_MGS(ldd))) {
+                fatal();
+                fprintf(stderr, "must set target type :{mdt,ost,mgs}\n");
+                usage(stderr);
+                ret = 1;
+                goto out;
+        }
+
+        if (IS_MDT(ldd) && !IS_MGS(ldd) && (mop.mo_mgs_failnodes == 0)) {
+                vprint("No management node specified, adding MGS to this "
+                       "MDT\n");
+                ldd->ldd_flags |= LDD_F_SV_TYPE_MGS;
+        }
+
+        if (!IS_MGS(ldd) && (mop.mo_mgs_failnodes == 0)) {
+                fatal();
+                fprintf(stderr, "Must specify either --mgs or --mgsnode\n");
+                usage(stderr);
+                goto out;
+        }
+
+        /* These are the permanent mount options (always included) */ 
+        switch (ldd->ldd_mount_type) {
+        case LDD_MT_EXT3:
+        case LDD_MT_LDISKFS: {
+                sprintf(always_mountopts, "errors=remount-ro");
+                if (IS_MDT(ldd) || IS_MGS(ldd))
+                        strcat(always_mountopts,
+                               ",iopen_nopriv,user_xattr");
+                if ((get_os_version() == 24) && IS_OST(ldd))
+                        strcat(always_mountopts, ",asyncdel");
+#if 0
+                /* Files created while extents are enabled cannot be read if
+                   mounted with a kernel that doesn't include the CFS patches.*/
+                if (IS_OST(ldd) && 
+                    ldd->ldd_mount_type == LDD_MT_LDISKFS) {
+                        strcat(default_mountopts, ",extents,mballoc");
+                }
+#endif 
+                break;
+        }
+        case LDD_MT_SMFS: {
+                mop.mo_flags |= MO_IS_LOOP;
+                sprintf(always_mountopts, "type=ext3,dev=%s",
+                        mop.mo_device);
+                break;
+        }
+        default: {
+                fatal();
+                fprintf(stderr, "unknown fs type %d '%s'\n",
+                        ldd->ldd_mount_type,
+                        MT_STR(ldd));
+                ret = EINVAL;
+                goto out;
+        }
+        }               
+
+        if (mountopts) {
+                /* If user specifies mount opts, don't use defaults,
+                   but always use always_mountopts */
+                sprintf(ldd->ldd_mount_opts, "%s,%s", 
+                        always_mountopts, mountopts);
+        } else {
+#ifdef TUNEFS
+                if (ldd->ldd_mount_opts[0] == 0) 
+                        /* use the defaults unless old opts exist */
+#endif
+                {
+                        if (default_mountopts[0]) 
+                                sprintf(ldd->ldd_mount_opts, "%s,%s", 
+                                        always_mountopts, default_mountopts);
+                        else
+                                strcpy(ldd->ldd_mount_opts,
+                                       always_mountopts);
+                }
+        }
+
+        server_make_name(ldd->ldd_flags, ldd->ldd_svindex,
+                         ldd->ldd_fsname, ldd->ldd_svname);
+
+        if (verbose > 0)
+                print_ldd("Permanent disk data", ldd);
+
+        if (print_only) {
+                printf("exiting before disk write.\n");
+                goto out;
+        }
+
+#ifndef TUNEFS /* mkfs.lustre */
+        /* Create the loopback file of the correct size */
+        if (mop.mo_flags & MO_IS_LOOP) {
+                ret = access(mop.mo_device, F_OK);
+                /* Don't destroy the loopback file if no FORCEFORMAT */
+                if (ret || (mop.mo_flags & MO_FORCEFORMAT))
+                        ret = loop_format(&mop);
+                if (ret == 0)  
+                        ret = loop_setup(&mop);
+                if (ret) {
+                        fatal();
+                        fprintf(stderr, "Loop device setup failed: %s\n", 
+                                strerror(ret));
+                        goto out;
+                }
+        }
+
+        /* Check whether the disk has already been formatted by mkfs.lustre */
+        if (!(mop.mo_flags & MO_FORCEFORMAT)) {
+                ret = is_lustre_target(&mop);
+                if (ret) {
+                        fatal();
+                        fprintf(stderr, "Device %s was previously formatted " 
+                                "for lustre. Use --reformat to reformat it, "
+                                "or tunefs.lustre to modify.\n",
+                                mop.mo_device);
+                        goto out;
+                }
+        }
+
+        /* Format the backing filesystem */
+        ret = make_lustre_backfs(&mop);
+        if (ret != 0) {
+                fatal();
+                fprintf(stderr, "mkfs failed %d\n", ret);
+                goto out;
+        }
+#endif
+
+        ret = write_local_files(&mop);
+        if (ret != 0) {
+                fatal();
+                fprintf(stderr, "failed to write local files\n");
+                goto out;
+        }
+
+out:
+        loop_cleanup(&mop);      
+        return ret;
+}
index c422184..b8371b4 100755 (executable)
@@ -1,6 +1,7 @@
 #!/bin/sh
 
 MDIR=/lib/modules/`uname -r`/lustre
+mkdir -p $MDIR
 
 KVER=24
 EXT=o
@@ -15,29 +16,31 @@ fi
 
 echo "Copying modules from local build dir to "$MDIR
 
-mkdir -p $MDIR
-
-cp ../../lnet/libcfs/libcfs.$EXT $MDIR
-cp ../../lnet/lnet/lnet.$EXT $MDIR
-cp ../../lnet/klnds/socklnd/ksocklnd.$EXT $MDIR
-cp ../lvfs/lvfs.$EXT $MDIR
-cp ../obdclass/obdclass.$EXT $MDIR
-cp ../ptlrpc/ptlrpc.$EXT $MDIR
-cp ../mdc/mdc.$EXT $MDIR
-cp ../osc/osc.$EXT $MDIR
-cp ../lov/lov.$EXT $MDIR
-cp ../mds/mds.$EXT $MDIR
-cp ../lvfs/$FSFLT.$EXT $MDIR
-[ $KVER == "26" ] && cp ../ldiskfs/ldiskfs.$EXT $MDIR
-cp ../ost/ost.$EXT $MDIR
-cp ../obdfilter/obdfilter.$EXT $MDIR
-cp ../llite/llite.$EXT $MDIR
-
+cp -u ../../lnet/libcfs/libcfs.$EXT $MDIR
+cp -u ../../lnet/lnet/lnet.$EXT $MDIR
+cp -u ../../lnet/klnds/socklnd/ksocklnd.$EXT $MDIR
+cp -u ../lvfs/lvfs.$EXT $MDIR
+cp -u ../obdclass/obdclass.$EXT $MDIR
+cp -u ../ptlrpc/ptlrpc.$EXT $MDIR
+cp -u ../mdc/mdc.$EXT $MDIR
+cp -u ../osc/osc.$EXT $MDIR
+cp -u ../lov/lov.$EXT $MDIR
+cp -u ../mds/mds.$EXT $MDIR
+cp -u ../lvfs/$FSFLT.$EXT $MDIR
+[ $KVER == "26" ] && cp -u ../ldiskfs/ldiskfs.$EXT $MDIR
+cp -u ../ost/ost.$EXT $MDIR
+cp -u ../obdfilter/obdfilter.$EXT $MDIR
+cp -u ../llite/llite.$EXT $MDIR
+cp -u ../mgc/mgc.$EXT $MDIR
+cp -u ../mgs/mgs.$EXT $MDIR
+
+# prevent warnings on my uml
+rm -f /lib/modules/`uname -r`/modules.*
 echo "Depmod"
 depmod -a -e
 
 echo "Copying mount from local build dir to "$MDIR
-cp ../utils/mount.lustre /sbin/.
+cp -u ../utils/mount.lustre /sbin/.
 
 MP="/sbin/modprobe"
 MPI="$MP --ignore-install"
@@ -51,3 +54,8 @@ if [ `egrep -c "lustre|lnet" $MODFILE` -eq 0 ]; then
     echo "alias lustre llite" >> $MODFILE
     echo "# end Lustre modules" >> $MODFILE
 fi
+
+#  To generate gdb debug file:
+# modprobe lustre; modprobe mds; modprobe obdfilter; modprobe mgs; modprobe mgc
+# rm -f /r/tmp/ogdb-`hostname`
+# ./lctl modules > /r/tmp/ogdb-`hostname`
diff --git a/lustre/utils/mount_lustre.c b/lustre/utils/mount_lustre.c
new file mode 100644 (file)
index 0000000..be8ebdf
--- /dev/null
@@ -0,0 +1,406 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (C) 2002 Cluster File Systems, Inc.
+ *   Author: Robert Read <rread@clusterfs.com>
+ *   Author: Nathan Rutman <nathan@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <string.h>
+#include <sys/mount.h>
+#include <mntent.h>
+#include <getopt.h>
+#include <sys/utsname.h>
+#include "obdctl.h"
+#include <lustre_ver.h>
+
+int          verbose = 0;
+int          nomtab = 0;
+int          fake = 0;
+int          force = 0;
+static char *progname = NULL;
+
+void usage(FILE *out)
+{
+        fprintf(out, "%s v"LUSTRE_VERSION_STRING"\n", progname);
+        fprintf(out, "usage: %s [-fhnv] [-o <mntopt>] <device> <mountpt>\n", 
+                progname);
+        fprintf(out, 
+                "\t<device>: the disk device, or for a client:\n"
+                "\t\t<mgmtnid>[:<altmgtnid>...]:/<filesystem>-client\n"
+                "\t<filesystem>: name of the Lustre filesystem (e.g. lustre1)\n"
+                "\t<mountpt>: filesystem mountpoint (e.g. /mnt/lustre)\n"
+                "\t-f|--fake: fake mount (updates /etc/mtab)\n"
+                "\t--force: force mount even if already in /etc/mtab\n"
+                "\t-h|--help: print this usage message\n"
+                "\t-n|--nomtab: do not update /etc/mtab after mount\n"
+                "\t-v|--verbose: print verbose config settings\n"
+                "\t<mntopt>: one or more comma separated of:\n"
+                "\t\t(no)flock,(no)user_xattr,(no)acl\n"
+                "\t\tnosvc: only start MGC/MGS obds\n"
+                "\t\texclude=<ostname>[:<ostname>] : colon-separated list of "
+                "inactive OSTs (e.g. lustre-OST0001)\n"
+                );
+        exit((out != stdout) ? EINVAL : 0);
+}
+
+static int check_mtab_entry(char *spec, char *mtpt, char *type)
+{
+        FILE *fp;
+        struct mntent *mnt;
+
+        if (force)
+                return (0);
+
+        fp = setmntent(MOUNTED, "r");
+        if (fp == NULL)
+                return(0);
+
+        while ((mnt = getmntent(fp)) != NULL) {
+                if (strcmp(mnt->mnt_fsname, spec) == 0 &&
+                        strcmp(mnt->mnt_dir, mtpt) == 0 &&
+                        strcmp(mnt->mnt_type, type) == 0) {
+                        endmntent(fp);
+                        fprintf(stderr, "%s: according to %s %s is "
+                                "already mounted on %s\n",
+                                progname, MOUNTED, spec, mtpt);
+                        return(EEXIST); 
+                }
+        }
+        endmntent(fp);
+
+        return(0);
+}
+
+static int
+update_mtab_entry(char *spec, char *mtpt, char *type, char *opts,
+                  int flags, int freq, int pass)
+{
+        FILE *fp;
+        struct mntent mnt;
+        int rc = 0;
+
+        mnt.mnt_fsname = spec;
+        mnt.mnt_dir = mtpt;
+        mnt.mnt_type = type;
+        mnt.mnt_opts = opts ? opts : "";
+        mnt.mnt_freq = freq;
+        mnt.mnt_passno = pass;
+
+        fp = setmntent(MOUNTED, "a+");
+        if (fp == NULL) {
+                fprintf(stderr, "%s: setmntent(%s): %s:",
+                        progname, MOUNTED, strerror (errno));
+                rc = 16;
+        } else {
+                if ((addmntent(fp, &mnt)) == 1) {
+                        fprintf(stderr, "%s: addmntent: %s:",
+                                progname, strerror (errno));
+                        rc = 16;
+                }
+                endmntent(fp);
+        }
+
+        return rc;
+}
+
+/* Get rid of symbolic hostnames for tcp, since kernel can't do lookups */
+#define MAXNIDSTR 1024
+static char *convert_hostnames(char *s1)
+{
+        char *converted, *s2 = 0, *c;
+        char sep;
+        int left = MAXNIDSTR;
+        lnet_nid_t nid;
+        
+        converted = malloc(left);
+        c = converted;
+        while ((left > 0) && (*s1 != '/')) {
+                s2 = strpbrk(s1, ",:");
+                if (!s2)
+                        goto out_free;
+                sep = *s2;
+                *s2 = '\0';     
+                nid = libcfs_str2nid(s1);
+                if (nid == LNET_NID_ANY)
+                        goto out_free;
+                if (LNET_NETTYP(LNET_NIDNET(nid)) == SOCKLND) {
+                        __u32 addr = LNET_NIDADDR(nid);
+                        c += snprintf(c, left, "%u.%u.%u.%u@%s%u%c",
+                                      (addr >> 24) & 0xff, (addr >> 16) & 0xff,
+                                      (addr >> 8) & 0xff, addr & 0xff,
+                                      libcfs_lnd2str(SOCKLND), 
+                                      LNET_NETNUM(LNET_NIDNET(nid)), sep);
+                } else {
+                        c += snprintf(c, left, "%s%c", s1, sep);
+                }
+                left = converted + MAXNIDSTR - c;
+                s1 = s2 + 1;
+        }
+        snprintf(c, left, "%s", s1);
+        return converted;
+out_free:
+        fprintf(stderr, "%s: Can't parse NID '%s'\n", progname, s1);
+        free(converted);
+        return NULL;
+}
+
+/*****************************************************************************
+ *
+ * This part was cribbed from util-linux/mount/mount.c.  There was no clear
+ * license information, but many other files in the package are identified as
+ * GNU GPL, so it's a pretty safe bet that was their intent.
+ *
+ ****************************************************************************/
+struct opt_map {
+        const char *opt;        /* option name */
+        int skip;               /* skip in mtab option string */
+        int inv;                /* true if flag value should be inverted */
+        int mask;               /* flag mask value */
+};
+
+static const struct opt_map opt_map[] = {
+  /* These flags are parsed by mount, not lustre */
+  { "defaults", 0, 0, 0         },      /* default options */
+  { "rw",       1, 1, MS_RDONLY },      /* read-write */
+  { "ro",       0, 0, MS_RDONLY },      /* read-only */
+  { "exec",     0, 1, MS_NOEXEC },      /* permit execution of binaries */
+  { "noexec",   0, 0, MS_NOEXEC },      /* don't execute binaries */
+  { "suid",     0, 1, MS_NOSUID },      /* honor suid executables */
+  { "nosuid",   0, 0, MS_NOSUID },      /* don't honor suid executables */
+  { "dev",      0, 1, MS_NODEV  },      /* interpret device files  */
+  { "nodev",    0, 0, MS_NODEV  },      /* don't interpret devices */
+  { "async",    0, 1, MS_SYNCHRONOUS},  /* asynchronous I/O */
+  { "auto",     0, 0, 0         },      /* Can be mounted using -a */
+  { "noauto",   0, 0, 0         },      /* Can only be mounted explicitly */
+  { "nousers",  0, 1, 0         },      /* Forbid ordinary user to mount */
+  { "nouser",   0, 1, 0         },      /* Forbid ordinary user to mount */
+  { "noowner",  0, 1, 0         },      /* Device owner has no special privs */
+  { "_netdev",  0, 0, 0         },      /* Device accessible only via network */
+  /* These strings are passed through and parsed in lustre ll_options */
+  { "flock",    0, 0, 0         },      /* Enable flock support */
+  { "noflock",  1, 1, 0         },      /* Disable flock support */
+  { "user_xattr",   0, 0, 0     },      /* Enable get/set user xattr */
+  { "nouser_xattr", 1, 1, 0     },      /* Disable user xattr */
+  { "acl",      0, 0, 0         },      /* Enable ACL support */
+  { "noacl",    1, 1, 0         },      /* Disable ACL support */
+  { "nosvc",    0, 0, 0         },      /* Only start MGS/MGC, nothing else */
+  { "exclude",  0, 0, 0         },      /* OST exclusion list */
+  { NULL,       0, 0, 0         }
+};
+/****************************************************************************/
+
+/* 1  = found, flag set
+   0  = found, no flag set
+   -1 = not found in above list */
+static int parse_one_option(const char *check, int *flagp)
+{
+        const struct opt_map *opt;
+
+        for (opt = &opt_map[0]; opt->opt != NULL; opt++) {
+                if (strncmp(check, opt->opt, strlen(opt->opt)) == 0) {
+                        if (!opt->mask) 
+                                return 0;
+                        if (opt->inv)
+                                *flagp &= ~(opt->mask);
+                        else
+                                *flagp |= opt->mask;
+                        return 1;
+                }
+        }
+        fprintf(stderr, "%s: ignoring unknown option '%s'\n", progname,
+                check);
+        return -1;
+}
+
+int parse_options(char *orig_options, int *flagp)
+{
+        char *options, *opt, *nextopt;
+
+        options = calloc(strlen(orig_options) + 1, 1);
+        *flagp = 0;
+        nextopt = orig_options;
+        while ((opt = strsep(&nextopt, ","))) {
+                if (!*opt) 
+                        /* empty option */
+                        continue;
+                if (parse_one_option(opt, flagp) == 0) {
+                        /* no mount flags set, so pass this on as an option */
+                        if (*options)
+                                strcat(options, ",");
+                        strcat(options, opt);
+                }
+        }
+        /* options will always be <= orig_options */
+        strcpy(orig_options, options);
+        free(options);
+        return 0;
+}
+
+
+int main(int argc, char *const argv[])
+{
+        char default_options[] = "";
+        char *source, *target, *options = default_options, *optcopy;
+        int i, nargs = 3, opt, rc, flags, optlen;
+        static struct option long_opt[] = {
+                {"fake", 0, 0, 'f'},
+                {"force", 0, 0, 1},
+                {"help", 0, 0, 'h'},
+                {"nomtab", 0, 0, 'n'},
+                {"options", 1, 0, 'o'},
+                {"verbose", 0, 0, 'v'},
+                {0, 0, 0, 0}
+        };
+
+        progname = strrchr(argv[0], '/');
+        progname = progname ? progname + 1 : argv[0];
+
+        while ((opt = getopt_long(argc, argv, "fhno:v",
+                                  long_opt, NULL)) != EOF){
+                switch (opt) {
+                case 1:
+                        ++force;
+                        printf("force: %d\n", force);
+                        nargs++;
+                        break;
+                case 'f':
+                        ++fake;
+                        printf("fake: %d\n", fake);
+                        nargs++;
+                        break;
+                case 'h':
+                        usage(stdout);
+                        break;
+                case 'n':
+                        ++nomtab;
+                        printf("nomtab: %d\n", nomtab);
+                        nargs++;
+                        break;
+                case 'o':
+                        options = optarg;
+                        nargs++;
+                        break;
+                case 'v':
+                        ++verbose;
+                        printf("verbose: %d\n", verbose);
+                        nargs++;
+                        break;
+                default:
+                        fprintf(stderr, "%s: unknown option '%c'\n",
+                                progname, opt);
+                        usage(stderr);
+                        break;
+                }
+        }
+
+        if (optind + 2 > argc) {
+                fprintf(stderr, "%s: too few arguments\n", progname);
+                usage(stderr);
+        }
+
+        source = convert_hostnames(argv[optind]);
+        target = argv[optind + 1];
+
+        if (!source) {
+                usage(stderr);
+        }
+
+        if (verbose > 1) {
+                for (i = 0; i < argc; i++)
+                        printf("arg[%d] = %s\n", i, argv[i]);
+                printf("source = %s, target = %s\n", source, target);
+        }
+
+        if (!force && check_mtab_entry(source, target, "lustre"))
+                return(EEXIST);
+
+        rc = parse_options(options, &flags); 
+        if (rc) {
+                fprintf(stderr, "%s: can't parse options: %s\n",
+                        progname, options);
+                return(EINVAL);
+        }
+
+        rc = access(target, F_OK);
+        if (rc) {
+                rc = errno;
+                fprintf(stderr, "%s: %s inaccessible: %s\n", progname, target,
+                        strerror(errno));
+                return rc;
+        }
+
+        /* In Linux 2.4, the target device doesn't get passed to any of our
+           functions.  So we'll stick it on the end of the options. */
+        optlen = strlen(options) + strlen(",device=") + strlen(source) + 1;
+        optcopy = malloc(optlen);
+        strcpy(optcopy, options);
+        if (*optcopy)
+                strcat(optcopy, ",");
+        strcat(optcopy, "device=");
+        strcat(optcopy, source);
+
+        if (verbose) 
+                printf("mounting device %s at %s, flags=%#x options=%s\n",
+                       source, target, flags, optcopy);
+        
+        if (!fake)
+                /* flags and target get to lustre_get_sb, but not 
+                   lustre_fill_super.  Lustre ignores the flags, but mount 
+                   does not. */
+                rc = mount(source, target, "lustre", flags, (void *)optcopy);
+
+        if (rc) {
+                fprintf(stderr, "%s: mount(%s, %s) failed: %s\n", progname, 
+                        source, target, strerror(errno));
+                if (errno == ENODEV)
+                        fprintf(stderr, "Are the lustre modules loaded?\n"
+                             "Check /etc/modules.conf and /proc/filesystems\n");
+                if (errno == ENOTBLK)
+                        fprintf(stderr,"Does this filesystem have any OSTs?\n");
+                if (errno == ENOENT)
+                        fprintf(stderr,"Is the MGS specification correct? "
+                                "(%s)\n", source);
+                if (errno == EALREADY)
+                        fprintf(stderr,"The target service is already running. "
+                                "(%s)\n", source);
+                if (errno == ENXIO)
+                        fprintf(stderr,"The target service failed to start "
+                                "(bad config log?) (%s)\n", source);
+                if (errno == EIO)
+                        fprintf(stderr,"Is the MGS running? (%s)\n", source);
+                if (errno == EADDRINUSE)
+                        fprintf(stderr,"The target service's index is already "
+                                "in use. (%s)\n", source);
+                rc = errno;
+        } else if (!nomtab) {
+                rc = update_mtab_entry(source, target, "lustre", options,0,0,0);
+        }
+
+        free(optcopy);
+        free(source);
+        return rc;
+}
index 3072e1d..85ee351 100644 (file)
@@ -130,19 +130,6 @@ do {                                                                    \
         }                                                               \
 } while (0)
 
-int obd_record(enum cfg_record_type type, int len, void *ptr)
-{
-        struct obd_ioctl_data data;
-
-        IOC_INIT(data);
-        data.ioc_type = type;
-        data.ioc_plen1 = len;
-        data.ioc_pbuf1 = ptr;
-        IOC_PACK("obd_record", data);
-
-        return  l_ioctl(OBD_DEV_ID, OBD_IOC_DORECORD, &data);
-}
-
 int lcfg_ioctl(char * func, int dev_id, struct lustre_cfg *lcfg)
 {
         int opc;
@@ -166,6 +153,45 @@ int lcfg_ioctl(char * func, int dev_id, struct lustre_cfg *lcfg)
         return rc;
 }
 
+int lcfg_mgs_ioctl(char *func, int dev_id, struct lustre_cfg *lcfg)
+{
+        struct obd_ioctl_data data;
+        static int mgs_device = -1;
+        int rc;
+
+        /* Always operates on MGS dev */
+        if (mgs_device == -1) {
+                static int do_device(char *func, char *devname);
+                do_disconnect(NULL, 1);
+                rc = do_device("mgsioc", "MGS");
+                if (rc) {
+                        errno = ENODEV;
+                        return -1;
+                }
+                mgs_device = cur_device;
+        }
+
+        IOC_INIT(data);
+        data.ioc_dev = mgs_device;
+        data.ioc_type = LUSTRE_CFG_TYPE;
+        data.ioc_plen1 = lustre_cfg_len(lcfg->lcfg_bufcount,
+                                        lcfg->lcfg_buflens);
+        data.ioc_pbuf1 = (void *)lcfg;
+        IOC_PACK(func, data);
+
+        rc =  l_ioctl(dev_id, OBD_IOC_PARAM, buf);
+
+        if (rc == ENODEV) 
+                fprintf(stderr, "Is the MGS running on this node?\n");
+        if (rc == ENOSYS) 
+                fprintf(stderr, "Make sure cfg_device is set first.\n");
+        if (rc == EINVAL) 
+                fprintf(stderr, "cfg_device should be of the form "
+                        "'lustre-MDT0000'\n");
+
+        return rc;
+}
+
 char *obdo_print(struct obdo *obd)
 {
         char buf[1024];
index 7a7d43c..afbfb52 100644 (file)
@@ -69,6 +69,7 @@ int jt_llog_remove(int argc, char **argv);
 int jt_llog_check(int argc, char **argv);
 
 int lcfg_ioctl(char * func, int dev_id, struct lustre_cfg *lcfg);
+int lcfg_mgs_ioctl(char *func, int dev_id, struct lustre_cfg *lcfg);
 int parse_devname(char *func, char *name);
 char *jt_cmdname(char *func);
 
@@ -88,6 +89,8 @@ int jt_lcfg_set_timeout(int argc, char **argv);
 int jt_lcfg_set_lustre_upcall(int argc, char **argv);
 int jt_lcfg_add_conn(int argc, char **argv);
 int jt_lcfg_del_conn(int argc, char **argv);
+int jt_lcfg_param(int argc, char **argv);
+int jt_lcfg_mgsparam(int argc, char **argv);
 
 int obd_add_uuid(char *uuid, lnet_nid_t nid);
 
index 9ae82bb..0e1726f 100755 (executable)
@@ -3,6 +3,6 @@
 SRCDIR=`dirname $0`
 PATH=$PWD/$SRCDIR:$SRCDIR:$SRCDIR/../utils:$PATH
 
-lctl modules | awk '{ print $2 }' | xargs rmmod >/dev/null 2>&1 
+lctl modules | awk '{ print $2 }' | xargs rmmod >/dev/null 2>&1
 # do it again, in case we tried to unload ksocklnd too early
-lctl modules | awk '{ print $2 }' | xargs rmmod
+lsmod | grep lnet > /dev/null && lctl modules | awk '{ print $2 }' | xargs rmmod
index 27b12f7..5a1f55a 100644 (file)
@@ -981,6 +981,12 @@ main(int argc, char **argv)
         CHECK_VALUE(REINT_OPEN);
         CHECK_VALUE(REINT_MAX);
 
+        CHECK_VALUE(MGS_CONNECT);
+        CHECK_VALUE(MGS_DISCONNECT);
+        CHECK_VALUE(MGS_EXCEPTION);
+        CHECK_VALUE(MGS_TARGET_REG);
+        CHECK_VALUE(MGS_TARGET_DEL);
+
         CHECK_VALUE(DISP_IT_EXECD);
         CHECK_VALUE(DISP_LOOKUP_EXECD);
         CHECK_VALUE(DISP_LOOKUP_NEG);
index dd8664b..f2b33f8 100644 (file)
@@ -15,12 +15,12 @@ void lustre_assert_wire_constants(void);
 
 int main()
 {
-       lustre_assert_wire_constants();
+        lustre_assert_wire_constants();
 
-       if (ret == 0)
-               printf("wire constants OK\n");
+        if (ret == 0)
+                printf("wire constants OK\n");
 
-       return ret;
+        return ret;
 }
 
 void lustre_assert_wire_constants(void)
@@ -159,6 +159,16 @@ void lustre_assert_wire_constants(void)
                  (long long)MDS_STATUS_CONN);
         LASSERTF(MDS_STATUS_LOV == 2, " found %lld\n",
                  (long long)MDS_STATUS_LOV);
+        LASSERTF(MGS_CONNECT == 250, " found %lld\n",
+                 (long long)MGS_CONNECT);
+        LASSERTF(MGS_DISCONNECT == 251, " found %lld\n",
+                 (long long)MGS_DISCONNECT);
+        LASSERTF(MGS_EXCEPTION == 252, " found %lld\n",
+                 (long long)MGS_EXCEPTION);
+        LASSERTF(MGS_TARGET_REG == 253, " found %lld\n",
+                 (long long)MGS_TARGET_REG);
+        LASSERTF(MGS_TARGET_DEL == 254, " found %lld\n",
+                 (long long)MGS_TARGET_DEL);
         LASSERTF(LDLM_ENQUEUE == 101, " found %lld\n",
                  (long long)LDLM_ENQUEUE);
         LASSERTF(LDLM_CONVERT == 102, " found %lld\n",