From: nathan Date: Thu, 27 Apr 2006 23:51:36 +0000 (+0000) Subject: b=8341 X-Git-Tag: v1_7_100~1^90~8^2~260 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=bdb937f576e372388c9e67fae7fd16543d6d8e14;p=fs%2Flustre-release.git b=8341 Land mountconf on b1_5 --- diff --git a/lustre/Makefile.in b/lustre/Makefile.in index 6da79a1..1b7a9be 100644 --- a/lustre/Makefile.in +++ b/lustre/Makefile.in @@ -6,9 +6,10 @@ subdir-m += lov subdir-m += ptlrpc subdir-m += osc subdir-m += obdecho +subdir-m += mgc -@SERVER_TRUE@subdir-m += mds obdfilter ost -@CLIENT_TRUE@subdir-m += mdc llite +@SERVER_TRUE@subdir-m += mds obdfilter ost mgs +@CLIENT_TRUE@subdir-m += mdc llite @QUOTA_TRUE@subdir-m += quota @INCLUDE_RULES@ diff --git a/lustre/autoMakefile.am b/lustre/autoMakefile.am index 27db5ce..368c081 100644 --- a/lustre/autoMakefile.am +++ b/lustre/autoMakefile.am @@ -6,9 +6,9 @@ AUTOMAKE_OPTIONS = foreign ALWAYS_SUBDIRS := include lvfs obdclass ldlm ptlrpc osc lov obdecho \ - doc utils tests conf scripts autoconf + mgc doc utils tests conf scripts autoconf -SERVER_SUBDIRS := ldiskfs obdfilter ost mds +SERVER_SUBDIRS := ldiskfs obdfilter ost mds mgs CLIENT_SUBDIRS := mdc llite diff --git a/lustre/autoconf/lustre-core.m4 b/lustre/autoconf/lustre-core.m4 index 638763a..d695f43 100644 --- a/lustre/autoconf/lustre-core.m4 +++ b/lustre/autoconf/lustre-core.m4 @@ -702,6 +702,10 @@ lustre/osc/Makefile lustre/osc/autoMakefile lustre/ost/Makefile lustre/ost/autoMakefile +lustre/mgc/Makefile +lustre/mgc/autoMakefile +lustre/mgs/Makefile +lustre/mgs/autoMakefile lustre/ptlrpc/Makefile lustre/ptlrpc/autoMakefile lustre/quota/Makefile diff --git a/lustre/include/liblustre.h b/lustre/include/liblustre.h index d35d750..19fad73 100644 --- a/lustre/include/liblustre.h +++ b/lustre/include/liblustre.h @@ -318,13 +318,15 @@ static inline void spin_unlock_irqrestore(spinlock_t *a, unsigned long b) {} #ifndef min_t #define min_t(type,x,y) \ - ({ type __x = (x); type __y = (y); __x < __y ? 
__x: __y; }) + ({ type __x = (x); type __y = (y); __x < __y ? __x: __y; }) #endif #ifndef max_t #define max_t(type,x,y) \ - ({ type __x = (x); type __y = (y); __x > __y ? __x: __y; }) + ({ type __x = (x); type __y = (y); __x > __y ? __x: __y; }) #endif +#define simple_strtol strtol + /* registering symbols */ #ifndef ERESTARTSYS #define ERESTARTSYS ERESTART @@ -664,7 +666,7 @@ static inline int schedule_timeout(signed long t) }) #define time_after(a, b) ((long)(b) - (long)(a) < 0) #define time_before(a, b) time_after(b,a) -#define time_after_eq(a,b) ((long)(a) - (long)(b) >= 0) +#define time_after_eq(a,b) ((long)(a) - (long)(b) >= 0) struct timer_list { struct list_head tl_list; diff --git a/lustre/include/linux/Makefile.am b/lustre/include/linux/Makefile.am index 1f6af34..1d58500 100644 --- a/lustre/include/linux/Makefile.am +++ b/lustre/include/linux/Makefile.am @@ -13,4 +13,5 @@ EXTRA_DIST = lprocfs_status.h lustre_debug.h lustre_lib.h \ lustre_dlm.h lustre_handles.h lustre_net.h obd_class.h obd_support.h \ lustre_log.h lustre_compat25.h lustre_fsfilt.h lustre_mds.h obd.h \ lvfs.h lvfs_linux.h lustre_lite.h lustre_quota.h \ - lustre_disk.h lustre_user.h lustre_types.h + lustre_disk.h lustre_user.h lustre_types.h lustre_param.h + diff --git a/lustre/include/linux/lustre_disk.h b/lustre/include/linux/lustre_disk.h deleted file mode 100644 index 43cfba2..0000000 --- a/lustre/include/linux/lustre_disk.h +++ /dev/null @@ -1,55 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * This file is part of Lustre, http://www.lustre.org - * - * Lustre disk format definitions. 
- */ -#ifndef _LUSTRE_DISK_H -#define _LUSTRE_DISK_H_ - -#include - -#include - -/****************** last_rcvd file *********************/ - -#define LAST_RCVD "last_rcvd" -#define LOV_OBJID "lov_objid" - -#define LR_SERVER_SIZE 512 -#define LR_CLIENT_START 8192 -#define LR_CLIENT_SIZE 128 -#if LR_CLIENT_START < LR_SERVER_SIZE -#error "Can't have LR_CLIENT_START < LR_SERVER_SIZE" -#endif -/* This limit is arbitrary (32k clients on x86), but it is convenient to use - * 2^n * PAGE_SIZE * 8 for the number of bits that fit an order-n allocation. */ -#define LR_MAX_CLIENTS (PAGE_SIZE * 8) - -#define OBD_COMPAT_OST 0x00000002 /* this is an OST (temporary) */ -#define OBD_COMPAT_MDT 0x00000004 /* this is an MDT (temporary) */ - -#define OBD_ROCOMPAT_LOVOBJID 0x00000001 /* MDS handles LOV_OBJID file */ -#define OBD_ROCOMPAT_CROW 0x00000002 /* OST will CROW create objects */ - -#define OBD_INCOMPAT_GROUPS 0x00000001 /* OST handles group subdirs */ -#define OBD_INCOMPAT_OST 0x00000002 /* this is an OST (permanent) */ -#define OBD_INCOMPAT_MDT 0x00000004 /* this is an MDT (permanent) */ - -/* Data stored per client in the last_rcvd file. In le32 order. */ -struct lsd_client_data { - __u8 lcd_uuid[40]; /* client UUID */ - __u64 lcd_last_transno; /* last completed transaction ID */ - __u64 lcd_last_xid; /* xid for the last transaction */ - __u32 lcd_last_result; /* result from last RPC */ - __u32 lcd_last_data; /* per-op data (disposition for open &c.) 
*/ - /* for MDS_CLOSE requests */ - __u64 lcd_last_close_transno; /* last completed transaction ID */ - __u64 lcd_last_close_xid; /* xid for the last transaction */ - __u32 lcd_last_close_result; /* result from last RPC */ - __u32 lcd_last_close_data; /* per-op data */ - __u8 lcd_padding[LR_CLIENT_SIZE - 88]; -}; - -#endif /* _LUSTRE_DISK_H_ */ diff --git a/lustre/include/linux/lustre_fsfilt.h b/lustre/include/linux/lustre_fsfilt.h index 7bc0602..84e9af9 100644 --- a/lustre/include/linux/lustre_fsfilt.h +++ b/lustre/include/linux/lustre_fsfilt.h @@ -49,7 +49,8 @@ struct fsfilt_operations { struct list_head fs_list; struct module *fs_owner; char *fs_type; - char *(* fs_label)(struct super_block *sb); + char *(* fs_getlabel)(struct super_block *sb); + int (* fs_setlabel)(struct super_block *sb, char *label); char *(* fs_uuid)(struct super_block *sb); void *(* fs_start)(struct inode *inode, int op, void *desc_private, int logs); @@ -113,14 +114,23 @@ extern void fsfilt_unregister_ops(struct fsfilt_operations *fs_ops); extern struct fsfilt_operations *fsfilt_get_ops(const char *type); extern void fsfilt_put_ops(struct fsfilt_operations *fs_ops); -static inline char *fsfilt_label(struct obd_device *obd, struct super_block *sb) +static inline char *fsfilt_get_label(struct obd_device *obd, + struct super_block *sb) { - if (obd->obd_fsops->fs_label == NULL) + if (obd->obd_fsops->fs_getlabel == NULL) return NULL; - if (obd->obd_fsops->fs_label(sb)[0] == '\0') + if (obd->obd_fsops->fs_getlabel(sb)[0] == '\0') return NULL; - return obd->obd_fsops->fs_label(sb); + return obd->obd_fsops->fs_getlabel(sb); +} + +static inline int fsfilt_set_label(struct obd_device *obd, + struct super_block *sb, char *label) +{ + if (obd->obd_fsops->fs_setlabel == NULL) + return -ENOSYS; + return (obd->obd_fsops->fs_setlabel(sb, label)); } static inline __u8 *fsfilt_uuid(struct obd_device *obd, struct super_block *sb) diff --git a/lustre/include/lustre/liblustreapi.h 
b/lustre/include/lustre/liblustreapi.h index 557c3ab..08f8786 100644 --- a/lustre/include/lustre/liblustreapi.h +++ b/lustre/include/lustre/liblustreapi.h @@ -23,7 +23,7 @@ extern int llapi_ping(char *obd_type, char *obd_name); extern int llapi_target_check(int num_types, char **obd_types, char *dir); extern int llapi_catinfo(char *dir, char *keyword, char *node_name); extern int llapi_lov_get_uuids(int fd, struct obd_uuid *uuidp, int *ost_count); -extern int llapi_is_lustre_mnttype(char *type); +extern int llapi_is_lustre_mnttype(struct mntent *mnt); extern int llapi_quotachown(char *path, int flag); extern int llapi_quotacheck(char *mnt, int check_type); extern int llapi_poll_quotacheck(char *mnt, struct if_quotacheck *qchk); diff --git a/lustre/include/lustre/lustre_idl.h b/lustre/include/lustre/lustre_idl.h index 438402c..d7322cf 100644 --- a/lustre/include/lustre/lustre_idl.h +++ b/lustre/include/lustre/lustre_idl.h @@ -43,6 +43,8 @@ #error Unsupported operating system. #endif +#include /* for lnet_nid_t */ + /* Defn's shared with user-space. 
*/ #include @@ -86,6 +88,9 @@ #define MDS_SETATTR_PORTAL 22 #define MDS_READPAGE_PORTAL 23 +#define MGC_REPLY_PORTAL 25 +#define MGS_REQUEST_PORTAL 26 +#define MGS_REPLY_PORTAL 27 #define OST_REQUEST_PORTAL 28 #define SVC_KILLED 1 @@ -110,6 +115,8 @@ #define LUSTRE_OST_VERSION 0x00030000 #define LUSTRE_DLM_VERSION 0x00040000 #define LUSTRE_LOG_VERSION 0x00050000 +#define LUSTRE_MGS_VERSION 0x00060000 + struct lustre_handle { __u64 cookie; @@ -225,6 +232,8 @@ static inline void lustre_msg_set_op_flags(struct lustre_msg *msg, int flags) #define OBD_CONNECT_IBITS 0x1000ULL /* support for inodebits locks */ #define OBD_CONNECT_JOIN 0x2000ULL /* files can be concatenated */ #define OBD_CONNECT_NODEVOH 0x8000ULL /* No open handle for special nodes */ +#define OBD_CONNECT_EMPTY 0x80000000ULL /* fake: these are empty connect flags*/ + /* also update obd_connect_names[] for lprocfs_rd_connect_flags() */ #define MDS_CONNECT_SUPPORTED (OBD_CONNECT_RDONLY | OBD_CONNECT_VERSION | \ @@ -235,6 +244,7 @@ static inline void lustre_msg_set_op_flags(struct lustre_msg *msg, int flags) OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \ OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_INDEX) #define ECHO_CONNECT_SUPPORTED (0) +#define MGS_CONNECT_SUPPORTED (OBD_CONNECT_VERSION) #define OBD_OCD_VERSION(major,minor,patch,fix) (((major)<<24) + ((minor)<<16) +\ ((patch)<<8) + (fix)) @@ -964,6 +974,57 @@ struct ldlm_reply { extern void lustre_swab_ldlm_reply (struct ldlm_reply *r); + +/* + * Opcodes for mountconf (mgs and mgc) + */ +typedef enum { + MGS_CONNECT = 250, + MGS_DISCONNECT, + MGS_EXCEPTION, /* node died, etc. */ + MGS_TARGET_REG, /* whenever target starts up */ + MGS_TARGET_DEL, + MGS_LAST_OPC +} mgs_cmd_t; + +#define MTI_NAME_MAXLEN 64 +#define MTI_UUID_MAXLEN MTI_NAME_MAXLEN + 5 +/* each host can have multiple nids, and multiple failover hosts, and I don't + want to run out of room... 
*/ +#define MTI_NIDS_MAX 64 /* match lustre_disk.h */ + +struct mgs_target_info { + char mti_fsname[MTI_NAME_MAXLEN]; + char mti_svname[MTI_NAME_MAXLEN]; + char mti_uuid[sizeof(struct obd_uuid)]; + lnet_nid_t mti_nids[MTI_NIDS_MAX]; /* host nids */ + lnet_nid_t mti_failnids[MTI_NIDS_MAX]; /* partner nids */ + __u16 mti_failnodes[8]; /* last nid index of each partner */ + __u32 mti_stripe_index; + __u32 mti_nid_count; + __u32 mti_failnid_count; + __u32 mti_config_ver; + __u32 mti_flags; + char mti_params[2048]; +}; + +extern void lustre_swab_mgs_target_info(struct mgs_target_info *oinfo); + +#define CM_START 0x01 +#define CM_END 0x02 +#define CM_SKIP 0x04 +#define CM_UPGRADE146 0x08 +#define CM_START_SKIP (CM_START | CM_SKIP) + +struct cfg_marker { + __u32 cm_step; /* aka config version */ + __u32 cm_flags; + time_t cm_createtime; /*when this record was first created */ + time_t cm_canceltime; /*when this record is no longer valid*/ + char cm_svname[16]; + char cm_comment[40]; +}; + /* * Opcodes for multiple servers. 
*/ diff --git a/lustre/include/lustre/lustre_user.h b/lustre/include/lustre/lustre_user.h index 1d226ea..a9b9812 100644 --- a/lustre/include/lustre/lustre_user.h +++ b/lustre/include/lustre/lustre_user.h @@ -128,6 +128,19 @@ static inline void obd_str2uuid(struct obd_uuid *uuid, char *tmp) uuid->uuid[sizeof(*uuid) - 1] = '\0'; } +/* For printf's only, make sure uuid is terminated */ +static inline char *obd_uuid2str(struct obd_uuid *uuid) +{ + if (uuid->uuid[sizeof(*uuid) - 1] != '\0') { + /* Obviously not safe, but for printfs, no real harm done...*/ + static char temp[sizeof(*uuid)]; + memcpy(temp, uuid->uuid, sizeof(*uuid)); + temp[sizeof(*uuid) - 1] = '\0'; + return temp; + } + return (char *)(uuid->uuid); +} + #define LUSTRE_Q_QUOTAON 0x800002 /* turn quotas on */ #define LUSTRE_Q_QUOTAOFF 0x800003 /* turn quotas off */ #define LUSTRE_Q_GETINFO 0x800005 /* get information about quota files */ diff --git a/lustre/include/lustre_cfg.h b/lustre/include/lustre_cfg.h index cd13b97..c75b49d 100644 --- a/lustre/include/lustre_cfg.h +++ b/lustre/include/lustre_cfg.h @@ -33,6 +33,9 @@ #define LCFG_HDR_SIZE(count) \ size_round(offsetof (struct lustre_cfg, lcfg_buflens[(count)])) +/* If not LCFG_REQUIRED, we can ignore this cmd and go on. */ +#define LCFG_REQUIRED 0x0001000 + enum lcfg_command_type { LCFG_ATTACH = 0x00cf001, LCFG_DETACH = 0x00cf002, @@ -48,8 +51,11 @@ enum lcfg_command_type { LCFG_DEL_CONN = 0x00cf00c, LCFG_LOV_ADD_OBD = 0x00cf00d, LCFG_LOV_DEL_OBD = 0x00cf00e, - LCFG_PARAM = 0x00cf00f, - LCFG_MARKER = 0x00cf010 + LCFG_PARAM = 0x00ce00f, + LCFG_MARKER = 0x00ce010, + LCFG_LOG_START = 0x00ce011, + LCFG_LOG_END = 0x00ce012, + LCFG_LOV_ADD_INA = 0x00ce013, }; struct lustre_cfg_bufs { @@ -151,9 +157,14 @@ static inline char *lustre_cfg_string(struct lustre_cfg *lcfg, int index) return NULL; /* make sure it's NULL terminated, even if this kills a char - * of data + * of data. Try to use the padding first though. 
*/ - s[lcfg->lcfg_buflens[index] - 1] = '\0'; + if (s[lcfg->lcfg_buflens[index] - 1] != '\0') { + int last = min((int)lcfg->lcfg_buflens[index], + size_round(lcfg->lcfg_buflens[index]) - 1); + s[last] = '\0'; + CWARN("Truncating buf %d to '%s'\n", index, s); + } return s; } @@ -223,6 +234,7 @@ static inline int lustre_cfg_sanity_check(void *buf, int len) if (lcfg->lcfg_version != LUSTRE_CFG_VERSION) RETURN(-EINVAL); + if (lcfg->lcfg_bufcount >= LUSTRE_CFG_MAX_BUFCOUNT) RETURN(-EINVAL); @@ -237,50 +249,4 @@ static inline int lustre_cfg_sanity_check(void *buf, int len) RETURN(0); } - -#define LMD_MAGIC 0xbdacbd03 -#define LMD_MAGIC_MASK (0xffffff00 & LMD_MAGIC) - -#define lmd_bad_magic(LMDP) \ -({ \ - struct lustre_mount_data *_lmd__ = (LMDP); \ - int _ret__ = 0; \ - if (!_lmd__) { \ - LCONSOLE_ERROR("Missing mount data: " \ - "check that /sbin/mount.lustre is installed.\n");\ - _ret__ = 1; \ - } else if (_lmd__->lmd_magic == LMD_MAGIC) { \ - _ret__ = 0; \ - } else if ((_lmd__->lmd_magic & LMD_MAGIC_MASK) == LMD_MAGIC_MASK) { \ - LCONSOLE_ERROR("You're using an old version of " \ - "/sbin/mount.lustre. 
Please install version " \ - "1.%d\n", LMD_MAGIC & 0xFF); \ - _ret__ = 1; \ - } else { \ - LCONSOLE_ERROR("Invalid mount data (%#x != %#x): " \ - "check that /sbin/mount.lustre is installed\n", \ - _lmd__->lmd_magic, LMD_MAGIC); \ - _ret__ = 1; \ - } \ - _ret__; \ -}) - -#define MAX_FAILOVER_NIDS 10 - -/* Passed by mount */ -/* Any changes in the alignment of elements in this stuct require a change to - LMD_MAGIC */ -struct lustre_mount_data { - uint32_t lmd_magic; - uint32_t lmd_flags; - uint16_t lmd_nid_count; /* how many failover nids we have for the MDS */ - lnet_nid_t lmd_nid[MAX_FAILOVER_NIDS]; - char lmd_mds[64]; - char lmd_profile[64]; -}; - -#define LMD_FLG_FLOCK 0x0001 -#define LMD_FLG_USER_XATTR 0x0002 -#define LMD_FLG_ACL 0x0004 - #endif // _LUSTRE_CFG_H diff --git a/lustre/include/lustre_disk.h b/lustre/include/lustre_disk.h new file mode 100644 index 0000000..8430107 --- /dev/null +++ b/lustre/include/lustre_disk.h @@ -0,0 +1,307 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001 Cluster File Systems, Inc. + * Author: Nathan Rutman + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * + * Lustre disk format definitions. 
+ */ + +#ifndef _LUSTRE_DISK_H +#define _LUSTRE_DISK_H + +#include +#include + + +/****************** persistent mount data *********************/ + +/* Persistent mount data are stored on the disk in this file. + Used before the setup llog can be read. */ +#define MOUNT_CONFIGS_DIR "CONFIGS" +#define MOUNT_DATA_FILE MOUNT_CONFIGS_DIR"/mountdata" +#define MDT_LOGS_DIR "LOGS" /* COMPAT_146 */ + +#define LDD_F_SV_TYPE_MDT 0x0001 +#define LDD_F_SV_TYPE_OST 0x0002 +#define LDD_F_SV_TYPE_MGS 0x0004 +#define LDD_F_NEED_INDEX 0x0010 /* need an index assignment */ +#define LDD_F_VIRGIN 0x0020 /* never registered */ +#define LDD_F_UPDATE 0x0040 /* update the config logs for this server*/ +#define LDD_F_REWRITE_LDD 0x0080 /* rewrite the LDD */ +#define LDD_F_WRITECONF 0x0100 /* regenerate all logs for this fs */ +#define LDD_F_UPGRADE14 0x0200 /* COMPAT_14 */ +#define MTI_F_IOCTL 0x0400 /* only used in mti */ + + +enum ldd_mount_type { + LDD_MT_EXT3 = 0, + LDD_MT_LDISKFS, + LDD_MT_SMFS, + LDD_MT_REISERFS, + LDD_MT_LAST +}; + +static inline char *mt_str(enum ldd_mount_type mt) +{ + static char *mount_type_string[] = { + "ext3", + "ldiskfs", + "smfs", + "reiserfs", + }; + //LASSERT(mt < LDD_MT_LAST); + return mount_type_string[mt]; +} + +#ifndef MTI_NIDS_MAX /* match lustre_idl.h */ +#define MTI_NIDS_MAX 64 +#endif + +#define LDD_INCOMPAT_SUPP 0 +#define LDD_ROCOMPAT_SUPP 0 + +#define LDD_MAGIC 0x1dd00001 + +/* FIXME does on-disk ldd have to be a fixed endianness? 
(like last_rcvd) */ +struct lustre_disk_data { + __u32 ldd_magic; + __u32 ldd_feature_compat; /* compatible feature flags */ + __u32 ldd_feature_rocompat;/* read-only compatible feature flags */ + __u32 ldd_feature_incompat;/* incompatible feature flags */ + + __u32 ldd_config_ver; /* config rewrite count - not used */ + __u32 ldd_flags; /* LDD_SV_TYPE */ + __u32 ldd_svindex; /* server index (0001), must match + svname */ + __u32 ldd_mount_type; /* target fs type LDD_MT_* */ + char ldd_fsname[64]; /* filesystem this server is part of */ + char ldd_svname[64]; /* this server's name (lustre-mdt0001)*/ + __u8 ldd_uuid[40]; /* server UUID (COMPAT_146) */ + +/*200*/ __u8 ldd_padding[4096 - 200]; +/*4096*/char ldd_mount_opts[4096]; /* target fs mount opts */ +/*8192*/char ldd_params[4096]; /* key=value pairs */ +}; + +#define IS_MDT(data) ((data)->ldd_flags & LDD_F_SV_TYPE_MDT) +#define IS_OST(data) ((data)->ldd_flags & LDD_F_SV_TYPE_OST) +#define IS_MGS(data) ((data)->ldd_flags & LDD_F_SV_TYPE_MGS) +#define MT_STR(data) mt_str((data)->ldd_mount_type) + +/* Make the mdt/ost server obd name based on the filesystem name */ +static inline int server_make_name(__u32 flags, __u16 index, char *fs, + char *name) +{ + if (flags & (LDD_F_SV_TYPE_MDT | LDD_F_SV_TYPE_OST)) { + sprintf(name, "%.8s-%s%04x", fs, + (flags & LDD_F_SV_TYPE_MDT) ? 
"MDT" : "OST", + index); + } else if (flags & LDD_F_SV_TYPE_MGS) { + sprintf(name, "MGS"); + } else { + CERROR("unknown server type %#x\n", flags); + return 1; + } + return 0; +} + +/* Get the index from the obd name */ +int server_name2index(char *svname, __u32 *idx, char **endptr); + + +/****************** mount command *********************/ + +/* The lmd is only used internally by Lustre; mount simply passes + everything as string options */ + +#define LMD_MAGIC 0xbdacbd03 + +/* gleaned from the mount command - no persistent info here */ +struct lustre_mount_data { + __u32 lmd_magic; + __u32 lmd_flags; /* lustre mount flags */ + int lmd_mgs_failnodes; /* mgs failover node count */ + int lmd_exclude_count; + char *lmd_dev; /* device name */ + char *lmd_profile; /* client only */ + char *lmd_opts; /* lustre mount options (as opposed to + _device_ mount options) */ + __u32 *lmd_exclude; /* array of OSTs to ignore */ +}; + +#define LMD_FLG_CLIENT 0x0002 /* Mounting a client only */ +#define LMD_FLG_RECOVER 0x0004 /* Allow recovery */ +#define LMD_FLG_NOSVC 0x0008 /* Only start MGS/MGC for servers, + no other services */ + +#define lmd_is_client(x) ((x)->lmd_flags & LMD_FLG_CLIENT) + +/****************** mkfs command *********************/ + +#define MO_IS_LOOP 0x01 +#define MO_FORCEFORMAT 0x02 + +/* used to describe the options to format the lustre disk, not persistent */ +struct mkfs_opts { + struct lustre_disk_data mo_ldd; /* to be written in MOUNT_DATA_FILE */ + char mo_mount_type_string[20]; /* "ext3", "ldiskfs", ... 
*/ + char mo_device[128]; /* disk device name */ + char mo_mkfsopts[128]; /* options to the backing-store mkfs */ + char mo_loopdev[128]; /* in case a loop dev is needed */ + __u64 mo_device_sz; /* in KB */ + int mo_stripe_count; + int mo_flags; + int mo_mgs_failnodes; +}; + +/****************** on-disk files *********************/ + +#define LAST_RCVD "last_rcvd" +#define LOV_OBJID "lov_objid" +#define HEALTH_CHECK "health_check" + +/****************** last_rcvd file *********************/ + +#define LR_SERVER_SIZE 512 +#define LR_CLIENT_START 8192 +#define LR_CLIENT_SIZE 128 +#if LR_CLIENT_START < LR_SERVER_SIZE +#error "Can't have LR_CLIENT_START < LR_SERVER_SIZE" +#endif +/* This limit is arbitrary (32k clients on x86), but it is convenient to use + * 2^n * PAGE_SIZE * 8 for the number of bits that fit an order-n allocation. */ +#define LR_MAX_CLIENTS (PAGE_SIZE * 8) + + +/* COMPAT_146 */ +#define OBD_COMPAT_OST 0x00000002 /* this is an OST (temporary) */ +#define OBD_COMPAT_MDT 0x00000004 /* this is an MDT (temporary) */ +/* end COMPAT_146 */ + +#define OBD_ROCOMPAT_LOVOBJID 0x00000001 /* MDS handles LOV_OBJID file */ +#define OBD_ROCOMPAT_CROW 0x00000002 /* OST will CROW create objects */ + +#define OBD_INCOMPAT_GROUPS 0x00000001 /* OST handles group subdirs */ +#define OBD_INCOMPAT_OST 0x00000002 /* this is an OST */ +#define OBD_INCOMPAT_MDT 0x00000004 /* this is an MDT */ +#define OBD_INCOMPAT_COMMON_LR 0x00000008 /* common last_rvcd format */ + + +/* Data stored per server at the head of the last_rcvd file. In le32 order. 
+ This should be common to filter_internal.h, lustre_mds.h */ +struct lr_server_data { + __u8 lsd_uuid[40]; /* server UUID */ + __u64 lsd_unused; /* was fsd_last_objid - don't use for now */ + __u64 lsd_last_transno; /* last completed transaction ID */ + __u64 lsd_mount_count; /* incarnation number */ + __u32 lsd_feature_compat; /* compatible feature flags */ + __u32 lsd_feature_rocompat;/* read-only compatible feature flags */ + __u32 lsd_feature_incompat;/* incompatible feature flags */ + __u32 lsd_server_size; /* size of server data area */ + __u32 lsd_client_start; /* start of per-client data area */ + __u16 lsd_client_size; /* size of per-client data area */ + __u16 lsd_subdir_count; /* number of subdirectories for objects */ + __u64 lsd_catalog_oid; /* recovery catalog object id */ + __u32 lsd_catalog_ogen; /* recovery catalog inode generation */ + __u8 lsd_peeruuid[40]; /* UUID of MDS associated with this OST */ + __u32 lsd_ost_index; /* index number of OST in LOV */ + __u32 lsd_mdt_index; /* index number of MDT in LMV */ + __u8 lsd_padding[LR_SERVER_SIZE - 148]; +}; + +/* Data stored per client in the last_rcvd file. In le32 order. */ +struct lsd_client_data { + __u8 lcd_uuid[40]; /* client UUID */ + __u64 lcd_last_transno; /* last completed transaction ID */ + __u64 lcd_last_xid; /* xid for the last transaction */ + __u32 lcd_last_result; /* result from last RPC */ + __u32 lcd_last_data; /* per-op data (disposition for open &c.) 
*/ + /* for MDS_CLOSE requests */ + __u64 lcd_last_close_transno; /* last completed transaction ID */ + __u64 lcd_last_close_xid; /* xid for the last transaction */ + __u32 lcd_last_close_result; /* result from last RPC */ + __u32 lcd_last_close_data; /* per-op data */ + __u8 lcd_padding[LR_CLIENT_SIZE - 88]; +}; + + +#ifdef __KERNEL__ +/****************** superblock additional info *********************/ +struct ll_sb_info; + +struct lustre_sb_info { + int lsi_flags; + struct obd_device *lsi_mgc; /* mgc obd */ + struct lustre_mount_data *lsi_lmd; /* mount command info */ + struct lustre_disk_data *lsi_ldd; /* mount info on-disk */ + struct ll_sb_info *lsi_llsbi; /* add'l client sbi info */ + struct vfsmount *lsi_srv_mnt; /* the one server mount */ + atomic_t lsi_mounts; /* references to the srv_mnt */ +}; + +#define LSI_SERVER 0x00000001 +#define LSI_UMOUNT_FORCE 0x00000010 +#define LSI_UMOUNT_FAILOVER 0x00000020 + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) +# define s2lsi(sb) ((struct lustre_sb_info *)((sb)->s_fs_info)) +# define s2lsi_nocast(sb) ((sb)->s_fs_info) +#else /* 2.4 here */ +# define s2lsi(sb) ((struct lustre_sb_info *)((sb)->u.generic_sbp)) +# define s2lsi_nocast(sb) ((sb)->u.generic_sbp) +#endif + +#define get_profile_name(sb) (s2lsi(sb)->lsi_lmd->lmd_profile) + +#endif /* __KERNEL__ */ + +/****************** mount lookup info *********************/ + +struct lustre_mount_info { + char *lmi_name; + struct super_block *lmi_sb; + struct vfsmount *lmi_mnt; + struct list_head lmi_list_chain; +}; + +/****************** prototypes *********************/ + +#ifdef __KERNEL__ +#include + +/* obd_mount.c */ +void lustre_register_client_fill_super(int (*cfs)(struct super_block *sb)); +int lustre_common_put_super(struct super_block *sb); +int lustre_process_log(struct super_block *sb, char *logname, + struct config_llog_instance *cfg); +int lustre_end_log(struct super_block *sb, char *logname, + struct config_llog_instance *cfg); +struct 
lustre_mount_info *server_get_mount(char *name); +int server_put_mount(char *name, struct vfsmount *mnt); +int server_register_target(struct super_block *sb); +struct mgs_target_info; +int server_mti_print(char *title, struct mgs_target_info *mti); + +/* mgc_request.c */ +int mgc_logname2resid(char *logname, struct ldlm_res_id *res_id); + +#endif + +#endif // _LUSTRE_DISK_H diff --git a/lustre/include/lustre_export.h b/lustre/include/lustre_export.h index a7f3a34..f6e3f36 100644 --- a/lustre/include/lustre_export.h +++ b/lustre/include/lustre_export.h @@ -8,6 +8,7 @@ #include #include +/* Data stored per client in the last_rcvd file. In le32 order. */ struct mds_client_data; struct mds_export_data { diff --git a/lustre/include/lustre_import.h b/lustre/include/lustre_import.h index d172dec..ff74277 100644 --- a/lustre/include/lustre_import.h +++ b/lustre/include/lustre_import.h @@ -86,11 +86,17 @@ struct obd_import { spinlock_t imp_lock; /* flags */ - unsigned int imp_invalid:1, imp_replayable:1, - imp_dlm_fake:1, imp_server_timeout:1, - imp_initial_recov:1, imp_initial_recov_bk:1, - imp_force_verify:1, imp_pingable:1, - imp_resend_replay:1, imp_deactive:1; + unsigned int + imp_invalid:1, /* evicted */ + imp_replayable:1, /* try to recover the import */ + imp_dlm_fake:1, /* don't run recovery (timeout instead) */ + imp_server_timeout:1, /* use 1/2 timeout on MDS' OSCs */ + imp_initial_recov:1, /* retry the initial connection */ + imp_initial_recov_bk:1, /* turn off init_recov after trying all failover nids */ + imp_force_verify:1, /* force an immidiate ping */ + imp_pingable:1, /* pingable */ + imp_resend_replay:1, /* resend for replay */ + imp_deactive:1; /* administratively disabled */ __u32 imp_connect_op; struct obd_connect_data imp_connect_data; __u64 imp_connect_flags_orig; diff --git a/lustre/include/lustre_lib.h b/lustre/include/lustre_lib.h index c98cfc8..d83db00 100644 --- a/lustre/include/lustre_lib.h +++ b/lustre/include/lustre_lib.h @@ -471,6 +471,7 @@ 
static inline void obd_ioctl_freedata(char *buf, int len) #define OBD_IOC_PROCESS_CFG _IOWR('f', 184, OBD_IOC_DATA_TYPE) #define OBD_IOC_DUMP_LOG _IOWR('f', 185, OBD_IOC_DATA_TYPE) #define OBD_IOC_CLEAR_LOG _IOWR('f', 186, OBD_IOC_DATA_TYPE) +#define OBD_IOC_PARAM _IOW ('f', 187, OBD_IOC_DATA_TYPE) #define OBD_IOC_CATLOGLIST _IOWR('f', 190, OBD_IOC_DATA_TYPE) #define OBD_IOC_LLOG_INFO _IOWR('f', 191, OBD_IOC_DATA_TYPE) diff --git a/lustre/include/lustre_log.h b/lustre/include/lustre_log.h index 2bc951d..c05ce65 100644 --- a/lustre/include/lustre_log.h +++ b/lustre/include/lustre_log.h @@ -92,6 +92,7 @@ int llog_reverse_process(struct llog_handle *loghandle, llog_cb_t cb, void *data, void *catdata); extern int llog_cancel_rec(struct llog_handle *loghandle, int index); extern int llog_close(struct llog_handle *cathandle); +extern int llog_get_size(struct llog_handle *loghandle); /* llog_cat.c - catalog api */ struct llog_process_data { @@ -217,7 +218,7 @@ static inline void llog_gen_init(struct llog_ctxt *ctxt) if (!strcmp(obd->obd_type->typ_name, LUSTRE_MDS_NAME)) ctxt->loc_gen.mnt_cnt = obd->u.mds.mds_mount_count; - else if (!strstr(obd->obd_type->typ_name, LUSTRE_FILTER_NAME)) + else if (!strstr(obd->obd_type->typ_name, LUSTRE_OST_NAME)) ctxt->loc_gen.mnt_cnt = obd->u.filter.fo_mount_count; else ctxt->loc_gen.mnt_cnt = 0; diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h index 857c29d..bd596bf 100644 --- a/lustre/include/lustre_net.h +++ b/lustre/include/lustre_net.h @@ -115,6 +115,16 @@ #define MDS_MAXREQSIZE (5 * 1024) #define MDS_MAXREPSIZE max(9 * 1024, 280 + LOV_MAX_STRIPE_COUNT * 56) +/* FIXME fix all constants here. Andreas suggests dyamically adding threads. 
*/ +#define MGS_MAX_THREADS 8UL +#define MGS_NUM_THREADS max(2UL, min_t(unsigned long, MGS_MAX_THREADS, \ + num_physpages * smp_num_cpus >> (26 - PAGE_SHIFT))) + +#define MGS_NBUFS (64 * smp_num_cpus) +#define MGS_BUFSIZE (8 * 1024) +#define MGS_MAXREQSIZE (5 * 1024) +#define MGS_MAXREPSIZE (9 * 1024) + #define OST_MAX_THREADS 512UL #define OST_DEF_THREADS max_t(unsigned long, 2, \ (num_physpages >> (26-PAGE_SHIFT)) * smp_num_cpus) diff --git a/lustre/include/lustre_param.h b/lustre/include/lustre_param.h new file mode 100644 index 0000000..142c1f1 --- /dev/null +++ b/lustre/include/lustre_param.h @@ -0,0 +1,46 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2006 Cluster File Systems, Inc. + * Author: Nathan Rutman + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + * + * User-settable parameter keys + */ + +#ifndef _LUSTRE_PARAM_H +#define _LUSTRE_PARAM_H + +/* obd_mount.c */ +int class_find_param(char *buf, char *key, char **valp); +int class_match_param(char *buf, char *key, char **valp); +int class_parse_nid(char *buf, lnet_nid_t *nid, char **endh); + + +/****************** User-settable parameter keys *********************/ + +#define PARAM_MGSNODE "mgsnode=" +#define PARAM_FAILNODE "failnode=" +#define PARAM_OBD_TIMEOUT "obd_timeout=" +#define PARAM_DEFAULT_STRIPE "default_stripe_" +#define PARAM_D_STRIPE_SIZE PARAM_DEFAULT_STRIPE"size" +#define PARAM_D_STRIPE_COUNT PARAM_DEFAULT_STRIPE"count" +#define PARAM_D_STRIPE_OFFSET PARAM_DEFAULT_STRIPE"offset" +#define PARAM_D_STRIPE_PATTERN PARAM_DEFAULT_STRIPE"pattern" + +#endif // _LUSTRE_PARAM_H diff --git a/lustre/include/obd.h b/lustre/include/obd.h index 1f03420..8c22e02 100644 --- a/lustre/include/obd.h +++ b/lustre/include/obd.h @@ -101,7 +101,7 @@ struct lov_stripe_md { struct { /* Public members. 
*/ __u64 lw_object_id; /* lov object id */ - __u64 lw_object_gr; /* lov object id */ + __u64 lw_object_gr; /* lov object group */ __u64 lw_maxbytes; /* maximum possible file size */ unsigned long lw_xfersize; /* optimal transfer size */ @@ -235,7 +235,7 @@ struct filter_obd { spinlock_t fo_translock; /* protect fsd_last_transno */ struct file *fo_rcvd_filp; struct file *fo_health_check_filp; - struct filter_server_data *fo_fsd; + struct lr_server_data *fo_fsd; unsigned long *fo_last_rcvd_slots; __u64 fo_mount_count; @@ -297,8 +297,6 @@ struct filter_obd { atomic_t fo_quotachecking; }; -struct mds_server_data; - #define OSC_MAX_RIF_DEFAULT 8 #define OSC_MAX_RIF_MAX 256 #define OSC_MAX_DIRTY_DEFAULT (OSC_MAX_RIF_DEFAULT * 4) @@ -368,6 +366,13 @@ struct client_obd { struct mdc_rpc_lock *cl_setattr_lock; struct osc_creator cl_oscc; + /* mgc datastruct */ + struct semaphore cl_mgc_sem; + struct vfsmount *cl_mgc_vfsmnt; + struct dentry *cl_mgc_configs_dir; + atomic_t cl_mgc_refcount; + struct obd_export *cl_mgc_mgsexp; + /* Flags section */ unsigned int cl_checksum:1; /* debug checksums */ @@ -381,6 +386,16 @@ struct client_obd { #define CL_NOT_QUOTACHECKED 1 /* client->cl_qchk_stat init value */ +struct mgs_obd { + struct ptlrpc_service *mgs_service; + struct vfsmount *mgs_vfsmnt; + struct super_block *mgs_sb; + struct dentry *mgs_configs_dir; + struct dentry *mgs_fid_de; + struct list_head mgs_fs_db_list; + struct semaphore mgs_sem; +}; + struct mds_obd { /* NB this field MUST be first */ struct obd_device_target mds_obt; @@ -399,7 +414,7 @@ struct mds_obd { unsigned long mds_atime_diff; struct semaphore mds_epoch_sem; struct ll_fid mds_rootfid; - struct mds_server_data *mds_server_data; + struct lr_server_data *mds_server_data; cfs_dentry_t *mds_pending_dir; cfs_dentry_t *mds_logs_dir; cfs_dentry_t *mds_objects_dir; @@ -409,9 +424,11 @@ struct mds_obd { struct obd_uuid mds_lov_uuid; char *mds_profile; struct obd_export *mds_osc_exp; /* XXX lov_exp */ - int 
mds_has_lov_desc; struct lov_desc mds_lov_desc; obd_id *mds_lov_objids; + int mds_lov_objids_size; + __u32 mds_lov_objids_in_file; + unsigned int mds_lov_objids_dirty:1; int mds_lov_nextid_set; struct file *mds_lov_objid_filp; struct file *mds_health_check_filp; @@ -464,6 +481,7 @@ struct lov_obd { struct semaphore lov_lock; atomic_t refcount; struct lov_desc desc; + struct obd_connect_data ocd; int bufsize; int connects; int death_row; /* Do we have tgts scheduled to be deleted? @@ -484,19 +502,27 @@ struct niobuf_local { }; /* obd device type names */ + /* FIXME all the references to LUSTRE_MDS_NAME should be swapped with LUSTRE_MDT_NAME */ #define LUSTRE_MDS_NAME "mds" #define LUSTRE_MDT_NAME "mdt" #define LUSTRE_MDC_NAME "mdc" -#define LUSTRE_FILTER_NAME "obdfilter" -#define LUSTRE_OST_NAME "ost" +#define LUSTRE_OSS_NAME "ost" /*FIXME change name to oss*/ +#define LUSTRE_OST_NAME "obdfilter" /* FIXME change name to ost*/ #define LUSTRE_OSC_NAME "osc" +#define LUSTRE_LOV_NAME "lov" +#define LUSTRE_MGS_NAME "mgs" +#define LUSTRE_MGC_NAME "mgc" + +#define LUSTRE_OSTSAN_NAME "sanobdfilter" #define LUSTRE_SANOSC_NAME "sanosc" #define LUSTRE_SANOST_NAME "sanost" -#define LUSTRE_LOV_NAME "lov" #define LUSTRE_CACHEOBD_NAME "cobd" #define LUSTRE_ECHO_NAME "obdecho" #define LUSTRE_ECHO_CLIENT_NAME "echo_client" +/* Constant obd names */ +#define LUSTRE_MGS_OBDNAME "MGS" +#define LUSTRE_MGC_OBDNAME "MGC" /* Don't conflict with on-wire flags OBD_BRW_WRITE, etc */ #define N_LOCAL_TEMP_PAGE 0x10000000 @@ -589,7 +615,10 @@ enum obd_notify_event { /* Device deactivated */ OBD_NOTIFY_INACTIVE, /* Connect data for import were changed */ - OBD_NOTIFY_OCD + OBD_NOTIFY_OCD, + /* Sync request */ + OBD_NOTIFY_SYNC_NONBLOCK, + OBD_NOTIFY_SYNC }; /* @@ -605,13 +634,12 @@ struct obd_notify_upcall { /* corresponds to one of the obd's */ struct obd_device { - struct obd_type *obd_type; - + struct obd_type *obd_type; /* common and UUID name of this device */ - char *obd_name; - struct 
obd_uuid obd_uuid; + char *obd_name; + struct obd_uuid obd_uuid; - int obd_minor; + int obd_minor; unsigned int obd_attached:1, obd_set_up:1, obd_recovering:1, obd_abort_recovery:1, obd_replayable:1, obd_no_transno:1, obd_no_recov:1, obd_stopping:1, obd_starting:1, @@ -667,6 +695,7 @@ struct obd_device { struct echo_client_obd echo_client; struct echo_obd echo; struct lov_obd lov; + struct mgs_obd mgs; } u; /* Fields used by LProcFS */ unsigned int obd_cntr_base; @@ -694,6 +723,13 @@ enum obd_cleanup_stage { OBD_CLEANUP_OBD, }; +/* get/set_info keys */ +#define KEY_MDS_CONN "mds_conn" +#define KEY_NEXT_ID "next_id" +#define KEY_LOVDESC "lovdesc" +#define KEY_INIT_RECOV "initial_recov" +#define KEY_INIT_RECOV_BACKUP "init_recov_bk" + struct obd_ops { struct module *o_owner; int (*o_iocontrol)(unsigned int cmd, struct obd_export *exp, int len, @@ -848,7 +884,7 @@ struct obd_ops { enum obd_import_event); int (*o_notify)(struct obd_device *obd, struct obd_device *watched, - enum obd_notify_event ev); + enum obd_notify_event ev, void *data); int (*o_health_check)(struct obd_device *); diff --git a/lustre/include/obd_class.h b/lustre/include/obd_class.h index 78ec204..2a6fbf4 100644 --- a/lustre/include/obd_class.h +++ b/lustre/include/obd_class.h @@ -62,6 +62,7 @@ int class_name2dev(char *name); struct obd_device *class_name2obd(char *name); int class_uuid2dev(struct obd_uuid *uuid); struct obd_device *class_uuid2obd(struct obd_uuid *uuid); +void class_obd_list(void); struct obd_device * class_find_client_obd(struct obd_uuid *tgt_uuid, char * typ_name, struct obd_uuid *grp_uuid); @@ -83,7 +84,7 @@ char *obd_export_nid2str(struct obd_export *exp); int obd_export_evict_by_nid(struct obd_device *obd, char *nid); int obd_export_evict_by_uuid(struct obd_device *obd, char *uuid); -/* config.c */ +/* obd_config.c */ int class_process_config(struct lustre_cfg *lcfg); int class_attach(struct lustre_cfg *lcfg); int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg); @@ 
-92,16 +93,36 @@ int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg); struct obd_device *class_incref(struct obd_device *obd); void class_decref(struct obd_device *obd); +#define CFG_F_START 0x01 /* Set when we start updating from a log */ +#define CFG_F_MARKER 0x02 /* We are within a marker */ +#define CFG_F_SKIP 0x04 /* We should ignore this cfg command */ +#define CFG_F_COMPAT146 0x08 /* Translation to new obd names required */ +#define CFG_F_EXCLUDE 0x10 /* OST exclusion list */ + + /* Passed as data param to class_config_parse_llog */ struct config_llog_instance { - char * cfg_instance; - struct obd_uuid cfg_uuid; + char * cfg_instance; + struct super_block *cfg_sb; + struct obd_uuid cfg_uuid; + int cfg_last_idx; /* for partial llog processing */ + int cfg_flags; }; int class_config_parse_llog(struct llog_ctxt *ctxt, char *name, struct config_llog_instance *cfg); int class_config_dump_llog(struct llog_ctxt *ctxt, char *name, struct config_llog_instance *cfg); +/* list of active configuration logs */ +struct config_llog_data { + char *cld_logname; + struct ldlm_res_id cld_resid; + struct config_llog_instance cld_cfg; + struct list_head cld_list_chain; + atomic_t cld_refcount; + unsigned int cld_stopping:1; +}; + struct lustre_profile { struct list_head lp_list; char * lp_profile; @@ -112,6 +133,7 @@ struct lustre_profile { struct lustre_profile *class_get_profile(char * prof); void class_del_profile(char *prof); +/* genops.c */ #define class_export_get(exp) \ ({ \ struct obd_export *exp_ = exp; \ @@ -140,6 +162,7 @@ void class_import_put(struct obd_import *); struct obd_import *class_new_import(struct obd_device *obd); void class_destroy_import(struct obd_import *exp); +struct obd_type *class_search_type(char *name); struct obd_type *class_get_type(char *name); void class_put_type(struct obd_type *type); int class_connect(struct lustre_handle *conn, struct obd_device *obd, @@ -148,7 +171,7 @@ int class_disconnect(struct obd_export *exp); void
class_fail_export(struct obd_export *exp); void class_disconnect_exports(struct obd_device *obddev); void class_disconnect_stale_exports(struct obd_device *obddev); -void class_manual_cleanup(struct obd_device *obd); +int class_manual_cleanup(struct obd_device *obd); /* obdo.c */ void obdo_cpy_md(struct obdo *dst, struct obdo *src, obd_flag valid); @@ -1042,11 +1065,16 @@ static inline void obd_import_event(struct obd_device *obd, static inline int obd_notify(struct obd_device *obd, struct obd_device *watched, - enum obd_notify_event ev) + enum obd_notify_event ev, void *data) { ENTRY; OBD_CHECK_DEV(obd); - if (!obd->obd_set_up) { + + /* the check for async_recov is a complete hack - I'm hereby + overloading the meaning to also mean "this was called from + mds_postsetup". I know that my mds is able to handle notifies + by this point, and it needs to get them to execute mds_postrecov. */ + if (!obd->obd_set_up && !obd->obd_async_recov) { CDEBUG(D_HA, "obd %s not set up\n", obd->obd_name); RETURN(-EINVAL); } @@ -1057,12 +1085,12 @@ static inline int obd_notify(struct obd_device *obd, } OBD_COUNTER_INCREMENT(obd, notify); - RETURN(OBP(obd, notify)(obd, watched, ev)); + RETURN(OBP(obd, notify)(obd, watched, ev, data)); } static inline int obd_notify_observer(struct obd_device *observer, struct obd_device *observed, - enum obd_notify_event ev) + enum obd_notify_event ev, void *data) { int rc1; int rc2; @@ -1070,7 +1098,7 @@ static inline int obd_notify_observer(struct obd_device *observer, struct obd_notify_upcall *onu; if (observer->obd_observer) - rc1 = obd_notify(observer->obd_observer, observed, ev); + rc1 = obd_notify(observer->obd_observer, observed, ev, data); else rc1 = 0; /* @@ -1181,7 +1209,7 @@ extern void obd_sysctl_clean (void); /* uuid.c */ typedef __u8 class_uuid_t[16]; -//int class_uuid_parse(struct obd_uuid in, class_uuid_t out); +void class_generate_random_uuid(class_uuid_t uuid); void class_uuid_unparse(class_uuid_t in, struct obd_uuid *out); /* 
lustre_peer.c */ diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 2fa9852..beca205 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -166,6 +166,10 @@ extern cfs_waitq_t obd_race_waitq; #define OBD_FAIL_MDC_REVALIDATE_PAUSE 0x800 +#define OBD_FAIL_MGS 0x900 +#define OBD_FAIL_MGS_ALL_REQUEST_NET 0x901 +#define OBD_FAIL_MGS_ALL_REPLY_NET 0x902 + /* preparation for a more advanced failure testbed (not functional yet) */ #define OBD_FAIL_MASK_SYS 0x0000FF00 #define OBD_FAIL_MASK_LOC (0x000000FF | OBD_FAIL_MASK_SYS) diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-i686-smp.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-i686-smp.config index 8629266..e7eb927 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-i686-smp.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-i686-smp.config @@ -2365,7 +2365,7 @@ CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_SPINLOCK=y CONFIG_DEBUG_SPINLOCK_SLEEP=y CONFIG_DEBUG_HIGHMEM=y -CONFIG_DEBUG_INFO=y +# CONFIG_DEBUG_INFO is not set # CONFIG_FRAME_POINTER is not set CONFIG_EARLY_PRINTK=y CONFIG_DEBUG_STACKOVERFLOW=y diff --git a/lustre/ldlm/ldlm_inodebits.c b/lustre/ldlm/ldlm_inodebits.c index 58229f3..8c473dd 100644 --- a/lustre/ldlm/ldlm_inodebits.c +++ b/lustre/ldlm/ldlm_inodebits.c @@ -44,7 +44,7 @@ ldlm_inodebits_compat_queue(struct list_head *queue, struct ldlm_lock *req, int compat = 1; ENTRY; - LASSERT(req_bits); /* There is no sence in lock with no bits set, + LASSERT(req_bits); /* There is no sense in lock with no bits set, I think. 
Also such a lock would be compatible with any other bit lock */ list_for_each(tmp, queue) { diff --git a/lustre/ldlm/ldlm_internal.h b/lustre/ldlm/ldlm_internal.h index a2dcf4b..f9f6c43 100644 --- a/lustre/ldlm/ldlm_internal.h +++ b/lustre/ldlm/ldlm_internal.h @@ -50,7 +50,6 @@ int ldlm_process_extent_lock(struct ldlm_lock *lock, int *flags, int first_enq, int ldlm_process_flock_lock(struct ldlm_lock *lock, int *flags, int first_enq, ldlm_error_t *err); - /* ldlm_inodebits.c */ int ldlm_process_inodebits_lock(struct ldlm_lock *lock, int *flags, int first_enq, ldlm_error_t *err); diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 6ee4f03..0b9945d 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -207,6 +207,10 @@ int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf) rq_portal = MDS_REQUEST_PORTAL; rp_portal = MDC_REPLY_PORTAL; connect_op = MDS_CONNECT; + } else if (!strcmp(name, LUSTRE_MGC_NAME)) { + rq_portal = MGS_REQUEST_PORTAL; + rp_portal = MGC_REPLY_PORTAL; + connect_op = MGS_CONNECT; } else { CERROR("unknown client OBD type \"%s\", can't setup\n", name); @@ -234,6 +238,7 @@ int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf) } sema_init(&cli->cl_sem, 1); + sema_init(&cli->cl_mgc_sem, 1); cli->cl_conn_count = 0; memcpy(server_uuid.uuid, lustre_cfg_buf(lcfg, 2), min_t(unsigned int, LUSTRE_CFG_BUFLEN(lcfg, 2), @@ -284,6 +289,7 @@ int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf) imp->imp_client = &obddev->obd_ldlm_client; imp->imp_connect_op = connect_op; imp->imp_initial_recov = 1; + imp->imp_initial_recov_bk = 0; CFS_INIT_LIST_HEAD(&imp->imp_pinger_chain); memcpy(cli->cl_target_uuid.uuid, lustre_cfg_buf(lcfg, 1), LUSTRE_CFG_BUFLEN(lcfg, 1)); @@ -331,7 +337,7 @@ int client_obd_cleanup(struct obd_device *obddev) RETURN(0); } -/* ->o_connect() method for client side (OSC and MDC) */ +/* ->o_connect() method for client side (OSC and MDC and MGC) */ int 
client_connect_import(struct lustre_handle *dlm_handle, struct obd_device *obd, struct obd_uuid *cluuid, struct obd_connect_data *data) @@ -531,8 +537,16 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler) obd_str2uuid (&tgtuuid, str); target = class_uuid2obd(&tgtuuid); + /* COMPAT_146 */ + /* old (pre 1.6) lustre_process_log tries to connect to mdsname + (eg. mdsA) instead of uuid. */ + if (!target) { + snprintf((char *)tgtuuid.uuid, sizeof(tgtuuid), "%s_UUID", str); + target = class_uuid2obd(&tgtuuid); + } if (!target) target = class_name2obd(str); + /* end COMPAT_146 */ if (!target || target->obd_stopping || !target->obd_set_up) { DEBUG_REQ(D_ERROR, req, "UUID '%s' is not available " @@ -585,12 +599,12 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler) if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT) { if (!data) { - DEBUG_REQ(D_INFO, req, "Refusing old (unversioned) " + DEBUG_REQ(D_WARNING, req, "Refusing old (unversioned) " "libclient connection attempt\n"); GOTO(out, rc = -EPROTO); } else if (data->ocd_version < LUSTRE_VERSION_CODE - LUSTRE_VERSION_ALLOWED_OFFSET) { - DEBUG_REQ(D_INFO, req, "Refusing old (%d.%d.%d.%d) " + DEBUG_REQ(D_WARNING, req, "Refusing old (%d.%d.%d.%d) " "libclient connection attempt\n", OBD_OCD_VERSION_MAJOR(data->ocd_version), OBD_OCD_VERSION_MINOR(data->ocd_version), diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index 293733e..b801f01 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -713,6 +713,11 @@ int ldlm_handle_enqueue(struct ptlrpc_request *req, GOTO(out, rc = -EPROTO); } +#if 0 + /* FIXME this makes it impossible to use LDLM_PLAIN locks -- check + against server's _CONNECT_SUPPORTED flags? (I don't want to use + ibits for mgc/mgs) */ + /* INODEBITS_INTEROP: Perform conversion from plain lock to * inodebits lock if client does not support them. 
*/ if (!(req->rq_export->exp_connect_flags & OBD_CONNECT_IBITS) && @@ -723,6 +728,7 @@ int ldlm_handle_enqueue(struct ptlrpc_request *req, if (dlm_req->lock_desc.l_req_mode == LCK_PR) dlm_req->lock_desc.l_req_mode = LCK_CR; } +#endif if (flags & LDLM_FL_REPLAY) { lock = find_existing_lock(req->rq_export, @@ -1579,6 +1585,9 @@ static int ldlm_setup(void) spin_lock_init(&waiting_locks_spinlock); cfs_timer_init(&waiting_locks_timer, waiting_locks_callback, 0); + /* Using CLONE_FILES instead of CLONE_FS here causes failures in + conf-sanity test 21. But using CLONE_FS can cause problems + if the daemonize happens between push/pop_ctxt... */ rc = cfs_kernel_thread(expired_lock_main, NULL, CLONE_VM | CLONE_FS); if (rc < 0) { CERROR("Cannot start ldlm expired-lock thread: %d\n", rc); diff --git a/lustre/liblustre/llite_lib.c b/lustre/liblustre/llite_lib.c index 054fa0d..cdb8f04 100644 --- a/lustre/liblustre/llite_lib.c +++ b/lustre/liblustre/llite_lib.c @@ -101,7 +101,7 @@ int liblustre_process_log(struct config_llog_instance *cfg, GOTO(out, rc); lustre_cfg_bufs_reset(&bufs, name); - lustre_cfg_bufs_set_string(&bufs, 1, LUSTRE_MDC_NAME); + lustre_cfg_bufs_set_string(&bufs, 1, LUSTRE_MDC_NAME);//FIXME connect to mgc lustre_cfg_bufs_set_string(&bufs, 2, mdc_uuid.uuid); lcfg = lustre_cfg_new(LCFG_ATTACH, &bufs); rc = class_process_config(lcfg); @@ -131,7 +131,7 @@ int liblustre_process_log(struct config_llog_instance *cfg, /* Disable initial recovery on this import */ rc = obd_set_info_async(obd->obd_self_export, - strlen("initial_recov"), "initial_recov", + strlen(KEY_INIT_RECOV), KEY_INIT_RECOV, sizeof(allow_recov), &allow_recov, NULL); rc = obd_connect(&mdc_conn, obd, &mdc_uuid, ocd); diff --git a/lustre/liblustre/llite_lib.h b/lustre/liblustre/llite_lib.h index 96979ee..d5dfd8c 100644 --- a/lustre/liblustre/llite_lib.h +++ b/lustre/liblustre/llite_lib.h @@ -52,7 +52,7 @@ struct llu_inode_info { char *lli_symlink_name; struct semaphore lli_open_sem; __u64 lli_maxbytes; - 
unsigned long lli_flags; + unsigned long lli_flags; /* for libsysio */ struct file_identifier lli_sysio_fid; diff --git a/lustre/liblustre/super.c b/lustre/liblustre/super.c index d262d5c..7517ef1 100644 --- a/lustre/liblustre/super.c +++ b/lustre/liblustre/super.c @@ -347,7 +347,7 @@ int llu_inode_getattr(struct inode *inode, struct lov_stripe_md *lsm) static struct inode* llu_new_inode(struct filesys *fs, struct ll_fid *fid) { - struct inode *inode; + struct inode *inode; struct llu_inode_info *lli; struct intnl_stat st = { .st_dev = 0, @@ -377,11 +377,11 @@ static struct inode* llu_new_inode(struct filesys *fs, lli->lli_fid = *fid; /* file identifier is needed by functions like _sysio_i_find() */ - inode = _sysio_i_new(fs, &lli->lli_sysio_fid, + inode = _sysio_i_new(fs, &lli->lli_sysio_fid, &st, 0, &llu_inode_ops, lli); - if (!inode) - OBD_FREE(lli, sizeof(*lli)); + if (!inode) + OBD_FREE(lli, sizeof(*lli)); return inode; } @@ -719,10 +719,10 @@ int llu_setattr_raw(struct inode *inode, struct iattr *attr) (rc = ll_permission(inode, MAY_WRITE)) != 0) RETURN(rc); } else { - /* from inode_change_ok() */ - if (current->fsuid != st->st_uid && - !capable(CAP_FOWNER)) - RETURN(-EPERM); + /* from inode_change_ok() */ + if (current->fsuid != st->st_uid && + !capable(CAP_FOWNER)) + RETURN(-EPERM); } } @@ -1692,7 +1692,7 @@ llu_fsswop_mount(const char *source, struct config_llog_instance cfg; char ll_instance[sizeof(sbi) * 2 + 1]; struct lustre_profile *lprof; - char *zconf_mdsnid, *zconf_mdsname, *zconf_profile; + char *zconf_mdsnid, *zconf_mdsname, *zconf_profile; char *osc = NULL, *mdc = NULL; int async = 1, err = -EINVAL; struct obd_connect_data ocd = {0,}; @@ -1842,19 +1842,19 @@ llu_fsswop_mount(const char *source, GOTO(out_request, err = -EBADF); } - /* - * Generate base path-node for root. - */ - rootpb = _sysio_pb_new(&noname, NULL, root); - if (!rootpb) { - err = -ENOMEM; - goto out_inode; - } + /* + * Generate base path-node for root. 
+ */ + rootpb = _sysio_pb_new(&noname, NULL, root); + if (!rootpb) { + err = -ENOMEM; + goto out_inode; + } - err = _sysio_do_mount(fs, rootpb, flags, tocover, mntp); - if (err) { + err = _sysio_do_mount(fs, rootpb, flags, tocover, mntp); + if (err) { _sysio_pb_gone(rootpb); - goto out_inode; + goto out_inode; } ptlrpc_req_finished(request); diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h index 68c8658..4b2132e 100644 --- a/lustre/llite/llite_internal.h +++ b/lustre/llite/llite_internal.h @@ -13,7 +13,8 @@ #include #include #include - +#include /* for s2sbi */ + /* struct lustre_intent_data { __u64 it_lock_handle[2]; @@ -158,8 +159,6 @@ struct ll_sb_info { struct proc_dir_entry* ll_proc_root; obd_id ll_rootino; /* number of root inode */ - struct lustre_mount_data *ll_lmd; - int ll_flags; struct list_head ll_conn_chain; /* per-conn chain of SBs */ struct lustre_client_ocd ll_lco; @@ -424,12 +423,10 @@ int ll_dcompare(struct dentry *parent, struct qstr *d_name, struct qstr *name); extern struct super_operations lustre_super_operations; char *ll_read_opt(const char *opt, char *data); -int ll_set_opt(const char *opt, char *data, int fl); -void ll_options(char *options, char **ost, char **mds, int *flags); +void ll_options(char *options, int *flags); void ll_lli_init(struct ll_inode_info *lli); -int ll_fill_super(struct super_block *sb, void *data, int silent); -int lustre_fill_super(struct super_block *sb, void *data, int silent); -void lustre_put_super(struct super_block *sb); +int ll_fill_super(struct super_block *sb); +void ll_put_super(struct super_block *sb); struct inode *ll_inode_from_lock(struct ldlm_lock *lock); void ll_clear_inode(struct inode *inode); int ll_setattr_raw(struct inode *inode, struct iattr *attr); @@ -442,7 +439,7 @@ void ll_read_inode2(struct inode *inode, void *opaque); int ll_iocontrol(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg); void ll_umount_begin(struct super_block *sb); -int 
lustre_remount_fs(struct super_block *sb, int *flags, char *data); +int ll_remount_fs(struct super_block *sb, int *flags, char *data); int ll_prep_inode(struct obd_export *exp, struct inode **inode, struct ptlrpc_request *req, int offset, struct super_block *); void lustre_dump_dentry(struct dentry *, int recur); @@ -513,8 +510,9 @@ int ll_tree_unlock(struct ll_lock_tree *tree); #define LL_MAX_BLKSIZE (4UL * 1024 * 1024) +#define ll_s2sbi(sb) (s2lsi(sb)->lsi_llsbi) + #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) -#define ll_s2sbi_nocast(sb) ((sb)->s_fs_info) void __d_rehash(struct dentry * entry, int lock); static inline __u64 ll_ts2u64(struct timespec *time) { @@ -522,13 +520,11 @@ static inline __u64 ll_ts2u64(struct timespec *time) return t; } #else /* 2.4 here */ -#define ll_s2sbi_nocast(sb) ((sb)->u.generic_sbp) static inline __u64 ll_ts2u64(time_t *time) { return *time; } #endif -#define ll_s2sbi(sb) ((struct ll_sb_info *)ll_s2sbi_nocast(sb)) /* don't need an addref as the sb_info should be holding one */ static inline struct obd_export *ll_s2obdexp(struct super_block *sb) diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index 90dd73c..cf0fc28 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "llite_internal.h" kmem_cache_t *ll_file_data_slab; @@ -46,33 +47,8 @@ extern struct address_space_operations ll_dir_aops; #define log2(n) ffz(~(n)) #endif -/* We need to have some extra twiddling here because some systems have - * no random state when they start up. 
*/ -static void -lustre_generate_random_uuid(class_uuid_t uuid) -{ - struct timeval t; - int *i, j, k; - - ENTRY; - LASSERT(sizeof(class_uuid_t) % sizeof(*i) == 0); - - j = jiffies; - do_gettimeofday(&t); - k = t.tv_usec; - - generate_random_uuid(uuid); - - for (i = (int *)uuid; (char *)i < (char *)uuid + sizeof(class_uuid_t); i++) { - *i ^= j ^ k; - j = ((j << 8) & 0xffffff00) | ((j >> 24) & 0x000000ff); - k = ((k >> 8) & 0x00ffffff) | ((k << 24) & 0xff000000); - } - - EXIT; -} -struct ll_sb_info *lustre_init_sbi(struct super_block *sb) +struct ll_sb_info *ll_init_sbi(void) { struct ll_sb_info *sbi = NULL; class_uuid_t uuid; @@ -97,9 +73,8 @@ struct ll_sb_info *lustre_init_sbi(struct super_block *sb) INIT_LIST_HEAD(&sbi->ll_conn_chain); INIT_HLIST_HEAD(&sbi->ll_orphan_dentry_list); - ll_s2sbi_nocast(sb) = sbi; - lustre_generate_random_uuid(uuid); + class_generate_random_uuid(uuid); class_uuid_unparse(uuid, &sbi->ll_sb_uuid); CDEBUG(D_HA, "generated uuid: %s\n", sbi->ll_sb_uuid.uuid); @@ -112,7 +87,7 @@ struct ll_sb_info *lustre_init_sbi(struct super_block *sb) RETURN(sbi); } -void lustre_free_sbi(struct super_block *sb) +void ll_free_sbi(struct super_block *sb) { struct ll_sb_info *sbi = ll_s2sbi(sb); ENTRY; @@ -123,7 +98,6 @@ void lustre_free_sbi(struct super_block *sb) spin_unlock(&ll_sb_lock); OBD_FREE(sbi, sizeof(*sbi)); } - ll_s2sbi_nocast(sb) = NULL; EXIT; } @@ -131,7 +105,7 @@ static struct dentry_operations ll_d_root_ops = { .d_compare = ll_dcompare, }; -int lustre_common_fill_super(struct super_block *sb, char *mdc, char *osc) +int client_common_fill_super(struct super_block *sb, char *mdc, char *osc) { struct inode *root = 0; struct ll_sb_info *sbi = ll_s2sbi(sb); @@ -184,7 +158,7 @@ int lustre_common_fill_super(struct super_block *sb, char *mdc, char *osc) err = obd_connect(&mdc_conn, obd, &sbi->ll_sb_uuid, data); if (err == -EBUSY) { - CERROR("An MDS (mdc %s) is performing recovery, of which this" + CERROR("An MDT (mdc %s) is performing recovery, of 
which this" " client is not a part. Please wait for recovery to " "complete, abort, or time out.\n", mdc); GOTO(out, err); @@ -269,11 +243,18 @@ int lustre_common_fill_super(struct super_block *sb, char *mdc, char *osc) mdc_init_ea_size(sbi->ll_mdc_exp, sbi->ll_osc_exp); + err = obd_prep_async_page(sbi->ll_osc_exp, NULL, NULL, NULL, + 0, NULL, NULL, NULL); + if (err < 0) { + LCONSOLE_ERROR("There are no OST's in this filesystem. " + "There must be at least one active OST for " + "a client to start.\n"); + GOTO(out_osc, err); + } + if (!ll_async_page_slab) { ll_async_page_slab_size = - size_round(sizeof(struct ll_async_page)) + - obd_prep_async_page(sbi->ll_osc_exp, NULL, NULL, NULL, - 0, NULL, NULL, NULL); + size_round(sizeof(struct ll_async_page)) + err; ll_async_page_slab = kmem_cache_create("ll_async_page", ll_async_page_slab_size, 0, 0, NULL, NULL); @@ -475,7 +456,7 @@ static void prune_deathrow(struct ll_sb_info *sbi, int try) EXIT; } -void lustre_common_put_super(struct super_block *sb) +void client_common_put_super(struct super_block *sb) { struct ll_sb_info *sbi = ll_s2sbi(sb); ENTRY; @@ -524,24 +505,19 @@ char *ll_read_opt(const char *opt, char *data) RETURN(retval); } -int ll_set_opt(const char *opt, char *data, int fl) +static inline int ll_set_opt(const char *opt, char *data, int fl) { - ENTRY; - - CDEBUG(D_SUPER, "option: %s, data %s\n", opt, data); - if (strncmp(opt, data, strlen(opt))) - RETURN(0); + if (strncmp(opt, data, strlen(opt)) != 0) + return(0); else - RETURN(fl); + return(fl); } -void ll_options(char *options, char **ost, char **mdc, int *flags) +/* non-client-specific mount options are parsed in lmd_parse */ +void ll_options(char *options, int *flags) { int tmp; - char *this_char; -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) - char *opt_ptr = options; -#endif + char *s1 = options, *s2; ENTRY; if (!options) { @@ -549,59 +525,57 @@ void ll_options(char *options, char **ost, char **mdc, int *flags) return; } -#if (LINUX_VERSION_CODE < 
KERNEL_VERSION(2,5,0)) - for (this_char = strtok (options, ","); - this_char != NULL; - this_char = strtok (NULL, ",")) -#else - while ((this_char = strsep (&opt_ptr, ",")) != NULL) -#endif - { - CDEBUG(D_SUPER, "this_char %s\n", this_char); - if (!*ost && (*ost = ll_read_opt(LUSTRE_OSC_NAME, this_char))) - continue; - if (!*mdc && (*mdc = ll_read_opt(LUSTRE_MDC_NAME, this_char))) - continue; - tmp = ll_set_opt("nolock", this_char, LL_SBI_NOLCK); + CDEBUG(D_CONFIG, "Parsing opts %s\n", options); + + while (*s1) { + CDEBUG(D_SUPER, "next opt=%s\n", s1); + tmp = ll_set_opt("nolock", s1, LL_SBI_NOLCK); if (tmp) { *flags |= tmp; - continue; + goto next; } - tmp = ll_set_opt("flock", this_char, LL_SBI_FLOCK); + tmp = ll_set_opt("flock", s1, LL_SBI_FLOCK); if (tmp) { *flags |= tmp; - continue; + goto next; } - tmp = ll_set_opt("noflock", this_char, LL_SBI_FLOCK); + tmp = ll_set_opt("noflock", s1, LL_SBI_FLOCK); if (tmp) { *flags &= ~tmp; - continue; + goto next; } - tmp = ll_set_opt("user_xattr", this_char, LL_SBI_USER_XATTR); + tmp = ll_set_opt("user_xattr", s1, LL_SBI_USER_XATTR); if (tmp) { *flags |= tmp; - continue; + goto next; } - tmp = ll_set_opt("nouser_xattr", this_char, LL_SBI_USER_XATTR); + tmp = ll_set_opt("nouser_xattr", s1, LL_SBI_USER_XATTR); if (tmp) { *flags &= ~tmp; - continue; + goto next; } - tmp = ll_set_opt("acl", this_char, LL_SBI_ACL); + tmp = ll_set_opt("acl", s1, LL_SBI_ACL); if (tmp) { /* Ignore deprecated mount option. The client will * always try to mount with ACL support, whether this * is used depends on whether server supports it. 
*/ - continue; + goto next; } - tmp = ll_set_opt("noacl", this_char, LL_SBI_ACL); + tmp = ll_set_opt("noacl", s1, LL_SBI_ACL); if (tmp) { - continue; + goto next; } + +next: + /* Find next opt */ + s2 = strchr(s1, ','); + if (s2 == NULL) + break; + s1 = s2 + 1; } EXIT; } - + void ll_lli_init(struct ll_inode_info *lli) { sema_init(&lli->lli_open_sem, 1); @@ -614,346 +588,136 @@ void ll_lli_init(struct ll_inode_info *lli) INIT_LIST_HEAD(&lli->lli_dead_list); } -int ll_fill_super(struct super_block *sb, void *data, int silent) +int ll_fill_super(struct super_block *sb) { + struct lustre_profile *lprof; + struct lustre_sb_info *lsi = s2lsi(sb); struct ll_sb_info *sbi; - char *osc = NULL; - char *mdc = NULL; - int err; + char *osc = NULL; + char *mdc = NULL; + char *profilenm = get_profile_name(sb); + struct config_llog_instance cfg; + char ll_instance[sizeof(sb) * 2 + 1]; + int err; ENTRY; - + CDEBUG(D_VFSTRACE, "VFS Op: sb %p\n", sb); - sbi = lustre_init_sbi(sb); - if (!sbi) + /* client additional sb info */ + lsi->lsi_llsbi = sbi = ll_init_sbi(); + if (!sbi) RETURN(-ENOMEM); - ll_options(data, &osc, &mdc, &sbi->ll_flags); - - if (!osc) { - CERROR("no osc\n"); - GOTO(out, err = -EINVAL); - } - - if (!mdc) { - CERROR("no mdc\n"); - GOTO(out, err = -EINVAL); - } - - err = lustre_common_fill_super(sb, mdc, osc); -out: - if (err) - lustre_free_sbi(sb); - - if (mdc) - OBD_FREE(mdc, strlen(mdc) + 1); - if (osc) - OBD_FREE(osc, strlen(osc) + 1); - - RETURN(err); -} /* ll_read_super */ - -static int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd, - char *s1, char *s2) -{ - struct lustre_cfg_bufs bufs; - struct lustre_cfg * lcfg = NULL; - int err; - - CDEBUG(D_TRACE, "lcfg %s %#x %s %s\n", cfgname, cmd, s1, s2); - - lustre_cfg_bufs_reset(&bufs, cfgname); - if (s1) - lustre_cfg_bufs_set_string(&bufs, 1, s1); - if (s2) - lustre_cfg_bufs_set_string(&bufs, 2, s2); - - lcfg = lustre_cfg_new(cmd, &bufs); - lcfg->lcfg_nid = nid; - err = class_process_config(lcfg); - 
lustre_cfg_free(lcfg); - return(err); -} - -static int lustre_process_log(struct lustre_mount_data *lmd, char * profile, - struct config_llog_instance *cfg) -{ - struct obd_device *obd; - struct lustre_handle mdc_conn = {0, }; - struct obd_export *exp; - char * name = "mdc_dev"; - class_uuid_t uuid; - struct obd_uuid mdc_uuid; - struct llog_ctxt *ctxt; - struct obd_connect_data ocd = { 0 }; - lnet_nid_t nid; - int i, rc = 0, recov_bk = 1; - int err; - ENTRY; - - if (lmd_bad_magic(lmd)) - RETURN(-EINVAL); - - lustre_generate_random_uuid(uuid); - class_uuid_unparse(uuid, &mdc_uuid); - CDEBUG(D_HA, "generated uuid: %s\n", mdc_uuid.uuid); + ll_options(lsi->lsi_lmd->lmd_opts, &sbi->ll_flags); - nid = lmd->lmd_nid[0]; - LASSERT(nid != LNET_NID_ANY); - rc = do_lcfg(name, nid, LCFG_ADD_UUID, libcfs_nid2str(nid), 0); - if (rc < 0) - GOTO(out, rc); - - rc = do_lcfg(name, 0, LCFG_ATTACH, LUSTRE_MDC_NAME, mdc_uuid.uuid); - if (rc < 0) - GOTO(out_del_uuid, rc); - - rc = do_lcfg(name, 0, LCFG_SETUP, lmd->lmd_mds, libcfs_nid2str(nid)); - if (rc < 0) { - LCONSOLE_ERROR("I couldn't establish a connection with the MDS." - " Check that the MDS host NID is correct and the" - " networks are up.\n"); - GOTO(out_detach, rc); - } - - obd = class_name2obd(name); - if (obd == NULL) - GOTO(out_cleanup, rc = -EINVAL); - - /* Add the redundant MDS nids */ - for (i = 1; i < lmd->lmd_nid_count; i++) { - nid = lmd->lmd_nid[i]; - rc = do_lcfg(name, nid, LCFG_ADD_UUID, libcfs_nid2str(nid), 0); - if (rc) { - CERROR("Add uuid for %s failed %d\n", - libcfs_nid2str(nid), rc); - continue; - } - rc = do_lcfg(name, 0, LCFG_ADD_CONN, libcfs_nid2str(nid), 0); - if (rc) - CERROR("Add conn for %s failed %d\n", - libcfs_nid2str(nid), rc); - } - - /* Try all connections, but only once. 
*/ - rc = obd_set_info_async(obd->obd_self_export, - strlen("init_recov_bk"), "init_recov_bk", - sizeof(recov_bk), &recov_bk, NULL); - if (rc) - GOTO(out_cleanup, rc); - - ocd.ocd_connect_flags = OBD_CONNECT_ACL; - - rc = obd_connect(&mdc_conn, obd, &mdc_uuid, &ocd); - if (rc) { - CERROR("cannot connect to %s: rc = %d\n", lmd->lmd_mds, rc); - GOTO(out_cleanup, rc); - } - - exp = class_conn2export(&mdc_conn); - - ctxt = llog_get_context(exp->exp_obd, LLOG_CONFIG_REPL_CTXT); -#if 1 - rc = class_config_parse_llog(ctxt, profile, cfg); -#else - /* - * For debugging, it's useful to just dump the log - */ - rc = class_config_dump_llog(ctxt, profile, cfg); -#endif - switch (rc) { - case 0: - break; - case -EINVAL: - LCONSOLE_ERROR("%s: The configuration '%s' could not be read " - "from the MDS '%s'. Make sure this client and " - "the MDS are running compatible versions of " - "Lustre.\n", - obd->obd_name, profile, lmd->lmd_mds); - /* fall through */ - default: - LCONSOLE_ERROR("%s: The configuration '%s' could not be read " - "from the MDS '%s'. This may be the result of " - "communication errors between the client and " - "the MDS, or if the MDS is not running.\n", - obd->obd_name, profile, lmd->lmd_mds); - break; - } - - /* We don't so much care about errors in cleaning up the config llog - * connection, as we have already read the config by this point. */ - err = obd_disconnect(exp); - if (err) - CERROR("obd_disconnect failed: rc = %d\n", err); - -out_cleanup: - err = do_lcfg(name, 0, LCFG_CLEANUP, 0, 0); - if (err) - CERROR("mdc_cleanup failed: rc = %d\n", err); - -out_detach: - err = do_lcfg(name, 0, LCFG_DETACH, 0, 0); - if (err) - CERROR("mdc_detach failed: rc = %d\n", err); - -out_del_uuid: - /* class_add_uuid adds a nid even if the same uuid exists; we might - delete any copy here. So they all better match. 
*/ - for (i = 0; i < lmd->lmd_nid_count; i++) { - nid = lmd->lmd_nid[i]; - err = do_lcfg(name, nid, LCFG_DEL_UUID, libcfs_nid2str(nid), 0); - if (err) - CERROR("del MDC UUID %s failed: rc = %d\n", - libcfs_nid2str(nid), err); - } - /* class_import_put will get rid of the additional connections */ -out: - RETURN(rc); -} - -static void lustre_manual_cleanup(struct ll_sb_info *sbi) -{ - struct obd_device *obd; - int next = 0; - - while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next)) !=NULL) { - class_manual_cleanup(obd); - } - - if (sbi->ll_lmd != NULL) - class_del_profile(sbi->ll_lmd->lmd_profile); -} - -int lustre_fill_super(struct super_block *sb, void *data, int silent) -{ - struct lustre_mount_data * lmd = data; - struct ll_sb_info *sbi; - char *osc = NULL; - char *mdc = NULL; - int err; - ENTRY; - - CDEBUG(D_VFSTRACE, "VFS Op: sb %p\n", sb); - if (lmd_bad_magic(lmd)) - RETURN(-EINVAL); - - sbi = lustre_init_sbi(sb); - if (!sbi) - RETURN(-ENOMEM); - - if (lmd->lmd_profile) { - struct lustre_profile *lprof; - struct config_llog_instance cfg; - char ll_instance[sizeof(sb) * 2 + 1]; - - if (lmd->lmd_mds[0] == '\0') { - CERROR("no mds name\n"); - GOTO(out_free, err = -EINVAL); - } - - OBD_ALLOC(sbi->ll_lmd, sizeof(*sbi->ll_lmd)); - if (sbi->ll_lmd == NULL) - GOTO(out_free, err = -ENOMEM); - memcpy(sbi->ll_lmd, lmd, sizeof(*lmd)); - if (lmd->lmd_flags & LMD_FLG_FLOCK) - sbi->ll_flags |= LL_SBI_FLOCK; - if (lmd->lmd_flags & LMD_FLG_USER_XATTR) - sbi->ll_flags |= LL_SBI_USER_XATTR; - - /* generate a string unique to this super, let's try - the address of the super itself.*/ - sprintf(ll_instance, "%p", sb); - - cfg.cfg_instance = ll_instance; - cfg.cfg_uuid = sbi->ll_sb_uuid; - err = lustre_process_log(lmd, lmd->lmd_profile, &cfg); - if (err < 0) { - CERROR("Unable to process log: %s\n", lmd->lmd_profile); - GOTO(out_free, err); - } - - lprof = class_get_profile(lmd->lmd_profile); - if (lprof == NULL) { - CERROR("No profile found: %s\n", lmd->lmd_profile); - 
GOTO(out_free, err = -EINVAL); - } - if (osc) - OBD_FREE(osc, strlen(osc) + 1); - OBD_ALLOC(osc, strlen(lprof->lp_osc) + - strlen(ll_instance) + 2); - sprintf(osc, "%s-%s", lprof->lp_osc, ll_instance); - - if (mdc) - OBD_FREE(mdc, strlen(mdc) + 1); - OBD_ALLOC(mdc, strlen(lprof->lp_mdc) + - strlen(ll_instance) + 2); - sprintf(mdc, "%s-%s", lprof->lp_mdc, ll_instance); - } - - if (!osc) { - CERROR("no osc\n"); - GOTO(out_free, err = -EINVAL); + /* Generate a string unique to this super, in case some joker tries + to mount the same fs at two mount points. + Use the address of the super itself.*/ + sprintf(ll_instance, "%p", sb); + cfg.cfg_instance = ll_instance; + cfg.cfg_uuid = lsi->lsi_llsbi->ll_sb_uuid; + cfg.cfg_last_idx = 0; + + /* set up client obds */ + err = lustre_process_log(sb, profilenm, &cfg); + if (err < 0) { + CERROR("Unable to process log: %d\n", err); + GOTO(out_free, err); } - if (!mdc) { - CERROR("no mdc\n"); + lprof = class_get_profile(profilenm); + if (lprof == NULL) { + CERROR("No profile found: %s\n", profilenm); GOTO(out_free, err = -EINVAL); } - - err = lustre_common_fill_super(sb, mdc, osc); - - if (err) - GOTO(out_free, err); - -out_dev: + CDEBUG(D_CONFIG, "Found profile %s: mdc=%s osc=%s\n", profilenm, + lprof->lp_mdc, lprof->lp_osc); + + OBD_ALLOC(osc, strlen(lprof->lp_osc) + + strlen(ll_instance) + 2); + if (!osc) + GOTO(out_free, err = -ENOMEM); + sprintf(osc, "%s-%s", lprof->lp_osc, ll_instance); + + OBD_ALLOC(mdc, strlen(lprof->lp_mdc) + + strlen(ll_instance) + 2); + if (!mdc) + GOTO(out_free, err = -ENOMEM); + sprintf(mdc, "%s-%s", lprof->lp_mdc, ll_instance); + + /* connections, registrations, sb setup */ + err = client_common_fill_super(sb, mdc, osc); + +out_free: if (mdc) OBD_FREE(mdc, strlen(mdc) + 1); if (osc) OBD_FREE(osc, strlen(osc) + 1); - - RETURN(err); - -out_free: - if (sbi->ll_lmd) { - lustre_manual_cleanup(sbi); - OBD_FREE(sbi->ll_lmd, sizeof(*sbi->ll_lmd)); + if (err) { + struct obd_device *obd; + int next = 0; + /* 
like ll_put_super below */ + lustre_end_log(sb, NULL, &cfg); + while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next)) + != NULL) { + class_manual_cleanup(obd); + } + class_del_profile(profilenm); + ll_free_sbi(sb); + lsi->lsi_llsbi = NULL; + lustre_common_put_super(sb); } - lustre_free_sbi(sb); + RETURN(err); +} /* ll_fill_super */ - goto out_dev; -} /* lustre_fill_super */ -void lustre_put_super(struct super_block *sb) +void ll_put_super(struct super_block *sb) { + struct config_llog_instance cfg; + char ll_instance[sizeof(sb) * 2 + 1]; struct obd_device *obd; + struct lustre_sb_info *lsi = s2lsi(sb); struct ll_sb_info *sbi = ll_s2sbi(sb); - int force = 0; + char *profilenm = get_profile_name(sb); + int next; ENTRY; - CDEBUG(D_VFSTRACE, "VFS Op: sb %p\n", sb); + CDEBUG(D_VFSTRACE, "VFS Op: sb %p - %s\n", sb, profilenm); + + sprintf(ll_instance, "%p", sb); + cfg.cfg_instance = ll_instance; + lustre_end_log(sb, NULL, &cfg); + obd = class_exp2obd(sbi->ll_mdc_exp); if (obd) { - int next = 0; - /* We need to set force before the lov_disconnect in + int force = obd->obd_no_recov; + /* We need to set force before the lov_disconnect in lustre_common_put_super, since l_d cleans up osc's as well. 
*/ - force = obd->obd_no_recov; - while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next)) - !=NULL) { + next = 0; + while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next)) + != NULL) { obd->obd_force = force; - } + } } - lustre_common_put_super(sb); + client_common_put_super(sb); + + next = 0; + while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next)) !=NULL) { + class_manual_cleanup(obd); + } + + if (profilenm) + class_del_profile(profilenm); - if (sbi->ll_lmd != NULL) { - lustre_manual_cleanup(sbi); - OBD_FREE(sbi->ll_lmd, sizeof(*sbi->ll_lmd)); - } + ll_free_sbi(sb); + lsi->lsi_llsbi = NULL; - lustre_free_sbi(sb); + lustre_common_put_super(sb); + LCONSOLE_WARN("client umount complete\n"); EXIT; -} /* lustre_put_super */ +} /* client_put_super */ #ifdef HAVE_REGISTER_CACHE #include @@ -1666,12 +1430,18 @@ int ll_iocontrol(struct inode *inode, struct file *file, RETURN(0); } +/* umount -f client means force down, don't save state */ void ll_umount_begin(struct super_block *sb) { + struct lustre_sb_info *lsi = s2lsi(sb); struct ll_sb_info *sbi = ll_s2sbi(sb); struct obd_device *obd; struct obd_ioctl_data ioc_data = { 0 }; ENTRY; + + /* Tell the MGC we got umount -f */ + lsi->lsi_flags |= LSI_UMOUNT_FORCE; + CDEBUG(D_VFSTRACE, "VFS Op: superblock %p count %d active %d\n", sb, sb->s_count, atomic_read(&sb->s_active)); @@ -1707,12 +1477,12 @@ void ll_umount_begin(struct super_block *sb) EXIT; } -int lustre_remount_fs(struct super_block *sb, int *flags, char *data) +int ll_remount_fs(struct super_block *sb, int *flags, char *data) { struct ll_sb_info *sbi = ll_s2sbi(sb); int err; __u32 read_only; - + if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { read_only = *flags & MS_RDONLY; err = obd_set_info_async(sbi->ll_mdc_exp, strlen("read-only"), @@ -1723,7 +1493,7 @@ int lustre_remount_fs(struct super_block *sb, int *flags, char *data) "remount: %d\n", err); return err; } - + if (read_only) sb->s_flags |= MS_RDONLY; else diff --git 
a/lustre/llite/rw.c b/lustre/llite/rw.c index 0a54eca..d655cf4 100644 --- a/lustre/llite/rw.c +++ b/lustre/llite/rw.c @@ -486,7 +486,7 @@ int llap_shrink_cache(struct ll_sb_info *sbi, int shrink_fraction) ll_teardown_mmaps(page->mapping, (__u64)page->index<index<mapping); ll_truncate_complete_page(page); diff --git a/lustre/llite/super.c b/lustre/llite/super.c index e2f60fb..3d3fef2 100644 --- a/lustre/llite/super.c +++ b/lustre/llite/super.c @@ -41,60 +41,25 @@ extern struct address_space_operations ll_aops; extern struct address_space_operations ll_dir_aops; -static struct super_block *ll_read_super(struct super_block *sb, - void *data, int silent) -{ - int err; - ENTRY; - err = ll_fill_super(sb, data, silent); - if (err) - RETURN(NULL); - RETURN(sb); -} - -static struct super_block *lustre_read_super(struct super_block *sb, - void *data, int silent) -{ - int err; - ENTRY; - err = lustre_fill_super(sb, data, silent); - if (err) - RETURN(NULL); - RETURN(sb); -} - -static struct file_system_type lustre_lite_fs_type = { - .owner = THIS_MODULE, - .name = "lustre_lite", - .fs_flags = FS_NFSEXP_FSID, - .read_super = ll_read_super, -}; /* exported operations */ struct super_operations lustre_super_operations = { .read_inode2 = ll_read_inode2, .clear_inode = ll_clear_inode, - .put_super = lustre_put_super, + .put_super = ll_put_super, .statfs = ll_statfs, .umount_begin = ll_umount_begin, .fh_to_dentry = ll_fh_to_dentry, .dentry_to_fh = ll_dentry_to_fh, - .remount_fs = lustre_remount_fs, -}; - -static struct file_system_type lustre_fs_type = { - .owner = THIS_MODULE, - .name = "lustre", - .fs_flags = FS_NFSEXP_FSID, - .read_super = lustre_read_super, + .remount_fs = ll_remount_fs, }; static int __init init_lustre_lite(void) { int rc, seed[2]; - printk(KERN_INFO "Lustre: Lustre Lite Client File System; " + printk(KERN_INFO "Lustre: Lustre Client File System; " "info@clusterfs.com\n"); ll_file_data_slab = kmem_cache_create("ll_file_data", sizeof(struct ll_file_data), 0, @@ 
-107,14 +72,7 @@ static int __init init_lustre_lite(void) ll_register_cache(&ll_cache_definition); - rc = register_filesystem(&lustre_lite_fs_type); - if (rc == 0) - rc = register_filesystem(&lustre_fs_type); - if (rc) { - /* This is safe even if lustre_lite_fs_type isn't registered */ - unregister_filesystem(&lustre_lite_fs_type); - ll_unregister_cache(&ll_cache_definition); - } + lustre_register_client_fill_super(ll_fill_super); get_random_bytes(seed, sizeof(seed)); ll_srand(seed[0], seed[1]); @@ -126,9 +84,8 @@ static void __exit exit_lustre_lite(void) { int rc; - unregister_filesystem(&lustre_lite_fs_type); - unregister_filesystem(&lustre_fs_type); - + lustre_register_client_fill_super(NULL); + ll_unregister_cache(&ll_cache_definition); rc = kmem_cache_destroy(ll_file_data_slab); diff --git a/lustre/llite/super25.c b/lustre/llite/super25.c index c3072a3..b6e7d51 100644 --- a/lustre/llite/super25.c +++ b/lustre/llite/super25.c @@ -35,20 +35,6 @@ #include #include "llite_internal.h" -struct super_block * ll_get_sb(struct file_system_type *fs_type, - int flags, const char *devname, void * data) -{ - /* calls back in fill super */ - return get_sb_nodev(fs_type, flags, data, ll_fill_super); -} - -struct super_block * lustre_get_sb(struct file_system_type *fs_type, - int flags, const char *devname, void * data) -{ - /* calls back in fill super */ - return get_sb_nodev(fs_type, flags, data, lustre_fill_super); -} - static kmem_cache_t *ll_inode_cachep; static struct inode *ll_alloc_inode(struct super_block *sb) @@ -105,33 +91,17 @@ struct super_operations lustre_super_operations = .alloc_inode = ll_alloc_inode, .destroy_inode = ll_destroy_inode, .clear_inode = ll_clear_inode, - .put_super = lustre_put_super, + .put_super = ll_put_super, .statfs = ll_statfs, .umount_begin = ll_umount_begin, - .remount_fs = lustre_remount_fs, -}; - - -struct file_system_type lustre_lite_fs_type = { - .owner = THIS_MODULE, - .name = "lustre_lite", - .get_sb = ll_get_sb, - .kill_sb = 
kill_anon_super, - .fs_flags = FS_BINARY_MOUNTDATA, + .remount_fs = ll_remount_fs, }; -struct file_system_type lustre_fs_type = { - .owner = THIS_MODULE, - .name = "lustre", - .get_sb = lustre_get_sb, - .kill_sb = kill_anon_super, - .fs_flags = FS_BINARY_MOUNTDATA, -}; static int __init init_lustre_lite(void) { int rc, seed[2]; - printk(KERN_INFO "Lustre: Lustre Lite Client File System; " + printk(KERN_INFO "Lustre: Lustre Client File System; " "info@clusterfs.com\n"); rc = ll_init_inodecache(); if (rc) @@ -148,19 +118,12 @@ static int __init init_lustre_lite(void) proc_mkdir("llite", proc_lustre_root) : NULL; ll_register_cache(&ll_cache_definition); - - rc = register_filesystem(&lustre_lite_fs_type); - if (rc == 0) - rc = register_filesystem(&lustre_fs_type); - if (rc) { - /* This is safe even if lustre_lite_fs_type isn't registered */ - unregister_filesystem(&lustre_lite_fs_type); - ll_unregister_cache(&ll_cache_definition); - } - + + lustre_register_client_fill_super(ll_fill_super); + get_random_bytes(seed, sizeof(seed)); ll_srand(seed[0], seed[1]); - + return rc; } @@ -168,8 +131,7 @@ static void __exit exit_lustre_lite(void) { int rc; - unregister_filesystem(&lustre_fs_type); - unregister_filesystem(&lustre_lite_fs_type); + lustre_register_client_fill_super(NULL); ll_unregister_cache(&ll_cache_definition); diff --git a/lustre/lov/lov_log.c b/lustre/lov/lov_log.c index 23ccc08..454b5a6 100644 --- a/lustre/lov/lov_log.c +++ b/lustre/lov/lov_log.c @@ -107,8 +107,12 @@ static int lov_llog_origin_connect(struct llog_ctxt *ctxt, int count, int i, rc = 0; ENTRY; - LASSERT(lov->desc.ld_tgt_count == count); - for (i = 0, tgt = lov->tgts; i < lov->desc.ld_tgt_count; i++, tgt++) { + /* We might have added an osc and not told the mds yet */ + if (count != lov->desc.ld_tgt_count) + CERROR("Origin connect mds cnt %d != lov cnt %d\n", count, + lov->desc.ld_tgt_count); + + for (i = 0, tgt = lov->tgts; i < count; i++, tgt++) { struct obd_device *child; struct llog_ctxt *cctxt; 
@@ -121,7 +125,7 @@ static int lov_llog_origin_connect(struct llog_ctxt *ctxt, int count, rc = llog_connect(cctxt, 1, logid, gen, uuid); if (rc) { - CERROR("error osc_llog_connect %d\n", i); + CERROR("error osc_llog_connect tgt %d (%d)\n", i, rc); break; } } @@ -188,6 +192,7 @@ int lov_llog_init(struct obd_device *obd, struct obd_device *tgt, if (rc) RETURN(rc); + CDEBUG(D_CONFIG, "llog init with %d targets\n", count); LASSERT(lov->desc.ld_tgt_count == count); for (i = 0, ctgt = lov->tgts; i < lov->desc.ld_tgt_count; i++, ctgt++) { struct obd_device *child; @@ -196,7 +201,7 @@ int lov_llog_init(struct obd_device *obd, struct obd_device *tgt, child = ctgt->ltd_exp->exp_obd; rc = obd_llog_init(child, tgt, 1, logid + i); if (rc) { - CERROR("error osc_llog_init %d\n", i); + CERROR("error osc_llog_init %d (%d)\n", i, rc); break; } } diff --git a/lustre/lov/lov_obd.c b/lustre/lov/lov_obd.c index 4fcc9d1..4b22292 100644 --- a/lustre/lov/lov_obd.c +++ b/lustre/lov/lov_obd.c @@ -46,6 +46,7 @@ #include #include #include +#include #include "lov_internal.h" @@ -107,17 +108,22 @@ static int lov_connect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt, &obd->obd_uuid); if (!tgt_obd) { - CERROR("Target %s not attached\n", tgt_uuid->uuid); + CERROR("Target %s not attached\n", obd_uuid2str(tgt_uuid)); RETURN(-EINVAL); } + + CDEBUG(D_CONFIG, "Connect tgt %s (%s)\n", obd_uuid2str(tgt_uuid), + tgt_obd->obd_name); if (!tgt_obd->obd_set_up) { - CERROR("Target %s not set up\n", tgt_uuid->uuid); + CERROR("Target %s not set up\n", obd_uuid2str(tgt_uuid)); RETURN(-EINVAL); } if (activate) { tgt_obd->obd_no_recov = 0; + /* FIXME this is probably supposed to be + ptlrpc_set_import_active. Horrible naming. 
*/ ptlrpc_activate_import(tgt_obd->u.cli.cl_import); } @@ -128,19 +134,20 @@ static int lov_connect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt, if (imp->imp_invalid) { CERROR("not connecting OSC %s; administratively " - "disabled\n", tgt_uuid->uuid); + "disabled\n", obd_uuid2str(tgt_uuid)); rc = obd_register_observer(tgt_obd, obd); if (rc) { CERROR("Target %s register_observer error %d; " "will not be able to reactivate\n", - tgt_uuid->uuid, rc); + obd_uuid2str(tgt_uuid), rc); } RETURN(0); } rc = obd_connect(&conn, tgt_obd, &lov_osc_uuid, data); if (rc) { - CERROR("Target %s connect error %d\n", tgt_uuid->uuid, rc); + CERROR("Target %s connect error %d\n", + obd_uuid2str(tgt_uuid), rc); RETURN(rc); } tgt->ltd_exp = class_conn2export(&conn); @@ -148,7 +155,7 @@ static int lov_connect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt, rc = obd_register_observer(tgt_obd, obd); if (rc) { CERROR("Target %s register_observer error %d\n", - tgt_uuid->uuid, rc); + obd_uuid2str(tgt_uuid), rc); obd_disconnect(tgt->ltd_exp); tgt->ltd_exp = NULL; RETURN(rc); @@ -191,58 +198,20 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd, struct obd_uuid *cluuid, struct obd_connect_data *data) { struct lov_obd *lov = &obd->u.lov; - struct lov_tgt_desc *tgt; - struct obd_export *exp; - __u64 connect_flags = data ? data->ocd_connect_flags : 0; - int rc, rc2, i; + int rc; ENTRY; - rc = class_connect(conn, obd, cluuid); - if (rc) - RETURN(rc); - - exp = class_conn2export(conn); + lov->ocd.ocd_connect_flags = OBD_CONNECT_EMPTY; + if (data) + lov->ocd = *data; - /* We don't want to actually do the underlying connections more than - * once, so keep track. 
*/ - lov->connects++; - if (lov->connects > 1) { - class_export_put(exp); - RETURN(0); - } - - for (i = 0, tgt = lov->tgts; i < lov->desc.ld_tgt_count; i++, tgt++) { - if (obd_uuid_empty(&tgt->uuid)) - continue; - if (connect_flags & OBD_CONNECT_INDEX) - data->ocd_index = i; - rc = lov_connect_obd(obd, tgt, 0, data); - if (rc) - GOTO(out_disc, rc); - if (data) - connect_flags &= data->ocd_connect_flags; - } - - if (data) - data->ocd_connect_flags = connect_flags; + rc = class_connect(conn, obd, cluuid); + if (!rc) + lov->connects++; + CDEBUG(D_CONFIG, "connect #%d\n", lov->connects); - class_export_put(exp); - RETURN (0); + /* target connects are done in lov_add_target */ - out_disc: - while (i-- > 0) { - struct obd_uuid uuid; - --tgt; - --lov->desc.ld_active_tgt_count; - tgt->active = 0; - /* save for CERROR below; (we know it's terminated) */ - uuid = tgt->uuid; - rc2 = obd_disconnect(tgt->ltd_exp); - if (rc2) - CERROR("error: LOV target %s disconnect on OST idx %d: " - "rc = %d\n", uuid.uuid, i, rc2); - } - class_disconnect(exp); RETURN (rc); } @@ -254,7 +223,8 @@ static int lov_disconnect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt) int rc; ENTRY; - CDEBUG(D_CONFIG, "Disconnecting lov target %s\n", obd->obd_uuid.uuid); + CDEBUG(D_CONFIG, "%s: disconnecting target %s\n", + obd->obd_name, osc_obd->obd_name); lov_proc_dir = lprocfs_srch(obd->obd_proc_entry, "target_obds"); if (lov_proc_dir) { @@ -299,8 +269,8 @@ static int lov_disconnect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt) RETURN(0); } -static int lov_del_obd(struct obd_device *obd, struct obd_uuid *uuidp, - int index, int gen); +static int lov_del_target(struct obd_device *obd, struct obd_uuid *uuidp, + int index, int gen); static int lov_disconnect(struct obd_export *exp) { @@ -315,8 +285,11 @@ static int lov_disconnect(struct obd_export *exp) /* Only disconnect the underlying layers on the final disconnect. 
*/ lov->connects--; - if (lov->connects != 0) + if (lov->connects != 0) { + /* why should there be more than 1 connect? */ + CERROR("disconnect #%d\n", lov->connects); goto out; + } /* Let's hold another reference so lov_del_obd doesn't spin through putref every time */ @@ -324,13 +297,13 @@ static int lov_disconnect(struct obd_export *exp) for (i = 0, tgt = lov->tgts; i < lov->desc.ld_tgt_count; i++, tgt++) { if (tgt->ltd_exp) { /* Disconnection is the last we know about an obd */ - lov_del_obd(obd, &tgt->uuid, i, tgt->ltd_gen); + lov_del_target(obd, &tgt->uuid, i, tgt->ltd_gen); } } lov_putref(obd); out: - rc = class_disconnect(exp); + rc = class_disconnect(exp); /* bz 9811 */ RETURN(rc); } @@ -384,21 +357,24 @@ static int lov_set_osc_active(struct lov_obd *lov, struct obd_uuid *uuid, } static int lov_notify(struct obd_device *obd, struct obd_device *watched, - enum obd_notify_event ev) + enum obd_notify_event ev, void *data) { - struct obd_uuid *uuid; - int rc; + int rc = 0; ENTRY; - if (strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME)) { - CERROR("unexpected notification of %s %s!\n", - watched->obd_type->typ_name, - watched->obd_name); - RETURN(-EINVAL); - } - uuid = &watched->u.cli.cl_target_uuid; - if (ev == OBD_NOTIFY_ACTIVE || ev == OBD_NOTIFY_INACTIVE) { + struct obd_uuid *uuid; + + LASSERT(watched); + + if (strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME)) { + CERROR("unexpected notification of %s %s!\n", + watched->obd_type->typ_name, + watched->obd_name); + RETURN(-EINVAL); + } + uuid = &watched->u.cli.cl_target_uuid; + /* Set OSC as active before notifying the observer, so the * observer can use the OSC normally. */ @@ -409,29 +385,51 @@ static int lov_notify(struct obd_device *obd, struct obd_device *watched, if (rc) { CERROR("%sactivation of %s failed: %d\n", (ev == OBD_NOTIFY_ACTIVE) ? "" : "de", - uuid->uuid, rc); + obd_uuid2str(uuid), rc); RETURN(rc); } } /* Pass the notification up the chain. 
*/ - rc = obd_notify_observer(obd, watched, ev); + if (watched) { + rc = obd_notify_observer(obd, watched, ev, data); + } else { + /* NULL watched means all osc's in the lov (only for syncs) */ + struct lov_obd *lov = &obd->u.lov; + struct lov_tgt_desc *tgt; + struct obd_device *tgt_obd; + int i; + for (i = 0, tgt = lov->tgts; i < lov->desc.ld_tgt_count; + i++, tgt++) { + if (obd_uuid_empty(&tgt->uuid)) + continue; + tgt_obd = class_exp2obd(tgt->ltd_exp); + rc = obd_notify_observer(obd, tgt_obd, ev, data); + if (rc) { + CERROR("%s: notify %s of %s failed %d\n", + obd->obd_name, + obd->obd_observer->obd_name, + tgt_obd->obd_name, rc); + break; + } + } + } RETURN(rc); } -static int -lov_add_obd(struct obd_device *obd, struct obd_uuid *uuidp, int index, int gen) +static int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp, + int index, int gen, int active) { struct lov_obd *lov = &obd->u.lov; struct lov_tgt_desc *tgt; - obd_id params[2]; - int rc, old_count; - __u32 bufsize, size = 2; + struct obd_connect_data *ocd = NULL; + __u32 bufsize, idx; + int rc; ENTRY; - CDEBUG(D_CONFIG, "uuid: %s idx: %d gen: %d\n", - uuidp->uuid, index, gen); + CDEBUG(D_CONFIG, "uuid:%s idx:%d gen:%d active:%d\n", + uuidp->uuid, index, gen, active); if (index < 0) { CERROR("request to add OBD %s at invalid index: %d\n", @@ -474,8 +472,8 @@ lov_add_obd(struct obd_device *obd, struct obd_uuid *uuidp, int index, int gen) tgt = &lov->tgts[index]; if (!obd_uuid_empty(&tgt->uuid)) { - CERROR("OBD already assigned at LOV target index %d\n", - index); + CERROR("UUID %s already assigned at LOV target index %d\n", + obd_uuid2str(&tgt->uuid), index); RETURN(-EEXIST); } @@ -485,18 +483,12 @@ lov_add_obd(struct obd_device *obd, struct obd_uuid *uuidp, int index, int gen) tgt->index = index; INIT_LIST_HEAD(&tgt->qos_bavail_list); - old_count = lov->desc.ld_tgt_count; if (index >= lov->desc.ld_tgt_count) lov->desc.ld_tgt_count = index + 1; CDEBUG(D_CONFIG, "idx=%d ltd_gen=%d 
ld_tgt_count=%d\n", index, tgt->ltd_gen, lov->desc.ld_tgt_count); - if (lov->connects == 0) - /* lov_connect hasn't been called yet. So we'll do the - lov_connect_obd on this obd when that fn first runs. */ - RETURN(0); - if (tgt->ltd_exp) { struct obd_device *osc_obd; @@ -505,37 +497,36 @@ lov_add_obd(struct obd_device *obd, struct obd_uuid *uuidp, int index, int gen) osc_obd->obd_no_recov = 0; } - /* NULL may need to change when we use flags for osc's */ - rc = lov_connect_obd(obd, tgt, 1, NULL); - if (rc || !obd->obd_observer) - RETURN(rc); - - /* tell the mds_lov about the new target */ - obd_llog_finish(obd->obd_observer, old_count); - llog_cat_initialize(obd->obd_observer, lov->desc.ld_tgt_count); - - params[0] = index; - rc = obd_get_info(tgt->ltd_exp, strlen("last_id"), "last_id", &size, - ¶ms[1]); + if (lov->ocd.ocd_connect_flags != OBD_CONNECT_EMPTY) { + /* Keep the original connect flags pristine */ + OBD_ALLOC(ocd, sizeof(*ocd)); + if (!ocd) + RETURN(-ENOMEM); + *ocd = lov->ocd; + } + rc = lov_connect_obd(obd, tgt, active, ocd); + if (ocd) + OBD_FREE(ocd, sizeof(*ocd)); if (rc) GOTO(out, rc); - rc = obd_set_info_async(obd->obd_observer->obd_self_export, - strlen("next_id"),"next_id", 2, params, NULL); - if (rc) - GOTO(out, rc); + idx = index; + rc = lov_notify(obd, tgt->ltd_exp->exp_obd, + active ? 
OBD_NOTIFY_ACTIVE : OBD_NOTIFY_INACTIVE, + (void *)&idx); - rc = lov_notify(obd, tgt->ltd_exp->exp_obd, OBD_NOTIFY_ACTIVE); - GOTO(out, rc); - out: - if (rc && tgt->ltd_exp != NULL) - lov_disconnect_obd(obd, tgt); - return rc; +out: + if (rc) { + CERROR("add failed (%d), deleting %s\n", rc, + (char *)tgt->uuid.uuid); + lov_del_target(obd, &tgt->uuid, index, 0); + } + RETURN(rc); } /* Schedule a target for deletion */ -static int -lov_del_obd(struct obd_device *obd, struct obd_uuid *uuidp, int index, int gen) +static int lov_del_target(struct obd_device *obd, struct obd_uuid *uuidp, + int index, int gen) { struct lov_obd *lov = &obd->u.lov; struct lov_tgt_desc *tgt; @@ -556,7 +547,7 @@ lov_del_obd(struct obd_device *obd, struct obd_uuid *uuidp, int index, int gen) RETURN(-EINVAL); } - if (strncmp(uuidp->uuid, tgt->uuid.uuid, sizeof uuidp->uuid) != 0) { + if (!obd_uuid_equals(uuidp, &tgt->uuid)) { CERROR("LOV target UUID %s at index %d doesn't match %s.\n", tgt->uuid.uuid, index, uuidp->uuid); RETURN(-EINVAL); @@ -581,6 +572,9 @@ static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt) LASSERT(tgt->reap); osc_obd = class_exp2obd(tgt->ltd_exp); + CDEBUG(D_CONFIG, "Removing tgt %s : %s\n", tgt->uuid.uuid, + osc_obd ? 
osc_obd->obd_name : ""); + if (tgt->ltd_exp) lov_disconnect_obd(obd, tgt); @@ -602,6 +596,31 @@ static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt) } } +static void lov_fix_desc(struct lov_desc *desc) +{ + if (desc->ld_default_stripe_size < PTLRPC_MAX_BRW_SIZE) { + CWARN("Increasing default_stripe_size "LPU64" to %u\n", + desc->ld_default_stripe_size, PTLRPC_MAX_BRW_SIZE); + desc->ld_default_stripe_size = PTLRPC_MAX_BRW_SIZE; + } else if (desc->ld_default_stripe_size & (LOV_MIN_STRIPE_SIZE - 1)) { + CWARN("default_stripe_size "LPU64" isn't a multiple of %u\n", + desc->ld_default_stripe_size, LOV_MIN_STRIPE_SIZE); + desc->ld_default_stripe_size &= ~(LOV_MIN_STRIPE_SIZE - 1); + CWARN("changing to "LPU64"\n", desc->ld_default_stripe_size); + } + + if (desc->ld_default_stripe_count == 0) + desc->ld_default_stripe_count = 1; + + /* from lov_setstripe */ + if ((desc->ld_pattern != 0) && + (desc->ld_pattern != LOV_PATTERN_RAID0)) { + CDEBUG(D_IOCTL, "bad userland stripe pattern: %#x\n", + desc->ld_pattern); + desc->ld_pattern = 0; + } +} + static int lov_setup(struct obd_device *obd, obd_count len, void *buf) { struct lprocfs_static_vars lvars; @@ -637,22 +656,7 @@ static int lov_setup(struct obd_device *obd, obd_count len, void *buf) } } - if (desc->ld_default_stripe_size < PTLRPC_MAX_BRW_SIZE) { - CWARN("Increasing default_stripe_size "LPU64" to %u\n", - desc->ld_default_stripe_size, PTLRPC_MAX_BRW_SIZE); - CWARN("Please update config and run --write-conf on MDS\n"); - - desc->ld_default_stripe_size = PTLRPC_MAX_BRW_SIZE; - } else if (desc->ld_default_stripe_size & (LOV_MIN_STRIPE_SIZE - 1)) { - CWARN("default_stripe_size "LPU64" isn't a multiple of %u\n", - desc->ld_default_stripe_size, LOV_MIN_STRIPE_SIZE); - CWARN("Please update config and run --write-conf on MDS\n"); - - desc->ld_default_stripe_size &= ~(LOV_MIN_STRIPE_SIZE - 1); - } - - if (desc->ld_default_stripe_count == 0) - desc->ld_default_stripe_count = 1; + lov_fix_desc(desc); /* 
Because of 64-bit divide/mod operations only work with a 32-bit * divisor in a 32-bit kernel, we cannot support a stripe width @@ -746,8 +750,11 @@ static int lov_cleanup(struct obd_device *obd) /* We should never get here - these should have been removed in the disconnect. */ if (!obd_uuid_empty(&tgt->uuid)) { - CERROR("lov tgt %d not cleaned!\n", i); - lov_del_obd(obd, &tgt->uuid, i, 0); + CERROR("lov tgt %d not cleaned!" + " deathrow=%d, lovrc=%d\n", + i, lov->death_row, + atomic_read(&lov->refcount)); + lov_del_target(obd, &tgt->uuid, i, 0); } } OBD_FREE(lov->tgts, lov->bufsize); @@ -767,7 +774,9 @@ static int lov_process_config(struct obd_device *obd, obd_count len, void *buf) switch(cmd = lcfg->lcfg_command) { case LCFG_LOV_ADD_OBD: + case LCFG_LOV_ADD_INA: case LCFG_LOV_DEL_OBD: { + /* lov_modify_tgts add 0:lov_mdsA 1:ost1_UUID 2:0 3:1 */ if (LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(obd_uuid.uuid)) GOTO(out, rc = -EINVAL); @@ -778,9 +787,52 @@ static int lov_process_config(struct obd_device *obd, obd_count len, void *buf) if (sscanf(lustre_cfg_buf(lcfg, 3), "%d", &gen) != 1) GOTO(out, rc = -EINVAL); if (cmd == LCFG_LOV_ADD_OBD) - rc = lov_add_obd(obd, &obd_uuid, index, gen); + rc = lov_add_target(obd, &obd_uuid, index, gen, 1); + else if (cmd == LCFG_LOV_ADD_INA) + rc = lov_add_target(obd, &obd_uuid, index, gen, 0); else - rc = lov_del_obd(obd, &obd_uuid, index, gen); + rc = lov_del_target(obd, &obd_uuid, index, gen); + GOTO(out, rc); + } + case LCFG_PARAM: { + int i; + struct lov_obd *lov = &obd->u.lov; + struct lov_desc *desc = &(lov->desc); + if (!desc) + GOTO(out, rc = -EINVAL); + /* see jt_obd_lov_getconfig for variable names */ + /* setparam 0:lov_mdsA 1:default_stripe_size=1048576 + 2:default_stripe_pattern=0 3:default_stripe_offset=0 */ + for (i = 1; i < lcfg->lcfg_bufcount; i++) { + char *key, *sval; + long val; + key = lustre_cfg_buf(lcfg, i); + sval = strchr(key, '='); + if (!sval || (*(sval + 1) == 0)) { + CERROR("Can't parse param %s\n", key); + rc = 
-EINVAL; + /* continue parsing other params */ + continue; + } + *sval = 0; + val = simple_strtol(sval + 1, NULL, 0); + if (strcmp(key, PARAM_D_STRIPE_SIZE) == 0) + desc->ld_default_stripe_size = val; + else if (strcmp(key, PARAM_D_STRIPE_COUNT) == 0) + desc->ld_default_stripe_count = val; + else if (strcmp(key, PARAM_D_STRIPE_OFFSET) == 0) + desc->ld_default_stripe_offset = val; + else if (strcmp(key, PARAM_D_STRIPE_PATTERN) == 0) + desc->ld_pattern = val; + else { + CERROR("Unknown param %s\n", key); + rc = -EINVAL; + /* continue parsing other params */ + continue; + } + LCONSOLE_INFO("set %s to %ld\n", key, val); + } + lov_fix_desc(desc); GOTO(out, rc); } default: { @@ -837,6 +889,8 @@ static int lov_clear_orphans(struct obd_export *export, struct obdo *src_oa, if (ost_uuid && !obd_uuid_equals(ost_uuid, &lov->tgts[i].uuid)) continue; + CDEBUG(D_CONFIG,"Clear orphans for %d:%s\n", i, ost_uuid->uuid); + memcpy(tmp_oa, src_oa, sizeof(*tmp_oa)); LASSERT(lov->tgts[i].ltd_exp); @@ -1482,14 +1536,24 @@ int lov_prep_async_page(struct obd_export *exp, struct lov_stripe_md *lsm, { struct lov_obd *lov = &exp->exp_obd->u.lov; struct lov_async_page *lap; - int rc; + int rc = 0; ENTRY; - if (!page) - return size_round(sizeof(*lap)) + - obd_prep_async_page(lov->tgts[0].ltd_exp, NULL, NULL, - NULL, 0, NULL, NULL, NULL); - + if (!page) { + int i = 0; + /* Find an existing osc so we can get it's stupid sizeof(*oap). 
+ Only because of this layering limitation will a client + mount with no osts fail */ + while (!lov->tgts[i].ltd_exp) { + i++; + if (i >= lov->desc.ld_tgt_count) + RETURN(-ENOTBLK); + } + rc = size_round(sizeof(*lap)) + + obd_prep_async_page(lov->tgts[i].ltd_exp, NULL, NULL, + NULL, 0, NULL, NULL, NULL); + RETURN(rc); + } ASSERT_LSM_MAGIC(lsm); LASSERT(loi == NULL); @@ -2142,7 +2206,8 @@ static int lov_get_info(struct obd_export *exp, __u32 keylen, GOTO(out, rc); } GOTO(out, rc = 0); - } else if (keylen >= strlen("lovdesc") && strcmp(key, "lovdesc") == 0) { + } else if (keylen >= strlen(KEY_LOVDESC) && + strcmp(key, KEY_LOVDESC) == 0) { struct lov_desc *desc_ret = val; *desc_ret = lov->desc; @@ -2171,15 +2236,15 @@ static int lov_set_info_async(struct obd_export *exp, obd_count keylen, RETURN(-ENOMEM); } - if (KEY_IS("next_id")) { - if (vallen != lov->desc.ld_tgt_count) + if (KEY_IS(KEY_NEXT_ID)) { + if (vallen > lov->desc.ld_tgt_count) RETURN(-EINVAL); vallen = sizeof(obd_id); } lov_getref(obddev); - if (KEY_IS("next_id") || KEY_IS("checksum")) { + if (KEY_IS(KEY_NEXT_ID) || KEY_IS("checksum")) { for (i = 0; i < lov->desc.ld_tgt_count; i++) { /* OST was disconnected */ if (!lov->tgts[i].ltd_exp) @@ -2209,7 +2274,7 @@ static int lov_set_info_async(struct obd_export *exp, obd_count keylen, GOTO(out, rc); } - if (KEY_IS("mds_conn") || KEY_IS("unlinked")) { + if (KEY_IS(KEY_MDS_CONN) || KEY_IS("unlinked")) { if (vallen != 0) GOTO(out, rc = -EINVAL); } else { diff --git a/lustre/lvfs/fsfilt.c b/lustre/lvfs/fsfilt.c index fee6b69..6f88917 100644 --- a/lustre/lvfs/fsfilt.c +++ b/lustre/lvfs/fsfilt.c @@ -35,16 +35,16 @@ int fsfilt_register_ops(struct fsfilt_operations *fs_ops) if ((found = fsfilt_search_type(fs_ops->fs_type))) { if (found != fs_ops) { CERROR("different operations for type %s\n", - fs_ops->fs_type); + fs_ops->fs_type); /* unlock fsfilt_types list */ RETURN(-EEXIST); } } else { PORTAL_MODULE_USE; - list_add(&fs_ops->fs_list, &fsfilt_types); - } + 
list_add(&fs_ops->fs_list, &fsfilt_types); + } - /* unlock fsfilt_types list */ + /* unlock fsfilt_types list */ return 0; } @@ -54,7 +54,7 @@ void fsfilt_unregister_ops(struct fsfilt_operations *fs_ops) /* lock fsfilt_types list */ list_for_each(p, &fsfilt_types) { - struct fsfilt_operations *found; + struct fsfilt_operations *found; found = list_entry(p, typeof(*found), fs_list); if (found == fs_ops) { @@ -86,9 +86,9 @@ struct fsfilt_operations *fsfilt_get_ops(const char *type) } if (rc) { - CERROR("Can't find fsfilt_%s interface\n", name); - RETURN(ERR_PTR(rc)); - /* unlock fsfilt_types list */ + CERROR("Can't find %s interface\n", name); + RETURN(ERR_PTR(rc < 0 ? rc : -rc)); + /* unlock fsfilt_types list */ } } try_module_get(fs_ops->fs_owner); diff --git a/lustre/lvfs/fsfilt_ext3.c b/lustre/lvfs/fsfilt_ext3.c index 34da8d1..533f0d3 100644 --- a/lustre/lvfs/fsfilt_ext3.c +++ b/lustre/lvfs/fsfilt_ext3.c @@ -73,11 +73,44 @@ struct fsfilt_cb_data { #define EXT3_XATTR_INDEX_TRUSTED 4 #endif -static char *fsfilt_ext3_label(struct super_block *sb) +static char *fsfilt_ext3_get_label(struct super_block *sb) { return EXT3_SB(sb)->s_es->s_volume_name; } +static int fsfilt_ext3_set_label(struct super_block *sb, char *label) +{ + /* see e.g. 
fsfilt_ext3_write_record() */ + journal_t *journal; + handle_t *handle; + int err; + + journal = EXT3_SB(sb)->s_journal; + lock_24kernel(); + handle = journal_start(journal, 1); + unlock_24kernel(); + if (IS_ERR(handle)) { + CERROR("can't start transaction\n"); + return(PTR_ERR(handle)); + } + + err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); + if (err) + goto out; + + memcpy(EXT3_SB(sb)->s_es->s_volume_name, label, + sizeof(EXT3_SB(sb)->s_es->s_volume_name)); + + err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + +out: + lock_24kernel(); + journal_stop(handle); + unlock_24kernel(); + + return(err); +} + static char *fsfilt_ext3_uuid(struct super_block *sb) { return EXT3_SB(sb)->s_es->s_uuid; @@ -693,7 +726,7 @@ static int fsfilt_ext3_sync(struct super_block *sb) #undef EXT3_MULTIBLOCK_ALLOCATOR #endif #ifndef EXT3_EXTENTS_FL -#define EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */ +#define EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */ #endif #ifdef EXT3_MULTIBLOCK_ALLOCATOR @@ -1924,7 +1957,8 @@ static int fsfilt_ext3_dquot(struct lustre_dquot *dquot, int cmd) static struct fsfilt_operations fsfilt_ext3_ops = { .fs_type = "ext3", .fs_owner = THIS_MODULE, - .fs_label = fsfilt_ext3_label, + .fs_getlabel = fsfilt_ext3_get_label, + .fs_setlabel = fsfilt_ext3_set_label, .fs_uuid = fsfilt_ext3_uuid, .fs_start = fsfilt_ext3_start, .fs_brw_start = fsfilt_ext3_brw_start, diff --git a/lustre/lvfs/lvfs_linux.c b/lustre/lvfs/lvfs_linux.c index 188f8be..969f1c3 100644 --- a/lustre/lvfs/lvfs_linux.c +++ b/lustre/lvfs/lvfs_linux.c @@ -294,8 +294,9 @@ struct dentry *simple_mkdir(struct dentry *dir, char *name, int mode, int fix) /* Fixup directory permissions if necessary */ if (fix && (old_mode & S_IALLUGO) != (mode & S_IALLUGO)) { - CWARN("fixing permissions on %s from %o to %o\n", - name, old_mode, mode); + CDEBUG(D_CONFIG, + "fixing permissions on %s from %o to %o\n", + name, old_mode, mode); dchild->d_inode->i_mode = (mode & 
S_IALLUGO) | (old_mode & ~S_IALLUGO); mark_inode_dirty(dchild->d_inode); diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c index 580cebc..dc9fc62 100644 --- a/lustre/mdc/mdc_request.c +++ b/lustre/mdc/mdc_request.c @@ -838,16 +838,16 @@ int mdc_set_info_async(struct obd_export *exp, obd_count keylen, struct obd_import *imp = class_exp2cliimp(exp); int rc = -EINVAL; - if (KEY_IS("initial_recov")) { + if (KEY_IS(KEY_INIT_RECOV)) { if (vallen != sizeof(int)) RETURN(-EINVAL); imp->imp_initial_recov = *(int *)val; - CDEBUG(D_HA, "%s: set imp_no_init_recov = %d\n", + CDEBUG(D_HA, "%s: set imp_initial_recov = %d\n", exp->exp_obd->obd_name, imp->imp_initial_recov); RETURN(0); } /* Turn off initial_recov after we try all backup servers once */ - if (KEY_IS("init_recov_bk")) { + if (KEY_IS(KEY_INIT_RECOV_BACKUP)) { if (vallen != sizeof(int)) RETURN(-EINVAL); imp->imp_initial_recov_bk = *(int *)val; @@ -1074,7 +1074,7 @@ static int mdc_import_event(struct obd_device *obd, struct obd_import *imp, break; } case IMP_EVENT_INACTIVE: { - rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE); + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE, NULL); break; } case IMP_EVENT_INVALIDATE: { @@ -1085,7 +1085,7 @@ static int mdc_import_event(struct obd_device *obd, struct obd_import *imp, break; } case IMP_EVENT_ACTIVE: { - rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE); + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE, NULL); break; } case IMP_EVENT_OCD: @@ -1154,7 +1154,7 @@ int mdc_init_ea_size(struct obd_export *mdc_exp, struct obd_export *lov_exp) int rc, size; ENTRY; - rc = obd_get_info(lov_exp, strlen("lovdesc") + 1, "lovdesc", + rc = obd_get_info(lov_exp, strlen(KEY_LOVDESC) + 1, KEY_LOVDESC, &valsize, &desc); if (rc) RETURN(rc); @@ -1187,13 +1187,17 @@ static int mdc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage) int rc = 0; ENTRY; - if (stage < OBD_CLEANUP_SELF_EXP) - RETURN(0); - - rc = obd_llog_finish(obd, 0); - if (rc != 
0) - CERROR("failed to cleanup llogging subsystems\n"); - + switch (stage) { + case OBD_CLEANUP_EARLY: + case OBD_CLEANUP_EXPORTS: + break; + case OBD_CLEANUP_SELF_EXP: + rc = obd_llog_finish(obd, 0); + if (rc != 0) + CERROR("failed to cleanup llogging subsystems\n"); + case OBD_CLEANUP_OBD: + break; + } RETURN(rc); } diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c index 71d31f8..9a1f1fe 100644 --- a/lustre/mds/handler.c +++ b/lustre/mds/handler.c @@ -34,11 +34,9 @@ #endif #define DEBUG_SUBSYSTEM S_MDS -#include #include -#include +#include #include -#include #include #include #include @@ -51,12 +49,15 @@ #else # include #endif + +#include +#include #include -#include #include #include #include #include +#include #include #include "mds_internal.h" @@ -995,7 +996,6 @@ out_ucred: return rc; } - static int mds_obd_statfs(struct obd_device *obd, struct obd_statfs *osfs, unsigned long max_age) { @@ -1228,7 +1228,7 @@ static char *reint_names[] = { [REINT_OPEN] "open", }; -static int mds_set_info(struct obd_export *exp, struct ptlrpc_request *req) +static int mds_set_info_rpc(struct obd_export *exp, struct ptlrpc_request *req) { char *key; __u32 *val; @@ -1585,7 +1585,7 @@ int mds_handle(struct ptlrpc_request *req) case MDS_SET_INFO: DEBUG_REQ(D_INODE, req, "set_info"); - rc = mds_set_info(req->rq_export, req); + rc = mds_set_info_rpc(req->rq_export, req); break; case MDS_QUOTACHECK: @@ -1708,22 +1708,44 @@ int mds_handle(struct ptlrpc_request *req) int mds_update_server_data(struct obd_device *obd, int force_sync) { struct mds_obd *mds = &obd->u.mds; - struct mds_server_data *msd = mds->mds_server_data; + struct lr_server_data *lsd = mds->mds_server_data; + struct lr_server_data *lsd_copy = NULL; struct file *filp = mds->mds_rcvd_filp; struct lvfs_run_ctxt saved; loff_t off = 0; int rc; ENTRY; - push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); - msd->msd_last_transno = cpu_to_le64(mds->mds_last_transno); - CDEBUG(D_SUPER, "MDS mount_count is "LPU64", last_transno 
is "LPU64"\n", mds->mds_mount_count, mds->mds_last_transno); - rc = fsfilt_write_record(obd, filp, msd, sizeof(*msd), &off,force_sync); + + lsd->lsd_last_transno = cpu_to_le64(mds->mds_last_transno); + + if (!(lsd->lsd_feature_incompat & cpu_to_le32(OBD_INCOMPAT_COMMON_LR))){ + /* Swap to the old mds_server_data format, in case + someone wants to revert to a pre-1.6 lustre */ + CDEBUG(D_CONFIG, "writing old last_rcvd format\n"); + /* malloc new struct instead of swap in-place because + we don't have a lock on the last_transno or mount count - + someone may modify it while we're here, and we don't want + them to inc the wrong thing. */ + OBD_ALLOC(lsd_copy, sizeof(*lsd_copy)); + if (!lsd_copy) + RETURN(-ENOMEM); + *lsd_copy = *lsd; + lsd_copy->lsd_unused = lsd->lsd_last_transno; + lsd_copy->lsd_last_transno = lsd->lsd_mount_count; + lsd = lsd_copy; + } + + push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + rc = fsfilt_write_record(obd, filp, lsd, sizeof(*lsd), &off,force_sync); + pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); if (rc) CERROR("error writing MDS server data: rc = %d\n", rc); - pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + + if (lsd_copy) + OBD_FREE(lsd_copy, sizeof(*lsd_copy)); RETURN(rc); } @@ -1768,6 +1790,7 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf) struct lprocfs_static_vars lvars; struct lustre_cfg* lcfg = buf; struct mds_obd *mds = &obd->u.mds; + struct lustre_mount_info *lmi; struct vfsmount *mnt; struct obd_uuid uuid; __u8 *uuid_ptr; @@ -1777,6 +1800,8 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf) int rc = 0; ENTRY; + /* setup 1:/dev/loop/0 2:ext3 3:mdsA 4:errors=remount-ro,iopen_nopriv */ + CLASSERT(offsetof(struct obd_device, u.obt) == offsetof(struct obd_device, u.mds.mds_obt)); @@ -1786,37 +1811,50 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf) if (LUSTRE_CFG_BUFLEN(lcfg, 1) == 0 || LUSTRE_CFG_BUFLEN(lcfg, 2) == 0) RETURN(rc = -EINVAL); - obd->obd_fsops = 
fsfilt_get_ops(lustre_cfg_string(lcfg, 2)); - if (IS_ERR(obd->obd_fsops)) - RETURN(rc = PTR_ERR(obd->obd_fsops)); - - page = __get_free_page(GFP_KERNEL); - if (!page) - RETURN(-ENOMEM); - - options = (char *)page; - memset(options, 0, PAGE_SIZE); - - /* here we use "iopen_nopriv" hardcoded, because it affects MDS utility - * and the rest of options are passed by mount options. Probably this - * should be moved to somewhere else like startup scripts or lconf. */ - strcpy(options, "iopen_nopriv"); - - if (LUSTRE_CFG_BUFLEN(lcfg, 4) > 0 && lustre_cfg_buf(lcfg, 4)) { - sprintf(options + strlen(options), ",%s", - lustre_cfg_string(lcfg, 4)); - fsoptions_to_mds_flags(mds, options); - } + lmi = server_get_mount(obd->obd_name); + if (lmi) { + /* We already mounted in lustre_fill_super. + lcfg bufs 1, 2, 4 (device, fstype, mount opts) are ignored.*/ + struct lustre_sb_info *lsi = s2lsi(lmi->lmi_sb); + mnt = lmi->lmi_mnt; + obd->obd_fsops = fsfilt_get_ops(MT_STR(lsi->lsi_ldd)); + } else { + /* old path - used by lctl */ + CERROR("Using old MDS mount method\n"); + page = __get_free_page(GFP_KERNEL); + if (!page) + RETURN(-ENOMEM); + + options = (char *)page; + memset(options, 0, PAGE_SIZE); + + /* here we use "iopen_nopriv" hardcoded, because it affects + * MDS utility and the rest of options are passed by mount + * options. Probably this should be moved to somewhere else + * like startup scripts or lconf. 
*/ + strcpy(options, "iopen_nopriv"); + + if (LUSTRE_CFG_BUFLEN(lcfg, 4) > 0 && lustre_cfg_buf(lcfg, 4)) { + sprintf(options + strlen(options), ",%s", + lustre_cfg_string(lcfg, 4)); + fsoptions_to_mds_flags(mds, options); + } + + mnt = do_kern_mount(lustre_cfg_string(lcfg, 2), 0, + lustre_cfg_string(lcfg, 1), + (void *)options); + free_page(page); + if (IS_ERR(mnt)) { + rc = PTR_ERR(mnt); + LCONSOLE_ERROR("Can't mount disk %s (%d)\n", + lustre_cfg_string(lcfg, 1), rc); + RETURN(rc); + } - mnt = do_kern_mount(lustre_cfg_string(lcfg, 2), 0, - lustre_cfg_string(lcfg, 1), (void *)options); - free_page(page); - if (IS_ERR(mnt)) { - rc = PTR_ERR(mnt); - LCONSOLE_ERROR("Can't mount disk %s (%d)\n", - lustre_cfg_string(lcfg, 1), rc); - GOTO(err_ops, rc); + obd->obd_fsops = fsfilt_get_ops(lustre_cfg_string(lcfg, 2)); } + if (IS_ERR(obd->obd_fsops)) + GOTO(err_put, rc = PTR_ERR(obd->obd_fsops)); CDEBUG(D_SUPER, "%s: mnt = %p\n", lustre_cfg_string(lcfg, 1), mnt); @@ -1833,7 +1871,7 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf) obd->obd_namespace = ldlm_namespace_new(ns_name, LDLM_NAMESPACE_SERVER); if (obd->obd_namespace == NULL) { mds_cleanup(obd); - GOTO(err_put, rc = -ENOMEM); + GOTO(err_ops, rc = -ENOMEM); } ldlm_register_intent(obd->obd_namespace, mds_intent_policy); @@ -1880,9 +1918,9 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf) /* Don't wait for mds_postrecov trying to clear orphans */ obd->obd_async_recov = 1; rc = mds_postsetup(obd); + obd->obd_async_recov = 0; if (rc) GOTO(err_qctxt, rc); - obd->obd_async_recov = 0; lprocfs_init_vars(mds, &lvars); lprocfs_obd_setup(obd, lvars.obd_vars); @@ -1895,7 +1933,7 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf) str = "no UUID"; } - label = fsfilt_label(obd, obd->u.obt.obt_sb); + label = fsfilt_get_label(obd, obd->u.obt.obt_sb); if (obd->obd_recovering) { LCONSOLE_WARN("MDT %s now serving %s (%s%s%s), but will be in " "recovery until %d %s 
reconnect, or if no clients" @@ -1932,13 +1970,18 @@ err_fs: err_ns: ldlm_namespace_free(obd->obd_namespace, 0); obd->obd_namespace = NULL; -err_put: - unlock_kernel(); - mntput(mds->mds_vfsmnt); - obd->u.obt.obt_sb = NULL; - lock_kernel(); err_ops: fsfilt_put_ops(obd->obd_fsops); +err_put: + if (lmi) { + server_put_mount(obd->obd_name, mds->mds_vfsmnt); + } else { + /* old method */ + unlock_kernel(); + mntput(mds->mds_vfsmnt); + lock_kernel(); + } + obd->u.obt.obt_sb = NULL; return rc; } @@ -1957,7 +2000,6 @@ static int mds_lov_clean(struct obd_device *obd) /* There better be a lov */ if (!osc) RETURN(0); - if (IS_ERR(osc)) RETURN(PTR_ERR(osc)); @@ -1992,33 +2034,12 @@ static int mds_postsetup(struct obd_device *obd) RETURN(rc); if (mds->mds_profile) { - struct lvfs_run_ctxt saved; struct lustre_profile *lprof; - struct config_llog_instance cfg; - - cfg.cfg_instance = NULL; - cfg.cfg_uuid = mds->mds_lov_uuid; - push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); - rc = class_config_parse_llog(llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT), - mds->mds_profile, &cfg); - pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); - switch (rc) { - case 0: - break; - case -EINVAL: - LCONSOLE_ERROR("%s: the profile %s could not be read. " - "If you recently installed a new " - "version of Lustre, you may need to " - "re-run 'lconf --write_conf " - ".xml' command line before " - "restarting the MDS.\n", - obd->obd_name, mds->mds_profile); - /* fall through */ - default: - GOTO(err_llog, rc); - break; - } - + /* The profile defines which osc and mdc to connect to, for a + client. We reuse that here to figure out the name of the + lov to use (and ignore lprof->lp_mdc). 
+ The profile was set in the config log with + LCFG_MOUNTOPT profilenm oscnm mdcnm */ lprof = class_get_profile(mds->mds_profile); if (lprof == NULL) { CERROR("No profile found: %s\n", mds->mds_profile); @@ -2033,7 +2054,6 @@ static int mds_postsetup(struct obd_device *obd) err_cleanup: mds_lov_clean(obd); -err_llog: llog_cleanup(llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT)); llog_cleanup(llog_get_context(obd, LLOG_LOVEA_ORIG_CTXT)); RETURN(rc); @@ -2050,11 +2070,12 @@ int mds_postrecov(struct obd_device *obd) LASSERT(!obd->obd_recovering); LASSERT(llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT) != NULL); + /* FIXME why not put this in the synchronize? */ /* set nextid first, so we are sure it happens */ rc = mds_lov_set_nextid(obd); if (rc) { - CERROR("%s: mds_lov_set_nextid failed\n", - obd->obd_name); + CERROR("%s: mds_lov_set_nextid failed %d\n", + obd->obd_name, rc); GOTO(out, rc); } @@ -2063,8 +2084,13 @@ int mds_postrecov(struct obd_device *obd) if (rc < 0) GOTO(out, rc); - /* Does anyone need this to be synchronous ever? */ - mds_lov_start_synchronize(obd, NULL, obd->obd_async_recov); + /* FIXME Does target_finish_recovery really need this to block? */ + /* Notify the LOV, which will in turn call mds_notify for each tgt */ + /* This means that we have to hack obd_notify to think we're obd_set_up + during mds_lov_connect. */ + obd_notify(obd->u.mds.mds_osc_obd, NULL, + obd->obd_async_recov ? 
OBD_NOTIFY_SYNC_NONBLOCK : + OBD_NOTIFY_SYNC, NULL); /* quota recovery */ lquota_recovery(quota_interface, obd); @@ -2115,6 +2141,7 @@ static int mds_cleanup(struct obd_device *obd) { struct mds_obd *mds = &obd->u.mds; lvfs_sbdev_type save_dev; + int must_put = 0; int must_relock = 0; ENTRY; @@ -2132,20 +2159,15 @@ static int mds_cleanup(struct obd_device *obd) lquota_cleanup(quota_interface, obd); mds_update_server_data(obd, 1); - if (mds->mds_lov_objids != NULL) { - OBD_FREE(mds->mds_lov_objids, - mds->mds_lov_desc.ld_tgt_count * sizeof(obd_id)); - } + if (mds->mds_lov_objids != NULL) + OBD_FREE(mds->mds_lov_objids, mds->mds_lov_objids_size); mds_fs_cleanup(obd); upcall_cache_cleanup(mds->mds_group_hash); mds->mds_group_hash = NULL; - /* 2 seems normal on mds, (may_umount() also expects 2 - fwiw), but we only see 1 at this point in obdfilter. */ - if (atomic_read(&obd->u.mds.mds_vfsmnt->mnt_count) > 2) - CERROR("%s: mount busy, mnt_count %d != 2\n", obd->obd_name, - atomic_read(&obd->u.mds.mds_vfsmnt->mnt_count)); + must_put = server_put_mount(obd->obd_name, mds->mds_vfsmnt); + /* must_put is for old method (l_p_m returns non-0 on err) */ /* We can only unlock kernel if we are in the context of sys_ioctl, otherwise we never called lock_kernel */ @@ -2153,8 +2175,10 @@ static int mds_cleanup(struct obd_device *obd) unlock_kernel(); must_relock++; } - - mntput(mds->mds_vfsmnt); + + if (must_put) + /* In case we didn't mount with lustre_get_mount -- old method*/ + mntput(mds->mds_vfsmnt); obd->u.obt.obt_sb = NULL; ldlm_namespace_free(obd->obd_namespace, obd->obd_force); diff --git a/lustre/mds/mds_fs.c b/lustre/mds/mds_fs.c index 8cd2dc9..2bae298 100644 --- a/lustre/mds/mds_fs.c +++ b/lustre/mds/mds_fs.c @@ -44,11 +44,11 @@ #include #include #include +#include #include #include "mds_internal.h" -#define HEALTH_CHECK "health_check" /* Add client data to the MDS. We use a bitmap to locate a free space * in the last_rcvd file if cl_off is -1 (i.e. a new client). 
@@ -100,8 +100,8 @@ int mds_client_add(struct obd_device *obd, struct mds_obd *mds, cl_idx, med->med_mcd->mcd_uuid); med->med_lr_idx = cl_idx; - med->med_lr_off = le32_to_cpu(mds->mds_server_data->msd_client_start) + - (cl_idx * le16_to_cpu(mds->mds_server_data->msd_client_size)); + med->med_lr_off = le32_to_cpu(mds->mds_server_data->lsd_client_start) + + (cl_idx * le16_to_cpu(mds->mds_server_data->lsd_client_size)); LASSERTF(med->med_lr_off > 0, "med_lr_off = %llu\n", med->med_lr_off); if (new_client) { @@ -209,7 +209,7 @@ static int mds_server_free_data(struct mds_obd *mds) static int mds_init_server_data(struct obd_device *obd, struct file *file) { struct mds_obd *mds = &obd->u.mds; - struct mds_server_data *msd; + struct lr_server_data *lsd; struct mds_client_data *mcd = NULL; loff_t off = 0; unsigned long last_rcvd_size = file->f_dentry->d_inode->i_size; @@ -218,87 +218,104 @@ static int mds_init_server_data(struct obd_device *obd, struct file *file) ENTRY; /* ensure padding in the struct is the correct size */ - LASSERT(offsetof(struct mds_server_data, msd_padding) + - sizeof(msd->msd_padding) == LR_SERVER_SIZE); + LASSERT(offsetof(struct lr_server_data, lsd_padding) + + sizeof(lsd->lsd_padding) == LR_SERVER_SIZE); LASSERT(offsetof(struct mds_client_data, mcd_padding) + sizeof(mcd->mcd_padding) == LR_CLIENT_SIZE); - OBD_ALLOC_WAIT(msd, sizeof(*msd)); - if (!msd) + OBD_ALLOC_WAIT(lsd, sizeof(*lsd)); + if (!lsd) RETURN(-ENOMEM); OBD_ALLOC_WAIT(mds->mds_client_bitmap, LR_MAX_CLIENTS / 8); if (!mds->mds_client_bitmap) { - OBD_FREE(msd, sizeof(*msd)); + OBD_FREE(lsd, sizeof(*lsd)); RETURN(-ENOMEM); } - mds->mds_server_data = msd; + mds->mds_server_data = lsd; if (last_rcvd_size == 0) { - CWARN("%s: initializing new %s\n", obd->obd_name, LAST_RCVD); - - memcpy(msd->msd_uuid, obd->obd_uuid.uuid,sizeof(msd->msd_uuid)); - msd->msd_last_transno = 0; - mount_count = msd->msd_mount_count = 0; - msd->msd_server_size = cpu_to_le32(LR_SERVER_SIZE); - msd->msd_client_start = 
cpu_to_le32(LR_CLIENT_START); - msd->msd_client_size = cpu_to_le16(LR_CLIENT_SIZE); - msd->msd_feature_rocompat = cpu_to_le32(OBD_ROCOMPAT_LOVOBJID); + LCONSOLE_WARN("%s: new disk, initializing\n", obd->obd_name); + + memcpy(lsd->lsd_uuid, obd->obd_uuid.uuid,sizeof(lsd->lsd_uuid)); + lsd->lsd_last_transno = 0; + mount_count = lsd->lsd_mount_count = 0; + lsd->lsd_server_size = cpu_to_le32(LR_SERVER_SIZE); + lsd->lsd_client_start = cpu_to_le32(LR_CLIENT_START); + lsd->lsd_client_size = cpu_to_le16(LR_CLIENT_SIZE); + lsd->lsd_feature_rocompat = cpu_to_le32(OBD_ROCOMPAT_LOVOBJID); + lsd->lsd_feature_incompat = cpu_to_le32(OBD_INCOMPAT_MDT | + OBD_INCOMPAT_COMMON_LR); } else { - rc = fsfilt_read_record(obd, file, msd, sizeof(*msd), &off); + rc = fsfilt_read_record(obd, file, lsd, sizeof(*lsd), &off); if (rc) { CERROR("error reading MDS %s: rc %d\n", LAST_RCVD, rc); GOTO(err_msd, rc); } - if (strcmp(msd->msd_uuid, obd->obd_uuid.uuid) != 0) { + if (strcmp(lsd->lsd_uuid, obd->obd_uuid.uuid) != 0) { LCONSOLE_ERROR("Trying to start OBD %s using the wrong" " disk %s. 
Were the /dev/ assignments " "rearranged?\n", - obd->obd_uuid.uuid, msd->msd_uuid); + obd->obd_uuid.uuid, lsd->lsd_uuid); GOTO(err_msd, rc = -EINVAL); } - mount_count = le64_to_cpu(msd->msd_mount_count); + mount_count = le64_to_cpu(lsd->lsd_mount_count); } - if (msd->msd_feature_incompat & ~cpu_to_le32(MDT_INCOMPAT_SUPP)) { + + if (lsd->lsd_feature_incompat & ~cpu_to_le32(MDT_INCOMPAT_SUPP)) { CERROR("%s: unsupported incompat filesystem feature(s) %x\n", - obd->obd_name, le32_to_cpu(msd->msd_feature_incompat) & + obd->obd_name, le32_to_cpu(lsd->lsd_feature_incompat) & ~MDT_INCOMPAT_SUPP); GOTO(err_msd, rc = -EINVAL); } - - if (msd->msd_feature_rocompat & ~cpu_to_le32(MDT_ROCOMPAT_SUPP)) { + if (lsd->lsd_feature_rocompat & ~cpu_to_le32(MDT_ROCOMPAT_SUPP)) { CERROR("%s: unsupported read-only filesystem feature(s) %x\n", - obd->obd_name, le32_to_cpu(msd->msd_feature_rocompat) & + obd->obd_name, le32_to_cpu(lsd->lsd_feature_rocompat) & ~MDT_ROCOMPAT_SUPP); /* Do something like remount filesystem read-only */ GOTO(err_msd, rc = -EINVAL); } + if (!(lsd->lsd_feature_incompat & cpu_to_le32(OBD_INCOMPAT_COMMON_LR))){ + CDEBUG(D_WARNING, "using old last_rcvd format\n"); + lsd->lsd_mount_count = lsd->lsd_last_transno; + lsd->lsd_last_transno = lsd->lsd_unused; + /* If we update the last_rcvd, we can never go back to + an old install, so leave this in the old format for now. 
+ lsd->lsd_feature_incompat |= cpu_to_le32(LR_INCOMPAT_COMMON_LR); + */ + } + lsd->lsd_feature_compat = cpu_to_le32(OBD_COMPAT_MDT); + + mds->mds_last_transno = le64_to_cpu(lsd->lsd_last_transno); - mds->mds_last_transno = le64_to_cpu(msd->msd_last_transno); - - msd->msd_feature_compat = cpu_to_le32(OBD_COMPAT_MDT); CDEBUG(D_INODE, "%s: server last_transno: "LPU64"\n", obd->obd_name, mds->mds_last_transno); CDEBUG(D_INODE, "%s: server mount_count: "LPU64"\n", obd->obd_name, mount_count + 1); CDEBUG(D_INODE, "%s: server data size: %u\n", - obd->obd_name, le32_to_cpu(msd->msd_server_size)); + obd->obd_name, le32_to_cpu(lsd->lsd_server_size)); CDEBUG(D_INODE, "%s: per-client data start: %u\n", - obd->obd_name, le32_to_cpu(msd->msd_client_start)); + obd->obd_name, le32_to_cpu(lsd->lsd_client_start)); CDEBUG(D_INODE, "%s: per-client data size: %u\n", - obd->obd_name, le32_to_cpu(msd->msd_client_size)); + obd->obd_name, le32_to_cpu(lsd->lsd_client_size)); CDEBUG(D_INODE, "%s: last_rcvd size: %lu\n", obd->obd_name, last_rcvd_size); CDEBUG(D_INODE, "%s: last_rcvd clients: %lu\n", obd->obd_name, - last_rcvd_size <= le32_to_cpu(msd->msd_client_start) ? 0 : - (last_rcvd_size - le32_to_cpu(msd->msd_client_start)) / - le16_to_cpu(msd->msd_client_size)); + last_rcvd_size <= le32_to_cpu(lsd->lsd_client_start) ? 0 : + (last_rcvd_size - le32_to_cpu(lsd->lsd_client_start)) / + le16_to_cpu(lsd->lsd_client_size)); + + if (!lsd->lsd_server_size || !lsd->lsd_client_start || + !lsd->lsd_client_size) { + CERROR("Bad last_rcvd contents!\n"); + GOTO(err_msd, rc = -EINVAL); + } /* When we do a clean MDS shutdown, we save the last_transno into * the header. If we find clients with higher last_transno values * then those clients may need recovery done. 
*/ - for (cl_idx = 0, off = le32_to_cpu(msd->msd_client_start); + for (cl_idx = 0, off = le32_to_cpu(lsd->lsd_client_start); off < last_rcvd_size; cl_idx++) { __u64 last_transno; struct obd_export *exp; @@ -312,9 +329,9 @@ static int mds_init_server_data(struct obd_device *obd, struct file *file) /* Don't assume off is incremented properly by * fsfilt_read_record(), in case sizeof(*mcd) - * isn't the same as msd->msd_client_size. */ - off = le32_to_cpu(msd->msd_client_start) + - cl_idx * le16_to_cpu(msd->msd_client_size); + * isn't the same as lsd->lsd_client_size. */ + off = le32_to_cpu(lsd->lsd_client_start) + + cl_idx * le16_to_cpu(lsd->lsd_client_size); rc = fsfilt_read_record(obd, file, mcd, sizeof(*mcd), &off); if (rc) { CERROR("error reading MDS %s idx %d, off %llu: rc %d\n", @@ -335,7 +352,7 @@ static int mds_init_server_data(struct obd_device *obd, struct file *file) */ CDEBUG(D_HA, "RCVRNG CLIENT uuid: %s idx: %d lr: "LPU64 " srv lr: "LPU64" lx: "LPU64"\n", mcd->mcd_uuid, cl_idx, - last_transno, le64_to_cpu(msd->msd_last_transno), + last_transno, le64_to_cpu(lsd->lsd_last_transno), le64_to_cpu(mcd->mcd_last_xid)); exp = class_new_export(obd, (struct obd_uuid *)mcd->mcd_uuid); @@ -380,7 +397,7 @@ static int mds_init_server_data(struct obd_device *obd, struct file *file) } mds->mds_mount_count = mount_count + 1; - msd->msd_mount_count = cpu_to_le64(mds->mds_mount_count); + lsd->lsd_mount_count = cpu_to_le64(mds->mds_mount_count); /* save it, so mount count and last_transno is current */ rc = mds_update_server_data(obd, 1); @@ -410,6 +427,7 @@ int mds_fs_setup(struct obd_device *obd, struct vfsmount *mnt) RETURN(rc); mds->mds_vfsmnt = mnt; + /* why not mnt->mnt_sb instead of mnt->mnt_root->d_inode->i_sb? 
*/ obd->u.obt.obt_sb = mnt->mnt_root->d_inode->i_sb; fsfilt_setup(obd, obd->u.obt.obt_sb); @@ -458,13 +476,16 @@ int mds_fs_setup(struct obd_device *obd, struct vfsmount *mnt) } mds->mds_pending_dir = dentry; - dentry = simple_mkdir(current->fs->pwd, "LOGS", 0777, 1); + /* COMPAT_146 */ + dentry = simple_mkdir(current->fs->pwd, MDT_LOGS_DIR, 0777, 1); if (IS_ERR(dentry)) { rc = PTR_ERR(dentry); - CERROR("cannot create LOGS directory: rc = %d\n", rc); + CERROR("cannot create %s directory: rc = %d\n", + MDT_LOGS_DIR, rc); GOTO(err_pending, rc); } mds->mds_logs_dir = dentry; + /* end COMPAT_146 */ dentry = simple_mkdir(current->fs->pwd, "OBJECTS", 0777, 1); if (IS_ERR(dentry)) { @@ -560,8 +581,8 @@ int mds_fs_cleanup(struct obd_device *obd) int rc = 0; if (obd->obd_fail) - CWARN("%s: shutting down for failover; client state will " - "be preserved.\n", obd->obd_name); + LCONSOLE_WARN("%s: shutting down for failover; client state " + "will be preserved.\n", obd->obd_name); class_disconnect_exports(obd); /* cleans up client info too */ mds_server_free_data(mds); diff --git a/lustre/mds/mds_internal.h b/lustre/mds/mds_internal.h index 78f6768..d90664a 100644 --- a/lustre/mds/mds_internal.h +++ b/lustre/mds/mds_internal.h @@ -5,34 +5,11 @@ #ifndef _MDS_INTERNAL_H #define _MDS_INTERNAL_H -#include +#include #include #define MDT_ROCOMPAT_SUPP (OBD_ROCOMPAT_LOVOBJID) - -#define MDT_INCOMPAT_SUPP (OBD_INCOMPAT_MDT) - -/* Data stored per server at the head of the last_rcvd file. In le32 order. - * Try to keep this the same as fsd_server_data so we might one day merge. 
*/ -struct mds_server_data { - __u8 msd_uuid[40]; /* server UUID */ - __u64 msd_last_transno; /* last completed transaction ID */ - __u64 msd_mount_count; /* MDS incarnation number */ - __u64 msd_mount_count_new; /* future MDS incarnation number */ - __u32 msd_feature_compat; /* compatible feature flags */ - __u32 msd_feature_rocompat;/* read-only compatible feature flags */ - __u32 msd_feature_incompat;/* incompatible feature flags */ - __u32 msd_server_size; /* size of server data area */ - __u32 msd_client_start; /* start of per-client data area */ - __u16 msd_client_size; /* size of per-client data area */ - __u16 msd_subdir_count; /* number of subdirectories for objects */ - __u64 msd_catalog_oid; /* recovery catalog object id */ - __u32 msd_catalog_ogen; /* recovery catalog inode generation */ - __u8 msd_peeruuid[40]; /* UUID of LOV/OSC associated with MDS */ - __u32 msd_ost_index; /* index number of OST in LOV */ - __u32 msd_mds_index; /* index number of MDS in LMV */ - __u8 msd_padding[LR_SERVER_SIZE - 148]; -}; +#define MDT_INCOMPAT_SUPP (OBD_INCOMPAT_MDT | OBD_INCOMPAT_COMMON_LR) /* Data stored per client in the last_rcvd file. In le32 order. 
*/ struct mds_client_data { @@ -210,11 +187,12 @@ int mds_lov_write_objids(struct obd_device *obd); void mds_lov_update_objids(struct obd_device *obd, obd_id *ids); int mds_lov_clear_orphans(struct mds_obd *mds, struct obd_uuid *ost_uuid); int mds_lov_set_nextid(struct obd_device *obd); -int mds_lov_start_synchronize(struct obd_device *obd, struct obd_uuid *uuid, - int nonblock); +int mds_lov_start_synchronize(struct obd_device *obd, + struct obd_device *watched, + void *data, int nonblock); int mds_post_mds_lovconf(struct obd_device *obd); int mds_notify(struct obd_device *obd, struct obd_device *watched, - enum obd_notify_event ev); + enum obd_notify_event ev, void *data); int mds_convert_lov_ea(struct obd_device *obd, struct inode *inode, struct lov_mds_md *lmm, int lmm_size); void mds_objids_from_lmm(obd_id *ids, struct lov_mds_md *lmm, diff --git a/lustre/mds/mds_log.c b/lustre/mds/mds_log.c index a76be7d..43a63e3 100644 --- a/lustre/mds/mds_log.c +++ b/lustre/mds/mds_log.c @@ -83,7 +83,7 @@ static int mds_llog_repl_cancel(struct llog_ctxt *ctxt, struct lov_stripe_md *ls ENTRY; lctxt = llog_get_context(lov_obd, ctxt->loc_idx); - rc = llog_cancel(lctxt, lsm, count, cookies,flags); + rc = llog_cancel(lctxt, lsm, count, cookies, flags); RETURN(rc); } diff --git a/lustre/mds/mds_lov.c b/lustre/mds/mds_lov.c index 4135e9b..0f95347 100644 --- a/lustre/mds/mds_lov.c +++ b/lustre/mds/mds_lov.c @@ -50,8 +50,10 @@ void mds_lov_update_objids(struct obd_device *obd, obd_id *ids) lock_kernel(); for (i = 0; i < mds->mds_lov_desc.ld_tgt_count; i++) - if (ids[i] > (mds->mds_lov_objids)[i]) + if (ids[i] > (mds->mds_lov_objids)[i]) { (mds->mds_lov_objids)[i] = ids[i]; + mds->mds_lov_objids_dirty = 1; + } unlock_kernel(); EXIT; } @@ -61,47 +63,67 @@ static int mds_lov_read_objids(struct obd_device *obd) struct mds_obd *mds = &obd->u.mds; obd_id *ids; loff_t off = 0; - int i, rc, size = mds->mds_lov_desc.ld_tgt_count * sizeof(*ids); + int i, rc, size; ENTRY; - if 
(mds->mds_lov_objids != NULL) + LASSERT(!mds->mds_lov_objids_size); + LASSERT(!mds->mds_lov_objids_dirty); + + /* Read everything in the file, even if our current lov desc + has fewer targets. Old targets not in the lov descriptor + during mds setup may still have valid objids. */ + size = mds->mds_lov_objid_filp->f_dentry->d_inode->i_size; + if (size == 0) RETURN(0); OBD_ALLOC(ids, size); if (ids == NULL) RETURN(-ENOMEM); mds->mds_lov_objids = ids; + mds->mds_lov_objids_size = size; - if (mds->mds_lov_objid_filp->f_dentry->d_inode->i_size == 0) - RETURN(0); rc = fsfilt_read_record(obd, mds->mds_lov_objid_filp, ids, size, &off); if (rc < 0) { CERROR("Error reading objids %d\n", rc); - } else { - mds->mds_lov_objids_valid = 1; - rc = 0; + RETURN(rc); } - - for (i = 0; i < mds->mds_lov_desc.ld_tgt_count; i++) + + mds->mds_lov_objids_in_file = size / sizeof(*ids); + + for (i = 0; i < mds->mds_lov_objids_in_file; i++) { CDEBUG(D_INFO, "read last object "LPU64" for idx %d\n", mds->mds_lov_objids[i], i); - - RETURN(rc); + } + RETURN(0); } int mds_lov_write_objids(struct obd_device *obd) { struct mds_obd *mds = &obd->u.mds; loff_t off = 0; - int i, rc, size = mds->mds_lov_desc.ld_tgt_count * sizeof(obd_id); + int i, rc, tgts; ENTRY; - for (i = 0; i < mds->mds_lov_desc.ld_tgt_count; i++) + if (!mds->mds_lov_objids_dirty) + RETURN(0); + + tgts = max(mds->mds_lov_desc.ld_tgt_count, mds->mds_lov_objids_in_file); + + if (!tgts) + RETURN(0); + + for (i = 0; i < tgts; i++) CDEBUG(D_INFO, "writing last object "LPU64" for idx %d\n", mds->mds_lov_objids[i], i); rc = fsfilt_write_record(obd, mds->mds_lov_objid_filp, - mds->mds_lov_objids, size, &off, 0); + mds->mds_lov_objids, tgts * sizeof(obd_id), + &off, 0); + if (rc >= 0) { + mds->mds_lov_objids_dirty = 0; + rc = 0; + } + RETURN(rc); } @@ -141,41 +163,146 @@ int mds_lov_set_nextid(struct obd_device *obd) LASSERT(mds->mds_lov_objids != NULL); - rc = obd_set_info_async(mds->mds_osc_exp, strlen("next_id"), "next_id", + rc = 
obd_set_info_async(mds->mds_osc_exp, strlen(KEY_NEXT_ID), + KEY_NEXT_ID, mds->mds_lov_desc.ld_tgt_count, mds->mds_lov_objids, NULL); + + if (rc) + CERROR ("%s: mds_lov_set_nextid failed (%d)\n", + obd->obd_name, rc); + RETURN(rc); } -int mds_init_lov_desc(struct obd_device *obd, struct obd_export *osc_exp) +/* Update the lov desc for a new size lov. */ +static int mds_lov_update_desc(struct obd_device *obd, struct obd_export *lov) { struct mds_obd *mds = &obd->u.mds; - int valsize, rc, tgt_count; - __u32 stripes; + struct lov_desc *ld; + __u32 size, stripes, valsize = sizeof(mds->mds_lov_desc); + int rc = 0; ENTRY; - mds->mds_has_lov_desc = 0; - valsize = sizeof(mds->mds_lov_desc); - rc = obd_get_info(mds->mds_osc_exp, strlen("lovdesc") + 1, - "lovdesc", &valsize, &mds->mds_lov_desc); - if (rc) { - CERROR("can't get lov_desc, rc %d\n", rc); - RETURN(rc); + OBD_ALLOC(ld, sizeof(*ld)); + if (!ld) + RETURN(-ENOMEM); + + rc = obd_get_info(lov, strlen(KEY_LOVDESC) + 1, KEY_LOVDESC, + &valsize, ld); + if (rc) + GOTO(out, rc); + + /* The size of the LOV target table may have increased. 
*/ + size = ld->ld_tgt_count * sizeof(obd_id); + if ((mds->mds_lov_objids_size == 0) || + (size > mds->mds_lov_objids_size)) { + obd_id *ids; + + /* add room by powers of 2 */ + size = 1; + while (size < ld->ld_tgt_count) + size = size << 1; + size = size * sizeof(obd_id); + + OBD_ALLOC(ids, size); + if (ids == NULL) + GOTO(out, rc = -ENOMEM); + memset(ids, 0, size); + if (mds->mds_lov_objids_size) { + obd_id *old_ids = mds->mds_lov_objids; + memcpy(ids, mds->mds_lov_objids, + mds->mds_lov_objids_size); + mds->mds_lov_objids = ids; + OBD_FREE(old_ids, mds->mds_lov_objids_size); + } + mds->mds_lov_objids = ids; + mds->mds_lov_objids_size = size; } - mds->mds_has_lov_desc = 1; - tgt_count = mds->mds_lov_desc.ld_tgt_count; - stripes = min(tgt_count, LOV_MAX_STRIPE_COUNT); + /* Don't change the mds_lov_desc until the objids size matches the + count (paranoia) */ + mds->mds_lov_desc = *ld; + CDEBUG(D_CONFIG, "updated lov_desc, tgt_count: %d\n", + mds->mds_lov_desc.ld_tgt_count); + stripes = min((__u32)LOV_MAX_STRIPE_COUNT, + max(mds->mds_lov_desc.ld_tgt_count, + mds->mds_lov_objids_in_file)); mds->mds_max_mdsize = lov_mds_md_size(stripes); mds->mds_max_cookiesize = stripes * sizeof(struct llog_cookie); + CDEBUG(D_CONFIG, "updated max_mdsize/max_cookiesize: %d/%d\n", + mds->mds_max_mdsize, mds->mds_max_cookiesize); + +out: + OBD_FREE(ld, sizeof(*ld)); + RETURN(rc); +} - CDEBUG(D_HA, "updated lov_desc, tgt_count: %d\n", tgt_count); - CDEBUG(D_HA, "updating max_mdsize/max_cookiesize: %d/%d\n", - mds->mds_max_mdsize, mds->mds_max_cookiesize); +#define MDSLOV_NO_INDEX -1 - RETURN(0); +/* Inform MDS about new/updated target */ +static int mds_lov_update_mds(struct obd_device *obd, + struct obd_device *watched, + __u32 idx) +{ + struct mds_obd *mds = &obd->u.mds; + int old_count; + int rc = 0; + ENTRY; + + old_count = mds->mds_lov_desc.ld_tgt_count; + rc = mds_lov_update_desc(obd, mds->mds_osc_exp); + if (rc) + RETURN(rc); + + CDEBUG(D_CONFIG, "idx=%d, recov=%d/%d, 
cnt=%d/%d\n", + idx, obd->obd_recovering, obd->obd_async_recov, old_count, + mds->mds_lov_desc.ld_tgt_count); + + /* idx is set as data from lov_notify. */ + if (idx != MDSLOV_NO_INDEX && !obd->obd_recovering) { + if (idx >= mds->mds_lov_desc.ld_tgt_count) { + CERROR("index %d > count %d!\n", idx, + mds->mds_lov_desc.ld_tgt_count); + RETURN(-EINVAL); + } + + if (idx >= mds->mds_lov_objids_in_file) { + /* We never read this lastid; ask the osc */ + obd_id lastid; + __u32 size = sizeof(lastid); + rc = obd_get_info(watched->obd_self_export, + strlen("last_id"), + "last_id", &size, &lastid); + if (rc) + RETURN(rc); + mds->mds_lov_objids[idx] = lastid; + mds->mds_lov_objids_dirty = 1; + mds_lov_write_objids(obd); + } else { + /* We have read this lastid from disk; tell the osc. + Don't call this during recovery. */ + rc = mds_lov_set_nextid(obd); + } + + CDEBUG(D_CONFIG, "last object "LPU64" from OST %d\n", + mds->mds_lov_objids[idx], idx); + } + + /* If we added a target we have to reconnect the llogs */ + /* Only do this at first add (idx), or the first time after recovery */ + if (idx != MDSLOV_NO_INDEX || 1/*FIXME*/) { + CDEBUG(D_CONFIG, "reset llogs idx=%d\n", idx); + /* These two must be atomic */ + down(&mds->mds_orphan_recovery_sem); + obd_llog_finish(obd, old_count); + llog_cat_initialize(obd, mds->mds_lov_desc.ld_tgt_count); + up(&mds->mds_orphan_recovery_sem); + } + + RETURN(rc); } /* update the LOV-OSC knowledge of the last used object id's */ @@ -223,17 +350,17 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name) GOTO(err_discon, rc); } - /* init lov_desc + easize */ - rc = mds_init_lov_desc(obd, mds->mds_osc_exp); - if (rc) - GOTO(err_reg, rc); - rc = mds_lov_read_objids(obd); if (rc) { CERROR("cannot read %s: rc = %d\n", "lov_objids", rc); GOTO(err_reg, rc); } + rc = mds_lov_update_desc(obd, mds->mds_osc_exp); + if (rc) + GOTO(err_reg, rc); + + /* tgt_count may be 0! 
*/ rc = llog_cat_initialize(obd, mds->mds_lov_desc.ld_tgt_count); if (rc) { CERROR("failed to initialize catalog %d\n", rc); @@ -242,7 +369,7 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name) /* If we're mounting this code for the first time on an existing FS, * we need to populate the objids array from the real OST values */ - if (!mds->mds_lov_objids_valid) { + if (mds->mds_lov_desc.ld_tgt_count > mds->mds_lov_objids_in_file) { int size = sizeof(obd_id) * mds->mds_lov_desc.ld_tgt_count; rc = obd_get_info(mds->mds_osc_exp, strlen("last_id"), "last_id", &size, mds->mds_lov_objids); @@ -250,7 +377,7 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name) for (i = 0; i < mds->mds_lov_desc.ld_tgt_count; i++) CWARN("got last object "LPU64" from OST %d\n", mds->mds_lov_objids[i], i); - mds->mds_lov_objids_valid = 1; + mds->mds_lov_objids_dirty = 1; rc = mds_lov_write_objids(obd); if (rc) CERROR("got last objids from OSTs, but error " @@ -461,8 +588,9 @@ int mds_iocontrol(unsigned int cmd, struct obd_export *exp, int len, rc = llog_ioctl(ctxt, cmd, data); pop_ctxt(&saved, &ctxt->loc_exp->exp_obd->obd_lvfs_ctxt, NULL); llog_cat_initialize(obd, mds->mds_lov_desc.ld_tgt_count); - rc2 = obd_set_info_async(mds->mds_osc_exp, strlen("mds_conn"), - "mds_conn", 0, NULL, NULL); + rc2 = obd_set_info_async(mds->mds_osc_exp, + strlen(KEY_MDS_CONN), KEY_MDS_CONN, + 0, NULL, NULL); if (!rc) rc = rc2; RETURN(rc); @@ -493,33 +621,47 @@ int mds_iocontrol(unsigned int cmd, struct obd_export *exp, int len, } struct mds_lov_sync_info { - struct obd_device *mlsi_obd; /* the lov device to sync */ - struct obd_uuid *mlsi_uuid; /* target to sync */ + struct obd_device *mlsi_obd; /* the lov device to sync */ + struct obd_device *mlsi_watched; /* target osc */ + __u32 mlsi_index; /* index of target */ }; -static int __mds_lov_syncronize(void *data) +/* We only sync one osc at a time, so that we don't have to hold + any kind of lock on the whole mds_lov_desc, which may change 
+ (grow) as a result of mds_lov_add_ost. This also avoids any + kind of mismatch between the lov_desc and the mds_lov_desc, + which are not in lock-step during lov_add_obd */ +static int __mds_lov_synchronize(void *data) { struct mds_lov_sync_info *mlsi = data; - struct obd_device *obd; + struct obd_device *obd = mlsi->mlsi_obd; + struct obd_device *watched = mlsi->mlsi_watched; + struct mds_obd *mds = &obd->u.mds; struct obd_uuid *uuid; + __u32 idx = mlsi->mlsi_index; int rc = 0; ENTRY; - obd = mlsi->mlsi_obd; - uuid = mlsi->mlsi_uuid; - OBD_FREE(mlsi, sizeof(*mlsi)); - LASSERT(obd != NULL); + LASSERT(obd); + LASSERT(watched); + uuid = &watched->u.cli.cl_target_uuid; + LASSERT(uuid); - rc = obd_set_info_async(obd->u.mds.mds_osc_exp, strlen("mds_conn"), - "mds_conn", 0, uuid, NULL); + rc = mds_lov_update_mds(obd, watched, idx); + if (rc != 0) + GOTO(out, rc); + + rc = obd_set_info_async(mds->mds_osc_exp, strlen(KEY_MDS_CONN), + KEY_MDS_CONN, 0, uuid, NULL); if (rc != 0) GOTO(out, rc); rc = llog_connect(llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT), - obd->u.mds.mds_lov_desc.ld_tgt_count, + mds->mds_lov_desc.ld_tgt_count, NULL, NULL, uuid); + if (rc != 0) { CERROR("%s: failed at llog_origin_connect: %d\n", obd->obd_name, rc); @@ -527,50 +669,60 @@ static int __mds_lov_syncronize(void *data) } LCONSOLE_INFO("MDS %s: %s now active, resetting orphans\n", - obd->obd_name, uuid ? 
(char *)uuid->uuid : "All OSCs"); + obd->obd_name, obd_uuid2str(uuid)); if (obd->obd_stopping) GOTO(out, rc = -ENODEV); - rc = mds_lov_clear_orphans(&obd->u.mds, uuid); + rc = mds_lov_clear_orphans(mds, uuid); if (rc != 0) { CERROR("%s: failed at mds_lov_clear_orphans: %d\n", obd->obd_name, rc); GOTO(out, rc); } - EXIT; out: class_decref(obd); - return rc; + RETURN(rc); } int mds_lov_synchronize(void *data) { - ptlrpc_daemonize("mds_lov_sync"); + struct mds_lov_sync_info *mlsi = data; + char name[20]; - return (__mds_lov_syncronize(data)); + sprintf(name, "ll_mlov_sync_%02u", mlsi->mlsi_index); + ptlrpc_daemonize(name); + + RETURN(__mds_lov_synchronize(data)); } -int mds_lov_start_synchronize(struct obd_device *obd, struct obd_uuid *uuid, - int nonblock) +int mds_lov_start_synchronize(struct obd_device *obd, + struct obd_device *watched, + void *data, int nonblock) { struct mds_lov_sync_info *mlsi; int rc; ENTRY; + LASSERT(watched); + OBD_ALLOC(mlsi, sizeof(*mlsi)); if (mlsi == NULL) RETURN(-ENOMEM); mlsi->mlsi_obd = obd; - mlsi->mlsi_uuid = uuid; + mlsi->mlsi_watched = watched; + if (data) + mlsi->mlsi_index = *(__u32 *)data; + else + mlsi->mlsi_index = MDSLOV_NO_INDEX; /* Although class_export_get(obd->obd_self_export) would lock the MDS in place, since it's only a self-export it doesn't lock the LOV in place. The LOV can be disconnected - during MDS precleanup, leaving nothing for __mds_lov_syncronize. + during MDS precleanup, leaving nothing for __mds_lov_synchronize. Simply taking an export ref on the LOV doesn't help, because it's still disconnected. Taking an obd reference insures that we don't disconnect the LOV. 
This of course means a cleanup won't @@ -578,61 +730,67 @@ int mds_lov_start_synchronize(struct obd_device *obd, struct obd_uuid *uuid, class_incref(obd); if (nonblock) { - /* Syncronize in the background */ - rc = kernel_thread(mds_lov_synchronize, mlsi, CLONE_VM | CLONE_FILES); + /* Synchronize in the background */ + rc = cfs_kernel_thread(mds_lov_synchronize, mlsi, + CLONE_VM | CLONE_FILES); if (rc < 0) { CERROR("%s: error starting mds_lov_synchronize: %d\n", obd->obd_name, rc); class_decref(obd); } else { - CDEBUG(D_HA, "%s: mds_lov_synchronize thread: %d\n", - obd->obd_name, rc); + CDEBUG(D_HA, "%s: mds_lov_synchronize idx=%d " + "thread=%d\n", obd->obd_name, + mlsi->mlsi_index, rc); rc = 0; } } else { - rc = __mds_lov_syncronize((void *)mlsi); + rc = __mds_lov_synchronize((void *)mlsi); } RETURN(rc); } int mds_notify(struct obd_device *obd, struct obd_device *watched, - enum obd_notify_event ev) + enum obd_notify_event ev, void *data) { - struct mds_obd *mds = &obd->u.mds; - struct obd_uuid *uuid; int rc = 0; ENTRY; - if (ev != OBD_NOTIFY_ACTIVE) + switch (ev) { + /* We only handle these: */ + case OBD_NOTIFY_ACTIVE: + case OBD_NOTIFY_SYNC: + case OBD_NOTIFY_SYNC_NONBLOCK: + break; + default: RETURN(0); + } + + CDEBUG(D_CONFIG, "notify %s ev=%d\n", watched->obd_name, ev); - if (strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME)) { + if (strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME) != 0) { CERROR("unexpected notification of %s %s!\n", watched->obd_type->typ_name, watched->obd_name); RETURN(-EINVAL); } - uuid = &watched->u.cli.cl_target_uuid; if (obd->obd_recovering) { - /* in the case OBD is in recovery we do not reinit desc and - * easize, as that will be done in mds_lov_connect() after - * recovery is finished. 
*/ CWARN("MDS %s: in recovery, not resetting orphans on %s\n", - obd->obd_name, uuid->uuid); - } else { - LASSERT(llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT) != NULL); - - /* this may be called also in case of adding new OST, thus, we - * have to update MDS lov_desc and re-init MDS easize. The same - * should be done on clients. */ - rc = mds_init_lov_desc(obd, mds->mds_osc_exp); - if (rc) - RETURN(rc); - - rc = mds_lov_start_synchronize(obd, uuid, 1); - lquota_recovery(quota_interface, obd); + obd->obd_name, + obd_uuid2str(&watched->u.cli.cl_target_uuid)); + /* We still have to fix the lov descriptor for ost's added + after the mdt in the config log. They didn't make it into + mds_lov_connect. */ + rc = mds_lov_update_desc(obd, obd->u.mds.mds_osc_exp); + RETURN(rc); } + + LASSERT(llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT) != NULL); + rc = mds_lov_start_synchronize(obd, watched, data, + !(ev == OBD_NOTIFY_SYNC)); + + lquota_recovery(quota_interface, obd); + RETURN(rc); } diff --git a/lustre/mds/mds_open.c b/lustre/mds/mds_open.c index f2c8d1b..585cbb0 100644 --- a/lustre/mds/mds_open.c +++ b/lustre/mds/mds_open.c @@ -314,7 +314,7 @@ static int mds_create_objects(struct ptlrpc_request *req, int offset, struct obd_trans_info oti = { 0 }; struct lov_stripe_md *lsm = NULL; struct lov_mds_md *lmm = NULL; - int rc, lmm_bufsize, lmm_size; + int rc, lmm_size; struct mds_body *body; struct obdo *oa; void *lmm_buf; @@ -359,15 +359,13 @@ static int mds_create_objects(struct ptlrpc_request *req, int offset, mds_objids_from_lmm(*ids, lmm, &mds->mds_lov_desc); - lmm_buf = lustre_msg_buf(req->rq_repmsg, offset, 0); - lmm_bufsize = req->rq_repmsg->buflens[offset]; + rc = fsfilt_set_md(obd, inode, *handle, lmm, lmm_size, "lov"); + lmm_buf = lustre_msg_buf(req->rq_repmsg, offset, lmm_size); LASSERT(lmm_buf); - LASSERT(lmm_bufsize >= lmm_size); memcpy(lmm_buf, lmm, lmm_size); - rc = fsfilt_set_md(obd, inode, *handle, lmm, lmm_size, "lov"); if (rc) CERROR("open replay failed to 
set md:%d\n", rc); - RETURN(0); + RETURN(rc); } if (OBD_FAIL_CHECK_ONCE(OBD_FAIL_MDS_ALLOC_OBDO)) @@ -478,11 +476,8 @@ static int mds_create_objects(struct ptlrpc_request *req, int offset, } rc = fsfilt_set_md(obd, inode, *handle, lmm, lmm_size, "lov"); - lmm_buf = lustre_msg_buf(req->rq_repmsg, offset, 0); - lmm_bufsize = req->rq_repmsg->buflens[offset]; + lmm_buf = lustre_msg_buf(req->rq_repmsg, offset, lmm_size); LASSERT(lmm_buf); - LASSERT(lmm_bufsize >= lmm_size); - memcpy(lmm_buf, lmm, lmm_size); obd_free_diskmd(mds->mds_osc_exp, &lmm); out_oa: diff --git a/lustre/mgc/.cvsignore b/lustre/mgc/.cvsignore new file mode 100644 index 0000000..d5103fa --- /dev/null +++ b/lustre/mgc/.cvsignore @@ -0,0 +1,15 @@ +.Xrefs +config.log +config.status +configure +Makefile +.deps +TAGS +.*.cmd +autoMakefile.in +autoMakefile +*.ko +*.mod.c +.*.o.flags +.tmp_versions +.depend diff --git a/lustre/mgc/Makefile.in b/lustre/mgc/Makefile.in new file mode 100644 index 0000000..2d7cad5 --- /dev/null +++ b/lustre/mgc/Makefile.in @@ -0,0 +1,4 @@ +MODULES := mgc +mgc-objs := mgc_request.o + +@INCLUDE_RULES@ diff --git a/lustre/mgc/autoMakefile.am b/lustre/mgc/autoMakefile.am new file mode 100644 index 0000000..2b3a807 --- /dev/null +++ b/lustre/mgc/autoMakefile.am @@ -0,0 +1,11 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +if MODULES +modulefs_DATA = mgc$(KMODEXT) +endif + +MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ +DIST_SOURCES := $(mgc-objs:%.o=%.c) diff --git a/lustre/mgc/mgc_request.c b/lustre/mgc/mgc_request.c new file mode 100644 index 0000000..0b419ec --- /dev/null +++ b/lustre/mgc/mgc_request.c @@ -0,0 +1,1118 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lustre/mgc/mgc_request.c + * Lustre Management Client + * + * Copyright (C) 2006 Cluster File Systems, Inc. 
+ * Author: Nathan Rutman + * + * This file is part of Lustre, http://www.lustre.org + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif +#define DEBUG_SUBSYSTEM S_MGC +#define D_MGC D_CONFIG /*|D_WARNING*/ + +#ifdef __KERNEL__ +# include +# include +# include +# include +#else +# include +#endif + +#include +#include +#include +#include +#include + + +int mgc_logname2resid(char *logname, struct ldlm_res_id *res_id) +{ + char *name_end; + int len; + __u64 resname = 0; + + /* fsname is at most 8 chars long at the beginning of the logname + e.g. "lustre-MDT0001" or "lustre" */ + name_end = strrchr(logname, '-'); + if (name_end) + len = name_end - logname; + else + len = strlen(logname); + if (len > 8) { + CERROR("fsname too long: %s\n", logname); + return -EINVAL; + } + if (len <= 0) { + CERROR("missing fsname: %s\n", logname); + return -EINVAL; + } + memcpy(&resname, logname, len); + + memset(res_id, 0, sizeof(*res_id)); + /* FIXME are resid names swabbed across the wire? 
*/ + res_id->name[0] = cpu_to_le64(resname); + CDEBUG(D_MGC, "log %s to resid "LPX64"/"LPX64" (%.8s)\n", logname, + res_id->name[0], res_id->name[1], (char *)&res_id->name[0]); + return 0; +} +EXPORT_SYMBOL(mgc_logname2resid); + +/********************** config llog list **********************/ +static struct list_head config_llog_list = LIST_HEAD_INIT(config_llog_list); +static spinlock_t config_list_lock = SPIN_LOCK_UNLOCKED; + +/* Take a reference to a config log */ +static int config_log_get(struct config_llog_data *cld) +{ + ENTRY; + CDEBUG(D_INFO, "log %s refs %d\n", cld->cld_logname, + atomic_read(&cld->cld_refcount)); + if (cld->cld_stopping) + RETURN(1); + atomic_inc(&cld->cld_refcount); + RETURN(0); +} + +/* Drop a reference to a config log. When no longer referenced, + we can free the config log data */ +static void config_log_put(struct config_llog_data *cld) +{ + ENTRY; + CDEBUG(D_INFO, "log %s refs %d\n", cld->cld_logname, + atomic_read(&cld->cld_refcount)); + if (atomic_dec_and_test(&cld->cld_refcount)) { + CDEBUG(D_MGC, "dropping config log %s\n", cld->cld_logname); + OBD_FREE(cld->cld_logname, strlen(cld->cld_logname) + 1); + if (cld->cld_cfg.cfg_instance != NULL) + OBD_FREE(cld->cld_cfg.cfg_instance, + strlen(cld->cld_cfg.cfg_instance) + 1); + OBD_FREE(cld, sizeof(*cld)); + } + EXIT; +} + +/* Find a config log by name */ +static struct config_llog_data *config_log_find(char *logname, + struct config_llog_instance *cfg) +{ + struct list_head *tmp; + struct config_llog_data *cld; + char *logid = logname; + int match_instance = 0; + ENTRY; + + if (cfg && cfg->cfg_instance) { + match_instance++; + logid = cfg->cfg_instance; + } + if (!logid) { + CERROR("No log specified\n"); + RETURN(ERR_PTR(-EINVAL)); + } + + spin_lock(&config_list_lock); + list_for_each(tmp, &config_llog_list) { + cld = list_entry(tmp, struct config_llog_data, cld_list_chain); + if (match_instance && cld->cld_cfg.cfg_instance && + strcmp(logid, cld->cld_cfg.cfg_instance) == 0) + goto 
out_found; + if (!match_instance && + strcmp(logid, cld->cld_logname) == 0) + goto out_found; + } + spin_unlock(&config_list_lock); + + CERROR("can't get log %s\n", logid); + RETURN(ERR_PTR(-ENOENT)); +out_found: + atomic_inc(&cld->cld_refcount); + spin_unlock(&config_list_lock); + RETURN(cld); +} + +/* Add this log to our list of active logs. + We have one active log per "mount" - client instance or servername. + Each instance may be at a different point in the log. */ +static int config_log_add(char *logname, struct config_llog_instance *cfg, + struct super_block *sb) +{ + struct config_llog_data *cld; + int rc; + ENTRY; + + CDEBUG(D_MGC, "adding config log %s:%s\n", logname, cfg->cfg_instance); + + OBD_ALLOC(cld, sizeof(*cld)); + if (!cld) + RETURN(-ENOMEM); + OBD_ALLOC(cld->cld_logname, strlen(logname) + 1); + if (!cld->cld_logname) { + OBD_FREE(cld, sizeof(*cld)); + RETURN(-ENOMEM); + } + strcpy(cld->cld_logname, logname); + cld->cld_cfg = *cfg; + cld->cld_cfg.cfg_last_idx = 0; + cld->cld_cfg.cfg_flags = 0; + cld->cld_cfg.cfg_sb = sb; + atomic_set(&cld->cld_refcount, 1); + if (cfg->cfg_instance != NULL) { + OBD_ALLOC(cld->cld_cfg.cfg_instance, + strlen(cfg->cfg_instance) + 1); + strcpy(cld->cld_cfg.cfg_instance, cfg->cfg_instance); + } + rc = mgc_logname2resid(logname, &cld->cld_resid); + if (rc) { + config_log_put(cld); + RETURN(rc); + } + spin_lock(&config_list_lock); + list_add(&cld->cld_list_chain, &config_llog_list); + spin_unlock(&config_list_lock); + + RETURN(rc); +} + +/* Stop watching for updates on this log. 
*/ +static int config_log_end(char *logname, struct config_llog_instance *cfg) +{ + struct config_llog_data *cld; + int rc = 0; + ENTRY; + + cld = config_log_find(logname, cfg); + if (IS_ERR(cld)) + RETURN(PTR_ERR(cld)); + /* drop the ref from the find */ + config_log_put(cld); + + cld->cld_stopping = 1; + spin_lock(&config_list_lock); + list_del(&cld->cld_list_chain); + spin_unlock(&config_list_lock); + /* drop the start ref */ + config_log_put(cld); + CDEBUG(D_MGC, "end config log %s (%d)\n", logname ? logname : "client", + rc); + RETURN(rc); +} + +/* Failsafe */ +static void config_log_end_all(void) +{ + struct list_head *tmp, *n; + struct config_llog_data *cld; + ENTRY; + + spin_lock(&config_list_lock); + list_for_each_safe(tmp, n, &config_llog_list) { + cld = list_entry(tmp, struct config_llog_data, cld_list_chain); + CERROR("conflog failsafe %s\n", cld->cld_logname); + list_del(&cld->cld_list_chain); + config_log_put(cld); + } + spin_unlock(&config_list_lock); + EXIT; +} + + +/********************** class fns **********************/ + +static int mgc_fs_setup(struct obd_device *obd, struct super_block *sb, + struct vfsmount *mnt) +{ + struct lvfs_run_ctxt saved; + struct lustre_sb_info *lsi = s2lsi(sb); + struct client_obd *cli = &obd->u.cli; + struct dentry *dentry; + char *label; + int err = 0; + ENTRY; + + LASSERT(lsi); + LASSERT(lsi->lsi_srv_mnt == mnt); + + /* The mgc fs exclusion sem. Only one fs can be setup at a time. 
*/ + down(&cli->cl_mgc_sem); + + cleanup_group_info(); + + obd->obd_fsops = fsfilt_get_ops(MT_STR(lsi->lsi_ldd)); + if (IS_ERR(obd->obd_fsops)) { + up(&cli->cl_mgc_sem); + CERROR("No fstype %s rc=%ld\n", MT_STR(lsi->lsi_ldd), + PTR_ERR(obd->obd_fsops)); + RETURN(PTR_ERR(obd->obd_fsops)); + } + + cli->cl_mgc_vfsmnt = mnt; + fsfilt_setup(obd, mnt->mnt_sb); + + OBD_SET_CTXT_MAGIC(&obd->obd_lvfs_ctxt); + obd->obd_lvfs_ctxt.pwdmnt = mnt; + obd->obd_lvfs_ctxt.pwd = mnt->mnt_root; + obd->obd_lvfs_ctxt.fs = get_ds(); + + push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + dentry = lookup_one_len(MOUNT_CONFIGS_DIR, current->fs->pwd, + strlen(MOUNT_CONFIGS_DIR)); + pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + CERROR("cannot lookup %s directory: rc = %d\n", + MOUNT_CONFIGS_DIR, err); + GOTO(err_ops, err); + } + cli->cl_mgc_configs_dir = dentry; + + /* We take an obd ref to insure that we can't get to mgc_cleanup + without calling mgc_fs_cleanup first. 
*/ + class_incref(obd); + + label = fsfilt_get_label(obd, mnt->mnt_sb); + if (label) + CDEBUG(D_MGC, "MGC using disk labelled=%s\n", label); + + /* We keep the cl_mgc_sem until mgc_fs_cleanup */ + RETURN(0); + +err_ops: + fsfilt_put_ops(obd->obd_fsops); + obd->obd_fsops = NULL; + cli->cl_mgc_vfsmnt = NULL; + up(&cli->cl_mgc_sem); + RETURN(err); +} + +static int mgc_fs_cleanup(struct obd_device *obd) +{ + struct client_obd *cli = &obd->u.cli; + int rc = 0; + ENTRY; + + LASSERT(cli->cl_mgc_vfsmnt != NULL); + + if (cli->cl_mgc_configs_dir != NULL) { + struct lvfs_run_ctxt saved; + push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + l_dput(cli->cl_mgc_configs_dir); + cli->cl_mgc_configs_dir = NULL; + pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + class_decref(obd); + } + + cli->cl_mgc_vfsmnt = NULL; + if (obd->obd_fsops) + fsfilt_put_ops(obd->obd_fsops); + + up(&cli->cl_mgc_sem); + RETURN(rc); +} + +static int mgc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage) +{ + int rc = 0; + ENTRY; + + switch (stage) { + case OBD_CLEANUP_EARLY: + case OBD_CLEANUP_EXPORTS: + break; + case OBD_CLEANUP_SELF_EXP: + rc = obd_llog_finish(obd, 0); + if (rc != 0) + CERROR("failed to cleanup llogging subsystems\n"); + break; + case OBD_CLEANUP_OBD: + break; + } + RETURN(rc); +} + +static int mgc_cleanup(struct obd_device *obd) +{ + struct client_obd *cli = &obd->u.cli; + int rc; + ENTRY; + + LASSERT(cli->cl_mgc_vfsmnt == NULL); + + config_log_end_all(); + + ptlrpcd_decref(); + + rc = client_obd_cleanup(obd); + RETURN(rc); +} + +static struct obd_device *the_mgc; + +static int mgc_setup(struct obd_device *obd, obd_count len, void *buf) +{ + int rc; + ENTRY; + + ptlrpcd_addref(); + + rc = client_obd_setup(obd, len, buf); + if (rc) + GOTO(err_decref, rc); + + rc = obd_llog_init(obd, obd, 0, NULL); + if (rc) { + CERROR("failed to setup llogging subsystems\n"); + GOTO(err_cleanup, rc); + } + + the_mgc = obd; + RETURN(rc); + +err_cleanup: + client_obd_cleanup(obd); +err_decref: + 
ptlrpcd_decref(); + RETURN(rc); +} + +static int mgc_process_log(struct obd_device *mgc, + struct config_llog_data *cld); + +/* FIXME I don't want a thread for every cld; make a list of cld's to requeue + and use only 1 thread. */ +/* reenqueue the lock, reparse the log */ +static int mgc_async_requeue(void *data) +{ + wait_queue_head_t waitq; + struct l_wait_info lwi; + struct config_llog_data *cld = (struct config_llog_data *)data; + char name[24]; + int rc = 0; + ENTRY; + + if (!data) + RETURN(-EINVAL); + if (cld->cld_stopping) + GOTO(out, rc = 0); + + snprintf(name, sizeof(name), "ll_log_%s", cld->cld_logname); + name[sizeof(name)-1] = '\0'; + ptlrpc_daemonize(name); + + CDEBUG(D_MGC, "requeue "LPX64" %s:%s\n", + cld->cld_resid.name[0], cld->cld_logname, + cld->cld_cfg.cfg_instance); + + /* Sleep a few seconds to allow the server who caused + the lock revocation to finish its setup, plus some random + so everyone doesn't try to reconnect at once. */ + init_waitqueue_head(&waitq); + lwi = LWI_TIMEOUT(3 * HZ + (ll_rand() & 0x7f), NULL, NULL); + l_wait_event(waitq, 0, &lwi); + + LASSERT(the_mgc); + + class_export_get(the_mgc->obd_self_export); +#if 0 + /* Re-send server info every time, in case MGS needs to regen its + logs (for write_conf). Do we need this? It's extra RPCs for + every server at every update. Turning it off until I'm sure + it's needed. 
*/ + server_register_target(cld->cld_cfg.cfg_sb); +#endif + rc = mgc_process_log(the_mgc, cld); + class_export_put(the_mgc->obd_self_export); +out: + /* Whether we enqueued again or not in mgc_process_log, + we're done with the ref from the old mgc_blocking_ast */ + config_log_put(cld); + + RETURN(rc); +} + +/* based on ll_mdc_blocking_ast */ +static int mgc_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag) +{ + struct lustre_handle lockh; + struct config_llog_data *cld = (struct config_llog_data *)data; + int rc = 0; + ENTRY; + + switch (flag) { + case LDLM_CB_BLOCKING: + /* mgs wants the lock, give it up... */ + LDLM_DEBUG(lock, "MGC blocking CB"); + ldlm_lock2handle(lock, &lockh); + rc = ldlm_cli_cancel(&lockh); + break; + case LDLM_CB_CANCELING: { + /* We've given up the lock, prepare ourselves to update. */ + LDLM_DEBUG(lock, "MGC cancel CB"); + + CDEBUG(D_MGC, "Lock res "LPX64" (%.8s)\n", + lock->l_resource->lr_name.name[0], + (char *)&lock->l_resource->lr_name.name[0]); + + /* Make sure not to re-enqueue when the mgc is stopping + (we get called from client_disconnect_export) */ + if (!lock->l_conn_export || + !lock->l_conn_export->exp_obd->u.cli.cl_conn_count) { + CDEBUG(D_MGC, "Disconnecting, don't requeue\n"); + goto out_drop; + } + if (lock->l_req_mode != lock->l_granted_mode) { + CERROR("original grant failed, won't requeue\n"); + goto out_drop; + } + if (!data) { + CERROR("missing data, won't requeue\n"); + goto out_drop; + } + if (cld->cld_stopping) { + CERROR("stopping, won't requeue\n"); + goto out_drop; + } + + /* Re-enqueue the lock in a separate thread, because we must + return from this fn before that lock can be taken. 
*/ + rc = cfs_kernel_thread(mgc_async_requeue, data, + CLONE_VM | CLONE_FILES); + if (rc < 0) { + CERROR("Cannot re-enqueue thread: %d\n", rc); + } else { + rc = 0; + break; + } +out_drop: + /* Drop this here or in mgc_async_requeue, + in either case, we're done with the reference + after this. */ + config_log_put(cld); + break; + } + default: + LBUG(); + } + + + if (rc) { + CERROR("%s CB failed %d:\n", flag == LDLM_CB_BLOCKING ? + "blocking" : "cancel", rc); + LDLM_ERROR(lock, "MGC ast"); + } + RETURN(rc); +} + +/* Take a config lock so we can get cancel notifications */ +static int mgc_enqueue(struct obd_export *exp, struct lov_stripe_md *lsm, + __u32 type, ldlm_policy_data_t *policy, __u32 mode, + int *flags, void *bl_cb, void *cp_cb, void *gl_cb, + void *data, __u32 lvb_len, void *lvb_swabber, + struct lustre_handle *lockh) +{ + struct config_llog_data *cld = (struct config_llog_data *)data; + struct obd_device *obd = class_exp2obd(exp); + int rc; + ENTRY; + + CDEBUG(D_MGC, "Enqueue for %s (res "LPX64")\n", cld->cld_logname, + cld->cld_resid.name[0]); + + /* We can only drop this config log ref when we drop the lock */ + if (config_log_get(cld)) + RETURN(ELDLM_LOCK_ABORTED); + + /* We need a callback for every lockholder, so don't try to + ldlm_lock_match (see rev 1.1.2.11.2.47) */ + + rc = ldlm_cli_enqueue(exp, NULL, obd->obd_namespace, cld->cld_resid, + type, NULL, mode, flags, + mgc_blocking_ast, ldlm_completion_ast, NULL, + data, NULL, 0, NULL, lockh); + + RETURN(rc); +} + +static int mgc_cancel(struct obd_export *exp, struct lov_stripe_md *md, + __u32 mode, struct lustre_handle *lockh) +{ + ENTRY; + + ldlm_lock_decref(lockh, mode); + + RETURN(0); +} + +#if 0 +static int mgc_iocontrol(unsigned int cmd, struct obd_export *exp, int len, + void *karg, void *uarg) +{ + struct obd_device *obd = exp->exp_obd; + struct obd_ioctl_data *data = karg; + struct llog_ctxt *ctxt; + struct lvfs_run_ctxt saved; + int rc; + ENTRY; + +#if (LINUX_VERSION_CODE < 
KERNEL_VERSION(2,5,0)) + MOD_INC_USE_COUNT; +#else + if (!try_module_get(THIS_MODULE)) { + CERROR("Can't get module. Is it alive?"); + return -EINVAL; + } +#endif + switch (cmd) { + /* REPLicator context */ + case OBD_IOC_PARSE: { + CERROR("MGC parsing llog %s\n", data->ioc_inlbuf1); + ctxt = llog_get_context(exp->exp_obd, LLOG_CONFIG_REPL_CTXT); + rc = class_config_parse_llog(ctxt, data->ioc_inlbuf1, NULL); + GOTO(out, rc); + } +#ifdef __KERNEL__ + case OBD_IOC_LLOG_INFO: + case OBD_IOC_LLOG_PRINT: { + ctxt = llog_get_context(obd, LLOG_CONFIG_REPL_CTXT); + rc = llog_ioctl(ctxt, cmd, data); + + GOTO(out, rc); + } +#endif + /* ORIGinator context */ + case OBD_IOC_DUMP_LOG: { + ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT); + push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + rc = class_config_dump_llog(ctxt, data->ioc_inlbuf1, NULL); + pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + if (rc) + RETURN(rc); + + GOTO(out, rc); + } + default: + CERROR("mgc_ioctl(): unrecognised ioctl %#x\n", cmd); + GOTO(out, rc = -ENOTTY); + } +out: +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) + MOD_DEC_USE_COUNT; +#else + module_put(THIS_MODULE); +#endif + + return rc; +} +#endif + +/* Send target_reg message to MGS */ +static int mgc_target_register(struct obd_export *exp, + struct mgs_target_info *mti) +{ + struct ptlrpc_request *req; + struct mgs_target_info *req_mti, *rep_mti; + int size = sizeof(*req_mti); + int rep_size = sizeof(*mti); + int rc; + ENTRY; + + req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MGS_VERSION, + MGS_TARGET_REG, 1, &size, NULL); + if (!req) + RETURN(-ENOMEM); + + req_mti = lustre_msg_buf(req->rq_reqmsg, 0, sizeof(*req_mti)); + if (!req_mti) + RETURN(-ENOMEM); + memcpy(req_mti, mti, sizeof(*req_mti)); + + req->rq_replen = lustre_msg_size(1, &rep_size); + + CDEBUG(D_MGC, "register %s\n", mti->mti_svname); + + rc = ptlrpc_queue_wait(req); + if (!rc) { + rep_mti = lustre_swab_repbuf(req, 0, sizeof(*rep_mti), + lustre_swab_mgs_target_info); + 
memcpy(mti, rep_mti, sizeof(*rep_mti)); + CDEBUG(D_MGC, "register %s got index = %d\n", + mti->mti_svname, mti->mti_stripe_index); + } else { + CERROR("register failed. rc=%d\n", rc); + } + ptlrpc_req_finished(req); + + RETURN(rc); +} + +int mgc_set_info_async(struct obd_export *exp, obd_count keylen, + void *key, obd_count vallen, void *val, + struct ptlrpc_request_set *set) +{ + struct obd_import *imp = class_exp2cliimp(exp); + int rc = -EINVAL; + ENTRY; + + /* Try to "recover" the initial connection; i.e. retry */ + if (KEY_IS(KEY_INIT_RECOV)) { + if (vallen != sizeof(int)) + RETURN(-EINVAL); + imp->imp_initial_recov = *(int *)val; + CDEBUG(D_HA, "%s: set imp_initial_recov = %d\n", + exp->exp_obd->obd_name, imp->imp_initial_recov); + RETURN(0); + } + /* Turn off initial_recov after we try all backup servers once */ + if (KEY_IS(KEY_INIT_RECOV_BACKUP)) { + int value; + if (vallen != sizeof(int)) + RETURN(-EINVAL); + value = *(int *)val; + imp->imp_initial_recov_bk = value > 0; + if (imp->imp_invalid || value > 1) { + /* Resurrect if we previously died */ + CDEBUG(D_MGC, "Reactivate %s %d:%d:%d:%s\n", + imp->imp_obd->obd_name, value, + imp->imp_deactive, imp->imp_invalid, + ptlrpc_import_state_name(imp->imp_state)); + /* can't put this in obdclass, module loop with ptlrpc*/ + /* This seems to be necessary when restarting a + combo mgs/mdt while the mgc is alive */ + ptlrpc_invalidate_import(imp); + /* Remove 'invalid' flag */ + ptlrpc_activate_import(imp); + /* Attempt a new connect */ + ptlrpc_recover_import(imp, NULL); + } + RETURN(0); + } + /* Hack alert */ + if (KEY_IS("register_target")) { + struct mgs_target_info *mti; + if (vallen != sizeof(struct mgs_target_info)) + RETURN(-EINVAL); + mti = (struct mgs_target_info *)val; + CDEBUG(D_MGC, "register_target %s %#x\n", + mti->mti_svname, mti->mti_flags); + rc = mgc_target_register(exp, mti); + RETURN(rc); + } + if (KEY_IS("set_fs")) { + struct super_block *sb = (struct super_block *)val; + struct lustre_sb_info 
*lsi; + if (vallen != sizeof(struct super_block)) + RETURN(-EINVAL); + lsi = s2lsi(sb); + rc = mgc_fs_setup(exp->exp_obd, sb, lsi->lsi_srv_mnt); + if (rc) { + CERROR("set_fs got %d\n", rc); + } + RETURN(rc); + } + if (KEY_IS("clear_fs")) { + if (vallen != 0) + RETURN(-EINVAL); + rc = mgc_fs_cleanup(exp->exp_obd); + if (rc) { + CERROR("clear_fs got %d\n", rc); + } + RETURN(rc); + } + + RETURN(rc); +} + +static int mgc_import_event(struct obd_device *obd, + struct obd_import *imp, + enum obd_import_event event) +{ + int rc = 0; + + LASSERT(imp->imp_obd == obd); + CDEBUG(D_MGC, "import event %#x\n", event); + + switch (event) { + case IMP_EVENT_INVALIDATE: { + struct ldlm_namespace *ns = obd->obd_namespace; + ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY); + break; + } + case IMP_EVENT_DISCON: + /* MGC imports should not wait for recovery */ + ptlrpc_invalidate_import(imp); + break; + case IMP_EVENT_INACTIVE: + case IMP_EVENT_ACTIVE: + case IMP_EVENT_OCD: + break; + default: + CERROR("Unknown import event %#x\n", event); + LBUG(); + } + RETURN(rc); +} + +static int mgc_llog_init(struct obd_device *obd, struct obd_device *tgt, + int count, struct llog_catid *logid) +{ + struct llog_ctxt *ctxt; + int rc; + ENTRY; + + rc = llog_setup(obd, LLOG_CONFIG_ORIG_CTXT, tgt, 0, NULL, + &llog_lvfs_ops); + if (rc) + RETURN(rc); + + rc = llog_setup(obd, LLOG_CONFIG_REPL_CTXT, tgt, 0, NULL, + &llog_client_ops); + if (rc == 0) { + ctxt = llog_get_context(obd, LLOG_CONFIG_REPL_CTXT); + ctxt->loc_imp = obd->u.cli.cl_import; + } + + RETURN(rc); +} + +static int mgc_llog_finish(struct obd_device *obd, int count) +{ + int rc; + ENTRY; + + rc = llog_cleanup(llog_get_context(obd, LLOG_CONFIG_REPL_CTXT)); + rc = llog_cleanup(llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT)); + + RETURN(rc); +} + +/* identical to mgs_log_is_empty */ +static int mgc_llog_is_empty(struct obd_device *obd, struct llog_ctxt *ctxt, + char *name) +{ + struct lvfs_run_ctxt saved; + struct llog_handle *llh; + int rc = 0; + 
+ push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + rc = llog_create(ctxt, &llh, NULL, name); + if (rc == 0) { + llog_init_handle(llh, LLOG_F_IS_PLAIN, NULL); + rc = llog_get_size(llh); + llog_close(llh); + } + pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + /* header is record 1 */ + return(rc <= 1); +} + +static int mgc_copy_handler(struct llog_handle *llh, struct llog_rec_hdr *rec, + void *data) +{ + struct llog_rec_hdr local_rec = *rec; + struct llog_handle *local_llh = (struct llog_handle *)data; + char *cfg_buf = (char*) (rec + 1); + struct lustre_cfg *lcfg; + int rc = 0; + ENTRY; + + lcfg = (struct lustre_cfg *)cfg_buf; + + /* FIXME we should always write to an empty log, so remove this check.*/ + /* append new records */ + if (rec->lrh_index >= llog_get_size(local_llh)) { + rc = llog_write_rec(local_llh, &local_rec, NULL, 0, + (void *)cfg_buf, -1); + + CDEBUG(D_INFO, "idx=%d, rc=%d, len=%d, cmd %x %s %s\n", + rec->lrh_index, rc, rec->lrh_len, lcfg->lcfg_command, + lustre_cfg_string(lcfg, 0), lustre_cfg_string(lcfg, 1)); + } else { + CDEBUG(D_INFO, "skip idx=%d\n", rec->lrh_index); + } + + RETURN(rc); +} + +static int mgc_copy_llog(struct obd_device *obd, struct llog_ctxt *rctxt, + struct llog_ctxt *lctxt, char *logname) +{ + struct llog_handle *local_llh, *remote_llh; + struct obd_uuid *uuid; + int rc, rc2; + ENTRY; + + /* open local log */ + rc = llog_create(lctxt, &local_llh, NULL, logname); + if (rc) + RETURN(rc); + /* set the log header uuid for fun */ + OBD_ALLOC_PTR(uuid); + obd_str2uuid(uuid, logname); + rc = llog_init_handle(local_llh, LLOG_F_IS_PLAIN, uuid); + OBD_FREE_PTR(uuid); + if (rc) + GOTO(out_closel, rc); + + /* FIXME write new log to a temp name, then vfs_rename over logname + upon successful completion. 
*/ + + /* open remote log */ + rc = llog_create(rctxt, &remote_llh, NULL, logname); + if (rc) + GOTO(out_closel, rc); + rc = llog_init_handle(remote_llh, LLOG_F_IS_PLAIN, NULL); + if (rc) + GOTO(out_closer, rc); + + rc = llog_process(remote_llh, mgc_copy_handler,(void *)local_llh, NULL); + +out_closer: + rc2 = llog_close(remote_llh); + if (!rc) + rc = rc2; +out_closel: + rc2 = llog_close(local_llh); + if (!rc) + rc = rc2; + + CDEBUG(D_MGC, "Copied remote log %s (%d)\n", logname, rc); + RETURN(rc); +} + +DECLARE_MUTEX(llog_process_lock); + +/* Get a config log from the MGS and process it. + This func is called for both clients and servers. Returns -EINVAL when cld, its superblock or the REPL llog context is missing, and 0 without doing anything when cld->cld_stopping is set; the work after those checks runs under llog_process_lock. */ +static int mgc_process_log(struct obd_device *mgc, + struct config_llog_data *cld) +{ + struct llog_ctxt *ctxt, *lctxt; + struct lustre_handle lockh; + struct client_obd *cli = &mgc->u.cli; + struct lvfs_run_ctxt saved; + struct lustre_sb_info *lsi; + int rc = 0, rcl, flags = 0, must_pop = 0; + ENTRY; + + if (!cld || !cld->cld_cfg.cfg_sb) { + /* This should never happen */ + CERROR("Missing cld, aborting log update\n"); + RETURN(-EINVAL); + } + if (cld->cld_stopping) + RETURN(0); + + lsi = s2lsi(cld->cld_cfg.cfg_sb); + + CDEBUG(D_MGC, "Process log %s:%s from %d\n", cld->cld_logname, + cld->cld_cfg.cfg_instance, cld->cld_cfg.cfg_last_idx + 1); + + ctxt = llog_get_context(mgc, LLOG_CONFIG_REPL_CTXT); + if (!ctxt) { + CERROR("missing llog context\n"); + RETURN(-EINVAL); + } + + /* I don't want multiple processes running process_log at once -- + sounds like badness. It actually might be fine, as long as + we're not trying to update from the same log + simultaneously (in which case we should use a per-log sem.) 
*/ + down(&llog_process_lock); + + /* Get the cfg lock on the llog */ + rcl = mgc_enqueue(mgc->u.cli.cl_mgc_mgsexp, NULL, LDLM_PLAIN, NULL, + LCK_CR, &flags, NULL, NULL, NULL, + cld, 0, NULL, &lockh); + if (rcl) + CERROR("Can't get cfg lock: %d\n", rcl); + + lctxt = llog_get_context(mgc, LLOG_CONFIG_ORIG_CTXT); + + /* Copy the setup log locally if we can. Don't mess around if we're + running an MGS though (logs are already local). */ + if (lctxt && lsi && (lsi->lsi_flags & LSI_SERVER) && + (lsi->lsi_srv_mnt == cli->cl_mgc_vfsmnt) && + !IS_MGS(lsi->lsi_ldd)) { + push_ctxt(&saved, &mgc->obd_lvfs_ctxt, NULL); + must_pop++; + if (rcl == 0) + /* Only try to copy log if we have the lock. */ + rc = mgc_copy_llog(mgc, ctxt, lctxt, cld->cld_logname); + if (rcl || rc) { + if (mgc_llog_is_empty(mgc, lctxt, cld->cld_logname)) { + LCONSOLE_ERROR("Failed to get MGS log %s " + "and no local copy.\n", + cld->cld_logname); + GOTO(out_pop, rc = -ENOTCONN); + } + LCONSOLE_WARN("Failed to get MGS log %s, using " + "local copy.\n", cld->cld_logname); + } + /* Now, whether we copied or not, start using the local llog. + If we failed to copy, we'll start using whatever the old + log has. */ + ctxt = lctxt; + } + + /* logname and instance info should be the same, so use our + copy of the instance for the update. The cfg_last_idx will + be updated here. 
*/ + rc = class_config_parse_llog(ctxt, cld->cld_logname, &cld->cld_cfg); + + out_pop: + if (must_pop) + pop_ctxt(&saved, &mgc->obd_lvfs_ctxt, NULL); + + /* Now drop the lock so MGS can revoke it */ + if (!rcl) { + rcl = mgc_cancel(mgc->u.cli.cl_mgc_mgsexp, NULL, + LCK_CR, &lockh); + if (rcl) + CERROR("Can't drop cfg lock: %d\n", rcl); + } + + if (rc) { + CERROR("%s: the configuration '%s' could not be read " + "(%d) from the MGS.\n", + mgc->obd_name, cld->cld_logname, rc); + } + + up(&llog_process_lock); + + RETURN(rc); +} + +static int mgc_process_config(struct obd_device *obd, obd_count len, void *buf) +{ + struct lustre_cfg *lcfg = buf; + int cmd; + int rc = 0; + ENTRY; + + switch(cmd = lcfg->lcfg_command) { + case LCFG_LOV_ADD_OBD: { + struct mgs_target_info *mti; + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) != + sizeof(struct mgs_target_info)) + GOTO(out, rc = -EINVAL); + + mti = (struct mgs_target_info *)lustre_cfg_buf(lcfg, 1); + CDEBUG(D_MGC, "add_target %s %#x\n", + mti->mti_svname, mti->mti_flags); + rc = mgc_target_register(obd->u.cli.cl_mgc_mgsexp, mti); + break; + } + case LCFG_LOV_DEL_OBD: + /* FIXME */ + CERROR("lov_del_obd unimplemented\n"); + rc = -ENOSYS; + break; + case LCFG_LOG_START: { + struct config_llog_data *cld; + struct config_llog_instance *cfg; + struct super_block *sb; + char *logname = lustre_cfg_string(lcfg, 1); + cfg = (struct config_llog_instance *)lustre_cfg_buf(lcfg, 2); + sb = *(struct super_block **)lustre_cfg_buf(lcfg, 3); + + CDEBUG(D_MGC, "parse_log %s from %d\n", logname, + cfg->cfg_last_idx); + + /* We're only called through here on the initial mount */ + rc = config_log_add(logname, cfg, sb); + if (rc) + break; + cld = config_log_find(logname, cfg); + if (IS_ERR(cld)) { + rc = PTR_ERR(cld); + break; + } + + /* COMPAT_146 */ + /* For old logs, there was no start marker. */ + /* FIXME only set this for old logs! 
*/ + cld->cld_cfg.cfg_flags |= CFG_F_MARKER; + + rc = mgc_process_log(obd, cld); + config_log_put(cld); + + break; + } + case LCFG_LOG_END: { + struct config_llog_instance *cfg = NULL; + char *logname = lustre_cfg_string(lcfg, 1); + if (lcfg->lcfg_bufcount >= 2) + cfg = (struct config_llog_instance *)lustre_cfg_buf( + lcfg, 2); + rc = config_log_end(logname, cfg); + break; + } + default: { + CERROR("Unknown command: %d\n", lcfg->lcfg_command); + GOTO(out, rc = -EINVAL); + + } + } +out: + RETURN(rc); +} + +struct obd_ops mgc_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = mgc_setup, + .o_precleanup = mgc_precleanup, + .o_cleanup = mgc_cleanup, + .o_add_conn = client_import_add_conn, + .o_del_conn = client_import_del_conn, + .o_connect = client_connect_import, + .o_disconnect = client_disconnect_export, + //.o_enqueue = mgc_enqueue, + .o_cancel = mgc_cancel, + //.o_iocontrol = mgc_iocontrol, + .o_set_info_async = mgc_set_info_async, + .o_import_event = mgc_import_event, + .o_llog_init = mgc_llog_init, + .o_llog_finish = mgc_llog_finish, + .o_process_config = mgc_process_config, +}; + +int __init mgc_init(void) +{ + return class_register_type(&mgc_obd_ops, NULL, LUSTRE_MGC_NAME); +} + +#ifdef __KERNEL__ +static void /*__exit*/ mgc_exit(void) +{ + class_unregister_type(LUSTRE_MGC_NAME); +} + +MODULE_AUTHOR("Cluster File Systems, Inc. 
"); +MODULE_DESCRIPTION("Lustre Management Client"); +MODULE_LICENSE("GPL"); + +module_init(mgc_init); +module_exit(mgc_exit); +#endif diff --git a/lustre/mgs/.cvsignore b/lustre/mgs/.cvsignore new file mode 100644 index 0000000..d5103fa --- /dev/null +++ b/lustre/mgs/.cvsignore @@ -0,0 +1,15 @@ +.Xrefs +config.log +config.status +configure +Makefile +.deps +TAGS +.*.cmd +autoMakefile.in +autoMakefile +*.ko +*.mod.c +.*.o.flags +.tmp_versions +.depend diff --git a/lustre/mgs/Makefile.in b/lustre/mgs/Makefile.in new file mode 100644 index 0000000..8bb6a5f --- /dev/null +++ b/lustre/mgs/Makefile.in @@ -0,0 +1,4 @@ +MODULES := mgs +mgs-objs := mgs_handler.o mgs_fs.o mgs_llog.o lproc_mgs.o + +@INCLUDE_RULES@ diff --git a/lustre/mgs/autoMakefile.am b/lustre/mgs/autoMakefile.am new file mode 100644 index 0000000..53734b0 --- /dev/null +++ b/lustre/mgs/autoMakefile.am @@ -0,0 +1,11 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +if MODULES +modulefs_DATA = mgs$(KMODEXT) +endif + +MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ +DIST_SOURCES := $(mgs-objs:%.o=%.c) mgs_internal.h diff --git a/lustre/mgs/lproc_mgs.c b/lustre/mgs/lproc_mgs.c new file mode 100644 index 0000000..d1ce512 --- /dev/null +++ b/lustre/mgs/lproc_mgs.c @@ -0,0 +1,55 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002 Cluster File Systems, Inc. + * + * This file is part of the Lustre file system, http://www.lustre.org + * Lustre is a trademark of Cluster File Systems, Inc. + * + * You may have signed or agreed to another license before downloading + * this software. If so, you are bound by the terms and conditions + * of that agreement, and the following does not apply to you. See the + * LICENSE file included with this distribution for more information. 
+ * + * If you did not agree to a different license, then this copy of Lustre + * is open source software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * In either case, Lustre is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * license text for more details. + * + */ +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) +#include +#endif +#include +#include +#include +#include "mgs_internal.h" + +#ifdef LPROCFS +struct lprocfs_vars lprocfs_mgs_obd_vars[] = { + { 0 } +}; + +struct lprocfs_vars lprocfs_mgs_module_vars[] = { + { 0 } +}; + +struct lprocfs_vars lprocfs_mgt_obd_vars[] = { + { 0 } +}; + +struct lprocfs_vars lprocfs_mgt_module_vars[] = { + { 0 } +}; + +LPROCFS_INIT_VARS(mgs, lprocfs_mgs_module_vars, lprocfs_mgs_obd_vars); +LPROCFS_INIT_VARS(mgt, lprocfs_mgt_module_vars, lprocfs_mgt_obd_vars); +#endif diff --git a/lustre/mgs/mgs_fs.c b/lustre/mgs/mgs_fs.c new file mode 100644 index 0000000..8151705 --- /dev/null +++ b/lustre/mgs/mgs_fs.c @@ -0,0 +1,200 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lustre/mgs/mgs_fs.c + * Lustre Management Server (MGS) filesystem interface code + * + * Copyright (C) 2006 Cluster File Systems, Inc. + * Author: Nathan Rutman + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif +#define DEBUG_SUBSYSTEM S_MGS + +#include +#include +#include +#include +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) +#include +#endif +#include +#include +#include +#include +#include +#include +#include "mgs_internal.h" + +/* Same as mds_fid2dentry */ +/* Look up an entry by inode number. */ +/* this function ONLY returns valid dget'd dentries with an initialized inode + or errors */ +static struct dentry *mgs_fid2dentry(struct mgs_obd *mgs, struct ll_fid *fid) +{ + char fid_name[32]; + unsigned long ino = fid->id; + __u32 generation = fid->generation; + struct inode *inode; + struct dentry *result; + + CDEBUG(D_DENTRY, "--> mgs_fid2dentry: ino/gen %lu/%u, sb %p\n", + ino, generation, mgs->mgs_sb); + + if (ino == 0) + RETURN(ERR_PTR(-ESTALE)); + + snprintf(fid_name, sizeof(fid_name), "0x%lx", ino); + + /* under ext3 this is neither supposed to return bad inodes + nor NULL inodes. */ + result = ll_lookup_one_len(fid_name, mgs->mgs_fid_de, strlen(fid_name)); + if (IS_ERR(result)) + RETURN(result); + + inode = result->d_inode; + if (!inode) + RETURN(ERR_PTR(-ENOENT)); + + if (inode->i_generation == 0 || inode->i_nlink == 0) { + LCONSOLE_WARN("Found inode with zero generation or link -- this" + " may indicate disk corruption (inode: %lu, link:" + " %lu, count: %d)\n", inode->i_ino, + (unsigned long)inode->i_nlink, + atomic_read(&inode->i_count)); + l_dput(result); + RETURN(ERR_PTR(-ENOENT)); + } + + if (generation && inode->i_generation != generation) { + /* we didn't find the right inode.. 
*/ + CDEBUG(D_INODE, "found wrong generation: inode %lu, link: %lu, " + "count: %d, generation %u/%u\n", inode->i_ino, + (unsigned long)inode->i_nlink, + atomic_read(&inode->i_count), inode->i_generation, + generation); + l_dput(result); + RETURN(ERR_PTR(-ENOENT)); + } + + RETURN(result); +} + +static struct dentry *mgs_lvfs_fid2dentry(__u64 id, __u32 gen, __u64 gr, + void *data) +{ + struct obd_device *obd = data; + struct ll_fid fid; + fid.id = id; + fid.generation = gen; + return mgs_fid2dentry(&obd->u.mgs, &fid); +} + +struct lvfs_callback_ops mgs_lvfs_ops = { + l_fid2dentry: mgs_lvfs_fid2dentry, +}; + +int mgs_fs_setup(struct obd_device *obd, struct vfsmount *mnt) +{ + struct mgs_obd *mgs = &obd->u.mgs; + struct lvfs_run_ctxt saved; + struct dentry *dentry; + int rc; + ENTRY; + + // FIXME what's this? + rc = cleanup_group_info(); + if (rc) + RETURN(rc); + + mgs->mgs_vfsmnt = mnt; + mgs->mgs_sb = mnt->mnt_root->d_inode->i_sb; + + fsfilt_setup(obd, mgs->mgs_sb); + + OBD_SET_CTXT_MAGIC(&obd->obd_lvfs_ctxt); + obd->obd_lvfs_ctxt.pwdmnt = mnt; + obd->obd_lvfs_ctxt.pwd = mnt->mnt_root; + obd->obd_lvfs_ctxt.fs = get_ds(); + obd->obd_lvfs_ctxt.cb_ops = mgs_lvfs_ops; + + push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + + /* Setup the configs dir */ + dentry = simple_mkdir(current->fs->pwd, MOUNT_CONFIGS_DIR, 0777, 1); + if (IS_ERR(dentry)) { + rc = PTR_ERR(dentry); + CERROR("cannot create %s directory: rc = %d\n", + MOUNT_CONFIGS_DIR, rc); + GOTO(err_pop, rc); + } + mgs->mgs_configs_dir = dentry; + + /* Need the iopen dir for fid2dentry, required by + LLOG_ORIGIN_HANDLE_READ_HEADER */ + dentry = lookup_one_len("__iopen__", current->fs->pwd, + strlen("__iopen__")); + if (IS_ERR(dentry)) { + rc = PTR_ERR(dentry); + CERROR("cannot lookup __iopen__ directory: rc = %d\n", rc); + GOTO(err_configs, rc); + } + mgs->mgs_fid_de = dentry; + if (!dentry->d_inode || is_bad_inode(dentry->d_inode)) { + rc = -ENOENT; + CERROR("__iopen__ directory has no inode? 
rc = %d\n", rc); + GOTO(err_fid, rc); + } + +err_pop: + pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + return rc; +err_fid: + dput(mgs->mgs_fid_de); +err_configs: + dput(mgs->mgs_configs_dir); + goto err_pop; +} + +int mgs_fs_cleanup(struct obd_device *obd) +{ + struct mgs_obd *mgs = &obd->u.mgs; + struct lvfs_run_ctxt saved; + int rc = 0; + + class_disconnect_exports(obd); /* cleans up client info too */ + + push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + + if (mgs->mgs_configs_dir) { + /*CERROR("configs dir dcount=%d\n", + atomic_read(&mgs->mgs_configs_dir->d_count));*/ + l_dput(mgs->mgs_configs_dir); + mgs->mgs_configs_dir = NULL; + } + + shrink_dcache_parent(mgs->mgs_fid_de); + /*CERROR("fid dir dcount=%d\n", + atomic_read(&mgs->mgs_fid_de->d_count));*/ + dput(mgs->mgs_fid_de); + + pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + + return rc; +} diff --git a/lustre/mgs/mgs_handler.c b/lustre/mgs/mgs_handler.c new file mode 100644 index 0000000..94dc87c --- /dev/null +++ b/lustre/mgs/mgs_handler.c @@ -0,0 +1,698 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lustre/mgs/mgs_handler.c + * Lustre Management Server (mgs) request handler + * + * Copyright (C) 2006 Cluster File Systems, Inc. + * Author: Nathan Rutman + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif +#define DEBUG_SUBSYSTEM S_MGS +#define D_MGS D_CONFIG/*|D_WARNING*/ + +#ifdef __KERNEL__ +# include +# include +# include +# include +#else +# include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include "mgs_internal.h" + + +/* Establish a connection to the MGS.*/ +static int mgs_connect(struct lustre_handle *conn, struct obd_device *obd, + struct obd_uuid *cluuid, struct obd_connect_data *data) +{ + struct obd_export *exp; + int rc; + ENTRY; + + if (!conn || !obd || !cluuid) + RETURN(-EINVAL); + + rc = class_connect(conn, obd, cluuid); + if (rc) + RETURN(rc); + exp = class_conn2export(conn); + LASSERT(exp); + + if (data != NULL) { + data->ocd_connect_flags &= MGS_CONNECT_SUPPORTED; + exp->exp_connect_flags = data->ocd_connect_flags; + data->ocd_version = LUSTRE_VERSION_CODE; + } + + if (rc) { + class_disconnect(exp); + } else { + class_export_put(exp); + } + + RETURN(rc); +} + +static int mgs_disconnect(struct obd_export *exp) +{ + unsigned long irqflags; + int rc; + ENTRY; + + LASSERT(exp); + class_export_get(exp); + + /* Disconnect early so that clients can't keep using export */ + rc = class_disconnect(exp); + ldlm_cancel_locks_for_export(exp); + + /* complete all outstanding replies */ + spin_lock_irqsave(&exp->exp_lock, irqflags); + while (!list_empty(&exp->exp_outstanding_replies)) { + struct ptlrpc_reply_state *rs = + list_entry(exp->exp_outstanding_replies.next, + struct ptlrpc_reply_state, rs_exp_list); + struct ptlrpc_service *svc = rs->rs_service; + + spin_lock(&svc->srv_lock); + list_del_init(&rs->rs_exp_list); + ptlrpc_schedule_difficult_reply(rs); + spin_unlock(&svc->srv_lock); + } + spin_unlock_irqrestore(&exp->exp_lock, irqflags); + + class_export_put(exp); + RETURN(rc); +} + +static int mgs_cleanup(struct obd_device *obd); +static int mgs_handle(struct ptlrpc_request *req); + +/* Start the MGS obd */ +static int mgs_setup(struct obd_device *obd, 
obd_count len, void *buf) +{ + struct lprocfs_static_vars lvars; + struct mgs_obd *mgs = &obd->u.mgs; + struct lustre_mount_info *lmi; + struct lustre_sb_info *lsi; + struct vfsmount *mnt; + int rc = 0; + ENTRY; + + CDEBUG(D_CONFIG, "Starting MGS\n"); + + /* Find our disk */ + lmi = server_get_mount(obd->obd_name); + if (!lmi) + RETURN(rc = -EINVAL); + + mnt = lmi->lmi_mnt; + lsi = s2lsi(lmi->lmi_sb); + obd->obd_fsops = fsfilt_get_ops(MT_STR(lsi->lsi_ldd)); + if (IS_ERR(obd->obd_fsops)) + GOTO(err_put, rc = PTR_ERR(obd->obd_fsops)); + + /* namespace for mgs llog. NOTE(review): the failure branch below calls mgs_cleanup(), which runs ping_evictor_stop() although ping_evictor_start() has not been called yet in this function, and the err_ops/err_put unwinding still runs afterwards -- confirm this is intended. */ + obd->obd_namespace = ldlm_namespace_new("MGS", LDLM_NAMESPACE_SERVER); + if (obd->obd_namespace == NULL) { + mgs_cleanup(obd); + GOTO(err_ops, rc = -ENOMEM); + } + + /* ldlm setup */ + ptlrpc_init_client(LDLM_CB_REQUEST_PORTAL, LDLM_CB_REPLY_PORTAL, + "mgs_ldlm_client", &obd->obd_ldlm_client); + + LASSERT(!lvfs_check_rdonly(lvfs_sbdev(mnt->mnt_sb))); + + rc = mgs_fs_setup(obd, mnt); + if (rc) { + CERROR("%s: MGS filesystem method init failed: rc = %d\n", + obd->obd_name, rc); + GOTO(err_ns, rc); + } + + rc = llog_start_commit_thread(); + if (rc < 0) + GOTO(err_fs, rc); + + rc = llog_setup(obd, LLOG_CONFIG_ORIG_CTXT, obd, 0, NULL, + &llog_lvfs_ops); + if (rc) + GOTO(err_fs, rc); + + /* Allow reconnect attempts */ + obd->obd_replayable = 1; + + /* Internal mgs setup */ + mgs_init_fsdb_list(obd); + sema_init(&mgs->mgs_sem, 1); + + /* Start the service threads */ + mgs->mgs_service = + ptlrpc_init_svc(MGS_NBUFS, MGS_BUFSIZE, MGS_MAXREQSIZE, + MGS_MAXREPSIZE, MGS_REQUEST_PORTAL, + MGC_REPLY_PORTAL, MGS_SERVICE_WATCHDOG_TIMEOUT, + mgs_handle, LUSTRE_MGS_NAME, + obd->obd_proc_entry, NULL, MGS_NUM_THREADS); + + if (!mgs->mgs_service) { + CERROR("failed to start service\n"); + GOTO(err_fs, rc = -ENOMEM); + } + + rc = ptlrpc_start_threads(obd, mgs->mgs_service, "ll_mgs"); + if (rc) + GOTO(err_thread, rc); + + /* Setup proc */ + lprocfs_init_vars(mgs, &lvars); + lprocfs_obd_setup(obd, lvars.obd_vars); + + 
ping_evictor_start(); + + LCONSOLE_INFO("MGS %s started\n", obd->obd_name); + + RETURN(0); + +err_thread: + ptlrpc_unregister_service(mgs->mgs_service); +err_fs: + /* No extra cleanup needed for llog_start_commit_thread() */ + mgs_fs_cleanup(obd); +err_ns: + ldlm_namespace_free(obd->obd_namespace, 0); + obd->obd_namespace = NULL; +err_ops: + fsfilt_put_ops(obd->obd_fsops); +err_put: + server_put_mount(obd->obd_name, mgs->mgs_vfsmnt); + mgs->mgs_sb = 0; + return rc; +} + +static int mgs_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage) +{ + int rc = 0; + ENTRY; + + switch (stage) { + case OBD_CLEANUP_EARLY: + case OBD_CLEANUP_EXPORTS: + break; + case OBD_CLEANUP_SELF_EXP: + llog_cleanup(llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT)); + rc = obd_llog_finish(obd, 0); + break; + case OBD_CLEANUP_OBD: + break; + } + RETURN(rc); +} + +static int mgs_ldlm_nsfree(void *data) +{ + struct ldlm_namespace *ns = (struct ldlm_namespace *)data; + int rc; + ENTRY; + + ptlrpc_daemonize("ll_mgs_nsfree"); + rc = ldlm_namespace_free(ns, 1 /* obd_force should always be on */); + RETURN(rc); +} + +static int mgs_cleanup(struct obd_device *obd) +{ + struct mgs_obd *mgs = &obd->u.mgs; + lvfs_sbdev_type save_dev; + ENTRY; + + ping_evictor_stop(); + + if (mgs->mgs_sb == NULL) + RETURN(0); + + save_dev = lvfs_sbdev(mgs->mgs_sb); + + ptlrpc_unregister_service(mgs->mgs_service); + + lprocfs_obd_cleanup(obd); + + mgs_cleanup_fsdb_list(obd); + + mgs_fs_cleanup(obd); + + server_put_mount(obd->obd_name, mgs->mgs_vfsmnt); + mgs->mgs_sb = NULL; + + /* Free the namespace in its own thread, so that if the + ldlm_cancel_handler put the last mgs obd ref, we won't + deadlock here. 
*/ + cfs_kernel_thread(mgs_ldlm_nsfree, obd->obd_namespace, + CLONE_VM | CLONE_FILES); + + lvfs_clear_rdonly(save_dev); + + fsfilt_put_ops(obd->obd_fsops); + + LCONSOLE_INFO("%s has stopped.\n", obd->obd_name); + RETURN(0); +} + +/* similar to filter_prepare_destroy */ +static int mgs_get_cfg_lock(struct obd_device *obd, char *fsname, + struct lustre_handle *lockh) +{ + struct ldlm_res_id res_id; + int rc, flags = 0; + ENTRY; + + rc = mgc_logname2resid(fsname, &res_id); + if (!rc) + rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, res_id, + LDLM_PLAIN, NULL, LCK_EX, &flags, + ldlm_blocking_ast, ldlm_completion_ast, + NULL, fsname, NULL, 0, NULL, lockh); + if (rc) + CERROR("can't take cfg lock for %s (%d)\n", fsname, rc); + + RETURN(rc); +} + +static int mgs_put_cfg_lock(struct lustre_handle *lockh) +{ + ENTRY; + ldlm_lock_decref(lockh, LCK_EX); + RETURN(0); +} + +/* rc=0 means ok */ +static int mgs_check_target(struct obd_device *obd, struct mgs_target_info *mti) +{ + int rc; + ENTRY; + + rc = mgs_check_index(obd, mti); + if (rc == 0) { + LCONSOLE_ERROR("Index for %s has disappeared! " + "Regenerating this portion of the logs." + "\n", mti->mti_svname); + mti->mti_flags |= LDD_F_UPDATE; + rc = 1; + } else if (rc == -1) { + LCONSOLE_ERROR("Client log %s-client has disappeared! " + "Regenerating all logs.\n", + mti->mti_fsname); + mti->mti_flags |= LDD_F_WRITECONF; + rc = 1; + } else { + /* Index is correctly marked as used */ + + /* If the logs don't contain the mti_nids then add + them as failover nids */ + rc = mgs_check_failnid(obd, mti); + } + + + RETURN(rc); +} + +/* Called whenever a target starts up. Flags indicate first connect, etc. 
*/ +static int mgs_handle_target_reg(struct ptlrpc_request *req) +{ + struct obd_device *obd = req->rq_export->exp_obd; + struct lustre_handle lockh; + struct mgs_target_info *mti, *rep_mti; + int rep_size = sizeof(*mti); + int rc = 0, lockrc; + ENTRY; + + mti = lustre_swab_reqbuf(req, 0, sizeof(*mti), + lustre_swab_mgs_target_info); + + if (!(mti->mti_flags & (LDD_F_WRITECONF | LDD_F_UPGRADE14 | + LDD_F_UPDATE))) { + /* We're just here as a startup ping. */ + CDEBUG(D_MGS, "Server %s is running on %s\n", + mti->mti_svname, obd_export_nid2str(req->rq_export)); + rc = mgs_check_target(obd, mti); + /* above will set appropriate mti flags */ + if (!rc) + /* Nothing wrong, don't revoke lock */ + GOTO(out_nolock, rc); + } + + /* Revoke the config lock to make sure nobody is reading. */ + /* Although actually I think it should be alright if + someone was reading while we were updating the logs - if we + revoke at the end they will just update from where they left off. */ + lockrc = mgs_get_cfg_lock(obd, mti->mti_fsname, &lockh); + if (lockrc != ELDLM_OK) { + LCONSOLE_ERROR("%s: Can't signal other nodes to update " + "their configuration (%d). Updating local logs " + "anyhow; you might have to manually restart " + "other nodes to get the latest configuration.\n", + obd->obd_name, lockrc); + } + + /* Log writing contention is handled by the fsdb_sem */ + + if (mti->mti_flags & LDD_F_WRITECONF) { + rc = mgs_erase_logs(obd, mti->mti_fsname); + mti->mti_flags |= LDD_F_UPDATE; + LCONSOLE_WARN("%s: Logs for fs %s were removed by user request." 
+                               " All servers must re-register in order to "
+                               "regenerate the client log.\n",
+                               obd->obd_name, mti->mti_fsname);
+                mti->mti_flags &= ~LDD_F_WRITECONF;
+        }
+
+        /* COMPAT_146 */
+        if (mti->mti_flags & LDD_F_UPGRADE14) {
+                rc = mgs_upgrade_sv_14(obd, mti);
+                if (rc) {
+                        CERROR("Can't upgrade from 1.4 (%d)\n", rc);
+                        GOTO(out, rc);
+                }
+
+                mti->mti_flags &= ~LDD_F_UPGRADE14;
+                /* Turn off the upgrade flag permanently */
+                mti->mti_flags |= LDD_F_REWRITE_LDD;
+        }
+        /* end COMPAT_146 */
+
+        if (mti->mti_flags & LDD_F_UPDATE) {
+                CDEBUG(D_MGS, "adding %s, index=%d\n", mti->mti_svname,
+                       mti->mti_stripe_index);
+
+                /* create the log for the new target
+                   and update the client/mdt logs */
+                rc = mgs_write_log_target(obd, mti);
+                if (rc) {
+                        CERROR("Failed to write %s log (%d)\n",
+                               mti->mti_svname, rc);
+                        GOTO(out, rc);
+                }
+
+                mti->mti_flags &= ~(LDD_F_VIRGIN | LDD_F_UPDATE |
+                                    LDD_F_NEED_INDEX);
+                mti->mti_flags |= LDD_F_REWRITE_LDD;
+        }
+
+out:
+        /* done with log update */
+        if (lockrc == ELDLM_OK)
+                mgs_put_cfg_lock(&lockh);
+out_nolock:
+        CDEBUG(D_MGS, "replying with %s, index=%d, rc=%d\n", mti->mti_svname,
+               mti->mti_stripe_index, rc);
+        lustre_pack_reply(req, 1, &rep_size, NULL);
+        /* send back the whole mti in the reply */
+        rep_mti = lustre_msg_buf(req->rq_repmsg, 0, sizeof(*rep_mti));
+        memcpy(rep_mti, mti, sizeof(*rep_mti));
+        RETURN(rc);
+}
+
+/*
+ * Request handler for the MGS service.
+ *
+ * Dispatches on the request opcode and sends the reply itself through
+ * target_send_reply().  Every opcode other than MGS_CONNECT needs an
+ * export on the request; without one the request is answered with
+ * -ENOTCONN.  Unknown opcodes are answered with ptlrpc_error() and its
+ * result is returned; every other path returns 0 once the reply is sent.
+ */
+int mgs_handle(struct ptlrpc_request *req)
+{
+        int fail = OBD_FAIL_MGS_ALL_REPLY_NET;
+        int rc = 0;
+        ENTRY;
+
+        OBD_FAIL_RETURN(OBD_FAIL_MGS_ALL_REQUEST_NET | OBD_FAIL_ONCE, 0);
+
+        LASSERT(current->journal_info == NULL);
+        if (req->rq_reqmsg->opc != MGS_CONNECT && req->rq_export == NULL) {
+                CERROR("lustre_mgs: operation %d on unconnected MGS\n",
+                       req->rq_reqmsg->opc);
+                req->rq_status = -ENOTCONN;
+                GOTO(out, rc = -ENOTCONN);
+        }
+
+        /* The DEBUG_REQ messages below carry no newline, matching the
+           other call sites in this switch. */
+        switch (req->rq_reqmsg->opc) {
+        case MGS_CONNECT:
+                DEBUG_REQ(D_MGS, req, "connect");
+                rc = target_handle_connect(req, mgs_handle);
+                if (!rc && (req->rq_reqmsg->conn_cnt > 1))
+                        /* Make clients trying to reconnect after a MGS restart
+                           happy; also requires obd_replayable */
+                        lustre_msg_add_op_flags(req->rq_repmsg,
+                                                MSG_CONNECT_RECONNECT);
+                break;
+        case MGS_DISCONNECT:
+                DEBUG_REQ(D_MGS, req, "disconnect");
+                rc = target_handle_disconnect(req);
+                req->rq_status = rc;            /* superfluous? */
+                break;
+        case MGS_TARGET_REG:
+                DEBUG_REQ(D_MGS, req, "target add");
+                rc = mgs_handle_target_reg(req);
+                break;
+        case MGS_TARGET_DEL:
+                DEBUG_REQ(D_MGS, req, "target del");
+                /* NOTE(review): target deletion is not implemented yet; the
+                   request is acknowledged with rc == 0 and nothing is done. */
+                //rc = mgs_handle_target_del(req);
+                break;
+
+        case LDLM_ENQUEUE:
+                DEBUG_REQ(D_MGS, req, "enqueue");
+                rc = ldlm_handle_enqueue(req, ldlm_server_completion_ast,
+                                         ldlm_server_blocking_ast, NULL);
+                fail = OBD_FAIL_LDLM_REPLY;
+                break;
+        case LDLM_BL_CALLBACK:
+        case LDLM_CP_CALLBACK:
+                DEBUG_REQ(D_MGS, req, "callback");
+                CERROR("callbacks should not happen on MGS\n");
+                LBUG();
+                break;
+
+        case OBD_PING:
+                DEBUG_REQ(D_INFO, req, "ping");
+                rc = target_handle_ping(req);
+                break;
+        case OBD_LOG_CANCEL:
+                DEBUG_REQ(D_MGS, req, "log cancel");
+                rc = -ENOTSUPP;  /* la la la */
+                break;
+
+        case LLOG_ORIGIN_HANDLE_CREATE:
+                DEBUG_REQ(D_MGS, req, "llog_init");
+                rc = llog_origin_handle_create(req);
+                break;
+        case LLOG_ORIGIN_HANDLE_NEXT_BLOCK:
+                DEBUG_REQ(D_MGS, req, "llog next block");
+                rc = llog_origin_handle_next_block(req);
+                break;
+        case LLOG_ORIGIN_HANDLE_READ_HEADER:
+                DEBUG_REQ(D_MGS, req, "llog read header");
+                rc = llog_origin_handle_read_header(req);
+                break;
+        case LLOG_ORIGIN_HANDLE_CLOSE:
+                DEBUG_REQ(D_MGS, req, "llog close");
+                rc = llog_origin_handle_close(req);
+                break;
+        case LLOG_CATINFO:
+                DEBUG_REQ(D_MGS, req, "llog catinfo");
+                rc = llog_catinfo(req);
+                break;
+        default:
+                req->rq_status = -ENOTSUPP;
+                rc = ptlrpc_error(req);
+                RETURN(rc);
+        }
+
+        LASSERT(current->journal_info == NULL);
+
+        CDEBUG(D_CONFIG | (rc?D_ERROR:0), "MGS handle cmd=%d rc=%d\n",
+               req->rq_reqmsg->opc, rc);
+
+ out:
+        target_send_reply(req, rc, fail);
+        RETURN(0);
+}
+
+/* Export destructor: only the generic target cleanup is needed. */
+static inline int mgs_destroy_export(struct obd_export *exp)
+{
+        ENTRY;
+
+        target_destroy_export(exp);
+
+        RETURN(0);
+}
+
+/* from mdt_iocontrol */
+/*
+ * ioctl entry point for the MGS device.
+ *
+ * OBD_IOC_PARAM     copy a lustre_cfg from user space, apply it with
+ *                   mgs_setparam() and bounce the config lock of the
+ *                   affected filesystem so that everyone re-reads the logs.
+ * OBD_IOC_DUMP_LOG  dump the config llog named in ioc_inlbuf1.
+ * OBD_IOC_LLOG_*    forwarded to llog_ioctl().
+ *
+ * Returns 0 on success or a negative errno; unknown commands give -EINVAL.
+ */
+int mgs_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
+                  void *karg, void *uarg)
+{
+        struct obd_device *obd = exp->exp_obd;
+        struct obd_ioctl_data *data = karg;
+        struct lvfs_run_ctxt saved;
+        int rc = 0;
+
+        ENTRY;
+        CDEBUG(D_IOCTL, "handling ioctl cmd %#x\n", cmd);
+
+        switch (cmd) {
+
+        case OBD_IOC_PARAM: {
+                struct lustre_handle lockh;
+                struct lustre_cfg *lcfg;
+                char fsname[32], *devname;
+                int lockrc;
+
+                CERROR("MGS param\n");
+
+                if (data->ioc_type != LUSTRE_CFG_TYPE) {
+                        CERROR("unknown cfg record type:%d \n", data->ioc_type);
+                        RETURN(-EINVAL);
+                }
+
+                OBD_ALLOC(lcfg, data->ioc_plen1);
+                if (lcfg == NULL)
+                        RETURN(-ENOMEM);
+                /* copy_from_user() returns the number of bytes it could
+                   not copy, which is not an errno */
+                if (copy_from_user(lcfg, data->ioc_pbuf1, data->ioc_plen1))
+                        GOTO(out_free, rc = -EFAULT);
+
+                /* The buffer comes straight from user space: validate its
+                   layout before any lustre_cfg accessor walks it */
+                rc = lustre_cfg_sanity_check(lcfg, data->ioc_plen1);
+                if (rc)
+                        GOTO(out_free, rc);
+
+                if (lcfg->lcfg_bufcount < 1)
+                        GOTO(out_free, rc = -EINVAL);
+
+                /* Extract fsname: everything before the first '-' of the
+                   device name, or the whole name when there is no '-' */
+                memset(fsname, 0, sizeof(fsname));
+                devname = lustre_cfg_string(lcfg, 0);
+                if (devname) {
+                        char *ptr = strchr(devname, '-');
+                        int fslen = ptr ? (int)(ptr - devname) :
+                                          (int)strlen(devname);
+
+                        /* keep room for the terminating NUL */
+                        if (fslen >= (int)sizeof(fsname)) {
+                                CERROR("fsname in %s is too long\n", devname);
+                                GOTO(out_free, rc = -EINVAL);
+                        }
+                        memcpy(fsname, devname, fslen);
+                        CDEBUG(D_MGS, "set param on fs %s device %s\n",
+                               fsname, devname);
+                } else {
+                        CDEBUG(D_MGS, "set global param\n");
+                }
+
+                rc = mgs_setparam(obd, fsname, lcfg);
+                if (rc) {
+                        CERROR("setparam err %d\n", rc);
+                        GOTO(out_free, rc);
+                }
+
+                /* Revoke lock so everyone updates.  Should be alright if
+                   someone was already reading while we were updating the logs,
+                   so we don't really need to hold the lock while we're
+                   writing (above). */
+                /* fsname is an array, so test its contents rather than its
+                   address: a global param has no per-fs lock to revoke */
+                if (fsname[0] != '\0') {
+                        lockrc = mgs_get_cfg_lock(obd, fsname, &lockh);
+                        if (lockrc != ELDLM_OK)
+                                CERROR("lock error %d for fs %s\n", lockrc,
+                                       fsname);
+                        else
+                                mgs_put_cfg_lock(&lockh);
+                }
+out_free:
+                OBD_FREE(lcfg, data->ioc_plen1);
+                RETURN(rc);
+        }
+
+        case OBD_IOC_DUMP_LOG: {
+                struct llog_ctxt *ctxt =
+                        llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT);
+                push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+                rc = class_config_dump_llog(ctxt, data->ioc_inlbuf1, NULL);
+                pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+
+                RETURN(rc);
+        }
+
+        case OBD_IOC_LLOG_CHECK:
+        case OBD_IOC_LLOG_INFO:
+        case OBD_IOC_LLOG_PRINT: {
+                struct llog_ctxt *ctxt =
+                        llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT);
+
+                push_ctxt(&saved, &ctxt->loc_exp->exp_obd->obd_lvfs_ctxt, NULL);
+                rc = llog_ioctl(ctxt, cmd, data);
+                pop_ctxt(&saved, &ctxt->loc_exp->exp_obd->obd_lvfs_ctxt, NULL);
+
+                RETURN(rc);
+        }
+
+        default:
+                CDEBUG(D_INFO, "unknown command %x\n", cmd);
+                RETURN(-EINVAL);
+        }
+        RETURN(0);
+}
+
+/* use obd ops to offer management infrastructure */
+static struct obd_ops mgs_obd_ops = {
+        .o_owner           = THIS_MODULE,
+        .o_connect         = mgs_connect,
+        .o_disconnect      = mgs_disconnect,
+        .o_setup           = mgs_setup,
+        .o_precleanup      = mgs_precleanup,
+        .o_cleanup         = mgs_cleanup,
+        .o_destroy_export  = mgs_destroy_export,
+        .o_iocontrol       = mgs_iocontrol,
+};
+
+/* Module init: register the MGS obd type.  A registration failure is
+   reported to the module loader instead of being dropped. */
+static int __init mgs_init(void)
+{
+        struct lprocfs_static_vars lvars;
+
+        lprocfs_init_vars(mgs, &lvars);
+        return class_register_type(&mgs_obd_ops, lvars.module_vars,
+                                   LUSTRE_MGS_NAME);
+}
+
+/* Module exit: unregister the MGS obd type. */
+static void /*__exit*/ mgs_exit(void)
+{
+        class_unregister_type(LUSTRE_MGS_NAME);
+}
+
+MODULE_AUTHOR("Cluster File Systems, Inc. ");
+MODULE_DESCRIPTION("Lustre Management Server (MGS)");
+MODULE_LICENSE("GPL");
+
+module_init(mgs_init);
+module_exit(mgs_exit);
diff --git a/lustre/mgs/mgs_internal.h b/lustre/mgs/mgs_internal.h
new file mode 100644
index 0000000..688055c
--- /dev/null
+++ b/lustre/mgs/mgs_internal.h
@@ -0,0 +1,49 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+
+#ifndef _MGS_INTERNAL_H
+#define _MGS_INTERNAL_H
+
+#ifdef __KERNEL__
+# include
+#endif
+#include
+#include
+#include
+#include
+#include
+#include
+
+
+/* MDS has o_t * 1000 */
+#define MGS_SERVICE_WATCHDOG_TIMEOUT (obd_timeout * 10)
+
+/* mgs_llog.c */
+#define FSDB_EMPTY 0x0001
+
+struct fs_db {
+        char              fsdb_name[8];
+        struct list_head  fsdb_list;
+        struct semaphore  fsdb_sem;
+        void*             fsdb_ost_index_map;
+        void*             fsdb_mdt_index_map;
+        __u32             fsdb_flags;
+        __u32             fsdb_gen;
+};
+
+int mgs_init_fsdb_list(struct obd_device *obd);
+int mgs_cleanup_fsdb_list(struct obd_device *obd);
+int mgs_check_index(struct obd_device *obd, struct mgs_target_info *mti);
+int mgs_check_failnid(struct obd_device *obd, struct mgs_target_info *mti);
+int mgs_write_log_target(struct obd_device *obd, struct mgs_target_info *mti);
+int mgs_upgrade_sv_14(struct obd_device *obd, struct mgs_target_info *mti);
+int mgs_erase_logs(struct obd_device *obd, char *fsname);
+int mgs_setparam(struct obd_device *obd, char *fsname, struct lustre_cfg *lcfg);
+
+/* mgs_fs.c */
+int mgs_fs_setup(struct obd_device *obd, struct vfsmount *mnt);
+int mgs_fs_cleanup(struct obd_device *obddev);
+
+
+#endif
diff --git a/lustre/mgs/mgs_llog.c b/lustre/mgs/mgs_llog.c
new file mode 100644
index 0000000..3face3f
--- /dev/null
+++ b/lustre/mgs/mgs_llog.c
@@ -0,0 +1,1654 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  lustre/mgs/mgs_llog.c
+ *  Lustre Management Server (mgs) config llog creation
+ *
+ *  Copyright (C) 2006 Cluster File Systems, Inc.
+ *   Author: Nathan Rutman
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef EXPORT_SYMTAB
+#define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_MGS
+#define D_MGS D_CONFIG/*|D_WARNING*/
+
+#ifdef __KERNEL__
+#include
+#include
+#include
+#endif
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "mgs_internal.h"
+
+/********************** Class fns ********************/
+
+/*
+ * Read directory @dir on mount @inmnt and collect its entries on
+ * @dentry_list via l_readdir().  The list head is initialised before
+ * anything can fail, so the caller may walk it even on an error return.
+ * Returns 0 on success or a negative errno.
+ */
+static int class_dentry_readdir(struct obd_device *obd, struct dentry *dir,
+                                struct vfsmount *inmnt,
+                                struct list_head *dentry_list){
+        /* see mds_cleanup_pending */
+        struct lvfs_run_ctxt saved;
+        struct file *file;
+        struct dentry *dentry;
+        struct vfsmount *mnt;
+        int rc = 0;
+        ENTRY;
+
+        INIT_LIST_HEAD(dentry_list);
+
+        push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+        dentry = dget(dir);
+        if (IS_ERR(dentry))
+                GOTO(out_pop, rc = PTR_ERR(dentry));
+        mnt = mntget(inmnt);
+        if (IS_ERR(mnt)) {
+                l_dput(dentry);
+                GOTO(out_pop, rc = PTR_ERR(mnt));
+        }
+
+        file = dentry_open(dentry, mnt, O_RDONLY);
+        if (IS_ERR(file))
+                /* dentry_open_it() drops the dentry, mnt refs */
+                GOTO(out_pop, rc = PTR_ERR(file));
+
+        rc = l_readdir(file, dentry_list);
+        filp_close(file, 0);
+        /*  filp_close->fput() drops the dentry, mnt refs */
+
+out_pop:
+        pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+        RETURN(rc);
+}
+
+/******************** DB functions *********************/ + +/* from the (client) config log, figure out: + 1. which ost's/mdt's are configured (by index) + 2. what the last config step is +*/ +/* FIXME is it better to have a separate db file, instead of parsing the info + out of the client log? */ +static int mgs_fsdb_handler(struct llog_handle *llh, struct llog_rec_hdr *rec, + void *data) +{ + struct fs_db *fsdb = (struct fs_db *)data; + int cfg_len = rec->lrh_len; + char *cfg_buf = (char*) (rec + 1); + struct lustre_cfg *lcfg; + __u32 index; + int rc = 0; + ENTRY; + + if (rec->lrh_type != OBD_CFG_REC) { + CERROR("unhandled lrh_type: %#x\n", rec->lrh_type); + RETURN(-EINVAL); + } + + rc = lustre_cfg_sanity_check(cfg_buf, cfg_len); + if (rc) { + CERROR("Insane cfg\n"); + RETURN(rc); + } + + lcfg = (struct lustre_cfg *)cfg_buf; + + CDEBUG(D_INFO, "cmd %x %s %s\n", lcfg->lcfg_command, + lustre_cfg_string(lcfg, 0), lustre_cfg_string(lcfg, 1)); + + /* Figure out ost indicies */ + /* lov_modify_tgts add 0:lov1 1:ost1_UUID 2(index):0 3(gen):1 */ + if (lcfg->lcfg_command == LCFG_LOV_ADD_OBD || + lcfg->lcfg_command == LCFG_LOV_DEL_OBD) { + index = simple_strtoul(lustre_cfg_string(lcfg, 2), + NULL, 10); + CDEBUG(D_MGS, "OST index for %s is %u (%s)\n", + lustre_cfg_string(lcfg, 1), index, + lustre_cfg_string(lcfg, 2)); + set_bit(index, fsdb->fsdb_ost_index_map); + } + + /* Figure out mdt indicies */ + /* attach 0:MDC_uml1_mdsA_MNT_client 1:mdc 2:1d834_MNT_client_03f */ + if ((lcfg->lcfg_command == LCFG_ATTACH) && + (strcmp(lustre_cfg_string(lcfg, 1), LUSTRE_MDC_NAME) == 0)) { + rc = server_name2index(lustre_cfg_string(lcfg, 0), + &index, NULL); + if (rc != LDD_F_SV_TYPE_MDT) { + CWARN("Unparsable MDC name %s, assuming index 0\n", + lustre_cfg_string(lcfg, 0)); + index = 0; + } + rc = 0; + CDEBUG(D_MGS, "MDT index is %u\n", index); + set_bit(index, fsdb->fsdb_mdt_index_map); + } + + /* Keep track of the latest marker step */ + if (lcfg->lcfg_command == LCFG_MARKER) { + 
struct cfg_marker *marker; + marker = lustre_cfg_buf(lcfg, 1); + fsdb->fsdb_gen = max(fsdb->fsdb_gen, marker->cm_step); + } + + RETURN(rc); +} + +static int mgs_get_fsdb_from_llog(struct obd_device *obd, char *logname, + struct fs_db *fsdb) +{ + struct llog_handle *loghandle; + struct lvfs_run_ctxt saved; + int rc, rc2; + ENTRY; + + down(&fsdb->fsdb_sem); + push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + + rc = llog_create(llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT), + &loghandle, NULL, logname); + if (rc) + GOTO(out_pop, rc); + + rc = llog_init_handle(loghandle, LLOG_F_IS_PLAIN, NULL); + if (rc) + GOTO(out_close, rc); + + if (llog_get_size(loghandle) <= 1) + fsdb->fsdb_flags |= FSDB_EMPTY; + + rc = llog_process(loghandle, mgs_fsdb_handler, (void *)fsdb, NULL); + CDEBUG(D_MGS, "get_db = %d\n", rc); +out_close: + rc2 = llog_close(loghandle); + if (!rc) + rc = rc2; + +out_pop: + pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + up(&fsdb->fsdb_sem); + + RETURN(rc); +} + +static int next_index(void *index_map, int map_len) +{ + int i; + for (i = 0; i < map_len * 8; i++) + if (!test_bit(i, index_map)) { + return i; + } + CERROR("max index %d exceeded.\n", i); + return -1; +} + +#if 0 +static int count_osts(void *index_map, int map_len) +{ + int i, num; + for (i = 0, num = 0; i < map_len * 8; i++) + if (test_bit(i, index_map)) + num++; + return num; +} +#endif + +static struct fs_db *mgs_find_fsdb(struct obd_device *obd, char *fsname) +{ + struct mgs_obd *mgs = &obd->u.mgs; + struct fs_db *fsdb; + struct list_head *tmp; + + list_for_each(tmp, &mgs->mgs_fs_db_list) { + fsdb = list_entry(tmp, struct fs_db, fsdb_list); + if (strcmp(fsdb->fsdb_name, fsname) == 0) + return fsdb; + } + return NULL; +} + +#define INDEX_MAP_SIZE 4096 + +/* caller must hold the mgs->mgs_fs_db_lock */ +static struct fs_db *mgs_new_fsdb(struct obd_device *obd, char *fsname) +{ + struct mgs_obd *mgs = &obd->u.mgs; + struct fs_db *fsdb; + ENTRY; + + OBD_ALLOC_PTR(fsdb); + if (!fsdb) + RETURN(NULL); + + 
OBD_ALLOC(fsdb->fsdb_ost_index_map, INDEX_MAP_SIZE); + OBD_ALLOC(fsdb->fsdb_mdt_index_map, INDEX_MAP_SIZE); + if (!fsdb->fsdb_ost_index_map || !fsdb->fsdb_mdt_index_map) { + CERROR("No memory for index maps\n"); + GOTO(err, 0); + } + + strncpy(fsdb->fsdb_name, fsname, sizeof(fsdb->fsdb_name)); + sema_init(&fsdb->fsdb_sem, 1); + list_add(&fsdb->fsdb_list, &mgs->mgs_fs_db_list); + + RETURN(fsdb); +err: + if (fsdb->fsdb_ost_index_map) + OBD_FREE(fsdb->fsdb_ost_index_map, INDEX_MAP_SIZE); + if (fsdb->fsdb_mdt_index_map) + OBD_FREE(fsdb->fsdb_mdt_index_map, INDEX_MAP_SIZE); + OBD_FREE_PTR(fsdb); + RETURN(NULL); +} + +static void mgs_free_fsdb(struct fs_db *fsdb) +{ + /* wait for anyone with the sem */ + down(&fsdb->fsdb_sem); + list_del(&fsdb->fsdb_list); + OBD_FREE(fsdb->fsdb_ost_index_map, INDEX_MAP_SIZE); + OBD_FREE(fsdb->fsdb_mdt_index_map, INDEX_MAP_SIZE); + OBD_FREE_PTR(fsdb); +} + +int mgs_init_fsdb_list(struct obd_device *obd) +{ + struct mgs_obd *mgs = &obd->u.mgs; + INIT_LIST_HEAD(&mgs->mgs_fs_db_list); + return 0; +} + +int mgs_cleanup_fsdb_list(struct obd_device *obd) +{ + struct mgs_obd *mgs = &obd->u.mgs; + struct fs_db *fsdb; + struct list_head *tmp, *tmp2; + down(&mgs->mgs_sem); + list_for_each_safe(tmp, tmp2, &mgs->mgs_fs_db_list) { + fsdb = list_entry(tmp, struct fs_db, fsdb_list); + mgs_free_fsdb(fsdb); + } + up(&mgs->mgs_sem); + return 0; +} + +static inline int name_create(char *prefix, char *suffix, char **newname) +{ + LASSERT(newname); + OBD_ALLOC(*newname, strlen(prefix) + strlen(suffix) + 1); + if (!*newname) + return -ENOMEM; + sprintf(*newname, "%s%s", prefix, suffix); + return 0; +} + +static inline void name_destroy(char *name) +{ + if (name) + OBD_FREE(name, strlen(name) + 1); +} + + +static int mgs_find_or_make_fsdb(struct obd_device *obd, char *name, + struct fs_db **dbh) +{ + struct mgs_obd *mgs = &obd->u.mgs; + struct fs_db *fsdb; + char *cliname; + int rc = 0; + + down(&mgs->mgs_sem); + fsdb = mgs_find_fsdb(obd, name); + if (fsdb) { + 
up(&mgs->mgs_sem); + *dbh = fsdb; + return 0; + } + + CDEBUG(D_MGS, "Creating new db\n"); + fsdb = mgs_new_fsdb(obd, name); + up(&mgs->mgs_sem); + if (!fsdb) + return -ENOMEM; + + /* populate the db from the client llog */ + name_create(name, "-client", &cliname); + rc = mgs_get_fsdb_from_llog(obd, cliname, fsdb); + name_destroy(cliname); + if (rc) { + CERROR("Can't get db from llog %d\n", rc); + mgs_free_fsdb(fsdb); + return rc; + } + + *dbh = fsdb; + + return 0; +} + +/* 1 = index in use + 0 = index unused + -1= empty client log */ +int mgs_check_index(struct obd_device *obd, struct mgs_target_info *mti) +{ + struct fs_db *fsdb; + void *imap; + int rc = 0; + ENTRY; + + LASSERT(!(mti->mti_flags & LDD_F_NEED_INDEX)); + + rc = mgs_find_or_make_fsdb(obd, mti->mti_fsname, &fsdb); + if (rc) { + CERROR("Can't get db for %s\n", mti->mti_fsname); + RETURN(rc); + } + + if (fsdb->fsdb_flags & FSDB_EMPTY) + RETURN(-1); + + if (mti->mti_flags & LDD_F_SV_TYPE_OST) + imap = fsdb->fsdb_ost_index_map; + else if (mti->mti_flags & LDD_F_SV_TYPE_MDT) + imap = fsdb->fsdb_mdt_index_map; + else + RETURN(-EINVAL); + + if (test_bit(mti->mti_stripe_index, imap)) + RETURN(1); + RETURN(0); +} + + +int mgs_set_index(struct obd_device *obd, struct mgs_target_info *mti) +{ + struct fs_db *fsdb; + void *imap; + int rc = 0; + ENTRY; + + rc = mgs_find_or_make_fsdb(obd, mti->mti_fsname, &fsdb); + if (rc) { + CERROR("Can't get db for %s\n", mti->mti_fsname); + RETURN(rc); + } + + if (mti->mti_flags & LDD_F_SV_TYPE_OST) + imap = fsdb->fsdb_ost_index_map; + else if (mti->mti_flags & LDD_F_SV_TYPE_MDT) + imap = fsdb->fsdb_mdt_index_map; + else + RETURN(-EINVAL); + + if (mti->mti_flags & LDD_F_NEED_INDEX) { + rc = next_index(imap, INDEX_MAP_SIZE); + if (rc == -1) + RETURN(-ERANGE); + mti->mti_stripe_index = rc; + } + + /* Remove after CMD */ + if ((mti->mti_flags & LDD_F_SV_TYPE_MDT) && + (mti->mti_stripe_index > 0)) { + LCONSOLE_ERROR("MDT index must = 0 (until Clustered MetaData " + "feature is 
ready.)\n"); + mti->mti_stripe_index = 0; + } + + if (mti->mti_stripe_index >= INDEX_MAP_SIZE * 8) { + LCONSOLE_ERROR("Server %s requested index %d, but the" + "max index is %d.\n", + mti->mti_svname, mti->mti_stripe_index, + INDEX_MAP_SIZE * 8); + RETURN(-ERANGE); + } + + if (test_bit(mti->mti_stripe_index, imap)) { + if (mti->mti_flags & LDD_F_VIRGIN) { + LCONSOLE_ERROR("Server %s requested index %d, but that " + "index is already in use\n", + mti->mti_svname, mti->mti_stripe_index); + RETURN(-EADDRINUSE); + } else { + CERROR("Server %s updating index %d\n", + mti->mti_svname, mti->mti_stripe_index); + RETURN(EALREADY); + } + } + + set_bit(mti->mti_stripe_index, imap); + fsdb->fsdb_flags &= ~FSDB_EMPTY; + server_make_name(mti->mti_flags, mti->mti_stripe_index, + mti->mti_fsname, mti->mti_svname); + + CDEBUG(D_MGS, "Set index for %s to %d\n", mti->mti_svname, + mti->mti_stripe_index); + + RETURN(0); +} + +/******************** config log recording functions *********************/ + +static int record_lcfg(struct obd_device *obd, struct llog_handle *llh, + struct lustre_cfg *lcfg) +{ + struct lvfs_run_ctxt saved; + struct llog_rec_hdr rec; + int buflen, rc; + + LASSERT(llh); + LASSERT(llh->lgh_ctxt); + + buflen = lustre_cfg_len(lcfg->lcfg_bufcount, + lcfg->lcfg_buflens); + rec.lrh_len = llog_data_len(buflen); + rec.lrh_type = OBD_CFG_REC; + + push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + /* idx = -1 means append */ + rc = llog_write_rec(llh, &rec, NULL, 0, (void *)lcfg, -1); + pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + if (rc) { + CERROR("failed %d\n", rc); + } + LASSERT(!rc); + return rc; +} + +static int record_base(struct obd_device *obd, struct llog_handle *llh, + char *cfgname, lnet_nid_t nid, int cmd, + char *s1, char *s2, char *s3, char *s4) +{ + struct lustre_cfg_bufs bufs; + struct lustre_cfg *lcfg; + int rc; + + CDEBUG(D_MGS, "lcfg %s %#x %s %s %s %s\n", cfgname, + cmd, s1, s2, s3, s4); + + lustre_cfg_bufs_reset(&bufs, cfgname); + if (s1) + 
lustre_cfg_bufs_set_string(&bufs, 1, s1); + if (s2) + lustre_cfg_bufs_set_string(&bufs, 2, s2); + if (s3) + lustre_cfg_bufs_set_string(&bufs, 3, s3); + if (s4) + lustre_cfg_bufs_set_string(&bufs, 4, s4); + + lcfg = lustre_cfg_new(cmd, &bufs); + lcfg->lcfg_nid = nid; + + rc = record_lcfg(obd, llh, lcfg); + + lustre_cfg_free(lcfg); + + if (rc) { + CERROR("error %d: lcfg %s %#x %s %s %s %s\n", rc, cfgname, + cmd, s1, s2, s3, s4); + } + return(rc); +} + + +static inline int record_add_uuid(struct obd_device *obd, + struct llog_handle *llh, + uint64_t nid, char *uuid) +{ + return record_base(obd,llh,NULL,nid,LCFG_ADD_UUID,uuid,0,0,0); + +} + +static inline int record_add_conn(struct obd_device *obd, + struct llog_handle *llh, + char *devname, + char *uuid) +{ + return record_base(obd,llh,devname,0,LCFG_ADD_CONN,uuid,0,0,0); +} + +static inline int record_attach(struct obd_device *obd, struct llog_handle *llh, + char *devname, char *type, char *uuid) +{ + return record_base(obd,llh,devname,0,LCFG_ATTACH,type,uuid,0,0); +} + +static inline int record_setup(struct obd_device *obd, struct llog_handle *llh, + char *devname, + char *s1, char *s2, char *s3, char *s4) +{ + return record_base(obd,llh,devname,0,LCFG_SETUP,s1,s2,s3,s4); +} + +static int record_lov_setup(struct obd_device *obd, struct llog_handle *llh, + char *devname, struct lov_desc *desc) +{ + struct lustre_cfg_bufs bufs; + struct lustre_cfg *lcfg; + int rc; + + lustre_cfg_bufs_reset(&bufs, devname); + lustre_cfg_bufs_set(&bufs, 1, desc, sizeof(*desc)); + lcfg = lustre_cfg_new(LCFG_SETUP, &bufs); + + rc = record_lcfg(obd, llh, lcfg); + + lustre_cfg_free(lcfg); + return rc; +} + +static inline int record_lov_add(struct obd_device *obd, + struct llog_handle *llh, + char *lov_name, char *ost_uuid, + char *index, char *gen) +{ + return record_base(obd,llh,lov_name,0,LCFG_LOV_ADD_OBD, + ost_uuid,index,gen,0); +} + +static inline int record_mount_opt(struct obd_device *obd, + struct llog_handle *llh, + char *profile, 
char *lov_name, + char *mdc_name) +{ + return record_base(obd,llh,NULL,0,LCFG_MOUNTOPT, + profile,lov_name,mdc_name,0); +} + +static int record_marker(struct obd_device *obd, struct llog_handle *llh, + struct fs_db *fsdb, __u32 flags, + char *svname, char *comment) +{ + struct cfg_marker marker; + struct timeval tv; + struct lustre_cfg_bufs bufs; + struct lustre_cfg *lcfg; + int rc; + + if (flags & CM_START) + fsdb->fsdb_gen++; + marker.cm_step = fsdb->fsdb_gen; + marker.cm_flags = flags; + strncpy(marker.cm_svname, svname, sizeof(marker.cm_svname)); + strncpy(marker.cm_comment, comment, sizeof(marker.cm_comment)); + do_gettimeofday(&tv); + marker.cm_createtime = tv.tv_sec; + marker.cm_canceltime = 0; + lustre_cfg_bufs_reset(&bufs, NULL); + lustre_cfg_bufs_set(&bufs, 1, &marker, sizeof(marker)); + lcfg = lustre_cfg_new(LCFG_MARKER, &bufs); + + rc = record_lcfg(obd, llh, lcfg); + + lustre_cfg_free(lcfg); + return rc; +} + +static int record_start_log(struct obd_device *obd, + struct llog_handle **llh, char *name) +{ + static struct obd_uuid cfg_uuid = { .uuid = "config_uuid" }; + struct lvfs_run_ctxt saved; + int rc = 0; + + if (*llh) { + GOTO(out, rc = -EBUSY); + } + + push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + + rc = llog_create(llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT), + llh, NULL, name); + if (rc == 0) + llog_init_handle(*llh, LLOG_F_IS_PLAIN, &cfg_uuid); + else + *llh = NULL; + + pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + +out: + if (rc) { + CERROR("Can't start log %s: %d\n", name, rc); + } + RETURN(rc); +} + +static int record_end_log(struct obd_device *obd, struct llog_handle **llh) +{ + struct lvfs_run_ctxt saved; + int rc = 0; + + push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + + rc = llog_close(*llh); + *llh = NULL; + + pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + RETURN(rc); +} + +static int mgs_log_is_empty(struct obd_device *obd, char *name) +{ + struct lvfs_run_ctxt saved; + struct llog_handle *llh; + int rc = 0; + + /* FIXME cache the 
empty state in the db */ + + push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + rc = llog_create(llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT), + &llh, NULL, name); + if (rc == 0) { + llog_init_handle(llh, LLOG_F_IS_PLAIN, NULL); + rc = llog_get_size(llh); + llog_close(llh); + } + pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + /* header is record 1 */ + return(rc <= 1); +} + +/******************** config "macros" *********************/ + +/* write an lcfg directly into a log (with markers) */ +static int mgs_write_log_direct(struct obd_device *obd, struct fs_db *fsdb, + char *logname, char *obdname, + struct lustre_cfg *lcfg) +{ + struct llog_handle *llh = NULL; + int rc; + ENTRY; + + rc = record_start_log(obd, &llh, logname); + rc = record_marker(obd, llh, fsdb, CM_START, obdname, "param"); + + rc = record_lcfg(obd, llh, lcfg); + + rc = record_marker(obd, llh, fsdb, CM_END, obdname, "param"); + rc = record_end_log(obd, &llh); + + RETURN(rc); +} + +/* write the lcfg in all logs for the given fs */ +int mgs_write_log_direct_all(struct obd_device *obd, struct fs_db *fsdb, + char *fsname, struct lustre_cfg *lcfg) +{ + struct mgs_obd *mgs = &obd->u.mgs; + struct list_head dentry_list; + struct l_linux_dirent *dirent, *n; + char *logname; + int rc, len = strlen(fsname); + ENTRY; + + /* We need to set params for any future logs + as well. FIXME Append this file to every new log. 
*/ + name_create(fsname, "-params", &logname); + if (mgs_log_is_empty(obd, logname)) { + struct llog_handle *llh = NULL; + rc = record_start_log(obd, &llh, logname); + rc = record_end_log(obd, &llh); + } + name_destroy(logname); + + /* Find all the logs in the CONFIGS directory */ + rc = class_dentry_readdir(obd, mgs->mgs_configs_dir, + mgs->mgs_vfsmnt, &dentry_list); + if (rc) { + CERROR("Can't read %s dir\n", MOUNT_CONFIGS_DIR); + RETURN(rc); + } + + /* Could use fsdb index maps instead of directory listing */ + list_for_each_entry_safe(dirent, n, &dentry_list, lld_list) { + list_del(&dirent->lld_list); + if (strncmp(fsname, dirent->lld_name, len) == 0) { + CDEBUG(D_MGS, "Changing log %s\n", dirent->lld_name); + rc = mgs_write_log_direct(obd, fsdb, dirent->lld_name, + dirent->lld_name, lcfg); + } + OBD_FREE(dirent, sizeof(*dirent)); + } + + RETURN(rc); +} + +/* lov is the first thing in the mdt and client logs */ +static int mgs_write_log_lov(struct obd_device *obd, struct fs_db *fsdb, + struct mgs_target_info *mti, + char *logname, char *lovname) +{ + struct llog_handle *llh = NULL; + struct lov_desc *lovdesc; + char *uuid; + int rc = 0; + ENTRY; + + CDEBUG(D_MGS, "Writing log %s\n", logname); + + /* + #01 L attach 0:lov_mdsA 1:lov 2:71ccb_lov_mdsA_19f961a9e1 + #02 L lov_setup 0:lov_mdsA 1:(struct lov_desc) + uuid=lov1_UUID, stripe count=1, size=1048576, offset=0, pattern=0 + */ + + /* FIXME just make lov_setup accept empty desc (put uuid in buf 2) */ + OBD_ALLOC(lovdesc, sizeof(*lovdesc)); + if (lovdesc == NULL) + RETURN(-ENOMEM); + lovdesc->ld_magic = LOV_DESC_MAGIC; + lovdesc->ld_tgt_count = 0; + /* Defaults. Can be changed later by lcfg config_param */ + lovdesc->ld_default_stripe_count = 1; + lovdesc->ld_pattern = LOV_PATTERN_RAID0; + lovdesc->ld_default_stripe_size = 1024 * 1024; + lovdesc->ld_default_stripe_offset = 0; + sprintf((char*)lovdesc->ld_uuid.uuid, "%s_UUID", lovname); + /* can these be the same? 
*/ + uuid = (char *)lovdesc->ld_uuid.uuid; + + /* This should always be the first entry in a log. + rc = mgs_clear_log(obd, logname); */ + rc = record_start_log(obd, &llh, logname); + rc = record_marker(obd, llh, fsdb, CM_START, lovname, "lov setup"); + rc = record_attach(obd, llh, lovname, "lov", uuid); + rc = record_lov_setup(obd, llh, lovname, lovdesc); + rc = record_marker(obd, llh, fsdb, CM_END, lovname, "lov setup"); + rc = record_end_log(obd, &llh); + + OBD_FREE(lovdesc, sizeof(*lovdesc)); + RETURN(rc); +} + +/* add failnids to open log */ +static int mgs_write_log_failnids(struct obd_device *obd, + struct mgs_target_info *mti, + struct llog_handle *llh, + char *cliname) +{ + char *failnodeuuid = NULL; + char *ptr = mti->mti_params; + lnet_nid_t nid; + int rc = 0; + + /* + #03 L add_uuid nid=uml1@tcp(0x20000c0a80201) nal=90 0: 1:uml1_UUID + #04 L add_uuid nid=1@elan(0x1000000000001) nal=90 0: 1:uml1_UUID + #05 L setup 0:OSC_uml1_ost1_mdsA 1:ost1_UUID 2:uml1_UUID + #06 L add_uuid nid=uml2@tcp(0x20000c0a80202) nal=90 0: 1:uml2_UUID + #0x L add_uuid nid=2@elan(0x1000000000002) nal=90 0: 1:uml2_UUID + #07 L add_conn 0:OSC_uml1_ost1_mdsA 1:uml2_UUID + */ + + /* Pull failnid info out of params string */ + while (class_find_param(ptr, PARAM_FAILNODE, &ptr) == 0) { + while (class_parse_nid(ptr, &nid, &ptr) == 0) { + if (failnodeuuid == NULL) { + /* We don't know the failover node name, + so just use the first nid as the uuid */ + rc = name_create(libcfs_nid2str(nid), "", + &failnodeuuid); + if (rc) + return rc; + } + CDEBUG(D_MGS, "add nid %s for failover uuid %s, " + "client %s\n", libcfs_nid2str(nid), + failnodeuuid, cliname); + rc = record_add_uuid(obd, llh, nid, failnodeuuid); + } + if (failnodeuuid) { + rc = record_add_conn(obd, llh, cliname, failnodeuuid); + name_destroy(failnodeuuid); + failnodeuuid = NULL; + } + } + + return rc; +} + +static int mgs_write_log_mdt(struct obd_device *obd, struct fs_db *fsdb, + struct mgs_target_info *mti) +{ + struct 
llog_handle *llh = NULL; + char *cliname, *mdcname, *lovname, *nodeuuid, *mdcuuid; + int rc, i, first_log = 0; + ENTRY; + + CDEBUG(D_MGS, "writing new mdt %s\n", mti->mti_svname); + + /* COMPAT_146 */ + if (mti->mti_flags & LDD_F_UPGRADE14) { + char *ptr, oldname[sizeof(mti->mti_uuid)]; + /* We're starting with an old uuid. Assume old name for lov + as well since the lov entry already exists in the log. */ + CERROR("old mds uuid %s\n", mti->mti_uuid); + strcpy(oldname, mti->mti_uuid); + ptr = strstr(oldname, "_UUID"); + if (!ptr) { + CERROR("Can't get old MDT name from %s\n", + mti->mti_uuid); + RETURN(-EINVAL); + } + *ptr = '\0'; + name_create("lov_", oldname, &lovname); + CERROR("lov name: %s\n", lovname); + } else { + /* Make up our own uuid and lov name */ + snprintf(mti->mti_uuid, sizeof(mti->mti_uuid), + "%s_UUID", mti->mti_svname); + name_create(mti->mti_fsname, "-mdtlov", &lovname); + } + + /* Append mdt info to mdt log */ + if (mgs_log_is_empty(obd, mti->mti_svname)) { + /* This is the first time for all logs for this fs, + since any ost should have already started the mdt log. */ + first_log++; + rc = mgs_write_log_lov(obd, fsdb, mti, mti->mti_svname, + lovname); + } + /* else there's already some ost entries in the mdt log. */ + + /* We added the lov, maybe some osc's, now for the mdt. + We might add more ost's after this. Note that during the parsing + of this log, this is when the mdt will start. (This was not + formerly part of the old mds log, it was directly executed by + lconf.) 
*/ + /* + #09 L mount_option 0: 1:mdsA 2:lov_mdsA + attach mds mdsA mdsA_UUID + setup /dev/loop2 ldiskfs mdsA errors=remount-ro,user_xattr + */ + rc = record_start_log(obd, &llh, mti->mti_svname); + rc = record_marker(obd, llh, fsdb, CM_START, mti->mti_svname,"add mdt"); + rc = record_mount_opt(obd, llh, mti->mti_svname, lovname, 0); + rc = record_attach(obd, llh, mti->mti_svname, LUSTRE_MDS_NAME, + mti->mti_uuid); + rc = record_setup(obd, llh, mti->mti_svname, + "dev"/*ignored*/, "type"/*ignored*/, + mti->mti_svname, 0/*options*/); + rc = record_marker(obd, llh, fsdb, CM_END, mti->mti_svname, "add mdt"); + rc = record_end_log(obd, &llh); + + /* Append the mdt info to the client log */ + name_create(mti->mti_fsname, "-client", &cliname); + name_destroy(lovname); + name_create(mti->mti_fsname, "-clilov", &lovname); + if (first_log || + /* If we're upgrading, the MDT log will exist but not the client. */ + ((mti->mti_flags & LDD_F_UPGRADE14) && + mgs_log_is_empty(obd, cliname))) { + /* Start client log */ + rc = mgs_write_log_lov(obd, fsdb, mti, cliname, lovname); + } + + name_create(libcfs_nid2str(mti->mti_nids[0]), /*"_UUID"*/"", &nodeuuid); + name_create(mti->mti_svname, "-mdc", &mdcname); + name_create(mdcname, "_UUID", &mdcuuid); + /* + #09 L add_uuid nid=uml1@tcp(0x20000c0a80201) 0: 1:uml1_UUID + #10 L attach 0:MDC_uml1_mdsA_MNT_client 1:mdc 2:1d834_MNT_client_03f + #11 L setup 0:MDC_uml1_mdsA_MNT_client 1:mdsA_UUID 2:uml1_UUID + #12 L add_uuid nid=uml2@tcp(0x20000c0a80202) 0: 1:uml2_UUID + #13 L add_conn 0:MDC_uml1_mdsA_MNT_client 1:uml2_UUID + #14 L mount_option 0: 1:client 2:lov1 3:MDC_uml1_mdsA_MNT_client + */ + rc = record_start_log(obd, &llh, cliname); + rc = record_marker(obd, llh, fsdb, CM_START, mti->mti_svname,"add mdc"); + for (i = 0; i < mti->mti_nid_count; i++) { + CDEBUG(D_MGS, "add nid %s\n", libcfs_nid2str(mti->mti_nids[i])); + rc = record_add_uuid(obd, llh, mti->mti_nids[i], nodeuuid); + } + rc = record_attach(obd, llh, mdcname, 
LUSTRE_MDC_NAME, mdcuuid);
+        rc = record_setup(obd, llh, mdcname, mti->mti_uuid,nodeuuid, 0, 0);
+        rc = mgs_write_log_failnids(obd, mti, llh, mdcname);
+        rc = record_mount_opt(obd, llh, cliname, lovname, mdcname);
+        rc = record_marker(obd, llh, fsdb, CM_END, mti->mti_svname, "add mdc");
+        rc = record_end_log(obd, &llh);
+
+        name_destroy(mdcuuid);
+        name_destroy(mdcname);
+        name_destroy(nodeuuid);
+        name_destroy(cliname);
+        name_destroy(lovname);
+        RETURN(rc);
+}
+
+/* Add the ost info to the client/mdt lov.
+ *
+ * Appends one "add osc" section for mti->mti_svname to the log @logname:
+ * an add_uuid record per nid, attach + setup of the "<svname>-osc" device,
+ * any failover nids, and finally the lov_add that hooks the target into
+ * @lovname.  If @logname is still empty the lov record is written first.
+ * @flags is or'ed into both the start and the end marker.
+ *
+ * Returns the result of the final record_end_log(); the results of the
+ * earlier record_* calls are overwritten.
+ */
+static int mgs_write_log_osc(struct obd_device *obd, struct fs_db *fsdb,
+                             struct mgs_target_info *mti,
+                             char *logname, char *lovname, int flags)
+{
+        struct llog_handle *llh = NULL;
+        char *nodeuuid, *oscname, *oscuuid, *lovuuid;
+        /* Decimal text of the stripe index.  Sized for any 32-bit value:
+           a 5-byte buffer made snprintf() truncate indexes >= 10000 and a
+           wrong index was then recorded in the lov. */
+        char index[16];
+        int i, rc;
+
+        if (mgs_log_is_empty(obd, logname)) {
+                /* The first item in the log must be the lov, so we have
+                   somewhere to add our osc. */
+                rc = mgs_write_log_lov(obd, fsdb, mti, logname, lovname);
+        }
+
+        CDEBUG(D_MGS, "adding osc for %s to log %s\n",
+               mti->mti_svname, logname);
+
+        name_create(libcfs_nid2str(mti->mti_nids[0]), "", &nodeuuid);
+        name_create(mti->mti_svname, "-osc", &oscname);
+        name_create(oscname, "_UUID", &oscuuid);
+        name_create(lovname, "_UUID", &lovuuid);
+
+        /*
+        #03 L add_uuid nid=uml1@tcp(0x20000c0a80201) 0: 1:uml1_UUID
+        multihomed (#4)
+        #04 L add_uuid nid=1@elan(0x1000000000001) nal=90 0: 1:uml1_UUID
+        #04 L attach 0:OSC_uml1_ost1_MNT_client 1:osc 2:89070_lov1_a41dff51a
+        #05 L setup 0:OSC_uml1_ost1_MNT_client 1:ost1_UUID 2:uml1_UUID
+        failover (#6,7)
+        #06 L add_uuid nid=uml2@tcp(0x20000c0a80202) 0: 1:uml2_UUID
+        #07 L add_conn 0:OSC_uml1_ost1_MNT_client 1:uml2_UUID
+        #08 L lov_modify_tgts add 0:lov1 1:ost1_UUID 2(index):0 3(gen):1
+        */
+        rc = record_start_log(obd, &llh, logname);
+        rc = record_marker(obd, llh, fsdb, CM_START | flags, mti->mti_svname,
+                           "add osc");
+        for (i = 0; i < mti->mti_nid_count; i++) {
+                CDEBUG(D_MGS, "add nid %s\n", libcfs_nid2str(mti->mti_nids[i]));
+                rc = record_add_uuid(obd, llh, mti->mti_nids[i], nodeuuid);
+        }
+        rc = record_attach(obd, llh, oscname, LUSTRE_OSC_NAME, lovuuid);
+        rc = record_setup(obd, llh, oscname, mti->mti_uuid, nodeuuid, 0, 0);
+        rc = mgs_write_log_failnids(obd, mti, llh, oscname);
+        snprintf(index, sizeof(index), "%d", mti->mti_stripe_index);
+        rc = record_lov_add(obd, llh, lovname, mti->mti_uuid, index, "1");
+        rc = record_marker(obd, llh, fsdb, CM_END | flags, mti->mti_svname,
+                           "add osc");
+        rc = record_end_log(obd, &llh);
+
+        name_destroy(lovuuid);
+        name_destroy(oscuuid);
+        name_destroy(oscname);
+        name_destroy(nodeuuid);
+        return rc;
+}
+
+static int mgs_write_log_ost(struct obd_device *obd, struct fs_db *fsdb,
+                             struct mgs_target_info *mti)
+{
+        struct llog_handle *llh = NULL;
+        char *logname, *lovname;
+        int rc, flags = 0;
+        ENTRY;
+
+        CDEBUG(D_MGS, "writing new ost %s\n", mti->mti_svname);
+
+        /* The ost startup log */
+
+        /* If the ost log already exists, that means that someone reformatted
+           the ost and it called target_add again.
+           FIXME check and warn here, maybe inc config ver #? Or abort,
+           and claim there's already a server with that name? Maybe need
+           another flag to say it's okay to rewrite.
+           Heck, what do we do about the client and mds logs? We better
+           abort. */
+        if (!mgs_log_is_empty(obd, mti->mti_svname)) {
+                LCONSOLE_ERROR("The config log for %s already exists, yet the "
+                               "server claims it never registered. It may have"
+                               " been reformatted, or the index changed.
Use " + " tunefs.lustre --writeconf to regenerate " + " all logs.\n", mti->mti_svname); + return -EALREADY; + } + /* + attach obdfilter ost1 ost1_UUID + setup /dev/loop2 ldiskfs f|n errors=remount-ro,user_xattr + */ + rc = record_start_log(obd, &llh, mti->mti_svname); + rc = record_marker(obd, llh, fsdb, CM_START, mti->mti_svname,"add ost"); + if (*mti->mti_uuid == '\0') + snprintf(mti->mti_uuid, sizeof(mti->mti_uuid), + "%s_UUID", mti->mti_svname); + rc = record_attach(obd, llh, mti->mti_svname, + "obdfilter"/*LUSTRE_OST_NAME*/, mti->mti_uuid); + rc = record_setup(obd,llh,mti->mti_svname, + "dev"/*ignored*/,"type"/*ignored*/, + "f", 0/*options*/); + rc = record_marker(obd, llh, fsdb, CM_END, mti->mti_svname, "add ost"); + rc = record_end_log(obd, &llh); + + /* We also have to update the other logs where this osc is part of + the lov */ + + /* Append ost info to mdt log */ + if (mti->mti_flags & LDD_F_UPGRADE14) + /* If we're upgrading, the old mdt log already has our + entry. Let's do a fake one for fun. */ + flags = CM_SKIP | CM_UPGRADE146; + /* FIXME add to all mdt logs for CMD */ + // FIXME need real mdt name -- but MDT may not have registered yet! + name_create(mti->mti_fsname, "-MDT0000", &logname); + name_create(mti->mti_fsname, "-mdtlov", &lovname); + mgs_write_log_osc(obd, fsdb, mti, logname, lovname, flags); + name_destroy(lovname); + name_destroy(logname); + + /* Append ost info to the client log */ + name_create(mti->mti_fsname, "-client", &logname); + name_create(mti->mti_fsname, "-clilov", &lovname); + mgs_write_log_osc(obd, fsdb, mti, logname, lovname, 0); + name_destroy(lovname); + name_destroy(logname); + + RETURN(rc); +} + +/* Add additional failnids to an existing log. + The mdc/osc must have been added to logs first */ +/* tcp nids must be in dotted-quad ascii - + we can't resolve hostnames from the kernel. 
*/
+static int mgs_write_log_add_failnid(struct obd_device *obd, struct fs_db *fsdb,
+                                     struct mgs_target_info *mti)
+{
+        char *logname, *cliname;
+        struct llog_handle *llh = NULL;
+        int rc;
+        ENTRY;
+
+        /* Verify that we know about this target */
+        if (mgs_log_is_empty(obd, mti->mti_svname)) {
+                LCONSOLE_ERROR("The target %s has not registered yet. "
+                               "It must be started before failnids can "
+                               "be added.\n", mti->mti_svname);
+                RETURN(-ENOENT);
+        }
+
+        /* Create mdc/osc client name (e.g. lustre-OST0001-osc) */
+        if (mti->mti_flags & LDD_F_SV_TYPE_MDT) {
+                name_create(mti->mti_svname, "-mdc", &cliname);
+        } else if (mti->mti_flags & LDD_F_SV_TYPE_OST) {
+                name_create(mti->mti_svname, "-osc", &cliname);
+        } else {
+                RETURN(-EINVAL);
+        }
+
+        /* Add failover nids to client log */
+        name_create(mti->mti_fsname, "-client", &logname);
+        rc = record_start_log(obd, &llh, logname);
+        if (rc == 0) {
+                /* Only record into a log that was actually started; the
+                   start result used to be overwritten by the next call. */
+                rc = record_marker(obd, llh, fsdb, CM_START, mti->mti_svname,
+                                   "add failnid");
+                rc = mgs_write_log_failnids(obd, mti, llh, cliname);
+                rc = record_marker(obd, llh, fsdb, CM_END, mti->mti_svname,
+                                   "add failnid");
+                rc = record_end_log(obd, &llh);
+        }
+        name_destroy(logname);
+
+        /* Do not touch the MDT log if the client log could not be written,
+           so that the client log error is the one reported. */
+        if (rc == 0 && (mti->mti_flags & LDD_F_SV_TYPE_OST)) {
+                /* Add OST failover nids to the MDT log as well */
+                name_create(mti->mti_fsname, "-MDT0000", &logname);
+                rc = record_start_log(obd, &llh, logname);
+                if (rc == 0) {
+                        rc = record_marker(obd, llh, fsdb, CM_START,
+                                           mti->mti_svname, "add failnid");
+                        rc = mgs_write_log_failnids(obd, mti, llh, cliname);
+                        rc = record_marker(obd, llh, fsdb, CM_END,
+                                           mti->mti_svname, "add failnid");
+                        rc = record_end_log(obd, &llh);
+                }
+                name_destroy(logname);
+        }
+
+        name_destroy(cliname);
+        RETURN(rc);
+}
+
+/* Walk the blank-separated parameter list in mti->mti_params and write the
+ * config-log records each recognised parameter calls for:
+ *   PARAM_MGSNODE        - nothing to record here
+ *   PARAM_FAILNODE       - failover nids, only when MTI_F_IOCTL is set
+ *   PARAM_OBD_TIMEOUT    - LCFG_SET_TIMEOUT via mgs_write_log_direct_all()
+ *   PARAM_DEFAULT_STRIPE - LCFG_PARAM on the mdt lov, then the client lov
+ * Unrecognised parameters are warned about and skipped.
+ * Processing stops at the first parameter that fails; its error code is
+ * returned, otherwise 0.
+ */
+static int mgs_write_log_params(struct obd_device *obd, struct fs_db *fsdb,
+                                struct mgs_target_info *mti)
+{
+        struct lustre_cfg_bufs bufs;
+        struct lustre_cfg *lcfg;
+        char *ptr = mti->mti_params;
+        char *endptr;
+        char *end = mti->mti_params + sizeof(mti->mti_params);
+        int rc = 0, len;
+        ENTRY;
+
+        if (!mti->mti_params)
+                RETURN(0);
+
+        while (ptr < end) {
+                /* Check the bound before dereferencing */
+                while (ptr < end && *ptr == ' ')
+                        ptr++;
+                if (ptr >= end || *ptr == '\0')
+                        break;
+                /* Token length, never scanning past the params buffer */
+                for (len = 0; ptr + len < end; len++)
+                        if (ptr[len] == ' ' || ptr[len] == '\0')
+                                break;
+                CDEBUG(D_MGS, "next param '%.*s'\n", len, ptr);
+
+                if (class_match_param(ptr, PARAM_MGSNODE, &endptr) == 0)
+                        GOTO(end_while, rc);
+
+                if (class_match_param(ptr, PARAM_FAILNODE, &endptr) == 0) {
+                        /* Add a failover nidlist */
+                        rc = 0;
+                        /* We already processed failovers params for new
+                           targets in mgs_write_log_target */
+                        if (mti->mti_flags & MTI_F_IOCTL) {
+                                CDEBUG(D_MGS, "Adding failnode\n");
+                                rc = mgs_write_log_add_failnid(obd, fsdb, mti);
+                        }
+                        GOTO(end_while, rc);
+                }
+
+                if (class_match_param(ptr, PARAM_OBD_TIMEOUT, &endptr) == 0) {
+                        /* Change obd timeout */
+                        int timeout;
+                        timeout = simple_strtoul(endptr, &endptr, 0);
+
+                        CDEBUG(D_MGS, "obd timeout %d\n", timeout);
+                        lustre_cfg_bufs_reset(&bufs, NULL);
+                        lcfg = lustre_cfg_new(LCFG_SET_TIMEOUT, &bufs);
+                        /* NOTE(review): assumes lustre_cfg_new() returns
+                           NULL when allocation fails -- confirm */
+                        if (lcfg == NULL)
+                                GOTO(end_while, rc = -ENOMEM);
+                        lcfg->lcfg_num = timeout;
+                        /* modify all servers and clients */
+                        rc = mgs_write_log_direct_all(obd, fsdb, mti->mti_fsname,
+                                                      lcfg);
+                        lustre_cfg_free(lcfg);
+                        GOTO(end_while, rc);
+                }
+
+                if (class_match_param(ptr, PARAM_DEFAULT_STRIPE, &endptr) == 0){
+                        /* Change lov default stripe params */
+                        char *lovname, *logname;
+                        CDEBUG(D_MGS, "lov param %s\n", ptr);
+                        if (!(mti->mti_flags & LDD_F_SV_TYPE_MDT)) {
+                                LCONSOLE_ERROR("Default stripe params must be "
+                                               "set on the MDT, not %s. "
+                                               "Ignoring.\n",
+                                               mti->mti_svname);
+                                GOTO(end_while, rc = 0);
+                        }
+
+                        /* Modify mdtlov */
+                        name_create(mti->mti_fsname, "-mdtlov", &lovname);
+                        if (mgs_log_is_empty(obd, mti->mti_svname)) {
+                                name_destroy(lovname);
+                                GOTO(end_while, rc = -ENODEV);
+                        }
+                        lustre_cfg_bufs_reset(&bufs, lovname);
+                        lustre_cfg_bufs_set(&bufs, 1, ptr, len);
+                        lcfg = lustre_cfg_new(LCFG_PARAM, &bufs);
+                        if (lcfg == NULL) {
+                                name_destroy(lovname);
+                                GOTO(end_while, rc = -ENOMEM);
+                        }
+                        rc = mgs_write_log_direct(obd, fsdb, mti->mti_svname,
+                                                  lovname, lcfg);
+                        lustre_cfg_free(lcfg);
+                        name_destroy(lovname);
+                        if (rc)
+                                GOTO(end_while, rc);
+
+                        /* Modify clilov */
+                        name_create(mti->mti_fsname, "-client", &logname);
+                        name_create(mti->mti_fsname, "-clilov", &lovname);
+                        lustre_cfg_bufs_reset(&bufs, lovname);
+                        lustre_cfg_bufs_set(&bufs, 1, ptr, len);
+                        lcfg = lustre_cfg_new(LCFG_PARAM, &bufs);
+                        if (lcfg == NULL) {
+                                name_destroy(lovname);
+                                name_destroy(logname);
+                                GOTO(end_while, rc = -ENOMEM);
+                        }
+                        rc = mgs_write_log_direct(obd, fsdb, logname,
+                                                  lovname, lcfg);
+                        lustre_cfg_free(lcfg);
+                        name_destroy(lovname);
+                        name_destroy(logname);
+                        GOTO(end_while, rc);
+                }
+
+                LCONSOLE_WARN("Ignoring unrecognized param '%.*s'\n", len, ptr);
+
+end_while:
+                if (rc) {
+                        CERROR("err %d on param '%.*s'\n", rc, len, ptr);
+                        break;
+                }
+                ptr += len;
+        }
+
+        RETURN(rc);
+}
+
+int mgs_check_failnid(struct obd_device *obd, struct mgs_target_info *mti)
+{
+        /* Not implementing automatic failover nid addition at this time. */
+        return 0;
+#if 0
+        struct fs_db *fsdb;
+        int rc;
+        ENTRY;
+
+        rc = mgs_find_or_make_fsdb(obd, fsname, &fsdb);
+        if (rc)
+                RETURN(rc);
+
+        if (mgs_log_is_empty(obd, mti->mti_svname))
+                /* should never happen */
+                RETURN(-ENOENT);
+
+        CDEBUG(D_MGS, "Checking for new failnids for %s\n", mti->mti_svname);
+        // FIXME check logs
+        /* FIXME we need a real database lookup. Create on-disk db of known
+           size, lookup by index */
+        /* Check each nid, or check only nid0 and add all if nid0 is missing?
+           What if someone adds a net to a node? Better check everything. */
+        /* if nid 0 is missing, mgs_write_log_add_failnid.
+           if just one nid is missing, add uuid for nodeuuid[nid0]).
+        */
+        /* Hey, we can just check mti->params to see if we're already in
+           the failover list */
+
+        down(&fsdb->fsdb_sem);
+        rc = mgs_write_log_add_failnid(obd, fsdb, mti);
+        up(&fsdb->fsdb_sem);
+
+        RETURN(rc);
+#endif
+}
+
+/* Write the config logs for a newly registering target.
+ *
+ * Sets/checks the target index, looks up (or creates) the fs database for
+ * mti->mti_fsname and then, holding fsdb_sem, writes the MDT or OST logs
+ * according to mti->mti_flags followed by any parameters in the mti.
+ *
+ * Returns 0 on success, -EALREADY when mgs_set_index() reports EALREADY
+ * (updates are not implemented), -EINVAL for a target that is neither MDT
+ * nor OST, or the error of the step that failed.
+ */
+int mgs_write_log_target(struct obd_device *obd,
+                         struct mgs_target_info *mti)
+{
+        struct fs_db *fsdb;
+        int rc = -EINVAL;
+        ENTRY;
+
+        /* set/check the new target index */
+        rc = mgs_set_index(obd, mti);
+        if (rc < 0) {
+                CERROR("Can't get index (%d)\n", rc);
+                RETURN(rc);
+        }
+        if (rc == EALREADY) {
+                // FIXME mark old log sections as invalid, add new.
+                CERROR("updates not yet implemented\n");
+                RETURN(-EALREADY);
+        }
+
+        rc = mgs_find_or_make_fsdb(obd, mti->mti_fsname, &fsdb);
+        if (rc) {
+                CERROR("Can't get db for %s\n", mti->mti_fsname);
+                RETURN(rc);
+        }
+
+        down(&fsdb->fsdb_sem);
+
+        if (mti->mti_flags & LDD_F_SV_TYPE_MDT) {
+                rc = mgs_write_log_mdt(obd, fsdb, mti);
+        } else if (mti->mti_flags & LDD_F_SV_TYPE_OST) {
+                rc = mgs_write_log_ost(obd, fsdb, mti);
+        } else {
+                CERROR("Unknown target type %#x, can't create log for %s\n",
+                       mti->mti_flags, mti->mti_svname);
+                /* rc is 0 here from the fsdb lookup above; without this
+                   the failure was reported as success and the params
+                   were still written. */
+                rc = -EINVAL;
+        }
+        if (rc) {
+                CERROR("Can't write logs for %s (%d)\n", mti->mti_svname, rc);
+                GOTO(out_up, rc);
+        }
+
+        rc = mgs_write_log_params(obd, fsdb, mti);
+
+out_up:
+        up(&fsdb->fsdb_sem);
+        RETURN(rc);
+}
+
+
+/* COMPAT_146 */
+/***************** upgrade pre-mountconf logs to mountconf *****************/
+
+#if 0
+int mgs_upgrade_logs_14(struct obd_device *obd, struct fs_db *fsdb,
+                        struct mgs_target_info *mti)
+{
+        int rc = 0;
+        ENTRY;
+
+        CDEBUG(D_MGS, "Upgrading old logs for %s\n", mti->mti_fsname);
+
+        /* If we get here, we know:
+           the client log fsname-client exists
+           the logs have not been updated
+           so
+           1. parse the old client log (client log name?) to find out UUIDs for
+           all servers
+           2. regen all ost logs: servers will get new
+           name based on index, but will keep their old uuids.
+           3. append mdt startup to the end of the mdt log
+           4.
append marker to old client log signifying we did the upgrade + ? translate mds/client logs to new names? + 2 UP mdt MDS MDS_uuid 3 + 3 UP lov lov_mdsA 47d06_lov_mdsA_61f31f85bc 4 + 4 UP osc OSC_uml1_ost1_mdsA 47d06_lov_mdsA_61f31f85bc 4 + 5 UP osc OSC_uml1_ost2_mdsA 47d06_lov_mdsA_61f31f85bc 4 + 6 UP mds lustre-MDT0000 mdsA_UUID 3 + to + ? update server uuids? + */ + + + /* old mdt log: + old osc's were part of old lov, + mount opt connects mdt to lov + so need to use old lov name. + old client logs starts old mdc and lov, + so need to use old lov,mdc names in mount opt - but new client name + *-client instead of just "client" */ + + + if ((mti->mti_flags & LDD_F_SV_TYPE_MDT)) { + CDEBUG(D_MGS, "Upgrade MDT\n"); + if (mgs_log_is_empty(obd, mti->mti_svname)) { + CERROR("The MDT log %s is missing.\n", mti->mti_svname); + RETURN(-ENOENT); + } + /* Append the MDT startup sequence to the old log + (lconf used to start the MDT directly) */ + rc = mgs_write_log_mdt(obd, fsdb, mti); + if (rc) + RETURN(rc); + + /* this would be for trying to update an old client log */ + struct llog_handle *llh = NULL; + char *cliname; + CDEBUG(D_MGS, "Upgrade client\n"); + + name_create(mti->mti_fsname, "-client", &cliname); + + rc = record_start_log(obd, &llh, cliname); + rc = record_marker(obd, llh, fsdb, CM_START, "client", + "upgrade from 1.4"); + /* FIXME find the old lovname and mdcname from old log */ + /* old: mount_option 0: 1:client 2:lov1 3:MDC_uml1_mdsA_MNT_client */ + /* new: mount_option 0: 1:lustre-client 2:lustre-clilov 3:lustre-MDT0000-mdc */ + rc = record_mount_opt(obd, llh, cliname, "lov1", + "MDC_uml1_mdsA_MNT_client"); + rc = record_marker(obd, llh, fsdb, CM_END, "client", + "upgrade to 1.6"); + rc = record_end_log(obd, &llh); + name_destroy(cliname); + } + + if ((mti->mti_flags & LDD_F_SV_TYPE_OST)) { + CDEBUG(D_MGS, "Upgrade OST\n"); + /* A regular new ost log, but don't update client or MDT logs */ + rc = mgs_write_log_ost(obd, fsdb, mti); + } + + RETURN(rc); +} 
+#endif + +/* first connect of upgraded servers */ +int mgs_upgrade_sv_14(struct obd_device *obd, struct mgs_target_info *mti) +{ + struct fs_db *fsdb; + int rc = 0; + ENTRY; + + /* Create client and ost log normally, as servers register. + That way logs are modern (except have old uuids (from last_rcvd)) + - Old clients can continue to use upgraded OSTs + - New clients will only start with upgraded OSTs + - MDT won't know about old OSTs, only upgraded, so we need the old + MDT log in order for old clients to work. (Old clients connect to + the MDT, not the MGS, for their logs, and will therefore receive + the old client log from the MDT /LOGS dir.) */ + + CDEBUG(D_MGS, "upgrading server %s from pre-1.6\n", + mti->mti_svname); + server_mti_print("upgrade", mti); + + rc = mgs_find_or_make_fsdb(obd, mti->mti_fsname, &fsdb); + if (rc) + RETURN(rc); + + if (fsdb->fsdb_flags & FSDB_EMPTY) + /* First server to upgrade sees this */ + CWARN("info: missing client log\n"); + + if (!(fsdb->fsdb_flags & FSDB_EMPTY) && (fsdb->fsdb_gen == 0)) { + /* There were no markers in the client log, meaning we have + not updated the logs for this fs */ + CWARN("info: found old, unupdated client log\n"); + } + + if ((mti->mti_flags & LDD_F_SV_TYPE_MDT) && + mgs_log_is_empty(obd, mti->mti_svname)) { + LCONSOLE_ERROR("The old MDT log %s is missing. 
Was "
+                               "tunefs.lustre successful?\n",
+                               mti->mti_svname);
+                RETURN(-ENOENT);
+        }
+
+        /* FIXME Old MDT log already has an old mount opt
+           which we should drop */
+        rc = mgs_write_log_target(obd, mti);
+        RETURN(rc);
+}
+/* end COMPAT_146 */
+
+static int mgs_clear_log(struct obd_device *obd, char *name)
+{
+        struct lvfs_run_ctxt saved;
+        struct llog_handle *llh;
+        int rc = 0;
+
+        push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+        rc = llog_create(llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT),
+                         &llh, NULL, name);
+        if (rc == 0) {
+                llog_init_handle(llh, LLOG_F_IS_PLAIN, NULL);
+                rc = llog_destroy(llh);
+                llog_free_handle(llh);
+        }
+        pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+
+        if (rc)
+                CERROR("failed to clear log %s: %d\n", name, rc);
+
+        return(rc);
+}
+
+/* Erase all logs for the given fs.
+ *
+ * Reads the CONFIGS directory, drops the in-memory fs database for
+ * @fsname (under mgs_sem) and clears every log that belongs to @fsname.
+ * Clearing is best effort: a log that cannot be cleared is reported by
+ * mgs_clear_log() and the remaining logs are still processed.
+ * Returns the result of reading the directory.
+ */
+int mgs_erase_logs(struct obd_device *obd, char *fsname)
+{
+        struct mgs_obd *mgs = &obd->u.mgs;
+        /* Plain local: nothing here needs to survive the call, and a
+           static would be shared between concurrent callers. */
+        struct fs_db *fsdb;
+        struct list_head dentry_list;
+        struct l_linux_dirent *dirent, *n;
+        int rc, len = strlen(fsname);
+        ENTRY;
+
+        /* Find all the logs in the CONFIGS directory */
+        rc = class_dentry_readdir(obd, mgs->mgs_configs_dir,
+                                  mgs->mgs_vfsmnt, &dentry_list);
+        if (rc) {
+                CERROR("Can't read %s dir\n", MOUNT_CONFIGS_DIR);
+                RETURN(rc);
+        }
+
+        /* Delete the fs db */
+        down(&mgs->mgs_sem);
+        fsdb = mgs_find_fsdb(obd, fsname);
+        if (fsdb)
+                mgs_free_fsdb(fsdb);
+        up(&mgs->mgs_sem);
+
+        list_for_each_entry_safe(dirent, n, &dentry_list, lld_list) {
+                list_del(&dirent->lld_list);
+                /* The name must be fsname itself or "<fsname>-...", which
+                   is how the logs in this file are named; a bare prefix
+                   match would also erase e.g. "lustre2-client" when asked
+                   to erase "lustre".
+                   NOTE(review): confirm no log uses another separator. */
+                if (strncmp(fsname, dirent->lld_name, len) == 0 &&
+                    (dirent->lld_name[len] == '-' ||
+                     dirent->lld_name[len] == '\0')) {
+                        CDEBUG(D_MGS, "Removing log %s\n", dirent->lld_name);
+                        mgs_clear_log(obd, dirent->lld_name);
+                }
+                OBD_FREE(dirent, sizeof(*dirent));
+        }
+
+        RETURN(rc);
+}
+
+/* from llog_swab */
+static void print_lustre_cfg(struct lustre_cfg *lcfg)
+{
+        int i;
+        ENTRY;
+
+        CDEBUG(D_MGS, "lustre_cfg: %p\n", lcfg);
+        CDEBUG(D_MGS, "\tlcfg->lcfg_version: %#x\n", lcfg->lcfg_version);
+
+        CDEBUG(D_MGS, "\tlcfg->lcfg_command: %#x\n",
lcfg->lcfg_command);
+        CDEBUG(D_MGS, "\tlcfg->lcfg_num: %#x\n", lcfg->lcfg_num);
+        CDEBUG(D_MGS, "\tlcfg->lcfg_flags: %#x\n", lcfg->lcfg_flags);
+        CDEBUG(D_MGS, "\tlcfg->lcfg_nid: %s\n", libcfs_nid2str(lcfg->lcfg_nid));
+
+        CDEBUG(D_MGS, "\tlcfg->lcfg_bufcount: %d\n", lcfg->lcfg_bufcount);
+        if (lcfg->lcfg_bufcount < LUSTRE_CFG_MAX_BUFCOUNT)
+                for (i = 0; i < lcfg->lcfg_bufcount; i++) {
+                        CDEBUG(D_MGS, "\tlcfg->lcfg_buflens[%d]: %d %s\n",
+                               i, lcfg->lcfg_buflens[i],
+                               lustre_cfg_string(lcfg, i));
+                }
+        EXIT;
+}
+
+/* Set a permanent (config log) param for a target or fs.
+ *
+ * Buffer 0 of @lcfg names the device, buffer 1 carries the parameter
+ * string.  A temporary mgs_target_info is filled in from them and handed
+ * to mgs_write_log_params() under fsdb_sem.
+ *
+ * Returns 0 on success, -ENOSYS when no device is named, -EINVAL when the
+ * filesystem has no targets or no parameter was supplied, -ENAMETOOLONG
+ * when a string does not fit the mti, -ENOMEM, or the error from
+ * server_name2index() / mgs_write_log_params().
+ */
+int mgs_setparam(struct obd_device *obd, char *fsname, struct lustre_cfg *lcfg)
+{
+        struct fs_db *fsdb;
+        struct mgs_target_info *mti;
+        char *devname, *param;
+        int rc = 0;
+        ENTRY;
+
+        print_lustre_cfg(lcfg);
+
+        /* lustre, lustre-mdtlov, lustre-client, lustre-MDT0000 */
+        devname = lustre_cfg_string(lcfg, 0);
+
+        if (devname == NULL) {
+                /* Global setting across all fs's? */
+                LCONSOLE_ERROR("Global settings not implemented yet!\n");
+                RETURN(-ENOSYS);
+        }
+
+        CDEBUG(D_MGS, "target: %s\n", devname);
+
+        /* The parameter buffer may be absent; it was passed to strncpy()
+           unchecked. */
+        param = lustre_cfg_string(lcfg, 1);
+        if (param == NULL) {
+                CERROR("No parameter given for %s\n", devname);
+                RETURN(-EINVAL);
+        }
+
+        /* Everything below is copied into fixed-size mti fields, so refuse
+           anything that would overflow or lose its terminating NUL.
+           (sizeof does not evaluate mti, which is not allocated yet.) */
+        if (strlen(fsname) >= sizeof(mti->mti_fsname) ||
+            strlen(devname) >= sizeof(mti->mti_svname) ||
+            strlen(param) >= sizeof(mti->mti_params)) {
+                CERROR("Name or parameter too long for %s\n", devname);
+                RETURN(-ENAMETOOLONG);
+        }
+
+        rc = mgs_find_or_make_fsdb(obd, fsname, &fsdb);
+        if (rc)
+                RETURN(rc);
+        if (fsdb->fsdb_flags & FSDB_EMPTY) {
+                CERROR("No filesystem targets for %s\n", fsname);
+                RETURN(-EINVAL);
+        }
+
+        /* Create a fake mti to hold everything */
+        OBD_ALLOC_PTR(mti);
+        if (!mti)
+                /* Nothing to free yet: do not go through "out" */
+                RETURN(-ENOMEM);
+        strcpy(mti->mti_fsname, fsname);
+        strcpy(mti->mti_svname, devname);
+        rc = server_name2index(devname, &mti->mti_stripe_index, NULL);
+        if (rc < 0)
+                GOTO(out, rc);
+        mti->mti_flags = rc | MTI_F_IOCTL;
+        /* Length was validated above, so this is NUL-terminated */
+        strcpy(mti->mti_params, param);
+
+        down(&fsdb->fsdb_sem);
+        rc = mgs_write_log_params(obd, fsdb, mti);
+        up(&fsdb->fsdb_sem);
+
+out:
+        OBD_FREE_PTR(mti);
+        RETURN(rc);
+}
+
+
+#if 0
+/******************** unused *********************/
+static int mgs_backup_llog(struct obd_device *obd, char* fsname)
+{
+        struct file *filp, *bak_filp;
+        struct lvfs_run_ctxt
saved; + char *logname, *buf; + loff_t soff = 0 , doff = 0; + int count = 4096, len; + int rc = 0; + + OBD_ALLOC(logname, PATH_MAX); + if (logname == NULL) + return -ENOMEM; + + OBD_ALLOC(buf, count); + if (!buf) + GOTO(out , rc = -ENOMEM); + + len = snprintf(logname, PATH_MAX, "%s/%s.bak", + MOUNT_CONFIGS_DIR, fsname); + + if (len >= PATH_MAX - 1) { + GOTO(out, -ENAMETOOLONG); + } + + push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + + bak_filp = l_filp_open(logname, O_RDWR|O_CREAT|O_TRUNC, 0660); + if (IS_ERR(bak_filp)) { + rc = PTR_ERR(bak_filp); + CERROR("backup logfile open %s: %d\n", logname, rc); + GOTO(pop, rc); + } + sprintf(logname, "%s/%s", MOUNT_CONFIGS_DIR, fsname); + filp = l_filp_open(logname, O_RDONLY, 0); + if (IS_ERR(filp)) { + rc = PTR_ERR(filp); + CERROR("logfile open %s: %d\n", logname, rc); + GOTO(close1f, rc); + } + + while ((rc = lustre_fread(filp, buf, count, &soff)) > 0) { + rc = lustre_fwrite(bak_filp, buf, count, &doff); + break; + } + + filp_close(filp, 0); +close1f: + filp_close(bak_filp, 0); +pop: + pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); +out: + if (buf) + OBD_FREE(buf, count); + OBD_FREE(logname, PATH_MAX); + return rc; +} + + + +#endif diff --git a/lustre/obdclass/Makefile.in b/lustre/obdclass/Makefile.in index ff70e59..a33afd2 100644 --- a/lustre/obdclass/Makefile.in +++ b/lustre/obdclass/Makefile.in @@ -23,7 +23,7 @@ obdclass-all-objs := llog.o llog_cat.o llog_lvfs.o llog_obd.o llog_swab.o obdclass-all-objs += class_obd.o obdclass-all-objs += debug.o genops.o uuid.o llog_ioctl.o obdclass-all-objs += lprocfs_status.o lustre_handles.o lustre_peer.o -obdclass-all-objs += statfs_pack.o obdo.o obd_config.o prng.o +obdclass-all-objs += statfs_pack.o obdo.o obd_config.o obd_mount.o prng.o obdclass-objs := $(obdclass-linux-objs) $(obdclass-all-objs) diff --git a/lustre/obdclass/class_obd.c b/lustre/obdclass/class_obd.c index 180ce86..b5fea25 100644 --- a/lustre/obdclass/class_obd.c +++ b/lustre/obdclass/class_obd.c @@ -386,12 +386,14 
@@ EXPORT_SYMBOL(proc_lustre_root); EXPORT_SYMBOL(class_register_type); EXPORT_SYMBOL(class_unregister_type); +EXPORT_SYMBOL(class_search_type); EXPORT_SYMBOL(class_get_type); EXPORT_SYMBOL(class_put_type); EXPORT_SYMBOL(class_name2dev); EXPORT_SYMBOL(class_name2obd); EXPORT_SYMBOL(class_uuid2dev); EXPORT_SYMBOL(class_uuid2obd); +EXPORT_SYMBOL(class_obd_list); EXPORT_SYMBOL(class_find_client_obd); EXPORT_SYMBOL(class_find_client_notype); EXPORT_SYMBOL(class_devices_in_group); @@ -403,6 +405,7 @@ EXPORT_SYMBOL(class_conn2cliimp); EXPORT_SYMBOL(class_disconnect); /* uuid.c */ +EXPORT_SYMBOL(class_generate_random_uuid); EXPORT_SYMBOL(class_uuid_unparse); EXPORT_SYMBOL(lustre_uuid_to_peer); @@ -410,7 +413,7 @@ EXPORT_SYMBOL(class_handle_hash); EXPORT_SYMBOL(class_handle_unhash); EXPORT_SYMBOL(class_handle2object); -/* config.c */ +/* obd_config.c */ EXPORT_SYMBOL(class_incref); EXPORT_SYMBOL(class_decref); EXPORT_SYMBOL(class_get_profile); @@ -508,17 +511,18 @@ static int __init init_obdclass(void) int init_obdclass(void) #endif { + int i, err; struct obd_device *obd; - int err; - int i; - #ifdef __KERNEL__ + int lustre_register_fs(void); + printk(KERN_INFO "Lustre: OBD class driver Build Version: " BUILD_VERSION", info@clusterfs.com\n"); #else CDEBUG(D_INFO, "Lustre: OBD class driver Build Version: " BUILD_VERSION", info@clusterfs.com\n"); #endif + spin_lock_init(&obd_types_lock); spin_lock_init(&handle_lock); cfs_waitq_init(&obd_race_waitq); @@ -550,6 +554,7 @@ int init_obdclass(void) return err; #ifdef __KERNEL__ err = class_procfs_init(); + lustre_register_fs(); #endif return err; @@ -561,8 +566,11 @@ int init_obdclass(void) static void cleanup_obdclass(void) { int i; + int lustre_unregister_fs(void); ENTRY; + lustre_unregister_fs(); + cfs_psdev_deregister(&obd_psdev); for (i = 0; i < MAX_OBD_DEVICES; i++) { struct obd_device *obd = &obd_dev[i]; diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c index 361c2d4..7ed2f36 100644 --- 
a/lustre/obdclass/genops.c +++ b/lustre/obdclass/genops.c @@ -46,7 +46,7 @@ int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c); * support functions: we could use inter-module communication, but this * is more portable to other OS's */ -static struct obd_type *class_search_type(char *name) +struct obd_type *class_search_type(char *name) { struct list_head *tmp; struct obd_type *type; @@ -69,11 +69,15 @@ struct obd_type *class_get_type(char *name) #ifdef CONFIG_KMOD if (!type) { - if (!request_module(name)) { - CDEBUG(D_INFO, "Loaded module '%s'\n", name); + char *modname = name; + if (strcmp(modname, LUSTRE_MDT_NAME) == 0) + modname = LUSTRE_MDS_NAME; + if (!request_module(modname)) { + CDEBUG(D_INFO, "Loaded module '%s'\n", modname); type = class_search_type(name); - } else - CDEBUG(D_INFO, "Can't load module '%s'\n", name); + } else { + LCONSOLE_ERROR("Can't load module '%s'\n", modname); + } } #endif if (type) @@ -274,6 +278,33 @@ struct obd_device *class_uuid2obd(struct obd_uuid *uuid) return &obd_dev[dev]; } +void class_obd_list(void) +{ + char *status; + int i; + + spin_lock(&obd_dev_lock); + for (i = 0; i < MAX_OBD_DEVICES; i++) { + struct obd_device *obd = &obd_dev[i]; + if (obd->obd_type == NULL) + continue; + if (obd->obd_stopping) + status = "ST"; + else if (obd->obd_set_up) + status = "UP"; + else if (obd->obd_attached) + status = "AT"; + else + status = "--"; + LCONSOLE(D_CONFIG, "%3d %s %s %s %s %d\n", + i, status, obd->obd_type->typ_name, + obd->obd_name, obd->obd_uuid.uuid, + atomic_read(&obd->obd_refcount)); + } + spin_unlock(&obd_dev_lock); + return; +} + /* Search for a client OBD connected to tgt_uuid. If grp_uuid is specified, then only the client with that uuid is returned, otherwise any client connected to the tgt is returned. 
*/ diff --git a/lustre/obdclass/llog.c b/lustre/obdclass/llog.c index a6edbb7..2af0105 100644 --- a/lustre/obdclass/llog.c +++ b/lustre/obdclass/llog.c @@ -142,8 +142,11 @@ int llog_init_handle(struct llog_handle *handle, int flags, rc = llog_read_header(handle); if (rc == 0) { flags = llh->llh_flags; - if (uuid) - LASSERT(obd_uuid_equals(uuid, &llh->llh_tgtuuid)); + if (uuid && !obd_uuid_equals(uuid, &llh->llh_tgtuuid)) { + CERROR("uuid mismatch: %s/%s\n", (char *)uuid->uuid, + (char *)llh->llh_tgtuuid.uuid); + rc = -EEXIST; + } GOTO(out, rc); } else if (rc != LLOG_EEMPTY || !flags) { /* set a pesudo flag for initialization */ @@ -209,15 +212,19 @@ int llog_process(struct llog_handle *loghandle, llog_cb_t cb, char *buf; __u64 cur_offset = LLOG_CHUNK_SIZE; int rc = 0, index = 1, last_index; - int saved_index = 0; + int saved_index = 0, last_called_index = 0; ENTRY; + LASSERT(llh); + OBD_ALLOC(buf, LLOG_CHUNK_SIZE); if (!buf) RETURN(-ENOMEM); - if (cd != NULL) + if (cd != NULL) { + last_called_index = cd->first_idx; index = cd->first_idx + 1; + } if (cd != NULL && cd->last_idx) last_index = cd->last_idx; else @@ -285,6 +292,7 @@ int llog_process(struct llog_handle *loghandle, llog_cb_t cb, /* if set, process the callback on this record */ if (ext2_test_bit(index, llh->llh_bitmap)) { rc = cb(loghandle, rec, data); + last_called_index = index; if (rc == LLOG_PROC_BREAK) { CWARN("recovery from log: "LPX64":%x" " stopped\n", @@ -309,12 +317,22 @@ int llog_process(struct llog_handle *loghandle, llog_cb_t cb, } out: + if (cd != NULL) + cd->last_idx = last_called_index; if (buf) OBD_FREE(buf, LLOG_CHUNK_SIZE); RETURN(rc); } EXPORT_SYMBOL(llog_process); +inline int llog_get_size(struct llog_handle *loghandle) +{ + if (loghandle && loghandle->lgh_hdr) + return loghandle->lgh_hdr->llh_count; + return 0; +} +EXPORT_SYMBOL(llog_get_size); + int llog_reverse_process(struct llog_handle *loghandle, llog_cb_t cb, void *data, void *catdata) { diff --git 
a/lustre/obdclass/llog_ioctl.c b/lustre/obdclass/llog_ioctl.c index f9c1ec0..9bdea74 100644 --- a/lustre/obdclass/llog_ioctl.c +++ b/lustre/obdclass/llog_ioctl.c @@ -129,7 +129,6 @@ static int llog_check_cb(struct llog_handle *handle, struct llog_rec_hdr *rec, case MDS_UNLINK_REC: case MDS_SETATTR_REC: case OBD_CFG_REC: - case PTL_CFG_REC: /* obsolete */ case LLOG_HDR_MAGIC: { l = snprintf(out, remains, "[index]: %05d [type]: " "%02x [len]: %04d ok\n", diff --git a/lustre/obdclass/llog_lvfs.c b/lustre/obdclass/llog_lvfs.c index 6d56707..2eedc32 100644 --- a/lustre/obdclass/llog_lvfs.c +++ b/lustre/obdclass/llog_lvfs.c @@ -46,6 +46,7 @@ #include #include #include +#include #include "llog_internal.h" #if defined(__KERNEL__) && defined(LLOG_LVFS) @@ -494,7 +495,7 @@ static int llog_lvfs_prev_block(struct llog_handle *loghandle, RETURN(-EIO); } -static struct file *llog_filp_open(char *name, int flags, int mode) +static struct file *llog_filp_open(char *dir, char *name, int flags, int mode) { char *logname; struct file *filp; @@ -504,7 +505,7 @@ static struct file *llog_filp_open(char *name, int flags, int mode) if (logname == NULL) return ERR_PTR(-ENOMEM); - len = snprintf(logname, PATH_MAX, "LOGS/%s", name); + len = snprintf(logname, PATH_MAX, "%s/%s", dir, name); if (len >= PATH_MAX - 1) { filp = ERR_PTR(-ENAMETOOLONG); } else { @@ -513,7 +514,6 @@ static struct file *llog_filp_open(char *name, int flags, int mode) CERROR("logfile creation %s: %ld\n", logname, PTR_ERR(filp)); } - OBD_FREE(logname, PATH_MAX); return filp; } @@ -572,7 +572,16 @@ static int llog_lvfs_create(struct llog_ctxt *ctxt, struct llog_handle **res, handle->lgh_id = *logid; } else if (name) { - handle->lgh_file = llog_filp_open(name, open_flags, 0644); + /* COMPAT_146 */ + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDS_NAME) == 0) { + handle->lgh_file = llog_filp_open(MDT_LOGS_DIR, name, + open_flags, 0644); + } else { + /* end COMPAT_146 */ + handle->lgh_file = llog_filp_open(MOUNT_CONFIGS_DIR, 
+ name, open_flags, + 0644); + } if (IS_ERR(handle->lgh_file)) GOTO(cleanup, rc = PTR_ERR(handle->lgh_file)); @@ -639,12 +648,20 @@ static int llog_lvfs_destroy(struct llog_handle *handle) { struct dentry *fdentry; struct obdo *oa; + struct obd_device *obd = handle->lgh_ctxt->loc_exp->exp_obd; + char *dir; int rc; ENTRY; + /* COMPAT_146 */ + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDS_NAME) == 0) + dir = MDT_LOGS_DIR; + else + /* end COMPAT_146 */ + dir = MOUNT_CONFIGS_DIR; + fdentry = handle->lgh_file->f_dentry; - if (!strcmp(fdentry->d_parent->d_name.name, "LOGS")) { - struct obd_device *obd = handle->lgh_ctxt->loc_exp->exp_obd; + if (strcmp(fdentry->d_parent->d_name.name, dir) == 0) { struct inode *inode = fdentry->d_parent->d_inode; struct lvfs_run_ctxt saved; @@ -692,7 +709,8 @@ int llog_get_cat_list(struct obd_device *obd, struct obd_device *disk_obd, int size = sizeof(*idarray) * count; loff_t off = 0; - LASSERT(count); + if (!count) + return (0); push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); file = filp_open(name, O_RDWR | O_CREAT | O_LARGEFILE, 0700); @@ -702,17 +720,19 @@ int llog_get_cat_list(struct obd_device *obd, struct obd_device *disk_obd, name, rc); GOTO(out, rc); } - + if (!S_ISREG(file->f_dentry->d_inode->i_mode)) { CERROR("%s is not a regular file!: mode = %o\n", name, file->f_dentry->d_inode->i_mode); GOTO(out, rc = -ENOENT); } + CDEBUG(D_CONFIG, "cat list: disk size=%d, read=%d\n", + (int)file->f_dentry->d_inode->i_size, size); + rc = fsfilt_read_record(disk_obd, file, idarray, size, &off); if (rc) { - CDEBUG(D_INODE,"OBD filter: error reading %s: rc %d\n", - name, rc); + CERROR("OBD filter: error reading %s: rc %d\n", name, rc); GOTO(out, rc); } @@ -734,7 +754,8 @@ int llog_put_cat_list(struct obd_device *obd, struct obd_device *disk_obd, int size = sizeof(*idarray) * count; loff_t off = 0; - LASSERT(count); + if (!count) + return (0); push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); file = filp_open(name, O_RDWR | O_CREAT | O_LARGEFILE, 
0700); diff --git a/lustre/obdclass/llog_obd.c b/lustre/obdclass/llog_obd.c index 4833c29..5f6e680 100644 --- a/lustre/obdclass/llog_obd.c +++ b/lustre/obdclass/llog_obd.c @@ -39,6 +39,28 @@ /* helper functions for calling the llog obd methods */ +int llog_cleanup(struct llog_ctxt *ctxt) +{ + int rc = 0; + ENTRY; + + if (!ctxt) { + CERROR("No ctxt\n"); + RETURN(-ENODEV); + } + + if (CTXTP(ctxt, cleanup)) + rc = CTXTP(ctxt, cleanup)(ctxt); + + ctxt->loc_obd->obd_llog_ctxt[ctxt->loc_idx] = NULL; + if (ctxt->loc_exp) + class_export_put(ctxt->loc_exp); + OBD_FREE(ctxt, sizeof(*ctxt)); + + RETURN(rc); +} +EXPORT_SYMBOL(llog_cleanup); + int llog_setup(struct obd_device *obd, int index, struct obd_device *disk_obd, int count, struct llog_logid *logid, struct llog_operations *op) { @@ -49,6 +71,17 @@ int llog_setup(struct obd_device *obd, int index, struct obd_device *disk_obd, if (index < 0 || index >= LLOG_MAX_CTXTS) RETURN(-EFAULT); + if (obd->obd_llog_ctxt[index]) { + /* During an mds_lov_add_ost, we try to tear down and resetup llogs. + But the mdt teardown does not flow down to the lov/osc's as the + setup does, because the lov/osc must clean up only when they are + done, not when the mdt is done. So instead, we just assume that + if the lov llogs are already set up then we must cleanup first. 
*/ + CDEBUG(D_CONFIG, "obd %s ctxt %d already set up\n", + obd->obd_name, index); + llog_cleanup(obd->obd_llog_ctxt[index]); + } + OBD_ALLOC(ctxt, sizeof(*ctxt)); if (!ctxt) RETURN(-ENOMEM); @@ -67,28 +100,6 @@ int llog_setup(struct obd_device *obd, int index, struct obd_device *disk_obd, } EXPORT_SYMBOL(llog_setup); -int llog_cleanup(struct llog_ctxt *ctxt) -{ - int rc = 0; - ENTRY; - - if (!ctxt) { - CERROR("No ctxt\n"); - RETURN(-ENODEV); - } - - if (CTXTP(ctxt, cleanup)) - rc = CTXTP(ctxt, cleanup)(ctxt); - - ctxt->loc_obd->obd_llog_ctxt[ctxt->loc_idx] = NULL; - if (ctxt->loc_exp) - class_export_put(ctxt->loc_exp); - OBD_FREE(ctxt, sizeof(*ctxt)); - - RETURN(rc); -} -EXPORT_SYMBOL(llog_cleanup); - int llog_sync(struct llog_ctxt *ctxt, struct obd_export *exp) { int rc = 0; diff --git a/lustre/obdclass/llog_swab.c b/lustre/obdclass/llog_swab.c index e12003f..4f45df0 100644 --- a/lustre/obdclass/llog_swab.c +++ b/lustre/obdclass/llog_swab.c @@ -239,11 +239,9 @@ void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg) } __swab32s(&lcfg->lcfg_command); - __swab32s(&lcfg->lcfg_num); __swab32s(&lcfg->lcfg_flags); __swab64s(&lcfg->lcfg_nid); - __swab32s(&lcfg->lcfg_bufcount); for (i = 0; i < lcfg->lcfg_bufcount && i < LUSTRE_CFG_MAX_BUFCOUNT; i++) __swab32s(&lcfg->lcfg_buflens[i]); diff --git a/lustre/obdclass/lustre_peer.c b/lustre/obdclass/lustre_peer.c index 2a42138..920ba03 100644 --- a/lustre/obdclass/lustre_peer.c +++ b/lustre/obdclass/lustre_peer.c @@ -81,6 +81,8 @@ int lustre_uuid_to_peer(char *uuid, lnet_nid_t *peer_nid, int index) return -ENOENT; } +/* Add a nid to a niduuid. Multiple nids can be added to a single uuid; + LNET will choose the best one. 
*/ int class_add_uuid(char *uuid, __u64 nid) { struct uuid_nid_data *data; @@ -114,7 +116,7 @@ int class_add_uuid(char *uuid, __u64 nid) return 0; } -/* delete only one entry if uuid is specified, otherwise delete all */ +/* Delete the nids for one uuid if specified, otherwise delete all */ int class_del_uuid (char *uuid) { struct list_head deathrow; diff --git a/lustre/obdclass/obd_config.c b/lustre/obdclass/obd_config.c index 6befd70..ff70f56 100644 --- a/lustre/obdclass/obd_config.c +++ b/lustre/obdclass/obd_config.c @@ -1,7 +1,7 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Copyright (c) 2001-2006 Cluster File Systems, Inc. * * This file is part of the Lustre file system, http://www.lustre.org * Lustre is a trademark of Cluster File Systems, Inc. @@ -38,6 +38,8 @@ #include +/********************** class fns **********************/ + /* Create a new device and set the type, name and uuid. If * successful, the new device can be accessed by either name or uuid. 
*/ @@ -222,6 +224,7 @@ int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg) RETURN(0); err_exp: + CERROR("setup %s failed (%d)\n", obd->obd_name, err); class_unlink_export(obd->obd_self_export); obd->obd_self_export = NULL; obd->obd_starting = 0; @@ -334,8 +337,8 @@ int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg) obd->obd_force = 1; break; case 'A': - LCONSOLE_WARN("Failing %s by user command\n", - obd->obd_name); + LCONSOLE_WARN("Failing over %s\n", + obd->obd_name); obd->obd_fail = 1; obd->obd_no_transno = 1; obd->obd_no_recov = 1; @@ -458,7 +461,8 @@ int class_add_conn(struct obd_device *obd, struct lustre_cfg *lcfg) RETURN(-EINVAL); } if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) && - strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME)) { + strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_MGC_NAME)) { CERROR("can't add connection on non-client dev\n"); RETURN(-EINVAL); } @@ -529,7 +533,7 @@ int class_add_profile(int proflen, char *prof, int osclen, char *osc, ENTRY; OBD_ALLOC(lprof, sizeof(*lprof)); if (lprof == NULL) - GOTO(out, err = -ENOMEM); + RETURN(-ENOMEM); CFS_INIT_LIST_HEAD(&lprof->lp_list); LASSERT(proflen == (strlen(prof) + 1)); @@ -540,7 +544,7 @@ int class_add_profile(int proflen, char *prof, int osclen, char *osc, LASSERT(osclen == (strlen(osc) + 1)); OBD_ALLOC(lprof->lp_osc, osclen); - if (lprof->lp_profile == NULL) + if (lprof->lp_osc == NULL) GOTO(out, err = -ENOMEM); memcpy(lprof->lp_osc, osc, osclen); @@ -553,8 +557,16 @@ int class_add_profile(int proflen, char *prof, int osclen, char *osc, } list_add(&lprof->lp_list, &lustre_profile_list); + RETURN(err); out: + if (lprof->lp_mdc) + OBD_FREE(lprof->lp_mdc, mdclen); + if (lprof->lp_osc) + OBD_FREE(lprof->lp_osc, osclen); + if (lprof->lp_profile) + OBD_FREE(lprof->lp_profile, proflen); + OBD_FREE(lprof, sizeof(*lprof)); RETURN(err); } @@ -621,8 +633,6 @@ int class_process_config(struct lustre_cfg *lcfg) case 
LCFG_DEL_MOUNTOPT: { CDEBUG(D_IOCTL, "mountopt: profile %s\n", lustre_cfg_string(lcfg, 1)); - /* set these mount options somewhere, so ll_fill_super - * can find them. */ class_del_profile(lustre_cfg_string(lcfg, 1)); GOTO(out, err = 0); } @@ -643,9 +653,11 @@ int class_process_config(struct lustre_cfg *lcfg) sizeof (obd_lustre_upcall)); GOTO(out, err = 0); } - case LCFG_PARAM: case LCFG_MARKER: { - LCONSOLE_WARN("LCFG_MARKER not yet implemented.\n"); + struct cfg_marker *marker; + marker = lustre_cfg_buf(lcfg, 1); + CDEBUG(D_IOCTL, "marker %d (%#x) %.16s %s\n", marker->cm_step, + marker->cm_flags, marker->cm_svname, marker->cm_comment); GOTO(out, err = 0); } } @@ -690,17 +702,34 @@ int class_process_config(struct lustre_cfg *lcfg) } } out: + if ((err == -ENOSYS || err == -EINVAL) && + !(lcfg->lcfg_command & LCFG_REQUIRED)) { + CWARN("Skipping optional command %#x\n", lcfg->lcfg_command); + err = 0; + } return err; } +int class_config_dump_handler(struct llog_handle * handle, + struct llog_rec_hdr *rec, void *data); + +#ifdef __KERNEL__ +extern int lustre_check_exclusion(struct super_block *sb, char *svname); +#else +#define lustre_check_exclusion(a,b) 0 +#endif + static int class_config_llog_handler(struct llog_handle * handle, struct llog_rec_hdr *rec, void *data) { - struct config_llog_instance *cfg = data; + struct config_llog_instance *clli = data; int cfg_len = rec->lrh_len; char *cfg_buf = (char*) (rec + 1); int rc = 0; ENTRY; + + //class_config_dump_handler(handle, rec, data); + switch (rec->lrh_type) { case OBD_CFG_REC: { struct lustre_cfg *lcfg, *lcfg_new; @@ -717,23 +746,74 @@ static int class_config_llog_handler(struct llog_handle * handle, if (rc) GOTO(out, rc); + /* Figure out config state info */ + if (lcfg->lcfg_command == LCFG_MARKER) { + struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1); + CDEBUG(D_CONFIG, "Marker, cfg_flg=%#x\n", + clli->cfg_flags); + if (marker->cm_flags & CM_START) { + /* all previous flags off */ + clli->cfg_flags = 
CFG_F_MARKER; + if (marker->cm_flags & CM_SKIP) { + clli->cfg_flags |= CFG_F_SKIP; + CDEBUG(D_CONFIG, "SKIP #%d\n", + marker->cm_step); + } else if (lustre_check_exclusion(clli->cfg_sb, + marker->cm_svname)) { + clli->cfg_flags |= CFG_F_EXCLUDE; + CDEBUG(D_CONFIG, "EXCLUDE %d\n", + marker->cm_step); + } + } else if (marker->cm_flags & CM_END) { + clli->cfg_flags = 0; + } + } + /* A config command without a start marker before it is + illegal (1.4.6. compat must set it artificially) */ + if (!(clli->cfg_flags & CFG_F_MARKER) && + (lcfg->lcfg_command != LCFG_MARKER)) { + CWARN("Config not inside markers, ignoring! (%#x)\n", + clli->cfg_flags); + clli->cfg_flags |= CFG_F_SKIP; + } + + if (clli->cfg_flags & CFG_F_SKIP) { + // FIXME warning + CDEBUG(D_CONFIG|D_WARNING, "skipping %#x\n", + clli->cfg_flags); + rc = 0; + /* No processing! */ + break; + } + + if ((clli->cfg_flags & CFG_F_EXCLUDE) && + (lcfg->lcfg_command == LCFG_LOV_ADD_OBD)) + /* Add inactive instead */ + lcfg->lcfg_command = LCFG_LOV_ADD_INA; + lustre_cfg_bufs_init(&bufs, lcfg); - if (cfg && cfg->cfg_instance && LUSTRE_CFG_BUFLEN(lcfg, 0) > 0) { + if (clli && clli->cfg_instance && LUSTRE_CFG_BUFLEN(lcfg, 0) > 0){ inst = 1; inst_len = LUSTRE_CFG_BUFLEN(lcfg, 0) + - strlen(cfg->cfg_instance) + 1; + strlen(clli->cfg_instance) + 1; OBD_ALLOC(inst_name, inst_len); if (inst_name == NULL) GOTO(out, rc = -ENOMEM); sprintf(inst_name, "%s-%s", lustre_cfg_string(lcfg, 0), - cfg->cfg_instance); + clli->cfg_instance); lustre_cfg_bufs_set_string(&bufs, 0, inst_name); + CDEBUG(D_CONFIG, "cmd %x, instance name: %s\n", + lcfg->lcfg_command, inst_name); } - if (cfg && lcfg->lcfg_command == LCFG_ATTACH) { - lustre_cfg_bufs_set_string(&bufs, 2, cfg->cfg_uuid.uuid); + /* we override the llog's uuid for clients, to insure they + are unique */ + if (clli && clli->cfg_instance && + lcfg->lcfg_command == LCFG_ATTACH) { + lustre_cfg_bufs_set_string(&bufs, 2, + clli->cfg_uuid.uuid); } lcfg_new = lustre_cfg_new(lcfg->lcfg_command, 
&bufs); @@ -765,22 +845,23 @@ static int class_config_llog_handler(struct llog_handle * handle, OBD_FREE(inst_name, inst_len); break; } - case PTL_CFG_REC: { - CWARN("Ignoring obsolete portals config\n"); - break; - } default: CERROR("Unknown llog record type %#x encountered\n", rec->lrh_type); break; } out: + if (rc) { + CERROR("Err %d on cfg command:\n", rc); + class_config_dump_handler(handle, rec, data); + } RETURN(rc); } int class_config_parse_llog(struct llog_ctxt *ctxt, char *name, struct config_llog_instance *cfg) { + struct llog_process_cat_data cd = {0, 0}; struct llog_handle *llh; int rc, rc2; ENTRY; @@ -794,14 +875,25 @@ int class_config_parse_llog(struct llog_ctxt *ctxt, char *name, if (rc) GOTO(parse_out, rc); - rc = llog_process(llh, class_config_llog_handler, cfg, NULL); + /* continue processing from where we last stopped to end-of-log */ + if (cfg) + cd.first_idx = cfg->cfg_last_idx; + cd.last_idx = 0; + + rc = llog_process(llh, class_config_llog_handler, cfg, &cd); + + /* FIXME remove warning */ + CDEBUG(D_CONFIG|D_WARNING, "Processed log %s gen %d-%d (rc=%d)\n", name, + cd.first_idx + 1, cd.last_idx, rc); + if (cfg) + cfg->cfg_last_idx = cd.last_idx; + parse_out: rc2 = llog_close(llh); if (rc == 0) rc = rc2; RETURN(rc); - } int class_config_dump_handler(struct llog_handle * handle, @@ -809,8 +901,16 @@ int class_config_dump_handler(struct llog_handle * handle, { int cfg_len = rec->lrh_len; char *cfg_buf = (char*) (rec + 1); + char *outstr, *ptr, *end; int rc = 0; ENTRY; + + OBD_ALLOC(outstr, 256); + end = outstr + 256; + ptr = outstr; + if (!outstr) { + RETURN(-ENOMEM); + } if (rec->lrh_type == OBD_CFG_REC) { struct lustre_cfg *lcfg; int i; @@ -820,30 +920,39 @@ int class_config_dump_handler(struct llog_handle * handle, GOTO(out, rc); lcfg = (struct lustre_cfg *)cfg_buf; - CDEBUG(D_INFO, "lcfg command: %x\n", lcfg->lcfg_command); - if (LUSTRE_CFG_BUFLEN(lcfg, 0) > 0) - CDEBUG(D_INFO, " devname: %s\n", - lustre_cfg_string(lcfg, 0)); - if 
(lcfg->lcfg_flags) - CDEBUG(D_INFO, " flags: %x\n", lcfg->lcfg_flags); - if (lcfg->lcfg_nid) - CDEBUG(D_INFO, " nid: %s\n", - libcfs_nid2str(lcfg->lcfg_nid)); - if (lcfg->lcfg_nal) - CDEBUG(D_INFO, " nal: %x (obsolete)\n", lcfg->lcfg_nal); - if (lcfg->lcfg_num) - CDEBUG(D_INFO, " num: %x\n", lcfg->lcfg_num); - for (i = 1; i < lcfg->lcfg_bufcount; i++) - if (LUSTRE_CFG_BUFLEN(lcfg, i) > 0) - CDEBUG(D_INFO, " inlbuf%d: %s\n", i, - lustre_cfg_string(lcfg, i)); - } else if (rec->lrh_type == PTL_CFG_REC) { - CDEBUG(D_INFO, "Obsolete pcfg command\n"); + ptr += snprintf(ptr, end-ptr, "cmd=%05x ", + lcfg->lcfg_command); + if (lcfg->lcfg_flags) { + ptr += snprintf(ptr, end-ptr, "flags=%#08x ", + lcfg->lcfg_flags); + } + if (lcfg->lcfg_num) { + ptr += snprintf(ptr, end-ptr, "num=%#08x ", + lcfg->lcfg_num); + } + if (lcfg->lcfg_nid) { + ptr += snprintf(ptr, end-ptr, "nid=%s("LPX64")\n ", + libcfs_nid2str(lcfg->lcfg_nid), + lcfg->lcfg_nid); + } + if (lcfg->lcfg_command == LCFG_MARKER) { + struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1); + ptr += snprintf(ptr, end-ptr, "marker=%d(%#x)%s '%s'", + marker->cm_step, marker->cm_flags, + marker->cm_svname, marker->cm_comment); + } else { + for (i = 0; i < lcfg->lcfg_bufcount; i++) { + ptr += snprintf(ptr, end-ptr, "%d:%s ", i, + lustre_cfg_string(lcfg, i)); + } + } + LCONSOLE(D_WARNING, " %s\n", outstr); } else { - CERROR("unhandled lrh_type: %#x\n", rec->lrh_type); + LCONSOLE(D_WARNING, "unhandled lrh_type: %#x\n", rec->lrh_type); rc = -EINVAL; } out: + OBD_FREE(outstr, 256); RETURN(rc); } @@ -854,6 +963,8 @@ int class_config_dump_llog(struct llog_ctxt *ctxt, char *name, int rc, rc2; ENTRY; + LCONSOLE_INFO("Dumping config log %s\n", name); + rc = llog_create(ctxt, &llh, NULL, name); if (rc) RETURN(rc); @@ -868,23 +979,23 @@ parse_out: if (rc == 0) rc = rc2; + LCONSOLE_INFO("End config log %s\n", name); RETURN(rc); } /* Cleanup and detach */ -void class_manual_cleanup(struct obd_device *obd) +int class_manual_cleanup(struct 
obd_device *obd) { struct lustre_cfg *lcfg; struct lustre_cfg_bufs bufs; - int err; + int rc; char flags[3]=""; ENTRY; if (!obd) { CERROR("empty cleanup\n"); - EXIT; - return; + RETURN(-EALREADY); } if (obd->obd_force) @@ -899,16 +1010,19 @@ void class_manual_cleanup(struct obd_device *obd) lustre_cfg_bufs_set_string(&bufs, 1, flags); lcfg = lustre_cfg_new(LCFG_CLEANUP, &bufs); - err = class_process_config(lcfg); - if (err) - CERROR("cleanup failed %d: %s\n", err, obd->obd_name); + rc = class_process_config(lcfg); + if (rc) { + CERROR("cleanup failed %d: %s\n", rc, obd->obd_name); + GOTO(out, rc); + } /* the lcfg is almost the same for both ops */ lcfg->lcfg_command = LCFG_DETACH; - err = class_process_config(lcfg); + rc = class_process_config(lcfg); + if (rc) + CERROR("detach failed %d: %s\n", rc, obd->obd_name); +out: lustre_cfg_free(lcfg); - if (err) - CERROR("detach failed %d: %s\n", err, obd->obd_name); - EXIT; + RETURN(rc); } diff --git a/lustre/obdclass/obd_mount.c b/lustre/obdclass/obd_mount.c new file mode 100644 index 0000000..01a9b1b --- /dev/null +++ b/lustre/obdclass/obd_mount.c @@ -0,0 +1,1919 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lustre/obdclass/obd_mount.c + * Client/server mount routines + * + * Copyright (c) 2006 Cluster File Systems, Inc. + * Author: Nathan Rutman + * + * This file is part of Lustre, http://www.lustre.org/ + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + + +#define DEBUG_SUBSYSTEM S_MGMT +#define D_MOUNT D_SUPER|D_CONFIG /*|D_WARNING */ +#define PRINT_CMD LCONSOLE +#define PRINT_MASK D_SUPER + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int (*client_fill_super)(struct super_block *sb) = NULL; + +/*********** string parsing utils *********/ + +/* returns 0 if we find this key in the buffer, else 1 */ +int class_find_param(char *buf, char *key, char **valp) +{ + char *ptr; + + if (!buf) + return 1; + + if ((ptr = strstr(buf, key)) == NULL) + return 1; + + if (valp) + *valp = ptr + strlen(key); + + return 0; +} + +/* returns 0 if this is the first key in the buffer, else 1 */ +int class_match_param(char *buf, char *key, char **valp) +{ + if (!buf) + return 1; + + if (memcmp(buf, key, strlen(key)) != 0) + return 1; + + if (valp) + *valp = buf + strlen(key); + + return 0; +} + +/* 0 is good nid, + 1 not found + < 0 error + endh is set to next separator */ +int class_parse_nid(char *buf, lnet_nid_t *nid, char **endh) +{ + char tmp, *endp; + + if (!buf) + return 1; + while (*buf == ',' || *buf == ':') + buf++; + if (*buf == ' ' || *buf == '/' || *buf == '\0') + return 1; + + /* nid separators or end of nids */ + endp = strpbrk(buf, ",: /"); + if (endp == NULL) + endp = buf + strlen(buf); + + tmp = *endp; + *endp = '\0'; + *nid = libcfs_str2nid(buf); + if (*nid == LNET_NID_ANY) { + LCONSOLE_ERROR("Can't parse NID '%s'\n", buf); + *endp = tmp; + return -EINVAL; + } + *endp = tmp; + + if (endh) + *endh = endp; + CDEBUG(D_MOUNT, "Nid %s\n", libcfs_nid2str(*nid)); + return 0; +} + +/*********** mount lookup *********/ + +DECLARE_MUTEX(lustre_mount_info_lock); +struct list_head server_mount_info_list = LIST_HEAD_INIT(server_mount_info_list); + +static struct 
lustre_mount_info *server_find_mount(char *name) +{ + struct list_head *tmp; + struct lustre_mount_info *lmi; + ENTRY; + + list_for_each(tmp, &server_mount_info_list) { + lmi = list_entry(tmp, struct lustre_mount_info, lmi_list_chain); + if (strcmp(name, lmi->lmi_name) == 0) + RETURN(lmi); + } + RETURN(NULL); +} + +/* we must register an obd for a mount before we call the setup routine. + *_setup will call lustre_get_mount to get the mnt struct + by obd_name, since we can't pass the pointer to setup. */ +static int server_register_mount(char *name, struct super_block *sb, + struct vfsmount *mnt) +{ + struct lustre_mount_info *lmi; + char *name_cp; + ENTRY; + + LASSERT(mnt); + LASSERT(sb); + + OBD_ALLOC(lmi, sizeof(*lmi)); + if (!lmi) + RETURN(-ENOMEM); + OBD_ALLOC(name_cp, strlen(name) + 1); + if (!name_cp) { + OBD_FREE(lmi, sizeof(*lmi)); + RETURN(-ENOMEM); + } + strcpy(name_cp, name); + + down(&lustre_mount_info_lock); + + if (server_find_mount(name)) { + up(&lustre_mount_info_lock); + OBD_FREE(lmi, sizeof(*lmi)); + OBD_FREE(name_cp, strlen(name) + 1); + CERROR("Already registered %s\n", name); + RETURN(-EEXIST); + } + lmi->lmi_name = name_cp; + lmi->lmi_sb = sb; + lmi->lmi_mnt = mnt; + list_add(&lmi->lmi_list_chain, &server_mount_info_list); + + up(&lustre_mount_info_lock); + + CDEBUG(D_MOUNT, "reg_mnt %p from %s, vfscount=%d\n", + lmi->lmi_mnt, name, atomic_read(&lmi->lmi_mnt->mnt_count)); + + RETURN(0); +} + +/* when an obd no longer needs a mount */ +static int server_deregister_mount(char *name) +{ + struct lustre_mount_info *lmi; + ENTRY; + + down(&lustre_mount_info_lock); + lmi = server_find_mount(name); + if (!lmi) { + up(&lustre_mount_info_lock); + CERROR("%s not registered\n", name); + RETURN(-ENOENT); + } + + CDEBUG(D_MOUNT, "dereg_mnt %p from %s, vfscount=%d\n", + lmi->lmi_mnt, name, atomic_read(&lmi->lmi_mnt->mnt_count)); + + OBD_FREE(lmi->lmi_name, strlen(lmi->lmi_name) + 1); + list_del(&lmi->lmi_list_chain); + OBD_FREE(lmi, sizeof(*lmi)); + 
up(&lustre_mount_info_lock); + + RETURN(0); +} + +/* Deregister anyone referencing the mnt. Everyone should have + put_mount in *_cleanup, but this is a catch-all in case of err... */ +/* FIXME this should be removed from lustre_free_lsi, which may be called + from server_put_mount _before_ it gets to server_deregister_mount. + Leave it here for now for the error message it shows... */ +static void server_deregister_mount_all(struct vfsmount *mnt) +{ + struct list_head *tmp, *n; + struct lustre_mount_info *lmi; + ENTRY; + + if (!mnt) { + EXIT; + return; + } + + //down(&lustre_mount_info_lock); + list_for_each_safe(tmp, n, &server_mount_info_list) { + lmi = list_entry(tmp, struct lustre_mount_info, lmi_list_chain); + if (lmi->lmi_mnt == mnt) { + CERROR("Mount %p still referenced by %s\n", mnt, + lmi->lmi_name); + //OBD_FREE(lmi->lmi_name, strlen(lmi->lmi_name) + 1); + //list_del(&lmi->lmi_list_chain); + //OBD_FREE(lmi, sizeof(*lmi)); + } + } + //up(&lustre_mount_info_lock); + EXIT; +} + +/* obd's look up a registered mount using their name. This is just + for initial obd setup to find the mount struct. It should not be + called every time you want to mntget. 
*/ +struct lustre_mount_info *server_get_mount(char *name) +{ + struct lustre_mount_info *lmi; + struct lustre_sb_info *lsi; + ENTRY; + + down(&lustre_mount_info_lock); + lmi = server_find_mount(name); + up(&lustre_mount_info_lock); + if (!lmi) { + CERROR("Can't find mount for %s\n", name); + RETURN(NULL); + } + lsi = s2lsi(lmi->lmi_sb); + mntget(lmi->lmi_mnt); + atomic_inc(&lsi->lsi_mounts); + + CDEBUG(D_MOUNT, "get_mnt %p from %s, refs=%d, vfscount=%d\n", + lmi->lmi_mnt, name, atomic_read(&lsi->lsi_mounts), + atomic_read(&lmi->lmi_mnt->mnt_count)); + + RETURN(lmi); +} + +static void unlock_mntput(struct vfsmount *mnt) +{ + if (kernel_locked()) { + unlock_kernel(); + mntput(mnt); + lock_kernel(); + } else { + mntput(mnt); + } +} + +static int lustre_put_lsi(struct super_block *sb); + +/* to be called from obd_cleanup methods */ +int server_put_mount(char *name, struct vfsmount *mnt) +{ + struct lustre_mount_info *lmi; + struct lustre_sb_info *lsi; + ENTRY; + + down(&lustre_mount_info_lock); + lmi = server_find_mount(name); + up(&lustre_mount_info_lock); + if (!lmi) { + CERROR("Can't find mount for %s\n", name); + RETURN(-ENOENT); + } + lsi = s2lsi(lmi->lmi_sb); + LASSERT(lmi->lmi_mnt == mnt); + unlock_mntput(lmi->lmi_mnt); + + CDEBUG(D_MOUNT, "put_mnt %p from %s, refs=%d, vfscount=%d\n", + lmi->lmi_mnt, name, atomic_read(&lsi->lsi_mounts), + atomic_read(&lmi->lmi_mnt->mnt_count)); + + if (lustre_put_lsi(lmi->lmi_sb)) { + CDEBUG(D_MOUNT, "Last put of mnt %p from %s, vfscount=%d\n", + lmi->lmi_mnt, name, + atomic_read(&lmi->lmi_mnt->mnt_count)); + /* last mount is the One True Mount */ + if (atomic_read(&lmi->lmi_mnt->mnt_count) > 1) + CERROR("%s: mount busy, vfscount=%d!\n", name, + atomic_read(&lmi->lmi_mnt->mnt_count)); + } + + /* this obd should never need the mount again */ + server_deregister_mount(name); + + RETURN(0); +} + + +/******* mount helper utilities *********/ + +static void ldd_print(struct lustre_disk_data *ldd) +{ + PRINT_CMD(PRINT_MASK, " disk 
data:\n"); + PRINT_CMD(PRINT_MASK, "config: %d\n", ldd->ldd_config_ver); + PRINT_CMD(PRINT_MASK, "fs: %s\n", ldd->ldd_fsname); + PRINT_CMD(PRINT_MASK, "server: %s\n", ldd->ldd_svname); + PRINT_CMD(PRINT_MASK, "index: %04x\n", ldd->ldd_svindex); + PRINT_CMD(PRINT_MASK, "flags: %#x\n", ldd->ldd_flags); + PRINT_CMD(PRINT_MASK, "diskfs: %s\n", MT_STR(ldd)); + PRINT_CMD(PRINT_MASK, "options: %s\n", ldd->ldd_mount_opts); + PRINT_CMD(PRINT_MASK, "params: %s\n", ldd->ldd_params); +} + +static int ldd_parse(struct lvfs_run_ctxt *mount_ctxt, + struct lustre_disk_data *ldd) +{ + struct lvfs_run_ctxt saved; + struct file *file; + loff_t off = 0; + unsigned long len; + int rc; + ENTRY; + + push_ctxt(&saved, mount_ctxt, NULL); + + file = filp_open(MOUNT_DATA_FILE, O_RDONLY, 0644); + if (IS_ERR(file)) { + rc = PTR_ERR(file); + CERROR("cannot open %s: rc = %d\n", MOUNT_DATA_FILE, rc); + GOTO(out, rc); + } + + len = file->f_dentry->d_inode->i_size; + CDEBUG(D_MOUNT, "Have %s, size %lu\n", MOUNT_DATA_FILE, len); + if (len != sizeof(*ldd)) { + CERROR("disk data size does not match: see %lu expect %u\n", + len, sizeof(*ldd)); + GOTO(out_close, rc = -EINVAL); + } + + rc = lustre_fread(file, ldd, len, &off); + if (rc != len) { + CERROR("error reading %s: read %d of %lu\n", + MOUNT_DATA_FILE, rc, len); + GOTO(out_close, rc = -EINVAL); + } + rc = 0; + + if (ldd->ldd_magic != LDD_MAGIC) { + /* FIXME add swabbing support */ + CERROR("Bad magic in %s: %x!=%x\n", MOUNT_DATA_FILE, + ldd->ldd_magic, LDD_MAGIC); + GOTO(out_close, rc = -EINVAL); + } + + if (ldd->ldd_feature_incompat & ~LDD_INCOMPAT_SUPP) { + CERROR("%s: unsupported incompat filesystem feature(s) %x\n", + ldd->ldd_svname, + ldd->ldd_feature_incompat & ~LDD_INCOMPAT_SUPP); + GOTO(out_close, rc = -EINVAL); + } + if (ldd->ldd_feature_rocompat & ~LDD_ROCOMPAT_SUPP) { + CERROR("%s: unsupported read-only filesystem feature(s) %x\n", + ldd->ldd_svname, + ldd->ldd_feature_rocompat & ~LDD_ROCOMPAT_SUPP); + /* Do something like remount 
filesystem read-only */ + GOTO(out_close, rc = -EINVAL); + } + + ldd_print(ldd); + +out_close: + filp_close(file, 0); +out: + pop_ctxt(&saved, mount_ctxt, NULL); + RETURN(rc); +} + +static int ldd_write(struct lvfs_run_ctxt *mount_ctxt, + struct lustre_disk_data *ldd) +{ + struct lvfs_run_ctxt saved; + struct file *file; + loff_t off = 0; + unsigned long len = sizeof(struct lustre_disk_data); + int rc = 0; + ENTRY; + + LASSERT(ldd->ldd_magic == LDD_MAGIC); + + ldd->ldd_config_ver++; + + push_ctxt(&saved, mount_ctxt, NULL); + + file = filp_open(MOUNT_DATA_FILE, O_RDWR, 0644); + if (IS_ERR(file)) { + rc = PTR_ERR(file); + CERROR("cannot open %s: rc = %d\n", MOUNT_DATA_FILE, rc); + GOTO(out, rc); + } + + rc = lustre_fwrite(file, ldd, len, &off); + if (rc != len) { + CERROR("error writing %s: read %d of %lu\n", + MOUNT_DATA_FILE, rc, len); + GOTO(out_close, rc = -EINVAL); + } + + rc = 0; + ldd_print(ldd); + +out_close: + filp_close(file, 0); +out: + pop_ctxt(&saved, mount_ctxt, NULL); + RETURN(rc); +} + + +/**************** config llog ********************/ + +/* Get a config log from the MGS and process it. + This func is called for both clients and servers. + Continue to process new statements appended to the logs + (whenever the config lock is revoked) until lustre_end_log + is called. 
*/ +int lustre_process_log(struct super_block *sb, char *logname, + struct config_llog_instance *cfg) +{ + struct lustre_cfg *lcfg; + struct lustre_cfg_bufs bufs; + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *mgc = lsi->lsi_mgc; + int rc; + ENTRY; + + LASSERT(mgc); + LASSERT(cfg); + + /* mgc_process_config */ + lustre_cfg_bufs_reset(&bufs, mgc->obd_name); + lustre_cfg_bufs_set_string(&bufs, 1, logname); + lustre_cfg_bufs_set(&bufs, 2, cfg, sizeof(*cfg)); + lustre_cfg_bufs_set(&bufs, 3, &sb, sizeof(sb)); + lcfg = lustre_cfg_new(LCFG_LOG_START, &bufs); + rc = obd_process_config(mgc, sizeof(*lcfg), lcfg); + lustre_cfg_free(lcfg); + + if (rc) + LCONSOLE_ERROR("%s: The configuration '%s' could not be read " + "from the MGS (%d). This may be the result of " + "communication errors between this node and " + "the MGS, or the MGS may not be running.\n", + mgc->obd_name, logname, rc); + + class_obd_list(); + RETURN(rc); +} + +/* Stop watching this config log for updates */ +int lustre_end_log(struct super_block *sb, char *logname, + struct config_llog_instance *cfg) +{ + struct lustre_cfg *lcfg; + struct lustre_cfg_bufs bufs; + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *mgc = lsi->lsi_mgc; + int rc; + ENTRY; + + if (!mgc) + RETURN(-ENOENT); + + /* mgc_process_config */ + lustre_cfg_bufs_reset(&bufs, mgc->obd_name); + lustre_cfg_bufs_set_string(&bufs, 1, logname); + if (cfg) + lustre_cfg_bufs_set(&bufs, 2, cfg, sizeof(*cfg)); + lcfg = lustre_cfg_new(LCFG_LOG_END, &bufs); + rc = obd_process_config(mgc, sizeof(*lcfg), lcfg); + lustre_cfg_free(lcfg); + RETURN(rc); +} + +/**************** obd start *******************/ + +static int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd, + char *s1, char *s2, char *s3, char *s4) +{ + struct lustre_cfg_bufs bufs; + struct lustre_cfg * lcfg = NULL; + int rc; + + CDEBUG(D_TRACE, "lcfg %s %#x %s %s %s %s\n", cfgname, + cmd, s1, s2, s3, s4); + + lustre_cfg_bufs_reset(&bufs, cfgname); + if (s1) + 
lustre_cfg_bufs_set_string(&bufs, 1, s1); + if (s2) + lustre_cfg_bufs_set_string(&bufs, 2, s2); + if (s3) + lustre_cfg_bufs_set_string(&bufs, 3, s3); + if (s4) + lustre_cfg_bufs_set_string(&bufs, 4, s4); + + lcfg = lustre_cfg_new(cmd, &bufs); + lcfg->lcfg_nid = nid; + rc = class_process_config(lcfg); + lustre_cfg_free(lcfg); + return(rc); +} + +static int lustre_start_simple(char *obdname, char *type, char *uuid, + char *s1, char *s2) +{ + int rc; + CDEBUG(D_MOUNT, "Starting obd %s (typ=%s)\n", obdname, type); + + rc = do_lcfg(obdname, 0, LCFG_ATTACH, type, uuid, 0, 0); + if (rc) { + CERROR("%s attach error %d\n", obdname, rc); + return(rc); + } + rc = do_lcfg(obdname, 0, LCFG_SETUP, s1, s2, 0, 0); + if (rc) { + CERROR("%s setup error %d\n", obdname, rc); + do_lcfg(obdname, 0, LCFG_DETACH, 0, 0, 0, 0); + } + return rc; +} + +/* Set up a MGS to serve startup logs */ +static int server_start_mgs(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct vfsmount *mnt = lsi->lsi_srv_mnt; + struct lustre_mount_info *lmi; + int rc = 0; + ENTRY; + LASSERT(mnt); + + /* It is impossible to have more than 1 MGS per node, since + MGC wouldn't know which to connect to */ + lmi = server_find_mount(LUSTRE_MGS_OBDNAME); + if (lmi) { + lsi = s2lsi(lmi->lmi_sb); + LCONSOLE_ERROR("The MGS service was already started from " + "server %s\n", lsi->lsi_ldd->ldd_svname); + RETURN(-EALREADY); + } + + CDEBUG(D_CONFIG, "Start MGS service %s\n", LUSTRE_MGS_OBDNAME); + + rc = server_register_mount(LUSTRE_MGS_OBDNAME, sb, mnt); + + if (!rc && + ((rc = lustre_start_simple(LUSTRE_MGS_OBDNAME, LUSTRE_MGS_NAME, + LUSTRE_MGS_OBDNAME, 0, 0)))) + server_deregister_mount(LUSTRE_MGS_OBDNAME); + + if (rc) + LCONSOLE_ERROR("Failed to start MGS '%s' (%d). 
Is the 'mgs' " + "module loaded?\n", LUSTRE_MGS_OBDNAME, rc); + + RETURN(rc); +} + +static int server_stop_mgs(struct super_block *sb) +{ + struct obd_device *obd; + int rc; + ENTRY; + + CDEBUG(D_MOUNT, "Stop MGS service %s\n", LUSTRE_MGS_OBDNAME); + + /* There better be only one MGS */ + obd = class_name2obd(LUSTRE_MGS_OBDNAME); + if (!obd) { + CDEBUG(D_CONFIG, "mgs %s not running\n", LUSTRE_MGS_OBDNAME); + RETURN(-EALREADY); + } + + /* The MGS should always stop when we say so */ + obd->obd_force = 1; + rc = class_manual_cleanup(obd); + RETURN(rc); +} + +/* Set up a mgcobd to process startup logs */ +static int lustre_start_mgc(struct super_block *sb) +{ + struct lustre_handle mgc_conn = {0, }; + struct obd_connect_data ocd = { 0 }; + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *obd; + struct obd_export *exp; + struct obd_uuid *uuid; + class_uuid_t uuidc; + lnet_nid_t nid; + char niduuid[10]; + char *ptr; + int recov_bk = 0; + int rc = 0, i = 0, j; + ENTRY; + + LASSERT(lsi->lsi_lmd); + + obd = class_name2obd(LUSTRE_MGC_OBDNAME); + if (obd) { + atomic_inc(&obd->u.cli.cl_mgc_refcount); + /* FIXME There's only one MGC, but users could give different + MGS nids on the mount line. So now do we add new MGS uuids + or not? If there's truly one MGS per site, the MGS uuids + _should_ all be the same. Maybe check here? + */ + + /* If we are restarting the MGS, don't try to keep the MGC's + old connection, or registration will fail. */ + if ((lsi->lsi_flags & LSI_SERVER) && IS_MGS(lsi->lsi_ldd)) { + CDEBUG(D_MOUNT|D_ERROR, "New MGS with live MGC\n"); + recov_bk = 1; + } + + /* Try all connections, but only once (again). + We don't want to block another target from starting + (using its local copy of the log), but we do want to connect + if at all possible. 
*/ + recov_bk++; + CDEBUG(D_MOUNT, "Set MGS reconnect %d\n", recov_bk); + rc = obd_set_info_async(obd->obd_self_export, + strlen(KEY_INIT_RECOV_BACKUP), + KEY_INIT_RECOV_BACKUP, + sizeof(recov_bk), &recov_bk, NULL); + GOTO(out, rc = 0); + } + + CDEBUG(D_MOUNT, "Start MGC '%s'\n", LUSTRE_MGC_OBDNAME); + + /* Add the primary nids for the MGS */ + if (lsi->lsi_flags & LSI_SERVER) { + ptr = lsi->lsi_ldd->ldd_params; + if (IS_MGS(lsi->lsi_ldd)) { + /* Use local nids (including LO) */ + lnet_process_id_t id; + while ((rc = LNetGetId(i++, &id)) != -ENOENT) { + rc = do_lcfg(LUSTRE_MGC_OBDNAME, id.nid, + LCFG_ADD_UUID, "mgsnid0", 0,0,0); + } + } else { + /* Use mgsnode= nids */ + if (class_find_param(ptr, PARAM_MGSNODE, &ptr) != 0) { + CERROR("No MGS nids given.\n"); + RETURN(-EINVAL); + } + while (class_parse_nid(ptr, &nid, &ptr) == 0) { + rc = do_lcfg(LUSTRE_MGC_OBDNAME, nid, + LCFG_ADD_UUID, "mgsnid0", 0,0,0); + i++; + } + } + } else { /* client */ + /* use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */ + ptr = lsi->lsi_lmd->lmd_dev; + while (class_parse_nid(ptr, &nid, &ptr) == 0) { + rc = do_lcfg(LUSTRE_MGC_OBDNAME, nid, + LCFG_ADD_UUID, "mgsnid0", 0,0,0); + i++; + if (*ptr == ':') + break; + } + } + if (i == 0) { + CERROR("No valid MGS nids found.\n"); + RETURN(-EINVAL); + } + lsi->lsi_lmd->lmd_mgs_failnodes = 1; + + /* Random uuid for MGC allows easier reconnects */ + OBD_ALLOC_PTR(uuid); + class_generate_random_uuid(uuidc); + class_uuid_unparse(uuidc, uuid); + + /* Start the MGC */ + rc = lustre_start_simple(LUSTRE_MGC_OBDNAME, LUSTRE_MGC_NAME, + (char *)uuid->uuid, LUSTRE_MGS_OBDNAME, + "mgsnid0"); + OBD_FREE_PTR(uuid); + if (rc) + RETURN(rc); + + /* Add any failover MGS nids */ + i = 1; + while ((*ptr == ':' || + class_find_param(ptr, PARAM_MGSNODE, &ptr) == 0)) { + /* New failover node */ + sprintf(niduuid, "mgsnid%d", i); + j = 0; + while (class_parse_nid(ptr, &nid, &ptr) == 0) { + j++; + rc = do_lcfg(LUSTRE_MGC_OBDNAME, nid, + LCFG_ADD_UUID, niduuid, 
0,0,0); + if (*ptr == ':') + break; + } + if (j > 0) { + rc = do_lcfg(LUSTRE_MGC_OBDNAME, 0, LCFG_ADD_CONN, + niduuid, 0, 0, 0); + i++; + } else { + /* at ":/fsname" */ + break; + } + } + lsi->lsi_lmd->lmd_mgs_failnodes = i; + + obd = class_name2obd(LUSTRE_MGC_OBDNAME); + if (!obd) { + CERROR("Can't find mgcobd %s\n", LUSTRE_MGC_OBDNAME); + RETURN(-ENOTCONN); + } + + /* Try all connections, but only once. */ + recov_bk = 1; + rc = obd_set_info_async(obd->obd_self_export, + strlen(KEY_INIT_RECOV_BACKUP), + KEY_INIT_RECOV_BACKUP, + sizeof(recov_bk), &recov_bk, NULL); + if (rc) + /* nonfatal */ + CERROR("can't set %s %d\n", KEY_INIT_RECOV_BACKUP, rc); + + /* FIXME add ACL support? */ + //ocd.ocd_connect_flags = OBD_CONNECT_ACL; + + /* We connect to the MGS at setup, and don't disconnect until cleanup */ + rc = obd_connect(&mgc_conn, obd, &(obd->obd_uuid), &ocd); + if (rc) { + CERROR("connect failed %d\n", rc); + GOTO(out, rc); + } + + exp = class_conn2export(&mgc_conn); + obd->u.cli.cl_mgc_mgsexp = exp; + + /* And keep a refcount of servers/clients who started with "mount", + so we know when we can get rid of the mgc. */ + atomic_set(&obd->u.cli.cl_mgc_refcount, 1); + +out: + /* Keep the mgc info in the sb. Note that many lsi's can point + to the same mgc.*/ + lsi->lsi_mgc = obd; + RETURN(rc); +} + +static int lustre_stop_mgc(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *obd; + char niduuid[10]; + int i, rc; + ENTRY; + + if (!lsi) + RETURN(-ENOENT); + obd = lsi->lsi_mgc; + if (!obd) + RETURN(-ENOENT); + + lsi->lsi_mgc = NULL; + if (!atomic_dec_and_test(&obd->u.cli.cl_mgc_refcount)) { + /* This is not fatal, every client that stops + will call in here. 
*/ + CDEBUG(D_MOUNT, "mgc still has %d references.\n", + atomic_read(&obd->u.cli.cl_mgc_refcount)); + RETURN(-EBUSY); + } + + /* MGC must always stop */ + obd->obd_force = 1; + /* client_disconnect_export uses the no_recov flag to decide whether it + should disconnect or just invalidate. (The MGC has no + recoverable data in any case.) */ + obd->obd_no_recov = 1; + + if (obd->u.cli.cl_mgc_mgsexp) + obd_disconnect(obd->u.cli.cl_mgc_mgsexp); + + rc = class_manual_cleanup(obd); + if (rc) + RETURN(rc); + + for (i = 0; i < lsi->lsi_lmd->lmd_mgs_failnodes; i++) { + sprintf(niduuid, "mgsnid%d", i); + rc = do_lcfg(obd->obd_name, 0, LCFG_DEL_UUID, + niduuid, 0, 0, 0); + if (rc) + CERROR("del MDC UUID %s failed: rc = %d\n", + niduuid, rc); + } + /* class_import_put will get rid of the additional connections */ + + RETURN(0); +} + +/* Since there's only one mgc per node, we have to change it's fs to get + access to the right disk. */ +static int server_mgc_set_fs(struct obd_device *mgc, struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + int rc; + ENTRY; + + CDEBUG(D_MOUNT, "Set mgc disk for %s\n", lsi->lsi_lmd->lmd_dev); + + /* cl_mgc_sem in mgc insures we sleep if the mgc_fs is busy */ + rc = obd_set_info_async(mgc->obd_self_export, + strlen("set_fs"), "set_fs", + sizeof(*sb), sb, NULL); + if (rc) { + CERROR("can't set_fs %d\n", rc); + } + + RETURN(rc); +} + +static int server_mgc_clear_fs(struct obd_device *mgc) +{ + int rc; + ENTRY; + + CDEBUG(D_MOUNT, "Unassign mgc disk\n"); + + rc = obd_set_info_async(mgc->obd_self_export, + strlen("clear_fs"), "clear_fs", + 0, NULL, NULL); + RETURN(rc); +} + +/* Stop MDS/OSS if nobody is using them */ +static int server_stop_servers(int lddflags, int lsiflags) +{ + struct obd_device *obd = NULL; + struct obd_type *type = NULL; + int rc = 0; + ENTRY; + + /* Either an MDT or an OST or neither */ + + /* if this was an MDT, and there are no more MDT's, clean up the MDS */ + if ((lddflags & LDD_F_SV_TYPE_MDT) && (obd = 
class_name2obd("MDS"))) { + //FIXME pre-rename, should eventually be LUSTRE_MDT_NAME + type = class_search_type(LUSTRE_MDS_NAME); + } + /* if this was an OST, and there are no more OST's, clean up the OSS */ + if ((lddflags & LDD_F_SV_TYPE_OST) && (obd = class_name2obd("OSS"))) { + type = class_search_type(LUSTRE_OST_NAME); + } + + if (obd && (!type || !type->typ_refcnt)) { + int err; + obd->obd_force = 1; + /* obd_fail doesn't mean much on a server obd */ + err = class_manual_cleanup(obd); + if (!rc) + rc = err; + } + + RETURN(rc); +} + +int server_mti_print(char *title, struct mgs_target_info *mti) +{ + PRINT_CMD(PRINT_MASK, "mti %s\n", title); + PRINT_CMD(PRINT_MASK, "server: %s\n", mti->mti_svname); + PRINT_CMD(PRINT_MASK, "fs: %s\n", mti->mti_fsname); + PRINT_CMD(PRINT_MASK, "uuid: %s\n", mti->mti_uuid); + PRINT_CMD(PRINT_MASK, "ver: %d flags: %#x\n", + mti->mti_config_ver, mti->mti_flags); + return(0); +} + +static int server_sb2mti(struct super_block *sb, struct mgs_target_info *mti) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct lustre_disk_data *ldd = lsi->lsi_ldd; + lnet_process_id_t id; + int i = 0; + ENTRY; + + if (!(lsi->lsi_flags & LSI_SERVER)) + RETURN(-EINVAL); + + strncpy(mti->mti_fsname, ldd->ldd_fsname, + sizeof(mti->mti_fsname)); + strncpy(mti->mti_svname, ldd->ldd_svname, + sizeof(mti->mti_svname)); + + mti->mti_nid_count = 0; + while (LNetGetId(i++, &id) != -ENOENT) { + if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND) + continue; + mti->mti_nids[mti->mti_nid_count] = id.nid; + mti->mti_nid_count++; + if (mti->mti_nid_count >= MTI_NIDS_MAX) { + CWARN("Only using first %d nids for %s\n", + mti->mti_nid_count, mti->mti_svname); + break; + } + } + + mti->mti_config_ver = 0; + mti->mti_flags = ldd->ldd_flags; + mti->mti_stripe_index = ldd->ldd_svindex; + memcpy(mti->mti_uuid, ldd->ldd_uuid, sizeof(mti->mti_uuid)); + if (strlen(ldd->ldd_params) > sizeof(mti->mti_params)) { + CERROR("params too big for mti\n"); + RETURN(-ENOMEM); + /* FIXME we 
can't send a msg much bigger than 4k - use bulk? */ + } + memcpy(mti->mti_params, ldd->ldd_params, sizeof(mti->mti_params)); + RETURN(0); +} + +/* Register an old or new target with the MGS. If needed MGS will construct + startup logs and assign index */ +int server_register_target(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *mgc = lsi->lsi_mgc; + struct lustre_disk_data *ldd = lsi->lsi_ldd; + struct mgs_target_info *mti = NULL; + int rc; + ENTRY; + + LASSERT(mgc); + + if (!(lsi->lsi_flags & LSI_SERVER)) + RETURN(-EINVAL); + + OBD_ALLOC_PTR(mti); + if (!mti) + RETURN(-ENOMEM); + rc = server_sb2mti(sb, mti); + if (rc) + GOTO(out, rc); + + CDEBUG(D_MOUNT, "Registration %s, fs=%s, %s, index=%04x, flags=%#x\n", + mti->mti_svname, mti->mti_fsname, + libcfs_nid2str(mti->mti_nids[0]), mti->mti_stripe_index, + mti->mti_flags); + + /* Register the target */ + /* FIXME use mdc_process_config instead */ + rc = obd_set_info_async(mgc->u.cli.cl_mgc_mgsexp, + strlen("register_target"), "register_target", + sizeof(*mti), mti, NULL); + if (rc) { + CERROR("registration with the MGS failed (%d)\n", rc); + GOTO(out, rc); + } + + /* Always update our flags */ + ldd->ldd_flags = mti->mti_flags & ~LDD_F_REWRITE_LDD; + + /* If this flag is set, it means the MGS wants us to change our + on-disk data. (So far this means just the index.) 
*/ + if (mti->mti_flags & LDD_F_REWRITE_LDD) { + char *label; + int err; + CDEBUG(D_MOUNT, "Changing on-disk index from %#x to %#x " + "for %s\n", ldd->ldd_svindex, mti->mti_stripe_index, + mti->mti_svname); + ldd->ldd_svindex = mti->mti_stripe_index; + strncpy(ldd->ldd_svname, mti->mti_svname, + sizeof(ldd->ldd_svname)); + /* or ldd_make_sv_name(ldd); */ + ldd_write(&mgc->obd_lvfs_ctxt, ldd); + + err = fsfilt_set_label(mgc, lsi->lsi_srv_mnt->mnt_sb, + mti->mti_svname); + if (err) + CERROR("Label set error %d\n", err); + label = fsfilt_get_label(mgc, lsi->lsi_srv_mnt->mnt_sb); + if (label) + CDEBUG(D_MOUNT, "Disk label changed to %s\n", label); + } + +out: + if (mti) + OBD_FREE_PTR(mti); + RETURN(rc); +} + +/* Start targets */ +static int server_start_targets(struct super_block *sb, struct vfsmount *mnt) +{ + struct obd_device *obd; + struct lustre_sb_info *lsi = s2lsi(sb); + struct config_llog_instance cfg; + int rc; + ENTRY; + + CDEBUG(D_MOUNT, "starting target %s\n", lsi->lsi_ldd->ldd_svname); + + /* If we're an MDT, make sure the global MDS is running */ + if (lsi->lsi_ldd->ldd_flags & LDD_F_SV_TYPE_MDT) { + /* make sure (what will be called) the MDS is started */ + obd = class_name2obd("MDS"); + if (!obd) { + //FIXME pre-rename, should eventually be LUSTRE_MDS_NAME + rc = lustre_start_simple("MDS", LUSTRE_MDT_NAME, + "MDS_uuid", 0, 0); + if (rc) { + CERROR("failed to start MDS: %d\n", rc); + GOTO(out_servers, rc); + } + } + } + + /* If we're an OST, make sure the global OSS is running */ + if (lsi->lsi_ldd->ldd_flags & LDD_F_SV_TYPE_OST) { + /* make sure OSS is started */ + obd = class_name2obd("OSS"); + if (!obd) { + rc = lustre_start_simple("OSS", LUSTRE_OSS_NAME, + "OSS_uuid", 0, 0); + if (rc) { + CERROR("failed to start OSS: %d\n", rc); + GOTO(out_servers, rc); + } + } + } + + /* Set the mgc fs to our server disk. This allows the MGC + to read and write configs locally. 
*/ + server_mgc_set_fs(lsi->lsi_mgc, sb); + + /* Register with MGS */ + rc = server_register_target(sb); + if (rc && (lsi->lsi_ldd->ldd_flags & + (LDD_F_NEED_INDEX | LDD_F_UPDATE | LDD_F_UPGRADE14))){ + CERROR("Required registration failed for %s: %d\n", + lsi->lsi_ldd->ldd_svname, rc); + if (rc == -EIO) { + LCONSOLE_ERROR("Communication error with the MGS. Is " + "the MGS running?\n"); + } + GOTO(out, rc); + } + + /* Let the target look up the mount using the target's name + (we can't pass the sb or mnt through class_process_config.) */ + rc = server_register_mount(lsi->lsi_ldd->ldd_svname, sb, mnt); + if (rc) + GOTO(out, rc); + + /* Start targets using the llog named for the target */ + memset(&cfg, 0, sizeof(cfg)); + rc = lustre_process_log(sb, lsi->lsi_ldd->ldd_svname, &cfg); + if (rc) { + CERROR("failed to start server %s: %d\n", + lsi->lsi_ldd->ldd_svname, rc); + GOTO(out, rc); + } + + if (!class_name2obd(lsi->lsi_ldd->ldd_svname)) { + CERROR("no server named %s was started\n", + lsi->lsi_ldd->ldd_svname); + rc = -ENXIO; + } + +out: + /* Release the mgc fs for others to use */ + server_mgc_clear_fs(lsi->lsi_mgc); + +out_servers: + RETURN(rc); +} + +/***************** lustre superblock **************/ + +struct lustre_sb_info *lustre_init_lsi(struct super_block *sb) +{ + struct lustre_sb_info *lsi = NULL; + ENTRY; + + OBD_ALLOC(lsi, sizeof(*lsi)); + if (!lsi) + RETURN(NULL); + OBD_ALLOC(lsi->lsi_lmd, sizeof(*lsi->lsi_lmd)); + if (!lsi->lsi_lmd) { + OBD_FREE(lsi, sizeof(*lsi)); + RETURN(NULL); + } + + lsi->lsi_lmd->lmd_exclude_count = 0; + s2lsi_nocast(sb) = lsi; + /* we take 1 extra ref for our setup */ + atomic_set(&lsi->lsi_mounts, 1); + + /* Default umount style */ + lsi->lsi_flags = LSI_UMOUNT_FAILOVER; + RETURN(lsi); +} + +static int lustre_free_lsi(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + ENTRY; + + if (!lsi) + RETURN(0); + + CDEBUG(D_MOUNT, "Freeing lsi\n"); + + /* someone didn't call server_put_mount. 
*/ + LASSERT(atomic_read(&lsi->lsi_mounts) == 0); + + if (lsi->lsi_ldd != NULL) + OBD_FREE(lsi->lsi_ldd, sizeof(*lsi->lsi_ldd)); + + if (lsi->lsi_lmd != NULL) { + if (lsi->lsi_lmd->lmd_dev != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_dev, + strlen(lsi->lsi_lmd->lmd_dev) + 1); + if (lsi->lsi_lmd->lmd_profile != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_profile, + strlen(lsi->lsi_lmd->lmd_profile) + 1); + if (lsi->lsi_lmd->lmd_opts != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_opts, + strlen(lsi->lsi_lmd->lmd_opts) + 1); + if (lsi->lsi_lmd->lmd_exclude_count) + OBD_FREE(lsi->lsi_lmd->lmd_exclude, + sizeof(lsi->lsi_lmd->lmd_exclude[0]) * + lsi->lsi_lmd->lmd_exclude_count); + OBD_FREE(lsi->lsi_lmd, sizeof(*lsi->lsi_lmd)); + } + + LASSERT(lsi->lsi_llsbi == NULL); + + server_deregister_mount_all(lsi->lsi_srv_mnt); + + OBD_FREE(lsi, sizeof(*lsi)); + s2lsi_nocast(sb) = NULL; + + RETURN(0); +} + +static int lustre_put_lsi(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + ENTRY; + + LASSERT(lsi); + + CDEBUG(D_MOUNT, "put %p %d\n", sb, atomic_read(&lsi->lsi_mounts)); + + if (atomic_dec_and_test(&lsi->lsi_mounts)) { + lustre_free_lsi(sb); + RETURN(1); + } + RETURN(0); +} + +/*************** server mount ******************/ + +/* Kernel mount using mount options in MOUNT_DATA_FILE */ +static struct vfsmount *server_kernel_mount(struct super_block *sb) +{ + struct lvfs_run_ctxt mount_ctxt; + struct lustre_sb_info *lsi = s2lsi(sb); + struct lustre_disk_data *ldd; + struct lustre_mount_data *lmd = lsi->lsi_lmd; + struct vfsmount *mnt; + char *options = NULL; + unsigned long page, s_flags; + int rc; + ENTRY; + + OBD_ALLOC(ldd, sizeof(*ldd)); + if (!ldd) + RETURN(ERR_PTR(-ENOMEM)); + + /* In the past, we have always used flags = 0. + Note ext3/ldiskfs can't be mounted ro. 
*/ + s_flags = sb->s_flags; + + /* Pre-mount ext3 to read the MOUNT_DATA_FILE */ + CDEBUG(D_MOUNT, "Pre-mount ext3 %s\n", lmd->lmd_dev); + mnt = do_kern_mount("ext3", s_flags, lmd->lmd_dev, 0); + if (IS_ERR(mnt)) { + rc = PTR_ERR(mnt); + CERROR("premount ext3 failed (%d), trying ldiskfs\n", rc); + /* If ext3 fails (bec. of mballoc, extents), try ldiskfs */ + mnt = do_kern_mount("ldiskfs", s_flags, lmd->lmd_dev, 0); + if (IS_ERR(mnt)) { + rc = PTR_ERR(mnt); + CERROR("premount ldiskfs failed: rc = %d\n", rc); + GOTO(out_free, rc); + } + } + + OBD_SET_CTXT_MAGIC(&mount_ctxt); + mount_ctxt.pwdmnt = mnt; + mount_ctxt.pwd = mnt->mnt_root; + mount_ctxt.fs = get_ds(); + + rc = ldd_parse(&mount_ctxt, ldd); + unlock_mntput(mnt); + + if (rc) { + CERROR("premount parse options failed: rc = %d\n", rc); + GOTO(out_free, rc); + } + + /* Done with our pre-mount, now do the real mount. */ + + /* Glom up mount options */ + page = __get_free_page(GFP_KERNEL); + if (!page) + GOTO(out_free, rc = -ENOMEM); + + options = (char *)page; + memset(options, 0, PAGE_SIZE); + strncpy(options, ldd->ldd_mount_opts, PAGE_SIZE - 2); + + /* Add in any mount-line options */ + if (lmd->lmd_opts && (*(lmd->lmd_opts) != 0)) { + int len = PAGE_SIZE - strlen(options) - 2; + if (*options != 0) + strcat(options, ","); + strncat(options, lmd->lmd_opts, len); + } + + /* Special permanent mount flags */ + if (IS_OST(ldd)) + s_flags |= MS_NOATIME | MS_NODIRATIME; + + CDEBUG(D_MOUNT, "kern_mount: %s %s %s\n", + MT_STR(ldd), lmd->lmd_dev, options); + mnt = do_kern_mount(MT_STR(ldd), s_flags, lmd->lmd_dev, + (void *)options); + free_page(page); + if (IS_ERR(mnt)) { + rc = PTR_ERR(mnt); + CERROR("do_kern_mount failed: rc = %d\n", rc); + GOTO(out_free, rc); + } + + lsi->lsi_ldd = ldd; /* freed at lsi cleanup */ + CDEBUG(D_SUPER, "%s: mnt = %p\n", lmd->lmd_dev, mnt); + RETURN(mnt); + +out_free: + OBD_FREE(ldd, sizeof(*ldd)); + lsi->lsi_ldd = NULL; + RETURN(ERR_PTR(rc)); +} + +static void server_wait_finished(struct 
vfsmount *mnt) +{ + wait_queue_head_t waitq; + struct l_wait_info lwi; + int retries = 10; + + init_waitqueue_head(&waitq); + + while ((atomic_read(&mnt->mnt_count) > 0) && retries--) { + CWARN("Mount still busy with %d refs\n", + atomic_read(&mnt->mnt_count)); + + /* Wait for a bit */ + lwi = LWI_TIMEOUT(2 * HZ, NULL, NULL); + l_wait_event(waitq, 0, &lwi); + } + if (atomic_read(&mnt->mnt_count)) { + CERROR("Mount is still busy, giving up.\n"); + } +} + +static void server_put_super(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *obd; + struct vfsmount *mnt = lsi->lsi_srv_mnt; + char *tmpname; + int tmpname_sz; + int lddflags = lsi->lsi_ldd->ldd_flags; + int lsiflags = lsi->lsi_flags; + int rc; + ENTRY; + + LASSERT(lsiflags & LSI_SERVER); + + tmpname_sz = strlen(lsi->lsi_ldd->ldd_svname) + 1; + OBD_ALLOC(tmpname, tmpname_sz); + memcpy(tmpname, lsi->lsi_ldd->ldd_svname, tmpname_sz); + CDEBUG(D_MOUNT, "server put_super %s\n", tmpname); + + /* Stop the target */ + if (IS_MDT(lsi->lsi_ldd) || IS_OST(lsi->lsi_ldd)) { + + /* tell the mgc to drop the config log */ + lustre_end_log(sb, lsi->lsi_ldd->ldd_svname, NULL); + + obd = class_name2obd(lsi->lsi_ldd->ldd_svname); + if (obd) { + CDEBUG(D_MOUNT, "stopping %s\n", obd->obd_name); + if (lsi->lsi_flags & LSI_UMOUNT_FORCE) + obd->obd_force = 1; + if (lsi->lsi_flags & LSI_UMOUNT_FAILOVER) + obd->obd_fail = 1; + /* We can't seem to give an error return code + to .put_super, so we better make sure we clean up! + FIXME is there a way to get around this? */ + obd->obd_force = 1; + class_manual_cleanup(obd); + } else { + CERROR("no obd %s\n", lsi->lsi_ldd->ldd_svname); + server_deregister_mount(lsi->lsi_ldd->ldd_svname); + } + } + + /* If they wanted the mgs to stop separately from the mdt, they + should have put it on a different device. 
*/ + if (IS_MGS(lsi->lsi_ldd)) { + /* stop the mgc before the mgs so the connection gets cleaned + up */ + lustre_stop_mgc(sb); + server_stop_mgs(sb); + } + + /* clean the mgc and sb */ + rc = lustre_common_put_super(sb); + // FIXME how do I return a failure? + + /* drop the One True Mount */ + unlock_mntput(mnt); + + /* Wait for the targets to really clean up - can't exit (and let the + sb get destroyed) while the mount is still in use */ + server_wait_finished(mnt); + + /* Stop the servers (MDS, OSS) if no longer needed. We must wait + until the target is really gone so that our type refcount check + is right. */ + server_stop_servers(lddflags, lsiflags); + + LCONSOLE_WARN("server umount %s complete\n", tmpname); + OBD_FREE(tmpname, tmpname_sz); + EXIT; +} + +static void server_umount_begin(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + ENTRY; + + CDEBUG(D_MOUNT, "umount -f\n"); + /* umount = failover + umount -f = force + no third way to do non-force, non-failover */ + lsi->lsi_flags &= ~LSI_UMOUNT_FAILOVER; + lsi->lsi_flags |= LSI_UMOUNT_FORCE; + EXIT; +} + +static int server_statfs (struct super_block *sb, struct kstatfs *buf) +{ + struct vfsmount *mnt = s2lsi(sb)->lsi_srv_mnt; + ENTRY; + + if (mnt && mnt->mnt_sb && mnt->mnt_sb->s_op->statfs) { + int rc = mnt->mnt_sb->s_op->statfs(mnt->mnt_sb, buf); + if (!rc) { + buf->f_type = sb->s_magic; + RETURN(0); + } + } + + /* just return 0 */ + buf->f_type = sb->s_magic; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = 1; + buf->f_bfree = 0; + buf->f_bavail = 0; + buf->f_files = 1; + buf->f_ffree = 0; + buf->f_namelen = NAME_MAX; + RETURN(0); +} + +static struct super_operations server_ops = +{ + .put_super = server_put_super, + .umount_begin = server_umount_begin, /* umount -f */ + .statfs = server_statfs, +}; + +#define log2(n) ffz(~(n)) +#define LUSTRE_SUPER_MAGIC 0x0BD00BD1 + +static int server_fill_super_common(struct super_block *sb) +{ + struct inode *root = 0; + ENTRY; + + 
CDEBUG(D_MOUNT, "Server sb, dev=%d\n", (int)sb->s_dev); + + sb->s_blocksize = 4096; + sb->s_blocksize_bits = log2(sb->s_blocksize); + sb->s_magic = LUSTRE_SUPER_MAGIC; + sb->s_maxbytes = 0; //PAGE_CACHE_MAXBYTES; + sb->s_flags |= MS_RDONLY; + sb->s_op = &server_ops; + + root = new_inode(sb); + if (!root) { + CERROR("Can't make root inode\n"); + RETURN(-EIO); + } + + /* returns -EIO for every operation */ + /* make_bad_inode(root); -- badness - can't umount */ + /* apparently we need to be a directory for the mount to finish */ + root->i_mode = S_IFDIR; + + sb->s_root = d_alloc_root(root); + if (!sb->s_root) { + CERROR("Can't make root dentry\n"); + iput(root); + RETURN(-EIO); + } + + RETURN(0); +} + +static int server_fill_super(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct vfsmount *mnt; + int rc; + ENTRY; + + /* the One True Mount */ + mnt = server_kernel_mount(sb); + if (IS_ERR(mnt)) { + rc = PTR_ERR(mnt); + CERROR("Unable to mount device %s: %d\n", + lsi->lsi_lmd->lmd_dev, rc); + GOTO(out, rc); + } + lsi->lsi_srv_mnt = mnt; + + LASSERT(lsi->lsi_ldd); + CDEBUG(D_MOUNT, "Found service %s for fs '%s' on device %s\n", + lsi->lsi_ldd->ldd_svname, lsi->lsi_ldd->ldd_fsname, + lsi->lsi_lmd->lmd_dev); + + if (class_name2obd(lsi->lsi_ldd->ldd_svname)) { + LCONSOLE_ERROR("The target named %s is already running. 
" + "Double-mount may have compromised the disk " + "journal.\n", lsi->lsi_ldd->ldd_svname); + unlock_mntput(mnt); + lustre_put_lsi(sb); + GOTO(out, rc = -EALREADY); + } + + /* start MGS before MGC */ + if (IS_MGS(lsi->lsi_ldd)) { + rc = server_start_mgs(sb); + if (rc) { + CERROR("ignoring Failed MGS start!!\n"); + //GOTO(out_mnt, rc); + } + } + + rc = lustre_start_mgc(sb); + if (rc) + GOTO(out_mnt, rc); + + /* Set up all obd devices for service */ + if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC) && + (IS_OST(lsi->lsi_ldd) || IS_MDT(lsi->lsi_ldd))) { + rc = server_start_targets(sb, mnt); + if (rc < 0) { + CERROR("Unable to start targets: %d\n", rc); + GOTO(out_mnt, rc); + } + /* FIXME overmount client here, + or can we just start a client log and client_fill_super on this sb? + We need to make sure server_put_super gets called too - ll_put_super + calls lustre_common_put_super; check there for LSI_SERVER flag, + call s_p_s if so. + Probably should start client from new thread so we can return. + Client will not finish until all servers are connected. + Note - MGMT-only server does NOT get a client, since there is no + lustre fs associated - the MGMT is for all lustre fs's */ + } + + rc = server_fill_super_common(sb); + if (rc) + GOTO(out_mnt, rc); + + RETURN(0); + +out_mnt: + server_put_super(sb); +out: + RETURN(rc); +} + +/* Get the index from the obd name. 
+   rc = server type (LDD_F_SV_TYPE_MDT or LDD_F_SV_TYPE_OST), or
+   rc < 0  on error: -EINVAL when the name has no '-', when the text after
+           the '-' is neither "MDT" nor "OST", or when no hex index follows
+   if endptr isn't NULL it is set to the first character after the index
+   (only once the server type has been recognised)
+   *idx is written on success only */
+int server_name2index(char *svname, __u32 *idx, char **endptr)
+{
+        unsigned long index;
+        char *digits;
+        char *end;
+        char *dash = strchr(svname, '-');
+        int rc;
+
+        if (!dash) {
+                CERROR("Can't understand server name %s\n", svname);
+                return(-EINVAL);
+        }
+
+        if (strncmp(dash + 1, "MDT", 3) == 0)
+                rc = LDD_F_SV_TYPE_MDT;
+        else if (strncmp(dash + 1, "OST", 3) == 0)
+                rc = LDD_F_SV_TYPE_OST;
+        else
+                return(-EINVAL);
+
+        /* The three type letters matched, so dash + 4 is at worst the
+           terminating NUL. */
+        digits = dash + 4;
+        index = simple_strtoul(digits, &end, 16);
+        if (endptr)
+                *endptr = end;
+
+        /* Nothing consumed means there was no index at all; reporting
+           index 0 here would make "fs-OST" look like "fs-OST0000". */
+        if (end == digits) {
+                CERROR("No index in server name %s\n", svname);
+                return(-EINVAL);
+        }
+
+        *idx = index;
+        return rc;
+}
+
+/*************** mount common between server and client ***************/
+
+/* Common umount: drop this sb's use of the MGC, then drop the lsi.
+   -ENOENT and -EBUSY from lustre_stop_mgc are not fatal: the lsi is still
+   put and that rc is returned.  Any other error from lustre_stop_mgc is
+   returned immediately and the lsi is left alone. */
+int lustre_common_put_super(struct super_block *sb)
+{
+        int rc;
+        ENTRY;
+
+        CDEBUG(D_MOUNT, "dropping sb %p\n", sb);
+
+        rc = lustre_stop_mgc(sb);
+        if (rc && (rc != -ENOENT)) {
+                if (rc != -EBUSY) {
+                        CERROR("Can't stop MGC: %d\n", rc);
+                        RETURN(rc);
+                }
+                /* BUSY just means that there's some other obd that
+                   needs the mgc.  Let him clean it up. */
+                CDEBUG(D_MOUNT, "MGC still in use\n");
+        }
+        lustre_put_lsi(sb);
+        RETURN(rc);
+}
+
+/* Dump the parsed mount data through PRINT_CMD.  The profile is printed
+   for client mounts only, the options only when some were saved. */
+static void lmd_print(struct lustre_mount_data *lmd)
+{
+        int i;
+
+        PRINT_CMD(PRINT_MASK, " mount data:\n");
+        if (lmd_is_client(lmd))
+                PRINT_CMD(PRINT_MASK, "profile: %s\n", lmd->lmd_profile);
+        PRINT_CMD(PRINT_MASK, "device: %s\n", lmd->lmd_dev);
+        PRINT_CMD(PRINT_MASK, "flags: %x\n", lmd->lmd_flags);
+        if (lmd->lmd_opts)
+                PRINT_CMD(PRINT_MASK, "options: %s\n", lmd->lmd_opts);
+        for (i = 0; i < lmd->lmd_exclude_count; i++) {
+                PRINT_CMD(PRINT_MASK, "exclude %d: OST%04x\n", i,
+                          lmd->lmd_exclude[i]);
+        }
+}
+
+/* Is this server on the exclusion list
+   Returns 1 when svname parses as an OST whose index is in the sb's
+   lmd_exclude array, 0 in every other case (including names that
+   server_name2index does not classify as an OST). */
+int lustre_check_exclusion(struct super_block *sb, char *svname)
+{
+        struct lustre_sb_info *lsi = s2lsi(sb);
+        struct lustre_mount_data *lmd = lsi->lsi_lmd;
+        __u32 index;
+        int i, rc;
+        ENTRY;
+
+        rc = server_name2index(svname, &index, NULL);
+        if (rc != LDD_F_SV_TYPE_OST)
+                RETURN(0);
+
+        CDEBUG(D_MOUNT, "Check exclusion %s (%d) in %d of %s\n", svname,
+               index, lmd->lmd_exclude_count, lmd->lmd_dev);
+
+        for (i = 0; i < lmd->lmd_exclude_count; i++) {
+                if (index == lmd->lmd_exclude[i]) {
+                        CWARN("Excluding %s (on exclusion list)\n", svname);
+                        RETURN(1);
+                }
+        }
+        RETURN(0);
+}
+
+/* mount -v -o exclude=lustre-OST0001:lustre-OST0002 -t lustre ...
*/
+/* Parse the value of an "exclude=" mount option into lmd->lmd_exclude.
+   ptr points at the '='.  Names are separated by one character (':' in
+   the example above) and the list ends at ',', ' ' or end of string.
+   Only names that server_name2index reports as OSTs are recorded; other
+   names it accepts are skipped.  Indices are appended to whatever list an
+   earlier exclude= option already stored.
+   Returns 0 on success, -ENOMEM on allocation failure, or the negative rc
+   from server_name2index for a name it cannot parse. */
+static int lmd_make_exclusion(struct lustre_mount_data *lmd, char *ptr)
+{
+        char *s1 = ptr, *s2;
+        __u32 index, *exclude_list, *exclude_new;
+        int old_count = lmd->lmd_exclude_count;
+        int count = old_count;
+        int rc = 0;
+        ENTRY;
+
+        /* temp storage until we figure out how many we have */
+        OBD_ALLOC(exclude_list, sizeof(index) * MAX_OBD_DEVICES);
+        if (!exclude_list)
+                RETURN(-ENOMEM);
+
+        /* Carry over entries from a previous exclude= option so that the
+           list built below extends them instead of overwriting them. */
+        if (old_count > 0)
+                memcpy(exclude_list, lmd->lmd_exclude,
+                       sizeof(index) * old_count);
+
+        /* we enter this fn pointing at the '=' */
+        while (*s1 && *s1 != ' ' && *s1 != ',') {
+                /* check before storing: the temp array holds at most
+                   MAX_OBD_DEVICES entries */
+                if (count >= MAX_OBD_DEVICES)
+                        break;
+                s1++;
+                rc = server_name2index(s1, &index, &s2);
+                if (rc < 0) {
+                        CERROR("Can't parse %s\n", s1);
+                        break;
+                }
+                if (rc == LDD_F_SV_TYPE_OST)
+                        exclude_list[count++] = index;
+                else
+                        CDEBUG(D_MOUNT, "ignoring exclude %.7s\n", s1);
+                s1 = s2;
+                /* now we are pointing at ':' (next exclude)
+                   or ',' (end of excludes) */
+        }
+        if (rc >= 0) /* non-err */
+                rc = 0;
+
+        if (count > old_count) {
+                /* permanent, freed in lustre_free_lsi */
+                OBD_ALLOC(exclude_new, sizeof(index) * count);
+                if (exclude_new) {
+                        memcpy(exclude_new, exclude_list,
+                               sizeof(index) * count);
+                        if (old_count > 0)
+                                OBD_FREE(lmd->lmd_exclude,
+                                         sizeof(index) * old_count);
+                        lmd->lmd_exclude = exclude_new;
+                        lmd->lmd_exclude_count = count;
+                } else {
+                        /* leave the previously stored list (if any) and
+                           its count untouched so they still match */
+                        rc = -ENOMEM;
+                }
+        }
+        OBD_FREE(exclude_list, sizeof(index) * MAX_OBD_DEVICES);
+        RETURN(rc);
+}
+
+/* mount -v -t lustre uml1:uml2:/lustre-client /mnt/lustre */
+/* Fill in lmd from the option string handed to mount.
+   Recognised options: recov, norecov, nosvc, exclude=..., device=...
+   (device= must be last; the string is cut right before it).  A device
+   name containing ':' marks a client mount and sets lmd_profile to
+   "<text after the last ':' and any '/'>-client".  lmd_dev, lmd_profile
+   and lmd_opts are allocated here and freed in lustre_free_lsi.
+   Note that options is modified in place.
+   Returns 0 on success, -EINVAL for missing/old-style/invalid mount data,
+   -ENOMEM on allocation failure, or the error from lmd_make_exclusion. */
+static int lmd_parse(char *options, struct lustre_mount_data *lmd)
+{
+        char *s1, *s2, *devname = NULL;
+        struct lustre_mount_data *raw = (struct lustre_mount_data *)options;
+        size_t len;
+        int rc = 0;
+        ENTRY;
+
+        LASSERT(lmd);
+        if (!options) {
+                LCONSOLE_ERROR("Missing mount data: check that "
+                               "/sbin/mount.lustre is installed.\n");
+                RETURN(-EINVAL);
+        }
+
+        /* Options should be a string - try to detect old lmd data */
+        if ((raw->lmd_magic & 0xffffff00) == (LMD_MAGIC & 0xffffff00)) {
+                LCONSOLE_ERROR("You're using an old version of "
+                               "/sbin/mount.lustre. Please install version "
+                               "%s\n", LUSTRE_VERSION_STRING);
+                RETURN(-EINVAL);
+        }
+        lmd->lmd_magic = LMD_MAGIC;
+
+        /* Default flags */
+        lmd->lmd_flags |= LMD_FLG_RECOVER;
+
+        s1 = options;
+        while (*s1) {
+                /* Skip whitespace and extra commas */
+                while (*s1 == ' ' || *s1 == ',')
+                        s1++;
+
+                /* Client options are parsed in ll_options: eg. flock,
+                   user_xattr, acl */
+
+                if (strncmp(s1, "recov", 5) == 0) {
+                        /* FIXME do something with the RECOVER flag - see lconf */
+                        lmd->lmd_flags |= LMD_FLG_RECOVER;
+                } else if (strncmp(s1, "norecov", 7) == 0) {
+                        lmd->lmd_flags &= ~LMD_FLG_RECOVER;
+                } else if (strncmp(s1, "nosvc", 5) == 0) {
+                        lmd->lmd_flags |= LMD_FLG_NOSVC;
+                } else if (strncmp(s1, "exclude=", 8) == 0) {
+                        /* ost exclusion list; s1 + 7 is the '=' */
+                        rc = lmd_make_exclusion(lmd, s1 + 7);
+                        if (rc)
+                                goto invalid;
+                } else if (strncmp(s1, "device=", 7) == 0) {
+                        /* Linux 2.4 doesn't pass the device, so we stuck it
+                           at the end of the options. */
+                        devname = s1 + 7;
+                        /* terminate options right before device.  device
+                           must be the last one. */
+                        *s1 = 0;
+                        break;
+                }
+
+                /* Find next opt */
+                s2 = strchr(s1, ',');
+                if (s2 == NULL)
+                        break;
+                s1 = s2 + 1;
+        }
+
+        if (!devname) {
+                LCONSOLE_ERROR("Can't find the device name "
+                               "(need mount option 'device=...')\n");
+                rc = -EINVAL;
+                goto invalid;
+        }
+
+        s1 = strrchr(devname, ':');
+        if (s1) {
+                /* OR the flag in: a plain assignment would throw away the
+                   flags set by the options parsed above */
+                lmd->lmd_flags |= LMD_FLG_CLIENT;
+                /* Remove leading /s from fsname */
+                while (*++s1 == '/') ;
+                /* Freed in lustre_free_lsi; 8 = strlen("-client") + NUL */
+                OBD_ALLOC(lmd->lmd_profile, strlen(s1) + 8);
+                if (!lmd->lmd_profile)
+                        RETURN(-ENOMEM);
+                sprintf(lmd->lmd_profile, "%s-client", s1);
+        }
+
+        /* Freed in lustre_free_lsi */
+        OBD_ALLOC(lmd->lmd_dev, strlen(devname) + 1);
+        if (!lmd->lmd_dev)
+                RETURN(-ENOMEM);
+        strcpy(lmd->lmd_dev, devname);
+
+        /* Save mount options, minus trailing separators.  Work from the
+           length so an empty string never yields a pointer before the
+           start of the buffer. */
+        len = strlen(options);
+        while (len > 0 &&
+               (options[len - 1] == ',' || options[len - 1] == ' '))
+                options[--len] = 0;
+        if (len > 0) {
+                /* Freed in lustre_free_lsi */
+                OBD_ALLOC(lmd->lmd_opts, len + 1);
+                if (!lmd->lmd_opts)
+                        RETURN(-ENOMEM);
+                strcpy(lmd->lmd_opts, options);
+        }
+
+        lmd_print(lmd);
+        RETURN(rc);
+
+invalid:
+        CERROR("Bad mount options %s\n", options);
+        RETURN(rc);
+}
+
+
+/* Common mount */
+/* Entry point for both client and server mounts: allocates the lsi,
+   parses the mount options, then hands off to the registered client
+   fill-super (after starting the MGC) or to server_fill_super.
+   Returns 0 on success or a negative errno.  The lsi reference taken by
+   lustre_init_lsi is dropped here on every failure that happens before
+   the client/server fill-super is called; after that point the callee
+   owns the cleanup. */
+int lustre_fill_super(struct super_block *sb, void *data, int silent)
+{
+        struct lustre_mount_data *lmd;
+        struct lustre_sb_info *lsi;
+        int rc;
+        ENTRY;
+
+        CDEBUG(D_MOUNT|D_VFSTRACE, "VFS Op: sb %p\n", sb);
+
+        lsi = lustre_init_lsi(sb);
+        if (!lsi)
+                RETURN(-ENOMEM);
+        lmd = lsi->lsi_lmd;
+
+        /* Figure out the lmd from the mount options */
+        rc = lmd_parse((char *)data, lmd);
+        if (rc) {
+                lustre_put_lsi(sb);
+                RETURN(rc);
+        }
+
+        if (lmd_is_client(lmd)) {
+                CDEBUG(D_MOUNT, "Mounting client %s\n", lmd->lmd_profile);
+                if (!client_fill_super) {
+                        LCONSOLE_ERROR("Nothing registered for client mount!"
+                                       " Is llite module loaded?\n");
+                        rc = -ENODEV;
+                        /* nobody else will drop the setup reference */
+                        lustre_put_lsi(sb);
+                } else {
+                        rc = lustre_start_mgc(sb);
+                        if (rc) {
+                                /* c_f_s never ran, so drop the lsi here */
+                                lustre_put_lsi(sb);
+                                goto out;
+                        }
+                        /* Connect and start */
+                        /* (should always be ll_fill_super) */
+                        rc = (*client_fill_super)(sb);
+                        /* c_f_s will call lustre_common_put_super on failure */
+
+                }
+        } else {
+                CDEBUG(D_MOUNT, "Mounting server from %s\n", lmd->lmd_dev);
+                lsi->lsi_flags |= LSI_SERVER;
+                rc = server_fill_super(sb);
+                /* s_f_s calls lustre_start_mgc after the mount because we need
+                   the MGS nids which are stored on disk.  Plus, we may
+                   need to start the MGS first. */
+                /* s_f_s will call server_put_super on failure */
+        }
+
+out:
+        if (rc){
+                /* lmd is gone once the lsi has been freed, hence the
+                   s2lsi() check before touching it */
+                CERROR("Unable to mount %s\n",
+                       s2lsi(sb) ? lmd->lmd_dev : "");
+        } else {
+                LCONSOLE_WARN("mount %s complete\n", lmd->lmd_dev);
+        }
+        RETURN(rc);
+}
+
+
+/* We can't call ll_fill_super by name because it lives in a module that
+   must be loaded after this one. */
+void lustre_register_client_fill_super(int (*cfs)(struct super_block *sb))
+{
+        client_fill_super = cfs;
+}
+
+/***************** FS registration ******************/
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+/* 2.5 and later */
+struct super_block * lustre_get_sb(struct file_system_type *fs_type,
+                               int flags, const char *devname, void * data)
+{
+        /* calls back in fill super */
+        /* we could append devname= onto options (*data) here,
+           but 2.4 doesn't get devname.  So we do it in mount_lustre.c */
+        return get_sb_nodev(fs_type, flags, data, lustre_fill_super);
+}
+
+struct file_system_type lustre_fs_type = {
+        .owner        = THIS_MODULE,
+        .name         = "lustre",
+        .get_sb       = lustre_get_sb,
+        .kill_sb      = kill_anon_super,
+        .fs_flags     = FS_BINARY_MOUNTDATA,
+};
+
+#else
+/* 2.4 */
+static struct super_block *lustre_read_super(struct super_block *sb,
+                                             void *data, int silent)
+{
+        int rc;
+        ENTRY;
+
+        rc = lustre_fill_super(sb, data, silent);
+        if (rc)
+                RETURN(NULL);
+        RETURN(sb);
+}
+
+static struct file_system_type lustre_fs_type = {
+        .owner          = THIS_MODULE,
+        .name           = "lustre",
+        .fs_flags       = FS_NFSEXP_FSID,
+        .read_super     = lustre_read_super,
+};
+#endif
+
+/* Register / unregister the "lustre" filesystem type with the VFS. */
+int lustre_register_fs(void)
+{
+        return register_filesystem(&lustre_fs_type);
+}
+
+int lustre_unregister_fs(void)
+{
+        return unregister_filesystem(&lustre_fs_type);
+}
+
+EXPORT_SYMBOL(lustre_register_client_fill_super);
+EXPORT_SYMBOL(lustre_common_put_super);
+EXPORT_SYMBOL(lustre_process_log);
+EXPORT_SYMBOL(lustre_end_log);
+EXPORT_SYMBOL(server_get_mount);
+EXPORT_SYMBOL(server_put_mount);
+EXPORT_SYMBOL(server_register_target);
+EXPORT_SYMBOL(server_name2index);
+EXPORT_SYMBOL(server_mti_print);
+EXPORT_SYMBOL(class_find_param);
+EXPORT_SYMBOL(class_match_param);
+EXPORT_SYMBOL(class_parse_nid);
+
+
diff --git a/lustre/obdclass/uuid.c b/lustre/obdclass/uuid.c
index eb85aea..09302bd 100644
--- a/lustre/obdclass/uuid.c
+++ b/lustre/obdclass/uuid.c
@@ -20,117 +20,142 @@
 #include

 struct uuid {
-	__u32	time_low;
-	__u16	time_mid;
-	__u16	time_hi_and_version;
-	__u16	clock_seq;
-	__u8	node[6];
+        __u32   time_low;
+        __u16   time_mid;
+        __u16   time_hi_and_version;
+        __u16   clock_seq;
+        __u8    node[6];
 };

 static void uuid_unpack(class_uuid_t in, struct uuid *uu)
 {
-	__u8	*ptr = in;
-	__u32	tmp;
+        __u8    *ptr = in;
+        __u32   tmp;

-	tmp = *ptr++;
-	tmp = (tmp << 8) | *ptr++;
-	tmp = (tmp << 8) | *ptr++;
-	tmp = (tmp << 8) | *ptr++;
-	uu->time_low = tmp;
+        tmp = *ptr++;
+        tmp = (tmp << 8) | *ptr++;
+        tmp = (tmp << 8) | *ptr++;
+        tmp = (tmp << 8) | *ptr++;
+        uu->time_low = tmp;

-	tmp = *ptr++;
-	tmp = (tmp << 8) | *ptr++;
-	uu->time_mid = tmp;
+        tmp = *ptr++;
+        tmp = (tmp << 8) | *ptr++;
+        uu->time_mid = tmp;

-	tmp = *ptr++;
-	tmp = (tmp << 8) | *ptr++;
-	uu->time_hi_and_version = tmp;
+        tmp = *ptr++;
+        tmp = (tmp << 8) | *ptr++;
+        uu->time_hi_and_version = tmp;

-	tmp = *ptr++;
-	tmp = (tmp << 8) | *ptr++;
-	uu->clock_seq = tmp;
+        tmp = *ptr++;
+        tmp = (tmp << 8) | *ptr++;
+        uu->clock_seq = tmp;

-	memcpy(uu->node, ptr, 6);
+        memcpy(uu->node, ptr, 6);
 }

 #if 0
 static void uuid_pack(struct uuid *uu, class_uuid_t ptr)
 {
-	__u32 tmp;
-	unsigned char *out = ptr;
-
-	tmp = uu->time_low;
-	out[3] = (unsigned char) tmp;
-	tmp >>= 8;
-	out[2] = (unsigned char) tmp;
-	tmp >>= 8;
-	out[1] = (unsigned char) tmp;
-	tmp >>= 8;
-	out[0] = (unsigned char) tmp;
-
-	tmp = uu->time_mid;
-	out[5] = (unsigned char) tmp;
-	tmp >>= 8;
-	out[4] = (unsigned char) tmp;
-
-	tmp = uu->time_hi_and_version;
-	out[7] = (unsigned char) tmp;
-	tmp >>= 8;
-	out[6] = (unsigned char) tmp;
-
-	tmp = uu->clock_seq;
-	out[9] = (unsigned char) tmp;
-	tmp >>= 8;
-	out[8] = (unsigned char) tmp;
-
-	memcpy(out+10, uu->node, 6);
+        __u32 tmp;
+        unsigned char *out = ptr;
+
+        tmp = uu->time_low;
+        out[3] = (unsigned char) tmp;
+        tmp >>= 8;
+        out[2] = (unsigned char) tmp;
+        tmp >>= 8;
+        out[1] = (unsigned char) tmp;
+        tmp >>= 8;
+        out[0] = (unsigned char) tmp;
+
+        tmp = uu->time_mid;
+        out[5] = (unsigned char) tmp;
+        tmp >>= 8;
+        out[4] = (unsigned char) tmp;
+
+        tmp = uu->time_hi_and_version;
+        out[7] = (unsigned char) tmp;
+        tmp >>= 8;
+        out[6] = (unsigned char) tmp;
+
+        tmp = uu->clock_seq;
+        out[9] = (unsigned char) tmp;
+        tmp >>= 8;
+        out[8] = (unsigned char) tmp;
+
+        memcpy(out+10, uu->node, 6);
 }

 int class_uuid_parse(struct obd_uuid in, class_uuid_t uu)
 {
-	struct uuid uuid;
-	int i;
-	char *cp, buf[3];
-
-	if (strlen(in) != 36)
-		return -1;
-	for (i=0, cp = in; i <= 36; i++,cp++) {
-		if ((i == 8) || (i == 13) || (i == 18) ||
-		    (i == 23))
-			if (*cp == '-')
-				continue;
-		if (i== 36)
-			if (*cp == 0)
-				continue;
-		if (!isxdigit(*cp))
-			return -1;
-	}
-	uuid.time_low = simple_strtoul(in, NULL, 16);
-	uuid.time_mid = simple_strtoul(in+9, NULL, 16);
-	uuid.time_hi_and_version = simple_strtoul(in+14, NULL, 16);
-	uuid.clock_seq = simple_strtoul(in+19, NULL, 16);
-	cp = in+24;
-	buf[2] = 0;
-	for (i=0; i < 6; i++) {
-		buf[0] = *cp++;
-		buf[1] = *cp++;
-		uuid.node[i] = simple_strtoul(buf, NULL, 16);
-	}
-
-	uuid_pack(&uuid, uu);
-	return 0;
+        struct uuid uuid;
+        int i;
+        char *cp, buf[3];
+
+        if (strlen(in) != 36)
+                return -1;
+        for (i=0, cp = in; i <= 36; i++,cp++) {
+                if ((i == 8) || (i == 13) || (i == 18) ||
+                    (i == 23))
+                        if (*cp == '-')
+                                continue;
+                if (i== 36)
+                        if (*cp == 0)
+                                continue;
+                if (!isxdigit(*cp))
+                        return -1;
+        }
+        uuid.time_low = simple_strtoul(in, NULL, 16);
+        uuid.time_mid = simple_strtoul(in+9, NULL, 16);
+        uuid.time_hi_and_version = simple_strtoul(in+14, NULL, 16);
+        uuid.clock_seq = simple_strtoul(in+19, NULL, 16);
+        cp = in+24;
+        buf[2] = 0;
+        for (i=0; i < 6; i++) {
+                buf[0] = *cp++;
+                buf[1] = *cp++;
+                uuid.node[i] = simple_strtoul(buf, NULL, 16);
+        }
+
+        uuid_pack(&uuid, uu);
+        return 0;
 }
 #endif
+
+void generate_random_uuid(unsigned char uuid_out[16]);
+
+/* We need to have some extra twiddling here because some systems have
+ * no random state when they start up.
*/ +void class_generate_random_uuid(class_uuid_t uuid) +{ + struct timeval t; + int *i, j, k; + + LASSERT(sizeof(class_uuid_t) % sizeof(*i) == 0); + + j = jiffies; + do_gettimeofday(&t); + k = t.tv_usec; + + generate_random_uuid(uuid); + + for (i = (int *)uuid; (char *)i < (char *)uuid + sizeof(class_uuid_t); i++) { + *i ^= j ^ k; + j = ((j << 8) & 0xffffff00) | ((j >> 24) & 0x000000ff); + k = ((k >> 8) & 0x00ffffff) | ((k << 24) & 0xff000000); + } +} + void class_uuid_unparse(class_uuid_t uu, struct obd_uuid *out) { - struct uuid uuid; - - uuid_unpack(uu, &uuid); - sprintf(out->uuid, - "%08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x", - uuid.time_low, uuid.time_mid, uuid.time_hi_and_version, - uuid.clock_seq >> 8, uuid.clock_seq & 0xFF, - uuid.node[0], uuid.node[1], uuid.node[2], - uuid.node[3], uuid.node[4], uuid.node[5]); + struct uuid uuid; + + uuid_unpack(uu, &uuid); + sprintf(out->uuid, + "%08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x", + uuid.time_low, uuid.time_mid, uuid.time_hi_and_version, + uuid.clock_seq >> 8, uuid.clock_seq & 0xFF, + uuid.node[0], uuid.node[1], uuid.node[2], + uuid.node[3], uuid.node[4], uuid.node[5]); } diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c index 458c355..a7c115b 100644 --- a/lustre/obdfilter/filter.c +++ b/lustre/obdfilter/filter.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include @@ -89,15 +90,15 @@ int filter_finish_transno(struct obd_export *exp, struct obd_trans_info *oti, /* we don't allocate new transnos for replayed requests */ if (oti->oti_transno == 0) { spin_lock(&filter->fo_translock); - last_rcvd = le64_to_cpu(filter->fo_fsd->fsd_last_transno) + 1; - filter->fo_fsd->fsd_last_transno = cpu_to_le64(last_rcvd); + last_rcvd = le64_to_cpu(filter->fo_fsd->lsd_last_transno) + 1; + filter->fo_fsd->lsd_last_transno = cpu_to_le64(last_rcvd); spin_unlock(&filter->fo_translock); oti->oti_transno = last_rcvd; } else { spin_lock(&filter->fo_translock); last_rcvd = 
oti->oti_transno; - if (last_rcvd > le64_to_cpu(filter->fo_fsd->fsd_last_transno)) - filter->fo_fsd->fsd_last_transno = + if (last_rcvd > le64_to_cpu(filter->fo_fsd->lsd_last_transno)) + filter->fo_fsd->lsd_last_transno = cpu_to_le64(last_rcvd); spin_unlock(&filter->fo_translock); } @@ -181,8 +182,8 @@ static int filter_client_add(struct obd_device *obd, struct filter_obd *filter, } fed->fed_lr_idx = cl_idx; - fed->fed_lr_off = le32_to_cpu(filter->fo_fsd->fsd_client_start) + - cl_idx * le16_to_cpu(filter->fo_fsd->fsd_client_size); + fed->fed_lr_off = le32_to_cpu(filter->fo_fsd->lsd_client_start) + + cl_idx * le16_to_cpu(filter->fo_fsd->lsd_client_size); LASSERTF(fed->fed_lr_off > 0, "fed_lr_off = %llu\n", fed->fed_lr_off); CDEBUG(D_INFO, "client at index %d (%llu) with UUID '%s' added\n", @@ -317,21 +318,21 @@ static int filter_free_server_data(struct filter_obd *filter) /* assumes caller is already in kernel ctxt */ int filter_update_server_data(struct obd_device *obd, struct file *filp, - struct filter_server_data *fsd, int force_sync) + struct lr_server_data *fsd, int force_sync) { loff_t off = 0; int rc; ENTRY; - CDEBUG(D_INODE, "server uuid : %s\n", fsd->fsd_uuid); + CDEBUG(D_INODE, "server uuid : %s\n", fsd->lsd_uuid); CDEBUG(D_INODE, "server last_rcvd : "LPU64"\n", - le64_to_cpu(fsd->fsd_last_transno)); + le64_to_cpu(fsd->lsd_last_transno)); CDEBUG(D_INODE, "server last_mount: "LPU64"\n", - le64_to_cpu(fsd->fsd_mount_count)); + le64_to_cpu(fsd->lsd_mount_count)); rc = fsfilt_write_record(obd, filp, fsd, sizeof(*fsd), &off,force_sync); if (rc) - CERROR("error writing filter_server_data: rc = %d\n", rc); + CERROR("error writing lr_server_data: rc = %d\n", rc); RETURN(rc); } @@ -367,7 +368,7 @@ int filter_update_last_objid(struct obd_device *obd, obd_gr group, static int filter_init_server_data(struct obd_device *obd, struct file * filp) { struct filter_obd *filter = &obd->u.filter; - struct filter_server_data *fsd; + struct lr_server_data *fsd; struct 
filter_client_data *fcd = NULL; struct inode *inode = filp->f_dentry->d_inode; unsigned long last_rcvd_size = inode->i_size; @@ -377,9 +378,9 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp) int rc; /* ensure padding in the struct is the correct size */ - CLASSERT(offsetof(struct filter_server_data, fsd_padding) + - sizeof(fsd->fsd_padding) == LR_SERVER_SIZE); - CLASSERT(offsetof(struct filter_client_data, fcd_padding) + + CLASSERT (offsetof(struct lr_server_data, lsd_padding) + + sizeof(fsd->lsd_padding) == LR_SERVER_SIZE); + CLASSERT (offsetof(struct filter_client_data, fcd_padding) + sizeof(fcd->fcd_padding) == LR_CLIENT_SIZE); OBD_ALLOC(fsd, sizeof(*fsd)); @@ -394,16 +395,17 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp) } if (last_rcvd_size == 0) { - CWARN("%s: initializing new %s\n", obd->obd_name, LAST_RCVD); - - memcpy(fsd->fsd_uuid, obd->obd_uuid.uuid,sizeof(fsd->fsd_uuid)); - fsd->fsd_last_transno = 0; - mount_count = fsd->fsd_mount_count = 0; - fsd->fsd_server_size = cpu_to_le32(LR_SERVER_SIZE); - fsd->fsd_client_start = cpu_to_le32(LR_CLIENT_START); - fsd->fsd_client_size = cpu_to_le16(LR_CLIENT_SIZE); - fsd->fsd_subdir_count = cpu_to_le16(FILTER_SUBDIR_COUNT); + LCONSOLE_WARN("%s: new disk, initializing\n", obd->obd_name); + + memcpy(fsd->lsd_uuid, obd->obd_uuid.uuid,sizeof(fsd->lsd_uuid)); + fsd->lsd_last_transno = 0; + mount_count = fsd->lsd_mount_count = 0; + fsd->lsd_server_size = cpu_to_le32(LR_SERVER_SIZE); + fsd->lsd_client_start = cpu_to_le32(LR_CLIENT_START); + fsd->lsd_client_size = cpu_to_le16(LR_CLIENT_SIZE); + fsd->lsd_subdir_count = cpu_to_le16(FILTER_SUBDIR_COUNT); filter->fo_subdir_count = FILTER_SUBDIR_COUNT; + fsd->lsd_feature_incompat = cpu_to_le32(OBD_INCOMPAT_OST); } else { rc = fsfilt_read_record(obd, filp, fsd, sizeof(*fsd), &off); if (rc) { @@ -411,54 +413,54 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp) LAST_RCVD, rc); 
GOTO(err_fsd, rc); } - if (strcmp(fsd->fsd_uuid, obd->obd_uuid.uuid) != 0) { + if (strcmp(fsd->lsd_uuid, obd->obd_uuid.uuid) != 0) { LCONSOLE_ERROR("Trying to start OBD %s using the wrong" " disk %s. Were the /dev/ assignments " "rearranged?\n", - obd->obd_uuid.uuid, fsd->fsd_uuid); + obd->obd_uuid.uuid, fsd->lsd_uuid); GOTO(err_fsd, rc = -EINVAL); } - mount_count = le64_to_cpu(fsd->fsd_mount_count); - filter->fo_subdir_count = le16_to_cpu(fsd->fsd_subdir_count); + mount_count = le64_to_cpu(fsd->lsd_mount_count); + filter->fo_subdir_count = le16_to_cpu(fsd->lsd_subdir_count); } - if (fsd->fsd_feature_incompat & ~cpu_to_le32(FILTER_INCOMPAT_SUPP)) { + if (fsd->lsd_feature_incompat & ~cpu_to_le32(FILTER_INCOMPAT_SUPP)) { CERROR("%s: unsupported incompat filesystem feature(s) %x\n", - obd->obd_name, le32_to_cpu(fsd->fsd_feature_incompat) & + obd->obd_name, le32_to_cpu(fsd->lsd_feature_incompat) & ~FILTER_INCOMPAT_SUPP); GOTO(err_fsd, rc = -EINVAL); } - if (fsd->fsd_feature_rocompat & ~cpu_to_le32(FILTER_ROCOMPAT_SUPP)) { + if (fsd->lsd_feature_rocompat & ~cpu_to_le32(FILTER_ROCOMPAT_SUPP)) { CERROR("%s: unsupported read-only filesystem feature(s) %x\n", - obd->obd_name, le32_to_cpu(fsd->fsd_feature_rocompat) & + obd->obd_name, le32_to_cpu(fsd->lsd_feature_rocompat) & ~FILTER_ROCOMPAT_SUPP); /* Do something like remount filesystem read-only */ GOTO(err_fsd, rc = -EINVAL); } CDEBUG(D_INODE, "%s: server last_rcvd : "LPU64"\n", - obd->obd_name, le64_to_cpu(fsd->fsd_last_transno)); + obd->obd_name, le64_to_cpu(fsd->lsd_last_transno)); CDEBUG(D_INODE, "%s: server mount_count: "LPU64"\n", obd->obd_name, mount_count + 1); CDEBUG(D_INODE, "%s: server data size: %u\n", - obd->obd_name, le32_to_cpu(fsd->fsd_server_size)); + obd->obd_name, le32_to_cpu(fsd->lsd_server_size)); CDEBUG(D_INODE, "%s: per-client data start: %u\n", - obd->obd_name, le32_to_cpu(fsd->fsd_client_start)); + obd->obd_name, le32_to_cpu(fsd->lsd_client_start)); CDEBUG(D_INODE, "%s: per-client data size: %u\n", 
- obd->obd_name, le32_to_cpu(fsd->fsd_client_size)); + obd->obd_name, le32_to_cpu(fsd->lsd_client_size)); CDEBUG(D_INODE, "%s: server subdir_count: %u\n", - obd->obd_name, le16_to_cpu(fsd->fsd_subdir_count)); + obd->obd_name, le16_to_cpu(fsd->lsd_subdir_count)); CDEBUG(D_INODE, "%s: last_rcvd clients: %lu\n", obd->obd_name, - last_rcvd_size <= le32_to_cpu(fsd->fsd_client_start) ? 0 : - (last_rcvd_size - le32_to_cpu(fsd->fsd_client_start)) / - le16_to_cpu(fsd->fsd_client_size)); + last_rcvd_size <= le32_to_cpu(fsd->lsd_client_start) ? 0 : + (last_rcvd_size - le32_to_cpu(fsd->lsd_client_start)) / + le16_to_cpu(fsd->lsd_client_size)); if (!obd->obd_replayable) { CWARN("%s: recovery support OFF\n", obd->obd_name); GOTO(out, rc = 0); } - for (cl_idx = 0, off = le32_to_cpu(fsd->fsd_client_start); + for (cl_idx = 0, off = le32_to_cpu(fsd->lsd_client_start); off < last_rcvd_size; cl_idx++) { __u64 last_rcvd; struct obd_export *exp; @@ -472,9 +474,9 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp) /* Don't assume off is incremented properly by * fsfilt_read_record(), in case sizeof(*fcd) - * isn't the same as fsd->fsd_client_size. */ - off = le32_to_cpu(fsd->fsd_client_start) + - cl_idx * le16_to_cpu(fsd->fsd_client_size); + * isn't the same as fsd->lsd_client_size. 
*/ + off = le32_to_cpu(fsd->lsd_client_start) + + cl_idx * le16_to_cpu(fsd->lsd_client_size); rc = fsfilt_read_record(obd, filp, fcd, sizeof(*fcd), &off); if (rc) { CERROR("error reading FILT %s idx %d off %llu: rc %d\n", @@ -496,7 +498,7 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp) exp = class_new_export(obd, (struct obd_uuid *)fcd->fcd_uuid); CDEBUG(D_HA, "RCVRNG CLIENT uuid: %s idx: %d lr: "LPU64 " srv lr: "LPU64"\n", fcd->fcd_uuid, cl_idx, - last_rcvd, le64_to_cpu(fsd->fsd_last_transno)); + last_rcvd, le64_to_cpu(fsd->lsd_last_transno)); if (IS_ERR(exp)) GOTO(err_client, rc = PTR_ERR(exp)); @@ -516,21 +518,21 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp) CDEBUG(D_OTHER, "client at idx %d has last_rcvd = "LPU64"\n", cl_idx, last_rcvd); - if (last_rcvd > le64_to_cpu(fsd->fsd_last_transno)) - fsd->fsd_last_transno = cpu_to_le64(last_rcvd); + if (last_rcvd > le64_to_cpu(fsd->lsd_last_transno)) + fsd->lsd_last_transno = cpu_to_le64(last_rcvd); } if (fcd) OBD_FREE(fcd, sizeof(*fcd)); - obd->obd_last_committed = le64_to_cpu(fsd->fsd_last_transno); + obd->obd_last_committed = le64_to_cpu(fsd->lsd_last_transno); if (obd->obd_recoverable_clients) { CWARN("RECOVERY: service %s, %d recoverable clients, " "last_rcvd "LPU64"\n", obd->obd_name, obd->obd_recoverable_clients, - le64_to_cpu(fsd->fsd_last_transno)); + le64_to_cpu(fsd->lsd_last_transno)); obd->obd_next_recovery_transno = obd->obd_last_committed + 1; obd->obd_recovering = 1; obd->obd_recovery_start = CURRENT_SECONDS; @@ -541,7 +543,7 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp) out: filter->fo_mount_count = mount_count + 1; - fsd->fsd_mount_count = cpu_to_le64(filter->fo_mount_count); + fsd->lsd_mount_count = cpu_to_le64(filter->fo_mount_count); /* save it, so mount count and last_transno is current */ rc = filter_update_server_data(obd, filp, filter->fo_fsd, 1); @@ -659,7 +661,7 @@ static int 
filter_prep_groups(struct obd_device *obd) CERROR("error renaming O/R to O/0: rc %d\n", rc); GOTO(cleanup_O0, rc); } - filter->fo_fsd->fsd_feature_incompat |= + filter->fo_fsd->lsd_feature_incompat |= cpu_to_le32(OBD_INCOMPAT_GROUPS); rc = filter_update_server_data(obd, filter->fo_rcvd_filp, filter->fo_fsd, 1); @@ -1393,6 +1395,7 @@ int filter_common_setup(struct obd_device *obd, obd_count len, void *buf, struct lustre_cfg* lcfg = buf; struct filter_obd *filter = &obd->u.filter; struct vfsmount *mnt; + struct lustre_mount_info *lmi; struct obd_uuid uuid; __u8 *uuid_ptr; char *str, *label; @@ -1405,25 +1408,38 @@ int filter_common_setup(struct obd_device *obd, obd_count len, void *buf, LUSTRE_CFG_BUFLEN(lcfg, 2) < 1) RETURN(-EINVAL); - obd->obd_fsops = fsfilt_get_ops(lustre_cfg_string(lcfg, 2)); + lmi = server_get_mount(obd->obd_name); + if (lmi) { + /* We already mounted in lustre_fill_super. + lcfg bufs 1, 2, 4 (device, fstype, mount opts) are ignored.*/ + struct lustre_sb_info *lsi = s2lsi(lmi->lmi_sb); + mnt = lmi->lmi_mnt; + obd->obd_fsops = fsfilt_get_ops(MT_STR(lsi->lsi_ldd)); + } else { + /* old path - used by lctl */ + CERROR("Using old MDS mount method\n"); + mnt = do_kern_mount(lustre_cfg_string(lcfg, 2), + MS_NOATIME|MS_NODIRATIME, + lustre_cfg_string(lcfg, 1), option); + if (IS_ERR(mnt)) { + rc = PTR_ERR(mnt); + LCONSOLE_ERROR("Can't mount disk %s (%d)\n", + lustre_cfg_string(lcfg, 1), rc); + RETURN(rc); + } + + obd->obd_fsops = fsfilt_get_ops(lustre_cfg_string(lcfg, 2)); + } if (IS_ERR(obd->obd_fsops)) - RETURN(PTR_ERR(obd->obd_fsops)); + GOTO(err_mntput, rc = PTR_ERR(obd->obd_fsops)); rc = filter_iobuf_pool_init(filter); if (rc != 0) GOTO(err_ops, rc); - mnt = do_kern_mount(lustre_cfg_string(lcfg, 2),MS_NOATIME|MS_NODIRATIME, - lustre_cfg_string(lcfg, 1), option); - if (IS_ERR(mnt)) { - rc = PTR_ERR(mnt); - LCONSOLE_ERROR("Can't mount disk %s (%d)\n", - lustre_cfg_string(lcfg, 1), rc); - GOTO(err_ops, rc); - } - 
LASSERT(!lvfs_check_rdonly(lvfs_sbdev(mnt->mnt_sb))); + /* failover is the default */ obd->obd_replayable = 1; if (lcfg->lcfg_bufcount > 3 && LUSTRE_CFG_BUFLEN(lcfg, 3) > 0) { @@ -1447,7 +1463,7 @@ int filter_common_setup(struct obd_device *obd, obd_count len, void *buf, rc = filter_prep(obd); if (rc) - GOTO(err_mntput, rc); + GOTO(err_ops, rc); filter->fo_destroy_in_progress = 0; sema_init(&filter->fo_create_lock, 1); @@ -1498,10 +1514,11 @@ int filter_common_setup(struct obd_device *obd, obd_count len, void *buf, } else { str = "no UUID"; } - label = fsfilt_label(obd, obd->u.obt.obt_sb); + + label = fsfilt_get_label(obd, obd->u.obt.obt_sb); if (obd->obd_recovering) { - LCONSOLE_WARN("OST %s now serving %s (%s%s%s), but will be in" + LCONSOLE_WARN("OST %s now serving %s (%s%s%s), but will be in " "recovery until %d %s reconnect, or if no clients" " reconnect for %d:%.02d; during that time new " "clients will not be allowed to connect. " @@ -1526,14 +1543,19 @@ int filter_common_setup(struct obd_device *obd, obd_count len, void *buf, err_post: filter_post(obd); -err_mntput: - unlock_kernel(); - mntput(mnt); - obd->u.obt.obt_sb = 0; - lock_kernel(); err_ops: fsfilt_put_ops(obd->obd_fsops); filter_iobuf_pool_done(filter); +err_mntput: + if (lmi) { + server_put_mount(obd->obd_name, mnt); + } else { + /* old method */ + unlock_kernel(); + mntput(mnt); + lock_kernel(); + } + obd->u.obt.obt_sb = 0; return rc; } @@ -1654,12 +1676,12 @@ static int filter_cleanup(struct obd_device *obd) { struct filter_obd *filter = &obd->u.filter; lvfs_sbdev_type save_dev; - int must_relock = 0; + int must_relock = 0, must_put = 0; ENTRY; if (obd->obd_fail) - CERROR("%s: shutting down for failover; client state will" - " be preserved.\n", obd->obd_name); + LCONSOLE_WARN("%s: shutting down for failover; client state " + "will be preserved.\n", obd->obd_name); if (!list_empty(&obd->obd_exports)) { CERROR("%s: still has clients!\n", obd->obd_name); @@ -1687,10 +1709,8 @@ static int 
filter_cleanup(struct obd_device *obd) LL_DQUOT_OFF(obd->u.obt.obt_sb); - if (atomic_read(&filter->fo_vfsmnt->mnt_count) > 1) - CERROR("%s: mount point %p busy, mnt_count: %d\n", - obd->obd_name, filter->fo_vfsmnt, - atomic_read(&filter->fo_vfsmnt->mnt_count)); + must_put = server_put_mount(obd->obd_name, filter->fo_vfsmnt); + /* must_put is for old method (l_p_m returns non-0 on err) */ /* We can only unlock kernel if we are in the context of sys_ioctl, otherwise we never called lock_kernel */ @@ -1698,9 +1718,10 @@ static int filter_cleanup(struct obd_device *obd) unlock_kernel(); must_relock++; } - - mntput(filter->fo_vfsmnt); - //destroy_buffers(obd->u.obt.obt_sb->s_dev); + + if (must_put) + /* In case we didn't mount with lustre_get_mount -- old method*/ + mntput(filter->fo_vfsmnt); obd->u.obt.obt_sb = NULL; lvfs_clear_rdonly(save_dev); @@ -1750,16 +1771,16 @@ static int filter_connect_internal(struct obd_export *exp, if (data->ocd_connect_flags & OBD_CONNECT_INDEX) { struct filter_obd *filter = &exp->exp_obd->u.filter; - struct filter_server_data *fsd = filter->fo_fsd; - int index = le32_to_cpu(fsd->fsd_ost_index); - - if (!(fsd->fsd_feature_compat & + struct lr_server_data *lsd = filter->fo_fsd; + int index = le32_to_cpu(lsd->lsd_ost_index); + + if (!(lsd->lsd_feature_compat & cpu_to_le32(OBD_COMPAT_OST))) { /* this will only happen on the first connect */ - fsd->fsd_ost_index = cpu_to_le32(data->ocd_index); - fsd->fsd_feature_compat |= cpu_to_le32(OBD_COMPAT_OST); + lsd->lsd_ost_index = cpu_to_le32(data->ocd_index); + lsd->lsd_feature_compat |= cpu_to_le32(OBD_COMPAT_OST); filter_update_server_data(exp->exp_obd, - filter->fo_rcvd_filp, fsd, 1); + filter->fo_rcvd_filp, lsd, 1); } else if (index != data->ocd_index) { LCONSOLE_ERROR("Connection from %s to index " "%u doesn't match actual OST " @@ -2943,12 +2964,12 @@ static int filter_set_info_async(struct obd_export *exp, __u32 keylen, RETURN(-EINVAL); } - if (keylen < strlen("mds_conn") || - memcmp(key, 
"mds_conn", keylen) != 0) + if (keylen < strlen(KEY_MDS_CONN) || + memcmp(key, KEY_MDS_CONN, keylen) != 0) RETURN(-EINVAL); - CWARN("%s: received MDS connection from %s\n", obd->obd_name, - obd_export_nid2str(exp)); + LCONSOLE_WARN("%s: received MDS connection from %s\n", obd->obd_name, + obd_export_nid2str(exp)); obd->u.filter.fo_mdc_conn.cookie = exp->exp_handle.h_cookie; /* setup llog imports */ @@ -3137,14 +3158,14 @@ static int __init obdfilter_init(void) init_obd_quota_ops(quota_interface, &filter_sanobd_ops); rc = class_register_type(&filter_obd_ops, lvars.module_vars, - OBD_FILTER_DEVICENAME); + LUSTRE_OST_NAME); if (rc) GOTO(out, rc); rc = class_register_type(&filter_sanobd_ops, lvars.module_vars, - OBD_FILTER_SAN_DEVICENAME); + LUSTRE_OSTSAN_NAME); if (rc) { - class_unregister_type(OBD_FILTER_DEVICENAME); + class_unregister_type(LUSTRE_OST_NAME); out: if (quota_interface) PORTAL_SYMBOL_PUT(filter_quota_interface); @@ -3162,8 +3183,9 @@ static void __exit obdfilter_exit(void) if (quota_interface) PORTAL_SYMBOL_PUT(filter_quota_interface); - class_unregister_type(OBD_FILTER_SAN_DEVICENAME); - class_unregister_type(OBD_FILTER_DEVICENAME); + class_unregister_type(LUSTRE_OSTSAN_NAME); + class_unregister_type(LUSTRE_OST_NAME); + OBD_FREE(obdfilter_created_scratchpad, OBDFILTER_CREATED_SCRATCHPAD_ENTRIES * sizeof(*obdfilter_created_scratchpad)); diff --git a/lustre/obdfilter/filter_internal.h b/lustre/obdfilter/filter_internal.h index 6b08d94..81d9406 100644 --- a/lustre/obdfilter/filter_internal.h +++ b/lustre/obdfilter/filter_internal.h @@ -8,22 +8,13 @@ #ifdef __KERNEL__ # include #endif -#include +#include #include #include #include #define FILTER_LAYOUT_VERSION "2" -#ifndef OBD_FILTER_DEVICENAME -# define OBD_FILTER_DEVICENAME "obdfilter" -#endif - -#ifndef OBD_FILTER_SAN_DEVICENAME -# define OBD_FILTER_SAN_DEVICENAME "sanobdfilter" -#endif - -#define HEALTH_CHECK "health_check" #define FILTER_INIT_OBJID 0 #define FILTER_SUBDIR_COUNT 32 /* set to zero for 
no subdirs */ @@ -33,33 +24,12 @@ #define FILTER_RECOVERY_TIMEOUT (obd_timeout * 5 * HZ / 2) /* *waves hands* */ -#define FILTER_INCOMPAT_SUPP (OBD_INCOMPAT_GROUPS) +#define FILTER_INCOMPAT_SUPP (OBD_INCOMPAT_GROUPS | OBD_INCOMPAT_OST | \ + OBD_INCOMPAT_COMMON_LR) #define FILTER_GRANT_CHUNK (2ULL * PTLRPC_MAX_BRW_SIZE) #define GRANT_FOR_LLOG(obd) 16 -/* Data stored per server at the head of the last_rcvd file. In le32 order. - * Try to keep this the same as mds_server_data so we might one day merge. */ -struct filter_server_data { -/* 00*/ __u8 fsd_uuid[40]; /* server UUID */ -/* 28*/ __u64 fsd_last_transno_new;/* future last completed transaction ID */ -/* 30*/ __u64 fsd_last_transno; /* last completed transaction ID */ - __u64 fsd_mount_count; /* FILTER incarnation number */ -/* 40*/ __u32 fsd_feature_compat; /* compatible feature flags */ - __u32 fsd_feature_rocompat;/* read-only compatible feature flags */ - __u32 fsd_feature_incompat;/* incompatible feature flags */ - __u32 fsd_server_size; /* size of server data area */ -/* 50*/ __u32 fsd_client_start; /* start of per-client data area */ - __u16 fsd_client_size; /* size of per-client data area */ - __u16 fsd_subdir_count; /* number of subdirectories for objects */ - __u64 fsd_catalog_oid; /* recovery catalog object id */ -/* 60*/ __u32 fsd_catalog_ogen; /* recovery catalog inode generation */ - __u8 fsd_peeruuid[40]; /* UUID of MDS associated with this OST */ -/* 8c*/ __u32 fsd_ost_index; /* index number of OST in LOV */ - __u32 fsd_mds_index; /* index number of MDS in LMV */ -/* 94*/ __u8 fsd_padding[LR_SERVER_SIZE - 148]; -}; - /* Data stored per client in the last_rcvd file. In le32 order. 
*/ struct filter_client_data { __u8 fcd_uuid[40]; /* client UUID */ @@ -107,7 +77,7 @@ __u64 filter_last_id(struct filter_obd *, struct obdo *); int filter_update_fidea(struct obd_export *exp, struct inode *inode, void *handle, struct obdo *oa); int filter_update_server_data(struct obd_device *, struct file *, - struct filter_server_data *, int force_sync); + struct lr_server_data *, int force_sync); int filter_update_last_objid(struct obd_device *, obd_gr, int force_sync); int filter_common_setup(struct obd_device *, obd_count len, void *buf, void *option); @@ -133,8 +103,8 @@ int filter_commitrw(int cmd, struct obd_export *, struct obdo *, int objcount, struct obd_ioobj *, int niocount, struct niobuf_local *, struct obd_trans_info *, int rc); int filter_brw(int cmd, struct obd_export *, struct obdo *, - struct lov_stripe_md *, obd_count oa_bufs, struct brw_page *, - struct obd_trans_info *); + struct lov_stripe_md *, obd_count oa_bufs, struct brw_page *, + struct obd_trans_info *); void flip_into_page_cache(struct inode *inode, struct page *new_page); /* filter_io_*.c */ diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index 0f2321a..6392f30 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -3104,7 +3104,7 @@ static int osc_set_info_async(struct obd_export *exp, obd_count keylen, OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_SHUTDOWN, 10); - if (KEY_IS("next_id")) { + if (KEY_IS(KEY_NEXT_ID)) { if (vallen != sizeof(obd_id)) RETURN(-EINVAL); obd->u.cli.cl_oscc.oscc_next_id = *((obd_id*)val) + 1; @@ -3123,11 +3123,11 @@ static int osc_set_info_async(struct obd_export *exp, obd_count keylen, RETURN(0); } - if (KEY_IS("initial_recov")) { + if (KEY_IS(KEY_INIT_RECOV)) { if (vallen != sizeof(int)) RETURN(-EINVAL); imp->imp_initial_recov = *(int *)val; - CDEBUG(D_HA, "%s: set imp_no_init_recov = %d\n", + CDEBUG(D_HA, "%s: set imp_initial_recov = %d\n", exp->exp_obd->obd_name, imp->imp_initial_recov); RETURN(0); @@ -3277,7 +3277,7 @@ static int 
osc_import_event(struct obd_device *obd, break; } case IMP_EVENT_INACTIVE: { - rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE); + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE, NULL); break; } case IMP_EVENT_INVALIDATE: { @@ -3305,7 +3305,7 @@ static int osc_import_event(struct obd_device *obd, oscc->oscc_flags &= ~OSCC_FLAG_NOSPC; spin_unlock(&oscc->oscc_lock); } - rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE); + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE, NULL); break; } case IMP_EVENT_OCD: { @@ -3318,7 +3318,7 @@ static int osc_import_event(struct obd_device *obd, if (ocd->ocd_connect_flags & OBD_CONNECT_REQPORTAL) imp->imp_client->cli_request_portal =OST_REQUEST_PORTAL; - rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD); + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD, NULL); break; } default: diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c index 2cc87af..11da088 100644 --- a/lustre/ost/ost_handler.c +++ b/lustre/ost/ost_handler.c @@ -486,6 +486,8 @@ static void ost_nio_pages_put(struct ptlrpc_request *req, EXIT; } +#if 0 +/* see ldlm_blocking_ast */ /* cut-n-paste of mds_blocking_ast() */ static int ost_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, void *data, int flag) @@ -529,7 +531,8 @@ static int ost_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, } RETURN(0); } - +#endif + static int ost_brw_lock_get(int mode, struct obd_export *exp, struct obd_ioobj *obj, struct niobuf_remote *nb, struct lustre_handle *lh) @@ -1629,7 +1632,7 @@ static int ost_setup(struct obd_device *obd, obd_count len, void *buf) ptlrpc_init_svc(OST_NBUFS, OST_BUFSIZE, OST_MAXREQSIZE, OST_MAXREPSIZE, OST_REQUEST_PORTAL, OSC_REPLY_PORTAL, - obd_timeout * 1000, ost_handle, LUSTRE_OST_NAME, + obd_timeout * 1000, ost_handle, LUSTRE_OSS_NAME, obd->obd_proc_entry, ost_print_req, ost_num_threads); if (ost->ost_service == NULL) { @@ -1764,15 +1767,15 @@ static int __init ost_init(void) int rc; ENTRY; 
- lprocfs_init_vars(ost,&lvars); + lprocfs_init_vars(ost, &lvars); rc = class_register_type(&ost_obd_ops, lvars.module_vars, - LUSTRE_OST_NAME); + LUSTRE_OSS_NAME); RETURN(rc); } static void /*__exit*/ ost_exit(void) { - class_unregister_type(LUSTRE_OST_NAME); + class_unregister_type(LUSTRE_OSS_NAME); } MODULE_AUTHOR("Cluster File Systems, Inc. "); diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index 715f65b..8b89cd9 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -88,6 +88,7 @@ int ptlrpc_init_import(struct obd_import *imp) return 0; } +EXPORT_SYMBOL(ptlrpc_init_import); #define UUID_STR "_UUID" static void deuuidify(char *uuid, const char *prefix, char **uuid_start, int *uuid_len) @@ -209,6 +210,7 @@ void ptlrpc_invalidate_import(struct obd_import *imp) obd_import_event(imp->imp_obd, imp, IMP_EVENT_INVALIDATE); } +/* unset imp_invalid */ void ptlrpc_activate_import(struct obd_import *imp) { struct obd_device *obd = imp->imp_obd; @@ -366,7 +368,7 @@ int ptlrpc_connect_import(struct obd_import *imp, char * new_uuid) /* Don't retry if connect fails */ rc = 0; obd_set_info_async(obd->obd_self_export, - strlen("initial_recov"), "initial_recov", + strlen(KEY_INIT_RECOV), KEY_INIT_RECOV, sizeof(rc), &rc, NULL); } @@ -414,6 +416,7 @@ out: RETURN(rc); } +EXPORT_SYMBOL(ptlrpc_connect_import); static void ptlrpc_maybe_ping_import_soon(struct obd_import *imp) { @@ -845,6 +848,7 @@ int ptlrpc_disconnect_import(struct obd_import *imp) switch (imp->imp_connect_op) { case OST_CONNECT: rq_opc = OST_DISCONNECT; break; case MDS_CONNECT: rq_opc = MDS_DISCONNECT; break; + case MGS_CONNECT: rq_opc = MGS_DISCONNECT; break; default: CERROR("don't know how to disconnect from %s (connect_op %d)\n", obd2cli_tgt(imp->imp_obd), imp->imp_connect_op); diff --git a/lustre/ptlrpc/llog_client.c b/lustre/ptlrpc/llog_client.c index d714a84..bcab551 100644 --- a/lustre/ptlrpc/llog_client.c +++ b/lustre/ptlrpc/llog_client.c @@ -309,10 +309,10 @@ out: static int 
llog_client_close(struct llog_handle *handle) { - int rc = 0; - - ENTRY; - RETURN(rc); + /* this doesn't call LLOG_ORIGIN_HANDLE_CLOSE because + the servers all close the file at the end of every + other LLOG_ RPC. */ + return(0); } diff --git a/lustre/ptlrpc/pack_generic.c b/lustre/ptlrpc/pack_generic.c index ba80326..7926a3f 100644 --- a/lustre/ptlrpc/pack_generic.c +++ b/lustre/ptlrpc/pack_generic.c @@ -408,7 +408,7 @@ void *lustre_msg_buf(struct lustre_msg *m, int n, int min_size) bufcount = m->bufcount; if (n >= bufcount) { - CDEBUG(D_INFO, "msg %p buffer[%d] not present (count %d)\n", + CERROR("msg %p buffer[%d] not present (count %d)\n", m, n, bufcount); return NULL; } @@ -628,6 +628,24 @@ void lustre_swab_mds_body (struct mds_body *b) __swab32s (&b->padding_4); } +void lustre_swab_mgs_target_info(struct mgs_target_info *mti) +{ + int i; + LASSERT(sizeof(lnet_nid_t) == sizeof(__u64)); + for (i = 0; i < MTI_NIDS_MAX; i++) { + __swab64s(&mti->mti_nids[i]); + __swab64s(&mti->mti_failnids[i]); + } + for (i = 0; i < 8; i++) { + __swab16s(&mti->mti_failnodes[i]); + } + __swab32s(&mti->mti_stripe_index); + __swab32s(&mti->mti_nid_count); + __swab32s(&mti->mti_failnid_count); + __swab32s(&mti->mti_config_ver); + __swab32s(&mti->mti_flags); +} + static void lustre_swab_obd_dqinfo (struct obd_dqinfo *i) { __swab64s (&i->dqi_bgrace); @@ -1070,6 +1088,16 @@ void lustre_assert_wire_constants(void) (long long)MDS_STATUS_CONN); LASSERTF(MDS_STATUS_LOV == 2, " found %lld\n", (long long)MDS_STATUS_LOV); + LASSERTF(MGS_CONNECT == 250, " found %lld\n", + (long long)MGS_CONNECT); + LASSERTF(MGS_DISCONNECT == 251, " found %lld\n", + (long long)MGS_DISCONNECT); + LASSERTF(MGS_EXCEPTION == 252, " found %lld\n", + (long long)MGS_EXCEPTION); + LASSERTF(MGS_TARGET_REG == 253, " found %lld\n", + (long long)MGS_TARGET_REG); + LASSERTF(MGS_TARGET_DEL == 254, " found %lld\n", + (long long)MGS_TARGET_DEL); LASSERTF(LDLM_ENQUEUE == 101, " found %lld\n", (long long)LDLM_ENQUEUE); 
LASSERTF(LDLM_CONVERT == 102, " found %lld\n", diff --git a/lustre/ptlrpc/pinger.c b/lustre/ptlrpc/pinger.c index db5eb7c..d9007e1 100644 --- a/lustre/ptlrpc/pinger.c +++ b/lustre/ptlrpc/pinger.c @@ -342,7 +342,7 @@ static int ping_evictor_main(void *arg) time_t expire_time; ENTRY; - ptlrpc_daemonize("ping_evictor"); + ptlrpc_daemonize("ll_evictor"); CDEBUG(D_HA, "Starting Ping Evictor\n"); pet_exp = NULL; @@ -418,7 +418,7 @@ void ping_evictor_start(void) init_waitqueue_head(&pet_waitq); - rc = kernel_thread(ping_evictor_main, NULL, CLONE_VM | CLONE_FS); + rc = cfs_kernel_thread(ping_evictor_main, NULL, CLONE_VM | CLONE_FILES); if (rc < 0) { pet_refcount--; CERROR("Cannot start ping evictor thread: %d\n", rc); diff --git a/lustre/ptlrpc/ptlrpc_module.c b/lustre/ptlrpc/ptlrpc_module.c index a3df637..60adc71 100644 --- a/lustre/ptlrpc/ptlrpc_module.c +++ b/lustre/ptlrpc/ptlrpc_module.c @@ -216,6 +216,7 @@ EXPORT_SYMBOL(lustre_swab_ldlm_lock_desc); EXPORT_SYMBOL(lustre_swab_ldlm_request); EXPORT_SYMBOL(lustre_swab_ldlm_reply); EXPORT_SYMBOL(lustre_swab_qdata); +EXPORT_SYMBOL(lustre_swab_mgs_target_info); /* recover.c */ EXPORT_SYMBOL(ptlrpc_run_recovery_over_upcall); diff --git a/lustre/ptlrpc/recover.c b/lustre/ptlrpc/recover.c index 4d41dc0..cbbed63 100644 --- a/lustre/ptlrpc/recover.c +++ b/lustre/ptlrpc/recover.c @@ -302,8 +302,10 @@ void ptlrpc_request_handle_notconn(struct ptlrpc_request *failed_req) } /* + * Administratively active/deactive a client. * This should only be called by the ioctl interface, currently - * with the lctl deactivate and activate commands. 
+ * with the lctl deactivate and activate commands, and + * client umount -f (ll_umount_begin) */ int ptlrpc_set_import_active(struct obd_import *imp, int active) { @@ -333,6 +335,7 @@ int ptlrpc_set_import_active(struct obd_import *imp, int active) RETURN(rc); } +/* Attempt to reconnect an import */ int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid) { int rc; @@ -370,6 +373,7 @@ static int ptlrpc_recover_import_no_retry(struct obd_import *imp, ENTRY; spin_lock_irqsave(&imp->imp_lock, flags); + /* Check if reconnect is already in progress */ if (imp->imp_state != LUSTRE_IMP_DISCON) { in_recovery = 1; } diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c index 631f096..55fd5b3 100644 --- a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ -841,7 +841,9 @@ ptlrpc_check_rqbd_pool(struct ptlrpc_service *svc) if (avail <= low_water) ptlrpc_grow_req_bufs(svc); - lprocfs_counter_add(svc->srv_stats, PTLRPC_REQBUF_AVAIL_CNTR, avail); + if (svc->srv_stats) + lprocfs_counter_add(svc->srv_stats, PTLRPC_REQBUF_AVAIL_CNTR, + avail); } static int diff --git a/lustre/quota/quota_check.c b/lustre/quota/quota_check.c index ea4f574..6086088 100644 --- a/lustre/quota/quota_check.c +++ b/lustre/quota/quota_check.c @@ -201,10 +201,11 @@ int client_quota_poll_check(struct obd_export *exp, struct if_quotacheck *qchk) rc = -EINTR; qchk->obd_uuid = cli->cl_target_uuid; + /* FIXME change strncmp to strcmp and save the strlen op */ if (strncmp(exp->exp_obd->obd_type->typ_name, LUSTRE_OSC_NAME, strlen(LUSTRE_OSC_NAME))) - memcpy(qchk->obd_type, LUSTRE_FILTER_NAME, - strlen(LUSTRE_FILTER_NAME)); + memcpy(qchk->obd_type, LUSTRE_OST_NAME, + strlen(LUSTRE_OST_NAME)); else if (strncmp(exp->exp_obd->obd_type->typ_name, LUSTRE_MDC_NAME, strlen(LUSTRE_MDC_NAME))) memcpy(qchk->obd_type, LUSTRE_MDS_NAME, diff --git a/lustre/tests/Makefile.am b/lustre/tests/Makefile.am index c04cf4f..a7d0a0a 100644 --- a/lustre/tests/Makefile.am +++ b/lustre/tests/Makefile.am @@ -8,7 +8,7 
@@ pkgexample_scripts = llmount.sh llmountcleanup.sh llecho.sh llechocleanup.sh pkgexample_scripts += local.sh echo.sh uml.sh lov.sh noinst_DATA = noinst_SCRIPTS = leak_finder.pl llecho.sh llmount.sh llmountcleanup.sh -noinst_SCRIPTS += llrmount.sh runfailure-mds runvmstat runfailure-net +noinst_SCRIPTS += runfailure-mds runvmstat runfailure-net noinst_SCRIPTS += runfailure-ost runiozone runregression-net.sh runtests noinst_SCRIPTS += sanity.sh rundbench diff --git a/lustre/tests/acceptance-small.sh b/lustre/tests/acceptance-small.sh index c769134..a4bacc2 100755 --- a/lustre/tests/acceptance-small.sh +++ b/lustre/tests/acceptance-small.sh @@ -5,7 +5,7 @@ set -vxe PATH=`dirname $0`/../utils:$PATH -[ "$CONFIGS" ] || CONFIGS="local lov" +[ "$CONFIGS" ] || CONFIGS="local" #"local lov" [ "$MAX_THREADS" ] || MAX_THREADS=10 if [ -z "$THREADS" ]; then KB=`awk '/MemTotal:/ { print $2 }' /proc/meminfo` @@ -19,19 +19,29 @@ fi [ "$MOUNT2" ] || MOUNT2=${MOUNT}2 [ "$TMP" ] || TMP=/tmp [ "$COUNT" ] || COUNT=1000 -#[ "$DEBUG_LVL" ] || DEBUG_LVL=0x370200 [ "$DEBUG_LVL" ] || DEBUG_LVL=0 [ "$DEBUG_OFF" ] || DEBUG_OFF="sysctl -w lnet.debug=$DEBUG_LVL" -[ "$DEBUG_ON" ] || DEBUG_ON="sysctl -w lnet.debug=0x33f0480" +[ "$DEBUG_ON" ] || DEBUG_ON="sysctl -w lnet.debug=0x33f0484" LIBLUSTRE=${LIBLUSTRE:-../liblustre} LIBLUSTRETESTS=${LIBLUSTRETESTS:-$LIBLUSTRE/tests} +LUSTRE=${LUSTRE:-`dirname $0`/..} +. $LUSTRE/tests/test-framework.sh +init_test_env $@ +. mountconf.sh + +SETUP=${SETUP:-mcsetup} +FORMAT=${FORMAT:-mcformat} +CLEANUP=${CLEANUP:-mcstopall} + for NAME in $CONFIGS; do export NAME MOUNT START CLEAN - [ -e $NAME.sh ] && sh $NAME.sh - [ ! -e $NAME.xml ] && [ -z "$LDAPURL" ] && \ - echo "no config '$NAME.xml'" 1>&2 && exit 1 + . 
$LUSTRE/tests/cfg/$NAME.sh + + assert_env mds_HOST MDS_MKFS_OPTS MDSDEV + assert_env ost_HOST ost2_HOST OST_MKFS_OPTS OSTDEV + assert_env FSNAME if [ "$RUNTESTS" != "no" ]; then sh runtests @@ -42,7 +52,7 @@ for NAME in $CONFIGS; do fi if [ "$DBENCH" != "no" ]; then - mount | grep $MOUNT || sh llmount.sh + mount_client $MOUNT SPACE=`df -P $MOUNT | tail -n 1 | awk '{ print $4 }'` DB_THREADS=`expr $SPACE / 50000` [ $THREADS -lt $DB_THREADS ] && DB_THREADS=$THREADS @@ -50,43 +60,44 @@ for NAME in $CONFIGS; do $DEBUG_OFF sh rundbench 1 $DEBUG_ON - sh llmountcleanup.sh - sh llrmount.sh + $CLEANUP + $SETUP if [ $DB_THREADS -gt 1 ]; then $DEBUG_OFF sh rundbench $DB_THREADS $DEBUG_ON - sh llmountcleanup.sh - sh llrmount.sh + $CLEANUP + $SETUP fi rm -f /mnt/lustre/`hostname`/client.txt fi + chown $UID $MOUNT && chmod 700 $MOUNT if [ "$BONNIE" != "no" ]; then - mount | grep $MOUNT || sh llmount.sh + mount_client $MOUNT $DEBUG_OFF bonnie++ -f -r 0 -s $(($SIZE / 1024)) -n 10 -u $UID -d $MOUNT $DEBUG_ON - sh llmountcleanup.sh - sh llrmount.sh + $CLEANUP + $SETUP fi IOZONE_OPTS="-i 0 -i 1 -i 2 -e -+d -r $RSIZE -s $SIZE" IOZFILE="-f $MOUNT/iozone" if [ "$IOZONE" != "no" ]; then - mount | grep $MOUNT || sh llmount.sh + mount_client $MOUNT $DEBUG_OFF iozone $IOZONE_OPTS $IOZFILE $DEBUG_ON - sh llmountcleanup.sh - sh llrmount.sh + $CLEANUP + $SETUP if [ "$O_DIRECT" != "no" -a "$IOZONE_DIR" != "no" ]; then $DEBUG_OFF iozone -I $IOZONE_OPTS $IOZFILE.odir $DEBUG_ON - sh llmountcleanup.sh - sh llrmount.sh + $CLEANUP + $SETUP fi SPACE=`df -P $MOUNT | tail -n 1 | awk '{ print $4 }'` @@ -103,21 +114,22 @@ for NAME in $CONFIGS; do done iozone $IOZONE_OPTS -t $IOZ_THREADS $IOZFILE $DEBUG_ON - sh llmountcleanup.sh - sh llrmount.sh + $CLEANUP + $SETUP elif [ $IOZVER -lt 3145 ]; then VER=`iozone -v | awk '/Revision:/ { print $3 }'` echo "iozone $VER too old for multi-thread test" fi fi + if [ "$FSX" != "no" ]; then - mount | grep $MOUNT || sh llmount.sh + mount | grep $MOUNT || $SETUP 
$DEBUG_OFF ./fsx -c 50 -p 1000 -P $TMP -l $SIZE \ -N $(($COUNT * 100)) $MOUNT/fsxfile $DEBUG_ON - sh llmountcleanup.sh - sh llrmount.sh + $CLEANUP + $SETUP fi mkdir -p $MOUNT2 @@ -132,11 +144,11 @@ for NAME in $CONFIGS; do esac if [ "$SANITYN" != "no" ]; then - mount | grep $MOUNT || sh llmount.sh + mount_client $MOUNT $DEBUG_OFF if [ "$MDSNODE" -a "$MDSNAME" -a "$CLIENT" ]; then - llmount $MDSNODE:/$MDSNAME/$CLIENT $MOUNT2 + mount_client $MOUNT2 SANITYLOG=$TMP/sanity.log START=: CLEAN=: sh sanityN.sh umount $MOUNT2 else @@ -145,12 +157,12 @@ for NAME in $CONFIGS; do fi $DEBUG_ON - sh llmountcleanup.sh - sh llrmount.sh + $CLEANUP + $SETUP fi if [ "$LIBLUSTRE" != "no" ]; then - mount | grep $MOUNT || sh llmount.sh + mount_client $MOUNT export LIBLUSTRE_MOUNT_POINT=$MOUNT2 export LIBLUSTRE_MOUNT_TARGET=$MDSNODE:/$MDSNAME/$CLIENT export LIBLUSTRE_TIMEOUT=`cat /proc/sys/lustre/timeout` @@ -158,11 +170,11 @@ for NAME in $CONFIGS; do if [ -x $LIBLUSTRETESTS/sanity ]; then $LIBLUSTRETESTS/sanity --target=$LIBLUSTRE_MOUNT_TARGET fi - sh llmountcleanup.sh - #sh llrmount.sh + $CLEANUP + #$SETUP fi - mount | grep $MOUNT && sh llmountcleanup.sh + $CLEANUP done if [ "$REPLAY_SINGLE" != "no" ]; then diff --git a/lustre/tests/cfg/insanity-local.sh b/lustre/tests/cfg/insanity-local.sh index 2b185f9..e8d323e 100644 --- a/lustre/tests/cfg/insanity-local.sh +++ b/lustre/tests/cfg/insanity-local.sh @@ -1,33 +1,59 @@ +FSNAME=lustre mds_HOST=${mds_HOST:-`hostname`} +mgs_HOST=${mgs_HOST:-$mds_HOST} mdsfailover_HOST=${mdsfailover_HOST:-""} ost1_HOST=${ost1_HOST:-"`hostname`"} ost2_HOST=${ost2_HOST:-"`hostname`"} EXTRA_OSTS=${EXTRA_OSTS:-"`hostname`"} -client_HOST=${client_HOST:-"'*'"} LIVE_CLIENT=${LIVE_CLIENT:-"`hostname`"} # This should always be a list, not a regexp FAIL_CLIENTS=${FAIL_CLIENTS:-""} +MDSDEV=${MDSDEV:-$TMP/${FSNAME}-mdt} +MDSSIZE=${MDSSIZE:-10000} #50000000 +OSTDEV=${OSTDEV:-"$TMP/${FSNAME}-ost%d"} +OSTSIZE=${OSTSIZE:=10000} #50000000 + NETTYPE=${NETTYPE:-tcp} 
+MGSNID=`h2$NETTYPE $mgs_HOST` +FSTYPE=${FSTYPE:-ext3} +STRIPE_BYTES=${STRIPE_BYTES:-1048576} +STRIPES_PER_OBJ=${STRIPES_PER_OBJ:-0} TIMEOUT=${TIMEOUT:-30} -PTLDEBUG=${PTLDEBUG:-0x3f0400} +PTLDEBUG=${PTLDEBUG:-0x33f0404} SUBSYSTEM=${SUBSYSTEM:- 0xffb7e3ff} -MOUNT=${MOUNT:-"/mnt/lustre"} -#CLIENT_UPCALL=${CLIENT_UPCALL:-`pwd`/client-upcall-mdev.sh} -#UPCALL=${CLIENT_UPCALL:-`pwd`/replay-single-upcall.sh} -MDSDEV=${MDSDEV:-$TMP/mds1-`hostname`} -MDSSIZE=${MDSSIZE:-10000} #50000000 -MDSJOURNALSIZE=${MDSJOURNALSIZE:-0} +MKFSOPT="" +MOUNTOPT="" +[ "x$MDSJOURNALSIZE" != "x" ] && + MKFSOPT=$MKFSOPT" -J size=$MDSJOURNALSIZE" +[ "x$MDSISIZE" != "x" ] && + MKFSOPT=$MKFSOPT" -i $MDSISIZE" +[ "x$MKFSOPT" != "x" ] && + MKFSOPT="--mkfsoptions=\"$MKFSOPT\"" +[ "x$mdsfailover_HOST" != "x" ] && + MOUNTOPT=$MOUNTOPT" --failnode=`h2$NETTYPE $mdsfailover_HOST`" +[ "x$STRIPE_BYTES" != "x" ] && + MOUNTOPT=$MOUNTOPT" --param default_stripe_size=$STRIPE_BYTES" +[ "x$STRIPES_PER_OBJ" != "x" ] && + MOUNTOPT=$MOUNTOPT" --param default_stripe_count=$STRIPES_PER_OBJ" +MDS_MKFS_OPTS="--mgs --mdt --device-size=$MDSSIZE $MKFSOPT $MOUNTOPT $MDSOPT" -OSTDEV=${OSTDEV:-"$TMP/ost%d-`hostname`"} -OSTSIZE=${OSTSIZE:=10000} #50000000 -OSTJOURNALSIZE=${OSTJOURNALSIZE:-0} +MKFSOPT="" +MOUNTOPT="" +[ "x$OSTJOURNALSIZE" != "x" ] && + MKFSOPT=$MKFSOPT" -J size=$OSTJOURNALSIZE" +[ "x$MKFSOPT" != "x" ] && + MKFSOPT="--mkfsoptions=\"$MKFSOPT\"" +[ "x$ostfailover_HOST" != "x" ] && + MOUNTOPT=$MOUNTOPT" --failnode=`h2$NETTYPE $ostfailover_HOST`" +OST_MKFS_OPTS="--ost --device-size=$OSTSIZE --mgsnode=$MGSNID $MKFSOPT $MOUNTOPT $OSTOPT" -FSTYPE=${FSTYPE:-ext3} -STRIPE_BYTES=${STRIPE_BYTES:-65536} #1048576 -STRIPES_PER_OBJ=${STRIPES_PER_OBJ:-0} +MDS_MOUNT_OPTS="-o loop" +OST_MOUNT_OPTS="-o loop" +MOUNT=${MOUNT:-"/mnt/lustre"} +PDSH=${PDSH:-no_dsh} FAILURE_MODE=${FAILURE_MODE:-SOFT} # or HARD POWER_DOWN=${POWER_DOWN:-"powerman --off"} POWER_UP=${POWER_UP:-"powerman --on"} diff --git 
a/lustre/tests/cfg/insanity-ltest.sh b/lustre/tests/cfg/insanity-ltest.sh index 47a7b0c..38ad798 100644 --- a/lustre/tests/cfg/insanity-ltest.sh +++ b/lustre/tests/cfg/insanity-ltest.sh @@ -62,7 +62,6 @@ fi OSTJOURNALSIZE=${OSTJOURNALSIZE:-0} FSTYPE=${FSTYPE:-ext3} -#STRIPE_BYTES=${STRIPE_BYTES:-65536} STRIPE_BYTES=${STRIPE_BYTES:-1048576} STRIPES_PER_OBJ=${STRIPES_PER_OBJ:-0} diff --git a/lustre/tests/cfg/local.sh b/lustre/tests/cfg/local.sh index c68419b..ca7258e 100644 --- a/lustre/tests/cfg/local.sh +++ b/lustre/tests/cfg/local.sh @@ -3,38 +3,72 @@ MDSNODE=${MDSNODE:-`hostname`} OSTNODE=${OSTNODE:-`hostname`} CLIENT=${CLIENT:-client} +FSNAME=lustre mds_HOST=${mds_HOST:-$MDSNODE} mdsfailover_HOST=${mdsfailover_HOST} +mgs_HOST=${mgs_HOST:-$mds_HOST} ost_HOST=${ost_HOST:-$OSTNODE} +ostfailover_HOST=${ostfailover_HOST} ost2_HOST=${ost2_HOST:-$ost_HOST} -client_HOST=${client_HOST:-$CLIENT} -NETTYPE=${NETTYPE:-tcp} - -MOUNT=${MOUNT:-"/mnt/lustre"} -MOUNT1=${MOUNT1:-$MOUNT} -MOUNT2=${MOUNT2:-${MOUNT}2} -DIR=${DIR:-$MOUNT} -DIR2=${DIR2:-$MOUNT1} -PTLDEBUG=${PTLDEBUG:-0x3f0400} -SUBSYSTEM=${SUBSYSTEM:- 0xffb7e3ff} -PDSH=${PDSH:-no_dsh} TMP=${TMP:-/tmp} -MDSDEV=${MDSDEV:-$TMP/mds1-`hostname`} +MDSDEV=${MDSDEV:-$TMP/${FSNAME}-mdt} MDSSIZE=${MDSSIZE:-100000} -OSTDEV=${OSTDEV:-$TMP/ost1-`hostname`} +MDSOPT=${MDSOPT:-"--mountfsoptions=acl"} +OSTDEV=${OSTDEV:-$TMP/${FSNAME}-ost0} OSTSIZE=${OSTSIZE:-200000} -FSTYPE=${FSTYPE:-ext3} +OSTDEV2=${OSTDEV2:-$TMP/${FSNAME}-ost1} + +NETTYPE=${NETTYPE:-tcp} +MGSNID=`h2$NETTYPE $mgs_HOST` +FSTYPE=${FSTYPE:-ldiskfs} +STRIPE_BYTES=${STRIPE_BYTES:-1048576} +STRIPES_PER_OBJ=${STRIPES_PER_OBJ:-0} TIMEOUT=${TIMEOUT:-20} UPCALL=${UPCALL:-DEFAULT} +PTLDEBUG=${PTLDEBUG:-0x33f0404} +SUBSYSTEM=${SUBSYSTEM:- 0xffb7e3ff} -MDSOPT=${MDSOPT:-"user_xattr,acl"} -CLIENTOPT=${CLIENTOPT:-"user_xattr,acl"} -MOUNTOPT=${MOUNTOPT:-"user_xattr,acl"} +MKFSOPT="" +MOUNTOPT="" +[ "x$MDSJOURNALSIZE" != "x" ] && + MKFSOPT=$MKFSOPT" -J size=$MDSJOURNALSIZE" +[ 
"x$MDSISIZE" != "x" ] && + MKFSOPT=$MKFSOPT" -i $MDSISIZE" +[ "x$MKFSOPT" != "x" ] && + MKFSOPT="--mkfsoptions=\"$MKFSOPT\"" +[ "x$mdsfailover_HOST" != "x" ] && + MOUNTOPT=$MOUNTOPT" --failnode=`h2$NETTYPE $mdsfailover_HOST`" +[ "x$STRIPE_BYTES" != "x" ] && + MOUNTOPT=$MOUNTOPT" --param default_stripe_size=$STRIPE_BYTES" +[ "x$STRIPES_PER_OBJ" != "x" ] && + MOUNTOPT=$MOUNTOPT" --param default_stripe_count=$STRIPES_PER_OBJ" +MDS_MKFS_OPTS="--mgs --mdt --device-size=$MDSSIZE --param obd_timeout=$TIMEOUT $MKFSOPT $MOUNTOPT $MDSOPT" -STRIPE_BYTES=${STRIPE_BYTES:-1048576} -STRIPES_PER_OBJ=${STRIPES_PER_OBJ:-0} +MKFSOPT="" +MOUNTOPT="" +[ "x$OSTJOURNALSIZE" != "x" ] && + MKFSOPT=$MKFSOPT" -J size=$OSTJOURNALSIZE" +[ "x$MKFSOPT" != "x" ] && + MKFSOPT="--mkfsoptions=\"$MKFSOPT\"" +[ "x$ostfailover_HOST" != "x" ] && + MOUNTOPT=$MOUNTOPT" --failnode=`h2$NETTYPE $ostfailover_HOST`" +OST_MKFS_OPTS="--ost --device-size=$OSTSIZE --mgsnode=$MGSNID --param obd_timeout=$TIMEOUT $MKFSOPT $MOUNTOPT $OSTOPT" +OST2_MKFS_OPTS=${OST2_MKFS_OPTS:-${OST_MKFS_OPTS}} + +MDS_MOUNT_OPTS="-o loop" +OST_MOUNT_OPTS="-o loop" +OST2_MOUNT_OPTS="-o loop" +MOUNT=${MOUNT:-/mnt/${FSNAME}} +MOUNT1=${MOUNT1:-$MOUNT} +MOUNT2=${MOUNT2:-${MOUNT}2} +DIR=${DIR:-$MOUNT} +DIR1=${DIR:-$MOUNT1} +DIR2=${DIR2:-$MOUNT2} +MOUNTOPT=${MOUNTOPT:-"user_xattr,acl"} + +PDSH=${PDSH:-no_dsh} FAILURE_MODE=${FAILURE_MODE:-SOFT} # or HARD POWER_DOWN=${POWER_DOWN:-"powerman --off"} POWER_UP=${POWER_UP:-"powerman --on"} diff --git a/lustre/tests/cfg/mdev.sh b/lustre/tests/cfg/mdev.sh index c7f7674..b0d6101 100644 --- a/lustre/tests/cfg/mdev.sh +++ b/lustre/tests/cfg/mdev.sh @@ -23,7 +23,7 @@ FSTYPE=${FSTYPE:-ext3} TIMEOUT=${TIMEOUT:-10} #UPCALL=${UPCALL:-$PWD/replay-single-upcall.sh} -STRIPE_BYTES=${STRIPE_BYTES:-65536} +STRIPE_BYTES=${STRIPE_BYTES:-1048576} STRIPES_PER_OBJ=${STRIPES_PER_OBJ:-0} FAILURE_MODE=${FAILURE_MODE:-SOFT} # or HARD diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh index 
37f33a4..23ea6ac 100644 --- a/lustre/tests/conf-sanity.sh +++ b/lustre/tests/conf-sanity.sh @@ -10,8 +10,8 @@ set -e ONLY=${ONLY:-"$*"} -# bug number for skipped test: -ALWAYS_EXCEPT=" $CONF_SANITY_EXCEPT" +# bug number for skipped test: mc mc mc mc mc mc mc mc mc +ALWAYS_EXCEPT=" $CONF_SANITY_EXCEPT 9 10 11 12 13 13b 14 15 18" # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! SRCDIR=`dirname $0` @@ -20,6 +20,7 @@ PATH=$PWD/$SRCDIR:$SRCDIR:$SRCDIR/../utils:$PATH LUSTRE=${LUSTRE:-`dirname $0`/..} RLUSTRE=${RLUSTRE:-$LUSTRE} MOUNTLUSTRE=${MOUNTLUSTRE:-/sbin/mount.lustre} +MKFSLUSTRE=${MKFSLUSTRE:-/usr/sbin/mkfs.lustre} HOSTNAME=`hostname` . $LUSTRE/tests/test-framework.sh @@ -28,61 +29,76 @@ init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/local.sh} -gen_config() { - rm -f $XMLCONFIG - - add_mds mds --dev $MDSDEV --size $MDSSIZE - add_lov lov1 mds --stripe_sz $STRIPE_BYTES\ - --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0 - add_ost ost --lov lov1 --dev $OSTDEV --size $OSTSIZE - add_client client mds --lov lov1 --path $MOUNT +reformat() { + grep " $MOUNT " /proc/mounts && zconf_umount `hostname` $MOUNT + stop ost -f + stop ost2 -f + stop mds -f + echo Formatting mds, ost, ost2 + add mds $MDS_MKFS_OPTS --reformat $MDSDEV > /dev/null + add ost $OST_MKFS_OPTS --reformat $OSTDEV > /dev/null + add ost2 $OST2_MKFS_OPTS --reformat $OSTDEV2 > /dev/null } -gen_second_config() { - rm -f $XMLCONFIG - - add_mds mds2 --dev $MDSDEV --size $MDSSIZE - add_lov lov2 mds2 --stripe_sz $STRIPE_BYTES\ - --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0 - add_ost ost2 --lov lov2 --dev $OSTDEV --size $OSTSIZE - add_client client mds2 --lov lov2 --path $MOUNT2 +gen_config() { + reformat + # The MGS must be started before the OSTs for a new fs, so start + # and stop to generate the startup logs. 
+ start_mds + start_ost + sleep 5 + stop_ost + stop_mds } start_mds() { echo "start mds service on `facet_active_host mds`" - start mds --reformat $MDSLCONFARGS || return 94 + start mds $MDSDEV $MDS_MOUNT_OPTS || return 94 } stop_mds() { echo "stop mds service on `facet_active_host mds`" - stop mds $@ || return 97 + # These tests all use non-failover stop + stop mds -f || return 97 } start_ost() { echo "start ost service on `facet_active_host ost`" - start ost --reformat $OSTLCONFARGS || return 95 + start ost $OSTDEV $OST_MOUNT_OPTS || return 95 } stop_ost() { echo "stop ost service on `facet_active_host ost`" - stop ost $@ || return 98 + # These tests all use non-failover stop + stop ost -f || return 98 +} + +start_ost2() { + echo "start ost2 service on `facet_active_host ost2`" + start ost2 $OSTDEV2 $OST2_MOUNT_OPTS || return 92 +} + +stop_ost2() { + echo "stop ost2 service on `facet_active_host ost2`" + # These tests all use non-failover stop + stop ost2 -f || return 93 } mount_client() { local MOUNTPATH=$1 echo "mount lustre on ${MOUNTPATH}....." - zconf_mount `hostname` $MOUNTPATH || return 96 + zconf_mount `hostname` $MOUNTPATH || return 96 } umount_client() { local MOUNTPATH=$1 echo "umount lustre on ${MOUNTPATH}....." - zconf_umount `hostname` $MOUNTPATH || return 97 + zconf_umount `hostname` $MOUNTPATH || return 97 } manual_umount_client(){ - echo "manual umount lustre on ${MOUNTPATH}...." - do_facet client "umount $MOUNT" + echo "manual umount lustre on ${MOUNT}...." + do_facet client "umount -d $MOUNT" } setup() { @@ -91,22 +107,23 @@ setup() { mount_client $MOUNT } +cleanup_nocli() { + stop_mds || return 201 + stop_ost || return 202 + unload_modules || return 203 +} + cleanup() { - umount_client $MOUNT $FORCE || return 200 - stop_mds $FORCE || return 201 - stop_ost $FORCE || return 202 - # catch case where these return just fine, but modules are still not unloaded - /sbin/lsmod | egrep -q "lnet|libcfs" - if [ 1 -ne $? 
]; then - echo "modules still loaded..." - /sbin/lsmod - return 203 - fi + umount_client $MOUNT || return 200 + cleanup_nocli || return $? } check_mount() { - do_facet client "touch $DIR/a" || return 71 - do_facet client "rm $DIR/a" || return 72 + do_facet client "cp /etc/passwd $DIR/a" || return 71 + do_facet client "rm $DIR/a" || return 72 + # make sure lustre is actually mounted (touch will block, + # but grep won't, so do it after) + do_facet client "grep $MOUNT' ' /proc/mounts > /dev/null" || return 73 echo "setup single mount lustre success" } @@ -136,9 +153,7 @@ gen_config test_0() { - start_ost - start_mds - mount_client $MOUNT + setup check_mount || return 41 cleanup || return $? } @@ -147,9 +162,7 @@ run_test 0 "single mount setup" test_1() { start_ost echo "start ost second time..." - start ost --reformat $OSTLCONFARGS - start_mds - mount_client $MOUNT + setup check_mount || return 42 cleanup || return $? } @@ -159,8 +172,7 @@ test_2() { start_ost start_mds echo "start mds second time.." - start mds --reformat $MDSLCONFARGS - + start_mds mount_client $MOUNT check_mount || return 43 cleanup || return $? @@ -169,19 +181,17 @@ run_test 2 "start up mds twice" test_3() { setup - mount_client $MOUNT - + #mount.lustre returns an error if already in mtab + mount_client $MOUNT && return $? check_mount || return 44 - - umount_client $MOUNT - cleanup || return $? + cleanup || return $? } run_test 3 "mount client twice" test_4() { setup touch $DIR/$tfile || return 85 - stop_ost --force + stop_ost -f cleanup eno=$? # ok for ost to fail shutdown @@ -195,12 +205,12 @@ run_test 4 "force cleanup ost, then cleanup" test_5() { setup touch $DIR/$tfile || return 1 - stop_mds --force || return 2 + stop_mds -f || return 2 # cleanup may return an error from the failed # disconnects; for now I'll consider this successful # if all the modules have unloaded. - umount $MOUNT & + umount -d $MOUNT & UMOUNT_PID=$! 
sleep 6 echo "killing umount" @@ -219,102 +229,66 @@ test_5() { grep " $MOUNT " /etc/mtab && echo "test 5: mtab after second umount" && return 11 fi - # cleanup client modules - $LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null - + manual_umount_client # stop_mds is a no-op here, and should not fail - stop_mds || return 4 - stop_ost || return 5 - - lsmod | grep -q lnet && return 6 - return 0 + cleanup_nocli || return $? + # df may have lingering entry + manual_umount_client + # mtab may have lingering entry + grep -v $MOUNT" " /etc/mtab > $TMP/mtabtemp + mv $TMP/mtabtemp /etc/mtab } run_test 5 "force cleanup mds, then cleanup" test_5b() { start_ost - [ -d $MOUNT ] || mkdir -p $MOUNT - grep " $MOUNT " /etc/mtab && echo "test 5b: mtab before lconf" && return 9 - $LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null grep " $MOUNT " /etc/mtab && echo "test 5b: mtab before mount" && return 10 - llmount -o nettype=$NETTYPE,$MOUNTOPT $mds_HOST:/mds_svc/client_facet $MOUNT && return 1 + mount_client $MOUNT && return 1 grep " $MOUNT " /etc/mtab && echo "test 5b: mtab after failed mount" && return 11 - - # cleanup client modules - $LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null - + umount_client $MOUNT # stop_mds is a no-op here, and should not fail - stop_mds || return 2 - stop_ost || return 3 - - lsmod | grep -q lnet && return 4 + cleanup_nocli || return $? 
return 0 - } run_test 5b "mds down, cleanup after failed mount (bug 2712)" test_5c() { start_ost start_mds - [ -d $MOUNT ] || mkdir -p $MOUNT - grep " $MOUNT " /etc/mtab && echo "test 5c: mtab before lconf" && return 9 - $LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null grep " $MOUNT " /etc/mtab && echo "test 5c: mtab before mount" && return 10 - llmount -vv -o nettype=$NETTYPE,$MOUNTOPT $mds_HOST:/wrong_mds_svc/client_facet $MOUNT && return 1 + mount -t lustre `facet_nid mgs`:/wrong.$FSNAME $MOUNT || : grep " $MOUNT " /etc/mtab && echo "test 5c: mtab after failed mount" && return 11 - - # cleanup client modules - $LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null - - stop_mds || return 2 - stop_ost || return 3 - - lsmod | grep -q lnet && return 4 - return 0 - + umount_client $MOUNT + cleanup_nocli || return $? } run_test 5c "cleanup after failed mount (bug 2712)" test_5d() { start_ost start_mds - stop_ost --force - - [ -d $MOUNT ] || mkdir -p $MOUNT - grep " $MOUNT " /etc/mtab && echo "test 5d: mtab before lconf" && return 9 - $LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null + stop_ost -f grep " $MOUNT " /etc/mtab && echo "test 5d: mtab before mount" && return 10 - llmount -vv -o nettype=$NETTYPE,$MOUNTOPT `facet_nid mds`:/mds_svc/client_facet $MOUNT || return 1 - - umount_client $MOUNT || return 2 + mount_client $MOUNT || return 1 + cleanup || return $? 
grep " $MOUNT " /etc/mtab && echo "test 5d: mtab after unmount" && return 11 - - stop_mds || return 3 - - lsmod | grep -q lnet && return 4 return 0 - } -run_test 5d "ost down, don't crash during mount attempt" +run_test 5d "mount with ost down" test_5e() { start_ost start_mds - sleep 5 # give MDS a chance to connect to OSTs before delaying requests + # give MDS a chance to connect to OSTs (bz 10476) + sleep 5 #define OBD_FAIL_PTLRPC_DELAY_SEND 0x506 do_facet client "sysctl -w lustre.fail_loc=0x80000506" grep " $MOUNT " /etc/mtab && echo "test 5e: mtab before mount" && return 10 mount_client $MOUNT || echo "mount failed (not fatal)" - umount_client $MOUNT || return 2 + cleanup || return $? grep " $MOUNT " /etc/mtab && echo "test 5e: mtab after unmount" && return 11 - - stop_mds || return 3 - stop_ost || return 3 - - lsmod | grep -q lnet && return 4 return 0 } run_test 5e "delayed connect, don't crash (bug 10268)" @@ -331,23 +305,16 @@ run_test 6 "manual umount, then mount again" test_7() { setup manual_umount_client - cleanup || return $? + cleanup_nocli || return $? } run_test 7 "manual umount, then cleanup" test_8() { - start_ost - start_mds - - mount_client $MOUNT + setup mount_client $MOUNT2 - check_mount2 || return 45 - umount $MOUNT umount_client $MOUNT2 - - stop_mds - stop_ost + cleanup || return $? } run_test 8 "double mount setup" @@ -663,7 +630,7 @@ test_15() { echo "mount lustre on $MOUNT with $MOUNTLUSTRE: success" [ -d /r ] && $LCTL modules > /r/tmp/ogdb-`hostname` check_mount || return 41 - do_node `hostname` umount $MOUNT + do_node `hostname` umount -d $MOUNT [ -f "$MOUNTLUSTRE" ] && rm -f $MOUNTLUSTRE echo "mount lustre on ${MOUNT} without $MOUNTLUSTRE....." @@ -680,9 +647,7 @@ test_16() { if [ ! -f "$MDSDEV" ]; then echo "no $MDSDEV existing, so mount Lustre to create one" - start_ost - start_mds - mount_client $MOUNT + setup check_mount || return 41 cleanup || return $? 
fi @@ -691,12 +656,10 @@ test_16() { do_facet mds "[ -d $TMPMTPT ] || mkdir -p $TMPMTPT; mount -o loop -t ext3 $MDSDEV $TMPMTPT || return \$?; chmod 555 $TMPMTPT/{OBJECTS,LOGS,PENDING} || return \$?; - umount $TMPMTPT || return \$?" || return $? + umount -d $TMPMTPT || return \$?" || return $? echo "mount Lustre to change the mode of OBJECTS/LOGS/PENDING, then umount Lustre" - start_ost - start_mds - mount_client $MOUNT + setup check_mount || return 41 cleanup || return $? @@ -729,23 +692,19 @@ test_16() { run_test 16 "verify that lustre will correct the mode of OBJECTS/LOGS/PENDING" test_17() { - TMPMTPT="${MOUNT%/*}/conf17" - if [ ! -f "$MDSDEV" ]; then echo "no $MDSDEV existing, so mount Lustre to create one" - start_ost - start_mds - mount_client $MOUNT + setup check_mount || return 41 cleanup || return $? fi echo "Remove mds config log" - do_facet mds "debugfs -w -R 'unlink LOGS/mds_svc' $MDSDEV || return \$?" || return $? + do_facet mds "debugfs -w -R 'unlink CONFIGS/$FSNAME-MDT0000' $MDSDEV || return \$?" || return $? start_ost - start mds $MDSLCONFARGS && return 42 - cleanup || return $? + start_mds && return 42 + gen_config } run_test 17 "Verify failed mds_postsetup won't fail assertion (2936)" @@ -754,12 +713,11 @@ test_18() { echo "mount mds with large journal..." OLDMDSSIZE=$MDSSIZE MDSSIZE=2000000 + #FIXME have to change MDS_MKFS_OPTS gen_config echo "mount lustre system..." - start_ost - start_mds - mount_client $MOUNT + setup check_mount || return 41 echo "check journal size..." 
@@ -779,15 +737,73 @@ test_18() { } run_test 18 "check lconf creates large journals" -test_19() { - # first format the ost/mdt +test_19a() { + start_mds || return 1 + stop_mds -f || return 2 +} +run_test 19a "start/stop MDS without OSTs" + +test_19b() { + start_ost || return 1 + stop_ost -f || return 2 +} +run_test 19b "start/stop OSTs without MDS" + +test_20a() { + start_mds + start_ost + stop_ost + stop_mds +} +run_test 20a "start mds before ost, stop ost first" + +test_20b() { start_ost start_mds stop_mds stop_ost - start mds $MDSLCONFARGS || return 1 - stop mds --force || return 2 } -run_test 19 "start/stop MDS without OSTs" +run_test 20b "start ost before mds, stop mds first" + +test_20c() { + start_ost + start_mds + start_ost2 + stop_ost + stop_ost2 + stop_mds +} +run_test 20c "start mds between two osts, stop mds last" + +test_21() { + reformat + start_mds + echo Client mount before any osts are in the logs + mount_client $MOUNT + check_mount && return 41 + pass + + echo Client mount with ost in logs, but none running + start_ost + stop_ost + mount_client $MOUNT + # check_mount will block trying to contact ost + umount_client $MOUNT + pass + + echo Client mount with a running ost + start_ost + mount_client $MOUNT + sleep 5 #bz10476 + check_mount || return 41 + pass + + cleanup +} +run_test 21 "start a client before osts" + + +umount_client $MOUNT +cleanup_nocli equals_msg "Done" diff --git a/lustre/tests/insanity.sh b/lustre/tests/insanity.sh index 03a8f7d..d399036 100755 --- a/lustre/tests/insanity.sh +++ b/lustre/tests/insanity.sh @@ -18,7 +18,9 @@ UPCALL=${UPCALL:-DEFAULT} build_test_filter -assert_env mds_HOST ost1_HOST ost2_HOST client_HOST LIVE_CLIENT +assert_env mds_HOST MDS_MKFS_OPTS MDSDEV +assert_env ost1_HOST ost2_HOST OST_MKFS_OPTS OSTDEV +assert_env LIVE_CLIENT FSNAME #### # Initialize all the ostN_HOST @@ -111,48 +113,36 @@ reintegrate_clients() { DOWN_NUM=0 } -gen_config() { - rm -f $XMLCONFIG - add_mds mds --dev $MDSDEV --size $MDSSIZE 
--journal-size $MDSJOURNALSIZE - - if [ ! -z "$mdsfailover_HOST" ]; then - add_mdsfailover mds --dev $MDSDEV --size $MDSSIZE - fi - - add_lov lov1 mds --stripe_sz $STRIPE_BYTES\ - --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0 - for i in `seq $NUMOST`; do - dev=`printf $OSTDEV $i` - add_ost ost$i --lov lov1 --dev $dev --size $OSTSIZE \ - --journal-size $OSTJOURNALSIZE - done - - - add_client client mds --lov lov1 --path $MOUNT +start_ost() { + local dev=`printf $OSTDEV $1` + start ost$1 $dev $OST_MOUNT_OPTS } setup() { - gen_config - + cleanup rm -rf logs/* + wait_for mds + add mds $MDS_MKFS_OPTS --reformat $MDSDEV >> /dev/null + start mds $MDSDEV $MDS_MOUNT_OPTS for i in `seq $NUMOST`; do + local dev=`printf $OSTDEV $i` + local index=$((i - 1)) wait_for ost$i - start ost$i ${REFORMAT} $OSTLCONFARGS + echo Adding ost$i at index $index dev $dev + add ost$i $OST_MKFS_OPTS --reformat --index=$index $dev >> /dev/null + start ost$i $dev $OST_MOUNT_OPTS done [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE - wait_for mds - start mds $MDSLCONFARGS ${REFORMAT} + while ! do_node $CLIENTS "ls -d $LUSTRE" > /dev/null; do sleep 5; done grep " $MOUNT " /proc/mounts || zconf_mount $CLIENTS $MOUNT - } cleanup() { zconf_umount $CLIENTS $MOUNT - - stop mds ${FORCE} $MDSLCONFARGS || : + stop mds -f for i in `seq $NUMOST`; do - stop ost$i ${FORCE} $OSTLCONFARGS || : + stop ost$i -f done } @@ -205,11 +195,9 @@ node_to_ost() { done echo "No ost found for node; $node" return 1 - } - if [ "$ONLY" == "cleanup" ]; then $CLEANUP exit @@ -230,17 +218,14 @@ fi echo "Starting Test 17 at `date`" test_0() { - echo "Failover MDS" facet_failover mds echo "Waiting for df pid: $DFPID" wait $DFPID || { echo "df returned $?" && return 1; } - echo "Failing OST1" facet_failover ost1 echo "Waiting for df pid: $DFPID" wait $DFPID || { echo "df returned $?" 
&& return 2; } - echo "Failing OST2" facet_failover ost2 echo "Waiting for df pid: $DFPID" wait $DFPID || { echo "df returned $?" && return 3; } @@ -261,7 +246,6 @@ test_2() { echo "Verify Lustre filesystem is up and running" client_df - echo "Failing MDS" shutdown_facet mds reboot_facet mds @@ -273,17 +257,15 @@ test_2() { DFPID=$! sleep 5 - echo "Failing OST" shutdown_facet ost1 echo "Reintegrating OST" reboot_facet ost1 wait_for ost1 - start ost1 + start_ost 1 - echo "Failover MDS" wait_for mds - start mds + start mds $MDSDEV $MDS_MOUNT_OPTS #Check FS wait $DFPID @@ -331,7 +313,6 @@ test_4() { echo "Fourth Failure Mode: OST/MDS `date`" #OST Portion - echo "Failing OST ost1" shutdown_facet ost1 #Check FS @@ -341,7 +322,6 @@ test_4() { sleep 5 #MDS Portion - echo "Failing MDS" shutdown_facet mds reboot_facet mds @@ -357,11 +337,10 @@ test_4() { echo "Reintegrating OST" reboot_facet ost1 wait_for ost1 - start ost1 + start_ost 1 - echo "Failover MDS" wait_for mds - start mds + start mds $MDSDEV $MDS_MOUNT_OPTS #Check FS wait $DFPIDA @@ -382,7 +361,6 @@ test_5() { client_df #OST Portion - echo "Failing OST" shutdown_facet ost1 reboot_facet ost1 @@ -393,7 +371,6 @@ test_5() { sleep 5 #OST Portion - echo "Failing OST" shutdown_facet ost2 reboot_facet ost2 @@ -406,9 +383,9 @@ test_5() { #Reintegration echo "Reintegrating OSTs" wait_for ost1 - start ost1 + start_ost 1 wait_for ost2 - start ost2 + start_ost 2 clients_recover_osts ost1 clients_recover_osts ost2 @@ -431,7 +408,6 @@ test_6() { client_touch testfile || return 2 #OST Portion - echo "Failing OST" shutdown_facet ost1 reboot_facet ost1 @@ -454,7 +430,7 @@ test_6() { #Reintegration echo "Reintegrating OST/CLIENTs" wait_for ost1 - start ost1 + start_ost 1 reintegrate_clients sleep 5 @@ -496,7 +472,6 @@ test_7() { client_rm testfile #MDS Portion - echo "Failing MDS" facet_failover mds #Check FS @@ -548,7 +523,6 @@ test_8() { #OST Portion - echo "Failing OST" shutdown_facet ost1 reboot_facet ost1 @@ -565,7 +539,7 @@ 
test_8() { echo "Reintegrating CLIENTs/OST" reintegrate_clients wait_for ost1 - start ost1 + start_ost 1 wait $DFPID client_df || return 1 client_touch testfile2 || return 2 @@ -637,5 +611,4 @@ test_10() { run_test 10 "Running Availability for 6 hours..." equals_msg "Done, cleaning up" -# we need to force cleanup for the stale MDS conns until bug 5921 is fixed -FORCE=--force $CLEANUP +$CLEANUP diff --git a/lustre/tests/llmount.sh b/lustre/tests/llmount.sh index ca26b2a..80fa2b3 100755 --- a/lustre/tests/llmount.sh +++ b/lustre/tests/llmount.sh @@ -29,13 +29,16 @@ fi [ "$DEBUG" ] && debug_opt="--ptldebug=$DEBUG" [ "$PTLDEBUG" ] && debug_opt="--ptldebug=$PTLDEBUG" -${LCONF} $NOMOD $portals_opt $lustre_opt $debug_opt $node_opt ${REFORMAT:---reformat} $@ \ - $conf_opt || { +echo llmount: FIXME replace llmount.sh with ". mountconf.sh" and "$SETUP" + +exit 1 + +#${LCONF} $NOMOD $portals_opt $lustre_opt $debug_opt $node_opt ${REFORMAT:---reformat} $@ $conf_opt || { # maybe acceptor error, dump tcp port usage - netstat -tpn - exit 2 -} +# netstat -tpn +# exit 2 +#} -if [ "$MOUNT2" ]; then - $LLMOUNT -v -o user_xattr,acl `hostname`:/mds1/client $MOUNT2 || exit 3 -fi +#if [ "$MOUNT2" ]; then +# $LLMOUNT -v -o user_xattr,acl `hostname`:/mds1/client $MOUNT2 || exit 3 +#fi diff --git a/lustre/tests/llmountcleanup.sh b/lustre/tests/llmountcleanup.sh index 3293050..7d8eda9 100755 --- a/lustre/tests/llmountcleanup.sh +++ b/lustre/tests/llmountcleanup.sh @@ -30,8 +30,12 @@ fi [ "$MOUNT2" ] && umount $MOUNT2 -${LCONF} $NOMOD $portals_opt $lustre_opt $node_opt --cleanup $@ \ +#${LCONF} $NOMOD $portals_opt $lustre_opt $node_opt --cleanup $@ \ --dump $TMP/debug $conf_opt + +echo FIXME llmountcleanup should be replaced with $CLEANUP +exit 1 + rc=$? 
echo "lconf DONE" BUSY=`dmesg | grep -i destruct` diff --git a/lustre/tests/llrmount.sh b/lustre/tests/llrmount.sh deleted file mode 100755 index 434ef44..0000000 --- a/lustre/tests/llrmount.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/sh -# vim:expandtab:shiftwidth=4:softtabstop=4:tabstop=4: - -export PATH=`dirname $0`/../utils:$PATH - -LCONF=${LCONF:-lconf} -NAME=${NAME:-local} -LLMOUNT=${LLMOUNT:-llmount} - -config=$NAME.xml -mkconfig=$NAME.sh - -if [ "$PORTALS" ]; then - portals_opt="--portals=$PORTALS" -fi - -if [ "$LUSTRE" ]; then - lustre_opt="--lustre=$LUSTRE" -fi - -if [ "$LDAPURL" ]; then - conf_opt="--ldapurl $LDAPURL --config $NAME" -else - if [ ! -f $config -o $mkconfig -nt $config ]; then - sh $mkconfig $config || exit 1 - fi - conf_opt="$config" -fi - -[ "$NODE" ] && node_opt="--node $NODE" -[ "$DEBUG" ] && portals_opt="$portals_opt --ptldebug=$DEBUG" -[ "$PTLDEBUG" ] && portals_opt="$portals_opt --ptldebug=$PTLDEBUG" - -${LCONF} $NOMOD $portals_opt $lustre_opt $node_opt $@ $conf_opt || { - # maybe acceptor error, dump tcp port usage - netstat -tpn - exit 2 -} - - -if [ "$MOUNT2" ]; then - $LLMOUNT -v -o user_xattr,acl `hostname`:/mds1/client $MOUNT2 || exit 3 -fi diff --git a/lustre/tests/local.sh b/lustre/tests/local.sh index 0a8cc71..fb19ac6 100755 --- a/lustre/tests/local.sh +++ b/lustre/tests/local.sh @@ -4,29 +4,30 @@ export PATH=`dirname $0`/../utils:$PATH config=${1:-`basename $0 .sh`.xml} -LMC="${LMC:-lmc} -m $config" +LMC=echo TMP=${TMP:-/tmp} +FSNAME=lustre HOSTNAME=`hostname` -MDSDEV=${MDSDEV:-$TMP/mds1-`hostname`} +MDSDEV=${MDSDEV:-$TMP/mdt-${FSNAME}} MDSSIZE=${MDSSIZE:-400000} -FSTYPE=${FSTYPE:-ext3} -MOUNT=${MOUNT:-/mnt/lustre} +MOUNT=${MOUNT:-/mnt/${FSNAME}} MOUNT2=${MOUNT2:-${MOUNT}2} NETTYPE=${NETTYPE:-tcp} [ "$ACCEPTOR_PORT" ] && PORT_OPT="--port $ACCEPTOR_PORT" -OSTDEV=${OSTDEV:-$TMP/ost1-`hostname`} +OSTDEV=${OSTDEV:-$TMP/ost0-${FSNAME}} OSTSIZE=${OSTSIZE:-400000} +OSTDEV2=${OSTDEV2:-$TMP/ost1-${FSNAME}} 
MDS_MOUNT_OPTS="user_xattr,acl,${MDS_MOUNT_OPTS:-""}" CLIENTOPT="user_xattr,acl,${CLIENTOPT:-""}" # specific journal size for the ost, in MB JSIZE=${JSIZE:-0} -[ "$JSIZE" -gt 0 ] && JARG="--journal_size $JSIZE" +[ "$JSIZE" -gt 0 ] && OST_MKFS_OPTS=$OST_MKFS_OPTS" -J size=$JSIZE" MDSISIZE=${MDSISIZE:-0} -[ "$MDSISIZE" -gt 0 ] && IARG="--inode_size $MDSISIZE" +[ "$MDSISIZE" -gt 0 ] && MDS_MKFS_OPTS=$MDS_MKFS_OPTS" -i $MDSISIZE" STRIPE_BYTES=${STRIPE_BYTES:-1048576} STRIPES_PER_OBJ=1 # 0 means stripe over all OSTs @@ -58,38 +59,28 @@ h2iib () { esac } -# create nodes -${LMC} --add node --node $HOSTNAME || exit 10 -${LMC} --add net --node $HOSTNAME --nid `h2$NETTYPE $HOSTNAME` \ - --nettype $NETTYPE $PORT_OPT || exit 11 -${LMC} --add net --node client --nid '*' --nettype $NETTYPE $PORT_OPT|| exit 12 +MGSNID=`h2$NETTYPE $HOSTNAME` # configure mds server [ "x$MDS_MOUNT_OPTS" != "x" ] && - MDS_MOUNT_OPTS="--mountfsoptions $MDS_MOUNT_OPTS" - + MDS_MOUNT_OPTS="--mountfsoptions=$MDS_MOUNT_OPTS" +[ "x$MDS_MKFS_OPTS" != "x" ] && + MDS_MOUNT_OPTS="--mkfsoptions=\"$MDS_MOUNT_OPTS\"" [ "x$QUOTA_OPTS" != "x" ] && QUOTA_OPTS="--quota $QUOTA_OPTS" - -# configure mds server -${LMC} --add mds --node $HOSTNAME --mds mds1 --fstype $FSTYPE \ - --dev $MDSDEV $MDS_MOUNT_OPTS $QUOTA_OPTS\ - --size $MDSSIZE $JARG $IARG $MDSOPT || exit 20 +[ ! 
-z "$mdsfailover_HOST" ] && MDS_FAIL_OPT="--failnode=$mdsfailover_HOST" + +MDS_OPTS="--mgs $MDS_FAIL_OPT --device-size=$MDSSIZE $MDS_MOUNT_OPTS $MDS_MKFS_OPTS" +echo mkfs.lustre --mdt $MDS_OPTS --reformat $MDSDEV [ "x$OST_MOUNT_OPTS" != "x" ] && - OST_MOUNT_OPTS="--mountfsoptions $OST_MOUNT_OPTS" - -# configure ost -${LMC} --add lov --lov lov1 --mds mds1 --stripe_sz $STRIPE_BYTES \ - --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0 $LOVOPT || exit 20 - -${LMC} --add ost --node $HOSTNAME --lov lov1 --fstype $FSTYPE \ - --dev $OSTDEV $QUOTA_OPTS\ - $OST_MOUNT_OPTS --size $OSTSIZE $JARG $OSTOPT || exit 30 - -# create client config -[ "x$CLIENTOPT" != "x" ] && CLIENTOPT="--clientoptions $CLIENTOPT" -${LMC} --add mtpt --node $HOSTNAME --path $MOUNT \ - --mds mds1 --lov lov1 $CLIENTOPT || exit 40 -${LMC} --add mtpt --node client --path $MOUNT2 \ - --mds mds1 --lov lov1 $CLIENTOPT || exit 41 + OST_MOUNT_OPTS="--mountfsoptions=$OST_MOUNT_OPTS" +[ "x$OST_MKFS_OPTS" != "x" ] && + OST_MOUNT_OPTS="--mkfsoptions=\"$OST_MOUNT_OPTS\"" + +OST_OPTS="--mgsnode=`h2$NETTYPE $HOSTNAME` $OST_FAIL_OPT --device-size=$OSTSIZE $OST_MOUNT_OPTS $OST_MKFS_OPTS" +echo mkfs.lustre --ost $OST_OPTS --reformat $OSTDEV + +OST2_OPTS="--mgsnode=`h2$NETTYPE $HOSTNAME` $OST_FAIL_OPT --device-size=$OSTSIZE $OST_MOUNT_OPTS $OST_MKFS_OPTS" +echo mkfs.lustre --ost $OST2_OPTS --reformat $OSTDEV2 + diff --git a/lustre/tests/lov.sh b/lustre/tests/lov.sh index 352c2b9..62c3b14 100755 --- a/lustre/tests/lov.sh +++ b/lustre/tests/lov.sh @@ -19,7 +19,7 @@ NETTYPE=${NETTYPE:-tcp} [ "$ACCEPTOR_PORT" ] && PORT_OPT="--port $ACCEPTOR_PORT" OSTCOUNT=${OSTCOUNT:-5} -# OSTDEVN will still override the device for OST N +# OSTDEVn will still override the device for OST n OSTSIZE=${OSTSIZE:-150000} # 1 to config an echo client instead of llite diff --git a/lustre/tests/mmap_sanity.c b/lustre/tests/mmap_sanity.c index 5a61806..91b6a2f 100644 --- a/lustre/tests/mmap_sanity.c +++ b/lustre/tests/mmap_sanity.c @@ -198,7 +198,7 @@ 
out_close: return rc; } -/* cocurrent mmap operations on two nodes */ +/* concurrent mmap operations on two nodes */ static int mmap_tst3(char *mnt) { char *ptr, mmap_file[256]; @@ -403,7 +403,7 @@ static int cancel_lru_locks(char *prefix) } if (prefix) - sprintf(cmd, "ls /proc/fs/lustre/ldlm/namespaces/%s_*/lru_size", prefix); + sprintf(cmd, "ls /proc/fs/lustre/ldlm/namespaces/*-%s-*/lru_size", prefix); else sprintf(cmd, "ls /proc/fs/lustre/ldlm/namespaces/*/lru_size"); @@ -472,7 +472,7 @@ static int mmap_tst5(char *mnt) memset(ptr, 'a', region); /* cancel unused locks */ - cancel_lru_locks("OSC"); + cancel_lru_locks("osc"); if (rc) goto out_unmap; @@ -538,7 +538,7 @@ static int mmap_tst6(char *mnt) goto out; } - cancel_lru_locks("OSC"); + cancel_lru_locks("osc"); if (rc) goto out; @@ -594,11 +594,11 @@ struct test_case { struct test_case tests[] = { { 1, "mmap test1: basic mmap operation", mmap_tst1, 1 }, { 2, "mmap test2: MAP_PRIVATE not write back", mmap_tst2, 1 }, - { 3, "mmap test3: cocurrent mmap ops on two nodes", mmap_tst3, 2 }, - { 4, "mmap test4: c1 write to f1 from mmaped f2, " - "c2 write to f1 from mmaped f1", mmap_tst4, 2 }, + { 3, "mmap test3: concurrent mmap ops on two nodes", mmap_tst3, 2 }, + { 4, "mmap test4: c1 write to f1 from mmapped f2, " + "c2 write to f1 from mmapped f1", mmap_tst4, 2 }, { 5, "mmap test5: read/write file to/from the buffer " - "which mmaped to just this file", mmap_tst5, 1 }, + "which mmapped to just this file", mmap_tst5, 1 }, { 6, "mmap test6: check mmap write/read content on two nodes", mmap_tst6, 2 }, { 0, NULL, 0, 0 } diff --git a/lustre/tests/mountconf.sh b/lustre/tests/mountconf.sh new file mode 100755 index 0000000..0d71f75 --- /dev/null +++ b/lustre/tests/mountconf.sh @@ -0,0 +1,59 @@ +#!/bin/sh + +#set -vx + +# mountconf setup of MDS and two OSTs + +#export PATH=`dirname $0`/../utils:$PATH +#LUSTRE=${LUSTRE:-`dirname $0`/..} +#. 
$LUSTRE/tests/test-framework.sh +#init_test_env $@ + +mcstopall() { + # make sure we are using the primary server, so test-framework will + # be able to clean up properly. + activemds=`facet_active mds` + if [ $activemds != "mds" ]; then + fail mds + fi + + grep " $MOUNT " /proc/mounts && zconf_umount `hostname` $MOUNT $* + stop ost -f + stop ost2 -f + stop mds -f + return 0 +} + +mccleanup() { + echo "mountconf cleanup $*" + mcstopall $* + unload_modules +} + +mcformat() { + mcstopall + echo Formatting mds, ost, ost2 + add mds $MDS_MKFS_OPTS --reformat $MDSDEV > /dev/null || exit 10 + add ost $OST_MKFS_OPTS --reformat $OSTDEV > /dev/null || exit 10 + add ost2 $OST2_MKFS_OPTS --reformat $OSTDEV2 > /dev/null || exit 10 +} +export MCFORMAT=${MCFORMAT:-"mcformat"} + +mount_client() { + grep " $1 " /proc/mounts || zconf_mount `hostname` $* +} + +mcsetup() { + echo Setup mds, ost, ost2 + start mds $MDSDEV $MDS_MOUNT_OPTS + start ost $OSTDEV $OST_MOUNT_OPTS + start ost2 $OSTDEV2 $OST2_MOUNT_OPTS + [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE + + mount_client $MOUNT + sleep 5 +} + +export MCSETUP=${MCSETUP:-"mcsetup"} +export MCCLEANUP=${MCCLEANUP:-"mccleanup"} + diff --git a/lustre/tests/oos.sh b/lustre/tests/oos.sh index 0d12568..3da2ceb 100755 --- a/lustre/tests/oos.sh +++ b/lustre/tests/oos.sh @@ -1,7 +1,7 @@ #!/bin/bash set -e -set -vx +#set -vx export PATH=`dirname $0`/../utils:$PATH LFS=${LFS:-lfs} @@ -52,7 +52,7 @@ fi # flush cache to OST(s) so avail numbers are correct sync; sleep 1 ; sync -for OSC in /proc/fs/lustre/osc/OSC*MNT*; do +for OSC in /proc/fs/lustre/osc/*-osc-*; do AVAIL=`cat $OSC/kbytesavail` GRANT=`cat $OSC/cur_grant_bytes` [ $(($AVAIL - $GRANT / 1024)) -lt 400 ] && OSCFULL=full @@ -60,7 +60,7 @@ done if [ -z "$OSCFULL" ]; then echo "no OSTs are close to full" - grep "[0-9]" /proc/fs/lustre/osc/OSC*MNT*/{kbytesavail,cur*} + grep "[0-9]" /proc/fs/lustre/osc/*-osc-*/{kbytesavail,cur*} SUCCESS=0 fi diff --git 
a/lustre/tests/oos2.sh b/lustre/tests/oos2.sh index f7682bb..1eb5dbd 100644 --- a/lustre/tests/oos2.sh +++ b/lustre/tests/oos2.sh @@ -54,14 +54,14 @@ fi # flush cache to OST(s) so avail numbers are correct sync; sleep 1 ; sync -for OSC in /proc/fs/lustre/osc/OSC*MNT*; do +for OSC in /proc/fs/lustre/osc/*-osc-*; do AVAIL=`cat $OSC/kbytesavail` GRANT=`cat $OSC/cur_grant_bytes` [ $(($AVAIL - $GRANT / 1024)) -lt 400 ] && OSCFULL=full done if [ -z "$OSCFULL" ]; then echo "no OSTs are close to full" - grep "[0-9]" /proc/fs/lustre/osc/OSC*MNT*/{kbytesavail,cur*}|tee -a $LOG + grep "[0-9]" /proc/fs/lustre/osc/*-osc-*/{kbytesavail,cur*}|tee -a $LOG SUCCESS=0 fi diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index 25d613e..36e90f3 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -5,57 +5,33 @@ set -e # bug 2986 5494 7288 ALWAYS_EXCEPT="20b 24 27 $RECOVERY_SMALL_EXCEPT" -LUSTRE=${LUSTRE:-`dirname $0`/..} +# Tests that always fail with mountconf -- FIXME +# 16 fails with 1, not evicted +EXCEPT="$EXCEPT 16" -. $LUSTRE/tests/test-framework.sh +LUSTRE=${LUSTRE:-`dirname $0`/..} +. $LUSTRE/tests/test-framework.sh init_test_env $@ - . ${CONFIG:=$LUSTRE/tests/cfg/local.sh} build_test_filter - # Allow us to override the setup if we already have a mounted system by # setting SETUP=" " and CLEANUP=" " SETUP=${SETUP:-"setup"} CLEANUP=${CLEANUP:-"cleanup"} -FORCE=${FORCE:-"--force"} -make_config() { - rm -f $XMLCONFIG - add_mds mds --dev $MDSDEV --size $MDSSIZE - add_lov lov1 mds --stripe_sz $STRIPE_BYTES \ - --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0 - add_ost ost --lov lov1 --dev $OSTDEV --size $OSTSIZE - add_ost ost2 --lov lov1 --dev ${OSTDEV}-2 --size $OSTSIZE - add_client client mds --lov lov1 --path $MOUNT -} +# for MCSETUP and MCCLEANUP +. 
mountconf.sh setup() { - make_config - start ost --reformat $OSTLCONFARGS - start ost2 --reformat $OSTLCONFARGS - [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE - start mds $MDSLCONFARGS --reformat - grep " $MOUNT " /proc/mounts || zconf_mount `hostname` $MOUNT + $MCFORMAT + $MCSETUP } cleanup() { - zconf_umount `hostname` $MOUNT - stop mds ${FORCE} $MDSLCONFARGS - stop ost2 ${FORCE} - stop ost ${FORCE} --dump $TMP/recovery-small-`hostname`.log -} - -replay() { - do_mds "sync" - do_mds 'echo -e "device \$mds1\\nprobe\\nnotransno\\nreadonly" | lctl' - do_client "$1" & - shutdown_mds -f - start_mds - wait - do_client "df -h $MOUNT" # trigger failover, if we haven't already + $MCCLEANUP > /dev/null || { echo "FAILed to clean up"; exit 20; } } if [ ! -z "$EVAL" ]; then @@ -65,12 +41,11 @@ fi if [ "$ONLY" == "cleanup" ]; then sysctl -w lnet.debug=0 || true - FORCE=--force cleanup + cleanup exit fi -REFORMAT=--reformat $SETUP -unset REFORMAT +$SETUP [ "$ONLY" == "setup" ] && exit @@ -93,14 +68,14 @@ test_3() { run_test 3 "stat: drop req, drop rep" test_4() { - do_facet client "cp /etc/resolv.conf $MOUNT/resolv.conf" || return 1 - drop_request "cat $MOUNT/resolv.conf > /dev/null" || return 2 - drop_reply "cat $MOUNT/resolv.conf > /dev/null" || return 3 + do_facet client "cp /etc/passwd $MOUNT/passwd" || return 1 + drop_request "cat $MOUNT/passwd > /dev/null" || return 2 + drop_reply "cat $MOUNT/passwd > /dev/null" || return 3 } run_test 4 "open: drop req, drop rep" test_5() { - drop_request "mv $MOUNT/resolv.conf $MOUNT/renamed" || return 1 + drop_request "mv $MOUNT/passwd $MOUNT/renamed" || return 1 drop_reint_reply "mv $MOUNT/renamed $MOUNT/renamed-again" || return 2 do_facet client "checkstat -v $MOUNT/renamed-again" || return 3 } @@ -152,7 +127,7 @@ test_11(){ do_facet client multiop $MOUNT/$tfile Ow || return 1 do_facet client multiop $MOUNT/$tfile or || return 2 - cancel_lru_locks OSC + cancel_lru_locks osc do_facet client multiop 
$MOUNT/$tfile or || return 3 drop_bl_callback multiop $MOUNT/$tfile Ow || echo "evicted as expected" @@ -207,15 +182,15 @@ test_15() { } run_test 15 "failed open (-ENOMEM)" -READ_AHEAD=`cat /proc/fs/lustre/llite/*/max_read_ahead_mb | head -n 1` +READ_AHEAD=`cat $LPROC/llite/*/max_read_ahead_mb | head -n 1` stop_read_ahead() { - for f in /proc/fs/lustre/llite/*/max_read_ahead_mb; do + for f in $LPROC/llite/*/max_read_ahead_mb; do echo 0 > $f done } start_read_ahead() { - for f in /proc/fs/lustre/llite/*/max_read_ahead_mb; do + for f in $LPROC/llite/*/max_read_ahead_mb; do echo $READ_AHEAD > $f done } @@ -227,7 +202,7 @@ test_16() { #define OBD_FAIL_PTLRPC_BULK_PUT_NET 0x504 | OBD_FAIL_ONCE do_facet ost sysctl -w lustre.fail_loc=0x80000504 - cancel_lru_locks OSC + cancel_lru_locks osc # will get evicted here do_facet client "cmp /etc/termcap $MOUNT/termcap" && return 1 sysctl -w lustre.fail_loc=0 @@ -260,7 +235,7 @@ test_18a() { do_facet client mkdir -p $MOUNT/$tdir f=$MOUNT/$tdir/$tfile - cancel_lru_locks OSC + cancel_lru_locks osc pgcache_empty || return 1 # 1 stripe on ost2 @@ -268,14 +243,13 @@ test_18a() { do_facet client cp /etc/termcap $f sync - local osc2_dev=`$LCTL device_list | \ - awk '(/ost2.*client_facet/){print $4}' ` - $LCTL --device %$osc2_dev deactivate + local osc2dev=`grep ${ost2_svc}-osc- $LPROC/devices | awk '{print $1}'` + $LCTL --device $osc2dev deactivate || return 3 # my understanding is that there should be nothing in the page # cache after the client reconnects? 
rc=0 pgcache_empty || rc=2 - $LCTL --device %$osc2_dev activate + $LCTL --device $osc2dev activate rm -f $f return $rc } @@ -286,7 +260,7 @@ test_18b() { f=$MOUNT/$tdir/$tfile f2=$MOUNT/$tdir/${tfile}-2 - cancel_lru_locks OSC + cancel_lru_locks osc pgcache_empty || return 1 # shouldn't have to set stripe size of count==1 @@ -329,7 +303,7 @@ test_19b() { do_facet client multiop $f Ow || return 1 do_facet client multiop $f or || return 2 - cancel_lru_locks OSC + cancel_lru_locks osc do_facet client multiop $f or || return 3 drop_ldlm_cancel multiop $f Ow || echo "client evicted, as expected" @@ -343,7 +317,7 @@ test_20a() { # bug 2983 - ldlm_handle_enqueue cleanup multiop $DIR/$tdir/${tfile} O_wc & MULTI_PID=$! sleep 1 - cancel_lru_locks OSC + cancel_lru_locks osc #define OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR 0x308 do_facet ost sysctl -w lustre.fail_loc=0x80000308 kill -USR1 $MULTI_PID @@ -356,7 +330,7 @@ run_test 20a "ldlm_handle_enqueue error (should return error)" test_20b() { # bug 2986 - ldlm_handle_enqueue error during open mkdir -p $DIR/$tdir touch $DIR/$tdir/${tfile} - cancel_lru_locks OSC + cancel_lru_locks osc #define OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR 0x308 do_facet ost sysctl -w lustre.fail_loc=0x80000308 dd if=/etc/hosts of=$DIR/$tdir/$tfile && \ @@ -377,7 +351,7 @@ run_test 20b "ldlm_handle_enqueue error (should return error)" test_24() { # bug 2248 - eviction fails writeback but app doesn't see it mkdir -p $DIR/$tdir - cancel_lru_locks OSC + cancel_lru_locks osc multiop $DIR/$tdir/$tfile Owy_wyc & MULTI_PID=$! 
usleep 500 @@ -399,7 +373,7 @@ test_26() { # bug 5921 - evict dead exports by pinger echo "skipping test 26 (local OST)" && return [ "`lsmod | grep mds`" ] && \ echo "skipping test 26 (local MDS)" && return - OST_FILE=/proc/fs/lustre/obdfilter/ost_svc/num_exports + OST_FILE=$LPROC/obdfilter/ost_svc/num_exports OST_EXP="`do_facet ost cat $OST_FILE`" OST_NEXP1=`echo $OST_EXP | cut -d' ' -f2` echo starting with $OST_NEXP1 OST exports @@ -421,9 +395,9 @@ run_test 26 "evict dead exports" test_26b() { # bug 10140 - evict dead exports by pinger zconf_mount `hostname` $MOUNT2 - MDS_FILE=/proc/fs/lustre/mds/mds_svc/num_exports + MDS_FILE=$LPROC/mds/${mds_svc}/num_exports MDS_NEXP1="`do_facet mds cat $MDS_FILE | cut -d' ' -f2`" - OST_FILE=/proc/fs/lustre/obdfilter/ost_svc/num_exports + OST_FILE=$LPROC/obdfilter/${ost_svc}/num_exports OST_NEXP1="`do_facet ost cat $OST_FILE | cut -d' ' -f2`" echo starting with $OST_NEXP1 OST and $MDS_NEXP1 MDS exports zconf_umount `hostname` $MOUNT2 -f @@ -566,5 +540,4 @@ test_52() { } run_test 52 "failover OST under load" - -FORCE=--force $CLEANUP +$CLEANUP diff --git a/lustre/tests/replay-dual.sh b/lustre/tests/replay-dual.sh index 5fe9d3a..05dfdde 100755 --- a/lustre/tests/replay-dual.sh +++ b/lustre/tests/replay-dual.sh @@ -14,57 +14,42 @@ init_test_env $@ SETUP=${SETUP:-"setup"} CLEANUP=${CLEANUP:-"cleanup"} -FORCE=${FORCE:-"--force"} - -gen_config() { - rm -f $XMLCONFIG - add_mds mds --dev $MDSDEV --size $MDSSIZE - if [ ! 
-z "$mdsfailover_HOST" ]; then - add_mdsfailover mds --dev $MDSDEV --size $MDSSIZE - fi - - add_lov lov1 mds --stripe_sz $STRIPE_BYTES \ - --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0 - add_ost ost --lov lov1 --dev $OSTDEV --size $OSTSIZE --failover - add_ost ost2 --lov lov1 --dev ${OSTDEV}-2 --size $OSTSIZE --failover - add_client client mds --lov lov1 --path $MOUNT -} - - build_test_filter cleanup() { - # make sure we are using the primary MDS, so the config log will + # make sure we are using the primary server, so test-framework will # be able to clean up properly. activemds=`facet_active mds` if [ $activemds != "mds" ]; then fail mds fi - umount $MOUNT2 || true - umount $MOUNT || true - rmmod llite || true - stop mds ${FORCE} - stop ost2 ${FORCE} - stop ost ${FORCE} --dump $TMP/replay-dual-`hostname`.log + grep " $MOUNT " /proc/mounts && zconf_umount `hostname` $MOUNT + grep " $MOUNT2 " /proc/mounts && zconf_umount `hostname` $MOUNT2 + stop mds -f + stop ost2 -f + stop ost -f } if [ "$ONLY" == "cleanup" ]; then sysctl -w lnet.debug=0 - FORCE=--force cleanup + cleanup exit fi setup() { - gen_config - start ost --reformat $OSTLCONFARGS - start ost2 --reformat $OSTLCONFARGS - start mds $MDSLCONFARGS --reformat - grep " $MOUNT " /proc/mounts || zconf_mount `hostname` $MOUNT - grep " $MOUNT2 " /proc/mounts || zconf_mount `hostname` $MOUNT2 - -# echo $TIMEOUT > /proc/sys/lustre/timeout + cleanup + add mds $MDS_MKFS_OPTS --reformat $MDSDEV + add ost $OST_MKFS_OPTS --reformat $OSTDEV + add ost2 $OST2_MKFS_OPTS --reformat $OSTDEV2 + start mds $MDSDEV $MDS_MOUNT_OPTS + start ost $OSTDEV $OST_MOUNT_OPTS + start ost2 $OSTDEV2 $OST2_MOUNT_OPTS + # client actions will get EIO until MDT contacts OSTs, so give it a sec + sleep 5 + zconf_mount `hostname` $MOUNT + zconf_mount `hostname` $MOUNT2 } $SETUP @@ -459,7 +444,7 @@ test_18() { # bug 3822 - evicting client with enqueued lock sleep 1 #define OBD_FAIL_LDLM_BL_CALLBACK 0x305 do_facet client sysctl -w 
lustre.fail_loc=0x80000305 # drop cb, evict - cancel_lru_locks MDC + cancel_lru_locks mdc usleep 500 # wait to ensure first client is one that will be evicted openfile -f O_RDONLY $MOUNT2/$tdir/f0 wait $OPENPID @@ -472,5 +457,5 @@ if [ "$ONLY" != "setup" ]; then equals_msg test complete, cleaning up SLEEP=$((`date +%s` - $NOW)) [ $SLEEP -lt $TIMEOUT ] && sleep $SLEEP - FORCE=--force $CLEANUP + $CLEANUP fi diff --git a/lustre/tests/replay-ost-single.sh b/lustre/tests/replay-ost-single.sh index c9ae901..f74e4f4 100755 --- a/lustre/tests/replay-ost-single.sh +++ b/lustre/tests/replay-ost-single.sh @@ -10,38 +10,39 @@ init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/local.sh} ostfailover_HOST=${ostfailover_HOST:-$ost_HOST} +#failover= must be defined in OST_MKFS_OPTIONS if ostfailover_HOST != ost_HOST # Skip these tests # BUG NUMBER: 2766? ALWAYS_EXCEPT="5 $REPLAY_OST_SINGLE_EXCEPT" gen_config() { - rm -f $XMLCONFIG - add_mds mds --dev $MDSDEV --size $MDSSIZE - add_lov lov1 mds --stripe_sz $STRIPE_BYTES \ - --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0 - add_ost ost --lov lov1 --dev $OSTDEV --size $OSTSIZE --failover - if [ ! -z "$ostfailover_HOST" ]; then - add_ostfailover ost --dev $OSTDEV --size $OSTSIZE - fi - add_client client mds --lov lov1 --path $MOUNT + grep " $MOUNT " /proc/mounts && zconf_umount `hostname` $MOUNT + stop ost -f + stop ost2 -f + stop mds -f + echo Formatting mds, ost + add mds $MDS_MKFS_OPTS --reformat $MDSDEV + add ost $OST_MKFS_OPTS --reformat $OSTDEV } cleanup() { - # make sure we are using the primary MDS, so the config log will + # make sure we are using the primary server, so test-framework will # be able to clean up properly. 
activeost=`facet_active ost` if [ $activeost != "ost" ]; then fail ost fi + zconf_umount `hostname` $MOUNT - stop mds ${FORCE} $MDSLCONFARGS - stop ost ${FORCE} --dump $TMP/replay-ost-single-`hostname`.log + stop mds + stop ost + unload_modules } if [ "$ONLY" == "cleanup" ]; then sysctl -w lnet.debug=0 - FORCE=--force cleanup + cleanup exit fi @@ -52,18 +53,15 @@ CLEANUP=${CLEANUP:-"cleanup"} setup() { gen_config - - start ost --reformat $OSTLCONFARGS + start mds $MDSDEV $MDS_MOUNT_OPTS + start ost $OSTDEV $OST_MOUNT_OPTS [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE - start mds --reformat $MDSLCONFARGS if [ -z "`grep " $MOUNT " /proc/mounts`" ]; then # test "-1" needed during initial client->OST connection log "== test 00: target handle mismatch (bug 5317) === `date +%H:%M:%S`" - #define OBD_FAIL_OST_ALL_REPLY_NET 0x211 do_facet ost "sysctl -w lustre.fail_loc=0x80000211" - zconf_mount `hostname` $MOUNT && df $MOUNT && pass || error "mount fail" fi } @@ -117,7 +115,7 @@ test_4() { verify=$ROOT/tmp/verify-$$ dd if=/dev/urandom bs=4096 count=1280 | tee $verify > $DIR/$tfile # invalidate cache, so that we're reading over the wire - for i in /proc/fs/lustre/ldlm/namespaces/OSC_*MNT*; do + for i in /proc/fs/lustre/ldlm/namespaces/*-osc-*; do echo -n clear > $i/lru_size done cmp $verify $DIR/$tfile & @@ -145,7 +143,7 @@ test_5() { run_test 5 "Fail OST during iozone" kbytesfree() { - awk '{total+=$1} END {print total}' /proc/fs/lustre/osc/OSC_*MNT*/kbytesfree + awk '{total+=$1} END {print total}' /proc/fs/lustre/osc/*-osc-*/kbytesfree } test_6() { @@ -199,4 +197,4 @@ test_7() { run_test 7 "Fail OST before obd_destroy" equals_msg test complete, cleaning up -FORCE=--force $CLEANUP +$CLEANUP diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index 8352be3..513766c 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -1,6 +1,7 @@ #!/bin/sh set -e +#set -v # # This test needs to be run on the client 
@@ -12,57 +13,26 @@ LUSTRE=${LUSTRE:-`dirname $0`/..} init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/local.sh} +. mountconf.sh # Skip these tests -# bug number: 2766 9930 +# bug number: 2766 ALWAYS_EXCEPT="0b $REPLAY_SINGLE_EXCEPT" -gen_config() { - rm -f $XMLCONFIG - add_mds mds --dev $MDSDEV --size $MDSSIZE - if [ ! -z "$mdsfailover_HOST" ]; then - add_mdsfailover mds --dev $MDSDEV --size $MDSSIZE - fi - - add_lov lov1 mds --stripe_sz $STRIPE_BYTES \ - --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0 - add_ost ost --lov lov1 --dev $OSTDEV --size $OSTSIZE - add_ost ost2 --lov lov1 --dev ${OSTDEV}-2 --size $OSTSIZE - add_client client mds --lov lov1 --path $MOUNT -} - build_test_filter -cleanup() { - # make sure we are using the primary MDS, so the config log will - # be able to clean up properly. - activemds=`facet_active mds` - if [ $activemds != "mds" ]; then - fail mds - fi - zconf_umount `hostname` $MOUNT - stop mds ${FORCE} $MDSLCONFARGS - stop ost2 ${FORCE} - stop ost ${FORCE} --dump $TMP/replay-single-`hostname`.log -} +SETUP=${SETUP:-"setup"} +CLEANUP=${CLEANUP:-"mcstopall"} if [ "$ONLY" == "cleanup" ]; then sysctl -w lnet.debug=0 || true - FORCE=--force cleanup - exit + $CLEANUP + exit 0 fi -SETUP=${SETUP:-"setup"} -CLEANUP=${CLEANUP:-"cleanup"} - setup() { - gen_config - - start ost --reformat $OSTLCONFARGS - start ost2 --reformat $OSTLCONFARGS - [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE - start mds $MDSLCONFARGS --reformat - grep " $MOUNT " /proc/mounts || zconf_mount `hostname` $MOUNT + mcformat + mcsetup } $SETUP @@ -101,20 +71,20 @@ test_1a() { do_facet ost "sysctl -w lustre.fail_loc=0" rm -fr $DIR/$tfile - local old_last_id=`cat /proc/fs/lustre/obdfilter/*/last_id` + local old_last_id=`cat $LPROC/obdfilter/*/last_id` touch -o $DIR/$tfile 1 sync - local new_last_id=`cat /proc/fs/lustre/obdfilter/*/last_id` + local new_last_id=`cat $LPROC/obdfilter/*/last_id` test "$old_last_id" = "$new_last_id" || { echo "OST object create 
is caused by MDS" return 1 } - old_last_id=`cat /proc/fs/lustre/obdfilter/*/last_id` + old_last_id=`cat $LPROC/obdfilter/*/last_id` echo "data" > $DIR/$tfile sync - new_last_id=`cat /proc/fs/lustre/obdfilter/*/last_id` + new_last_id=`cat $LPROC/obdfilter/*/last_id` test "$old_last_id" = "$new_last_id "&& { echo "CROW does not work on write" return 1 @@ -126,10 +96,10 @@ test_1a() { do_facet ost "sysctl -w lustre.fail_loc=0x80000801" rm -fr $DIR/1a1 - old_last_id=`cat /proc/fs/lustre/obdfilter/*/last_id` + old_last_id=`cat $LPROC/obdfilter/*/last_id` echo "data" > $DIR/1a1 sync - new_last_id=`cat /proc/fs/lustre/obdfilter/*/last_id` + new_last_id=`cat $LPROC/obdfilter/*/last_id` test "$old_last_id" = "$new_last_id" || { echo "CROW does work with fail_loc=0x80000801" return 1 @@ -760,7 +730,7 @@ test_36() { touch $DIR/$tfile checkstat $DIR/$tfile facet_failover mds - cancel_lru_locks MDC + cancel_lru_locks mdc if dmesg | grep "unknown lock cookie"; then echo "cancel after replay failed" return 1 @@ -812,8 +782,7 @@ test_39() { # bug 4176 run_test 39 "test recovery from unlink llog (test llog_gen_rec) " count_ost_writes() { - cat /proc/fs/lustre/osc/*/stats | - awk -vwrites=0 '/ost_write/ { writes += $2 } END { print writes; }' + awk -vwrites=0 '/ost_write/ { writes += $2 } END { print writes; }' $LPROC/osc/*/stats } #b=2477,2532 @@ -864,13 +833,13 @@ test_41() { # make sure the start of the file is ost1 lfs setstripe $f $((128 * 1024)) 0 0 do_facet client dd if=/dev/zero of=$f bs=4k count=1 || return 3 - cancel_lru_locks OSC + cancel_lru_locks osc # fail ost2 and read from ost1 - local osc2_dev=`$LCTL device_list | \ - awk '(/ost2.*client_facet/){print $4}' ` - $LCTL --device %$osc2_dev deactivate + local osc2dev=`grep ${ost2_svc}-osc- $LPROC/devices | awk '{print $1}'` + [ "$osc2dev" ] || return 4 + $LCTL --device $osc2dev deactivate || return 1 do_facet client dd if=$f of=/dev/null bs=4k count=1 || return 3 - $LCTL --device %$osc2_dev activate + $LCTL --device 
$osc2dev activate || return 2 return 0 } run_test 41 "read from a valid osc while other oscs are invalid" @@ -911,8 +880,10 @@ test_43() { # bug 2530 run_test 43 "mds osc import failure during recovery; don't LBUG" test_44() { - mdcdev=`awk '/mds_svc_MNT/ {print $1}' < /proc/fs/lustre/devices` + mdcdev=`awk '/-mdc-/ {print $1}' $LPROC/devices` + [ "$mdcdev" ] || exit 2 for i in `seq 1 10`; do + echo iteration $i #define OBD_FAIL_TGT_CONN_RACE 0x701 do_facet mds "sysctl -w lustre.fail_loc=0x80000701" $LCTL --device $mdcdev recover @@ -924,8 +895,10 @@ test_44() { run_test 44 "race in target handle connect" test_44b() { - mdcdev=`awk '/mds_svc_MNT/ {print $1}' < /proc/fs/lustre/devices` + mdcdev=`awk '/-mdc-/ {print $1}' $LPROC/devices` + [ "$mdcdev" ] || exit 2 for i in `seq 1 10`; do + echo iteration $i #define OBD_FAIL_TGT_DELAY_RECONNECT 0x704 do_facet mds "sysctl -w lustre.fail_loc=0x80000704" $LCTL --device $mdcdev recover @@ -938,7 +911,8 @@ run_test 44b "race in target handle connect" # Handle failed close test_45() { - mdcdev=`awk '/mds_svc_MNT/ {print $1}' < /proc/fs/lustre/devices` + mdcdev=`awk '/-mdc-/ {print $1}' $LPROC/devices` + [ "$mdcdev" ] || exit 2 $LCTL --device $mdcdev recover multiop $DIR/$tfile O_c & @@ -947,13 +921,13 @@ test_45() { # This will cause the CLOSE to fail before even # allocating a reply buffer - $LCTL --device $mdcdev deactivate + $LCTL --device $mdcdev deactivate || return 4 # try the close kill -USR1 $pid wait $pid || return 1 - $LCTL --device $mdcdev activate + $LCTL --device $mdcdev activate || return 5 sleep 1 $CHECKSTAT -t file $DIR/$tfile || return 2 @@ -1012,9 +986,9 @@ test_48() { run_test 48 "MDS->OSC failure during precreate cleanup (2824)" test_50() { - local osc_dev=`$LCTL device_list | \ - awk '(/ost_svc_mds_svc/){print $4}' ` - $LCTL --device %$osc_dev recover && $LCTL --device %$osc_dev recover + local oscdev=`grep ${ost_svc}-osc- $LPROC/devices | awk '{print $1}'` + [ "$oscdev" ] || return 1 + $LCTL --device 
$oscdev recover && $LCTL --device $oscdev recover # give the mds_lov_sync threads a chance to run sleep 5 } @@ -1023,7 +997,7 @@ run_test 50 "Double OSC recovery, don't LASSERT (3812)" # b3764 timed out lock replay test_52() { touch $DIR/$tfile - cancel_lru_locks MDC + cancel_lru_locks mdc multiop $DIR/$tfile s replay_barrier mds @@ -1092,4 +1066,4 @@ test_58() { run_test 58 "test recovery from llog for setattr op (test llog_gen_rec)" equals_msg test complete, cleaning up -FORCE=--force $CLEANUP +$CLEANUP diff --git a/lustre/tests/runtests b/lustre/tests/runtests index 0969f23..7071490 100755 --- a/lustre/tests/runtests +++ b/lustre/tests/runtests @@ -4,7 +4,19 @@ # Probably a good idea to run this before doing any checkins. # In the future this can become more fancy, but it's OK for now. +LUSTRE=${LUSTRE:-`dirname $0`/..} SRCDIR="`dirname $0`" +export PATH=/sbin:/usr/sbin:$SRCDIR:$SRCDIR/../utils:$PATH + +. $LUSTRE/tests/test-framework.sh +init_test_env $@ +. ${CONFIG:=$LUSTRE/tests/cfg/local.sh} +. mountconf.sh + +SETUP=${SETUP:-mcsetup} +FORMAT=${FORMAT:-mcformat} +CLEANUP=${CLEANUP:-mcstopall} + fail() { echo "ERROR: $1" 1>&2 [ $2 ] && RC=$2 || RC=1 @@ -16,14 +28,11 @@ log() { lctl mark "$*" } -export PATH=/sbin:/usr/sbin:$SRCDIR:$SRCDIR/../utils:$PATH ERROR= SRC=/etc [ "$COUNT" ] || COUNT=1000 -[ "$LCONF" ] || LCONF=lconf - [ "$MCREATE" ] || MCREATE=mcreate [ "$MKDIRMANY" ] || MKDIRMANY="createmany -d" @@ -36,10 +45,11 @@ while [ "$1" ]; do shift done -EXISTING_MOUNT="`mount | awk '/ lustre(_lite)? / { print $3 }' | tail -n 1`" +EXISTING_MOUNT=`awk '($3 ~ "lustre" && $1 ~ ":") { print $2 }' /proc/mounts` if [ -z "$EXISTING_MOUNT" ]; then - sh llmount.sh $OPTS - EXISTING_MOUNT="`mount | awk '/ lustre(_lite)? 
/ { print $3 }' | tail -n 1`" + $FORMAT + $SETUP + EXISTING_MOUNT=`awk '($3 ~ "lustre" && $1 ~ ":") { print $2 }' /proc/mounts` [ -z "$EXISTING_MOUNT" ] && fail "no lustre filesystem mounted" 1 I_MOUNTED="yes" fi @@ -93,8 +103,8 @@ done [ "$ERROR" ] && fail "old and new files are different" $ERROR log "finished at `date` ($(($(date +%s) - START)))" -sh llmountcleanup.sh || exit 19 -sh llrmount.sh $OPTS || exit 20 +$CLEANUP || exit 19 +$SETUP || exit 20 log "comparing previously copied files" for f in $FILES; do @@ -104,8 +114,8 @@ done [ "$ERROR" ] && fail "old and new files are different on second diff" $ERROR -sh llmountcleanup.sh || exit 19 -sh llrmount.sh $OPTS || exit 20 +$CLEANUP || exit 19 +$SETUP || exit 20 log "removing $DST" rm -r $V $DST || fail "can't remove $DST" 37 @@ -134,5 +144,5 @@ fi if [ "$I_MOUNTED" = "yes" ]; then sync && sleep 2 && sync # wait for delete thread - sh llmountcleanup.sh || exit 29 + $CLEANUP fi diff --git a/lustre/tests/sanity-quota.sh b/lustre/tests/sanity-quota.sh index 8c1e164..b30fc42 100644 --- a/lustre/tests/sanity-quota.sh +++ b/lustre/tests/sanity-quota.sh @@ -128,7 +128,7 @@ pass() { } mounted_lustre_filesystems() { - awk '($3 ~ "lustre") { print $2 }' /proc/mounts + awk '($3 ~ "lustre" && $1 ~ ":") { print $2 }' /proc/mounts } MOUNT="`mounted_lustre_filesystems`" if [ -z "$MOUNT" ]; then @@ -589,8 +589,7 @@ test_7() echo 0 > /proc/sys/lustre/fail_loc echo " Trigger recovery..." - OSC0_UUID="`$LCTL dl | awk '/.* OSC_[^ ]+_OST.* / { print $1 }'`" - [ -z "$OSC0_UUID" ] && OSC0_UUID="`$LCTL dl | awk '/.* OSC_[^ ]+_ost1.* / { print $1 }'`" + OSC0_UUID="`$LCTL dl | awk '/.* *-osc-* / { print $1 }'`" for i in $OSC0_UUID; do $LCTL --device $i activate > /dev/null 2>&1 || error "activate osc failed!" 
done diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 72ecbc5..b68cb58 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -12,6 +12,12 @@ ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"42a 42b 42c 42d 45 68"} # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! [ "$SLOW" = "no" ] && EXCEPT="$EXCEPT 24o 27m 51b 51c 63 64b 71 77 101" +# Tests that fail on uml +[ "$UML" = "true" ] && EXCEPT="$EXCEPT 31d" + +# Tests that always fail with mountconf -- FIXME +# 48a moving the working dir succeeds +EXCEPT="$EXCEPT 48a" case `uname -r` in 2.4*) FSTYPE=${FSTYPE:-ext3}; ALWAYS_EXCEPT="$ALWAYS_EXCEPT 76" ;; @@ -62,22 +68,31 @@ else fi fi +SANITYLOG=${SANITYLOG:-/tmp/sanity.log} + export NAME=${NAME:-local} SAVE_PWD=$PWD -clean() { +# for MCSETUP and MCCLEANUP +LUSTRE=${LUSTRE:-`dirname $0`/..} +. $LUSTRE/tests/test-framework.sh +init_test_env $@ +. ${CONFIG:=$LUSTRE/tests/cfg/local.sh} +. mountconf.sh + +cleanup() { echo -n "cln.." - sh llmountcleanup.sh ${FORCE} > /dev/null || { echo "FAILed to clean up"; exit 20; } + $MCCLEANUP ${FORCE} $* || { echo "FAILed to clean up"; exit 20; } } -CLEAN=${CLEAN:-:} +CLEANUP=${CLEANUP:-:} -start() { +setup() { echo -n "mnt.." - sh llrmount.sh > /dev/null || exit 10 + $MCSETUP || exit 10 echo "done" } -START=${START:-:} +SETUP=${SETUP:-:} log() { echo "$*" @@ -93,7 +108,6 @@ trace() { } TRACE=${TRACE:-""} -LPROC=/proc/fs/lustre check_kernel_version() { VERSION_FILE=$LPROC/kernel_version WANT_VER=$1 @@ -113,8 +127,8 @@ basetest() { } run_one() { - if ! mount | grep -q $DIR; then - $START + if ! 
grep -q $DIR /proc/mounts; then + $SETUP fi testnum=$1 message=$2 @@ -127,7 +141,7 @@ run_one() { unset TESTNAME pass "($((`date +%s` - $BEFORE))s)" cd $SAVE_PWD - $CLEAN + $CLEANUP } build_test_filter() { @@ -198,13 +212,15 @@ pass() { } mounted_lustre_filesystems() { - awk '($3 ~ "lustre") { print $2 }' /proc/mounts + awk '($3 ~ "lustre" && $1 ~ ":") { print $2 }' /proc/mounts } -MOUNT="`mounted_lustre_filesystems`" -if [ -z "$MOUNT" ]; then - sh llmount.sh - MOUNT="`mounted_lustre_filesystems`" - [ -z "$MOUNT" ] && error "NAME=$NAME not mounted" + +MOUNTED="`mounted_lustre_filesystems`" +if [ -z "$MOUNTED" ]; then + $MCFORMAT + $MCSETUP + MOUNTED="`mounted_lustre_filesystems`" + [ -z "$MOUNTED" ] && error "NAME=$NAME not mounted" I_MOUNTED=yes fi @@ -731,7 +747,7 @@ test_24n() { $CHECKSTAT ${f}.rename $CHECKSTAT -a ${f} } -run_test 24n "Statting the old file after renameing (Posix rename 2)" +run_test 24n "Statting the old file after renaming (Posix rename 2)" test_24o() { check_kernel_version 37 || return 0 @@ -985,11 +1001,11 @@ reset_enospc() { exhaust_precreations() { OSTIDX=$1 - OST=$(head -n $((OSTIDX + 1)) $LPROC/lov/${LOVNAME}/target_obd |\ - tail -n 1 | awk '{print $2}' | sed -e 's/_UUID$//') - - last_id=$(cat $LPROC/osc/OSC_*_${OST}_${MDS}/prealloc_last_id) - next_id=$(cat $LPROC/osc/OSC_*_${OST}_${MDS}/prealloc_next_id) + OST=$(grep ${OSTIDX}": " $LPROC/lov/${LOVNAME}/target_obd | \ + awk '{print $2}' | sed -e 's/_UUID$//') + # on the mdt's osc + last_id=$(cat $LPROC/osc/${OST}-osc/prealloc_last_id) + next_id=$(cat $LPROC/osc/${OST}-osc/prealloc_next_id) mkdir -p $DIR/d27/${OST} $LSTRIPE $DIR/d27/${OST} 0 $OSTIDX 1 @@ -997,7 +1013,7 @@ exhaust_precreations() { sysctl -w lustre.fail_loc=0x215 echo "Creating to objid $last_id on ost $OST..." 
createmany -o $DIR/d27/${OST}/f $next_id $((last_id - next_id + 2)) - grep '[0-9]' $LPROC/osc/OSC_*_${OST}_${MDS}/prealloc* + grep '[0-9]' $LPROC/osc/${OST}-osc/prealloc* reset_enospc $2 } @@ -1093,19 +1109,19 @@ test_28() { run_test 28 "create/mknod/mkdir with bad file types ============" cancel_lru_locks() { - for d in $LPROC/ldlm/namespaces/$1*; do + for d in $LPROC/ldlm/namespaces/*-$1-*; do echo clear > $d/lru_size done - grep "[0-9]" $LPROC/ldlm/namespaces/$1*/lock_unused_count /dev/null + grep "[0-9]" $LPROC/ldlm/namespaces/*-$1-*/lock_unused_count /dev/null } test_29() { - cancel_lru_locks MDC + cancel_lru_locks mdc mkdir $DIR/d29 touch $DIR/d29/foo log 'first d29' ls -l $DIR/d29 - MDCDIR=${MDCDIR:-$LPROC/ldlm/namespaces/MDC_*} + MDCDIR=${MDCDIR:-$LPROC/ldlm/namespaces/*-mdc-*} LOCKCOUNTORIG=`cat $MDCDIR/lock_count` LOCKUNUSEDCOUNTORIG=`cat $MDCDIR/lock_unused_count` log 'second d29' @@ -1637,11 +1653,11 @@ setup_test42() { # file truncation, and file removal. test_42a() { setup_test42 - cancel_lru_locks OSC + cancel_lru_locks osc stop_writeback sync; sleep 1; sync # just to be safe BEFOREWRITES=`count_ost_writes` - grep "[0-9]" $LPROC/osc/OSC*MNT*/cur_grant_bytes + grep "[0-9]" $LPROC/osc/*-osc-*/cur_grant_bytes dd if=/dev/zero of=$DIR/f42a bs=1024 count=100 AFTERWRITES=`count_ost_writes` [ $BEFOREWRITES -eq $AFTERWRITES ] || \ @@ -1652,7 +1668,7 @@ run_test 42a "ensure that we don't flush on close ==============" test_42b() { setup_test42 - cancel_lru_locks OSC + cancel_lru_locks osc stop_writeback sync dd if=/dev/zero of=$DIR/f42b bs=1024 count=100 @@ -1691,7 +1707,7 @@ trunc_test() { test=$1 file=$DIR/$test offset=$2 - cancel_lru_locks OSC + cancel_lru_locks osc stop_writeback # prime the file with 0,EOF PW to match touch $file @@ -1701,7 +1717,7 @@ trunc_test() { dd if=/dev/zero of=$file bs=1024 count=100 BEFOREWRITES=`count_ost_writes` $TRUNCATE $file $offset - cancel_lru_locks OSC + cancel_lru_locks osc AFTERWRITES=`count_ost_writes` start_writeback 
} @@ -1835,7 +1851,7 @@ test_45() { [ $before -gt $after ] || error "writeback didn't lower dirty count" do_dirty_record "echo blah > $f" [ $before -eq $after ] && error "write wasn't cached" - do_dirty_record "cancel_lru_locks OSC" + do_dirty_record "cancel_lru_locks osc" [ $before -gt $after ] || error "lock cancellation didn't lower dirty count" start_writeback } @@ -2064,8 +2080,8 @@ test_52b() { run_test 52b "immutable flag test (should return errors) =======" test_53() { - for i in `ls -d $LPROC/osc/OSC*mds1 2> /dev/null` ; do - ostname=`echo $i | cut -d _ -f 3-4 | sed -e s/_mds1//` + for i in `ls -d $LPROC/osc/*-osc 2> /dev/null` ; do + ostname=`basename $i | cut -d - -f 1-2` ost_last=`cat $LPROC/obdfilter/$ostname/last_id` mds_last=`cat $i/prealloc_last_id` echo "$ostname.last_id=$ost_last ; MDS.last_id=$mds_last" @@ -2304,7 +2320,7 @@ run_test 60b "limit repeated messages from CERROR/CWARN ========" test_61() { f="$DIR/f61" dd if=/dev/zero of=$f bs=`page_size` count=1 - cancel_lru_locks OSC + cancel_lru_locks osc multiop $f OSMWUc || error sync } @@ -2314,7 +2330,7 @@ run_test 61 "mmap() writes don't make sync hang ================" test_62() { f="$DIR/f62" echo foo > $f - cancel_lru_locks OSC + cancel_lru_locks osc sysctl -w lustre.fail_loc=0x405 cat $f && error "cat succeeded, expect -EIO" sysctl -w lustre.fail_loc=0 @@ -2367,7 +2383,7 @@ run_test 63b "async write errors should be returned to fsync ===" test_64a () { df $DIR - grep "[0-9]" $LPROC/osc/OSC*MNT*/cur* + grep "[0-9]" $LPROC/osc/*-osc-*/cur* } run_test 64a "verify filter grant calculations (in kernel) =====" @@ -2454,9 +2470,9 @@ run_test 65i "set default striping on root directory (bug 6367)=" test_65j() { # bug6367 # if we aren't already remounting for each test, do so for this test - if [ "$CLEAN" = ":" ]; then - clean || error "failed to unmount" - start || error "failed to remount" + if [ "$CLEANUP" = ":" ]; then + cleanup -f || error "failed to unmount" + setup || error "failed to 
remount" fi $LSTRIPE -d $MOUNT || true } @@ -2553,7 +2569,7 @@ test_69() { sysctl -w lustre.fail_loc=0 $DIRECTIO write $f 0 2 || error "write error" - cancel_lru_locks OSC + cancel_lru_locks osc $DIRECTIO read $f 0 1 || error "read error" sysctl -w lustre.fail_loc=0x217 @@ -2603,7 +2619,7 @@ test_72() { # bug 5695 - Test that on 2.6 remove_suid works properly # See if we are still setuid/sgid test -u $DIR/f72 -o -g $DIR/f72 && error "S/gid is not dropped on write" # Now test that MDS is updated too - cancel_lru_locks MDC + cancel_lru_locks mdc test -u $DIR/f72 -o -g $DIR/f72 && error "S/gid is not dropped on MDS" true } @@ -2819,7 +2835,7 @@ test_101() { local nreads=10000 local cache_limit=32 - for s in $LPROC/osc/OSC_*/rpc_stats; do + for s in $LPROC/osc/*-osc*/rpc_stats; do echo 0 > $s done trap cleanup_101 EXIT @@ -2841,7 +2857,7 @@ test_101() { cleanup_101 if [ $(($discard * 10)) -gt $nreads ] ;then - cat $LPROC/osc/OSC_*/rpc_stats + cat $LPROC/osc/*-osc*/rpc_stats cat $LPROC/llite/*/read_ahead_stats error "too many ($discard) discarded pages" fi @@ -2856,7 +2872,7 @@ test_102() { touch $testfile [ "$UID" != 0 ] && echo "skipping $TESTNAME (must run as root)" && return - [ -z "`grep xattr $LPROC/mdc/MDC*MNT*/connect_flags`" ] && echo "skipping $TESTNAME (must have user_xattr)" && return + [ -z "`grep xattr $LPROC/mdc/*-mdc-*/connect_flags`" ] && echo "skipping $TESTNAME (must have user_xattr)" && return echo "set/get xattr..." 
setfattr -n trusted.name1 -v value1 $testfile || error [ "`getfattr -n trusted.name1 $testfile 2> /dev/null | \ @@ -2907,8 +2923,8 @@ test_103 () { [ "$UID" != 0 ] && echo "skipping $TESTNAME (must run as root)" && return [ -z "`mount | grep " $DIR .*\<acl\>"`" ] && echo "skipping $TESTNAME (must have acl)" && return - [ -z "`grep acl $LPROC/mdc/MDC*MNT*/connect_flags`" ] && echo "skipping $TESTNAME (must have acl)" && return - which setfacl 2>/dev/null || (echo "skipping $TESTNAME (could not find setfacl)" && return) + [ -z "`grep acl $LPROC/mdc/*-mdc-*/connect_flags`" ] && echo "skipping $TESTNAME (must have acl)" && return + which setfacl > /dev/null 2>&1 || { echo "skipping $TESTNAME (could not find setfacl)"; return; } echo "performing cp ..." run_acl_subtest cp || error @@ -2943,7 +2959,7 @@ test_104() { lfs df $DIR/$tfile || error "lfs df $DIR/$tfile failed" lfs df -ih $DIR/$tfile || error "lfs df -ih $DIR/$tfile failed" - OSC=`lctl dl | awk '/OSC.*MNT/ {print $4}' | head -n 1` + OSC=`awk '/-osc-/ {print $4}' $LPROC/devices | head -n 1` lctl --device %$OSC deactivate lfs df || error "lfs df with deactivated OSC failed" lctl --device %$OSC recover @@ -2957,11 +2973,12 @@ HOME=$OLDHOME log "cleanup: ======================================================" if [ "`mount | grep ^$NAME`" ]; then - rm -rf $DIR/[Rdfs][1-9]* - if [ "$I_MOUNTED" = "yes" ]; then - sh llmountcleanup.sh || error "llmountcleanup failed" - fi + rm -rf $DIR/[Rdfs][1-9]* fi +if [ "$I_MOUNTED" = "yes" ]; then + $MCCLEANUP -f || error "cleanup failed" +fi + echo '=========================== finished ===============================' [ -f "$SANITYLOG" ] && cat $SANITYLOG && exit 1 || true diff --git a/lustre/tests/sanityN.sh b/lustre/tests/sanityN.sh index 57cfaa8..a32f2b6 100644 --- a/lustre/tests/sanityN.sh +++ b/lustre/tests/sanityN.sh @@ -7,6 +7,9 @@ ONLY=${ONLY:-"$*"} ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"14b 14c"} # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! 
+# Tests that fail on uml +[ "$UML" = "true" ] && EXCEPT="$EXCEPT 7" + SRCDIR=`dirname $0` PATH=$PWD/$SRCDIR:$SRCDIR:$SRCDIR/../utils:$PATH @@ -33,18 +36,26 @@ fi SAVE_PWD=$PWD -clean() { +# for MCSETUP and MCCLEANUP +LUSTRE=${LUSTRE:-`dirname $0`/..} +. $LUSTRE/tests/test-framework.sh +init_test_env $@ +. ${CONFIG:=$LUSTRE/tests/cfg/local.sh} +. mountconf.sh + +cleanup() { echo -n "cln.." - sh llmountcleanup.sh ${FORCE} > /dev/null || exit 20 + grep " $MOUNT2 " /proc/mounts && zconf_umount `hostname` $MOUNT2 ${FORCE} + $MCCLEANUP ${FORCE} > /dev/null || { echo "FAILed to clean up"; exit 20; } } -CLEAN=${CLEAN:-} +CLEANUP=${CLEANUP:-:} -start() { +setup() { echo -n "mnt.." - sh llrmount.sh > /dev/null || exit 10 + $MCSETUP || exit 10 echo "done" } -START=${START:-} +SETUP=${SETUP:-:} log() { echo "$*" @@ -61,8 +72,8 @@ trace() { TRACE=${TRACE:-""} run_one() { - if ! mount | grep -q $DIR1; then - $START + if ! grep -q $DIR /proc/mounts; then + $SETUP fi testnum=$1 message=$2 @@ -75,7 +86,27 @@ run_one() { unset TESTNAME pass "($((`date +%s` - $BEFORE))s)" cd $SAVE_PWD - $CLEAN + $CLEANUP +} + +build_test_filter() { + [ "$ALWAYS_EXCEPT$EXCEPT$SANITYN_EXCEPT" ] && \ + echo "Skipping tests: `echo $ALWAYS_EXCEPT $EXCEPT $SANITYN_EXCEPT`" + + for O in $ONLY; do + eval ONLY_${O}=true + done + for E in $EXCEPT $ALWAYS_EXCEPT $SANITY_EXCEPT; do + eval EXCEPT_${E}=true + done +} + +_basetest() { + echo $* +} + +basetest() { + IFS=abcdefghijklmnopqrstuvwxyz _basetest $1 } build_test_filter() { @@ -143,11 +174,23 @@ pass() { echo PASS $@ } -export MOUNT1=`mount| awk '/ lustre/ { print $3 }'| head -n 1` -export MOUNT2=`mount| awk '/ lustre/ { print $3 }'| tail -n 1` +mounted_lustre_filesystems() { + awk '($3 ~ "lustre" && $1 ~ ":") { print $2 }' /proc/mounts +} +MOUNTED="`mounted_lustre_filesystems`" +if [ -z "$MOUNTED" ]; then + $MCFORMAT + $MCSETUP + mount_client $MOUNT2 + MOUNTED="`mounted_lustre_filesystems`" + [ -z "$MOUNTED" ] && error "NAME=$NAME not mounted" + 
I_MOUNTED=yes +fi +export MOUNT1=`mounted_lustre_filesystems | head -n 1` [ -z "$MOUNT1" ] && error "NAME=$NAME not mounted once" +export MOUNT2=`mounted_lustre_filesystems | tail -n 1` [ "$MOUNT1" = "$MOUNT2" ] && error "NAME=$NAME not mounted twice" -[ `mount| awk '/ lustre/ { print $3 }'| wc -l` -ne 2 ] && \ +[ `mounted_lustre_filesystems | wc -l` -ne 2 ] && \ error "NAME=$NAME mounted more than twice" export DIR1=${DIR1:-$MOUNT1} @@ -387,17 +430,17 @@ test_16() { run_test 16 "2500 iterations of dual-mount fsx =================" cancel_lru_locks() { - for d in /proc/fs/lustre/ldlm/namespaces/$1*; do + for d in /proc/fs/lustre/ldlm/namespaces/*-$1-*; do echo clear > $d/lru_size done - grep "[0-9]" /proc/fs/lustre/ldlm/namespaces/$1*/lock_unused_count /dev/null + grep "[0-9]" /proc/fs/lustre/ldlm/namespaces/*-$1-*/lock_unused_count /dev/null } test_17() { # bug 3513, 3667 [ ! -d /proc/fs/lustre/ost ] && echo "skipping OST-only test" && return cp /etc/termcap $DIR1/f17 - cancel_lru_locks OSC > /dev/null + cancel_lru_locks osc > /dev/null #define OBD_FAIL_ONCE|OBD_FAIL_LDLM_CREATE_RESOURCE 0x30a echo 0x8000030a > /proc/sys/lustre/fail_loc ls -ls $DIR1/f17 | awk '{ print $1,$6 }' > $DIR1/f17-1 & \ @@ -417,7 +460,7 @@ test_19() { # bug3811 [ -d /proc/fs/lustre/obdfilter ] || return 0 MAX=`cat /proc/fs/lustre/obdfilter/*/readcache_max_filesize | head -n 1` - for O in /proc/fs/lustre/obdfilter/OST*; do + for O in /proc/fs/lustre/obdfilter/*OST*; do echo 4096 > $O/readcache_max_filesize done dd if=/dev/urandom of=$TMP/f19b bs=512k count=32 @@ -425,7 +468,7 @@ test_19() { # bug3811 cp $TMP/f19b $DIR1/f19b for i in `seq 1 20`; do [ $((i % 5)) -eq 0 ] && log "test_18 loop $i" - cancel_lru_locks OSC > /dev/null + cancel_lru_locks osc > /dev/null cksum $DIR1/f19b | cut -d" " -f 1,2 > $TMP/sum1 & \ cksum $DIR2/f19b | cut -d" " -f 1,2 > $TMP/sum2 wait @@ -434,7 +477,7 @@ test_19() { # bug3811 [ "`cat $TMP/sum2`" = "$SUM" ] || \ error "$DIR2/f19b `cat $TMP/sum2` != $SUM" done - 
for O in /proc/fs/lustre/obdfilter/OST*; do + for O in /proc/fs/lustre/obdfilter/*OST*; do echo $MAX > $O/readcache_max_filesize done rm $DIR1/f19b @@ -443,12 +486,12 @@ test_19() { # bug3811 test_20() { mkdir $DIR1/d20 - cancel_lru_locks OSC + cancel_lru_locks osc CNT=$((`cat /proc/fs/lustre/llite/fs0/dump_page_cache | wc -l`)) multiop $DIR1/f20 Ow8190c multiop $DIR2/f20 Oz8194w8190c multiop $DIR1/f20 Oz0r8190c - cancel_lru_locks OSC + cancel_lru_locks osc CNTD=$((`cat /proc/fs/lustre/llite/fs0/dump_page_cache | wc -l` - $CNT)) [ $CNTD -gt 0 ] && \ error $CNTD" page left in cache after lock cancel" || true @@ -497,7 +540,7 @@ test_23() { # Bug 5972 echo "others should see updated atime while another read" > $DIR1/f23 # clear the lock(mode: LCK_PW) gotten from creating operation - cancel_lru_locks OSC + cancel_lru_locks osc time1=`date +%s` sleep 2 @@ -560,6 +603,10 @@ run_test 25 "change ACL on one mountpoint be seen on another ===" log "cleanup: ======================================================" rm -rf $DIR1/[df][0-9]* $DIR1/lnk || true +if [ "$I_MOUNTED" = "yes" ]; then + cleanup +fi echo '=========================== finished ===============================' [ -f "$SANITYLOG" ] && cat $SANITYLOG && exit 1 || true + diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index 3b10909..8ca56d9 100644 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -2,6 +2,7 @@ # vim:expandtab:shiftwidth=4:softtabstop=4:tabstop=4: set -e +#set -vx export REFORMAT="" export VERBOSE=false @@ -36,12 +37,11 @@ init_test_env() { export TMP=${TMP:-$ROOT/tmp} export PATH=:$PATH:$LUSTRE/utils:$LUSTRE/tests - export LLMOUNT=${LLMOUNT:-"llmount"} - export LCONF=${LCONF:-"lconf"} - export LMC=${LMC:-"lmc"} export LCTL=${LCTL:-"$LUSTRE/utils/lctl"} + export MKFS=${MKFS:-"$LUSTRE/utils/mkfs.lustre"} export CHECKSTAT="${CHECKSTAT:-checkstat} " export FSYTPE=${FSTYPE:-"ext3"} + export LPROC=/proc/fs/lustre if [ "$ACCEPTOR_PORT" ]; then 
export PORT_OPT="--port $ACCEPTOR_PORT" @@ -70,55 +70,87 @@ init_test_env() { # echo "CONFIG=`canonical_path $CONFIG`" > $LUSTRE/tests/CONFIG } +unload_modules() { + lsmod | grep lnet > /dev/null && $LCTL dk $TMP/debug + local MODULES=`$LCTL modules | awk '{ print $2 }'` + rmmod $MODULES >/dev/null 2>&1 + # do it again, in case we tried to unload ksocklnd too early + lsmod | grep lnet > /dev/null && rmmod $MODULES >/dev/null 2>&1 + lsmod | grep lnet && echo "modules still loaded" && return 1 + + LEAK_LUSTRE=`dmesg | tail -n 30 | grep "obd mem.*leaked"` + LEAK_PORTALS=`dmesg | tail -n 20 | grep "Portals memory leaked"` + if [ "$LEAK_LUSTRE" -o "$LEAK_PORTALS" ]; then + echo "$LEAK_LUSTRE" 1>&2 + echo "$LEAK_PORTALS" 1>&2 + mv $TMP/debug $TMP/debug-leak.`date +%s` + echo "Memory leaks detected" + return 254 + fi +} + # Facet functions +# start facet device options start() { facet=$1 shift - active=`facet_active $facet` - do_facet $facet $LCONF --select ${facet}_svc=${active}_facet \ - --node ${active}_facet --ptldebug $PTLDEBUG --subsystem $SUBSYSTEM \ - $@ $XMLCONFIG + device=$1 + shift + echo "Starting ${facet}: $@ ${device} /mnt/${facet}" + do_facet ${facet} mkdir -p /mnt/${facet} + do_facet ${facet} mount -t lustre $@ ${device} /mnt/${facet} + #do_facet $facet $LCONF --select ${facet}_svc=${active}_facet \ + # --node ${active}_facet --ptldebug $PTLDEBUG --subsystem $SUBSYSTEM \ + # $@ $XMLCONFIG RC=${PIPESTATUS[0]} if [ $RC -ne 0 ]; then - # maybe acceptor error, dump tcp port usage - netstat -tpn + echo mount -t lustre $@ ${device} /mnt/${facet} + echo Start of ${device} on ${facet} failed ${RC} + else + do_facet ${facet} sync + # need the awk in case running with -v + label=`do_facet ${facet} "e2label ${device}" | awk '{print $(NF)}'` + eval export ${facet}_svc=${label} + eval export ${facet}_dev=${device} + eval export ${facet}_opt=\"$@\" + echo Started ${label} fi return $RC } stop() { facet=$1 - active=`facet_active $facet` shift - do_facet $facet $LCONF 
--select ${facet}_svc=${active}_facet \ - --node ${active}_facet --ptldebug $PTLDEBUG --subsystem $SUBSYSTEM \ - $@ --cleanup $XMLCONFIG + # the following line fails with VERBOSE set + local running=`do_facet ${facet} "grep -c /mnt/${facet}' ' /proc/mounts" | awk '{print $(NF)}'` + if [ $running -ne 0 ]; then + echo "Stopping /mnt/${facet} (opts:$@)" + do_facet ${facet} umount -d $@ /mnt/${facet} + fi + #do_facet ${facet} umount -d $@ /mnt/${facet} >> /dev/null 2>&1 || : + [ -e /proc/fs/lustre ] && grep "ST " /proc/fs/lustre/devices && echo "service didn't stop" && exit 1 + return 0 } zconf_mount() { local OPTIONS - client=$1 - mnt=$2 - - do_node $client mkdir $mnt 2> /dev/null || : - + local client=$1 + local mnt=$2 # Only supply -o to mount if we have options if [ -n "$MOUNTOPT" ]; then OPTIONS="-o $MOUNTOPT" fi - - if [ -x /sbin/mount.lustre ] ; then - do_node $client mount -t lustre $OPTIONS \ - `facet_nid mds`:/mds_svc/client_facet $mnt || return 1 - do_node $client "sysctl -w lnet.debug=$PTLDEBUG; sysctl -w lnet.subsystem_debug=${SUBSYSTEM# }" - else - # this is so cheating - do_node $client $LCONF --nosetup --node client_facet $XMLCONFIG > \ - /dev/null || return 2 - do_node $client $LLMOUNT $OPTIONS \ - `facet_nid mds`:/mds_svc/client_facet $mnt || return 4 + local device=`facet_nid mgs`:/$FSNAME + if [ -z "$mnt" -o -z "$FSNAME" ]; then + echo Bad zconf mount command: opt=$OPTIONS dev=$device mnt=$mnt + exit 1 fi + echo "Starting client: $OPTIONS $device $mnt" + do_node $client mkdir -p $mnt + do_node $client mount -t lustre $OPTIONS $device $mnt || return 1 + + do_node $client "sysctl -w lnet.debug=$PTLDEBUG; sysctl -w lnet.subsystem_debug=${SUBSYSTEM# }" [ -d /r ] && $LCTL modules > /r/tmp/ogdb-`hostname` return 0 } @@ -127,8 +159,11 @@ zconf_umount() { client=$1 mnt=$2 [ "$3" ] && force=-f - do_node $client umount $force $mnt || : - do_node $client $LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null || : + local running=`do_node $client 
"grep -c $mnt' ' /proc/mounts" | awk '{print $(NF)}'` + if [ $running -ne 0 ]; then + echo "Stopping client $mnt (opts:$force)" + do_node $client umount $force $mnt + fi } shutdown_facet() { @@ -137,7 +172,7 @@ shutdown_facet() { $POWER_DOWN `facet_active_host $facet` sleep 2 elif [ "$FAILURE_MODE" = SOFT ]; then - stop $facet --force --failover --nomod + stop $facet fi } @@ -182,7 +217,7 @@ client_reconnect() { facet_failover() { facet=$1 - echo "Failing $facet node `facet_active_host $facet`" + echo "Failing $facet on node `facet_active_host $facet`" shutdown_facet $facet reboot_facet $facet client_df & @@ -192,52 +227,64 @@ facet_failover() { TO=`facet_active_host $facet` echo "Failover $facet to $TO" wait_for $facet - start $facet + local dev=${facet}_dev + local opt=${facet}_opt + start $facet ${!dev} ${!opt} +} + +obd_name() { + local facet=$1 } replay_barrier() { local facet=$1 do_facet $facet sync df $MOUNT - do_facet $facet $LCTL --device %${facet}_svc readonly - do_facet $facet $LCTL --device %${facet}_svc notransno - do_facet $facet $LCTL mark "$facet REPLAY BARRIER" - $LCTL mark "local REPLAY BARRIER" + local svc=${facet}_svc + do_facet $facet $LCTL --device %${!svc} readonly + do_facet $facet $LCTL --device %${!svc} notransno + do_facet $facet $LCTL mark "$facet REPLAY BARRIER on ${!svc}" + $LCTL mark "local REPLAY BARRIER on ${!svc}" } replay_barrier_nodf() { local facet=$1 do_facet $facet sync - do_facet $facet $LCTL --device %${facet}_svc readonly - do_facet $facet $LCTL --device %${facet}_svc notransno - do_facet $facet $LCTL mark "$facet REPLAY BARRIER" - $LCTL mark "local REPLAY BARRIER" + local svc=${facet}_svc + echo Replay barrier on ${!svc} + do_facet $facet $LCTL --device %${!svc} readonly + do_facet $facet $LCTL --device %${!svc} notransno + do_facet $facet $LCTL mark "$facet REPLAY BARRIER on ${!svc}" + $LCTL mark "local REPLAY BARRIER on ${!svc}" } mds_evict_client() { UUID=`cat /proc/fs/lustre/mdc/*_MNT_*/uuid` - do_facet mds "echo $UUID 
> /proc/fs/lustre/mds/mds_svc/evict_client" + do_facet mds "echo $UUID > /proc/fs/lustre/mds/${mds_svc}/evict_client" } fail() { - local facet=$1 - facet_failover $facet + facet_failover $* df $MOUNT || error "post-failover df: $?" } fail_abort() { local facet=$1 - stop $facet --force --failover --nomod + stop $facet change_active $facet - start $facet - do_facet $facet lctl --device %${facet}_svc abort_recovery + local svc=${facet}_svc + local dev=${facet}_dev + local opt=${facet}_opt + start $facet ${!dev} ${!opt} + do_facet $facet lctl --device %${!svc} abort_recovery df $MOUNT || echo "first df failed: $?" sleep 1 df $MOUNT || error "post-failover df: $?" } do_lmc() { - $LMC -m ${XMLCONFIG} $@ + echo There is no lmc. This is mountconf, baby. + exit 1 } h2gm () { @@ -353,69 +400,13 @@ do_facet() { do_node $HOST $@ } -add_facet() { +add() { local facet=$1 shift - echo "add facet $facet: `facet_host $facet`" - do_lmc --add node --node ${facet}_facet $@ --timeout $TIMEOUT \ - --lustre_upcall $UPCALL --ptldebug $PTLDEBUG --subsystem $SUBSYSTEM - do_lmc --add net --node ${facet}_facet --nid `facet_nid $facet` \ - --nettype lnet $PORT_OPT -} - -add_mds() { - local MOUNT_OPTS - local facet=$1 - shift - rm -f ${facet}active - add_facet $facet - [ "x$MDSOPT" != "x" ] && MOUNT_OPTS="--mountfsoptions $MDSOPT" - do_lmc --add mds --node ${facet}_facet --mds ${facet}_svc \ - --fstype $FSTYPE $* $MOUNT_OPTS -} - -add_mdsfailover() { - local MOUNT_OPTS - local facet=$1 - shift - add_facet ${facet}failover --lustre_upcall $UPCALL - [ "x$MDSOPT" != "x" ] && MOUNT_OPTS="--mountfsoptions $MDSOPT" - do_lmc --add mds --node ${facet}failover_facet --mds ${facet}_svc \ - --fstype $FSTYPE $* $MOUNT_OPTS -} - -add_ost() { - facet=$1 - shift + # failsafe + stop ${facet} -f rm -f ${facet}active - add_facet $facet - do_lmc --add ost --node ${facet}_facet --ost ${facet}_svc \ - --fstype $FSTYPE $* $OSTOPT -} - -add_ostfailover() { - facet=$1 - shift - add_facet ${facet}failover - do_lmc 
--add ost --failover --node ${facet}failover_facet \ - --ost ${facet}_svc --fstype $FSTYPE $* $OSTOPT -} - -add_lov() { - lov=$1 - mds_facet=$2 - shift; shift - do_lmc --add lov --mds ${mds_facet}_svc --lov $lov $* $LOVOPT -} - -add_client() { - local MOUNT_OPTS - local facet=$1 - mds=$2 - shift; shift - [ "x$CLIENTOPT" != "x" ] && MOUNT_OPTS="--clientoptions $CLIENTOPT" - add_facet $facet --lustre_upcall $UPCALL - do_lmc --add mtpt --node ${facet}_facet --mds ${mds}_svc $* $MOUNT_OPTS + $MKFS $* } @@ -622,6 +613,7 @@ equals_msg() { log() { echo "$*" + lsmod | grep lnet > /dev/null || modprobe lnet $LCTL mark "$*" 2> /dev/null || true } diff --git a/lustre/utils/.cvsignore b/lustre/utils/.cvsignore index dc22291..d730e27 100644 --- a/lustre/utils/.cvsignore +++ b/lustre/utils/.cvsignore @@ -15,9 +15,15 @@ obdbarrier lload wirecheck lfs +mkfs.lustre +mkfs_lustre +mount.lustre +mount_lustre +tunefs.lustre +tunefs_lustre +llog_reader llmount l_getgroups -mount.lustre wiretest llog_reader .*.cmd diff --git a/lustre/utils/Makefile.am b/lustre/utils/Makefile.am index efc7547..eb43617 100644 --- a/lustre/utils/Makefile.am +++ b/lustre/utils/Makefile.am @@ -8,26 +8,30 @@ AM_LDFLAGS := -L$(top_builddir)/lnet/utils LIBPTLCTL := $(top_builddir)/lnet/utils/libptlctl.a -sbin_scripts = lconf lmc llanalyze llstat.pl llobdstat.pl lactive \ - load_ldap.sh lrun +sbin_scripts = llanalyze llstat.pl llobdstat.pl lactive lrun bin_scripts = lfind lstripe if UTILS -rootsbin_SCRIPTS = mount.lustre -sbin_PROGRAMS = lctl obdio obdbarrier lload wirecheck wiretest llmount \ - l_getgroups +# mount only finds helpers in /sbin +rootsbin_PROGRAMS = mount.lustre +sbin_PROGRAMS = lctl obdio obdbarrier lload wirecheck wiretest \ + mount_lustre mkfs_lustre mkfs.lustre \ + tunefs_lustre tunefs.lustre l_getgroups bin_PROGRAMS = lfs llog_reader -lib_LIBRARIES = liblustreapi.a +lib_LIBRARIES = liblustreapi.a sbin_SCRIPTS = $(sbin_scripts) bin_SCRIPTS = $(bin_scripts) endif # UTILS +lctl_SOURCES = parser.c 
obd.c lustre_cfg.c lctl.c parser.h obdctl.h platform.h lctl_LDADD := $(LIBREADLINE) $(LIBPTLCTL) lctl_DEPENDENCIES := $(LIBPTLCTL) +lfs_SOURCES = lfs.c parser.c obd.c lfs_LDADD := $(LIBREADLINE) liblustreapi.a $(LIBPTLCTL) lfs_DEPENDENCIES := $(LIBPTLCTL) liblustreapi.a +lload_SOURCES = lload.c lload_LDADD := $(LIBREADLINE) $(LIBPTLCTL) lload_DEPENDENCIES := $(LIBPTLCTL) @@ -35,22 +39,29 @@ liblustreapi_a_SOURCES = liblustreapi.c wirecheck_SOURCES = wirecheck.c wirecheck_CPPFLAGS = -DCC="\"$(CC)\"" + wiretest_SOURCES = wiretest.c -lctl_SOURCES = parser.c obd.c lustre_cfg.c lctl.c parser.h obdctl.h platform.h -lload_SOURCES = lload.c obdio_SOURCES = obdio.c obdiolib.c obdiolib.h obdbarrier_SOURCES = obdbarrier.c obdiolib.c obdiolib.h -lfs_SOURCES = lfs.c parser.c obd.c -llog_reader_LDADD := $(LIBREADLINE) $(LIBPTLCTL) -llog_reader_DEPENDENCIES := $(LIBPTLCTL) llog_reader_SOURCES = llog_reader.c +llog_reader_LDADD := $(LIBPTLCTL) +llog_reader_DEPENDENCIES := $(LIBPTLCTL) + +mount_lustre_SOURCES = mount_lustre.c +mount_lustre_LDADD := $(LIBPTLCTL) +mount_lustre_DEPENDENCIES := $(LIBPTLCTL) -llmount_SOURCES = llmount.c -llmount_CFLAGS = $(LLMOUNT_GM_CFLAGS) -llmount_LDADD = $(LIBREADLINE) $(LIBPTLCTL) $(LLMOUNT_GM_LDADD) -llmount_DEPENDENCIES := $(LIBPTLCTL) +mkfs_lustre_SOURCES = mkfs_lustre.c +mkfs_lustre_CPPFLAGS = -UTUNEFS $(AM_CPPFLAGS) +mkfs_lustre_LDADD := $(LIBPTLCTL) +mkfs_lustre_DEPENDENCIES := $(LIBPTLCTL) + +tunefs_lustre_SOURCES = $(mkfs_lustre_SOURCES) +tunefs_lustre_CPPFLAGS = -DTUNEFS $(AM_CPPFLAGS) +tunefs_lustre_LDADD := $(mkfs_lustre_LDADD) +tunefs_lustre_DEPENDENCIES := $(mkfs_lustre_DEPENDENCIES) EXTRA_DIST = $(bin_scripts) $(sbin_scripts) @@ -59,5 +70,12 @@ newwiretest: wirehdr.c wirecheck cp wirehdr.c wiretest.c ./wirecheck >> wiretest.c -mount.lustre$(EXEEXT): llmount +# Apparently I can't use .'s in automake names +mount.lustre$(EXEEXT): mount_lustre + cp $< $@ + +mkfs.lustre$(EXEEXT): mkfs_lustre + cp $< $@ + +tunefs.lustre$(EXEEXT): 
tunefs_lustre cp $< $@ diff --git a/lustre/utils/cluster_scripts/1uml.csv b/lustre/utils/cluster_scripts/1uml.csv new file mode 100644 index 0000000..d6f23a4 --- /dev/null +++ b/lustre/utils/cluster_scripts/1uml.csv @@ -0,0 +1,5 @@ +# combo mdt/mgs +uml1,options lnet networks=tcp,/r/tmp/mdt,mdt|mgs,,,,--device-size=10240 +# ost0 +uml1,options lnet networks=tcp,/r/tmp/ost0,ost,,"uml1@tcp0",,--device-size=10240 + diff --git a/lustre/utils/cluster_scripts/cluster_config.sh b/lustre/utils/cluster_scripts/cluster_config.sh new file mode 100755 index 0000000..818d8846b --- /dev/null +++ b/lustre/utils/cluster_scripts/cluster_config.sh @@ -0,0 +1,705 @@ +#!/bin/bash +# +# cluster_config.sh - configure multiple lustre servers from a csv file +# +# This script is used to parse each line of a spreadsheet (csv file) and +# execute remote pdsh commands to format (mkfs.lustre) every Lustre target +# that will be part of the Lustre cluster. +# +# In addition, it can also verify the network connectivity and hostnames in +# the cluster and produce High-Availability software configurations for +# Heartbeat or CluManager +# +################################################################################ + +# Usage +usage() { + cat >&2 < + + -t HAtype produce High-Availability software configurations + + The argument following -t is used to indicate the High- + Availability software type. The HA software types which + are currently supported are: hbv1 (Heartbeat v1), hbv2 + (Heartbeat v2) and clumanager (CluManager). + -n don't verify network connectivity and hostnames in the + cluster + -f force-format the Lustre targets using --reformat option + -h help and examples + -v verbose mode + csv file a spreadsheet that contains configuration parameters + (separated by commas) for each target in a Lustre cl- + uster +EOF + exit 1 +} + +# Samples +sample() { + cat >&2 <&2 $"`basename $0`: Invalid HA software type" \ + "- ${HATYPE_OPT}!" 
+ usage + fi + ;; + n) + VERIFY_CONNECT=$"no" + ;; + f) + REFORMAT_OPTION=$"--reformat " + ;; + h) + sample + ;; + v) + VERBOSE_OPT=$" -v" + ;; + ?) + usage + esac +done + +# Toss out the parameters we've already processed +shift `expr $OPTIND - 1` + +# Here we expect the csv file +if [ $# -eq 0 ]; then + echo >&2 $"`basename $0`: Missing csv file!" + usage +fi + +# Output verbose informations +verbose_output() { + if [ -n "${VERBOSE_OPT}" ]; then + echo "`basename $0`: $*" + fi + return 0 +} + +# Check the csv file +check_file() { + # Check argument + if [ $# -eq 0 ]; then + echo >&2 $"`basename $0`: check_file() error: Lack argument"\ + "for function check_file()!" + return 1 + fi + + CSV_FILE=$1 + if [ ! -s ${CSV_FILE} ]; then + echo >&2 $"`basename $0`: check_file() error: ${CSV_FILE}"\ + "does not exist or is empty!" + return 1 + fi + + return 0 +} + +# Parse a line in the csv file +parse_line() { + # Check argument + if [ $# -eq 0 ]; then + echo >&2 $"`basename $0`: parse_line() error: Lack argument"\ + "for function parse_line()!" 
+ return 1 + fi + + declare -i i=0 + declare -i length=0 + declare -i idx=0 + declare -i s_quote_flag=0 + declare -i d_quote_flag=0 + local TMP_LETTER LINE + + LINE=$* + + # Initialize the CONFIG_ITEM array + for ((i = 0; i < ${#CONFIG_ITEM[@]}; i++)); do + CONFIG_ITEM[i]=$"" + done + + # Get the length of the line + length=${#LINE} + + i=0 + while [ ${idx} -lt ${length} ]; do + # Get a letter from the line + TMP_LETTER=${LINE:${idx}:1} + + case "${TMP_LETTER}" in + ",") + if [ ${s_quote_flag} -eq 1 ] || [ ${d_quote_flag} -eq 1 ]; then + CONFIG_ITEM[i]=${CONFIG_ITEM[i]}${TMP_LETTER} + else + i=$i+1 + fi + idx=${idx}+1 + continue + ;; + "'") + if [ ${s_quote_flag} -eq 0 ]; then + s_quote_flag=1 + else + s_quote_flag=0 + fi + ;; + "\"") + if [ ${d_quote_flag} -eq 0 ]; then + d_quote_flag=1 + else + d_quote_flag=0 + fi + + if [ ${i} -eq 1 ]; then + CONFIG_ITEM[i]=${CONFIG_ITEM[i]}$"\\"${TMP_LETTER} + idx=${idx}+1 + continue + fi + ;; + " ") + idx=${idx}+1 + continue + ;; + *) + ;; + esac + CONFIG_ITEM[i]=${CONFIG_ITEM[i]}${TMP_LETTER} + idx=${idx}+1 + done + return 0 +} + +# Check the elements required for OSTs, MDTs and MGS +# +# When formatting an OST, the following elements: hostname, module_opts, +# device name, device type and mgs nids, cannot have null value. +# +# When formatting an MDT or MGS, the following elements: hostname, +# module_opts, device name and device type, cannot have null value. +check_element() { + # Check hostname, module_opts, device name and device type + if [ -z "${HOST_NAME}" ]||[ -z "${MODULE_OPTS}" ]||[ -z "${DEVICE_NAME}" ]\ + ||[ -z "${DEVICE_TYPE}" ]; then + echo >&2 $"`basename $0`: check_element() error: Some required"\ + "element has null value! Check hostname, module_opts,"\ + "device name and device type!" + return 1 + fi + + # Check mgs nids + if [ "${DEVICE_TYPE}" = "ost" ]&&[ -z "${MGS_NIDS}" ]; then + echo >&2 $"`basename $0`: check_element() error: OST's mgs nids"\ + "element has null value!" 
+        return 1
+    fi
+
+    return 0
+}
+
+# check_ha_element
+#
+# Check the elements required for HA configuration.
+# Nothing is checked when HATYPE_OPT is empty.  Otherwise SRV_IPADDRS must
+# be set, and HB_CHANNELS must be set unless HATYPE_OPT equals
+# HATYPE_CLUMGR.
+# Returns 0 when the checks pass, 1 (with a message on stderr) otherwise.
+check_ha_element() {
+    if [ -z "${HATYPE_OPT}" ]; then
+        return 0
+    fi
+
+    # Check service IP element
+    if [ -z "${SRV_IPADDRS}" ]; then
+        echo >&2 $"`basename $0`: check_ha_element() error: Service IP"\
+                  "element has null value!"
+        return 1
+    fi
+
+    # Check heartbeat channel element
+    if [ "${HATYPE_OPT}" != "${HATYPE_CLUMGR}" -a -z "${HB_CHANNELS}" ]
+    then
+        echo >&2 $"`basename $0`: check_ha_element() error: Heartbeat"\
+                  "channel element has null value!"
+        return 1
+    fi
+
+    return 0
+}
+
+# Check the number of MGS.
+# There should be no more than one MGS specified in the entire csv file.
+#
+# An "explicit" MGS is a line whose DEVICE_TYPE contains "mgs"; an
+# "implicit" MGS is an "mdt" line with empty MGS_NIDS.  The host name of
+# the first one of each kind is remembered in the globals EXP_MGS and
+# IMP_MGS, which persist across csv lines.
+# Returns 0 when the current line is acceptable, 1 (with a message on
+# stderr) otherwise.
+check_mgs() {
+    # Check the number of explicit MGS
+    # (the expansion differs from the value only if it contains "mgs")
+    if [ "${DEVICE_TYPE#*mgs*}" != "${DEVICE_TYPE}" ]; then
+        # Same host already recorded as the explicit MGS
+        if [ "${EXP_MGS}" = "${HOST_NAME}" ]; then
+            echo >&2 $"`basename $0`: check_mgs() error: More than"\
+                      "one explicit MGS in the csv file!"
+            return 1
+        fi
+
+        if [ -z "${EXP_MGS}" ]; then
+            EXP_MGS=${HOST_NAME}
+        fi
+
+        # A different host was recorded earlier: choose the message by
+        # whether that host appears in this line's FAILOVERS
+        if [ "${EXP_MGS}" != "${HOST_NAME}" ]; then
+            if [ "${FAILOVERS#*$EXP_MGS*}" = "${FAILOVERS}" ]; then
+                echo >&2 $"`basename $0`: check_mgs() error:"\
+                          "More than one explicit MGS in the"\
+                          "csv file!"
+            else
+                echo >&2 $"`basename $0`: check_mgs() error:"\
+                          "There should not be two entries for"\
+                          "a server and its failover partner"\
+                          "in the csv file!"
+            fi
+            return 1
+        fi
+    fi
+
+    # Check the number of implicit MGS
+    if [ "${DEVICE_TYPE}" = "mdt" ]&&[ -z "${MGS_NIDS}" ]; then
+        # Same host already recorded as the implicit MGS
+        if [ "${IMP_MGS}" = "${HOST_NAME}" ]; then
+            echo >&2 $"`basename $0`: check_mgs() error: More than"\
+                      "one implicit MGS in the csv file!"
+            return 1
+        fi
+
+        if [ -z "${IMP_MGS}" ]; then
+            IMP_MGS=${HOST_NAME}
+        fi
+
+        if [ "${IMP_MGS}" != "${HOST_NAME}" ]; then
+            if [ "${FAILOVERS#*$IMP_MGS*}" = "${FAILOVERS}" ]; then
+                echo >&2 $"`basename $0`: check_mgs() error:"\
+                          "More than one implicit MGS in the"\
+                          "csv file!"
+            else
+                echo >&2 $"`basename $0`: check_mgs() error:"\
+                          "There should not be two entries for"\
+                          "a server and its failover partner"\
+                          "in the csv file!"
+            fi
+            return 1
+        fi
+    fi
+
+    # An explicit and an implicit MGS together are also one too many
+    if [ -n "${EXP_MGS}" -a -n "${IMP_MGS}" ]; then
+        echo >&2 $"`basename $0`: check_mgs() error: More than one"\
+                  "MGS in the csv file!"
+        return 1
+    fi
+
+    return 0
+}
+
+# construct_mkfs_cmdline
+#
+# Construct the command line of mkfs.lustre in the global MKFS_CMD from
+# the per-line globals (DEVICE_TYPE, FS_NAME, MGS_NIDS, INDEX,
+# FORMAT_OPTIONS, MKFS_OPTIONS, MOUNT_OPTIONS, FAILOVERS, DEVICE_NAME)
+# plus CMD_PATH and REFORMAT_OPTION.
+# Side effect: one leading and one trailing double quote are stripped
+# from MGS_NIDS, FORMAT_OPTIONS, MKFS_OPTIONS, MOUNT_OPTIONS and
+# FAILOVERS, and the stripped values are stored back in those globals.
+# Returns 0 on success, 1 for an unknown device type.
+construct_mkfs_cmdline() {
+    MKFS_CMD=${CMD_PATH}$"mkfs.lustre "
+    MKFS_CMD=${MKFS_CMD}${REFORMAT_OPTION}
+
+    # The patterns are quoted, so "mdt|mgs" matches that literal string
+    case "${DEVICE_TYPE}" in
+    "ost")
+        MKFS_CMD=${MKFS_CMD}$"--ost "
+        ;;
+    "mdt")
+        MKFS_CMD=${MKFS_CMD}$"--mdt "
+        ;;
+    "mgs")
+        MKFS_CMD=${MKFS_CMD}$"--mgs "
+        ;;
+    "mdt|mgs")
+        MKFS_CMD=${MKFS_CMD}$"--mdt --mgs "
+        ;;
+    "mgs|mdt")
+        MKFS_CMD=${MKFS_CMD}$"--mdt --mgs "
+        ;;
+    *)
+        echo >&2 $"`basename $0`: construct_mkfs_cmdline() error:"\
+                  "Invalid device type - \"${DEVICE_TYPE}\""
+        return 1
+        ;;
+    esac
+
+    if [ -n "${FS_NAME}" ]; then
+        MKFS_CMD=${MKFS_CMD}$"--fsname="${FS_NAME}$" "
+    fi
+
+    if [ -n "${MGS_NIDS}" ]; then
+        MGS_NIDS=`echo "${MGS_NIDS}" | sed 's/^"//' | sed 's/"$//'`
+        MKFS_CMD=${MKFS_CMD}$"--mgsnode="${MGS_NIDS}$" "
+    fi
+
+    if [ -n "${INDEX}" ]; then
+        MKFS_CMD=${MKFS_CMD}$"--index="${INDEX}$" "
+    fi
+
+    # FORMAT_OPTIONS is appended as-is (after quote stripping)
+    if [ -n "${FORMAT_OPTIONS}" ]; then
+        FORMAT_OPTIONS=`echo "${FORMAT_OPTIONS}" | sed 's/^"//' | sed 's/"$//'`
+        MKFS_CMD=${MKFS_CMD}${FORMAT_OPTIONS}$" "
+    fi
+
+    # The value is re-wrapped in double quotes inside the command line
+    if [ -n "${MKFS_OPTIONS}" ]; then
+        MKFS_OPTIONS=`echo "${MKFS_OPTIONS}" | sed 's/^"//' | sed 's/"$//'`
+        MKFS_CMD=${MKFS_CMD}$"--mkfsoptions="$"\""${MKFS_OPTIONS}$"\""$" "
+    fi
+
+    if [ -n "${MOUNT_OPTIONS}" ]; then
+        MOUNT_OPTIONS=`echo "${MOUNT_OPTIONS}" | sed 's/^"//' | sed 's/"$//'`
+        MKFS_CMD=${MKFS_CMD}$"--mountfsoptions="$"\""${MOUNT_OPTIONS}$"\""$" "
+    fi
+
+    if [ -n "${FAILOVERS}" ]; then
+        FAILOVERS=`echo "${FAILOVERS}" | sed 's/^"//' | sed 's/"$//'`
+        MKFS_CMD=${MKFS_CMD}$"--failnode="${FAILOVERS}$" "
+    fi
+
+    # The device name is always the last word of the command line
+    MKFS_CMD=${MKFS_CMD}${DEVICE_NAME}
+    return 0
+}
+
+# Get
all the node names in this failover group +get_nodenames() { + declare -i idx + local failover_nids failover_nid first_nid + + NODE_NAMES[0]=${HOST_NAME} + + failover_nids=`echo ${FAILOVERS}|awk '{split($FAILOVERS, a, ":")}\ + END {for (i in a) print a[i]}'` + + # XXX: Suppose the first nid of one failover node contains the node name + idx=1 + for failover_nid in ${failover_nids} + do + first_nid=`echo ${failover_nid} | awk -F, '{print $1}'` + NODE_NAMES[idx]=${first_nid%@*} + idx=$idx+1 + done + + return 0 +} + +# Produce HA software's configuration files +gen_ha_config() { + local cmd_line + declare -i idx + + if [ -z "${HATYPE_OPT}" ]; then + return 0 + fi + + # Prepare parameters + # Hostnames option + HOSTNAME_OPT=${HOST_NAME} + + if ! get_nodenames; then + return 1 + fi + + for ((idx = 1; idx < ${#NODE_NAMES[@]}; idx++)); do + HOSTNAME_OPT=${HOSTNAME_OPT}$":"${NODE_NAMES[idx]} + done + + # Target device option + TARGET_TYPE=${DEVICE_TYPE} + if [ "${TARGET_TYPE}" = "mdt|mgs" -o "${TARGET_TYPE}" = "mgs|mdt" ] + then + TARGET_TYPE=$"mgs_mdt" + fi + TARGET_OPT=${DEVICE_NAME}:${TARGET_TYPE} + + # Service IP address option + SRVADDR_OPT=${SRV_IPADDRS} + + # Heartbeat channels option + HBCHANNEL_OPT=$"\""${HB_CHANNELS}$"\"" + + # Heartbeat options option + HBOPT_OPT=$"\""${HB_OPTIONS}$"\"" + + # Construct the generation script command line + case "${HATYPE_OPT}" in + "${HATYPE_HBV1}"|"${HATYPE_HBV2}") # Heartbeat + cmd_line=${GEN_HB_CONFIG}$" -r ${HATYPE_OPT} -n ${HOSTNAME_OPT}" + cmd_line=${cmd_line}$" -d ${TARGET_OPT} -c ${HBCHANNEL_OPT}" + cmd_line=${cmd_line}$" -s ${SRVADDR_OPT}"${VERBOSE_OPT} + + if [ -n "${HB_OPTIONS}" ]; then + cmd_line=${cmd_line}$" -o ${HBOPT_OPT}" + fi + ;; + "${HATYPE_CLUMGR}") # CluManager + cmd_line=${GEN_CLUMGR_CONFIG}$" -n ${HOSTNAME_OPT}" + cmd_line=${cmd_line}$" -d ${TARGET_OPT} -s ${SRVADDR_OPT}" + cmd_line=${cmd_line}${VERBOSE_OPT} + + if [ -n "${HBCHANNEL_OPT}" ]; then + cmd_line=${cmd_line}$" -c ${HBCHANNEL_OPT}" + fi + + if [ 
-n "${HB_OPTIONS}" ]; then + cmd_line=${cmd_line}$" -o ${HBOPT_OPT}" + fi + ;; + esac + + # Execute script to generate HA software's configuration files + verbose_output "${cmd_line}" + eval $(echo "${cmd_line}") + if [ $? -ne 0 ]; then + return 1 + fi + + return 0 +} + +# Execute pdsh commands to add lnet options lines to remote nodes' +# modprobe.conf/modules.conf and format(mkfs.lustre) Lustre targets +# +# If -t option exists, then also to produce the HA software's +# configuration files +mass_config() { + # Check argument + if [ $# -eq 0 ]; then + echo >&2 $"`basename $0`: mass_config() error: Lack argument"\ + "for function mass_config()!" + return 1 + fi + + CSV_FILE=$1 + local LINE COMMAND + declare -a PDSH_PID + declare -a PDSH_CMD + declare -i line_num=1 + declare -i pid_num=0 + + while read -r LINE; do + # Get rid of the empty line + if [ -z "`echo ${LINE} | awk '/[[:alnum:]]/{print $0}'`" ]; then + line_num=${line_num}+1 + continue + fi + + # Get rid of the comment line + if [ -z "`echo \"${LINE}\" | egrep -v \"([[:space:]]|^)#\"`" ]; then + line_num=${line_num}+1 + continue + fi + + # Parse the config line into CONFIG_ITEM + if ! parse_line $LINE; then + return 1 + fi + + HOST_NAME=${CONFIG_ITEM[0]} + MODULE_OPTS=${CONFIG_ITEM[1]} + DEVICE_NAME=${CONFIG_ITEM[2]} + DEVICE_TYPE=${CONFIG_ITEM[3]} + FS_NAME=${CONFIG_ITEM[4]} + MGS_NIDS=${CONFIG_ITEM[5]} + INDEX=${CONFIG_ITEM[6]} + FORMAT_OPTIONS=${CONFIG_ITEM[7]} + MKFS_OPTIONS=${CONFIG_ITEM[8]} + MOUNT_OPTIONS=${CONFIG_ITEM[9]} + FAILOVERS=${CONFIG_ITEM[10]} + + HB_CHANNELS=${CONFIG_ITEM[11]} + SRV_IPADDRS=${CONFIG_ITEM[12]} + HB_OPTIONS=${CONFIG_ITEM[13]} + + # Check some required elements for formatting target + if ! check_element; then + echo >&2 $"`basename $0`: check_element() error:"\ + "Occurred on line ${line_num} in ${CSV_FILE}" + return 1 + fi + + # Check the number of MGS + if ! 
check_mgs; then + echo >&2 $"`basename $0`: check_mgs() error:"\ + "Occurred on line ${line_num} in ${CSV_FILE}" + return 1 + fi + + # Construct the command line of mkfs.lustre + if ! construct_mkfs_cmdline; then + echo >&2 $"`basename $0`: construct_mkfs_cmdline() error:"\ + "Occurred on line ${line_num} in ${CSV_FILE}" + return 1 + fi + + # Produce HA software's configuration files + if ! gen_ha_config; then + return 1 + fi + + # Execute pdsh command to add lnet options lines to + # modprobe.conf/modules.conf + COMMAND=$"echo \"${MODULE_OPTS}\"|${MODULE_CONFIG}" + verbose_output "Adding module options to ${HOST_NAME}" + verbose_output ${COMMAND} + ${PDSH} -w ${HOST_NAME} ${COMMAND} >&2 & + PDSH_PID[${pid_num}]=$! + PDSH_CMD[${pid_num}]="${PDSH} -w ${HOST_NAME} ${COMMAND}" + pid_num=${pid_num}+1 + + # Execute pdsh command to format Lustre target + verbose_output "Formatting Lustre target on ${HOST_NAME}..." + verbose_output "Format command line is: ${MKFS_CMD}" + ${PDSH} -w ${HOST_NAME} ${MKFS_CMD} >&2 & + PDSH_PID[${pid_num}]=$! + PDSH_CMD[${pid_num}]="${PDSH} -w ${HOST_NAME} ${MKFS_CMD}" + pid_num=${pid_num}+1 + + line_num=${line_num}+1 + done < ${CSV_FILE} + + # Wait for the exit status of the background pdsh command + verbose_output "Waiting for the return of the pdsh command..." + for ((pid_num = 0; pid_num < ${#PDSH_PID[@]}; pid_num++)); do + wait ${PDSH_PID[${pid_num}]} + if [ $? -ne 0 ]; then + echo >&2 "`basename $0`: mass_config() error:"\ + "Fail to execute \"${PDSH_CMD[${pid_num}]}\"!" + fi + done + + rm -rf ${TMP_DIRS} + return 0 +} + +# Main flow +# Check the csv file +if ! check_file $1; then + exit 1 +fi + +if [ "${VERIFY_CONNECT}" != "no" ]; then +# Check the network connectivity and hostnames + verbose_output "Checking the network connectivity and hostnames..." + if ! ${VERIFY_CLUSTER_NET} ${VERBOSE_OPT} ${CSV_FILE}; then + exit 1 + fi + verbose_output "Check the network connectivity and hostnames OK!" 
+fi + +# Configure the Lustre cluster +verbose_output "******** Lustre cluster configuration START ********" +if ! mass_config ${CSV_FILE}; then + rm -rf ${TMP_DIRS} + exit 1 +fi +verbose_output "******** Lustre cluster configuration END **********" + +exit 0 diff --git a/lustre/utils/cluster_scripts/gen_clumanager_config.sh b/lustre/utils/cluster_scripts/gen_clumanager_config.sh new file mode 100755 index 0000000..8469f7d --- /dev/null +++ b/lustre/utils/cluster_scripts/gen_clumanager_config.sh @@ -0,0 +1,379 @@ +#!/bin/bash +# +# gen_clumanager_config.sh - script for generating the Red Hat's Cluster Manager +# HA software's configuration files +# +################################################################################ + +# Usage +usage() { + cat >&2 < <-d target device> <-s service addresses> + [-c heartbeat channels] [-o heartbeat options] [-v] + + -n hostnames the nodenames of the primary node and its fail- + overs + Multiple nodenames are separated by colon (:) + delimeter. The first one is the nodename of the + primary node, the others are failover nodenames. + -d target device the target device name and type + The name and type are separated by colon (:) + delimeter. The type values are: mgs, mdt, ost or + mgs_mdt. + -s service addresses the IP addresses to failover + Multiple addresses are separated by colon (:) + delimeter. + -c heartbeat channels the methods to send/rcv heartbeats on + The default method is multicast, and multicast_ + ipaddress is "225.0.0.11". 
+ -o heartbeat options a "catchall" for other heartbeat configuration + options + -v verbose mode + +EOF + exit 1 +} + +# Global variables +SCRIPTS_PATH=${CLUSTER_SCRIPTS_PATH:-"./"} +SCRIPT_VERIFY_SRVIP=${SCRIPTS_PATH}$"verify_serviceIP.sh" + +LUSTRE_SRV_SCRIPT=$"/etc/rc.d/init.d/lustre" # service script for lustre + +TMP_DIR=$"/tmp/clumanager/" # temporary directory +CLUMGR_DIR=$"/etc/" # CluManager configuration directory + +CONFIG_CMD=$"redhat-config-cluster-cmd" + +declare -a NODE_NAMES # node names in the failover group +declare -a SRV_IPADDRS # service IP addresses + +# Get and check the positional parameters +while getopts "n:d:s:c:o:v" OPTION; do + case $OPTION in + n) + HOSTNAME_OPT=$OPTARG + HOSTNAME_NUM=`echo ${HOSTNAME_OPT} | awk -F":" '{print NF}'` + if [ ${HOSTNAME_NUM} -lt 2 ]; then + echo >&2 $"`basename $0`: Lack failover nodenames!" + usage + fi + ;; + d) + DEVICE_OPT=$OPTARG + TARGET_DEV=`echo ${DEVICE_OPT} | awk -F":" '{print $1}'` + TARGET_TYPE=`echo ${DEVICE_OPT} | awk -F":" '{print $2}'` + if [ -z "${TARGET_TYPE}" ]; then + echo >&2 $"`basename $0`: Lack target device type!" + usage + fi + if [ "${TARGET_TYPE}" != "mgs" ]&&[ "${TARGET_TYPE}" != "mdt" ]\ + &&[ "${TARGET_TYPE}" != "ost" ]&&[ "${TARGET_TYPE}" != "mgs_mdt" ] + then + echo >&2 $"`basename $0`: Invalid target device type" \ + "- ${TARGET_TYPE}!" + usage + fi + ;; + s) + SRVADDR_OPT=$OPTARG + ;; + c) + HBCHANNEL_OPT=$OPTARG + HBCHANNEL_OPT=`echo "${HBCHANNEL_OPT}" | sed 's/^"//' \ + | sed 's/"$//'` + if [ -n "${HBCHANNEL_OPT}" ] \ + && [ "${HBCHANNEL_OPT}" = "${HBCHANNEL_OPT#*broadcast*}" ] \ + && [ "${HBCHANNEL_OPT}" = "${HBCHANNEL_OPT#*multicast*}" ]; then + echo >&2 $"`basename $0`: Invalid Heartbeat channel" \ + "- ${HBCHANNEL_OPT}!" + usage + fi + ;; + o) + HBOPT_OPT=$OPTARG + HBOPT_OPT=`echo "${HBOPT_OPT}" | sed 's/^"//' | sed 's/"$//'` + ;; + v) + VERBOSE_OPT=$"yes" + ;; + ?) 
+        usage
+    esac
+done
+
+# Check the required parameters
+if [ -z "${HOSTNAME_OPT}" ]; then
+    echo >&2 $"`basename $0`: Lack -n option!"
+    usage
+fi
+
+if [ -z "${DEVICE_OPT}" ]; then
+    echo >&2 $"`basename $0`: Lack -d option!"
+    usage
+fi
+
+if [ -z "${SRVADDR_OPT}" ]; then
+    echo >&2 $"`basename $0`: Lack -s option!"
+    usage
+fi
+
+# verbose_output message...
+#
+# Print the message, prefixed with the script name, when -v was given.
+# Always returns 0.
+verbose_output() {
+    if [ "${VERBOSE_OPT}" = "yes" ]; then
+        echo "`basename $0`: $*"
+    fi
+    return 0
+}
+
+# get_nodenames
+#
+# Get all the node names in this failover group
+#
+# Sets PRIM_NODENAME to the first name of the colon-separated
+# HOSTNAME_OPT and fills NODE_NAMES with all names in the order given, so
+# NODE_NAMES[0] is always the primary node.
+# Always returns 0.
+get_nodenames() {
+    PRIM_NODENAME=`echo ${HOSTNAME_OPT} | awk -F":" '{print $1}'`
+
+    declare -i idx
+    local nodename_str nodename
+
+    # Walk the fields by number: awk's "for (i in a)" order is
+    # unspecified and could move the primary node away from index 0.
+    nodename_str=`echo ${HOSTNAME_OPT} | awk -F":" '{for (i = 1; i <= NF; i++) print $i}'`
+    idx=0
+    for nodename in ${nodename_str}
+    do
+        NODE_NAMES[idx]=${nodename}
+        idx=$idx+1
+    done
+
+    return 0
+}
+
+# get_check_srvIPaddrs
+#
+# Get and check all the service IP addresses in this failover group
+#
+# Fills SRV_IPADDRS from the colon-separated SRVADDR_OPT (order kept) and
+# runs SCRIPT_VERIFY_SRVIP for every address/node pair.
+# Returns 0 when every check passes, 1 at the first failure.
+get_check_srvIPaddrs() {
+    declare -i idx
+    declare -i i
+    local srvIPaddr_str srvIPaddr
+
+    srvIPaddr_str=`echo ${SRVADDR_OPT} | awk -F":" '{for (i = 1; i <= NF; i++) print $i}'`
+    idx=0
+    for srvIPaddr in ${srvIPaddr_str}
+    do
+        SRV_IPADDRS[idx]=${srvIPaddr}
+        idx=$idx+1
+    done
+
+    for ((idx = 0; idx < ${#SRV_IPADDRS[@]}; idx++)); do
+        for ((i = 0; i < ${#NODE_NAMES[@]}; i++)); do
+            # Check service IP address
+            verbose_output "Verifying service IP ${SRV_IPADDRS[idx]} and" \
+                           "real IP of host ${NODE_NAMES[i]} are in the" \
+                           "same subnet..."
+            if ! ${SCRIPT_VERIFY_SRVIP} ${SRV_IPADDRS[idx]} ${NODE_NAMES[i]}
+            then
+                return 1
+            fi
+            verbose_output "OK"
+        done
+    done
+
+    return 0
+}
+
+# stop_clumanager
+#
+# Run pdsh command to stop each node's clumanager service
+#
+# Returns 0 on success, 1 if the pdsh command fails.
+stop_clumanager() {
+    declare -i idx
+    local nodename_str=${PRIM_NODENAME}
+
+    # pdsh takes a comma-separated host list
+    for ((idx = 1; idx < ${#NODE_NAMES[@]}; idx++)); do
+        nodename_str=${nodename_str}$","${NODE_NAMES[idx]}
+    done
+
+    # PDSH is not set among this script's own globals; presumably it is
+    # exported by the caller, so fall back to plain "pdsh" when it is not.
+    ${PDSH:-pdsh} -w ${nodename_str} /sbin/service clumanager stop
+    if [ $? -ne 0 ]; then
+        echo >&2 "`basename $0`: stop_clumanager() error:"\
+                 "Fail to execute pdsh command!"
+        return 1
+    fi
+
+    return 0
+}
+
+# check_retval retval
+#
+# Check the return value of redhat-config-cluster-cmd
+#
+# Returns 0 when retval is 0, otherwise prints an error and returns 1.
+check_retval() {
+    if [ $1 -ne 0 ]; then
+        echo >&2 "`basename $0`: Fail to run ${CONFIG_CMD}!"
+        return 1
+    fi
+
+    return 0
+}
+
+# gen_cluster_xml
+#
+# Run redhat-config-cluster-cmd to create the cluster.xml file
+#
+# Adds, in this order: the clumembd heartbeat method (from
+# HBCHANNEL_OPT), the cluster name, one member per node, the failover
+# domain with its nodes, the service with its user script, failover
+# domain and service IP addresses, and finally any extra options from the
+# colon-separated HBOPT_OPT.
+# Returns 0 on success, 1 as soon as one CONFIG_CMD invocation fails.
+gen_cluster_xml() {
+    declare -i idx
+    local mcast_IPaddr
+    local hbopt_str hbopt
+
+    # Run redhat-config-cluster-cmd to generate cluster.xml
+    # Add clumembd tag
+    if [ "${HBCHANNEL_OPT}" != "${HBCHANNEL_OPT#*broadcast*}" ]; then
+        ${CONFIG_CMD} --clumembd --broadcast=yes
+        if ! check_retval $?; then
+            return 1
+        fi
+    elif [ "${HBCHANNEL_OPT}" != "${HBCHANNEL_OPT#*multicast*}" ]; then
+        # The multicast address is the second word of the channel option;
+        # nothing is configured when it is missing
+        mcast_IPaddr=`echo ${HBCHANNEL_OPT} | awk '{print $2}'`
+        if [ -n "${mcast_IPaddr}" ]; then
+            ${CONFIG_CMD} --clumembd --multicast=yes\
+                          --multicast_ipaddress=${mcast_IPaddr}
+            if ! check_retval $?; then
+                return 1
+            fi
+        fi
+    fi
+
+    # Add cluster tag
+    # (double quotes so that ${TARGET_TYPE} is expanded in the name)
+    ${CONFIG_CMD} --cluster --name="${TARGET_TYPE} failover group"
+    if ! check_retval $?; then
+        return 1
+    fi
+
+    # Add member tag
+    for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do
+        ${CONFIG_CMD} --add_member --name=${NODE_NAMES[idx]}
+        if ! check_retval $?; then
+            return 1
+        fi
+    done
+
+    # Add failoverdomain tag
+    ${CONFIG_CMD} --add_failoverdomain --name=${TARGET_TYPE}-domain
+    if ! check_retval $?; then
+        return 1
+    fi
+
+    for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do
+        ${CONFIG_CMD} --failoverdomain=${TARGET_TYPE}-domain\
+                      --add_failoverdomainnode --name=${NODE_NAMES[idx]}
+        if ! check_retval $?; then
+            return 1
+        fi
+    done
+
+    # Add service tag
+    ${CONFIG_CMD} --add_service --name=${TARGET_TYPE}-service
+    if ! check_retval $?; then
+        return 1
+    fi
+
+    ${CONFIG_CMD} --service=${TARGET_TYPE}-service \
+                  --userscript=${LUSTRE_SRV_SCRIPT}
+    if ! check_retval $?; then
+        return 1
+    fi
+
+    ${CONFIG_CMD} --service=${TARGET_TYPE}-service \
+                  --failoverdomain=${TARGET_TYPE}-domain
+    if ! check_retval $?; then
+        return 1
+    fi
+
+    # The addresses belong to the service created above, whatever the
+    # target type is (not only to an "mgs" service)
+    for ((idx = 0; idx < ${#SRV_IPADDRS[@]}; idx++)); do
+        ${CONFIG_CMD} --service=${TARGET_TYPE}-service \
+                      --add_service_ipaddress --ipaddress=${SRV_IPADDRS[idx]}
+        if ! check_retval $?; then
+            return 1
+        fi
+    done
+
+    # Add other tags
+    # Each colon-separated option is passed to CONFIG_CMD on its own, in
+    # the order given
+    if [ -n "${HBOPT_OPT}" ]; then
+        hbopt_str=`echo ${HBOPT_OPT} | awk -F":" '{for (i = 1; i <= NF; i++) print $i}'`
+        for hbopt in ${hbopt_str}
+        do
+            ${CONFIG_CMD} ${hbopt}
+            if ! check_retval $?; then
+                return 1
+            fi
+        done
+    fi
+
+    return 0
+}
+
+# create_config
+#
+# Create the cluster.xml file and scp it to the each node's /etc/
+create_config() {
+    CONFIG_PRIMNODE=${TMP_DIR}$"cluster.xml."${PRIM_NODENAME}
+    declare -i idx
+
+    if [ -e ${CONFIG_PRIMNODE} ]; then
+        verbose_output "${CONFIG_PRIMNODE} already exists."
+        return 0
+    fi
+
+    # Run redhat-config-cluster-cmd to generate cluster.xml
+    verbose_output "Creating cluster.xml file for" \
+                   "${PRIM_NODENAME} failover group hosts..."
+    if ! gen_cluster_xml; then
+        return 1
+    fi
+    verbose_output "OK"
+
+    /bin/cp -f ${CLUMGR_DIR}cluster.xml ${CONFIG_PRIMNODE}
+
+    # scp the cluster.xml file to all the nodes
+    verbose_output "Remote copying cluster.xml file to" \
+                   "${PRIM_NODENAME} failover group hosts..."
+ for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do + touch ${TMP_DIR}$"cluster.xml."${NODE_NAMES[idx]} + scp ${CONFIG_PRIMNODE} ${NODE_NAMES[idx]}:${CLUMGR_DIR}cluster.xml + if [ $? -ne 0 ]; then + echo >&2 "`basename $0`: Fail to scp cluster.xml file"\ + "to node ${NODE_NAMES[idx]}!" + return 1 + fi + done + verbose_output "OK" + + return 0 +} + +# Main flow +# Get all the node names +if ! get_nodenames; then + exit 1 +fi + +# Get and check all the service IP addresses +if ! get_check_srvIPaddrs; then + exit 1 +fi + +# Stop clumanager services +verbose_output "Stopping clumanager service in the ${PRIM_NODENAME}"\ + "failover group hosts..." +if ! stop_clumanager; then + exit 1 +fi +verbose_output "OK" + +# Generate configuration files +if ! create_config; then + exit 1 +fi + +exit 0 diff --git a/lustre/utils/cluster_scripts/gen_hb_config.sh b/lustre/utils/cluster_scripts/gen_hb_config.sh new file mode 100755 index 0000000..bf66368 --- /dev/null +++ b/lustre/utils/cluster_scripts/gen_hb_config.sh @@ -0,0 +1,591 @@ +#!/bin/bash +# +# gen_hb_config.sh - script for generating the Heartbeat HA software's +# configuration files +# +############################################################################### + +# Usage +usage() { + cat >&2 < <-n hostnames> <-d target device> + <-c heartbeat channels> <-s service address> + [-o heartbeat options] [-v] + + -r HBver the version of Heartbeat software + The Heartbeat software versions which are curr- + ently supported are: hbv1 (Heartbeat version 1) + and hbv2 (Heartbeat version 2). + -n hostnames the nodenames of the primary node and its fail- + overs + Multiple nodenames are separated by colon (:) + delimeter. The first one is the nodename of the + primary node, the others are failover nodenames. + -d target device the target device name and type + The name and type are separated by colon (:) + delimeter. The type values are: mgs, mdt, ost or + mgs_mdt. 
+ -c heartbeat channels the methods and devices to send/rcv heartbeats on + -s service address the IP address to failover + -o heartbeat options a "catchall" for other heartbeat configuration + options + -v verbose mode + +EOF + exit 1 +} + +# Global variables +SCRIPTS_PATH=${CLUSTER_SCRIPTS_PATH:-"./"} +SCRIPT_VERIFY_SRVIP=${SCRIPTS_PATH}$"verify_serviceIP.sh" + +LUSTRE_SRV_SCRIPT=$"lustre" # service script for lustre +MON_SRV_SCRIPT=$"mon" # service script for mon +LUSTRE_MON_SCRIPT=$"simple.health_check.monitor" +LUSTRE_ALERT_SCRIPT=$"fail_lustre.alert" +CIB_GEN_SCRIPT=$"/usr/lib/heartbeat/cts/haresources2cib.py" + +TMP_DIR=$"/tmp/heartbeat/" # temporary directory +HACF_TEMP=${TMP_DIR}$"ha.cf.temp" +AUTHKEYS_TEMP=${TMP_DIR}$"authkeys.temp" +MONCF_TEMP=${TMP_DIR}$"mon.cf.temp" + +HA_DIR=$"/etc/ha.d/" # Heartbeat configuration directory +MON_DIR=$"/etc/mon/" # mon configuration directory +CIB_DIR=$"/var/lib/heartbeat/crm/" # cib.xml directory + +HBVER_HBV1=$"hbv1" # Heartbeat version 1 +HBVER_HBV2=$"hbv2" # Heartbeat version 2 + +declare -a NODE_NAMES # node names in the failover group + +# Get and check the positional parameters +while getopts "r:n:d:c:s:o:v" OPTION; do + case $OPTION in + r) + HBVER_OPT=$OPTARG + if [ "${HBVER_OPT}" != "${HBVER_HBV1}" ] \ + && [ "${HBVER_OPT}" != "${HBVER_HBV2}" ]; then + echo >&2 $"`basename $0`: Invalid Heartbeat software" \ + "version - ${HBVER_OPT}!" + usage + fi + ;; + n) + HOSTNAME_OPT=$OPTARG + HOSTNAME_NUM=`echo ${HOSTNAME_OPT} | awk -F":" '{print NF}'` + if [ ${HOSTNAME_NUM} -lt 2 ]; then + echo >&2 $"`basename $0`: Lack failover nodenames!" + usage + fi + ;; + d) + DEVICE_OPT=$OPTARG + TARGET_DEV=`echo ${DEVICE_OPT} | awk -F":" '{print $1}'` + TARGET_TYPE=`echo ${DEVICE_OPT} | awk -F":" '{print $2}'` + if [ -z "${TARGET_TYPE}" ]; then + echo >&2 $"`basename $0`: Lack target device type!" 
+ usage + fi + if [ "${TARGET_TYPE}" != "mgs" ]&&[ "${TARGET_TYPE}" != "mdt" ]\ + &&[ "${TARGET_TYPE}" != "ost" ]&&[ "${TARGET_TYPE}" != "mgs_mdt" ] + then + echo >&2 $"`basename $0`: Invalid target device type" \ + "- ${TARGET_TYPE}!" + usage + fi + ;; + c) + HBCHANNEL_OPT=$OPTARG + HBCHANNEL_OPT=`echo "${HBCHANNEL_OPT}" | sed 's/^"//' \ + | sed 's/"$//'` + if [ "${HBCHANNEL_OPT}" = "${HBCHANNEL_OPT#*serial*}" ] \ + && [ "${HBCHANNEL_OPT}" = "${HBCHANNEL_OPT#*bcast*}" ] \ + && [ "${HBCHANNEL_OPT}" = "${HBCHANNEL_OPT#*ucast*}" ] \ + && [ "${HBCHANNEL_OPT}" = "${HBCHANNEL_OPT#*mcast*}" ]; then + echo >&2 $"`basename $0`: Invalid Heartbeat channel" \ + "- ${HBCHANNEL_OPT}!" + usage + fi + ;; + s) + SRVADDR_OPT=$OPTARG + ;; + o) + HBOPT_OPT=$OPTARG + HBOPT_OPT=`echo "${HBOPT_OPT}" | sed 's/^"//' | sed 's/"$//'` + ;; + v) + VERBOSE_OPT=$"yes" + ;; + ?) + usage + esac +done + +# Check the required parameters +if [ -z "${HBVER_OPT}" ]; then + echo >&2 $"`basename $0`: Lack -r option!" + usage +fi + +if [ -z "${HOSTNAME_OPT}" ]; then + echo >&2 $"`basename $0`: Lack -n option!" + usage +fi + +if [ -z "${DEVICE_OPT}" ]; then + echo >&2 $"`basename $0`: Lack -d option!" + usage +fi + +if [ -z "${HBCHANNEL_OPT}" ]; then + echo >&2 $"`basename $0`: Lack -c option!" + usage +fi + +if [ -z "${SRVADDR_OPT}" ]; then + echo >&2 $"`basename $0`: Lack -s option!" + usage +fi + +if [ "${HBVER_OPT}" = "${HBVER_HBV1}" -a ${HOSTNAME_NUM} -gt 2 ]; then + echo >&2 $"`basename $0`: Heartbeat version 1 can only support 2 nodes!" 
+ usage +fi + +# Output verbose informations +verbose_output() { + if [ "${VERBOSE_OPT}" = "yes" ]; then + echo "`basename $0`: $*" + fi + return 0 +} + +# get_nodenames +# +# Get all the node names in this failover group +get_nodenames() { + PRIM_NODENAME=`echo ${HOSTNAME_OPT} | awk -F":" '{print $1}'` + + declare -i idx + local nodename_str nodename + + nodename_str=`echo ${HOSTNAME_OPT}|awk '{split($HOSTNAME_OPT, a, ":")}\ + END {for (i in a) print a[i]}'` + idx=0 + for nodename in ${nodename_str} + do + NODE_NAMES[idx]=${nodename} + idx=$idx+1 + done + + return 0 +} + +# check_srvIPaddr +# +# Check service IP address in this failover group +check_srvIPaddr() { + declare -i idx + + for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do + # Check service IP address + verbose_output "Verifying service IP ${SRVADDR_OPT} and" \ + "real IP of host ${NODE_NAMES[idx]} are in the" \ + "same subnet..." + if ! ${SCRIPT_VERIFY_SRVIP} ${SRVADDR_OPT} ${NODE_NAMES[idx]} + then + return 1 + fi + verbose_output "OK" + done + + return 0 +} + +# stop_heartbeat +# +# Run pdsh command to stop each node's heartbeat service +stop_heartbeat() { + declare -i idx + local nodename_str=${PRIM_NODENAME} + + for ((idx = 1; idx < ${#NODE_NAMES[@]}; idx++)); do + nodename_str=${nodename_str}$","${NODE_NAMES[idx]} + done + + ${PDSH} -w ${nodename_str} /sbin/service heartbeat stop + if [ $? -ne 0 ]; then + echo >&2 "`basename $0`: stop_heartbeat() error:"\ + "Fail to execute pdsh command!" + return 1 + fi + + return 0 +} + +# create_template +# +# Create the templates for ha.cf, authkeys and mon.cf files +create_template() { + /bin/mkdir -p ${TMP_DIR} + + # Create the template for ha.cf + if [ "${HBVER_OPT}" = "${HBVER_HBV1}" ]; then + cat >${HACF_TEMP} <${HACF_TEMP} <${AUTHKEYS_TEMP} <${MONCF_TEMP} <&2 $"`basename $0`: Invalid UDP port" \ + "- ${port}!" 
+            return 1
+        fi
+    fi
+
+    # Add the UDP port number into each failover node's udpport file
+    # (one file per node under TMP_DIR, named udpport.<nodename>)
+    for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do
+        UDPPORT_NODE=${TMP_DIR}$"udpport."${NODE_NAMES[idx]}
+        echo ${port} > ${UDPPORT_NODE}
+    done
+
+    # The chosen port is the function's output; callers capture it with
+    # command substitution
+    echo ${port}
+    return 0
+}
+
+# create_hacf
+#
+# Create the ha.cf file and scp it to each node's /etc/ha.d/
+#
+# The file is built in TMP_DIR as ha.cf.<primary node> from the template
+# HACF_TEMP, followed by the udpport/baud lines required by the chosen
+# channels, the channel directives, one "node" line per node and the
+# extra heartbeat options.  If a non-empty copy already exists nothing is
+# done.
+# Returns 0 on success, 1 if no UDP port could be generated or a copy to
+# a node fails.
+create_hacf() {
+    HACF_PRIMNODE=${TMP_DIR}$"ha.cf."${PRIM_NODENAME}
+
+    declare -i idx
+
+    if [ -s ${HACF_PRIMNODE} ]; then
+        # The ha.cf file for the primary node has already existed.
+        verbose_output "${HACF_PRIMNODE} already exists."
+        return 0
+    fi
+
+    /bin/cp -f ${HACF_TEMP} ${HACF_PRIMNODE}
+
+    # bcast and ucast channels need a udpport directive
+    if [ "${HBCHANNEL_OPT}" != "${HBCHANNEL_OPT#*bcast*}" ] \
+    || [ "${HBCHANNEL_OPT}" != "${HBCHANNEL_OPT#*ucast*}" ]; then
+        UDPPORT_OPT=$(gen_udpport)
+        if [ $? -ne 0 ]; then
+            return 1
+        fi
+        echo "udpport ${UDPPORT_OPT}" >> ${HACF_PRIMNODE}
+    fi
+
+    # serial channels need a baud directive
+    if [ "${HBCHANNEL_OPT}" != "${HBCHANNEL_OPT#*serial*}" ]; then
+        echo "baud 19200" >> ${HACF_PRIMNODE}
+    fi
+
+    # One line per colon-separated channel.  Inside the single quotes awk
+    # sees HBCHANNEL_OPT as an unset awk variable, so $HBCHANNEL_OPT is
+    # $0, the whole input line.  NOTE(review): "for (i in a)" does not
+    # guarantee that the lines come out in the order given.
+    echo ${HBCHANNEL_OPT} | awk '{split($HBCHANNEL_OPT, a, ":")} \
+    END {for (i in a) print a[i]}' >> ${HACF_PRIMNODE}
+
+    for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do
+        echo "node ${NODE_NAMES[idx]}" >> ${HACF_PRIMNODE}
+    done
+
+    # One line per colon-separated heartbeat option (same awk idiom)
+    echo ${HBOPT_OPT} | awk '{split($HBOPT_OPT, a, ":")} \
+    END {for (i in a) print a[i]}' >> ${HACF_PRIMNODE}
+
+    # scp ha.cf file to all the nodes
+    # (the touch leaves an ha.cf.<node> marker for every node in TMP_DIR)
+    for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do
+        touch ${TMP_DIR}$"ha.cf."${NODE_NAMES[idx]}
+        scp ${HACF_PRIMNODE} ${NODE_NAMES[idx]}:${HA_DIR}ha.cf
+        if [ $? -ne 0 ]; then
+            echo >&2 "`basename $0`: Fail to scp ha.cf file"\
+                     "to node ${NODE_NAMES[idx]}!"
+            return 1
+        fi
+    done
+
+    return 0
+}
+
+# create_haresources
+#
+# Create the haresources file and scp it to the each node's /etc/ha.d/
+#
+# For Heartbeat version 2 the haresources file is converted with
+# CIB_GEN_SCRIPT and the resulting cib.xml is copied to CIB_DIR on every
+# node instead.  If a non-empty haresources file for the primary node
+# already exists nothing is done.
+# Returns 0 on success, 1 if the conversion or a copy fails.
+create_haresources() {
+    HARES_PRIMNODE=${TMP_DIR}$"haresources."${PRIM_NODENAME}
+    declare -i idx
+
+    if [ -s ${HARES_PRIMNODE} ]; then
+        # The haresources file for the primary node has already existed
+        verbose_output "${HARES_PRIMNODE} already exists."
+        return 0
+    fi
+
+    # Add the resource group line into the haresources file
+    # (primary node, service address, lustre resource, mon resource)
+    echo "${PRIM_NODENAME} ${SRVADDR_OPT} "\
+         "${LUSTRE_SRV_SCRIPT}::${TARGET_TYPE}::${TARGET_DEV} "\
+         "${MON_SRV_SCRIPT}" > ${HARES_PRIMNODE}
+
+    # Generate the cib.xml file
+    if [ "${HBVER_OPT}" = "${HBVER_HBV2}" ]; then
+        CIB_PRIMNODE=${TMP_DIR}$"cib.xml."${PRIM_NODENAME}
+        python ${CIB_GEN_SCRIPT} ${HARES_PRIMNODE} > ${CIB_PRIMNODE}
+        if [ $? -ne 0 ]; then
+            echo >&2 "`basename $0`: Fail to generate cib.xml file"\
+                     "for node ${PRIM_NODENAME}!"
+            return 1
+        fi
+    fi
+
+    # scp the haresources file or cib.xml file
+    for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do
+        touch ${TMP_DIR}$"haresources."${NODE_NAMES[idx]}
+        if [ "${HBVER_OPT}" = "${HBVER_HBV2}" ]; then
+            scp ${CIB_PRIMNODE} ${NODE_NAMES[idx]}:${CIB_DIR}cib.xml
+        else
+            scp ${HARES_PRIMNODE} ${NODE_NAMES[idx]}:${HA_DIR}haresources
+        fi
+
+        # $? is the status of whichever scp ran; the message names
+        # haresources in both cases
+        if [ $? -ne 0 ]; then
+            echo >&2 "`basename $0`: Fail to scp haresources file"\
+                     "to node ${NODE_NAMES[idx]}!"
+            return 1
+        fi
+    done
+
+    return 0
+}
+
+# create_authkeys
+#
+# Create the authkeys file and scp it to the each node's /etc/ha.d/
+create_authkeys() {
+    AUTHKEYS_PRIMNODE=${TMP_DIR}$"authkeys."${PRIM_NODENAME}
+    declare -i idx
+
+    if [ -e ${AUTHKEYS_PRIMNODE} ]; then
+        verbose_output "${AUTHKEYS_PRIMNODE} already exists."
+        return 0
+    fi
+
+    # scp the authkeys file to all the nodes
+    for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do
+        touch ${TMP_DIR}$"authkeys."${NODE_NAMES[idx]}
+        scp ${AUTHKEYS_TEMP} ${NODE_NAMES[idx]}:${HA_DIR}authkeys
+        if [ $?
-ne 0 ]; then + echo >&2 "`basename $0`: Fail to scp authkeys file"\ + "to node ${NODE_NAMES[idx]}!" + return 1 + fi + done + + return 0 +} + +# create_moncf +# +# Create the mon.cf file and scp it to the each node's /etc/mon/ +create_moncf() { + MONCF_PRIMNODE=${TMP_DIR}$"mon.cf."${PRIM_NODENAME} + declare -i idx + local hostgroup_str=$"hostgroup ${TARGET_TYPE}-group" + + if [ -e ${MONCF_PRIMNODE} ]; then + verbose_output "${MONCF_PRIMNODE} already exists." + return 0 + fi + + /bin/cp -f ${MONCF_TEMP} ${MONCF_PRIMNODE} + + for ((idx = 0; idx < ${#NODE_NAMES[@]}; idx++)); do + hostgroup_str=${hostgroup_str}$" "${NODE_NAMES[idx]} + done + + echo ${hostgroup_str} >> ${MONCF_PRIMNODE} + + cat >>${MONCF_PRIMNODE} <&2 "`basename $0`: Fail to scp mon.cf file"\ + "to node ${NODE_NAMES[idx]}!" + return 1 + fi + done + + return 0 +} + +# generate_config +# +# Generate the configuration files for Heartbeat and scp them to all the nodes +generate_config() { + if ! create_template; then + return 1 + fi + + verbose_output "Creating and remote copying ha.cf file to"\ + "${PRIM_NODENAME} failover group hosts..." + if ! create_hacf; then + return 1 + fi + verbose_output "OK" + + if [ "${HBVER_OPT}" = "${HBVER_HBV1}" ]; then + verbose_output "Creating and remote copying haresources file"\ + "to ${PRIM_NODENAME} failover group hosts..." + else + verbose_output "Creating and remote copying cib.xml file"\ + "to ${PRIM_NODENAME} failover group hosts..." + fi + + if ! create_haresources; then + return 1 + fi + verbose_output "OK" + + verbose_output "Creating and remote copying authkeys file to" \ + "${PRIM_NODENAME} failover group hosts..." + if ! create_authkeys; then + return 1 + fi + verbose_output "OK" + + verbose_output "Creating and remote copying mon.cf file to" \ + "${PRIM_NODENAME} failover group hosts..." + if ! create_moncf; then + return 1 + fi + verbose_output "OK" + + return 0 +} + +# Main flow +# Get all the node names +if ! 
get_nodenames; then + exit 1 +fi + +# Check service IP address +if ! check_srvIPaddr; then + exit 1 +fi + +# Stop heartbeat services +verbose_output "Stopping heartbeat service in the ${PRIM_NODENAME}"\ + "failover group hosts..." +if ! stop_heartbeat; then + exit 1 +fi +verbose_output "OK" + +# Generate configuration files +if ! generate_config; then + exit 1 +fi + +exit 0 diff --git a/lustre/utils/cluster_scripts/module_config.sh b/lustre/utils/cluster_scripts/module_config.sh new file mode 100755 index 0000000..baff1eb --- /dev/null +++ b/lustre/utils/cluster_scripts/module_config.sh @@ -0,0 +1,61 @@ +#!/bin/bash +# +# module_config.sh - add lustre options lines into modprobe.conf or +# modules.conf +# +################################################################################# + +# Check the kernel version +KERNEL_VERSION=`uname -r` +KERNEL_VERSION=${KERNEL_VERSION:0:3} + +if [ "${KERNEL_VERSION}" = "2.4" ]; then + MODULE_CONF=/etc/modules.conf +else + MODULE_CONF=/etc/modprobe.conf +fi + +read -r NETWORKS +MODLINES_FILE=/tmp/modlines$$.txt +START_MARKER=$"# start lustre config" +END_MARKER=$"# end lustre config" + +# Generate a temp file contains lnet options lines +generate_lnet_lines() { + local LNET_LINE TMP_LINE + + TMP_LINE="${NETWORKS}" + + echo ${START_MARKER} > ${MODLINES_FILE} + while true; do + LNET_LINE=${TMP_LINE%%\\n*} + echo ${LNET_LINE} >> ${MODLINES_FILE} + + TMP_LINE=${TMP_LINE#*\\n} + + if [ "${TMP_LINE}" == "${LNET_LINE}" ]; then + break + fi + done + echo ${END_MARKER} >> ${MODLINES_FILE} + + #echo "--------------${MODLINES_FILE}--------------" + #cat ${MODLINES_FILE} + #echo -e "------------------------------------------\n" + + return 0 +} + +if ! 
generate_lnet_lines; then + exit 1 +fi + +# Add lnet options lines to the module configuration file +if [ -e ${MODULE_CONF} ]; then + # Delete the old options + sed -i "/${START_MARKER}/,/${END_MARKER}/d" ${MODULE_CONF} +fi + +cat ${MODLINES_FILE} >> ${MODULE_CONF} +rm -f ${MODLINES_FILE} +exit 0 diff --git a/lustre/utils/cluster_scripts/verify_cluster_net.sh b/lustre/utils/cluster_scripts/verify_cluster_net.sh new file mode 100755 index 0000000..f5f59c4 --- /dev/null +++ b/lustre/utils/cluster_scripts/verify_cluster_net.sh @@ -0,0 +1,296 @@ +#!/bin/bash +# +# verify_cluster_net.sh - script for Lustre cluster network verification +# +############################################################################### + +# Usage +usage() { + cat >&2 < + + -v verbose mode + csv file a spreadsheet that contains configuration parameters + (separated by commas) for each target in a Lustre cl- + uster, the first field of each line is the host name + of the cluster node + +EOF + exit 1 +} + +# Get and check the positional parameters +while getopts "v" OPTION; do + case $OPTION in + v) + VERBOSE_OPT=$"yes" + ;; + ?) + usage + esac +done + +# Toss out the parameters we've already processed +shift `expr $OPTIND - 1` + +# Here we expect the csv file +if [ $# -eq 0 ]; then + echo >&2 $"`basename $0`: Lack csv file!" + usage +fi + +# Global variables +CSV_FILE=$1 +declare -a HOST_NAMES +declare -a HOST_IPADDRS + +# Output verbose informations +verbose_output() { + if [ "${VERBOSE_OPT}" = "yes" ]; then + echo "`basename $0`: $*" + fi + return 0 +} + +# Check the csv file +check_file() { + if [ ! -s ${CSV_FILE} ]; then + echo >&2 $"`basename $0`: check_file() error: ${CSV_FILE}" \ + "does not exist or is empty!" 
+ return 1 + fi + + return 0 +} + +# Get the host names from the csv file +get_hostnames() { + local NAME CHECK_STR + declare -i i + + # Initialize the HOST_NAMES array + for ((i = 0; i < ${#HOST_NAMES[@]}; i++)); do + HOST_NAMES[i]=$"" + done + + CHECK_STR=`egrep -v "([[:space:]]|^)#" ${CSV_FILE} | awk -F, \ + '/[[:alnum:]]/{if ($1 !~/[[:alnum:]]/) print $0}'` + if [ -n "${CHECK_STR}" ]; then + echo >&2 $"`basename $0`: get_hostnames() error: Lack hostname"\ + "field in the line - ${CHECK_STR}" + return 1 + fi + + i=0 + for NAME in `egrep -v "([[:space:]]|^)#" ${CSV_FILE}\ + | awk -F, '/[[:alnum:]]/{print $1}'` + do + HOST_NAMES[i]=${NAME} + i=$i+1 + done + + return 0 +} + +# Check whether the host name matches the name in the local /etc/hosts table +# and whether the IP address according to the host name is correct +local_check() { + # Check argument + if [ $# -ne 2 ]; then + echo >&2 $"`basename $0`: local_check() error: Lack argument" \ + "for function local_check()!" + return 1 + fi + + local RET_STR REAL_NAME + + # Get the IP address according to the host name from /etc/hosts table + # of the current host + HOST_IPADDRS[$2]=`egrep "[[:space:]]$1([[:space:]]|$)" /etc/hosts \ + | awk '{print $1}'` + if [ -z "${HOST_IPADDRS[$2]}" ]; then + echo >&2 "`basename $0`: local_check() error: $1 does not" \ + "exist in the local /etc/hosts table!" + return 1 + fi + + if [ ${#HOST_IPADDRS[$2]} -gt 15 ]; then + echo >&2 "`basename $0`: local_check() error: More than one" \ + "IP address line according to $1 in the local" \ + "/etc/hosts table!" + return 1 + fi + + # Execute pdsh command to get the real host name + RET_STR=`${PDSH} -w ${HOST_IPADDRS[$2]} hostname 2>&1` + if [ $? -ne 0 ] || [ "${RET_STR}" != "${RET_STR#*connect:*}" ]; then + echo >&2 "`basename $0`: local_check() error: pdsh error:" \ + "${RET_STR}" + return 1 + fi + + if [ -z "${RET_STR}" ]; then + echo >&2 "`basename $0`: local_check() error: pdsh error:" \ + "No results from pdsh! 
Check the network connectivity"\ + "between the local host and ${HOST_IPADDRS[$2]}" \ + "or check the two hosts' rcmd module!" + return 1 + fi + + REAL_NAME=`echo ${RET_STR} | awk '{print $2}'` + if [ "$1" != "${REAL_NAME}" ]; then + echo >&2 "`basename $0`: local_check() error: The real hostname"\ + "according to ${HOST_IPADDRS[$2]} is ${REAL_NAME}," \ + "not $1! Check the local /etc/hosts table!" + return 1 + fi + + return 0 +} + +# Check whether the correct host name and IP address pair matches +# the one in the remote /etc/hosts tables +remote_check() { + # Check argument + if [ $# -ne 2 ]; then + echo >&2 $"`basename $0`: remote_check() error: Lack argument"\ + "for function remote_check()!" + return 1 + fi + + declare -i i + local RET_STR COMMAND IP_ADDR + + COMMAND=$"egrep \"[[:space:]]$1([[:space:]]|$)\" /etc/hosts" + + # Execute pdsh command to check remote /etc/hosts tables + for ((i = 0; i < ${#HOST_NAMES[@]}; i++)); do + RET_STR=`${PDSH} -w ${HOST_NAMES[i]} ${COMMAND} 2>&1` + if [ $? -ne 0 ] || [ "${RET_STR}" != "${RET_STR#*connect:*}" ] + then + echo >&2 "`basename $0`: remote_check() error:" \ + "pdsh error: ${RET_STR}" + return 1 + fi + + IP_ADDR=`echo ${RET_STR} | awk '{print $2}'` + if [ -z "${IP_ADDR}" ]; then + echo >&2 "`basename $0`: remote_check() error:" \ + "$1 does not exist in the ${HOST_NAMES[i]}'s"\ + "/etc/hosts table!" + return 1 + fi + + if [ "${IP_ADDR}" != "${HOST_IPADDRS[$2]}" ]; then + echo >&2 "`basename $0`: remote_check() error:" \ + "IP address ${IP_ADDR} of $1 in the" \ + "${HOST_NAMES[i]}'s /etc/hosts is incorrect!" + return 1 + fi + + done + + return 0 +} + +# Verify forward and reverse network connectivity of the Lustre cluster +network_check () { + # Check argument + if [ $# -eq 0 ]; then + echo >&2 $"`basename $0`: network_check() error: Lack argument" \ + "for function network_check()!" 
+ return 1 + fi + + declare -i i + local RET_STR COMMAND REAL_NAME + + # Execute pdsh command to check network connectivity + for ((i = 0; i < ${#HOST_NAMES[@]}; i++)); do + COMMAND=$"${PDSH} -w ${HOST_NAMES[i]} hostname" + RET_STR=`${PDSH} -w $1 ${COMMAND} 2>&1` + if [ $? -ne 0 ] || [ "${RET_STR}" != "${RET_STR#*connect:*}" ] + then + echo >&2 "`basename $0`: network_check() error:" \ + "pdsh error: ${RET_STR}" + return 1 + fi + + if [ -z "${RET_STR}" ]; then + echo >&2 "`basename $0`: network_check() error:" \ + "pdsh error: Nothing get from pdsh! Check" \ + "the network connectivity between $1 and" \ + "${HOST_NAMES[i]} or the two hosts' rcmd module!" + return 1 + fi + + REAL_NAME=`echo ${RET_STR} | awk '{print $3}'` + if [ "${HOST_NAMES[i]}" != "${REAL_NAME}" ]; then + echo >&2 "`basename $0`: network_check() error:" \ + "${RET_STR}" + return 1 + fi + done + + return 0 +} + +# Verify forward and reverse network connectivity of the Lustre cluster, +# and that hostnames match the names in the /etc/hosts tables. +network_verify() { + declare -i i + + # Initialize the HOST_IPADDRS array + for ((i = 0; i < ${#HOST_IPADDRS[@]}; i++)); do + HOST_IPADDRS[i]=$"" + done + + # Get all the host names from the csv file + if ! get_hostnames; then + return 1 + fi + + # Check whether all the host names match the names in + # all the /etc/hosts tables of the Lustre cluster + for ((i = 0; i < ${#HOST_NAMES[@]}; i++)); do + verbose_output "Verifying IP address of host" \ + "${HOST_NAMES[i]} in the local /etc/hosts..." + if ! local_check ${HOST_NAMES[i]} $i; then + return 1 + fi + verbose_output "OK" + done + + for ((i = 0; i < ${#HOST_NAMES[@]}; i++)); do + verbose_output "Verifying IP address of host" \ + "${HOST_NAMES[i]} in the remote /etc/hosts..." + if ! 
remote_check ${HOST_NAMES[i]} $i; then + return 1 + fi + verbose_output "OK" + done + + # Verify network connectivity of the Lustre cluster + for ((i = 0; i < ${#HOST_NAMES[@]}; i++)); do + verbose_output "Verifying network connectivity of host" \ + "${HOST_NAMES[i]} to other hosts..." + if ! network_check ${HOST_NAMES[i]}; then + return 1 + fi + verbose_output "OK" + done + + return 0 +} + +# Main flow +if ! check_file; then + exit 1 +fi + +if ! network_verify; then + exit 1 +fi + +exit 0 diff --git a/lustre/utils/cluster_scripts/verify_serviceIP.sh b/lustre/utils/cluster_scripts/verify_serviceIP.sh new file mode 100755 index 0000000..cdc749d --- /dev/null +++ b/lustre/utils/cluster_scripts/verify_serviceIP.sh @@ -0,0 +1,228 @@ +#!/bin/bash +# +# verify_serviceIP.sh - script for verifying the service IP and the real +# interface IP in a remote host are in the same subnet +# +############################################################################### + +# Usage +usage() { + cat >&2 < + + service IPaddr the IP address to failover + hostname the hostname of the remote node + +EOF + exit 1 +} + +# Check arguments +if [ $# -lt 2 ]; then + usage +fi + +# +# inSameIPsubnet serviceIPaddr interfaceIPaddr mask +# +# Given two IP addresses and a subnet mask determine if these IP +# addresses are in the same subnet. If they are, return 0, else return 1. +# +inSameIPsubnet() { + declare -i n + declare -ia mask + declare -ia ip1 ip2 # IP addresses given + declare -i quad1 quad2 # calculated quad words + + # + # Remove '.' characters from dotted decimal notation and save + # in arrays. i.e. + # + # 192.168.1.163 -> array[0] = 192 + # array[1] = 168 + # array[2] = 1 + # array[3] = 163 + # + let n=0 + for quad in $(echo $1 | awk -F. '{print $1 " " $2 " " $3 " " $4}') + do + ip1[n]=$quad + let n=n+1 + done + + let n=0 + for quad in $(echo $2 | awk -F. '{print $1 " " $2 " " $3 " " $4}') + do + ip2[n]=$quad + let n=n+1 + done + + let n=0 + for quad in $(echo $3 | awk -F. 
'{print $1 " " $2 " " $3 " " $4}') + do + mask[n]=$quad + let n=n+1 + done + + # + # For each quad word, logically AND the IP address with the subnet + # mask to get the network/subnet quad word. If the resulting + # quad words for both IP addresses are the same they are in the + # same IP subnet. + # + for n in 0 1 2 3 + do + let $((quad1=${ip1[n]} & ${mask[n]})) + let $((quad2=${ip2[n]} & ${mask[n]})) + + if [ $quad1 != $quad2 ]; then + echo >&2 $"`basename $0`: Service IP address $1 and"\ + "real interface IP address $2 are in"\ + "different subnets!" + return 1 # in different subnets + fi + done + + return 0 # in the same subnet, all quad words matched +} + +# +# findInterface IPaddr hostname +# +# Given a target IP address and a hostname, find the interface in which +# this address is configured. If found return 0, if not return 1. The +# interface name is returned to stdout. +# +findInterface() { + declare host + declare line + declare intf + declare addr + declare state + + declare target=$1 + declare hostname=$2 + + { + while read host intf line + do + while read host line + do + if [ "$line" = "" ]; then # go to next interface + continue 2 + fi + + set - $line + addr= + while [ $# -gt 0 ]; do + case $1 in + addr:*) + addr=${1##addr:} + if [ -n "$addr" -a "$addr" = "$target" ] + then + echo $intf + return 0 + fi + ;; + esac + shift + done + done + done + } < <(${PDSH} -w $hostname /sbin/ifconfig) + + echo >&2 "`basename $0`: Cannot find the interface in which" \ + "$target is configured in the host $hostname!" + return 1 +} + +# +# findNetmask interface hostname +# +# Given an interface find the netmask addresses associated with it. +# Return 0 when found, else return 1. The netmask is returned to stdout. 
+#
+findNetmask() {
+	declare line
+	declare addr
+	declare target=$1	# interface name to query
+	declare hostname=$2	# remote host the query is run on
+
+	# Scan the remote ifconfig output word by word; the first word of the
+	# form "Mask:<value>" wins and <value> is printed on stdout.
+	while read line
+	do
+		# Deliberately unquoted: split the line into positional
+		# parameters so each word can be inspected in turn.
+		set - $line
+
+		while [ $# -gt 0 ]; do
+			case $1 in
+			Mask:*)
+				echo ${1##*:} # return netmask addr
+				return 0
+				;;
+			esac
+			shift
+		done
+	done < <(${PDSH} -w $hostname /sbin/ifconfig $target)
+	# NOTE(review): PDSH is not set in the visible part of this script;
+	# presumably it is supplied by the calling environment - confirm.
+
+	echo >&2 "`basename $0`: Cannot find the netmask associated with" \
+		 "the interface $target in the host $hostname!"
+	return 1
+}
+
+#
+# check_srvIPaddr serviceIPaddr hostname
+#
+# Given a service IP address and hostname, check whether the service IP address
+# and the real interface IP address of hostname are in the same subnet.
+# If they are, return 0, else return 1.
+#
+# The real address is looked up in the local /etc/hosts, its interface and
+# netmask are read from the remote host (findInterface, findNetmask), and the
+# comparison itself is done by inSameIPsubnet. Every failing step returns 1.
+#
+check_srvIPaddr() {
+	declare real_IPaddr
+	declare real_intf
+	declare netmask
+	declare srv_IPaddr=$1
+	declare hostname=$2
+
+	# Get the IP address from /etc/hosts table according to the hostname
+	real_IPaddr=`egrep "[[:space:]]$hostname([[:space:]]|$)" /etc/hosts \
+		     | awk '{print $1}'`
+	if [ -z "$real_IPaddr" ]; then
+		echo >&2 "`basename $0`: $hostname does not exist in" \
+			 "the local /etc/hosts table!"
+		return 1
+	fi
+
+	# A single dotted-quad IPv4 address is at most 15 characters long, so
+	# a longer result means egrep matched more than one /etc/hosts line.
+	if [ ${#real_IPaddr} -gt 15 ]; then
+		echo >&2 "`basename $0`: More than one IP address line" \
+			 "according to $hostname in the local /etc/hosts table!"
+		return 1
+	fi
+
+	# Get the interface in which the real IP address is configured
+	real_intf=$(findInterface $real_IPaddr $hostname)
+	if [ $? -ne 0 ]; then
+		return 1
+	fi
+	# Keep only the text before the first ':' in the reported name.
+	real_intf=${real_intf%%:*}
+
+	# Get the netmask address associated with the real interface
+	netmask=$(findNetmask $real_intf $hostname)
+	if [ $? -ne 0 ]; then
+		return 1
+	fi
+
+	# Determine if the service IP address and the real IP address
+	# are in the same subnet
+	inSameIPsubnet $srv_IPaddr $real_IPaddr $netmask
+	if [ $? -ne 0 ]; then
+		return 1
+	fi
+
+	return 0
+}
+
+# Check service IP address
+if ! 
check_srvIPaddr $1 $2; then + exit 1 +fi +exit 0 diff --git a/lustre/utils/ha_assist.sh b/lustre/utils/ha_assist.sh deleted file mode 100755 index 0f737f5..0000000 --- a/lustre/utils/ha_assist.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/sh - -echo primary `date` >> /tmp/halog - - diff --git a/lustre/utils/ha_assist2.sh b/lustre/utils/ha_assist2.sh deleted file mode 100755 index a07d8b5..0000000 --- a/lustre/utils/ha_assist2.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash -set -vx -date -echo "ha assist checking for problems" -sleep 3 -if [ ! -e /tmp/halog ]; then - echo "no problems, exiting" - exit -fi - -echo "removing /tmp/halog" -rm /tmp/halog - -echo secondary start `date` -echo "- please supply a new mds" - -# invoke ldap client here - - -/usr/src/portals/linux/utils/ptlctl <]"}, {"add_uuid", jt_lcfg_add_uuid, 0, "associate a UUID with a nid\n" - "usage: add_uuid "}, + "usage: add_uuid "}, {"close_uuid", jt_obd_close_uuid, 0, "disconnect a UUID\n" "usage: close_uuid "}, {"del_uuid", jt_lcfg_del_uuid, 0, "delete a UUID association\n" @@ -144,8 +144,8 @@ command_t cmdlist[] = { /* Device configuration commands */ {"==== device config =====", jt_noop, 0, "device config"}, {"attach", jt_lcfg_attach, 0, - "set the type of the current device (with and )\n" - "usage: attach type [name [uuid]]"}, + "set the type, name, and uuid of the current device\n" + "usage: attach type name uuid"}, {"setup", jt_lcfg_setup, 0, "type specific device configuration information\n" "usage: setup "}, @@ -172,6 +172,9 @@ command_t cmdlist[] = { "usage: dump_log config-uuid-name"}, {"clear_log", jt_cfg_clear_log, 0, "delete current config log of recorded commands\n" "usage: clear_log config-name"}, + {"conf_param", jt_lcfg_mgsparam, 0, "set a permanent config param\n" + "usage: conf_param ...\n"}, + /* Device operations */ {"=== device operations ==", jt_noop, 0, "device operations"}, @@ -231,14 +234,16 @@ command_t cmdlist[] = { {"del_mount_option", jt_lcfg_del_mount_option, 0, "usage: 
del_mount_option profile\n"}, {"set_timeout", jt_lcfg_set_timeout, 0, - "usage: set_timeout \n"}, + "usage: conf_param obd_timeout=\n"}, {"set_lustre_upcall", jt_lcfg_set_lustre_upcall, 0, "usage: set_lustre_upcall \n"}, {"add_conn ", jt_lcfg_add_conn, 0, "usage: add_conn [priority]\n"}, {"del_conn ", jt_lcfg_del_conn, 0, "usage: del_conn \n"}, - + {"local_param", jt_lcfg_param, 0, "set a temporary, local param\n" + "usage: local_param ...\n"}, + /* Llog operations */ {"llog_catlist", jt_llog_catlist, 0, "list all catalog logs on current device.\n" diff --git a/lustre/utils/lfs.c b/lustre/utils/lfs.c index aa27001..349c703 100644 --- a/lustre/utils/lfs.c +++ b/lustre/utils/lfs.c @@ -322,7 +322,7 @@ static int lfs_osts(int argc, char **argv) } else { mnt = getmntent(fp); while (feof(fp) == 0 && ferror(fp) ==0) { - if (llapi_is_lustre_mnttype(mnt->mnt_type)) { + if (llapi_is_lustre_mnttype(mnt)) { rc = llapi_find(mnt->mnt_dir, obduuid, 0, 0, 0); if (rc) fprintf(stderr, @@ -370,7 +370,7 @@ static int path2mnt(char *path, FILE *fp, char *mntdir, int dir_len) len = 0; mnt = getmntent(fp); while (feof(fp) == 0 && ferror(fp) == 0) { - if (llapi_is_lustre_mnttype(mnt->mnt_type)) { + if (llapi_is_lustre_mnttype(mnt)) { len = strlen(mnt->mnt_dir); if (len > out_len && !strncmp(rpath, mnt->mnt_dir, len)) { @@ -585,7 +585,7 @@ static int lfs_df(int argc, char **argv) } else { mnt = getmntent(fp); while (feof(fp) == 0 && ferror(fp) == 0) { - if (llapi_is_lustre_mnttype(mnt->mnt_type)) { + if (llapi_is_lustre_mnttype(mnt)) { rc = mntdf(mnt->mnt_dir, ishow, cooked); if (rc) break; @@ -636,7 +636,7 @@ static int lfs_check(int argc, char **argv) } else { mnt = getmntent(fp); while (feof(fp) == 0 && ferror(fp) ==0) { - if (llapi_is_lustre_mnttype(mnt->mnt_type)) + if (llapi_is_lustre_mnttype(mnt)) break; mnt = getmntent(fp); } @@ -677,7 +677,7 @@ static int lfs_catinfo(int argc, char **argv) } else { mnt = getmntent(fp); while (feof(fp) == 0 && ferror(fp) == 0) { - if 
(llapi_is_lustre_mnttype(mnt->mnt_type)) + if (llapi_is_lustre_mnttype(mnt)) break; mnt = getmntent(fp); } diff --git a/lustre/utils/liblustreapi.c b/lustre/utils/liblustreapi.c index 2c10da6..c320aed 100644 --- a/lustre/utils/liblustreapi.c +++ b/lustre/utils/liblustreapi.c @@ -889,9 +889,12 @@ int llapi_catinfo(char *dir, char *keyword, char *node_name) return rc; } -int llapi_is_lustre_mnttype(char *type) +/* Is this a lustre client fs? */ +int llapi_is_lustre_mnttype(struct mntent *mnt) { - return (strcmp(type,"lustre") == 0 || strcmp(type,"lustre_lite") == 0); + char *type = mnt->mnt_type; + return ((strcmp(type, "lustre") == 0 || strcmp(type,"lustre_lite") == 0) + && (strstr(mnt->mnt_fsname, ":/") != NULL)); } int llapi_quotacheck(char *mnt, int check_type) diff --git a/lustre/utils/llog_reader.c b/lustre/utils/llog_reader.c index 16ce965..03f04c7 100644 --- a/lustre/utils/llog_reader.c +++ b/lustre/utils/llog_reader.c @@ -224,6 +224,7 @@ static void print_1_cfg(struct lustre_cfg *lcfg) return; } + static void print_setup_cfg(struct lustre_cfg *lcfg) { struct lov_desc *desc; @@ -246,10 +247,13 @@ static void print_setup_cfg(struct lustre_cfg *lcfg) return; } -void print_lustre_cfg(struct lustre_cfg *lcfg) +void print_lustre_cfg(struct lustre_cfg *lcfg, int *skip) { enum lcfg_command_type cmd = le32_to_cpu(lcfg->lcfg_command); + if (*skip > 0) + printf("SKIP "); + switch(cmd){ case(LCFG_ATTACH):{ printf("attach "); @@ -326,8 +330,19 @@ void print_lustre_cfg(struct lustre_cfg *lcfg) break; } case(LCFG_MARKER):{ - printf("marker "); - print_1_cfg(lcfg); + struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1); + + if (marker->cm_flags & CM_SKIP) { + if (marker->cm_flags & CM_START) + (*skip)++; + if (marker->cm_flags & CM_END) + (*skip)--; + } + printf("marker %d (flags=%#x) %.16s '%s' %s:%s", marker->cm_step, + marker->cm_flags, marker->cm_svname, + marker->cm_comment, ctime(&marker->cm_createtime), + marker->cm_canceltime ? 
+ ctime(&marker->cm_canceltime) : ""); break; } default: @@ -340,9 +355,9 @@ void print_lustre_cfg(struct lustre_cfg *lcfg) void print_records(struct llog_rec_hdr** recs,int rec_number) { __u32 lopt; - int i; + int i, skip = 0; - for(i=0;ilrh_index)); @@ -353,7 +368,7 @@ void print_records(struct llog_rec_hdr** recs,int rec_number) printf("L "); lcfg = (struct lustre_cfg *) ((char*)(recs[i]) + sizeof(struct llog_rec_hdr)); - print_lustre_cfg(lcfg); + print_lustre_cfg(lcfg, &skip); } if (lopt == PTL_CFG_REC){ diff --git a/lustre/utils/load_ldap.sh b/lustre/utils/load_ldap.sh deleted file mode 100755 index 0163b85..0000000 --- a/lustre/utils/load_ldap.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/bash -# -# Load a lustre config xml into an openldap database. -# See https://projects.clusterfs.com/lustre/LustreLDAP -# for more details. -# -# Usage: load_ldap.sh -set -e - -LDAP_BASE=${LDAP_BASE:-fs=lustre} -LDAP_ROOTDN=${LDAP_ROOTDN:-cn=Manager,fs=lustre} -LDAP_PW=${LDAP_PW:-secret} -LDAP_AUTH="-x -D $LDAP_ROOTDN -w $LDAP_PW" -LUSTRE=${LUSTRE:-`dirname $0`/..} - -if [ -f $LUSTRE/autoMakefile.am ]; then - CONFDIR=$LUSTRE/conf -else - CONFDIR=/usr/lib/lustre -fi - -TOP=$CONFDIR/top.ldif -XSL=$CONFDIR/lustre2ldif.xsl - -[ ! -z $LDAPURL ] && LDAP_AUTH="$LDAP_AUTH -H $LDAPURL" - -XML=${XML:-$1} - -if [ -z "$XML" ] || [ ! -r $XML ]; then - echo "usage: $0 xmlfile" - exit 1 -fi - -NAME=`basename $XML .xml` -LDIF=/tmp/$NAME.ldif - -# add the top level record, if needed -ldapsearch $LDAP_AUTH -b $LDAP_BASE > /dev/null 2>&1 || - ldapadd $LDAP_AUTH -f $TOP - -# If this config already exists, then delete it -ldapsearch $LDAP_AUTH -b config=$NAME,$LDAP_BASE > /dev/null 2>&1 && - ldapdelete $LDAP_AUTH -r config=$NAME,$LDAP_BASE - -4xslt -D config=$NAME $XML $XSL > $LDIF - -echo "Loading config to 'config=$NAME,$LDAP_BASE' ..." 
-ldapadd $LDAP_AUTH -f $LDIF - -rm -f $LDIF diff --git a/lustre/utils/lustre_cfg.c b/lustre/utils/lustre_cfg.c index 07df82b..342a4da 100644 --- a/lustre/utils/lustre_cfg.c +++ b/lustre/utils/lustre_cfg.c @@ -98,23 +98,14 @@ int jt_lcfg_attach(int argc, char **argv) struct lustre_cfg *lcfg; int rc; - if (argc != 2 && argc != 3 && argc != 4) + if (argc != 4) return CMD_HELP; lustre_cfg_bufs_reset(&bufs, NULL); lustre_cfg_bufs_set_string(&bufs, 1, argv[1]); - if (argc >= 3) { - lustre_cfg_bufs_set_string(&bufs, 0, argv[2]); - } else { - fprintf(stderr, "error: %s: LCFG_ATTACH requires a name\n", - jt_cmdname(argv[0])); - return -EINVAL; - } - - if (argc == 4) { - lustre_cfg_bufs_set_string(&bufs, 2, argv[3]); - } + lustre_cfg_bufs_set_string(&bufs, 0, argv[2]); + lustre_cfg_bufs_set_string(&bufs, 2, argv[3]); lcfg = lustre_cfg_new(LCFG_ATTACH, &bufs); rc = lcfg_ioctl(argv[0], OBD_DEV_ID, lcfg); @@ -548,6 +539,12 @@ int jt_lcfg_set_timeout(int argc, char **argv) struct lustre_cfg_bufs bufs; struct lustre_cfg *lcfg; + fprintf(stderr, "%s has been deprecated. Use conf_param instead.\n" + "e.g. 
conf_param lustre-MDT0000 obd_timeout=50\n", + jt_cmdname(argv[0])); + return CMD_HELP; + + if (argc != 2) return CMD_HELP; @@ -556,6 +553,8 @@ int jt_lcfg_set_timeout(int argc, char **argv) lcfg->lcfg_num = atoi(argv[1]); rc = lcfg_ioctl(argv[0], OBD_DEV_ID, lcfg); + //rc = lcfg_mgs_ioctl(argv[0], OBD_DEV_ID, lcfg); + lustre_cfg_free(lcfg); if (rc < 0) { fprintf(stderr, "error: %s: %s\n", jt_cmdname(argv[0]), @@ -659,3 +658,77 @@ int jt_lcfg_del_conn(int argc, char **argv) return rc; } + +/* Param set locally, directly on target */ +int jt_lcfg_param(int argc, char **argv) +{ + int i, rc; + struct lustre_cfg_bufs bufs; + struct lustre_cfg *lcfg; + + if (argc >= LUSTRE_CFG_MAX_BUFCOUNT) + return CMD_HELP; + + lustre_cfg_bufs_reset(&bufs, lcfg_devname); + + for (i = 1; i < argc; i++) { + lustre_cfg_bufs_set_string(&bufs, i, argv[i]); + } + + lcfg = lustre_cfg_new(LCFG_PARAM, &bufs); + + rc = lcfg_ioctl(argv[0], OBD_DEV_ID, lcfg); + lustre_cfg_free(lcfg); + if (rc < 0) { + fprintf(stderr, "error: %s: %s\n", jt_cmdname(argv[0]), + strerror(rc = errno)); + } + return rc; +} + +/* Param set in config log on MGS */ +/* conf_param key1=value1 [key2=value2...] */ +int jt_lcfg_mgsparam(int argc, char **argv) +{ + int i, rc, index_offset = 0; + struct lustre_cfg_bufs bufs; + struct lustre_cfg *lcfg; + + if ((argc >= LUSTRE_CFG_MAX_BUFCOUNT) || (argc <= 1)) + return CMD_HELP; + + if (!strchr(argv[1], '=')) { + /* Not key=val, assume */ + rc = jt_lcfg_device(2, argv); + if (rc) + return rc; + index_offset = 1; + } + + if (lcfg_devname == NULL) { + fprintf(stderr, "%s: please use 'cfg_device name' to set the " + "device name for config commands.\n", + jt_cmdname(argv[0])); + return -EINVAL; + } + + lustre_cfg_bufs_reset(&bufs, lcfg_devname); + + for (i = 1; i < (argc - index_offset); i++) { + lustre_cfg_bufs_set_string(&bufs, i, argv[i + index_offset]); + } + + /* We could put other opcodes here. 
*/ + lcfg = lustre_cfg_new(LCFG_PARAM, &bufs); + + rc = lcfg_mgs_ioctl(argv[0], OBD_DEV_ID, lcfg); + lustre_cfg_free(lcfg); + if (rc < 0) { + fprintf(stderr, "error: %s: %s\n", jt_cmdname(argv[0]), + strerror(rc = errno)); + } + + return rc; +} + + diff --git a/lustre/utils/mkfs_lustre.c b/lustre/utils/mkfs_lustre.c new file mode 100644 index 0000000..1f88563 --- /dev/null +++ b/lustre/utils/mkfs_lustre.c @@ -0,0 +1,1271 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002 Cluster File Systems, Inc. + * Author: Lin Song Tao + * Author: Nathan Rutman + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include +//#define HAVE_SYS_VFS_H 1 +#include // for BLKGETSIZE64 +#include +#include +#include +#include + + +#define MAX_LOOP_DEVICES 16 +#define L_BLOCK_SIZE 4096 +#define INDEX_UNASSIGNED 0xFFFF + +static char *progname; +static int verbose = 1; +static int print_only = 0; + + +void usage(FILE *out) +{ + fprintf(out, "%s v"LUSTRE_VERSION_STRING"\n", progname); + fprintf(out, "usage: %s [options] \n", progname); + fprintf(out, + "\t:block device or file (e.g /dev/sda or /tmp/ost1)\n" + "\ttarget types:\n" + "\t\t--ost: object storage, mutually exclusive with mdt\n" + "\t\t--mdt: metadata storage, mutually exclusive with ost\n" + "\t\t--mgs: configuration management service - one per site\n" + "\toptions (in order of popularity):\n" + "\t\t--mgsnode=[,<...>] : NID(s) of a remote mgs node\n" + "\t\t\trequired for all targets other than the mgs node\n" + "\t\t--fsname= : default is 'lustre'\n" + "\t\t--failnode=[,<...>] : NID(s) of a failover partner\n" + "\t\t--param = : set a permanent parameter\n" + "\t\t--index=#N : target index\n" + /* FIXME implement 1.6.x + "\t\t--configdev=: store configuration info\n" + "\t\t\tfor this device on an alternate device\n" + */ + "\t\t--mountfsoptions= : permanent mount options\n" + "\t\t--backfstype= : backing fs type (ext3, ldiskfs)\n" + "\t\t--device-size=#N(KB) : device size for loop devices\n" +#ifndef TUNEFS + "\t\t--mkfsoptions= : format options\n" + "\t\t--reformat: overwrite an existing disk\n" + "\t\t--stripe-count-hint=#N : used for optimizing MDT inode size\n" +#else + "\t\t--erase-params : erase all old parameter settings\n" + "\t\t--nomgs: turn off MGS service on this MDT\n" + "\t\t--writeconf: erase all config logs for this fs.\n" +#endif + "\t\t--print: just report what we would do; don't write to " + "disk\n" + "\t\t--verbose\n" + "\t\t--quiet\n"); + return; +} + 
+/* printf() that only fires when verbose > 0. It expands to a bare "if", so
+   do not use it as the unbraced body of an if that has an else branch. */
+#define vprint if (verbose > 0) printf
+
+/* Start a fatal error report: silence later vprint output (verbose = 0) and
+   print the "<progname> FATAL: " prefix on stderr. No newline is written;
+   the caller is expected to print the actual message next. */
+static void fatal(void)
+{
+        verbose = 0;
+        fprintf(stderr, "\n%s FATAL: ", progname);
+}
+
+/*================ utility functions =====================*/
+
+/* Major number of a 64-bit device number: bits 8-19 of __dev, OR-ed with
+   the bits of (__dev >> 32) that lie above the low 12. */
+inline unsigned int
+dev_major (unsigned long long int __dev)
+{
+        return ((__dev >> 8) & 0xfff) | ((unsigned int) (__dev >> 32) & ~0xfff);
+}
+
+/* Minor number of a 64-bit device number: the low 8 bits of __dev, OR-ed
+   with the bits of (__dev >> 12) that lie above the low 8. */
+inline unsigned int
+dev_minor (unsigned long long int __dev)
+{
+        return (__dev & 0xff) | ((unsigned int) (__dev >> 12) & ~0xff);
+}
+
+/* Return 24 when /proc/sys/kernel/osrelease starts with "2.4.", otherwise
+   26 (also when the file cannot be opened, after printing a warning).
+   The answer is computed once and cached in a static. */
+int get_os_version()
+{
+        static int version = 0;
+
+        if (!version) {
+                int fd;
+                /* exactly the 4 bytes compared below; not NUL-terminated
+                   after a full read, hence strncmp() with length 4 */
+                char release[4] = "";
+
+                fd = open("/proc/sys/kernel/osrelease", O_RDONLY);
+                if (fd < 0)
+                        fprintf(stderr, "%s: Warning: Can't resolve kernel "
+                                "version, assuming 2.6\n", progname);
+                else {
+                        read(fd, release, 4);
+                        close(fd);
+                }
+                if (strncmp(release, "2.4.", 4) == 0)
+                        version = 24;
+                else
+                        version = 26;
+        }
+        return version;
+}
+
+/* Run a shell command through system() and return system()'s result.
+   The command's stdout/stderr are redirected to a temporary log file, which
+   is echoed to stdout when the command fails and is removed afterwards.
+
+   NOTE: cmd is modified in place. " ><logfile>" and " 2>&1" are appended
+   with strcat(), so the caller's buffer must have room for that suffix
+   (about 26 extra characters) beyond the command itself. */
+int run_command(char *cmd)
+{
+        char log[] = "/tmp/mkfs_logXXXXXX";
+        int fd, rc;
+
+        if (verbose > 1)
+                printf("cmd: %s\n", cmd);
+
+        /* mkstemp() is used only to reserve a unique file name; the shell
+           redirection re-opens the file by name */
+        if ((fd = mkstemp(log)) >= 0) {
+                close(fd);
+                strcat(cmd, " >");
+                strcat(cmd, log);
+        }
+        strcat(cmd, " 2>&1");
+
+        /* Can't use popen because we need the rv of the command */
+        rc = system(cmd);
+        /* fd still holds mkstemp()'s result and is used purely as a
+           "log file was created" flag from here on */
+        if (rc && fd >= 0) {
+                char buf[128];
+                FILE *fp;
+                fp = fopen(log, "r");
+                if (fp) {
+                        while (fgets(buf, sizeof(buf), fp) != NULL) {
+                                /* NOTE(review): rc is always non-zero inside
+                                   this branch, so this test never filters */
+                                if (rc || verbose > 2)
+                                        printf(" %s", buf);
+                        }
+                        fclose(fp);
+                }
+        }
+        if (fd >= 0)
+                remove(log);
+        return rc;
+}
+
+/* Look for a mount table (MOUNTED) entry whose device name equals spec and
+   whose filesystem type equals type. Returns EEXIST, after printing a
+   message, when such an entry exists; returns 0 when there is none or when
+   the mount table cannot be opened. */
+static int check_mtab_entry(char *spec, char *type)
+{
+        FILE *fp;
+        struct mntent *mnt;
+
+        fp = setmntent(MOUNTED, "r");
+        if (fp == NULL)
+                return(0);
+
+        while ((mnt = getmntent(fp)) != NULL) {
+                if (strcmp(mnt->mnt_fsname, spec) == 0 &&
+                        strcmp(mnt->mnt_type, type) == 0) {
+                        endmntent(fp);
+                        fprintf(stderr, "%s: according to %s %s is "
+                                "already mounted on %s\n",
+                                progname, MOUNTED, spec, mnt->mnt_dir);
+                        return(EEXIST);
+                }
+        }
+        endmntent(fp);
+
+        return(0);
+}
+
+/*============ disk 
dev functions ===================*/
+
+/* Setup a file in the first unused loop_device.
+ *
+ * Probes /dev/loop<N> (or /dev/loop/<N>) for N < MAX_LOOP_DEVICES and
+ * attaches mop->mo_device to the first node that "losetup <node>" reports
+ * as not set up. On success the node name is copied to mop->mo_loopdev.
+ *
+ * Returns 0 on success, 1 when no loop device nodes are accessible,
+ * ENAMETOOLONG when the device path does not fit the command buffer, the
+ * non-zero losetup status when attaching fails, and EMFILE when no free
+ * loop device is found.
+ */
+int loop_setup(struct mkfs_opts *mop)
+{
+        /* run_command() appends its own " ><logfile> 2>&1" redirection to
+           the buffer it is handed, so LOOP_CMD_RESERVE bytes are kept free
+           at the end of every command built here */
+        enum { LOOP_CMD_SIZE = 1024, LOOP_CMD_RESERVE = 64 };
+        char loop_base[20];
+        char l_device[64];
+        int i,ret = 0;
+
+        /* Figure out the loop device names */
+        if (!access("/dev/loop0", F_OK | R_OK))
+                strcpy(loop_base, "/dev/loop\0");
+        else if (!access("/dev/loop/0", F_OK | R_OK))
+                strcpy(loop_base, "/dev/loop/\0");
+        else {
+                fprintf(stderr, "%s: can't access loop devices\n", progname);
+                return 1;
+        }
+
+        /* Find unused loop device */
+        for (i = 0; i < MAX_LOOP_DEVICES; i++) {
+                char cmd[LOOP_CMD_SIZE];
+                int len;
+
+                sprintf(l_device, "%s%d", loop_base, i);
+                /* a missing node means there are no further loop devices */
+                if (access(l_device, F_OK | R_OK))
+                        break;
+                sprintf(cmd, "losetup %s > /dev/null 2>&1", l_device);
+                ret = system(cmd);
+                /* losetup gets 1 (ret=256) for non-set-up device */
+                if (ret) {
+                        /* Set up a loopback device to our file. The device
+                           path comes from the user, so bound the copy
+                           instead of trusting it to fit. */
+                        len = snprintf(cmd, sizeof(cmd) - LOOP_CMD_RESERVE,
+                                       "losetup %s %s", l_device,
+                                       mop->mo_device);
+                        if (len < 0 ||
+                            len >= (int)(sizeof(cmd) - LOOP_CMD_RESERVE)) {
+                                fprintf(stderr, "%s: device name %s is too "
+                                        "long for losetup\n", progname,
+                                        mop->mo_device);
+                                return ENAMETOOLONG;
+                        }
+                        ret = run_command(cmd);
+                        if (ret) {
+                                /* NOTE(review): ret is the status returned
+                                   by run_command(), not an errno value, so
+                                   the strerror() text may be misleading */
+                                fprintf(stderr, "%s: error %d on losetup: %s\n",
+                                        progname, ret, strerror(ret));
+                                return ret;
+                        }
+                        strcpy(mop->mo_loopdev, l_device);
+                        return ret;
+                }
+        }
+
+        fprintf(stderr, "%s: out of loop devices!\n", progname);
+        return EMFILE;
+}
+
+/* Detach the loop device recorded in mop->mo_loopdev.
+   Returns the "losetup -d" status, or 1 when mop is not flagged MO_IS_LOOP
+   or has no loop device recorded. */
+int loop_cleanup(struct mkfs_opts *mop)
+{
+        char cmd[128];
+        int ret = 1;
+        if ((mop->mo_flags & MO_IS_LOOP) && *mop->mo_loopdev) {
+                sprintf(cmd, "losetup -d %s", mop->mo_loopdev);
+                ret = run_command(cmd);
+        }
+        return ret;
+}
+
+/* Determine if a device is a block device (as opposed to a file).
+   Returns 0 when devname does not exist or is not a block device, non-zero
+   when it is one, and -1 when it exists but cannot be stat()ed. */
+int is_block(char* devname)
+{
+        struct stat st;
+        int ret = 0;
+
+        ret = access(devname, F_OK);
+        if (ret != 0)
+                return 0;
+        ret = stat(devname, &st);
+        if (ret != 0) {
+                fprintf(stderr, "%s: cannot stat %s\n", progname, devname);
+                return -1;
+        }
+        return S_ISBLK(st.st_mode);
+}
+
+__u64 get_device_size(char* device)
+{
+        int ret, fd;
+        __u64 size = 0;
+
+        fd = open(device, O_RDONLY);
+        if (fd < 0) {
+                fprintf(stderr, "%s: cannot open %s: %s\n",
+                        progname, device, strerror(errno));
+                
return 0;
+        }
+
+        /* size in bytes. bz5831 */
+        ret = ioctl(fd, BLKGETSIZE64, (void*)&size);
+        close(fd);
+        if (ret < 0) {
+                fprintf(stderr, "%s: size ioctl failed: %s\n",
+                        progname, strerror(errno));
+                return 0;
+        }
+
+        vprint("device size = "LPU64"MB\n", size >> 20);
+        /* return value in KB */
+        return size >> 10;
+}
+
+/* Create the backing file for a loop device and size it.
+ *
+ * The file mop->mo_device is created (or truncated, if it already exists)
+ * and then set to mop->mo_device_sz * 1024 bytes.
+ *
+ * Returns 0 on success, EINVAL when no device size was given, or the errno
+ * of the failing creat()/truncate() call.
+ */
+int loop_format(struct mkfs_opts *mop)
+{
+        int fd, ret = 0;
+
+        if (mop->mo_device_sz == 0) {
+                fatal();
+                fprintf(stderr, "loop device requires a --device-size= "
+                        "param\n");
+                return EINVAL;
+        }
+
+        fd = creat(mop->mo_device, S_IRUSR|S_IWUSR);
+        if (fd < 0) {
+                ret = errno;
+                fprintf(stderr, "%s: Unable to create backing store: %d\n",
+                        progname, ret);
+                return ret;
+        }
+        /* only the file's existence is needed here; it is sized by path
+           below, so do not keep the descriptor open */
+        close(fd);
+
+        ret = truncate(mop->mo_device, mop->mo_device_sz * 1024);
+        if (ret != 0) {
+                ret = errno;
+                fprintf(stderr, "%s: Unable to create backing store: %d\n",
+                        progname, ret);
+        }
+
+        return ret;
+}
+
+/* Check whether the file exists in the device.
+ *
+ * Runs "debugfs -c -R 'stat <file_name>' <dev_name>" and inspects the lines
+ * that mention "Inode" or "unsupported".
+ *
+ * Returns 1 when an inode number is reported, -1 when other filtered output
+ * is present (treated as an unsupported filesystem feature), and 0 when
+ * there is no such output or the command cannot be built or started.
+ *
+ * NOTE(review): file_name and dev_name are pasted into a shell command
+ * unquoted/unescaped; callers must not pass untrusted strings.
+ */
+static int file_in_dev(char *file_name, char *dev_name)
+{
+        FILE *fp;
+        char debugfs_cmd[256];
+        unsigned int inode_num;
+        int i, len;
+
+        /* Construct debugfs command line. */
+        memset(debugfs_cmd, 0, sizeof(debugfs_cmd));
+        len = snprintf(debugfs_cmd, sizeof(debugfs_cmd),
+                "debugfs -c -R 'stat %s' %s 2>&1 | egrep '(Inode|unsupported)'",
+                file_name, dev_name);
+        if (len < 0 || len >= (int)sizeof(debugfs_cmd)) {
+                /* a truncated command must not be run */
+                fprintf(stderr, "%s: debugfs command for %s is too long\n",
+                        progname, dev_name);
+                return 0;
+        }
+
+        fp = popen(debugfs_cmd, "r");
+        if (!fp) {
+                fprintf(stderr, "%s: %s\n", progname, strerror(errno));
+                return 0;
+        }
+
+        if (fscanf(fp, "Inode: %u", &inode_num) == 1) { /* exist */
+                pclose(fp);
+                return 1;
+        }
+        /* the command buffer is reused to hold whatever output is left */
+        i = fread(debugfs_cmd, 1, sizeof(debugfs_cmd), fp);
+        if (i) {
+                /* Filesystem has unsupported feature */
+                vprint("%.*s", i, debugfs_cmd);
+                /* this path returns early, so release the pipe here */
+                pclose(fp);
+                /* in all likelihood, the "unsupported feature" is
+                   'extents', which older debugfs does not understand. 
+ Use e2fsprogs-1.38-cfs1 or later, available from + ftp://ftp.lustre.org/pub/lustre/other/e2fsprogs/ */ + return -1; + } + pclose(fp); + return 0; +} + +/* Check whether the device has already been used with lustre */ +static int is_lustre_target(struct mkfs_opts *mop) +{ + int rc; + vprint("checking for existing Lustre data\n"); + + if ((rc = file_in_dev(MOUNT_DATA_FILE, mop->mo_device)) + || (rc = file_in_dev(LAST_RCVD, mop->mo_device))) { + vprint("found Lustre data\n"); + /* in the -1 case, 'extents' means this really IS a lustre + target */ + return rc; + } + + return 0; /* The device is not a lustre target. */ +} + +/* Build fs according to type */ +int make_lustre_backfs(struct mkfs_opts *mop) +{ + char mkfs_cmd[512]; + char buf[40]; + char *dev; + int ret = 0; + int block_count = 0; + + if (mop->mo_device_sz != 0) { + if (mop->mo_device_sz < 8096){ + fprintf(stderr, "%s: size of filesystem must be larger " + "than 8MB, but is set to %lldKB\n", + progname, mop->mo_device_sz); + return EINVAL; + } + block_count = mop->mo_device_sz / (L_BLOCK_SIZE >> 10); + } + + if ((mop->mo_ldd.ldd_mount_type == LDD_MT_EXT3) || + (mop->mo_ldd.ldd_mount_type == LDD_MT_LDISKFS)) { + __u64 device_sz = mop->mo_device_sz; + + /* we really need the size */ + if (device_sz == 0) { + device_sz = get_device_size(mop->mo_device); + if (device_sz == 0) + return ENODEV; + } + + /* Journal size in MB */ + if (strstr(mop->mo_mkfsopts, "-J") == NULL) { + /* Choose our own default journal size */ + long journal_sz = 0, max_sz; + if (device_sz > 1024 * 1024) /* 1GB */ + journal_sz = (device_sz / 102400) * 4; + /* man mkfs.ext3 */ + max_sz = (102400 * L_BLOCK_SIZE) >> 20; /* 400MB */ + if (journal_sz > max_sz) + journal_sz = max_sz; + if (journal_sz) { + sprintf(buf, " -J size=%ld", journal_sz); + strcat(mop->mo_mkfsopts, buf); + } + } + + /* Default bytes_per_inode is block size */ + if (strstr(mop->mo_mkfsopts, "-i") == NULL) { + long bytes_per_inode = 0; + + if (IS_MDT(&mop->mo_ldd)) + 
bytes_per_inode = 4096; + + /* Allocate fewer inodes on large OST devices. Most + filesystems can be much more aggressive than even + this. */ + if ((IS_OST(&mop->mo_ldd) && (device_sz > 1000000))) + bytes_per_inode = 16384; + + if (bytes_per_inode > 0) { + sprintf(buf, " -i %ld", bytes_per_inode); + strcat(mop->mo_mkfsopts, buf); + } + } + + /* This is an undocumented mke2fs option. Default is 128. */ + if (strstr(mop->mo_mkfsopts, "-I") == NULL) { + long inode_size = 0; + if (IS_MDT(&mop->mo_ldd)) { + if (mop->mo_stripe_count > 77) + inode_size = 512; /* bz 7241 */ + /* cray stripes across all osts (>60) */ + else if (mop->mo_stripe_count > 34) + inode_size = 2048; + else if (mop->mo_stripe_count > 13) + inode_size = 1024; + else + inode_size = 512; + } else if (IS_OST(&mop->mo_ldd)) { + /* now as we store fids in EA on OST we need + to make inode bigger */ + inode_size = 256; + } + + if (inode_size > 0) { + sprintf(buf, " -I %ld", inode_size); + strcat(mop->mo_mkfsopts, buf); + } + + } + + if (verbose < 2) { + strcat(mop->mo_mkfsopts, " -q"); + } + + /* Enable hashed b-tree directory lookup in large dirs bz6224 */ + if (strstr(mop->mo_mkfsopts, "-O") == NULL) { + strcat(mop->mo_mkfsopts, " -O dir_index"); + } + + /* Allow reformat of full devices (as opposed to + partitions.) We already checked for mounted dev. 
*/ + strcat(mop->mo_mkfsopts, " -F"); + + sprintf(mkfs_cmd, "mkfs.ext2 -j -b %d -L %s ", L_BLOCK_SIZE, + mop->mo_ldd.ldd_svname); + + } else if (mop->mo_ldd.ldd_mount_type == LDD_MT_REISERFS) { + long journal_sz = 0; /* FIXME default journal size */ + if (journal_sz > 0) { + sprintf(buf, " --journal_size %ld", journal_sz); + strcat(mop->mo_mkfsopts, buf); + } + sprintf(mkfs_cmd, "mkreiserfs -ff "); + + } else { + fprintf(stderr,"%s: unsupported fs type: %d (%s)\n", + progname, mop->mo_ldd.ldd_mount_type, + MT_STR(&mop->mo_ldd)); + return EINVAL; + } + + /* For loop device format the dev, not the filename */ + dev = mop->mo_device; + if (mop->mo_flags & MO_IS_LOOP) + dev = mop->mo_loopdev; + + vprint("formatting backing filesystem %s on %s\n", + MT_STR(&mop->mo_ldd), dev); + vprint("\ttarget name %s\n", mop->mo_ldd.ldd_svname); + vprint("\t4k blocks %d\n", block_count); + vprint("\toptions %s\n", mop->mo_mkfsopts); + + /* mkfs_cmd's trailing space is important! */ + strcat(mkfs_cmd, mop->mo_mkfsopts); + strcat(mkfs_cmd, " "); + strcat(mkfs_cmd, dev); + if (block_count != 0) { + sprintf(buf, " %d", block_count); + strcat(mkfs_cmd, buf); + } + + vprint("mkfs_cmd = %s\n", mkfs_cmd); + ret = run_command(mkfs_cmd); + if (ret) { + fatal(); + fprintf(stderr, "Unable to build fs %s (%d)\n", dev, ret); + goto out; + } + +out: + return ret; +} + +/* ==================== Lustre config functions =============*/ + +void print_ldd(char *str, struct lustre_disk_data *ldd) +{ + printf("\n %s:\n", str); + printf("Target: %s\n", ldd->ldd_svname); + if (ldd->ldd_svindex == INDEX_UNASSIGNED) + printf("Index: unassigned\n"); + else + printf("Index: %d\n", ldd->ldd_svindex); + printf("UUID: %s\n", (char *)ldd->ldd_uuid); + printf("Lustre FS: %s\n", ldd->ldd_fsname); + printf("Mount type: %s\n", MT_STR(ldd)); + printf("Flags: %#x\n", ldd->ldd_flags); + printf(" (%s%s%s%s%s%s%s%s)\n", + IS_MDT(ldd) ? "MDT ":"", + IS_OST(ldd) ? "OST ":"", + IS_MGS(ldd) ? 
"MGS ":"", + ldd->ldd_flags & LDD_F_NEED_INDEX ? "needs_index ":"", + ldd->ldd_flags & LDD_F_VIRGIN ? "first_time ":"", + ldd->ldd_flags & LDD_F_UPDATE ? "update ":"", + ldd->ldd_flags & LDD_F_WRITECONF ? "writeconf ":"", + ldd->ldd_flags & LDD_F_UPGRADE14 ? "upgrade1.4 ":""); + printf("Persistent mount opts: %s\n", ldd->ldd_mount_opts); + printf("Parameters:%s\n", ldd->ldd_params); + printf("\n"); +} + +/* Write the server config files */ +int write_local_files(struct mkfs_opts *mop) +{ + char mntpt[] = "/tmp/mntXXXXXX"; + char filepnm[128]; + char *dev; + FILE *filep; + int ret = 0; + + /* Mount this device temporarily in order to write these files */ + if (!mkdtemp(mntpt)) { + fprintf(stderr, "%s: Can't create temp mount point %s: %s\n", + progname, mntpt, strerror(errno)); + return errno; + } + + dev = mop->mo_device; + if (mop->mo_flags & MO_IS_LOOP) + dev = mop->mo_loopdev; + + ret = mount(dev, mntpt, MT_STR(&mop->mo_ldd), 0, NULL); + if (ret) { + fprintf(stderr, "%s: Unable to mount %s: %s\n", + progname, dev, strerror(errno)); + if (errno == ENODEV) { + fprintf(stderr, "Is the %s module available?\n", + MT_STR(&mop->mo_ldd)); + } + goto out_rmdir; + } + + /* Set up initial directories */ + sprintf(filepnm, "%s/%s", mntpt, MOUNT_CONFIGS_DIR); + ret = mkdir(filepnm, 0777); + if ((ret != 0) && (errno != EEXIST)) { + fprintf(stderr, "%s: Can't make configs dir %s (%d)\n", + progname, filepnm, ret); + goto out_umnt; + } else if (errno == EEXIST) { + ret = 0; + } + + /* Save the persistent mount data into a file. Lustre must pre-read + this file to get the real mount options. 
*/ + vprint("Writing %s\n", MOUNT_DATA_FILE); + sprintf(filepnm, "%s/%s", mntpt, MOUNT_DATA_FILE); + filep = fopen(filepnm, "w"); + if (!filep) { + fprintf(stderr, "%s: Unable to create %s file\n", + progname, filepnm); + goto out_umnt; + } + fwrite(&mop->mo_ldd, sizeof(mop->mo_ldd), 1, filep); + fclose(filep); + + /* COMPAT_146 */ +#ifdef TUNEFS + /* Check for upgrade */ + if ((mop->mo_ldd.ldd_flags & (LDD_F_UPGRADE14 | LDD_F_SV_TYPE_MGS)) + == (LDD_F_UPGRADE14 | LDD_F_SV_TYPE_MGS)) { + char cmd[128]; + char *term; + vprint("Copying old logs\n"); +#if 0 + /* Generate new client log as servers upgrade. Starting a new client + may end up with short lov's, so will be degraded until all servers + upgrade */ + /* Copy the old client log to fsname-client */ + sprintf(filepnm, "%s/%s/%s-client", + mntpt, MOUNT_CONFIGS_DIR, mop->mo_ldd.ldd_fsname); + sprintf(cmd, "cp %s/%s/client %s", mntpt, MDT_LOGS_DIR, + filepnm); + if (verbose > 1) + printf("cmd: %s\n", cmd); + ret = run_command(cmd); + if (ret) { + fprintf(stderr, "%s: Can't copy 1.4 config %s/client " + "(%d)\n", progname, MDT_LOGS_DIR, ret); + fprintf(stderr, "mount -t ext3 %s somewhere, " + "find the client log for fs %s and " + "copy it manually into %s/%s-client, " + "then umount.\n", + mop->mo_device, + mop->mo_ldd.ldd_fsname, MOUNT_CONFIGS_DIR, + mop->mo_ldd.ldd_fsname); + goto out_umnt; + } + #endif + /* We need to use the old mdt log because otherwise mdt won't + have complete lov if old clients connect before all + servers upgrade. 
*/ + /* Copy the old mdt log to fsname-MDT0000 (get old + name from mdt_UUID) */ + ret = 1; + strcpy(filepnm, mop->mo_ldd.ldd_uuid); + term = strstr(filepnm, "_UUID"); + if (term) { + *term = '\0'; + sprintf(cmd, "cp %s/%s/%s %s/%s/%s", + mntpt, MDT_LOGS_DIR, filepnm, + mntpt, MOUNT_CONFIGS_DIR, + mop->mo_ldd.ldd_svname); + if (verbose > 1) + printf("cmd: %s\n", cmd); + ret = run_command(cmd); + } + if (ret) { + fprintf(stderr, "%s: Can't copy 1.4 config %s/%s " + "(%d)\n", progname, MDT_LOGS_DIR, filepnm, ret); + fprintf(stderr, "mount -t ext3 %s somewhere, " + "find the MDT log for fs %s and " + "copy it manually into %s/%s, " + "then umount.\n", + mop->mo_device, + mop->mo_ldd.ldd_fsname, MOUNT_CONFIGS_DIR, + mop->mo_ldd.ldd_svname); + goto out_umnt; + } + } +#endif + /* end COMPAT_146 */ + + +out_umnt: + umount(mntpt); +out_rmdir: + rmdir(mntpt); + return ret; +} + +int read_local_files(struct mkfs_opts *mop) +{ + char mntpt[] = "/tmp/mntXXXXXX"; + char filepnm[128]; + char *dev; + FILE *filep; + int ret = 0; + + /* Mount this device temporarily in order to read these files */ + if (!mkdtemp(mntpt)) { + fprintf(stderr, "%s: Can't create temp mount point %s: %s\n", + progname, mntpt, strerror(errno)); + return errno; + } + + dev = mop->mo_device; + if (mop->mo_flags & MO_IS_LOOP) + dev = mop->mo_loopdev; + + ret = mount(dev, mntpt, MT_STR(&mop->mo_ldd), 0, NULL); + if (ret) { + fprintf(stderr, "%s: Unable to mount %s: %s\n", + progname, dev, strerror(errno)); + goto out_rmdir; + } + + sprintf(filepnm, "%s/%s", mntpt, MOUNT_DATA_FILE); + filep = fopen(filepnm, "r"); + if (filep) { + vprint("Reading %s\n", MOUNT_DATA_FILE); + fread(&mop->mo_ldd, sizeof(mop->mo_ldd), 1, filep); + } else { + /* COMPAT_146 */ + /* Try to read pre-1.6 config from last_rcvd */ + struct lr_server_data lsd; + vprint("%s: Unable to read %s, trying last_rcvd\n", + progname, MOUNT_DATA_FILE); + sprintf(filepnm, "%s/%s", mntpt, LAST_RCVD); + filep = fopen(filepnm, "r"); + if (!filep) { + 
fprintf(stderr, "%s: Unable to read old data\n", + progname); + ret = -errno; + goto out_umnt; + } + vprint("Reading %s\n", LAST_RCVD); + ret = fread(&lsd, 1, sizeof(lsd), filep); + if (ret < sizeof(lsd)) { + fprintf(stderr, "%s: Short read (%d of %d)\n", + progname, ret, sizeof(lsd)); + ret = -ferror(filep); + if (ret) + goto out_close; + } + ret = 0; + if (lsd.lsd_feature_compat & OBD_COMPAT_OST) { + mop->mo_ldd.ldd_flags = LDD_F_SV_TYPE_OST; + mop->mo_ldd.ldd_svindex = lsd.lsd_ost_index; + } else if (lsd.lsd_feature_compat & OBD_COMPAT_MDT) { + /* We must co-locate so mgs can see old logs. + If user doesn't want this, they can copy the old + logs manually and re-tunefs. */ + mop->mo_ldd.ldd_flags = + LDD_F_SV_TYPE_MDT | LDD_F_SV_TYPE_MGS; + mop->mo_ldd.ldd_svindex = lsd.lsd_mdt_index; + } else { + /* If neither is set, we're pre-1.4.6, make a guess. */ + sprintf(filepnm, "%s/%s", mntpt, MDT_LOGS_DIR); + if (lsd.lsd_ost_index > 0) { + mop->mo_ldd.ldd_flags = LDD_F_SV_TYPE_OST; + mop->mo_ldd.ldd_svindex = lsd.lsd_ost_index; + } else { + /* If there's a LOGS dir, it's an MDT */ + if ((ret = access(filepnm, F_OK)) == 0) { + mop->mo_ldd.ldd_flags = + LDD_F_SV_TYPE_MDT | + LDD_F_SV_TYPE_MGS; + /* Old MDT's are always index 0 + (pre CMD) */ + mop->mo_ldd.ldd_svindex = 0; + } else { + /* The index won't be correct */ + mop->mo_ldd.ldd_flags = + LDD_F_SV_TYPE_OST | LDD_F_NEED_INDEX; + } + } + } + + memcpy(mop->mo_ldd.ldd_uuid, lsd.lsd_uuid, + sizeof(mop->mo_ldd.ldd_uuid)); + mop->mo_ldd.ldd_flags |= LDD_F_UPGRADE14; + } + /* end COMPAT_146 */ +out_close: + fclose(filep); + +out_umnt: + umount(mntpt); +out_rmdir: + rmdir(mntpt); + return ret; +} + + +void set_defaults(struct mkfs_opts *mop) +{ + mop->mo_ldd.ldd_magic = LDD_MAGIC; + mop->mo_ldd.ldd_config_ver = 1; + mop->mo_ldd.ldd_flags = LDD_F_NEED_INDEX | LDD_F_UPDATE | LDD_F_VIRGIN; + mop->mo_mgs_failnodes = 0; + strcpy(mop->mo_ldd.ldd_fsname, "lustre"); + if (get_os_version() == 24) + mop->mo_ldd.ldd_mount_type = 
LDD_MT_EXT3; + else + mop->mo_ldd.ldd_mount_type = LDD_MT_LDISKFS; + + mop->mo_ldd.ldd_svindex = INDEX_UNASSIGNED; + mop->mo_stripe_count = 1; +} + +static inline void badopt(const char *opt, char *type) +{ + fprintf(stderr, "%s: '--%s' only valid for %s\n", + progname, opt, type); + usage(stderr); +} + +static int add_param(char *buf, char *key, char *val) +{ + int end = sizeof(((struct lustre_disk_data *)0)->ldd_params); + int start = strlen(buf); + int keylen = 0; + + if (key) + keylen = strlen(key); + if (start + 1 + keylen + strlen(val) >= end) { + fprintf(stderr, "%s: params are too long-\n%s %s%s\n", + progname, buf, key ? key : "", val); + return 1; + } + + sprintf(buf + start, " %s%s", key ? key : "", val); + return 0; +} + +/* from mount_lustre */ +/* Get rid of symbolic hostnames for tcp, since kernel can't do lookups */ +#define MAXNIDSTR 1024 +static char *convert_hostnames(char *s1) +{ + char *converted, *s2 = 0, *c; + int left = MAXNIDSTR; + lnet_nid_t nid; + + converted = malloc(left); + c = converted; + while ((left > 0) && ((s2 = strsep(&s1, ",: \0")))) { + nid = libcfs_str2nid(s2); + if (nid == LNET_NID_ANY) { + if (*s2 == '/') + /* end of nids */ + break; + fprintf(stderr, "%s: Can't parse NID '%s'\n", + progname, s2); + free(converted); + return NULL; + } + if (LNET_NETTYP(LNET_NIDNET(nid)) == SOCKLND) { + __u32 addr = LNET_NIDADDR(nid); + c += snprintf(c, left, "%u.%u.%u.%u@%s%u,", + (addr >> 24) & 0xff, (addr >> 16) & 0xff, + (addr >> 8) & 0xff, addr & 0xff, + libcfs_lnd2str(SOCKLND), + LNET_NETNUM(LNET_NIDNET(nid))); + } else { + c += snprintf(c, left, "%s,", s2); + } + left = converted + MAXNIDSTR - c; + } + *(c - 1) = '\0'; + return converted; +} + +int parse_opts(int argc, char *const argv[], struct mkfs_opts *mop, + char **mountopts) +{ + static struct option long_opt[] = { + {"backfstype", 1, 0, 'b'}, + {"stripe-count-hint", 1, 0, 'c'}, + {"configdev", 1, 0, 'C'}, + {"device-size", 1, 0, 'd'}, + {"erase-params", 0, 0, 'e'}, + 
{"failnode", 1, 0, 'f'}, + {"failover", 1, 0, 'f'}, + {"mgs", 0, 0, 'G'}, + {"help", 0, 0, 'h'}, + {"index", 1, 0, 'i'}, + {"mkfsoptions", 1, 0, 'k'}, + {"mgsnode", 1, 0, 'm'}, + {"mgsnid", 1, 0, 'm'}, + {"mdt", 0, 0, 'M'}, + {"fsname",1, 0, 'n'}, + {"nomgs", 0, 0, 'N'}, + {"mountfsoptions", 1, 0, 'o'}, + {"ost", 0, 0, 'O'}, + {"param", 1, 0, 'p'}, + {"print", 0, 0, 'P'}, + {"quiet", 0, 0, 'q'}, + {"reformat", 0, 0, 'r'}, + {"verbose", 0, 0, 'v'}, + {"writeconf", 0, 0, 'w'}, + {0, 0, 0, 0} + }; + char *optstring = "b:c:C:d:ef:Ghi:k:m:Mn:No:Op:Pqrvw"; + char opt; + int rc, longidx; + + while ((opt = getopt_long(argc, argv, optstring, long_opt, &longidx)) != + EOF) { + switch (opt) { + case 'b': { + int i = 0; + while (i < LDD_MT_LAST) { + if (strcmp(optarg, mt_str(i)) == 0) { + mop->mo_ldd.ldd_mount_type = i; + break; + } + i++; + } + break; + } + case 'c': + if (IS_MDT(&mop->mo_ldd)) { + int stripe_count = atol(optarg); + if (stripe_count <= 0) { + fprintf(stderr, "%s: bad stripe count " + "%d\n", progname, stripe_count); + return 1; + } + mop->mo_stripe_count = stripe_count; + } else { + badopt(long_opt[longidx].name, "MDT"); + return 1; + } + break; + case 'C': /* Configdev */ + //FIXME + printf("Configdev not implemented\n"); + return 1; + case 'd': + mop->mo_device_sz = atol(optarg); + break; + case 'e': + mop->mo_ldd.ldd_params[0] = '\0'; + break; + case 'f': { + char *nids = convert_hostnames(optarg); + if (!nids) + return 1; + rc = add_param(mop->mo_ldd.ldd_params, PARAM_FAILNODE, + nids); + free(nids); + if (rc) + return rc; + break; + } + case 'G': + mop->mo_ldd.ldd_flags |= LDD_F_SV_TYPE_MGS; + break; + case 'h': + usage(stdout); + return 1; + case 'i': + if (IS_MDT(&mop->mo_ldd) || IS_OST(&mop->mo_ldd)) { + mop->mo_ldd.ldd_svindex = atol(optarg); + mop->mo_ldd.ldd_flags &= ~LDD_F_NEED_INDEX; + } else { + badopt(long_opt[longidx].name, "MDT,OST"); + return 1; + } + break; + case 'k': + strncpy(mop->mo_mkfsopts, optarg, + sizeof(mop->mo_mkfsopts) - 1); + 
break; + case 'm': { + char *nids = convert_hostnames(optarg); + if (!nids) + return 1; + rc = add_param(mop->mo_ldd.ldd_params, PARAM_MGSNODE, + nids); + free(nids); + if (rc) + return rc; + mop->mo_mgs_failnodes++; + break; + } + case 'M': + mop->mo_ldd.ldd_flags |= LDD_F_SV_TYPE_MDT; + break; + case 'n': + if (!(IS_MDT(&mop->mo_ldd) || IS_OST(&mop->mo_ldd))) { + badopt(long_opt[longidx].name, "MDT,OST"); + return 1; + } + if (strlen(optarg) > 8) { + fprintf(stderr, "%s: filesystem name must be " + "<= 8 chars\n", progname); + return 1; + } + if (optarg[0] != 0) + strncpy(mop->mo_ldd.ldd_fsname, optarg, + sizeof(mop->mo_ldd.ldd_fsname) - 1); + break; + case 'N': + mop->mo_ldd.ldd_flags &= ~LDD_F_SV_TYPE_MGS; + break; + case 'o': + *mountopts = optarg; + break; + case 'O': + mop->mo_ldd.ldd_flags |= LDD_F_SV_TYPE_OST; + break; + case 'p': + rc = add_param(mop->mo_ldd.ldd_params, NULL, optarg); + if (rc) + return rc; + break; + case 'P': + print_only++; + break; + case 'q': + verbose--; + break; + case 'r': + mop->mo_flags |= MO_FORCEFORMAT; + break; + case 'v': + verbose++; + break; + case 'w': + mop->mo_ldd.ldd_flags |= LDD_F_WRITECONF; + break; + default: + if (opt != '?') { + fatal(); + fprintf(stderr, "Unknown option '%c'\n", opt); + } + usage(stderr); + return 1; + } + }//while + if (optind >= argc) { + fatal(); + fprintf(stderr, "Bad arguments\n"); + usage(stderr); + return 1; + } + + return 0; +} + +int main(int argc, char *const argv[]) +{ + struct mkfs_opts mop; + struct lustre_disk_data *ldd; + char *mountopts = NULL; + char always_mountopts[512] = ""; + char default_mountopts[512] = ""; + int ret = 0; + + //printf("pad %d\n", offsetof(struct lustre_disk_data, ldd_padding)); + assert(offsetof(struct lustre_disk_data, ldd_padding) == 200); + + if ((progname = strrchr(argv[0], '/')) != NULL) + progname++; + else + progname = argv[0]; + + if (argc < 2) { + usage(stderr); + ret = 1; + goto out; + } + + memset(&mop, 0, sizeof(mop)); + set_defaults(&mop); + + 
/* device is last arg */ + strcpy(mop.mo_device, argv[argc - 1]); + + if (check_mtab_entry(mop.mo_device, "lustre")) + return(EEXIST); + + /* Are we using a loop device? */ + ret = is_block(mop.mo_device); + if (ret < 0) + goto out; + if (ret == 0) + mop.mo_flags |= MO_IS_LOOP; + +#ifdef TUNEFS + /* For tunefs, we must read in the old values before parsing any + new ones. */ + /* Create the loopback file */ + if (mop.mo_flags & MO_IS_LOOP) { + ret = access(mop.mo_device, F_OK); + if (ret == 0) + ret = loop_setup(&mop); + if (ret) { + fatal(); + fprintf(stderr, "Loop device setup for %s failed: %s\n", + mop.mo_device, strerror(ret)); + goto out; + } + } + + /* Check whether the disk has already been formatted by mkfs.lustre */ + ret = is_lustre_target(&mop); + if (ret == 0) { + fatal(); + fprintf(stderr, "Device %s has not been formatted with " + "mkfs.lustre\n", mop.mo_device); + goto out; + } + + ret = read_local_files(&mop); + if (ret) { + fatal(); + fprintf(stderr, "Failed to read previous Lustre data from %s\n", + mop.mo_device); + goto out; + } + + if (verbose > 0) + print_ldd("Read previous values", &(mop.mo_ldd)); +#endif + + ret = parse_opts(argc, argv, &mop, &mountopts); + if (ret) + goto out; + + ldd = &mop.mo_ldd; + if (!(IS_MDT(ldd) || IS_OST(ldd) || IS_MGS(ldd))) { + fatal(); + fprintf(stderr, "must set target type :{mdt,ost,mgs}\n"); + usage(stderr); + ret = 1; + goto out; + } + + if (IS_MDT(ldd) && !IS_MGS(ldd) && (mop.mo_mgs_failnodes == 0)) { + vprint("No management node specified, adding MGS to this " + "MDT\n"); + ldd->ldd_flags |= LDD_F_SV_TYPE_MGS; + } + + if (!IS_MGS(ldd) && (mop.mo_mgs_failnodes == 0)) { + fatal(); + fprintf(stderr, "Must specify either --mgs or --mgsnode\n"); + usage(stderr); + goto out; + } + + /* These are the permanent mount options (always included) */ + switch (ldd->ldd_mount_type) { + case LDD_MT_EXT3: + case LDD_MT_LDISKFS: { + sprintf(always_mountopts, "errors=remount-ro"); + if (IS_MDT(ldd) || IS_MGS(ldd)) + 
strcat(always_mountopts, + ",iopen_nopriv,user_xattr"); + if ((get_os_version() == 24) && IS_OST(ldd)) + strcat(always_mountopts, ",asyncdel"); +#if 0 + /* Files created while extents are enabled cannot be read if + mounted with a kernel that doesn't include the CFS patches.*/ + if (IS_OST(ldd) && + ldd->ldd_mount_type == LDD_MT_LDISKFS) { + strcat(default_mountopts, ",extents,mballoc"); + } +#endif + break; + } + case LDD_MT_SMFS: { + mop.mo_flags |= MO_IS_LOOP; + sprintf(always_mountopts, "type=ext3,dev=%s", + mop.mo_device); + break; + } + default: { + fatal(); + fprintf(stderr, "unknown fs type %d '%s'\n", + ldd->ldd_mount_type, + MT_STR(ldd)); + ret = EINVAL; + goto out; + } + } + + if (mountopts) { + /* If user specifies mount opts, don't use defaults, + but always use always_mountopts */ + sprintf(ldd->ldd_mount_opts, "%s,%s", + always_mountopts, mountopts); + } else { +#ifdef TUNEFS + if (ldd->ldd_mount_opts[0] == 0) + /* use the defaults unless old opts exist */ +#endif + { + if (default_mountopts[0]) + sprintf(ldd->ldd_mount_opts, "%s,%s", + always_mountopts, default_mountopts); + else + strcpy(ldd->ldd_mount_opts, + always_mountopts); + } + } + + server_make_name(ldd->ldd_flags, ldd->ldd_svindex, + ldd->ldd_fsname, ldd->ldd_svname); + + if (verbose > 0) + print_ldd("Permanent disk data", ldd); + + if (print_only) { + printf("exiting before disk write.\n"); + goto out; + } + +#ifndef TUNEFS /* mkfs.lustre */ + /* Create the loopback file of the correct size */ + if (mop.mo_flags & MO_IS_LOOP) { + ret = access(mop.mo_device, F_OK); + /* Don't destroy the loopback file if no FORCEFORMAT */ + if (ret || (mop.mo_flags & MO_FORCEFORMAT)) + ret = loop_format(&mop); + if (ret == 0) + ret = loop_setup(&mop); + if (ret) { + fatal(); + fprintf(stderr, "Loop device setup failed: %s\n", + strerror(ret)); + goto out; + } + } + + /* Check whether the disk has already been formatted by mkfs.lustre */ + if (!(mop.mo_flags & MO_FORCEFORMAT)) { + ret = 
is_lustre_target(&mop); + if (ret) { + fatal(); + fprintf(stderr, "Device %s was previously formatted " + "for lustre. Use --reformat to reformat it, " + "or tunefs.lustre to modify.\n", + mop.mo_device); + goto out; + } + } + + /* Format the backing filesystem */ + ret = make_lustre_backfs(&mop); + if (ret != 0) { + fatal(); + fprintf(stderr, "mkfs failed %d\n", ret); + goto out; + } +#endif + + ret = write_local_files(&mop); + if (ret != 0) { + fatal(); + fprintf(stderr, "failed to write local files\n"); + goto out; + } + +out: + loop_cleanup(&mop); + return ret; +} diff --git a/lustre/utils/module_setup.sh b/lustre/utils/module_setup.sh index c422184..b8371b4 100755 --- a/lustre/utils/module_setup.sh +++ b/lustre/utils/module_setup.sh @@ -1,6 +1,7 @@ #!/bin/sh MDIR=/lib/modules/`uname -r`/lustre +mkdir -p $MDIR KVER=24 EXT=o @@ -15,29 +16,31 @@ fi echo "Copying modules from local build dir to "$MDIR -mkdir -p $MDIR - -cp ../../lnet/libcfs/libcfs.$EXT $MDIR -cp ../../lnet/lnet/lnet.$EXT $MDIR -cp ../../lnet/klnds/socklnd/ksocklnd.$EXT $MDIR -cp ../lvfs/lvfs.$EXT $MDIR -cp ../obdclass/obdclass.$EXT $MDIR -cp ../ptlrpc/ptlrpc.$EXT $MDIR -cp ../mdc/mdc.$EXT $MDIR -cp ../osc/osc.$EXT $MDIR -cp ../lov/lov.$EXT $MDIR -cp ../mds/mds.$EXT $MDIR -cp ../lvfs/$FSFLT.$EXT $MDIR -[ $KVER == "26" ] && cp ../ldiskfs/ldiskfs.$EXT $MDIR -cp ../ost/ost.$EXT $MDIR -cp ../obdfilter/obdfilter.$EXT $MDIR -cp ../llite/llite.$EXT $MDIR - +cp -u ../../lnet/libcfs/libcfs.$EXT $MDIR +cp -u ../../lnet/lnet/lnet.$EXT $MDIR +cp -u ../../lnet/klnds/socklnd/ksocklnd.$EXT $MDIR +cp -u ../lvfs/lvfs.$EXT $MDIR +cp -u ../obdclass/obdclass.$EXT $MDIR +cp -u ../ptlrpc/ptlrpc.$EXT $MDIR +cp -u ../mdc/mdc.$EXT $MDIR +cp -u ../osc/osc.$EXT $MDIR +cp -u ../lov/lov.$EXT $MDIR +cp -u ../mds/mds.$EXT $MDIR +cp -u ../lvfs/$FSFLT.$EXT $MDIR +[ $KVER == "26" ] && cp -u ../ldiskfs/ldiskfs.$EXT $MDIR +cp -u ../ost/ost.$EXT $MDIR +cp -u ../obdfilter/obdfilter.$EXT $MDIR +cp -u ../llite/llite.$EXT $MDIR +cp -u 
../mgc/mgc.$EXT $MDIR +cp -u ../mgs/mgs.$EXT $MDIR + +# prevent warnings on my uml +rm -f /lib/modules/`uname -r`/modules.* echo "Depmod" depmod -a -e echo "Copying mount from local build dir to "$MDIR -cp ../utils/mount.lustre /sbin/. +cp -u ../utils/mount.lustre /sbin/. MP="/sbin/modprobe" MPI="$MP --ignore-install" @@ -51,3 +54,8 @@ if [ `egrep -c "lustre|lnet" $MODFILE` -eq 0 ]; then echo "alias lustre llite" >> $MODFILE echo "# end Lustre modules" >> $MODFILE fi + +# To generate gdb debug file: +# modprobe lustre; modprobe mds; modprobe obdfilter; modprobe mgs; modprobe mgc +# rm -f /r/tmp/ogdb-`hostname` +# ./lctl modules > /r/tmp/ogdb-`hostname` diff --git a/lustre/utils/mount_lustre.c b/lustre/utils/mount_lustre.c new file mode 100644 index 0000000..be8ebdf --- /dev/null +++ b/lustre/utils/mount_lustre.c @@ -0,0 +1,406 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002 Cluster File Systems, Inc. + * Author: Robert Read + * Author: Nathan Rutman + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "obdctl.h" +#include + +int verbose = 0; +int nomtab = 0; +int fake = 0; +int force = 0; +static char *progname = NULL; + +void usage(FILE *out) +{ + fprintf(out, "%s v"LUSTRE_VERSION_STRING"\n", progname); + fprintf(out, "usage: %s [-fhnv] [-o ] \n", + progname); + fprintf(out, + "\t: the disk device, or for a client:\n" + "\t\t[:...]:/-client\n" + "\t: name of the Lustre filesystem (e.g. lustre1)\n" + "\t: filesystem mountpoint (e.g. /mnt/lustre)\n" + "\t-f|--fake: fake mount (updates /etc/mtab)\n" + "\t--force: force mount even if already in /etc/mtab\n" + "\t-h|--help: print this usage message\n" + "\t-n|--nomtab: do not update /etc/mtab after mount\n" + "\t-v|--verbose: print verbose config settings\n" + "\t: one or more comma separated of:\n" + "\t\t(no)flock,(no)user_xattr,(no)acl\n" + "\t\tnosvc: only start MGC/MGS obds\n" + "\t\texclude=[:] : colon-separated list of " + "inactive OSTs (e.g. lustre-OST0001)\n" + ); + exit((out != stdout) ? EINVAL : 0); +} + +static int check_mtab_entry(char *spec, char *mtpt, char *type) +{ + FILE *fp; + struct mntent *mnt; + + if (force) + return (0); + + fp = setmntent(MOUNTED, "r"); + if (fp == NULL) + return(0); + + while ((mnt = getmntent(fp)) != NULL) { + if (strcmp(mnt->mnt_fsname, spec) == 0 && + strcmp(mnt->mnt_dir, mtpt) == 0 && + strcmp(mnt->mnt_type, type) == 0) { + endmntent(fp); + fprintf(stderr, "%s: according to %s %s is " + "already mounted on %s\n", + progname, MOUNTED, spec, mtpt); + return(EEXIST); + } + } + endmntent(fp); + + return(0); +} + +static int +update_mtab_entry(char *spec, char *mtpt, char *type, char *opts, + int flags, int freq, int pass) +{ + FILE *fp; + struct mntent mnt; + int rc = 0; + + mnt.mnt_fsname = spec; + mnt.mnt_dir = mtpt; + mnt.mnt_type = type; + mnt.mnt_opts = opts ? 
opts : ""; + mnt.mnt_freq = freq; + mnt.mnt_passno = pass; + + fp = setmntent(MOUNTED, "a+"); + if (fp == NULL) { + fprintf(stderr, "%s: setmntent(%s): %s:", + progname, MOUNTED, strerror (errno)); + rc = 16; + } else { + if ((addmntent(fp, &mnt)) == 1) { + fprintf(stderr, "%s: addmntent: %s:", + progname, strerror (errno)); + rc = 16; + } + endmntent(fp); + } + + return rc; +} + +/* Get rid of symbolic hostnames for tcp, since kernel can't do lookups */ +#define MAXNIDSTR 1024 +static char *convert_hostnames(char *s1) +{ + char *converted, *s2 = 0, *c; + char sep; + int left = MAXNIDSTR; + lnet_nid_t nid; + + converted = malloc(left); + c = converted; + while ((left > 0) && (*s1 != '/')) { + s2 = strpbrk(s1, ",:"); + if (!s2) + goto out_free; + sep = *s2; + *s2 = '\0'; + nid = libcfs_str2nid(s1); + if (nid == LNET_NID_ANY) + goto out_free; + if (LNET_NETTYP(LNET_NIDNET(nid)) == SOCKLND) { + __u32 addr = LNET_NIDADDR(nid); + c += snprintf(c, left, "%u.%u.%u.%u@%s%u%c", + (addr >> 24) & 0xff, (addr >> 16) & 0xff, + (addr >> 8) & 0xff, addr & 0xff, + libcfs_lnd2str(SOCKLND), + LNET_NETNUM(LNET_NIDNET(nid)), sep); + } else { + c += snprintf(c, left, "%s%c", s1, sep); + } + left = converted + MAXNIDSTR - c; + s1 = s2 + 1; + } + snprintf(c, left, "%s", s1); + return converted; +out_free: + fprintf(stderr, "%s: Can't parse NID '%s'\n", progname, s1); + free(converted); + return NULL; +} + +/***************************************************************************** + * + * This part was cribbed from util-linux/mount/mount.c. There was no clear + * license information, but many other files in the package are identified as + * GNU GPL, so it's a pretty safe bet that was their intent. 
+ * + ****************************************************************************/ +struct opt_map { + const char *opt; /* option name */ + int skip; /* skip in mtab option string */ + int inv; /* true if flag value should be inverted */ + int mask; /* flag mask value */ +}; + +static const struct opt_map opt_map[] = { + /* These flags are parsed by mount, not lustre */ + { "defaults", 0, 0, 0 }, /* default options */ + { "rw", 1, 1, MS_RDONLY }, /* read-write */ + { "ro", 0, 0, MS_RDONLY }, /* read-only */ + { "exec", 0, 1, MS_NOEXEC }, /* permit execution of binaries */ + { "noexec", 0, 0, MS_NOEXEC }, /* don't execute binaries */ + { "suid", 0, 1, MS_NOSUID }, /* honor suid executables */ + { "nosuid", 0, 0, MS_NOSUID }, /* don't honor suid executables */ + { "dev", 0, 1, MS_NODEV }, /* interpret device files */ + { "nodev", 0, 0, MS_NODEV }, /* don't interpret devices */ + { "async", 0, 1, MS_SYNCHRONOUS}, /* asynchronous I/O */ + { "auto", 0, 0, 0 }, /* Can be mounted using -a */ + { "noauto", 0, 0, 0 }, /* Can only be mounted explicitly */ + { "nousers", 0, 1, 0 }, /* Forbid ordinary user to mount */ + { "nouser", 0, 1, 0 }, /* Forbid ordinary user to mount */ + { "noowner", 0, 1, 0 }, /* Device owner has no special privs */ + { "_netdev", 0, 0, 0 }, /* Device accessible only via network */ + /* These strings are passed through and parsed in lustre ll_options */ + { "flock", 0, 0, 0 }, /* Enable flock support */ + { "noflock", 1, 1, 0 }, /* Disable flock support */ + { "user_xattr", 0, 0, 0 }, /* Enable get/set user xattr */ + { "nouser_xattr", 1, 1, 0 }, /* Disable user xattr */ + { "acl", 0, 0, 0 }, /* Enable ACL support */ + { "noacl", 1, 1, 0 }, /* Disable ACL support */ + { "nosvc", 0, 0, 0 }, /* Only start MGS/MGC, nothing else */ + { "exclude", 0, 0, 0 }, /* OST exclusion list */ + { NULL, 0, 0, 0 } +}; +/****************************************************************************/ + +/* 1 = found, flag set + 0 = found, no flag set + -1 = not found in 
above list */ /* Looks check up in opt_map[] and acts on the first row whose name check begins with (strncmp() over the length of the table name only). A zero-mask row changes nothing; otherwise the mask bit is cleared in *flagp when inv is set and set when it is not. Input matching no row is reported on stderr. NOTE(review): the prefix rule also lets longer strings match short rows such as ro or dev - confirm it is intended beyond name=value forms. */ +static int parse_one_option(const char *check, int *flagp) +{ + const struct opt_map *opt; + + for (opt = &opt_map[0]; opt->opt != NULL; opt++) { + if (strncmp(check, opt->opt, strlen(opt->opt)) == 0) { + if (!opt->mask) + return 0; + if (opt->inv) + *flagp &= ~(opt->mask); + else + *flagp |= opt->mask; + return 1; + } + } + fprintf(stderr, "%s: ignoring unknown option '%s'\n", progname, + check); + return -1; +} + /* Splits the comma-separated orig_options with strsep(), which modifies the string in place. *flagp is reset to 0 and then updated by parse_one_option(). Only tokens for which parse_one_option() returns 0 are kept: they are re-joined with commas in a scratch buffer and copied back over orig_options. Tokens that set or clear a flag, and unknown tokens, are dropped. Always returns 0. */ +int parse_options(char *orig_options, int *flagp) +{ + char *options, *opt, *nextopt; + + /* NOTE(review): the calloc() result is dereferenced below without a NULL check */ options = calloc(strlen(orig_options) + 1, 1); + *flagp = 0; + nextopt = orig_options; + while ((opt = strsep(&nextopt, ","))) { + if (!*opt) + /* empty option */ + continue; + if (parse_one_option(opt, flagp) == 0) { + /* no mount flags set, so pass this on as an option */ + if (*options) + strcat(options, ","); + strcat(options, opt); + } + } + /* options will always be <= orig_options */ + strcpy(orig_options, options); + free(options); + return 0; +} + + /* Command-line entry point. Parses the switches with getopt_long(), converts the source argument with convert_hostnames(), returns EEXIST when check_mtab_entry() is non-zero and --force was not given, filters the -o string through parse_options(), appends device= followed by source to it and, unless --fake was given, calls mount() with filesystem type lustre. After a successful or skipped mount() the mtab is updated unless --nomtab was given. Returns an errno value for the failures handled here, otherwise the result of update_mtab_entry() or 0. */ +int main(int argc, char *const argv[]) +{ + char default_options[] = ""; + char *source, *target, *options = default_options, *optcopy; + int i, nargs = 3, opt, rc, flags, optlen; /* NOTE(review): nargs is only initialised and incremented in this function, never read */ + static struct option long_opt[] = { + {"fake", 0, 0, 'f'}, + {"force", 0, 0, 1}, /* long form only; getopt_long() returns 1 for it, handled by case 1 below */ + {"help", 0, 0, 'h'}, + {"nomtab", 0, 0, 'n'}, + {"options", 1, 0, 'o'}, + {"verbose", 0, 0, 'v'}, + {0, 0, 0, 0} + }; + + /* progname becomes the part of argv[0] after the last slash, or all of argv[0] when there is none */ progname = strrchr(argv[0], '/'); + progname = progname ?
progname + 1 : argv[0]; + + while ((opt = getopt_long(argc, argv, "fhno:v", + long_opt, NULL)) != EOF){ + switch (opt) { + case 1: + ++force; + printf("force: %d\n", force); + nargs++; + break; + case 'f': + ++fake; + printf("fake: %d\n", fake); + nargs++; + break; + case 'h': + usage(stdout); + break; + case 'n': + ++nomtab; + printf("nomtab: %d\n", nomtab); + nargs++; + break; + case 'o': + options = optarg; + nargs++; + break; + case 'v': + ++verbose; + printf("verbose: %d\n", verbose); + nargs++; + break; + default: + fprintf(stderr, "%s: unknown option '%c'\n", + progname, opt); + usage(stderr); + break; + } + } + + if (optind + 2 > argc) { + fprintf(stderr, "%s: too few arguments\n", progname); + usage(stderr); + } + + /* NOTE(review): argv[optind] is read here even after the too-few-arguments branch, so this relies on usage() not returning - confirm */ source = convert_hostnames(argv[optind]); + target = argv[optind + 1]; + + if (!source) { + usage(stderr); + } + + if (verbose > 1) { + for (i = 0; i < argc; i++) + printf("arg[%d] = %s\n", i, argv[i]); + printf("source = %s, target = %s\n", source, target); + } + + if (!force && check_mtab_entry(source, target, "lustre")) + return(EEXIST); + + rc = parse_options(options, &flags); + if (rc) { + fprintf(stderr, "%s: can't parse options: %s\n", + progname, options); + return(EINVAL); + } + + rc = access(target, F_OK); + if (rc) { + rc = errno; + fprintf(stderr, "%s: %s inaccessible: %s\n", progname, target, + strerror(errno)); + return rc; + } + + /* In Linux 2.4, the target device doesn't get passed to any of our + functions. So we'll stick it on the end of the options. */ + optlen = strlen(options) + strlen(",device=") + strlen(source) + 1; + /* NOTE(review): the malloc() result is written to without a NULL check */ optcopy = malloc(optlen); + strcpy(optcopy, options); + if (*optcopy) + strcat(optcopy, ","); + strcat(optcopy, "device="); + strcat(optcopy, source); + + if (verbose) + printf("mounting device %s at %s, flags=%#x options=%s\n", + source, target, flags, optcopy); + + if (!fake) + /* flags and target get to lustre_get_sb, but not + lustre_fill_super. Lustre ignores the flags, but mount + does not.
*/ + rc = mount(source, target, "lustre", flags, (void *)optcopy); + + /* when --fake skipped mount(), rc is still 0 from the access() check above */ if (rc) { + fprintf(stderr, "%s: mount(%s, %s) failed: %s\n", progname, + source, target, strerror(errno)); + if (errno == ENODEV) + fprintf(stderr, "Are the lustre modules loaded?\n" + "Check /etc/modules.conf and /proc/filesystems\n"); + if (errno == ENOTBLK) + fprintf(stderr,"Does this filesystem have any OSTs?\n"); + if (errno == ENOENT) + fprintf(stderr,"Is the MGS specification correct? " + "(%s)\n", source); + if (errno == EALREADY) + fprintf(stderr,"The target service is already running. " + "(%s)\n", source); + if (errno == ENXIO) + fprintf(stderr,"The target service failed to start " + "(bad config log?) (%s)\n", source); + if (errno == EIO) + fprintf(stderr,"Is the MGS running? (%s)\n", source); + if (errno == EADDRINUSE) + fprintf(stderr,"The target service's index is already " + "in use. (%s)\n", source); + rc = errno; + } else if (!nomtab) { + /* the mtab entry gets the filtered options string, not optcopy with its device= suffix */ rc = update_mtab_entry(source, target, "lustre", options,0,0,0); + } + + free(optcopy); + free(source); + return rc; +} diff --git a/lustre/utils/obd.c b/lustre/utils/obd.c index 3072e1d..85ee351 100644 --- a/lustre/utils/obd.c +++ b/lustre/utils/obd.c @@ -130,19 +130,6 @@ do { \ } \ } while (0) -int obd_record(enum cfg_record_type type, int len, void *ptr) -{ - struct obd_ioctl_data data; - - IOC_INIT(data); - data.ioc_type = type; - data.ioc_plen1 = len; - data.ioc_pbuf1 = ptr; - IOC_PACK("obd_record", data); - - return l_ioctl(OBD_DEV_ID, OBD_IOC_DORECORD, &data); -} - int lcfg_ioctl(char * func, int dev_id, struct lustre_cfg *lcfg) { int opc; @@ -166,6 +153,45 @@ int lcfg_ioctl(char * func, int dev_id, struct lustre_cfg *lcfg) return rc; } /* Hands lcfg to l_ioctl() as an OBD_IOC_PARAM request. On the first call mgs_device is still -1, so the function calls do_disconnect(NULL, 1), selects the device named MGS through do_device() and caches cur_device in the static mgs_device; if do_device() fails it sets errno to ENODEV and returns -1. Otherwise it returns the l_ioctl() result. The buffer passed to l_ioctl() is buf, which is not declared in this function (presumably filled by IOC_PACK - confirm). NOTE(review): the ioctl is issued on the caller-supplied dev_id while data.ioc_dev carries mgs_device - confirm which is intended. */ +int lcfg_mgs_ioctl(char *func, int dev_id, struct lustre_cfg *lcfg) +{ + struct obd_ioctl_data data; + static int mgs_device = -1; + int rc; + + /* Always operates on MGS dev */ + if (mgs_device == -1) { + /* NOTE(review): block-scope function declaration with the static storage class; ISO C only permits extern here */ static int do_device(char *func, char *devname); + do_disconnect(NULL, 1); + rc =
do_device("mgsioc", "MGS"); + if (rc) { + errno = ENODEV; + return -1; + } + mgs_device = cur_device; + } + + IOC_INIT(data); + data.ioc_dev = mgs_device; + data.ioc_type = LUSTRE_CFG_TYPE; + data.ioc_plen1 = lustre_cfg_len(lcfg->lcfg_bufcount, + lcfg->lcfg_buflens); + data.ioc_pbuf1 = (void *)lcfg; + IOC_PACK(func, data); + + rc = l_ioctl(dev_id, OBD_IOC_PARAM, buf); + + /* NOTE(review): rc is compared with positive errno constants below; what l_ioctl() returns on failure is not visible here - confirm these hints can fire */ if (rc == ENODEV) + fprintf(stderr, "Is the MGS running on this node?\n"); + if (rc == ENOSYS) + fprintf(stderr, "Make sure cfg_device is set first.\n"); + if (rc == EINVAL) + fprintf(stderr, "cfg_device should be of the form " + "'lustre-MDT0000'\n"); + + return rc; +} + char *obdo_print(struct obdo *obd) { char buf[1024]; diff --git a/lustre/utils/obdctl.h b/lustre/utils/obdctl.h index 7a7d43c..afbfb52 100644 --- a/lustre/utils/obdctl.h +++ b/lustre/utils/obdctl.h @@ -69,6 +69,7 @@ int jt_llog_remove(int argc, char **argv); int jt_llog_check(int argc, char **argv); int lcfg_ioctl(char * func, int dev_id, struct lustre_cfg *lcfg); +int lcfg_mgs_ioctl(char *func, int dev_id, struct lustre_cfg *lcfg); int parse_devname(char *func, char *name); char *jt_cmdname(char *func); @@ -88,6 +89,8 @@ int jt_lcfg_set_timeout(int argc, char **argv); int jt_lcfg_set_lustre_upcall(int argc, char **argv); int jt_lcfg_add_conn(int argc, char **argv); int jt_lcfg_del_conn(int argc, char **argv); +int jt_lcfg_param(int argc, char **argv); +int jt_lcfg_mgsparam(int argc, char **argv); int obd_add_uuid(char *uuid, lnet_nid_t nid); diff --git a/lustre/utils/rmmod_all.sh b/lustre/utils/rmmod_all.sh index 9ae82bb..0e1726f 100755 --- a/lustre/utils/rmmod_all.sh +++ b/lustre/utils/rmmod_all.sh @@ -3,6 +3,6 @@ SRCDIR=`dirname $0` PATH=$PWD/$SRCDIR:$SRCDIR:$SRCDIR/../utils:$PATH -lctl modules | awk '{ print $2 }' | xargs rmmod >/dev/null 2>&1 +lctl modules | awk '{ print $2 }' | xargs rmmod >/dev/null 2>&1 # do it again, in case we tried to unload ksocklnd too early -lctl modules | awk '{ print $2 }' | xargs rmmod +lsmod |
grep lnet > /dev/null && lctl modules | awk '{ print $2 }' | xargs rmmod diff --git a/lustre/utils/wirecheck.c b/lustre/utils/wirecheck.c index 27b12f7..5a1f55a 100644 --- a/lustre/utils/wirecheck.c +++ b/lustre/utils/wirecheck.c @@ -981,6 +981,12 @@ main(int argc, char **argv) CHECK_VALUE(REINT_OPEN); CHECK_VALUE(REINT_MAX); + CHECK_VALUE(MGS_CONNECT); + CHECK_VALUE(MGS_DISCONNECT); + CHECK_VALUE(MGS_EXCEPTION); + CHECK_VALUE(MGS_TARGET_REG); + CHECK_VALUE(MGS_TARGET_DEL); + CHECK_VALUE(DISP_IT_EXECD); CHECK_VALUE(DISP_LOOKUP_EXECD); CHECK_VALUE(DISP_LOOKUP_NEG); diff --git a/lustre/utils/wiretest.c b/lustre/utils/wiretest.c index dd8664b..f2b33f8 100644 --- a/lustre/utils/wiretest.c +++ b/lustre/utils/wiretest.c @@ -15,12 +15,12 @@ void lustre_assert_wire_constants(void); int main() { - lustre_assert_wire_constants(); + lustre_assert_wire_constants(); - if (ret == 0) - printf("wire constants OK\n"); + if (ret == 0) + printf("wire constants OK\n"); - return ret; + return ret; } void lustre_assert_wire_constants(void) @@ -159,6 +159,16 @@ void lustre_assert_wire_constants(void) (long long)MDS_STATUS_CONN); LASSERTF(MDS_STATUS_LOV == 2, " found %lld\n", (long long)MDS_STATUS_LOV); + LASSERTF(MGS_CONNECT == 250, " found %lld\n", + (long long)MGS_CONNECT); + LASSERTF(MGS_DISCONNECT == 251, " found %lld\n", + (long long)MGS_DISCONNECT); + LASSERTF(MGS_EXCEPTION == 252, " found %lld\n", + (long long)MGS_EXCEPTION); + LASSERTF(MGS_TARGET_REG == 253, " found %lld\n", + (long long)MGS_TARGET_REG); + LASSERTF(MGS_TARGET_DEL == 254, " found %lld\n", + (long long)MGS_TARGET_DEL); LASSERTF(LDLM_ENQUEUE == 101, " found %lld\n", (long long)LDLM_ENQUEUE); LASSERTF(LDLM_CONVERT == 102, " found %lld\n",