From 2b43d9f2deaf421c9eacc6fd661814282c8f4862 Mon Sep 17 00:00:00 2001 From: nathan Date: Wed, 11 Jan 2006 01:32:53 +0000 Subject: [PATCH] Branch b1_4_mountconf b=9861 - add lov info to mkfs.lustre - add failover nids to registration b=4482 - connect flags for online ost add - better locking in mds_lov_sync - adilger's dirty objids patch - various fixme's (warning: this is completely untested code, may not work) --- lustre/include/linux/lustre_disk.h | 39 +++++++++------- lustre/include/linux/lustre_idl.h | 20 ++++---- lustre/include/linux/obd.h | 4 +- lustre/lov/lov_log.c | 5 +- lustre/lov/lov_obd.c | 18 ++++++-- lustre/mds/mds_lov.c | 74 ++++++++++++++++++++---------- lustre/mgc/mgc_request.c | 4 +- lustre/mgs/mgs_handler.c | 12 ++--- lustre/mgs/mgs_internal.h | 2 +- lustre/mgs/mgs_llog.c | 93 ++++++++++++++++++++++++++++---------- lustre/obdclass/obd_mount.c | 43 +++++++++++------- lustre/ptlrpc/pack_generic.c | 19 +++++--- lustre/ptlrpc/ptlrpc_module.c | 2 +- lustre/utils/mkfs_lustre.c | 37 ++++++++------- 14 files changed, 244 insertions(+), 128 deletions(-) diff --git a/lustre/include/linux/lustre_disk.h b/lustre/include/linux/lustre_disk.h index 5e2d051..d0a6039 100644 --- a/lustre/include/linux/lustre_disk.h +++ b/lustre/include/linux/lustre_disk.h @@ -68,21 +68,33 @@ static inline char *mt_str(enum ldd_mount_type mt) return mount_type_string[mt]; } -#define MAX_FAILOVER_NIDS 10 +#ifndef MTI_NIDS_MAX /* match lustre_idl.h */ +#define MTI_NIDS_MAX 10 +#endif struct lustre_disk_data { __u32 ldd_magic; - __u32 ldd_config_ver; /* we have integrated all llog steps - through this llog ver. */ + __u32 ldd_config_ver; /* not used? */ __u32 ldd_flags; /* LDD_SV_TYPE */ char ldd_fsname[64]; /* filesystem this server is part of */ - char ldd_svname[64]; /* this server's name (lustre-mdt0001) */ - __u16 ldd_svindex; /* server index (0001), must match svname */ - __u16 ldd_mgsnid_count; /* how many failover nids we have for the MGS */ - lnet_nid_t ldd_mgsnid[MAX_FAILOVER_NIDS]; /* mgmt nid list; lmd can override */ + char ldd_svname[64]; /* this server's name (lustre-mdt0001)*/ + __u16 ldd_svindex; /* server index (0001), must match + svname */ + __u16 ldd_mgsnid_count; + lnet_nid_t ldd_mgsnid[MTI_NIDS_MAX]; /* mgmt nid list; lmd can + override */ + __u16 ldd_failnid_count; /* server failover nid count */ + lnet_nid_t ldd_failnid[MTI_NIDS_MAX]; /* server failover nids */ enum ldd_mount_type ldd_mount_type; /* target fs type LDD_MT_* */ char ldd_mount_opts[1024]; /* target fs mount opts */ - char ldd_pad[1024]; + + /* Below here is required for writing mdt, ost,or client logs, + and is ignored after that. */ + int ldd_stripe_sz; + int ldd_stripe_count; + int ldd_stripe_pattern; + int ldd_stripe_offset; + int ldd_timeout; /* obd timeout */ }; #define IS_MDT(data) ((data)->ldd_flags & LDD_F_SV_TYPE_MDT) @@ -114,8 +126,9 @@ static inline void ldd_make_sv_name(struct lustre_disk_data *ldd) struct lustre_mount_data { __u32 lmd_magic; __u32 lmd_flags; /* lustre mount flags */ - __u16 lmd_mgsnid_count; /* how many failover nids we have for the MGS */ - lnet_nid_t lmd_mgsnid[MAX_FAILOVER_NIDS]; /* who to contact at startup */ + __u16 lmd_mgsnid_count; /* how many failover nids we have for + the MGS */ + lnet_nid_t lmd_mgsnid[MTI_NIDS_MAX];/* who to contact at startup */ char *lmd_dev; /* device or file system name */ char *lmd_opts; /* lustre mount options (as opposed to _device_ mount options) */ @@ -145,12 +158,6 @@ struct mkfs_opts { char mo_loopdev[128]; /* in case a loop dev is needed */ __u64 mo_device_sz; int mo_flags; - - /* Below here is required for writing mdt,ost,or client logs */ - int mo_stripe_sz; - int mo_stripe_count; - int mo_stripe_pattern; - int mo_timeout; /* obd timeout */ }; /****************** last_rcvd file *********************/ diff --git a/lustre/include/linux/lustre_idl.h b/lustre/include/linux/lustre_idl.h index 339e86c..488fe60 100644 --- a/lustre/include/linux/lustre_idl.h +++ b/lustre/include/linux/lustre_idl.h @@ -212,6 +212,7 @@ static inline void lustre_msg_set_op_flags(struct lustre_msg *msg, int flags) #define OBD_CONNECT_ACL 0x80ULL /* client using access control lists */ #define OBD_CONNECT_XATTR 0x100ULL /* client using extended attributes*/ #define OBD_CONNECT_CROW 0x200ULL /* MDS is expecting create-on-write */ +#define OBD_CONNECT_EMPTY 0x80000000ULL /* fake: these are empty connect flags*/ /* * set by servers supporting taking extent locks during obd_punch(). Currently * is requested by liblustre clients only. See bug 9528. @@ -939,22 +940,25 @@ typedef enum { #define MTI_NAME_MAXLEN 64 #define MTI_UUID_MAXLEN MTI_NAME_MAXLEN + 5 +#define MTI_NIDS_MAX 10 /* match lustre_disk.h */ struct mgmt_target_info { char mti_fsname[MTI_NAME_MAXLEN]; char mti_svname[MTI_NAME_MAXLEN]; - char mti_nodename[MTI_NAME_MAXLEN]; - char mti_uuid[MTI_UUID_MAXLEN]; - __u64 mti_nid; /* lnet_nid_t */ //nid list? - __u32 mti_config_ver; - __u32 mti_flags; - __u32 mti_stripe_index; - __u32 mti_stripe_pattern; /* PATTERN_RAID0, PATTERN_RAID1 */ + __u64 mti_nids[MTI_NIDS_MAX]; /* lnet_nid_t host nids */ + __u64 mti_failnids[MTI_NIDS_MAX]; /* partner nids */ __u64 mti_stripe_size; __u64 mti_stripe_offset; + __u32 mti_stripe_count; /* how many objects are used */ + __u32 mti_stripe_pattern; /* PATTERN_RAID0, PATTERN_RAID1*/ + __u32 mti_stripe_index; + __u32 mti_nid_count; + __u32 mti_failnid_count; + __u32 mti_config_ver; + __u32 mti_flags; }; -extern void lustre_swab_mgmt_target_info(struct mgmt_target_info *oinfo); +extern void lustre_swab_mgs_target_info(struct mgmt_target_info *oinfo); #define CM_START 0x01 #define CM_END 0x02 diff --git a/lustre/include/linux/obd.h b/lustre/include/linux/obd.h index ad45929..806cb15 100644 --- a/lustre/include/linux/obd.h +++ b/lustre/include/linux/obd.h @@ -377,7 +377,8 @@ struct mds_obd { struct semaphore mds_lov_sem; obd_id *mds_lov_objids; int mds_lov_objids_size; - __u32 mds_lov_objids_red; + __u32 mds_lov_objids_in_file; + unsigned int mds_lov_objids_dirty:1; int mds_lov_nextid_set; struct file *mds_lov_objid_filp; struct file *mds_health_check_filp; @@ -425,6 +426,7 @@ struct lov_obd { struct semaphore lov_lock; atomic_t refcount; struct lov_desc desc; + struct obd_connect_data ocd; int bufsize; int connects; int death_row; /* Do we have tgts scheduled to be deleted? diff --git a/lustre/lov/lov_log.c b/lustre/lov/lov_log.c index 4f51a97..325d4b1 100644 --- a/lustre/lov/lov_log.c +++ b/lustre/lov/lov_log.c @@ -101,9 +101,12 @@ static int lov_llog_origin_connect(struct llog_ctxt *ctxt, int count, int i, rc = 0; ENTRY; - if (count != lov->desc.ld_tgt_count ) + if (count != lov->desc.ld_tgt_count) CERROR("Origin connect mds cnt %d != lov cnt %d\n", count, lov->desc.ld_tgt_count); + /* count must match if we're doing all */ + LASSERT(uuid || (count == lov->desc.ld_tgt_count)); + for (i = 0, tgt = lov->tgts; i < count; i++, tgt++) { struct obd_device *child; struct llog_ctxt *cctxt; diff --git a/lustre/lov/lov_obd.c b/lustre/lov/lov_obd.c index abd97db..1adcbff 100644 --- a/lustre/lov/lov_obd.c +++ b/lustre/lov/lov_obd.c @@ -207,6 +207,10 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd, int rc, rc2, i; ENTRY; + lov->ocd.ocd_connect_flags = OBD_CONNECT_EMPTY; + if (data) + lov->ocd = *data; + rc = class_connect(conn, obd, cluuid); if (rc) RETURN(rc); @@ -433,6 +437,7 @@ lov_add_obd(struct obd_device *obd, struct obd_uuid *uuidp, int index, int gen) { struct lov_obd *lov = &obd->u.lov; struct lov_tgt_desc *tgt; + struct obd_connect_data *ocd = NULL; __u32 bufsize, idx; int rc; ENTRY; @@ -503,8 +508,16 @@ lov_add_obd(struct obd_device *obd, struct obd_uuid *uuidp, int index, int gen) osc_obd->obd_no_recov = 0; } - /* NULL may need to change when we use flags for osc's */ - rc = lov_connect_obd(obd, tgt, 1, NULL); + if (lov->ocd.ocd_connect_flags != OBD_CONNECT_EMPTY) { + /* Keep the original connect flags pristine */ + OBD_ALLOC(ocd, sizeof(*ocd)); + if (!ocd) + RETURN(-ENOMEM); + *ocd = lov->ocd; + } + rc = lov_connect_obd(obd, tgt, 1, ocd); + if (ocd) + OBD_FREE(ocd, sizeof(*ocd)); if (rc) GOTO(out, rc); @@ -516,7 +529,6 @@ lov_add_obd(struct obd_device *obd, struct obd_uuid *uuidp, int index, int gen) if (rc) { CERROR("add failed (%d), deleting %s\n", rc, (char *)tgt->uuid.uuid); - //lov_disconnect_obd(obd, tgt); lov_del_obd(obd, &tgt->uuid, index, 0); } RETURN(rc); diff --git a/lustre/mds/mds_lov.c b/lustre/mds/mds_lov.c index 225b809..f7f1250 100644 --- a/lustre/mds/mds_lov.c +++ b/lustre/mds/mds_lov.c @@ -49,8 +49,10 @@ void mds_lov_update_objids(struct obd_device *obd, obd_id *ids) lock_kernel(); for (i = 0; i < mds->mds_lov_desc.ld_tgt_count; i++) - if (ids[i] > (mds->mds_lov_objids)[i]) + if (ids[i] > (mds->mds_lov_objids)[i]) { (mds->mds_lov_objids)[i] = ids[i]; + mds->mds_lov_objids_dirty = 1; + } unlock_kernel(); EXIT; } @@ -64,6 +66,7 @@ static int mds_lov_read_objids(struct obd_device *obd) ENTRY; LASSERT(!mds->mds_lov_objids_size); + LASSERT(!mds->mds_lov_objids_dirty); /* Read everything in the file, even if our current lov desc has fewer targets. Old targets not in the lov descriptor @@ -83,17 +86,16 @@ static int mds_lov_read_objids(struct obd_device *obd) rc = fsfilt_read_record(obd, mds->mds_lov_objid_filp, ids, size, &off); if (rc < 0) { CERROR("Error reading objids %d\n", rc); - } else { - mds->mds_lov_objids_red = size / sizeof(*ids); - rc = 0; + RETURN(rc); } - - for (i = 0; i < mds->mds_lov_objids_red; i++) - //FIXME D_ERROR - CDEBUG(D_INFO|D_ERROR, "read last object "LPU64" for idx %d\n", + + mds->mds_lov_objids_in_file = size / sizeof(*ids); + + for (i = 0; i < mds->mds_lov_objids_in_file; i++) { + CDEBUG(D_INFO, "read last object "LPU64" for idx %d\n", mds->mds_lov_objids[i], i); - - RETURN(rc); + } + RETURN(0); } int mds_lov_write_objids(struct obd_device *obd) @@ -103,19 +105,26 @@ int mds_lov_write_objids(struct obd_device *obd) int i, rc, tgts; ENTRY; - tgts = max(mds->mds_lov_desc.ld_tgt_count, mds->mds_lov_objids_red); + if (!mds->mds_lov_objids_dirty) + RETURN(0); + + tgts = max(mds->mds_lov_desc.ld_tgt_count, mds->mds_lov_objids_in_file); if (!tgts) RETURN(0); for (i = 0; i < tgts; i++) - //FIXME D_ERROR - CDEBUG(D_INFO|D_ERROR, "writing last object "LPU64" for idx %d\n", + CDEBUG(D_INFO, "writing last object "LPU64" for idx %d\n", mds->mds_lov_objids[i], i); rc = fsfilt_write_record(obd, mds->mds_lov_objid_filp, mds->mds_lov_objids, tgts * sizeof(obd_id), &off, 0); + if (rc >= 0) { + mds->mds_lov_objids_dirty = 0; + rc = 0; + } + RETURN(rc); } @@ -189,17 +198,23 @@ static int mds_lov_update_desc(struct obd_device *obd, struct obd_export *lov) (size > mds->mds_lov_objids_size)) { obd_id *ids; - /* add room for a bunch at a time */ - size = (ld->ld_tgt_count + 8) * sizeof(obd_id); + /* add room by powers of 2 */ + size = 1; + while (size < ld->ld_tgt_count) + size = size << 1; + CERROR("Next size=%d\n", size); + size = size * sizeof(obd_id); OBD_ALLOC(ids, size); if (ids == NULL) GOTO(out, rc = -ENOMEM); memset(ids, 0, size); if (mds->mds_lov_objids_size) { + obd_id *old_ids = mds->mds_lov_objids; memcpy(ids, mds->mds_lov_objids, mds->mds_lov_objids_size); - OBD_FREE(mds->mds_lov_objids, mds->mds_lov_objids_size); + mds->mds_lov_objids = ids; + OBD_FREE(old_ids, mds->mds_lov_objids_size); } mds->mds_lov_objids = ids; mds->mds_lov_objids_size = size; @@ -226,8 +241,8 @@ static int mds_lov_add_ost(struct obd_device *obd, struct obd_device *watched, int rc = 0; ENTRY; - //FIXME remove D_ERROR - CDEBUG(D_CONFIG|D_ERROR, "Updating mds lov for OST idx %d\n", idx); + //FIXME remove D_WARNING + CDEBUG(D_CONFIG|D_WARNING, "Updating mds lov for OST idx %d\n", idx); old_count = mds->mds_lov_desc.ld_tgt_count; rc = mds_lov_update_desc(obd, mds->mds_osc_exp); @@ -240,7 +255,7 @@ static int mds_lov_add_ost(struct obd_device *obd, struct obd_device *watched, RETURN(-EINVAL); } - if (idx >= mds->mds_lov_objids_red) { + if (idx >= mds->mds_lov_objids_in_file) { /* We never read this lastid; ask the osc */ obd_id lastid; __u32 size = sizeof(lastid); @@ -252,6 +267,7 @@ static int mds_lov_add_ost(struct obd_device *obd, struct obd_device *watched, mds->mds_lov_objids[idx] = lastid; CWARN("got last object "LPU64" from OST %d\n", mds->mds_lov_objids[idx], idx); + mds->mds_lov_objids_dirty = 1; mds_lov_write_objids(obd); } else { /* We did read this lastid; tell the osc */ @@ -322,7 +338,7 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name) /* If we're mounting this code for the first time on an existing FS, * we need to populate the objids array from the real OST values */ - if (mds->mds_lov_desc.ld_tgt_count > mds->mds_lov_objids_red) { + if (mds->mds_lov_desc.ld_tgt_count > mds->mds_lov_objids_in_file) { int size = sizeof(obd_id) * mds->mds_lov_desc.ld_tgt_count; rc = obd_get_info(mds->mds_osc_exp, strlen("last_id"), "last_id", &size, mds->mds_lov_objids); @@ -330,6 +346,7 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name) for (i = 0; i < mds->mds_lov_desc.ld_tgt_count; i++) CWARN("got last object "LPU64" from OST %d\n", mds->mds_lov_objids[i], i); + mds->mds_lov_objids_dirty = 1; rc = mds_lov_write_objids(obd); if (rc) CERROR("got last objids from OSTs, but error " @@ -586,7 +603,7 @@ static int __mds_lov_synchronize(void *data) struct mds_obd *mds; struct obd_uuid *uuid = NULL; __u32 idx; - int rc = 0; + int rc = 0, have_sem = 0; ENTRY; obd = mlsi->mlsi_obd; @@ -600,9 +617,15 @@ static int __mds_lov_synchronize(void *data) LASSERT(obd != NULL); - /* Hold this throughout a synchronize, and wherever we - reference the contents of mds_lov_desc */ - down(&mds->mds_lov_sem); + /* We can't change the target count in one of these sync + threads while another sync thread is doing the clearorphans on + all the targets. */ + if (!watched || (idx != MLSI_NO_INDEX)) { + /* if we're syncing a particular target, or we're not + changing the target_count, then we don't need the sem */ + down(&mds->mds_lov_sem); + have_sem++; + } rc = obd_set_info(mds->mds_osc_exp, strlen(KEY_MDS_CONN), KEY_MDS_CONN, 0, uuid); @@ -643,7 +666,8 @@ static int __mds_lov_synchronize(void *data) } out: - up(&mds->mds_lov_sem); + if (have_sem) + up(&mds->mds_lov_sem); class_decref(obd); RETURN(rc); } diff --git a/lustre/mgc/mgc_request.c b/lustre/mgc/mgc_request.c index 914599e..96c88f1 100644 --- a/lustre/mgc/mgc_request.c +++ b/lustre/mgc/mgc_request.c @@ -570,7 +570,7 @@ int mgc_target_add(struct obd_export *exp, struct mgmt_target_info *mti) rc = ptlrpc_queue_wait(req); if (!rc) { rep_mti = lustre_swab_repbuf(req, 0, sizeof(*rep_mti), - lustre_swab_mgmt_target_info); + lustre_swab_mgs_target_info); memcpy(mti, rep_mti, sizeof(*rep_mti)); CDEBUG(D_MGC, "target_add %s got index = %d\n", mti->mti_svname, mti->mti_stripe_index); @@ -603,7 +603,7 @@ int mgc_target_del(struct obd_export *exp, struct mgmt_target_info *mti) if (!rc) { int index; rep_mti = lustre_swab_repbuf(req, 0, sizeof(*rep_mti), - lustre_swab_mgmt_target_info); + lustre_swab_mgs_target_info); index = rep_mti->mti_stripe_index; if (index != mti->mti_stripe_index) { CERROR ("OST DEL failed. rc=%d\n", index); diff --git a/lustre/mgs/mgs_handler.c b/lustre/mgs/mgs_handler.c index c71287b..acd9df5 100644 --- a/lustre/mgs/mgs_handler.c +++ b/lustre/mgs/mgs_handler.c @@ -304,18 +304,16 @@ static int mgs_handle_target_add(struct ptlrpc_request *req) ENTRY; mti = lustre_swab_reqbuf(req, 0, sizeof(*mti), - lustre_swab_mgmt_target_info); + lustre_swab_mgs_target_info); CDEBUG(D_MGS, "adding %s, index=%d\n", mti->mti_svname, mti->mti_stripe_index); /* set the new target index if needed */ - if (mti->mti_flags & LDD_F_NEED_INDEX) { - rc = mgs_set_next_index(obd, mti); - if (rc) { - CERROR("Can't get index (%d)\n", rc); - GOTO(out, rc); - } + rc = mgs_set_index(obd, mti); + if (rc) { + CERROR("Can't get index (%d)\n", rc); + GOTO(out, rc); } /* revoke the config lock so everyone will update */ diff --git a/lustre/mgs/mgs_internal.h b/lustre/mgs/mgs_internal.h index a142fc4..fad39a2 100644 --- a/lustre/mgs/mgs_internal.h +++ b/lustre/mgs/mgs_internal.h @@ -14,7 +14,7 @@ extern struct lvfs_callback_ops mgs_lvfs_ops; int mgs_init_db_list(struct obd_device *obd); int mgs_cleanup_db_list(struct obd_device *obd); -int mgs_set_next_index(struct obd_device *obd, struct mgmt_target_info *mti); +int mgs_set_index(struct obd_device *obd, struct mgmt_target_info *mti); int mgs_write_log_target(struct obd_device *obd, struct mgmt_target_info *mti); #endif diff --git a/lustre/mgs/mgs_llog.c b/lustre/mgs/mgs_llog.c index 45c69ff..91b3141 100644 --- a/lustre/mgs/mgs_llog.c +++ b/lustre/mgs/mgs_llog.c @@ -135,7 +135,7 @@ static int next_ost_index(void *index_map, int map_len) set_bit(i, index_map); return i; } - CERROR("max index exceeded.\n"); + CERROR("max index %d exceeded.\n", i); return -1; } @@ -269,7 +269,7 @@ static int mgs_find_or_make_db(struct obd_device *obd, char *name, return 0; } -int mgs_set_next_index(struct obd_device *obd, struct mgmt_target_info *mti) +int mgs_set_index(struct obd_device *obd, struct mgmt_target_info *mti) { struct fs_db *db; int rc = 0; @@ -280,8 +280,25 @@ int mgs_set_next_index(struct obd_device *obd, struct mgmt_target_info *mti) return rc; } - /* FIXME use mti->mti_stripe_index if given, report error if already - in use */ + if (!(mti->mti_flags & LDD_F_NEED_INDEX)) { + if (mti->mti_stripe_index >= INDEX_MAP_SIZE * 8) { + LCONSOLE_ERROR("Server %s requested index %d, but the" + "max index is %d.\n", + mti->mti_svname, mti->mti_stripe_index, + INDEX_MAP_SIZE * 8); + return -ERANGE; + } + if (test_bit(mti->mti_stripe_index, db->fd_index_map)) { + LCONSOLE_ERROR("Server %s requested index %d, but that" + "index is already in use.\n", + mti->mti_svname, mti->mti_stripe_index); + return -EADDRINUSE; + } else { + set_bit(mti->mti_stripe_index, db->fd_index_map); + return 0; + } + } + if (mti->mti_flags & LDD_F_SV_TYPE_OST) { rc = next_ost_index(db->fd_index_map, INDEX_MAP_SIZE); if (rc == -1) @@ -369,6 +386,14 @@ static inline int record_add_uuid(struct obd_device *obd, uint64_t nid, char *uuid) { return record_base(obd,llh,NULL,nid,LCFG_ADD_UUID,uuid,0,0,0); + +} + +static inline int record_add_conn(struct obd_device *obd, + struct llog_handle *llh, + char *uuid) +{ + return record_base(obd,llh,NULL,0,LCFG_ADD_CONN,uuid,0,0,0); } static inline int record_attach(struct obd_device *obd, struct llog_handle *llh, @@ -591,6 +616,7 @@ static int mgs_clear_log(struct obd_device *obd, char *name) /* lov is the first thing in the mdt and client logs */ static int mgs_write_log_lov(struct obd_device *obd, struct fs_db *db, + struct mgmt_target_info *mti, char *logname, char *lovname) { struct llog_handle *llh = NULL; @@ -614,10 +640,10 @@ static int mgs_write_log_lov(struct obd_device *obd, struct fs_db *db, /* Use defaults here, will fix them later with LCFG_PARAM */ lovdesc->ld_magic = LOV_DESC_MAGIC; lovdesc->ld_tgt_count = 0; - lovdesc->ld_pattern = 0; - lovdesc->ld_default_stripe_count = 1; - lovdesc->ld_default_stripe_size = 1024*1024; - lovdesc->ld_default_stripe_offset = 0; + lovdesc->ld_default_stripe_count = mti->mti_stripe_count; + lovdesc->ld_pattern = mti->mti_stripe_pattern; + lovdesc->ld_default_stripe_size = mti->mti_stripe_size; + lovdesc->ld_default_stripe_offset = mti->mti_stripe_offset; sprintf((char*)lovdesc->ld_uuid.uuid, "%s_UUID", lovname); /* can these be the same? */ uuid = (char *)lovdesc->ld_uuid.uuid; @@ -640,7 +666,8 @@ static int mgs_write_log_mdt(struct obd_device *obd, struct fs_db *db, { struct llog_handle *llh = NULL; char *cliname, *mdcname, *lovname, *nodeuuid, *mdsuuid, *mdcuuid; - int rc, first_log = 0; + lnet_nid_t nid; + int rc, i, first_log = 0; CDEBUG(D_MGS, "writing new mdt %s\n", mti->mti_svname); @@ -650,7 +677,7 @@ static int mgs_write_log_mdt(struct obd_device *obd, struct fs_db *db, /* This is the first time for all logs for this fs, since any ost should have already started the mdt log. */ first_log++; - rc = mgs_write_log_lov(obd, db, mti->mti_svname, + rc = mgs_write_log_lov(obd, db, mti, mti->mti_svname, lovname); } @@ -682,11 +709,11 @@ static int mgs_write_log_mdt(struct obd_device *obd, struct fs_db *db, name_create(mti->mti_fsname, "-clilov", &lovname); if (first_log) { /* Start client log */ - rc = mgs_write_log_lov(obd, db, cliname, lovname); + rc = mgs_write_log_lov(obd, db, mti, cliname, lovname); } - /* Add the mdt info to the client */ - name_create(libcfs_nid2str(mti->mti_nid), "_UUID", &nodeuuid); + /* Add the mdt info to the client log */ + name_create(libcfs_nid2str(mti->mti_nids[0]), /*"_UUID"*/"", &nodeuuid); name_create(mti->mti_svname, "-mdc", &mdcname); name_create(mdcname, "_UUID", &mdcuuid); /* @@ -698,13 +725,19 @@ static int mgs_write_log_mdt(struct obd_device *obd, struct fs_db *db, #14 L mount_option 0: 1:client 2:lov1 3:MDC_uml1_mdsA_MNT_client */ rc = record_start_log(obd, &llh, cliname); - /* FIXME can we just use the nid as the node uuid, or do we really - need the hostname? */ rc = record_marker(obd, llh, db, CM_START, "add mdc"); - rc = record_add_uuid(obd, llh, mti->mti_nid, nodeuuid); + for (i = 0; i < mti->mti_nid_count; i++) { + CERROR("add nid %s\n", libcfs_nid2str(mti->mti_nids[i])); + rc = record_add_uuid(obd, llh, mti->mti_nids[i], nodeuuid); + } rc = record_attach(obd, llh, mdcname, LUSTRE_MDC_NAME, mdcuuid); rc = record_setup(obd,llh,mdcname,mdsuuid,nodeuuid,0,0); - /* FIXME add uuid, add_conn for failover mdt's */ + for (i = 0; i < mti->mti_failnid_count; i++) { + nid = mti->mti_failnids[i]; + CERROR("add failover nid %s\n", libcfs_nid2str(nid)); + rc = record_add_uuid(obd, llh, nid, libcfs_nid2str(nid)); + rc = record_add_conn(obd, llh, libcfs_nid2str(nid)); + } rc = record_mount_opt(obd, llh, cliname, lovname, mdcname); rc = record_marker(obd, llh, db, CM_END, "add mdc"); rc = record_end_log(obd, &llh); @@ -726,17 +759,18 @@ static int mgs_write_log_osc(struct obd_device *obd, struct fs_db *db, struct llog_handle *llh = NULL; char *nodeuuid, *oscname, *oscuuid, *lovuuid; char index[5]; - int rc; + lnet_nid_t nid; + int i, rc; if (mgs_log_is_empty(obd, logname)) { /* The first time an osc is added, setup the lov */ - rc = mgs_write_log_lov(obd, db, logname, lovname); + rc = mgs_write_log_lov(obd, db, mti, logname, lovname); } CDEBUG(D_MGS, "adding osc for %s to log %s\n", mti->mti_svname, logname); - name_create(libcfs_nid2str(mti->mti_nid), "_UUID", &nodeuuid); + name_create(libcfs_nid2str(mti->mti_nids[0]), /*"_UUID"*/"", &nodeuuid); name_create(mti->mti_svname, "-osc", &oscname); name_create(oscname, "_UUID", &oscuuid); name_create(lovname, "_UUID", &lovuuid); @@ -751,10 +785,18 @@ static int mgs_write_log_osc(struct obd_device *obd, struct fs_db *db, */ rc = record_start_log(obd, &llh, logname); rc = record_marker(obd, llh, db, CM_START, "add osc"); - rc = record_add_uuid(obd, llh, mti->mti_nid, nodeuuid); + for (i = 0; i < mti->mti_nid_count; i++) { + CERROR("add nid %s\n", libcfs_nid2str(mti->mti_nids[i])); + rc = record_add_uuid(obd, llh, mti->mti_nids[i], nodeuuid); + } rc = record_attach(obd, llh, oscname, LUSTRE_OSC_NAME, lovuuid); rc = record_setup(obd, llh, oscname, ostuuid, nodeuuid, 0, 0); - /* FIXME add uuid, add_conn for failover ost's */ + for (i = 0; i < mti->mti_failnid_count; i++) { + nid = mti->mti_failnids[i]; + CERROR("add failover nid %s\n", libcfs_nid2str(nid)); + rc = record_add_uuid(obd, llh, nid, libcfs_nid2str(nid)); + rc = record_add_conn(obd, llh, libcfs_nid2str(nid)); + } snprintf(index, sizeof(index), "%d", mti->mti_stripe_index); rc = record_lov_add(obd,llh, lovname, ostuuid, index,"1"/*generation*/); rc = record_marker(obd, llh, db, CM_END, "add osc"); @@ -786,8 +828,11 @@ static int mgs_write_log_ost(struct obd_device *obd, struct fs_db *db, Heck, what do we do about the client and mds logs? We better abort. */ if (!mgs_log_is_empty(obd, mti->mti_svname)) { - CERROR("The config log for %s already exists, not adding.\n", - mti->mti_svname); + LCONSOLE_ERROR("The config log for %s already exists, yet the " + "server claims it never registered. It may have" + " been reformatted, or the index changed. This " + "must be resolved before this server can be " + "added.\n", mti->mti_svname); return -EALREADY; } /* diff --git a/lustre/obdclass/obd_mount.c b/lustre/obdclass/obd_mount.c index 5ac4bc2..df5ca72 100644 --- a/lustre/obdclass/obd_mount.c +++ b/lustre/obdclass/obd_mount.c @@ -319,6 +319,8 @@ static int ldd_write(struct lvfs_run_ctxt *mount_ctxt, ENTRY; LASSERT(ldd->ldd_magic == LDD_MAGIC); + + ldd->ldd_config_ver++; push_ctxt(&saved, mount_ctxt, NULL); @@ -842,26 +844,33 @@ static int server_add_target(struct super_block *sb, struct vfsmount *mnt) sizeof(mti->mti_fsname)); strncpy(mti->mti_svname, ldd->ldd_svname, sizeof(mti->mti_svname)); - // char mti_nodename[NAME_MAXLEN]; - // char mti_uuid[UUID_MAXLEN]; - /* FIXME nid 0 is lo generally, need to send all non-lo nids */ + + mti->mti_nid_count = 0; while ((rc = LNetGetId(i++, &id)) != -ENOENT) { if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND) continue; - /* FIXME use all non-lo nids, not just first */ - break; + mti->mti_nids[mti->mti_nid_count] = id.nid; + mti->mti_nid_count++; + if (mti->mti_nid_count >= MTI_NIDS_MAX) { + CWARN("Only using first %d nids for %s\n", + mti->mti_nid_count, mti->mti_svname); + break; + } } - mti->mti_nid = id.nid; + + memcpy(mti->mti_failnids, ldd->ldd_failnid, sizeof(mti->mti_failnids)); + mti->mti_failnid_count = ldd->ldd_failnid_count; mti->mti_config_ver = 0; mti->mti_flags = ldd->ldd_flags; mti->mti_stripe_index = ldd->ldd_svindex; - mti->mti_stripe_pattern = 0; //FIXME - mti->mti_stripe_size = 1024*1024; //FIXME - mti->mti_stripe_offset = 0; //FIXME + mti->mti_stripe_count = ldd->ldd_stripe_count; + mti->mti_stripe_pattern = ldd->ldd_stripe_pattern; + mti->mti_stripe_size = ldd->ldd_stripe_sz; + mti->mti_stripe_offset = ldd->ldd_stripe_offset; CDEBUG(D_MOUNT, "Initial registration %s, fs=%s, %s, index=%04x\n", mti->mti_svname, mti->mti_fsname, - libcfs_nid2str(mti->mti_nid), mti->mti_stripe_index); + libcfs_nid2str(mti->mti_nids[0]), mti->mti_stripe_index); /* Register the target */ /* FIXME use mdc_process_config instead */ @@ -881,17 +890,17 @@ static int server_add_target(struct super_block *sb, struct vfsmount *mnt) " %s\n", ldd->ldd_svindex, mti->mti_stripe_index, mti->mti_svname); - ldd->ldd_flags &= ~(LDD_F_NEED_INDEX | LDD_F_NEED_REGISTER); - /* This server has never been started, so has no config */ - ldd->ldd_config_ver = 0; ldd->ldd_svindex = mti->mti_stripe_index; strncpy(ldd->ldd_svname, mti->mti_svname, sizeof(ldd->ldd_svname)); /* or ldd_make_sv_name(ldd); */ - ldd_write(&mgc->obd_lvfs_ctxt, ldd); /* FIXME write last_rcvd?, disk label? */ } + /* Always write out the new flags */ + ldd->ldd_flags &= ~(LDD_F_NEED_INDEX | LDD_F_NEED_REGISTER); + ldd_write(&mgc->obd_lvfs_ctxt, ldd); + out: if (mti) OBD_FREE(mti, sizeof(*mti)); @@ -942,7 +951,7 @@ static int server_start_targets(struct super_block *sb, struct vfsmount *mnt) to read and write configs locally. */ server_mgc_set_fs(lsi->lsi_mgc, sb); - /* Get a new index if needed */ + /* Register if needed */ if (lsi->lsi_ldd->ldd_flags & (LDD_F_NEED_INDEX | LDD_F_NEED_REGISTER)) { CDEBUG(D_MOUNT, "Need new target index from MGS\n"); @@ -1304,7 +1313,7 @@ static int server_fill_super(struct super_block *sb) /* append ldd nids to lmd nids */ for (i = 0; (i < lsi->lsi_ldd->ldd_mgsnid_count) && - (lsi->lsi_lmd->lmd_mgsnid_count < MAX_FAILOVER_NIDS); i++) { + (lsi->lsi_lmd->lmd_mgsnid_count < MTI_NIDS_MAX); i++) { lsi->lsi_lmd->lmd_mgsnid[lsi->lsi_lmd->lmd_mgsnid_count++] = lsi->lsi_ldd->ldd_mgsnid[i]; } @@ -1475,7 +1484,7 @@ static int lmd_parse(char *options, struct lustre_mount_data *lmd) LCONSOLE_ERROR("Can't parse NID '%s'\n", s1); goto invalid; } - if (lmd->lmd_mgsnid_count >= MAX_FAILOVER_NIDS) { + if (lmd->lmd_mgsnid_count >= MTI_NIDS_MAX) { LCONSOLE_ERROR("Too many NIDs: '%s'\n", s1); goto invalid; } diff --git a/lustre/ptlrpc/pack_generic.c b/lustre/ptlrpc/pack_generic.c index bd2da48..679d741 100644 --- a/lustre/ptlrpc/pack_generic.c +++ b/lustre/ptlrpc/pack_generic.c @@ -566,15 +566,22 @@ void lustre_swab_mds_body (struct mds_body *b) CLASSERT(offsetof(typeof(*b), padding_4) != 0); } -void lustre_swab_mgmt_target_info(struct mgmt_target_info *mti) +void lustre_swab_mgs_target_info(struct mgmt_target_info *mti) { - __swab64s(&mti->mti_nid); - __swab32s(&mti->mti_config_ver); - __swab32s(&mti->mti_flags); - __swab32s(&mti->mti_stripe_index); - __swab32s(&mti->mti_stripe_pattern); + int i; + for (i = 0; i < MTI_NIDS_MAX; i++) { + __swab64s(&mti->mti_nids[i]); + __swab64s(&mti->mti_failnids[i]); + } __swab64s(&mti->mti_stripe_size); __swab64s(&mti->mti_stripe_offset); + __swab32s(&mti->mti_stripe_count); + __swab32s(&mti->mti_stripe_pattern); + __swab32s(&mti->mti_stripe_index); + __swab32s(&mti->mti_nid_count); + __swab32s(&mti->mti_failnid_count); + __swab32s(&mti->mti_config_ver); + __swab32s(&mti->mti_flags); } static void lustre_swab_obd_dqinfo (struct obd_dqinfo *i) diff --git a/lustre/ptlrpc/ptlrpc_module.c b/lustre/ptlrpc/ptlrpc_module.c index 83b5766..5738266 100644 --- a/lustre/ptlrpc/ptlrpc_module.c +++ b/lustre/ptlrpc/ptlrpc_module.c @@ -183,7 +183,7 @@ EXPORT_SYMBOL(lustre_swab_ldlm_lock_desc); EXPORT_SYMBOL(lustre_swab_ldlm_request); EXPORT_SYMBOL(lustre_swab_ldlm_reply); EXPORT_SYMBOL(lustre_swab_qdata); -EXPORT_SYMBOL(lustre_swab_mgmt_target_info); +EXPORT_SYMBOL(lustre_swab_mgs_target_info); /* recover.c */ EXPORT_SYMBOL(ptlrpc_run_recovery_over_upcall); diff --git a/lustre/utils/mkfs_lustre.c b/lustre/utils/mkfs_lustre.c index d53a6a6..c9f0f53 100644 --- a/lustre/utils/mkfs_lustre.c +++ b/lustre/utils/mkfs_lustre.c @@ -138,6 +138,7 @@ int get_os_version() return version; } +/* FIXME use popen */ int run_command(char *cmd) { int i = 0,ret = 0; @@ -440,11 +441,11 @@ int make_lustre_backfs(struct mkfs_opts *mop) if (strstr(mop->mo_mkfsopts, "-I") == NULL) { long inode_size = 0; if (IS_MDT(&mop->mo_ldd)) { - if (mop->mo_stripe_count > 77) + if (mop->mo_ldd.ldd_stripe_count > 77) inode_size = 512; /* bz 7241 */ - else if (mop->mo_stripe_count > 34) + else if (mop->mo_ldd.ldd_stripe_count > 34) inode_size = 2048; - else if (mop->mo_stripe_count > 13) + else if (mop->mo_ldd.ldd_stripe_count > 13) inode_size = 1024; else inode_size = 512; @@ -599,7 +600,7 @@ out_rmdir: void set_defaults(struct mkfs_opts *mop) { mop->mo_ldd.ldd_magic = LDD_MAGIC; - mop->mo_ldd.ldd_config_ver = 0; + mop->mo_ldd.ldd_config_ver = 1; mop->mo_ldd.ldd_flags = LDD_F_NEED_INDEX | LDD_F_NEED_REGISTER; mop->mo_ldd.ldd_mgsnid_count = 0; strcpy(mop->mo_ldd.ldd_fsname, "lustre"); @@ -609,7 +610,9 @@ void set_defaults(struct mkfs_opts *mop) mop->mo_ldd.ldd_mount_type = LDD_MT_LDISKFS; mop->mo_ldd.ldd_svindex = -1; - mop->mo_stripe_count = 1; + mop->mo_ldd.ldd_stripe_count = 1; + mop->mo_ldd.ldd_stripe_sz = 1024 * 1024; + mop->mo_ldd.ldd_stripe_pattern = 0; } static inline void badopt(char opt, char *type) @@ -685,7 +688,12 @@ int main(int argc , char *const argv[]) case 'c': if (IS_MDT(&mop.mo_ldd)) { int stripe_count = atol(optarg); - mop.mo_stripe_count = stripe_count; + if (stripe_count <= 0) { + fprintf(stderr, "%s: bad stripe count " + "%d\n", progname, stripe_count); + exit(1); + } + mop.mo_ldd.ldd_stripe_count = stripe_count; } else { badopt(opt, "MDT"); } @@ -724,9 +732,9 @@ int main(int argc , char *const argv[]) while ((s2 = strsep(&s1, ","))) { mop.mo_ldd.ldd_mgsnid[i++] = libcfs_str2nid(s2); - if (i >= MAX_FAILOVER_NIDS) { - fprintf(stderr, "%s: too many MGS nids, " - "ignoring %s\n", progname, s1); + if (i >= MTI_NIDS_MAX) { + fprintf(stderr, "%s: too many MGS nids," + " ignoring %s\n", progname, s1); break; } } @@ -762,12 +770,12 @@ int main(int argc , char *const argv[]) break; case 's': if (IS_MDT(&mop.mo_ldd)) - mop.mo_stripe_sz = atol(optarg) * 1024; + mop.mo_ldd.ldd_stripe_sz = atol(optarg) * 1024; else badopt(opt, "MDT"); break; case 't': - mop.mo_timeout = atol(optarg); + mop.mo_ldd.ldd_timeout = atol(optarg); break; case 'v': verbose++; @@ -815,8 +823,8 @@ int main(int argc , char *const argv[]) "(is the lnet module loaded?)\n", progname); } else { if (i > 0) { - if (i > MAX_FAILOVER_NIDS) - i = MAX_FAILOVER_NIDS; + if (i > MTI_NIDS_MAX) + i = MTI_NIDS_MAX; vprint("Adding %d local nids for MGS\n", i); memcpy(mop.mo_ldd.ldd_mgsnid, nids, sizeof(mop.mo_ldd.ldd_mgsnid)); @@ -833,9 +841,6 @@ int main(int argc , char *const argv[]) goto out; } - if (IS_MDT(&mop.mo_ldd) && (mop.mo_stripe_sz == 0)) - mop.mo_stripe_sz = 1024 * 1024; - strcpy(mop.mo_device, argv[optind]); /* These are the permanent mount options (always included) */ -- 1.8.3.1