From d1b3ac0a969cc5241c51b6ee60ab258c64ae2373 Mon Sep 17 00:00:00 2001 From: nathan Date: Thu, 12 Jan 2006 22:43:04 +0000 Subject: [PATCH] Branch b1_4_mountconf b=4482 During an ost_add, mds_lov_sync might be calling clearorphans on all osc's before the mds has a chance to tell the osc what it's last transno is, in which case clearorphans resets it to 0. So change mds_lov_sync to only sync a single target at a time; then the clearorphans call can only sync something the mds knows about. This also allows syncing of all osc's in parallel, and removes all requirements for an mds_lov_desc lock in the sync. --- lustre/include/linux/obd.h | 8 +++++-- lustre/lov/lov_obd.c | 50 +++++++++++++++++++++++++++++---------- lustre/mdc/mdc_request.c | 2 +- lustre/mds/handler.c | 11 +++++---- lustre/mds/mds_lov.c | 58 ++++++++++++++++++++-------------------------- lustre/mgc/mgc_request.c | 3 ++- lustre/utils/mkfs_lustre.c | 3 ++- 7 files changed, 81 insertions(+), 54 deletions(-) diff --git a/lustre/include/linux/obd.h b/lustre/include/linux/obd.h index 806cb15..da434e1 100644 --- a/lustre/include/linux/obd.h +++ b/lustre/include/linux/obd.h @@ -318,6 +318,7 @@ struct client_obd { struct mdc_rpc_lock *cl_rpc_lock; struct mdc_rpc_lock *cl_setattr_lock; struct osc_creator cl_oscc; + /* mgc datastruct */ struct semaphore cl_mgc_sem; struct vfsmount *cl_mgc_vfsmnt; @@ -374,7 +375,6 @@ struct mds_obd { char *mds_profile; struct obd_export *mds_osc_exp; /* XXX lov_exp */ struct lov_desc mds_lov_desc; - struct semaphore mds_lov_sem; obd_id *mds_lov_objids; int mds_lov_objids_size; __u32 mds_lov_objids_in_file; @@ -540,7 +540,10 @@ enum obd_notify_event { /* Device deactivated */ OBD_NOTIFY_INACTIVE, /* Connect data for import were changed */ - OBD_NOTIFY_OCD + OBD_NOTIFY_OCD, + /* Sync request */ + OBD_NOTIFY_SYNC_NONBLOCK, + OBD_NOTIFY_SYNC }; /* @@ -831,5 +834,6 @@ static inline void obd_transno_commit_cb(struct obd_device *obd, __u64 transno, /* get/set_info keys */ #define KEY_MDS_CONN "mds_conn" #define KEY_NEXT_ID "next_id" +#define KEY_LOVDESC "lovdesc" #endif /* __OBD_H */ diff --git a/lustre/lov/lov_obd.c b/lustre/lov/lov_obd.c index 02be24d..0dcda79 100644 --- a/lustre/lov/lov_obd.c +++ b/lustre/lov/lov_obd.c @@ -399,18 +399,21 @@ static int lov_set_osc_active(struct lov_obd *lov, struct obd_uuid *uuid, static int lov_notify(struct obd_device *obd, struct obd_device *watched, enum obd_notify_event ev, void *data) { - int rc; - struct obd_uuid *uuid; - - if (strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME)) { - CERROR("unexpected notification of %s %s!\n", - watched->obd_type->typ_name, - watched->obd_name); - return -EINVAL; - } - uuid = &watched->u.cli.cl_import->imp_target_uuid; + int rc = 0; if (ev == OBD_NOTIFY_ACTIVE || ev == OBD_NOTIFY_INACTIVE) { + struct obd_uuid *uuid; + + LASSERT(watched); + + if (strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME)) { + CERROR("unexpected notification of %s %s!\n", + watched->obd_type->typ_name, + watched->obd_name); + return -EINVAL; + } + uuid = &watched->u.cli.cl_import->imp_target_uuid; + /* Set OSC as active before notifying the observer, so the * observer can use the OSC normally. */ @@ -427,7 +430,29 @@ static int lov_notify(struct obd_device *obd, struct obd_device *watched, } /* Pass the notification up the chain. */ - rc = obd_notify_observer(obd, watched, ev, data); + if (watched) { + rc = obd_notify_observer(obd, watched, ev, data); + } else { + /* NULL watched means all osc's in the lov (only for syncs) */ + struct lov_obd *lov = &obd->u.lov; + struct lov_tgt_desc *tgt; + struct obd_device *tgt_obd; + int i; + for (i = 0, tgt = lov->tgts; i < lov->desc.ld_tgt_count; + i++, tgt++) { + if (obd_uuid_empty(&tgt->uuid)) + continue; + tgt_obd = class_exp2obd(tgt->ltd_exp); + rc = obd_notify_observer(obd, tgt_obd, ev, data); + if (rc) { + CERROR("%s: notify %s of %s failed %d\n", + obd->obd_name, + obd->obd_observer->obd_name, + tgt_obd->obd_name, rc); + break; + } + } + } RETURN(rc); } @@ -2186,7 +2211,8 @@ static int lov_get_info(struct obd_export *exp, __u32 keylen, GOTO(out, rc); } GOTO(out, rc = 0); - } else if (keylen >= strlen("lovdesc") && strcmp(key, "lovdesc") == 0) { + } else if (keylen >= strlen(KEY_LOVDESC) && + strcmp(key, KEY_LOVDESC) == 0) { struct lov_desc *desc_ret = val; *desc_ret = lov->desc; diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c index dba5121..f903897 100644 --- a/lustre/mdc/mdc_request.c +++ b/lustre/mdc/mdc_request.c @@ -1074,7 +1074,7 @@ int mdc_init_ea_size(struct obd_export *mdc_exp, struct obd_export *lov_exp) if (cli->cl_max_mds_easize < size) cli->cl_max_mds_easize = size; - rc = obd_get_info(lov_exp, strlen("lovdesc") + 1, "lovdesc", + rc = obd_get_info(lov_exp, strlen(KEY_LOVDESC) + 1, KEY_LOVDESC, &valsize, &desc); if (rc) RETURN(rc); diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c index 227a643..41bd97f 100644 --- a/lustre/mds/handler.c +++ b/lustre/mds/handler.c @@ -1528,7 +1528,6 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf) sema_init(&mds->mds_orphan_recovery_sem, 1); sema_init(&mds->mds_epoch_sem, 1); - sema_init(&mds->mds_lov_sem, 1); spin_lock_init(&mds->mds_transno_lock); mds->mds_max_mdsize = sizeof(struct lov_mds_md); mds->mds_max_cookiesize = sizeof(struct llog_cookie); @@ -1753,7 +1752,7 @@ int mds_postrecov(struct obd_device *obd) LASSERT(!obd->obd_recovering); LASSERT(llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT) != NULL); - /* FIXME just put this in the synchronize, why not? */ + /* FIXME why not put this in the synchronize? */ /* set nextid first, so we are sure it happens */ rc = mds_lov_set_nextid(obd); if (rc) @@ -1767,8 +1766,12 @@ int mds_postrecov(struct obd_device *obd) item = rc; } - /* Does target_finish_recovery really need this to be synchronous? */ - mds_lov_start_synchronize(obd, NULL, NULL, obd->obd_async_recov); + /* FIXME Does target_finish_recovery really need this to block? */ + /* Notify the LOV, which will in turn call mds_notify for each tgt */ + obd_notify(obd->u.mds.mds_osc_obd, NULL, + obd->obd_async_recov ? OBD_NOTIFY_SYNC_NONBLOCK : + OBD_NOTIFY_SYNC, NULL); + //mds_lov_start_synchronize(obd, NULL, NULL, obd->obd_async_recov); out: RETURN(rc < 0 ? rc : item); diff --git a/lustre/mds/mds_lov.c b/lustre/mds/mds_lov.c index e987e51..3281570 100644 --- a/lustre/mds/mds_lov.c +++ b/lustre/mds/mds_lov.c @@ -138,16 +138,6 @@ int mds_lov_clearorphans(struct mds_obd *mds, struct obd_uuid *ost_uuid) LASSERT(mds->mds_lov_objids != NULL); - // FIXME remove all this debug stuff - CERROR("Clearorphans, %d targets\n", mds->mds_lov_desc.ld_tgt_count); - for (rc = 0; rc < mds->mds_lov_desc.ld_tgt_count; rc++) - CDEBUG(D_WARNING, "clearorphans "LPU64" for idx %d\n", - mds->mds_lov_objids[rc], rc); - /* FIXME --- can't clearorphans for lov tgts that the mds does not - know about yet! can only clearorphans on - mds->mds_lov_desc.ld_tgt_count (if that - if late tgt joins - first, is that okay?) */ - /* This create will in fact either create or destroy: If the OST is * missing objects below this ID, they will be created. If it finds * objects above this ID, they will be removed. */ @@ -197,7 +187,7 @@ static int mds_lov_update_desc(struct obd_device *obd, struct obd_export *lov) if (!ld) RETURN(-ENOMEM); - rc = obd_get_info(lov, strlen("lovdesc") + 1, "lovdesc", + rc = obd_get_info(lov, strlen(KEY_LOVDESC) + 1, KEY_LOVDESC, &valsize, ld); if (rc) GOTO(out, rc); @@ -280,7 +270,7 @@ static int mds_lov_add_ost(struct obd_device *obd, struct obd_device *watched, mds->mds_lov_objids_dirty = 1; mds_lov_write_objids(obd); } else { - /* We did read this lastid; tell the osc */ + /* We have read this lastid from disk; tell the osc */ rc = mds_lov_set_nextid(obd); } @@ -613,7 +603,7 @@ static int __mds_lov_synchronize(void *data) struct mds_obd *mds; struct obd_uuid *uuid = NULL; __u32 idx; - int rc = 0, have_sem = 0; + int rc = 0; ENTRY; obd = mlsi->mlsi_obd; @@ -625,17 +615,14 @@ static int __mds_lov_synchronize(void *data) OBD_FREE(mlsi, sizeof(*mlsi)); - LASSERT(obd != NULL); + LASSERT(obd); - /* We can't change the target count in one of these sync - threads while another sync thread is doing clearorphans on - all the targets. - If we're syncing a particular target, or we're not - changing the target_count, then we don't need the sem */ - if (!watched || (idx != MLSI_NO_INDEX)) { - down(&mds->mds_lov_sem); - have_sem++; - } + /* We only sync one osc at a time, so that we don't have to hold + any kind of lock on the whole mds_lov_desc, which may change + (grow) as a result of mds_lov_add_ost. This also avoids any + kind of mismatch between the lov_desc and the mds_lov_desc, + which are not in lock-step during lov_add_obd */ + LASSERT(uuid); if (idx != MLSI_NO_INDEX) { rc = mds_lov_add_ost(obd, watched, idx); @@ -658,12 +645,8 @@ static int __mds_lov_synchronize(void *data) GOTO(out, rc); } - if (uuid) - CWARN("MDS %s: %s now active, resetting orphans\n", - obd->obd_name, (char *)uuid->uuid); - else - CWARN("MDS %s: All %d OSC's now active, resetting orphans\n", - obd->obd_name, mds->mds_lov_desc.ld_tgt_count); + CWARN("MDS %s: %s now active, resetting orphans\n", + obd->obd_name, (char *)uuid->uuid); if (obd->obd_stopping) GOTO(out, rc = -ENODEV); @@ -676,8 +659,6 @@ static int __mds_lov_synchronize(void *data) } out: - if (have_sem) - up(&mds->mds_lov_sem); class_decref(obd); RETURN(rc); } @@ -708,6 +689,8 @@ int mds_lov_start_synchronize(struct obd_device *obd, ENTRY; + LASSERT(watched); + OBD_ALLOC(mlsi, sizeof(*mlsi)); if (mlsi == NULL) RETURN(-ENOMEM); @@ -755,8 +738,16 @@ int mds_notify(struct obd_device *obd, struct obd_device *watched, int rc = 0; ENTRY; - if (ev != OBD_NOTIFY_ACTIVE) + switch (ev) { + case OBD_NOTIFY_ACTIVE: + case OBD_NOTIFY_SYNC: + case OBD_NOTIFY_SYNC_NONBLOCK: + break; + default: RETURN(0); + } + + CDEBUG(D_WARNING, "notify %s ev=%d\n", watched->obd_name, ev); if (strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME)) { CERROR("unexpected notification of %s %s!\n", @@ -773,7 +764,8 @@ int mds_notify(struct obd_device *obd, struct obd_device *watched, } LASSERT(llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT) != NULL); - rc = mds_lov_start_synchronize(obd, watched, data, 1); + rc = mds_lov_start_synchronize(obd, watched, data, + !(ev == OBD_NOTIFY_SYNC)); RETURN(rc); } diff --git a/lustre/mgc/mgc_request.c b/lustre/mgc/mgc_request.c index 96c88f1..71c0553 100644 --- a/lustre/mgc/mgc_request.c +++ b/lustre/mgc/mgc_request.c @@ -83,7 +83,8 @@ static struct config_llog_data *config_log_get(char *logname, int match_instance = 0; if (cfg) { - CDEBUG(D_MGC, "get log %s:%s\n", logname, cfg->cfg_instance); + CDEBUG(D_MGC, "get log %s:%s\n", logname ? logname : "-", + cfg->cfg_instance ? cfg->cfg_instance : "-"); if (cfg->cfg_instance) match_instance++; } diff --git a/lustre/utils/mkfs_lustre.c b/lustre/utils/mkfs_lustre.c index 9e8abd7..df65efe 100644 --- a/lustre/utils/mkfs_lustre.c +++ b/lustre/utils/mkfs_lustre.c @@ -532,7 +532,8 @@ void print_ldd(struct lustre_disk_data *ldd) printf("Flags: %s%s%s%s\n", IS_MDT(ldd) ? "MDT ":"", IS_OST(ldd) ? "OST ":"", IS_MGMT(ldd) ? "MGMT ":"", - ldd->ldd_flags & LDD_F_NEED_INDEX ? "needs_index ":""); + ldd->ldd_flags & LDD_F_NEED_INDEX ? "needs_index ":"", + ldd->ldd_flags & LDD_F_NEED_REGISTER ? "must_register ":""); printf("Persistent mount opts: %s\n", ldd->ldd_mount_opts); printf("MGS nids: "); for (i = 0; i < ldd->ldd_mgsnid_count; i++) { -- 1.8.3.1