From f30d9441d44cff9d58bd64e3f23fde3d2af609a3 Mon Sep 17 00:00:00 2001 From: nathan Date: Thu, 1 Nov 2007 01:55:50 +0000 Subject: [PATCH] b=12860 i=adilger i=green simultaneous MDT->OST connections at startup can cause the sync to abort, leaving the OSC in a bad state --- lustre/ChangeLog | 7 ++++ lustre/include/obd_support.h | 1 + lustre/lov/lov_obd.c | 16 +++++--- lustre/mds/handler.c | 3 +- lustre/mds/mds_lov.c | 96 +++++++++++++++++++++++++++---------------- lustre/osc/osc_create.c | 51 +++++++++++++---------- lustre/ptlrpc/client.c | 6 ++- lustre/tests/replay-single.sh | 5 ++- 8 files changed, 118 insertions(+), 67 deletions(-) diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 760b47b..b1f9d0f 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -498,6 +498,13 @@ Bugzilla : 13147 Description: block reactivating mgc import until all deactivates complete Details : Fix race when failing back MDT/MGS to itself (testing) +Severity : minor +Frequency : at statup only +Bugzilla : 12860 +Description: mds_lov_synchronize race leads to various problems +Details : simultaneous MDT->OST connections at startup can cause the + sync to abort, leaving the OSC in a bad state. + Severity : enhancement Bugzilla : 12194 Description: add optional extra BUILD_VERSION info diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index ca70d84..6d5efb3 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -113,6 +113,7 @@ extern int obd_race_state; #define OBD_FAIL_MDS_WRITEPAGE_PACK 0x13b #define OBD_FAIL_MDS_LLOG_CREATE_FAILED 0x13c #define OBD_FAIL_MDS_OSC_PRECREATE 0x13d +#define OBD_FAIL_MDS_LOV_SYNC_RACE 0x13e #define OBD_FAIL_OST 0x200 #define OBD_FAIL_OST_CONNECT_NET 0x201 diff --git a/lustre/lov/lov_obd.c b/lustre/lov/lov_obd.c index f293cfd..adfcf71 100644 --- a/lustre/lov/lov_obd.c +++ b/lustre/lov/lov_obd.c @@ -902,7 +902,7 @@ static int lov_clear_orphans(struct obd_export *export, struct obdo *src_oa, /* if called for a specific target, we don't care if it is not active. */ - if (!lov->lov_tgts[i]->ltd_active == 0 && ost_uuid == NULL) { + if (!lov->lov_tgts[i]->ltd_active && ost_uuid == NULL) { CDEBUG(D_HA, "lov idx %d inactive\n", i); continue; } @@ -2390,6 +2390,7 @@ static int lov_set_info_async(struct obd_export *exp, obd_count keylen, { struct obd_device *obddev = class_exp2obd(exp); struct lov_obd *lov = &obddev->u.lov; + obd_count count; int i, rc = 0, err; int no_set = !set; int incr = 0, check_uuid = 0, do_inactive = 0; @@ -2401,9 +2402,14 @@ static int lov_set_info_async(struct obd_export *exp, obd_count keylen, RETURN(-ENOMEM); } + lov_getref(obddev); + count = lov->desc.ld_tgt_count; + if (KEY_IS(KEY_NEXT_ID)) { - if (vallen != lov->desc.ld_tgt_count * sizeof(obd_id)) - RETURN(-EINVAL); + /* We must use mds's idea of # osts for indexing into + mds->mds_lov_objids */ + count = vallen / sizeof(obd_id); + LASSERT(count <= lov->desc.ld_tgt_count); vallen = sizeof(obd_id); incr = sizeof(obd_id); do_inactive = 1; @@ -2415,9 +2421,7 @@ static int lov_set_info_async(struct obd_export *exp, obd_count keylen, /* use defaults: do_inactive = incr = 0; */ } - lov_getref(obddev); - - for (i = 0; i < lov->desc.ld_tgt_count; i++, val = (char *)val + incr) { + for (i = 0; i < count; i++, val = (char *)val + incr) { /* OST was disconnected */ if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_exp) continue; diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c index eebf615..571d85b 100644 --- a/lustre/mds/handler.c +++ b/lustre/mds/handler.c @@ -2145,9 +2145,10 @@ int mds_postrecov(struct obd_device *obd) LASSERT(!obd->obd_recovering); LASSERT(llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT) != NULL); - /* FIXME why not put this in the synchronize? */ /* set nextid first, so we are sure it happens */ + mutex_down(&obd->obd_dev_sem); rc = mds_lov_set_nextid(obd); + mutex_up(&obd->obd_dev_sem); if (rc) { CERROR("%s: mds_lov_set_nextid failed %d\n", obd->obd_name, rc); diff --git a/lustre/mds/mds_lov.c b/lustre/mds/mds_lov.c index 5a33a92..f282de1 100644 --- a/lustre/mds/mds_lov.c +++ b/lustre/mds/mds_lov.c @@ -162,8 +162,10 @@ int mds_lov_set_nextid(struct obd_device *obd) ENTRY; LASSERT(!obd->obd_recovering); - LASSERT(mds->mds_lov_objids != NULL); + + /* obd->obd_dev_sem must be held so mds_lov_objids doesn't change */ + LASSERT_SEM_LOCKED(&obd->obd_dev_sem); rc = obd_set_info_async(mds->mds_osc_exp, strlen(KEY_NEXT_ID), KEY_NEXT_ID, @@ -241,9 +243,7 @@ static int mds_lov_update_desc(struct obd_device *obd, struct obd_export *lov) /* If we added a target we have to reconnect the llogs */ /* We only _need_ to do this at first add (idx), or the first time after recovery. However, it should now be safe to call anytime. */ - mutex_down(&obd->obd_dev_sem); llog_cat_initialize(obd, NULL, mds->mds_lov_desc.ld_tgt_count, NULL); - mutex_up(&obd->obd_dev_sem); /*XXX this notifies the MDD until lov handling use old mds code */ if (obd->obd_upcall.onu_owner) { @@ -269,45 +269,54 @@ static int mds_lov_update_mds(struct obd_device *obd, int rc = 0; ENTRY; + /* Don't let anyone else mess with mds_lov_objids now */ + mutex_down(&obd->obd_dev_sem); + old_count = mds->mds_lov_desc.ld_tgt_count; rc = mds_lov_update_desc(obd, mds->mds_osc_exp); - if (rc) - RETURN(rc); + if (rc) + GOTO(out, rc); CDEBUG(D_CONFIG, "idx=%d, recov=%d/%d, cnt=%d/%d\n", idx, obd->obd_recovering, obd->obd_async_recov, old_count, mds->mds_lov_desc.ld_tgt_count); /* idx is set as data from lov_notify. */ - if (idx != MDSLOV_NO_INDEX && !obd->obd_recovering) { - if (idx >= mds->mds_lov_desc.ld_tgt_count) { - CERROR("index %d > count %d!\n", idx, - mds->mds_lov_desc.ld_tgt_count); - RETURN(-EINVAL); - } - - if (idx >= mds->mds_lov_objids_in_file) { - /* We never read this lastid; ask the osc */ - obd_id lastid; - __u32 size = sizeof(lastid); - rc = obd_get_info(watched->obd_self_export, - strlen("last_id"), - "last_id", &size, &lastid); - if (rc) - RETURN(rc); - mds->mds_lov_objids[idx] = lastid; - mds->mds_lov_objids_dirty = 1; - mds_lov_write_objids(obd); - } else { - /* We have read this lastid from disk; tell the osc. - Don't call this during recovery. */ - rc = mds_lov_set_nextid(obd); + if (idx == MDSLOV_NO_INDEX || obd->obd_recovering) + GOTO(out, rc); + + if (idx >= mds->mds_lov_desc.ld_tgt_count) { + CERROR("index %d > count %d!\n", idx, + mds->mds_lov_desc.ld_tgt_count); + GOTO(out, rc = -EINVAL); + } + + if (idx >= mds->mds_lov_objids_in_file) { + /* We never read this lastid; ask the osc */ + obd_id lastid; + __u32 size = sizeof(lastid); + rc = obd_get_info(watched->obd_self_export, strlen("last_id"), + "last_id", &size, &lastid); + if (rc) + GOTO(out, rc); + mds->mds_lov_objids[idx] = lastid; + mds->mds_lov_objids_dirty = 1; + mds_lov_write_objids(obd); + } else { + /* We have read this lastid from disk; tell the osc. + Don't call this during recovery. */ + rc = mds_lov_set_nextid(obd); + if (rc) { + CERROR("Failed to set next id, idx=%d rc=%d\n", idx,rc); + /* Don't abort the rest of the sync */ + rc = 0; } - - CDEBUG(D_CONFIG, "last object "LPU64" from OST %d\n", - mds->mds_lov_objids[idx], idx); } + CDEBUG(D_CONFIG, "last object "LPU64" from OST %d rc=%d\n", + mds->mds_lov_objids[idx], idx, rc); +out: + mutex_up(&obd->obd_dev_sem); RETURN(rc); } @@ -364,6 +373,7 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name) /* Deny new client connections until we are sure we have some OSTs */ obd->obd_no_conn = 1; + mutex_down(&obd->obd_dev_sem); rc = mds_lov_read_objids(obd); if (rc) { CERROR("cannot read %s: rc = %d\n", "lov_objids", rc); @@ -398,6 +408,7 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name) "writing objids file: %d\n", rc); } } + mutex_up(&obd->obd_dev_sem); /* I want to see a callback happen when the OBD moves to a * "For General Use" state, and that's when we'll call @@ -411,6 +422,7 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name) RETURN(rc); err_reg: + mutex_up(&obd->obd_dev_sem); obd_register_observer(mds->mds_osc_obd, NULL); err_discon: obd_disconnect(mds->mds_osc_exp); @@ -706,6 +718,8 @@ static int __mds_lov_synchronize(void *data) uuid = &watched->u.cli.cl_target_uuid; LASSERT(uuid); + OBD_RACE(OBD_FAIL_MDS_LOV_SYNC_RACE); + rc = mds_lov_update_mds(obd, watched, idx, uuid); if (rc != 0) { CERROR("%s failed at update_mds: %d\n", obd_uuid2str(uuid), rc); @@ -729,8 +743,8 @@ static int __mds_lov_synchronize(void *data) NULL, NULL, uuid); if (rc != 0) { - CERROR("%s: failed at llog_origin_connect: %d\n", - obd->obd_name, rc); + CERROR("%s failed at llog_origin_connect: %d\n", + obd_uuid2str(uuid), rc); GOTO(out, rc); } @@ -761,6 +775,16 @@ static int __mds_lov_synchronize(void *data) } EXIT; out: + if (rc) { + /* Deactivate it for safety */ + CERROR("%s sync failed %d, deactivating\n", obd_uuid2str(uuid), + rc); + if (!obd->obd_stopping && mds->mds_osc_obd && + !mds->mds_osc_obd->obd_stopping && !watched->obd_stopping) + obd_notify(mds->mds_osc_obd, watched, + OBD_NOTIFY_INACTIVE, NULL); + } + class_decref(obd); return rc; } @@ -866,12 +890,14 @@ int mds_notify(struct obd_device *obd, struct obd_device *watched, /* We still have to fix the lov descriptor for ost's added after the mdt in the config log. They didn't make it into mds_lov_connect. */ + mutex_down(&obd->obd_dev_sem); rc = mds_lov_update_desc(obd, obd->u.mds.mds_osc_exp); - if (rc) + if (rc) { + mutex_up(&obd->obd_dev_sem); RETURN(rc); + } /* We should update init llog here too for replay unlink and * possiable llog init race when recovery complete */ - mutex_down(&obd->obd_dev_sem); llog_cat_initialize(obd, NULL, obd->u.mds.mds_lov_desc.ld_tgt_count, &watched->u.cli.cl_target_uuid); diff --git a/lustre/osc/osc_create.c b/lustre/osc/osc_create.c index a644f7c..d79be2f 100644 --- a/lustre/osc/osc_create.c +++ b/lustre/osc/osc_create.c @@ -66,7 +66,23 @@ static int osc_interpret_create(struct ptlrpc_request *req, void *data, int rc) spin_lock(&oscc->oscc_lock); oscc->oscc_flags &= ~OSCC_FLAG_CREATING; - if (rc == -ENOSPC || rc == -EROFS) { + switch (rc) { + case 0: { + if (body) { + int diff = body->oa.o_id - oscc->oscc_last_id; + + if (diff < oscc->oscc_grow_count) + oscc->oscc_grow_count = + max(diff/3, OST_MIN_PRECREATE); + else + oscc->oscc_flags &= ~OSCC_FLAG_LOW; + oscc->oscc_last_id = body->oa.o_id; + } + spin_unlock(&oscc->oscc_lock); + break; + } + case -ENOSPC: + case -EROFS: { oscc->oscc_flags |= OSCC_FLAG_NOSPC; if (body && rc == -ENOSPC) { oscc->oscc_grow_count = OST_MIN_PRECREATE; @@ -74,7 +90,17 @@ static int osc_interpret_create(struct ptlrpc_request *req, void *data, int rc) } spin_unlock(&oscc->oscc_lock); DEBUG_REQ(D_INODE, req, "OST out of space, flagging"); - } else if (rc != 0 && rc != -EIO) { + break; + } + case -EIO: { + /* filter always set body->oa.o_id as the last_id + * of filter (see filter_handle_precreate for detail)*/ + if (body && body->oa.o_id > oscc->oscc_last_id) + oscc->oscc_last_id = body->oa.o_id; + spin_unlock(&oscc->oscc_lock); + break; + } + default: { oscc->oscc_flags |= OSCC_FLAG_RECOVERING; oscc->oscc_grow_count = OST_MIN_PRECREATE; spin_unlock(&oscc->oscc_lock); @@ -82,26 +108,7 @@ static int osc_interpret_create(struct ptlrpc_request *req, void *data, int rc) "Unknown rc %d from async create: failing oscc", rc); ptlrpc_fail_import(req->rq_import, lustre_msg_get_conn_cnt(req->rq_reqmsg)); - } else { - if (rc == 0) { - if (body) { - int diff = body->oa.o_id - oscc->oscc_last_id; - - if (diff < oscc->oscc_grow_count) - oscc->oscc_grow_count = - max(diff/3, OST_MIN_PRECREATE); - else - oscc->oscc_flags &= ~OSCC_FLAG_LOW; - oscc->oscc_last_id = body->oa.o_id; - } - } else { - /* filter always set body->oa.o_id as the last_id - * of filter (see filter_handle_precreate for detail)*/ - if (body && body->oa.o_id > oscc->oscc_last_id) - oscc->oscc_last_id = body->oa.o_id; - } - spin_unlock(&oscc->oscc_lock); - + } } CDEBUG(D_HA, "preallocated through id "LPU64" (next to use "LPU64")\n", diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index bd35514..1572579 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -551,10 +551,12 @@ static int ptlrpc_import_delay_req(struct obd_import *imp, /* allow CONNECT even if import is invalid */ ; } else if (imp->imp_invalid) { /* If the import has been invalidated (such as by an OST - * failure), the request must fail with -EIO. */ + * failure) the request must fail with -ESHUTDOWN. This + * indicates the requests should be discarded; an -EIO + * may result in a resend of the request. */ if (!imp->imp_deactive) DEBUG_REQ(D_ERROR, req, "IMP_INVALID"); - *status = -EIO; + *status = -ESHUTDOWN; /* bz 12940 */ } else if (req->rq_import_generation != imp->imp_generation) { DEBUG_REQ(D_ERROR, req, "req wrong generation:"); *status = -EIO; diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index 9b8be16..067c5ea 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -476,10 +476,13 @@ test_20b() { # bug 10480 df -P $DIR || df -P $DIR || true # reconnect wait_mds_recovery_done || error "MDS recovery not done" + # FIXME just because recovery is done doesn't mean we've finished + # orphan cleanup. Fake it with a sleep for now... + sleep 10 AFTERUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'` log "before $BEFOREUSED, after $AFTERUSED" [ $AFTERUSED -gt $((BEFOREUSED + 20)) ] && \ - error "after $AFTERUSED > before $BEFOREUSED" && return 5 + error "after $AFTERUSED > before $BEFOREUSED" return 0 } run_test 20b "write, unlink, eviction, replay, (test mds_cleanup_orphans)" -- 1.8.3.1