From f30d9441d44cff9d58bd64e3f23fde3d2af609a3 Mon Sep 17 00:00:00 2001
From: nathan <nathan>
Date: Thu, 1 Nov 2007 01:55:50 +0000
Subject: [PATCH] b=12860 i=adilger i=green simultaneous MDT->OST connections
 at startup can cause the sync to abort, leaving the OSC in a bad state

---
 lustre/ChangeLog              |  7 ++++
 lustre/include/obd_support.h  |  1 +
 lustre/lov/lov_obd.c          | 16 +++++---
 lustre/mds/handler.c          |  3 +-
 lustre/mds/mds_lov.c          | 96 +++++++++++++++++++++++++++----------------
 lustre/osc/osc_create.c       | 51 +++++++++++++----------
 lustre/ptlrpc/client.c        |  6 ++-
 lustre/tests/replay-single.sh |  5 ++-
 8 files changed, 118 insertions(+), 67 deletions(-)
diff --git a/lustre/ChangeLog b/lustre/ChangeLog
index 760b47b..b1f9d0f 100644
--- a/lustre/ChangeLog
+++ b/lustre/ChangeLog
@@ -498,6 +498,13 @@ Bugzilla   : 13147
 Description: block reactivating mgc import until all deactivates complete
 Details    : Fix race when failing back MDT/MGS to itself (testing)
 
+Severity   : minor
+Frequency  : at statup only
+Bugzilla   : 12860
+Description: mds_lov_synchronize race leads to various problems
+Details    : simultaneous MDT->OST connections at startup can cause the 
+	     sync to abort, leaving the OSC in a bad state.
+
 Severity   : enhancement
 Bugzilla   : 12194
 Description: add optional extra BUILD_VERSION info
diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h
index ca70d84..6d5efb3 100644
--- a/lustre/include/obd_support.h
+++ b/lustre/include/obd_support.h
@@ -113,6 +113,7 @@ extern int obd_race_state;
 #define OBD_FAIL_MDS_WRITEPAGE_PACK      0x13b
 #define OBD_FAIL_MDS_LLOG_CREATE_FAILED  0x13c
 #define OBD_FAIL_MDS_OSC_PRECREATE       0x13d
+#define OBD_FAIL_MDS_LOV_SYNC_RACE       0x13e
 
 #define OBD_FAIL_OST                     0x200
 #define OBD_FAIL_OST_CONNECT_NET         0x201
diff --git a/lustre/lov/lov_obd.c b/lustre/lov/lov_obd.c
index f293cfd..adfcf71 100644
--- a/lustre/lov/lov_obd.c
+++ b/lustre/lov/lov_obd.c
@@ -902,7 +902,7 @@ static int lov_clear_orphans(struct obd_export *export, struct obdo *src_oa,
 
                 /* if called for a specific target, we don't
                    care if it is not active. */
-                if (!lov->lov_tgts[i]->ltd_active == 0 && ost_uuid == NULL) {
+                if (!lov->lov_tgts[i]->ltd_active && ost_uuid == NULL) {
                         CDEBUG(D_HA, "lov idx %d inactive\n", i);
                         continue;
                 }
@@ -2390,6 +2390,7 @@ static int lov_set_info_async(struct obd_export *exp, obd_count keylen,
 {
         struct obd_device *obddev = class_exp2obd(exp);
         struct lov_obd *lov = &obddev->u.lov;
+        obd_count count;
         int i, rc = 0, err;
         int no_set = !set;
         int incr = 0, check_uuid = 0, do_inactive = 0;
@@ -2401,9 +2402,14 @@ static int lov_set_info_async(struct obd_export *exp, obd_count keylen,
                         RETURN(-ENOMEM);
         }
 
+        lov_getref(obddev);
+        count = lov->desc.ld_tgt_count;
+
         if (KEY_IS(KEY_NEXT_ID)) {
-                if (vallen != lov->desc.ld_tgt_count * sizeof(obd_id))
-                        RETURN(-EINVAL);
+                /* We must use mds's idea of # osts for indexing into
+                   mds->mds_lov_objids */
+                count = vallen / sizeof(obd_id);
+                LASSERT(count <= lov->desc.ld_tgt_count);
                 vallen = sizeof(obd_id);
                 incr = sizeof(obd_id);
                 do_inactive = 1;
@@ -2415,9 +2421,7 @@ static int lov_set_info_async(struct obd_export *exp, obd_count keylen,
                 /* use defaults:  do_inactive = incr = 0; */
         }
 
-        lov_getref(obddev);
-
-        for (i = 0; i < lov->desc.ld_tgt_count; i++, val = (char *)val + incr) {
+        for (i = 0; i < count; i++, val = (char *)val + incr) {
                 /* OST was disconnected */
                 if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_exp)
                         continue;
diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c
index eebf615..571d85b 100644
--- a/lustre/mds/handler.c
+++ b/lustre/mds/handler.c
@@ -2145,9 +2145,10 @@ int mds_postrecov(struct obd_device *obd)
         LASSERT(!obd->obd_recovering);
         LASSERT(llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT) != NULL);
 
-        /* FIXME why not put this in the synchronize? */
         /* set nextid first, so we are sure it happens */
+        mutex_down(&obd->obd_dev_sem);
         rc = mds_lov_set_nextid(obd);
+        mutex_up(&obd->obd_dev_sem);
         if (rc) {
                 CERROR("%s: mds_lov_set_nextid failed %d\n",
                        obd->obd_name, rc);
diff --git a/lustre/mds/mds_lov.c b/lustre/mds/mds_lov.c
index 5a33a92..f282de1 100644
--- a/lustre/mds/mds_lov.c
+++ b/lustre/mds/mds_lov.c
@@ -162,8 +162,10 @@ int mds_lov_set_nextid(struct obd_device *obd)
         ENTRY;
 
         LASSERT(!obd->obd_recovering);
-
         LASSERT(mds->mds_lov_objids != NULL);
+     
+        /* obd->obd_dev_sem must be held so mds_lov_objids doesn't change */
+        LASSERT_SEM_LOCKED(&obd->obd_dev_sem);
 
         rc = obd_set_info_async(mds->mds_osc_exp, strlen(KEY_NEXT_ID),
                                 KEY_NEXT_ID,
@@ -241,9 +243,7 @@ static int mds_lov_update_desc(struct obd_device *obd, struct obd_export *lov)
         /* If we added a target we have to reconnect the llogs */
         /* We only _need_ to do this at first add (idx), or the first time
            after recovery.  However, it should now be safe to call anytime. */
-        mutex_down(&obd->obd_dev_sem);
         llog_cat_initialize(obd, NULL, mds->mds_lov_desc.ld_tgt_count, NULL);
-        mutex_up(&obd->obd_dev_sem);
 
         /*XXX this notifies the MDD until lov handling use old mds code */
         if (obd->obd_upcall.onu_owner) {
@@ -269,45 +269,54 @@ static int mds_lov_update_mds(struct obd_device *obd,
         int rc = 0;
         ENTRY;
 
+        /* Don't let anyone else mess with mds_lov_objids now */
+        mutex_down(&obd->obd_dev_sem);
+
         old_count = mds->mds_lov_desc.ld_tgt_count;
         rc = mds_lov_update_desc(obd, mds->mds_osc_exp);
-        if (rc)
-                RETURN(rc);
+        if (rc) 
+                GOTO(out, rc);
 
         CDEBUG(D_CONFIG, "idx=%d, recov=%d/%d, cnt=%d/%d\n",
                idx, obd->obd_recovering, obd->obd_async_recov, old_count,
                mds->mds_lov_desc.ld_tgt_count);
 
         /* idx is set as data from lov_notify. */
-        if (idx != MDSLOV_NO_INDEX && !obd->obd_recovering) {
-                if (idx >= mds->mds_lov_desc.ld_tgt_count) {
-                        CERROR("index %d > count %d!\n", idx,
-                               mds->mds_lov_desc.ld_tgt_count);
-                        RETURN(-EINVAL);
-                }
-
-                if (idx >= mds->mds_lov_objids_in_file) {
-                        /* We never read this lastid; ask the osc */
-                        obd_id lastid;
-                        __u32 size = sizeof(lastid);
-                        rc = obd_get_info(watched->obd_self_export,
-                                          strlen("last_id"),
-                                          "last_id", &size, &lastid);
-                        if (rc)
-                                RETURN(rc);
-                        mds->mds_lov_objids[idx] = lastid;
-                        mds->mds_lov_objids_dirty = 1;
-                        mds_lov_write_objids(obd);
-                } else {
-                        /* We have read this lastid from disk; tell the osc.
-                           Don't call this during recovery. */
-                        rc = mds_lov_set_nextid(obd);
+        if (idx == MDSLOV_NO_INDEX || obd->obd_recovering) 
+                GOTO(out, rc);
+                
+        if (idx >= mds->mds_lov_desc.ld_tgt_count) {
+                CERROR("index %d > count %d!\n", idx, 
+                       mds->mds_lov_desc.ld_tgt_count);
+                GOTO(out, rc = -EINVAL);
+        }
+
+        if (idx >= mds->mds_lov_objids_in_file) {
+                /* We never read this lastid; ask the osc */
+                obd_id lastid;
+                __u32 size = sizeof(lastid);
+                rc = obd_get_info(watched->obd_self_export, strlen("last_id"),
+                                  "last_id", &size, &lastid);
+                if (rc)
+                        GOTO(out, rc);
+                mds->mds_lov_objids[idx] = lastid;
+                mds->mds_lov_objids_dirty = 1;
+                mds_lov_write_objids(obd);
+        } else {
+                /* We have read this lastid from disk; tell the osc.
+                   Don't call this during recovery. */
+                rc = mds_lov_set_nextid(obd);
+                if (rc) {
+                        CERROR("Failed to set next id, idx=%d rc=%d\n", idx,rc);
+                        /* Don't abort the rest of the sync */
+                        rc = 0;
                 }
-
-                CDEBUG(D_CONFIG, "last object "LPU64" from OST %d\n",
-                      mds->mds_lov_objids[idx], idx);
         }
 
+        CDEBUG(D_CONFIG, "last object "LPU64" from OST %d rc=%d\n",
+               mds->mds_lov_objids[idx], idx, rc);
+out:
+        mutex_up(&obd->obd_dev_sem);
         RETURN(rc);
 }
 
@@ -364,6 +373,7 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name)
         /* Deny new client connections until we are sure we have some OSTs */
         obd->obd_no_conn = 1;
 
+        mutex_down(&obd->obd_dev_sem);
         rc = mds_lov_read_objids(obd);
         if (rc) {
                 CERROR("cannot read %s: rc = %d\n", "lov_objids", rc);
@@ -398,6 +408,7 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name)
                                        "writing objids file: %d\n", rc);
                 }
         }
+        mutex_up(&obd->obd_dev_sem);
 
         /* I want to see a callback happen when the OBD moves to a
          * "For General Use" state, and that's when we'll call
@@ -411,6 +422,7 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name)
         RETURN(rc);
 
 err_reg:
+        mutex_up(&obd->obd_dev_sem);
         obd_register_observer(mds->mds_osc_obd, NULL);
 err_discon:
         obd_disconnect(mds->mds_osc_exp);
@@ -706,6 +718,8 @@ static int __mds_lov_synchronize(void *data)
         uuid = &watched->u.cli.cl_target_uuid;
         LASSERT(uuid);
 
+        OBD_RACE(OBD_FAIL_MDS_LOV_SYNC_RACE);
+
         rc = mds_lov_update_mds(obd, watched, idx, uuid);
         if (rc != 0) {
                 CERROR("%s failed at update_mds: %d\n", obd_uuid2str(uuid), rc);
@@ -729,8 +743,8 @@ static int __mds_lov_synchronize(void *data)
                           NULL, NULL, uuid);
 
         if (rc != 0) {
-                CERROR("%s: failed at llog_origin_connect: %d\n",
-                       obd->obd_name, rc);
+                CERROR("%s failed at llog_origin_connect: %d\n",
+                       obd_uuid2str(uuid), rc);
                 GOTO(out, rc);
         }
 
@@ -761,6 +775,16 @@ static int __mds_lov_synchronize(void *data)
         }
         EXIT;
 out:
+        if (rc) {
+                /* Deactivate it for safety */
+                CERROR("%s sync failed %d, deactivating\n", obd_uuid2str(uuid),
+                       rc);
+                if (!obd->obd_stopping && mds->mds_osc_obd &&
+                    !mds->mds_osc_obd->obd_stopping && !watched->obd_stopping) 
+                        obd_notify(mds->mds_osc_obd, watched,
+                                   OBD_NOTIFY_INACTIVE, NULL);
+        }
+
         class_decref(obd);
         return rc;
 }
@@ -866,12 +890,14 @@ int mds_notify(struct obd_device *obd, struct obd_device *watched,
                 /* We still have to fix the lov descriptor for ost's added
                    after the mdt in the config log.  They didn't make it into
                    mds_lov_connect. */
+                mutex_down(&obd->obd_dev_sem);
                 rc = mds_lov_update_desc(obd, obd->u.mds.mds_osc_exp);
-                if (rc)
+                if (rc) {
+                        mutex_up(&obd->obd_dev_sem);
                         RETURN(rc);
+                }
                 /* We should update init llog here too for replay unlink and 
                  * possiable llog init race when recovery complete */
-                mutex_down(&obd->obd_dev_sem);
                 llog_cat_initialize(obd, NULL, 
                                     obd->u.mds.mds_lov_desc.ld_tgt_count,
                                     &watched->u.cli.cl_target_uuid);
diff --git a/lustre/osc/osc_create.c b/lustre/osc/osc_create.c
index a644f7c..d79be2f 100644
--- a/lustre/osc/osc_create.c
+++ b/lustre/osc/osc_create.c
@@ -66,7 +66,23 @@ static int osc_interpret_create(struct ptlrpc_request *req, void *data, int rc)
         
         spin_lock(&oscc->oscc_lock);
         oscc->oscc_flags &= ~OSCC_FLAG_CREATING;
-        if (rc == -ENOSPC || rc == -EROFS) {
+        switch (rc) {
+        case 0: {
+                if (body) {
+                        int diff = body->oa.o_id - oscc->oscc_last_id;
+
+                        if (diff < oscc->oscc_grow_count)
+                                oscc->oscc_grow_count =
+                                        max(diff/3, OST_MIN_PRECREATE);
+                        else
+                                oscc->oscc_flags &= ~OSCC_FLAG_LOW;
+                        oscc->oscc_last_id = body->oa.o_id;
+                }
+                spin_unlock(&oscc->oscc_lock);
+                break;
+        }
+        case -ENOSPC:
+        case -EROFS: {
                 oscc->oscc_flags |= OSCC_FLAG_NOSPC;
                 if (body && rc == -ENOSPC) {
                         oscc->oscc_grow_count = OST_MIN_PRECREATE;
@@ -74,7 +90,17 @@ static int osc_interpret_create(struct ptlrpc_request *req, void *data, int rc)
                 }
                 spin_unlock(&oscc->oscc_lock);
                 DEBUG_REQ(D_INODE, req, "OST out of space, flagging");
-        } else if (rc != 0 && rc != -EIO) {
+                break;
+        }
+        case -EIO: {
+                /* filter always set body->oa.o_id as the last_id 
+                 * of filter (see filter_handle_precreate for detail)*/
+                if (body && body->oa.o_id > oscc->oscc_last_id)
+                        oscc->oscc_last_id = body->oa.o_id;
+                spin_unlock(&oscc->oscc_lock);
+                break;
+        }
+        default: {
                 oscc->oscc_flags |= OSCC_FLAG_RECOVERING;
                 oscc->oscc_grow_count = OST_MIN_PRECREATE;
                 spin_unlock(&oscc->oscc_lock);
@@ -82,26 +108,7 @@ static int osc_interpret_create(struct ptlrpc_request *req, void *data, int rc)
                           "Unknown rc %d from async create: failing oscc", rc);
                 ptlrpc_fail_import(req->rq_import,
                                    lustre_msg_get_conn_cnt(req->rq_reqmsg));
-        } else {
-                if (rc == 0) {
-                         if (body) {
-                                int diff = body->oa.o_id - oscc->oscc_last_id;
-
-                                if (diff < oscc->oscc_grow_count)
-                                        oscc->oscc_grow_count =
-                                                max(diff/3, OST_MIN_PRECREATE);
-                                else
-                                        oscc->oscc_flags &= ~OSCC_FLAG_LOW;
-                                oscc->oscc_last_id = body->oa.o_id;
-                        }
-                } else {
-                        /* filter always set body->oa.o_id as the last_id 
-                         * of filter (see filter_handle_precreate for detail)*/
-                        if (body && body->oa.o_id > oscc->oscc_last_id)
-                                oscc->oscc_last_id = body->oa.o_id;
-                }
-                spin_unlock(&oscc->oscc_lock);
-
+        }
         }
 
         CDEBUG(D_HA, "preallocated through id "LPU64" (next to use "LPU64")\n",
diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c
index bd35514..1572579 100644
--- a/lustre/ptlrpc/client.c
+++ b/lustre/ptlrpc/client.c
@@ -551,10 +551,12 @@ static int ptlrpc_import_delay_req(struct obd_import *imp,
                 /* allow CONNECT even if import is invalid */ ;
         } else if (imp->imp_invalid) {
                 /* If the import has been invalidated (such as by an OST
-                 * failure), the request must fail with -EIO. */
+                 * failure) the request must fail with -ESHUTDOWN.  This
+                 * indicates the requests should be discarded; an -EIO
+                 * may result in a resend of the request. */
                 if (!imp->imp_deactive)
                         DEBUG_REQ(D_ERROR, req, "IMP_INVALID");
-                *status = -EIO;
+                *status = -ESHUTDOWN; /* bz 12940 */
         } else if (req->rq_import_generation != imp->imp_generation) {
                 DEBUG_REQ(D_ERROR, req, "req wrong generation:");
                 *status = -EIO;
diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh
index 9b8be16..067c5ea 100755
--- a/lustre/tests/replay-single.sh
+++ b/lustre/tests/replay-single.sh
@@ -476,10 +476,13 @@ test_20b() { # bug 10480
     df -P $DIR || df -P $DIR || true    # reconnect
     wait_mds_recovery_done || error "MDS recovery not done"
 
+    # FIXME just because recovery is done doesn't mean we've finished 
+    # orphan cleanup.  Fake it with a sleep for now...
+    sleep 10
     AFTERUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'`
     log "before $BEFOREUSED, after $AFTERUSED"
     [ $AFTERUSED -gt $((BEFOREUSED + 20)) ] && \
-        error "after $AFTERUSED > before $BEFOREUSED" && return 5
+        error "after $AFTERUSED > before $BEFOREUSED"
     return 0
 }
 run_test 20b "write, unlink, eviction, replay, (test mds_cleanup_orphans)"
-- 
1.8.3.1