b=12860

author nathan <nathan>

Wed, 18 Jul 2007 19:37:28 +0000 (19:37 +0000)

committer nathan <nathan>

Wed, 18 Jul 2007 19:37:28 +0000 (19:37 +0000)
author nathan <nathan>
Wed, 18 Jul 2007 19:37:28 +0000 (19:37 +0000)
committer nathan <nathan>
Wed, 18 Jul 2007 19:37:28 +0000 (19:37 +0000)
diff --git a/lustre/ChangeLog b/lustre/ChangeLog

index 4df8ec7..bc6c481 100644 (file)
--- a/lustre/ChangeLog
+++ b/lustre/ChangeLog
@@ -14,6 +14,13 @@ tbd         Cluster File Systems, Inc. <info@clusterfs.com>
         * Note that reiserfs quotas are disabled on SLES 10 in this kernel.
         * bug fixes
  
+Severity   : minor
+Frequency  : at statup only
+Bugzilla   : 12860
+Description: mds_lov_synchronize race leads to various problems
+Details    : simultaneous MDT->OST connections at startup can cause the 
+            sync to abort, leaving the OSC in a bad state.
+
  Severity   : enhancement
  Bugzilla   : 12194
  Description: add optional extra BUILD_VERSION info
diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h

index ca7414b..b368b56 100644 (file)
--- a/lustre/include/obd_support.h
+++ b/lustre/include/obd_support.h
@@ -102,6 +102,7 @@ extern int obd_race_state;
  #define OBD_FAIL_MDS_FS_SETUP            0x135
  #define OBD_FAIL_MDS_RESEND              0x136
  #define OBD_FAIL_MDS_LLOG_CREATE_FAILED  0x137
+#define OBD_FAIL_MDS_LOV_SYNC_RACE       0x138
  
  #define OBD_FAIL_OST                     0x200
  #define OBD_FAIL_OST_CONNECT_NET         0x201
diff --git a/lustre/lov/lov_obd.c b/lustre/lov/lov_obd.c

index 6132e65..7f5970c 100644 (file)
--- a/lustre/lov/lov_obd.c
+++ b/lustre/lov/lov_obd.c
@@ -908,7 +908,7 @@ static int lov_clear_orphans(struct obd_export *export, struct obdo *src_oa,
  
                  /* if called for a specific target, we don't
                     care if it is not active. */
-                if (!lov->lov_tgts[i]->ltd_active == 0 && ost_uuid == NULL) {
+                if (!lov->lov_tgts[i]->ltd_active && ost_uuid == NULL) {
                          CDEBUG(D_HA, "lov idx %d inactive\n", i);
                          continue;
                  }
@@ -2375,6 +2375,7 @@ static int lov_set_info_async(struct obd_export *exp, obd_count keylen,
  {
          struct obd_device *obddev = class_exp2obd(exp);
          struct lov_obd *lov = &obddev->u.lov;
+        obd_count count;
          int i, rc = 0, err, incr = 0, check_uuid = 0, do_inactive = 0;
          int no_set = !set;
          ENTRY;
@@ -2385,15 +2386,20 @@ static int lov_set_info_async(struct obd_export *exp, obd_count keylen,
                          RETURN(-ENOMEM);
          }
  
+        lov_getref(obddev);
+        count = lov->desc.ld_tgt_count;
+
          if (KEY_IS(KEY_NEXT_ID)) {
-                if (vallen != lov->desc.ld_tgt_count * sizeof(obd_id))
-                        RETURN(-EINVAL);
+                /* We must use mds's idea of # osts for indexing into
+                   mds->mds_lov_objids */
+                count = vallen / sizeof(obd_id);
+                LASSERT(count <= lov->desc.ld_tgt_count);
                  vallen = sizeof(obd_id);
                  incr = sizeof(obd_id);
                  do_inactive = 1;
          } else if (KEY_IS("checksum")) {
                  do_inactive = 1;
-        } else if (KEY_IS("mds_conn") || KEY_IS("unlinked")) {
+        } else if (KEY_IS(KEY_MDS_CONN) || KEY_IS("unlinked")) {
                  check_uuid = val ? 1 : 0;
          } else if (KEY_IS("evict_by_nid")) {
                  /* use defaults:
@@ -2401,9 +2407,7 @@ static int lov_set_info_async(struct obd_export *exp, obd_count keylen,
                   */
          }
  
-        lov_getref(obddev);
-
-        for (i = 0; i < lov->desc.ld_tgt_count; i++, val = (char *)val + incr) {
+        for (i = 0; i < count; i++, val = (char *)val + incr) {
                  /* OST was disconnected */
                  if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_exp)
                          continue;
diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c

index 1c4e05f..14f095b 100644 (file)
--- a/lustre/mds/handler.c
+++ b/lustre/mds/handler.c
@@ -2142,9 +2142,10 @@ int mds_postrecov(struct obd_device *obd)
          LASSERT(!obd->obd_recovering);
          LASSERT(llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT) != NULL);
  
-        /* FIXME why not put this in the synchronize? */
          /* set nextid first, so we are sure it happens */
+        mutex_down(&obd->obd_dev_sem);
          rc = mds_lov_set_nextid(obd);
+        mutex_up(&obd->obd_dev_sem);
          if (rc) {
                  CERROR("%s: mds_lov_set_nextid failed %d\n",
                         obd->obd_name, rc);
diff --git a/lustre/mds/mds_lov.c b/lustre/mds/mds_lov.c

index 8696611..36f893e 100644 (file)
--- a/lustre/mds/mds_lov.c
+++ b/lustre/mds/mds_lov.c
@@ -159,8 +159,10 @@ int mds_lov_set_nextid(struct obd_device *obd)
          ENTRY;
  
          LASSERT(!obd->obd_recovering);
-
          LASSERT(mds->mds_lov_objids != NULL);
+     
+        /* obd->obd_dev_sem must be held so mds_lov_objids doesn't change */
+        LASSERT_SEM_LOCKED(&obd->obd_dev_sem);
  
          rc = obd_set_info_async(mds->mds_osc_exp, strlen(KEY_NEXT_ID),
                                  KEY_NEXT_ID,
@@ -238,9 +240,7 @@ static int mds_lov_update_desc(struct obd_device *obd, struct obd_export *lov)
          /* If we added a target we have to reconnect the llogs */
          /* We only _need_ to do this at first add (idx), or the first time
             after recovery.  However, it should now be safe to call anytime. */
-        mutex_down(&obd->obd_dev_sem);
          llog_cat_initialize(obd, mds->mds_lov_desc.ld_tgt_count, NULL);
-        mutex_up(&obd->obd_dev_sem);
  
  out:
          OBD_FREE(ld, sizeof(*ld));
@@ -260,45 +260,54 @@ static int mds_lov_update_mds(struct obd_device *obd,
          int rc = 0;
          ENTRY;
  
+        /* Don't let anyone else mess with mds_lov_objids now */
+        mutex_down(&obd->obd_dev_sem);
+
          old_count = mds->mds_lov_desc.ld_tgt_count;
          rc = mds_lov_update_desc(obd, mds->mds_osc_exp);
-        if (rc)
-                RETURN(rc);
+        if (rc) 
+                GOTO(out, rc);
  
          CDEBUG(D_CONFIG, "idx=%d, recov=%d/%d, cnt=%d/%d\n",
                 idx, obd->obd_recovering, obd->obd_async_recov, old_count,
                 mds->mds_lov_desc.ld_tgt_count);
  
          /* idx is set as data from lov_notify. */
-        if (idx != MDSLOV_NO_INDEX && !obd->obd_recovering) {
-                if (idx >= mds->mds_lov_desc.ld_tgt_count) {
-                        CERROR("index %d > count %d!\n", idx,
-                               mds->mds_lov_desc.ld_tgt_count);
-                        RETURN(-EINVAL);
-                }
-
-                if (idx >= mds->mds_lov_objids_in_file) {
-                        /* We never read this lastid; ask the osc */
-                        obd_id lastid;
-                        __u32 size = sizeof(lastid);
-                        rc = obd_get_info(watched->obd_self_export,
-                                          strlen("last_id"),
-                                          "last_id", &size, &lastid);
-                        if (rc)
-                                RETURN(rc);
-                        mds->mds_lov_objids[idx] = lastid;
-                        mds->mds_lov_objids_dirty = 1;
-                        mds_lov_write_objids(obd);
-                } else {
-                        /* We have read this lastid from disk; tell the osc.
-                           Don't call this during recovery. */
-                        rc = mds_lov_set_nextid(obd);
+        if (idx == MDSLOV_NO_INDEX || obd->obd_recovering) 
+                GOTO(out, rc);
+                
+        if (idx >= mds->mds_lov_desc.ld_tgt_count) {
+                CERROR("index %d > count %d!\n", idx, 
+                       mds->mds_lov_desc.ld_tgt_count);
+                GOTO(out, rc = -EINVAL);
+        }
+
+        if (idx >= mds->mds_lov_objids_in_file) {
+                /* We never read this lastid; ask the osc */
+                obd_id lastid;
+                __u32 size = sizeof(lastid);
+                rc = obd_get_info(watched->obd_self_export, strlen("last_id"),
+                                  "last_id", &size, &lastid);
+                if (rc)
+                        GOTO(out, rc);
+                mds->mds_lov_objids[idx] = lastid;
+                mds->mds_lov_objids_dirty = 1;
+                mds_lov_write_objids(obd);
+        } else {
+                /* We have read this lastid from disk; tell the osc.
+                   Don't call this during recovery. */
+                rc = mds_lov_set_nextid(obd);
+                if (rc) {
+                        CERROR("Failed to set next id, idx=%d rc=%d\n", idx,rc);
+                        /* Don't abort the rest of the sync */
+                        rc = 0;
                  }
-
-                CDEBUG(D_CONFIG, "last object "LPU64" from OST %d\n",
-                      mds->mds_lov_objids[idx], idx);
          }
  
+        CDEBUG(D_CONFIG, "last object "LPU64" from OST %d rc=%d\n",
+               mds->mds_lov_objids[idx], idx, rc);
+out:
+        mutex_up(&obd->obd_dev_sem);
          RETURN(rc);
  }
  
@@ -350,6 +359,7 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name)
          /* Deny new client connections until we are sure we have some OSTs */
          obd->obd_no_conn = 1;
  
+        mutex_down(&obd->obd_dev_sem);
          rc = mds_lov_read_objids(obd);
          if (rc) {
                  CERROR("cannot read %s: rc = %d\n", "lov_objids", rc);
@@ -384,6 +394,7 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name)
                                         "writing objids file: %d\n", rc);
                  }
          }
+        mutex_up(&obd->obd_dev_sem);
  
          /* I want to see a callback happen when the OBD moves to a
           * "For General Use" state, and that's when we'll call
@@ -395,6 +406,7 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name)
          RETURN(rc);
  
  err_reg:
+        mutex_up(&obd->obd_dev_sem);
          obd_register_observer(mds->mds_osc_obd, NULL);
  err_discon:
          obd_disconnect(mds->mds_osc_exp);
@@ -649,9 +661,13 @@ static int __mds_lov_synchronize(void *data)
          uuid = &watched->u.cli.cl_target_uuid;
          LASSERT(uuid);
  
+        OBD_RACE(OBD_FAIL_MDS_LOV_SYNC_RACE);
+
          rc = mds_lov_update_mds(obd, watched, idx, uuid);
-        if (rc != 0)
+        if (rc != 0) {
+                CERROR("%s failed at update_mds: %d\n", obd_uuid2str(uuid), rc);
                  GOTO(out, rc);
+        }
  
          rc = obd_set_info_async(mds->mds_osc_exp, strlen(KEY_MDS_CONN),
                                  KEY_MDS_CONN, 0, uuid, NULL);
@@ -663,8 +679,8 @@ static int __mds_lov_synchronize(void *data)
                            NULL, NULL, uuid);
  
          if (rc != 0) {
-                CERROR("%s: failed at llog_origin_connect: %d\n",
-                       obd->obd_name, rc);
+                CERROR("%s failed at llog_origin_connect: %d\n",
+                       obd_uuid2str(uuid), rc);
                  GOTO(out, rc);
          }
  
@@ -676,13 +692,20 @@ static int __mds_lov_synchronize(void *data)
  
          rc = mds_lov_clear_orphans(mds, uuid);
          if (rc != 0) {
-                CERROR("%s: failed at mds_lov_clear_orphans: %d\n",
-                       obd->obd_name, rc);
+                CERROR("%s failed at mds_lov_clear_orphans: %d\n",
+                       obd_uuid2str(uuid), rc);
                  GOTO(out, rc);
          }
  
          EXIT;
  out:
+        if (rc) {
+                /* Deactivate it for safety */
+                CERROR("%s sync failed %d, deactivating\n", obd_uuid2str(uuid),
+                       rc);
+                obd_notify(mds->mds_osc_obd, watched, OBD_NOTIFY_INACTIVE,
+                           NULL);
+        }
          class_decref(obd);
          return rc;
  }
@@ -790,7 +813,9 @@ int mds_notify(struct obd_device *obd, struct obd_device *watched,
                  /* We still have to fix the lov descriptor for ost's added
                     after the mdt in the config log.  They didn't make it into
                     mds_lov_connect. */
+                mutex_down(&obd->obd_dev_sem);
                  rc = mds_lov_update_desc(obd, obd->u.mds.mds_osc_exp);
+                mutex_up(&obd->obd_dev_sem);
                  RETURN(rc);
          }
  
diff --git a/lustre/osc/osc_create.c b/lustre/osc/osc_create.c

index 251483e..5d01ce9 100644 (file)
--- a/lustre/osc/osc_create.c
+++ b/lustre/osc/osc_create.c
@@ -66,7 +66,23 @@ static int osc_interpret_create(struct ptlrpc_request *req, void *data, int rc)
          
          spin_lock(&oscc->oscc_lock);
          oscc->oscc_flags &= ~OSCC_FLAG_CREATING;
-        if (rc == -ENOSPC || rc == -EROFS) {
+        switch (rc) {
+        case 0: {
+                if (body) {
+                        int diff = body->oa.o_id - oscc->oscc_last_id;
+
+                        if (diff < oscc->oscc_grow_count)
+                                oscc->oscc_grow_count =
+                                        max(diff/3, OST_MIN_PRECREATE);
+                        else
+                                oscc->oscc_flags &= ~OSCC_FLAG_LOW;
+                        oscc->oscc_last_id = body->oa.o_id;
+                }
+                spin_unlock(&oscc->oscc_lock);
+                break;
+        }
+        case -ENOSPC:
+        case -EROFS: {
                  oscc->oscc_flags |= OSCC_FLAG_NOSPC;
                  if (body && rc == -ENOSPC) {
                          oscc->oscc_grow_count = OST_MIN_PRECREATE;
@@ -74,7 +90,17 @@ static int osc_interpret_create(struct ptlrpc_request *req, void *data, int rc)
                  }
                  spin_unlock(&oscc->oscc_lock);
                  DEBUG_REQ(D_INODE, req, "OST out of space, flagging");
-        } else if (rc != 0 && rc != -EIO) {
+                break;
+        }
+        case -EIO: {
+                /* filter always set body->oa.o_id as the last_id 
+                 * of filter (see filter_handle_precreate for detail)*/
+                if (body && body->oa.o_id > oscc->oscc_last_id)
+                        oscc->oscc_last_id = body->oa.o_id;
+                spin_unlock(&oscc->oscc_lock);
+                break;
+        }
+        default: {
                  oscc->oscc_flags |= OSCC_FLAG_RECOVERING;
                  oscc->oscc_grow_count = OST_MIN_PRECREATE;
                  spin_unlock(&oscc->oscc_lock);
@@ -82,26 +108,7 @@ static int osc_interpret_create(struct ptlrpc_request *req, void *data, int rc)
                            "unknown rc %d from async create: failing oscc", rc);
                  ptlrpc_fail_import(req->rq_import,
                                     lustre_msg_get_conn_cnt(req->rq_reqmsg));
-        } else {
-                if (rc == 0) {
-                        if (body) {
-                                int diff = body->oa.o_id - oscc->oscc_last_id;
-
-                                if (diff < oscc->oscc_grow_count)
-                                        oscc->oscc_grow_count =
-                                                max(diff/3, OST_MIN_PRECREATE);
-                                else
-                                        oscc->oscc_flags &= ~OSCC_FLAG_LOW;
-                                oscc->oscc_last_id = body->oa.o_id;
-                        }
-                } else {
-                        /* filter always set body->oa.o_id as the last_id 
-                         * of filter (see filter_handle_precreate for detail)*/
-                        if (body && body->oa.o_id > oscc->oscc_last_id)
-                                oscc->oscc_last_id = body->oa.o_id;
-                }
-                spin_unlock(&oscc->oscc_lock);
-
+        }
          }
  
          CDEBUG(D_HA, "preallocated through id "LPU64" (next to use "LPU64")\n",
@@ -310,8 +317,8 @@ int osc_create(struct obd_export *exp, struct obdo *oa,
                  if (oscc_recovering(oscc)) {
                          struct l_wait_info lwi;
  
-                        CDEBUG(D_HA,"%p: oscc recovery in progress, waiting\n",
-                               oscc);
+                        CDEBUG(D_HA,"%s: oscc recovery in progress, waiting\n",
+                               oscc->oscc_obd->obd_name);
  
                          lwi = LWI_TIMEOUT(cfs_timeout_cap(cfs_time_seconds(obd_timeout/4)),
                                            NULL, NULL);
@@ -319,12 +326,12 @@ int osc_create(struct obd_export *exp, struct obdo *oa,
                                            !oscc_recovering(oscc), &lwi);
                          LASSERT(rc == 0 || rc == -ETIMEDOUT);
                          if (rc == -ETIMEDOUT) {
-                                CDEBUG(D_HA,"%p: timeout waiting on recovery\n",
-                                       oscc);
+                                CDEBUG(D_HA,"%s: timeout waiting on recovery\n",
+                                       oscc->oscc_obd->obd_name);
                                  RETURN(rc);
                          }
-                        CDEBUG(D_HA, "%p: oscc recovery over, waking up\n",
-                               oscc);
+                        CDEBUG(D_HA, "%s: oscc recovery over, waking up\n",
+                               oscc->oscc_obd->obd_name);
                  }
  
                  spin_lock(&oscc->oscc_lock);
diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c

index 8dcd394..58f1a9e 100644 (file)
--- a/lustre/osc/osc_request.c
+++ b/lustre/osc/osc_request.c
@@ -427,13 +427,12 @@ int osc_real_create(struct obd_export *exp, struct obdo *oa,
  
          CDEBUG(D_HA, "transno: "LPD64"\n",
                 lustre_msg_get_transno(req->rq_repmsg));
-        EXIT;
  out_req:
          ptlrpc_req_finished(req);
  out:
          if (rc && !*ea)
                  obd_free_memmd(exp, &lsm);
-        return rc;
+        RETURN(rc);
  }
  
  static int osc_punch_interpret(struct ptlrpc_request *req,
@@ -3342,7 +3341,7 @@ static int osc_set_info_async(struct obd_export *exp, obd_count keylen,
          if (req == NULL)
                  RETURN(-ENOMEM);
  
-        if (KEY_IS("mds_conn"))
+        if (KEY_IS(KEY_MDS_CONN))
                  req->rq_interpret_reply = osc_setinfo_mds_conn_interpret;
  
          ptlrpc_req_set_repsize(req, 1, NULL);
diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c

index 56723f1..889de36 100644 (file)
--- a/lustre/ptlrpc/client.c
+++ b/lustre/ptlrpc/client.c
@@ -511,13 +511,9 @@ static int ptlrpc_import_delay_req(struct obd_import *imp,
                   imp->imp_state == LUSTRE_IMP_CONNECTING) {
                  /* allow CONNECT even if import is invalid */ ;
          } else if (imp->imp_invalid) {
-                /* If the import has been invalidated (such as by an OST
-                 * failure) the request must fail with -EINVAL.  This
-                 * indicates the requests should be discarded, an -EIO
-                 * may result in a resend of the request. */
                  if (!imp->imp_deactive)
                          DEBUG_REQ(D_ERROR, req, "IMP_INVALID");
-                *status = -EINVAL;
+                *status = -ESHUTDOWN; /* bz 12940 */
          } else if (req->rq_import_generation != imp->imp_generation) {
                  DEBUG_REQ(D_ERROR, req, "req wrong generation:");
                  *status = -EIO;
diff --git a/lustre/ptlrpc/events.c b/lustre/ptlrpc/events.c

index 61419d3..25f1330 100644 (file)
--- a/lustre/ptlrpc/events.c
+++ b/lustre/ptlrpc/events.c
@@ -220,7 +220,7 @@ void request_in_callback(lnet_event_t *ev)
  
          if (ev->unlinked) {
                  service->srv_nrqbd_receiving--;
-                CDEBUG(D_RPCTRACE,"Buffer complete: %d buffers still posted\n",
+                CDEBUG(D_INFO, "Buffer complete: %d buffers still posted\n",
                         service->srv_nrqbd_receiving);
  
                  /* Normally, don't complain about 0 buffers posted; LNET won't
author	nathan <nathan>
	Wed, 18 Jul 2007 19:37:28 +0000 (19:37 +0000)
committer	nathan <nathan>
	Wed, 18 Jul 2007 19:37:28 +0000 (19:37 +0000)
lustre/ChangeLog		patch \| blob \| history
lustre/include/obd_support.h		patch \| blob \| history
lustre/lov/lov_obd.c		patch \| blob \| history
lustre/mds/handler.c		patch \| blob \| history
lustre/mds/mds_lov.c		patch \| blob \| history
lustre/osc/osc_create.c		patch \| blob \| history
lustre/osc/osc_request.c		patch \| blob \| history
lustre/ptlrpc/client.c		patch \| blob \| history
lustre/ptlrpc/events.c		patch \| blob \| history