Whamcloud - gitweb
b=3869,1742
author rread <rread>
Sun, 18 Jul 2004 00:29:37 +0000 (00:29 +0000)
committer rread <rread>
Sun, 18 Jul 2004 00:29:37 +0000 (00:29 +0000)
These are the remaining fixes from bug 1742 that are needed to allow a
client to reconnect during recovery.  The request that triggered bug 3869
on the last run was actually a RESENT request that had been sent before
recovery finished, which this patch fixes.

lustre/include/linux/lustre_export.h
lustre/ldlm/ldlm_lib.c
lustre/mds/mds_fs.c
lustre/obdfilter/filter.c
lustre/osc/osc_request.c
lustre/ptlrpc/import.c

index 8cc24b9..664f936 100644 (file)
@@ -77,8 +77,9 @@ struct obd_export {
         spinlock_t                exp_lock; /* protects flags int below */
         /* ^ protects exp_outstanding_replies too */
         int                       exp_flags;
-        int                       exp_failed:1;
-        int                       exp_libclient:1; /* liblustre client? */
+        int                       exp_failed:1,
+                                  exp_replay_needed:1,
+                                  exp_libclient:1; /* liblustre client? */
         union {
                 struct mds_export_data    eu_mds_data;
                 struct filter_export_data eu_filter_data;
index 7d28ef0..1183afe 100644 (file)
@@ -887,6 +887,7 @@ int target_queue_recovery_request(struct ptlrpc_request *req,
         if (obd->obd_processing_task == current->pid ||
             transno < obd->obd_next_recovery_transno) {
                 /* Processing the queue right now, don't re-add. */
+                lustre_msg_clear_flags(req->rq_reqmsg, MSG_RESENT);
                 LASSERT(list_empty(&req->rq_list));
                 spin_unlock_bh(&obd->obd_processing_task_lock);
                 OBD_FREE(reqmsg, req->rq_reqlen);
@@ -990,7 +991,12 @@ int target_queue_final_reply(struct ptlrpc_request *req, int rc)
         list_add(&req->rq_list, &obd->obd_delayed_reply_queue);
 
         spin_lock_bh(&obd->obd_processing_task_lock);
-        --obd->obd_recoverable_clients;
+        /* only count the first "replay over" request from each
+           export */
+        if (req->rq_export->exp_replay_needed) {
+                --obd->obd_recoverable_clients;
+                req->rq_export->exp_replay_needed = 0;
+        }
         recovery_done = (obd->obd_recoverable_clients == 0);
         spin_unlock_bh(&obd->obd_processing_task_lock);
 
index fccf34f..e32ba3f 100644 (file)
@@ -327,6 +327,7 @@ static int mds_read_last_rcvd(struct obd_device *obd, struct file *file)
                 spin_lock_init(&med->med_open_lock);
 
                 mcd = NULL;
+                exp->exp_replay_needed = 1;
                 obd->obd_recoverable_clients++;
                 obd->obd_max_recoverable_clients++;
                 class_export_put(exp);
index 7adb22d..dc45318 100644 (file)
@@ -485,7 +485,9 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp)
                 spin_lock_init(&fed->fed_lock);
 
                 fcd = NULL;
+                exp->exp_replay_needed = 1;
                 obd->obd_recoverable_clients++;
+                obd->obd_max_recoverable_clients++;
                 class_export_put(exp);
 
                 CDEBUG(D_OTHER, "client at idx %d has last_rcvd = "LPU64"\n",
index 67e4393..3c5cf7e 100644 (file)
@@ -785,9 +785,6 @@ static int osc_brw_prep_request(int cmd, struct obd_import *imp,struct obdo *oa,
         LASSERT((void *)(niobuf - niocount) ==
                 lustre_msg_buf(req->rq_reqmsg, 2, niocount * sizeof(*niobuf)));
         osc_announce_cached(cli, &body->oa, opc == OST_WRITE ? requested_nob:0);
-        spin_lock_irqsave(&req->rq_lock, flags);
-        req->rq_no_resend = 1;
-        spin_unlock_irqrestore(&req->rq_lock, flags);
 
         /* size[0] still sizeof (*body) */
         if (opc == OST_WRITE) {
@@ -908,8 +905,6 @@ restart_bulk:
         rc = osc_brw_prep_request(cmd, class_exp2cliimp(exp), oa, lsm,
                                   page_count, pga, &requested_nob, &niocount,
                                   &request);
-        /* NB ^ sets rq_no_resend */
-
         if (rc != 0)
                 return (rc);
 
@@ -938,13 +933,6 @@ static int brw_interpret(struct ptlrpc_request *request,
         struct brw_page *pga = aa->aa_pga;
         ENTRY;
 
-        /* XXX bug 937 here */
-        if (rc == -ETIMEDOUT && request->rq_resend) {
-                DEBUG_REQ(D_HA, request,  "BULK TIMEOUT");
-                LBUG(); /* re-send.  later. */
-                //goto restart_bulk;
-        }
-
         rc = osc_brw_fini_request(request, oa, requested_nob, niocount,
                                   page_count, pga, rc);
         RETURN (rc);
@@ -964,8 +952,6 @@ static int async_internal(int cmd, struct obd_export *exp, struct obdo *oa,
         rc = osc_brw_prep_request(cmd, class_exp2cliimp(exp), oa, lsm,
                                   page_count, pga, &requested_nob, &nio_count,
                                   &request);
-        /* NB ^ sets rq_no_resend */
-
         if (rc == 0) {
                 LASSERT(sizeof(*aa) <= sizeof(request->rq_async_args));
                 aa = (struct osc_brw_async_args *)&request->rq_async_args;
index cf5e0be..5513345 100644 (file)
@@ -100,6 +100,10 @@ int ptlrpc_set_import_discon(struct obd_import *imp)
         spin_lock_irqsave(&imp->imp_lock, flags);
 
         if (imp->imp_state == LUSTRE_IMP_FULL) {
+                CERROR("%s: connection lost to %s@%s\n",
+                       imp->imp_obd->obd_name, 
+                       imp->imp_target_uuid.uuid,
+                       imp->imp_connection->c_remote_uuid.uuid);
                 IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON);
                 spin_unlock_irqrestore(&imp->imp_lock, flags);
                 obd_import_event(imp->imp_obd, imp, IMP_EVENT_DISCON);
@@ -407,6 +411,9 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request,
                 if (imp->imp_invalid) {
                         IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED);
                 } else if (MSG_CONNECT_RECOVERING & msg_flags) {
+                        CDEBUG(D_HA, "%s: reconnected to %s during replay\n",
+                               imp->imp_obd->obd_name, 
+                               imp->imp_target_uuid.uuid);
                         imp->imp_resend_replay = 1;
                         IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY);
                 } else {
@@ -476,7 +483,15 @@ static int completed_replay_interpret(struct ptlrpc_request *req,
                                     void * data, int rc)
 {
         atomic_dec(&req->rq_import->imp_replay_inflight);
-        ptlrpc_import_recovery_state_machine(req->rq_import);
+        if (req->rq_status == 0) {
+                ptlrpc_import_recovery_state_machine(req->rq_import);
+        } else {
+                CDEBUG(D_HA, "%s: LAST_REPLAY message error: %d, "
+                       "reconnecting\n", 
+                       req->rq_import->imp_obd->obd_name, req->rq_status);
+                ptlrpc_connect_import(req->rq_import, NULL);
+        }
+
         RETURN(0);
 }
 
@@ -557,6 +572,10 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp)
                         GOTO(out, rc);
                 IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL);
                 ptlrpc_activate_import(imp);
+                CERROR("%s: connection restored to %s@%s\n",
+                       imp->imp_obd->obd_name, 
+                       imp->imp_target_uuid.uuid,
+                       imp->imp_connection->c_remote_uuid.uuid);
         }
 
         if (imp->imp_state == LUSTRE_IMP_FULL) {