Whamcloud - gitweb
b=5538
[fs/lustre-release.git] / lustre / ptlrpc / recover.c
index 14c9d60..6731c7d 100644 (file)
@@ -68,8 +68,8 @@ void ptlrpc_run_recovery_over_upcall(struct obd_device *obd)
                        argv[0], argv[1], argv[2], rc);
 
         } else {
-                CERROR("Invoked upcall %s %s %s\n",
-                       argv[0], argv[1], argv[2]);
+                CWARN("Invoked upcall %s %s %s\n",
+                      argv[0], argv[1], argv[2]);
         }
 }
 
@@ -105,12 +105,12 @@ void ptlrpc_run_failed_import_upcall(struct obd_import* imp)
         rc = USERMODEHELPER(argv[0], argv, envp);
         if (rc < 0) {
                 CERROR("Error invoking recovery upcall %s %s %s %s %s: %d; "
-                       "check /proc/sys/lustre/lustre_upcall\n",
+                       "check /proc/sys/lustre/upcall\n",
                        argv[0], argv[1], argv[2], argv[3], argv[4],rc);
 
         } else {
-                CERROR("Invoked upcall %s %s %s %s %s\n",
-                       argv[0], argv[1], argv[2], argv[3], argv[4]);
+                CWARN("Invoked upcall %s %s %s %s %s\n",
+                      argv[0], argv[1], argv[2], argv[3], argv[4]);
         }
 #else
         if (imp->imp_state == LUSTRE_IMP_CLOSED) {
@@ -130,16 +130,16 @@ void ptlrpc_initiate_recovery(struct obd_import *imp)
         LASSERT (obd_lustre_upcall != NULL);
         
         if (strcmp(obd_lustre_upcall, "DEFAULT") == 0) {
-                CDEBUG(D_ERROR, "%s: starting recovery without upcall\n",
+                CDEBUG(D_HA, "%s: starting recovery without upcall\n",
                         imp->imp_target_uuid.uuid);
                 ptlrpc_connect_import(imp, NULL);
         } 
         else if (strcmp(obd_lustre_upcall, "NONE") == 0) {
-                CDEBUG(D_ERROR, "%s: recovery diabled\n",
+                CDEBUG(D_HA, "%s: recovery disabled\n",
                         imp->imp_target_uuid.uuid);
         } 
         else {
-                CDEBUG(D_ERROR, "%s: calling upcall to start recovery\n",
+                CDEBUG(D_HA, "%s: calling upcall to start recovery\n",
                         imp->imp_target_uuid.uuid);
                 ptlrpc_run_failed_import_upcall(imp);
         }
@@ -151,7 +151,7 @@ int ptlrpc_replay_next(struct obd_import *imp, int *inflight)
 {
         int rc = 0;
         struct list_head *tmp, *pos;
-        struct ptlrpc_request *req;
+        struct ptlrpc_request *req = NULL;
         unsigned long flags;
         __u64 last_transno;
         ENTRY;
@@ -187,16 +187,34 @@ int ptlrpc_replay_next(struct obd_import *imp, int *inflight)
          */
         list_for_each_safe(tmp, pos, &imp->imp_replay_list) {
                 req = list_entry(tmp, struct ptlrpc_request, rq_replay_list);
+
+                /* If we need to resend, stop at the request whose transno
+                   matches last_transno first. It may already have been
+                   committed, in which case we just continue with replay. */
+                if (imp->imp_resend_replay && 
+                    req->rq_transno == last_transno) {
+                        lustre_msg_add_flags(req->rq_reqmsg, MSG_RESENT);
+                        break;
+                }
+
                 if (req->rq_transno > last_transno) {
-                        rc = ptlrpc_replay_req(req);
-                        if (rc) {
-                                CERROR("recovery replay error %d for req "
-                                       LPD64"\n", rc, req->rq_xid);
-                                RETURN(rc);
-                        }
-                        *inflight = 1;
+                        imp->imp_last_replay_transno = req->rq_transno;
                         break;
                 }
+
+                req = NULL;
+        }
+
+        imp->imp_resend_replay = 0;
+
+        if (req != NULL) {
+                rc = ptlrpc_replay_req(req);
+                if (rc) {
+                        CERROR("recovery replay error %d for req "
+                               LPD64"\n", rc, req->rq_xid);
+                        RETURN(rc);
+                }
+                *inflight = 1;
         }
         RETURN(rc);
 }
@@ -271,13 +289,12 @@ void ptlrpc_request_handle_notconn(struct ptlrpc_request *failed_req)
                 rc = ptlrpc_connect_import(imp, NULL);
         }
 
-        
         /* Wait for recovery to complete and resend. If evicted, then
            this request will be errored out later.*/
         spin_lock_irqsave(&failed_req->rq_lock, flags);
         failed_req->rq_resend = 1;
         spin_unlock_irqrestore(&failed_req->rq_lock, flags);
-        
+
         EXIT;
 }
 
@@ -296,10 +313,12 @@ int ptlrpc_set_import_active(struct obd_import *imp, int active)
          * requests. */
         if (!active) {
                 ptlrpc_invalidate_import(imp, 0);
-        } 
+                imp->imp_deactive = 1;
+        }
 
         /* When activating, mark import valid, and attempt recovery */
         if (active) {
+                imp->imp_deactive = 0;
                 CDEBUG(D_HA, "setting import %s VALID\n",
                        imp->imp_target_uuid.uuid);
                 rc = ptlrpc_recover_import(imp, NULL);
@@ -312,10 +331,10 @@ int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid)
 {
         int rc;
         ENTRY;
-        
+
         /* force import to be disconnected. */
         ptlrpc_set_import_discon(imp);
-        
+
         rc = ptlrpc_recover_import_no_retry(imp, new_uuid);
 
         RETURN(rc);
@@ -356,13 +375,13 @@ static int ptlrpc_recover_import_no_retry(struct obd_import *imp,
         if (rc)
                 RETURN(rc);
 
-        CDEBUG(D_ERROR, "%s: recovery started, waiting\n",
+        CDEBUG(D_HA, "%s: recovery started, waiting\n",
                imp->imp_target_uuid.uuid);
 
         lwi = LWI_TIMEOUT(MAX(obd_timeout * HZ, 1), NULL, NULL);
         rc = l_wait_event(imp->imp_recovery_waitq,
                           !ptlrpc_import_in_recovery(imp), &lwi);
-        CDEBUG(D_ERROR, "%s: recovery finished\n",
+        CDEBUG(D_HA, "%s: recovery finished\n",
                imp->imp_target_uuid.uuid);
 
         RETURN(rc);
@@ -387,6 +406,9 @@ void ptlrpc_fail_export(struct obd_export *exp)
         CDEBUG(D_HA, "disconnecting export %p/%s\n",
                exp, exp->exp_client_uuid.uuid);
 
+        if (obd_dump_on_timeout)
+                portals_debug_dumplog();
+
         /* Most callers into obd_disconnect are removing their own reference
          * (request, for example) in addition to the one from the hash table.
          * We don't have such a reference here, so make one. */