Whamcloud - gitweb
Make recovery on clients less verbose and less misleading.
[fs/lustre-release.git] / lustre / ptlrpc / recover.c
index 1ff5f30..66b1727 100644 (file)
@@ -105,7 +105,7 @@ void ptlrpc_run_failed_import_upcall(struct obd_import* imp)
         rc = USERMODEHELPER(argv[0], argv, envp);
         if (rc < 0) {
                 CERROR("Error invoking recovery upcall %s %s %s %s %s: %d; "
-                       "check /proc/sys/lustre/lustre_upcall\n",
+                       "check /proc/sys/lustre/upcall\n",
                        argv[0], argv[1], argv[2], argv[3], argv[4],rc);
 
         } else {
@@ -130,16 +130,16 @@ void ptlrpc_initiate_recovery(struct obd_import *imp)
         LASSERT (obd_lustre_upcall != NULL);
         
         if (strcmp(obd_lustre_upcall, "DEFAULT") == 0) {
-                CDEBUG(D_ERROR, "%s: starting recovery without upcall\n",
+                CDEBUG(D_HA, "%s: starting recovery without upcall\n",
                         imp->imp_target_uuid.uuid);
                 ptlrpc_connect_import(imp, NULL);
         } 
         else if (strcmp(obd_lustre_upcall, "NONE") == 0) {
-                CDEBUG(D_ERROR, "%s: recovery diabled\n",
+                CDEBUG(D_HA, "%s: recovery disabled\n",
                         imp->imp_target_uuid.uuid);
         } 
         else {
-                CDEBUG(D_ERROR, "%s: calling upcall to start recovery\n",
+                CDEBUG(D_HA, "%s: calling upcall to start recovery\n",
                         imp->imp_target_uuid.uuid);
                 ptlrpc_run_failed_import_upcall(imp);
         }
@@ -151,7 +151,7 @@ int ptlrpc_replay_next(struct obd_import *imp, int *inflight)
 {
         int rc = 0;
         struct list_head *tmp, *pos;
-        struct ptlrpc_request *req;
+        struct ptlrpc_request *req = NULL;
         unsigned long flags;
         __u64 last_transno;
         ENTRY;
@@ -187,16 +187,34 @@ int ptlrpc_replay_next(struct obd_import *imp, int *inflight)
          */
         list_for_each_safe(tmp, pos, &imp->imp_replay_list) {
                 req = list_entry(tmp, struct ptlrpc_request, rq_replay_list);
+
+                /* If a resend is needed, stop at the request matching
+                 * last_transno first. It may already have been committed,
+                 * though; in that case we simply continue with replay. */
+                if (imp->imp_resend_replay && 
+                    req->rq_transno == last_transno) {
+                        lustre_msg_add_flags(req->rq_reqmsg, MSG_RESENT);
+                        break;
+                }
+
                 if (req->rq_transno > last_transno) {
-                        rc = ptlrpc_replay_req(req);
-                        if (rc) {
-                                CERROR("recovery replay error %d for req "
-                                       LPD64"\n", rc, req->rq_xid);
-                                RETURN(rc);
-                        }
-                        *inflight = 1;
+                        imp->imp_last_replay_transno = req->rq_transno;
                         break;
                 }
+
+                req = NULL;
+        }
+
+        imp->imp_resend_replay = 0;
+
+        if (req != NULL) {
+                rc = ptlrpc_replay_req(req);
+                if (rc) {
+                        CERROR("recovery replay error %d for req "
+                               LPD64"\n", rc, req->rq_xid);
+                        RETURN(rc);
+                }
+                *inflight = 1;
         }
         RETURN(rc);
 }
@@ -258,15 +276,24 @@ void ptlrpc_request_handle_notconn(struct ptlrpc_request *failed_req)
                imp->imp_target_uuid.uuid,
                imp->imp_connection->c_remote_uuid.uuid);
         
-        ptlrpc_set_import_discon(imp);
+        if (ptlrpc_set_import_discon(imp)) {
+                if (!imp->imp_replayable) {
+                        CDEBUG(D_HA, "import %s@%s for %s not replayable, "
+                               "auto-deactivating\n",
+                               imp->imp_target_uuid.uuid,
+                               imp->imp_connection->c_remote_uuid.uuid,
+                               imp->imp_obd->obd_name);
+                        ptlrpc_deactivate_import(imp);
+                }
+
+                rc = ptlrpc_connect_import(imp, NULL);
+        }
 
-        rc = ptlrpc_connect_import(imp, NULL);
         
         /* Wait for recovery to complete and resend. If evicted, then
            this request will be errored out later.*/
         spin_lock_irqsave(&failed_req->rq_lock, flags);
-        if (!failed_req->rq_no_resend)
-                failed_req->rq_resend = 1;
+        failed_req->rq_resend = 1;
         spin_unlock_irqrestore(&failed_req->rq_lock, flags);
         
         EXIT;
@@ -279,7 +306,6 @@ void ptlrpc_request_handle_notconn(struct ptlrpc_request *failed_req)
 int ptlrpc_set_import_active(struct obd_import *imp, int active)
 {
         struct obd_device *obd = imp->imp_obd;
-        unsigned long flags;
         int rc = 0;
 
         LASSERT(obd);
@@ -287,23 +313,14 @@ int ptlrpc_set_import_active(struct obd_import *imp, int active)
         /* When deactivating, mark import invalid, and abort in-flight
          * requests. */
         if (!active) {
-                ptlrpc_invalidate_import(imp);
+                ptlrpc_invalidate_import(imp, 0);
         } 
 
         /* When activating, mark import valid, and attempt recovery */
         if (active) {
                 CDEBUG(D_HA, "setting import %s VALID\n",
                        imp->imp_target_uuid.uuid);
-                spin_lock_irqsave(&imp->imp_lock, flags);
-                imp->imp_invalid = 0;
-                spin_unlock_irqrestore(&imp->imp_lock, flags);
-
                 rc = ptlrpc_recover_import(imp, NULL);
-                if (rc) {
-                        spin_lock_irqsave(&imp->imp_lock, flags);
-                        imp->imp_invalid = 1;
-                        spin_unlock_irqrestore(&imp->imp_lock, flags);
-                }
         }
 
         RETURN(rc);
@@ -347,28 +364,26 @@ static int ptlrpc_recover_import_no_retry(struct obd_import *imp,
         spin_lock_irqsave(&imp->imp_lock, flags);
         if (imp->imp_state != LUSTRE_IMP_DISCON) {
                 in_recovery = 1;
-        } 
+        }
         spin_unlock_irqrestore(&imp->imp_lock, flags);
 
         if (in_recovery == 1)
                 RETURN(-EALREADY);
 
-        
         rc = ptlrpc_connect_import(imp, new_uuid);
         if (rc)
                 RETURN(rc);
 
-        CDEBUG(D_ERROR, "%s: recovery started, waiting\n", 
+        CDEBUG(D_HA, "%s: recovery started, waiting\n",
                imp->imp_target_uuid.uuid);
 
         lwi = LWI_TIMEOUT(MAX(obd_timeout * HZ, 1), NULL, NULL);
-        rc = l_wait_event(imp->imp_recovery_waitq, 
+        rc = l_wait_event(imp->imp_recovery_waitq,
                           !ptlrpc_import_in_recovery(imp), &lwi);
-        CDEBUG(D_ERROR, "%s: recovery finished\n", 
+        CDEBUG(D_HA, "%s: recovery finished\n",
                imp->imp_target_uuid.uuid);
 
         RETURN(rc);
-        
 }
 
 void ptlrpc_fail_export(struct obd_export *exp)