Whamcloud - gitweb
- many fixes in liblustre-related code. By now liblustre is at least buildable...
[fs/lustre-release.git] / lustre / ptlrpc / recover.c
index 18bc6f4..6bf1d35 100644 (file)
@@ -54,7 +54,7 @@ void ptlrpc_run_recovery_over_upcall(struct obd_device *obd)
 
         argv[0] = obd_lustre_upcall;
         argv[1] = "RECOVERY_OVER";
-        argv[2] = obd->obd_uuid.uuid;
+        argv[2] = (char *)obd->obd_uuid.uuid;
         argv[3] = NULL;
         
         envp[0] = "HOME=/";
@@ -68,8 +68,8 @@ void ptlrpc_run_recovery_over_upcall(struct obd_device *obd)
                        argv[0], argv[1], argv[2], rc);
 
         } else {
-                CERROR("Invoked upcall %s %s %s\n",
-                       argv[0], argv[1], argv[2]);
+                CWARN("Invoked upcall %s %s %s\n",
+                      argv[0], argv[1], argv[2]);
         }
 }
 
@@ -92,10 +92,10 @@ void ptlrpc_run_failed_import_upcall(struct obd_import* imp)
         
         argv[0] = obd_lustre_upcall;
         argv[1] = "FAILED_IMPORT";
-        argv[2] = imp->imp_target_uuid.uuid;
+        argv[2] = (char *)imp->imp_target_uuid.uuid;
         argv[3] = imp->imp_obd->obd_name;
-        argv[4] = imp->imp_connection->c_remote_uuid.uuid;
-        argv[5] = imp->imp_obd->obd_uuid.uuid;
+        argv[4] = (char *)imp->imp_connection->c_remote_uuid.uuid;
+        argv[5] = (char *)imp->imp_obd->obd_uuid.uuid;
         argv[6] = NULL;
 
         envp[0] = "HOME=/";
@@ -105,12 +105,12 @@ void ptlrpc_run_failed_import_upcall(struct obd_import* imp)
         rc = USERMODEHELPER(argv[0], argv, envp);
         if (rc < 0) {
                 CERROR("Error invoking recovery upcall %s %s %s %s %s: %d; "
-                       "check /proc/sys/lustre/lustre_upcall\n",
+                       "check /proc/sys/lustre/upcall\n",
                        argv[0], argv[1], argv[2], argv[3], argv[4],rc);
 
         } else {
-                CERROR("Invoked upcall %s %s %s %s %s\n",
-                       argv[0], argv[1], argv[2], argv[3], argv[4]);
+                CWARN("Invoked upcall %s %s %s %s %s\n",
+                      argv[0], argv[1], argv[2], argv[3], argv[4]);
         }
 #else
         if (imp->imp_state == LUSTRE_IMP_CLOSED) {
@@ -130,16 +130,16 @@ void ptlrpc_initiate_recovery(struct obd_import *imp)
         LASSERT (obd_lustre_upcall != NULL);
         
         if (strcmp(obd_lustre_upcall, "DEFAULT") == 0) {
-                CDEBUG(D_ERROR, "%s: starting recovery without upcall\n",
+                CDEBUG(D_HA, "%s: starting recovery without upcall\n",
                         imp->imp_target_uuid.uuid);
                 ptlrpc_connect_import(imp, NULL);
         } 
         else if (strcmp(obd_lustre_upcall, "NONE") == 0) {
-                CDEBUG(D_ERROR, "%s: recovery diabled\n",
+                CDEBUG(D_HA, "%s: recovery disabled\n",
                         imp->imp_target_uuid.uuid);
         } 
         else {
-                CDEBUG(D_ERROR, "%s: calling upcall to start recovery\n",
+                CDEBUG(D_HA, "%s: calling upcall to start recovery\n",
                         imp->imp_target_uuid.uuid);
                 ptlrpc_run_failed_import_upcall(imp);
         }
@@ -151,7 +151,7 @@ int ptlrpc_replay_next(struct obd_import *imp, int *inflight)
 {
         int rc = 0;
         struct list_head *tmp, *pos;
-        struct ptlrpc_request *req;
+        struct ptlrpc_request *req = NULL;
         unsigned long flags;
         __u64 last_transno;
         ENTRY;
@@ -187,16 +187,35 @@ int ptlrpc_replay_next(struct obd_import *imp, int *inflight)
          */
         list_for_each_safe(tmp, pos, &imp->imp_replay_list) {
                 req = list_entry(tmp, struct ptlrpc_request, rq_replay_list);
+
+                /* If need to resend, stop on the matching one first. It's 
+                   possible though it's already been committed, so in that case 
+                   we'll just continue with replay */
+                if (imp->imp_resend_replay && 
+                    req->rq_transno == last_transno) {
+                        lustre_msg_add_flags(req->rq_reqmsg, MSG_RESENT);
+                        break;
+                }
+
                 if (req->rq_transno > last_transno) {
-                        rc = ptlrpc_replay_req(req);
-                        if (rc) {
-                                CERROR("recovery replay error %d for req "
-                                       LPD64"\n", rc, req->rq_xid);
-                                RETURN(rc);
-                        }
-                        *inflight = 1;
+                        imp->imp_last_replay_transno = req->rq_transno;
                         break;
                 }
+
+                req = NULL;
+        }
+
+        imp->imp_resend_replay = 0;
+
+        if (req != NULL) {
+                rc = ptlrpc_replay_req(req);
+                if (rc) {
+                        CERROR("recovery replay error %d for req "
+                               LPD64"\n", rc, req->rq_xid);
+                        RETURN(rc);
+                }
+                imp->imp_reqs_replayed++;
+                *inflight = 1;
         }
         RETURN(rc);
 }
@@ -268,17 +287,17 @@ void ptlrpc_request_handle_notconn(struct ptlrpc_request *failed_req)
                         ptlrpc_deactivate_import(imp);
                 }
 
-                rc = ptlrpc_connect_import(imp, NULL);
+                /* to control recovery via lctl {disable|enable}_recovery */
+                if (imp->imp_deactive == 0)
+                        rc = ptlrpc_connect_import(imp, NULL);
         }
 
-        
         /* Wait for recovery to complete and resend. If evicted, then
            this request will be errored out later.*/
         spin_lock_irqsave(&failed_req->rq_lock, flags);
-        if (!failed_req->rq_no_resend)
-                failed_req->rq_resend = 1;
+        failed_req->rq_resend = 1;
         spin_unlock_irqrestore(&failed_req->rq_lock, flags);
-        
+
         EXIT;
 }
 
@@ -286,7 +305,7 @@ void ptlrpc_request_handle_notconn(struct ptlrpc_request *failed_req)
  * This should only be called by the ioctl interface, currently
  * with the lctl deactivate and activate commands.
  */
-int  ptlrpc_set_import_active(struct obd_import *imp, int active)
+int ptlrpc_set_import_active(struct obd_import *imp, int active)
 {
         struct obd_device *obd = imp->imp_obd;
         int rc = 0;
@@ -297,10 +316,12 @@ int  ptlrpc_set_import_active(struct obd_import *imp, int active)
          * requests. */
         if (!active) {
                 ptlrpc_invalidate_import(imp, 0);
-        } 
+                imp->imp_deactive = 1;
+        }
 
         /* When activating, mark import valid, and attempt recovery */
         if (active) {
+                imp->imp_deactive = 0;
                 CDEBUG(D_HA, "setting import %s VALID\n",
                        imp->imp_target_uuid.uuid);
                 rc = ptlrpc_recover_import(imp, NULL);
@@ -313,10 +334,10 @@ int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid)
 {
         int rc;
         ENTRY;
-        
+
         /* force import to be disconnected. */
         ptlrpc_set_import_discon(imp);
-        
+
         rc = ptlrpc_recover_import_no_retry(imp, new_uuid);
 
         RETURN(rc);
@@ -335,6 +356,24 @@ int ptlrpc_import_in_recovery(struct obd_import *imp)
         return in_recovery;
 }
 
+/* Set (disable != 0) or clear imp_deactive under imp_lock; if the import is
+ * in LUSTRE_IMP_DISCON, also force a verify and wake the pinger. Returns 0. */
+int ptlrpc_import_control_recovery(struct obd_import *imp, int disable)
+{
+        unsigned long flags;
+        ENTRY;
+
+        /* with imp_deactive == 1 pinger won't initiate re-connect */
+        spin_lock_irqsave(&imp->imp_lock, flags);
+        imp->imp_deactive = disable ? 1 : 0;
+        if (imp->imp_state == LUSTRE_IMP_DISCON) {
+                imp->imp_force_verify = 1;
+                ptlrpc_pinger_wake_up();
+        }
+        spin_unlock_irqrestore(&imp->imp_lock, flags);
+        RETURN(0);
+}
+
 static int ptlrpc_recover_import_no_retry(struct obd_import *imp,
                                           char *new_uuid)
 {
@@ -357,13 +396,13 @@ static int ptlrpc_recover_import_no_retry(struct obd_import *imp,
         if (rc)
                 RETURN(rc);
 
-        CDEBUG(D_ERROR, "%s: recovery started, waiting\n",
+        CDEBUG(D_HA, "%s: recovery started, waiting\n",
                imp->imp_target_uuid.uuid);
 
         lwi = LWI_TIMEOUT(MAX(obd_timeout * HZ, 1), NULL, NULL);
         rc = l_wait_event(imp->imp_recovery_waitq,
                           !ptlrpc_import_in_recovery(imp), &lwi);
-        CDEBUG(D_ERROR, "%s: recovery finished\n",
+        CDEBUG(D_HA, "%s: recovery finished\n",
                imp->imp_target_uuid.uuid);
 
         RETURN(rc);
@@ -388,6 +427,9 @@ void ptlrpc_fail_export(struct obd_export *exp)
         CDEBUG(D_HA, "disconnecting export %p/%s\n",
                exp, exp->exp_client_uuid.uuid);
 
+        if (obd_dump_on_timeout)
+                portals_debug_dumplog();
+
         /* Most callers into obd_disconnect are removing their own reference
          * (request, for example) in addition to the one from the hash table.
          * We don't have such a reference here, so make one. */