Whamcloud - gitweb
- Rename the ptlrpc-general reconnection and replay functions, and export them
[fs/lustre-release.git] / lustre / ptlrpc / recovd.c
index 7561ba0..f0fe5ca 100644 (file)
 #include <linux/lustre_ha.h>
 #include <linux/obd_support.h>
 
+/* dump_connection_list, but shorter for nicer debugging logs.
+ *
+ * Walks the list at `head`, whose entries are linked through
+ * ptlrpc_connection.c_recovd_data.rd_managed_chain, and emits one D_HA
+ * debug line per connection: its pointer, its remote uuid, and its
+ * rd_phase/rd_next_phase pair.
+ *
+ * The sanity counter calls LBUG() when a 52nd entry is visited,
+ * presumably to catch a corrupted (looping) list -- NOTE(review): confirm
+ * that a legitimately longer list cannot occur.
+ */
+static void d_c_l(struct list_head *head)
+{
+        int sanity = 0;
+        struct list_head *tmp;
+
+        list_for_each(tmp, head) {
+                struct ptlrpc_connection *conn =
+                        list_entry(tmp, struct ptlrpc_connection,
+                                   c_recovd_data.rd_managed_chain);
+                CDEBUG(D_HA, "   %p = %s (%d/%d)\n", conn, conn->c_remote_uuid,
+                       conn->c_recovd_data.rd_phase,
+                       conn->c_recovd_data.rd_next_phase);
+                if (sanity++ > 50)      /* post-increment: true on entry 52 */
+                        LBUG();
+        }
+}
+
+static void dump_lists(struct recovd_obd *recovd)
+{       /*
+         * Log every connection on recovd's managed list, then every
+         * connection on its troubled list, at D_HA debug level (one
+         * heading line per list, followed by d_c_l() output).
+         * The callers visible in this patch all invoke it while holding
+         * recovd->recovd_lock; this function takes no lock itself.
+         */
+        CDEBUG(D_HA, "managed: \n");
+        d_c_l(&recovd->recovd_managed_items);
+        CDEBUG(D_HA, "troubled: \n");
+        d_c_l(&recovd->recovd_troubled_items);
+}
+
 void recovd_conn_manage(struct ptlrpc_connection *conn,
                         struct recovd_obd *recovd, ptlrpc_recovery_cb_t recover)
 {
         struct recovd_data *rd = &conn->c_recovd_data;
         ENTRY;
 
+        if (!list_empty(&rd->rd_managed_chain)) {
+                if (rd->rd_recovd == recovd && rd->rd_recover == recover) {
+                        CDEBUG(D_HA, "conn %p/%s already setup for recovery\n",
+                               conn, conn->c_remote_uuid);
+                        EXIT;
+                        return;
+                }
+                CDEBUG(D_HA,
+                       "conn %p/%s has recovery items %p/%p, making %p/%p\n",
+                       conn, conn->c_remote_uuid, rd->rd_recovd, rd->rd_recover,
+                       recovd, recover);
+                spin_lock(&rd->rd_recovd->recovd_lock);
+                list_del(&rd->rd_managed_chain);
+                spin_unlock(&rd->rd_recovd->recovd_lock);
+        }
+
         rd->rd_recovd = recovd;
         rd->rd_recover = recover;
+        rd->rd_phase = RD_IDLE;
+        rd->rd_next_phase = RD_TROUBLED;
 
         spin_lock(&recovd->recovd_lock);
         list_add(&rd->rd_managed_chain, &recovd->recovd_managed_items);
+        dump_lists(recovd);
         spin_unlock(&recovd->recovd_lock);
 
         EXIT;
@@ -44,13 +89,25 @@ void recovd_conn_fail(struct ptlrpc_connection *conn)
 
         if (!recovd) {
                 CERROR("no recovd for connection %p\n", conn);
+                EXIT;
                 return;
         }
 
-        CERROR("connection %p to %s failed\n", conn, conn->c_remote_uuid);
         spin_lock(&recovd->recovd_lock);
+        if (rd->rd_phase != RD_IDLE) {
+                CERROR("connection %p to %s already in recovery\n",
+                       conn, conn->c_remote_uuid);
+                /* XXX need to distinguish from failure-in-recovery */
+                spin_unlock(&recovd->recovd_lock);
+                EXIT;
+                return;
+        }
+                
+        CERROR("connection %p to %s failed\n", conn, conn->c_remote_uuid);
         list_del(&rd->rd_managed_chain);
         list_add_tail(&rd->rd_managed_chain, &recovd->recovd_troubled_items);
+        rd->rd_phase = RD_TROUBLED;
+        dump_lists(recovd);
         spin_unlock(&recovd->recovd_lock);
 
         wake_up(&recovd->recovd_waitq);
@@ -58,14 +115,20 @@ void recovd_conn_fail(struct ptlrpc_connection *conn)
         EXIT;
 }
 
-/* this function must be called with conn->c_lock held */
 void recovd_conn_fixed(struct ptlrpc_connection *conn)
 {       /*
+         * Put a connection back on its recovd's managed list.
+         *
+         * Under rd->rd_recovd->recovd_lock: unlink the connection's
+         * recovd_data from whichever list it is currently on, reset
+         * rd_phase/rd_next_phase to RD_IDLE/RD_TROUBLED (the same values
+         * recovd_conn_manage() assigns), add it to recovd_managed_items,
+         * and dump both lists to the D_HA log.
+         *
+         * This function acquires recovd_lock itself.
+         * NOTE(review): unlike recovd_conn_fail(), rd->rd_recovd is not
+         * checked for NULL before its lock is taken -- confirm callers
+         * only pass connections that have been through
+         * recovd_conn_manage().
+         */
         struct recovd_data *rd = &conn->c_recovd_data;
         ENTRY;
 
+        CDEBUG(D_HA, "connection %p (now to %s) fixed\n",
+               conn, conn->c_remote_uuid);
+        spin_lock(&rd->rd_recovd->recovd_lock);
         list_del(&rd->rd_managed_chain);
+        rd->rd_phase = RD_IDLE;
+        rd->rd_next_phase = RD_TROUBLED;
         list_add(&rd->rd_managed_chain, &rd->rd_recovd->recovd_managed_items);
+        dump_lists(rd->rd_recovd);
+        spin_unlock(&rd->rd_recovd->recovd_lock);
 
         EXIT;
 }
@@ -89,7 +152,7 @@ static int recovd_check_event(struct recovd_obd *recovd)
                                                     rd_managed_chain);
 
                 if (rd->rd_phase == rd->rd_next_phase ||
-                    rd->rd_phase == RECOVD_FAILED)
+                    rd->rd_phase == RD_FAILED)
                         GOTO(out, rc = 1);
         }
 
@@ -98,18 +161,6 @@ static int recovd_check_event(struct recovd_obd *recovd)
         RETURN(rc);
 }
 
-static void dump_connection_list(struct list_head *head)
-{
-        struct list_head *tmp;
-
-        list_for_each(tmp, head) {
-                struct ptlrpc_connection *conn =
-                        list_entry(tmp, struct ptlrpc_connection,
-                                   c_recovd_data.rd_managed_chain);
-                CDEBUG(D_NET, "   %p = %s\n", conn, conn->c_remote_uuid);
-        }
-}
-
 static int recovd_handle_event(struct recovd_obd *recovd)
 {
         struct list_head *tmp, *n;
@@ -118,10 +169,7 @@ static int recovd_handle_event(struct recovd_obd *recovd)
 
         spin_lock(&recovd->recovd_lock);
 
-        CDEBUG(D_NET, "managed: \n");
-        dump_connection_list(&recovd->recovd_managed_items);
-        CDEBUG(D_NET, "troubled: \n");
-        dump_connection_list(&recovd->recovd_troubled_items);
+        dump_lists(recovd);
 
         /*
          * We use _safe here because one of the callbacks, especially
@@ -131,12 +179,12 @@ static int recovd_handle_event(struct recovd_obd *recovd)
                 struct recovd_data *rd = list_entry(tmp, struct recovd_data,
                                                     rd_managed_chain);
 
-                if (rd->rd_phase != RECOVD_FAILED &&
+                if (rd->rd_phase != RD_FAILED &&
                     rd->rd_phase != rd->rd_next_phase)
                         continue;
 
                 switch (rd->rd_phase) {
-                    case RECOVD_FAILED:
+                    case RD_FAILED:
                 cb_failed: /* must always reach here with recovd_lock held! */
                         CERROR("recovery FAILED for rd %p (conn %p): %d\n",
                                rd, class_rd2conn(rd), rc);
@@ -146,7 +194,7 @@ static int recovd_handle_event(struct recovd_obd *recovd)
                         spin_lock(&recovd->recovd_lock);
                         break;
                         
-                    case RECOVD_IDLE:
+                    case RD_TROUBLED:
                         if (!rd->rd_recover) {
                                 CERROR("no rd_recover for rd %p (conn %p)\n",
                                        rd, class_rd2conn(rd));
@@ -155,7 +203,8 @@ static int recovd_handle_event(struct recovd_obd *recovd)
                         }
                         CERROR("starting recovery for rd %p (conn %p)\n",
                                rd, class_rd2conn(rd));
-                        rd->rd_phase = RECOVD_PREPARING;
+                        rd->rd_phase = RD_PREPARING;
+                        rd->rd_next_phase = RD_PREPARED;
                         
                         spin_unlock(&recovd->recovd_lock);
                         rc = rd->rd_recover(rd, PTLRPC_RECOVD_PHASE_PREPARE);
@@ -163,14 +212,14 @@ static int recovd_handle_event(struct recovd_obd *recovd)
                         if (rc)
                                 goto cb_failed;
                         
-                        rd->rd_next_phase = RECOVD_PREPARED;
                         break;
                         
-                    case RECOVD_PREPARED:
-                        rd->rd_phase = RECOVD_RECOVERING;
+                    case RD_PREPARED:
                         
                         CERROR("recovery prepared for rd %p (conn %p)\n",
                                rd, class_rd2conn(rd));
+                        rd->rd_phase = RD_RECOVERING;
+                        rd->rd_next_phase = RD_RECOVERED;
                         
                         spin_unlock(&recovd->recovd_lock);
                         rc = rd->rd_recover(rd, PTLRPC_RECOVD_PHASE_RECOVER);
@@ -178,12 +227,11 @@ static int recovd_handle_event(struct recovd_obd *recovd)
                         if (rc)
                                 goto cb_failed;
                         
-                        rd->rd_next_phase = RECOVD_RECOVERED;
                         break;
                         
-                    case RECOVD_RECOVERED:
-                        rd->rd_phase = RECOVD_IDLE;
-                        rd->rd_next_phase = RECOVD_PREPARING;
+                    case RD_RECOVERED:
+                        rd->rd_phase = RD_IDLE;
+                        rd->rd_next_phase = RD_TROUBLED;
                         
                         CERROR("recovery complete for rd %p (conn %p)\n",
                                rd, class_rd2conn(rd));
@@ -229,15 +277,13 @@ static int recovd_main(void *arg)
         recovd->recovd_thread = NULL;
         recovd->recovd_state = RECOVD_STOPPED;
         wake_up(&recovd->recovd_ctl_waitq);
-        CDEBUG(D_NET, "mgr exiting process %d\n", current->pid);
+        CDEBUG(D_HA, "mgr exiting process %d\n", current->pid);
         RETURN(0);
 }
 
 int recovd_setup(struct recovd_obd *recovd)
 {
         int rc;
-        extern void (*class_signal_connection_failure)
-                (struct ptlrpc_connection *);
 
         ENTRY;
 
@@ -258,15 +304,15 @@ int recovd_setup(struct recovd_obd *recovd)
         wait_event(recovd->recovd_ctl_waitq,
                    recovd->recovd_state == RECOVD_READY);
 
-        /* exported and called by obdclass timeout handlers */
-        class_signal_connection_failure = recovd_conn_fail;
         ptlrpc_recovd = recovd;
+        class_signal_connection_failure = recovd_conn_fail;
 
         RETURN(0);
 }
 
 int recovd_cleanup(struct recovd_obd *recovd)
 {
+        ENTRY;
         spin_lock(&recovd->recovd_lock);
         recovd->recovd_state = RECOVD_STOPPING;
         wake_up(&recovd->recovd_waitq);