Whamcloud - gitweb
file llobdstat.pl was initially added on branch b_devel.
[fs/lustre-release.git] / lustre / ptlrpc / recovd.c
index d8ace91..21cb3fe 100644 (file)
  */
 
 #define DEBUG_SUBSYSTEM S_RPC
-
+#ifndef __KERNEL__
+#include <liblustre.h>
+#include <linux/obd.h>
+#include <linux/obd_class.h>
+#else 
 #include <linux/lustre_lite.h>
+#endif
+
 #include <linux/lustre_ha.h>
 #include <linux/obd_support.h>
 
 /*
  * dump_connection_list, but shorter for nicer debugging logs.
  *
  * Walks the list at @head, treating every node as the
  * c_recovd_data.rd_managed_chain member of a struct ptlrpc_connection,
  * and emits one D_HA debug line per connection: the connection pointer,
  * its remote uuid, and its rd_phase/rd_next_phase pair.
  */
 static void d_c_l(struct list_head *head)
 {
-        int sanity = 0;
         struct list_head *tmp;
 
         list_for_each(tmp, head) {
                 struct ptlrpc_connection *conn =
                         list_entry(tmp, struct ptlrpc_connection,
                                    c_recovd_data.rd_managed_chain);
-                CDEBUG(D_HA, "   %p = %s (%d/%d)\n", conn, conn->c_remote_uuid,
+                CDEBUG(D_HA, "   %p = %s (%d/%d)\n", conn, 
+                       conn->c_remote_uuid.uuid,
                        conn->c_recovd_data.rd_phase,
                        conn->c_recovd_data.rd_next_phase);
-                if (sanity++ > 50)
-                        LBUG();
         }
 }
 
@@ -59,16 +63,16 @@ void recovd_conn_manage(struct ptlrpc_connection *conn,
         if (!list_empty(&rd->rd_managed_chain)) {
                 if (rd->rd_recovd == recovd && rd->rd_recover == recover) {
                         CDEBUG(D_HA, "conn %p/%s already setup for recovery\n",
-                               conn, conn->c_remote_uuid);
+                               conn, conn->c_remote_uuid.uuid);
                         EXIT;
                         return;
                 }
                 CDEBUG(D_HA,
                        "conn %p/%s has recovery items %p/%p, making %p/%p\n",
-                       conn, conn->c_remote_uuid, rd->rd_recovd, rd->rd_recover,
+                       conn, conn->c_remote_uuid.uuid, rd->rd_recovd, rd->rd_recover,
                        recovd, recover);
                 spin_lock(&rd->rd_recovd->recovd_lock);
-                list_del(&rd->rd_managed_chain);
+                list_del_init(&rd->rd_managed_chain);
                 spin_unlock(&rd->rd_recovd->recovd_lock);
         }
 
@@ -85,6 +89,24 @@ void recovd_conn_manage(struct ptlrpc_connection *conn,
         EXIT;
 }
 
+/*
+ * Stop managing @conn for recovery.
+ *
+ * If the connection is still attached to a recovd, unlink it from that
+ * recovd's list and clear rd_recovd, both under recovd_lock.  Afterwards the
+ * recovery callback is cleared and the phase state is reset to the same idle
+ * values that recovd_handle_event() stores once a recovery has completed:
+ * rd_phase = RD_IDLE, rd_next_phase = RD_TROUBLED.
+ */
+void recovd_conn_unmanage(struct ptlrpc_connection *conn)
+{
+        struct recovd_data *rd = &conn->c_recovd_data;
+        struct recovd_obd *recovd = rd->rd_recovd;
+        ENTRY;
+
+        if (recovd) {
+                spin_lock(&recovd->recovd_lock);
+                list_del_init(&rd->rd_managed_chain);
+                rd->rd_recovd = NULL;
+                spin_unlock(&recovd->recovd_lock);
+        }
+        /*
+         * NOTE(review): the stores below are made without recovd_lock; the
+         * original author judged that "safe enough" -- confirm no recovd
+         * thread can still be looking at this rd once it is unlinked.
+         */
+        rd->rd_recover = NULL;
+        /* was a second store to rd_next_phase, which left rd_phase stale */
+        rd->rd_phase = RD_IDLE;
+        rd->rd_next_phase = RD_TROUBLED;
+        EXIT;
+}
+
 void recovd_conn_fail(struct ptlrpc_connection *conn)
 {
         struct recovd_data *rd = &conn->c_recovd_data;
@@ -98,18 +120,26 @@ void recovd_conn_fail(struct ptlrpc_connection *conn)
         }
 
         spin_lock(&recovd->recovd_lock);
-        if (rd->rd_phase != RD_IDLE) {
-                CERROR("connection %p to %s already in recovery\n",
-                       conn, conn->c_remote_uuid);
-                /* XXX need to distinguish from failure-in-recovery */
+        if (rd->rd_phase == RD_TROUBLED || rd->rd_phase == RD_PREPARING) {
+                CDEBUG(D_HA, "connection %p to %s already in recovery\n",
+                       conn, conn->c_remote_uuid.uuid);
                 spin_unlock(&recovd->recovd_lock);
                 EXIT;
                 return;
         }
-                
-        CERROR("connection %p to %s failed\n", conn, conn->c_remote_uuid);
+
+        CERROR("connection %p to %s nid "LPX64" on %s failed\n", conn,
+               conn->c_remote_uuid.uuid, conn->c_peer.peer_nid,
+               conn->c_peer.peer_ni->pni_name);
         list_del(&rd->rd_managed_chain);
         list_add_tail(&rd->rd_managed_chain, &recovd->recovd_troubled_items);
+        if (rd->rd_phase != RD_IDLE) {
+                CDEBUG(D_HA,
+                       "connection %p to %s failed in recovery: restarting\n",
+                       conn, conn->c_remote_uuid.uuid);
+                /* XXX call callback with PHASE_FAILED? */
+                rd->rd_next_phase = RD_TROUBLED;
+        }
         rd->rd_phase = RD_TROUBLED;
         dump_lists(recovd);
         spin_unlock(&recovd->recovd_lock);
@@ -125,7 +155,7 @@ void recovd_conn_fixed(struct ptlrpc_connection *conn)
         ENTRY;
 
         CDEBUG(D_HA, "connection %p (now to %s) fixed\n",
-               conn, conn->c_remote_uuid);
+               conn, conn->c_remote_uuid.uuid);
         spin_lock(&rd->rd_recovd->recovd_lock);
         list_del(&rd->rd_managed_chain);
         rd->rd_phase = RD_IDLE;
@@ -137,7 +167,6 @@ void recovd_conn_fixed(struct ptlrpc_connection *conn)
         EXIT;
 }
 
-
 static int recovd_check_event(struct recovd_obd *recovd)
 {
         int rc = 0;
@@ -192,12 +221,12 @@ static int recovd_handle_event(struct recovd_obd *recovd)
                 cb_failed: /* must always reach here with recovd_lock held! */
                         CERROR("recovery FAILED for rd %p (conn %p): %d\n",
                                rd, class_rd2conn(rd), rc);
-                        
+
                         spin_unlock(&recovd->recovd_lock);
                         (void)rd->rd_recover(rd, PTLRPC_RECOVD_PHASE_FAILURE);
                         spin_lock(&recovd->recovd_lock);
                         break;
-                        
+
                     case RD_TROUBLED:
                         if (!rd->rd_recover) {
                                 CERROR("no rd_recover for rd %p (conn %p)\n",
@@ -209,38 +238,38 @@ static int recovd_handle_event(struct recovd_obd *recovd)
                                rd, class_rd2conn(rd));
                         rd->rd_phase = RD_PREPARING;
                         rd->rd_next_phase = RD_PREPARED;
-                        
+
                         spin_unlock(&recovd->recovd_lock);
                         rc = rd->rd_recover(rd, PTLRPC_RECOVD_PHASE_PREPARE);
                         spin_lock(&recovd->recovd_lock);
                         if (rc)
                                 goto cb_failed;
-                        
+
                         break;
-                        
+
                     case RD_PREPARED:
-                        
+
                         CERROR("recovery prepared for rd %p (conn %p)\n",
                                rd, class_rd2conn(rd));
                         rd->rd_phase = RD_RECOVERING;
                         rd->rd_next_phase = RD_RECOVERED;
-                        
+
                         spin_unlock(&recovd->recovd_lock);
                         rc = rd->rd_recover(rd, PTLRPC_RECOVD_PHASE_RECOVER);
                         spin_lock(&recovd->recovd_lock);
                         if (rc)
                                 goto cb_failed;
-                        
+
                         break;
-                        
+
                     case RD_RECOVERED:
                         rd->rd_phase = RD_IDLE;
                         rd->rd_next_phase = RD_TROUBLED;
-                        
+
                         CERROR("recovery complete for rd %p (conn %p)\n",
                                rd, class_rd2conn(rd));
                         break;
-                        
+
                     default:
                         break;
                 }
@@ -249,18 +278,25 @@ static int recovd_handle_event(struct recovd_obd *recovd)
         RETURN(0);
 }
 
+#ifdef __KERNEL__
 static int recovd_main(void *arg)
 {
         struct recovd_obd *recovd = (struct recovd_obd *)arg;
-
+        unsigned long flags;
         ENTRY;
 
         lock_kernel();
         daemonize();
-        spin_lock_irq(&current->sigmask_lock);
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)
+        sigfillset(&current->blocked);
+        recalc_sigpending();
+#else
+        spin_lock_irqsave(&current->sigmask_lock, flags);
         sigfillset(&current->blocked);
         recalc_sigpending(current);
-        spin_unlock_irq(&current->sigmask_lock);
+        spin_unlock_irqrestore(&current->sigmask_lock, flags);
+#endif
 
         sprintf(current->comm, "lustre_recovd");
         unlock_kernel();
@@ -287,7 +323,7 @@ static int recovd_main(void *arg)
 
 int recovd_setup(struct recovd_obd *recovd)
 {
-        int rc;
+        int rc = 0; /* initialize for Liblustre */
 
         ENTRY;
 
@@ -313,6 +349,12 @@ int recovd_setup(struct recovd_obd *recovd)
 
         RETURN(0);
 }
+#else 
+int recovd_setup(struct recovd_obd *recovd) /* variant built when __KERNEL__ is not defined */
+{
+        return 0; /* performs no setup on @recovd; unconditionally reports success */
+}
+#endif
 
 int recovd_cleanup(struct recovd_obd *recovd)
 {