- Add RD_TROUBLED state for items that need to start recovery, and rename

author shaver <shaver>

Tue, 1 Oct 2002 18:55:37 +0000 (18:55 +0000)

committer shaver <shaver>

Tue, 1 Oct 2002 18:55:37 +0000 (18:55 +0000)
author shaver <shaver>
Tue, 1 Oct 2002 18:55:37 +0000 (18:55 +0000)
committer shaver <shaver>
Tue, 1 Oct 2002 18:55:37 +0000 (18:55 +0000)
diff --git a/lustre/include/linux/lustre_ha.h b/lustre/include/linux/lustre_ha.h

index 9f90624..d72a804 100644 (file)
--- a/lustre/include/linux/lustre_ha.h
+++ b/lustre/include/linux/lustre_ha.h
@@ -12,12 +12,13 @@ struct recovd_obd;
  struct ptlrpc_connection;
  
  /* rd_phase/rd_next_phase values */
-#define RECOVD_IDLE              0
-#define RECOVD_PREPARING         1
-#define RECOVD_PREPARED          2
-#define RECOVD_RECOVERING        3
-#define RECOVD_RECOVERED         4
-#define RECOVD_FAILED            5
+#define RD_IDLE              0
+#define RD_TROUBLED          1
+#define RD_PREPARING         2
+#define RD_PREPARED          3
+#define RD_RECOVERING        4
+#define RD_RECOVERED         5
+#define RD_FAILED            6
  
  /* recovd_state values */
  #define RECOVD_READY             1
diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c

index 87e5f0b..a2e2857 100644 (file)
--- a/lustre/ldlm/ldlm_request.c
+++ b/lustre/ldlm/ldlm_request.c
@@ -23,14 +23,39 @@ static int interrupted_completion_wait(void *data)
  static int expired_completion_wait(void *data)
  {
          struct ldlm_lock *lock = data;
+        struct ptlrpc_connection *conn;
+        struct obd_device *obd;
+
          if (!lock)
                  CERROR("NULL lock\n");
-        else if (!lock->l_export)
-                CERROR("lock %p has NULL export\n", lock);
-        else
-                class_signal_connection_failure(lock->l_export->exp_connection);
+        else if (!lock->l_connh)
+                CERROR("lock %p has NULL connh\n", lock);
+        else if (!(obd = class_conn2obd(lock->l_connh)))
+                CERROR("lock %p has NULL obd\n", lock);
+        else if (!(conn = obd->u.cli.cl_import.imp_connection))
+                CERROR("lock %p has NULL connection\n", lock);
+        else {
+                class_signal_connection_failure(conn);
+        }
+        RETURN(0);
+}
+
+#if 0
+static int expired_completion_wait(void *data)
+{
+        struct ldlm_lock *lock = data;
+        struct ptlrpc_connection *conn =
+                class_conn2cliimp(lock->l_connh)->imp_connection;
+
+        if (!conn) {
+                CERROR("lock %p has NULL import connection\n", lock);
+                RETURN(1);
+        }
+
+        class_signal_connection_failure(conn);
          RETURN(0);
  }
+#endif
  
  int ldlm_completion_ast(struct ldlm_lock *lock, int flags)
  {
diff --git a/lustre/lib/l_net.c b/lustre/lib/l_net.c

index 8a83095..89d5237 100644 (file)
--- a/lustre/lib/l_net.c
+++ b/lustre/lib/l_net.c
@@ -344,7 +344,7 @@ static int target_fence_failed_connection(struct ptlrpc_connection *conn)
          ENTRY;
  
          conn->c_level = LUSTRE_CONN_RECOVD;
-        conn->c_recovd_data.rd_phase = RECOVD_PREPARED;
+        conn->c_recovd_data.rd_phase = RD_PREPARED;
  
          RETURN(0);
  }
diff --git a/lustre/llite/super.c b/lustre/llite/super.c

index f2c188e..71413a9 100644 (file)
--- a/lustre/llite/super.c
+++ b/lustre/llite/super.c
@@ -153,6 +153,7 @@ static struct super_block * ll_read_super(struct super_block *sb,
                  CERROR("OSC %s: not setup or attached\n", osc);
                  GOTO(out_mdc, sb = NULL);
          }
+
          err = obd_connect(&sbi->ll_osc_conn, obd, sbi->ll_sb_uuid);
          if (err) {
                  CERROR("cannot connect to %s: rc = %d\n", osc, err);
diff --git a/lustre/ptlrpc/recovd.c b/lustre/ptlrpc/recovd.c

index f7787d3..2c25dd9 100644 (file)
--- a/lustre/ptlrpc/recovd.c
+++ b/lustre/ptlrpc/recovd.c
@@ -28,6 +28,8 @@ void recovd_conn_manage(struct ptlrpc_connection *conn,
  
          rd->rd_recovd = recovd;
          rd->rd_recover = recover;
+        rd->rd_phase = RD_IDLE;
+        rd->rd_next_phase = RD_TROUBLED;
  
          spin_lock(&recovd->recovd_lock);
          list_add(&rd->rd_managed_chain, &recovd->recovd_managed_items);
@@ -50,9 +52,10 @@ void recovd_conn_fail(struct ptlrpc_connection *conn)
  
  
          spin_lock(&recovd->recovd_lock);
-        if (rd->rd_phase != RECOVD_IDLE || rd->rd_next_phase != RECOVD_IDLE) {
+        if (rd->rd_phase != RD_IDLE) {
                  CDEBUG(D_INFO, "connection %p to %s already in recovery\n",
                         conn, conn->c_remote_uuid);
+                /* XXX need to distinguish from failure-in-recovery */
                  spin_unlock(&recovd->recovd_lock);
                  EXIT;
                  return;
@@ -61,7 +64,7 @@ void recovd_conn_fail(struct ptlrpc_connection *conn)
          CERROR("connection %p to %s failed\n", conn, conn->c_remote_uuid);
          list_del(&rd->rd_managed_chain);
          list_add_tail(&rd->rd_managed_chain, &recovd->recovd_troubled_items);
-        rd->rd_next_phase = RECOVD_PREPARING;
+        rd->rd_phase = RD_TROUBLED;
          spin_unlock(&recovd->recovd_lock);
  
          wake_up(&recovd->recovd_waitq);
@@ -69,14 +72,18 @@ void recovd_conn_fail(struct ptlrpc_connection *conn)
          EXIT;
  }
  
-/* this function must be called with conn->c_lock held */
+/* acquires recovd->recovd_lock itself; callers must not already hold it */
  void recovd_conn_fixed(struct ptlrpc_connection *conn)
  {
          struct recovd_data *rd = &conn->c_recovd_data;
          ENTRY;
  
+        spin_lock(&rd->rd_recovd->recovd_lock);
          list_del(&rd->rd_managed_chain);
+        rd->rd_phase = RD_IDLE;
+        rd->rd_next_phase = RD_TROUBLED;
          list_add(&rd->rd_managed_chain, &rd->rd_recovd->recovd_managed_items);
+        spin_unlock(&rd->rd_recovd->recovd_lock);
  
          EXIT;
  }
@@ -100,9 +107,7 @@ static int recovd_check_event(struct recovd_obd *recovd)
                                                      rd_managed_chain);
  
                  if (rd->rd_phase == rd->rd_next_phase ||
-                    (rd->rd_phase == RECOVD_IDLE && 
-                     rd->rd_next_phase == RECOVD_PREPARING) ||
-                    rd->rd_phase == RECOVD_FAILED)
+                    rd->rd_phase == RD_FAILED)
                          GOTO(out, rc = 1);
          }
  
@@ -144,15 +149,12 @@ static int recovd_handle_event(struct recovd_obd *recovd)
                  struct recovd_data *rd = list_entry(tmp, struct recovd_data,
                                                      rd_managed_chain);
  
-                /* XXXshaver This is very ugly -- add a RECOVD_TROUBLED state! */
-                if (rd->rd_phase != RECOVD_FAILED &&
-                    !(rd->rd_phase == RECOVD_IDLE &&
-                      rd->rd_next_phase == RECOVD_PREPARING) &&
+                if (rd->rd_phase != RD_FAILED &&
                      rd->rd_phase != rd->rd_next_phase)
                          continue;
  
                  switch (rd->rd_phase) {
-                    case RECOVD_FAILED:
+                    case RD_FAILED:
                  cb_failed: /* must always reach here with recovd_lock held! */
                          CERROR("recovery FAILED for rd %p (conn %p): %d\n",
                                 rd, class_rd2conn(rd), rc);
@@ -162,7 +164,7 @@ static int recovd_handle_event(struct recovd_obd *recovd)
                          spin_lock(&recovd->recovd_lock);
                          break;
                          
-                    case RECOVD_IDLE:
+                    case RD_TROUBLED:
                          if (!rd->rd_recover) {
                                  CERROR("no rd_recover for rd %p (conn %p)\n",
                                         rd, class_rd2conn(rd));
@@ -171,7 +173,7 @@ static int recovd_handle_event(struct recovd_obd *recovd)
                          }
                          CERROR("starting recovery for rd %p (conn %p)\n",
                                 rd, class_rd2conn(rd));
-                        rd->rd_phase = RECOVD_PREPARING;
+                        rd->rd_phase = RD_PREPARING;
                          
                          spin_unlock(&recovd->recovd_lock);
                          rc = rd->rd_recover(rd, PTLRPC_RECOVD_PHASE_PREPARE);
@@ -179,11 +181,11 @@ static int recovd_handle_event(struct recovd_obd *recovd)
                          if (rc)
                                  goto cb_failed;
                          
-                        rd->rd_next_phase = RECOVD_PREPARED;
+                        rd->rd_next_phase = RD_PREPARED;
                          break;
                          
-                    case RECOVD_PREPARED:
-                        rd->rd_phase = RECOVD_RECOVERING;
+                    case RD_PREPARED:
+                        rd->rd_phase = RD_RECOVERING;
                          
                          CERROR("recovery prepared for rd %p (conn %p)\n",
                                 rd, class_rd2conn(rd));
@@ -194,12 +196,12 @@ static int recovd_handle_event(struct recovd_obd *recovd)
                          if (rc)
                                  goto cb_failed;
                          
-                        rd->rd_next_phase = RECOVD_RECOVERED;
+                        rd->rd_next_phase = RD_RECOVERED;
                          break;
                          
-                    case RECOVD_RECOVERED:
-                        rd->rd_phase = RECOVD_IDLE;
-                        rd->rd_next_phase = RECOVD_PREPARING;
+                    case RD_RECOVERED:
+                        rd->rd_phase = RD_IDLE;
+                        rd->rd_next_phase = RD_TROUBLED;
                          
                          CERROR("recovery complete for rd %p (conn %p)\n",
                                 rd, class_rd2conn(rd));
diff --git a/lustre/ptlrpc/recover.c b/lustre/ptlrpc/recover.c

index 0e44c86..e062bcc 100644 (file)
--- a/lustre/ptlrpc/recover.c
+++ b/lustre/ptlrpc/recover.c
@@ -80,6 +80,7 @@ static int ll_recover_upcall(struct ptlrpc_connection *conn)
  {
          char *argv[3];
          char *envp[3];
+        int rc;
  
          ENTRY;
          conn->c_level = LUSTRE_CONN_RECOVD;
@@ -92,7 +93,13 @@ static int ll_recover_upcall(struct ptlrpc_connection *conn)
          envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
          envp[2] = NULL;
  
-        RETURN(call_usermodehelper(argv[0], argv, envp));
+        rc = call_usermodehelper(argv[0], argv, envp);
+        if (rc < 0) {
+                CERROR("Error invoking recovery upcall (%s): %d\n",
+                       obd_recovery_upcall, rc);
+                CERROR("Check /proc/sys/lustre/recovery_upcall?\n");
+        }
+        RETURN(rc);
  }
  
  static int ll_recover_reconnect(struct ptlrpc_connection *conn)
@@ -193,8 +200,13 @@ static int ll_recover_reconnect(struct ptlrpc_connection *conn)
  
  static int ll_retry_recovery(struct ptlrpc_connection *conn)
  {
+#if 0
          /* XXX use a timer, sideshow bob */
          recovd_conn_fail(conn);
+        /* XXX this is disabled until I fix it so that we don't just keep
+         * XXX retrying in the case of a missing upcall.
+         */
+#endif
          return 0;
  }
  
diff --git a/lustre/ptlrpc/rpc.c b/lustre/ptlrpc/rpc.c

index 742e460..6ef7283 100644 (file)
--- a/lustre/ptlrpc/rpc.c
+++ b/lustre/ptlrpc/rpc.c
@@ -95,7 +95,7 @@ int connmgr_iocontrol(long cmd, struct lustre_handle *hdl, int len, void *karg,
          if (!conn)
                  GOTO(out, rc = -EINVAL);
  
-        if (conn->c_recovd_data.rd_phase != RECOVD_PREPARING)
+        if (conn->c_recovd_data.rd_phase != RD_PREPARING)
                  GOTO(out, rc = -EALREADY);
  
          spin_lock(&conn->c_lock);
@@ -110,7 +110,7 @@ int connmgr_iocontrol(long cmd, struct lustre_handle *hdl, int len, void *karg,
          ptlrpc_readdress_connection(conn, conn->c_remote_uuid);
          spin_unlock(&conn->c_lock);
          
-        conn->c_recovd_data.rd_phase = RECOVD_PREPARED;
+        conn->c_recovd_data.rd_phase = RD_PREPARED;
          wake_up(&recovd->recovd_waitq);
   out:
          spin_unlock(&recovd->recovd_lock);
diff --git a/lustre/tests/.cvsignore b/lustre/tests/.cvsignore

index d54e0d6..1acdc7b 100644 (file)
--- a/lustre/tests/.cvsignore
+++ b/lustre/tests/.cvsignore
@@ -18,3 +18,5 @@ tchmod
  toexcl
  fsx
  test_brw
+newfile
+openclose
author	shaver <shaver>
	Tue, 1 Oct 2002 18:55:37 +0000 (18:55 +0000)
committer	shaver <shaver>
	Tue, 1 Oct 2002 18:55:37 +0000 (18:55 +0000)
lustre/include/linux/lustre_ha.h		patch \| blob \| history
lustre/ldlm/ldlm_request.c		patch \| blob \| history
lustre/lib/l_net.c		patch \| blob \| history
lustre/llite/super.c		patch \| blob \| history
lustre/ptlrpc/recovd.c		patch \| blob \| history
lustre/ptlrpc/recover.c		patch \| blob \| history
lustre/ptlrpc/rpc.c		patch \| blob \| history
lustre/tests/.cvsignore		patch \| blob \| history