- Add D_HA for recovery diagnostics, and use it in a handful of places.

author shaver <shaver>

Thu, 10 Oct 2002 21:54:57 +0000 (21:54 +0000)

committer shaver <shaver>

Thu, 10 Oct 2002 21:54:57 +0000 (21:54 +0000)
author shaver <shaver>
Thu, 10 Oct 2002 21:54:57 +0000 (21:54 +0000)
committer shaver <shaver>
Thu, 10 Oct 2002 21:54:57 +0000 (21:54 +0000)
diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c

index 45e4def..093640c 100644 (file)
--- a/lustre/mdc/mdc_request.c
+++ b/lustre/mdc/mdc_request.c
@@ -373,6 +373,8 @@ static void mdc_replay_open(struct ptlrpc_request *req,
          struct mds_body *body = lustre_msg_buf(req->rq_repmsg, 0);
  
          mds_unpack_body(body);
+        CDEBUG(D_HA, "updating from "LPD64"/"LPD64" to "LPD64"/"LPD64"\n",
+               data->addr, data->cookie, body->handle.addr, body->handle.cookie);
          memcpy(data, &body->handle, sizeof(*data));
  }
  
diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c

index 0f4f30b..3d40061 100644 (file)
--- a/lustre/mds/handler.c
+++ b/lustre/mds/handler.c
@@ -1228,7 +1228,7 @@ err_kfree:
          kfree(mds->mds_fstype);
  err_dec:
          MOD_DEC_USE_COUNT;
-        return rc;
+        RETURN(rc);
  }
  
  static int mds_cleanup(struct obd_device *obddev)
diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c

index 4ad83b0..dfd42c5 100644 (file)
--- a/lustre/ptlrpc/client.c
+++ b/lustre/ptlrpc/client.c
@@ -376,6 +376,7 @@ restart1:
                  req = list_entry(tmp, struct ptlrpc_request, rq_list);
                  if (req->rq_import != imp)
                          continue;
+                /* XXX we should make sure that nobody's sleeping on these! */
                  CDEBUG(D_INFO, "Cleaning req %p from sending list.\n", req);
                  list_del_init(&req->rq_list);
                  req->rq_import = NULL;
@@ -405,7 +406,7 @@ restart2:
  void ptlrpc_continue_req(struct ptlrpc_request *req)
  {
          ENTRY;
-        CDEBUG(D_INODE, "continue delayed request "LPD64" opc %d\n", 
+        CDEBUG(D_HA, "continue delayed request "LPD64" opc %d\n", 
                 req->rq_xid, req->rq_reqmsg->opc); 
          req->rq_reqmsg->addr = req->rq_import->imp_handle.addr;
          req->rq_reqmsg->cookie = req->rq_import->imp_handle.cookie;
@@ -416,7 +417,7 @@ void ptlrpc_continue_req(struct ptlrpc_request *req)
  void ptlrpc_resend_req(struct ptlrpc_request *req)
  {
          ENTRY;
-        CDEBUG(D_INODE, "resend request "LPD64", opc %d\n", 
+        CDEBUG(D_HA, "resend request "LPD64", opc %d\n", 
                 req->rq_xid, req->rq_reqmsg->opc);
          req->rq_reqmsg->addr = req->rq_import->imp_handle.addr;
          req->rq_reqmsg->cookie = req->rq_import->imp_handle.cookie;
@@ -431,7 +432,7 @@ void ptlrpc_resend_req(struct ptlrpc_request *req)
  void ptlrpc_restart_req(struct ptlrpc_request *req)
  {
          ENTRY;
-        CDEBUG(D_INODE, "restart completed request "LPD64", opc %d\n", 
+        CDEBUG(D_HA, "restart completed request "LPD64", opc %d\n", 
                 req->rq_xid, req->rq_reqmsg->opc);
          req->rq_status = -ERESTARTSYS;
          req->rq_flags |= PTL_RPC_FL_RECOVERY;
@@ -445,7 +446,7 @@ static int expired_request(void *data)
          struct ptlrpc_request *req = data;
  
          ENTRY;
-        CERROR("req xid "LPD64" op %d: timeout on conn to %s:%d\n",
+        CDEBUG(D_HA, "req xid "LPD64" op %d: timeout on conn to %s:%d\n",
                 (unsigned long long)req->rq_xid, req->rq_reqmsg->opc,
                 req->rq_connection->c_remote_uuid,
                 req->rq_import->imp_client->cli_request_portal);
@@ -503,8 +504,8 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req)
                  list_add_tail(&req->rq_list, &conn->c_delayed_head);
                  spin_unlock(&conn->c_lock);
  
-                CERROR("req xid "LPD64" op %d to %s:%d: waiting for recovery "
-                       "(%d < %d)\n",
+                CDEBUG(D_HA, "req xid "LPD64" op %d to %s:%d: waiting for "
+                       "recovery (%d < %d)\n",
                         (unsigned long long)req->rq_xid, req->rq_reqmsg->opc,
                         req->rq_connection->c_remote_uuid,
                         req->rq_import->imp_client->cli_request_portal,
@@ -525,7 +526,7 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req)
                  if (rc)
                          RETURN(rc);
                  
-                CERROR("process %d resumed\n", current->pid);
+                CDEBUG(D_HA, "process %d resumed\n", current->pid);
          }
   resend:
          req->rq_timeout = obd_timeout;
@@ -544,23 +545,31 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req)
          spin_unlock(&conn->c_lock);
          rc = ptl_send_rpc(req);
          if (rc) {
-                CERROR("error %d, opcode %d, need recovery\n", rc,
+                CDEBUG(D_HA, "error %d, opcode %d, need recovery\n", rc,
                         req->rq_reqmsg->opc);
                  /* the sleep below will time out, triggering recovery */
          }
  
-        CDEBUG(D_OTHER, "-- sleeping on xid "LPD64"\n", req->rq_xid);
+        CDEBUG(D_NET, "-- sleeping on req xid "LPD64" op %d to %s:%d\n",
+                       (unsigned long long)req->rq_xid, req->rq_reqmsg->opc,
+                       req->rq_connection->c_remote_uuid,
+                       req->rq_import->imp_client->cli_request_portal);
          lwi = LWI_TIMEOUT_INTR(req->rq_timeout * HZ, expired_request,
                                 interrupted_request,req);
          l_wait_event(req->rq_wait_for_rep, ptlrpc_check_reply(req), &lwi);
-        CDEBUG(D_OTHER, "-- done sleeping on xid "LPD64"\n", req->rq_xid);
+        CDEBUG(D_NET, "-- done sleeping on req xid "LPD64" op %d to %s:%d\n",
+                       (unsigned long long)req->rq_xid, req->rq_reqmsg->opc,
+                       req->rq_connection->c_remote_uuid,
+                       req->rq_import->imp_client->cli_request_portal);
  
          /* Don't resend if we were interrupted. */
          if ((req->rq_flags & (PTL_RPC_FL_RESEND | PTL_RPC_FL_INTR)) ==
              PTL_RPC_FL_RESEND) {
                  req->rq_flags &= ~PTL_RPC_FL_RESEND;
-                CDEBUG(D_OTHER, "resending req %p xid "LPD64"\n",
-                       req, req->rq_xid);
+                CDEBUG(D_HA, "req xid "LPD64" op %d to %s:%d\n",
+                       (unsigned long long)req->rq_xid, req->rq_reqmsg->opc,
+                       req->rq_connection->c_remote_uuid,
+                       req->rq_import->imp_client->cli_request_portal);
                  /* we'll get sent again, so balance 2nd request_out_callback */
                  atomic_inc(&req->rq_refcount);
                  goto resend;
diff --git a/lustre/ptlrpc/recovd.c b/lustre/ptlrpc/recovd.c

index 5699f75..3a5806a 100644 (file)
--- a/lustre/ptlrpc/recovd.c
+++ b/lustre/ptlrpc/recovd.c
@@ -62,7 +62,7 @@ void recovd_conn_fail(struct ptlrpc_connection *conn)
                  return;
          }
                  
-        CERROR("connection %p to %s failed\n", conn, conn->c_remote_uuid);
+        CDEBUG(D_HA, "connection %p to %s failed\n", conn, conn->c_remote_uuid);
          list_del(&rd->rd_managed_chain);
          list_add_tail(&rd->rd_managed_chain, &recovd->recovd_troubled_items);
          rd->rd_phase = RD_TROUBLED;
@@ -78,6 +78,8 @@ void recovd_conn_fixed(struct ptlrpc_connection *conn)
          struct recovd_data *rd = &conn->c_recovd_data;
          ENTRY;
  
+        CDEBUG(D_HA, "connection %p (now to %s) fixed\n",
+               conn, conn->c_remote_uuid);
          spin_lock(&rd->rd_recovd->recovd_lock);
          list_del(&rd->rd_managed_chain);
          rd->rd_phase = RD_IDLE;
@@ -124,7 +126,7 @@ static void dump_connection_list(struct list_head *head)
                  struct ptlrpc_connection *conn =
                          list_entry(tmp, struct ptlrpc_connection,
                                     c_recovd_data.rd_managed_chain);
-                CERROR("   %p = %s (%d/%d)\n", conn, conn->c_remote_uuid,
+                CDEBUG(D_HA, "   %p = %s (%d/%d)\n", conn, conn->c_remote_uuid,
                         conn->c_recovd_data.rd_phase,
                         conn->c_recovd_data.rd_next_phase);
          }
@@ -138,9 +140,9 @@ static int recovd_handle_event(struct recovd_obd *recovd)
  
          spin_lock(&recovd->recovd_lock);
  
-        CERROR("managed: \n");
+        CDEBUG(D_HA, "managed: \n");
          dump_connection_list(&recovd->recovd_managed_items);
-        CERROR("troubled: \n");
+        CDEBUG(D_HA, "troubled: \n");
          dump_connection_list(&recovd->recovd_troubled_items);
  
          /*
@@ -173,7 +175,7 @@ static int recovd_handle_event(struct recovd_obd *recovd)
                                  rc = -EINVAL;
                                  break;
                          }
-                        CERROR("starting recovery for rd %p (conn %p)\n",
+                        CDEBUG(D_HA, "starting recovery for rd %p (conn %p)\n",
                                 rd, class_rd2conn(rd));
                          rd->rd_phase = RD_PREPARING;
                          
@@ -249,7 +251,7 @@ static int recovd_main(void *arg)
          recovd->recovd_thread = NULL;
          recovd->recovd_state = RECOVD_STOPPED;
          wake_up(&recovd->recovd_ctl_waitq);
-        CDEBUG(D_NET, "mgr exiting process %d\n", current->pid);
+        CDEBUG(D_HA, "mgr exiting process %d\n", current->pid);
          RETURN(0);
  }
  
@@ -284,6 +286,7 @@ int recovd_setup(struct recovd_obd *recovd)
  
  int recovd_cleanup(struct recovd_obd *recovd)
  {
+        ENTRY;
          spin_lock(&recovd->recovd_lock);
          recovd->recovd_state = RECOVD_STOPPING;
          wake_up(&recovd->recovd_waitq);
diff --git a/lustre/ptlrpc/recover.c b/lustre/ptlrpc/recover.c

index e17ae79..4717c2a 100644 (file)
--- a/lustre/ptlrpc/recover.c
+++ b/lustre/ptlrpc/recover.c
@@ -64,7 +64,7 @@ int ll_reconnect(struct ptlrpc_connection *conn)
                  old_hdl = imp->imp_handle;
                  imp->imp_handle.addr = request->rq_repmsg->addr;
                  imp->imp_handle.cookie = request->rq_repmsg->cookie;
-                CERROR("reconnected to %s@%s (%Lx/%Lx, was %Lx/%Lx)!\n",
+                CDEBUG(D_HA, "reconnected to %s@%s (%Lx/%Lx, was %Lx/%Lx)!\n",
                         cli->cl_target_uuid, conn->c_remote_uuid,
                         imp->imp_handle.addr, imp->imp_handle.cookie,
                         old_hdl.addr, old_hdl.cookie);
@@ -95,11 +95,15 @@ static int ll_recover_upcall(struct ptlrpc_connection *conn)
  
          rc = call_usermodehelper(argv[0], argv, envp);
          if (rc < 0) {
+                /*
+                 * Tragically, this will never be run, because call_umh doesn't
+                 * report errors like -ENOENT to its caller.
+                 */
                  CERROR("Error invoking recovery upcall (%s): %d\n",
                         obd_recovery_upcall, rc);
                  CERROR("Check /proc/sys/lustre/recovery_upcall?\n");
          } else {
-                CERROR("Invoked upcall %s for connection %s\n",
+                CDEBUG(D_HA, "Invoked upcall %s for connection %s\n",
                         argv[0], argv[1]);
          }
          RETURN(rc);
@@ -120,13 +124,17 @@ static int ll_recover_reconnect(struct ptlrpc_connection *conn)
          /* 2. walk the request list */
          spin_lock(&conn->c_lock);
  
+        CDEBUG(D_HA, "connection %p to %s has last_xid "LPD64"\n",
+               conn, conn->c_remote_uuid, conn->c_last_xid);
+
          list_for_each_safe(tmp, pos, &conn->c_sending_head) { 
                  req = list_entry(tmp, struct ptlrpc_request, rq_list);
                  
                  /* replay what needs to be replayed */
                  if (req->rq_flags & PTL_RPC_FL_REPLAY) {
-                        CDEBUG(D_NET, "req %Ld needs replay [last rcvd %Ld]\n",
-                               req->rq_xid, conn->c_last_xid);
+                        CDEBUG(D_HA, "FL_REPLAY: xid "LPD64" op %d @ %d\n",
+                               req->rq_xid, req->rq_reqmsg->opc,
+                               req->rq_import->imp_client->cli_request_portal);
                          rc = ptlrpc_replay_req(req);
  #if 0
  #error We should not hold a spinlock over such a lengthy operation.
@@ -145,17 +153,18 @@ static int ll_recover_reconnect(struct ptlrpc_connection *conn)
                  /* server has seen req, we have reply: skip */
                  if ((req->rq_flags & PTL_RPC_FL_REPLIED)  &&
                      req->rq_xid <= conn->c_last_xid) { 
-                        CDEBUG(D_NET,
-                               "req %Ld was complete: skip [last rcvd %Ld]\n", 
-                               req->rq_xid, conn->c_last_xid);
+                        CDEBUG(D_HA, "REPLIED SKIP: xid "LPD64" op %d @ %d\n",
+                               req->rq_xid, req->rq_reqmsg->opc,
+                               req->rq_import->imp_client->cli_request_portal);
                          continue;
                  }
  
                  /* server has lost req, we have reply: resend, ign reply */
                  if ((req->rq_flags & PTL_RPC_FL_REPLIED)  &&
                      req->rq_xid > conn->c_last_xid) { 
-                        CDEBUG(D_NET, "lost req %Ld have rep: replay [last "
-                               "rcvd %Ld]\n", req->rq_xid, conn->c_last_xid);
+                        CDEBUG(D_HA, "REPLIED RESEND: xid "LPD64" op %d @ %d\n",
+                               req->rq_xid, req->rq_reqmsg->opc,
+                               req->rq_import->imp_client->cli_request_portal);
                          rc = ptlrpc_replay_req(req); 
                          if (rc) {
                                  CERROR("request resend error %d for req %Ld\n", 
@@ -167,18 +176,18 @@ static int ll_recover_reconnect(struct ptlrpc_connection *conn)
                  /* server has seen req, we have lost reply: -ERESTARTSYS */
                  if ( !(req->rq_flags & PTL_RPC_FL_REPLIED)  &&
                       req->rq_xid <= conn->c_last_xid) { 
-                        CDEBUG(D_NET, "lost rep %Ld srv did req: restart "
-                               "[last rcvd %Ld]\n", 
-                               req->rq_xid, conn->c_last_xid);
+                        CDEBUG(D_HA, "RESTARTSYS: xid "LPD64" op %d @ %d\n",
+                               req->rq_xid, req->rq_reqmsg->opc,
+                               req->rq_import->imp_client->cli_request_portal);
                          ptlrpc_restart_req(req);
                  }
  
                  /* service has not seen req, no reply: resend */
                  if ( !(req->rq_flags & PTL_RPC_FL_REPLIED)  &&
                       req->rq_xid > conn->c_last_xid) {
-                        CDEBUG(D_NET,
-                               "lost rep/req %Ld: resend [last rcvd %Ld]\n", 
-                               req->rq_xid, conn->c_last_xid);
+                        CDEBUG(D_HA, "RESEND: xid "LPD64" op %d @ %d\n",
+                               req->rq_xid, req->rq_reqmsg->opc,
+                               req->rq_import->imp_client->cli_request_portal);
                          ptlrpc_resend_req(req);
                  }
  
@@ -203,7 +212,7 @@ static int ll_recover_reconnect(struct ptlrpc_connection *conn)
  
  static int ll_retry_recovery(struct ptlrpc_connection *conn)
  {
-        CERROR("Recovery has failed on conn %p\n", conn);
+        CDEBUG(D_HA, "Recovery has failed on conn %p\n", conn);
  #if 0
          /* XXX use a timer, sideshow bob */
          recovd_conn_fail(conn);
author	shaver <shaver>
	Thu, 10 Oct 2002 21:54:57 +0000 (21:54 +0000)
committer	shaver <shaver>
	Thu, 10 Oct 2002 21:54:57 +0000 (21:54 +0000)
lustre/mdc/mdc_request.c		patch \| blob \| history
lustre/mds/handler.c		patch \| blob \| history
lustre/ptlrpc/client.c		patch \| blob \| history
lustre/ptlrpc/recovd.c		patch \| blob \| history
lustre/ptlrpc/recover.c		patch \| blob \| history