From 3e43689d30725a828384be5630e42d612c5531b2 Mon Sep 17 00:00:00 2001
From: shaver <shaver>
Date: Thu, 10 Oct 2002 21:54:57 +0000
Subject: [PATCH] - Add D_HA for recovery diagnostics, and use it in a handful
 of places. - More information about replayed/sleeping/etc. requests. -
 Instrument FH rewriting during open replay.

---
 lustre/mdc/mdc_request.c |  2 ++
 lustre/mds/handler.c     |  2 +-
 lustre/ptlrpc/client.c   | 33 +++++++++++++++++++++------------
 lustre/ptlrpc/recovd.c   | 15 +++++++++------
 lustre/ptlrpc/recover.c  | 41 +++++++++++++++++++++++++----------------
 5 files changed, 58 insertions(+), 35 deletions(-)
diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c
index 45e4def..093640c 100644
--- a/lustre/mdc/mdc_request.c
+++ b/lustre/mdc/mdc_request.c
@@ -373,6 +373,8 @@ static void mdc_replay_open(struct ptlrpc_request *req,
         struct mds_body *body = lustre_msg_buf(req->rq_repmsg, 0);
 
         mds_unpack_body(body);
+        CDEBUG(D_HA, "updating from "LPD64"/"LPD64" to "LPD64"/"LPD64"\n",
+               data->addr, data->cookie, body->handle.addr, body->handle.cookie);
         memcpy(data, &body->handle, sizeof(*data));
 }
 
diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c
index 0f4f30b..3d40061 100644
--- a/lustre/mds/handler.c
+++ b/lustre/mds/handler.c
@@ -1228,7 +1228,7 @@ err_kfree:
         kfree(mds->mds_fstype);
 err_dec:
         MOD_DEC_USE_COUNT;
-        return rc;
+        RETURN(rc);
 }
 
 static int mds_cleanup(struct obd_device *obddev)
diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c
index 4ad83b0..dfd42c5 100644
--- a/lustre/ptlrpc/client.c
+++ b/lustre/ptlrpc/client.c
@@ -376,6 +376,7 @@ restart1:
                 req = list_entry(tmp, struct ptlrpc_request, rq_list);
                 if (req->rq_import != imp)
                         continue;
+                /* XXX we should make sure that nobody's sleeping on these! */
                 CDEBUG(D_INFO, "Cleaning req %p from sending list.\n", req);
                 list_del_init(&req->rq_list);
                 req->rq_import = NULL;
@@ -405,7 +406,7 @@ restart2:
 void ptlrpc_continue_req(struct ptlrpc_request *req)
 {
         ENTRY;
-        CDEBUG(D_INODE, "continue delayed request "LPD64" opc %d\n", 
+        CDEBUG(D_HA, "continue delayed request "LPD64" opc %d\n", 
                req->rq_xid, req->rq_reqmsg->opc); 
         req->rq_reqmsg->addr = req->rq_import->imp_handle.addr;
         req->rq_reqmsg->cookie = req->rq_import->imp_handle.cookie;
@@ -416,7 +417,7 @@ void ptlrpc_continue_req(struct ptlrpc_request *req)
 void ptlrpc_resend_req(struct ptlrpc_request *req)
 {
         ENTRY;
-        CDEBUG(D_INODE, "resend request "LPD64", opc %d\n", 
+        CDEBUG(D_HA, "resend request "LPD64", opc %d\n", 
                req->rq_xid, req->rq_reqmsg->opc);
         req->rq_reqmsg->addr = req->rq_import->imp_handle.addr;
         req->rq_reqmsg->cookie = req->rq_import->imp_handle.cookie;
@@ -431,7 +432,7 @@ void ptlrpc_resend_req(struct ptlrpc_request *req)
 void ptlrpc_restart_req(struct ptlrpc_request *req)
 {
         ENTRY;
-        CDEBUG(D_INODE, "restart completed request "LPD64", opc %d\n", 
+        CDEBUG(D_HA, "restart completed request "LPD64", opc %d\n", 
                req->rq_xid, req->rq_reqmsg->opc);
         req->rq_status = -ERESTARTSYS;
         req->rq_flags |= PTL_RPC_FL_RECOVERY;
@@ -445,7 +446,7 @@ static int expired_request(void *data)
         struct ptlrpc_request *req = data;
 
         ENTRY;
-        CERROR("req xid "LPD64" op %d: timeout on conn to %s:%d\n",
+        CDEBUG(D_HA, "req xid "LPD64" op %d: timeout on conn to %s:%d\n",
                (unsigned long long)req->rq_xid, req->rq_reqmsg->opc,
                req->rq_connection->c_remote_uuid,
                req->rq_import->imp_client->cli_request_portal);
@@ -503,8 +504,8 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req)
                 list_add_tail(&req->rq_list, &conn->c_delayed_head);
                 spin_unlock(&conn->c_lock);
 
-                CERROR("req xid "LPD64" op %d to %s:%d: waiting for recovery "
-                       "(%d < %d)\n",
+                CDEBUG(D_HA, "req xid "LPD64" op %d to %s:%d: waiting for "
+                       "recovery (%d < %d)\n",
                        (unsigned long long)req->rq_xid, req->rq_reqmsg->opc,
                        req->rq_connection->c_remote_uuid,
                        req->rq_import->imp_client->cli_request_portal,
@@ -525,7 +526,7 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req)
                 if (rc)
                         RETURN(rc);
                 
-                CERROR("process %d resumed\n", current->pid);
+                CDEBUG(D_HA, "process %d resumed\n", current->pid);
         }
  resend:
         req->rq_timeout = obd_timeout;
@@ -544,23 +545,31 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req)
         spin_unlock(&conn->c_lock);
         rc = ptl_send_rpc(req);
         if (rc) {
-                CERROR("error %d, opcode %d, need recovery\n", rc,
+                CDEBUG(D_HA, "error %d, opcode %d, need recovery\n", rc,
                        req->rq_reqmsg->opc);
                 /* the sleep below will time out, triggering recovery */
         }
 
-        CDEBUG(D_OTHER, "-- sleeping on xid "LPD64"\n", req->rq_xid);
+        CDEBUG(D_NET, "-- sleeping on req xid "LPD64" op %d to %s:%d\n",
+                       (unsigned long long)req->rq_xid, req->rq_reqmsg->opc,
+                       req->rq_connection->c_remote_uuid,
+                       req->rq_import->imp_client->cli_request_portal);
         lwi = LWI_TIMEOUT_INTR(req->rq_timeout * HZ, expired_request,
                                interrupted_request,req);
         l_wait_event(req->rq_wait_for_rep, ptlrpc_check_reply(req), &lwi);
-        CDEBUG(D_OTHER, "-- done sleeping on xid "LPD64"\n", req->rq_xid);
+        CDEBUG(D_NET, "-- done sleeping on req xid "LPD64" op %d to %s:%d\n",
+                       (unsigned long long)req->rq_xid, req->rq_reqmsg->opc,
+                       req->rq_connection->c_remote_uuid,
+                       req->rq_import->imp_client->cli_request_portal);
 
         /* Don't resend if we were interrupted. */
         if ((req->rq_flags & (PTL_RPC_FL_RESEND | PTL_RPC_FL_INTR)) ==
             PTL_RPC_FL_RESEND) {
                 req->rq_flags &= ~PTL_RPC_FL_RESEND;
-                CDEBUG(D_OTHER, "resending req %p xid "LPD64"\n",
-                       req, req->rq_xid);
+                CDEBUG(D_HA, "req xid "LPD64" op %d to %s:%d\n",
+                       (unsigned long long)req->rq_xid, req->rq_reqmsg->opc,
+                       req->rq_connection->c_remote_uuid,
+                       req->rq_import->imp_client->cli_request_portal);
                 /* we'll get sent again, so balance 2nd request_out_callback */
                 atomic_inc(&req->rq_refcount);
                 goto resend;
diff --git a/lustre/ptlrpc/recovd.c b/lustre/ptlrpc/recovd.c
index 5699f75..3a5806a 100644
--- a/lustre/ptlrpc/recovd.c
+++ b/lustre/ptlrpc/recovd.c
@@ -62,7 +62,7 @@ void recovd_conn_fail(struct ptlrpc_connection *conn)
                 return;
         }
                 
-        CERROR("connection %p to %s failed\n", conn, conn->c_remote_uuid);
+        CDEBUG(D_HA, "connection %p to %s failed\n", conn, conn->c_remote_uuid);
         list_del(&rd->rd_managed_chain);
         list_add_tail(&rd->rd_managed_chain, &recovd->recovd_troubled_items);
         rd->rd_phase = RD_TROUBLED;
@@ -78,6 +78,8 @@ void recovd_conn_fixed(struct ptlrpc_connection *conn)
         struct recovd_data *rd = &conn->c_recovd_data;
         ENTRY;
 
+        CDEBUG(D_HA, "connection %p (now to %s) fixed\n",
+               conn, conn->c_remote_uuid);
         spin_lock(&rd->rd_recovd->recovd_lock);
         list_del(&rd->rd_managed_chain);
         rd->rd_phase = RD_IDLE;
@@ -124,7 +126,7 @@ static void dump_connection_list(struct list_head *head)
                 struct ptlrpc_connection *conn =
                         list_entry(tmp, struct ptlrpc_connection,
                                    c_recovd_data.rd_managed_chain);
-                CERROR("   %p = %s (%d/%d)\n", conn, conn->c_remote_uuid,
+                CDEBUG(D_HA, "   %p = %s (%d/%d)\n", conn, conn->c_remote_uuid,
                        conn->c_recovd_data.rd_phase,
                        conn->c_recovd_data.rd_next_phase);
         }
@@ -138,9 +140,9 @@ static int recovd_handle_event(struct recovd_obd *recovd)
 
         spin_lock(&recovd->recovd_lock);
 
-        CERROR("managed: \n");
+        CDEBUG(D_HA, "managed: \n");
         dump_connection_list(&recovd->recovd_managed_items);
-        CERROR("troubled: \n");
+        CDEBUG(D_HA, "troubled: \n");
         dump_connection_list(&recovd->recovd_troubled_items);
 
         /*
@@ -173,7 +175,7 @@ static int recovd_handle_event(struct recovd_obd *recovd)
                                 rc = -EINVAL;
                                 break;
                         }
-                        CERROR("starting recovery for rd %p (conn %p)\n",
+                        CDEBUG(D_HA, "starting recovery for rd %p (conn %p)\n",
                                rd, class_rd2conn(rd));
                         rd->rd_phase = RD_PREPARING;
                         
@@ -249,7 +251,7 @@ static int recovd_main(void *arg)
         recovd->recovd_thread = NULL;
         recovd->recovd_state = RECOVD_STOPPED;
         wake_up(&recovd->recovd_ctl_waitq);
-        CDEBUG(D_NET, "mgr exiting process %d\n", current->pid);
+        CDEBUG(D_HA, "mgr exiting process %d\n", current->pid);
         RETURN(0);
 }
 
@@ -284,6 +286,7 @@ int recovd_setup(struct recovd_obd *recovd)
 
 int recovd_cleanup(struct recovd_obd *recovd)
 {
+        ENTRY;
         spin_lock(&recovd->recovd_lock);
         recovd->recovd_state = RECOVD_STOPPING;
         wake_up(&recovd->recovd_waitq);
diff --git a/lustre/ptlrpc/recover.c b/lustre/ptlrpc/recover.c
index e17ae79..4717c2a 100644
--- a/lustre/ptlrpc/recover.c
+++ b/lustre/ptlrpc/recover.c
@@ -64,7 +64,7 @@ int ll_reconnect(struct ptlrpc_connection *conn)
                 old_hdl = imp->imp_handle;
                 imp->imp_handle.addr = request->rq_repmsg->addr;
                 imp->imp_handle.cookie = request->rq_repmsg->cookie;
-                CERROR("reconnected to %s@%s (%Lx/%Lx, was %Lx/%Lx)!\n",
+                CDEBUG(D_HA, "reconnected to %s@%s (%Lx/%Lx, was %Lx/%Lx)!\n",
                        cli->cl_target_uuid, conn->c_remote_uuid,
                        imp->imp_handle.addr, imp->imp_handle.cookie,
                        old_hdl.addr, old_hdl.cookie);
@@ -95,11 +95,15 @@ static int ll_recover_upcall(struct ptlrpc_connection *conn)
 
         rc = call_usermodehelper(argv[0], argv, envp);
         if (rc < 0) {
+                /*
+                 * Tragically, this will never be run, because call_umh doesn't
+                 * report errors like -ENOENT to its caller.
+                 */
                 CERROR("Error invoking recovery upcall (%s): %d\n",
                        obd_recovery_upcall, rc);
                 CERROR("Check /proc/sys/lustre/recovery_upcall?\n");
         } else {
-                CERROR("Invoked upcall %s for connection %s\n",
+                CDEBUG(D_HA, "Invoked upcall %s for connection %s\n",
                        argv[0], argv[1]);
         }
         RETURN(rc);
@@ -120,13 +124,17 @@ static int ll_recover_reconnect(struct ptlrpc_connection *conn)
         /* 2. walk the request list */
         spin_lock(&conn->c_lock);
 
+        CDEBUG(D_HA, "connection %p to %s has last_xid "LPD64"\n",
+               conn, conn->c_remote_uuid, conn->c_last_xid);
+
         list_for_each_safe(tmp, pos, &conn->c_sending_head) { 
                 req = list_entry(tmp, struct ptlrpc_request, rq_list);
                 
                 /* replay what needs to be replayed */
                 if (req->rq_flags & PTL_RPC_FL_REPLAY) {
-                        CDEBUG(D_NET, "req %Ld needs replay [last rcvd %Ld]\n",
-                               req->rq_xid, conn->c_last_xid);
+                        CDEBUG(D_HA, "FL_REPLAY: xid "LPD64" op %d @ %d\n",
+                               req->rq_xid, req->rq_reqmsg->opc,
+                               req->rq_import->imp_client->cli_request_portal);
                         rc = ptlrpc_replay_req(req);
 #if 0
 #error We should not hold a spinlock over such a lengthy operation.
@@ -145,17 +153,18 @@ static int ll_recover_reconnect(struct ptlrpc_connection *conn)
                 /* server has seen req, we have reply: skip */
                 if ((req->rq_flags & PTL_RPC_FL_REPLIED)  &&
                     req->rq_xid <= conn->c_last_xid) { 
-                        CDEBUG(D_NET,
-                               "req %Ld was complete: skip [last rcvd %Ld]\n", 
-                               req->rq_xid, conn->c_last_xid);
+                        CDEBUG(D_HA, "REPLIED SKIP: xid "LPD64" op %d @ %d\n",
+                               req->rq_xid, req->rq_reqmsg->opc,
+                               req->rq_import->imp_client->cli_request_portal);
                         continue;
                 }
 
                 /* server has lost req, we have reply: resend, ign reply */
                 if ((req->rq_flags & PTL_RPC_FL_REPLIED)  &&
                     req->rq_xid > conn->c_last_xid) { 
-                        CDEBUG(D_NET, "lost req %Ld have rep: replay [last "
-                               "rcvd %Ld]\n", req->rq_xid, conn->c_last_xid);
+                        CDEBUG(D_HA, "REPLIED RESEND: xid "LPD64" op %d @ %d\n",
+                               req->rq_xid, req->rq_reqmsg->opc,
+                               req->rq_import->imp_client->cli_request_portal);
                         rc = ptlrpc_replay_req(req); 
                         if (rc) {
                                 CERROR("request resend error %d for req %Ld\n", 
@@ -167,18 +176,18 @@ static int ll_recover_reconnect(struct ptlrpc_connection *conn)
                 /* server has seen req, we have lost reply: -ERESTARTSYS */
                 if ( !(req->rq_flags & PTL_RPC_FL_REPLIED)  &&
                      req->rq_xid <= conn->c_last_xid) { 
-                        CDEBUG(D_NET, "lost rep %Ld srv did req: restart "
-                               "[last rcvd %Ld]\n", 
-                               req->rq_xid, conn->c_last_xid);
+                        CDEBUG(D_HA, "RESTARTSYS: xid "LPD64" op %d @ %d\n",
+                               req->rq_xid, req->rq_reqmsg->opc,
+                               req->rq_import->imp_client->cli_request_portal);
                         ptlrpc_restart_req(req);
                 }
 
                 /* service has not seen req, no reply: resend */
                 if ( !(req->rq_flags & PTL_RPC_FL_REPLIED)  &&
                      req->rq_xid > conn->c_last_xid) {
-                        CDEBUG(D_NET,
-                               "lost rep/req %Ld: resend [last rcvd %Ld]\n", 
-                               req->rq_xid, conn->c_last_xid);
+                        CDEBUG(D_HA, "RESEND: xid "LPD64" op %d @ %d\n",
+                               req->rq_xid, req->rq_reqmsg->opc,
+                               req->rq_import->imp_client->cli_request_portal);
                         ptlrpc_resend_req(req);
                 }
 
@@ -203,7 +212,7 @@ static int ll_recover_reconnect(struct ptlrpc_connection *conn)
 
 static int ll_retry_recovery(struct ptlrpc_connection *conn)
 {
-        CERROR("Recovery has failed on conn %p\n", conn);
+        CDEBUG(D_HA, "Recovery has failed on conn %p\n", conn);
 #if 0
         /* XXX use a timer, sideshow bob */
         recovd_conn_fail(conn);
-- 
1.8.3.1