Whamcloud - gitweb
- Add some more verbose logging of the cases that get clients into recovery.
author shaver <shaver>
Thu, 3 Oct 2002 17:02:19 +0000 (17:02 +0000)
committer shaver <shaver>
Thu, 3 Oct 2002 17:02:19 +0000 (17:02 +0000)
- l_wait_event returns -EINTR, not -ERESTARTSYS.

lustre/obdclass/genops.c
lustre/ptlrpc/client.c
lustre/ptlrpc/recovd.c
lustre/ptlrpc/recover.c

index c843d68..2649663 100644 (file)
@@ -38,6 +38,9 @@ static int sync_io_timeout(void *data)
         LASSERT(desc);
         LASSERT(desc->bd_connection);
 
+        CERROR("IO of %d pages to/from %s:%d (conn %p) timed out\n",
+               desc->bd_page_count, desc->bd_connection->c_remote_uuid,
+               desc->bd_portal, desc->bd_connection);
         desc->bd_connection->c_level = LUSTRE_CONN_RECOVD;
         desc->bd_flags |= PTL_RPC_FL_TIMEOUT;
         if (desc->bd_connection && class_signal_connection_failure) {
@@ -73,7 +76,7 @@ int ll_sync_io_cb(struct io_cb_data *data, int err, int phase)
                 ret = l_wait_event(data->waitq, data->complete, &lwi);
                 if (atomic_dec_and_test(&data->refcount))
                         OBD_FREE(data, sizeof(*data));
-                if (ret == -ERESTARTSYS)
+                if (ret == -EINTR)
                         return ret;
         } else if (phase == CB_PHASE_FINISH) {
                 data->err = err;
index a57dafb..1097b58 100644 (file)
@@ -447,8 +447,10 @@ static int expired_request(void *data)
         struct ptlrpc_request *req = data;
         
         ENTRY;
-        CERROR("req timeout on connid %d xid %Ld\n", req->rq_connid,
-               (unsigned long long)req->rq_xid);
+        CERROR("req timeout on connid %d xid %Ld portal %d op %d\n",
+               req->rq_connid, (unsigned long long)req->rq_xid,
+               req->rq_import->imp_client->cli_request_portal,
+               req->rq_reqmsg->opc);
         req->rq_flags |= PTL_RPC_FL_TIMEOUT;
         if (!req->rq_import->imp_connection->c_recovd_data.rd_recovd)
                 RETURN(1);
index 2c25dd9..33a699b 100644 (file)
@@ -53,7 +53,7 @@ void recovd_conn_fail(struct ptlrpc_connection *conn)
 
         spin_lock(&recovd->recovd_lock);
         if (rd->rd_phase != RD_IDLE) {
-                CDEBUG(D_INFO, "connection %p to %s already in recovery\n",
+                CERROR("connection %p to %s already in recovery\n",
                        conn, conn->c_remote_uuid);
                 /* XXX need to distinguish from failure-in-recovery */
                 spin_unlock(&recovd->recovd_lock);
@@ -124,7 +124,9 @@ static void dump_connection_list(struct list_head *head)
                 struct ptlrpc_connection *conn =
                         list_entry(tmp, struct ptlrpc_connection,
                                    c_recovd_data.rd_managed_chain);
-                CDEBUG(D_NET, "   %p = %s\n", conn, conn->c_remote_uuid);
+                CERROR("   %p = %s (%d/%d)\n", conn, conn->c_remote_uuid,
+                       conn->c_recovd_data.rd_phase,
+                       conn->c_recovd_data.rd_next_phase);
         }
 }
 
@@ -136,9 +138,9 @@ static int recovd_handle_event(struct recovd_obd *recovd)
 
         spin_lock(&recovd->recovd_lock);
 
-        CDEBUG(D_NET, "managed: \n");
+        CERROR("managed: \n");
         dump_connection_list(&recovd->recovd_managed_items);
-        CDEBUG(D_NET, "troubled: \n");
+        CERROR("troubled: \n");
         dump_connection_list(&recovd->recovd_troubled_items);
 
         /*
index e062bcc..e17ae79 100644 (file)
@@ -98,6 +98,9 @@ static int ll_recover_upcall(struct ptlrpc_connection *conn)
                 CERROR("Error invoking recovery upcall (%s): %d\n",
                        obd_recovery_upcall, rc);
                 CERROR("Check /proc/sys/lustre/recovery_upcall?\n");
+        } else {
+                CERROR("Invoked upcall %s for connection %s\n",
+                       argv[0], argv[1]);
         }
         RETURN(rc);
 }
@@ -184,7 +187,7 @@ static int ll_recover_reconnect(struct ptlrpc_connection *conn)
         conn->c_level = LUSTRE_CONN_FULL;
         recovd_conn_fixed(conn);
 
-        CDEBUG(D_NET, "recovery complete on conn %p(%s), waking delayed reqs\n",
+        CERROR("recovery complete on conn %p(%s), waking delayed reqs\n",
                conn, conn->c_remote_uuid);
         /* Finally, continue what we delayed since recovery started */
         list_for_each_safe(tmp, pos, &conn->c_delayed_head) { 
@@ -200,6 +203,7 @@ static int ll_recover_reconnect(struct ptlrpc_connection *conn)
 
 static int ll_retry_recovery(struct ptlrpc_connection *conn)
 {
+        CERROR("Recovery has failed on conn %p\n", conn);
 #if 0
         /* XXX use a timer, sideshow bob */
         recovd_conn_fail(conn);