Whamcloud - gitweb
b=5684
authorjacob <jacob>
Tue, 15 Feb 2005 21:31:18 +0000 (21:31 +0000)
committerjacob <jacob>
Tue, 15 Feb 2005 21:31:18 +0000 (21:31 +0000)
r=adilger

First stab at improving some user visible error messages.

lnet/include/libcfs/libcfs.h
lnet/klnds/socklnd/socklnd_cb.c
lnet/libcfs/darwin/darwin-tracefile.c
lnet/libcfs/debug.c
lnet/libcfs/linux/linux-tracefile.c
lnet/libcfs/tracefile.c
lnet/utils/debug.c

index 3788b6d..3dc5a53 100644 (file)
@@ -97,6 +97,7 @@ extern unsigned int portal_printk;
 #define D_READA       0x00400000 /* read-ahead */
 #define D_MMAP        0x00800000
 #define D_CONFIG      0x01000000
 #define D_READA       0x00400000 /* read-ahead */
 #define D_MMAP        0x00800000
 #define D_CONFIG      0x01000000
+#define D_CONSOLE     0x02000000
 /* If you change these values, please keep these files up to date...
  *    portals/utils/debug.c
  *    utils/lconf
 /* If you change these values, please keep these files up to date...
  *    portals/utils/debug.c
  *    utils/lconf
@@ -110,7 +111,7 @@ extern unsigned int portal_printk;
 #define CDEBUG(mask, format, a...)                                            \
 do {                                                                          \
         CHECK_STACK(CDEBUG_STACK);                                            \
 #define CDEBUG(mask, format, a...)                                            \
 do {                                                                          \
         CHECK_STACK(CDEBUG_STACK);                                            \
-        if (((mask) & (D_ERROR | D_EMERG | D_WARNING)) ||                     \
+        if (((mask) & (D_ERROR | D_EMERG | D_WARNING | D_CONSOLE)) ||         \
             (portal_debug & (mask) &&                                         \
              portal_subsystem_debug & DEBUG_SUBSYSTEM))                       \
                 portals_debug_msg(DEBUG_SUBSYSTEM, mask,                      \
             (portal_debug & (mask) &&                                         \
              portal_subsystem_debug & DEBUG_SUBSYSTEM))                       \
                 portals_debug_msg(DEBUG_SUBSYSTEM, mask,                      \
@@ -160,6 +161,12 @@ do {                                                                          \
 #define CERROR(format, a...) CDEBUG_LIMIT(D_ERROR, format, ## a)
 #define CEMERG(format, a...) CDEBUG(D_EMERG, format, ## a)
 
 #define CERROR(format, a...) CDEBUG_LIMIT(D_ERROR, format, ## a)
 #define CEMERG(format, a...) CDEBUG(D_EMERG, format, ## a)
 
+#define LCONSOLE(mask, format, a...) CDEBUG(D_CONSOLE | (mask), format, ## a)
+#define LCONSOLE_INFO(format, a...)  CDEBUG_LIMIT(D_CONSOLE, format, ## a)
+#define LCONSOLE_WARN(format, a...)  CDEBUG_LIMIT(D_CONSOLE | D_WARNING, format, ## a)
+#define LCONSOLE_ERROR(format, a...) CDEBUG_LIMIT(D_CONSOLE | D_ERROR, format, ## a)
+#define LCONSOLE_EMERG(format, a...) CDEBUG(D_CONSOLE | D_EMERG, format, ## a)
+
 #define GOTO(label, rc)                                                 \
 do {                                                                    \
         long GOTO__ret = (long)(rc);                                    \
 #define GOTO(label, rc)                                                 \
 do {                                                                    \
         long GOTO__ret = (long)(rc);                                    \
@@ -413,7 +420,6 @@ extern void portals_debug_msg(int subsys, int mask, char *file, const char *fn,
                               char *format, ...)
             __attribute__ ((format (printf, 7, 8)));
 
                               char *format, ...)
             __attribute__ ((format (printf, 7, 8)));
 
-
 static inline void cfs_slow_warning(cfs_time_t now, int seconds, char *msg)
 {
         if (cfs_time_after(cfs_time_current(),
 static inline void cfs_slow_warning(cfs_time_t now, int seconds, char *msg)
 {
         if (cfs_time_after(cfs_time_current(),
index 2b08e1c..33b721d 100644 (file)
@@ -479,12 +479,26 @@ ksocknal_process_transmit (ksock_conn_t *conn, ksock_tx_t *tx)
         /* Actual error */
         LASSERT (rc < 0);
 
         /* Actual error */
         LASSERT (rc < 0);
 
-        if (!conn->ksnc_closing)
+        if (!conn->ksnc_closing) {
+                switch (rc) {
+                case -ECONNRESET:
+                        LCONSOLE_WARN("Host %u.%u.%u.%u reset our connection "
+                                      "while we were sending data; it may have "
+                                      "rebooted.\n",
+                                      HIPQUAD(conn->ksnc_ipaddr));
+                        break;
+                default:
+                        LCONSOLE_WARN("There was an unexpected network error "
+                                      "while writing to %u.%u.%u.%u: %d.\n",
+                                      HIPQUAD(conn->ksnc_ipaddr), rc);
+                        break;
+                }
                 CERROR("[%p] Error %d on write to "LPX64
                        " ip %d.%d.%d.%d:%d\n", conn, rc,
                        conn->ksnc_peer->ksnp_nid,
                        HIPQUAD(conn->ksnc_ipaddr),
                        conn->ksnc_port);
                 CERROR("[%p] Error %d on write to "LPX64
                        " ip %d.%d.%d.%d:%d\n", conn, rc,
                        conn->ksnc_peer->ksnp_nid,
                        HIPQUAD(conn->ksnc_ipaddr),
                        conn->ksnc_port);
+        }
 
         ksocknal_close_conn_and_siblings (conn, rc);
         ksocknal_tx_launched (tx);
 
         ksocknal_close_conn_and_siblings (conn, rc);
         ksocknal_tx_launched (tx);
@@ -1798,6 +1812,7 @@ ksocknal_recv_hello (ksock_conn_t *conn, ptl_nid_t *nid,
         int                 type;
         ptl_hdr_t           hdr;
         ptl_magicversion_t *hmv;
         int                 type;
         ptl_hdr_t           hdr;
         ptl_magicversion_t *hmv;
+        char                ipbuf[PTL_NALFMT_SIZE];
 
         hmv = (ptl_magicversion_t *)&hdr.dest_nid;
         LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid));
 
         hmv = (ptl_magicversion_t *)&hdr.dest_nid;
         LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid));
@@ -1860,6 +1875,16 @@ ksocknal_recv_hello (ksock_conn_t *conn, ptl_nid_t *nid,
         if (*nid == PTL_NID_ANY) {              /* don't know peer's nid yet */
                 *nid = le64_to_cpu(hdr.src_nid);
         } else if (*nid != le64_to_cpu (hdr.src_nid)) {
         if (*nid == PTL_NID_ANY) {              /* don't know peer's nid yet */
                 *nid = le64_to_cpu(hdr.src_nid);
         } else if (*nid != le64_to_cpu (hdr.src_nid)) {
+                LCONSOLE_ERROR("Connected successfully to nid "LPX64" on host "
+                               "%u.%u.%u.%u, but they claimed they were nid "
+                               LPX64" (%s); please check your Lustre "
+                               "configuration.\n",
+                               *nid, HIPQUAD(conn->ksnc_ipaddr),
+                               le64_to_cpu(hdr.src_nid),
+                               portals_nid2str(SOCKNAL,
+                                               le64_to_cpu(hdr.src_nid),
+                                               ipbuf));
+                               
                 CERROR ("Connected to nid "LPX64"@%u.%u.%u.%u "
                         "but expecting "LPX64"\n",
                         le64_to_cpu (hdr.src_nid),
                 CERROR ("Connected to nid "LPX64"@%u.%u.%u.%u "
                         "but expecting "LPX64"\n",
                         le64_to_cpu (hdr.src_nid),
@@ -1964,6 +1989,7 @@ ksocknal_autoconnect (ksock_route_t *route)
         unsigned long     flags;
         int               rc;
         int               type;
         unsigned long     flags;
         int               rc;
         int               type;
+        char *err_msg = NULL;
         
         for (;;) {
                 for (type = 0; type < SOCKNAL_CONN_NTYPES; type++)
         
         for (;;) {
                 for (type = 0; type < SOCKNAL_CONN_NTYPES; type++)
@@ -1984,6 +2010,52 @@ ksocknal_autoconnect (ksock_route_t *route)
                 }
         }
 
                 }
         }
 
+        switch (rc) {
+        /* "normal" errors */
+        case -ECONNREFUSED:
+                LCONSOLE_ERROR("Connection was refused by host %u.%u.%u.%u on "
+                               "port %d; check that Lustre is running on that "
+                               "node.\n",
+                               HIPQUAD(route->ksnr_ipaddr),
+                               route->ksnr_port);
+                break;
+        case -EHOSTUNREACH:
+        case -ENETUNREACH:
+                LCONSOLE_ERROR("Host %u.%u.%u.%u was unreachable; the network "
+                               "or that node may be down, or Lustre may be "
+                               "misconfigured.\n",
+                               HIPQUAD(route->ksnr_ipaddr));
+                break;
+        case -ETIMEDOUT:
+                LCONSOLE_ERROR("Connecting to host %u.%u.%u.%u on port %d took "
+                               "too long; that node may be hung or "
+                               "experiencing high load.\n",
+                               HIPQUAD(route->ksnr_ipaddr),
+                               route->ksnr_port);
+                break;
+        /* errors that should be rare */
+        case -EPROTO:
+                err_msg = "Portals could not negotiate a connection";
+                break;
+        case -EAGAIN:
+        case -EADDRINUSE:
+                /* -EAGAIN is out of ports, but we specify the ports
+                 * manually.  we really should never get this */
+                err_msg = "no privileged ports were available";
+                break;
+        default:
+                err_msg = "unknown error";
+                break;
+        }
+
+        if (err_msg) {
+                LCONSOLE_ERROR("There was an unexpected error connecting to host "
+                               "%u.%u.%u.%u on port %d: %s (error code %d).\n",
+                               HIPQUAD(route->ksnr_ipaddr),
+                               route->ksnr_port,
+                               err_msg, -rc);
+        }
+
         /* Connection attempt failed */
 
         write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags);
         /* Connection attempt failed */
 
         write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags);
@@ -2106,11 +2178,34 @@ ksocknal_find_timed_out_conn (ksock_peer_t *peer)
                 LASSERT (!conn->ksnc_closing);
 
                 if (SOCK_ERROR(conn->ksnc_sock) != 0) {
                 LASSERT (!conn->ksnc_closing);
 
                 if (SOCK_ERROR(conn->ksnc_sock) != 0) {
-                        /* Something (e.g. failed keepalive) set the socket error */
                         atomic_inc (&conn->ksnc_refcount);
                         atomic_inc (&conn->ksnc_refcount);
+
+                        switch (SOCK_ERROR(conn->ksnc_sock)) {
+                        case ECONNRESET:
+                                LCONSOLE_WARN("A connection with %u.%u.%u.%u "
+                                              "was reset; they may have "
+                                              "rebooted.\n",
+                                              HIPQUAD(conn->ksnc_ipaddr));
+                                break;
+                        case ETIMEDOUT:
+                                LCONSOLE_WARN("A connection with %u.%u.%u.%u "
+                                              "timed out; the network or that "
+                                              "node may be down.\n",
+                                              HIPQUAD(conn->ksnc_ipaddr));
+                                break;
+                        default:
+                                LCONSOLE_WARN("An unexpected network error "
+                                              "occurred with %u.%u.%u.%u: %d.\n",
+                                              HIPQUAD(conn->ksnc_ipaddr),
+                                              SOCK_ERROR(conn->ksnc_sock));
+                                break;
+                        }
+
+                        /* Something (e.g. failed keepalive) set the socket error */
                         CERROR ("Socket error %d: "LPX64" %p %d.%d.%d.%d\n",
                                 SOCK_ERROR(conn->ksnc_sock), peer->ksnp_nid,
                                 conn, HIPQUAD(conn->ksnc_ipaddr));
                         CERROR ("Socket error %d: "LPX64" %p %d.%d.%d.%d\n",
                                 SOCK_ERROR(conn->ksnc_sock), peer->ksnp_nid,
                                 conn, HIPQUAD(conn->ksnc_ipaddr));
+
                         return (conn);
                 }
 
                         return (conn);
                 }
 
@@ -2119,6 +2214,10 @@ ksocknal_find_timed_out_conn (ksock_peer_t *peer)
                                       conn->ksnc_rx_deadline)) {
                         /* Timed out incomplete incoming message */
                         atomic_inc (&conn->ksnc_refcount);
                                       conn->ksnc_rx_deadline)) {
                         /* Timed out incomplete incoming message */
                         atomic_inc (&conn->ksnc_refcount);
+                        LCONSOLE_ERROR("A timeout occurred receiving data from "
+                                       "%u.%u.%u.%u; the network or that node "
+                                       "may be down.\n",
+                                       HIPQUAD(conn->ksnc_ipaddr));
                         CERROR ("Timed out RX from "LPX64" %p %d.%d.%d.%d\n",
                                 peer->ksnp_nid,conn,HIPQUAD(conn->ksnc_ipaddr));
                         return (conn);
                         CERROR ("Timed out RX from "LPX64" %p %d.%d.%d.%d\n",
                                 peer->ksnp_nid,conn,HIPQUAD(conn->ksnc_ipaddr));
                         return (conn);
@@ -2131,6 +2230,10 @@ ksocknal_find_timed_out_conn (ksock_peer_t *peer)
                         /* Timed out messages queued for sending or
                          * buffered in the socket's send buffer */
                         atomic_inc (&conn->ksnc_refcount);
                         /* Timed out messages queued for sending or
                          * buffered in the socket's send buffer */
                         atomic_inc (&conn->ksnc_refcount);
+                        LCONSOLE_ERROR("A timeout occurred sending data to "
+                                       "%u.%u.%u.%u; the network or that node "
+                                       "may be down.\n",
+                                       HIPQUAD(conn->ksnc_ipaddr));
                         CERROR ("Timed out TX to "LPX64" %s%d %p %d.%d.%d.%d\n",
                                 peer->ksnp_nid,
                                 list_empty (&conn->ksnc_tx_queue) ? "" : "Q ",
                         CERROR ("Timed out TX to "LPX64" %s%d %p %d.%d.%d.%d\n",
                                 peer->ksnp_nid,
                                 list_empty (&conn->ksnc_tx_queue) ? "" : "Q ",
index 9f58267..c621129 100644 (file)
@@ -65,12 +65,17 @@ void print_to_console(struct ptldebug_header *hdr, int mask, char *buf,
        } else if ((mask & D_WARNING) != 0) { 
                prefix = "Lustre"; 
                ptype = KERN_WARNING; 
        } else if ((mask & D_WARNING) != 0) { 
                prefix = "Lustre"; 
                ptype = KERN_WARNING; 
-       } else if (portal_printk != 0) { 
+       } else if (portal_printk != 0 || (mask & D_CONSOLE)) {
                prefix = "Lustre"; 
                ptype = KERN_INFO; 
        } 
                prefix = "Lustre"; 
                ptype = KERN_INFO; 
        } 
-       printk("%s%s: %d:%d:(%s:%d:%s()) %*s", ptype, prefix, hdr->ph_pid, 
-              hdr->ph_extern_pid, file, hdr->ph_line_num, fn, len, buf);
+
+       if ((mask & D_CONSOLE) != 0) {
+               printk("%s%s: %.*s", ptype, prefix, len, buf);
+       } else {
+               printk("%s%s: %d:%d:(%s:%d:%s()) %*s", ptype, prefix, hdr->ph_pid, 
+                      hdr->ph_extern_pid, file, hdr->ph_line_num, fn, len, buf);
+       }
 }
 
 /*
 }
 
 /*
index 8cbb117..f5dfaba 100644 (file)
@@ -209,8 +209,13 @@ char *portals_nid2str(int nal, ptl_nid_t nid, char *str)
         case OPENIBNAL:
         case RANAL:
         case SOCKNAL:
         case OPENIBNAL:
         case RANAL:
         case SOCKNAL:
-                snprintf(str, PTL_NALFMT_SIZE, "%u:%u.%u.%u.%u",
-                         (__u32)(nid >> 32), HIPQUAD(nid));
+                if ((__u32)(nid >> 32)) {
+                        snprintf(str, PTL_NALFMT_SIZE, "%u:%u.%u.%u.%u",
+                                 (__u32)(nid >> 32), HIPQUAD(nid));
+                } else {
+                        snprintf(str, PTL_NALFMT_SIZE, "%u.%u.%u.%u",
+                                 HIPQUAD(nid));
+                }
                 break;
         case QSWNAL:
         case GMNAL:
                 break;
         case QSWNAL:
         case GMNAL:
index 0c134ee..daba696 100644 (file)
@@ -74,12 +74,17 @@ void print_to_console(struct ptldebug_header *hdr, int mask, char *buf,
        } else if ((mask & D_WARNING) != 0) { 
                prefix = "Lustre"; 
                ptype = KERN_WARNING; 
        } else if ((mask & D_WARNING) != 0) { 
                prefix = "Lustre"; 
                ptype = KERN_WARNING; 
-       } else if (portal_printk) { 
+       } else if (portal_printk != 0 || (mask & D_CONSOLE)) {
                prefix = "Lustre"; 
                ptype = KERN_INFO; 
        } 
                prefix = "Lustre"; 
                ptype = KERN_INFO; 
        } 
-       printk("%s%s: %d:%d:(%s:%d:%s()) %.*s", ptype, prefix, hdr->ph_pid, 
-               hdr->ph_extern_pid, file, hdr->ph_line_num, fn, len, buf);
+
+       if ((mask & D_CONSOLE) != 0) {
+               printk("%s%s: %.*s", ptype, prefix, len, buf);
+       } else {
+               printk("%s%s: %d:%d:(%s:%d:%s()) %.*s", ptype, prefix, hdr->ph_pid, 
+                      hdr->ph_extern_pid, file, hdr->ph_line_num, fn, len, buf);
+       }
        return;
 }
 
        return;
 }
 
index 439589f..e384773 100644 (file)
@@ -226,7 +226,7 @@ void portals_debug_msg(int subsys, int mask, char *file, const char *fn,
                        "tage->used == %u in portals_debug_msg\n", tage->used);
 
  out:
                        "tage->used == %u in portals_debug_msg\n", tage->used);
 
  out:
-        if ((mask & (D_EMERG | D_ERROR | D_WARNING)) || portal_printk)
+        if ((mask & (D_EMERG | D_ERROR | D_WARNING | D_CONSOLE)) || portal_printk)
                 print_to_console(&header, mask, debug_buf, needed, file, fn);
 
         trace_put_tcd(tcd, flags);
                 print_to_console(&header, mask, debug_buf, needed, file, fn);
 
         trace_put_tcd(tcd, flags);
index 2931321..7b0ca54 100644 (file)
@@ -75,7 +75,7 @@ static const char *portal_debug_masks[] =
          "other", "dentry", "portals", "page", 
          "dlmtrace", "error", "emerg", "ha", 
          "rpctrace", "vfstrace", "reada", "mmap",
          "other", "dentry", "portals", "page", 
          "dlmtrace", "error", "emerg", "ha", 
          "rpctrace", "vfstrace", "reada", "mmap",
-         "config", NULL};
+         "config", "console", NULL};
 
 struct debug_daemon_cmd {
         char *cmd;
 
 struct debug_daemon_cmd {
         char *cmd;