From: jacob Date: Tue, 15 Feb 2005 21:31:18 +0000 (+0000) Subject: b=5684 X-Git-Tag: v1_7_100~1619 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=d0e963668e08dc3b6fcfaad6dc63dfa0d16e9ffd b=5684 r=adilger First stab at improving some user visible error messages. --- diff --git a/lnet/include/libcfs/libcfs.h b/lnet/include/libcfs/libcfs.h index 3788b6d..3dc5a53 100644 --- a/lnet/include/libcfs/libcfs.h +++ b/lnet/include/libcfs/libcfs.h @@ -97,6 +97,7 @@ extern unsigned int portal_printk; #define D_READA 0x00400000 /* read-ahead */ #define D_MMAP 0x00800000 #define D_CONFIG 0x01000000 +#define D_CONSOLE 0x02000000 /* If you change these values, please keep these files up to date... * portals/utils/debug.c * utils/lconf @@ -110,7 +111,7 @@ extern unsigned int portal_printk; #define CDEBUG(mask, format, a...) \ do { \ CHECK_STACK(CDEBUG_STACK); \ - if (((mask) & (D_ERROR | D_EMERG | D_WARNING)) || \ + if (((mask) & (D_ERROR | D_EMERG | D_WARNING | D_CONSOLE)) || \ (portal_debug & (mask) && \ portal_subsystem_debug & DEBUG_SUBSYSTEM)) \ portals_debug_msg(DEBUG_SUBSYSTEM, mask, \ @@ -160,6 +161,12 @@ do { \ #define CERROR(format, a...) CDEBUG_LIMIT(D_ERROR, format, ## a) #define CEMERG(format, a...) CDEBUG(D_EMERG, format, ## a) +#define LCONSOLE(mask, format, a...) CDEBUG(D_CONSOLE | (mask), format, ## a) +#define LCONSOLE_INFO(format, a...) CDEBUG_LIMIT(D_CONSOLE, format, ## a) +#define LCONSOLE_WARN(format, a...) CDEBUG_LIMIT(D_CONSOLE | D_WARNING, format, ## a) +#define LCONSOLE_ERROR(format, a...) CDEBUG_LIMIT(D_CONSOLE | D_ERROR, format, ## a) +#define LCONSOLE_EMERG(format, a...) CDEBUG(D_CONSOLE | D_EMERG, format, ## a) + #define GOTO(label, rc) \ do { \ long GOTO__ret = (long)(rc); \ @@ -413,7 +420,6 @@ extern void portals_debug_msg(int subsys, int mask, char *file, const char *fn, char *format, ...) __attribute__ ((format (printf, 7, 8))); - static inline void cfs_slow_warning(cfs_time_t now, int seconds, char *msg) { if (cfs_time_after(cfs_time_current(), diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c index 2b08e1c..33b721d 100644 --- a/lnet/klnds/socklnd/socklnd_cb.c +++ b/lnet/klnds/socklnd/socklnd_cb.c @@ -479,12 +479,26 @@ ksocknal_process_transmit (ksock_conn_t *conn, ksock_tx_t *tx) /* Actual error */ LASSERT (rc < 0); - if (!conn->ksnc_closing) + if (!conn->ksnc_closing) { + switch (rc) { + case -ECONNRESET: + LCONSOLE_WARN("Host %u.%u.%u.%u reset our connection " + "while we were sending data; it may have " + "rebooted.\n", + HIPQUAD(conn->ksnc_ipaddr)); + break; + default: + LCONSOLE_WARN("There was an unexpected network error " + "while writing to %u.%u.%u.%u: %d.\n", + HIPQUAD(conn->ksnc_ipaddr), rc); + break; + } CERROR("[%p] Error %d on write to "LPX64 " ip %d.%d.%d.%d:%d\n", conn, rc, conn->ksnc_peer->ksnp_nid, HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port); + } ksocknal_close_conn_and_siblings (conn, rc); ksocknal_tx_launched (tx); @@ -1798,6 +1812,7 @@ ksocknal_recv_hello (ksock_conn_t *conn, ptl_nid_t *nid, int type; ptl_hdr_t hdr; ptl_magicversion_t *hmv; + char ipbuf[PTL_NALFMT_SIZE]; hmv = (ptl_magicversion_t *)&hdr.dest_nid; LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid)); @@ -1860,6 +1875,16 @@ ksocknal_recv_hello (ksock_conn_t *conn, ptl_nid_t *nid, if (*nid == PTL_NID_ANY) { /* don't know peer's nid yet */ *nid = le64_to_cpu(hdr.src_nid); } else if (*nid != le64_to_cpu (hdr.src_nid)) { + LCONSOLE_ERROR("Connected successfully to nid "LPX64" on host " + "%u.%u.%u.%u, but they claimed they were nid " + LPX64" (%s); please check your Lustre " + "configuration.\n", + *nid, HIPQUAD(conn->ksnc_ipaddr), + le64_to_cpu(hdr.src_nid), + portals_nid2str(SOCKNAL, + le64_to_cpu(hdr.src_nid), + ipbuf)); + CERROR ("Connected to nid "LPX64"@%u.%u.%u.%u " "but expecting "LPX64"\n", le64_to_cpu (hdr.src_nid), @@ -1964,6 +1989,7 @@ ksocknal_autoconnect (ksock_route_t *route) unsigned long flags; int rc; int type; + char *err_msg = NULL; for (;;) { for (type = 0; type < SOCKNAL_CONN_NTYPES; type++) @@ -1984,6 +2010,52 @@ ksocknal_autoconnect (ksock_route_t *route) } } + switch (rc) { + /* "normal" errors */ + case -ECONNREFUSED: + LCONSOLE_ERROR("Connection was refused by host %u.%u.%u.%u on " + "port %d; check that Lustre is running on that " + "node.\n", + HIPQUAD(route->ksnr_ipaddr), + route->ksnr_port); + break; + case -EHOSTUNREACH: + case -ENETUNREACH: + LCONSOLE_ERROR("Host %u.%u.%u.%u was unreachable; the network " + "or that node may be down, or Lustre may be " + "misconfigured.\n", + HIPQUAD(route->ksnr_ipaddr)); + break; + case -ETIMEDOUT: + LCONSOLE_ERROR("Connecting to host %u.%u.%u.%u on port %d took " + "too long; that node may be hung or " + "experiencing high load.\n", + HIPQUAD(route->ksnr_ipaddr), + route->ksnr_port); + break; + /* errors that should be rare */ + case -EPROTO: + err_msg = "Portals could not negotiate a connection"; + break; + case -EAGAIN: + case -EADDRINUSE: + /* -EAGAIN is out of ports, but we specify the ports + * manually. we really should never get this */ + err_msg = "no privileged ports were available"; + break; + default: + err_msg = "unknown error"; + break; + } + + if (err_msg) { + LCONSOLE_ERROR("There was an unexpected error connecting to host " + "%u.%u.%u.%u on port %d: %s (error code %d).\n", + HIPQUAD(route->ksnr_ipaddr), + route->ksnr_port, + err_msg, -rc); + } + /* Connection attempt failed */ write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); @@ -2106,11 +2178,34 @@ ksocknal_find_timed_out_conn (ksock_peer_t *peer) LASSERT (!conn->ksnc_closing); if (SOCK_ERROR(conn->ksnc_sock) != 0) { - /* Something (e.g. failed keepalive) set the socket error */ atomic_inc (&conn->ksnc_refcount); + + switch (SOCK_ERROR(conn->ksnc_sock)) { + case ECONNRESET: + LCONSOLE_WARN("A connection with %u.%u.%u.%u " + "was reset; they may have " + "rebooted.\n", + HIPQUAD(conn->ksnc_ipaddr)); + break; + case ETIMEDOUT: + LCONSOLE_WARN("A connection with %u.%u.%u.%u " + "timed out; the network or that " + "node may be down.\n", + HIPQUAD(conn->ksnc_ipaddr)); + break; + default: + LCONSOLE_WARN("An unexpected network error " + "occurred with %u.%u.%u.%u: %d.\n", + HIPQUAD(conn->ksnc_ipaddr), + SOCK_ERROR(conn->ksnc_sock)); + break; + } + + /* Something (e.g. failed keepalive) set the socket error */ CERROR ("Socket error %d: "LPX64" %p %d.%d.%d.%d\n", SOCK_ERROR(conn->ksnc_sock), peer->ksnp_nid, conn, HIPQUAD(conn->ksnc_ipaddr)); + return (conn); } @@ -2119,6 +2214,10 @@ ksocknal_find_timed_out_conn (ksock_peer_t *peer) conn->ksnc_rx_deadline)) { /* Timed out incomplete incoming message */ atomic_inc (&conn->ksnc_refcount); + LCONSOLE_ERROR("A timeout occurred receiving data from " + "%u.%u.%u.%u; the network or that node " + "may be down.\n", + HIPQUAD(conn->ksnc_ipaddr)); CERROR ("Timed out RX from "LPX64" %p %d.%d.%d.%d\n", peer->ksnp_nid,conn,HIPQUAD(conn->ksnc_ipaddr)); return (conn); @@ -2131,6 +2230,10 @@ ksocknal_find_timed_out_conn (ksock_peer_t *peer) /* Timed out messages queued for sending or * buffered in the socket's send buffer */ atomic_inc (&conn->ksnc_refcount); + LCONSOLE_ERROR("A timeout occurred sending data to " + "%u.%u.%u.%u; the network or that node " + "may be down.\n", + HIPQUAD(conn->ksnc_ipaddr)); CERROR ("Timed out TX to "LPX64" %s%d %p %d.%d.%d.%d\n", peer->ksnp_nid, list_empty (&conn->ksnc_tx_queue) ? "" : "Q ", diff --git a/lnet/libcfs/darwin/darwin-tracefile.c b/lnet/libcfs/darwin/darwin-tracefile.c index 9f58267..c621129 100644 --- a/lnet/libcfs/darwin/darwin-tracefile.c +++ b/lnet/libcfs/darwin/darwin-tracefile.c @@ -65,12 +65,17 @@ void print_to_console(struct ptldebug_header *hdr, int mask, char *buf, } else if ((mask & D_WARNING) != 0) { prefix = "Lustre"; ptype = KERN_WARNING; - } else if (portal_printk != 0) { + } else if (portal_printk != 0 || (mask & D_CONSOLE)) { prefix = "Lustre"; ptype = KERN_INFO; } - printk("%s%s: %d:%d:(%s:%d:%s()) %*s", ptype, prefix, hdr->ph_pid, - hdr->ph_extern_pid, file, hdr->ph_line_num, fn, len, buf); + + if ((mask & D_CONSOLE) != 0) { + printk("%s%s: %.*s", ptype, prefix, len, buf); + } else { + printk("%s%s: %d:%d:(%s:%d:%s()) %*s", ptype, prefix, hdr->ph_pid, + hdr->ph_extern_pid, file, hdr->ph_line_num, fn, len, buf); + } } /* diff --git a/lnet/libcfs/debug.c b/lnet/libcfs/debug.c index 8cbb117..f5dfaba 100644 --- a/lnet/libcfs/debug.c +++ b/lnet/libcfs/debug.c @@ -209,8 +209,13 @@ char *portals_nid2str(int nal, ptl_nid_t nid, char *str) case OPENIBNAL: case RANAL: case SOCKNAL: - snprintf(str, PTL_NALFMT_SIZE, "%u:%u.%u.%u.%u", - (__u32)(nid >> 32), HIPQUAD(nid)); + if ((__u32)(nid >> 32)) { + snprintf(str, PTL_NALFMT_SIZE, "%u:%u.%u.%u.%u", + (__u32)(nid >> 32), HIPQUAD(nid)); + } else { + snprintf(str, PTL_NALFMT_SIZE, "%u.%u.%u.%u", + HIPQUAD(nid)); + } break; case QSWNAL: case GMNAL: diff --git a/lnet/libcfs/linux/linux-tracefile.c b/lnet/libcfs/linux/linux-tracefile.c index 0c134ee..daba696 100644 --- a/lnet/libcfs/linux/linux-tracefile.c +++ b/lnet/libcfs/linux/linux-tracefile.c @@ -74,12 +74,17 @@ void print_to_console(struct ptldebug_header *hdr, int mask, char *buf, } else if ((mask & D_WARNING) != 0) { prefix = "Lustre"; ptype = KERN_WARNING; - } else if (portal_printk) { + } else if (portal_printk != 0 || (mask & D_CONSOLE)) { prefix = "Lustre"; ptype = KERN_INFO; } - printk("%s%s: %d:%d:(%s:%d:%s()) %.*s", ptype, prefix, hdr->ph_pid, - hdr->ph_extern_pid, file, hdr->ph_line_num, fn, len, buf); + + if ((mask & D_CONSOLE) != 0) { + printk("%s%s: %.*s", ptype, prefix, len, buf); + } else { + printk("%s%s: %d:%d:(%s:%d:%s()) %.*s", ptype, prefix, hdr->ph_pid, + hdr->ph_extern_pid, file, hdr->ph_line_num, fn, len, buf); + } return; } diff --git a/lnet/libcfs/tracefile.c b/lnet/libcfs/tracefile.c index 439589f..e384773 100644 --- a/lnet/libcfs/tracefile.c +++ b/lnet/libcfs/tracefile.c @@ -226,7 +226,7 @@ void portals_debug_msg(int subsys, int mask, char *file, const char *fn, "tage->used == %u in portals_debug_msg\n", tage->used); out: - if ((mask & (D_EMERG | D_ERROR | D_WARNING)) || portal_printk) + if ((mask & (D_EMERG | D_ERROR | D_WARNING | D_CONSOLE)) || portal_printk) print_to_console(&header, mask, debug_buf, needed, file, fn); trace_put_tcd(tcd, flags); diff --git a/lnet/utils/debug.c b/lnet/utils/debug.c index 2931321..7b0ca54 100644 --- a/lnet/utils/debug.c +++ b/lnet/utils/debug.c @@ -75,7 +75,7 @@ static const char *portal_debug_masks[] = "other", "dentry", "portals", "page", "dlmtrace", "error", "emerg", "ha", "rpctrace", "vfstrace", "reada", "mmap", - "config", NULL}; + "config", "console", NULL}; struct debug_daemon_cmd { char *cmd;