From 439addad84514e7ff6452710e6a7f15b80d7b589 Mon Sep 17 00:00:00 2001 From: adilger Date: Sun, 19 Jun 2005 09:18:22 +0000 Subject: [PATCH] Land b_release_1_4_3 onto HEAD (20050619_0305) b=6411 : enable rate-limiting of console error messages, and some console errors now go only to the kernel log. Use CERROR/CWARN where appropriate to allow rate-limiting of these messages. b=1693 : add /proc/sys/portals/catastrophe entry which will report if that node has previously LBUGged. --- lnet/ChangeLog | 17 ++++ lnet/include/libcfs/darwin/kp30.h | 2 +- lnet/include/libcfs/libcfs.h | 3 + lnet/include/libcfs/linux/kp30.h | 2 + lnet/klnds/gmlnd/gmlnd_api.c | 41 +++++---- lnet/klnds/gmlnd/gmlnd_cb.c | 8 +- lnet/klnds/gmlnd/gmlnd_comm.c | 163 +++++++++++++++++------------------- lnet/klnds/gmlnd/gmlnd_module.c | 17 ++-- lnet/klnds/gmlnd/gmlnd_utils.c | 96 ++++++++++----------- lnet/klnds/iiblnd/iiblnd_cb.c | 4 +- lnet/klnds/openiblnd/openiblnd.c | 2 +- lnet/klnds/openiblnd/openiblnd_cb.c | 6 +- lnet/klnds/ralnd/ralnd.c | 2 +- lnet/klnds/ralnd/ralnd_cb.c | 6 +- lnet/klnds/socklnd/socklnd.c | 17 ++-- lnet/klnds/socklnd/socklnd_cb.c | 139 +++++++++++++++--------------- lnet/libcfs/darwin/darwin-proc.c | 2 + lnet/libcfs/debug.c | 5 +- lnet/libcfs/linux/linux-proc.c | 5 +- lnet/utils/acceptor.c | 72 +++++++++------- 20 files changed, 313 insertions(+), 296 deletions(-) diff --git a/lnet/ChangeLog b/lnet/ChangeLog index 27ad0b2..3d8fcc8 100644 --- a/lnet/ChangeLog +++ b/lnet/ChangeLog @@ -1,3 +1,20 @@ +2005-06-02 Cluster File Systems, Inc. + * version 1.4.3 + * bug fixes + +Severity : major +Frequency : occasional (large-scale events, cluster reboot, network failure) +Bugzilla : 6411 +Description: too many error messages on console obscure actual problem and + can slow down/panic server, or cause recovery to fail repeatedly +Details : enable rate-limiting of console error messages, and some messages + that were console errors now only go to the kernel log + +Severity : enhancement +Bugzilla : 1693 +Description: add /proc/sys/portals/catastrophe entry which will report if + that node has previously LBUGged + 2005-04-06 Cluster File Systems, Inc. * bugs - update gmnal to use PTL_MTU, fix module refcounting (b=5786) diff --git a/lnet/include/libcfs/darwin/kp30.h b/lnet/include/libcfs/darwin/kp30.h index 7f765e1..5c1acc4 100644 --- a/lnet/include/libcfs/darwin/kp30.h +++ b/lnet/include/libcfs/darwin/kp30.h @@ -32,7 +32,7 @@ #define LASSERT_SPIN_LOCKED(lock) do {} while(0) #endif -#define LBUG_WITH_LOC(file, func, line) do {} while(0) +#define LBUG_WITH_LOC(file, func, line) portals_catastrophe = 1 /* --------------------------------------------------------------------- */ diff --git a/lnet/include/libcfs/libcfs.h b/lnet/include/libcfs/libcfs.h index eca3488..6f3ee42 100644 --- a/lnet/include/libcfs/libcfs.h +++ b/lnet/include/libcfs/libcfs.h @@ -38,6 +38,9 @@ extern unsigned int portal_stack; extern unsigned int portal_debug; extern unsigned int portal_printk; +/* Has there been an LBUG? */ +extern unsigned int portals_catastrophe; + /* * struct ptldebug_header is defined in libcfs//libcfs.h */ diff --git a/lnet/include/libcfs/linux/kp30.h b/lnet/include/libcfs/linux/kp30.h index a4e0b21..d2329ba 100644 --- a/lnet/include/libcfs/linux/kp30.h +++ b/lnet/include/libcfs/linux/kp30.h @@ -93,6 +93,7 @@ static inline void our_cond_resched(void) #define LBUG_WITH_LOC(file, func, line) \ do { \ CEMERG("LBUG - trying to dump log to /tmp/lustre-log\n"); \ + portals_catastrophe = 1; \ portals_debug_dumplog(); \ portals_run_lbug_upcall(file, func, line); \ panic("LBUG"); \ @@ -101,6 +102,7 @@ do { \ #define LBUG_WITH_LOC(file, func, line) \ do { \ CEMERG("LBUG\n"); \ + portals_catastrophe = 1; \ portals_debug_dumpstack(NULL); \ portals_debug_dumplog(); \ portals_run_lbug_upcall(file, func, line); \ diff --git a/lnet/klnds/gmlnd/gmlnd_api.c b/lnet/klnds/gmlnd/gmlnd_api.c index a65272a..bf182b4 100644 --- a/lnet/klnds/gmlnd/gmlnd_api.c +++ b/lnet/klnds/gmlnd/gmlnd_api.c @@ -134,7 +134,7 @@ gmnal_api_startup(nal_t *nal, ptl_pid_t requested_pid, PORTAL_ALLOC(nal_data, sizeof(gmnal_data_t)); if (!nal_data) { - CDEBUG(D_ERROR, "can't get memory\n"); + CERROR("can't get memory\n"); return(PTL_NO_SPACE); } memset(nal_data, 0, sizeof(gmnal_data_t)); @@ -169,7 +169,7 @@ gmnal_api_startup(nal_t *nal, ptl_pid_t requested_pid, */ CDEBUG(D_INFO, "Calling gm_init\n"); if (gm_init() != GM_SUCCESS) { - CDEBUG(D_ERROR, "call to gm_init failed\n"); + CERROR("call to gm_init failed\n"); PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); PORTAL_FREE(libnal, sizeof(lib_nal_t)); return(PTL_FAIL); @@ -187,29 +187,27 @@ gmnal_api_startup(nal_t *nal, ptl_pid_t requested_pid, CDEBUG(D_INFO, "gm_open returned [%d]\n", gm_status); if (gm_status == GM_SUCCESS) { - CDEBUG(D_INFO, "gm_open succeeded port[%p]\n", - nal_data->gm_port); + CDEBUG(D_INFO,"gm_open succeeded port[%p]\n",nal_data->gm_port); } else { switch(gm_status) { case(GM_INVALID_PARAMETER): - CDEBUG(D_ERROR, "gm_open Failure. Invalid Parameter\n"); + CERROR("gm_open Failure. Invalid Parameter\n"); break; case(GM_BUSY): - CDEBUG(D_ERROR, "gm_open Failure. GM Busy\n"); + CERROR("gm_open Failure. GM Busy\n"); break; case(GM_NO_SUCH_DEVICE): - CDEBUG(D_ERROR, "gm_open Failure. No such device\n"); + CERROR("gm_open Failure. No such device\n"); break; case(GM_INCOMPATIBLE_LIB_AND_DRIVER): - CDEBUG(D_ERROR, "gm_open Failure. Incompatile lib " - "and driver\n"); + CERROR("gm_open Failure. Incompatile lib and driver\n"); break; case(GM_OUT_OF_MEMORY): - CDEBUG(D_ERROR, "gm_open Failure. Out of Memory\n"); + CERROR("gm_open Failure. Out of Memory\n"); break; default: - CDEBUG(D_ERROR, "gm_open Failure. Unknow error " - "code [%d]\n", gm_status); + CERROR("gm_open Failure. Unknow error code [%d]\n", + gm_status); break; } GMNAL_GM_LOCK(nal_data); @@ -225,7 +223,7 @@ gmnal_api_startup(nal_t *nal, ptl_pid_t requested_pid, gm_min_size_for_length(gmnal_small_msg_size); if (gmnal_alloc_srxd(nal_data) != GMNAL_STATUS_OK) { - CDEBUG(D_ERROR, "Failed to allocate small rx descriptors\n"); + CERROR("Failed to allocate small rx descriptors\n"); gmnal_free_txd(nal_data); GMNAL_GM_LOCK(nal_data); gm_close(nal_data->gm_port); @@ -255,7 +253,7 @@ gmnal_api_startup(nal_t *nal, ptl_pid_t requested_pid, * Allocate pools of small tx buffers and descriptors */ if (gmnal_alloc_txd(nal_data) != GMNAL_STATUS_OK) { - CDEBUG(D_ERROR, "Failed to allocate small tx descriptors\n"); + CERROR("Failed to allocate small tx descriptors\n"); GMNAL_GM_LOCK(nal_data); gm_close(nal_data->gm_port); gm_finalize(); @@ -285,7 +283,7 @@ gmnal_api_startup(nal_t *nal, ptl_pid_t requested_pid, if (gm_status != GM_SUCCESS) { gmnal_stop_rxthread(nal_data); gmnal_stop_ctthread(nal_data); - CDEBUG(D_ERROR, "can't determine node id\n"); + CERROR("can't determine node id\n"); gmnal_free_txd(nal_data); gmnal_free_srxd(nal_data); GMNAL_GM_LOCK(nal_data); @@ -305,7 +303,7 @@ gmnal_api_startup(nal_t *nal, ptl_pid_t requested_pid, &global_nid); GMNAL_GM_UNLOCK(nal_data); if (gm_status != GM_SUCCESS) { - CDEBUG(D_ERROR, "failed to obtain global id\n"); + CERROR("failed to obtain global id\n"); gmnal_stop_rxthread(nal_data); gmnal_stop_ctthread(nal_data); gmnal_free_txd(nal_data); @@ -327,14 +325,14 @@ gmnal_api_startup(nal_t *nal, ptl_pid_t requested_pid, */ process_id.pid = requested_pid; process_id.nid = global_nid; - + CDEBUG(D_INFO, "portals_pid is [%u]\n", process_id.pid); CDEBUG(D_INFO, "portals_nid is ["LPU64"]\n", process_id.nid); - + CDEBUG(D_PORTALS, "calling lib_init\n"); - if (lib_init(libnal, nal, process_id, + if (lib_init(libnal, nal, process_id, requested_limits, actual_limits) != PTL_OK) { - CDEBUG(D_ERROR, "lib_init failed\n"); + CERROR("lib_init failed\n"); gmnal_stop_rxthread(nal_data); gmnal_stop_ctthread(nal_data); gmnal_free_txd(nal_data); @@ -343,10 +341,9 @@ gmnal_api_startup(nal_t *nal, ptl_pid_t requested_pid, gm_close(nal_data->gm_port); gm_finalize(); GMNAL_GM_UNLOCK(nal_data); - PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); + PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); PORTAL_FREE(libnal, sizeof(lib_nal_t)); return(PTL_FAIL); - } if (libcfs_nal_cmd_register(GMNAL, &gmnal_cmd, libnal->libnal_data) != 0) { diff --git a/lnet/klnds/gmlnd/gmlnd_cb.c b/lnet/klnds/gmlnd/gmlnd_cb.c index 6394c37..ddff6b9 100644 --- a/lnet/klnds/gmlnd/gmlnd_cb.c +++ b/lnet/klnds/gmlnd/gmlnd_cb.c @@ -166,7 +166,7 @@ ptl_err_t gmnal_cb_send(lib_nal_t *libnal, void *private, lib_msg_t *cookie, "] nid["LPU64"]\n", niov, offset, len, nid); nal_data = libnal->libnal_data; if (!nal_data) { - CDEBUG(D_ERROR, "no nal_data\n"); + CERROR("no nal_data\n"); return(PTL_FAIL); } else { CDEBUG(D_INFO, "nal_data [%p]\n", nal_data); @@ -205,7 +205,7 @@ ptl_err_t gmnal_cb_send(lib_nal_t *libnal, void *private, lib_msg_t *cookie, gmnal_small_tx(libnal, private, cookie, hdr, type, nid, pid, stxd, len); } else { - CDEBUG(D_ERROR, "Large message send is not supported\n"); + CERROR("Large message send is not supported\n"); lib_finalize(libnal, private, cookie, PTL_FAIL); return(PTL_FAIL); gmnal_large_tx(libnal, private, cookie, hdr, type, nid, pid, @@ -230,7 +230,7 @@ ptl_err_t gmnal_cb_send_pages(lib_nal_t *libnal, void *private, LPSZ"] len["LPSZ"]\n", nid, kniov, offset, len); nal_data = libnal->libnal_data; if (!nal_data) { - CDEBUG(D_ERROR, "no nal_data\n"); + CERROR("no nal_data\n"); return(PTL_FAIL); } else { CDEBUG(D_INFO, "nal_data [%p]\n", nal_data); @@ -292,7 +292,7 @@ ptl_err_t gmnal_cb_send_pages(lib_nal_t *libnal, void *private, PORTAL_ALLOC(iovec, kniov*sizeof(struct iovec)); iovec_dup = iovec; - CDEBUG(D_ERROR, "Large message send it is not supported yet\n"); + CERROR("Large message send it is not supported yet\n"); PORTAL_FREE(iovec, kniov*sizeof(struct iovec)); return(PTL_FAIL); for (i=0; ibuffer; switch(((gmnal_msghdr_t*)buffer)->type) { case(GMNAL_SMALL_MESSAGE): - gmnal_pre_receive(nal_data, we, - GMNAL_SMALL_MESSAGE); - break; + gmnal_pre_receive(nal_data, we, GMNAL_SMALL_MESSAGE); + break; case(GMNAL_LARGE_MESSAGE_INIT): - gmnal_pre_receive(nal_data, we, - GMNAL_LARGE_MESSAGE_INIT); - break; + gmnal_pre_receive(nal_data,we,GMNAL_LARGE_MESSAGE_INIT); + break; case(GMNAL_LARGE_MESSAGE_ACK): - gmnal_pre_receive(nal_data, we, - GMNAL_LARGE_MESSAGE_ACK); - break; + gmnal_pre_receive(nal_data, we,GMNAL_LARGE_MESSAGE_ACK); + break; default: - CDEBUG(D_ERROR, "Unsupported message type\n"); + CERROR("Unsupported message type\n"); gmnal_rx_bad(nal_data, we, NULL); } PORTAL_FREE(we, sizeof(gmnal_rxtwe_t)); @@ -200,7 +197,7 @@ gmnal_pre_receive(gmnal_data_t *nal_data, gmnal_rxtwe_t *we, int gmnal_type) ptl_hdr_t *portals_hdr; int rc; - CDEBUG(D_INFO, "nal_data [%p], we[%p] type [%d]\n", + CDEBUG(D_INFO, "nal_data [%p], we[%p] type [%d]\n", nal_data, we, gmnal_type); buffer = we->buffer; @@ -217,20 +214,19 @@ gmnal_pre_receive(gmnal_data_t *nal_data, gmnal_rxtwe_t *we, int gmnal_type) "type [%d], length [%d], buffer [%p]\n", snode, sport, type, length, buffer); CDEBUG(D_INFO, "gmnal_msghdr:: Sender node [%u], magic [%d], " - "gmnal_type [%d]\n", gmnal_msghdr->sender_node_id, + "gmnal_type [%d]\n", gmnal_msghdr->sender_node_id, gmnal_msghdr->magic, gmnal_msghdr->type); CDEBUG(D_INFO, "portals_hdr:: Sender node ["LPD64"], " - "dest_node ["LPD64"]\n", portals_hdr->src_nid, + "dest_node ["LPD64"]\n", portals_hdr->src_nid, portals_hdr->dest_nid); - /* - * Get a receive descriptor for this message + * Get a receive descriptor for this message */ srxd = gmnal_rxbuffer_to_srxd(nal_data, buffer); CDEBUG(D_INFO, "Back from gmnal_rxbuffer_to_srxd\n"); if (!srxd) { - CDEBUG(D_ERROR, "Failed to get receive descriptor\n"); + CERROR("Failed to get receive descriptor\n"); /* I think passing a NULL srxd to lib_parse will crash * gmnal_recv() */ LBUG(); @@ -239,7 +235,7 @@ gmnal_pre_receive(gmnal_data_t *nal_data, gmnal_rxtwe_t *we, int gmnal_type) } /* - * no need to bother portals library with this + * no need to bother portals library with this */ if (gmnal_type == GMNAL_LARGE_MESSAGE_ACK) { gmnal_large_tx_ack_received(nal_data, srxd); @@ -250,8 +246,8 @@ gmnal_pre_receive(gmnal_data_t *nal_data, gmnal_rxtwe_t *we, int gmnal_type) srxd->type = gmnal_type; srxd->nsiov = gmnal_msghdr->niov; srxd->gm_source_node = gmnal_msghdr->sender_node_id; - - CDEBUG(D_PORTALS, "Calling lib_parse buffer is [%p]\n", + + CDEBUG(D_PORTALS, "Calling lib_parse buffer is [%p]\n", buffer+GMNAL_MSGHDR_SIZE); /* * control passes to lib, which calls cb_recv @@ -306,7 +302,7 @@ gmnal_rx_bad(gmnal_data_t *nal_data, gmnal_rxtwe_t *we, gmnal_srxd_t *srxd) if (srxd) { gmnal_rx_requeue_buffer(nal_data, srxd); } else { - CDEBUG(D_ERROR, "Can't find a descriptor for this buffer\n"); + CERROR("Can't find a descriptor for this buffer\n"); /* * get rid of it ? */ @@ -334,7 +330,7 @@ gmnal_small_rx(lib_nal_t *libnal, void *private, lib_msg_t *cookie) if (!private) { - CDEBUG(D_ERROR, "gmnal_small_rx no context\n"); + CERROR("gmnal_small_rx no context\n"); lib_finalize(libnal, private, cookie, PTL_FAIL); return(PTL_FAIL); } @@ -386,7 +382,7 @@ gmnal_small_tx(lib_nal_t *libnal, void *private, lib_msg_t *cookie, hdr->dest_nid, hdr->src_nid); if (!nal_data) { - CDEBUG(D_ERROR, "no nal_data\n"); + CERROR("no nal_data\n"); return(PTL_FAIL); } else { CDEBUG(D_INFO, "nal_data [%p]\n", nal_data); @@ -397,7 +393,7 @@ gmnal_small_tx(lib_nal_t *libnal, void *private, lib_msg_t *cookie, &local_nid); GMNAL_GM_UNLOCK(nal_data); if (gm_status != GM_SUCCESS) { - CDEBUG(D_ERROR, "Failed to obtain local id\n"); + CERROR("Failed to obtain local id\n"); return(PTL_FAIL); } CDEBUG(D_INFO, "Local Node_id is [%u][%x]\n", local_nid, local_nid); @@ -431,20 +427,20 @@ gmnal_small_tx(lib_nal_t *libnal, void *private, lib_msg_t *cookie, CDEBUG(D_NET, "Calling gm_send_to_peer port [%p] buffer [%p] " - "gmsize [%lu] msize [%d] global_nid ["LPU64"] local_nid[%d] " - "stxd [%p]\n", nal_data->gm_port, stxd->buffer, stxd->gm_size, + "gmsize [%lu] msize [%d] global_nid ["LPU64"] local_nid[%d] " + "stxd [%p]\n", nal_data->gm_port, stxd->buffer, stxd->gm_size, stxd->msg_size, global_nid, local_nid, stxd); GMNAL_GM_LOCK(nal_data); stxd->gm_priority = GM_LOW_PRIORITY; stxd->gm_target_node = local_nid; - gm_send_to_peer_with_callback(nal_data->gm_port, stxd->buffer, - stxd->gm_size, stxd->msg_size, - GM_LOW_PRIORITY, local_nid, + gm_send_to_peer_with_callback(nal_data->gm_port, stxd->buffer, + stxd->gm_size, stxd->msg_size, + GM_LOW_PRIORITY, local_nid, gmnal_small_tx_callback, (void*)stxd); GMNAL_GM_UNLOCK(nal_data); CDEBUG(D_INFO, "done\n"); - + return(PTL_OK); } @@ -480,7 +476,7 @@ gmnal_small_tx_callback(gm_port_t *gm_port, void *context, gm_status_t status) gm_status); gnid = 0; } - CDEBUG(D_ERROR, "Result of send stxd [%p] is [%s] to [%u]\n", + CERROR("Result of send stxd [%p] is [%s] to [%u]\n", stxd, gmnal_gm_error(status), gnid); } @@ -494,22 +490,20 @@ gmnal_small_tx_callback(gm_port_t *gm_port, void *context, gm_status_t status) /* * do a resend on the dropped ones */ - CDEBUG(D_ERROR, "send stxd [%p] was dropped " - "resending\n", context); + CERROR("send stxd [%p] dropped, resending\n", context); GMNAL_GM_LOCK(nal_data); - gm_send_to_peer_with_callback(nal_data->gm_port, - stxd->buffer, - stxd->gm_size, - stxd->msg_size, - stxd->gm_priority, - stxd->gm_target_node, + gm_send_to_peer_with_callback(nal_data->gm_port, + stxd->buffer, + stxd->gm_size, + stxd->msg_size, + stxd->gm_priority, + stxd->gm_target_node, gmnal_small_tx_callback, context); GMNAL_GM_UNLOCK(nal_data); - return; - case(GM_TIMED_OUT): - case(GM_SEND_TIMED_OUT): + case(GM_TIMED_OUT): + case(GM_SEND_TIMED_OUT): /* * drop these ones */ @@ -628,7 +622,7 @@ void gmnal_drop_sends_callback(struct gm_port *gm_port, void *context, context); GMNAL_GM_UNLOCK(nal_data); } else { - CDEBUG(D_ERROR, "send_to_peer status for stxd [%p] is " + CERROR("send_to_peer status for stxd [%p] is " "[%d][%s]\n", stxd, status, gmnal_gm_error(status)); } @@ -669,7 +663,7 @@ gmnal_large_tx(lib_nal_t *libnal, void *private, lib_msg_t *cookie, if (libnal) nal_data = (gmnal_data_t*)libnal->libnal_data; else { - CDEBUG(D_ERROR, "no libnal.\n"); + CERROR("no libnal.\n"); return(GMNAL_STATUS_FAIL); } @@ -755,7 +749,7 @@ gmnal_large_tx(lib_nal_t *libnal, void *private, lib_msg_t *cookie, iov->iov_base, iov->iov_len); if (gm_status != GM_SUCCESS) { GMNAL_GM_UNLOCK(nal_data); - CDEBUG(D_ERROR, "gm_register_memory returns [%d][%s] " + CERROR("gm_register_memory returns [%d][%s] " "for memory [%p] len ["LPSZ"]\n", gm_status, gmnal_gm_error(gm_status), iov->iov_base, iov->iov_len); @@ -784,7 +778,7 @@ gmnal_large_tx(lib_nal_t *libnal, void *private, lib_msg_t *cookie, &local_nid); if (gm_status != GM_SUCCESS) { GMNAL_GM_UNLOCK(nal_data); - CDEBUG(D_ERROR, "Failed to obtain local id\n"); + CERROR("Failed to obtain local id\n"); gmnal_return_stxd(nal_data, stxd); /* TO DO deregister memory on failure */ return(GMNAL_STATUS_FAIL); @@ -795,9 +789,9 @@ gmnal_large_tx(lib_nal_t *libnal, void *private, lib_msg_t *cookie, local_nid, gmnal_large_tx_callback, (void*)stxd); GMNAL_GM_UNLOCK(nal_data); - + CDEBUG(D_INFO, "done\n"); - + return(PTL_OK); } @@ -837,7 +831,7 @@ gmnal_large_rx(lib_nal_t *libnal, void *private, lib_msg_t *cookie, libnal, private, cookie, nriov, riov, mlen, rlen); if (!srxd) { - CDEBUG(D_ERROR, "gmnal_large_rx no context\n"); + CERROR("gmnal_large_rx no context\n"); lib_finalize(libnal, private, cookie, PTL_FAIL); return(PTL_FAIL); } @@ -880,21 +874,21 @@ gmnal_large_rx(lib_nal_t *libnal, void *private, lib_msg_t *cookie, if (nriov > 1) gm_bcopy(&riov[1], &srxd->riov[1], (nriov-1)*(sizeof(struct iovec))); srxd->nriov = nriov; - + riov = srxd->riov; nriov_dup = nriov; riov_dup = riov; while(nriov--) { - CDEBUG(D_INFO, "Registering memory [%p] len ["LPSZ"] \n", + CDEBUG(D_INFO, "Registering memory [%p] len ["LPSZ"] \n", riov->iov_base, riov->iov_len); GMNAL_GM_LOCK(nal_data); - gm_status = gm_register_memory(nal_data->gm_port, + gm_status = gm_register_memory(nal_data->gm_port, riov->iov_base, riov->iov_len); if (gm_status != GM_SUCCESS) { GMNAL_GM_UNLOCK(nal_data); - CDEBUG(D_ERROR, "gm_register_memory returns [%d][%s] " - "for memory [%p] len ["LPSZ"]\n", - gm_status, gmnal_gm_error(gm_status), + CERROR("gm_register_memory returns [%d][%s] " + "for memory [%p] len ["LPSZ"]\n", + gm_status, gmnal_gm_error(gm_status), riov->iov_base, riov->iov_len); GMNAL_GM_LOCK(nal_data); while (riov_dup != riov) { @@ -918,9 +912,9 @@ gmnal_large_rx(lib_nal_t *libnal, void *private, lib_msg_t *cookie, * now do gm_get to get the data */ srxd->cookie = cookie; - if (gmnal_remote_get(srxd, srxd->nsiov, (struct iovec*)buffer, + if (gmnal_remote_get(srxd, srxd->nsiov, (struct iovec*)buffer, nriov_dup, riov_dup) != GMNAL_STATUS_OK) { - CDEBUG(D_ERROR, "can't get the data"); + CERROR("can't get the data"); } CDEBUG(D_INFO, "lgmanl_large_rx done\n"); @@ -949,7 +943,7 @@ gmnal_remote_get(gmnal_srxd_t *srxd, int nsiov, struct iovec *siov, ncalls = gmnal_copyiov(0, srxd, nsiov, siov, nriov, riov); if (ncalls < 0) { - CDEBUG(D_ERROR, "there's something wrong with the iovecs\n"); + CERROR("there's something wrong with the iovecs\n"); return(GMNAL_STATUS_FAIL); } CDEBUG(D_INFO, "gmnal_remote_get ncalls [%d]\n", ncalls); @@ -959,7 +953,7 @@ gmnal_remote_get(gmnal_srxd_t *srxd, int nsiov, struct iovec *siov, ncalls = gmnal_copyiov(1, srxd, nsiov, siov, nriov, riov); if (ncalls < 0) { - CDEBUG(D_ERROR, "there's something wrong with the iovecs\n"); + CERROR("there's something wrong with the iovecs\n"); return(GMNAL_STATUS_FAIL); } @@ -991,15 +985,15 @@ gmnal_copyiov(int do_copy, gmnal_srxd_t *srxd, int nsiov, CDEBUG(D_TRACE, "copy[%d] nal_data[%p]\n", do_copy, nal_data); if (do_copy) { if (!nal_data) { - CDEBUG(D_ERROR, "Bad args No nal_data\n"); + CERROR("Bad args No nal_data\n"); return(GMNAL_STATUS_FAIL); } GMNAL_GM_LOCK(nal_data); - if (gm_global_id_to_node_id(nal_data->gm_port, - srxd->gm_source_node, + if (gm_global_id_to_node_id(nal_data->gm_port, + srxd->gm_source_node, &source_node) != GM_SUCCESS) { - CDEBUG(D_ERROR, "cannot resolve global_id [%u] " + CERROR("cannot resolve global_id [%u] " "to local node_id\n", srxd->gm_source_node); GMNAL_GM_UNLOCK(nal_data); return(GMNAL_STATUS_FAIL); @@ -1013,7 +1007,7 @@ gmnal_copyiov(int do_copy, gmnal_srxd_t *srxd, int nsiov, * Set pointer in stxd to srxd so callback count in srxd * can be decremented to find last callback to complete */ - CDEBUG(D_INFO, "gmnal_copyiov source node is G[%u]L[%d]\n", + CDEBUG(D_INFO, "gmnal_copyiov source node is G[%u]L[%d]\n", srxd->gm_source_node, source_node); } @@ -1124,8 +1118,7 @@ gmnal_remote_get_callback(gm_port_t *gm_port, void *context, CDEBUG(D_TRACE, "called for context [%p]\n", context); if (status != GM_SUCCESS) { - CDEBUG(D_ERROR, "reports error [%d][%s]\n", status, - gmnal_gm_error(status)); + CERROR("reports error [%d/%s]\n",status,gmnal_gm_error(status)); } spin_lock(&srxd->callback_lock); @@ -1144,11 +1137,11 @@ gmnal_remote_get_callback(gm_port_t *gm_port, void *context, CDEBUG(D_ERROR, "NOT final callback context[%p]\n", srxd); return; } - + /* * Let our client application proceed - */ - CDEBUG(D_ERROR, "final callback context[%p]\n", srxd); + */ + CERROR("final callback context[%p]\n", srxd); lib_finalize(libnal, srxd, srxd->cookie, PTL_OK); /* @@ -1164,10 +1157,10 @@ gmnal_remote_get_callback(gm_port_t *gm_port, void *context, riov = srxd->riov; GMNAL_GM_LOCK(nal_data); while (nriov--) { - CDEBUG(D_ERROR, "deregister memory [%p]\n", riov->iov_base); - if (gm_deregister_memory(srxd->nal_data->gm_port, - riov->iov_base, riov->iov_len)) { - CDEBUG(D_ERROR, "failed to deregister memory [%p]\n", + CERROR("deregister memory [%p]\n", riov->iov_base); + if (gm_deregister_memory(srxd->nal_data->gm_port, + riov->iov_base, riov->iov_len)) { + CERROR("failed to deregister memory [%p]\n", riov->iov_base); } riov++; @@ -1202,7 +1195,7 @@ gmnal_large_tx_ack(gmnal_data_t *nal_data, gmnal_srxd_t *srxd) unsigned int local_nid; gm_status_t gm_status = GM_SUCCESS; - CDEBUG(D_TRACE, "srxd[%p] target_node [%u]\n", srxd, + CDEBUG(D_TRACE, "srxd[%p] target_node [%u]\n", srxd, srxd->gm_source_node); GMNAL_GM_LOCK(nal_data); @@ -1210,7 +1203,7 @@ gmnal_large_tx_ack(gmnal_data_t *nal_data, gmnal_srxd_t *srxd) srxd->gm_source_node, &local_nid); GMNAL_GM_UNLOCK(nal_data); if (gm_status != GM_SUCCESS) { - CDEBUG(D_ERROR, "Failed to obtain local id\n"); + CERROR("Failed to obtain local id\n"); return; } CDEBUG(D_INFO, "Local Node_id is [%u][%x]\n", local_nid, local_nid); @@ -1244,20 +1237,20 @@ gmnal_large_tx_ack(gmnal_data_t *nal_data, gmnal_srxd_t *srxd) CDEBUG(D_NET, "Calling gm_send_to_peer port [%p] buffer [%p] " "gmsize [%lu] msize [%d] global_nid [%u] local_nid[%d] " - "stxd [%p]\n", nal_data->gm_port, stxd->buffer, stxd->gm_size, + "stxd [%p]\n", nal_data->gm_port, stxd->buffer, stxd->gm_size, stxd->msg_size, srxd->gm_source_node, local_nid, stxd); GMNAL_GM_LOCK(nal_data); stxd->gm_priority = GM_LOW_PRIORITY; stxd->gm_target_node = local_nid; - gm_send_to_peer_with_callback(nal_data->gm_port, stxd->buffer, - stxd->gm_size, stxd->msg_size, - GM_LOW_PRIORITY, local_nid, - gmnal_large_tx_ack_callback, + gm_send_to_peer_with_callback(nal_data->gm_port, stxd->buffer, + stxd->gm_size, stxd->msg_size, + GM_LOW_PRIORITY, local_nid, + gmnal_large_tx_ack_callback, (void*)stxd); - + GMNAL_GM_UNLOCK(nal_data); CDEBUG(D_INFO, "gmnal_large_tx_ack :: done\n"); - + return; } @@ -1265,19 +1258,19 @@ gmnal_large_tx_ack(gmnal_data_t *nal_data, gmnal_srxd_t *srxd) /* * A callback to indicate the small transmit operation is compete * Check for errors and try to deal with them. - * Call lib_finalise to inform the client application that the + * Call lib_finalise to inform the client application that the * send is complete and the memory can be reused. * Return the stxd when finished with it (returns a send token) */ -void -gmnal_large_tx_ack_callback(gm_port_t *gm_port, void *context, +void +gmnal_large_tx_ack_callback(gm_port_t *gm_port, void *context, gm_status_t status) { gmnal_stxd_t *stxd = (gmnal_stxd_t*)context; gmnal_data_t *nal_data = (gmnal_data_t*)stxd->nal_data; if (!stxd) { - CDEBUG(D_ERROR, "send completion event for unknown stxd\n"); + CERROR("send completion event for unknown stxd\n"); return; } CDEBUG(D_TRACE, "send completion event for stxd [%p] status is [%d]\n", diff --git a/lnet/klnds/gmlnd/gmlnd_module.c b/lnet/klnds/gmlnd/gmlnd_module.c index 3851649..3dd09b3 100644 --- a/lnet/klnds/gmlnd/gmlnd_module.c +++ b/lnet/klnds/gmlnd/gmlnd_module.c @@ -42,7 +42,7 @@ gmnal_cmd(struct portals_cfg *pcfg, void *private) gm_status_t gm_status; - CDEBUG(D_TRACE, "gmnal_cmd [%d] private [%p]\n", + CDEBUG(D_TRACE, "gmnal_cmd [%d] private [%p]\n", pcfg->pcfg_command, private); nal_data = (gmnal_data_t*)private; switch(pcfg->pcfg_command) { @@ -53,23 +53,24 @@ gmnal_cmd(struct portals_cfg *pcfg, void *private) PORTAL_ALLOC(name, pcfg->pcfg_plen1); copy_from_user(name, PCFG_PBUF(pcfg, 1), pcfg->pcfg_plen1); - + GMNAL_GM_LOCK(nal_data); //nid = gm_host_name_to_node_id(nal_data->gm_port, name); - gm_status = gm_host_name_to_node_id_ex (nal_data->gm_port, 0, name, &nid); + gm_status = gm_host_name_to_node_id_ex(nal_data->gm_port, 0, + name, &nid); GMNAL_GM_UNLOCK(nal_data); if (gm_status != GM_SUCCESS) { - CDEBUG(D_INFO, "gm_host_name_to_node_id_ex(...host %s) failed[%d]\n", - name, gm_status); + CDEBUG(D_INFO, "gm_host_name_to_node_id_ex(...host %s) " + "failed[%d]\n", name, gm_status); return (-1); } else CDEBUG(D_INFO, "Local node %s id is [%d]\n", name, nid); GMNAL_GM_LOCK(nal_data); - gm_status = gm_node_id_to_global_id(nal_data->gm_port, + gm_status = gm_node_id_to_global_id(nal_data->gm_port, nid, &gnid); GMNAL_GM_UNLOCK(nal_data); if (gm_status != GM_SUCCESS) { - CDEBUG(D_INFO, "gm_node_id_to_global_id failed[%d]\n", + CDEBUG(D_INFO, "gm_node_id_to_global_id failed[%d]\n", gm_status); return(-1); } @@ -100,12 +101,10 @@ gmnal_load(void) } else { CDEBUG(D_INFO, "Portals GMNAL Failed to initialise\n"); return(-ENODEV); - } CDEBUG(D_INFO, "This is the end of the gmnal init routine"); - return(0); } diff --git a/lnet/klnds/gmlnd/gmlnd_utils.c b/lnet/klnds/gmlnd/gmlnd_utils.c index 508a48c..a725088 100644 --- a/lnet/klnds/gmlnd/gmlnd_utils.c +++ b/lnet/klnds/gmlnd/gmlnd_utils.c @@ -68,10 +68,10 @@ gmnal_alloc_txd(gmnal_data_t *nal_data) ntx = gm_num_send_tokens(nal_data->gm_port); GMNAL_GM_UNLOCK(nal_data); CDEBUG(D_INFO, "total number of send tokens available is [%d]\n", ntx); - + /* - * allocate a number for small sends - * num_stxds from gmnal_module.c + * allocate a number for small sends + * num_stxds from gmnal_module.c */ nstx = num_stxds; /* @@ -84,19 +84,19 @@ gmnal_alloc_txd(gmnal_data_t *nal_data) */ nltx = ntx - (nrxt_stx + nstx); if (nltx < 1) { - CDEBUG(D_ERROR, "No tokens available for large messages\n"); + CERROR("No tokens available for large messages\n"); return(GMNAL_STATUS_FAIL); } /* - * A semaphore is initialised with the + * A semaphore is initialised with the * number of transmit tokens available. * To get a stxd, acquire the token semaphore. - * this decrements the available token count - * (if no tokens you block here, someone returning a + * this decrements the available token count + * (if no tokens you block here, someone returning a * stxd will release the semaphore and wake you) - * When token is obtained acquire the spinlock + * When token is obtained acquire the spinlock * to manipulate the list */ GMNAL_TXD_TOKEN_INIT(nal_data, nstx); @@ -105,21 +105,20 @@ gmnal_alloc_txd(gmnal_data_t *nal_data) GMNAL_RXT_TXD_LOCK_INIT(nal_data); GMNAL_LTXD_TOKEN_INIT(nal_data, nltx); GMNAL_LTXD_LOCK_INIT(nal_data); - + for (i=0; i<=nstx; i++) { PORTAL_ALLOC(txd, sizeof(gmnal_stxd_t)); if (!txd) { - CDEBUG(D_ERROR, "Failed to malloc txd [%d]\n", i); + CERROR("Failed to malloc txd [%d]\n", i); return(GMNAL_STATUS_NOMEM); } GMNAL_GM_LOCK(nal_data); - txbuffer = gm_dma_malloc(nal_data->gm_port, + txbuffer = gm_dma_malloc(nal_data->gm_port, GMNAL_SMALL_MSG_SIZE(nal_data)); GMNAL_GM_UNLOCK(nal_data); if (!txbuffer) { - CDEBUG(D_ERROR, "Failed to gm_dma_malloc txbuffer [%d]," - " size [%d]\n", i, - GMNAL_SMALL_MSG_SIZE(nal_data)); + CERROR("Failed to gm_dma_malloc txbuffer [%d], " + "size [%d]\n", i,GMNAL_SMALL_MSG_SIZE(nal_data)); PORTAL_FREE(txd, sizeof(gmnal_stxd_t)); return(GMNAL_STATUS_FAIL); } @@ -138,7 +137,7 @@ gmnal_alloc_txd(gmnal_data_t *nal_data) for (i=0; i<=nrxt_stx; i++) { PORTAL_ALLOC(txd, sizeof(gmnal_stxd_t)); if (!txd) { - CDEBUG(D_ERROR, "Failed to malloc txd [%d]\n", i); + CERROR("Failed to malloc txd [%d]\n", i); return(GMNAL_STATUS_NOMEM); } GMNAL_GM_LOCK(nal_data); @@ -146,9 +145,8 @@ gmnal_alloc_txd(gmnal_data_t *nal_data) GMNAL_SMALL_MSG_SIZE(nal_data)); GMNAL_GM_UNLOCK(nal_data); if (!txbuffer) { - CDEBUG(D_ERROR, "Failed to gm_dma_malloc txbuffer [%d]," - " size [%d]\n", i, - GMNAL_SMALL_MSG_SIZE(nal_data)); + CERROR("Failed to gm_dma_malloc txbuffer [%d]," + " size [%d]\n",i,GMNAL_SMALL_MSG_SIZE(nal_data)); PORTAL_FREE(txd, sizeof(gmnal_stxd_t)); return(GMNAL_STATUS_FAIL); } @@ -252,7 +250,7 @@ gmnal_get_stxd(gmnal_data_t *nal_data, int block) CDEBUG(D_PORTALS, "Got token\n"); } else { if (GMNAL_TXD_TRYGETTOKEN(nal_data)) { - CDEBUG(D_ERROR, "can't get token\n"); + CERROR("can't get token\n"); return(NULL); } } @@ -260,7 +258,7 @@ gmnal_get_stxd(gmnal_data_t *nal_data, int block) txd = nal_data->stxd; nal_data->stxd = txd->next; GMNAL_TXD_UNLOCK(nal_data); - CDEBUG(D_INFO, "got [%p], head is [%p]\n", txd, + CDEBUG(D_INFO, "got [%p], head is [%p]\n", txd, nal_data->stxd); txd->kniov = 0; } /* general txd get */ @@ -273,7 +271,7 @@ gmnal_get_stxd(gmnal_data_t *nal_data, int block) void gmnal_return_stxd(gmnal_data_t *nal_data, gmnal_stxd_t *txd) { - CDEBUG(D_TRACE, "nal_data [%p], txd[%p] rxt[%d]\n", nal_data, + CDEBUG(D_TRACE, "nal_data [%p], txd[%p] rxt[%d]\n", nal_data, txd, txd->rxt); /* @@ -356,9 +354,9 @@ gmnal_alloc_srxd(gmnal_data_t *nal_data) GMNAL_GM_LOCK(nal_data); nrx = gm_num_receive_tokens(nal_data->gm_port); GMNAL_GM_UNLOCK(nal_data); - CDEBUG(D_INFO, "total number of receive tokens available is [%d]\n", + CDEBUG(D_INFO, "total number of receive tokens available is [%d]\n", nrx); - + nsrx = nrx/2; nsrx = 12; /* @@ -367,7 +365,7 @@ gmnal_alloc_srxd(gmnal_data_t *nal_data) */ nsrx = num_stxds*2 + 2; - CDEBUG(D_INFO, "Allocated [%d] receive tokens to small messages\n", + CDEBUG(D_INFO, "Allocated [%d] receive tokens to small messages\n", nsrx); @@ -376,7 +374,7 @@ gmnal_alloc_srxd(gmnal_data_t *nal_data) gm_hash_hash_ptr, 0, 0, nsrx, 0); GMNAL_GM_UNLOCK(nal_data); if (!nal_data->srxd_hash) { - CDEBUG(D_ERROR, "Failed to create hash table\n"); + CERROR("Failed to create hash table\n"); return(GMNAL_STATUS_NOMEM); } @@ -386,43 +384,40 @@ gmnal_alloc_srxd(gmnal_data_t *nal_data) for (i=0; i<=nsrx; i++) { PORTAL_ALLOC(rxd, sizeof(gmnal_srxd_t)); if (!rxd) { - CDEBUG(D_ERROR, "Failed to malloc rxd [%d]\n", i); + CERROR("Failed to malloc rxd [%d]\n", i); return(GMNAL_STATUS_NOMEM); } #if 0 PORTAL_ALLOC(rxbuffer, GMNAL_SMALL_MSG_SIZE(nal_data)); if (!rxbuffer) { - CDEBUG(D_ERROR, "Failed to malloc rxbuffer [%d], " - "size [%d]\n", i, - GMNAL_SMALL_MSG_SIZE(nal_data)); + CERROR("Failed to malloc rxbuffer [%d], " + "size [%d]\n", i,GMNAL_SMALL_MSG_SIZE(nal_data)); PORTAL_FREE(rxd, sizeof(gmnal_srxd_t)); return(GMNAL_STATUS_FAIL); } CDEBUG(D_NET, "Calling gm_register_memory with port [%p] " - "rxbuffer [%p], size [%d]\n", nal_data->gm_port, + "rxbuffer [%p], size [%d]\n", nal_data->gm_port, rxbuffer, GMNAL_SMALL_MSG_SIZE(nal_data)); GMNAL_GM_LOCK(nal_data); - gm_status = gm_register_memory(nal_data->gm_port, rxbuffer, + gm_status = gm_register_memory(nal_data->gm_port, rxbuffer, GMNAL_SMALL_MSG_SIZE(nal_data)); GMNAL_GM_UNLOCK(nal_data); if (gm_status != GM_SUCCESS) { - CDEBUG(D_ERROR, "gm_register_memory failed buffer [%p]," + CERROR("gm_register_memory failed buffer [%p]," " index [%d]\n", rxbuffer, i); switch(gm_status) { case(GM_FAILURE): - CDEBUG(D_ERROR, "GM_FAILURE\n"); + CERROR("GM_FAILURE\n"); break; case(GM_PERMISSION_DENIED): - CDEBUG(D_ERROR, "PERMISSION_DENIED\n"); + CERROR("PERMISSION_DENIED\n"); break; case(GM_INVALID_PARAMETER): - CDEBUG(D_ERROR, "INVALID_PARAMETER\n"); + CERROR("INVALID_PARAMETER\n"); break; default: - CDEBUG(D_ERROR, "Unknown error[%d]\n", - gm_status); + CERROR("Unknown error[%d]\n",gm_status); break; - } return(GMNAL_STATUS_FAIL); } @@ -432,22 +427,21 @@ gmnal_alloc_srxd(gmnal_data_t *nal_data) GMNAL_SMALL_MSG_SIZE(nal_data)); GMNAL_GM_UNLOCK(nal_data); if (!rxbuffer) { - CDEBUG(D_ERROR, "Failed to gm_dma_malloc rxbuffer [%d]," - " size [%d]\n", i, - GMNAL_SMALL_MSG_SIZE(nal_data)); + CERROR("Failed to gm_dma_malloc rxbuffer [%d], " + "size [%d]\n",i ,GMNAL_SMALL_MSG_SIZE(nal_data)); PORTAL_FREE(rxd, sizeof(gmnal_srxd_t)); return(GMNAL_STATUS_FAIL); } #endif - + rxd->buffer = rxbuffer; rxd->size = GMNAL_SMALL_MSG_SIZE(nal_data); rxd->gmsize = gm_min_size_for_length(rxd->size); - if (gm_hash_insert(nal_data->srxd_hash, + if (gm_hash_insert(nal_data->srxd_hash, (void*)rxbuffer, (void*)rxd)) { - CDEBUG(D_ERROR, "failed to create hash entry rxd[%p] " + CERROR("failed to create hash entry rxd[%p] " "for rxbuffer[%p]\n", rxd, rxbuffer); return(GMNAL_STATUS_FAIL); } @@ -584,7 +578,7 @@ gmnal_stop_rxthread(gmnal_data_t *nal_data) } if (nal_data->rxthread_flag != GMNAL_THREAD_RESET) { - CDEBUG(D_ERROR, "I don't know how to wake the thread\n"); + CERROR("I don't know how to wake the thread\n"); } else { CDEBUG(D_INFO, "rx thread seems to have stopped\n"); } @@ -612,7 +606,7 @@ gmnal_stop_ctthread(gmnal_data_t *nal_data) } if (nal_data->ctthread_flag == GMNAL_THREAD_STOP) { - CDEBUG(D_ERROR, "I DON'T KNOW HOW TO WAKE THE THREAD\n"); + CERROR("I DON'T KNOW HOW TO WAKE THE THREAD\n"); } else { CDEBUG(D_INFO, "CT THREAD SEEMS TO HAVE STOPPED\n"); } @@ -889,7 +883,7 @@ gmnal_is_small_msg(gmnal_data_t *nal_data, int niov, struct iovec *iov, CDEBUG(D_INFO, "Yep, small message\n"); return(1); } else { - CDEBUG(D_ERROR, "No, not small message\n"); + CERROR("No, not small message\n"); /* * could be made up of lots of little ones ! */ @@ -914,7 +908,7 @@ gmnal_add_rxtwe(gmnal_data_t *nal_data, gm_recv_t *recv) PORTAL_ALLOC(we, sizeof(gmnal_rxtwe_t)); if (!we) { - CDEBUG(D_ERROR, "failed to malloc\n"); + CERROR("failed to malloc\n"); return(GMNAL_STATUS_FAIL); } we->buffer = gm_ntohp(recv->buffer); @@ -981,7 +975,7 @@ gmnal_get_rxtwe(gmnal_data_t *nal_data) if (!nal_data->rxtwe_head) nal_data->rxtwe_tail = NULL; } else { - CDEBUG(D_WARNING, "woken but no work\n"); + CWARN("woken but no work\n"); } spin_unlock(&nal_data->rxtwe_lock); } while (!we); @@ -1016,7 +1010,7 @@ gmnal_start_kernel_threads(gmnal_data_t *nal_data) nal_data->ctthread_pid = kernel_thread(gmnal_ct_thread, (void*)nal_data, 0); if (nal_data->ctthread_pid <= 0) { - CDEBUG(D_ERROR, "Caretaker thread failed to start\n"); + CERROR("Caretaker thread failed to start\n"); return(GMNAL_STATUS_FAIL); } @@ -1053,7 +1047,7 @@ gmnal_start_kernel_threads(gmnal_data_t *nal_data) nal_data->rxthread_pid[threads] = kernel_thread(gmnal_rx_thread, (void*)nal_data, 0); if (nal_data->rxthread_pid[threads] <= 0) { - CDEBUG(D_ERROR, "Receive thread failed to start\n"); + CERROR("Receive thread failed to start\n"); gmnal_stop_rxthread(nal_data); gmnal_stop_ctthread(nal_data); return(GMNAL_STATUS_FAIL); diff --git a/lnet/klnds/iiblnd/iiblnd_cb.c b/lnet/klnds/iiblnd/iiblnd_cb.c index b9ca677..eb9e6fa 100644 --- a/lnet/klnds/iiblnd/iiblnd_cb.c +++ b/lnet/klnds/iiblnd/iiblnd_cb.c @@ -1486,7 +1486,7 @@ init_tx: } else { LASSERT (tx->tx_nsp == 1); /* No RDMA: local completion happens now! */ - CDEBUG(D_WARNING,"No data: immediate completion\n"); + CWARN("No data: immediate completion\n"); lib_finalize (&kibnal_lib, NULL, libmsg, status == 0 ? PTL_OK : PTL_FAIL); } @@ -2449,7 +2449,7 @@ kibnal_listen_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg) goto out; } - CDEBUG(D_WARNING, "Connection %p -> "LPX64" ESTABLISHED.\n", + CWARN("Connection %p -> "LPX64" ESTABLISHED.\n", conn, conn->ibc_peer->ibp_nid); out: diff --git a/lnet/klnds/openiblnd/openiblnd.c b/lnet/klnds/openiblnd/openiblnd.c index 480c5aa..3862c5b 100644 --- a/lnet/klnds/openiblnd/openiblnd.c +++ b/lnet/klnds/openiblnd/openiblnd.c @@ -800,7 +800,7 @@ kibnal_stop_ip_listener(int clear_acceptq) down(&kibnal_data.kib_listener_signal); LASSERT (kibnal_data.kib_listener_sock == NULL); - CDEBUG(D_WARNING, "Listener stopped\n"); + CWARN("Listener stopped\n"); if (!clear_acceptq) return; diff --git a/lnet/klnds/openiblnd/openiblnd_cb.c b/lnet/klnds/openiblnd/openiblnd_cb.c index dee5bd9..a356eaf 100644 --- a/lnet/klnds/openiblnd/openiblnd_cb.c +++ b/lnet/klnds/openiblnd/openiblnd_cb.c @@ -1832,7 +1832,7 @@ kibnal_conn_callback (tTS_IB_CM_EVENT event, break; case TS_IB_CM_DISCONNECTED: - CDEBUG(D_WARNING, "Connection %p -> "LPX64" DISCONNECTED.\n", + CWARN("Connection %p -> "LPX64" DISCONNECTED.\n", conn, conn->ibc_peer->ibp_nid); kibnal_close_conn (conn, 0); break; @@ -1968,7 +1968,7 @@ kibnal_passive_conn_callback (tTS_IB_CM_EVENT event, case TS_IB_CM_ESTABLISHED: LASSERT (conn != NULL); - CDEBUG(D_WARNING, "Connection %p -> "LPX64" ESTABLISHED.\n", + CWARN("Connection %p -> "LPX64" ESTABLISHED.\n", conn, conn->ibc_peer->ibp_nid); kibnal_connreq_done(conn, 0); @@ -2037,7 +2037,7 @@ kibnal_active_conn_callback (tTS_IB_CM_EVENT event, } case TS_IB_CM_ESTABLISHED: - CDEBUG(D_WARNING, "Connection %p -> "LPX64" ESTABLISHED\n", + CWARN("Connection %p -> "LPX64" ESTABLISHED\n", conn, conn->ibc_peer->ibp_nid); kibnal_connreq_done(conn, 0); diff --git a/lnet/klnds/ralnd/ralnd.c b/lnet/klnds/ralnd/ralnd.c index b7ae218..f984e6f 100644 --- a/lnet/klnds/ralnd/ralnd.c +++ b/lnet/klnds/ralnd/ralnd.c @@ -920,7 +920,7 @@ kranal_conn_handshake (struct socket *sock, kra_peer_t *peer) if (nstale != 0) CWARN("Closed %d stale conns to "LPX64"\n", nstale, peer_nid); - CDEBUG(D_WARNING, "New connection to "LPX64" on devid[%d] = %d\n", + CWARN("New connection to "LPX64" on devid[%d] = %d\n", peer_nid, conn->rac_device->rad_idx, conn->rac_device->rad_id); /* Ensure conn gets checked. Transmits may have been queued and an diff --git a/lnet/klnds/ralnd/ralnd_cb.c b/lnet/klnds/ralnd/ralnd_cb.c index b4184b5..dd910ce 100644 --- a/lnet/klnds/ralnd/ralnd_cb.c +++ b/lnet/klnds/ralnd/ralnd_cb.c @@ -1438,7 +1438,7 @@ kranal_sendmsg(kra_conn_t *conn, kra_msg_t *msg, case RAP_NOT_DONE: if (time_after_eq(jiffies, conn->rac_last_tx + conn->rac_keepalive*HZ)) - CDEBUG(D_WARNING, "EAGAIN sending %02x (idle %lu secs)\n", + CWARN("EAGAIN sending %02x (idle %lu secs)\n", msg->ram_type, (jiffies - conn->rac_last_tx)/HZ); return -EAGAIN; } @@ -1901,8 +1901,8 @@ kranal_complete_closed_conn (kra_conn_t *conn) kranal_tx_done(tx, -ECONNABORTED); } - CDEBUG(D_WARNING, "Closed conn %p -> "LPX64": nmsg %d nreplies %d\n", - conn, conn->rac_peer->rap_nid, nfma, nreplies); + CWARN("Closed conn %p -> "LPX64": nmsg %d nreplies %d\n", + conn, conn->rac_peer->rap_nid, nfma, nreplies); } int diff --git a/lnet/klnds/socklnd/socklnd.c b/lnet/klnds/socklnd/socklnd.c index 448871e..295ec35 100644 --- a/lnet/klnds/socklnd/socklnd.c +++ b/lnet/klnds/socklnd/socklnd.c @@ -1132,9 +1132,10 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type) rc = ksocknal_close_stale_conns_locked(peer, incarnation); if (rc != 0) - CERROR ("Closed %d stale conns to nid "LPX64" ip %d.%d.%d.%d\n", - rc, conn->ksnc_peer->ksnp_nid, - HIPQUAD(conn->ksnc_ipaddr)); + CDEBUG(D_HA, + "Closed %d stale conns to nid "LPX64" ip %d.%d.%d.%d\n", + rc, conn->ksnc_peer->ksnp_nid, + HIPQUAD(conn->ksnc_ipaddr)); write_unlock_irqrestore (global_lock, flags); @@ -1146,11 +1147,11 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type) ksocknal_putconnsock(conn); } - CWARN("New conn nid:"LPX64" %u.%u.%u.%u -> %u.%u.%u.%u/%d" - " incarnation:"LPX64" sched[%d]/%d\n", - nid, HIPQUAD(conn->ksnc_myipaddr), - HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port, incarnation, - (int)(conn->ksnc_scheduler - ksocknal_data.ksnd_schedulers), irq); + CDEBUG(D_HA, "New conn nid:"LPX64" %u.%u.%u.%u -> %u.%u.%u.%u/%d " + "incarnation:"LPX64" sched[%d]/%d\n", + nid, HIPQUAD(conn->ksnc_myipaddr), + HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port, incarnation, + (int)(conn->ksnc_scheduler - ksocknal_data.ksnd_schedulers),irq); ksocknal_put_conn (conn); return (0); diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c index ef8ca0f..bd26027 100644 --- a/lnet/klnds/socklnd/socklnd_cb.c +++ b/lnet/klnds/socklnd/socklnd_cb.c @@ -102,7 +102,7 @@ ksocknal_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) if (rc <= 0) /* sent nothing? */ return (rc); - + nob = rc; LASSERT (nob <= tx->tx_resid); tx->tx_resid -= nob; @@ -130,7 +130,7 @@ ksocknal_transmit (ksock_conn_t *conn, ksock_tx_t *tx) { int rc; int bufnob; - + if (ksocknal_data.ksnd_stall_tx != 0) { set_current_state (TASK_UNINTERRUPTIBLE); schedule_timeout (cfs_time_seconds(ksocknal_data.ksnd_stall_tx)); @@ -158,7 +158,7 @@ ksocknal_transmit (ksock_conn_t *conn, ksock_tx_t *tx) bufnob = SOCK_WMEM_QUEUED(conn->ksnc_sock); if (rc > 0) /* sent something? */ conn->ksnc_tx_bufnob += rc; /* account it */ - + if (bufnob < conn->ksnc_tx_bufnob) { /* allocated send buffer bytes < computed; infer * something got ACKed */ @@ -182,7 +182,7 @@ ksocknal_transmit (ksock_conn_t *conn, ksock_tx_t *tx) sched = conn->ksnc_scheduler; spin_lock_irqsave(&sched->kss_lock, flags); - + if (!SOCK_TEST_NOSPACE(conn->ksnc_sock) && !conn->ksnc_tx_ready) { /* SOCK_NOSPACE is set when the socket fills @@ -228,12 +228,12 @@ ksocknal_recv_iov (ksock_conn_t *conn) /* received something... */ nob = rc; - + conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); conn->ksnc_rx_deadline = cfs_time_shift (ksocknal_tunables.ksnd_io_timeout); mb(); /* order with setting rx_started */ conn->ksnc_rx_started = 1; - + conn->ksnc_rx_nob_wanted -= nob; conn->ksnc_rx_nob_left -= nob; @@ -265,10 +265,10 @@ ksocknal_recv_kiov (ksock_conn_t *conn) /* Never touch conn->ksnc_rx_kiov or change connection * status inside ksocknal_lib_recv_iov */ rc = ksocknal_lib_recv_kiov(conn); - + if (rc <= 0) return (rc); - + /* received something... */ nob = rc; @@ -279,7 +279,7 @@ ksocknal_recv_kiov (ksock_conn_t *conn) conn->ksnc_rx_nob_wanted -= nob; conn->ksnc_rx_nob_left -= nob; - + do { LASSERT (conn->ksnc_rx_nkiov > 0); @@ -305,7 +305,7 @@ ksocknal_receive (ksock_conn_t *conn) * progress/completion. */ int rc; ENTRY; - + if (ksocknal_data.ksnd_stall_rx != 0) { set_current_state (TASK_UNINTERRUPTIBLE); schedule_timeout(cfs_time_seconds (ksocknal_data.ksnd_stall_rx)); @@ -416,7 +416,7 @@ ksocknal_tx_launched (ksock_tx_t *tx) #if SOCKNAL_ZC if (atomic_read (&tx->tx_zccd.zccd_count) != 1) { ksock_conn_t *conn = tx->tx_conn; - + /* zccd skbufs are still in-flight. First take a ref on * conn, so it hangs about for ksocknal_tx_done... */ atomic_inc (&conn->ksnc_refcount); @@ -437,7 +437,7 @@ ksocknal_process_transmit (ksock_conn_t *conn, ksock_tx_t *tx) { unsigned long flags; int rc; - + rc = ksocknal_transmit (conn, tx); CDEBUG (D_NET, "send(%d) %d\n", tx->tx_resid, rc); @@ -472,7 +472,7 @@ ksocknal_process_transmit (ksock_conn_t *conn, ksock_tx_t *tx) SOCKNAL_ENOMEM_RETRY), ksocknal_data.ksnd_reaper_waketime)) cfs_waitq_signal (&ksocknal_data.ksnd_reaper_waitq); - + spin_unlock_irqrestore(&ksocknal_data.ksnd_reaper_lock, flags); return (rc); } @@ -494,7 +494,7 @@ ksocknal_process_transmit (ksock_conn_t *conn, ksock_tx_t *tx) HIPQUAD(conn->ksnc_ipaddr), rc); break; } - CERROR("[%p] Error %d on write to "LPX64 + CDEBUG(D_HA, "[%p] Error %d on write to "LPX64 " ip %d.%d.%d.%d:%d\n", conn, rc, conn->ksnc_peer->ksnp_nid, HIPQUAD(conn->ksnc_ipaddr), @@ -514,16 +514,16 @@ ksocknal_launch_autoconnect_locked (ksock_route_t *route) /* called holding write lock on ksnd_global_lock */ LASSERT (!route->ksnr_connecting); - + route->ksnr_connecting = 1; /* scheduling conn for autoconnectd */ atomic_inc (&route->ksnr_refcount); /* extra ref for autoconnectd */ - + spin_lock_irqsave (&ksocknal_data.ksnd_autoconnectd_lock, flags); - + list_add_tail (&route->ksnr_connect_list, &ksocknal_data.ksnd_autoconnectd_routes); cfs_waitq_signal (&ksocknal_data.ksnd_autoconnectd_waitq); - + spin_unlock_irqrestore (&ksocknal_data.ksnd_autoconnectd_lock, flags); } @@ -639,7 +639,7 @@ ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn) * ksnc_sock... */ LASSERT(!conn->ksnc_closing); LASSERT(tx->tx_resid == tx->tx_nob); - + CDEBUG (D_NET, "Sending to "LPX64" ip %d.%d.%d.%d:%d\n", conn->ksnc_peer->ksnp_nid, HIPQUAD(conn->ksnc_ipaddr), @@ -664,7 +664,7 @@ ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn) } list_add_tail (&tx->tx_list, &conn->ksnc_tx_queue); - + if (conn->ksnc_tx_ready && /* able to send */ !conn->ksnc_tx_scheduled) { /* not scheduled to send */ /* +1 ref for scheduler */ @@ -684,7 +684,7 @@ ksocknal_find_connectable_route_locked (ksock_peer_t *peer) struct list_head *tmp; ksock_route_t *route; int bits; - + list_for_each (tmp, &peer->ksnp_routes) { route = list_entry (tmp, ksock_route_t, ksnr_list); bits = route->ksnr_connected; @@ -698,7 +698,7 @@ ksocknal_find_connectable_route_locked (ksock_peer_t *peer) if ((bits & (1 << SOCKNAL_CONN_ANY)) != 0) continue; } - + /* connection being established? */ if (route->ksnr_connecting) continue; @@ -706,10 +706,10 @@ ksocknal_find_connectable_route_locked (ksock_peer_t *peer) /* too soon to retry this guy? */ if (!cfs_time_aftereq (cfs_time_current(), route->ksnr_timeout)) continue; - + return (route); } - + return (NULL); } @@ -721,11 +721,11 @@ ksocknal_find_connecting_route_locked (ksock_peer_t *peer) list_for_each (tmp, &peer->ksnp_routes) { route = list_entry (tmp, ksock_route_t, ksnr_list); - + if (route->ksnr_connecting) return (route); } - + return (NULL); } @@ -737,7 +737,7 @@ ksocknal_launch_packet (ksock_tx_t *tx, ptl_nid_t nid) ksock_conn_t *conn; ksock_route_t *route; rwlock_t *g_lock; - + /* Ensure the frags we've been given EXACTLY match the number of * bytes we want to send. Many TCP/IP stacks disregard any total * size parameters passed to them and just look at the frags. @@ -777,7 +777,7 @@ ksocknal_launch_packet (ksock_tx_t *tx, ptl_nid_t nid) return (0); } } - + /* I'll need a write lock... */ read_unlock (g_lock); #endif @@ -814,7 +814,7 @@ ksocknal_launch_packet (ksock_tx_t *tx, ptl_nid_t nid) write_unlock_irqrestore (g_lock, flags); return (0); } - + write_unlock_irqrestore (g_lock, flags); return (-EHOSTUNREACH); } @@ -850,12 +850,12 @@ ksocknal_sendmsg(lib_nal_t *nal, LASSERT (payload_kiov == NULL || !in_interrupt ()); /* payload is either all vaddrs or all pages */ LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); - + if (payload_iov != NULL) desc_size = offsetof(ksock_ltx_t, ltx_iov[1 + payload_niov]); else desc_size = offsetof(ksock_ltx_t, ltx_kiov[payload_niov]); - + if (in_interrupt() || type == PTL_MSG_ACK || type == PTL_MSG_REPLY) { @@ -865,7 +865,7 @@ ksocknal_sendmsg(lib_nal_t *nal, } else { PORTAL_ALLOC(ltx, desc_size); } - + if (ltx == NULL) { CERROR("Can't allocate tx desc type %d size %d %s\n", type, desc_size, in_interrupt() ? "(intr)" : ""); @@ -875,16 +875,16 @@ ksocknal_sendmsg(lib_nal_t *nal, atomic_inc(&ksocknal_data.ksnd_nactive_ltxs); ltx->ltx_desc_size = desc_size; - + /* We always have 1 mapped frag for the header */ ltx->ltx_tx.tx_iov = ltx->ltx_iov; ltx->ltx_iov[0].iov_base = <x->ltx_hdr; ltx->ltx_iov[0].iov_len = sizeof(*hdr); ltx->ltx_hdr = *hdr; - + ltx->ltx_private = private; ltx->ltx_cookie = cookie; - + ltx->ltx_tx.tx_isfwd = 0; ltx->ltx_tx.tx_nob = sizeof (*hdr) + payload_nob; @@ -911,7 +911,7 @@ ksocknal_sendmsg(lib_nal_t *nal, rc = ksocknal_launch_packet(<x->ltx_tx, nid); if (rc == 0) return (PTL_OK); - + ksocknal_free_ltx(ltx); return (PTL_FAIL); } @@ -946,7 +946,7 @@ ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) ptl_nid_t nid = fwd->kprfd_gateway_nid; ksock_ftx_t *ftx = (ksock_ftx_t *)&fwd->kprfd_scratch; int rc; - + CDEBUG (D_NET, "Forwarding [%p] -> "LPX64" ("LPX64"))\n", fwd, fwd->kprfd_gateway_nid, fwd->kprfd_target_nid); @@ -1144,7 +1144,7 @@ ksocknal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb) conn->ksnc_cookie = fmb; /* stash fmb for later */ conn->ksnc_rx_state = SOCKNAL_RX_BODY_FWD; /* read in the payload */ - + /* Set up conn->ksnc_rx_kiov to read the payload into fmb's kiov-ed * buffer */ LASSERT (niov <= sizeof(conn->ksnc_rx_iov_space)/sizeof(ptl_kiov_t)); @@ -1153,7 +1153,7 @@ ksocknal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb) conn->ksnc_rx_nkiov = niov; conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov; memcpy(conn->ksnc_rx_kiov, fmb->fmb_kiov, niov * sizeof(ptl_kiov_t)); - + CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d reading body\n", conn, le64_to_cpu(conn->ksnc_hdr.src_nid), dest_nid, payload_nob); return (0); @@ -1238,7 +1238,7 @@ ksocknal_new_packet (ksock_conn_t *conn, int nob_to_skip) if (nob_to_skip == 0) { /* right at next packet boundary now */ conn->ksnc_rx_started = 0; mb (); /* racing with timeout thread */ - + conn->ksnc_rx_state = SOCKNAL_RX_HEADER; conn->ksnc_rx_nob_wanted = sizeof (ptl_hdr_t); conn->ksnc_rx_nob_left = sizeof (ptl_hdr_t); @@ -1286,7 +1286,7 @@ ksocknal_process_receive (ksock_conn_t *conn) { ksock_fmb_t *fmb; int rc; - + LASSERT (atomic_read (&conn->ksnc_refcount) > 0); /* doesn't need a forwarding buffer */ @@ -1339,7 +1339,7 @@ ksocknal_process_receive (ksock_conn_t *conn) /* short read */ return (-EAGAIN); } - + switch (conn->ksnc_rx_state) { case SOCKNAL_RX_HEADER: if (conn->ksnc_hdr.type != cpu_to_le32(PTL_MSG_HELLO) && @@ -1422,7 +1422,7 @@ ksocknal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, LASSERT (mlen <= rlen); LASSERT (niov <= PTL_MD_MAX_IOV); - + conn->ksnc_cookie = msg; conn->ksnc_rx_nob_wanted = mlen; conn->ksnc_rx_nob_left = rlen; @@ -1450,7 +1450,7 @@ ksocknal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg, LASSERT (mlen <= rlen); LASSERT (niov <= PTL_MD_MAX_IOV); - + conn->ksnc_cookie = msg; conn->ksnc_rx_nob_wanted = mlen; conn->ksnc_rx_nob_left = rlen; @@ -1483,7 +1483,7 @@ ksocknal_sched_cansleep(ksock_sched_t *sched) #endif list_empty(&sched->kss_rx_conns) && list_empty(&sched->kss_tx_conns)); - + spin_unlock_irqrestore(&sched->kss_lock, flags); return (rc); } @@ -1571,16 +1571,16 @@ int ksocknal_scheduler (void *arg) conn = list_entry(sched->kss_tx_conns.next, ksock_conn_t, ksnc_tx_list); list_del (&conn->ksnc_tx_list); - + LASSERT(conn->ksnc_tx_scheduled); LASSERT(conn->ksnc_tx_ready); LASSERT(!list_empty(&conn->ksnc_tx_queue)); - + tx = list_entry(conn->ksnc_tx_queue.next, ksock_tx_t, tx_list); /* dequeue now so empty list => more to send */ list_del(&tx->tx_list); - + /* Clear tx_ready in case send isn't complete. Do * it BEFORE we call process_transmit, since * write_space can set it any time after we release @@ -1613,7 +1613,7 @@ int ksocknal_scheduler (void *arg) /* drop my ref */ ksocknal_put_conn (conn); } - + did_something = 1; } #if SOCKNAL_ZC @@ -1692,7 +1692,7 @@ void ksocknal_write_callback (ksock_conn_t *conn) ksock_sched_t *sched; unsigned long flags; ENTRY; - + sched = conn->ksnc_scheduler; spin_lock_irqsave (&sched->kss_lock, flags); @@ -1763,10 +1763,10 @@ ksocknal_send_hello (ksock_conn_t *conn, __u32 *ipaddrs, int nipaddrs) rc, HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port); return (rc); } - + if (nipaddrs == 0) return (0); - + for (i = 0; i < nipaddrs; i++) { ipaddrs[i] = __cpu_to_le32 (ipaddrs[i]); } @@ -1879,11 +1879,7 @@ ksocknal_recv_hello (ksock_conn_t *conn, ptl_nid_t *nid, portals_nid2str(SOCKNAL, le64_to_cpu(hdr.src_nid), ipbuf)); - - CERROR ("Connected to nid "LPX64"@%u.%u.%u.%u " - "but expecting "LPX64"\n", - le64_to_cpu (hdr.src_nid), - HIPQUAD(conn->ksnc_ipaddr), *nid); + return (-EPROTO); } @@ -1917,7 +1913,7 @@ ksocknal_recv_hello (ksock_conn_t *conn, ptl_nid_t *nid, if (nips == 0) return (0); - + rc = ksocknal_sock_read (sock, ipaddrs, nips * sizeof(*ipaddrs)); if (rc != 0) { CERROR ("Error %d reading IPs from "LPX64"@%u.%u.%u.%u\n", @@ -1927,7 +1923,7 @@ ksocknal_recv_hello (ksock_conn_t *conn, ptl_nid_t *nid, for (i = 0; i < nips; i++) { ipaddrs[i] = __le32_to_cpu(ipaddrs[i]); - + if (ipaddrs[i] == 0) { CERROR("Zero IP[%d] from "LPX64"@%u.%u.%u.%u\n", i, *nid, HIPQUAD(conn->ksnc_ipaddr)); @@ -1945,7 +1941,7 @@ ksocknal_connect_peer (ksock_route_t *route, int type) int rc; int port; int may_retry; - + /* Iterate through reserved ports. When typed connections are * used, we will need to bind to multiple ports, but we only know * this at connect time. But, by that time we've already called @@ -2191,16 +2187,16 @@ ksocknal_find_timed_out_conn (ksock_peer_t *peer) break; default: LCONSOLE_WARN("An unexpected network error " - "occurred with %u.%u.%u.%u: %d.\n", + "occurred with %u.%u.%u.%u: %d\n", HIPQUAD(conn->ksnc_ipaddr), SOCK_ERROR(conn->ksnc_sock)); break; } /* Something (e.g. failed keepalive) set the socket error */ - CERROR ("Socket error %d: "LPX64" %p %d.%d.%d.%d\n", - SOCK_ERROR(conn->ksnc_sock), peer->ksnp_nid, - conn, HIPQUAD(conn->ksnc_ipaddr)); + CDEBUG(D_HA,"Socket error %d: "LPX64" %p %d.%d.%d.%d\n", + SOCK_ERROR(conn->ksnc_sock), peer->ksnp_nid, + conn, HIPQUAD(conn->ksnc_ipaddr)); return (conn); } @@ -2259,16 +2255,15 @@ ksocknal_check_peer_timeouts (int idx) list_for_each (ptmp, peers) { peer = list_entry (ptmp, ksock_peer_t, ksnp_list); conn = ksocknal_find_timed_out_conn (peer); - + if (conn != NULL) { read_unlock (&ksocknal_data.ksnd_global_lock); - CERROR ("Timeout out conn->"LPX64" ip %d.%d.%d.%d:%d\n", - peer->ksnp_nid, - HIPQUAD(conn->ksnc_ipaddr), - conn->ksnc_port); + CERROR("Timeout out conn->"LPX64" ip %d.%d.%d.%d:%d\n", + peer->ksnp_nid, HIPQUAD(conn->ksnc_ipaddr), + conn->ksnc_port); ksocknal_close_conn_and_siblings (conn, -ETIMEDOUT); - + /* NB we won't find this one again, but we can't * just proceed with the next peer, since we dropped * ksnd_global_lock and it might be dead already! */ @@ -2308,7 +2303,7 @@ ksocknal_reaper (void *arg) conn = list_entry (ksocknal_data.ksnd_deathrow_conns.next, ksock_conn_t, ksnc_list); list_del (&conn->ksnc_list); - + spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags); ksocknal_terminate_conn (conn); @@ -2322,7 +2317,7 @@ ksocknal_reaper (void *arg) conn = list_entry (ksocknal_data.ksnd_zombie_conns.next, ksock_conn_t, ksnc_list); list_del (&conn->ksnc_list); - + spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags); ksocknal_destroy_conn (conn); diff --git a/lnet/libcfs/darwin/darwin-proc.c b/lnet/libcfs/darwin/darwin-proc.c index efa51aa..f2b48d5 100644 --- a/lnet/libcfs/darwin/darwin-proc.c +++ b/lnet/libcfs/darwin/darwin-proc.c @@ -36,6 +36,7 @@ extern unsigned int portal_debug; extern char debug_file_path[1024]; extern unsigned int portal_subsystem_debug; extern unsigned int portal_printk; +extern unsigned int portals_catastrophe; extern atomic_t portal_kmemory; extern long max_debug_mb; @@ -68,6 +69,7 @@ SYSCTL_PROC(_portals, OID_AUTO, trace_daemon, SYSCTL_PROC(_portals, OID_AUTO, debug_mb, CTLTYPE_INT | CTLFLAG_RW, &max_debug_mb, 0, &cfs_debug_mb, "L", "max debug size"); +#warning "add 'catastrophe' entry for LBUG detection" static cfs_sysctl_table_t top_table[] = { diff --git a/lnet/libcfs/debug.c b/lnet/libcfs/debug.c index 3c9a99f..9f0ce91 100644 --- a/lnet/libcfs/debug.c +++ b/lnet/libcfs/debug.c @@ -31,7 +31,7 @@ #include "tracefile.h" -unsigned int portal_subsystem_debug = ~0 - (S_PORTALS | S_NAL); +unsigned int portal_subsystem_debug = ~0 - (S_PORTALS); EXPORT_SYMBOL(portal_subsystem_debug); unsigned int portal_debug = (D_WARNING | D_DLMTRACE | D_ERROR | D_EMERG | D_HA | @@ -45,6 +45,9 @@ EXPORT_SYMBOL(portal_printk); unsigned int portal_stack; EXPORT_SYMBOL(portal_stack); +unsigned int portals_catastrophe; +EXPORT_SYMBOL(portals_catastrophe); + #ifdef __KERNEL__ atomic_t portal_kmemory = ATOMIC_INIT(0); EXPORT_SYMBOL(portal_kmemory); diff --git a/lnet/libcfs/linux/linux-proc.c b/lnet/libcfs/linux/linux-proc.c index 70f4059..77277ba 100644 --- a/lnet/libcfs/linux/linux-proc.c +++ b/lnet/libcfs/linux/linux-proc.c @@ -71,6 +71,7 @@ enum { PSDEV_DEBUG_DUMP_PATH, /* crashdump tracelog location */ PSDEV_PORTALS_UPCALL, /* User mode upcall script */ PSDEV_PORTALS_MEMUSED, /* bytes currently PORTAL_ALLOCated */ + PSDEV_PORTALS_CATASTROPHE,/* if we have LBUGged or panic'd */ }; static struct ctl_table portals_table[] = { @@ -86,7 +87,9 @@ static struct ctl_table portals_table[] = { sizeof(portals_upcall), 0644, NULL, &proc_dostring, &sysctl_string}, {PSDEV_PORTALS_MEMUSED, "memused", (int *)&portal_kmemory.counter, - sizeof(int), 0644, NULL, &proc_dointvec}, + sizeof(int), 0444, NULL, &proc_dointvec}, + {PSDEV_PORTALS_CATASTROPHE, "catastrophe", &portals_catastrophe, + sizeof(int), 0444, NULL, &proc_dointvec}, {0} }; diff --git a/lnet/utils/acceptor.c b/lnet/utils/acceptor.c index e5bb46b..a4d1804 100644 --- a/lnet/utils/acceptor.c +++ b/lnet/utils/acceptor.c @@ -27,29 +27,37 @@ /* should get this from autoconf somehow */ #ifndef PIDFILE_DIR #define PIDFILE_DIR "/var/run" -#endif +#endif #define PROGNAME "acceptor" #ifdef HAVE_LIBWRAP /* needed because libwrap declares these as externs */ -int allow_severity = LOG_INFO; -int deny_severity = LOG_WARNING; +int allow_severity = LOG_INFO; +int deny_severity = LOG_WARNING; #endif +void usage(char *myname) +{ + fprintf(stderr, "usage: %s [-N nal_id] [-p] [-l] port\n\n" + " -l\tKeep stdin/stdout open\n" + " -p\tAllow connections from non-privileged ports\n", myname); + exit (1); +} + void create_pidfile(char *name, int port) { char pidfile[1024]; FILE *fp; - snprintf(pidfile, sizeof(pidfile), "%s/%s-%d.pid", + snprintf(pidfile, sizeof(pidfile), "%s/%s-%d.pid", PIDFILE_DIR, name, port); - + if ((fp = fopen(pidfile, "w"))) { fprintf(fp, "%d\n", getpid()); fclose(fp); } else { - syslog(LOG_ERR, "%s: %s\n", pidfile, + syslog(LOG_ERR, "%s: %s\n", pidfile, strerror(errno)); } } @@ -58,43 +66,43 @@ int pidfile_exists(char *name, int port) { char pidfile[1024]; - snprintf(pidfile, sizeof(pidfile), "%s/%s-%d.pid", + snprintf(pidfile, sizeof(pidfile), "%s/%s-%d.pid", PIDFILE_DIR, name, port); - + if (!access(pidfile, F_OK)) { - fprintf(stderr, "%s: exists, acceptor already running.\n", + fprintf(stderr, "%s: exists, acceptor already running.\n", pidfile); return (1); - } + } return (0); } void show_connection (int fd, __u32 net_ip) { - struct hostent *h = gethostbyaddr ((char *)&net_ip, sizeof net_ip, AF_INET); - __u32 host_ip = ntohl (net_ip); + static long last_time; + static __u32 host_ip; + long now = time(0); + struct hostent *h; int len; char host[1024]; - + + /* Don't show repeats for same host, it adds no value */ + if (host_ip == ntohl(net_ip) && (now - last_time) < 5) + return; + + h = gethostbyaddr((char *)&net_ip, sizeof(net_ip), AF_INET); + last_time = now; + host_ip = ntohl(net_ip); + if (h == NULL) - snprintf (host, sizeof(host), "%d.%d.%d.%d", (host_ip >> 24) & 0xff, - (host_ip >> 16) & 0xff, (host_ip >> 8) & 0xff, host_ip & 0xff); + snprintf(host, sizeof(host), "%d.%d.%d.%d", + (host_ip >> 24) & 0xff, (host_ip >> 16) & 0xff, + (host_ip >> 8) & 0xff, host_ip & 0xff); else - snprintf (host, sizeof(host), "%s", h->h_name); - - syslog (LOG_INFO, "Accepted host: %s\n", host); -} + snprintf(host, sizeof(host), "%s", h->h_name); -void -usage (char *myname) -{ - fprintf (stderr, - "Usage: %s [-N nal_id] [-p] [-l] port\n\n" - " -l\tKeep stdin/stdout open\n" - " -p\tAllow connections from non-privileged ports\n", - myname); - exit (1); + syslog(LOG_INFO, "Accepted host: %s\n", host); } int main(int argc, char **argv) @@ -106,7 +114,7 @@ int main(int argc, char **argv) int nal = SOCKNAL; int rport; int require_privports = 1; - + while ((c = getopt (argc, argv, "N:lp")) != -1) { switch (c) { case 'N': @@ -189,7 +197,7 @@ int main(int argc, char **argv) struct request_info request; #endif char addrstr[INET_ADDRSTRLEN]; - + cfd = accept(fd, (struct sockaddr *)&clntaddr, &len); if ( cfd < 0 ) { perror("accept"); @@ -228,11 +236,11 @@ int main(int argc, char **argv) pcfg.pcfg_nal = nal; pcfg.pcfg_fd = cfd; pcfg.pcfg_misc = SOCKNAL_CONN_NONE; /* == incoming connection */ - + PORTAL_IOC_INIT(data); data.ioc_pbuf1 = (char*)&pcfg; data.ioc_plen1 = sizeof(pcfg); - + if (ioctl(pfd, IOC_PORTAL_NAL_CMD, &data) < 0) { perror("ioctl failed"); } else { -- 1.8.3.1