From: eeb Date: Mon, 29 Sep 2003 10:45:41 +0000 (+0000) Subject: * Fixed qswnal peer death notification bug X-Git-Tag: 0.9.1~178 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=9eac846771c015535ade18e872b79bb95506a2bd;p=fs%2Flustre-release.git * Fixed qswnal peer death notification bug * Fixed missing kfree in the router * Converted some socknal and router CERRORs into CWARN/CDEBUGs --- diff --git a/lnet/klnds/qswlnd/qswlnd_cb.c b/lnet/klnds/qswlnd/qswlnd_cb.c index 99f299f..7f8bc96 100644 --- a/lnet/klnds/qswlnd/qswlnd_cb.c +++ b/lnet/klnds/qswlnd/qswlnd_cb.c @@ -119,6 +119,18 @@ kqswnal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist) } void +kqswnal_notify_peer_down(kqswnal_tx_t *ktx) +{ + struct timeval now; + time_t then; + + do_gettimeofday (&now); + then = now.tv_sec - (jiffies - ktx->ktx_launchtime)/HZ; + + kpr_notify(&kqswnal_data.kqn_router, ktx->ktx_nid, 0, then); +} + +void kqswnal_unmap_tx (kqswnal_tx_t *ktx) { if (ktx->ktx_nmappedpages == 0) @@ -421,8 +433,6 @@ static void kqswnal_txhandler(EP_TXD *txd, void *arg, int status) { kqswnal_tx_t *ktx = (kqswnal_tx_t *)arg; - struct timeval now; - time_t then; LASSERT (txd != NULL); LASSERT (ktx != NULL); @@ -437,12 +447,7 @@ kqswnal_txhandler(EP_TXD *txd, void *arg, int status) CERROR ("Tx completion to "LPX64" failed: %d\n", ktx->ktx_nid, status); - do_gettimeofday (&now); - then = now.tv_sec - (jiffies - ktx->ktx_launchtime)/HZ; - - kpr_notify (&kqswnal_data.kqn_router, - ktx->ktx_nid, 0, then); - + kqswnal_notify_peer_down(ktx); status = -EIO; } @@ -483,10 +488,7 @@ kqswnal_launch (kqswnal_tx_t *ktx) default: /* fatal error */ CERROR ("Tx to "LPX64" failed: %d\n", ktx->ktx_nid, rc); - - /* Tell router I think a node is down */ - kpr_notify (&kqswnal_data.kqn_router, ktx->ktx_nid, - 0, ktx->ktx_launchtime); + kqswnal_notify_peer_down(ktx); return (rc); } } diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c index b0b9342..65db867 100644 --- a/lnet/klnds/socklnd/socklnd_cb.c +++ b/lnet/klnds/socklnd/socklnd_cb.c @@ -1526,9 +1526,9 @@ ksocknal_process_receive (ksock_sched_t *sched, unsigned long *irq_flags) conn, rc, conn->ksnc_peer->ksnp_nid, conn->ksnc_ipaddr, conn->ksnc_port); else - CERROR ("[%p] EOF from "LPX64" ip %08x:%d\n", - conn, conn->ksnc_peer->ksnp_nid, - conn->ksnc_ipaddr, conn->ksnc_port); + CWARN ("[%p] EOF from "LPX64" ip %08x:%d\n", + conn, conn->ksnc_peer->ksnp_nid, + conn->ksnc_ipaddr, conn->ksnc_port); } goto out; } diff --git a/lnet/router/router.c b/lnet/router/router.c index a03fb42..32f741f 100644 --- a/lnet/router/router.c +++ b/lnet/router/router.c @@ -119,6 +119,8 @@ kpr_do_upcall (void *arg) snprintf (whenstr, sizeof(whenstr), "%ld", u->kpru_when); portals_run_upcall (argv); + + kfree (u); } void @@ -161,11 +163,12 @@ kpr_do_notify (int byNal, int gateway_nalid, ptl_nid_t gateway_nid, /* can't do predictions... */ do_gettimeofday (&now); if (when > now.tv_sec) { - CERROR ("Ignoring prediction from %s of [%d] "LPX64" %s " - "%ld seconds in the future\n", - byNal ? "NAL" : "userspace", - gateway_nalid, gateway_nid, alive ? "up" : "down", - when - now.tv_sec); + CWARN ("Ignoring prediction from %s of [%d] "LPX64" %s " + "%ld seconds in the future\n", + byNal ? "NAL" : "userspace", + gateway_nalid, gateway_nid, + alive ? "up" : "down", + when - now.tv_sec); return (EINVAL); } @@ -189,14 +192,14 @@ kpr_do_notify (int byNal, int gateway_nalid, ptl_nid_t gateway_nid, if (rc != 0) { /* gateway not found */ write_unlock_irqrestore(&kpr_rwlock, flags); - CERROR ("Gateway not found\n"); + CDEBUG (D_NET, "Gateway not found\n"); return (rc); } if (when < ge->kpge_timestamp) { /* out of date information */ write_unlock_irqrestore (&kpr_rwlock, flags); - CERROR ("Out of date\n"); + CDEBUG (D_NET, "Out of date\n"); return (0); } @@ -206,7 +209,7 @@ kpr_do_notify (int byNal, int gateway_nalid, ptl_nid_t gateway_nid, if ((!ge->kpge_alive) == (!alive)) { /* new date for old news */ write_unlock_irqrestore (&kpr_rwlock, flags); - CERROR ("Old news\n"); + CDEBUG (D_NET, "Old news\n"); return (0); } @@ -250,10 +253,12 @@ kpr_do_notify (int byNal, int gateway_nalid, ptl_nid_t gateway_nid, if (byNal) { /* It wasn't userland that notified me... */ - CERROR ("Doing upcall\n"); + CWARN ("Upcall: NAL %d NID "LPX64" is %s\n", + gateway_nalid, gateway_nid, + alive ? "alive" : "dead"); kpr_upcall (gateway_nalid, gateway_nid, alive, when); } else { - CERROR (" NOT Doing upcall\n"); + CDEBUG (D_NET, " NOT Doing upcall\n"); } return (0); diff --git a/lustre/portals/knals/qswnal/qswnal_cb.c b/lustre/portals/knals/qswnal/qswnal_cb.c index 99f299f..7f8bc96 100644 --- a/lustre/portals/knals/qswnal/qswnal_cb.c +++ b/lustre/portals/knals/qswnal/qswnal_cb.c @@ -119,6 +119,18 @@ kqswnal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist) } void +kqswnal_notify_peer_down(kqswnal_tx_t *ktx) +{ + struct timeval now; + time_t then; + + do_gettimeofday (&now); + then = now.tv_sec - (jiffies - ktx->ktx_launchtime)/HZ; + + kpr_notify(&kqswnal_data.kqn_router, ktx->ktx_nid, 0, then); +} + +void kqswnal_unmap_tx (kqswnal_tx_t *ktx) { if (ktx->ktx_nmappedpages == 0) @@ -421,8 +433,6 @@ static void kqswnal_txhandler(EP_TXD *txd, void *arg, int status) { kqswnal_tx_t *ktx = (kqswnal_tx_t *)arg; - struct timeval now; - time_t then; LASSERT (txd != NULL); LASSERT (ktx != NULL); @@ -437,12 +447,7 @@ kqswnal_txhandler(EP_TXD *txd, void *arg, int status) CERROR ("Tx completion to "LPX64" failed: %d\n", ktx->ktx_nid, status); - do_gettimeofday (&now); - then = now.tv_sec - (jiffies - ktx->ktx_launchtime)/HZ; - - kpr_notify (&kqswnal_data.kqn_router, - ktx->ktx_nid, 0, then); - + kqswnal_notify_peer_down(ktx); status = -EIO; } @@ -483,10 +488,7 @@ kqswnal_launch (kqswnal_tx_t *ktx) default: /* fatal error */ CERROR ("Tx to "LPX64" failed: %d\n", ktx->ktx_nid, rc); - - /* Tell router I think a node is down */ - kpr_notify (&kqswnal_data.kqn_router, ktx->ktx_nid, - 0, ktx->ktx_launchtime); + kqswnal_notify_peer_down(ktx); return (rc); } } diff --git a/lustre/portals/knals/socknal/socknal_cb.c b/lustre/portals/knals/socknal/socknal_cb.c index b0b9342..65db867 100644 --- a/lustre/portals/knals/socknal/socknal_cb.c +++ b/lustre/portals/knals/socknal/socknal_cb.c @@ -1526,9 +1526,9 @@ ksocknal_process_receive (ksock_sched_t *sched, unsigned long *irq_flags) conn, rc, conn->ksnc_peer->ksnp_nid, conn->ksnc_ipaddr, conn->ksnc_port); else - CERROR ("[%p] EOF from "LPX64" ip %08x:%d\n", - conn, conn->ksnc_peer->ksnp_nid, - conn->ksnc_ipaddr, conn->ksnc_port); + CWARN ("[%p] EOF from "LPX64" ip %08x:%d\n", + conn, conn->ksnc_peer->ksnp_nid, + conn->ksnc_ipaddr, conn->ksnc_port); } goto out; } diff --git a/lustre/portals/router/router.c b/lustre/portals/router/router.c index a03fb42..32f741f 100644 --- a/lustre/portals/router/router.c +++ b/lustre/portals/router/router.c @@ -119,6 +119,8 @@ kpr_do_upcall (void *arg) snprintf (whenstr, sizeof(whenstr), "%ld", u->kpru_when); portals_run_upcall (argv); + + kfree (u); } void @@ -161,11 +163,12 @@ kpr_do_notify (int byNal, int gateway_nalid, ptl_nid_t gateway_nid, /* can't do predictions... */ do_gettimeofday (&now); if (when > now.tv_sec) { - CERROR ("Ignoring prediction from %s of [%d] "LPX64" %s " - "%ld seconds in the future\n", - byNal ? "NAL" : "userspace", - gateway_nalid, gateway_nid, alive ? "up" : "down", - when - now.tv_sec); + CWARN ("Ignoring prediction from %s of [%d] "LPX64" %s " + "%ld seconds in the future\n", + byNal ? "NAL" : "userspace", + gateway_nalid, gateway_nid, + alive ? "up" : "down", + when - now.tv_sec); return (EINVAL); } @@ -189,14 +192,14 @@ kpr_do_notify (int byNal, int gateway_nalid, ptl_nid_t gateway_nid, if (rc != 0) { /* gateway not found */ write_unlock_irqrestore(&kpr_rwlock, flags); - CERROR ("Gateway not found\n"); + CDEBUG (D_NET, "Gateway not found\n"); return (rc); } if (when < ge->kpge_timestamp) { /* out of date information */ write_unlock_irqrestore (&kpr_rwlock, flags); - CERROR ("Out of date\n"); + CDEBUG (D_NET, "Out of date\n"); return (0); } @@ -206,7 +209,7 @@ kpr_do_notify (int byNal, int gateway_nalid, ptl_nid_t gateway_nid, if ((!ge->kpge_alive) == (!alive)) { /* new date for old news */ write_unlock_irqrestore (&kpr_rwlock, flags); - CERROR ("Old news\n"); + CDEBUG (D_NET, "Old news\n"); return (0); } @@ -250,10 +253,12 @@ kpr_do_notify (int byNal, int gateway_nalid, ptl_nid_t gateway_nid, if (byNal) { /* It wasn't userland that notified me... */ - CERROR ("Doing upcall\n"); + CWARN ("Upcall: NAL %d NID "LPX64" is %s\n", + gateway_nalid, gateway_nid, + alive ? "alive" : "dead"); kpr_upcall (gateway_nalid, gateway_nid, alive, when); } else { - CERROR (" NOT Doing upcall\n"); + CDEBUG (D_NET, " NOT Doing upcall\n"); } return (0);