From b59b755c10688550430a50066fc786196e6f9dd9 Mon Sep 17 00:00:00 2001 From: eeb Date: Wed, 29 Jun 2005 21:33:57 +0000 Subject: [PATCH] * Improved router error messages and move some of the checking out of qswnal and socknal and into the router itself. * Don't force FMR unmap on error if it wasn't mapped in the first place. * Make socknal close the incoming connection if it receives junk for forwarding * allow "lctl network down" == "lctl network unconfigure" (I kept wanting to type 'down' and now I can :) --- lnet/include/lnet/Makefile.am | 4 +- lnet/include/lnet/lib-lnet.h | 11 +++-- lnet/include/lnet/lib-p30.h | 11 +++-- lnet/include/lnet/lib-types.h | 4 +- lnet/klnds/qswlnd/qswlnd.h | 1 - lnet/klnds/qswlnd/qswlnd_cb.c | 21 +++------ lnet/klnds/socklnd/socklnd.c | 20 +++------ lnet/klnds/socklnd/socklnd.h | 1 - lnet/klnds/socklnd/socklnd_cb.c | 78 ++++++++++++++-------------------- lnet/klnds/viblnd/viblnd_cb.c | 2 +- lnet/lnet/acceptor.c | 23 +++++++--- lnet/lnet/api-ni.c | 2 +- lnet/lnet/lib-move.c | 5 +++ lnet/lnet/module.c | 1 + lnet/lnet/router.c | 94 ++++++++++++++++++++++++++++++++++------- lnet/utils/portals.c | 4 +- 16 files changed, 169 insertions(+), 113 deletions(-) diff --git a/lnet/include/lnet/Makefile.am b/lnet/include/lnet/Makefile.am index c9dc994..0c44c1c 100644 --- a/lnet/include/lnet/Makefile.am +++ b/lnet/include/lnet/Makefile.am @@ -7,6 +7,4 @@ endif DIST_SUBDIRS := $(SUBDIRS) EXTRA_DIST = api.h api-support.h build_check.h errno.h \ - kpr.h lib-p30.h lib-types.h \ - p30.h ptlctl.h \ - socknal.h types.h + lib-p30.h lib-types.h p30.h ptlctl.h socknal.h types.h diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h index eb77d14..2fa1619 100644 --- a/lnet/include/lnet/lib-lnet.h +++ b/lnet/include/lnet/lib-lnet.h @@ -377,12 +377,15 @@ int kpr_initialise(void); void kpr_finalise(void); static inline void -kpr_fwd_init (kpr_fwd_desc_t *fwd, ptl_nid_t nid, ptl_hdr_t *hdr, - int nob, int niov, ptl_kiov_t *kiov, +kpr_fwd_init (kpr_fwd_desc_t *fwd, ptl_nid_t target_nid, + ptl_nid_t sender_nid, ptl_nid_t source_nid, + ptl_hdr_t *hdr, int nob, int niov, ptl_kiov_t *kiov, kpr_fwd_callback_t callback, void *callback_arg) { - fwd->kprfd_target_nid = nid; - fwd->kprfd_gateway_nid = nid; + fwd->kprfd_target_nid = target_nid; + fwd->kprfd_gateway_nid = target_nid; + fwd->kprfd_sender_nid = sender_nid; + fwd->kprfd_source_nid = source_nid; fwd->kprfd_hdr = hdr; fwd->kprfd_nob = nob; fwd->kprfd_niov = niov; diff --git a/lnet/include/lnet/lib-p30.h b/lnet/include/lnet/lib-p30.h index eb77d14..2fa1619 100644 --- a/lnet/include/lnet/lib-p30.h +++ b/lnet/include/lnet/lib-p30.h @@ -377,12 +377,15 @@ int kpr_initialise(void); void kpr_finalise(void); static inline void -kpr_fwd_init (kpr_fwd_desc_t *fwd, ptl_nid_t nid, ptl_hdr_t *hdr, - int nob, int niov, ptl_kiov_t *kiov, +kpr_fwd_init (kpr_fwd_desc_t *fwd, ptl_nid_t target_nid, + ptl_nid_t sender_nid, ptl_nid_t source_nid, + ptl_hdr_t *hdr, int nob, int niov, ptl_kiov_t *kiov, kpr_fwd_callback_t callback, void *callback_arg) { - fwd->kprfd_target_nid = nid; - fwd->kprfd_gateway_nid = nid; + fwd->kprfd_target_nid = target_nid; + fwd->kprfd_gateway_nid = target_nid; + fwd->kprfd_sender_nid = sender_nid; + fwd->kprfd_source_nid = source_nid; fwd->kprfd_hdr = hdr; fwd->kprfd_nob = nob; fwd->kprfd_niov = niov; diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h index 3f6c5e3..5f1b411 100644 --- a/lnet/include/lnet/lib-types.h +++ b/lnet/include/lnet/lib-types.h @@ -249,7 +249,9 @@ typedef union { typedef struct { struct list_head kprfd_list; /* stash in queues (routing target can use) */ ptl_nid_t kprfd_target_nid; /* final destination NID */ - ptl_nid_t kprfd_gateway_nid; /* gateway NID */ + ptl_nid_t kprfd_gateway_nid; /* next hop NID */ + ptl_nid_t kprfd_sender_nid; /* previous hop NID */ + ptl_nid_t kprfd_source_nid; /* original sender's NID */ ptl_hdr_t *kprfd_hdr; /* header in wire byte order */ int kprfd_nob; /* # payload bytes */ int kprfd_niov; /* # payload frags */ diff --git a/lnet/klnds/qswlnd/qswlnd.h b/lnet/klnds/qswlnd/qswlnd.h index 8269d29..77ae7f6 100644 --- a/lnet/klnds/qswlnd/qswlnd.h +++ b/lnet/klnds/qswlnd/qswlnd.h @@ -75,7 +75,6 @@ #define DEBUG_SUBSYSTEM S_NAL #include -#include #include #include diff --git a/lnet/klnds/qswlnd/qswlnd_cb.c b/lnet/klnds/qswlnd/qswlnd_cb.c index bbb778c..6001e06 100644 --- a/lnet/klnds/qswlnd/qswlnd_cb.c +++ b/lnet/klnds/qswlnd/qswlnd_cb.c @@ -1490,6 +1490,8 @@ kqswnal_parse (kqswnal_rx_t *krx) { ptl_hdr_t *hdr = (ptl_hdr_t *) page_address(krx->krx_kiov[0].kiov_page); ptl_nid_t dest_nid; + ptl_nid_t src_nid; + ptl_nid_t sender_nid; int payload_nob; int nob; int niov; @@ -1509,22 +1511,13 @@ kqswnal_parse (kqswnal_rx_t *krx) return; } - dest_nid = le64_to_cpu(hdr->dest_nid); - + dest_nid = le64_to_cpu(hdr->dest_nid); /* final dest */ + src_nid = le64_to_cpu(hdr->src_nid); /* original source */ + sender_nid = PTL_MKNID(PTL_NIDNET(kqswnal_data.kqn_ni->ni_nid), + ep_rxd_node(krx->krx_rxd)); /* who sent it to me */ #if KQSW_CHECKSUM LASSERTF (0, "checksums for forwarded packets not implemented\n"); #endif - - if (PTL_NIDNET(dest_nid) == PTL_NIDNET(kqswnal_data.kqn_ni->ni_nid)) { - /* should have gone direct to peer */ - CERROR("dropping packet from %s for %s: target is peer\n", - libcfs_nid2str(le64_to_cpu(hdr->src_nid)), - libcfs_nid2str(dest_nid)); - - kqswnal_rx_decref (krx); - return; - } - nob = payload_nob = krx->krx_nob - KQSW_HDR_SIZE; niov = 0; if (nob > 0) { @@ -1543,7 +1536,7 @@ kqswnal_parse (kqswnal_rx_t *krx) } } - kpr_fwd_init (&krx->krx_fwd, dest_nid, + kpr_fwd_init (&krx->krx_fwd, dest_nid, sender_nid, src_nid, hdr, payload_nob, niov, krx->krx_kiov, kqswnal_fwd_callback, krx); diff --git a/lnet/klnds/socklnd/socklnd.c b/lnet/klnds/socklnd/socklnd.c index 7fcdc58..6effa0a 100644 --- a/lnet/klnds/socklnd/socklnd.c +++ b/lnet/klnds/socklnd/socklnd.c @@ -1027,6 +1027,11 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type) if (route != NULL) { peer = route->ksnr_peer; ksocknal_peer_addref(peer); + + /* additional routes after interface exchange? */ + ksocknal_create_routes(peer, conn->ksnc_port, + ipaddrs, nipaddrs); + rc = 0; } else { ni = ptl_net2ni(PTL_NIDNET(nid)); @@ -1065,25 +1070,14 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type) ksocknal_peer_addref(peer); write_unlock_irqrestore(global_lock, flags); - } - - if (route != NULL) { - /* additional routes after interface exchange? */ - ksocknal_create_routes(peer, conn->ksnc_port, - ipaddrs, nipaddrs); - rc = 0; - } else { - ptl_ni_t *ni = peer->ksnp_ni; - ksock_net_t *net = ni->ni_data; nipaddrs = ksocknal_select_ips(peer, ipaddrs, nipaddrs); - rc = ksocknal_send_hello (conn, ni->ni_nid, net->ksnn_incarnation, ipaddrs, nipaddrs); + if (rc < 0) + goto failed_2; } - if (rc < 0) - goto failed_2; write_lock_irqsave (global_lock, flags); diff --git a/lnet/klnds/socklnd/socklnd.h b/lnet/klnds/socklnd/socklnd.h index 47bc850..f139ef1 100644 --- a/lnet/klnds/socklnd/socklnd.h +++ b/lnet/klnds/socklnd/socklnd.h @@ -39,7 +39,6 @@ #endif #include -#include #include #include #include diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c index 439cf03..f4dc566 100644 --- a/lnet/klnds/socklnd/socklnd_cb.c +++ b/lnet/klnds/socklnd/socklnd_cb.c @@ -1057,7 +1057,9 @@ int ksocknal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb) { int payload_nob = conn->ksnc_rx_nob_left; + ptl_nid_t src_nid = le64_to_cpu(conn->ksnc_hdr.src_nid); ptl_nid_t dest_nid = le64_to_cpu(conn->ksnc_hdr.dest_nid); + ptl_nid_t sender_nid = conn->ksnc_peer->ksnp_nid; int niov = 0; int nob = payload_nob; @@ -1088,13 +1090,13 @@ ksocknal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb) niov++; } - kpr_fwd_init(&fmb->fmb_fwd, dest_nid, &fmb->fmb_hdr, - payload_nob, niov, fmb->fmb_kiov, + kpr_fwd_init(&fmb->fmb_fwd, dest_nid, sender_nid, src_nid, + &fmb->fmb_hdr, payload_nob, niov, fmb->fmb_kiov, ksocknal_fmb_callback, fmb); if (payload_nob == 0) { /* got complete packet already */ CDEBUG (D_NET, "%p %s->%s fwd_start (immediate)\n", conn, - libcfs_nid2str(le64_to_cpu(conn->ksnc_hdr.src_nid)), + libcfs_nid2str(src_nid), libcfs_nid2str(dest_nid)); kpr_fwd_start (conn->ksnc_peer->ksnp_ni, &fmb->fmb_fwd); @@ -1116,12 +1118,11 @@ ksocknal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb) memcpy(conn->ksnc_rx_kiov, fmb->fmb_kiov, niov * sizeof(ptl_kiov_t)); CDEBUG (D_NET, "%p %s->%s %d reading body\n", conn, - libcfs_nid2str(le64_to_cpu(conn->ksnc_hdr.src_nid)), - libcfs_nid2str(dest_nid), payload_nob); + libcfs_nid2str(src_nid), libcfs_nid2str(dest_nid), payload_nob); return (0); } -void +ptl_err_t ksocknal_fwd_parse (ksock_conn_t *conn) { ptl_nid_t dest_nid = le64_to_cpu(conn->ksnc_hdr.dest_nid); @@ -1141,32 +1142,8 @@ ksocknal_fwd_parse (ksock_conn_t *conn) libcfs_nid2str(src_nid), libcfs_nid2str(dest_nid), body_len); - ksocknal_new_packet (conn, 0); /* on to new packet */ - return; - } - - if (PTL_NIDNET(conn->ksnc_hdr.dest_nid) == - PTL_NIDNET(conn->ksnc_peer->ksnp_ni->ni_nid)) { - /* should have gone direct */ - CERROR ("dropping packet from %s for %s: " - "target is a peer\n", - libcfs_nid2str(src_nid), - libcfs_nid2str(dest_nid)); - - /* on to next packet (skip this one's body) */ - ksocknal_new_packet (conn, body_len); - return; - } - - if (!kpr_forwarding()) { - CERROR("dropping packet from %s for %s: " - "I'm not a router\n", - libcfs_nid2str(src_nid), - libcfs_nid2str(dest_nid)); - - ksocknal_new_packet (conn, body_len); /* on to new packet */ - return; + return PTL_FAIL; } if (body_len > PTL_MTU) { /* too big to forward */ @@ -1177,12 +1154,14 @@ ksocknal_fwd_parse (ksock_conn_t *conn) body_len); /* on to new packet (skip this one's body) */ ksocknal_new_packet (conn, body_len); - return; + return PTL_FAIL; } conn->ksnc_rx_state = SOCKNAL_RX_GET_FMB; /* Getting FMB now */ conn->ksnc_rx_nob_left = body_len; /* stash packet size */ conn->ksnc_rx_nob_wanted = body_len; /* (no slop) */ + + return PTL_OK; } int @@ -1305,23 +1284,28 @@ ksocknal_process_receive (ksock_conn_t *conn) case SOCKNAL_RX_HEADER: rc = ptl_parse(conn->ksnc_peer->ksnp_ni, &conn->ksnc_hdr, conn); - if (rc == PTL_IFACE_DUP) { + switch (rc) { + case PTL_OK: + break; + + case PTL_IFACE_DUP: /* This packet isn't for me (still in net byte order) */ - ksocknal_fwd_parse (conn); - switch (conn->ksnc_rx_state) { - case SOCKNAL_RX_HEADER: /* skipped (zero payload) */ - return (0); /* => come back later */ - case SOCKNAL_RX_SLOP: /* skipping packet's body */ - goto try_read; /* => go read it */ - case SOCKNAL_RX_GET_FMB: /* forwarding */ - goto get_fmb; /* => go get a fwd msg buffer */ - default: - LBUG (); + rc = ksocknal_fwd_parse (conn); + if (rc == PTL_OK) { + switch (conn->ksnc_rx_state) { + case SOCKNAL_RX_HEADER: /* skipped (zero payload) */ + return (0); /* => come back later */ + case SOCKNAL_RX_SLOP: /* skipping packet's body */ + goto try_read; /* => go read it */ + case SOCKNAL_RX_GET_FMB: /* forwarding */ + goto get_fmb; /* => go get a fwd msg buffer */ + default: + LBUG (); + } + /* Not Reached */ } - /* Not Reached */ - } - - if (rc != PTL_OK) { + /* fall through */ + default: /* I just received garbage: give up on this conn */ ksocknal_close_conn_and_siblings (conn, rc); return (-EPROTO); diff --git a/lnet/klnds/viblnd/viblnd_cb.c b/lnet/klnds/viblnd/viblnd_cb.c index db89c41..bb4e1eb 100644 --- a/lnet/klnds/viblnd/viblnd_cb.c +++ b/lnet/klnds/viblnd/viblnd_cb.c @@ -37,7 +37,7 @@ kibnal_tx_done (kib_tx_t *tx) #if IBNAL_USE_FMR if (tx->tx_md.md_fmrcount == 0 || - ptlrc != PTL_OK) { + (ptlrc != PTL_OK && tx->tx_md.md_active)) { vv_return_t vvrc; /* mapping must be active (it dropped fmrcount to 0) */ diff --git a/lnet/lnet/acceptor.c b/lnet/lnet/acceptor.c index ecb27b8..726ce6c 100644 --- a/lnet/lnet/acceptor.c +++ b/lnet/lnet/acceptor.c @@ -267,6 +267,8 @@ ptl_accept(struct socket *sock, __u32 magic, int choose_ni) } if (!choose_ni) { + CDEBUG(D_WARNING, "Skipped %s from %u.%u.%u.%u\n", + libcfs_nid2str(cr.acr_nid), HIPQUAD(peer_ip)); /* I got called just to skip the connection request */ return PTL_OK; } @@ -290,6 +292,9 @@ ptl_accept(struct socket *sock, __u32 magic, int choose_ni) return PTL_FAIL; } + CDEBUG(D_WARNING, "Accept %s from %u.%u.%u.%u\n", + libcfs_nid2str(cr.acr_nid), HIPQUAD(peer_ip)); + rc = ni->ni_nal->nal_accept(ni, sock); if (rc != PTL_OK) CERROR("NI %s refused connection from %u.%u.%u.%u\n", @@ -306,22 +311,24 @@ ptl_acceptor(void *arg) char name[16]; struct socket *newsock; int rc; + int n_acceptor_nis; __u32 magic; __u32 peer_ip; int peer_port; ptl_ni_t *blind_ni; + LASSERT (ptl_acceptor_state.pta_sock == NULL); + /* If there is only a single NI that needs me, I'll pass her * connections "blind". Otherwise I'll have to read the bytestream to - * see which NI the connection is for. */ - rc = ptl_count_acceptor_nis(&blind_ni); - LASSERT (rc > 0); - if (rc > 1) { + * see which NI the connection is for. NB I don't get to run at all if + * there are 0 acceptor_nis... */ + n_acceptor_nis = ptl_count_acceptor_nis(&blind_ni); + LASSERT (n_acceptor_nis > 0); + if (n_acceptor_nis > 1) { ptl_ni_decref(blind_ni); blind_ni = NULL; } - - LASSERT (ptl_acceptor_state.pta_sock == NULL); snprintf(name, sizeof(name), "acceptor_%03d", acceptor_port); kportal_daemonize(name); @@ -350,6 +357,10 @@ ptl_acceptor(void *arg) if (rc != 0) return rc; + CDEBUG(D_WARNING, "Acceptor starting (%d nets: blind_ni %s)\n", + n_acceptor_nis, + blind_ni == NULL ? "NULL" : libcfs_nid2str(blind_ni->ni_nid)); + while (ptl_acceptor_state.pta_shutdown == 0) { rc = libcfs_sock_accept(&newsock, ptl_acceptor_state.pta_sock); diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c index 6714c6c..53afa32 100644 --- a/lnet/lnet/api-ni.c +++ b/lnet/lnet/api-ni.c @@ -714,7 +714,7 @@ ptl_startup_nalnis (void) break; PTL_MUTEX_UP(&ptl_apini.apini_nal_mutex); -#ifdef __KERNEL +#ifdef __KERNEL__ if (retry) { CERROR("Can't load NAL %s, module %s\n", libcfs_nal2str(nal_type), diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 8a44af7..ba31c3f 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -588,6 +588,11 @@ ptl_send (ptl_ni_t *ni, void *private, ptl_msg_t *msg, gw_nid = kpr_lookup (&ni, target.nid, sizeof(*hdr) + len); if (gw_nid == PTL_NID_ANY) { CERROR("No route to %s\n", libcfs_id2str(target)); + LCONSOLE_ERROR("Cannot send to %s: %s is not a local network " + "and I can't route to it. Is lustre configured " + "correctly?\n", libcfs_nid2str(target.nid), + libcfs_net2str(PTL_NIDNET(target.nid))); + return PTL_FAIL; } diff --git a/lnet/lnet/module.c b/lnet/lnet/module.c index 18e9ed2..0a96659 100644 --- a/lnet/lnet/module.c +++ b/lnet/lnet/module.c @@ -138,6 +138,7 @@ EXPORT_SYMBOL(PtlEQFree); EXPORT_SYMBOL(PtlEQGet); EXPORT_SYMBOL(PtlGetId); EXPORT_SYMBOL(PtlMDBind); +EXPORT_SYMBOL(ptl_apini); EXPORT_SYMBOL(ptl_iov_nob); EXPORT_SYMBOL(ptl_copy_iov2buf); EXPORT_SYMBOL(ptl_copy_buf2iov); diff --git a/lnet/lnet/router.c b/lnet/lnet/router.c index 52e5c5e..aebf507 100644 --- a/lnet/lnet/router.c +++ b/lnet/lnet/router.c @@ -363,7 +363,7 @@ kpr_fwd_start (ptl_ni_t *src_ni, kpr_fwd_desc_t *fwd) { ptl_nid_t target_nid = fwd->kprfd_target_nid; __u32 target_net = PTL_NIDNET(target_nid); - __u32 source_net = PTL_NIDNET(src_ni->ni_nid); + __u32 receiver_net = PTL_NIDNET(src_ni->ni_nid); int nob = fwd->kprfd_nob; kpr_gateway_entry_t *ge; ptl_ni_t *dst_ni; @@ -375,12 +375,17 @@ kpr_fwd_start (ptl_ni_t *src_ni, kpr_fwd_desc_t *fwd) int rc; int found; - CDEBUG (D_NET, "forward [%p] %s from %s\n", fwd, - libcfs_nid2str(target_nid), libcfs_nid2str(src_ni->ni_nid)); + CDEBUG (D_NET, "src %s sender %s receiver %s target %s\n", + libcfs_nid2str(fwd->kprfd_source_nid), + libcfs_nid2str(fwd->kprfd_sender_nid), + libcfs_nid2str(src_ni->ni_nid), + libcfs_nid2str(target_nid)); - LASSERT (target_net != source_net); LASSERT (nob == ptl_kiov_nob (fwd->kprfd_niov, fwd->kprfd_kiov)); + /* it's not for any local NID (i.e. it's going to get sent) */ + LASSERT (!ptl_islocalnid(target_nid)); + fwd->kprfd_src_ni = src_ni; /* stash calling ni */ read_lock_irqsave(&kpr_state.kpr_rwlock, flags); @@ -390,20 +395,59 @@ kpr_fwd_start (ptl_ni_t *src_ni, kpr_fwd_desc_t *fwd) kpr_state.kpr_fwd_bytes += nob + sizeof(ptl_hdr_t); spin_unlock(&kpr_state.kpr_stats_lock); + rc = -EDESTADDRREQ; + if (target_net == receiver_net) { + read_unlock_irqrestore(&kpr_state.kpr_rwlock, flags); + LCONSOLE_ERROR("Refusing to forward message from %s for %s " + "received from %s on %s: it should have been " + "sent directly\n", + libcfs_nid2str(fwd->kprfd_source_nid), + libcfs_nid2str(fwd->kprfd_target_nid), + libcfs_nid2str(fwd->kprfd_sender_nid), + libcfs_nid2str(src_ni->ni_nid)); + goto failed; + } + rc = -ESHUTDOWN; - if (src_ni->ni_shutdown) /* caller is shutting down */ + if (src_ni->ni_shutdown) { /* caller is shutting down */ + read_unlock_irqrestore(&kpr_state.kpr_rwlock, flags); + LCONSOLE_ERROR("Refusing to forward message from %s for %s " + "received from %s on %s: system shutting down\n", + libcfs_nid2str(fwd->kprfd_source_nid), + libcfs_nid2str(fwd->kprfd_target_nid), + libcfs_nid2str(fwd->kprfd_sender_nid), + libcfs_nid2str(src_ni->ni_nid)); goto failed; - + } + rc = -ENETUNREACH; - if (!kpr_forwarding()) /* I'm not a router */ + if (!kpr_forwarding()) { /* I'm not a router */ + read_unlock_irqrestore(&kpr_state.kpr_rwlock, flags); + LCONSOLE_ERROR("Refusing to forward message from %s for %s " + "received from %s on %s: forwarding disabled!\n", + libcfs_nid2str(fwd->kprfd_source_nid), + libcfs_nid2str(fwd->kprfd_target_nid), + libcfs_nid2str(fwd->kprfd_sender_nid), + libcfs_nid2str(src_ni->ni_nid)); goto failed; - + } + /* Is the target_nid on a local network? */ dst_ni = ptl_net2ni(target_net); if (dst_ni != NULL) { - if (dst_ni->ni_nal->nal_fwd == NULL) + if (dst_ni->ni_nal->nal_fwd == NULL) { + read_unlock_irqrestore(&kpr_state.kpr_rwlock, flags); + LCONSOLE_ERROR("Refusing to forward message from %s for %s " + "received from %s on %s: " + "net %s doesn't route!\n", + libcfs_nid2str(fwd->kprfd_source_nid), + libcfs_nid2str(fwd->kprfd_target_nid), + libcfs_nid2str(fwd->kprfd_sender_nid), + libcfs_nid2str(src_ni->ni_nid), + libcfs_net2str(PTL_NIDNET(dst_ni->ni_nid))); goto failed; - + } + fwd->kprfd_gateway_nid = target_nid; atomic_inc (&kpr_state.kpr_queue_depth); @@ -429,8 +473,17 @@ kpr_fwd_start (ptl_ni_t *src_ni, kpr_fwd_desc_t *fwd) break; } - if (!found) + if (!found) { + read_unlock_irqrestore(&kpr_state.kpr_rwlock, flags); + LCONSOLE_ERROR("Can't forward message from %s for %s " + "received from %s on %s: " + "no routes to destination network!\n", + libcfs_nid2str(fwd->kprfd_source_nid), + libcfs_nid2str(fwd->kprfd_target_nid), + libcfs_nid2str(fwd->kprfd_sender_nid), + libcfs_nid2str(src_ni->ni_nid)); goto failed; + } /* Search routes for one that has a gateway to target_nid NOT on the caller's network */ dst_ni = NULL; @@ -438,7 +491,7 @@ kpr_fwd_start (ptl_ni_t *src_ni, kpr_fwd_desc_t *fwd) list_for_each (e, &ne->kpne_routes) { re = list_entry (e, kpr_route_entry_t, kpre_list); - if (PTL_NIDNET(re->kpre_gateway->kpge_nid) == source_net) + if (PTL_NIDNET(re->kpre_gateway->kpge_nid) == receiver_net) continue; /* don't route to same net */ if (!re->kpre_gateway->kpge_alive) @@ -468,9 +521,18 @@ kpr_fwd_start (ptl_ni_t *src_ni, kpr_fwd_desc_t *fwd) LASSERT ((ge == NULL) == (dst_ni == NULL)); - if (ge == NULL) + if (ge == NULL) { + read_unlock_irqrestore(&kpr_state.kpr_rwlock, flags); + LCONSOLE_ERROR("Can't forward message from %s for %s " + "received from %s on %s: " + "all relevant gateways are down!\n", + libcfs_nid2str(fwd->kprfd_source_nid), + libcfs_nid2str(fwd->kprfd_target_nid), + libcfs_nid2str(fwd->kprfd_sender_nid), + libcfs_nid2str(src_ni->ni_nid)); goto failed; - + } + kpr_update_weight (ge, nob); fwd->kprfd_gateway_nid = ge->kpge_nid; @@ -493,12 +555,12 @@ kpr_fwd_start (ptl_ni_t *src_ni, kpr_fwd_desc_t *fwd) kpr_state.kpr_fwd_errors++; spin_unlock_irqrestore(&kpr_state.kpr_stats_lock, flags); + read_unlock_irqrestore(&kpr_state.kpr_rwlock, flags); + CDEBUG (D_NET, "Failed to forward [%p] %s from %s\n", fwd, libcfs_nid2str(target_nid), libcfs_nid2str(src_ni->ni_nid)); (fwd->kprfd_callback)(src_ni, fwd->kprfd_callback_arg, rc); - - read_unlock_irqrestore(&kpr_state.kpr_rwlock, flags); } void diff --git a/lnet/utils/portals.c b/lnet/utils/portals.c index 0bd7f8e..057b760 100644 --- a/lnet/utils/portals.c +++ b/lnet/utils/portals.c @@ -311,7 +311,9 @@ int jt_ptl_network(int argc, char **argv) int count; int rc; - if (set && !strcmp(argv[1], "unconfigure")) { + if (set && + (!strcmp(argv[1], "unconfigure") || + !strcmp(argv[1], "down"))) { PORTAL_IOC_INIT(data); rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_UNCONFIGURE, &data); -- 1.8.3.1