From a4450c9338dd7f1ba7c2ced2a99710c3da6f3319 Mon Sep 17 00:00:00 2001 From: eeb Date: Tue, 7 Feb 2006 15:48:01 +0000 Subject: [PATCH] * landing future protocol compatibility work (b_rls146_lnetprotovrsn) on b_release_1_4_6 --- lnet/include/libcfs/kp30.h | 4 +- lnet/include/libcfs/linux/linux-tcpip.h | 1 + lnet/include/lnet/lib-lnet.h | 5 + lnet/include/lnet/lib-types.h | 22 +- lnet/include/lnet/lnetctl.h | 2 + lnet/include/lnet/ptllnd_wire.h | 2 +- lnet/include/lnet/types.h | 2 + lnet/klnds/gmlnd/gmlnd.h | 9 +- lnet/klnds/gmlnd/gmlnd_api.c | 5 +- lnet/klnds/gmlnd/gmlnd_cb.c | 15 + lnet/klnds/gmlnd/gmlnd_comm.c | 52 +++- lnet/klnds/gmlnd/gmlnd_module.c | 21 +- lnet/klnds/iiblnd/iiblnd.h | 12 +- lnet/klnds/iiblnd/iiblnd_cb.c | 97 +++++-- lnet/klnds/openiblnd/openiblnd.c | 101 ++++++- lnet/klnds/openiblnd/openiblnd.h | 13 - lnet/klnds/ptllnd/ptllnd_peer.c | 18 +- lnet/klnds/ptllnd/ptllnd_rx_buf.c | 56 +++- lnet/klnds/qswlnd/qswlnd.c | 4 + lnet/klnds/qswlnd/qswlnd.h | 49 +++- lnet/klnds/qswlnd/qswlnd_cb.c | 490 +++++++++++++++++++++++--------- lnet/klnds/ralnd/ralnd.c | 121 ++++++-- lnet/klnds/ralnd/ralnd.h | 14 +- lnet/klnds/socklnd/socklnd.c | 29 +- lnet/klnds/socklnd/socklnd_cb.c | 88 ++++-- lnet/klnds/socklnd/socklnd_lib-linux.c | 8 +- lnet/klnds/viblnd/viblnd_cb.c | 121 ++++++-- lnet/klnds/viblnd/viblnd_wire.h | 7 +- lnet/klnds/viblnd/wirecheck.c | 4 + lnet/libcfs/module.c | 2 +- lnet/lnet/acceptor.c | 146 ++++++---- lnet/lnet/api-ni.c | 246 +++++++++++++++- lnet/lnet/module.c | 1 - lnet/utils/gmlndnid.c | 140 +++++---- lnet/utils/portals.c | 89 +++++- lnet/utils/ptlctl.c | 4 +- lustre/utils/lctl.c | 3 + 37 files changed, 1584 insertions(+), 419 deletions(-) diff --git a/lnet/include/libcfs/kp30.h b/lnet/include/libcfs/kp30.h index 3dd22de..924fad9 100644 --- a/lnet/include/libcfs/kp30.h +++ b/lnet/include/libcfs/kp30.h @@ -432,7 +432,7 @@ extern int libcfs_ioctl_getdata(char *buf, char *end, void *arg); #define IOC_LIBCFS_LWT_SNAPSHOT _IOWR('e', 34, IOCTL_LIBCFS_TYPE) #define IOC_LIBCFS_LWT_LOOKUP_STRING _IOWR('e', 35, IOCTL_LIBCFS_TYPE) #define IOC_LIBCFS_MEMHOG _IOWR('e', 36, IOCTL_LIBCFS_TYPE) -#define IOC_LIBCFS_PING _IOWR('e', 37, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_PING_TEST _IOWR('e', 37, IOCTL_LIBCFS_TYPE) /* lnet ioctls */ #define IOC_LIBCFS_GET_NI _IOWR('e', 50, IOCTL_LIBCFS_TYPE) #define IOC_LIBCFS_FAIL_NID _IOWR('e', 51, IOCTL_LIBCFS_TYPE) @@ -444,6 +444,8 @@ extern int libcfs_ioctl_getdata(char *buf, char *end, void *arg); #define IOC_LIBCFS_PORTALS_COMPATIBILITY _IOWR('e', 57, IOCTL_LIBCFS_TYPE) #define IOC_LIBCFS_LNET_DIST _IOWR('e', 58, IOCTL_LIBCFS_TYPE) #define IOC_LIBCFS_CONFIGURE _IOWR('e', 59, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_TESTPROTOCOMPAT _IOWR('e', 60, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_PING _IOWR('e', 61, IOCTL_LIBCFS_TYPE) /* lnd ioctls */ #define IOC_LIBCFS_REGISTER_MYNID _IOWR('e', 70, IOCTL_LIBCFS_TYPE) #define IOC_LIBCFS_CLOSE_CONNECTION _IOWR('e', 71, IOCTL_LIBCFS_TYPE) diff --git a/lnet/include/libcfs/linux/linux-tcpip.h b/lnet/include/libcfs/linux/linux-tcpip.h index b7bea59..32ff5ea 100644 --- a/lnet/include/libcfs/linux/linux-tcpip.h +++ b/lnet/include/libcfs/linux/linux-tcpip.h @@ -46,6 +46,7 @@ #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)) # define sk_wmem_queued wmem_queued # define sk_err err +# define sk_route_caps route_caps #endif #define SOCK_SNDBUF(so) ((so)->sk->sk_sndbuf) diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h index dca0d8b..a5582ee 100644 --- a/lnet/include/lnet/lib-lnet.h +++ b/lnet/include/lnet/lib-lnet.h @@ -622,6 +622,11 @@ int lnet_acceptor_port(void); int lnet_acceptor_start(void); void lnet_acceptor_stop(void); +int lnet_ping_target_init(void); +void lnet_ping_target_fini(void); +int lnet_ping(lnet_process_id_t id, int timeout_ms, + lnet_process_id_t *ids, int n_ids); + int lnet_parse_ip2nets (char **networksp, char *ip2nets); int lnet_parse_routes (char *route_str, int *im_a_router); int lnet_parse_networks (struct list_head *nilist, char *networks); diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h index 9959e9c..c8b9139 100644 --- a/lnet/include/lnet/lib-types.h +++ b/lnet/include/lnet/lib-types.h @@ -114,10 +114,22 @@ typedef struct { __u16 version_minor; /* increment on compatible change */ } WIRE_ATTR lnet_magicversion_t; -/* PROTO MAGIC for LNDs that once used their own private acceptor */ +/* PROTO MAGIC for LNDs */ #define LNET_PROTO_OPENIB_MAGIC 0x0be91b91 +#define LNET_PROTO_IIB_MAGIC 0x0be91b91 +#define LNET_PROTO_VIB_MAGIC 0x0be91b91 #define LNET_PROTO_RA_MAGIC 0x0be91b92 +#define LNET_PROTO_QSW_MAGIC 0x0be91b93 #define LNET_PROTO_TCP_MAGIC 0xeebc0ded +#define LNET_PROTO_PTL_MAGIC 0x50746C4E /* 'PtlN' unique magic */ +#define LNET_PROTO_GM_MAGIC 0x6d797269 /* 'myri'! */ +#define LNET_PROTO_ACCEPTOR_MAGIC 0xacce7100 + +/* Placeholder for a future "unified" protocol across all LNDs */ +/* Current LNDs that receive a request with this magic will respond with a + * "stub" reply using their current protocol */ +#define LNET_PROTO_MAGIC 0x45726963 /* ! */ + #define LNET_PROTO_TCP_VERSION_MAJOR 1 #define LNET_PROTO_TCP_VERSION_MINOR 0 @@ -129,7 +141,6 @@ typedef struct { __u64 acr_nid; /* target NID */ } WIRE_ATTR lnet_acceptor_connreq_t; -#define LNET_PROTO_ACCEPTOR_MAGIC 0xacce7100 #define LNET_PROTO_ACCEPTOR_VERSION 1 /* forward refs */ @@ -457,6 +468,8 @@ typedef struct char *ln_network_tokens; /* space for network names */ int ln_network_tokens_nob; + int ln_testprotocompat; /* test protocol compatibility flags */ + struct list_head ln_finalizeq; /* msgs waiting to complete finalizing */ #ifdef __KERNEL__ void **ln_finalizers; /* threads doing finalization */ @@ -465,6 +478,9 @@ typedef struct int ln_finalizing; #endif struct list_head ln_test_peers; /* failure simulation */ + + lnet_handle_md_t ln_ping_target_md; + lnet_handle_me_t ln_ping_target_me; #ifdef LNET_USE_LIB_FREELIST lnet_freelist_t ln_free_mes; @@ -479,4 +495,6 @@ typedef struct lnet_counters_t ln_counters; } lnet_t; +#define LNET_PING_MATCHBITS 0x6666666666666666LL + #endif diff --git a/lnet/include/lnet/lnetctl.h b/lnet/include/lnet/lnetctl.h index 4ff635e..83b707d 100644 --- a/lnet/include/lnet/lnetctl.h +++ b/lnet/include/lnet/lnetctl.h @@ -54,6 +54,7 @@ int jt_ptl_disconnect(int argc, char **argv); int jt_ptl_push_connection(int argc, char **argv); int jt_ptl_print_active_txs(int argc, char **argv); int jt_ptl_ping(int argc, char **argv); +int jt_ptl_ping_test(int argc, char **argv); int jt_ptl_mynid(int argc, char **argv); int jt_ptl_add_uuid(int argc, char **argv); int jt_ptl_add_uuid_old(int argc, char **argv); /* backwards compatibility */ @@ -65,6 +66,7 @@ int jt_ptl_notify_router (int argc, char **argv); int jt_ptl_print_routes (int argc, char **argv); int jt_ptl_fail_nid (int argc, char **argv); int jt_ptl_lwt(int argc, char **argv); +int jt_ptl_testprotocompat(int argc, char **argv); int jt_ptl_memhog(int argc, char **argv); int dbg_initialize(int argc, char **argv); diff --git a/lnet/include/lnet/ptllnd_wire.h b/lnet/include/lnet/ptllnd_wire.h index 02f595c..794fa8e 100644 --- a/lnet/include/lnet/ptllnd_wire.h +++ b/lnet/include/lnet/ptllnd_wire.h @@ -76,7 +76,7 @@ typedef struct kptl_msg }kptl_msg_t; -#define PTLLND_MSG_MAGIC 0x50746C4E /* 'PtlN' unique magic */ +#define PTLLND_MSG_MAGIC LNET_PROTO_PTL_MAGIC #define PTLLND_MSG_VERSION 0x01 #define PTLLND_MSG_TYPE_INVALID 0x00 diff --git a/lnet/include/lnet/types.h b/lnet/include/lnet/types.h index c043ee2..30397a7 100644 --- a/lnet/include/lnet/types.h +++ b/lnet/include/lnet/types.h @@ -3,6 +3,8 @@ #include +#define LNET_RESERVED_PORTAL 0 /* portals reserved for lnet's own use */ + typedef __u64 lnet_nid_t; typedef __u32 lnet_pid_t; diff --git a/lnet/klnds/gmlnd/gmlnd.h b/lnet/klnds/gmlnd/gmlnd.h index 189c87a..4ce4a56 100644 --- a/lnet/klnds/gmlnd/gmlnd.h +++ b/lnet/klnds/gmlnd/gmlnd.h @@ -83,7 +83,8 @@ /* Default Tunable Values */ #define GMNAL_PORT 4 /* which port to use */ #define GMNAL_NTX 256 /* # tx descs */ -#define GMNAL_NTX_PEER 8 /* # concurrent sends per peer */ +#define GMNAL_CREDITS 128 /* # concurrent sends */ +#define GMNAL_PEER_CREDITS 8 /* # concurrent sends per peer */ #define GMNAL_NRX_SMALL 128 /* # small receives to post */ #define GMNAL_NRX_LARGE 64 /* # large receives to post */ #define GMNAL_NLARGE_TX_BUFS 32 /* # large tx buffers */ @@ -113,7 +114,7 @@ typedef struct { } gmm_u; } WIRE_ATTR gmnal_msg_t; -#define GMNAL_MSG_MAGIC 0x6d797269 /* 'myri'! */ +#define GMNAL_MSG_MAGIC LNET_PROTO_GM_MAGIC #define GMNAL_MSG_VERSION 1 #define GMNAL_MSG_IMMEDIATE 1 @@ -186,7 +187,6 @@ typedef struct gmnal_ni { int gmni_shutdown; /* tell all threads to exit */ struct list_head gmni_idle_txs; /* idle tx's */ - wait_queue_head_t gmni_idle_tx_wait; /* block here for idle tx */ int gmni_tx_credits; /* # transmits still possible */ struct list_head gmni_idle_ltxbs; /* idle large tx buffers */ struct list_head gmni_buf_txq; /* tx's waiting for buffers */ @@ -200,7 +200,8 @@ typedef struct gmnal_ni { typedef struct { int *gm_port; int *gm_ntx; - int *gm_ntx_peer; + int *gm_credits; + int *gm_peer_credits; int *gm_nlarge_tx_bufs; int *gm_nrx_small; int *gm_nrx_large; diff --git a/lnet/klnds/gmlnd/gmlnd_api.c b/lnet/klnds/gmlnd/gmlnd_api.c index 2ff9e84..5685ca7 100644 --- a/lnet/klnds/gmlnd/gmlnd_api.c +++ b/lnet/klnds/gmlnd/gmlnd_api.c @@ -123,8 +123,8 @@ gmnal_startup(lnet_ni_t *ni) LASSERT (ni->ni_lnd == &the_gmlnd); - ni->ni_maxtxcredits = *gmnal_tunables.gm_ntx; - ni->ni_peertxcredits = *gmnal_tunables.gm_ntx_peer; + ni->ni_maxtxcredits = *gmnal_tunables.gm_credits; + ni->ni_peertxcredits = *gmnal_tunables.gm_peer_credits; if (the_gmni != NULL) { CERROR("Only 1 instance supported\n"); @@ -143,7 +143,6 @@ gmnal_startup(lnet_ni_t *ni) gmni->gmni_ni = ni; spin_lock_init(&gmni->gmni_tx_lock); spin_lock_init(&gmni->gmni_gm_lock); - init_waitqueue_head(&gmni->gmni_idle_tx_wait); INIT_LIST_HEAD(&gmni->gmni_idle_txs); INIT_LIST_HEAD(&gmni->gmni_idle_ltxbs); INIT_LIST_HEAD(&gmni->gmni_buf_txq); diff --git a/lnet/klnds/gmlnd/gmlnd_cb.c b/lnet/klnds/gmlnd/gmlnd_cb.c index 679c324..54eca16 100644 --- a/lnet/klnds/gmlnd/gmlnd_cb.c +++ b/lnet/klnds/gmlnd/gmlnd_cb.c @@ -108,6 +108,21 @@ gmnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) GMNAL_NETBUF_MSG(&tx->tx_buf)->gmm_u.immediate.gmim_hdr = *hdr; tx->tx_msgnob = offsetof(gmnal_msg_t, gmm_u.immediate.gmim_payload[0]); + if (the_lnet.ln_testprotocompat != 0) { + /* single-shot proto test */ + LNET_LOCK(); + if ((the_lnet.ln_testprotocompat & 1) != 0) { + GMNAL_NETBUF_MSG(&tx->tx_buf)->gmm_version++; + the_lnet.ln_testprotocompat &= ~1; + } + if ((the_lnet.ln_testprotocompat & 2) != 0) { + GMNAL_NETBUF_MSG(&tx->tx_buf)->gmm_magic = + LNET_PROTO_MAGIC; + the_lnet.ln_testprotocompat &= ~2; + } + LNET_UNLOCK(); + } + if (tx->tx_msgnob + len <= gmni->gmni_small_msgsize) { /* whole message fits in tx_buf */ char *buffer = &(GMNAL_NETBUF_MSG(&tx->tx_buf)->gmm_u.immediate.gmim_payload[0]); diff --git a/lnet/klnds/gmlnd/gmlnd_comm.c b/lnet/klnds/gmlnd/gmlnd_comm.c index a5a443a..29c3d21 100644 --- a/lnet/klnds/gmlnd/gmlnd_comm.c +++ b/lnet/klnds/gmlnd/gmlnd_comm.c @@ -47,6 +47,8 @@ gmnal_unpack_msg(gmnal_ni_t *gmni, gmnal_rx_t *rx) gmni->gmni_small_msgsize; int flip; + /* rc = 0:SUCCESS -ve:failure +ve:version mismatch */ + /* GM may not overflow our buffer */ LASSERT (rx->rx_recv_nob <= buffnob); @@ -61,6 +63,9 @@ gmnal_unpack_msg(gmnal_ni_t *gmni, gmnal_rx_t *rx) flip = 0; } else if (msg->gmm_magic == __swab32(GMNAL_MSG_MAGIC)) { flip = 1; + } else if (msg->gmm_magic == LNET_PROTO_MAGIC || + msg->gmm_magic == __swab32(LNET_PROTO_MAGIC)) { + return EPROTO; } else { CERROR("Bad magic from gmid %u: %08x\n", rx->rx_recv_gmid, msg->gmm_magic); @@ -69,9 +74,7 @@ gmnal_unpack_msg(gmnal_ni_t *gmni, gmnal_rx_t *rx) if (msg->gmm_version != (flip ? __swab16(GMNAL_MSG_VERSION) : GMNAL_MSG_VERSION)) { - CERROR("Bad version from gmid %u: %d\n", - rx->rx_recv_gmid, msg->gmm_version); - return -EPROTO; + return EPROTO; } if (rx->rx_recv_nob < hdr_size) { @@ -360,6 +363,43 @@ gmnal_post_rx(gmnal_ni_t *gmni, gmnal_rx_t *rx) spin_unlock(&gmni->gmni_gm_lock); } +void +gmnal_version_reply (gmnal_ni_t *gmni, gmnal_rx_t *rx) +{ + /* Future protocol version compatibility support! + * The next gmlnd-specific protocol rev will first send a message to + * check version; I reply with a stub message containing my current + * magic+version... */ + gmnal_msg_t *msg; + gmnal_tx_t *tx = gmnal_get_tx(gmni); + + if (tx == NULL) { + CERROR("Can't allocate tx to send version info to %u\n", + rx->rx_recv_gmid); + return; + } + + LASSERT (tx->tx_lntmsg == NULL); /* no finalize */ + + tx->tx_nid = LNET_NID_ANY; + tx->tx_gmlid = rx->rx_recv_gmid; + + msg = GMNAL_NETBUF_MSG(&tx->tx_buf); + msg->gmm_magic = GMNAL_MSG_MAGIC; + msg->gmm_version = GMNAL_MSG_VERSION; + + /* just send magic + version */ + tx->tx_msgnob = offsetof(gmnal_msg_t, gmm_type); + tx->tx_large_nob = 0; + + spin_lock(&gmni->gmni_tx_lock); + + list_add_tail(&tx->tx_list, &gmni->gmni_buf_txq); + gmnal_check_txqueues_locked(gmni); + + spin_unlock(&gmni->gmni_tx_lock); +} + int gmnal_rx_thread(void *arg) { @@ -426,6 +466,7 @@ gmnal_rx_thread(void *arg) /* We're connectionless: simply drop packets with * errors */ rc = gmnal_unpack_msg(gmni, rx); + if (rc == 0) { gmnal_msg_t *msg = GMNAL_NETBUF_MSG(&rx->rx_buf); @@ -434,8 +475,11 @@ gmnal_rx_thread(void *arg) &msg->gmm_u.immediate.gmim_hdr, msg->gmm_srcnid, rx, 0); + } else if (rc > 0) { + gmnal_version_reply(gmni, rx); + rc = -EPROTO; /* repost rx */ } - + if (rc < 0) /* parse failure */ gmnal_post_rx(gmni, rx); diff --git a/lnet/klnds/gmlnd/gmlnd_module.c b/lnet/klnds/gmlnd/gmlnd_module.c index 91ecf04..3881e6c 100644 --- a/lnet/klnds/gmlnd/gmlnd_module.c +++ b/lnet/klnds/gmlnd/gmlnd_module.c @@ -30,8 +30,12 @@ static int ntx = GMNAL_NTX; CFS_MODULE_PARM(ntx, "i", int, 0444, "# tx descriptors"); -static int ntx_peer = GMNAL_NTX_PEER; -CFS_MODULE_PARM(ntx_peer, "i", int, 0444, +static int credits = GMNAL_CREDITS; +CFS_MODULE_PARM(credits, "i", int, 0444, + "# concurrent sends"); + +static int peer_credits = GMNAL_PEER_CREDITS; +CFS_MODULE_PARM(peer_credits, "i", int, 0444, "# concurrent sends per peer"); static int nlarge_tx_bufs = GMNAL_NLARGE_TX_BUFS; @@ -49,7 +53,8 @@ CFS_MODULE_PARM(nrx_large, "i", int, 0444, gmnal_tunables_t gmnal_tunables = { .gm_port = &port, .gm_ntx = &ntx, - .gm_ntx_peer = &ntx_peer, + .gm_credits = &credits, + .gm_peer_credits = &peer_credits, .gm_nlarge_tx_bufs = &nlarge_tx_bufs, .gm_nrx_small = &nrx_small, .gm_nrx_large = &nrx_large, @@ -61,13 +66,15 @@ static ctl_table gmnal_ctl_table[] = { sizeof (int), 0444, NULL, &proc_dointvec}, {2, "ntx", &ntx, sizeof (int), 0444, NULL, &proc_dointvec}, - {3, "ntx_peer", &ntx_peer, + {3, "credits", &credits, + sizeof (int), 0444, NULL, &proc_dointvec}, + {4, "peer_credits", &peer_credits, sizeof (int), 0444, NULL, &proc_dointvec}, - {4, "nlarge_tx_bufs", &nlarge_tx_bufs, + {5, "nlarge_tx_bufs", &nlarge_tx_bufs, sizeof (int), 0444, NULL, &proc_dointvec}, - {5, "nrx_small", &nrx_small, + {6, "nrx_small", &nrx_small, sizeof (int), 0444, NULL, &proc_dointvec}, - {6, "nrx_large", &nrx_large, + {7, "nrx_large", &nrx_large, sizeof (int), 0444, NULL, &proc_dointvec}, {0} }; diff --git a/lnet/klnds/iiblnd/iiblnd.h b/lnet/klnds/iiblnd/iiblnd.h index edb2865..5315b5b 100644 --- a/lnet/klnds/iiblnd/iiblnd.h +++ b/lnet/klnds/iiblnd/iiblnd.h @@ -327,7 +327,7 @@ typedef struct } WIRE_ATTR ibm_u; } WIRE_ATTR kib_msg_t; -#define IBNAL_MSG_MAGIC 0x0be91b91 /* unique magic */ +#define IBNAL_MSG_MAGIC LNET_PROTO_IIB_MAGIC /* unique magic */ #define IBNAL_MSG_VERSION 1 /* current protocol version */ #define IBNAL_MSG_CONNREQ 0xc0 /* connection request */ @@ -341,6 +341,11 @@ typedef struct #define IBNAL_MSG_GET_REQ 0xd6 /* getreq (sink->src) */ #define IBNAL_MSG_GET_DONE 0xd7 /* completion (src->sink: all OK) */ +/* connection rejection reasons */ +#define IBNAL_REJECT_CONN_RACE 0 /* You lost connection race */ +#define IBNAL_REJECT_NO_RESOURCES 1 /* Out of memory/conns etc */ +#define IBNAL_REJECT_FATAL 2 /* Anything else */ + /***********************************************************************/ typedef struct kib_rx /* receive message */ @@ -422,11 +427,6 @@ typedef struct kib_conn #define IBNAL_CONN_DISCONNECTING 4 /* to send disconnect req */ #define IBNAL_CONN_DISCONNECTED 5 /* no more QP or CM traffic */ -/* CAVEAT EMPTOR: keep in sync with kibnal_reject() */ -#define IBNAL_REJECT_NO_RESOURCES 0 /* Out of memory/conns etc */ -#define IBNAL_REJECT_CONN_RACE 1 /* You lost connection race */ -#define IBNAL_REJECT_FATAL 2 /* Anything else */ - /* types of connection */ #define IBNAL_CONN_ACTIVE 0 /* active connect */ #define IBNAL_CONN_PASSIVE 1 /* passive connect */ diff --git a/lnet/klnds/iiblnd/iiblnd_cb.c b/lnet/klnds/iiblnd/iiblnd_cb.c index ab1798b..5ed878a 100644 --- a/lnet/klnds/iiblnd/iiblnd_cb.c +++ b/lnet/klnds/iiblnd/iiblnd_cb.c @@ -2111,19 +2111,23 @@ kibnal_connreq_done (kib_conn_t *conn, int type, int status) void kibnal_reject (lnet_nid_t nid, IB_HANDLE cep, int why) { - /* CAVEAT EMPTOR: keep IBNAL_REJECT_xxx in sync */ - static CM_REJECT_INFO msgs[3] = { - {.Reason = RC_NO_RESOURCES}, /* IBNAL_REJECT_NO_RESOURCES */ - {.Reason = RC_USER_REJ, /* IBNAL_REJECT_CONN_RACE */ - .PrivateData[0] = 0}, - {.Reason = RC_USER_REJ, /* IBNAL_REJECT_FATAL */ - .PrivateData[0] = 1}}; - - CM_REJECT_INFO *msg = &msgs[why]; - FSTATUS frc; + static CM_REJECT_INFO msgs[3]; + CM_REJECT_INFO *msg = &msgs[why]; + FSTATUS frc; LASSERT (why >= 0 && why < sizeof(msgs)/sizeof(msgs[0])); + /* If I wasn't so lazy, I'd initialise this only once; it's effectively + * read-only... */ + msg->Reason = RC_USER_REJ; + msg->PrivateData[0] = (IBNAL_MSG_MAGIC) & 0xff; + msg->PrivateData[1] = (IBNAL_MSG_MAGIC >> 8) & 0xff; + msg->PrivateData[2] = (IBNAL_MSG_MAGIC >> 16) & 0xff; + msg->PrivateData[3] = (IBNAL_MSG_MAGIC >> 24) & 0xff; + msg->PrivateData[4] = (IBNAL_MSG_VERSION) & 0xff; + msg->PrivateData[5] = (IBNAL_MSG_VERSION >> 8) & 0xff; + msg->PrivateData[6] = why; + frc = iba_cm_reject(cep, msg); if (frc != FSUCCESS) CERROR("Error %d rejecting %s\n", frc, libcfs_nid2str(nid)); @@ -2132,9 +2136,12 @@ kibnal_reject (lnet_nid_t nid, IB_HANDLE cep, int why) void kibnal_check_connreject(kib_conn_t *conn, int type, CM_REJECT_INFO *rej) { - kib_peer_t *peer = conn->ibc_peer; - unsigned long flags; - + kib_peer_t *peer = conn->ibc_peer; + unsigned long flags; + int magic; + int version; + int why; + LASSERT (type == IBNAL_CONN_ACTIVE || type == IBNAL_CONN_PASSIVE); @@ -2167,11 +2174,31 @@ kibnal_check_connreject(kib_conn_t *conn, int type, CM_REJECT_INFO *rej) break; case RC_USER_REJ: + magic = (rej->PrivateData[0]) | + (rej->PrivateData[1] << 8) | + (rej->PrivateData[2] << 16) | + (rej->PrivateData[3] << 24); + version = (rej->PrivateData[4]) | + (rej->PrivateData[5] << 8); + why = (rej->PrivateData[6]); + + if (magic != IBNAL_MSG_MAGIC || + version != IBNAL_MSG_VERSION) { + CERROR("%s connection with %s rejected " + "(magic/ver %08x/%d why %d): " + "incompatible protocol\n", + (type == IBNAL_CONN_ACTIVE) ? + "Active" : "Passive", + libcfs_nid2str(peer->ibp_nid), + magic, version, why); + break; + } + if (type == IBNAL_CONN_ACTIVE && - rej->PrivateData[0] == 0) { + why == IBNAL_REJECT_CONN_RACE) { /* lost connection race */ - CWARN("Connection to %s rejected " - "(lost connection race)\n", + CWARN("Connection to %s rejected: " + "lost connection race\n", libcfs_nid2str(peer->ibp_nid)); write_lock_irqsave(&kibnal_data.kib_global_lock, @@ -2187,7 +2214,12 @@ kibnal_check_connreject(kib_conn_t *conn, int type, CM_REJECT_INFO *rej) flags); break; } - /* fall through */ + + CERROR("%s connection with %s rejected: %d\n", + (type == IBNAL_CONN_ACTIVE) ? "Active" : "Passive", + libcfs_nid2str(peer->ibp_nid), why); + break; + default: CERROR("%s connection with %s rejected: %d\n", (type == IBNAL_CONN_ACTIVE) ? "Active" : "Passive", @@ -2271,6 +2303,22 @@ kibnal_accept (kib_conn_t **connp, IB_HANDLE cep, kib_msg_t *msg, int nob) unsigned long flags; int rc; + if ((msg->ibm_magic == LNET_PROTO_MAGIC || + msg->ibm_magic == __swab32(LNET_PROTO_MAGIC)) || + (msg->ibm_magic == IBNAL_MSG_MAGIC && + msg->ibm_version != IBNAL_MSG_VERSION) || + (msg->ibm_magic == __swab32(IBNAL_MSG_MAGIC) && + msg->ibm_version != __swab16(IBNAL_MSG_VERSION))) { + /* Future protocol version compatibility support! + * If the iiblnd-specific protocol changes, or when LNET + * unifies protocols over all LNDs, the initial connection will + * negotiate a protocol version. I trap this here to avoid + * console errors; the reject tells the peer which protocol I + * speak. */ + kibnal_reject(LNET_NID_ANY, cep, IBNAL_REJECT_FATAL); + return -EPROTO; + } + rc = kibnal_unpack_msg(msg, nob); if (rc != 0) { CERROR("Error %d unpacking connreq\n", rc); @@ -2644,6 +2692,21 @@ kibnal_pathreq_callback (void *arg, QUERY *qry, IBNAL_MSG_CONNREQ, conn->ibc_peer->ibp_nid, 0); + if (the_lnet.ln_testprotocompat != 0) { + /* single-shot proto test */ + LNET_LOCK(); + if ((the_lnet.ln_testprotocompat & 1) != 0) { + ((kib_msg_t *)req->PrivateData)->ibm_version++; + the_lnet.ln_testprotocompat &= ~1; + } + if ((the_lnet.ln_testprotocompat & 2) != 0) { + ((kib_msg_t *)req->PrivateData)->ibm_magic = + LNET_PROTO_MAGIC; + the_lnet.ln_testprotocompat &= ~2; + } + LNET_UNLOCK(); + } + /* Flag I'm getting involved with the CM... */ kibnal_set_conn_state(conn, IBNAL_CONN_CONNECTING); diff --git a/lnet/klnds/openiblnd/openiblnd.c b/lnet/klnds/openiblnd/openiblnd.c index a7c4bcd..a30d521 100644 --- a/lnet/klnds/openiblnd/openiblnd.c +++ b/lnet/klnds/openiblnd/openiblnd.c @@ -228,6 +228,20 @@ kibnal_make_svcqry (kib_conn_t *conn) kibnal_init_msg(msg, IBNAL_MSG_SVCQRY, 0); kibnal_pack_msg(msg, 0, peer->ibp_nid, 0); + if (the_lnet.ln_testprotocompat != 0) { + /* single-shot proto test */ + LNET_LOCK(); + if ((the_lnet.ln_testprotocompat & 1) != 0) { + msg->ibm_version++; + the_lnet.ln_testprotocompat &= ~1; + } + if ((the_lnet.ln_testprotocompat & 2) != 0) { + msg->ibm_magic = LNET_PROTO_MAGIC; + the_lnet.ln_testprotocompat &= ~2; + } + LNET_UNLOCK(); + } + rc = lnet_connect(&sock, peer->ibp_nid, 0, peer->ibp_ip, peer->ibp_port); if (rc != 0) @@ -302,6 +316,7 @@ kibnal_handle_svcqry (struct socket *sock) kib_msg_t *msg; __u64 srcnid; __u64 srcstamp; + int version; int rc; rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port); @@ -320,31 +335,88 @@ kibnal_handle_svcqry (struct socket *sock) rc = libcfs_sock_read(sock, &msg->ibm_magic, sizeof(msg->ibm_magic), lnet_acceptor_timeout()); if (rc != 0) { - CERROR("Error %d receiving svcqry from %u.%u.%u.%u/%d\n", + CERROR("Error %d receiving svcqry(1) from %u.%u.%u.%u/%d\n", rc, HIPQUAD(peer_ip), peer_port); goto out; } - if (msg->ibm_magic == IBNAL_MSG_MAGIC || - msg->ibm_magic == __swab32(IBNAL_MSG_MAGIC)) { - /* it's my magic; read the rest in... */ - rc = libcfs_sock_read(sock, &msg->ibm_version, - offsetof(kib_msg_t, ibm_u) - - offsetof(kib_msg_t, ibm_version), - lnet_acceptor_timeout()); - } else { - /* This might be a generic acceptor connection request... */ + if (msg->ibm_magic != IBNAL_MSG_MAGIC && + msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) { + /* Unexpected magic! */ + if (the_lnet.ln_ptlcompat == 0) { + if (msg->ibm_magic == LNET_PROTO_MAGIC || + msg->ibm_magic == __swab32(LNET_PROTO_MAGIC)) { + /* future protocol version compatibility! + * When LNET unifies protocols over all LNDs, + * the first thing sent will be a version + * query. I send back a reply in my current + * protocol to tell her I'm "old" */ + kibnal_init_msg(msg, 0, 0); + kibnal_pack_msg(msg, 0, LNET_NID_ANY, 0); + goto reply; + } + + CERROR ("Bad magic(1) %#08x (%#08x expected) from " + "%u.%u.%u.%u/%d\n", msg->ibm_magic, + IBNAL_MSG_MAGIC, HIPQUAD(peer_ip), peer_port); + goto out; + } + + /* When portals compatibility is set, I may be passed a new + * connection "blindly" by the acceptor, and I have to + * determine if my peer has sent an acceptor connection request + * or not. */ rc = lnet_accept(kibnal_data.kib_ni, sock, msg->ibm_magic); if (rc != 0) goto out; - /* ...followed by my service query */ - rc = libcfs_sock_read(sock, msg, offsetof(kib_msg_t, ibm_u), + /* It was an acceptor connection request! + * Now I should see my magic... */ + rc = libcfs_sock_read(sock, &msg->ibm_magic, + sizeof(msg->ibm_magic), lnet_acceptor_timeout()); + if (rc != 0) { + CERROR("Error %d receiving svcqry(2) from %u.%u.%u.%u/%d\n", + rc, HIPQUAD(peer_ip), peer_port); + goto out; + } + + if (msg->ibm_magic != IBNAL_MSG_MAGIC && + msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) { + CERROR ("Bad magic(2) %#08x (%#08x expected) from " + "%u.%u.%u.%u/%d\n", msg->ibm_magic, + IBNAL_MSG_MAGIC, HIPQUAD(peer_ip), peer_port); + goto out; + } + } + + /* Now check version */ + + rc = libcfs_sock_read(sock, &msg->ibm_version, sizeof(msg->ibm_version), + lnet_acceptor_timeout()); + if (rc != 0) { + CERROR("Error %d receiving svcqry(3) from %u.%u.%u.%u/%d\n", + rc, HIPQUAD(peer_ip), peer_port); + goto out; + } + + version = (msg->ibm_magic == IBNAL_MSG_MAGIC) ? + msg->ibm_version : __swab32(msg->ibm_version); + /* Peer is a different protocol version: reply in my current protocol + * to tell her I'm "old" */ + if (version != IBNAL_MSG_VERSION) { + kibnal_init_msg(msg, 0, 0); + kibnal_pack_msg(msg, 0, LNET_NID_ANY, 0); + goto reply; } + /* Now read in all the rest */ + rc = libcfs_sock_read(sock, &msg->ibm_type, + offsetof(kib_msg_t, ibm_u) - + offsetof(kib_msg_t, ibm_type), + lnet_acceptor_timeout()); if (rc != 0) { - CERROR("Error %d receiving svcqry from %u.%u.%u.%u/%d\n", + CERROR("Error %d receiving svcqry(4) from %u.%u.%u.%u/%d\n", rc, HIPQUAD(peer_ip), peer_port); goto out; } @@ -382,7 +454,8 @@ kibnal_handle_svcqry (struct socket *sock) msg->ibm_u.svcrsp.ibsr_svc_pkey = kibnal_data.kib_svc_pkey; kibnal_pack_msg(msg, 0, srcnid, srcstamp); - + + reply: rc = libcfs_sock_write (sock, msg, msg->ibm_nob, 0); if (rc != 0) { CERROR("Error %d replying to svcqry from %u.%u.%u.%u/%d\n", diff --git a/lnet/klnds/openiblnd/openiblnd.h b/lnet/klnds/openiblnd/openiblnd.h index 3b5959a..153a543 100644 --- a/lnet/klnds/openiblnd/openiblnd.h +++ b/lnet/klnds/openiblnd/openiblnd.h @@ -480,19 +480,6 @@ kibnal_wreqid_is_rx (__u64 wreqid) return (wreqid & 1) != 0; } -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)) -# define sk_allocation allocation -# define sk_data_ready data_ready -# define sk_write_space write_space -# define sk_user_data user_data -# define sk_prot prot -# define sk_sndbuf sndbuf -# define sk_socket socket -# define sk_wmem_queued wmem_queued -# define sk_err err -# define sk_sleep sleep -#endif - int kibnal_startup (lnet_ni_t *ni); void kibnal_shutdown (lnet_ni_t *ni); int kibnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg); diff --git a/lnet/klnds/ptllnd/ptllnd_peer.c b/lnet/klnds/ptllnd/ptllnd_peer.c index a5e6702..cc5043a 100644 --- a/lnet/klnds/ptllnd/ptllnd_peer.c +++ b/lnet/klnds/ptllnd/ptllnd_peer.c @@ -751,9 +751,22 @@ kptllnd_peer_check_sends ( goto failed_without_lock; } STAT_UPDATE(kps_posted_tx_bulk_mds); + + } else if (tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_HELLO && + the_lnet.ln_testprotocompat != 0) { + /* single-shot proto test */ + LNET_LOCK(); + if ((the_lnet.ln_testprotocompat & 1) != 0) { + tx->tx_msg->ptlm_version++; + the_lnet.ln_testprotocompat &= ~1; + } + if ((the_lnet.ln_testprotocompat & 2) != 0) { + tx->tx_msg->ptlm_magic = LNET_PROTO_MAGIC; + the_lnet.ln_testprotocompat &= ~2; + } + LNET_UNLOCK(); } - - + /* * Setup the MD */ @@ -1392,7 +1405,6 @@ kptllnd_tx_launch (kptl_tx_t *tx, */ CDEBUG(D_NET, "TXHello=%p\n", hello_tx); - spin_lock_irqsave(&peer->peer_lock, flags); kptllnd_peer_queue_tx_locked(peer, hello_tx); kptllnd_peer_queue_tx_locked(peer, tx); diff --git a/lnet/klnds/ptllnd/ptllnd_rx_buf.c b/lnet/klnds/ptllnd/ptllnd_rx_buf.c index 3430cda..b725c06 100644 --- a/lnet/klnds/ptllnd/ptllnd_rx_buf.c +++ b/lnet/klnds/ptllnd/ptllnd_rx_buf.c @@ -614,6 +614,46 @@ kptllnd_rx_schedule (kptl_rx_t *rx) spin_unlock_irqrestore(&kptllnd_data->kptl_sched_lock, flags); } +void +kptllnd_version_nak (kptl_rx_t *rx) +{ + /* Fire-and-forget a stub message that will let the peer know my + * protocol magic/version */ + static struct { + __u32 magic; + __u16 version; + } version_reply = { + .magic = PTLLND_MSG_MAGIC, + .version = PTLLND_MSG_VERSION}; + + static ptl_md_t md = { + .start = &version_reply, + .length = sizeof(version_reply), + .threshold = 1, + .options = 0, + .user_ptr = NULL, + .eq_handle = PTL_EQ_NONE}; + + ptl_handle_md_t mdh; + kptl_rx_buffer_t *rxb = rx->rx_rxb; + kptl_data_t *kptllnd_data = rxb->rxb_po.po_kptllnd_data; + int rc; + + rc = PtlMDBind(kptllnd_data->kptl_nih, md, PTL_UNLINK, &mdh); + if (rc != PTL_OK) { + CERROR("Can't version NAK "FMT_NID"/%d: bind failed %d\n", + rx->rx_initiator.nid, rx->rx_initiator.pid, rc); + return; + } + + rc = PtlPut(mdh, PTL_NOACK_REQ, rx->rx_initiator, + *kptllnd_tunables.kptl_portal, 0, + LNET_MSG_MATCHBITS, 0, 0); + + if (rc != PTL_OK) + CERROR("Can't version NAK "FMT_NID"/%d: put failed %d\n", + rx->rx_initiator.nid, rx->rx_initiator.pid, rc); +} void kptllnd_rx_scheduler_handler(kptl_rx_t *rx) @@ -629,8 +669,20 @@ kptllnd_rx_scheduler_handler(kptl_rx_t *rx) CDEBUG(D_NET, ">>> RXRXRXRXRX rx=%p nob=%d "FMT_NID"/%d\n", rx, rx->rx_nob, rx->rx_initiator.nid, rx->rx_initiator.pid); - if (rx->rx_nob == 0) { - /* discard silently!!! */ + if ((rx->rx_nob >= 4 && + (msg->ptlm_magic == LNET_PROTO_MAGIC || + msg->ptlm_magic == __swab32(LNET_PROTO_MAGIC))) || + (rx->rx_nob >= 6 && + ((msg->ptlm_magic == PTLLND_MSG_MAGIC && + msg->ptlm_version != PTLLND_MSG_VERSION) || + (msg->ptlm_magic == __swab32(PTLLND_MSG_MAGIC) && + msg->ptlm_version != __swab16(PTLLND_MSG_VERSION))))) { + /* Future protocol compatibility support! + * When LNET unifies protocols over all LNDs, or if the + * ptllnd-specific protocol changes, it will expect "old" peers + * to reply with a stub message containing their + * magic/version. */ + kptllnd_version_nak(rx); goto out; } diff --git a/lnet/klnds/qswlnd/qswlnd.c b/lnet/klnds/qswlnd/qswlnd.c index 57400b3..67f74b2 100644 --- a/lnet/klnds/qswlnd/qswlnd.c +++ b/lnet/klnds/qswlnd/qswlnd.c @@ -341,8 +341,12 @@ kqswnal_startup (lnet_ni_t *ni) /* Leave kqn_rpc_success zeroed */ #if MULTIRAIL_EKC kqswnal_data.kqn_rpc_failed.Data[0] = -ECONNREFUSED; + kqswnal_data.kqn_rpc_version.Data[0] = QSWLND_PROTO_VERSION; + kqswnal_data.kqn_rpc_magic.Data[0] = LNET_PROTO_QSW_MAGIC; #else kqswnal_data.kqn_rpc_failed.Status = -ECONNREFUSED; + kqswnal_data.kqn_rpc_version.Status = QSWLND_PROTO_VERSION; + kqswnal_data.kqn_rpc_magic.Status = LNET_PROTO_QSW_MAGIC; #endif /* pointers/lists/locks initialised */ diff --git a/lnet/klnds/qswlnd/qswlnd.h b/lnet/klnds/qswlnd/qswlnd.h index c5f3c61..d9d55ed 100644 --- a/lnet/klnds/qswlnd/qswlnd.h +++ b/lnet/klnds/qswlnd/qswlnd.h @@ -78,8 +78,6 @@ #include #include -#define KQSW_HDR_SIZE (sizeof (lnet_hdr_t)) - /* * Performance Tuning defines * NB no mention of PAGE_SIZE for interoperability @@ -100,29 +98,31 @@ #define KQSW_OPTIMIZED_PUTS (32<<10) /* optimize puts >= this size */ /* fixed constants */ -#define KQSW_SMALLPAYLOAD ((4<<10) - KQSW_HDR_SIZE) /* small/large ep receiver breakpoint */ +#define KQSW_SMALLMSG (4<<10) /* small/large ep receiver breakpoint */ #define KQSW_RESCHED 100 /* # busy loops that forces scheduler to yield */ /* * derived constants */ -#define KQSW_TX_BUFFER_SIZE (KQSW_HDR_SIZE + *kqswnal_tunables.kqn_tx_maxcontig) +#define KQSW_TX_BUFFER_SIZE (offsetof(kqswnal_msg_t, \ + kqm_u.immediate.kqim_payload[*kqswnal_tunables.kqn_tx_maxcontig])) /* The pre-allocated tx buffer (hdr + small payload) */ #define KQSW_NTXMSGPAGES (btopr(KQSW_TX_BUFFER_SIZE) + 1 + btopr(LNET_MTU) + 1) /* Reserve elan address space for pre-allocated and pre-mapped transmit * buffer and a full payload too. Extra pages allow for page alignment */ -#define KQSW_NRXMSGPAGES_SMALL (btopr(KQSW_HDR_SIZE + KQSW_SMALLPAYLOAD)) +#define KQSW_NRXMSGPAGES_SMALL (btopr(KQSW_SMALLMSG)) /* receive hdr/payload always contiguous and page aligned */ #define KQSW_NRXMSGBYTES_SMALL (KQSW_NRXMSGPAGES_SMALL * PAGE_SIZE) -#define KQSW_NRXMSGPAGES_LARGE (btopr(KQSW_HDR_SIZE + LNET_MTU)) +#define KQSW_NRXMSGPAGES_LARGE (btopr(sizeof(lnet_msg_t) + LNET_MTU)) /* receive hdr/payload always contiguous and page aligned */ #define KQSW_NRXMSGBYTES_LARGE (KQSW_NRXMSGPAGES_LARGE * PAGE_SIZE) /* biggest complete packet we can receive (or transmit) */ +/* Wire messages */ /* Remote memory descriptor */ typedef struct { @@ -134,6 +134,36 @@ typedef struct #endif } kqswnal_remotemd_t; +/* RDMA request */ +typedef struct +{ + lnet_hdr_t kqim_hdr; /* LNET header */ + char kqim_payload[0]; /* piggy-backed payload */ +} WIRE_ATTR kqswnal_immediate_msg_t; + +/* Immediate data */ +typedef struct +{ + lnet_hdr_t kqrm_hdr; /* LNET header */ + kqswnal_remotemd_t kqrm_rmd; /* peer's buffer */ +} WIRE_ATTR kqswnal_rdma_msg_t; + +typedef struct +{ + __u32 kqm_magic; /* I'm a qswlnd message */ + __u16 kqm_version; /* this is my version number */ + __u16 kqm_type; /* msg type */ + union { + kqswnal_immediate_msg_t immediate; + kqswnal_rdma_msg_t rdma; + } WIRE_ATTR kqm_u; +} WIRE_ATTR kqswnal_msg_t; + +#define QSWLND_PROTO_VERSION 1 + +#define QSWLND_MSG_IMMEDIATE 0 +#define QSWLND_MSG_RDMA 1 + typedef struct kqswnal_rx { struct list_head krx_list; /* enqueue -> thread */ @@ -147,7 +177,8 @@ typedef struct kqswnal_rx #endif int krx_npages; /* # pages in receive buffer */ int krx_nob; /* Number Of Bytes received into buffer */ - int krx_rpc_reply_needed; /* peer waiting for EKC RPC reply */ + int krx_rpc_reply_needed:1; /* peer waiting for EKC RPC reply */ + int krx_raw_lnet_hdr:1; /* msg is a raw lnet hdr (portals compatible) */ int krx_rpc_reply_status; /* what status to send */ int krx_state; /* what this RX is doing */ atomic_t krx_refcount; /* how to tell when rpc is done */ @@ -170,7 +201,7 @@ typedef struct kqswnal_tx int ktx_npages; /* pages reserved for mapping messages */ int ktx_nmappedpages; /* # pages mapped for current message */ int ktx_port; /* destination ep port */ - lnet_nid_t ktx_nid; /* destination node */ + lnet_nid_t ktx_nid; /* destination node */ void *ktx_args[3]; /* completion passthru */ char *ktx_buffer; /* pre-allocated contiguous buffer for hdr + small payloads */ unsigned long ktx_launchtime; /* when (in jiffies) the transmit was launched */ @@ -254,6 +285,8 @@ typedef struct EP_STATUSBLK kqn_rpc_success; /* preset RPC reply status blocks */ EP_STATUSBLK kqn_rpc_failed; + EP_STATUSBLK kqn_rpc_version; /* reply to future version query */ + EP_STATUSBLK kqn_rpc_magic; /* reply to future version query */ } kqswnal_data_t; /* kqn_init state */ diff --git a/lnet/klnds/qswlnd/qswlnd_cb.c b/lnet/klnds/qswlnd/qswlnd_cb.c index 23ddfd8..8bae5e5 100644 --- a/lnet/klnds/qswlnd/qswlnd_cb.c +++ b/lnet/klnds/qswlnd/qswlnd_cb.c @@ -445,6 +445,9 @@ kqswnal_txhandler(EP_TXD *txd, void *arg, int status) #else status = ep_txd_statusblk(txd)->Status; #endif + if (status != 0) + CERROR("%s RPC status %08x\n", + libcfs_nid2str(ktx->ktx_nid), status); break; case KTX_SENDING: @@ -483,6 +486,25 @@ kqswnal_launch (kqswnal_tx_t *ktx) switch (ktx->ktx_state) { case KTX_GETTING: case KTX_PUTTING: + if (the_lnet.ln_testprotocompat != 0 && + the_lnet.ln_ptlcompat == 0) { + kqswnal_msg_t *msg = (kqswnal_msg_t *)ktx->ktx_buffer; + + /* single-shot proto test: + * Future version queries will use an RPC, so I'll + * co-opt one of the existing ones */ + LNET_LOCK(); + if ((the_lnet.ln_testprotocompat & 1) != 0) { + msg->kqm_version++; + the_lnet.ln_testprotocompat &= ~1; + } + if ((the_lnet.ln_testprotocompat & 2) != 0) { + msg->kqm_magic = LNET_PROTO_MAGIC; + the_lnet.ln_testprotocompat &= ~2; + } + LNET_UNLOCK(); + } + /* NB ktx_frag[0] is the GET/PUT hdr + kqswnal_remotemd_t. * The other frags are the payload, awaiting RDMA */ rc = ep_transmit_rpc(kqswnal_data.kqn_eptx, dest, @@ -688,25 +710,16 @@ kqswnal_check_rdma (int nlfrag, EP_NMD *lfrag, #endif kqswnal_remotemd_t * -kqswnal_parse_rmd (kqswnal_rx_t *krx, int type) +kqswnal_get_portalscompat_rmd (kqswnal_rx_t *krx) { + /* Check that the RMD sent after the "raw" LNET header in a + * portals-compatible QSWLND message is OK */ char *buffer = (char *)page_address(krx->krx_kiov[0].kiov_page); - lnet_hdr_t *hdr = (lnet_hdr_t *)buffer; - kqswnal_remotemd_t *rmd = (kqswnal_remotemd_t *)(buffer + KQSW_HDR_SIZE); + kqswnal_remotemd_t *rmd = (kqswnal_remotemd_t *)(buffer + sizeof(lnet_hdr_t)); - /* Note RDMA addresses are sent in native endian-ness. When - * EKC copes with different endian nodes, I'll fix this (and - * eat my hat :) */ + /* Note RDMA addresses are sent in native endian-ness in the "old" + * portals protocol so no swabbing... */ - LASSERT (krx->krx_nob >= sizeof(*hdr)); - - if (le32_to_cpu(hdr->type) != type) { - CERROR ("Unexpected optimized get/put type %d (%d expected)" - "from %s\n", le32_to_cpu(hdr->type), type, - libcfs_nid2str(kqswnal_rx_nid(krx))); - return (NULL); - } - if (buffer + krx->krx_nob < (char *)(rmd + 1)) { /* msg too small to discover rmd size */ CERROR ("Incoming message [%d] too small for RMD (%d needed)\n", @@ -796,11 +809,11 @@ kqswnal_rdma_fetch_complete (EP_RXD *rxd) } int -kqswnal_rdma (kqswnal_rx_t *krx, lnet_msg_t *lntmsg, int type, +kqswnal_rdma (kqswnal_rx_t *krx, lnet_msg_t *lntmsg, + int rdma_store, kqswnal_remotemd_t *rmd, unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov, unsigned int offset, unsigned int len) { - kqswnal_remotemd_t *rmd; kqswnal_tx_t *ktx; int eprc; int rc; @@ -809,19 +822,12 @@ kqswnal_rdma (kqswnal_rx_t *krx, lnet_msg_t *lntmsg, int type, int ndatav; #endif - LASSERT (type == LNET_MSG_GET || - type == LNET_MSG_PUT || - type == LNET_MSG_REPLY); /* Not both mapped and paged payload */ LASSERT (iov == NULL || kiov == NULL); /* RPC completes with failure by default */ LASSERT (krx->krx_rpc_reply_needed); LASSERT (krx->krx_rpc_reply_status != 0); - rmd = kqswnal_parse_rmd(krx, type); - if (rmd == NULL) - return (-EPROTO); - if (len == 0) { /* data got truncated to nothing. */ lnet_finalize(kqswnal_data.kqn_ni, lntmsg, 0); @@ -875,24 +881,17 @@ kqswnal_rdma (kqswnal_rx_t *krx, lnet_msg_t *lntmsg, int type, goto out; } #else - switch (type) { - default: - LBUG(); - case LNET_MSG_GET: + if (rdma_store) { ndatav = kqswnal_eiovs2datav(EP_MAXFRAG, datav, ktx->ktx_nfrag, ktx->ktx_frags, rmd->kqrmd_nfrag, rmd->kqrmd_frag); - break; - - case LNET_MSG_PUT: - case LNET_MSG_REPLY: + } else { ndatav = kqswnal_eiovs2datav(EP_MAXFRAG, datav, rmd->kqrmd_nfrag, rmd->kqrmd_frag, ktx->ktx_nfrag, ktx->ktx_frags); - break; } - + if (ndatav < 0) { CERROR ("Can't create datavec: %d\n", ndatav); rc = ndatav; @@ -900,11 +899,8 @@ kqswnal_rdma (kqswnal_rx_t *krx, lnet_msg_t *lntmsg, int type, } #endif - switch (type) { - default: - LBUG(); - case LNET_MSG_GET: + if (rdma_store) { #if MULTIRAIL_EKC eprc = ep_complete_rpc(krx->krx_rxd, kqswnal_rdma_store_complete, ktx, @@ -924,10 +920,7 @@ kqswnal_rdma (kqswnal_rx_t *krx, lnet_msg_t *lntmsg, int type, krx->krx_rpc_reply_needed = 0; rc = -ECONNABORTED; } - break; - - case LNET_MSG_PUT: - case LNET_MSG_REPLY: + } else { #if MULTIRAIL_EKC eprc = ep_rpc_get (krx->krx_rxd, kqswnal_rdma_fetch_complete, ktx, @@ -944,7 +937,6 @@ kqswnal_rdma (kqswnal_rx_t *krx, lnet_msg_t *lntmsg, int type, krx->krx_rpc_reply_needed = 0; rc = -ECONNABORTED; } - break; } out: @@ -970,6 +962,7 @@ kqswnal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) lnet_kiov_t *payload_kiov = lntmsg->msg_kiov; unsigned int payload_offset = lntmsg->msg_offset; unsigned int payload_nob = lntmsg->msg_len; + int nob; kqswnal_tx_t *ktx; int rc; /* NB 1. hdr is in network byte order */ @@ -1006,10 +999,7 @@ kqswnal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) ktx->ktx_args[1] = lntmsg; ktx->ktx_args[2] = NULL; /* set when a GET commits to REPLY */ - memcpy (ktx->ktx_buffer, hdr, sizeof (*hdr)); /* copy hdr from caller's stack */ - - /* The first frag will be the pre-mapped buffer for (at least) the - * portals header. */ + /* The first frag will be the pre-mapped buffer. */ ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 1; if ((!target_is_router && /* target.nid is final dest */ @@ -1023,17 +1013,36 @@ kqswnal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) *kqswnal_tunables.kqn_optimized_puts != 0 && payload_nob >= *kqswnal_tunables.kqn_optimized_puts)) { lnet_libmd_t *md = lntmsg->msg_md; - kqswnal_remotemd_t *rmd = (kqswnal_remotemd_t *)(ktx->ktx_buffer + KQSW_HDR_SIZE); - + lnet_hdr_t *mhdr; + kqswnal_remotemd_t *rmd; + /* Optimised path: I send over the Elan vaddrs of the local * buffers, and my peer DMAs directly to/from them. * * First I set up ktx as if it was going to send this * payload, (it needs to map it anyway). This fills * ktx_frags[1] and onward with the network addresses - * of the GET sink frags. I copy these into ktx_buffer, - * immediately after the header, and send that as my - * message. */ + * of the buffer frags. */ + + if (the_lnet.ln_ptlcompat == 2) { + /* Strong portals compatibility: send "raw" LNET + * header + rdma descriptor */ + mhdr = (lnet_hdr_t *)ktx->ktx_buffer; + rmd = (kqswnal_remotemd_t *)(mhdr + 1); + } else { + /* Send an RDMA message */ + kqswnal_msg_t *msg = (kqswnal_msg_t *)ktx->ktx_buffer; + + msg->kqm_magic = LNET_PROTO_QSW_MAGIC; + msg->kqm_version = QSWLND_PROTO_VERSION; + msg->kqm_type = QSWLND_MSG_RDMA; + + mhdr = &msg->kqm_u.rdma.kqrm_hdr; + rmd = &msg->kqm_u.rdma.kqrm_rmd; + } + + *mhdr = *hdr; + nob = (((char *)rmd) - ktx->ktx_buffer); if (type == LNET_MSG_GET) { if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) != 0) @@ -1057,23 +1066,21 @@ kqswnal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) goto out; rmd->kqrmd_nfrag = ktx->ktx_nfrag - 1; - - payload_nob = offsetof(kqswnal_remotemd_t, - kqrmd_frag[rmd->kqrmd_nfrag]); - LASSERT (KQSW_HDR_SIZE + payload_nob <= KQSW_TX_BUFFER_SIZE); + nob += offsetof(kqswnal_remotemd_t, + kqrmd_frag[rmd->kqrmd_nfrag]); + LASSERT (nob <= KQSW_TX_BUFFER_SIZE); #if MULTIRAIL_EKC memcpy(&rmd->kqrmd_frag[0], &ktx->ktx_frags[1], rmd->kqrmd_nfrag * sizeof(EP_NMD)); - ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer, - 0, KQSW_HDR_SIZE + payload_nob); + ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer, 0, nob); #else memcpy(&rmd->kqrmd_frag[0], &ktx->ktx_frags[1], rmd->kqrmd_nfrag * sizeof(EP_IOVEC)); ktx->ktx_frags[0].Base = ktx->ktx_ebuffer; - ktx->ktx_frags[0].Len = KQSW_HDR_SIZE + payload_nob; + ktx->ktx_frags[0].Len = nob; #endif if (type == LNET_MSG_GET) { /* Allocate reply message now while I'm in thread context */ @@ -1087,36 +1094,73 @@ kqswnal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) } } else if (payload_nob <= *kqswnal_tunables.kqn_tx_maxcontig) { - + lnet_hdr_t *mhdr; + char *payload; + /* small message: single frag copied into the pre-mapped buffer */ + if (the_lnet.ln_ptlcompat == 2) { + /* Strong portals compatibility: send "raw" LNET header + * + payload */ + mhdr = (lnet_hdr_t *)ktx->ktx_buffer; + payload = (char *)(mhdr + 1); + } else { + /* Send an IMMEDIATE message */ + kqswnal_msg_t *msg = (kqswnal_msg_t *)ktx->ktx_buffer; + + msg->kqm_magic = LNET_PROTO_QSW_MAGIC; + msg->kqm_version = QSWLND_PROTO_VERSION; + msg->kqm_type = QSWLND_MSG_IMMEDIATE; + + mhdr = &msg->kqm_u.immediate.kqim_hdr; + payload = msg->kqm_u.immediate.kqim_payload; + } + + *mhdr = *hdr; + nob = (payload - ktx->ktx_buffer) + payload_nob; #if MULTIRAIL_EKC - ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer, - 0, KQSW_HDR_SIZE + payload_nob); + ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer, 0, nob); #else ktx->ktx_frags[0].Base = ktx->ktx_ebuffer; - ktx->ktx_frags[0].Len = KQSW_HDR_SIZE + payload_nob; + ktx->ktx_frags[0].Len = nob; #endif if (payload_kiov != NULL) - lnet_copy_kiov2flat(KQSW_TX_BUFFER_SIZE, ktx->ktx_buffer, - KQSW_HDR_SIZE, + lnet_copy_kiov2flat(KQSW_TX_BUFFER_SIZE, payload, 0, payload_niov, payload_kiov, payload_offset, payload_nob); else - lnet_copy_iov2flat(KQSW_TX_BUFFER_SIZE, ktx->ktx_buffer, - KQSW_HDR_SIZE, + lnet_copy_iov2flat(KQSW_TX_BUFFER_SIZE, payload, 0, payload_niov, payload_iov, payload_offset, payload_nob); } else { - + lnet_hdr_t *mhdr; + /* large message: multiple frags: first is hdr in pre-mapped buffer */ - + if (the_lnet.ln_ptlcompat == 2) { + /* Strong portals compatibility: send "raw" LNET header + * + payload */ + mhdr = (lnet_hdr_t *)ktx->ktx_buffer; + nob = sizeof(lnet_hdr_t); + } else { + /* Send an IMMEDIATE message */ + kqswnal_msg_t *msg = (kqswnal_msg_t *)ktx->ktx_buffer; + + msg->kqm_magic = LNET_PROTO_QSW_MAGIC; + msg->kqm_version = QSWLND_PROTO_VERSION; + msg->kqm_type = QSWLND_MSG_IMMEDIATE; + + mhdr = &msg->kqm_u.immediate.kqim_hdr; + nob = offsetof(kqswnal_msg_t, + kqm_u.immediate.kqim_payload); + } + + *mhdr = *hdr; + #if MULTIRAIL_EKC - ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer, - 0, KQSW_HDR_SIZE); + ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer, 0, nob); #else ktx->ktx_frags[0].Base = ktx->ktx_ebuffer; - ktx->ktx_frags[0].Len = KQSW_HDR_SIZE; + ktx->ktx_frags[0].Len = nob; #endif if (payload_kiov != NULL) rc = kqswnal_map_tx_kiov (ktx, payload_offset, payload_nob, @@ -1126,18 +1170,20 @@ kqswnal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) payload_niov, payload_iov); if (rc != 0) goto out; + + nob += payload_nob; } - ktx->ktx_port = (payload_nob <= KQSW_SMALLPAYLOAD) ? + ktx->ktx_port = (nob <= KQSW_SMALLMSG) ? EP_MSG_SVC_PORTALS_SMALL : EP_MSG_SVC_PORTALS_LARGE; rc = kqswnal_launch (ktx); out: - CDEBUG(rc == 0 ? D_NET : D_ERROR, "%s %u bytes to %s%s: rc %d\n", + CDEBUG(rc == 0 ? D_NET : D_ERROR, "%s %d bytes to %s%s: rc %d\n", routing ? (rc == 0 ? "Routed" : "Failed to route") : (rc == 0 ? "Sent" : "Failed to send"), - payload_nob, libcfs_nid2str(target.nid), + nob, libcfs_nid2str(target.nid), target_is_router ? "(router)" : "", rc); if (rc != 0) { @@ -1231,6 +1277,10 @@ kqswnal_rx_done (kqswnal_rx_t *krx) /* We've not completed the peer's RPC yet... */ sblk = (krx->krx_rpc_reply_status == 0) ? &kqswnal_data.kqn_rpc_success : + (krx->krx_rpc_reply_status == LNET_PROTO_QSW_MAGIC) ? + &kqswnal_data.kqn_rpc_magic : + (krx->krx_rpc_reply_status == QSWLND_PROTO_VERSION) ? + &kqswnal_data.kqn_rpc_version : &kqswnal_data.kqn_rpc_failed; LASSERT (!in_interrupt()); @@ -1261,20 +1311,174 @@ void kqswnal_parse (kqswnal_rx_t *krx) { lnet_ni_t *ni = kqswnal_data.kqn_ni; - lnet_hdr_t *hdr = (lnet_hdr_t *) page_address(krx->krx_kiov[0].kiov_page); - lnet_nid_t fromnid; + kqswnal_msg_t *msg = (kqswnal_msg_t *)page_address(krx->krx_kiov[0].kiov_page); + lnet_nid_t fromnid = kqswnal_rx_nid(krx); + int swab; + int n; + int i; + int nob; int rc; LASSERT (atomic_read(&krx->krx_refcount) == 1); - fromnid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ep_rxd_node(krx->krx_rxd)); + /* If ln_ptlcompat is set, peers may send me an "old" unencapsulated + * lnet hdr */ + LASSERT (offsetof(kqswnal_msg_t, kqm_u) <= sizeof(lnet_hdr_t)); - rc = lnet_parse(ni, hdr, kqswnal_rx_nid(krx), krx, - krx->krx_rpc_reply_needed); - if (rc < 0) { - kqswnal_rx_decref(krx); + if (krx->krx_nob < offsetof(kqswnal_msg_t, kqm_u)) { + CERROR("Short message %d received from %s\n", + krx->krx_nob, libcfs_nid2str(fromnid)); + goto done; + } + + swab = msg->kqm_magic == __swab32(LNET_PROTO_QSW_MAGIC); + + if (swab || msg->kqm_magic == LNET_PROTO_QSW_MAGIC) { + + if (swab) { + __swab16s(&msg->kqm_version); + __swab16s(&msg->kqm_type); + } + + if (msg->kqm_version != QSWLND_PROTO_VERSION) { + /* Future protocol version compatibility support! + * The next qswlnd-specific protocol rev will first + * send an RPC to check version; I reply with a status + * block containing my current version... */ + + if (!krx->krx_rpc_reply_needed) { + CERROR("Unexpected version %d from %s\n", + msg->kqm_version, libcfs_nid2str(fromnid)); + goto done; + } + + krx->krx_rpc_reply_status = QSWLND_PROTO_VERSION; + goto done; + } + + switch (msg->kqm_type) { + default: + CERROR("Bad request type %x from %s\n", + msg->kqm_type, libcfs_nid2str(fromnid)); + goto done; + + case QSWLND_MSG_IMMEDIATE: + if (krx->krx_rpc_reply_needed) { + /* Should have been a simple message */ + CERROR("IMMEDIATE sent as RPC from %s\n", + libcfs_nid2str(fromnid)); + goto done; + } + + nob = offsetof(kqswnal_msg_t, kqm_u.immediate.kqim_payload); + if (krx->krx_nob < nob) { + CERROR("Short IMMEDIATE %d(%d) from %s\n", + krx->krx_nob, nob, libcfs_nid2str(fromnid)); + goto done; + } + + rc = lnet_parse(ni, &msg->kqm_u.immediate.kqim_hdr, + fromnid, krx, 0); + if (rc < 0) + goto done; + return; + + case QSWLND_MSG_RDMA: + if (!krx->krx_rpc_reply_needed) { + /* Should have been a simple message */ + CERROR("RDMA sent as simple message from %s\n", + libcfs_nid2str(fromnid)); + goto done; + } + + nob = offsetof(kqswnal_msg_t, + kqm_u.rdma.kqrm_rmd.kqrmd_frag[0]); + if (krx->krx_nob < nob) { + CERROR("Short RDMA message %d(%d) from %s\n", + krx->krx_nob, nob, libcfs_nid2str(fromnid)); + goto done; + } + + if (swab) + __swab32s(&msg->kqm_u.rdma.kqrm_rmd.kqrmd_nfrag); + + n = msg->kqm_u.rdma.kqrm_rmd.kqrmd_nfrag; + nob = offsetof(kqswnal_msg_t, + kqm_u.rdma.kqrm_rmd.kqrmd_frag[n]); + + if (krx->krx_nob < nob) { + CERROR("short RDMA message %d(%d) from %s\n", + krx->krx_nob, nob, libcfs_nid2str(fromnid)); + goto done; + } + + if (swab) { + for (i = 0; i < n; i++) { +#if MULTIRAIL_EKC + EP_NMD *nmd = &msg->kqm_u.rdma.kqrm_rmd.kqrmd_frag[i]; + + __swab32s(&nmd->nmd_addr); + __swab32s(&nmd->nmd_len); + __swab32s(&nmd->nmd_attr); +#else + EP_IOVEC *iov = &msg->kqm_u.rdma.kqrm_rmd.kqrmd_frag[i]; + + __swab32s(&iov->Base); + __swab32s(&iov->Len); +#endif + } + } + + rc = lnet_parse(ni, &msg->kqm_u.rdma.kqrm_hdr, + fromnid, krx, 1); + if (rc < 0) + goto done; + return; + } + /* Not Reached */ + } + + if (msg->kqm_magic == LNET_PROTO_MAGIC || + msg->kqm_magic == __swab32(LNET_PROTO_MAGIC)) { + /* Future protocol version compatibility support! + * When LNET unifies protocols over all LNDs, the first thing a + * peer will send will be a version query RPC. I reply with a + * status block containing LNET_PROTO_QSW_MAGIC to inform her + * that I'm "old" */ + + if (!krx->krx_rpc_reply_needed) { + CERROR("Unexpected magic %08x from %s\n", + msg->kqm_magic, libcfs_nid2str(fromnid)); + goto done; + } + + krx->krx_rpc_reply_status = LNET_PROTO_QSW_MAGIC; + goto done; + } + + if (the_lnet.ln_ptlcompat != 0) { + /* Portals compatibility (strong or weak) + * This could be an unencapsulated LNET header. If it's big + * enough, let LNET's parser sort it out */ + + if (krx->krx_nob < sizeof(lnet_hdr_t)) { + CERROR("Short portals-compatible message from %s\n", + libcfs_nid2str(fromnid)); + goto done; + } + + krx->krx_raw_lnet_hdr = 1; + rc = lnet_parse(ni, (lnet_hdr_t *)msg, + fromnid, krx, krx->krx_rpc_reply_needed); + if (rc < 0) + goto done; return; } + + CERROR("Unrecognised magic %08x from %s\n", + msg->kqm_magic, libcfs_nid2str(fromnid)); + done: + kqswnal_rx_decref(krx); } /* Receive Interrupt Handler: posts to schedulers */ @@ -1294,6 +1498,7 @@ kqswnal_rxhandler(EP_RXD *rxd) krx->krx_state = KRX_PARSE; krx->krx_rxd = rxd; krx->krx_nob = nob; + krx->krx_raw_lnet_hdr = 0; /* RPC reply iff rpc request received without error */ krx->krx_rpc_reply_needed = ep_rxd_isrpc(rxd) && @@ -1304,9 +1509,7 @@ kqswnal_rxhandler(EP_RXD *rxd) krx->krx_rpc_reply_status = -EPROTO; atomic_set (&krx->krx_refcount, 1); - /* must receive a whole header to be able to parse */ - if (status != EP_SUCCESS || nob < sizeof (lnet_hdr_t)) - { + if (status != EP_SUCCESS) { /* receives complete with failure when receiver is removed */ #if MULTIRAIL_EKC if (status == EP_SHUTDOWN) @@ -1337,7 +1540,7 @@ kqswnal_rxhandler(EP_RXD *rxd) } int -kqswnal_recv (lnet_ni_t *ni, +kqswnal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed, @@ -1348,70 +1551,97 @@ kqswnal_recv (lnet_ni_t *ni, unsigned int mlen, unsigned int rlen) { - kqswnal_rx_t *krx = (kqswnal_rx_t *)private; - char *buffer = page_address(krx->krx_kiov[0].kiov_page); - lnet_hdr_t *hdr = (lnet_hdr_t *)buffer; - int hdrtype = le32_to_cpu(hdr->type); - int rc; + kqswnal_rx_t *krx = (kqswnal_rx_t *)private; + lnet_nid_t fromnid; + kqswnal_msg_t *msg; + lnet_hdr_t *hdr; + kqswnal_remotemd_t *rmd; + int msg_offset; + int rc; - /* NB hdr still in network byte order */ + LASSERT (!in_interrupt ()); /* OK to map */ + /* Either all pages or all vaddrs */ + LASSERT (!(kiov != NULL && iov != NULL)); + fromnid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ep_rxd_node(krx->krx_rxd)); + msg = (kqswnal_msg_t *)page_address(krx->krx_kiov[0].kiov_page); + if (krx->krx_rpc_reply_needed) { /* optimized (rdma) request sent as RPC */ - switch (hdrtype) { - case LNET_MSG_PUT: - case LNET_MSG_REPLY: - /* This is an optimized PUT/REPLY */ - rc = kqswnal_rdma(krx, lntmsg, hdrtype, - niov, iov, kiov, offset, mlen); - break; - - case LNET_MSG_GET: - if (lntmsg == NULL) { - /* No buffer match: my decref will complete the - * RPC with failure */ - rc = 0; - } else { - /* Matched something! */ - rc = kqswnal_rdma (krx, lntmsg, - LNET_MSG_GET, - lntmsg->msg_niov, - lntmsg->msg_iov, - lntmsg->msg_kiov, - lntmsg->msg_offset, - lntmsg->msg_len); - } - break; - default: - CERROR("Bad RPC type %d\n", hdrtype); - rc = -EPROTO; - break; + if (krx->krx_raw_lnet_hdr) { + LASSERT (the_lnet.ln_ptlcompat != 0); + hdr = (lnet_hdr_t *)msg; + rmd = kqswnal_get_portalscompat_rmd(krx); + if (rmd == NULL) + return (-EPROTO); + } else { + LASSERT (msg->kqm_type == QSWLND_MSG_RDMA); + hdr = &msg->kqm_u.rdma.kqrm_hdr; + rmd = &msg->kqm_u.rdma.kqrm_rmd; + } + + /* NB header is still in wire byte order */ + + switch (le32_to_cpu(hdr->type)) { + case LNET_MSG_PUT: + case LNET_MSG_REPLY: + /* This is an optimized PUT/REPLY */ + rc = kqswnal_rdma(krx, lntmsg, 0, rmd, + niov, iov, kiov, offset, mlen); + break; + + case LNET_MSG_GET: + if (lntmsg == NULL) { + /* No buffer match: my decref will + * complete the RPC with failure */ + rc = 0; + } else { + /* Matched something! */ + rc = kqswnal_rdma(krx, lntmsg, 1, rmd, + lntmsg->msg_niov, + lntmsg->msg_iov, + lntmsg->msg_kiov, + lntmsg->msg_offset, + lntmsg->msg_len); + } + break; + + default: + CERROR("Bad RPC type %d\n", + le32_to_cpu(hdr->type)); + rc = -EPROTO; + break; } + kqswnal_rx_decref(krx); return rc; } + + if (krx->krx_raw_lnet_hdr) { + LASSERT (the_lnet.ln_ptlcompat != 0); + msg_offset = sizeof(lnet_hdr_t); + } else { + LASSERT (msg->kqm_type == QSWLND_MSG_IMMEDIATE); + msg_offset = offsetof(kqswnal_msg_t, kqm_u.immediate.kqim_payload); + } - if (krx->krx_nob < KQSW_HDR_SIZE + rlen) { - CERROR("Bad message size: have %d, need %d + %d\n", - krx->krx_nob, (int)KQSW_HDR_SIZE, rlen); + if (krx->krx_nob < msg_offset + rlen) { + CERROR("Bad message size from %s: have %d, need %d + %d\n", + libcfs_nid2str(fromnid), krx->krx_nob, + msg_offset, rlen); kqswnal_rx_decref(krx); return -EPROTO; } - /* It must be OK to kmap() if required */ - LASSERT (kiov == NULL || !in_interrupt ()); - /* Either all pages or all vaddrs */ - LASSERT (!(kiov != NULL && iov != NULL)); - if (kiov != NULL) lnet_copy_kiov2kiov(niov, kiov, offset, krx->krx_npages, krx->krx_kiov, - KQSW_HDR_SIZE, mlen); + msg_offset, mlen); else lnet_copy_kiov2iov(niov, iov, offset, krx->krx_npages, krx->krx_kiov, - KQSW_HDR_SIZE, mlen); + msg_offset, mlen); lnet_finalize(ni, lntmsg, 0); kqswnal_rx_decref(krx); diff --git a/lnet/klnds/ralnd/ralnd.c b/lnet/klnds/ralnd/ralnd.c index 005c283..19bcce9 100644 --- a/lnet/klnds/ralnd/ralnd.c +++ b/lnet/klnds/ralnd/ralnd.c @@ -46,6 +46,10 @@ kranal_pack_connreq(kra_connreq_t *connreq, kra_conn_t *conn, lnet_nid_t dstnid) connreq->racr_magic = RANAL_MSG_MAGIC; connreq->racr_version = RANAL_MSG_VERSION; + + if (conn == NULL) /* prepping a "stub" reply */ + return; + connreq->racr_devid = conn->rac_device->rad_id; connreq->racr_srcnid = lnet_ptlcompat_srcnid(kranal_data.kra_ni->ni_nid, dstnid); @@ -63,37 +67,97 @@ kranal_recv_connreq(struct socket *sock, kra_connreq_t *connreq, int active) { int timeout = active ? *kranal_tunables.kra_timeout : lnet_acceptor_timeout(); + int swab; int rc; + /* return 0 on success, -ve on error, +ve to tell the peer I'm "old" */ + rc = libcfs_sock_read(sock, &connreq->racr_magic, sizeof(connreq->racr_magic), timeout); if (rc != 0) { - CERROR("Read failed: %d\n", rc); - return rc; + CERROR("Read(magic) failed(1): %d\n", rc); + return -EIO; } - if (!active && - connreq->racr_magic != RANAL_MSG_MAGIC && + if (connreq->racr_magic != RANAL_MSG_MAGIC && connreq->racr_magic != __swab32(RANAL_MSG_MAGIC)) { - /* Is this a generic acceptor connection request? */ + /* Unexpected magic! */ + if (!active && + the_lnet.ln_ptlcompat == 0 && + (connreq->racr_magic == LNET_PROTO_MAGIC || + connreq->racr_magic == __swab32(LNET_PROTO_MAGIC))) { + /* future protocol version compatibility! + * When LNET unifies protocols over all LNDs, the first + * thing sent will be a version query. +ve rc means I + * reply with my current magic/version */ + return EPROTO; + } + + if (active || + the_lnet.ln_ptlcompat == 0) { + CERROR("Unexpected magic %08x (1)\n", + connreq->racr_magic); + return -EPROTO; + } + + /* When portals compatibility is set, I may be passed a new + * connection "blindly" by the acceptor, and I have to + * determine if my peer has sent an acceptor connection request + * or not. This isn't a connreq, so I'll get the acceptor to + * look at it... */ rc = lnet_accept(kranal_data.kra_ni, sock, connreq->racr_magic); - if (rc != 0) /* nope */ + if (rc != 0) return -EPROTO; + /* ...and if it's OK I'm back to looking for a connreq... */ rc = libcfs_sock_read(sock, &connreq->racr_magic, sizeof(connreq->racr_magic), timeout); if (rc != 0) { - CERROR("Read failed: %d\n", rc); - return rc; + CERROR("Read(magic) failed(2): %d\n", rc); + return -EIO; } + + if (connreq->racr_magic != RANAL_MSG_MAGIC && + connreq->racr_magic != __swab32(RANAL_MSG_MAGIC)) { + CERROR("Unexpected magic %08x(2)\n", + connreq->racr_magic); + return -EPROTO; + } + } + + swab = (connreq->racr_magic == __swab32(RANAL_MSG_MAGIC)); + + rc = libcfs_sock_read(sock, &connreq->racr_version, + sizeof(connreq->racr_version), timeout); + if (rc != 0) { + CERROR("Read(version) failed: %d\n", rc); + return -EIO; } + + if (swab) + __swab16s(&connreq->racr_version); - if (connreq->racr_magic != RANAL_MSG_MAGIC) { - if (__swab32(connreq->racr_magic) != RANAL_MSG_MAGIC) { - CERROR("Unexpected magic %08x\n", connreq->racr_magic); + if (connreq->racr_version != RANAL_MSG_VERSION) { + if (active) { + CERROR("Unexpected version %d\n", connreq->racr_version); return -EPROTO; } + /* If this is a future version of the ralnd protocol, and I'm + * passive (accepted the connection), tell my peer I'm "old" + * (+ve rc) */ + return EPROTO; + } + + rc = libcfs_sock_read(sock, &connreq->racr_devid, + sizeof(connreq->racr_version) - + offsetof(kra_connreq_t, racr_devid), + timeout); + if (rc != 0) { + CERROR("Read(body) failed: %d\n", rc); + return -EIO; + } + if (swab) { __swab32s(&connreq->racr_magic); __swab16s(&connreq->racr_version); __swab16s(&connreq->racr_devid); @@ -109,11 +173,6 @@ kranal_recv_connreq(struct socket *sock, kra_connreq_t *connreq, int active) __swab32s(&connreq->racr_riparams.CompletionCookie); } - if (connreq->racr_version != RANAL_MSG_VERSION) { - CERROR("Unexpected version %d\n", connreq->racr_version); - return -EPROTO; - } - if (connreq->racr_srcnid == LNET_NID_ANY || connreq->racr_dstnid == LNET_NID_ANY) { CERROR("Received LNET_NID_ANY\n"); @@ -442,12 +501,26 @@ kranal_passive_conn_handshake (struct socket *sock, lnet_nid_t *src_nidp, } rc = kranal_recv_connreq(sock, &rx_connreq, 0); - if (rc != 0) { + + if (rc < 0) { CERROR("Can't rx connreq from %u.%u.%u.%u/%d: %d\n", HIPQUAD(peer_ip), peer_port, rc); return rc; } + if (rc > 0) { + /* Request from "new" peer: send reply with my MAGIC/VERSION to + * tell her I'm old... */ + kranal_pack_connreq(&tx_connreq, NULL, LNET_NID_ANY); + + rc = libcfs_sock_write(sock, &tx_connreq, sizeof(tx_connreq), 0); + if (rc != 0) + CERROR("Can't tx stub connreq to %u.%u.%u.%u/%d: %d\n", + HIPQUAD(peer_ip), peer_port, rc); + + return -EPROTO; + } + for (i = 0;;i++) { if (i == kranal_data.kra_ndevs) { CERROR("Can't match dev %d from %u.%u.%u.%u/%d\n", @@ -507,6 +580,20 @@ kranal_active_conn_handshake(kra_peer_t *peer, kranal_pack_connreq(&connreq, conn, peer->rap_nid); + if (the_lnet.ln_testprotocompat != 0) { + /* single-shot proto test */ + LNET_LOCK(); + if ((the_lnet.ln_testprotocompat & 1) != 0) { + connreq.racr_version++; + the_lnet.ln_testprotocompat &= ~1; + } + if ((the_lnet.ln_testprotocompat & 2) != 0) { + connreq.racr_magic = LNET_PROTO_MAGIC; + the_lnet.ln_testprotocompat &= ~2; + } + LNET_UNLOCK(); + } + rc = lnet_connect(&sock, peer->rap_nid, 0, peer->rap_ip, peer->rap_port); if (rc != 0) diff --git a/lnet/klnds/ralnd/ralnd.h b/lnet/klnds/ralnd/ralnd.h index 5b43340..945fc25 100644 --- a/lnet/klnds/ralnd/ralnd.h +++ b/lnet/klnds/ralnd/ralnd.h @@ -341,19 +341,6 @@ typedef struct kra_peer unsigned long rap_reconnect_interval; /* exponential backoff */ } kra_peer_t; -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)) -# define sk_allocation allocation -# define sk_data_ready data_ready -# define sk_write_space write_space -# define sk_user_data user_data -# define sk_prot prot -# define sk_sndbuf sndbuf -# define sk_socket socket -# define sk_wmem_queued wmem_queued -# define sk_err err -# define sk_sleep sleep -#endif - extern kra_data_t kranal_data; extern kra_tunables_t kranal_tunables; @@ -478,3 +465,4 @@ extern void kranal_connect (kra_peer_t *peer); extern int kranal_conn_handshake (struct socket *sock, kra_peer_t *peer); extern int kranal_tunables_init(void); extern void kranal_tunables_fini(void); +extern void kranal_init_msg(kra_msg_t *msg, int type); diff --git a/lnet/klnds/socklnd/socklnd.c b/lnet/klnds/socklnd/socklnd.c index 71cc574..4506cbd 100644 --- a/lnet/klnds/socklnd/socklnd.c +++ b/lnet/klnds/socklnd/socklnd.c @@ -974,10 +974,6 @@ ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route, LASSERT ((route == NULL) == (type == SOCKLND_CONN_NONE)); - rc = ksocknal_lib_setup_sock (sock); - if (rc != 0) - return (rc); - irq = ksocknal_lib_sock_irq (sock); rc = -ENOMEM; @@ -1051,7 +1047,16 @@ ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route, /* additional routes after interface exchange? */ ksocknal_create_routes(peer, conn->ksnc_port, ipaddrs, nipaddrs); - rc = 0; + + /* setup the socket AFTER I've received hello (it disables + * SO_LINGER). I might call back to the acceptor who may want + * to send a protocol version response and then close the + * socket; this ensures the socket only tears down after the + * response has been sent. */ + rc = ksocknal_lib_setup_sock(sock); + if (rc != 0) + goto failed_1; + write_lock_irqsave (global_lock, flags); } else { rc = ksocknal_create_peer(&peer, ni, peerid); @@ -1103,12 +1108,18 @@ ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route, /* set CONN_NONE makes returned HELLO acknowledge I * lost a connection race */ conn->ksnc_type = SOCKLND_CONN_NONE; - ksocknal_send_hello (ni, conn, peerid.nid, - ipaddrs, 0); + ksocknal_send_hello(ni, conn, peerid.nid, NULL, 0); } else { nipaddrs = ksocknal_select_ips(peer, ipaddrs, nipaddrs); - rc = ksocknal_send_hello (ni, conn, peerid.nid, - ipaddrs, nipaddrs); + rc = ksocknal_send_hello(ni, conn, peerid.nid, + ipaddrs, nipaddrs); + + /* Setup the socket (it disables SO_LINGER). I don't + * do it if I'm sending a negative response to ensure + * the response isn't discarded when I close the socket + * immediately after sending it. */ + if (rc == 0) + rc = ksocknal_lib_setup_sock(sock); } write_lock_irqsave(global_lock, flags); diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c index f6539fb..ccd1a8b 100644 --- a/lnet/klnds/socklnd/socklnd_cb.c +++ b/lnet/klnds/socklnd/socklnd_cb.c @@ -1382,6 +1382,20 @@ ksocknal_send_hello (lnet_ni_t *ni, ksock_conn_t *conn, lnet_nid_t peer_nid, hmv->version_major = cpu_to_le16 (LNET_PROTO_TCP_VERSION_MAJOR); hmv->version_minor = cpu_to_le16 (LNET_PROTO_TCP_VERSION_MINOR); + if (the_lnet.ln_testprotocompat != 0) { + /* single-shot proto check */ + LNET_LOCK(); + if ((the_lnet.ln_testprotocompat & 1) != 0) { + hmv->version_major++; /* just different! */ + the_lnet.ln_testprotocompat &= ~1; + } + if ((the_lnet.ln_testprotocompat & 2) != 0) { + hmv->magic = LNET_PROTO_MAGIC; + the_lnet.ln_testprotocompat &= ~2; + } + LNET_UNLOCK(); + } + srcnid = lnet_ptlcompat_srcnid(ni->ni_nid, peer_nid); hdr.src_nid = cpu_to_le64 (srcnid); @@ -1464,15 +1478,40 @@ ksocknal_recv_hello (lnet_ni_t *ni, ksock_conn_t *conn, return (rc); } - if (!active && - hmv->magic != le32_to_cpu (LNET_PROTO_TCP_MAGIC)) { - /* Is this a generic acceptor connection request? */ + if (hmv->magic != le32_to_cpu (LNET_PROTO_TCP_MAGIC)) { + /* Unexpected magic! */ + if (!active && + the_lnet.ln_ptlcompat == 0 && + (hmv->magic == LNET_PROTO_MAGIC || + hmv->magic == __swab32(LNET_PROTO_MAGIC))) { + /* future protocol version compatibility! + * When LNET unifies protocols over all LNDs, the first + * thing sent will be a version query. I send back a + * 'hello' in my current format to tell her I'm + * "old" */ + ksocknal_send_hello(ni, conn, ni->ni_nid, NULL, 0); + return -EPROTO; + } + + if (active || + the_lnet.ln_ptlcompat == 0) { + CERROR ("Bad magic(1) %#08x (%#08x expected) from " + "%u.%u.%u.%u\n", __cpu_to_le32 (hmv->magic), + LNET_PROTO_TCP_MAGIC, + HIPQUAD(conn->ksnc_ipaddr)); + return -EPROTO; + } + + /* When portals compatibility is set, I may be passed a new + * connection "blindly" by the acceptor, and I have to + * determine if my peer has sent an acceptor connection request + * or not. This isn't a 'hello', so I'll get the acceptor to + * look at it... */ rc = lnet_accept(ni, sock, hmv->magic); if (rc != 0) return -EPROTO; - /* Yes it is! Start over again now I've skipping the generic - * request */ + /* ...and if it's OK I'm back to looking for a 'hello'... */ rc = libcfs_sock_read(sock, &hmv->magic, sizeof (hmv->magic), timeout); if (rc != 0) { @@ -1481,15 +1520,16 @@ ksocknal_recv_hello (lnet_ni_t *ni, ksock_conn_t *conn, LASSERT (rc < 0 && rc != -EALREADY); return (rc); } - } - if (hmv->magic != le32_to_cpu (LNET_PROTO_TCP_MAGIC)) { - CERROR ("Bad magic %#08x (%#08x expected) from %u.%u.%u.%u\n", - __cpu_to_le32 (hmv->magic), LNET_PROTO_TCP_MAGIC, - HIPQUAD(conn->ksnc_ipaddr)); - return (-EPROTO); + if (hmv->magic != le32_to_cpu (LNET_PROTO_TCP_MAGIC)) { + CERROR ("Bad magic(2) %#08x (%#08x expected) from " + "%u.%u.%u.%u\n", __cpu_to_le32 (hmv->magic), + LNET_PROTO_TCP_MAGIC, + HIPQUAD(conn->ksnc_ipaddr)); + return -EPROTO; + } } - + rc = libcfs_sock_read(sock, &hmv->magic + 1, sizeof(*hmv) - sizeof(hmv->magic), timeout); if (rc != 0) { @@ -1501,14 +1541,22 @@ ksocknal_recv_hello (lnet_ni_t *ni, ksock_conn_t *conn, if (hmv->version_major != cpu_to_le16 (LNET_PROTO_TCP_VERSION_MAJOR) || hmv->version_minor != cpu_to_le16 (LNET_PROTO_TCP_VERSION_MINOR)) { - CERROR ("Incompatible protocol version %d.%d (%d.%d expected)" - " from %u.%u.%u.%u\n", - le16_to_cpu (hmv->version_major), - le16_to_cpu (hmv->version_minor), - LNET_PROTO_TCP_VERSION_MAJOR, - LNET_PROTO_TCP_VERSION_MINOR, - HIPQUAD(conn->ksnc_ipaddr)); - return (-EPROTO); + if (active) { + CERROR ("Incompatible protocol version %d.%d (%d.%d expected)" + " from %u.%u.%u.%u\n", + le16_to_cpu (hmv->version_major), + le16_to_cpu (hmv->version_minor), + LNET_PROTO_TCP_VERSION_MAJOR, + LNET_PROTO_TCP_VERSION_MINOR, + HIPQUAD(conn->ksnc_ipaddr)); + return -EPROTO; + } + + /* If this is a future version of the socklnd protocol, and I'm + * passive (accepted the connection), send back a 'hello' to + * tell my peer I'm "old" */ + ksocknal_send_hello(ni, conn, ni->ni_nid, NULL, 0); + return -EPROTO; } #if (LNET_PROTO_TCP_VERSION_MAJOR != 1) diff --git a/lnet/klnds/socklnd/socklnd_lib-linux.c b/lnet/klnds/socklnd/socklnd_lib-linux.c index a3122f6..5f693d0 100644 --- a/lnet/klnds/socklnd/socklnd_lib-linux.c +++ b/lnet/klnds/socklnd/socklnd_lib-linux.c @@ -241,8 +241,8 @@ ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx) #if (SOCKNAL_ZC && SOCKNAL_VADDR_ZC) if (zcsize >= ksocknal_data.ksnd_zc_min_frag && - (sock->sk->route_caps & NETIF_F_SG) && - (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) && + (sock->sk->sk_route_caps & NETIF_F_SG) && + (sock->sk->sk_route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) && (page = ksocknal_kvaddr_to_page (vaddr)) != NULL) { int msgflg = MSG_DONTWAIT; @@ -306,8 +306,8 @@ ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) #if SOCKNAL_ZC if (kiov->kiov_len >= *ksocknal_tunables.ksnd_zc_min_frag && - (sock->sk->route_caps & NETIF_F_SG) && - (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM))) { + (sock->sk->sk_route_caps & NETIF_F_SG) && + (sock->sk->sk_route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM))) { struct page *page = kiov->kiov_page; int offset = kiov->kiov_offset; int fragsize = kiov->kiov_len; diff --git a/lnet/klnds/viblnd/viblnd_cb.c b/lnet/klnds/viblnd/viblnd_cb.c index 216fbe1..44dd4a9 100644 --- a/lnet/klnds/viblnd/viblnd_cb.c +++ b/lnet/klnds/viblnd/viblnd_cb.c @@ -1991,10 +1991,30 @@ kibnal_peer_connect_failed (kib_peer_t *peer, int active) } void -kibnal_connreq_done(kib_conn_t *conn, int active, int status) +kibnal_reject(cm_cep_handle_t cep, int why) { - static cm_reject_data_t rej; + static cm_reject_data_t rejs[3]; + cm_reject_data_t *rej = &rejs[why]; + + LASSERT (why >= 0 && why < sizeof(rejs)/sizeof(rejs[0])); + + /* If I wasn't so lazy, I'd initialise this only once; it's effective + * read-only */ + rej->reason = cm_rej_code_usr_rej; + rej->priv_data[0] = (IBNAL_MSG_MAGIC) & 0xff; + rej->priv_data[1] = (IBNAL_MSG_MAGIC >> 8) & 0xff; + rej->priv_data[2] = (IBNAL_MSG_MAGIC >> 16) & 0xff; + rej->priv_data[3] = (IBNAL_MSG_MAGIC >> 24) & 0xff; + rej->priv_data[4] = (IBNAL_MSG_VERSION) & 0xff; + rej->priv_data[5] = (IBNAL_MSG_VERSION >> 8) & 0xff; + rej->priv_data[6] = why; + + cm_reject(cep, rej); +} +void +kibnal_connreq_done(kib_conn_t *conn, int active, int status) +{ struct list_head txs; kib_peer_t *peer = conn->ibc_peer; kib_peer_t *peer2; @@ -2026,9 +2046,7 @@ kibnal_connreq_done(kib_conn_t *conn, int active, int status) case IBNAL_CONN_ACTIVE_CHECK_REPLY: /* got a connection reply but failed checks */ LASSERT (active); - memset(&rej, 0, sizeof(rej)); - rej.reason = cm_rej_code_usr_rej; - cm_reject(conn->ibc_cep, &rej); + kibnal_reject(conn->ibc_cep, IBNAL_REJECT_FATAL); break; case IBNAL_CONN_ACTIVE_CONNECT: @@ -2246,10 +2264,10 @@ kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq) static kib_msg_t txmsg; static kib_msg_t rxmsg; static cm_reply_data_t reply; - static cm_reject_data_t reject; kib_conn_t *conn = NULL; int rc = 0; + int reason; int rxmsgnob; kib_connvars_t *cv; kib_peer_t *tmp_peer; @@ -2264,6 +2282,7 @@ kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq) if (cmreq->sid != (__u64)(*kibnal_tunables.kib_service_number)) { CERROR(LPX64" != IBNAL_SERVICE_NUMBER("LPX64")\n", cmreq->sid, (__u64)(*kibnal_tunables.kib_service_number)); + reason = IBNAL_REJECT_FATAL; goto reject; } @@ -2271,15 +2290,33 @@ kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq) rxmsgnob = MIN(cm_REQ_priv_data_len, sizeof(rxmsg)); memcpy(&rxmsg, cmreq->priv_data, rxmsgnob); + if ((rxmsg.ibm_magic == LNET_PROTO_MAGIC || + rxmsg.ibm_magic == __swab32(LNET_PROTO_MAGIC)) || + (rxmsg.ibm_magic == IBNAL_MSG_MAGIC && + rxmsg.ibm_version != IBNAL_MSG_VERSION) || + (rxmsg.ibm_magic == __swab32(IBNAL_MSG_MAGIC) && + rxmsg.ibm_version != __swab16(IBNAL_MSG_VERSION))) { + /* Future protocol version compatibility support! + * If the viblnd-specific protocol changes, or when LNET + * unifies protocols over all LNDs, the initial connection will + * negotiate a protocol version. I trap this here to avoid + * console errors; the reject tells the peer which protocol I + * speak. */ + reason = IBNAL_REJECT_FATAL; + goto reject; + } + rc = kibnal_unpack_msg(&rxmsg, rxmsgnob); if (rc != 0) { CERROR("Can't parse connection request: %d\n", rc); + reason = IBNAL_REJECT_FATAL; goto reject; } if (rxmsg.ibm_type != IBNAL_MSG_CONNREQ) { CERROR("Unexpected connreq msg type: %x from %s\n", rxmsg.ibm_type, libcfs_nid2str(rxmsg.ibm_srcnid)); + reason = IBNAL_REJECT_FATAL; goto reject; } @@ -2288,6 +2325,7 @@ kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq) CERROR("Can't accept %s: bad dst nid %s\n", libcfs_nid2str(rxmsg.ibm_srcnid), libcfs_nid2str(rxmsg.ibm_dstnid)); + reason = IBNAL_REJECT_FATAL; goto reject; } @@ -2296,6 +2334,7 @@ kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq) libcfs_nid2str(rxmsg.ibm_srcnid), rxmsg.ibm_u.connparams.ibcp_queue_depth, IBNAL_MSG_QUEUE_SIZE); + reason = IBNAL_REJECT_FATAL; goto reject; } @@ -2304,6 +2343,7 @@ kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq) libcfs_nid2str(rxmsg.ibm_srcnid), rxmsg.ibm_u.connparams.ibcp_max_msg_size, IBNAL_MSG_SIZE); + reason = IBNAL_REJECT_FATAL; goto reject; } @@ -2312,6 +2352,7 @@ kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq) libcfs_nid2str(rxmsg.ibm_srcnid), rxmsg.ibm_u.connparams.ibcp_max_frags, IBNAL_MAX_RDMA_FRAGS); + reason = IBNAL_REJECT_FATAL; goto reject; } @@ -2319,6 +2360,7 @@ kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq) if (conn == NULL) { CERROR("Can't create conn for %s\n", libcfs_nid2str(rxmsg.ibm_srcnid)); + reason = IBNAL_REJECT_NO_RESOURCES; goto reject; } @@ -2329,6 +2371,7 @@ kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq) libcfs_nid2str(rxmsg.ibm_srcnid)); kibnal_conn_decref(conn); conn = NULL; + reason = IBNAL_REJECT_NO_RESOURCES; goto reject; } @@ -2351,6 +2394,7 @@ kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq) CERROR("gid2gid_index failed for %s: %d\n", libcfs_nid2str(rxmsg.ibm_srcnid), vvrc); rc = -EIO; + reason = IBNAL_REJECT_FATAL; goto reject; } @@ -2360,23 +2404,29 @@ kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq) CERROR("pkey2pkey_index failed for %s: %d\n", libcfs_nid2str(rxmsg.ibm_srcnid), vvrc); rc = -EIO; + reason = IBNAL_REJECT_FATAL; goto reject; } rc = kibnal_set_qp_state(conn, vv_qp_state_init); - if (rc != 0) + if (rc != 0) { + reason = IBNAL_REJECT_FATAL; goto reject; - + } + rc = kibnal_post_receives(conn); if (rc != 0) { CERROR("Can't post receives for %s\n", libcfs_nid2str(rxmsg.ibm_srcnid)); + reason = IBNAL_REJECT_FATAL; goto reject; } rc = kibnal_set_qp_state(conn, vv_qp_state_rtr); - if (rc != 0) + if (rc != 0) { + reason = IBNAL_REJECT_FATAL; goto reject; + } memset(&reply, 0, sizeof(reply)); reply.qpn = cv->cv_local_qpn; @@ -2412,13 +2462,13 @@ kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq) /* back out state change (no callback happening) */ kibnal_set_conn_state(conn, IBNAL_CONN_INIT); rc = -EIO; + reason = IBNAL_REJECT_FATAL; reject: - CERROR("Rejected connreq from %s\n", libcfs_nid2str(rxmsg.ibm_srcnid)); + CDEBUG(D_NET, "Rejecting connreq from %s\n", + libcfs_nid2str(rxmsg.ibm_srcnid)); - memset(&reject, 0, sizeof(reject)); - reject.reason = cm_rej_code_usr_rej; - cm_reject(cep, &reject); + kibnal_reject(cep, reason); if (conn != NULL) { LASSERT (rc != 0); @@ -2447,8 +2497,7 @@ kibnal_listen_callback(cm_cep_handle_t cep, cm_conn_data_t *data, void *arg) if (pcr == NULL) { CERROR("Can't allocate passive connreq\n"); - cm_reject(cep, &((cm_reject_data_t) /* NB RO struct */ - {.reason = cm_rej_code_no_res,})); + kibnal_reject(cep, IBNAL_REJECT_NO_RESOURCES); cm_destroy_cep(cep); return; } @@ -2521,6 +2570,20 @@ kibnal_connect_conn (kib_conn_t *conn) msg.ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS; kibnal_pack_msg(&msg, 0, peer->ibp_nid, 0, 0); + if (the_lnet.ln_testprotocompat != 0) { + /* single-shot proto check */ + LNET_LOCK(); + if ((the_lnet.ln_testprotocompat & 1) != 0) { + msg.ibm_version++; + the_lnet.ln_testprotocompat &= ~1; + } + if ((the_lnet.ln_testprotocompat & 2) != 0) { + msg.ibm_magic = LNET_PROTO_MAGIC; + the_lnet.ln_testprotocompat &= ~2; + } + LNET_UNLOCK(); + } + /* ...and copy into cmreq to avoid alignment issues */ memcpy(&cmreq.priv_data, &msg, msg.ibm_nob); @@ -2679,8 +2742,34 @@ kibnal_check_connreply (kib_conn_t *conn) if (cv->cv_conndata.status == cm_event_conn_reject) { + if (cv->cv_conndata.data.reject.reason == cm_rej_code_usr_rej) { + unsigned char *bytes = + cv->cv_conndata.data.reject.priv_data; + int magic = (bytes[0]) | + (bytes[1] << 8) | + (bytes[2] << 16) | + (bytes[3] << 24); + int version = (bytes[4]) | + (bytes[5] << 8); + int why = (bytes[6]); + + if (magic != IBNAL_MSG_MAGIC || + version != IBNAL_MSG_VERSION) + CERROR("conn -> %s rejected " + "(magic/ver %08x/%d why %d): " + "incompatible protocol\n", + libcfs_nid2str(peer->ibp_nid), + magic, version, why); + else + CERROR("conn -> %s rejected: fatal error %d\n", + libcfs_nid2str(peer->ibp_nid), why); + + kibnal_connreq_done(conn, 1, -ECONNREFUSED); + return; + } + if (cv->cv_conndata.data.reject.reason != cm_rej_code_stale_conn) { - CERROR("conn -> %s rejected: %d\n", + CERROR("conn -> %s rejected: reason %d\n", libcfs_nid2str(peer->ibp_nid), cv->cv_conndata.data.reject.reason); kibnal_connreq_done(conn, 1, -ECONNREFUSED); diff --git a/lnet/klnds/viblnd/viblnd_wire.h b/lnet/klnds/viblnd/viblnd_wire.h index 90e336c..bc6a70f 100644 --- a/lnet/klnds/viblnd/viblnd_wire.h +++ b/lnet/klnds/viblnd/viblnd_wire.h @@ -98,7 +98,7 @@ typedef struct } WIRE_ATTR ibm_u; } WIRE_ATTR kib_msg_t; -#define IBNAL_MSG_MAGIC 0x0be91b91 /* unique magic */ +#define IBNAL_MSG_MAGIC LNET_PROTO_VIB_MAGIC /* unique magic */ #if IBNAL_USE_FMA /* ensure version changes on FMA */ #define IBNAL_MSG_VERSION 0x11 @@ -116,3 +116,8 @@ typedef struct #define IBNAL_MSG_PUT_DONE 0xd5 /* completion (src->sink) */ #define IBNAL_MSG_GET_REQ 0xd6 /* getreq (sink->src) */ #define IBNAL_MSG_GET_DONE 0xd7 /* completion (src->sink: all OK) */ + +/* connection rejection reasons */ +#define IBNAL_REJECT_CONN_RACE 0 /* You lost connection race */ +#define IBNAL_REJECT_NO_RESOURCES 1 /* Out of memory/conns etc */ +#define IBNAL_REJECT_FATAL 2 /* Anything else */ diff --git a/lnet/klnds/viblnd/wirecheck.c b/lnet/klnds/viblnd/wirecheck.c index 367c2ac..5a0e060 100644 --- a/lnet/klnds/viblnd/wirecheck.c +++ b/lnet/klnds/viblnd/wirecheck.c @@ -154,6 +154,10 @@ main (int argc, char **argv) CHECK_DEFINE (IBNAL_MSG_GET_REQ); CHECK_DEFINE (IBNAL_MSG_GET_DONE); + CHECK_DEFINE (IBNAL_REJECT_CONN_RACE); + CHECK_DEFINE (IBNAL_REJECT_NO_RESOURCES); + CHECK_DEFINE (IBNAL_REJECT_FATAL); + CHECK_STRUCT (kib_connparams_t); CHECK_MEMBER (kib_connparams_t, ibcp_queue_depth); CHECK_MEMBER (kib_connparams_t, ibcp_max_msg_size); diff --git a/lnet/libcfs/module.c b/lnet/libcfs/module.c index 9fd8cff..c19bf06 100644 --- a/lnet/libcfs/module.c +++ b/lnet/libcfs/module.c @@ -282,7 +282,7 @@ static int libcfs_ioctl(struct cfs_psdev_file *pfile, unsigned long cmd, void *a } break; - case IOC_LIBCFS_PING: { + case IOC_LIBCFS_PING_TEST: { extern void (kping_client)(struct libcfs_ioctl_data *); void (*ping)(struct libcfs_ioctl_data *); diff --git a/lnet/lnet/acceptor.c b/lnet/lnet/acceptor.c index 56d8308..9d065ce 100644 --- a/lnet/lnet/acceptor.c +++ b/lnet/lnet/acceptor.c @@ -40,10 +40,6 @@ static int accept_timeout = 5; CFS_MODULE_PARM(accept_timeout, "i", int, 0644, "Acceptor's timeout (seconds)"); -static int accept_proto_version = LNET_PROTO_ACCEPTOR_VERSION; -CFS_MODULE_PARM(accept_proto_version, "i", int, 0444, - "Acceptor protocol version (outgoing connection requests)"); - struct { int pta_shutdown; struct socket *pta_sock; @@ -150,7 +146,7 @@ lnet_connect(struct socket **sockp, lnet_nid_t peer_nid, } /* Ensure writing connection requests don't block. PAGE_SIZE - * isn't excessive and easily big enough for all the NALs */ + * isn't excessive and easily big enough for all the LNDs */ rc = libcfs_sock_setbuf(sock, PAGE_SIZE, PAGE_SIZE); if (rc != 0) { CERROR("Error %d setting buffer sizes\n", rc); @@ -159,14 +155,30 @@ lnet_connect(struct socket **sockp, lnet_nid_t peer_nid, CLASSERT (LNET_PROTO_ACCEPTOR_VERSION == 1); - if (accept_proto_version == LNET_PROTO_ACCEPTOR_VERSION) { - - LASSERT (the_lnet.ln_ptlcompat < 2); /* no portals peers */ - + if (the_lnet.ln_ptlcompat != 2) { + /* When portals compatibility is "strong", simply + * connect (i.e. send no acceptor connection request). + * Othewise send an acceptor connection request. I can + * have no portals peers so everyone else should + * understand my protocol. */ cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC; cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION; cr.acr_nid = peer_nid; + if (the_lnet.ln_testprotocompat != 0) { + /* single-shot proto check */ + LNET_LOCK(); + if ((the_lnet.ln_testprotocompat & 4) != 0) { + cr.acr_version++; + the_lnet.ln_testprotocompat &= ~4; + } + if ((the_lnet.ln_testprotocompat & 8) != 0) { + cr.acr_magic = LNET_PROTO_MAGIC; + the_lnet.ln_testprotocompat &= ~8; + } + LNET_UNLOCK(); + } + rc = libcfs_sock_write(sock, &cr, sizeof(cr), 0); if (rc != 0) goto failed_sock; @@ -197,7 +209,7 @@ lnet_accept_magic(__u32 magic, __u32 constant) int lnet_accept(lnet_ni_t *blind_ni, struct socket *sock, __u32 magic) { - lnet_acceptor_connreq_t cr; + lnet_acceptor_connreq_t cr; __u32 peer_ip; int peer_port; int rc; @@ -205,8 +217,8 @@ lnet_accept(lnet_ni_t *blind_ni, struct socket *sock, __u32 magic) lnet_ni_t *ni; char *str; - /* CAVEAT EMPTOR: I may be called by a NAL in any thread's context if I - * passed the new socket "blindly" to the single NI that needed an + /* CAVEAT EMPTOR: I may be called by an LND in any thread's context if + * I passed the new socket "blindly" to the single NI that needed an * acceptor. If so, blind_ni != NULL... */ LASSERT (sizeof(cr) <= 16); /* not too big for the stack */ @@ -216,6 +228,24 @@ lnet_accept(lnet_ni_t *blind_ni, struct socket *sock, __u32 magic) if (!lnet_accept_magic(magic, LNET_PROTO_ACCEPTOR_MAGIC)) { + if (lnet_accept_magic(magic, LNET_PROTO_MAGIC)) { + /* future version compatibility! + * When LNET unifies protocols over all LNDs, the first + * thing sent will be a version query. I send back + * LNET_PROTO_ACCEPTOR_MAGIC to tell her I'm "old" */ + + memset (&cr, 0, sizeof(cr)); + cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC; + cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION; + rc = libcfs_sock_write(sock, &cr, sizeof(cr), 0); + + if (rc != 0) + CERROR("Error sending magic+version in response" + "to LNET magic from %u.%u.%u.%u: %d\n", + HIPQUAD(peer_ip), rc); + return -EPROTO; + } + if (magic == le32_to_cpu(LNET_PROTO_TCP_MAGIC)) str = "'old' socknal/tcpnal"; else if (lnet_accept_magic(magic, LNET_PROTO_RA_MAGIC)) @@ -231,36 +261,56 @@ lnet_accept(lnet_ni_t *blind_ni, struct socket *sock, __u32 magic) return -EPROTO; } - flip = magic != LNET_PROTO_ACCEPTOR_MAGIC; + flip = (magic != LNET_PROTO_ACCEPTOR_MAGIC); - /* FTTB, we only have 1 acceptor protocol version. When this changes, - * we'll have to read the version number first before we know how much - * more to read... */ rc = libcfs_sock_read(sock, &cr.acr_version, - sizeof(cr) - - offsetof(lnet_acceptor_connreq_t, acr_version), + sizeof(cr.acr_version), accept_timeout); if (rc != 0) { - CERROR("Error %d reading connection request from " + CERROR("Error %d reading connection request version from " "%u.%u.%u.%u\n", rc, HIPQUAD(peer_ip)); return -EIO; } - if (flip) { + if (flip) __swab32s(&cr.acr_version); - __swab64s(&cr.acr_nid); - } if (cr.acr_version != LNET_PROTO_ACCEPTOR_VERSION) { - LCONSOLE_ERROR("Refusing connection from %u.%u.%u.%u: " - " unrecognised protocol version %d\n", - HIPQUAD(peer_ip), cr.acr_version); + /* future version compatibility! + * An acceptor-specific protocol rev will first send a version + * query. I send back my current version to tell her I'm + * "old". */ + int peer_version = cr.acr_version; + + memset (&cr, 0, sizeof(cr)); + cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC; + cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION; + + rc = libcfs_sock_write(sock, &cr, sizeof(cr), 0); + + if (rc != 0) + CERROR("Error sending magic+version in response" + "to version %d from %u.%u.%u.%u: %d\n", + peer_version, HIPQUAD(peer_ip), rc); return -EPROTO; } + rc = libcfs_sock_read(sock, &cr.acr_nid, + sizeof(cr) - + offsetof(lnet_acceptor_connreq_t, acr_nid), + accept_timeout); + if (rc != 0) { + CERROR("Error %d reading connection request from " + "%u.%u.%u.%u\n", rc, HIPQUAD(peer_ip)); + return -EIO; + } + + if (flip) + __swab64s(&cr.acr_nid); + ni = lnet_net2ni(LNET_NIDNET(cr.acr_nid)); - if (ni == NULL || /* no matching net */ - ni->ni_nid != cr.acr_nid) /* right NET, but wrong NID! */ { + if (ni == NULL || /* no matching net */ + ni->ni_nid != cr.acr_nid) { /* right NET, wrong NID! */ if (ni != NULL) lnet_ni_decref(ni); LCONSOLE_ERROR("Refusing connection from %u.%u.%u.%u for %s: " @@ -270,28 +320,25 @@ lnet_accept(lnet_ni_t *blind_ni, struct socket *sock, __u32 magic) } if (ni->ni_lnd->lnd_accept == NULL) { + /* This catches a request for the loopback LND */ lnet_ni_decref(ni); LCONSOLE_ERROR("Refusing connection from %u.%u.%u.%u for %s: " " NI doesn not accept IP connections\n", HIPQUAD(peer_ip), libcfs_nid2str(cr.acr_nid)); return -EPERM; } - + CDEBUG(D_NET, "Accept %s from %u.%u.%u.%u%s\n", libcfs_nid2str(cr.acr_nid), HIPQUAD(peer_ip), blind_ni == NULL ? "" : " (blind)"); if (blind_ni == NULL) { + /* called by the acceptor: call into the requested NI... */ rc = ni->ni_lnd->lnd_accept(ni, sock); - if (rc != 0) - CERROR("NI %s refused connection from %u.%u.%u.%u\n", - libcfs_nid2str(ni->ni_nid), HIPQUAD(peer_ip)); } else { - /* blind_ni is the only NI that needs me and it was given the - * chance to handle this connection request itself in case it - * was sent by an "old" socknal. But this connection request - * uses the new acceptor protocol and I'm just being called to - * verify and skip it */ + /* portals_compatible set and the (only) NI called me to verify + * and skip the connection request... */ + LASSERT (the_lnet.ln_ptlcompat != 0); LASSERT (ni == blind_ni); rc = 0; } @@ -306,25 +353,25 @@ lnet_acceptor(void *arg) { char name[16]; struct socket *newsock; - int rc; + int rc = 0; int n_acceptor_nis; __u32 magic; __u32 peer_ip; int peer_port; - lnet_ni_t *blind_ni; + lnet_ni_t *blind_ni = NULL; int secure = (int)((unsigned long)arg); LASSERT (lnet_acceptor_state.pta_sock == NULL); - /* If there is only a single NI that needs me, I'll pass her - * connections "blind". Otherwise I'll have to read the bytestream to - * see which NI the connection is for. NB I don't get to run at all if - * there are 0 acceptor_nis... */ - n_acceptor_nis = lnet_count_acceptor_nis(&blind_ni); - LASSERT (n_acceptor_nis > 0); - if (n_acceptor_nis > 1) { - lnet_ni_decref(blind_ni); - blind_ni = NULL; + if (the_lnet.ln_ptlcompat != 0) { + /* When portals_compatibility is enabled, peers may connect + * without sending an acceptor connection request. There is no + * ambiguity about which network the peer wants to connect to + * since there can only be 1 network, so I pass connections + * "blindly" to it. */ + n_acceptor_nis = lnet_count_acceptor_nis(&blind_ni); + LASSERT (n_acceptor_nis == 1); + LASSERT (blind_ni != NULL); } snprintf(name, sizeof(name), "acceptor_%03d", accept_port); @@ -440,11 +487,6 @@ lnet_acceptor_start(void) long pid; long secure; - /* If we're talking to any portals (pre-LNET) nodes we force the old - * acceptor protocol on outgoing connections */ - if (the_lnet.ln_ptlcompat > 1) - accept_proto_version = 0; - LASSERT (lnet_acceptor_state.pta_sock == NULL); init_mutex_locked(&lnet_acceptor_state.pta_signal); diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c index 2b0905e..da1e036 100644 --- a/lnet/lnet/api-ni.c +++ b/lnet/lnet/api-ni.c @@ -1071,13 +1071,21 @@ lnet_startup_lndnis (void) nicount++; } - if (nicount > 1 && the_lnet.ln_eqwaitni != NULL) { - lnd_type = the_lnet.ln_eqwaitni->ni_lnd->lnd_type; - LCONSOLE_ERROR("LND %s can only run single-network\n", - libcfs_lnd2str(lnd_type)); - goto failed; + if (nicount > 1) { + if (the_lnet.ln_eqwaitni != NULL) { + lnd_type = the_lnet.ln_eqwaitni->ni_lnd->lnd_type; + LCONSOLE_ERROR("LND %s can only run single-network\n", + libcfs_lnd2str(lnd_type)); + goto failed; + } + + if (the_lnet.ln_ptlcompat != 0) { + LCONSOLE_ERROR("Can't run > 1 network when " + "portals_compatibility is set\n"); + goto failed; + } } - + return 0; failed: @@ -1190,10 +1198,19 @@ LNetNIInit(lnet_pid_t requested_pid) if (rc != 0) goto failed3; - lnet_proc_init(); the_lnet.ln_refcount = 1; + /* Now I may use my own API functions... */ + + rc = lnet_ping_target_init(); + if (rc != 0) + goto failed4; + + lnet_proc_init(); goto out; + failed4: + the_lnet.ln_refcount = 0; + lnet_acceptor_stop(); failed3: lnet_destroy_routes(); failed2: @@ -1215,12 +1232,17 @@ LNetNIFini() LASSERT (the_lnet.ln_init); LASSERT (the_lnet.ln_refcount > 0); - the_lnet.ln_refcount--; - if (the_lnet.ln_refcount == 0) { - + if (the_lnet.ln_refcount != 1) { + the_lnet.ln_refcount--; + } else { LASSERT (!the_lnet.ln_niinit_self); lnet_proc_fini(); + lnet_ping_target_fini(); + + /* Teardown fns that use my own API functions BEFORE here */ + the_lnet.ln_refcount = 0; + lnet_acceptor_stop(); lnet_destroy_routes(); lnet_shutdown_lndnis(); @@ -1277,6 +1299,23 @@ LNetCtl(unsigned int cmd, void *arg) data->ioc_u32[0] = rc; return 0; + + case IOC_LIBCFS_TESTPROTOCOMPAT: + LNET_LOCK(); + the_lnet.ln_testprotocompat = data->ioc_flags; + LNET_UNLOCK(); + return 0; + + case IOC_LIBCFS_PING: + rc = lnet_ping((lnet_process_id_t) {.nid = data->ioc_nid, + .pid = data->ioc_u32[0]}, + data->ioc_u32[1], /* timeout */ + (lnet_process_id_t *)data->ioc_pbuf1, + data->ioc_plen1/sizeof(lnet_process_id_t)); + if (rc < 0) + return rc; + data->ioc_count = rc; + return 0; default: ni = lnet_net2ni(data->ioc_net); @@ -1330,3 +1369,190 @@ LNetSnprintHandle(char *str, int len, lnet_handle_any_t h) } +int +lnet_ping_target_init(void) +{ + static lnet_process_id_t my_ids[10]; + + lnet_handle_me_t meh; + int rc; + int rc2; + int n_ids; + + for (n_ids = 0; n_ids < sizeof(my_ids)/sizeof(my_ids[0]); n_ids++) { + rc = LNetGetId(n_ids, &my_ids[n_ids]); + + if (rc == 0) + continue; + + if (rc == -ENOENT) + break; + + CERROR("Error %d getting id %d\n", rc, n_ids); + return rc; + } + + rc = LNetMEAttach(LNET_RESERVED_PORTAL, + (lnet_process_id_t){.nid = LNET_NID_ANY, + .pid = LNET_PID_ANY}, + LNET_PING_MATCHBITS, 0xffffffffffffffffLL, + LNET_UNLINK, LNET_INS_AFTER, + &meh); + if (rc != 0) { + CERROR("Can't create ping ME: %d\n", rc); + return rc; + } + + rc = LNetMDAttach(meh, + (lnet_md_t){.start = my_ids, + .length = n_ids * sizeof(my_ids[0]), + .threshold = LNET_MD_THRESH_INF, + .options = (LNET_MD_OP_GET | + LNET_MD_TRUNCATE | + LNET_MD_MANAGE_REMOTE), + .eq_handle = LNET_EQ_NONE}, + LNET_RETAIN, + &the_lnet.ln_ping_target_md); + if (rc != 0) { + CERROR("Can't attach ping MD: %d\n", rc); + rc2 = LNetMEUnlink(meh); + LASSERT (rc2 == 0); + return rc; + } + + return 0; +} + +void +lnet_ping_target_fini(void) +{ + int rc = LNetMDUnlink(the_lnet.ln_ping_target_md); + + if (rc != 0) + CERROR("Can't unlink the ping MD: %d\n", rc); + + /* NB this MD may still be active: but since I have no EQ, I don't get + * to see the UNLINK event... */ +} + +int +lnet_ping (lnet_process_id_t id, int timeout_ms, lnet_process_id_t *ids, int n_ids) +{ + lnet_process_id_t *tmp_ids; + lnet_handle_eq_t eqh; + lnet_handle_md_t mdh; + lnet_event_t event; + int which; + int unlinked = 0; + int replied = 0; + const int a_long_time = 60000; /* mS */ + int rc; + int rc2; + + if (n_ids <= 0 || + id.nid == LNET_NID_ANY || + timeout_ms > 500000 || /* arbitrary limit! */ + n_ids > 20) /* arbitrary limit! */ + return -EINVAL; + + if (id.pid == LNET_PID_ANY) + id.pid = LUSTRE_SRV_LNET_PID; + + LIBCFS_ALLOC(tmp_ids, n_ids * sizeof(*tmp_ids)); + if (tmp_ids == NULL) + return -ENOMEM; + + /* NB 2 events max (including any unlink event) */ + rc = LNetEQAlloc(2, LNET_EQ_HANDLER_NONE, &eqh); + if (rc != 0) { + CERROR("Can't allocate EQ: %d\n", rc); + goto out_0; + } + + rc = LNetMDBind((lnet_md_t){.start = tmp_ids, + .length = n_ids * sizeof(*tmp_ids), + .threshold = 2, /* GET/REPLY */ + .options = LNET_MD_TRUNCATE, + .eq_handle = eqh}, + LNET_UNLINK, + &mdh); + if (rc != 0) { + CERROR("Can't bind MD: %d\n", rc); + goto out_1; + } + + rc = LNetGet(LNET_NID_ANY, mdh, id, + LNET_RESERVED_PORTAL, + LNET_PING_MATCHBITS, 0); + + if (rc != 0) { + /* Don't CERROR; this could be deliberate! */ + + rc2 = LNetMDUnlink(mdh); + LASSERT (rc2 == 0); + + /* NB must wait for the UNLINK event below... */ + unlinked = 1; + timeout_ms = a_long_time; + } + + do { + rc2 = LNetEQPoll(&eqh, 1, timeout_ms, &event, &which); + + CDEBUG(D_NET, "poll %d(%d %d)%s\n", rc2, + (rc2 <= 0) ? -1 : event.type, + (rc2 <= 0) ? -1 : event.status, + (rc2 > 0 && event.unlinked) ? " unlinked" : ""); + + LASSERT (rc2 != -EOVERFLOW); /* can't miss anything */ + + if (rc2 <= 0 || event.status != 0) { + /* timeout or error */ + if (!replied && rc == 0) + rc = (rc2 < 0) ? rc2 : + (rc2 == 0) ? -ETIMEDOUT : + event.status; + + if (!unlinked) { + /* Ensure completion in finite time... */ + LNetMDUnlink(mdh); + /* No assertion (racing with network) */ + unlinked = 1; + timeout_ms = a_long_time; + } else if (rc2 == 0) { + /* timed out waiting for unlink */ + CWARN("ping %s: late network completion\n", + libcfs_id2str(id)); + } + + } else if (event.type == LNET_EVENT_REPLY) { + replied = 1; + rc = event.mlength; + } + + } while (rc2 <= 0 || !event.unlinked); + + if (replied) { + rc = rc / sizeof(*tmp_ids); + LASSERT (rc >= 0 && rc <= n_ids); +#ifdef __KERNEL__ + if (copy_to_user(ids, tmp_ids, rc * sizeof(*tmp_ids))) + rc = -EFAULT; +#else + memcpy(ids, tmp_ids, rc * sizeof(*tmp_ids)); +#endif + } else if (rc >= 0) { + CWARN("Unexpected rc >= 0 but no reply!\n"); + rc = -EIO; + } + + out_1: + rc2 = LNetEQFree(eqh); + if (rc2 != 0) + CERROR("rc2 %d\n", rc2); + LASSERT (rc2 == 0); + + out_0: + LIBCFS_FREE(tmp_ids, n_ids * sizeof(*tmp_ids)); + return rc; +} diff --git a/lnet/lnet/module.c b/lnet/lnet/module.c index ea5fa02..bcbc8a0 100644 --- a/lnet/lnet/module.c +++ b/lnet/lnet/module.c @@ -170,7 +170,6 @@ EXPORT_SYMBOL(lnet_copy_kiov2kiov); EXPORT_SYMBOL(lnet_finalize); EXPORT_SYMBOL(lnet_parse); EXPORT_SYMBOL(lnet_create_reply_msg); -EXPORT_SYMBOL(lnet_net2ni); MODULE_AUTHOR("Peter J. Braam "); MODULE_DESCRIPTION("Portals v3.1"); diff --git a/lnet/utils/gmlndnid.c b/lnet/utils/gmlndnid.c index f158745..c86d625 100644 --- a/lnet/utils/gmlndnid.c +++ b/lnet/utils/gmlndnid.c @@ -46,27 +46,34 @@ void usage(char *prg, int h) { - fprintf(stderr, "usage %s -n hostname | -l | -h\n", prg); - if (h) { - printf("\nGet Myrinet Global network ids for specified host\n" - "-l gets network id for local host\n"); - } + fprintf(stderr, + "usage %s -h\n" + " %s [-l] [-n hostname] [-L] [hostnames]\n", prg); + + if (h) + printf("Print Myrinet Global network ids for specified hosts\n" + "-l print local host's ID\n" + "-n hostname print given host's ID\n" + "-L print Myringet local net ID too\n" + "[hostnames] print ids of given hosts (local if none)\n"); } -unsigned -u_getgmnid(char *name, int get_local_id) +gm_status_t +print_gmid(char *name, int name_fieldlen, int show_local_id) { struct gm_port *gm_port; - int gm_port_id = 2; - gm_status_t gm_status = GM_SUCCESS; - unsigned global_nid = 0, local_nid = 0; /* gm ids never 0 */ + int gm_port_id; + gm_status_t gm_status; + unsigned int local_id; + unsigned int global_id; gm_status = gm_init(); if (gm_status != GM_SUCCESS) { fprintf(stderr, "gm_init: %s\n", gm_strerror(gm_status)); - return(0); + return gm_status; } + gm_port_id = 2; gm_status = gm_open(&gm_port, GM_UNIT, gm_port_id, "gmnalnid", GM_API_VERSION); if (gm_status != GM_SUCCESS) { @@ -82,77 +89,96 @@ u_getgmnid(char *name, int get_local_id) if (gm_status != GM_SUCCESS) { fprintf(stderr, "gm_open: %s\n",gm_strerror(gm_status)); - gm_finalize(); - return(0); + goto out_0; } } - if (get_local_id) { - local_nid = 1; + if (name == NULL) { + local_id = 1; + name = ""; } else { gm_status = gm_host_name_to_node_id_ex(gm_port, 1000000, name, - &local_nid); + &local_id); if (gm_status != GM_SUCCESS) { - fprintf(stderr, "gm_host_name_to_node_id_ex: %s\n", - gm_strerror(gm_status)); - gm_close(gm_port); - gm_finalize(); - return(0); + fprintf(stderr, "gm_host_name_to_node_id_ex(%s): %s\n", + name, gm_strerror(gm_status)); + goto out_1; } } - gm_status = gm_node_id_to_global_id(gm_port, local_nid, &global_nid) ; + gm_status = gm_node_id_to_global_id(gm_port, local_id, &global_id) ; if (gm_status != GM_SUCCESS) { - fprintf(stderr, "gm_node_id_to_global_id: %s\n", - gm_strerror(gm_status)); - gm_close(gm_port); - gm_finalize(); - return(0); + fprintf(stderr, "gm_node_id_to_global_id(%s:%d): %s\n", + name, local_id, gm_strerror(gm_status)); + goto out_1; } + + if (name_fieldlen > 0) + printf ("%*s ", name_fieldlen, name); + + if (!show_local_id) + printf("0x%x\n", global_id); + else + printf("local 0x%x global 0x%x\n", local_id, global_id); + + out_1: gm_close(gm_port); + out_0: gm_finalize(); - return(global_nid); + + return gm_status; } -int main(int argc, char **argv) +int +main (int argc, char **argv) { - unsigned int nid = 0; - char *name = NULL; int c; - int get_local_id = 0; + gm_status_t gmrc; + int rc; + int max_namelen = 0; + int show_local_id = 0; - while ((c = getopt(argc, argv, "n:lh")) != -1) { + while ((c = getopt(argc, argv, "n:lLh")) != -1) switch(c) { - case('n'): - if (get_local_id) { - usage(argv[0], 0); - exit(-1); - } - name = optarg; - break; - case('h'): + case 'h': usage(argv[0], 1); - exit(-1); - break; - case('l'): - if (name) { - usage(argv[0], 0); - exit(-1); - } - get_local_id = 1; + return 0; + + case 'L': + show_local_id = 1; break; + + case 'n': + gmrc = print_gmid(optarg, 0, show_local_id); + return (gmrc == GM_SUCCESS) ? 0 : 1; + + case 'l': + gmrc = print_gmid(NULL, 0, show_local_id); + return (gmrc == GM_SUCCESS) ? 0 : 1; + default: usage(argv[0], 0); - exit(-1); + return 2; } - } - if (!name && !get_local_id) { - usage(argv[0], 0); - exit(-1); + if (optind == argc) { + gmrc = print_gmid(NULL, 0, show_local_id); + return (gmrc == GM_SUCCESS) ? 0 : 1; } - nid = u_getgmnid(name, get_local_id); - printf("0x%x\n", nid); - exit(0); + if (optind != argc - 1) + for (c = optind; c < argc; c++) + if (strlen(argv[c]) > max_namelen) + max_namelen = strlen(argv[c]); + + rc = 0; + + for (c = optind; c < argc; c++) { + gmrc = print_gmid(argv[c], max_namelen, show_local_id); + + if (gmrc != GM_SUCCESS) + rc = 1; + } + + return rc; } diff --git a/lnet/utils/portals.c b/lnet/utils/portals.c index 7a24c3d..abb2cdd 100644 --- a/lnet/utils/portals.c +++ b/lnet/utils/portals.c @@ -982,7 +982,7 @@ jt_ptl_print_active_txs (int argc, char **argv) return 0; } -int jt_ptl_ping(int argc, char **argv) +int jt_ptl_ping_test(int argc, char **argv) { int rc; lnet_nid_t nid; @@ -1025,7 +1025,7 @@ int jt_ptl_ping(int argc, char **argv) data.ioc_u32[0] = size; data.ioc_u32[1] = timeout; - rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_PING, &data); + rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_PING_TEST, &data); if (rc) { fprintf(stderr, "failed to start pinger: %s\n", strerror(errno)); @@ -1034,6 +1034,58 @@ int jt_ptl_ping(int argc, char **argv) return 0; } +int jt_ptl_ping(int argc, char **argv) +{ + int rc; + int timeout; + lnet_process_id_t id; + lnet_process_id_t ids[16]; + struct libcfs_ioctl_data data; + int i; + + if (argc < 2) { + fprintf(stderr, "usage: %s nid [timeout (secs)] [pid]\n", argv[0]); + return 0; + } + + id.nid = libcfs_str2nid(argv[1]); + if (id.nid == LNET_NID_ANY) { + fprintf (stderr, "Can't parse nid \"%s\"\n", argv[1]); + return -1; + } + + if (argc > 2) + timeout = 1000 * atol(argv[2]); + else + timeout = 1000; /* default 1 second timeout */ + + if (argc > 3) + id.pid = atol(argv[4]); + else + id.pid = LNET_PID_ANY; + + LIBCFS_IOC_INIT (data); + data.ioc_nid = id.nid; + data.ioc_u32[0] = id.pid; + data.ioc_u32[1] = timeout; + data.ioc_plen1 = sizeof(ids); + data.ioc_pbuf1 = (char *)ids; + + rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_PING, &data); + if (rc != 0) { + fprintf(stderr, "failed to ping %s: %s\n", + id.pid == LNET_PID_ANY ? + libcfs_nid2str(id.nid) : libcfs_id2str(id), + strerror(errno)); + return -1; + } + + for (i = 0; i < data.ioc_count; i++) + printf("%s\n", libcfs_id2str(ids[i])); + + return 0; +} + int jt_ptl_mynid(int argc, char **argv) { struct libcfs_ioctl_data data; @@ -1686,3 +1738,36 @@ int jt_ptl_memhog(int argc, char **argv) return 0; } +int jt_ptl_testprotocompat(int argc, char **argv) +{ + struct libcfs_ioctl_data data; + int rc; + int flags; + char *end; + + if (argc < 2) { + fprintf(stderr, "usage: %s \n", argv[0]); + return 0; + } + + flags = strtol(argv[1], &end, 0); + if (flags < 0 || *end != 0) { + fprintf(stderr, "Can't parse flags '%s'\n", argv[1]); + return -1; + } + + LIBCFS_IOC_INIT(data); + data.ioc_flags = flags; + rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_TESTPROTOCOMPAT, &data); + + if (rc != 0) { + fprintf(stderr, "test proto compat %x failed: %s\n", + flags, strerror(errno)); + return -1; + } + + printf("test proto compat %x OK\n", flags); + return 0; +} + + diff --git a/lnet/utils/ptlctl.c b/lnet/utils/ptlctl.c index 11dc075..c3ab2b7 100644 --- a/lnet/utils/ptlctl.c +++ b/lnet/utils/ptlctl.c @@ -43,7 +43,8 @@ command_t list[] = { {"disconnect", jt_ptl_disconnect, 0, "disconnect from a remote nid (args: [nid] [host]"}, {"push", jt_ptl_push_connection, 0, "flush connection to a remote nid (args: [nid]"}, {"active_tx", jt_ptl_print_active_txs, 0, "print active transmits (no args)"}, - {"ping", jt_ptl_ping, 0, "do a ping test (args: nid [count] [size] [timeout])"}, + {"testping", jt_ptl_ping_test, 0, "do a ping test (args: nid [count] [size] [timeout])"}, + {"ping", jt_ptl_ping, 0, "ping (args: nid [timeout] [pid])"}, {"mynid", jt_ptl_mynid, 0, "inform the socknal of the local NID (args: [hostname])"}, {"add_route", jt_ptl_add_route, 0, "add an entry to the routing table (args: gatewayNID targetNID [targetNID])"}, @@ -54,6 +55,7 @@ command_t list[] = { {"print_routes", jt_ptl_print_routes, 0, "print the routing table (args: none)"}, {"dump", jt_ioc_dump, 0, "usage: dump file, save ioctl buffer to file"}, {"fail", jt_ptl_fail_nid, 0, "usage: fail nid|_all_ [count]"}, + {"testprotocompat", jt_ptl_testprotocompat, 0, "usage: testprotocompat count"}, {"help", Parser_help, 0, "help"}, {"exit", Parser_quit, 0, "quit"}, {"quit", Parser_quit, 0, "quit"}, diff --git a/lustre/utils/lctl.c b/lustre/utils/lctl.c index a60a892..8db931f 100644 --- a/lustre/utils/lctl.c +++ b/lustre/utils/lctl.c @@ -120,6 +120,9 @@ command_t cmdlist[] = { "Omitting the count means indefinitely, 0 means restore, " "otherwise fail 'count' messages.\n" "usage: fail nid|_all_ [count]"}, + {"ping", jt_ptl_ping, 0, + "Check LNET connectivity\n" + "usage: ping nid [timeout] [pid]"}, /* Device selection commands */ {"=== device selection ===", jt_noop, 0, "device selection"}, -- 1.8.3.1