From ca387f068b8cb8b69eb6a24f9d03210b40bc68cb Mon Sep 17 00:00:00 2001 From: eeb Date: Wed, 23 Feb 2005 21:43:25 +0000 Subject: [PATCH] * First cut working vibnal --- lnet/autoconf/lustre-lnet.m4 | 108 +- lnet/klnds/viblnd/Makefile.in | 2 +- lnet/klnds/viblnd/Makefile.mk | 2 +- lnet/klnds/viblnd/viblnd.c | 1366 ++++++++------ lnet/klnds/viblnd/viblnd.h | 852 ++++----- lnet/klnds/viblnd/viblnd_cb.c | 4154 +++++++++++++++++++++-------------------- lnet/klnds/viblnd/vibnal_sa.c | 333 ---- lnet/utils/portals.c | 28 +- 8 files changed, 3319 insertions(+), 3526 deletions(-) delete mode 100644 lnet/klnds/viblnd/vibnal_sa.c diff --git a/lnet/autoconf/lustre-lnet.m4 b/lnet/autoconf/lustre-lnet.m4 index 6780155..e9c5889 100644 --- a/lnet/autoconf/lustre-lnet.m4 +++ b/lnet/autoconf/lustre-lnet.m4 @@ -1,4 +1,25 @@ # +# LP_CHECK_GCC_VERSION +# +# Check compiler version +# +AC_DEFUN([LP_CHECK_GCC_VERSION], +[AC_MSG_CHECKING([compiler version]) +PTL_CC_VERSION=`$CC --version | awk '/^gcc/{print $ 3}'` +PTL_MIN_CC_VERSION="3.2.3" +v2n() { + awk -F. '{printf "%d\n", (($ 1)*100+($ 2))*100+($ 3)}' +} +if test -z "$PTL_CC_VERSION" -o \ + `echo $PTL_CC_VERSION | v2n` -ge `echo $PTL_MIN_CC_VERSION | v2n`; then + AC_MSG_RESULT([ok]) +else + AC_MSG_RESULT([Buggy compiler found]) + AC_MSG_ERROR([Need gcc version >= $PTL_MIN_CC_VERSION]) +fi +]) + +# # LP_CONFIG_ZEROCOPY # # check if zerocopy is available/wanted @@ -242,29 +263,66 @@ AC_SUBST(IIBNAL) # check for Voltaire infiniband support # AC_DEFUN([LP_CONFIG_VIB], -[AC_MSG_CHECKING([if Voltaire IB kernel headers are present]) -VIBCPPFLAGS="-I/usr/local/include/ibhost-kdevel -DCPU_BE=0 -DCPU_LE=1 -DGSI_PASS_PORT_NUM" -EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS" -EXTRA_KCFLAGS="$EXTRA_KCFLAGS $VIBCPPFLAGS" -LB_LINUX_TRY_COMPILE([ - #include - #include -],[ - vv_hca_h_t kib_hca; - vv_return_t retval; - - retval = vv_hca_open("ANY_HCA", NULL, &kib_hca); - - return retval == vv_return_ok ? 
0 : 1; -],[ - AC_MSG_RESULT([yes]) - VIBNAL="vibnal" -],[ - AC_MSG_RESULT([no]) +[AC_MSG_CHECKING([whether to enable Voltaire IB support]) +VIBPATH="" +AC_ARG_WITH([vib], + AC_HELP_STRING([--with-vib=path], + [build vibnal against path]), + [ + case $with_vib in + no) AC_MSG_RESULT([no]);; + *) VIBPATH="${with_vib}/src/nvigor/ib-code" + if test -d "$with_vib" -a -d "$VIBPATH"; then + AC_MSG_RESULT([yes]) + else + AC_MSG_RESULT([no]) + AC_MSG_ERROR([No directory $VIBPATH]) + fi;; + esac + ],[ + AC_MSG_RESULT([no]) + ]) +if test -z "$VIBPATH"; then VIBNAL="" -VIBCPPFLAGS="" -]) -EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save" +else + VIBCPPFLAGS="-I${VIBPATH}/include -I${VIBPATH}/cm" + EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS" + EXTRA_KCFLAGS="$EXTRA_KCFLAGS $VIBCPPFLAGS" + LB_LINUX_TRY_COMPILE([ + #include <linux/version.h> + #include <asm/byteorder.h> + #ifdef __BIG_ENDIAN + # define CPU_BE 1 + # define CPU_LE 0 + #endif + #ifdef __LITTLE_ENDIAN + # define CPU_BE 0 + # define CPU_LE 1 + #endif + #include <vverbs.h> + #include <ib-cm.h> + #include <ibat.h> + ],[ + vv_hca_h_t kib_hca; + vv_return_t vvrc; + cm_cep_handle_t cep; + ibat_arp_data_t arp_data; + ibat_stat_t ibatrc; + + vvrc = vv_hca_open("ANY_HCA", NULL, &kib_hca); + cep = cm_create_cep(cm_cep_transp_rc); + ibatrc = ibat_get_ib_data((uint32_t)0, (uint32_t)0, + ibat_paths_primary, &arp_data, + (ibat_get_ib_data_reply_fn_t)NULL, + NULL, 0); + return 0; + ],[ + VIBNAL="vibnal" + ],[ + AC_MSG_ERROR([can't compile vibnal with given path]) + ]) + EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save" +fi AC_SUBST(VIBCPPFLAGS) AC_SUBST(VIBNAL) ]) @@ -411,13 +469,15 @@ fi # Portals linux kernel checks # AC_DEFUN([LP_PROG_LINUX], -[LP_CONFIG_ZEROCOPY +[LP_CHECK_GCC_VERSION + +LP_CONFIG_ZEROCOPY LP_CONFIG_AFFINITY LP_CONFIG_QUADRICS LP_CONFIG_GM LP_CONFIG_OPENIB -LP_CONFIG_IIB LP_CONFIG_VIB +LP_CONFIG_IIB LP_CONFIG_RANAL LP_STRUCT_PAGE_LIST diff --git a/lnet/klnds/viblnd/Makefile.in b/lnet/klnds/viblnd/Makefile.in index fd7bb05..5287e70 100644 --- a/lnet/klnds/viblnd/Makefile.in +++ b/lnet/klnds/viblnd/Makefile.in @@ -1,5 +1,5 @@ MODULES := kvibnal -kvibnal-objs := vibnal.o vibnal_cb.o vibnal_sa.o +kvibnal-objs := vibnal.o vibnal_cb.o EXTRA_POST_CFLAGS := @VIBCPPFLAGS@ diff --git a/lnet/klnds/viblnd/Makefile.mk b/lnet/klnds/viblnd/Makefile.mk index d08633a..ffc1510 100644 --- a/lnet/klnds/viblnd/Makefile.mk +++ b/lnet/klnds/viblnd/Makefile.mk @@ -6,5 +6,5 @@ include $(src)/../../Kernelenv obj-y += kvibnal.o -kvibnal-objs := vibnal.o vibnal_cb.o vibnal_sa.o +kvibnal-objs := vibnal.o vibnal_cb.o diff --git a/lnet/klnds/viblnd/viblnd.c b/lnet/klnds/viblnd/viblnd.c index 0c0a0e7..2cb4b7d 100644 --- a/lnet/klnds/viblnd/viblnd.c +++ b/lnet/klnds/viblnd/viblnd.c @@ -26,12 +26,9 @@ nal_t kibnal_api; ptl_handle_ni_t kibnal_ni; +kib_data_t kibnal_data; kib_tunables_t kibnal_tunables; -kib_data_t kibnal_data = { - .kib_service_id = IBNAL_SERVICE_NUMBER, -}; - #ifdef CONFIG_SYSCTL #define IBNAL_SYSCTL 202 @@ -50,268 +47,330 @@ static ctl_table kibnal_top_ctl_table[] = { }; #endif -#ifdef unused void -print_service(IB_SERVICE_RECORD *service, char *tag, int rc) +kibnal_pause(int ticks) { - char name[32]; + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(ticks); +} - if (service == NULL) - { - CWARN("tag : %s\n" - "status : %d (NULL)\n", tag, rc); - return; - } - strncpy (name, service->ServiceName, sizeof(name)-1); - name[sizeof(name)-1] = 0; - - CWARN("tag : %s\n" - "status : %d\n" - "service id: "LPX64"\n" - "name : %s\n" - "NID : "LPX64"\n", tag, rc, - service->RID.ServiceID, name, - *kibnal_service_nid_field(service)); +__u32 
+kibnal_cksum (void *ptr, int nob) +{ + char *c = ptr; + __u32 sum = 0; + + while (nob-- > 0) + sum = ((sum << 1) | (sum >> 31)) + *c++; + + /* ensure I don't return 0 (== no checksum) */ + return (sum == 0) ? 1 : sum; } -#endif -/* - * method is SUBN_ADM_SET, SUBN_ADM_GET, SUBN_ADM_DELETE. Tables not supported. - * nid is the nid to advertize/query/unadvertize - */ -static void fill_sa_request(struct sa_request *request, int method, ptl_nid_t nid) +void +kibnal_init_msg(kib_msg_t *msg, int type, int body_nob) { - gsi_dtgrm_t *dtgrm = request->dtgrm_req; - sa_mad_v2_t *mad = (sa_mad_v2_t *) dtgrm->mad; - ib_service_record_v2_t *sr = (ib_service_record_v2_t *) mad->payload; - - memset(mad, 0, MAD_BLOCK_SIZE); - - request->mad = mad; - - dtgrm->rlid = kibnal_data.kib_port_attr.port_sma_address_info.sm_lid; - dtgrm->sl = kibnal_data.kib_port_attr.port_sma_address_info.service_level; - - mad->hdr.base_ver = MAD_IB_BASE_VERSION; - mad->hdr.class = MAD_CLASS_SUBN_ADM; - mad->hdr.class_ver = 2; - mad->hdr.m.ms.method = method; - mad->hdr.attrib_id = SA_SERVICE_RECORD; /* something(?) will swap that field */ - - /* Note: the transaction ID is set by the Voltaire stack if it is 0. */ - - /* TODO: change the 40 to sizeof(something) */ - mad->payload_len = cpu_to_be32(0x40 /*header size */ + - sizeof (ib_service_record_v2_t)); - - - mad->component_mask = cpu_to_be64( - (1ull << 0) | /* service_id */ - (1ull << 2) | /* service_pkey */ - (1ull << 6) | /* service_name */ - (1ull << 7) | /* service_data8[0] */ - (1ull << 8) | /* service_data8[1] */ - (1ull << 9) | /* service_data8[2] */ - (1ull << 10) | /* service_data8[3] */ - (1ull << 11) | /* service_data8[4] */ - (1ull << 12) | /* service_data8[5] */ - (1ull << 13) | /* service_data8[6] */ - (1ull << 14) /* service_data8[7] */ - ); - - sr->service_id = cpu_to_be64(kibnal_data.kib_service_id); - sr->service_pkey = cpu_to_be16(kibnal_data.kib_port_pkey); - - /* Set the service name and the data (bytes 0 to 7) in data8 */ - kibnal_set_service_keys(sr, nid); - - if (method == SUBN_ADM_SET) { - mad->component_mask |= cpu_to_be64( - (1ull << 1) | /* service_gid */ - (1ull << 4) /* service_lease */ - ); - - sr->service_gid = kibnal_data.kib_port_gid; - gid_swap(&sr->service_gid); - sr->service_lease = cpu_to_be32(0xffffffff); - } - - CDEBUG(D_NET, "SA request %02x for service id "LPX64" %s:"LPX64"\n", - mad->hdr.m.ms.method, - sr->service_id, - sr->service_name, - *kibnal_service_nid_field(sr)); + msg->ibm_type = type; + msg->ibm_nob = offsetof(kib_msg_t, ibm_u) + body_nob; } -/* Do an advertizement operation: - * SUBN_ADM_GET = 0x01 (i.e. query), - * SUBN_ADM_SET = 0x02 (i.e. advertize), - * SUBN_ADM_DELETE = 0x15 (i.e. un-advertize). - * If callback is NULL, the function is synchronous (and context is ignored). - */ -int kibnal_advertize_op(ptl_nid_t nid, int op, sa_request_cb_t callback, void *context) +void +kibnal_pack_msg(kib_msg_t *msg, int credits, ptl_nid_t dstnid, __u64 dststamp) { - struct sa_request *request; - int ret; + /* CAVEAT EMPTOR! all message fields not set here should have been + * initialised previously. 
*/ + msg->ibm_magic = IBNAL_MSG_MAGIC; + msg->ibm_version = IBNAL_MSG_VERSION; + /* ibm_type */ + msg->ibm_credits = credits; + /* ibm_nob */ + msg->ibm_cksum = 0; + msg->ibm_srcnid = kibnal_lib.libnal_ni.ni_pid.nid; + msg->ibm_srcstamp = kibnal_data.kib_incarnation; + msg->ibm_dstnid = dstnid; + msg->ibm_dststamp = dststamp; +#if IBNAL_CKSUM + /* NB ibm_cksum zero while computing cksum */ + msg->ibm_cksum = kibnal_cksum(msg, msg->ibm_nob); +#endif +} - LASSERT (kibnal_data.kib_nid != PTL_NID_ANY); +int +kibnal_unpack_msg(kib_msg_t *msg, int nob) +{ + const int hdr_size = offsetof(kib_msg_t, ibm_u); + __u32 msg_cksum; + int flip; + int msg_nob; + int i; + int n; + + /* 6 bytes are enough to have received magic + version */ + if (nob < 6) { + CERROR("Short message: %d\n", nob); + return -EPROTO; + } + + if (msg->ibm_magic == IBNAL_MSG_MAGIC) { + flip = 0; + } else if (msg->ibm_magic == __swab32(IBNAL_MSG_MAGIC)) { + flip = 1; + } else { + CERROR("Bad magic: %08x\n", msg->ibm_magic); + return -EPROTO; + } - CDEBUG(D_NET, "kibnal_advertize_op: nid="LPX64", op=%d\n", nid, op); + if (msg->ibm_version != + (flip ? __swab16(IBNAL_MSG_VERSION) : IBNAL_MSG_VERSION)) { + CERROR("Bad version: %d\n", msg->ibm_version); + return -EPROTO; + } - request = alloc_sa_request(); - if (request == NULL) { - CERROR("Cannot allocate a SA request"); - return -ENOMEM; + if (nob < hdr_size) { + CERROR("Short message: %d\n", nob); + return -EPROTO; } - - fill_sa_request(request, op, nid); - if (callback) { - request->callback = callback; - request->context = context; - } else { - init_completion(&request->signal); + msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob; + if (msg_nob > nob) { + CERROR("Short message: got %d, wanted %d\n", nob, msg_nob); + return -EPROTO; } - ret = vibnal_start_sa_request(request); - if (ret) { - CERROR("vibnal_send_sa failed: %d\n", ret); - free_sa_request(request); - } else { - if (callback) { - /* Return. The callback will have to free the SA request. */ - ret = 0; - } else { - wait_for_completion(&request->signal); + /* checksum must be computed with ibm_cksum zero and BEFORE anything + * gets flipped */ + msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum; + msg->ibm_cksum = 0; + if (msg_cksum != 0 && + msg_cksum != kibnal_cksum(msg, msg_nob)) { + CERROR("Bad checksum\n"); + return -EPROTO; + } + msg->ibm_cksum = msg_cksum; + + if (flip) { + /* leave magic unflipped as a clue to peer endianness */ + __swab16s(&msg->ibm_version); + CLASSERT (sizeof(msg->ibm_type) == 1); + CLASSERT (sizeof(msg->ibm_credits) == 1); + msg->ibm_nob = msg_nob; + __swab64s(&msg->ibm_srcnid); + __swab64s(&msg->ibm_srcstamp); + __swab64s(&msg->ibm_dstnid); + __swab64s(&msg->ibm_dststamp); + } + + if (msg->ibm_srcnid == PTL_NID_ANY) { + CERROR("Bad src nid: "LPX64"\n", msg->ibm_srcnid); + return -EPROTO; + } - ret = request->status; + switch (msg->ibm_type) { + default: + CERROR("Unknown message type %x\n", msg->ibm_type); + return -EPROTO; + + case IBNAL_MSG_NOOP: + break; - if (ret != 0) { - CERROR ("Error %d in advertising operation %d for NID "LPX64"\n", - ret, op, kibnal_data.kib_nid); + case IBNAL_MSG_IMMEDIATE: + if (msg_nob < offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0])) { + CERROR("Short IMMEDIATE: %d(%d)\n", msg_nob, + (int)offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0])); + return -EPROTO; + } + break; + + case IBNAL_MSG_PUT_REQ: + /* CAVEAT EMPTOR! 
We don't actually put ibprm_rd on the wire; + * it's just there to remember the source buffers while we wait + * for the PUT_ACK */ + if (msg_nob < offsetof(kib_msg_t, ibm_u.putreq.ibprm_rd)) { + CERROR("Short PUT_REQ: %d(%d)\n", msg_nob, + (int)(hdr_size + sizeof(msg->ibm_u.putreq))); + return -EPROTO; + } + break; + + case IBNAL_MSG_PUT_ACK: + if (msg_nob < offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[0])) { + CERROR("Short PUT_ACK: %d(%d)\n", msg_nob, + (int)offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[0])); + return -EPROTO; + } + + if (flip) { + __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key); + __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nfrag); + } + + n = msg->ibm_u.putack.ibpam_rd.rd_nfrag; + if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) { + CERROR("Bad PUT_ACK nfrags: %d, should be 0 < n <= %d\n", + n, IBNAL_MAX_RDMA_FRAGS); + return -EPROTO; + } + + if (msg_nob < offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n])) { + CERROR("Short PUT_ACK: %d(%d)\n", msg_nob, + (int)offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n])); + return -EPROTO; + } + + if (flip) + for (i = 0; i < n; i++) { + __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_nob); + __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr_lo); + __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr_hi); } - - free_sa_request(request); + break; + + case IBNAL_MSG_GET_REQ: + if (msg_nob < hdr_size + sizeof(msg->ibm_u.get)) { + CERROR("Short GET_REQ: %d(%d)\n", msg_nob, + (int)(hdr_size + sizeof(msg->ibm_u.get))); + return -EPROTO; + } + if (flip) { + __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key); + __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nfrag); } - } - return ret; + n = msg->ibm_u.get.ibgm_rd.rd_nfrag; + if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) { + CERROR("Bad GET_REQ nfrags: %d, should be 0 < n <= %d\n", + n, IBNAL_MAX_RDMA_FRAGS); + return -EPROTO; + } + + if (msg_nob < offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n])) { + CERROR("Short GET_REQ: %d(%d)\n", msg_nob, + (int)offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n])); + return -EPROTO; + } + + if (flip) + for (i = 0; i < msg->ibm_u.get.ibgm_rd.rd_nfrag; i++) { + __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_nob); + __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr_lo); + __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr_hi); + } + break; + + case IBNAL_MSG_PUT_NAK: + case IBNAL_MSG_PUT_DONE: + case IBNAL_MSG_GET_DONE: + if (msg_nob < hdr_size + sizeof(msg->ibm_u.completion)) { + CERROR("Short RDMA completion: %d(%d)\n", msg_nob, + (int)(hdr_size + sizeof(msg->ibm_u.completion))); + return -EPROTO; + } + if (flip) + __swab32s(&msg->ibm_u.completion.ibcm_status); + break; + + case IBNAL_MSG_CONNREQ: + case IBNAL_MSG_CONNACK: + if (msg_nob < hdr_size + sizeof(msg->ibm_u.connparams)) { + CERROR("Short connreq/ack: %d(%d)\n", msg_nob, + (int)(hdr_size + sizeof(msg->ibm_u.connparams))); + return -EPROTO; + } + if (flip) { + __swab32s(&msg->ibm_u.connparams.ibcp_queue_depth); + __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size); + __swab32s(&msg->ibm_u.connparams.ibcp_max_frags); + } + break; + } + return 0; } -static int +int kibnal_set_mynid(ptl_nid_t nid) { - struct timeval tv; - lib_ni_t *ni = &kibnal_lib.libnal_ni; - int rc; - vv_return_t retval; + static cm_listen_data_t info; /* protected by kib_nid_mutex */ + + lib_ni_t *ni = &kibnal_lib.libnal_ni; + int rc; + cm_return_t cmrc; CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n", nid, ni->ni_pid.nid); - do_gettimeofday(&tv); - down (&kibnal_data.kib_nid_mutex); - if (nid == 
kibnal_data.kib_nid) { + if (nid == ni->ni_pid.nid) { /* no change of NID */ up (&kibnal_data.kib_nid_mutex); return (0); } - CDEBUG(D_NET, "NID "LPX64"("LPX64")\n", - kibnal_data.kib_nid, nid); - - /* Unsubscribes the current NID */ - if (kibnal_data.kib_nid != PTL_NID_ANY) { + CDEBUG(D_NET, "NID "LPX64"("LPX64")\n", ni->ni_pid.nid, nid); - rc = kibnal_advertize_op(kibnal_data.kib_nid, SUBN_ADM_DELETE, NULL, NULL); + if (kibnal_data.kib_listen_handle != NULL) { + cmrc = cm_cancel(kibnal_data.kib_listen_handle); + if (cmrc != cm_stat_success) + CERROR ("Error %d stopping listener\n", cmrc); - if (rc) { - CERROR("Error %d unadvertising NID "LPX64"\n", - rc, kibnal_data.kib_nid); - } - } + kibnal_pause(HZ/10); /* ensure no more callbacks */ - kibnal_data.kib_nid = ni->ni_pid.nid = nid; - kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; + cmrc = cm_destroy_cep(kibnal_data.kib_listen_handle); + if (cmrc != vv_return_ok) + CERROR ("Error %d destroying CEP\n", cmrc); - /* Destroys the current endpoint, if any. */ - if (kibnal_data.kib_cep) { - retval = cm_cancel(kibnal_data.kib_cep); - if (retval) - CERROR ("Error %d stopping listener\n", retval); - - retval = cm_destroy_cep(kibnal_data.kib_cep); - if (retval) - CERROR ("Error %d destroying CEP\n", retval); - - kibnal_data.kib_cep = NULL; + kibnal_data.kib_listen_handle = NULL; } - + + /* Change NID. NB queued passive connection requests (if any) will be + * rejected with an incorrect destination NID */ + ni->ni_pid.nid = nid; + kibnal_data.kib_incarnation++; + mb(); + /* Delete all existing peers and their connections after new * NID/incarnation set to ensure no old connections in our brave * new world. */ kibnal_del_peer (PTL_NID_ANY, 0); - if (kibnal_data.kib_nid == PTL_NID_ANY) { - /* No new NID to install. The driver is shuting down. 
*/ - up (&kibnal_data.kib_nid_mutex); - return (0); - } - - /* remove any previous advert (crashed node etc) */ - kibnal_advertize_op(kibnal_data.kib_nid, SUBN_ADM_DELETE, NULL, NULL); - - kibnal_data.kib_cep = cm_create_cep(cm_cep_transp_rc); - if (kibnal_data.kib_cep == NULL) { - CERROR ("Can't create CEP\n"); - rc = -ENOMEM; - } else { - cm_return_t cmret; - cm_listen_data_t info; + if (ni->ni_pid.nid != PTL_NID_ANY) { /* got a new NID to install */ + kibnal_data.kib_listen_handle = + cm_create_cep(cm_cep_transp_rc); + if (kibnal_data.kib_listen_handle == NULL) { + CERROR ("Can't create listen CEP\n"); + rc = -ENOMEM; + goto failed_0; + } - CDEBUG(D_NET, "Created CEP %p for listening\n", kibnal_data.kib_cep); + CDEBUG(D_NET, "Created CEP %p for listening\n", + kibnal_data.kib_listen_handle); memset(&info, 0, sizeof(info)); - info.listen_addr.end_pt.sid = kibnal_data.kib_service_id; + info.listen_addr.end_pt.sid = kibnal_data.kib_svc_id; - cmret = cm_listen(kibnal_data.kib_cep, &info, - kibnal_listen_callback, NULL); - if (cmret) { - CERROR ("cm_listen error: %d\n", cmret); + cmrc = cm_listen(kibnal_data.kib_listen_handle, &info, + kibnal_listen_callback, NULL); + if (cmrc != 0) { + CERROR ("cm_listen error: %d\n", cmrc); rc = -EINVAL; - } else { - rc = 0; + goto failed_1; } } - - if (rc == 0) { - rc = kibnal_advertize_op(kibnal_data.kib_nid, SUBN_ADM_SET, NULL, NULL); - if (rc == 0) { -#ifdef IBNAL_CHECK_ADVERT - kibnal_advertize_op(kibnal_data.kib_nid, SUBN_ADM_GET, NULL, NULL); -#endif - up (&kibnal_data.kib_nid_mutex); - return (0); - } - - retval = cm_cancel (kibnal_data.kib_cep); - if (retval) - CERROR("cm_cancel failed: %d\n", retval); - retval = cm_destroy_cep (kibnal_data.kib_cep); - if (retval) - CERROR("cm_destroy_cep failed: %d\n", retval); - - /* remove any peers that sprung up while I failed to - * advertise myself */ - kibnal_del_peer (PTL_NID_ANY, 0); - } + up (&kibnal_data.kib_nid_mutex); + return (0); - kibnal_data.kib_nid = PTL_NID_ANY; + failed_1: + cmrc = cm_destroy_cep(kibnal_data.kib_listen_handle); + LASSERT (cmrc == cm_stat_success); + kibnal_data.kib_listen_handle = NULL; + failed_0: + ni->ni_pid.nid = PTL_NID_ANY; + kibnal_data.kib_incarnation++; + mb(); + kibnal_del_peer (PTL_NID_ANY, 0); up (&kibnal_data.kib_nid_mutex); - return (rc); + return rc; } kib_peer_t * @@ -340,7 +399,12 @@ kibnal_create_peer (ptl_nid_t nid) peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL; atomic_inc (&kibnal_data.kib_npeers); - return (peer); + if (atomic_read(&kibnal_data.kib_npeers) <= IBNAL_CONCURRENT_PEERS) + return peer; + + CERROR("Too many peers: CQ will overflow\n"); + kibnal_peer_decref(peer); + return NULL; } void @@ -390,21 +454,6 @@ kibnal_find_peer_locked (ptl_nid_t nid) return (NULL); } -kib_peer_t * -kibnal_get_peer (ptl_nid_t nid) -{ - kib_peer_t *peer; - unsigned long flags; - - read_lock_irqsave(&kibnal_data.kib_global_lock, flags); - peer = kibnal_find_peer_locked (nid); - if (peer != NULL) /* +1 ref for caller? 
*/ - kib_peer_addref(peer); - read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - - return (peer); -} - void kibnal_unlink_peer_locked (kib_peer_t *peer) { @@ -414,16 +463,17 @@ kibnal_unlink_peer_locked (kib_peer_t *peer) LASSERT (kibnal_peer_active(peer)); list_del_init (&peer->ibp_list); /* lose peerlist's ref */ - kib_peer_decref(peer); + kibnal_peer_decref(peer); } -static int -kibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep) +int +kibnal_get_peer_info (int index, ptl_nid_t *nidp, __u32 *ipp, + int *persistencep) { kib_peer_t *peer; struct list_head *ptmp; - unsigned long flags; int i; + unsigned long flags; read_lock_irqsave(&kibnal_data.kib_global_lock, flags); @@ -440,6 +490,7 @@ kibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep) continue; *nidp = peer->ibp_nid; + *ipp = peer->ibp_ip; *persistencep = peer->ibp_persistence; read_unlock_irqrestore(&kibnal_data.kib_global_lock, @@ -452,12 +503,14 @@ kibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep) return (-ENOENT); } -static int -kibnal_add_persistent_peer (ptl_nid_t nid) +int +kibnal_add_persistent_peer (ptl_nid_t nid, __u32 ip) { - unsigned long flags; kib_peer_t *peer; kib_peer_t *peer2; + unsigned long flags; + + CDEBUG(D_NET, LPX64"@%08x\n", nid, ip); if (nid == PTL_NID_ANY) return (-EINVAL); @@ -466,11 +519,11 @@ kibnal_add_persistent_peer (ptl_nid_t nid) if (peer == NULL) return (-ENOMEM); - write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + write_lock_irqsave(&kibnal_data.kib_global_lock, flags); peer2 = kibnal_find_peer_locked (nid); if (peer2 != NULL) { - kib_peer_decref (peer); + kibnal_peer_decref (peer); peer = peer2; } else { /* peer table takes existing ref on peer */ @@ -478,13 +531,14 @@ kibnal_add_persistent_peer (ptl_nid_t nid) kibnal_nid2peerlist (nid)); } + peer->ibp_ip = ip; peer->ibp_persistence++; - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); return (0); } -static void +void kibnal_del_peer_locked (kib_peer_t *peer, int single_share) { struct list_head *ctmp; @@ -517,16 +571,16 @@ kibnal_del_peer_locked (kib_peer_t *peer, int single_share) int kibnal_del_peer (ptl_nid_t nid, int single_share) { - unsigned long flags; struct list_head *ptmp; struct list_head *pnxt; kib_peer_t *peer; int lo; int hi; int i; + unsigned long flags; int rc = -ENOENT; - write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + write_lock_irqsave(&kibnal_data.kib_global_lock, flags); if (nid != PTL_NID_ANY) lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers; @@ -553,20 +607,19 @@ kibnal_del_peer (ptl_nid_t nid, int single_share) } } out: - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); return (rc); } -static kib_conn_t * +kib_conn_t * kibnal_get_conn_by_idx (int index) { kib_peer_t *peer; struct list_head *ptmp; kib_conn_t *conn; struct list_head *ctmp; - unsigned long flags; int i; + unsigned long flags; read_lock_irqsave(&kibnal_data.kib_global_lock, flags); @@ -583,10 +636,7 @@ kibnal_get_conn_by_idx (int index) continue; conn = list_entry (ctmp, kib_conn_t, ibc_list); - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); + kibnal_conn_addref(conn); read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); return (conn); @@ -598,19 +648,124 @@ 
kibnal_get_conn_by_idx (int index) return (NULL); } +int +kibnal_set_qp_state (kib_conn_t *conn, vv_qp_state_t new_state) +{ + static vv_qp_attr_t attr; + + kib_connvars_t *cv = conn->ibc_connvars; + vv_return_t vvrc; + + /* Only called by connd => static OK */ + LASSERT (!in_interrupt()); + LASSERT (current == kibnal_data.kib_connd); + + memset(&attr, 0, sizeof(attr)); + + switch (new_state) { + default: + LBUG(); + + case vv_qp_state_init: { + struct vv_qp_modify_init_st *init = &attr.modify.params.init; + + init->p_key_indx = cv->cv_pkey_index; + init->phy_port_num = cv->cv_port; + init->q_key = IBNAL_QKEY; /* XXX but VV_QP_AT_Q_KEY not set! */ + init->access_control = vv_acc_r_mem_read | + vv_acc_r_mem_write; /* XXX vv_acc_l_mem_write ? */ + + attr.modify.vv_qp_attr_mask = VV_QP_AT_P_KEY_IX | + VV_QP_AT_PHY_PORT_NUM | + VV_QP_AT_ACCESS_CON_F; + break; + } + case vv_qp_state_rtr: { + struct vv_qp_modify_rtr_st *rtr = &attr.modify.params.rtr; + vv_add_vec_t *av = &rtr->remote_add_vec; + + av->dlid = cv->cv_path.dlid; + av->grh_flag = (!IBNAL_LOCAL_SUB); + av->max_static_rate = IBNAL_R_2_STATIC_RATE(cv->cv_path.rate); + av->service_level = cv->cv_path.sl; + av->source_path_bit = IBNAL_SOURCE_PATH_BIT; + av->pmtu = cv->cv_path.mtu; + av->rnr_retry_count = cv->cv_rnr_count; + av->global_dest.traffic_class = cv->cv_path.traffic_class; + av->global_dest.hope_limit = cv->cv_path.hop_limut; + av->global_dest.flow_lable = cv->cv_path.flow_label; + av->global_dest.s_gid_index = cv->cv_sgid_index; + // XXX other av fields zero? + + rtr->destanation_qp = cv->cv_remote_qpn; + rtr->receive_psn = cv->cv_rxpsn; + rtr->responder_rdma_r_atom_num = IBNAL_OUS_DST_RD; + + // XXX ? rtr->opt_min_rnr_nak_timer = 16; + + + // XXX sdp sets VV_QP_AT_OP_F but no actual optional options + attr.modify.vv_qp_attr_mask = VV_QP_AT_ADD_VEC | + VV_QP_AT_DEST_QP | + VV_QP_AT_R_PSN | + VV_QP_AT_MIN_RNR_NAK_T | + VV_QP_AT_RESP_RDMA_ATOM_OUT_NUM | + VV_QP_AT_OP_F; + break; + } + case vv_qp_state_rts: { + struct vv_qp_modify_rts_st *rts = &attr.modify.params.rts; + + rts->send_psn = cv->cv_txpsn; + rts->local_ack_timeout = IBNAL_LOCAL_ACK_TIMEOUT; + rts->retry_num = IBNAL_RETRY_CNT; + rts->rnr_num = IBNAL_RNR_CNT; + rts->dest_out_rdma_r_atom_num = IBNAL_OUS_DST_RD; + + attr.modify.vv_qp_attr_mask = VV_QP_AT_S_PSN | + VV_QP_AT_L_ACK_T | + VV_QP_AT_RETRY_NUM | + VV_QP_AT_RNR_NUM | + VV_QP_AT_DEST_RDMA_ATOM_OUT_NUM; + break; + } + case vv_qp_state_error: + case vv_qp_state_reset: + attr.modify.vv_qp_attr_mask = 0; + break; + } + + attr.modify.qp_modify_into_state = new_state; + attr.modify.vv_qp_attr_mask |= VV_QP_AT_STATE; + + vvrc = vv_qp_modify(kibnal_data.kib_hca, conn->ibc_qp, &attr, NULL); + if (vvrc != vv_return_ok) { + CERROR("Can't modify qp -> "LPX64" state to %d: %d\n", + conn->ibc_peer->ibp_nid, new_state, vvrc); + return -EIO; + } + + return 0; +} + kib_conn_t * -kibnal_create_conn (void) +kibnal_create_conn (cm_cep_handle_t cep) { - kib_conn_t *conn; - int i; - __u64 vaddr = 0; - __u64 vaddr_base; - int page_offset; - int ipage; - vv_qp_attr_t qp_attr; - vv_return_t retval; - int rc; - void *qp_context; + kib_conn_t *conn; + int i; + __u64 vaddr = 0; + __u64 vaddr_base; + int page_offset; + int ipage; + vv_return_t vvrc; + int rc; + + static vv_qp_attr_t reqattr; + static vv_qp_attr_t rspattr; + + /* Only the connd creates conns => single threaded */ + LASSERT(!in_interrupt()); + LASSERT(current == kibnal_data.kib_connd); PORTAL_ALLOC(conn, sizeof (*conn)); if (conn == NULL) { @@ -621,6 +776,7 @@ kibnal_create_conn 
(void) /* zero flags, NULL pointers etc... */ memset (conn, 0, sizeof (*conn)); + INIT_LIST_HEAD (&conn->ibc_early_rxs); INIT_LIST_HEAD (&conn->ibc_tx_queue); INIT_LIST_HEAD (&conn->ibc_active_txs); spin_lock_init (&conn->ibc_lock); @@ -628,6 +784,18 @@ kibnal_create_conn (void) atomic_inc (&kibnal_data.kib_nconns); /* well not really, but I call destroy() on failure, which decrements */ + conn->ibc_cep = cep; + + PORTAL_ALLOC(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); + if (conn->ibc_connvars == NULL) { + CERROR("Can't allocate in-progress connection state\n"); + goto failed; + } + memset (conn->ibc_connvars, 0, sizeof(*conn->ibc_connvars)); + /* Random seed for QP sequence number */ + get_random_bytes(&conn->ibc_connvars->cv_rxpsn, + sizeof(conn->ibc_connvars->cv_rxpsn)); + PORTAL_ALLOC(conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t)); if (conn->ibc_rxs == NULL) { CERROR("Cannot allocate RX buffers\n"); @@ -649,26 +817,27 @@ kibnal_create_conn (void) rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset); - if (kibnal_whole_mem()) { - void *newaddr; - vv_mem_reg_h_t mem_h; - vv_r_key_t r_key; +#if IBNAL_WHOLE_MEM + { + vv_mem_reg_h_t mem_h; + vv_r_key_t r_key; /* Voltaire stack already registers the whole * memory, so use that API. */ - retval = vv_get_gen_mr_attrib(kibnal_data.kib_hca, - rx->rx_msg, - IBNAL_MSG_SIZE, - &mem_h, - &rx->l_key, - &r_key); - if (retval) { - CERROR("vv_get_gen_mr_attrib failed: %d", retval); - /* TODO: free pages? */ - goto failed; - } + vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca, + rx->rx_msg, + IBNAL_MSG_SIZE, + &mem_h, + &rx->rx_lkey, + &r_key); + LASSERT (vvrc == vv_return_ok); } - +#else + rx->rx_vaddr = vaddr; +#endif + CDEBUG(D_NET, "Rx[%d] %p->%p[%x:"LPX64"]\n", i, rx, + rx->rx_msg, KIBNAL_RX_LKEY(rx), KIBNAL_RX_VADDR(rx)); + vaddr += IBNAL_MSG_SIZE; LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES); @@ -682,47 +851,40 @@ kibnal_create_conn (void) } } - qp_attr = (vv_qp_attr_t) { - .create.qp_type = vv_qp_type_r_conn, - .create.cq_send_h = kibnal_data.kib_cq, - .create.cq_receive_h = kibnal_data.kib_cq, - .create.send_max_outstand_wr = IBNAL_TX_MAX_SG * - IBNAL_MSG_QUEUE_SIZE, - .create.receive_max_outstand_wr = IBNAL_MSG_QUEUE_SIZE, - .create.max_scatgat_per_send_wr = 1, - .create.max_scatgat_per_receive_wr = 1, - .create.signaling_type = vv_selectable_signaling, /* TODO: correct? 
*/ - .create.pd_h = kibnal_data.kib_pd, - .create.recv_solicited_events = vv_signal_all, - }; - retval = vv_qp_create(kibnal_data.kib_hca, &qp_attr, NULL, - &conn->ibc_qp, &conn->ibc_qp_attrs); - if (retval != 0) { - CERROR ("Failed to create queue pair: %d\n", retval); + memset(&reqattr, 0, sizeof(reqattr)); + + reqattr.create.qp_type = vv_qp_type_r_conn; + reqattr.create.cq_send_h = kibnal_data.kib_cq; + reqattr.create.cq_receive_h = kibnal_data.kib_cq; + reqattr.create.send_max_outstand_wr = (1 + IBNAL_MAX_RDMA_FRAGS) * + IBNAL_MSG_QUEUE_SIZE; + reqattr.create.receive_max_outstand_wr = IBNAL_RX_MSGS; + reqattr.create.max_scatgat_per_send_wr = 1; + reqattr.create.max_scatgat_per_receive_wr = 1; + reqattr.create.signaling_type = vv_selectable_signaling; + reqattr.create.pd_h = kibnal_data.kib_pd; + reqattr.create.recv_solicited_events = vv_selectable_signaling; // vv_signal_all; + + vvrc = vv_qp_create(kibnal_data.kib_hca, &reqattr, NULL, + &conn->ibc_qp, &rspattr); + if (vvrc != vv_return_ok) { + CERROR ("Failed to create queue pair: %d\n", vvrc); goto failed; } /* Mark QP created */ - conn->ibc_state = IBNAL_CONN_INIT_QP; - - qp_attr = (vv_qp_attr_t) { - .modify.qp_modify_into_state = vv_qp_state_init, - .modify.vv_qp_attr_mask = VV_QP_AT_STATE | VV_QP_AT_PHY_PORT_NUM | VV_QP_AT_P_KEY_IX | VV_QP_AT_ACCESS_CON_F, - .modify.qp_type = vv_qp_type_r_conn, - - .modify.params.init.p_key_indx = 0, - .modify.params.init.phy_port_num = kibnal_data.kib_port, - .modify.params.init.access_control = vv_acc_r_mem_write | vv_acc_r_mem_read, - }; - retval = vv_qp_modify(kibnal_data.kib_hca, conn->ibc_qp, &qp_attr, &conn->ibc_qp_attrs); - if (retval != 0) { - CERROR ("Failed to modify queue pair: %d\n", retval); - goto failed; - } - - retval = vv_qp_query(kibnal_data.kib_hca, conn->ibc_qp, &qp_context, &conn->ibc_qp_attrs); - if (retval) { - CERROR ("Failed to query queue pair: %d\n", retval); + conn->ibc_state = IBNAL_CONN_INIT; + conn->ibc_connvars->cv_local_qpn = rspattr.create_return.qp_num; + + if (rspattr.create_return.receive_max_outstand_wr < + IBNAL_MSG_QUEUE_SIZE || + rspattr.create_return.send_max_outstand_wr < + (1 + IBNAL_MAX_RDMA_FRAGS) * IBNAL_MSG_QUEUE_SIZE) { + CERROR("Insufficient rx/tx work items: wanted %d/%d got %d/%d\n", + IBNAL_MSG_QUEUE_SIZE, + (1 + IBNAL_MAX_RDMA_FRAGS) * IBNAL_MSG_QUEUE_SIZE, + rspattr.create_return.receive_max_outstand_wr, + rspattr.create_return.send_max_outstand_wr); goto failed; } @@ -738,91 +900,63 @@ kibnal_create_conn (void) void kibnal_destroy_conn (kib_conn_t *conn) { - vv_return_t retval; + vv_return_t vvrc; + + /* Only the connd does this (i.e. 
single threaded) */ + LASSERT (!in_interrupt()); + LASSERT (current == kibnal_data.kib_connd); CDEBUG (D_NET, "connection %p\n", conn); LASSERT (atomic_read (&conn->ibc_refcount) == 0); + LASSERT (list_empty(&conn->ibc_early_rxs)); LASSERT (list_empty(&conn->ibc_tx_queue)); LASSERT (list_empty(&conn->ibc_active_txs)); LASSERT (conn->ibc_nsends_posted == 0); - LASSERT (conn->ibc_connreq == NULL); switch (conn->ibc_state) { + default: + /* conn must be completely disengaged from the network */ + LBUG(); + case IBNAL_CONN_DISCONNECTED: - /* called after connection sequence initiated */ + /* connvars should have been freed already */ + LASSERT (conn->ibc_connvars == NULL); /* fall through */ - case IBNAL_CONN_INIT_QP: - /* _destroy includes an implicit Reset of the QP which - * discards posted work */ - retval = vv_qp_destroy(kibnal_data.kib_hca, conn->ibc_qp); - if (retval) - CERROR("Can't destroy QP: %d\n", retval); + case IBNAL_CONN_INIT: + kibnal_set_qp_state(conn, vv_qp_state_reset); + vvrc = vv_qp_destroy(kibnal_data.kib_hca, conn->ibc_qp); + if (vvrc != vv_return_ok) + CERROR("Can't destroy QP: %d\n", vvrc); /* fall through */ case IBNAL_CONN_INIT_NOTHING: break; - - default: - LASSERT (0); - } - - if (conn->ibc_cep != NULL) { - retval = cm_destroy_cep(conn->ibc_cep); - if (retval) - CERROR("Can't destroy CEP %p: %d\n", conn->ibc_cep, - retval); } if (conn->ibc_rx_pages != NULL) kibnal_free_pages(conn->ibc_rx_pages); - + if (conn->ibc_rxs != NULL) PORTAL_FREE(conn->ibc_rxs, IBNAL_RX_MSGS * sizeof(kib_rx_t)); + if (conn->ibc_connvars != NULL) + PORTAL_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); + if (conn->ibc_peer != NULL) - kib_peer_decref(conn->ibc_peer); + kibnal_peer_decref(conn->ibc_peer); + + vvrc = cm_destroy_cep(conn->ibc_cep); + LASSERT (vvrc == vv_return_ok); PORTAL_FREE(conn, sizeof (*conn)); atomic_dec(&kibnal_data.kib_nconns); - - if (atomic_read (&kibnal_data.kib_nconns) == 0 && - kibnal_data.kib_shutdown) { - /* I just nuked the last connection on shutdown; wake up - * everyone so they can exit. 
*/ - wake_up_all(&kibnal_data.kib_sched_waitq); - wake_up_all(&kibnal_data.kib_connd_waitq); - } } -void -kibnal_put_conn (kib_conn_t *conn) -{ - unsigned long flags; - - CDEBUG (D_NET, "putting conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - - LASSERT (atomic_read (&conn->ibc_refcount) > 0); - if (!atomic_dec_and_test (&conn->ibc_refcount)) - return; - - /* must disconnect before dropping the final ref */ - LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECTED); - - spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); - - list_add (&conn->ibc_list, &kibnal_data.kib_connd_conns); - wake_up (&kibnal_data.kib_connd_waitq); - - spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); -} - -static int +int kibnal_close_peer_conns_locked (kib_peer_t *peer, int why) { kib_conn_t *conn; @@ -864,19 +998,19 @@ kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation) return (count); } -static int +int kibnal_close_matching_conns (ptl_nid_t nid) { - unsigned long flags; kib_peer_t *peer; struct list_head *ptmp; struct list_head *pnxt; int lo; int hi; int i; + unsigned long flags; int count = 0; - write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + write_lock_irqsave(&kibnal_data.kib_global_lock, flags); if (nid != PTL_NID_ANY) lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers; @@ -900,7 +1034,7 @@ kibnal_close_matching_conns (ptl_nid_t nid) } } - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); /* wildcards always succeed */ if (nid == PTL_NID_ANY) @@ -909,31 +1043,32 @@ kibnal_close_matching_conns (ptl_nid_t nid) return (count == 0 ? -ENOENT : 0); } -static int +int kibnal_cmd(struct portals_cfg *pcfg, void * private) { int rc = -EINVAL; - ENTRY; LASSERT (pcfg != NULL); switch(pcfg->pcfg_command) { case NAL_CMD_GET_PEER: { ptl_nid_t nid = 0; + __u32 ip = 0; int share_count = 0; rc = kibnal_get_peer_info(pcfg->pcfg_count, - &nid, &share_count); + &nid, &ip, &share_count); pcfg->pcfg_nid = nid; pcfg->pcfg_size = 0; - pcfg->pcfg_id = 0; - pcfg->pcfg_misc = 0; + pcfg->pcfg_id = ip; + pcfg->pcfg_misc = IBNAL_SERVICE_NUMBER; /* port */ pcfg->pcfg_count = 0; pcfg->pcfg_wait = share_count; break; } case NAL_CMD_ADD_PEER: { - rc = kibnal_add_persistent_peer (pcfg->pcfg_nid); + rc = kibnal_add_persistent_peer (pcfg->pcfg_nid, + pcfg->pcfg_id); /* IP */ break; } case NAL_CMD_DEL_PEER: { @@ -953,7 +1088,7 @@ kibnal_cmd(struct portals_cfg *pcfg, void * private) pcfg->pcfg_id = 0; pcfg->pcfg_misc = 0; pcfg->pcfg_flags = 0; - kibnal_put_conn (conn); + kibnal_conn_decref(conn); } break; } @@ -970,20 +1105,21 @@ kibnal_cmd(struct portals_cfg *pcfg, void * private) } } - RETURN(rc); + return rc; } void kibnal_free_pages (kib_pages_t *p) { - int npages = p->ibp_npages; - vv_return_t retval; - int i; + int npages = p->ibp_npages; + vv_return_t vvrc; + int i; if (p->ibp_mapped) { - retval = vv_mem_region_destroy(kibnal_data.kib_hca, p->ibp_handle); - if (retval != 0) - CERROR ("Deregister error: %d\n", retval); + vvrc = vv_mem_region_destroy(kibnal_data.kib_hca, + p->ibp_handle); + if (vvrc != vv_return_ok) + CERROR ("Deregister error: %d\n", vvrc); } for (i = 0; i < npages; i++) @@ -997,10 +1133,13 @@ int kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write) { kib_pages_t *p; - vv_phy_list_t phys_pages; - vv_phy_buf_t *phys_buf; int i; - vv_return_t retval; +#if !IBNAL_WHOLE_MEM + vv_phy_list_t vv_phys; + 
vv_phy_buf_t *phys_pages; + vv_return_t vvrc; + vv_access_con_bit_mask_t access; +#endif PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages])); if (p == NULL) { @@ -1020,57 +1159,124 @@ kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write) } } - if (kibnal_whole_mem()) - goto out; - - PORTAL_ALLOC(phys_buf, npages * sizeof(vv_phy_buf_t)); - if (phys_buf == NULL) { - CERROR ("Can't allocate phys_buf for %d pages\n", npages); - /* XXX free ibp_pages? */ +#if !IBNAL_WHOLE_MEM + PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages)); + if (phys_pages == NULL) { + CERROR ("Can't allocate physarray for %d pages\n", npages); kibnal_free_pages(p); return (-ENOMEM); } - phys_pages.number_of_buff = npages; - phys_pages.phy_list = phys_buf; + vv_phys.number_of_buff = npages; + vv_phys.phy_list = phys_pages; - /* if we were using the _contig_ registration variant we would have - * an array of PhysAddr/Length pairs, but the discontiguous variant - * just takes the PhysAddr */ for (i = 0; i < npages; i++) { - phys_buf[i].start = kibnal_page2phys(p->ibp_pages[i]); - phys_buf[i].size = PAGE_SIZE; - } - - retval = vv_phy_mem_region_register(kibnal_data.kib_hca, - &phys_pages, - 0, /* requested vaddr */ - npages * PAGE_SIZE, - 0, /* offset */ - kibnal_data.kib_pd, - vv_acc_l_mem_write | vv_acc_r_mem_write | vv_acc_r_mem_read | vv_acc_mem_bind, /* TODO: translated as-is, but seems incorrect or too much */ - &p->ibp_handle, &p->ibp_vaddr, - &p->ibp_lkey, &p->ibp_rkey); + phys_pages[i].size = PAGE_SIZE; + phys_pages[i].start = + kibnal_page2phys(p->ibp_pages[i]); + } + + VV_ACCESS_CONTROL_MASK_SET_ALL(access); + + vvrc = vv_phy_mem_region_register(kibnal_data.kib_hca, + &vv_phys, + 0, /* requested vaddr */ + npages * PAGE_SIZE, 0, /* offset */ + kibnal_data.kib_pd, + access, + &p->ibp_handle, + &p->ibp_vaddr, + &p->ibp_lkey, + &p->ibp_rkey); - PORTAL_FREE(phys_buf, npages * sizeof(vv_phy_buf_t)); + PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages)); - if (retval) { - CERROR ("Error %d mapping %d pages\n", retval, npages); + if (vvrc != vv_return_ok) { + CERROR ("Error %d mapping %d pages\n", vvrc, npages); kibnal_free_pages(p); - return (-ENOMEM); + return (-EFAULT); } CDEBUG(D_NET, "registered %d pages; handle: %x vaddr "LPX64" " - "lkey %x rkey %x\n", npages, p->ibp_handle, - p->ibp_vaddr, p->ibp_lkey, p->ibp_rkey); + "lkey %x rkey %x\n", npages, p->ibp_handle, + p->ibp_vaddr, p->ibp_lkey, p->ibp_rkey); p->ibp_mapped = 1; -out: +#endif *pp = p; return (0); } -static int +int +kibnal_alloc_tx_descs (void) +{ + int i; + + PORTAL_ALLOC (kibnal_data.kib_tx_descs, + IBNAL_TX_MSGS * sizeof(kib_tx_t)); + if (kibnal_data.kib_tx_descs == NULL) + return -ENOMEM; + + memset(kibnal_data.kib_tx_descs, 0, + IBNAL_TX_MSGS * sizeof(kib_tx_t)); + + for (i = 0; i < IBNAL_TX_MSGS; i++) { + kib_tx_t *tx = &kibnal_data.kib_tx_descs[i]; + + PORTAL_ALLOC(tx->tx_wrq, + (1 + IBNAL_MAX_RDMA_FRAGS) * + sizeof(*tx->tx_wrq)); + if (tx->tx_wrq == NULL) + return -ENOMEM; + + PORTAL_ALLOC(tx->tx_gl, + (1 + IBNAL_MAX_RDMA_FRAGS) * + sizeof(*tx->tx_gl)); + if (tx->tx_gl == NULL) + return -ENOMEM; + + PORTAL_ALLOC(tx->tx_rd, + offsetof(kib_rdma_desc_t, + rd_frags[IBNAL_MAX_RDMA_FRAGS])); + if (tx->tx_rd == NULL) + return -ENOMEM; + } + + return 0; +} + +void +kibnal_free_tx_descs (void) +{ + int i; + + if (kibnal_data.kib_tx_descs == NULL) + return; + + for (i = 0; i < IBNAL_TX_MSGS; i++) { + kib_tx_t *tx = &kibnal_data.kib_tx_descs[i]; + + if (tx->tx_wrq != NULL) + PORTAL_FREE(tx->tx_wrq, + (1 + IBNAL_MAX_RDMA_FRAGS) * + 
sizeof(*tx->tx_wrq)); + + if (tx->tx_gl != NULL) + PORTAL_FREE(tx->tx_gl, + (1 + IBNAL_MAX_RDMA_FRAGS) * + sizeof(*tx->tx_gl)); + + if (tx->tx_rd != NULL) + PORTAL_FREE(tx->tx_rd, + offsetof(kib_rdma_desc_t, + rd_frags[IBNAL_MAX_RDMA_FRAGS])); + } + + PORTAL_FREE(kibnal_data.kib_tx_descs, + IBNAL_TX_MSGS * sizeof(kib_tx_t)); +} + +int kibnal_setup_tx_descs (void) { int ipage = 0; @@ -1083,10 +1289,10 @@ kibnal_setup_tx_descs (void) int rc; /* pre-mapped messages are not bigger than 1 page */ - LASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE); + CLASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE); /* No fancy arithmetic when we do the buffer calculations */ - LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0); + CLASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0); rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, IBNAL_TX_MSG_PAGES, 0); @@ -1100,35 +1306,32 @@ kibnal_setup_tx_descs (void) page = kibnal_data.kib_tx_pages->ibp_pages[ipage]; tx = &kibnal_data.kib_tx_descs[i]; - memset (tx, 0, sizeof(*tx)); /* zero flags etc */ - tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset); - - if (kibnal_whole_mem()) { - void *newaddr; - vv_mem_reg_h_t mem_h; - vv_return_t retval; +#if IBNAL_WHOLE_MEM + { + vv_mem_reg_h_t mem_h; + vv_r_key_t rkey; + vv_return_t vvrc; /* Voltaire stack already registers the whole * memory, so use that API. */ - retval = vv_get_gen_mr_attrib(kibnal_data.kib_hca, - tx->tx_msg, - IBNAL_MSG_SIZE, - &mem_h, - &tx->l_key, - &tx->r_key); - if (retval) { - CERROR("vv_get_gen_mr_attrib failed: %d", retval); - /* TODO: free pages? */ - /* TODO: return. */ - } + vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca, + tx->tx_msg, + IBNAL_MSG_SIZE, + &mem_h, + &tx->tx_lkey, + &rkey); + LASSERT (vvrc == vv_return_ok); } - +#else + tx->tx_vaddr = vaddr; +#endif tx->tx_isnblk = (i >= IBNAL_NTX); tx->tx_mapped = KIB_TX_UNMAPPED; - CDEBUG(D_NET, "Tx[%d] %p->%p\n", i, tx, tx->tx_msg); + CDEBUG(D_NET, "Tx[%d] %p->%p[%x:"LPX64"]\n", i, tx, + tx->tx_msg, KIBNAL_TX_LKEY(tx), KIBNAL_TX_VADDR(tx)); if (tx->tx_isnblk) list_add (&tx->tx_list, @@ -1153,12 +1356,11 @@ kibnal_setup_tx_descs (void) return (0); } -static void +void kibnal_api_shutdown (nal_t *nal) { - int i; - int rc; - vv_return_t retval; + int i; + vv_return_t vvrc; if (nal->nal_refct != 0) { /* This module got the first ref */ @@ -1178,16 +1380,16 @@ kibnal_api_shutdown (nal_t *nal) libcfs_nal_cmd_unregister(VIBNAL); /* No new peers */ - /* resetting my NID to unadvertises me, removes my - * listener and nukes all current peers */ + /* resetting my NID removes my listener and nukes all current + * peers and their connections */ kibnal_set_mynid (PTL_NID_ANY); - /* Wait for all peer state to clean up (crazy) */ + /* Wait for all peer state to clean up */ i = 2; while (atomic_read (&kibnal_data.kib_npeers) != 0) { i++; CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? 
*/ - "waiting for %d peers to disconnect (can take a few seconds)\n", + "waiting for %d peers to disconnect\n", atomic_read (&kibnal_data.kib_npeers)); set_current_state (TASK_UNINTERRUPTIBLE); schedule_timeout (HZ); @@ -1195,56 +1397,36 @@ kibnal_api_shutdown (nal_t *nal) /* fall through */ case IBNAL_INIT_CQ: - retval = vv_cq_destroy(kibnal_data.kib_hca, kibnal_data.kib_cq); - if (retval) - CERROR ("Destroy CQ error: %d\n", retval); + vvrc = vv_cq_destroy(kibnal_data.kib_hca, kibnal_data.kib_cq); + if (vvrc != vv_return_ok) + CERROR ("Destroy CQ error: %d\n", vvrc); /* fall through */ case IBNAL_INIT_TXD: kibnal_free_pages (kibnal_data.kib_tx_pages); /* fall through */ -#if IBNAL_FMR - case IBNAL_INIT_FMR: - rc = ib_fmr_pool_destroy (kibnal_data.kib_fmr_pool); - if (rc != 0) - CERROR ("Destroy FMR pool error: %d\n", rc); - /* fall through */ -#endif case IBNAL_INIT_PD: -#if IBNAL_WHOLE_MEM==0 - retval = vv_pd_deallocate(kibnal_data.kib_hca, kibnal_data.kib_pd); - if (retval != 0) - CERROR ("Destroy PD error: %d\n", retval); +#if !IBNAL_WHOLE_MEM + vvrc = vv_pd_deallocate(kibnal_data.kib_hca, + kibnal_data.kib_pd); + if (vvrc != vv_return_ok) + CERROR ("Destroy PD error: %d\n", vvrc); #endif /* fall through */ - case IBNAL_INIT_GSI: - retval = gsi_deregister_class(kibnal_data.gsi_handle); - if (retval != 0) - CERROR ("GSI deregister failed: %d\n", retval); - /* fall through */ - - case IBNAL_INIT_GSI_POOL: - gsi_dtgrm_pool_destroy(kibnal_data.gsi_pool_handle); - /* fall through */ - - case IBNAL_INIT_PORT: - /* XXX ??? */ - /* fall through */ - case IBNAL_INIT_ASYNC: - retval = vv_dell_async_event_cb (kibnal_data.kib_hca, - kibnal_ca_async_callback); - if (retval) - CERROR("deregister asynchronous call back error: %d\n", retval); + vvrc = vv_dell_async_event_cb (kibnal_data.kib_hca, + kibnal_async_callback); + if (vvrc != vv_return_ok) + CERROR("vv_dell_async_event_cb error: %d\n", vvrc); /* fall through */ case IBNAL_INIT_HCA: - retval = vv_hca_close(kibnal_data.kib_hca); - if (retval != 0) - CERROR ("Close HCA error: %d\n", retval); + vvrc = vv_hca_close(kibnal_data.kib_hca); + if (vvrc != vv_return_ok) + CERROR ("Close HCA error: %d\n", vvrc); /* fall through */ case IBNAL_INIT_LIB: @@ -1252,8 +1434,6 @@ kibnal_api_shutdown (nal_t *nal) /* fall through */ case IBNAL_INIT_DATA: - /* Module refcount only gets to zero when all peers - * have been closed so all lists must be empty */ LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0); LASSERT (kibnal_data.kib_peers != NULL); for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { @@ -1262,7 +1442,9 @@ kibnal_api_shutdown (nal_t *nal) LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0); LASSERT (list_empty (&kibnal_data.kib_sched_rxq)); LASSERT (list_empty (&kibnal_data.kib_sched_txq)); + LASSERT (list_empty (&kibnal_data.kib_connd_zombies)); LASSERT (list_empty (&kibnal_data.kib_connd_conns)); + LASSERT (list_empty (&kibnal_data.kib_connd_pcreqs)); LASSERT (list_empty (&kibnal_data.kib_connd_peers)); /* flag threads to terminate; wake and wait for them to die */ @@ -1285,9 +1467,7 @@ kibnal_api_shutdown (nal_t *nal) break; } - if (kibnal_data.kib_tx_descs != NULL) - PORTAL_FREE (kibnal_data.kib_tx_descs, - IBNAL_TX_MSGS * sizeof(kib_tx_t)); + kibnal_free_tx_descs(); if (kibnal_data.kib_peers != NULL) PORTAL_FREE (kibnal_data.kib_peers, @@ -1302,32 +1482,18 @@ kibnal_api_shutdown (nal_t *nal) kibnal_data.kib_init = IBNAL_INIT_NOTHING; } -#define roundup_power(val, power) \ - ( (val + (__u64)(power - 1)) & ~((__u64)(power - 1)) ) - -/* 
this isn't very portable or sturdy in the face of funny mem/bus configs */ -static __u64 max_phys_mem(void) -{ - struct sysinfo si; - __u64 ret; - - si_meminfo(&si); - ret = (__u64)max(si.totalram, max_mapnr) * si.mem_unit; - return roundup_power(ret, 128 * 1024 * 1024); -} -#undef roundup_power - -static int +int kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, ptl_ni_limits_t *requested_limits, ptl_ni_limits_t *actual_limits) { - ptl_process_id_t process_id; - int pkmem = atomic_read(&portal_kmemory); - int rc; - int i; + struct timeval tv; + ptl_process_id_t process_id; + int pkmem = atomic_read(&portal_kmemory); + int rc; + int i; vv_request_event_record_t req_er; - vv_return_t retval; + vv_return_t vvrc; LASSERT (nal == &kibnal_api); @@ -1340,9 +1506,13 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, } LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING); + memset (&kibnal_data, 0, sizeof (kibnal_data)); /* zero pointers, flags etc */ + + do_gettimeofday(&tv); + kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; + kibnal_data.kib_svc_id = IBNAL_SERVICE_NUMBER; init_MUTEX (&kibnal_data.kib_nid_mutex); - kibnal_data.kib_nid = PTL_NID_ANY; rwlock_init(&kibnal_data.kib_global_lock); @@ -1357,7 +1527,9 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, spin_lock_init (&kibnal_data.kib_connd_lock); INIT_LIST_HEAD (&kibnal_data.kib_connd_peers); + INIT_LIST_HEAD (&kibnal_data.kib_connd_pcreqs); INIT_LIST_HEAD (&kibnal_data.kib_connd_conns); + INIT_LIST_HEAD (&kibnal_data.kib_connd_zombies); init_waitqueue_head (&kibnal_data.kib_connd_waitq); spin_lock_init (&kibnal_data.kib_sched_lock); @@ -1370,22 +1542,18 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs); init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq); - INIT_LIST_HEAD (&kibnal_data.gsi_pending); - init_MUTEX (&kibnal_data.gsi_mutex); - - PORTAL_ALLOC (kibnal_data.kib_tx_descs, - IBNAL_TX_MSGS * sizeof(kib_tx_t)); - if (kibnal_data.kib_tx_descs == NULL) { - CERROR ("Can't allocate tx descs\n"); + rc = kibnal_alloc_tx_descs(); + if (rc != 0) { + CERROR("Can't allocate tx descs\n"); goto failed; } - + /* lists/ptrs/locks initialised */ kibnal_data.kib_init = IBNAL_INIT_DATA; /*****************************************************/ process_id.pid = requested_pid; - process_id.nid = kibnal_data.kib_nid; + process_id.nid = PTL_NID_ANY; rc = lib_init(&kibnal_lib, nal, process_id, requested_limits, actual_limits); @@ -1399,7 +1567,7 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, /*****************************************************/ for (i = 0; i < IBNAL_N_SCHED; i++) { - rc = kibnal_thread_start (kibnal_scheduler, (void *)i); + rc = kibnal_thread_start (kibnal_scheduler, (void *)((long)i)); if (rc != 0) { CERROR("Can't spawn vibnal scheduler[%d]: %d\n", i, rc); @@ -1414,9 +1582,9 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, } /* TODO: apparently only one adapter is supported */ - retval = vv_hca_open("ANY_HCA", NULL, &kibnal_data.kib_hca); - if (retval) { - CERROR ("Can't open CA: %d\n", retval); + vvrc = vv_hca_open("ANY_HCA", NULL, &kibnal_data.kib_hca); + if (vvrc != vv_return_ok) { + CERROR ("Can't open CA: %d\n", vvrc); goto failed; } @@ -1425,12 +1593,10 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, /* register to get HCA's asynchronous events. 
*/ req_er.req_event_type = VV_ASYNC_EVENT_ALL_MASK; - retval = vv_set_async_event_cb (kibnal_data.kib_hca, - req_er, - kibnal_ca_async_callback); - - if (retval) { - CERROR ("Can't open CA: %d\n", retval); + vvrc = vv_set_async_event_cb (kibnal_data.kib_hca, req_er, + kibnal_async_callback); + if (vvrc != vv_return_ok) { + CERROR ("Can't open CA: %d\n", vvrc); goto failed; } @@ -1438,10 +1604,9 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, /*****************************************************/ - retval = vv_hca_query(kibnal_data.kib_hca, - &kibnal_data.kib_hca_attrs); - if (retval) { - CERROR ("Can't size port attrs: %d\n", retval); + vvrc = vv_hca_query(kibnal_data.kib_hca, &kibnal_data.kib_hca_attrs); + if (vvrc != vv_return_ok) { + CERROR ("Can't size port attrs: %d\n", vvrc); goto failed; } @@ -1453,9 +1618,10 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, u_int32_t tbl_count; vv_port_attrib_t *pattr = &kibnal_data.kib_port_attr; - retval = vv_port_query(kibnal_data.kib_hca, port_num, pattr); - if (retval) { - CERROR("vv_port_query failed for port %d: %d\n", port_num, retval); + vvrc = vv_port_query(kibnal_data.kib_hca, port_num, pattr); + if (vvrc != vv_return_ok) { + CERROR("vv_port_query failed for port %d: %d\n", + port_num, vvrc); continue; } @@ -1476,16 +1642,22 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, kibnal_data.kib_port = port_num; tbl_count = 1; - retval = vv_get_port_gid_tbl(kibnal_data.kib_hca, port_num, &tbl_count, &kibnal_data.kib_port_gid); - if (retval) { - CERROR("vv_get_port_gid_tbl failed for port %d: %d\n", port_num, retval); + vvrc = vv_get_port_gid_tbl(kibnal_data.kib_hca, + port_num, &tbl_count, + &kibnal_data.kib_port_gid); + if (vvrc != vv_return_ok) { + CERROR("vv_get_port_gid_tbl failed " + "for port %d: %d\n", port_num, vvrc); continue; } tbl_count = 1; - retval = vv_get_port_partition_tbl (kibnal_data.kib_hca, port_num, &tbl_count, &kibnal_data.kib_port_pkey); - if (retval) { - CERROR("vv_get_port_partition_tbl failed for port %d: %d\n", port_num, retval); + vvrc = vv_get_port_partition_tbl(kibnal_data.kib_hca, + port_num, &tbl_count, + &kibnal_data.kib_port_pkey); + if (vvrc != vv_return_ok) { + CERROR("vv_get_port_partition_tbl failed " + "for port %d: %d\n", port_num, vvrc); continue; } @@ -1505,45 +1677,19 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, } CDEBUG(D_NET, "Using port %d - GID="LPX64":"LPX64"\n", - kibnal_data.kib_port, kibnal_data.kib_port_gid.scope.g.subnet, kibnal_data.kib_port_gid.scope.g.eui64); - CDEBUG(D_NET, "got guid "LPX64"\n", cpu_to_le64(kibnal_data.kib_port_gid.scope.g.eui64)); + kibnal_data.kib_port, + kibnal_data.kib_port_gid.scope.g.subnet, + kibnal_data.kib_port_gid.scope.g.eui64); - /* Active port found */ - kibnal_data.kib_init = IBNAL_INIT_PORT; /*****************************************************/ - /* Prepare things to be able to send/receive MADS */ - retval = gsi_dtgrm_pool_create(IBNAL_CONCURRENT_PEERS, &kibnal_data.gsi_pool_handle); - if (retval) { - CERROR("Could not create GSI pool: %d\n", retval); - goto failed; - } - kibnal_data.kib_init = IBNAL_INIT_GSI_POOL; - - retval = gsi_register_class(MAD_CLASS_SUBN_ADM, /* TODO: correct? 
*/ - 2, /* version */ - "ANY_HCA", -#ifdef GSI_PASS_PORT_NUM - kibnal_data.kib_port, -#endif - 0, 0, - vibnal_mad_sent_cb, vibnal_mad_received_cb, - NULL, &kibnal_data.gsi_handle); - if (retval) { - CERROR("Cannot register GSI class: %d\n", retval); - goto failed; - } - - kibnal_data.kib_init = IBNAL_INIT_GSI; - /*****************************************************/ - -#if IBNAL_WHOLE_MEM==0 - retval = vv_pd_allocate(kibnal_data.kib_hca, &kibnal_data.kib_pd); +#if !IBNAL_WHOLE_MEM + vvrc = vv_pd_allocate(kibnal_data.kib_hca, &kibnal_data.kib_pd); #else - retval = vv_get_gen_pd_h(kibnal_data.kib_hca, &kibnal_data.kib_pd); + vvrc = vv_get_gen_pd_h(kibnal_data.kib_hca, &kibnal_data.kib_pd); #endif - if (retval) { - CERROR ("Can't create PD: %d\n", retval); + if (vvrc != 0) { + CERROR ("Can't create PD: %d\n", vvrc); goto failed; } @@ -1551,35 +1697,6 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, kibnal_data.kib_init = IBNAL_INIT_PD; /*****************************************************/ -#if IBNAL_FMR - { - const int pool_size = IBNAL_NTX + IBNAL_NTX_NBLK; - struct ib_fmr_pool_param params = { - .max_pages_per_fmr = PTL_MTU/PAGE_SIZE, - .access = (IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_REMOTE_READ), - .pool_size = pool_size, - .dirty_watermark = (pool_size * 3)/4, - .flush_function = NULL, - .flush_arg = NULL, - .cache = 1, - }; - rc = ib_fmr_pool_create(kibnal_data.kib_pd, ¶ms, - &kibnal_data.kib_fmr_pool); - if (rc != 0) { - CERROR ("Can't create FMR pool size %d: %d\n", - pool_size, rc); - goto failed; - } - } - - /* flag FMR pool initialised */ - kibnal_data.kib_init = IBNAL_INIT_FMR; -#endif - - /*****************************************************/ - rc = kibnal_setup_tx_descs(); if (rc != 0) { CERROR ("Can't register tx descs: %d\n", rc); @@ -1592,12 +1709,12 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, { uint32_t nentries; - retval = vv_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES, - kibnal_ca_callback, - NULL, /* context */ - &kibnal_data.kib_cq, &nentries); - if (retval) { - CERROR ("Can't create RX CQ: %d\n", retval); + vvrc = vv_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES, + kibnal_cq_callback, + NULL, /* context */ + &kibnal_data.kib_cq, &nentries); + if (vvrc != 0) { + CERROR ("Can't create RX CQ: %d\n", vvrc); goto failed; } @@ -1610,8 +1727,10 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, goto failed; } - retval = vv_request_completion_notification(kibnal_data.kib_hca, kibnal_data.kib_cq, vv_next_solicit_unsolicit_event); - if (retval != 0) { + vvrc = vv_request_completion_notification(kibnal_data.kib_hca, + kibnal_data.kib_cq, + vv_next_solicit_unsolicit_event); + if (vvrc != 0) { CERROR ("Failed to re-arm completion queue: %d\n", rc); goto failed; } @@ -1657,16 +1776,17 @@ kibnal_module_init (void) { int rc; - if (sizeof(kib_wire_connreq_t) > cm_REQ_priv_data_len) { - CERROR("sizeof(kib_wire_connreq_t) > cm_REQ_priv_data_len\n"); - return -EINVAL; - } - + CLASSERT (offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t) + <= cm_REQ_priv_data_len); + CLASSERT (offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t) + <= cm_REP_priv_data_len); + CLASSERT (offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[IBNAL_MAX_RDMA_FRAGS]) + <= IBNAL_MSG_SIZE); + CLASSERT (offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[IBNAL_MAX_RDMA_FRAGS]) + <= IBNAL_MSG_SIZE); + /* the following must be sizeof(int) for proc_dointvec() */ - if (sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)) { - CERROR("sizeof 
(kibnal_tunables.kib_io_timeout) != sizeof (int)\n"); - return -EINVAL; - } + CLASSERT (sizeof (kibnal_tunables.kib_io_timeout) == sizeof (int)); kibnal_api.nal_ni_init = kibnal_api_startup; kibnal_api.nal_ni_fini = kibnal_api_shutdown; diff --git a/lnet/klnds/viblnd/viblnd.h b/lnet/klnds/viblnd/viblnd.h index cf90aed..785494a 100644 --- a/lnet/klnds/viblnd/viblnd.h +++ b/lnet/klnds/viblnd/viblnd.h @@ -48,83 +48,104 @@ #include #include #include +#include -#define DEBUG_SUBSYSTEM S_IBNAL +#include +#include -#define IBNAL_CHECK_ADVERT +#define DEBUG_SUBSYSTEM S_NAL #include #include #include #include -#include -#include -#include -#include - -#if 0 -#undef CDEBUG -#define CDEBUG(mask, format, a...) printk(KERN_INFO "%s:%d - " format, __func__, __LINE__,##a) +/* CPU_{L,B}E #defines needed by Voltaire headers */ +#include +#ifdef __BIG_ENDIAN__ +#define CPU_BE 1 +#define CPU_LE 0 #endif - -#ifdef __CHECKER__ -#undef CDEBUG -#undef CERROR -#define CDEBUG(a...) -#define CERROR(a...) +#ifdef __LITTLE_ENDIAN__ +#define CPU_BE 0 +#define CPU_LE 1 #endif -#define GCC_VERSION (__GNUC__ * 10000 \ - + __GNUC_MINOR__ * 100 \ - + __GNUC_PATCHLEVEL__) +#include +#include +#include -/* Test for GCC > 3.2.2 */ -#if GCC_VERSION <= 30202 -/* GCC 3.2.2, and presumably several versions before it, will - * miscompile this driver. See - * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9853. */ +/* GCC 3.2.2, miscompiles this driver. + * See http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9853. */ +#define GCC_VERSION ((__GNUC__*100 + __GNUC_MINOR__)*100 + __GNUC_PATCHLEVEL__) +#if GCC_VERSION < 30203 #error Invalid GCC version. Must use GCC >= 3.2.3 #endif -#define IBNAL_SERVICE_NAME "vibnal" -#define IBNAL_SERVICE_NUMBER 0x11b9a2 /* TODO */ - #if CONFIG_SMP # define IBNAL_N_SCHED num_online_cpus() /* # schedulers */ #else # define IBNAL_N_SCHED 1 /* # schedulers */ #endif +/* sdp-connection.c */ +#define IBNAL_QKEY 0 +#define IBNAL_PKEY 0xffff +#define IBNAL_PKEY_IDX 0 +#define IBNAL_SGID_IDX 0 +#define IBNAL_SERVICE_LEVEL 0 +#define IBNAL_STATIC_RATE 0 +#define IBNAL_RETRY_CNT 7 +#define IBNAL_RNR_CNT 7 +#define IBNAL_EE_FLOW_CNT 1 +#define IBNAL_LOCAL_SUB 1 +#define IBNAL_TRAFFIC_CLASS 0 +#define IBNAL_SOURCE_PATH_BIT 0 +#define IBNAL_OUS_DST_RD 32 +#define IBNAL_IB_MTU vv_mtu_1024 + +/* sdp-hca-params.h */ +#define PATH_RATE_2_5GB 2 +#define MLX_IPD_1x 1 +#define MLX_IPD_4x 0 +#define IBNAL_R_2_STATIC_RATE(r) ((r) == PATH_RATE_2_5GB ? MLX_IPD_1x : MLX_IPD_4x) + +/* other low-level IB constants */ +#define IBNAL_LOCAL_ACK_TIMEOUT 0x12 +#define IBNAL_PKT_LIFETIME 5 +#define IBNAL_ARB_INITIATOR_DEPTH 0 +#define IBNAL_ARB_RESP_RES 0 +#define IBNAL_FAILOVER_ACCEPTED 0 +#define IBNAL_SERVICE_NUMBER 0x11b9a2 /* Fixed service number */ + #define IBNAL_MIN_RECONNECT_INTERVAL HZ /* first failed connection retry... 
*/ #define IBNAL_MAX_RECONNECT_INTERVAL (60*HZ) /* ...exponentially increasing to this */ -#define IBNAL_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */ - -#define IBNAL_MSG_QUEUE_SIZE 8 /* # messages/RDMAs in-flight */ -#define IBNAL_CREDIT_HIGHWATER 7 /* when to eagerly return credits */ +#define IBNAL_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */ -/* 7 indicates infinite retry attempts, Infinicon recommended 5 */ -#define IBNAL_RETRY 5 /* # times to retry */ -#define IBNAL_RNR_RETRY 5 /* */ -#define IBNAL_CM_RETRY 5 /* # times to retry connection */ +#define IBNAL_MSG_QUEUE_SIZE 8 /* # messages/RDMAs in-flight */ +#define IBNAL_CREDIT_HIGHWATER 7 /* when to eagerly return credits */ -#define IBNAL_FLOW_CONTROL 1 -#define IBNAL_ACK_TIMEOUT 20 /* supposedly 4 secs */ +#define IBNAL_NTX 64 /* # tx descs */ +#define IBNAL_NTX_NBLK 128 /* # reserved tx descs */ +/* reduced from 256 to ensure we register < 255 pages per region. + * this can change if we register all memory. */ -#define IBNAL_NTX 64 /* # tx descs */ -/* this had to be dropped down so that we only register < 255 pages per - * region. this will change if we register all memory. */ -#define IBNAL_NTX_NBLK 128 /* # reserved tx descs */ +#define IBNAL_PEER_HASH_SIZE 101 /* # peer lists */ -#define IBNAL_PEER_HASH_SIZE 101 /* # peer lists */ +#define IBNAL_RESCHED 100 /* # scheduler loops before reschedule */ -#define IBNAL_RESCHED 100 /* # scheduler loops before reschedule */ +#define IBNAL_CONCURRENT_PEERS 1000 /* # nodes all talking at once to me */ -#define IBNAL_CONCURRENT_PEERS 1000 /* # nodes all talking at once to me */ +#define IBNAL_RDMA_BASE 0x0eeb0000 +#define IBNAL_CKSUM 0 +#define IBNAL_WHOLE_MEM 1 +#if !IBNAL_WHOLE_MEM +# error "incompatible with voltaire adaptor-tavor (REGISTER_RAM_IN_ONE_PHY_MR)" +#endif /* default vals for runtime tunables */ -#define IBNAL_IO_TIMEOUT 50 /* default comms timeout (seconds) */ +#define IBNAL_IO_TIMEOUT 50 /* default comms timeout (seconds) */ /************************/ /* derived constants... */ @@ -134,30 +155,19 @@ #define IBNAL_TX_MSG_BYTES (IBNAL_TX_MSGS * IBNAL_MSG_SIZE) #define IBNAL_TX_MSG_PAGES ((IBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) -#define IBNAL_TX_MAX_SG (PTL_MD_MAX_IOV + 1) +#if IBNAL_WHOLE_MEM +# define IBNAL_MAX_RDMA_FRAGS PTL_MD_MAX_IOV +#else +# define IBNAL_MAX_RDMA_FRAGS 1 +#endif /* RX messages (per connection) */ #define IBNAL_RX_MSGS IBNAL_MSG_QUEUE_SIZE #define IBNAL_RX_MSG_BYTES (IBNAL_RX_MSGS * IBNAL_MSG_SIZE) #define IBNAL_RX_MSG_PAGES ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) - -/* we may have up to 2 completions per transmit + - 1 completion per receive, per connection */ -#define IBNAL_CQ_ENTRIES ((2*IBNAL_TX_MSGS) + \ - (IBNAL_RX_MSGS * IBNAL_CONCURRENT_PEERS)) - -#define IBNAL_RDMA_BASE 0x0eeb0000 -#define IBNAL_FMR 0 -#define IBNAL_WHOLE_MEM 1 -#define IBNAL_CKSUM 0 - -/* Starting sequence number. */ -#define IBNAL_STARTING_PSN 0x465A - -/* Timeout for SA requests, in seconds */ -#define GSI_TIMEOUT 5 -#define GSI_RETRY 10 +#define IBNAL_CQ_ENTRIES (IBNAL_TX_MSGS * (1 + IBNAL_MAX_RDMA_FRAGS) + \ + IBNAL_RX_MSGS * IBNAL_CONCURRENT_PEERS) typedef struct { @@ -165,8 +175,6 @@ typedef struct struct ctl_table_header *kib_sysctl; /* sysctl interface */ } kib_tunables_t; -/* some of these have specific types in the stack that just map back - * to the uFOO types, like IB_{L,R}_KEY. 
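A note on the IBNAL_CQ_ENTRIES formula just above: the CQ is sized for the worst case up front, since completions must never be dropped. Every transmit can raise one completion per work request it posts (one message send plus up to IBNAL_MAX_RDMA_FRAGS RDMA fragments), and every receive buffer posted for every concurrent peer can raise one. IBNAL_TX_MSGS is presumably IBNAL_NTX + IBNAL_NTX_NBLK = 192, and the rx term is 8 * 1000 = 8000; the tx term depends on PTL_MD_MAX_IOV, whose value is not visible in this header. A back-of-envelope sketch with an illustrative (not actual) value of 256:

    // illustrative arithmetic only: DEMO_MAX_IOV stands in for PTL_MD_MAX_IOV
    enum {
            DEMO_MAX_IOV    = 256,
            DEMO_TX_MSGS    = 64 + 128,     // IBNAL_NTX + IBNAL_NTX_NBLK
            DEMO_RX_CQES    = 8 * 1000,     // IBNAL_RX_MSGS * IBNAL_CONCURRENT_PEERS
            DEMO_CQ_ENTRIES = DEMO_TX_MSGS * (1 + DEMO_MAX_IOV) + DEMO_RX_CQES
    };                                      // 192 * 257 + 8000 = 57344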
*/ typedef struct { int ibp_npages; /* # pages */ @@ -184,51 +192,38 @@ typedef struct __u32 md_lkey; __u32 md_rkey; __u64 md_addr; -} kib_md_t __attribute__((packed)); +} kib_md_t; typedef struct { - /* initialisation state. These values are sorted by their initialization order. */ - enum { - IBNAL_INIT_NOTHING, - IBNAL_INIT_DATA, - IBNAL_INIT_LIB, - IBNAL_INIT_HCA, - IBNAL_INIT_ASYNC, - IBNAL_INIT_PORT, - IBNAL_INIT_GSI_POOL, - IBNAL_INIT_GSI, - IBNAL_INIT_PD, -#if IBNAL_FMR - IBNAL_INIT_FMR, -#endif - IBNAL_INIT_TXD, - IBNAL_INIT_CQ, - IBNAL_INIT_ALL, - } kib_init; - + int kib_init; /* initialisation state */ __u64 kib_incarnation; /* which one am I */ int kib_shutdown; /* shut down? */ atomic_t kib_nthreads; /* # live threads */ - __u64 kib_service_id; /* service number I listen on */ - vv_gid_t kib_port_gid; /* port GID in HOST ORDER! */ - vv_p_key_t kib_port_pkey; /* my pkey */ - ptl_nid_t kib_nid; /* my NID */ + __u64 kib_svc_id; /* service number I listen on */ + vv_gid_t kib_port_gid; /* device/port GID */ + vv_p_key_t kib_port_pkey; /* device/port pkey */ + struct semaphore kib_nid_mutex; /* serialise NID ops */ - cm_cep_handle_t kib_cep; /* connection end point */ + cm_cep_handle_t kib_listen_handle; /* IB listen handle */ rwlock_t kib_global_lock; /* stabilize peer/conn ops */ - + spinlock_t kib_vverbs_lock; /* serialize vverbs calls */ + int kib_ready; /* CQ callback fired */ + int kib_checking_cq; /* a scheduler is checking the CQ */ + struct list_head *kib_peers; /* hash table of all my known peers */ int kib_peer_hash_size; /* size of kib_peers */ atomic_t kib_npeers; /* # peers extant */ atomic_t kib_nconns; /* # connections extant */ - struct list_head kib_connd_conns; /* connections to progress */ - struct list_head kib_connd_peers; /* peers waiting for a connection */ - wait_queue_head_t kib_connd_waitq; /* connection daemons sleep here */ - unsigned long kib_connd_waketime; /* when connd will wake */ + void *kib_connd; /* the connd task (serialisation assertions) */ + struct list_head kib_connd_peers; /* peers wanting to get connected */ + struct list_head kib_connd_pcreqs; /* passive connection requests */ + struct list_head kib_connd_conns; /* connections to setup/teardown */ + struct list_head kib_connd_zombies; /* connections with zero refcount */ + wait_queue_head_t kib_connd_waitq; /* connection daemon sleeps here */ spinlock_t kib_connd_lock; /* serialise */ wait_queue_head_t kib_sched_waitq; /* schedulers sleep here */ @@ -246,44 +241,36 @@ typedef struct spinlock_t kib_tx_lock; /* serialise */ vv_hca_h_t kib_hca; /* The HCA */ - vv_hca_attrib_t kib_hca_attrs; /* HCA attributes */ - + vv_hca_attrib_t kib_hca_attrs; /* its properties */ int kib_port; /* port on the device */ - vv_port_attrib_t kib_port_attr; /* port attributes */ + vv_port_attrib_t kib_port_attr; /* its properties */ vv_pd_h_t kib_pd; /* protection domain */ vv_cq_h_t kib_cq; /* completion queue */ - void *kib_listen_handle; /* where I listen for connections */ - - /* These fields are left untouched, so they can be shared. */ - union { - cm_drequest_data_t dreq_data; - cm_dreply_data_t drep_data; - } cm_data; - - /* Send and receive MADs (service records, path records) */ - gsi_class_handle_t gsi_handle; - gsi_dtgrm_pool_handle_t gsi_pool_handle; - struct semaphore gsi_mutex; /* protect GSI list - TODO:spinlock instead? 
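The kib_init field records how far initialisation got as one of the IBNAL_INIT_* levels listed just below, in startup order, so that shutdown can unwind exactly the resources that were actually set up. The usual shape is a fall-through switch; this is a sketch only, with the real release calls (which live in kibnal_api_shutdown()) elided as comments:

    // sketch: each level falls through to tear down everything below it
    switch (kibnal_data.kib_init) {
    case IBNAL_INIT_ALL:
    case IBNAL_INIT_CQ:             // destroy the completion queue
    case IBNAL_INIT_TXD:            // unmap and free tx descriptors
    case IBNAL_INIT_PD:             // release the protection domain
    case IBNAL_INIT_ASYNC:          // unhook the async event callback
    case IBNAL_INIT_HCA:            // close the HCA
    case IBNAL_INIT_LIB:            // shut down the portals lib NI
    case IBNAL_INIT_DATA:           // free the peer table
    case IBNAL_INIT_NOTHING:
            break;
    }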
*/ - struct list_head gsi_pending; /* pending GSI datagrams */ - } kib_data_t; +#define IBNAL_INIT_NOTHING 0 +#define IBNAL_INIT_DATA 1 +#define IBNAL_INIT_LIB 2 +#define IBNAL_INIT_HCA 3 +#define IBNAL_INIT_ASYNC 4 +#define IBNAL_INIT_PD 5 +#define IBNAL_INIT_TXD 6 +#define IBNAL_INIT_CQ 7 +#define IBNAL_INIT_ALL 8 + /************************************************************************ - * Wire message structs. + * IB Wire message format. * These are sent in sender's byte order (i.e. receiver flips). - * CAVEAT EMPTOR: other structs communicated between nodes (e.g. MAD - * private data and SM service info), is LE on the wire. */ -/* also kib_md_t above */ - -typedef struct +typedef struct kib_connparams { - __u32 rd_nob; /* # of bytes */ - __u64 rd_addr; /* remote io vaddr */ -} kib_rdma_desc_t __attribute__((packed)); + __u32 ibcp_queue_depth; + __u32 ibcp_max_msg_size; + __u32 ibcp_max_frags; +} kib_connparams_t __attribute__((packed)); typedef struct { @@ -291,54 +278,91 @@ typedef struct char ibim_payload[0]; /* piggy-backed payload */ } kib_immediate_msg_t __attribute__((packed)); -/* these arrays serve two purposes during rdma. they are built on the passive - * side and sent to the active side as remote arguments. On the active side - * the descs are used as a data structure on the way to local gather items. - * the different roles result in split local/remote meaning of desc->rd_key */ +/* YEUCH! the __u64 address is split into 2 __u32 fields to ensure proper + * packing. Otherwise we can't fit enough frags into an IBNAL message (<= + * smallest page size on any arch). */ +typedef struct +{ + __u32 rf_nob; /* # of bytes */ + __u32 rf_addr_lo; /* lo 4 bytes of vaddr */ + __u32 rf_addr_hi; /* hi 4 bytes of vaddr */ +} kib_rdma_frag_t __attribute__((packed)); + typedef struct { - ptl_hdr_t ibrm_hdr; /* portals header */ - __u64 ibrm_cookie; /* opaque completion cookie */ - __u32 ibrm_num_descs; /* how many descs */ - __u32 rd_key; /* remote key */ - kib_rdma_desc_t ibrm_desc[0]; /* where to suck/blow */ -} kib_rdma_msg_t __attribute__((packed)); + __u32 rd_key; /* local/remote key */ + __u32 rd_nfrag; /* # fragments */ + kib_rdma_frag_t rd_frags[0]; /* buffer frags */ +} kib_rdma_desc_t __attribute__((packed)); + +/* CAVEAT EMPTOR! 
We don't actually put ibprm_rd on the wire; it's just there + * to remember the source buffers while we wait for the PUT_ACK */ -#define kib_rdma_msg_len(num_descs) \ - offsetof(kib_msg_t, ibm_u.rdma.ibrm_desc[num_descs]) +typedef struct +{ + ptl_hdr_t ibprm_hdr; /* portals header */ + __u64 ibprm_cookie; /* opaque completion cookie */ + kib_rdma_frag_t ibprm_rd; /* source buffer */ +} kib_putreq_msg_t __attribute__((packed)); + +typedef struct +{ + __u64 ibpam_src_cookie; /* reflected completion cookie */ + __u64 ibpam_dst_cookie; /* opaque completion cookie */ + kib_rdma_desc_t ibpam_rd; /* sender's sink buffer */ +} kib_putack_msg_t __attribute__((packed)); + +typedef struct +{ + ptl_hdr_t ibgm_hdr; /* portals header */ + __u64 ibgm_cookie; /* opaque completion cookie */ + kib_rdma_desc_t ibgm_rd; /* rdma descriptor */ +} kib_get_msg_t __attribute__((packed)); typedef struct { __u64 ibcm_cookie; /* opaque completion cookie */ - __u32 ibcm_status; /* completion status */ + __s32 ibcm_status; /* < 0 failure: >= 0 length */ } kib_completion_msg_t __attribute__((packed)); typedef struct { - __u32 ibm_magic; /* I'm an openibnal message */ - __u16 ibm_version; /* this is my version number */ - __u8 ibm_type; /* msg type */ - __u8 ibm_credits; /* returned credits */ -#if IBNAL_CKSUM - __u32 ibm_nob; - __u32 ibm_cksum; -#endif + /* First 2 fields fixed FOR ALL TIME */ + __u32 ibm_magic; /* I'm an openibnal message */ + __u16 ibm_version; /* this is my version number */ + + __u8 ibm_type; /* msg type */ + __u8 ibm_credits; /* returned credits */ + __u32 ibm_nob; /* # bytes in whole message */ + __u32 ibm_cksum; /* checksum (0 == no checksum) */ + __u64 ibm_srcnid; /* sender's NID */ + __u64 ibm_srcstamp; /* sender's incarnation */ + __u64 ibm_dstnid; /* destination's NID */ + __u64 ibm_dststamp; /* destination's incarnation */ + union { + kib_connparams_t connparams; kib_immediate_msg_t immediate; - kib_rdma_msg_t rdma; + kib_putreq_msg_t putreq; + kib_putack_msg_t putack; + kib_get_msg_t get; kib_completion_msg_t completion; } ibm_u __attribute__((packed)); } kib_msg_t __attribute__((packed)); #define IBNAL_MSG_MAGIC 0x0be91b91 /* unique magic */ -#define IBNAL_MSG_VERSION 1 /* current protocol version */ +#define IBNAL_MSG_VERSION 4 /* current protocol version */ +#define IBNAL_MSG_CONNREQ 0xc0 /* connection request */ +#define IBNAL_MSG_CONNACK 0xc1 /* connection acknowledge */ #define IBNAL_MSG_NOOP 0xd0 /* nothing (just credits) */ -#define IBNAL_MSG_IMMEDIATE 0xd1 /* portals hdr + payload */ -#define IBNAL_MSG_PUT_RDMA 0xd2 /* portals PUT hdr + source rdma desc */ -#define IBNAL_MSG_PUT_DONE 0xd3 /* signal PUT rdma completion */ -#define IBNAL_MSG_GET_RDMA 0xd4 /* portals GET hdr + sink rdma desc */ -#define IBNAL_MSG_GET_DONE 0xd5 /* signal GET rdma completion */ +#define IBNAL_MSG_IMMEDIATE 0xd1 /* immediate */ +#define IBNAL_MSG_PUT_REQ 0xd2 /* putreq (src->sink) */ +#define IBNAL_MSG_PUT_NAK 0xd3 /* completion (sink->src) */ +#define IBNAL_MSG_PUT_ACK 0xd4 /* putack (sink->src) */ +#define IBNAL_MSG_PUT_DONE 0xd5 /* completion (src->sink) */ +#define IBNAL_MSG_GET_REQ 0xd6 /* getreq (sink->src) */ +#define IBNAL_MSG_GET_DONE 0xd7 /* completion (src->sink: all OK) */ /***********************************************************************/ @@ -346,14 +370,26 @@ typedef struct kib_rx /* receive message */ { struct list_head rx_list; /* queue for attention */ struct kib_conn *rx_conn; /* owning conn */ - int rx_rdma; /* RDMA completion posted? */ + int rx_responded; /* responded to peer? 
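The "fixed FOR ALL TIME" note above is what makes version negotiation and byte-order detection possible at all: whatever the protocol becomes, a receiver can always read the leading magic and version, recognise a byte-swapped peer from the swabbed magic, and bail out before trusting any other field. kibnal_unpack_msg() presumably begins along these lines (as the old receive path, removed further down in viblnd_cb.c, did explicitly; "flip" is a hypothetical local):

    // sketch: detect byte-swapped peers from the fixed leading fields
    int flip;

    if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
            flip = 0;
    } else if (msg->ibm_magic == __swab32(IBNAL_MSG_MAGIC)) {
            flip = 1;                       // peer has opposite byte order
    } else {
            return -EPROTO;                 // not an IBNAL message at all
    }

    if ((flip ? __swab16(msg->ibm_version) : msg->ibm_version) !=
        IBNAL_MSG_VERSION)
            return -EPROTO;                 // incompatible protocol version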
*/ int rx_posted; /* posted? */ - kib_msg_t *rx_msg; /* pre-mapped buffer */ - vv_l_key_t l_key; - vv_wr_t rx_wrq; +#if IBNAL_WHOLE_MEM + vv_l_key_t rx_lkey; /* local key */ +#else + __u64 rx_vaddr; /* pre-mapped buffer (hca vaddr) */ +#endif + kib_msg_t *rx_msg; /* pre-mapped buffer (host vaddr) */ + vv_wr_t rx_wrq; /* receive work item */ vv_scatgat_t rx_gl; /* and its memory */ } kib_rx_t; +#if IBNAL_WHOLE_MEM +# define KIBNAL_RX_VADDR(rx) ((__u64)((unsigned long)((rx)->rx_msg))) +# define KIBNAL_RX_LKEY(rx) ((rx)->rx_lkey) +#else +# define KIBNAL_RX_VADDR(rx) ((rx)->rx_vaddr) +# define KIBNAL_RX_LKEY(rx) ((rx)->rx_conn->ibc_rx_pages->ibp_lkey) +#endif + typedef struct kib_tx /* transmit message */ { struct list_head tx_list; /* queue on idle_txs ibc_tx_queue etc. */ @@ -361,55 +397,59 @@ typedef struct kib_tx /* transmit message */ struct kib_conn *tx_conn; /* owning conn */ int tx_mapped; /* mapped for RDMA? */ int tx_sending; /* # tx callbacks outstanding */ + int tx_waiting; /* waiting for peer */ int tx_status; /* completion status */ unsigned long tx_deadline; /* completion deadline */ - int tx_passive_rdma; /* peer sucks/blows */ - int tx_passive_rdma_wait; /* waiting for peer to complete */ - __u64 tx_passive_rdma_cookie; /* completion cookie */ + __u64 tx_cookie; /* completion cookie */ lib_msg_t *tx_libmsg[2]; /* lib msgs to finalize on completion */ +#if IBNAL_WHOLE_MEM + vv_l_key_t tx_lkey; /* local key for message buffer */ +#else kib_md_t tx_md; /* RDMA mapping (active/passive) */ - kib_msg_t *tx_msg; /* pre-mapped buffer */ - vv_l_key_t l_key; - vv_r_key_t r_key; - int tx_nsp; /* # send work items */ - vv_wr_t tx_wrq[IBNAL_TX_MAX_SG]; /* send work items... */ - vv_scatgat_t tx_gl[IBNAL_TX_MAX_SG]; /* ...and their memory */ + __u64 tx_vaddr; /* pre-mapped buffer (hca vaddr) */ +#endif + kib_msg_t *tx_msg; /* message buffer (host vaddr) */ + int tx_nwrq; /* # send work items */ + vv_wr_t *tx_wrq; /* send work items... 
*/ + vv_scatgat_t *tx_gl; /* ...and their memory */ + kib_rdma_desc_t *tx_rd; /* rdma descriptor (src buffers) */ } kib_tx_t; +#if IBNAL_WHOLE_MEM +# define KIBNAL_TX_VADDR(tx) ((__u64)((unsigned long)((tx)->tx_msg))) +# define KIBNAL_TX_LKEY(tx) ((tx)->tx_lkey) +#else +# define KIBNAL_TX_VADDR(tx) ((tx)->tx_vaddr) +# define KIBNAL_TX_LKEY(tx) (kibnal_data.kib_tx_pages->ibp_lkey) +#endif + #define KIB_TX_UNMAPPED 0 #define KIB_TX_MAPPED 1 -#define KIB_TX_MAPPED_FMR 2 - -typedef struct kib_wire_connreq -{ - __u32 wcr_magic; /* I'm an openibnal connreq */ - __u16 wcr_version; /* this is my version number */ - __u16 wcr_queue_depth; /* this is my receive queue size */ - __u64 wcr_nid; /* peer's NID */ - __u64 wcr_incarnation; /* peer's incarnation */ -} kib_wire_connreq_t; - -typedef struct kib_gid -{ - __u64 hi, lo; -} kib_gid_t; -typedef struct kib_connreq -{ - /* connection-in-progress */ - struct kib_conn *cr_conn; - kib_wire_connreq_t cr_wcr; - __u64 cr_tid; - //ib_service_record_v2_t cr_service; - kib_gid_t cr_gid; - ib_path_record_v2_t cr_path; - - union { - cm_request_data_t cr_cm_req; - cm_rtu_data_t cr_cm_rtu; - } ; - -} kib_connreq_t; +/* Passive connection request (listener callback) queued for handling by connd */ +typedef struct kib_pcreq +{ + struct list_head pcr_list; /* queue for handling by connd */ + cm_cep_handle_t pcr_cep; /* listening handle */ + cm_request_data_t pcr_cmreq; /* request data */ +} kib_pcreq_t; + +typedef struct kib_connvars +{ + /* connection-in-progress variables */ + __u32 cv_port; + __u32 cv_pkey_index; + __u32 cv_rnr_count; + __u32 cv_sgid_index; + __u32 cv_remote_qpn; + __u32 cv_local_qpn; + __u32 cv_rxpsn; + __u32 cv_txpsn; + ib_path_record_v2_t cv_path; + ibat_arp_data_t cv_arp; + ibat_stat_t cv_arprc; + cm_conn_data_t cv_conndata; +} kib_connvars_t; typedef struct kib_conn { @@ -422,43 +462,39 @@ typedef struct kib_conn int ibc_nsends_posted; /* # uncompleted sends */ int ibc_credits; /* # credits I have */ int ibc_outstanding_credits; /* # credits to return */ - int ibc_rcvd_disconnect;/* received discon request */ - int ibc_sent_disconnect;/* sent discon request */ + int ibc_disconnect; /* some disconnect callback fired */ + int ibc_comms_error; /* set on comms error */ + struct list_head ibc_early_rxs; /* rxs completed before ESTABLISHED */ struct list_head ibc_tx_queue; /* send queue */ struct list_head ibc_active_txs; /* active tx awaiting completion */ spinlock_t ibc_lock; /* serialise */ kib_rx_t *ibc_rxs; /* the rx descs */ kib_pages_t *ibc_rx_pages; /* premapped rx msg pages */ vv_qp_h_t ibc_qp; /* queue pair */ - cm_cep_handle_t ibc_cep; /* connection ID? 
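kib_pcreq_t above exists because the CM listen callback can fire in a context where it must not block or take heavyweight work inline, so passive connection requests are parked for the connection daemon instead. The handoff one would expect in kibnal_listen_callback() looks roughly like this (allocation and failure handling elided; pcr, cmreq and flags are hypothetical locals):

    // sketch: queue the CM request for connd and wake it
    pcr->pcr_cep   = cep;                   // CM endpoint for this request
    pcr->pcr_cmreq = *cmreq;                // copy the CM request data

    spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
    list_add_tail(&pcr->pcr_list, &kibnal_data.kib_connd_pcreqs);
    wake_up(&kibnal_data.kib_connd_waitq);
    spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);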
*/
-	vv_qp_attr_t            ibc_qp_attrs;    /* QP attrs */
-	kib_connreq_t          *ibc_connreq;     /* connection request state */
+	cm_cep_handle_t     ibc_cep;         /* connection endpoint */
+	kib_connvars_t     *ibc_connvars;    /* in-progress connection state */
 } kib_conn_t;
 
-#define IBNAL_CONN_INIT_NOTHING      0  /* initial state */
-#define IBNAL_CONN_INIT_QP           1  /* ibc_qp set up */
-#define IBNAL_CONN_CONNECTING        2  /* started to connect */
-#define IBNAL_CONN_ESTABLISHED       3  /* connection established */
-#define IBNAL_CONN_SEND_DREQ         4  /* to send disconnect req */
-#define IBNAL_CONN_DREQ              5  /* sent disconnect req */
-#define IBNAL_CONN_DREP              6  /* sent disconnect rep */
-#define IBNAL_CONN_DISCONNECTED      7  /* no more QP or CM traffic */
-
-#define KIB_ASSERT_CONN_STATE(conn, state) do {                         \
-	LASSERTF((conn)->ibc_state == state, "%d\n", conn->ibc_state);  \
-} while (0)
-
-#define KIB_ASSERT_CONN_STATE_RANGE(conn, low, high) do {               \
-	LASSERTF(low <= high, "%d %d\n", low, high);                    \
-	LASSERTF((conn)->ibc_state >= low &&                            \
-		 (conn)->ibc_state <= high, "%d\n", conn->ibc_state);   \
-} while (0)
+#define IBNAL_CONN_INIT_NOTHING        0         /* incomplete init */
+#define IBNAL_CONN_INIT                1         /* completed init */
+#define IBNAL_CONN_ACTIVE_ARP          2         /* active arping */
+#define IBNAL_CONN_ACTIVE_CONNECT      3         /* active sending req */
+#define IBNAL_CONN_ACTIVE_CHECK_REPLY  4         /* active checking reply */
+#define IBNAL_CONN_ACTIVE_RTU          5         /* active sending rtu */
+#define IBNAL_CONN_PASSIVE_WAIT        6         /* passive waiting for rtu */
+#define IBNAL_CONN_ESTABLISHED         7         /* connection established */
+#define IBNAL_CONN_DISCONNECT1         8         /* disconnect phase 1 */
+#define IBNAL_CONN_DISCONNECT2         9         /* disconnect phase 2 */
+#define IBNAL_CONN_DISCONNECTED       10         /* disconnect complete */
 
 typedef struct kib_peer
 {
 	struct list_head    ibp_list;           /* stash on global peer list */
 	struct list_head    ibp_connd_list;     /* schedule on kib_connd_peers */
 	ptl_nid_t           ibp_nid;            /* who's on the other end(s) */
+	__u32               ibp_ip;             /* IP to query for peer conn params */
+	int                 ibp_port;           /* port to query for peer conn params */
+	__u64               ibp_incarnation;    /* peer's incarnation */
 	atomic_t            ibp_refcount;       /* # users */
 	int                 ibp_persistence;    /* "known" peer refs */
 	struct list_head    ibp_conns;          /* all active connections */
@@ -468,75 +504,95 @@ typedef struct kib_peer
 	unsigned long       ibp_reconnect_interval; /* exponential backoff */
 } kib_peer_t;
 
-struct sa_request;
-typedef void (*sa_request_cb_t)(struct sa_request *request);
-
-struct sa_request {
-	/* Link all the pending GSI datagrams together. 
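The IBNAL_CONN_* values above are deliberately ordered: an active connect walks INIT, ACTIVE_ARP, ACTIVE_CONNECT, ACTIVE_CHECK_REPLY, ACTIVE_RTU, ESTABLISHED; a passive accept walks INIT, PASSIVE_WAIT, ESTABLISHED; teardown walks DISCONNECT1, DISCONNECT2, DISCONNECTED. Because the numbering is monotonic along each path, code elsewhere in this patch tests thresholds instead of enumerating states:

    // sketch: the threshold tests the ordering makes possible
    if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
            // still connecting: park incoming work until established
    } else if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) {
            // tearing down: post no more work, let completions drain
    }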
*/ - struct list_head list; - int retry; /* number of retries left (after a timeout only) */ - int status; /* status of the request */ - gsi_dtgrm_t *dtgrm_req; /* request */ - gsi_dtgrm_t *dtgrm_resp; /* response */ - sa_mad_v2_t *mad; /* points inside the datagram */ +extern lib_nal_t kibnal_lib; +extern kib_data_t kibnal_data; +extern kib_tunables_t kibnal_tunables; - void *context; +extern void kibnal_init_msg(kib_msg_t *msg, int type, int body_nob); +extern void kibnal_pack_msg(kib_msg_t *msg, int credits, + ptl_nid_t dstnid, __u64 dststamp); +extern int kibnal_unpack_msg(kib_msg_t *msg, int nob); +extern kib_peer_t *kibnal_create_peer(ptl_nid_t nid); +extern void kibnal_destroy_peer(kib_peer_t *peer); +extern int kibnal_del_peer(ptl_nid_t nid, int single_share); +extern kib_peer_t *kibnal_find_peer_locked(ptl_nid_t nid); +extern void kibnal_unlink_peer_locked(kib_peer_t *peer); +extern int kibnal_close_stale_conns_locked(kib_peer_t *peer, + __u64 incarnation); +extern kib_conn_t *kibnal_create_conn(cm_cep_handle_t cep); +extern void kibnal_listen_callback(cm_cep_handle_t cep, cm_conn_data_t *info, void *arg); - struct timer_list timer; +extern int kibnal_alloc_pages(kib_pages_t **pp, int npages, int access); +extern void kibnal_free_pages(kib_pages_t *p); - /* When the requests is completed, we either call the callback - * or post a completion. They are mutually exclusive. */ - struct completion signal; - sa_request_cb_t callback; -}; +extern void kibnal_check_sends(kib_conn_t *conn); +extern void kibnal_close_conn_locked(kib_conn_t *conn, int error); +extern void kibnal_destroy_conn(kib_conn_t *conn); +extern int kibnal_thread_start(int (*fn)(void *arg), void *arg); +extern int kibnal_scheduler(void *arg); +extern int kibnal_connd(void *arg); +extern void kibnal_init_tx_msg(kib_tx_t *tx, int type, int body_nob); +extern void kibnal_close_conn(kib_conn_t *conn, int why); +extern int kibnal_set_qp_state(kib_conn_t *conn, vv_qp_state_t new_state); +extern void kibnal_async_callback(vv_event_record_t ev); +extern void kibnal_cq_callback(unsigned long context); +extern void kibnal_passive_connreq(kib_pcreq_t *pcr, int reject); +extern void kibnal_pause(int ticks); +extern void kibnal_queue_tx(kib_tx_t *tx, kib_conn_t *conn); +extern int kibnal_init_rdma(kib_tx_t *tx, int type, int nob, + kib_rdma_desc_t *dstrd, __u64 dstcookie); -/* The CM callback are called on the interrupt level. However we - * cannot do everything we want on that level, so we let keventd run - * the callback. 
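The extern block above is the whole of the new message-framing API: kibnal_init_msg()/kibnal_init_tx_msg() set the type and body length, while kibnal_pack_msg() presumably stamps the remaining kib_msg_t header fields (ibm_nob, ibm_cksum, and the NID/incarnation fields that kibnal_rx_complete() later checks against the connection). Under that assumption, framing a credit-only NOOP would look like:

    // sketch: frame a NOOP that only carries returned credits
    kibnal_init_msg(msg, IBNAL_MSG_NOOP, 0);
    kibnal_pack_msg(msg, conn->ibc_outstanding_credits,
                    conn->ibc_peer->ibp_nid, conn->ibc_incarnation);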
*/ -struct cm_off_level { - struct tq_struct tq; +static inline int +wrq_signals_completion (vv_wr_t *wrq) +{ + return wrq->completion_notification != 0; +} - cm_cep_handle_t cep; - cm_conn_data_t *info; - kib_conn_t *conn; -}; +static inline void +kibnal_conn_addref (kib_conn_t *conn) +{ + CDEBUG(D_NET, "++conn[%p] (%d)\n", + conn, atomic_read(&conn->ibc_refcount)); + LASSERT(atomic_read(&conn->ibc_refcount) > 0); + atomic_inc(&conn->ibc_refcount); +} -extern lib_nal_t kibnal_lib; -extern kib_data_t kibnal_data; -extern kib_tunables_t kibnal_tunables; +static inline void +kibnal_conn_decref (kib_conn_t *conn) +{ + unsigned long flags; + + CDEBUG(D_NET, "--conn[%p] (%d)\n", + conn, atomic_read(&conn->ibc_refcount)); + LASSERT(atomic_read(&conn->ibc_refcount) > 0); + if (atomic_dec_and_test(&conn->ibc_refcount)) { + spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); + list_add_tail(&conn->ibc_list, &kibnal_data.kib_connd_zombies); + wake_up(&kibnal_data.kib_connd_waitq); + spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags); + } +} -static inline int wrq_signals_completion(vv_wr_t *wrq) +static inline void +kibnal_peer_addref (kib_peer_t *peer) { - return wrq->completion_notification != 0; + CDEBUG(D_NET, "++peer[%p] -> "LPX64" (%d)\n", + peer, peer->ibp_nid, atomic_read (&peer->ibp_refcount)); + LASSERT(atomic_read(&peer->ibp_refcount) > 0); + atomic_inc(&peer->ibp_refcount); } -/******************************************************************************/ - -/* these are purposely avoiding using local vars so they don't increase - * stack consumption. */ - -#define kib_peer_addref(peer) do { \ - LASSERTF(atomic_read(&peer->ibp_refcount) > 0, "%d\n", \ - atomic_read(&peer->ibp_refcount)); \ - CDEBUG(D_NET, "++peer[%p] -> "LPX64" (%d)\n", \ - peer, peer->ibp_nid, atomic_read (&peer->ibp_refcount)); \ - atomic_inc(&peer->ibp_refcount); \ -} while (0) - -#define kib_peer_decref(peer) do { \ - LASSERTF(atomic_read(&peer->ibp_refcount) > 0, "%d\n", \ - atomic_read(&peer->ibp_refcount)); \ - CDEBUG(D_NET, "--peer[%p] -> "LPX64" (%d)\n", \ - peer, peer->ibp_nid, atomic_read (&peer->ibp_refcount)); \ - if (atomic_dec_and_test (&peer->ibp_refcount)) { \ - CDEBUG (D_NET, "destroying peer "LPX64" %p\n", \ - peer->ibp_nid, peer); \ - kibnal_destroy_peer (peer); \ - } \ -} while (0) - -/******************************************************************************/ +static inline void +kibnal_peer_decref (kib_peer_t *peer) +{ + CDEBUG(D_NET, "--peer[%p] -> "LPX64" (%d)\n", + peer, peer->ibp_nid, atomic_read (&peer->ibp_refcount)); + + LASSERT(atomic_read(&peer->ibp_refcount) > 0); + if (atomic_dec_and_test (&peer->ibp_refcount)) + kibnal_destroy_peer (peer); +} static inline struct list_head * kibnal_nid2peerlist (ptl_nid_t nid) @@ -547,7 +603,7 @@ kibnal_nid2peerlist (ptl_nid_t nid) } static inline int -kibnal_peer_active(kib_peer_t *peer) +kibnal_peer_active (kib_peer_t *peer) { /* Am I in the peer hash table? 
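Note the asymmetry in the refcounting helpers above: kibnal_conn_addref() is a bare increment, but kibnal_conn_decref() never destroys the connection inline. The last reference may be dropped somewhere awkward, for example from a completion path, so the conn is queued on kib_connd_zombies and connd does the actual teardown at its leisure. Typical usage pairs one reference with each outstanding work item:

    // sketch: one ref per outstanding work item on the connection
    kibnal_conn_addref(conn);       // e.g. taken when an rx is posted
    // ... work item runs to completion ...
    kibnal_conn_decref(conn);       // last ref parks the conn on
                                    // kib_connd_zombies for connd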
*/ return (!list_empty(&peer->ibp_list)); @@ -558,43 +614,23 @@ kibnal_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn) { /* CAVEAT EMPTOR: tx takes caller's ref on conn */ - LASSERT (tx->tx_nsp > 0); /* work items set up */ - LASSERT (tx->tx_conn == NULL); /* only set here */ - - tx->tx_conn = conn; + LASSERT (tx->tx_nwrq > 0); /* work items set up */ + if (tx->tx_conn == NULL) { + kibnal_conn_addref(conn); + tx->tx_conn = conn; + } else { + LASSERT (tx->tx_conn == conn); + LASSERT (tx->tx_msg->ibm_type == IBNAL_MSG_PUT_DONE); + } tx->tx_deadline = jiffies + kibnal_tunables.kib_io_timeout * HZ; list_add_tail(&tx->tx_list, &conn->ibc_tx_queue); } -static inline __u64* -kibnal_service_nid_field(ib_service_record_v2_t *sr) -{ - /* The service key mask must have byte 0 to 7 set. */ - return (__u64 *)sr->service_data8; -} - -static inline void -kibnal_set_service_keys(ib_service_record_v2_t *sr, ptl_nid_t nid) -{ - LASSERT (strlen(IBNAL_SERVICE_NAME) < sizeof(sr->service_name)); - - strcpy (sr->service_name, IBNAL_SERVICE_NAME); - - *kibnal_service_nid_field(sr) = cpu_to_le64(nid); -} - -#if CONFIG_X86 -/* TODO: use vv_va2adverize instead */ static inline __u64 kibnal_page2phys (struct page *p) { - __u64 page_number = p - mem_map; - - return (page_number << PAGE_SHIFT); + return page_to_phys(p); } -#else -# error "no page->phys" -#endif /* CAVEAT EMPTOR: We rely on tx/rx descriptor alignment to allow us to * use the lowest bit of the work request id as a flag to determine if @@ -622,199 +658,35 @@ kibnal_wreqid_is_rx (vv_wr_id_t wreqid) return (wreqid & 1) != 0; } -static inline int -kibnal_whole_mem(void) -{ -#if IBNAL_WHOLE_MEM - return true; -#else - return false; -#endif -} - -/* Voltaire stores GIDs in host order. */ -static inline void gid_swap(vv_gid_t *gid) -{ - u_int64_t s; - - s = gid->scope.g.subnet; - gid->scope.g.subnet = cpu_to_be64(gid->scope.g.eui64); - gid->scope.g.eui64 = cpu_to_be64(s); -} - -#if 0 -static void dump_qp(kib_conn_t *conn) +static inline void +kibnal_set_conn_state (kib_conn_t *conn, int state) { - vv_qp_attr_t *qp_attrs; - void *qp_context; - vv_return_t retval; - - CERROR("QP dumping %p\n", conn); - - retval = vv_qp_query(kibnal_data.kib_hca, conn->ibc_qp, &qp_context, &conn->ibc_qp_attrs); - if (retval) { - CERROR ("Couldn't query qp attributes: %d\n", retval); - return; - } - - qp_attrs = &conn->ibc_qp_attrs; - - CERROR("QP %x dump\n", qp_attrs->query.qp_num); - CERROR(" vv_qp_attr_mask = %llx\n", qp_attrs->query.vv_qp_attr_mask); - CERROR(" qp_state = %d\n", qp_attrs->query.qp_state); - CERROR(" cq_send_h = %p\n", qp_attrs->query.cq_send_h); - CERROR(" cq_receive_h = %p \n", qp_attrs->query.cq_receive_h); - CERROR(" send_max_outstand_wr = %d\n", qp_attrs->query.send_max_outstand_wr); - CERROR(" receive_max_outstand_wr = %d\n", qp_attrs->query.receive_max_outstand_wr); - CERROR(" max_scatgat_per_send_wr = %d\n", qp_attrs->query.max_scatgat_per_send_wr); - CERROR(" max_scatgat_per_receive_wr = %d\n", qp_attrs->query.max_scatgat_per_receive_wr); - CERROR(" send_psn = %x\n", qp_attrs->query.send_psn); - CERROR(" receve_psn = %x\n", qp_attrs->query.receve_psn); - CERROR(" access_control = %x\n", qp_attrs->query.access_control); - CERROR(" phy_port_num = %d\n", qp_attrs->query.phy_port_num); - CERROR(" primary_p_key_indx = %x\n", qp_attrs->query.primary_p_key_indx); - CERROR(" q_key = %x\n", qp_attrs->query.q_key); - CERROR(" destanation_qp = %x\n", qp_attrs->query.destanation_qp); - CERROR(" rdma_r_atom_outstand_num = %d\n", 
qp_attrs->query.rdma_r_atom_outstand_num); - CERROR(" responder_rdma_r_atom_num = %d\n", qp_attrs->query.responder_rdma_r_atom_num); - CERROR(" min_rnr_nak_timer = %d\n", qp_attrs->query.min_rnr_nak_timer); - CERROR(" pd_h = %lx\n", qp_attrs->query.pd_h); - CERROR(" recv_solicited_events = %d\n", qp_attrs->query.recv_solicited_events); - CERROR(" send_signaled_comp = %d\n", qp_attrs->query.send_signaled_comp); - CERROR(" flow_control = %d\n", qp_attrs->query.flow_control); + conn->ibc_state = state; + mb(); } -#else -#define dump_qp(a) -#endif -#if 0 -static void dump_wqe(vv_wr_t *wr) +static inline __u64 +kibnal_rf_addr (kib_rdma_frag_t *rf) { - CERROR("Dumping send WR %p\n", wr); - - CERROR(" wr_id = %llx\n", wr->wr_id); - CERROR(" completion_notification = %d\n", wr->completion_notification); - CERROR(" scatgat_list = %p\n", wr->scatgat_list); - CERROR(" num_of_data_segments = %d\n", wr->num_of_data_segments); - - if (wr->scatgat_list && wr->num_of_data_segments) { - CERROR(" scatgat_list[0].v_address = %p\n", wr->scatgat_list[0].v_address); - CERROR(" scatgat_list[0].length = %d\n", wr->scatgat_list[0].length); - CERROR(" scatgat_list[0].l_key = %x\n", wr->scatgat_list[0].l_key); - } - - CERROR(" wr_type = %d\n", wr->wr_type); - - switch(wr->wr_type) { - case vv_wr_send: - CERROR(" send\n"); - - CERROR(" fance_indicator = %d\n", wr->type.send.send_qp_type.rc_type.fance_indicator); - break; - - case vv_wr_receive: - break; - - case vv_wr_rdma_write: - case vv_wr_rdma_read: - CERROR(" rdma\n"); - CERROR(" fance_indicator = %d\n", wr->type.send.send_qp_type.rc_type.fance_indicator); - CERROR(" r_addr = %llx\n", wr->type.send.send_qp_type.rc_type.r_addr); - CERROR(" r_r_key = %x\n", wr->type.send.send_qp_type.rc_type.r_r_key); - break; - - default: - break; - } + return (((__u64)rf->rf_addr_hi)<<32) | ((__u64)rf->rf_addr_lo); } -#else -#define dump_wqe(a) -#endif - -#if 0 -static void dump_wc(vv_wc_t *wc) +static inline void +kibnal_rf_set (kib_rdma_frag_t *rf, __u64 addr, int nob) { - CERROR("Dumping WC\n"); - - CERROR(" wr_id = %llx\n", wc->wr_id); - CERROR(" operation_type = %d\n", wc->operation_type); - CERROR(" num_bytes_transfered = %lld\n", wc->num_bytes_transfered); - CERROR(" completion_status = %d\n", wc->completion_status); + rf->rf_addr_lo = addr & 0xffffffff; + rf->rf_addr_hi = (addr >> 32) & 0xffffffff; + rf->rf_nob = nob; } -#else -#define dump_wc(a) -#endif - -#if 0 -static void hexdump(char *string, void *ptr, int len) -{ - unsigned char *c = ptr; - int i; - - if (len < 0 || len > 2048) { - printk("XXX what the hell? 
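The pair of inlines above undoes the lo/hi split that keeps kib_rdma_frag_t free of 64-bit alignment padding in the packed wire format. The intended invariant is a simple round trip:

    // sketch: kibnal_rf_set()/kibnal_rf_addr() are inverses
    kib_rdma_frag_t frag;
    __u64           addr = 0x123456789abcdef0ULL;

    kibnal_rf_set(&frag, addr, PAGE_SIZE);
    LASSERT (kibnal_rf_addr(&frag) == addr);
    LASSERT (frag.rf_nob == PAGE_SIZE);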
%d\n",len); - return; - } - - printk("%d bytes of '%s' from 0x%p\n", len, string, ptr); - for (i = 0; i < len;) { - printk("%02x",*(c++)); - i++; - if (!(i & 15)) { - printk("\n"); - } else if (!(i&1)) { - printk(" "); - } - } - - if(len & 15) { - printk("\n"); - } +static inline int +kibnal_rd_size (kib_rdma_desc_t *rd) +{ + int i; + int size; + + for (i = size = 0; i < rd->rd_nfrag; i++) + size += rd->rd_frags[i].rf_nob; + + return size; } -#else -#define hexdump(a,b,c) -#endif - -/*--------------------------------------------------------------------------*/ - - -extern kib_peer_t *kibnal_create_peer (ptl_nid_t nid); -extern void kibnal_destroy_peer (kib_peer_t *peer); -extern int kibnal_del_peer (ptl_nid_t nid, int single_share); -extern kib_peer_t *kibnal_find_peer_locked (ptl_nid_t nid); -extern void kibnal_unlink_peer_locked (kib_peer_t *peer); -extern int kibnal_close_stale_conns_locked (kib_peer_t *peer, - __u64 incarnation); -extern kib_conn_t *kibnal_create_conn (void); -extern void kibnal_put_conn (kib_conn_t *conn); -extern void kibnal_destroy_conn (kib_conn_t *conn); -extern void kibnal_listen_callback(cm_cep_handle_t cep, cm_conn_data_t *info, void *arg); - -extern int kibnal_alloc_pages (kib_pages_t **pp, int npages, int access); -extern void kibnal_free_pages (kib_pages_t *p); - -extern void kibnal_check_sends (kib_conn_t *conn); -extern void kibnal_close_conn_locked (kib_conn_t *conn, int error); -extern void kibnal_destroy_conn (kib_conn_t *conn); -extern int kibnal_thread_start (int (*fn)(void *arg), void *arg); -extern int kibnal_scheduler(void *arg); -extern int kibnal_connd (void *arg); -extern void kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob); -extern void kibnal_close_conn (kib_conn_t *conn, int why); -extern void kibnal_start_active_rdma (int type, int status, - kib_rx_t *rx, lib_msg_t *libmsg, - unsigned int niov, - struct iovec *iov, ptl_kiov_t *kiov, - size_t offset, size_t nob); - -void kibnal_ca_async_callback(vv_event_record_t ev); -void kibnal_ca_callback (unsigned long context); -extern void vibnal_mad_received_cb(gsi_class_handle_t handle, void *context, gsi_dtgrm_t * dtgrm); -extern void vibnal_mad_sent_cb(gsi_class_handle_t handle, void *context, gsi_dtgrm_t * dtgrm); -extern int kibnal_advertize_op(ptl_nid_t nid, int op, sa_request_cb_t callback, void *context); -extern int vibnal_start_sa_request(struct sa_request *request); -extern struct sa_request *alloc_sa_request(void); -extern void free_sa_request(struct sa_request *request); -extern int kibnal_pathrecord_op(struct sa_request *request, vv_gid_t dgid, sa_request_cb_t callback, void *context); diff --git a/lnet/klnds/viblnd/viblnd_cb.c b/lnet/klnds/viblnd/viblnd_cb.c index e21d62f..ee860f0 100644 --- a/lnet/klnds/viblnd/viblnd_cb.c +++ b/lnet/klnds/viblnd/viblnd_cb.c @@ -24,36 +24,17 @@ #include "vibnal.h" -static void kibnal_cm_callback(cm_cep_handle_t cep, cm_conn_data_t *info, void *arg); - -/* - * LIB functions follow - * - */ -static void -kibnal_schedule_tx_done (kib_tx_t *tx) -{ - unsigned long flags; - - spin_lock_irqsave (&kibnal_data.kib_sched_lock, flags); - - list_add_tail(&tx->tx_list, &kibnal_data.kib_sched_txq); - wake_up (&kibnal_data.kib_sched_waitq); - - spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); -} - -static void +void kibnal_tx_done (kib_tx_t *tx) { ptl_err_t ptlrc = (tx->tx_status == 0) ? 
PTL_OK : PTL_FAIL; - unsigned long flags; int i; - vv_return_t retval; + LASSERT (!in_interrupt()); LASSERT (tx->tx_sending == 0); /* mustn't be awaiting callback */ - LASSERT (!tx->tx_passive_rdma_wait); /* mustn't be awaiting RDMA */ + LASSERT (!tx->tx_waiting); /* mustn't be awaiting peer response */ +#if !IBNAL_WHOLE_MEM switch (tx->tx_mapped) { default: LBUG(); @@ -61,35 +42,17 @@ kibnal_tx_done (kib_tx_t *tx) case KIB_TX_UNMAPPED: break; - case KIB_TX_MAPPED: - if (in_interrupt()) { - /* can't deregister memory in IRQ context... */ - kibnal_schedule_tx_done(tx); - return; - } - retval = vv_mem_region_destroy(kibnal_data.kib_hca, tx->tx_md.md_handle); - LASSERT (retval == vv_return_ok); - tx->tx_mapped = KIB_TX_UNMAPPED; - break; - -#if IBNAL_FMR - case KIB_TX_MAPPED_FMR: - if (in_interrupt() && tx->tx_status != 0) { - /* can't flush FMRs in IRQ context... */ - kibnal_schedule_tx_done(tx); - return; - } - - rc = ib_fmr_deregister(tx->tx_md.md_handle.fmr); - LASSERT (rc == 0); + case KIB_TX_MAPPED: { + vv_return_t vvrc; - if (tx->tx_status != 0) - ib_fmr_pool_force_flush(kibnal_data.kib_fmr_pool); + vvrc = vv_mem_region_destroy(kibnal_data.kib_hca, + tx->tx_md.md_handle); + LASSERT (vvrc == vv_return_ok); tx->tx_mapped = KIB_TX_UNMAPPED; break; -#endif } - + } +#endif for (i = 0; i < 2; i++) { /* tx may have up to 2 libmsgs to finalise */ if (tx->tx_libmsg[i] == NULL) @@ -100,15 +63,14 @@ kibnal_tx_done (kib_tx_t *tx) } if (tx->tx_conn != NULL) { - kibnal_put_conn (tx->tx_conn); + kibnal_conn_decref(tx->tx_conn); tx->tx_conn = NULL; } - tx->tx_nsp = 0; - tx->tx_passive_rdma = 0; + tx->tx_nwrq = 0; tx->tx_status = 0; - spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags); + spin_lock(&kibnal_data.kib_tx_lock); if (tx->tx_isnblk) { list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs); @@ -117,18 +79,17 @@ kibnal_tx_done (kib_tx_t *tx) wake_up (&kibnal_data.kib_idle_tx_waitq); } - spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); + spin_unlock(&kibnal_data.kib_tx_lock); } -static kib_tx_t * +kib_tx_t * kibnal_get_idle_tx (int may_block) { - unsigned long flags; kib_tx_t *tx = NULL; ENTRY; for (;;) { - spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags); + spin_lock(&kibnal_data.kib_tx_lock); /* "normal" descriptor is free */ if (!list_empty (&kibnal_data.kib_idle_txs)) { @@ -150,7 +111,7 @@ kibnal_get_idle_tx (int may_block) } /* block for idle tx */ - spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); + spin_unlock(&kibnal_data.kib_tx_lock); wait_event (kibnal_data.kib_idle_tx_waitq, !list_empty (&kibnal_data.kib_idle_txs) || @@ -160,410 +121,437 @@ kibnal_get_idle_tx (int may_block) if (tx != NULL) { list_del (&tx->tx_list); - /* Allocate a new passive RDMA completion cookie. It might - * not be needed, but we've got a lock right now and we're - * unlikely to wrap... */ - tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++; - + /* Allocate a new completion cookie. It might not be needed, + * but we've got a lock right now and we're unlikely to + * wrap... 
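Each tx therefore owns a unique 64-bit cookie for as long as it is in flight. The peer echoes that cookie in its PUT_ACK/PUT_NAK/PUT_DONE/GET_DONE messages, and kibnal_find_waiting_tx_locked() further down walks ibc_active_txs to match the echo back to the waiting tx:

    // sketch: matching an echoed completion cookie to its tx
    tx = kibnal_find_waiting_tx_locked(conn, IBNAL_MSG_GET_REQ, cookie);
    if (tx == NULL) {
            // no matching waiter: stale or bogus, so close the connection
    }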
*/ + tx->tx_cookie = kibnal_data.kib_next_tx_cookie++; +#if IBNAL_WHOLE_MEM LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); - LASSERT (tx->tx_nsp == 0); +#endif + LASSERT (tx->tx_nwrq == 0); LASSERT (tx->tx_sending == 0); + LASSERT (!tx->tx_waiting); LASSERT (tx->tx_status == 0); LASSERT (tx->tx_conn == NULL); - LASSERT (!tx->tx_passive_rdma); - LASSERT (!tx->tx_passive_rdma_wait); LASSERT (tx->tx_libmsg[0] == NULL); LASSERT (tx->tx_libmsg[1] == NULL); } - spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); + spin_unlock(&kibnal_data.kib_tx_lock); RETURN(tx); } -static int -kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist) -{ - /* I would guess that if kibnal_get_peer (nid) == NULL, - and we're not routing, then 'nid' is very distant :) */ - if ( nal->libnal_ni.ni_pid.nid == nid ) { - *dist = 0; - } else { - *dist = 1; - } - - return 0; -} - -static void -kibnal_complete_passive_rdma(kib_conn_t *conn, __u64 cookie, int status) -{ - struct list_head *ttmp; - unsigned long flags; - int idle; - - spin_lock_irqsave (&conn->ibc_lock, flags); - - list_for_each (ttmp, &conn->ibc_active_txs) { - kib_tx_t *tx = list_entry(ttmp, kib_tx_t, tx_list); - - LASSERT (tx->tx_passive_rdma || - !tx->tx_passive_rdma_wait); - - LASSERT (tx->tx_passive_rdma_wait || - tx->tx_sending != 0); - - if (!tx->tx_passive_rdma_wait || - tx->tx_passive_rdma_cookie != cookie) - continue; - - CDEBUG(D_NET, "Complete %p "LPD64": %d\n", tx, cookie, status); - - tx->tx_status = status; - tx->tx_passive_rdma_wait = 0; - idle = (tx->tx_sending == 0); - - if (idle) - list_del (&tx->tx_list); - - spin_unlock_irqrestore (&conn->ibc_lock, flags); - - /* I could be racing with tx callbacks. It's whoever - * _makes_ tx idle that frees it */ - if (idle) - kibnal_tx_done (tx); - return; - } - - spin_unlock_irqrestore (&conn->ibc_lock, flags); - - CERROR ("Unmatched (late?) 
RDMA completion "LPX64" from "LPX64"\n", - cookie, conn->ibc_peer->ibp_nid); -} - -static void -kibnal_post_rx (kib_rx_t *rx, int do_credits) +int +kibnal_post_rx (kib_rx_t *rx, int credit) { kib_conn_t *conn = rx->rx_conn; int rc = 0; - unsigned long flags; - vv_return_t retval; + vv_return_t vvrc; - ENTRY; + LASSERT (!in_interrupt()); rx->rx_gl = (vv_scatgat_t) { - .v_address = (void *)rx->rx_msg, + .v_address = (void *)((unsigned long)KIBNAL_RX_VADDR(rx)), + .l_key = KIBNAL_RX_LKEY(rx), .length = IBNAL_MSG_SIZE, - .l_key = rx->l_key, }; rx->rx_wrq = (vv_wr_t) { - .wr_id = kibnal_ptr2wreqid(rx, 1), + .wr_id = (unsigned long)rx, .completion_notification = 1, .scatgat_list = &rx->rx_gl, .num_of_data_segments = 1, .wr_type = vv_wr_receive, }; - KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_ESTABLISHED, - IBNAL_CONN_DREP); + LASSERT (conn->ibc_state >= IBNAL_CONN_INIT); LASSERT (!rx->rx_posted); - rx->rx_posted = 1; - mb(); - if (conn->ibc_state != IBNAL_CONN_ESTABLISHED) - rc = -ECONNABORTED; - else { - retval = vv_post_receive(kibnal_data.kib_hca, conn->ibc_qp, &rx->rx_wrq); + CDEBUG(D_NET, "posting rx [%d %x %p]\n", + rx->rx_wrq.scatgat_list->length, + rx->rx_wrq.scatgat_list->l_key, + rx->rx_wrq.scatgat_list->v_address); - if (retval) { - CDEBUG(D_NET, "post failed %d\n", retval); - rc = -EINVAL; - } - CDEBUG(D_NET, "posted rx %p\n", &rx->rx_wrq); + if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) { + /* No more posts for this rx; so lose its ref */ + kibnal_conn_decref(conn); + return 0; } + + rx->rx_posted = 1; + + spin_lock(&conn->ibc_lock); + /* Serialise vv_post_receive; it's not re-entrant on the same QP */ + vvrc = vv_post_receive(kibnal_data.kib_hca, + conn->ibc_qp, &rx->rx_wrq); + spin_unlock(&conn->ibc_lock); - if (rc == 0) { - if (do_credits) { - spin_lock_irqsave(&conn->ibc_lock, flags); + if (vvrc == 0) { + if (credit) { + spin_lock(&conn->ibc_lock); conn->ibc_outstanding_credits++; - spin_unlock_irqrestore(&conn->ibc_lock, flags); + spin_unlock(&conn->ibc_lock); kibnal_check_sends(conn); } - EXIT; - return; + return 0; } + + CERROR ("post rx -> "LPX64" failed %d\n", + conn->ibc_peer->ibp_nid, vvrc); + rc = -EIO; + kibnal_close_conn(rx->rx_conn, rc); + /* No more posts for this rx; so lose its ref */ + kibnal_conn_decref(conn); + return rc; +} - if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { - CERROR ("Error posting receive -> "LPX64": %d\n", - conn->ibc_peer->ibp_nid, rc); - kibnal_close_conn (rx->rx_conn, rc); - } else { - CDEBUG (D_NET, "Error posting receive -> "LPX64": %d\n", - conn->ibc_peer->ibp_nid, rc); +int +kibnal_post_receives (kib_conn_t *conn) +{ + int i; + int rc; + + LASSERT (conn->ibc_state < IBNAL_CONN_ESTABLISHED); + LASSERT (conn->ibc_comms_error == 0); + + for (i = 0; i < IBNAL_RX_MSGS; i++) { + /* +1 ref for rx desc. This ref remains until kibnal_post_rx + * fails (i.e. 
actual failure or we're disconnecting) */ + kibnal_conn_addref(conn); + rc = kibnal_post_rx (&conn->ibc_rxs[i], 0); + if (rc != 0) + return rc; } - /* Drop rx's ref */ - kibnal_put_conn (conn); - EXIT; + return 0; } -#if IBNAL_CKSUM -static inline __u32 kibnal_cksum (void *ptr, int nob) +kib_tx_t * +kibnal_find_waiting_tx_locked(kib_conn_t *conn, int txtype, __u64 cookie) { - char *c = ptr; - __u32 sum = 0; - - while (nob-- > 0) - sum = ((sum << 1) | (sum >> 31)) + *c++; + struct list_head *tmp; - return (sum); -} -#endif + list_for_each(tmp, &conn->ibc_active_txs) { + kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list); + + LASSERT (tx->tx_sending != 0 || tx->tx_waiting); -static void -kibnal_rx_callback (vv_wc_t *wc) -{ - kib_rx_t *rx = (kib_rx_t *)kibnal_wreqid2ptr(wc->wr_id); - kib_msg_t *msg = rx->rx_msg; - kib_conn_t *conn = rx->rx_conn; - int nob = wc->num_bytes_transfered; - const int base_nob = offsetof(kib_msg_t, ibm_u); - int credits; - int flipped; - unsigned long flags; - __u32 i; -#if IBNAL_CKSUM - __u32 msg_cksum; - __u32 computed_cksum; -#endif + if (tx->tx_cookie != cookie) + continue; - /* we set the QP to erroring after we've finished disconnecting, - * maybe we should do so sooner. */ - KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_ESTABLISHED, - IBNAL_CONN_DISCONNECTED); + if (tx->tx_waiting && + tx->tx_msg->ibm_type == txtype) + return tx; - CDEBUG(D_NET, "rx %p conn %p, nob=%d\n", rx, conn, nob); + CWARN("Bad completion: %swaiting, type %x (wanted %x)\n", + tx->tx_waiting ? "" : "NOT ", + tx->tx_msg->ibm_type, txtype); + } + return NULL; +} - LASSERT (rx->rx_posted); - rx->rx_posted = 0; - mb(); +void +kibnal_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie) +{ + kib_tx_t *tx; + int idle; - /* receives complete with error in any case after we've started - * disconnecting */ - if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) - goto failed; + spin_lock(&conn->ibc_lock); - if (wc->completion_status != vv_comp_status_success) { - CERROR("Rx from "LPX64" failed: %d\n", - conn->ibc_peer->ibp_nid, wc->completion_status); - goto failed; - } + tx = kibnal_find_waiting_tx_locked(conn, txtype, cookie); + if (tx == NULL) { + spin_unlock(&conn->ibc_lock); - if (nob < base_nob) { - CERROR ("Short rx from "LPX64": %d < expected %d\n", - conn->ibc_peer->ibp_nid, nob, base_nob); - goto failed; + CWARN("Unmatched completion type %x cookie "LPX64 + " from "LPX64"\n", + txtype, cookie, conn->ibc_peer->ibp_nid); + kibnal_close_conn (conn, -EPROTO); + return; } - /* Receiver does any byte flipping if necessary... */ + if (tx->tx_status == 0) { /* success so far */ + if (status < 0) { /* failed? 
*/ + tx->tx_status = status; + } else if (txtype == IBNAL_MSG_GET_REQ) { + /* XXX layering violation: set REPLY data length */ + LASSERT (tx->tx_libmsg[1] != NULL); + LASSERT (tx->tx_libmsg[1]->ev.type == + PTL_EVENT_REPLY_END); - if (msg->ibm_magic == IBNAL_MSG_MAGIC) { - flipped = 0; - } else { - if (msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) { - CERROR ("Unrecognised magic: %08x from "LPX64"\n", - msg->ibm_magic, conn->ibc_peer->ibp_nid); - goto failed; + tx->tx_libmsg[1]->ev.mlength = status; } - flipped = 1; - __swab16s (&msg->ibm_version); - LASSERT (sizeof(msg->ibm_type) == 1); - LASSERT (sizeof(msg->ibm_credits) == 1); } + + tx->tx_waiting = 0; - if (msg->ibm_version != IBNAL_MSG_VERSION) { - CERROR ("Incompatible msg version %d (%d expected)\n", - msg->ibm_version, IBNAL_MSG_VERSION); - goto failed; - } + idle = tx->tx_sending == 0; + if (idle) + list_del(&tx->tx_list); -#if IBNAL_CKSUM - if (nob != msg->ibm_nob) { - CERROR ("Unexpected # bytes %d (%d expected)\n", nob, msg->ibm_nob); - goto failed; - } + spin_unlock(&conn->ibc_lock); + + if (idle) + kibnal_tx_done(tx); +} - msg_cksum = le32_to_cpu(msg->ibm_cksum); - msg->ibm_cksum = 0; - computed_cksum = kibnal_cksum (msg, nob); +void +kibnal_send_completion (kib_conn_t *conn, int type, int status, __u64 cookie) +{ + kib_tx_t *tx = kibnal_get_idle_tx(0); - if (msg_cksum != computed_cksum) { - CERROR ("Checksum failure %d: (%d expected)\n", - computed_cksum, msg_cksum); -// goto failed; + if (tx == NULL) { + CERROR("Can't get tx for completion %x for "LPX64"\n", + type, conn->ibc_peer->ibp_nid); + return; } - CDEBUG(D_NET, "cksum %x, nob %d\n", computed_cksum, nob); -#endif + + tx->tx_msg->ibm_u.completion.ibcm_status = status; + tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie; + kibnal_init_tx_msg(tx, type, sizeof(kib_completion_msg_t)); + + kibnal_queue_tx(tx, conn); +} + +void +kibnal_handle_rx (kib_rx_t *rx) +{ + kib_msg_t *msg = rx->rx_msg; + kib_conn_t *conn = rx->rx_conn; + int credits = msg->ibm_credits; + kib_tx_t *tx; + int rc; + + LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED); - /* Have I received credits that will let me send? */ - credits = msg->ibm_credits; + CDEBUG (D_NET, "Received %x[%d] from "LPX64"\n", + msg->ibm_type, credits, conn->ibc_peer->ibp_nid); + if (credits != 0) { - spin_lock_irqsave(&conn->ibc_lock, flags); + /* Have I received credits that will let me send? 
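Credits are the connection's flow control: a sender spends one credit per message it posts (each corresponds to a pre-posted receive buffer at the peer, effectively IBNAL_MSG_QUEUE_SIZE of them to start with), and earns them back via ibm_credits piggybacked on reverse traffic. A buffer only becomes creditable once it has actually been reposted, which is why kibnal_post_rx(rx, 1) bumps ibc_outstanding_credits rather than ibc_credits. When there is no reverse traffic to piggyback on, IBNAL_CREDIT_HIGHWATER presumably triggers an explicit NOOP, something like:

    // sketch: eager credit return once the high-water mark is reached
    if (conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) {
            kib_tx_t *noop = kibnal_get_idle_tx(0);         // don't block

            if (noop != NULL) {
                    kibnal_init_tx_msg(noop, IBNAL_MSG_NOOP, 0);
                    kibnal_queue_tx(noop, conn);            // carries credits
            }
    }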
*/ + spin_lock(&conn->ibc_lock); conn->ibc_credits += credits; - spin_unlock_irqrestore(&conn->ibc_lock, flags); - + spin_unlock(&conn->ibc_lock); + kibnal_check_sends(conn); } switch (msg->ibm_type) { + default: + CERROR("Bad IBNAL message type %x from "LPX64"\n", + msg->ibm_type, conn->ibc_peer->ibp_nid); + break; + case IBNAL_MSG_NOOP: - kibnal_post_rx (rx, 1); - return; + break; case IBNAL_MSG_IMMEDIATE: - if (nob < base_nob + sizeof (kib_immediate_msg_t)) { - CERROR ("Short IMMEDIATE from "LPX64": %d\n", - conn->ibc_peer->ibp_nid, nob); - goto failed; - } + lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx); break; - case IBNAL_MSG_PUT_RDMA: - case IBNAL_MSG_GET_RDMA: - if (nob < base_nob + sizeof (kib_rdma_msg_t)) { - CERROR ("Short RDMA msg from "LPX64": %d\n", - conn->ibc_peer->ibp_nid, nob); - goto failed; - } - if (flipped) - __swab32(msg->ibm_u.rdma.ibrm_num_descs); - - CDEBUG(D_NET, "%d RDMA: cookie "LPX64":\n", - msg->ibm_type, msg->ibm_u.rdma.ibrm_cookie); - - if ((msg->ibm_u.rdma.ibrm_num_descs > PTL_MD_MAX_IOV) || - (kib_rdma_msg_len(msg->ibm_u.rdma.ibrm_num_descs) > - min(nob, IBNAL_MSG_SIZE))) { - CERROR ("num_descs %d too large\n", - msg->ibm_u.rdma.ibrm_num_descs); - goto failed; - } + case IBNAL_MSG_PUT_REQ: + rx->rx_responded = 0; + lib_parse(&kibnal_lib, &msg->ibm_u.putreq.ibprm_hdr, rx); + if (rx->rx_responded) + break; - if (flipped) { - __swab32(msg->ibm_u.rdma.rd_key); - } + /* I wasn't asked to transfer any payload data. This happens + * if the PUT didn't match, or got truncated. */ + kibnal_send_completion(rx->rx_conn, IBNAL_MSG_PUT_NAK, 0, + msg->ibm_u.putreq.ibprm_cookie); + break; - for(i = 0; i < msg->ibm_u.rdma.ibrm_num_descs; i++) { - kib_rdma_desc_t *desc = &msg->ibm_u.rdma.ibrm_desc[i]; + case IBNAL_MSG_PUT_NAK: + CWARN ("PUT_NACK from "LPX64"\n", conn->ibc_peer->ibp_nid); + kibnal_handle_completion(conn, IBNAL_MSG_PUT_REQ, + msg->ibm_u.completion.ibcm_status, + msg->ibm_u.completion.ibcm_cookie); + break; - if (flipped) { - __swab32(desc->rd_nob); - __swab64(desc->rd_addr); - } + case IBNAL_MSG_PUT_ACK: + spin_lock(&conn->ibc_lock); + tx = kibnal_find_waiting_tx_locked(conn, IBNAL_MSG_PUT_REQ, + msg->ibm_u.putack.ibpam_src_cookie); + if (tx != NULL) + list_del(&tx->tx_list); + spin_unlock(&conn->ibc_lock); - CDEBUG(D_NET, " key %x, " "addr "LPX64", nob %u\n", - msg->ibm_u.rdma.rd_key, desc->rd_addr, desc->rd_nob); + if (tx == NULL) { + CERROR("Unmatched PUT_ACK from "LPX64"\n", + conn->ibc_peer->ibp_nid); + kibnal_close_conn(conn, -EPROTO); + break; } + + LASSERT (tx->tx_waiting); + /* CAVEAT EMPTOR: I could be racing with tx_complete, but... + * (a) I can overwrite tx_msg since my peer has received it! + * (b) while tx_waiting is set, tx_complete() won't touch it. 
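Stepping back, this case is the middle leg of the three-message PUT exchange that replaces the old single passive-RDMA descriptor scheme; the directions match the comments on the IBNAL_MSG_* values in viblnd.h:

    //   src                                    sink
    //    | ---- PUT_REQ  (portals hdr) ------> |  lib_parse() looks for a match
    //    | <--- PUT_ACK  (rd + cookies) ------ |  sink advertises its buffer
    //    | ==== RDMA write of the payload ===> |
    //    | ---- PUT_DONE (status) -----------> |  sink finalises its lib msg
    //   (or the sink answers PUT_REQ with PUT_NAK when nothing matched)

GET is the same idea minus one message: GET_REQ already carries the sink's rdma descriptor, so the source can RDMA the reply data directly and follow up with GET_DONE.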
+ */ + + tx->tx_nwrq = 0; /* overwrite PUT_REQ */ + + rc = kibnal_init_rdma(tx, IBNAL_MSG_PUT_DONE, + kibnal_rd_size(&msg->ibm_u.putack.ibpam_rd), + &msg->ibm_u.putack.ibpam_rd, + msg->ibm_u.putack.ibpam_dst_cookie); + if (rc < 0) + CERROR("Can't setup rdma for PUT to "LPX64": %d\n", + conn->ibc_peer->ibp_nid, rc); + + spin_lock(&conn->ibc_lock); + if (tx->tx_status == 0 && rc < 0) + tx->tx_status = rc; + tx->tx_waiting = 0; /* clear waiting and queue atomically */ + kibnal_queue_tx_locked(tx, conn); + spin_unlock(&conn->ibc_lock); break; - + case IBNAL_MSG_PUT_DONE: + kibnal_handle_completion(conn, IBNAL_MSG_PUT_ACK, + msg->ibm_u.completion.ibcm_status, + msg->ibm_u.completion.ibcm_cookie); + break; + + case IBNAL_MSG_GET_REQ: + rx->rx_responded = 0; + lib_parse(&kibnal_lib, &msg->ibm_u.get.ibgm_hdr, rx); + if (rx->rx_responded) /* I responded to the GET_REQ */ + break; + /* NB GET didn't match (I'd have responded even with no payload + * data) */ + kibnal_send_completion(rx->rx_conn, IBNAL_MSG_GET_DONE, -ENODATA, + msg->ibm_u.get.ibgm_cookie); + break; + case IBNAL_MSG_GET_DONE: - if (nob < base_nob + sizeof (kib_completion_msg_t)) { - CERROR ("Short COMPLETION msg from "LPX64": %d\n", - conn->ibc_peer->ibp_nid, nob); - goto failed; - } - if (flipped) - __swab32s(&msg->ibm_u.completion.ibcm_status); - - CDEBUG(D_NET, "%d DONE: cookie "LPX64", status %d\n", - msg->ibm_type, msg->ibm_u.completion.ibcm_cookie, - msg->ibm_u.completion.ibcm_status); - - kibnal_complete_passive_rdma (conn, - msg->ibm_u.completion.ibcm_cookie, - msg->ibm_u.completion.ibcm_status); - kibnal_post_rx (rx, 1); - return; - - default: - CERROR ("Can't parse type from "LPX64": %d\n", - conn->ibc_peer->ibp_nid, msg->ibm_type); + kibnal_handle_completion(conn, IBNAL_MSG_GET_REQ, + msg->ibm_u.completion.ibcm_status, + msg->ibm_u.completion.ibcm_cookie); + break; + } + + kibnal_post_rx(rx, 1); +} + +void +kibnal_rx_complete (kib_rx_t *rx, int nob, vv_comp_status_t vvrc) +{ + kib_msg_t *msg = rx->rx_msg; + kib_conn_t *conn = rx->rx_conn; + unsigned long flags; + int rc; + + CDEBUG (D_NET, "rx %p conn %p\n", rx, conn); + LASSERT (rx->rx_posted); + rx->rx_posted = 0; + + if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) + goto ignore; + + if (vvrc != vv_comp_status_success) { + CERROR("Rx from "LPX64" failed: %d\n", + conn->ibc_peer->ibp_nid, vvrc); goto failed; } - /* schedule for kibnal_rx() in thread context */ - spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); - - list_add_tail (&rx->rx_list, &kibnal_data.kib_sched_rxq); - wake_up (&kibnal_data.kib_sched_waitq); - - spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); + rc = kibnal_unpack_msg(msg, nob); + if (rc != 0) { + CERROR ("Error %d unpacking rx from "LPX64"\n", + rc, conn->ibc_peer->ibp_nid); + goto failed; + } + + if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid || + msg->ibm_srcstamp != conn->ibc_incarnation || + msg->ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid || + msg->ibm_dststamp != kibnal_data.kib_incarnation) { + CERROR ("Stale rx from "LPX64"\n", + conn->ibc_peer->ibp_nid); + goto failed; + } + + /* racing with connection establishment/teardown! 
*/ + if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) { + write_lock_irqsave(&kibnal_data.kib_global_lock, flags); + /* must check holding global lock to eliminate race */ + if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) { + list_add_tail(&rx->rx_list, &conn->ibc_early_rxs); + write_unlock_irqrestore(&kibnal_data.kib_global_lock, + flags); + return; + } + write_unlock_irqrestore(&kibnal_data.kib_global_lock, + flags); + } + kibnal_handle_rx(rx); return; failed: CDEBUG(D_NET, "rx %p conn %p\n", rx, conn); - kibnal_close_conn(conn, -ECONNABORTED); - + kibnal_close_conn(conn, -EIO); + ignore: /* Don't re-post rx & drop its ref on conn */ - kibnal_put_conn(conn); + kibnal_conn_decref(conn); } -static void -kibnal_rx (kib_rx_t *rx) +#if IBNAL_WHOLE_MEM +int +kibnal_append_rdfrag(kib_rdma_desc_t *rd, int active, struct page *page, + unsigned long page_offset, unsigned long len) { - kib_msg_t *msg = rx->rx_msg; + kib_rdma_frag_t *frag = &rd->rd_frags[rd->rd_nfrag]; + vv_l_key_t l_key; + vv_r_key_t r_key; + void *addr; + void *vaddr; + vv_mem_reg_h_t mem_h; + vv_return_t vvrc; - /* Clear flag so I can detect if I've sent an RDMA completion */ - rx->rx_rdma = 0; + if (rd->rd_nfrag >= IBNAL_MAX_RDMA_FRAGS) { + CERROR ("Too many RDMA fragments\n"); + return -EMSGSIZE; + } - switch (msg->ibm_type) { - case IBNAL_MSG_GET_RDMA: - lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx); - /* If the incoming get was matched, I'll have initiated the - * RDMA and the completion message... */ - if (rx->rx_rdma) - break; + addr = (void *)(((unsigned long)kmap(page)) + page_offset); - /* Otherwise, I'll send a failed completion now to prevent - * the peer's GET blocking for the full timeout. */ - CERROR ("Completing unmatched RDMA GET from "LPX64"\n", - rx->rx_conn->ibc_peer->ibp_nid); - kibnal_start_active_rdma (IBNAL_MSG_GET_DONE, -EIO, - rx, NULL, 0, NULL, NULL, 0, 0); - break; - - case IBNAL_MSG_PUT_RDMA: - lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx); - if (rx->rx_rdma) - break; - /* This is most unusual, since even if lib_parse() didn't - * match anything, it should have asked us to read (and - * discard) the payload. The portals header must be - * inconsistent with this message type, so it's the - * sender's fault for sending garbage and she can time - * herself out... 
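One subtlety in kibnal_rx_complete() above: receives can complete while the connection is still short of ESTABLISHED, since the receive buffers must be posted before the CM handshake finishes. Such early arrivals are parked on ibc_early_rxs under the global lock, and once the connection is established they are presumably replayed by connd, along these lines (locking elided):

    // sketch: replay receives that arrived before ESTABLISHED
    while (!list_empty(&conn->ibc_early_rxs)) {
            kib_rx_t *rx = list_entry(conn->ibc_early_rxs.next,
                                      kib_rx_t, rx_list);

            list_del(&rx->rx_list);
            kibnal_handle_rx(rx);
    }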
*/ - CERROR ("Uncompleted RMDA PUT from "LPX64"\n", - rx->rx_conn->ibc_peer->ibp_nid); - break; + vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca, addr, + len, &mem_h, &l_key, &r_key); + LASSERT (vvrc == vv_return_ok); - case IBNAL_MSG_IMMEDIATE: - lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx); - LASSERT (!rx->rx_rdma); - break; - - default: - LBUG(); - break; + kunmap(page); + + if (active) { + if (rd->rd_nfrag == 0) { + rd->rd_key = l_key; + } else if (l_key != rd->rd_key) { + CERROR ("> 1 key for single RDMA desc\n"); + return -EINVAL; + } + vaddr = addr; + } else { + if (rd->rd_nfrag == 0) { + rd->rd_key = r_key; + } else if (r_key != rd->rd_key) { + CERROR ("> 1 key for single RDMA desc\n"); + return -EINVAL; + } + vv_va2advertise_addr(kibnal_data.kib_hca, addr, &vaddr); } - kibnal_post_rx (rx, 1); + kibnal_rf_set(frag, (unsigned long)vaddr, len); + + CDEBUG(D_NET,"map frag [%d][%d %x %08x%08x] %p\n", + rd->rd_nfrag, frag->rf_nob, rd->rd_key, + frag->rf_addr_hi, frag->rf_addr_lo, addr); + + rd->rd_nfrag++; + return 0; } -static struct page * +struct page * kibnal_kvaddr_to_page (unsigned long vaddr) { struct page *page; @@ -580,93 +568,26 @@ kibnal_kvaddr_to_page (unsigned long vaddr) else page = virt_to_page (vaddr); - if (!VALID_PAGE (page)) - page = NULL; - - return page; -} - -static void -kibnal_fill_ibrm(kib_tx_t *tx, struct page *page, unsigned long page_offset, - unsigned long len, int active) -{ - kib_rdma_msg_t *ibrm = &tx->tx_msg->ibm_u.rdma; - kib_rdma_desc_t *desc; - vv_l_key_t l_key; - vv_r_key_t r_key; - void *addr; - vv_mem_reg_h_t mem_h; - vv_return_t retval; - - LASSERTF(ibrm->ibrm_num_descs < PTL_MD_MAX_IOV, "%u\n", - ibrm->ibrm_num_descs); - - desc = &ibrm->ibrm_desc[ibrm->ibrm_num_descs]; - - addr = page_address(page) + page_offset; - - /* TODO: This next step is only needed to get either the lkey - * or the rkey. However they should be the same than for the - * tx buffer, so we might as well use it. */ - retval = vv_get_gen_mr_attrib(kibnal_data.kib_hca, - addr, - len, - &mem_h, - &l_key, - &r_key); - if (retval) { - CERROR("vv_get_gen_mr_attrib failed: %d", retval); - /* TODO: this shouldn't really fail, but what if? */ - return; - } - - if (active) { - ibrm->rd_key = l_key; - } else { - ibrm->rd_key = r_key; - - vv_va2advertise_addr(kibnal_data.kib_hca, addr, &addr); - } - - desc->rd_addr = (__u64)(unsigned long)addr; - desc->rd_nob = len; /*PAGE_SIZE - kiov->kiov_offset; */ - - ibrm->ibrm_num_descs++; + return VALID_PAGE(page) ? 
page : NULL; } -static int -kibnal_map_rdma_iov(kib_tx_t *tx, unsigned long vaddr, int nob, int active) -{ - struct page *page; - int page_offset, len; - - while (nob > 0) { - page = kibnal_kvaddr_to_page(vaddr); - if (page == NULL) - return -EFAULT; - - page_offset = vaddr & (PAGE_SIZE - 1); - len = min(nob, (int)PAGE_SIZE - page_offset); - - kibnal_fill_ibrm(tx, page, page_offset, len, active); - nob -= len; - vaddr += len; - } - - return 0; -} - -static int -kibnal_map_iov (kib_tx_t *tx, vv_access_con_bit_mask_t access, - int niov, struct iovec *iov, int offset, int nob, int active) +int +kibnal_setup_rd_iov(kib_tx_t *tx, kib_rdma_desc_t *rd, + vv_access_con_bit_mask_t access, + int niov, struct iovec *iov, int offset, int nob) { - void *vaddr; - vv_return_t retval; + /* active if I'm sending */ + int active = ((access & vv_acc_r_mem_write) == 0); + int fragnob; + int rc; + unsigned long vaddr; + struct page *page; + int page_offset; LASSERT (nob > 0); LASSERT (niov > 0); - LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); + LASSERT ((rd != tx->tx_rd) == !active); while (offset >= iov->iov_len) { offset -= iov->iov_len; @@ -675,60 +596,154 @@ kibnal_map_iov (kib_tx_t *tx, vv_access_con_bit_mask_t access, LASSERT (niov > 0); } - if (nob > iov->iov_len - offset) { - CERROR ("Can't map multiple vaddr fragments\n"); - return (-EMSGSIZE); - } + rd->rd_nfrag = 0; + do { + LASSERT (niov > 0); - /* our large contiguous iov could be backed by multiple physical - * pages. */ - if (kibnal_whole_mem()) { - int rc; - tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 0; - rc = kibnal_map_rdma_iov(tx, (unsigned long)iov->iov_base + - offset, nob, active); - if (rc != 0) { - CERROR ("Can't map iov: %d\n", rc); + vaddr = ((unsigned long)iov->iov_base) + offset; + page_offset = vaddr & (PAGE_SIZE - 1); + page = kibnal_kvaddr_to_page(vaddr); + if (page == NULL) { + CERROR ("Can't find page\n"); + return -EFAULT; + } + + fragnob = min((int)(iov->iov_len - offset), nob); + fragnob = min(fragnob, (int)PAGE_SIZE - page_offset); + + rc = kibnal_append_rdfrag(rd, active, page, + page_offset, fragnob); + if (rc != 0) return rc; + + if (offset + fragnob < iov->iov_len) { + offset += fragnob; + } else { + offset = 0; + iov++; + niov--; } - return 0; + nob -= fragnob; + } while (nob > 0); + + return 0; +} + +int +kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd, + vv_access_con_bit_mask_t access, + int nkiov, ptl_kiov_t *kiov, int offset, int nob) +{ + /* active if I'm sending */ + int active = ((access & vv_acc_r_mem_write) == 0); + int fragnob; + int rc; + + CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob); + + LASSERT (nob > 0); + LASSERT (nkiov > 0); + LASSERT ((rd != tx->tx_rd) == !active); + + while (offset >= kiov->kiov_len) { + offset -= kiov->kiov_len; + nkiov--; + kiov++; + LASSERT (nkiov > 0); + } + + rd->rd_nfrag = 0; + do { + LASSERT (nkiov > 0); + fragnob = min((int)(kiov->kiov_len - offset), nob); + + rc = kibnal_append_rdfrag(rd, active, kiov->kiov_page, + kiov->kiov_offset + offset, + fragnob); + if (rc != 0) + return rc; + + offset = 0; + kiov++; + nkiov--; + nob -= fragnob; + } while (nob > 0); + + return 0; +} +#else +int +kibnal_setup_rd_iov (kib_tx_t *tx, kib_rdma_desc_t *rd, + vv_access_con_bit_mask_t access, + int niov, struct iovec *iov, int offset, int nob) + +{ + /* active if I'm sending */ + int active = ((access & vv_acc_r_mem_write) == 0); + void *vaddr; + vv_return_t vvrc; + + LASSERT (nob > 0); + LASSERT (niov > 0); + LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); + LASSERT ((rd 
!= tx->tx_rd) == !active); + + while (offset >= iov->iov_len) { + offset -= iov->iov_len; + niov--; + iov++; + LASSERT (niov > 0); + } + + if (nob > iov->iov_len - offset) { + CERROR ("Can't map multiple vaddr fragments\n"); + return (-EMSGSIZE); } vaddr = (void *)(((unsigned long)iov->iov_base) + offset); tx->tx_md.md_addr = (__u64)((unsigned long)vaddr); - retval = vv_mem_region_register(kibnal_data.kib_hca, vaddr, nob, - kibnal_data.kib_pd, access, - &tx->tx_md.md_handle, &tx->tx_md.md_lkey, - &tx->tx_md.md_rkey); - if (retval != 0) { - CERROR ("Can't map vaddr %p: %d\n", vaddr, retval); - return -EINVAL; + vvrc = vv_mem_region_register(kibnal_data.kib_hca, vaddr, nob, + kibnal_data.kib_pd, access, + &tx->tx_md.md_handle, + &tx->tx_md.md_lkey, + &tx->tx_md.md_rkey); + if (vvrc != vv_return_ok) { + CERROR ("Can't map vaddr %p: %d\n", vaddr, vvrc); + return -EFAULT; } tx->tx_mapped = KIB_TX_MAPPED; + + rd->rd_key = active ? tx->tx_md.md_lkey : tx->tx_md.md_rkey; + rd->rd_nfrag = 1; + kibnal_rf_set(&rd->rd_frags[0], tx->tx_md.md_addr, nob); + return (0); } -static int -kibnal_map_kiov (kib_tx_t *tx, vv_access_con_bit_mask_t access, - int nkiov, ptl_kiov_t *kiov, - int offset, int nob, int active) +int +kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd, + vv_access_con_bit_mask_t access, + int nkiov, ptl_kiov_t *kiov, int offset, int nob) { + /* active if I'm sending */ + int active = ((access & vv_acc_r_mem_write) == 0); + vv_return_t vvrc; vv_phy_list_t phys_pages; - vv_phy_buf_t *phys_buf = NULL; + vv_phy_buf_t *phys; int page_offset; int nphys; int resid; - int phys_size = 0; - int i, rc = 0; - vv_return_t retval; + int phys_size; + int rc; CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob); LASSERT (nob > 0); LASSERT (nkiov > 0); LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); + LASSERT ((rd != tx->tx_rd) == !active); while (offset >= kiov->kiov_len) { offset -= kiov->kiov_len; @@ -737,27 +752,19 @@ kibnal_map_kiov (kib_tx_t *tx, vv_access_con_bit_mask_t access, LASSERT (nkiov > 0); } - page_offset = kiov->kiov_offset + offset; - nphys = 1; - - if (!kibnal_whole_mem()) { - phys_size = nkiov * sizeof(vv_phy_buf_t); - PORTAL_ALLOC(phys_buf, phys_size); - - if (phys_buf == NULL) { - CERROR ("Can't allocate phys_buf\n"); - return (-ENOMEM); - } + phys_size = nkiov * sizeof (*phys); + PORTAL_ALLOC(phys, phys_size); + if (phys == NULL) { + CERROR ("Can't allocate tmp phys\n"); + return (-ENOMEM); + } - phys_buf[0].start = kibnal_page2phys(kiov->kiov_page); - phys_buf[0].size = PAGE_SIZE; + page_offset = kiov->kiov_offset + offset; - } else { - tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 0; - kibnal_fill_ibrm(tx, kiov->kiov_page, kiov->kiov_offset, - kiov->kiov_len, active); - } + phys[0].start = kibnal_page2phys(kiov->kiov_page); + phys[0].size = PAGE_SIZE; + nphys = 1; resid = nob - (kiov->kiov_len - offset); while (resid > 0) { @@ -768,99 +775,73 @@ kibnal_map_kiov (kib_tx_t *tx, vv_access_con_bit_mask_t access, if (kiov->kiov_offset != 0 || ((resid > PAGE_SIZE) && kiov->kiov_len < PAGE_SIZE)) { + int i; /* Can't have gaps */ CERROR ("Can't make payload contiguous in I/O VM:" "page %d, offset %d, len %d \n", nphys, kiov->kiov_offset, kiov->kiov_len); - for (i = -nphys; i < nkiov; i++) - { + for (i = -nphys; i < nkiov; i++) CERROR("kiov[%d] %p +%d for %d\n", - i, kiov[i].kiov_page, kiov[i].kiov_offset, kiov[i].kiov_len); - } + i, kiov[i].kiov_page, + kiov[i].kiov_offset, + kiov[i].kiov_len); rc = -EINVAL; goto out; } - if (nphys == PTL_MD_MAX_IOV) { - CERROR ("payload too big 
(%d)\n", nphys); - rc = -EMSGSIZE; - goto out; - } - - if (!kibnal_whole_mem()) { - LASSERT (nphys * sizeof (vv_phy_buf_t) < phys_size); - phys_buf[nphys].start = kibnal_page2phys(kiov->kiov_page); - phys_buf[nphys].size = PAGE_SIZE; - - } else { - if (kib_rdma_msg_len(nphys) > IBNAL_MSG_SIZE) { - CERROR ("payload too big (%d)\n", nphys); - rc = -EMSGSIZE; - goto out; - } - kibnal_fill_ibrm(tx, kiov->kiov_page, - kiov->kiov_offset, kiov->kiov_len, - active); - } + LASSERT (nphys * sizeof (*phys) < phys_size); + phys[nphys].start = kibnal_page2phys(kiov->kiov_page); + phys[nphys].size = PAGE_SIZE; - nphys ++; + nphys++; resid -= PAGE_SIZE; } - if (kibnal_whole_mem()) - goto out; - #if 0 CWARN ("nphys %d, nob %d, page_offset %d\n", nphys, nob, page_offset); for (i = 0; i < nphys; i++) CWARN (" [%d] "LPX64"\n", i, phys[i]); #endif -#if IBNAL_FMR -#error "vibnal hasn't learned about FMR yet" - rc = ib_fmr_register_physical (kibnal_data.kib_fmr_pool, - phys_pages, nphys, - &tx->tx_md.md_addr, - page_offset, - &tx->tx_md.md_handle.fmr, - &tx->tx_md.md_lkey, - &tx->tx_md.md_rkey); -#else - retval = vv_phy_mem_region_register(kibnal_data.kib_hca, - &phys_pages, - IBNAL_RDMA_BASE, - nphys, - 0, /* offset */ - kibnal_data.kib_pd, - vv_acc_l_mem_write | vv_acc_r_mem_write | vv_acc_r_mem_read | vv_acc_mem_bind, /* TODO: translated as-is, but seems incorrect or too much */ - &tx->tx_md.md_handle, - &tx->tx_md.md_addr, - &tx->tx_md.md_lkey, - &tx->tx_md.md_rkey); -#endif - if (retval == vv_return_ok) { - CDEBUG(D_NET, "Mapped %d pages %d bytes @ offset %d: lkey %x, rkey %x\n", - nphys, nob, page_offset, tx->tx_md.md_lkey, tx->tx_md.md_rkey); -#if IBNAL_FMR - tx->tx_mapped = KIB_TX_MAPPED_FMR; -#else - tx->tx_mapped = KIB_TX_MAPPED; -#endif - } else { - CERROR ("Can't map phys_pages: %d\n", retval); + vvrc = vv_phy_mem_region_register(kibnal_data.kib_hca, + &phys_pages, + IBNAL_RDMA_BASE, + nphys, + page_offset, + kibnal_data.kib_pd, + access, + &tx->tx_md.md_handle, + &tx->tx_md.md_addr, + &tx->tx_md.md_lkey, + &tx->tx_md.md_rkey); + + if (vvrc != vv_return_ok) { + CERROR ("Can't map phys: %d\n", vvrc); rc = -EFAULT; + goto out; } - out: - if (phys_buf != NULL) - PORTAL_FREE(phys_buf, phys_size); + CDEBUG(D_NET, "Mapped %d pages %d bytes @ offset %d: " + "lkey %x, rkey %x, addr "LPX64"\n", + nphys, nob, page_offset, tx->tx_md.md_lkey, tx->tx_md.md_rkey, + tx->tx_md.md_addr); + + tx->tx_mapped = KIB_TX_MAPPED; + rc = 0; + rd->rd_key = active ? 
tx->tx_md.md_lkey : tx->tx_md.md_rkey; + rd->rd_nfrag = 1; + kibnal_rf_set(&rd->rd_frags[0], tx->tx_md.md_addr, nob); + + out: + PORTAL_FREE(phys, phys_size); return (rc); } +#endif -static kib_conn_t * +kib_conn_t * kibnal_find_conn_locked (kib_peer_t *peer) { struct list_head *tmp; @@ -876,108 +857,162 @@ kibnal_find_conn_locked (kib_peer_t *peer) void kibnal_check_sends (kib_conn_t *conn) { - unsigned long flags; kib_tx_t *tx; + vv_return_t vvrc; int rc; int i; int done; - int nwork; - ENTRY; - - spin_lock_irqsave (&conn->ibc_lock, flags); + /* Don't send anything until after the connection is established */ + if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) { + CDEBUG(D_NET, LPX64"too soon\n", conn->ibc_peer->ibp_nid); + return; + } + + spin_lock(&conn->ibc_lock); LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE); if (list_empty(&conn->ibc_tx_queue) && conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) { - spin_unlock_irqrestore(&conn->ibc_lock, flags); + spin_unlock(&conn->ibc_lock); tx = kibnal_get_idle_tx(0); /* don't block */ if (tx != NULL) kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0); - spin_lock_irqsave(&conn->ibc_lock, flags); + spin_lock(&conn->ibc_lock); - if (tx != NULL) { - atomic_inc(&conn->ibc_refcount); + if (tx != NULL) kibnal_queue_tx_locked(tx, conn); - } } while (!list_empty (&conn->ibc_tx_queue)) { tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list); /* We rely on this for QP sizing */ - LASSERT (tx->tx_nsp > 0 && tx->tx_nsp <= IBNAL_TX_MAX_SG); + LASSERT (tx->tx_nwrq > 0 && tx->tx_nwrq <= 1 + IBNAL_MAX_RDMA_FRAGS); LASSERT (conn->ibc_outstanding_credits >= 0); LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE); LASSERT (conn->ibc_credits >= 0); LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE); - /* Not on ibc_rdma_queue */ - LASSERT (!tx->tx_passive_rdma_wait); - - if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE) - GOTO(out, 0); - - if (conn->ibc_credits == 0) /* no credits */ - GOTO(out, 1); + if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE) { + CDEBUG(D_NET, LPX64": posted enough\n", + conn->ibc_peer->ibp_nid); + break; + } + + if (conn->ibc_credits == 0) { /* no credits */ + CDEBUG(D_NET, LPX64": no credits\n", + conn->ibc_peer->ibp_nid); + break; + } if (conn->ibc_credits == 1 && /* last credit reserved for */ - conn->ibc_outstanding_credits == 0) /* giving back credits */ - GOTO(out, 2); - + conn->ibc_outstanding_credits == 0) { /* giving back credits */ + CDEBUG(D_NET, LPX64": not using last credit\n", + conn->ibc_peer->ibp_nid); + break; + } + list_del (&tx->tx_list); if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP && (!list_empty(&conn->ibc_tx_queue) || conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER)) { /* redundant NOOP */ - spin_unlock_irqrestore(&conn->ibc_lock, flags); + spin_unlock(&conn->ibc_lock); kibnal_tx_done(tx); - spin_lock_irqsave(&conn->ibc_lock, flags); + spin_lock(&conn->ibc_lock); + CDEBUG(D_NET, LPX64": redundant noop\n", + conn->ibc_peer->ibp_nid); continue; } - tx->tx_msg->ibm_credits = conn->ibc_outstanding_credits; - conn->ibc_outstanding_credits = 0; + kibnal_pack_msg(tx->tx_msg, conn->ibc_outstanding_credits, + conn->ibc_peer->ibp_nid, conn->ibc_incarnation); + conn->ibc_outstanding_credits = 0; conn->ibc_nsends_posted++; conn->ibc_credits--; - /* we only get a tx completion for the final rdma op */ - tx->tx_sending = 0; - tx->tx_passive_rdma_wait = tx->tx_passive_rdma; + /* CAVEAT EMPTOR! This tx could be the PUT_DONE of an RDMA + * PUT. 
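
The break conditions above implement the send-credit flow control this NAL relies on: a message may only be posted when the peer is known to have a receive buffer free for it, and the very last credit must be kept for a message that returns accumulated credits, since spending it on ordinary payload can strand both sides with nothing to send. The gate on its own, as a compilable sketch (names invented; QUEUE_SIZE stands in for IBNAL_MSG_QUEUE_SIZE):

#define QUEUE_SIZE 8

struct credit_state {
        int posted;               /* my sends currently posted */
        int credits;              /* rx buffers the peer still has for me */
        int outstanding_credits;  /* credits I owe back to the peer */
};

/* Returns 1 when one more send can be posted without risking deadlock. */
static int may_post(const struct credit_state *cs)
{
        if (cs->posted == QUEUE_SIZE)
                return 0;         /* posted enough already */
        if (cs->credits == 0)
                return 0;         /* peer has no rx buffer free */
        if (cs->credits == 1 && cs->outstanding_credits == 0)
                return 0;         /* reserve last credit for returning credits */
        return 1;
}
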
If so, it was first queued here as a PUT_REQ, sent and + * stashed on ibc_active_txs, matched by an incoming PUT_ACK, + * and then re-queued here. It's (just) possible that + * tx_sending is non-zero if we've not done the tx_complete() from + * the first send; hence the += rather than = below. */ + tx->tx_sending++; + list_add (&tx->tx_list, &conn->ibc_active_txs); -#if IBNAL_CKSUM - tx->tx_msg->ibm_cksum = 0; - tx->tx_msg->ibm_cksum = kibnal_cksum(tx->tx_msg, tx->tx_msg->ibm_nob); - CDEBUG(D_NET, "cksum %x, nob %d\n", tx->tx_msg->ibm_cksum, tx->tx_msg->ibm_nob); -#endif - /* NB the gap between removing tx from the queue and sending it - * allows message re-ordering to occur */ - LASSERT (tx->tx_nsp > 0); + /* Keep holding ibc_lock while posting sends on this + * connection; vv_post_send() isn't re-entrant on the same + * QP!! */ + + LASSERT (tx->tx_nwrq > 0); rc = -ECONNABORTED; - nwork = 0; + vvrc = vv_return_ok; if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { - vv_return_t retval; - tx->tx_status = 0; - rc = 0; - - retval = vv_post_send_list(kibnal_data.kib_hca, conn->ibc_qp, tx->tx_nsp, tx->tx_wrq, vv_operation_type_send_rc); - - if (retval != 0) { - CERROR("post send failed with %d\n", retval); - rc = -ECONNABORTED; - break; +#if 1 + vvrc = vv_post_send_list(kibnal_data.kib_hca, + conn->ibc_qp, + tx->tx_nwrq, + tx->tx_wrq, + vv_operation_type_send_rc); + rc = (vvrc == vv_return_ok) ? 0 : -EIO; +#else + /* Only post 1 item at a time for now (so we know + * exactly how many got posted successfully) */ + for (i = 0; i < tx->tx_nwrq; i++) { + switch (tx->tx_wrq[i].wr_type) { + case vv_wr_send: + CDEBUG(D_NET, "[%d]posting send [%d %x %p]%s: %x\n", + i, + tx->tx_wrq[i].scatgat_list->length, + tx->tx_wrq[i].scatgat_list->l_key, + tx->tx_wrq[i].scatgat_list->v_address, + tx->tx_wrq[i].type.send.send_qp_type.rc_type.fance_indicator ? + "(fence)":"", + tx->tx_msg->ibm_type); + break; + case vv_wr_rdma_write: + CDEBUG(D_NET, "[%d]posting PUT [%d %x %p]->[%x "LPX64"]\n", + i, + tx->tx_wrq[i].scatgat_list->length, + tx->tx_wrq[i].scatgat_list->l_key, + tx->tx_wrq[i].scatgat_list->v_address, + tx->tx_wrq[i].type.send.send_qp_type.rc_type.r_r_key, + tx->tx_wrq[i].type.send.send_qp_type.rc_type.r_addr); + break; + case vv_wr_rdma_read: + CDEBUG(D_NET, "[%d]posting GET [%d %x %p]->[%x "LPX64"]\n", + i, + tx->tx_wrq[i].scatgat_list->length, + tx->tx_wrq[i].scatgat_list->l_key, + tx->tx_wrq[i].scatgat_list->v_address, + tx->tx_wrq[i].type.send.send_qp_type.rc_type.r_r_key, + tx->tx_wrq[i].type.send.send_qp_type.rc_type.r_addr); + break; + default: + LBUG(); + } + vvrc = vv_post_send(kibnal_data.kib_hca, + conn->ibc_qp, + &tx->tx_wrq[i], + vv_operation_type_send_rc); + CDEBUG(D_NET, LPX64": post %d/%d\n", + conn->ibc_peer->ibp_nid, i, tx->tx_nwrq); + if (vvrc != vv_return_ok) { + rc = -EIO; + break; + } } - - tx->tx_sending = tx->tx_nsp; +#endif } if (rc != 0) { @@ -988,18 +1023,18 @@ kibnal_check_sends (kib_conn_t *conn) conn->ibc_nsends_posted--; tx->tx_status = rc; - tx->tx_passive_rdma_wait = 0; - - /* TODO: I think this is buggy if vv_post_send_list failed. 
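
A failed post has to unwind the optimistic accounting done before vv_post_send_list() was called: the tx was already charged a credit, counted against ibc_nsends_posted, and marked sending. The error path just below does exactly that; modeled in isolation (field names illustrative):

struct xtx { int sending; int waiting; int status; };

/* Undo one failed post; returns 1 when the caller may retire the tx,
 * i.e. no earlier work item is still with the hardware. */
static int unwind_failed_post(struct xtx *tx, int err)
{
        tx->status  = err;
        tx->waiting = 0;      /* no peer reply will ever match this tx */
        tx->sending--;        /* take back the optimistic increment */
        return tx->sending == 0;
}
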
*/
+			tx->tx_waiting = 0;
+			tx->tx_sending--;
+			done = (tx->tx_sending == 0);
 			if (done)
 				list_del (&tx->tx_list);
 			
-			spin_unlock_irqrestore (&conn->ibc_lock, flags);
+			spin_unlock(&conn->ibc_lock);
 			
 			if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
 				CERROR ("Error %d posting transmit to "LPX64"\n", 
-					rc, conn->ibc_peer->ibp_nid);
+					vvrc, conn->ibc_peer->ibp_nid);
 			else
 				CDEBUG (D_NET, "Error %d posting transmit to "
 					LPX64"\n", rc, conn->ibc_peer->ibp_nid);
@@ -1010,179 +1045,225 @@ kibnal_check_sends (kib_conn_t *conn)
 			kibnal_tx_done (tx);
 			return;
 		}
-		
 	}
 
-	EXIT;
-out:
-	spin_unlock_irqrestore (&conn->ibc_lock, flags);
+	spin_unlock(&conn->ibc_lock);
 }

-static void
-kibnal_tx_callback (vv_wc_t *wc)
+void
+kibnal_tx_complete (kib_tx_t *tx, int final_send, vv_comp_status_t vvrc)
 {
-	kib_tx_t     *tx = (kib_tx_t *)kibnal_wreqid2ptr(wc->wr_id);
-	kib_conn_t   *conn;
-	unsigned long flags;
+	kib_conn_t   *conn = tx->tx_conn;
+	int           failed = (vvrc != vv_comp_status_success);
 	int           idle;

-	conn = tx->tx_conn;
-	LASSERT (conn != NULL);
+	CDEBUG(D_NET, "conn %p tx %p [%d/%d]: %d\n", conn, tx,
+	       tx->tx_sending, tx->tx_nwrq, vvrc);
 	LASSERT (tx->tx_sending != 0);

-	CDEBUG(D_NET, "conn %p tx %p [%d/%d]: %d\n", conn, tx,
-	       tx->tx_sending, tx->tx_nsp, wc->completion_status);
+	if (failed &&
+	    conn->ibc_state == IBNAL_CONN_ESTABLISHED)
+		CERROR ("Tx completion to "LPX64" failed: %d\n",
+			conn->ibc_peer->ibp_nid, vvrc);
+
+	/* I should only get RDMA notifications of errors */
+	LASSERT (final_send || failed);

-	spin_lock_irqsave(&conn->ibc_lock, flags);
+	spin_lock(&conn->ibc_lock);

 	/* I could be racing with rdma completion.  Whoever makes 'tx' idle
-	 * gets to free it, which also drops its ref on 'conn'.  If it's
-	 * not me, then I take an extra ref on conn so it can't disappear
-	 * under me. */
+	 * gets to free it, which also drops its ref on 'conn'. */

-	tx->tx_sending--;
+	if (final_send)                 /* this is the last work item */
+		tx->tx_sending--;
+
+	if (failed) {
+		tx->tx_waiting = 0;
+		tx->tx_status = -EIO;
+	}
+	
 	idle = (tx->tx_sending == 0) &&         /* This is the final callback */
-	       (!tx->tx_passive_rdma_wait);     /* Not waiting for RDMA completion */
+	       !tx->tx_waiting;                 /* Not waiting for peer */
 	if (idle)
 		list_del(&tx->tx_list);

-	CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
-	       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
-	       atomic_read (&conn->ibc_refcount));
-	atomic_inc (&conn->ibc_refcount);
+	kibnal_conn_addref(conn);               /* 1 ref for me.... */

 	if (tx->tx_sending == 0)
 		conn->ibc_nsends_posted--;

-	if (wc->completion_status != vv_comp_status_success &&
-	    tx->tx_status == 0)
-		tx->tx_status = -ECONNABORTED;
-		
-	spin_unlock_irqrestore(&conn->ibc_lock, flags);
+	spin_unlock(&conn->ibc_lock);

 	if (idle)
 		kibnal_tx_done (tx);

-	if (wc->completion_status != vv_comp_status_success) {
-		CERROR ("Tx completion to "LPX64" failed: %d\n", 
-			conn->ibc_peer->ibp_nid, wc->completion_status);
-		kibnal_close_conn (conn, -ENETDOWN);
-	} else {
-		/* can I shovel some more sends out the door? */
+	if (failed)
+		kibnal_close_conn (conn, -EIO);
+	else
 		kibnal_check_sends(conn);
-	}

-	kibnal_put_conn (conn);
+	kibnal_conn_decref(conn);               /* ...until here */
 }

-void
-kibnal_ca_async_callback(vv_event_record_t ev)
+void 
+kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob)
 {
-	/* XXX flesh out. 
this seems largely for async errors */ - CERROR("type: %d, port: %d, data: "LPX64"\n", ev.event_type, ev.port_num, ev.type.data); + vv_scatgat_t *gl = &tx->tx_gl[tx->tx_nwrq]; + vv_wr_t *wrq = &tx->tx_wrq[tx->tx_nwrq]; + int nob = offsetof (kib_msg_t, ibm_u) + body_nob; + + LASSERT (tx->tx_nwrq >= 0 && + tx->tx_nwrq < (1 + IBNAL_MAX_RDMA_FRAGS)); + LASSERT (nob <= IBNAL_MSG_SIZE); + + kibnal_init_msg(tx->tx_msg, type, body_nob); + + *gl = (vv_scatgat_t) { + .v_address = (void *)((unsigned long)KIBNAL_TX_VADDR(tx)), + .l_key = KIBNAL_TX_LKEY(tx), + .length = nob, + }; + + memset(wrq, 0, sizeof(*wrq)); + + wrq->wr_id = (unsigned long)tx; + wrq->wr_type = vv_wr_send; + wrq->scatgat_list = gl; + wrq->num_of_data_segments = 1; + wrq->completion_notification = 1; + wrq->type.send.solicited_event = 1; + wrq->type.send.immidiate_data_indicator = 0; + wrq->type.send.send_qp_type.rc_type.fance_indicator = 0; + + tx->tx_nwrq++; } -void -kibnal_ca_callback (unsigned long unused_context) +int +kibnal_init_rdma (kib_tx_t *tx, int type, int nob, + kib_rdma_desc_t *dstrd, __u64 dstcookie) { - vv_wc_t wc; - int armed = 0; - vv_return_t retval; + /* CAVEAT EMPTOR: this 'consumes' the frags in 'dstrd' */ + int resid = nob; + kib_msg_t *ibmsg = tx->tx_msg; + kib_rdma_desc_t *srcrd = tx->tx_rd; + kib_rdma_frag_t *srcfrag; + int srcidx; + kib_rdma_frag_t *dstfrag; + int dstidx; + vv_scatgat_t *gl; + vv_wr_t *wrq; + int wrknob; + int rc; - for(;;) { + /* Called by scheduler */ + LASSERT (!in_interrupt()); - while (vv_poll_for_completion(kibnal_data.kib_hca, kibnal_data.kib_cq, &wc) == vv_return_ok) { + LASSERT (type == IBNAL_MSG_GET_DONE || + type == IBNAL_MSG_PUT_DONE); - /* We will need to rearm the CQ to avoid a potential race. */ - armed = 0; + srcidx = dstidx = 0; + srcfrag = &srcrd->rd_frags[0]; + dstfrag = &dstrd->rd_frags[0]; + rc = resid; - if (kibnal_wreqid_is_rx(wc.wr_id)) - kibnal_rx_callback(&wc); - else - kibnal_tx_callback(&wc); + while (resid > 0) { + if (srcidx >= srcrd->rd_nfrag) { + CERROR("Src buffer exhausted: %d frags\n", srcidx); + rc = -EPROTO; + break; } - - if (armed) - return; - retval = vv_request_completion_notification(kibnal_data.kib_hca, kibnal_data.kib_cq, vv_next_solicit_unsolicit_event); - if (retval != 0) { - CERROR ("Failed to re-arm completion queue: %d\n", retval); - return; + if (dstidx == dstrd->rd_nfrag) { + CERROR("Dst buffer exhausted: %d frags\n", dstidx); + rc = -EPROTO; + break; } - armed = 1; - } -} - -void -kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob) -{ - vv_scatgat_t *gl = &tx->tx_gl[tx->tx_nsp]; - vv_wr_t *wrq = &tx->tx_wrq[tx->tx_nsp]; - int fence; - int nob = offsetof (kib_msg_t, ibm_u) + body_nob; - - LASSERT (tx->tx_nsp >= 0 && - tx->tx_nsp < sizeof(tx->tx_wrq)/sizeof(tx->tx_wrq[0])); - LASSERT (nob <= IBNAL_MSG_SIZE); - - tx->tx_msg->ibm_magic = IBNAL_MSG_MAGIC; - tx->tx_msg->ibm_version = IBNAL_MSG_VERSION; - tx->tx_msg->ibm_type = type; -#if IBNAL_CKSUM - tx->tx_msg->ibm_nob = nob; -#endif - /* Fence the message if it's bundled with an RDMA read */ - fence = (tx->tx_nsp > 0) && - (type == IBNAL_MSG_PUT_DONE); + if (tx->tx_nwrq == IBNAL_MAX_RDMA_FRAGS) { + CERROR("RDMA too fragmented: %d/%d src %d/%d dst frags\n", + srcidx, srcrd->rd_nfrag, + dstidx, dstrd->rd_nfrag); + rc = -EMSGSIZE; + break; + } - *gl = (vv_scatgat_t) { - .v_address = (void *)tx->tx_msg, - .length = nob, - .l_key = tx->l_key, - }; + wrknob = MIN(MIN(srcfrag->rf_nob, dstfrag->rf_nob), resid); - wrq->wr_id = kibnal_ptr2wreqid(tx, 0); - wrq->completion_notification = 1; - 
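
The loop in kibnal_init_rdma() below walks the local source fragments and the peer's advertised sink fragments in lockstep, emitting one RDMA-write work request per overlap and advancing whichever side was exhausted. A compilable model of that zipper (simplified types; the real loop also fills in verbs work requests and r_keys):

#include <stdint.h>

struct xfrag { uint64_t addr; uint32_t nob; };

/* Consume min(src, dst, resid) per step; returns work requests used. */
static int zip_frags(struct xfrag *src, int nsrc,
                     struct xfrag *dst, int ndst, uint32_t resid)
{
        int s = 0, d = 0, nwrq = 0;

        while (resid > 0 && s < nsrc && d < ndst) {
                uint32_t nob = src[s].nob < dst[d].nob ? src[s].nob
                                                       : dst[d].nob;
                if (nob > resid)
                        nob = resid;

                /* emit: RDMA-write 'nob' bytes src[s].addr -> dst[d].addr */
                nwrq++;
                resid -= nob;

                /* advance each side by the bytes just consumed */
                src[s].addr += nob; src[s].nob -= nob;
                dst[d].addr += nob; dst[d].nob -= nob;
                if (src[s].nob == 0) s++;
                if (dst[d].nob == 0) d++;
        }
        return nwrq;
}
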
wrq->scatgat_list = gl;
-	wrq->num_of_data_segments = 1;
-	wrq->wr_type = vv_wr_send;
 
-	wrq->type.send.solicited_event = 1;
+		wrq = &tx->tx_wrq[tx->tx_nwrq];
+		wrq->wr_id = (unsigned long)tx;
+		/* All frags give completion until we've sussed how to submit
+		 * all frags + completion message and only (but reliably) get
+		 * notification on the completion message */
+		wrq->completion_notification = 0;
+		wrq->scatgat_list = gl;
+		wrq->num_of_data_segments = 1;
+		wrq->wr_type = vv_wr_rdma_write;
+		wrq->type.send.solicited_event = 0;
+		wrq->type.send.send_qp_type.rc_type.fance_indicator = 0;
+		wrq->type.send.send_qp_type.rc_type.r_addr = kibnal_rf_addr(dstfrag);
+		wrq->type.send.send_qp_type.rc_type.r_r_key = dstrd->rd_key;
+
+		resid -= wrknob;
+		if (wrknob < srcfrag->rf_nob) {
+			kibnal_rf_set(srcfrag,
+				      kibnal_rf_addr(srcfrag) + wrknob,
+				      srcfrag->rf_nob - wrknob);
+		} else {
+			srcfrag++;
+			srcidx++;
+		}
+
+		if (wrknob < dstfrag->rf_nob) {
+			kibnal_rf_set(dstfrag,
+				      kibnal_rf_addr(dstfrag) + wrknob,
+				      dstfrag->rf_nob - wrknob);
+		} else {
+			dstfrag++;
+			dstidx++;
+		}
+
+		tx->tx_nwrq++;
+	}

-	wrq->type.send.send_qp_type.rc_type.fance_indicator = fence;
+	if (rc < 0)                             /* no RDMA if completing with failure */
+		tx->tx_nwrq = 0;
+
+	ibmsg->ibm_u.completion.ibcm_status = rc;
+	ibmsg->ibm_u.completion.ibcm_cookie = dstcookie;
+	kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t));

-	tx->tx_nsp++;
+	return rc;
 }

-static void
+void
 kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
 {
-	unsigned long	flags;
-
-	spin_lock_irqsave(&conn->ibc_lock, flags);
-
+	spin_lock(&conn->ibc_lock);
 	kibnal_queue_tx_locked (tx, conn);
-
-	spin_unlock_irqrestore(&conn->ibc_lock, flags);
+	spin_unlock(&conn->ibc_lock);
 	
 	kibnal_check_sends(conn);
 }

-static void
+void
 kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid)
 {
-	unsigned long	flags;
 	kib_peer_t	*peer;
 	kib_conn_t	*conn;
+	unsigned long	flags;
 	rwlock_t	*g_lock = &kibnal_data.kib_global_lock;

 	/* If I get here, I've committed to send, so I complete the tx with
 	 * failure on any problems */
 	
 	LASSERT (tx->tx_conn == NULL);		/* only set when assigned a conn */
-	LASSERT (tx->tx_nsp > 0);		/* work items have been set up */
+	LASSERT (tx->tx_nwrq > 0);		/* work items have been set up */

 	read_lock_irqsave(g_lock, flags);
 	
@@ -1196,13 +1277,11 @@ kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid)
 	conn = kibnal_find_conn_locked (peer);
 	if (conn != NULL) {
-		CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
-		       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
-		       atomic_read (&conn->ibc_refcount));
-		atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
+		kibnal_conn_addref(conn);	/* 1 ref for me... 
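
The addref here and the decref a few lines further down bracket only the window between dropping the lookup lock and queuing the tx; the queued tx holds its own reference to the connection. A minimal model of that pin idiom (helpers invented for illustration):

#include <stdatomic.h>
#include <stdlib.h>

struct refconn { atomic_int refcount; };

static void conn_addref(struct refconn *c)
{
        atomic_fetch_add(&c->refcount, 1);
}

static void conn_decref(struct refconn *c)
{
        if (atomic_fetch_sub(&c->refcount, 1) == 1)
                free(c);               /* last reference frees the conn */
}

static void queue_on_conn(struct refconn *c)
{
        conn_addref(c);     /* pin: taken while the lookup lock is held */
        /* drop lookup lock; queue the tx (it takes its own reference) */
        conn_decref(c);     /* unpin: 'c' may be freed right here */
}
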
*/ read_unlock_irqrestore(g_lock, flags); kibnal_queue_tx (tx, conn); + kibnal_conn_decref(conn); /* ...to here */ return; } @@ -1212,7 +1291,7 @@ kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid) peer = kibnal_find_peer_locked (nid); if (peer == NULL) { - write_unlock_irqrestore (g_lock, flags); + write_unlock_irqrestore(g_lock, flags); tx->tx_status = -EHOSTUNREACH; kibnal_tx_done (tx); return; @@ -1221,328 +1300,84 @@ kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid) conn = kibnal_find_conn_locked (peer); if (conn != NULL) { /* Connection exists; queue message on it */ - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */ - write_unlock_irqrestore (g_lock, flags); + kibnal_conn_addref(conn); /* 1 ref for me... */ + write_unlock_irqrestore(g_lock, flags); kibnal_queue_tx (tx, conn); + kibnal_conn_decref(conn); /* ...until here */ return; } if (peer->ibp_connecting == 0) { if (!time_after_eq(jiffies, peer->ibp_reconnect_time)) { - write_unlock_irqrestore (g_lock, flags); + write_unlock_irqrestore(g_lock, flags); tx->tx_status = -EHOSTUNREACH; kibnal_tx_done (tx); return; } peer->ibp_connecting = 1; - - kib_peer_addref(peer); /* extra ref for connd */ + kibnal_peer_addref(peer); /* extra ref for connd */ - spin_lock (&kibnal_data.kib_connd_lock); + spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); list_add_tail (&peer->ibp_connd_list, &kibnal_data.kib_connd_peers); wake_up (&kibnal_data.kib_connd_waitq); - spin_unlock (&kibnal_data.kib_connd_lock); + spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags); } /* A connection is being established; queue the message... */ list_add_tail (&tx->tx_list, &peer->ibp_tx_queue); - write_unlock_irqrestore (g_lock, flags); + write_unlock_irqrestore(g_lock, flags); } -static ptl_err_t -kibnal_start_passive_rdma (int type, ptl_nid_t nid, - lib_msg_t *libmsg, ptl_hdr_t *hdr) +int +kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist) +{ + /* I would guess that if kibnal_get_peer (nid) == NULL, + and we're not routing, then 'nid' is very distant :) */ + if ( nal->libnal_ni.ni_pid.nid == nid ) { + *dist = 0; + } else { + *dist = 1; + } + + return 0; +} + +ptl_err_t +kibnal_sendmsg(lib_nal_t *nal, + void *private, + lib_msg_t *libmsg, + ptl_hdr_t *hdr, + int type, + ptl_nid_t nid, + ptl_pid_t pid, + unsigned int payload_niov, + struct iovec *payload_iov, + ptl_kiov_t *payload_kiov, + int payload_offset, + int payload_nob) { - int nob = libmsg->md->length; - kib_tx_t *tx; kib_msg_t *ibmsg; + kib_tx_t *tx; + int nob; int rc; - vv_access_con_bit_mask_t access; - - LASSERT (type == IBNAL_MSG_PUT_RDMA || type == IBNAL_MSG_GET_RDMA); - LASSERT (nob > 0); - LASSERT (!in_interrupt()); /* Mapping could block */ + int n; - access = vv_acc_l_mem_write | vv_acc_r_mem_write | vv_acc_r_mem_read | vv_acc_mem_bind; + /* NB 'private' is different depending on what we're sending.... 
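
Every case in the switch below applies one rule: if the whole payload fits behind the message header in a single pre-posted receive buffer, it is sent inline as IBNAL_MSG_IMMEDIATE; otherwise the code drops into the RDMA handshake for that message type. The test in isolation (MSG_SIZE is a stand-in for IBNAL_MSG_SIZE, struct layout illustrative):

#include <stddef.h>

#define MSG_SIZE 4096

struct imm_msg { char hdr[64]; char payload[]; };

/* 1 => too big to inline; use the PUT/GET RDMA handshake instead */
static int needs_rdma(size_t payload_nob)
{
        return offsetof(struct imm_msg, payload) + payload_nob > MSG_SIZE;
}
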
*/ - tx = kibnal_get_idle_tx (1); /* May block; caller is an app thread */ - LASSERT (tx != NULL); + CDEBUG(D_NET, "sending %d bytes in %d frags to nid:"LPX64 + " pid %d\n", payload_nob, payload_niov, nid , pid); - if ((libmsg->md->options & PTL_MD_KIOV) == 0) - rc = kibnal_map_iov (tx, access, - libmsg->md->md_niov, - libmsg->md->md_iov.iov, - 0, nob, 0); - else - rc = kibnal_map_kiov (tx, access, - libmsg->md->md_niov, - libmsg->md->md_iov.kiov, - 0, nob, 0); + LASSERT (payload_nob == 0 || payload_niov > 0); + LASSERT (payload_niov <= PTL_MD_MAX_IOV); - if (rc != 0) { - CERROR ("Can't map RDMA for "LPX64": %d\n", nid, rc); - goto failed; - } - - if (type == IBNAL_MSG_GET_RDMA) { - /* reply gets finalized when tx completes */ - tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib, - nid, libmsg); - if (tx->tx_libmsg[1] == NULL) { - CERROR ("Can't create reply for GET -> "LPX64"\n", - nid); - rc = -ENOMEM; - goto failed; - } - } - - tx->tx_passive_rdma = 1; - - ibmsg = tx->tx_msg; - - ibmsg->ibm_u.rdma.ibrm_hdr = *hdr; - ibmsg->ibm_u.rdma.ibrm_cookie = tx->tx_passive_rdma_cookie; - /* map_kiov alrady filled the rdma descs for the whole_mem case */ - if (!kibnal_whole_mem()) { - ibmsg->ibm_u.rdma.rd_key = tx->tx_md.md_rkey; - ibmsg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr; - ibmsg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob; - ibmsg->ibm_u.rdma.ibrm_num_descs = 1; - } - - kibnal_init_tx_msg (tx, type, - kib_rdma_msg_len(ibmsg->ibm_u.rdma.ibrm_num_descs)); - - CDEBUG(D_NET, "Passive: %p cookie "LPX64", key %x, addr " - LPX64", nob %d\n", - tx, tx->tx_passive_rdma_cookie, tx->tx_md.md_rkey, - tx->tx_md.md_addr, nob); - - /* libmsg gets finalized when tx completes. */ - tx->tx_libmsg[0] = libmsg; - - kibnal_launch_tx(tx, nid); - return (PTL_OK); - - failed: - tx->tx_status = rc; - kibnal_tx_done (tx); - return (PTL_FAIL); -} - -void -kibnal_start_active_rdma (int type, int status, - kib_rx_t *rx, lib_msg_t *libmsg, - unsigned int niov, - struct iovec *iov, ptl_kiov_t *kiov, - size_t offset, size_t nob) -{ - kib_msg_t *rxmsg = rx->rx_msg; - kib_msg_t *txmsg; - kib_tx_t *tx; - vv_access_con_bit_mask_t access; - vv_wr_operation_t rdma_op; - int rc; - __u32 i; - - CDEBUG(D_NET, "type %d, status %d, niov %d, offset %d, nob %d\n", - type, status, niov, offset, nob); - - /* Called by scheduler */ - LASSERT (!in_interrupt ()); - - /* Either all pages or all vaddrs */ - LASSERT (!(kiov != NULL && iov != NULL)); - - /* No data if we're completing with failure */ - LASSERT (status == 0 || nob == 0); - - LASSERT (type == IBNAL_MSG_GET_DONE || - type == IBNAL_MSG_PUT_DONE); - - /* Flag I'm completing the RDMA. Even if I fail to send the - * completion message, I will have tried my best so further - * attempts shouldn't be tried. 
*/ - LASSERT (!rx->rx_rdma); - rx->rx_rdma = 1; - - if (type == IBNAL_MSG_GET_DONE) { - access = 0; - rdma_op = vv_wr_rdma_write; - LASSERT (rxmsg->ibm_type == IBNAL_MSG_GET_RDMA); - } else { - access = vv_acc_l_mem_write; - rdma_op = vv_wr_rdma_read; - LASSERT (rxmsg->ibm_type == IBNAL_MSG_PUT_RDMA); - } - - tx = kibnal_get_idle_tx (0); /* Mustn't block */ - if (tx == NULL) { - CERROR ("tx descs exhausted on RDMA from "LPX64 - " completing locally with failure\n", - rx->rx_conn->ibc_peer->ibp_nid); - lib_finalize (&kibnal_lib, NULL, libmsg, PTL_NO_SPACE); - return; - } - LASSERT (tx->tx_nsp == 0); - - if (nob == 0) - GOTO(init_tx, 0); - - /* We actually need to transfer some data (the transfer - * size could get truncated to zero when the incoming - * message is matched) */ - if (kiov != NULL) - rc = kibnal_map_kiov (tx, access, niov, kiov, offset, nob, 1); - else - rc = kibnal_map_iov (tx, access, niov, iov, offset, nob, 1); - - if (rc != 0) { - CERROR ("Can't map RDMA -> "LPX64": %d\n", - rx->rx_conn->ibc_peer->ibp_nid, rc); - /* We'll skip the RDMA and complete with failure. */ - status = rc; - nob = 0; - GOTO(init_tx, rc); - } - - if (!kibnal_whole_mem()) { - tx->tx_msg->ibm_u.rdma.rd_key = tx->tx_md.md_lkey; - tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr; - tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob; - tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 1; - } - - /* XXX ugh. different page-sized hosts. */ - if (tx->tx_msg->ibm_u.rdma.ibrm_num_descs != - rxmsg->ibm_u.rdma.ibrm_num_descs) { - CERROR("tx descs (%u) != rx descs (%u)\n", - tx->tx_msg->ibm_u.rdma.ibrm_num_descs, - rxmsg->ibm_u.rdma.ibrm_num_descs); - /* We'll skip the RDMA and complete with failure. */ - status = rc; - nob = 0; - GOTO(init_tx, rc); - } - - /* map_kiov filled in the rdma descs which describe our side of the - * rdma transfer. */ - /* ibrm_num_descs was verified in rx_callback */ - for(i = 0; i < rxmsg->ibm_u.rdma.ibrm_num_descs; i++) { - kib_rdma_desc_t *ldesc, *rdesc; /* local, remote */ - vv_scatgat_t *ds = &tx->tx_gl[i]; - vv_wr_t *wrq = &tx->tx_wrq[i]; - - ldesc = &tx->tx_msg->ibm_u.rdma.ibrm_desc[i]; - rdesc = &rxmsg->ibm_u.rdma.ibrm_desc[i]; - - ds->v_address = (void *)(unsigned long)ldesc->rd_addr; - ds->length = ldesc->rd_nob; - ds->l_key = tx->tx_msg->ibm_u.rdma.rd_key; - - wrq->wr_id = kibnal_ptr2wreqid(tx, 0); - -#if 0 - /* only the last rdma post triggers tx completion */ - if (i == rxmsg->ibm_u.rdma.ibrm_num_descs - 1) - wrq->completion_notification = 1; - else - wrq->completion_notification = 0; - -#else - /* TODO: hack. Right now complete everything, else the - * driver will deadlock. This is less efficient than - * requestion a notification for only a few of the - * WQE. 
*/ - wrq->completion_notification = 1; -#endif - - wrq->scatgat_list = ds; - wrq->num_of_data_segments = 1; - wrq->wr_type = rdma_op; - - wrq->type.send.solicited_event = 0; - - wrq->type.send.send_qp_type.rc_type.fance_indicator = 0; - wrq->type.send.send_qp_type.rc_type.r_addr = rdesc->rd_addr; - wrq->type.send.send_qp_type.rc_type.r_r_key = rxmsg->ibm_u.rdma.rd_key; - - CDEBUG(D_NET, "prepared RDMA with r_addr=%llx r_key=%x\n", - wrq->type.send.send_qp_type.rc_type.r_addr, - wrq->type.send.send_qp_type.rc_type.r_r_key); - - tx->tx_nsp++; - } - -init_tx: - txmsg = tx->tx_msg; - - txmsg->ibm_u.completion.ibcm_cookie = rxmsg->ibm_u.rdma.ibrm_cookie; - txmsg->ibm_u.completion.ibcm_status = status; - - kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t)); - - if (status == 0 && nob != 0) { - LASSERT (tx->tx_nsp > 1); - /* RDMA: libmsg gets finalized when the tx completes. This - * is after the completion message has been sent, which in - * turn is after the RDMA has finished. */ - tx->tx_libmsg[0] = libmsg; - } else { - LASSERT (tx->tx_nsp == 1); - /* No RDMA: local completion happens now! */ - CDEBUG(D_WARNING,"No data: immediate completion\n"); - lib_finalize (&kibnal_lib, NULL, libmsg, - status == 0 ? PTL_OK : PTL_FAIL); - } - - /* +1 ref for this tx... */ - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - rx->rx_conn, rx->rx_conn->ibc_state, - rx->rx_conn->ibc_peer->ibp_nid, - atomic_read (&rx->rx_conn->ibc_refcount)); - atomic_inc (&rx->rx_conn->ibc_refcount); - /* ...and queue it up */ - kibnal_queue_tx(tx, rx->rx_conn); -} - -static ptl_err_t -kibnal_sendmsg(lib_nal_t *nal, - void *private, - lib_msg_t *libmsg, - ptl_hdr_t *hdr, - int type, - ptl_nid_t nid, - ptl_pid_t pid, - unsigned int payload_niov, - struct iovec *payload_iov, - ptl_kiov_t *payload_kiov, - size_t payload_offset, - size_t payload_nob) -{ - kib_msg_t *ibmsg; - kib_tx_t *tx; - int nob; - - /* NB 'private' is different depending on what we're sending.... */ - - CDEBUG(D_NET, "sending "LPSZ" bytes in %d frags to nid:"LPX64 - " pid %d\n", payload_nob, payload_niov, nid , pid); - - LASSERT (payload_nob == 0 || payload_niov > 0); - LASSERT (payload_niov <= PTL_MD_MAX_IOV); - - /* Thread context if we're sending payload */ - LASSERT (!in_interrupt() || payload_niov == 0); + /* Thread context */ + LASSERT (!in_interrupt()); /* payload is either all vaddrs or all pages */ LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); @@ -1555,61 +1390,166 @@ kibnal_sendmsg(lib_nal_t *nal, /* reply's 'private' is the incoming receive */ kib_rx_t *rx = private; - /* RDMA reply expected? */ - if (rx->rx_msg->ibm_type == IBNAL_MSG_GET_RDMA) { - kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0, - rx, libmsg, payload_niov, - payload_iov, payload_kiov, + LASSERT(rx != NULL); + + if (rx->rx_msg->ibm_type == IBNAL_MSG_IMMEDIATE) { + /* RDMA not expected */ + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); + if (nob > IBNAL_MSG_SIZE) { + CERROR("REPLY for "LPX64" too big (RDMA not requested):" + "%d (max for message is %d)\n", + nid, payload_nob, IBNAL_MSG_SIZE); + CERROR("Can't REPLY IMMEDIATE %d to "LPX64"\n", + nob, nid); + return PTL_FAIL; + } + break; + } + + /* Incoming message consistent with RDMA? */ + if (rx->rx_msg->ibm_type != IBNAL_MSG_GET_REQ) { + CERROR("REPLY to "LPX64" bad msg type %x!!!\n", + nid, rx->rx_msg->ibm_type); + return PTL_FAIL; + } + + /* NB rx_complete() will send GET_NAK when I return to it from + * here, unless I set rx_responded! 
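
In the REPLY path above, the responder RDMA-writes straight into the fragments the requester advertised in its GET_REQ, then sets rx_responded so that rx_complete() knows not to NAK the GET. On the requester's side, the eventual GET_DONE is matched back to its waiting tx by cookie; schematically (types invented):

#include <stdint.h>
#include <stddef.h>

struct wtx { uint64_t cookie; int waiting; struct wtx *next; };

/* Find the tx, parked on the active list, that a completion message
 * with 'cookie' finishes; NULL means a stale or unknown completion. */
static struct wtx *match_completion(struct wtx *active, uint64_t cookie)
{
        struct wtx *t;

        for (t = active; t != NULL; t = t->next)
                if (t->waiting && t->cookie == cookie)
                        return t;
        return NULL;
}
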
*/ + + tx = kibnal_get_idle_tx(0); + if (tx == NULL) { + CERROR("Can't get tx for REPLY to "LPX64"\n", nid); + return PTL_FAIL; + } + + if (payload_nob == 0) + rc = 0; + else if (payload_kiov == NULL) + rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0, + payload_niov, payload_iov, payload_offset, payload_nob); - return (PTL_OK); + else + rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 0, + payload_niov, payload_kiov, + payload_offset, payload_nob); + if (rc != 0) { + CERROR("Can't setup GET src for "LPX64": %d\n", nid, rc); + kibnal_tx_done(tx); + return PTL_FAIL; } - /* Incoming message consistent with immediate reply? */ - if (rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) { - CERROR ("REPLY to "LPX64" bad opbm type %d!!!\n", - nid, rx->rx_msg->ibm_type); - return (PTL_FAIL); + rc = kibnal_init_rdma(tx, IBNAL_MSG_GET_DONE, payload_nob, + &rx->rx_msg->ibm_u.get.ibgm_rd, + rx->rx_msg->ibm_u.get.ibgm_cookie); + if (rc < 0) { + CERROR("Can't setup rdma for GET from "LPX64": %d\n", + nid, rc); + } else if (rc == 0) { + /* No RDMA: local completion may happen now! */ + lib_finalize (&kibnal_lib, NULL, libmsg, PTL_OK); + } else { + /* RDMA: lib_finalize(libmsg) when it completes */ + tx->tx_libmsg[0] = libmsg; } - /* Will it fit in a message? */ - nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); - if (nob > IBNAL_MSG_SIZE) { - CERROR("REPLY for "LPX64" too big (RDMA not requested): %d (max for message is %d)\n", - nid, payload_nob, IBNAL_MSG_SIZE); - return (PTL_FAIL); - } - break; + kibnal_queue_tx(tx, rx->rx_conn); + rx->rx_responded = 1; + return (rc >= 0) ? PTL_OK : PTL_FAIL; } case PTL_MSG_GET: - /* might the REPLY message be big enough to need RDMA? */ + /* will the REPLY message be small enough not to need RDMA? */ nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[libmsg->md->length]); - if (nob > IBNAL_MSG_SIZE) - return (kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, - nid, libmsg, hdr)); - break; + if (nob <= IBNAL_MSG_SIZE) + break; + + tx = kibnal_get_idle_tx(1); /* may block; caller is an app thread */ + LASSERT (tx != NULL); + + ibmsg = tx->tx_msg; + ibmsg->ibm_u.get.ibgm_hdr = *hdr; + ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie; + + if ((libmsg->md->options & PTL_MD_KIOV) == 0) + rc = kibnal_setup_rd_iov(tx, &ibmsg->ibm_u.get.ibgm_rd, + vv_acc_r_mem_write, + libmsg->md->md_niov, + libmsg->md->md_iov.iov, + 0, libmsg->md->length); + else + rc = kibnal_setup_rd_kiov(tx, &ibmsg->ibm_u.get.ibgm_rd, + vv_acc_r_mem_write, + libmsg->md->md_niov, + libmsg->md->md_iov.kiov, + 0, libmsg->md->length); + if (rc != 0) { + CERROR("Can't setup GET sink for "LPX64": %d\n", nid, rc); + kibnal_tx_done(tx); + return PTL_FAIL; + } + + n = ibmsg->ibm_u.get.ibgm_rd.rd_nfrag; + nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]); + kibnal_init_tx_msg(tx, IBNAL_MSG_GET_REQ, nob); + + tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib, nid, libmsg); + if (tx->tx_libmsg[1] == NULL) { + CERROR("Can't create reply for GET -> "LPX64"\n", nid); + kibnal_tx_done(tx); + return PTL_FAIL; + } + + tx->tx_libmsg[0] = libmsg; /* finalise libmsg[0,1] on completion */ + tx->tx_waiting = 1; /* waiting for GET_DONE */ + kibnal_launch_tx(tx, nid); + return PTL_OK; case PTL_MSG_ACK: LASSERT (payload_nob == 0); break; case PTL_MSG_PUT: - /* Is the payload big enough to need RDMA? */ + /* Is the payload small enough not to need RDMA? 
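
The PUT case that follows is the sender's half of a three-message exchange, and the same tx is reused across two separate sends, which is why the PUT_ACK handler earlier re-queues it rather than allocating a new one. The flow, reconstructed from the handlers in this file (message names as in the patch):

/*   sender                          receiver
 *   PUT_REQ {hdr, src_cookie}   ->  match header, register sink
 *                               <-  PUT_ACK {src_cookie, dst_cookie, frags}
 *   RDMA-write payload, then
 *   PUT_DONE {dst_cookie}       ->  finalize
 */
enum put_step { PUT_REQ_SENT, PUT_RDMA_IN_PROGRESS, PUT_COMPLETE };

static enum put_step put_advance(enum put_step s)
{
        switch (s) {
        case PUT_REQ_SENT:         return PUT_RDMA_IN_PROGRESS; /* got PUT_ACK */
        case PUT_RDMA_IN_PROGRESS: return PUT_COMPLETE;         /* sent PUT_DONE */
        default:                   return s;
        }
}
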
*/ nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); - if (nob > IBNAL_MSG_SIZE) - return (kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA, - nid, libmsg, hdr)); - - break; + if (nob <= IBNAL_MSG_SIZE) + break; + + tx = kibnal_get_idle_tx(1); /* may block: caller is app thread */ + LASSERT (tx != NULL); + + if (payload_kiov == NULL) + rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0, + payload_niov, payload_iov, + payload_offset, payload_nob); + else + rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 0, + payload_niov, payload_kiov, + payload_offset, payload_nob); + if (rc != 0) { + CERROR("Can't setup PUT src for "LPX64": %d\n", nid, rc); + kibnal_tx_done(tx); + return PTL_FAIL; + } + + ibmsg = tx->tx_msg; + ibmsg->ibm_u.putreq.ibprm_hdr = *hdr; + ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie; + kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_REQ, sizeof(kib_putreq_msg_t)); + + tx->tx_libmsg[0] = libmsg; /* finalise libmsg on completion */ + tx->tx_waiting = 1; /* waiting for PUT_{ACK,NAK} */ + kibnal_launch_tx(tx, nid); + return PTL_OK; } + LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]) + <= IBNAL_MSG_SIZE); + tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK || - type == PTL_MSG_REPLY || - in_interrupt())); + type == PTL_MSG_REPLY)); if (tx == NULL) { - CERROR ("Can't send %d to "LPX64": tx descs exhausted%s\n", - type, nid, in_interrupt() ? " (intr)" : ""); - return (PTL_NO_SPACE); + CERROR ("Can't send %d to "LPX64": tx descs exhausted\n", type, nid); + return PTL_NO_SPACE; } ibmsg = tx->tx_msg; @@ -1626,18 +1566,15 @@ kibnal_sendmsg(lib_nal_t *nal, payload_offset, payload_nob); } - kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE, - offsetof(kib_immediate_msg_t, - ibim_payload[payload_nob])); - - /* libmsg gets finalized when tx completes */ - tx->tx_libmsg[0] = libmsg; + nob = offsetof(kib_immediate_msg_t, ibim_payload[payload_nob]); + kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE, nob); + tx->tx_libmsg[0] = libmsg; /* finalise libmsg on completion */ kibnal_launch_tx(tx, nid); - return (PTL_OK); + return PTL_OK; } -static ptl_err_t +ptl_err_t kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie, ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, unsigned int payload_niov, struct iovec *payload_iov, @@ -1651,7 +1588,7 @@ kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie, payload_offset, payload_len)); } -static ptl_err_t +ptl_err_t kibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, unsigned int payload_niov, ptl_kiov_t *payload_kiov, @@ -1663,28 +1600,33 @@ kibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, payload_offset, payload_len)); } -static ptl_err_t +ptl_err_t kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg, unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov, - size_t offset, size_t mlen, size_t rlen) + size_t offset, int mlen, int rlen) { kib_rx_t *rx = private; kib_msg_t *rxmsg = rx->rx_msg; - int msg_nob; + kib_conn_t *conn = rx->rx_conn; + kib_tx_t *tx; + kib_msg_t *txmsg; + int nob; + int rc; + int n; LASSERT (mlen <= rlen); - LASSERT (!in_interrupt ()); + LASSERT (mlen >= 0); + LASSERT (!in_interrupt()); /* Either all pages or all vaddrs */ LASSERT (!(kiov != NULL && iov != NULL)); switch (rxmsg->ibm_type) { default: LBUG(); - return (PTL_FAIL); case IBNAL_MSG_IMMEDIATE: - msg_nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]); - if (msg_nob > IBNAL_MSG_SIZE) { + nob = offsetof(kib_msg_t, 
ibm_u.immediate.ibim_payload[rlen]); + if (nob > IBNAL_MSG_SIZE) { CERROR ("Immediate message from "LPX64" too big: %d\n", rxmsg->ibm_u.immediate.ibim_hdr.src_nid, rlen); return (PTL_FAIL); @@ -1702,22 +1644,65 @@ kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg, lib_finalize (nal, NULL, libmsg, PTL_OK); return (PTL_OK); - case IBNAL_MSG_GET_RDMA: + case IBNAL_MSG_PUT_REQ: + /* NB rx_complete() will send PUT_NAK when I return to it from + * here, unless I set rx_responded! */ + + if (mlen == 0) { /* No payload to RDMA */ + lib_finalize(nal, NULL, libmsg, PTL_OK); + return PTL_OK; + } + + tx = kibnal_get_idle_tx(0); + if (tx == NULL) { + CERROR("Can't allocate tx for "LPX64"\n", + conn->ibc_peer->ibp_nid); + return PTL_FAIL; + } + + txmsg = tx->tx_msg; + if (kiov == NULL) + rc = kibnal_setup_rd_iov(tx, + &txmsg->ibm_u.putack.ibpam_rd, + vv_acc_r_mem_write, + niov, iov, offset, mlen); + else + rc = kibnal_setup_rd_kiov(tx, + &txmsg->ibm_u.putack.ibpam_rd, + vv_acc_r_mem_write, + niov, kiov, offset, mlen); + if (rc != 0) { + CERROR("Can't setup PUT sink for "LPX64": %d\n", + conn->ibc_peer->ibp_nid, rc); + kibnal_tx_done(tx); + return PTL_FAIL; + } + + txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie; + txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie; + + n = tx->tx_msg->ibm_u.putack.ibpam_rd.rd_nfrag; + nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]); + kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_ACK, nob); + + tx->tx_libmsg[0] = libmsg; /* finalise libmsg on completion */ + tx->tx_waiting = 1; /* waiting for PUT_DONE */ + kibnal_queue_tx(tx, conn); + + LASSERT (!rx->rx_responded); + rx->rx_responded = 1; + return PTL_OK; + + case IBNAL_MSG_GET_REQ: /* We get called here just to discard any junk after the * GET hdr. */ LASSERT (libmsg == NULL); lib_finalize (nal, NULL, libmsg, PTL_OK); return (PTL_OK); - - case IBNAL_MSG_PUT_RDMA: - kibnal_start_active_rdma (IBNAL_MSG_PUT_DONE, 0, - rx, libmsg, - niov, iov, kiov, offset, mlen); - return (PTL_OK); } } -static ptl_err_t +ptl_err_t kibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, unsigned int niov, struct iovec *iov, size_t offset, size_t mlen, size_t rlen) @@ -1726,7 +1711,7 @@ kibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, offset, mlen, rlen)); } -static ptl_err_t +ptl_err_t kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg, unsigned int niov, ptl_kiov_t *kiov, size_t offset, size_t mlen, size_t rlen) @@ -1735,14 +1720,6 @@ kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg, offset, mlen, rlen)); } -/***************************************************************************** - * the rest of this file concerns connection management. active connetions - * start with connect_peer, passive connections start with passive_callback. - * active disconnects start with conn_close, cm_callback starts passive - * disconnects and contains the guts of how the disconnect state machine - * progresses. - *****************************************************************************/ - int kibnal_thread_start (int (*fn)(void *arg), void *arg) { @@ -1755,43 +1732,36 @@ kibnal_thread_start (int (*fn)(void *arg), void *arg) return (0); } -static void +void kibnal_thread_fini (void) { atomic_dec (&kibnal_data.kib_nthreads); } -/* this can be called by anyone at any time to close a connection. if - * the connection is still established it heads to the connd to start - * the disconnection in a safe context. 
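
kibnal_close_conn_locked() below acts only on the ESTABLISHED state: any other state means the connd already owns the connection, so repeated close calls collapse into a single handoff and merely record the first error. That gate, modeled on its own (names illustrative):

enum cstate { C_INIT, C_ESTABLISHED, C_DISCONNECT1, C_DISCONNECTED };

struct xclose { enum cstate state; int comms_error; };

/* Returns 1 exactly once per connection: on the call that hands it
 * to the connd.  All other calls only remember the first failure. */
static int close_locked(struct xclose *c, int error)
{
        if (error != 0 && c->comms_error == 0)
                c->comms_error = error;

        if (c->state != C_ESTABLISHED)
                return 0;              /* already being handled */

        c->state = C_DISCONNECT1;      /* connd takes it from here */
        return 1;
}
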
It has no effect if called - * on a connection that is already disconnecting */ void kibnal_close_conn_locked (kib_conn_t *conn, int error) { - /* This just does the immmediate housekeeping, and schedules the - * connection for the connd to finish off. + /* This just does the immmediate housekeeping. 'error' is zero for a + * normal shutdown which can happen only after the connection has been + * established. If the connection is established, schedule the + * connection to be finished off by the connd. Otherwise the connd is + * already dealing with it (either to set it up or tear it down). * Caller holds kib_global_lock exclusively in irq context */ kib_peer_t *peer = conn->ibc_peer; - KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_CONNECTING, - IBNAL_CONN_DISCONNECTED); + LASSERT (error != 0 || conn->ibc_state >= IBNAL_CONN_ESTABLISHED); - if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) - return; /* already disconnecting */ + if (error != 0 && conn->ibc_comms_error == 0) + conn->ibc_comms_error = error; + + if (conn->ibc_state != IBNAL_CONN_ESTABLISHED) + return; /* already being handled */ CDEBUG (error == 0 ? D_NET : D_ERROR, "closing conn to "LPX64": error %d\n", peer->ibp_nid, error); - if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { - /* kib_connd_conns takes ibc_list's ref */ - list_del (&conn->ibc_list); - } else { - /* new ref for kib_connd_conns */ - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); - } + /* kib_connd_conns takes ibc_list's ref */ + list_del (&conn->ibc_list); if (list_empty (&peer->ibp_conns) && peer->ibp_persistence == 0) { @@ -1799,45 +1769,139 @@ kibnal_close_conn_locked (kib_conn_t *conn, int error) kibnal_unlink_peer_locked (peer); } - conn->ibc_state = IBNAL_CONN_SEND_DREQ; + kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECT1); - spin_lock (&kibnal_data.kib_connd_lock); + spin_lock(&kibnal_data.kib_connd_lock); list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns); wake_up (&kibnal_data.kib_connd_waitq); - spin_unlock (&kibnal_data.kib_connd_lock); + spin_unlock(&kibnal_data.kib_connd_lock); } void kibnal_close_conn (kib_conn_t *conn, int error) { - unsigned long flags; - - write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + unsigned long flags; + + write_lock_irqsave(&kibnal_data.kib_global_lock, flags); kibnal_close_conn_locked (conn, error); - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); +} + +void +kibnal_handle_early_rxs(kib_conn_t *conn) +{ + unsigned long flags; + kib_rx_t *rx; + + LASSERT (!in_interrupt()); + LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED); + + write_lock_irqsave(&kibnal_data.kib_global_lock, flags); + while (!list_empty(&conn->ibc_early_rxs)) { + rx = list_entry(conn->ibc_early_rxs.next, + kib_rx_t, rx_list); + list_del(&rx->rx_list); + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); + + kibnal_handle_rx(rx); + + write_lock_irqsave(&kibnal_data.kib_global_lock, flags); + } + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); +} + +void +kibnal_conn_disconnected(kib_conn_t *conn) +{ + LIST_HEAD (zombies); + struct list_head *tmp; + struct list_head *nxt; + kib_tx_t *tx; + + /* I'm the connd */ + LASSERT (!in_interrupt()); + LASSERT (current == kibnal_data.kib_connd); + LASSERT (conn->ibc_state >= IBNAL_CONN_INIT); + + kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECTED); + + /* move 
QP to error state to make posted work items complete */ + kibnal_set_qp_state(conn, vv_qp_state_error); + + spin_lock(&conn->ibc_lock); + + /* Complete all tx descs not waiting for sends to complete. + * NB we should be safe from RDMA now that the QP has changed state */ + + list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) { + tx = list_entry (tmp, kib_tx_t, tx_list); + + tx->tx_status = -ECONNABORTED; + tx->tx_waiting = 0; + + if (tx->tx_sending != 0) + continue; + + list_del (&tx->tx_list); + list_add (&tx->tx_list, &zombies); + } + + list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) { + tx = list_entry (tmp, kib_tx_t, tx_list); + + LASSERT (tx->tx_waiting || + tx->tx_sending != 0); + + tx->tx_status = -ECONNABORTED; + tx->tx_waiting = 0; + + if (tx->tx_sending != 0) + continue; + + list_del (&tx->tx_list); + list_add (&tx->tx_list, &zombies); + } + + spin_unlock(&conn->ibc_lock); + + while (!list_empty(&zombies)) { + tx = list_entry (zombies.next, kib_tx_t, tx_list); + + list_del(&tx->tx_list); + kibnal_tx_done (tx); + } + + kibnal_handle_early_rxs(conn); } -static void -kibnal_peer_connect_failed (kib_peer_t *peer, int active, int rc) +void +kibnal_peer_connect_failed (kib_peer_t *peer, int active) { - LIST_HEAD (zombies); + struct list_head zombies; kib_tx_t *tx; unsigned long flags; - LASSERT (rc != 0); + /* Only the connd creates conns => single threaded */ + LASSERT (!in_interrupt()); + LASSERT (current == kibnal_data.kib_connd); LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL); - write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + write_lock_irqsave(&kibnal_data.kib_global_lock, flags); - LASSERT (peer->ibp_connecting != 0); - peer->ibp_connecting--; + if (active) { + LASSERT (peer->ibp_connecting != 0); + peer->ibp_connecting--; + } else { + LASSERT (!kibnal_peer_active(peer)); + } + if (peer->ibp_connecting != 0) { /* another connection attempt under way (loopback?)... 
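
kibnal_conn_disconnected() above drains a dead connection in two passes: with the QP forced into the error state no new completions can race it, so every tx with no send still in flight moves to a private list and is completed outside ibc_lock; the rest are reaped by their error completions. The shape of that drain, simplified (singly linked for brevity):

struct ztx { int sending; struct ztx *next; };

/* Unlink every tx with no work item still posted onto a zombie list;
 * txs with sending != 0 stay and are freed by their completions. */
static struct ztx *collect_zombies(struct ztx **queue)
{
        struct ztx *zombies = NULL, *t, **p = queue;

        while ((t = *p) != NULL) {
                if (t->sending != 0) {
                        p = &t->next;     /* completion still coming */
                        continue;
                }
                *p = t->next;             /* unlink from the queue */
                t->next = zombies;        /* push onto the zombie list */
                zombies = t;
        }
        return zombies;
}
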
*/ - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); return; } @@ -1848,15 +1912,9 @@ kibnal_peer_connect_failed (kib_peer_t *peer, int active, int rc) peer->ibp_reconnect_interval = MIN (peer->ibp_reconnect_interval * 2, IBNAL_MAX_RECONNECT_INTERVAL); - /* Take peer's blocked blocked transmits; I'll complete - * them with error */ - while (!list_empty (&peer->ibp_tx_queue)) { - tx = list_entry (peer->ibp_tx_queue.next, - kib_tx_t, tx_list); - - list_del (&tx->tx_list); - list_add_tail (&tx->tx_list, &zombies); - } + /* Take peer's blocked transmits to complete with error */ + list_add(&zombies, &peer->ibp_tx_queue); + list_del_init(&peer->ibp_tx_queue); if (kibnal_peer_active(peer) && (peer->ibp_persistence == 0)) { @@ -1868,996 +1926,896 @@ kibnal_peer_connect_failed (kib_peer_t *peer, int active, int rc) LASSERT (list_empty(&peer->ibp_tx_queue)); } - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - if (!list_empty (&zombies)) - CERROR ("Deleting messages for "LPX64": connection failed\n", - peer->ibp_nid); - - while (!list_empty (&zombies)) { + if (list_empty (&zombies)) + return; + + CERROR ("Deleting messages for "LPX64": connection failed\n", peer->ibp_nid); + do { tx = list_entry (zombies.next, kib_tx_t, tx_list); list_del (&tx->tx_list); /* complete now */ tx->tx_status = -EHOSTUNREACH; kibnal_tx_done (tx); - } + } while (!list_empty (&zombies)); } -static void -kibnal_connreq_done (kib_conn_t *conn, int active, int status) +void +kibnal_connreq_done(kib_conn_t *conn, int active, int status) { - int state = conn->ibc_state; - kib_peer_t *peer = conn->ibc_peer; - kib_tx_t *tx; - unsigned long flags; - int i; + static cm_reject_data_t rej; - CDEBUG(D_NET, "Enter kibnal_connreq_done for conn=%p, active=%d, status=%d\n", - conn, active, status); + struct list_head txs; + kib_peer_t *peer = conn->ibc_peer; + kib_peer_t *peer2; + unsigned long flags; + kib_tx_t *tx; - /* passive connection has no connreq & vice versa */ - LASSERTF(!active == !(conn->ibc_connreq != NULL), - "%d %p\n", active, conn->ibc_connreq); + /* Only the connd creates conns => single threaded */ + LASSERT (!in_interrupt()); + LASSERT (current == kibnal_data.kib_connd); + LASSERT (conn->ibc_state < IBNAL_CONN_ESTABLISHED); if (active) { - PORTAL_FREE (conn->ibc_connreq, sizeof (*conn->ibc_connreq)); - conn->ibc_connreq = NULL; - } - - write_lock_irqsave (&kibnal_data.kib_global_lock, flags); - - LASSERT (peer->ibp_connecting != 0); - - if (status == 0) { - /* connection established... */ - KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_CONNECTING); - conn->ibc_state = IBNAL_CONN_ESTABLISHED; - - if (!kibnal_peer_active(peer)) { - /* ...but peer deleted meantime */ - status = -ECONNABORTED; - } + LASSERT (peer->ibp_connecting > 0); } else { - KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_INIT_QP, - IBNAL_CONN_CONNECTING); + LASSERT (!kibnal_peer_active(peer)); } + + PORTAL_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); + conn->ibc_connvars = NULL; - if (status == 0) { - /* Everything worked! 
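
Note the retry policy in kibnal_peer_connect_failed() above: each failure doubles ibp_reconnect_interval up to IBNAL_MAX_RECONNECT_INTERVAL, and a successful connection resets it to the minimum, so a dead peer costs progressively less to probe. The clamped doubling on its own (bound invented for the example):

#define MAX_INTERVAL 300    /* stand-in for IBNAL_MAX_RECONNECT_INTERVAL */

static int next_reconnect_interval(int current)
{
        int next = current * 2;     /* back off exponentially per failure */
        return next > MAX_INTERVAL ? MAX_INTERVAL : next;
}
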
*/ - - peer->ibp_connecting--; - - /* +1 ref for ibc_list; caller(== CM)'s ref remains until - * the IB_CM_IDLE callback */ - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); - list_add (&conn->ibc_list, &peer->ibp_conns); - - /* reset reconnect interval for next attempt */ - peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL; - - /* post blocked sends to the new connection */ - spin_lock (&conn->ibc_lock); - - while (!list_empty (&peer->ibp_tx_queue)) { - tx = list_entry (peer->ibp_tx_queue.next, - kib_tx_t, tx_list); - - list_del (&tx->tx_list); - - /* +1 ref for each tx */ - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); - kibnal_queue_tx_locked (tx, conn); - } - - spin_unlock (&conn->ibc_lock); - - /* Nuke any dangling conns from a different peer instance... */ - kibnal_close_stale_conns_locked (conn->ibc_peer, - conn->ibc_incarnation); + if (status != 0) { + /* failed to establish connection */ + switch (conn->ibc_state) { + default: + LBUG(); + case IBNAL_CONN_ACTIVE_CHECK_REPLY: + /* got a connection reply but failed checks */ + LASSERT (active); + memset(&rej, 0, sizeof(rej)); + rej.reason = cm_rej_code_usr_rej; + cm_reject(conn->ibc_cep, &rej); + break; - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + case IBNAL_CONN_ACTIVE_CONNECT: + LASSERT (active); + cm_cancel(conn->ibc_cep); + kibnal_pause(HZ/10); + /* cm_connect() failed immediately or + * callback returned failure */ + break; - /* queue up all the receives */ - for (i = 0; i < IBNAL_RX_MSGS; i++) { - /* +1 ref for rx desc */ - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); + case IBNAL_CONN_ACTIVE_ARP: + LASSERT (active); + /* ibat_get_ib_data() failed immediately + * or callback returned failure */ + break; - CDEBUG(D_NET, "RX[%d] %p->%p\n", - i, &conn->ibc_rxs[i], conn->ibc_rxs[i].rx_msg); + case IBNAL_CONN_INIT: + break; - kibnal_post_rx (&conn->ibc_rxs[i], 0); + case IBNAL_CONN_PASSIVE_WAIT: + LASSERT (!active); + /* cm_accept callback returned failure */ + break; } - kibnal_check_sends (conn); + kibnal_peer_connect_failed(conn->ibc_peer, active); + kibnal_conn_disconnected(conn); return; } - /* connection failed */ - if (state == IBNAL_CONN_CONNECTING) { - /* schedule for connd to close */ - kibnal_close_conn_locked (conn, status); - } else { - /* Don't have a CM comm_id; just wait for refs to drain */ - conn->ibc_state = IBNAL_CONN_DISCONNECTED; - } - - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - - kibnal_peer_connect_failed (conn->ibc_peer, active, status); + /* connection established */ + write_lock_irqsave(&kibnal_data.kib_global_lock, flags); - /* If we didn't establish the connection we don't have to pass - * through the disconnect protocol before dropping the CM ref */ - if (state < IBNAL_CONN_CONNECTING) - kibnal_put_conn (conn); -} - -static int -kibnal_accept (kib_conn_t **connp, cm_cep_handle_t *cep, - ptl_nid_t nid, __u64 incarnation, int queue_depth) -{ - kib_conn_t *conn = kibnal_create_conn(); - kib_peer_t *peer; - kib_peer_t *peer2; - unsigned long flags; - - if (conn == NULL) - return (-ENOMEM); - - if (queue_depth != IBNAL_MSG_QUEUE_SIZE) { - CERROR("Can't accept 
"LPX64": bad queue depth %d (%d expected)\n", - nid, queue_depth, IBNAL_MSG_QUEUE_SIZE); - atomic_dec (&conn->ibc_refcount); - kibnal_destroy_conn(conn); - return (-EPROTO); + if (active) { + LASSERT(conn->ibc_state == IBNAL_CONN_ACTIVE_RTU); + } else { + LASSERT(conn->ibc_state == IBNAL_CONN_PASSIVE_WAIT); } - /* assume 'nid' is a new peer */ - peer = kibnal_create_peer (nid); - if (peer == NULL) { - CDEBUG(D_NET, "--conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_dec (&conn->ibc_refcount); - kibnal_destroy_conn(conn); - return (-ENOMEM); + kibnal_set_conn_state(conn, IBNAL_CONN_ESTABLISHED); + + if (!active) { + peer2 = kibnal_find_peer_locked(peer->ibp_nid); + if (peer2 != NULL) { + /* already in the peer table; swap */ + conn->ibc_peer = peer2; + kibnal_peer_addref(peer2); + kibnal_peer_decref(peer); + peer = conn->ibc_peer; + } else { + /* add 'peer' to the peer table */ + kibnal_peer_addref(peer); + list_add_tail(&peer->ibp_list, + kibnal_nid2peerlist(peer->ibp_nid)); + } } - write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + /* Add conn to peer's list and nuke any dangling conns from a different + * peer instance... */ + kibnal_conn_addref(conn); /* +1 ref for ibc_list */ + list_add(&conn->ibc_list, &peer->ibp_conns); + kibnal_close_stale_conns_locked (conn->ibc_peer, + conn->ibc_incarnation); + + if (!kibnal_peer_active(peer) || /* peer has been deleted */ + conn->ibc_comms_error != 0 || /* comms error */ + conn->ibc_disconnect) { /* need to disconnect */ + + /* start to shut down connection */ + kibnal_close_conn_locked(conn, -ECONNABORTED); - peer2 = kibnal_find_peer_locked(nid); - if (peer2 == NULL) { - /* peer table takes my ref on peer */ - list_add_tail (&peer->ibp_list, kibnal_nid2peerlist(nid)); - } else { - kib_peer_decref (peer); - peer = peer2; + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); + kibnal_peer_connect_failed(peer, active); + return; } - kib_peer_addref(peer); /* +1 ref for conn */ - peer->ibp_connecting++; - - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - - conn->ibc_peer = peer; - conn->ibc_state = IBNAL_CONN_CONNECTING; - /* conn->ibc_cep is set when cm_accept is called */ - conn->ibc_incarnation = incarnation; - conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE; - - *connp = conn; - return (0); -} + if (active) + peer->ibp_connecting--; -static void kibnal_move_qp_to_error(kib_conn_t *conn) -{ - vv_qp_attr_t qp_attr; - vv_return_t retval; + /* grab pending txs while I have the lock */ + list_add(&txs, &peer->ibp_tx_queue); + list_del_init(&peer->ibp_tx_queue); + + /* reset reconnect interval for next attempt */ + peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL; + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); + + /* Schedule blocked txs */ + spin_lock (&conn->ibc_lock); + while (!list_empty (&txs)) { + tx = list_entry (txs.next, kib_tx_t, tx_list); + list_del (&tx->tx_list); - qp_attr.modify.qp_modify_into_state = vv_qp_state_error; - qp_attr.modify.vv_qp_attr_mask = VV_QP_AT_STATE; - qp_attr.modify.qp_type = vv_qp_type_r_conn; + kibnal_queue_tx_locked (tx, conn); + } + spin_unlock (&conn->ibc_lock); + kibnal_check_sends (conn); - retval = vv_qp_modify(kibnal_data.kib_hca, conn->ibc_qp, &qp_attr, &conn->ibc_qp_attrs); - if (retval) - CERROR("couldn't move qp into error state, error %d\n", retval); + /* schedule blocked rxs */ + kibnal_handle_early_rxs(conn); } -static void kibnal_flush_pending(kib_conn_t 
*conn) +void +kibnal_cm_callback(cm_cep_handle_t cep, cm_conn_data_t *cmdata, void *arg) { - LIST_HEAD (zombies); - struct list_head *tmp; - struct list_head *nxt; - kib_tx_t *tx; - unsigned long flags; - int done; - - /* NB we wait until the connection has closed before completing - * outstanding passive RDMAs so we can be sure the network can't - * touch the mapped memory any more. */ - KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_DISCONNECTED); - - /* set the QP to the error state so that we get flush callbacks - * on our posted receives which can then drop their conn refs */ - kibnal_move_qp_to_error(conn); - - spin_lock_irqsave (&conn->ibc_lock, flags); - - /* grab passive RDMAs not waiting for the tx callback */ - list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) { - tx = list_entry (tmp, kib_tx_t, tx_list); + static cm_dreply_data_t drep; /* just zeroed space */ + + kib_conn_t *conn = (kib_conn_t *)arg; + unsigned long flags; + + /* CAVEAT EMPTOR: tasklet context */ - LASSERT (tx->tx_passive_rdma || - !tx->tx_passive_rdma_wait); + switch (cmdata->status) { + default: + LBUG(); + + case cm_event_disconn_request: + /* IBNAL_CONN_ACTIVE_RTU: gets closed in kibnal_connreq_done + * IBNAL_CONN_ESTABLISHED: I start it closing + * otherwise: it's closing anyway */ + cm_disconnect(conn->ibc_cep, NULL, &drep); + cm_cancel(conn->ibc_cep); - LASSERT (tx->tx_passive_rdma_wait || - tx->tx_sending != 0); + write_lock_irqsave(&kibnal_data.kib_global_lock, flags); + LASSERT (!conn->ibc_disconnect); + conn->ibc_disconnect = 1; - /* still waiting for tx callback? */ - if (!tx->tx_passive_rdma_wait) - continue; + switch (conn->ibc_state) { + default: + LBUG(); - tx->tx_status = -ECONNABORTED; - tx->tx_passive_rdma_wait = 0; - done = (tx->tx_sending == 0); + case IBNAL_CONN_ACTIVE_RTU: + /* kibnal_connreq_done is getting there; It'll see + * ibc_disconnect set... */ + kibnal_conn_decref(conn); /* lose my ref */ + break; - if (!done) - continue; + case IBNAL_CONN_ESTABLISHED: + /* kibnal_connreq_done got there already; get + * disconnect going... */ + kibnal_close_conn_locked(conn, 0); + kibnal_conn_decref(conn); /* lose my ref */ + break; - list_del (&tx->tx_list); - list_add (&tx->tx_list, &zombies); - } + case IBNAL_CONN_DISCONNECT1: + /* kibnal_terminate_conn is getting there; It'll see + * ibc_disconnect set... */ + kibnal_conn_decref(conn); /* lose my ref */ + break; - /* grab all blocked transmits */ - list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) { - tx = list_entry (tmp, kib_tx_t, tx_list); + case IBNAL_CONN_DISCONNECT2: + /* kibnal_terminate_conn got there already; complete + * the disconnect. NB kib_connd_conns takes my ref */ + spin_lock(&kibnal_data.kib_connd_lock); + list_add_tail(&conn->ibc_list, &kibnal_data.kib_connd_conns); + wake_up(&kibnal_data.kib_connd_waitq); + spin_unlock(&kibnal_data.kib_connd_lock); + break; + } + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); + return; - list_del (&tx->tx_list); - list_add (&tx->tx_list, &zombies); - } - - spin_unlock_irqrestore (&conn->ibc_lock, flags); - - while (!list_empty(&zombies)) { - tx = list_entry (zombies.next, kib_tx_t, tx_list); - - list_del(&tx->tx_list); - kibnal_tx_done (tx); + case cm_event_disconn_timeout: + case cm_event_disconn_reply: + write_lock_irqsave(&kibnal_data.kib_global_lock, flags); + LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT2); + LASSERT (!conn->ibc_disconnect); + conn->ibc_disconnect = 1; + + /* kibnal_terminate_conn sent the disconnect request. 
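+                 * This event ends the disconnect handshake; queue the conn
+                 * for the connd to finish the teardown in thread context.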
+ * NB kib_connd_conns takes my ref */ + spin_lock(&kibnal_data.kib_connd_lock); + list_add_tail(&conn->ibc_list, &kibnal_data.kib_connd_conns); + wake_up(&kibnal_data.kib_connd_waitq); + spin_unlock(&kibnal_data.kib_connd_lock); + + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); + break; + + case cm_event_connected: + case cm_event_conn_timeout: + case cm_event_conn_reject: + LASSERT (conn->ibc_state == IBNAL_CONN_PASSIVE_WAIT); + conn->ibc_connvars->cv_conndata = *cmdata; + + spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); + list_add_tail(&conn->ibc_list, &kibnal_data.kib_connd_conns); + wake_up(&kibnal_data.kib_connd_waitq); + spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags); + break; } } -static void -kibnal_reject (cm_cep_handle_t cep, cm_rej_code_t reason) +void +kibnal_check_passive_wait(kib_conn_t *conn) { - cm_reject_data_t *rej; - - PORTAL_ALLOC(rej, sizeof(*rej)); - if (rej == NULL) /* PORTAL_ALLOC() will CERROR on failure */ - return; - - rej->reason = reason; - cm_reject(cep, rej); - PORTAL_FREE(rej, sizeof(*rej)); -} + int rc; -static void get_av_from_path(ib_path_record_v2_t *path, vv_add_vec_t *av) -{ - av->service_level = path->sl; - av->grh_flag = 0; /* TODO: correct? */ - av->dlid = path->dlid; - av->pmtu = path->mtu; - - /* From sdp-hca-params.h. */ - switch(path->rate) { - case 2: - av->max_static_rate = 1; - break; - case 3: - case 4: + switch (conn->ibc_connvars->cv_conndata.status) { default: - av->max_static_rate = 0; + LBUG(); + + case cm_event_connected: + kibnal_conn_addref(conn); /* ++ ref for CM callback */ + rc = kibnal_set_qp_state(conn, vv_qp_state_rts); + if (rc != 0) + conn->ibc_comms_error = rc; + /* connection _has_ been established; it's just that we've had + * an error immediately... */ + kibnal_connreq_done(conn, 0, 0); + break; + + case cm_event_conn_timeout: + kibnal_connreq_done(conn, 0, -ETIMEDOUT); + break; + + case cm_event_conn_reject: + kibnal_connreq_done(conn, 0, -ECONNRESET); break; } +} - av->l_ack_timeout = IBNAL_ACK_TIMEOUT; - av->retry_count = IBNAL_RETRY; - av->rnr_retry_count = IBNAL_RNR_RETRY; - av->source_path_bit = 0; - - av->global_dest.flow_lable = path->flow_label; - av->global_dest.hope_limit = path->hop_limut; - av->global_dest.traffic_class = path->traffic_class; - av->global_dest.s_gid_index = 0; - av->global_dest.d_gid = path->dgid; -}; - -static vv_return_t -kibnal_qp_rts(vv_qp_h_t qp_handle, __u32 qpn, __u8 resp_res, - ib_path_record_v2_t *path, __u8 init_depth, __u32 send_psn) +void +kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq) { - vv_qp_attr_t qp_attr; - vv_return_t retval; + static cm_reply_data_t reply; + static cm_reject_data_t reject; - ENTRY; + kib_msg_t *rxmsg = (kib_msg_t *)cmreq->priv_data; + kib_msg_t *txmsg; + kib_conn_t *conn = NULL; + int rc = 0; + kib_connvars_t *cv; + kib_peer_t *tmp_peer; + cm_return_t cmrc; + vv_return_t vvrc; + + /* I'm the connd executing in thread context + * No concurrency problems with static data! */ + LASSERT (!in_interrupt()); + LASSERT (current == kibnal_data.kib_connd); -#if 1 - /* TODO - Hack. I don't know whether I get bad values from the - * stack or if I'm using the wrong names. 
*/ - resp_res = 8; - init_depth = 8; -#endif + if (cmreq->sid != IBNAL_SERVICE_NUMBER) { + CERROR(LPX64" != IBNAL_SERVICE_NUMBER("LPX64")\n", + cmreq->sid, (__u64)IBNAL_SERVICE_NUMBER); + goto reject; + } - /* RTR */ - qp_attr.modify.qp_modify_into_state = vv_qp_state_rtr; - qp_attr.modify.vv_qp_attr_mask = - VV_QP_AT_STATE | - VV_QP_AT_ADD_VEC | - VV_QP_AT_DEST_QP | - VV_QP_AT_R_PSN | - VV_QP_AT_RESP_RDMA_ATOM_OUT_NUM | - VV_QP_AT_MIN_RNR_NAK_T | VV_QP_AT_OP_F; - - qp_attr.modify.qp_type = vv_qp_type_r_conn; - - get_av_from_path(path, &qp_attr.modify.params.rtr.remote_add_vec); - qp_attr.modify.params.rtr.destanation_qp = qpn; - qp_attr.modify.params.rtr.receive_psn = IBNAL_STARTING_PSN; - qp_attr.modify.params.rtr.responder_rdma_r_atom_num = resp_res; - qp_attr.modify.params.rtr.opt_min_rnr_nak_timer = 16; /* 20 ms */ - - /* For now, force MTU to 1KB (Voltaire's advice). */ - qp_attr.modify.params.rtr.remote_add_vec.pmtu = vv_mtu_1024; - - retval = vv_qp_modify(kibnal_data.kib_hca, qp_handle, &qp_attr, NULL); - if (retval) { - CERROR("Cannot modify QP to RTR: %d\n", retval); - RETURN(retval); - } - - /* RTS */ - qp_attr.modify.qp_modify_into_state = vv_qp_state_rts; - qp_attr.modify.vv_qp_attr_mask = - VV_QP_AT_STATE | - VV_QP_AT_L_ACK_T | - VV_QP_AT_RETRY_NUM | - VV_QP_AT_RNR_NUM | - VV_QP_AT_S_PSN | - VV_QP_AT_DEST_RDMA_ATOM_OUT_NUM; - qp_attr.modify.qp_type = vv_qp_type_r_conn; - - qp_attr.modify.params.rts.local_ack_timeout = path->pkt_life_time + 2; /* 2 or 1? */ - qp_attr.modify.params.rts.retry_num = IBNAL_RETRY; - qp_attr.modify.params.rts.rnr_num = IBNAL_RNR_RETRY; - qp_attr.modify.params.rts.send_psn = send_psn; - qp_attr.modify.params.rts.dest_out_rdma_r_atom_num = init_depth; - qp_attr.modify.params.rts.flow_control = 1; /* Stack does not use it. 
*/ - - retval = vv_qp_modify(kibnal_data.kib_hca, qp_handle, &qp_attr, NULL); - if (retval) { - CERROR("Cannot modify QP to RTS: %d\n", retval); - } - - RETURN(retval); -} + rc = kibnal_unpack_msg(rxmsg, cm_REQ_priv_data_len); + if (rc != 0) { + CERROR("Can't parse connection request: %d\n", rc); + goto reject; + } -static void -kibnal_connect_reply (cm_cep_handle_t cep, cm_conn_data_t *info, kib_conn_t *conn) -{ - vv_hca_attrib_t *ca_attr = &kibnal_data.kib_hca_attrs; - kib_wire_connreq_t *wcr; - cm_reply_data_t *rep = &info->data.reply; - cm_rej_code_t reason; - vv_return_t retval; + if (rxmsg->ibm_type != IBNAL_MSG_CONNREQ) { + CERROR("Unexpected connreq msg type: %x from "LPX64"\n", + rxmsg->ibm_type, rxmsg->ibm_srcnid); + goto reject; + } - wcr = (kib_wire_connreq_t *)info->data.reply.priv_data; + if (rxmsg->ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid) { + CERROR("Can't accept "LPX64": bad dst nid "LPX64"\n", + rxmsg->ibm_srcnid, rxmsg->ibm_dstnid); + goto reject; + } - if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) { - CERROR ("Can't connect "LPX64": bad magic %08x\n", - conn->ibc_peer->ibp_nid, le32_to_cpu(wcr->wcr_magic)); - GOTO(reject, reason = cm_rej_code_usr_rej); + if (rxmsg->ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) { + CERROR("Can't accept "LPX64": incompatible queue depth %d (%d wanted)\n", + rxmsg->ibm_srcnid, rxmsg->ibm_u.connparams.ibcp_queue_depth, + IBNAL_MSG_QUEUE_SIZE); + goto reject; } - - if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) { - CERROR ("Can't connect "LPX64": bad version %d\n", - conn->ibc_peer->ibp_nid, le16_to_cpu(wcr->wcr_magic)); - GOTO(reject, reason = cm_rej_code_usr_rej); + + if (rxmsg->ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE) { + CERROR("Can't accept "LPX64": message size %d too big (%d max)\n", + rxmsg->ibm_srcnid, rxmsg->ibm_u.connparams.ibcp_max_msg_size, + IBNAL_MSG_SIZE); + goto reject; } - - if (wcr->wcr_queue_depth != cpu_to_le16(IBNAL_MSG_QUEUE_SIZE)) { - CERROR ("Can't connect "LPX64": bad queue depth %d\n", - conn->ibc_peer->ibp_nid, - le16_to_cpu(wcr->wcr_queue_depth)); - GOTO(reject, reason = cm_rej_code_usr_rej); + + if (rxmsg->ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) { + CERROR("Can't accept "LPX64": max frags %d too big (%d max)\n", + rxmsg->ibm_srcnid, rxmsg->ibm_u.connparams.ibcp_max_frags, + IBNAL_MAX_RDMA_FRAGS); + goto reject; } - - if (le64_to_cpu(wcr->wcr_nid) != conn->ibc_peer->ibp_nid) { - CERROR ("Unexpected NID "LPX64" from "LPX64"\n", - le64_to_cpu(wcr->wcr_nid), conn->ibc_peer->ibp_nid); - GOTO(reject, reason = cm_rej_code_usr_rej); + + conn = kibnal_create_conn(cep); + if (conn == NULL) { + CERROR("Can't create conn for "LPX64"\n", rxmsg->ibm_srcnid); + goto reject; } - - CDEBUG(D_NET, "Connection %p -> "LPX64" REP_RECEIVED.\n", - conn, conn->ibc_peer->ibp_nid); - - conn->ibc_incarnation = le64_to_cpu(wcr->wcr_incarnation); - conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE; - - retval = kibnal_qp_rts(conn->ibc_qp, rep->qpn, - min_t(__u8, rep->arb_initiator_depth, - ca_attr->max_read_atom_qp_outstanding), - &conn->ibc_connreq->cr_path, - min_t(__u8, rep->arb_resp_res, - ca_attr->max_qp_depth_for_init_read_atom), - rep->start_psn); - - if (retval) { - CERROR("Connection %p -> "LPX64" QP RTS/RTR failed: %d\n", - conn, conn->ibc_peer->ibp_nid, retval); - GOTO(reject, reason = cm_rej_code_no_qp); - } - - dump_qp(conn); - - /* the callback arguments are ignored for an active accept */ - /* TODO: memset cmrtu? 
*/ - retval = cm_accept(cep, NULL, &conn->ibc_connreq->cr_cm_rtu, kibnal_cm_callback, conn); - if (retval) { - CERROR("Connection %p -> "LPX64" CMAccept RTU failed: %d\n", - conn, conn->ibc_peer->ibp_nid, retval); - kibnal_connreq_done (conn, 1, -ECONNABORTED); - /* XXX don't call reject after accept fails? */ - return; + + /* assume 'rxmsg->ibm_srcnid' is a new peer */ + tmp_peer = kibnal_create_peer (rxmsg->ibm_srcnid); + if (tmp_peer == NULL) { + CERROR("Can't create tmp peer for "LPX64"\n", rxmsg->ibm_srcnid); + kibnal_conn_decref(conn); + conn = NULL; + goto reject; } - CDEBUG(D_NET, "Connection %p -> "LPX64" Established\n", - conn, conn->ibc_peer->ibp_nid); - - kibnal_connreq_done (conn, 1, 0); - - return; + conn->ibc_peer = tmp_peer; /* conn takes over my ref */ + conn->ibc_incarnation = rxmsg->ibm_srcstamp; + conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE; -reject: - kibnal_reject(cep, reason); - kibnal_connreq_done (conn, 1, -EPROTO); -} + cv = conn->ibc_connvars; -/* Off level CM callback */ -static void -_kibnal_cm_callback(void * arg) -{ - struct cm_off_level *cm_tq = arg; - cm_cep_handle_t cep = cm_tq->cep; - cm_conn_data_t *info = cm_tq->info; - kib_conn_t *conn = cm_tq->conn; - vv_return_t retval; + cv->cv_txpsn = cmreq->cep_data.start_psn; + cv->cv_remote_qpn = cmreq->cep_data.qpn; + cv->cv_path = cmreq->path_data.path; + cv->cv_rnr_count = cmreq->cep_data.rtr_retry_cnt; + // XXX cmreq->cep_data.retry_cnt; + cv->cv_port = cmreq->cep_data.local_port_num; - CDEBUG(D_NET, "CM event 0x%x for CEP %p\n", info->status, cep); + vvrc = gid2gid_index(kibnal_data.kib_hca, cv->cv_port, + &cv->cv_path.sgid, &cv->cv_sgid_index); + LASSERT (vvrc == vv_return_ok); + + vvrc = pkey2pkey_index(kibnal_data.kib_hca, cv->cv_port, + cv->cv_path.pkey, &cv->cv_pkey_index); + LASSERT (vvrc == vv_return_ok); - PORTAL_FREE(cm_tq, sizeof(*cm_tq)); + rc = kibnal_set_qp_state(conn, vv_qp_state_init); + if (rc != 0) + goto reject; - /* Established Connection Notifier */ - switch (info->status) { - case cm_event_connected: - CDEBUG(D_NET, "Connection %p -> "LPX64" Established\n", - conn, conn->ibc_peer->ibp_nid); - kibnal_connreq_done (conn, 0, 0); - break; + rc = kibnal_post_receives(conn); + if (rc != 0) { + CERROR("Can't post receives for "LPX64"\n", rxmsg->ibm_srcnid); + goto reject; + } - case cm_event_conn_timeout: - case cm_event_conn_reject: - /* TODO: be sure this is called only if REQ times out. 
*/ - CERROR("connection timed out\n"); - LASSERT(conn->ibc_state == IBNAL_CONN_CONNECTING); - conn->ibc_state = IBNAL_CONN_INIT_QP; - kibnal_connreq_done (conn, 1, -EINVAL); - break; + rc = kibnal_set_qp_state(conn, vv_qp_state_rtr); + if (rc != 0) + goto reject; + + memset(&reply, 0, sizeof(reply)); + reply.qpn = cv->cv_local_qpn; + reply.qkey = IBNAL_QKEY; + reply.start_psn = cv->cv_rxpsn; + reply.arb_initiator_depth = IBNAL_ARB_INITIATOR_DEPTH; + reply.arb_resp_res = IBNAL_ARB_RESP_RES; + reply.failover_accepted = IBNAL_FAILOVER_ACCEPTED; + reply.rnr_retry_count = cv->cv_rnr_count; + reply.targ_ack_delay = kibnal_data.kib_hca_attrs.ack_delay; + + txmsg = (kib_msg_t *)&reply.priv_data; + kibnal_init_msg(txmsg, IBNAL_MSG_CONNACK, + sizeof(txmsg->ibm_u.connparams)); + LASSERT (txmsg->ibm_nob <= cm_REP_priv_data_len); + txmsg->ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE; + txmsg->ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE; + txmsg->ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS; + kibnal_pack_msg(txmsg, 0, rxmsg->ibm_srcnid, rxmsg->ibm_srcstamp); + + kibnal_set_conn_state(conn, IBNAL_CONN_PASSIVE_WAIT); + + cmrc = cm_accept(conn->ibc_cep, &reply, NULL, + kibnal_cm_callback, conn); - case cm_event_conn_reply: - kibnal_connect_reply(cep, info, conn); - break; + if (cmrc == cm_stat_success) + return; /* callback has got my ref on conn */ - case cm_event_disconn_request: - /* XXX lock around these state management bits? */ - if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) - kibnal_close_conn (conn, 0); - conn->ibc_state = IBNAL_CONN_DREP; + /* back out state change (no callback happening) */ + kibnal_set_conn_state(conn, IBNAL_CONN_INIT); + rc = -EIO; - retval = cm_disconnect(conn->ibc_cep, NULL, &kibnal_data.cm_data.drep_data); - if (retval) - CERROR("disconnect rep failed: %d\n", retval); - - /* Fall through ... 
*/ - - /* these both guarantee that no more cm callbacks will occur */ - case cm_event_disconnected: /* aka cm_event_disconn_timeout */ - case cm_event_disconn_reply: - CDEBUG(D_NET, "Connection %p -> "LPX64" disconnect done.\n", - conn, conn->ibc_peer->ibp_nid); + reject: + CERROR("Rejected connreq from "LPX64"\n", rxmsg->ibm_srcnid); - conn->ibc_state = IBNAL_CONN_DISCONNECTED; - kibnal_flush_pending(conn); - kibnal_put_conn(conn); /* Lose CM's ref */ - break; - - default: - CERROR("unknown status %d on Connection %p -> "LPX64"\n", - info->status, conn, conn->ibc_peer->ibp_nid); - LBUG(); - break; - } + memset(&reject, 0, sizeof(reject)); + reject.reason = cm_rej_code_usr_rej; + cm_reject(cep, &reject); - return; + if (conn != NULL) { + LASSERT (rc != 0); + kibnal_connreq_done(conn, 0, rc); + } else { + cm_destroy_cep(cep); + } } -static void -kibnal_cm_callback(cm_cep_handle_t cep, cm_conn_data_t *info, void *arg) +void +kibnal_listen_callback(cm_cep_handle_t cep, cm_conn_data_t *data, void *arg) { - struct cm_off_level *cm_tq; + cm_request_data_t *cmreq = &data->data.request; + kib_pcreq_t *pcr; + unsigned long flags; + + LASSERT (arg == NULL); - LASSERT(cep); - LASSERT(info); + if (data->status != cm_event_conn_request) { + CERROR("status %d is not cm_event_conn_request\n", + data->status); + return; + } - CDEBUG(D_NET, "CM event 0x%x for CEP %p\n", info->status, cep); + PORTAL_ALLOC_ATOMIC(pcr, sizeof(*pcr)); + if (pcr == NULL) { + CERROR("Can't allocate passive connreq\n"); - PORTAL_ALLOC_ATOMIC(cm_tq, sizeof(*cm_tq)); - if (cm_tq == NULL) { - CERROR("Failed to allocate a CM off level structure\n"); + cm_reject(cep, &((cm_reject_data_t) /* NB RO struct */ + {.reason = cm_rej_code_no_res,})); + cm_destroy_cep(cep); return; } - cm_tq->tq.sync = 0; - cm_tq->tq.routine = _kibnal_cm_callback; - cm_tq->tq.data = cm_tq; - - cm_tq->cep = cep; - cm_tq->info = info; - cm_tq->conn = (kib_conn_t *)arg; + pcr->pcr_cep = cep; + pcr->pcr_cmreq = *cmreq; + + spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); - schedule_task(&cm_tq->tq); + list_add_tail(&pcr->pcr_list, &kibnal_data.kib_connd_pcreqs); + wake_up(&kibnal_data.kib_connd_waitq); + + spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags); } -static int -kibnal_set_cm_flags(cm_cep_handle_t cep) -{ -#ifdef TODO -voltaire cm doesnot appear to have that functionnality - FSTATUS frc; - uint32 value = 1; - frc = iibt_cm_modify_cep(cep, CM_FLAG_TIMEWAIT_CALLBACK, - (char *)&value, sizeof(value), 0); - if (frc != FSUCCESS) { - CERROR("error setting timeout callback: %d\n", frc); - return -1; - } +void +kibnal_active_connect_callback (cm_cep_handle_t cep, cm_conn_data_t *cd, + void *arg) +{ + /* CAVEAT EMPTOR: tasklet context */ + kib_conn_t *conn = (kib_conn_t *)arg; + kib_connvars_t *cv = conn->ibc_connvars; + unsigned long flags; -#if 0 - frc = iibt_cm_modify_cep(cep, CM_FLAG_ASYNC_ACCEPT, (char *)&value, - sizeof(value), 0); - if (frc != FSUCCESS) { - CERROR("error setting async accept: %d\n", frc); - return -1; - } -#endif -#endif + LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT); + cv->cv_conndata = *cd; - return 0; + spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); + /* connd takes my ref */ + list_add_tail(&conn->ibc_list, &kibnal_data.kib_connd_conns); + wake_up(&kibnal_data.kib_connd_waitq); + spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags); } -/* Off level listen callback */ -static void -_kibnal_listen_callback(void *arg) +void +kibnal_connect_conn (kib_conn_t *conn) { - struct cm_off_level *cm_tq = arg; 
- cm_cep_handle_t cep = cm_tq->cep; - cm_conn_data_t *info = cm_tq->info; - vv_hca_attrib_t *ca_attr = &kibnal_data.kib_hca_attrs; - cm_request_data_t *req; - cm_reply_data_t *rep = NULL; - kib_wire_connreq_t *wcr; - kib_conn_t *conn = NULL; - cm_rej_code_t reason = 0; - int rc = 0; - vv_return_t retval; - vv_qp_attr_t *query; - void *qp_context; - - LASSERT(cep); - LASSERT(info); - - CDEBUG(D_NET, "LISTEN status 0x%x for CEP %p\n", info->status, cep); - - PORTAL_FREE(cm_tq, sizeof(*cm_tq)); - - req = &info->data.request; - wcr = (kib_wire_connreq_t *)req->priv_data; + static cm_request_data_t cmreq; + kib_msg_t *msg = (kib_msg_t *)&cmreq.priv_data; + kib_connvars_t *cv = conn->ibc_connvars; + kib_peer_t *peer = conn->ibc_peer; + cm_return_t cmrc; + + /* Only called by connd => statics OK */ + LASSERT (!in_interrupt()); + LASSERT (current == kibnal_data.kib_connd); + LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP); - CDEBUG(D_NET, "%d from "LPX64"\n", info->status, - le64_to_cpu(wcr->wcr_nid)); + memset(&cmreq, 0, sizeof(cmreq)); -#ifdef TODO - is there an equivalent? - if (info->status == FCM_CONNECT_CANCEL) - return; -#endif + cmreq.sid = IBNAL_SERVICE_NUMBER; + + cmreq.cep_data.ca_guid = kibnal_data.kib_hca_attrs.guid; + cmreq.cep_data.qpn = cv->cv_local_qpn; + cmreq.cep_data.retry_cnt = IBNAL_RETRY_CNT; + cmreq.cep_data.rtr_retry_cnt = IBNAL_RNR_CNT; + cmreq.cep_data.start_psn = cv->cv_rxpsn; + cmreq.cep_data.end_to_end_flow_ctrl = IBNAL_EE_FLOW_CNT; + // XXX ack_timeout? + // offered_resp_res + // offered_initiator_depth + + cmreq.path_data.subn_local = IBNAL_LOCAL_SUB; + cmreq.path_data.path = cv->cv_path; - LASSERT (info->status == cm_event_conn_request); + kibnal_init_msg(msg, IBNAL_MSG_CONNREQ, sizeof(msg->ibm_u.connparams)); + LASSERT(msg->ibm_nob <= cm_REQ_priv_data_len); + msg->ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE; + msg->ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE; + msg->ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS; + kibnal_pack_msg(msg, 0, peer->ibp_nid, 0); - if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) { - CERROR ("Can't accept: bad magic %08x\n", - le32_to_cpu(wcr->wcr_magic)); - GOTO(out, reason = cm_rej_code_usr_rej); - } + CDEBUG(D_NET, "Connecting %p to "LPX64"\n", conn, peer->ibp_nid); - if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) { - CERROR ("Can't accept: bad version %d\n", - le16_to_cpu(wcr->wcr_magic)); - GOTO(out, reason = cm_rej_code_usr_rej); - } + kibnal_conn_addref(conn); /* ++ref for CM callback */ + kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CONNECT); - rc = kibnal_accept(&conn, cep, - le64_to_cpu(wcr->wcr_nid), - le64_to_cpu(wcr->wcr_incarnation), - le16_to_cpu(wcr->wcr_queue_depth)); - if (rc != 0) { - CERROR ("Can't accept "LPX64": %d\n", - le64_to_cpu(wcr->wcr_nid), rc); - GOTO(out, reason = cm_rej_code_no_res); - } - - /* TODO: I hope I got the ca_attr names correctly. 
*/ - retval = kibnal_qp_rts(conn->ibc_qp, req->cep_data.qpn, - min_t(__u8, req->cep_data.offered_initiator_depth, - ca_attr->max_read_atom_qp_outstanding), - &req->path_data.path, - min_t(__u8, req->cep_data.offered_resp_res, - ca_attr->max_qp_depth_for_init_read_atom), - req->cep_data.start_psn); - - if (retval) { - CERROR ("Can't mark QP RTS/RTR "LPX64": %d\n", - le64_to_cpu(wcr->wcr_nid), retval); - GOTO(out, reason = cm_rej_code_no_qp); - } - - dump_qp(conn); - - retval = vv_qp_query(kibnal_data.kib_hca, conn->ibc_qp, &qp_context, &conn->ibc_qp_attrs); - if (retval) { - CERROR ("Couldn't query qp attributes "LPX64": %d\n", - le64_to_cpu(wcr->wcr_nid), retval); - GOTO(out, reason = cm_rej_code_no_qp); - } - query = &conn->ibc_qp_attrs; - - PORTAL_ALLOC(rep, sizeof(*rep)); - if (rep == NULL) { - CERROR ("can't reply and receive buffers\n"); - GOTO(out, reason = cm_rej_code_insuff_resp_res); - } - - /* don't try to deref this into the incoming wcr :) */ - wcr = (kib_wire_connreq_t *)rep->priv_data; - - *rep = (cm_reply_data_t) { - .qpn = query->query.qp_num, - .start_psn = query->query.receve_psn, - .arb_resp_res = query->query.rdma_r_atom_outstand_num, - .arb_initiator_depth = query->query.rdma_r_atom_outstand_num, - .targ_ack_delay = 0, - .failover_accepted = 0, - .end_to_end_flow_ctrl = 1, /* (query->query.flow_control is never set) */ - .rnr_retry_count = req->cep_data.rtr_retry_cnt, - }; + cmrc = cm_connect(conn->ibc_cep, &cmreq, + kibnal_active_connect_callback, conn); + if (cmrc == cm_stat_success) { + CDEBUG(D_NET, "connection REQ sent to "LPX64"\n", + peer->ibp_nid); + return; + } - *wcr = (kib_wire_connreq_t) { - .wcr_magic = cpu_to_le32(IBNAL_MSG_MAGIC), - .wcr_version = cpu_to_le16(IBNAL_MSG_VERSION), - .wcr_queue_depth = cpu_to_le32(IBNAL_MSG_QUEUE_SIZE), - .wcr_nid = cpu_to_le64(kibnal_data.kib_nid), - .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation), - }; + CERROR ("Connect "LPX64" failed: %d\n", peer->ibp_nid, cmrc); + kibnal_conn_decref(conn); /* drop callback's ref */ + kibnal_connreq_done(conn, 1, -EHOSTUNREACH); +} - retval = cm_accept(cep, rep, NULL, kibnal_cm_callback, conn); +void +kibnal_check_connreply (kib_conn_t *conn) +{ + static cm_rtu_data_t rtu; - PORTAL_FREE(rep, sizeof(*rep)); + kib_connvars_t *cv = conn->ibc_connvars; + cm_reply_data_t *reply = &cv->cv_conndata.data.reply; + kib_msg_t *msg = (kib_msg_t *)&reply->priv_data; + kib_peer_t *peer = conn->ibc_peer; + cm_return_t cmrc; + cm_cep_handle_t cep; + unsigned long flags; + int rc; - if (retval) { - /* XXX it seems we don't call reject after this point? 
*/
-                CERROR("cm_accept() failed: %d, aborting\n", retval);
-                rc = -ECONNABORTED;
-                goto out;
-        }
+        /* Only called by connd => statics OK */
+        LASSERT (!in_interrupt());
+        LASSERT (current == kibnal_data.kib_connd);
+        LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT);
 
-        if (kibnal_set_cm_flags(conn->ibc_cep)) {
-                rc = -ECONNABORTED;
-                goto out;
-        }
+        if (cv->cv_conndata.status == cm_event_conn_reply) {
+                cv->cv_remote_qpn = reply->qpn;
+                cv->cv_txpsn = reply->start_psn;
+                // XXX reply->targ_ack_delay;
+                cv->cv_rnr_count = reply->rnr_retry_count;
 
-        conn->ibc_cep = cep;
+                kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CHECK_REPLY);
 
-        CDEBUG(D_WARNING, "Connection %p -> "LPX64" ESTABLISHED.\n",
-               conn, conn->ibc_peer->ibp_nid);
+                rc = kibnal_unpack_msg(msg, cm_REP_priv_data_len);
+                if (rc != 0) {
+                        CERROR("Can't unpack reply from "LPX64"\n",
+                               peer->ibp_nid);
+                        kibnal_connreq_done(conn, 1, rc);
+                        return;
+                }
 
-out:
-        if (reason) {
-                kibnal_reject(cep, reason);
-                rc = -ECONNABORTED;
-        }
+                if (msg->ibm_type != IBNAL_MSG_CONNACK) {
+                        CERROR("Unexpected message type %d from "LPX64"\n",
+                               msg->ibm_type, peer->ibp_nid);
+                        kibnal_connreq_done(conn, 1, -EPROTO);
+                        return;
+                }
 
-        return;
-}
+                if (msg->ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) {
+                        CERROR(LPX64" has incompatible queue depth %d (%d wanted)\n",
+                               peer->ibp_nid, msg->ibm_u.connparams.ibcp_queue_depth,
+                               IBNAL_MSG_QUEUE_SIZE);
+                        kibnal_connreq_done(conn, 1, -EPROTO);
+                        return;
+                }
+
+                if (msg->ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE) {
+                        CERROR(LPX64" max message size %d too big (%d max)\n",
+                               peer->ibp_nid, msg->ibm_u.connparams.ibcp_max_msg_size,
+                               IBNAL_MSG_SIZE);
+                        kibnal_connreq_done(conn, 1, -EPROTO);
+                        return;
+                }
 
-void
-kibnal_listen_callback(cm_cep_handle_t cep, cm_conn_data_t *info, void *arg)
-{
-        struct cm_off_level *cm_tq;
+                if (msg->ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) {
+                        CERROR(LPX64" max frags %d too big (%d max)\n",
+                               peer->ibp_nid, msg->ibm_u.connparams.ibcp_max_frags,
+                               IBNAL_MAX_RDMA_FRAGS);
+                        kibnal_connreq_done(conn, 1, -EPROTO);
+                        return;
+                }
+
+                read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+                rc = (msg->ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid ||
+                      msg->ibm_dststamp != kibnal_data.kib_incarnation) ?
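+                     /* i.e. the reply was addressed to another NID or to an
+                      * earlier incarnation of me */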
+ -ESTALE : 0; + read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); + if (rc != 0) { + CERROR("Stale connection reply from "LPX64"\n", + peer->ibp_nid); + kibnal_connreq_done(conn, 1, rc); + return; + } - LASSERT(cep); - LASSERT(info); - LASSERT(arg == NULL); /* no conn yet for passive */ + conn->ibc_incarnation = msg->ibm_srcstamp; + conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE; + + rc = kibnal_post_receives(conn); + if (rc != 0) { + CERROR("Can't post receives for "LPX64"\n", + peer->ibp_nid); + kibnal_connreq_done(conn, 1, rc); + return; + } + + rc = kibnal_set_qp_state(conn, vv_qp_state_rtr); + if (rc != 0) { + kibnal_connreq_done(conn, 1, rc); + return; + } + + rc = kibnal_set_qp_state(conn, vv_qp_state_rts); + if (rc != 0) { + kibnal_connreq_done(conn, 1, rc); + return; + } + + kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_RTU); + kibnal_conn_addref(conn); /* ++for CM callback */ + + memset(&rtu, 0, sizeof(rtu)); + cmrc = cm_accept(conn->ibc_cep, NULL, &rtu, + kibnal_cm_callback, conn); + if (cmrc == cm_stat_success) { + /* Now I'm racing with disconnect signalled by + * kibnal_cm_callback */ + kibnal_connreq_done(conn, 1, 0); + return; + } - PORTAL_ALLOC_ATOMIC(cm_tq, sizeof(*cm_tq)); - if (cm_tq == NULL) { - CERROR("Failed to allocate a CM off level structure\n"); + CERROR("cm_accept "LPX64" failed: %d\n", peer->ibp_nid, cmrc); + /* Back out of RTU: no callback coming */ + kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CHECK_REPLY); + kibnal_conn_decref(conn); + kibnal_connreq_done(conn, 1, -EIO); return; } - cm_tq->tq.sync = 0; - cm_tq->tq.routine = _kibnal_listen_callback; - cm_tq->tq.data = cm_tq; + if (cv->cv_conndata.status == cm_event_conn_reject) { - cm_tq->cep = cep; - cm_tq->info = info; - cm_tq->conn = NULL; - - schedule_task(&cm_tq->tq); -} + if (cv->cv_conndata.data.reject.reason != cm_rej_code_stale_conn) { + CERROR("conn -> "LPX64" rejected: %d\n", peer->ibp_nid, + cv->cv_conndata.data.reject.reason); + kibnal_connreq_done(conn, 1, -ECONNREFUSED); + return; + } -static void -kibnal_pathreq_callback (struct sa_request *request) -{ - vv_hca_attrib_t *ca_attr = &kibnal_data.kib_hca_attrs; - kib_conn_t *conn = request->context; - gsi_dtgrm_t *dtgrm; - sa_mad_v2_t *mad; - ib_path_record_v2_t *path; - u64 component_mask; - cm_return_t cmret; - - if (request->status) { - CERROR ("status %d\n", request->status); - free_sa_request(request); - kibnal_connreq_done (conn, 1, -EINVAL); - return; - } + CWARN ("conn -> "LPX64" stale: retrying\n", peer->ibp_nid); - dtgrm = request->dtgrm_resp; - mad = (sa_mad_v2_t *) dtgrm->mad; - path = (ib_path_record_v2_t *) mad->payload; - - /* Put the path record in host order for that stack. 
*/ - gid_swap(&path->sgid); - gid_swap(&path->dgid); - path->slid = be16_to_cpu(path->slid); - path->dlid = be16_to_cpu(path->dlid); - path->flow_label = be32_to_cpu(path->flow_label); - path->pkey = be16_to_cpu(path->pkey); - path->sl = be16_to_cpu(path->sl); - - CDEBUG(D_NET, "sgid "LPX64":"LPX64" dgid " - LPX64":"LPX64" pkey %x\n", - path->sgid.scope.g.subnet, - path->sgid.scope.g.eui64, - path->dgid.scope.g.subnet, - path->dgid.scope.g.eui64, - path->pkey); - -#if TODO - component_mask = be64_to_cpu(mad->component_mask); - if ((component_mask && (1ull << 1)) == 0) { - CERROR ("no servivce GID in SR: "LPX64"\n", component_mask); - free_sa_request(request); - kibnal_connreq_done (conn, 1, -EINVAL); - return; - } -#endif + cep = cm_create_cep(cm_cep_transp_rc); + if (cep == NULL) { + CERROR("Can't create new CEP\n"); + kibnal_connreq_done(conn, 1, -ENOMEM); + return; + } - conn->ibc_connreq->cr_path = *path; + cmrc = cm_cancel(conn->ibc_cep); + LASSERT (cmrc == cm_stat_success); + cmrc = cm_destroy_cep(conn->ibc_cep); + LASSERT (cmrc == cm_stat_success); - free_sa_request(request); + conn->ibc_cep = cep; - conn->ibc_cep = cm_create_cep(cm_cep_transp_rc); - if (conn->ibc_cep == NULL) { - CERROR ("Can't create CEP\n"); - kibnal_connreq_done (conn, 1, -EINVAL); + /* retry connect */ + kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_ARP); + kibnal_connect_conn(conn); return; } - if (kibnal_set_cm_flags(conn->ibc_cep)) { - kibnal_connreq_done (conn, 1, -EINVAL); + CERROR("conn -> "LPX64" failed: %d\n", peer->ibp_nid, + cv->cv_conndata.status); + kibnal_connreq_done(conn, 1, -ECONNABORTED); +} + +void +kibnal_send_connreq (kib_conn_t *conn) +{ + kib_peer_t *peer = conn->ibc_peer; + kib_connvars_t *cv = conn->ibc_connvars; + ibat_arp_data_t *arp = &cv->cv_arp; + ib_path_record_v2_t *path = &cv->cv_path; + vv_return_t vvrc; + int rc; + + /* Only called by connd => statics OK */ + LASSERT (!in_interrupt()); + LASSERT (current == kibnal_data.kib_connd); + LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP); + + if (cv->cv_arprc != ibat_stat_ok) { + CERROR("Can't Arp "LPX64"@%u.%u.%u.%u: %d\n", peer->ibp_nid, + HIPQUAD(peer->ibp_ip), cv->cv_arprc); + kibnal_connreq_done(conn, 1, -ENETUNREACH); return; } - conn->ibc_connreq->cr_wcr = (kib_wire_connreq_t) { - .wcr_magic = cpu_to_le32(IBNAL_MSG_MAGIC), - .wcr_version = cpu_to_le16(IBNAL_MSG_VERSION), - .wcr_queue_depth = cpu_to_le16(IBNAL_MSG_QUEUE_SIZE), - .wcr_nid = cpu_to_le64(kibnal_data.kib_nid), - .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation), - }; + if ((arp->mask & IBAT_PRI_PATH_VALID) != 0) { + CDEBUG(D_NET, "Got valid path for "LPX64"\n", peer->ibp_nid); - conn->ibc_connreq->cr_cm_req = (cm_request_data_t) { - .sid = kibnal_data.kib_service_id, - .cep_data = (cm_cep_data_t) { - .ca_guid = kibnal_data.kib_hca_attrs.guid, - .end_to_end_flow_ctrl = 1, - .port_guid = kibnal_data.kib_port_gid.scope.g.eui64, - .local_port_num = kibnal_data.kib_port, - .start_psn = IBNAL_STARTING_PSN, - .qpn = conn->ibc_qp_attrs.query.qp_num, - .retry_cnt = IBNAL_RETRY, - .rtr_retry_cnt = IBNAL_RNR_RETRY, - .ack_timeout = IBNAL_ACK_TIMEOUT, - .offered_resp_res = ca_attr->max_read_atom_qp_outstanding, - .offered_initiator_depth = ca_attr->max_qp_depth_for_init_read_atom, - }, - .path_data = (cm_cep_path_data_t) { - .subn_local = TRUE, - .path = conn->ibc_connreq->cr_path, - }, - }; + *path = *arp->primary_path; -#if 0 - /* XXX set timeout just like SDP!!!*/ - conn->ibc_connreq->cr_path.packet_life = 13; -#endif - /* Flag I'm getting involved with the CM... 
*/
-        conn->ibc_state = IBNAL_CONN_CONNECTING;
 
-#if 0
-        CDEBUG(D_NET, "Connecting to, service id "LPX64", on "LPX64"\n",
-               conn->ibc_connreq->cr_service.RID.ServiceID,
-               *kibnal_service_nid_field(&conn->ibc_connreq->cr_service));
-#endif
+                vvrc = base_gid2port_num(kibnal_data.kib_hca, &path->sgid,
+                                         &cv->cv_port);
+                LASSERT (vvrc == vv_return_ok);
 
-        memset(conn->ibc_connreq->cr_cm_req.priv_data, 0,
-               cm_REQ_priv_data_len);
-        memcpy(conn->ibc_connreq->cr_cm_req.priv_data,
-               &conn->ibc_connreq->cr_wcr, sizeof(conn->ibc_connreq->cr_wcr));
+                vvrc = gid2gid_index(kibnal_data.kib_hca, cv->cv_port,
+                                     &path->sgid, &cv->cv_sgid_index);
+                LASSERT (vvrc == vv_return_ok);
 
-        /* kibnal_cm_callback gets my conn ref */
-        cmret = cm_connect(conn->ibc_cep, &conn->ibc_connreq->cr_cm_req,
-                           kibnal_cm_callback, conn);
+                vvrc = pkey2pkey_index(kibnal_data.kib_hca, cv->cv_port,
+                                       path->pkey, &cv->cv_pkey_index);
+                LASSERT (vvrc == vv_return_ok);
 
-        if (cmret) {
-                CERROR ("Connect failed: %d\n", cmret);
-                /* Back out state change as connect failed */
-                conn->ibc_state = IBNAL_CONN_INIT_QP;
-                kibnal_connreq_done (conn, 1, -EINVAL);
-        }
+                path->mtu = IBNAL_IB_MTU;
 
-        CDEBUG(D_NET, "connection REQ sent\n");
-}
+        } else if ((arp->mask & IBAT_LID_VALID) != 0) {
+                CWARN("Creating new path record for "LPX64"@%u.%u.%u.%u\n",
+                      peer->ibp_nid, HIPQUAD(peer->ibp_ip));
 
-static void
-kibnal_service_get_callback (struct sa_request *request)
-{
-        kib_conn_t *conn = request->context;
-        gsi_dtgrm_t *dtgrm;
-        sa_mad_v2_t *mad;
-        ib_service_record_v2_t *sr;
-        u64 component_mask;
-        int ret;
-
-        if (request->status) {
-                CERROR ("status %d\n", request->status);
-                free_sa_request(request);
-                kibnal_connreq_done (conn, 1, -EINVAL);
-                return;
-        }
+                cv->cv_pkey_index = IBNAL_PKEY_IDX;
+                cv->cv_sgid_index = IBNAL_SGID_IDX;
+                cv->cv_port = arp->local_port_num;
 
-        dtgrm = request->dtgrm_resp;
-        mad = (sa_mad_v2_t *) dtgrm->mad;
-        sr = (ib_service_record_v2_t *) mad->payload;
+                memset(path, 0, sizeof(*path));
 
-        CDEBUG(D_NET, "sid "LPX64" gid "LPX64":"LPX64" pkey %x\n",
-               sr->service_id,
-               sr->service_gid.scope.g.subnet,
-               sr->service_gid.scope.g.eui64,
-               sr->service_pkey);
+                vvrc = port_num2base_gid(kibnal_data.kib_hca, cv->cv_port,
+                                         &path->sgid);
+                LASSERT (vvrc == vv_return_ok);
 
-        component_mask = be64_to_cpu(mad->component_mask);
-        if ((component_mask && (1ull << 1)) == 0) {
-                CERROR ("no service GID in SR: "LPX64"\n", component_mask);
-                free_sa_request(request);
-                kibnal_connreq_done (conn, 1, -EINVAL);
+                vvrc = port_num2base_lid(kibnal_data.kib_hca, cv->cv_port,
+                                         &path->slid);
+                LASSERT (vvrc == vv_return_ok);
-                return;
-        }
+                path->dgid = arp->gid;
+                path->sl = IBNAL_SERVICE_LEVEL;
+                path->dlid = arp->lid;
+                path->mtu = IBNAL_IB_MTU;
+                path->rate = IBNAL_STATIC_RATE;
+                path->pkt_life_time = IBNAL_PKT_LIFETIME;
+                path->pkey = IBNAL_PKEY;
+                path->traffic_class = IBNAL_TRAFFIC_CLASS;
+        } else {
+                CERROR("Can't Arp "LPX64"@%u.%u.%u.%u: no PATH or LID\n",
+                       peer->ibp_nid, HIPQUAD(peer->ibp_ip));
+                kibnal_connreq_done(conn, 1, -ENETUNREACH);
                 return;
         }
 
-        //conn->ibc_connreq->cr_service = sr;
+        rc = kibnal_set_qp_state(conn, vv_qp_state_init);
+        if (rc != 0) {
+                kibnal_connreq_done(conn, 1, rc);
+                return;
+        }
+
+        /* do the actual connection request */
+        kibnal_connect_conn(conn);
+}
 
-        /* Return the response datagram to its pool. We don't need it anymore. 
*/ - gsi_dtgrm_pool_put(request->dtgrm_resp); - request->dtgrm_resp = NULL; +void +kibnal_arp_callback (ibat_stat_t arprc, ibat_arp_data_t *arp_data, void *arg) +{ + /* CAVEAT EMPTOR: tasklet context */ + kib_conn_t *conn = (kib_conn_t *)arg; + kib_peer_t *peer = conn->ibc_peer; + unsigned long flags; - /* kibnal_pathreq_callback gets my conn ref */ - ret = kibnal_pathrecord_op(request, sr->service_gid, kibnal_pathreq_callback, conn); - if (ret) { - CERROR ("Path record request failed: %d\n", ret); - kibnal_connreq_done (conn, 1, -EINVAL); - } + CDEBUG(D_NET, "Arp "LPX64"@%u.%u.%u.%u rc %d LID %s PATH %s\n", + peer->ibp_nid, HIPQUAD(peer->ibp_ip), arprc, + (arp_data->mask & IBAT_LID_VALID) == 0 ? "invalid" : "valid", + (arp_data->mask & IBAT_PRI_PATH_VALID) == 0 ? "invalid" : "valid"); + LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP); - return; + conn->ibc_connvars->cv_arprc = arprc; + conn->ibc_connvars->cv_arp = *arp_data; + + /* connd takes over my ref on conn */ + spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); + + list_add_tail(&conn->ibc_list, &kibnal_data.kib_connd_conns); + wake_up(&kibnal_data.kib_connd_waitq); + + spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags); } -static void -kibnal_connect_peer (kib_peer_t *peer) +void +kibnal_arp_peer (kib_peer_t *peer) { - kib_conn_t *conn = kibnal_create_conn(); - struct sa_request *request; - int ret; + cm_cep_handle_t cep; + kib_conn_t *conn; + int ibatrc; + /* Only the connd does this (i.e. single threaded) */ + LASSERT (current == kibnal_data.kib_connd); LASSERT (peer->ibp_connecting != 0); - if (conn == NULL) { - CERROR ("Can't allocate conn\n"); - kibnal_peer_connect_failed (peer, 1, -ENOMEM); + cep = cm_create_cep(cm_cep_transp_rc); + if (cep == NULL) { + CERROR ("Can't create cep for conn->"LPX64"\n", + peer->ibp_nid); + kibnal_peer_connect_failed(peer, 1); return; } - conn->ibc_peer = peer; - kib_peer_addref(peer); - - PORTAL_ALLOC (conn->ibc_connreq, sizeof (*conn->ibc_connreq)); - if (conn->ibc_connreq == NULL) { - CERROR ("Can't allocate connreq\n"); - kibnal_connreq_done (conn, 1, -ENOMEM); + conn = kibnal_create_conn(cep); + if (conn == NULL) { + CERROR ("Can't allocate conn->"LPX64"\n", + peer->ibp_nid); + cm_destroy_cep(cep); + kibnal_peer_connect_failed(peer, 1); return; } - memset(conn->ibc_connreq, 0, sizeof (*conn->ibc_connreq)); + conn->ibc_peer = peer; + kibnal_peer_addref(peer); - /* kibnal_service_get_callback gets my conn ref */ - ret = kibnal_advertize_op(peer->ibp_nid, SUBN_ADM_GET, kibnal_service_get_callback, conn); + kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_ARP); - if (ret) { - CERROR("kibnal_advertize_op failed for op %d NID "LPX64"\n", SUBN_ADM_GET, peer->ibp_nid); - /* TODO: I'm unsure yet whether ret contains a - * consistent error type, so I return -EIO in the - * meantime. */ - kibnal_connreq_done (conn, 1, -EIO); - } + ibatrc = ibat_get_ib_data(htonl(peer->ibp_ip), INADDR_ANY, + ibat_paths_primary, + &conn->ibc_connvars->cv_arp, + kibnal_arp_callback, conn, 0); + CDEBUG(D_NET,"ibatrc %d\n", ibatrc); + switch (ibatrc) { + default: + LBUG(); + + case ibat_stat_pending: + /* NB callback has my ref on conn */ + break; + + case ibat_stat_ok: + /* Immediate return (ARP cache hit) == no callback. 
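+                 * Send the connreq now and drop the ref the callback
+                 * would otherwise have consumed.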
*/ + kibnal_send_connreq(conn); + kibnal_conn_decref(conn); + break; - return; + case ibat_stat_error: + case ibat_stat_timeout: + case ibat_stat_not_found: + CERROR("Arp "LPX64"@%u.%u.%u.%u failed: %d\n", peer->ibp_nid, + HIPQUAD(peer->ibp_ip), ibatrc); + kibnal_connreq_done(conn, 1, -ENETUNREACH); + kibnal_conn_decref(conn); + break; + } } -static int +int kibnal_conn_timed_out (kib_conn_t *conn) { kib_tx_t *tx; struct list_head *ttmp; - unsigned long flags; - spin_lock_irqsave (&conn->ibc_lock, flags); + spin_lock(&conn->ibc_lock); list_for_each (ttmp, &conn->ibc_tx_queue) { tx = list_entry (ttmp, kib_tx_t, tx_list); - LASSERT (!tx->tx_passive_rdma_wait); - LASSERT (tx->tx_sending == 0); - if (time_after_eq (jiffies, tx->tx_deadline)) { - spin_unlock_irqrestore (&conn->ibc_lock, flags); + spin_unlock(&conn->ibc_lock); return 1; } } @@ -2865,24 +2823,20 @@ kibnal_conn_timed_out (kib_conn_t *conn) list_for_each (ttmp, &conn->ibc_active_txs) { tx = list_entry (ttmp, kib_tx_t, tx_list); - LASSERT (tx->tx_passive_rdma || - !tx->tx_passive_rdma_wait); - - LASSERT (tx->tx_passive_rdma_wait || + LASSERT (tx->tx_waiting || tx->tx_sending != 0); if (time_after_eq (jiffies, tx->tx_deadline)) { - spin_unlock_irqrestore (&conn->ibc_lock, flags); + spin_unlock(&conn->ibc_lock); return 1; } } - spin_unlock_irqrestore (&conn->ibc_lock, flags); - + spin_unlock(&conn->ibc_lock); return 0; } -static void +void kibnal_check_conns (int idx) { struct list_head *peers = &kibnal_data.kib_peers[idx]; @@ -2904,7 +2858,7 @@ kibnal_check_conns (int idx) list_for_each (ctmp, &peer->ibp_conns) { conn = list_entry (ctmp, kib_conn_t, ibc_list); - KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_ESTABLISHED); + LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED); /* In case we have enough credits to return via a * NOOP, but there were no non-blocking tx descs @@ -2913,20 +2867,21 @@ kibnal_check_conns (int idx) if (!kibnal_conn_timed_out(conn)) continue; + + /* Handle timeout by closing the whole connection. We + * can only be sure RDMA activity has ceased once the + * QP has been modified. */ - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); + kibnal_conn_addref(conn); /* 1 ref for me... 
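+                          * so the conn can't vanish while I drop the
+                          * global lock to close it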
*/ - atomic_inc (&conn->ibc_refcount); - read_unlock_irqrestore(&kibnal_data.kib_global_lock, + read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); CERROR("Timed out RDMA with "LPX64"\n", peer->ibp_nid); kibnal_close_conn (conn, -ETIMEDOUT); - kibnal_put_conn (conn); + kibnal_conn_decref(conn); /* ...until here */ /* start again now I've dropped the lock */ goto again; @@ -2936,39 +2891,50 @@ kibnal_check_conns (int idx) read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); } -static void -kib_connd_handle_state(kib_conn_t *conn) +void +kibnal_disconnect_conn (kib_conn_t *conn) { - vv_return_t retval; - - switch (conn->ibc_state) { - /* all refs have gone, free and be done with it */ - case IBNAL_CONN_DISCONNECTED: - kibnal_destroy_conn (conn); - return; /* avoid put_conn */ + static cm_drequest_data_t dreq; /* just for the space */ + + cm_return_t cmrc; + unsigned long flags; - case IBNAL_CONN_SEND_DREQ: - - retval = cm_disconnect(conn->ibc_cep, &kibnal_data.cm_data.dreq_data, NULL); - if (retval) /* XXX do real things */ - CERROR("disconnect failed: %d\n", retval); - - conn->ibc_state = IBNAL_CONN_DREQ; - break; + LASSERT (!in_interrupt()); + LASSERT (current == kibnal_data.kib_connd); + + write_lock_irqsave(&kibnal_data.kib_global_lock, flags); - /* a callback got to the conn before we did */ - case IBNAL_CONN_DREP: - break; - - default: - CERROR ("Bad conn %p state: %d\n", conn, - conn->ibc_state); - LBUG(); - break; + if (conn->ibc_disconnect) { + /* Had the CM callback already */ + write_unlock_irqrestore(&kibnal_data.kib_global_lock, + flags); + kibnal_conn_disconnected(conn); + return; } + + LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT1); + + /* active disconnect */ + cmrc = cm_disconnect(conn->ibc_cep, &dreq, NULL); + if (cmrc == cm_stat_success) { + /* waiting for CM */ + conn->ibc_state = IBNAL_CONN_DISCONNECT2; + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); + return; + } + + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - /* drop ref from close_conn */ - kibnal_put_conn(conn); + cm_cancel(conn->ibc_cep); + kibnal_pause(HZ/10); + + if (!conn->ibc_disconnect) /* CM callback will never happen now */ + kibnal_conn_decref(conn); + + LASSERT (atomic_read(&conn->ibc_refcount) > 0); + LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT1); + + kibnal_conn_disconnected(conn); } int @@ -2976,10 +2942,12 @@ kibnal_connd (void *arg) { wait_queue_t wait; unsigned long flags; + kib_pcreq_t *pcr; kib_conn_t *conn; kib_peer_t *peer; int timeout; int i; + int dropped_lock; int peer_index = 0; unsigned long deadline = jiffies; @@ -2987,48 +2955,99 @@ kibnal_connd (void *arg) kportal_blockallsigs (); init_waitqueue_entry (&wait, current); + kibnal_data.kib_connd = current; spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); - for (;;) { - if (!list_empty (&kibnal_data.kib_connd_conns)) { - conn = list_entry (kibnal_data.kib_connd_conns.next, + while (!kibnal_data.kib_shutdown) { + + dropped_lock = 0; + + if (!list_empty (&kibnal_data.kib_connd_zombies)) { + conn = list_entry (kibnal_data.kib_connd_zombies.next, kib_conn_t, ibc_list); list_del (&conn->ibc_list); spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); - kib_connd_handle_state(conn); + dropped_lock = 1; + + kibnal_destroy_conn(conn); spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); - continue; } + if (!list_empty (&kibnal_data.kib_connd_pcreqs)) { + pcr = list_entry(kibnal_data.kib_connd_pcreqs.next, + kib_pcreq_t, pcr_list); + list_del(&pcr->pcr_list); + + 
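+                        /* handle the passive connreq in thread context, outside the connd lock */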
spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags); + dropped_lock = 1; + + kibnal_recv_connreq(pcr->pcr_cep, &pcr->pcr_cmreq); + PORTAL_FREE(pcr, sizeof(*pcr)); + + spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); + } + if (!list_empty (&kibnal_data.kib_connd_peers)) { peer = list_entry (kibnal_data.kib_connd_peers.next, kib_peer_t, ibp_connd_list); list_del_init (&peer->ibp_connd_list); spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); + dropped_lock = 1; - kibnal_connect_peer (peer); - kib_peer_decref (peer); + kibnal_arp_peer (peer); + kibnal_peer_decref (peer); spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); } - /* shut down and nobody left to reap... */ - if (kibnal_data.kib_shutdown && - atomic_read(&kibnal_data.kib_nconns) == 0) - break; + if (!list_empty (&kibnal_data.kib_connd_conns)) { + conn = list_entry (kibnal_data.kib_connd_conns.next, + kib_conn_t, ibc_list); + list_del (&conn->ibc_list); + + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); + dropped_lock = 1; - spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); + switch (conn->ibc_state) { + default: + LBUG(); + + case IBNAL_CONN_ACTIVE_ARP: + kibnal_send_connreq(conn); + break; + + case IBNAL_CONN_ACTIVE_CONNECT: + kibnal_check_connreply(conn); + break; + + case IBNAL_CONN_PASSIVE_WAIT: + kibnal_check_passive_wait(conn); + break; + + case IBNAL_CONN_DISCONNECT1: + case IBNAL_CONN_DISCONNECT2: + kibnal_disconnect_conn(conn); + break; + } + kibnal_conn_decref(conn); + + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); + } /* careful with the jiffy wrap... */ - while ((timeout = (int)(deadline - jiffies)) <= 0) { + timeout = (int)(deadline - jiffies); + if (timeout <= 0) { const int n = 4; const int p = 1; int chunk = kibnal_data.kib_peer_hash_size; + spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags); + dropped_lock = 1; + /* Time to check for RDMA timeouts on a few more * peers: I do checks every 'p' seconds on a * proportion of the peer table and I need to check @@ -3050,21 +3069,21 @@ kibnal_connd (void *arg) } deadline += p * HZ; + spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); } - kibnal_data.kib_connd_waketime = jiffies + timeout; - + if (dropped_lock) + continue; + + /* Nothing to do for 'timeout' */ set_current_state (TASK_INTERRUPTIBLE); add_wait_queue (&kibnal_data.kib_connd_waitq, &wait); + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); - if (!kibnal_data.kib_shutdown && - list_empty (&kibnal_data.kib_connd_conns) && - list_empty (&kibnal_data.kib_connd_peers)) - schedule_timeout (timeout); + schedule_timeout (timeout); set_current_state (TASK_RUNNING); remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait); - spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); } @@ -3074,78 +3093,127 @@ kibnal_connd (void *arg) return (0); } +void +kibnal_async_callback(vv_event_record_t ev) +{ + CERROR("type: %d, port: %d, data: "LPX64"\n", + ev.event_type, ev.port_num, ev.type.data); +} + +void +kibnal_cq_callback (unsigned long unused_context) +{ + unsigned long flags; + + CDEBUG(D_NET, "!!\n"); + + spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); + kibnal_data.kib_ready = 1; + wake_up(&kibnal_data.kib_sched_waitq); + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); +} + int kibnal_scheduler(void *arg) { long id = (long)arg; + wait_queue_t wait; char name[16]; - kib_rx_t *rx; - kib_tx_t *tx; + vv_wc_t wc; + vv_return_t vvrc; + vv_return_t vvrc2; unsigned long flags; - int rc; - int counter = 0; - int 
did_something; + int busy_loops = 0; snprintf(name, sizeof(name), "kibnal_sd_%02ld", id); kportal_daemonize(name); kportal_blockallsigs(); - spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); + init_waitqueue_entry(&wait, current); - for (;;) { - did_something = 0; + spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); - while (!list_empty(&kibnal_data.kib_sched_txq)) { - tx = list_entry(kibnal_data.kib_sched_txq.next, - kib_tx_t, tx_list); - list_del(&tx->tx_list); + while (!kibnal_data.kib_shutdown) { + if (busy_loops++ >= IBNAL_RESCHED) { spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); - kibnal_tx_done(tx); - spin_lock_irqsave(&kibnal_data.kib_sched_lock, - flags); + our_cond_resched(); + busy_loops = 0; + + spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); } - if (!list_empty(&kibnal_data.kib_sched_rxq)) { - rx = list_entry(kibnal_data.kib_sched_rxq.next, - kib_rx_t, rx_list); - list_del(&rx->rx_list); - spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, + if (kibnal_data.kib_ready && + !kibnal_data.kib_checking_cq) { + /* take ownership of completion polling */ + kibnal_data.kib_checking_cq = 1; + /* Assume I'll exhaust the CQ */ + kibnal_data.kib_ready = 0; + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); + + vvrc = vv_poll_for_completion(kibnal_data.kib_hca, + kibnal_data.kib_cq, &wc); + if (vvrc == vv_return_err_cq_empty) { + vvrc2 = vv_request_completion_notification( + kibnal_data.kib_hca, + kibnal_data.kib_cq, + vv_next_solicit_unsolicit_event); + LASSERT (vvrc2 == vv_return_ok); + } + + spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); + /* give up ownership of completion polling */ + kibnal_data.kib_checking_cq = 0; - kibnal_rx(rx); + if (vvrc == vv_return_err_cq_empty) + continue; - did_something = 1; - spin_lock_irqsave(&kibnal_data.kib_sched_lock, - flags); - } + LASSERT (vvrc == vv_return_ok); + /* Assume there's more: get another scheduler to check + * while I handle this completion... */ - /* shut down and no receives to complete... */ - if (kibnal_data.kib_shutdown && - atomic_read(&kibnal_data.kib_nconns) == 0) - break; + kibnal_data.kib_ready = 1; + wake_up(&kibnal_data.kib_sched_waitq); - /* nothing to do or hogging CPU */ - if (!did_something || counter++ == IBNAL_RESCHED) { spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); - counter = 0; - - if (!did_something) { - rc = wait_event_interruptible( - kibnal_data.kib_sched_waitq, - !list_empty(&kibnal_data.kib_sched_txq) || - !list_empty(&kibnal_data.kib_sched_rxq) || - (kibnal_data.kib_shutdown && - atomic_read (&kibnal_data.kib_nconns) == 0)); - } else { - our_cond_resched(); - } - spin_lock_irqsave(&kibnal_data.kib_sched_lock, - flags); + switch (wc.operation_type) { + case vv_wc_send_rq: + kibnal_rx_complete((kib_rx_t *)((unsigned long)wc.wr_id), + wc.completion_status, + wc.num_bytes_transfered); + break; + case vv_wc_send_sq: + kibnal_tx_complete((kib_tx_t *)((unsigned long)wc.wr_id), + 1, wc.completion_status); + break; + case vv_wc_rdma_write_sq: + kibnal_tx_complete((kib_tx_t *)((unsigned long)wc.wr_id), + 0, wc.completion_status); + break; + default: + LBUG(); + } + + spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); + continue; } + + /* Nothing to do; sleep... 
*/ + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&kibnal_data.kib_sched_waitq, &wait); + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, + flags); + + schedule(); + + remove_wait_queue(&kibnal_data.kib_sched_waitq, &wait); + set_current_state(TASK_RUNNING); + spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); } spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); diff --git a/lnet/klnds/viblnd/vibnal_sa.c b/lnet/klnds/viblnd/vibnal_sa.c deleted file mode 100644 index c8ff098..0000000 --- a/lnet/klnds/viblnd/vibnal_sa.c +++ /dev/null @@ -1,333 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2004 Cluster File Systems, Inc. - * Author: Frank Zago - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - */ - -#include "vibnal.h" - -/*--------------------------------------------------------------------------*/ - -struct sa_request *alloc_sa_request(void) -{ - struct sa_request *request; - gsi_dtgrm_t *dtgrm; - vv_return_t retval; - - PORTAL_ALLOC(request, sizeof(*request)); - if (request == NULL) - return NULL; - - retval = gsi_dtgrm_pool_get(kibnal_data.gsi_pool_handle, &dtgrm); - if (retval) { - CERROR("cannot get a datagram: %d\n", retval); - PORTAL_FREE(request, sizeof(*request)); - return NULL; - } - - memset(request, 0, sizeof(*request)); - - request->dtgrm_req = dtgrm; - request->retry = GSI_RETRY; /* retry the request up to 10 times */ - - return request; -} - -void free_sa_request(struct sa_request *request) -{ - if (request) { - if (request->dtgrm_req) { - gsi_dtgrm_pool_put(request->dtgrm_req); - } - - if (request->dtgrm_resp) { - gsi_dtgrm_pool_put(request->dtgrm_resp); - } - - PORTAL_FREE(request, sizeof(*request)); - } -} - -/*--------------------------------------------------------------------------*/ - -static void complete_sa_request(struct sa_request *request) -{ - if (request->callback) { - request->callback(request); - } else { - complete(&request->signal); - } -} - -static void -sa_request_timeout_handler(unsigned long context) -{ - struct sa_request *request = (struct sa_request *)context; - int ret; - vv_return_t retval; - - if (request->retry--) { - /* Resend */ - - CDEBUG(D_NET, "timer expired for MAD TID "LPX64" - retrying (%d retry left)\n", request->mad->hdr.transact_id, request->retry); - retval = gsi_post_send_dtgrm(kibnal_data.gsi_handle, request->dtgrm_req); - if (retval) { - CERROR("gsi_post_send_dtgrm failed: %d\n", retval); - ret = -EIO; - } else { - - /* restart the timer */ - request->timer.expires = jiffies + (HZ * GSI_TIMEOUT); - add_timer(&request->timer); - - ret = 0; - } - } else { - CDEBUG(D_NET, "timer expired for MAD TID "LPX64" - no more retry\n", request->mad->hdr.transact_id); - ret = ETIMEDOUT; - } - - if (ret) { - request->status = ret; - complete_sa_request(request); - } -} - 
-/*--------------------------------------------------------------------------*/ - -/* Send a SA request */ -int vibnal_start_sa_request(struct sa_request *request) -{ - int ret; - vv_return_t vv_stat; - int retval; - - CDEBUG (D_NET, "querying SA\n"); - - /* Put the request on the pending list and get a transaction ID. */ - down(&kibnal_data.gsi_mutex); - - list_add_tail(&request->list, &kibnal_data.gsi_pending); - - up(&kibnal_data.gsi_mutex); - - retval = gsi_post_send_dtgrm(kibnal_data.gsi_handle, request->dtgrm_req); - if (retval) { - CERROR("gsi_post_send_dtgrm failed: %d\n", retval); - return -EIO; - } - - /* TODO: This might create a race condition if the response has - * already been received. */ - init_timer(&request->timer); - request->timer.expires = jiffies + (HZ * GSI_TIMEOUT); - request->timer.data = (unsigned long)request; - request->timer.function = sa_request_timeout_handler; - add_timer(&request->timer); - - CDEBUG(D_NET, "Posted MAD with TID= "LPX64"\n", request->mad->hdr.transact_id); - return 0; -} - -/* Received a MAD */ -void -vibnal_mad_received_cb(gsi_class_handle_t handle, void *context, gsi_dtgrm_t *dtgrm) -{ - sa_mad_v2_t *mad = (sa_mad_v2_t *) dtgrm->mad; - ib_service_record_v2_t *sr = (ib_service_record_v2_t *) mad->payload; - struct list_head *this; - struct sa_request *request; - - CDEBUG(D_NET, "Received new MAD\n"); - - /* Validate the MAD */ - if (mad->hdr.base_ver != MAD_IB_BASE_VERSION || - mad->hdr.class != MAD_CLASS_SUBN_ADM || - mad->hdr.class_ver != 2) { - CDEBUG(D_NET, "ignoring MAD (base_ver=%x, class=%x, class_ver=%x)\n", - mad->hdr.base_ver, mad->hdr.class, mad->hdr.class_ver); - return; - } - - /* We don't care about queries, only about responses */ - if (mad->hdr.m.ms.r != 1) { - CDEBUG(D_NET, "ignoring MAD (response=%d)\n", mad->hdr.m.ms.r); - return; - } - - /* We only care about service records and path records. */ - if (mad->hdr.attrib_id != SA_SERVICE_RECORD && - mad->hdr.attrib_id != SA_PATH_RECORD) { - CDEBUG(D_NET, "ignoring MAD (attrib_id=%x)\n", mad->hdr.attrib_id); - return; - } - - /* Find the MAD request in our list */ - request = NULL; - - down(&kibnal_data.gsi_mutex); - - list_for_each(this, &kibnal_data.gsi_pending) { - struct sa_request *_request = list_entry(this, struct sa_request, list); - - CDEBUG(D_NET, "Comparing pending MAD TID "LPX64" with incoming MAD TID "LPX64"\n", - _request->mad->hdr.transact_id, mad->hdr.transact_id); - - if (_request->mad->hdr.transact_id == mad->hdr.transact_id) { - CDEBUG(D_NET, "TIDs match\n"); - request = _request; - break; - } - } - - if (request == NULL) { - up(&kibnal_data.gsi_mutex); - CDEBUG(D_NET, "ignoring MAD (TID = "LPX64"\n", mad->hdr.transact_id); - return; - } - - up(&kibnal_data.gsi_mutex); - - /* Stop the timer and remove the request from the pending list of requests. */ - del_timer_sync(&request->timer); - - down(&kibnal_data.gsi_mutex); - - list_del(&request->list); - - up(&kibnal_data.gsi_mutex); - - request->dtgrm_resp = dtgrm; - - /* Depending on the response, update the status. This is not exact - * because a non-zero status is not always an error, but that - * should be good enough right now. */ - /* TODO: fix. 
*/ - if (mad->hdr.u.ns.status.raw16) { - CDEBUG(D_NET, "MAD response has bad status: %x\n", mad->hdr.u.ns.status.raw16); - request->status = -EIO; - } else { - request->status = 0; - } - - CDEBUG(D_NET, "incoming MAD successfully processed (status is %d)\n", request->status); - - complete_sa_request(request); -} - -/* MAD send completion */ -void -vibnal_mad_sent_cb(gsi_class_handle_t handle, void *context, gsi_dtgrm_t * dtgrm) -{ - sa_mad_v2_t *mad = (sa_mad_v2_t *) dtgrm->mad; - - /* Don't do anything. We might have to resend the datagram later. */ - CDEBUG(D_NET, "Datagram with TID "LPX64" sent.\n", mad->hdr.transact_id); -} - -/* - * method is SUBN_ADM_SET, SUBN_ADM_GET, SUBN_ADM_DELETE. Tables not supported. - * nid is the nid to advertize/query/unadvertize - * Note: dgid is in network order. - */ -static void fill_pathrecord_request(struct sa_request *request, vv_gid_t dgid) -{ - gsi_dtgrm_t *dtgrm = request->dtgrm_req; - sa_mad_v2_t *mad = (sa_mad_v2_t *) dtgrm->mad; - ib_path_record_v2_t *path = (ib_path_record_v2_t *) mad->payload; - - memset(mad, 0, MAD_BLOCK_SIZE); - - request->mad = mad; - - dtgrm->rlid = kibnal_data.kib_port_attr.port_sma_address_info.sm_lid; - dtgrm->sl = kibnal_data.kib_port_attr.port_sma_address_info.service_level; - - mad->hdr.base_ver = MAD_IB_BASE_VERSION; - mad->hdr.class = MAD_CLASS_SUBN_ADM; - mad->hdr.class_ver = 2; - mad->hdr.m.ms.method = SUBN_ADM_GET; - mad->hdr.attrib_id = SA_PATH_RECORD; /* something(?) will swap that field */ - mad->hdr.attrib_modifier = 0xFFFFFFFF; /* and that one too? */ - - /* Note: the transaction ID is set by the Voltaire stack if it is 0. */ - - /* TODO: these harcoded value to something better */ - mad->payload_len = cpu_to_be32(0x40 /*header size*/ + 0x35 /* PathRecord size */); - - mad->component_mask = cpu_to_be64( - (1 << 2) | /* DGID */ - (1 << 3) | /* SGID */ - (1 << 12)| /* numb_paths*/ - (1 << 13) /* P_key */ - ); - - path->pkey = cpu_to_be16(kibnal_data.kib_port_pkey); - path->sgid = kibnal_data.kib_port_gid; - gid_swap(&path->sgid); - path->dgid = dgid; /* already in network order */ - path->numb_path = 1; -} - -/* - * Do a path record query - * If callback is NULL, the function is synchronous (and context is ignored). - * Note: dgid is in network order. - */ -/* TODO: passing a request is a bit of a hack, but since this function - * is called under interrupt, we cannot allocate memory here :(. */ -int kibnal_pathrecord_op(struct sa_request *request, vv_gid_t dgid, sa_request_cb_t callback, void *context) -{ - int ret; - - LASSERT (kibnal_data.kib_nid != PTL_NID_ANY); - - fill_pathrecord_request(request, dgid); - - if (callback) { - request->callback = callback; - request->context = context; - } else { - init_completion(&request->signal); - } - - ret = vibnal_start_sa_request(request); - if (ret) { - CERROR("vibnal_send_sa failed: %d\n", ret); - free_sa_request(request); - } else { - if (callback) { - /* Return. The callback will have to free the SA request. 
*/ - ret = 0; - } else { - wait_for_completion(&request->signal); - - ret = request->status; - - if (ret != 0) { - CERROR ("Error %d in querying a path record\n", ret); - } - - free_sa_request(request); - } - } - - return ret; -} diff --git a/lnet/utils/portals.c b/lnet/utils/portals.c index 09fdf5f..d874486 100644 --- a/lnet/utils/portals.c +++ b/lnet/utils/portals.c @@ -694,7 +694,7 @@ jt_ptl_print_peers (int argc, char **argv) ptl_ipaddr_2_str (pcfg.pcfg_size, buffer[0], 1), ptl_ipaddr_2_str (pcfg.pcfg_id, buffer[1], 1), pcfg.pcfg_misc, pcfg.pcfg_count); - else if (g_nal_is_compatible(NULL, RANAL, OPENIBNAL, 0)) + else if (g_nal_is_compatible(NULL, RANAL, OPENIBNAL, VIBNAL, 0)) printf (LPX64"[%d]@%s:%d\n", pcfg.pcfg_nid, pcfg.pcfg_wait, ptl_ipaddr_2_str (pcfg.pcfg_id, buffer[1], 1), @@ -728,8 +728,14 @@ jt_ptl_add_peer (int argc, char **argv) argv[0]); return 0; } + } else if (g_nal_is_compatible(NULL, VIBNAL, 0)) { + if (argc != 3) { + fprintf (stderr, "usage(vib): %s nid ipaddr\n", + argv[0]); + return 0; + } } else if (argc != 2) { - fprintf (stderr, "usage(iib,vib): %s nid\n", argv[0]); + fprintf (stderr, "usage(iib): %s nid\n", argv[0]); return 0; } @@ -739,16 +745,16 @@ jt_ptl_add_peer (int argc, char **argv) return -1; } - if (g_nal_is_compatible (NULL, SOCKNAL, OPENIBNAL, RANAL, 0)) { - if (ptl_parse_ipaddr (&ip, argv[2]) != 0) { - fprintf (stderr, "Can't parse ip addr: %s\n", argv[2]); - return -1; - } + if (g_nal_is_compatible (NULL, SOCKNAL, OPENIBNAL, VIBNAL, RANAL, 0) && + ptl_parse_ipaddr (&ip, argv[2]) != 0) { + fprintf (stderr, "Can't parse ip addr: %s\n", argv[2]); + return -1; + } - if (ptl_parse_port (&port, argv[3]) != 0) { - fprintf (stderr, "Can't parse port: %s\n", argv[3]); - return -1; - } + if (g_nal_is_compatible (NULL, SOCKNAL, OPENIBNAL, RANAL, 0) && + ptl_parse_port (&port, argv[3]) != 0) { + fprintf (stderr, "Can't parse port: %s\n", argv[3]); + return -1; } PCFG_INIT(pcfg, NAL_CMD_ADD_PEER); -- 1.8.3.1
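
The reworked kibnal_connd above funnels all deferred work (zombie connections, passive
connection requests, peers awaiting ARP, and connection state transitions) through a
single spinlock-guarded loop: the lock is dropped around each piece of work,
dropped_lock records that this happened, and the thread only sleeps after a full pass
that dropped nothing, having added itself to the wait queue before releasing the lock
so a concurrent wakeup cannot be lost.  The user-space pthread sketch below shows the
same shape under assumed names (service_loop, handle and the two queues are
hypothetical); it illustrates the locking pattern only, not the kernel code itself.

#include <pthread.h>
#include <stdlib.h>

/* User-space sketch of the connd-style service loop.  All names here
 * (service_loop, handle, the queues) are hypothetical; this shows the
 * dropped_lock pattern only, not the kernel code. */

struct node {
        struct node *next;
};

static pthread_mutex_t qlock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  qwait = PTHREAD_COND_INITIALIZER;
static struct node    *zombies;
static struct node    *requests;
static int             shutting_down;

static void
handle (struct node *n)                 /* work done with the lock dropped */
{
        free(n);
}

static void *
service_loop (void *arg)
{
        struct node *n;
        int          dropped_lock;

        (void)arg;
        pthread_mutex_lock(&qlock);

        while (!shutting_down) {
                dropped_lock = 0;

                if (zombies != NULL) {
                        n = zombies;
                        zombies = n->next;

                        pthread_mutex_unlock(&qlock);
                        dropped_lock = 1;

                        handle(n);

                        pthread_mutex_lock(&qlock);
                }

                if (requests != NULL) {
                        n = requests;
                        requests = n->next;

                        pthread_mutex_unlock(&qlock);
                        dropped_lock = 1;

                        handle(n);

                        pthread_mutex_lock(&qlock);
                }

                /* The queues may have changed while the lock was
                 * dropped: re-check everything before sleeping */
                if (dropped_lock)
                        continue;

                /* Nothing to do on a full pass.  pthread_cond_wait()
                 * releases qlock atomically, so a producer enqueueing
                 * and signalling under qlock cannot slip a wakeup past
                 * us; add_wait_queue() before the unlock plays the same
                 * role in the kernel loop */
                pthread_cond_wait(&qwait, &qlock);
        }

        pthread_mutex_unlock(&qlock);
        return NULL;
}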
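
The "careful with the jiffy wrap" test in that loop is written as
(int)(deadline - jiffies) rather than a direct comparison because unsigned
subtraction followed by a signed interpretation stays correct across counter
wrap-around, as long as the two values are within half the counter range of each
other.  A minimal standalone demonstration, using a 32-bit counter only to make
the wrap easy to trigger (converting an out-of-range value to a signed type is
strictly implementation-defined, but behaves as shown on two's-complement
targets):

#include <stdint.h>
#include <stdio.h>

/* Wrap-safe deadline test in the style of (int)(deadline - jiffies):
 * subtract in unsigned arithmetic, then interpret the difference as
 * signed.  Correct across wrap while the two values stay within half
 * the counter range of each other. */
static int
deadline_passed (uint32_t now, uint32_t deadline)
{
        return (int32_t)(deadline - now) <= 0;
}

int
main (void)
{
        uint32_t now = 0xfffffff0u;     /* just before the counter wraps */
        uint32_t deadline = now + 0x20; /* wraps around to 0x10 */

        /* A naive 'deadline <= now' claims the deadline has already
         * passed; the wrap-safe test does not */
        printf("naive: %d  wrap-safe: %d\n",
               deadline <= now, deadline_passed(now, deadline));

        now += 0x40;                    /* time moves past the deadline */
        printf("later: wrap-safe: %d\n", deadline_passed(now, deadline));
        return 0;
}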
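
In the new kibnal_scheduler, completion handling moves from per-event rx/tx lists
to polling the CQ directly, and the kib_ready / kib_checking_cq pair under
kib_sched_lock keeps exactly one poller active at a time while still processing
completions in parallel: the owner clears kib_ready before polling (assuming it
will drain the CQ), re-arms the completion notification when the poll comes back
empty, and on success sets kib_ready again and wakes a sibling so the CQ keeps
draining while it handles the work.  Below is a user-space analogue of that
handoff, with a trivial fake completion source standing in for
vv_poll_for_completion; every name in it is hypothetical.

#include <pthread.h>

/* Sketch of the single-poller handoff: 'ready' means the CQ may hold
 * completions, 'checking' means some thread currently owns polling.
 * poll_completion() is a fake completion source standing in for
 * vv_poll_for_completion(); every name here is hypothetical. */

static pthread_mutex_t sched_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  sched_wait = PTHREAD_COND_INITIALIZER;
static int             ready = 1;
static int             checking;
static int             shutting_down;

static pthread_mutex_t cq_lock = PTHREAD_MUTEX_INITIALIZER;
static int             cq_pending = 8; /* fake completion counter */

static int
poll_completion (int *work)             /* 0: got work, -1: CQ empty */
{
        int rc = -1;

        pthread_mutex_lock(&cq_lock);
        if (cq_pending > 0) {
                *work = cq_pending--;
                rc = 0;
        }
        pthread_mutex_unlock(&cq_lock);
        return rc;
}

static void rearm_notification (void) { }       /* no-op in the sketch */
static void process (int work) { (void)work; }  /* handle one completion */

static void *
scheduler (void *arg)
{
        int work;
        int rc;

        (void)arg;
        pthread_mutex_lock(&sched_lock);

        while (!shutting_down) {
                if (ready && !checking) {
                        checking = 1;   /* take ownership of polling */
                        ready = 0;      /* assume the CQ will be drained */
                        pthread_mutex_unlock(&sched_lock);

                        rc = poll_completion(&work);
                        if (rc != 0)    /* empty: ask for a wakeup on the
                                         * next completion */
                                rearm_notification();

                        pthread_mutex_lock(&sched_lock);
                        checking = 0;   /* give up ownership of polling */

                        if (rc != 0)
                                continue;

                        /* There may be more: flag the CQ ready and wake
                         * a sibling to keep polling while this thread
                         * handles the completion it dequeued */
                        ready = 1;
                        pthread_cond_signal(&sched_wait);
                        pthread_mutex_unlock(&sched_lock);

                        process(work);

                        pthread_mutex_lock(&sched_lock);
                        continue;
                }

                pthread_cond_wait(&sched_wait, &sched_lock);
        }

        pthread_mutex_unlock(&sched_lock);
        return NULL;
}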